From c454d419cc5e036daaf8ebf73ccb82fa751a5cd0 Mon Sep 17 00:00:00 2001 From: lisj Date: Fri, 12 May 2023 22:03:59 -0400 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=AD=90=E6=A8=A1=E5=9D=97?= =?UTF-8?q?=E7=9A=84gitignore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/METIS/.gitignore | 61 - third_party/dlpack/.gitignore | 32 - third_party/dmlc-core/.gitignore | 48 - third_party/dmlc-core/make/config.mk | 53 + third_party/googletest/.gitignore | 84 - .../googletest/googlemock/build-aux/.keep | 0 .../googletest/googlemock/make/Makefile | 117 + .../googletest/googletest/make/Makefile | 88 + .../googletest/scripts/test/Makefile | 59 + third_party/libxsmm/.gitignore | 96 - third_party/libxsmm/.state | 68 + third_party/libxsmm/.theme/main.html | 14 + third_party/libxsmm/bin/.make | 0 .../libxsmm/bin/libxsmm_gemm_generator | Bin 0 -> 297960 bytes .../libxsmm/documentation/libxsmm-dev.pptm | Bin 0 -> 402432 bytes .../libxsmm/documentation/libxsmm_aux.md | 203 + .../libxsmm/documentation/libxsmm_be.md | 76 + .../libxsmm/documentation/libxsmm_compat.md | 97 + .../libxsmm/documentation/libxsmm_dl.md | 133 + .../libxsmm/documentation/libxsmm_fortran.md | 14 + .../documentation/libxsmm_magazine.docx | Bin 0 -> 185856 bytes .../libxsmm/documentation/libxsmm_mm.docx | Bin 0 -> 187392 bytes .../libxsmm/documentation/libxsmm_mm.md | 238 + .../libxsmm/documentation/libxsmm_prof.md | 45 + .../libxsmm/documentation/libxsmm_qna.md | 58 + .../libxsmm/documentation/libxsmm_samples.md | 706 +++ .../libxsmm/documentation/libxsmm_samples.pdf | Bin 0 -> 223171 bytes .../libxsmm/documentation/libxsmm_tune.md | 159 + .../libxsmm/documentation/libxsmm_valid.md | 97 + third_party/libxsmm/ide/_vs2019-configure.bat | 17 + .../ide/libxsmm_generator_gemm_driver.vcxproj | 395 ++ third_party/libxsmm/include/.make | 0 third_party/libxsmm/include/libxsmm.f | 2087 +++++++ third_party/libxsmm/include/libxsmm.mod | Bin 0 -> 16591 bytes 
third_party/libxsmm/include/libxsmm_config.h | 45 + third_party/libxsmm/include/libxsmm_cpuid.h | 76 + third_party/libxsmm/include/libxsmm_dnn.h | 132 + .../libxsmm/include/libxsmm_dnn_convolution.h | 93 + .../include/libxsmm_dnn_fullyconnected.h | 65 + .../include/libxsmm_dnn_fusedbatchnorm.h | 39 + .../include/libxsmm_dnn_fusedgroupnorm.h | 39 + .../libxsmm/include/libxsmm_dnn_optimizer.h | 55 + .../libxsmm/include/libxsmm_dnn_pooling.h | 65 + .../libxsmm/include/libxsmm_dnn_rnncell.h | 79 + .../libxsmm/include/libxsmm_dnn_softmaxloss.h | 51 + .../libxsmm/include/libxsmm_dnn_tensor.h | 199 + .../libxsmm/include/libxsmm_frontend.h | 590 ++ third_party/libxsmm/include/libxsmm_fsspmdm.h | 40 + .../libxsmm/include/libxsmm_generator.h | 219 + .../libxsmm/include/libxsmm_intrinsics_x86.h | 1022 ++++ third_party/libxsmm/include/libxsmm_macros.h | 983 ++++ third_party/libxsmm/include/libxsmm_malloc.h | 397 ++ third_party/libxsmm/include/libxsmm_math.h | 140 + third_party/libxsmm/include/libxsmm_memory.h | 85 + third_party/libxsmm/include/libxsmm_mhd.h | 167 + third_party/libxsmm/include/libxsmm_rng.h | 57 + third_party/libxsmm/include/libxsmm_source.h | 144 + third_party/libxsmm/include/libxsmm_spmdm.h | 115 + third_party/libxsmm/include/libxsmm_sync.h | 816 +++ third_party/libxsmm/include/libxsmm_timer.h | 41 + .../libxsmm/include/libxsmm_typedefs.h | 878 +++ third_party/libxsmm/include/libxsmm_version.h | 13 + third_party/libxsmm/obj/.make | 0 third_party/libxsmm/obj/intel64/.make | 0 .../intel64/generator_aarch64_instructions.o | Bin 0 -> 33008 bytes .../libxsmm/obj/intel64/generator_common.o | Bin 0 -> 40072 bytes .../obj/intel64/generator_common_aarch64.o | Bin 0 -> 14736 bytes .../obj/intel64/generator_common_x86.o | Bin 0 -> 51272 bytes .../libxsmm/obj/intel64/generator_gemm.o | Bin 0 -> 10912 bytes .../obj/intel64/generator_gemm_aarch64.o | Bin 0 -> 28768 bytes .../libxsmm/obj/intel64/generator_gemm_amx.o | Bin 0 -> 39920 bytes 
.../obj/intel64/generator_gemm_amx_emu.o | Bin 0 -> 41992 bytes .../intel64/generator_gemm_amx_microkernel.o | Bin 0 -> 29368 bytes .../generator_gemm_amx_microkernel_emu.o | Bin 0 -> 21344 bytes .../intel64/generator_gemm_avx2_microkernel.o | Bin 0 -> 4856 bytes .../generator_gemm_avx512_microkernel.o | Bin 0 -> 30248 bytes .../intel64/generator_gemm_avx_microkernel.o | Bin 0 -> 5200 bytes .../obj/intel64/generator_gemm_common.o | Bin 0 -> 27944 bytes .../intel64/generator_gemm_common_aarch64.o | Bin 0 -> 6416 bytes .../obj/intel64/generator_gemm_noarch.o | Bin 0 -> 3728 bytes .../generator_gemm_sse_avx_avx2_avx512.o | Bin 0 -> 15704 bytes .../intel64/generator_gemm_sse_microkernel.o | Bin 0 -> 4688 bytes .../obj/intel64/generator_mateltwise.o | Bin 0 -> 2112 bytes .../generator_mateltwise_misc_avx_avx512.o | Bin 0 -> 9224 bytes .../generator_mateltwise_reduce_avx_avx512.o | Bin 0 -> 123024 bytes .../generator_mateltwise_sse_avx_avx512.o | Bin 0 -> 16624 bytes .../generator_mateltwise_transform_avx.o | Bin 0 -> 13536 bytes .../generator_mateltwise_transform_avx512.o | Bin 0 -> 60312 bytes .../generator_mateltwise_transform_common.o | Bin 0 -> 2808 bytes ...enerator_mateltwise_transform_common_x86.o | Bin 0 -> 6432 bytes .../generator_mateltwise_transform_sse.o | Bin 0 -> 5184 bytes ...rator_mateltwise_unary_binary_avx_avx512.o | Bin 0 -> 49904 bytes .../obj/intel64/generator_matequation.o | Bin 0 -> 2072 bytes .../generator_matequation_avx_avx512.o | Bin 0 -> 17024 bytes ...nerator_matequation_regblocks_avx_avx512.o | Bin 0 -> 39864 bytes ...generator_matequation_scratch_avx_avx512.o | Bin 0 -> 8256 bytes .../obj/intel64/generator_packed_gemm_ac_rm.o | Bin 0 -> 2224 bytes .../generator_packed_gemm_ac_rm_aarch64.o | Bin 0 -> 9760 bytes ...erator_packed_gemm_ac_rm_avx_avx2_avx512.o | Bin 0 -> 12008 bytes .../obj/intel64/generator_packed_gemm_bc_rm.o | Bin 0 -> 2224 bytes .../generator_packed_gemm_bc_rm_aarch64.o | Bin 0 -> 9712 bytes 
...erator_packed_gemm_bc_rm_avx_avx2_avx512.o | Bin 0 -> 11904 bytes .../obj/intel64/generator_packed_spgemm.o | Bin 0 -> 4048 bytes .../generator_packed_spgemm_csc_bsparse.o | Bin 0 -> 2272 bytes ...erator_packed_spgemm_csc_bsparse_aarch64.o | Bin 0 -> 8928 bytes ...acked_spgemm_csc_bsparse_avx_avx2_avx512.o | Bin 0 -> 14176 bytes .../generator_packed_spgemm_csc_csparse.o | Bin 0 -> 2128 bytes ...acked_spgemm_csc_csparse_avx_avx2_avx512.o | Bin 0 -> 13760 bytes .../generator_packed_spgemm_csr_asparse.o | Bin 0 -> 2272 bytes ...erator_packed_spgemm_csr_asparse_aarch64.o | Bin 0 -> 10040 bytes ...acked_spgemm_csr_asparse_avx_avx2_avx512.o | Bin 0 -> 11352 bytes .../generator_packed_spgemm_csr_bsparse.o | Bin 0 -> 2272 bytes ...erator_packed_spgemm_csr_bsparse_aarch64.o | Bin 0 -> 9120 bytes ...acked_spgemm_csr_bsparse_avx_avx2_avx512.o | Bin 0 -> 10768 bytes .../libxsmm/obj/intel64/generator_spgemm.o | Bin 0 -> 8640 bytes .../intel64/generator_spgemm_csc_asparse.o | Bin 0 -> 16088 bytes .../intel64/generator_spgemm_csc_bsparse.o | Bin 0 -> 7352 bytes .../obj/intel64/generator_spgemm_csc_reader.o | Bin 0 -> 5968 bytes .../intel64/generator_spgemm_csr_asparse.o | Bin 0 -> 7352 bytes .../generator_spgemm_csr_asparse_reg.o | Bin 0 -> 13232 bytes .../obj/intel64/generator_spgemm_csr_reader.o | Bin 0 -> 5952 bytes .../obj/intel64/generator_x86_instructions.o | Bin 0 -> 145384 bytes third_party/libxsmm/obj/intel64/libxsmm-mod.o | Bin 0 -> 48440 bytes .../libxsmm/obj/intel64/libxsmm_cpuid_arm.o | Bin 0 -> 1360 bytes .../libxsmm/obj/intel64/libxsmm_cpuid_x86.o | Bin 0 -> 5888 bytes third_party/libxsmm/obj/intel64/libxsmm_dnn.o | Bin 0 -> 35720 bytes .../obj/intel64/libxsmm_dnn_convolution.o | Bin 0 -> 40672 bytes .../libxsmm_dnn_convolution_backward.o | Bin 0 -> 71840 bytes .../intel64/libxsmm_dnn_convolution_forward.o | Bin 0 -> 70808 bytes .../libxsmm_dnn_convolution_weight_update.o | Bin 0 -> 92448 bytes .../obj/intel64/libxsmm_dnn_elementwise.o | Bin 0 -> 50016 bytes 
.../obj/intel64/libxsmm_dnn_fullyconnected.o | Bin 0 -> 25464 bytes ...nn_fullyconnected_backward_weight_update.o | Bin 0 -> 71888 bytes .../libxsmm_dnn_fullyconnected_forward.o | Bin 0 -> 43496 bytes .../obj/intel64/libxsmm_dnn_fusedbatchnorm.o | Bin 0 -> 10056 bytes .../libxsmm_dnn_fusedbatchnorm_backward.o | Bin 0 -> 94400 bytes .../libxsmm_dnn_fusedbatchnorm_forward.o | Bin 0 -> 128584 bytes .../obj/intel64/libxsmm_dnn_fusedgroupnorm.o | Bin 0 -> 10056 bytes .../libxsmm_dnn_fusedgroupnorm_backward.o | Bin 0 -> 90264 bytes .../libxsmm_dnn_fusedgroupnorm_forward.o | Bin 0 -> 71528 bytes .../obj/intel64/libxsmm_dnn_optimizer.o | Bin 0 -> 7176 bytes .../obj/intel64/libxsmm_dnn_optimizer_sgd.o | Bin 0 -> 4496 bytes .../libxsmm/obj/intel64/libxsmm_dnn_pooling.o | Bin 0 -> 8168 bytes .../intel64/libxsmm_dnn_pooling_backward.o | Bin 0 -> 14360 bytes .../obj/intel64/libxsmm_dnn_pooling_forward.o | Bin 0 -> 15776 bytes .../libxsmm/obj/intel64/libxsmm_dnn_rnncell.o | Bin 0 -> 25824 bytes ...bxsmm_dnn_rnncell_backward_weight_update.o | Bin 0 -> 172840 bytes .../obj/intel64/libxsmm_dnn_rnncell_forward.o | Bin 0 -> 177992 bytes .../obj/intel64/libxsmm_dnn_softmaxloss.o | Bin 0 -> 7768 bytes .../libxsmm_dnn_softmaxloss_backward.o | Bin 0 -> 5392 bytes .../intel64/libxsmm_dnn_softmaxloss_forward.o | Bin 0 -> 11736 bytes .../libxsmm/obj/intel64/libxsmm_dnn_tensor.o | Bin 0 -> 67568 bytes third_party/libxsmm/obj/intel64/libxsmm_ext.o | Bin 0 -> 7600 bytes .../libxsmm/obj/intel64/libxsmm_ext_gemm.o | Bin 0 -> 24136 bytes .../libxsmm/obj/intel64/libxsmm_ext_xcopy.o | Bin 0 -> 22352 bytes .../libxsmm/obj/intel64/libxsmm_fsspmdm.o | Bin 0 -> 20296 bytes .../libxsmm/obj/intel64/libxsmm_gemm.o | Bin 0 -> 68904 bytes .../libxsmm/obj/intel64/libxsmm_generator.o | Bin 0 -> 15128 bytes .../intel64/libxsmm_generator_gemm_driver.o | Bin 0 -> 11544 bytes .../libxsmm/obj/intel64/libxsmm_hash.o | Bin 0 -> 14392 bytes .../libxsmm/obj/intel64/libxsmm_main.o | Bin 0 -> 187464 bytes 
.../libxsmm/obj/intel64/libxsmm_malloc.o | Bin 0 -> 66040 bytes .../libxsmm/obj/intel64/libxsmm_math.o | Bin 0 -> 27520 bytes .../libxsmm/obj/intel64/libxsmm_matrixeqn.o | Bin 0 -> 42136 bytes .../libxsmm/obj/intel64/libxsmm_memory.o | Bin 0 -> 29272 bytes third_party/libxsmm/obj/intel64/libxsmm_mhd.o | Bin 0 -> 91616 bytes .../libxsmm/obj/intel64/libxsmm_noblas.o | Bin 0 -> 7600 bytes .../libxsmm/obj/intel64/libxsmm_perf.o | Bin 0 -> 3472 bytes .../libxsmm/obj/intel64/libxsmm_python.o | Bin 0 -> 944 bytes third_party/libxsmm/obj/intel64/libxsmm_rng.o | Bin 0 -> 10128 bytes .../libxsmm/obj/intel64/libxsmm_spmdm.o | Bin 0 -> 163176 bytes .../libxsmm/obj/intel64/libxsmm_sync.o | Bin 0 -> 12088 bytes .../libxsmm/obj/intel64/libxsmm_timer.o | Bin 0 -> 4312 bytes .../libxsmm/obj/intel64/libxsmm_trace.o | Bin 0 -> 7608 bytes .../libxsmm/obj/intel64/libxsmm_xcopy.o | Bin 0 -> 111176 bytes third_party/libxsmm/obj/libxsmm_dispatch.h | 7 + third_party/libxsmm/samples/cp2k/.make | 0 .../tvm_cnnlayer/libxsmm_wrapper/Makefile | 19 + .../libxsmm_wrapper/batch_reduce_plus_init.cc | 89 + third_party/libxsmm/samples/nek/.make | 0 third_party/libxsmm/samples/smm/.make | 0 .../libxsmm/samples/utilities/mhd/mhd_in.mhd | Bin 0 -> 27225 bytes third_party/libxsmm/scripts/libxsmm_config.py | 145 + .../libxsmm/scripts/libxsmm_dispatch.py | 116 + .../libxsmm/scripts/libxsmm_interface.py | 195 + third_party/libxsmm/scripts/libxsmm_source.sh | 68 + .../libxsmm/scripts/libxsmm_specialized.py | 205 + .../libxsmm/scripts/libxsmm_utilities.py | 320 ++ .../libxsmm/scripts/libxsmm_version.sh | 30 + third_party/libxsmm/src/libxsmm_cpuid_arm.c | 96 + third_party/libxsmm/src/libxsmm_cpuid_x86.c | 336 ++ third_party/libxsmm/src/libxsmm_diff.h | 144 + third_party/libxsmm/src/libxsmm_dnn.c | 759 +++ .../libxsmm/src/libxsmm_dnn_convolution.c | 2747 +++++++++ .../src/libxsmm_dnn_convolution_backward.c | 719 +++ .../src/libxsmm_dnn_convolution_backward.h | 22 + .../src/libxsmm_dnn_convolution_forward.c | 
544 ++ .../src/libxsmm_dnn_convolution_forward.h | 22 + .../libxsmm_dnn_convolution_weight_update.c | 914 +++ .../libxsmm_dnn_convolution_weight_update.h | 22 + .../libxsmm/src/libxsmm_dnn_elementwise.c | 618 ++ .../libxsmm/src/libxsmm_dnn_elementwise.h | 65 + .../libxsmm/src/libxsmm_dnn_fullyconnected.c | 1514 +++++ ...nn_fullyconnected_backward_weight_update.c | 1281 +++++ ...nn_fullyconnected_backward_weight_update.h | 22 + .../src/libxsmm_dnn_fullyconnected_forward.c | 649 +++ .../src/libxsmm_dnn_fullyconnected_forward.h | 22 + .../libxsmm/src/libxsmm_dnn_fusedbatchnorm.c | 638 +++ .../src/libxsmm_dnn_fusedbatchnorm_backward.c | 604 ++ .../src/libxsmm_dnn_fusedbatchnorm_backward.h | 22 + .../src/libxsmm_dnn_fusedbatchnorm_forward.c | 618 ++ .../src/libxsmm_dnn_fusedbatchnorm_forward.h | 22 + .../libxsmm/src/libxsmm_dnn_fusedgroupnorm.c | 648 +++ .../src/libxsmm_dnn_fusedgroupnorm_backward.c | 581 ++ .../src/libxsmm_dnn_fusedgroupnorm_backward.h | 22 + .../src/libxsmm_dnn_fusedgroupnorm_forward.c | 500 ++ .../src/libxsmm_dnn_fusedgroupnorm_forward.h | 20 + .../libxsmm/src/libxsmm_dnn_optimizer.c | 345 ++ .../libxsmm/src/libxsmm_dnn_optimizer_sgd.c | 103 + .../libxsmm/src/libxsmm_dnn_optimizer_sgd.h | 18 + third_party/libxsmm/src/libxsmm_dnn_pooling.c | 451 ++ .../src/libxsmm_dnn_pooling_backward.c | 301 + .../src/libxsmm_dnn_pooling_backward.h | 20 + .../libxsmm/src/libxsmm_dnn_pooling_forward.c | 301 + .../libxsmm/src/libxsmm_dnn_pooling_forward.h | 20 + third_party/libxsmm/src/libxsmm_dnn_rnncell.c | 2357 ++++++++ ...bxsmm_dnn_rnncell_backward_weight_update.c | 1016 ++++ ...bxsmm_dnn_rnncell_backward_weight_update.h | 21 + .../libxsmm/src/libxsmm_dnn_rnncell_forward.c | 740 +++ .../libxsmm/src/libxsmm_dnn_rnncell_forward.h | 21 + .../libxsmm/src/libxsmm_dnn_softmaxloss.c | 382 ++ .../src/libxsmm_dnn_softmaxloss_backward.c | 103 + .../src/libxsmm_dnn_softmaxloss_backward.h | 18 + .../src/libxsmm_dnn_softmaxloss_forward.c | 103 + 
.../src/libxsmm_dnn_softmaxloss_forward.h | 18 + third_party/libxsmm/src/libxsmm_dnn_tensor.c | 642 +++ third_party/libxsmm/src/libxsmm_ext.c | 267 + third_party/libxsmm/src/libxsmm_ext.h | 46 + third_party/libxsmm/src/libxsmm_ext_gemm.c | 1268 +++++ third_party/libxsmm/src/libxsmm_ext_xcopy.c | 472 ++ third_party/libxsmm/src/libxsmm_fsspmdm.c | 602 ++ third_party/libxsmm/src/libxsmm_gemm.c | 2156 +++++++ third_party/libxsmm/src/libxsmm_gemm.h | 219 + third_party/libxsmm/src/libxsmm_generator.c | 530 ++ .../src/libxsmm_generator_gemm_driver.c | 280 + third_party/libxsmm/src/libxsmm_hash.c | 595 ++ third_party/libxsmm/src/libxsmm_hash.h | 47 + third_party/libxsmm/src/libxsmm_main.c | 4981 +++++++++++++++++ third_party/libxsmm/src/libxsmm_main.h | 1069 ++++ third_party/libxsmm/src/libxsmm_malloc.c | 2617 +++++++++ third_party/libxsmm/src/libxsmm_math.c | 569 ++ third_party/libxsmm/src/libxsmm_matrixeqn.c | 1265 +++++ third_party/libxsmm/src/libxsmm_matrixeqn.h | 148 + third_party/libxsmm/src/libxsmm_memory.c | 593 ++ third_party/libxsmm/src/libxsmm_mhd.c | 925 +++ third_party/libxsmm/src/libxsmm_perf.c | 287 + third_party/libxsmm/src/libxsmm_perf.h | 23 + third_party/libxsmm/src/libxsmm_python.c | 142 + third_party/libxsmm/src/libxsmm_rng.c | 314 ++ third_party/libxsmm/src/libxsmm_spmdm.c | 612 ++ third_party/libxsmm/src/libxsmm_spmdm_begin.h | 64 + .../libxsmm/src/libxsmm_spmdm_begin_avx2.h | 166 + .../libxsmm/src/libxsmm_spmdm_begin_avx512.h | 310 + third_party/libxsmm/src/libxsmm_spmdm_end.h | 42 + third_party/libxsmm/src/libxsmm_sync.c | 673 +++ third_party/libxsmm/src/libxsmm_timer.c | 221 + third_party/libxsmm/src/libxsmm_trace.c | 567 ++ third_party/libxsmm/src/libxsmm_trace.h | 124 + third_party/libxsmm/src/libxsmm_xcopy.c | 735 +++ third_party/libxsmm/src/libxsmm_xcopy.h | 286 + .../libxsmm/src/template/libxsmm_config.h | 44 + .../libxsmm_dnn_bf16_macros_define.tpl.c | 95 + .../libxsmm_dnn_bf16_macros_undefine.tpl.c | 28 + 
...t_bwd_custom_custom_fallback_generic.tpl.c | 177 + ..._custom_custom_fallback_generic_bf16.tpl.c | 172 + ...onvolve_st_bwd_custom_custom_generic.tpl.c | 352 ++ ...ve_st_bwd_custom_custom_generic_bf16.tpl.c | 407 ++ ...t_bwd_custom_custom_generic_bf16_amx.tpl.c | 530 ++ ...wd_nhwc_custom-rsck_fallback_generic.tpl.c | 191 + ...olve_st_bwd_nhwc_custom-rsck_generic.tpl.c | 364 ++ ...onvolve_st_fwd_custom_custom_generic.tpl.c | 519 ++ ...ve_st_fwd_custom_custom_generic_bf16.tpl.c | 609 ++ ...t_fwd_custom_custom_generic_bf16_amx.tpl.c | 732 +++ ...e_st_fwd_custom_custom_generic_i8i32.tpl.c | 170 + ...ve_st_fwd_custom_custom_generic_i8i8.tpl.c | 61 + ...olve_st_fwd_nhwc_custom-rsck_generic.tpl.c | 522 ++ ...onvolve_st_upd_custom_custom_generic.tpl.c | 577 ++ ...ve_st_upd_custom_custom_generic_bf16.tpl.c | 723 +++ ...t_upd_custom_custom_generic_bf16_amx.tpl.c | 783 +++ ...olve_st_upd_nhwc_custom-rsck_generic.tpl.c | 675 +++ ...lyconnected_st_bwdupd_custom_generic.tpl.c | 246 + ...onnected_st_bwdupd_ncnc_kcck_generic.tpl.c | 346 ++ ...ted_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c | 625 +++ ...st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c | 604 ++ ...fullyconnected_st_fwd_custom_generic.tpl.c | 102 + ...lyconnected_st_fwd_ncnc_kcck_generic.tpl.c | 235 + ...nected_st_fwd_ncnc_kcck_generic_bf16.tpl.c | 379 ++ ...ed_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c | 223 + ..._ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c | 177 + ...rm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c | 251 + ...rm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c | 312 ++ ...rm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c | 386 ++ ...fusedbatchnorm_st_bwd_custom_generic.tpl.c | 274 + ...rm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c | 248 + ...rm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c | 294 + ...rm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c | 348 ++ ...fusedbatchnorm_st_fwd_custom_generic.tpl.c | 265 + ...rm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c | 222 + ...rm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c | 280 + 
...rm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c | 360 ++ ...fusedgroupnorm_st_bwd_custom_generic.tpl.c | 264 + ...rm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c | 232 + ...rm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c | 275 + ...rm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c | 332 ++ ...fusedgroupnorm_st_fwd_custom_generic.tpl.c | 229 + ...libxsmm_dnn_optimizer_sgd_st_generic.tpl.c | 91 + ...ng_st_bwd_custom_f32_bf16_c16_avx512.tpl.c | 153 + ...ng_st_bwd_custom_f32_bf16_c32_avx512.tpl.c | 161 + ...ng_st_bwd_custom_f32_bf16_c64_avx512.tpl.c | 170 + ...mm_dnn_pooling_st_bwd_custom_generic.tpl.c | 184 + ...ng_st_fwd_custom_f32_bf16_c16_avx512.tpl.c | 171 + ...ng_st_fwd_custom_f32_bf16_c32_avx512.tpl.c | 181 + ...ng_st_fwd_custom_f32_bf16_c64_avx512.tpl.c | 205 + ...mm_dnn_pooling_st_fwd_custom_generic.tpl.c | 194 + ..._rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c | 637 +++ ...mm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c | 626 +++ ...dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c | 285 + ...bxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c | 222 + ...rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c | 360 ++ ...ll_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c | 361 ++ ...t_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c | 376 ++ ...m_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c | 306 + ..._rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c | 447 ++ ...cell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c | 441 ++ ..._rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c | 526 ++ ...ell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c | 343 ++ ...st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c | 342 ++ ...ll_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c | 366 ++ ..._lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c | 405 ++ ...nn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c | 214 + ...ncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c | 283 + ...l_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c | 291 + ...xsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c | 138 + ...dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c | 223 + ...rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c | 236 + 
...rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c | 254 + ...ll_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c | 331 ++ ...t_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c | 331 ++ ...nn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c | 237 + ...ncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c | 374 ++ ...l_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c | 374 ++ ...ncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c | 226 + ...lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c | 409 ++ ..._rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c | 357 ++ ...mm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c | 425 ++ ...dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c | 92 + ...bxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c | 136 + ...smm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c | 234 + ..._dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c | 148 + ..._dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c | 179 + ...libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c | 34 + ...ibxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c | 34 + ...bxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c | 51 + ...xsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c | 51 + ...bxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c | 64 + ...xsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c | 63 + ...libxsmm_dnn_zero_rim_st_input_custom.tpl.c | 25 + .../libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c | 25 + ..._internal_gru_bwdupd_fused_eltwise_1.tpl.c | 72 + ..._internal_gru_bwdupd_fused_eltwise_2.tpl.c | 38 + ...m_internal_lstm_bwdupd_fused_eltwise.tpl.c | 113 + ...upd_fused_eltwise_ncnc_reformat_bf16.tpl.c | 159 + ...l_lstm_bwdupd_fused_eltwise_reformat.tpl.c | 124 + ...m_bwdupd_fused_eltwise_reformat_bf16.tpl.c | 169 + ...xsmm_internal_lstm_fwd_fused_eltwise.tpl.c | 50 + ...internal_lstm_fwd_fused_eltwise_bf16.tpl.c | 50 + .../src/template/libxsmm_matdiff.tpl.c | 174 + ...ibxsmm_spmdm_compute_bfloat16_thread.tpl.c | 564 ++ .../libxsmm_spmdm_compute_fp32_thread.tpl.c | 542 ++ ...dm_createSparseSlice_bfloat16_thread.tpl.c | 126 + ..._spmdm_createSparseSlice_fp32_thread.tpl.c | 129 + .../libxsmm/src/template/libxsmm_version.h | 12 + 
third_party/libxsmm/tests/mhd_image.mhd | 13 + third_party/nanoflann/.gitignore | 3 - third_party/pcg/.gitignore | 33 - third_party/phmap/.gitignore | 8 - third_party/tensorpipe/.gitignore | 4 - third_party/thrust/.gitignore | 3 - third_party/tvm/.gitignore | 235 - .../app/src/main/jni/make/config.mk | 54 + .../app/src/main/jni/make/config.mk | 43 + .../app/src/main/jni/make/config.mk | 57 + .../AppIcon.appiconset/Contents.json | 93 + third_party/tvm/apps/sgx/Cargo.lock | 853 +++ third_party/tvm/nnvm/make/config.mk | 63 + .../test_arm_compute_lib/test_config.json | 8 + .../auto_scheduler/ci_logs/conv2d.json | 2 + .../auto_scheduler/ci_logs/matmul.json | 2 + .../ci_logs/resnet-18-NHWC-B1.json | 26 + third_party/tvm/web/.eslintrc.json | 34 + third_party/tvm/web/package.json | 32 + third_party/tvm/web/tsconfig.json | 13 + third_party/tvm/web/typedoc.json | 11 + third_party/xbyak/.gitignore | 1 - 404 files changed, 89187 insertions(+), 608 deletions(-) delete mode 100644 third_party/METIS/.gitignore delete mode 100644 third_party/dlpack/.gitignore delete mode 100644 third_party/dmlc-core/.gitignore create mode 100644 third_party/dmlc-core/make/config.mk delete mode 100644 third_party/googletest/.gitignore create mode 100644 third_party/googletest/googlemock/build-aux/.keep create mode 100644 third_party/googletest/googlemock/make/Makefile create mode 100644 third_party/googletest/googletest/make/Makefile create mode 100644 third_party/googletest/googletest/scripts/test/Makefile delete mode 100644 third_party/libxsmm/.gitignore create mode 100644 third_party/libxsmm/.state create mode 100644 third_party/libxsmm/.theme/main.html create mode 100644 third_party/libxsmm/bin/.make create mode 100755 third_party/libxsmm/bin/libxsmm_gemm_generator create mode 100644 third_party/libxsmm/documentation/libxsmm-dev.pptm create mode 100644 third_party/libxsmm/documentation/libxsmm_aux.md create mode 100644 third_party/libxsmm/documentation/libxsmm_be.md create mode 100644 
third_party/libxsmm/documentation/libxsmm_compat.md create mode 100644 third_party/libxsmm/documentation/libxsmm_dl.md create mode 100644 third_party/libxsmm/documentation/libxsmm_fortran.md create mode 100644 third_party/libxsmm/documentation/libxsmm_magazine.docx create mode 100644 third_party/libxsmm/documentation/libxsmm_mm.docx create mode 100644 third_party/libxsmm/documentation/libxsmm_mm.md create mode 100644 third_party/libxsmm/documentation/libxsmm_prof.md create mode 100644 third_party/libxsmm/documentation/libxsmm_qna.md create mode 100644 third_party/libxsmm/documentation/libxsmm_samples.md create mode 100644 third_party/libxsmm/documentation/libxsmm_samples.pdf create mode 100644 third_party/libxsmm/documentation/libxsmm_tune.md create mode 100644 third_party/libxsmm/documentation/libxsmm_valid.md create mode 100644 third_party/libxsmm/ide/_vs2019-configure.bat create mode 100644 third_party/libxsmm/ide/libxsmm_generator_gemm_driver.vcxproj create mode 100644 third_party/libxsmm/include/.make create mode 100644 third_party/libxsmm/include/libxsmm.f create mode 100644 third_party/libxsmm/include/libxsmm.mod create mode 100644 third_party/libxsmm/include/libxsmm_config.h create mode 100644 third_party/libxsmm/include/libxsmm_cpuid.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_convolution.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_fullyconnected.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_fusedbatchnorm.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_optimizer.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_pooling.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_rnncell.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h create mode 100644 third_party/libxsmm/include/libxsmm_dnn_tensor.h create mode 100644 
third_party/libxsmm/include/libxsmm_frontend.h create mode 100644 third_party/libxsmm/include/libxsmm_fsspmdm.h create mode 100644 third_party/libxsmm/include/libxsmm_generator.h create mode 100644 third_party/libxsmm/include/libxsmm_intrinsics_x86.h create mode 100644 third_party/libxsmm/include/libxsmm_macros.h create mode 100644 third_party/libxsmm/include/libxsmm_malloc.h create mode 100644 third_party/libxsmm/include/libxsmm_math.h create mode 100644 third_party/libxsmm/include/libxsmm_memory.h create mode 100644 third_party/libxsmm/include/libxsmm_mhd.h create mode 100644 third_party/libxsmm/include/libxsmm_rng.h create mode 100644 third_party/libxsmm/include/libxsmm_source.h create mode 100644 third_party/libxsmm/include/libxsmm_spmdm.h create mode 100644 third_party/libxsmm/include/libxsmm_sync.h create mode 100644 third_party/libxsmm/include/libxsmm_timer.h create mode 100644 third_party/libxsmm/include/libxsmm_typedefs.h create mode 100644 third_party/libxsmm/include/libxsmm_version.h create mode 100644 third_party/libxsmm/obj/.make create mode 100644 third_party/libxsmm/obj/intel64/.make create mode 100644 third_party/libxsmm/obj/intel64/generator_aarch64_instructions.o create mode 100644 third_party/libxsmm/obj/intel64/generator_common.o create mode 100644 third_party/libxsmm/obj/intel64/generator_common_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_common_x86.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_amx.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_amx_emu.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel_emu.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_avx2_microkernel.o create mode 100644 
third_party/libxsmm/obj/intel64/generator_gemm_avx512_microkernel.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_avx_microkernel.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_common.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_common_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_noarch.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_sse_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_gemm_sse_microkernel.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_misc_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_reduce_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_sse_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_transform_avx.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_transform_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_transform_common.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_transform_common_x86.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_transform_sse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_mateltwise_unary_binary_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_matequation.o create mode 100644 third_party/libxsmm/obj/intel64/generator_matequation_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_matequation_regblocks_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_matequation_scratch_avx_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_aarch64.o create mode 100644 
third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_csparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_csparse_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse_aarch64.o create mode 100644 third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm_csc_asparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm_csc_bsparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm_csc_reader.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm_csr_asparse.o create mode 100644 third_party/libxsmm/obj/intel64/generator_spgemm_csr_asparse_reg.o create mode 100644 
third_party/libxsmm/obj/intel64/generator_spgemm_csr_reader.o create mode 100644 third_party/libxsmm/obj/intel64/generator_x86_instructions.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm-mod.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_cpuid_arm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_cpuid_x86.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution_backward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution_weight_update.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_elementwise.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fullyconnected.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fullyconnected_backward_weight_update.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fullyconnected_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedbatchnorm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedbatchnorm_backward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedbatchnorm_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_backward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_optimizer.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_optimizer_sgd.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_pooling.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_pooling_backward.o create mode 100644 
third_party/libxsmm/obj/intel64/libxsmm_dnn_pooling_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_backward_weight_update.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_softmaxloss.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_softmaxloss_backward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_softmaxloss_forward.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_dnn_tensor.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_ext.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_ext_gemm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_ext_xcopy.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_fsspmdm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_gemm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_generator.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_generator_gemm_driver.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_hash.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_main.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_malloc.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_math.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_matrixeqn.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_memory.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_mhd.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_noblas.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_perf.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_python.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_rng.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_spmdm.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_sync.o 
create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_timer.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_trace.o create mode 100644 third_party/libxsmm/obj/intel64/libxsmm_xcopy.o create mode 100644 third_party/libxsmm/obj/libxsmm_dispatch.h create mode 100644 third_party/libxsmm/samples/cp2k/.make create mode 100644 third_party/libxsmm/samples/deeplearning/tvm_cnnlayer/libxsmm_wrapper/Makefile create mode 100644 third_party/libxsmm/samples/deeplearning/tvm_cnnlayer/libxsmm_wrapper/batch_reduce_plus_init.cc create mode 100644 third_party/libxsmm/samples/nek/.make create mode 100644 third_party/libxsmm/samples/smm/.make create mode 100644 third_party/libxsmm/samples/utilities/mhd/mhd_in.mhd create mode 100755 third_party/libxsmm/scripts/libxsmm_config.py create mode 100755 third_party/libxsmm/scripts/libxsmm_dispatch.py create mode 100755 third_party/libxsmm/scripts/libxsmm_interface.py create mode 100755 third_party/libxsmm/scripts/libxsmm_source.sh create mode 100755 third_party/libxsmm/scripts/libxsmm_specialized.py create mode 100755 third_party/libxsmm/scripts/libxsmm_utilities.py create mode 100755 third_party/libxsmm/scripts/libxsmm_version.sh create mode 100644 third_party/libxsmm/src/libxsmm_cpuid_arm.c create mode 100644 third_party/libxsmm/src/libxsmm_cpuid_x86.c create mode 100644 third_party/libxsmm/src/libxsmm_diff.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_backward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_backward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.h create mode 100644 
third_party/libxsmm/src/libxsmm_dnn_elementwise.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_elementwise.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fullyconnected.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_optimizer.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_pooling.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_pooling_backward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_pooling_backward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_pooling_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_pooling_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_rnncell.c create mode 100644 
third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_softmaxloss.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.c create mode 100644 third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.h create mode 100644 third_party/libxsmm/src/libxsmm_dnn_tensor.c create mode 100644 third_party/libxsmm/src/libxsmm_ext.c create mode 100644 third_party/libxsmm/src/libxsmm_ext.h create mode 100644 third_party/libxsmm/src/libxsmm_ext_gemm.c create mode 100644 third_party/libxsmm/src/libxsmm_ext_xcopy.c create mode 100644 third_party/libxsmm/src/libxsmm_fsspmdm.c create mode 100644 third_party/libxsmm/src/libxsmm_gemm.c create mode 100644 third_party/libxsmm/src/libxsmm_gemm.h create mode 100644 third_party/libxsmm/src/libxsmm_generator.c create mode 100644 third_party/libxsmm/src/libxsmm_generator_gemm_driver.c create mode 100644 third_party/libxsmm/src/libxsmm_hash.c create mode 100644 third_party/libxsmm/src/libxsmm_hash.h create mode 100644 third_party/libxsmm/src/libxsmm_main.c create mode 100644 third_party/libxsmm/src/libxsmm_main.h create mode 100644 third_party/libxsmm/src/libxsmm_malloc.c create mode 100644 third_party/libxsmm/src/libxsmm_math.c create mode 100644 third_party/libxsmm/src/libxsmm_matrixeqn.c create mode 100644 third_party/libxsmm/src/libxsmm_matrixeqn.h create mode 100644 third_party/libxsmm/src/libxsmm_memory.c create mode 100644 third_party/libxsmm/src/libxsmm_mhd.c create mode 100644 third_party/libxsmm/src/libxsmm_perf.c create mode 100644 
third_party/libxsmm/src/libxsmm_perf.h create mode 100644 third_party/libxsmm/src/libxsmm_python.c create mode 100644 third_party/libxsmm/src/libxsmm_rng.c create mode 100644 third_party/libxsmm/src/libxsmm_spmdm.c create mode 100644 third_party/libxsmm/src/libxsmm_spmdm_begin.h create mode 100644 third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h create mode 100644 third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h create mode 100644 third_party/libxsmm/src/libxsmm_spmdm_end.h create mode 100644 third_party/libxsmm/src/libxsmm_sync.c create mode 100644 third_party/libxsmm/src/libxsmm_timer.c create mode 100644 third_party/libxsmm/src/libxsmm_trace.c create mode 100644 third_party/libxsmm/src/libxsmm_trace.h create mode 100644 third_party/libxsmm/src/libxsmm_xcopy.c create mode 100644 third_party/libxsmm/src/libxsmm_xcopy.h create mode 100644 third_party/libxsmm/src/template/libxsmm_config.h create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_define.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_undefine.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_custom.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c create mode 100644 
third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_ncnc_reformat_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_matdiff.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_spmdm_compute_fp32_thread.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c create mode 100644 third_party/libxsmm/src/template/libxsmm_version.h create mode 100644 third_party/libxsmm/tests/mhd_image.mhd delete mode 100644 third_party/nanoflann/.gitignore delete mode 100644 third_party/pcg/.gitignore delete mode 100644 third_party/phmap/.gitignore delete mode 100644 third_party/tensorpipe/.gitignore delete mode 100644 third_party/thrust/.gitignore delete mode 100644 third_party/tvm/.gitignore create mode 100644 third_party/tvm/apps/android_camera/app/src/main/jni/make/config.mk create mode 100644 third_party/tvm/apps/android_deploy/app/src/main/jni/make/config.mk create mode 100644 third_party/tvm/apps/android_rpc/app/src/main/jni/make/config.mk create mode 100644 third_party/tvm/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 
100644 third_party/tvm/apps/sgx/Cargo.lock create mode 100644 third_party/tvm/nnvm/make/config.mk create mode 100644 third_party/tvm/tests/python/contrib/test_arm_compute_lib/test_config.json create mode 100644 third_party/tvm/tutorials/auto_scheduler/ci_logs/conv2d.json create mode 100644 third_party/tvm/tutorials/auto_scheduler/ci_logs/matmul.json create mode 100644 third_party/tvm/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json create mode 100644 third_party/tvm/web/.eslintrc.json create mode 100644 third_party/tvm/web/package.json create mode 100644 third_party/tvm/web/tsconfig.json create mode 100644 third_party/tvm/web/typedoc.json delete mode 100644 third_party/xbyak/.gitignore diff --git a/third_party/METIS/.gitignore b/third_party/METIS/.gitignore deleted file mode 100644 index 4796f278..00000000 --- a/third_party/METIS/.gitignore +++ /dev/null @@ -1,61 +0,0 @@ -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. 
Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf - -# GK things -build/ -graphs/*.part.* -graphs/*.iperm -graphs/*.epart.* -graphs/*.npart.* -.svn/ - diff --git a/third_party/dlpack/.gitignore b/third_party/dlpack/.gitignore deleted file mode 100644 index 21c857e0..00000000 --- a/third_party/dlpack/.gitignore +++ /dev/null @@ -1,32 +0,0 @@ -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod -*.smod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app -*~ -build -bin diff --git a/third_party/dmlc-core/.gitignore b/third_party/dmlc-core/.gitignore deleted file mode 100644 index 124d3960..00000000 --- a/third_party/dmlc-core/.gitignore +++ /dev/null @@ -1,48 +0,0 @@ -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app -*~ -config.mk -*.pyc - -# Vim -*.swp -*.swo -*.swn -*.csv -.vimrc - -# Emacs -.clang_complete -deps -recommonmark -build - -# CLion -.idea -cmake-build-* diff --git a/third_party/dmlc-core/make/config.mk b/third_party/dmlc-core/make/config.mk new file mode 100644 index 00000000..a6be9ad5 --- /dev/null +++ b/third_party/dmlc-core/make/config.mk @@ -0,0 +1,53 @@ +#----------------------------------------------------- +# dmlc-core: the configuration compile script +# +# This is the default configuration setup for +# If you want to change configuration, do the following steps: +# +# - copy this file to the root of dmlc-core folder +# - modify the configuration 
you want +# - type make or make -j n on each of the folder +#---------------------------------------------------- + +# choice of compiler +export CC = gcc +export CXX = g++ +export MPICXX = mpicxx + +# choice of archiver +export AR = ar + +# the additional link flags you want to add +ADD_LDFLAGS = + +# the additional compile flags you want to add +ADD_CFLAGS = + +# whether to compile with -fPIC option +# Note: to build shared library(so files), fPIC is required +WITH_FPIC = 1 + +# whether use openmp during compile +USE_OPENMP = 0 + +# whether use HDFS support during compile +USE_HDFS = 0 + +# whether use AWS S3 support during compile +USE_S3 = 0 + +# whether use Azure blob support during compile +USE_AZURE = 0 + +# path to libjvm.so +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# whether building unittest (gtest is required) +BUILD_TEST=0 + +# path to gtest library (only used when $BUILD_TEST=1) +# there should be an include path in $GTEST_PATH/include and library in $GTEST_PATH/lib +GTEST_PATH= + +# path to third-party dependences such as glog +DEPS_PATH= diff --git a/third_party/googletest/.gitignore b/third_party/googletest/.gitignore deleted file mode 100644 index f08cb72a..00000000 --- a/third_party/googletest/.gitignore +++ /dev/null @@ -1,84 +0,0 @@ -# Ignore CI build directory -build/ -xcuserdata -cmake-build-debug/ -.idea/ -bazel-bin -bazel-genfiles -bazel-googletest -bazel-out -bazel-testlogs -# python -*.pyc - -# Visual Studio files -.vs -*.sdf -*.opensdf -*.VC.opendb -*.suo -*.user -_ReSharper.Caches/ -Win32-Debug/ -Win32-Release/ -x64-Debug/ -x64-Release/ - -# Ignore autoconf / automake files -Makefile.in -aclocal.m4 -configure -build-aux/ -autom4te.cache/ -googletest/m4/libtool.m4 -googletest/m4/ltoptions.m4 -googletest/m4/ltsugar.m4 -googletest/m4/ltversion.m4 -googletest/m4/lt~obsolete.m4 -googlemock/m4 - -# Ignore generated directories. 
-googlemock/fused-src/ -googletest/fused-src/ - -# macOS files -.DS_Store -googletest/.DS_Store -googletest/xcode/.DS_Store - -# Ignore cmake generated directories and files. -CMakeFiles -CTestTestfile.cmake -Makefile -cmake_install.cmake -googlemock/CMakeFiles -googlemock/CTestTestfile.cmake -googlemock/Makefile -googlemock/cmake_install.cmake -googlemock/gtest -/bin -/googlemock/gmock.dir -/googlemock/gmock_main.dir -/googlemock/RUN_TESTS.vcxproj.filters -/googlemock/RUN_TESTS.vcxproj -/googlemock/INSTALL.vcxproj.filters -/googlemock/INSTALL.vcxproj -/googlemock/gmock_main.vcxproj.filters -/googlemock/gmock_main.vcxproj -/googlemock/gmock.vcxproj.filters -/googlemock/gmock.vcxproj -/googlemock/gmock.sln -/googlemock/ALL_BUILD.vcxproj.filters -/googlemock/ALL_BUILD.vcxproj -/lib -/Win32 -/ZERO_CHECK.vcxproj.filters -/ZERO_CHECK.vcxproj -/RUN_TESTS.vcxproj.filters -/RUN_TESTS.vcxproj -/INSTALL.vcxproj.filters -/INSTALL.vcxproj -/googletest-distribution.sln -/CMakeCache.txt -/ALL_BUILD.vcxproj.filters -/ALL_BUILD.vcxproj diff --git a/third_party/googletest/googlemock/build-aux/.keep b/third_party/googletest/googlemock/build-aux/.keep new file mode 100644 index 00000000..e69de29b diff --git a/third_party/googletest/googlemock/make/Makefile b/third_party/googletest/googlemock/make/Makefile new file mode 100644 index 00000000..386293a0 --- /dev/null +++ b/third_party/googletest/googlemock/make/Makefile @@ -0,0 +1,117 @@ +# A sample Makefile for building both Google Mock and Google Test and +# using them in user tests. This file is self-contained, so you don't +# need to use the Makefile in Google Test's source tree. Please tweak +# it to suit your environment and project. You may want to move it to +# your project's root directory. +# +# SYNOPSIS: +# +# make [all] - makes everything. +# make TARGET - makes the given target. +# make clean - removes all files generated by make. 
+ +# Please tweak the following variable definitions as needed by your +# project, except GMOCK_HEADERS and GTEST_HEADERS, which you can use +# in your own targets but shouldn't modify. + +# Points to the root of Google Test, relative to where this file is. +# Remember to tweak this if you move this file, or if you want to use +# a copy of Google Test at a different location. +GTEST_DIR = ../../googletest + +# Points to the location of the Google Test libraries +GTEST_LIB_DIR = . + +# Points to the root of Google Mock, relative to where this file is. +# Remember to tweak this if you move this file. +GMOCK_DIR = .. + +# Where to find user code. +USER_DIR = ../test + +# Flags passed to the preprocessor. +# Set Google Test and Google Mock's header directories as system +# directories, such that the compiler doesn't generate warnings in +# these headers. +CPPFLAGS += -isystem $(GTEST_DIR)/include -isystem $(GMOCK_DIR)/include + +# Flags passed to the C++ compiler. +CXXFLAGS += -g -Wall -Wextra -pthread -std=c++11 + +# Google Test libraries +GTEST_LIBS = libgtest.a libgtest_main.a libgmock.a libgmock_main.a + +# All tests produced by this Makefile. Remember to add new tests you +# created to the list. +TESTS = gmock_test + +# All Google Test headers. Usually you shouldn't change this +# definition. +GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \ + $(GTEST_DIR)/include/gtest/internal/*.h + +# All Google Mock headers. Note that all Google Test headers are +# included here too, as they are #included by Google Mock headers. +# Usually you shouldn't change this definition. +GMOCK_HEADERS = $(GMOCK_DIR)/include/gmock/*.h \ + $(GMOCK_DIR)/include/gmock/internal/*.h \ + $(GTEST_HEADERS) + +# House-keeping build targets. + +all : $(GTEST_LIBS) $(TESTS) + +clean : + rm -f $(GTEST_LIBS) $(TESTS) *.o + +# Builds gmock.a and gmock_main.a. These libraries contain both +# Google Mock and Google Test. 
A test should link with either gmock.a +# or gmock_main.a, depending on whether it defines its own main() +# function. It's fine if your test only uses features from Google +# Test (and not Google Mock). + +# Usually you shouldn't tweak such internal variables, indicated by a +# trailing _. +GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS) +GMOCK_SRCS_ = $(GMOCK_DIR)/src/*.cc $(GMOCK_HEADERS) + +# For simplicity and to avoid depending on implementation details of +# Google Mock and Google Test, the dependencies specified below are +# conservative and not optimized. This is fine as Google Mock and +# Google Test compile fast and for ordinary users their source rarely +# changes. +gtest-all.o : $(GTEST_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \ + -c $(GTEST_DIR)/src/gtest-all.cc + +gtest_main.o : $(GTEST_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \ + -c $(GTEST_DIR)/src/gtest_main.cc + +gmock-all.o : $(GMOCK_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \ + -c $(GMOCK_DIR)/src/gmock-all.cc + +gmock_main.o : $(GMOCK_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) -I$(GMOCK_DIR) $(CXXFLAGS) \ + -c $(GMOCK_DIR)/src/gmock_main.cc + +libgtest.a : gtest-all.o + $(AR) $(ARFLAGS) $@ $^ + +libgtest_main.a : gtest_main.o + $(AR) $(ARFLAGS) $@ $^ + +libgmock.a : gmock-all.o + $(AR) $(ARFLAGS) $@ $^ + +libgmock_main.a : gmock_main.o + $(AR) $(ARFLAGS) $@ $^ + +# Builds a sample test. 
+ +gmock_test.o : $(USER_DIR)/gmock_test.cc $(GMOCK_HEADERS) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/gmock_test.cc + +gmock_test : gmock_test.o $(GTEST_LIBS) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -L$(GTEST_LIB_DIR) -lgmock -lpthread $^ -o $@ diff --git a/third_party/googletest/googletest/make/Makefile b/third_party/googletest/googletest/make/Makefile new file mode 100644 index 00000000..b62da67a --- /dev/null +++ b/third_party/googletest/googletest/make/Makefile @@ -0,0 +1,88 @@ +# A sample Makefile for building Google Test and using it in user +# tests. Please tweak it to suit your environment and project. You +# may want to move it to your project's root directory. +# +# SYNOPSIS: +# +# make [all] - makes everything. +# make TARGET - makes the given target. +# make clean - removes all files generated by make. + +# Please tweak the following variable definitions as needed by your +# project, except GTEST_HEADERS, which you can use in your own targets +# but shouldn't modify. + +# Points to the root of Google Test, relative to where this file is. +# Remember to tweak this if you move this file. +GTEST_DIR = .. + +# Points to the location of the Google Test libraries +GTEST_LIB_DIR = . + +# Where to find user code. +USER_DIR = ../samples + +# Flags passed to the preprocessor. +# Set Google Test's header directory as a system directory, such that +# the compiler doesn't generate warnings in Google Test headers. +CPPFLAGS += -isystem $(GTEST_DIR)/include + +# Flags passed to the C++ compiler. +CXXFLAGS += -g -Wall -Wextra -pthread -std=c++11 + +# Google Test libraries +GTEST_LIBS = libgtest.a libgtest_main.a + +# All tests produced by this Makefile. Remember to add new tests you +# created to the list. +TESTS = sample1_unittest + +# All Google Test headers. Usually you shouldn't change this +# definition. +GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \ + $(GTEST_DIR)/include/gtest/internal/*.h + +# House-keeping build targets. 
+ +all : $(GTEST_LIBS) $(TESTS) + +clean : + rm -f $(GTEST_LIBS) $(TESTS) *.o + +# Builds gtest.a and gtest_main.a. + +# Usually you shouldn't tweak such internal variables, indicated by a +# trailing _. +GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS) + +# For simplicity and to avoid depending on Google Test's +# implementation details, the dependencies specified below are +# conservative and not optimized. This is fine as Google Test +# compiles fast and for ordinary users its source rarely changes. +gtest-all.o : $(GTEST_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \ + $(GTEST_DIR)/src/gtest-all.cc + +gtest_main.o : $(GTEST_SRCS_) + $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \ + $(GTEST_DIR)/src/gtest_main.cc + +libgtest.a : gtest-all.o + $(AR) $(ARFLAGS) $@ $^ + +libgtest_main.a : gtest-all.o gtest_main.o + $(AR) $(ARFLAGS) $@ $^ + +# Builds a sample test. A test should link with either gtest.a or +# gtest_main.a, depending on whether it defines its own main() +# function. + +sample1.o : $(USER_DIR)/sample1.cc $(USER_DIR)/sample1.h $(GTEST_HEADERS) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/sample1.cc + +sample1_unittest.o : $(USER_DIR)/sample1_unittest.cc \ + $(USER_DIR)/sample1.h $(GTEST_HEADERS) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(USER_DIR)/sample1_unittest.cc + +sample1_unittest : sample1.o sample1_unittest.o $(GTEST_LIBS) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -L$(GTEST_LIB_DIR) -lgtest_main -lpthread $^ -o $@ diff --git a/third_party/googletest/googletest/scripts/test/Makefile b/third_party/googletest/googletest/scripts/test/Makefile new file mode 100644 index 00000000..cdff5846 --- /dev/null +++ b/third_party/googletest/googletest/scripts/test/Makefile @@ -0,0 +1,59 @@ +# A Makefile for fusing Google Test and building a sample test against it. +# +# SYNOPSIS: +# +# make [all] - makes everything. +# make TARGET - makes the given target. +# make check - makes everything and runs the built sample test. 
+# make clean - removes all files generated by make. + +# Points to the root of fused Google Test, relative to where this file is. +FUSED_GTEST_DIR = output + +# Paths to the fused gtest files. +FUSED_GTEST_H = $(FUSED_GTEST_DIR)/gtest/gtest.h +FUSED_GTEST_ALL_CC = $(FUSED_GTEST_DIR)/gtest/gtest-all.cc + +# Where to find the sample test. +SAMPLE_DIR = ../../samples + +# Where to find gtest_main.cc. +GTEST_MAIN_CC = ../../src/gtest_main.cc + +# Flags passed to the preprocessor. +# We have no idea here whether pthreads is available in the system, so +# disable its use. +CPPFLAGS += -I$(FUSED_GTEST_DIR) -DGTEST_HAS_PTHREAD=0 + +# Flags passed to the C++ compiler. +CXXFLAGS += -g + +all : sample1_unittest + +check : all + ./sample1_unittest + +clean : + rm -rf $(FUSED_GTEST_DIR) sample1_unittest *.o + +$(FUSED_GTEST_H) : + ../fuse_gtest_files.py $(FUSED_GTEST_DIR) + +$(FUSED_GTEST_ALL_CC) : + ../fuse_gtest_files.py $(FUSED_GTEST_DIR) + +gtest-all.o : $(FUSED_GTEST_H) $(FUSED_GTEST_ALL_CC) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(FUSED_GTEST_DIR)/gtest/gtest-all.cc + +gtest_main.o : $(FUSED_GTEST_H) $(GTEST_MAIN_CC) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(GTEST_MAIN_CC) + +sample1.o : $(SAMPLE_DIR)/sample1.cc $(SAMPLE_DIR)/sample1.h + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(SAMPLE_DIR)/sample1.cc + +sample1_unittest.o : $(SAMPLE_DIR)/sample1_unittest.cc \ + $(SAMPLE_DIR)/sample1.h $(FUSED_GTEST_H) + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(SAMPLE_DIR)/sample1_unittest.cc + +sample1_unittest : sample1.o sample1_unittest.o gtest-all.o gtest_main.o + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ -o $@ diff --git a/third_party/libxsmm/.gitignore b/third_party/libxsmm/.gitignore deleted file mode 100644 index 5429e7db..00000000 --- a/third_party/libxsmm/.gitignore +++ /dev/null @@ -1,96 +0,0 @@ -My Amplifier* -VTune Amplifier Results -libxsmm*-* -libxsmm*_* -opentuner.db -bin/libxsmm_generator -include/libxsmm_version.h -include/libxsmm.f -lib/libxsmm* -lib/module -ide/GPUCache -ide/_vs*-*.bat 
-ide/.vs -ide/obj -ide/r*ah -samples2 -samples/*/bin -samples/*/*.sln -samples/*/*.dat -samples/*/*.pdf -samples/*/*.png -inspector* -licenses -bazel-* -python* -html -site -bin -tmp -obj -.couscous -.vscode -.state -.make -.vs -threadsafety-*.txt -malloc-trace-*.txt -blas-trace-*.txt -codecov-*.txt -keywords.txt -notes.txt -err*.txt -out*.txt -log.txt -_*.txt -.env.sh -.env_?????? -.tool_??????.sh -.libxsmm_??????.* -*.lastcodeanalysissucceeded -*.amplxeproj -*.advixeproj -*.inspxeproj -*.stackdump -*.opensdf -*.opendb -*.VC.db -*.dylib -*.sarif -*.docx -*.user -*.tlog -*.gcno -*.gcda -*.gcov -*.html -*.iobj -*.ipdb -*.URL -*.log -*.suo -*.exe -*.zip -*.pyc -*.sdf -*.ilk -*.pdb -*.vsp -*.obj -*.lib -*.mod -*.bin -*.jit -*.smm -*.soa -*.csr -*.dll -*.mhd -*.out -*.err -*.so -*.o -*.a -*.i -*.s -*.*~ diff --git a/third_party/libxsmm/.state b/third_party/libxsmm/.state new file mode 100644 index 00000000..720e0337 --- /dev/null +++ b/third_party/libxsmm/.state @@ -0,0 +1,68 @@ +"ABSDIR=/public$HOME/dgl/third_party/libxsmm\n" +"ABSLIBS=0\n" +"ALPHA=1\n" +"AR=/usr/bin/gcc-ar\n" +"ASIMD=0\n" +"ASNEEDED=0\n" +"AUTOPIN=0\n" +"BETA=1\n" +"BLAS_CLDFLAGS=-lm\n" +"CACHE=1\n" +"CACHELINE=64\n" +"CC=gcc\n" +"CC_NAME=gcc\n" +"CC_VERSION=8.5.0\n" +"CFLAGS=-fPIC -Wall -O2 -fopenmp-simd -funroll-loops -ftree-vectorize -fdata-sections -ffunction-sections -fvisibility=hidden -pthread\n" +"COMMAND=/usr/bin/command\n" +"COMPATIBLE=0\n" +"CPUFLAGS_X86=fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap 
clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev sev_es\n" +"CTARGET=-msse4.2\n" +"CXXFLAGS=-fPIC -std=c++14 -Wall -O2 -fopenmp-simd -funroll-loops -ftree-vectorize -fdata-sections -ffunction-sections -fvisibility=hidden -fvisibility-inlines-hidden -pthread\n" +"CXXLDFLAGS=-lc\n" +"CXX_NAME=g++\n" +"CXX_VERSION=8.5.0\n" +"DBG=0\n" +"FAT=0\n" +"FCFLAGS=-fPIC -O2 -ftree-vectorize -fdata-sections -ffunction-sections\n" +"FLD=gcc\n" +"FLUSH=stdbuf -o0 -e0\n" +"FORTRAN=0\n" +"GCC_VERSION=8.5.0\n" +"GLIBC=1\n" +"ILP64=0\n" +"INSTRUMENT=0\n" +"INTRINSICS=1006\n" +"IPO=0\n" +"JITDUMP=0\n" +"LD=gcc\n" +"LDFLAGS=-Wl,--gc-sections -Wl,-z,relro,-z,now -lm -lrt -ldl\n" +"LIBATOMIC=0\n" +"LIBC=-lc\n" +"LNKSOFT=1\n" +"MAINTAINER=0\n" +"MALLOC=0\n" +"MIC=0\n" +"MKL=0\n" +"MNAME=x86_64\n" +"OFFLOAD=0\n" +"OMP=0\n" +"OMPFLAG=-fopenmp\n" +"OMPLIB=-L/usr/lib/gcc/x86_64-redhat-linux/8/ -lgomp\n" +"OMPRT=gomp\n" +"PERF=0\n" +"PLATFORM=0\n" +"PREFETCH=1\n" +"SONAMELNK=2\n" +"SPACES=0\n" +"STATIC=1\n" +"SYM=0\n" +"THREADS=1\n" +"THRESHOLD=0\n" +"TRACE=0\n" +"UNAME=Linux\n" +"VISIBILITY=0\n" +"WCHECK=0\n" +"WERROR_CFLAG=-Werror\n" +"WRAP=1\n" +"XLD=g++\n" +"\n" diff --git a/third_party/libxsmm/.theme/main.html b/third_party/libxsmm/.theme/main.html new file mode 100644 index 00000000..bc42330b --- /dev/null +++ b/third_party/libxsmm/.theme/main.html @@ -0,0 +1,14 @@ +{% extends "base.html" %} + +{% block site_meta %} + + + {{ super() }} +{% endblock %} + +{% block footer %} +
+ {%- if config.copyright %} +

{{ config.copyright }}

+ {%- endif %} +{% endblock %} diff --git a/third_party/libxsmm/bin/.make b/third_party/libxsmm/bin/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/bin/libxsmm_gemm_generator b/third_party/libxsmm/bin/libxsmm_gemm_generator new file mode 100755 index 0000000000000000000000000000000000000000..b69afe7e5ca42e5ced46197f316127e1df29c569 GIT binary patch literal 297960 zcmcG%34B!5`9C}f0RjYWRFkYD9z_1EJW7|W78{}g*hc)H^6>7LU( zosefqG5+VjVmswL3o;eld1kAurvsj>S8bP9XBH#Jd@&!MzVbKqbU5U*qg|#R#dc0T zmo8L%I!})$U&>jJysloR%2zKd7r67BL5A>eZjyMuG>GY4S z@c$Kd`HIY9e)pK;eYv?YnM;5+dC1Tjy#>GGk)h+U3>L~!_KcNKd=Egf;; zc^3{FQnP5t1=4TQC+^q@Q>7*L#_p<=_e7pBmOpmnp@#0U%YSV8W7X{08D9kZ%^v+M z>pK4<`G5WIV_=?q9PAMwc073P3GllCo_`!V&jWVSap3m?c0BrfP5`H=9uI%@3Gm-I z0es*I=$wB7`0f+X*?I!_Z6|>DJ^}r;C%~U{0{o39z}HTI-}waavq2{x|F{456adGQ z^LrMGNOonRj*7e9xRY*Id77;hdV#yv3n8a}?@v5kYQoi1Z3cxx7kQERxls z1&im;D`&pIQxRTRwW#d6ifci-BAEhKnPR0Tw79JLM$i1E*M>Y5i>l`@^Z*iCRIVOX zo{DSchiW`!^QutSbN&45YvzYgRK6$-^y)B*D$1%B)y(%?uP}f;e_^esrlxG(!U`hJ zU%c2;v1IYJAv8~}P;2qE3qvehTy}jmx=UW{?p{CdTDD(Jj%pU#h^RK-b6oEuJRRjIq_9_X0m+OugVEd&fAi~pEq6b z^Y{U6C!UcEcY?c+f;1F!`U%&42Qoqsc2@Eiw-=Uf*Y1DpJ} zzyLmXG^He>nTJmo*PnEOgN`4RXR5hzq^3O9*6|;&Z zzlC|KmQ^76^~_VHtbED;lXtg^6_#N{!WoxhG?_!=R zWbKmtubHRnSWS|@jd`k!^>@kN#5`5SdRX#{nWu_aDa=h$m`P+ zW53d;=~MM7Qzr-NKi`Rh$e|2v`RNd8AU-WO5TB4|91R!;0`Xh(gYk;&!q$K>#aq0d zA_*8lZ@`>^ZUo{t4=il8fj;jDbF=~$xPS%b5Ct6J0uD4!Rls5!SYou8ofIe_^w^-! 
zn`eFlcF|0!%IBLuQ~4Q%t>0=V`MlX?t^&-p2?;dUT(6)DlH|AR#t!pQK>e{VHWi~6 zLE}@O=5N)XtwSc(6i5u8I=h3%r}<3tDxk%hwAdghT;%A|>f zWQ)%`gGRGIn<*s7b~U=y0L*~#Rlun7mYNsagdS#jV)#N;W}a<>4F$f^2ImSmR13_Y z4LbTGJ0B_oXp{~5Oo2)P8e)UqP@qu&^|wLW6zCj)a&6E*6sQkC88+x40co)b@F#)z z6LpH8&t^#%%sN#XJw**z?<*AeT7Y$93)t!hQcX7UWSdk6kkZ$qhUZ%?cZxT=LL1WS ze}EI}!PH|&X|eZJ=`3$9VEumUl=fRLrOhby7d8dr%e;C1Sep;lG|O)^`{J{_GyGb~ z%K&L5ErG-`?_7VE@Ab%$3@v&A8bm5ni&|`da$&1JH6HX9Ow>!k1`_z|^A7aIeck}z zrSU1=8D{>!Gd%FT`945ql^8pGu-4r1vDVO~=+|OfJ#YS@tt}q%=2FXhfi?)0ORomJ z1&nYXaH!pRJ6Mi@U5|F>El6tpJ!S-YxeoZAK>ap4jVrIxXX#hzv-LS!LH%@_dOuJH z8M2u6*RsLKnej4O0oGr zL!kJ&zMkE)dcl4{u*9Z64%BRUMR%#9BD<)-RrHK1I?pcZg(Av<%PJ77n68*Tc36SG z1Guos`Ufh}+@#$V=&e3^6O9mKJ;hrv>A#Kju;-laa-0zJ@1QHz=^HFpt z)C-}^g-k}@zm$(b#{$jAIs}Zb1I^|ofs7r2-N!;bL7Zf0%&AC!q|&!A{iaHvN}Lx|+RJp4N)KXsjY^-%^lFvv$Ml0L z-G}LwF8ruUUqtv)mHq|ORVqD#>A5QXGx9J+rKd6-P-%ndkt+Qx(*-Ksoprq`{XOA% zDt$BIIV#;Y8EKD7_aOYc@1^g<2{%=`nCW*_+RyYZmA-`OZ7N;F^g5M(hv_F(`VFQZ zR_WFdd7wMt9B%2oO?@-{=Ie@mR8O5er0#VY+rq!}Pv1;`@^ zGk(AzR!t-D^S}XZQ78OAAs2t9<>Sxn0&P&1R~rOZt__;v&DRFa@)nrSe1~uba|(3y z*zIoFpof;%`;2~mR(=& z&HEMSio+RN!w_KCAAT9pzqb4WP_I9H41cwTGeF#@t$m$SY^W>#cGhBD0YmHAdSqY! 
z$j6$m%Zu=L;LXpOaw`Z0j8E9R(-y$j^g|0`*WfBcTeT(Jh4PENgud1v{!)o{i_dtC z(%lcBz5xUv<2_pJYdGqZJuB`o5L=^gH~#25TI>a2M$tRTY!j}(6^i$4L2I2jmHDI3 zhfYrX`6T%1`lEZLOHbghFY((f&LZ_cydtnW(WUysyU~4A+5(y#!9jSiY6+~I1Xk(~ z4}*Iy)>a`*w!mLBgi4HSqt_vU@gw8bMZlE)!h>fcJ~;}YNVu<%V2_Knd$$+vDQuFu z10Wv~qnz{&;{Ol9aaXLwNM~!yxr=9y^pdNRqg)JW%P7YfL+b~(a7#jqc1Nb9rP-x^ zMH#hS;KxP%G-%(au)0m_ZSDK@-QbtAgzy_^X-Za%jF5U;q+VByC8|Ejs=q~5a}w&= zS_^%bQGXu{*`xj{R$Lb2TzejXM(4F|tJ9*b;I8hYUvb4^eTK9%Pq^%b<^q0qNV`2$ zyTjP-{cQK`yWH)Tv}{fCt4A)9ip^57i>mkoyvL?>toX-N#gsldQj0xri?9y*9(x=a z3e2(K$R#jf%!?$f$Fm`aM@hC3WU1m~LD~8xG^t``y+!q@Km3U()I%7z|HfgW*DTVx z$EMTv8_@X{({hSGjBn>qrBXU-vW$21EC^a@C5(s>Dy5ZV_Tn@tBzhg1gf>>$Z9Kp> z?qC~gyp@q|e=@^kEd> z{R~1~YXlG|Pl_;lJ;t6AbGXuNZE{96(Q6*r`j^dC7TG$8DA3ji)vk3wAU^Sk*{eH; za_SEspk$-nU?{6Iv{(l`cbmKE9aEe>o1#O#ga6DPK(1Q1WQ0!9H{_#73^wltnzaS} zz^rh=?$2CAP{CS?QnNdHt2BCt&wD;QZm!2Fz#5N2MaU{eGBsZM6nz~%K#B2ViSZ2r zmX`v?YnZPVUEaZisq6Wm(OjW@)r6@l|G?w2EDy9~c~Jp@7FcCMyRWG)MpuiiM20}M z#QiWxTAxwhmQ$1Q`~yu8i(Di>Is_%ab0g;x1^P`$Y3HfHMnv8-)*8M6D*%Momw*}b z+b=Od#x_ieT9ke`5FhCcVC8=f;9Mwt`voWhW*SUjnZy+g)-Iz~Pr%e`ypEor2ls8Q zd&d)fC;SUC`UQr@nuHNa$pbve90-%qOVMiIJJFIu41NRjX1VC;+I?H}y0_{1LxWM~ zOE251qc!;Xa4DOkm$yDtddNJzs`ZM>2jktJo9FecpM#!BkPl*|AXXCZV&C*uW;3O^)K9=xJQZGAGkog zCLJ-^SwZAPY;irIL=W^Z&^ZEIATOV`X^W11sZ*OGdwX^K47lQ8cK|=d8}}2TpN&dC zzYZ8Xm3|%#8qZAz!G8xSfLExS722YX`yj3Zk2~sV>nd2LxKyj0o}WOD9Z^cU17scr znMQ6Pk~yWy(Pxk?F+K_!I|9br)UIvR1LHj=2O&(g^;75033fj~!JPCt`?@_q{XB;1 zj(+}nKjB}QllY{5TAv}|s1yh~`t}>h$9!_dPxQ?k$8PRGrw=^t9uFV8@dd>^3@Yr* zOYwg7bD&38fdfa67J$7#yqmd;95@X-_!#eW`pz(lK8MoFw&`{6LZ3ehi}8y-uTy$M zUPPZ)t9&rt@$l?+`dkb?O5)jO0DM?SlKR%u)rK=pq4O~?ZI>Ot<7cul(Pf4$YF;2n z^mt&B?zZ&mPQP4w{3F#fG3pQLh{;Z-^!ODZLjJ)0yTX-`Xx8iALgck+H@0;Ar|v!JIsHIQ$89TWjTVlM17#5qdL1Qa47F#=W@($jR?Sm%G)e}!J)B~Bw zn{BWPMsty)SxXQiLSq{-l>XQ|TJ&oq^mW|og$t%p1mTRDGdzd~KS!h5@&^HvO(9== zTz27}fbn9$2zj&N+rh-aVcr<1F#*XZxC z_wP5p2^en$jW4G{Te&5bAK9J}z$}%kx4ogy)UPPIE_=!A(5!*3O(R>sz$qtWeck!| zTmJ*~Rf|53(9>>Pi(;pPJ^A*W4387+{(?SP=KGSC?XD^L7EeZPx004^Dz3@#vQ2pE 
zgBTPEN>c4xW!S7_oQJM9^a8o~Ft7I1*RfVD#$C+xj>l!lhIvbj_lyH}oWU~><1=(P zXna$N!T*xGZn=@I86p67Jbr8Tj4O)XU-D|uIN*!llG8dm56c~{hkV*oH|OfwQ*C;D zYIZzaENqs&d?lzQch#(amS%b?R$_d5ay1q(tf283X=8^TW9GXZ%N}##LX0l=71?-C ztY`qNZ{v_@4S(W5G6w%G67_8=%s7QT0XaW*ER46L17D(5^W%#$0QY%&YHMLAwc?-ZV#}X+O3w^JdP5=p1=<*Vcj&E`=sz2;(&;A$6^8E9e^VXv{DtqZKmC2iEHs?ds7m-n>2f~A3r-+Z5w6lVXby&rD9KVA zq{;>LIHe#21iRE0fl!|1_0G{}=#i~izM{TfEw&KUiIzv1^tV2xFImY^v;5*i>L6Dt zW|ybXpT}UI(jN$ym2$BxwPmP=YHJmTiV}oj^sGN7c6AbljQyv;xTLCBNzd^Ay9m_K ziwGc!r1;30_llW%WD6wF6B1Yi2~|dr%1OZHbw6O+gO_F) zIA1Q6B71)TrPxi#=<&SB50_}m??Afnxz??kc2D#E0n|UGSmE&`qSgI?mOlV_M3}#p z@dI3u)|I(Ri?A**$TcZrRcXy}1H%P}F(M3Z$Y zS!W0!!cXTV+X|nHy@Lk8joBgDMjaUN)Zr{6)>}{mdG-4|0BSKF`$<);cDt}=%?3pL z`UdXr7CvW<7eKMADAZevPp!k~A-Vt0dOX!{o+1{$W8FX;V>cWgT+Fn7 zy4sYh4I&}aVC*1hB{k@Gq732M$YJPAM`%mrFqEhe30a^;4rhl-3!5T`bFjVFytl*t z(IUrW^}AK|Ur6;kky!@^O&X!o?E0NTxgfF`P|rG^LTLULSWq6c#0QNtcIl$E|3-;k zcL+Q@&s6@Izs{LFX0&%!OG*3n*%i?Z~<3Ct0CTHiyw}sDvy7@e=8p%PM zbu0BFQJ=KErN}2mF%9fd6lFH?Y@B)`8SfF<*BRa{#|i?WVr5|yhDdFIo{wT{KK|JJ zhHl1iYthkZymm$TI!0t!(*qsk z60R83WFsfQ!~kr^avk0`gqd(!Ukp2~XH~Pd_ALZ%eS zxTG%S8X8ZzXJBmM(qk@+3y>{LIA{)-&ggfFH_I2lK2OdMqPL-OAA`|Ph`{YcV{7D4 z2W|NcXfKcmdW!>zDTw3W^BHdh6D3)}gwGq~g_?ZcuNi1w9YYh#yaAlQ#~RRjiE9f~ zkElJ+)RuSo#aPljESrsAnfEP23-#MLaH$oJ^4qQ3;a|3bnbz&_yxgpINpUk}42j5f zC_JkvKVzea-&zA)#=BTk^bH!vxJ9A;DicG3h&W^A0)`4avEF2^txP|~{&u1jbh~F@S#y*^y z+mgKpH{e9Flv-))woaiu^kfEw{_F9X-0m!8`c|ZcDx?6a1YHSdqeBz(T6e$$e*vMk z?m(!u0{*9U2Zm`VtWGVdtntQPR01u?@d)ZpBdI*|#;+h7j;elzer4-67~AFO9~T3C z{oE0F376|IUJs+UZ19{3nI0|*e7LSs_#7a5X?%ru6-srP@-XCoN5&c9f9qgB=v`og zf$x;u5J1TqK_=Zj=o{$*mqdk@J`tp8l4uD8niSo^)SMd7#T1?oe2NC853AA~mi7W| z5tQn)>aRHmGlVT=i0F^#W$QMG=zi2V?x?(sH7;eRh)fo7p)DEGBj&!c49^_nEqw;` zjq3+dHVpPb^K59FEEsH+D|%ClZbe8XD%TYQ!qcTFrz78>rlgWxn&^w;)~#CXw7ZI#DyK!z=>1amzIMEW(m>Enj^Y*1DnFXc9NS2BPDxsnWlk5?5 zhT^PZ)R^L3@KY=bA6O!C53m?DDShVsrF@72!r^!+>6nADxh^8UVkT87wiSJ0KcLcD ztP(E8x{H1#dB;I-#WbN|Zbok+hel|z+mVo9-g*|Pm0(oHRrwJP^ct^>@X#@t*encs 
z`xB-He^YT7Lx0mHc7Y95bvSF7b5KNm`Dw|=ew3W3shucfQ)jntO0%1ynWD!+jFS*%n5%vU`G3k>%6AG`@SL zccUY+q6XIQFp7vr96kNK1l4TMw%3(@l+M8Ra`z)Gnuu0?@u=#4J167n_O+u>HRw(6 zw&;ik9?@;-rO;ZTx}EM-+`sNQ3jW_o5^VZDa?ZFXGAtwCdlZoH}2 z@1?>NH+`8E>dNC^Ep{I~FTxN;c_?87k`}>q(d9UD@<$Gi>nET4D|A7j~r=3Mw< z(Af!w3itRDqcORz@NPpc;mc?{_ziZUoAg9=hCZlCFZ!mYr^G-0L}^B3KVkl&=a+mV zF|Ktnt`*tqDGck#j9K0h_O$3jXjZJ|1F)h`%Dn^$bM-i{b}So;IQNfsocq)b7@$+YOltmUZNUWoDtcY_DdNuF zxE6(73TTE3XZm!KD$Juwn0Y_oFG=-FyXy^<>+w;w?0+>#Y~*cCP_CB6|Q z)_h0aww!&uiKIrK9+j)!o(I1fv>VF?b7I)uMzlUm$+LA^ofh2%qUzBMw;2izu+HWn}%|_yN=$^F2Gz*;si!RAWo|M6tERh?0F zgX~7IH4(r|3(=a`A-WIpJ1!cNy?EqB)3UV49U%OZOL(|dlw$fD>;_>CmHh6w{{MsgZXg*^;jtj|ey$g^3#Ye^82`o z^1I9>zx|hp{Cb|R8Zu#lcdEKVOcv+t|9@8t1jgRV6{KgB|ym@UhobrM| zVwSgXE}8w5UXYUtyHv{;c>^_`;8{ z(F1RmkL7e)_BS4{Y(K z`9SQIggTmy&tZy3wm@b7VrNIF>;uSRqMpK&-Ar-|%wiS|56az)4(H-l#oFh2cSzmM zI1CvmBwQZ)0=0o>K4TLtHpej)*|s2Uqovqmo_SLyoR%LOLgU5l=W|m6dtx|M6p>IO zym1H-VXXvd3@mjC=9%Bwf|+pz1fxZJ!O=Ow!8wrRlz_D8VJx690vVw*@wbB(vye|o zitkcbuXBwh@=6t43K3e*q5y0(iqR%|%{~vEpIM3BUYKgL^zpo_N8wt=3_$OrP&bfjSkCH2mUbuX~~8$E2Z z+4QPX^lnh}yvLz8CPfddeNWb2MbBPL^=u@1CQ>s`11>jQ3gTfOc;eR8F!QlrU=|Js zpcvUWrcbhJeZh_=Cqv9T`t%ax4GQ5UzJf=?4MGCE1CCQ5Eb;^WU=lcQuj1yAw(5EjoV=)uSLsMvK-03sw9{> zFUbt_n9RVfkeOb1fSDc+Gbf22JA6FJC5S)o(nQb4&_0f(w&I*1Q&u*oLN2%m6VB3} znwkq~1a;$82nPzu4wRiw3mXQ_c;CEcVkR*0>K^Zh`{UEIUHcw-{PJO_H4=FGBvLWE zpe7*M6mUt|3nb7RAb)&Lc4Y?9>>6tI#AVcEskeKGSV2BpE#ZEj+68>yydOa$2tu)I zsXzDKF(=tjEEy()YTnY;lctqw4FXa6Po6ofA~h0a1EQ1(1)fJP3nN0WibrrW6=@IMp`H%OF#m)w@Gf=OLV23WpfdKuU?6#ZQB{(%3PqwL3 zq?pJ_psJ_Y$}+c{_tOd!P{Bnc7pUg*7q=&Z{+iMB+Jc!YwUz#ciXe>TB8)n=Ci z;VW_MK!I9+kuqv*2-{YDWr&j8XS3mn(lG4qD$c0vgvgJk*Sx<2Oz%I z(|RJ6qo{1{ki8ITpS@bo65H8*hmQ`ZL3L=SjVsLfPsgD;1XRUpwAh27EktE?D?&H_ z0HSmgDS;~wkFn~+YERn50iNlXxTcx;;X7)tRtq3Iq((tJ_JAm|6*@)^&W zhtEyxzSN4LmL&`GluDe!)_xqR`hANq=?Jv>s}9iVqmbvI!;pvhcsfa`c>zh@>BI#t zEfIM?O1e`iQl)7lLGLx%NyeT*yD1Myv^1Tn)O#28tRX^G#)-bf+MOCVf4sB3H3bb2 zPccHL9t4&3o9~{(i2$AHgLV@+&!CUU)E1i`qQD>5Dl-(KQ<>aNWPSJz^AEt`j*qnf 
z3@Cd~I<%CzVp@IQTm{sWx$ZzKVy=&&;&IIt>@=HmR3&x{HvSnN+S*(CMn3(4XNy{l ztt#`hg;q?URlsvX!L2>Nxaw{qm>uR~(0ao{Ro6(%%Q$b{M}XoiI-U%_Si0i_aJ4IUpFXOA|RV z!O$}eOm9P~tPx(44nq7*fsnbr9htwfTjoat)8rud)j~tb;Z?9@Ed-ybsaMT=PP`yN z(~J6)@y0d;O-^Kz3KeA7~b-V;QWWwHc0rmWEj&=NqRcsjWA+gLY_S<8UAj;&a6K^ z);_rrUlYc8@%wZ)eQa;D5a`O{aEX+P8^rr-+EWN923fRSmLpCnt;}FpV$-l45Q3CG z)=lWU=o#V)5JK1#9?9ttaYX2RiMh>lK*M?;^P%9eEB`=dWP9}RZd-utM(Y67c>-uF zn?KHkOu_-mBX&7jtN)bWzF|9|Q`BpF;brhm`!BQO`S^4?AHy>L2^unnkhCpk#Lq(K z63Df1k71IEelWaD(F7GO`Z=0FM-ZI6Z&s5+G9;Fnhzk@IoGiZ3 z@h)MkHemN8ah@mjI&-S5aPj61d^ zXIyDdrlhxqr!$7vjp6^($qQmW$XmC`TqV7;;|o$USGkBFPs2A$k3-{K;?c~s7_UfD zNb6NZA!NoMhfJ4rGBV%aqsYLZW9O%LNs2qQBcn{u+>A|s*RTlcIG|elF+nIfCDo@m zJl>`5=n_V~r_%bRLI$^@Q<{UyQK*5k%h;!IeJMg(^iI0M!D9Zl?i~CNG_XMEOvi#z zR(dopv#-EprH6BtZ_%zROmFjZZ?TT;xVj5y10d3f>(k>aqmw%qtUqQSQh!($q z;MhY_(PWHY7@DHPakM2DOQ$5LCy`PFl;|Hh;MQ3`oo5G2+^pPDzNFpmXb8yK0qRg6 zT}dCcQgR|6`(#a4O5vdS{?FJS*krtq@*Q6a;Cz?+nLa|fpQ*s3H#46A3S%BQW5?+^ z?lo}1VAgysbGG=&e<>zqA^HT;Rs|=@v~^**M6#+MGbHWqM+F+O>0URQ{ZK(128nAu ziFP9-Ou4?&|kz$2b23N<7HzMx7y?_uo-%MM7$Qh99y8i_@&5BpQ6Vz zaKWnp0SRKJe9iZ|#85rGt$H5rfPM)}*4FC%j4jxP(W2YoC$MnDExfJj-BD-+8VauKD5ybn!8@Fh?O8>r6n+G3 zS~(~Xdy3pVHzPi-T#F6>gl~9D0y}Wo{z80lpcUJwcRYnz%ZJS=?ot)l{%&ky*!f(6 z+)>YP$oR%T=&&ByhmB;cryvzvFt5hn%+Q7UbUA3|0h`r3)GV_d&7AHo@? zKEt1wo`KhfB1h5s5Fz)5j*aB6k`-A;DoaIk0cy?mC+1|#QFV@hwjXvmU2xKBIW74U z*JPLls5irIM^;dHGo( ztwfrGvG92_ONEUjVzN8zBX-R@nD+vC#ufgeW1->yYcDE+J9D=5JT`$FFp-wbNR~GP zuM!sF>Y^5#2WJW6nvt5QwvJ5daC9VmMf~PjxGNN$gs=*#m#<@6j~_I^w+LDgt6|>D zUymok*Y`L`CFiMgOr`Y`8RJ6hOR&qIg%@yu=eayl9>ppZR~~~#xi@(Z0WB(^{v3s& z$bLVRb|8)&@>pklVq#VXuEXK2G5kL)2or(Le4)oXaINqKF^t20<11U={Kiq}d7d6G z$qk|*e6``@4!#agtN>noa(*T5H24$VOCVx!jN20!CVgt0!EUzsI$-}Dd!)F! 
zHdJ3nT{I6oi*F5Vnp_G=72tNB&#*R~IiO=cJ3QkPRiwarBV)Nui+f-1W#I!TRP%z+ zsmxfP+dGTZ(CSoe)40-!Xt1KEuR}RD{+mC_)Tcn6Xx6+94YDyBbS}k_?JK%iTYe^n zaJ=y{ZEK)1cxagzHc!{#qd;yN$l;Z4z4?PoeG+OG(A+_C4tAmW3KQ%d-)Sg-`6OUI zB1*`y1aCHFK-xigpxlcfW4v{ar5U6>&`Ll@#Qxz7^vL$i0K8hhuS0$y(u6n|=|B-A zh6o;$N;_SmlZf4dc37ndpDKwih3kmtcqMMMb*9(361D{5HL}q8%6#=y?317mSc%@j zg?8k#4Dp%tmHCCO;}hkp;5Vzh_*3r9gOX6_d_RhML7!VNCNb>ygpElI?gamPD`y|$ z0N!PkeoR&}OG)I42J}!AD|ivE zYld$RVwCcI@t$6P(ex==v_V#O6MA9_31F{4VSH_w2KMAjN?EgYKLT(_fKtJ!(-zJDI<3ejW3d^wR)kj@qYolM8R$iqga0sd&^w)NM7dGW zZotY1VgC&1YW&3f%~HUqmoR)d`bJce(Snh|P=lD<@@ z=iwd}4GpaO2Yss_{1)SPfo-ZVm1&hX>XWI;P~!#b-KO}p&-f;3iZ6cPm?FXLOfmOE z+Z4qFD>KF(u^p;5W|hE8_{3-4c_GG;6LOxT zu)Kqu&(Std>jTZjd?*f29E{xO{deTc45%q1Z-@oP3l&}?+M8cuyi3c?wzc*iqUed9 zaBR2!(r^67c!l!}j+<;d7oF9qu<*p&QyN>3tqoYWuV{aGx8K;$<3aep5hXCu3^;m* zHF-gpX@2-O)YgGeX^tQVJT%7{?2unxup*w-Y)14D+Wm|3u_wpgf>3x)5@wBKJ6|6Kgx;R^DqRq86)1N+CbV zg2hrm9lrvpI1g#QGVfx^WyvbPNR=!_iNv*MCQGYf8y4UCQS@(xo3hD_BbkMCTcCL{ zGlLNgb0nRiH3Pi>-3IJVl@)syuW4E`4?R~}ezD0XG?Doc&?fNYQC^%>@ zPvTudCc<`Al_Wx-m3pRu>WTUVM zv!W@;f2uV4~x)?j1ZvTk|KG zC1qLbf*DQt_TI`0_yuNa!r#T0=elQ3)e(t(HA5Zg|)o@g}NO3uyKiTi+ z{T}C?tRMM#xPSAVEL77@(&q+RbQmr!>*!1Q(AJK*B)P?%J+dnln$UlZ8MQh;)VXDJ zh|S{tmerY|?k%I`E6cjPTe!NI#Z@d`oy}ymByk9j>o*h5&}C!1i5vHLM>dS|a+d2@ z7=*G5zlhvil@Y#x(X!0_O?Wl$tKJ>4qWu&xEtrdtUH&uw`gVpVa`RHyFvlpefrEey zkDekMZf8c9D&amQ>*4a~6g>K~2(Dmbcbj-bZmtIYt$-U6-203tRszhoo$yEnK^TBK z03aCYGd^P6Y@HO@mti+)j9aQjJE1>h9_+;UF1hgp-&(<^VY-jO5b1`=n@pw1_hO#0 zJ9>l}7?6HGK=2$8WXT}DbCT$FM>nRYc<1t)eAU{ZN4<5RXBOe!ywg*8<=J>*FW{ty zHb4?|&g%&}Yz|+HT{OIpWO`i=ddQB-V_QCav8KPB=zbOw+Mp_61Kew%4eocofDtMZ zJfKi=i2Hqi@g=Ux#L|M5(BMKqjJtQCo^$wv{IpK$KLa414qRu%7Gn~L|1w1fWop*w1M4!&7zYYcn~08BCMh1W5XX8 z*dtE@=1<&`iH_mbL!Wli(;ER{?=d7nZ?*Y1T#g5Kja>moB&)($1G8j2z8WA|w`rSd zusH=R6%Jtwhn>6MrWnn*(L1F=^Ru zajHJ4GTR&~OggoE@dwtNs2ZA_JW18fF@O9U`&Ofty?DtgzD}y4DTQHouF{hdNUqr; zl~Mws)P=(n3#6xc7b^W%fi#iIPXv-@&QukhuEn3&<}`OAf%tHRYf@!jv%m1^RD?`+ zN=;OwsO_#Po>MZ(H_gAMWCGlHoEj_rJE2Ux=F_MY-#~MaEg(l<)Y#f>*wq~+8ED>$ 
zO6_F=)u8?O6Z11FGu-2xt|)^|ot2Fz-UTvda%@^0^%;-6iJY}tHvQ|6^t6ng>v(Rg zB>}hm_`}1=;dln(!)toUBvsz2gZd~`Al^eJD|5>qQQrQWfoUK8+>1J!RwDL=kjzsq zMZ@z!(>C;_7Ea=9D1&u+83Ut961+##g!L=Z z5q`+~-7KVxu~WFG+ZT{<54~mV6e$-mb4m@Ls>jC*(WFK&V-oC~r%wlk6?^AX; z{9;b7dCe)ZAPP3xMeEV1$2>_BXmMkys-U1(W@mb0U+_JRTr6pEOJEC|fe68@MrP&A z-j6A`Zke|az7KF2dW%nb;X6}uZo4GN4=H2zw7>oTzG>xTdL*N^pN;o71^HsaT|Lk| zMBGn(?j$w|bnuk`UAu>y=b;M<y}Zt z7f>7uV*n%|^2hJ11GWq`PTSl=Eo!B!fwQqCV|A8U(kIBQ%mKzr(YR`1ELkRM6#FMY z(i#;D4QerZBj{8`tj%r`vcX5&zh=t~4=`|sIz>L~%x^0!e-x*oP`cY~;wp?Q++*D% zk+2750vo{>8&GbK=05IG$!KE7!Wqp+*(OFadI30Y%tlV)E_n_4ee=COI5Nz0uP0!J zeSzd?HtqqYsjoFinlA=xN_e{Hz6ix7EQ|XoB5&-sph^C*o4gK93PXPcC95)tC#<>f zMmu;n0g9_0VLj8f8XqedV;RnQ%2C*8)2!^`HPY|{|}^W zoqrDmgsB`dwHM-Z)(;qxVvtp2w2qk`QF@y;H_6T4fwnc#;m0QUF-V9JEn{;V*)O!; z7uvrhs6{s;uSYi50g53kQn{$g?Z-op{ED}0AC}~1Q2{L$#vTQxa@mb`Uaha}>N7rI zucv0gCV44KaVFD<=>car^jZ?U!s1?{jEBu%%;E2ID$Pa;-?|dwU8%;4e#IWI4ZYC4 z$Iv~sKO%n3`6P&s)1=;*5A2_Cw6z0d#W>8Cl^k|21hJXqXpY5wZ^W6b&(?oZKZ5$G zDkL}nL^6k-dCPbBBR^2PlsTB*%Ld4)NPzXHdS`j8}k5_a~oTZ-Bqjb zBKc@k&BnQ`xdt^^&zhv-V+jv_!t_g->7AAv4L7!LlwEN*dOvDWO72FbyO%hPzMPc5 z(s$SRoK75AeQ2s0+E@#QS`5H#b>E+2Tiv;sv3Wm0b&Y0ak8>F?1Zcg3&9mhGf~_w& zgjf2^{bhZlnB%!%YNg_b`fbyRJ1uoWvEl<&oYL!2YJD!sNPUMKNXpIWIH!IE>r;W& zySRMzWSdKu7gOqMo=npBIeIR7tj@%hA8Jpj#-XJ678cBx02Duvg*`*nIO|z&Di;ZT z{|QIuwxh$LF&h9>u6Pep}R(|LpyY;j5%9T*P?$;4tC0An7gn^D0FQ3NDJq(g`0@ZK1v*7GxSHGD@hEJ zG33WjEPs+08AyK4EXCUXdwQ>B5~BRa&PedG%g#txv(?UUDl^w2qwdSdbhBQRst~ged;|_!-VNyc03tS#0Nr@o8};{e90(}$sxvd%Tv9L2#5k6BpyFI*3akH+`bv2hazmDX*zt<0~>)_sJx zf1g9+mDrQi~1C#S0e&Dxt}jg;?P28^DEIzm2KjqSAkTfKFKQ!jkS z=NLfad5`HWv-MN1mB6h8Hbk9*GdI#Nf}ld)O1?o~;L!SX62t2$eDw-F$TLU0+SWGX z3cd=R!E>3Sp<47sFprJWV)7W^;m|N1Y!(C)Sw|_Wl0khw;W& zcrR`L&GNnsK9>}G8ADO`aaQOY+*9hW$8R~?7mqQj#S*0z{~yEo&knBZumZ)wt5}3+ z0%T~)z=k!#=cz)eygT6{?F*h8QnR zWPML{fKQ2cqm+$ibZDZ)3ebjO=E5V`beKGn!ABFmY6%o6!%0lgD>57p1d(kyMvVQ6 zPtL2vk$F#tu1^Sqs#L_F8M{Tt%u|Fk#5$5P>}GEDK;e0_0$F|$6m2=-j`|>|&)j<$ 
z+>FbPPtIYJs-_u7J?j&UtkRe9#+T!bqa}DN)HvYh2pq(uq~kMYSZH(eL-iondI<5C#rHey{cA8!d$O6I02TuAanQX!fT-`={O(ZuE)E2it?=X# z)y-b2;@Kca52|X;m74B;nqM5lLg?AsA$Y7W3ZdGLmBju6#+;6JMih*+2;c{?t|isNpN$m@SE7gsAU4!y#q^;pFIJLhqU2VSM%@O-eQ+^MB1n>7$ z%Z#tb3pmN=7FFXiOEHcPM|!opw&07*dXe6f;Xocvg!bs#U0dlF2K$VOxPUTqmVU5U zW|UBVd~yL^-BtEzf8lMC`P%pBnm-F`vzhR%xci6?O!%x9p=Y+9^Dt0einQ87_{bh+ zzz_F5ODIM9Qj7&SUhLgMkH}*c=_< zVH{(pvvtWfVe5*S;3g)&Ll@aLBDJf8GbrqL^860INyS{;@H~L?y8wbyp8w|s5*y|F zNt8W)&xkK?FLgyVxpu(z^f|w8KMWMz%->kd*NE zy`Q5VE<@$!HPJ6v~IPK0|$+&RFBUAFao4%pxSJ7^%Yp&4F?YC~rU= zeO*5C`@1>&J$_>sK8e%D+f2B#r05?3{YQc8Lyx&tk1r;2+Z`Qi!h`f!vJKBjTOnST zL8u#P8>K~8OUw)XmTw*V;)}BN*6mv8WX@~&=8k+jhCgf-{uVY9zzyP3eh>U0A_=)f zwMjf(H(W7eRDtDWc6gr;pCPP6L-;k9QheU$QQ^P`HNp!wxY60b9!0*wUC1fkw!&6y z76&QfCe+|ZE~9uF2gW6GHcd#(x(~K5_mg25!Ndx09g$E2U%z{q!!yty`zSONFXU&z zLF8^(u*M&oZ_K`3q0sH}`i@^L=D%=6K^^un32gp>8FS9a;OI z7QcjOH;B*MkOPp~3_d-^+vX6^!@gJmcT#R*gsh~geYk)8_5Gt)Nb(3T|8dX&%pvrUHmhTeN9c}5^9x~mm^SOBiac6 zu@QO#fmZMngpEL;ZjSse=#KD$IL@d!K8%*kewd|aT)~g%<$+Uf zE(9=C5CXi5NP-pN;IW($X6)!&?ypkK|z&~ zQ9-})84t=QA8o6G>eqoCG!NB(P)1c$AFoZwsrj@o(aa`#kx}>`*t8cW10cYT@MO9P zPwQRTu)+5N@e#}D(}2L0_4@P+Rws2j9aPUlnhp=id?c0oT4g)dHR8ssVF=79N%~?r zpe<~j3aQrtzj5ML3W^t1sxVuMo5#(%_@#Jbs9BHf%f_(B?gOH{#pl6XNo`8k7OE|0NELG4D5@c@E9OUMpQV4$<>Sq|+%wtV4evwj##QFme1o=C`!Bun>i{`K> zY3m?JYd0@t(P=1xl+GLQIlhv%krFDz|F(iK88jZX734{!AR8bs8J1J9bhFA)U&T3_ z*?Uo1wE=DYlWUHZ9JuvkP$cSuU*?P)IujGys8{!M+_TMhqit=Q5e2lFTaZ%tkv2_R z9>FLD65U6DF{3R|mem-@*x9ysC4ZB*;ZBsbbXr_EO!i+e?ZbZ)zQl$P_w4_~j&lS) zN5SVU_3VGi4IiuEWq_}B!_Qana=;&Q!}}=s)qpp+;cb*zr^PkZ9_u=ECCSgH3Vt=< zlihHxYl(j?e%Q|q|CfT71Fp8?*)N<${Q+=&-Qp3LXU1TI$!B!3Iwb3%T&_PXF0zY_ z-qy#Iy^IU`Ut30572xw;fHFUs`9hDi+0Kt<{sMei(asNLeyGR#1M)bb-rXuAR{0r@ zG>XAV@`O4+qOMV8<7u7(WW)m;@A@`g&c|oyp%PjHek}|?f5KO1wCET_&f0Rk>Qc=Wpo`1Nfk_Xb>jDIX{14VCyw z_>#uj>lf9Qmxrr&jLC*+^|1cRO(A~YzrL;UT=qT-AzFRg4C(hS%nFnf;IXqX-jO3+ zwT7RQ&W75G>*tl1Uof;~Om<;)IY^yhT@6wdE9={4p4JI;azICG7$Z`gA#%M;D9@C1 zZ%I#QI`)E$dPl8cXD{sEf*4e@cIl$U6~?rz9b-CqtS%(%#AEesM{(0ki$=kl)=+_| 
zzrL+hMlw?%xB~^qB!TRLKhbVG8Gq~BCKD2yD{Wi#ZQT#qr`r3|IUPU1FGFCC(Hdq;e-@wB`D+}YVmi|rJ{PTcU@#$HOP`g#Jt2_4 zNk98v*d3)eN%}8@$KHhRiX7>pHSlL8!N;(Qg_Nhht{RN&7}GVyOOEhTj2<+e!+x(q zs~8{3IChs%f#mK06}_A+FV;xHZK&KzZjZRj=>HX7h3~`$tY-%!d)kId{~QqC-grN$l|j)?S)+kWQD?! zG4X(4ecRoV-X<0~^W4~8{6dPf*iCDwWs4Z)s~0btS6+q|=hcL6z%cI^ljE`enWgmg z5vjjI?Cmb$_?J>2qjMqaH`ZP=FSKC(;&RbVH8?$>oY_`2D3IUf;I~|QJm!?FziZI~ zgeHV{ei&~MVls^!$6j4dO174 zGa%24HQ#ko7Ft;=t;{B7Y?V;%q&2KWmB9K?dG*ybVG6Xj^#+WE6c<|P&&=Lg1zNNx^|7Hr`1@WgF;n>a zQPT8F^=-Ths>Qy;L>)N-%3lKkvxiN3=^`4LqK$9erTCnkpM~B&Db!}?cSH~WDHPDd zzX4d%$R37k=xlme-@}F5a+@xEg``h2n2z1#v|P)U*VCqwc47OJXHxYC_NGT=zv|o0 zmGnBXi(IB-LmVE?0WD}RMlNit?G2y-f46jCN7R26b@02w;vaYsP>bFrRAxx}LP_5^ z2=Lgu*kOtsfp)$LVU@tr+}gP$Xmk?ok%RTJ)^H5a`Zh+dn5U%0Ck4XD zREyRT=eEo6x4w-pp=q&dX$umpVq4<d|BUCF@bgY`|4W5#lrt^Y4HPT zk+;k0+b$Bw!+_M@THiKGa+NHts|8U)r;p>%ny|<*c>WQE){bWlsem5j~=DY$m-iFCxBA)JI)mik4ta)T~+L( z5VE|l%z@VtC-!q`0k(VwT4;n;p_A~oHE@nu`zutuQ{UGAY}WS5Y_vn@&6O6ul@^u> zq)Z@R3+-}AZQ7!}RR9C=mu*J;F>5%y~QqLzod4bfs&8c@Q z>Zv(R7_1hH+rLfeQ@-%`5gbo_+Z?HPnNx2J>e(&|6ENDFmbL#5r(dT_y%qG;4Qr(r z{RDCaAdwvXDe&h5kkOTE*I%;L;Kooj|&&2GJf0eY2e5aXipP{ z7TU95vC^KdsOBCoo&#Y%M$OvA^=)G$cQLG2yDeXAc&t#X6>5md7Ynt@s`(zP39~;U zDB=1Ak=toZ$1W3Gc&ahLU8f=tVcqdluJ4A`w_PeNR!RNQ(iXozEpc)u)YmsuUXzTI z&#;0LmNJfpW_|)A!+fOXlN&JGm$#ek&j7ZPx6GJpiNDS@W8MI{Vm6lFO?B`vXWs!l z*bV$+Md5|j`Ua{dEQg}cww}k#o|0eWNDjgn;y2co)rJ<%uddP8UkH56ju)UhTm!02 z%dzf8HRze}F+|~~X6_rnIi1Kt(iT4~W8VpQ62idUULUTazDfkg?-3-|EAU3|LD>;Q zR@W?^SF3`@0fOP94#B=Qp@Pv8O?aqj1I3x2~oS1^?A>_EBRxEL7d_tJ%&DeX9}V~P&^}sHd;5#AOb9AELMQ zIp$<#r;tMf$~P;$5ivM1Y>Ob-{lj?71|G-bCXe-GvOeHr6#N>D)2bvK{Tl)}>`C>n zdNT*ii6*L&c(9jF#9NnUFATOiS!00Z*bC(Q9wsW-mA%Hos=yYo?R>MX-oTfUT?5R` zRmxDSYv56Bi}@Z?c4}PjAeL>;Zo7I}uzH z@!5~w)~!Hu`T@E023}xY=-R6JRaju9tre#TLFzf${SuR<<8Cl8$(8;2ZvGGv?*Jaw zTPh%)M~w*8ln0C+5)o&zZKdxI0Z+7&^9t~8T*wrSK2VnNp zsIrQ6*8mX{JccFP`Vqf6q5K{6^IqVUY$&TLpMU+L(0r$#Zvn-npV0G#Y;S#KRZ`D; zTWj2OCXkM-4i|>1YZRRx)^C6UdC~uopY=5%#gA??Wbc+$vN@Ohq2I@_VpI08cz)G< 
z==Z#u`RG9p>te;4j4SNsb>Nk(b5?44W9R~9IQArvZS`>BgD$se1M3}p&|Mgs2fDqj zPrgmr4d~uVyd>R57?*92Y9DJIuv7CZ^mhXAF|UWOR{DE?S~=`+2+G%&U#-lLdn0yq zvq%u@Fg(g%bYi>k>)gv*B6`P8mni0Q zu$Nc!F88Wp1={k}LPvI)BX>!fd#Is}wUc~_<*PqF{4!s@s{N~R1TSlky6(L*xmBMS zzGn|lhAuI$#=9tdN&7E&rrPr*?!5OOJcH&tI{ba?{Y^3tJLd_r2>NRZU+vh3dTR~* zZi3pC58;G@yY}H&VUzRTvf6(SeOWsp@vqe%Eq$5q?JU0(dZ~G8sB5yR#P11%a0Es&nzIszXc7Hj4R#Mo5-rD;}ky(J%x1Dh|*XX54*DkDY z}SuJDBnRTh-O6v)9FE#ut1BvO`ffmGtIN59pv@HKo zl4vRN!J)vQHxhSdG1fNTz^ASc1fJNsay@To7q(i_Yt*n-YK^p!V7k)eq|F!wRr@p7*KkdRlGY$VS z;@{mK|DP`WC*E-L6DEFHd;EX6@W-X$`-wlaJ^tTa_#4vX(~I~W+T%av!k?Um|1s_h zTdzPZ()#;17yka&-26X7{72g3|J8+mT^fFj_%-eEpLF5>@T!~sRN{|rkN<=VKaz$& zfcU&2kjDRCT=?D6@V_qu{)gCPNyGoM3xDz}Zhp2C|LOMlt6li}IgKRk?jGVt+T;I^ z3qO*EKcD#g{C*lgkGt@@rQr`J{+aFZA9LZ~z1z)CN8%sgE{@x8JnF*lmWKZtrXp)= zd;CXS_z&!I(|?Tkcels?lMDZ%H2g5}%i81fci$X8_Rh<0`hMaMZI932Lw4|QPQ&j- z{0{B$|KP&k|B{>j$Mb;y3O8ch@_ERGA4$W1hWL-P$N#+x|MWEc81ZY`<3H%af8a$o zKU0Z6x;_2_F8tHe@COk8l=k@ijWI_)cfa7K|NUIxf5@F0cYpa~SPuTVY53cT&%4rT z`21-r2Y>6o-SqDvexyAuy+zcLMfKJh2F$G_Kw|Kam)`ooETW_$d5T=?-c{Eoyw zz+D-4e^tZ4U6aw#UEQg&%*;&Cg@Rzq>vDT`v4H)9}N@FKdtg8yEh+ce?5O zi9fVGKEK7_$g?I5zZda4w8!7z!aqIDzkNI#_^)8c1y@3I+%m%TwEsigyT?aaUHktT z2pBOsL8HbNj;W3cEmo+gM1$s$8AvopfP|o;&;wG7s8ka{MGa0y93MtGN?T9Wer@aN zvDI&}M{Btm33Bs(X}#A1*2^NI_vY7y+$u|#hd=5u)3-{ky(3N4#&2%HCC9A zwuJNr<>!aq6hr-_-aqMAFC9%p&C`RQ&|JNL%>USHbu9QH@1s*&S`Ng(F;P@KxTUy# zW$P(esV-qxrOvM=4%bPo0y(`3V3Xw5$KaL=^l*A`Bfr_JfyVSHip>j)$qqSl($QA3 zlah6m)L~kibgkqKxb>RTIu|p600-v2{!nnl~Lz~iq=quHb;PI5FJzjsQP&Tez)}g z`Xt$~a$kzy^aZ6jxvWoLSCqb>L}!IL^m&t6X`rXqty`Iom*^#jS-4FLnCSWYqb=UU zEZrft&W_k4Ocbrf?<#Hmci7gYw6XaYeD@v)AnxF7cO6rwQd)OlVfUREk1kq<%I{3> zhP;Wj-C?$-DhVzVj&<85zd!K}@doW`Jk7P8T`fP1 zK6+;V|@{y-5{ZSnzwkyeXGLvsU*SRj&J=RWGU+Zz?8di`Cz8qrt zvR4Sb`2k3_I4P$8Y6v_(r|oG@PWb2-&$-!7%-Uz##2r zg(8XW;BhAGLTtjn;HnKw$}G5lr47kvmoC19 z7Ke4qj=t!!X+NH>{hRVwa91}RECj$_f*)R9b_ls?wrn!}j}9!31($crYxEL0hwf(u 
z-p!$b)~Q(W0PS8uh#0|rAsIni8BU2nU~nW^)L=88Rnale!8+bshmg#7gFDD(&7a@S>;diMVUG4R2`#8<{U_0{diNBX+Cy~)irTt9iqRS_@3jkph@ zH~w6-QXCp4yv4<`!+%YVj<+Y$f7|g%`tK>O%KdaL*XM@3nMhwm>bCTdM4EdAN91N= zBp7mR$V-_~dv|Q-=i&%K4AVPz{41WB!s9DMG_3m6i;n!%OYXu}<1x;bK-w2vE5pOy zWgmL!5BQlB`@(yXhf|MwLGB4p`Y%>HiS%ElM`v!~AHJ$!*;${+=briHJMbdXMXO8o z8;^dzov4ZXMcYrKr!>Q@vAuMeCoWw)<(_ce|5Nu=+%lb%Ubv!9?&^(;UOKG-@gxrV zE?=vsH`~@vhG&aSCzHXWsP3` zc9?T*F6YYW)cG!X_WV^D|9@;gUiAW|%AvWVxFGX+qv8HYtmY*u&CfzPfRDB#xo^33x@%pL8 zYx01vOZZrW{Hsd5{&fi}!-O0En}-6Kkle~VX;oddPvbh6`HF-queOiZbjy9^H2bKt ztFU9hf8`kacvJU}r+(GPWAyPx%NcnyI|-1quisTw^u}}13|U;HHzKZfY_{;;gzpTcq~jvszkOKz`FE0 z69Mmutyb9%J}l^jKy>;>SW=86ZiTI1$X;pPM4lFcGbPvRE&ISZDwlB6J_)k&wGQ0a zJ>kad1zs7c`T3(@D!59CpDr+dtoa$cCmq+X(Uz&T(RJl=!L6o z_|I%=uncHL{{{@f&E#=ad=|GjRNfLzZylE(cGJe-+g1U9aKYPEFvu#nAJh0?57BF= zazI#`fFEkylK$b6VqcN8o}~Qejd82j`WF0w6x}$*4Io!0v&y6jWu_k7?kXJSN`6$> z)@+wF^ed7kxug(`Zlyeu!V<(rpq;SW^~d(w7{v04o45||(jAO_!W3@sr@2o&Q)JsD zC@(s;*EYo*iN_`ey~0c+gE+DgEWRKcR|oF}!`#~wBk4c(9tuLFXV-A;()1q4|0((l zf8hW<#rf-@$E-(>IrXSj{GHhu?Ym!YoyXos`v6J5W-ek6*^By?e#L7_q}P}~qo&ND z{l{|!T(`cG8+*1~#=&OGh~&;5E&X%ONf<^QGG7mFe~7!eCgCERz4Yt*rn7^hC^3=E z!H<>kn=;grL9uzR`|4a&pIlcQ9QC!;&_=IsN(>f%9RQSknnW6JO@3*2r%ib8wwocj z8172p{7JFa*~aA|3bX#i?fFxDjr5`Jx? zBm|%2@1~4mHA;^~%_WY1#{x3p7^X#$h~oYok~4At+L_Ts7H#X{cHW&}XpGfMIw0U$ z@O?yGgPjW67wcgS?HZWNL`OO8)o@Fuu+fkA=bvGS7|0b_)mA z<^Vathv@ES!P4g=MHftDBvqug%sCm4{DcN^rAU=8s0#l?T8u{*k1Hn@Nbe~w&X0-* z*HKoXri7<5Z~2iz^7kewR?O2;PJ;oPo8a*S7^OJ2UDFvH>FvLU?a++Hv0=9^N;5?M+<1H}{NQAI zIzK$po42v3=!U(?56ZmAb6)15|LB{#7mlCe%?oCIc+F(*#XTTnrshIKXG?5amJ8;mfqx7!t{8&!4yVvP2^Q1? ziEY~s1hW8-!8m033?{tsc-7B(ME$U+1BC6Ger#%+m~1TiH2_t@RMqrKPNjfmXc1Ql zdGxlg@d92r(ZLwdJ{mnWYmdx8XE~hB+@!J_K=T-HtMrrMvaakf**{{;UC{~(ly3|*yKBf z%^pd!GZ2yjOAgR;!OsXlH#0V~NkKeZz0t+jmxGm|nZ{vdKth}`g&Au!jtib_a1Cj? 
zjGx9-5KW$}Wtk)w6TmDnlOjx~E7=T}LQ(0^=|Vyc%o;WaeK(%D6vrXEx;XI&A>KA4 zE*5<{@IhCUZJ`dWVlZ{_>oyc{mJtqxcy1^Ny+ad`H|el~Ow9^< zg?-3=5Z2E)tY9X&c?&IoHXYlfKZdd3CuynTr%hb%$A`QWUoory@G~y2lwu}4=7%Cb zxx!xjmHg1u9hx~Q^B_MH{N<1E<3B7F$%8fzYs#f9nXY-b47}qtCHr^lbu8)reLRgz ze;^-a=sn{n;I>5NlJ_ddxF8~nQb-Ap&jL~s1P}mAy%fNM0azKFo~;b(7Fm^eh$RyG zAQUKuLK4gazr=|I>WOi`P66|tcTpz!7Kd?ybr?*BEu#(QDn`N01+7-Ihq&7%m zW(XwM#3mhQoSaeK8NK6~p?%bd@83y(BqKIZeByajFr%ntxMX*~T7PIvrE=OS1rds; zArH@S@(|&N2S^xvuEb(68mT6IK+2HGKVw;iJA>>wL0C&O^2C+Z#n^D2&9I!ZBz^PE zH;>FT4kyyl+npcu^yGklQ?WPj#l*m;yv&R;*0p^#A`{X02iY50%im16p_<$xPq{HoKugj~&VHUHs2w=U*nXKvgm5~WEbH-I!Dp(4r8 z8i6MFK)_ibqD4|@yEh?7u}zf=yP*&_UYfJsIk1p~sAT6{_P?`o4j9ra6o+x!DT!|ys9CZ zT0=G$oo=}b0QqtMgF5;L*_tnOy6QbV)0pY1eMsDVdw%~1@0`*081Q+SF?&sGliNfu zlhEiFHwuqlglEpt9?xkD@$`$f{U8l`7$dreQyVZNJ`%0rd4PzgIncWPCz@hPV`BC% zYYkJBs+pbnb;y%1X3>wTb5TtAO+S_6MKSCopcp>^CU(I>he{iKy9W!lNf42_Jm^v9 z$OW9kzEWMl7gCx-#SdmV=|?CW8{;ruMH4a_HP+griIL^7#n6$N`u)oCf+0U+G-}59 zDQgochxz%IaJ^fN0lXdmfCJL6biP;2U>*2LA~U|sJi|EpJ9Sf8Jr_$osgQdFVeZQoSo!M@fkm$AZCY8VRDO-e_&SlZ4~-a zdZW_!>q^hJyP5TP_IXU+3*r4W@A33n!KL+j_V$AGi8wJ{#C!TVB|oU-6tCepcl|xW z@gM#Q-w&3tRgf*=gZ0a$e^;87oS3~IT%roT#e1%O)xQNm;a@9%jmrPcAgAAs@joj_ zKTqjFSGwXet}RGERq21?gI4?HHvF>R<<+&v{(|kP=v+-(GSBFwQ~L`H!V4SY{`iS7 z$`r=M#i)ew^y|_1ISt8cFOD#ZOBoAg)LGJU04pn#f))ql(V4OSUV4kSatmI#<}$d_ zA0*%2r63sX*YZ}=w!4P{z_7sR;MMjh66v+h4h88#w#y|emM?&eXzAs0myA2NgZZe{ zW=U>UPe4d5$FD&FL0YFjb8Bz~W42W{$@pKc4f?Fqn7ne%xfj}*pZrhLC9F*F5JwAk zD*e62SUJ7vCx3l-B9t5`hwTvs{S=Bgr(;p&$V~OYHf+3R>99mVN6a8ix7%V0XUB?v zns6gd1I9M%KI@PR6a}z+_EM&De{?w06sm5t(LCd7jl+0k2NlwBEHY2GOn6I^Ua zV(`zTtC43N^1RM>S)8C+#M!K%O@gl!jMtUW%ga+Qbtd}-wdU)Nx0*Dy6fhtR3iH&fvHhUNtP zQBF-v`cG)N=AmRuUJg^)T{nY-XvYL}O?Ib0OZYb}0(McQBg5)YWSVnHj>rzEZZV~s z69P(lu9h^w9eEQUi!31_#Uz-Wl}?RY_p7!34${G=;T|!1qb_^kcR7q!(?+|<1C950 zSiixDC2IX!4=~>U!iNHR^Dgqm(nXYRv4{IT2(Pj}lR{Omed9OO(+aIq&`A>MZ4T1G z-co3>7Z1&sY&KC|nf(vREY2_L(s}>%&#*=LY?;4jr6_UA3+Z`_}O_gq$X zjA=aN5d(RHD2;P;?F@o(sX4{L`M=jBX~tL$Ja({^CyR$*1TPwFY+Zev 
zOI2mZLUw|v4SuUcKA>dvz@~shouVQ>VW#)b62f^`P+vKMqY{22Ugl!_(R%5ZZ~}aB zlh^q{G3tJCBJv`-!yh%KgYm>RgO4yK@)U_#CL#Pq*mV5+z|=;z?eP)sjlX!WzG;`K z5vSlNv4A13*oV$A4z5ATV1N9MzWd#;-rfM|Q84tWjxBS2dgeYm5h>;yuN_0RlfHy+ zY<&<~{vc>7J-44ZI3&Z6Er!GnIfMze-J{T8nTO7hd_Io8UDaTcasoS_b2XL{k@YS(46ft-lf5NIuG4CqOjCW-6^Rp} zFwA8c7*;LtB0D-zVfrFVzS%;BLrxTFn!Fs2Ny_=lCU0TMLbA1N~5(!tq0K~XuH z8Agc$u{(qm#;Xn+*J`p)gciLnLg{+Vi}M1ikNdyVsI5&8l>t?+v;KhN_$$i%am5VM zlDNP0pCqnrk12k{HMKUT%J>o;h+bdIc+$y&#sVUI%k_Qa3WE44zZGLD?0Il4kEVR- zD%UQjmHOG^pzEBb%VuPw_F4dMBFN zQA}U()6BSHPbf8vR||PQRu7pbj&$F_C;LHo7$^(E_x~ zdVYg9UJ6E#!Vpq2pCMUld#(e>r*o@e1uHMBu~7Ki-rdv&4tZB-$j_*urqjZjNK3z$ zpT}lIU|55IE;FzirEGXOwjOJhvTYmm2n%NaFIDB`mo;yBtn`eCE8n1ahRkKX8`yI^ z(M?OiN_NQA30ftWm#L32SX^3In6QyrtuU-7jP*~nYJ*$CQeI|mtS3ctMJyE%i-j$e zqwx)$@An>O6@}_cLs(K>g_UmrZYxhdEA09?zM)&8Z&N6diN_)aY1Nu&>Ij6Et^eic z3-Wyo@b!3Gt}rg)aFJm4^rglHxi~2%8ESp~(ozvb-Trkk3wa0AwE!PpMUdJ5R!97lA|*$m&lCn z;kd*|(!DU#f3)LTSZ7mjL?vdK6g)MKrrYH}E3^Pm`#j&tJ%368WLM@Q?z z(&{La(pGAumn3&JPcb@rWy1vWTR(zw3VE69LN34_`w+Om35aulQy@3$^^%h)XXr%> zOj-$kz)L8Dj?;H1J|K%CFG z|2@<65X3vlO0f#+GEGogS2Xn^U#l@#e1qS09O2Lka%?O1^TU7pTN&`S$L{5mezlLR z-<9F-3*qm^@ONAITSQgLR~G(OhQEWuUvv#iZwP;%A^%wd0= z{R9PX)f%8OTg-R2UZIENdl#76F&%kJu$hG~?7cVam>%mvr+Sz@iWl*{-&Envd%LJo z;`j4N;)s{@MvfhcilQ=**M1bn`QE%$a#QIgJ2bj!=Xd;gD}&-_`;|&=+t~y2-<1k% z!zM07f}E8)cTcsEk-JO1e#zaKAMvcM3+{sqpL8F_)BZt0<9D?fUemQ0zOE>8)d6*U zVKt0ytGgo)k=NqwW@R?q89Ca#yqNTDS ztNE-_c6N8i?xl#H?z5qS+Q!%c!bG}5LqjPK#`Khk_Wk9j;g?4Yfl6Bu6T_4C+hX_o zhjRD(Xw3a~4CR+Nyf?-9)p|h6UG&xq^oE{V0-A`7@ks1ms;>LQ9 z!hW>I;FN@aYa1;m{C@ZQ)qn8&J)og}G1bj6;?K9V>x$Tuxa~AXm>r_9Hi-Mhztc(i zZRa5UWCt=)=LH!0y{~O-0N_4>-|zQ+>HVHx{`-Vqd_NXP!4d3q)=qtz1C$s;bwL@u z{>Cf_W+7UpLd7SES^;0)TGSua|y<1}psb2PMh89ay}- z9Z9`(!~O{dQEEN2N81YPe^JXp_URZjVHlvAz-ABP!nt`{7KWHazc-$J zv>Gji9!KoR5U0XB*E=tc@NW~`k4`>;9lE*EjJ$@+?4tx&%DwPz%7d24^Dls|51QE` z{_I&IHxm@;213$TKSmu$x)&4aXF0yh9h{}#Fsm=Y6xkj8>=q{6uS>l8n-V8|Q{oxl zlz7xPCHDHJ#320*&^}FKatqU6RP^;7dg}SrpUKp}ywmpQWe(EHJ4{Fg7 
z6zBD?)lQuoyLhBO_m{=oSoCOO!QHA>yHE>O-0K=sy%oWkS%uEXK5aW)^qa*g)s#D1 z1;wptqmMbBgW+ZoOlwYAj^QGIhb2qkIl`KM6=TKWy2_hD@Yf~w{HDbB=6^%UC%-B2 z!EZ{u?VA#>_@=}rB^ujl%jZjM_XgB(^O4Pvy=U2JvF!_j>k_mF?*uOyU@pRjUG&y< zsJ%%h4G;`3>HZsb21k17l|)I?ZCms~aV30#2x;Rfc$r;8v`al_)8x;uSEY<-%FW)) zfhqI2SPf~c%>{*AgQ8{d>4WPj$bm!|0_vAyIlq08*rGQ5@rX?I>qu4TI!b)ltj6tp z-cOVS2W{QEm!KZ+8;_{*qIR~G#YPIJF@;~YchPh?LMW$pk?_f%i zXEk{Y=1l2q#TuE#XQ`Uf0?4<(qTukb$bvt`MJRHN%=8=PC)l37JJ!N09hGWYKXri_ z24{v!Zm9Mz_>rnf5JlykOQtjZ_5}<1jV^v}M5b~72)Now4z}Jfc#5n89YE1e27VUs z6GtV+s@^n6CRub4lUeeAVK#QVMC;CMfe*);_KJ?Fb84deC z&Tk%<8y=U$<6>TPX^J5;teckGbbDmD?`U$Ec1Lkiw!x{%`N z6cA8Vsfd_H^snKAh~B_2YzDLHDK{!peF&dKS9MtQ%Ff59Urjrfo=M(BjcfT;m10%x@j8g6@D;wfmybCL z)UA6$tQfY1!bNbWyN#oCcR@5W&VT3^Ziom$074e^PJ`tpQoJ48l&jaNJlmic8)ml= zMN0o||LMw$$T>0<&_ts|IJZ$!sXQ768{HQd)`#30H>9!WRCHA06@{uX~Yof>l(iU5XgqtAA?F z2*$6-q8yGE3mh$Wah|NP!5Hb(I*IM%BEs^vpTaN57aI!sQ+TDNx^*js{~S3auQJU& zq`=*)j3P)5UldZmhP>pvizJd_;P#klcRH*e|80;An)2caimWG140JyQmHDLo$fJkh z7ltXKDA;kKfb1c}|ARUFyP!`JLn1(q=g-UxJzFv@hhdR)F9eMO!V}hX}yy$A-&EpaSZ!Ywb@l~0yAZ_(d|}8-E?Wq4jkCKHo+%aHhI@dUN8N72{lY%5 zNM_%DLvCf8ZNJZ*Lnrs{7oC3)9j(vw`V_jP_ts4%KLe!_*#$ z=${F7d|33+x_;?C`Tbhby?-M;K&$-o%5?AOqd%!k|3J=X9K58T+k^DYO5c?}o#Mq5 z&!_c3>}=WKlhpaDEw9-8(oj@DCmu!{tOSc&&B=Lx}Uhcg-A z>2f8%I^WaguJI_liQx`M!@SQe>QUK9oL$N$!41pUDjP?EG%Q$##+na;lhkEJHQ!8 zDdksX+Ltf(mrHuvJ)jQ7eqz(!AW(&D>FeBzGp>qY*7;opNn?B8onQz;%$Owvt-Kbm zcL*wK=g=rJqbxYd6%CKxx|Liehp9S4!^GP9J>H$bG|wpE@TBxR*5+3gR2htjX@}E3 z;l(P_Y2~ZT6ZCMfJ_Bgj9fv8(JS<4r9G=jjm_otpwa$D=8!qX<_XWg^B1?0aX{L_g zVRE@NYDSEOxGuzUqa8d6Wx|Lk1T4!h;@qM;k-A7~rM!z8hRzWOF6-v)4=n|x2a&269`!hxQI*sg>3 znSYsA66y6$s_AmHCf^Fpy4sv?U{$fWii?M}}ep}l{3pIKbZ58Xn6|pwnP=;>iNB_nBjB22d zwFP~f&jA}@1B28)eQUlIjnCgdIE6l`Z%20Pn_K5LcW&y#!F(Us)Yo0vCC^S%=?k` zI-rZEpRdhuI$oP;q|=cX z$ZiV_HOva@eiX+JPD@~y4Z&S{$r)|ksuwq<;E#1{O!fyeql+zp3?8hMUT-;q-;svoK*}H< zx=IR;TS*#`DIN3`#k}O|NO0NL7ajk<6g`pj_-WvgOP@e!o&0JjxH%Gw1UarE0`8;kJ$QyBdhq>R4^&x?!rnjd^<@`+MOiN4bYbrDuPqC9ii3uI$`=+Ls-jh2 
zL{qvYQ}6#*H>S{73&wHh$vg^f2>D|yvzLD*v#e)Ij_h)jF&uImn28ir4r7FCwBoPE zmhX9Po@>`*Y)@IvL1o@5Z=F9JOCRWM{FbK_F+GxggRAy%Mh<04pz+AflMK`&w zCivoT2M7{*u)Z!~KoprlrJKbj)wuLr2RI@LlXKwy;u~@}pyz}+U_U7Ws-i>DXVuvS zXLPUMn%5sgn{u2Qmd5gPP#1(kq*Y?$WF)mDsy>aTcC*$rYB*BJ3{!PXN02Xd5>{rM zwN6NQON$uK-jW*X@?22nOIhMd`c&rDJEJl!!yTp4J1NO1UJqIbnO!PO{SOUQHU#ZgiZPgU%#Q{~2qp%XO8f}{KV|u!L~hEyqKt>int9k58vji(O&Ul{W48f1r5XSUT3I!1tHgozIQR2|}sNd3W>t0gumxTf5phV94!pnRe(z7>5&l?C>1`&7B2 zt4jY*s&U?0Rj7O6lY;MduWvZeWnbk_t>1tCTokMm0LNg$`r@3F6s!Y+SCEoYusZnd zk|n}RxE_{gou<53or~*zXDGo|sWvS&19M%U4YvineS%-D_E;le>1ly&^hrTuSH*fC z{qq9bY-jTBM@f9-#pOW|ooYHeIt|IaJ)`qaVtgmqUq$6XYr!T+I@e(7T!W;G4G79P znaZn4Kf2aPZ~TNvw+k-8ujio#aMnKWeeDS=0ydGWQ$wi`MPiUyZLdRp0AM+oZU3BK zS!UqFO#bN@Cj+osY5y~yL2H@2V!;q8)rO{X3G%qgY6t#ae!YR)*pQaFL!5Dn4W(T{ zNo1;*#WS%V&4a{dqOuaKcc!+5(_!JVpq zQYWX~{nM*3VT`s1O!kvG<7EMA|PT3oBK2a?aqOiE(1 zF^wC$Xq-ddk=;_|==4q|puw`4dX_m0dJfhu?64tkj!v(`95WtyuQqdeF}*AE@HmR9 zrg-^VstO z*1vznrl`1z9=84lIDjf!M^f%ya`Jk-R{G~mi?&bZtK!1g10d2HEakVrgtxwA#2AFu zHVsjy@iZzk=(t4co#?Gt=c8L^69Yf6Pae2_EvhyF8~1d1l0NJ|6cUptmx!|qK8;fCiOf=@7ro-awOOC2BxX5 zR$mi4F;1)A;({m7*{AvQ4Hcy~jF%4OIro{64+(9G`n z6SV$`c$t1@2+xjsO*5|6xd=KUJ{n{Mz`g9jiO7@eL;5E}p5V>{SRA`6e?gp$GnN`1 zM<3Ixk}85PPVAPJUkYjEur)ZH={lSuMdwgkCI!+qagAvLmk7+oKy>lMa&G<&#^H|~ zoala3@Ew{}TROQZi&vc*oqr^h9IZP{PBNH~P(Lubn#SG&Vqy|o37MHO9C*d)Lv-;7 zE)=3IyIYJ@T$7U+?{Jl0tRD)CpE7Kum6!QdwaL!n2q?6>Jn_?TH5F}-u}PqbXF&Ek z_(pn_lO!I>uQuH(;yo_~)-@&tD{7g@Dx;}BPAC(e`$m_{(c6cGjsSAHw^=N$ouiAW@t5@ex+-qXOClL049W<%6NKJH1P_HAJ^!uMQ^hk@W1BLx{+<$RW846Qf+lw?BDn%?qtGbvRFeY zS?N|X8tT$-F+DO_C;XpjaO!3+##}Ma4p7P9hHqlvU%VZk=-PE|^uqq!l$P5HL<#sj zIoPpZeh}gs@iY3|H9*fF+%0XN0eTOqrnb4gk;s{uQeze>Rx=FIk=&Sakc_28n{98V zIkl1KIqe?q**y`l3+lHGg_JUXtrUr=V91k_Qq8r&{nOmq7AAeEE*;cmTDgu1GpFD3 zl36qo2tFn=En`{(Qiz)d8oluCJ>uI;W3M8(HZeN=sh56vbb77Lzx(nn%bSpGhq)m> zG!_(LBm$h3Ue%V?=SQcvjVAsCeW=WjarlVWT9(iQTIbiN*M#J4Y`IuV@rp#%n1+@* z@fNJy+}QHn99P3;@?a3M*^xZr%zz#T&t1-Slc-CY6lr}Pw$tZ7^Lg_s%ij7>JpG)$ 
z{YKw@!#DE%hd$h{5A(x1e?=YoIQlXwT$HOYn!1)pSN=q$%ycPY_VGLdf0;@huCz0| zN*!O2w##nd&u++8bfsUwK(Pxn1`gY}NUmI6Pm@1;zC;-ncj%=jvuk(hwC>;eEHeRU zH&k}&*|Vt8=;Sr)N{c(aaCQR;AHnh+EgEIb_@0f6$r8WmztpieKlR39ULZ#8)LUeN zQ9Jc^7Q{v79)X!Vxm+_|1Y6y8aWmV+ZhV+ZPhD|GjNjzk6&MAzw=&S2E~VOPr`}Pa zG*`BMsxK>$+NsGBJ{H#0{%?GM$f;uNZhb8^>*Wmf4O#alVH$j8`ZPIC^~ap+^p!Cl-5U6TofxhYw9B3VFMgR+m|bEav#%%cHoml zOL0NRbrx6vipi9f3MFKK%=sBW$lN{qf1xk_lcp{R@l2D^lmZ}DJr$3R>2$}{m`v~d zPwHE)*1a8j>ZiU@dnfgt>9^cdJmE}#!Chg^90@NMUWzY&mlt$NUKg11W;AlQC#4?H-138kzHsy+v}b$Q@k z6L5Eh$f%wAu>t4T|EXC0f7szd4Y6QcVIX3^D^whl1MU=!(kkxLk6HEq9mDtKG2G~? zeJ&RE`9rD=$EO>c@~`ULEwxEcIJA8k4#{bF*{?zYd0TlR1JybqE}pZ|Q_@S5%GDqHqjH=>32B3^pv<>K(A zMlUnNmlD0q(o6FFNXxNUh9v&Y<3MZ_1S?B^Q)~VR$fUR4eLU&45n>MUj}Tyb{YHsp5JKS2Xsozouz(y8z_zR zG5TKg7}}Z{&@Qs?jj<_;J26WZW^JSdyD645-=hq>ZB!v&G0}FNMkTME$iPJ2i%}n%4gOb01X;9{Y91M{0Od=ApN<3}eC>;}J4s8Vvs1x)dSGL9^7USPx zuQ#vk*S(LOthE-AUCq2<6% z=7IaEB{Xkxr*@c#lrc(>xiUvlk#LWpz`jIh#& zy@9i!!2+DTK)FJk;3YJ8G%Q4kFXKc*E02>z=AuZRQ3g@;T8biy|3)TbGyKG)fAbn| z*(NsU7OopH1&LjhZA58y=P@X`nsQNS%}UY291QR;tB5{2p}#o;~TYuS~xZ&y3rZ z*nf5Mz1^O?mfh5bGMzbVM@`!vc7%`Sg>@YFmxpX`QxxsRQnlC3Z>N4W<84n2uZgBk z(pP`>OgTNa&q?_|JE#|5)1w;Y>Zl)IS`YHpjcT-X6Oo>_*pA@R-_U)&r*Bgqk~s0& zr$h%evUJ!W%;DFIU5DU^AF|^0XV)(cla^`D4xX4~Uu%{&xB~9$ffFJ{sV!RVm9dS( z@oXt@vnNynupZ^)H9Xoro2j4cs)yr+ZGpPqj*k`jomMNg>pFG0z;46?M{pM#z#aAO z+}iD|n7e|wn&)DC|MZTUm0P*egB5EL7^QYSv7tw+73@Se6>IYYgXX64L}o;UhZpdB zn7O5BfKd1IIiO*CXLZZ|nOn}_OLEgnT}eq@6xK%BRd&JkbUvh>Y&jMXYpkn+?Z_rh zM1_rd*_Cc^V)*yd4S^j}Z-Td2iIL(?WM2qh?A-N<@Wqa2704QHAE2x{qV3W?vUich zz7p2Oru)^jv_eW#@e=;e+sJE7^F6vzz%<67?xZ!B+xQ{-SYA(EH(!OtDhbfYG5PGd z#*nGs^T}Ehi*8>B@2ym(A={^?*u%9Z719dFRciUBu~MjE%;X3AiMi)5VBZZ$Zl;VX z_Y9OFFJ;fRmNncMYti;{5<|Xt5==S1*vrR|FYZ>G@I`w-Idh)03<`vZ4n|&t2zZW^ zg$(0g9s`$xgAbTL`>XS_jY*H!UO*#7y{ zYt@B3CUy%(@-f8T0eo=So5xd#y+2TaVej|aiFgRi<6+1|PRjS#+64n75v^i~Mj+4Di?LPsA-q7Xl)QHim& z+pjlCnB4&~-Mb*FB=?sf8UaKGMu=Cr1`R&^_oAY61<_;+9N z`Eee+6)Kngp2nAhr=X<+axL{H*}*}PTq|2>CHqffM)7+%^a{uM)8sHnr>nfFumw;K 
zC)IuIs?@P>a63uapXd18_&7U&eL%;jU#jVo$ze^d{{}*EPOkBnu!?a-3?;8Y^slMs zfqm)`X@cI=~#ndm%RYay$l z9mIphSCT#q3*NYfeex66r5+K*_)kVI9|9GcL8abv8UnvH;rmeiW+N0>7=85Y6S$$7 zDu#GE7#%hQOa1qQ6en7CAjieH`m}sO21gx1J0mK#uVy~KiK5$A=Kj@#Zr{aj6p zGuW7Z%@qiJLwZf}4bO?IOtdeUGB+w=ZAe#1~tQ&p;8K|EFmq9JD84(f(=jwWYoV4&pfL9Y0!4uz+dkc>LY9fNB z>O@ZF0G{2^8o^krW0|2kkQrQ`DQ07UUG{-H$7BZ0!#7BIqGuC+RU0|e2IX-*4V4$= zK`n>ugQV*!Lf{?D`7Q+a{cZ2w9V04=m~o=1`S9geBg+g~0BZP)0dBTm>pg1efhw__T*j-gJWV(PW# zqrvaSSeqq$E#(sl!O!@VCwqU$t~yjYO=)(=(XQ*$yGEzC)}>$Ge67aG_BC+V^Yp3h z-Qt$-dVcGXo`2nOSd}h;^;pDg=ieUAd)8{@rxVb4+D3pe0q6^i4+&wmHX^f12G1K< zj1+L=iKh}FmM!5E)b&Kww&+cpXpN;}1&*~^+WhgL@ytx|zxXKS$3H+=PIuCwv8Efq z2t$KCqhP@CW<|2nzgD=lH1kSlIDCr?hTCZnOa1c4V^BUaQ=Y3gZWU9j71UKEE@8r6 z!EdZ;&{{1V9uSPjh{qp~xdpE{HFI=a=}K`si%2BFEEPGs#2;U(CSA$k+my?`2W8#- zI6D0yu4n7g8}NH9w_NS}F(Z51M;^gaTss*?aGPv9Y9Tm{1y3RivKyJJOny0Lq7%}u z{3C^xJnxMUwu}_*#SH>mo^|gUo##M+_xu||4s@jyktA4Rg~oI-Ots(-en`pe{rPrh z4^`e32w7AvZN~%;@tj?58Gsw}FNzgdM`R!R+a4!L7(v)5bPbtf#B=Duc>3cx*v8zNK!Q#DFt>!`VyMnY}cJ&|xqD?u>JMVKcnv4>n`0s#)cp zU&B#hbu4=ODj8%|dF6Jcor9~Fyy)#XkdmEpdhEn_`rJw0jvAXxS}OhVgZ-b>yYV>0 z350RMMI#UeiDE^$LAGKSl3Ljs&pu19Lpwzs{$V>&pU&>(HoT_QN}DG&j>>`wxQ(Is zg_Gk|J7-VxxO!CeTD1LEit9!rSt^($RU4D!`g1a+^F9CEiiCf4c^nUmL}jX4cjYh5 z%*v)meD>2$tzR8ee=38LzM3R?%3j65b8<-%Y!JDCNzF+Re#WtNcd(wPAYh$$F3_Z| zpezG^Sl3<=Cso*(=ztMpm)x!G^e3D@)}o8bEgl~9SD2SRp}tI~Gn^_Ak3oj!kOL7C z?3zUz<14xj{kW~Dog6I{dDIaC~=VS7Mb zC8a!4LYqG-5f+xJE3XrS`COm5FE*HLbxWiV^MO$E(m$n;)-BUt>8W+grCe*t%$TYN z6di!p0;mdvJdygq|_5XzmCne=nUJo?L+Ole0w zNwN~I%`C85ZTRC(TtHW1*wS*+LjTHQ?dbwjJiRwKNWf%f1I!nZmh!qJZq(B|>M<8A zX?2 zSR_OG?~jY?gL}w8W28?fohQ@=FNancYYnpieRg1A%KMJssjD=H&cysLy$NkxMshV+ zC}0~%y+Jj%F{GnwhSs-W`35BCpOKD?B2Ihx%Q@V z{gtQW+a>n{$9JH-Zf-mcp!Jx@mB|(50Qyiv0)5pkpX^vmS)e&P!QjHvCD*mYT14<9 z^>jm+G0ZSL%ur4SjhruK=oe-{<@#p{4nyjeB9_k3ztKliuawgVAUYq$*-6} z(?0eYW_PzivJR&rEwPF9P+zK8u#&lO;&~!ISVb%wYLy@D$1RO^1cSK91-6^<0A=X= 
zLiaW1t(L$m5R6@GP$6golhfw=-S~3Wfeh(r%OQ?m7zhkwYy>rqY*@)}w#7Dr(DT>!Xcbo%dZ`E0?g$K8KRDeJ!-teKl%1@GZ3ZT3a8g7PYE;I)Rs5$o|r@ z9T_hwdXbpN&wB$wJMsXU6m-%@KDoYy-GyAp0Y@*BQ>lqkJZz`eTO$3(TUN>6%?A*_ z5Eu36c?KZ6BMQJ_7vJd zT)i~0xwMj}?3yn9hIl2g%f2U&xj^e?^^Ed{g(ed`izYl{t@L)xE?LLr8EPux)Gqnj z-t;<>a7VkiGrcFCY2M3Caa#rr>`d;}HI895gzx17#D0lP-Ck-^INz13vLybUH8nTS zjGaDY3wukav$(dIK_lNn-e69S8QX_4nJp!V5~jx4TWA&L7k0F?dr7xxu*vQNl2G}| zO_K`G=p$S`oS^GE4Z%M+k;Q1exqQb7jOK(Bv{!ve?wz-m{OVh_NF!7#mK`cZSxDtH<*pl+QJ)Z~HOc_k3W`4tXF#TR) z-^&a=f#%L>0O|?41Oa%kFoI_4!noA#zrD#k z9s->&G$E0>EMF#YWsc630XxTqWl&>|$C2MN45yCLB#uE`^uEF(dixNO{Fl+c4ppC7D>cGICdFi z0Il;yyN6h6%1N~SYQ(&PB9$>1O1kk{VA(5< z2nbIJp~hH)MCKh-8ww#4$8^Pr5nQsD1RMSmTFJe=qvna6Ye_!trk_SoB+6pYy z2wt;ukuk(u)NB|{G12BE<{TcsM)LJF!FMkEj#^cqw~XzVegVDZjO4~$L=s62r>O40 zv65ni>Ll&ip`>jlqThy~KV`Frku9fBh3!Ab(j}DGPw;5f=OnM{@#}KZw{P$|D!R3T z*dE^@ADY$_2p*dtE&C8gr{>fA%FgTkyq24@mH5#3?v^@b0LFW z1%{2mDCrn%1*Nl3cfn=t=e0c$UcXP!hIzUr`(*| z+M6Mc){fABxl67T_#Oa9isT*bnB6de0gYUFl?77wdG=y09^8Vk+dP~33>DJm+15Ck)U;do}_9+3qc)H(m2*eEy9&^vWsz%cJ?C_wL(L&~zoEG?x#s%OVMhnB{;Vzt_miHJ6+?&gA=-GHD3An6 zTD&SN7i#XitI?0@36M|^?CqpavY^gq}2VEqoPoG#3M0W&C5 zZlt0v=W?-L3Vuzl3nyo%&~(>4Am6eZ{3d4}WDzxB9_zrcz&lQI6crQ8+rsr9e7-;M ze5sOhJG{$R5=1DU-N1ehqjx~I+S(mGAp1MO1Ub?OlbyXmM`w07Y2i3c_>&#IP$O(M zDx8NB(Zt`KYmo6>bJ4e+%TtRv?Hj1|tL>PlotM1?`ROVZ!6U3!2jrHRU6rnRme#*& zf}@Nz;gp^0snTSHM7c0&Vgiv4N`n`<$)*J*=OMVk3)cz-6JK)8x8rR5w5g-4Hj>?3 z2Da?S#nW(L7<(A7GkSJ`coZON&}M%D_6ipHAu}nza-*p3d?w7f7u0`a&gXJD&vsXu z3!E4};bh)+PXqiOhL`7i$mI*o-(Eit?-=mKGYBUbXcts{!sb{e562a!wxW$R%a#Oz zFfvm*EO?tu{A?o?nZPronoeIZ)olILDCVOHC=<$!xSu+6d`)%1Q)Ghs`!dvQhlEK? 
z%8O~YnL#yDa3foU|4_dbVnkb{AF}n7;0^Xyg-6qqjOJ)VEw|B%6MzLO)onCoQi4MRu-+BrL#Ov3Rxg@z!GJbG#*n3hrPOfKE38 zEa~2(@;8c^BTorV4dDwn0i^V`4Ejpdd)p@(GOQ$k4n?YWYdU+g1y*NCZ}@dPCj|(; zl@b%r+x(PGvTWLvWmmDTq&suUvcHF|7^gc$&+Pa3%*zdUR&WHRiyfk=*ZF=iM_WPp z(QdP-Qd6}*t|WUt#jRSKv)ivhV))I3q>9J-Z676G(Y6Ftu0$S&^8{>xmRP_r!sGIz zn9b=>c9>+K=G`s@hI!faE|$Lj!NCu^$t&qT6`|gD9L|)CeVs-jRZBB-zU=t&ASPk8 z7vJ81v`S!QW#M4EKgP(egehyx<)&wSbS(QV(ML!1i{H#zm;E53Xz!$Br&-Ux7Elze zKT|SWPPT6Gk=^xQ!XlTj|L9LH231qoMa26v8;sl8!O_0r8VSsSPt3caVh10+m|^h@1FUB8K0vP3X&h%TvTSe`6DEz(CScpi4O$wFN0$1hxUm-8 zu-*+VCi>RjVk)6prNHUHj$DGEYzSlXO@#s4)+VZ_9X75C;`-|2vG*=mIpU5L0LCrb(xPypq=vmSB z8s6w{Dg7-mDFwfI%h7ynke4|Ap0~4zFPeGaif8?UK>>k^JNaQPHDqP*Z8r0*eb)#7 z6dZ-6pGt0wR)g^OSj58I72DU+>j^ZseQjkl^$I(l$@hU1U*@UbP`iEog*}k3gM4l8 zk#9X;OxfZn+11Qga4r6?EB>TMWTysey+Wx5|Upk0qF2YLiO?-N==ge?p zi$Bsj4aamdl=kk#S0OC)OdWh{s`nTRhcVHWa$+>W*C`lF-!3ArA7PlV%P`}ZHB zpY8uX!c)SZUE=8qHC(ryUDRS@qbD;A-px>n;$d zD3ZrR3cruIgEq1sq~;2%Y7(cEc*etj};Wr%Ey!@ z-|mq_(Q0qgjsX#Jq=$dOv%rdM0_rEF?%CjWy`}>NYd83Vjfni_+^<)M^Cl})=-AWt zlp)|v`>T45yp}8AomM2%dq}1|`?XBvZ;zG>as#l1Fl9}pZX=r!C~Pf(Sn!{KnD@EH zT|Ao5PG7=v|LNwEwTO@tfBl#{lkVnbC8YwpDMG)^fP?6x~Lc#Tb0jUt~r5=Kf&5!E99;2?pN zq7|e;*F05A361qHm(WJUA(Y4-Ll(DAaq!HIaa!Lz)H#rCb}KL-3f6uhaA@Fy{y8zo zCRlW$*R~5$ypbX9%3EV9uXDq27l`uUEl8>g?rNOkoo7|pJS7)FHc#2*ayL~Cg-q$^ z!^!Fb)UI%{YPk@0_!|?|_Xt)9GZXTw2_-8xSrx4GXbx3toECgIxqYM4@o{o@?`S(N zL0Kp9nEFHUQZ~Zk@zLkjlLp5){FVAmew#_u_AE<#;aV`-ycH!pt?3aL<(fucO zp=ML`Jdrt?`Y%RtZh~n47b_Emvc_i&8k^x+OI)o^0IMi_dzje)IHp0I)&6@5b4}D{ZR%zsMk)W-l^y)>x?VojH7KTy}9W_6-J4qW|O^rBMUCi6{c?t+c9A!paw2 zLx}AcTM%jt-AL()@cyGs`7wk_$JNzn`;WrXuAs(_E9PPTXM<;?T6W_`%yEAHm>}U_ z#6swDRE2l6C0&m6p9?w=yc}0@5<8XtL5gzD2F76>d{%UEV3eRGomqyHpMQNxx?1Nf zTC4D{XlxTgW^nL-I^57PMb64epIps8^JKP|7f(j~93*%ksSOehAw%m;G+VF`Rn4kb z`^)2ly0BJIF}gnj(FRIPgve=9@(E@-(?L8qshlD*=Zx7nuj3j>Q zAVvxTh(sjOVUDP}yal^%G4WZ_o?s7`7#qF7#r84>R;Y5b>*@FB8 zfDM(MQ-T$)KtN|Kf86hfKTVE3rmB0r^YaPDIK5zz(X8fVBZefR#;z@6woME7jAu7+ 
zPL?|r9JnInI-A;4|BtbAfsd-X_J1Y`kZANoMU4uY_Qpyowy{EmikgubnbaT=I8haCRoL<^m+uPfRdTXz}ebDw=1XN5wdH6)E?-!uJ8KZ()@qxns`&;{* znF*r*+kRwD&RKiywbx#I?X}lld+oU_{Iw)x>7w~=d_}m($e}xVWRdPFa%*$#K?KK2 zagCF2Za%Z5{K87?w>tM3%k|I?<@)s;8O7tY4{q(9;A93A>e{D1paNz7Sp%j>9$e*} zas(rKsJO{ENWgEDc%AsWwdh^Ne~+-x)Y4LxV!^m6_K{$Jvy+_rJ-G7o4Fw-g#@4~G zp4^qJh}q~Hk{fBKfkJs7Lv|^q%_4tV?nx4Tg?}gThvt4AdIgf%v+6!l%QR(72nWKq zLT^hlJ1J^?x3zw}+HL3D)cU9Kg|G*odm8H0njDdI`2l09!{)Bjb05+10H9=&+4JS^ zdn@-JRKZ}z8^GI;GhYE5u(ka6oVVZ;ZJIHC(5*Naa13b0-G@=Zoh%}>rp)~v7h|43 zPM(JiwSKdXnmlY$fgA`w7{c*`K5RTf3mIb8nV)k6lK+0O6bNeo=iLh}ssl9F1n(a} z*PM~(Rh6qzjMT`G25|foul`$(8A6wHEx@C9Tl<0PGTVi!NMzHOQHiPBWNcf4BQ2Co z_~mus1Q@AYax6*b9tll0*|`P!dZq?;kmX>S%|<+9DRsdU5xwk~1gD73si>Z~zbZ7E zLW3zBfH3^=>EZT^m&PnXQ9!t}MrhGMJt@jqZ2Az-K#W7^^0##&AlVrsGk8>N) z)TT-z`2b1in_6EUTvx5_8VpaT9$G4Lo7HJ<;N?%$+9EC7qG_T;?g)fRuD%M5-a%jn zsT-xToUo|>4Jes)7}V>x=iVVvx*UCo>{#2&i;QnMf>$Z@MIjs=8@AiM_g69=<&XLt zw1PK+{{TEU4iHS+A-Qp2ra!B4hcWADaO7Sk)?di?ESEAJMQ9o}^FWw-mg<8@9+{w$8 zY&UD_gW{UF1tTg)Bb(mDQ*2E7|H=-njQdY$I^))L9oIXZLihqYHsP%JB?UV;Em`=GjlUzLFymAd@^2!=A@YAw0ALP1 z5~DR2M0uKaSG3jEqr6jvvw?tX>zNL*FP0M4Xsk+E+?9&cv;l#mc128FW=V1heT)QS&&Xfic{S{? 
z4>z^Yy|(%4bxv20fwIQ@8sg6A?4W)9R}%iy91X3&)3pubt5acH+m^@DvyD0avW^XK z;~DYSvHB4n+dg6aTPDe|A>Oe))|$#a4QGr#v%^c%CyfH=e4s%z4vAY4!`HcC@APHi z_X`b6g#fG}?|Yr|#au!At2);0-MTOilOi2pnT$CBYrOP7uHO=`f8pe8`LVDq8GBbK z;->|dycHsta;8<*`cvIY87`7_tYZr!Gr~l$a=iXc&~OXpw+c882ge!bBu2jPb#X#u zXY3WJb?yv62YX#>VKlvQbwQgJ?t7Fy5%xO^^FqHr@4C6r`)lwPs`tabR{s-3CE_Nq zjUD#;tJ*JAzqd1I8DHu18mY{3X?I<|f($4ukNCUwG4ynG@6nF`bQG-DGF9tILUYZl z`DcFJcYN=Pth`6d7Hn=fe)8Z120)x!(OGtpJX#~-ERC|(zpGW+WH5hDt8PoHcJ1MK zN^FZ-Ny7ktuKqdY(XFK!r@00>L7p%nYEf0xqU%hOX=iJVgKXdNDQbWaC$fpEf3#l_d8gvnq7j22!7O8k^tJUC71SvvHP1pv$y#=o&>$#kiY_;*&n+bnlUGPTWcx|ipXj8@rjL0*jrztV^#lf7v z0BvN^VU1xRi4kO#+U7asf)k+5EOx5rZ#y&>XSynPp<#wHQU9p(7smbP<1s7*=lcl8 zLy;CBrAT~XJzL~epqu;luKM+@Kf8W4jbr#{mWASmkd!R>z>+;~3I4UYz&8|rq+w9` z=CFy_7R%-P{;4Ip9W&uq)dlxbm`H<*G1Q9aF+rP&$zY=9FrEgH6N1SQ#W<~zbZ=XU z%jz~cI94LC6Yvg43|dl-Y*E^k#93mJ=tM0^jc1%=kgjuKsQV6_F+NAW0OhB(k)K2) z%JU-;uJ##*YZ!+8x0*hyW%O~Td5hd6S_o(y9`v5Eupu`OL%LX8uc_C?q^^CTxCYn% zsYntU!j5#2I|zHpQr*o;iD3FaT|+~yq+w@WWbzUV+gwCt{c|QFoJ?n=3Bl-1RHW7T zd#rT|@({I`J}heQu4>0dZC|wmm#?pr@=^2E?61BXHv7k_AU~eBjbRjbwc!E~z1H1J z93xODT=FX`Kr$=t2~{xjkN-E0jA!l@^)~ycGJht2G&uR#tD8TphG9Q3>Yf&<=lG7C1YF@CaQV2g+~$C+$nc`n@I zX04R-Qetv3n^(ElH80FB>Pxy%oykcm*H1KoaSg@Ir)krhKrI5IgGc{cQo{j6DPvo( zvxz(nK?@hen{m3CW|EL7d&L&HjuIdZ2RV0+Qb#jRQT&ng3QWcw1Oz7lR@&&n7Q7nI z>}Tby#MP8wxi-$ZDz1kwO)DQN%B?%9j^w(jl@FKPG@Zl=%6L2DAd>6eN~=K|95m{4 z=5cbVe6dEFHC!9c}QX<=NO>d}C51=*f3Jlh2`PNIS*}F_+$2E$K zzZ1Qmo2@mGQ(=dj+wbzd_JCF=g+M92FT+r>NHdN{Qz$uggZ^pMKOB#5t~)oF^{tqW zI9FVgdrj_@j*q#L?PU6xKCRWR_@8)7z>PQOzj(mns>+qf`Q$3Tn2PhdKGL01_D*)k zn;JFMdE;G?29hv+@#J^W<3*~|wZ%kC`3yokllj_}kBl_QMs5A`Ex~tRqY|vlr_>U3 zq*>i_2k;l?BKy63;ODs`RSS3a;geU7T3o;J2m^*V=eMQgn6)G?q1JHCGg~RiaIf!mYqevvRy!@Y z;Z?Jo9t4ZoC|^{tN8TA*FBC;6vq&Brsy01ES zD0b5BF@!`HFg*E+Q*@8O?9d;fILKB{H-u?v9ZR!nOYfPR@s^nkn$ITdGO}&DjI7Cj zj8s{lMJY7om8&D6E&0qftMFGX-nhNIIdS?+Yy4XH%BH;LK16gUGR~zoP5zs@GngVn zj!P)|;yNtGcy?k8;2HtT*`srIthY9Do@T87PnwEt#ky1ZOD3Shq_IX1IXOFYWTeBm 
zi^}C~W_y7Fxtcj{f^^w4iWlEI`0qPeViCP!Dl&|I>C`r>4*n##K&qzom-2Ux^*1O0 zwwl|RL)T~LTad>bIusxREPgP6M)*0DV--7x)^>=K)96kF2zSwFQUjsn6kMULd`Ngj z(IgzA-;Cu*10tFmJS&3GT^EDqjJ=fX>} zD!t4(seN%J!5c1m!;y3;BIUne7rFhOlDfz3q7JTcJGyAZ%tEj}jI@}nBI}D}{YTEZ z`%5U);Z2$v>V;hkBw6LD%BaM}l#qjWmjaO8;HufRz_AqI0OrVEme(3DyOs=>NR+2! zzM2@NG%oPW@+IGfMEwqeGKOP0%&LhTv(LT|V%5t$q?H5Rm~5h7-rf601C=Elf))Kyo*OVTTo#8fD-(7@*|8xn?T4QakW znw_j)rz`wU3toDj0%%P}L@a&b{LFRvLSYI2<3#;tVCBaHk5kWgQ40J?tZU(SpnS!} z$8Fj@3Byi_nq=$+TSBK*4QGr;e%Sh}Wb9LIhvO1G36`LA`sT=KqY2@#dfKS_1YV}OzP^jaN@bA+H|3}sU>GmO-dZbs_BpRA) zBXodzU0%9gE7ZBk37d*>lZI``Q!M;v+L@ zVCJe(!Aa~n*lC5`g8_nM27@A~+;lL`AfL4+*iH1N&6i!jRXus0!)XmGF}jG$;Ycdx zbbLaN!Q^z(tP2p3T8x^K+)W6FTT74+iGdo=TqzH_6FKwOXaq#!h%=g|&Eyu8?-7Tp zwcCVYYG^8$gAMFNovo1EcAOh%zdDTi7x5#gHd#Ws$LL#oKkF^09Osr&e{LZ#d4EpS zHpYZ03QfLwVJftdiMKH8lI+wNJ|Z~5d^XY;qT?#);JP_=37x>U{ucGa5}v;?_(6b8X63?lE%IV=c*W z=LL2=E1D4{!FRt&4w+<4wbIA@hyvs?Wj1(_y*UQ|4oihN18J<)ia!_(F!yBN@ixh; zF`@{Dc=i3?$Gx9-$J?GKQ*Jj2@+~7QFlPh9=VpJkK`e{6y$Hlwuj?C7ml<2G69#kl z#H_H@)KDxT-R3yg@-t-j*ADX1opb|RqeRVLcncX{ApnZ}Wx>x$`;K_c&Dpx!GQ)n9%-}2`bxA#2rAWEgd7MgQPuof4yQRG- zMFlD0NT#`qnCY<<-c2REvAskqtMy&yD>B%~iIE;0&tA=~X&(XpKnOkoa6z(77|#x@ z;gT{w{w(~cRTgLP^_@*{K(OX}&z~`$C-xK}N9GaR!6ob@GkuQ9RNdl;_#@oX`;)V$ zm&L~VYxr`j`=Tr)#GI0)^ZG+djg+eU-IsgAG9Amy$jt#ITa%fpd-Uz9f6t6n?9$lk zh`DH`b$gU5eoxrpD#XwP?LKmM;|lL4pLt7LNOmm( z9i;J7TYJMEh*upF@LCWIHC=<1JXYOFjki&w5kS}8OnoMsuhFv)0808M32;F_E=QX7 z>+K^=%j0b?Ax&7Af7b?9_V^FqEPO{C2bCzdr*i7^?X^DNmLZ3TVdv3w7avy?eMFE~ z#M@p4F*|JgxbOed_xnTfwwL+-hS&Kn?5F&X`hJfFg4g+!zH^OX_V~*Iv0>o-d^u!` z>U;S3`T3&Qse-V>sK+Z|)X$hOg5}D2v>sMF`+P=%@$JDzhVAi3@XLiu8rKwCau}in zM|A^JL-4&LVZKHi-U($>B22d?-oCyYn0Wig`tiDSGpG>mzc+}4B7R>u{F|24o_Fab z7k&T2=}Qg*bcXhLMo<;2OKH-Epvs}oHJPE)$iV)bc(qqZyiB%!Gp|^-*3cxAt=u8< zy_ub~oAU8O`E&Y|pPw(UD_284E6FTy?6e>t~I@8<7@p1%!H zd$h4O$Qrep_|AfE+D?`!Ym}DYgL3zq8MRx!Tw-_(#hgAU7DXs>k5GCy6nwK#vbj&m zALdJ5MM?QxOb&1jq?Sy(%*`8?Lfd8v>?Ah@jG-8gydstGK|i%IcxR4{O#m62UlDT5 z8gH(TC_ol7@3!P%FMT0`I<$Wuef9?sQ>9Nig}lkB$gd@-cTa 
zbE|}KuaUG)QXS_PQG@v0;Wm)>p9@tHkx^%8SD6D-`UlvY`?$pIgzO%Wy0##hSbVsdE>Iv<-4git39O>bv^O>$?|7SKm=zR9`Ht@8!4q z)|Xl-vw=kEAEmN4RuCq)a_(PcZ1F#i!KFGjf7I;vr0PFTWKZ4E-qX#eo?O}csXwBt zlxt_!MXAN!5$G)bgYW$wU|QgJub)9td}ZSh!?U8XWcQBgCL`Q$D5#A`V~>Mvawnw zjF)`R`&swV_+wvh6Hag8DiXe}jQgp9P4`VZwWQ=+{%c>U(qpo>Rt{EC|B69WjQM*t zcFonbWBn=C(DQ)INnV{kYv1vX(S8qTMf){!7tA8-n&_juvBc+;17^b-hTJr505~oBnHpzNRI* z!cjDVdPGQ9Lv961Xuj)QDKh<+T68E~QOkectvxJm)X7atX3zx}sl$2>;Y--3qEbaj z+}ERijk~YZhnFbDxyKY;*kUzatS+2FZLEVwVRX)i!cn*m1N{=r^L==x^fd3HH8Ll0 zXr;~Ts(_m*OVcXB>&_ABy1Z$zHkKuS9qVzBP1NOnufSGv~O;c7|=ei z>o-*2v{)B1>OcCXHg~&Ggm_2z=}S{466&*2#wbbUupy~R)>Rsl>f%*n&{S~PPBXSDEfPq>=pKNOESihsAr<{P}F|{kl451ZzY*;zQ<}fY@9l(F%>?YRz{{q~~ zF*b*-9Ai@$-zJY*UR346@q_(iSB|l{ipX1k+z_g}+D&P*<64jRPy;jK@E?GT-{Y_0 z*UdT?popkW!zPp@JI9Cg#3)ot_b3z+0%9N)M2^(7^IQC zY4nt~x zskQ#-Xma?rn;e8a@5BmAo~8rg)e|fDpM`Jx!8^X~KZS2BsH!iyU=LsjzMNfv z830T(C;Wp8c*hzKA51yxFMcXmwSD6gF4ttVvDS;N4ocWGa@DfhJ7zaH4&xo(8By)L zGt1|PyVKlT<2I-h+~t%RKa?TDr3{?8{dB$R-zGOuOu2S56WP!pnn+g~z2=a_&&fRt zlpBxn$xUnql1vr;w!9y;1T_})Wr4>CkEVytpEo)5)DY?$`tjhgai1WlEZ@ciH;ZKsHbX~LShavPsa19lNUGT$F z>w+P>>4FJB37_NtSbs%)Vp6wuxQpyX`O~zS^E9b}neo-Qy<51;1n#VkA1TLPwa=b> zu(#Uc(tk16hL_iK3v4E)HlAF=VW+z}nlOz&aYE+W-oU!mx6D;7auO_fQ;+!QHwVAd z;{;trW0y4r_u#P)T?va7td<$wVzHB%Q=;o?S_ols7dl-GEMW&w#nuKP3sBk^T*XV` z+lhO|PIsjW7x#?M>=%5?0H!KMWi8*eTfhMrjx^gEV*-?jt}37mBYB%1*Na1}*!K!} z38Bwl2sDvW!3&BT@fMz{@Q2zkA8#kJimkSl%;nt76gF%PCV$xcXN(iZlt+vcL)@iQ z2uD?LHwc|@u$TPFRH!IO^DQt(`+~{LsRJ_;$cnEkXV^M?JbC2a>aE~=qK0gX@c*(Z z|JjaBrEQyHt><=av*Uf=ucpdPW3rQV$T_I@Vtx-4 zzsms2CGzV~Wi&njWeF+Yug+CTZQA&V`qA-B{W5`DUpCWa&B5)2%M|0ae*7aZiE@D7 zYMa4_T|}}K{2i-jK*VSFUn%Z*gj zV+ntCQD9CDY-XD!93&2t#>RW3BS#E40GPRImNDv3E*n7pQKll`n|EE_E*ivDoH#)3vj8N-?|v zXZ9J}R`R~ZUs4L4grW=%oQb9<$F7kv$jb9W#&(UfkP=h~sSVD+@CD zvVKu~j2jSYYM%jNh%_W70kgY~Ifvy+*fAGuvc`iWn8IU$+qrCU`-t@NyYp8bgg34y z{3n?km{l$>*VP8L&8S_r8dkm}AtNbV+<&P4Gh*FHKr$2IwM71>@DO{LKq1SMh8%dh+w3)+{t7NzXu(2x zFuwDZ#Okfl_SE2b?Wb*Y>Tu7(DpkA2cd^GC@CF 
z9|%njV-NKudV%ez;G)d#J7l;uB)VP+3ruoHY$XKToAf1 zmHypLK8s{#K)gNB?qj_Dqo?`t=GJ4rS#~=_I6hd1`^XxC`gwz|N*~fC`>s1EPgf`< z*>-w~AdMQG{U=S>TI~Q0>e`u}uj9d%S2#iRS4G@^E8#!Geo3OPf-{6{zI|M-h&->e z3)Uqwtpih;syaI}pE;Jf=Yvd>(jd+ztw?uzZ`pJlFXeWJct6SBjUar`jxtVwJ<&S$DK$Gxe{U)k3<|AAKfvz+}+Sq zFrk(n9hT^z1pBOLo|7vSmgvYmbqKUR9M60k-Lw*tV zFs19$>Ux4whSe#Q^e5OU7FDL6uTCKl7^&N_+qM2-9fFTa0x%iLAtaLv*xpDSN2E#T zwY=brIwpqe-so5uT0~JHlV~YQbEt!OZS&2Yv8E9OkAJm`m*n46pj|3SN39GgdcPp&d$SR@# zg741?JHXvA!l@nEHf&mKwbMH`WFGkqt+YD@?5qww0hl|yBTca9Z#AOC($io9@l?jU zp>-ka>Uf#!!9FP>q$ne9DI#<&VX_-r9U-|Zf!BN$=?x=c3?#Tw+x1=C3c3sf?OcXh zbn1CC_;Y7Pm4JT0IyW*M`Y^&QXp1U&nl31!rwDeu6%Hmvki)1hA|>elHKVqF-?9ep zG^nd=QgBdIcrg_9ozLApjVb>F`M8+xMkF$EmV7tbw5}PQIevDXo7GwHak`A_AqbIg zU*E-vxxDL+Nvn zNz+MK(SyNBCc{1%MHszGqf0Vqq>i9EZ^1wLZ*ui`M35WFP(sSd^7>Uy0$fTTv`{%( z2n!Zt?xO^IEu1CsAI|+RFH8>+cXOz8U8Lg(wpLD*N#4>Mq$bWDNTj^9#LFnVrwP4D z#-8Sh1acGdwJJm9WnryHuOl@_1m|wPR zuWGH7k=U`hH2s};2e-nM5eltz6GObNSxzDCChk`0hmJGcPCvmTWm>6T)JV&)E})aG zt4C?V3tYj5jA0h6P5)Ns+zD=>t1aC8v$(&rcTbH&mJQHbZ8Nv@ZDtZ?`fHh1oo{<6 zLEcp&HCXn#&F|kng3y#PX=|{S&sL`qBP}ti&5bmo(3jJOBCH>9t62om9V?#?}StLVzi zh8gdUZI=Y?KhVgri#BFA&{c%uLcKZ-HJ1Jl$vZ=i|AsJok+(AecWN}zD$k!?M-|sa zE({pvY6T6<$lX>M&5M!Hhk1G;fmh0!6zP&-;z^Qx?ika#1`_z1p);nMQ@y8aF>)i_ zqhav!6>b_e|>)k#fxQJQ|3$US2b(;Do~W{lf1%;Y9xX zp@r}76R)cBD4oLh9gpSlKh5{NHj7l*AgQu%p)pOBsSh!yNXk*9&k$6drP$eeVQwp$ zku{dacWPib-I988!P_zhIp@Lqab^-=lpYAuNcTW4)0qB$O+AC9E3B&}kzRCp21d+SrA`dS-xiIHqr17s;h4kwGykrvUM25-JCE1Ge8*` z*%#pcd9DcjjFibknv|(%Q@rCX4lHciVV1rrVKV}W!MKSTTO(gGU7KL(YsKBg1lu0H`l(>)m0ugM70ToHYmj!Vi%F>mzq?h<2xsD`mw+-cy_OZAw9&KPf{AV# z)?9errPSDLx12X*e^92p# zW@2JGR+I;$^EFZ;VsKdF^>qn#|LN3rqndqo#mU)$hk@2LJKtbV-fCnwxOU#;@>SDL z%`wUjK1WNuME!b2NyXA>%}>UCDCxfrbR1@oF%}$=r)AgHU&GuVnk-LP+c6||-~Nac zOXAr1Dl9iUi@w22-yPK$&sGf-UxIsn5;1(&=3m2%f%SE^#>fts{&FJrno`b9RyRYm zxV`NX*Kist?8?F#6WXF`_HY599S@ZTslIiA!}v@^2o*d@z}BRyd)8l*h;e~IvfA-M zbE`?#oUlHbPvu^h<~19U@@XYvtRr+de+b#wIeMVMX*Ztz5Yrj#aG2Z8(@)CR!a-aO 
z5$l8^t^~jTvEzn15-ajYjKm@nI3k$&A(^>IOYluld|`F`^oYJq7dw8K*&DxKa~#J6 z_%}Dv%%;X-bk=yyHUED;|0u*Onq@6QhT1B|VV)dezLYtwU@Ms)*>V`NfF*30*I#qqpHZ1qp&YkTm zc4Dk^+(;(i(`>6g<-Zx6xg3hFtFhU;eajZ6wf3Fs_;=Q@rj^}_xs7*>=o4@2zsAma z`lVROREx}K?=APW?K{H#EdR}=KiQ}Jb~QB)uUR#5L^$%XrjN$Iny%8mLll3Df!cbE zB{Q*-b5&B`b7hpqpJoPZ>*EE80k`+Q%==LTS{L=ML_Xw?*M7Oh6+9EJ+oxo%C0|z} zb3?g&@u?3#)Vi645eprbB_p(Q80CNbRF1a|T1B9j80t05D#LVxecWH6g-bm4EFT9Z zW6!D!lJ!p}My~YcKF%PNE8JUnKr%aXr&)(|pR<1Ymp6{01>vpG9UIG>vIs6bR<3l^ zM-9giwBu`6q~41LiFbS!OaFj>OVh)3aHrPX>~^cP5Lv^W1#0#KwQ?VgstNW1CkKON z2w5RzYu42i25lf5&}OB?jN&acF|MY*;Kgs7?;lGcEOk~L!lzGwKUyfPg5JUd&36*q zMImhkOvl?k=TTto)>F45uyCTgf5IW*a6GkI^@a)=aC@Ew3P;jlW)14G#(y9M6MFzXDmfWZRVCk)cnVswOkEVTmLW9eB$pyEN09}$-k0^<5#2#t7WHmU>rJ4;UNwH5}| zR50btywHrvo_;vYRmnWRAlmig0W

y5OZ@P&IXm7QWX1W3tUfr5xsbsePkTcc~Po zn8lre-kc|?Llz$ybR&sHFgcukxcXowH|iC4Zn%|q9yrEi&pKQzJ0MSIk;GAyuEK&Y z35Ekpfal?om=KnBKe?+67;L4%HXkm5IRG4DkYk*evM+uSKRme4$X5=&oY$KbxLgXL}<(dpuwUCB|P6PgCr zvA8pvAvlMs#YSyES*g+4w74%jo5EI1{Y9yLW^zh;;S;XYzD7BQdQqo2PVd5J8?pOp zDJg`()^BN~tD)FDU({u1xh^XQRZ&-j82$nVbwk)?!n(U|u-a}i*k;#dVK?l`4Z6W# zCmC$Qb(sSKuhR!&A|urr9>ISNc&C2wCKbAjTp75>xD-UkD3L@)F=vhv^U`&rR}yjn zwI!U~c`x4aW~`ByC!Ne;e(GU7+DJ#oCz7!(iFw!89U1L+Ggt@p*HsSQoTEW%{vq95 z*1Ct*l_}<7TnsLki)m1Z&CH(Cbiq@>S5K10NLIxcODgHXaZ~tbGIrfZFgMwwagAB9 zv@CV*D7Hc5fawm#9_<9enaLbY<|O1nlxOLQLKv0!I#ly^1iO}qAW~Dq9t2`x$5O>) z4sarvv5p(6$}J3;HVsuxq~BV>TB&=|(z5C{FK9(oa9gZRQsFFwKS!A?hT>k-y}=+z za(2{$B*C=O$YscZqhn>M=mw8+TnF{iYh>BAsY%F6nbJU}tO+%Vd0dR0^dD0(P)h4- zP9k8tTNuY=2Yt<3xF+7wO{%wrkA`U3qJkGsF^(bI2+?}uXFa2{!w!gdEF)s6yQ4>{ z0K;Wm&a0$G6koRU+xDSh+oc>q9w+(`{uGoZ=8gRxILrIjB`WUs|Zw(yyF z$L2Csx$~*3MQRi$i?Ts7#VfC;QI$X=YrRkAzyCo@` zw0?jDd5hP0owaP{y3P=_(`bf6zM0{yyL(r}xJP0bI&pZ3HCxlQxA1ANbHx`>^%v=n zsqtwlIyzfU(bHW~>RcuMc%8=-iqa02zGYT-aFtUbwxWVDDDkh*8ui)0+%q&Mj6>NS zFGi?VOJT9X8aO&Ta2Q%)xOBlIjo!lNyv~g* z05GsdQqL;zsF=hkXp?eL%mn*CT-B~2s?iCO=?)33Jz-3C;3%tH;=`v^ApxrNL?OtI z)A$qd*~1Ymk9AM#Jpr z)rdv^WNe**Lb&X>HMB#C=EIV~V&IwlTr;{9QfDm~$|=*TSfX@#9Z8Qgfg{y}xJ?jm z9;?VHh2SQZ8jXFWEbQ)cK}_1wC$xx)^nWN8M%&C1k>v#=909rS1Gqb}so}fCW`DlR zuf**!A2SDQ8c2GH9!nek5Pz7B2kj7>Nloz>_5rkHKi%o2l4X*jR*g;hD5g^|kzk+W7!065m z2lMU#RsulByhK;p`i{XC#xeF;<)v(um-zu3BZ;~?&Ao1+SHl{Qv~+R*VP-?lDF19_ zBK8p=Rd(rW=WokPN{*DVz$;At88K{y5jsl1DsP<}y`z|Vs}!+NlW8qSLDto;z}Bs1 z7?D`$dyHW6g++9m$;Jd6zrTNDTbG+);R_WLp|kLSguiL#jlpaf$zoKAxn!Vn#AU$^ z6}q-?ANq&n%HF*zL(TB^cVwGtP3S2nC8gNzQi#xG@k!<3u+=&9MmGUd11RHmL@L;u zim+W~;4P4?A$Y3>h=g}T?8kLtyVn5ttnL znK$=bhC1TLCK0zG$e6f)&h6{&5+U*I4Y8$jZg0FxYQ?A(54_~RLTQ>A3(6#9#xKb! 
ztxwbq74gX`RP_Me@QyCGC09R8F-e{37;MJMmbJ zLs;z%RY2IV{P?iz1w`zkIo*w2)Vy@gy6!G7u{C7$WHno{4XDXr+ELW5#prRQZgls) zL9&4qbsJI&U!EQTWyqqF!mlG6$!v1x(QB7>l+<3R*5iU!y%77D6DIFx%Ov7euz3+~ zpOTJcrQ^+RBsuTlsH{lNQR=OU>?X|kjl|FSdLTT z*&Ac-JBA}3%2Ar)kf(Ty=0~uF#LdKjgcrfdUT*$~k5Gvk>8Y6Xl#G6F!F!49If^HLUJ^Je zHwI;@{vH*nGgGfFOOC~<5}}$I!xAZ7ckb~|F>iRC?K5(mRdW#H?y}a(j*kxX=5n~J#2${~K{`rNrYWC<#W)p7hqpO+Xo?Og zaDo87ABV`xTR`l>s|o*6PjSX-5c@uqZ1&&H7gNZ_&gT^I7a#fJ?3c*lL}V}b&7NJg zR1F8Q+&n}cw~X}`JPOPBc_f$zpSO)>8M1I9xo2-i{3unOIi;eR$cH3)mjb8L5kn|H zl|pA5GKxN#KLnEmh0*mTc?Lj$u$)VHPaQH=KSO6wGHkiSt2lM&}kxQi4m*h?P9ZgoS&6;15Ony=y4! zPM>^nnAXu7Yu#WGBA~GC-aL*e;80(v;GpElHD1T>p{~#)7)bxO{;(8n3~lSbT{=Bj zJBh#nNw*l!(S(UZg9Qk|QW0-;u{&7k^RSYvbc{<eE@;mEp=hv(6tebDYmJ|2%x^tCAPDfpsaL*vHGc_=nyeL9EJ{-3Z=F z%bn&SbaNhtk+&ms;0PAKPQI1)ht%wX3`YL+rQY6FQ&7eNw*9&q8{M6~gsE6oM+hAP zDaN8&J=jj#3{-$paFK`WEDcBwExcwC(!9HPbf#**F_~feXs!!hp_4NcYFxUPO~DhV z#Y#xPl_{@N9L_gUD)PoOfbQs>xwFv?^E3C<{fcUiHp_n}y`oi5qbAj(yS;_C7;~e! zF4)2%f-0}z8x)|{o9ohMVFJ;(IQZ+4QUM|^_!+$hlx)sL5olxP7ON|Bt6>e=0k_(S zJ}{UCx76JaEvi<1xl}fv)Nbmy2wCkHA{z!WsE$bW7T#)gIX5QL^Dswx+RRlp`I)4FGo5wa@WZhQ zJA#eF7UmZ`p-wTjzpp;u>9=EMDlV2Qf@5Bn z@NQe7I42%#BDQWG>#vJXa`)cDR6z&2Q7^8BYA*Lza)?G$K13#o(TJ!H7im6&C`m;V zMHC7UjCX|I8ooP1?@r)d)T8_@DNq^JZnM7DU#o(P*_aX36a*%OQgi#R!Bz!vz$8*y z!Wq~t##$l09q{~WZB=j}kU&Jh65xA)5Tpdtjngg+?Zdx3Rb4})64^BjUIw32+H1VKaTWYRK3%-d@>4x_Mxte^W@!P(v&CfJih175BX!GGt#hbkcLE<4h{x> zEz+T;kohLbv{I>1F9%x>&yQb@+bClia*BWzt8u5AqY_Ig(N|2>T_|(HiLr!E0p$1ea*oJUQaECctr{r!&pbG05={k`F8{-}8 zi7eI$1CssZ5X@Wa)_{dD>Ms-QJxE_5t(kimTbs~EXHOXx?&dIYmjFO_w~^+g+$0%$ z_8BC1Z2g;N_s$yU%XZEd-Xsqli_GeN`wWK7{7|mTDQ;p<~Wyzlm8sv{t z)#FqZ4ImA+TY(D(71j9~b<+5YDLd3D8lpMK!0;~nGDCt5r>dO^Qf9|3yt3<;1XSeB zlj&&E+_!lX+Qm&PSLE}1#XC;mf`z7)%Vqz>GkanHb+Lc8^p4N~TJ++Mow-kl7Qt@a z&CLu=E6s3_o{_Z=zdm2jAZgDsd3OzuN; zlMJ$>*G6`Z1)h9(Pu(J@lZp!!E_U;UROJ9E69+igVEzh~8X-2~>C>2 zEpt}t!CL#@4SS?~-KMTis3-}3=|K2<2@^G&=QqGM&q+WaaQ#byD+_Stz!`)k zWi2lWCh1%ILlS_g3PvjE1DM#3oiF&;|F&jBO 
z5o1NdF;)F|og09DK*?`hB6|O*jdy>wWNg~}=0EB`;@#QvW<`>a`EvQ7 z{eg8;{eT83wFfIg=uHthpXtrz=P&+F4+xVGuI%?CP`*hO59ptwqQ(3e{s;I$7T3G@ z34RSTIOmhXBRyQHB>f)qD4GHZ+K$<=^FdQ3B+qHx1E+n*2@Tpyj{7IjUG_b5)S=Xw z`yk@yhdnft-6@?~i2G!}S=k<_(aTBtNs!`Zof}&RN`Uo~z9QbfU4g;DtPe$U7nIlQ|+!4UB%b7jTPzhbpR2@a2Wy8 ze0Od7l6X4;+(@#yPxRxCLFeiCmeHYVbFcBpJWdb;+h(6srBivJy$t9*0U6z)D0Ime z6u(bLx^uF|^YE|OdtG^!Bc*cT{loVDeZ9Z3`2C;ly*2Gj-a9)Czr{fqrup&mo>#s9 znjf=zvem41_iAT&-&p4TXE)mo%ammKf5c!ei}kv0=ZTMx%<#5!1HdH*Ye@Lrx-aim zg@2<3JJ+VK_t*5SE3e!-@>vXoAzpH;Am&XT)HrYU;A{K05*YhvweMlwGaY@D@W+&+ zIi0gK;d5L}-qdL|{vVYn;v!MSeWSP%xisO7G`PmucPb4$t6-9+t8E>{U>Nm%f~Q@5y|Ac6RAx0l94H=iU=TZbiQGv!8iXG3#%6a>uUu z+0_$Kg7)OdXT6)+B)XX|Nzgt9BpPCDn}$_>@Tai5+0V`mElSWn2+-XoXumSsN4yFN z+V4QkAfl6szULFPce!sT^!XMgXy2E9qhNi}$36+#r5X9YSKsf21nq0}-4d|BbIa!v zu>XUP|Ca>pRsRbkECKsTk24JtM2@ybkFb?``Ck7Hm-Nu@_dK{%)fyB_4nsCs5b_4i z667do`wVRD5oSJv z7d0$%y&<^8A^N~re=$=;NctF<^fq_UnEMvern%Zo*Y5~_Frh@=t4kk#kXW1 zlid%=zCN3so*kBeee)w!euxul+M*>@f&x*x_0-}*LUv2pepv7iN-$=kCVE{5G4OKN z@=r99E{0q_Gm_SIgBj~jKVDahnn7mv_wPm@X)qGupuxnuqb`Sheup1HKl`T}&x|5>bf_F*HMP3D5PI1;77-Fcjc*mzK1Xfayo!HbSeGMxHi_+H8lT;mC z=jt3^oh<^BFS?rWgduSsi)}#{d?Hb=%1UJiX@i#(BPhSmV5}i7 zJSBHfEPWpLNNS!=PjSbId($@>$ zQO-}8G>n~S79SV0?2(%zc>jCwB{O*hM{+s)X$V+yhhISztOY=kN1W&*GYH{&@3;OU zI(%K_%~et-ZC}^YQeq>>4+xp0Q;n=4%Z?~VG}_($}IHh6QN zQbg;a%&|GkYd}ZVav(wHZEAD{R(#vZM5Fhko>P@v<}phIhs<*7Fg!zs$ewN@YvUWUIe zq4aiJf**CV2EAYXpq=Ad4gx|Skr-*wH=$YqkM!Ma(kVWoE^Vm@-nxZl*fiP*Bk8z8 zuUUE!kxlF&LYvA|23HWpqt3GHILG>*k#NTbw1ze+Bk2lf1QV$n+%i!3(zlU(`0Q1zCM1@Xj4 zh*SshNFx=r8!P~p-Mi$-PrWWB=IJZ*%Ylt_Kt|Uts3%w+eynQBJ21mVUhXmqs=ZzE z*Jns1MYb%WWb$o|Sl*|tr%B~md$+SGg|opmm>mv}`?q81)1%g&Vqv)Hi(OqZ2f@!o z1k+*c013So`L0QG5OXUv%`fs72qp3EBE_lF&mZ zQ4A;=qtGDB&nbZlegodz49nFI7IDs9aUK#(%G4!+?%Y%32Robeg}qXMIJyXMHvP zty*{?Hy1%Sm8&aP#r;x$Ysb3&TsWHkLC2V?awWeOVO5Qzyw1yMT<0?eg3u@IiVhEXmohrmc`#Y9J^( zKu!i^zWwci#a>I+IDVXf=BdH>H3d5t&H#Wabk`YxA zN;Si*)|+z@9Bbb(prm_fdeE#N>Z$Rl^uDvE>#6OibcKC8s`Yv&$fJN}K5Wh$*wc$r 
zq(8Spv97nZGhvucc<+`$LA_aw$6xsqNsK7G;!ym#aAaq&kSDi367G_t(!c9HwD5Q1 z(DbcV_t5k$_I+r&(`0ZiZR-7#TQ7F4&c;2LsK#sRnbGqRX^yyTY(=9tZ<+eJXZ;>% zZ4I=>hHdel*ALYgxgOD2=P@T&_PBv}cuAY&>8W_-yUEHI;+16Is=8Uig{rI|=6v}K z)ic1k*j+tky|*!*?0p$;Ok_VQ?QKtGhR^-+SG9t^nm$=F9U@Yhy`3lf*}<-#m-3s+ z>_5%H{etE=iOB4+>j=16bO33N!(U(V>| zaDfz&7Bl=csP-FDL^>(tb?#J_54s1e>( z2L1<*Q_@F`4R528Dg!B`Rh7%){=N=0~zFYYSRTou7e{fz! zH_w*NT5XatQIc{iL+@Yz&XAfylcS;cUq(YkFgmRINPg*($AF{jE~M-7U2;CffM zGJdHi96ZNBO*kcdqg34PXk2$!`buw3n$D2+YulKfKI)5y-Sz2OYK{Bv zWyWBs*yNIISnIkiS^0>fL(2xlE7vw|ThZoqDb(A4FX2~nOQYtKJ>4uq*H=~m!Fbk2 z3H1ep$&rI$`^CX?QMcTvhy_2pzc2|DfQ;w0Y?~vG<}a5@t*B$Q>;@FN-L#V zNy&+nUY@9|$sLXQYtHO-c6Jv{f9YM)^p(M%4stSZE5C^EaKTA6MB;mTD88K`+@%IL zd@tY--)F&M;Can+RtT5im+5=sE#UiZF+K#oiJ53v2sg;^^?N{YSW5``qw&h8Lu`k#6s?J4HE8RI z&gj{201qYINab3fl73BIh|t}SW{iWUL;MmA{5%H_4qH-cE$;VUz%7{sK-42IQbL@0 z??Bh$7x_(P_PP$ZaC~2E5F>*tLWo8DM&q}LAKsi^(6;Z0t*bfQ6!*Z|!@aJ5@XoOU zi(G>|!Q_jZpkq@RgPiL+K4oM#mmSuzLphyyl%)r=IH>7ae}I^zwmud2_slnRJ`!3b zyHNsK4L(7^%Jr~vRr`9`qVdXA$;xHDi=94(ht)7TD-)H^ zo5Y#@U5Sq3)~weD^lUgt(o@n{$!X*5Wv!%~#{}=$5L)2+h6a7Aa+TZ|r5)@0cYGY$ z-!dgI!JYjHxmAseyv|niK=0RPpntHx`TIit7V)(_pJC7a%k_RN&o-kS2LlzVu(IMiOSzZbWDmtj+zPKP#YXkoAJV_|pGq+};K&+g@6R}0%HcXIG~JNtjE zKwmpMY%2Qk3+?2d-VP+_?0HDg1TaLS=$24|{xs0F>Q;WSh=&XA>>{{-{?1v%ogv(% z{32<40T)`t=Oatbx+n=@Qi912U(AQxVib6RTX}utn;O>-eQ_+kq>Z6SBv5Ta6XLnhV)xLvz$ESze z@0IDJ?04Vv;Y^!fD>UO61g42*91t=7e8PI|wq^Jj*xTw!c0STNwe91w*^^v1 z{fY`qf37G0;Jw{+S6qvjPino?Mi3KNaaV5bS-*efn#xt9 z{qkStdj!e*Y1dcf-{>K!AlurA*_+nd6N8)far}LRU*?zZ0SEU6^0lv~TN1+EX>j|G zc5rVMbNLL22!0a6UCVDlf>Z58eM%Qf7UPo^(cJ{u-McJ4(On_b^LBNdqb=^zpQlqJ zm(9Ac$tVBNy>FwKS2rydybv_?yi>bWd~Y84=s zRl`(1@Xo%44(8lZk$uED)b|WuaBvZ-UY)VZf6DE|Zx`Bc!TUV#rp8oHZ$t2$zIpeq za1~nRR%@`74^95_i_2(y?ot{O*%xj+Yhj9lCH&gW0VSt-T~G1OEz-TY|0$j!D&AcA zY}-CLY}UDLA0IYrsuPwMdFQXvc|ME}$wl|ho^=N^gl)AX`C;;qzoxA@a~QI-2g{;x zp2VhQfwYy$qp}-;`DtEY=2OXR?kG8I)_W=|R5ULj!6x{nZe1CclQOoe#Z#XP}l)2vMu#@BBcW0iD6Pu*k87yx;RTF;LIq 
zL!^Yq88sEb5h0w%Z$Z>--Q8#X>CIVI3Xk45cX>C67N5vJOZ3lDxPBT^7krQICz_@8 z``eZi8!V1*TTV*8mG)5Kb-iZ~gl7##g>GAp;%u~sMz8Avdtk{jM-MBBh+&_J2k<;* z58&CtL)&KX>fd!X?aFtz_|E`DbQ-N1I#@lIAV%;63CFhe)#- zSlTGQXb(X79ACVrm05f{KZSz9VoKpvRWmwAB6BLVc)lyMRH$e6k1yVfzvGLlqLM); zh0Zu<+wuypbMLT>po5Jxy0N3?dv7auLyMGgGB_zM03 zCQnv<@GUm##J|pNgFVJz*<31FeD3F9`x|Vd*ZFIM9d58hVwWs#`5f$v?9y#p-o|wU z279c*Dh_CI^u^Sl+=K&Orh5x$K zne~TMxdBjvqw?aGAyhYujT#7E(&V0~ugFMbx0*Q}2 zm6^z$&!sENZ|*Le_H0Ko^lfRFV5gbHfAKhJJSupE-kUSNZ`I14>Xk++D%7lYNr%=CakTli@dl9T}gQUY?zj%-UyB^8d zwwyt7uEBosAOZFt2FoBh-eA9YkO1ooR)a({V|?)!4-#O{HdqFUh2by$;z0uJ(FV&P z`6q(^WrGCRe!$uw`J1LzQb6PtcYd@WPMndvf3`(*u>3xDU%lG!ayiuPH!-dp&l z<`&+fAZH(f#j)OhGvU8;Hc<~QDmk|&CFG}RUndv?=;^Hr{-tc2X1=h=+Pea(-n>pG zFck=IZi9=S%5TG7Xl?^q6&> z+#LR+GEZ5TYDMAq;Hs|{hcH1{zVq5#Y$a{jAU*32ka<$FlcfWTP-lZ+m~@wD@2DF) zM7f+Etz5(E!wgvSaBEwr(Uvc>_RKF`dVhxf(0#4-^GKSIw;#?2f8d2}_^^Lya_smG zORD{)6EeeZcBQW4)sjn#rQ-#l4QINPPhUmcDHc;DLnqv?ZN|ePwDUP zplkmyjZw+`m|Jw0*4lfmzM5fs85BiR;JJ_}pJYu)SP8#~#~r}YfWPxKFCG_AEZAD^ z8u~oHyQ6=$R&pi7G?w@sUhI!)=Y^7{wyPN(8xGSV4M}%m-JJ2v%uZ$-cWS*>xhCnq znD7m@a_dQ%VeLKdlqTw5?b*cgZ5@WHcj$Nx`bo!ZYWN|x#mQ6Lf2QNjiU%5b74O){hVlUa zMKsXLj@}CGFK-_}gjQ9fcN>QbJ}ONg`al~5@>rhUkAEv35Gt$lEWkiC$i_3HC?bGzM^c#SbzE0Yc2+F9s=LYIWrkiOL;~-okD1%73MYbZoE4o)kOirPMDvUN4Qkkj!2`E|n-H zd?z{YxMJLlUJ&T`R#hTF!hVk*|Rp5WQRRo4%x9+lJky<&ufU!8(K>E zSiiV;>%(@hMEiHi|UI);7nU^%fqCV&PA3?xC=>V|k;3 zE9M=Xn74Przp;uTw3~b4$hC5DOr1cv63(nVtd7v+mdH*&%1_LjUU#6&;py&3&R;>> zr<_tvMdj#I68oJ<(X;9f%yGsEk->h;!>hBEu`%j!;1k(PzYYAQz%z1-;JwZvfJ;;^ z*o~;KYw;9GZ;+rA4z5}Dr4AI!!K{) ztMQ&(jU{HAH>h;Xyg}a@Gw=V=_9xI$71;wnp3VXR0xu{aC~DBC1W<`0i=gH~H*_>X zHUSX<4WgqsgAx){)HJ=M+xQfhQD=12aU92S9A{hx6)+IM;J)C#0it3*gW!q~rStpT zTiu-w`aS1=&j0t*L%Lqot-b2j-Rf4sQSMVT&&acwf8^JNvIe?*TYW4`px?USHe5+A z>+EMS==A&V`t<@}x|#cIrkJ_MW!eO1{wG;p&lh$SYrg#u&|RpavT76W_{tg?*PNSO ztk0%`v)@>p-prMMk?na#{CPwZvYESQDOz|8hz@F#ckVH0VSf_t3x17W8tdheTjcCN zL`)aU8<~}|I{xDrdoGK0D3lJ!&Ffr~>x|>g)#~D{;zDPn*EuiSxh&5cTUPuA{>i*f 
zRSwp2SWGS~{)9~{OF_(^n7J?43FO=CsU_&S>0oF*Q)(PF1^1ze`-}Go*iJk6N?C9pUt@PReVPN#={jxL*5IcjX@K0j_IG)+XCM&{cFWxp|%QxxluIIy!eYd7udYe#z>Iw3lSbzg93$LiDP zz0l%IS7!<0uuX?HHP_Y-ixR~rt(DK=Gb~)Y#ko4WJ;TJ58CKV z%4Yq{F5cmFW_p-o&>6#HR#9|t>_=pS%yn6GkaJ0n9UX?1=E}jbT}(9o2I=yTu^(mT z2##f$Fjt?_`%mfpi+1$n@$9l(7UPTSyj^Ga-{H*64X(@c<{#?Qu1zRN>HqB+++Nya z4l{dAR#DP|p-c{V)Ba<+(0=g`wW9pl)MoKHi*uxFi4WDlxO*o+Y~3Bz#ZmDwy3}U# zls9^^bMp&}%Ibja{T;AkW&9ue*<6SSoBc%e;Vne7hi>#am*GA3vQESu|0QuU49{34 z6~!-zMfUr01t2aq%X{TdzEy&Gq!=y_WUoEFj$xc%{2;-HFl4SS{s-R`#Se&sxA}UW z{a`okgjWCD!?08rKOi21??mh>7Z96x<2(2%FCu=pphtgXE*UJ8(O;m)&|1 zn-{A?vs?Gc+0?{tt&rXNnlJVNyS3|uHkzd-S2X3oExyl~hvd_-ki=JWU_-+R6JgE^|LP$6yb#WphSrLu={<@IQB9e{icc`)`p zm&U@IllR3f-}lnnl~PQ@gT_yW(gs@4eMDO;@6l+vTD^jARhi{<*%Es>xP7Fz%Pzj{ zXepTPMvhd>fA7e$LKeNINk!QBvXW$}fh1m$XVQbOmZ3%A)BCD!yqCDcwhbJS4;!>1 z+NbO6-2d2J-uFdw2QjCe8M!4B1v`|395|25QWQBa&L{ayY?s}0<=BEo0b%Yv(u&;3 z-r}^B#)@c2W+Th3B~thzFP4yUklYz-;dl6-mGXc=M8@y-r;nnS?Lprxw!a zwKC3?=MG}-FFNKV*A`r| zBKBez@eSxLc;B0!7472=P_$MC%_M7#tGSGn?8DFVmy8LnDVAnrSD)l%iY3U2vE=eV z;-EhDMN2Nin|uyD%crc#isFBGqZP$>x)-O;X(>e#3X%yEHGz7pmHoQT9@aOu#r>M7 z6{!I-PPKK#x>FT4M172bz9K+1-;#Ex$2<7J;yR*zqjp%!TQTCNFQ#eug)ilK7;O$a zNVkalzT8R4KyTDP!&z0Y z)B9l}D?A^`2{=>GgxttD{BI`5@&DqtIY&0xXgl^gSJwoGJS;fm8Nnf7llg*8<_k7i zSiDz81!AUp*j?E^UZi&5=xGP zxU+Y_p3u4_XU5(jNA4fJUG{ph$Xd@~BlnVh>(gj?eWz#+Q>w@roD3Eo?2XNky-?I) zFJy5~G`MYEx}XP0pnyQRq-@|?sVgmrau9h=#ez7Qa}gcn=R>bJMH-W+VpvFhKI=xh7R^=dB9VdJrj2)$Ss=`UNJ$AY|r z&j*wJe6STcm6*WKLyGbiS)s=?e7uCi7ZMKddFDj<@V$@zw61k&wa-}M)wNz#eX7;C z8HVb-KZ?2+bj-WXFaYqeo=efl#k{x&L_PQmw|fOS7KlkPS6x3M3}kijBya31b;Tg4 z-7m*v8KYI08ZGx^;}66MF4}sv6Wp6)Mdabw*oRd4#%)6U9sj4_8Kup&S@AolMaac1 z{2F=y9`P#DF*}!j*PH~yJc^dC5-Fncmhtt#uBAx$KSC+`1DTOcK1_1RvG#UyOi1Tg zk3aad(Fa@up+AW88gHG8cR{%RMQ?esQi{5be-xgxEbFHY6pwuI9`GJ? 
zW9{kQQQyK4ZSs!#ltXB}H~+mQU3{Z{u)@~>puN9=-n;u1O?qX98c)y}+M*Uy3Xvrc zA1Q?H=a&$=A#h1@w4#cPy@j7WKgW+8CQ) zn)wO*Jh)G|NO~%v+#>o~mj51d_Fl3YzwDD&gu8XZSo8;4qal;&;V zw9`^?0shBLT=-|>K5yji-zwa-EO9*a2HRNTTaiTB7=Aak^hcn_M#U;Rd@!ZQ-fxN@ zVal)RDT^!=>du(1vqH=G6#-Y|ZfKdn&(a?&w6>?T<3&o7mkdEHc2>4 zcnM*a=$-}eEutq$i8Cr$gMO}j#l7fTWyKERSJpwGwTJ<z zbbd{;^Ma@7d>cMh?bwF0P+N6=Xq^@APOEe*>VKe=Ktjf^!r^fRZ|uEQgtNG?T7i5` zu5Q}{HV8|b8{d98m)b-R#^o}6>)iGS?m)}EC_Kd$#!Rh_z9iFU7_Bql8nKudH9*t0A8$DKkBJoK$ zLJ*9Zc5!n3ZE$~qjj~WvctI%&*1aq-#w#R@uPieO|4GB=9g|1cA*`F>np;#5yzruw zAox;hk&%CbU+jNcXBFkmvm!sxaVbagPS$eHJzBy(4gZ1oCI59t@{zK4Y`twz-gc{g zm@fAf5)Sq=`38r{*Ihs(JCdc!Qu$e~)J23pleWu(afy*SCHx6t_bjkcy8OO_WDh(q zx$38DEoN6dspS-Cj6C*kh5o8BCoxKa){~Esn42}`4XtbGK#94Qm_X}QB@(`ZaLP|Z zp!HnofAD-wmq!`tzlQrse8?xTPbw~{qd0B5^yYYpzdJL2R#Lm}U|g+WZs zC4p9X373t|SM6A5`8S_0-KuzDh!pf^3QFR$)&f)*#h%2wkeX_#y ze%m(1pNiyC_^r&kvstm5cK}!p#3#?i@E+){_%D~BI@y}SbMb#)%Dvos6i*p62X95S za-7`0UU;k9*V*~&4IZ?@d+3q3VF}mIILmHOq&mNNDE{t5%IJmU$pzCD(lq?#dZWDL z7{7KWA;gbINgiJ$3kIr0@U;Ch4>?wOs5O zmuBI7&m8)(E%C#5iaH2qdtcB7nfD5Z8Q}D>!tySRokTSv+5JW+>ZGj(RhS1#yqR$k1alI4Rxa(6SrS@=&q8AZXYP0|Lw z397Ttp@ia%C8e@vTHRZ`$snI%IgVcWuCD7i!z|Vz;3AVZu zGNopU%wT6`b0uF^WF4zgVgz6JN|G^h){LYCi5DC_M#GA4!b559YC-$%v4m1|TH&Fc zidY{~{vc{oY&l+Y4J5pTU9zXKL}Gng^i5uq&+rKX4^|BAV3h}c7Rc!HB}N>pF%{9(v8y*Cy8kUV2Ut5lm`H zm|ew~f5I|Mt^i4N|E}oCn_f9ggI7c@W`@=}Y{4n1DY7S1O#A9_pQ8aQgpe^Lw4rl8 zrviIgDvLXifz`;%~qKZCN%@36{?I9(R26~faR`W zOfs!1qp@7@WokQ%fM$JeWz`mWkjf5j^o&OiD_V-ms-J?*g-Ajws$Lk|oO_bxb7mkU ztavqRY;;&x9u`1&vnj{+yx_0e;*8l(xmlG}ZDr9hhmMcto~l9@?PyY&^r9@S+nc-t z+-^4(*O9^t2xWpmC0y=#+ZTHa@k(xSIbuC0T1RQz9x;lg!{s1Cawd?$b!GBMl^xA3 zMv&}{om1e}Vyv%k7>{9?qxdMc}oJ;!Jp*a#w7vkgpp&T%O)A_dsr&4RiPRiih*IxpJ&0+n0o2oQAJS z!(V3L!`s~PzkB4ql$g*0n zf%~E$*bI34Ss`U`zx9Fpm`U=o%fAwLII!-#n*s%uLUSu+o+AHX>{F=jO^taz86&9f zQHE7g1=SUSA5y69bH>xK+to)0_dz>}zrhqkqR-CVO$1Do6~2`KDzDR%*P7$0|OI<-CT^^z*&uRDuW zq#>8hgcY66(4<(=Sz%#CyTn(0Zup@G_+7A~ysF9PT!H@Zf*cft?x&2s;;-O3^op}^ 
zICW*VoXxUNq^I($8^yUAwtBt1==GpR(r-;`+Fc0khQ|2l=nyovV@~o$hn(vE1GH<_ zE4}K>Gn%wZafl-UXQBtRz#DA)+3s7)t3CiX913ojo>LYb2I8p6uBb|kjmm>sDBi+9 z(8kKD&B+>pJ*XJiTsJ}d3PFEX{qN=I74ddIZtQFatE{)R;`)n$ap!0!&3Z-5Ni#?*0z_}YfjwJS-t;aC?Xp37$ zj2T~TPHH$krs@}ht)k(W)20i%EQA{Jwt1m6?KY-Zsk`_w z+xN@TO%rfRzPVrOX8 zKnd^f9xqMM28yW4)H%$gvJI`@NbMu3J*D~IVya~FmZ$UX{aQ-~<-V0pZLjx3lL~_U zUpjR_I`0}u6|UC3GoAX8=_A8-ugS|uT$N0%o@&|~(tOhAxuhidHimz#a*V#M?oq67 z29XZ+m5_G4hY5jAgkd#gSSyQUbx$ei8wtr`bazNd@yf>%(s|n~q3fl*7YrN&*=Ry^ z{3!|P06iihJp%5RkO)59J2Xz}xY5MXq2&@1{Ov9^ag_3V6G!TN3F)xRB7~6bmZ@O2 ziy4>{W_#oMq>ydvR)g9$@rz*h5&>k7r2(xvF|f*yKb?fTk6*;T@cDK@Kl>O3|6no!upUTN1L=p#X}lCz6v z%0`_hnw3ieiB-pjhr$`K;E==6oEw%j$qTE&FLSF85r}XP#0j{=5s`h<&R*ym zJ0Gvakcgu{CU-DFG2d$dMDR3~m$r#=rgSY`!X);p>f=~uS5G=B%cy9wIO$8*c||@^ z-f(*ScOM(hJGZng)`~(Ck29)>i5G3LT+!K|1+~9|(!C8Aq1U;VgP8}( zy|K!I3DF_P!G-tjAe0SJw~V$Z5O@z8#2-i)dEyfqR=HQ0Tm0=KTmKkeZh7|%{w9!^ zEN5EiI#R62bm3YPMl!ZgOYsbMNS&u)BN=;Xe`blZrNo~KWWwr9dsL1U!oMfw4M&<0 zNF1yZL}`gt{{xg>AYtTQUA0srQ%z7RHLDEl62hvD<-7?b{-kRa?9f(oWgzhv{dz;8 z@6+%`4c{%{@ClOhN&cki-9TbeVeULDbT5&$Rv_!_Ofeq*Ur(@Mh#CP>~7Grm-fw;of$lxIA`URW!#Y;x|U!gGt8gu2b$gx7FN=&`MHmY3-iS#g#5LFeHkQx>x78jhPo^C`i zzDEx4{*6^XO|iof5)OVxY9KNCwC-^v8HBC+9lFvDi=iGFBll>`^x+OAuLq`fP9Tw| zUr%Z7E*gGV!z%F)A4J%_lRrvd2NKhVcPFp3B#@|3Jbbl=^Cdp~IcJ^wJB3cJ>rrE* zJ<}EDFm2nJ8ty0YVL5f(QJL^D(M!~StnG|SIC!YSUsysoBTOxFdTqWRN5tkwF zWIQQp`P;;TpCufbqBR_^?HDgvo+dGk|CbFCd~pKtwY2YQ#U4eP*B+1id8f&UT)Fx{UDiLZZ9TXv>|gHsf~ ztAv-Y0W^^4A>qiW8b4mbp%bK>`!dThn!-K`v#3w^A}iEY!>V5kR|D$q%Y;u#a*2}j z>MHmW=s&&!iStxjFXCGX6CWN<*j>#Z)=$N;GRlu>>O>7cAgKrQN5^Y&Zz<^}O`oOV z<&yrr;nxC*nm(Kak&W8lgoNv5-~x$XBpjTs`MxGxQxQn~C~={&8rP~}X-OclTf*V{ zDbhWL2(5QfpB~iPN7EPeu?Tl1>|T?qw?*;<_wX4=e57T4CHc=wrDy4R_b#6`R|gWq z_3H(N9;M+m8qSq)_*lZO+;n6-s2B${>+hZFrx9j3DaE$bMqQdmi@K}L)QehaU8B)gb2l?demT=@Kg`F+oP#=M<&n$0& zw68v-ZJMub{f=a3<$Z3ep(Q@!)19VS$|(OirFvfDJCInb;q@9`trXA@0e)P8W!iwx z6D}fjuS6~H!C&X5y{~_FpK2##0^EvJtbM;bH{B`qith3Tj7tDsEFE4CGOLhUYhHl8qU}7U$lfPrG%ffI>ih3=&bIL zh)5qj4IkB*wd5?A 
zskxBG+^8{+XpFQ(&cS>2Yd&9r#KRh%so{Gh99}Os1t&;mbM7>1{lgXFPujadgx$LV z)A}FN68dQRGaBxy;YYQ!YAJ1>!q(9g+B-sFb`T|WYoUBCpU4;HjuMW%qTv#ST1VI| zG|j*7h7kveWoa_v+V!NP8nW z0$r^|$@+dq$8nd|yGp~`HT;;)X$4{TO$APl(^TSXXL71bdT=F4PGt25tYJCP{vqIf z60*T8d99gd6w_lD*FNacMC;8rEb3qxGwtEFA*V}UC2*hkJ@>X&c%3s5y0hKu z`%tEEx?u^phw?|-=OFjZt{r8q{01%BPY>}JX0IniZH##SC3xA`7MvdJhNDHfHa&jg zJ1S`}a*s{7&nsLo12Wq_O1NXyO>bi?M68K!ntKyG%Nrm6>VBT$7Fh{)i$ti_zU59X zvK5mL5z#v}8JF)SFs|{%WRZb9k6)DEzmzfHb20`zxL5|fYy7Mv+{yeRM0#7`rewhV zjmD@D>8K=NSAJD0601hm^9ABYf5YK_k!(vdvTf9C-Qqi6G0k|JU(NL-^}7Edmzf_K zEBbwj++e`PF#pG26mb@psPsN+$8_}b1CblyQreyp|7!}a9XlKUJ(Oqh9DCuUWPwXl zfZ!P^k5@+kF|UGWRy-*M*IuCd4{2{7^%qStx)Xi!JT9WHWLwSt(ekaQUz7^;c@u?8 zHe|Q478scE#);TB|AsxJ*x3%GwoAlT75Nx0&P7PJl-0uI8w+aD;L1@g`VTQ$4vV@$BxrlCJ>A&Gj*4G{d_>me27VRqC*@|5DW+;p z29cX>RBl{woyg7h(4~N$kI2pH@gvHYoF~Z5BA4oyoTef-le1FfW_rSeuj(K-djTtZG=2KX^$&Fo8Yh#YlW&Tbg#Iq-9$HEM>EMVp|ZFZED)pAMa`^<%&xY3D1%u zJMfEYYQtAk6tXGvaXBB{EZ{DjD)vwY{f?M+Y1wCz9h@FIg&M`gDBMB-r|2RsJKI)d zDDpC|*e>^C4dLabbPqPnr=2F-{FL!*bHNyeV%m&3vtWt*_`nxMfJO#E3xxzg62!|= z#Ck<#X>w^A1oZnC4FXC^)ryd+wTV=1E3LpX8{ZJ;BUPJ)Q0@GJ@zF5{%hK*3RAb!D zne&j!rc}lf#$i;34DHW+x#uHZO`ek~ISJJei)&JqcNw|Ld@aF9U}kDqWiR39MKm-) zzzyAC&XnPrf4WM?3{|5V>Ig5f(I!Lr;PHtwOV7a&x5f!8%@bZMC1N3M*4w!v;abW@ zPzl!-l`I))*Jcf?yefQ#8 z7&&!OO7B2d*Z-g+Z|L{0NC+ehzp+J%jfhp1$eJsQdHGpm#%PSO4P2!$T{Xr? 
zP#@M3M8=J!xQ12A8Wsynw}C$y{IrpD)evb#8g-8w3D10qsjt=;l_`a$D9o)o+D773 zModZzFigWLl?wk=Btl0MLH|T*VbvEa7BI4gt_o@RZ&A8h^`b9LNr-k5Q@bc7KieW- z_+>THt2g)(IFV#+BL_Osh;R zI9B6~>}|N@c|`NnF@)qZavCx8ti3dlFmkFw`5KdyRE4rNzmaC`A**|7W_}}IlV-Rg zuu9XjMkdxw6l0*0s8s@Ug2GU*$_tbHgOR1l%P!7Jl@dW8Mj8aX4JnT4zmYoCNcQC} zu}kjcnxxzGT;moJE@9eup3@wN4tBzJ2wP+LtOxdQX#NoftXQCYAXZ zIn`J#ML&5SH~TS`5KMMuXv32qRG#L)iVVN{O@|Mlx`$LKuz4p#mWSE5#SHRrb(H z(+y86PZ_(vPo)e0TRTr=Iq-AAEktt`Du=sS;Krxm3>Ph{%<#9hvjYj`d4rE=9>ZDR zt6{^7-zxb#rSi{^@|XXlvYm4!9Jx|qmB$S&5ZDh%OpjBNZ&ny1c@VjoyHa6jgL3%6 z6ExlMqDK*~o#U*$T{vaYD)p2MVKQXyC!+^8axMp^jfe^)j7|Qn0@F!hXs^){UMHXRujwcnsop}(p}NA*JjwAk zLY@r#Z0z61OTGxVcPYu}2Zw3Q?HXfbrABUMWTkzyR3j@5QlKjyope1Id8Ju@Mh5v6 z>4NW##PDPJnv{|l8o8N~7#g{mkr3e-^erPkeHHZP7%On1*F$TE_u49UnqGN1`-#j{BMxP8F}FmglkWA zBL8%MHnnw^LIMnlHV}Vo1tWib<xV)@(J}2Cl*ffjtB-Nn*H&@p`hg}$~d6ccegG1A#K?H@{2}{5g$sYE62fkC(;hiejjZA@zg*>NSAT!|nMB57I>L@P^ zW!Ij;CEB0F{_&3@a`~@4)v!70Bxi~}8UAOad0bR19P8IA+cVV7l{P3n2Lr|PLEXPN zv0d!qULwrZ?Bf1-!xjFkuW(wuR1gz?ezg(6kP3fQDBKoCERrlTnU*>vF@L5gBrz}l z?m$V5OI{iIH~B@ENw|OVix-kJ1+HCUqpsBNxCl$2KPBM;{3?(A z!U9ApcOq}ouPd2xH$_ffOTcE%@*Jb~YK+$rlCCbFeH-xeB=Wu zrV-UAS`j9QFF6XT2Uzuo>6gerst*pd>R}N+lDO2lbWQTw^3=kb#a5#$w|A}jdy!6* z$G%Ko+t~wqY6lV`;;e9Z3eAd)0ixk77RrsqlV}SULPF0HWly$aai89)IR}AZ4HLlC zYQ1TJ3p?>Y_e}cY$3Wm$B&)2srK0M*f`+s5y(J$6_aU-ah`mEaLpL6T3bb7~k>-Ye zB*vD#haV>~>m(*}l<;kBa}`GH^sUG?;hEd!X-tSp0&N!%UNXd9I} z`y>Sk-fA_XAHBHQYJ@v+cx#+jHJtX3Au5$9V#OH`+xOCXS?W9ZP2%5H!zqhLAM~BA zhPzeiL95|zMPAVI-;_O5pjG+rP>5LUSy~O3Bhmx(PpyW&6#I6$vnaZBwXcy!j8gN} zYV2XW3S;>4k%W_X8R?JG>>-gJ1X>O4j3a-DJ1v8B!;|L{haG{O+pR{t^_}q7twuDq zL-^}`fN)<^>AOfTT}DO#_X4R_WV7Zq-1+4Sg$5v(}Del(mg=XOB_Va2;XyHZ6&A49&Mt;;z@YI)*dL zdE#NzDoXvyHPNKuvh+h~&iWxbUP_AwhXUZ-x>Kk{vEv^g@WZqyfj4rh0wM#gDv1hZ z3BbXr`VIde(gQ2fLrXOpwGWA@<({YEDP9royVXdj)(Q9h7KP}%8r|xA&1rP2_baS9 z{ck6X9#2zMD_U>Xk7`SUHKcl-cRcMvog~aN{mFu&~`7-pmUK2e;V|APZtwv|JQ+VoDqr2Qj*!`aD+P@_K z)N1rhYIV~(UTagUo0n5?hHiUM@;3`eBijRwP1w%h1>!9sYjoHqjlC+0yyGmmS|G<6(tGxMQ!K%}%fz<)RN>EO5@% 
z)Ouu7;>f124sY7UgWbz|qelBWpUYPz@9{$SFx*y^e8*h{9)P&xd0P}B)=I8w{ldkE zN-Fa?Rh|jaVf}oeudVQZiLf1SF?XYWsHj>e!EC$_m0Daj$ia7&$A6dCS@s?tM#dmD zt}1Zv2uIQP-! zN<%k!OV%gb$}fsFX=nX86f5^TDylGPU0>#PE}+T1`ni)8iwoXqJ3`OE8+9a)lyKOu z^&g~T^&NlR*A=JL5d{8Fg?|Nj)=S%A`gN@4?ytE;s$#E|o3-v~8Rt#geC?;)8@HAt zo~NXXG)C=t+ZJfg#Tv<8D=(xZU$_RUpOz}tNcP%+q$JPX(FV_Yd0CNX-mCv0M2`#6 zz;!;M7@nF3dD%uahFtjff1v@_&pfvxwg;aD@hww$FnExx$4j$f35FQ&q=ddAO;k`q zjVxm1;tMt_rB#G81EHcaMEnO-*PZHGZe~6YAY7KrRl>y`in{Ptg*3 z`OJ73s-u&{%Ltma>Ym+>$L+P}YP`}OBQoQKCUIZZ7%kVVb)7#e6VBw<_@CPGR;oTN zcWY+6p>-5~Q+vD*K1WW6aA(SP|H2=GPY(2-iWYgj-e>O0q#VcX$>e?Jt?{cTai95b zsc|T;YO$TOJih3#689`hGpisu*Jroao9e%g*NDt`GhW)xBQxWrzpgl6NNs=Z;xC{o zoF2QMDQKB_FFE!u`X9V#v6MERgU3T5p&_70GWA>~(0frR@r00`$2EB{5VWiiXVsJO zHzEkYsQaqNV$Uuc4^^Dd%O_iru=!MCy4Z@$F`q&*R`&)>>BaC6!C=YzdB*uit)KgC z)#DBv)UM^5GMv)h(!i)b$U>8qU3|8>-jF6|tXAwNQuO&*g;xDfoj`zAW2^BkAYTpR z(1lISw*`l^LN+^)yA=(CX__ILtsXS3{tU=sq72pi61p(_ZvaS?A)9ANRDH}u8NBi! zO3+4`lvT7yAPtI<@^wGpf-tz>vLegS_~8?kjVHIP@p{VjPYG3Eo-df!Ig8t0J34#0 zeX4%Ee(79U^@epvZgDW4U+HP7^n4ZGQ;pZdlbu@JsP3zby;bQgV$j_HLiU<6nit#)%G^v#K-%1AEn_(zZ3mmc-4&jAa6MlC zI@`_Jme=W}o>8ORbsMq^xNO^*1TJ2eOEtO?O&hi9RKUP?E(z$?z_ znlZZI!WZ5Nd;`Amb_;15FLqYjqNEl;AFl6W?^Sx-O3!Yeb6cGZnzgdT=D8Ar*bg$E zT-JM3frHPN>4n!4U-ZP-)B+cOxL_H{BO`4DdSb=yZYIX;PvQ;Xe>d?p7w~Cl{~Kk` zrId0)H@f#qp0tlK(o`WhgMi^RL>pH#l20SJ*G-#}vWpda%RIraq#MR&utZhTh>$c- z;tc~MbYsgYEBH1FzZ|~#F#N+F*|sV3G;zDahv70l<^%V*=z3eZqK9S0xFe0`e01FY zmtn4A{ASi?NbtYOhL8M(lYz1f*CAHySfiRKL&}TW6)SSHX{a#ER_Icq$YK~Xal7I! 
z)8j4OA480d)10rG!VFC*^t4s~h$+mFp@$PyQ$=xxXXzoIW3+T3=-NO|d>Fb@tk~V2 zsq>m>!+3lo(XC8)dR`4tyFuc&XT}?X`(MOck$;UuMLU4N6oFm^&Wx^mN_!+JAJ z(Dx8-HRn={p|?Nhv-Tug1evfWA~uyG4qpH1s!#U!NIo z*elV-S@nN6lVtcRxy9z*sU=H447+i%>9*lFL>otn5owl736alC3C7!vXydrQWTK2P z`e>8I=!g68;hsVbvi=Qgnj=w1Wwy^SR4qaXcWsY%Zc>I$3@pWp-Q6vVO#JyL4AZfi z$QqkZ!vjAmR2k=yGK}|0hDI`U7;htt4N98a^hl4JVQ4Os_!~0gl`vagOB4kwT91sE zTAaqeJ}v^y?%dmvOy>Y0{ZoQJBu+?bO8Hcq;7~95oWiGJHS(l@l#XbxXr>HrCH+#y zF!Y%af+GWHG|XdE+QsZ;H{F| z=^_16Di&!kLa#6JnuXv`9c}HB)4q=%8mc=z6n~Ra@kso9_0Q77E(ppHAzLqAyD{A= zC>HWp+7kcw0Q~2SE0hk%6Xka#n-zKoNU?{ZO=r@kUo}WrDw~#ylb$jf{FL6NpDx$u zL&eXK=-$d7)}a&vH4cg8jqTwn`F6tkzf#0x=qU+I@y1$>cQK7ldl!ore5jOnnR}vA z&iLhD-|nyPR8I{gkow87)ltq(T+>VwmIKB!D0@+H$vRLW9qYwN@y=t;SNY9$6j>hD zs#<)RTwCjO;If_ri zmHk3?IyJ$RqTE?%lH_~K;B_5E_jF>+co-V59B{XU$QA0BLEZ(Fg}{1dK?NAll>a}> zJXzqk>*}$m*fYfcT$wKUhh-5pQI08B%MsEiSq>{sw#anJo^seA*>SaroZYx;p{`TT zQ9mu|Wtp*w9(X`EQ5imZR)U$5^pcJjZh%DVsOv^)RvCNuT%y#z(RAaesnQLzH)SiN z_dse>NTT#2qs|jdokOW}tky{h#!;4R^>ZlXcI}qGY8O<4l3@)0Kk$VienS>;LqvC|` zn9yJXcMP9%yA^_bBT_a<<~K!`_yT|0&lB3al#+2h?_8M^e7;WFBoX4}Mt^ZgVMj}> zm233VTKOtc3S=92oxt&P^b9*%Ar!Rtlq_F#{wds*PR@(e7m@W~X!jopITNy>&26XP zH6Bl}Ai<*tIWus~O#zVI*cL9OWffK1Q%%5E&A4XoFOp)llym-?xzokT4DCPY)AJ+`4;g`?Wy)!n;6EV<`l!Oj23nU z<_Bq0Y!e(%UY7fb&{Zocc&0J7w=(vjJ4~uwVXrMCr#vXo@_c%|%lc2&&)(O{XY*~Z z87uYb8)BNxokFZ>k?xQYe6~-$@^$tMC$jB*@}{2SKpFKDy!1eZOZA`m61b+&gY%=< zt2i@4PDmyGs>PzcDfw2FH}%B$VqftEpP@Xs zbTyL)*Kv`9vrzS6UZpRD(7#s`C_vdUGm*(723*1hE z?9$vW`vXH@^JX2-9QQ?=I~23>ktlM@#K_z)D&0vi<>F|dW~!Jnq?0oI2~q>g=rdl3Z(M=2$kgV3KSmrHC!i3$nL-vlyhNp&h>-K7p-L<_sZdu~!YG{mr>U7ue4|Mu%+Nn>#c1dYxbzvkCc6&(zJG`Z2T_LB6;zvZFnH(c`?a#Rcw8!SQMQ($~=Y z(EddR$$($h@Y*%he-iWTjxRg}e6tw6J4_CYN$4J`)BavYf7N-S782u5j?p7=qu88e@wZQMQ~VdxuF~F)nlcm*3PwzPaBlP zM__ao#|FEiiuawHJh-)M!qHs+q3#8yQ|5fsw%teP|L?Ya(2KTVWs+{2P*dKCV54)~ zxw^7B2ZHPCe6cTV=l873PZ>d+TtPZa5r0s*jHb_-!ei4r`$mTpj_02@T0Bq(xIv`= z<_)@7aB}id=h%D5_FqoQ_T{ci&2WFi5`Felb^{L;fWbn=ugZgB z!M%rD^$B9=&GVck6RQfAm-A#u+o#}7;jsHB_uE1_%J;|#KQvy!C5xol%Ps^9@n5+7 
z(cmkrjD5?y%M~$~vIbCAc-@jybaX0WZH#B9JD8JLf?dQpJ_>t`Un9CCy-NO`twB0{KmkmooukV1r*s$Nbn{d$pfLuVX_Wz3EJbFnXcf z^Ie6rx{hd0|8mc_<(_YR(L>7vJQD}uRbLF6!0_$lB|?QA!Usa>`8)?<{Rj7IJFoRc z2jG-4H`tUj)|obN9r)Fb7DoH?@Kl*|%eVV2cn1FN=j*Nd2IKi;Vxh@QNkFFI70-ZwCj z)=D0S>Z^~HoS9qR^i`q%E?zkA8!hPMjD`C{@lsLM>W%j5G&b7*beCO}ZvO%AIbue+ z(U9#0#q6Oae@IgnyW{bbIj2O#$-bsV3yXP%;Q2gXv_u(caO&1hf&#aL0*?Vpi7D6U z_*>j&5)^nC1~`~{tpOxsYSbEWAhQ$iM!LX#4?OowZ|?}_oOUwubh&5xS+cyB`NbA- z;el|M`2~&h62Yn@-~;?Zul^=>D`GvHn|wPNX~A&b246vH--auhR;Ac5uQPHn_$90U zzrgx<9J9RYh2U<#wQ^Hs)tk{?J$%lh9G^2U7o*iaa9P=a?(#4L`!VB9U%2mT|@q1ot+X27Nxs;#m|1yzdb1p3`E{s2g zeHwn$$wS8y@xSw#c}$(tUB^V6Ul(zTSH`~KtYR=mG8nhe>f}kjm9uKr&2p0ajSDiT z^tV`E`bEByj`@xvUrV#SCdNcT1DAfJ=d~{;Pigw(DZMsKU3MSt^Kd|I>+CDpi>Fa% z?q1%zw43&m^!4H6oZJ&g@|MJT=FoP|#k;8-S>N_U%JVbs*qn+{{&CR)o=7Y{6}eBP z=WR&qg8%}(Z1DPcvB<>woN4RUp+JR=Ef#OVqjl*RdcuzVY_GwG8ZO@y$$Deo^8k@A z_P)JZPMf4#PVo3@wC7puKOVotl*q=AXuOja!LMMO0Q1*D1b7hrlsuVIdw-< z%e5=eJ>01ORW`7rWMdJzVa+CRdw&flN0`E`b1^A5bd{kn-0rFJ?bRrxGv2kaSLe(3 z*X{6fBB~Hd=hf%yX*L{S2F@x?*CHp_G&_S-Dv;jv(4r8om4wy3AeZ3 zUNeAiT~DqM#rEo@eA}^K)W6>^Ow3N1nB&qDvrSN)tSPhH#-^vlTe4Rr6z2T2S65R` z%SwCAsC4<0r2N=NXfD>2NF_}_Q+$gQf0%P#a@D?{UbRiFM{2XMGIMedkvV_E^-@Bz z{57E7WV^+F6ufa&R_c_wZ-O~xz#t1z&Wgh&EIfx$(!nmMJ}cX+hEb(AW%{D66O>z##21dcaXWAYX$^2O|qj{^-nDbj_aQf zo|Uo$&w^v?$b+=RV^@+fwuyY%zW;Ij>%$WAZDpyOU!piBx-* zcR;EK!CR$eSH9zU4ETVWA$S(F4aizGj_S3;5Io0FobUqwBeM+4U-;$pJzd~VOvCBv zI~LHmlZ0EsuUUuxZ;jW?)OcO?|L=J9<1Ti3yl!kCug3{y975aSEYsuF8++v70W~vT z!xM~yogN40wC7n#GG5^u%y?bSFXPoi;7&-x>3D4iG=6>(&d0AAFFBv4mB)TLC-%-% zo;M5Z9LmZFByR0le}6R=O`JY_-PSYj;p#rv1rqA~u=p3at!MqiR#dAv{4~o;v8EYOQM+hly0BVm2{45>ec^s+z z+4&U8Q*F}pklcHKOhM>T7}(y{mcUx|ww_Y6JgBbRt0++S%ZI8wU(h2G$%s9fv+p7X zlXDb`AwCEvk>+g(HV2)HpF2p;%nm(ca6%MiMmCQIHG}YElsgk%l?>++evB{^SozBU z<3Hl5@u|B7W2r~B#0~tmlNhnr7}Fo!2U7Sm@L5~i_4u-ud9p&oz^%!?EV)Sfkv!*j zSH|{AH#*S`x)fhfzrStHT=@4HEc3eZ4o0sayJ8+#FMN)v;>BQR~Nak@VX8~5Ew|@(v#6j`uSVY zGjzYTJglR53*nk>PGr%ptLLJ3sJ+sV!w!j@4AKhmrTVQM_)^v(G9hz63U?3kiIPr~ 
zRfu&rAcfl?BHT&w$@S9wjbapffDjUTCy-b{OBVmhxnt3;pXXvBS#uNDrygoePSY{T zX&M#3n?A}kea0{OggQ&f?ReeG8_Pvmzbl@X-Qn4rl~{Pm%w325L2hy2=UaN_-d{Zp z65@I*d?*VQ34@SsL**s3&3sG$c>DsT0kV0NRni%eiphK?XGTc5o5TCt&Ycrm-!eJ9 z{-n$$v*Z;*El{i+!b6iKGF}uXYWt8HMN;5jd>slrodWH1?OAF1@bcqetzOo}XV$k@ z#fpws-Zo2_T{KtFYppKc-2*PXmYZk8c=HHzn$Z()3-AuH%w zMwixbF~pckxL2`OSBQey$Z51@t;H{r7V(V!W&AI^JH-P*Vh^E~9U=sJHd&t;y_A6- zogK*}BG2dg9AleNr@y_c{`WtL}VG^t^w z&@6_6?Y7S?(E+Zb-8xXIh8vA5fM>{1Mdw(Kjtt2&B5p$Kggbs z5UrQMc-MW7z_ZfUt3Qv z;@#xk>ku3)%9Yno%e>apOFQv7GoL_Li7H$W<2VAc2QT*(l2YL1eJcQ0E-K_r)@8*! z8vgXsgXCR5TQc?%GooUm`VhmT;WPV60mo7RPcl>&kCTl3HRC`jeyC*hN}fRkd;&W} z;Koa-M_4P%iWK&QsJD268gEB#V}8X;G+oR0m24;SY^y%V`1mu_ok?wHhPh4K`597^ zCyvr%OqR-d=}V{^P{trUsS_fR_q`cW%m5G$pyX$0O_-d2A2QNtFB;+}=bJvuWZP6P;y!Jmq3bi;CF_{euxG7wh&= zz=X1^FXcO@dK!Oot54%^UiGp3?Nn`9D|>L+;}j5hjVD@=!!dFC1ZVasZ#)}g^FY=V!H)x7mDw<-qUO-66cz-_*>TrIgFlZ zcZGAxL&PGQX)D4&FZOABuHNU9z(My=8W^Y#1j=eWFGp~%)!3~f_FXCmRRlK_@(hgC zc-D9P8TW1cvdpaalGSRw<|mSWNM+;M4bBo`_Rvtt^J9EOW}Ztm&*Gnf*h?OO$ieZ( zcdOPbdG^I~GV=`AJmaYAtY1>~kdb3jfcImnMoJ?7`n4Q(3_xrb-O47!@hYP4321;QVuG&nfM}TrgC!HFk_Q}{S*<>=t zr!z1arDig&G^Mxa>irGil-@55OdBc$ia!0{lQBv9nw*R+I9X3Ozb9F(#;M84*qv;A ztXFC>G|!LmM>6w#a)eelRcE7gUB=jYlNnp3HT&Y#nYsR9a-F5~aX>Bz2bl}WwLgA( zX09v9b?%&sQ9nbYrRQT>$NA`5;HIU) z9YuD@H?h~;X0(Qi*6X{mWs;{<^_2vs==nSY%VN2&&rOBBT!!%TDeMt;ubYzcK-FDM zim&)d{G^?avg#XXuUs*02rrjI!RdWD4NA_W=&Pn*BZRuqSjA(9vp~MXfLBR`k*t)A z+CC&+C6Mj4={vTQ;zwz1mDCnDwY|9HWNNzmj1D#Jyc((D?ZhA5u?24vuM{3h_jkg|35)#$drpEJf6*H?GLb``5 zxYJ(PH2}^5jhkN?=-;8E7b}h2_#q4rGl#86t|;2^NJs`3xiR%0ek-|F^`X|3Orsb1 zL_zy0?H119MSf%7x!q{)fn*X)m$-0lXYW=R`LSO9Pd_2@78IOcPIzf&p!FZnsXE>uLCH30AqVI&CQu-kCN#_2<1b!|r>O9E`?InzX3Tw#% zTCgbB=n#R&wutLRA;>{W_Vm<^y8E>-Z}E`0g<)cngOc{0*G@L1>8(JkspDeiFD8$N zpLe>9PTuUV2k;}h=UG>oVS0dHk$1y2tvZCp$F?T<{=V`0J^R~IOIxV?r+?TF%PZr+ zQ@em#5#ieJGtlW$~-X4GAcgS>i{monkLy-7&qrCP6!6 ze0if-4YO(5j8 z0q8A+voj9~vITlGO4x$JR$&y%bA};pOGX5li zWf`-0t87~)Pu8u?`<~N^`!l*QOX*>B#W_Q=9)Xl0f@Cdyv2=5ztX2_S+SFq& 
zi+0y&@b`GW*q^){BO;7uL>R(lwfpB7NHJXbFY2_Y`kA{c^$U8*DT_WWj#TwTAug~d$9I* zaoezbNSphg)Q7vg_k$X2_keYh)mzp{etZSZk=`xkHzi+V0{pT4!sBMqEPqvm*K?3X z60WXpn1Rsxdy(A?Xt=%rMQH3MNsWz(&PUTiSq8kEVhh_S1o4$gwy>!$GPc6v1Ty~kAC;1Bf)(=YprK=TfMdIpPpZGvX+b&1$kh8 z*0U082gekvEndnKw9`9Bn(+^sAv3!YlPg*xMVXmJO}iy6?$D7 zL8Jd^n{s;#9jjh#= zpnpJ+pPt_LFk*6NQKY;txv|3OFCEVIFgn?UsY}XX zPW<>V$@FGpj-I@CQXc!lh~qhG(a!IRf3yqy@;_NLm_eg-4({@VH^(%B^Xz}fmtu5) zW&Yi$4$J(lMZ)`R6)JQ`ypvd2Jtu{1@;(PRJwD%@FU-Sl&RBI#Fy6_uS|)5`CoT*} zJ-mLInlI+cj$W&C<#M+=h5u>DEgc(jCJk|~Opkwa4nB}J=1Iph2O;GjCK%6Si6}^H zy-kg*iUBMSmW9o@xu4T)$uB^(-HxpTNbW}{ln$+cJqMUC%HrIK!HoMKzMw2eehz|f zl6y>Is&e&6*-E41gJ_>%!7KR{V(8Bcp)(rtx&gduDkGITUB4S4s-pb(tR(zce$j6y z?GsX1voF`}k(@E*C=*%pNSGb=a6`U4>HQy~8?G-@Z*ilAG>kR-W1l^%n6_z0SEkGF z0scna{}r*@akhbs^G;j`08})*D2aje) zv5d|d)ntXd#5LJfy26qsovoK`5rVZ8Mh77Y^F@!~vPx;5wiCHSZpEk; zD|9zg!f3b$>x!cN|DWLlQ||!+zYgx^kXxgJY+R`bc6WRohBJb*m@$BMm9De}l|w1-ZDxpGvR=U$^FLB}3= z3U<8kagS(Fj0@J z%>iw8Wp%iaTi}{3-qdC5up=;}y9}h$YgiKdv5BBA-nlG;4$qeXtupNouG8{Mny~W7 zsDb-c(av5FMzn`}3pFrl=DhYrxiNK+i%5oA8Aqym`Q#2&T}o9gw?WZ4r?k-s2=h0# z4p1XF%oAQEaV=BX7C(3)E!q?x10N+0`#%smlY&}aw_`82>`vF0aW4p3q>x>LhXXd2 zRli(jI`&R^?3LI%6|uO#Xnk0u$5lGsJP!~zH3I2%W^s`m&Vcc1n6Y%4$?J`mXFnUFVHT7KxTU`K@Hr%Jy$1s~&3q zRYq})bkmY<#_9S0Pf8A&EXtN-Xj1IfcC=6Z6 zgO6WzE0DPzaOwjlCDYYJLWOuxD6}O_+2?(;L^^9bbK)sO_jI;@hsvFE)F!=6xI-a-#lRQ&-S;9}%u*!S(k9Q6iTW<}Of!kNyP zfwpr_HcFiF(dvO7xix>&ELwghR>22Nbi4zz%SZiG>C7$Ux$!pJx@~>ssQ3AK-k1NC z6&lWJV_)~MB0~tlSAK6>f9TA@%(bnalH#d8UZTz}IW zqq1wNUgbKH>Z~SMD|7J|Gb7s@%^m6+^}XdZC_oD~RH7lt5r7l%Su;MGo9`R-o#kAn z=^s{9HQNYhb4ed^q}Mr{n?pn3Sr`n_Q&v>Hi?yIAw(vI-iiEFCr|}R zC$?BOu>%1}b)rkM6J1Ov-nDLeNYhiD=$h(;81tG=gaK&pM7LBYx+Obt-6icEC`fgH zmo~Hm*PT^am1=zVRO7oRXW+UMOv9m_djvQ2FiqEH@3;c{hbfRvb_?umqnw>$RJ5@wDQ=Ln z;-|bsN?U^{0?<1EC3F1VOsF@3(rqTF&4%xy6DVyV&`+6uEWChIQJ?EagxvRG&ytaB z>=~+~Az)FVR*Vpdi^NlysIwEOQuv*)KYl88v|N{>F95$jBcIWR(LY%xwOQ^~6_Q zs3W0;aCFAM!U7C6%;;sseXX$A$1)HM$-hpJAz^s5sCdZaGH316WD#L%FAUNQ^uK{l 
zjWy)g^i1d&&>V9Ih`Ca7NvHR-9rGTVnYWU>?d8&}HQFpU{=)?s^LZRlI#XZ$YNj?G zBQv#3W{M6nQ^RFMe>LLTQ<*Go63c7}(Ej+%RMN6c&qE?I=QpJhWqfR%Um08`>U?s5 zgi{kWOebn~a-!<6Qt~%ccbF(qRj!=K9EqN6<-~0MjQ5x!?@3v!ObK#>Ic}4E=B(sI z!K5Z9YV+|OCh8-+x|(IeLhnHq=1AwtOw=ntyHncF!RF4txW-+ap?8}*f9u|kiYm1} zD`PL=ctf@S+vyCwJW?5AZ**=pihfK1gpeSVT|Jk-In`(JH@A8OfAgx3<}aoIWEwKU z8(laeyHSr?P&=RA5!rrh0Z1(7Ykv9*q#T`={_4-y?DSW%7|GW%JvA@WMsP23+Ho=i zKni#+!G$AG?k^mXN6_?KYv7?)YX~bv=Oir(?*E%%Go2J{b!HpLb>vpJEFo_ z0GgtO6{B8Ta*w+km0)oHA+_C_JAWGdy3iHhPtBb#2!7qyy@Vi!Y@fU}Ad4;7OX!9{ zhHe-Ye`Btp8~QRO{)SVY1qLbopGjXemqGJ4^w6P<=aViHh;Ia5?(3Z-lH`U7{Sl%+ zEA;E7_=9mOzR6!Dc&}0Jmc@%_Q*Ur%-<17v>Nib>sE0lwEw#i;{J}G_s*h?oyCB%q zW%SwkwTB!?c&7H_0r!FsCvH+dg%(SY{WnI0JGa8$#W%Ons{^Rl1uJ;v7>rvU`%w^b zfg#{=AINL#Nrlsnc=?C2LO<0-4mD4qXAF@cix6BUOsNWj+)P7Z6 zGUXSu%5;Q99smE(_Ac;IRaf7CCW#m|dSV+jZJ|axDydXLi%JA*4$Q!$294akAx}lB zJZPx~(TWmG0?a#1wSBPKN-KSez1m`1ih>$JMJ`qmyn%SZ`x#Kd8;A(J-{0D2E(z54 zdH?U{??*Cc=B&N;+H0@9_S$Q&z4o?X#YN(RmP$*HM}~q=_Ud~UKk(gw5ANy$wYDqN z*9CQb7pOmWg{l_R@GelTU7?N_)VI1oT?`a)?mZo?6TBZDbSK{Xm0a6*S@Vw$O)RZ; z|6QHn;*LMdVAMQThMCz;rE6rxy@a29waY+BO z%clMy#_1NeI^yXj4t9;9pb_XgMm_d%n1G*>+|RI zk^DJ{EDF6xJzjdSemQuwnnvo3u0-0pdPTvy`o^OD7uGkk6n7(U%;)>JGu!|FgYmce z`K}}4;hr}B=3h`Ske;Cun3o@aKijX^{jS&*g~jH%V)^lR;eN&b=8Ap0u-Ii3`-1VO z2wC-HZu}jzAJ&w^`qKFYZSWj1`SJJEbJld^eGcIJjlYGt*Jtv&VEoN2#JXhn{_M}$ zui5dp01L5T{KY{~IR2)9qtp1C2YhgB7pQZ)Lj77$yT)|JRnrw}zM!7z0#(u#>c0i` zmo88*f7W&UeGNDH|8V@xr*K!%-f#S6{pDc}*)dFMHbY&N8dA=d)1k5cmy}JdX-sjj z?=e!53{pm-N0UP|W(0d-exuap|NZU*>FiiN{M+H6kpWe`3v=#1w_fntuGT6+^AvgC z6eZEuPd=NUrU=4mj>dvtJ;UxxZV>DZ(tKx|hm0pTW<~s|M%Dn;!8Hbn0ni*BU}Bme z%(Q24krf~dS6nPBgF4V%{!3ZhA*I7ia@`p!neYN{I9+QswLkGVMz%CER2lWOgs8%;W9jEzeB$ejJAMkyF5B81h z4Ar|U)M$nURp(HGuKoii%b8B-clC;2$k z$YK>sCkS6|Dq*hXEOz?3ZX6ix<+C?Mp5*$n)@6tAeZVFBDfjtPIhIMfug{s zb2#4*tbab@!fR&8uKMRCes08}w!i*)#0uo~&)@leVEuE4;W&`~xrsOb5Bg`{ar^6^ zuMIO1%jfd^9&F3B{J-g+(-6td`scywPWtEW^ECR+J~sXH+(-IeU>`36KA7GGYGGHX z#ezD&3skZz)cu0`P8X<4yFx7$)ZtyA&IGEn{<#4+_5V=+6j3;HT5jB40rv~W?LqGy 
zUpQ|6z5ZE(ZS?;^|NIQ}|5y6wwdj8OXYaWt9G&&gab)2rXva}^Q|^oO&kX6^0{wGT zsCN&je+WmL@orhTjGj6$L1w4qZtja(Jg|qi`0DP7Bq{YuZLARniV za$68FR9pX%EtlaX)iwVV97nAH$86RSE*_yFxyJ&UX1p65hL-FmTZH?@qBCSrOqT61 zhQOaG(-TNelU2(@!acM_TlBzx3F0SP_zVs{2f8sY62YFK?x`vGNKXa9NBhMdeJI`c zXFLU3f8M>#S}tk|EV!5&i#hZ}B0bp$Nk>fZJNJ|FyCs5O z7&Z7^^|i@c{G%Sp59>M3Q{MWzH=7d{Lpc=JR*!!y!THw&u}5}_Q%C=FN0WblsVYjJ zdBtDlMCDMz@{fM!d$%w5TQ(NOIlR^W3_GU&%6Qd>*hC%rlOV+%R?mY`xl{VAP9(Pk z&#jUNG*0;PoT#F_;>1g|y-UG(WHqdu#!vQJ`~Uh9$cP)34d%6N1|kL{r`=x-z{AJ# zhhz1zntd(PqC<*kPB~u6o?crqN5-(=ruiL8Z2&^N1>j-Jy5=c==Y|;FIo@BsnOyw$ zZr6($%eDi+uj4-SDoQwR^(HyAM0VmM@1`DiP`WYf%L!iZ$0m3^SIJ~-S{bLXobMx^ z_>gTK4>ml)X_qgd)2lG>II&iA6xf+Q_AU_t9BOSyK9*>Gq}V0RLEfmZ*ApQ81SB8H z2itbDxR4%270D(ygZwuT4SNJe4+I&EJEX&*pEp=BV&ZF9X%isbA*r-l>Ok^u9=#98PrsF87Mk zl@olg_jYQHdA%Q>dPou|gGjKv5J0&BeBc0{IsibW0f=reJbwUy?gsFY19;^C08s<@ z*a5uV1t6m?%&y-*90EbFisBEZ9;y1C5~D5(Np2u#pGdCUDTo4!nsWeyiUsg*DFQ=@ z0HTJWfT98nr2^<>B!A*ae!qZZFdQU+zJ_6k1K0sT0fue@7+@H7I)J^MFbq2Q3#(`( z^su4s5zbRG?Q1lj(v5+)j9z}%yM0+KJ)(#_9hGwu&V?~E&aYeR zdxO@->X!T7z*RmQd$aC35&=rji_VvDJtV;}fy1GhoEM$V*Wm8g&^e6orx~Vj{fDQH ze+1B_a=z+c!~NXgQFH|V63Mwy#cQo>Gi>b4>hTZYgGAom6TO#betM7(MuHv{E2*qq zji|F#JjcA6MNtjd;3rh8r5dV?bN$=cx;d|VaQ=I=ra$`3n>A5mNrP$*(2Z$M^MZyX zeVoyMQw?gepCQ|@bC9xr#|rMA`7>lmM^tB+T2NmcW|`nszR3nHd4{}>Ini;nfcm_e z1%{CjfoMcu<^!(Jd&AFVdILzGgqkrg+QJvj z6ICA$qWeGEF{Fp*wLb}CvLw)L+q;rLj6TJ@ng_$$9}H`MFuIK2gF83xV@G8|#A?#% zAI0}je-ZTu?+;PMbD}Lk+Rln9j=r{IFcr6zqU}+I>s_-s{7}&H54{VXb*+AUJ$27D z4(f9=nDn_l{FI~w$3TNP(7qRWjKR)oTV$h`CjnLzPaVby+bKOVhaRMX+@jxDgkQ&5 zmq{_5w)~JO2ewR-MxM;BH!&_3Jwy*Z9m5lF9g^E#6g#?z72H^}&Z`_DJiakva~8cx@Llf*PT?U-A{dq*l_QsnU;cVG$BjmataZ%bs12-v`}HHw8%B#q08IOzR_FeSJHJ@d_GF(-wO-CE4$jA- z*Dcl63shdsFS23&mSS5Bh{-DKe&#%(B*B_FhB5iQG(idM1NCIv;Ml1p@zgO|S5p{# zl>^h2)x-Q@=NSWal%$^!5k~7RY^d;UqbKeyd@L4Ou0GPmXXo`YvCJ}+~MIHK+2{>kzwQFSXKmwu2)#UT{mrxmIRt3MrPUe-& z7oO=I5PhTy<-47 zY8r~Qh{hkN$y@lKv+BHsXIJ=7UdIPlG~JJ-#=H3l zwnZBe(y9b(&DOpZV8o9QGVYk|z@2${G#iQXJqN%w;X`eeWZqKVt& 
z0UmtpM?P?3p?;SJ2#SBl=^U%nN(ul1uNs;@FCv@1vM^q_az6b=$qx%a|E03j`r- z495#fSAOIj*%p14x)j~jO3yiCO_ReWyrT`T@P*I~K$H2x43AE4q78oQ17EA(12L-|9+9U2dJ+M*w4 z5u6i3(EP(#0I2x@%!x{|x%P(uBB5L;AOVQE(a*Ce4htbRrQ?SSMR!BtI@?gZfmRh2 zZ-U|op@KofYi zuN1zoGpMqP5UU_~@p&XoVxnn^qy0oaA2sOfp|BHo6V!Znk6ZtG?eY;D>~pj$nMsed z)y{ym994&-l1yKeo(LgyL`CTf1Pm8{C|=%Sj6l+k)H|%e@|Ee4CA^vwozJV7glrpf zs$#uVtdw{6ME{U2_M+ZVtS`k5;?*LvLany@!(!c4tQ+sGTwSCqGX(<^sT(7#^>?E)KWsu1^(e^A4 z*fHZ<_w4j$UCX^18nPAQ?54)_nOnPS+frkcxUA1iomga3uwnpiCm$>_5-{H*6Qh(K`qMx5=#2~_(mxr<4!HeaRY z)9PR-^`-}v$eyaDCYDxL#h5elOAw=hpU*gcSp@N%w9NFQ>a_<|Y!9m09<1t_&%z?Q zK7BTkR-a)?R9nU50&{oaCL+qHdo>lzu{PU7Soa)j zwf$msA8K_EElFR~jk?L_BK=9VMuD?#YFk&72E(Wxb|35*tlD8nw)WOXIrI*Zj*2+V zf5w`>o_gkHNtkPVw_n2MW73QBvz{RyX*b3{2OjrjCV!-LPIz7+xgzr;%P{a<<_bQO z&t=F|$Mm@(b2Fd0iTBhqvJ>yg;Zsv^98dclCuBSIQI{9^FvjiINv72?KlJIX+k`v$ zq<4iCiDQ9D(o)Sx*)~cKu+Y`0)#v>1^WN}t8K1#J>uEqUbD)#+wgudnVsd^b4^Gav zd9!}Pm&UXcM@?6?eKrkMt~jhLKlvVV|It@CFp!?q12ZEh^~ntG*C*?o+#~a$_%R6b z1n$i8fASlWt_4}nFUWF!LALW3IL;pqz3Cee7*`E-o>>GL)iNdCA%OH^>NA}A3MX}{ z5ILLtiRyBr)*eDZTa>K}LrcBoWXC?Ee;ZsvEi>Li0lgPCE>DzXWzIQAN z1+(8q1oI~u>Bci}VPeQGdGH-|FKS!XT*vP>6#QQA-M*r}gH|0x$*2cQ+sAd)e4JFjQmHW#q zgxMU=kNYjJ0Kdb*%TL^)Kbe`ro;8%)*|u?p@(tlrQ}81_?SDM2Fpc+BN5hueLbe>Y z0k%vHP!_qKho=Ylt<9i*`$y4;at@yB}KdAY@Po2(dt_4xZ=rNdUt-JB- zjm2tVS|WDgqG->d$-RBnYYESwexZ@uWZ1E|up*usRlZ|TRdMsh80~=Kq)&$*%R;18H3uv5o8D!4p1gY<-OQ{5@XVuN`qWHTmqX7a*VVTkyRa zD*?X{Z@oLGPxss3?Q#IHk+bw2zk$Ql(J#NXky*rI&)whhXfCcROc-YIl)h4;9Cd_B zOP_9CmR+~fZc@JN)W{$qjojZ2O!=|?L{oaRKA}ha)aXjwJn6AY)R}EG`z5V!$opd^ zJ)P-wOl=~=NCFlm;Cw&$6^mLK1e<0Cs7P^$GM>&iCdL8 z?61Y7w{UvdXsw?U)CNwUoGe#{NZDyA<<3z?0%8H>>d>ko>B?RXcV|i=a~K2;X`&mZ zmV4K~@GD~rU+U?eY*zREEd#Og>;K4k$J*f^IqDO&2yx&q=mOIN`q9@@w0DJKMj z5KL1Cq!Qxjx#fPUA~<#)v+MUMo*(gkdP)R;3geEEhmSezdvh5YdnG#OHk}tIRG6V~ zZW$ffi;?#PXxuif_e zIt@X$zbYq0&pw;tiWEV$l$l=fcMPs7onBQb#(QsBSra;0` z5VV~cj-I-go5@{=eiPwfHC)Y$vT7%*0;d?DS+G;BX{UG2iDCexp+N{yT? 
zqqj(MU5+mor>yA>G&Mh#w!U55il40QkBC4VaxHOARJz_iQa zCLF1TGUr9riMJb?oOVrpC-_e&5djRBGW9OWzPOA{c~>Afp6+{0YIt9N6o6M>D`{b* zrH1#zdm=a>hNR2SkhazV2!MtHBwvoSbvxpR{%z4)=I&}Ir%O=*}b2(g+p1Ty`pEY8Q`am4E~!U@ITXrsUu1m zrMPhn;Zy+5&JNQr1=j&-14F7qidJf5L^vuC`hHWscu-XF9#1rBl9*d{rEh4lOh>Yy zYHfHoX^Y0ZKYD0#r9Y}cDT9ZG`F3W!J6l5&t|6pt(PjFQV!3KRPK%C7a6@7G)Y)W) z(vEwz0IP!U^v3$&Ynif|wLQd)uHd1|6@N6IYu~g#H7z<$HbJd&CrwtUK~GU2i=m%R z@_sY_rApe>NBOea*jx2p~V6xZm-F%P@i^L@!(gyFn)EEfi00-kw_lw z?Aq4wjRS1pXM`&eOw!Lh^-OR(blWy)XlVT|UHKMI=6fG1B46MeawBe!@Y+`MO`9Wo zur_ylvufaVOvvzh$QoL{rC3KuxgCS+%^q+(ef?k-FzbrEBS&)a3+B;b{A`&vlnzZs zYjZPyF+%xU(yT18rOjAUoN@Xpe>oxYCv`8<$h^VH{8H~ot{81hZgk&Y)Az$k=xyH{ zmcLd?2l7K>@>K`$jsVUC;6;6JSpK>J9JL?DSlx@)UIW^p+_u(`zM(h)Dxn*i9?mK8 zL8Xc1hUsvQyq$ee77Z1r&+aCw1a1{k_F$Exwlb9wIoLHKnm>mZIr0pYDyOEGzg}GT zVe|2R@|pD2NM`I;xVS#P`bz|Yz}SEC<=Lyr*b}?0Xk{FJ;f#7O^L=wSOp;0=55kHX zR&c5B1`pZiS;0T~RKNEO{v1A4o5`q?r3^T2tJ}CLV-2Ne@=j=foi1mF3!fVtSg=|+ z3$97@bx&d^Go6aYU~O+c1Z#UC)^>386I4IrF{!=0&5prPkD$CBOOl^wspNv)MGJ$u zS?PpGV|9%L;e5$KLSk^bq~Jse!BHjy@0*}K&HaX&m)X-zt_z>jj1qf>cWHiqA^iOz zet$;K##2^I;qU*%Bm1m(^Iw@dV#)PB7d`pu?}^v9A=uv|<@1lxi5GAH^FfN7SA;i{>{9J_&wXkGDCI70DH&N1Y^y3r3alXb7hW0GJsA<#f z^Q>ShpRQ?F+7oLnao;t~4IzJ}TA)o^`}~68^S^}je=>Z2(+x8JWc|NV{f~8lfZfJe z5iiY9(->yJ_Ya;s!BPCAeV!Gp;L}n3fITVXg`r%%Qp3#URC5D7Pq4*RhjjZ2txmX> zg}tI4=pvv+BJf@jc$x_O8Uz-`kGH-%M#<{`K!tqBEvcx%{vFjkHL?w;#Vh8|d_H0| zuV6;^(&Df$=lXAco)#iIy<+LgzwjD`_lS4%a(>jLG@nl{0HHH`G5L`;mZ?ULVL2zf zg-`p*PkD^_kyTj>h#QcO<+)OrC{f;L{Z2NkiLd5MJ)Z*@kIdkeT2V1>V$|F>@yM6(fU4f z!25RYw)JgX3r|7;(R7%NGu%^g^B~*Y^XVMf{&)acdb`yji-qs{qlXh5!4VFBBox+T zEcFYnsFi_BoJXu=QgndU0#@3bGWnv%fXWV74;wzRpl~k=e?d7vb(RW^rf{cXpY$#$ z=8iTN3js)>zY`{hhdkyv}mWPn|N7HguJlQ%`ZrFo~J@qgX}C-(Mb! 
zyyrh8rBJ`)@yL{GupC)4A3zrU)X~F)bUePU(Ro>X`CATeoL)_Cm>B8kH{kG3lN)2n z&n99_z2jT(pZX*CEf?Y;uf5<*wWFfgYm*&Km9=>9lP@#;8uOCFFi3sZe6?d+mWQn?idF5= zgVWt{rmP+YBmLp|s<~=Ac(<2puTggT>SmMZ-K^k z?k%}-V%gT_iIkQebdWmLGLYCJgUYHvLb-`5_kERH z^G%ftxR8-@535{paR~_*sETz0YY+?-l~ebWU4G@!?X{Or;!p51%85^%8c;Z&aKgUz z-9%&R9*eXNSsWNKt7JzFg);q=}j+W}^s^muZ)#!b)Gca9}Iy5_T* z4_Y@zprbU|HwvCI-j!3Nigh)Z6c@2%I}gb%cJ`lzvRA_bFQ_N*h2L}u62B3qqd6^@ zc-y{xONxjaPj2B?mo4Dvu8^ARf_j4la{=VLT;kIS=X^mP4P*=>%bZCYvC+N7&UL%|eTW}@ zX#yLxbEC^>6IH1S=M{9>BpBKzLiyB~i(8&Y|pV&!%(W@bzYhb$Ef*|9` zH;mo>GjCYmu$l{|&fK?;glzZa8Lvw*g85w9JQP>wWAL~%D?V9y(=Npozy%OG8b3_6ih~x{29l)&;-%%!XES+(aNq(liu}ePz9FK{tEh6N~u(~Mf;r? z84Q6$b`0A!WAz09AHq>1BFjkQ^EIdx^*l9L@ohx-Q!zZ+y=pTQW-G}GeRtNp01v4jwO=isl@5H0)*AeUn_~#EDr{>X?-=Sg7;-=5?ZL^ zRQLky8h`ogCEmg(V>Ro7qX4r`k`lC*bIusH29s(q-51>|16kzvrIZxwR~6Z-iZt7IurOM)v(t|eH z8z$yjeV##@c{>Ram#nM7RP{+zgZi8b)!}9r4OZ2Xq83Raz6#=Mx$ebs8y&^S$jriZ zZSrv-@pxvlgkvbB-RR*s0^pe!O|Yboy7KXOKy#|&gx?;3iA6v-*?|qYQCt)%g6gi) z!$ue0q2Ny+I4-Jbib5AN3)%RmmIJc!w>vtqF;`t#j)&4>c_yl=h2O|?k?)1I#Hgi! zjk$*Y3pvYthIY#CXF6Ff6lBl@@ie^K3RvNOKzL-(HUn4)clG~IMS@YvY>zONFe_Nz z1NmmFf)kIy(BsK%2CO;=$n*$p1`y<}&4Bl`Eig`{_uCAR;fvYCWT0Uro!GHHl$iBw&z?iBClFWIbWy8)$9@8%M|x^1Jn8gJ$&(Bmh!_^t2R zUVv@t#^C38=l!?sfL^!5n(+-Rh*L+z(i0;qOK)W&E3&9+qjaus84o z-^k;`*1&GI22OyDl5tENdZZunfp<87dMaP zZ%K24zopH6_-h*jyV)4{BUL1l%XQc=bid=Bci;0H0Sz1x1|aLPKT6DVCBmJ7hUB9T z;0^QD_t+0*-6PjFgSIi<1aq$|Hs?Ou2%MzjinoMq41{|QWJG9K?$EHokZxjUz-=O! 
zyPv}B47^@SZMmI+9)+6%xvhdTi|QVkI!8ML&o>{Je432`IN6SCY|S0)qc)R`){`f{wl0p zeVaHAAGefdM+mbV96$JwdG*(F9Par$)gNGE04aGSmb{W{%{At1U*JOw^7HdA&k7=8^Y&yo?i){wJ)&2yLll)JVww3yFl^u z+3X8YIL;Q3pT2=3!GqWrxQ2ZJYC@0?rf8W}71l9H$muBlXkS1@>o%}2aDj|F_5~E~ zT$E`D=hXv+-F0!h)_ef>?iBYP{u^%BhdSvW>%8E^3fHSub-lfIEf})f1GfSkXFW#b zD2k`~Te8o3YTXc5&iZAMA*lhsWYIb02sTT(#A#rv*Bz+3Kg8=wqf=ipIoYqbGUt1C zSxXzukg7t`8ngv9ke}s5Kf^xH3ck&!VF-HiWU&N=`vd4_baE(SE93f>-Gpk$eYJh6 zAR&wg*gwf$%o0jjWIKGCCqCVNFMPVq?ANY&;StVkI_ zyM*rc`h9%QZ4E3v+)?~@`#dZ76Q7Ra`SyfWYd#>iLzC@+1C96wm#Z+`7w}uZ8!6Zu z=)(Lq0^U0A5*NQJ3ag;V)>+&?B##A`W6M6Pc>sM!I;< ztoe>@dtxD}fjt$#Q`vy;n#c^zZwh{Nu_<2L�BURt`*;_T=*sF0emP?Bhs9j*V^- zNBad&RmtdltTtAMc_aS-W~T)|Zk-?6Tl@Qe3nY+x?cRfJ2MBL`?k{rx#*CsBvB;+) zC^Hdl*fk&(VJV1r{JPLtyyL>K{^V9!No;*sEHLq*drG~wed0SOAS%3B(zmIR1U$ym zHx>pw`mOIoT$l^NT3*MLT(F}x9Ig{2_4F~W^%fX2ld74y_cfDBb5O)k`j(E=1r_3s z9GZ)GpBDaiK4h=zv6t=t&fKRkN=i71t76Q;PAYT7gsn4 zj`Lgd>=&7poh>c)8mAx%T%g zuCcOcN|RQ8xH0=8IPgb%C1OGq0bGcgFZzGk#AH+g+A$Z@ZJqL1WAZ(p5EUI5u9*5$ z2r7t*#s8a+SETR&;VN?xi?}z4SiBOy(bg3EU((z7yx+_+Gu%(WwYH7>$+LZqQ-qUz zV&}g&R(p|klfqJ;KPr@aY|d(iT#3aa4oX)ZN4F-QA>f%a&a0tFpc7XR~4&hIs{mr_xA*p~U z27Dz0+YNa6jdlzPqY#Hwuy%gT2K2~$3?lclft<|-6FbL%;Pc9`Wp!^^9O8|{Ax@EX z?$9;=ZR;YV%i<7|Tj>prugcVoywArXy4mj1wb?H4iJA*O_z~3KV2ZI``;wLL-qi~I zEb|kFC(E0JBP(?TV>h@gB+J4r?f9#0L&c%>rh4p5r#3{J9?2iY>X!2?D-ivQg;|_gViZ35v^5@7}c2%lNCKE z9Se%>X9R=$l8~G*xNw37q5O6VH?=6xb1Jw{+64d7)h7GFw8>C#CV-(nq|R62!#Bi- z%tu&90&}5*bQVFm$a}d4LF!9EJ!K2TRx*p|@)A;m6544&f%`}ZQBM;ylj&NTa}EOy zOhjyH@CCbn=!nc*Ol7=Yn>kzpoaKiRjX#QeXl8IhbarQm#o2Ju_PE>9{E}mo zalucOl6d z55}NpTXz7R2T-;UX|^L9gM9}TKz|$PrR>pwQ{|P5?_takJ`Z2XCn@*7N8XN}mt5kH z8pEF@HHQim*O)TKR@)BB1bLbhLGb%M`?!2iJBGt8_xZuunnyx!pJl^?nN%#@nj`g} z6j>_k&;8Qyz&9?tYWx)w4&^%$5W&y&n*FLgZAX2>(bl6spNad&hjgtZeD@M4(?Yh^ ztHHaX&4TJ+WLJ{@^>)aNEJ+`g8cEeTl8)~}($YLhpbbiRZ9PE`v2-XoW;iX@aK(pM z`g7R*Cu*R%#&LS)zrdfv{rK znZF~-x)U>>TkwVQy;!e1%l9BhTaOF*{6EY06cBVG>EpM$$oH2WNqPAm*oCAn^8F27 zf1!M@r-lOg{zs&sXWstp|Al-n0g=h~mH>{uinCx 
zy;NFLBG=5fh9y~TYH~?ZI}|8kGnRr|3YE^n(THT8Q55{we>-yHMBtQM424Vz%udOT zreKI_8$}@GTGT+7zpCZ(xE~?{CAe4l*L;R*%fSOU6}*_ z2H=c%&enX&oTUQ4$rd1WC?)*S!NV$m8qXDa|2K|EX^dEMuU|4nmWGx7Go{7W3Xb%m zII>g+HxyR-SIXNo;4mhZWOc3X zXB?aZf;gAlXrAg$g#Y;xfOzt=c*#hc>b19<=O}G{@3o>LVk!TtNo>{m*S!TSDiXE= zj(!Sowq1^+zb*t{Hy@6Qow%zUN6&pdERf^ql`2rk(Rmcg4nuaWd1Llyg=VUvNTU#bdy6{@7T54zPIbhoZ0>IuR8UD-6X5d2SocNV;U*=FU~ zG*|@+*))Yhc{c6yOSq*f6udPMXU=USjebd$-Vg0TabcyWQhv+@nMkMEOnx(ebVG72 zO1;uyDH>f(8qUWUT(S9bfxWs|Nk0~2%;{N$+8r4-`g zuEE`kg6lxcQ<<|smug2JsD_8aPCj&K@|0i%Fi3YT=2T_>i_5&$|2ha72#OTa>V_rX zxMD)}_{%0-(eS`km&uqKK0K$Zjw(YV4ox0mN{W@<(BxRm(O$PCFj({UU~>C}cybg= zv%%GaQ_)#QYy9NPMB+DfG7ejBhws&lst8sS%4?@2s^K`WX9oICrzERSA8M`8p=KlOGZvLu5!iOf; z4M~;W9_t_H#LSoRjhSy<3JP8ZuY6rp%z^-i z$9P$KlDzQ6HiWnn)Ke1kS)!41p2=X z{Yr1JQ|*BPqwtP7QTB%iSAN>5HNXC))RM8N{bdMKZ}iPnm`|Y%{>hyx$3n;*}t)i)lB1sFpgz7za}f++32I4#f);`)GVLk^<&YN>@$;1>Uv z0?JIT3F^Z90<(*>41?t$x$9Mi6bQC`(y6J{Su&X!?1wi@^zfR$N$ruB!zR!o zh?p1>QKqZ5cLaA*EVGs3@O75%oI-ZIr|3$hOvkUrc;)QLpb8{N^A%ThOvEoIs z64cz~MX2mpa>vA?LFsaQ02x!?77U~?4~Xd*eAoG;t26BsHNB#}*X%hXHf*jf+)QeH ztS3yn;1!tGCE9!fn`O?vsNbQp!n&U=)`QeTjc3BQ*p56w#Pkz04Mo+RDOuz^!qq#p z2kVNXLK}N@DV{HC4S=_`!S!L$1c_e~PPbi)wr14#ERfXDe}@$}%I6$cb~~vG6ywsR zhay+yvYp-&w;y3eMtn7FPOc+y23&uIRYuLO*4H?bu@+V#bj>&F63)}i;OFdA_9n0h zJ$-pB`Qa#aHbKmaNT*ItP!A8rjUsk8FgufFf~B$pQt{B74B!x{K${fgaXIJ+cqDYj zm4vu{!SR#^%1)DWudw-bmgno0oT*~a2@k>d7a?tmNLSs4Ve`mB403{&du^Z7oIElk zMwq}cOq77}JePEDv_o>IkYHwry}X%IY$YiHF1X+;@58z#5iyJk{NclG5gQz@id`6fSTUTY;u0EI8p#;L zxzHE9#&cIO@P<8jlLic6Zi+@g+KrhE-%lVw$!L=7qdP*-+ z6k&P`Iy3%8XAfka07qU0+?sHQ9vhhD)6>g1HM?dkU&e3C zGbJA-{ybyvI{cK)XU*8#7el-G;Ou?m9r{{LF@I%!PZe)S*7}dvH(l#39@c~0lv8e$ z6W}kK$SZ%E&Ic|LnnL)Y$ufiI>XKnSd`e*3wR{$t@@tj5jB-+JL-{28W_?-11KvqR zMI{w2-q7S2-65N(qT+t~(av3}P=zX6?Z!Enymoe$81_Fr=ybif*XfVs_TkCrXMARj z8rEmijL)oDD5*yRrFu+mxX|!rc0KO*O#!2kISQ<}AL4_!3{xKCZUw)ILA?w7)DUZN9;8`83k}V>RrM z)?L+J+br;s%#nu4-m`{f>-QFq>47cd!9h*i}B|m$4Mb*Vdti zIMiJ3#W!x9%^k*(=9gOE@TUB=bywMxKc=b(iOO6B!MT2#GE=4Nz4kHus`qDQND&&o 
z*Y{&H-!1s=8xz6Ux|@Xau}zud+)HQQ$WgUd6Wk~Mh@r`U;h()F1cyVT)c8&hx5 zj9u2{MEkj{*+D=f_rG5{e?IU+ChCiA+u6T}3Fi6@^x?(+*i6MOash5zDs-R<@Fj3X zyCP%>Mfq!=_32n0Th8m_sf$Z@e$Zz<$2LCQ`2h!ieQ)j~KA8z&|MJZT#cMe6yDUNw z&9&e6|8vdPJH8H$9BJKpu;y#lF({4J_apI|b+NjQQ@5pV;IuZe(-$_TTbIK8kY8Hwhu%E`WRy1IUs5l&-baV0hE$#DSohzM zb*7p+<-*kU6{)MswIArU>7;wCX05kyFHy)N5|_T9@X#Hbbot_FE~AcHBz5ZArdZ9E z22L{un|Q^YQih0p{Ew9tC^z|C{{58@z=GS-QC&zk{Qdc&qMV_ZX-m8FWT_vAbV+0scTt{6xwbjKthT{g~w_rncwD#_vV}EfBH4aKQ_h3c6BGPc_`k*`H z$LrpmvUWex9X}l(#)N!$|3&d(dPS_}9dF@{<(Z9)z&tHxkGX?VWU_vxef@QH-r`K+ z()CP5izqpezHW+DsscYKQ{Tyhqj&{0xZeL^t&S>5I;=9#{lSY-1^^2LtG=Bg7W z9_!mz=CRZK1T3O&KS@5HT$AnJ;A|>VTW~Sj+>!a}+j|RWBoC2y-*V<0_5?>J zpYs;4@4yjr-tgo)%{yO*8_1xG(1x^AZSJ`P40-t?b~YCtliZzrfyrFiL1qIkucVG} zwD7E>wYTb%AC*w47pGc3(9Bx7F2he%SO{fowGfBkUaZ4Y5qTOzQf?)7HCOpHt1Pfm z>Bh}4y;l!?bSjXHOdJiI9g|N2k8J&H&g~6MPdccG^J>X$vTeG@zR6j~SRGC*@8+2j zX{K_^M?==(7hog2Nv`WC>0W(Gm?En+L#QohD4>mNjMq&p4nFu7oz2cLC)dklb(DKE z2Y^YTrHZ|2-4|)TfUuxujtt_2P5*!l%R)P`m@1wPQL}o%DYP!RF8D3jSak}PJXj~$ zdkPrzdWx~^jgS)n$m@YTf2;u6u09DEgPmCHEq*$La~RJ9PWhq66JRt`quCXy>{P^1 z8v(}yJ}BLIQb?w^*fpPswmgx0xmb}4Ke#*BnnFXKvC2CEZ3KuhbW(`}Le!t{1l0Oi zsUz@S0BN$LaMoDHt(GKJGKPqFz8+qRV#+_4<302r@N7UEA#MULCgVT*CDCc(; zG86~;!9jkohW@|MzeKPD)6)#rP){s^<`>jqBGQ<&n=xuN2U147gPLJqR!Fnt0 z3e2Uzz;x_{TyH#WFT?mk2Q%4VMw}4#hQ_J{wi8UF!8D!_c1E^WA+EF&Tokx5q){UI z*?-UxgVWParR{H8ocmc~s4~yI!E(c*!sMAd-#l|QYJE6n!p0kMYPp@63)ZZ5O&8#x zbYDrIaWp3vVw7vpU4Z7sK&cx8S?)7wJ|dSECPn)x!Om3qvYVyWE;6-t z`byBzY=YbdT&tu6rwvI*Zy1~|zgs^JPDi^*$B~YJ-L-hlsu;SI904tQNdzHd%+%j$ zj5D;~-{@T}f=v6dPF~Bo0!zkkGfGFd()t1F%+@UtH-TchqXk*$PK*R&A9a1}R3Q4+ zoB4gvDM&{OhbYG8OItrJZmu;c;UWvesVJ(u z-VvMjp!=z={6UUrby09GkVDe3-o*w94XX^312^KckS7?S- zn6~_mP}-FdhSyNiWjS=N=j?oZFdBSWO~{T0XxcT%Z?QKmXV@jB5J?B4bG*|p!A5|WL6ar zxPWoU!|sGPdJ~*rkd5I;0Scb$MV^o>B+%gqXD`@|6w)Qc zNQP3|{52VQ1LU*O`Ti{(Md(my45~1RO+m+s$45^`(N>UC69xS==iO99XJ|oUg;ZKf 
zn{piy%I8-znRQtEocGV$zofZMzR+hqfp0rMqD#lbalC45Uys@@^=^AK@+kE`fgWeQrpsOa0DuY5Y{bAU--$dBV5032^M+#VDOb_lh4Su0A)RxQA&{6KMKs&2I;vu+33p`o~9;wKJswWvQ zWS@wb2j81nMTIgd2$r9qI^*2K09vbb;&f`mbxCPzSLriC<*&MqG{YgAwnFjd2da?$ z*kA#x6MX5V-b{&)t0ksmZQH^oEKG=+srB{U(rNb}G5(aAd>qwm0x3?|(Jr)lC&f=PY@*CfsVU%^5*~Kr-bhBfjdqYZ42f+XS%3762n+LOWjy5<%Q^d3DMK{s(dNaOO1^u zlaD|`356G$Qp&qY&IFywry{NDx86-WeRH&0hfyjl@EreTUQpETNXN|i^>>3Qp1$aC zr;ep_12-RJPNPwV&r&>U_p2iF0d6Oyx~!*_MVa%}?40}MK>Bd!2g(y;M_|@vP;crWs;aJk~n%%etP9Ku)RUDf=B6?bO1gX9O9B9x`?kWGkY4Q&+ zpkyJ_zj*`s#m)v`(Ez79Ten46ky5qNIAa~x-*1zV$+%4SN^j-_$)FVU0@`T-4hnN9 zZtf%b^Vzis?ZFo(B1*dDJ7+zq5-r4oE%y~n`;{k|#9DnouVLOVQhH@G;C-}j@BrXB zd-|m8tDHSuG$4z>pwmF-d^8kat3k{OM`8}7z$~@A4$IGi0)z_7Y$%C zr{P7xDL~p9%OIs0x^glLS!$3hw6c(n5f0p18A3vKY4H5wtk#yZW7ZI4P_Be(xe{u< zoA0J_V{m8-Z%e+IGmxLO_8Z0me?mJ5(H_pB;5rSrPR>OBftkqRs^&)8?JP#KUvq|< zF(>S_Q^DuF3Yx9lxH2Mgb}5_-iJ;TEsXd`o~KKrvN#G*|<2{9sd^cY{S`ng6lZQeX4-L zMvHok&NI9{fYv%5x5M|W(-_i2a=r)lKQE>0wBmFNO12qTQS!IA`4n|160|lnNMD?o zGY3P)o5Vf1$T3FHLvWYD2&2dB-c{;28IyU=&jCJkL10w7(FNbLMzuq2Wo*1$3^<{O z_}DO$?+f|PQk3x`yecX6nP&m@2R$hyCqiz#(4&IRL%|;X5bXVj8Pq2gI{*r6kQ*GH zUyb!F*a9v>ZZbWDF$(p}3v2@o&6YXRrZ=ja!%=FEegctA5RM`~oHWm>a&+D9?+`?evAP22`5Yt9iRA0vV2EICb3>x7l(}x_HhC>2t7r(d;#Q9O*@>YyAC4TE`7-JMbR-VKfJBEoP zv|Y;OnNc9XVL59onbBejTa0ajU=Hq*LR(Dv1~kRD?`R(9<`mS^7PDv&imYy;UKL(w z+FmA7MQDggb=dsw45h-VWQBd5+ME@(06-goiirp-fcosr5LR$bu7c18+e5MaQ3sFX z4iPr$fK+1-HeGszP#c#(y+#!8F~umW&TE&Y;VdwnqmyA%Wfqv$Z?Y6;UBu5_>@UnI z*;NRojzWz%kKTpdB+e*T01)sfXfcR>6{>R1L>v zlIcq+JMRb{B>=6+?CE5GY0c7ke6d4n@g>U8M+k|Guj^u|fn!oPH8H4`(IYG>?^S;$ z%SpDuYK5_-B|j?|No5rm%(K;pe3z##xZI2}#yJCG0I=0w`)Wikmb!V4N~A_L;~P%> zOhYxEx@wZnQTC0ee%;1*EjKElm;p+LBMqdOhH6ww-D~^u;U>5wA7b7HG4b`(Ko4la zUP*t+46J5C0FHpG2~;_W^f%;Tp^D28-&n+3+#J13Uu)qi6Mw91ubiVlXg*N^-90rd z>V8B{kI5cw1v9HM(8lT#OVSV&SOauP)&HC2)^gF_GTghQD+Eocafn>W~BdaPJ`GJ+tsULLP|(Ytvx zJ#rY@5Sz2A-kbaQ@~y`n@(MiJE1qHqqw{`Csru?>Zd3X;z&~CMj{K|o zFYL>kCP87fY**^DO6gGYW~ndfPQ z+;SW_FTmRi(l^t!)$@wot9NSB=f4zCWxyznVu$-E)iPGcO9!q$Y1{6u`^!dO7MNH 
z+bZIbC(|>2dSv_j8JWrN|M(IZHO^nR2VPAQ=Ptr42!qCCbYUOk8`KrCbopUyU3=cF zvFz~s76V9F!3Mu(4Z(S`P1C&&B{3+M7-R~v=Ps?V`&=g$jkL4Zw6z)Oiy#0Cr;6MD z^0$l0U#lF$sndf^TptKSC)3_qFxnaC%MkNcs^sRJsZAVQ-OCHF?O>v+6REz@cxskB zPdd5@0eFMq9A?vD2NSzbw(^;y02>Jty%K60dmK9MSKI!sbBTmn6VjXlhVQVYxK zwORTug~G8_%%gSrl*!2rebxh!kd$}d*&JCC>Cy&*k_3}`jG*tkwtoXG-Abt5eWO!+ePGg=OO)ocz7CQRF=WXM^R!UE)X07Jb ztTziQeT56l?M9pI$z`h^1a=wr5a8xB7N~X%jiV7q^SEg{f{PbXU zSQE1`S%=HL){_VUR^M}Ubn0X(p57#BNmem&2^Fd3WiqM9<278x`_aI3uP?>YSF+(= zAdtl{-PU7m*U%q+$MqD+Oag`3E7VTn3(+u_0n0-isH`@DrD*Wqe}@5ekZtV zkH*q{PxDGr>Zfj*$0I^_%fI;KFnns(Dm}oZhacl%bUby}GW{?-p1S=(Jq+_xvo!Sr zar+`Z(fs$_%P0F_KYa#2h=)o>MWtbPPJ4lCjkJa+GiKm%mYL zS%*!%OnWvbDY^LpkWK@(?PWD39gnyrD0~NXO8J>M=}yXbP;%ck zG!wgZoON1yyEMsE%8$7K3=@i)&umPUav2#Gn#xeXbT5kYD><33ps{_HG4N`sWtuc+ z@YY%A$qR5fh(Obo1==#FD_{ML7ICUDGX^SMJfA=M3w8a)i_Fx?>iYId3cB+_6w^GV z{xOl_hNBAUH-@Uu>5LIv4yFO?M_xpzni#8)8;`74AD$kZxZVxVkZG%TDQc1V0%IK3 zW*KJuaV$I3ao#Z$-w#Y1NMi{6*LgwaTc=shk)|gJ-~GwivuEpm(4B8B-%_HuNp3^& zYo3nPtY}?Pk-3F%{jC38<6IXoAplhEkG6@AO4SZ1o%2^`>t((Qjhfd{cZNNdZvOH% zf9KnV{}~1i_@9Z_yb@23s>u8e;E=!4mVR7*{i)`=~4y-Ie$6H@TmN9Tn z5B~VHiT!jblv=-aI{ELCd%d<7=~_CV3RzD+w-bBre15o9-J&VHf=;;{z5Z3{^~^cE z&&tmiliXbi)~F|=y*%}k)BEpXT$1nxS@(%%lS$F;2z7IISb4!vt<`RPlxn;`6b0})8R zz^ytpFVyVjih<1a*3>r2YNlzZsDrz=Vtmn5a_(SFV>?z-Lw$tE(q*yZSz^WTT1@cT z0y4I`-qTv=YSub0VNP0is}snGgW}Wav)}XG_!=7N zKjS<6G$Z{roc~*AhjL%PV+HNg2}eJD?mjb8>yXWK9PO3u?tfawMNp#l5VQL!P*;%V8O7_?oo2L-!(}HK&(~_Ja_I(L= zR<%_6W#}CMN451PG^{swIXmRBXpiLM^~>HTcT@=jJGrv;o!#{< zWbA3%%itS8Wa->T;%#^fNAZ@WBQKsnal}&{bLep`&Z{i;sjXe=glM9yk3*pv)Z88K z#P{kHiJbR1XrC>I5fF2+7eU3DE&SgmBZ-=Id?R_QOPR#hi66G;7H zhiG9alPvQBAdJX!m^fq{pTV}z!LG7AV7Z?R}b52OG4xq0k;F3MQI<# z1u@q3X~9}T0Hp-b)sR1;ba#2JxW?Y$rm9$az_%&v2L5R56K^5v znMIfuU;RYdb)bb(&E3MTwNKl}jAu8cjB*p8;{kYN6v7SCwBdsy9lKUmG{~m#enI)4 z!};(7QwT-L4_jZ~t&=BQ-fK-ZPm++SMb2l$87?MtlptaSi?Ji(3HD9J;NWv-pjAUy z;psGl*^Qt2mc2QtYX1`h>p=H;N)~z=;hqXqhD~4;xA2! 
zD|%s<*p9W`Z8HUfE4G^kM?MUf_;y++Ntt-PEP{T1Bm>Xr1g*XMGL73J5i)|`uPl(0 zaQ}pj$9UakBRJp$jr!2=Vxu` zEMeM3-C1US3?5x)Q+*!A+4gvC6zPM4Zd+`;RA=P3&O7&HKk2+7$7P<>PRq0IxXg@q z??5zBFNq$0Dw0Uei$2ZwxVLz6Z#&khgEP(1$;-BI{)8JQW2yeAifWEqF+f^&aVBI8 z5A1vNqDMK!wQU)9mY1H&Ag*bPw(yfaZw)`^ho4%T>Gx&fr&fRZzB&Bd9)1=b#wthX+($9os*Cw`4!_E!TB#6>%Ib}?(U@NQo6ZAPjeb!l zd+jF!)`mA!83a%^$ww>+WiS`d2S;mrBp=}zR|^*?w;c~a%V!6(pC{cuv?}@3jJJ!b zu|M}l;$PXs6{uWBQq=NRh1a%$I{X=rj;$`8?6*Foan204lMl8Ty!LP6X8~cociYPP zHVAl;&J|aBnTJpP)R;P<$G46vTKWT??XG+JyZJku2uzhnga3rrUzF67!Ci)GSnylQ z>eOzq^n4CLuc8v0@3Ug0Cbg5l)~U#H8oH`U|XrdNAqX4{0{ zA~4IlygZhCJh=KlyKt!e6^+aXE~@U9Z+4rqFOsx`{pCNwl?gHLhjXJs#4$oAv?vxj z461v@y(4e?*u7e0_jUl|q=Ms#?WM3^#oMA1JwN%7Bv?PpiAn|m`Ddm3d`PhnAZwRj zkY--MZ zyfHm5N>3xAa~+Q5!Lb^@{o=>%WCJ;oc7smwH}#ewB$1wL!>bWw*1dT1+4I?xU}Pj3 zskI(#+OoovD4UiiT-LGZb+OqPyp4>Edm&41^?q8~*URZ&mAXx|rpkZqr%LZ6WAiU4 zdJ%;#XD_pT8?WN2?os8AzvY*7y?jXII;Gv+f1&m4EsEICvZdCU3gvhCsmfo+Q>DKg zsYt6aWbqdT49(KJ^g~tftkQX13Kv$k%~k99zv%q2_y@+|%f3$yz#GZ7U{CLsVRZS{ z3qpIgF?H&B-$a&20H2lRrdrAJ5yAY$66F>7^XFVDxZg#g66P1?Q&^K;@E#YJkd;9C zG_UR1JfOK3MJH*atNmcASHpjz2L*&$NAFyhdT?G7#k3bpQ0MeR;%zvMUPdTs#C1i@ z?)$flSz5=Y%I`?comZ_Ax~z;v(@6h3gmvV!a5D*s^@oW$J>!htdgZe8{$j>MHL8AO zZ;k0n;?O2}=^Hgqv2mky%dxi2K+4B*&J!FA%h|nR$fnCr+{*U37N7L$8cl8Z^a*rdR^`Y7Dn^We@-(K4)mGl-5?aeOVByW!5 z^q@BLIivbkv}G@t)(&{$(g){m=Ck#|2Q{`LoN0C&vvr@vymKBQQNhqlH~eBJqg6s9 zdAvMv>6*kHoS0-j{|S@V+i)l~ch0R;(1^QcSn9zx>E_XcQ*-7?VvJkC>es``b2S8n zy^>Mlajm}+mm@#1s2(~8J544E@8LXM6F*5EX*slsnlC?u{;3KcR3SntDAWR^)9c+P zXTGYtQM6w47EkGo773Mz_md9lM}2%s_3%|1mTHdW(n&RHGTW2Z-JC4fre0+*U>ZH9 z0d0MS)cqneSO!V9{|DGaky&rvtr(=jdeI$A-L^;^C1>QNZS&-GMK;s-^3%YO>jeID zH}}SL-yb(7x8QSf+Rb=n?Nt5IVA6C;#$vT+UNGKSI~f#JjBJe$=2crCX<}vm`ivo> zvD2nB`uI8ftnRa(**^VXR0UnDMqRdNU!NWR{bIh}+{&lj3TM~&2B-5njGQSAA@1eV za+Y-Hm*6OaU;yv4@4_Jg;)p&IP_T2n%Cmu1w1mmjYFNgnEw&W7Q4~D4KSTxP3n2ah zL?MauDONz@Tti?4oLznJI1x~tC*WFps~1bMZ#6tw4jeqLLw|x1K+xeN?0L}iyPw?7 zF4Tv?0DdRg(lmK-F0M#iIxpHn#feK@v`ymDMN!!cTwAmmd<_1{s+8GEUj@B@x4F5K 
z6$TrXGS8mo=RWI8+M*kPLlsn}+H7=iyO>OPT$y;NTT*R{gip0dKWOz><#6Q7E~nC# zuri62t|i+@|1W%$?b2BxhLAp2HgW0PsFuI^+IxmLsGWt8%BK5Lbgn9ArW`{#%YPD< zP!&&e5t?phn`f40BPj2&%C8z99aCsETs0|LHP~A?Z?0&Z;8QXa;X1~Ahgu^+RW1ul+Yng4|N9EKBJB45@A>!qT$CeiK3{^{axYDs zg}mAo*7%#6dutM}x3wYa|J~mEz{ho5_q~e+C{bK_MaPs)$FjYWMH`eTiliu-rsWs_ z@s|Wh@INF`S}gt%5bI;IrX)k_)2ZJ9)M*j8znMt0~#ePwJXVN)e$ zQ#EB%rDap~m1*3DiS=wI25{f^H-C2T?%oAJ(nwY4G_gM*)Dx z&NV)F$KP@MsdliO{bw$1{pY^!m{Lh;1Tt11-~(Iq1|vV2mnm=9YpvHc0;I{EUCwA7JO}TC53#Pjk3PE>y zS%om2Ro&%W{d0Fb?fMIgRr~1zx54=8H{fnZSg3Q*4QwZ8-{sSWyvySM-??q&al`+i z-gQwC|LwMwFPh%<$H(3*jAwqFB!757ZFQ6KL(E!4 z02K57szNOL?l36~?<-KKW8liO_gWKgqA%Rw7s+r?SPA|G;-e(^CkPKSF!K+xYlojs;Y;BZ7Zo`8 zxTKy0yhL{d2RDTHl6L=5$HLJZY=pYy;6MI|G!4dN4bJB@h1Frfrf1Hv$$|$bl)h`^ zPMKW#3bvxK*eVF6pU%garc`wn38hb##}~EgqlNhJ*qj$TAI-;`HWYp*mLyr;n~$}; zXsXl>`tf{JbbJ2MeqW@#$Z|qSh`&i}C6RQRINzcglWiB!7^LjWMA9p^L0>Tq8YK?s zqicUpJ^#1q`G2OKZ)cI}m>0lU==sZNd^?ieHmaem$l$U{eXSY9j{SpK(%$`HA=^y1#wiwy#>w6H1nUs zwoSK<{Q1D;7+vOEZ+L@6_B8l-A)-f7pC|Aq57zTU}p7~HTiy5~O zw%`(wNK6*EoPU>l{)LP_25Nvy7iOs>aV<;Aqx&xG3u&pdzqOZ~_whv&>W!_~9iO{IWqoh*DJo0N)*0A&f(%Z4U4%`=QlmGvex6Nmm*4Tn zn%S~>zR_NB2jcNB$Piy*mytb+PhV)=r#R{vQ5*ZtZSVU_-I4qs+?`4FpPQ-Q-TvH) zSC`$%-8#?RCx2J_%Kxr;@8xg(EC!=rpQ)aB|JwiL?Mt8gOBPxi`2a_->~-0oT0ioK zllqkE;t(AJ1rxpU`-aQcm@!DH_e#GdhsVc852Pk@nFIZ)`+g+VKQ%OQ=aMC{6u*I? 
z;r`LgNPlW#ES1S-`Qn5L9v_>W7#i(QDar(Nn7`Z1->v3vdt-g7mK&2FO07-Z2iE#j zE}I$8^{;fX4MnjHC9#`Ls?2b9FtaJO*1YuePw>1ZH8$=7I(Ig0ZrjziecPtguFh4d z#?IItQ-s>S{?T0jJ?8Jn%-@xp3j*@!kHyGdLdEeT|jM~<2SmkqD zLLKSna<#<^XX~=M$Ho+qVuMz>Qm1=!jTzp=Qmd)Z}Ppc<2CEPN(WurN$-( z`^OIt<@!@Tu$H*7abgFk7+=MPt`xr__|4Z2U%zv6>!#G`SZ2I;a8>GXZUiz%dsd|e zbB9-@4vr4%kHQX4txEL{>rZydWX>PuPa8Ei?Y20w1F724vC;mOt5Vs4+|Yrs(cvSG zqSRP+VrXnMmm0_n5BFqx4~pONi5x=V8_7(J4^5?pMzfO>soKrEHm61=rVN)_zA7qj zRDC(Riqb{Z%Yx}|XkT||C- zjgDmpVudQPSYpMWsn2u3DXUMUt4`m5*B= zIoh$Ooy*F#c7QGUyu` z@9&-1lsep>>dlM_eeZaG2K@EAI#P!R`$rwuriOB34^7Ff)1L>|u1g;r9vjPs zP+T*7mgH5)+Dawtsviof#j`97)H_^Mv>)ZQ_Q<%EJbS2+Q>Ke|Te> zfh!LxYNP7MNWjd&X29a$isG50d)^a{n;7 zZzT7RkozWb|0uaXMD8CW_a7nmkH?N>zt?u|dr!BRyOA`bz1bs0qmyGYIBH^SER}ucJzTyG|u z=^dIlQk;XG&mg`5&!8lrSuzbe%wsQkI*rK#n=jrH4c$P1{6L7KOO z{>jmUh_TU>bY~PFJAP3mlcNrbEDCj60!;ZM;mebeu7FQ@nW2+;0@8PwT*(E>Cy~i2 zT3!_@r4~Nw%cMs8`}@S^sUE7-=u_jVi9uE9c&h)&Nkl1eBgL@8eLV|s3`%{`=*tv!# z!c11g&~}tl06C3L^n-a|Y;v@WfBUC;sZHC+`fiY>a8iozV@iEsV$hCP)N!BKP7#s~ z%F&=MZ)n>)jB4wl(4l+BhqCHum0=^`7LZ#k=6W}X;iD0XnM*0(6d#>r%w?`NJ~ony zNK{}k2H-i$<7;wTs)rtaP_+QS$WZV2nAJ*(*(K)FcU)uot=E{oV`>`(Jun2Dj2A%X z={)cc!@T;Mm0CUHW0}6*OwMqf1Y@C(B1+6OK9wELOh_}K!>Nn$(NJ{kl0t)5Q+}uT zrgk)U)Kg1&nJNv-XhWk$Fy~d`axCme8tv;0yA>4N1&p zK{z&^VU%*sE+dTZoRwJr+VSj+6bS`9d3jandYU>FBv)j8T&H}BCI&u@kTa$7DH1jU zt{|TxA;^qhgy^{*`BX87@VKg3=){q%9YZVdcx3tveSjg(jbcV16(fNHL$8zEhld#8=!}^38VUtn#e8idHIW%V&_8j1 zzDkRwV;NfXk_GA)qs0ZI60DMz9UnV1q=A^i7sT-1ZQI&>(-dXeaGh{c7_1E6+fG)M zWOSLJtt9#oPhJJI?NvchPOX{=uG~8loWq^Kb>r6HVz>+U%mk-!SzH}%DeltBncxiW zD6SKi!dh|T#3Lqc)>ePffDHbR8^ z04Vp5mh|g@^AvhS<+S^lhCGd>FeTKN)_ZOc15k*3_YU<~k?%?6YyDc`JQ*H;8GSyZ zvIv`RKG-+A;hCsE1v`PakPqcD>E6Nq-h=6Zu`xt!6jUK>F8D#Tmr{72KC*j&_A@1p z@0udmQ~IE3)B5zfbpG9Z#ePZb1KIJ+fsqU77gD0?Z{b5T%K9xJ4VM)TA)r9WV$G3u-z>|`=9#qIB7y#ftX#daf&nT|`o(ydDj^&gOJu+pgU|P`hYD^V$u@97;ybs*LNkw}z){SW*YZMwl69 zGc3;Z-6unEj=5=n&Syn#pN6m(;G8)skG&9MpL~6ez4ON|$;tlb?TcYw z+xGO64N*=uMQ>wNxSr)}E1jI4_4Ci(7yeRbQ`)~Czt6+N`80YDJCDo9d${ns!ou(V 
zSw2zuIK7wH<&7zR|3U1GxG&*e!M%=~#@&H?2qzpKF09H($3#;!>^{k2TmQ%;lURmL z#Hkr*^{Rf(By#WOy}P$Hb?#bcnavKW;AK6r0Oz|2+L-}T21;+%eKa~RiS-_uC^FcK zw(xHb{wcS=r=d6u%ySi%(i-|xlZeJ3{d71-5RVoSdd z`}taI<_}^g{|L76k6{m&-B~=RzO3KK5zM0#Yd54>>(1$|XMmTHOzxnKFa2AIe!Jur z>H_J3TwhObZX*5Uz(}Tm<(lCOpysIx%ESM}Jz>W#5E`Y{f57yk=Ha2#ib?Zy@4bEi z)Oxp1IjT>cDt?p=AU5TwNtxRGXf0nty$2iby~IAIbgN1EDYxIm<#dhk0|tRB*8-?8 zyeN7%Ke}Sl(sA#{QuVw%mbxd^-~;s;SJoaFP&D;PZEI|5*wdm{GhT9G%eIcrC4P8h z07$X^O?xeIUlsQa!$dVbF~%Zi8YMEDOOX{-;#1}A?gja0*Z<7=UdC+?aAlL1Vp_l| zl+20nS!!72@I>T>M*70>WqI{yue!iLu_2+gV3{4EN zAo&TijIGt{3SF=QJhFp1GqVcD^E4*pvV46&1$gGTmErXfO7GSqkRHTsz-zAHFFiXX4#+3A@Ki_&u)=dWNr1 z{*076*2Jum1kIIx^TC(yIe0IK^ufdT<5IY{<9-OI8%dVoZpXb7_b#0HO68S&mYi0- zNeA#_s_Iy4k?QDLo75Vm)(y30c^gi=`EH!P<@_Gp9XR0&yYSkgo>$`T!QG4dDDE!Y z2XJ>gw*vozIGV)p`+V{$i+&wDzdkq%HZc`Gx@1Y-p01Mb5Cl3O+{=x!9tA@UZyJT7Z}2n z+?VP(Vit+E@3JeK<#<^AJ21`!t74>ts^PJ5by8-x;aWp?Z z!bWaRS%BXu!$9(uFNBgJPP$mutk>FOs zuv<+!MqVhsv4H`t;;6~<13O`CQe`>4WR)hogMh4tR^c4mESUaKe(>^rLs}NPKRcG2 zu#fZRd9C@`jcWI2wS6l?tB&+AImoiuUcNSFP|sg)nqS^vW7F@iDW3&2`t{;7ynA@4 z4~w~7(r`R9C%)lHR{^dyOBrK9{2Z#sYi*1f)oVjiAk_V@h3@O9{khALJB08omxlEl);D+kAmm{I z|AMM~-gYeDQ-!uSuHCtBmGK&_Spe~bHh)N4BqS{DXsl(}!){aWJbW@2ek*=imI?Qv zdHPQ=zFoU_-rwH1(-Bu;d)1u$AbfVg$Tgd*hJE)pa^7I~TKsvi&(9A<_^*TG=8{LE zKi>;<$>D4zv2`h4vj z!guYJvH*HD9;b&!M%He)F@u#WmuQ8v6RI$3vyz-`m{xIH=k^9T4`&#j#Eon;-dXzSAYNM~)=iAR-9oAwW zFsq^)*7poeeAKG63!I0P(0$EKJ6C=84v%35Ky&HOjNCwz&RhS*`CMBO=AvKYQmB5` zt=D$Z>MQE#2cmjLFct>b*BVKN#Uaptx7v5is z+Tq$*XI{Q6;hzog&vlQf3vCaEDCM(&7B=M1DTn-a!nF3*rggG6Z8qxOKryZ}ziV%p zx$29JS$&~0O^n@mvF0pS#rzh~O4n76Yp*WL3=x0l%D)nS2u;J-fk_Um*rUOPGeE}R z!WgFGf8r|bwQ=_mATEXqdPvADlQ#2%e&mVSWT4I8&RU=D;^bD_HfHeYj_rGQweLuo zO@nOq>tjoz`FZ*y@$GSkd_s|K*o+;^%m(MWXM?M_TAqh`7UsIHA+5)`F+Cf2I&~H; zL{6GjTC?=&CSVKTnBsEJgKt+`Lz&4KV~Tb+YZs8QYlp}wws9(_1=5|8-KzPob-*0=3IfCx?9_Lqi z3Vb{o`ZzX+<9v zGn>pjGjJG!dkD;4Lw6`6bkl$^&rRPAv-3RoNBY_Q&UqGF3wDQc${s0GV4lM{PWU4S z*O;@KQC&#~62r;K-NsbOf9LT91N#m4&hwoO0jPC0EfNI7WGY@)j2iRhqN0?=X9j& 
zW006XwE<4814sPsT9IRztX-jBj=NaQL$39 z1b_ov8P4Rdd<=jVFg-2To_`6F91G(G0wT(;pvvWU1+D-WSza|=h4hunIe$K7 zay~(`D8Jlnu;$6x;L7-H5aW3%?h?=E^d9OTQT^G^MIT@wwMs!QV~kQ87q0MoKD}1_ z3zhuWi{Gv0o88%AIe)EmwvE!i&=Bdbnbf*eZoHQlj$dp0%sm=iKVIg&8w+YHn|sUvW6!LG8?vpV!#1KhQ11_yS)JuZ`d^Xa2fBsy z12mlUx_n<&34XBsI12v{vL8p`zn*>^g{P<=ul<|t$Me#+=9=j<@+d5Q`q=UmzP};N zvt`0(f&F=2y62Za=1z{kF8LEhZ>5Rd_4cnHg#3x3^RH9>MCD&e{*>z3UWY*JY)VDg zKg`A$Klz5w7OEeGb;A6!S3)b}kzQ8Nm&%^(xq=89%Jj3+JR@Kf6q%p&K+jK=>WeyG zP<<^3kF$KtJp+I!eih15tOrZ?&Y@q1=L6yA`P-owUl=z0JU^b0o{cu2vlRKcf@@w? zccpTL;7!J5pbM#wRbmFt1*0}CxMr>qZ@5;%&-ogvfUi(K5ot)g!FtN$iLNitk5PJB zZjcMuxEo5Fbr&iRt_k0+);-OgyR50`@Ywi4uARY5#C&{i{iuvCgU4+qE5TEtp2g31 zub2!sP53$AOtA{^A@DL56#GWX3xD)`6B7$_nb-jqO_+<=Bd6S7;43dIWIT}R74zY> z_7UBm*f*9sz*osU-pZ|^ZIbpYh44q zLDp1bOVXV-Y-7A{Y~Vgy?L=}(j_jf8H@VF4ldG5X^!H}81Y&O?%BQfKD&5I4UDTn} z2D3YEwmP*7Bd=~c)0A@w2jq3LE5`R;hM@!64nJV*cZYS^7KG$6)8-b=(8MK-<`>0p zf3@Wq{99I@%=q83_W7GC&w})_3Ua}Y6Ii_6=&%60>D?=Qa@k2Q(7%tKnGMe1F5%Yw z^4xAI@}UzqU%GJoQJ4M<{$*S(@~IUoyACzU5N#~%n7cNtzpv*A!lKZ1S!W0doRC_d zA1_Mv0@sJrLmSrX&J`p54;aHY7HY~ncRl(6H(!mF(wp&h*d{kiwUy+F(Mz_OE7-IW zpml{%iA?baGpte|q_|74FJHtt=ycklwViuPE2P)>B0bIgmFPRchuBa43T=q1!=3&C zxsBkt@>PrPlJiS(Um*M(?hNjf^H1U*!%g7^aa}kcr?eiX)x{}|(hBD4SKudHif6^2 zU>JN93WmIu0rT58waq{IU z?ax}4yq3KK73$CEXY>TthrFa*(VJJ!sQC(TrvG@LoLiGuLsdxc^IOV#L zxm$NEYMD9ma|Kg#7Fwo4`l5V7#VX^43h=5=n_s?Q`FwqEG*_InCu?GS%v=R+${l42 z3(eR{*+)3o8)|T@PVJUj&HtWEpJ%)Z^xgFY7fq}dAV-Dx{yI8dM1t>Ab4|SXg3TKzo&f%_~oG><6;# zbWQn`g5~q^<9lR>(cy*#E^#dYPiTbJPGh&xCYLJS1mrqoh4z_S-b`p?MYz^0j$0Mv zjzKUNKl$;_Bt%mAks~Df7M%ZzA|%)roq=(|`G<3=P+w-eDwL}7`050tJ3_qR;Wbtv z-_m-bY`(PqF--lO?NorwqbQW9V*Zm}>{UpyV$ui#&$l7wjNEFxeRAAhSsY5}^h~BW zBOK^YZ3yelY!e#qP5X_sA6qe`@g_TXBsZiDTij*AJ@*UnhIOcJ`aILG^FU?@p6W?#efqZ{W*a#$Cjn$7vp`%g1#8s@pBh)(yRx z3!R51xb!~PF34Sd2}?(uPKU0i=V=(Z|y{zO&e zr?VzGwW&&^LhZcv>Tk^kC(h6BE|X5K7Ppb~a<#bSxW%{%^=9igq%v1Xr~Wl5HTlY` z%)5SPes_v=a#R0?c_F^sIot)@C7kESQh7WuIXrAOqH42rdTc<-47Vk5PyD8QMY$`e 
zwP#yN*^6hsN|E1NE5+%0@OOy2eSW5$OE`b+10V6cU!k5#^{D)Yw@T@)g245#j>Rjc zD-7_3#B=C5hZxf)xfaah&+9iR^x$85n<^;SynvvarIR!&z`9QtjA z{3D7X{lpZ)EP!|cQUTu2ug80)Y|8o4tzEakbPp^r-TDQld(fv#>nnokG}o3M8q+n1 zDD>&tyJ$|$!`Hax^X8#S_2$#By~gx>ZH{q^0ot=hh5GXNAGpT!>lc{b*%8Lqx@xIZ z&U`vo&GV*n%{Fg3#|HDI^K4Zq-KG?GypG&eTMx_1Z*cx*{~o&Wmy|1VcdeS|537`G zNQAf6dh4wBfc4f}??LNru-=EQx6yhZvEEvb#^G7(5Uq8X);d&cAG~X&y|ZwRet(VS zJg7X!GgJE8WB&GDPI> z9AWDmzIBeYbq?P;hi{$3x6a{P=kPt?@IA2Bw@oVgH($FH?$%x@PW@ZUeKr}r2HzpOMyB&RLZn+=g$H;QO#h^c62EFEXnqM6Ljq%)djl(L>TU_-6 z<|=!aYl;S`Q=_<_zZ=%SNBNcJr{OyzqTqbkLg3TfcJI@d`jz1I)hBl~Mkz?q`vTzS z%D&?K+#KomC*O;5EPkn!zh9xX#w~L#fC1!UNOQW|e1+1T@Giz!{8kCSq0d}7FgKBv z;EU-i++9$ddu97Y*nU|G6MnChzk!#%JDfP;c}{pu-dUuI0Luqef_M8kDz?Ah(U(V5 z;h_@#L;X|fe*N(a%-0Oxn=aYs8Hu5C{=NCTSy;i1gjL`|;Ng`Uf-m2XUqH8z{L9bt zEew7E>@{ckeba?`bSgX8-PPGzw{cBvYWdQ|u@mmM)cuaSpQo_f{p#G$kN-Y?ammle zd*9bj;r{ph{lnuJXgCsUHz#|1TrIEPXZ(9Py#0PN-#!oDKWuKI!L%B455ofvfrocF z*{^bkKFj0d3-p-}qp=@zgzx`Mj;A+HZA`D-z`aXpeXg2M1sH$AeRMrv|55N=hxvpx z-(?*~WMZtx4$DoYTuoxX*KP^)`RD2DWsAz3^!f60Wq#pnoH5mRIz6&>-A3-SUNOm% zGnam+M@DSExxU&DhHmWUOA6i;5r)e|!qw};3qHBYrwBinAg4iZ!&aFr`Rs^-EIJdk z?!gUdeQz_{r>ATmWY+eueShVR;57)mMa7hZ;+hlX%Dtq#d;-Ok&SzLCcT{!dO2Ivt zDZj1fR77WlA|8EQiHmCM)HT;(bwta7;`_+XN%uTZi=ee{(V%)%9wOzF$` z0PLmZBIw5XU~asX>np0fB^6dEcX4qnx2ChX+HfawpPO^vO>`&=#2;YFpl+ve$%#ZJy@HtiUUG7@*@w*4o z>6NJuelQfsbu@$-D9>RqfgSeL?)k4go8chzEeboHx$giu`YYylXMu|149!rR&(JhD z_m>z6J4ap>^O+-?R3Vp8nc$r3$!D(egfcSfN-iP~HBx~duj0eU1acf-f7VQ`SwsIQ zs$hYI_(njG~kByAiedHE{Hw3Vmf#Ue`D=*pbsit zy@zo8vumb>d_3qFei!~_{1f=g_e=-V_^ppm2gmua(89vM@FpKJy7+0z#m9`gK0`U={}}Y~VWacMrh{AgxKY;$@ZulCe-Xd+ z3($>!@t;fwuj5a>1RnhJ_>1`v(nb6`@E8BWbg%}0Iew$UzcL+kSOyG9OL4`iIlOQa+k=`oB&GZ^-{2)4|>E2k&1JkAL<1a@x!65$0TlwxVA5ofG%J&NSkkaz^%mgofkbK_9cMLy7yYLO-jVpm$ zI}^N$e-Zyp{PQ0H?jG=bly48;Oa2XfyBL4@KH%@CTtnbrM>(FF3D$2Q{Kxoa@rR-7 zdA@7Bk@%nByT|w!@w?^!Dg4wX%6p1$FXErXzbOCbW`agO7B%?KW`fHPLGQ2fEygVa6TFOna)$a){OnBdZT#g`v%#`9!sDFn zz`s~M%lDkAznazLctieeoPF7<_M8p6 z<-eUXQ23|uU%(%{oxMl+=ifCO>~1H1Is23FPu~F^{Mz?%?@n 
zCwLCCPiq(c6nm!dPk(AQxOF#l;-~OW|0wj}4?Y84y&r>)J<#zv@Z&H3{A}^bWu|Nl)o z{EOcqyod6=Nj>AApJ5+ZFaAwIu(1z3Z{r(&7<pKOyMo}3LE7W} zeCrQ?`JF*0L`RlUjX##u?^S!}I=o=&a z5arGCy}`q@_me@;g|FW->&2=*u`^aRwY2Izx8A(?1PcQSM`Xn=Fvi@kG>tbem52C= zt;e;#4Oy%OgyhoZ8vFPUfApRQ?nYuk6OO8nW#2g+Tu{2kmR1Z%cN^UtN-1PDp)naYLf|U&Qlojfwbp=p~nb-kONTgtH5r zZ!jKzA#TfeJl>I9{wz3CPuFZq*2T{(Nv6mJ+=k$IYxo`PPlexG6S3W(c?q0rQjB@U zIJbD5FNb036J6m~Q0@-DgVOw#g0lvEC-0gLer1uRHQW|0rESq2|7Xd;Ey?Vj< zKhCp$wbcDl!oEb<_t1}QKY)$H{eTL3_NGSL4=&tZ{k5t{vdQUK^5X5=lb3Gqz;8&B zw6}_<-GuCLA%plGyvMgDFW=soymEVnWbr;iTL|5a->85uyJQcOs)JPfc;8EcPW*Zy zZV}>rgycx^Fex4v*kJ-Y2|PmJE{FV+gegalN(q|?+hWpLbdT`9gZD{%gZ@v2eitFi zSS3+iss3K7L7r?vhoau}lZ-k?*zu2|w<)ap)~X~46@HQMmmew)&k}xx@Yg?vp6U9~ zPsH0)-%XM`P2%-JeK#ggRmGo8o~~+3o~i1>uUEji$og+ejFO<4gnL{<^3v~aTp#`F z>c^*pdlX>gOQYlqT(Tp%yv0bD*hZp`tN+za2V0a5QTDuK;&YqEkhq=G#U?6T{Cl`OA8?$rXzkZ$sQPWc|MZcL`0RlZLzYDumcPS!#%63dF%7{grb zA>cZJyHi5FAj3^v8W~=w%a&wk^(U*6U3-(=J;}kn$!tfmb1JeXweX z2X`g2oypGQpg&Ds%UY*{)wh)ARnl(C>kYbza7-8+9f|7o)#OGwN`{K|MEnm7RcRwi z$P7Uvz8D25+LF9bRsD|WF{U+9eSbW85pj44rnn5( zUcp_hYD5$30B#F$2==ND;_8WOB(8zDZNxPa#~8xxlW-wyq=j3 zzF#b7jGk=dUV&FzlHJuKsHhv;%Ea83MD@9NaZjByOsM`Wxfcst5_thkiy^Csj*JJqggv6U7OT#?jw@J}X%KiBC-*=_)xANZQ zAj#e8V-NA6@N=JJOjZE%JI(nz345Kef2n#PY|LoLc0M>v+|f^A3w)Q2tG=|D*v5op z*D>PWd;vR{^fFY?rB5UmOMaaq{Heb`9W?Q*pTb@y>?~nYm-JKEdBR>M%*<6?*z1H{ zAPhD%KhbxEuvZEDigXrcPVt8!p(9~*s9SGCzdSV^{70px4WDPCS+0LQxujSDYe_ss za@jSP{mgXmcU10n&1VbkUoxLNv7-$4ueT_%p-(&U5eRDb8m|L&ii4j$k+wl8_2=F#NhS{^SE z-cI=2MVHY%&D^#nd8#H}pP#ihCfbR_^-yv1lC%p~i+%cKVEx~1LcXT>d`7BiS(pTMm4$N-DkkP_{?;$QgrcCTbv~9Zo<~8?zV`Z zS}NyP7k@JfIdv<3ik*h$%ivq~OVhz4)Iq6yD(I^%cAj~rs(Mcq8rF8Ss7LVY5rA#@ zgS>AwninZ{MQaHNkS~%%T4Gc3JfIhF7eRB$$k9RK+KFp+am-Co4lW^t&rm}?i}Pf6 z0hlvYSR2kVdY(gypGO0*$kG7pA~HkV7UH%L*F;i6=G2K+hqn=97UPz|%f!Q$W*@$4d^i51Alrp60uP&-SeV@(Oo?O#BkooCJEN0_TyUTrPb&kOo0 zI#2wchfGR`jpe8zIklTk*hC-fpbzd!c0Vk&he;>H?NR7C#_+0vzx(mXL2z?IvN6K4 z*pwI{yxgwUkwDzbW<}2X$f21WwvYpgdYe)9n+e=Wpk{8p$!;bH*tE>(f0SxFMsuB@ 
zTXe@YqbRSIwnU@R!vN4MqAl4SyFk^vf@S2=uTKZRn4iZOnN-jLOg-gwj{lXMdK~Bx zveSf96GZ7uno>2DeMU~CusJQRG!o-O;~oCtd9{ZT2I2@nlxtHta7Nz z_Ay;H2AJ)@V0O*NI2c$rqL>dMr%tfW_6qB4S2eg7^r$GC0(wnN_01T1Yfuz18lS|S zs%c4{#y?ZT^r~hzah=3%3B|R#IG6xK?1`EV7uNPX_Sm^_-iTKx#*@4{zNS}n`g)oPS#D#(V!(>(Z=rB~Eo zC9R$$Ogy{SBnIp?fqlw>F*8&*p1{5EZpQ0Bm=2a}>^FU&ApZ)sE;C=3ed4pp!Nz2^ z-paOpbT-yKs`@M0<{J|oQFXqNtZw~b`rm(=4y1o5UuS=8olifK{@Dubi@?^2=IZ6f z4%^9dH{q|oJ{|lkB-w>M zsDs&o7Dmb^@kZCzn%c|np}sFo2mhAmQvN9Da*jVbpGbCXHFQ$gMolr%PV1srpefl2 z|Iud4Sok{+{+)SSYhiBLnzxa=wKeW z_2Khzr#^H#^+Axbl}Lq&pKxf~9a@1l5vloj^+%Oy8;HZ8QfR!1Nhn%Lqs&`Egei?t zY@Q*qgGebhp6Z8*6xAKXN$K$=Y9?!Ir;#cZDHyO)dL2t7d5nTmb~_y@;W(B?OdQR4 z5qJE(vDlf*>?z5Q=Y_sfupw-=I{K-q>bfe1?ycZ%2e((Wly3*)FJl%XudRO{CuLl= zlz1gWV|>72+y+Kfu5UGz->UfMLy=*W?G>`~q`U!74ZaV){S$1i#dfU1xNhtfPFFcm z75|qg7%GOe6RG;LM#ujz0%b_{=4i$o%2+vU8^F?>rOU|hE?95oR#+=xvJfc~t=;%A zrE3nsdRuxJUvDH6FgjcYsqE<;4uRD6eN?3^NF&6_RLuZs2x}p32XQKh3|z_>1qLnN znV~A?@&c))Ock^@>PO~t>Jg8nw<8NqaV!WJIN8bD zLvSs`$-eG!?sRc&z$s34c^?&OI=EMcDWgvn z=1B#)>c+F~>R*Yrp|kvPI&RG4KL{K8)CXd*XKtRE*M@$Ibgz@{Px5?Mpxy9Yi49%r zjMryF&jG3V;NP4L{Wy!|my$DoEjD!NQPaxrwwd5yzHSS8wyE0^8+xy7=o47NyPYNc zNM7DD=g!O9tzt7N^ZQrezjG!Sm4a;e&e-tG{Jf0s(8_)d-JPitozspWUbf1XftGKl z?TS=0DN@sR&8}tS*n_WigZL>P!VAP2$YcT1dPDq2UFwJNr3o^dMI$&-rFWS|dfhv= ze)Fz)VGqA+S%#T)9zOhDy@L5TQt1S=X}!h{XBt^#3BbNkD4`0WVMnwvi8OWzW%W2| zL<&nVwjf%c(SWLT93_?s#8ijsO4%qf3KL5m7E?bDT6C8^Ly3#3#h495a5Q%`|#zJXCF0FI8$O6_4Nx{-8^(BB^o;N(M#xJdvW( zH@`0uvul&3kSI%)Mg|-#38XI+y)@mYNGBtd$5-GOnUE}DDuxL2u>L*%D=tB}eyu8H ziSZ1ym2@In+Ygwjli6w98ETCZX&z(sk#PTLN?;Q(T7;*}1F6zv4U>iu&RDIk1zi_=Jm;rYO!)^I{P(&8zxV-|>iJ=dfRO zV#FW*&`i+5{`JzarbMP_?#^C1=7mjmT88<^P{MYTNi#LXfTjkoCtMWoVS#s5m9cg* zkyDK9jH0~@RDcvvsis8RBgt+y&e?DQC|nG^8-PFZ7RxZE?d0(YdF;@B18vfak0!M| zY_{5Uvk8J{te09tHah^b+X^&_5110m6pCGNw@j8=K<1eVD5jpl?bs0$aipfuZ%eS} zhP{B!aG^qK*>)cY*!hFm)FNP#Oycc^7Ip+;iZoj=x>*=@23KE2S{)`wOu*ZWAy(ny zG*u8}u}3q48Cw&3AqNt-CD5J?dKp{E#%3A+hoSrlDn!_MJ_cW3ugZt&i=!Tpib`UC 
zWc*h_3OQRHQZ_RZiaM^kK>wz4DIKBI^cK_R&32cECwzZWm7d3UsI<_Qb4_i~%1|Q~ zFW6$cnd+;Ww%Q1Fb5)aCIaL*9_&U$HwQLpp)i%roNAfnbg3eK9e>!95>5nim&=QgQ zzGmH8xX`=0Zt43D!n=rPpQlYweN2=29)h+qbN0HfU!9205P85v+T@+QYbmPPfu|!| zb_-Q*(`gbfq%tn#kt%N?(Ylg$lW1|mM{4opAaMfp3m1wrRtZh=W$e(RlV-S5+3QW@ z*{VZ?Ycj4U|btT-*sR7LTbYB?>w z*X627S~%Ah5-L}jAe3e|iBeZ32D%FNcEHGcOl>x4&@r`1J~_87kGFWzwDE7_-2p0cPY&-mwf zXe2;(BAGa3C(;Vlennm&(id8()hx35XkAlt{v5v6zP3B61T2d85+|w@Cn{Q9t!S3j zCB!d9a$UWtN^z?e+B}e~Rx&HazXnE?%h!SGxtT~`t_hQyt%LZ_khqTk&*w7A=v@t> zO49pd?p;OO@7|SO?%sv&}QXy|TOX@LM?L~K5hIMYZZB~FG98AuTN z)t{|OPBEnfvlpq^_>tt)V+64TWUR1eeTf&gvG7-mP4;DHateESRs4Aq6@SXcwHPSI zk2G;pCTd?~oib*y@>QBQ5PblO92vaZ-0+Pke>gccK#W$g6jNV_Vc(M}pMWF>NgUr? z5V$Ug9UdDy*>F)GvEhOZqccki5E8#l33l!bk)gzHSb;;bBjAe&Cx~|7CmZ`yYuVed zeJ1z=%|A-^j1=Yv#-_mHbWQcIYhqfiPt|L_fyp-ZD)=t!oe73`j>SR9kps30Z_}X* z#ebXl6lcGp;j_)jJrleGeMftI*c;!= zE}Aycp>>4aitm{T{*>p~&g9bP;_P!i&H{osN5@zq)8aCM6+B-AnUhehe z-$lA*q}%0aS;(Gh)!hl;9s;hz!G+7ue%WUiG2egsrv9=$DpRC?71&N-FQ{D#b`I0m z%gWywe=ga@3ix1hkd|#Z|*Xc#98Hnyy3jJdL zwbMNa>@&cg1@?QD=*D!J-zV4^|B}hyZpdcYB>wY}y<$rue#n$)$`DG4VyEEYE9BKa zNFC+p{9B6m7KG>g-Hg#r6Gm%8Eu{NetvLLQK5Bcql(=YJ1PS)5M%hKvFHuWb<6>N(IN+DO{|`It7NBJrA3ZC$?TH_LCw)IWb;YJOB1u#wu2@eDw+%f+5*}Iy67z- zwBYxXQ?gU|d^F=_8y(6;Hoo0Td7;w)AQF0s;+#YEml-y-)!T#lIWMK(XZL+-Ux>yj zAH!a8<)a4FG(Qw9%ZDs7;na+<(ZOO6#rTXpy^MLpXfub85xJV&62lG|O~N%dRzV)6 zY}2-En&_*lM2;3_C^qA`2IX*!t6s)tZB^HR=wVd71IcW+h0)mTV=}~08ye$FfrO4T zAmimg+G0XX*2aW0>T&2siknMxYr%w^Py$L@#Gwl@jHm8lU}#6m8g6`Hdz=`mEcy(h z+d@>nvq095DK?8x{6ZDD+Gm!g77Lf=5i((_I8Y9w!Vn|bi#z`j#+y&f1pk|7^_k-w zmgKzQW}OX;_pz~(Gi*igXh7>PVO|2wf?>eF)Ei?XeJgDAv>?BKoWy}-k z=qG7|!r2x9mLl};f`6EJv(EXmJSc~_4ymZUWaLo_VGwg^_0YeIWKUu_hOd)SOLrav zI?YMY*itHwEbl6X%yN34#1~tkVA_CB;cKZ>36!UH6`BBF530v-0=?VSLJOe!EseKp zuQr2BNxXme_y#2%r~ za+g4UZ<(?87v@4{?00<4bVb(c17x1X*H9}fu#9Rl3hG@Fx`TIPj8?!?1W0U-;A@y? 
zIo+CI*{o#)WNeo7mf=ni?CP~rARo}^0)$-BRaRfwLnUPe;VqI@>gte&C2Dt6ONa>e z;86Cyz;j-w6d`!p~10y`(t||_6B*h{|NJk<1@jU zTX`}1XGtDru4&fUumjv4|6U~bu+G*JPo6htL75(N0`$`D?A5(J{$m7U#^EsOc>)RO z;(={(w_``LAw&3_oZ%>`z|Y-|h2?gp;J5;s>Z&S2v2onq?NY`6GY=fpxjo+NLiXXa z0hvRn1vrYNpLF~6AJ_SVnP97{%i|p9K-O|z@@aOgovNv>VxpGHpYAs6L7MLk19#!4 znD3$EXx{X^Wb|`_JI)*acm;gj+^Hc|*8enTdyVie z!vCEMti``f*xmndCSZ!ByqClDOeavRo01zFl6AW@N7PSrE~`VQCv8Gu3z(Pw-W>DN zM)gHGT)ZIwf>qj&`%?aT+N4n`>nF*#merAI@ z6{EPtZn~IdgH5X zr}-W9rr%92-h#nru{m4OrOEiSi;mYkSN(k4QDGilLn(ob-fz2Be}HG zod@ay{YCOx_OE7wHnmN0USH4W)$H=x;qlL#mznFYZ^XXy^_k!}s4Z_HtK2#Df*k8b zXtgC%EFHjpM%|R{C=2XC7a!-4 zg7`&0$<?$-c6dTq`9kx7gNTP^U}sXFDGlajC-R}}jJVMiS(#o`y+58jwut@liaieA;Ac3W)HoFmO~(%fB?1}hPKR<$0H72Svw?@`)a{_A{mNJLI8@;%T!|D|^7~9X^ zGMm>8&E`=NjBePHtZPZOvNfE_RJqrX_I1)e$TMxS+(==i<>(-3o6Mef(w-uEKWVR! z_8|!yQ%=J-CH80|H`+Pf68{$*Z{a{c$4M@u4`1PY#f6&c#9|lvFg~k$gxXXrmvXq| z3g<*F)WG)do2(^)}cLD)G@}4DxqWlqZjBaK7H**A1ou;duS=7l8e3b(-Hidezx-OG$AZ$n6 z0iNpLpp#$KnHhizh?+^Zi}M^7L21kmmxTx!4?(lA2bPoE>LQVHrG`bf0_74)oe6CJ zS8JGkLW~tHB1+O>cLJGKNG*EV8e^jNnlY@C3B>WcW`mcVO#X>@#XWVbq}Ig$B(gqR z5)-s;JH)Xe)zm8Mxi#@0T~ZR-mUxC{)17!1@170*f@ihib2@wN)|-&}PwU>CL*&Ce zryXM?x%tREsS z_9|hg2>Z_w`5Rrt$mwRCA27W9vyu6?nz3~`hosl9o(+CkeS|u7V=FSzot19Z*vb-r zP5cka2UiXOGYKmla-~X%&Ixdzyq~-J=BJapMrz`}7)fViq7Gc=z;*nA+2CGB=W)$* zp4D38)9}xkns|G7D$bxk8h)=gIBK@D-mw1qah!yXR&ew^I2(K-&-cFVU43mRxWw9}8v0pjDwec9<2pbmKKnmvy70FFwZ}nmFN?vl%f-hoLE9V9 z*81VuU|(KGEy&>#zF>cN%~P=2h?6wTlhISIR1mO`{RI-U*Kg52$Eid3Sn(Hi<4ZcT zOYABQag;u2C5aJ>oUg9ooU9RMk|BFZ$)wSO?$-)zG0;vMvJK2KS336js=`fViMRANd_(bWfDuz^ND+f0po(ffZ>)1rxCs` zwT}IPj2JfbVbxYa543U{di!u!pn1B9dr4h?acks-F}X0uF!K4AV5F_I&OUrL{nW5Y zE@J#^QI0kt8Z=E2>)MIl*uuA-c~*P4ewnwCVL49aEo7Js>ca1h39ecxL^GW?cBz-Z zc@CWZT=;VuTf%Z4t746gP)pIz3v%?%2N*qz!dlFI7t`Q7vvoFTsU(NO_TUSt>gs5o zX@R`hFs_}&{xjQ=|BfHRGP5u*Hbj22ymvPwv`$mn=jmcUUmjX-#=Td_@5)Z@MqO;{ z*xhAk_^)K{$v&%^>c6N~BXKuh&HjY6pzZ3PV?0Qb#2M})S{8}Pr1CQDf2C%xff9Ut 
z*g=_jbt49qv)mcW5|p{C!frdkM#AeexIW@9694Xb>xBWfv;tZaljVWvTcfmTXs+@mXTzPjg`N1f#5XS?6y5F?R7?{}Wj#r)6q2$({g$l>mbz|XiR2az z)7`>B0+OD+g6QfmrCM!6*>t0KW`;yrR1G*M*fnjHWuB8aW=%l0!@(1m@{2^nx zec&h$akm-6Uw)YTkUl#b{Aph9 zoBi2QV>stE7gc|+CbC>jiAv^32He-+lRF+^zW@Aeuv&(olD<;LCnu_^Hz1G9da*8W z(*{*-mDY*TN2xRwO9#_*{?^CQZCHCQ#OZ{O4$lbA$7#`8a60p=^G-fa7r5B9czcYd zG|pa2c?x`|!Bq_BC{CwF z6sOa*iqnY_tx)@;z*prT&f49S3T?}&aaTG9#o#k2vhm70T>)UZ(nbZ5~nWb z&(JB`@6Hj`xQXDx;=&nZ8uk;OFzw>!*v=Pq&)DLbY?hc@WmnaG?5n=Oz3O@1_4A~td%vryFGuTK9G&=edZT2EDn^?Ww4+I9 zd9*^Wicv+XmsLMrMW_x!i3L?9x}oO~VPZ=R8zxLC8zZ|3df5&VOIUd^rhUf;3DZr1 z9_fC<;%5l-`=*tXc0K0^Q|0@F+7@m5@l)JIf90oUgYW0-qR@{^`j_d))r%qtWPm-z zND(23ww=t51_SwrQ^lo~zx+{_HyVdh%W zGR)lAW~{W9=DKeB^FRIn56u^-+Bc!O>!n$HSGyV43U}+7H7HZhPXDGVAE(6Knj&#x zALE9tXilb^idJ=Sg1&}Z?=E8TxWFv=VokpSxRIo951z3WN=(mlg1x%JQ=v2zm zH>1jQKEu4?7iWXt%lGkZ#cMRCk2^ay>s1q4_hEbKA~rQ)O2D@85)Yh%#lCP18^_T_ z6Zq_!H49C8=Niy_ya1+n7Xk*;MVqljuyb({TPJKFg4ke{FMA7Tc@5Hvmqc^rmcg!N zE#;PEJ}3oR-n{FM=3qNoc`!;`>pv&7B*h?J-ZK+G$d8 zToGlHOq6P`&_(bX<bD+;`BX~*GMbX zP(OvWJNWOL5R0>$uvfk|YwuKL&HU*&JLK8qz@&aLdt{N|Y^NZ73U{2e*1!4q-p=P+2BKtC&K3=3U}n|ik@0*P4sK7BZg}91*Y`1;mvGK3F$(ye%h7q z#f;~9tS|-_Z1$$#u)L-9oQOJXH2dPp7dDv)jYI;p8T->yAFa_ zF`qAM+s{^0IZtCyF0^wke4ZM>wnf@tiKQ`orlv$k(U8r7=40&tm9`_5$sShsC*bw2 z?TymV_L{M|@UH6%#JSpeu6Rijg8;E(0uT%)L9cuFNAiVGy0gE*K=1Y(v4z)ud9z9L z86K3wVSMGF96d#%Pze2dm_i{i6n&NlA*i_F`!ggM1yi`}g47mG0s68_OAWvKqUFo} zQzQ^6zU&zX;LEP?FsVWiepd#1&yy~M$`P4uDnpw}J%B35Q&37_o}w>01Z+Rp2%ndj zcwi0P+u1(Iewg1xCM=qZS6a3HBqcfiEJ~F&;Q8+YVCqJ9l5S|XANwhzHv3AC(|&Tr zYrnWZ8KHM=4)*VMi@#20*rN)H*Ud^&!J^ORV0=DFf&mgLLuG`$z-}P-lyQpM?tB(0 zLZyAlGFxazw1GHDKSymqeF)pWwj?yA)9hrwOSp=9nixVh{`PEe`%QDz(-zf}BsVAJ zjxpV^e5F<#;@$I=Z$-VA@|AxtYDEEvf7I+f_Vr0Wl!|Hj&a4`vFtBs)A4m(eXLuE z`J~NRn9kxt%Vausk@rh9k-cPDhlq6#olbh2OP0Uv3=O)mSp?}=GU_Ki#~y;#k!63$ zvf}vo4+zNVsG~YeMvnvhl%qoT>oNO)f{XZ8!dASho?U(&-xty5CB8S#bp4G4P$hodo|W=BN-n z1+Z>h)7fd6Omsi8?jq3Hc$roFC1?I!7XenPO5J5GbBiy$4#w-+WL<5jyP5r+2+9W@ 
zJPVK6YE+vj5yuP9A_0pVN{7$YP`U=%V^^C9yINScAF{!rG9WflTY3Ke?Qn}#Pl@?- zs9mxCsK4;4Jw=a7s}ddH_iD=fKtYc(x_aTx z<{efwMN8&8J!rnzc_l6UMadi!{B*C4^3#2U9HF!-wwrS( zTs9@>)0$0rU2S1oi_VIol574?y4WjJ*E3^G`~TZ)Q1#BDx-;u|g}QS(=!NR)FV-Y4 zRx{_U9>#~4s`a(FNVSwZlrFO8`x4jGUan>%QT1M>9xz}wP}k1xQy|s^9xj3Ray7lv zq-2&_&4w(VT|RH{FhRg$`0ymMPx}KKv+NJO$~P7z4~_gQ?D;9spP#4wK<=w;B&%1* z_tam^2B@><8*OE4*{vEEjJ|>HJ{na;X5LcsB=cwRNNsuR_f`$ORReF;z*{x&Rt>yW z18>#9TQ%@j4ZKwYZ`HtCHSksq%vA%aWpyU{$~&yDsjz;fey0u2hqD#O{Cql=TZ-Pb z4yvDz|CoEY)dt7TEvYk4Z*%V(2RECicf0q~PqARek57h}pVsg6{<)^}7fFOJ`x?`7wto6#q<%*i*BRf3U%A=7d%91U*jP-*0QB?ud*6n}V&4yJ`Fy>y z!ez>P!r=?tyQlMtd+&7d9?xHuf7Z22DgN)e_P!}mXQJNe{0};RgY%o6zt{N#&OhY* zXPy6(&i}IWzv}$obpG!<|Nn9Rcb$LJtuBA(f6)0GoZsyHz0MzSerb8;dvDpic~h#k zW!s*Wsg0{2T)if>Zq2&2YaU#?K2^K3zc1CAnXuvatzENHxWorb^+3b6)Zbn8KxS>M z)Ah;EI6o?lkMsWZrt$sp_gy{i*|lrdvElyQ0o&gn zbhzHHvg>Edzi51NbaZI+KED!c zfXdeQdzruSgFcQ}(9jK$tzV_e)ly#wj& z_}D~$@5Ip9s2bVJJBP1%`_A2M%@4{G~8aJnQj!|9aE-zVn}!)KT*2 zKJEKXldGfH>fB&%VtgXg6I*TCB^z5k3d5|fZ)m%3B6GmK9T=Tl-7`5f+;`tlUu?DB z2E}fx`;Ls}j*M7;V%)|Y>L1UE+e|X|k|u7ve>fum_mmx;P?kgdo#>z9?*K2vjrCKAt&Z;ruh`Gb2O2pEv#re5PmjZ$6xuzGv|Elll`l-;h*}e#$=Z~c069k8+&?OK+3!o z{%Dyf{JWEvF#JpLIurepgTLFn75>%v&LU z5AVybbs)hvxO84mJL(eFImDu0@}?C2(}bx`{5WxLNgZ!7^jqWQ^;i35^YYK~kKn@a zmmK^h2XEHuLjX(Nv-h7VfzLX9CF>OJ-*{BDR`_{1AM)cog!A`f{Ki2GmmjYjKq>#K zeg*IAzuU>RZU_I5pLc-HOBKJyPVydxU;MqzCdK0K+5C6=igj?M`G2_tzI(>Pch6Y( z-|U=E-(|F06B6ugTsg)hy|zkC1dz)7F*@P1x% z#^vwRIrMq{^Wncsgzpe4&L*xuSO5&`QSeQqx$#x`+3dPPur5}>_hnH;l2Ml zk$U#!9`B4X7Qh#+eHvjj(@}w-*_(||4F>->L%slAAQmIYaIO5 fJYg1*rOH0*uke&4F_E>;xRgF46Kd1DN8|l}gl=(D literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/documentation/libxsmm-dev.pptm b/third_party/libxsmm/documentation/libxsmm-dev.pptm new file mode 100644 index 0000000000000000000000000000000000000000..2251456058af800dd2a3a9a953f3d57ac0ed6fc5 GIT binary patch literal 402432 zcmeF(byybN-Y9;$qy&_1q`Nz$LFtfgq(K^_1O!1+r9n`-8ziM0K|uuR6eLtilsaFY 
zy`SHC&%5_|&))C(<9Gge)^&ZZnR{l|teNjx^If-dtK(zWmtIwqEhGG=B8h;EaB+!= zfc77@W5YV)pP%s&5Rm@<`Qq~O^3Toauz~?F0T#FhuIk|cT!06lD#CRD)&45~KS$sk z+DC!Dxaz~JzDE7~Ivz3t!qvWi6(Q6k0yn@-KnzF#DIf#nfC5kgDnJcr04<;c^nd}} z0*rtOFas9A3fKTU-~gO}3vdG-zzg^QKM(+dKnMr}5g-c0fH;rOcc%0xh5obbv0<1Nz_|FaUISXbKnye> zCRT9277&B0X#CfO4zkdn?O!eb>!bg+mjD*bzfk}9fCjy80WDiV54!!)vdtf{{8!F% zbqon)#6R0d`QKC;PoDVWzNL#M|5*OV!}Q=D-JwN$=!L6({dWsK0uth%?fXWPGpLyV_d#>{M)#s~z{jc))RXqQl=l|-Qf0h3?B5;)x@51Q2dInu# z9J)gG@q`R3`^R{^dM@l?TwaY)&VTx(T#cVU!Ro(VuEy9^{i|GewY(}c;Qy2tc;OiB zKl0Vpb94Vc9sjqd=^vkLsPv!8pY8uE=hJ~GSU_tY5F=Muzski|IsWRpub!ueFmout zj3WcHkpj$4d@x%Hz-%NBpEzK45`@|6YTIpyq#W$I+A9Owc_7+6u!S3DE#5zl3;bh_ z`B(epe~s1O?){%#>(zDs30D6o{}`jYaF16r%fD90ApczDj;m~XHKPBOT1arvpDp}j z`^pgEtN8wFg#rQbkNkI)udcqo{Il|X?Qi)P0s9~G@Bi%>;{R{t->Z2-`tKL|kNxy- z-~azN|6Gmbe_H;2-~Vj>`9IzNKVIMe+5-RI!||EoA*{_X3>->>5TZgJJ_pXI-E zrvKgdmVf13CLf4a|?3)&#H8Pt1IgEZ`5|Q@ss~s+pg|hZWbOMKCbT8 zf1b$dzk4Dp8+T88xI_z28#+mGXhY7z(}Kps!pT$H-XHFpN05%j(#h4z@qd1VgKpb< z!j1C^z)m|054*qZ6Xd6(v9fowvvHSky6@_4?`h{OK_?@x{pVI$I}3Z4KjQo6+Fe&` z8`;~ke>=&4sHCkVBf!H89RQc1XW`@p*CEKI>8PNo{Rlca6oi3r@EAmZNbm$ifoSj) z#DG{32jW2jNCeNobMOKrfn<;ZQb8I>2N@s}WPz6;8{~jj;5Envc_1GYfI?6Nia`k| z1!bTdRDeoQ1*$;}s0DSP9yEYP@CG!2X3zp!K^tfXZ$Ss>1YMvT^nhN_2l~NPju-?( zU>J;mQSc6ofpPF2d;k+*5_|+x;1iezGvG6r1#@5?EPzF@1eU=HSOsff9c+LvU=wVC zuV5SOfL*W$zJc%H2iOM(;1C>vWAGE4fM4J@I0a|m99)1uCTc{;TL1|l0~CM?&;UBX z0GI#^Tm#qu2jBucfDf(%0ze3ezzuK{5CalG3djICpa7JB3Qz+YKnv&qJzxO003%=m z%zy>30ye-7H~=T$0^EQH@B%)-4+MZ95CXzL1c(ALAPyveB#;8qKnC0fvOo^V0|lT6 zlz=i&0jl5*xC_*PI?w=`KnrLC9iR*JfIhef41ggp0>;1um;y6k4lIBrumaY=2H1l8 zzz*002jB>tfHQCbuD}gE0Pes8cmglr4Sawv@B{wfAqW70AP5A55by|uf-n#c9)k!F z37&u`5DlJ!7!V8MKs-nQiQpM{4qkvHkPK2lDo6wAAOmEAEbtO!gBOcc%0xh5obbv0<1Nz_|FaU=z z%U}hpf;F%XHozCK3AVsjunl&=F4zO#z<2Ni?1KYv2#&xp_z6zHFYp_jf-`UqF2E&# zpLh_i{_u_jkO2xn1!w>rU;s>j1+D>XfCF&B)jK@=zc2s%v&Ub{zasEg1pbP^UlI5# z0)Iu|uL%4VfxjZ~R|NjwjsW4`{~m;NJ8Q{DQ=5u4BfDrW!#f2 z7ZNpeM!fHvIjhBWdWgAjR?GEEbv_BzO^)a@+w@Z(J&K2`vfWZoQ?|q-@MceaC~BEG 
zn3RL8aPEFlOu%Mss>Qq*9`+3-lH&RjQiQ$EP8P7X^_uac_&C!-taMV*E%bq1g;@Va z9z|Am@#1S(A%bIJ zd%NMPk%_;RW>{8^#ym~MMVLf5;!V=b`#+}bxDb0knXi%gXQZ|1oV=`)#VP7@HMcR# zF-?%$W5O9V&E9!HTG zjdg74Ce=uy&h(M$r6B4sIwyM`F;>7X#cIOEPm1Gx<>hP7i9S4AUO3mE^WCeA*sP9F z=iT+_{1!cLQc-HU@lIt<WR4oWJW09d!lLD~guFLo z4@|G;9@NQ{EjqDr^WL38ENL!J;GWBhf0}JAy|q3Z?m{!#X|RLS^z>lqIcfT-9 z(5#4~ZE0(!bZF+C3w7@7?SFgx6>(-i3P~k1j1D>R4jS)Dsok#z&8lzDXRTK@wmrK< zTBw%qh!I_|&s5#sjT0;PwBW)XAjNhSM&CHC@MiIUL2evH*Tq=zt3;pt$0LXBNiLW8 zluE`;6R(1Tulq-7xe=ppl7!JTW217CoH08XdQK0tWYIr9ih9MUCwA&qG7u2qM?}LZ z6!yZ9Zr|PB)M81hUQ5qX3CXvx0X;0cD%wUs*aIooI^b|&$naZQ>K(1F-f2xnwM%}N z`J*$7cx2fx7zLI4GZC96Rre@JJ^2tK1h$vZ11@|Ho(3!o>U69k z5}UF&(gjUZ^zOu7zxHayH<q+d%5-%gj-_zg{2Wsc|QNt$)z>Jtl=Hvj(MSgg>9PV zEO2me^pM3TTP53?(F2Q<46`$G$uN;=5lfcw`6sDv5$le~8^1s3Rpms-4@g!~$G)&A z5Z6yYH`pVjBH-Y^lYqdkJo8JE_*+y?ruBotli zDVT4dRD`VK_&VuYsb>3bo=W_^0vt3`bjEqZpmz*YLgv$*Ora&>6cO98qqeAeW^L!|S7pu-k3RNNt8R!sR!hD&I4s}vU0+geg^iix zf$I8BUCYokD$iwKiaDAi?K1mknj_Pusrxp0?cA>$>L~SsLRRA+4P7$5k(qgi7a&KH z@e!-3=0~0CtHI_>{PDTyY@yYuHq@qtR3+Og=Gd2{Z|3IN2ZbD+517XV4k_&)@3PM| zZy=72$=_sV_&gL!zBa^d*?sXR~RNNElYs!*E-pAo!QqyBGSr91f zB_6#q?ct%zU-*-|v+5XeU-B1yDi)4)nw;^yr*Ul(6!|CJ=K1xeiNA>{k%mR$n6#SS zyOM`{-K43&2tWw?eDrMMxqF|$61&`*f^}z~5E%jrZQ=R(4(F{@7J766WMbwplqYg8 zaZ${IE{<^>>m4zgEfil5?e(o(=Xs9CPmGj<+8*4?*2mab8$Q|b8Z*)`<@{dtTx)FH z7hW_8?K!;mXLcuQJ&n|X^&32N#-+cIL+bBa5WW1|D9N`;dJwCiq$gniVULNpAk$ z-yi3}jd(q_AGX6&Chj`<<3){&7=`Tpn>q;|t)CceMDD7W-wsZ!#q=*+mG9>t z8MLZP=fj|65yyz#Z>E=xCBBa-tUqR5a28|Dv`BKiy(njKlPQx>3xBWZSE!~F?@yBO zkUb|o!F-OK)my>IMK#xK$TGIA9B_j>L~tXK)bmFJcJ(}%=3_9e$2*;UE~hq-&7@q- zas@*1^o2dy&VpDnt4A-1X_&D~e110CXKbJGQS_G%Pe+%%LE3uY>)o0G-ZI=?1qmuN9>vJqhXjk(*OgS`V^ji1v*KKa zD6M*I7cW!wWKnY%bovFF;=f4vqt}z!7o-$FE;>Xh<7guB8@wmgBoRB}^LX0#n}e89 zHeV^HMxH<%DeVF}k`_1B-N$Rz&kx*-K1Hhf#>5f6S7tF_vKPFozG+09Qtk zoxkl|^DmoTjW5}cb!!iiG!6n5CBm>fW_m+QSMih25G#Cb1|+N~SKWz(#hqDf!cQ$+ zn=MG>2xTLB!WHK%di?qfc#fv4468g^7Mng+tKQ?Mc2Fm;2)MOvPam#IpwYHCau&Xi z%UG&xsAR{6dFwm!0m=CTTF#~n#2Z_f(ekpu!A4>5^T 
z-r&_pWSq_p?J9nKcw>i0MDaG>EhhTOw4IO{b9=*wag|I_!Od#LbZc88FQeoiy4ser zGTlr05tdNitgCl+cS>1mn!xlLU0J!`@6G%|1(Rn1ngMD?#)iCR=Q0=jXVX>JA6jfk z#K}iHijn+$Du2#tA-+(eU_yQL>fO>n6A9LP(TIW8ov7wq!KlsfH@3FIV#pX{PjNjQ zj2#I5yb1z*61D@pf^459W0D+NWHwF+FbRz5^ktoqS=_r>Q&h2Ka#Mfo;=H6GEtmpP zLuFTd-%3N;&Pk)?w`ECTAwvO|eQx@pLge6v%xps-$321q%bXW)JV_)L1_~$AO}Gz5 zZxK9-YLUYhBylPI8me9OTfKP*o%MS`?PgMHcrQcDllsz~eOtxx ze4%{zvfccR_Ha-77tiCS!i;#L+p1)l8VIoo#Q?(|9fhqXIyTp^+tGW{}K z{;k2tcwdr>qm31i8@hzf)w`m%X0Rtr`Jb`N(FvOlE2%dI%|WY!FvSPu@8_{H{kQbIPsE&rm+Bk|H`D>tMq{U|h_&>5X%o zaI>*oUO%7+#Z%Gq=R5Xz!&|)?6rHNnCvNv#FBIdC_jHR=dBfmRYsvQUndVOBt(Re8*_}*M>xhE_B(yp#@5-IEE(fN~ zPL;y0HzEoM?8*)=#@t#rt{I!VSIaVTKYz2=v<2hoq0N_yJQiim)?);%^PhR%wc3o& zqL;UAc{RQ>{jN$4mGM*|Qdu(Vl`XO1^)OTtjF3ILsMp5Od_xxbQFqKUsO5sLeT~P)fc*FU=66XQcFj1r8TL$C6W&zcZ|My-?K4d_c(LFucy$`&-6OAh zLON0GA=>)7KSd!uh{KYK&EsKK3YVDMJ0l(XDBA}%(!}#W!xg@-YH484o|7S?cP zR8vW5rSNk9CC$FQuI3bi1xp8mUS8}(N8kW5=0Y_Bmr?WNY#nC$NqlvUzizY1+ePj> zK2+4*L|IDFYO1Z$0nbw{&WJ5>IBI(ISK4 z`a_}E?4hKg#TqyDDsFHHv1_RGzzX&a{5MaD>6zd96i=ER6(SC#mPJuCncLZ-V2`AkB`}XL5Nl1U zWvVc5GQF!aC!vM+?&u8> zM_eoc4WZ~y68ne#$FJ54KKkjJP27G-sl$+ly-rEdvfUmi8k{m{UvbOnNQ2Yx0OOlg zzcmgz))e~Qw%la=L!L}#(rat2XrU6uLe`Fy&hGuSpvM!>^*q9gl zHv2!(rg{00exbFjc+2U;9VQX`jAe@>8hx@n$U?v(hzvve=)|8%G$kKqjO-p{=c8$4 zDnF~LC##U7k>XW5=xw$1Gv1$@g02?tbl8+-nN55p1&L9CM_DF|#yIn}>gP7w?TuEP z`Js>5HZ}A)9YT17OoUA~nk+58Pa{VU6h5pEr!JMx>5UI(W47Ez6OY#?iQBlOD0}(_ zB~I|C+k)2}r$^zB(+h{}^nW>?QDn~{7c3rcI@?G&Ar;t@Ym@6>rOqBGDRodgf9)fs zd@i6lMp6>i%$Ms_Ql(3T zjP#T8AkCrbJ|==#>RvY0g`lqOejKwF%_qK79b0YuA_|`B;Q=_u*42ygiCfvmAHUEI zg)EmTynD0sMM5uW?z<{Wpux!Lai({@qUWPuCRr-v<|Q8aDc?Pg(M+0mUGbYV zhb({pYZ%62$-9+ebtZGPZ-ggYjfQQRYny*`dSmcUD5 z+<9O%Qk~2#*lZ3ScQ zuI1$b>Wc2)Z{Kyfc`XRKG| zs3~@#RJ(B~gx&6?TMQkd5jBf!;Lw>OOW+dDeFyf(ghTwQ2<~bs+A<`YnhGgHu4Vl8 zK5C!a>~8xm9SOc34faF5c-e{N-rP@YA)wJ`km^kP>a>4kDWBFtH1nKU-y~(2e0$GP zZsC0{f!~4OTg%1aZd6tWaYtH4y(2}rCv4ce&z?4xRx{v1Xq$JmtWrQQA!5{=?6#E4ROkB@$2mPXP=SZ_lOpN$yq6c9Y`kpZ`*n5! 
zvkfX`)i zsUo&TZU>V0j2B#t^G`YqH-FfwE5dWVNlt2Rtd61`eAACWgsC6wwFt5#B~J`Gzlv*=@I z;pMvXa<#zsNDX(j=+`D?)|A8Ztw$;@%|vLJH<8zNyO4IYEw4L znnwDpS(iv2N22+$n15|tTJAba)P|^7opOu%BbS?aWtnubnF5t$i>6rz{PLM(5%L6O z7s0pI?$GF<%zG&I(0)xBRCEp>wLEC4X>~=CQ;a5WXNbw7@^)Fg7>;|cl*MhC$-5)H zQdK7(V~#0VJw%Qxc*kSwUEH}8W^eRJUPh>%%YC2b_sfqqmI642H6Dhx|EgH+U5>-r zVv%LAibM|bz`JgTvEF;%&K;|#@BPuscUbv4$cZwzWkFI_mh zDtUzU-yE?fq3z|#(|)eICFd=CMTKsD8bM%ig}<>c2pjs*!eZ@*<-z8%9NU^eHk$($`5iNgBe z%8!5iNhsuVG&Ox6`!jb*l8{{LUezW4K90oNK>n!bw+&Z6pY874SNayHF}Fzjil$91 z;T!HFM9%b&_tZU~c>ZJ|-f?}YVns)tmaah9^Tcf|-eG-5rNvTcO2ElJQEPa_KIh4- z%v*))7(%ZfnC$;lZ*I~Ij1_Oya3~9=VOB7R*OKyK^0d~cuU}fe)h1GKJWy+VnjUUN zLBae8n~NfxF(#d}__WffE3wdyP9`ahg(9$a%f!rr&|fxdeVgZlRaXCX!A^sIVln4g zHsRo-w$r`%wqQznX&-syXtu3E6Rm>L2M<1Ln73h`OR(!pq~<;_W$Pe|NvD;eEz5Z> zL?U8bxzw>eqE?At6sa3dmPObb^NIb7)YONf61~ay6qia1BvSpGclDja%_Lt}4Ljvw zt*jPtQ~o%-W>M;2zrvAh-)B{itts8vPAu6<*kAK;%ZqvdRWdYSq8a-j>1_^{!A+A& zgY!-dVz+T2m*LHI$@dGatfYMrR5$6ZbuwDqd$<{mGdLJW-`qPK=cqfNOfkul(BPa1 zG}hTUeL>aQ@W7m{c|vFF6{dcpCOh(VjH6zw*poNR2Np@sy42M^wFezHssyo`-r9UN z-)uGL{^*#(O=JGK7q<0aTFHC2m``(z&R*Z89rzn!kyauQ`C7k*2DKskPv)5L6)sql zm29$;g$}xkt7|pf3$<5H`#n->)Ktpw`>fkQ-L%4jr7B&{a$H+gC}OBi{h2;r#hRqR zqP#@=8Mf&>2`~Etj=P3&7iwPIxqGcRahR*GnjNkcnfbkqx(Gzk+5eDvEA8!`dgT4A z%Z3{fLuNbqRHYoAm5k4dU(`QZ8e}?ky3jGKH;EYS6ka00DqvZ0dC%7DbxrAyRndHdOg-gzCPG_z7WR$Ps^xI^%%VUF=@g*{N4))aA4D!K?3Xi~3!AicD4%W6yo; zYl42NzZ*mBiAjyiZf%K@=;!h*xS$se`QjCk&=f5(m)&c6k1h-2!b~PYS*v$kbuj~! 
zZ?Z}W2;Sf%;&&#WbymF%b#gX2ZrI3CGp!ymPtZ3cksB)*%tiB%U`Y1=*%^c-GUm8w zMV(ZQRQ>PSP`i?}pTwnlCz68;M*P>ItvYc#WC|jPNoG?Dy?O$1y#M42XGr z&^PyD_glQ>1x??2dl&g$mmK$9r9{hY1%%eiH@>~5twqZI>`X<^wra>etoS@}-5r>~ z(Rtx6i|oQeRsJMLX?lp1eN~3i?C_o8FGH+ES$}aUs)=Q5V&myd9Q-%qwLETPi9MLw z$2Qw70?HpF>w7v7x^d8qtth1Sf@_P)S^0>&D&i{p>RBS zswvu0EAe%91k)pDj6ZT`c!zsF56zrX!5BMMUu@`!W^J59&5)?hJ(eORhl3ps`6agY z3jfpY@Y@gLO5fU;^`Bw9UF9aJ#gJ+YQuz{Q>0kCEK@H9EuB7c8C3$|gK~l=3ZSYRL z(kE5d)`RmfTw8&1D&sv5fr=5To4xgvZfXRb5A6$#w1YK!8Tkc9@pJ5+MP%heQ^VA8@)qlxXL?BU9@E(!Ubk9YeBJzwq`JwoeM zl4=v7#S%>A#C>;-QH3K}i>M%tATivtf5$v^_uF_f<1Q^5wg#qiJ*ETs+g)QCW5vfZ zSzNa2kG$)W#L>>z$!0H$JC*6tOX{fPV|0D%C_;4N3d zWeB?ua$?^nHR`AZ^%udtVYaKk=dV?B2**vk?9=OfiY7Y?-W>Bl=xFDvFB3lflyye0 zP*}2ELtPgx`4vN7yh6J8oy?BZPIJbn@aUyz+^+iCzW;MQ%bwYqLu*7;hNscmy`3c)M@-{mu;4j-B2(EWzQ(~_cTKjwfNLQtrps|G~Jtf-3{D08IS4O z2<@F0Tcf5_2V=K~G23}Vi`F0Vzb6yE)9~|wf@Y+O(`2q>yuC$WV9~V1?w8Z_VH%m2 zi{GqDimRjgNEkPY+7tzn>$0bq6w6dwOv(|vYVq}Pvbc%g9kv{3eHVyWPId7_YtJ1y z`RV-pdtzs{rqqv&kGE(BH8$^fAMpyxTOo#ASB&Xt<2d#1=BwNeG}Wm774yF6X0J^J zXYQ?NMa|T_Hm#wfO_k=z1S;_G;UQk4mnm!C-)iHL;Vopzcgq5E5jA0Qm zf$>_?qw((s3F1OKPfU6S9_Tw68kiW)xU^&iGoa70wa6Z((MC29G5C2{jQ*OYy?8I| z&&iViTUQc^%RxfZKhAamdsplxrlw=f+Dg|Zd(aO}(h|}YHw(H$OA zdt!~+7PnCMel5h!`SHjJ8ikrSAtPN;R+Bwk=88#nOgx;Px|=PH@dAEcr5dkajSZgY zCQNrzqAzNUW$CvU?v%4JcKcOZ_8kNt9(*9o8UmOt(r#h*l9i9 zrIy<_X&3w zwMN6@XcMD#=0szjiEYAt)KvX1h@$s@yv5n^kLEt8Scs*e#8Wjkx*lrN9CR8=d77fj zN>pru(;^=_CM-`DV0e3LMB#Eus^;-JpEc>w%J$QvJ|}~GX^n>A;fx2U`{qRP@38x| zi_eZGO|~m`dpG)vF{B8v3(M_}FZt0#Y02bn*w*B~$(Y5o+IAppX8UZglz_FCzKE{# z>C5k0x5VE~TgmJtm%l$X+}3f#F6zMycClk=t_&N#Nf~{&ckz`m_XC~Q!G75i-o@W^ zhLV{aFW91$>UQp99lsY)%!m~k;9H5lXCaidx%oEek~!lG$HqAh88g=lBAibXq&VSP zh1fiE^x`K@<`218XIaFYM1M^;l4w(=6uguA-DPq)Rr8(=?RzdWWek6*JEAsc#G+5x zD}m6d=-aq&Xf329zX}k9EOtDpqx!^)YG#e> zJPrE>)085w{;s6Owd9BO-uc+rdGG_QsncHgEinuE`r!UvCXUmbgYGo7d(% zv_XfnlP^tN)0=R4hW^11RbGRIa#03#vzBw_RHP~S>+6!(Q|Lo+ZPC+BwL zO$aVCR{MvjcB>}JL*yqArbh%LCB88vc1Pc>JCVwN=^2K%a5FL0bsx{Oe#5a 
zgS6NC5?^1dC$rifw`m`zjOAC~&(OIf!2jIeaP5RIX1wg(s3Mh-?!xqQ9ES?iIsI?T z5+8qX;XCK8^!SF!QNGT?nG2px44jnkk+o<{MQo#JDN#adN~voxyWJdXQq@H?t#a&C zZM5}5o{rW$x3g6ronWVVKd99@xC8y2d*nWA^6$c#H)B6t@K?jkxE}m;lrpXSaSehX%#o>v;_*+N=%{}U0!w8Ih{BxgJ@C$0A zeN8j6Ub`SJ@ZOI^{B16ja5kJW|Dpu*zS7M1ybC4G$SIrsz!;OSiw^HB-1ccSXrtLV z`E2Er_0MMNtSRq2y%*>y_ay5^QwG|i_>B$X!O;rO2NpNe8Xm6gSZ3FJY<}41^7k9*Q8}0N?kHy5WTxA-a&4Wr ziVJqdsuLc&^hT;!&Ov!7GR{Bh&>w59rLflJOhJ`-(b~ZzoBD*@6CIU>XyT^sko~=} zTLrFr(h_Oyk|EKyaW^m~0;KEu4FWtqUkK;d6t1*&$P#kyOsH=48nVl|N80sOhN$Hc zD<{r%a>>8;%*PX-Q+IO`2`?f(jqMu1+lxH4sxq~g*?2(mOOJEC1WScoF|7k#ZAy?! z@qyhM^8{0_;`N8W1$Es;<)7=}7~*2}`&D}d&Q1@0AF|)!P(Ew>{b;c6l8BS@v}M}- z9izmU!i}J0uaW@!`exiu=CKmGdjg?p)M9bNmT+L)Qlmbm+ zp$&8)KUMki%=0?K@xSKMHc2F_rXl#Ehpsx+J$*8#Vt(DTp#C}KaFTkizk~WZHI1sB zSBs8dYr4jKE7E%T^F^dnAo4i_E?oPSOP-AuWn>{4D z#rKE_;X~!JfQd4Dt3?80T`L(!d)$s`0{a%~?k|Bw%l8-;3=yx`W1sv&iELZJ{S}YW zu8xaBA?np!jE#1t)wt71RQs*v?b~J|LLA)WT+`|wr^MdnT#;foB(J&fJF_5=9iD9X z`BrqUD(nQ}5m7OU6ok<}*eta;S&<VCbv*ZWb(>WjkLm*1);9%j#cjC1wX z-X!QCN3B)wI`wc`%y!)Q+(n9m$GN$&@5}h^qYyq(zy75Qa}(01cvg%A#MBT0AFiFF?2~}>tw4f!Ku$A_IRz4YAM1nq}PWSlPuSXI}SSabz zCFM#DhqTq9sPD=?zPCBbi}QPqFtNd{*Az5}t$M?-q?ed=%lLS1?Q?g`)4?y2MMT{% z&_hblZ!_Oke;?P#j5N5#pZ-E$s{C~id8Nj8MB*GbP6ES;2n@uMwD})9=r;E?q9$lh z-N%AD!lxRx?JvwZa9^iMb3RU4>19*rkv`V94jNnhN&2+y0@?l5#Fy%5Mt4*#It!)u zy3UHh)4R^1NbbixO)PTo?=B?<#$+0Zc3(d?*WI>i-NzHH8cB#wkMzFr@%ap*rJe3~ zEh-HYli4eruw56*l$-tNb2M=y zfq2Rv*ZAS3Xontya`0+DnuH8SM0q2AKzZvO?FoaIs3zv2+YyxSUsaY)Y|}WtcVgB5 zy}?L-Lrm=Y(bEBUyTz)Z=-jTx9}@$8`~)4Gd}P82ft;K!3`tQ>4J*Z^OkH-i7Ui3_ zG%&w-ZPrYuXU8)%R(um)qw8S6>v6An(Hr?Fic8sHWr81m5|qaK?WGg*P_ou&jkPzb zz5ageEzxF@U@oJ>zQADf0-PHS4cv1CU14aGoxQvwOJZNw9XKoVs}{Odo1~csQg#^J zFm*16?jd_ApJ`-J+mr81Mz<8F8`s_z2)bd1DD~bi~4nuyK!I)hMwhdEmzx263a_GW}20&eqrah#xAhd zk3D^^wZ?axT90oQ^suRck1|s^kQ$35`Z)o^!6pvFwSX=AI0AGhHt!MlM{LNQgbC!9 zF`vd-YOq@mu+jCtxpdy0JM`69cW^1+yaZmHeE_b2w`G8B= zC*&{J5{`0M%%2qme?KWh$$3AUJU7M=dvW01@oFhJ@N9skz)geB{hjFOuMrY1^6^g<)IQLDdzt7$!tskztd(HCM 
z`HZ-r-~Lu=_xqoBQR=y?{a&*Yax|FGu~j|@w0>A*rmaO9K4!IIf!yKjQky#O%S_cHBtDZZ8L1oon1PYCC?0V;0U8+C7ebIpL7xbNA2_v4o zf&j6SO;PJHB7=L=03Ywx97|N~^OE?_759g47)x&^*uKrOzI(S$YCOdBz7^`CBjySg zt7#iXc822h@OKT;znnh_j?DrUJ2Zrito81kS#BA^^NNmGRD@C zJR?a2N|rP>OsT>P->9->r2ghCulEZ7SWZVds{P&S^=s3o0>hdi59bxmI=c!i?q?Oa^T zlARtsp!Rzf@?4RjiMwF%+mLBd?#ZWGLR@e6P0xv^&b}4P11sYeojn=FzCJynQ38Z2 zqmgn05$oz7kiL~~kdL!-W`(n-P4ANLoDDg4GMqDy*nbi|v<>0?_*KMR!WO+0Av`Ku z1o^RIFRI8Jub8HwRts&0muie@7H+Ic7+-|YwG4)Sxyas}qM*WmpO2rHF>YmfD)T&p zc1wSPUf@-?-sp(Pi3)9pcGo1?b_+^q+Xs2(j_LJiA)KMZIYm>#NcGmVz!Nr)c$;qL z>Me<7IT>pN8N$NwRNdXCz3%J?R;x3ow#+Bzwj`!4P3rF!iS221FbtWiNbJRpb5h;k ze!$G4P4}{kL0F@hV>q~>`oi-D&s`!y_BpRM4E94$lyGj`_HyG#?lRbCJYMMBi$iny zJ9~tsV&}JA&Z8AAd1}IbMcimh>7>3koSvdf#EshRgfSODOT7C*y_g0!VlZh;pq1Od zgN!a_O7lKt&^s67GKn=IYy0A0%eigmrO@s``=bwLHkLzy_zO#7hUMC8R5ja0@u@i( znweY_yJ7EGEA3;4@`vs4P;}<2vqq`8xgDb{1X8JT-x>*FvMk3~pt^i{9Ns!+weT#~ zk7AfbQVh$;Z1c%EhFoevjb8vFOEa}u37XcAmb*8C$Yv20)*06Z>@-ZhgwhQJ#}&-D z&6m7#k|eKXZF;`@f$#pV!sHqszF#1&ABOZ}?xYT07d$@lTM+%|mhtSxGfS5Gl*gh%pGaH!8OWqZ_OUY+Wxk(WH5 zBzcI~-4D=hc;#C=#?j-nepSYzAtJqE$)Pfe!!15BpBi&b>mj%k-kuzpTafF`L6TQa~ z5)8?;dL3!!LJ{(2>tlD;J0~?NVyjI4#xn)~7-vfA7+)=oE#Lfqfz&Bf}ypdWeE zvNItmQu%B{Nk zEtM^L`?YsnhweS}+1?7(t8dbAnXRtE_;P)b^R88Eb%;z^;mfz|Dlf8rf|Q z_0IUtmDD6TJk53)TB&5b9l-U=AXtxBc(Ar#+bY!gvp#j~#w)7dk4{H@=wIXo;xp$G zq&1;2A6;bjulZtPYm}lB5X@laza6lNRVlqN5dIl#n*YND)h6lAPDBVTjm+oVWHbcH zzF&tv{1L>wi3S1-*JCAruK3)*x_$X02k||;4H6A1`8iouzuF((&lXfO>ahItW(YnN zO~#Tjb)0wp?Q}ce%|~SJC1z)zy?wKP29?=inQ~S#W&0s>(GjRMPJV36$0f{>58`WZ z_{M#QnyQ26rMT`$(W5@qiPgJjx~YX-`c9Nz7W>5$Uep74z@D3&YJ6` z*DDASvSsT>pC5a=g@rsfH5Gg7yCPsr@3pi!aWuVx=sWhkQHafbtYDmS@Bp8PyY-6> zV_2teocHUGpR3;Ja}!AZH{C+@UVh_7Gb>DkayGxtHciRo zZ{G4t*Ne(WHr)LLo8yS@i6smBbbool$|moc@I^2|&iB=E9)c+DM0w1tSFVmZ`=VrH zFS%b5Ee7NsAAXT|T-G}1Cq-)Z63OE@p0V!e#U;xg-*>@~S*cLT#;mly5#`{~B=?Nz z&t(nr-M4rTDF_>8w)ZxUD!&-mJ*;4vjKFzT>#vL4Y_K4>`XrBxvqod_23LoO2DZaD z()Vfww6cNp*Q)QnV%c_+%0}LVEHcI~=W-;zFtSx5u_)ThPsqEIk~?-N%cJJfSg%{0 
zJV4)!WLXs1#wFqSLNqU;F?-iP{_R>GLr~7ZJqk~o6mM0Qwbe8`&wSY;uReU41D@ww zS&Kmjk$0_}3SGiGS#KE}w*3|$=(sNbib!X%mra-#y#$+RKbCXS_WZYqLZu_Z>b6=x zWBr0m_nr2YI=-~2-<7Tv-sfCliF$J!5y6Q>-_jY@w{D}0zOWD z9XhnRHo1i#yxM$VE&hWjr#}Ad#CuhCMv_m$lk-de-FEws)R$c?Tj+19KMwgM?A~SgGbt5Y4)1X!rq7?B3@RQK9XhiMEpioYr%Lb|3u!$J zTt31xlwohv=_Uzbw7qVU=ET5qYUzA~jg>vfX&SX0zwK34Q9~D}(6o#0{Ep<_IK%1K zYE>lnOx(eg=5vX;*3`&r${4Gq{n{I+Q7^SN{njY$aT!c+AiFEAIMn9F>|COB0XS+|N`9Oscr zDh#D?W%GK=ZDkD|YWFQ(w-YTMA0NNIwb_19 zqWRQnTmz9BbBs)+l&kZsw%S>$y5BDYt%&3Uk6pOJ=bJ)OpR%ucU`oFB-IFiOXw=*> zqv@-hpr|b9;+CV-%{pWxRN4KegqN?|Wo&AFAjFk7rqfs@nb!Fz^Hw?GwK&B?X8K*GiOR1Mz?>+K7T8(iLG#SWqS)9~(j z==BQI&wy@91M|{pJiJd~F*({fV)raW8%hMRJ(BTq`}_LtN!2v=SI0z3Cw3kBBaFP8 zJ*jky`XNJ$|DlT`h}>Ak_nJg6e`c=F!{%nCp($y$t<6J%_GPD#9p~ocz>)M_9D@Cmcy{hQ-uCwe zO3A)c#!Krq%RbE)*HvUETHVtmOUYtv8C?x&_L5)RKJl*dHvGKi#?YYH*QLmxRp+Nd zmLIH;K~1$BJR16QBIWx;ro(~n{{SgK*1zVSG_aEksoj5jQs@mXjHACRyMjldBw&g*>H79J(o&vfNC8`FlDHLh z!p=mBxOZpd&uV>Zej!m7r;xbWX8r`sz0)51kTPx4IeMI8k}n^4Y!6pXo$|c!%34qQ zdzTq7Wm9StPE_`~aBGuv0~t^Ln0aoew4sk*O{w&L{`M0fLF)pq(@EsnM#5A7q96&1G#yo-86!FGrYmY62EOA5i}VQclYAJ*y~FedfaFn)=;o;L@89wvrWQoWSTS z;U7I@xQ6?vT)IwRrfqv?00e5y6770gUBWz_vP1z<$gUee%2%j_CUZrmJlr6_@`JR zr5Jx(@j$PlDogF2UU@Qq0xf7shI(VdH$^=|8~st<_k zdXVBP7Nh;?rb&JnUF$Qvs;-*Mw#W6rgU7`PQ=-|Tqi48YsR{zQRxF7By7k1iG7?&J znFe;wY$BhGzq^912+f+ol0SWmyrybW?vi8yg5%GL6as4?fLYSLNLTPHwGG39iz{HS zi$AZoDf#GC?oiOn4@~9>f%MT)3vvXn^l#uwdVrFEt69zI2iC2_WgWa68!aD@>gR|) zT$PcS+&OPFi`MFy6Iu_rRm_f<0V|Q^V%S!|rnt7=3~nASBKFM$UOe8s&A*Gy1Q-L; z)Yh(}+?{5vvPPrhjr6Fbp(IqQ55t?e&m+d5-;nf*5Pgc1f>^28RjL~yWA4w4`TFs>tc_q@4M@+mI%K` z=*XLgFgj$17I?xhuu(a?(TzWTHtybQ_L(w+p6A|QYhl2Xd%pM5o;#w9K!35IAk!2& z%oCLq3{Tjjwg@Lb*enNdc(~YLXKdbYw%D`81#iOnSF`Fcm>w|Nr1k@I+C!t3*sv4* zY@ODniuMM?it5(!4~apmy=ecP6a%k+Lrom`+O!$IBYgM^lbHJ!Ud3A|+G4tli_cCS z=s2oB7n9Afv*XAkeg&Er1NF%)voW2D*dby*GOusv17ZhXCx}}f0JqMx07n*8714(; zKDfod%|F!5`p0P``=L{WirbdFET6(gA3_)Wf+1=%`5yJUlwzsC6{JoQQzx@S*CH=y z6L*S-uhy0hH#KCW1PaQWUzN0P3*U8Lkm84(m8tdir0eYgDW^6*1!CR#;_mEikge^r 
z0kY0;e-xNcxi$L&p%}m^AWZ_hCj6CTrg+Y6Ki|=B(MO4D=4`SFUmaIR-`i`4`4~+8 zTn=US^~F}c{8r1)M}a zS`__mGgDh+c7y-USM8(bS?{gI$z(B=NNAc^By1YrZYpfCryosU)$AO&`Xt;_NJMkb zz`8T&z*W{|zs;y3k5@UXd*07`RzSpQdPK;$SGur^iIkG(MCR$99d5~iMAz|)j0BpD zOWpBG43;8w`Z(pb=otSA@oq){12j*c#eA()<}B9DmE)z}8JrNa=hbBOTqoR#PD9nT zM7`1=rGo|-I#ca+menKT2ZqA8f;lVQlBM`}&0 zjil$OuX{hi9)Ier zsJsPg5=mv-g;cWh@M&89`ZWjp2%Jp!b>R&K>OqVa{Gcyprz^-yoO}>y zun|#F0ISVIQ0&RkBsR~+r?IHWoISP**u_!3xKojHtYXPbe^gJ3X|Qf29ufc( z*Ozc8^b*3CEAVVkD^#NM*OsiaRi`TY3cNr_AR&NSwW^W zqa^~I=$6y_C(0PX0Shg2ZInL$5~Axl^3t4f4@9;!uzCEuh+fK@zom|>NbaEQ`5qYv z1?~J#&jE+)amzk(d-XYo5V)d+o`r3;^T%GwJ*|8K;I%=u09fAQh&Hb&a_)SM#ifMT zqtYQQ0pcYth`?S5Z1MJ4v&dy*`nS_(MKOW zV{hDW;;O?-m7OwMrayWL5_6N*+m|dm#IG{oXIt|tPqpagyL=KRWD^L(he|C)_;LCv zE(fH+-a<54s{jrF6hoasrr|w#=8;=YK!C4zjv0_dN=u8SWUB<= zJTH6@Vl$j0r;WOm7ltJ2l!;?Z_e1Uj?a&E@@C@+gv#I2EQtmxP2nZZvVOQlK6)S(t z*_~Oo!XP-qAK_qR7CN``W@6b>3`Yi^+&!oQDmgP~BfLAVjx zPSn0Nuo0Ih?|IJm%OxC4cshdgvMR<5CCV11kAL80h`LAS86Lu|?$n;LY$~Zmnz= z;n$l!?EqZXEVBx^Ntu^vLQ04OdPjD!=UQl+2VFw5M9lG(N7vRz7JRw@`4#XggGX+g zW#**ZqTg=iAfYNt7tL4vV9EEg#m51Sg2$6i*+FLQMxqMzl?>0{RZs4TiPPpUlM5Ma z-g9S;THTc`;mxVs0S#`SUy{*8aME!qrTI$I_6XziQNc`+EAE553*sIKV?+*E1d6pE zB)IkLYgz(4?TO~>( zx%E)dJq)ucF@ynnj}2aa@05HFiP>+lm z)9vK8772zJ;l$MTNdGUl=`L+0czAS#y3T4t`u%9Nb+&`SYK~!JEA}u|WHN8OJk$6; zPtwIZG@fQnH&w7TQURdsPV=E&UdkGw4^`S_CO?4p!KLIpgoSb z*4zmOsU}Zzv}em?JY-Yku`N%QR@WT@gCbkSxS;z(V|F>sAy?kZStK}-R>cFFerceC zG{4Pv)w^Zzz`j^~o2m=PmL#x_el^$=2~y%wf}bP>)h2O)NpY1tI4x;jFr-;7Ag*_& zuo7CPDFVj1oMSCP&w$Zn5%XJUj9{v#%;zO5?*yBP z_YWm(R*%d;86e$9_-xaB-h?Y@`FiSeX%`n=u@T}m(A0!T&K(YA6|yme?a19dRpSCB zN0Eq-dZ=?rZ7~2kCQ38=$+fj8dmwZ-vf9bv7q(Pw-|}X4`yR0uEeDaVv3)e>_K#i# zWzY9(pYtSz1y7$beQ5uX(X#Q4;}~Wj6hvcLuQ6yywO@L*|Cb$o+}7qb1{b;J<2vBS z79fqo13TXq5WV1TCHiU9bK@%qib8DNp&E`CH#RypUW0NX$&Js*XTWL#CnOPqYr`hE z{_h4{+cj4YUMX<5^u{wkdvEea^~70fYhd~j_VI_Y=y>z-j65fCuC1SAUOsqN;htHs zBk2V;r`};N5St9x|G%}X2Q(~8%x@V?aNe>Xl}D!KIEXZ#3k5ASPPfBlgyGjwgfIqf z%13@lGpCQ}Fuy1Lm)bGg?xV0~b-6E8JB<{<={3UD5U*T=(Wdw?HY75|_t8V4qMeyd 
z)bDiyB?f}R;v2(+^Z6()_4AX12;YIpnoWc0QP7Vt4|Gd7B&P}e%aMT zasNtvtdS-a11dR;Hf%DUg*SM5iukl0h9(2@G}T+w|JjPx#89JM;M^02Va*?aV`x&# zbbkBEztj4&5xIh3iZYDDhIysxw0t!_sF}!)UHiB2RDF>g=sLKjmf*>tSJOCqeU)tX zlH>3nfcr}MNeo&Hv9DF3y#96F(L9l8gUr#$jt23_z+O@dt>HL6uk0#yFv@NXSEW=z2Z3;PNZ zkFONK8TqSg3nVA!&dO5Aj4SqzV}djCkqU)apZUISzG2Sv$mioPAKPVm^efaB$WigO zEuxQ87(K#kT|)~a)`ji4GP(;X**cB~cwUws)H}U4FplUWf*Uk~(5>pIgsiB08IwdA zCT&gpWJq)(BF~G`&quYxU7tha_kAIITb^1CtGGq#G2Wb*v;+puiiNLR3nr6H=#d@Y zHAAiKv%kSXTS)y^smruQDy$+8f{3aP_l`};m31(5-tiN&Gbo(SDe`RK|Gab)syfh| z;d>Q-;p~;+jo3g0s@zLG_`U!FVo*bA7k zs1i;{2ceTvnSfVeZ)Zpa+8s>M5%aRTy;s5k{CiO$fF>Zb@Ws#PWOP<+l5==xj`Tc_ zmgX29ddrV7q@5;^fSdrhVU=oz)IKR(e8}FXkOw3HodZu{gexJlNIm=64$h}s<1?g4 z^g2Hg$CZ0Q8}2xw@+N;R6h{lEc@^OS=l&bG}?={I{Wz z!&rlGkJ;E~oWUZ#FALotG51XDp=JIvNjJ49PspvEpOe``(yKmbD=nxYj&*|Dj&c&2 z2ZboT`;r9E?B3qFFA=_YX4J9k#plRdf@(+62wV>AVM@HNo&eNX0j2Yir51ub8DOsf zAq&4gG^J#>R8z|M5h~vElsWYi>@B7OiW|taYJhUJKJ)d4?qX5vT>f1;s&gAstAI-z zg=75~dFekeuY+?Md;K(cql1jnH7dr!RkhR3_G5z&!xs4P&JdB)ySg(K`6#C4d3lpm zlh(@)K-hRZp=5K3$9)}Dc)UkskAxFHPZQ4GFtDIIDbH?2#z^WgJuFXe+QvvO;?0!; zG6`ig#mJL?TN~V{)o$&XhFCr5 zaClqz>CUHklOJ$FPPOsX1=*2Bvn<0{WL9k`c~5)?hsI*rh&GquQUK)=XiNEoTUaEp z*@#|k+-UV)3dTESi@IkCMiKF>E%M(-SzAm&A&N)XRMc$65d853O2r^HW=c*m5ge!( zSbNoRk*76K5rOBXK?Y^ZmhmClEkb^4hZg)RjA?6wclVC67gBX=(q=NnX9uUtC=YR- zaWK)pKb(Z8Enl#byk|Y%FJtN*WLQFdNJxXQz4 z>CdsCIuIoY*VjDG;Y}*=hF^5N6T?{~N^-aycaH00L1+LKuAsRGMWWU}D3R8*x}`*g zUQc2r)&1@0MD>DFdx8sPWQo$LB4^1IC{1!RyiQh!CyZb78vOc`4^K53%(#y)ga^C9 zBLTXgi+K5r({t0*8u;K}V{GQaWGUGZ`B#K`P9#s~O>?>=)L0a{#W0d@iKqo>i3WxH zZ>WrGdmm%2j$zP5q%6fyk<$z5I@xy6Z#8v(o_3dwtnHIq7yCTTP=bd`g3IdqAG(B` zfzyWKem!qfUN#$>m%oS8=Bjx2c8ynlcsl|yWNB=15PODEr76aOts?ySiJ=6%C6yLQw-Dl@;dRJV7U9 z;NMFkFdlg&O5wT(Dm_WX>06Mv?UG-{7e?TH1thvhfX=0OG^a(2E{VmmfQI#dpIM0&)?Q;QIQE z>y_b&j5x#*S-(n z)ygsNNSjypu2ZtZdl*Xi9)&Tn1T}sN5l#+>tRdisaud?BPALG_e3h{kV7dip$fxj& zL}GVxz!X$#wUBF06dX|&_|ixg^f_uh0`s*GzTVk8q#SBDr|)w)q=)V}o?)g^&o+WQ zn1<%>;}S*&cMD+xG8Cg;RbTkm?JHg^EF?f1@SQ+1oysh9L2m7QurTi-RJQu>)vCG 
z42vPaPX(n3Yp?CM*V1^uL}%^d=%?K-tp?PS2oNf70?Gu6IU2z3$xK$pU`|W~0|&-Y zAm-5B)CRwLz;PH?>%3yrm~V=V-HuQvtySyS(Vf3?p_-ZU7hyy}pz=$ehMq29v1+!G z6b}=PAn1h+p^~0hNUHawC+dQJ;f>^Wy6IJ*G#u~nKhWS;iAvCRJ#<4&^ZHM;=xb5M z6>He%;1E8yCbD~3x+Wes^cjJ}^m9^JT#Yt!1w{MCKzc$I0T+K}2h6dG%nb}e%{grTM` zQynijyj2r-vi18nj=VpV>(RUbT03Y878L<#b{k3VxjI4GX-w3p4Q-_Z76$(_AK9T0 zv?bSJWET;YIp!yXUej%6``i8Nyiu!&6QKgEyKGeOs@qEq1C4Lyrl zv3P80@@K;rod4i%+VvedXeAU{(miJ{QLFZmY`j_vt?xqigmSxoADd) z(qyMG{RKcX-g>0Ignd*{;Hb^nHwx>>l^FR*o@{Lhwf1;S5>;2$9mO(xU{27vI!F`@ z99+b>@ZEW%G9s4Xj1V%zH4N}CM|w(@=B<9>^>9>7D|A={2_HT15#9Tspa$4g#;1z%9 ze?@{DhT?f7`v6Y;DErKLxw@Efa&%9?2oowC9saYqqMs~kR0c3d(`e5%-&u7%-XUDl zK$&W+Q#^jvAT&H6!@zO&{YLtx1Vuh-S=P3HH1a<+28vxs0G7RN1K^VvMk=|S_|uU_i&6;LguVaC+W|sNvd}` z;``cHa!p8ewx&KLYY;ML#=&l#yoT4?8$5r5em4{;b)MN>0q7nT@JW~7nxp<1^p~Jwcxq08B48-`G zCV_~3x;G|e(UVeDoDh~M_M{4N;O~=~8HX{_x3q6Q?%rd@skv^lIE0=Eh@&6 z!;%6I^Mn@%gA{^PMN6BJQIi1a`-JKSY?nB*o25`e2O>IVyUTxagkNx147m5V4`EXo@cZGQ(W{MwfzWgZj}PSy5J&ZOD;%c|B_JxA->}e zKykLl6l#@jv{PfROEcMeWEa7K@P-|LyqfJ57|`N{B%mkNZS#?1UVJ%2&U>)9A=e_a zP_~?RcSUGyw!Uv+EOQyGB-wf%dLy#4$xO^Zs;I|mOfGc#5!h(7-(wu}lwQZG)7>~% zc2jSvrGQ;qhBZjPMP8tBO=JlM5%5vvuZRT9IueKu*w~6K$kcX#mVaC5t5Jmy z&nWSp_4=61IOVz#RJP{m-HaJv#Cl5?1jQ)1v)#`N@5c2DWPgh#TLARR-J<>>j)J(_ zY%T`tJzVDT>>iO5@4B_uSm?h@%C74TWca;7U~?4*7+qkzl2r75;8wPVAU#426R`yQ za&`}Dq-MohnCF$|@fJd(%iDl1&%DR;;Zp%H{{bB4r#^`qS@s=;DK$^$w~6$HdnE0E9BkTLmNwd82PZ|cL9XO?e$|>#1Vn`K?F?yON!hvouadcG{?kLC z=t#OB^OQPZOJXLR&`+`;hg#6J4|YfCxI26xVV2H+#-TPr1{_p{_gh@p^>i{I7GQXUy_Hg(ezlqY@@BYa;H z*As(*u2D$XlQ>-?x+E3O4LdO-oi^^8F<;GV z`t<-Te;LA}3L15u|80HrGGA}m?bokP1nt*x+tv`bqNCV}!gVeKJ6VPN=z5wQ=4A`j z07$jkgM!seG%Id2b6yEmfIM1` zM;xA^xqKX#KXHHGaurya{vcVEAWs7tlm{`7Fab@0C0sR6|WdF`q6(UiDu$kzV8C{NIPitBV+WlqBM z__g|brKs5Q#ze$-iLvO;1Yt0`GAUn$QX|Ma$KXTHGmb`qA!CF&cFhUp%mlcA`p{%z zg#Ty`*haB3(Wgg3g-@SltVDS>N8fX0kN+c#ukZ3;S@g5R`wMC6h@>@g!x_Elmb}hg zs(Ac=0m(e&ij|lQ>SlQ20|J>KsBKP-i{4Nr9Iv;;O6&@~(&8p!IW2A+ZdX*+_Y4`Dq%s{4k~n!>!4*)sM)Txx~lv^nvF5HNT4JpWzK~jGaFgk=+B

z)-mgKk3qNfJ+Twa;wcn=86Wm4lCDPz;$UAZ6Y4!g-*oEhwDezEdMPFzvf z*bfiT5^xkq(kWMfY~=icvKWd!)h>Pim9|m9Q&+C5k_hmK!ZodaczYCeNCufvsla=4m8m-p`9+G; zpM}TvIqZu+M#d`h16JHJ=-9J;)N#mMb3Ue4d`2E zTJ0X!V3@LuhJa!EePKmUgruFWnz*RV6==b-_hU`cK23j_+r&ry+I^w8!r`E7}X@XGY0vLS*v$H;w8Mj3oO%Anl#|0EF}%If`>GGJyd%e?i% z)n|%!%)=?FjDYLf&b_RUGO?%73V^+Y`%kw6{{VWC1O;VJwBwah9nUpPa+se$Rxg)^ zsd6d}iMuQfokn+(WXR2n6F-oXG@ZR*3D@FDu1;wMl9b{Xy?=dR>SC7148Afeqdm$s zB$SGyUtted2%HUk{C%I;iZx=*VmpY(SR;N=43&J8#?ojWm*AZ-6oOmK{gjW(`Q%D` zGsFl>-B-j|qzEx*@&NT2$DqUB#J``bIpX1f`FK^GksoTW{c~7X8T}@suSHrM`rh$p z5OZ@Y-k}1!%{(oaQhLGv2Xr2poKOX&e>m2lE&Q(fd^wDdwR7oML;p}PiJSB#p`9MptpK8sWoqTJ87?4him5d z+U+|Pv_eFc&T|kxL0GUz;16JM@i3w_awh=9A@sy|DQ5fR<|OP zUx@&s=sqP6Rd!?DH2-r3wEddt{WE#5F&o{k%&tb+FAUdTc7XQ5bHv+@;}lpvjf_D= zOxL0EKU9YkjEI`D??@FTU^kUdJ{ zoaX)?P_k(aY}lDu2Hw?H#kFiusl1}BN-Qf%%KJDF6BjijI`26v7yl(R;B^W5ap4}P zeR7Ln*&YJ4X7C-rJ{t2~PdQiKRfscgtNYj00p4y)2Nm9!YK@t2ttsj<7xL@LGNTfL zN8@!%ipiN?*kBLBFL=s>I0gqpsNR2RET_c{#qZseA_S&gT%K=H62%+m!TsHb4;M#CkYYS2dlr2ft@1L5E_9r zxk9aZDIJnit<#0*yRlBW5F~<8A?b;PEtm^f1zxC=%jQR(k=S0lLF`60;49(X;I8X1 z^y^sft-%Z3la-WJEG?H1Hcn_xnEh zrK5ox9J>CF^WGjM+3$~h@&9Bl3KUG zyB3+%@N!gf_u>~yH@+iF^eQWF1bXakaOA_@5@KazjN*Z(Uou_bC%Nj#$=%pyyP>Ax zl3)gmEem&2i4J*HmL1g|lIq=~8BrTt&)6ygBsi?Y8VPveIvVY6ME3*-+9gVj+^=dh z6OlLe@4y+h-WCG3U2n#pM#{SaY_DCBB(Zu@D8#5dol_cJohXY%;v5uerk8Yg)obwOcn+HBSP^+kx z@eby(OT>wG_KWvNr5`fCIoH7A8IXp)81SiyhLe@g%US}_Kvry?EsQcpf1#AhR5?b# zd)=&90PaY>S6cQfG0Q%+#$ZT1;Uw zh{?9t_f>LP{|0=!Qf;Zc(~-~pYg&T~rh7-~a!t(p`@aLtqmgw%nYq4%GxXUO_JkJ= zeL-LuT36=VOmzKH^ZFDy*UZHT_r}sVN3xb|#BF({ityHq4`xt9X%5VoM+h4P^0FcDOVffj4ZMu;Pv_9ObA@_JN>$Bp04K$(g87ebOandk3CZSE(~%$ebcpXr zceYO(sRXo-Mw^WmfFN+y*65u#=7+oC17k6`4E11)7~<6=;)2Ovd^Yh zEX!9+9;HC+Tc9P2BBG@a8mp9juI~`sHoDgv;qUY~2lT?y^?bWfc^};2J_*QcHWXza zT`}U4gP@1KfOeoxT=nI%?BYd@sq>z*pG5(C@bqFAH3#RO(fO>{!(Yu>WbE=A7LqEeI6ulymxh6` zfT)MYxTTJyko|LPS_F+M!rn*K1j7LcnX6Zf`wgV0p{Z^VBMc#MEPz!$}3^KZiUga(;ig}V- zVz)QFWQZ}LLn`}>uDCl+N93lc)k|r^W1*&VjnuxdFW~7>A~-Bo1SZW9pk6$-2rmi* 
zw5Ic%x_PX^C(OnHxg^@2XB@X8bhxdvTwj-=rVk-78fibln~GqVU}vwM2zdrUKfp-U zL1mYb&)_zCQ=E+**Ic34HN_5#%IG~Gq~g0Cm2#4VjWl~vhleSy6)$XziL2cjsc>7) zT*I4Qi$WsU$Ajt)EhIURU=fwXJ4Ln2Dy+*6DE;R^Xt)~h7Zwb2Kd1Pt3#-nJGA`?#gZoB6gRD)(H zA_x=x?SDuV!`ayYZ0*lG4c0#{4_1-uXTOJVEVGz_TG@?3hYlgRcj zATi3+dfoL-NB@kin3!#%cwb;}F7~s%TfaW#@#Lg{OW(BD414#l)2SY8PmExoTq(+cKY~N@WGU*WERrsSb-=r%}%I;d&dd zQv|7d2a8r+V;F|G8Qk|Jb>|*V~|D&+1cHCn_ui}PP~^(0D!i5nKa4D*Aeq6 z>CsS}udLW4aV~QLzt(CMjZN?|Rzh11&lbxMIIrSBU&u9|Y77fW2p1S1h?e)`5M$=D zPUehOHR363xjzP>B5g3g;n`1LIZYfL7kH81VJe}SDF1~VGY^EDkN1yUULa6cgO zM0q0*Kx&R*{odoQ@%Rd$F(D56@D{945sT;qbaE-%Cg==sUJ|(@Khv_uakCXge7csW zE)NZS&^2EF^q)D;bL04`9ORIL8lu>=Ci4Swh7_lO*-PiQ&l8?kqKLzgP2Sd~90-DXPX>u>;%4*=!MF+=VZ9Yc>KC17!N>*dABD0HNM!`UWxBh_W};C z@E1EqKPe7O5k!&%hC2N}VK}({$;AkvrZ30ILMs{#7;MoweyRcVr-j#vG1<_-wH5Oi zL%(PC8>k8!pBA*n)_8q!rlR3e$5GryF4ZXiAgi}FbZAZ4r2_XMVMH8}f0vbZw`Pgwb%ojk0en+6pk|({tSWP1)dOXNUj56^9zT9z zBgVG6S5A$Nfi*q1M~42+iURBz>XE01)n^h_XFLPHeSXeL1dcsF`42MFfE(9XM_iXx z{QfOpBqZQ#O_KFR+qPmD_x@mGN84Wtv)RTbuB_Oc*qGlYWR4&0}<)BuE(qiF$h z_hfrk_bE~B@DCAda-IwhZ7v)nDMn=k;)5Y0+M!lC`l9&*$QMvlbGU6X}>e3PMajL za(#!arVpu1T>p*eY=eD1F)5YP1(E|M;(*36Uot(>S|9DNv1Tt*VOvKEc(lBvO;=*a zigzw=iTn%RwV$cZY~=xhV! 
znhs;nrsQY`6`GEDNsP5j+)uNzLWgbYeB{}w1M8!Kzcfup%PFCVPK_cdS z%_4eD7m^DQ$HCpZQpA(bB&zYFQ=YA69+njkBcDd@5YYUbafAHYkV({v);L2E!_Bw1 zUj}u4s#8$u;Pz=2J)_3Ulx!K;n|cQ6ajnsPzJb-1sw_l(I_2vVirk`*k6dDCN#|(C z(vBzeCXlm+u<`b>$n}iuXi}ovH^$)N=)rF?hP;6{gPea1=8>EhQA`ZKRDIQxmVpRF zHL)AecP$@Y96Mr*z(Xd+;a6aU08@O4&~XvGExy%Z*<6w9oL5i*Z!F$5I?aUK-=!&E zK0tLwal1S-m~$(U@hcNsr>UaQ2lE?ZVdcK_d-ZhBZliR+9ZsoYT=XwQa*$-D`pSyt znepBXB&=73M!YCPrUEJqM=jh-c~jwW#gkRYi~t?mlZ3tsQ={~PIz6dAfei-rt3URjdY!)I&m-1HhQY zb1nuUp;53!&5k0tjXvnv!T)$G*GkyO4;@=@_dErSnJNw7#rO7&aD;DrOEM3Zd2bP> zly!pMPon$)K(*@YM2UF$PSdR#MJSniWV{bA9Ol1(j9Gn$z`%1%v9ONrXv*9d;X+&M zQ>$XE+~t!DZ#+pAwn0J`wSG%YE=LBa@#7BLnO~rr-$KxNF)4A!GTcgnh{mq{=N$)1 zKRlpdNc8phQPMuvI-g3agG+Qr_%Si^TiX`OS4_ly{)6Ry0U)nR52A}AK`H7AVKF-n zGeyXXJM^cp`vPI!RltI(v1|ws12%rNIwTmhOKgvOncevj-wvshB_+gKIAL*Yvq4te z^(9S@Zh$(HMwCAf>1QwChTiZbusPe(ZnODZSXVaM94-Pig}kSike8*tPX-wCg|2md z+q7@&0hP5i245y$jnjpJ6O{Nt%$6yTc%*_Eg^5@v{U(O-7pt^(+mUdlz-B84MGcb* zfFGE2{({O>=p=Qw=8%0L)6K|HroBkDm2u@%4r9y9Z*<`He*4^y{_3w7cV(a`!{X0* zzWAK3IOO&WxXTVxhklu;F;MC-S{zBNqAv=DPKu&a;79j!7~z&_XKw_M2gS@{r#`iJ;HDOEr0l#! 
zvI;?B9e{|-P;1Eev=RsEvoYgE9r6Sp=Rd8>BUfzi)z>dMkM7Z7ALPm_p2BBL3EIg3 zo5p}0i1kS;Z9fZoY&J#x5+S08yYoOQp{xg2u+xqbaoi=pXs~yR8qSLAXxIisKJIn{ zQ-f6xqp?9aVZBgDNl-)p=Ba;opEYkd0Gr-l|70k4Qgr_sfiSO`)9VcC-LysYX(R}J zQ_Jttsu&5m>s>j%XOK%4&7fzniJTSL>htffrc2ji=$u|7>q~F8d7+M7JiqU_Y=cWX ztL0eFJ&yPuvkg~tti1?;i$Z6{<%+<2v0Ovk_R;ku}SIQ$jA~IXjI}I@;b<`WUf@6Xe%ZcDNoZq)f$OzL z<|$zVR@qE!X6)MHL1rM!3HfJku%v6VD}Jv~`ioVQ^y)mb*XFDWFxJY;!b0JQt%p+b;H(ru`KufdvLahZDk11g-p7zHC_ zq{_w|lsGCNw*a#v44%0#R>8$2r4HGCd?cGzhsa!+0j)B3qtF*-FQ0!L;o9yPAo$BF zExTOQyyc=*knX4AV(d5HICn=CS8)Iw?#_rz-A&`0*)n^#JGeT+1vSFZohom-#Q`LQ783ofdGyS6N;eO2}TE;{YQsFHtQQ%EJ# zWP-8Afz)s=6b~9!*_sf5o~pR_5KDmU7zAynhd3gTY^`t#JPcGVI;ASIdYF<>1(RS0 z7M|p^vM!>l*jBm>bZ^1NAu+K6_a0_4A%R6&+b!p-k>V_{fNu)wu6DhSd?EhJL^Bxm zPDa?vomfLv-ScYF<~dH8eh-=txUNN)?B%zaJIVJBh%Dubo==>n9{g2W!?b1WfieN|H;!j z^poIJW17fGFb4EBL;=#qQtK4RVTe6C5;>rl><8MOwer(J_buC*4n3`h$judM-cdj zQFzR*`M2zv2G{MY(zB&end3$)k(rluOF4+P;abLqCQQ*{ap==$FR*8fjs@W z_}R&Ur14-wVRAa1Cg48?WA@vz&A#SCz61~9EY0A8( z@`b}#B(RfQCfCf5GkASADPtj2eS{GN_m7s0^9T513i3B^?fe?sYIE)KP71KjBWu@( zk1c!cHGT)^rBWKMYjU^~)dKJdSP>8-nHmL$IfRkWiV!2@LQjm=f&ZpzYp~CHj9qt6 zKU^*6ZKH=?0T5+Bj+`{@0Z>T2q7~fU?wV#X$}r8H84(~)lq2NKVEj>Q-X&+C3}evn-|VDJvC z=jCtXHvF&hG@#u{j0ZiPFR4-c2fPSNsS&?gL6q`=WP!k_vmlsH?CPaQ6lr^nEC4R> zKRm$OIy*CpwCx6(ecn4cMq-%+t9$93$2kS7Zm!W!~o6Z2COzSApL?NLc3os&$2wjixKO3Hq z_rNz55hzD$tBy^ct};5kMfzU=KtR90KmsHJr9tDKQev(USD?IO6D$>&(W|sapoAH> zI5D?o{oZm7gujS@+0sBa#n@Yr=@D0yOrDWA(wg@T;-#DCS~G}b%+<{ai;HqhEkN+; z!vRn3{59&(?IwRcZG7_!;O)K@>&qaNGpBm-pYLcN2G z!9(fyEqVv|#K<;pdNu6RYE50t+g=Z+d}uFLb}A=MvWiG@_%!4G4v-ebA_5n~-<%tb zk-6;^SUz&c@PtQ?zK$a*LeYfWes34>m5!Ms$f!pJP- z$<~AXmNvp|R&2|P`__NLTnskoI95PMdqH8}5~COg#CvoIG{K_J!kVowP+|`m5V7fH zdQ}Ep@CIAQ?LEDwA*@D&pkr-_`X+?suhof&w?6=B^t@oBCB=`WS(_+J4^J|(a!t;6 z6xk<=VU1pG#Xs~Qk&AH`CWpVG@7>m(ZnuWY?a@FP@Anbj&~4^DIK_Oosqv<3-;(G} z9sFb_5)WRHyaQBO+fg~saAJ|N(CXQ<`rE&i9?Ah%xf?``iMtUBBs+$B<{clF(Z(q} zFL=iBdzlU0-yxMhK{}AggX(2?O$mVbn!Ln5Wa;cU14$6Lo9FAdaO%KIP4%oSJv=Ef 
zG}9GRcR=wVujR`#$+iFdGyahy@f9CH;#(rfId>&n*G? z-uWLtwVcR0gxN*T{n&9^N_jI1 zu@?D!_NsU%i!gPCBda@4f?_t44n{}p?1cnbuZ5x!La8smPyDAMD`4dHNtj_qou-mo zb8~k`e`FfO4WVcZJhq%owdGV9ZO{KVmhzE=+7KS# z%?RLqw~5g6%0i|_9w6$mPqf8GZ$z+MyA7nF2^J{Ds(T$`Z$st|DE;i;Fe^$(T^P7) zgaQQRXDNfI;$$JRB`g2o#GbcnuH?1Z<|&A_b5lSU(ZRDz4%?*{z(HV{38jSB;YN{( z>Zn^1*85&cyrEJLZ+QLtiOhdM)k>zzX)O;M&r0D*I1;sVXc`{n?+b7w=xI7%&r=Q1 z_ag=nhLBxM878+g3Yted0*N&2SBx2mzViUlL35f*8k$79Y9lFvg(k@ANCWPoh(7XG z&Yl@|6xUBmsDQ_9KO*m%aA7b#W|^$3GQ1j8!cKmCvSaT*1=}c0sDAuBSIRU&b22uR5gjYz?T=y-$-e~wRSVNl zC1(+=4L7v}a<`o6%^26pKV$G9`O(`xisD-SI1wM_1weywHj95B{_JN+1d%+I+vEM_ z9(@NnK9sQ}YXwk}CA6%UV~?kg>#NfbdFbHF$0BSSvKGkqY5{-Ude*f*=dzvtd*6PR zF(;{oivNl*f5zVAdIl|j9l5@UWCh?AH>#4`DEwGKIwUZR`lX4fdnh^^g)@rmI7jr3 zP^FIPn-e|CBv66{BiU^lf7S58hqR%Ln=H-FR3y*xXe~m<&&3X(_Ghw zdEyeCZfl?hhbll7X=gCHj;@Gm6Mz7kK4g^|5M;+TG!Bb_C2A-9gaYGQ;`^Np*1=+) z#BbU>AUkQSezo?{*FFn3t{Kd^)-myR2@4oz*Ut0adR3FO;8ha$v?REgp^p@*`P=v} znJbI&b+fXcY7cdqE~bAHoedFGyz22|kz;x`+rRv&M$ggM6{3SmtW5{RHe!&)=4u(g3da(cCyaM2VJ>#{609+oMbi z$9R#K6qnU$j}>r|i)nJ#gjgg^1~mzrDtKAHWCJ1oIncF3g%DJ=#-Jf?78vrG&9?W* zUJ)cRBtB}CmT{~O9=lKh_pevf`{o#n-i*Cn0BsePDP5B<+1Eg3g=rqzTai&OBYryE z0T3WTZ-d~0K~u|S`wlN6dXNbjRWbP>F9iL-Sz`{PU@elV^dzJ`5^!FADi!H zjKEGsHLf$oH*_QQsmB|JqtSEHy(rXd7$g=iy4iyi&{@pv*EzniObVDn%D&dc_sDMM z^`P{^IMu{fI>a7SW-civM!o9MGBT-|@_jkIzO$3m1di}CKyTHpa$p>ndRVQtmMJHp zc+*@M|2Hf<drhshOn%8LZto`}b z!0do5GBctMwr$AsoC=FIECwEgxOB8eR~(Hxr;-n0{>cd`4YU@3Ar^mLlC|T&OT(`v z0LNe7rT+34K#z-AiDFgm^yp9^63Cbk4leG1m;8LTj%cq^8{}StN^ztE-??ga!W+Wf zkqFI2nJ$yg*mx^`gs+^S4Br-665^wAD%I|ZJ5D){a}talEfJ9V=QEO6OYPc&CR?xU&gVd#B`VenRIF!?+XG%TY*vV zJYL?(_Q4O>+E(sy?&2MV{U5_4(nMR&hY4UGc-kP$Lh8Ee~0Juzn)-& z-oJzm@LJL87n-*(r{h40L}q#m)dF?#&Ms->qfxw4k+6g@=uEr|pRIE7klTRCk|8LC zCStW)SVOE{q z*b5D=;SVi_pvwv*)1;FB)|X+Rf0>DJ$Pdku59uhL>?bicq7mt0+0*16mxCT2s}1AK zm9(!B!4FYb-k9%RDmkT4s8$cXmJun?2~L3XT*>WgANZkcW8;|n2+a>7 z;(^(kEJ1?`PYp4>?Ga(NLDSPgiyX3trfz=*NukJPR`J~78M<}93=z!KMi~%X;_tl~ ze(Hc)X#Y8CI&@NV%}<`CJVQTFXTeWj-S*C 
zbLrxUBBujTi}G}(l-TRNuoKExWTgBF{KVVGp}%8KDAJax`2e%hfw9WaJ{f) zVp(eyO}yL6i$BM8@y_FNWt6t>;1SBucyF~xS}eTSLUgy*)CroU1~M|;len<^2;V0x zJK7jMPE7%c{7dZLjgiy>i>>z#t6LB<`fbi@CRNePpoZ$Bux$nv*ll*@!x@ za>Q|HCFmumcgk$BVm5(pMcG?OIEbWkvdW^g)5SFcWDNnN1{ZRfAAG2p`B(LV&czsh z!11D0l**m1Kq#tiU8_A+A%i1}?04N9nuM7sO~H_mRHnkh+e-QLwNAtos~kBtPp1B( z^vpsii4Dq;>HYPT=^73$2eKgBVNl<^+$sDxz0`oL%G+ckco6#FJGC+F_mtkUt1A!QTh6m)L?6VB~CXAJ)MfxKcwp()2l=}<&jEdNuXEg;>?>3_Vh z+hFZ+MQ0JPqQBI}D=Y`-m=4u#O(5O@EHs{u&8HGjw4l>6~ z;;s0kSe?o7ZEUN})R&ZkNR5iCRw~eI4G?C}#FOElr$u<+?2Is34ey6sCjFk8<$Fk2 z11pIM_maVaZiK9Ova@>o2L@i$RoEww$-ad7Nh!R(uR98B6w++*OtK(n0woLeI(wF2 z@?RT`VcOlnSE&y4D`x4z;N?H(2G~nF*f{CmtwybiMpJa!e@->h&Wl&7?~c7(1rbK9 zKN?rIC;b6>QB$qWnb20Tyv!B%)=Kgk=h~VdQLJDEAl9v5s^pf^@_?{Y)vbmN@(FL* zBatTj{y6$C69FG{t?QJ#+28f~_^oR(;@l!+MIjKsFPT}*3P1-$hGv@E5xVlLzO_$( zWTTuci1Z)NS&A{12F`BSymJ#)+zk}k1hvR08N-IH)FmZR$u&M!mDCKmn=?7oVI1MB z;nOZOb$7J48)Dw;P7o;GGo6|2-o(=iyOQvjY1E06YTt~iN$A>jghYP*ddQub65mma zbxQxnzYWuKSzRcPWP~R^Xq&ub7jZS8J+~N%@e^7fXQ_g;s@O*OBQpXX)CUsGf-G~E zxsF=G2ZN&x?1e~y{{uC~))Di( z{&O7Y9QsU7TIcQxzMyc>Kmm!5-#YE+KgU!Hj0TPucX7aM8+}^yr%`<kYx@fj-?xOrm4qpB??oY8eh~QR4fz4Ans#5ABeNZ*CP67EsdOUtefln71F9OFvXnPU@mrU6TlmHLInEmAbK=45%J;f(*l09cM z>=aP+SmZgaNo(Jf{ZQ8+?+0z>3KtOw8Eaajl{kp7Zls#00fvyjhLgv67zgV_+PaC% z6Z_w%qw&eNMv|~$;FMPMiRFy-aFBg&TqZxy9{1T=3;LM4+QcC-)V3JP?tTuvt$|viSm~a?1r?khQoV)mfZ(Ntl&!UJ zP8Iy?(JS|YmbKck!}rZJxdb~r6XZ=7*=8qgcc|ob6;84ey2dioMsxs%3)n0kMgGID z#zS2#k;v{#X1$hXw0%?Rvf&~I8L+yM&D@R~_hOo* zgnCsrs-9i(NnEFHHb6;DDwgg2^tMpDQHlaRty**QSuT~K`t10i_Q>vYRAxb8-@b5N zF!_IRFJ5h0NZzm0S@AZ;N;Mz&=<1DN!-^?0*Z-T= zJIbir&wXB_bD6y?SoBBaXZ$ zsxa-rC=oZ*|GutXv~l)^*>r%?^_H+k8{{9z>gD=jNunu zZ`Y^?$`z&MoIk(`?QZHo@%zWIq;t#{_Hu|)L+zt zH~whqxIunw=W;j*-S%9hZvEhagCP%KrqSV+ElGTo3}z`duIjSfW7sDmU0npcfE5H*K+&gM~LALg&?XJ zYZgah77+}E#o8AR$S9z`(wqZ4cu(XMgM(lr9f^)ETZ#`@2N9X?@H0=7$vqJ342ylA z)QA&VGNCe$=&@AClzT_VC%mYNX%!_ouc$*8TNE1NlM+Afgnq9A{mh1>?)PT2Wf_8p z*<1P(#OgJN?@Y|})McSTvPMfG3Xr+RxqX%q)=uqm_Dsyq*aYgOR*c)#vD2p{8?(@io@9Ld 
zRvO93Jk~l;;tk#{TT<}s`&Cl|(LLrMEyDUU%j0)x>dCIV^>j{F62$hu??kT`&pC}? zNTpxhp)kP5%`)pEX2Pzxv&~p{wW2d|Qu4&dd zdgZSFN{pNbVL+Rq3;dY2H5UotzF=}p`4NQC_pvfNBZATL%aASY7p1qauqhsz1jN0%VUPs}cQ4$e0 zWL%F{K_W%Yu@uvg-punX$P?Vc7a`Rhmfg?|dFh5W&-BX~dy*HkqV*DPL3OpXo3azK z&QbU-`DiN3T>l^|D05f7hKKz?43@6(u!`^Yi>;(H$TeWs#$20WbD z8&C=8@jse>4rjGsm*xMJm*gm~gFK~f)M9H8&`>%zs1#)Kmj(pV=!!=tYq9Ch77*&n z)ksV}$BP2n>AsFEYXB%;AuhOjOo?~ofcRlp3sS#>PjGk;^H0|tP~olK{p)C$sA7bF zU(d!js&BKcC@gJ?m5#Y}MEbhYj)dFkOW2ent)wvIg-b^z;Ciz0Pqv7&1$CAmfHZ5W zKaDH3Fy$q9r!So{=?^%DVyP{y0RA2w`T>p z93TnReoI>xkd78CCt*=eM3kMPM-qB1|Kr|w z_Bo=C;zJh=zZ>;S?@;%T*1%@%5fWUKd{ zhGx*E4v;(eGbO(`LrM)(8(i%TQ8UaH&gD??~AZX43bNChMU?z zSbLXE-Lh)~HcmQS+7UEwW$T|nX|5FqzzD*EP{$21qm`=aXU*zupE^)0$*Qu@z+vCv zoZ{?^*2xiMERLsZ1hgSfYc3Q5M&LWcP1^CaIMT$D%FmD*l9J2>OR&h8PhgV?+P{n`O74@D*@WvD@XudXdh0FBmo3 z2B&bzFEQHwbIroo)kkX*Sk__u!oio~HFZ4()%aU#%dom`=Z1Q*nM)E&_=xyvTxFK; z*oARXoQscl#yVCElL4B|1)oS}(cC(!`zsdQRTQ@Vy%!$(qy7DZX1S<=OuOM!IQ7<| z-(jR1W%Vm%nK--GF}rnQijpa3X?GI^HFLjfLrW44UyuI9#{RQ52z06`^c)>1NWw3u zn4Ncg;=@V`2d1?((FI>;Tt1xLIw9>h2$0=YtKfH|=c4a&U{IkU9anPdoFml#be!_2 z7$n_k@j!@ES46QUGM>*IGra zQlewohB<%U%E*DVpAzKu)j|Usa8m6=|XRl1vKl$TNG>f0Hp4}Amv-^)axcA~Yj z`)(1dnTs3rGyEgJMomyaJ!o!$v5s6OvRh%fw-i8s!TAtUiT&Zk_aG9xekPZ3Z7QWO z6Zw%|gR)E3W1;!uoYO|-Xf8xjAqcMkiCXlZ<^0bc=$#=g;{mo>}#aa z0iF)Tw&q#XlZi&;P^F8-8h|O5wRHTOnbE%C+I@oq*k6ynEyL|+odcsol`MtSmHRd0 zA-ETY{5_|}jM0s-{#<2=9>U0da*-q0OxRs{n!*%f2phF(G@}<5-Hd?etly{}p78z#rYE*rvn%Yv`k{XSsgUQIKCTg=QOxHkyFEUCe5`#FR_dwO^^FNTIGz>Hrg== z0w;L;af>*$Xf@s((#lLG0}H-EMKN`60?M`H1B$=NEt`Od{+=m8CXLCo`_y#lWBB<4 zJbTjw0h^^n{R?)-w@LZdLqvg6H)COTbdxh~cbmoC{SYl_6*)9!q)4>&Tc7a!W>#SW zk_v|%Z~yVV;OwV~4aLQCG`vHOn&%29q05_d*Q$FHR1XIS@e~gMr=bHi7brhjuLY_# zhHh`b-R72aRuw4Wu?4k?4gXEaES8!SkgI7#7HC#W-}hhHa$*d;NP!7J!!y<7`q9M3 zAtFv;;elWu^qqc*!=pYcD7TRxUkS1M#c8oy?i~~)lh9;TBdA*A%6XQ59i~Z{Nf|64 zeW>@NtJ}Mr$%u(_rt+=r~i$q_?@001^kj^-MzYa0z?7^GTT}o2WR&4hhdEG0> z;qs=mNtIeCWm+`b_9Xxq;OKR4Ni zeSv^vUIt__GkM@89qh&l>R}L@pPWTm)?QDuy>USgmj4_?x9dvA%pmV7u)&AOx?&$&x 
z@^suerBR3*wcL{q_TxH?4CNF}8%qsY&89)m%;lj?3{Zt965f}i^fs|_U@?%vZG!9C z{YzTi2^h{CM*)#5Zbb|;qkZ=Xq1O&MT99DDUS`LWgHYv5AQ)veX|Mm-&>IyowIdxO zp_FB$4wjyAo=iT3zP)?(vW<_g`v{=eKqTd4F8o~C!`dn|7#+cZz0_HPzPH7#6Tkjz zbL?pPyoQo=(cF6GUYmAl~1%UN8Sf&;C`lCl5<#F0LL(o zI4t1Oy?*lX=n|o4ZhKQvK=Pn(WJd|zm544~<&BQsEX57gPSq!7t{&%I90+Ix0ypKBdFfxq`w5yL0HJk1j7aD_i3l$Le=VdOyOc_RU~R~eBi9rT4(`b z8u!wvvtE&RGIOu@r~=;wGej+ws5z1bc%vqYQII6`Kk{eV9m`UZZqB3BZNrOgNe5DL76{E|cnk8moKFiG!` z_~Rg+P}|iqXAeb1#6HR3k3s8`xKzS8PQ6v4RWMCBw+y(@JoE z!j}Fk&0Tui@R zZ&=4`v`mjtC+=kG9%JKi;u20V^fjn|4))$hl(}{H=+47l3?A+){`^FtG8sHM$1h(L z1l)8P5J#2h2LmGQgo}0CrzzTS(<?0 ztq%t2tc`{pnmTGRI+52Dr@ajx<{gBpjLY!m1Wc~3GmU#cDaxhI3wE;f$sNZ$Vzxt6 zF;yzWI5oli0v=U9JoDug*OTrX@lbXscR!s}S4aQW zneDB;yq-lb%B!7=5kb-Qumv}&N0KIWy9TvyrZw!$UES+!Q?pzoTTUzSRFKqoNR$L= zsAWpY`dvB$u4R6`eRNQtF@f9&W~69+JYODonz0ao9XYhNNosAi8iqm}d$r9k_}1iW zyt6gu{vrJgo9k{*t;0RuokpkgAiHeR)I~Pd1C8uKB7ku$lyz;@>PF#X;%$_Wxe&fw zSt`Ju83#c#2%XJ0bS&}hSgDgh(U|Da6>DqgsKmY>GJKnSemcy6jvn>n9Tn8-70O(blsany&fL>3Ci`)}q}V1m+>iv077E z=1kNj=6dxqQ*tkP{t0q<;%Y%>FQ=k1=Y!FmIz~NnQ&H$OQXissH-gV&;zuXOftad! zI?(-5D0r^+7uW1MV|7fmL&{75`3)>_A4y+?9BX#)pc?K z%qa^rSgsvEj?#yHY;Cia=TuQF*UrtGB0O9I zX6^=Xt~|l<|C(QtzdH;Vum47h4uWIA(msk_3oln8DeZ!6O*QIluuw$7i32`$D*yk% z>qD6g61Bw=ssS2nDo|bJwQSM1ZiCvXbPegb+hcbnywqvN3_Q4`s8d zr%40hHfEn?w>yH5Ar#crC^`KIo32KrSG>5H1Su@kj=kxS;8=8!aov#JJTqEB-S=tF zBJDNcrR9vK|1-FhI!qw0BJVZ^J>tropM#Yh z*FAiI_GM%>(#)3(dMpr8Yk`eWkJb6eQu~vblYu+Q>wpGxqDx5w5;Kb*i_HW>Z#_oF z7L@MTCI&uC-zpjSKmK^?KJxe3*7>+#cjR8;kc-XX7E7~Hg=GG3S}yGL;mzg@ zxKN#jLSBxg41gZjhQc2*Cn>#Vg|8g@!4--XjzqN=*~rZd!p|jY?44%PM-FhG1Z13o z#=58M67K2+dQd~&4=?QJwDf!mX!GgEq3Z2qsnF>RZw6%3b8_zstbOdHGcIYZ3;yk9T)ng2c$ zX*3!d6A!3)pN6C9za=yf;A(N@dwT*0Vtum3A{daW!ZN&prbL=)~I984nmYnPkL z=Moto&p)(f=mfV7(_XhGcE`2U2@2f<{77{cYkX*4rqnxyx{q^W&J|e}C z%b&Zk1eS$+Z`O*WP5hWW{I?_AFLMosFeiLVne?H(B)hq4m2@5fb65P9Yb4F+E7OM! 
zY0||Zw8JxiSfJZ|bzhDO7UeIKgPVV~&G+gC$LBT7@7TN-Ppp20e#RBx`S@1I!$+Tm}NF z^|@86cPT|3E`nmRvdW-tnj-Xy08y08fKHGcm~URgQYm7ilTPWvQ=T|u_%2i2_a~(E z4c||X()GYW0pcl0_!86gvCyg|ixm%@O%+>ypY!s9kUH&lHAfhl0FI-?rl#RGwhWRm z^lw=cjVQ&YrXMK}O#8k7ZYcCI?6P!pl7jS#{>~y)hY2xt*>t+ocp)W;ReuomjxXta z8ybKQyra#)^B_1heNc0i%a#VVhOn3Jr7}KVTE=7aVLp(|)G$MD$!qS0HBGn1x4X_|C$a(M)_sBzV}FS2qwWa+A#G9O2M*>O5u ztJNWgkPj(c)>%NNA$nGTkK5aB&Q1hyy6GxfJj1$%q5)`Q)+4M%xqkxNP7$7yr3}iU z+ljiymjSF7v}T}y0#T%}{`LTQ%gikYTEDErhLM*dA1V+HkQ=ZNh158gVnSqo9{_Nl zDj>BW9P(*P3-0v?LpIurvSgeuU| z2O|~im^*__LpXgkgs;_LR`H4e<>Yi-AmL6K229RICH)ehO8E{qsOHcfG46UjZVs`) z3YMy**(g57aj|@sW%?)(iotpn!E4KZROiB4EL>#w5z*Zi1}P+8tQ!d(l59H2gVFaA zo;b1A(=Cp8E@Y+I=F6J|I?>q{gk3_hZgo-CMLB_L0X1h%Oz98? z4`g`|y>vLoU%iq%p%RCvMw65%WiTpNgN(t90QN*iqPYu0vG6yOHAxxi@RRE z2(Ow5cb}*g6W`|}86~=;{F%N$IaI*Qa@`_Wb5ii@G8$-D9BJaF2OPftB*~Zl_LuX! zipC5)xiB65(ra1w#4|*q-Bapa(){N3_U$!9dFj1*_Y~D3a0wK?0JuoOA`V#dDn!_( z9*=0_kWZmh)7IjZT-u4fXrr>kLi!ER7MJSPxU&{sP)1b+>p{_DcFGrR8!Yuue2yR^ zK7Bz9D502)j=<*|EhTGxFAPW(5-yl`z~cLWgw)`n8!Cvl8*WX@(P(nwcHt)bM8I=qPL>b!V6sCQyU_*#RqFqse1*YOMgtIu@ zbHS7q+Mcaze+FW|tbF%49*@(4H@GE zLkmR{EW2b3bjf5Mb(t0ppOwqxVj&puG&{vaFtL*Ee?UJpZg1F5)YEuZIzfo;)BsJU z-NIF>v+lZ#X(v6P9gqM9Q?2YYL=nF>nYAXl>#oYrU5*quKbKtaTFIjO3t&s0}-t>fkAz3wC*Gik|x|7e~DEPq8v7 z2VUz%kRkt!?$fw!XGKVdNZc^EAoTPUU5xVIJI@@b*4&1{y8?E;CVaX=c?=Ub)v4h2;SN`ix}PJ zlBHl~Cm|Gli-p4|%(Q zM?EG*OErc>jIWG`mzgju>T{Td{x2HZ7sk4jFCtEoK`F)gn>=&u zju`cpAkkCYCC-?Y>1Y&6`7%FHjW(kOpamDU24U;e32BEhD9qP;Qah15)Bb~6wrMqy zzGh^a;S>15q`a@;A}=W$tHMBfH}TKl{&1+rtS4qGQ6wZ}8C{cJS~iOp_De-qZetoQ zxcS;1WGW~2+#{ZJ2`KtVy5zj{bzCnYRgJ52)u4fGhJ$D?mJDzl<8 zj$iO%3_}nHhHdoaP+X|a|60#rHYKrRtPl18e%d~aUF+@;hU)0a!U&gv1Bm7N00o#6 zm>!hy&7u~-y?_WjyC_YcWY3+(3f`)WgjFbUHi{%BV^axC0>5Q(fpp62uE-!96Ii4x zyPoMlUMkIG^hZ+~i%|K;15bt07ez5i<1@F!hSV^`MBmq!w?aYZcq9YmQ{qCjObeeXnTE4 zFl{C7*oC2*Sj=GZ;Z^#)gW#G?En<#_OeF${tX|{Ybv*Wm4i{tnG@3h`Aj&h7ZdPp( z7U7&PKNQG{n8JYp&8Xc4=Oxb^=R9)VEb&4b;YMWFP5XoG2iaB=cO5Zrx6WsB0c7~QQ}Tr`QGGlbOphaSI*0J>v}QQqyU`XYSDp4#p$l~+w}!< 
z!Y|_6k1lDeNyo|3;y(%VMEifOie#`qBrRSr%-M@N|L_ye!d}Tzu-SF+0RuK#KVm;Nrbg{#^;iuYRgm2=z)}onpnS%_dG;0xQcLN113SU6E^=?K* z*DFleyeBo#Ygc4O32v^_FsBMl^$5F5ukzFDBL84~=ZpYiqCaH5JGO+to6Nl#QM=G1k0O{|fv}FP%2o6)N9d$WoO1)|`0HZBa870JfPrU{dzM6i zXz#~NxYOmX@!1(wHC&NW?+~;&Q6_6Zb2GlNqAA9#)r1Fx@wZXT{eruVua`CJ0*5{6 zkgPp>%9&6anx3?X5X?`WLWbU+4gG)~9|fX}dXzdVGn6iWE(O))jy4rSeN4`$nkDh5 ziN--GY6kjeEKXH1iu>6V_`)-S0UZFG$e9&VGrrh*+^V)J{lPskh6T8{_+;I53Z(EeF30y9o{du+X zQLgS>L`RSBmD?GnGfY@)NM;s*gYG3?xBcEiL|>g&Ic{j@$@r`F8YUH$N)Spqc^rVI zA^+^LKpkXq8GtX&I4^LeeG{0h|LR>Is^=OaCzQ>PZ~ho~fENFB>7`JcifPt*Agji0 zCu*>6Y|8R{jUXZGEMLjM5V*K|`&X@p&6}qCQWh)5&f3cqYZ=&XvRHjgGoN*mmN8jlayO~t3 zrX}V8S?a%IfEzUa{)Jnawo$ns2$;Eu1xopU4x!ibEw`wWQ270e~n?l@T{{V zv$i5gA@?rhC2)c|s+UF$re~P8=ME9-euOu$pOc8=mp)URT5*xcr1E*b!$vb6X~7#| zaiS@y*zNS|U=JaP;9EWA=yy(d7aCDo2A|H@weV_}!z=*g={n*qo*<(49m0vOb9;&% z@AE$^(#{-oGdazk_zm5yx#s=!i;aMTHFO4%cRgI650?pW$Hor4_0tUKj}h?lo~Qen zaz)jZ1Vo|Se3U0$py?DOUBsTz6+~%(jtJB&r!GGW8%W}~nZSxgjxlALyzB9%#THcj z-0cu9Rp&sKtU?-&MFaclf?9J69R(!{PM8Y@9cV{b9=t^6sq^z+{wb0BZ9moPB=o$T z)ST4^gm0J*;O}M=`AX5Ex|f|+BH7hjkUJmJdsnhqYj&<=a3DO*n zmvk2)|L^O~@}jcg_Khk;mY>1nd1x5Rw}?BxZ-U1z8XnQOp5_JpoPrI7l20`W$LD#` z0bGmAeSBFowsmHx&X=>J0BXYi0X4U(BBGvFo*iaB4B-i zCORw2mD4;xmWe>h{w}B|IvrIgAYtJ}8VYc+UN=+0(0T^B_=|Cdr?pzwlNe5ScY7H`>&m&MOU8F~$g{3WsdaW`t^>^5->J zDfLT_@*F7sxA!DDGB5F(TzLOX5qC=Zp#(T!>>jP6&qt>Vy5)J=h`UEd-$eeu;TP=P zkBoOFutb)Jg#MZtd!_Wtw_p zQ4CyAZB~MDou)51xVA^AJ3BL}``paJt`hJsx;UJH^fEY&&Tk3#Yg@{tkg9FjG+k74*BAAI8W!7h=Z zpq1SBe9)pAX7qC!#0b_uI?B}Dp^MuDjJpLjQIpJHd>{1hY1;`>AD?#RFV5cj3+ikm zKf1R}IFhna6k=}Ah;60wK|MMJYK6%K9a~U6M#r)yC1b(cJg%6FTxl%WbEzBN9%D15 zZL%NZ5D}X(i8rWLXlJ3qSqXb#Z~-JkJLgQ`4Ic9gYx-l42Vob@RZsjB73Ch$eM@F~ zL9F}c1eY;p4%BYiF)J~wwEWB+vNPEK(z1TMiuZ`x~xmW@qxs@ zt%|@8Pk><6&&j+Jm;z?4gvOkkHG(FR!?EKQTrP$pZgF-@T$8?h-2 zVgoQ+xiZ;Vs_%%{hi?|_<<_{ej2BfwCyhxoqC&qrnOyY9n34A2hmG73$6L4*8`xIVMxm_K z(92S0dKGO8v#i%tDd}w0&vUzVgGp<c=qdnUPFJ?>hvzCHW45>z<_eVjB$O@D@o# z7v_s6%TtZQX(wIHkbUVy_*sNM&QBbk)>_UPJ&-|;Jy6#Lx)$FQm<)dm2+6{bg+r@< 
z+0f_R4dE`KBPfF+7zf8kKGgqXxd9>RYjk+hzmtRX?^i4jH^EMxK9gk-Eja$uMsz&K zDIFmN;~eHiJ=1v01WmqUXj2|L;i)UTf;cL5p;CO{FoopM4q|W_iRw7=L3SmSLw!nn zVzc|<;TYbu^N|GmNd9m9_cTkxnXq5#aSEjxfhY_%g57wZ44{K&SY9BUH`cA^tM@7H z?$FGh*9J%tQqj!*P9A7BsT07}wFl0t2hr|O1abUdS+_WD5c`joU*AzxK^qtgY7+ADI zB+n>NY+q}0hOsQH9=fSH?dd1J1<@v6IZY$t6*aQgHZAM|FG+zJ3NEPOa&>T|iG*K) zSO~5Iue`tO#MV{Z5n~FLHn%<4;FBW=!n_0>xkZ7b$f5n2N!(;8a4&3CYEXEA6b9G! z7NpoXQVv-O3gpAim<{Pk(QBr`B8h+=^7vo&pg&13n5wbJ z7-WJtUlTvU5!@7)Ec3I_>R|?Z8O@wk3GBg=n|d*ihZnz%RKL z9m*9^mRO92>b?ub6Bg)`e#cmh3+j+%)=`y^lpO_0^Y|_wc%U6JDl|umMeAXxD3Uqh zpwUY-eQl^iZ$flrsRj+p$q=46AH0es{&G9C%8#pCPpy%HC#orEb+8D~j`RzNOJy7iR5?Ty zyb5Zp6r5{^74mcmUC_l~x{R6q?Pw(wp{FX|nkh=GY{EJBajMJK^fNM3Y{+)<2;=FS z=ZJ@eiB~k{(7{e1FEl^6J&O)Hmeov9xO&ECt!$OecjM)r94Q2x`*g>I_JhQQup^bg zF&+`?tl17z0%hGLSX0tT3J&ExDTux~Md>xM;Id_?gfegU3G9$E8&=2^jo*uah8rcU zuS_=wnI~AZh-v=E&>iDDsl?7`QnUT2x#BYg$#2?Bj<1 z%94kEod^C3!~j~;>sKygVqZ|ZK?yb=?TS8QvoVM>O=t|D4O<2RtXAMVVZjCzAj2`+Jd?%3rtD|)NagH} zHH+|$2)=UEoPg#J5TF-lCc7^ou+UHK;6LdLp_}ySUQ6-X87^*gw%L&m{4A$!8IH%B z&5u`<$))c}-=-(sy(QtQeHOy3eT{xtd!VG!Z(2(b{WM_HES4R}8*x#f_j2}=mUycA zmq#v>ow4ojI;;c>#2{wf8kZt|6Bn3^Sr*lJu+V{DEKY^$Kx!{)$ayF_;4#f0|0C+! 
zoq6FSXMe~k5odqZEIl|JEjp{c(uY*}nEYxduS({wB|g0k>KF$5z}i3tG4q}KLJuUr z_sOooMrST8h}Iea^@XS1u!=>6WGv3phqje$kH)>XaTgvb%><^U4u8|18NSm%z3m1>iJCi2~ z9qQAxFwGG-}FSU3-%B1CAr~-h9QFXtnSjPf8KysD5cY;PFA@#{>+2FHlz2yyew3 z2%i(cPvF4MiD0DO1pyp>pD5>-bUvPLuZJEjLoq$P8dCA&!*fhJNS|@+#)}E~r4Pdo z{qx#p2SPP|I$i5YIceeXz4XW)Uc9w-suO1M6=u9iQOpHOOuy6TOi)WS(u5PUvttf` z7!xuo-;vUZVnS-!w~xsHJ~RtT&%`d3b`+PB_5w+A z=1q0i5oGwk`sCOh%@HX*JWcZCZ54uBtRsdIUTwvtP|P7PeGV2!Oguq}<$#-pqFM~f zi_|W?0&HuSb)2YD#G;JMm!+2>e0M(+)%RuPuFtfgHkEiLX{O|o1xh0=ST{P?eU=x2 zs|d4o1#9vyPbYvPR$9v|IHyE>5~oJq%d_6;Tm|%U??h7jQf?T`ydsA ziC8_;MTy|>0BB-a9FksDf2Y zj*jDC(yj)U)5EJ>OhjQ^;D|cw{>>%(6B^MT+9A_~63ujkQvKX>+=v!-mQLqAK1hmhxcb~Ej=R?k~77nmc2OBGR$sZB+LOoMT@W?iFu zJqS60$T>ZbdQC5;%NWGxHw_W%VD_6iqYlf+8Muuw6BCE0o(CA7&U(`#Dd}=;ArAO~PlH=^-rW?t1P{cO2K&)usfXWf zZi0z_Jd+~jZ$OPx8htIq`{#ml5-Nn3R0r|1Y3_HIB;j_foYeln(HwI(#ICA4YMR+C zF24Jpr%iuUO2M?Para_vMs#&se(f(=oBF?hcPjbc{jJV(k%J}EL3{Qj>|P<{UUg+- zHlkm-o!s2Ax0{xAkgQA;ISk8^P+a5nQ_L^kKuk~1tn7Zp9o!c{GZbyd1!|@UvR5K9 z<{%tU4xBs`S&8|2#8q(wZRB66YDDyDwrcC2T8~top_;3pHAL4!PwSOD+pY78l}=WF zSP4Up!!NOD&^WJ?zUd&7%<_?CD@$G*vXGExWv$EEp8Z;0nOTY^1E$wb-GBQcgx6Zx zIrTH!SM0D&1Zuq82N5^W#8H#e6)KAk0z&^=D||Kb0eATv3iu4HAPmL9SZgJO78c<* z79kKsFe)(g;|%YX?4#?Lfs3z0+^vzT*mr74-l~(!(dj7OEkC~uX_7*t^(viUfgr3TL;8H@eocC+W$H)>sT3azFXhEy+Je+ zfL^0Ds|sgFi@_5zPMQHdV;I%G6Q0ZRDxOs}{MNf}mWQhS`1v0679Egf_!}~R=Zu2y z%;6NB2jY;`a@DHE&B;mW$8STW5Lr7(?8Y zqMmw7B7jF3H7e&&Sbw0FQOc0>fz2!-T!>-B`<~dYDgpz_naEg zmXjcRQU00D+Z$(hZfiWi%yMKGDDaML6;gmG^7R=J`aGz%{#N{tziDAWuQiVdabrtF zN4m^;YFI)`MDKXvo)m#L%f-Gsn)ao0gXX0LYBhBlH>;5Es{D+dK(hVY;ZGZmC0CN6 z%s9z_i%FQ+(*dAs1^R$mb%pqD{;|*;kb~xETdbFwLNe#^kS5VNLV!pd*n#fdT}?DX zuZ?glPqRrvGMF9proXj7Gm^vR56ld9vQ16?S~o-9lE9L01g=m&c6EP*?((kb7!fcG zy&?{t{4FEiSW)SDT>Ok1joBf$=uno0`#7lLESPF?O|D+I&uDM;>sG=hlM)s|ctHn| zLD3v{+POFUO;{jR>d z%$Bg;v73Sm@@-3Y)`}r9oy{@NqGHOrc3nX1!FVdX*jml0W0Q^o&#`G2jmKF+`(qr;KH!TT zP*BhgGjL{^ArBItn+iMpdYmlNj!Rfwvl@Uv^*yb)m%|nH#Q>I~a{eq+4IK3XV)h+W 
zlx^|&exYZ&k*+y28gE7WFSt!KplIo`8ErTx#L<1{Lp86Gdh_g|bw-#}T^|C20M6(d zn_%x2-f(VX4Z-Q2$nl=4h{3U93Gu30#M>HcdlOhfuCY1`G+gq-zJ%9Gme%OmqN;ux z1KIt`h|ZoxN^SSCEUrk8Zvjo>=!D#Xq9Z%br|AM#MN07AL*?);vG@zW7JxOe{Z1Md*>IRLa*_* ztjc!Ubv)JrjzNP$&JeF7VwD^5A-TIdP-_)pKAT0vb_ax?Wc7Lgd6GEDd$KYUeioJA zq;fRN?El@*QXNPEuN#p9V4^9}XZl%S4?o{UN}FZj6Hwa?W8_<>L$Lh9`Xcb5?YK59Nx9n?j^#0AYAmy zSKUx}dSYFpXAxnx7}YuSaKw@U*o~Tx(`O(#Qp6g@&EHxb@|FP}C-PHI@j8L5=+9g3 z15N#9tzNhQCTD;gF%_u3Zrz{CMPCBDOlZq3-YlJ8Btn#@=~hmf%Rx>eB=rg%`2I)@ z*9kNUBYjPCtjT_8IPDl>$GYK)b^OWx)no@uZhGzjqfc;4Z7HO6y*U-lt98sq3wD8% z*VE%{NrzQkCnquW-V#RSqyy73{d1ok_T)fdsa zdShZ|qL|bVzD9Ccu!zx$;i^o-RANKK(CqVeRtW~zJH_zCGt>zp;*>j%(1yK&-HQ2b ziRQXrq@W8B5DFkuvV`)A0Bni_pCG7$Y7N9Q%uu{V$!c(n5l?H(Vrf^(L~kI>%?%)j z2utWQCioBPTRna-9{IouB-65T+q$jQJ-+jwBj}iQ$)Zh#2)I=Cp|8rSA!r7nBI9nEsc0xFcL- zWWm4JrHz}qGLSVERq*-mdHP7BhhFhd-ZoSFz9YbLzU16 zNe+u_hMWwUK2kfr9)vAcwoI?!O;T6(pold~^<{T0-ebH`S}KXIFtD`TMp}F8eVrEJ zC{>cQ@?fW0jzPrb2y&cy#M_Fi!m$eB6QRwpNH{=ENBe|0a=m*{2EWi{YMh)c7C~Un zn+a{Xu3~k7aRN(w@hS|R7J#I}ctlN_kX?SRTpY}s)5mK4S+vvaMYq*wIPYy zQh|5_LOXVrc|xgDa9wEAayrrR$^`Ih2tM*u(tXL_5Y3i214##70ZxVeJ?A)s?&*o1 z(79>U;u2?z!!Nr2hRx^^47BN2!EG(}n#n())eqdDMTPU(8qsNc|k+|5rDT7=D{T!W}*4 z?KT>NS z+EB~A3Yn=3Gtc`{ejg!34dy0@clMx&tzjKC+GVeYTAM-yg2R%kEh%y}fC6#a1Aaq? 
zEu68k@0_CROiYUwEQK5#z4k(Aa`vdXpUhlV(YVJo@-(sLV~lN#Ot*(2zM5~Qf&|s^ zJL^MEh<^mk|2KnbVp__qgWun_iyC#9U-MP(eH96YuE& zM)O_v;Ze}72!xHW`cnh(VzF+X6GvOYlRRsrsz*=2f#QC}De^$;7M7b)^To*SP)b_e z!Dc_41SFJ{=B0d;Bjg5(pGLdNOL*71+~p%X>E^zQ$R`o4blQ+zidt$(X{MCdNm&MX zpU=j0869c2p`)c=-QRIRTK}%CAwP|bQST-2B{H72CZ*V`l zYPn3oI2IuiVux@U{?b0!W-sp40ZMsI6$4F1rajan+mnueKsh}6%AJJ6!ms?*2<5qpO}U<1{v zoh4VE0M8ZDnrKd0{iJdO7}C~lTwYRex?JD#t!BN=;J5XVVrTmf80Gc*wjAzn8ZL=N zcCGQESA(O&SaaB@_9edQxaL}$om~Zy+aPzZD>wq;g z6P<8874Hpaa;e7jyx#*RDcVhjFPR`|5$sOanKCGt6vA*4LGCmCS+Z*eIO>eSc6riw4dUVSgIc@cb*q%qaeG$6O1Bj3 zNR#!L{3GzcO9|3;W6ZMKx>r#2$hv6~`{t*nEAA-n72Rw&2zX*6h z;p9PW62fDdfi1-!mvN~*dD7kqK~T~6|6uT9-UJkiv@@*!kClY|z01Z`Cy>z*Hf?2C z)rs*31NVqYDP)m-#eo(ka!$sucRCD?Qi!rCaj#Q;d{NQ#I#^y>ZUPM-ael`k+`GMIG#v4fkFT^wzQ|sSpbz~j1u_fsi*llKa0Wh%N(t#^_&)Nd?~wD zxcMdC(Ih=S?^P4UOEUw@^;Sx{qC%S?^cYrvUY0vm8WCZMa4uW;b)2eEsPzupiE%b_ z9JlLZZT!BmcqyQ8I=*g4xpL#c%GRV4Q<-<(AMb5{5aWo@K~95M&G+p7M_}HP3<9Y` zrP7sI7Og9=qO-d9=Oe12Xh-1Wdl{QSRYQ<`=de-)1n`X{TfwP^{X(c?ME9B7)G9=_ z%eYh7>vy$#PlerJZC01gsk=E+gJ&)7cb|cT3F0O2?eM-GCp@L13o`Nc|9gii=YmP$ z>0)$O2Ev2mOCGpV_tulPrkLi8M&5S`mxz)*r^uEz3WvOQ6L~LCURC1QS333&1I^Sr z-NBxGnG(f9q1{|3T31XOnQL0+EH-Pa?40_gw=Q;wo$F%_f*-$GA^s9X&_^K$v-$>< z5C)h^D*;(5j7~@?<8hbK)vFT7uP=A#)ug>$xvb!)QeDuAVCzybZ*AaoW1QgfDUkqLQ&a_HhUv1Bmtji+`m)ti$dI zYc2ASPQ3L#YBt!Xwz2nG0=?aW)pTSz=^I3%(;`=u6-%?zYFrO(8ewC>p5Xo03CuH z`CoQZk>D-Lzm&(@-V=R?VpA`mgKxkI9esgplZ7lT5MbNy*MZvSQs7$_WoKuzdn-Tl zU!io?7^rhx6=XyFN7F71=gdR}FV~COZWWsy=usU_Bw(J#H$)_-r{)3-S`sdVWE7(j zoJHR?8z_|oH}e?4PWU^)6#P?m#NwM~pDWj`wl)QfzW^tdyg(39wTT{>aiQzVj~C!; zUqE2_U7-*$M1vX$(Emu<)ltEgV9b@zlI)hq9Z;*I(c7&VgQWu*qI6N}@90hwM8sd| zaz%@i+9n0L6`oy`qpLMi-5OGAN>#o#?=NQ`@Q$JC1dejIw7tga#*0x5^%;FYow;a> z6z6#X;PMf;++E5G^Guhvqf$kQ*)<_78J~C%`bw$${3o3borhxYxCI%X$zg8b?;#L$ zh)Uf3gr*E;tgenjpG!rECha2`tPD1KD~oBLCD|lz z?+?Vwb<4%>;wQimQ8*fwdAGMqHkyLy%lo7qayMLrOU@qs%jQyM1cVTv+GSXJcRzG9 z*HORUYH}JxorRBhx<85G^EgZSW>)7!r9ZR?`1HFTQS4blL-+Z^reI6d|BzJoF60DW 
zUCNVi9}ePli62DPwsq0&@1oVa;C=)3zE1F3`gs({&7C&8-Vfd4%z4HknT?00nB2Zp z?`>xl|1JRjQLX&%gt1*W#nuK_P1Z7A3V%ZE%}DEK)*-~M1y#4&#|0AQst3)ea*5rj z&FIu6PGvtTV*lpHy^(P<>BsIzR?2MlYtRDIk(mHectH%AIBKqlr9F)_jJ0c7e{<^4 zO&R2}Ux&zRkm?8c z34^OJ$QD_&`g)#@>F0s?d+)=}h3fR;eGUB>HRLLETC>i;A6D*0bVEMEx!1{ky!i&s zf&Z8m5^hJhkKf6i9pC!(_L&ATmwcfI78isWI3sea*2GfPwWwA-&Psv0za^!*Z?hd~7Mu>IY% z6^6$+-&p&E5(!@dLO7Wl0R|IS^Me(|oAjY@OwnzAi1FH^X(EfMmBzB8jWJ6`*R*Gw;Id z0X~A2ChgfyyB#F#spC4HiU_Tr>1(Lbdy>4&Wx;}Bk}cqsq44Qwa$?R--G)rBO`wY0 z(PdEbAMSt66X+`U=3$a8KJWo%&A-fSPS67XFfWKgCDVKuc4YRvNyy4>FgW+Bi{@y7 zqTgG2R+@O}lM}uyae^E4X;IJ0H8APc(<@OGm#6}Q=h-H!K6^UyH>$c|CkNR?0}`3r z7m)L0RSp$|?`cn3O?_B^9NKuZQXBjOp3lz3qR=XLY?Ct9V4`=6v`0XTU4JAMV`yLT z{*4nbN$}wgAD#evZ^rdq?@OajCv)&Jh`fkWgH3 zY+&&TOC0CO5k|`O|H-$qv!w#ZAzfu7t?}2@!sN_Wsb!TD5`VA%<}GM$4AaNBj^Yiq=fM#Ax3 z`op`~Up9+=s{zNDf0PQ$3iR1AXWOyk%t5{)#cQG04Ah_C)875Vftl9;L!E-hF}k1b@Sg7mm~G z-zw48`-~l<6_W@X@9{-Eo$H*+P;17`3tL$gM-i&$?+plaQ}c9PGW(%!xi8|=m^e$B zz;7Xt2+2l3l)H~0W*>O}@>b!h|1$<~W1Bw=zkvHKY&IJkTq-ja@IO-d){#PR+W`0RW0lG@ekF`oQ(QUm@)l-@gdJ-y3FWm zsI+HlPcxmP$4N9TqfrPTky^nOo}CSIo&Y)sTK}peP`Dh>aCch1{a=B>m7zNk!2X6g zx5^X@IrwvM7%`0P0S5>N-MGw%W$Ln3%#>gei5!{)i~5`>IeJ)X3v0j`;hvu+JBJi_ z@SO-@s0@0)+;$#nD{0zcqBS``ejwd1F+2deL;)J?$pqJ`>WG=>5GOo zcqQ<#{Q9l2s>1R1(&xWL)#RPvy(NoY>d~b6c#ZV0D?A(Shxdc+A+~J@U zWnu-Q>*V}k2|Hy_2k*N)&C#2|O8{r|*ENA8@pzKRr2F+S^)U93>_Ll;o}7YBV4B;! 
zHS-yD(J$pZS5Ha}NGpNAC$hJO5eussm;KaH3p(cX5HpI={``poD@)@@LmtJf>tw8) zk?hkuM`y8)v0-8&#AxDfpCS4G=OC}P>j=?tvHrr!lEs&2LD|i_j^LsL`Fl>FIgHh4 zH@2VR`lNJLv%Ce~xvcbhr>B}<_mdn2&mgsQe(h0pn{@<Dzx@*_dF_lJ?cZ9sFxV$D<0KPGbNDiTSdoUAg3w0S*N8`QC0;;Z=(i2zVwK4FX+{ zUQWmHk5)|>C)9?2AUsC)GIJu;xiHVQ;Q8=FgS1b5gLNSefOx3ui8Q`vu7@3Th!gD@ z;;0pdoNNgQiaoFQzOpAX3OjRr-<`e+G%b+VyJ@bL#a;zY+jg$~_|@<)Rr<3ck}^{GQxaGLWxEf+^pt_xvelW;rrGF* z#7i)q=FF$SxoaNQ@MamJ*O_dmn_K6OLz|tCZ%S1p0u*OsEN__T42O1o`eMs~t?31n z=wb4@$9(}FW2c^boGHT}$&t_sR6O4!ec?s0XX25V`I)Y>=4u@1J;=p#N7y(QBGmi} zce1t!==fpw7Q;+I5PwQtwQgQF&8}5YS86VOB#Ga(A9PR4L`7W$rb@2~y>XRcU$}h` zELFbCU04?9jY&V~21Q3dDdneUa0k5}AuvcTH|1z;;NMQ6Y<9@?T!@`Qg4f3JHs|OX zX-K!xTDc`Lz`SGz+8mN?DoQne=D*{EmWz5q8tC<`15U+k`uPQwWl*`9NQvfv4DuhQ zThYv0ThC4G+SG78x*$|c8y}MKNpC$yC%NU zjT5{8uS_udr5@rlxMax}m$}&c_JA&pkF)Zy@zltg=)Iu1=X2L`vC?*w!p!xyAf-ul zvpBnbf8`4J?u3OBl}Od!l?`b#=Q}XP*@XR1ZSmZt=k-WFB@U;oi(xrJPqp>mq6v*+ zh4!K4<#)7A=c=8__ak0!J1Wh_t&V#+gohrW_-AUsrXiXtLb42>_AGV5fSg#9COHt` zF|j}JtEc~t`KU}maqU5J32_E3xWvjjhpjy>&9~*Er2McI(1E3k)P)`YM7TzgjWMfz z2Q2`^Cy3>;M2>H%V%QQ}&yk6WZMlvr4OmO7U&$6Hb3jjd&i!&!+ke}IQKGnQ{_>Xw zB$n|K@H7eklqB15*53Z4x(O*U@F(BU=e#8@wG~4)?rV4 zU9UpkqHE=BY1{aei^41YEez#cgHB)aKQ?NCF(SybP9iASx zK`Az^6}Lz1Y{=pDXB*-G$OFx}I4YmNcO@2z2}9U;oz4j+ZX+0w0>bd&eQ||124dk@ zMACk%BA}>Nj1cT*-tmTQmI;@yAZ4~^Ly>nBC0iDq3mMCmNiNX81Lf{@P^*x zcaSBmKTk{XW~vG~YmD+y&l0hX1uo=vWj{uoZi^Ftc~Xd0go3O(!?`}1KX2B$sC%k| zTJfV%FgD-^^$F8au-Aiju$&CQVEu(d1m>Pfwb|@nnq+ajy)fY)Bjl#+{G$iS!j-Gt#O*_QVCxO z!8(OoMGy)EMO{>7?F4>zZP4=U(rtM3Be&4@egh~7XFOKxgdFFG4K4AR)`Cu$6czO) zNkU?kv-q*gsOBCkhW`7+HWPxI=2_&^5b;En_ay)$K*OpRYfiZ$;l;UCe$zXe7RTqg z2XZRI_G{OIAVdlg1G9C4=<+VMYP0>7qfkT+_hUkUM7TXC*&Gyg%6?%KLt(}3Cd(;q zcFoPclpM{v7?^jopA5JN!ui9%@Tq3KscCWn01keAi)E6O%fIYZ^PXCBuLZ0waK^@J zt=1WHi*%Xot1OD9Iw%Bs*iW-=Wj^ElVR`Fh&i<1NYHbY@pdn^caqbjhRO;ataL$C) zk8!6i|G-8X+yoRe8M*2t`7EV6klB}Wfcal1$n5*It2AzTQRXr1Hf3V(GtYQ7Mr$-w zFss{|cYRiDVCjxPGl5%E`$ioX5fy6U33KTfSW(ay5S7b>8Gby(T8e_O{lfaCW;zWF 
zh_bj3>baI$;39Mp#~{Cn(cG+PI9brWLV?~k`wlB=s>mDYi;p8-r+d{1mHka5(}~gz zUm+MyQ%%0aFhhX~|Ic<}BzGkrg2@f+xRbWp4TX0b z*oZCaitU()$F5Tr%8LL?BPY^iT}nYwZI=W>$fiCcRc1{QvxH4LzcsE8)U=OE=w1yS zN$UFEWQa{6y9C@uEUR)@3orVb+d}~{;2W&5Kcvqil?wc0($+6*M17vR`TSX!%H-;mwrOnN2pAqsRy(q&W+_ZVHa@b2cL56B?E;yAVq?x#1F zCRr=5J#QfTo=BwLypW_bkxAbArxv4ZPdF@&pawqLZdB}gpHW(P+XFW8Q}jdBp~9ci zY|#wPe83zT#ogjecv$a^hIDC}s$3-l)SzZP0a-Q1>onbI{t$uWXLE#ptx>HCVyZd( zbtXKN%bPq0gH%iw*>qzl^lPjjc5FTv;{n)z`Odu+@+jGk0GD`TL zf!ycCXzPj)wJE%kUOnFPqB29C6<6~@>nBd~$+KxdqvdxKOu4xY<<`UsDDdkLTwDiU z6msbgxZseT8U;ICrIVv!@gr4Fxn5G)lumaZYz#N3x1s1zvtOp(9E$FQ4riu7DbD1d zA$ru30E~a{8j50nNs5ag+|YO9$=_P(Hq&+Txl+7|m;lk4vIZ`4*&C3N=y*{nGNCr;nhwo|HCLWk_D~;1yGjEz^z=)h&>^iPsmn>?N9!2;QQf9UJw;7@N6^$=|TTVor=Bw9_GSRgU~!7kBW z8tscgA+aF3US-aUD(t3R!Y$Le&EAUL#R4nS165SD8Mjix5#Ys=dK>6}wtDiS#%wT1 z3;rVhq)#*w?3Enl=g<7Aoy^?T%d#{VylZBKAfymjdA@_sG`y+^lq_4hLnw zt8GXU?8J#1eJGZ+Ux6uPREqVnQx=N zIdY4w5e2vFy^t*8b^*nTkv7EFqVHyk1hr~E1wQ9Z#>X~K;w~1PdtH-(vzhXZlX~RG zQ7g=(6;VgF)Sa0x2r-c}=l`ux1?2p`zv$EKW9b<#D{RVQi8++;mzPY>!tTxs$A@*8x33+mTllb zp}d^JX;S%y?oNyJ>V^+4h$Z}bSZ2cMbLQq0!*25QE|Bv|GxKVB-u z%XM0q#WWW+RmU0B3h2@Q`D?9Me6z*t z6NEmP(*>W#;mZ6hB*p(ziZCxd%V|yUtc~JJO$uzP*DOtTA%U53JpgR;8evIyGwzDo zAp`G$Qclbs1%HHfW^e|6rSwH)QHD>Da>G&Nw>_M&dr7oX%bf?SXm%fZGT^g@G4%DXM-y!QL5OFv0MVw=P#OYKWdb-Ska^fiVK)DWEIb`OiALSATdB zqX8u(ZM=4?Cc*JQT+P6qODm)?g$=0vpy_K~oC+NCcz+a<(5=m~+UhsU1p@xEy!s27 zxbNCWl*rK#j7KMXC0KiYVQ{Oz0KDf%s97)vD?~ztm(k$~+(pyUXwY)W^B(M=Em94? 
z7;e5&`W2&`{$+A$9^TTJoyura$`8I|%(-z7;CcR7itHH>5+r3mFS|K_ut;}v(}(fH zd}xZ#e)G_zO~HF!yk>W0Xfm%vDmP|0Y|lX3B@`NhT%S@cc>~?TVE3BG8qQ9!t_kYs z0HXpObY<{sUM}pp^VrcbUJ1MMq*`rgY7Ubyn+8=Qs}QC!G+p_X{KT33m-e7Wwy38h z;}aW4in{;9*OvgVbMrd8DRSm1+}dyA0o)aWaJvpaSG}ZWwj?6T!uLBeB; zpG{*-Q4>JP#+ItlYfU5B8mw)%zg@+|2J+us1ErZLA^6l(EF@LRh|5Ya+Hm47ADLIgy>u?tH643Lg|MGPbyv6!a8KF$6Y#w2zsVD&WmF@UXeVmWU+aL`*xUkgN(RoWGWK)E}yUUi3)+Je9cd*vL4C{u<1oAPO7@jRkIovDQ;AaJyzJ3Fglz(ZBhgkYI$ zqLnV&`QDJcS%jvdB#M#*#!nv6v~ID+A{=a+di%U8DIA!qHzSh%-*g6j(E6@^k-t%c z{&!F#=k_H7=+B7dnDqTxP%yY|udGX0*a>C2=b2h28Q2{GeLv~byy3l(&7-MB?pUiD zYh!=n$WaCziis0mOA#SpnkE&+zvq}s){b5u)6?Fb9l+yDi!FSkuoBW=0 zY}&JEgZo)i?m+3+6-Dz|EX&mgX($%%D&gB+YTKy|{~bGK_F@_;=t0R?YlA-!O-?rc zh{-VusOTpD=RQ8RhkReEsn#EN;_Y$8fg}>$)~rsP1Si2-W1A*L`UEvrVapw@X?*+Y zIxkxcw1F>b2PCYf*c$KV%l3+nBRbJ_*NxP3XzB5O&DBh@sl217ug`8 zObTyS07CUKo(by3!Mc}Rui?2r+0G5q>wHYVcY1W`t<8Y&v4)kf0Vy9G^ZUCm4!Xs2 zrmvk&tHJ-n(dpUrpd#9aM_C6=S{WkgW$%1cxNaa0c)5M>9sMC}^zTYfP8&}w=g^)U z#D66)v1`*qqOV7h6HX;@KL8knIYk`k)`4i(__}cF(kA?gc%KV+J-NG2e{qeK+r?o1 zexPX!qgbl4VOeYp+T%75zgON`sDM^d+$VZsXGOz5x%3M%NxbL)ZNHkySKVKZsf8nf z9L`J8U*m{k{^&AX#|g1JGbIypA|D#84{uQM;%o2Z83ID%04$3h`XP=7q2gk2XdEf_ zMxQ;c-V^E!alunYWucs>WZdxdVJsKbi;_n9i2*CC!F5fX-zt zd1lyiEc#~c25KP|NCv5G`8}el3`6B!WJ$9F?iF16)_t1+{G_Jd1?JZU4e*0M% z*VO#{nT0?DTG}JWb1Rl)B~X+sDz5*|7I=sp#i9>0Qk5Xf@d?ysLsQxhx$D=}Dm@g4 zqC{WBf+8HB27}?!U!#~@ltf>uJY*r6Rli)ZIyy}L#FI^-$k>14@s+C|)-5svE-^ zXPF=uZB(fa{khMa-v`gl3uuWfI+@?5ApDD4>74&1f5>6OkP>(Yv=8>g-2ta%ZxZN; z1>_3(0i#wZzDkRnDkCPHRe_(neWW=8(To>fOjJGpKu49AtGEH(l!qUL=wQ`JpSCRM ztx-!`C1h1brB`QRp0B_hI!B>95MN<q<*ArO zueve7zD2{Db~E|3*^O7g+&t$r|C+Q`^liHWarj=ps-EDWgt+6oa0+TeJ7DOMrTJlB zonKVr6!>~#g^YZU3d}ORNzl#oReC`JnDD<0F9Qw-(rg{-5Fpj=wT%e>TBhLCxeL3Y zL6#E+<^z|L?}xkNO%uA>4ab5Rk}q(dt?_?YS13{o<8MzZE(pc7Ni^%s$TNrvdSG6H zfqLnFodNOI*@GhLS&8^?m8S4yUG4c-*D?;8?p-$?Y|RQqpeKl5@sQf4w=E<}+dZ4r zb1p&l(2pDhOnMI2UdRvVN-9ir#kLCA*0w682wzb zk*~?FVANSwMA^`r%rQ*ti%?{#g!6#_QKuoP5PEU4k~!KHPmwq3+{HT>U~_ok-JB=% 
zyF38$K07W=A#HwDN`&`}-)01xiWyDEY_QuE*rYzf1CfvY^1|PgsA}{L%fru5F}ZlX zbOS*EXDivVyS5qC7)TuT&`?r5@9gwttgR9(@%=V^^pRXOoXVo01QZDcuOF)Km8-eX=vUDI-g=ByAhER#An=|m!PN6mA0Zr{>8njRtwS)& z5~llSlD3zcB}k3I;iFg<`6_71KgFxkecMha8iNF#k~z$4;fbHudB)c+1bVe2%{DFF z^mj>JJ&?#Uj1Ono9+a2ZW75YTipXFSD*SK-#o{>Cm zT=NhTD6Nb{x9(;0)mtMk$pKU>0hZ7O8=^XiM++a6>y6japZCMwfUyJ0rF@azvD_b( ziWClEp*V`Bf>}XtPd2@s@Lqb!*$QN5)QHpZ+vQjkb7!|j2@60M_6Js(j>LYwFg!F+ zyq_uNojv;IY>vKU-cGT-A$$ddq!oms;x?xqa^R%27h8I}q}^Y%DxWFp+8nI#?-GDr z$Y$;3R?@+F9Kb5!ls!A%n?>C`jM9|or12YKtK{^!ju|1F21q^|5%Lvzf?&{qyC*e&Xe2FKgqYouK+Z`mQLBK6p|Wou0<9Yd*n8ZQ}A|B^s|Nx>e<{1 zw>E*=oMnokWJ2MaK!c0U$E*7JZl4ias`8|?^%?6W=lC?yl!9NNX^u0@$Y1r+WK*Z? zE{gWTAUq-v#cbWBRiYz^03|n-%y06@uQHOHoD~7WJz!Wo0upbupSkV*&%I2D@<{r+ zA*cMha=|(Um)X?iP_PJoT zf7DLgqD^Q-9aypue`JJxO5sNMgZ}auS?7|0FL-M`eyHorb*M#oEZubqb%`*clS;t zUPxwdeN!piPQ&Pe?KKM->`y>F{d`Gr({i`WN-8B9O?s>ZHPLddUsmCvorAU`>coJY z{<-(T!7I=gBDszx04LS(n@m{DV-YvOYbH3<`^gtbviaLZ)sY~D>L7sTs$)(}&SJ<7 zCj@3kA9M#LlkH85Ow*#!)0GWfn}4gA5Pm$OYc?oN1qYLpkZ~gRCw2|oDom9Q+Ba+F zD$~sf;9mtqvr2(kD=YtY+ybWw)AdcvzzxFj4V?0xe6H&dEPn?Y@3$cZBh)yXX#?n% zdfk`;5W9`+we)k8#f;c)wh157&yi+)fJzb{(J^UqyjQJ&VXw$LSl8mH!xJEKM5dEg zv*BL;;t?TS_ujnKO0P>1!>AHE|5LWNO$8IQnz{U$4HN zps1dypw(qht(4z@Y1DrhLRmelM|?>+Wztx&b;!NVa>nz&ESYA(M5Y#DqE@y*6;KAk(WejtiS_P1gVQ;dM%F*#=oMkcgA3 z02BiN61lvNn7XTk8qrB&GdUEi&hjrxTcGe$wouj*ImZnZg)*6_fuZCN@UyVzU3<-= z=Y|U^jlo|UGOyV8qFkI!^EXO7O^*@{zd#(SHa5V?GtA8aFRDUC*<}>DbFnwryiM)I zul)p=dUtMPiO?FXt+l}Uce|PYCx7d4+bUC}&!bs4XMRUslu%W_#Je4ww*Tw2w#x{F zvrp4Gy8Tl#4^I<(gtijnm-39xWzhcBT@S&bvoi2L;N!=*DYerbWqb;a#9NSe)C|T`u>kYV__Wk zv)+EBq%@U&&JYSAN}R+!-w5Y5GG^#Z{$bqlJJj&>%IEhS(P$mHf_Zj+g5ZgES-ux8 zv!Lo3$4OQ(B70XzRMT2fSVqf5#6Sr@%l*(Vy8%LkQwEq+3aN3`CiD!C|GTB>UlVV} zjpnhs$GMX;-ydzp{(4FCDIwS2-> zFZcS8I_<}?=Mj0a$Q7XBL(t%cG&n#W2-w+he~SLo(1!mJa2FsJfkijILrhu5&s0-r zg!Xe9Vf^k357YJ5AvFAx*$hHs8KLYry&?mF`iKTa^PfnQdG%)ukX&gn8{Rh)yYZQyxYvjfft?-+2JE-{GZDR)zw1D)A%eXt#j}amC!1XC^Za-_uo2TXMUasA=k^#R8Wk1!XorBNiMP5{t)qus+$Wz5l#^m$9 
zReKG483-<_7@wm;JF=Rf!%!}jc1m{IndNJ%Mv>JP!~b`BsJS zIPSU=y|RHl=9=~Bt|x*Cq_IjN^_Oqh97bbqU6=YleX(U2Cc$zwPH!&6#rHb|Dnpv` zEOlt%FayU8u(LhQIiHpMbC^u>L1>9Hd+))TiJ_=;^oE8 zrv}R3!G^PwOAv`XaKiK$F{80l%3)jh`ZCsL=a~acn-k3;z@Lj|bMw|a*H4%l9y|;s z%rax~#xGgzpf?#3fQ*Et0BTz5n#ccCK8?;p<}v|_?Bl%={5H~5U+WSxWSfXLdOf4s z(V)L|1a=b}L%3ABE+I7q7OPu>Y|F%E_HUb{387CcFPAk*Yrw$4lcO5DpXU6f2LC+3 zFZmjw#)MKaOxk|!Ad+(`X%6GgV!&Ws%6iW{_*3n513CGrf}eWu#m!#z7_)gg9HqrR zhUSmle8zZmEYCXt_@r?}CA5HbvUT!55o}fI=cm&A@n0cdqSZ27%x|(S{H81GuaFs% zT(R~v0&P%x^DXu6{MDBq{xepks+D3gwKCR|XZ7`K=77sCtri%MwVhoz>DvFemm?~! zQ{hukK#S>*;*Su|NW#9r1*&EtJwQF45bh^jC9NTIH{(v|?=EEa(;ENBf*b>db)Y~Y z;Q{zYNlu1=a@gb5YL*)A_&Kj1fet6!+Tot62B^iO$%hjy3ui{Hn61)u@AG8Aao4Yc ztn(WA35cW`P&4>7ksCMQMeCaExkR(pHfrF6dALRR-?}2Q<4M}MKC$QnAMI$uwH_dz zxt@tk-^&`9{;bM-^P{&?W*_vQeUw=C6{qsG-l#;2!mz*DQ#;gMHph!7LkoXR(@Zvd zle`&P$35$vqx8s}zlZ|J`w|?O3t#;-oVVvJL-|d4E$nz6s$!DnpghQWyDB?aC9A?l zh*h-@?e@}|LX4;nH1xOT*bc)vZ;=Et*ho|6BLvjXy%zr}IHnM?bSWkZOq(&&v+#Hh zEo0r%;y-@0C}CT))it{oU#`+B75wC?&7m9{!*U9zO++U?b`kibH+vWQnn?1Y)gy4% z#TTktWwt;fZHhVsiD*}j*Ztc6A2{pMU$~M8BMM<(21rf!Wt`635jdlwYlb=2-UaSA zBX(D=RU#2^N9u?JdLzXiPOpO_1u3}SI(7z^*D$j%WUST1s7804D7F%DLPP+Ya>Az5 zYdM*zT9RUw)yZ(d*1q(oPe57+MDwMD>JWjGwH(N8ti@-LhL)UQEZr&~OP*lM$S@9- z;Mf0c+kP9l-s&C+li2BWCVZx==xc_H(o^T}#u?X}#;3K&YEyw(9C^Ub(&hB3U02ZY z*>CN+Qe@>5r;3A2#6gQpM>nhw^XF>L$WqSI#JnqXH5odp?7bLhw!xJkZSgof*sN-z zyvGu)c~E7nD)|$wMloP<0>)L! 
z>p+oy5%akKACn{Nb<(C5)su|b63$bwbUXn<;0E)v0^N*$o%%WMKZ5oWWq~LXEB;3+ms-`( zq|k}*J`=kPx1nV`andoiojBU8WzuiUu|7)Ag59tdV7tiK3cwy1g2SRJ)&42Z!+8aN z0&GbTszj2@iI6!mD~q<_QT3U) z=-V>a&@c%n$OS1oLQ(-|GMKU%PaHq0O-};S<$Gk+?48K}$p^gQ|<)?`cExK_>$3vz)XXohWhR+>xABR;D5m9C`V0G!LFN6BojBTHq8dRgQ#umzs^@uu+W zVW1Bh!)FBCN~DPEFOf(y*dA<(J5a;qTM0>oSzq`IRJ?(7hbs$35K-8M3EuQ1CVCME z6@G){!>9z3f}T?<1^D-k7(q($Wq$=JDdJ(yr9MfYt4KZ4ohh)?Qe5b-vS>t__=evkTv;>3_~fP5 z&MF}~49Cuw^i3l?Bq@;PA2|$UVmwjR-c+@vIElmHNK~%!UP-QqJrWP}bYb}SJE=OF z823m^TrMVJ++&OdG|IGObGs^t%RK-t5_LNZ0s--z58oa4R{n-pef^l?%1fJU-q|jn zr7);HjD&89M2YQot?j5!Tcb>QfBuORj={Yks;Av)5-2NG1q+wCZgBdvhSh5+%R6AO zl2VUhKJPJDlzBJ1gsY*9`MPQCp#bHm!QYxxruv{81NMsR+Sx>;BcUNr z-SqJA4P8%&;FPv91f1wbXW3M%;?>{2mI1ZAfvQqRFJieXNEUo&5tRdz5T1U0bo!nHsn=NO* z4=_ry|0;R%YW_OG#J8I(@K2wYqFxRds$A!Y<;=e$bu|Qo6IIv`a0-*1aw`_2dM&Z& zDxVATAhwbMI|Y z@;5^lL|y^C@7t)9L&j z>h>DnsXDWd2J07aGThI_f2e`n>M@cUa_@99X=c|6%M&1JG@4>He5D zhONZ#8h`kK%yUe7*;YAIge_wk;v#gGR;u_-&E*xIFg8A)&RdROWf7bEEQggC`W6x@ zm{n3eiI|X;hCNO(2NRUQ`nF?gmkc#+7;YAm1Jv||WW(YHTuvgR*LeL?XYB+ZvivS4 z^sDf!!z@K&(W8U7_J7Sk>177YGTV%7WvQI{i`VV~gk<}3OdPTNhG#ru{i%Ybfh@>= z-eH?Q`lkX$zXAb$O|D+_gv=~3@#9o20EDR}6zi#mchUP5-b;adNeIV6nsjFKC)>+V zaRfT%$72pAg4K>YoF|q>Py!l}+?^VS>-oNsy5c8}K_MkVY#s%(=YRAi|0JBvqw|%iT4p>hoesP7jyZ40OI}A zz6xExCf;pm^3B&7df{K7zz?x}`YJ=yFX}>>DfaVr0}^0-mj%vepG#U+!^W$Lvnqpq zNf}M+CIR`Bjv!JpFe+?9j`6<3#s8RD$dvM(9EJ<+}Dm53|ceG*S`mh>7zj~ z0mAeX(A_%74!H8&e|-gtw{BG^7TI?0(&qn zU|&^8Sqk~n3h5KQF;IB>6AY+UG&^(7M??o_pmDhl{bX1>onEp=5l|l+4`id(zurGq zQ87|F!lb)$*v2vG__nPd?_Gxp`k9O}@scNmSL1K$b(odqfDDyqYaK3-Vi|7P{~HkcpEHf)-^EYQnv>d^!AWz%z9a=@ z8Ml{{qP~pegaG)_es$)1=z-aUEc_q+QEKI6CMD3uR`maTt7_OBsS{NKASQ(fS?Kxw zsk^}`5jc!M*keC(+?2IpArK5%FJ0CyU73kWL5{db zU=9t8KRbn4EU6v;I3RArzEJOcd3|1J%2pJKLUR>w8+Sjc8PqM^RR(z1Ctzt_4DX>y zX(z*l)JAA(U0+Y@hUrc3i-Y*I6krK@l%oK_W(dgm>|pBsf$j;XVf{8ZW4%yzMFU43 zX5^YVD`}1D;}4fkyoQB?S>m-4Vm0q4Ds3Ksb&e1`Y6ML-obeKU*ZdvIfc5c&a%MU6 z$?SSuC@eQaop!=&kl_vg!1Z#kU|uykMKypBq{wwGR+#^nI|cLi|gt 
z^TPrgjHjPy4;}YzT^ePSZ?t}RFk@cER^CTzIU)?ZPd?9Dy}c$_SezG|&bf?1B2am~ zaf9NPsVY4B@y$(iR#X%YNWJE`r1Gk%YB+;(Is8B zu_o1E_3nYpB%9A2Aqt*L<<1@1nI;ug{3b}9>WhGgQMX>)qZScb zEUb*gntn4v^Y_I8mJQb3PSohyg7ompFex87M7NN(Ql`zH-%_<0v~1-hmFM7p{_L8* zKPE*|72t}GBBzB`KnB<4-X@9lP|p}ibW5f50tu=lK^LlpT+#eKC0clcK=X6iXriDQVTTVQEq%5pi~W5zwMpw;j@&TH zb8~4H^hGIvov79deW)!WiJYMZDdcrl>Y?7~PA=TVz}vNoFyU<-d}kG=a(T7H41SO> zg4hpKBc?Q<1H6w_#D{Ai)YtmJ4&REV^ghy{l7TFm0Hb(rj%$2CmJl6*qQG6(1rp;Z zt=ioWSij*!Jnhhi0md=>jffW`>T-UilqB@k%7txv3Ky0X&*d|24Wm$VhaG5l#Mnqx zSh4~%i=JHtVl|18^5~f~!Y5@ctfMfQn7(G87Vc=DEp5!rO?wO8=~W*#f4E46=&e@Y zQo40$^5$6mc*3n?G^9=-=YC~Qu_1Vb%~m}A+!%XDGZion+)hVv~~Cw zEeXY4R7(hXZLEPJs@|)L-QPvn!a_V;c-d}WlI%Hy4=E2;9RvfauF-h+E&Oz0yXPoo zoyuSU;GU;@y0EtNel#+^YH>6{DYymln`AGIFO4SPm24@ko~|1;;`Gs^ICb71Py#Ic z;E=jPuyY@jdbQ7c=<|8t%i>QNs2KNO3&uz;+`8#FBD47U>yin?`rOO3ozY=#b642f6p&g0Ee9R|A`MD1;Q2O%guQ=t82#2; zR9!Z!G_~C4ZT-fxsQ`taz!NFlh>^_d`+w;E@d6XHA8!JL4LWQUrf8CrDH!7*94}Ey zotAX9SnZUtXK3PW>fP*F>WBxQd$1CCWN^R}4Ky@9bd1b^@2`{-rU{?VRFdAB=&Ynb z1Lnusx62Z#8ojRku7hj6OK?7SeRPRc7ls9vs({xx1yp-4+6uO@BZlp|z2f#$!=vW^ zw0#HmaP8FLY}h>_Xclk5Cy{2S(n<%teY)xS*FkFJ8+=;j|M;gX_O|v zrXEf5Vp0n%foNST6+zA3u=#fS9k~TRTZsTtJj$8h5=hGL7P3$?{*t3weQtXHl0 zZ09-;;MN^}jLg~;*63$MyJn@+Hi3p_qPG(Ae#W&c6cL)zV74dYNb}b9_0Pr{d%-aW zV1q2iC!2gP-xjRckcMJrau9hN|F?Aoyj}vR039dHQ6F{=;U!No=9mPVea*$K(K$Dl z1P7UqZJssFUd!onaS;LlwiFZz0s^+bsvWXgX$X)hN}L=p*ZbR zh^)J^_ZjQ8%|@cf849j~@Vj1<>*g$U`sC(R5E!MB@{5FsQ*zTLAF~#O_|;_skJQKr zFaDdAoh21wXi^cUX|*Wx(B|Jp6Xx*|$0Kj22fNhO$ikZo zfJvjk%Rnw&DNh-O4+}>GiH2WbgLpmjZbw{di@Ff{8AS93G93~%%A-YV%+tO9QTY7E zY+~!yRwv+2`yGT!*wwHF!JWX%=DKiEh2~_@o!hOiE(Ql4^WxfHdnAqGdtA#@&PCyqHS6cyzum<6>HFb zAQe99T#p`7wc_z4k$%LDej{8Ls8m0F(Tm5V&=okAQA5(ksZ|l3=HLPpfR1klBb0?8 zoQ%LE-(%-JCV=7iYvGgvm;NinQFGzQ((c#5INw}L2Dc!0TA}IzA+?mjtg-0KX!)Xr z9wMeba@YHOite?YUD=c%V2OHyq7lR6O6!&6Y|aattR&Gbt%w9#1w`zQO<%j38kj+N zn81ro#U-1ZHQctmw%&Y@CzNEp*V+}HH9;8n>b}diZ&}#5!99pux_$)aK4kJ^WqYE`GWpmu8CNEnAuq@d`2Q5B^ 
zY~0gwkB*`wOKwBGLfu=a9;Dst(tDu=zO43^dl0?ZwgA4bBU3^dcv`{ObLin6Wza<-XvTS;Lu ziJ$0xQEV9XIxW7ogI0n4bFm92ZNn?ezL_!wWWYLlU{30tE@bMw& z9y1DP^sq%sX&RviT*X)8qkN~0%XXBxno%Yd!pu&v0M;a$Ywv!U zeEb}*^i}-`S{_kgrO#?)lg8x61MSmguIr15(F zZ|m)j9+Y%X6Kw@yz22Eq)-1kD9&SCT?m_$(82D%hI|Z+t?I=xNnl__2kHQ<^sf;}( z@j#G1knu*Y6%w8%@_UwSo|HC#y_4`bJJMBC#vyaY_{bCds7AL&Cx?S0915-6K8}R3 zh6@~3f;f+sx)?Bw)Z0v*oC+n#qub=4-=W`OF*OIwMW1jK0ZMh>&+`}--CMye6v&e_ z$PYTvqsI7Q)_`r*Xk6u)UFT)(*|+XqSA3yx%ETCVlXsch^L(NUk#lrD@eP`Im%&wB zV!!tFM7^uiGP(aUN;v$W^4G*|h`|iFbi1@>+FN?9#R^>^S0sicV(9cMrqt)H3`}Az z<+i@fw;OOp9nY2hA#fpqrb>in%&Tz!CH<5oSJ3F#!6PD-U>e4ZtMjY@;Om(UJ5}84 zhjsYkBT5XURjP3csiCWVs&IjHFAgFYxKG)q|! zncPNJQ9ng^S>f|%-|Zi#8HJmU7ibyT&=dg+&`f~38y7eAQup3+p&~nzO54BE<8%3) z_2SH*1v<%`!_%p8560^i9%0Qsf}NDoY$qmSA(i(eHJ9wrne1!_{nz8>r!Ze`A3_4n zvf@-D_;HAS-f7lZH?rq$<+fFwYO|=y-4jm|u!m+KY9j*qS@q#ULoK*!mG@pYtSNy` zOI~#$F#c|p^33-*{Nuc_?1l@wZ?;+PGKzDzw66@QnnP1erbaM}r4gLw7S{)5WR? zvPp=3oMH2sPx9Pig~s)0mRY?nFYU_?>ci~PVdIk`J|X^6gSB$039*PHy##yN$48Yj zW?0`&TJ0QjOYr12WGFcNxk58bv4F2a(v#hEg6ee#z{js$sqN$#ziXPMISAi1cs`zO zL7-5H4d&$RiFrd1fg={)O0f>Y_N~hs4X73k|NB$40roMur18q?K5GL&z3Yay$H85CJvCQZ-hq%k<4$20YG(t7b@w-(;uN%c<@ED5)h@#6 zW=?sd0KE}Y#lJbHr3ik?>kfpql2!J#)ZeoeapjQ}g%@V&haZ0dysDI`;y6cB~zN9DLv;t zHQ_7s^Yz#U0-_zRuOetJ@%*w-PI5tz_4!hL4-lKj*Uj0%fa1WKYHdvmmpv_u2EAX% zof;I(IxJrGZm66~t|jFqDsh-hd?{6MwE6?WxL+i?d3{6HWz>1Thr4q zuzfGHMKXpmV7OvZ!=#a`dt$eW7@p<8fMUq3kE+qe(&_>?WxkfEdR!2lN=fNxP3WK% z_QkqR$=rW=85CAeh?zPaB{YgtB~lX9q=w3j8a`p9{ODOyB+(F97y{ce$H7$D(R`@5 zE>alI&fh(;|IE??H=~YU+P-p{nUKfsKdherm>X~8n{7X!za$aEDVkEJu^*I~a4pi9 z>kFCl95_DQwH}5S=U6bWG4#)HU06|MJPXVV_8yvB_%V+vxFX(o!mF%GW#e1jq^OYO z9Ojp<8agW}d!XO-O1dUMG=O!@G<@xBz35Z-Qsa0WM{S`)>#dG>^n662kD|T68-0Fe zUPI68(Z94XgYQ8eE5M(#nnD?^jG7jSs8E8QM%tnDfH*}O6kMM5meVc zek|`L3r-{aCO4A;hoJn;I?#u!wo1)pFBgSIHeqK{w8_qRxrjmKq2-B7Pb{#EyP$%t zVi?}bFw}kK^0VypD5^l|!cyp)Ae-zVRtE#O3GTV1KYO9h*mT6)c~#KOqy=>cijDqaEz$C5p6O zf;z*TMeGTwv&bw8^O61>iv)!O3`ge4o$!MVrYG22!!OtuWB4gDdVAcySc9p>Jf064eW1?&N 
z#;(uoVHa3xbD<0kHESVd%~+>onLCq&hrY;4vSiIIyF6EHr1}DnfF?#iu@> zGM;2K6jqjt+O=89G!ws=7Oi>)0RE;6rDJ>^hPo_D+<(-VuqxmbT0j@fnF*tyRDY57 zatA31E1Q%{m~P8TU>D_tprVcq`uwm&5vD?*fo=%C3N^^Z73t@llM|n6!j_J~Rq|w{ zo=`JQAg11&qQpOX=7F-GWCkCm78<(9XlciX^2s_e1ZGyqCZ_sW^`Xe8 z(MNZWd;Z9$aDnUC7&|eQoBypS*^}AG28YuTdV>2W&q0N+)s9bo|6!&1gi~TOO(Nyiwp-(ty8L< zFyUXy>_$W%1y0WLyfC}1%RoTDy`G@H5HSMVXDN-__F;aJ^y z=Imy;+4f+m&_45Qcd)=ggFF`J&3`FNQmM$PR}D&>Kq*^~#2%qOpmtd_(a*THk=)2t ze=LUMnW6@SUgNP1M8G`ScoltR7`{&;p8h^k;tu{&OODHmt3nt>;%ZM>f*_Yp*)Ap) z96s-2u!oFU*nE}f7o>GMpp$d?!i#QZgtNu@u_|Txalb{o*$@1&jqZW+%XsA>I$9opGN9lu-Z++neqNwc%JUbj5ut&tZ=Y8Bwt6(Fk0Pl z$>annInDxvrqV>v4oIIEr||`|4jyLEv7klQP<8x621a6#_QTPvZ;H`Uq4zBjF@@si z&-)gAY2@>H(yd>xE>ZvE8cl{1nXPkH5yTmZ`4_;2O<2v<(IvK;FnW$H2myFzch@=3PQ?Or9|x*Je8AjbcO} zxYwBLTbjjF!&MUAMTEbaFM5Mb8VvvBP!)P=f0aVdc|fm2XBhi`3Tw~|bYXiMx&5r|jtG_<9=QWQ@-WoP zzZq+XWi7ZOF|hLG+C%++b-^@LgV`xYT#zt7<{@g$jTH{$+ALx9*086%pA&A$^EfZPz^^ev zIGf;?xa$F)=A*P>>FGd14Dqb()ST0FB_PNFV}PuWoDcDz@JfJETeh>y2WEn}g2Va` zK8Ee?bXHZlur0#tXIO<%vMbiDADJ0yRc}W&u8z8=4ptqLd(ou2<^1czf$`pt(G8G# zXQpn4UNOtomI5b*EU4G1Y{_e8Xi|J}7XiKIt>1T4WGc308NoVf@i_;Tz4$ss==513_zB=mx$9TS-*;wUnZ(9*~ z76DIkD>9W*7wI^fq13~-@al#ss!5MCD8Hvx9E?)sKL`n}+aG%)zdUCV=*5!OfBfia ze<#DkEGMk(mEQ3BQ}Y(vEKFfQ)WnF0nSUIb3VPNp<>r_9w;ve{i$Kfw&wn->`ENneq=F+}gq@1aP}s|$|Mlc$jMA{{ zGs@wwEcU+U>r=zQf_mgL$(J23ITEHchar(i$kgaJ%-iELwR;5`VdaNf?kEdITcFa< z1I~z}YnMUx5Gcme2D;?>QdYLgyQ$pG-yvdp<2q$tH+#s;Cl z8a#AaI6XKlm>anD#Qt0X%YP9M#@5^a2J%k09#^$=g;aaM+4c+=_%Thb{q#~}dD2wl zy9p;!e`%cba6Xd2jof#sYmYS=v8>Uh%MS2k9S<6O3i|@`b z9Ek)>KD&&vGLgFB_MGpkwtStCuq>^;igW#ee2_80PtYfSZi zt=VYVhQ9zS&?M)x!11ea7W&$yM<;c}NpDQWzg7l&9%)GopL~e4v&*@J4V> z8F}gsK{D$#P3JmlVmEwVg+PYQwsPH#pUh{qoR+67tZ9#*#yTUQO{2Ln;ZqFjZ-YKCN~A_MC4`4v(Q z@g`~C`L5afQ82nPBxF#T$Zunz<-adZ-vp1Gr1Lal1=2ZDPsN&nv-F;}b{YEW8f>Dy+-zKM95JSx*RIO_ zAeUf9nB@Kp`L^zjtKu8V><4C&EhoFdw?}(&8A}MlH1kU4D*nU7HB2B;PO!zcWJmM= z5+edO^SW^%COu3cCBK4C`4w*`szUtnS1Xm{HNNC1WC^Ln#k#}7(9x2%fxG`gNZ9ea 
zL1{-DPu@}O;iX!|5UlIWHxaZtn7Jj}MB-cG#WA@6JN}@cIP`WjOIwERh76mZy;yO- z(f0v)i`TXD4|Xu2!rE{B?&*8YATC#X-@9*X7XV*X!Twp`A;rb3Hb-p>GAT2K-^6djo;B3p+b=-+ z)?Vr_D5;K}Iu+n7_{Y5z>2?gmS#I+ZHAsq%G>fNjAqytH@ce`c4}f`H0S;HcH6BJw zF{;RI0a%g;ZI>dS$leDMFeVu3a6-i7TevT-^#rvC_+UlOS%z4XhWWO_AevM+-EzCs z39xziw|!gI#Y=!X3j8yZBe>O?yZuGfj&4jkr=E{OO!GH#9cChQTrN8v_qeIk&E7nO z+{0!h*yZfE_o z<-^_;O4VZ|+f+JYVZ%wXX0Ky){bJmkmHNGr7*_^C8YcXqU;&)8)(>1_XVata)U_by z6f*SR!Bd1*7lsRRaG^kLPqZab&4Lkig%eF9Ha^&xr%M4i*5;6h^si$LOur6WhLZ9Y zH+wbnr<3(H7hw{z3iar83tdao7T%!8eW=C>_~q-tenigzrsXd@a~k82RK5C9;YzEr zD3Ygm{GZ63n0FY9@~bLTXES3hsetJOf)8;s2FUVg#@Y=CjIe~FxqeAJige4-N{WF# zAP0O+nYKg-`$cR2Qo}V};dOJ>0a#K0MTs6e2yB_*PgyR4P?K$Tre2R?1&icSk)}Y4 zz(4Cgf^%j4i;F#m849Io`zy38D4Z?^#kz4&G8Fy>hWXW*)z^0S9}E(|q%WB*|+$7aX9Rn4JMm*pTEm($HF@Dq)MV5DH=yp7Hq& zct5<%!=rItn^qwt)e{<~1zgdQrQH=uf6UpYZd!&{t*&lQdFWPUJblChBNm!pK0x>Xb}zGsSVWFT@D`#<)=%!!G zaAh>sns!uxDFLq306jp$zv)Akc5d(b5jq!9!_Z$MdD$*7Q=xU0hHmI~==SFX1q3@m z_2isTl**r$93^Ek<_eKcXT=i4Q-7da=#vUR^WTU&U1!bX+X>s2eqTc)yb>Ya+veUG zoN*wd@yEL8QqO>17Obo6OnZemteiL?&KQ#8;E=L8HV2vTQEA&|S%iZcJQ(*LpcBMB z`1o5#dtX~cSbJ6F_n|M<(!>yxyL^~ZsDAXAWWmTHSf;_BIj6LA;`N+$TLnr(kp-b{p|Ix)rdAmU)C@C0`?l&e1T5x z0AeLJouozW(Om2imo0qRr_kF4FZa0run_Pr!xmw~d*WP^+j)gPG~?V7Y|EbV?r_FXn`gZDd#+(G4XPvXGuv6+bz|x3Gg+xL!ipuaho3nAssW7ly-2 zVd9-bsl`kGAKz<_g30MvgCeyDGs@oAp0|nxV81|609v9$&tPX&zQUgWh+dLpaGQr! z0Wc^vVM+wm!C5GRDbvOuN~D0_SU)sV{Vm}$16`T;C!6RZh=jsonis-=OL@)b=UX9% z&sIpCrxQC#w|hdK(d9h&#A|fz+`mU;`X(@9$U%mKEQq(Vkt>yY4r0^Qol7_anT26>V3+bwiGjem?Pn&>z(me~fIy_`)yALDzZ@fhIf zRxDgN5L(N~QMo(cj#Ejs7IA`+ZJY~NRP>O)q)UhqR>;l^#a$AF{`odNNS)Z>FY8r? 
zyc4`-B`ujDrw>Nw?#>8@C)~uHSI)wEm*o>>A%;G~s4d)s1Xi5+Z!7AQfSz!6)pa4V zEjfI)HmC1E%&bhvt`N%L^nuTu?TLY5{sD&B;t!&XjSmZ>cNwNyKoJM((s{QMeed>$hDXy$4O6N|V2 zqr}kwxxtDeHK>Yf|Nn}7i9r|!frue{4-n8pID^CUdB=h5Ys(?555&z*u)Q|(1Mw=g za5(M~h*gl&LB|gAp5CdBZ}G`h6*}fYi(W4&Hu_e)60uPZFc(E|{utedwF{f{-dqrO z2-^}G5AP%F7dP6z4e(fBa>$M?V&Xho-MoMA8FNS8HSybXW6eaFHR|#@oLRSCGYe5v zFpOJzGqv7(&2;rw$u7A5NhHJxxW|lEr7;a+Pp#=FmuG1)udmszZktY__^*bhmvn2| zm2u%v&P|)uOO(T#5HNbMoe6n@Ow|!9X*V%TAIXhH=f<5EMPQ=h#6?ybsF97-`{!S#VT>SZ8oM7dKu4bW`kMWsWMf7(V~p<@ zrmW#TE+FX&V;zYRDPL#MV4kK7YgcE6lD4ClO?VCu3O>qe!kJbb(0P&O45#V)0 zgwHSeotl%!1fJ{BzXwQA>?5KmiuMYxZg~F^Vz!rCvKt}p(4)k)?r^p3RZ~%E3lm6r z4Aei_ChqL2MO=^foGl)yDjgXV9ul&-eCq0`%}8?&WqxyY^zj<%fFz%%w%Rd^fl_8Y zz;1$=Zje15Omr6#iYv8YykeGQ3?^1O@E4Y>lcdG5rA*NlpFkq!u z(Qq*!&gMtBH>*DvY~_$r046LBAd6+SQlRY-tQQ&`wJ|NIGvKVaMI8LLGy2gbh~8vw z<_#&)E;beGT)6yZwdt04M0?vGz1+hT13EFTSt~Gezr7~xinCnUE=Jq++>tyN{*LQ; z{{-n+)#I#dIMu<0{z5Xs@wPV_@ly~ohDQ}GNq)qN6WJsbF;?oiXpC_Tu0XFH2AVfr z5BucEkLdLpV|ZCN*)=MfXgL6@xEmbdaI>l%ekX)?;zk(%mG(&$NfMTp^Tfn@LYxfkRy5c+BbNQt=7$LAx za;vGIE3eTPxhztcy-CQ(qLmUcwEFf~s@4phyKq^pC_e<4Z?{UNq^HA+=mjHJh!|cX z>0oo9kTS~H6%2w{&09S5alo`3;d2KB2>$VSJ%XR0#;;%oedrf~Phz8RiCr9bjX{yb zmUDWW`2})ouO}TowuoiHcS{F#8o;OM1p!_}DvWr5#L9v||BZar`Z53+1a zG;|SO9?J3I(^u*sPLT0yn7lYgP-jFaycUCWd9~Q`oIsQqhdG1G=KnS-H0$DlDToJ^ zU))n_*Kqo_!kR;`1$^$I6&lIU@#Sf4Jk>?%VE>G`tJd8%ca}qMpYE#uE$!*<7IXlr znMH(xv$z5sUJ8x=BfA+ei~*& zABysWm`-F9=TcAKqcD&MPhXG=5d1axDIH2he|S|PsY0sr4zgu}k=7B(cqCV-n(!S) z3BCjJA@+yqsIpO?{I}q6t*tZ2K=&O=&gKZ^9+tVQHo7<6AmJ36#3IJ_Cs-7^ViP=j zfb(PG$tiAyxl=ymN!3u>0wiu*_C|6zl*&7snXxKZC6#H@iuIBxKFhgw3OmTF9o5EH zARY3h@L@oYIB2{HA5?rly&q-6`AT#n>I7_&V8xmmcrujfo^$Yy} ziJgZ*rCF&bU3!|4Qu6K(jC{IRFx7Hn)@2N|WbsBKO>f&unb zk$?)w7bW+L#oh|xO+_0}_%xzFc06M&*kt0TN0*Wqx-Kn$J_$~wQ*fbV5JuY`Q$ zH}r*VRi{MQ_(lvZWA;(`+{#UR^C;!$PdYNzv2oc6QT>wUhEs949Z| z?_oTM$+UfsLp9Zkg?i?1LoHiuy;CM)A*H8{Jban?wuw=u3h2D4$2BNjkJIQT~UUPcmJA)XYWnXE9d~4ZTGY zr)0P#X5Q^`+>k5Q5#f?OqakO)peWF2np^;)d#{e7S-g#+=5o$K9)1~9ntB4O2$qXn 
z?>1=e4q59b{76`#pD3z7xOU6|641qz@vsI}O;3Tm_PmHuEvjZ8P*@WZ`jH}yJ`{#a zwP!;cx~JT26o!BYYr(=T<4CSH0;RlTLX?MzT}}n858jP{^@Opn&0r zY-NfB%mZs~XwGtyYQp#3i=oK@ICU4ON&6X&a`->**Bz!NxYO#ZSvPks_dlRw+zWhV z*k(8;!>*(4A*lj8H0%jy7+PY&HOn4n5Y__rM-@;1IJjJXf^c2QFWb<5wjcv=&>=Vk zG>1FM-I)d~sjgxG0x?OlB!MgqZo|{CVdxJ?dBJ2oYj>6jfGIipX&`jy*1Y5xQTkig z?YYm2p9Pnoy2&$h6oa*#_Q_&Is>)SnYuW+z z0d+_sgr&O_Q)xrHm%OKb@p}^B0W`iJsPZA=iIIUxUFF_|3nRAOiXWMX-ff(xb*~^Y zo563R&x~szu%vHNZ|^6^X1Lthhq*bdM?Cn{P4IMW!JkDz16TsR);ze(qg)}Hv3t=H zYMs;=8=(%3`FWIOyuc%vjn51c5gSWeARaB_av!{Cb|4tPrBzn}Qg9^!Scir#U~kot z*&TN*t-e`<{pk?H69tW?NOS#9?<|(3joI5diJJ@$DtmT-C8~Q1p`a4=7~HW`8ueOX zWfpmf=s8(0r^og1M$@9r0j^s-=fCtAT0@|c+|PfQlf6eTs!_PIS#~K z&EZ)CPdNlHChPs;mo>%x+Ajou6$lv3`0C~zU2n^fOek}gS=%}JWd<@z>0Wt`6<_vd z5UlNF?7Iz(!Q1LY!xITWxEl?fuaE7N1fh>Z<|#jqU&7T7oL|Ifg4E*oaXJ{MM$eJD zcFx@r80PC|fgHwlgQk{Gx%?)&G1GFXNr9@El6^5m5wYns>F|HqQFG6Qaz_r?B<=5Z z%H4gWJt(6)sa}lWl+dai{g1wxevZm#ttJ&*Td>emNKzSBHnv$VMeR6cCRNpaJhT`h z>r4?eg{T1DW`@yf?L*fVb`_vrW}G)5H40L}#_)2fyIbSemBfnW6T3rbI0f}~DByV< z4#9uAp^N)~+}gO-U_(FDA(MB%Xy{5?$(ni?zR)yKZoUl36Du`Nz*UX6|Fy^JP%Dj}?sTf`0%sgVe zduL--$-N;{HX=G!>k=h}x4;hA#-)a_4>d1|ER503v3|UXt=Q%u&VpBw zzHl^s4}MAqNJu6Nn|>+yj-a}y#(aU=Q7C!q?)&PkGcz$lq8 zO#ijCYddu8tH4Rpws8?wXi?Oy-<)8oQ*H|qs?rU6jEKJ3Z2OtKXGQ``DgbgW*7%Nm zV^`ci5Y(+@d0%lE+luoe2VmBcAu^3>1lB9JWc&F-! 
zT_#?0o?tFz!)Nj9HSLx+S(WG|Dv)$w6mT%hizI~S!a$dTIx=5~RAW*_sWX0WaD^Qb zD*I`xHl!S%Fkm%c8r!t@bu>=!MJ|)kxoor_zmwT#0if@0`*vTZ&|U6SdjmOPnwS)w zqGyK2L^1o;hEK5$+k^7$dJ=KF>w2)C*p3G9p$DC-xLw%-gxwGVyauv%BIT4#*qk*u zA$Eai6TMLTb6UC@#pndD za?4jv5`tDtk2Ns_(11E&7F;H&(->Ked!}m-^qLB>{dJlY31jktaXTR6CMwsWN_-e2w)XJq8wU6+j(H9mi$DA)-PbrRR++I?t{I)%Ha*g%N;G>I%0$zExB=pzV|4E_ zMOV?>Q1GdDco=~QpeIzX|DDh@NZi|A--VzfEwn+nL;8XNvp5Z(wohIOjk?I%>3iRF)#*M&t?cX%L#HcnL&oBNDIo*HlPU4fELH=LNkf2m1Th;t z&i-}glwREBN>4K7t=w-raSqqBB?TGln*v;|?g*g@1^CWvY>lqzV#sd&-!U3Eqa6N~))2BDc(XxJJA-rL z?O+KEveDV|-t$L9t|3tBb(z8a&>~B(ZMtz)H4rUA1oc~V{htRjZ9v5N!w`7VebS%8 zl~FQrs}72IkAa2+|38$DpVG#U$pL-$+0TTPS+Z?|YU?DwqE_?bnH+4Wi8Y4zq9E;r zFpe}+adjph?n6wE8^CGdP)0iswL;$y`k;;x$%8~jk~p^h_y%nnT4+7Xw=x(Gz2c0M z>!#!LMI(ckBHX7_ac!r&(LNU5K&3@ANfMJ6UAIJ}-ul(8lO6j;qBjifkVzvMdAW1m z?_U#j$_($c{IKn5&nFCmt0BBwM<8idAvFe>oFa?n98yT(a3Mh`IIM(H`kt-=(MmvD zP_k&oc$9V)+s0t9Hn%o3tTW2zvUXZ}V`3m>w+Sxpxl*jo2DddEn4yU(DDyHKGwE1l z=GseR$Bu!rnK+uWxf_j3mXgQ}Vc%9t@?6t)Y9jl(D6orxx}X~+T*{b4=R;TO%==6} z+nPOe!t*j3To z=%O^ym-GMfolkd{>c}E*rWFuQ2dP$+Hi1pXBb2?nTyMpMw=(Kq)!3TMJ37fAqkqYw zl`hB1&~?U0R&pivR?<0Wn*yA%ew z58V*KVbU73Na_!{6qt6%m4*tz&&OR^m6G)thYhn}7t5ST%YbtYN z95g@m7d7~yxt|o}H&&>gSATtN0ArN%iXxq*VFsYkj*M}VlsEp5mr7 zH;sdPG%4`}H)=vj1%>7u0BxT7d&*i?0YP_Oo&56nvKjNyQ(J<#7GiFD%#Y0Kqv6Cj zz#T)VLI4E_FoancA3+^Tt9Fa_Sc9wdwNJ{j$=uuS?Dv_C-((6UeFbAfGr$kI{XaC4 z71>sD4t#~h&d@<(7hJ&0MYmJ zk4m)D-N9UY`;v$HSWn6d;RuGi|El}?Ue`Ys^68b?q$`W1LF&}d6dablZ;U^cht4#TdJL6?~%WjR^IMnR^5=!=(hsnzx*yb8V z^a1wV98z-`JT7sS`#s3SIY(iTXUZ9og%y-Zv7=b`h0lOFyJtvHOPm9Anhv*< z<`rp|WO&6` zbxaj(oS+6g=Srqp5*!5>{FH^>8-k3S-N z0?ZYXj$l>18BE8^x;+y@J4lt|=W#iMr5T{G()deo0%q_Eo|yxnG6?C_{1lCdmU`^Q*U=Z`&x8niXemAg?}1XB!w%R?+ubX7Swa?z{ul$vC&% zN;^8KB1~$YJx0#7b_o5bdjKaLVest2OiZ)M&4@`Tmy~kc$9V^b(;%-+-R2u^HqP?kqSt&~;p-x!+`} zp|dRnK37=131xdgp7>2;hmPPv+TuhH*hj3$so+~H29Ub z)nsIQu9Wk1=rmb;@%8Xp9m{}`qs5E6;ZZY5+of%E-l9xd0Up{q!g@qHg z>VHdNi*BA}lEp|iSOt2(u)J^E9j*P`*DoGDO2AczxGc^MPKu(iF}fO 
znz#thH4d?4wr@dvmz$hXVy{xkhD{LTeSt%7`D0;UrbWXAQ)<~h4PEai%;M`3Xl&os z#zD-oxdgvY5l{-QCYBm@_Tha(8I0;Q5Q$kSPOk{<M0~^d19=dLR|O0?2#rILSu#rV_Rv#u3j50k-SkmDbvX!H(} zS7I{Gwj*b#zBk+d>3@OjS;HiN?{4MRD@pOjscqmUj3sk2I_(JVvfYO#%)7BY*mdXn^=z58=;}{$A@FA1BenwkG#&7*IuVclX z4El1Eok38?^KF>=MyXW9aYo_^bm|~ByLNQcf z7P0z!S6-5g>;&A!tq)zY$`TfB@lNC36z#Dyp=WG(OZn*lFi`P?z#QdRC6Y!}^&eW? z#a6TpDz>_gCZ42o5V)%h%KaQ{6TGT0qJ*L?oQ}oMpB7K$L)e8w>Xc|nyO50wp^|0d z)MkJVI86U=-RH73Bf q8?s2{-)O8GY;@Kdiqk{a<6*>D)Rjl({5;zZ5=Rx`RWP~ zm0*Ws4dP&v5DxvC1DvObN4ws0@*>KoR1%_Hun_Ico2n6EQ~#pAe?=v+mj7AjML}mp zMV=sXjBP=GQ_11IDB60qp?;NzF)E_L|6s!rTZH*a=wQ7cfXb@;o8n`5fR~L(hiXzr zVOy?NE2D`p@l5j4)14QKb#)7YD8)|qXUmPE^v1JJHkL?00Njf7VzR5FI>owYQ)!fx zy?QZ=joC*j1FpMuKMe2=Q2nCn6T!nGycNf$tVXN;T7dUEK2}Ty`TOzvBaPn9j$kdq zAqa##r(fO6=$F;ta7El^_7gy3I4{{>7A3(hJ7luLU|@qjv!C5{$bHnJGy7=cOkwPj z|G4fD*!)pJ9;SvfE)(UyuPv(O3{=U(*AihrTYub3*Mo#y?xXi^_W>+aZ&qf=i+yQX6D2%@8r=y%oo4)Hss5Xyv@bU=1Je?B;B1JNJ3^e!;a`mLgbSfY!sqq$JhiijgxN(l zODl7d$HA}rPm{#06D6XyUO^vof^kxUWlPY=i<8qO@)FrpKdJijCD!)=@+B_Ini&UL z3xPmc(xCo+Y?!Z&J<`jU{gxLESjpB`-(m#fgpAIJ+LXvcysAf>2CagDl7bHML^2XQ zYH->4!N+P2ux8RGPPp3Bw2Rjc@<00=`DuX=h-HjjZe#irl!xp7WTODZJM9nidd_SS z)f$RD49>iry~G%q)E-Yg>#>c!NgxBRID8}26JrO`CYRyY8V*rk*PBn$tG)_aa+QCY zoljEL_|;9@1N&%rl{KCbq&1(afq+)n<85bynZ6Bs9IfS$T*WFRYl%UKzE$yaF194Fo z2DS;d`Mkw$k78@X&>s+~Jr5ENF#HPJi+feo6!G=?DriBN5Fo+LO>kG<6{T=qrUlSN zZh>Qp!9%;R{T^EHPS8UExLh$`Y_y4fce8s-$x5(MJS2|F4yCimsDO*Y7Oa=`q>`=p z97v+9C;Nl(VP;1gho=%HW?n~dnIG^?xl4KyZjvofX5Ms@V^s?kdNyV`B5SqE=W6=@ z9Di&SF6rszCR1A}FEL4xn*hXjv$r5cVc)kEjq-T5Bhv*t2Gwe;sxLry!bRm_A=MnT zeZ3511U8y2@;oKX$e|x+DM}0*7ugqpqmrnjpLz@7m{a`7!FNlmX~`rHDdzaKfE{;h zz|bVb@h_it!TLo|0@pD^=alHG9^BcV`%#V1DO{4=7;gdU;YPuan)fBLT7EWd6km*KK%ggsqaKd!ZEOSK#vi? 
zkLz!W`J>a&>&J5)>#tGpMbnb;N#t z!S{U?a)#pZyfD}de}k`NW8aYMuV<=QFc#w-s0EFMH>w@q9KWnP;tKmjqF9=c;G?QT z-r?k}8@wshz9Ym90}}f7#v=B#V-Qh#-2AA+PUKZMZrkrD>H;*@*Dqm@erZ(j_ygj8 z5Ao)dKy3{(QtUG9jDjibCpos4!_z;=rhp6SPfM1~#kA(8b6enB?=;Y6R#|`I<(EhG zgkFSC*|+Ieh5WI0(A@X$FF_B6(uaPxFGt;nh*`Db4Pi@UV9e_wMVH7z>NhP($v|Zr zLOr3L)WGXMWZ8#D5vD0B^Sd!YF3)aCBZckCk5V;L#mcxLF9=7K?gu6?Uz~T`wT643 z86hbj8_08tp#Uq~#Mjfz3LAn$!4Ps==F@`G_Z)-xDJmkJu_+gT0xVsTUi(UA_J+!wfEs@*B3+9=Zc1Q7ph><$MFR__uN?% zU|r^G2Zvuk14yCEv2LAOGRd(6D z3Y0I(oC3Ezm%6PS@Im;F8k}L*m!h$1#gE0*%Sz?{m&BjkU^p`T70z4_WksCLhvZ_< z!Y%(dZ+_Tq!1h{`s5IZPdS1B<%AMGZ`*So#SX^s)pDSNX+7yV4XK5oWeB=gn8CKmm zYYcn^&trjiGM)W5R>_uAg?{ly(x1c_Ycf5K_3p1mw(}1>62xUlAt=G#5(EY~CpgiM zW5Q&;X7OJQdA3gK2~KZ86=&J(Ay%IW8MLo+vflL;Rv2HgFIvF{S~rHvjwzRCw;bHc z8twzV9~}$@EN}q=p2i`6`o6TJBt{XCN9KfSTZ;HNgSJemc%!J_nnj+WGp`_3F%vnO6NG@4P0E1JV z;xkvL>|`y@cdc$>`|bYpCj=}SqR+CxIWKvNCd?yj3XQ!$}} zTo2ez6)nSeigJpTW7CgD4Hp@2W-#pv3*Wqv`qYz+3AaI%>RGL`*uMpY%eDKVT`o#v zjidr^~bd!=MQ@LNCMyWj8^dz8ah>L4zqDdsa6Hur@wYH#GS@=kGZE1 zRk%-tJ*h1+D*uBDhCJ(wH7MAIoqtCk*!MU9``%lY86e85C#U1HBQJT>z@G`+VkvaG zqv$-4oBgS-4a&qw(vJ2%h&~#>_01Jm=xrmh>7i`226g4yRl~m(l?$=E=!d@E%jU#*7=mI1((nKEI1 zVo&0eiN61}3XPK7WFg8zIhS$rj1@b%)9<% z(}Jn8EN`LgB+!z4g&9jmu44CoAQFZyoa!p>zjZ$J3EEFG)=l|Z^$RsCO_2aQfi^8Z zh))$bQT!()Hg5`_@aH}o%wnt?pjNdVF(XduB~_M$)*(>nCXBuuh=#U~^@WcK{q+AS zt=Sds$F3l2c)YAbBm;K#t8ck0)X|3wYhZM%5A`=A6vez3=Q%o1syHO6s8-1=LWQ!A z-POYWE&&SKN7=OkB$5Rt-O~WG@!ITVQcj!1n`{fc(EoYLlwSdZp zG6Wt}^!4;(z9uOm;2r0Gfy|gTXSN5`_aY99JpLZo^(^1~aH?(m*=_t86LWfjaMbvoUeamn0BP%2_W{_aB$(fE9} zhSg-b>d2j_6|$g{6$Vf^-u+I#E$3W^c-m$ycU3$dTkGLbi5WvlC1|Pmfqo~sl?YC? 
zF#+g3p)LZ>(@oZlW}d_3OZ_-?j3fa%r8{DCEXnA4Z& z4+wg-4?dWb*&xzy-RLt&);IZ+hF95hvl85AhOT+h{OQa5KpFV8N;qrx_w=b2{K24J z&3E!ZA16F4=TGbaB9|u1kIB93R_*bfP3Zi_op5%|+F@FXzz@zWrXd=<1Ai|Ia@AGH z2rJyCI^ccA#=~j1ZjT$!I%EtimI?iT3+*m2TkU31cZJeRA^1Xo&0J}$gJ@NW?lzU6 zT$NRGF`Tv)$xUZJ|70?E2}QwJ)bt$I+H=|cFDazuAW_U(_g9J%4jOGmn5>hy2vC3{ zo&{0~qd}4(#$-f0VWc_=R?uyQ_s@dJr*SBlS2{Jnu4`HsD*<-sgqR;NY4w5TzVR#) z3=MsC1X;AW*0LYu?3Tn*PLso1!~GVmw>5df%=;$5Ra;j^9nuxwY~rs29U6I@Cbbzg zr#qo)iX?yO#ltnIcr#WkZK0BSNQY34V6-4z5p&Erfr*Ytm&Zswk!jQQ3;U}Y;JT1j zS1a)(KhQwW&RfZ$Ru{okZuTcC`(d&sI|b@Daj2p&J#O9gyIN$MCmKrIdT<0CKJ-?b=&`L_yG4Z0{bSOZJM<@Aq*(U&3Acs zz)YpsI-uwjaZ4A-sDK$;E^tR`Y>Te}wlyY>v~|puIKQCJ#U7W%X^5VqI6hp5h$M8C z>i@5wshxxG66B1~S~RimZ)bTz#Zrxq>Cn?(7YM z5&r*3HgGRv&Ecyiouu$vl{j&6ng?m|O?YFCo)l=1JE_v|OMMNDU5JV%)3ap z$lmoaWNo0`K%m@MWC6a!vBN%CHYUA^44hUzS3TrR^{Y1WV!)J8-)d@R+G(iiF4kj= zP%X!$p&t41L6LU&y?Cw#D`u?3Bh2kw7juLrb2i6d=6OFr4c%T%Y=F~ z@ED0J*p1+fA)c(yoBFaL0w7$gzK{nf#xIDeM;K%#GD3l}rusl)AWm!}9{~|03Zx<= z_xhcD0_IM11&xB$S8b5o3QF}0pK4{tZvDN8#|#NO|!Rtqj7=!{!W$RsJ$le+(g(R?ebr1c5sMddJ3Yw!cQ9X%MRJ{ za0s(=5vh2c*1@-N-v&&3yAv%KdJvQ!OXL3F@G%PW1=P=4!Z1%4DyWbT{RI|9I?(I> z+5htdbl?gEuW_u0qPJGg=W_+r%3M47(Qh`JH(b-UxTpnxg#u- z#1ft}m1eYjGw3-kk!4QtcQZNgm@a>4&U*q#ExfSZ);HhIy#rUQ33~}csQ}~`%4)vJ z436(!Ux*kX`7PzbbMKWjv#hr}cKQ=Ht4V?iR%uI`O7Ir}R*E%JDfN*c265O6R|q-) zMgiEo$HmVaSkZG}$?^>JAlyO!Lk14d9ynMeZF8q_c~q^o$%#u+z}^pi3a*Cfcb7OtH0%mwB!zMq^V26(JgvIWO$;y^i|7d@K?jY?dW3+Y zEGQ!&UxOQci85w%Q8W%9>>vvbeuT4Q_#6vTBU{tjjoELoq(($M-b76aInnpvR ztHV(mvJicT-zeam_;nbvsf~K@O7P$QBeAO#hTtWBTqih}xO%m!CfFg}x4eX)t18IE}!Acl*nqm`*Q)|#2QuUwcB2)jp znMtm*`KDI}cMG$(gcuw=j)2s?O(go>7$P6DFpyDTmkyDR@U{i*>_^Ccn{09U=#4_K zmbqPCW}FGq-dCPQ<_FB{gwDX@o3%5J&WMM!&A+5LnOb7pns_$j=Qjs-2$m)@AEI%& zF+rgiI`}aCy{Fy(mZP&&6odA0#mAR>J}%4t;mrzDTbu~XjOV=kcCT8U_GP?W{i@0rD((7Y+v{fG0**Y(hc-h&TUn1pwUDaJPW68pM%V3V&B~eu zns1sHV#?}^<>eNIW?nX7cGS0{vkk&670<~x4-AC|;3+cUEBtlW84{IP%C?%R2xx^P{Y zB_`UwltQDY0(0uipTS=lL!N`4CE!l+?vG>Ak=|&J5B2c1DH*~}xEUaZu>cw(W#zI- 
z7yLzKLRG`tm_@mSxxMB4Gq`i~Ypg$~U{9BGMn&>3F6SoM2ij~R0p8EIXgPMB=R&3J zLp4O~9`glj!ubd51mPxRW>`}zrx(?8#Z~fVhxRiO)OrY5?+%u!gBL(ax zJX%`3-L)3fARaA|pw+$mDP}N_COvQ@?C7>O+{wm9Q9}zJDM%FABjQ=14GTp(a;oET z*N&|js|=Xn#RV#>we6@W`PXxlOy5rBn%mFq8)pAy$`mw<{ZIR}s_79MAEM}7zpUSSW2bu< zH^|Ub=?V-x`KVqhg19bsJKnjEd^gwOUylUY+pi20UI#Jg`79`?bi-zVr=;v^rJ$`_ zV-N+CHp-G$pe2daXxVpa-~uL1+%uH99$)SNm_tg-k^m1@_us2L89c$pO^!@~l}aw5 ztgg~M^*Kj7_@k@U>$+6gYGB`dZ6EU(`W3n_S%$M=+}U6Mv+qeM>uC7%&eTeHllERl ztUE+}to|aDgIyy@LXu~c(@FOM-m0kH;XfBV%|Uu!&;$n+IR(~QWh!6BK-Re@9{*C1 zz2UDACFtWjE+O=DbN~al&N*H-r?iE?j<<1c4mDt#xiacqPwch&7z(br7uo~Es?LKs z{NK1TE(qGTN$ltIlUDgmY=`#QcqsA$5Sc$Xy#h>)?2nCpgm#8s+Fl~(k)20@8#{{4&3J^ zU%F@rM(M!snl$@BQhO=o|D!|RN(Aa_co#Y)S+>_^OjDXiqilpO`03+X*!q^pn_lJc z6{m80&IPUZJX*iHC2C|;g4k7LMJXw+--iVKN7z4k`MLEa6)ePbqbt_+fe9_VGydAj zOi9vI?VQp1{ObfV<|WChmK78Fnsf#Jf-52^x> zDvW=mM@%*Y01HkpoGL1a1{;~f(DOh(>X@R5GUqsv69rNf&?VwX`So5S%;e~mGv0A4 zeInKI&5=W-%w)zGXVlyq&V=j_a7 zJ$hvall?TscAUTxs8#Lf;YrWOko2bxuRByk!wo)q4NE$suQ;3IPhwG%ftf4SoA!t- zI;q6f*=Ya~z*%DA(`~PlB9e!e>&qO9VPJHfJCN>UR{edabDe3%Agh6cnrifx4}kY0 zb&Bhnh{_8w;gxZ-kqIYjCIQ+*N@lgV)&_#5@Gq(05bb+yPIv_eHm}MH6>${M!-AyJ zt4fuYm{d>GL7*C9!N>YV4a)hC`fCvEhp>Mf7dlZH1OG8=entA9(WN_yFgAzPO~~NM zu)J4A!HE&-wP@5MjEr`@yd@c0oyi3U$+7_K`TChAIh*P^7y`F~T1w1(Iv@8ZD!){= z{<&q6$U+j(I#6Tl__ePu5aL7lg*A;l>eLO?-SQ(xQ|A6rbx-_C80LZJJlA2#0W>J6o-E~ z2Dk{{XDCZ1KZKX zL7n(o7aXr7x3f#n7v96LePPd}jbPVJixT~W8b$_p_HtFLuy|VFcx`BEW){YQ)`aBGYiZo(?Z8WYAtb$r)IM8PXlruP0cOGFSDgHHJ zELi*#PH+Z@_fpmBa_Z{XRKuz&q$RX|U#s|h(?&U!v-nJCmK3LY4Lpqfn6$<~0__I* z8^uw|6889_WU(u@eEO>f#xkWWfC$7kTj6B1p9KIoK*zuEmxs0o_krW;Ad6s3{|2EefAz(iEz3%JIvz4el=+yjh?0Cv#Mrm&cDH>|kd0sn*kwfRDo&6j~$*_lC9UQwN^q?hGP`Z?h7^$k0lks-i<6d6o+M24Z_Yd}D`sO5f z7gbP=Q7;!Sr!E5cbA9N8T_DMO%i{94(VpV}787enbC{PWqp_#ZDaPhT-cRcQOC8Hx zWzt5PScE=-O-L`9f~gnmjBR7E3XiStPD=shaqURdig{H8DW(u+;cFAwuie46ZyVi1 z5UE@Ih3TL#M{AuXJtBWl@!Q+lMJVVy zyp&(%=(S-}PGTx!Qb&dsGQ0KPMaj}OmoJm9ZVi+XlvRkg-90btTi$X$o&^dc4U24> zzV?={dZ!I4Oj6npGe(Ncb)woT&-1x2VKr3c-=19&wuxr 
z#v(sf8!j~28~+veHgzz3<;xBE)1s!E=vJB-Z1ES)&PthoAp;r~-=S*z`+V#wD-_&B z0B>$G);f~;`CpM3S{$TqrJtDWC(dRd*4TUKbG2Gv^fIVF&Pk5 zoRr`P5E|JEQ(VP>cg0Nm+OJr0_8aAAra(QGf&^CH(&ECXKSYJqYTOt z{LlBtbV`Kehg*ca5^oJidR@oyZ2^>|=w8REv;yi78+x=+9r)}ac1&Y`8ore}!PSB&PdJa-GMauYw-T5g z(MG@2LL>7$8FgFbb6WSQrSOk(lFU69vw>nU(twG<@S<$YGx3ezmH-5x!^P|P=uNjR zJ|i`j9?4Tf`Nks^dA#@*ogmN~qT#>JDak568N)eoF5Qh>@Bh3XWo=aI411a?wyp8) zq=DMb!yKhQ&)2mik%w~JZOF*W>|e5g*%WoV)>Bipw9%Q;&^!Z5WQjP_(P;$ohAw5D z@EB0Bi!2~M$@Tj{2{a7y30>(YKsp*9uDc@-X!iz?Alle~Y(_p8k6Z4zy{Tn{71W<0 zTdm+c`@ILhIh@w&@Y5dI

-l_9J1%GKG*=QP9fq)dElqV#d=w8Qx~T2 zOF}iERP-HxB1=FkgQaIpD7=F=-`bsG^2ignb?E&3M+!4*q?N9auM>TO5_U2@mr{TF|zna1lQwKoV>& z$R5C=KWNOpRFvHAP59E)i*;442Qs32SD4_-%lE~)EW86PdazoiAcrNe}3cX zFL4YsyU-cM;=1u~t3E3)bG7@)&R7=C(aK30N@PHzE?DBi`0D+S#$SL>x45I{>(Xx) zw4<(35v&}?)AsI4FI78|SM!iw6BZJt%*A!MQ=mEfzGR1oq$d=;GmMFfMQbRLV<9pO zvM2N1ks@ZBlMmD{2eD+NUmAOkP&FqkDi(`cZW{cyPP+JPyFCYTRegqYdr3rFeV0K| zism#o{l*Co-+eGldOf1NG_GQduoq%)%h`5eC2bE?PJ7t-=JEKy!i zum*9e1$OE_0TOktK$2Lqoa)st?2L-c!Cv~(;6oM__7FlR1_8zoZmlk(6{* z30RRc1F~4kL&v?B9a`;O6D@1`9=3nIKZ+^Jl|<=N3CdlGc0|!J$y+XZh3PyyErq0d81s*OAaLSr^!Q~_ zOfDh&Xtbbel42x}_|+52VR0wp)eN+igzGjNmlG{}LNPPW8~spuTTLFUI2k&9Z-CIo zYdYeEL$J6(F7i37a5<@R77As=0V{&{m`&r?@y#WRTk+j$CeisJR-S*-HQ+F@{q5zY zVx>|%SxGU(3Tvk42C(uejZyB68D%0uw5P{L>*0&*3th$_y9>v$wuVfoESSuI6{f1R zH4JSITI%MqoWG$9I7{VYvTDE?Ihr!pKTsyEq`m>&^Unnr|c&LWQgS>1soT@aqxe(&V)Ila+eu^8jnSa;RB%pJ9Yj za&dMAdHJIok48!$Gae|iSh1=b*n?8^mA*Gukv&B}T|K~;_ z9jG@>fU1@y!AKm5Uu$M=ERm@*^%=egk<)AucLlomo=Q=Vf8#`6bOZ6+w+p-|G#3MQ zr$*#)yX-3+o1S^vXm86$7`VEUF1YgB0)TX}k+w2iy8%u;-B(gfgoUk)a3+=(_9%3S zf9gGTY_t9f(=H*fT^ms!Yv-c_#x0erqqkbHOjWdEe*&S|*O72wos-B-H4z6n7*>cP zRraCfoj=$U>5-4^l*%ar2FNXIe1^XoprAwInu*Lo#{-jZWBr0fZI;h6j_lph#tCaCmNTHD*H&6V3>PU-9^(Nyn5 z3qvXr8|PU~7*WMtuW9PEMJ6Z1d5Po+3sV=G@u}Vdy#K0oCgwkzhHI|y^B0bGS$3-NE4&B(JD(H^=~fiNUYul^(N{MI(ZnA!x3!A0nRnMsCB{n;~Rrpb|*XF3{g#MR0#9nFZj zT|_=6X>-yn_{3&?16Oj2KSD`oa=;|!r~X0>hRQ4e$pe2#j*oJ2MWA7wnexKkS=A|P zZolD)9RT>N-OK%TR<;reiUe@5rpWsv>UFH49oz{&Te)0K;UeJ1=S9ez(^Y-Edwj^U z8;iHo`Cp~_(DqLSDKyOaEX7)4X8<>l#TX|P3JV6X$Dw&WQVvJ|TG_EB5GN_k!|tDg zr7lF2(!WOXe7$wjMkmpwLT?z*Ji#YD=XbwxW)2#f{-=Do_6#ogC z`FPh({zW?q{0TpswdL|ww6gvLxg=1QRJ)ZG?C+FYN2*I?5Q?fTLSTtQey^NvIV%pd z{xDb^F4_t8pL3|Gp<#WktjAQEmmvi(FY&_rYz01d%yz;h6tjb;|Ns+T>0o}!Oe-KxRB?ruC6VH?G$#S~i_(M8 zR}E*@AAnnF(~xA^m?Yx|(3oX~o%jjtvP6-^__flcvgYAynMy#9RBKxV9Z!(BA=CiBdYswwHHxk2+MOVdPohIV$74O5Xs3{h6>azkNbf=6`&`8 zr#rd@a`^$4J(Nbw;0!TgKWZNONyfQ`RdPaKSikOH3`BLF?$30cbZJJ6P__=V2QUZW zecAHOU2hE|;;2d{O0=aVA^dWyMZJok0)>|-ww>FPyxPltYM_Tsxe}l?GHcxU02@95 
z9HBQ{?_FZp7DONKll>P{nV7EC0HY&%sF(sWFtQz*=qbi2R9oxkc&{Okj|@*y`q4}$ z`lLnd?-IrCAE^$RO{LW0^6Qy*pqq!25r+AN2%y{M)vA-Z8x)iYr#e3|+VHIc)tah@ zYU8c!B?{3^Ghz@jyPq!@9#q5nTFwiWve~5K2)gwE`*t8r{PlQ{QX+7u+;0JPr)81L z=R3=-uIiZ&byZ;!CTaLgly+5CIyC~oP09M^q_0js(dss(l;=L7OHRius`CwNffkaI zSV)WC_-a~f$bT{BaaAek52N*pv{Id8ikLwK>pI!mY=B36Vdn@1)}eD(umH4(X@)Oz z(az$U%m9wz2d-5F%ffA}&}6I+u`MyAN+H*T%wpt%fig*Q#MI3(JFzsHNUF%w?`{r$J zO3{(u)E)TTp2s2}4xF+l_S71~A_!`gGe*6ptkp$C7Y14+6tU4xJL(ZK_b9{T*Ten; zkU5M-nR!v-D4>=zkhAr}C_>!ncfz4kY75QwPy$8P{F~)DWR&Y?{eQi@ zl8gP^Z~WdW|I#UJVAdrWx?1$M)jUQmF4AZ=j9aEW5`$yY>Ed0XdAT*D6HJoExEd=@ zwT$Uw>wi;TYHU&CVEwiI7?r}iE3I-LNy|jfIBmoH9=hw^h01VRmu$XB_BSafc>b}U za1?Gp;;4~`6@MNvoa!ot{Zm|Dlg51XUc75lpiCY!LX1Aj0&EiJ-9UaWkyI|<28L1Z z#BT5A*dJUs0mC|(!x~Krr@~g~b@S>C6@h&ShlX%@Kicq1Vr^cYp2HXYqSSZg^NCh( zIW#HP%*4YrYmpL?3JtUNMgpx7T_`)$4-8t+jI)5pwFnNtEMy5762JcnzqQ4wb6t5< z-=REy(a7YDyM6pDNsKX;1O(wGmUO8e-i+rV2HCS033fJtCtD(w-3Lc;qo{g=eu-0g zF9%x}9t2ZXaed(1UOzw>QaacANBi!9Q<-x4J1h-7iqzMf9Mo`w_5Dg&Gii^p&6SmK zpD;z_1d?Qm6_TOTMHxMPSp8p0?&bId3ISh}r3RCCCd2vY1{NEXLPB@9l}D)t;5Fpm z-a1aueGz;BiZu3fhIo-%nJrGO|I^|j^`DW@{o`apsV=LAy!fVssvMHAr#H1&@ST5g z)d2p2Z#)assDDeyo~D1EGeHHLRLT#OU2)ogMA`hjW&>SYt>zrdxc>10D$>l79$KpD zP_H_RY(C30C_86kIc)>1SfwLApQ2~22}yAxJt>V_>S3n8;**>~&^EjIHj=hpC4aHh zE;X@W_fv&F_ivni*8hgv1;>)8;%+how>&KHv!!dj!-UhaVpw}{%ZihP9xlXnm%=e% zaF!5q*=QD%(IA*}2V1Bhi`2x;NRN}Dolw2WSQz9J(euHk_*n;hJHazndPLbqUx6}} zi7Oja`;aE10eT>M$(M1|#Z7VUTHG9*N4C|*82ju{kBds@69JpCZ>svP`>KOGlODzb zqP+W=a&I%I)P|k+fs_Qkv;;#x>bS3tvV^;quAW1NsQa0bwShbFf-NoP8QUS#A06$K|?wnRm0#(|NUr!NEN5flCtS zo728F@5^R3_E_%C>#%SHnZ1q#VW)0dWry+Br0Fggto(k|yhN#&iBEVD)_sd1FWqaa zFfR~?RM_s@<{(sST+JikLr@>4bJVEtWF>$TeMhZaB46Dn&skzn79PprnISh{*?Xnr8DrqY!-7e>{d0ilQ|H56^b)x){K6uo5I5WTc zm5=`Gi~6&<`|~IB(35$D(PtUugBBz8=j!_fkH5 z{|rU9EoR8f&Lu?<5uY$~%8=!+A7a(C@`!{B32|Gbg<#Ml0=RuJ#poPp;ZZCzj3D=8 z-_@q&r-%rQN06H1|2>(9J2_=?x5P z&<4fLt!@AZs@WGE#Rf_LuyjV#byu70#JIx7_Egd=tAdMoO`TP@L$9Q)tXUR%W|cvI z{uEt*)l?YFQOZBjNp+B2dUjiN7(P`T8sX?vcTA{&CrFS^rms9JRx@8BUc6M7F)CnF 
z-rUDfcblz_WTFFd37cKQjV)iSFS=w0<*$o-7zC$2CkCn!Z-%ylZ-$s}9o#~&`!YK0 zXPK3P5eRwqdeh>X0}NXN{;QBz+X0AosMozO1|%Xb>0iOcvwgM#W~S?rmR1RpYl*Ft-gC5Pm6v$_OPqnn*4cm&wVZeyR4c zk84*hNOKZhh-P5IYG(p?b|X1V*O=vATzVT#<+;msD_@r;(8?2)>M4;Y0BT%~q}=>E zJd}a?R_{F#^Ntnq0l}BMp@j>P0HcE3rSII|y>wv}sSNsv6M}F-W&(<_^wLw!!yWsf zVIxXSXb!|+Ls0A^P0A@Qg=8W9>dS8?9YZNFBUQ6NL9*$+ zxNo3YP8Kp>mENKsDeupCQZ(fw@Ahn~yCHowLoEMZPLsVaRM_TVAl+Dr-NS!Khw%0I%1oAEH3Rk^MYoVWh@$Nb-mKuQa7MSQ zT3&G^GaZSrppzu`TNaoV-}$#8xfxClJhkFKxAM$dDV|(L+$M7VzMzp}>0Phe9XtJh z=kJpPB2XbZQ9-$v%DQ{et8k(sx=egHJw?UmXA0u7Km zzLg=dz~R4_P7{pCqb%&l#tGS``L<*dhU#6pjQLmW_qc|Z$wKReoy?S0dS{#W3TNvj ze9#_`iuEz7uOzDCT!U*50;;Ig@~jvt)dwY0jn5#d=rm+0 z!uD$)y7STN^$<(UNAYdtyMS**if^fas!p@5)L#b9QZj$mPa7kh_6q+Os#Eh3kLKB-s!qFWXz=%bJl+p*Y(jmlVw)|1e_(w{RE#v!~ zG=9vdTV?C^O_&A7h%c54E75da@nW=ViS&zkepH<_>NBdK{jFg&V}BX=Y{m^Qsq+fn z-Q{zS?z1s}(rwTrMc4awIC`qwj}2FODcH6%p{1{e>Oz46DlfyEii2UKFHeM}ki$t3 zLKUFhJYNTxNu4D^2ueT~Jo&EA<-mGHoC1OiRGK1wWS9ZSXC5%^=|*(EaspwEzKc|( z6`eL@vAzyqr!&kou`}8i!wh&$BIi-c{R1k3Dd~enJQWB5KMQc7cwEd4&$-rXEJ%n~ zr~DOR3^Up|J$A_drC9w@dX8ch7$oip2&yc}$$d|7&tNtiAFeIu4>==AA9n5^f}KcZ z3F6NmzBV&?Fah`5ev?Uh$v`?p(N=cI^JgMJ1R~W8_H$E z2RnFK3p|#4AIV%nj6|+GXT^gb;HT1pCTHg@aog>{0(C~Q1Zh-8ZBll39bKlsyW_>| z?!0*aFeQf8*cWCYXWISUhr+54-WvIUuoR1_%%~5FEhZS=#p`nj#Lv}`lnt1|>r!iV zX^Dh9->%weOHfB6t~AC0ej%%L**F_xifJ2h$q!+Ao@?q(s6#XGE<|GlPe1)4TRR56 zFB66I2)eO+c5JKqmC90+g0VFT$ip6X8Wj z0*&5k)8WY`Lv9|%s=fOLmINUvYP75f43tnVB&el*_ICkUvnce+G7`+`P1p05X4U)T zN{*i~D?&Xy%>|ko)SNj2z$*>zGzYFB#fv7nH1!O8)(}beYn?>DPdr#ap`eAEK*?}U zy6<>V6K@UrBhe@sf_csa-*~>H>;NxGyU$q6@Z$wYDL`>H=-1xS?Hu3WZA13DFB2k+4{kveJ+ zqAo^eeV)c?OEXS~OZ!NNO-0LL-gX&%)g)h+re{;!`2zQ&O_qE1JYx6)n{B&*#8yc@ zQc7#lIUSF3*qB6Mu>SR70#Uo4LV28H74opGgp$(*m+n0%8+?P)lQdENx3S1u)%K_= zOIpcXmO_-W~HID;C|b08Q~R&VOrbEyj=5sBKJQ69%tj{Meh`v)%i6&Yy_*R zN*z_gyQhuGp%tx4P%Sdts50=AY8K~aX5lTY(<+ta=0>3AR~KZ>{_$n4e8~=yKR6{Iio(LYhlmGo@`Dj)5Vs zdQh?1pw_T=*0p76v4o!XCo7FePjU*qQ(U%307I{OO=R=K0ril1u*60`Z#vFNHPDd_ 
z#Aw#=fo4q8qId50Sq}AIvqp_wEgAWkC(T%hj5209Sr$*V_CKJ@!?J~CRQo@V;Jf2^ z8nCOiG~Lt}s1F&ood3tu?9aQe8TDLGd&|}+Eh)q2NQrQcg z>i3vlxf$?by{D}=Vgt$IvCL9s4&v%w=d4=tkcWIkL0IMdoX_e=0xNI(du83(Lwzkr z0+GN|C#)FKt4mq=%MCUr+a$_$B`hsrhiB8}n3ddI{mY|l1#{{yC(8~DqWkjR7J%(* z3GNDg3m2elQn6IJ=8vrDQ3$NDl<(HKl%{Ng>*_Ys*|E1e;Bl*pdaHM51 zzG80L$sfJvd}ROUv~B=b2qlAgS+~WwdiwI#QPoy=#QP!KN%k3Tbtj5mH)5*hOttAJ zG_0=%FCS|HaU~qtbIl0loLmwauy`ohC@}F4mS%VrKSdR zkp!ghYSt`}FokUUWgahDEM-vd0%23k_q3&|aMH5cpRW8ikIx(ntg|o1sLo$GQ4^5< zyD30ilvoHI=MyoR4tB2%VhJTLz}qHUfZOWJUhY?C$k(1o=mpK~|HTNJCW2@Kc4t%x z+>oxetS}=9+ut`UNhuu`clk)NkeZ7}AuYAy{JP?0{}82+Uv~P5BDPZWTWp7Dmft5+ zZ==DAv@Q0ekPEvo+ri3oUsKwcOldk8XZ}Sm$s#gNOTVp;RV;$-wx%vv#%9Rd;Yg*; z;hP%+NSjj(YM@-W5XFA{!R1Moxun-r2)Z>6UlF5>WnPHqW-!a=j@l%0^BPGMyHkr* zPQ)#w{dyn?RDiHzQN(|YKkqS5o`cQ8rd7#cj)lbIwru5z!KjGqN z;bIzeD7w9AfC8loJah{G1;joi3sJ=Dyc1(^$5afY3?w7C8q4wEgad)yZnv9-fk+r= zM~T(+yO9i>tzX?hY+{(Jy;0PqL(m-$IQ+Jxlcs6h)1(;$%0`;Wj$1!)>o=_yH8)W? zB(&*#gX4i31wN15FJOXAPi*&#Z@K8Q_jw+hhs1zaPn;Zpw9L08ozvWjzs(^rWX0S( zWv3pxGmQNdK|klK6!o-;#&+@XAz*5lP`@_2m z{p?m0(AONrnh8@7DIMj#gg=riGMwVKNFpcnOTqhpYj*HXKr6S~M=emXR-AV8+oEPltt9f!voev_aV5JZxsi?K| zQsU}YCF0eR;Sv6t zyy#48Wf2r&iN=H@m))$v(r1Vl*D08PiEb~AP#t?pw; zr$OzkOS5AwIUf&Zhto-@Al~jkJ2=BM7gaHdbzaM6O1U2?f!hzQDqXv%K{TI$8lb`^ z6q(P=6-#3si_N?{0tsH|Pi`65qp;h7w(n__Z7KAK3L zC2atkofcJDniwp_YKxVi*zd z3fjt~wCRXWn89#=;Bh^`^^7*ll{#~=rcayaoTQ4E1yENWe27u#FDwm~_6vT=nY18u zU)e?L%f;Hp++Mc!XKSCge|D0TyV9LR2+^X*qXSRC5UalEl=^d*LWb;t=L+M4HzDnL ztub2y*vn&|VMRtNo&UpOTm5p+>;}gg=>Da2LqzS1V{|kt6^hR3&p|VOAgePf>m0yqQN6316qse*uJgB8$`y&eR&5?E}`eRU~ItgLRr~ zbGeVmXUtM(aehE}lo@$(jh?>gnK?HzUYf2Foz=)>K&!@?@z6m%4g1p22d_yJz+6c- zDC2LV-Kk5@`wyK+sdQL?exyw;2#=T!Uwl!q&3yq}t)OXa>aO}%GZF~!I#k%Lo}Rcy z1Zfu$M#iNuL?ec+S_&>Zele&kn)f3JyJ6c<+ILY|-^aQ1rQnjQHKEUEWg^83Cijfn z^ZcZU4LPI*>;6a_eZyizew)NcP!D+m4 zFTE!`gHqW|nafax-swmBJ2fC0+Flc^Qi5y|kxl_Jd^}axI!)2d#AiA+-uNMa z8e3XH%h?UnBKBlmC3SUvu_4RSdITZgVrB$xfL0hA8S(^=?>p2KGF(%1Qt*)-{h$;@A$liwtlP)Z9g 
zrZ{?F&q<#v2tRYFdLRc6T=FJ;(^G51e5BPF3Wm-wSRJ);O}xK9|LacaInffH=&^V> zCucjtALqEvE7-o6`K$?H@HygtpkQpO85zfvG|w#cqqW_ga1pnEeL5nu=o5FU8AyS^ z`=7=&5$W0!y(8D0SKeL`2;QeDmvht{Ulys4zX!(Q(8%epsn#|s9n5B_)UZimTtojI6($0DO$^7p;?`qac!>X^1uTz?w zD!FSr)OY+yvNlE8|>$!&EO_P3I*oP!ig;6aAVA5nDy&E41=YNCKdDTjs`!S zj#26)a29e*kA1mxX;$l`zx}WW4X_3Lk)X!7Ql;OdE;927vmkVX*VRPp*x?nCpzg=7 zj4u?rD%wXRNns@#Lrz!ISglC9+nxQvTn=bjD@7DM&0?QWSokJIk1x6Hv-s1)sFbAdZB7y zJh~662Kd?gJ$^RA3QY>WlX|5=Q7c9_<$6nDC+6`O-5PI175aOTAS3V9{R& z$s{*ltx7hXY(s#vaMOQU-Fcpp`41iw}dAxo4?-Wy1}R+tworW$b>?o3!S9+E@35VRe) zQw?3DfDXBjbtU+4Y_Wv+QfGxgpr9gpr!hA*|&S5<~qYZjwC37BCNwu%JF&I={;`fyBU^9y6 zykq3!1e*)wHeRDPY^Pw%t_QB5yrx#QAKR_=cE2^G-@(hV@4gFvGs<+*AC^W0q7_;l zt=e4N+IPMj0F%-TW$CJ{EaBnJ^`_=8u3*O( zR{`E^0k!0TB|GPT`a2awc>)8mj704jfA1FG=QVc7rQb?SQ6VPa+UhB$x zATZ7Z;eO)6$Hh_CyvX#_8%_ydv$#6jun_1sN4=M<_6)RozBSTWd|$pmW;C(4VnAL? zu%Yf|!dZ3?xlfo`d<@WGQn+9EIYT8Qf#TC9lPSC%IenK0W}s}JPwBhSoO!cN*f*(6 zytA`NmO3ve&r2rw!Q(P$_!1S;bhi3p&yUtfXBI-5}8k@(2J^ z2?o*scKC5^Xv?(2>rP_g7HGWt>u1JKgynSIc&=ro?n2*7o<>C?&Rk;PPc;wWNoCWb zBCw>nG*wYCrVbQSsS|&eq*fSzcOt=tY)OqYF4i`Rha`)VgbwfH#!^e{b0j>Ws~oG4 z`AVo2zBLXZ1WtU@|W0L7jj>hd9Be#*T?8vo? 
zCCBOtoDE9$P^<%%yL*b!mkaBiJr>vy6>XTRFXk~%}-9kBOc#w)^WyV zNC6#Ap<*R!oK;c2Q_G?D%&|1SXz`v=5v--ANU-uBeomo|D4$`1FYu&zf&&6LMph*{ zbqgPkfK49u5o#hwsQBU~I+Di$+0AIL7azg4=UIJ%j5%G{ia{sPd>G2&yLc<_tm1FRWF{}CFk zTl-iE$sOflh||=nJX|VvwmYJ%aI+P{f!yjS;dc==QCee-&)n4J%iiU~npnXV4aG~L zG>9FZ@s_XcoEfBJw;rX%uldFxwsCDxr)X#>*u=lTt9w(p0^>=E z4p;;fuDAsqE6IELIZ1EXAugJZ0Ia75j!aNWNcp8zkal@OZL<7Y>j%h&pL8VHiYgkC z$X&I{-Ze8YVM*(9;CkHa;OgY?bN8_H0mB7R31gR%BXVvKS!vOzQV2L1?J?+qY8FaH zAU0gS9{=k3ZAI(y?pamFvvI4#IJ~_r3SnS2hVv^4d7kolV{l>EKXe{rN3tk`oOB;9=*Ov+FGVdtA}Fc6#!S^$~0J`>yT7Q5YY zm$S|AJcpCIVWy>+905uDAQj3c)qZ=Xls;5fb|-ucr1ZlFTKVi=;-qDBLug{K;W!OMLQ6)xaFZDDx|)Hl}WHdKRP|H zSsnOf54D0d4IF4>@L}7S<5*rgC6Kmmt-eoy#G?_XmU)X!xh+c;eHHR^F|Z>Oi9veb zLl~0gXpWo2gLjU3hZm5OrYLep+wtMa?Q=u*r84uuWkvzP=MM>e7CQ!3WrNMm8K4zL z%Fka|a-Nut^cONVXjJ%O#%G0THGyjK$Aks9N&7bt=czylWOKa=`x-6k5v|2{OujzUKYd7Fjr{A<4_ zxyr#-^@*X0wh~AQ>wL^@Fl# zY4V3eZ(GU_4=isusIoreafxwJu}R3A@2M2y&n#&JKy_F%_BBG3(D@`j#=AHT-WAM0 z(gk6t)KSU#ssBPeFWO}F%|Z*7Z7xs!Fpj7G(V!TZ?47#sbpwJKn46?>e_~V#xQ$=} zrNNV2;A?EOq1gPvQ*vHOUqJ=I`f5pf1^uQ~juFPjO*{)jOrxuo*LfAqwX8(kugXS? 
zQ;~QHsv?4>fw<)!Z-;);QwEHpI`^n8HOQZSG6GEgA&1#7^MWyk%5o(b>P=sR%ypnu zcyt_aei;u%ajwiJjB;Ya_}g%`aWwQbs%QT_F{|lrQzs#E;cel^;0mF;+g)HDesaDP znUKNOT?Tz&TVBvp(p~mkaO-zGYh!AvNe1$M21s1Pyld9|>fE4yGu>+%25@b*FAK`S zeD$(hz+?=V3Q|h5!eMny8O+eVAqHq_ByY^kqn;WI^`=2^aOsGEG4LF74Y(CDkXaUb zwT6OhD8}pK19(tM4R(J+1xj9EW!7)3{`c7hZ>D1(`urZRlP6;pGVR%|R`-4?pa9b6 z$!bx?{ay`H6EGUP7m3K{L$}orD$pid>#&EVljc zX%()U_-FbSeFY&K3-eWvz?ro-G39HtGK*G6yI!TNZjX(Pp_eFi2BM?|1WR%iyf;}D zS~-1dgph>DU%D!-i}|kqO_ISPzItUOSveD`opX|eRh2-B4_IFbI58Wd1Iv-ek;jS& zHV^4fRb8&tsjqJ@(J1gNm8I%&(|GwW#ab;?uWy_NE$QCQRC7lfy**7b4s!kU1s}Fl zn)){#igZ+UB6+$ykc7~}-N!r4kIu#p~i@2mOhK70C6;~Z|sB?dZd zt0-{{47V?^U*|zIwUn#t;fvZGIPkMk4cr-||AWQ%_mRt8g{?XV-H+aaH{TLroVu#e z8i^f<^ql0R(B_I$w^v3a3A5L%PPHwvC146zV{^L@B#6`pPu>bR`egz=L(XTlDS6%+ z-x$$MuzIt57NFN*IOt82f3Ks^nAraF>~eYGVaoHPy#`e8^krj}sbX3E_~+k5($$=t z`8GS}lG_^im~`9n53cO0r$|tc>IAyMdXKtU#Os+b*-0N7J**l)F{n$aKgF$3vRewu zIxdToALxAodCfir#b7k$hz&0s)N=~iWJ`pa*8$M>uK@y!pY0t$Xc#hO5<=Hbj3jRR zXs%uj(wcw<+m%AP%d1>iSTSFOSpqcA;H(p=B4D~|G9i+WqZ3|NQHxmcQ|A0^m)D|s zM(D;%zgnky$lT_aB(DsJdfDtB^E58;#p>(4R#j;i_ryCYlqELf;>5DCA$A7UxhZwwf`ZD5rNJR^cEdB-u-(LYYc=Z zCl1mwq`F;~#&2Y-Cy}326X*8flI0#-UG*keKjlZAWBPE>W92Dm<(Z$-o&RHR z+#mRi-W;N)jHB2X!IVVgFH3Nv-q;XT4X-LfVu ztNU7&Rzsb=?NezBcrd^rw7=S}eYjdCpV?6Ia|JqwvC3waPF~RXYt{;63@6@KZ{~q@ z6oIYLdDT0;bkX$jL)ue}_1h!pLV#x7tGq*3}|xri0n| zsn(WNGuF&3)*#RZJUsdx8)7;tuy(mc-xb{o$p*wWxezmOOs|!sAi#Bl%zrF+;@B(bt=~PbQnLwXDBCGhh(E_4Bi8A0BRyJ*ckNS?Em9gUQ#rgMOim$DR&;I!V_1WbUR-bqJy^r;qv1UAd^_7k5kI z^7oksBA`B^V{G}@32978%LfCas7@ctm0r}>C*u%ct$CuB@P}a6=IsXW$4ot&f&d`{ z7maRJY3n2aOx@Q9py;FZfQO;Y>JZGx5z*j^%AGCEY}SqIZQuqA$SFx5{=2*Y4}K9i z$6KQv>c_*a2m-5h3eZHst#FJSp8L_>?(iBiyWC*)og1Xn=jl&lr3iksmmfiBZb%xD-45R69Mz zEvDZa}=*08^?(P9s(Kr)K}``(B(_lqX}#LJRRsAywKm@T$cP)d#x(2#r?m^6-gfdmTr@Q zB^5e5p^Usf0oIhUmp$wpXG~v6Cycxga@DSzYzC81c3MV2iQQUGmi`OLxX;;!!EOz~ z!qAO*{M-`zj3@;q@ zon$0TD`gKYv667{^li5-L`CPdWdZkMgm8jNSDnii@Yx44zzMh)o4{pm#;bQp~%q9fSb1>-+!NRo&Y!ouB#6Ehu0XWn=-h%|nNT 
z#WoyEsE2N*cZYB5QQR1}h)_Qy9uWUi$|W?IIDMNhGI-4)rIHZ0jgQMy?}ze!@zTp& zn#4`2HC%3BU4BDd?!Gsgv zIAx42RmxyvOT}aTj2V)rJ?#8*IZ(`O5E-KhnP+%Ib=`+UfyZ_>WXpcK*U^?~%8TRj z2QVMHv|J@3Y?XFpzq(m6EBK1@Ns%;j$xu#QZ#ca^sieoy;k!aNl6E zOs3G5_YF*Ot|H`nAgwA4DWNODZFtIO94%9hmy`*ahWCGInOYN&G{u7eM?kp0`ID~! zmR^ZmmCje9Z}~;nFFzGJ>o6N#K@JNwmh4rkxm-A`+;bW~etf)1X@`c^3QF`&S8kr- zTWoT>&7E5vK&^L>lXR<2+9;U~!xk=p$Y}J+rr5q^zD#OAM}Qp*r~+>b$eCaM$+;8o z1r?oZ;T9jAO3{ILU-93|Y6NXoP8nIyoTD1l6TM_mLwiK76Wg;NuYa8*!o0fq@%Qsp zOkMAYjBZ(KXI9rBcINs$j)#i{K zrkp#~K4S|opx9=1As-mV)y`<_9Hv`qdlewF0$M160dd@EbvY?~$J5gT*5bon(a=B7Bgy&wyR2BFh&|mi zsQ_#ieDKoHzD-1>YH?e7Hu$tPx|cWP%pQM_2$)61Bjy|#rfpzPJ5}}(FSa-{f|OH9 z9>PMNn~tqW9A?=^0|J<* z36{rqg0C0A>)~+GG;!lXJyj>6OHw4jA2^lH(Xwh%cg$GODL&m5;{=uus_q#vX!HAgcb1fs+&Y2v)>Dw$N<(b-lX z{b1Jxo`{AVWrqXb6eeN({l@z?u5-e{bDMd_Hv{4s)ObN}vbbfCtK@lJQ!=Xw6#3dG zIEBrzG@0j0su?mFfek_oN_$n2!PVt=3Ihl~TLYsu7*=@#HSYrQLoMRdf<9JlQd)Is zif$GMpBlM?knKP=z)DlNL|K~bJV&xE*AI|{_*eQ263 z_yXs|tuwsJN&V7XOs*0)Y4L3L3<9_oh52)u8$eFw_18v~%8ck(@~pcGbp6yIP|{nV zl2Up9!~Ulq*5jEw_mg<0kqR3%SuKl=lY zGc#5PniC0y@O2>&!-xA%5rD(~6PVTt`jzf~a8M7ROua$|(o2uCZFHR$X`kFr!nt%Y zxgpbY`ybHQd3og$9R^oj%`_mbD?arP1P!T$rWV%;Fk2-gthfe!gcrL-tMi2~)quQf zS-!fOcOSwgTjGSZ3FPxp+;XV#4be3R@sNt-Y%DW#0Ye)=vE0}lD-zie^Uj&2kZGct zpOUHhn{<~vaD&=Wt*{`_jx0Am1uuH2a35wm%On*ee+vBC@CErW{@ z#&&Md4++n!|Kax*AQzm=%kT+dM(G*n@O$s8dZ1k2grMXhOfIu@U$`&=Jf8zW`@qbd zm_x`-pUgg-XXO#d63~xoFf-ggoDUnVGS6&g=iE;@|GkJj-QDHWTreThC1vN+|DLL= zyu4W`{g5#~BG*y}H$qQp@@(IlxJ;%NZEdrgNXsd9`(uIC$Ow&UJga&8^-~AGfE3Gs zaB1@W^Bh;}rS+WB9pS*oJ4*>{e|g;TzJRTcth!j`9x#i4pdMYahqG3NlHa^IvW%o# zuz`iyME+HJkqIewNz60?)j&aw=#t#gFt&Q)R5?JT#;{BJQA)T~VrV~j+3&~BNRSCZ zeX!Q8#_&FVc09AmZ48;bEFGaFo@k(p*Dk(|)Xd@p?K^cdP!Xr;N{!S3z0aVea|+uV z*{=lI7A!qcCkp(75Xi+}<-{@I>ll&}&12g(FJ8djeP?}Kp=UsZ0^ZMgE9op8wWy?5 zaP2etaEM0H$;L$|0JXRz_U~kL{w#bmKsmP{Dj*|uNj_sADlO*D*eRcxKEg0dt9r2j zORJ*w`A~J*te?}pvKkrVlL{G#$KBEw7`!8z_%Q5^LEp5nETIo}A;Nb`zEi-y4pz^u zvhxlg$JXk3R|3PVugzdPhp3J*OK=;JZ8&Q`B+GRK`qwKT%7s}DKlHD=^D7c$G1D%x 
zFyVbWoFM&#=tnAdsfW0g`3l+%RHqa1j9}s)A7_}YE3DD(%P`iA+wLIlFerv~XA-r~ zmN9H1@Mx)etANc70_8-2P!5kp3DCM0Cm1L~^_)<8b>NUE4LYS~*AOw%UgS^<60D!} z`AHb6jXzDJb#@os!MiKlc|`BV>U+^KjC2>BcbG)JNI!Jk055?+jMjG++b+^I_sW5o}) zWv%7~COw@Xx%{iOV5kEFTDTXVSx}D)TP?D>G+V-@6p()JN-?>!sdtW^{aRYj!b2^z zMCEs;T#mk7vS+!{@%dnvQd4ONVpof3u5T#JIpv!|Po=Xcd2F$?{;lhBMQS18ltmKA zbp+#vb(F#dK@;L$r(P%zQIfIV1;GVHvlCn`AUbhWjlX|3p$f7R&iox*KtmCyX~l8q z2Q5~_Llr&m`LQ0>3qN_g5th z(7ov>%QYP~ydGS|b$k@*ZJWGR9p&;I12M*1>+Q#fNbUXDku`8;Q}_I$qQpl@3fcjf zw#!pGz3}2c!$|dnq38{aHY05RF^kTQAOGjrgPM*X3etsmf&>oX8TaO9*E{q@qJGp} z+PsVTZ`VPuMCk?r;QZKTK{u|rCESg630I&9_WS!$7lOd7@Onfk?nF^$dsE}MJ+^SS zSF0cPAKuXB$6|5IT;m>*han$u4fDw{NLG`ENDxwDCeqL=p6v9ygK{^tr)cf{vC`<; zpgHS!B4U3*zlpiH^4rY7k>&aFu>Mk8;Qt{slyhfde;{62M2+m%xvCWRqt@dAO>|Xa z1KiHxT&J^L5#LZssp_!fE)Tl-hrzMPTy-H=3V{NOh0N5j)1S6`CZ9E8{h~N~SF8?1 zUgr-69WY?K%Gb=Uv*$^DbCp|oxbaU%k55r@pdE?~+EAGh$tP!)cA}*oYym#x=Q&Wf zolEJqSqP9d2PgcRJ_*VqS#=IY+KG8KrBX`qGJhTWbY@FPjQZ|#PU$7`MH}0wdwg;pT$#;T($qM+p zCGOSZI$QILkcZ&i5B?GzTE-`B`WR}Udw2iGXi`FErNo@QpN4Q`dl98?i4X{4{XQ|JpssD9)mQ#JmdDR z_Z$Bbt~cv8VtN->-H=y}uR)BEb?cTC@Gla|E|B%3e^#lk*BVDEq~CJ7-SH+{VP+v8 z0Qz;2^AnC^U;y4OFqEOobALX_C|UO?O((F~40KI_-Z7!*g>+t?_3Ui4zId4qsp;7@QXPpNl?bWAnIcr0 zvq>WHP%Bj{>ykV)C-o(LqCGA}-ztN7HYPi=MH3#b)9?%@Ods2IkXZ-)Cg0@oESQH% zAN0*F_!*+5>5r5w^Sksg)014AcM2;pJ(~uw=Op`X_e1FyZ1}!pq*bV=a?l$grD!Kt z?RF0s)%3D3{l~9C06CCUcJQSq@V&eSa}onz&~>Ja6{1cg)?XvkS$R5+7Z{X!KDM-R z_Zk$Vsbjc0WV72ZOBk^3q~)MoaOa$Tk~Pp!-^t%hn@mU;So?|TQWLi{2Z+bnS?wN* zItw$NN0^8?S62lZi6)PBu?~nTD`xbjQ8=LscIMjXXli&jo-wG2FYFn!Sjp#sD9zjC zzqV#@#pyjNQg5DfaAy_$-l{ikC2)d%5-;YP1c_AsqD1uoZKKw6M#VzY&dzIj?TI2M z^BVM$cG3trrV)rU#JF|5V|N;%Bw^S6_}$OVn<}@xE{AKqPZ{nBIo8VW!`9h z^X}gG7sIT`mB8EvQS$@_OLWB}J*%}3?hH7I#K4ywQ+2yfGY z9%B!IUGVBJ-Dm3AScEN&IE}fQ(IB{c(WIeX6smFnV44<&XM`vbJ#dIo71^kZd7GXDx@bfn{Q%s4 z9O6>W$=|`{gPH0r$WIK9)1A}NI@+8KH(LY=K0ipzJw$>$1*97y7upidEgqA_kZJv@ zo1$i*qRcgZJ!)Hv&<&7neEZWLw0R^nebmAoqyPvU$OZ9wu=&S){Hdr@hM!sPMLiG6}X{ZYVBBmB9h0f#pN!5H0)ZhqM&!~5(NDn5U-%` 
zYz`aA+#@FQ6JRh^Nrc$5d8bH?9qOItfE*qEC1Fjlv<;3&6%Qh+29xUYNXn7>JRucx zVHR2&J5d{=+~yig|KwENQx(A;nX1l^T&ALYvnxx}Xfq{h0F5^Jz2%CFSU_w8s?3D2 z@rUhKDRxAQavMcf5f5S_GVN~JCB_d&_xJUNUv))1!Q#0Qb?1}Gn(fCY8!u;9*l>;W zu@Fysp(Yk6{9p}WF~AltS`xY30+V)Z>;62+e3_~?x$1B@YvhgCY5(+|jtg3rQ%&WM zCOm}vA&*5rm1nbeCy@0U?S@kk!*Z%17G7`YUFq)aO|I0>=Tpf&M2LQBGm$0d?Rp4D zYN9WYt!_{IBGhZ;yOQJHH{3yQ{P4P{1c(gsl~$JF%6o;{qw1EF4$Z{uKi`lec6`#C z%|{0kgT3IA(LwW*7@M~xOCOAqKX8RJSjF*$xZ3*sm$q2o)w)9)dV*bqy*NvVy7GJy z7$uKgva8UPOgQ{W4pIn?>wH}*>UKq5d@RUn3u%ENd*-oSEGHt+U5g;&J)7r?li$F{ zAFlJ~{Qw0Jc8e4-a*x|DafmUWaT5)zeL9y+MRf+fuYXi1lk~>fH6Y&CZk;UV%KJ{={~~LcKkj z$BmYM5rmKgr4ZCL)s!y=Ckx>MFj09?AWO_g9*#^c`6?+HY4l~6^L0Gb+uJa>Yqruy zzZ>@fTJ`qC`0g`iKT5Uhe@D;8KbF@PC)Cgd`@r%}16nkh5vng1m%P zI|!%s({8+vP#Zc*Mjm8o)8{!r55hAOp7CX&qpn)qai8Xt#4y^Q|?` zKEg!dvK@g#ZOqg2fH%IEdg@@}jr$7yY&E70)3e3ue9;{HYZ=&qw>x&lMXV2a6TaE> zvW9PRg|>#M06LVhR;KgrKRk@HZl|`Ips4r^2=aX4^{r$YVcyH^kN;o^le;*6`~85Q z<~}|^%B-9{ysr%1evLuT-GiEm1NWxHdyca}H0gu!K0RkSP(~T?NIPT-){H&>>)j~b z1cA#C0AR;3D1G0*=4D0RDNlev7GyuH-ax6C+1-Il8}b~$yNHsA&$7dnKmvRGztS-? 
z$E3vy1cO{tT-W2C{<){!jsd5F6qqb|`aL+=wqDW7+Lli4i=#G56WW0K+1W_*7 z*pmVT|B0XxBdnh--^`ixOdnufI-<0&?)C%AlYv6ka4IXvLtodVrZh)11%TtKRM>Q& z-v6vMmx1A{h6AaAy)R>iAx z;aYGr9hCe&xoQZHr+11Jj-Ja)eu>BNzBaA6B>BVlb>sJRY-t)}?Eh*8CzMCGCJ;hg zV7i$8NT`2Mu60onA${#CKBRJO<#L5jRCl0kv$ECiCLH+S%u)9(%q{pt)Gbh7vBDbsylygl{-wU}lJm!Mu^pyNp}pI`k8~ z8(B?wTh1&^KmUjZ@cIoPNMZAg(S9^ki5DAH^W3An4yt3k5=p7GO|rrS|8gR!Ih}#> zM=_(atlypM2+hpnJ zLK4j#g~VGUlmM9(ABQPdGvKJIJo!yvy9Sw&_wGt@Evv=b&Rt+Dp2bfi4Sfa z;?(Ss1a`P-KXcLoL6sL2t5!|UT@j*}hQkH2$AwzWQw%2$Fn1@^?zRCoCnH-Zl??Dt z^9-xd_^NfoJeu{?VUzRZR<$GzGx7d^jo}$M31=f4Ka`DvFJMIY7&J+!52opJ-pO3dqK5*J@DN)gmyCo1tQb%S5l-l{hQs zIHjB(wI||&AezLf3#S*DCThtfipl$6ECaF0_1bVsZSt;Wc;ZQ}i?)&PCqxH&9gl!R zV)ALl#24lhFH<`kun9g`^Tc9cQ|z(e-OO1PAcpLpAzRBeedPS@dOi#{fD(ODyF8yz zmg0thx0UY+Twkdi>W7J0^&Tygi3hds6_YdY0*fHos?4Bmo<@fbU^;L64S3}NWu;nx zHwtQo;tUA!#nWL#FCV)YSF5+_OXUdLh7-lm{AHOvl%h98ACK4HS% z`|Zhhd^&AWDzKZ_^3nAFN6@mG-0H|G&_ZB>dn|Gz*xM0lI50&itS=e=g~}}ch>xF_ zVS=Rz6q{T4$3ke|ORKaVb(L-gUVytTnx#y=nMBQnQg|}$=peF-^&gll-c%xx*Dhre zf*E58WfV&BA#s=R*YZbXs6Iy}Flj#*0-*Rwu=APV?<#1{9LXslSFmw@!MROhpf0&3 z8HcPXkkMP2iTzjTcj|2FYZR#0Na<|Hh?3JV%}bg}Ha+h7nog#e(rhK?XGfjD2@o&G z@a|@YP@>bxjxTGSVgU&gV}ud-9d`MLsJr<)W{otasV!}t8z1b9ZL%2;=FCb+&F64% zjyCGINhYj{AGl+AnSxP)|FRHfAO{pjwVdu!uND48E}r0ASems_HsGZh?h{QW+qtMD zdp;F%Guc2^ax9J`XD-Z^ThK#@VYN&{f)Se~*-%K=H8P@5@&sOfo?vfHEkev+Z6?od zV`z0u6)!@}AG@oLgM;55b6uc)N9Y_<=HKbCu#PAF(uO>ts0to!F&E5@G5Z-KJIFEH zUXa+ZSr2hrZ&TpkrFAC$p8wAVzG_~WWmlUfuEH?5=n&+}6D_y;KO7($I&%L0Y#y3t zn}88?&v7aOy=nEP4aE}R)uM zq zD`wn93hEWY(wnkReIK!>3M>T_r4T$!lrp@_V^Js#upwJ0EDn}xbuVNSQ5Z&C4hhc^Hw*1AvqSnB(Ox-- zi4dbQz?dW@t4KTeE9C=RbtIj67+MPj*&34jQN&(nJ_0>@NlgH6s_@kSO9<>t- zGj<$ZahPSbRJtcII_+ywRtTceug2Clc>(@(9b;jG$; zd9)}ppMjRq)SbopfEMU8_U|bl6Z>i~-}N|3uA#m2WuC_=6^`Or(Un__EMPSOoga#3f+O8% z=tjam%L{c+id-QNXJ^9PdD@10J(%N=ZV!cAX0WLqa$+;Se2K><1L#W8$(XoGr)QOF zLCEn|xVsSt2Prmm4g>s9-T=ZJC#NoK!DGsCf!UP|c}XvU?vc?fqzmz(7*!Up`(w^f zyH=i^fj~a0E+h~?@*eJT)qWOd9;(etD6+&n9gvrPTO_9k;_Zx1!wYJcoifVVi}th| 
zZ@!cV{dp#n%HIX)H-pBSzIV*{WYMP;`UjPfqq(~4`eH1=SSY5a&|ntk2&e?y3Kx;e zJlH;Qs5JlNkHxDT5tQ6=R)6#3xtczxj_9NJx#4>u%vflQT-C%QNga!DGU`RYZd)53 zZ$onoLBm`H6Kp|??P@4E@gIOJFmvo9OR+PhwN?^(C88{ehBzUZCMqMCOa9p$jCr}z zwAycrn!`Q9l|xog0)?HA?_H5ey(E%P8A5rc6pg(;nVHL$m3h`VYF)uPYuN3uJ!(6* zheV^0095)J&u7dGx9tB2 zX#eU@D%b38ku#xpv@QbIM^FAs(OK5erbO6EwQk%}I>1tnow)q;m5JK0t1gy_BGsfX z_wm=68403tx#>|S%>v1x@})3nwZ6J+`Z#>W91OctFi$|PE|hm3(|uEk4pQDk>`aJ6 zNKXJchc}RGwlvui-=(Hbu^lyp+w^S_CV9CjG%PjxOA3dObtBRfPVbd#VQ68js#l?1 zFz8zwXvKBg^F`t>)rk{Lcy@pt#gQAuG4Z+}^iz>I@qQhoK2nNCn}$cjbSdp+ z3&_`e9ObB}+&&6p-1YWIw`*i8C?jjbOFR7zYNg~+H?)3w;Q4loe&)8JH}KWpt=`C* zmYb#X+M>m~3i2^g!JtYWvofO*{ZQ#-;!zM8`$zJF-FBvLPlC1e;m}R7zZ>Y zNgUV%F9C5B&6|ly7O9tXue*K&UMl9{T-(8)nh)EVi|icxVCvS+<0@D1;)&7lq`;Qw zH#AQEH|O79uq`~DXHn0Nj@835>L&6?j&gzivL*-kaWF7hek!u2y8D)uhD!z2_&c}_ z&CxkF`K-f!<_xgCjczIe!D{&P#6E~oh0(2)V0H{3ghgR8Wh6y$aBpu!$-nR3%+!!a z$Nr4vbB^!Ldz*=pN|eCb1cazdrhRt;gwo^)_k!J;KMdpVri%l)lpX+s^pYsA9Gvm9 zSy7}cT33yv3RY;}Q@>deR7H}hhR46af4!e)rz=?W9JI$hoYN^jVq&<%TcMtlimPeq zOSG|l1=Pt2Oo(%Nv(j4^kaxMvgc5{xq4A}c4b5)U`MHB(BB&&Jr) z_7+?0`lyAL(&<5wgynz?O1DFnP=+w$nmi>s^;QAu zk0w*;Oi)4XNDnUM8vKA(StV@1OBwYrVdwy70wJ8VdKKKWYXIdnlTtr+KeQQHi}Uv? 
zXv(u|&suFPQ9aLPqewj)Iz^R(<5KQxi34x4@Tz!~G-_Uqa+7W4-pnFl%Nl1=@KQ`& z^5KDW)n^0afwD@U6h)&nt&zq6g_D|;@iEtU^vv( zyyAWf#e$s}LP57p4XzB3mgnfRAijTcbRzk51T={y4XTY03=o%%h3)XFR;tN|F`F z6ecpmb;Z74%9jY9dgsLMiI0Pi(IYItTu?g zNKb`k(S|s;Y*0pR#rbt?4WDL++vIOEpf;wF&}vg{c`kQu*QWwGPQ(mL0H>XE5B3wF zV8@dUsX4Aydp>IIG|v!YZLrN0Xe=8(bf_OaHHz7lh3xPHILjEJOA3YEJ~PREE%y@J zd@xG|QHg+wd0YN5J*q8%G@(rhAZ-xFkjM+UYQWn~1GXa*Njkx}NL9M4kScr&rMih3 z?=_!Yqy$?&)fnqUQ&q?JB|moLdmp>;57i;$Exg?AK0rkOhxE(vL?w($D1t&Ow)36F zu8d9P0ebJ|RpdOEnAh;>c}hATjyLWDj;tQ}*I*lx<~T%_GWSCe+6AfS$vkA=2T;f{ zHsA@n_%8naY3AuwM^4s^$xLfdCIH?;Z7Z{uRC|in|6^% z9CBrDq~O+BS2BdTt*h$&NMCAqy*XxpQb2#eZW5_ou6U!>PgpVGL>yx<^PP&5bfPk} zathc)OeVmTrT~VD@q>}w2o7nw6YYzg*RY@Da#bbApye;pN0XZ~D-%=06vyv+bKn{R z&ay+-FWAS8uw_|X>S@SX)IVFU9}lJ)>t~T>TbDz{I-uTasrHN33fGmLs-Qw{=`lJ< zz?4_6(+zRysHsYXs zuFy!SC|pIcOR0^uja~2Lh91gYXWH8Cw{KOpH--hvyHkowW2W;}F9NDnlntC!+PBU+ z1(Z-&QNtHPO;3c7TUv=S%E(F>u3Tv!$BN=({``~G7K7e_Gu~Ooo5X1-E!Pz3cy(V; zf-v$h`y3{ss2${dB0Z(76_6&cyfkwUX@vQ+=SL*-X&r+&h2tz_98H z-lTNzXBI}bFvwf z$oQ1MA&G$nuca!{^NrtqNi>6O`(-+a{;obv=eMv2ZbkxYoS-W#O?2gKVWWH=TYL^ z-^%UGtJ8W`gXsj~Xn!)AeV%EnTpZZ=85*tiAWo=SaIF{}@k~&U#uTFv zb|I@|!xdK!ZK%3gan~9dSXgDV+{oywL~j-PoH5umJON#qA4J#w=T~AWv$#XOfdgFd zTD(uVk4Ggdl#O}+alHW~p;b@D40mIjtqWZvAXz|yNelHg!WwGPYKa%ZJtY-AU@L2i z1L39wbs7(VTPy_`0h4BZXC#p5erlWB1JNG=`|yRs<5)JxRRhp_N>;svyV}(dmHX_iHMp?UUx(L7pO%Z|9Xtk z?NLs09>BOF)s;m`kf%ymd3l|f1E{BEdKr8c&XLVzP<2(u`4twj({bq>bo1d;@|T{o zPX-ofPgJf6_}$(aC>ko4)}*2XMQGX5{TChgm$xJY&4lIMnizAg{Q~JsWOxI7=5x=0 zSY-Rjt0rMA6RKA?5#FawZ2D#tEne8yBiFafE_~}vmiuS_uQ?&zc@sEn%5e&gEltWD z)Ip@cAS|(-j>q0!FFvF*(QGM+>JKjoj1mG2MiNXkwE`XG=}(BC47-rx)7%4!EE+1H z7ovK1+Bkz8roNC%L^Mt@$frkAuITwP3ur2QRLoE6<%SbaxC8yn$;kvigoQo~Yfb&( z*C&HP?<_B#6vuSw*w&QxZ-$>}F~N7F^@#)+e%7?Qc4nL~2uW19g7B=chTj+b#i<$x zDZNN|u(BjG!JA)sRq8;TVrQ(COzn=5j%dO+HP!0**O`f6i5_Qq7OAx(8HxTNZbZc6 zYLd~%P5Olk!Xm;%XCAdN6)1??7q%pp&lfm^c)Sb@IsHy>E*7P+2@f4C8EDCeUA+odPO1X=**t8ncuu zb`Im%X*ooIVrdj~xS%{gwHs#YKSf;C>JHaRsmUzHm3OtE39fWn)3&xT<2!Kz0=4$+ 
z);O|@{9BaL91ex3lMM+j$UT3vazdIlw07B?WZIxUz_Yg0!U&3^Ph<-^7fkvL925L! z704)LCrgu>(QHvAAM;xgZESU?X-~(8YwHDt;vi5M{WvHOa(_Aw0`X+7^4q!z{(Btnk`kis#)FnuSZ}aw~)8G_XC~1eVq-jOq4+OTy_K zXmzg^!Un(!K~k^!y8uupS{S&jy+fg&6Zc8Vs{v%J0Lw^50euPnishpx#vN}$04P~{ z_?4izgcFiX_ivqj0V*t4#hKiuoIyIHMnXB!FFp3jOX-H?oO(ocvb5|>E4y``?`oZUu?bOteMnX(vE9NQxpo5Nx;(Mavr*M+6z*{ih!wmT#ERD-0 zKcuUWL9mI!43U@VK*HdhlF*2T*5wv zZmp!%*F{I#%%~e34v>IllMY&u2JC%U293boUqA_d`ecpOVXzY$tN`ZzZbm_8Vx^ z+`AiIT7R!!!Tk4u+(47Sc~`umAKQ>tG=s@QerFjRX;F1^K)EgEg?S27X#?{n4V(OE+~M1PjEmDyX`bKSIVZvVfPc?AdPrESwh6T-{EgN(hj}#SK}X|ef&3Gg z464OaLUw(SXu^m0K-2~45r@YYH9OA>QKoDuZ)@q&8N6Ozm(FbUU-AwcZ3@Uw3pkb2 z!tN7oWa!zusup%cG1PIO@#N)MiAa@#UtD|lH`khskv`Rb*??KxP@+NQyy_0lM9Wxayft#gmJ zx7Znw-5?EDeoz9|9lk!c%K9LtF|eRB#>83ys~i5K0ZBsN>9V)#GS8&)QS%1+c2#RIQ`GZuQh)%h#`DI)pihROo z#cwNIki|DZY$ch`HxTGqKK)up!-jW}SR4!h4c{W19EYH~>tFVK-5kgf)QEhb>+5QP z>cRI(a1H$V^Q(5qwL<%HKcBAnB`6ZTFTt$;{ojm2a7pOs*hBlo%9JqTKbm!qVz$T` z^?**UV7L{cy2|FzI>P4pl`CBy{l!(cgu-hMO^~zj2h%@;N(y3M!61ahi+0O8%m#Dw z`&<2Y!u*f8C)5SzCj6^Ie#>uw8UlpcHXb=q%8+A>1=tNY6y?m|_71=hfyoDIIPDn> z-P%OyGI0~a)uj%o;_Batqt8szes(Dt2Dl|+^0=AmuZ_!aL{^$@Glf1{AeE;ralVDPp*>4I5RWcQIuP`eYZgvW zPjMbi68^Hy)Pi)U<(!Dbuk~O8YDm?h`u21J;j)v`Ke4=mx?@uG z+BJ#&c7gQzKzt~iLO4-qZZK$L?=>q(;kPY1)*(yUA#wKmk6Z$(WlkZQxCCTn9i=;= zwyp$;yUkb(nsArKlT?DV@fWmeYOq(Kd8`zJv7ATJY`iK_HB)t5^gOE6N`rGv$pzh&Hgp)fi8#DMj5hgQX?x- zVWg)RDGmz@)c0`SkLZ8n&UUU|6PBUxJ=cI}H!=G{d2=jj8{!F$!6Q#DO=ci&qp=b+ z(H$+M7%WNOH7ZKYU^O zmgQK%20=Ula|}GXjc!kR|0kby>ZyUhb9ZLUfMo9jRehd-R6rl}S zb12v6jU1m3E`zjx97Z$fW}7jZRsLD)ye!G|%MK>t=^{uJe%+Xo zg-zy@UK+C~9^$s_pyYmYE_go=_I;@)n;n57b?JOAl~j2-^J`5~uNA>xbu1&EzTP9955oCWrj+&Gvv)e9{R3@hfP< zJnViPmq(#MSj&pmHY)sZ1xGITOlQTc4KXK*aCq3ta*uLo)_X;Baw_`ma$hI03@khz z{%nUAGOh`1^0ROq1ZiW&y@UX#G_qAL4*I|f-WohkqH3E2y9qaIi=B+a!}67Y!5lYk z)#Sse2TBRtJ22?!dg)bI$P0D@`;j69N)u=l6y-LHRTR_l{JtRPeb@oH*XDUz1>jxTfrJM$cH@J}c6FPPm&M;`yT0yO<3?e-lBN zrkZMb4K8YNr0yhv^L+d;0?gWA0Y3V^8%;GG`Ah+lBg*`S&8_&N@U{BL>d% 
z#lSmq4c=D(s9XB?zui0`^pFUzM36lr(?d3M%5>xQO`^s|LHc)z8Lb!T3V178JA9aG z(r45gAm%O#r_CrD168MUGAMK+6%%(=$Mqbto)H`slu&ii*cGFE--K;PO_7*6jj&Kh zD>)KUq3tt_B?8Lt4ns+B#6O|;=1B`I=%R%8_x7VDb*H(l`GWo04#JB6erWwN-%JsD zs+>wjQKDOXt)rUHhNRFSb88FBS(CBYk#9&Ic@a?NX}UhIfRczc91S?Y5qn7E;j6(! zz(iRJ8oH~=u}OSYx+f~5%psj$Dry=ke$J5CbZ0Ws4G%9glOMAlO=X+eB|LX6Y0ZYk z3XQypIH=8o7}Ok=VYgJE#w9TD!*`}r0&dpd_dz`gg7B~n3H)b$0AB_>^36NL^_e8NIC}^Qva{uDDdWhusbptK)1QRh z91w?rvD6v*VT(SJ#tOV&Zv=-FAeTHF4P?pH^X11Z7|4{mSo`0)Cj4LgRkWfEvSB(r zl;XOnCw^4cN#)f`m#n@LB;`ra%Uh*YOvgl?=pPDsas)Z+F+;+!o>Xrah}s4Ys1N1y zIwnGXE#84lzQJLfINr|$EAX5#%DcFe%>Sy*>>c^OQB6nn)* zy^ZU7nlE&OE>k_XaFQTEQdlRK9`1SWcVAIGsl02Xdub`S2$i<##t3l%tybi8`0PX- zl2sla0jTaf>j)k`mu{jNsk9)TDv7?!aE&uU1)6^^xhtdizxNnX3Q=dUyF<^IMy)bB ztBn(2I(#$@%q=u86MTcJ&MuFFe8V2|ak@67f7SBDx|&A0#lM644JATqn>EsEpbnRb zwLIulkJTn5XN_&C@YH*sdK`!!Mn=SOeY)mq?3&3`-M)b~_Um?vbB%8S3K98IM2x=n zJ{H1PA_cm)p6;Kot-=<<%2wom4n+X~Sp9sQ)?T<1eE;2~5}PZP^hJb7l#ie4 zP-gYBMJob#ro1hXjJ(xL`1ykKB>Ujd83A&!jD&1}*HkWOwG8%S02>cKe%u_B?$dh% zh@A3yj{cWCG1OzCihfpOAPScNK|sF0ad$CUx#whR8p`A76i(Gm?`YgK0Ae}V&6`OA z9YQ;0)$tw?Lw*nu#MZ5f2h0w}dU=Y>VUhL`Aji$U25r5wRdqe3 zD<$!~5-j+aS;QFES5}7?EratUgD>H9Rf^qb}jdN7Y2ES2`l8$O5Lp5j zklVe`TsSyWmFkh&hC&6ETFwuVOr-7|X0W+q=S2MH_!fWQ*+p*WH*>QD!| z`-OA8!%~DSbFIqwPbWXalt-OES?Eo-?eH!9q({4DaVVsID5ON&sDgcvG5Zflkb zx2MDz^RW&t&`lFW0f&}dRwf7FCx(y0ov$aw^OX$%=X~`M8v4!_>G^&+rR`-5Ep^RT zLsx{j31@cD^|XIGRrVXt2bkGefKc!hF6iGfKKEA>Rw>Mg7K;+8GzC}y;mfta4;g#j z%>bTSyNydk!0U#n?VSG&w0ksC#Fv{8WgnQjnRDk&q?J|1qsTN|BhJUN!V?t=9rvLi zGgjcDMd-G7+9B4nNCv}ZPGgpiCO*}u0>1?>uZM%v9)L1s4z#T7d>o1hq`WY{@m1s` zG+&;09jF9GWRJwHpO`y5_IqTd5FCJ^ftMH_Q&?tv7in}@HJFDbw-#_ zb&j!)O{eS9c1N2LV_KF|)z8hr|3a+B$NZ-%YA=m4GTJ{^( z(7c`J2hr}``9m>MMfJw?nqXvzA9+mD>el^F*GE#O7{fbn?}t0aGIvr{6~Q4Q#I5s} zh#ZrujSlB81E6~6@EaWS2KD}0g~Y9(_a3%plm!AJE-J-o?IdwvI0c8;I~mj#r+O|MKJ|9bhDAjKVMN#+&S?5{>gmMpAt1-I z1h_Nyb_0}$le)q}m;=kQDZHy9T+>$FIxrlx$zTVErB-0aUso;zMu&|Rk*sxqH>L_s zZ)VaCPs=WzVIIuh@OR2HXDBBs)(&%M@V8V>n_OWXX`nknHog^u*$egn9jf0YGosCV 
z_54xSX)Y3TUCO4MV&CTmny~+au|pg}+K^1f)k;1CZig1cQ3)Oyffa@0ZIT8wg3efX zdieJKZ?$m){lmgDscP}?SE4J%s;Bi3EWf%?`$K)7LBlty8xu!Z(JQ9R)2=(@$@K#% z#1}o!BO-K7f_1fX=iq-D^R^)TlatXJrt`&|s-JKb=AZC-r&F-$ zw0Q-@GgrN*Ki!yI#&Ieh_!v051(Bwp=q&8Na{z%r2!(hD~s< zJ~CyzS0P#>{C0fWu%Y~%ZR$-gLKC}g*%#TA1Bv&yhFamKH;9$bv+qMOE&KCB0Chei zMsh0z*LW9}>sNX}`7p}Va%*=!$}`BXHT7#HqY1ARdv#zq3tS3L);~#^lr2JOv*Q5P z4XX{-GbwibcW0+Ei|Wm)!77@Z(jlw!^3kNAO zQ=ko!-Pw0_OUE$)sR{J4)xW-6KtGW6t(fmIdtjH5|Ey6F*;YITjud-{m%aGjnVH4kx)W1d)r;U!GNuhfIOET&Kh6 z6`uMgjZsgN`?J6(%aicGt@>@9bDO;$9jBp5#hye#)8&^OhWaj3KAG%8geB{345%Md z!I%K@5V09dDKd9jm3Z@f`?8WrdhYhB)-qzpdhT%;Cba)?7y`QkQ3QE7@oDVSbfZn z_T_;mmLT7|GvEm*SU)DnT|(=Bad#BIY)z~33wN$nEN5%^RCxtd#{QWCNuCltIwG_8 z`^D7iFR#A1V&4b4%P%&y(n-N8Zg$4MgrW5kRsIj3A{(5@AI?ww#+yhZ|8UtuLksYk zA8zZ5!;yii#>q!9rD6kmvq$7!cQV1iF0SQYQ-do{kGZ9PgMhJRqts*?4+4RC&|YGL z#*!B-EeUMkS(n>$DF-D_#LDnYHquR01mgylbteHaoC7Ftu`R|WV(L9x zm9ld~v1HgisXmG=(&*;I1IEqX%`~v9Byj*&v10@-jAjC7vO^e(Y8n?gTf!R();nV2 z_+E20DDOp(B;fl)X}@a*Nb#|iKCSQLEFvC^tH6iT&tq(lULS#JM?mP4*dii!yh^d) zGnr)cn)q8X4b0agy!(5|Ic99H!HDP*__l3lK|`JU#0EHWV?`j}^E`5^T}(TFNcyfEb45HahCeR3Ccv<2-N_y<$om2jS|6^z9>so6HVBO4ce zox?kEUMH8wH*NU7k#j7^|?%__QdC zUoK(Ld;1cu6*P;y^!x7V&jbMTF{pC64xxO3E)WY0STBLWD*fI z*nd=r+s}uTn>?A*pHiC#G=t|`UGcR&&NY#t2+}bkR+Kw80#)0*Mjp3 zl;=VbV*LOwwBJP77k6rxFknk$LqZ4=Va~M7UU~0^U1f1*=cqvBtQcR?Pk=m?@%&um zsCYlC*KZ{QZf<#p{HhR;#H=`h6NnqPq6sBPbEIn4Mzfdexk{~{PX1HADxDYMvVjb^ zy00b@`+^YkG^h8X4cSS^26^_luDNVbpt)`j@&hb)^h=T7wa~AAE()$W(DkH;agH{D z#%{ZW)i@RZ*Q8$xZDL=qv}KMgh>3(z1Vf(Z~|*ZB}?(*mv|m^K&6Q87@`LjnLGG@bdJ~E{zws)W&`RheX>1* zFd#@+NlW0!N1efNU0iHC5zRT!xL8%L9iUVZ2Zu()4hxj5O*0Ry&%Z3`QVs(Ir5M>v z=5&bkxI^169JZ@4_UqL{TOW`e0W?5HH)q5U$|RrH&ot}(eMdATpWj|aa)Yv;Xr9PK zh8Il2%0a(HWo(V2N*iY>WNG`1&m$FAHki>vl^*N!)LFq4tu#pBO>It+t|7O0VH}f@ z5@m!XGumOxqE1Vo>H^K?Yq5a|Dk|sMX4AahDg49Bzdw0IopRVjtrM#e@6HXH}cHCEKVZ!S@(BwIY9n}YPt zpLs&riFheWFy$~XXMU3B^j?JE5w0*`B5h9%1X-hE?9w#{K)G^veL$OXJte(9)%Eh5rtGrP0_~b(n6V>y{^}o2z@~8cNMhJOWp}j|jZoN6OpmdvIMu?^}W8O=;NmcS- 
ze0!JYmpfwOh<&(2DsNXI>sUQ5wU1Ppi1p;-?UA3J*dV=mPqn~BfJ)?oLu|?}O}@0N zaUz8)t8u8V=|>}J1a$VkZhIB&cCad+O8CSqAa@A##--$vFSgFAM#cC6<3w{!j56lM*3MHMP_KO($obcKMa^HD z*x}kFE>KP9`WgDH$h0=CNsC}5+!|rGS?#!Et(0^{$j$jvp7T+{&fpa9AOdTZ{Vlsk zLxi_f@?p{2J7=m+D+>A7%pCTgE?F!K&ASx>f*#7R>(Yc1fDO)=@>$2@p0tO`WRv@h zx&2F_+cxS{!T5AOus+wJa|{2y#NuuqLdzxj3q5;U*L)WmUq@7H>VKI}O?f%+LOuymGiIiQS=MBoVFvkAc$DhJ}$MU;N-9lEBEuMT3ytib(LEa#kxjKwu(VtuWG z{l$+33`L*7!31)(dNDDx=-|%9o82HGC?DlOLb!7|U)2NOLEe9aX~ibn&vC2eoehtL z=A#dH2m@Y!;lf|QpkFf-C4$}8g6wuAF?>0@QPBivvsjehNu^wdGV=t~_sRoEOJ9*e z=@S4`tF>p1ijVs^V2WYy5RhyCv`6d#p;N^Hi0Ky#X97`a$3koaH37?08LH=n>p5#8 zUMg{*$1?E!Onds?yA?D=YDu}3V%3NLO4&?{%U4}7ef8*GV4w1kvKWNe4j}T>U4No~ zI%bSAo&WQX^wJ@+KBbUWu%_He6@Lt`ck4E)Xb%ZR2?9@J%q*>~rQ z&}Fa`f#?3)Zkd6lENBW={A;r2KkN)(IE1tdpYthB%e3-F*i2%Y(Z9JpIT~8c@{K?8JhzJv zM^XnJ+naa0fVAf=ZAiy4kV2_2hiO?b6m5Q6pZF%r1m9%*hJmm?(8XgIJVSPSHX-Yy z1$XYi~0-C>Cn8?Z_*e`*n7nH$)!X`V>l6Tz&0+VqNofB*Z6fTURR(v+3zaWYO#00ph#|N2W4IA z$=kl8aWXIknh4N>tOA6feW;AD{e)SsF<`7M^T%BGJ&ygrbqQ8P&h= z&@UKmmFgY!1!3DvXZL8@rNWUP4&Oa6?ONxIMxTaw^^`!93kNUj;3Y=iv+W>=U{kGK zQ2kg=5|v?Xmn?pmO7b%hwpL6H?Dk*0)q8FzZ?6DiNtA$27&q5-VqIITE?0z>*1Th- ziP(cmSDA)SboFx5MEE0o0LBrzKh8=|>VY(@Lu_l%$5YA}`R?l>AED3GtE>Fp^v*-V`C|f&5Nuvp zDG?rL{FAkLxxB7EC>pzyO)}SBfICfKh~f=8>k)zS9#(}hMP%y`zv3g$lGqH5#H(z1 zR9CHM2KUfmWA+~NT$sX=p#)BZOY!Zlg(lQXmBb5#z&0{rlafe{giS#&=tg4gn^k2#=4;cRWpFVqi+n$UxHoSlPYI)TlT$b7YO_~>) z{cORE8K9x;6Y{TlM7pL{BNs*>D`5!gnHd?9C*1Pq^}o{VN`5hX7Nyjw)Hdik%K*E4 zAmFkW;?>W~vUm=aaEuaLGc`)tSdk^f6J0#;&4dPzA4)rhhBfC*LT|S4bpJ`T(i;K7 zRrorcT#a;9lrSfP(0p$)$9xM|tNFk}0$g7b>uJb3?`#4$8XY?vCb8I z8~cM$()aAh^Tmg`ulnKC@k~3hwE39;oI@UzeS~UGp5WG6kRZqmh z#{$cJOo#Rjcq$hjS_2KKh&3Pkq_AUyjenXoZV}PmsqU~moVNY)=<~$oI?Lr=&b@*X zFED4IT=xr=tnxd008uEup&8s|jbcS`{%<-)jYg~tzZ|fS;v*5ty?Cp9oAN{{+jChU zCF*rA079uO(VA1sk&|^-T}fvr00$>#4I^)0hdga;ZZ3fa|zX-VOZe1M&oxmO! 
zR_cU*+7!~6Y3Iu{#3xb_6}tZJebW_PmdxJ)suQR-rAuRQyHDVJIDl-pjQ0pOi4#B> zYWRpJmi|$vgR#8D?obIW2;W|Ui)0PITq*US% zM8|h_?eCq6u@@TnhKcY=J@1Qz(*OUZ`VLRcT8KUn{M$J7KPRklKDW*0nuB3OAspLF31|{@N$od39;TcWN|YBzwlV1ASb!G~ zE+8cq z>?Hixw1k2X6eKT=9(lX$Lf|A{X95S9A|;dz1}$M|ep)XPEH|EFZ_0Gu*vM~KoLA#B zmuU39nX#PJouLVy-2oA@8h^l#asIqd*0jRSpe&jww>LrmclP?{j&A_=1 zUul0tOz$w+5pe1b_C_<&o;!-p)R(Jvtv7!o+QZX_%$t>VOw^{Y$%@gbgHA&+-Sx<1 zD(kdud@s0oesm;hVzkGaJJz?XggACvUK>`qx8xOiR{Q5>rdi5G+6czE`KQ4Gj!T+& zfD~t6%Y;0LyKK4)$Du9<^NGfX4T10k=uj$U&_bn+jwq~B*d-8UT1DAvlYj}^lLtuT z?GRZq`t@}OHDa|wam~%*5-p2{!BP|>Q#MT0Is0;0Scu@4$D*bZx zBkWJ=0FuRC3VC(WW!C;-RSzPV3qAlof@WA7Ivft;KrW}}-NA;XO8~7$T-BspLLl(K z3pf_(8`OWdK8}m?p5rK7_W`wuwkbU2WOBi;Q4V^<_CM^!C^^zZOlt3^U20ErIUxFu zND{9Pq5^Ta`UHd!Zl9??8v4&RULzH~7j!?R!KRG4rkb8{B^nbE<4$i|;#`QNo#&$wf&iFHy?+f>k`=Lp)Y998Oa0{x6vy6OZ)v9-t}nv>$J z&`=1DISmtN-EC;hb+s5ido5%#_)4QOT714xg#ipWxlDB{#*GQAiBTB1dD)X-#=BfV z1_P>5Pc|s39!7^QM;b91V#U3RD!)X!y8orSksb(~qq$grC6NBH0BG0GQVD?fVQ}w+r>WW=OXK$Oh zaI|p|z@95R^|6sda=Mz8u))G44#)$LS8{Mx2oLIwBy^=|C*RL6w zG^MdEr06rlI}`>_2{Wth40HjYZ3VK6j(!eH1^Qcn-l*B_xbuZC&^U0%BWw8iZ_YWh zTn6!OE!JF>>nNy!Ci-&^Z})YUW|u0&T1A#dt<+s2VjNFG!c6VKn|CZu+NocpAqx)Redp?g&BbXWJ)> zpkJT6qQCNWC|d=eF8Cu(<2E74Hz$@D^^NX5Vj;{Zl0;PQez^5m z4`b^|a)paGSA`d^$X^j5@?2a==AV7D`*+d|O1Nl`ml+uyDZ~EDqI?^%u-{s9h7fF^ z_$VoYQ{CGMCv3@@ajwfn7F!c;MS|}#JR1ffFD0jYh2=Y4A8m*iZi_>W`xnH zS+S{k`ovgTUX>FWIX7)gi~6p6+1Jj09&X17CT1V82Ibaut7+koZ*?DDi4b@47WC+2 zt$PRLvkwocI?=I9BhY&Ovo$p%E>WN~px`=zE_0^q!qe3v^{tuvd}tX{+R6^TUbvGb zBUKJfabuPLzctk1Ars0)`zO9^3J!*))74B;XIbx9Gh&)M+tf5#En zKt59b6B`2sFRxDiZfC?;9$Et$M^UbLAk2N;on5eQ6&P}R>CsSwe*_}4n@@XixW zOTMt$BBS-YI{#Z2TWnou4jA7wheM<9Z<}g8D?*74x(W(XX*kqc zpm{lIIoV?0!O($$H(}GuoU<8&Le-QTm1zBpL(+Wem|XcSpelrg6c1hy9uw(s8j&pYc^q%Lc53oD2H`51^-*2j9z!$=XP`*wmun0yA_me92~F@b$d zD|D-@RGcZ|kS@lF4?+-iM(dJ^ScL=|j}a$B(hoCfn%(~n?_|?1?}zRQ5>+`o#aLD` za5vbCM)-bq;;`ap3_1u!&*;(YV;%=H-WRi+!=`CKAs_>W%`C^E)ATA|hYYlrmuNdf zVx?IR2oLatb-5g)=(?EI=GN=bE2jueyq?6cY%9(Y*DN8nE$ym;T&GX0Rf(t!O)W-5EVwAR^DA=Ymxee!(1!av 
z&m{gbFbFqfeS}y_HnHygvw?<%EU>_6n!~(JN9*AxVxnwvpMm4Y*Blsn6pKCJBkqZQ zSWspaqD@9V4HXp5q+A>~jp}%hDw{ow!M2<$__H$fdh+9-7&yD0(pV0ObD6gu*cdbE z(Ef<!CjxL z`H@zRCKxby2Uy3_W06YxnAg)-Rk}H8HlY7#<=FDdcRE` z34`9(ZOG%AfI%6~ciR|l-xFci+ctp2cnyRHPFpi;M(eNl^83>UL;D)({qURi80k@) z07X(iC)V9L?5iu^(-%tTavp7Aq18Uqvg&e4S%-Fty+lLi)7Jz}t?d>E0B0oW1E$ZK|*|M+08R}bi%{s;& zAIl6oA*y-%D{}FJ8WLguq3**j6u076D?)V2m2Aep8EfBOA{Z@M&p}TQA1t}yV!t(h zs-~3VLrP)g)(|I9gI{27vCGvrV`yK3$1{=aax|>lzx7&u$8%pK&^xe-BBghFP*l4x zKuGFIkZ-xYAJ>4Y!Kjyb3^>71PO3#0w)wvRc9;PyI5y0$q$WC5!h#;9HGPaFT*;E_ zRcE4Li&>CXgj5^j5J#nCSN{rZEi&NJCC)z#mxj|Qg6{p(INs1##5d@Nw#i5<>CoPd zj-B+6b@{9Awe&u5cel0k_)i-%$UW+4@8Z+spgd1^LXaz3JEUipd`Ya?W}1XLQpGia zPpc4zX9^Du**RxgnSZ4LTvG84$ z2*yG4H1*he6rto*@I1NWWeer=ffhSi9j8i@m|nXD&kezL0(djwt$6;e(SlIPz_8cI zazEU)_;+u&j!8*j78>W)hG3_Z1<~c^>@MRNtu(x=td)fBfVABLy&;a&G!v&9D_i?wt7 zrM$1T$)4$CEbM?m8J<+9QRVAH#X``3nHA=(QxqrrA}z|Wmjm}J2gf@G)u1*~uBo&h zDn^Y0O;`YXH3irQHYqTKuF5S4FM$#$K>U~fE3dlfrqB;MBUZdIQ>A9@$d=_NhZNWW z%VHa)?GBE2bKH}3TGtDCGiwXYb4k!Fr^&`41Mvi@P2A_j?jvFC2~KH4LpEb z_&-32iG^dw#I%#DDp_HKs^7S4jd?_MBlh-#uUv=83y>5L#aV?@79N7?81-t8sptwA zeiSvu1?}>FUB39qOxOl44i8qAfn-xNjAMsYSH4Ufd^WF>#|C%P@@a zQXUSTZ#M@?d7`SaB#CvSl+e3pO8636EMcyb3wD+bJ%vpo26HakPuByyXl@!)zSqn zb;U{&E85)0FID_Eaf(+@sLZ5BaB`ZU_(g~qr8eIlm;68n``0rmX#widGY0;Q>q1*M zrbRQMDs-bbUq)P~)ycgD;o5yWg`6(Dzz2U2GV(DJK=)0V_jdl;p5%Ed21r1oH(NrX z`h|9+vWdmu^=#^2#ipr-XFiC+nGw%JU=rh6_~zr zPyYtmQGFME@lAIOR^H`9jKEDl%r-Ti5C;BB_R4 z53?q1M6*_x*Nzwe$|Jq^4=;@d)tTve*`~QByB90Ffp8o=Gxj>2889@#_Cmy7qKp#6 zquWK1d}dA~u?STGwHyt(L-mAn2s4cGUNt&ua4KpgU0SkFR5jdqp%`XDVId`YX{DzW zDTS|=1|iW93RXAq4=D%}(!;>zge9&BO6pBKoc)t0NV&(pDuUJG&g%-VSb18O8I56D zbqrZ_^FV+ce`c)^lyg3Jvh=8#QG4E_f+Fl~Ep78(hdYNzPlo*$oVmF+d_#el*`dn# z1EUUy;|$8r0G=f~;k<_nlMDW?1e8)KMQVK%asxylAwO=E9Uc_&rZ0!w_r3n^ z?OdQ#$V~$24Nr1DpLL5dYECbLj@*GouZfC4;t&XBT{iO5^PP|Qk<9pRA@`dj(5Kxb zriW79WvM&#l@WR$LtEOT%U_eYf=e-PZU%>l<(<#a+yC$%?CN!~oE>YZE`t^QT9(B0LwUx)h zWt3+{f_zY)7^Tlr>5oLyV5YD 
zl)@9wLOiX^bX68Jb9fK^nA*#JVkTLej_vQG?B_AK&a%&HMwJV9`)&ijj6AI71{+ei zJy6{(7E;Y!5bZM$y0Sy<`1cp}Bj&WxfZOcaMh#nlK%f$G3mxv-Fv`Lvs5VSogfNbV zRe#`}h-pBI%iC2NagPEVZOl~z6}Xs16>aC7C0odHLOwyvex}{fb!xL0WKDog8kBHN zHp(Ra8j*{z?|E$j}G>AKHtk8*%b9+gew5d3-mPD+k9Rm)5bV9k~;SPa`0 zHv*E3^6A`vQz3Mvnbwd8_L;ET-I2^+%p7&MG*KscmivN*i^~(RHBh|GjVywa>1h{( zZFasBp#T5Mbfo=U?e9RAUJO{@bTIUzZdQgCkO)!#nI=V9YFG5!g`C2zZjtX_Nhg_J zRR(EI7&~r2>1qJ?0k?M4rMZD2No9>DB`Y^rO-J9I$yWoTc?8hdSvKQJSOC!9^s;dy zBIqaLNj;S1+0t5vV$aw-L0x5$p0|HNz_UA0W(QhWkZ5uLG?s2Q{Ms>Om}-;q{CN&6 zq>9z zLg~G%$^dZk!vU$=+IQ9g=9R2{>*x*mmPb){26`46D-}mRrgT=F?qd_-l+=g`brp>s=M#yaV8K z%5s5MOUGCd&6E;)Bb3s@(&&CH6Y`nNLhK^jyY-)9V7(;q3M$&p7}ScvROzYO^+n9& zfd&K4X;Oa}LHrEpC$pDKb|(Th z$dBclm_>*9QHS9vW>d2>LeJ>u3?anKoI>a$Cypf|Pkjb2*6X7d&?eyT+AX3B>%+ta!fcXmBkb;2W zs!F4^_F@N%uDXOjPyhj9X`Im<=zhzfTPM8MoS`4qL!c6==bWSIJ92^a3VAbeEFa{g zQzkNwhJp4GU{Ro{r%fBzqwr66__~0*utAM;(h^suITP z8~mKV8Fia2q36RAziyzfe;u1)3ZS{-=m%5(X`Rz}VQ;?rEnq)Xx&J(ln+j$E4wR~( zLViCjOvoD@xOd?yft8B!L}&M$BNK3EH3>`^Q2nn|po>4k*WC@Qg)Eacts(x4E-wep zB;{`e2B-0=`|APJ^?HR4+yQW+JJnF%kyq14En<#=g1$}_=q{6EJp-2d`1tD{Dl)*2*(v{YU>qadkR zEwoULGcc5*;ZU!MuOa-Yx@o}!@VW9ORRQARNhA-h71Q+VB%^?@VT ze0y_+dS%PbgQocT61CtDgNVsh8f*eK_N8B6gkSeW8LsljG8`2H$>4;~J#R~F{L4&V z4SlFT_vX4TWdIPUM0}u=%Z}?_16gTugZ6m>;zY`r0u~$iJrJfF`OPv+4AlDD!`^x} zz5)=rw=hXB{AeTe5n`|1b%IPK!)B)| zi^Fj`S*&ZG7qmMoEJuC;jkx{IC3C8DIEjO-u55@UvWVb+%?ZYdF6N#iHX}P5W@0uh z%DmI4F=FcV2h94|N@+1X=Y#Uq#fcUu)#kiMfmML+WnM%j7%<)>Xwjn+2n9X)LC z6!0Z&{eq2cli%aQV;V3+T(PN)ttJ6foL6-<%~pYv?jOGB(_t}f4Zw|x61OvFa4q~2 zo@9}hhF_=19LdnL^h7d=|F+qLwLWTFHaCN{2rvqtzpoa9TIcTCQIG~Lmr3NkyfoNV z@XFoKwpSz<2ZQ;+orE{i>YXvteDxiNeo63_n`a{lMpz%j!I)pysYEKaS0VV<`wyw~ zp}()sR631ci7ZhmJ07+8yG5`-<4#U4hLXjSmbYxE#zYcGlo(#@;~cpw`p4%ffV=>RY4I&Ja#kke^lq6`cHfG2D4F} zZ{$gZiYRRcC8w^h`F|Yde6Yz+`Ss;&i$;ZRj<(xw!e25G3)r-XG{Fq!QC;*54l)JB zpL|4m#zYX(8vo=r5_>QwRSidQ?5g|_7Jju?hv2dFn|-b`UiKp4_x zA~%THddXyRp%l=!1au{yE2Kiz{TD+BM>wT_MLNQ>Xio+|Jr_k7Le0D*0P@-8bih`56eX713j)kByR-XTKM~0tHzr`Zn&60zuAYH|;PADOex&WyyR) 
z2C0IXDKyH#_;?CNEBzEnc+sJ$E%zZ@rAUZlgW4xI15DPZ;ug2Awt;q>!Q0jwjPI=M zjGv4-Lg`~CHcSUTLY=zynSV-?2J9p~#>v}Utms2jJh z1a=c%aVrd6NI;>iHO7{{-qWDin!F|dTT{hHNF4t1B5@?|DO{qQ{dIk(5LrF+%-Xx8_6pEn92kLY z{;#8cloRqzbY+9Ejg|w~u}=J9R{OGs^2+?c(B=@$Aa~OhhTc^+W@YMlTK_p%a-m5U z+Mj?L9WA<(F}k_ub^!rQ=!iE>&@nrPQxox;&%0!#Lm-cQ2OG8MP~D)h0SXlW zLr`n>kacxLoqzM4kQ|CDQhfoBaYN^MiU9x~L9L*1WzLYn+esA~WLjrsQyj@?y?Bx* zNg#+JM;bJxZUN;;5|N0uKqV4GT~v-SF|gnJ(Zg9op=aK?wI=2viZY_+n+x=9v2S@U zswj?i!EVC#VS~I>bf+~qU-p|?=uThO`r|c(8G6vI3+Ytlq;24+YAy7>OK_CAsdw^U zSE53`ose~nGK4~#o3E-MIDKy2*XCi&FQ2^_LPUTq)789@>hs&`Cg%Sb&IpUv#xd>o z#dGSaIVX;`ria^{J!5NA5OPV9z!o~b)U*t?#ry|k275^YCjwn-y~}a=D$_K*icVB$ zagm2Aq598`|EzTqyPuQD;tP&3W3bcp4;>hArSR^ii<_4P`OBuz`3X()*7~BI9io9!B0Kr zB_U3DQWgzSI*oUR$l{hi7bFK5!}Ia5*m7TtXa3nWAmNPf$RA0HoR(pF#G+GH#iTlP zp>~~|dZ)q55*h(#Ciy^bGps>KH@WW3vx2+o9Ioxf$1;50(6`)m+tbs$!B>NFC~4Y+ zk3F3WX%_rHgRxd(lX!`(GYu}9xhfBlQOPkcgvgG z!T1^v_Hi7WM_i*4#DE5L^GJh|E7vlnJH_*jDx)S*D3$;WF|IYxOPwu0~3+d*EP&C`NL=#IHSHnxe*Md1Ryi zMzhDo4l2_@wUtzhuh?rGdW9oEw0XLL$`+3W(GZ>3%o9U^$2*gvt)?wR`nzOX?ywQE zggVlWkT5i+E_SNHJc+}`6fBq)(p1`Sc!d9TfPoGcL~o*|sMD2HQL~Ve;^Y0S5`0p1;T=gtd_trd;@v zGWDf!y@K-4y1sR~ay8A3!awi$xXf6bW7m&;n7&>zlKdq-`G#rVt*g6=W5j}r13%Bh zGEDym9)ep-s0X!IPB#&9@ZrFSwb^lX%48HB3D2(ACM8}<#3!yuo&Z5WzP}p@HcSg6 zFf-RXxfTmjenWi*9G&L{6jQ<-H<+)RwLpDRk1PVMT@E?NdW(q*(^BOjkYEKhuy_a_ zI)9NPURc)MSneG}Kpj>sZsh#=~H`pVT8PD%|z$xApJT2U2O9B1Yph+LTIn`P^MXm4;kd`msCf` zQX_5Cpe@#B7vii*bY&&mTl@B9pfEspu=*ZNCP3C%Ig?ux>ujKo_Z{>$oho1DaIS1w zB(*o(+}!lFgIotWH{gFK7aLM_7m(LCqq~6XdR66tN%Y0G;(1tAB~5cIvh%e>a1FI5 zkgnD3cf$4V1q%a&eH`sAKTzVXrqT(gY`X=8v!Q|xQ^T5geGCFtoG^XwoW0~7PbZor+ zR;qJ)>h>e6h4%kDSkkIaUv}fA;Q5NmNVS|&$v|=j5e8b%;Mq!%rsV8hy5>}!s_TUf z%M_@P%v-GDe zPn!qXg*kB|hJYBae5fQ=Sv<3|Us=+v()%a@3DU+0F!mZ89!qWN^1 zSt$MxF&Z;#TGW0d9r1ZwF7LiIk>TW8KCN=Slt-8z%XG&I|W=B+JE&+aF-YdPDn zXQ`4~X=wG>A0nhhv#QLf@IQQJ83w?MkL^r@ncj z9EE=E(ZBCBB#GEuP!S?gOK?Tg%ra|2>35ks^Gs4H3oLdLaXdy1_gFtF`MG3m>$Z_+ zshDLMi+a}~v&DBkP{uZeWqzXUkbCT43tb(Kd2a^bZ~|Rh$awZV&73n%{iIaOWCcWfRWJJnI+g=|j=p 
zS<(5lgZh0Op>T?H?P%CLTxY5Bc1+8a76uucxI_QxGIaIXZ?5}$)4X$`0u7QSbnhTv zv(><}&c{6Bv0s6+ljQcHxM4dxd+AZO=?x z2U2!xIHSf8YG@6opW&22jBuvP7i1$-=~NIY5`+H$yYf_C3`};2=k<=zb$DQ0|yyvgpjlMivrh zuo5+<0;e5_uX3)~o`c~~2o*-9r(zbHB+p$@Mr58khGq?6y5(7u`pjk*Jwze|PGw`y zZqR{hyf*}9A0ySV!~$42mX4l)yUk0PiFw}7$`=m-ovBDvsJGjas*JWg4_XF)K7M+~ zM1!TxM;cfjYU;B3%M9j2oGK=L=ja+!3eWJjuweTBU6hyURZ(3Lb9vGL$ltY2*>%o~ zMy0R;NzUE3Bw7Xg&a~pKFuS02=^LUw(U+uo=0p=otE4En;oG*8WWLam9e7kRV#UKa zK?~{2DbsJp{r(bQZCK}A6BDgp3BW!-$HzJaoxd$m>sfX8+vSr)(F|6!HAjCkK*8X& zyJar7)*+{bWue)kk6^78E4Ye-kgH>UtFUDO2w2s1DC;~~G{#=ibX(yiicuC)v`0TW zjBw>dEC9f9ftz$)No6MuvSy3G9g$*9LMkw;o(I8cSm_xd%w;wFGGHYTWd~Ifng3t= zaHbS8!7}&n2u(fbmUkFn@P&b00BCw0!Q7oIDU>wvn5w;{^k;L+$Y6O)h!b**&?mCj zclt&~V`El4&^Ix)VsaDRl-}z@Q|tAEQ!9^{a1HoavEJ-)`#7PqTPlb1-N%q=Fd{U` zjShkm&S(nd!bi+Bj9#0zlQ|Y`?p0C7(4nvrPJ+Q2`^=BL$oD@UT7C5!o)L?xs(;-P z41Ojz{o-IZWXHr^%NE%>Dd#PlHF;-a29qLY_)#aIF59o56}vj5cSMjs?)u(=gO;YhVZR z+56jo!rtu}iy)->8f2CR$Eo%RXarXm!N%D><^UJW5Os<6VUOkGS**i}@5*IxkYrg= zb2`46&Yc3~b%yi8R0}eGu2^BfuRz{6Jtjjiw85IfC#c}dkru*az5x&Bk$>W}0N_+d z>NL``(a|`AyHQVQ>r(s!dTj~d+R{U@cPLGserixdksb1d<{SHdiZ;80P+_aX2BUAP z!h$bZ2&z4CK~u0w_|7S}h6R^C>i#hpuLHrgrEe@nr8!~wXruPRF)Mfn;&C9I>>bUf zY|)>5oa`imsss_hl4txf`PijfW(0W#at5(KG%pDl{w5u8>B*#;EpKs2bntj2&{emQ zvs(r(>Y88mBUp|P{tNbm2Q?OiEvd@NoiCNXMb5#!6+wyA!E;Az6!eXCzmTJ3{|J-t zqb?ba0sXdpPcsedK!+I8Z1)Hr5*#b^)WCJ{FbXXCd|+_Q>f)a>EbltDwkmt%u*TQi z&I)@!z_Gs~2U9RAUxU6RNPhuMtE}?g#_m(zsI~7F3iM*6NpUi9&~xc-qGgi*FvmyP zA~j#RjU#<}qj_%L>UXA7Nf>oJC?8!-J9!YBl#_YdF?3ohSE3U)CZ1t&HC0HAIj$)_ z=5bc1J+9pjUVqw3&^3$!lknj&0lFD+6(l61^#hBMv;SG^Z9SI4>k?LXUJ0cd;-+?; z%}oSQV)<1?S!#-YN4s2c-+oPOL);GPW)Tim@LE?PM%`Buy|bZB=jSg$G#~O)@&U5m z&|XBmqLI-YtLhH5B2Viw4)@Xt&23X(0g5 zensvBf|uO>RsZ5~CyYsX457PP7UWy+i+w@eobAVa7G=xEq_nmiR6+uOlh+UlVr|Qo z$|;xw`+9hXZ4~94s?Y2zB-tikpXh|&%2Nk-duc3^@f+7M>sY)o+#2=G;lC3(k9x&U zDgiwR+ETgPzVlsjF8l9J6Lo9cZUrWtgZQe7$@yBQ4dc=Cv!epOk)>!Vl9poGL6Q*} z@tqS0KsRe?7i(jx|9#ZAz}Pl&ijLpC`6iBDr!T+E>~ZGb5UcBLSgpegqTN)dQL6UU 
z0ywSWbfNLjV9jdvm+6k8&5&eUak+iuZptY1l>?u34_eDy<)}=rd=h0~k)1+}oPoD< z^KXK{Y3*CXCSR~=LoT@UUus~%bdukDu_HVb-=l9`t`Kr^eKRxvBF(}#WgX;(|G=MC(9dV|*-az^bg(&Qb_fru)$S#*}+{Y*p{|gg&INYT$B0INB#UQLR zjC|$l@Nyd87Y=^bH%B2CYBzHBNGR-I`iIW2uRY;Tv=RG)L+ojq83<<43=h{wjfsay zT0;|=mIg5}7h1}0MDsuI@~seFb5eIqEyRd%hMpyzqk?EYud0h?x9;>chSOgP5h|B7 z^MY+;Zb=8deNV7I~szCjCa#v_xo}LhkhX=Ru-h1aMFA z!NlXEUHKiK&3pX~T64d_I1t+-+4#~0bjLrSLmi} zfp2g0O_WU2mMzKM-&FFjlv#x@lO^=>W+>UHg(pIIsY9t+F!3xSvy|dXi|w<=y&C?| zT2JqT{#X6Hm}i;4k)57I_D@V6#GXRrb2I~2$9}`pd8p1O!gWX#MlY_Dxm!$&8IfdG zne;K(WGeddtKR$=u?0}mR-N{zSoenhKi1n6AB*JLoZona$xb&4_3jT58t_Ss^>v?3 z#&fDvHZug~Z+`B7eHB_h5oB@qr@`wEKdb1sJMhRsAl`zY02wziT|BHHJIv~G=5DD) zw;15c&59gOVo>2)>Z;4SBKgG$l}+nT;cHB57*#^5`IAvf&N6d%*sk`&1ed!pjJp*degorpyh`vqWWn z`Ka$45(+RoJ3m|F(2DRlRNtYvv+a0hWvJtd%aEM6_-(=X+=^-dZ}r8Jkhkbl;SV^F zl!q<6x>Q*UWKp`emFggNs)PBN_4|#R%G;qNSxbOgZjw7D>;t}9-Q`uUAbVKfj)K_B zk?28yp0flZ@}-9$Id$J3`9K`bw%dz{4(dL8J<4=#0|>>BXhAryb3QTt`Qg!#s5faK z%}$@vw(PadM%Uf$`tl!W`d>iHY%9f?8y&lCnk_>1b^OP=aWqw}Tfna&%)?eOl-n7a+2@TP_vIS-kJs?FHIxo+=~Ww^ z5Hw%8$ZlS7|2wNQeW^3*Ls%GO9PZTuW$sr`6(P2~M_G>&$ds=rblOWHAHQ4LDFFd4 zgiK1^qz-m-B#iIIDiTfVSMbbMV^OKw{i9n0&7VU(_RFV0crO8*FQz*NBIZNjZj@P$ z85S^Pl0lqr3u9WXJw1LB5R;0>e*^oi>*D|3miv0Mvbtfo)e2h~p4AV);swzj#{W^E zZ=6vxG3JLe-P{1`GP|xpO#t!?Wqr_}x$$5K(W_>(fAhZ08D!{g#9cO9lU@`75|=6# zm#Ex*URsoJ1zTP~pk_Qoap?8Zk%#mm9IfoQ&Vaz2k!R)z5!WS+mr_hiC@2EIS)>B$ z_tizALIN0Vy-MX!@LO7^LO!WXG&q<`ojNicz-JrKu(RiU7;fwuT!d9OK+0?VkWT;I zFSXq;&Dv)t7eNxh+D8$7`_*RjBA;9Mlb&p+Zi+8-8GG+n!QD$OgCgl57WVy^D-;W1 zr1El7P~0V*KUa({dEUt-Jye|8JHqrzcuJsD-m?dw6LC%pJ6KGz#;Ver<<&7;6qQp0 z@(63%f_mNOqDlj~)(ypFAd=u6q)0-1D3JYTsd}gDt{>J)7z#WPQtxlbzM=MAtGddG z0Up%H$uKJ95sBnqh(DoP;@9?b&R8Uz1i{uj@t?zKtpFEm4kHGXDneVq=4t0=SfyCDq80GM_#1^OS>;=w!U!Cf=-_E!uy3SE-*BCaGCx0H6a9Lr6a9aS;Qgg@G6;@w60)yt|yvc&PSe%weBM}M6 z#FRB8wM?1)z8T#&_Dr77u>IN?-D&ry%eQWnlPy@;tr3h;X$7b4vLLYO)cBxNo^uh5 zaiGVRCA%gti!HP1VH9xg_@HvQuDJ>j-k;{NDq;;^h$kr4mA>R)oA3x`m1S}r`J|5u z^58n{E`=G3yp>7^QG|5q+`4V~5`WMf$_S0Tz-2W*e9I3c>Qz^LrcT@oylu8#&AXxY 
zRk#H%u55xWSTy$kg3^(7{eL85ph%qy8XXzAwDJG{h`bY?JK{1yfZ{v*3_pc4=mPx6 zGZ#r)s7-XN8FV|k0L&%0G3I!2XN&j=_VN>x>vRsQAu$V+nCD8>-lQVN4`wPC!+IAX zEiqnk6HJK_%g;v5<5O6{O2}w0TH@0h1`S#4CX|wMcJPLWBzkHP5yE0T&GWP}p!Tf5zIy`P

Qf z#FOBD`qYd!;@22SgM3)+3Js=lBEKV`_>X&~O{Pady&nfWY~jvyMEyN;9{kO}=O*Dk ziwdIX1QWzE)@6nnCgv%;vMLxwH>1E@LaVsK+#!gXhDaG zQMkA5ZKv9l6-z0W;rvLmczC@?#ti^dbM!ksD;KmZf#z8WPkP;J$>j@=kl3Gto=p>1 zhDds&aEx$t6F7PHL+!GwXLTgg2jTZTFW9U6Wi#Fq}Ix=B$5^<0#<9+E6VU za}0FukWF=3I8eA`5?L=P3X%91F;|0TN#@JwNs2J&eT~gHI-20JkJMpA{7?y+LRII# z;gO!V$Diuj*5!^yZeX@-=EoVFmK4wjd-;IARD&muw`B1y_CcAP2Vd**c4?;NT4^Q^ zj>hAE)GkYdg&FT$oxzZ$&4#AlYXW^gQB#${R|%e@bZpa^iZ{Cvx+*aLZFw11;jRgo zQW??vKu`zM%YA%&Gn`Nc?=H9;@Or56WO{nd=1G#SFWw3VpPhtyAp`GShFq=R)|EK< zQm4TS;62+>u{h;>pU8OgerbBH{VuVNTbhRpj8i*tsoN(obdesVyAn9?>;u%$K_FTf zc&CV|u%TAa)BbA(*L24Y{Em7k48jRA#oauo+ix z$OYEe`yWBSYQgAdq8>`r9S|nGPFvx3PqRCDHigerD=EK_$trTi^^cc*dwVsA^e0FX z;-Hy;tw%chAkguRcg1vL2;f9k4i|>mRtsHnrBk@8h zajt=aDR1OjnAtJ*Sy~t`eL$8Loh6-+8`LX>2_w#3DI8)a3MzrXG`Yqm<(dIIDyEiw zr<*WxzQ&$ z;=d3)))_1KA7!CC**JzEX&c=R4rt!7EYmq5UmmvowLz}E*us89j$U*HcLK3o&0g5) z?`%(c*OK#^avce1)W-=hw~jLkND^QI-3u?s@Ri7 zsQC&7dB-Mi1gO)}+@Z>@|Ay@%6w!Rc<;lb}{IxgK2mc9#@f7@L{inGk9@}`h@mYi| zab)pSDfz3`PxWdD3pl0zESk0g{0|aOSS5{Ufuk>1ngnytUoc>5u!GUabQ8e?TAdMf zEP;DpyJ!05hFuO7pg%%NvQ1(FpXKi7b6j*^%4gt^Qg@3BnpEXx!FG>y|B@>Z7GOrX z$@Ou>3d8D~pab(q_?zD_uM}kCJr@*kc)xn`LD{%6@JI9cyLb!l!0`XSSkayW40#+y z>m5eqQZAg>`~*&xlo3A!kYT1t>M10GT?6<&!nXYM|Hdq|Cz8~*FaBQDV3LfO5Hc1( zWyIg4E5cU=I#);k*XzZ0p%WQRhlf@ZoJAy1vQ=+)4H@}ABq)@uy*!tGizry;?tVIu zv}3Jui4|2e2Aq0D`<&0@u>oo7`eI`v7rQT^{C}u`#8?qeu*8byH64)v$JAr}Ciq;* z2-7R9t}=j+ZhcCeqlc6!l%nS234um%k9D@kQN;Q}a%o$}W|uux`XgaVfGw-^C0Qh> zRoL|ol1uu`!I*#G7FrMG-%r~l&bzYH3hqSI1~m7ePFykRj+>>iw z*b7`P&x&)`4bjxac~AqsVRqt`StTC|-qm;<9qjA}6>-rMocl0O1Kl)E8dIMKR8<%CtWPo;%}1&0wc!fa zHaC2mC+#qxq|g>_GM+?@MfhAxmN|XzvC^I57rcGXA)YLx`d?$zhYhaKeC%`Yg;~!y zCiT9#{u!XmN$UC#{k>a>+WV2H;gNFWtHmibVCD~qpTaJRZKfpA7W^=XZvc;l-9g^e zr$6@ku>pHgdwEF8k+`A)^+`5+0sLK@UUjG4D4EE_Vq8kOydJ8^ka|T+uTsRC{(Uf! 
zKnlAmJ*psKQ<^&H{=k^Av6v~rPaqsvP1W77bl+SO=|xlsREn~hU#MtWwIokpcA)qH ze8Se0ErDx3H8EQ-)*N#gKC)Ihh%s1_^xOeDg5KHfAGc z@#rezq-$Cv&a?)nl*f26lD|Pgu%b*XQkR#Vso~FOZx@+Xt->FgeNV&f%fZpgTen(s zZ3oS!gr{e4+5xFTkeyN3uG^ZAvo~AeF z2BXkXZOnlTTZ$(ZiyR0$%VQKRU(%PSS4>d9l6lBvkYhHGhpXN$rsGg^bkRwdPiHQ9 zb@qTAw)bL$yjwedGRXioV9OG>*@znr=dbkOYbdh(xL?D6Gn1=r7=)wEi$}^oD@P?0 z;iTlFwy)HI(wHuS2`rUyNK2;5)}q=$4xkrwaC@@Z*^Tf(g1m^K=^5d=&)gp%U3XsP z=M_(SVcMz^ZF6zF)Vy_i9>s|mnuQ=cV4A)bCD_CUZAtCOxS5@a8%_6QOfG_27 z57@RO{7?||aHAuGX}PDlHX1D69Nx&)Yq+=+zuqHgIN`ec^8~$F>>O+Lg7fYdP-D6|+4=-L zC=Wwq3HAjeq&!&Ts9%ovBu6)f6Czjcyk2imBUux>AVZPM*J+X2xqRQP5}24b_uSP9 z*>Q@yiGm0WW1aDf-Qu6zd7(!k`~WwzZjZ{$c)U2`;6gtRA6@!hsOtQc~tTsqcq zI-rSl3@LclTz%P!X0gB2N*y_SAr$j(O(?Ce)!7#-WG8}OayGt1FS=&}Bq_3Ci{UBL zU#%(6=$*;xua!U5{b$PR7=e>T{JqNrEbKy#8 ztb5KI`p=LFdDjxYY43b6Khw~xO0yT712drU4bJ)@6b{^IM1nO5`dWOOXOL&zU=Trg zSunlONf^<533=U88?aTw%-8g7A#+Nh32>k-mPZ^d7)cO`P4`*)C zZRr#^I$IvSb2EaL4rS^HFz=c*M7u><_976Nt+-N~_1PoM>55gHFh%V) znO&0d`l7e&ffl7rSLAF`1oZU{ico)Ghu0_#3!|lR`PiIfq;1=pU=b1dGWWB^~2{AXU&|xEP*QHWXs4e8kIs&w@(Ro}RHz+=>l?_aL0ZiCPo-k5vWMP&@W>R)5W z!Hhrz;!DD!zovp(el%9);?13E6k4KY9ttDbc~r03wAsw2?*Y!L~3fl40UCIOS>^i%jy9Nq}Y+c9wy| z@3xaY0~MTw5{>b@w#ot7Br)Y_8?*_lqBga=e)dT0@cgmrXgI~mm7|1%m7Sp$Nklw+ z_amONlIk62$vzgC>xxkK7RMYBRTo7Y%Tl2?1&uz$7nITtId>zRG)|q{<#!uW0UU zw?*B~#0d-2>WR#G`w1s$XbKFX?j64E#QNV0Lm9dCd~fEf1$9YyS#s*c8IxASUCDNi zvkd*PL~g`fE#_h^%PrPV4FZM_xhxqv6D!Gx7Zta-!!2^IPM^pS6Tg4K{S6SX9J4iT z$uVdyIUtbuR-|YOGsx7-NOEFZp(RaeohFto=nIpa9|0jEy@auMKm<8v8nVqVZ;@&j zaCl>Ku1W433vZ$Qk{V(2-a3GKl5`5+m2@30KHwS05m7qZ!@RKB70)nnR+8J#kRwxH zn|O<*6c+yzDA}8a9auB`j8v}=6gfctvrU8?Hh}{^U_;e66CTGj*3Hi8*a<`2QnzjT zN;(829nZw1XIHFK;G}UU)J5Hr(VlE0iK>h9eWEH{ti#+jpeo!M zQho-F-JJvLf`JM)Ln{{W8pp1dUb{9mWF-UP1R4qFV{iFHb9{(Xgnwb>6_ z@@ll29WO_EJvGrEFlR%bX}1kLlMLQ478@^3m`8=Y=yNP;|6e1B7aIg%r1G@*r$R_pBtpPr2NSi}nX5ca#n5 z*5-35gZPcSo)q&zfw{ZZ0{;l(s5ypkuU}UKc>i?oCmG*8=}3IBR#!TmX5ub579NwroZL^5Qvk$;|VUx*e?uWMMJg2WdmCy|F2zG)Ro9il$9W_A9*JKD-{{UyKV1ya$!0eeXj{n0Hi5k 
zW{(dWbVWbbk1Gm@8UCe{J^T2DcnOaJTTFLo_h196hiAvd^GQ*v9E;S?a(dP|H0cQc zSABZ0wS<4|b?E@#)2#gTpKH!tD}&qK&Eem*rV9kTXG|c828m(gC1w)`Qe%+tv}|IM zx{r6d8${goHmSGK&eYS-R9}*dqW0`+G#81kd3L@4d1uVPRUtjcM=DCg-Xi-;DoCYF zCUdb0Bf6mRO=8Gel!yL{FF6d^Lh4ufkrY4u9`6Nx+V;`qRt;{W->jdgEt6~b*FIm)2+f4JrIa@$O z=|iH*^IdQvFe2;leoZMYA)<2nr>Y(okw+NCHs?D}FK;y=EaAA*5W8vDA=L&cY~$tX z{yDdKe1Gq=nuPUN7^~YP{k4Pa0aN)mT9pI}E}rX{;QeAT-`(MR?YGEI&hnm6l73PL z5G~84jC4H-dSjx$m6%5Irn&oO`|RKI8^O`BgqT0x=pG4rzn9%}5h1%~{7U^&Gj_)J zR@U%p1$Caq)VWwECQmV=({hR5U0UFtj4(?`&kyN&^C;y9-Q;bO3m(8-TkMH90M`UP zn73Q!e%gAIAE#$=#kMSS394-PkoFe0LXu7>wiCa8e4O=}U$3g0`qKf=IXjO3jFi*3 z7$&DnT(5gCRgzQQR7#@oA_7UjjD)5ed1(2<`5_v%>;MVB_G1KSy&yCZ%W=nyD z{l069CLXS8fh*PDC@F&W%?1LKS8fVVbUiluvzl8i2%%C>*X5LW42IKIAfg`dfj_e=Id+TxOzgksY@oxE^gr1W|KbhrrU1_?7R9ZkIIkuqTMO zRu%BlPiEAYOFG@RJ^kH%-%$wQU}a%4k&d_+A{pG#Hfzy|{jV~M=xyA@%gxvJTpsF( zoOu!Lg8$_^mreAK41c(}>V+*sW2Re(PKAhQ94J7ok@?~p7 zk|ojYf28v?w6(9?$J?WA=r=PT#ecz<;K+e3BK$xd4|*yX;oHrA`x-?JP0`Hz4B9oy z^XYq0rj6`V(7K1g7O zMuHBst#r2bn(D`|k}?MFSwr{OW0IWB*5zIYuFdpZYC>*M(tKyX#}ZTi$;wCfPg2og zx;bs0@Pg56Ut__J)FX?5j=uZB+jIRp)dusK6icwIld{2xJJTL1rYyPwBIa#mR(+oc z&^`MT*d77RL*@0@XH{gSo`87Q+@ajGci+?}B@b2Ewb#8&`9g*H$JR zJZFIsHB*9?aS( zS3f;Ch#Wlm{a^GF@MEKy9*OJwIc8F(jydbRjV_drrAo5AJqwQLVvB!l9%AAr;eJL#sy^HV)()`_c8eWLymCK_ z*LIu0%?I7>B(w@@=i&`YTol;X@!dq#Q}?SS=zWiH3T;2xAR{R^{fEW5JN8h^kVSst zYah3<8iuK$Rb;^MtE{GSUH9RTB32_EZQfv=#`J)1D@wv7YOw1AJ2pytgK=_;07Zv7M+Lv;ck;(+ng?C)%ceG-0roI_5 zT1%00z?6;a^gW`3=uN4K;p>=tL6UM5D>F@pV`d38!G>JR#SO8bMU~ z+)wg9T-X5aY#kBLcK8ngR>xJnVD>8WfMc~#dn~;_hZOit_8LDcnGjJX=9@G#FnS;& zJrzS#pedT+^q^tkNb{IE+r6AIpgxHHI~P)gUOUWVlEQ+U!)(; zshU4FhG}*RRJz(@-JSJrG^8YSBL71$*)2*{S*|FX=MH~qRkndS5}Vdb+9-mO|HQ8C z{=|QrqMRP`Eg9rG&eRv0j{5FnR*Soyafm0Qr{Dl(aB z3FW`ArNLGm9(dC6r2>{1xUj|NTSfPssyhs`}Xm{I_($z%{c z2+4?&Gv|NlOVQql4VqEqRMJz625%fw&XQ<;j%wOE&;wFKjb*Xhgb)y%CUa@2!n?p^ z{VTrJWL;KQe8XV!02e>naIc?B#lO7OWDzb(B3pWms{aipK^pn_sVWqhI}I z!hIky&slG~fL?k8SJ?n4E}ex=K4)=9hYUD3#*EgTo9vf|HUxLs!r6poSNqWmi+~^{ 
z08^-&uSA$*!7?gxqo8-S8G95n>Q>3eQ-S^ft#VDlOuUt8R&=B?sPBGUAaJjWHV37_ zd!H!QCsy~XKVi3(M4YaZ2>wV+Jag^6v~Q*cFKdjgZok!?;B3vr8mZqez?|9Z#IUno zUox*XZ8yjfAVVe0gb(atxb_`IffMP`&G0EJOdDin_XG=9nZZ=wejuRN{-tMaqu52QYw>XGLE|u#k^PCZW-=R?>ur;;^KUWC)KRt7mrIUKV*hgc@nf9| zZ;0bw>-m}egKu@>DpydV)f2cT9+07q=BU_JBj5{;KA&lJz9Z9)+LSh)4TH@^#^29E zE#bvi5|phqdN6@${V709s06dOd&ZrkEj9%Z&I3SsFQwIIO`=|u zW_=X-@o7&NC)NawR3#-3!tH7YI`N#}ueOm7hv%BIl;Vh;k8bZ=mra}yqHZ+T5ZEBp z*`uhHCWU$NP_>juX_|&Eek|blI0_<6F1hVm( z-NeBlW!ap{As9z6Y;8;l3+K;HRPXG(A`#0+^Ih>rcwOw`p@t+I4zN~55td&6%r8?H@#{p$t z3=)og8BNs)C-Y%!*hv00`QqZ-tQXaOKsB%`eALDWo0pzcgi2JTHiqHz3`aRe>~ttW z{MH45pp3DGKpx#Twra|s=k7L;?St4KFI#HTV zkc=!z=i7JD!!<#9@Htn4Ts!+CI^Hm%8i8*$+1xa7^9tlKY-`)_srl-DqXYof+@7X3 z&f~>OW)1k_0766hd7<38nyU$MsonI&Od~;5-{rc+(d{eHUI($hHvv(@H;k`#(k>vI zgWr(GqwU=2Ba>+pf?%l7MvuB>2oT8g>vgD^#oZy9HFPyUId$*EtlP(HVaqJKj`}Aa zmF+`2Dc8D3wzi`&g{fzHj)ZEQ<7?gA`WiFV{;?z93dTkoLO~pB-T0j%7lp6epN0Bm z6M7~l)>RLt3fU3Dn4?Kr>RkUcmBYR%PX!6OOKnE|ge#HP7Qt2Lfy>J%<&{i8&d&nN zQGXy!k=Xj+$z%`EC@icLl_xq^WZ~ALOl(noLlTDwsj0q9rL}E7V|17?i3GCVtscB7 zRfh*GtY9+=^8y>n|zhYr0K7+_+17;hFJFW^u-j1oBfZ=0h6PI`2AVTW)c9K|cvMt03W+4^2SnoKlB zHQh$mDw-koslD7P@Bay>vl5Tp-%P)pu+c=KykUE4_whwQ+@7Px;p>d(GKpll1OVCn z37QU>q$R6aC>`O=+o&+}w~w2g#7{>_mB0aOYjxjfO-wcNXg<8~-%6NWF5dR%=)}gF z9KiE~G7183xs-Ol*yKih&!+HHlw#nyujyfFyzWDy6tf$6@4NY6nV;-f{ z0BltJYRP=j^g1m%QCtOKQm2gc+cisJyW$JZz58>eG{~WOM8^h?gJ?C&{o`+pj3kA> z-2YWl0Y->fK<`0F8>Lvqetweitl4_3|lZ| zYbdB;G;S3_VsnMNHCYPOSP%USbel{4UbRbe+;r?{ov9P={T;p#_ZkMA%0@5C>qgVE zf-bjPlE>QF9a;Q2Aa7jx#jVqY`Vsuxq%AgFmZh8;$la=L>;_z53w%r_D@woK4V5|b z@Ulcsvij@NY1T>w*?m*qz03!ZjbPb%#_J=g2$Y3++?obz?TJPjqI?rkt-jkdeh&?n z8A+e?@O=SI3NN8*e;#;LwJI_@8(8>QPC^(dPxk90?9n96RL^V?4SJE?UE^A*9LV{T zU|^dW#5cxSVM~Eq706M_RW$~U9wd&XZ7vvT!g49Oi;tMVkj_~m$xzx+TYK{MTEt9| zr#d$*m&DI-M@ghwm}LJhQs$E(y`#%EZ~l$RzsHw=&6`tE=DO?aEPkr53(>Q1SSI%J zZ$C+*H3MXS^U6mz(7ee$!C$m0x7?xK@-vx8b020D-XOwRz?(6`HSjaP<>JN9agcVn z!>p{DGT_SppTbHie>$sI7r<%AsN>UVS#g>6fL3=yrI_-{!$g8yA^Us-Bk9a;a}bUF 
zR)>0;+9}Q>XQ;4TwqoM9U$%;lTAiioQ#`h(&q+;Dg+kw;=f)Lad(0>tZT1IZECy@q zq|p2(o&wF{DDWs5fE)EFR?8@a2V44b=@L6{n~fV*_?Qp6@^$-X%jNNp>JyeZQ1Oh=jV^k$XbN=>ov*WdH*Q1PP1idBM!FW|>C~hluI%YJb>nogb|9Tq=m< z1&wc`iOi<(3i7CCH$$rA64E$}Dq87Yp>Z+`z|vl}^Z( z0zkTGHzoaMWEZpqdEJuIg5g}{eLFGN#_30YJ(hLHdN4fSa$kO$PMMFW%P`d_s8O1U z1IUluxy7Gp*JyHJ_QB3BJVucc&D=b6wJ|ufvD4>rsTkF(t&cw z@3xub>tic!ry_&%_qS7~(YtjY^aJ9Ttr{7Nm{*p0>oVnk?E&PCCVHxOG3kBkgoD^9YYZ|9PgpnY1yCE!G1xk21_Z)f>FyZL5YUs>DK5JNNu`K z?Vo!|$qX_4E)ud^u;HMBk`mhDb!zZNoem?`0pCrhlKwE5UZ`Syk6n(P0bt<^Hq}#D zO&ka#WxI2K0MpRjJO{q9Bs{JJfu7uO!OxX#c_oiT(rpfdOyMTrzCvnXfle|Wl{$_j z-cDo>*|>?H9p52UukL&M8-yrH%lq~bgXb(Kjg>;=EK&~HFd60>6u%dSWatBInNKO@cAB4P;Tk`7pT5vK`C9cM zQ=gXz{LN8^!PxO0itMhOoUHeVcqvaXQ5rn&3os;z#oURrf!se1C3dK(IGx7hsK1Xl zi_L&TGe1o z)cME%S&pzI1wP3DML@d0@H``9e&mgJ5 zKI|+nZQ$ztc}N}{VmU_DS_6a5PKD94s@-eMrZ|LTZ7XXwC4r!8UKPVVNPOmUJ$(r6 z3h0P7Z5ZdkjaS#Lgyvu-nD@H^3-;CLzLNz5R#suKVEPq?>befXkN?H!T*tC!s&qBS z{lfJ;9`GsE7Y^i`siox&#bkkN(P(G`wYT;ZktBxZ8?K#{$Ls~jLbwFCDW?VXzFa9k z+VXS?acY);iRIRt%v(lorp%Dx-|S<11`;O;+EZ#aOU9i`<@EfMz@?W$LUEh=Gs}4$ zxr+K^4kkzo}_~4M!mtz-Iv>M{yt`G%@Oh*A(75!74 z0^=S5gnO`(x=b`W^tVZZo2HQluus)0Amoggep3!oK#KEdVHXE8cr!<;e0|T z<)%5Y0deA8%mluo)NkPO#*F$EZ? z!p4tU#6dFcR)C1hYf|r-g-W-a{M%o)bd3*e*VG6jViOU)Y6R6Huw8&~=309>F(=>x z6oUVrcF>zyy1P1?47=s}6&sXo^AsBZv(4*Izeseaov}TocpN)4S}s|iauRia?61JK zjtGEabG3wtqyRX)Y>h^tP2wn*YmsoyGqEg}N6u$EF>!as0^nntvQdi_KTC-*QN3fgS zbk+0wn0>h`^};>05Vpi5f0rCVSBHmve#3<`FFz_s1u_O+ab|z&g^t_Tz+Z^S}GlXp51OexGdW-f*8HRsM?k1Ir|F_DH07`&wyql> zv8_yqXHl4+xYuqdmlv!?uffI&r9wP)&2r`t^oTBug5{-7Y`JVpJ)qipVV#CmiZyOI zts$~L&1@YR_k*f6%)1onGW#{Y8p1UIpmoT94r`japK(*3OFfB@_JmAD&Eg55>#&VT z)Bo@PcfAzOTfx)Pb)F#LmE0ONW~h19JQm4`sNLhy9z~Y-Tq9pAz_+{cd=QmVA@^LH z>Gbhw&f-uNw9$3Dr;rhzPVzFY`QPnpUOE>OH70chQRoB2LzIZDSNde7XtbS>E6hD8@g95`v)G&46_4od;5l;=f? 
zz0nw)PeLHNy7iyV9mIC4> zfz&G>A_ujAH=^=?;XfP9~0S?D4Ps>KAg7d;1 znF|sr6tB1|pj<}cdu!`C#^%6$`(bP-IH%oc`?xaKnF8|>pk~r0OY1hCr#~W=)@aEZ zeNRq1qk>_0&3c-l)0Rd7pRw?9<w51`Ktg|T;d*@YSm8cakFpo07 zRQg?gUhIdm#q*1%aNARf{+&I*HP#J?fY}0@`?##Ek6m5&I3P$z_1$q*6us?n^bi@3 z0a|+g{YM6i-eUL7i!bkE!xwRiz2TY|cZPh(;F2h;PPY~=wEX{VoW@&zddXYV?lDSP z^jJX2r=~WJ0rn|g09UAhnY1&+f={g$AJc*`YEzd0T`aV{Ar5n$2Y=sSpMb~w8B7iJ z5Y*vvEGYWeUwv|e`48Ofwfe_`Vgx%J(pZFib(z1glwk%8UKQKa0I^ z-sj9F1Bd(FKn|!vqs>_EX2E*y`?Gbs<$1q>r8{#9$U?N>p>hwLP?sP7RXal+4~T+) zRNezok6EumGi&vAQKwlv+5$XkPTgQzK02q2ienV8~(F-dcG?3A!&h0#7*e>@>RY;pYQJ3YSrZFOQz3EZg8Bd(| zp;3223ewyJM}9+puSX)Pvq|RP!#$j`phb<%@mgnZSw~F9EJ;)=p~*!NLvIk9XyoCe z-Ubg2-BoZmb}_)f{8`+7jto`SGw&v_30ISLsaAGXC)-Ei7DXNi%IjTK^VD92$!wC~ z#hlX#A)f(ELWqP#czHGWo~R)$3r~;F0Rhs;EH?J8AWC7U&LIPZ4`&>Os>nbOh{iHN zzdPu~g=ygZ7AaF3v)lWV~=*(eS+t0Zx?3J4Y?A#I(WmfomA1d z{ACSW5{EIaUTOz4dEx@8&yfmmG%tFAu@eujMLps*^T##K0HyNN6^aF@X=vJ%f$yCk z)G6n&1gs>Yvo=@3isb6sbZ%IBB<8Sy+AUlC3h8L1y&I{EI2gT8PlFu^7T`{)d&u{) zUN30^^vSX5R@=T?Dha$nhHrQLWgJU}ldNQ2-fP_DZg7vmgp*3-%b(XE?VT;VQ$%2y zPZO(>Sm*j9MgI2Vt>8qlpPJ3D8lRf)H-V3@)H1J!Mw;CK#R45b5!r_+EKBcABpr?e zSOtd=Z2#(%{O%Ms;p(`*9+3vd`R^3eJS;sl^sKJA!u#?kz?pKyg67Md(bHRDLI z$YQGDYBQcLU5b~ATw^@NV7tG@ri1NHHc|eq7}a0ygG26-jcGQ!N<*hrcM%Az&WBz* zNZi?QdtWT}{^k|EEPANw?@D%;{y5>Qt%Vc+5QM~@hd9Egs(zjv^o(;Dmd4e%{LWpG zo{6N)c_)ik)Ya9Vs z(Pv`8cg5~gvrq$L1p*P%)YPpLfV2)5csFBfKTY5S!*HMhu&lJ3OvZ>aNV! 
zCIcLQIU-I@9J_9cO_J+ET(k$wYgAm0o%&8n*N(^$kr?l_se)ML*@(Vaju2jQam6l< zXG77DdFrz+Juso%Y%&WaOW)D{7Y^*qa8F|k;LW1H+QH`@WU-Z|LF)G&Mz!NDf;#MK zc$xb%{HN)f0%-o|=gQU;$@QVg#`F*~Pls$MNaQtBmAk8CfMvVbD zMpUdXgMQYiqS;ul-7CRxYjAF~GU+3^;IQy$tr<_@=;FFYEWt5X+oo8FewK zl7yGXWVWaenG&lKGuxCPN(3GyAlQ~_!iG=EB$8pTN{ACGB|>Z~k`Ai9Z#Ghpju|kh zIn|qGGa1OIVs0_JB$ZXhx z@!ar(dpN~zM!k3hSe-cf=soc>sJyE6HjdQkrDJrXkSWL{Cb3Pc$aE8u2|O>x8N&!Yhs-(Avq~R0)UVcjH;@ENz1ZPy(@evfV_yYd-ya=(CVNL;DD)#hOky~$IS?^eA@7mV zZK~7ph0S0P(RL5%TWuBpQO_-b#>-6a7J>6Xj9iZ$?21tABNu^#0~w3>S=$)d&N}gX{_@l&m_{_prw7j|*}{N1U!JZ~+5(5^D$S zu4GC+WgTbe)5nsxz_D>Y9%HV#KuE~$R<}2LZ=0<7TAR#u>$@T|(d_hIA2My{{_L)@ zYZn^y)2aYLZ$WVnmY~`)*-A)keo<4WAScHvNOo#F7vE-drV^=p^SNRdq)MgT%@DTy}Jz9vxvUo|8_LZml&|1HS0wbuea(#mJ3?%p1W{Etg`Iu)kBq9qicc~B!Dj&3*3U*_TP(_Md`u- z!Ww94tC~^)T9PqC@rhp;zTh&}eI`|F%a(6kCPfLLQEf}1XR2k~?op{{X!;Gh>w=L= z3(Hu!{c~n{d&o(KP}Eya&+woKkmL;lA7+FoB}_$c`Sz)XuUkU?fQ5daFuyMc$6+t8 z0m8`gb<8xD>S>~T3JT89|FUqQH@qX%ag$uwazcGIJ+8xr;NGDoYO#RDbBaO_d$*&a zxHdA^*AqQEpP8%o-U2N#%*DEc3K$$44d3@Z8`tHeS1ED>o1?w(rlVB-EI9&iIf$Va z_Gm2LY(CqSe)DnMSJ7qojrJC?c(G0U`Lu-y>kS} zccZPSr07o1+J6=kzc6Ql8tw?O@; zSS75A#36-l2D#NIJ%neJBV^NuO}nXm@Ipi=OBwZBMm8^g;0*n418Qrdun7w9RW8c! 
z<(ub{u`N^EP(D0`W&SN;@INh+NQQ`)yi7xg(w#3LhYj?u?!GNqbvxQrDzFbS`s(NS zMwxDb%Q8ihr0J+P`h%xG9V^Waczi?d#R2_G+X3D8#W%l~WpprHYp;-*t~V2F_j1g< z_7|E7k=)14?G^inl-zDH9M)PEr4?tia(J1vrE7mG`o1qu^M&gj#9X?D+Oae= zky4EvF`9yMdZpLha|hg$wq9`h1U!;d|9X1VX2ZY0U{WHT0T>OT zW5#mPPwmf904&36tT=zXhcrG% zos{SO_>Y23IK}-o#W)IpYj<4;nO}Wvw4LJ`_MoeT?-{-$r1l>=Jw|p8z>zu3^!*LH z3e07mElOlSV)?COPs>+}T4s17L@GoMv`c*?wnN(}V)PfE%WsFjeA{XfjNLTHiE$QMe)?=OXCDj#-Zf~q#8na;L=U@!_Wsrhi^eWS zHlhM?jv4OYWc;b(Mb}^G)%0Ki7J16N>2G`#LvZm5=M;&9R1e)FJ?{7vOhfVcbm6MJa%En*(a zK8^=hP;0MG+92=sW9=Gcb9F0TMgWf&Q(G-3g!t_p!ibIZqXvqA!6(x8gOZktbjblYTH%gfSzOfq1aWN#K($%$$yCt6eqUwcjw-2B%7?|T^tWW~S zRin_e{AhiFS%|Ty@+kMYui&T&Z$y@tO{4$Xab5V32>wd&;px~y7MK%FGnw-q6@})R zc?`KTh$?cSKB7Y$khg@V?QJS>Nt})zo*sxPp4L>)_L0SzH-v5YPly`DCHQfk;d_-$ z6z*1m3B*tGh;P1%ewxt{*u)>qH@u0Hdds~yMqpN!FiB_KIw)6#8>*n?*C{7W%Acct z^Prmq;6E3vgi+m;@e9vuwMlhYk&I&9P&L^W14tNnMNbQqDGiR$Od z*xcfk9@DM-oQ=0BoV4bE7V{Jy+jzo=P#6So z*k9b;L4FlcWUg<9Pc^F*`Y2tdHdqqeOi|^{ZnSre`~3T=y<1v=O7&Sj;N$F-wjAX~ zaJa1o8)9-JP#PBk=LH2w4cUik*jTBashB~CzJtz7su{V;{X%}firO*S4RG?PE2V^Xfd9>lV~TT zrE-n`uJ)a1Ab&btJAIq8nB43vuW>{PWz3rOlC{7NpVDEpxh{KBXs5eMg2|05o^~FL zBKxxG9b)`}Dk3o`BHyvA0vwzdSv>we*tb-`w6#bQ`bJ@bv(BCo5oJsoy-c$eeST2qeS2f)FkG$4s#6UF`D5BUNSykAE`08)8a77%>VwMdfE7~(;RQ$t;enu(6Z$8^W*7ulz!ZEx9|Ct zCEI~6hYY8)78Ff_XHtJ|(bJkn2F2q+bqyIbeV|li^qfIff%esco)}>PLfu9*xHe!k zhWRr;Gwjx#<(h6PvxFcpk>$EQ4SuLnz1WsFADf_5Ng#XxM}BS3-j8Si`}6?Jlc71+v9jHMPn9y5IiyINH;< z^MQI}NRaaCkU78|Z#cls2^vUk@z3hJ0wY-cDke{;3Tl+$^d!~L|D(33l+wfwob3V# zRDu|Ka^$V9W!RTColwi8*D!Ot>zONymnW(K$oA3Ml{G}AXKDq;h%`uQyQx@8q6j(N z0-74Oqqb7~5h0G3>*!#uoEiad6MIgU*07P9?8EkwgQo?V3e&0bU8C*Jh0rEKELz~A zV6mH)&DO4=7il`m+%Hv6F8{PeK2PTaj6UrGJZLl~em^2hp4A;^e5KJ%%3uinkKYQ- z4&wThxE2@Kw;M!z*e^Eh*zMwv4M{TJlR;#XKyrORL3KM88YJ4%qIU6|jut)wlWonkbzg=A!ol;@7K!aVQ=8jYwQ<)2ZQoBBpW*4SY2SD^ko4r4ld%n z*qJL2T6}N6xUU5S`QUH4juAIzmL@lk0v<}uY^a?CzXXCD8SQ-3LO`O9Nsm=^%@o*- zcrh5zsH!H`q?LMkR4ZLmco}KA(a>FVK6{Dl)fss^1fC9UwL zn{QFI;pi%wloJrUwW_s#ZqPo?&fPlU`aDfa3lX8@_ORRb!fAHNi$%<8;sJB1ZIzEw 
zbWR55%03U=QpiYpv5bq1pX!m0^+hs7aiXm4yCTe}387y3upB%!>;yF~t!Rs2$;wC? zP+m2%=y9;~A6r2v<{iQ&2hg9yg2yQS#=zBh1){W` zFM?8`Sd4XG6s#qm$|WIG zLqn9!(V;pTnA)aKq^4O7+BLijZ5grgGj7n3CKo+&2zo2|?+dgMHc{5c%m*EL7zX+C z*QFu2rE_H2NJH!gv_2py7WED6CpPoVD+DCp@VslfoaS1DXyIeopJ7|ps}_@YVSgQ+ zuy)88nDfYcy(02JZBB06i0st*4gK4VIdb`lVT!_k{>9%cc9N2izm~`-m?U_L zN^KQ&(*bnSfZy=_;E<)D1s8q<5{AF;2=gu%Z}!%BYmL76{-e{EojTKw>+^CR$%se# zm=atmR8|oVM6oQm0n6Ens+buujve>03xD$W+;G><>)yXvlxf#-Jlishpttp3apAz) zFRJ{vpY#GwsGs|mxsJe^KI{NR5$0XLXNa#gKtehAt+gbZ?gSk7knV}prF zgv*c%9JmF>V4YcgS*aQJrT)rO`37w4(>ujS1&Hu!0dJ z&}vCTXaH}6fkaCCyREW*H^F?*Ch^oxrJ?Jn$o1$Pe#13ZK3g;^l5_?kDcq=TBG=G} zb4vbUDc%Nq1G6ZL6DRGMs~MZGNI(au`~BQC3Y+}J{f&-X_In~!#&C9sHiQplr(IV1 z!z;JmuRRbU0q+&f$X!KTAvPd7Iln6n(a+K&Viq3nt0!lUUJYbc6QJp$=y(~2h=~U{ zY1GX(%Gj(8gd5>O()@8gmvM+t#NAy9$>Fz}*!mNkeySF1TS#aj(d;`wZ;>a%f7U;@ zQV!%~66>OVg`?6+kmu?Ti++`G4%rOZ7DbhrGvagLtp`lE=&IykgQ;~b|BaRylx4=W ze)x?=Uq*VJZyYLYhPUu$zlrch(;nME%2cwszuH#aAIC<39BvUw3p+^owA1_3&|t2V z25J1@_p`|~3Q)*vQCXFOi$GrPFieg)HL_o`%DGA*$uGo{( zcw)iGjSBH9AF{-_)g>`VUCjJX?fWTlIHJF<=9SIuI2{u`8&Y zOqDIN|9s$$ge(*L4HJXL)zLB(^j#mDA4gAJ#_(!TMb3dE0^+9h1r@X8A#q;C)-qUgH0tk2i)KvfJ7I(?Wg3=O+jwZ#)Xr1`yG*EG)Ip2|H_< zDIJ{fXnbE*az8&j_Vq;Y{fglQdH?f%r(e!_(1jzC0$quLzX{5`v!GK0{x7%G&u}tn%89bPk;t^NluAfsk0^MV8?Q0L%YtHtQ#_xl z@*9Lte{wTSndRi9Vl%!LcMf)x_By1t*42SO4_C*|RQl=6*3@9KKo=m@G2k>+HK03`M3K*` zKa<&Xh+P};Yv-ZI_CpX!vi1;wD6#Rpo!DVXKhc>+3@n@2F-Q=ph+H*1uA#iQMM1Qj6=>~k_`d2A$`8JQ{wNN7sauiyW%LHjJia583Z}8 z;7JO;4PS$Ugz3@|O^%D_g0mbRz}Bm8f4XF=tLH-baDTRAU&rTJNWs(dp#7i3%x#Ify^Zvr;iHfI2rKN}rLA%YgZ$X-fMohgPdX zOZffTmbA{C43Qtu&Poov2w*f3!`K{`;%+_<&SPH#Ag2FJ4MAb;eY(hKY`f<%X6JDn zU??cO!%Twq50q>WmG)IPFK(i#5b{ck6?qZGVVexg4rKaZSL|s6wpPjH}Z&?r41KKK{96@>ksT4Z zgf_WWVO&3INDqyP*HQPmPtrgzO$SnvZWmIsLWQ8d+8S`$Vm9S8-E~^m^XB7c8*B9R z9*nu;?n3fn+y^+2pg%Lu#4&xk*PVD^dMpJ;VdBV9*Zt>?#;xF)2#KEirpXky#Is4H z##eBxIBz0roH6+o*EwJK47*5pgC%}sK0>DDq6_3L&ajA=`-{ODhNT?pr#uS!n=S@;20 z;*+Gb^)~t;^HHmA5PHt8qx~P?@qoy1SnO3vVWzg@95)_X+N^9%zi#*}wIcqQ0t0h}>6c&=rAl~liRCa=% 
z4yR?|-k?j(f7<3^9heOQeoVPO8ODIYyk4dqL1wGYF%Rqg5N-NMR%DjGBqC8)Fff8k z6E2tl22Plk=@kHY!EVZ-Hm+8ihkM!m7N2q`EJ_k^i9wqnVX*(dO%vl}lQunz zK1oZFN4bK>8>E79e(=d#T(r85GdB?#kvyzGHoVR|DjPgo%AR%}N$Xm=SaO#Yh+na8 zgv&#$#~2U`5hNom1x<54XYc+FhZ=diy1ybrciyswDE5s*FlEJ%y9k0_d#yo9 zqozzIe`V=Ts*q$*=pON7nfH|Q9(WlG2S`E2VU&n4t)_$4PH#Q=pS zu<(Na0DqApNuZI+kgd?A{^fBag@0SBPEB!$kh175OnE9`?{7Fi^l$3+S@DBxo(;G_5xNre_Y@JTmWm9%|UaK-N34iX@oeT>= z3Ha*K-rqL`Is1AC0h_v*sI7)Nd5hP)i{A%m+^stAyA3?K^30W+TO=tg>e>T9TNq?R zjDdaPJZiGV$TSwsEq|^GOrKlgms1+Vnss`m6e$s|8v`OEr9oE@JzgY$LwTOiQdPo5 zdt3eBrs!4k%0WShcH$$*Fmn;Cac41-wk9`Z<7-M@m@ko!*w~d-DnEKo&e!l!P6e~0 zV5rzpyCP6;o|w!h?6`N}pB*{;RbBsey<6OizR`!#UY}U}@8L<~3<{*jx7*p2LD3gn zSDo=U-IJXQHrm3?waM$<_NSI0nK9lux(Q7)r-fDdK>QM;fQiK?n+k!+9t>>lGZvz> z4*p7c+X3@NMVt*q$6-MNH2>C;L0bVk=-({=Gtz@%f1IvZ}VCTDgo7v(% zja>E>@xCkx=-s4s)(ba2p6uM;#=B2KEQ+Gt?j@W*9kI2dD`hF--|GlAF_CQ3Ou^OJ z6+$tTox__y6CU#AE2J6Q{o%e1F5hgs;DHOPl&BiNp)?~yjl?KGXuw|5rm4(ce`-@2 zhf(O3X**I~7YgXJkvEP~9pRgddHHS0dp2n6QXAv#Y|xkZDy6Vg2&;$XBR9t+04pn# zsIBQnts>2L8F&CzM8%ZvAA+9sLKippR>zgP-{Eboc0Y45J(>Yz5v6w_K}0`0X&NNw zA_44aid~YJ_%~F`(6ysQ-`-9nxl>j`Gv(%PF5U_r%-ndUY(WNM#7$|a`Deb*FuHQl z&f|984#$x`2Q&qFwp<7}bgd@IkPAEw3Pbm@RQXkA;xzaytLfandgxn8fnggS>{*z3zJ^=b;i6bKvvAYi8cDC@Ym}W ziOd37fwo$4YmTJ#Fbif*!X47Y7>smuTGz>)pB98s>d**-q;bj+uiGleCf~}sA&Has zcxZoaj{h5eUan2*2g3b9;keWWE5H6K>0|D|sd+Cu0Sj)YdCqekX}MjCCVemib0lbt z9|EmJ^x`uDeCQrW#aNf z=5|1*Ox6HDg;92M0WP_6-XkfeC1WF=3FBB}m|Wc1mMCc5zFxzIa@BE#h5kK2qydZa zstZFzirIBO{6Zzi{TCam7IgY6(XS{OO*yR{B(4q!r4n6(JJ3MU(H#aG4f*m_1BYQ4 zEyClfXDOvB;VNP_=9||13jcoFPm{e+<&X~l6Ps3Blg5HXio28}>|s}Ezswz`*HeG5 zEB~pR(NwHc9Lvtxdl&UYqNXcOfOyDJ1nEMQYkso_jZ;wd6tK+~*C1;z1II7wa1t$F zQD{9H#BP)v2LVLfP5~FFfki9ofMBG)2a)S^{L#C(Vu*Zt@rS<{*h4yJ1M0sbZN_hK z!0*v$-kfjLZ~8}UhbC*-;EagXR|#&w<4~)jBTNi1!D@KeYapMQU95gb&o2A)Vy%Jk zzJ4y~(`ujrhkh3d>DyA6!Yp06Vd?-N7F>9to%!F!o&stOGDoO&dn<ktHZ9{8gp)97Q|l5c}jVCk;XA!98{R&6B8*j zK?m_t##M66e55W3yQ#cS(A7_R*yM=W;WLV*|api zn+F0L$g%V@@c)jz(=nArRrr0%QYT-;1V)+IODgK$M7* zj4^|coUDKk;UN8ejwHR$y2@Pcz0 
z+fMd|UIlfQCV(jnAPrCCNo$g7Sw}OMz>{txc7DTB!^>W}ucca$0|0z&tKX1>>d(HL zU-`8a3?CAd0OV>xO}7l{G3J(hAU3FPzqvpm9XT&&r9%2!D3W}5Y#B-`>>A=b6ToE(#K~b z6!_^-+fg;t5II-+4gjtW9cxYW!X*by{sSEu`C;(IxFv&KF?Bd11|m{^kC}G;F+4o! zC49yVrxdx-7<9IN1rNRa*%7V(Dgk7587r_)J8(+@|Czi6;X~@_gv>r>;@h}6o3tHv z*iw}Fzc0i6bwwiam&IEn&;rmXQ3lImr;XOJ=%7%|gg01iu}&m0t#g_BKchF?_il8r z6FKF#2Jj`J&)ApeOa9wx>!eoIe*trFhEZZixK}eSWQ2%#ymCaYhkK4x@a`6$(UL|b z4Dtw{E1lM9@;FYtW{>eFXR0x7YQAU`h#*mjBr4As;ByIp7UbXqZu7CJh#~Mg#0b)O z)J2lTdH6^`{*VVp9<%mjLAgyCk}`QTrGbB0x{J(vbbgC^;{i1_NFZAU7k%6_h25(8 zm0ED2Dl1Zileu(kR|{zGD(l;8vgdH@ghHs2Z~&AuNG($$t*2ds+FWN!U4jADvU#s~ z<^|-)A;(aqM(6;ysWVleob5=sV?=R|ZJ4MP~*mnrdbR~cVdz}HY{7KscR z1O!8{ukm3Jl#j%pNh}4}P`=hMmxIihw_VcRr6;s-vSs2*K&`JqnSQ*`!99GLXAc3l z2GzsjrBaObN`V@b0EB9zQKXt|(If2&F``j_4t`bk>P6r?%$%cLlZ{%TofO*anw*OT zjie{tNTqEfkS+$N%p|e53aF8Th(iAdK6&pMR+#lrO%WIbQSgz5vXP{_Gbt4MEow6z0`7jF15LZ9m5soMX$SsOd0!T1!8EhXTd8i~ z`GQDtpbf%L%zAEofh3-tE?`suMZjda@6>hAX)4d0sdtF-oG4!7Kx(tx{}hu&GbteE%fGr-f(;F3)4LN zaJyBP#%|@V2s@_J@bUg{^Ifn`_%_OK=94!hyuF!@vO6Ja4ynKG;1dm2e&t+N=lg1U z_gcRoa0qEAQfC#kS0#=1Mf{Z^)5vUK#6V&^o>sI9?{pSChJVAC;ad*83 zXXa}i{$Gx3XADMFSG%4Z=|=S0GDbYB$-)_;39RdUGZr8L?*#){D&g@4Yt=f9&Ln|u zvs7Bt*(TOflRkdZlZtZOM}&bEm0r@l%iW=SDm))`u{`>3 z8suR3&dD*ZYdLL$)1r|O0=7Uj)#c~4Q)n#w$vRW7WfMhni$F#icn8wL;p(fzfcRYX zLs4Ugt-XpnI3bf}cU5aF-gMIq5XK2g_(EDBGZ@ok@}f9SMPH1Li2;<$sX$%rUszoV_K0R5P{uz@3XT-|E=4w-Bo0-U<9Sg!eLCO$BN9eqM0N zvjHm%;AvS?vrqvZ+PAuT1R9I-i1iAe+B1YWChe%ECqI#B%;;OTl+&<*7GwGyYqn+l z=|JSD?roA#xZ-<-PNO!Cq^tNu(6rV;-?Mls+#f z{hHT$JF<-dk1TTxoYt>XNB7?~k+&5lwJM3JQk|P{s`h|>V^aB{IL|iD<^4SN<6W3% zHi*)9M|^n%9Fc5;aSc#qLKJ-Sk;?e_@zAFuvIuwfg|B#Y^;begy6DZuzHL1!P zhaB|(#AShlx$ZTvOi$yqE8|q?!gGY_dzTGi7V{Y3ry@{DN(?lY*|$*-orBv2E*C0k zF`AK?-YQTx$2iGjN5OmNS(mIo_KO#NJFv??NZg6bBEZ?~wK4t>gzDi5A9@s`6DOVR zA10wK6%7kE{Im;K)nfijCpK_$kN|@Dn^+=ldKp}GKx%PQA9mk@-+?FvTdWV7et#*u z(4xG!5*z+_Vh9u_Ps?TYA}H!@XK&~W5pYVSyFj#U3w)wZkESDKTO!Z9|^;A}M zYNoJ`3sM%v(nVMZ)$ZXO+rOko&2^u3`ph_W0Z-2z8SiNM4>{BbXCo> 
zBT&axvcMf|4CRAYs&EI1yu1!3`$>gW#SI~EFqw^M^+=htI z`==6edgY?s9U84K22!MLAF;L?+CBnwIXLKdyfz<<2B|YF6D62K&jWF9S-jeB|M5}j z4hf?zcCN%0&aSwc#Q|e8nz1FYUPrMW@APHRT+nGs_W~y|rKx_0!1ek5o$3a^oO2?~ z@U-|zoigd30m@Cvf}P>5yluR#?=XYe2#5(;1ZF4L>-uqr5N^NT$+4o;v8)c$q!y6F zoWZ^*ues;k|0w<>{NqqWEYHN`w8*EDHSi-%hGkk*3n1O;SJ%36-jQF3DK~~AF;L>? z*MKXid8AEQj@9?8TVqkXvJbTWl@29yM(t+}uk7^$yD2+c!4027{ogc-ascTl&dDDL zP8*u$x%D46jEhvT{~D-0l5m$Yl$|f16nnmdD1c@jIULSmF0`J3a83!l4pi{N;!Khl zN)^qnAh7~qsxeBMvyY_PoHKw6oYi6Ng4!pyE+Ba5p)!3=qz+GSQ=-Ab$x%;+VdeekAjQy^f44jsZu=K?y_<6+oLQ*#YMk_{gw91(f_ zR3UAiB6Oq|yDS?hCoi!1_MbNs)8q*sBPLxrk2~%0`HghczxCni*@Q*>bcdWTopDiQ zjh+zTfFLkOPP2^%K;Q{k6R)dF)GUv`DEot|XY{q8I$U5o(*Qj{!oN~;bi@?x5{xxA z0LAT9whY3%e1DlTScjsH_g@fb0Y|#P{zV2|PErPU0l>MT=5+ae7Cwzx+Nci9Ps#R2 z%_Y;{Sg7=|&z9q^Nu?rC=({Q7Pk@iqL)Z-*k@nUL@RK{y@rN*6uE7{zrXmXJOqA}w zK8GjIbg8=|yJ?#NWcjoxbh+#BIU~ebk(V_$Tm~AkuOLg;Z8maK)&l-0JdV~e4(Xyj=%Ee=a1Z35fuao&&v{!MP^;*$jg^k>?se9l?XDMf7^w# zOTwXbZB%?Z$lj#}dTWXDhT#0J<{?Ik8Hht7osW=`Q~BwK&*-|HNf|=!nK><+2%4 zR}F{#uS8s-17f@ojL(rY0e6A+_$@l%4+q^nj<#7BAn@z<27j=9q}6G zn^u2QF8yYQylA>5TP7QI^{P=PrtT`6J9czs1F;in4JE`{1iQAad#YE3*FaG+x=W*R zCKj%d;q;ng?E4k7r;!n8wzWxZIt#@!BbIFqYe<`BtEqDN*v5)i=(=UMy=;+Qax9WY)$xG=2cOze80;3;c7- z)uMbH&H2g3xFxsTR=Ib9$dT9aZ8M^z8!8H)Yfn}CeANdS7s-f0>|O0>T6zcQ&v+v8 zI5{o8tw*&xxx&Nn4n(VUtG`JG2GrHQ#}f?}X9-gSCTL5c2B&98{v}hHe3U(`TJ3~& zqmhhI4EsR9sPhUhv7+JCKWWMiYs?K=dbq-{WQJeLfXwYI;+QpT0p8V$%a&>yvM6jL zDn_le!yxwHD>3iLgdzQreK71UMIn;t#YfEp1Bxjq9;TA04ls}Tu7K{n9(pzZ%T}^0 zXcRl?-NV)|Ev0!*-#+AzDJ8D_v)7A9$DE=?v>DLRJcd7XAec}qPH)`{2l`^||CXNG zNgCJ15!+k8FRlVd42c8Cy0`G3pnLE z>t6#gf@&%}^nX{If0%jdz6EXZT0vCtzIPIb+dnB^ot>p`Rs=dy*?iR;2pi!JSyoQN zAODm2%)l~>c@1x z4Xt2ux0dFNsXt2(7!0-`pDQKowV8W01U!+l5><4P;Q3B^ zemkyuf};$&(Qu+AQUBhOrbQ0ZbL|boIlg+(c$4w(+;<+Qx1~&5TRtRHAd57HSU7^% zxiQsUa99_`x9L|aUZC1l`fApK+W_dpt$s+Yjh7bZSPcv)biPm53Bqx0?jBahYgs*b zzIp1WvH{mK3EPNxk(lIRLn71pd74=c8IClGrL${u-aiH>2kmjB(U%==1~hF3WeODL z3p#2uOT?fbi#m7n7n{6Bg25BJO36!Ubj0>n_D{Sl^By1{gB}0ro#W@wuVqN_^5jN4 
z%0LsKd32;st6vK*mv^*v%W^aibg<^m3}}70f@2CGlkT-E=x648pqtmr99hncLhAvI z-x0p#AlCK7Ao)QmLne8Q2ePFEb%|;OARDvAtxdLE*M=~s9;%h{4l-@@YS{lj; zGi@T3BQXZ=cDN`8h~_d3WR;CZClO)8>a$*}?_mF$4^ZM}3BuP$uHvC@ex~y9zpq{W zdEA+nWVi?GGOR(cX(S;$jpS*6bnuQh3>eW1bf66Kg%U^XOW71?AIybDq+TTk*r9|@ z{rqglfxn^#$It*YHWlj{fcK|=fKyHxaXU7>)E$^t4*AfYIU7?Sf-?R}rn>8%{5~<< zCG$BSoz8qYx*j$9l`2jO$NGituyWpE{?~(1&)-Iwfj_uzAC(e#M(JLlcY4Qs*@$!W066n+6z`n-p zmrUleS|5?@dzQ`x-%RjNK3#r!FKStT$Ua!&ubywBP#jG=i0fgbe>%LRf4Xa>m;u=> zXRS7`eg~59MelQGw)yO;nzA|{q|b-cMwA#a-aZHBTc@TN(^bgUjp}r^f&m0EoU1Ck z0RI4{l|V=~&mKW8 zM9_xaGs`0|g~7#-j>^}1k>ez^Tl0P`{8#}?T9gM%5k`!Sods*LrKK27;o2t)!>id7 zylP~&QRA@XM_F;LVa6#bp$XL2c#4u8dybJjtlLBNrGf3iOFOt zrR52UOS|!~0)B@-bS9ER1RTGMHOAcpJtO^~?&Mdd{1#BQsBE3rmS9bN1nkWs0Xx5GCd4s)MlH|`o2#j3qy>wx8aqfp zqpLK;=X~@nUMS}8!&}*A3U5RkjwTG^zaku?>^6uzzNjn0E<8+fQfcwhw!_dB{m)>N zK)-sEBKDe zn_|hBSK=t#cJ6u#cBcGBX{L))`v`Q7`p`Un#Y z`VKTjj1astd?R2=q&hb)a}{|t6UGBabU%xG^d{e^l5XPSE1$($-68LV8`D9J>FH1B zDvw0`^W;fmtr>BDERQVlhFYinN{qf~I6+i$H-X0a|6A%7I1W}}1=T`FZ886bE8|GE zdm_ZYW!%dU_cgYaiX(3c-#*vtue>gFzx)8IfSLI_+@1n;A1`xC`XrPDydrOW>hr3T z^DAQj&0?vLYTH}liu{>QpGMp#6_koh=jJXa-J(cc3H^Jc@-|s?4_3>$AQ|f3XnJKO z8hOz1ApLV!XOt4K6~0fbd)_a)dUy<>xr5qY*(}t!k|Yi#IZBP6{>A-4A-J95lSOmR z2_$HQDuog9uP!x z7uK>zYuB?r{5N)utoEL-OpW)SlNV%owWDC5mu(Z7EHPPh`}QEj(A0T8QEstND zY<%0TMWixA&%E_hmlW&>) zg(cEQ@pAb34~|@dETb@Y!oS;_TA-*71Wu{W{?yOffvjwCCBM(Ts)MJ^GAt#@S2h51X{}WZI|t(^(XWcNRfmgBHFvmnKFnAECQ9?WWs!eZ7#m;b+CZ3k8YU(tA}) zn`P}QYlC6w4HL=W{T>=_-40COty|;1y07i?D4x~Z2sSomOfb>xC$qwW*(C61o+>Ap zxwkst7fIVO<6O^n!goy1FF-xX4CpyJONjL5ZvVLye9low_O*nEr?WI?nGB# z-%0UQv2ju&LHi;o@!4?3qAn%iDFsF;f8EblKdto!9|v(0@2X=0fLhc;2mrp6*vq1l z(0Z;a)dsK)wg1VIdOvhRyfVvx44&0^$uEKheYA;-fFz()!1DN$Mdf${wEFvfG`~>T zR#n|@gG>eDKBoDC=DOWijgl*&Ji<)sDhq_-tK#79TexCu(y)0IfZkXe z>L0W04QL%MM27mu{M4JMpTsc)q5JBSEgg6!zr>?qwU&gazLbb7EW`dL;U>8(s%Y21 zWjrz+&50G%;~D@NI5@%7pP?yE0w#<*12DJYCA~7W-BXZS4~Ro_SX|vGf3@#q%9x5& zZ+Zo*HKUm!NkEbis%;vPw0nxWAa;^khP{`Q>tK%jWm&@Fupl}ji9xqK%ftYic2R>L 
z)I?`ng(gtMWX8%)E#3dYXvt%QK$P@=j){leQ}C1etLo*DB>eUCZRig`|kBEsjy&&NrAD#`pNA!nJt^>yTXr zS8W8=1Zh~8iGcB<5wYoUP##8OkVo(!Jv+i8L9d_GjLU`aFPx-*P)c*=*;IJ|L9cU8 zn~WYaX-5rcA9y8ciYFRaVrWmmWv&CQ+Rx^CBde2ddxJ>zh)EdR;)ckz&L?>=`?wli zq>x@txd!{tS-+c`C88tGn-~&wPEH=%_Xd*zn0d(oYO8(|v`ghDc`W1N_sM1AIK?{U z(wo-17qF7BWbYWtJai6yaz zGWy+MBqoQ3loi`^UN;xpIK)|);aFTOq zoigdgCLyKgFW0so7`XL!`{LlG8W?`KKZd?Q=Rj1Tr9^)B)&J#D4`vtnj{h!rH3jia z8E2j3yqTDb={j%EdeGV&vOn(;+3GG_83Q|c!gccxc!$)T%o4CB^wt?I=?d;;=?I1d zeN-Jv_jtVEz$fs4Uipr-)m+(?5~1q$AnoIcg zoF9FbIEh+ah;@clBPFmB)B|Ex4ixKzBpDd7_^2=IUUW$_Vsb510Jg_(@$wv>`8=9eVdmf}u z#zu3h0yP>ggC@nSw^Odx3L3En%e@h?ipy?6?M8GSV|Rd>;k~bIq$2U-jMwT}D+wXT z_^A;{1E;Cjgi89(l!E&|{tWVA;t@$q*KEj?&qB_-mTj=>vZfmj910j3?&(WcRHrvN zYaTdP+RO1YUy;a-s2_@?_%H@_4j3^V^M}Et12xN8z>DY-p((03r&6UGSqR9KPhb0e%E<`!{2SJMP^T2Nv&1*u$@&;3y#9*G-tgdo z77xC)6`h-l=UX%;Hpo4+SKEJ9PJ0+UqT=aNNxV>GM+COWrxr@X-u_M8$m9{|CUGL@ zi-rn^%5?Vwh5bm^{KT#z(vr>O|5PX6E-Jmc60mbYcWtL77rkojLbpXiecG4^Wa?)N%kM~+e!#cB$;;{;IcISJpH-~b z&*Tm1r!@sS%e$Ea0MNcvws!@v7Ua$T3er}d%Z<>uL7tGqLNPTB13CcbQ^!i_yLXX8 z-5r=G-=}r#DC`LCMaNC^{uVBgWnF9`6OtpE0MeoiGruQLd(yM^OZ~j`<(Z(jZLloa zoZ)oB!T>}eSLd70?)9eEHM&n(WI#!yXM2ZZj*aiP_Vj`x7#C07o$c(t#6M2vKJNTq zOees%kTK;I<4NC4Rws)i;(u1C6BAvCUsFbHT&Dp_2iwmj-F-+#KjrlyimB+gc z6=pr2kT%*I@2s(kR3ZphBB`fzrhTpW^z|h??B_G6ZAOClvn# z9)Ewb*2zk|v6*l`4cVMIe47@0kVV>Xs2|R1(+YBC@`N~rm3nB73pSk$AGCJusAUg4 z!r-!SSa5|yU^deN!YT=3CrbSSl9d1s2*~{)U+fk8kRWSYdjikkDPr}{Y{g^MJq2CxRy_8sq#hMUsc0WnHe_{n z;8;8^{G)<7SV#;yihhwO)+bCG5KklW;KzUpSQx2BP4X+VY_cVEhk(W znL?s}sBo+#U0p8NKErB(6EG!`1!3a7r=~AiRde+OSMZc!0sd#7offB6&rEx~+gXah z<1(Ce#R|8I^znWkwA+9x0BaaPb1M!+Q^14?fj5H3ef-bB9(=1wZl+LniL|+1K+>ie z*j}sJ)Y1cFF5HdW0?29)$b6sb!XQy@d~} zq29v;gSHJmTC@a9Q0x1G&e3m2AKM}^KfAo9bh)S0s>5ic-|HFuI6OnJ{z$9TbsaBM z21CHEGN>{Oq@V$Nl3lv>_jNgX`QwRZY{4x^n<+H-ou#6pMz4UM?zDC0mQJjEq`wHWZxR6wM_0C@LxZb^nY67pu`K z=bVgQ2|S4E8{aVv*2!?iU&Rs_5u_Ea4Cp2$*DT){YJ6r%03qaw@qHhmlcA4q$Apo2 zq2#q+m51PON&}+V)_SCLznfA}5YT-aU-48;BGB<-|X^ 
z6-PrQXUruiS4=?vjBtN^pfJnHmkzQ9!-}Z1cAQ1?a9f$@c7*P=}gV94z8>7d)62(x21gRVZ{0vSkykcDZhYtRmSC# z_|){3i7KxTt?{o(v|wk7^v}BC@}-a_U^B6aUNTkw(d>UP(%TAL1~(-Y+V;{$WUy@h znD__hIlSB_C2d)Rq6A_rqHIgN4m*huG-I7VAQlNg`~X*F-wE7MMRmw|M2J||(sTYj z^80?B6c%%gE8)XtuEmCq5L!xBmyq%dg9oLtS!_n)xFdbe6{B^G4fy)?th*2k;n>G# z3-e^Po6;zGG%Ef@uF}1cZ)V_tDK3T7*ofdj_tgYuQBdE>GN*AL4E+l#eWz@C#C@Ok znFnat>S7*H|H1Ikz?FQvcg6`X=v1T%Gat%!^@)BFDB6yOw}FGLdYxUe>nwf&k96>o zhBkX+%S|x1$*O?zv_OVyXdvoB8^m=UG2yfbIq_?2PMv{s_}(*n?9dhR(PAJ&&L)}T zg&C&fB^$!JouPAiVDIJf5Pv443_nn*VFFc6otV&J|O?$t-m>XKL^XyLX?8TUK)*lq_&blkn3S@?TvAB#V13esA|=k=%^yG~I)sB>Lv zH=7~8GF)2Jh3EHd%h)|}O#0t5QO!NDzj-GQbq9Dl{ovvNMKN1xqav%?5n4f8}>)}ZIlG%;zKhTY+ za-ZD%bo60DKKs2ljq(7BFKdR#A3})j`4*28f zMcy{=m8?&E0Vd@!L@Tk!Rh4f}{^$XHyq`O_G-ocBy)b%xPl7w051;~?-kQkXMkX|P zUZlu3Xwsu#1R-@syfFhMHv9F;(eeg72eI;%;*lqfCMQXV&Ycb*JY(N(z6VpCZfLb~p$O z`^ni8;Zj_vP$+mrecS0hJ-(;m@fKMSr=8nZ(d3Tda z*;?Omo=PQQgeLVR^uh{-6TB@7@YXBHw}sLzbPIG7pR>Oz%ox&5ri$~$1m%(-ulNMY zqs`eIFkK9)a{Hd?OmR&ld2!pS$pu#)g&oi~*)3(J2A z(%awpv|hK{>ONczoCZCdeTTc6hbA-cYC>|wobk{77}LS^P%KjX=9kBDqTb%k9c$&eaD#ZFBJhtg*Twyv_nuRYQ)QTs4QmL*w~$%;t9 zg`UD}=mKe-AL*|SbML0Am_Rnlo8w}+^;J4qV$KRRjLa;;(Y#B{8PJE%LU z`ZLwH1@}>M$Z}bfK(y5kp+CkXkQAPrp42kq`$1PlG57ZApev1S)_=gT<0=413zH$z z#%)O2JJiz0FfToPf|e085PJ=c7xU1|`N*kiCjAN>f%f|(Tf689K6 z;Vl3pCutUWE7-G@9Xk_SMad0!j{}Dd?r1XsuP`iY>OJ`O3 zrYwI53L_3M08Zn?93lz{f(BkcpE(A+Y} zFowP2g0u4x=ns0C^KtTS>wn;1epZztN?24UDKytqh@*YYwazRBZ6b4I0|mW`oZGxi zgdaFM&+6FwCD53cz7Wse^1`Aie1QA^)StvY(hbWdZ_R#!nciG-s+#UtstwAdbJ&7F z+Tu**07zl~@XN*YARR7y6ZIHOFM7!8-;r7%*YVv)q@+-*3PY%3)AnAVEj`g&j37I& zyhfuLM54tu5sfqDK7S*%9+sV^ZNzYMZAak!qmB7fGyO{xN9zY=N1hph0PGVna8=rY zzjK_oZ}Ng=MER%$vPaWzleTn0ug8%;gNZ_EK&zsVm}#jbftpjn)GK&_Ravvfg2d$B zN5yF9=2fF%C29%SE zYg+=)NqQomb|j8rr;E+rcmYA;34H5V5tDg6yl5r(@MX27_tiZsY7-rN2!Iy-fnJ`W z9Kf7IXj}q#Qh=GZ#XHNq1lq?-KVA&|gzt;;q~?~jER+hA=;jZ|C@+FQiY0&HaJuD) zL|vfB=5B5T8y5=#VyFksK+DG#8%IuYVVP5W_s_h4uT7Upt+Gyj|~_#DWBeJD`|e@fu@ zA&m-+xPQBJt1(}iPmL6iGa<-Vm_1x$;z}-0hZdmRL+88K{a!5cSZUH^yr?o0fCz-U 
z%z%4`M!kc1^e|lhc2IqmppgnE&XPyg1j={|%AEX>m zxc?{A7I9Av#fslo)hT>V9?EhmoMWg?q4=rtMr!j2z&4>pzaC$+lmO}J9SrXb6TRfO zoNu9A!6R8|VZR|I0((DVc0Ng>)?{)qUlr2jYV3(Wm3bQiL#Y@DgO%NuzI9^2kGlo; zQ0BBqI(fA|B|66EgRS~2HhQ3#D&ZKA!_$F9JZGuO^;+?DTV>gA^TvA^_w+czJ+5)Bj~c<&Ii3z1&H8JX0Xm zcOL>T(7S}i68B^>vj-E6fu2>VJgCbKS~=Z+)sHIyCrY6*HnTi0go;%V?Rg=uN;9>} zqm&mPwpRAh=k`vWD))EOqz)$frC$w`x*I1qUMcooNx(%I5Uj_1zj+^*WrFMkCLAYO zx>7B`+|k1n=wu8lIKyubO&%NbtztEN;W+duQvFLKSA>Mt#(z}yZ2e`ax>;@5$)HZa zOfAj-{V6gxg1s`+FQ$x3w(>IRA>6v6N9#8QIGCu9F{(}yu@7O}O_2+=DKJMrM{n*s zLEP}>=p^jV3$IU5;URgyn>5E6Q*LgkM>RN;ePV@wAa|l+d!7ZzPA^raG_Po$joh^B zZ1ht`I;&Wv+?nd86YBmHIa`kdPn5bwQdiW`&k*E8|2eaWnr&qF+W;`LR@p$R)Uq$( zuHMj2H$(%a&4CX7nUWd|PAM^Uh3cizY7ITfqCU#)bzd=$X9OOx^IRZn@ zsN2*w5CiN{7I0u*ird%s-G>ZejO`22NCagl+R;|#_ILfO-T_A(4_J&JTP^L&vj5(aC>zvBmMTjJQQjdWYz?{J7~4bpf&2}8Jo%AhER0{kt+$jw}p#* zRn6D5(RCbS_`er1gb~ku>jM>T7Kxx0omthbD^qWH?oj7#)mOj$3?iw*C?CHWqMasd z@g!>iz}(xMmDEMyDCqz1HJq>+0C?oL$Wt)lRyz1C_dJXCk?tX7Ht?l^Rr$&=mk-_? zJsOn0W;R7JIF(lkqhQbArLqaraHbzb1QH7ZvOkzS`G?eQR8d|;YRMlY&(1Hu@kSLx zwYkA7R= z8Ryq`KdOh^2`XENoK;(OO(X!2*jVY9On@Bu$*)!0kbJt>0NGKtK?{f{+kUNT7J+whlAJI)*4_IeL zjC;D{M&Jz$_VX3Jmx)0rizqItJLxYyA*#}fAd3K`;|iQI!xcUcJQ_((V)GtU7&GUP zT50e^oE!LcfiNcCnOk&bt?LJpH!E?UTm-3RrOAc-F`&$+XFplL~%eq>_- zCV$s)j^@&d+JR{42FVIN9K&NW*E27AB~eWP+*#j}d@V9^8rDbv_>~G*gDn!|$Bnq& znjx+~V1q-J&7Ia!A+ly)hwY$Utz+l8(Fk)}KU4neA85-+-vcEmWp{otI7RmMZvp@=isf}MWZf1! 
zkVW7h$Q?_X;S0&@V|NJkWo&py%~gc0XC_2MF0CHbs(yEzF65%4XR|vf2ch1**_h{Q z`Kiaq*7fB@UmmxVudFJ2y2kv!rtV;Rwbg z<(mPIwSraNc==~Ed5uTvBL)0Km7CXblpgTnUEgqV6o{oFpClra=v-i1lf*A6GTi;K z$PBMOZc>e_lz!*%kESJ{5k0%M5mkkG820mETer&im?|-F`YqzM0f>6s0WLS z44f?A>c>LrYvcDLaxEt2*-k9AAXIwT0fI>9n{9^R&FG-!RtE->5(zUVSB{)V54Nv$ z_tZn~RyMC4rhvyvurFlbf+}6}A)^^KIVD^7ybOdCo$h|icj64NyZRVeujD<7D9|7S z6>DhMihE{EBvy4tZo{A#aCfNDKjGPEg`hX5Z_-tUvWDLISLw^i{=EJlH=IXrl;YlJ z%BE!=AJ83<%X`$XvYyZ#fEInuayx($;h@`8uZ{aCWh-M~;;>{Ofd6Hsx9?3T4v0Lw zMy(lk>87oeWCmicCOAy|z&W^N`)d<6XYv`o_PU{3m*FXF8Kr^)85>W zQAJ)?+qtt)RKQ9ih^Y#xA=6gbQ?r7r3SGh*%bm<%?@jj&R8wVM&OxHVt66BfxHV_h z7p!pJ5|^TH9KJC>=rPbYZ5dXQ5q@^ z?=Y}~RBvYj+Lm&PM7r!+5_NiiUIl?U!w1~11cB6Rc)avYFB`@2Ka!(Omf1>)-5QpX z(vD2$T2!rKMj_=sk%^JI#0gojxcG>5n^ zc$?zZ;thnO?Aw;|C$9i5Mo1{&OVA*Q-c(e2{|r+nthF719iN!H633%d`S z*P*6L9T=}%_Ul*mZK2b_RZprl-j!!!-9tzR<6A38<3D)ouVR8Z&oLPN>WmO-#|Q)X z6;eZI9$)liTViDTsYD!{gU|t=@CpqW9TgA8{o@ znxe3~H5LKgRLeSU@#^EXq0r~+>Vx{M#IdTIMJC=y;ZF9$d^ElK2^F~lqTtJBEurb_ zAzkW9D?AgF}24WS7**aTL%CFGnjoIpJ* zVx4X{)!~QKB3Z4~w6fM=`-52xx>mDS((PLISHg?sT7^}H06PH&XvR7xc+J)Qc#CW_ zRnlI*UdNg(2)qXeY>ZliAhM}*w!*pes`xcdh{pkVUs4}VQW2@Nz`+z`@GyoH*Sgt-bdfetu@&4 z!je6t0ImtCJ>Fg=?yVTO*gh~W*up05kj=wd=CxR^=II@~iP3QKY|rU833zf2t=(WT zD(s-+y-viYreXZ2Snx{=J1!;X)HBd|7T<@j77wPNC70kMzM0TqhWcImlBqu7L(9f*yLifRScQNtW-!i~UhAVU<- z7`r9%Ujy;eWf3rjnq(YpQCio}i5MX}SOCv5IO;CLw%Tf$+UymOIy#vWXTbd4i@x{6 zHWw_k3=^x5q|BG~w6*PiW7ckz+2KD+y0B&UpLn)u5$mUQxqgj1Dz(yOczBFsvldD* z5H=9iT;pn3Ww#BOt!_YOe z^r?iivxtgohW`}!xH5Cn$IHN4Tdq?3;46RKftQzU1ZvQE zK4T=LMun=+5CI;H3X?f4XV)VV6k6mrJDDvIg}kGpJf%LiA^G|$wki_)J>8S`G*lbur$3*r$ z5CvDTh;fxqb=V_phuSfP1WX4iH*w4PYZ1=Po&|!luv5s>q8V$s>qWHg0!;+OXpk|8 zH8Kdu+cmEf?(m8uxZ-6PhvGZz6gN)ucEZ0A!$f>RFITsyTGI|Lwgq^`J5%&w=tDPH zZ@M(i6iTb`5!9*Mobw_T7{&U(MJx>StKCAC^1Z1ODd14iPD_1=uW4`d&!Vo5Zf)BV zkrPu7Dx__1G%|=8avr{kR4b56l+RTNmsh96Ye6z&seDQlp*PwK*u~!c-B+#XO7F)# zUkF%N)-5=Q*{~1etoTJIHzlM7jFCmq#s+N!4!47{Bkc>Jk@1*#$Q(dW8fnoi83Ew^ z)qqYxUG4{v8w$|tHq!8A&wPCSgbl)*7b-mLs_?(mwXBm0jloV*(f9%OVt4C7@Bdg^ 
zc7ZXYC%riIw7aZVj>S3!1WOZ!oF0S`2Znx2(}4iol%75a^c^nqVQN%)G1b44&`$o> z4g*2lNG2C=vreJ}b;tBcUi>U=Dd;S(DosVx7N>7$pqX-iS;NoHY`lxYN-&7|qteDl zlWPpv#cRBioJ-m2Qm5i1`D&N3ZV+oy_7EfOO&7xVXOh^x}q90iVe5;ri zzIR&koVGAKR_~MdYPnL{@2~`utnwd&5VQl&coVe#=dq`oQCJN8<<`&LY+~ERg6^dX zAm5+-iVCa3CX5a$UtYhBe1`=Z@r=7)2kf?O!Si4RPNKQgP_EeL zidtvP?8h042J`Dbw^+2S6M4}>#mejpx7^h`A)R@LSEbb-0Y77_{*z|}d76qntNX}| z06H@L&%dd&#w9SW|2+yuNa{0O=;SVGZ$+kLgsK!o>Un3UPZn%`W5iaBYs4N$YY!59 zy|v!ZB_tq5&|8Si!$Spm0Ee7GEai^dd4KfQ#K;QFAbd}IP?#xicgqMfiVrrulFud5 zW5o51KzBcm1R#V?(QRXLCiU~4JZg+Q!K3+etgF9MBv>mUi(e_OsEaK(xAQ5gwE7n76QellxE=9&D z6e3$0B3>u}mSG0;I`?I~UUC8eilek^WRawwlhm>1D5ZXVnA-holb`BtodK3`EIRpt z<$|M5l=f5UZTB{ZS3%@#8JU$*Px=mk@5E@9?-rO!m8pIUBZgKcFOy97RJ;0%Ja}Tp z-W;k}#IhT;^Fz`Ur~(4#fyqvgf;u4;qKXk=gz@2FU3jZ+Vr|rhyR31NbX=1DEg=?} zbdQy`NG#A_d%`~u9B7>1EKEU};{+1L1xg`Sfgk?nB6XYVA!-0ioz0WRgD{cNM4qRB zQM;5$6PCB7_C$LiRX8E3u3z8N5aX9Z(PfDBvkWXRSCc5rSb@gtU=wdFxq}=`4!7!T z_8l~O2#E(`(beIVc`lpoyoqs%Te}cHJyn$3fC4MowE2C|n&?d-ZiI@o-eew`6}eU- z{uYhXUII&_7^(6*7CjC}X`m~Vv_c$<21%}V;6z?~&R{Rwj%PRx5JFmMIlyYK1{<4) z7w^mPqegOMUfGRJB=Aqa*3gusu>W-x51KY=NNb-$HIL+gF2+0Md^V%^I4H#!5vWz> zLF2no@INM>>YX>0ST$IR;Uh{&S*@%H7k*h(8aE@okSkG#{hg?-aj3s^MafSrf21N| zhOd&E(uRIJ#C81wGZqdhru!u6t}woV&|VDboylgEaTkJLjOEoV!=Fv}6^OUf46ad} z+piA;$LYAC5FL{^(k*4BWW$>WXSAIji#0qeYTKsEk;@?B6TXBlm*-A!#mPR0e~8+l zy_pm?ifo!c+GOTi6JX3Vgc;FkyjHJTY7C^A#&#UZZ6hmZ%y+-0C`iK@KJ%=m z2hf|rD0INuNP>SxZy3tW9&*=E8oe^|nRQ8AXv(O=?I}0`ogsi7Xsoqp>&N?;>lG{K zHw7U@SE?wEp@`13_BuJSC!gKKa;zENE8S>124dHU#FFd`g&l=*pVWynN5ncL!lg6{ zMP&ivCN-0s8tL8XE2uV<{UZpkv(M>*Q@#%%cLxI4AFY+;yLEpAX%!7~K+zzi4%y~* zl;BFnn>C*zV3^b#p;a^PFy8#?Y)OLDTlYiuL;9g(ko^oibnl|GlQr~i=`>%=H*6X- z@bR+1%6n$(l2YmO`6uTG(}8QsF;rW3cY^7FW_2tz zpgUvtiC~!x_ob@n2%bI5o-s$a&jKicnlw&A<|`7!&;RRx&Z6QS3qq1R4dV-}#b;E~rWZ z2)tI}oye`jt-f?eg$*FVBrQJ3oThAe{78YPdOZ@0DoIDFMWYlcI`Dz@W{$HcPP*#| zsxTPyi1pcfZVXEZT=hO@Esxr*KO53BnRqdn0WZ0eyfHNGtw7AfCU$xTrY45GGJ=@N zyr5*Igon+ckxP^uBhJR?H@(iN<#vHOq7u{Y&IHW0r6qqyrl48=!-G=WYmRnx$Bm> 
z_RBcdhy5DPt!h`8LmKxi!Q|j_)z*3TuDH;+m0$2>FOH9LLOs(G8n8Nfz~w4BOR}X0 zd0w-GsIEHt%^9{T0KS$8}E>umkcZAmP-B|1lJ!x{CTRf6m?g}yTkE0)h> zXZj8ZovtsUFp!x`->44`XPzk!8of?$X>jHM>43cKgcRzN4hbYtY^~fy@-UUcO(yO# zELof4RluYkBrjtp5;wKdQS9ko8)mhGaQ+M_Oe4optUwA;!LY!P-{;JW_tf9f&l_Hr z_w|2fNnefK1%bT(#MCos0#69Cr(G&wyU26cgFDSSzx{EM zwt4sD*~{06Vt?&4u*YQ@+#sJFgzjGt5Dx|lNB`LMQDzs1M3#Qak@}ckqU>5nC(ipZ z{csTUr*#6UhXsVTm#cd6zk8VU5FPw2U|2NmJry1ffH$AjOYa()09=#s&uxaitIsUDDe&R;3&qf#^8VF%?6U!z$mGiSAXm22^fO&{h6VbgP1cf{c zWt&riU{A-9ClLCw&>Awo`GM-+AQpdTeBSVWewbeA>k~mypOpC(7A~YrwF5()lcJc+ z&Cp>kv}dw8y2GRJpD3PDS2gf%E4Kzd4rQi$x?X{jTW0PS?xSstjlJe`{ygdZjgNvf*VILG5 zh3>qnEk8Zr;Pd^X+$riOJCNubNg!mP8XJ)8*J&JcWw~?})y;#i>fRhd zf1e4!%L5Oii|k`s3v$wPC`Lc6AXlcOi{N}ractKUdDXy_mebPrg&{=-%<&tobI9R( zMGQ;_*MSB##)b*Id;gOeQPFDhP#yCbWjiaEzw${A>ls|zE!+Lz9{$&N@yFA!cDkk@ zhG?iK*+z$MEg3CMmg!QKp%1Kjzt}rytLP-9v}PX;jWfK`cthOjial1YYk7@^#pKA# zyz4YQ$YJaXRGo=$ta|H@c!*4C>k+M}a05O)bB<&Pm^)vTmK8m3=lbrvc;~Y_`iz=;r{ql2DrGf;t|%?od;Jl55$ClN&{pgUxm(+3?ohLugo{?V-`p z?XP8h(?Is}0?dyJl^P2EYot2ZS$&n-A;_r3eh)>$QXBoqBuWeZWv3*qW8tH(KY96a z4;SRqLv`?G;^~v=uplT;qFhFb!f&|FhKnCLiZkJ9mMA zqOvX;rT)@W-b=oEc7yUW$+@{jNE9m7OO$e!UGg>;1dBUU_(S7G@EqsLBLFIR008RG z1G4LQvepq_3%!y|61EleV;h|vl?cY+;?grX#X=7BcC!#}&J@n`Lkm97Si zc#cjMl|da+M8Vr`nXf>33ym>zT&d@ILJ^K5j0N#5ZWI1JAW~x|!+RzS1Doz7D_%4NpwjXzT)s#cmhf26!eQ)2u$m+l&=6 zaFJQwHbi?KpR%YYbQHkBf?&@c@0&c@@oWuXZqzSo?HQwG%-|BVK0MfWh?|&*6t3ZL z-ve%Jwcq5R5OFp^Rzn zZ+)ujphz19iC*5w6+l(0UPJf6&(b5OTJ-E&Tr{ez<)VoN^r8msVuH*iyMuVmX2^0oo- zSb7G@=jBS&F-<^cMR`xXA!zWFz)lp+6s-T2K^(G)XD6fJM&=#wnX35B@|sfk#{{Bb z-OD*Zz7-X2lqkDIqf53n$W1(qi`{%mEkR^|SwFb$vx`6TrUovdR{%=c85>lM*Z17M z2b8wntYCnTPw0B!%BECi6Q!bM`K7uBmW8lpI zV+sK<*D^)hMPLi0V5)DR<`ETUtiG~MVH%Z+u#3ZtKxyN%X2Y*jN#ii?pd}rmta;8h zAGpPSLT3})@2R|f_J(7)J&AHSD4JIEuL0{Xw zxcg0#bV<7}FoaUHLQWPW_)Tg49oV7o;XTrlRQQ9`P&`$%8>fDZ{hS%2 zDu4J+Y<7lfdN>+ix?UL;96f0NLUlkL9)&-)eE{&QaMdr#8t(cZ5Pf_WIFPu)w4wuP zd?1Nl7wgS{vusm@XU`d{z_}%V#2qwqPJPJlnmVj7ZH_5xr?yw_e3Wf)&w+52eKh*> 
zqbweDOUZjpQMM-~NoFHq<+#>uBrq{`z+p*1eLGcqE<~N9#;|Z_lpb;)8|1OM+0#4(^=ata7HIaX-S(l{+e_m6cC4D3UQ=n{1CI%JC=Ph}{aR~?5TX)?T zS}uL6{a=~=LBma@3V@+$n~4}VoZNffVhEKx(-UNh$#4t-1is_KZ#)l8qKSQ4OfCsntw+&GW55sge>jsuB5+H!2%0W|FQMq{XP4$zd z=$ zDW5CkcqfKW6`*s>Cr1H5|H+38)o9BD8+#Dm2CZ(rEXQ!VdEN}MR z%M1FI%R>mCq_3`UCt4JbqaF~Jy5Q&fEn+2EC39KZ_x*l&Pa+>P>8b2d%H!-p@7cGN z-b`W6onuj3wNTK$`pzc&RY{FtD1uUPe&`x-6CY?UorXKNeBSH9mMpIktc_0 z+ohxwJsH=>`-6jD`IFv1O3L^XJeS>xlurbid3 z6a=+2g8tm(6{xpB;TK2p%U*%5p0QcfhZYkMUyJLpD=qD-3qyB9fS5niUCV$Kl^ZiDoA63Y>hG2Q_K?q}+>YT)SrF*(7bSE9a$bvqoCayWe^ zIvMQ}x^r+9kq}H&TlBTCCTf??sZykHxPh^bLs4Ranc{}|vKo!cV+?9CIg@#=g@k^? zyz$z@m}slfpB8zxRZH}rNz-=XP-c|5W3A7_^ZrnHUT-})FkV`Sr0Jzz8jme54tU>U zp|kMP?xm=xCQH6duU|xMeMjB<-GXJJ;&Cx{B$dxr)i<5Eko&s6ID@0E_ApQy$(XOd zARkB0s~qFWo=Os>dYLMp;U6sMw-q8a2w6jT+$O^8Xs0ZfSX|&pSv~0oJQQb`Kjd?P z>i+?B)NLeMzOyDuCOT3Xa;?y^v4nfBPVJj5M^7rOt9@Hy>LCS;<#^YdGP}6ELQG$@ zXE3TuO5%QrK8UmVX2Ea>j^v9)TkS~4tY(8A3(xviKC&LGxaWITG zHSRU&-{t+e^O*1@29ud&eIl+!&1z(po&wOj7A`{R(TIKb6aZjbmjHF90Pl<8(K%tl zTS_|nc}>JFV1qa0)7Xiq#u(@1N-a_xeJ@RMD&$oMUlB-CIgs>PpkK{I&ZM&@1WW&* z+f$ZJt)`$=h=~Y+9;e!AB)&y(Lq-+er)ipRr%IYwN}X?CyLfZI;MA$3aZVat7=QWD zVE3_(eEUWhlZi^ee0{7j6v-jSpqszibt{AEk`;IfD$PMHAQ+8`72pBDnuf_QUh0Pk zgX>X!$g)NlM^L3yu|J<*{uSrPIs~J{)pyt70_<={mTAtTBR*@_GdDD2J9^lkPq}gY zzt;z%;9D`TB=u)z3zw%@rl#JRc`cNsk0q@}pb!p{p0k@$e=*l_*;mJLNTo41i0obB zD>D8L`a|>y*HXg*6XRcmq`!Bly!n<5Ht_~7ESJ4|0qv}HcImzd?Tv=lJjcUPi1M6f z8m}FGnS$LFk(_T5+O>7b)+*ux!2Aoiu~^&ff4)?_3L#OkbBIK`Y6(~=Xf74l3R)xy|%Y2U4u0$XiaLCwmQnRkbH zqGGk-a!Rnor>Gf1qz9S>RP{VxXEQpHo%uWM3#^$HxHJUd^jE+19jKbZHLXsz%R?Q> z40hq&&jV2zu0PFi)`{hnj}I9?3T0T%r}LS+?hMu|N;jy3dAa7LE2>=4Yn{U`ksla6 zf`fe3Tu(!NdFJWEC~x>5LS;R;|B(6x=2zuzY24`;xy9F8jl}$6C9K>2 zV!zHYhfP2@QSjp4P5O@>jAcgJQ1co6vC5TNb3?qVafgpmxmnWvz8==;@8l#i=4(`A zo3L&ViP>*&KV4$s?E&UCn`NUA>snZ+X;0?&B^x$)@U@9^Z!&Ceu#fKV*Fgg1m{N@X zR7h7vvQlf~_n27OR(7sOm8Dvo=A7i6CF~oKn`^R&f6Cme?7KYDD5BC1yotGzHiR0s z9Rx}rBHAL~!_sxla660O#^^tr=D;)x+laK+d2!c!oeLOZ7(}3F`PF7Hct}?zH^qs$ 
zL@_t->J2eD2j**lGu_{;R1$`|&6y>BPrYkBWJ#8vppnfiRm6&ieflkkzB9z*`GFKidB^`b~%|6 z1a|1E3;{^$VKHT-o)Lz<`9=%0f88ROc6K0Iq7_Mvx6*+qPd$1nm)q%`kfM|jg=xCe zrJd~H_>{5;p$cLKY6w!vSUTq^SUcLTLeI_|m;CAM!I8p)x2l;ZgDv$ZHD8PrF<=@0 zP*k7Mh9$Vv1D%TKdy!c|K!{O8{!Khy#K1=Uk9$k9xju;=#EK_b+ug~KWg071;HWy`vpdK;iy>wlnpOq8b zZ9C(l@Gnj^Dx*Pxr63;lN^}Z-P808{s8);FFHaF?+)fS%J&}`M*`Q1q)ViZHMb0rB~cX++Tdp1p^3fPAknvMfVvR}*(*=T z0{Y-;uAjZm85nxs^QBh;Mkus3_s`o(!=1{)(M8|t4=r7!BCJ4CdPx54L^`SF8@FH5 zW5KTeO?kjghyUr)=CrZ4c3Fips=n#;Qk{kLsLKn^g^iV*)>1W^D!U6lqHt~X(`sc# zbF41W3XeA8GK2g0uIfo`WSw zdpz(cTRX-_KVAIe(0}|m%!W6Yc!Uox;ere7?kIpp>&h4=XBo~%0 z7y|CV=AP=03m7|_SO7mXWrpEmqw=b_-$SIea0cW3v}0dk?C>`yM2I)N9$ssd=dvb*CXu3@IEvv)+Qi^%=PPtAeqPolx#=)|Ywq zj{of$MNzK^lB)tU0KjS_*c(%1g7^I#y;Lsc^2<>&P3BY1gI)g20hjf>!7uaq;7J-0 z$gT&^&98(L%a7zUN?&*$3wZW3e|Djuml2q6DtEf4mj8v++V156`XKzk{Zkzt8eo_TPfWXsZSoY=LtGCi>H3__ecs1k59i4Bg$Mns{&rEJz zMz`ipX2pV}v}1*w*FhEdb(XT`Sj2wB@~yl0ZdOx#;w?gAJJUAy@pJfOCa&^t_H0FV zJEflb+5wf6$bM>b&@FVTJHemd#yeu{8MTOp`8kqz_})e7oTZXFF?Nzx0y%qm-dqC& zzWW{Uc6Ny0rN;cbED;q>wO3`}4$kv72xKb=#7Hvpu-2St+ttnH*OZ|ff{=ub=C~{S zCk{^%J$m^)$y4*Ri~B`2EyjZ?11$kVNm(+9=fZl-J*aU}}J4lHGVGFv7|L;p? z-#;Ud9zTqG+fMCCFUHyiilRDFAbJjiG*`lRC5xRPhkALjN!9IVKrd+ZM>bOMk>JpE z*G!rjoSzw?l(j-XkY}?pez*|M>~8e)9tX7cweXWHo+HN8dELRz$Yyq^+jhV@bEU-f zOd>N~?7T&Ho&oCJj)Ig=Deph46NgbqFiMYjZr)udzCS!+(MsU-Q!sU)gZaakr4K>? 
z!@XJc!=k)B7uvvrD`vxiB1M|`*O8w-$H}L?Sm-15l8FMpV@sD`9xe;kx=-m}#<~nP z`dVhIUX+_XMu>36tgUaOqv(jVkbO0oW_l!bOoP-`h?a~{8S92kNgtP4=h2Oj7*#3N z)v87D=7K@D;4dRaJR*&k*l(utkNo9<;MyhlI{L*b>DF&eQW3i8m6qsQp2ewaowCYv zQ8Ne0dTx%P(W7)X;AYtC^B)^Od~NW{WgVjNZd6a?M4ItVq{G1{GpN@i$}>!8mbkz} zE7U+SrilhunSBR8GWO~&;ie{W^OkAULBIVGoW~%%OI!@b3f!f30AZ)i&pF|4Uhwr z;9zdr6?)$5grL2^Q9zKup^cn|6;jwMJ_+>*6~wS{qV(2FH=sNB8j#w|9u`iW>}ff2 zOEt6sWzbB(r{ncAn4T7aYV|zb~c~XeE{cWRw+MWedtr+2Ph; zSxlRFvq^!Ly(v=D`?HS)A4kOW**_(XT^(XwYuR91UY1Sd1Ofml70r#Gm9B1l0N2T3 z4Wu0KZ*wLv@&WB@FGLOv`cdc$;M%!T0;Oz*l4|Mr(Cwq2$R7Z-66tFrFHplv{@OoN zC`K9?=S781jP}F&Rt%Pk?(i;ztTWtR+VZQC?7rgi*Os?@-u&Er=XZM13**YBKLT)V zIcNVT&TCHgzL9iG{QFGYtV+)rclW3i1#^nZbB5(ms#oH6x01FI7b6QL@+|!>nZBZ` zL*N@6eG>Wc%Vuw+9vERia*M;I-Am=VBbFHgs5y885|mjO3yK*@W)8>jmf_&Bc855t z-bE^6)^YrPh+b<1R+DOjF+tFP*h@Dv;h~hF*z;yuRqFVj9D_{WX!r-t`>XU6R=} z=y|*=7_nIyA(yRcqZKB6vep?s=+oc-QiYw|W8-aoS$JR##U8P)S?*o9T!fk)v&u7zx)WNykq+;2C(s z5q4VLot#X1K~RQWC#$KSq*%MP2dQVbNo?VCcjXgC>3KkHO?2L-NnC^?Uv&EbwAKuV zNW+>pv;P=<$&q>nmstDODS9){mYDP5{!Nz=;m+i3CPUem4@_Y$PlC=;W_6W?i)_|9 z%7$c9-*-!zzwR;a2`3KD!7;|-fI?iLuf1#f9Tkd-E7S-rE24`&VZ~^qKA>?7p3)Rw zt*i|vrzD%AocB0Vjrrn3o`370CPS4g$hey3fCoU9yzDF=Y4UQ|M&i3iGZZfj>GDy6wW9!o0-cNgYH$an&36&;F|nucT%3zwF6Te zpwL#=I;sjIrT+bR?$;R0@1o=ccQv281anFFhbnJT7R|k^AC@n^<l+_!{6$ti&Y~4$60ozgDZ+XQ>%!o8r zVfE0!C{K3u0^S!PB6XZZH>8=-@;2)!_vIgDwu%HW7NT=_+9hDzU}$bR0nVNNmo8>2 z!+b*QoWo=dJJnckwv91?lR377Gx_;k2wN2UWVIoT*A=)5<0< zVrES}=k7V9Y|JPT$lQkr#7;R>jTOD}&qH@*Vm;R9zO%wo_{B%>d?4nITGGXe-T0Te z4&I)gQfRiN8@V@~Ttpm^%ogmdzLHsJUo*ZnOa_D$1ssnJ+z9>_s1U zMLC$-wB-o1JTElf0T8Fr!EJ_9Cu>DfFQ15-fiwBV0rafTrzd(8}zB3jH=-BgP(vd*ds_nM1YB za~cTa0}th#laE`+RF@uZHepe-*ZmQqyQ#JUDKRR6$b7$fpptavja0a!dt4e+jwi!N zhCqaf16Sk0Mx0u6#{Wq8fHuL}klek`@% zh$v{mi`LUDVkBgE#nM{I-f^x+X!YSa)lGa9_TdpP~m{CbJh64H)I$DeBfv4yt1gBSWOX`y|p zA_lHgYFMiygXgv@xFc=tH8dV_rVE?LnPSu;0PbGou71RKsSLdeB}j>$T@NviA1AYe zcEpx|i_+bcA9raofaTqTDEU;>dxB2ik=0klamkx4^I^GhWDKsPG>Dd&Gpll}gZ(z3 
zb{_4xP0Lj3di6Sm>^RCgkU2^0^xRTj>HCO*_{AFGQ?sUdJgZWgXno(rZ$!b|ki6O^ znB~$4GXjvWAkL!wEEj;Mb@Zz{jABY{dp2pjgTMrC9pwC!MNH4Nf%*CoL1nH!69H1y z@Tw}a;#1m4sn`>Dg#V0R-dh!<`3%w<)kkiv5g@!lHf^~^h&_Y~193`hPD}Sa2#r)1 zgQ*wjpMP6MwS^KPnC9*HgNyiU8x>$VloLL3IeWasM(xchgS&Z01iTwa_YEBwy#(cC zB~W()78MTe40= zW1hF?lNmr>bC=Q+Eo{4XWQDQFWI&hAOJ)=VK9TRQpBvdp8%y-0OY;`9N2#{Rh$PxdGg$PsZI;CgwBDQ&G+mx}<5chdCGp48GL-=YxgHm`pS}b#-w(w#FeI z(Y1wo9Nd^^J^)PbrPrxhvOb_sQR7pJk3_SQL#o$7@5}SgPr*2gQs-@)IK#sB_9Wz( zAylNzOpHNhA5`XgwGQ%FCZP;vgP`>wdh07>D6UBHkqgF#pxOs4Z$oKX>+20raciZi z;9;FvH$G3u6&|r#-x_U;0}B#3op~0Ty*Q9|>ljeWMxnxIWQA^;?|y$I28i+ zznn(?*kXp);skPTcSf!x)iOg9UZU{42utm}e0HjAaniTyc>9jybcut&$RZ*C5N+Z< zg@JVnUzJrE<5iFL$JQG4=&*1Hj&<`X(!8&!Q}^iCF|iUHj*MBCVYj)x;AgF7S3fNX z9bv#u9mOowy=;WJU7|(sCX)7N+z-YgFWl*O9x#vs6y*g2NDW18r8yXA|*!upBS zgcV&|KYgDfxV0SEH<4^jy@TKu$7>ZXE~1+wR>kv0z^tHIrt83VfiPb#ENlg}mJ>g= zWW1jf8SeT`%WQLYiCBsF{aaT4`4?lo7YYZ*r>owc3~o)bftp7AD{AWUHsYUQwh&>W0znn_l8#gVvOdw%FgvEwW4!p zcOc*kIw(B+ptu-DDiqMVV+o044SNuKU+iZcrCR7ztLF(q(IaKX5&z#=yhg?B$4OiD z$wH(2Aut9JU+O9E949^g-y%J+JY-~BN7q{Xz4G!1579nD%$iR~P?<`D>)i~F!*=Bf zgg&lYg10MB=C!UyHi9sg^r|>4 z-ipIwF?p-U=}J0#-MYlEJoLZfKU{bA0vxke4$YOf|i}}wPyqp*ps}& z&yX6f{I}4*j}~FrNjYN)AssWdO|d z_loBMjy^w{1uxsKSWVJGX+Xb?MD{%&Q8+yXs_=)c#QeA7kT%D3rx`I#GuX!U>rQp% z1JqZ8b?|%g{?OND!ks*Rr_NcJAdLOFs}XXR>mU%|RT^^3F_eGv&2v2Hy8Q1r8}uan z!y~d;o`D-q`D;Tc@HJZT9zBxjIj-Z^W6^aT@ohZ}p1|&YBp&f03*?`>`bT|w0W$@+ z`0H;U5p#A)w__B-B5t)MWycH>LDD)}SG=*z2fIRT@BV`J{+GgE%JCzIY$6VJ$qkcq z(x5&zDyxbFY|>HfG212F`!Vke8>LJpT?V_p7gVRPCUh~&K&%**z^M|cKt|g+)~9p< zoZb_KdEVcl0U|eWB8GV)an-`(I$eX9aC=x4w_$d^8hq30H|-(bp@n>8|KSyO{H=6! 
zHn6pP5blZPl{PS$3iKQ3=Tu^!2cjQalDWBP%9N%Z$nmMby$tU2`^EwuqA~csO*upl z2w_IUdnp#K3;OJIcb`9kL)SG!0!R1?o@3fCxT6;!Ml0nzq**AY@9!aUhb8-3Kx z;u+n%TIfXXv4>mr9SL^OLgZb7CQR?n?0`2#YI{&QDIgopRa|taux(S<4DYvg=y`;w zb;bSYe|vJno=SzikPVcRJyYoi(CoMW7^B_FTC3YI`_qSR)8|Tvb5mgbQ9X+vRw(*{7 zgiLU>_XsbNTC`taFOY--8d)b`65W@dc7d>|fx-SalCJP^;jjC95mNd5V7$-zS~>*u zkA@u?@@v3`$th$WQFSh_u$Iz&CQFon6=V8^{YW=7nKWKm*mj{Y$%4{KkHgSnL_AiB z!GMR;4_fvXR;wRXtInfrRUlS`mcC~)BO}nTS9#MOgOfmM9Ju|U9@2wI^GQT$%gFQO zV_J?lO;hi%iRA$GP>52uuzR}`X=4Rbc&$grr-xP&7jQw8@z1LWrL{o-O&FCuGjeSp z#9sB7&7-(rdrAxS-d#2)+u@vcd>As;His(nN1NeV274R0t^a1XM24^UPK_f_X)bej z{G94V2AFMF(7_r%sFP~y#F56kBh_Ehq9=Q0sB6rGp-!mhrcs3uz zf(Acny4|x$A+7+lr*9yQNS4HH^QAQnUKk8RLhrHuBguK*)f6ZM9cu3#1zG24Wi zv4vl+y5sX-vXwARcXYR^E2oo()ie1d!FpQJT6;u-{`Bo((7?Rhi*eSdpOLuf$?GGE zs1;)2kh}=>Dj5z83NK|sPt;)S|L~s4jGe%5b>N|JKQr1FM;JS%3N<7( zKpZgX+_J|()=;Eqrg%tdID^}`8If~zKp_e;FBQ;FsDP8n4-1A?^G}_Tdg4d=QYcrb|M)B_*A8HkT@Px+c8BTdBazPP z%y4;_H(7&TmVC0-VCEH^x=|u50)NXDLB!__Oc1hOS?)gl%DGUb>UwCry1b^*JY)J)nAFuu+&Go{!_8);td@=ZHu+)=y7854dztq zrLhBA>vIM*4;y*2s&{5`zzo^%6L_jj%kk8cIX!VrElp0SbTBR8ZZ1BJp$J(VTAxX- zcA3J@hzBbxzH9*!F5W7eL}Bota7C?pZWZ}V3v9jCEnHyRL~UIcIsUgy76L28@?rEAgI$eFw35^E7=S$k63s)4kaybVFfs~vitL-dtN%J^!8yO4Zrlk3A} zc&UM_hdjrXag0$Sf~jR#jq;h>mnFLxIzmB$0RP-THcr-TVTwY6T^%A$uIAx~=q-Pw zR2G7jL5y1qC-GRRqV`-;s?Y#J z++_{k%`~VNsuT!XX?c?bfpL408yzp&6u+VYJC#4|DO7)1PfZc53h`dHhDhXi$D zh98yIAyimJmaH0ZG=yDHb`d61T%)be6azwr|NP@T6*RbtvMo_v^>7@AfpYE@SH7s9 zn)-G0&js}#asN0*zhoXD=d=oVmnJ$4FbHt_5H@kZ z9AO~q-^oJsB-hzY;80Avevs0|w}odZZ;vTX*V1Qn^Qqy*MWyoj1PfV%0EW7Bo$BO~ z&`>`BJQgM((u8QPcjZ0G-f~aB3-4rp4ab+&!DFvnVjNWp6n=#EUP_tp8a!vp*&=#rc#IMj80ln zjY|b9|B;9uXy!jClaLdfd33v@Wj<>;621ckV*sfu(Y_pPHh+|Q7`b)W`jABS9DEYY z2L>o@Tdjdkz5d7& zEYwM$H@h473s{~V&DS!@450V)ikYT2nAv=itA2Qc zv^@H8^1u|aX-srLRSl^`F7J{{qyGGGs)i0RLqT6D#GX)4QQ%j3L@hL4N6C8E!u+T6 zbH}b+I@&Ia9!)RlNR_*48!ax|fgKRu1>{K)X72rbFf8)(ruME>0OUES0N-@mk$ zL+#;@iJnn05ZhO{X0OpP&%LRtyDGR<_(Tf7n%BdY>rln1SO?#)%}juodNkoiLlAz; 
zvb}`0EEv-hrCIhU>m($KizlvfW5zB>c%4R&ABKAbw6<05ibV7_b8DRH>EZb_j$6j#<5ZY5>Op{ z``t3eBd>z`@_U9WR{-vyGdo&oRAeHT8p*)>Nwt6d#KXM(O)R&<2M`G4ihLvJpsz6X zg!mF0WD5_h_L~h!A(>XkhiDMEf07bjdq3K!2zQyDwc`8Xg>wNVbJD;(Y#vKC@Q}rb zpuGVB`X#TC7swf=sov6HcBL{{ zQ!lSn!r2rkJZh?%<_^Gy)A>LLMUn)zHckYQw%W!k^NUoGTP&Fnueo%hJj$?4NilSiTzuT>lwSJhr2BSe1DoLA>2AHI%xheNGw|97yb z!NeO9*VOY{jl-F&R>zbv1wC@ry_i`%lDx=L^!>8&>c!0y#)bTVcLWaY`yLd!=mYLA zN%Z2Ql1*?TKJJ8=h%wmy3qSvp79z86e5-P{5)I+nJNdJJ&dFU$)Huh71Md3JgQl6a z5M1u)SxHmj3vH*KlF{QcE8wKQLGdyB6(xn#sV^+ui;M`ho!pev;Y{u4y!GA*2iptD z6I)z%CO*qPmVeJ@YIBl`FF1F0>YW)${0$us6600U7S>Y2vJfB!lymRRm9Aw>Rs|HOX3I!ene+5FDZh@XQEGs_-vi zl`qzsM{~k1cNu_a5zQ?ioqyBsp~KDN6^?n9JHS!=#0FfFo#Qn1du=6znk5tg#~-CE z?936ZKEu21ZkhXLa7Qcfo?yY*qHS*HdCqSok!(8^SI&KVE&6I7_f;MHu7#)0Y>rc4 zIXQw=(Xax|2Ju!uqiD;q;?|m{yiJuf0re+u@?XW!%bEYS!mUJ2A8hCAem+vq{Y-oi znxe2q5d{5TqJgv6QZ*5RDKv_Gxb|nlg8p&Y7$5!6Zp5TD3t1D-^Cb2!WLlbxgQiYD zp<*5pj8e3f4q> z8`xt#zs(4>f|skvQZ=HKG&P@#aoz(UG?NCivq0aBVQt4nwP+)|9d5#928GnR*_oq^ zz(7))Z$tLegHtep0fRoAz&9!$Nuw2#guJjb{<}KUg>wjHgi zo9ghPc4UTxkkw(qAt!J`1|t5Bb^v-M9HtB<*oQGzys#tEZ5Kw;+WHL*IPW>Z)aaA7H=UE62kh zLRWv*LNY*r34&n#fkC?yW>Ex{a6kYRoq6yaf5EQbR1>X|d-)^tvI4BaY)P{nxq}L5 z-n9hr@=qROLR;g1(@v^1^Gk+h*i+yi0$Ul|WtzY_#qf>>79wx~lR*I3wzG*4c%Uy0 z9RYADc9lW2<=&MdK;97WYr1wvzf>9e%KZbc$(FA&)DIFefGO6QV_sq@cOTNc`t2CB zr)WKN!pt46Z}{eH`kI@e3Vjf{<;V4-;`=OONXy73H83dSU;Uq zvZ=|DeSS(-eZnj5X$9L@iXyvB$K*eHevDlgajJ1AOlnX?2d0v&jB%dm30BjJ4UK)6 zOElw=vTWWkjvZT(bAd>E&@Q^{2=e zUV{vGBY~r>THK>T+=;*`VT&0w8ejDc;cl1C1_Rg1-w$-dA5-Bf=yX3MtnIBJdSPJ7 zdjCjR0T@|?L;@mqjPD67Ay!rk#Pno}d+hd5PH|61SeTQ+R8OKU2{X`ip0nwPsY0H^pkDUQ=xB$6;*Oh`N8aQDp{;vwLWC zzE-Q4XbrVc;(rX6!NO6$X1m=P6*J%zOjFi7cxSu{nS99%;C4pop*H{U09M!?J0 z8F^)P&M%1U-E}SDnp36tchx0w6H0u|PSJ0(W8Rdg0u{8W3Xasvit<~i&$}*p8Iq^C zXpcg18e~@H4IC^;T-J$s$W?8jW#j7bh$gVqs!0snX!W)LGeFG0{l=>yXE!aHz}nU| zRoSivVNm~eYZpCWIL(*WW&PkQ7uL9+@Esn#6o$#o5#4}M`?ExZ(Mh1tkCRbJG+k)} zr_A%XD0!)rZjVF%b%LK))oB+*lidbsx7P|Y#oxsx_xaPM{B^D0RYMX9Si4O63=M|O 
z4r?z6%u^Kc&%zKnr;7Wj?0dF+r_1r?q}H$4FOY=&KA0)NNXSD*gc%YGB?f zw&WTMQ9v=A*m=nu%Z;&IB_%L~{#U|p(Hm?)hp|B$E% z7!*DgKZ2;`&G^0Mc5f)CBw~qO zh55ZOlXO$Mq=BvD$+HvX>jXoZ&&Y$uc{GG4bBVIs@*O)*)2ioZv<12KWhpVTk+Kdi zbijZ`+MfVgT7GH6USiyeUV@%N#RAu%TyJA^D5k7-urilOeyRGx#7&j`QMLvQWkWbX zscDpfWb&?Rj8Nk;v(gvYm2+tJMY&d$7ZmMbwJlLHU2Ha?6%v*M2m zL{+VMAx5PQ9>9(xAIl>)A)xuapE5H2)oQ4&LIgu<(uz^$P&=AEvFZfts}Ha_{iJDM!gXx9cz_Y|2Cl zapKnB!M_S!UR--&Ic?(jgI!c7b>QXXes31;Mm1EPY|)vKhy+~Xt6-OFHluITUMC-! zE?IQ(jw$1!>GpC$E!(^6Z$E)VBU&Q27;|A047T@28#8ADPxewTq7_kHaAoipQ()av z3%{0;iPq}y$>8x?r-IqY9_pg;6GGu5;-ebk?pX=$v3HDpoB*)V4+cU2nXC!it>pqQ ztOqY;47(E=1N6`C%zO6sA%_5P+oWL?T#kktPSJK&EB zYJn`fNnlMmN9aD^aQuy80|Wn`^V*=1G7i54#YQpoHExk~E-L3c*uPC!?UWjQpxZym zZm2HN`b|(j>3Bm6=4SDoFwOF4$><0G*OGVtgpSk0A*5BgoQ)CeUmHOXG)z>gK zo!kj*c&%e@Dw@c5ZB`xeXxDXoFbc8s1fTVZe`m+XsGwlOd)F?#FLV$u=D>6H9J?x5 zAWAEPz*KaNK0Z;{4J9g19Trf-v09Sogb0x?t0wo%Knmg#ZheLDW?M?qhZT&>T&9vA z;}>$huCEMYnaotMFbfjclq6AU6AfP8ryUGHc9Uq8c@ABZp<1FA*LS~yV}2HL7ZJP`(4W7 z*i~Ajw`wA7Uhh?n;eUp#^$9JBh^FXllp52Rs1OS^>Wg~b+lKeeKbA|tny(C{xArvb z?|lQ?Eh+>agpz9e9VWC&Qt$Kqy9E&&X(=GqDn64Aez(MX3avBb6=;WQFZKsb8f4_v z=-l)3b;&L?*#q%!Dz67 zti83{m%oW8Emls23aTp5kZBf@C`Uf6*vxRqy?=UUYE{CEMRlRfW z;y35K2#V@!0(2?t5~#U&&Nq#NN)m?X(K3a}?Df$$??#xRzy#QQz%Pcx8s}Ej%=Ze( zuUpj;d7_LLm65iVY^s;O9;p$EHiim8jo3Mk?_I)e=dbcDa)9*Le$G3zuBft9juU3M zU|dvfD~Oa^*@#8X-x=PS%b_p)`;D;WmuH2-t}Xlb3hS+Unguat)9X6L1v9+yuJWuVez5}8p56!x+f{A<@x$d8!v~nlz5%dzR!PESYT&fPG#@-_n)r$$@&yI=QRE$A6Ic!=-HPZ}Oi583@3os+6cf_Twe z2}UPVW)YZ3&ai}I>C4Lh$m*N&~obi#bfmVfyqrDjfgLSFkX?(}Ck|IE;Zau;C&nh~)% zv-oC;v=wZh^d!ad*-q!Sr-cKN`njr>;k#0y^XLj1()3UB?yD|9nZv;g18WozQh_Ax z2knb*nX?%F|J!)b)wL7^(w$3-#y|hJuwCm(I{P7FDov`WM1Tn(F==ulj)d?l3k&%o zl_-kuzpOVjjqw(NPJ`>xE${$nqTXm{go{`1fSCW|4|JBX2OgqzI7RKfWd8s(Ju}*r zgTrZG2^wE8LDj$k$np@BPStxlVeO12X}pA~;8jJ(pF_H$YSN>XX75vyHxKRfnhfUD zK1d$<`L^Xz-d?TEhnS?&_3T%RM5m*xY{8$u_2k;G&L;^`vvLhdOu~nxn`BBwE)1rL ztR;J?jC8#Zv6lu@et`KyYPo?N5#lE|)esY@EOfolY1);M3rup_sg*(PPq*~lAGlEj5&@A=vE8C+(3KUzzweQ 
z0v=G?!+c$88auX_w|3Nm&^=^yKcVva=DAb6iu{XM;m)wcjvLY01d)hFTc%Me&nWKw zEOUy34thS=1h~JYOcae|JI`Kk_UDF3L|Slfmo33dj&w(7ss;A8QO|f4dkP_9-;dn< zx3qUlE7iFq*abn}=BIcA$N(R5$NEoOW^d}mtUC;{_Gb4x9Z%l2rJNj%8it$%&UvF17sP@?k9 zpNv2OPv@_-0G+q{B}&51<$x%36dXcyQMEyteS%`NKkt=Hzn`TsP{Qj8_5Q2pB`WASD+euPKc2S$(wS zZ87g^QV~NGLBxFdtC*~)D*`~E2ISbKM3mF$>%GKYXPp*ygv;N(nag0Om9fEKrMyjd zCQUem)hm`q-YaZM^;=@NX}pu~(*&AMsozkT#yfOmO1Ptwp%Xm_VA z2M7@A{t-v0$jse+$WxX4b*}jDB}Sv`;*VJ+_;MbJ^>L@@iZ}o~4lB;p2TZO$3i(K( zTGbGdH(@LXQqZiu2eg?~39mz3aU8e~HQGSXT-;-U1v%hGkQvJGxY08r=kri6M5osy z?=n8s<&#fm(>)o8`oW)odkIUoe@{s3Kto9^;KZ9+!6kKN?6-{)okyb8_2oy>>{)Ow z_UC4cGe$vk1KF0k%pI%mklCbU$WbgOCt}xz?MMaCgE=`ZF4SIoOG;Fwg&}(j9p#LV zpV}u>U(oFAI)*UjBs1DMMDEbPCTgK1qe(VW-j6;HAS4e9SG7E+dBYaqf>^o-*8p) zs{(q9vu(Ixu0b?6W7V~c_eupa_IWK(mM5Vo*!&9k@!hwPZG?nPE&1KgTayjx;vzsq2}c}u0ai6N6|SHc5gQ7W*v$j2m#?3%`# zbCv~vX?SF)I?-*}D5^>>JA8#xF&4tII078~aZebKBNTa-{S-$oCYc7hIVijlsnrd8%<|xbA2{+i9^oZ1UF!Lku~L*#hL+L`K90CFCZvihpTj4KV&H zmr$^&A{cqzf*aceK^mr!`AAEWc=}Ac#h+c}Q}^MBl0#=wAK8E-;kn73h_-tKKbtFV zl0^hfjrNzml<`B9aEa&DuP>VDEJTo@CI4 zMsx&Xe5)bukwg^Q!rRMEvn|mi5`LyepiYMXa|r)eH%o-F>Q6bvaJ$A)kh=oeOQ6D^ z5CSY+t1~OLR!>;-Gpl}qj}3R#Ae9qC=d8mswLb}{gYl$y@(IxEPL5?T-H0`2__y}-@~_Y8Cw`yyug7}IA_~^` z`mm7la9ix8UOXv5A1K0>m0hoy(hVt+49H_on!61Y{$)M1t=a4h(&&| z5e%)>;nRU@5#yj#z~|i0_*T|=C-JK*-^t(hGv$;hRe09IeiZB| zn%X%2*LV|u*(n6nF?p!frFlUg@Ni#WDUkx0bu{(L{FKro7>9w)iVf>q?1SXE?ystK zpE$jW>>al9Q_ApLS9*cMJjBI0VIY-#li)~}!`tD`nigY0>a$LO33~1NeyYW{iPY2q8tOP2&$#-Uu#PX-A-~x53s$W`qb@PKp`V&1H ztqpfbFS9Pwc7Y#|rsl8xJw`+{qT|tfqBH)57el$&vLND?N^EeoFS$S?Q-h{o&!36| zT~Ca?&aap2pEaBa>)0EUGdrLVm#YMhe9qi>hk!F7;xd!_spO5&S^Nz&ZXM@#B@h*G zdbo6!WbJ3Z0AuRMb}ZOGt;x`MZ-18vQuFg6$ja{kW#onq-`(IN2gX9)jg{iH zSF}xfeq=Dd67M1n9ay)SK?GipUrJQQSBnayUS!CB}cAXyCrE0y_!W zkr9P;Q@ad=hVPiRfDEb(lr#W}QCm#Qj2%}z;g&n(@bSk}y9d(^jkF7VHth5Ev}?M+ zWLl^okGMvJsZs1>8x#RTa!Gs#E#96fjw|>=Q=*s>!0xW73p)4yaq^X5eCpa!ZzfLL zD*LK5I%6Mqc>s8Hth~ZZfHaol;v;}($q}qiSS*O=gp7akpN=Hkq+bt3(J(%!@qMM& 
zR@H89MB{n?4us#yK#L(ye7J_WkT14x-(y31V}I~APNuiZA{7EWQ4APdh@HXU5)Q5& zQc1?;?gDq6j%AoOv9o zdHw{IY5y}l>iFw#*f%}FSF_=(c=FMi(nv#Nu))^F1!foBEWhu2@4xE`*-G|Az0}t6VDtx`6rXCzXj~6%dd++`A?Tnf@g|g zSy5l*r;vKmr}pN`8|-s^E|payEr+=9Pa=kot=nUi;C$1#9o-m}?4Qql2#6C}*D#_5 zjl&|BfW}s)eB%8RFdWeZ4wH)Gc%9`XhY%fMbt}T-xxXx!x>qp?e8MRRRQwm0K6zBh z;*rn~)c>MRyL-|ncmB!Zo4PGs1Q#p4-ZWnMVoUHgelLMnD~tY54P#hhx133%w)2we zY%)wG5e?lD-V=Ftu{2G*ANctVo^{^jSI>(3GF2>e_95&md+Dm6E z6x7bO+9-Q#xx8F|V~gswy+^3^GA*9)C0dnDY5_@DmUnXEkE!gWW*ds-c=3%A2oYQ_TdLmqMsdlpmJ3<#)d?CoRvKersxgGrxJyBM07~;4uz=$57$|6 zF{)ftrI+u6^s!%a&F6ltGy+Ca_W0KfIN^%AtMRoV;>?_3z}xS$p1>{6UAQISKkj(G zC)C>7w^KEXBs@MhMcPLT&uw<$(=INLQ3@M|CT1ojL%!e?nR*I%y;;NdrZ0Vj4uppf z&b&KZFW4qZh~joF<+Z%`%U8g$n&pP?Q%heE^LBMZsbKWKr&w*MNpA=!t)6!Q8Cka; zFn0SU>ojGEXxY7{4ZX0HUAMx7n@?9^=hFern2zbd;5l57{t?xZZ_t~~UK}#<`@$0|6(iPQ5 znwUG`)ZK+573~y-r#J!lev?Y-^fBa(Gf--S|xk7_$3(frO zY*9g(yzR}i?Ey(*C}TQDn`c(6&^!h8FVQn6qEsu_1n0k}fRKMuS*18+({ zg0q3Kj;1WoTM}+WlwnOCFylxx`3(`t?HR7X$*}w{D8*S=_2wE&eUWk)R^_|6v9~dFVF+4f@$SkHcp#J0Ej()Zt(k{eSJmObsGWuk<4!C z-}2$*bqUcYcKxuF5UAC%MoQqKIW;R|rb#H<#&Zpwgr+4^(iudOZ+j%?O}x02K_L#4 zY}bOyC5wouq^SK1ey|e#9m=hqhS;Y(zHgc@Wd03WZhf^xPsqe3=fzusDf3x;?uiFR zWq*gHln7kwH*5CZs1Ki==KmFzlQ65<{vx4Nl!f@F^hIQ)q#i@-YKfC^ih@o`WEJAko*8%x3$J$9mTw&EWA&dekJ(1lnZb&SD zMli_EF-YMR@r#1B_L(#g2+u~86rOLGEaKiztKP75Zb*b=>rDaMcFj{l(?2mg-e{R+ z%p_3qZ-PoDkO#^IO-c7B*Hkng)?&@bx=g~nDrgxb93r#&2ILCk`z(~+z2Inz%UP!2 z!P#qRw?tm-3Kr!coiF*{%|9i|iDo#&;VBZqvr1iV4c5a6FAKmo_jx|z)QEo8?|4OS zEx)qB33px-U7U>8m_;ZpbA8g8=ewuB0AP#i}{0Nn*m`Hu}_T< z1o=MW%lpWFoS3hutyLJr5CH59+CrPpXt=biVRc%0D3W&GMC8Ce3dt7q)R-6M{>?e| z{)GJf=|aA)D0*2Ns;^_>XRRST=^SQ%a~zgw`Z-XE;Vh1Y*HT_H{2R@v=5W2bM4XG)?>4km-5 zUXksRA6Z{qhkUC-(Gu2K`1grWA}+8;5Wxiy#x|2l>x&aHXvXZ4%^Pu>4v6kV`q$t` zoa4{(Fdg|$<}=PtTu5Eh0T~Eo8TSzrXH}8nD*+c=qy{Sda7KGmx3NkhLAWY5!yH18 zy?K!A+%NRU_~su*?IMwHR*tw`K^Xh(Aa+q(c{)FG?Rc%FxL%BWu{$v>Vwv+aKHMor z=MF5=oZVd&uF~+s8j~)5>o(r!p%u0h-8v7B*Kc0u0|?H3ED%190Gw8o`ZCz3^v1zx z`P(5uiCMTNKyWrq$k3Lq(G~;uw*#BEy5Ahe=46>+ZD!uLMmOwAm3Qr!xWpP{#_O)~ 
zIS{~1%Yo5eu)#}g4RM>%VH2G@i07%bpbX-Y83w*5z!Q+u2`^7#r|eME#K~{m@t{~P za}=G~!^NHOijMalHTc8`#VD22SVknLEo&GDoHYhY?XKi({|#L@@lPdwc&Y;R#OHYSHfy5a#?Cq&7-g?oQAi4c;wb3#x3#3bG(ya3%~k@h(#?C74$|B^ zd)}fXp)PS0DcYWe7{(^jDv8*VW>J+a$H}r0NuU*3_8V)<9t31Qj)m!Y|;k zeFm14G5g?I5#9)@IBg8t%1fjimiFXiK^sW&@Z)YLYr2e)7 zUKY8wl0!MhUC5mO50r;+Uwc^p%?Q?X%_q^JqhTBJ!2gP0K8f-&$=G3)XTL$8R4qjn z7N;Kco}&M5x_DFe6ru-!lJ#XeAJGVo3p3}B7+WHdO|ontcQFM*z1jdF8;U7qVEMV; zCH3^f^rj+6R=aj8KWn(sC#?@@u!I6KdtHf+DK8?*0SPn@X4h$L%CYL#JtP1+ZD@c0 z{5mTCEhtQfKCM7^h@5DzsqM8612mk@Cvn7eBu8B(O`t`gRErWquD4r)K&ix-0eytS z!8yD_A2kRn_B~ao*t5%8)k4ULc_dY_fD8_)ysEovmK+2W_gZlGN45N1BT08P0EAy8 zr3PQo#e=(YT~>6B2HzI*^D9TmI?&UmVU#LZ3&!r^xs);T$c{@f2>*4WWsuq!3V69X zApJ|{r5Vic`kOG>F^#TOn3wlrh47!B0~+ES84{7T@P=_`=CDNEbps^fJywoVe_3Uj zZQM*+@-`BFAnsA{FE<~VJqZ+E#%R0h2&if;iyrG<-xP@se|^%L%14G7$Q=;iG^V7{ z$7}hxnXc&0i&j^Rj0gO4M@CWGLSICk&yt&(c=ylQ{$45xjDpXNH{rtjhpWR*-2bOp z3lu2JVkd@vh2NQkd7<79*oNb;)$?o1=)jXERQY>RUYD~*6gWB_heDOdod?#hcE!o$ zkUKR={x@Kn_p^ zx&fy#Och^04wM+$DiOo7^A%=BU;7_E1S7hfyew(ANyd$|i~97xj_ZuGRm3^zw}1wt zGpflAhM;xUcYS?hX@J`VN6O4#elFz)+NLtAJBv?zjaLR407~m4U;*Wp$;9O2K1UT& zxP2$k5cucXZI}J}C@OgDjlI;P*00U6z=wXO?nham&H2!9;G3+VMY8w~fy&sSn47M+ zs6`{;^0#oE+Rq{KPuaH94n`I^TmbJ1Cl=(0p3|DIOk9}K{&Q_)Uhk54aGbVzi&s-*(J}_>x>D#R97V!_Z2e@Z!IC*B<6q6Qgc|4WvGd_{#u@42Ux}( zo2a4r_4ZF2DXw^1L8=8p-V#~--tQU%!$R=-$&!?`gZn+=gx@q-3f@|rK;5*#i*>}& zxN8+2X>rNYpI^8nxAF*&#LACBhaW&8SJahhzkJtq5J4N&%HZr4#_&I`6@WBfi3#C& zn^sk_H(0RHZfJ{bqP(x#>^P1)e1e^eMzTpo(EXS<|&{T zhDq7@rNhlhf=@au<>|nk{!aK`>6dokLkXg>^1d0yV*A8vDAN=STf(D?(ao*!Vg&(a z#UU?!)RNA^%CoY(xT!gWD%#k#1FY5D+ih;JuzeB|H2}?k_PJ1TE65AXqw^kYRSqiV zIm-6%g-0P2ni?>n23y}ktQxkunVvD# zW|p%@Oqg}D6RRhXx5YMJDNePBOV|g0GT*K}-(En%YX3@mvxi3V?U)|By#P4nNjW-u zrU=_k3H{Ik<3e-5XqEG@&H9qB5BZdzPRb<_mvWT6bJhH%!@#I9rF0%7X~X7_ZOGa@ zbVt^c5BIMgw`ys1$oP+ahzbuC7@wbIPb-jPuTuw7mBNh@JF2iWSBCY>mP!7r3F||B z8wP}H^_sMnKpwlS#5ZVRE$8u@wfWvoqrvG2`V*LNIT`2p7B3mWEME1Z>K&7<9xJdW<8~w?=Oyb5oy+GF&i6Eugg=mf;8;f!5N}5L9TW$goVk 
zwFD6Ai1bWZv5U5!8+K*|t1j>_nC$2DSd>g1*IPL)GPz-)UKTj~nlR^GgL~Q)2^dig z4jt~3?!-7u>=WZ9!5N2He-XWsiopm(XvGvfM*+8t?wnxL!#`FH#%5&6Yu+=TOByk+ zX#Rm6f9=)Xo9n<95Hzm)q$1SmXm9-|S?KcJJY;X7fSQ-Tcx*%$cW8pL4P0rG#1Q>* zK`!i1x^!_FIqRu4$%Cp`hoMm$fzuqd;q|*judq7hVDiE!$)#|6{hA4Nmm^)|r8Q-Q zHVTXg(em;DK*3l3{*YL$gztCo=$umPbpn+mwZRmWx*;f5(|&5Z z97V!*qRn=-Ixld2=E8^7=;)GCSdrTY z*>QwI+0iHt!2s1D0hOI5JUz8d$Lo*Preft=Klr;~`rKO$?ok>2w1^IwJACf^S zRceAR3Ey?v)_b^YOgb^~HSd5*RogrdlABa@X+qMDW|ch+1R?8RM-&taJ%$g*Er;uz zzN;o6t5|7YP&pPD?VeI~nmDxOV!J*XA8?tH{`L{e$xl+5$3Qf?&uy37Mcc{odcrzJ zaO#T=GlUu9(zhTbu#0GNDT$|Y(uvD`pz&+udk?dssn(qTNDSiS15BQqV^oVZojG*uvs9n_QM;XV+L6v{I`3e%7 zHlND9T&VI)a*Q^rH5@M#-Pwf8ik2797IGjhwnCszlXDO6-6Z#7yZ$@AJ!MRUD+2$p{KlLE$ zk#ppaO0hLUKaDC4kh$Ux@1haMP4&I#%DEqNW+&!i|LO)M$wd39NzN@(dNraSZHVBe zNRAr%cLu7Qp!Y?(;x~X;XxDRZAIB8U+P7)*-$Hjv?OUukERB_*Cs2Oto%B)ODh*I#CmBS33iZ>u=m`?q8(M}a{5l};x1jbLDtGFY0W z8MmEk-YMBs0%x?Xffg@)a6qP*I~Q?}%&tYnKhw^x*yUN=ClGy=#4}eeO)1cVmD|Xr zv|c6}Pk(o@c85ey-UN2|9-GJp5UOt9-3!9ya0cp?QUn`P?s|6Q(~h5_p&~|=n3FDR zb`wz*!_JJvz|(79(r|5rcT=_-=*oANtKJi7@6I@@KBt!jtU)zM3P7qfC6XBZ7a7IB zy`vDSfwgl_q~}$M4z?}WiNCVclWv^4&=+g3wR_dxKK}Z6#d2pplNJ+4)&@rvVlEacKxp>MKuA9n z0j0(O3C7+vjyqx{-9}6nX!Zqdoj#|uJ;4zwjt6_Bc^yCv=iSr?G*+mcnt6|y)ZaUx zk;ZT$#aDU;9yIV_`KT_i=4t-nUJ}rWqKIfbkK)Xh=}~+hjG~WJyZiKE%S|v$Cw`>0T||% zMGp*vcA+jYaYAZ0YbsL9fmUC#mx=nkV1o8DJgyqdbqr94c#+PAh^f#Se zqOqgld%K^cePw||7S>~U;4Szm%Dnj!5{t+Bs-(nA^P-kHPWVkTQQ~h&n>)m#b^zW0 z`)7b(M9ae4qHDr0DF^@%{1L%s2{1GnK1bp_5(Q}wSs5r{!VkK{KecFPk=miI>Qo+p z3_lerCIhkySTlq}#|NGt|GOUQK#p0tMx-Smt9(4YeEt6?FYaMs$p%Qjl1|z|*m?-D zlZH+^f#Kxmv|&`<#3P0-KPtoiv{b+eKd`+Kbw2Q3_Y_>RyNmQ|EqeLRcJ`2q}dfm-qg%!pIwe@?c8V4kFiUDmMcNXono*fg&GqZRmhDkb) zmn;yvg`u)fk#-hK#azXN=-hZOJ$J1b&9}_FX*sS$xnhJr^BEW&oO^BYMF&o3^k__O z#@2W=x&5XRgH3y)wcpwl2x zvh>v5`Tn3gV@(fh`uPCnOwOQB6*d7NmFLFMvk}>t_cNqzb{q^|e*^ip=vp7(B1wb? 
zx??Q!cUID?hTK*eW`L`@YRJ05S6k8qI%taWDT?lFyn)JJa$+bqBghFNfwu#r#q+RK z^)*$@{^Z;e5&#tyD>yQcf&57r$Z)!1OY6>G?kM~|O`3t=rg#~x@ueq6G5l&;EJ863gIJ|+|GlY4W2sG~p72keQCdc_b^Wh3jxBQAbXPK_Rnz)a ztgX4{GgVR*%;-nuy$M9)MHG=(xUKK)JfQmU_(BP-5!kH{anbjn;MqoTp0 z@|Fkej?M>)1bYhLUSd_bS;&(0&le z5Hs2=^QA|J2wlb@*_jmP97J`D--}w)fIF^ctUJP|~YG|P&0(8LV`-2ay%r4Xi80fr8iJi*(u&!+v7ZpZ(Dthg${+X=W1CIw~G=OP$2 z9ZzM=f`7)tWy#+mJ%b{qYZ9=QCX&jJ>{sbD%c+bDU+oZi2Yl*di$g>DjbA>3w;)bo z@sZ||*xXeDH*3HXt7wQ`Xe{QQ!)cU}zEHGcJQ*5w+z6rk*Kr(R6^xeyf%HEcIrG#~ zxbfRvxmGQ6123ipd@vh?0E?)qvC2i^!?WHBV5C>Xp3r6Wbr-I9ePZ;deEW!`M4)pz z_onWyEWjSJJ-zY2?!6sol5xc3sCPWl2X}f|H_#Uj=GgJ!l;iTIJ2-A5n@gv8M}l7M z<6DVX)kI|a6~uWmG7lifEDGP9)vC;nDYynln#MV9^g1i?b#i1k7sR)cVw7uWoNX@m zUBWM+&EdruyFaN7Pks~LM2Sv7P5yY{>&xTo5@L#S93#O1=V6f5>idqSB`cuVI7sjs z#l+NK`Mk7+qGLcZvW@LQMlz<4F=UVM5q9Hzvthfmk~K$UXo0^|N-nT|4oLViZO&(m6?p%~v5=`qd)YPOQ#F z_i8Vg!f2;qnpqS4Je-@sZ(H~?9TNWjIoNwHIo+)T>s=1uXvm-k4%p7KpI$Tqt3zJ; zojVkEmH{G-Zu{VXAAV1Q)y9b>wQF^6Lt5!;R$E#!tv`#7jE_PC(;zNnikwO0)zq6^ zzFf1hzSz4C>s^AFn=FXL-))?#(VNz^$mlpxa03&Pre<&M0E~ zHK}0avb#DIYv-nJSP?z1t#<9NJLzus%C+rkfkzb6$h3RPq908+p}44C?^wJkrz=8@ zN_52WQeq!p{V}mC`^k5#Ib^=f$T$EFyj~W8fJ%nb@jnx(T6D|D3HjH&^O~AIwDVqq zxa-T`4#YJjeT@yZKzh`y{LQap=eeG1f2)nOa;T}?9dP7Gr{|_Y)e*c3-06eqHW@5f zF>>Qg9};Q*(NerJERuUrcS^FBPNgB1u0pvR=^;E7SGv@-we}}P6kbY1mA%Bb7%J4U z6hW0OqR3#hxI~(`s#0be2%7nT&p&&9xkrKvC@D z{}RBQC#HS|rkRhY`|ED|vqCVWgj3<|_48~o`ip{0ggcB#b*jcjBhJcNm(A{fu-oFR z9BA+s3bdM1SlX-5l=b!9T49N2p8(7#KQR7kpwa`gXucP77A3urve;QgZ7n-?j7~;U? 
z!k@nr#bEkdS><<9Ul3s5O=H9`T{r?_Isq3n80VSf?9Q_Qs?|?o zH}&sW2?ltTi<{e(##OR?5EAD-hzO_EPQd`pIKTCZfh?S0W%J>Le~V;BwP&a{W|^uk z(UToxwo+Dp3*&8{q+Yi6X<~Phig3mRx0zq%8(xQ<7Lu7QzVTs1%0nBwjvlouAB@2F zR3B7uiTilll>7K}G(h2%Iukm%EeKwKYH)WvPLseegY|m#8BPi!UkiBL9{N|4tESgHX+aNuj<-y)tIaD{mVA|H ziXzT^dQM@bBk5GsRYG1@y@agop+1es&@T;o`2;7r8w<*AE^74r{=f8bVm9yWJY$r< z4l|CIsb@y6XtZ-j2m@nRkuZKgz_)8!PH>VCKa>;xsCf$vqmHLT&iPc zu3og36nUcbDQ$3aLQBP(DCCRyHw;9{rVf?Nae4cY2rNnf@_&x;=soxH|otX?y>&D!gLx5RNhKPPZ9(06+2^Y|z#`jK?T3vdybj zC%1pWPG5+)14O@x`qs6QJ`|aSky;71Rldq<^`ttC2}u`xQ8UvfP)6ZTQg8fsRMNLD z14X*Wg>src(V&b2OzzqOW&%i7-{zvYK>LX@4PA%n_*`sL*8EW#XVB!8k(l?Qv^W(2 znlBu-25QVr+Lz8U)DG59O@-+e8IApdq``VHFmaI$tr}e6NaqH4z|mgA6UZplEcc;; z3Q1H3J2SH^FPDTPRI8=t_@cjG3s&iTc?m{k&U&c+X(En}^n%sw^7iD~0P9sJ_m-#G zGM1uF0NraD;Oeu4MbPe+Vg(O~>ezwTg>T81)jo>Te)H5LJ=4n8v-=l!+PII#;Wgs( zr$m}xvpdTfiXQ+x9_H+v(8x^TSAy{$^1sq>D8}|}^fL4Ax+uMdXD$sHf5^zOXr@b( zn@HK%@VtAOM+IP!p%JGbArDMj4Q{M2q)+!fG#RU+`!lKf-6W6Khn(HzeqOBMPulWY zIcxoF$sNMd&01oH-iGc0yWf_VF(@MCueOMM#1?XfUW`Un{@8C)B!vN?B4KZd-9`s1 z8E-wMjSNltdp_n`*t_}7gIid=EjmeVAO(7jxQ4Gw#}s?otptFu3RA@AVVDIAg{VK- zJvF(q}U(7yS3&#!iUmbMe1NUOT*Cd)!ys{}mCQbnogenlbB z6-5%!NPJTiiCAI_!Rw5EfO?Wd!eXehXiX&Rz-wp{-=}a<{}dWXfP=slu#mbwuEoX3yu62A9>`hPJfE z8_6cqLzkQ@qk_*Uu0W6y`+^DAWnRM5U46DG{26TwH3 z()2VR-NvPae{C);Ehw4P)k8^eJ0!_*1g`_u20#h=LdkR~9>L__o7bA4DgY$M`jXdz zntH&Bu3x2yb z&}nMcsYlc^Y>j{ny#V#x2WI#kv@YryUoc|&KvxX)s%D5yWQL` z+YIkR40~d2wYv)r=hvt2_?%Lm{0B&r@!i8*D$Kuv57qTixxqGQDem@}4M9Rv&2z`3 zIX6G~>Y2k-x!*q^ZKv8)|F=!kSZ=6kM>WWJ>hcEoE=JdwY*@jF^_%EWcTTe|m#hWZ z4o021nM=NT^KcQ^ZcjM;T7`UFzc9 z8*vMWnnE48e_kzt9ZAJRFPA*|DV21EX$uQvf&knh6x?jvz5DRtvrgjR>XU0*p{22n zuN2`P+Df{-8=VFAn~xi3J8S~O??IY7q0iSVzn zc;HN<9bJCk1?yTh;~s6r7JfX}OBT}Vat!|Wl*S?`mh4{#6Qwl2*d-G}cF!;DXTvH>K^wA@Cdx4~3j%|IqaT$;sx zu=y^T=CJKC{1i~Scf{43anydxQwWAGJKtaS0vnr_x2$au z63dIEUhmsLCt_BF**hle;m zCLPc%2Qo!01~jAe4xm3POkw@Ew*7@z*k};L2++U|Vk~(%7kd?)HNw}d0}$MjTeH-n za4xU25D(^mL=XrWRRHMWDhnDFCW6Y!9!?W!YGVkT-=;M)5FjwKOWGO?$Mi{JY%MGNsZI#n7)wcmxBqSF< 
zgIayJ8)&Rc5uQv3gMUR_ncv4Eg|dNUAL~QdR>{d!_?!;}xCqZdvAbz5Cpv{(HQKJ( zagvrWr_IdUsUL7+S&ZZiwHzn&y zu~89r5-%Qn1!wzku*y|HB%M(~M0rBpp3zO0?S`Xa1E!{4*5;N!NcD_f0=9M7XJ?Hl z%gv#4+3jcQH)d~@CUBRV@ZmS%0QzkC;ta{whhP>+4uMfeaB5LZ{AnR;{oP?^S)hhVzs52^MXz7X6Kc zjoLJ^ajL)1(SYvKzq8& zk^5#ZmfLhnC8)_=N($Qg6PFyKbJrRez5Iky9~7Q+yQ#$k@QDGyHtzPmEG3BUuxX5y zZz43MU%C(FGp9$K?YBJ!?qJ+i2&5+wX!R4EcZGip%AV)^>})AI1t-sHIdkq*5w!cg zZ70Tcez54N6KPCS?rQGD3|^6|x3wg9GG^!nYk0)SN_;Zj6oXwS=`|`+Z=Rqo4Elp1 zEb*y^JPZEo7)GzCa>(_XArC+lFKDZ-zQ%``Tn*MLm$%YMh}soDIvJ+3RGQ1D$&hob z-;|v}!K1PF9~)gNM(siS-fJV~|8?S{5Po4@$}X|QYQE!0)?!PmkC^gA&{4&7{Fw@Q zhs~`f!63n4=WrriA_;J3lqkMChW(*SDpLFrzxPqrT4+P{#rj{7{oR*~zJ$C3j?{GE z0h|C60yH&c;LszF)MT_cT|RIw#wx}p6f>sNuW}oU@u@8;`QCgPMX~*~kL-J>ug@6D zFJbREIBl=9LsfDVeYjoL>--9`9s)>zsSp}+@;n0DJAGYVY@jBOlo_sEwypxYd}WiY zrh6jzJ0N>FPe-vMqK8!WIRC9}RNy9W zkqZQ_9832NghYxX7uf~LMPe3Zt0l>Jq~=cEusv|))v^FCtofjt12F1PfZN_g3*8yQ zD*6;%;iWWTP`n85tmiDdF=XtA&XdbC=`GGN(j)(BBOD z0;rnQuDt!_V+6eY!8$>*#`&={*D!;Dgdjtv>%(29SatbMm|2a7rW@ozUdT* zYySWiaP1oI-vL|wiifsp38RcCEBJpRvJ*HZF8FbbYcq5IQwSZWEMgCfQv_Yfj(=~ef zxk7s*@>5&knb-%*RF0Yq1ZSdp=3=T;1~y-p$%K^4%pr0DQ#bm_aAVT)4X3XLj?!Q{ zIwRwn+lsEc{bmA2=-vZCWuCmQ|3%?UwIQMkQ`js`M-Wo-mj3ftN3~0Cu$dea8}*Zf zLbu*)F(`K&thud1y}0(h)g#n`oE~87QlYPRxg9&jQk=x{Ut3iQ%UKoyj?q8rG&t~? 
zE4HW$*j`I!+u$5^GUB{QJnrX^tHVZH5Er0uXXyc3_Xq*W)d);-{bDS58&CW0Fn$4zaBFDbW1Mlm4 z!kb5rtk9Rg-~-8#iwro#JoRJPN^tzFnI1QJv5YD(dgnL|P(m+gFI5uuH0qP#i<)(h zV$leHwh=iYCU@=x2S4We`M^$E9nNyu4iW||;k)Ge*?nQClm)kZ@V2?IqKyWw(iaw5 zk#?&9Lk|0+zd&f^R^g){gn$4^Bn>1p^I_~oe`P!JreK)Q=+t>kKFEv0Hy!-!)yNz> zgkz^w>JcQ_0ja(C&nwZ`3`9}V)s$7Qa6rcrSyv2$BA!2m*iS?$vUo!hTC3;uBf5GJ z6CE2hU|LJntiRyV+W{;-aa7xsO^A!^52Gtr`e$hj7C1e(9bRuWj_JQ?9D<&`qkZ1@ zILYv1E_d039sHnpmV44AGd>jLmP$ z8;KDdK5ttajV(kof`Nexh`F}ogl@Aja`5j4l?C|m{ryb`mOz=9dtnFHK3DuI317cJ zn1Kz{%61man@Aza3FLq+!0R(}gq6MDAPWl-ha4+FO@WT_gkbEbP)@A34TQHV?<GOP-`>l<>_CxtQFC2gQav|T&R&j*mff>16k{o(*oi&h?etd{ z#A^Rx83_?o98Q(WuX$k2AX)Y*#L{xQzMm&MJDAyuI_oVrX{UW#bWEQ*e7-?G8D`Ii^CE*V@F{|T` zBor#lZx;A`(4WG5nvlt-`5R`)L}V^2=vq-^$XT(3_VLHVwy-jrV1lK^NeKqEho81L z=`m~seY?fV*z9Rf=loe)aZLzTNju2U$yv(t8L2W>(pQ<_(opujDZSWVX^128UjLUe z=39}=;WmPm?<)$BRZlab42vxfp2hw1LeLa$p}`2(5VuW!?9V_eV}pTT;Rx!j^9?7a zdwz4w;g=IZ%_qEKK_RJhd2K#s-p;7Ug=4HIO~!W|gaFKEb-}nl7?zs_w_@+GH!+|b zy1!y$b_FYnv5+v5n&SW#&l26V6=Rv@&Ad&WfAIRf&hLpK{V=}C=SMa1XULnbH?T>w z4?Kg4H;E2HqQ(%t!+i8EV!ech+0D2LYH3eIw|^|`_u@}%^lr*el@N)I%s zr|fpF1}Zy$$k%spttvNJTPIcutMQMi!gk9xGuj)~?l6Yg->?x;Qt#_q_!*@HVvzDo z>$cvLOPR+HR|eShTntRV0p^Bp+c*C^^;3?Syw9lg zmyW~5C5f}Oj=W+3H;@2zK!~3|&5F|&k*wY2t3=QzN}uGzXxiBO2#WBwzb%)eSV$%A zU3Gdd2bF!v4vt!kZS*Dz z)BB%*VPZ<% z4d1@&os41K%vgZIAbPM0L}v9eJFbAF%P|WDEk=uNzvJ|>9rAAJk)`EdOK;+o;#-P=c*1()Ne7QNz18Kkj8u30neFm!BVxEg_$wWnJ&750FW?<9>NhB$j zfd{07dq)Wayb{ttx9MxuTe{(aCaBt*`Qsiv4~(2XiU9=Hc3+$S0ZY9j?ju+rTJa1J zNt(7I7S?k>#8-~XO;W|-%4`&#&Q`=Q%5EfKQBVW@I8kHLkmRDhlztvCUH%`~hg6ur z#EA?c-U-7UfR;rtfZZg&U^W|@qpeR(1L{Z@yTlZ3H=12OA?flA-rDPgpY$l%T2BIfeAdN$l?7266b zAP@3cVF#P_M&)|PUru!fWr>OTy|KzOQUD z#O6Kj2x1RtvJFJL5Rz`l%~d@_mwW>0K9-4-F?~ zC5S}mXQyO?M#duCd7airCD>~aC}Ge6H%^0XZ4$D0+-SXyN5sm$asF02_{y`KCFT2| zc3O?Y_G}@IMmKC}HTvDn{I=cHyYe(C){7=ML$FJB`&lw#ooYn0R=`HlGH#j@#PVeV z;T#vD3j?tV!I8=UMecsCZU0HVohXsCukQs&q_IGDF~S8}-jMn3DTHTJz_~T$O-i!@ z$rGNped!UF4AtNLJ*p-TAWYVH(D1@IeFmIBFeqBouxv#+XF!>~ru8t|T2yIW3rt~w 
zl!yI+nSl?59L-k)9xlTdC_G)Ned~0atKr>ZwRMf*dPPf?9F4L$ zM?o^8>Qq7mF%P2J_==yl6^1a*7T&xQ?EUdrw%jgDBAm&s>nU;sZ9uC z3OIviQV+WWzE8PP{AYMVSO2^RX?sIR%J&yzAI7{L8LCU3uSLTGRNrWY5@o|g8)MF& z^{>A?J7TTTJf^xcTKFAD32yr?7-8l!L(K0qoAAiY+ud}^(q;#KJKOC)_#Hsj$K^v~ zvnAK}nEEPE<_bvasp5K_s|_GOWD(qr28(enL!Jvzg zSQkedjvFZ$m=~M25TdKr)|U<52K`I2@L*->5We^_T$z1JPzQASzMX}$Whn#obGx?b zwgN8BU zZpY}J;NXI0Gv5)L_x!_6DmW~t*T5O>s=|&+1)rfZ1JZW2N9RyxaCeElGW-)XB$X)g zBy^+7_C4*}nL`9jHJe@q31w$U`5lJT@v{1*3|8m*sHhfI<9`%dyup!z8g4u7&7E*O zkTg*P52$NX>+R6pyv@*tvs05p;4TQE0ws1} z93tLTL5NS2V>q%ZNvyu#-djkkpNSwY!IG1;v%~$@#}=dDRB)9VmZ_bcc$kL&THPUz z#x&|Wq7`WWK)z6(gKg*fw1 zIrzlNXHBB|_?kS!bckeEL?=tp_=B3*TPkTLOZ9kyI*kNP0d9Fu>=VeT zHnYH3YQ-B#N`!;IEGC$(pi24EwD$4Srxab49@LEMQCHYz!MsA)!QZ9*E?t~4n)Ev( ztrrE4Jj<%3DS6h86S^qtdLLm_tXff<1zleeTHG zAzDR>kbligH-9hktxp$jm|(7S@&&b`^-q4x00FzJFxIcEj|$+V889|6(Bk_9V5?rd6! zl>m%N0V|nUe5S|G6IcBK-zU=5lQB{GkP8CUG$TX9^L#M~GdADKw`$;o)L;*g@{**x z-!`Pn zl(d(DVS2%eU*Wc}+=VM8J{7YT6;;7GJXw0@e#M@c=>UCi|4xl*+yG_Ny~+!Ogrz8S zlqOEf@@ysq2hgTs`Tw(-ld&DiNCp5$S%9vwpt$vGpH;7t*R+2+|6Ur@77tsM-jzYk zeB>_;O<`N-(sae>a0u_oYfUZ4yc7L_$7wx*(AC=Fup_?$IFUef7z~`GZuZb|xPo-5 z-XWW=XXYUg^^T#ScQPt|4ga!&Qlv7G$Sr@$9krQ^+VO;= zhiSs^uOtR|cFu`x;KTE?tYxmRnptV6ZySCztBxVlEjgT>v;A<{(cWF@z%5hpVmGsd zQQFD5Cxd)M(=j)eMgh>4ZRrD&$8G$o!QQ)&8`-sz}cl{88hGYoYB`%`EUHZK@O9T=ybs_3sq0J9C^UV;}! 
z`WkUwxt)zoE}Yt}9HKw^6iPB+P}f9I?t&r;AMrL}Vst;8Ac@KNVu#$3-2x~YMejZ* z6-J1uL1t7mg2?2MEyACAnQWQk?q}egL>Yw@H%G(hJKyha11DZYdFpVe5x!pydcMDN z{;d04mD>(mt=~ku*ju7>@hVl!(SXp2^91uT;DY#%nJN5Kg0?C`=mOZ^JSsI(JIpB& zpPiDSgO8`noON#5;`PeC#F+1wDq(rG)C4FUEJ$5ps?Xll;PC)#w3;9Um@}o#ebE@s z0~4tpY#rfdg&|hSb2wj7LA$6%I>p9AZ&-kK#@EK^!Byn?(0#9K$;7e3geN-xS=EQ2 zek31}-`GY9`7@X?a(KhKMUF&R*#g3~F?X9}Cy%h@S7zK$^ONbs*7;e;4Z-rbc|22A51A*`#rMPGxezudndSObgh2W$=-~5qPMdN@mhk4*yJ{WLyH}W9!e=g+k z4J3_PNDVOp(nP6E9n3?vA0L?vpslf(BR*@1T86-?$4P~bWkn2_1StoGccbyd6!By5 zDs>T!pVRV-|9<169+jOUsXUY1){H?gktDkfUw}&ZfL-T)5eE^+ltwY(An0mK0Rg?H zg%5`USD>xvaXa6gj6O_t_P`XYrO4?Gv5}Po=$A4)g9C1{BBovxZ8%(3%8wnzK z@E^dVfT`iBtIttUU>_pL*?k@el{<{8DK>+pZe4KJJ2>-SZ2pA48J553=8TVM>!j5t zOu|vPZ@Ho>SHrV|t0C6w_=cQNZbDW8=G5^klbiFJ@Pj@WLN;1c2#rgZU8Y&*q@81H zE%J~p3+F|(9&1DnhaxDP^bgXcr{D5j(5x*+U4UCB8DV4DLaWu9cEG*tS07y<0Fa?0 zk&zil@zgsH4qA^{txT|ThXin68uRemD35hzeUhaxokHV+Uw+Hz!D#_bQVEf&MKZVt z$uX+`R5#LvRfao8y#P&U%09pO!(HEy)FnVC6cb+gZcaS$V06}PD=a^OTr7?BAvRHE zc2POqR>geyjK-^~+s`eNn3ICTlXC>>u+#Upa0T-jC5HZN`CdpG;HR)^#rOV;Flv8G zw#%wfqPURn1((Jn?-I{PSArFyQ)foYBPYJoWZeDo97znLnC#G3allr$wWK;oFWi^lw+7Dt4Uhl#?;XBfPJ5z=KqaAB*L-jG zsnfv+alQ%z0ZJ}LDuqdvRYbb_9_Kwg62K^TsnD|h&~1>qw&tOQ-Ht9qz_mmub){WCBwXX zllil19Q18EeRSJ={UTBotd_G83yj==I5sa1fR9*%mCtdhs)&`4_iZ*DG&{3r>aa$^ zt7y-oxr9vJcHun9qwhEv#>1f3-h7|Xu=YB?lhf3m6l+0l_$iz>Iwifb*RS^Tr=_XF zQIekCOGz+eZ&g3xw1}VCutT>3!)d5p^?$J)jZ*)2orBvW9gVOAdTRbzXZS9IvMxbj z+L>JFxPVO#fUAsfQ=uek5k2&a|3RDL0BvqPeLRGG0*u_ppafD`;)M=ySpuwIwMOm2t4y13;UHzweP-62g{xJ>*xTMFo=Q+{H4(qqu{> zrObKjM;1}jm)w&1V3hQeQ2C7c0wGjO+bamj)vU|wb2e-&nN;e%WB8@w^=!o5)RwQWCt%y4^RyT! 
zB%Z4_LU`j1D0>51?&mXWzZyDcdKk_^Oo7Vh4)hSO;N6!)p({ZeEg}9=<``uAkGKAo z(onz$n*hVs4Eveu-PU=4E-~}+ziCnNVK$0Po#^gZ>_W7NmIJ>Dm_2gY)~lvsNZzUe z0J)tagtTY1B03RLy0Bk>X^uMIhnK0t?4pKf7u1QMXUGJYzEXa)&AVVxdgE0yOpYY- z{N2h_SP%QM|J!cY!Y$vevl^9t37lfY2J&PibYZ4QPpc$9A)Ls;eaET)*d|)6MW4HS zcDY|Dbo$sMych0r?(#wX`=_w##lv7}S?xhnrTue6ZAVW;xfkKnNw!I`5IpA>1wBxM z*qCX}>fWS>#}dD6d^lDhz+!V#%^C^pyptM0+wpGD)99MYc|b~WY?EVZ!EiX-I*1oI z8wpR-oov1B{je3*`=b@5QN|=fU}tN1@*W>2<Mxm*PI_eV+S7$_z#D}1YABD}l5s6dps0vhcAH@KB)WdLINO z>I*n1-(BfGb=R_R7E^$~vgO8GUf~4~&?($rW36=_Do0bYiU6WlTW;^xkor@07jjW+ zTIkPxT?xM8owq_@m3-=O*~Rp?Y*kbe$YBD=T_iJ1tjSC>uTXBtht8;nD#Ng9=ffw0 z&a<4_Y!@8V4~(&0TfE7Q$u~k4WRP-CZm7o{THEbyncyGi&cZCP>nc+=FiQY{=j+K_ z4q;3Pv)1W;Hc=?E&X`0Bf5#MJOEO;6zTB6vM*DhxI1FZvqyV4rEyZclPZUL*$Y8!> zVvx>U&8j)5Kns6sBJmFygahwB1xZnHqT39vQ3+d4r9NgI6~G-)z{WrbqZRDs1N8@a zovXSu{Imw{1slLHS&Oy6UpEtJ zVVXmQ%OCy_Qo=ez#Mt?E*kDT@C|v<2XHM9u+2;HiPcQnzZ5F=t`=}h}HU!yyvycI< z1t;OpnhY*y<#hTcELUXg(i7BTs2ww0>S%{XWKQJojkr?|rT;sf4x~jy2vFX!A;bzWIw!H*_(*r(0SbZ|9cifX2xMVVbcb1tx%PSPc*UNfO}Z)hn%43OQ}(V z_#|&E3^mQ$3?G6|VH`dgkDA_Mpk?EyaulQEd`){x>p&bm0lb@V;@s)wK`Gg@7%+#5#9w7B)GE*JUz^$9mp6KBy)$>r-qQsc`^+tMX3MD3a@S8JquIhhfUq+l{txe!S zvq&}Lu#k0@*XM(8Q`KF5fG*7r39C*IUYM?mdqgL+$tZH2pHH?9R_Ja+WBTQ4rkhlf3GbJI3TgjPX zUnb|(<$=cOr;ih8W#d&`wl+>f@4bT`@}%?qICJx-Bv@g{u*f;2yj zPebh1>KxoLrkKymMX@d!a6anIOZuw0xuog%0ZsVbOG(7y*oBu2&F5)`%Ms4?q`a=* z+z`lx0$J57OavXSmj=tO#a5*4IV!|A2}u)z(QcA&-ifh_ZUR*(fQW_oUZ#cLo4*or zSH5%v9Oohw(8j87z-3fh+zonZ@{OqsE;-w-N=+EXoFypCAeKxhfo3f(F-1m2PN}Js z27Lfx70p%do?%lLefV`t>~r_NLx7W7@)TOlw}T% z*vYIU%5W|d3w5rypktp#SPw@)ewS_(>{6UxDW#{PQC8?hW}_pivedG#9DTF3jTw({ zcFdi;u52{Pj%=cJs&0ssgZj_=-$bGSN1qiy7exBX;b%N0Tj zL>S)M88#RNPQXre(IrA#A2dXhYe!;9m;bF?GRQOTNyBTjNg_6$tKE3F)U{0?-Pk%w zVXMf2ZLt}c8x&5@BI@<))M+vuKl?jYWX2wScVx=(C0vL%2s9DHx4nyn)T zcDU`l08Zh>?!z}nz}+?hHuK%69+FS%Kghp+-!XwntqzZOp%f#!?dXn5yHj#7GL>tQ zTdFpc9*196O19B`6XC2thR%B3)J3skMcEl3v?aEEYt0Fu0% z0iXICQ}9O|x`^niZnQlyR5X~LQOnN1wP!^&c*J7HDP9ey2!#m^70?!=zs`&QEF8Dp 
zui-^5LsDDh*Ciw^gzB(jiE$6915tLfh9S(WU=3yS3v7p;k?KjHS|sGa9izI(wPx2#To3J+-T?C$PAU| zUVbvG0}thk{+5%A1I1*~-#u6w_6oLh%Z@n@q7V$xyJ`t@oZZv`OjGdSWh0xBF2Uhl zpHl9;W{{1cl9WDr)0Z65#xdhnG5jyslI#v@A24wMAxlolG=11V^Sh&h7;lAr z6P!P5Sy?lEezlKiRK8H-#xKNQxSykR8V*Ql5_WZ+`}FL+nYUk%@97eK?OuT!5hM*c zhuR#rmepi?|9mq??g?ZVPIH!`Z7j(B7Joh;zvw=nCAEgpDdhoN@)~8rU4&ny4Qa}W zP14xAVSU$tl0XL@sw^yWN81r($mIct__j%4{%Otx$_)i5bVu0hrk>6pNyNftH^rWh zC=Xm}BCE^Gs2qh}X)Hq;=Z)giLPhz~;NQ8wOoLx#4f1QTu^XbIHvdpVTfKw6d{uHI zD;+XknPR-n%xHfd9$zul4VG@EzfVD`0;d`no|YA@S=mb6$7K$Y73(Q5 z$po|-n`JMUTt=N}mNF%FRe&z~u~jfmv%xwbyKxNu8yYxJOuP^8muod5tu@|1h3BO^ zYC|p(-4s5goighTCsBY3H-RzTr|f5Gn3`4hkryswV$4Uj=k?HHo5>{3qUw#6?=Vg) zi9Q^YGan24%8J2BTe|%8!eDjyH4fZlnskX*3LSxoAtR9iD(lxO%s*rwT(23i!cwI} zd7LImclOFG8xSvesu4wp@#mMsTqI= z2wXP1nR6ZK>X9n-jRvOZxfXW8h}#orVSI`UFFcJpNxNkUORLT(7uLpdXSVI`I{YS^ zIlxWs!y7p+H4@4SDwc$(4XqkP2f?{V*P@$Gq_aAIx+(>K0WSS@ln+MC@m!MuuC%2O zvLx%oTazhR`jp0d8PT7BZ{w@6ngELRkZyBnyG!;pmlbypKS}A2a>~jx9@DldLZlMq zEKD!AETK+IuNOgz5n8*Mj<4>D-Xpj5{InSDJ!Y+{uF_EV99Hhmv;VH9XV!X_5G0Hr zbt;`=F`Rw0SBBV(x!nqvy+cYUu?Y4ZA2lY(D0xV8KCD?L_j@6 zdVrkwgv0%sBGb>OZ$&qumz6059V36KP}(lgZ)*;U*C7uw@gAk9tOhsmL2<>3DRLuz z@lvq&kO9>_`u^3LR%K>ZRMZeBU-(VGKlOxMh?m(mWRt^H#e%9dxeVTLE4;VF3%)hS z@OkLx3Fgb~=J^?=m!mv?TlNr8pEbM(<^T@V$!Z8mJ2Q(@^(6Tjl!V1Na@AHaPPBxF zu5&I@8TR}6?8CW7#5w&H>cYy$Y`Y13S50#Uecr4X4}?0IkIU)f<`0T?k?61V?d>(0 zweYsQyV;WaKd*Ig36asLSogHslP7I?DehlsMi3Y}5(=)0>GzV8HzQ30qArHZX0jRZ0?Yn6GZcQ>e&|lpHT3KzeL#85*}H5KM8#&{fA?Z? 
z=6Iz}B6Smz*MI!ta0W@A==MqoEa$qVa~_hx?ftk}DyGNor6!ac*bGm){|rtqp;oS6 zPqm2)z^Hu^^$y`#F<`Uki_qy@-$816 zyR>%;lw1Du1HV_}lH3c-Qw@U_isi6_ANe%#5GX7J?4G$^T2rx+ZGa2%1n|Y{R?t7oQ4wnSMkBP#j=(G zu@|#aAZJ#D&&brIjM6v+4Q*&z+TQ~%A@Mw)5Kr@_Q(%sPKpU99!o89+ z&Rq2ELO!*@wBhZCoOL|1G^z9mRFC<5vVc(u4#@E4pxj|ETb&^&85%P+oqJ)|?2FL3 zZwAupx9P2RRIbd7=ysjtE4T{PTa0=LMEiyGlezf~{I{(|i%M(h|K?20jK9E?G z%AyPG+`>&8p<%9(XOF7}4T5LP++RGBt!BHO7so$q4a%IpnE0X{T@3qw@^4^I8v*nh z*o>@7>|^Fv6z@W(cPKY~>=;aR?yeW6>yV$MuH@Y8ApshLXSA%-KjP=tp*wQ{jengd z&JwJ53erxZ z`xXlrp^zkiUjNgO;KYD{XqzY>;Oa(5?5BSINdZ){``yGsGF4D>jrX0YhQ!+qqvxs&|tOa2vsh*X@Wsp zCKBP6_{{~t+EEE;ICy}cZ6n75t=lx`fc1{a7+vYqn14Vbzy#548?d+;zd8I27GNrP zta;gcol0?{L+RQbdF{ikfrnLXL zIB0xiM=R(&b1dm9i>?q_)!{9m-DetnYFh<%VZ8=137}6Xa=wJPKeAqXyh?TE%J-jv zRx>Q1-!TmG9=Xoii7)wZC+>3#2I(FQs24H#cDXgz$x0@4WQ`Y!VBC@JYN5=90UZ1G z+;w)S|kvs>@}%A2dIVybHKFl($yT zCzkB;>Sv$L*l10vR|g3PmR4?Kxp&52Z$&@B=H{X*&Oq{#@{0&l7O;FOQJVg+=PKe2 zoP6hu-MfwtYI{|&99{$|?1&wZgWrkZ=m~>{n~&p9F-l+!-&bLsx8tLHv|eb4wtR$-qe! z_pyslq*v$g?>&_u9LY*k(t;q(v=gYaTT?-pmG7?6eIWyjGtRp2?@ zNI=3whS*^P3V2~nYuvdPjG6=B%Iv2}xwzmrbQYP8Je@FM_v%xeEf z4p=(kb0aOe26E2N(W<>p^?z7SWzD*DAh|KE(0I^rGZs%26jcD9fRO%L2`xlW@9ZO2Ao_xEOgpsqghNl(~8aj ze)RC)Er7A-{ry^{`Uwkto+$vR%bXME8S#5c;0;a;C-~>>PXgwOMiXHCmW!E`!1@Nv zz5M2Z8cgTdWKIhESr0lBUor31@SU(8o=*JWMYLz=z<2ZSDga0<@=^KN0frA&iw!=W z|LMvCs-m9d&ibe7mh>V~CqkIb?b*{g7wsi8dQ8IWas*O%cH&^^{Nn!cnl3DZ7r{OE z7rfe+#t$K^B+o(RsE`ipu0?JYkDa*Va@~c{phuFnViTQgiRD6FtnME})mGg)oGbYsh#V5roO73=Kx?ISY489w?KTiBGNv{gShd z8H++OhoJ^e^As$Th*gCPI*8MH>P|-)870T;#*|lWTQw5C;b=M^u=nYWd5Xe zaK%_~<3l7%A>vwUBKX1AT&80)6YXdWE|Mi-S^il5-UtlQ%da0Q&5pCzjf7yr1)t+Y z#t(0&r^@JEet9rpqkLIBJ*VblP_oZ9A)YcTx(*46GEJ(9vEJ*p{1(;03tYle@CC1l zh?sr__|s=Y60|&8*|iEdB@7-88b<=mlQg1g?x??ixLRifTMVn}V6km4`ch(J_b&sS z#fUCUNr<9c7h|1kp9OP>cYKDS zT?|e@5cHZ9U)7w`MQ?X;c?)t#sA=nke|*$blEuZO$ z!hn!{mXjOxgpaN~em39~8^s`GWnWI>5TYD)Z$%?vyS7q*ow5XR4!|k9N$=av7z3|R zrO1zy;yj7o2MQx=E15Y*GEjL!+{S7H)BB|*0CXeb&+n%VHW638(}TlpE-%qRVij`* 
zNgmgj_}3p~gj|T67BOyQ*oL>PWMPyomjvPn9wm1FHRwT!Dc`LUWeY+kY>vbOaHE;J zfYrk8@>Z(V>ws*_RPdt;SOE_1lUaweS4^bAFbdu z%JGxMSJ!NI7;m8sMMhX@0rz+GKLtoc*zB-!{<0t4+@L?CUH}0ErI41SxYngr7qeiw zNp&Zf3abGL9?5#k;21y!Dna9W0w$IzTta5wbEsMp!$+*T0Rieq)gjeKiYh}$Zk)3> zCU%^yy{qNdSVI&F%TzV+;PRLz4Hq}P3#vtDWa(`4B5FlIMlYOD3kpmMrEik6gAk_B zpC44I4)dwve|(Wy!xuvzSF_%2X1Cd`BYIjjvfKrPTJRQ=Pq`k#veu%~Fw$RCkSP7R z<=uRJR>Y=>oqwYwn_=0JbQt=!qGvEYtC#fgWNw=*YkrH!gLlK(2oP!5>#2xz2w5el z>MEq4#CaR{*S_Hy0^KJW5aqTd9R8ll|7P#R)kJ@3*(=?SXXB^(dNSo-#%XL@VvH98 z*@kC#&+{hLGH|#w+(wuu3e3~~QDHa<$L3d*%a zkLv`4ZI;9!y{NK(U2>)+?&GB8iJpQGRUA0fBBGi*Usb<52v1{+k3fiQ^EkOHZ)UvX z@iTBzl?fUl?AWE3rL$GEB`X}s1)daQ>m7e}838%Up$>(iWym?wkr|Zx<1{C| z4U>zyDt7r$Z0{YYvmrfT93FYVv}B&*w%z|84FH-=FP`ZUXd;8->JhYP$EE}?OT)JT zV$$@DCEwuKg()hagbMnjpX@%yN^X%)Ar*s%z4pmU@vy)N64B9P@&yfVBW3YxJ3ue> z!mW7eY|A)#;S!I(xx?^=sk1RUgwvLZMDUl7dSSi#h0A-_Uu3${KfA9RD$%E?E!qi{K z(z+tpE^}k$#=*HChi$7A19XYHR(Xmo85J}ZrnGN<3Oj;fZ8hW|RdWQ^ zdK^3PM{?hEs$KVVyN?V=wlcC>p=$)NO>E{1Wp+H|zMyaC_-)tB#7_lSNREhBLINRTIa zn;@M{zJuR+K5;sp$)ZdSPI`&TW`jVq(>=+c`yNvnLplOq{TZVhHnh{>2M0rJ$8SYV z9~u^Xe*yF3X8DMArx)^=fp<7#&0pCFw&7MHZZg7!uLFuN?UU~0b}bvKbKJtMz3uNhIA3q{?mUqWplffdf}1L~^F3i*^!t)QD5jQBLu|W(}iT6#m)sNsO=A z4v9Pc;BQom=Rb}L(oiEQ8R7Sf8OT%izOHQ4*=p(8ouI<=A3*o@jbLewjYbZ7 zdx>m{YCW+qgAdMme`Azi7Y$$!IThNDHVB``QASaboKI|%~b9h=Eh0nV|{j7!AT0r$u@=D-gkC>!ubwhaA zdH^UkYk;iywKAbpgf{D3%?wW6*}GD&fG6Kpa4=y;KfW};#+d%aAiHHl%(l@(yOoE)bMFw&{^;&KbuGrr@q8;fRu*-QV{6cxdDhli zCv>i6Y?rL>bqaQ0zY8pZu)Flws_7umIxosBlWH(muG@{vn7t>3S!`ve%k9Z?(1A5)SkPJ(Iu*Vc^BFSc zx+7viW%7Qt0SKdLB`N)+2Q;c`QuCiFVA(C46hF9?Q3)O^tYv;bAm8I zl6P6+bVRB!GnypzKU(UlgX-BW1}@o&Doi+aYOUJQ9T7|(VV?lu=iLVsoz>hRh$X>< z2w08Lm)SAoeIelhB%MSJ5Er0%FM9aP*2rsei_LynD=QTiC!%hW^A_d>EOu%t@t&E<)OGP1gpI!~HC*9-w1t zYR?WpMmVy)+gxk|4F9KGKdU3$2k;V(^kz{yGBhRClbkK2eC1Y& z(=pq)_L4FWsvOO)6BcUK5q)PpP*wB+?NX{i@aj%6bmt*(b#mVS2bEaCr0ZlfhEC>{ za!&?Ntpe1ZCL6LeQ${M`R?Zrc(S00pyd6OEFF`4m0DyK2uwZWv{P>RrL2U#|-cwoE zLyDVn#jVSQ-#x1GcGbD;Uv%F=Pg%FyH|*LACfVC@EFJW^(+#%^t9+8t0w38vWcm$b 
zaj=pRyg=wriMqyaMg^h7C4)6Qs`IZWao8;n*Dty%j1DxmZ zx&uHnkiJszkXq(;PjBdvL1NnccT3#?Oy)b<*o&>o8rWg`mQef3#y)F$hio=ACRsg@ zfo?fjy+=)#xKxE`Tk%2_M*7dZCzB!)ej=ylULiy+3x)~o-%d+tXM0)eE}=mO`VcOk zFL;R7Z637o*AAe)9!= z#_~nM+mP}$g2i2`&+&5E=>>_cZXn!TwUhsrv9MF{QgDoCQW( z3a2p82K>kP7o?CL-E(>}YHL_OQLzwUo^`L${)J5qDjW|K;Tvm~OV#>sN-qe4%RU zEnnrMieAQAVDLFQ&alwd zQ#GGtWT1Ws^Qehoe?C>~YHPL-P!Ib-cZyrc%6^K;r8+G@k91iO_sW|6-ve3!I&U%W zdAi25XU-0<4!)Uxa$vQ0VuW-0ud0{LD7xSqI8Qm`?QH@BbZS?2>3FxJ_!2A&q|!n? z-?f4Fn!pxr0YXKQ0$JH_jYVk#>B?S>o+v^)M{@zp9-L=$rAK(9J&_fEVW+ zh=7ovj5@J~`Y6jOUUPzdU~foYNdbRF-}J243*TW5!sO@m&Cno;NsD5b!375m+B9*CcT#AlXwtl%RT=~?ISgd*Jj#4Kd)q`!gv$yjeik{q6^cqbu zQ)|W&t3?x#yh?Bo<;jR_qE)vp6f{?5FQIio8S6wPJJ^R?LGxo8)aOkP_3DoGbHUb* zyWkl}3RQ~ZiY1qB;6ay7W!4C1cDg=5<8p?Op^?nT``opf6Aq8QvOW&SS439nUyuXL z!m-I`P)ND`w=J6J**TdjoO<^s?$*xwmU58()5-zKjB9icXVHcE7gZnsm6HC!HsMph zS`K~;j=T1{D??6S<0#3c82F=(}RRhV9|;Sc}M1mN}q0p1Bplj#?D0yYuWO6Ieo zykT5R;I+}!N}a-GG~hy+7(8?P9undKhd=AO>HOO%7yO62f#@^%aIf{Jyf5cjTW!2v z`{^QgoQS8qAPDR*qa^}nq8r)EU&RJeK(EE2kTpU29A zuvP^%Wr!&8v*AH(zT$)Jo!7gm_iJ|sQOx9gs^cXg7w0G(ZkJEHubHSgTifS)xo>sW zlNy$t4+Cj@b(Qk~gNMI->xweXn81ryRktRcpN4?5K2LF&PMU(Q${YrWwL4$UD(@L z-IiX6ZGuI4d>@%%S4x$z7hEhSV#iv4Dr^r;cXiM ztsaqV?<(O>X>y9s<6TRBZ`2S?Z6Dr;<8=FkJJ%Z3gT%fHCIdO8pNK#{H7gK9+c6$m z#}+y+_4>6_cNgD7cN?QmdL*$Ml*6?(=(g5b!|7Adh(t&KR^63qNdE)OgC2vIzNPw%#?aO&lB1{gf>r51xV`>x{Ls5!WC&8msKKGa z)b(;jV~b3QZNQx^t8nA}(qKvN?mJGt85`WTIEs&9GLjT3)+ih03!`BXS+0cQ=x3&6 zEH7$vFB#fh*6^a0vw2SU#K#GqGuOM;((H$C`65|EYiBLs@AsuHiz6=G0xm2lr$*k% z>HhU4S+^3~fO-=e+oXGsfoaKC*sjxOY$17&*4`EbDx_>!e!EiO_Rz;2e35g8J|bea z%utNOq-j=7JDm7SXCIa1qE*E0hDGw>X-KCknbg!qY{d%BC~1yD3`7Z*5mu}aMgKha(w#AsE;%^|EfDZb$4N!EIa~K$LHpYksbY6 zmkyV7$zT%8bKN4C^V-AE>!>kj(0*&1cKR=&>56Ou(o*l+5;-d({N|7mv$s`js+_XD zAaHoh79bPCNRzf3kBxB`6cJL`rq2_?y=hKD_;6@0T(p!XtK%DSs=1g3&F}bmpa9JD z)&MRs3b+q>Fs+&HC5y@Tz(AAUOS@VA10PAXTaj?5^=IGJ$%no-&xK^dKtah@Sq6Fo zOA?k#k8V%aRupgdWIBJvJy=zlaTOtT1!2$|lUWk+AF{wkWOBR{7$BhZUQRQVK2qOLRnZmFow+^oGwaoHaVrrBv%hfHfiw}bp*Bh{1> 
z$~lKAb6vLeAF59q8lC^?+-R#p-XBmA5gfS4S7EVgifM4u4iEQQ|KgOF@@`VTkDJJx{^uhp6{vdDCmM-p*rKXu+s@pg0- zV&5h&pUdOLS2@m;C8SS-t{(tW`d&F?YEcR6Sb z?Y4#_BWpb-1^KMZ20HO@vHCpcPGG>q-1hap_06NNBe~Z)-2BvnJ$TuyhCxgY=p|$j zCP1`1jfqPBgF#fRebe_`7zRY?!2mE69-oWzLheu>`&83E!aMpd9YJgGM=d>VpS5Bo zpLg-cGTkG~iXZD`v9^`W;6{<20ciR1RWeiy7wG($6|A~vpe&FLX5!k#MaBn6h`(o% z{yS2SwK1XkEmg>Qb<~icW9uL>q#iwOcisay24y)=1e?t~fN617=lU&1Enyx($swO( zn8IiCkrhjPGlk(!WF@es+(;8qM>HpC5^U28pP)4QZ{{K_OJ}R<#+XPH%wh6Di5Srl zekcneiRu5A%(xWx0&k-fZ{wM-sdM0^Ypr9ere=Lp5Nt=@=ydd&$z+ytL#p_FLszF3 zT;qt)5qLZ7MJs~imlU#mSHg7|_b)}EiK?7|!q1I2wl)lTjcwib?$9%xY_~Z1-@BZ% zRG0h}wn-8aL2?tp!pQmQR*A*pJ?O13It(6u!fLq-sEkkb2gzlB#r`-8%TMP~BgzGD zZ*JoZ#cEu~5+`%u+e$N}wCok>*;jdT;CvAJ{miToJl&d8m+M2mcT01h7S2Y6V z>GD}I%ZjN>aXAS1s+&mRoj#z=ll_{^=nkW58Y=0@w*kSlSdHWIHRL}oZ$}Vc*@#MyX}1$$H{LxxAy}x z6@t=pR~mp4pI$A*xbt&O7sEr^2a9n(rJl^-%;`^rHko$y>!Mt&Io~!=puLZ1f4@u) zkoK$0&v?yG`RRtF%UtvmN4nHNMMu&~~?2Wn%v{3!QL?CDDikp%4# zK#_w#N|V~<+~rwO+wIqN+sm|FiT9_YiVn~X6U*kEkxk(W1yqjXOI#MeD=UJwohgr% zcGx+IFW|hDrlvKIH^m3pM#tEaMrfaB(THx8P1o5hu0)RN49VDHaWV%y#wjiLRf zvqUi)?91x-QK^0ik6v(-tF?y<`p#1N-gFbB*&-bq3m-G7k-KTi&aW>c6W~--Tu&GU zDLYMG#=4k?jSBnKhdB?_G~B8vngtzE@{6ifMLZ1@A~?kV zn&4Pg!3?mN@^FQb??!&VBCY>&*XZfZ#0oiZoQzHlujzM!JSues&OX_+PKhVXf>OJX z_OfyjuP-^5YUX?fI~?|ut+fnf6s;^^^yDVKR??(|em_>Coe);1$R-3F!O^ZmHjVbCw>L*pmQ>rcn_?&3*kZ6wi1} zNuTmNS9>DC$Tpdbq(aI;+mcEyRYE-4A^p9cQLCte~j?Wamu$gPcNg{mk>WI zhcHEATfA)DA%F;IbJ7QY#`(a~(7k$Z!l@f_Y0m1{BmV~*Ohcnq*tesi2EqX+_E+Tk z?c*{@DGRpUY#|7N_qu^k4w6GaxZ>EXk9{p2pD-sGg*G+(z&};&VVvL<7E9JNaiI zRMHUh!Wu@Oz%x|Gj)CC%;XNEOFHRsJnfQ^PD2ab`2&TN{bt-}{mI(VwWOF{t8%KTO zu=UT*gbmu#f1RP94?}QkFs^-bdwsf9JEAPOFRrWP+^iPe^I!9*FX`_~Njy@kB>Po= zo6i*(vD*42~FLUtW>4=hf3gKl*?C|0mTaol5=fxRcK(3w~EWm-*=mbo zhuuRsR^t_6nVTrZ2Ka_ezS$G!w6#>g{)f~DC^`JT{1b3gulf&a)JPO+q1yKQC=_F} z94nQRGpIl0bjevjlwfiTyR#gBuH2><1LmDnRC+WyW z<=lte7%BrDKV1P2m@f5o=I<$I%Z&fqpZ0msuQ5>(fuQfT!e`kU{bqjt=CvEQ+Ye}i zjg@j$m3GyfeiDm|i_&j)ti^}NKW_Nq+f(K%q9YJDrz}p;P9tl9UGbHbo#}2@r`dOh 
zm2Bt5pI`+23q+X(@nk(pt=J9cyYRj|9)Jym+E%Kx?gfSv`)NC ztdk8GY^^y*mpZz%Ws8qT>y;N{%FL|162_JcrqeSrq%V@19c4DTfdsWaE~-yI5STva zEeU^1+l_aYw14ce0fW1mWyq-CHuCVm0jINkK&3I6@DIYZmB?w>kVW&luA%ZC(vkxU zJnJbJDDAHo_Bfb6f|rks1#yIF9-4gaWXq&X$UdZcOS4u8?$v21J&w1gJuvIe_63VU z-ddM7wR;F{Z)&ak6VzP*Vz)wT7~#q zc;GaUmjn%yAJ|s>iyRXtn%rAQc`zeDuZYb}Eb#5fzJZD$9#R`Aa zS=Q$ZF?&$J2W+7YMmAa}X*)8NzxAw%lQ_+>yVAh>H6JTdLCAuo<*EK9l-8e&M(7`Y zc8y76H?=nPGa`MGQ69!nS%fqoOOYR!zm>9MM6-mvOoOeH{6xEn+krV!dD++XBw_On zIBvNn?QuTsCo8K{(2f|H#^!&e>7xz1e5?mQ3p0jtaY#xkx&NShN`S^MO?Z(NnLSo; z#Al`sBWA^$t|N&Km>!J)2o8=U|0hjbej&9E z=S|H(yiW?H^L2b8fOza*0bl=-<-kry$^7VT7|&Vn^<#YecXCmT^7}z z&zGE{tg&^14$Wfxa9LfXy`Wm0jLsObAdOoXxhlNO6pYdWm85E(96k&bp4897$#?7^hky2j}cQzYc z>%ai$AaCc`cg`q0WNDH@-QI}uqv#SAN)!lyVh$nB&gQX z20y2mrBE+`Y5sm9*|1%x+9Te5-Ohj$kk0cA*QG+}`d0Zh524&qUZ&a%k;o#SNpq`I z*N&{jQ*G?xpvbM?1*&$`l;hY?|4Sb!RiSe|B?a;UNBnUX4N|TggJJ!OSDw6}jknm! zEL0@q&a9nn^#R^8;TQ8ybH%yY`h3<(#B)%B6{S_t#q*Kp+OalSW|i&?BZ=6{07u&B zvx~~j{jwmvzHX#-Ll*P2L;9-`n>D~Rm91dREYnluiqIMX^0SYNZ6cU|0zL-U>MXFmA|30#J;zOMo(Lu@s+er!QJ;Bgdxz2Rq8~a39-11B%V8;_ zO#%)qqBP<6LMM0Nc_b+We63wQYl3rm?PA|z{r*RU>(2?(xxm06!u{{V(C7Rq6#`Mp z40Hyn9$Mn3yUY{{d>FQ*WXD1m+hAPD)Qw78hiBAI{? 
z+fkoUwWNeeG;+F0PH=sM-DMG*lm$o2XHf!VPkm@9kU6nQT-KkKqf(nW06^v_JL~M; zIC@vPAS8wUqW9U)4)r`CvwEnW7V~w8=b2a|{%_L%Ax^%5uu12g`izF-f&PKd0CgvtAz}49{8RSUQ;_uP<&QFvFEMt8ad%eDX~iGTnagpFuEa3${!e7Y;6ulXCl zv(&`rjc&sb<9aE{m~R}9rGCW)wTL4RvhdP=BBq83-LUprol97*@m*0xX=2YNe8LWxRtMllhsc8hToxr-tn8QV~+1cM`d??0uOWa~&B4S62+Na1MlUf5}qH{xW!0PK>C&tcyd}o`&^zTUADVg}?;k z;LeD@;zIh8Z8Axhev^2LFeb3Aoaas9(-_)nftlu{wx4K|OZRQAd=D%0NW1_B%y zVU&r7>z4{;HXduN)@Gx|O{POx!YHeqy(Yj#u#M(ni@oXBYCj;A1yT-sS9A%uRht$g z$9FLwLrKK*sOoDi6x}}lo8#pEx4p^#Xz3FA4d8K|3ohUJ7Lg2lVbphDYq zhAM(_->Z!We=Y%OOy8=FakIBR^ts7>Sd%z`b&fT+T>ckwBU4%nlVMHH&tTJ}`xv1Y zTl1E0uJSs0nKo6rQ=_Bl0V*FhxmPySS&&*AB@ME91zqqPbA`gwy&#Qt@ziRY*Ve9` zQte!Hc;PXgMmlR`2qs3?1q?0ZZ?K?#$iaU+*zo;BX#jZ(&Owvte`{c zJM6wokc(lplp6*p9^fM(atN{+xI1lAaJWiDx6Fd=t*BtuV2L0jDC z${(%482|>UIk@aNPZS#5JZt~>C69LkX`~6?40oc(eZ4zvV%v-hyun`2M6}KzriaY~ z8tO_cIhVy~)g0jt|CY0zbGp!+uHkc&Ya>POe4RJCVm#V*WPxWAQ#6eFBr|;mi(o{Y zBEen5Slp4}T>*4mXYck--g6LD#i7+^OyOTMvw}EhLQ_CuCs8juD z5}r5}$ioRQQBI`v#;z;Zg7Ht#k_Ho*Cy)?#!1~FH;G;Pqcp-3BX!xBR>!r$WOtm%S z`>(5L;~cR{$q4+&L_IY7qm7EX6WH$v0jSoucrm;)E~Mmknv3kpzeK2tIR*^?=kUz6 zeJ6M@JB=WO=rOnrSYHt(BCZtt^|J<=u}*Dy5)exV3LFEpWT+y2J)iqSF8CPjKTF8`i1CUzF&!g-e8iu`({ZTQxiufY{LS@14 zJRV4#l`J26tl|=4?Z4L$!>Ad$L!#@L$@i5uL4sY6@w@jxxVyP9QiAL4f&%l?9j!JEj$Jrwrb*Zs|4GMwDomn#~Ellj&p)h|sAd$J#!pb^fbG(bo{fW@D? 
z?I+5Gw`y2?<(8Hc=LB_jj4izlVDZD!l00OtnM2ZrVPZ0!i(i^HeP?vTKzXN`a&e|+T{4t8FqPbu>C1PW!>z94{1HvGwuF22V?PnJi#;jK9zEMtb3u~t+5hGFOMqj99+ z%1JdN3OslQw@1~~7PzBOH7yvv%EKu-JKxk`#RS6x z@NwdQS7wB4a;%-0CbH+B@!Y@_G6zy*$aEBZ;}`#ZrIjU1?M#Dt%)ZJ<-i49GRCroH z0$*%V#Ai%hB4h2N4%m8w_nQ0k*AH`?@cmMNu}05-X#=sJ6h#<@5XQWNqVJ;lccU@s zHEW-(ju=ywc)4Rtk5LR!sppL)`C+aZQ0!!$61Re2ynRD34gd|kO@-qwdnONXx=a~KgKo|$`pKD9;rl9_of3z=r(wImO?mAcCxvhIlQw|Yl^e;olfP@Gecy|DULEr+hHL^sphd3G&C|+I<$@5V zG~Re+y3Exp)YTO9(?qSnf0K}yM~R|6^7>&m=C&S1>u2ixV!VEoI)x=}m*k?eY7Ua3 z?*YB#tKj9`<}Mu4dokv^(?$Z+`2#{bPovM^QSIg66@Bs#A#Yz*Q7aW+MGVm#7DZrR zveB5U+B9}vQ%G31nM@a8;~ojn(5gN#=nHAqEHRf&?D?^%a;P!kOQauTt~D+RAiqaP zOSt8H0EwrvB`-B&S+xjt-GIiL-b)s&bkL-&iNDb=Fzg@LEN;KCHv!qGV9HmfF=!h?v|hXc0)^oU-g>>IH)c+4{}jT{@rVJ-&mQVXUj{hz!{RLM4P? z$k5Dwx9iXdpZ5muzDwwOS-|$|YYHUT0F^rI%(LL*BH`QCD=Cr$wx>`so=iG?6D;g+ zG(3_awa%g-b3QCTjVgVF%$OeJ>qKB?nvm&bQuM{db*>~tY$)O~wyo(WJ%JWAWtcfyN zE!-gNI!Jq=><9xe3}I(m|LnwtR^;u_`^!s!Ke-b>bXJrPYanFTsNJLhDz)4BRD6DA zC2{l7ROAc@m#bc~4d8S^OEN(@h^j^(ObXQjI= z4J4xy&~*eq?$Gs$)veUZ3(7=Z$y-)rAfls7sRDH6+;+_u?$IHTr+`C(bbmkKr)#PU zc)Pg|Hp&u5q;r4Yzp7LLHzkeJ0}B3N(LyJr=)y{6$`RaEL%IMIQtL@VfObnW`$v>e zcVN>hLDK|iBM=O^c3jev_NW^Pc(5ggCC|THCnY?{8jmj?({75Eac=&ZD8~;#}>bR5QZ&3R|b?>m#1jGb75l zVgb(9xWJdTGersOmn_pAry9JQf9@+r&WzEe+W>T8LJmldT+UZ;t(=*sQX$queGVO! 
zTKZB)lZitS&5m?HXIab$1Mvl~FeV~a^Zl1l+aMkPWGzINQ3BO4%; zCE0xES5Gsh0N*pdqIy!Hn^X>7Hc4AvAvMj*=m6ZLgI+9mX#OXNYVkPk&kJd1d_M;P zoIzcP*K=X93>7Qi|Jo9?p9+c{ALip23}eakI?u_w_2M!IKSS1NQLF@X0iB!}I|#M#5D{9@%wE?Na{cg_bC{i&Wcb*TbcwcM zE-g@#Rf)g@Fdhv*QKVHb11JRLAo(KR<4Pf;gk%a!D$(gx_(Lp58p*z2*QJ6jp>aQ_ zlO86P5Kl-F8x9S5!Aah9I%9Yd?BP=j35uW9k%nL@N;6lx)aDeyNwS8E`d|YxL$&2q zDZj8l8V*^VA+lcad00?SG}z6|*BV+w!n}LM0bQ19fe9@i`)6NNxFr=9Px*6*rI%Wf z$cFo>#g+glT=&jOb`g^oC0ZWLNcNH<|Z)3fQ#;8WjMPI3_-Zja^gCm zhdzzgMk4Sozisz%eRuJIKM@Iqwlmgcvz*vK^|Zuop=`S!P>6(eyIEE$2`=1Z?d zouNF_mJW}!XkT0Ff(Cc&!eu2a5b-X*OE+Mg{SnY)2Ppef*>Ni-E#D)lrxH3VVh#Yb zVT;-`jY2j1l3+|hWb4ym?I?N6f4!rvQWv!8v<>(ZcIn zTMno4whbl}@LRIt%+^1=nSdUwXB&7EC20lO6EL8A#B=>hp{rhPk;(`2Ex>N90ea`q zUYj-$UvD3m!GH`9hg>q)+!PgO8`A+V8;hV@KbAysMWN_vxh0K~Ljc2tug*2oL!yGL z)eOpDhY~;Ayx}q+_k`!oe})K|182~N54~G%9dX=nlmr)OafcHP!0mbdLX^xBdB5pb z`kpCi#QfC&Fg^?HlQ05uix^=7$DZK8QT+uZ7r6M;!&eyHdusQur$ywDAZw&@@N>3w zDz2z7~G;5$(3x6)t^ad0w%AVAljho|+hFG|!TeS)_J%+l5f2>UvYd~ah<jjXnfvFlrqH za_`Z9vgPKf-MWZfLwZpguB7uJ&c%jDj8J=hIoGXH>TSJmwo|N%BCiR?u`cRT0Z#BF zXJ!4pDUX#TSPiYx0ty8_2U9kinnjbnXAr5?I5L0dWD!KSt^dql6T=h84pP~Cq2#1$ zXu+)?<3PUp% z-Z{+?{OCE(BksQCaXS%lly<|_VN=Xw%6CHB{{{ATF?%=;YuV)Ol<)TI+*^K4*fO*i zpzXRMgKOUFJyn-uwl>&YrooN$IXDyXoV8t@|YBsw(Yo28IPC9>FLe^>XgsbCFhoXqlF_YKH`_(8J`lG3hHHXNwBQWu@q+>v8D~_pVLRZs_NmBBM zjFmL8jWZk78O|yQ@Tx`ap01K$FA}1j`w?W93$bH>2N3d+Z+q*4D6hsb1nV3^+$qW(ho9+A4d? 
zJW9dx76gZ3va>Vr98BOxXDRI_XID{CnDQ&eD9w2c>3xK@{XnjRo+7?pVNNe~6=S;s zLPGHYJ6hK~dyY~M`}FG$C9fXCJF~I<-S+;yfKAQdB~rQ?)td2W8U?o~ha3|nH+L4J zZ8z}2Jq%C$_K${c(KtR>r0_Lz0k)>dl6}3QD746bbWPGAki1mcRu~UWu(n_IOKYs} zAo&AwEb^~=gAnAnFnpq9@CwB0q{8xFb?TQqnk0!&4|)mdP5{h2PjVZD?ZddwemS#a zCZX0j^MG+cr-Y+}ESeB1hrOhjhSOOkKdJZ(fd>QSgk&aofS7I%u^|_p!~yd4h>MR` zHElMAE{XbRxjm7lkn6Te)FM7fTqFZQttuVG!`ejmhtehKP~<6{k7&%ri^{|_X4cW0 z`N|LWmb>A28R+#mX1Kd43*rIfE_t4M5q9v-y`a6rGVuyQx55U`#z#Q`H}F{1-Kr2iN;&5I9SuPLe` zODNq6J_CI*j<9ad`6lq8@++#H4{Uu=&@4{pcI?hU!x44F%KHr_*5Wp~RVrPl-X?6b z!zqw-Gr~P0)}!V=8csKwn`~f8hvVm9sa_`AEzQX!*)JUO2v3DxB}cR%|CBMQ$m}Ym z+7i($AZS1Fju>)d8WN5hhF^!FegL>W8pOCG$LJzk#!}B|e*CFlnc9ScOyf)G)Eh_umlOSB@z4D64Cw)iV1fPapo#FlZvv@c zVb=IW4FmU{uGQ_**%zfu+&3`cA(A3Hg+n#|n}9_OwYX%Qsy9%SxChM5qw^*LvYG7^ z-1rS=xw-3;p!a_XZ#-^~+sQ2__fEj19qERGRM&K=`%o*Y`BFC&_QyMA{~FPGDVq6wc?synJ$ zqqV{ur)}YWwGiMx+r-MMobnX`tvjAK#Nsps;u9K^@JZvtvo*L;I@+b!wb&8T{L_ff zm3HEyY@ZMRLQ9~YRYgZWy$#pUI90$P{^GODdH`_3k}*@%XaZ()Qa-0{g}p&5P`eqL z$BXh*157Qhw=&QsPw&@ji|?}@*;%7sno-4I)<^|2auyJcU(?57Gw+2J%sXuFGK$6a z*x=M9LMb6P^hbVo_idi%y6axh$geF|V7|#YiGX$7yKm*kd>S_6MY-kVm7@Di67b`g z9QkYn`ka`>a)DXGTi0yaJbhv0@>nWz$g`srPUM2VlRlVR!Ne`LaXng2e3tsgdHjD6Sf3sa|DBVPb$2-7SsW4 zQFggU)qhYdkg4lYlN#GS015Cx2|{EGvfWUpCpy;qWQ4L-vu{S6L-boW1f42ayxDX@ zk>r9`zGXMylz~pFV($^wlSL#2&YY*Vv9+^s1z;QunQQQ89ySypuO^uRu-s-MiDpJ+ zb&{aj-2MTb&_@lr;FP*`JZ@ei`02UMJ#&t#aU9P#2B+suB0I`#*rzZanDiSkfH((nLdG=Qtazd8IL8*sZ53#H@KqxMCLzPTt6ek&Alybm#zCl~^)k{2FX*oJlu1?1IP>+ZHdrcecT`hJpugS+T zj{gt(bBRpemdNO#pzf^qsF{b5q_~9F_m)_Fl^J_$wy6Uuhi2B_QUq?VL^m%42 zuCiD9I}Jt-G{NPhGKgJA*l7*R8|ZB$#!uc14_=Asm!TW=5U6_XdQJg!UyEzGLndZ@ zu-(NbQ*3wPIYSc=L2~H9G-x2$KNs?dxWyi982e;@LV2FD9o@l}(iOK6VxOzv%d1xI>PkOS6lNall`b}W5g}i_#^PfBz6M?uD~zS6Yn(6D0`#k2~xgA!83NP zuXK;N4mlaULO)m{qVaaN2c_mLj+fvhS%ZnIR)e>rFwJ?E_z4IFC8&C%$afcKs@JUJ z>SIgrF}5-k7z{m?3q&P9P99{*58P_&GpEV)kzz*v+o1m*nmR)Ho`trL`2OEtmn@%c zIiz=7Sxb$dq)5!ch?R0hjcY`rU_;crgPxtKzj`%(G z{Q2ioL1>pKIo+k4-nK33)iQd%ep`wzgMy9hP5NYxtcMiPK*;`S?9 
zNPFJRpB;cz+prghFiRs!-Oi*JbAzGq;v6ZQ2;V3wqeLFh=(7>6A?^w?PuQ}x~b;5$?12CNwc(vB)nfj%n%db(3u)Hl+w%%pO(}- zmjI!2&AN7a?we!~lIJirZ04*g-SVo>mzbgq8*(7xGPoJ?%BBEdw;pF6Hbd=H+DrZK zY@d$*B`<~;H3K$!aDh0;wP3l`EhzFLxv~C3%vbheQ5!Q?wQ>zy8g{Np(a2Ik(&~6U zajpX9ugT;r-SaDD2KdfDZC|SZ(HJNx8oL}ilt_?eKCCcP_QQ8VMlU*_AYKNWinQHT zP|xlN6Kb?6L|F|+LH|W{%F4Gm3WZ_qjZr8+=rSf^NG@BL%7L^kuq7qv8ikKCi2+c~p<7R9I*=ji1&Br*y!dH~f_3Tewdi`uq3&AItW9%$4-lGqEpLuf~(< zkfB{eITBiUJ=5U?+9=}Peb=H^<>Y@aBphhNq%rNWdOzKw_st zsmsvbZQrZcJFFhoez~!O$V=34@uWT&a5mM`A!s@=J5c7GU;Amm5JIfZTgJBhgQW*h z*vdH`d@Xmx+W4G10l=P{n15Yo(ZJ^X9fIBYo8wPi?_tfLRc?@! z&@TS<#)gTfR^)Ebo%WZDW@Old*HmyQqf23Vr=$dw?g!8&K0L}^x54n_e|+LAbmsnd$Dh&vT>|FCzHX>lrGyYdqb_yoPi z2V&K<;qkCfgho$Jsxxf~ihQ4Hb`xDVky{D}_48pG?N8IH*@L+z@Sj zjH2_PmXSLoB)5ZmO&E??HZ>}p={Xqj7J<^nmUQF&VGikUM7!|Y@ph_V@ey3CvN5og z3Qtb6?VCUGJQ%`j%#Shwuz9(7RH=jdS@{hfbhewR&~}TsxOs{fDNat9htR7 z=GAcP5gahCMkQyWnHGBQnzyEn-4Qm*x>1Yssj}jz^1ORQR^6K-lqb_KShQ6LxA7&I zk>e?%xyE?LqT5ARX`k0Ta5&?s^Z)>3}jP@Mwh$+HwXfarhCGg2aNn zCi)oM5Ms^(cmdU>*iz;fhPj_;h4}m?b$p#aDXMGLbX`W7#ITJ7MrPTO%fL)*erN{U?;kZdL}x-X*n zmr~4CgPXSquub9%7;rQchR|pTeEJb41(H&MFcnHc7-zMuZqy_we>2ma{YzQx0St%o z&T;a+Rg;6Ty0+oMyT@Sfwd@9Pk@@tBSA)-YqbF(n8W`SMoluCunADc_K?j+j zAv_>LW6Mwvgj}EF``o%<=)Nsky=5Qt=X`JjW*fv0$Ti3Oe3o%XfI%oJV)8IP6@5=& zPcCLE1g|TBabx&DxYr2k?O$X>P^>8{w^D`YN#jG4?-@Iddj=$4(nsoloA4svRx5eg z3-hVkpd*g;z7)v}(1J1^36!%wjEc(E=N-_D_wuHW>sH6UGHl4!aG9-WA)kL5 zPiw{tc$+(5-4tGw>7>O!6e?U=(|%xEB&5UlN(~1P;v$3uUb}Hihpr=`EvM=`LD>l> zGM|48at{?XIYysSVZHNE{>aR)o~AE-VO*}I8rI8@)RSVe_9!KR8d7pce)e1ck((lY z!Gym_G~3AL2=;4UzXM7yH!w0$`xzDKa@%a2a%AqjQ)^E3z|7q)q$xV9JSJ*K*X>Zu z{P~}qZs?~n7h}|lI?`R{w- zq$`+-Y%wl$>9@`2yynX8dXpo1g=re(4OWi~KqFaH*pK_=SDh{7>nt^Ncke}sb={WD z*|?^UYWI?j(}WfezcN(Az_(C%8!YKJ&mq$R3-1JMj*2OkEm>gBU@2bg8+Vt z?|eN=e-EGb5@S=2bi&L>VwDIi0Vo#-2$WIN-!v=VoCTUvQ>%7H+^?7KZ#zEEe28qz z14}$aywkrA{>E0h$sxUmC0%i{k=~>?H@NPa$tuq~DMs0YFsB4b<3fS`Zq*d-xbzgH zx_BY>Mb~@D)d*p(aN+B=C%Sd;0poKy&<>m6R8UQ&m<@=$P+F$v30vJ>3^^cstKNg^ 
zh;PUN!~<4A>=W`W`pSG15(P3D&QbpwLemW8h9Ri0B5MvMTDphYO9PX4DgOM8bYT(N zYbEEr%Xciy_cfm-24lmN#8cLPmReg@f5i&oDQM&KU!iP|L_I)-w2?sTXr14jX@?j5 zmhEYKeH?DrJ{SdvOPLMyXat&7%s4e$f4O*$2!(V(Ex!Eqz+Y>q^fW3s)>pd(xp~2D zG|ZH$AB(eG&M-=Ed$YIu79weAt3W)#ZGEjtSCpFg9 zSLbtnH`XHCcIqc$zj*n*CZOo+#=kq>ML+xxz}U zlXmTcnvCZ^YvN2vB@h{p5oqxiK^g507FSL7wzDp34Ht;@Y*k%nF^uy8 zR;4UIdkhlc{Luf+h3)<=SeREGdbu?wy>HsAyhbr!yx=6|GC?MyZZGkmJS5>Db=+;dmb>)<--SYFB}90NwvAZ5L7s=DYy8iZtXKkI@NAD0d~XcT(Nu5gKETvhRDJ3E(`&GEP1B?S z;}oO7A2G#syD9D0-N9iNkyi@o-*zX~WeC7l@Rlp+`zQI3Zfh-TdBo~U^QKMf?=+MM zgFXc`$Gm`{RJ~^66!mi8qtSk$j+|2X{W_3QWtWRX%X-@U{<7PomzKHqUouM~qq0B= z1obJ%v`z$Kf&+Z<4XWj9-@{x;TYh{cZEv3V>tyGU;dR^|2$Tzr|_rW0uI(Vyj9t~bR~+?wa|v8kZ7bBW+h$@hq{NM})R$Ww$+KRnj( zsT;KbV}#LJTep_p=-!5cws4|86p+;Ka-X=+7qGpX`K0O z!dT$^+gj}m0S3MosGcNfccUyoYeY}zAoCjB{qBWXL3h4tRn=j@+oVM#aMizq; zqsggu`pI;V&)Aex5NUY8v{Iy}q@Fs_lu*DY38v)dWWoJ}r5lBO+oJbOhYO>vBYGiN zQ2_z8Hac|xmh7+*#R~}7P#5#*463t^+})8+uzLz{aodJY+y~aOY-0@nc3|cxT{LMM zbmf(-Vn~EgCk@D4G)x#92I^SfK@q#TG(IZXfF>)WR^%R3X@e=4%3EKe|2YTbwjEpEhujljTp7^Gx>NgJhx@OL z3IOWef%y6C)18J`NN(^188osFrBlF9s{%y%^dvW-rl+J}aE0V_)~d!r;gzyG zw5K)Tum`BwB_c5^*}k9M_X(sp29hyjW>^Z$SZ*Byw4br2t(`=undD9|!yx(fx;N}gO6;vi9=IRaEdG~~ z4a|nCD#d|<(lA)S4NIXiRX(}<1QR~p#nrZn0r%Cz2aVQ_;D6_GyFS}HA;QqGuO|*qzek}!I;^*V8|dBvxR%Ea0NzoO(z+B@b!VEpSlYS1UcCF_ z=~0YJ4~Qmq^`AQ?^5JtOAZhx;hkrL?53}q8#>&bFSffEAL%i_`)kjrM{g(fVLNZFW z!iIsPdThBbf7oCY8;ldIQJKZZc>{g;eZjg6y6u}?Mn!#Pxs87NRPX9k9^in{Fg5?V zQEeYOPEXg%wZigi4cG5X8wX(52}pLuo|Ckj<1&db&{DQ~N572W%@1!9u-Fvj7!rOZ zw~iYMDXKF%&Zs0&j;_wWE@G{8>>x8`hq>c_KYI+pC(l5Zofd%5YK1sx^$qs{#^akP zdj)*yUU)#&_^{L>zoE~ppAKcs;pHqzR~wSN5umD%ss+Aetf6Wz<6sOf_88pxb-zRK zL%H{i957hrK}EBi15UfOk@75rXps*A8JLBnBsJ?3|DW(23$E^Lb3Vg)n9hy#LUsfU zs?M_lB!Lbqa9B=2B~Wna5w<~-?8x;&hntuQgsb0|LswP~Qf7#q1B(OBZ5pt4 zBF+R`5KwX~Io-!G{+y91epH?>H}h@d+`*n@gX;Bx%|ms~wdT<9V=Muv7H1P1+lt^@ z=L~S$P|n~O@T!`el-{O66gV&F-j8ZpiRT zHHnrRmI%-YH;&63!hAFqAQLnIEFnPu(HOytpSeS@dC*(|C&ERYd;#)$6;$5n|MGYn 
zgX1fDr5IElI4tIG!IH^JYQpNv`qJ)VjRBHezxw#mad(5Bcvoy+gu4$N8v_u+K;*tV zlnkCA+3_JmNjlrdoz*Fk!|moy&huX8@)01#l-DA{p9fjpRqR5gr9kH2>i$X({$V@4 zK4CnUZglox&J8q9mL9j~(^sN5?TUvPyZ>k$Bu@L@$JrM>Ha?a8g0kj9+zZp)P*cxj zuj4UfZN!=lh+&v_avO&nS)7?cb;_%%)$#%Cg*Xs#G!IKLC%5)k#TK#Q^a{md36;c! zCJ{p@^l3`H))JWWViRF%lktu(-ZRCASOQ?qd?9P3#UancHX2)LkZzh^SfLp)vOm3r zy;w#$kOt}BYbhGU^i0+0N${?o~ zbCf~DwLcGsw!kvVWwgXrhsr935|M8Y4u*lIh5o6Z9$XiN2?TFIFV7kRU|z2TNDLe* zZVzHj8HzXw33axtbRPaNA?I`Qc1ZaK?lfr6l0Bp5Au?T1;qEE4NPdmiMH_FV<~-lx zp4YvB*wCwOq*mgbi_(LYAH-xkO1b&1&Ohx^g*y$PR-7(-4;59qU|pH8CuNkiVRf6d zI{j*YSMdf36{ud~P3(HaqTpy@M!|M~Bq%H0I4}G_GooIc*F2&Uz#00)>HA7E%LxT+ z?>d0k&%o|hZ>f2S+KXEIJ^9zqxD}?S#OZ?K{d@ky*2KFS&2R&x5$R zlHqjgC}bk&?J*Xp>k3))iJ27CzD=_ffSV10XOa1oK6?ciD{%B@1lr~JR&5=UUVfGY z&24Mdz6=pIeb;^O_Z7_9j#JM|L1aV^i{JrG{KVrO1bBT%6?lS`&#BGYH>oIyX>{tp zws7c1vI$S7@VUo3;ji-iVyp?|r$FMHuX>zovRhhL03juM)rhS@HD(&d)@MQCrTZjv ztU76GyJ|9lmIv{(D=B{(3TZQ|?h8-qXaC$+mPTGEvwfUejB1Rlg)SYo)UxFOTZ-f> z8(Ew!vN|sjQs61~4pdnQgkg*FHcQC0bK+7}rU#iUU#m8R3{s2Id0wQh00 zBA;sCmk=KnsI!RfrXH+gS-0>i#*xvseZ)MZI2gU<@-*e@aLm8*XWT;yYBGBjm8k}thpzFSpy)a& zyBNnPV0k9gX~Lm8SSkqRM*jJjMGg;=-8|Sj6W&&bib3gH$j4fQ{t)cGyzTzC3SA9qGN!x6!#|YPu{V znG+WXUfeF9;7jH|T&diJCMPh)uI?4z@d8BRFqDj(j8?WkT&J9X+G7O1o0RS7zSI1< z)U78bm5ebF=H%66@=thmBhuOyB_Ou666nx~yVwx>BNJN&`p~5{9wSW_sx*vXCB5 zde{vrZ=U7keQ1|`F!`7=rUmO-Trcpn2j)NW4xPY;nfIMSHEw;6mQ%}RMYnt<;m2RX z!#B7)HlONB)MI@?3M$j4&=aJ#6#=LW+%hx>>G9p1t{{{S04+za`LGrra6%(7!rstZ z1F$i>vrgSu$PZN_Hxl?cKqoXerg05V$p(z;O-=TxdWK&B%D5yYvH;M+xXf6TY6ZQs z{IUNLBis>gHjs_LMF>+;k5|a9uUY=N8=1r;8%I5+-50&w6KS}rHxw2m*aKfTsyhmC zoM&6;?eF-n$%8(L&zEE0x!9Fza-o4h@lahdfUgV*f?0;l9z3H*$D}jcropTfMS36@J|?gQM!Goyk$c#c&^JEmd~rlPSRMlMd|xemzMiru8mBOcfG} zX7(L4lDy0OGH-NIkrMR|KNfNj1T8%BQP8JBxl^jr0uqh5l3P-m2>Lwbv2SU>ji78{ z$nf)bO%LS;8L*Z&^9z=xl3GT5>*&K_LYU@c+I@Y_K{T`KpU_J4FawD!4rJC#_e9$q zx!*3Ip3iNLZGxuaVN`x9MujB-UXK$bUMH7riVGzwW_$YIL&41ea&0uU5#jv zZ=4Cm#D}UmQ!sMaFO9?-r^R}S@?6<$yP#Ll*gUPou-tLa@ozILYD16Y(b1XBa3Rko 
z&lwp1bcw1lu=aIZ=z|Az?hno22Oa)Fq=OuqS06(Byz|(ArZtV&Rb)6dWrxYQYo>`Tk$=;rUKj!f%OA8J0x)Z30igb z_$wY!a#{1^gd3yrVe=ywcpT@*=*a_VlMCC_@Ig94z-@-{Nve?Ym|f23@$c(ylI;`j z=EGnc20B`L@j{ZH-Zo@eFLqxN$@~rObf3EHAPNBVWOA-XT82oF5kJ=J*$%~>bO%tv ze#0!Bf`8!8oqtqEbc$?+sH)s~W&k%otYWe1w-jGUG;?>|AdQ4T0E{8&>{y<7A3O2GkqcU-wzH!hv2w_T|bDR1#|fm&>n+~ zgX%j`*j2>$0Tfg;(?usGI7`yUWI9y?I`4l06pUGsbS=E9*inDbtpw3il3dz{C8Cl}jbhTQe|yD%u9c8${m+PvKVXB+QSvg~(QzSE9DJU#Vrfte#TUJ`I1AT$ zcrd(Ni)m?L_enTCV`p(6vnG0#qFWLP5Kn9$t?!=CxujKqBnfPs-iY&}Bb;F&sE??} z20ALxB%g;fzi2p2Ja;q0PH5RimcOKgU->b%Z#t;j94{=OKV)d&O1>wVM+w>+IY1p%seuE1JITyv z8GCB2ag2JkAscJQRg`Ni0UOk7ZvG2nhgJ|mqZwSujwP_Tx$(<&pVj!X30R&n`RHCK>A#vQ6+)#i_#~E>^Uq7^{#i*aTrA zDQO_;;SJdW^*V32(4MitUZb_uhPh0Y9G8+FS@p5fkM6c*D5mF z5yqTo*Ayvy+taYn36?1AO%8%nt?j(O+MKsBwPLuT1GYgVH^S=FwkXeIMVB9mM_eJ4 zUPftIxO@p?wEX{x4UaCe!)UkL54G`}Gb685%!el?ElueC-sCAJnZoO-e%n&lrJ7b9 z?G9nSB3sSTsf08%D=Wr@+S|#M%J1{*&&#|ys(4O0c8hW>YI;^`$n)IJ+{ReVw%_-6 zgzgG0iI_0%>6-1PIfBdERV1j{P)4RuMECvxF*H({2$j&8g<{|3`BLu0)sQ#JSWIMr zY{X+m#?^@W_~gUm^!KfG3}~+bo16^7lAT^6SO&|ZU*c+#z$$}Myeu?lpO67qoN-=- zos>$&c%ta^w)}N2CP+;GbJy3NWkq%3lwar2x7cUs;*IFT`xmZrgg%Hiuqf~y!?chGb&Ki6=Jas?&3u;&b5nfFrwM|Hy}VXC`ltenb{sElKcAb*4d0?S5fV}yPINV8 z^s3*l_V(q_{O-3m<hIReGmY+8|2h$bsBm8iykV`M0fC8 z-H4#{H4+vzn(^?@L(0Uoez4ocPVtj{xdf_xO5Mrsus(y6@w$KkJbmY$lVTLh04+I7 zO-F&eAV4jFK75mFvpTu zNtNp?jG(<_mmz!5lche5PU~76#}n~7tn%!Lu?$_U1CuCyNf$!9 zT<}i)0iG`rfmUar|H#6ypNZe8&0iZ*fwP!`GQIt@3|2VlsWc@&gl=KaA1Ht$UIXJ# zUve=c0TE4?mO*>!fK|XV)n#Bg-gKj7c$zlj%j~Et38U5oDIJd&I?tIapXs-lF-Qb( z^~{?)qG@5K5!-1$WuxW+&HIqvllnP4OP$q9+#G0S{TO$mb%t`^@YB{S=s6< z(j9%HR|#I-RE+=Z6wL^Gf*IyjLC6+wWxal%BzS8EdtX$tb`I;lTDmY*i=vx%S9fek zeC8@RoHE-d>=~nJ*$YD$g35t(ncLQKBE{jw%R3hOVDMlwG8~?%LHVK*z^mtw^}#G7 z#BFf*tn7=p@}g0=7efg4BcD7FcY}w0NFu)Mi}hM+Ga`kF#gm*Hg7qo>kU@FYtTDB! z{~O5h2m|=Ln~S@LrP4u_=mDK93q_#S6kN0i_et?-dcYAay;A$i9+ikqW`+~Gyp%i= z2^1TwkPT{u+;ml`L{bB1WdsUAM_rt@((ENj);cD)^3ll0{x&wmSX?in7HfoFhc3V? 
zIa~UqPJl9fkD(o^HJ{$oKi9&nyyr^I*o-89tLg1;k9+-XfQ-~x{qji*SeCgBy^2k{ zlvu!F5$Ho-N(NHps;O`l+9nCglmq08XBlVq^P^ef!oIbZ#?`HWg4`G5P8bzgd&Kf( z_H99@Sv?e#nbjF{W40VB3U7pw^*B@}ba$O1b<;_%aIicaq!*=&2eYUM3eA4|8$c;lY8zS)2+3@S3$# zJ!goV7;^cjDIY|HcOi;e3uDb4Ui0J08pBWlAT!c(UxW)${c=<^W>0NH@Ig4+5nu*` zOM3r+nyTQwcN9taY?n^ut1RC*#<*suF+sw)=bIW_-R03e@l6~SV*1Q zU}{2a%lI~+1=gf@(#1Zb31-3X0)Wzg}>|EGXQsd6=v>ws)IxfBBWG*z*yjt_&x)}Di3~C zV;x6xj___zRxFlXR=A)oJLXsH1x>**!p4$Iq(6V3)U#Z40y*>tZCX*D5MnS*Gn^1o z#4R~UEr>+z5QMsbalBpR z-Q$`uuU;q^8{HqR*}5lPl*Qz30l_S-*y#$u+{wWuBenX|7{c-2ZYhu;;hU@wjFQ`H zpk+4=lWH}D!$0aaKhgIfo()%Yd`NAC|Hj#1C~mhLg~}5Avj#>wfW+*Jw5i$ThGARS zR@wg_ayBv?{vlQjm9lxHtM%`sJvg+7t_@lW?a6l`jMjC|qvRH*8v$q!<1)b?*?E#p zP=ynZ0%@r%w`D(K9@u3m1x|>g`g~ok!Nte%4~;1Cr(Z$A4ULvoU%a{pO-s=Uu5kpUfNJ|OG}Kh*vN(GbLxUC1%p&?}0=#i)=M}_-D07t8 z!Ers=q9wp$RY^4zpkPF@N!E|J-#LgMt9xM&gWAOd#H8Fs7jDF#b_|}!DxnRRP zF{(k)$~dJk09@U?*C~dJmUp8mXltZC2n>OezZ!PIm`8>p1?Of!@!5bG{DE#(dG@Po>$_gjsbh~%UHtWH;2=c#( zI;Q$4s!a>87z`H{IykT&Ok|3OTP;XK%b_z-hl~}bTsin}05G*M*lxMEuMoHsBCgWr zp!hqt6kT9)WzH!R#(E%W()p|tGcyVh9e$Mh{W-X3c1vhHk?QsBoQ47Sq9<&;idzG2{rWt#AiB`{osO)uW|zyALHi>yMtn9H=`Ugwj4KV zr|J5W8cTH^zJ<|0RB(=#fg3-r_viOqTc|rpe-!JiIP&%cc@B6qsu)`Rfd>V~jy!KG zxi3kXcc@MjOoLt58S{q8?K>!-@}wzTzE`!C@kfa*8!JCz3k&Nk?|pOur5^QUJC!v! zkF5``y%Xu?*TR@*vJ?}WIsDj&T8|qqeB@fW^O++|Bv^x_@KV15YX41+5&5vKDWwH= zdH+BmA8T&eYWDoPNGnk)OS8X5hn0@p5L=dX7jxb*&1l1qlVBDXGrz;-4*Ag>E2>*? 
zmSqP+CtP~<$K4LO;^ousQ>m>BQ8mBO=M|Y(mMmE}YPn&#^dkmy z@;A=mHiHUZ-bpJHr*h=8Z_O&;xTveWer121{n8uvWq9BaZkaasSFTFnTs*HbC&yAY z^sqt6e(*0`A@|2H$OvVkfwEmuYBf9|E6^d|x7?0%;zVs)cH4(AJA7>9y=Q*S04s%+ zc5IU|j`fH+i^XZ8;p&Oor=%;Z3$kGWgG)$kyQUwdK{i1_W4or z@Cq2SO6p@3UR^=I;><>qWYb$1yT4cQ?@z% z0eNR(JW}0^&kbQw@*Pm3B}<=R@?}w8@&A7TOiAktc{a*-`(l8D!JEhe)Eu_3Aeq#) zlGxKAJA&hQX5c8-RcTNO$>)8Kj#0j9wsSgyFrI;tLgQ6m!UUxV9=gAA|>4!1OD9geIWgy1#E0;ULvYFKr;+T1~Uh)r0{a!%3It&Q25J#%Vgon?0)ZI{{ht z_89S`qt1&Qi%|B78%37spT`B=(v7!a*;kXq4ZZcU+8ON(ZEa<(Z$E2p^q*ZC8~hb- z6q3I7qN$y|)&xuhwf3v8OBOPVlhpb<1)63_UZ)l>3X9`|+wOE?XE@W&ja54eVZrCW$vg{QJv)6w|I%kyyfrGB%HLeZL0w{bwepY0v2 z!8r+XF37>-Ax7BP{tp~rBr&SbHxnpJFo<)?c&1=YgbGeloQkx;w?eo*R+B-Vk1yR2 zt4uT<-pup1-xo_{r0v!1TUt;->vP_N&s3kEpl)}X9RoxiJ=%<)f|R!y!s9P)hQj9n zE*2Mj0e)GGg>TNknCO-w6(8MgnuHwG`N&5&VD|Zoiy;wR|Df&Q5feY5A2WZ(ab?>(mab)^n_wad-1U(pIoaHVa7usr}k; zzA8InO;m&@GQ7NwH4Kg@wiD=20m@!&xCH{(<6MPLwkGwv_YzqCK~crt_lys5Ots-sN-;SLJN!Bt++?9$3|p3Tp|B#HmE#b2UUHDI;#4Uh63;pak!Uf0nwQa@;~`jkHpY&mUi+#uY}`y)}MRP!Fll={B(JD z2AA^YU|U@|N^i}tG+pxqfDhTH=u2Cet4C_Jf+y<^y*-QT@&fyKgF-Oc%<v8TO`uZ5m%6!9P{gDv{cBFF#N;hgm_iBz=Mt(lNLP^_MFmk>5}q&y1`KRJ z;mnyY;v*Y^-dgYbpadUw-vAhtSqaU34>nwLYAz)FBL3Sj9Ce}Y zkmUm)r&swE)V**(qlO35CQ5DU#Rf>t!R8(t_rsF0{2QKyiZ5DfsTz#m2F!pn3y8;W z`$f6azcs`3@t zcCu-zvW9KT>qSZi!9fxjH8zTSN}8|rykfXz5Z zXJvm*MdZAdlF)0VMagX_yv07}fYrIq^aJYVf1yup1y%W2*-v`e`Up>oL<2gfxN4dm z-6Mv8rzOdeQz^=)kFtNejNJZF1$fdo4?sB3ar{b+q|d99MhUu_H3+Da4b0Z{_W8U< z`mRASS;d1XzLCDnS;5)H7T|9ugAgK?TziGfBD2S=R)kE-v~jSADl%ED`hk=?6&;YL z5fBykpw7)iJ@|munB7^(_Z zoq+0gC?p7sZ`ly=IV)aCvPwi&mC6ZrhR)`a_mcyPz(s(9?yGmQ%aj;v02|d^&tWK={96zDQ9Y zCvTD$Nr+C@l+`CkGN5j$3*34Tn*n4sCDv2JJWJZVSQyc6G#sjM-BU)B9r;zb;kr+m zCje##au39fb8M%lK8~^D?vunp z_G8!qOuQT)DOv}J3}QBH)6-5vt@~aktg(3#T=PcyiD{edI+vEux}s^xz+1Kx0>JF% zWe@#k9Q?*eEiHa;5?*zt5dQ;qD+FC4sC|UXfx_^ih0lg_)%*10h)G8XqB1UywEzY; zx#skItXCyLTc_Wm4ZU_BtPhs=tLZd(Ns=SA(A)F=vfSis;ABuqiB8-BlJ2wA-z&E6 zA(i{9F(&JIU_fdE=%#RvMj>{^sy#oISE>7o6FxP6mEIPidc+)Pt1Dv--6Tn2l=kn0 
z0=dX{(~f;{I51!gA+-d@=aeA*yHR;+s>1?1?%#0F{0BxFud?J|$}H4MC9SPfz#s1x zOlt1*0-+r#U%Ppd5?QgL=;9+`B*M*e5QRN4VXfX*gg-aOygLVUHoW*X2r&Uqn&}P+|8Kwz0WZeF8_)5HfO^ky!4E>+lcd; z^iwrb8}N68hrN4(XN#b=2-uKteLWgi#Mn5raAO=srue-;{N_(sZ+M(&il=4D>T~Io z7_}gdVQ)k12sR{M&2Rb0G*9vu%D?n(Ejb99E{G!OX|>goCb@kQkyM4HdVTCA?;NH3 zitI^V#g3Q$qcc?p5ClL-ps1ymX=2?i0>~B!m-*&r=t<;bpS3x;h+${dry&YllX)%~ z(_cC*i0J&ZkmknNBOy-qDVSKny7y*DP>XUuP=?&*E+l5SWE0nH#pSbNZAB zinwhABI>u}wIeYNblmp($w>oJOcT<4+06A1r#=8|a9ssbVh)bq#mE%AyD*TV|HBuy;3a4%fH>ATB?W-#`k0Z#3?F~rCS{702DSOqg*}Sf zO;f*1f#=e6R_iG8qhM`Vd)xWNrlAq^UC0N&*)Fq=3DZ|%i8(9Fh{U&8IBd7pmq@!M}4)v~V>pj=%IC0kR8<_H%D$=E`Fo93Eab`Dj@-0%MWqwgo4 zi4Sq~ouj3D@8WrmY}oD{D9)n{rh^F#@l|>9wU>(`^pY^1?QPVRwIVEvE9Q_BbTT0h zVtab*O+=DeOlLTw&K%q_pe6yaI#)H`lUVJ) zYq&sAuB$&rl2mt5H%Z*BrDzf!Mr*)5MK_M&n+?JmdkWRTJB5g)m64hCPt^AK;_ zu12Y?2LOS&c?Oc{Z2q;$oLtY?_iCHgYBrK7pL<0t36KAw4<;gqkkYVfa}HRwLL`K!bmhy4pqa;0qOcw{>ws zk~C-N#K`b>_(iX(A^CvzXFtKJF%!OzQ_AG^4gKN$Xh5QR#b1^5te@zbnOYqRbCsMJ zL+o7@cGq^^!R|f_w0_V#c^szja8W4-8|(LZ(xL#U2XXI}32U~MdM~t}>LO4?K5#Ix zJ&B0p6?6xQeC4K}bZz0oElC5}G(;ZBG8F1YO>v+JKZy(-mR6 z9Uk9AFzG?7T`!q&>;z%*Ws`J#KknH)U;0YDi62rC46v~xC0u~1I=*-4+n;`w$7eU*{Z6&Xvj~l*SYI9ho3+`-6OqrnWh1C!_-P;a%6d;nk5(T~36oWyie#v3`j zt|K$_36La0>ouaCTv#dgzf%kchUIcN*6-`{$8sim2T01TOZ}*UZZI*^9J5U4;(K@N zVoQ$E%G|*zEcm5n2%MY%UPJV%PHS-mupCbX!RBUy^_)$1R{zoM_y9pbzQ3!Vd!&^C zjiUgqGbtaX>h2{rY&gz6x8HXNRP&N2&08$oM!zkNul;;=c-wGYru@? 
zT2rn3gN{`w->8*;;Gv9pm0}FN-u-gG39OUB7NGf17Q(F*0z_ev{;RM|x~mFIjfvIq ziZfc8LfwZy2jSH?v=Ap<3N0aOTwrh-6nsns3-duENiQDaw)=RTsPJDGrBrGN!H2%) z<0`IdIKe8|Q6&aJK@VYDV&1{FXCp2IgZl8{OcP!M;s#=gO^y6WT4tz~?JG_(-gS`Vzy3Sp9lOb6YN5JLY_{VkC zud-)Db>J=b$E`^DH$VLp6uMfrfAvaolZM{v!m**Rz=cvJduuf)8gz$=@TDZ)ZAZ(G zyd;X*NO^+yAa?=_vYXiYKhKF~byq!%m=Pm^633EcwCaXFnR+Hs?sVy$34l_rn)YVyko&w}awMY#D>P z;Gqf(-O=^s+d&f}9h&EnXuIO%%cMaU-u|3^R52pnjn*-}0_Y{*Kl@yG4fsgvR;VGH z0-@m3j{)0TJT6!4n`@Kq={dStP;?ZhjMUiYa>=NzLeuuq!!1q=cEoIZ{=x^Pl6bjQ zLdLPeaVA7BK5s?^_s^%J`5ceJT4PZ~tyqvowS$>q04&Sw!+;ZtH_88G4eV1S_ja@x zB!$RmKocdnv9os*kwC!%Aevo9dFwwkk1j_?Lp6{t87vx=Uv<#UB?Iz-piY0 zA3NoV)1wp)ISHB?PEt{=-|U%xw`m=U;Lr@wwXp@5F~yCK-w7E)CGFQNMTppM9FJkW zTfgRtzOvEpU!)EgNXq{!InPhB%#s_ymPX-rH}XOH)5;3gznvCl14E+>(WP<{1orNo zuf)6gsgE~J!CKxDH+p`!v8@=@j=l=}t+L*_oCq#5zE8yE)?jTtO)rIk3S8nQ?jzN_ zL9-4R{$)g94YSKxgbgUzh#yEwMEhKOZ&vUa5{6;^ASMgD?iOFZ`~f7s9D-m13UWt< z<#}AFtz!cx4KZC{PZ?$v=h2)JO*`ioV5y2TbEcK2g>9YGn&03!eSPnGP!{mPC+X(E zFbl6N^||sa0?Bi+Uid|tI?KGt?Z$43Gnd63_OI$V5*dSvhnppNwF!D5(_8`0!Q0pH z^%o4|w!M7ipTc<3dfi!PSOX4Tw}|rNlrhq&YmGfxz^qpCuOQ*If{SqnymIm~!6?Du=%ll+nb2!A`8gwabW;|4ljU75dYDeAlWaFdHKcRtJdIMO<58yqsBQxS{{DN zByt=uOLRvFyLvB-PVj5U@6Pw#SuwnV{bUyec_A5FR(n)76g=WFPo7RQ>eW48QRd!Z zaB#xq4fo4kvA}TXJ#N*Hzf3F>cQ#NF&p1}xAygXF}R?(}5ShK7^zH!m8Z#lp)FWULQN z;TG_YD=6X?DJmPDrt7Y@wrND$r82o}V$gu0MO7gn1{6UYJ<<$Vzi@XdL*FF*6fy3aN1Y1K}Smw2Qesx`YWOzd4sr`0(uvv(tLtH}E!`m!$C z5uzWfip>>aG4m_WQ?34f8&{YdyLt286dJfJ?#C~hH576ZLJKTF*1TE8z3Vcb;lZiK_@k{=Unr2sjdEPdeuNHy}4r=1qYl_v%Yk;Pq{H^SumA$x>T z5(xEKNmz?a;4?d2(X-KNp3E#Aq*e4rM0y8Y<~f^t7G*by@{BT^c@qV>E(8AIX}COk z;2M@uE$l;s2Bg@-6^v&n6lWmB#e}&p4J$n0u|5e~MV#e>7O1QpMQ4HEz1o(rB}4D7 z0iLnM+8+PCj^TCF8C;G|iSi1v;AaJ`(rfkf*;`(-9Kry6i|4QLFV5d({TUFQ=TA&( z1MHZHp02a5M}2Z9#a2H1|KaJdOfi&BG~vebd%p3KCb=J$-wuv5`;In`25aq8;_Xg1 zS27aVZHyx3;6T{J_bjurioB5rx~z0zmDXkeoBt2hRy8!aD(HS7Q$!^`br<<(}sK6DXypm zq{AE{*fu;7e63p1b{|U}==(UzMml7X9lZ2<+;Q1uutn)T#d{uC2l|^ZFLgazP<18o{OrMLp@~{{mZ6#MXdj#> 
zF^e=R_YZ#ERDu_^aL63E?1FMhvWbbh(+S>kwi0#ZO*w;1ie4Q3LX5{=p>xe`MsEyh zlG>gr$c5HHt1E{&?E#|HbQg=68c4V+%UTtM>>Ga`UI71Td|hCM%-nDf_0cOngUbd` zSQ+tlWUT7rKc_wqcj84}t$4kb}l0_YmMzU{G4JI=hS7r(cNB!ftHAFxAVk#gJ3kY6iwnk33OxG`+73LyP7Hh zzjH5VBQ3FnsKZMeWWZkND6MWU%oZ@~G9hy}bk}~+ceTy8WK<4}-{(wlW<=e2&7}~w z6PPU{C=gjp%2xGhYZ-5pqeQ*9i#?XJr;I}RYK>Vs;B3JL$Cn}vPODJADCH*d$bMI7 zz`|Ka?FmHsal_+$4BB0;&K(}N&L6ZSJ_gu;Xt_afWVEV{ z47AG~q7U9NCe>jR>FDO}RyFWuE6b420l}n9jmFwMs|Ahjfl9Gr0kLR$T7MMMUw`=K zaR;jJcZ`1jlptFeLd{xMHgM-4pHC0C?Kk zKOc%N!mmyQ)68Ty=+oYatTONqIN1V-0=bhB+12N4VrZJ1aouhtJs|@OwbdA%GnxLZ6aMmX%+RTy``A-rZ3=QLsy1s?Gs zsL9@}y0$Y8V0`X>S`>*uAV&YKiJWpAK4+iuDi7~?OBbL>Ofo!@?vywVYx5>Zy`;;LLbzN@?H^MjKd0x!GdfkM{Ca^(Ix-_yTGx&|9~I#DWxseCnvU1833*NayQktGNGbx6WE&KjHNK zGa;A7<=XCw>G7M@B0bnS@e5$q z=s6+7ETb;)Ih4@;F8$wd2rQ0D1^vTofr7U9m&a@qm^$}^(~r3DmJAz}14hV}d=n$9 zTgDb0$^O(BWjEr_cLrKKVfXD zrxF)?GTj!t<&w9j)nC#IaWd7zA@_tqgFpY0e~V?sS9sp5>Xke@{j=tLeO5l$P4aeF zutk3~e#Vo6L)#vedUTTXuOmTqVSnv778)fEPNJQKyd%sJgH-9$U-(yYE9p4;RU}Ra zeU~-Yw=b=o7+VJjmf(DS;m}gjFTOj%h^hWvCF#?*N$=c!2Yd7JWf07%pE5y7+yup= zvV3)U;u0w^d#Ev-R2=k+VXK%{wKS_$YLWh1thlMQAcrmPxsd`Z!x3<4gG~Tb{=ny` z;0+<5C7u^nmq6}AAst4x$!syW#6*~46dMRObDE5WoRWNpsHsBv&a~;W9UzUKiwmO$ z2qBsMTILFXO<%SwFt?VLidkXFD^)UqQQ6U;66KdDAN4nwFUu0M%J}iD#CMHEmZvTA zJefKb;y#7Y=PSJWBEK2}@;4&0?ZS^@k)~Y0YaD6?HXSq|HE2R@)m$iClfMwSOwu9s zwj<5O*vswpy1aFSzo?Q1-J%1>8&ie6JV`Y@eNnikTO$v#a#1Rb^gh0MdFo)>t36Bt4i(3WTh zTQdft#Hq^uuMYCJ5B$FtLr{?3Zf4cIy|%}a)*5#pc1lqCs)N?-A>3>GXWkE|Ih`M| zO5c|()%*xQ_(iltb)66|9Z5@D!uB0TPfTo>c3vq0gM96)VmyZ-k0kz4pXkkfi}&2| zQSy!MPo$uT>{)C%3InQg$}I5fMd_R~=@KDC!n-{y@k9mN!v&!pd#MAGxP$f8N&IU9yHQ`Hfi)t@*;M5)faIycm4YBm|ndxV^% zQ(Zr^Gy=6Juvb@u5; znJGm1jMj#{joHRH_7jVESC(Q9J(G7H)+ zK5uaT%ncMkyP3qP&TZ(I#m(4Qo!&_E*KqS;oTKMdYQ&mecG5~Z2qX-h^S3uMK)n^_VoMr-?++x zAI78ZhJqPA44<@{%cv6Vg^-Z!xN;lqzNzRpi5n7aN2lE7ma$uZo)>vM7r?zO4*W$v z9s8Ex=MKOCe?X}A+GeoYN!Iv~arv(_Ev>e-Hm<%yoa&p!VeDY_AO5M987#$Hl;B8= zs;`REXQWY9T(<>vxTlSFmxBhd$yICryc-=){&s(<0YYV$y81t;5diJN@t@cyW;#y3 
zG)2j3p;}UVfvyyG)4lH|@jA)Q-Dg}^Nu&v>LT185jq-N~W7K{7oLf({PNy_T6Dd-f z>cXX5ZS(sJ3l!qY&w%hjJ?vaLH;)m~iHJTsxd9v>Z>DLA`pth*pr}9|QBmk1O#0~h zu(c57G~rSM3$73bb37J6zcDq1b-+9aVLjKLxhdcD`0fLU4;tAFW5D0and zZmq1{$nsiv#F>t(pr}i>o`a}IturA6jv5cDBanJb8ppX_S$`ZYNady#Gs-c*iWBu> zlxVw^=$F5gsq3=u_GLMh-M$vZGU`jjNjngNH->(0YkM=d3dwJn-tGR-NqS^8_@>o? zc!O$*?}SBiq%TmWqT3B;4!I`mj1#q9DLK-U^o2rhe%$WhwiEcq#jkZMcXCKrirEpL zxXh11bZ34fT(W?&KNIb=QuHf>{!9sk{Ews24~4QW0q(~Qgo4J@0B6tKhc~Px+UWUFGkkZyNZbYzby0qOPXQ zm#bdSIO0wroCQBsX}02cV@m;=4>AL3x+t71aGg?&i8O3dCA&}!TCU@cocW>(#ju1x z`HFU1NbdsqfvnT!*7bVyF}H~ZOOtlJ#pcNQgxXhX7m3AnlqmK%g(D zAkm@nqM7%bdY%#|KbiE}QtxKZ2yt_DBksnUIAy+Hx<=%*lIkmf3srkLBm%TuLxNVL z5U^Ww_2ntcDTzEcW{1-d^faX!S-!RsQSNHGysSU%R3+Eh`@>ov?XB^V|h z8AGP4eY*d0v@#MWkAwAJOUnZ6(8yk~Fh&Qv$MON9;~CT?YlJ{s?&~szQI3T{ykFnT zi9OA5J2)JYc5NApHwS^mtIIhrHT84^8K%O#W2R-y(ShqVI2~a;WqXnY;GTs^sWw?a zzuY|1PWTrE%`8O!^x?CmTIN)mesC^>N5go=A)t83mRynL%&0gEM`0kElB;};$@n+U z4+v^@Jzb)VV$LOianTezOLzG5VBh1-s*DTKlTQpYK@z`-U5fw=dqN{`LAO$2w&~XF zhfT=jjKzsEJ46Rvt&S9ZQem?_?tILKt<6-@YW-p@7{CNq%VYk!U+h-Q>_Tj6T3Q@T zqm>#0U*lkVs2o==GfL8AX80c6zyhrY4;PF1)Kh<^r>1!#-fC6v2t$QI-4SqVofOuix=Lj{_6k#{rPk_(8%Z<(aBd=8Ad;uys-tZ`y>OQhzBtWCWw(XP+W;w zuqfux7c%8_Nt*}~PTSvqy6M;-XATSY{y_6 z@&F!&$kE)e8^k_GSK_btD+Gryw^l;7(&w*Xu0>OAx%-I9K8&v+wsIqjgC&uhQU{if zhm-E5Lhv;v$zI(U1+eayE2FbfkizMYjzkc7$ACB7f1j8*{fu!>fbJ-cR3QMSTZyC) z$dVw0m7I5BJ}}}^taV{xrO0BwX0ZeC)PKd`=(FNt@>k$}5K8fY#bcz645|%`)D`Y@ z!y#a=ud76qrar){X|9sY-kX8>>6spI>+gvy{lbMhd_f|S&b+H3tW|nDmB22KB5uX~ z`m`HGr-9U`yKIocXTrfgGG6#{; zFhK`6Z98I+CC$6r1UcLtiLfET6$t@c8t^yR2C&@=%L;3w_UqN7^-Wyw~`&@wV!gw zJX9Rpq&ZMG`fVMgo8ahTqLz1ekNJlNNr|WLlVa*5o?H$JHMV2xE?a}Hg0$wTzbKYJ z&S(t7Zu`NoRUOj(#b)>wxMt;SJ-s9{p|j^)xqR#uO%n0_r4P<(gnzEbyFh$9=4O&q z{QlM`{*M^5^r1zg$rGQCAUgal$hPSYN^?O&7Fl4azT6_1P{>J0pSQ`Y6OpUx9FvM46T!;Kb-;TEAiIV|{P{Pv zY9MdL5yv*+vl8$_9#o}=BI*#n7WZYUVz%UR9@)Cx*jhwV^UuY0_!Y6fVy9JnvneKd zq}AK=9%`3pFSi#8DffZm<%Ww_stC%T5-|;V*Zg70jwnWx{mC>+GltOU8n*m9&kW!| 
z*1B0YS2;V)oIrPRvR3+BI;zHfUbctL2yff+xH!Xt(hV0B7?NN(ggNK_yVyp?*dQ<@ z7!~TkuMh^k9$YGBihuKn4fL* z36UvoMqc9g@ze!oo3h+^e_ltUQ`|7*);>pM=JnPSgTao!-+8ks?63dL1_4G}sL@Gj zdcoRCo6C^GUlEOS^q>CeZWWzvc_C{j{Ajc7HdS~f?e>4PA>6OUW=MQAFJ{!{=p#4| zQv$EVkqiqH@k;Cpsbe$491w07UU61IL~R8mlUq|(0lvp0KYd!0#6x7}e06(J%A=vh z>NXGK9DnzZ=^E5S_jQA2%a`MzOx8sYiTJJ+bE2wn*NAaBeAHib_SFNxFQ0Xc(WnCO z`M^pFxHu8OZ+vO);ELhaRB}bKj@mivlLKzMXqbSXfn7PoPbA1qCZdt~ofgv|>F+De z1uEiqx>o;H~boU#w7FA-qIXL`89erby`ctio< z@aEv2V-m!M5-BW_EH%s-!!vbIF$y?!6h~cogQ3Rwt^>m7#Q;Ua$GyW~wF7kqkv=&% z;u;PT(JwByFqPUnn!F{4uZ?mv_875E1l7hS;AWt{0EOD?5asWd z`kDkoq@PkHA6DYQ%iB3%=}&iy%mwnc$D?(Hp19TNmd}4cgr(zB?fIhbEv+>9a!}8W zc72HkAO0+~E_nCtgTV^;$VO}2oT<*v4s--qWeSvgrjXxIkZ0SwY0JHB?@L)@L>H*J z@+4qM_t0C$tb}-Tt~dw2k5Ir#M#i!Yx~dQ-?oaH18gMNJot6_FP~sh}lT~xtb1QvP zPK(ec$Y6>InlRQ@!6lWBzWPQxB=?a+Aw^jCNSo4L_YjF39liYn8U6zx>Dt4-Rrj?~ z;EonWhfkqSVe*gtU>7hS*Y`Wpm0kJx<$YO+AKE#Aq2}nrbBhn;gQJVLYyKR`;7l+%a!LzOam@Z zkWH&htOoZ0G&`=OHXA?|dMeYY8wEff!IM&$+>`!=HbASy&!=>;P-2!Ns6o4XisiM1 z>Y7J|={kTFV^OAkP5eW?7SuRwX1E$1=R3&o!|u;E?etWRlZUO08q<_@D6!h5qv);d z`NB%Ef{WFq(eG_L1`n6~(O6)rm0oqY$R#yGPXWMl>+d`VF7?ULNu9vH3wtBn2vSgD(LybDjS}Td~!Z^6)ZDxgOHq&X6M~k%33D zOw(FW3ep&M9gKD8YAoX!0oDF=GjbI-LJ%AUlN9;3$@f z@CIffivN1iG1k{C{FHd~^v2MqeMb1{KwX7NZG8yAzoX6eqm1eGI! 
z;!T7>x9}J>S_~4#Bm6kSuzucMR2`ZZx0O5fijkOno*f#8$Jqalb8n?}SV9=ZCZa--!C=`Q&>ug0-H48zQxTEGt@MI|_!r0ZliD zzgo2|p2wIh1#8ynLym{jtih7Ld!O(Yjon;gpr0RAl@nAUVn`y~ms-}`@lsq#U~@?| z{G^COt+QH;ALCanM&tzwRlL2wr0q$m&~~!0qmO6NiyJ-ZbHYx&8FZ2YxmPtwe#sc# z-({3XsT7&ORKyANl8hsCad?$s2JZeksxuCDFR?i|g$cYlW*aNR zXdMk1i{2^B9Y>;+V^$}IeJtS)ePaE>q#_siiITzBejt?rxwS1P^(+#b$JsDTuBhjH z2QlTA(fUELMMQnkY9VXCwyN<5qJbsfjv5V=TJ1hVb>CX4-u*Fw5y=CrX65)Zzh$0S zXJJ7GlO|2GgUmH*u;y2HETkr5T2SdZ6I4bQMwE#C(!2XHxCS|hr*@&w%)TW0s+wiY z7F=Mura4ksyL2a5W>w47$o0a0#In`}kP;9UA%*LE)c^c4xC2a!DoR%sboi$Q65w3M zbaq|xU0Ti==Voo@Y%k8xLacwZ8zUPDi5bv!=7kYw01+ov~PUE*)bhWOZhrsHHsQJn;N-7(cjR%w2l4UnxLBL<` zTE3UZV`IReKpMMJpA-@a1KT90Zc}=IQ-jnAu`{7q%eh&??+n2fj9>~Q_9VN{%^o^j zmL+My0gPy@9q0KCMiJzRKxwI0;I8#EBUg&2Wo$THd5RFAezr;HIa5iK+Q?buCJE&9 zjG!^x071!ZZCUKv)Vd-3&#K;;0S^`~sp;;cJ^Yn0*>Xe`|3rZ*#q1mCaNtOJU^H*- zqZ7_RwLUqG2kgEJ0#?x7hJ-uJl=URW!TK7q*#8vg zNDIydQk@no_M!q);A+%4Y4R~-sslc>DDY(cOH{)aPeFOd%&&b7`zC*9_LPXDW zKAny;=Na_T3d0hiyxS406QXC=V9DGAK{$Sbl#S|T zjsAfyLcJ|)5UuX1P)~A_clxKVGv9t}k=d*~5*1LRdu*Vl>#80TItQWA5lT@ql|D0A+2%O_U9{nU=2S#k-hhA1RwDanZspCAT5lI;J$) zES5vNlIh#%LgSw5J%@??aFVKp*7qZ8YSq$k?uuLOtfQn)1xXqTrTeqI}4CbAX&=CLe%;yyF9NI z#wceO`~%Ycc#FLRZ8hqBiRdGa7$6f1$JeSo%^pZb|MDde{fCd-y2-cR&6vaGrzQt5v7uKgyQC0ip zVA0?}w~fQ0@%h$^v7G!;wn;ox!3L{N9}jL0QU@!QyrJ_O6bc#^`62_2m^Mmu$60Qdx^PnRIMOtiYO@}5djuaL>e=IcJnx!qCD<>gbuyZQ zh66UzpQ6o3-1G?@Hp#+}8QYawGpT*Lt0I{S_3Z3Fgj*MdRwg?k7Ik)3yE8+sQHEH0?8*VL_i5| z|HEe#I6~ljjSe9nSxChWaFm{+9n|Xx{%_H*2I5{cX`l>-8ESHCE#(H48BhSi+>gl!vVW`bt?)}ZWP zdUb%b-mgY?RyN~X0Ye6|45yRcI+l58(oa;yQ*VsZ@)@sFdf@eiykm* z->3XM+ptop7nJzIT-%Ar&>-a6D24FrHN40Lcp~v7bws8fCMhBuCD3kb{p3{d)NJ3+ zeh|(B1h%t_XemnqOahQ(ib1S1@~x4Fi0#@jiBqV44Fn!EHT^e*D(1R{1^*>0g^dFB z<9fRpBHS-{r+cQhsipqrT`E?hHSt&ydb? 
zUpo^Bl4`Kws&hH&8o(t8%u=!9qy6}54KJ?oWJ%Jv`r66oinnh&oFK?KxB`~1=CQkFyL#g6UnSu*xQbxC+*3hDPSmb>0=YwZY1`g5T zxP#II`1bhgoIPyEi5sPl&TY!B!$K}GaBe@0XGwrISl5+wJcrgEEA+a7T_MH_b4_tF>ls`CFJ z1l_8Y;z5AU1aCBig###X{2-r8^p3%0?xQ$w!qQtJE*EP5s9ZHGR>t2*VKD2o-AD;4 zYtbG>I$|)%Dry2JDE&a9Imqy1WyC)}dV~A876P|+Z>ZAyT;?GnrTUKvqdaKXnVye5 zPkcRRRZu!^4t-K!hKDI16C1cfa*KpF@`mNsW_54f#IMBoPoy0wjZ`~?&rjVr^IcuN z6I;5zTv`J`^+-DOoo+Td7T#V>oLb*dHEZ{8VQ$%Y_E#T{aFnKD#nS($%5ya>24r5+ z6jT$gUdi6dY+zX@D2JE4f?s!$iq`|snojD+^xPg>N?qYQJD^t z0Z+1~ksW#7u3;{c25tU%R^)P?ax;KOkkEcQxh4lC6m+WRiTZSvfixAU+mMX4&ib&T zPC=qd_Y65So(@#P8+N2hpM;U7ot5sujGaz+!4g|jP#zvTePS}C?R)lB17%MQSE=yd zW2xTrUO!i%prPZ@Sl~R24v7WzpvevvvWWnS_Ah;CCBA9?pmwJ!G$q1_pawqAWD_^f ze)y?-tq}94NC8-MXR)5(zQfOY3P0OV@{tLOu%2DzT|I*5z8SFh9=D5KN$Nikto+M5 zmeqdKUI#EpK5HDh$EMwWT#ym24}!g(#XEWE>VX?Fj{|9nhmH7JZPy$0Q?Bm<;18rN z;?RSw^f?P-CDW;M(Q=h!bRFYyuY#fp7Ogs;eWEb1Sz_sCA(~KW3~aTPSjD3v;+}MKQzU~dbxMOh+NG};L7lB zRn)BNH?NDMs^mhkbyzy(c|b9IkgyJPY8tFn#13&es47Uen0GBIj-ARWA*uikMz-zL zJE(jaUQ-gUV>K~*M`WtS4K~++qUgDiueMW}$U#kBgT6zEiV*$|q-;_`F5B06Z~l() zX|#9H8eL`wTU&*%fJF~Fny1vl8vSj+`j&itO`wrzAMQWac2W0Dq;DeA>F261&#JC! 
za_rkrSej3i^NR{tuZgc@r+$eKR&gb{Vor-fNTR7_Xr?yN(fbq=oKIfvKsZ9p9!Z2# ziyIbvxLDM(q$KdrRke`}ml|?+0IBk%It!XSUx5sdgKS0OfaPmb)?TwFq8j*e-Nz+G zhSsHBaL@3EMHH8JZpvmjiA%wG%7(`KI;HQ6{sZ1f^N9p0_=fjEf`UnQbW?ufq02UT%J> zJR15iBX;D#f@F>v&GS0L*T0SKZm&!K<-0N!Kem6zeik@)iPt5IQ~XgDXv|EI6x`@1N~X+q+xK`}6crj6EE z{QCwb?hoPYfNtFw-%t9yL=HMw7#7R5&kCKT2hHMDh|CEYYGvu?^rWjW&8JrTg#g&L z;~ViCXa^ihHg9Bd<*N1^?|*m2u$?;;vM+KOHD6JOL@Do*(h{#}O@G|7Idv0pHAF4q z0@buz$=FyUd2Yvc_fy_q1k(aPqKYUfH%5Y<5 zq|P$@Tc_B(5z=31Z!UhWS#^N{P3^Q<#9XrC>Odk&PZBO*J_(s~ba1|!uycSIZIs&?bb;L9wivU}R zc!!@*5C|gHo$GSN@JEzQR4aM6hN)g%!FzXj*d|ZF0^FzYmDqUm(2;B|%mheyN;HG@ z&%vgGax~N@cq#U_rWTi%H>V|ZU-_3-6&k%qmg0`3pZH24cl%Q0yx)ZnVS6@9JVfBU&UlR?iI;wg!pxqwLJv?#i@(BKr3g<1=FD40akcS1>_e zv9ct2y5zGk8<00_D=9TTe!Y0oS0W~7s$_NGKb4<9LD3d%!mo3kL;%Zj!A6b~As?9T zHwsg>XSXjnK>Q1+?N`x>Fp#(3G6atEl|_o$ep)g_)zDld-tQrJ61?5HsW#i*Y~719 z+rwk{KT_0Lc-C_jo^??`I88IbK7EwYKcqSXsnfZq_+4S9b(X$3^W=pp8^+5UlsPb) zHJ7Wt;g!kT;x=(Fr0ehEo>{-D@}Xvb!{I?Nv1V^h?d1!H=xT_h9^si&3)Y0=Bz{$q zc+?1owgBdee{DmHejr05f&*`8CR&BbwyO?9FzmWF(Smpj6@~R`$rna=lp3g!E(2GN zC~&dl>eFQ=RKq^hi>LzJ$2iys7*lte%TsY>A%UVekwPOIFvEI`GexQEYW_@Gpgl$; zn2hc~Grm!(h~?oy@H?^C><#Wga8l-&h7YL7&xRibw^kUcX$8TMX487$q) zRI{J{#V(7pl+;fKjPDPe-?5(8_~h~A24hf>^X`m&lHKyaDdB%k8KYaaB#KP@)alS{ zjA<4oFaYtO90!KO6ucBZwqt8Tyl+&%p6qnVrdV@#9b-a5vrFB&TIgM-@9*X8PZ#IX zNV|WQ(cqj)Dt>g3x?y+NZ)1KPOPsW=4f*E`5D7zSX8muPD4*J?+g{cUQ>eT}wY)Wk zQ@*#p7N-)_n4=sK;)fz%s*G4`LtF3MR=r+nuM=A{rNMG=gn=Rz{N`m7TNvC+rxT}E;y+iThFF6eLr@1Cd<(GNWShum^I(vjV z0MGsfhFtupgacvjxvElA`6<_BF8di(c{c?j-v7pqfR$s~dzMtgALbw-jQdbDCn&#k zat`o;5CpDUugmR!E2Xehqy8!UPa)~o6MDEq1M__NH8uecd@S2W!Q%r@!6eQJralUx zNwfa3^Oe2TX5_(fO40g8`s#j5^qgHq{=5;}2)R|%zE+zNlnO1C=((K#5529q)hj2%I0IN73 z!LvKKRnW~9KR76)hhjAr3@Q{MtQg`o0$g5!npmjvj&(6Lt?x@(WAV z6{C1a?pSI?e6N*_(9cgN;hJNlZAcuTRtjv|8%75)yWgrYbwYW`igo?y>%cQMTysnD zoHgB#2O+)L{RM_gv~&a&-DRVLb$kq%yd20;RoS}(AuzLSd-dlu;T z$u^H3?g$_D)HF;aY}PepGyQg?z;A}%YxihTd2QVgwjP5 zX!()oYO`%;ZJ2%`S=+8L*xf@H;#2Yo;+};R1!`$JhBq{bT*a~xha12VdtuNfM?k^) 
z40h_<(kX7D0D~jrs8yF2ic+zZe(=hY6}Jl?ZdL%LR;hC3T8s17|civwCug}brTgfQ=Q-}2 zo^U*;(>>U|+pydi9Xn}0(TU`1{sDDp!^hOP+0NoSZuRgOKc{5hji+Vgu7Z+JR&*Qa z)wJIC{3n4cBxJU#by<@cQFuwl#bo2l^t%jcXFE&H8;n>j8X@GLRO+^IQ@-VJd0_K( zWaqr6H_*>Xfc%k8wv`87ABBSF4YBX{16^Wl8UBHYzbOgOsX2Z$x-7D5eV7^(-SA7l zfyN=Q``cS|_B;hd>>(xqMqjTdZY2841sfRwRg`27=#;g`;LKYAup8$h#q%qwN-Ex8 z17OyrVd7wi+AoJIN;a3$(+Y6qe@RgtD->umXdP1{FOx3Pv*{2l1`B0P)%04Xu6Azh z#KhVKI)E-e*H_AgHt+6tpHQMHGw$Z=)Gr*7LHuNs9VGzy-8`R?C^)*#GE;0UE;M?w zK3-T=fk+jI3AdF$#2T?8fc{t0n)Ln3@Y15C@&@J_r1sG*K*GG_m*0eMX>6^VOV^-! zMEe0XH3Q+uv%PSBk)xjFtd`aUsFpqR_8z+OJTgem6ba!w%og&#UevO!ghglja;*v; z<9Xxox<<@exrC4l#TNwYlV}8?N{J}v>Ice|vW`uij66Q(;I(;5A%7l9Q_CmBGNs{Z zjgdnQD*omqMjwN0m0=BLVDI1V4SFB02g1^a=h1SQNuFGIgw$l}rqOs#GWZqKM+KMx z=x#@~=zu26IVD}drz*iq4DJ2x%-c3Zv8{pUUcEk)6X2U={D~liTStL?u4^$ZX1&t) z!34@xFV7XU{$KJFWV)@!!TOy$oHgnyKT)Og_arNLQ);>$$jO$g3g4e%aS9mh!kT5#qSX%?RkC<(9?{bt{gqWTVvZO?63Q?Lt} zQZdHZwmz>M7Fv)|HBC(PoB|OaLjlm8WK}76i#^c+*jip5HnA^VT9S3ihwYjXQ#Wvz z1P{YWR$uHQg&180AM+aOtJ>vi{IzN*V}`DTGIMI3wn=xnE*S z3q70X_h=(IZef`M6dgABl3&pAU zbNwczOuCW#XN`0x+$p1#8lU(^K~wMa_^3^xwMqWtI8DWA zFDyq0ps`&!S_23hO-$^;8aTdlfp;K+`Z-gcYEM?WDNPXQG1gxy@UCQ%&b@?DM7@aI z=b(D(>eLFlZcdq+`njo>fxtaBU({$s*lo%EhqBviFVR49LpY1b!a>V24%Zc!Z2|sP zz^~ocJ8<2jy+{i=7*>(Nx&lsX+qwfqRx&~~Y!nAQ9|42pm6yY9ey>z4j!@221c&!U zlw+#=&;SbkI91Os@?_x*!ix~Dn-)^c0HtrGLrj`{{=@ykUc?s|yD~QG|=_CK(Hd~IWj*^>`0Jzh{5jjH0 z-+nQkJGdmX^d5}&vp)%NI=ZM>)qWeZO+lCV&U?%5t@^6&98g#a-`3ObRPS(i=%fi7 zHJVBpw%rff(DkQkO^DW>!EqnOh)&eX;taB`#m9vb->#&2_~?bR;R*A_fC2>X<@9SNnE@bi^ur%w&KS-n0sdp$`@7!zXKsjwEz)?df3eliT90#am^{ zkVx1L*jYgr@h`v}nKJsiWYUbtx^{*{oX~`cG1(N(5&Mx6t`z!!fH_J9H?EI9NV9GM!6lIe$v7GtEHn=aPUXu zEDc-vn;@S zeJ?2GXc67a(Tv6&=~AHHtQ4TR4f`8cUtmpqqu*VhX6<&d*Q!QDu z1KZUe6=`{>OT>p8hr&+1OoY$Edgn%Bakx4iz;VW?#y~RzZCaN!tJ`M@;(o$Yslwh)@iG_Ex&g;eLuE4BkznuN&>Y8*!Aii{5hs zzs$%L*%_8oC6X(6@~m<^i_lLC4wa+lKqv4}`g3&k6qe>42EUXU3#H&DPhzXI*HPIn zn7_u8A2ABf55LaI7Bv3=8C`?a8rIWsOet4zN~yYEe*xx%iMBNj z|8&9UPo01>=r?8!#F+0H~L$+y_^QRAfpiHvv1a{ybV1nHk8O%nID(#`=*jMirSL 
z%}5IecILca$zBWoB2A!gCqh5sSNbT;=blUWMDdR~<664(?GHGp*-7Q_Xd0uy0b=bN zR78jhu1hlM7+ZlU2+0mSu)qiY5lfoPz$r}h-~~nkPfSsF*#fy*n7dOkPfC?U5FSvb zhj)kwA=myBuso?ZYK7q%(+lc^)lhoJTTdRXU65maPTuv*;dW}k#hXGen&KO(9=JX_ zq80Io-=clVULnJ}I&)A(A~E{GjbSft%_mmtI0_Mp;u8YmsQG{xxd@>^TQV@^kb!2e zJOz?WxA^M?)!}A_UT-(u>vXw$*y&)Zc$DLf;XtI`l(5FY-n5lbqQQ0hmA_qA^#=sM z$Mf9F{N+&k+^9;;hKUnMa@o5!+Bg4^-zm#hQdm7PE%}iu2{*}jFw^uA?S}cO?~A&I z=|`bC(2}=hC)Zn7%y;Ve_6ryr)hDN07WFFOl<7uF^WC{w71BSUch*jsp7v9;wjqMUb)Z@O zGygTQmn(M9R-?R;J}`Y83@!PUoG(+O@4}9`Is6d4i#<`=E(sZVP>?i&9l5MgA?uA7 zdy@cp@CwzF6} z?@<2x^vvGt+Y_jS4euph0~zv%|57d+MbT z(w4QD3N*GUU=Ae~_nATB_P@sGSYy7+r|@;z$; z=`sO|<)XZH>?4>vhZ=aRpNK@~?oLp@A7~w2sx6^Qfk{losUBtNy0~a0U+@sdSg4VR zCFeNY-{&m0B+?hL&*fBhcfWP3smOnS9Ixq$TfKf{nNl+}I#O1UoK6gU_rr?UF@KZA z$vtTf&V8u9_*79ORqZj6zON%Wet#t!T< zA8O5%v83=Xl7i{c3Viq(w8MOpH;Hm_GclcmGF~d`qE;-&d~xFfze!)8ry? z(p2^_)U0h)-a-0jBwk9GLn1rGMB-~%LYHX!02xu#^sF2w>W0Q)B&cQRRc{p>Ky`U5;1&b(xI-7%UZvdIAYO;Uqw~xj-Z}&$i(}u64GiWicp3tt6%T* z_H5qq%!$9<7>NUxFRnVnTH4>Efq4Gol{u=V3Xsa8lnY5w2{##SHqh|e&C!*b!A4W0 z{#&96<)8ay;2#ckBz;0F;9zFa#Sj9;CAewjzEqixPt)Mlrut0$#~pXI4*SbL+E0VZ z{%g0ZapYP2-W{dpYe$yf>4X3{VLt<=E-57zE|`k_q2e>$4SM9%>JzJSkCB^sXAFWc zIrT)~J6GQDK36Td^{-$L?+BmPb&_-C(plOe7IbPYahDS|&ih)RSrv1K^_+m|PuW0{ zl@?W+G!+MbQF4jNeQbOB)qZI?jj|%wM+amKG7t%Dt1p0X{vrW~&{^6PvQ4LZ@tfBl zJMI}5e4t80-lFYfLhfnmAf@MZc%{1K;d=I)hPpqt=Ipm1X@&dZ1b^x&i}{ogDm|@+ z#E64BDhD)jbA2>1hKJfWi(BD6vUi8e8BpOjU-W^{?8pX%No`U4LK;s)0M6DsmfmuM z4?*}VP@H$)zXcE1y)T+>c%XD#1xz7@o#WpKo@NG)I1fcseGZl;lDsovl;`vx&*>BxR67*6?QRP;HVP1`5iq+b=~>I3t!+^rnZ0mp{eWcp2(?JbU=?Z{J3bW_V{{D}ykDEh%_lKklhJJUyB70|C|A7Y6z>l&8RR#52zDs-mk^(Y~h)=Aj-_21vBcW*RL-nb6 z{22xn2V$cksWmhijrt3`MLG&K%GmQ#kP#b5xN>MRi5Yd^#P3jbL8VBn)^x*pmaj9! 
z{AYIbkmeM66mmi9S`*UT4v+hHf;o~eOqx3QNpO)<;?#b9P-3CyeBdvc=2}i=Y>~SZ z8a_CAbMUYEi$e*4D!Rbx7r9bB%VCVJ_)HRixM#wtFFzapp!u zk)S)(ARr=8NKrHAh5(+DIDh+%j+D#YBkuyZ2|RZmCW6-o`M%+3m#Hg)o;l^7&s3>; zGn2Np7<^4HWMo38iRr%QshpUD|0qtx+&n7>d8WnEt>l0rPC`J;Bh!c!G^7BZK$lKy z{l{j3QWWR@L;sNmdjb~{>Z>iy0kR>0_C=PwQzZA>*6~vGLvbC0uEenh0p$*^t`d8( zK$sfhO3Rl+3$yi zV=~`o#4=%GXH>m6AlGIOG~2F!^~RQ@(DHiG+L4?OI)pzU6a(zw3f}BK2&?GRxE7nC zWjL#)pbH)W^!Cm4gs*VWIeafC$PtKpXmaM(_>Hv&iRw;N#PR?IXdvO%2zdG;1SY$h zJ5WxoY`;{@oSu>Oe8fie#T!Hf0SG9_0dSithuK)*mC!0cYg#NEeSwao921f(+cl8}lY* zlI1YBP9Iqu#MM+C;5TD7*ISQFE`m`?*-}IV4Su+T8v)#;wi=_ySdoS>pJa0oW#9-B z)h50Q`<`?b`hN64uO6V~9F-lSIfyU32{GYS`_Jho38$fP(A^$&{Ys$=}^wbGQBj*R0^MBEFMA2}^WpiGxu9eU-3 zxlfbNSrgNLe-wGU~ZJM1C5>2gl|G5~K ze0KW@@&$>icZ5X0uVroGL|rpAL4%dNaA=$6;o18cr(7Pv`qauG8#H{zRau*1Pm3#1 zG{VCycbqyVh>EFz^0?WQGR46@c;?2cv%=3jJgKXVQ|-iMPlq}a7$@cKsaY4VKWoUL zp3}k;ay}pFY@O2(MUsNw+bON++kIeuiPTFsJhTFwW~nvgPEIa3HRGa|GwuZSoxwGL zQ1Y8pM3^kwhf=qB6)gjN(?%m)9bcu_POM959S~0!c!yvyT5%47>5Q_Pp+ozlp?|+9 zg?ol(*P)em|E=xfQ%iqZhfw{48BxOY$&vfo^GHKU{WZ*aincXk3R=;JdO?F){PP*q zmI-7#ZCvw8Q2_sg;YNx`Dg3TjKF)-dJ@Oy+rG}Q=HdZjh6#KHa9(B z1wHtkk3JH(G&p}MI9YKFbev4}REXM>vK?(tb55^mZZrU9J1`$4?b?HRw19-MG62o`R@nx!T@70%p=kS z_u=9BsUrXvl>znkZT?W2?pLXB2o+3t&%NVIt9x7v%jiLY8tRS=7WU7otGNivLTxI9 za$KQLz<*gmXk^+4B3?y^t~VCE~Jf_>j* znAcS)3RD9-hVrkp?0 z2+>qZY_s4)K}uw{0z% z6R}I+cdc|=naN{EBNq;ZsEJG+-{H=gfYLkJc5jM38UjV^D2ZEBtGgee?+18 zduVO50b2o|@>eUF^H9%iuU1IdcrbeIMKb7b5&)9$I*+Jz+Mf&$0J%TtMdla??FtH} z@1$a|tdf-B+ZTo@>M06yp&e$U&-AtRD{jIOXlNluq4% z9=X0q_pP&0`HsI`Wet#|Aut>E`*SodM?lx-Hlg4dRGT<>1eRL==G%R;5etLJMc8sb zdv*I^<_T!W#-7rw4=;m0+8ZVnpax!^oIop+6f(^ZGVL({`lk6^O|P+6q1Kn45ZY|b z5#LeILJt&ih;kt45^7hN`PJ5M#(1Pi_>{${;hmXY1jK#U1zkDW1`A@Ex(hFGiZiIh9EE8NuQ#NVPpv7M>=KTdgcH29@`Z(4n(C((y;6t~T%S zO$7r^KLtMv5mv;T32A~U{4xsrhP&6K&euuL5cIFkMPBzIsHYmzl$LM=Txi&u9SOWR zswTsY!#H$g?`EboRU8d5I8B)^@*CJ3FwUxQF6k!1VY1XEwQV2QPAG)pbAS}G~ z7p{o93;pSVnz<4>+ly zxa*(IKQ#hbUGqk27MkLu+R$F6KnP6GMW+gDXa@r=TQG&c?M8=P+Js^1_PLY8wOb50+8g4nOqI@P1 
zr%!&i9q&8dh)A2*W^#$x!R;M(x&cH*PnO)qd+C3+&x6nnoyMm!r)>U7vRMejDKMu# z@r#R!@g%?r`CO0}{5$3=expvvzM!%elJ3O6W#TAns+a5l#YOShay3=~$wNZ@8 z$Y+JVev}3@y35q|uy~;{cuUpX*dzB$Hhxnuf#n0TOaSAQ2owKr-12hC+C^3Nau-c#RCo z0>E?*cl048Prw6Q3FRp+va?gVL++qP6X=t##x)x$&{7eMeYL}w%Gn6TE0c2W39>qc zh8N^(qP`mBWUGes{Fe~_k*E5VZlbRxSoe)bkK-b4HrX`vySi$}ME{&bRv)|CQ=WuG zgyti^2x@v6KTpjv6=rXaduC=o6}LNs1QVT1rf^a>PCSUzv0Pm&azAs}7F^U%+bt1q8jO-`d{2jTC0Il1fw~C%7P; z`3RLdgSb=>0|n=R3UolXGPiW97Xp#?O3NoyMhSh`vQhjevtTtJ(6a1M5bJ$iKHbDg zc6>dWhIS!dBk=nOtQ|8DXh)D0rCZI%V0#PsDbww)&+~j)&v~*5$Y30sE68_g1{&yS z&t#eNyE`>E52`8qQ;v#2NdHqvQLBrYUDcVu%iHak%W=aFSA?ZuXMhf_-`OV(q=8}k znAI-`fUBO1tUjb6$McK_?IzWUeSo={m#9VtVfU2=FknjO?`DzLp*ILhyx;#0!TCZ}4gLC#&5T$(hfT;Q?v7O#cz#8%=tOMJfJeEkN;{-5~RN&_QfOybrgP<=0$${k!TFlsk zoy6LvR)IbT$jUJ9xGrL3xE|08{lY$mavuqun&9hy(=kUM1_O=6Wsc@F(DJ_t;H^-~ zB|R>&=YhaU`R7?B^ydu!3EId!@e~Y~hYmilz9i}Y3~S~!p{^WmlM?8(y!V9hlgrn> z01a45eEWaRFY9~*V9)@Y{;UWi+am%qsyf7|9rP&EPqK+^rT+#9+M@;3L`S+ zO#+b^)SF3>aIvyvhLhYPmsJt98fCfBOh5_fIetsDRR15HLjlW=X0nJXR28D{ig?W#|7;7=Af;~EVFQ)-^mHG(?uFtbbb$Rg zmANGV;pz);LVc!tD+tQ?$iZv@7%|h6Y%1MCOv;8S`#FJ;26DDNwdz&!^sODjtkSO( z@CytJNF42XYlK3Tdsr(eWNqF2@%wX@F8Q)A98L50gcMwFLlDRF)QP>*wmNT?-(ubc7k5c9)+DuQnn%l}nj zAvvfmF-N3sCVA~9leqc^Rt2OGf zP%d@kbDSrP;h3^ZL!aWK`&E@{LMEaBC%-5aJK#uUk%dKk09_4jG&AvUa^dcur9tCO ziT<`?G4j-?@VnGj>2dh+fC^>K=v?6`2pDkEhyTFHj%`Ys&r0;$z|Xnh!x{un%H#26 zuKv&y)1erKObI_0a4H# zC$H~XFCyU#z7wcimG{dhI!W_0BhaMnDF>~ns#-!EEbPBe(OcnS)(T~@LQ!2ssSJ}WCRh80BF8NU}1$t;x?$e zVuPJmzpcJ28m>iEzZ1Cc!I_#qJ15R;uP;gDFf!R&+-`frJrlRv?}|>1mK%!Bvrr26 z^aUW-Ur^t%l!x!cCvJpkDFe{0HI{B{2Z_h$$#ITnY5D={i}cW}^*ltlAz_u0M9KeM z^Sc{b9$CRx8rkNcOrK}hv>jS3zaH&3zgjnls3Mm@L- zgySIAv{&$|zkc~w43h$jc>(~MU+JBd@o6Nlw+-(`A)^(`Ie1RIjW2_alq9rogCtM( zKh$Wq7JIVY`IuXZ!6*u;EY(NKMaScFzNS|~sY5o=j0*Q#)n5s^%%JR(^R!VKF0)A3 z%GQUzf|j$CEvAMix6@4Q$*iz7+t=|2>i}q#LT3bE$R@in^5qI%1*}k9V3J zsVYArNa8q=ix)*~aSY9i$ed(2vaO8vU)CdsfY2|lo2ZUq3-8@Wa~nzbi3QUv$s;J4 z$%dy4finTlt%!^p0w58IA?F2oglDmG2ZJzPOte8s 
z#rPfR+(1`|li>Gpq>Z}cY$%irFpG9h;sV-iEA)IDl$RbE&V1P4DV&}lIR7uxEcL7A zmpOL;f8%)hSVtP%gPO|qR)6vHBRF1auB8Z>8L4j1NMNOqlv)y5x9>Oqj&wdx^jJdm zAnF%w@$2+0LAz9WP9Tgv$N_m^U)j?&W-emMtG<;G+*gDsm%>f;>6Eu#XIRmi1H3`k zZ+$XmQ-0hW{N}UJU~PeYxA5qN#Rlo4u|N{bw2m(-Pg<5jhqvnI^YZvykgRxe@n+eg z#QN5xq`^@_TqSAn5Vkjr@KN(oItwI*7Zj50l~4po@?Z`g8a9Lxb>48XTw#}D0R4!K zU6(WpZw~p6+V#M_`Ee;y4Rkd(Ez4_2RW#dRPyMC`h0A70d=OBdNxI@yXWcw0HiyjkxjC$~ zw>temJF3Xkk4tWz#(5F49ggX@&1pK#`>CE*fwB0+qy)gDVkhRf=3+&X(6$;5mXvpz zJi|KAzC?Oya^eJQUEm^b;VqkP^er07d?T)9&Dbk~^rHwHomjuS_b-Jjiv}UcD+i_Y172IU&=;YWgkgud;+yb7M(J z#hmssp41s!P()x>80kmHbL`->mzMLe&^nTLlx5`>$CEUM9 z-{K(B1Jf2;(ND(2B~jx~=m;YTgpKJwe^y4p_7>0DS4eNFp0uqH&+I~GFxKJJClWvy zF5@aYz7Qx)nmk>6W!irdf|6W9)pE85x0G`?D0_jeK!+xlvxNDuA5T*ZG&EfC&4l?3 zzR$!ez}uU5k6A>Utqa_lrQN3}8ny@eB}b8Z>azn1J0QH|i#qAAq8`6fIDKt%tKChI zDkvIue(GT>mclM)Umaa>Fo>E1CrbMIiaoToNqHty)Z;S*=DE?-*%xhu^@Cz0keMZ7P&ORXaw~J1fKmVTt(&2V;Bns^IET4Dhu90QQt~zR255XZ$_{V(f3FDq}Fp8 z_O;eVx=vyemrURHwbWr2{zqAv!=Mzeolw1q_!k@xO^*0yIl-6zs%AE~v@&t#qNMTGX#gi#?voV+8p zL#=0uTu1xrF=htOW|9yl5)vD81TFP0gsx$cT$>~fDNcsfBQbkuW+rSR!ZRkD+55)0 z;1%fP$S4U9^mAW5iecpNhu>|B3{nIpex z6U8td8hr9{SBYM^X%>$0khqn4; z<{xE^8Av{Vfr;JyYnaAZpW1e|Rj7HW;}dFk&qoWh+jI5E<&2(?Iqt!sGeli~y5XDJ zB8}n=*duHfsBCIDK)uBPe4X&b3JPZ_yG20Iv^%!W%oZrQCtpY#6)3rhocqyO$4t)D z9?KTeBV<&D7C_wyG*KC{Wb6jEpB?HgwlqyZQo$ffsG-(p5FiuZXHT$g^N^R@&&}u^51OSV$Z3+n9ugQ(eWrmeH`->GQ zS^6l;C-@nTx@PLIX9&1W| zFIr>}rqi%Qp*CM1?1Jir__CL)RQO<;Al)CMY?V1R*l#B^+0!1II)+PK^TW;*TD*qN zb+A#FtEp+SMY+X$HJ{wRzT0xJ2jWu+CvChAgLS3v;&;PW`MDUGF&_h0MRF{DQCNzs zf7$T`0wCQcC(W`f1NIe~KK;iHzTAWG8@MI59Vcd4qW;+Mawc-Ycs@KvA4z-Sl6%80_pce}lBcRtF@g9=)jlBu#{a)bJ)ozpKJpMRhS zgvQ@Ekx`1}kn&V*rNyIGfNtg=xN20>YCiIonmXSJc`0PNUPE*Lh z6orMsung!dA1Hs_a@_l@2zU7M_-oV{~DWBK#vM$4qNFVF? 
zpXWO+*()3*RGdgm^7Z>vv_;)~jIQ<4W8MC5oJm_0 z>}aQgsW`YS9wB9+0EzBwpUGP6QXLTW&bh%_D4fB%I04nkQ^$5#eAUh>Jc``i$j&b) zd0EH)JStsf;(6c7zKZ(&xZD7l4Wy=V3~CW+fVL4V@o;-YqxqZTuEdO9UB$ns_A3BB z1ytf=Gg?YbzU!jM>@QST*3M06#S9=1g zA~GYR%z|nONaV9NL#brkH&<|%e5$k~2r&75ZUQuhx01p`W+1@`xl6DYq{rC z!%Gn1d52jK1!+?BYe+8q>3rHj2z|u!FbHT5j6bfy*4vQi61?{S5RuD^wcQqZH!R@b zyDx_g=#>|}JII{R^yRhmzM1KMjr3?wsg1*%lHnYui@Rd<9*OhZR~)0~JWi0owlP>_ zFCXpLGrn>LGgrlAr@$`=77_8u2=_m+*|%o{(j6H(WtM2!zykzxm!EdnyoX0C&7=Y8eX$9aVnQtm?@t))T(gD1u!&(^QS!2Z zBd?l~`Z&K<$Tu$peWr^f0DtS%BFzhc#3IYm z(}NULN;$cQLFpfS@b6(2dW*Jp6kfVZpKkeXnU|YGfzv#&u)R2pk_)9~f5;h;2Kht+ zXu1WYH$O)n3OLGuW*{HFzbKPD?qb}|7LtGR$f ze$)nP0as8{NH#m&#fORNi3TN?{@8K-`9~g$hoE#J{tg!j4cgqQfaf zR>>Gv3x73`Pb%O>rt|a_su$Do$G!p*Y2)=k(Zc5d6C1Rzr`_|VkbK&NHinC0&aCNh zY@09mfz%r!Jxg!5zuQ1-jRE;_Gi@JvjLe>;Hv^n*dGnq_kSe{ksCLsEwMr#w1ueNU zI;t9U?R2ic_vEuakr^=Lm&F83+FpufvT`!0w0M1CKI7zT>7;hhz=5{O}W z(f_6P7XoI~QK_Iz+D$Qj*X&}$9xqG`FSzQIm7JpyJOcawn>Lv}?JbHLJV!iy;`Poo zp2~9!Nm3EZ9t z=m-@8UZ_904SVeeZ?f7-Wjp(IpYNB@EQyq$=t#?TVoH+=i;^N=X;$dq=FNNfN%*B) zD%t!w|9&=L6|r{Ljj9wE982-11Ri=uQB!{YP4+Nlj9~|>KEII7)aZ^$VVgp1R!;LH zKbf)rsMlId=i!nJ8f4HT+jaSX!z)JFSVYnl z*2CW?XW$K{qZW82HaK_H0qTk53RFAhqD{g#@E4OTN~uCAD4aJ7)&K}S?oey>P8VLZN{M~FW^j3i{EpPl_l;R{UKfnUnOc4 zfN<{peT0S7P)%#l6gf$JdOCk_AKuemuBAV$gAL#Sv@2#1!|klh;5;jo1jVOJ6Qveo0D*v^%D zFlG-mM>_GGI)RK^fk|53jzzd}a!gijfZAa;D&~{a+C4%%KBig5Tq^mVy~PyL z>7XaD-zdXYPA?9Lj_5&gTAY5F5Xu!r2f9)xr5%SsZ&spKx6Il+P~BqFJnQkUt@tA5 zTys@@6n$<0PMvSP=n8dP%I0^=3evfmG)> zFAA{PbcxKkp1Hh63jU*1naE)!^sk1fAjz#5GCW@#@Z1ZUGZ>3ogyov${wV?=)PqV3 z2jlMeLkRlfe60p1(geL)%&!(;WL2BcT_Wn?RG>Vd3iM0+6`>y-|3U?H4M>hdTB&`) za7yfX4F$JPp`yG2!KX$X43hZ3`dD$$Jy+*AsBqsYikEx_Z^j#GE2~x@NKv>9V*ouz z(5Yg zVZMUI9HQNANlpj9ddI0JEi8Ah$^rQxIP}3lQnGcJ$t9w4cX`w_gXhEfjTcTk5Rf{s zDyL9(xZHD`{$25}w{6JzsOoZTkIJK-&LxgpcIQvm#OC|L@aYkvpk6lPwGek+6}>u- zvu-EP+$k@bxZ#&QZZbAB!hAI#7L1-e`&gGs4U|TaF|k2!ZdR0U3eGcRrSE-*4ZbN; z8C0k%wOQ$7zO+2MB63ZFI#8Ph9yAxZ6F#S|VC*<)O2hC6zVB*SQ4+f+JeuwCC27P3>p_{rl4iXSPD>uK=z{HqVeXhi=AeZ+k^@B3a 
zdX7m4B7xV`EluKUNWT@NvYioIFbUAevNx^5oNE$Vm<2KA>;jnn$w5jfK$#!!;Qy}|`G!pHvKH0EY+%u^N+1jL zHIw}?m7Lr>tgSs_@O3L4A}dloZ?F@`29DwcI1__LL_*vtgczhG(y(P2r@dX~%Y*R7 zXHxnPC(x+RqJJ%nwI;zKmH2y`7uda;!}x9*qZf=PU-U+-~i znAnChGF$85KDG#u-==~<9np`w%s$t+4I1QE`bE8F0U3>790-}~zST6Iw5P;zm3z@K z#TolX;DK!tV}NXG%I!~FL`%q*LXFxY z)#j4=pT@G_=U_V>?l}x~P4zj6pq?jycFw_JMBBcKD!9l<8L`<{|Gn$_tPpX@uiGrU zc_o5LPT-wf1RgMbTTgQ?+I=zOGqT(cY|3te?Hd)ZrSAZ@7#KrKq&%*u&T?rUq#7!h zrt%hRfRM19rsmS-lTpbPA5p?5@R-@z0`X4f3Yw8TG8r{o8XX!$iC4bJ zU9&dWFRo)#1lRQwbq~L1Vp&PmAC-ZfDskQgtFf4Vf){uu8z9N#;&$tDRZk$%7KuK}&1sAUnAF5657;ONm^=o)Mv;>_29+=U?Vb8s*JnIDMX zQNTG^Ed`B%^tWw=oaDMWW7I3EUdR5|tW+ul*1V`mX}cR4lYs8^JRK5-=T%NeKwnt#F~R=ad>ue_glg}4IcR`v^8F0`UN52_K0nn9rZ+w6 zZI6!!QfK1U%^4S#ZodUbg*{te;7h9{(S(&rE|JEY6Mox{cU1d_?P#sxmokRVAC^gK z)hhePq7aqc5)MsXv}Nm*y}zoOnahtX0=y8eKNS37_=?Daz2ou%AT;Uz=`IQM&r#)` zq!F6EVQLUSxv+fZnE~Yq;jN>2DdMZ8tl0>O)~Js?Jyb7=lwVqk86GsW;su=F> zP?iP0eOC`FM?j{73L6C7t|X>hKsPpj^rG{UCtS@rOdI~v=vH~VdXWaBNF5jie*v?W zoX~tBG{XkNh}e|N`>UQ#mN4GYaq*3Y0u8H)nKWc}m-*4-{aJsElKT|!*bU!#wPzpiwE}(G zgc`m(*8%Y>6)#N($;AMzcz0(dT${=TP5;`E2~V#VYwH_Y>TbTbEX0BzXT*r&N0!%K zh*EpcNF;!sd=VooJ3fKr4DZ`wC;iBNPuoR?G@LyZa$5-RL|3%&Tyn+L10?mC~zq{fs>rOim?Gng366~Kwoln+TGUbB$;xm|5kOy z>H|<|#g*Kg`+9rXY!Vnpyk+f9FNMB&60=(jjJL>_>h{jZvZ7?t1Xx`{6eG>r%B__T zUtc2<4nRI@sbT_2nQB?U#N`|>y(u~AGKufZ#2Y7STxHcj=<@Frc@8QntBNZB?cGiw z#k%rMNVMKvInXcNh9TkW_NXZuO2;`Lnmwl@X?U&53#?D5*IgpoXiNgILd zOvIln11t{1dK)@&4a14W)cQM0=MFDb%_y$|Lbkl8KcC^a_n^zKQdn+|Wn>(T#86+k zAe5*l*>_8i%$V?f%`a*9NbHTAP;;q;zX>d3+b2qK?|x(v0u%h|tlm>aXWwee@Xo}K zr_c=2rW=-3^vG&kqqQ#%N?)<^(9RA1^i3E5WsK8Gw=4%UOxA5(Z=bjGmz^R$_ zWo*N+t#gdP*$nXAXd^g)+gmTt&2fzhDU*5Pl<|fj+LoB&@>z0#hfHFLG;hoT{019j zyPEgGPj$%Zmu`-9dtKNvltf?tH74R;0joskP=vK~*B*jTK14l<4b7TxskO51uXZOI z$jA6lmpKTHD{xkWy!TZ&8;j9kH! 
zFy^ks$I9mTQ$taFM%Z{~_bE}jSi6fHmXrGxn=^#E1lZMxPeRIr8yE+Lg~8&9N{i%- z*?-i17$;}a`tCrE7}|lA7#YZb>OShfAVz8EXTdV5$qf!(z}FKuZu^DPiHvS#Q%-R7 zg6YV)AvjH82^2}5*=h%>^qlN^86D9%*aMzPImBbR>O6Z!qkYujIRHyM zbN%D_H-EX(do#8ZcdcZPnC>X@Rg`O%-Vn(wd3yqvvR@X5ZZC{&?4q!NKQ+X9>& zwcB~7-<*NXq1ARJRqZpgAPyU2FJjxeJlrr3RdKX?D--}t`5Ta~fr~AJLm3d!LRQOayU14_Ud1i$Q{OKT=lFfw=~3;T6xTs3kesx2<5TYZIj8o-7L_Y#Bk#f+-RM3`H+!NuMh6! zQu&)af+6}`v-VW0B*0v$RD`ZS1HRg0UF6-W^;~FwuX8fiNMGp?j9;uAdNDy}XuSaqkR4!?s9NRC#O1;2V8&pBlz?xZFww>G6YHO3sdJR1oRp;{01I~~ zh5T=i4vVwUDy7xAKuL@6F^JC-K+>a_c)1uW zx1P?vvgn}t-U_#n{|5!%ukgZwW3r=x4U1{~4BQC^aq7?H)3Xa6xP zunLOaHzqqHBt-0-(xBQygclZ@?Frlu@SO9+m=&?S<+>}ri0TF|3Uz|vc%$DNoLp0* zD~ab6MQh?P!GQjcq*>^PrJ|uw2b=h3(t}6c2%9qRNgU|brPq^t@ zM)(Rn1|$>j($)qs>8oGMB$?}~!JX8;_41E`_l0gh_aJF*nyS|rtgUT;B<=oGxZd1- z`ZryUgSo^62QIxskKzD=;;ae1G#z<6FAp}{#=}zQ#&OGE`D&>~_^(riHo5ZC_faQf zU$nE`;&gCbe`IzC^E1#1R*FiD5G>^k`R(;H@_5H)I^=!V=DBdBfIr(id`Ye!J1BlS zif@x0{ieKo6;aNpzIX~9v%HN|fG4Rx#|1_VU?jS3V^#KzaMA#5_f?mk8j9D`D0Hdz z2~K!kuyn46RRKwlwA}u>ns;gLff@_|t(cT)w@v#pgDjr#Ft}x^^7yZ?+Zf~8*n!qr z>?M<7#FMc^w^|j~z~KpbxDnjJodJ57>$*wpR!Yj_FgQ>Q(4J3dZw9(*UiLrqN+0#k ze!Jf1Pn48fE|g$7_wnCZZ3Sx*N{-MTX#~z_yNQ2_MIE%uI6{)rHakVefk6!3w5K@y z7;K;5N9twDf4oLjFi136ot#e9lb(f3-VZbXl=N2#7-(F1jSY-butZT`_xIL@pZ6A4 zs0Dp1Oi!IP#hsJ1dV&N+gdQTr_%zJknR=bTlv2_FS}&8QRf^_S-5pXz@-ij^Kfq)S+29$Of++Q=M_8`!_Oqg@wZDzNK!)<7_jY}l=!u3=zqiKm@5 z<@Ik{Zy-GCtNP;h_|!i7W^|$Bwvol6a4=A-*EKDKt$D;OyG7eW#{1fty%(ymSh8H; z)u32uV{^aTvrp1!Z%#CC)u;}Ev3G1zcG@Ql z6=Jef*qCBsVr7)VlgwKyL=NmZ@%tKx1V#xHjoc93j=}ENMUf`wy=*BL@tzEe!F#Vt zQ8Ta&8YM}Dei3I(GQEKlt8(iZIMzZ>{A{GmwauN&Y#$qFzy= z-q%zr7J;@oAvv?NAKs*gK{qizf@3j3T{p1wB(VxdVr(O=c=P#@W{-7-xH|YmUgXK4 zdI+h6^R`x~tMjz2L8sGFt1yeuC*=bc*}St9m7mmSbri((uGa}IxKhRU9bnQNv|t|a z$xp&kvn8$?MDfR6`~doZnK{Rwwk+9c=rZMxI_@lbo-TYB$uPS{QYK6h5OK*?=cXR% z13>x$1tdPXgn7kC?a|{J4MM4>*O&J**2r3 zwFh)G!jLtuc|u6r<84TI*p2(moCVhx@mfctlL<#n9B9OuryTr2K*UO5W)&i*1u|bOg@1*yM60#b@UalxLS~Xt3>p)^=s~Y27e&&eW3D@4BfTG 
zByEW_qlkO&0`GWNmR)UBQVnxbv75z>S_mY;ebXinvYy-gG6=tLo(;alWKh9`4V#^k z%28dCGNxuDXf$xtQWe@$$(#^{!a{G>Y%=81vY8%cgj93m^WvQi^h70~KvM=*bCa6A z$WR<5UwQO%H>f_vUmbB}0F078`%t&fnu7NJV*Ry zFE6%pf6%s_CuE6f78SC|*HIbT?ud{{W-O2nbd(p!&LdnF1~n4ej2eQ$H^432BqaB> zbM!|>f=p5w0X#a`@yBQLRN14 zvG5>Hw`fSlgSOaP^rkr|6wMU-A==9pla1Swkc=VWgPb;K$kquzTdD#%EeB&_FEJ>g zD}%;e&;$l9`i&p`_aeGJhu>1yHsrza>~F4f0rIWv3ZqcbCe)h^el?KLNk6QZ|?KRH^KAnjRi>gITNZ%?Ek4!~RUao5{}f0myJ%;b+bE^>w# zHsEIKG;`5DE`{T4zvgrl`2X|_tDS}I>9N$FCw>=}k zD?%}|!~`}b{Cgzr+x?ooA3PR-M)1Xo=Q(!)n*Bu`NVI(;6pC8`owY*w{t>a=RHldj zLR#L9^c)Nn&19zZIX)X7JDjKOrLyqVs|inGG3%g4ki4=fZnV(?dgBQpDunZ?)z=}W zU)>vz@&Hb4s`dr(OD;`Xzm4}$g|c4%7igIs$Un;!OhV6(v&y?E6#o*I)Au*S+;5YQ z$RRhs9$d^kIqe*ii8lCjla833#h4)`m8^42PlaK#^uOONU3fQDUWtX{;x}(>K2xgX zA^?d@!b_k4XG7X6%;^fAm)R)9nkAvMG|VA>>m$GL?4+M29yb)m5uF@KQalYbZ=s4+ z49SkC9o#J#&R#6nNn7M~_gUz5$2LEE@Mous;C1#;2guH!KPwM!k-n+g6@4pyjW`FeNG zO*azv)2$jk$H0{Vqj<6nB?H_$GVOisKW=axDT5)u8M1h&O*02b!q*F)|TFiSqTAu?f+-(9-G$do~E6A2y`mg?O}-Y%jikVe1o4 zr{f1iXhvFm*!t|n!#;w$hl$ZeKtM~mX@2&uZwXZR@krsNL>}-fY_?!ha z^D1IZ&Wn31(THTzU8nR!k4F}uohFoXlQv6*rl6^PwZyKL|CA3ZcQ^NiR8xNU_A{K5 zQViV5iB*&o*tpUO!+2ppa^#Ler<1H2Y;nTaRtrcXK~OXMTfM&`k&)DD^HWg?Hm&79 z#zaJ*rgTOPGFGn}smG2=5@<&b1x7vlq$h@!YpC5~eJdWh(6|4$8+}G;UBHWrYj?>u z41giY1}+jTTG!bfMO&={T50sNWCUD7bd~<;(r;You!@}!_ z;#=0fni(ai&%F|`6IVw+O?Hn7wcPO{LxVp}AaIWQElhiJKy9c|*9@((Ls{wJ&;@wn zp!NS<91iD$>@Xz%SmzOkcv2u?IbZ5H{@xA?SB*c}4e953rl~9j{71^4iia&{R@MHi z^_qsmo$m{h9qx%(gG6js;siWNI-BQQ{-NIaOT=24E7xgs$2SCNgl^Wfvz=G%zZIKZ zBH34TBq54#R|KKIWzE#?Ua_fpy?NKthpN_J5z$ zdnwC0QWAKpLs7GXMe|m~FKM#dMs1wgGWWzOE0hqNARC?QL zDgDkpbx5CzuKVPhE(up=-f9hos=6oNfkU&c-W*^Hf@tGQUt@QgDqQ7no0mmK7MKpf zL9Yg&&PA)naQU#`8-mMk~~~lFnPTZrHkxw8o?iL zBS3h|7{0O<-KAyYMul{!6(nuBfnfye0cY)#6tWH723(0%M&!zyv7TViJ%DHa|Kd;j zfb%#SOnF!zA$Xw2E9;fBY|12sKYKNL8!-ZA*wkCjRK#Is(Wq(hYW2?}hW4!DAmmgV zZAzp|FKeZk^^_pS>xkfomvwSbc5z?5%0-m+>!V!mGjE3ec)H1s&8Q82vuAegNL?i;=5&~xS&P<3sm%tKS3bE;Ew)I zND^l1q?g8DY6C5IwKgW&dU1ryRm7x#khrGR=>FH0fOGy)(1i<6c^ 
z?(U}JPa{SudcreIEA!L{^LWTT`siQyAeHouG6UW zKWcDsGTan}TUbd)G@JdP&; z-v+`94oJX)sUs{E`vpeL+2&Zi15r5iZDE)NoLB1}@nbBFqOCW|Q+tjHf_F)3rtM}0 zsO-%z=NdjnW=%8WlskA5tC>){w!R+*>a(;d3uW*J#nMk;SyFO~Y+L*#MejQROSx38uVGb51QC9w z8$-=-S2gVlk0<|C__N9)tfaT3ejsOP28Y(>GH^tPS(5u`nzlu zF$-qD)4(T>sC)HEw+~FPWNZ`GhPxC~5VvC65bp9(`bNv3^oG6D6SA&V|Ej*dEmUhg zce5kg4vd$60g1q-YG6hR{|w8Ie&O&^C42hXgaW1}B`~h!Tsk&#z%bC_aNC%4Q?ixr zGTad*QHDwm1Jz?~PkgMvd~L>=w8o9z{`tZvF=j!zixo%vb%P6!*0vr)z;&%2*>qhE zEPbhi{lFl(m{FsS^vuk+I840EL>qvg>|NUqe4eRycxq2@p!VtFAtad&cN)OvEy1%4 z!*P@tUtn2`9e%7s8oL%6K9%GOg#7Pn!M-Z-vX-{vPBN_qdy)W-7bSU00#M=Al2C40 zw@m*i+bR9_0Iy8-PZ#~dy(8IJ=`9c6!94=ZWPNLBkU69Vzyl4OC&DmpAb!Wl{HP;j z9?YeW=>z8yh+6g|MSjHGaB9e>POzzwOy+?HUgQAmL}Hjm-~&F26n1n(I{{rkYx3ffg zm$o%}5lo8Eow3Hq6;m*PQ++nIXa8scMXZ1|aBTg#1`y~oq4AE}A_VO3R>SpeKEvK~ zz}mOy@I!8b=fPl4MnfRgyE9R5BjGXb6!UWxvn`c4(T(r^x!T$t6IDAUPe=kp!UG@v z)j_peZ`sNP4FR$xnhq(E1F@o@hSpatxrbJ5>JG6Ti9D!~n9#LXTSV3tq(D|eQhM|! zFpiC5^uI*$?w{;r$=i&kUszb?eQDb(iw2?9%V(5}>M5`mcMwDxKNKe!*+J%|^>6oX z1Y3qeSMS<9WXV$OtQ=7OG{ue&(j*sXSpAo#pvu8j5HSa`8^qp{-CP{HLhI=1d7rAV zX-l9A54#R6s>DNpfD*rAl;PqXtblz_P@w(;yc;RNuy7DGxF8|PzR*&QIsCjylnlEh z$;)KugdocZ2N(lNz`QVjd-#-JFhrWWZwq;JeUDV+RfO^1^Z#Gt0;Rsb$I%mV($7H& zA4haK@Ui(TD3xkkc&O}RvIp$N+q+a_|4#=7P2uy+`H_mJqbE%fDxZ{K<1zF5{|}!n z>Fo1k&uK>(@Kp_B?m=2g&;5uF8g~nD(G6|eECdlByw1zJx07?> zzn9k1Ishz0??&ZSbw_}?AM@xo`db0F7O4#ay?yONZFstp0gGir{XEQ5c!)Ao39nPM zR<rY1x+-t{l{-B=9NbUOhbYr_S!SK)q6enas0 z;wi)x7qPPQiOQ}znV{n)U`KhaB@S3Y0_m|W@?cRwZ-wXGYE|51xu9V^|B%$jD>lMr~H@3G&(}l@TsZWp(n== zvZtT2!cQ~^L_r`>8Z^z3^G?p(JXw=EnL5xRP{T~0*~`#P%%ZEST(N6>aCk|JQ&~6O zHSEB{xU0kCXGEj!HYn@XoTUJ9#srp-Jz5wjRENO#9}+)ZTinxy1Db5}ZiV1gt!p@m z@q&jkM_YqvW0_L%-bi!vZCYmhRPicK*DO(}S08o-YY66JHouNDf82Sdn3@P8LH7ZF(q%FUDcn!u({0lABA(bjbW#+Kp?EJHT;P<1l~gc0VGvH=~ZJD&z&JGE*6bC@yGcAZhZp=h!-=^A2Gs9uHBF| zRcNM06y71m;dE;|RME3dLtNmCQHuQFXj(iIRB2E+Rk2bDCC(9dFntG%u-ajyag8<}`@d`8RM^^^MZ3;n>Ub^8FkLAB z{I2?+;Fy-bzm`u#`zaiK-y~FALsYgT4=z+G(o15>Yc4tm1O?w{xfTe}3l*-x16xzp 
zye)?SP;;LPRm+X^z8#=`%RkLs6m%RpKKWv}@MEx`_u9Y(viKHRz<2*Fh5ej+-&9Ax zcaxVq{OVG&2VtF38kFUj0sQj14cm0#)9(Ii>8W8lgoOJhKfEMqMTg(L*4shOSd z1-7fnIbUo-eF}tL#xV?ko9ICq9K}3FofW1G!;RFML|RTMWpfXJ`pGzN!;lnO_<>?NC!oZ&Bncm97V?46n3F5yEX&~n7ei3|Un8SXYd z^>S3q4R7T)J|%J1*6)4}3Fku~(f2wkE4xC^Ye!cHe`M&hib@Nx@o}b-WZ~58NURqK z<=aJf`$IG-N`(fwWXSDikXl^suar4JgCTK%|LWio@sK+LvC}V>%8llLF2HjYD0i?l zc|(3bnYMDMM=atE*^RfV=#<2GLuSm4QGm6CQ)a_aB!Y$<9InZxs8^D#ELt8kl4QhC zT5an?misEA-w9NQUrxN0JrR-8RFu!g3&$ocn1v(<^=+qEpuDX|pHM_4GzaeQ0-+8N zkJlZ{R&a#(ir4tGs{0E2oaKK&4ZAvaZB|ZffPnEH#eWz}^haf&INgz&tG5HI8qm)@ zA1FRpBqDnERjNd8EHu;oeEUmTh*fvs@-!__;UYqRc;u|0$uGf_J#qOwigAwJ_{a(H zDYW*E!KELdR?p)@r`qDI>0PEDLf7BjUJ6!qdFo)(;rUg=__I4q5J}R<&RmV&e)Ocz zV4|^dJve7Lv6z3PjzwHEMYmq+`!$@S$<&YIOst?E&FGSZvPEw0s}4tsqPo0i1z5jQ zyd>pDUXt^-Fm+-dUDR#YxxTsfU9cJSi9)WO{HaC$d>cqk3d2MHO(T}&=qWVRxdS-T z5B&5hpoDg5sXJN+o#>dXu_0ie#C=fqrx3x)M29NX80eRMHNa4-1T&yj2lv7qVe&!( zieI8rmQKn}YC4i^mblIQ6Q%q4gn(0c)4#yn4WxhN-_vtsCI~yhxf34YmP2XnhyUWN zm0P(dR2>dQ)HV?N!FGx%yd&;ev{v4?eDpxTO1D+Z0p&^iz^mHRTPpI0vuBG8l1g&#BK-vh<9~UE|-IeY?guE$%SxT@iM>h z@kLY@vh`agr^}vhQajTc^dSYioRMr5*n$Dft7(yyC$?O~r-pTi+ZbCv8+YzFb(9pj zQ_`w9df~AyjF|DZcX9)fPTWI9@2+ePB-i{EY!SQj4a&0hjql}4<71g3iF>r z=NeSH>W+|Fb(Z0s8aXQC#zPx_RRd8w8eqQ`{KVD54OL18Z7|{?E|#&$k-*oYWbRi9 z#${|>pKvI$v_R|w{@;t=u#+kw$tnm`A6jZNJ3DnL7y=~}1F@lNLJW8U1j;1FxQrN0 z{cZpIT>i^2x!y^`sS!U=9E;)NE<@Dx7W4)EY3$Nppytq5sT3L&T6ugs+n7( zY3*MWG2sz)@Y>$EwTU_X@R;s~+28IyO`@^WWskyyeM5y_OF3HUz*C4&rHY2%M$RcyyD8mi^HlY2Gm!K9=wH|K->AQmW5J|0Jp5Tp0GQu=ht2h zJUC-T;b^ahpTM8^7)I{H6g%K1lBq9ZoL$V%i7%pwOW%?2MYZ(aKDrd&bLmWrB`x#B z1(!c_6t`+)Kb&2lPMq8D+uxiB^3@S*GgE`nyl;6g=;Uhyu=Zz5354(4e{7`pH4YO% z#Eh}F9ZtFV?AfF?_>I%$saI$ASOt$eRPQ(sr}?;pFvO5877Rr?@|djHg2?oDU6iKVNe!a^Y`ei$fX0R>A5dSqk-mMS=P zeFA;T2dSgw3{7pYS-BM2(Ai~jnWhlN(la~2nmww1SksLZBQ40LogI>uKw1w^A_CQ+ zSQ}>S9Y^w-I&czs6|6?ObfzA`Ry?y?2-g5`1P&hij2jbeK>D}ZWy(R(vWp6)M zbL|BSLH@M-_9{v?$Vd=6hE5P(2t${EYE4hN$_%!)wb3AT-`Wqv$;x(`IvhKAJsl^o 
zXX$NIE%MMAKTv<}%7)m6A=rSu+lTkh*AuAYJt{5Sg#5|Sk=H5F=3Q0Q_IyDDBvRZVfaE!T7Rl7mR1!>Yk(y)< z@;!0H6(*{4c;im+M()uewF;ENs9^m*uJxIc7+qBQJ(Pb@5q$tZA*^$USaTpv+-1)u z0yhr6|h#9^>JPY z!ImXF=i@7*Gb}HR*E?hDjo{av)G9UL%kocj-G!>~+W*V%_P*cm1w@v5A4xp5Z{83o zAODt?j)sx_zVmB~9A)$t@L|9WSkOb7;{*xCvP1!U(+t?#WtH-;lYSb=o!9N#&G{S8U2oc+^R9XjLO*4QM51*$KRTjm z0`E6C>9HT|z0j=(Ax}miKq>9eC2Grko{0Q=wN?JZ=OWzxo{V$+$kDC2>CA;5c86!f zuC2AmSQ1{I0)C7rH)x9G_&<0$5fb+*0ii_GJ_YHP4TRsgHYKn$VW{%1h2mcr`tY+* z*ycITd-p*2`h9_oZOMNmalFhz&b_x{ES9wAlOvB*%jG{HH}9aodaR3DAkM+$0^Pb> zJ`|o%8PttsxPN8a*MrTaOYTTj%wn&qd5l0F!k8V^5U&xEgIS_vO-xjcgE5OZ$*w-K zTicW~nBPcb@@jWWs#9plC?%JN2vc3{Q9B{@5$chGuA0Q)?Lsp^ zZ`&0aQ4*XX zB`E!{^j#5AnK2%2nz6R*h&)6ZIW?+z#8BZ$fZ1hzZ50mJIFWE+&A;zh{9LV2u{_IA z5p3Xn8;bfq^fL|I)V5UJ}+hw(5CfC)Ga%gjPLcDkpd~=CIk-k8|Wu)6h<&FyG@wDF`wi(7FzM6^s5~eYg$2OGK zrV~LHY_U~Hxx8*lPr23Dr6vGH9Rzt3dn^7C z^;#&-zdmQKo#5q14grr|5dUu(8`L2Bhw<7w%zs9bZ!Yqmw?XFh!(df_w33+hHGD$n zED=3NNi*z#@^70`q(_6$_U*VN==LGlYSU&_daA05zB~7Kf`p6g-KI^;PxH7RY*j(? zoCruFaaXoZ_txxfq*SP&8$FrM6!FBv!pYgC?j=SoZueB)c~*hnK1=&&>WH#GIyyLV zIqt319f-;7^hv9q4alY|d>SZtUKgzQV)2*LM2~0ltz--*e{pvPP@i*y2dFi@aEDO8 z-%gTyU%JZ4d~9Cc&JKcOHh>GAz)U(3^i&65dAcCF*+~H5B&FCJ7K9Tt&K(&df{zQocM%qxueR@#L?TY@|+^6$aZd-vi)no1vubuLt z_;T(QNYWCUdrCk1qYHkkTvNJUtITDuBmFOzj5ICIs#g}2&))<0tp&kO5$T)@`zAng zqmz3@w3$xip;wMtYyWR_IFLN5;`_L|JYhl|akZJPG1Rm-R@#~{xo(44I|UCK=1NSL z#6uLjv&$LsZ|BTY>mq&Y{SVP zX4cEm2V{bCD!&LZq+!wMu!~+q3WJ1$H{%A!X`w%`=!VH%5;&WK_VkMYaXL1W%AxK% z&u1W4gc2KA?J_PO{o51rU34?dN0)Um@sr1yV=d>zo^BGv%Bwx13~?GSX)?<{v&_VfV8IDnl-&5Uq+KPl!O^H^S;U5Bn(W2EyZ{T zXxut@k3Ndiwp?`bc_*7)%e*mUJA40$NKP9SU9Ne|zHLaIi7sg7D=D}89&$p?L$IZg zOJpv)8$0ympOI`n_dK{+LHJE%CvXtm(O>CrjJq5x9O zK{N91gnGOJ+SUCU$Bq~2NE;No#JDXk5)R&!=58=KExT!46S@@iu+BJwpB_p*av*0H z7vGTPCy>+D4J%UlKm-EtsT)DdMCZuVPYg2JeV^h@ z;-InR8}pD|kNz3>rG9@te`F^@#j~{etIPhy4W={eTAZM8gyzV0hT&-GNLOaVa#-=1ABnI=)Uxj zm1{)Z4(q0~m!u&v2X(Qq)QJSKhKGHTizn<%_SGDX3=#T>q*g>QaZvQ#Ts3nAqqRk0 zIZ8)4vvlNBA6Lj>VPeA9cITW%FUXFPlF4_n76k2vKe-*CdpKDx63C2geg+14za 
zN3kxF9@U2uC!#->(wWn9vbXXm)#XA3M0EJ*+KqnKnNNZ)t$MP!129 zbhVmFmP#%gpCikc$;n7;E#w-N*Qc9xg0YyPus;ubY$O8}UM;YLtocuYMvCWKrL+gw z4sqyqdpk`cMo0ZJDyV~M`ETp0lp?c$d>>CD2rKlkmU-l-Aox1IWNhIBG64>yC3`kl zALI=m0T^UE30bla{TU`1-Ob5882Y;k42!oV{SqY!Mt_7_SaLr0>(5S@LuN(Wwfhm? zu3=+D^3u)+#x&_6*3+!^J|4Zk%yRn0^SgPDMh(i0;2y(8ubzcMB4G_h4Y!&U4y)zP zLYFeQ?(9+I`@yZINh3s*2q(8W=#&!yl%_KsxbtjJmIIatefpH*p3&}5>7hXZl?&GR zPFHc2Cf+M7!{H|twsXpA0!(nrqVGWlJ*L3$0QL@hUeT<$M4^a77+Q3dEbg`~Gfg29 z`74{-q-!1RQxr^Xm?}%RaBGp-2;NxM*Qu7{#EvpabMqKrXYO3nY?M7B!vO>;7N{qUzOt}tVgMiYesUmWV<)O!U=`^4BLYS* ziD{IcRJFx7|M!o;EPSey6u#<%Y9O8!yg$-;MlF+qOE6EP^(quA$NI!!2&&R|#v;+) zRjv-D5}ZK5e@iOvUWxlv2fh#J+pRhU>eruEt*8{}X6#)QWuSOzGTg1=Xle@zf#nxC z($yGR6lZ5R_;t{tYYgGomg3gnRo5OE5wCsj`vyj~d2Gvj?DxwX5*0+DZj@&vbcHMfH~W1AN!pGE2|IRk9p`zJ-?fKm^grWG zg2rC`YaDB$rvX$^`ig|*W_e0x|1=bYzY>w7Fs`9&-ad%aWe|(4HdjFXF8WU0=r zY9_C=h*#7#v;t%dQanCe-pYnt@c-$s5{YHbkFfmIx)39D==vAk}@BB?EmA< zu?oAGX*`=$@;Z6@xC@ptk^tLoigbPRxYBn2yfSgGsCA)G4y{d~ZQEkA|B)b_v%?fL zm{9Znh>l`tvWs!Y-l2H+l#h*U-}h&Mlwq~{ceh;eA`Ls(cb$QUg00l?HK-3^vRcQU z#IA4?sl*^y@QM_J3f8mb0DmH{SWyOF(^^5N%?uY>2M9;2RaiE@z0TyFj!q^L7g%DZ zJoZae-x%7TzYohw0);|zs=nZ-?N($kAHM~I>K|^i$08fq%7x~gMB_(DC<4rX(bwNz z@k4U7U$_T<7)aH^v<5CV)Q^e`dE_Udv;EMuNwd?jz!vAa@f&aJWUR7kNi=w1A1 zIieTjq@?06N3@Wexdt`z>EbfDCJA9-u-UvEP6C_}*BP_ENqEc9!XvOd%-~qx6w)fd%R@oC`Du1XVE6+=?mQ#qf4G6F{ZQ`O*&H|2&VyIb=VBwFY3LDD45)R9sn505^ zrD##nhwU7See_@0cXm`cr@yFEU5v6uqC%7eZs;jW3(^^*@j`y(DY8dyi9Foyddk2M2v%b@i zT=WXg*kHm+^z=am(LduzI?Ul}fa<-G4qWlI5-YE(M#o~lzn`qa>GlE{>e031nY}tE z1%Gb=6qy&lXljWCKJjB5R>8Zme4&`jRB~dlvf$rGT*?f>AtiiplRt)EwK2^&_IX?{ zviej#Cx;kpd+(E0o0bgXG1Tl(D;|{!we{v92kV#q+msM%7YO{Zb26gLxJge&*6)F) z=5pTAymwpeCZ#W$%D-6k*a3)}i7O>V8sr4aq5mx=aHAaCYBT5C;Xf($k80#PMe7*V3 zZi`!XjYQSCI2i25xJR0T;n+7x!KJ6jM~HC*UpLxPtD*e3+%}s!JmS)id?FR!J}t{w zwI}CUN5=G3Dz`u>TLT{<98dCejm7o7bH6aerlgfpOZ{u5acepMHVgU zAq12A2O;xYB<~ydURJdbvE8RMKj3ANwW#;Q6M@OGs&gP8X(mh-&*rK2CB*Q3-nUIh z*C?tM(egNq?e*<9otmpLlS72&sNH+?f;m|bBV9?AT)*y{RrTu95^Z6kr)MSWH8F}M 
zfjfIMsM{*`DXiD+Ccw-F@6pIVJ{9rbWBU=~nD|4(*9ZlJR6U;Ef}L`=|3Hw!ayd0+ z+wV9ndu`XhcSgtZ9j;;9Sh%Tyb93T;mlSS ztHzhuX?@jDDtM~3rjh)JB|I9MYxm1~&}M%&LB{fEaUMvXHHyGGgl*4Q0kjVzulrZV|ZSUV~sA>_IK18f4&LIJ|p^CpUR2nZ9cpG zgQlP%V7pWyB-<#`W1JpNqMJTJNv1IpT9WGB2B#An!x(BttI-X{2KNxw;Uk%g|Av*% z)UJ$EZPyHS1y&NC3kP_TE-Ho$>wFmcR&wpY z@By9tgq6bgV2J37{dk)7@G~6Z%0|gO)ry2z(dvp1s~j6%tq{fdaU8YRiLj?|%QsvG z$=mM)>cz06us>~Q6TJqT`$ERIxG`q0AwijJ$%;f_@1v2n7K}ccbAL!pQ`^J8GY`(P z>1tj&Vr}v;l}OBmW}U`!)b9cVmtkx6NWh7<0pwtvSeyBfjfSnOpBZ=VRD@RG@SyJj zH@bGaU^|pAGji|x{4VZ^=RmuaTL>7h#`7E;QbyTb9izLk7Z>7wKbbCOd!NUF%dp2& zk$gm3;SSLgJ&Us{<)>}&Z@R8(fu@(z7vvVhG0aU~re3@n;8{RmR&*{QnZ@pm!9KGO zpe128-9%YIpFc{rg+jcBGc%YxEipJ}>sdU_reBJ;Dr`vM6^lw|KaN zX%jLVx!fqX*{o%WRo1;0Fu}?+b0bHdS_s;Tt47Sa+N`#+M5siyQ3gXQ%$&+6qz)_dv9qg9a;v8vjaBfLtXW3mE1g(hP^?XolDbA=21{$paFB$lj$#PL| z-7MH1-e}H2UR&l?H}rZWU69ISH8?AMX*FT+ScvvqL;Br>S*6=`Y2%}=_c3rEyT7^P z>fA^9bC0Sb_Q!wK8Ua5XY>5l^k(=+HB{51V0i@|Z=T6ihTUn35W>=vr;NsL^r!hn! z*?a(UC>z}rGK7pM&mtEP1aSEa}&SLCeqU#{tne(*ul$ z7Vg@DIY)!5P3*6rB87pnhCLagpIR7!Dt197m+`Oh`(pdiC{in!O;@3BNLR<+jvgNj ziW?=?6f}e&!(KU{O7;>DDsB|ZQB3#qDr3@Fvt>0W7aYk6vQ@Y)hZ1aY^A0iVtW(L6 zLU7%38COO2hYZ1^lZP#iv(#qM%=t zmWb_h9k{G@^f#&CdnNZoNXlhdxl;LF89mYrui-GigVET6D77NgzkWepDEVhRzP>@7 z{KcqXh42x@mbd*MS<{aMGM#&~bk}aG+^cZ0JZ}iZ?w-f_qlMLA1)!m{ z5(7uj$wBc-fSf()5E4eow;rs6H}LB4n|_!GdlO~z$d z3c6A-YvOO{SJXUr3|{BNCnYY;cw=3F%_DmJyT^XLbFg!t9Zcaw&eLLB}-X@ zu>inLcaIymVFp49?JKlt5&~Md0?`$55V97!?Siu{sK?>*jBXvCh*t7{BSE~7m6A`bS32G3K~p^H+w9PeW|v zuBm$Bcl~Z~I5zKZJ$R;^U(meIeW<%4L5?cdV#rYSL*Unx%h&ove%O))Ggzy_9WRX+ zZT?em!yoQi1U$y?(pu`}sHqe|9=aJ`s>ZhC1s z+>o@P9Dct^O7Se-nFHp<7l>iWj@Ap12so-I-y244=NFTE{yhG$&J6jBc2;uiH^;K}d zy)aOc1~Roo<;Fdq=Bj_p6!RLvB4||D@Ubj|%jRMZ;U>qA(h*tQhxHZIQVEjHc~QwN)M=9IDHUHkL_Z*|%KJZN8VUe5v zjwzyjGJvta&g~1|ZP0XE^MoaDVj7%2d+s5V;JF7*ezfg|U;Uq^Pe*0sd)RefMU}Bg zrT<eXx42U9~sjPRxDz;3!){ecyFtIwDUM0hCP9Ngmz;VS4JAM z|M~d|#fsm0hEE)wy}sa-Bfss)i*RFVql?*q5p8|sA@}aMcL*eHfmx-wTObCn{-*uW zy1$J=ZPfy&NgxSrqmHIJ)4R}sOa0qX0gp-!6D80un+wqhg0E~awYA!s}d 
znCIrf!cBb}Vj;sSQb>TwIz$6aU_5MWaJhEX`PBF#2sF|5YPv_Izb0q|);1odOe?GF-i+iQJ(A z`fQ^p!e_wUAJ`{n8=t%v0q)2plu;qqpJX?KMnPA1g82pQ99m)#_y4-6?-m}<4(M!G zL^t??Cx|$BKb|=g z-4SQ_-bAwV^FsF#cmd9+5gTlesjT^NS80DG07dJ&dhn2?4I!?o(vQE?aVl3u>AZC* zI0uQqZXcNRyF@z&SS$(KeY(V_nA78gNyH&j@DbKWa^sVA0e^gUOA<#FioL`nbGVc5 z+~JK%ASL&Lmckx&R}6;4x_8 zfn0;{bi-$x{(WF@s!3?F4ySlF_oHnG9`3>2ELajM1+u9Gai9)Zq3-@?&a1KEC9Tx} zZ%kjNjQ7u5?7r^GgOmnsbOYzu9#C_7ic;9>ayp91fDnXzbgpaJUREEODy6aa9^xCb zt8lHeRDYM!33kEOv!RisnmDm`e77X#J7^n!q>;FS@?#~slkRjdbJ6?IeLsFhBQrxN zVfiBEmgja{Q6zlFGGX~0urk3mY(AAqF9HXePO!pn@S#O)H+6(`7Zf+PntobD^EX|+ zCE>fOdvGF8v7uHSt5bw2CV6?datUwI?ZZJwq%-y}EdUKyU%-FV#UZOfDp?7smPLlh zAKxgd#*;@CJCmg7{>l>Px@NcPTs}!8UF>TtzIUjkL|Wq|q9ADWOXhDhWN|pBpV-f*6bTC{LR}A3_5X8- zf~hs!D7L1J$3V$@O@qnS%`Jz_>^*G=gY{Nn|KhPn0q84CZil{O%uO|I$;F-od={*h zd$e!gUaTV0H=w5?0UIYnIMHD~2na}XEgVCdOS?NWj8&Gf+LHe&1{LxS-81rP#}2b$ zvZ`$b%LW_@0U`{a@NmYA))q{0VtznhS>;aWh@Q42#kciUHSB+Bf9bLy^j3mEv)uy` zmsa2)UyFndwO?x}Lh?#W`FD5R;acGO;3+R`f9pz9va(a`3v+~iR*0$W&BZrZ+ zCeoI8)zNYW&JSzvo3^&27j+R%biN)F81*5sZhBm(A-hd_pEc9Pe)ui=I=r;GOL3e! 
zo~fS)L6_{w&eq8BD!)kl%nkQ(P9ovbp!-o!Geh6|v3tPT8p0ow(^j6qw`X%oqfbrV z-+C|YLe3oW^wyj5* zWvBm5T+cm2`wV=ddKz9-!9;ToLQiRYxj^#w{wC^?f%HczJtUR8R&$w{?N4>U@TC?} zkU&TkI%jU1Y3jj|Lh+woMi6~c6&OiA;}PXRsTe`1axcfJmYu5KNhx}SzOweWeWv>$ zz=|lCM2&_ip@7_DkMn+(n=#0S0c6gDuv#6I1f4tjiJZQp{Ss?ac|n&alk{tX8lYH- zO*Z?goL<)=?#YGLcr$lq`LK616Us#S7#HTJm7#9x{y$XcR%BbanUw8YLBZ1fSeo#` z3tuX~tMtw3Myq$tvd4QCZs6YYFl;ibj_SkCT><{APbEmRDdaVlJJp#zsm`hJN*VpG z^^TxF_jb`OoERXms{xl_tT^wPRbby3BBNTgjv;{Gnwu`|p;$&{u9Rh>24gUSIclhS zPFJ^}iCU*Bpo^p4fLFb(fYLY?ZD2F95G`aJno;(^z_jyT73tHct>PYJ;nvA^{^=*E z6EiC*@e1e`G(Eu&fymO|fW-yturc3!qz$&=t*n{XZG~u`hJ~}>GC$#F3Yd?kmjhFw zuC&fRnp7V+PA-b-+Zes-Ix{C?LK`vkEFF?NM46}9a$h>}s|YAcH*QVwJD}rf_+yXt z%DbhSfq0=o1Au~u@M<}Sr#)>+qT_s=g>=(zXioaLm@*lc=4aQs-^N8f9?O6mUYA5!1eJMtB1&CK z+nAWf{wZgAR5<)5(h2V)+YIW%Gddp+q}K}nED+JDl84~M#L3`cPZWF8ZpKJOEBPWR zS)9G(2H?iu z*4Q6;YY(c9oqV!$y#kFw)@OPP{LbT88PKF41x!lT*?3;!+)a;02@LRmTo zz&a(4)6hk}tqx5|J|4yK@#qEwSF(c4bZVcA|hKOM|g(q z_Gd1U+0z{Uw}3ni@06-(*kpGb%pdQ$%}9GVSzGbu57wO^WrBdArXEB?v&xb?cC3O| zyRTI)ri`+hRe2_lqh7x`aO7U)$gFhj?suFFqHq45ll=E4 zUiO#sqw%?;rel{1=~WA#amQ+WV`*RLH6xiFX#~8%s`lIlwT2aKUYB4$)z(AI zUqn36_3ytWXsPNT?_E8R+gSo2 z*1!C{k$hU+<0s*yVM+@@gXTixJ>9zc1;RU zrD97C8(h@``i;-2hufd?HDlL;5#k2HSV*8EXwb#Z6mW0O1Sy;2C|#4`CP`lJey-lP zs;wW-{rX*F$s%P4NJ>!_wx%m-1+C1Yw4`!^^ z+~Gn!;VrMTw;O)fuMXL_wj1t1@JK_AoCX8olOt%5BAjli5>5@s#(W!7;rY0cKo}uj zaC@+rWxttC9tD6ild8{k6ITAn_>bSa4_K+V>lzpVX;`!z2hi;X`zic8NEX9c`}Lv$ z!jXqEwt-AXV$GvVwPU67DH&zVfcXc@lQx|Ibi5Gyxlg7WJN=+x7qLu-aCnG0nnae9 zi`1z7V$UYNgYnR5$Y1X|^F7FaJ@DlH}&`!Xch^;ZsG z;(%()LIM)^u3W@t6Rc;6=~32|3j_3>?fv8^J@Hx1#1!Cv{x+YCrPr|Qgx1dvyL;Av zo~aW{a@jNXIGJhpC{kGau@1Pfl-PU0ebIL3nk6ZN%VNtA!Qcl2DQ;(~@nA$xrz`H_ zQ0P~N*!;Epp2sQARW;~WRYj=RKmyRRmIbGt2MUYfhlROirk^X_yIpX^Q`G0TF3}kT6s1@8`%16FG?La8~NyU69bt% zE-gGNR4MJEmGx4_3ak0@bW}d%S0t{;aw`v(opG^qLW=12oJDMy@Pvk;8DzM&M_|5$ zfg8c1b!eVosYstQ`U*1h$znC|UjH26(!4)^0SS>K=Sl41}4JF}0L&yGo1YOj3S4R3Ef-fYYD8_p{Ks?SAU!>iKHEXT+ zO4t@)LO!v^AXLPbN|n{lG2sVjrq=4YKu%p;95sA63Eg^%hPB%?*j(|65ucoKENK%y 
zYY+7Chnb=9pewuxZ<{Rf7R&?5w}`$0sJNHsK-ky&54%PM2Pa@Nh?p zr^=mvg4o#y_~wnSH>yH}`KiYe{23{Yli1qTIkqRQgE4ISxtHdNfth$6Q7axjP zTSzi`*Lt$9JD*|8$+Z+p>za$sfIE+Qvsk=U1qGr(>}6$nt&x?1A^-?><W{hABmgR?l8ev=G?U}BDt`gohAK?NcW;GBSsJgV5>Z&pVn^(}XUOb2JhWP2 zE^>!QCT79iFzM1D=KGwQs3eHC2dvj;QXg zfiN-AG)@p;2N)iH^$EXMcY-cPy z=Gz{(3&zzPRW%Gg4AOs}vv#HCJ~i7c3+kqe=l-uf&T+SdYEdNpS1ZhY=G%}MRw5JZ za;SrMSg>^O%%fVxDN&aN8WI8vq9FE4!xS%>+2f^F|48p6lO7H_U$jgGpD9%Xi!opK zSPKq<$0TgH0K(wCA_qPcbt)t;U{MnAzTvS4jmbx$>L3?;LO@^bR*ZCv+mK$WJso+) zOSBkZ?Dzan#sU%Xk(zCq4Q;w_99OR0Z@-+eI&|J3x48Vu6s`HCff$bF^->)S-zBTQ zWmSsDH5*c}Ev_FVvu+0<%c*2mcQ**jo6;)uwJ0hSBV$M9*(8qgyNOEoNB8zf41+si z-+TGd&1Iqmbt~r!@QM_wUj5zS>K=8~55KE}?Y0ZtUojL;*=v8%h`&?oH{;2QMfMYc zP8hb|2w%Dkoh+u`oD^IXO`LSfI&OsD6m|q^2A=`9!MhDaQl0126Z#KwE!IjAJ?eDr zQcAC}y9fV97Z3bXZ;&Q&p!BgO|Z>uI-`B@b zynGOu5OpizU^ZYR;~mKc+yv6bHWdwd+XxxY9vtvxU}>WUt{@!#6$fW6-`p)K{4W7# zD+=mp@aS--pDR1VL_!-}@s) zM{ckmSLh(DDZ3=ZnRt@W&^^5hd(G|RyZjp@);<0 zG26DfqE)(|E*d}ZoA;rx}2nJ}gSvvqq z`$v6l|DPtLO&S9^J z|0uAM`C|5uH~Bcx6h|DjAG)H>1z#Wjg%zG|eMLC+bs*FCn<(u;Md5`d0*JlSUxVX{ zyEbz3+7dBV`lyDrZ!|6f=e?hkFCx?$^q|xTTnq~D>-~tOc{;;g zVO|~~Y8?SeoDFUdjLjjDY)m^ZheTax&TfH76(S>Ls|9#^BI%apsLh<|VTsxe0HwuR z$^H(p>vrjdD-N9wpWGUSg-*deex{K~<4kJeR8^k5kxLTdzA(d#>f00}1;N1DxMdJ* z;50QHN2ll=;m^Q1>W;pi` ztzUfZTKw8GpEmbp19rr<#E;O^97$yJG?u{q}rIPCsSwAu4^RW(I8D zr5-|0{BdIH`~v01D2*TMX-4}pSw@uMN*`Rz8VQwj$5^(3=y$i+#uLXJTZFLv-Jy!J2;RxYzv9Wy(l>IA+HrVzt zKo{${mt8RV1jBuGM6!nNz9lRp?JwU1ae@>b@`^>I{=rIkDHzbg!u*D0yvmwk`LF68 zYe*^_;T;m@TS^hxZERQs`Hf9;aw8we-PrOzd<&9-WU$kwZ%=T?E^H|OIVVG9Ukc_C ze9yzp(Z|6qQ#QMTrYV2{90{kBbbqImBc4BdT_9d$F3dV&b=(m^yn)&Ou^~-j7=wXzVfZp1X-^5Q`cmlC`F{?&R8)T9q%rm zV+Dv>Q3$gsFJo4RVag;3x1;q3Ip<`s1mQ#1V;Dba8beE65M8R=rM==Q58#Fr@-~ww zCVDUY3_rTMHvuO6JnYQPs7Fw4=^;e|>bmgQ=^?=gnuop>x&aia@c2?{WlZlnF!;CE zMntWn%`O)mX6ZNPc+i7nzFlSACT=1b$%ZUl3L>=5ezo8acikwHqCo4Bec4?^4!3Rz zv&V}l=ES5rNfs~Rc6h18P{U@k-e(jo?_%`K4dBQd6ttCvm^Q;RP|bBBP`p;v;42cZ 
zH5v~Usb->+^W4cd4sN)%JVi`KaekfWVT8bFw=TU>>yEtE|JNC<}@bHzI!7!y;xb$q_f<(%=UIMoiJqM9+;pN0%!H?_B-^+;s- z0br-J)XeUkC$eoPsH-C{!2nm|QN6D1i8552I1<-+!>RL|-aD0et~u=5thpphT)4=i`9R==DusBLZPOi_4*jNeU8+ zgEU2=cEYgRX5=SLHp=k9H|DK2Fpk_>^&kv@oJEh(jOJ;E%|72}CpOPAOWbMQGfKNs zy5!;yf)5_Vp}OP}2#={X6qL4SQtMD-rAX6NYn9BR*tT@`&Ot|{J{(neE68GS)b{d& zqE@)=dUbzNG=In9`8|NYucegeDU3v*ZKjh{er7n-mLFI;t0^pf8FZ3EU~__iK@W*S zdY2%#zap*SPDG{E)_fIzHOUHDUqe9=Ztw6x=!<*YCoDcP&|XFCOHS&+59Q+3&+?|s zfErRt4eUCt;O4?Vnk98QB%4o%gt6kN@a&L43n+X=h3ibs|FRe zHcHAe{!zT92yO*uTMYfv7=9#$02tnrh4T%Z&TMNZ%rDC5 zS1`R(m;_tVGJ;n4MaSx`AWoDWV0Q;iOivvh_GgO^(A9=~LUXcb+ zP6Q|qRTuYf<3dxd$AGHi1m&6M)j5_s>)A7^;L3m?n8m%))XjU;o#7+M`C7!>v_|@E zYXN%bvZqrq7pC--m71L3(JKU_$zCBO>+Wl04~-zWf^)k7iNAqGE9tK$nD`@t84heB z171c|ej9*!vYP$S%6JF0{$-v*kLWxjxt>le8p^jzyr`ht3et%y=Z2PxVLN7^LES$2 z=p{aX^fX0Ni9=QacnKQ+LO0v$q_3;^^vb(x_B75h4qeQ?XDA7 zr^&SrHdoTkY7KDrCTYG(6>kVS@n2ukR8j8W6x^?i`bhj^)dB=vrJ>Y={iBIC7Im%Igbr?kkR$w-}_o#t3GORnZQ^?LMc4n8vzc8!iP(Y5Mr zD95rxq5IsvICu*JZ3x>KC2NjN42-(IBBoqmAZzx&RHao-nI#aHO zobT=j%e0AE0jT09v~U9Kk|=&99esKri3P?`Zhoe!6wxBy@#5gu?99$n{#88p0- zjMX-g<+iQh2H6cc>cQ7Pw;U^O_er(tDw=sx#vta~aNQVfsPI4Izo7JbMs4sw-oPO5 zX?0P&ZJN@W{|EDABNDK$W_BvY8^ryRBSWWP$WlfKueo?g%3M;3jU#LSk}mJ^&q*&o zlS8U}j<4jvMaB`;hrpjhw)`w2jx!XEFkXb~2~^k;l86>CZ$6OoBjmV2p^$vb6#SYM zqLI5y>Oa=x@e1D{_k>k-?=lXCDK!8E#~1dm1k;sJUIBfatE8tABE1sTz=Z#TdnY0B zOq-U2stLJz^ca9^S=sWmUtAT}i>9L;bxD%sxFR{t z1+4W1qs#>lMpxsX<(}Egk|K#Opw+jQC6{AuP`XD1lcM$C`8Xi{7I%)84~+a6679Ae z&2`?WAPZyH`g1y$O?EXt*TZq=#Bq>4mqL1l!oI@|Am!B}+V)&N(C6PdLkL$}6%URJ zuV?erVHLb6y*w2vQVi5hXoviC-=h!g_w`6axxSkoR`QgAnFuqTJR%dDKQ8MOm@y%m z3g6)8)E+G2G5Y#o-a3@=aG46H&Tle>A<>FTe@8TotAszR1Xp!ub*;@dTy>GS&Shw< z?b$f)%68n3ijEz|GyL97`x33s4CZU*wL1`6Mom1{^h?Xvmm`;@uhh$xxXjbVKJIm{y0U6Pj9VCsb4x6&VJRL@TM?!pJC zPcj~Nw~Q(3i=FEsO`|e;ce=EqHYjFU@bhJk{a`p}$7Jr&<3{H)Liq;^LD8tuSy;Jt z{zQ#H`4q@mTFf?4P-K}oM$C&D76N$9Ev^y5vQ$3K*dlf2Iq;}48+Ms;5@ar7c1qbt zA2$WlxW3;FV&_nY9B`Fz7-W6!!A`cY)gVw&;T*w6pwL-9XFa+>v5LetdE^x0c{EX#TEJs{dDx2v7G@yDH3jrgT{GO>MtFKcI%waU{h 
znQQKgj-UTCk=&s!^pn}BTjCe%`h4r=A512Xjw{QJvpU+{)JKsD({t8tq zpL-i9rs{TxJ)fZPfjIt&^J4l?_WkBW`?HbHgDp9Xlgc=bPAvI9D<7Nc*)l@C)!i1gP1frrYa00JuI1uY5059 zEz%yL_>D8y5BQOnOMXTZu$}%19WoPk1pYiG0uT|lrf7xN)#;qY0;l5PHg(L(*z10g z(L`25{I=Tz^-c+5(m&=*gbL}0ew2A=)TdqxWens5uh^r^V7g(^y|-9)ve^q?-ThT7 zToPy916%Kzo}(QJv;@)a=~kitVcD(n+CmUb^(h~}u;_7Go#3SEf4AU6RwcJHrQAmce*iP!wt;`sl8+(qZN1LSmLOMfO8oL@o9-DP;UmEp1&6Ty%#>m1opQlEpm z0A?b=_jW}wSo6Y+AUzB@I3@5d*&i@L!P3-Z6;in_Yd$!^lE$v3i0>}}_g9HEW|gq2 zg8wjN`qUA)Ymn7U9eV};&ZO(P2j8lq{?G6>5Tg^*@yWub$e7GN2&R6N?Xu~-SJ0Kw z5VRSgrXL1Cwb6a|i7xf(S7j>YH|_5!vz9<|_W-hO$zPtGE%6#YhaParGJe{zhPqTV zYzEU5HM~Vt3Rm+Ss^vzvalAb3$Il`ugovIAOde{1$}QMuB}>xhOTo}~adZ{>`l0Cn zcFWjUhDJVH9CAjt-ZZqI2tLt!!i_ajoA_GEHC`y@NHOZxxIO}dl%dQ%%WTyWexckQ zVv@@AJh1Gn?k`)1x8Z9?7bpm`A{8#r(`xKUjHb6tM_jU!o@ELER6;;xiV(k(gT)BL zJ=$;YYM&ZWzVfsqbmw=(wZGV>L}1cOpev$Dx4okqP)gQgdioM&Vi@h#nF zF}4ZWrcfNk`?y~S=iicY<@-zKnqaBG_62oB-_o8ZBLYe)4w&QpRgM=|B1*Kavt1|NUis)j$Qx~s5Du;+m(CXXrklQrak!sj{vku zt{%muDU`QH-ASTSSn5rroq!(;@FWbWvA~RB_<8bUwpB;5(>mwCf2QAs$J^gdK|@ve zb}#`-uNCb4u(@VTezSiU>)>0g{#*SG+_9IXk6y* zyhF7qL2EZrIQHLeE;6s|7j#8OSjaHaUxRDBWFsL#DR9ITR51pZf_Do4^O*x4jA$hk z*APQzo|Qj>q}2%&WrrOJ9|LT!o+`;of<$gxUovA3(cfg}Ron+}AS&|c{5JI`cHbKl zUrAnf(9t<r#w&%_>`Tz}X-ZBBj@71h zn8z;B%`7@^G^)EJ!OaMQlFn>L+sw>bDRb#eHF$UP^ecZe5n2?ygg1ADrlNGSez-DG zhmh4F#NAU4y0i9n+&fUOK$k}9JOX=*(u4o9cSg>PoDyrXt_n4)=^h$>n74pJ>Erx>95U&uWbvJWawhDju%Hb-o!L(?MN4{#EAOAW3Bu0_;O&f+BnB8IKvq~I#l_#}jYy8SyQ6+506Jlu!~C{ z|K$LNchm;A)JE#E$t&*4oSyoA0fDogr}3`!HP{cZM7q|)I@!dr;~BEd?p8ItB+OaG zRDh>G5!TIl90bm^f5~a$0GwF7)XA%y=+&42lQWL1Cmyxwso>&^9mA0lZ1;*Qw(u3?~M0i z3qBpfb}m)UX3*I6lGBxSz^0he2@*p;Wl*SGj(*$0g79XV)Q#jW-amy9!!89dW)H1~$UB^o6OX zFa6iO7Z~Yg%Ux#MBo)43^|@aKXWA&Vkybu5@0uE2fd7?D2Oy`JUp%zVSquS?@YdQ( z25N<%=|{8ML>bY9{GfN5Kwz>Tmdm54eS~&4Jm#;#(MOBRc#PP z*d|pxVrVaj-GXx>s0E#uiqRpWt~Z(%u0F&y`4(-PWnQ!@7x8aa)Xp{v*Q$Snn*CXf z_!Jj4kxsf4E5opSswb@3ZDC#3OiUz@P!JAt)s5nI{)027gH?2x%TU7_jZ#n}l8}F5 zkmxSZ#INF1$>5|;97ScX{v;uN`X(FRTP%8+i;7Z3*ieA5zbB^h0SI;)hq;L(OR*cj 
z$aDf<4E8*^x1#(isQd*xuU&I;u1CP(ZU`f%8G*}_`dWBpHpu1=$c;_Cm;oorS3l=> zC9VQet$G+Hl6w^oN8ef`6)iQ4bzQ+Kc243TEMtgQ5aAHbda^JKmD(5lk)Tg`g4i~K z^Me&jXgmd5R8QR9w4qVXbh@@=_uf6llUOkkr%s0%l1Z9Jgu}(Nd;ZMS*s&PfdDOhv zmvo?W1pn+BR34JOlF%-C%EUz1>u1envOMz;gFhuE3$v{btuU!Ty*{o`K9Vxy_2uJ) z#4xd5q#zuAV5ML<4*Ddd5AkQHVKnJJyvr;1xW7&*&eKFk#88=*8L3$^K%$L322lE; zv|BHVtuX;&f|)eb`o8%n-1eniKUqZq9eBymaj1p)RoKW9>I(vU=%33rv<)tz{$xaYj$Ko(9@mrwHiW#?UUbt2*NNw zD-hGneWPgcWo0C_77K&{rbJJ?)P;5JlMNZIJ45Pr#RVS0rs{1)-Mu!7(%I0kd9ip_ z8-D_54a?Q2`NHZ#OkM#C;RPTMq39Fybb_#6_LNHsB3ayp1f)4DFOYIPbHoo9uMm4! zxLFmHRQu(7Fu%gs84i1m{7LHT{npDI+eLB-*cE3)`}w$pVgvoNoRp;1Az(Lp>R`uNB`boysf!3W7b!R#-5>NL6(U!sFtV3q zM7(XRv`SI|BFJItXAOl2(NZ-YGp{aHK8vJDHDo?26rQ$m*K+!5eW%SU%>f8N$3?di zzCg(F0|e%wGEQeE0FB*;sq(a_+Bs!osdy652#(Q6Jmah9jZ8- z-Dl_5Y$bh1uFZJp@rLP|PuEtV-k$UF$pAEf#h%*<@V5qifXGdihvj7P;Ng*_3=LYt zCdI5?Hc#dr*y2$cWbp7{?o6NVaZ+W4(RuJ)U@c?E0FovwR6C*AQp&m}5U0*_)t^!| z1m~ls-FfIyeLt~lNrO&4A=v{SsH)eV2L1c+l1Rl=DHWI zT+IguV9y)B!?okX7%r~&Qn_(p1)qNTwWR`4HALPLQ8)^PBX7bP^zc#GtGgsjL9mWN z;`VghfzVJ`=_j4}02F&}pD9j;>>BaoZfqMeoZM6XGSt$`$9i6Vx2-Qq{IBpRf$M-3 zpDn!E#~FmB-~L}Ow1A|j2E*aH>1QP)^CK9%w>$7_(w5|{z%0Z^1foI>YJ3aM7D+G# zatQB(e*151qUf9ck&Pz_?bulnM*sb^LL%Qt_3MCe@&tsq`CTsc|4|$F0FP3bvDXa{ zWidxHQK?8QhzLglcLh%AEl95D!#hQ3C>tiqIvA%q#;lB6gd(cp5G1aOTSI;73qQ~m z8Ub$ulxskx8=vK}j0hIZl8_v#xM5=ON|>>a@b`N$COI63H3Sa_3m&8z)7KaIRg2V} z6@`3xwl8zh2MhSegf=L)m;zGt)|^sR=z8#{T|uqfeXzNMyyH-y?IAMDU;vx(Vp+A4 z-TCp-5?0Gl2SfOEk=N5iVbs7%GV8XIfKF)kYwQ^8IDr6u5hZH~dq{4NFpCNSt z&;nPal*M%5?hU5TE{LQ6z#0t&j;`T$yvvgu(wWi166VZ~0&V$WLAqAN?XuY)@NJ|d zijWO|CxUdMn2eHTa>&rPfaz2slQ)$=upZMqTejj!nct;AyF*{<5<9dXQdy}l!LmmH z2{uob@2OtIC|_1|`!ES0YYIEJ`&A`78O}3FYB^$EifevKYCHs?GSc!6D?ECm5Y)cE?Ob&3~5|jD)#%9SdON!)I0?2#GzmeMtt7LXJk#!vv7l7APSF9Y%0QDUbgmNnSY#O*46@TOaJZvYy zx*y@T?D9Nz;WspeO<6Ema|@Frz*Ic@(VilE8(!JTkiD7`6JSqkEaw#laG`u^#m%|i z#@&zarRLj_&Yf%HQzPQX}R-=klYCRxmZ=++Hqpz`vaD( z82^2w<$uj8(Ts=U$CEPjK&Th58b9c2KKEpeRIPy7qNv1LEf>%XI`k_Us<$@KI zi6XcYDP2)O_9%Z8T&{z~C}Vg} 
zqyr?x#Zg=;%^b24m*5yDPF#xMzpY_3*$}uwWE|_{Km+lPAyWrWfos!74kFGpN5#2t z0u4U-6HX~*`;d21RZv%V<9Z;`h1BDi1t~=nE-onoJt^7mN-H>no^Q_qQ1id~_CdDW zt3vhiQh2Y0`mJ$iW|U^q{8_*OAqGu~72#}^$CO7qQ(Zwsn)?A^p|;lxKV$dyBWScjd5ZZ?am zR6^9+#l$U@f$U$xgW0L-bZp&sfTLw z@XnTDNXPxon&<2rqu5&;ReiCqfNGNqP)`wN*TFvyAse!jF+edvpcz=`b$MR!^U&+h`oYue+0RR+NcYRo8B zahf(&=^eq6e#B-Ce5#tst;m^@)!@r*o;+BdzpUgYNS4R7MV7Yn9FTvej?a;fK2LrG zcD$qxKqZ#V3a;3D6HA9#alFk;`0q@5Ow!H@c2eSv-&$$iOzfR{**jGH8`Ym!sEba1 zGv>3nfujC@gRx6GN?ZZn;#8fB6@_j#a~yNKed&?0DZ@J)C8#fPhlG0cxe8IKp-)Xl z){E@f_F<3jvA}1w45jA7#w*=P_C5)^Pe~e^fva|YJR=^hsSAGy)b3Dfk99FeamXQb zOi5?E1AQ~z@7H=ySRMfD^loe8xBR|wR&EswR&w&=O$jh$X9aZ4Q@%kBOO>ZCl7xOq z&!U$Kgc-(93P(pm@989JY$wTel62@lVmsTZByGuxz z6IBJjT3%I$jT8M`ymkB)_kt-e_1roJhau@lKr0%3!qu{!4zIHtaH?DyLPmhZSgXRL?$neY4 zrHCk50$kDRiE^yFB+`OA?AAN?nTxTIgNWyRQL!ekT08Wm_MLYtSz7n3r6nru#d7nG zWo?g_$dtBt5XYRP^v^ZSmsOMy5l?B{?zA8z<-nY7dZQNZUytj?uhP&eSxe~&tm7L? zUkgEj?SXZV=kZ}-*Nm$vY^czVV>)TzHfKPeuK@@02s7#*u64iTV0zle&=|3>R4z-H zzPZ>b)p*cs793Pncs1qy46C?74l(8H>FaO4sTU4g1VA=h*w5ztu66O2i$s!cSUF3s z3BQSV;)675WpKTInBUrO73{*enmQ9llh1C|=ddeEIUL@dplp09n3$5fErS%?zk#6u zK%p!IK!mS{&dao!g7t8T7x19}2_>V>RM$d1Jg{{$ea$X{fs1AS*9mV-@Cv>Rtw@)GwE#?)2Kxm+?TjGbfy){wQh3YnfPPK*BZksgFmZeo307NT> znr@+6-bDJn4v@VFbe|&THv5(*DuDGws!)z8=4d>5=ncFm#+=gZFZYlh_MP zL)6ue%hi|jM>mVmZ=Oj^tfr#!Y-w{v@RpVqEw1m%q<+5NvFF7R{uYDW&pD*sBPn)h zT1~`G1O8z0$;5>w(D!>uAJD*6+^`#LJGceK;_HLktJx2ufruY1RJ{U?L;Gae;sggs z*EonaaZuoexzM5r^TP61%OzCrVOdW>D)jwVi=N(|Q4K7E8)2SgppjcBt!^KV(@ z4@D{E{J+ZVUX)DBKnP!7hBpyB2Mt&}L|!m##4=_7m~h5G!LKbWEIWhQ`UQ(KL{JmZ zfA_VciS)Nz*&!Mzs3d2t4W|J$O#NHP0-D;~Y|^W92-y~^XduziU5@;qdLN4$TM`10*ak zT@sZL9I(ZRwQjbmrZ3JXs?wdZ=d|TTUe2~l4g_hntXR!TA#0TxC$%v#%{v}^xeb!bcjKr_;I7GnZGNe2$% zvt)|2$PJ19d(QA6Yh(k-Iw|@7@zK{!AtasbP69>D;3Xt-ldjY!j&_~34rlQ0$=F?P0QB!eAPcr@O!=@+&XFJr&vOzsi z7?zpj?S~@4%=`_TOmhXCh*XeOx7CfDAJ8op!ERFx=A^oDygs6 zX=XWJ`@$eo7MmkRj@I9T#WF~rDRqZId3y2FmAj!_WamnEptI&(KWu%o-*|(CgNJuf zM|ap^VYWmiuzN;I#PR848^AK|82f36xc{TqeLO*0paj{~`WryxO9Xqyd5yF7MTy$0 
zh*@)7${}DLqM!M)eG60#a!prDBvvqtCWI%+J$agUqblNU%XVC)i23dg2W)TEs)-m5 zgncKyf)7Q|L8%D33RA)UL%gAG`5z2#AIwu7J9xuVo2~(Z z3$qjBQ!AD6pQ%p?+1iOUkU^4Y9)54ustI0|`c{HE>0$o)h`#&`s)U!E3U0$BL z0d4vqyQwH`qsShWR1Q-OY0eF%_ANdw<(_hDiwLdPt%IQQ((5pMJ#>ue7nG#r zW3d6Pf2K?YBKGg3=|$E9kk}WJlEk`Ko8f%ZSaP4*z>4tyn{W|DZEcu$%H=9aQa!)m zs$h2H+wlfsg5;$U47g^*s7&q*%g0Ny&^8G}HH&o|VbOzk3Yt4GZVB0dwUI%FrKfe* zB$_9?s{ljXa(!*<9FrUz{Jv~|dv%nSL;d>~;`D<}>fvxcDTW}}ob=x$rfl1qq=5hj z`p{>tL_gkXp3Z!VT4uz^2uX1&KaLU*MK92={yklM5ebJ@B;JHW8u zLW=}<9lPW&9piiGwO~euUEV~V3qZ@`{y7e*V8%ad(%wyS?MU~@&+IDQp)DsAx(&urk$1( zVpJNe2m_%X@r<*qzxNFYuuk;mEJevjmF|qN84ghF9;^gWp2pCoXN1Ddb(I{iHKaam z0y}Xd0@E}A*6%$_y2Vl;`*stB$c?UvK!mGX4IEaNzPi=V;AwuQ52Hl7cVvq`T3Bk#+G46!2ts z_oX%|EFC^f1S;Uqp%h=KWqV&*pFvM9lkK=3`ZJM?Tbm!)H^+2JM>sWh^iD)5gcpgQ zx3q83ypV@LHWy{n*`mSFor$MpiL7iluJPm6k*cvq*an zUuE&uU5&Mqe3(zwT)KW72SfZ8PZJY_BfefpJ+EOzWM8XD>JO?hpf_0D1=KjcU;WQK zAX1jIGbGqMp9pIw8Xcr$W}(ggFeB1>bUTsUbS2wRALB#Ym3}Nky=AwKM zG*wiZZ1d1ZEr2rrCC;X;@1jF0STdp1I6Kp~kQ67yt{(4^(|5#PC}|ShMW>Y%1?H!= z=g50Yu@9{cWxV7vJmci}zeGU4tTjv}Z;Nt+_nry0QkmvQtYp2{3Yo|MBC%=+emo+c zm};^{Ml(tbk(PHR0^u1i#U^oeWbA#Bv%9poY5%F)T{C^lKYJ@(<#K-BBiX!+Gh;0nR z@wX_=ht53Coczc&PLPVz6o|p>0bwSe$P3j;-zw*}GuU>zd6k zYU)YHf5Ay@$21oC=B#qdty=k^ zoX)}AG#}F2JJYQ{iU!2ahys1qdPrU7pB6ld4iwsu0>&L6x~QFgS(x?ye!6v$m8|g? 
z#UY+^6*Yb`ia-vMRlWA7wb(=SuIxx9|Kv`sp`Jeo*AiAJ)H?(X$sb-V$04UEdf`!1=C~Mb;D$4`MNkSayz4vmmBW)=&uI_7<9s$M zQa0J8q^!lc<4MuO&k%I!0SlF5_3s*%m-OD-XL;4CzqNvLdWKUi{|t1_GPQ7i7Oqfg z-^8VurWcz*0Drpw;=`U0-A?pzd{VcS@9C=~YZmto4;E)uI8?pDxb)d$WM;;%BAYg< zs2tewXiYd+0eM?j@WPB}MpSWpxM8~YLR{Q6z@a*lP}+H}iPt!NkJCkSG9;U*#eC{5 z)I1ea;*PMgRqGQN3whYzmM^5fkX!LKqE>B<(iM=%evU3d>Qr~g8&~Rfb`U1rQfmZGlsI9Vl2#(T_hF-6Gm<#y< zgGWJuL8(`m;SgfKuM=~Y>0^#^NGYHQJ>%XaIfMVNf;7BIUR@EopRFXoRz+8+m?)9B zbZ`8keD@zfFC+Gj)!q%-EOmC3WlTvf%u1An&BK~TM06qRNM$7A!m95u`WOsnG3L)n z4Zm{J?Z1&O^z|mnA)7xoI&$!+ronK%l^{~lsh1)2&8}ap_>rNRP6Vgfs6(zbL|GnH zmf4HLc&GREdTGzS)zT^gDEW@rVGKaG>y6gMJ&2pcj6}olV~bJm?2O~^Dj&_K9={{p zEe|<~@n6TzqsGc?EFdImI9_Uhygk9MYJ7o8b7Eu9TB|rJ z8l4BE?mSb13$Lc=n8^P^1+sG1cwY{%$K2s2Y!I}$alCdypJZ8; zlo=sDlQZs*0OIy+0Ul*{0^jebkHeU_?l)@2JOpw>DHBjBs9I@3{-7bGB9F&j$va#~ zp9R%+JTC1#!4#%i(5lr*_IYEhj^$r0!M8Y-!DDqq4l#Mff7o`x3Bfph_{Bvvs+6V7 zFP0`!dp^&JS~=~5wKp1voG3~>F>WfGJfp*o@8D8_Cg}hi@z%t!;A}<-r?$%2DY}VQ zm$^+S;Zm`OILKC@^G#m_EY3r)I$T`b-lDhQ%Qcez!ZXErVlG?ssN`EThJ}UQ+8{8h zyoSzBMUd+%Z9^7<*u{;+*ZGwdqPrl`cs@drpjOZIq=>J((`!Kes@8+qmH$snMb9^8 zz43U+5=rgxkr(C{A}j^P=%@6>he<72-3|!cMYtVc?_GmG1Zae015`w7ythW88KLe{ zv6W@Tfw4aNO;-6X8pBEXPlYMNLZts5`CR_Yj(SOEV*5*ZALQw$gs&=7L?Z&g#oa8_ zs~|qijcuJRUaOQJe!oFn9M!)sqrG4h==UpyhfoE*!FqSxOv-MD*eCU9r5lRH_tbV@ z$9Gk?>Fgb_*=&iP)mx<19(=jkfz)R7WTZ1v!ieqxg8xo=Ac_ly4j_w#P(t1>2xHy9 zKv|U4e8iLq=?WCuzJa_$-}J+EtP}kJDQBy2u;!e}B!$v;fKC2W_*UX|Yeg*!9@8WY znh3?J(NFKrvAQ1taN{U@+jK?5IZL`(P2+OH=w6X+3trfr{!i)$>%~mO_RP}6{34v8UeLWH{H`K#l(B+Ua9uyL zxZNw$vcvcXH(R&F`x4;X4GBkQGo`^)j>1#BcQMf*t->)W( z6#@hOB4IC%!sIU1VXhexIv=L&fMdo%N8rtH;Nj`$sy^TEU54a&PUhxro3WufP(an|cD$3Iw+C{=e02wYV=$sh&irB%WhUVZVbPt9-A z|Hnxi)`d|Oc#-56^h;W8%u6u8{v^RE{D^03Q|LxeD-or##bIqokJ~g++Ecrv`GBD) z53$;YuGRRC_kvG5?}xyKaX;6%qBB5`C{FDAE#)Bc5D02%!NNljlAfn%VN?#|p?I&D6`>3fl|75CCf8J%*e zTOxVbBK&i=xgB?hDdWnYeaava53fLgH{otu7u4!VIfZ^df<6$sV|AMx79f|;s@(yBWx#S#?P4em|da`unU{RveqssbLQ z5HltLvfwdEa&b284o9TYZAvK|G$2Ag_9>?d;i1n+-0B3f(yCKu4ev$OmPpRXFPI1X~nfypX0 
zn&5zZ$-Cy3NLEVU_pH_HQn9(T4}3^V?d~iRen3c!h`yzZj*|xk&Z(+wevDB(Zmp9iM=*(2~nAj%~Cyki-%cZ{)D6u}k z31HS03&s6L_QH?kzJ+QGVG_(`lEA<$9zj_>!BU1jOg+XRA`6v&hC))(k!qIsb>|Wy zmzjb%o=49>!U~6IDfIM4ej}m?axu}?now2CFzpvqLGN#TI09=ko{G$0!10evsgI7n zc4CSsSwR=GA7!`mh=Gh78K(Ce#fxq~$%QG+ed{4aJ2@QuYTJ6Bn?Q|L!hA{mq~}Y! z_w5*}h;}w*qJy|jpHFbLA9HEf zj3$WqSrotxU z&z<^^ncyL@crXm3Zky>5R7&M+tAfX!NDjjJc=am=26{Qrtrj;2drREsF;k-68Cf#r z*|NI#;Mz@p){{FSjgW1!{T^Pz)+q2!Kh zBp$9E2+Tm|21AB0uW^C6f9qCxu5@-@2@&ZzFN0q+M zTIqOUl>xv^sisVQKSNMJd*rrj4iS97=J)H95X0X1yFGT>7j6a4Km67-&jo&UE^5?t z;%)wh7#*%?c$}Xxq`Ov9n!;g>ExUYoKacFi2&(sevR543h=|p@cH(V16wuh$V#Vxv z=-oW-Ntr>xKWcip$8O>Z1~kJ!nHZ<2upI9t{f{Ws+9nu0`($7l3h_k{$2wrWuJk`- zSUum_)(z=&+U`yfXif;CQ<*q=^ihY};yJ>_lIpfvC?N-_+274sft>T#O>3A|?;mRc zG^1^r0a~8RX$|n)87NZ;>B+kcYcw;CD$&qzM4t25W5DD}DlxJDpyH;1cpl5>x0Q!tdp?A=h z*JIUNNZ4k@wVKUf$-#BzJ*~iM^D(G>#qod}E$QTwj?d9=+Sk%y=hS@+^+$qA_?q#P z6f|J~`sm=%bshPtf-GfmEnW~E%6l>k;Cc_GpB18u@itht^f&>9Uh>F`vAhU}J}wEO zIQx$VSFjv^gYBG)On)~Uj;>)PMsQfit*2d#2BAQTWspMs`(qS4o-BpOD%$Y1^*$<=E=8-Z0Q)k9xn*a2&=pB|x z&hw4`0vkn-&I`>0ZUC*HG{^AIWkP1}MmIpnp%iD1!9e;kbfh`d*s1{xe|CAWu^iQZ zH*Q|gZXiyND$ z{prB!TYDO8Zj8ggY5&xz3EG-Sz#-cbZ|9H~K=u9i3AZkc6s-j**cfA0_0>38JWA3P zbx`pbm$GQx71LgoxOI!nw@V(1@=5%zfNxfxQzWK)LqC;u{)ge*GKS7JE(nMAH|657 zLX4@}>mx2!$s~kWADPc6y6Qg`{A=IQ7CHBY6qVnPH7rtHqk?0kLl28(b}Kt$(7Klr z6QPJ!WlepCCHOV03*-|_hFXB{KS?I~O@Econ}{{+7aFcJ0;lG?y*@8Y&!``uX^<+L zHBt1tCBSeE{&A#7ntdp(Ls6zf?go=sY)CbRn1}H@$rb>!jZ%P;fQU&huB+l8PNwH6 zrPGsS6BQa|Atres7n=g>m+%geXa(lDc*xi?$Q%d%g)-r@Uy zoTMUj6~2SO>s`%V%jXb*sCA5Blms}QRlx3B>#_JqG4*SJJ@f4@yu0SQaGNh-(t5&K zto$%t@}kDX(tHq@YE-7=Idii!I$8`rAZ4B|CuH|;)|t^a4=E`K_85VgM%@izOH_R; zt64tQqAd2@2CcTMkzjtoW)ZmL4$rLp zGeCuOjogT(T8qyk1|9e#*+(QInJ_E26Q8o@Pcq?q-MhPg`C=a|L#&)`_R?J@1|z+H zaS~x08WQYlMpMC2oP$ob$`$MybwB)&J$_afJG>*X#_loa1UZxZ8eHg~d?#c&g)v`Y zch>?*FSbln(%`jIPH2yYOY|zoNzngGNIAYbUjaMmu&B76k|0i)ItmkOfw5RR&^P(_ zly`$Xo^@Y}F9XRLk2bn{jvE|fFfNp?NN`!2tpuMHuiDtTP|{S?jPCm{KPFXT$f|u6 z{T=#umTja=__;)0ar|iYg!m&+->V|Oj!5-iPdf!E@9Z&`%ux)GH 
z4mHKhOeyxLrM6+pVRXbzcRzK@GpiRcpvg-(1-Jq5v>t#m+#t&>w|OEj=)d0ef|uzS zE2Szn$YT1OtDT6dy4Tg>ehn*Teg?CqT5aXs6Z1f;o|B?PYcd#?j(imq5j#i1BpXgO zo~C?0gvw+`H8L3aqSaqvuKn8;w7ler88`gk5yG6P1#*VK!5f9B^aQOXyOT}HK@MK& zLa&h<55{=;?Zur#zvZZk%PG51|4uywE>YeDrhbBfZG4mlZaVe1Rb5dWv~DvMBg|7Y zf8vEiPu2g%?1jD?h~`B7&I&*}DgpwGVudikvyr1W!MCJt^AQHu4_AXVQ!bSKD_#|+ z_jSm0J#+5_U)H&a4((|ZFRp7(v#tQmuo=JJnR3YUueWT)D~p&GgSu(|p`k7-cC`I1 zfbnG1A^gG7QQj(181Gf^3WrMm5Z3+dQ9m=LMDoB?sn$=+GIu3dGY<)BPc2EvFD69h zHmrCG`Qt!Ecc#m&C1Ms=}Pw@|I(^KK#!`(UbO*TZnC`zW-|zsX>Bjayr&zSQ}b-K`aPWOMYAnPY5_#X%2z3* z;&3d{fqU#rFmZU_A^o&+Wsj(B7$=eYRBIPAU!ls{&QG<+>k2)cC>d^#IoW#JJT11l zNWOwA-TamMh&)36ZZlGoJKGh)S@z)kZ;>3i+ra;omWm|NdrLq0in zWq1KkA|iI(l?1^|3nB}dVJKkycT6t$*p=6YD@LwjL%2P54wZ!@{F_pWisuOQ z&5=~ITA{z9CY1(bOrCX2(>IAen58IW3jvhYY1`1zpyhXsGz$_x?)V+x{z+xvp zIg7xg*GAPX`~?$uQ|G^|MmOeRzYRDaA^?tD>WC_#VuF`lldn~fM6?VhvzBZrUFujR zD-5E(T;9Pn>KJn}3Sf0=d&(w89w;`+hw`uynr8hC!|!=>o?-w@tJj3%gzO4c2Y2Yv)K%jB3f8 zr1Tl?J*YJ|4uJIw(-5o%oyeozv)UR4TndI)3=_qz3*I=DZ%l>%kaUm{Gum)~vrsyG zkIzyh#m=F?u0?fNkkXLpu?et@4;=(8+vpStfR)upXUHwUIlZCA8br3U zj48GWAvYvU>x5I5_x&5Mz#u&g{e<2N1$r*P zKlM$GO@BhJo@W|u8rU;NS|tBtW6<+q-TOQKp-X@Vm%IYoL1-x?mnoTMen6v6SK8@~ zu@7VhQh(;jH#&UPT7TPOA9XKdu%5>>_zrg}1uCLdjEEnhbMr}9?RnXNo1>F)lR6zx z1(huDM=lc9ZvFoU$3M43A@e!#%=RIuSlww`Rdtb-z<*?6y$b`OU&uA;17aA0t(uc& z)y~6of-Z6WXB!iaHk~CW?el=^{x|FWMnU7BJ^4)Z!HQSt5h1YHlTxUp)Xj%QiOr4< zxfqqce=S7OqKDsx<6#Rv;Z#p9V+{i)#q?+(WW_vHJl7IW-X<$?R&xM{r*eXlm zOiO{Jc`mKvM29#(LU1-Gso@wFU(V(IYAJGAY;`_BrMxVy9k}5}NVe3%HN0c@XO}^7 zHVe7x#}&AByh``_nqH+1SCPCQZLDWx@lvIhLP(Z8waB{GW62tlQh4Fzp}VjYlP%N2 zXGHfh1=RmN2GGIi)7U*}`|Xc*Cs!EF>G&$uB@^k_!L$^G*#(Pmi?t=WMJg>)W$S0r zW?T)8HHzcCcy;-+Pun>bKhzqSZY7XOdm@$8rSotJ)*f?vgd0*cqzFX(44Ny4F#pXJ zUIAR78Ywy^w+2q4pl%ai4P&p0kyo8o^-$ z{p4I5?we<+cX}hkF8sv`E>Nrjs;%7hU6XT>_=-BOf>H`PvNuOS(o>W*ChOLjvm7;2 zAx|sD;-(<85G8okW6@Ckxi?z>M}(E(w&Q27Xow8s7<;@%@>MiIz6zUXR== z;h^3*eoz3{trI>wvrBg%LWh-GUYWNf#n7ajT3m}&r!mD_|KZn^Pd(UiU&|D4{(sA? 
z3m_68v|k@k?d-mY3PsIz2N=}d$h~(gz&g340m;Q>z8_;Hv!NfRF2g%5>tvR46rLRX zY*$}TBA62dgl9^s|9W_wvITROI(f&Kz+tIMnWl(dnk;}f`;4GO1^HD9u^}HG^9*qj zxE_RhXkIo5-Ah73o--gGH-b4N-Nr?WCy?FDN?u~9rH{YV4cxSa+AkC#Dfyq05nPb4 zT0N?Kv?ZHX%d)oaq#^CRZcz}KO&s*pRY$D$n#nPjPo|2-ch#2fA8Mt(2T-}-en#>F znOUf19PT6-q%#1aYyfOk((LIMIhhX4^be>+iK81yC_@t^#FP&h_wgya?CwTDxDf;a zi)CU~mdqOx9l=Z2fZ7WUg32O8TVcMIq+rhX6rZ`J0tpIC#0=>lZ{BW>+ z*vVV>ROx)~&Nc1$e7{jLT*Ti1bcC^6m6Vb=vh$hivplq-uV!{Wst6vG8Go9}6$H`` z6&afpc?d=Z-Dx)0BUt6|-@pT^NFvPZ>o+)OAly??qCy#YdDbvgt^E2@{z#J*0TAeP z_4X- zS-f+aa{%+f z9{lqS`s9VS{=iS-XfBftTsxKBN*^xieo;EP^g#FM%-6Q+r!DqC$6Til`g}H=(H)8s zMrCTfYizhpkvm~5Au!>+E4N~Hs zqSH-|`s`^(&X!Jx3!MdpkQ(&4M(ZmOKv$lkdLai89+i(-n;%WSnv;K5`8#NLYur%n zoo`PaAA&x+w;G=1Lwu74{M_-;R}{K0hMU)rVFww@T%HC}rE zj@yv}gLIiDP&KFyx{pfRLgM3Svqs?f{`cFApqY_j>FS(!`t1}h4at3arwA;7tUt zHV18!`&;f>!dri=6fk37RU1D|n!n%m#~NkcvxLzQO|Ch$>OqaJ0U8@_-EWYyI*`4z z2n~}&8E|k&!I*k(?$_aObg9;usf2X7a=Z6KWxk}5bd;+#L?79xYZ${A*JG921g(zv z{yWKyXU@TS%NSXT2B8Bb_^SU8oU^P`@0Df=7&6z*GNnu&>J}#+#b*DS8`>b+6?2(d1y9CA?I@&07ryKl za*yS`GT_iAAp5GO;Ic_uUg9i- zUuzmi&97gzJ85p~957F3NbaDm)FaKzHg#fd$W6IVh8Ne{zciL!f4_XCUPRjxq#Rr$ zcFcrB;1Xj$J#vjv{n1*C(c9p#&fD9*kV7Js7YaO-o3DtfcF0oYpa|AAtqez-!=cFz zhXa-RmCLmiPKa$VcQif!jwe4Oy9Ry&7;Lee*JD~sbhp@8-VIZXQ}TYAuse}zfD+_U z1ChH!xdJZ1N2+!f0k5J_5;g(|X5o1ZwA6IZ7tvQ}JGP)+Ww)Iw*(RqHd2KQJA6EM`c3Gs}BCGBu z;8lg|^K+C0mUo_dItu=kGBpel{E9I0QNc-I_ z3P=(7wo90Sx3xnq1c5x#4@>kac9?LPe_L=&KsvIe@+{Z2*hC=F7nzAHd~2*L{!hID zS!de%cr#4(S^|&<2h7!c<)ZBxohDNhckJZ-3W7kniTUf-W~iB=M`>h>KU;wC!F2PCTrt3}mFY}PWN7*gJ zh5f!9*2~@EB#0<=y(Prnq|EBYKUehcr3^)8@VViq_?kh%atO#IDy;gJy6@jQUX1~d z;1qTBVn=?r`H6OtXo-8h&Bt8M=q$70VSSqZ|K73< z{TSBCw*Cq-I0X@N-J4;ZwbuzHF4km%AC4_E%#6cM-yZD4Fmx!xO~WATO)`9&6l3*r z?rQL~fgll9evNpR>8=^{&vs@~h_x&4tGaHQf>S0{hAU#Uc8En(Clk>d_5|9)bY^s* z$vpw&17r1?RG&EYk&hMMC)RxHM$&L0Rid5`HqBB45tIhk?8NV7Y~gK+F(tZL$kN1V6W`bq&Mu0em^fT@OT& z*iOt1GdNMPVb?O`mnmtK<=g%)sDNFa0FT^Jq7 z3m?Z(^-nHWy#C4ny{2MYp|$mA>kk!ha_a3^`r)*lDSt6md>~|ULxQNK;-qlR!Zsy{ zP%0mZy`T;Jd{2rVG(% 
z_OONC3PWp*U4K@&rM~k?w@BiovAp==K@JSg8`}y0ruMV?YEjVS2WL$gLeh9{)s#{9 z+ueeSJzv%)rKO+YlNt^)nl(7Lx(*hxk4U_8+z*t6wAIJuj65}YlL}$e z9hk%69>{Z3&5uB4XH;X3N+oWoTXx}Ddc2ht$jhZBV#bh4&XR+LP3x9GKR z;`ozBZ3Cm_$7Vkgv2Y577d-dVnO+9og2ODWjyrl7H>@W({KxWyveW#?yP8GquiKip zb+O>en#n!*|5sptEG`e*s>J295mg@GSp2h+YJ%#v0(-@S9)yoR-0s`~^O(hLZK78g zs7dR6Nz+&G@GX3Cm|HzKP|HfZNBVfAv|E(Cj19$|B=BdCUK4_H6~gIauEwcFKr4RU z772c$dte1$EX$;6hx*N8$<0-;frKYBpnvOUT@%nz2MZ2!J=AAT^#NN0WF`=t3U)3q zFxC)U#d&*-`3GyOo=dD$F2bPR#}RM+0oSL0yqF%;jwnSL^uPdcdbR9XcTZw^Rz>2r zVoPuA^Qrc*Q!}umXd4?R#}LaTKsh+Cm&SU*S#>u{{qTKnzkn? zD8xxhj;y9;7neV@E>WlP|NCFst!|JPxHQ`arhz9UTNeeqftU*Fc~Mrf5|z*KB%i?M z$K*Fxrxhe&ElfQ1_x_Vrq5OEG5P4a6lnP%Mf9PQ+D0xy9NtYw009S)BF3U(I z?+7mA6VXI!S+O%+T&^w#_bKc~82Pq)1Q&8Fx*g25TOCnh0Xwn1*H84Hc&2YTGh<)Au0eOdP)yN<8vX{k5&)?Io&R4TjiLWEG zDY1}1hSBcOSywknE;`1>gDWtnhiwcj}3@#|}4`a6gUvHXUe%og}_FJivL}X|6 zE7Gwvil;9cI-nt)O_7wRJ+7~IGKwW1(bBdsbg&nR`72*F{IbZx|599?>8LI{{ymjau62JcUYbFd zx$ab>i)HlRM=t+8;nn7!?xJ0&M#uSDFc?DJG}C1$zPP)gcwjsjlYw+4zi~h)0Wb}M zVsr%G?LA9M}>x#=Zu1q zWPWM>{*2LK`}7WHq~d%dxw z;qX$rd6jQl7MOGkCQj&A?lm(ALIutTsxlH{S>cHqWwuIh5MQeQ3Ku$A_Fc;|5P{=} zz}#)m>9$qG7gO0amT-I<44GC6mVklF@A#Nuk<*X8Ca6@l9Vse0vR1rH+uZ5|2957ZU+yn4#W|(4M4CvQoQ;1XHicpelRGZmA zpBy#=tb@F2c96!G6pwd}{L4r6gNEuwWa===N(AV8!C~L!Msa{%Rxnwua{gi#I}2;` z&Br{JUhFC<7lo{sy@0AnYSpr$`b`glbS==rz?L0pYCLuxAvrbDlLoiZx~+U^-c2lS z25fsiiuJmlFWW2|=kLcYiH{nPRq|k#hg(t3=txy`u$z5b zzrB+q%%E)vI#61(?pm@j5K@7Y4!$92T;a^%v=1lUd)%1w_L>iYrKjRXBJYLBW?3F3 znLsqLuSQ!SMquRav0|*l9848fq|Bq#9(ifYv{!#buq@aMSv|=bQpg;gOD4Ry#cg(* zhGGVRwGn$x5F!u%)2K2HkP>l&5b*jcN7CnMh`!%kcnTSVA{sbC3?9OyqFchf6;WXe7pN>fL>~etmUhqw`yDY4Fk7 zSunNi+OiRUZl#~Lv=yRy)PvWodm3V3wd!iam2P!fvfcI>M7MECxE<9!3uYc{O*K)% zqvG{_OBtrJjg(^9Usj^3O5@%FJhct=U1?}AFlzG|0#wGDt(UuCiTO-Ikh>*~v{mRv zGiS@eEE2V#YO>(Hwl_c>DBfr zY!Pn-KwbZJRf(cyG^pWi9ZZCH_^oUpRxI7c9<1$`h||POu4%C%NZZvhYZGenL06hH zx8VlqZcAVNoeT}g^Wrlu?H^itv8&w!vsiT)1fVz&Gkk!h!p-kO64W^X?VbSZkWx1| zd{aG)t0+yEW?t=$JH!*a;zojeAAt{l=?5(C8-es%Ckp{&abkG>LLexS+|VJ`rK06j 
z7O~S77uBAS(MLZxS%_ckn~-Mfp08lJ3b20TnA4;M97A7vm0v6U1AN*7%1=U~s zfJ&^NAsyYplEVdPp#Y*FLCB5Pn?H-`TH2AS-TfW$H8EvoLqZlL!!{|u|2Zo)I=&vk zAnU|S`@03^@YY2nTOJb)&KTz1j2#3m4L74Ku3=pj#XXtV!L9W5k^?MA#bWH1+|&yO zB(9e`vDe@u7sS#yi6fN;4KG`ZPj`pS%4`&&)0WuiHUX3*yo$JkYLpR8ZSLn34`>s& zkpba#QEEDGo0i|`O_3Y+TRM)d?@PmwjNbBy&I}^{`-=4rc}*4`RiD*p*og(>r4#RJ zMR*JT2JL*d$;+o883WQtrsFiE!G4@mp8 zX*Xk^7v`g2a{X&R(yz-X>+~Cz;`m7W`E93drk0 zbmUcJ5xU~L?(M%yLK4v1VOw<@?8u>)KQYuyk8iX|&kFC@W++oki1B6Td< z-fng9I7lJIpDkm+T|AKgb3v&xJD7_w8oxE-EKe$@uFbe54UD2F3fY|um|$~jYTs-j zK#CpiINycwP*@9)6=fU|w9(=p_r%T~f6cYvZzWcA@{YnwF>*Q>iFi{%sMcwr!^F^^ zcGiwXwtAt{Y~~40nKjLH%xtoASOV1LaIT2V;@{)wJ=tZp0Rd!M-^mNShVxf1c%tPp z=EaR&`Rs18?sf#^S|01!RjByHS0^huFM>{4NQYfxP`rXoTSi@GAuY=iC`nBu7 z^b12agU6qMf3#g-i%?sQqeR#n@ZlZti@Vqx3466_8v)Tt8kxsD%txtTEh5<))6@Br2ugQeO z0zO@$vG9nX?0dO1&g{H&#gvcGdegs~rIGoD&>o;QzYo1IW(!9ky92~7mR{hfu(bpq zZBcYWzyfIvcMy30{?~FhR-dr1+b4iYT*;NaFu^B8EyRfiQ@H zk6xw;DG-vANUU0ZgA8J(>3f{K0fOux2k}}*aP1!IF{%@r$NyPvgz#nGH&O)CY%4-- zI71d{TsBw&R0cd=66W6nUR`C4q=K=qs}uXA0Y8f6NiXfLWjP2 zc(up$5A{Z^*H%9Fs9q<}M@f2cut$h}{d%3j=j?o-X$) z^ilB6^9#lwPxFNsq*JY+%u}d-;7q+lM3BkXlI>9Sa4gH9UQzuh-n&JmZqDNf@Nf8< z1Q^Q`ysbfC#aY z%Ns0M&1)E?nrUh&^_JPDNC4K?$|A=G*6yS^zZ7s7*fT%K!3u945|79+4O$9YibVwu z7$J5Xl?7T(Z#48KY7P!<2AKw&#{CN@X^bAhalNc>2JH;)6{_s91Kr>-j7k5&0Rc*I>WHBmvQJ?r+E2bFOMRuGm$q}C(+Vxg7iV@U5fUdkBc zEmfH8@%<^o?2!phV@!fYnG*JC@MiWcVCtDS2#b-0RvUUd@BNMj2q6vk)B;Wt<<+bMl>TYHhK5qvV^yh zXqB>lymErWY&zjCl8)=pVZm0;G=%ERf&No5eNdcv<4{ zqula1&_OzveejaFY|Vu~Ei(Jv4`wP-eRQ`G1 z#8ln4H!JP1^Lkd}Ug= z(ZTb8&46YIQwY#v^{QPsDz@}xZB6n;Xum6}WY9Z# z_7Hi8Ujj?{fRX7PRO;e^Dr*7KSR(0_0)E7}tX5PBJkuK++|poURV?NjwGPhp6k$yHC0%UsZ|j}80Ga|jBO++uWXaqAvJ;hpFanCkDaFCcw`Q8)DLGF zP8lC|Rul7^_0c2gZIT^fcSbFj?#)3%;`elSNndFUxRdqeLv%UK!#6sAe0uQSv4e(Qfp#tZ2KBKVKuv9x^g`Lw@II z1ukndEktS5Vh<>sqoAcxdeqwjZ&rF24A7lD6OM<9B}v(JXe!;<3+jfD4w z*Az#K^BI=Hw*tAYptO|3Kx_9YY~X$$R~R2pf_2z>&dQ?|%R+h6RQ*T;Rx4a$?=ULr z!P1)Q@E`2c!D~#B>r5_=Of9h9=%#fKlT7M|Py#x(J2Z+x2+CV0y{(%*Gfs2y+tOdA 
zW0%mu(?6E3ZE!$mXS2hl>T3@qvnYy)gyVv#(q2SbO6HYY;E0A-@?_`&6`MsPsDin! z$jZZxIR}2V@yZ{sWh%oI&}`y*yMj^rFBPqsSdMJL%G;#=_>A2iOKngpzf;Ys*@IJm z^JB3B`O;ec8xA4Yp@YYs`Is5(tD@Tx7F^!p(x_c7P9Fy*jHxwepG(TE?M%F5SO_tx z0=I)tuP0&enIFv8)h|@2;33;TQbYM^x?0JjW2#$x$X8px+8|HF6&OO-xYd(N*9A;r z3nfoWo7b<uXfYOuK!nGP&ITKrq4dbY2FZ0|rrANs*j$MJEn) z`zpuumVKetak`@bV|9|N!^^Dy4RGj6LxgtN)6$TeoSOR+l)OWJi@L~bc9Asjb9(b% zpy}xxBi6Cv$^UuC^|kBI*X*lNTLAlL>;C;#;2h9Y$ZuK;^WVbzVZ_z>c*Nro+C&Sw z%{-ql9f2JEorzhvL(pa;d)VS|V5Bq&S>2lIR1D~-HXgSmgcK8QXFUa7ah%QU%hn4{ zflkc@jU8J2OkY_zv4k2!Z(*7M2$)-?etUSO^8u%-c%yFQC8F#J_5MX6tg9qIj3=()=;%q8$X_;{@cW~u;Huy9vkRljwgKKp{kKSp-BOrrKd8u)fK>r@gXJ0@#VRhAjy`x z9N&q|3nx?>>$GNKA!@p;xAU-MX2xRB7ED6{%H*cgT~w|~hm{U8>on$>4)PLPOBBMF z?G+9-kHW?TpP`A!k!2aW*adzT`NL!ddi{p9@ze)$Pv+CY)Rk9#RUghv(6OD?b>b%rL>*^EG3Vd$uS8@RGEh`o@Ebl-X|-5EDdrsalx zpNpBRk)>s^qmilh4d3ksy;s@>YwgVCLb3W-Es~N)kaN~iE~rZIrL&cTR@e}+AVzaP zJV#UHH&RFJMf`n(D;=gvf=$sM;B=qDq zBV3|?Sd&rXV2ErFxBB1P73@zJKg4Q z!RXFI0Ols`B2tYTM9D z0eK~8442C1QMCQ)a#7f5&|}9&31wuh<&u18XVPE1t%jHU+Jb^N^YI@yMD<`hWu= z<)*)V9Be3)HW$Z=`vAZsVa%yUq91_Sl&jo8$IX%>F~-2E+5}A9y`z?5%+^{=%ZMCW z3k)#In4%TO2>dA4trGG9^6V!~O?zHfa6m)CWS11&2M-#h(8&A2w#rb3 z?kle+^&3`DRi@mk9Gcc)Sa(Y%=ZyZI%qUpqFpt-K8>fh_eJF4$`yWKQ^_e218a7#g zjti!+V5`4mf3uqzz&`rLV1KgUx{t{>_X+d^akkhX{FUmn0hjC;ML}~-$83wQ14=hn z3)oba6H?<56PMrV&Yf+WZ_URp&;DLK0VX) z))`8AV$j;^q9X8k1hht1cH;Ehwe@1W!GPv14dQ5#I{dU^;E`=N6%ie&MdNMXk-)l-Hiz58Ls-*JY~CO*|Fb|+VtVahuGiTViIRyZ zkBYQM9CiOMfm`buIC-`Z`XV!Lqthgn?5#l4=2duFBZLT@ai2Q%1L&*zF`E`hFgXi$ zm@x?7OnNV_{cIW8f?&;2m|W8={}|H@=lu-V_I*=e(cel@UhnkYCC^kGnJnCjLJ~`N z4WJPM13Wvb%|FyloKaknLolWWDW#ndxwdeKf{e>=^k238Boy-E08SX5@l^BiaGs+M z%jb6gk0`1`tAP6hzRn@b`oqI@8>@x7um!mRd@h`#Co6>X!qpck^$3%6HLR?XLHQc% zXd+i2x5&Y8!9}^tcx$ez41`De;Zd{O&?~{cbI_)D-=N5*Tnh+J9IW_M8Y^{MEJ!m1 zK^fd47`QI1CzOK_uuC_a#X5m7UflAxccHvq%?^CbOO0>zA}Gp)A?OyisNLvru?Gq@ z-={T0@g+2G1;%!uI4vfdmV+kX$W*1UElqCm-A8co-fc+W@LyuQbudrA5g{cIeH3{Q zGT*=5oIVrCB5+8(ELAnDK0ugwn)(EwxRDJo{%Bh|^r*Z9) zqb^NK=h{9ZCG6`|l+BzS@P&PS|H(ug-c&}nlR 
zkHU?6p!&8Z1-s^~8YEkBh2eetxrCb=)dzYZy?E@RHKbL*OOi4|ty0C%*7)7J^+5~Q zj=?Li6}0a}37BVYy8WmuRR@Ehi%h%0+k5<)tNjE+prhozDV)aJuO3dQ9Bouu_|(0M zm6Rxu$roO@3fs^lG&tb$2KP~I*rDr&z!iO=@jcr=UK~03?X1vzj0o9{aw1hn!OZ+G z06PnV$N|^**X{inf6$$#{_A3uNJUG>jwRv1t~BKo7qPjHit7ba^2`W4=O0yFa!pr% zJ?*15@t(}buO$2a*{P((msB1!)L)&)o_eT{bGIu`I-WT=Gs8olS>;rn_P7}`y2+rX z!q;k_-KW2`NCck(OLZZ&i<177u(J16w{-7@HRoq``WBVkK($$A5Q!d=x;Qu>Wl%T3 z6@^iLcuVG|x+)%y_I}eLarMqEM2`6LS8QLoQUnthYHoH9Oa=#*e&;QB5|8W`eNU$& zQyp%WhC>9-PYCrwyHRelpfLxCcRdOY5-loqEHl8E7~}3V#`f+vI~D2m6Qpx;Ik7qs z8Sz3%j3>j-(ws??^&s{g1E8rC682|C>^viS#+;JtyC#}iO1z+%Kv$g8FcJH(Pz%H> zqER+w2R}^lHo8=!>iR-upWBxT=os~Ro2^{2#$BQm5C9Rf9+jVQg?y-vZMC>k7Kg@7 z>Hq+%@9r+ox3xXN^@GijcX`8w*FIBatera)`A%gz_HCfy9tqUwV>o=dzSyaA?U=}d z4CX~@vUaYdBoh8A)YdowG;| zC27i-QA|eEpHfM&RhFadCdlQyTpAp72Xa>)F+Ik!9?>D6qFx2Qg1@@;(fkGV0E@&=6-iR@DG#2wEvJpXwEZClyD~m?Cy`=blGQ=fP zZKD2Pw%J)Ms%~MHO_F@g07-1f26)vg%gPp=ZNRZXP^Y7Mk^@>dh|j{nNb#pd&54^F zp5TaxTc`_KqFj}Wg7=bb{fqczg{DgBr1i;td;l%+d!JFx#%%X=sD;UzRws(7vFXRV-!efDzII(uIi+OARYuBnDI(U~Syo~ngHf0j%^6E=Q z2)dSjBP654)^QE=l835|;aecp+_4O58v{M5dmK=JEU2Tu1MC87+n+5w*a==K{e<4r z-o8qBKPX_;NvY+chTHgfLuEO&RciBQM;0n@IA9)<;*{$|{?6Frb(Xvym*+|?9w;A( zqiSBcn|Usr@yuObNkzu46&Y+~#-MQa zSzasq_b0~q5+CnyuR{S~64T=B$xzRg=X5{3iAmazRu=u9qGRnunM$*UU5{8Cv0 z?o2^h?dK?@lD+-#D0qg{vo`m0WPs~70HliIUqW{4|pWHTJYRLf#3EKGXS<5lU= zmTFU2MYwiMQ<*f@Uc6zirXzQFG6hrEpGRuzTzVz-DC*|`tQ0W9&h~l>xWq{G>Ut&} zlYFOy8|Y(x;FXk=CAqfAWeQhbjr>^Mggf=*`a-h3rjnl;GdJ266X#j30NkuN{I0cFPC zk8A>cAYs{PYC*-UoA4HMJT_mYn@HCLXrpEU@}N=kWpFyy@lk|Cr2iL@>U)#)CSZ z$}{=Njm^t{AMecJ#TG#0(FhomVl{&*yTeMFIo+-V)8)*D-HPl{X;hqHd<=1)K5^cK z6;9&Ge=YWIMu!!hUpNlsC;i~IVIrIbLBQHnHW?6~VqbR-#edI__XQrABf(i;!t6x+ zwvPdz6(MRe^sHqyNNgotCjxC*cum)mo-SEhu%uZ^TM!r0%o6t5tQ-aev>O1_j@NcolmqEqVLF zbtIIZ7$4YIcq$a%b)avPE>Xs`J1f~ZtxO}{8oEQj(5|M59v`V?Mk-v>>%e#xdG|Q= zRYn#o0{oFnrZPuq_B6C{JwqtH$(q-dQeJ}5nF|LGqUjU=nUYH#6l_QdlXxge0h8=ZY?r#bUJBY$M49Gn>HiTFFoH{ty%Q%!0ghlv5NS@YI z=z7~2F^{H3vvCa?G}(WnX|-{-yDqGEBE7G+wcFA{R9@ 
z?Wgi>AEUx=x0nFvYGqmcy}X&V1DQSPbS}fQK1`~_5#u9CxD>rWnYWjpaoOQ9X5Tbb zHWy7z;~TNUPIdi%k`GaUg{@0 zL3HA!&fO$h!}DpvSf2KGM%KPqVoe`?7$6^=jX*wTOaP8;CXC0|lR|@xA8`BWa1s^| zE;q5Zq^UT?t}%^sx^yBS9ti7UjD-~64=v+yy2UG#iOg*={e;~!$!U0b6F{oiuTu3L zqe+%U180I%6l~i4HJb4>H%*b+SCd)ar~(M*yo|u>G$NNTl3$Q%*%D7lcwkHgqSb)b zQ2@VSiH5_0qlSlwNs4O&I=&T>H1cK?Pxh_K8tqUeZ@R;t1sn-AmkvS4t)68nY%dy? z845YW<1JU-+;Pqf%z?UGoFitO)=842BRn=(NFr%HGCF5KDreZ*VPG0jmb@xnKdw#o zSI|mxE;AK6<%**!akv@@=eo$+vGRP`@Tocg5bDs$1o0i5e7IdN4me=I-yD9V-z)4ZzS;zrSxM;RgoX+mCK$;8=RJH{oaFrosLYPMWrA2> z-)fUgUC{{W*F}`S=R~lm6)QJ&hayyW$*q?h?bzS>C*DaCU@+5VcbME_o;XPXe{ud#~ET>`t36JTa21Vm-Yw`6f=dfh(9@~k^lQdg)VoPb<2QjZh>D3a&CXqTgPXd zI)VJUt{-4ACQc5Ju>c5btWhr_QDm++SWW7_1eW0L6g zhQ>s+`zcGoYd*dtZ!Z00pEGk6_+<@^D8~MxN7``-4k@`4H&=w19JoQc%QJOt+03** z*%LEz6wMktyN+wb2cv;2$V+2nRhc^J8-vcGWeE_#sD%Y2*Q?xdI$%7_@1cNaZA1iM z3JX`aTf(}Eb)>tvi>uo*ZaJHPm^Uw+45RYvE5sK#rI7kb&hU{G;r*yOdXddT&r22M`)G8eiWA z2uW!f=6)4SQD;GL<?Wt+cPAZz+;qv$ z16xordYG~F&jC1;#I|bRtHf|$nE4|T)#Q50JcC@pMzGo51U##bs}gHJ;r_to*sI;ay|t7i;}}%QVBZeQ5s)qBwY^uf#5Gc_Vq5K? zE6*SIt)7x4VB8y>g_}NpsdkbCP9Q|}yY)l|j-n{ZUn07CA7CtFI7?Ko;n7xw;93&< zrE8@ecABVS*KuEU?jhg=;@^t0pvbu#M~-YKC(1gONMEbtVb`~;@NbK`!U{>9tHWml zS*iAPI2HXME?-c-*m);p%&q8B*ho?RBe^}8`bm|d(K$e;pHbpFKBR|1st|YqcCWIJ zG_+8C(FGbb@6>uo(nTt#7idi`}#C?E|o4eXsX zwouo9Eix5XNt24Yja$fJtatD%=%xGs;rt~iqeE;oABbC}Vu7IF z$iBwE-w@wwsWc*HDp7! 
z_eJV*RrY6`)OHIXLrlf_REF2L3|L_7U_Oi{A2V>`NMlmQulELfTklHwX91n6`>|1^ z>48HpGSD(pyLby1pnsdwHXK@1vTKI!h}nRs>mUIcm2jEP{}3fe{$EzlIIw(FwlYqV z(F$YNz}i{GUn1w};f-FtX{`@hhmP1%6o{z^4y!*O6W}^Lwd|-wN*_$IdpVp4>wQda zOVwBmt)d6_H>|&S3tYc;zl42*=fvXKBi@wZRhxsX3eozgu^QV0TBcGiEw}1NoFAw}CENKA$Lk(q60TT~> zZ?SS4K(98Gzhy!{LPUanNWlf&2`mnC!NGy9>FPj6gDDl=A?tmp^7BH6berKWEnGX~ z#5BqB0&hU5VBvSVUAngcXSuXJd!fwb@yr#ZB#l7bxJ7H$vdNz1Rg*Y_jZ|S2?{{q+ zp5mZ`&of81qPM*5bo=q2VFq;wE#Zcb)ph4ECI?E{Pnwaq>I?1A*;SKY5YP$BL9=Lx zAOY(#udk->5{tb&IJxYGJ6IMT5BGxHnW;a=Re?k~^%1`d-i++n>!o~sC{rXa{&a$J zp`<0s19k(T^DlYOwD#{X5}+6d|IkYXz!|kykjs+Xo0A?f=3PrH*5xAze$y30Hy0;T zr)6@Tgy8;KVBxgv>^!rtHeKzhs3dK86KbSy^Kx0mC_+T)VkaEV<2 z+wv#$2g1Fn!^qz>2VAcp?}SMmczmLfSbQR+Hc>wlWZryP9KwZp<>c0w?`h6-SvL8M zDZt*e#?qiep%`fHdUD|wRG-;ort?ZhPSz)tQg@v2+EI-lNtqVr&iA@EM}{pEmbe|u zwe6+4PQSz`84KWe%6vorr@YoynD?>5ia%=;Y#PD*esewoJ19GLSgFkrmrOQorP93I zYhaFW%VHI0nD(q^BZY7Z97A04@!WQfR+PZqS}P&o1MC5RlGh{vDa7I`1;KpV%h4=X z7^7)2OO^)F5DGq;v{^%da!KvnMAujBgddnE1m8#@v)_ZmrN}#JtfxUeC#3s{)&H6m zNFB>-C}XPM@8b!kyu}u?fL5PwOeh z7lp(>6SsM7F6EHIc+2kot+AjAkzluWP27IYm4^xH^2jB#hoK-CHvD_Btsm}~oyHP_ zBsBDumT0xbrP?7NF|sCztRg*ZThS+Z<)!1kFW`-tr^#1l?oGxcN}H>o;1F_)9+Sgc zi~BX)-r7dS6^bQo$ijo3alolOPL6;*ZL6%NPcKO&kF zd_~ICI;~cxLjc3C933aI%yx*j4ieVY@*jGkwo(@wwiG3%Ww)#tST2`sEvpP?S+w*~ zJyc{jLf@kB{@aHj4?&ri_CS4nB?uzRU1!(sF;UkzJ3u`Im#h&_ns~b>EzX04qn#*> zCZxfl^}^+)E;n^oUc10mXXE6!XGqMsvCn!{&kQhBUJ6DHk*_9jBw3Tya93p7r;l+% zCtVgWL68i;(8P#fM%Ym|6UADnMCN54iLdLv>pi?ZX97`8+0)sR-ArkofBke%8b2CS z+t-7LbJT^f?e_X2&^qoxUYNk}HJFfeaP2(%#mxLT9I{!8$@4nzWeUa|+(@ak8=6%c zMDRvwAj*nBFP9pfh=0(szkDZge)W98##zZPY`$Pfv_v0BtHKK^YGm~q0!pW)*$!ls zXqr*hzwM}%g9a$N2%5bJA1(NQTvEG%R0-TSZv3`%OdoxKAWMEj{#Xq6kL2P^+gEh_ zO}$b@IQx7bzLUcWTJ{$E2sv+?;T(JprTVr@L6UjD=dB3D0uAB7$3xn50CA8#Ef)3F zMV)=as*iHC8e7Zt9dBqJn`X64;l|MTayu6T1dv?;9!?~m+ulfz7K{{Bs3Si(UC}YT z$12|$?e71n<9Wn0x(r?a=B+w|*0=Cf3g0HC-WsWtfT(w`&cV;+KxiC>E>hPaN~Xok zyxJPhQ`YhvZm+2a3y3U~Zq0!kUJ=DOEL0mk^E#_&-=yym+qsu9N1%Q+$! 
z<8fp$thx7wKA6~|X634PmPM&{EMx+8ih*8ktk3BxLfd;rgxkfI)BzGz(z|L++Lk&9 zT0S5L!q}z;L(&W~I?Q8A=N7I>af>V)ryKAMl{UfQFYMSbWtpp^VofZ7E!|`16`Ml-XwAVvu5*?_aL+b>Q?V| zt>|rO&S~!~k&dmYSRV-%1MiA0pCCP~_YTSqdE6UZTiG z*ldHYObJ zezH0NCYP7dp`}7-aUTS^^E}UTB%zoxP6R&;EFj|UJ77mKb%Xah#!9B-&-SKdO4H#y zJS(6sWW*l;(V;Qjx-*S44CgJtU{jcvraj$3PW(wu)*7F9T)C>|kjm&TqrN+F82vwX zSs!XA?lUMB;Ehh0lOlib^5TS_(uU^vJT|>0qxwBnKCx$+1PxaO5_YUe)Fd{8lV3<; zC>1$lh6V%e$H(vo5GI^O#8}z+D&*1M_pct06oe}&Rn7vKA=6%S0B;*Lu}TD?WNCn$fKstI>TV5Znv1#Tj%nD?@}Rz*8y=jGq)KuWBa8z*O@Pf1HNsVgu{IGz z&PM1Q7&D+yw_kpNQ%zei2v<14XO3w7&|D`aSUPPP!x(?oc^xF|b9%C)mU36o?yT*h zyy&V|uiq8=19)VN3b05g3rlDbC}AR@>WoCk4IN5t9W?Y44PdlB$3lEBj-b;Km_fen zW`xp^US$Nsa-f@$NH?PffC&2HoVkx2rAK6Fb^+z!M56*BGY}-dU_197`ilWtn- z5+ovrh0)Wm^;6EyE0+>P)Aax%4aV9SQRNIEO>uc%+8W&vzI-%Z1evzaUc7H0s7IgPcgE2~mLQ$swwR{tpW!EgQ-N)-IS^h-iCqA4@S|;f0#E zws6Hl(kYAVUPztgOa!OSV2>)vn>A1eib$0-E%USXd#c02Kj&<3^z_1vcN3Kg|CKRk z5#8!F>9TNE>=rh4J6)iA9{*qhH!rf25ODY_`z*}c@2+x8q+xImTE4aFf}v1icVUmW zC1esxJxQel9mMxqUF&BYZ z@dVhMyS@4k`{<6@8iSFQet{d*4l~AUz+HK9Y;BR&Bg{W^awUwXAbOZTL28P<_mszi zt~EY*_VpQ6p_X~U+}0Q&JT{^N@sQ>#p19PP+pOvfm7rg7sMRC>-Xgx-H$x5+WD=5L ze}!a96#9&E)A>f30{{t>iiStS%X=DREFI_4QPfl(Dm}V!EAB_m;}Iqe&&ba89{O1SwAJ zZ>+EvHe7KY6z7N{?C^NTBN$$TF;q)Q7!eBnLUF0jPROQ?P}|=k(Uw@-84VX!$WF3_ ztnmSDptrrEt{(k_j8Hx~j>8riKxh$V)~zS_f4#~`wO@J|eTtQ$$_%gv*3fppROO`N zGftH|cFsiM8cjKdk!7|m=th%Q+gS*QVzW6m`CS6K@ic1Ls5PQ=DyMdyRlQ?$CQ;Wd z8r!yQ+qP}ziEXE2I~_Y6+qP{R9oxEn?)#m4zqfwXu07UXs;aST)LL`GlDIz8>1+55 z`UX|Ed;C`2o=QwI2HkJ^?Y>8pW=WhM<*H3R?Z#ej8;F~^Z?E-%Z5`q%m#8XpBtwSK z#r(C!?~gYQ8!6+Vn}RI8wf8+gKO_f>F0qlkFu&lm#uoxR_Sp{FlzeDMhARjJOpgne ze^-z6ui%7n1_+?B&fs=dSGqjXV{cPAFXmb;`oTT5f(YaBDCZ-j@0;m401U(Hni zNJ5iUGM43;PN-%J^rlMYV(c|dAz2=1n_KJE!ym4Gf)3g0qGEaIE0DNB?#Do(2@RD6 zH9(wMK>nF5aVqVOtl8L(*Q~2bo-OHVn|j;ad=@ek&=|64dVBOlRwfgy4GWuZ=gY=X zA`188%_QdVU961{-_nVz$bNfF!Xmxi)FbbKNjld^1h!`EPTOArOE852PH3#^E1ULl zEvUd^FAKV(`e&zH1{3`WnLsjq_JT*%r~~-Cw8HUkzJnX0!Nl+C%*tX@P(a%ZgR(~g 
zwPV8?F}UDW4&Li>CAScXV#-DWhz*t9KRHgB!Fz*AgrEmGlhBuWJpZ6+#&@JUfugBsv9P ze@bt$V>?&v9uYsrB4noqEV6;6iM5SVwYnTIO5c>TBC!sW=N>S+S6>ZhHQ-jIAJ`7- z_#$0j1JnQ)(K@8PC%R3&f8E(jMH=iFK0@1mV1zMR7hU_#xx!F>?eA=y zGN*y=5+q_a%G-q3=DhG`J{0p3wb$%c?_AfHA0UNBl7$J;`WHJ)eHumZ<4+X3J^#Ad zSogJiXovX|k-UsH$;SRerQ{JeO&Ni~{T$Y-IY3|R*{X{EOTViftMM7uacTR_K=>9odE&=sy zZyx&SBU__gJ?`NMwTs(pud%0J+G|iFoM#sKc8oCX50k|9B;exQ;bg$d`gQDikM2F7 zb=s_+aVgH0Dg~Yn+$}kf@z7~4evYpw<)znX{I;g@}#Su|j`H<{O`T*Y}P%`N7 z^&1LQ$w!_;E9tA)WkUPRSkQGvzp$~Z2{L}tJw-B6g}Et%kvCMSR`|B^4 z>PHR-);>)F}Y+idIX0|q9=zJ~n1 zX>}b5-3rSbXd*FxzuprgD1hE6*$w7sA({G$i8>6$jmI+;5>K07>sm!?<*$dtl(4Z5 zbg;&uh>I43da)VJ&JU3UgRg$wWK{PS=wibCk=%0ba_(ASxcbdVP? z9lDid+P#|T=nSCa^OZaILu0mLT>4AUTud?2p1!uvOk=Kw7#UESOCq&CQHAayNyl4qo#g8U0~>Iv}X)6M$ybIJ962Cjc02?2el7x~kekmK=_9Q3K;+lOX66fFuz zbyA;hG`}o+r8aw~BfALUiYhLxa{bEamwsPMdm$|*S7M4@9X(p>A2|j`iSwr$)OTv^ z9bIJpTo$Hg7Ssh60*GOESP?9tW3_4131=vS?wU0%&C23)Yj5P;1o+7Aaaw87rH)De z;NQ$&^TYJ=A>CiY7Z8>Pl6++ddhVX2g~l8MF0y*q`ta>#dJH*uiespacui;-h?;lnE8%XM1@RKg;)p`%6Y%3?(J+Z= zGi5q1ky6<+UodHYb$N8AAzbvR*_UKos6v-aU2) zF9?fRgx#fL$0Uizz{H|ykW)r#J}5NUrM{r~YTXFS8>@`Lk?}%fbTb74eD&5aJ#LpkLUO~pJ4oFe{CX4xH76})N&XP* zK&nEPnzJ4JuIv-W)a%oCxB@yl?Ct+J)SKDY8L^O=;OP-V51OIJfBx>DY}=A%(823F zR&eVx5G>$3ua#zg{;S03v_?S^yS`FEpa5qo>*y_o_wf_BX0jvSshWPbo-`>F20F{{ zieS*U`7Wfbk)G>hWdx&{8PfTT^9UAmdX$JeuK6k{f{BBFZ<)K}UD(5pgC&#Jw#Qpm zDHAF2$LZsGL?pMH6>EIaKCRb^8G%8`rCG|zv*9ZHh1hY76f<7gNm$8On+N*QnyRz? 
zB=WWxT|cTEU%YrTbR=~yH`Kc-OVQH8e8j?hpA`?;MADZEACI#+KlOph6?>fj#x)`< z=xWe8<%gZUTCUlg!%{r0 zU%jrzYaG60Z58BDrHqt|IVQz-S-`Kjb%9Rv1h(Re2xF zaSV1*D?F4Ce0eRU8Ki!k896Xi+j@5SlBJga^cr#?)fYGqG$_3~tEjCb+?QuQagyWt z6@(Sp00|(r#lLTBIUjw37Crq#(TB$*dWK!iZvP79P2-j&!@C!0dYzO_ve7(^804_A zuRr&(7~3C^>0==Ii-=C38U@8kIO_!x9(|C1A>*x6ildFB*ZCt$XbE=8OgkptZ=mvl z(mG`;Sf1xv&06+ha`ayDqc;({N+QgFdP=&8bX5Kam^MxN+Yy8+*N{;a4{U!Y$9P#B zh@2Y;!t)En?i&mgi*QG_0y9w&!{Y$Hs8HD^ zCS=9b?vK+jG(n4ko3yId=heD$8IF(xTQXGSbZWL&7@fduV)W0-7$NV2f>~YFW+p~Y z7U}0HE{d~k-+o%jIu!X)6uu)3zgk~n^lutH% zqFtBi-hrfASg%YatHkN~jj}&2p7Hpv_@@@FHOS*fdlt$2yy+hjA{aWO=q*xO<-xGF z^);EH34+t-2N%B)9~u&fJ%NP=F~f{+B2%-&t$>#cN<;R(G%i`Sy0hL=%W|hk0n0+qx(Nzqx>o6 zwXbIgo%c#P|FG1CzYvsj1e*AS~~^R*!z@EwG{Egkp>UkGR~mP2xo(1~ zkJ#oUo335sv1Ur#yi5+IqYo@+EJwyFiBkdF&rpdCSYJ>O$p&m#=ufg~NzUsClRuO6 zyG~o*MYVoY450)n10Gx35>*#XFiuDh+2@^=Ek@Nf;F-DQ2_%e67)_KOv`Vu9L)`cr zXx*8`T~Uc?io{5zIH9HFf=|w+cShe<_)cGDRuX+@3qpyd@wG~sysRu7=Kzq|yfCnV z?TydM?wOp+gag=m*f^0MfP^yTIBkJifv^ zwFYJ>s70olcJIkPys+isb!XNAACv*_iG2WJ4iC5bOPi1VO-~XL>@N^*Yw{^2!3&L! 
z***6dmB?3VB|&UI`1R~y1Eu`jN^?Di!RfLf_{dE~&6i9PP6{8ikEO!mN*ZPIhncBX zTiUFburnlghm?4mxxNv%FtmH-9P>!din%Snly% zqwIC?(tABWf^`QPib0wYYfSoomGV`Z#tIg36aRVw?BE)%l0uJy_@&NN@UBXW(=McH z{=x7Wh+mPyLCZ_w>KRJAKM`_p`v|9x&9kY|dpCyP86$5!RYX^=!lj+k6_E)9dA)bj5e+CkZkzMk<^ zPcZ#%#_ZJu8(ae~Zs%03;+XWtWpVUe*WJj32g%$|*EXtwpohpjysdU4w)K@fmp{<-g_dy>>VEz_R#a&e-b`r%cKqM10z;9y~YYD>* zT<=y;BgNtt;TiKx*$$NRO|QoKU~HX2`slA8wGqA4Xtm352h^X06A3A0ghR+lf3f9j zS<3zTMNwLqG+ALl8LQK-zA1OhruurEL#V&U7u6VsOyf_kTMToH`Imtt8)Vy81hpQR|vqc|F zg3G13zSRMi%3}g-DUWD8DG37tlTV<}W$kLQZ@K$sS|JFwi9*WZU;0`N8%lZ=BT_#@ z*9Irrw#~~MbM+z`gQXui0TJYGvfx%7SX2i#Nb1@*KG|>FYGZvv*cy}kScg>y&y=xt zbOGa-Xuiu+bxeZDd6)B`M1$dnHI1+t%G68dZq>dXvdO_HzFair*g)fb2l77WwK6t- z&pSA?94*p`LKr8e#~7q=p75F!%1jlhobTFc%DK>U)=5{59f+hadK-`4&+(bkt#G2c(cZ}@^ULDPthz_)RfTT| zuaYsv(cv;UQ``S0$cPi53>w@?r7e4(nw1Y0W=h2#5D`yl8*!sA*!S&H4jD&c-5zIg z!cT^_|JGo&vkt{nH^Lo1ntnV-(@8jfyj2O|%L@=jIRdz6Ry5bN1BXu;bhdk=f8IymkxaGyO`hnlhW>>u_FN!a z+4L%jfwn7-TaH|SE|?AE_hM*bkmxQ3V#MZc(msXttw;qB)1A@04oD3?mMG3D7f&-k zB!^c-zBFCn_=PCOaS=wD1wwo4-yx_R)76)eAO7JZr1_Vi!8_T^%Z?Wzj`U?J_bv9s z5xrZAMQ|n<<)EHkL-0w%lh8^jZ>A{>ROtm>CY!9EjQmOSJi`8h{i_q{dGc!k?W5x) zezo@rLMQF+z*BZGwN$>;p75Gt0R9Ap%@g>L=agRr*Z0)bUDw1$RCcsHC59K21r_zp z$9=z4AjUUAd#FCzIi5rF^;V7i<8JxyBUWT{$ScEsCaMbh@WL5(c#n_Y!N>TU$Inz>B5egJa5ZpcNT`orVtIk!(Pj3hn zBMSlf7H%izO&!61PE!jMRc530O)$V zXg8&PTO*IZ!S7oYuO_B@7Dx(;KXVxjYZl_Ye0bp;lQy8|RqTwD7zb85^BYt7?#I~k zYLeT2ZuMkTtvp8aHQ`Zp`E##X;FiAMoWb_d#5CI{DDVC=+}g+=Z?HnN^pzj(0^K); zmMu5%HViWXb+dY>5{x+GnSRNgH`K%NR&Z~Yuvy^ekHF^|uhjZc7zwm@IaFLR@uA6o zYe0W5>-22HDW4lfO<|yTbcVy&r}L2?IoY1u60whoiS7O!2S+X4_JpIdE+<7E>I~B0 zk*qg^GE{#^=jKo{lX2BWpS>gtm~Q+iM8RT{_;WikoI4wWgf9nsXlE&M18r)eFh#^` zWiBPm0jCtIYe`tt>slm1^4a$W9XpR8T5zExw$z}0fbXpSm&Vr^ri^PGYlV0Rbf8|{ zHe-qlc$J16+pEjysj2>-Fe&M)y_~bg@VKRv6n0nCp0MH^D$6!2AIwGrtM|xj0V)!f zL_MS5hccM2aITv#fQUAc%DD+E3G2a_7DE1%dFGCJMz2T&b060II-2Luey}VAK|0X< zU;I9|#Q`g@Cg1%4fDZ^r_gARtYW}%F7U3yt+ms@!7=f;&v-Wwz&(ve;+SxS~ggLRY z7?U9i((&#|=ZL6X;}3nw`h^7#%Yw%Dn??yj_OeE_QmS1o&{w#Y-y_O_9B!*?v&kVT 
zT9PkyK#9WW9>@;t5T|XHUW;4BV*8e*PVx2Q#|$n29*-wGi8PN+3bojoN%azeUUVvb z@fsb?IPMcRJ5Q&Z2`?(O_y&x+1*c%Z@=9uG{rAFb_`QqWTyHLmMQ6?tTm+0EAWz&+ zEg(L`YYxkmMpE~$>?V(IuW?!>oNLEhf|+F>^pPFu?jslo+m0acr}8iji4&7d-nU+o zMmM>Nu{He!j)Lwag_P2mQ-0df5g=;$PW~LNVO?xQtgevwN&k7NLW?JMmM$IEoKt}_ieYqQUu>)k{jL%kUMO>@k zVVVS(~^%~NSnGr{#~Wp_Te zgX+@n4z!v{Nu#u%kWY$KUv~}r7Uzj`2~#5WFFtrPhHqW;E{;bbM{yDL=N)2zbp4am zK-u&2b9){l7sAP2t73Hr&4>zCFC_G5J}GalfdJege}}lupKKNO(>rB1y(NePY|e+_ zb`wh)^n71i)ZgCbDW0-=ILpGKi%I}^aHk)8YP%MPv`_vH)*e!Yu}z$(gcjt+H~PWE z$gXlwG=E{HfWSH6oI2_yx#Wh>0R66&%b|KWHEZf_$$*E)EA&a_wdq!L|Gp_L94 zTuM?!y>;I)9vqys#2>rMrw=6Ib2S&@it^ui%Aw-xoN<8lT2tsgd$zGQixcIPx;2Qk zfX-Z5r-kfHqp&3cGhFa@S}TB*U8C}YRC0I4fymXe#Z#;^3X3oPhC_2x>)se%1$r@k z+n8_XfoTUkJ=L2Di765eqv5Y0^d0_9H25`zAM|pHw@6}o5C?1>HYW6{SA#VAakZSIV2)j87cW)3#Ok_q zvZ@Gh*BIA=>(+Wf914c1z>R~ube+8>>gJu+(+G@L-YrmJv~YkDu>%bdrUZ-&5knyk z=on5~+}&seUYa2#`2~W2ZOu0XtbXW)v=N!5Qnpz?SvPRPIRl6jctTlsq#O;q#I1bA zJ(f6(1zx3pTlP_)DWe&7$h zYc;*h*aaZD(bWc5-sNNHfT=XOAA0!6!{$KWgbU*#OA-x@SC;L7V6{3ZRK z-u*KMETV<6ZQ`jtMh^<3n<)u(A?KK#0-`e#@R1B#}qe$nfFI})zHsU$H7Pn#kW*1Ypo1rY_E zR(s=SBChwbC|0iOw`JZ+yf8Mc6a?#Z&i+8*>dhDK;05khkM~92#{oZH+ylq|RE4y@ zSK5NEzJ<6wkxJ_mCi&u_iD&&`w~_jtijvxi@r2@;!j^_vfNOy|$O70o^4(x>_9%Y+ z8Wd!ZEKf4rf#K%;F|Rbk-|+m?R){J12sG*`k69eAECbPS&`6jVwrXG&WM+rY3SETg z#}UL24Y5DBrr(rc7&lL`7qVFQq7*efsW(O|Y{ycs94|xYru3nt(XoR;W2|OolC6FM zMX?BY$|{(s60EN@qLlxvMxeb}M$J3>RS#k)81D+ilK-8%g#;Uhw0D0eE57XW=j7*` z56Xg`0mg^`WsUQL*i=A^A3XYovOt$keoJ6d7+yfChgYC|jyX8SI-s~tD|(I+M|0nY za`l+d6kZ}AWoFT36ke)X*^PAob<6~^53;^zVt^!SbmvDL)6WxPkVum;Ue?a9Y?McTdo-3AlFLEU64E0G@}~ah|51hO#Tddf1&Fy<+V8N z9pW%}JsXb(nm$$~=(HF)I;`iRCGx)hL26)T&vNJfy0gjGYSJU|;XAI^CGt~KS))uWj76RRwHRDYG3Am_gZ(0)`{nbt#;zz?jy zZb2#I^E!d@(o%`%a$$;QTLqN)$8Ql?&%TE6{?|c62$8UUQS z^7SGm{}>KF4V)j0#~M5-D$7RlD~@{!lP8ST3TUpflNt*a=- zE4htrsw->?4}f|>SemFiGWPFE^#=6bv5X^16h9hpECljTo~QPTkfm+J1z2?4eUxv5 zpvVG8o+e&B3X=Fy6$pQLpC|H{?v+>ERMPx(r^V~%%?(7tFyZd(|Pw=Ix}9IHO^=rYyN;qm!h3q zMEnhl;X@jLjbc8dM)o?4KGkdsZz7u(kU4B#U6miMtQV;L;c)p1WXemhfOkm-)w{VT 
zN+`>#O#oXIYobL&h@SqB(R4>V%Je+eH{bAI#uSee@1^i|CNr8(?pZHTx`gSC3+@pF zk)Bz{4MWFnn2(gauJJSZa~(~AC(^KzFr(BaCT8_DKceya+yFx6pfiV^MI6lQs7G%H z$%N!quTr=}daE;I>me;r=QqX$hteS^K3so7wq!0*R_2dGTnw5$8Sbr+ol;lDZTo;p zR`!#ESt0mY$(wZ->%!P9$$SL_NjxO6X_kI0#nOmMk4!<)kaVJj_D7{&~s}J zTNs-(jjiJeh)&dK*CvY5+K@G4 z*H6I^2x}KX^eK{*g=C{0t&MlN^VDr&>hACTS2m#LA!D`~{2wh!d?;@mPLloF#G4i} zxV=NN^BF3BI9hx^48z*LRt(W~)*!&FiXGfRGfDuyKE^r*K1KlN}s=toHn4NOhQG!weVK2&RDjkL9x18Ek{LaM0##@;faOAhC zErKraW7D+;$@^D{R?&#Nlx|SVs;&ovIuRuvx6_T*X~){^Q^D@<>Zu*Pkb_o z9%k}9@Ht3w0+vgeK=ld`p@(LA!DaRHJB2VOPAFdh%}95%1mm%J6t}C{h>z&jf-C9 z@g8?7M(^~>xy(`pHGNcm-*lu-xQDXS*&(T`kJ5De6S+(t5Z8E^mQ=?76;-{}o%+}C zYLSO8XOx13*MGw|+F=HgB+Btv{#tcT;-;oyuOwXv)v)6%DB|fWS0yT92AY$x+bf(T z1p&P?KXm;-g&D1{S(fb9uXZit1I;n(+=pBf{C-I-h~h!P(f++h7wwqaeJ^c9b`{St z?v`J)SSf8JP}^5DdRyc2&AST0>h$s!7DcS6*$Y(0s;8L=rY&C*J7QZMAJUjod& ze;3EIIxW9uRJFtkB}I1xc^Nz<36N)0$RV(FGLb5s2UnTlhO&-z!Y~|%ZR$f>24Z(K`S@EF7`8!qClX$E96ZP4LKX>AH*saZ zuR6FFBYzG!_g=3IXDzNf15J*;yJ=?0N0pQd>2f$u#_`=qaB|y(L{9`A?Jj2S3qPl{ zt;GyL@g?LLfQXuaP=j=u*^p=@fb=d{sGzeerJB?~=?(Y3H!#(5+3sjoVvgV}6yLDN zZ7n6&`CT*fHd;4Sc=+u9+TYN?nAxRyd)K5TLZqjO$^K^|JbR6^nhBc&pNgGkuJLuN z1sa_c@Jxpl`d5ZZ0k}%&Qc%r1m9*0OiCmMtDJh6OajU3wN$&@-WbXIT{CZ*?!!v1$ z|F5h8gxixK_wdTiTJpV`iigv)DsW{JwPN>=v8n-KC;o%Xv38Gv8#!L)0pWgq8d@Rf z3Dq1S1;|8r4*p;SWwV0@m&2lsr=9)`Bs^lBAtV%U&FJ*vDCz+Gh{xeAp5XSZl$$E` z{TyZBWZ_Boo2$YsK#HGju`O8-Wk_XJ@+5SKe7d57BSj=t@)pJg77xV3ha)jNNG)6y zA%E5Tj_+5i9%RBlghv{SGDfS9x9SB6yHq9glTGTM`7AMU!@~6pRynF8yk(8kKqGf* z%SOq?TH~szc6R>$ViWzFY_uT3?W{2v#f3yjt7zjSg{(lMm>C$5pdf~gXX=(5Bc*;I zP!4b@Xj<3U_#fSF(1SeVv--<|Y+DJICg6jJ7>WBdW9PRf{ zmIuo@YoA_7n72Usbgj?HEgClEI$0@&FUVxe^yUqQ?1VLfnzP4s*@7{L&-3TZ%zcf& zBPBQbtY?9{^Z=kp4nAFHv@n()QC|q|jYZy^yDl;Ot6_x8CB2}YyCkvDksTpui+r^&Cmuade344L%{sWP4ZbZ zk2A~<=G!EICrPn=XNh#az*vDU)AxorHnl7yQDGXAXf|M^nbSuC%AAYjMauu!MmF7L z2gRkO@vc%3&9cGA&;YfXTR-B_$beObjI7b2KY45G`JzdfQ<9J;pBoi^=nlDpe`vNo z?)k^;*MtIG$mfTh#?u)w3o+;}ETVcx*+(wsj!0~`1YN~NR&f*?vM$@_T)(fTIHcyzmm>1Jc4YL zRt9QD9;@6&eZ;R1VGzAPux-r<lou? 
zWZQ@?PLw^KKX7nUUsLxkh*K3JPWxblfq(4z%FFg~jw-gfyNlBvsCp-9W|tvXb;Ds| zFilm^N=Q5qmQN>5xI7kabu)+`ojrHM9Ni5`Cna-a=+d7wYkV8MSrr%jdNKMCs#-e( zueOym*XN;JtcyS0zypN04clA@be8k+fhDd25mDlwg~ zQMtNCas8!EP_&gCC|ZmovqFV(<*}Rjw*sbR?bE(n)`R@w2oCyhV+l^z4s4K#=DfIQ z?CVYlwqX^Y=`_T}*7#tm-P9Waw5LaFU%9RxSUizrbd|UiAkMvX8Xk>b0ZPi`x+%ba zX=dHUB6TU32&EK!>8+&xq@(J?PZUCw0GAbt{#4;v4WlLjJ1y~rCR~K*X$PpJNH_xP z@XfvvrrNQ4&ZPXD8;jh1PJy--u{i_Z0W)AESev^*|x2ChX{)hyETAs+3Y1-L`tij>g|sQ6Tl9TBjr` zO@)L#10lDPEe9_x+zd`Mz=~ z1fSz^Kv}82R;06w4E|$pf+B+iyNUzcyZ$`NrLo0mOqh_o(llm!gi16vTkE*UANqDP zTuqkAI=?miT3?8MAMDRq9E-g))uoq1ds$;j(FqCKz9*xEI}KnQJuZfTqW6hM}KxzS5V%E zZIqP?gayEF)$X3^pE$}*4j)^Z#E(on^uxV0%QGD62k!XB7swSR8-2eoY2_Z-@{4Ql zHtfSO(K5x;wX8b1A2t7Y+nH3ixnyp%eC6|aC99KbubVi}4nFB`A?b|{O{@5)l;_1b zT={@X)lLiPV>dpo?x^KdFGbsMhcE+?Eb0zhe#*BtQ+w|{T3wDqsjJfahi8sJ+Xv7_ zU~%erpsge!CPR>`+<1XsKBOVT@Y!UnKK+VmHiZ^h^oM&CGN}IfO(M?su4|J){4yAX z4{D){Pm8)bl_3_wN%rzw96RWrDT-yvoyiL*fw8T0BFdND zoPTY+iv;MdlgnY&hzlw0POfyh_QrKMZktz4<)a%^Fib0>p}$LVKEmoMADqBKtLBkcu4Du@?KqVx60*)xHB2RGJFi(q=e4Fm#7k zoDC`Gvq@2{Uv^864mUQeJ+RI*+8t|V`F zQAjNtA+wNJ`!)%cyS*{IBC66{>cb0#rtQ{4mXpuJb&>~Zg7<31-khHO94c?d6N*`t z4-LbMm0Aqd!aF3eVMkx>Yt)MHLxxGE|Czp0La38jm1q$2C%!uKT6e2L>pkV1F(@r+ z0&y2(>vj$OlLuBRhZLTobrnYMMBE8^)Q_?l1POf%IsT`M4d4BU;qYx`zR(6mWLY|t z&8PY6If@Ia$3Evm7#}EeIx000(y(XTf<#RDQ~sooY=6KVDT~EwV8pft>W?ik2H+CB z+nv7ThEaaU^EE{pVJ=k0hJUqsLhv*qfl+{Wp!x_$mBc!X+>5OCswSKS_rVc7r7lqw z{C1ePFhvaSu1NH|(Q#P7Dt*C8X$yS ztFJwpUlWM9{CMF&&e_ztfz?Zuq;S<|x5xC~yM6@+A#qWgCLV5lFb+v|jv@;U5zqpp zjNHZcUAUy|CP4;&;FIInh;ZVw!i5Uacv>z{t!llL0coIoKcq(BQhNy%NETGn`PGQpv0jbde?j;; zhTp4)KSsyLtF7H2uE%twgS9d^A3Q{>czVvavd`#_Ie;{}cwrJbbj6^CDR2YmvS&L{ zJ=HlT-NLS4Ow*}0c>{VY)<>>0o?YDwlwxWVME(Z1e%c4uTlgm4VA<%{y$}cuJ4nUe z7@uF)QofZvOG+g3Z#y)Z5;zF`kftfTlUn>vA88d@fpe1dXMm z_yi1>iFnR$=~Q5mLEJnpfmZmAD?IpM5W7^pjb>gwfI=)T+j&RL8%E3!1dpjeTrHL0 z^yx-O`2PO7ktI;GsqY?qUz7y*Y8qgTt4^o2owL{^L*pCKf=^b}zezg|@Pk*yi9v`| z(s6U@P{nFooMi8*=xn5W>%O1H~fKc_T){Y@2C*KMqtO&k>^+}$tf)2o3PfRRMwuB0Q8bdE( 
z{EMpSkz5Fo0BPbmxd6I$jqr~&aiu*8AT95K8;j_)j^ti-w-EpiG)^v}>A0b;S+Hoi z+b!`?;e#6JCK2glW~?;^uQiQWnnd>CM2)-BFWFaYZHI%)I+Xqz8NclFWTs7HFvi2! z@;JB@-m}x42NV1R)e{T3q1wBNQjm8+zi*?@cPkVsL>_P3RG76)#;u*36CHXXhov4@ zWpr-qX5In;riY+L!pV0sy4ip8nzDGF3kT>~!kkVNRVE$1+5i=u{=rhwSTV6$IXryf1UToBit}}q^ zMxA0ha0@`M<5(aVO)Fo2WDa)J=12oYhDoDjo3 z)90?t7>@xLpgsNp_6t3Ovzi@*&?_0yb^!*iTVlmWvDar-YD+@{GFEl1F6S4MQ{ zS-C2A6?jZr*~-W2M$w_sSf@&Lp-bX*dG7D3d11)xNzAB}L`a$_)qJ5{WjJEeoOPxO zMvc6o*-vuw*76cw18)2Iqv9Xtr?9<3JJ_3iZm?A@j}LwN zuD!#^!0NtAYijX`MQGeusu=b|U3N8H@Z)v*hUvY#D3iYF(!c=Ds?;qZ zDh@QDSGsCAA+baM;W1#doqX!ze(Mv%W`8uQm^K_y84Rpj%^viDQ8Q_aPBfDoxuhk@ z3WAeO3grs?oxo{32J4NX{%vAvN6WQS4`JS#E_jRQ$@f5ld|J6V7B#JjYi2aHb+gXH z&49Rw%Y@Cg7rn8tmq@+0+jmD9U83vl&sFD&gSGfo+^`7)TJ^Is@+twz8?O|ME}B=8 z7MS&slqs2n`wT{3wQ=~zltA8q+tTquHzy-UTR7_40v4+Q9T5cctR|RYfiP=;_=ack zIa6}USiasZslZQ*VnC9M)PFr{)RbpDDm1CN-nzc(ua_$KnZxU+C?ejHBpPVgTiWkH zJm;@t7u8Ka^RXW z4Ky3gU~z6RCeR#JlkK+H%R_BscxL+Sg`CcYPGZSw+eNn8UFfE-Gmf_72O;AL)XXie z?Ev$1+~ppGP{`Naebi0^K?d2qU>7?PHKJiHvR26#$XYRwM9uO-X8ZH=5D$Cd)MIeS zk;yR&4eC=nQxD&ndR5PC5#0*Qx%W1-D0ZH)KZ6rSC)2um;^X=^oLk>oo1KvrONxK- z^{UXVa`1sC3k2LD>1zWhVx0ZrfN2xqHwsH0-1%{PbKDt_|D-VRE5`fkqjrY1q*PGe~DZeB~0tNnvnvK_%^ zK9?PEl~TRzp>`vE9JHA2l3GpQKzTVoQJvo>MkZ*FY`h7~ra7)eHf9$&`Ze^)8~PBA z`RzY?fyZ}lHVku&nlYKfuReM>4N~5UAc9Pb8j6=4xh7;(H)ULHiK+FJOo2Y}SvTjA z)cL-8w-SrY9AQ9K_`Q>2mvZ!~d?j5tR>tavx|P_W@#1c(r#QZXAFyn*admpP4S21* zXF&d*?B@mHTw`{USnBJV1{k@TdjyBjef8VGMJ>}xDpi$1EFrUQdL(uQ(aGw@OTK?7 z=XaOZOzw6DQ?<;391D@h)FSVLf87+3bo&3}Y!Nnt?oL>TyQac1G(cC1MKsZo(2)$1 zc5#fUY_0-p2j}DO;PZpK$B!LTxY&tEd=LkCz zeh+}82|e8e6~#=%<^A`qlj%h;s`dHXb6hXSp4!*D?*VrQ2SgFK_4g{Gr=R3)pNnJY z7?F+Ke46vC6MaFrT?xug=gRnJ&COq}^Qcq&fs0EpjhiYrg~U8q8Pb(FN5s0~d%Vy& zMMaz16j>JuX0LeOjUD_lp#8C>i7eNdJa#634LA*JwYqH9F%wBVdn)vqwC>N>+&^lt zxOF;{-7N(9p_hsG5jK(OLkx_Md=L33_$x!KoRjM_H7-$dLHCeZFbPx1BYTDTPfJrl z*q3tpE~#UwJMAYzTpn2|)46z)i!k+)?>Rnl?BpFbUniwsC8g~p=Z77)r7=u@O6m$+Vm7p@j z7lZT6uAj~T`Xs*e*84L*T1J+V#^l_MN1LT?diHKRV@*LfjF8n;`(8_jT)DrU2U4k# 
zC5>}*B-RR`PQFuc?>#m@KQ9bI=aPcPbcIa>?SOp)H1xMt^5J=?p>fMvv5u?kUKw44 z$(Mq5h&FY@Sw;*?Q8ZQqU%&?~fudkI&SnnHN zOao}Whd@GIE-U0O0A|ihGj$)zRFMO7LteBq@~i<^s4_w-$t2IpNI%WVQy zcwvQA<$j4h$EUYw8lu=uT@+LoMu2Pr!%>9q`-am&*=hC^Ew^hdnCkgvnlW7(oBpF>w6u_+YBD!JUkx`W%3GGBZep>1Z; z7R<~JOe~5l+S-f?+U6>j{QM-00(@r1uEx^#uI84`R<2$|=Juw}UXHHjW>R*>rn2T< z{3KS|9vZeBI+Ct-W-1CMDopkOb2~hA_LdrZ+RIi@30elc=+_sl1wvESsjQl7{mC(f!}LG>rf4#P~_r9Cb7t zuMJsh2x#6|5?In0DDY?(a0xfqo#86`Y9 zSiCiqncPj?q%HqrVe(D4i*+xrsj++OiUb%|67#N(b&bs!@=2% zM1aqc=l_e}zt|DEI9l0@I=I=p@{=$#{hxz`$i>*!Rn^M-pQp?mBt#~*4yHE$|M1_k zB37<0{3I;w|0V&(E`a~caj=mPnOZpl%$?SZ0#XtY^fn@>1FHWY9py7Y3cpnw*HIz|0q*-6?IT$auX5OFjHe^)U;OB z&~#^&v~-acx0mM-H*?jNvM_U2HFsqe75;x8g#XH_0N;NN|J^Q(|C3rP2YZ32b+LUV z39~fDi@-~28fD!_0rM?b%yh&RtO?w52i9}23 + +This functionality is available for the C and Fortran interface. There are [ID based](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_cpuid.h#L47) (same for C and Fortran) and string based functions to query the code path (as determined by the CPUID), or to set the code path regardless of the presented CPUID features. The latter may degrade performance if a lower set of instruction set extensions is requested, which can be still useful for studying the performance impact of different instruction set extensions. +**Note**: There is no additional check performed if an unsupported instruction set extension is requested, and incompatible JIT-generated code may be executed (unknown instruction signaled). 
+ +```C +int libxsmm_get_target_archid(void); +void libxsmm_set_target_archid(int id); + +const char* libxsmm_get_target_arch(void); +void libxsmm_set_target_arch(const char* arch); +``` + +Available code paths (IDs and corresponding strings): + +* LIBXSMM_TARGET_ARCH_GENERIC: "**generic**", "none", "0" +* LIBXSMM_X86_GENERIC: "**x86**", "x64", "sse2" +* LIBXSMM_X86_SSE3: "**sse3**" +* LIBXSMM_X86_SSE42: "**wsm**", "nhm", "sse4", "sse4_2", "sse4.2" +* LIBXSMM_X86_AVX: "**snb**", "avx" +* LIBXSMM_X86_AVX2: "**hsw**", "avx2" +* LIBXSMM_X86_AVX512_MIC: "**knl**", "mic" +* LIBXSMM_X86_AVX512_KNM: "**knm**" +* LIBXSMM_X86_AVX512_CORE: "**skx**", "skl", "avx3", "avx512" +* LIBXSMM_X86_AVX512_CLX: "**clx**" +* LIBXSMM_X86_AVX512_CPX: "**cpx**" +* LIBXSMM_X86_AVX512_SPR: "**spr**" + +The **bold** names are returned by `libxsmm_get_target_arch` whereas `libxsmm_set_target_arch` accepts all of the above strings (similar to the environment variable LIBXSMM_TARGET). + +### Verbosity Level + +The [verbose mode](index.md#verbose-mode) (level of verbosity) can be controlled using the C or Fortran API, and there is an environment variable which corresponds to `libxsmm_set_verbosity` (LIBXSMM_VERBOSE). + +```C +int libxsmm_get_verbosity(void); +void libxsmm_set_verbosity(int level); +``` + +### Timer Facility + +Due to the performance oriented nature of LIBXSMM, timer-related functionality is available for the C and Fortran interface ([libxsmm_timer.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_timer.h#L37) and [libxsmm.f](https://github.com/hfp/libxsmm/blob/master/include/libxsmm.f#L32)). The timer is used in many of the [code samples](https://github.com/hfp/libxsmm/tree/master/samples) to measure the duration of executing a region of the code. The timer is based on a monotonic clock tick, which uses a platform-specific resolution. 
The counter may rely on the time stamp counter instruction (RDTSC), which is not necessarily counting CPU cycles (reasons are out of scope in this context). However, `libxsmm_timer_ncycles` delivers raw clock ticks (RDTSC). + +```C +typedef unsigned long long libxsmm_timer_tickint; +libxsmm_timer_tickint libxsmm_timer_tick(void); +double libxsmm_timer_duration( + libxsmm_timer_tickint tick0, + libxsmm_timer_tickint tick1); +libxsmm_timer_tickint libxsmm_timer_ncycles( + libxsmm_timer_tickint tick0, + libxsmm_timer_tickint tick1); +``` + +### User-Data Dispatch + +To register a user-defined key-value pair with LIBXSMM's fast key-value store, the key must be binary reproducible. Structured key-data (`struct` or `class` type which can be padded in a compiler-specific fashion) must be completely cleared, i.e., all gaps must be zero-filled before initializing data members (`memset(&mykey, 0, sizeof(mykey))`). This is because some compilers can leave padded data uninitialized, which breaks binary reproducible keys, hence the flow is: clearing heterogeneous keys (struct), initialization (members), and registration. The size of the key is arbitrary but limited to LIBXSMM_DESCRIPTOR_MAXSIZE (96 Byte), and the value can be of arbitrary size. The given value is copied and may be initialized at registration-time or when dispatched. Registered data is released at program termination but can be manually unregistered and released (`libxsmm_xrelease`), e.g., to register a larger value for an existing key. + +```C +void* libxsmm_xregister(const void* key, size_t key_size, size_t value_size, const void* value_init); +void* libxsmm_xdispatch(const void* key, size_t key_size); +``` + +The Fortran interface is designed to follow the same flow as the C language: (1) `libxsmm_xdispatch` is used to query the value, and (2) if the value is a NULL-pointer, it is registered per `libxsmm_xregister`.
Similar to C (`memset`), structured key-data must be zero-filled (`libxsmm_xclear`) even when followed by an element-wise initialization. A key based on a contiguous array has no gaps by definition and it is enough to initialize the array elements. A [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) is given as part of the [Dispatch Microbenchmark](https://github.com/hfp/libxsmm/tree/master/samples/utilities/dispatch). + +```Fortran +FUNCTION libxsmm_xregister(key, keysize, valsize, valinit) + TYPE(C_PTR), INTENT(IN), VALUE :: key + TYPE(C_PTR), INTENT(IN), VALUE, OPTIONAL :: valinit + INTEGER(C_INT), INTENT(IN) :: keysize, valsize + TYPE(C_PTR) :: libxsmm_xregister +END FUNCTION + +FUNCTION libxsmm_xdispatch(key, keysize) + TYPE(C_PTR), INTENT(IN), VALUE :: key + INTEGER(C_INT), INTENT(IN) :: keysize + TYPE(C_PTR) :: libxsmm_xdispatch +END FUNCTION +``` + +**Note**: This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. However, the functionality is not limited to multiple kernels but any data can be registered and queried. User-data dispatch uses the same implementation as regular code-dispatch. + +### Memory Allocation + +The C interface ([libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h)) provides functions for aligned memory one of which allows to specify the alignment (or to request an automatically selected alignment). The automatic alignment is also available with a `malloc` compatible signature. The size of the automatic alignment depends on a heuristic, which uses the size of the requested buffer. +**Note**: The function `libxsmm_free` must be used to deallocate buffers allocated by LIBXSMM's allocation functions. 
+ +```C +void* libxsmm_malloc(size_t size); +void* libxsmm_aligned_malloc(size_t size, size_t alignment); +void* libxsmm_aligned_scratch(size_t size, size_t alignment); +void libxsmm_free(const volatile void* memory); +int libxsmm_get_malloc_info(const void* m, libxsmm_malloc_info* i); +int libxsmm_get_scratch_info(libxsmm_scratch_info* info); +``` + +The library exposes two memory allocation domains: (1) default memory allocation, and (2) scratch memory allocation. There are similar service functions for both domains that allow to customize the allocation and deallocation function. The "context form" even supports a user-defined "object", which may represent an allocator or any other external facility. To set the allocator of the default domain is analogous to setting the allocator of the scratch memory domain (shown below). + +```C +int libxsmm_set_scratch_allocator(void* context, + libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); +int libxsmm_get_scratch_allocator(void** context, + libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); +``` + +The scratch memory allocation is very effective and delivers a decent speedup over subsequent regular memory allocations. In contrast to the default allocator, a watermark for repeatedly allocated and deallocated buffers is established. The scratch memory domain is (arbitrarily) limited to 4 GB of memory which can be adjusted to a different number of Bytes (available per [libxsmm_malloc.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_malloc.h), and also per environment variable LIBXSMM_SCRATCH_LIMIT with optional "k|K", "m|M", "g|G" units, unlimited per "-1"). + +```C +void libxsmm_set_scratch_limit(size_t nbytes); +size_t libxsmm_get_scratch_limit(void); +``` + +By establishing a pool of "temporary" memory, the cost of repeated allocation and deallocation cycles is avoided when the watermark is reached. 
The scratch memory is scope-oriented with a limited number of pools for buffers of different life-time or held for different threads. The [verbose mode](index.md#verbose-mode) with a verbosity level of at least two (LIBXSMM_VERBOSE=2) shows some statistics about the populated scratch memory. + +```bash +Scratch: 173 MB (mallocs=5, pools=1) +``` + +To improve thread-scalability and to avoid frequent memory allocation/deallocation, the scratch memory allocator can be leveraged by [intercepting existing malloc/free calls](libxsmm_tune.md#intercepted-allocations). + +**Note**: be careful with scratch memory as it only grows during execution (in between `libxsmm_init` and `libxsmm_finalize` unless `libxsmm_release_scratch` is called). This is true even when `libxsmm_free` is (and should be) used! + +### Meta Image File I/O + +Loading and storing data (I/O) is normally out of LIBXSMM's scope. However, comparing results (correctness) or writing files for visual inspection is clearly desired. This is particularly useful for the DNN domain. The MHD library domain provides support for the Meta Image File format (MHD). Tools such as [ITK-SNAP](http://itksnap.org/) or [ParaView](https://www.paraview.org/) can be used to inspect, compare, and modify images (even beyond two-dimensional images). + +Writing an image is per `libxsmm_mhd_write`, and loading an image is split in two stages: (1) `libxsmm_mhd_read_header`, and (2) `libxsmm_mhd_read`. The first step allows to allocate a properly sized buffer, which is then used to obtain the data per `libxsmm_mhd_read`. When reading data, an on-the-fly type conversion is supported. Further, data that is already in memory can be compared against file-data without allocating memory or reading this file into memory. + +To load an image from a familiar format (JPG, PNG, etc.), one may save the raw data using for instance [IrfanView](http://www.irfanview.com/) and rely on a "header-only" MHD-file (plain text). 
This may look like: + +```ini +NDims = 2 +DimSize = 202 134 +ElementType = MET_UCHAR +ElementNumberOfChannels = 1 +ElementDataFile = mhd_image.raw +``` + +In the above case, a single channel (gray-scale) 202x134-image is described with pixel data stored separately (`mhd_image.raw`). Multi-channel images are expected to interleave the pixel data. The pixel type is per `libxsmm_mhd_elemtype` ([libxsmm_mhd.h](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_mhd.h#L38)). + +### Thread Synchronization + +LIBXSMM comes with a number of light-weight abstraction layers (macro and API-based), which are distinct from the internal API (include files in [src](https://github.com/hfp/libxsmm/tree/master/src) directory) and that are exposed for general use (and hence part of the [include](https://github.com/hfp/libxsmm/tree/master/include) directory). + +The synchronization layer is mainly based on macros: LIBXSMM_LOCK_\* provide spin-locks, mutexes, and reader-writer locks (LIBXSMM_LOCK_SPINLOCK, LIBXSMM_LOCK_MUTEX, and LIBXSMM_LOCK_RWLOCK respectively). Usually the spin-lock is also named LIBXSMM_LOCK_DEFAULT. The implementation is intentionally based on OS-native primitives unless LIBXSMM is reconfigured (per LIBXSMM_LOCK_SYSTEM) or built using `make OMP=1` (using OpenMP inside of the library is not recommended). 
The life-cycle of a lock looks like: + +```C +/* attribute variable and lock variable */ +LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK_DEFAULT) attr; +LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_DEFAULT) lock; +/* attribute initialization */ +LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK_DEFAULT, &attr); +/* lock initialization per initialized attribute */ +LIBXSMM_LOCK_INIT(LIBXSMM_LOCK_DEFAULT, &lock, &attr); +/* the attribute can be destroyed */ +LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK_DEFAULT, &attr); +/* lock destruction (usage: see below/next code block) */ +LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK_DEFAULT, &lock); +``` + +Once the lock is initialized (or an array of locks), it can be exclusively locked or try-locked, and released at the end of the locked section (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_TRYLOCK, and LIBXSMM_LOCK_RELEASE respectively): + +```C +LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_DEFAULT, &lock); +/* locked code section */ +LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &lock); +``` + +If the lock-kind is LIBXSMM_LOCK_RWLOCK, non-exclusive a.k.a. shared locking allows to permit multiple readers (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_TRYREAD, and LIBXSMM_LOCK_RELREAD) if the lock is not acquired exclusively (see above). An attempt to only read-lock anything else but an RW-lock is an exclusive lock (see above). 
+ +```C +if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) == + LIBXSMM_LOCK_TRYREAD(LIBXSMM_LOCK_RWLOCK, &rwlock)) +{ /* locked code section */ + LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); +} +``` + +Locking different sections for read (LIBXSMM_LOCK_ACQREAD, LIBXSMM_LOCK_RELREAD) and write (LIBXSMM_LOCK_ACQUIRE, LIBXSMM_LOCK_RELEASE) may look like: + +```C +LIBXSMM_LOCK_ACQREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); +/* locked code section: only reads are performed */ +LIBXSMM_LOCK_RELREAD(LIBXSMM_LOCK_RWLOCK, &rwlock); + +LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK_RWLOCK, &rwlock); +/* locked code section: exclusive write (no R/W) */ +LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_RWLOCK, &rwlock); +``` + +For a lock not backed by an OS level primitive (fully featured lock), the synchronization layer also provides a simple lock based on atomic operations: + +```C +static union { char pad[LIBXSMM_CACHELINE]; volatile LIBXSMM_ATOMIC_LOCKTYPE state; } lock; +LIBXSMM_ATOMIC_ACQUIRE(&lock.state, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED); +/* locked code section */ +LIBXSMM_ATOMIC_RELEASE(&lock.state, LIBXSMM_ATOMIC_RELAXED); +``` + +In addition to the LIBXSMM_LOCK_\* macros or LIBXSMM_ATOMIC_LOCKTYPE, API-based lock primitives are also available (libxsmm_mutex_\*, and libxsmm_rwlock_\*). However, the underlying implementation of the latter is experimental. + diff --git a/third_party/libxsmm/documentation/libxsmm_be.md b/third_party/libxsmm/documentation/libxsmm_be.md new file mode 100644 index 00000000..f84fe381 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_be.md @@ -0,0 +1,76 @@ +## Backend + +### Code Generator (JIT) + +There can be situations in which it is up-front not clear which problem-sizes will be needed when running an application. To leverage LIBXSMM's high-performance kernels, the library implements a JIT (Just-In-Time) code generation backend which generates the requested kernels on the fly (in-memory). 
This is accomplished by emitting the corresponding byte-code directly into an executable buffer. The actual JIT code is generated per the CPUID flags, and therefore does not rely on the code path selected when building the library. In the current implementation, some limitations apply to the JIT backend specifically: + +1. To stay agnostic to any threading model used, Pthread mutexes are guarding the updates of the JIT'ted code cache (link line with `-lpthread` is required); building with OMP=1 employs an OpenMP critical section as an alternative locking mechanism. +2. There is limited support for the Windows calling convention (only kernels without prefetch signature). + +The JIT backend can also be disabled at build time (`make JIT=0`) as well as at runtime (`LIBXSMM_TARGET=0`, or anything prior to Intel AVX). The latter is an environment variable which allows to set a code path independent of the CPUID (LIBXSMM_TARGET=0|1|sse|snb|hsw|knl|knm|skx|clx|cpx|spr). Please note that LIBXSMM_TARGET cannot enable the JIT backend if it was disabled at build time (JIT=0). + +One can use the aforementioned THRESHOLD parameter to control the matrix sizes for which the JIT compilation will be automatically performed. However, explicitly requested kernels (by calling `libxsmm_?mmdispatch`) are not subject to a threshold for the problem-size. In any case, JIT code generation can be used for accompanying statically generated code. + +### Generator Driver + +In rare situations, it might be useful to directly incorporate generated C code (with inline assembly regions). This is accomplished by invoking a driver program (with certain command line arguments). + +**Note**: The stand-alone generator-driver is considered legacy (deprecated). Associated functionality may be removed and future instruction set extensions may not be addressed with printed assembly code.
The cost of dispatching JIT-code for every code region of an application, and for every visit of such region, can be amortized in several ways and without dispensing with JIT-generated code. Dispatching [multiple kernels at once](libxsmm_aux.md#user-data-dispatch) or (most effectively) tabulating JIT'ted function pointers manually, can alleviate or remove first-time code generation and (more importantly) the cost of subsequently dispatching kernels (when code was already JIT-generated). + +The generator driver program is usually built as part of LIBXSMM's build process, but also available as a separate build target: + +```bash +make generator +bin/libxsmm_gemm_generator +``` + +The code generator driver program accepts the following arguments: + +1. Select: dense, dense_asm, sparse, sparse_csr, or sparse_csr_reg +2. Filename of a file to append to +3. Routine name to be created +4. M parameter +5. N parameter +6. K parameter +7. LDA (0 indicates A is sparse if 1st arg. is "sparse*") +8. LDB (0 indicates B is sparse if 1st arg. is "sparse*") +9. LDC parameter +10. Alpha (1) +11. Beta: (0 or 1) +12. Alignment override for A (1 auto, 0 unaligned) +13. Alignment override for C (1 auto, 0 unaligned) +14. Architecture (noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx) +15. Prefetch strategy, see below (only nopf or pfsigonly for "sparse*") +16. SP (single-precision), DP (double-precision), or I16 (only "dense*") +17. CSC file in Matrix market format (only if 1st arg. is "sparse*"). + +The prefetch strategy can be: + +1. "nopf": data is not prefetched, just three arguments: A, B, and C +2. "pfsigonly": no prefetches, kernel signature: A, B, C, A', B', and C' +3. "BL2viaC": uses accesses to C to prefetch B' +4. "AL2": uses accesses to A to prefetch A +5. "curAL2": prefetches current A ahead in the kernel +6. "AL2_BL2viaC": combines AL2 and BL2viaC +7.
"curAL2_BL2viaC": combines curAL2 and BL2viaC + +Here are some examples of invoking the driver program: + +```bash +bin/libxsmm_gemm_generator dense foo.c foo 16 16 16 32 32 32 1 1 1 1 hsw nopf DP +bin/libxsmm_gemm_generator dense_asm foo.c foo 16 16 16 32 32 32 1 1 1 1 knl AL2_BL2viaC DP +bin/libxsmm_gemm_generator sparse foo.c foo 16 16 16 32 0 32 1 1 1 1 hsw nopf DP bar.csc +``` + +Please note, there are additional examples given in samples/generator and samples/seissol. + +### Development Concepts + +The low-level code generator is hosted by a single translation unit ([src/generator_x86_instructions.c](https://github.com/hfp/libxsmm/blob/master/src/generator_x86_instructions.c)). The code generator emits instructions as enumerated in [src/generator_common.h](https://github.com/hfp/libxsmm/blob/master/src/generator_common.h). A kernel then is a buffered stream of instructions in either binary/encoded or textual form. The latter is leveraged by stand-alone generator drivers that can print C functions with an assembly section (inline). A [generator driver](#generator-driver) may exist for some of LIBXSMM's function domains. Please note that emitting the textual form is not needed to inspect the emitted code since the binary encoded form can be easily disassembled ([objdump](index.md#objdump)). + +The binary encoded form is directly suitable for execution by casting the code-buffer into a function-pointer of the corresponding signature. It is advised to rely on LIBXSMM's internal memory allocation routines to acquire an executable buffer (see libxsmm_malloc_flags, libxsmm_xmalloc, and libxsmm_malloc_attrib in [src/libxsmm_main.h](https://github.com/hfp/libxsmm/blob/master/src/libxsmm_main.h)). This ensures correct behavior in security-hardened environments. As a bonus, [profiler support](libxsmm_prof.md) for the emitted code is enabled transparently. + +To debug the JIT'ted code, GNU GDB can be used to disassemble a given memory address (`disas address,+length`).
Having the code disassembled side-by-side (while debugging) helps to look ahead and to have some orientation. For the latter, [objdump](index.md#objdump) can be used to acquire the source code (assembly) along with hexadecimal line numbers (length). The offset position (for GDB's disas) directly corresponds to objdump's line numbers. + +The kernel development is much like assembly programming, except that an API is used to emit instructions. For further reference, some existing source code for building kernels can be inspected (e.g., matcopy). This may help to capture the concept of mapping registers (basically a table to avoid hard-coding register names). + diff --git a/third_party/libxsmm/documentation/libxsmm_compat.md b/third_party/libxsmm/documentation/libxsmm_compat.md new file mode 100644 index 00000000..e8274b01 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_compat.md @@ -0,0 +1,97 @@ +## Linux + +All Linux distributions are meant to be fully supported (please [report](https://github.com/hfp/libxsmm/issues/new) any compatibility issue). A shared library (`STATIC=0`) necessarily implies some performance hit when accessing thread-local memory (contended multicore execution). The GNU Compiler Collection prior to v5.1 may imply performance hits in some CPUID-dispatched code paths (non-JIT). + +> In case of outdated Binutils, compilation can fail to assemble code that originates from code sections using Intrinsics (see issue [#170](https://github.com/hfp/libxsmm/issues/170) and [#212](https://github.com/hfp/libxsmm/issues/212#issuecomment-394620082)). To resolve the problem, please use `INTRINSICS=1` along with the desired target e.g., `AVX=3 MIC=0`, or `AVX=2`.
+ +## CRAY + +In addition to the regular Linux support, The CRAY Compiling Environment (CCE) is supported: Intel Compiler as well as the GNU Compiler Collection are detected even when invoked per CCE, and the CRAY compiler is likely configured to build for the architecture of the compute nodes and hence the compiler is sufficiently treated without specific build flags (`COMPATIBLE=1` is implicitly set). The CCE may suppress to build a shared library (`STATIC=0`), which also affects the TRACE facility (requires dynamic linkage even for static archives). + +```bash +make CXX=CC CC=cc FC=ftn +``` + +The compatibility settings imply minor issues when using the CRAY compiler: full control and [customization](http://libxsmm.readthedocs.io/libxsmm_tune/) is not implemented, enabling symbols (`SYM=1`) appears to imply an unoptimized debug-build (due to the `-g` flag being present). Some sample codes/benchmarks enable symbols but are meant to not enable debug-code. The LIBXSMM library however is built without symbols by default. + +## Windows + +### Microsoft Windows + +Microsoft Windows is [supported](https://github.com/hfp/libxsmm/wiki/Q&A#what-operating-systems-are-covered-by-libxsmm-and-what-about-microsoft-windows) using the Microsoft Visual Studio environment (no `make`). It is advised to review the build settings. However, the following configurations are available: `debug`, `release`, and release mode with `symbols`. JIT-code generation is enabled but limited to the MM domain (GEMM kernels and matcopy kernels; no transpose kernels). GEMM kernels with prefetch signature remain as non-prefetch kernels i.e., prefetch locations are ignored due to the effort of fully supporting the Windows calling convention. As a workaround and to properly preserve caller-state, each JIT-kernel call may be wrapped by an own function. + +### Cygwin + +Cygwin (non-MinGW) is fully supported. Please note, that all limitations of Microsoft Windows apply. 
+ +```bash +make +``` + +LIBXSMM can be built as a static library as well as a dynamic link library (STATIC=0). + +### MinGW/Cygwin + +This is about the Cygwin-hosted bits of MinGW. The `-fno-asynchronous-unwind-tables` compiler flag is automatically applied. Please note, that all limitations of Microsoft Windows apply. + +```bash +make \ + CXX=x86_64-w64-mingw32-g++ \ + CC=x86_64-w64-mingw32-gcc \ + FC=x86_64-w64-mingw32-gfortran +``` + +To run tests, `BLAS=0` may be supplied (since Cygwin does not seem to provide BLAS-bits for the MinGW part). However, this may be different for "native" MinGW, or can be fixed by supplying a BLAS library somehow else. + +### MinGW + +This is about the "native" MinGW environment. Please note, there is the original [MinGW](https://mingw.osdn.io/) as well as a [fork](http://mingw-w64.org/) (made in 2007). Both of which can target Windows 64-bit. Here, the [MSYS2 installer](https://www.msys2.org/) (scroll down on that page to see the full installation instructions) has been used (see the [details](https://github.com/msys2/msys2/wiki/MSYS2-installation) on how to install missing packages). + +```bash +pacman -S msys/make msys/python msys/diffutils \ + mingw64/mingw-w64-x86_64-gcc mingw64/mingw-w64-x86_64-gcc-fortran \ + mingw64/mingw-w64-x86_64-openblas +``` + +Similar to Cygwin/MinGW, the `-fno-asynchronous-unwind-tables` flag is automatically applied. + +```bash +make +``` + +LIBXSMM can be built as a static library as well as a dynamic link library (`STATIC=0`). + +## Apple macOS + +LIBXSMM for macOS (OSX) is fully supported (i.e., it qualifies a release). The default is to rely on Apple's Clang based (platform-)compiler ("gcc"). However, the actual GCC as well as the Intel Compiler for macOS can be used. + +## FreeBSD + +LIBXSMM is occasionally tested under FreeBSD. For libxsmmext, it is necessary to install OpenMP (`sudo pkg install openmp`). 
+ +```bash +bash +gmake +``` +An attempt to run the [tests](https://github.com/hfp/libxsmm/wiki/Validation) may ask for a LAPACK/BLAS installation (unless `BLAS=0` is given). Both Netlib BLAS (reference) and OpenBLAS are available (in case of linker error due to the GNU Fortran runtime library, one can try `gmake CXX=g++7 CC=gcc7 FC=gfortran7` i.e., select a consistent tool chain and adjust `LD_LIBRARY_PATH` accordingly e.g., `/usr/local/lib/gcc7`). + +## PGI Compiler + +The PGI Compiler 2019 (and later) is supported. Earlier versions were only occasionally tested and automatically enabled the `COMPATIBLE=1` and `INTRINSICS=0` settings. Still, atomic builtins seem incomplete (at least with `pgcc`) hence LIBXSMM built with PGI Compiler is not fully thread-safe (tests/threadsafety can fail). Support for GNU's libatomic has been incorporated mainly for PGI but is also missing built-in compiler support hence supposedly atomic operations are mapped to normal (non-atomic) code sequences (`LIBXSMM_SYNC_SYSTEM`). + +```bash +make CXX=pgc++ CC=pgcc FC=pgfortran +``` + +### ARM AArch64 + +This section is not strictly about compiler compatibility but rather about AArch64 (v8.1) being supported, which practically covers the baseline ARM 64-bit architecture from embedded and mobile to supercomputers. The build and installation process of LIBXSMM is the same as for Intel Architecture (IA) and the library can be natively compiled or cross-compiled. The latter for instance looks like: + +```bash +make PLATFORM=1 AR=aarch64-linux-gnu-ar \ + FC=aarch64-linux-gnu-gfortran \ + CXX=aarch64-linux-gnu-g++ \ + CC=aarch64-linux-gnu-gcc +``` + +**Note**: Apple M1 is supported but JIT code generation may fail due to macOS 11 ("Big Sur"). LIBXSMM does not currently support macOS 11.x (regardless of ARM or Intel Architecture).
diff --git a/third_party/libxsmm/documentation/libxsmm_dl.md b/third_party/libxsmm/documentation/libxsmm_dl.md new file mode 100644 index 00000000..c67b1317 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_dl.md @@ -0,0 +1,133 @@ +## Deep Neural Networks + +To achieve best performance with small convolutions for CNN on SIMD architectures, a specific data layout must be used. As this layout depends on several architectural parameters, the goal of LIBXSMM's interface is to hide this complexity from the user by providing copy-in and copy-out routines. This happens using opaque data types, which themselves are later bound to a convolution operation. + +The interface is available for C. There is a collection of code samples ([samples/deeplearning](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning)) available including a light-weight [framework for deep learning (GXM)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/gxm), and samples with focus on [Convolutional Deep Neural Networks (DNNs)](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer), or [LSTM cells](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/lstmdriver), etc. The general concept of the CNN interface is centered around a few types: `libxsmm_dnn_layer`, `libxsmm_dnn_buffer`, `libxsmm_dnn_bias`, and `libxsmm_dnn_filter`. A handle of such a type is always set up by calling a create-function. + +```C +/** Simplified LIBXSMM types which are needed to create a handle. */ + +/** Structure which describes the input and output of data (DNN).
*/ +typedef struct libxsmm_dnn_conv_desc { + int N; /* number of images in mini-batch */ + int C; /* number of input feature maps */ + int H; /* height of input image */ + int W; /* width of input image */ + int K; /* number of output feature maps */ + int R; /* height of filter kernel */ + int S; /* width of filter kernel */ + int u; /* vertical stride */ + int v; /* horizontal stride */ + int pad_h; /* height of logical rim padding to input + for adjusting output height */ + int pad_w; /* width of logical rim padding to input + for adjusting output width */ + int pad_h_in; /* height of zero-padding in input buffer, + must equal to pad_h for direct conv */ + int pad_w_in; /* width of zero-padding in input buffer, + must equal to pad_w for direct conv */ + int pad_h_out; /* height of zero-padding in output buffer */ + int pad_w_out; /* width of zero-padding in output buffer */ + int threads; /* number of threads to use when running + convolution */ + libxsmm_dnn_datatype datatype; /* datatypes use for all input and outputs */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for buffer buffers */ + libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ + libxsmm_dnn_conv_algo algo; /* convolution algorithm used */ + libxsmm_dnn_conv_option options; /* additional options */ + libxsmm_dnn_conv_fuse_op fuse_ops; /* used ops into convolutions */ +} libxsmm_dnn_conv_desc; + +/** Type of algorithm used for convolutions. */ +typedef enum libxsmm_dnn_conv_algo { + /** let the library decide */ + LIBXSMM_DNN_CONV_ALGO_AUTO, /* ignored for now */ + /** direct convolution. */ + LIBXSMM_DNN_CONV_ALGO_DIRECT +} libxsmm_dnn_conv_algo; + +/** Denotes the element/pixel type of an image/channel. 
*/ +typedef enum libxsmm_dnn_datatype { + LIBXSMM_DNN_DATATYPE_F32, + LIBXSMM_DNN_DATATYPE_I32, + LIBXSMM_DNN_DATATYPE_I16, + LIBXSMM_DNN_DATATYPE_I8 +} libxsmm_dnn_datatype; + +libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer( + libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status); +libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer( + const libxsmm_dnn_layer* handle); +``` + +A sample call looks like (without error checks): + +```C +/* declare LIBXSMM variables */ +libxsmm_dnn_conv_desc conv_desc; +libxsmm_dnn_err_t status; +libxsmm_dnn_layer* handle; +/* setting conv_desc values.... */ +conv_desc.N = ... +/* create handle */ +handle = libxsmm_dnn_create_conv_layer(conv_desc, &status); +``` + +Next activation and filter buffers need to be linked, initialized and bound to the handle. Afterwards the convolution can be executed in a threading environment of choice (error checks are omitted for brevity): + +```C +float *input, *output, *filter; +libxsmm_dnn_buffer* libxsmm_reg_input; +libxsmm_dnn_buffer* libxsmm_reg_output; +libxsmm_dnn_filter* libxsmm_reg_filter; + +/* allocate data */ +input = (float*)libxsmm_aligned_malloc(...); +output = ...; + +/* link data to buffers */ +libxsmm_reg_input = libxsmm_dnn_link_buffer( libxsmm_handle, LIBXSMM_DNN_INPUT, input, + LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); +libxsmm_reg_output = libxsmm_dnn_link_buffer( libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, + LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); +libxsmm_reg_filter = libxsmm_dnn_link_filter( libxsmm_handle, LIBXSMM_DNN_FILTER, filter, + LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status); + +/* copy in data to LIBXSMM format: naive format is: */ +/* (mini-batch)(number-featuremaps)(featuremap-height)(featuremap-width) for layers, */ +/* and the naive format for filters is: */ +/* (number-output-featuremaps)(number-input-featuremaps)(kernel-height)(kernel-width) */ +libxsmm_dnn_copyin_buffer(libxsmm_reg_input, (void*)naive_input, 
LIBXSMM_DNN_TENSOR_FORMAT_NCHW); +libxsmm_dnn_zero_buffer(libxsmm_reg_output); +libxsmm_dnn_copyin_filter(libxsmm_reg_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS); + +/* bind layer to handle */ +libxsmm_dnn_bind_input_buffer(libxsmm_handle, libxsmm_reg_input, LIBXSMM_DNN_REGULAR_INPUT); +libxsmm_dnn_bind_output_buffer(libxsmm_handle, libxsmm_reg_output, LIBXSMM_DNN_REGULAR_OUTPUT); +libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_reg_filter, LIBXSMM_DNN_REGULAR_FILTER); + +/* allocate and bind scratch */ +scratch = libxsmm_aligned_scratch(libxsmm_dnn_get_scratch_size( + libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, &status), 2097152); +libxsmm_dnn_bind_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, scratch); + +/* run the convolution */ +#pragma omp parallel +{ + libxsmm_dnn_convolve_st(libxsmm_handle, LIBXSMM_DNN_CONV_KIND_FWD, 0, + omp_get_thread_num(), omp_get_num_threads()); +} + +/* copy out data */ +libxsmm_dnn_copyout_buffer(libxsmm_output, (void*)naive_libxsmm_output, + LIBXSMM_DNN_TENSOR_FORMAT_NCHW); + +/* clean up */ +libxsmm_dnn_release_scratch(...); +libxsmm_dnn_release_buffer(...); +... +libxsmm_dnn_destroy_buffer(...); +... +libxsmm_dnn_destroy_conv_layer(...); +``` + diff --git a/third_party/libxsmm/documentation/libxsmm_fortran.md b/third_party/libxsmm/documentation/libxsmm_fortran.md new file mode 100644 index 00000000..660f5ec7 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_fortran.md @@ -0,0 +1,14 @@ +Title: LIBXSMM +project: LIBXSMM +author: Intel Corporation +summary: Library targeting Intel Architecture for specialized matrix operations. +project_github: https://github.com/hfp/libxsmm +project_download: https://github.com/hfp/libxsmm/releases/latest +favicon: ../.theme/img/favicon.png +css: ../.theme/ford.css +output_dir: ../html +src_dir: ../include +search: true +page_dir: . 
+ +Library targeting Intel Architecture for specialized matrix operations: [libxsmm.readthedocs.io/](https://libxsmm.readthedocs.io/) diff --git a/third_party/libxsmm/documentation/libxsmm_magazine.docx b/third_party/libxsmm/documentation/libxsmm_magazine.docx new file mode 100644 index 0000000000000000000000000000000000000000..22710a9150a18dbad6fb7cb064032ad807b0f666 GIT binary patch literal 185856 zcmeF%WmH~Ex+rSg-6c2#cMb0D?oM#`-~3o z?%w-$?{j~gAFIZA#+>t;QnTKwdg~)!rY12r3+jlrApTPkhJc26{EY|!_a8n-0Q-=C z9%Dd2K>hvr@%QiFe|`)PcAl<71dsq^00lq=z-51x|H~rq3%m~u=$ofLeCli1zwg_C zj($4#UqJ`gfgu2a319)(01ki)-~spm0YC^40mQ&FfCL}~$N+MH0-yw_0BV2+cn;74 zbO1fT05Aef05iY>umWrVJHP>O0$cz$zyt6Ce83BU9}oZp0UX$U77c848=(F;fCIg525L3~J?QdB z%~pTJ@?Uw*(=`MjBmP-G%ztr{s!Prv?Ivd{|6}_fFVhBXbOSZof?jy)*MGNQKtMtM zS^vM<{--*g#`05v1fJ&pQ=dF-|I7nV-Sd>kpN^mU^*`qEr}+J!%>Son{#E|(h`>`$ zR05;#X%0Grap(-Pj|a%G5`T=xr+Hxu#^uu(W%;Kw&wR-wY97IwQoOwDc20mv8(Pjsq zuz^*J7I+%b z|H@rYnfK2c{;_^J5aFlz{%hqK1mqw2?>Ph)lX?=Sk#=FY#<1Eiq-r}6wWum08X z{}d<0r>+B>%$X? 
z$)Uh&APfixB7jIB3U~uV12I4>5C_Bq2|yx{1SA7*ffOJWNCVP=3?LK80w z+yZyN58xhn0Dc0$fJfkug&Gp%EdUCD24DbK01ki$5CB8~2|xx=08{`CKnE}YOaKeO z25ca+zasEg1pbP^|49U-|NcA3$HXHp?`6`rWjp@4#w~ZV zQM8+w{xJ>cUHdvQGO`WBMe$M{Yis^TFfVrz*3@BMyDuXdQ(LIJWe7Cz&NRtN5a#nu z5ZH$x?9t$TQ_=|ZP?EN}`<^PMNxY)8REgutHHPPY$dt@ylYFWoxVk=9><%IG>@=}P zetfp#l6iE_N)p0ft@4QQ7rDuRSCL6O*Y{7>5`>N6?cp)cm7PjI20I%tKr~>GJjxt` z)K}xkv}Ert!)SA-^cWOi9N8$-{JbytQ6T60NBnHLz1NI~p{m8)-Yqkq7bD8A@$RD# zZ;iUJ5%|nLBrX?wstX9vvE^%$GEgwmZGGX1_ zKD#<*qb77z=!(WZ-a-34`cdOW2+_bcJkpO(FEXBG_qB2{iqYE zH@B|4u3PW$3(rr~jJ8B384UZ)7TtUysQcO1VGe4a;?N!rThT{DI$^%D4Qw|EU}ytrWfi(PQ-;Jt6_27Z=pADb|HFgP|asJ+u%$bg-0EH9=oGJKy%G9eK0%$QsAu6kumB7Md|Vs*7#kH91}uF{0r<+J#Pw(lLp2!xFvi zz~_V8N0*<`h`B7aUuuAi3I8}jn5Jr&_j3h3Ae6=Owo#Bz4g-TytCd>Q_XTTKLvW_T zWoU(ZkqUHg<#Xkjp#wN%!|O!EzKnp`CX~Psmm!QBEzD;l>Y0K`0R<2VTp=694%cB*@1 zx7R;KFO#k-X9j&v_INmDzu{wloFk9k^tadl`Jq?B-IqA#LU&mP&*eokbW`*-6LyJ7 zU2ip2IOdQ!!V1S&XyazP@adt{97H1?0#PoieSU&qYshme9u4am?YQ|vCekd2Xk_`O zItVc|f^F;u-1T6Tpmf!+xh#FULgp_vAN&y@tEpagZ8f{-I8Id(BYI$Hy@f;;LJOpM z;ZraiXLRsx8gb98A5Tt9SAx|1oc=!M$JNnH>22`P1Emfz(>xO<{%XbodsMmaBu*}2 z-to&~I7NwCTxhZ~OA#cf^h%?=1b|c%XDomYKLM)K^5kgz zu-WI)<>3hx`%CJ;1f|srHeNS9PM!!cTX*3Y?D;KK3?YBvCzh--+!Tp zl!>djsI_Ac8#hn=V>yiT=OnegOm1n#2E$vYaES&z0pDKlQ}^3&PS4H+!DE|IB+Y;? 
za~QI_hrfm|10PK*)m(`*6hubv6>hxj%f6$#jhSZVx}G>e7YJT$y1vcN5}zVhnECqf zg{xz5tWm$XE6*1*TcZ3If~#B0{?}n>{6XRjNXY9R^qk2`B1eU!?N^XFJ8izm`0sp; zc`uYy55{zFwPRH(Tn{+LqB~gZkYBBheV;5El>J5(M^h7>Fr{Fg9XtjlqW&o+|5L0{ zAw^WV>J6MZLR{zSm&(q~^^IfnRK5aJLxaORi(+IggLfi9l74+rmz3crtfKe0QBX@R08RPD8+_n~#%mC=R$wq`LPIEi9?lSPz1a@v)MNMC$yk z!EYE!O&H&7v#nThQ8kS(yxD@EiRFpvPtr}Z|F0C8J5EfU5 z>l?cn-FnYxg4w<9Nn&;TbvI=OANEC@wN-ER4Yelu*IpNo7hZ%6%qCLJkhc%x!|m?p zV!qN`Rm1~`Ml61X?YZ!iqn1~C?^r3bU$3&czJN`Mqp$C0C!&4p(){Vhxy*OQc|_v( zujNS-kH>cpa1ff#de1BD$j2)}6|Hdw4;2rwNS}2q6i>0iX_S~&-FT=>Io$0-sgICK zkNH%NPW3G2XCe_oZY0gEexa}$7aaW5&;Od5{k`smmF)&v`mdI(H5%o^__w-Ke`-x69}FbFv<_zRNv*op>cWxhVAP znN!Cp$0Rg}mQuBOF69%dOo}{mcam2l#?SRKkaytOd$rm=kKu(5E76I-mav)!o2)Oi zOwI|=@L*n5DcYzVy`=Bb?3}02_H^E7ntzZaP-N9&{~n0^xzzYBCrnEPUsrvuSd%`? zYi_Qp@tWgR%IiRW*<3seW6kxVO<-J#FR4o4h8@8N4Lc$3vzllxcjofl)TAXC-o#4V9 z@dW#M>Cw86e$~rT0dzI=0`^u3M#2}+t2&{W$o?isTdgK{KAdt#DCKQ-+3tkD_Lv9= zJ(j%FG44&b1|3e}8w1WYMQpW_hk1^yrZ^oeIarI;bj&7-lKJF#<(0;HWqh`XT2gPBFdxfpBK`F6t?UO{GFFjAj z=dJGnH%b<0v97@ zoI!J*&S>F^kof8+Z@W=zA8g=bB3TVS|o%u zn^syxJ7}C*B{bGwVMwaq`>v zN}5}#BXJkDM|(Ry>^`8e?}+JMz_7I7!O9F>fMC)&sftm-XDgF2XpPR1|6Q=%da<>> zJDg_FM#pl5mc%W_q624G>Eun{ezOx3VkZRt`~7Z#u%p)2+4LNwJ11dojdMC>ZTy@4 zQVrsFItIOwHBEOkAARKNa7z=eQH08i-eU^drRzZRyqDafEjLjqTpYtCMomBNhSA~9 zkU@}@i@sAdES`HINGwI(MYOptzA-u|o$hb24b_1vn6=`u?3DLn>oeQdI-y@sViFE( zFwJbL8-q3yf!?zG*aBNUC$qT_ocJl@8^=_%k9VbRlO@`*(MPbPw@wI-ALT<*Yqau~ zT3wQMcUdWQ3@`mts!X=iw=GAf*QdC)D&Ub9ZXxuZ*Pd`eJ-B1FPPij`mLF|YwO{AuG<1GJsX_TCK@(8Zp$?B zQm?hN#5*|#<^W`pwRkz~)%~9|Z+-~+9NaVd|0s-ZIohV9vIA$ZyT=~}gY@g*dSzokC$C`%4&>k!u zW8Q~Z=s6NPW@*B+M*K3>Qj*$e~n8#HIr49Rbs_WGA*$N*Z7P+N(Z?Ij-Dj=YE!$BX(Fi)HEp_m}-g><{ zzxgrJL8@{~M~9>U@Jg*YvmtSak^7oIy66mHJT z+m!*KBgj2zP?UIa>rP+LwLDO!Yw{u#@`@$o4Jo9Vy^bO+-BrdALXw_CW_=z-8A3z- zWy3+_l}08TuE!x}z2>XagKzX;LdhKQZsDGobzN(+keDk@H?z7<@VjJnB~xz&{Rp%9 zoNRrt%szXlh*_^cjo;C_R;ILIsbaJ++lGil&L>F>=FahhveNhF_k_bn&k%2jds{jk 
zQ#$uPFhoSM&_gi45*WmiUGNGooP@eWN^;MgkG{Rx!$hqAHEdR-bP6YlJdkBeXidv%7I+uOOR6+N?HAgF%z*_Z0&HHCo1eYgsH47eC|CqcvL#Rsw(TsWNt$bCQB zmXVU&DHwQ?E;Nr{n#e9v3T*Eo5J`02d+yS+*NDArPspGdGO9{`^h@!P+E`4;LkMIL z3eGzGs$Zx2dEm#&r;sML5?t~@^Pdk#Z3+{vW?JU9B=d!Z8^*4-w-5Y}UP9`~xGD*)q;MbXg6P=j_5WKjn+-bMzXoqOOOy4?aZC zlH^>-l|}+ zws9!!N>?jpXH+Hwm%%-_Eb@v?EXieaW#o`lqsW#*Z4>(%wL%Xm%u_alirUk%w+_Xc z2A?onmepS;&?aA`I86P<=nDxbrZ}~3e5+1m-jb9?B7qqh269RBAs{>UX_gPxKLgdFY zT_erkF;+W4r8d>vCnGYMIEZMnEJ2~*s^;|-9}&WczVOGh5n%z$ zB1xiJe5cWg*BLCPdpqz>^nG4qcOr+4Ny^K=+x$;tk5MLszKtK~YHbZx9D0!|y(LZ` zhKtKxw@x)Nnv*Lt=}ZS(#FhANFO=4+}fPqR=Xs zJ`H0*=4ikeJ^47X#)+_NZ5R!I@D2tx!G9ZuTXzmRfxSLPPmB5xt*-#hx@4CCJw<~X zy>sJZ;zDO}>@(!=jk~V#h(2HLL|Pk!ev)iv<_&*Q@})n6&{8qGM8nx1XfyL$C+CL(?;u@ zS!CKvD2Ka+k^uL;;=Jey;w8STb8UY`f!t;+=pyvvL>NgjlJbNsgi3D8HWkmqSI?KG zqP9X^>G&w_R;`ZLc-!M7st zu(NX=i7?xCMrCMwD{E`SJ5y7`wa1CDV+GL z%ba%9@3}O1x7oZB-mc+p;BuHH?qWKHa+E5tMV(c>Sz9XkG%BFOg6*PnCGDW1QA1YU znTT?&T59IH3d>rNWn7iP{X=0fWuM}@L2<}QlfG|Q$1j~$R9W)YAC}rZ`qCzh%ejE~ zjqAt8!b|Flnj)%XseG2y+xO6|KLkc_`+WxD%xOXyUVMv1-htG&_-&zo6BS;Q;xH`( z;hNVcX^0A^-<1{QdV*k#&w#h;$T|3Q-+7AEy2lB$7tX;ai6M z{!im63mq0$ByUKqM01rB6IX&cAPGLqdHJFI+;dP9ZhO8r^JVcNJ~#qt!V-JxR2kuf z7WeJ*$Jps}F||k>KEk3@C3#x$NfkYnqFK`q?(b8OVF;0fn>X$!maew&VD{fE(C&%P zN^x4;8PjGvT%BKuZwSohz$JY|Cn&z+lKnUpkjM(jlZumDY4uesK(;S+@Aw_kdQbc$ z!E6E9zSZ2$yuRrE?0YOFWiFAB?|Feddibra8)7c}z?~Cbn9jKeCxGM9^6t9r~KrixA~cyPSW8!@6H5x`<0~ zgM!6%bWT7*yA@wx+JRTItZ}}iHn}+OQP_H&tk9WFTje*oTg*r5;f4@{D7>Y@@+eeW zLeyM#*=v~fAx8%-b*c@aP}@wIWGei-eRbkzSoNHNll#n4X6;|oz2nud(;w$LI7XJ1 zKI?Nh^?i(c)McESZuMn^mou?m?S57<&hTNr7zr6C;KxkGW)sN+xBQLS{`Q=s-nC@(oeF*D$t&ny|um%%r4EaqcMXt%rXGrK}-Qxv19pZG1# z(kTj63#6-jf~rYOne%wbxI5LVvDUkus7AW9zlv%2*2Ph4xU3}?1ihQL){rX#4EV9u z3&}pIN;ccua6XSSz-0+%Nr+M6jH#q7`KZUxEd}Z2egNh3)9K8yW4uyaWkPw3G*`od z&MChScfj(zgY`u};^+_MPg6ljA-l7q?}*-pK?Z4?l6q)6-`OJLU7`y1_OV3|b`i+E z4ot8c+;lX~ISh?_F(qOgrGKYkIUj9te3zvdP^*JTWku-AB3`u%IpyV6df!dRY`Ux- zJ&qJ4)s`M$1}{PNC?_N++aJDxllSJCGPAdoom*1#4S^L}!?3TSX9% 
z^UJ{^-UT~y_hTfe^M^64qO}SOC5kv$(fE~+o(#IP0L)yP*O$r-P+}t8-F8FSdLGTh z))is{Y-uggm;DTnk6bcQ)Q{cE;|%2j-zQW=p1T;Ny!I|vet(3A^E-ZI8^$3(GzkiY zQ)M!n^?q7^4x4Frk`_rYVwF@{Rj1CwV6N^+dQ_um>-(z(sJ1a9uUgDf&JReES1}?& zfe8o`y3gh~ja4-1UPYaJRFNdwDnOM`Ij!7MIb4{pKRiYbTKd$Co!Z1rdP|v1sIzm= z>Xupdo(o;8pkSu$?27+aQb6;=X5r9DL}y6m)Kr)CRV33e=7bY#;gmZ3$_J5An`5$j>cl#vQA>CiZqwAmdg{R0eYiJ|z6Qm-BrHVo z$Fn!+y9t+8$Yqn1QFVoPiY}qZq$BJdvZ!9r9w~Hfq1?h~tsnSkYl2}fOg!dT3MAAp zWyG+|6yd7BlTh?_E*JIPJ%t zHcVvUa(6tFz%A%9EQ+Y#(euV{w?p(5opUc|-tidhf|=gfkTc%P0^Lcb+A zo@`h8+VJPWd~L^;m0*x#Bhd}-!_gTWbFGz&%|fezF1E_y;5<417U$II%qfjtbXEp! z;&tcg_Hza%yKbVgBg16VAc0WzWiL91Flmvimx^`Bse*30FYGw;-L<`Grx^VFu4{Xi zO-*t?-7>@)WJ?X;+hKjS4zO12sGc0!M8W6)|RwTaX+1~q>jDK z!n@6dUtaSc82+?CEi`B7D;dOQwPz#PSfDzpRZc*1#tMxwi|0GIKSWO2`I1FW`HhRS zr{--B<>;4A1twE+id4<-4ojR((MH&Ivvc`awEL`6Z>c>I)42TAEB5EB=VwHq+6~1H zY@VmNcrqm*DoI#$8gIm1U^rsTHt2fWIT^i9=e&|EMYCuu{CT++Q@?3&INeTIz`$6y zy}}|@)Oq%fi?}`CEtDQ&m_#)}8L`#+;4|Ti%0O&rh-qI1691XTZ*jv?Cj~5Fc=Zgg zG<|!x3Rsq|UQhjA+&0m1q&UIiviym+dK_hs>s?&mV(=Dn!I=ny9Ul)bi{(NhUrMMYbs zv36W7=5m*h8%e3woAFV6W~h;FBt>WI&v8Z50*RXD+YO@%=_SD)xmmRE5gP6&re8<(rAIbUVPFU*|tljzK zX0;vEW>@MGn%sSyamflEUs-|nz3g1IkCU0Xd2j)QuI~`Vop*THb&e?$ot8xE#ESNP z?pG5f{lSBKdpu$G<}x#tjn?6#aZfoBk2+dK#rO#!e|XzEEJQx$auBK)CZY z5!0Ga-tC9y^oxNARC2yG<)8bPAF`7Cmt`&!rBglaf}ae zH8UGZaMLjyZz3bl$GV^Y>S9Yv{fP;;g*t1jhQ~p1AyV^OAr4LIRqe>ie(M}qA1Y?MAHG>v_;rZ~y$9BxM^b$$|qh}S^! 
zi4Sr_5d+8lXMi}(wGnP20?NJ5I`=b~(H?k2!`*Kyxd*s10|SC4*>cyu(VKGDqhF6G zG}Ccohn3f19Y#-K1us~qypTkq33B21)G9Fszn_%Bpk*o&Qy!Uct9OoXsHNa0i&19l z6a+6EcO2FSPeGiLV>rIqax3?GA`p(AX#GxE=4qv{K9*E3&8}sSAFXU+v+;G&%8W^t>vQhq9lk^VN2_=5lTkg1 zZo4RcX}xAxbz@rh394xL$Vxxe)hNtrd@L;cDv4xNfGFStg@X?lUHa zBw-AVStf42FV_rN3`Uiji&-kECRgrkqcVnZ46Q*`$bvOHBn*Xj?3cd@2&ZMO5USI(pu-wTlv%4 zk2ArP5v-?~1u-Vq-XNX@V^MJeaTxA2LU|UNT&DjG5e5OX2gP+Z%-!*I zMUP+lQ8FAXgI?ukZ82vJ!xR#?x8~kRv?pfm>HO~Q3#~caiJv8LzR7pl65+46WMYo{ z9ZWObQp}XTe^s$LRi_;>^R4C1uzDLVnCw7njUvP(+8XoVYB7)*dgtVUeV#${80Fg+ z%)xkMs8@^t59c=-=?R@FPJ}%!io2xu$bi{?$nS$V=@VmIVX{&QZ`bxeuDqX%?(Yax#4f`4|oK3+h`}8GBK6I_Z?Gr z%wko6g1cB4?~|Cx(7#)o=oiNpi5a-J+)lY?EA*}~kJ^aaTOR6}!VjmVX-Sbcy*V7C z#iML}b=u<9Iwcj+cZV-ZGAUs|gUVO|6G|2E&f1HYril`D;s{aC``uDQ7~aXj=Z)t- z)NRtZ(lcT+OABM`DGuk06Q_l-HXrcMYbI+MB-mw(FpOv% zDNXsjS8t4!O$Mm!mR8WMzqN4K3Hoe(AwG^KBq?u%T~ly-8xDC|Z6A?3@?m6O=M8Co z7wb;-qm3MUsXKJ>&Zkc}_H*CK4nh?1PZ1%ASabQElAHp#p9O|(gg7NQUr+nnA%6lZ9vGYLJ5Et2*(EAebdQ%;+;ZFbdDJUdh~j)E3F=z@PF z%)BU*MsgVPrH7hWMU-TSdmHH6JkHJ5ol7_-1(t@FKWoR!2D8St(kL;0y-q@bbIf=Xg9+E@ ztT*ftVqwmdZM(${jm4A=B~@}4 znemoncJTarIFG@#}xTJ#|g(&v32cYB+i zW3lXv!%+o&gMlmmSyJ8-#i&GZKBkb7vnkMIDUS%62=gV}I2g)Gk1iN$NoZlxbbH3 zQ%@WUlP&RMwYNwKIdgE(fm&Y+y`b@1?+y|VIaBO!r?(1x3~AcFw?;NWUZz7X>f=w4 z3kXR&=D{#93+SJmH8JUZ_HJ)~B}VrPo@0Z_I}|(ogB;D5wj|jHzc7)d=+SK6ByxMt!g~9|Dit)B5xB*RFZ2uR_~Zh9H~gG0Ae}9zC14dXT5*WXV;8D))EAd6b!;usS$d-ebU2)M1&qEiKoN zYUVFrP?hhxV_0!B{CxBaaw6jqFJfM7G)8wkcaFom!`w^N*0denqHl2!+0mU7Y)kH| zBn$c&r&L%klEW2Zmgh7_F$zQ@s)TFP z5VGyl+%O}daN#{4-X2ttVxRrUe6g1=Yu=EQMx#phop@t6kWwg0IKeX~AiLRflX~CW zxqWX{fW)wdv2{GX&z>UhI2$ep6=KEUVVxFv1zLJkTP0pepDfC24P0Q3JVo+;92_2ojYrQ^KUHn8a@k&3}s{|L)uMSKyObKefT|HmhNN7-@yK~)N zwB<9!D)R;y8ZD`{cc^#@60eVHn(<4m>~~5M8XE~V?m%KHMN(XRIO`KxW1rVKSfQxS zoxAh82Zp4!(lWkU>6u zYK)CcRn%8m+_jZCS)EAx!y9ZkELv~GKC?vo-x9Po1MMYUvXJdV;!x8JwUyCU!t3sA znGo+qleX@h%c^~mIM>Uc*)u&b=%I?6yc&Qu2rlC3$6U>uFl3OStx7OS&9gb;MZ;M> z4KH0{ok(E6Dea?uWe!HyHw^EJ6{nBS5h@ct%oBQ~3=={Jz 
zfj3BW5YR&xJgNLNk-~~X*COvP=0XPQWt;0TOhv<-xP`wBnXM2EzK=M3$I0;{pE~M> zXdhO!o%5oA^ok5EyKl6AL_wjv<*kE@F$Wcvgri{eGvkT$c2~)FgGT%WZ%mJuv2ce3 zUQNxsQJTM1FxuG4Qn-Anv2^d&KBfkjpZZ9lD;MK(8+%A2M$s*{Yu90?K&4Tbw3)>*hAlI^X_6r-IN@EZ`O^?0 z;x&JmZ+UFHumPjtY_A;`?g@L?zNgIMzIll`tblvag04U#Tzc!BddB_9pmo_yi~n5q zI15)2;iV6fMC)vy>DP+8aN|#AV_n$^6WxP)X=Aau{=B0$jA0)l_i~yTp&_aXephT> z@R_$jbmTFJ`DE78UuYZ+0ab^NMK9k6ue-I@O~nHO;9Ze3{t+k=-P5Kryzixgk$MMzsP~oqd3?(n?f5 zTk(w!%g*3eE~}4gUROFCXuSwS+s+>qe+X|SXlWXTy3#@jyQ!>N8N$a}5BhG zT5vCiQi2luVZU=8mn8GbB|w8Iv0xf!nSBE-m$E{+_+63UOwYL6>PYC9dc6+6)GY8n zjl^vAA2uS>g!2X-RuNAPUus?s@mRY%nDOM(3=8Dvif}*crqJBA3e84eBT1lb9+lB^ z!BLdSR7b($>qm+EHi(0E<9L4SRc55O>4qqyC}Yi0n{;#tW1^YTgxRM4ObN&lV?(L&w_*F);JrsO_pAkv$$8bBCpM_A9>TO z#ECVfdJl^#+o1?I-hTNf={SO;LX4u(`LJ%1+YqM*9W5Sfm5f*XQeWF$-sZO5CJFT& zN%Tqb;7FaZu3Rf~-A4-DdgdCeuo#jr(#o462W~*=j1H??tIBu(2(!EqvAq}aL#-$B ztW=e}$Ew63`+(x=6Ae>$5@lS%=DS|eAHXbCI`Lqmg3ceC$m>HlY&R!)qP78Z5*cYX zUb{rulicguXh1CM-XSjOZuX+a9J=atNj6s3Dt$iJO zK{5{`G|F5a>|f^>@c5_YTORx~P8wF?b(DsC^iVtHZP+wzoQ}_~XI9F1Ib%2WS7ctY zs9kmATua6azTXQ#&Kq)-)7jQ)xktLZY#&9cxBTTy*(}#tp8W$}oaU!#v&X!7(`ORBwh=?Tl(ZX2etKNF zbUXb`Jw+_%vnjzlkCWZ$#j&4FHBfkxaF|jh4i|?$F%8$>P`uEo)h;9OqwUQ8grQ5> zk>KsZQ#4$jN_tt5N&ve^b$f&0=Va5J4sC2+z<=dQS8PWVjq^`poBQ(zP2I8#0yAE%=7i zn)5~E>0V0AoM%WF@tG}t1UxB`ux?6KY`)VEDGz3^UoW)pEp-~;Xnfw;D6(D|m-w+P z+M%h;>wK&KC~{$r((_7FytEwYw<1E$6=c3C>vLge)I(j2{xGEIynUM822R#{Z;j>` zsv0b%N-Au@ZTAtrnn*-0+#Qt3)Yh0ZOopr)i|OYnI`;?`d3z`669pQR&qbts9dQ`106 zsVH4&`L1d^&drGWm;I(d(@SQ1Wn-0)nU>0bc9tQBlmY35v>wd+Rt6$Q%vRR+ZhlN2{ZK6z}StgF*n^4o?hL$T;&Gev8YKgiZLEZqN>IFd;KG_2Jdo>o}u zgAfK7 zJs~d0Z|+Vf*vr$^(Wx_bq2S)dL@}h;bOc6T>|vys(-rRr>4c-c8A&R(Kg7zJIS~@1 zokohpw+Qv0BoQ+(ji+db_GcuIFZa)D9J$Psd+=%v$yCWM+@6N#uQ{R${H$9$AWY~q zO^Dx}F-YSQ?~)vJfBDn-)~*JF<}7v?6JG72KBodN1D*(hwq1Da(P_)3!_?~oQ}b3`YjF3@uY#Naa!b8_2ETT&o7VWKHa?| zRnpF1tTyf`;%9L!6}c;BymWOU!F>)SixkF{Cp-d;7e@JfVD{V2^!r)_0;>XT6 zs{iClQKqk7=}vbYAmkjkA98IOWaX%!+FpWAsc-%nYG|r${5^NgAm-t&mZH64t7n{w 
z3HF`Kwmz0PDcihv`I?NH_T*1|ZV8rot1Qh&E^Ggfmy$`G#dlttS<1VTDXvwbNgy?mt z3z!%)3PQ6D!=6dxeHLOrOB^kIgrD#$2;v)jv(o-GCqbsIuSKj$(L1h-e#Ooi!C*D1 z62xG8HRb0FKnsWjZVJ`G!F~Q!(cT+m{*Ha|cU5uZD^Y}v2KjE=lg}L+Ri6ajiN8CK z<60Hzy2N3VAkCxHQ*#oca^R6dXG5&# zb&R0{Q9L0zq^X!ful!lv^EG|bi}j?;`jirvu*_Fx3tf z;4q&S=eu2fr>&(Y#u#?J?N|w|zWlB72>}BNrfR`eE>C=i>H*XJsgx>P+%Fx@=jj1HRbUfTeDyO{w6Uz#Bpv1-VHkFWQud5jE5ApT&chqy zBt^)Q=c^v%^(YbJP>p}~|2;U%2-PpBJE#q2AyQULMv%4sqy)fSR8(1NnoD*Ob2N2F#FX6ZkIN`mxl>yl{!o0e zr#V|XqFzJeB?*fGq|9PCu=?Oecg1Z92dIHR5*JpS&~g|Q*njhyRjW)YM3JdNIZ9sP zlW?dr#)Fhg8@!5dcCUNg`uG>h%$MzYx)==hr05fl(Lu;aBQJ#-2vlyPJi#M;C~t6OUNfUYEcrdgEV%8zC>F zKj@QxNll~n&5sdLf3WqDMaYc^esu(Uj?ee_S4sI;VjBm$+&k(_TWqK(6h86f1) zh7`2ArWr69?~aQ_yr}p$=wrQ2C;y|Lz!5(3q7}oDY?^(dUWuh(csil?j*}v9g@JfU zIxuslbJ=1nu!BHH9|YiWCJy{%p=%2kjNiT&1F)hc8lX4{Ecvdd@#0||28)UiEr`rC z$T>kMF8R;hJ)^0KFqNGFOkCE|$L&c}cqb4pEkrIP*F2X9nPWWrwzVoq(U`tZR=+Mx z7p|HkpE@Lq#9)7b{gr3V&XrXLl$Tx5^v~W1#G(as#09&;mQPvQ9tVGf=*cATt9W^Zw94$lQpDv|cntQBnVOh3<)toG|tp9Z|0g7d#AavmT8r>p7)b;+ATj!f zt17a=2Qb^m{AgZ=yP22P0S5<2ObsEif0EC2L>~lDpzpAtho7*wT5@;Z9upLwOmgw% zW6&InQIS_cML+RcL5EIuj}?una+p|Yik54~S>6*0qG=O<-5#l_T$ z123#NFTjbG2VSa8kAS_P4FBIlbBG*FoAL!RjDfY38sj-VL0ZJSn9Iftq_gv3{pg&;6xB zdJeOLXrz;8+u1pfGT#9=*X+1)sZE?1WSbU= z_j}=wN-M6BC~(ODEStvBX{SHYs#+Kk-Yb21Xc~mr12cb{h`gU*so^yzZKf$MTuM!PQ$KtVLM^N}C zI~;Gv%)GSVAwxETwhWSyQ~)v|!=>yl?dZeFO`h?8B|s z^^3OuD;K+!cLS}q#TFvISo7Ur9zRq(YUDWO-zDTTG~Y0c_eM4%OYDRdL=6cHU}r*c zHm3f6EF;grv&eAN@@a-Vt@0sv(O_94CQcHxt5T&}l{_NcF0EWrtEx_0Dsr%^ zzc19mnIksWTbt9{rq96*pL(mVqm-x6!_gPN++oD&Jlvj}qId}bUMSl9J zeKp7=J*3`FQf5S3`E+VUgtFoSIwc1dJh<9NU^x9&DCO7R{;3Qv?jwuplGw6ZC+R`} zpyg*s(lSU$qoHoH^>$6|r#O(Yiq$xX)Lu)`ve5zytQz)rP9bPVonR%bO@Kqd>Pw9U z9O>Gv;XXU8?2rnv=)_=>6h_LTyz*}ej&cBM)>Cw*t3TW@G!V++vrb^H_sk5`LwtRN ze_O&;j2?&Bs9#Niaj3dH+s7aX zk;{z}&;=lK7XYE49f62z3Tbpa23{{Gz)-Qt=SfdC>w`4YQ)e1cFt`<{7Q%Ws@Hyrb z%+O!$$Mlx!PleezP>84Ro-Aovc(P^(V2n93i^T9y$siXCaj@6iomA8_LuKsoS230i z1WEbhx+9xmQpr!LzNxu_&q5cop5k;bryZKl@-qUCT{@)+vD|Gm*2qO#Y|)YM?VIZd 
zcY3gXMZ_d|7sK<-$6ou?Xx~Tb;zzs!PBFq0Z9qp;1L0BA>bOipzHjSrD1-*4C`;7p z$%$mxR&P2;A2r96TUKCSMSd|6aOtz+%K$Timv0IDIKZZ`z@+h{I=~n{8e_aV?;gNO22)`qio0co)bFW=RzjxEHp#RVTRqISp1m zK4ZsM2W0-&>8g*yw9CV*thBAhJ;pzg)7y@+%&U^()(YQE(UY95@W4tD-@7-E`UEHAVOpal;%YUx4(x#Li4c#^=$Ac zV?HM4LrVNo6nKF6r2Q?VDBz$CvOXN~*A^AHI$bbUQLtj35UaEOJwU!lsLn3JMoVht z#eY?187t#M07d8 z?s0nXlUa9?M!kyn?Kdj8h^CeCzGyu7)~D7BAn@V~nN%+t4~2^hF_? z{?=q5fJwTvDKcLMgF*$p6YQn_y}m;Rf2%vsRDx8W*VZdej8psdOd;?Lo-QfeE;>&b6o#dz)zW!O` z_++o{-vQIl(?^fa!g?2MRLoO7TU80tqwvk0K5Z zJl%IXhN!xkjpncC{F}g40tjCZRlY(xN)p*IZkxanpib0XpG`AEyKmbx=f27lexO)n zuhue9_~k7f+EEVf*UtBJdyPso5Sk(RBm2!-a zfORf2LBHD5h`qPFL~vP{>62|Yy@`ET%sTK^Sq^fQvwlg5IuQu3oj3olGZlX|3Z?bY ztj(AqPh}N8#y3OX;edNoq47u z_H;B@M+=e*0XuGTXu(>#NrnLDnmQu~zIIufdSxwU!FkX&Y|sSCjPC#LdqOkD5L3@O`wYwMz( zE|B2lu${o;x34V)ew{Lee-0BZN8=#eI{R>kc92c$07>RZ0q03OJ*-O1>G#Ss9jarZ zq+rXNSzW*QAXRg+a{hH#4;$U8VhIyNx~(VT<-1Qn4l=Rv_YER@Y4Fs|jf@73u^mn^ zsAfXk%j4-jb%{NsvAD?!b`^l$cqJ^$zx?2+j4_*w!40ov|>HSi~v(w|h99@;4oCZ{FhYy_+3F{ljz#%q(z1T0)q z5*;aIU3QN7G&U_yORv^(h6S9s%~!-v(j-Rc0UfvcT{S6m~xyA20cg(`yTZ4pmT51}h52x0z`BD<2_7TbWKs6Pfp_F9KU z_U{c-jP@0-QcPUAg32bzPOKz&5QKjH9^P~5%sBn4NQFriX15oMHH4vXCM6VLGd^cL zLy1B?>Zc)*{Q3j_)D-u#uNe?*h3Hj-*`I5WVG&1tSS zq!(eUr*5cGs>anHl7Ae_4;K_pRl&K(oe2dkF(HJEvePA~OHbdpEYrj191vH8>dWOzhQttFw+J zcoP|!@(==?3`7#)J82SPB+%S|-?%Ws68(Gu2f|E3i`=l;oh#Nel~MXs*z2hPr^--; z&cWYKtLh=|-izD82w$fi17H0mG(rJ4t4^`-i{u55phYU^J72aDudnQMVooTT=&WwB zCRQ`4KAw&~z4)nN#}<{GyFx`nY91LSgXSG&K`=(QYwu})BfKxVM>kQLsaO{nioQ!j z1@zUP#1O;Fbp%!VC_48O47{5gIA~=jnB5^>2jCs3j0HFaEvh9;W{iZl;rpY(K5SgEL|JKL`y(fG1ck#1=og-!1K_-AmTqtgqw8OkWgQVtahnw zFP@RRnC^T{BYl6j#O6aduezFiaXI5f!o+@}FBp` zpRHH9t-?5AIKxD)0PiNaND&}l5O~^YPuK8p2(EW7fhjzhQ@>xH3^eFyOXJW@Rq|Kg z1F#4Ujwb_S?2}q89AbR9T;w%4=UO*!05;O>zgaBH#1EB8=I6{D7Wp%TuMJxFH(Ihn96^cdfj)oo zWuD4QWyVTAEl)y?9-ghO{!DuYs)v+q^jBy|sAV<77P8H%RTdY%?s!Wb5xIf0FU#Xg zba6H*?}uEq8m(d^-?DqW$}nxXkG&D&Sm%wcZfgjJgqZP)BY>R%ThfK??*Vxa?R_VR zd2c}bOL;86yMQMB4MBTg!Rfwv=czlggQY(B2&qho37fk%lBmHz6#;W4Fj;D&1n;Wq 
zel?{nboiL(31rD&isqT=V5LJb32QnpVA>Xmzg3Z+^Ur#G{DJi zI3IaA53Kc$d0)A_6yq`C?UB(d%1BhQsXu=+ZXm??J)IjD?Ms3SOhk~M0m*}Uy(8v9 zvHALXq=HMAQB_^ZDn~R&%Raw4iBTJoJIZX>VN#KgMeVF|Zcr<3u7B3qqpx@{;nR!B zlgVdg+94fIzp&ba_QCxwvYwt~a#aq8VEC9e0p>a_i94gU*rO+t*qw76B|2B~80;&v z*HNZ0*Tdh32ABt>4aSst=H#-7Q02#Um#s2URkm51e=5c|f9sPG*3~l2IYiL(PG;2Db?>VcBpJh0N429vM8a8UzJw`$%uSDj-fc2t0J5Zv33Adrr(49k|zq}XDbt<;WFG<(oj+R) zu=pXZ-2gCIgdmaw<8;uUb~XmGn{@YV1ZT+C=DDU}Z){Wj$;nF=Zy zRIQkw+ zp1l>@c>HC-KS6u$2U@ld1t4Dq)61*cF2fJR@$+r&oG~AJS*pR>RR2CB{<)si=ulH$*z; zPmXl|moyI$+S1XA`WRH|H!UHdsb6AFMO9KB5CWYN@}E5^ICnki#-{9p9qJ@v>#!HM%M?ADDGe_NHFb4N9O>Y@5Uagv zrl@$>wMAk=1ujj!Ke2+CA9H^0lQQs4m2oPsx0^5(!EGm6S^@GIE;nO22dj~kB=yGv z)nJt{1@)Ai!8fhHkGL&i2-+_sdKT45NTDzMnAW$4XMXcl)k5B>m(D4qa7|p`&}*D# zm>_jX^h*6Mf^LVQ(hOzbCDedptVkkT{yU-IW!p2tRFF?dA=q;?8P@AQgDo*!pH(yJ ze3bEn=WYZ!K%Cpri_4DRE=<@aN$^L2Q@@^dCB`8Yu z5yg~Guhn4tJ+b#PE^pRtEw+-#NTx7#v|fmyqB^MKa?StYm~Q6MIw0?I8WMy46llR5M+ zDU=l2T&i{DMMt2`K$Ba~xaXS-Z*J&Q@8NBwEn#h;Ft;X}|d&vJ~-Idr>* z;Vef8ypMEfuQp$X(8rF}Jm1LcAZc4mwF)Ir2(t+!D67N=*$lcltmbKARi z^sZ%nF`AoKcYM|Q>9YP9O4Mxln24nOZeFk&4xz63G#YQZw^)WD^upnTK?Zuj#rh%3 z_|s)-tn$KhPIyua+(HKueSy57QtQkm;;N3|a6qWK@Qlj)w9eP$n~GBC4D3d(Q2B+; zR9u^G=~0QO-SwK3p$wBfF>fw+ozLga(t)5~klGuNpSPENs~PU;Kx^AruC38JYXu@@ z8U5_8ux-gyX<(e{0}zai;{Kj%;<9J-uir?%-_1M>mR)ym);6}p>XL$hz7@oV* z%RUm(PtK^Zf~6WU?4couNEBXPQZMI*jgk>lb@@ykfx9X)xBPpkL%Zw18p*Eu|;@y zF1^4(gI=>CnH_p{|HAg3YUcu>tH5?HnYhjT}dkBoRfFajSUN1k8N7BY|Mgd!RWI-+ik- z|6FVAF49)TCYG32+7m`h*-3)MdiwogA$gm;QD-CF4xFIOBg6^f-yZ$^c9?1l$^7$S z4=v+Yi)5Xut#Ogzo#Y4<>-Xjma%~@|#HaB4non5mJuVVw@OGLsOQ&=4T^djChH7)# z5Y2=BIS`#>VQo85qN{oZ^m813Zn-w&gFGqrdX!SJ_(F4bh(2(Fhx7u%zLbdC^X5dKZoB@8w$#4o9;qPJ} z!zssv^Q-qFXGY7{4}E<&q!1`?F`*XU?6qbi(xd4(1cP>bA0gD*z#|nKAoi!+d10y>dsObV%ba8TFqLcfGHoBc`2ej~G>9w3L*o(R-C;2S*$ z;5_8ACfYTxH7Ws6WAm73vM{ox$iG(1U4$Btrnzv|M}D~E;WAiy zjMW3^YQz>%nNlH54%{K}vS)SR|!g1Yar7UUuYitx{W@EJ(p0t#eczkpC~E(pbTQ zJ@|c;GvbPgo~E0dB5DDAd2I5S-HX z6E`$P4c$pdS&C}QFGC$C5${;#Ck%|1BSgQHy5DW2r9y~714xKL4V~0aR+OWa<^&2| 
zzz!P<=n6Bj6wfaHDzp1XP8RqX*@=c>Am-+Gr3=lz<3bLm_(hsEPg()d%TO$%F6GU9 z%p~&zp?aIjXPm_uEy_^E_F~TQb+0QP&WbUK8ObTY2S8MXYO-qPeUPy-M^){yJUtU8 z^)$ZhR=dMYX2_KFg#B?a)J%zu*Ya!cT|~c~g>B%|?8Z`n<)gUh{(HT=>HK}5CT@R2 z=^+8wL@{myifCsn1JSLWwpu;EoF;y3Ai}=GVTnenLUV2heN->U^r>qyko_s+ofO;iTio(qq)%2;TAu$vHh}2!%Wr}8(17a(hMwDN0bRx8` zH=oa(bG1T39AE&M(yJlseM7P7`1zMFFYx{NRh9*YXJ0+wF~1DgDKz4 z$?0Fi+p65c$>S`PMWuZ`mMW)$hCgCNN-vueh)cAWfVsNTL0Vtm`|XMVU&x{qSc9Hq z@y@lGU^8dn)rF|S-Eba!`ijvrl|B5wtlU~_5Ul_Zmw8Ycrw)gNG-LD(JN7|^GG*T{Qp zg&nySEByI-Qciz%UblX ztF3egjEn_ETH3p~p<6_%6>Bp?8yRyNvpB#Oc{&)O^Tr4exnS<+TLxRV7Sz|zQ8Aj` zf~O)sU_{?eb4jw-Qjb3Icx``C-Q)I;JWz5{Y!c?yymM9xA`ycA^<&Vk$`u-9wMVv{ zl@IUZ_jzhze_-9|abEaxKn;PILcP?hCeDgXOxI4qPSl(P3o+K1*oK|y4awNi4akVc z2{k_oN){;Z1KPGT1662 z=mZ~0IQF~ZZ5hpJpN4eqipA1|+&Z2px`!4BilEe6V_5CK;50-OQomN7rV|9W9YU;M zgsZN5h8T9NZ{mHZZxb9-ah=+h+;Ei8kM7gOh0PNd zMfen`6K2pVVxx>esP#3ojQ^6sweZ^f7oV1bjQJIoNe8l&-#xU9!2^uZLr`0gi957|@D#?Ejz zB{x^zqM(8)P08(oVQEQg-4ZB4?jTj&;B~Dnf|gdBPX#^_-!Q2 z#_eZ^`!cV@nmlS8%IuBFY{&U1saA_zRbNRL9i<5E?f+oC>PM(pV`d6xi$+V>KL7VU z3(pL|zP(iRcVOvM%T;q;mYTY}nP#=F`|zz#P=Sb9K7d4K_I9%Ru27Vj&*4&q#JecH z+$_w6-a@~@R#`ugHC?^(eo_1nz4C4Pl>jS1@n6*neFwkaobswg8>JRa%fB)cR>L&V z+^M*tQYfU_lCH91iLR5Bkfatq@FL}KZl4JIYx0X9*q5?11%-_kw4-LO-Ip3kWy~J9 z)5=1R2>#3z$ntd{%<%hMVUY{DTkZy4bNMFDMTOyFCDT+p%A_N$!$2mC8P-f#qb_^-zEnRL2Z zXkEdhm3OE_WZ*m@%-b@tg8}xF6$mSi=?@IEm?$vKj#Az$7K3N=?|&omQ@tEVAZoLf zn4>QYkD_|p*j0ZvU{U16p+{LCadKJoDfk%FHCmCi)~K<&kNOni3gGcv+lk^;Qc-ZJ zSZ7}0*zR9V;xyx0fP~d_hEvDbo(LJXqYkXgjKmTZiqz%g+t+A)kHQz|7?BY2t*!z- zmy6D^54j32;#X#ohRq6qusL?bn6I2~EDMI>iMOqi4-z<&8!72!MUNwi(bYtCdQKW} z0A?bpQ)!4D#r_uSo9v_|-vf66uuw*4-Rk8YFg^k0X={flN1^f^UxvfcVyzc8?Idc( zzA2PE0)H@8drzUer=ZGKM&Q|Xt7fUJz%@S(chNqYUsO*}pBkG#j9a#TGozM}fdFEj zT7rzdo2=-+ibfIj`m6NjuW zHE_X7Zvi+@6%2>cTT15W;Twy7wx2P< zne!$4W(~=o3i~BDB>}~d*`TSP?1{K zIK-}VG5J7HYui5VnXgVq$Yl3Vo^*);Voi`ro=sbT!gRaxXpbOgoj*I*0qI^s6nz8k~Ak@tU7v zHq=Lq2l5^<$fDD*_~`cr8&>s+84EO)#hXeY#!gEBtJ6b2aS+)E=US*v=O6u|cY|tF 
z-(1VzY4+w{axXr27u@MeRL4nz-E9!R%a$Ho7VEX9Gx-6r>+sdD7Fj^ahX!Kr0(g0(YbTEhl*!m*xjCO7m zAO)Pg8KQio-wjeQXQLQnUrPW&NW6n{FRQnhTpjMSu$bf$iF!SZ)a-#qCVx_khP4`B z*gU8UxMqd?!)5~XK=l4CcTFMekiD%7xA9N?e{j~!)r zK48ul&x>m28-(hI0cXE9pA3&RRdIkMG!0rT=pe3_cZ7J62+D(e=yyhUEoiE&EqOyQ z^{zn>WU$K}g*yTQ&qk;OS1=n-AirE3eHc?m#f;SP zLt;&hcnT2z6U&WAS!wlJ%nh6C+*zaup$+OZoRaqN7^zP)e->UlwC%R*b^AlOgcyIr z1(X8E(EcnEMcCEAdP$>%yC&tL4?02hyM^MNkaltFK$T~fHEs9r3Dxkt!D(}zYJ;nA z8`d}T60{;krl#V%ML$BCw18;NTA=WF@rvAjcmgs*c=%xeFkF`TVXT{C2-mllC71-9_KmV>RxQ=d-W&P%#;x5P ze>2Y%PxPi4Q2G&qQ1x*1MS$JM*Upl?E9SnsH)5WFAf2rT29nugwhJF>8zi#om_X<7 zDiZa#6EVmc%g!v`SoZC^xbuq6oE{(dnum52%u{-a?)=nwG zt}36%s?xEB=4#nX7t=U}1wLCP`l_k-Fx<{T!i)?g>>~FcZF+Z*xFg=%KIXpl12$sd ziHHv~0mqH=85x1=+l(E@5l?L8Wqa!Nr95nbQ|_~SoT>zx$qV=r;eefo=OhNIRpA)< zR<5Io1u4`338w`y(W1nA=y8&%~5-#fg&L2QCj#@4a7#e25fO)79PX-`!4#M;Eg&fKn2;QOcR z>$H9rLl?D9&R{Kg7WZ`oe@37zd9pRlAJ>4WNE3>^C$BOmGCGZF^(gF2)ogN|*U z_^N&@?#%P#^C>ew5j~vf=V2DVpSJoE&ddPM!XohRl>6)-9&wP3NPz%aeL-BcyigY> zV?wLlrh2>=z#E$>j;garu2z-f*CK+n?_i>ymdrp(B{5{44NC4xt1GsVZV$e3QozMg z2TLbr2_Y?~W9%}|A%^)E#pHZ8E)uJzjr&GnP@!?tAemONSwZ1Oc!Iw&@)`8IB8lJQ zK(2g9sLM?QacGDkLBzh13Y(7TK zRZd1b@3iCI8RinYbh;evJGlDlpnd;q#G7LRNM_`^|KFGax3+kv!s4ZxHZWVRDH1Rx z(-{EI^Una16VS7JbQDoqB`+2vG;mtPTWT(D4F9}r(1qO$*T3`d= zZ5T_nQt)W zTQYO0h62xKXo5=KcnpBzu<<_(QPYEKn&e_)HG$QSh?YK);1LLXzmn)-_+J$&Z+{+W zsl!ILSO$w(Q$=V6DuY4MhLIfV=&X=pLQ|92zMF`BKqH?Sz-u1sy){(%9ieTl&4XpS zSrQ;_zK_JX$?hn_EHm0^2_x$rnBRJs#}dWKFw``3TI8Akf=|5hlOagR7G8?$HJb1b5eF({4UR9(0{-bE#{@b^gHQFq!!$g@NGbhM?W3Cg$!#VCwd1 zz`jdskNrbBbV2$?-}9eY;tSqt@jDe`Jk^L&p;*EXU-tDEC^g{w2AZ=2(_g64P6{@D z+|tj(Vt2wH!*t`?M(#PM&p{qeFeA95anglA*6y2;Q$VrS0LnWY2YEx(s-SidYIqtu zIzMn+lFFk$mm#ovdengguvlIg^)g&U0-XR<=mxl|gO>FSw#t0g`A(tQy0WGk7IH)T z!hW5Fvy#(GT;xFf)D+@W0h|r;U-lOtnJ5<{RI!nkr;S<z;zb-Lo7G*8t%*xgZMro7xUv z&}Q}^lMq`}(*1QFprXSO*3_H~LrAughBlbY|D)xx)yDZ(^P|k0&#)LrF;{M6!WEBU zzhO5wh3e_Js4s06aG%p>7EpGG_ViJJ`w0t*g+GZWv<4RhsmLyehZiA1ErqDof0@9r zXQQ6N!l&+bDIHe_b2Se0Wy*7TL&10m4piOsnRVtZX1%FW{rG_z#;Mz$@y$%ax^p-_ 
zD;uN-E4V|c_O&#i)}`mYqQj4{N*mR{fYa%wCqdT?`iVGUK&k4fRKu3@feeI!j@l8# zlS|w-QkPwME3#LNVpojY=A~1uGn9)wL7s`dz=Bj5H6b1oIyKETKm12S%%Lx8oxhh# z0?F@NpUbustJV(dVvTHSMcKGG2k&vuPDFiL4)6fEVHS?NlfB7PhYw{j@kihI2zz0H4 z#=~;>W1Lfllf{Txzcj4>nc_RXWa-dMs+@O}ncG^{8Oi6?#&Kp3@~{bXmkoy^i^7sv|19E>r6^ zQ!#3Ql!a>9ajyEsuipTSD&nb17nMBFnH!q+wgkkja}FF`uH&>6Oac@PbctS|7mCQY zZEb&hm{J#ILKW>bNxGLLwt}j*KL-RCYa2Oz(N(7+cNjvgC)}p%yPqic5hR7XOsmN7 zcnXTa^g^rKFX;1xnz3QUy>3`xZaL+YjvZeoD?$UbE1!P6x*VrNbBYW{rrdjuI}v;w zOpf>=^DOj9QZrPZz7L{o=$vwZfA>Mt! zzd+D{Zuw||NfpP;{Upo}dM$tnrPcc}ZRHqk?lcF?0{J>+`R@_e;MRyLiA%$7YG5kps9R1wrd9yS`zW>d*j9K(fCq?O*VnO~-Mx z)3HE#Sh&}<-?`ag&k_Y9h?SfUVT8VL(MX?PI6t`Po`oASjqG2zP;U3PQ{~tZv85S7 zU?}}p%MY_kl#U{;d@UYkP}l0N_(}@?8bCxQ*XF;NeCPI;Wix@;F%|UW_d{*y?|?tG z)6|{eXnQOIsAjnyC~P`hNgy zG-9cm?~v0W4Cgg%b@|2>23&`Eg$6Gep8T`8_+tpVFBAAAHSWfyPqh{Ft&S zr_#w4(|l$WnH9g%>Z!%B+3}gHCMQ~;bF!JtMUYm$5@>*sGOuMO+p%rgr>Fr83GS8v zuKC48pne04Ql`^I8QSMB8riZUQLc#t-*7YYll)nu=pa>*#t8l50*IZQk@N3;R z*9ap}O+3Z%+>lNCP4!iXAG|}wby*{6EII-_bJ%|jra?WS(hzII(riNi_BK43q`n@j z`e%C|($!dX&CA1OM1|jSmX>Ki1PRC0IFn#Jtb+McCmxf|!BXTfxwlB)-6S;&q0GcA zH^*EwDsYfbaPmuw!Rc#x@-Pk1_@;GfP_5}?@)djVX-#@b(7#JivG%)EP7<$4q3TM7 z*p51H>1rTkO;TBHKy`>iKq4x`eWH3UN>-G+_sCcm$F*T9q=%f_+_#sAQ5Ebiid zx!N0|7YWQTdAwU{|G2Xow7;Nfp6PknGM6AoDMoq>(OA!?&QEZ=8;`uKQ~$*&Q7Xy$ zuQ~9m=+b=pp%1AVQL(5{?dIycIl=^JDD zO1FKGvqn$uwof;i(VxeZ_XZ+}%(nTIF7KaW6OI+9R5TXy)V7*!AblWSfVKb7fi~_0 zZ4%g9fX;R57QaToPF?1#72Bjsz{RRp&^f8jBA$4oPO14cTNNT`P9e>~WNPBw!ej6X zmMQOGN&N?k7fU47*flWqvHB}xmQ+QmJNE$e3qgW|Pff+Nl966V$KtmpTzEdkeG<&w z-s>4dmDaD`5lF|;PQg8LgFFwK{ToX(AtjYVw6uAW<0=uy4Dt!6d6ALaB$`sUS=)mwpCkROEkr^IapN zPek+|U`1p@jwVd}JulCVvcv|pZcn>M27CmDSSw|w19y?Pc;&2 z2O%-}!dN=QFhp*8E0g=k0AXC;bK52*Y@rda+0jU(8f@LLRM*8i>O+Y&cNiS$Z1N^E zS9;Vl=4UoaiOuo$&MIWtW@&ji!gZDNmmsr6DJ-A`9GWY6qO=Q(S@}MOU}7p5oU!ox zDYIo5|I=Q1CYh8A37_y+ogR>6yK;{3>Uk44NzymDiwHvm_WJsow7@E8k2W|n1*DML zL>OV5z1K+QKu)v8LQQ-(#x539X+sT_73j8fdr>6qRq%MAZznA4oEL2GT(Vfd`uQda z_ZC$p6=jWn8crzIiiQyPuu<9ZxDI8wyER-CP?vGXp`t~G$($`f|NkT+YF?@ti 
zDjk_T`hIG${|Tc5xR&!+&D`_TD{;}9-T|UI3Cm^U^a=TV$-v^o6wm4+a6%!#Aq?Vm zlp*f6eqUH5-Bxd&!w0+f(?`DS^gyD7PZAfWEY|C7$rRkU~Y5=_{rSY z)q3VP=3gAkKpjq07taoO?2`X^Uoz7|lEYCin$YD~PC|UGS67E1N`4f3{2sYMLE%@M zqsp5jGollR;}>$f)!)r$Uy01lJNZc}Gi4-?udZI*GT#r;Ju!8oy{^M3_u^<*PAA;X z-go`sxX==4%lT%5^6-e+c;BAU3Egn2Uo~>xo;ESBPAQV+g-?|K*2es!G?@y-5WS2uKA@quDG5ki&~Y62@NB>IA_asZwg8qI`=na&1_>! zPkZak5vkQt?8w~O^>YI9HE_`(N0s*#<@GD*@CGF_C5Lwv4)Fc)$W4N8I$R@EiHCus zCTyN4hSqrpA{o;xfHDWOJ5k_9BHHlqw*YR0h><3=(FWUJp7rj4+Amag3v3tum-=}VJ7dE zr?OsBRuej&;OPHvj~}tT32{hF0KzRfJ%eTVNytpKU9%*)>Y|Yj3qA*AKI&BaU25lA zL9F%X!?8BjqkG>WP|;Vne#a}5Nuj~FC;|j>-@dl`kA6(jSKCq>i532xV>ZF}4vSqu z4I`sp7X7XaUgL}T(9&4F9M?qTJhh8RAE-T)A zc_>02)VFDUL|8>oGz83ovm`=oXXD&lm?*p|%JMH8p|tJ27-f6q(U z>6W{L-@W4r2YIJoX#06fShOX>L^Z?AYEA^RkCw``$pk{n!Z-eHHw3FaBN+n62pGi1 z1WwR>xE;nI&Tq4VzdU@sn-K}$5zpy4Wc%EV{cmBiuUvZRMA4ajBE&5p3ZyOP<|Un^ zFB~blUr6bHd=&x(0JqYmpDY?+iyksl5x|cJrh<<4o+v6xv(^a|?tA|)#8c~nWNkf_ z>X-RH+h&!0!HCCOwLO>Yc(9?GU?&jD^a$q56&D2i0(=o6cXcQUBGrDy;r4L7ICLp& zA4}dv%TAf$hzqRwtxB*8nSWD)+fMJ-8q!6IxNZ6Df2m#l@ix!d-Z>AvbCB~jKDd$8 zwbZ2<_v^@Ah(5guZ>Z4mjcj7ag;I;$gVUCIno$3>M%|S`N8Q1ZE!?-`nmZ9Elsijd z)%0vFdlvHLq{RuGnKEutYM4&yB2fQL?5v9Thr^44o0!G3S43qS_~q>F?iX`1@H-`@ zt1W}kjR+n2)k2>cAqQv&Z&|jg);Z~m#leq|c^}(skh97#UJ2g?(7^lcPbl~`ie=)^ zsSjj35PoswoC`@RbtDZNYg2D(s0yNrlfk#}yc7U5jngO|?vi7upM9@Xb(Mb81{I>- zOfRJFX{vmqmteJKk%bB8zZG{9AtdR<71#tQki4t-QxLmaiiVTzvN@4STJm{~Z3FZJ z5cfI~TimS%IVes%IU%N%P>reZ(s26?q&CZQ#Q z<^;Xn)O}j?v_9RJ;QBLl72TRt2vA<{=&n9w2LK(>@mx%%Vt z8KxMeUO|k*9oh_>9qF&=g2l4QT?EpEd{BhK${)bycn9BfB53sM$DF@VzAc)Ruf|n# zGFn=ee2j#L=r>bD`Wv}e45Xttvk^S*X~MbKAPR(Q`+WKvJ7xv6 zOTjm+Sj2E^rZBT(mfW8yxUFNQ71u;tiY3JsLe_3Vzj0|_lI_H0Eli=xB9SEc4~@Q5 zd+9!DzJn87GcE-?P4{&bBkS%FpLd?-KMM6lo9SwETxC$4v+S7n;ZNz+p*&`~x|)8O zcyVxd0YzjVFY8&Crw4Pdb2MY!f;{0R%?;0{m)5Oe#bIRUaohbZA3A?w@u{l%4`3z{A0_gGxUe^MZ|Hsu2dw$uvk>ozf6+%eB;AL zv#SRpYn*m7R7A3fBovpg7T-C9vmOupoO)>)1-q)rjkr$4?La|EJ8#jtZ+pZqR2uo? 
zu5^2)56sf=R`XP?(@BQJcZf|U9=eHbi#F22_+njt=>~%XyRJ`Jg!Az$B*fMGycuXD zJ)}uLsO&tMiY!*WE5kmr-4DZ9qtT*oS|_|KFMW$6UeNr)>4 ze0>3Wa?FnFw34E%*TUg%L;BiG>#p>A8yN#}+~9bU4NKct_L@k@&X@P1o8SPfAwbJa zTXTeQ2uyJ&6|+VGqS$Y&SYKbAeM1m6m6a$2{Nn?2kr_r4r$O=NZrY`%sMT8f!F`JD zZ~+sysU83)51SVU1lzhS9SK6Y_}Hwuo-=uB9Y!2R{M3ZW4r6mQN)EZn(3YxbKAkGFEPB(*U{Vr*t zhaJGVRA{vqai*bEK{#GqtxhPIH3QTs+1*>{8}+LalgpOgk0whSI8rJ8TEqm<9tyyI z!*(d-exP*!+#|Fo8$qmzcTlgm0F9L+)MPrd1P5yFtCA-`W6M=$ zMg?h;Gn*!1M1S;%iYb$weAg%?!SII&?fIkj+}KClf8do6m2(#WutU>5by07``b=T& zm-L*iCz!Pt={r})`5s2>2worEAzRdj+WIuD!NKyF?QfM|Hci(~wWT2dZ`FxC2**2r z@vABMPW{2jT>)h$Lc4enc4k)H$j2CVWyPZp5hl+_JgoPR+-|kzH`LKlwe10_A#qyNw+Vn?|hz z-tv1b4qt)@f%%#yEUO?{73U_X-319?S(S`v8H1JG7f_5a5%&QoJz85mg-HKLPN9ekKrq`iINEHb{AO0aQt3slgc? z=wIXvIg9)7EI68m`am1$7Pi$BD}GOz`!dIcOzJVDz7bZl4ej`gWv60>)_Xb9)m@mU z%4AMLVyZ#WWqzo1Olxo>dBeDPra}9HZM04=gvSv}zxNSoo8N-IyP#r(Z7r69 zRVJTC=G!9EZq0gjMC#Yr%Fs)QYVqolZc#XPuK0*dbYRu+S$M$Lk=JZxpC+P zM|pn=ZCGn+0R3cxWu*^*o%RACET#8*D^wF7K}I=nvrGqL4!VeN%0Q$VZD z^@8$F-b}J_+MLJMz%}<7LDOI8nWQ;JZ{a#S3aVOnz+TSPGHAPNKdUh(GL0%sH7Nic zZMJ!;?-ft!Ong7SqdlN-bT?P%PEar=E5a=HzSMl-AolhAkdaw*!+O}Dije0&m$-^_#FsaQ@=R~=E2cOe(-sS3)rt;Q5GDck4_3OG9?$0<-3C8#ONMzy3t28lD zDlgBv{Yw#cn`9pQ1p(+xT_tQJfEjOSmT+y6)y3=pZem@Kz+VCu)=d+Gd?w$w)s%Swd z9XH>zOnnDYQtdHHoAj()f{cRc)ka_6>{0OmrM9^g7D!=s%Pl&!OcuzxPh~-t7kYFd zzKGZ~4UttwGgjrhIVPXkl>aO+k*HoP)j2eADI&hgD|X(8IrJ0-NyXNoCx5}rfHmwi ztHwv*?;7F){UVvM1TeTru#M8O->C!5@sIq6qSkSroD#(-=l(Ki6|vyi9*l%N30a=J zdC?l@VSN=me|JYmz~PIZHU;HGrO8aa_E2{ZiphW8!j46?O4m#^ITsuzYCAw4K%j|l zu!UWiY_bJ!wsye10|%W7fQELBT(&|jwJXGg?xYJjMG{%pv~M;QZLLx)niNDv-pN}d zx&2@^g?1Xmgpp@`S@ji6vBa+#(i{yyTyzee`vGCKTEO5x6B6_2+0HXoz#{n> zv@H3Cvb@>FCi3x%4PzGIMd&vXHd%^||&=>270&rLVh2VS~%{mAd*Fb=Gw z;;R0;=ixbf1=%^{&Q`a18Gv1jN}>+pZ-KmW>M$+lbmT9c-u2#Dc2d8Q;+#fcgq2Qr z)i}C>j8(6?)EXJp1#<|0GkVLS)E^Ki(7wy+9A@HX4c{8p0ifuek)W{Z0S(N7WfWD_ z$-VixLH75C>8{h(&Fm^Vo20)A0oJ5u@8V}jEfWK7PQc``CqeWM_O~>;fi=Q>O=sIe z%xd9&N6g49T*eYzh_DEznOO&|22RF0n^X*TcPG4;bCZ;x{^y*A zN;p6#nJhM;$NrhN1o5g3SY1@%~7Keav;J 
z=2WFnY+Qe~vGpLU$*n*R_&a8L&6iWJk7I;^`n{y6JzWoH6U(2?YUEscIrO-}u1cU!gp zKs{CQ;a$ti9wLWQ!k+81Zfi4s{KAMD;=;iO=)yu|%9Q2Z&l9*o7=bWxM)|5AJ^j<{ zAsff`Np)o7P5>QIYsP`CU5mqn%f89MM4Fq4ExjG%-m&Q7=6F8WK)fycwg;j*&q*Bp zue~_i{}&4SIKBZ|#W4ZeprG6iELQrtumSQYiRU+>>H#ba zHxK3-TxcgF#Y}M#$$PzG`G#zXyn8pw+MmS>np|JG`YoLMCnH~#n}*{srOkdNn5`#k zPTOtYrm|C9lhw$jrPXA&ZAx=E<}|AAC67-9KFj+xC_8u!aaK!rRJup(!4a*1QsV+p zF=-mFxeJ-8{1~y7B;{}hd8icU4dXo%<}1!V9$xP4MC-F6Gh+vp#%0{A_n{CfAVQIG zgSSQd+>>_yQgx2lMg%%R4#c7+sTst$uRSBR@dnOq^qvIbRBzZX0vt>K1{Bn>{BLGJ z(f9Dt>oK%!8=HKb-5Bj+V<{^Gnui`A9PO2m5L@MU;Zd&~cj>G=GTrXnI~$w-5gD_Q zdVjloeU3cpFKrxtFT7~`#PtICcpeajHuQ(1Et zAP56q2g%+!nd2VIGOBsPMC^L*P1m3 z(HqhSDl^!Z3i-d!|)x-kpJQdeG8Z7aYKi9x;0Bn@e7#=Y=Gy@ zsb4L$DDcSxu~Pz$)vqIM^VO(hq_?NF9piwt9H!9O!cwYjf<|EPLqhXNvda8Un#X_`2c9^yerQG?X;f>F=b?I>?>b0+iEcn-| z#v7PYv^NL#S-BwA0iLIV4H)zYbM1#=S3n$sukjymh2;^ocCXo z1z9Zi!6pk|TyN|KJsW{kbwJS%ytVk`V|{=^GKbGLIyN_T&`Uk2Zxu**zeOBgogj|` zEl%4>lP7g4p~VJ`hoAYs@ef!hI4YJ^GViEFRXXa#byt;|nw3~-t294P+h!bDyPMKn zqyH{4hc(ytDL6C&mtl|wXqOa+YK1jV8ukqgdG2e`2FwFVOFhT}hd&lEd4K~+-Y*|> zNU#{_;W-(plEA<XCj>opA<%R&A!)dWFBS5sGt~8BC_fDFv zDg>r6_JbwCKNyIvKem|UzRI$1jnM93A{+6e5IP7Q#3l_HRL@?*Kn$HW`?MO1T!Mts z1RPsrxp~NuN6z*#|2%X5AYZOIpM5Mn-ehL0D+$fBJbR(Q$~f@M4H8c`B{`}n3 zApxulO5Rxc;Q=2E>QQYpYm=JC}mGWd}&w@Z;{O3VRr z%ze2l((;UVGVtCdLp^Ew&n0bWzwBPHeRQLlat?*5?-AxoEmy8*a+VR(tFMv4g!Q}? 
zSM|m-A<117kc_F4H};yP_1Wc*mct%Ovs$p3go9n*)5!o8 zM65=py;youm`q^rxIM&SP&9UDuq@I}ZhS&s7aFz+h{3S~RnfuTQu|Fufv@03eG9Wv zMywbn;5GLTZQ$aiXM1?UxU|sDeCqUSOy}BAX3HQWFqPAtx)Tg;i|h5BTp6&kQ^bQ` zIdNH;^>o<^c0^+&%&wmm{o_yAvHMm}Cn1-U>}cOBh&|+f)VZKB4ql|uSv$%0jsS%9 zfA0(mzNVc>KlIyaGpwDgd1QbT8)vV5GKO23AgQU4sTm`czxqZMx>)w?h|MsG@!_qP zxHLGxK(7LxSy#}TB@*#{FJSs84}kYzD*SGnE=A2u}w zi3v#-KP&$q_Xlk;C5qo&vkE+MKOw10+NM^ymk*XzYm>G|a3fTxv4ScdBF@izQemxa z$X<=|qc^1K=}ZP1*bN*W#WH;8c4$kX3q==1?U7kz!&+&(sSOVE1AymXr-#jwRe63I zhF_Zi%^6mY3`q7`6pI<&czNoK8W)QQ_o)^L7F|m}$-8q!B@D^NC+Jyy&|qRG|BZ*N zt-hSLNKy?exPzk$2<(siFDXQuu4m^iri43Gi9eam*HxNJC9|SX<*uf8K?!fSAus|m-s3lKj%v9@2-3-k z##h;N%9NKb8qb1~&u526k>D`}mG>mr75T25J|%lzdZR@svd}gUG8Q4Qx12=k@>}E+&^?}LEw?a z3`I0n;VWd^a&=6q6M=~U5jWFc?47=P^ zc4g#!TS36^Ns)4z+bg9JX-qMNSwFKH(Un?dnZ3pFNR<94-gUW)|GL=|(cK3}(K0Fq zcmrDSH@4MCe`sbpN@7YyuU(KVC0|CO>al> zW+!PF3D;Y)Xn`P&k2Q4eO1|6hNyOE06b0^2M7GYiYQk-J`jC3aI4If5@0bkhIm;tR z{VqE6>~opali->#&?$~1sl1gn+ltneB852#sj#IuELP>8nL!zX)0vNw!WnB*PZ41A zg(u7tF&IcgFZK_jYNQi2VDE=93T7br0BNmhY7K?!Sw zjJVaO!QKBYZyM7DNvJI)=%!bm6%uZPcY)oR;~-eceukvtRvBFI-YCu{r)>S>b8CGZ z=T^S7OWBX~*^4mZbY=MMDtdM4hSxk5gn3!OJ^~>tmwsuZcoJG4R7D0Ob6RDInI7Zf ze>YHFH9@02a5vYkx4XDzsm92%c|OE$WUfYfMScCb?Ak3^!*J*I$?VrS}wrW2{O7 zTF^E_+p5{bhYD3mj~09jc@cCP-5jSi5w{q}5-}fAqQ+qw-Ggm&#RrM;PR9h%)$z_W zKxc1T$}Fzb)@=;Y^?i=z2WWJ5tr@^o3wm7zkIJsa$*LPZczpQ16zA;35+!5$?t&>g z*=`EA5+9+0A0Wg@(=Rq%w45Pau>ko1&mTs*r(xJwlTyiM_&IRncVaE70d&H2^KiFnak0Y%#IQOY*au5R_W}y8JgR3vtC5abD_e zp&i^9n&IopA@P1&0C>JVEstUs=01rvr^Kkd!Hk3y#wN6i|6OUeqh9Fuv)MV-hjhWp zhKUxqsXP)lj3zt zMyJBv$30X+?0lRk(PfphbRK@gIEV~(-kKKgoJb>tjS$M1SzwU%PG1|fUy0Rx0DSVX zy22G&o{~5UVKUgM0+qRy*Q2srJQ3N~Sb3hzvcO*+Y`oa->iBW=IimUuqiKGUqJK2L5NXZhyv)B)q!8L9<_JRWVxFs*0~_x?eIK=M3|^0CsMSNfwh) z4slCmn{7rN6C8`j#k(xZ7l3W=iExA4xhxaPi4vRmaQH6fp7g>=*{w%LQ?t_aZlZNX z3>0?E&EPpFf2)7;WV0D8`cS0iBA%06glk*ctKx26JLW*Y);W4|qaFg3$h60}9-b~c zY+HF6OG`zw#)_|ley6YC___Yp5P-Z~`S$(AsAKvxd8vw6<1;<&+hlg@$~CL8jySWKz86W&tU&H4B_ 
z6LzpDqO@S=p;(F;vEKC1A$)yV(N{3bRwH9S8|KwQ@=81>#?QcgH-I;&p-i;K#aCZP zd)>VQ)?}m^zBN@iRJBIv8BH>m_92$y!s?tFpH1f&)EjA@f+bi*b{uh%9{B z@AGVWh($0#^v4^%u4_RnCyhVZc5oi(anlkSp~FYqp^Ok^?mHsb(UE#D*=KRq1%L~hq7_NFAEw{ z!Pnl!*3KI5jE?Vk6{l-Z1(2V@3s?M!I{4VrCzz5mVSY~u z7vwIKTHL48cWlngUFFyllCjy334)wdUkRWoUW%h-l38{ZxPG9&ocbnHSSS^S`@DTC)T1xlgid9+T_-#=-0VUwOv z^co1qY&_yxN1(!8~dS3UI0vX-60Ag7Lr>zKLBmCKB<*Ca8|sJJnm3Rn>og_h|7=!*bQR1$Imj|f z#jn*WoZv-jlM0)|^{@Y3q&;LMa6L!cSRAzuJX9ESx#W4|jP*MX>Pb}>3ap@;B zqm3O5NCHP!wG?%LYJ1!oU(QH?m%4pP&Kuw2Rr63&J2jPr8jhVR67P2HIeId}(P?x4 zr!bg%(iYc+r}QSw!s>ejlWYk_%${7zeU(LVe47x zRu!c|;D54!jguRp-)`WD?%=G08TTs}!v&Zp;9)D=L<+)`XPjQZIiW`e@6K4@t{t+QRM(m@F7d{Lt zhLcz*_}iO`&Rih~qsTnpk+?AVTD;R~Q@W3$(@@h0M2Zz!XkKA}qoAQY)c_khmzaIC zdt1J+gQilLNhSyAcmBSqg6;-FK6O1aoH1}Y%GL?Q+p5Zmk`+F=W}af#RaXb^Ar)0lBDE0{6Mm*d>yNxijOEqzWLC3Pa8 zB}5>~x87jA43EqXJL}ls72JKZ>};Fp#z#^|wU6baILE-hi!xSG)6V_Gsrrc=@Br1E zk35;ucS8utm6++eS|1x^4(%rDr1UkGu+{QzClV~rX%vnQN@y2crY+0}wO z29=t6446<@)eh4mEhhY^%Q|~<(|JEH6{R^{xJFYJSLNX$&NcZp##k!UN{EmqN)B+F z5}5M)Re5cxVugm)9+RtVxXalK1}Ei%DwOjirkW5T<$IzKafJlR&W@Mvgxs-TxIIV< z0PESo3M$BOma%Gw_otWcdz6%$g*tG}6V)TB7-v zP7Hz>d+B_q zG_;2%%Nb4dTg)y43d!GT6F7RHW%&#aHT_5>Z7e3N{YKCt@PN z{MUJA4T-CJ7UGg0ps_gfxk&#-QtUVx}SAE*izVKm$1S0XlKTN^PkB9HE zVRo$>?vUd+#LwB?XyX!7N?2h}FgRZ1t`j~Yat%CqBPp(QDlU1Fkz^a2?N-iNV6L%- z?A>VNt~}tKi$liGy=a!PO=_Unyku|6Z6=aK2$kl=51v7T`f1K=O5vq>A(22ajpC}8 zO=kAd;$AFVwDJ8dxQeq!B4A9FfTBVEVV0@#+%$BG&~%+1eMSFw2qZI8z6h7C-|DUC zgxN&ZLvx4X&Wv{$r5asR5=sIB~s z;k@aR%OJ^BWKxD#RqIWr5FB92-{nPeFIM9tz|?DM4tnbg$Ee6waqyP5zsbp9265hQ zQTu!T-OZFUf<`-;cOjhz*Jr#hCNhQ6LcCmif3`MpO-R%Z30#$(-9FA@Zdwxly>u?X z^An=Yc-)Hu{Fr)SX^fOFuUhV7ZDan_UwC3Sjv68$Q?KmkhE>==331+)2=jcGA|~ce zs>o)v)B|n>`GXEgoI#o)lDlJWVU`W>)B@fw5Rr$xvOCslTqMkhs&t~qanc-Gj*)vhV@*W2?h&~NAN?a- zTN9RUql{5=E_Qx18gESlQiU6+uCfAR`H4h)dJaYiwH4<0Dq;8 z8Ln`bs_01Zzx^z=79b}c|5IAVA`a?)t|Yg`Q=vJrHLjzC5>qo?f@kfLtAW`TVfY&J zN{RpSO2Np$?R}_4K<4`HC_Cow_MYBto6u^q=r6rx!F8J%7}{)e)GGo!6vk;#To6pA3Q;bXOu 
zMv~xzH>>}f2GCw;+TT80J+Wr5mz0Gzl^h%;)NuFUGZ>n&yK5QU^VYUNCfP^+s}z`D zlB7-=vuyCy9WF|QK~j(y``oO2`BEj+e5xJb`mYLjZ;(nhi{#*vU7F_X+R^BR=fkUr zhkGML{ep3UR#2PbdENkoY|-HcJwP%5nK#=j6?l~^-fZQX(EpM*!5{Vgab12*^_|xH zLup~kjOHq^G^0e8Aj0E9g9P6Zo3MhMqrk{vY(;N3I7y%lT0d_@64GS;wceUV%;_J@ zHUN{S1_SB9=Fl0svxWOW6JKY!Wj8#tXh$@fQZ1_?Im~`)Iz2cCVgq*+c2r=V6Ks{1 z=pCPSB+9NZA`;JfLxYcKi4k2O*9olL)!d9vb2dYEKMZe8K5~bt4J}g(CYuqQXi$vC z$LpbW*eL$$bJUTv?RtdyFUZ4BT%6F);@!g)VBA$+*T-P|Qy8O+#8UZnB4j1tpIRLJzlVK%?qxg^Zx<5TObe z9+=*qO{>UBcOT(DdwmCT7pI_sH?z?<<661=hae=F)I1t~LKAg5de7nZ!-R}z@_c`@ zd15pY4_HMzF^goT@tuH7(lPz@MJWAz+Fsel;z49r+-z_S=-G|*= zCx~J=LkcsvJzywuX|WrYl{?%kef-TKt6g8zp96JkBY&SBg4YFG+`kB`qN}JnKz1&L z6;cpA`AsVW^SfnF5V)>hhg`c#HqasU+a5Io-FJQnc%$iPU{1k(7pKLE-5cgrj6s*C z1^L+pt(<(+^<-}!KeoW39Loc3Rk2xoach({TN*0OFi)IXR;1VauIocUl3nybZQgY} zd4+%Ek;D2I1nNK3|8>9CnhtP96!8jeM3L(Y1`v4-x_K1Nfrt>f2IPG+6bJN2mfI?^7j~F?Qw*mL#3W4TQDK@X`KeXs+pzTWCC3wnx%(>86@tAcUJ52*8Vq;7R3h$&c zNXoAE*ogL}x0M;3AG4zDKfCoty04E-E#6zM3z)R-W*Ro=PEdh1kK~YRCxx1E$9R~M zs_h7*x(|IsT_I)3s22Qv;MZig07pQ$zuJtS7ifdmrgubkQ|{H}OVEfGZwQjepCCyX z{J;#t-X?WxdMu(fW(C^U{}C|fMxAq@4ELg9{G)hjp)91`nmMhj6ymr%9RW5fwYus4Gp?T&6wtt`2w4hGg~4 zE6(FHiu-Yy##vBGq*m`KwDd}G(OK+WMkDF97)(R^uT-Unmud7F!(f5tn?CtRNdtB$ z6IW2_d^Sh>8cdBOu z6tZPs#B-K-qy8G&tsH0&O$G*WnaBE|U~$b12Pd|2_daFQAy=cW{x#3u#UWq;+F#9X z!2J@}a~=UVVl64>=ABBVFoi}O&J%S_H4+3<_(4on+Y8*zv9*a!;budL~(G(qEc8bAlcKfEnQ7E+F_XUj5ua%Q-2=Og|5x)F33DF>(?g;)@Uv$n z%FiYd8`gMCR5_i`SU`5ew+jp0AXykT3yB+@>UR0VpBHS!XoXr3^&~i*+;MQ(up}5v zQkd|w_}_Wz8{paK8NO62T0Swx)%xzkMc|Hos*L6MbW)hlP$+UMRRS|#QagjkxP3oQ z35Q|bcdI=>IQT`zBK71WCVp$rUE;37aaMO#MjOoP+A>Iby6gFyRQUJ8S z1fKaZSjjNRUBr{WW(V0*EPr=t^9+}wZ?Q9eZ?2I0X7f{5cV9YCrie>su5P@~YfHPI z(|sthhpDkFuL}f-E(5+@_h2c^B`+dQ=!1{I{ zZveVhu@o-`Z-mx@XDk>|j2ceD*$}Pet1~)i_i1__(>LLJ@lvkS%{0CzZSVB1heJdL z`2}&Tv3wYItGJlHKtPUcNLYqn5Ns$o#ND7O2p4mubv#t^t(%z!>w{e22Fnkh+rg(; zKVjoX9=)Xl`%vyso-YzD+j^K=uhxDj3!RcdrqA&hO9!NO1wjkBwdArDjX;IGjt(g+ zkceErV6$eKh2yU_OsWRp1G${AWovwo!KiiHoUA|)A#GIuc_`&g_*vV0W 
zj$}#h%pZD|ew=Ls6;s^$?|p}3=E6}Iv@t$!%u+G(M9>OQxr#KBMc)rPI5!BHb?Fzj zQfII+UXL2bu=eeNnOTQv=%U8TBxT0d>OCLozwo%M$x2tElnX~I1d#vqBxN%QsxZ}_ z+7<(Y3Ik>Q=<8;-O*pWdLIGamfpmXJiX45So1vUj|6oPRIP{HD8<~c9DbF9SO&qe; zp7hKTu)cM?J4F}TtD6rIz}ms@#rSN+$lh;LnFPu67K1pT-U8WS(7gca3)cY4+`C}F zaaDAJXem4^*mxGLwd%d?NL^qv=QN}F02HOQ<-NS}(xdR56aYv7+8c?tigZg2tR>v9 zQ?76Y_&ct-T#Mh(72~W6u^!*)1uQ(5Zzsfywknm+&HPNpomCtfl+klg&c8Jy>f0cZ zZM|S`0v=RK+Xiz1QUs!7rD%x>_{JPgq1-*MV-wH!9rhwFPT?KMT!(aj+sw}Z==H?D zD(15XEDKGmE_v{l#1`kboqPvoHu@%k;X9Z};RMEB$J$$BEw|LI&^;m|YPG{wSv5a~ zD$nrke%Ga5h0>7pInDr2byj)yl*>TA8jvsN(_~Hq=sdu=#J)amz z*aAkKqPggG?niAfLEw`ywc-o=()LVKDST~L3Q7u=HQ<{gEJ+SqxHB(^wJu^I8h|&2 zwKM>=fl#)8eq7M_fDx~|Uyj}!JGSgfZ>nx?R+zf;`89nwyJizRvXXnD8dAK{~Y^-5M)i8q|DUi!hF;g9+Tc!|VD zvwE#NwkM3E zo}0BcfF*SG3h1wYQ&+}CQOdJJwIOx_kkk-}z|HK241L{m1bp=ncB~Cv>=UorVrl(w?}8N}ek#p8J>!8&!yT+e>#cGMMJA%*5X;mt&TcWPbQNIH_Gd z6iN96BQ$2}$>^HI)_X~4o;WZdw-XAm(RD`ph%AjVP^T3wfSk}>SqP8JznLNiR{hf5xk3LLqD0kW!E9&wg<$gzq zB7zyrN3mp&LEhL;aR`>0-S`Q=aWg2*=J-^XOs3f=N^D>u);>@O^u1w4fVDayN3iFf z;zg2RFao-gR+@-H`L8u5NnWUW;B<9}g@+|F)obK983>OA{m(}fVauMy03_|5Izr1v zv%bYR!dVhT#WrO7!fI{*2yteG-ex}qZSwp}nv3YkkaW?ij8UHzjv|R38JK@PM}w1Dft~#5H( zeOkM3Cc`7CIcn|^7!j+3$}Hn1mA^I6GQl0&AOc&=aR+vGj7fdA5~A04yZ`g9h%*C70!tl8 z0LPO0CRNxG-7#1)siQf9OlxbtXt>jdj_$rUC@^68Gl%mTZ&?Q{6u~s?doTB7UcRS{ z=k6$$ETLS&-3i7EGRg(s`SrNt#u7r!3=5ir3PB||9WqHlte+sZcPMx7+R76H!8Z>zfTwA2H zP?7RL*hq$th`F(l{F_!YmI3}Ms|eLxicMhA7+y#|nnPzjcT1>9qKnnI{O;BINA|23 zKbwLp?M)uF#V|RyK3}l@?ry9hBTU#-H0}(!y8UbKW zxe{Bi@?x-dyi(BTG;KBVahUwws#k^zI?5)%6l3OdqWH*0Lh!v4ZBiJ}DT1-p#5Rs? 
z{d#C8As+Hdz0IR1!NfYDZPv49In)7HK3wR`6T8JQ#^EuvB28OzxZ;vR5*Ca_;JnF@ zO09X1R~#{tOJjWsw9b{etO1LALr1W)Wcav0fv)bJzs{9Y-rva0+}(+IlTCYqz%6}Q zPWhJqZ`~Ou(b4#4qssEHINA@chFf&28)Rn5fxcRvV)KqdIHGIQmIRj`jrJ`1{VaZ8 zcCXZM8v&h(aiEXjis#yU7EMWKmDYMbTXMF}7~d4`szhVQgHaD#{WEO>Wg}s(4Tb=Q z?zJ7H1Oh~)lW1*A)*S$ej6UlRa?6b1q{a<49hJ256k_ws*; zl6KxEblJq`Q$KBse_wRqS}85by4p1ohAz3zAJU#Bu?ag9arVmFdO(^2drL!s@OxgF z`g!{Q-H&i11xjOAxeJHE39vL!UugBVU?tRWWDXK_U4hJTsvtB8a$g0f&rQV9wT5C+ znV916WaH2wDQa7He^GP{NV08gJFwP?FIG}#fO@eAn=y(mcy~>_`b3`!r588xYEi$R z)NZ8MrMNGOgYfaa?t(~J+7KvUUL4ojJgnXs87XD?TE1^PCy5b`wGyt^_K!IA0EBoS zcvxp|dldhO6mE8P>i=&U)Q4z|G?YKZypdJmKnkb!`|Cx_$n0r~SHk7^r=+W$ffEDy zR%y&U5G=bZFU3krb4)n|+CRp3{fc5koZWa=9f(e+DJhhP=qBrz%!Y`JWzPtfB*jO7 zDmyB&L>zfnHL5}m?qd_9f8Dw}5cjcZ0|Gm+Lz;{^e{%H)&uH|e_=ijZnYa9f7$vdq ze;4wHoxKe{y3Fs5>R)XK^QsJ5(|9JVNaJeZ+guO7<{z$z?U7XhLH0DX6y-ipnK0-2 zjn!v|YP&23=w^Tb-QxG`i59i8M7l7%VqX6DL2oM=?`#p%cc`~G;=Y9jKzelqEjn^k zrQEe~^p6vna5CRs9`hbe@uR|0fL8v?ro9D&6H?VEqu{lPzE{;yF^pY!DmR=F1yVz; zs**DD_Tc)qwevTvIY{lFG%{JRME|4)N>37)Ee|WAn+rbs!lO6-LowGK%-T#u?lJv! 
z^s3N2g8J%JG9g%+-!)YAkbeoEGIv86kdaxy0}38%X}%|Z?~DE;MjU8>wG#q!_X&8M z1VBlk-CUQXHBNf^cE9Gh=UOH&<~F*vIsss(sJrLQdls)XygBmWN~n7{%mJk{iOR0^ zxZAf*rCb_E=_N$!DOy`b-oWl4Jee~W<*s|hEk3Phol8KF+Jd}j7Fmw<088%p;5v*= zXOONvW_Xoe9ydZ01_9udgNLyoXGD!J$5TDIms?B<>nq)1b zzA`=OpPs~UMN9Ota~(jeHucj8Z+04%>;4p&j2*>}04=k#YeHSnA5^lAW+*|x%Rm+a zAVNNDPgZyj?pG$X*2GVi4{F($d!a+92-TSoR zb%Y>`7IE5=!h&^KvN8QD>$@XU7%|U;qVthOt9U%XHiO%AD2b<9XdpBtUC+5R3EzSs ziY;3m8iKlLL6G?+EmqD&hLjwMo)4{oYav_Qq=N63z~9qO^RSAE|C#}&RX#pII2h?U zPa%5F?!wC3R8g5EnHK4hd^96F%2YErIb}d7fLfaW3C&bu0VII!z=wo9zTO#$u6xT} z53Q^Nb@H1gJhiWMZ9pkMN%DOsCO4qo;;>W|ASi$8v9@>WYSZm1Qq5@&`(Iq3b(gmI zfbTb7p-}|}g&u9@Xl&(BM@R{)A{9&6=wPqjjOT|K%rs$qr&vsFK?8%2ckD~Do!tUL zoFtLMZ5^C)(v+j(quW+_$S7`iDOwxGr9~bNY~I8kfIa&a)=5}uAe1K3pvy-N9eNV% z%D8n)NVp2B=2yy)sb+2=l-YUkoJ+O^@rb-(MWc1ZkXYZuo@A?2gIrdTS22KvHPo-UELfk%SXxflN;UI>cYEsiM==VZl`e@ zoA!#)ie?$BcLRg7Ua6(RLrPffkb7$K6k*1&Ueei?JZ0vF5C%K!{Q?oUJfD;7Q9tI| zp}J2JmtK0)2Ld$u(mA8eigLTPJuuT-l2$rwYPVlkfSeOasQCdzGcTx?v2YL&E=4I{ zEXhPD%Q_6Z#%ff(m=bRvFYy{J{rHc$n z-vWD8Gz6O$L<-W9LX*=5L2A6oj5YiCO)++juiqT4vChp5x=17HJ0srD!y%r!;=gyL zN_!3m!G&FFUke4v>I_3b0e30$o|c0*(w^(H5$rbx6b~_!_mQE1elZKP0Of;z#OD9| zdWg^D{P%wyu6iD8;_Byr2uF2tcVe-BBs8~4CRaYJ|Ig=e62-z&kfBekNYpaYHN#cJ z8T33MB=QK1bHw`7(qlikAG_mlYu-we4J?t)(AjQJql92$j*$hY+03Iu)r90_kV5?c z`j3nG#b|70>OiDaDRGzN`0Su204Qg&LjRUdxqepy2pXsT$RpE&-cBf3%M$tv_@v2_!?Orp=9A=a zNvsZIc&V759w(+=py2MrelpTi?GGM2x~^0njh*U?koB9SwR18*T1T&RXR{9NzhK7< zAS>$vYL&PLq*l6+C4UI2swV$ z?k$1etYkf(Ut+H-aM_bC)Nsa~gl^c4jTf&d6l6XS#&3#An&t<&V>!qKGSMVC8Vl0Cvyia(|Ez^fb&ZRjovKcmw#siY;5 zwGCg%f7=?R>7|1y_)TXqi}M2N%X=^B4!Su6`< zKWAGO2gL85VCWA0vt2E*UPs_CLn{Wc$yP9gBese(5$hO{#?TNDxR$(LFiYDEz>VRW z3f5!G+L(@%tXA_UP5HZe`>IG;DmPvP1Dhz-+QzuRyM}9~b^w6nR%6ca48EI4D*3wV zNOQ%Slm1%d&S`F!8M2ph?9d}%@|KykiO)+4NTiJ0`Sl}2?;aXm?sR{Bymq@+1*S9T z7xH0xmk?LLIkVH#Za` zbl<}pZ&M~_*B#Mxr$nyI#!dTwvgoa>g?nWD&>Tz%R4^7b^pgQzNpM5>Kj-jdC)BjX z`{HqkJFTmaDyYrd^r&g~F8A$%RWMoiJ$1sN;L-SfT=iS5)_Cg>b1vMA@=sdlbkRaZ 
zL;;vz7PVe$f~qryOcAg0Jxs`8w(6Q7zW8d1zgMI!U@w~h9VJjk1o)5ZEO3J< zl)K6c`S(9>n)uRFzkvxhH(t9!^Mr)OC#sBoRoA!Au5qE%#f}EzdO5Zhe%aqR2*53F z4tpO~&4?uiGgvqqz4tYY%=QpFr$&c$^S&RglCJ_dNFURr-2Pu~{Li02)(?oW3DGF+ z4GI&4p-6n%yZ~-RJ9eJeEPP6_D%hb;ANc`tGmGnQ~gWv;@tsXEX>r#A1j$y5N1d)Va#Cr0Aqx1`6| zw}mi}e`T$@W729{u~*OGR_WLeo3_gM&Hnn7R2A5<7tuKDqMxP~%EK`8ovzp{($Hx9 zQsylaa4}LAiZF7OV>ii1nd>XGQlA9YrcCQv7N(7MI;F#w>MfIr=;4-x++Pw=0VBe5 zQsAOL6{o7$aG%{Ak=*Uqu+b%4N{`pbA|Zf9&<=@~^Ta0wupbm^!fGOySF^N<60JN> zjl=4XiOy|1wC6r7)Zl_q008MlKIx`Gi95#Dr8%n{r{IZTq?28u+S$^qeo>FBa>s_# zs8qEM$y#n-MAo-`7xbC;=I?=n^RSD+ocqFNI^cH0C!E$s3r|#7_R&8*&C?{#0rjZn2AGR|elFj3RXZL(fx$(yD>q*K zjVh0b9><#1&_M0rGL1P1=o;wzjv>OwM3FMP85|4noBut z3nl%#fo*bxiooKWU~g#-`dpWTw@ge!p`n&rXWgi-pZnCC=xB>=?O9o#_}V~uH0SvL z_Tz#iNQd^*-Hz(kC>^a$f&XLislG)w50jLsn(~%(SsQ7=$<#1%&5-0Qwn{ocCnO~4 zDfr#H3}nV6|JdTTZNB+jYWZK2$m-O~Z+;^GoJa1CHX4pg>=Zk-ff(5Tq=V9G-$5c} zi<{vIo1)734bFtZ2@F8Yz`dgm5xu3`6?Q03DksMXrp6}*no61K5dYr0S>0%(kxOrA zla5~~qzpLjGp{pWRo52WgjCdsos{`vVnEIEb{+3>n0L2GSQsaFTpv~(L5S}9zM~DI z+7H?+8hvW32hjg~!pnSij6c63$KT|5v5vY&;%l42FuEHtWd!!Tb0D7uEWa`A+=84a z*jfZ~%bdstVASTcHS7KxPAD$kmBzkC{ZywqUl)2`8AQ1RV$>c$BHP>L3JB@HFLC88 zOL|}e>MNSkHe8F=UfjOdaITn+E7Xa>njgnt$cUT0O)j005k!pap)hta@Xz5mjAVMl z{3Jz4Z*iWg;Wvx11Q;clD9;W{Dt$@I-Qp#YWm)suSU0SAT0VjRkVPW|6dR8Ni|0`I zh;J6g4ksP5xX>2J)63J$x8jN(^YsC%8v4epDK zCpm7xlC5aqRb{&|9enW^kxku}vX*{pN}GCD2$zI*76uA+5EvUCZ1ZN}(vx7dclK+f zYU=Mb1AN9;bjo4wfNZmf9hD`qn>(&@!Lqy}C~!Yo`*+#aGTSjMT!ep)+OB!clB$;u zI*P|kFCD{EFi@cl7JG#)%v-b{ry}ubvt#Hd!!06*f{LNxLwSFEiei8v7Bkrk2#Puv zOO~xgTa;iOrXT;_O$=#Y3ST&ZcVt@7W8522{T+G!#d;uBMvsvOARW$93avtlrj7>W z#r=y{PYRVMB5zv&J7*@K?Dg7h5UfisM^+hwK`IHi$}>k>KzaPkoRFJI+M(Z_hJXWz zOkxbpZn0_-(J(;A6#}v_h{g+9zd}@Up13o%QsW0m&vyG;JONhopz23$iI8}nhfyvp z*SP-;J1!3VST$W*Yw5grJBD`DvcN7K53;iC4QIIUTN4;d&NKGlq^^@b!5Z0-?Bp>W zyA_y14{x7X?xT-wqBHzQ0P^^895L^8DtLVWRnNPeho116;%buTCfDMYEz|5wrDOwl z*S7IN&Zp>2snT)SY&q-EEQ3*LdxL?`$df+DJ`dw!{5{CUeYK&6{A0V|^lj2$<;rjJ 
zQy!lK#nm8_Raf6j%;R#dVCi01)b2m)VU=oC_J96oK;$02xV*0py2}@`m)VoOC_!c4#P?hbApjiI4g%I5DjiC{?dB#S6zaV&Ij~ zwiaRKC4#{J>6Nk`m*$_vR1dQb2+ufIZaycb8rM%*G@R+Cg^tYBe)K`Z&@hFVgr`?^#)@3U8MPHih0ssEnjq)m53~LSk=VMCX5Gl+eJw zZ?lb_b4WS}>T2W3`jj)C}q!AOubj(nY6X%TJWJ zx?eI2r6s3(8o|RT1}ZG2hB3u7>WUI+Z!0vo(hUgal=HQ>xh#CJ+rUY{fj#8S4Ozk2 zZT+E8`cgfaM(kt^H-4Y^>`BblNT@glrk_b_t_H3Sq&z&dh!pC0k?#Sw&V7sUEa&O< zRw-=B#sPsparF7UEt!X}q>46Y5w9W?N89Y5Ca>KS9;Dv?W65?j%^Uo~rF8#Wc$S3H zbU^jq6T}~4iejq5_z^|7!}-h~P+JQX>z2I_X`pu+Bu=!;`JrDk=HWQr54mT5UIReJad8Ni5!dlA*)&CI*G90~z8N6JP zg*?uVls?iAWg-5ouhNNQsL7+4kUTAf)$8A|pU8&0H1QqtfE05o+IUF?b#@|=s z0WIu!QFda1&t+if@mc6A z$r!)j!+_Y+yiEFhn_-&zN4M}b#%oEP&RO*J27v# zn~)Oj<6u;Qz`c9V)BRS8fkkoG0F6r-UcU-;e#cSPoqk64_Hly;S-*8)fezyB78-%N+TAa2+(_@CHEoY8jicdc1ON-M(5suTXs(y=5KyeqZ_p-~VFc>{-0 zd}GiJzjCA#%BmWvfwkIq09GmL|I&-kD?3Od+XkS6g@2d34j6DRY=;WI?boum7aGBc z$L?L{d`H;5FK*TMD3Xj?>V8`r{5rr2X$o%s2UA#wq5 zr`XAP8mqs=E{4S}k%m$$9-9<>P|mhdqaMw3#g^F^><~%T0?p_={K=)?`amJ`iIG^j z2zEpSGbX;qBI#&Hw4&U8iD`jWDjlTl+}ZCF*nGP%rB>ston#UIPN1bdt6h_)GnLj{ zQuUmksoRJcm^pSGqEJcw_oGB&!KJl4^rQF6sNa?p*=KKEg;RV|=DW76z)+!;bQGIc ztyokN3Y2D3vP;ZZF7)67aejH!Pg0+}Z8LPp1)pdMyJ>!gV2C~O)y1^-1owX}(9r0H zBoB$i-pFgN0dy8sh1Tqe%?MNIjUsIw=bUXHfn_o{I;UCzhIbopU1$6 z7q^gJpMk7GQNm-)i0I&*10;OfVg?1pTC4XSqQUs_eCd8hk;TOgaZ1xA(R=lR6c+~I zlkA>YrQV@iALJMrg`@0loZUf!Wb;EQO z2K@)|N3!)``UkZ0RQ=nk=UC*wv(kI#x^bRZ%cgV~no`gk<@ z=n%S)9NK|Lg_TR7zt`A8+;^)RP-Tol$1{+Wgy^+bANm|znaydf+b$Is*;d}s9pxSyc=xX~J2Gab{*#|nWTYGl zykKLu%F*Qvhg^e$mey^`dV65TXPLCdgo_>mO7nLcZAdIc5IlrYyYk2ACCHc@o=y!< zA`jd(%{L{WV~Y~#f6G16FG_;-#IhcV$G9U4xDX|PBP3b%MtrCD6x~xcl>eD^_F6R+ zXdW}T7qL;cfFH&!WObX(i>O<=1?}%orlQjX9paafS(C0lci7h=>UNX zLM?n&xn2lC9Woo#bc=&0X;H|0Xcp_!{4*$mfgK={Ag(QJo^mP(qW7`Ilo`Z!>ZqLP zv|1>ZY>_ziHO=ngo;sOqsWnSxp#|a^?Htk9?4xlas2+g7)UDBUlF%?FlK0Hqz6-HKdP&={?uqGkss#j?VE)A`{^ltdin@69GDiF`=H>;;kEW9htbSAcPN zgUTb{j&DdN(lt>NjkUkXUucyM#!5lk#70j%ow#J3LG8?`R%c}GI`(aOD!SkgGZd)d zZ5w?*=@+M}Ze2H}w#2mV>+6*W&~HG4amcL|RGH7U`-O$)Z7ax5f(7FiY(~63P>m}# 
z3-RB18+N#ii1j8Ni3edkK3UZZoI4%;TJrXO0w^1Vyw{y=nt{&>#o$hzUz#zH0kV#` zs0z;N-OOOKgm*FYEC+i}L+w`S3Fe&9jK1<9ZpU%;;{#-52qMsguYEBWo(Fo7HC)um znTaaIpoya7H}o{A=nD?y=YXa#m7PB^OGCDd*|G>$(8VRk72P?zvr`Vw-V|w6TP>5@ zsOpC!ScI5$JQ6Nwx~H5k;}%a=iizu{c=D9nSfxoGG9XShJ@tL{G|-vd?i9PFgVwtDV~k8n?@D4D%d)k zk73dvw{5mF&k+qK(2|a8`tmou%S)0OMZ4;_NJ6{RI#>`|Dr<~{aW;8o?&XQ~KbV4D zeShi^5~{Y_00v2@Xa$fvG~S}0FhtYzO6 zW}vCY7!uCugL4i@Z(P3fnq}4%=xQvBUUT>ppSrG+Xv^ zqONZscu|x)#1Gv4d1hyl1Ra0_9)C*!X7Do4hNGLsPP|W$-67Ln&8hS! zzZP%edl`Lflr)r@OzA}; zc-N3s6SHgVGN4nro{7Kuv0Pie{3y4r=`!Fcb=7a86MPx5o6D$M02K_4Bm(^Q*c2LI zsoYdwQYlRITEoTyycMYWAIAp!Tey*nayGgVZvXTK17CD>*@b-~c?4YzB74YZoh>k$ zM8*RrYK9|pfwxyly;aU`d|eSY-6%A;l_=Y&Ap=P0W?22-2WhC^c4U|Pg30wB!h zQs5MOOa}@;w8&r;S!3GneKA>PP4XI|Wx5uzWb*wYn!V;Lv>_fbpYiWF-Ufum_OXM{ zjiO61gh~0?mDGdV9?z~+1U-*AUFvjpZEWdhYM?*LYxIbSSnkq!bgLoph_e)Hz=oMim66#TXd9pGmA|>}%C*%aK*^9^a6KCwh}e zTgP|yDZw`l9*w9v;?hC?bK4$92V{V*=%dFNJN5q(BgI6!c5!+TnPz)Jr}_e6I4H0l zIuoB}1E@&vMUyY_%^W&YvbHyVfUJ>Ua(4nZA}CU8-Mf1n1IU#{vHwWev)`;DVeuj7 z6EUq$!Cd$>a;)Ut^|ent?FWBoZPtEsWl2v)C|?^>EI`Wo&)9Wu+#!kw9i=Ft!-{AW z*|_O|Bs(%_0WS>|^Tt;Nn^I~NfIChiR}(YrDEY=X+FN%HK+?7OPUC*T@tm$4)=c>n zmJ6OJBzT7O8a>~6u2E;91Ivj)idrm6;mgR>LB=D$!q@cMOQhiouxhf?8!h6;|9eCw z0|Ga@2&KFN{fJ(+9@Inj>KT4B-R$vif6Ez8Wk&lvustS0X?}ImqC!&c^J&nza{~}0 z(V^vhr^(c7fIYT?Q?-wzKL?l)<8cDrNdf;$dJYbI&M4lz>0DWa!YMWV$zIbU5-LEO zy#If8$tkvOt3!i|Atj9$%E&_@8Se?&47ciIu1tGV1~P(twQdKWec=jU=dN4Dk@fpFRX4{^zZO?%}^vQXZk7f1RHpxhW=8|5fmgd*vW-X%YB{ulTG z?ynn*s1GyAKXZHG5^mkMv?pe4^q9k78Nw;Z%qHH2jDajZrK$F80DHunSSpD_XU0lK zn!jd>yW!3~X%Prj&Fj#cRUN`251++MGdeaBO=XyWI38tHIndW8%Zl4>~EX;R758nP0|FurvD^X2*ufb@$S-%mUuzD; z2oH!-f07D-^^3Mx-)_9!zrZrFwRh8P(@@}yNa^7!PY3D3C5V;-Whl6ECLo?I*19sC zV4J_H>f9x%^8bZhT%t1D0|UA-Z6H$JaYaTyu!siN^t#%pdMDa&i0bCSUD-g#Xx07u z&h&_`W9;{4P#DKzVmcwq42}b3Vc!Nsv%qULfHL%?f-D|~f!WlHhnF{Il_i6t01%ip zb#R}NhwK$p?yvcs`tFH18{<5x5d>M^U5Dvb$wtU|LD@27L{YDKu~dl$QFgk(9-%7Q zXwNU%ilJ2N9q_4|_9-^#rc_X4F`ztlWCoy7MkaAoOrA%K1E4>VUKDLKF>7P=^4PT$)qK>n 
zBpq}TRHI*WYoq_-?z>w$8*;IWp(`5Kc{ITr1BC#6dT_6%_2lC_H==IJZ(F>srziQO(8n*LC3@TuZz zdxh1nWdR&>>?(s>u}5&dIDzH0bt-1Knc}YJ($3yl$A?KEh22!gRbwpsq%dC~*T_p0 z@toGBa8hKK5csB_N}O~g}somI>7d~hlPm>)h6 zHxXE-namVr|CWGJ83M`MjCQ1KA%1tpM2jz2y?&B82w7^6 zet1AkK>14RW_Mc`x~Di80W+}Vnr8mVb$GROT*7x4JjNslJEs?ZnPaDWq=1+MD=Gvt719sx)Bni>zsK?m)x|?V>vz~{IrNX*^n6M%SJjmI z32vpJ&$TiUfp&dQ8HvFn`e_aVks;ni9NRCAO<%4?e`j)p|0f`T1JI)~&iIUbK>{jg zyH{x#xLh;1O{34g&UQYjtnCWJ=-un;C4aa%Wh@Dm^QTm)gi0QcW0XWjjo$|<4EOd@ zlqs&IqmD{aYlGcd{;7bhEh>51%|X=TNlY9*la0K-cJ1g>^051JyyM~+s>luAW>26| zk$G@Y{$f!}qKvfyRVDpSW5Vl68xKMB?hSy*-aDm5n~1!Spiz5 z?NK4~4M}^$$D#`@wy2l{5fqp$`m#LPIj6Eu6eOxy^W#}3gh~`-IFPKf&ljIwcI=c_ zldVyHXnk26U;_n}S_tYAz@t@0xh%Mt5kYEoJ^p2DoIyL+pZ4S$QhA<5_h@8w2zSpG zy`DWe(W3e;#+Y~@{ne&+MYcTvecTFMZrjDa1q|PG;gya|Tej9ZAU``fv~HG%=INqxZyRQ^{0TdRv5%-=`LDuSt+t^GrIjND~6H z&)dWLVh)fYaU4{X1s`yEHjZirq;m5B|BJ#Gsa}YAgml@N{we6n#`4L%7q9^srzGY1po`pXwtWcekGIKd3a!EznbDL}-i?f-8|XxM3&4#g zKBwQW792!@;@!h#ywGg8IIR=%$U0ktjYc+AjMEi~`jRvb(aA!vgg zAo+0HMWve2v8@ zg^_bZY!ZX}xeKU!2ogOFgpAxyGy*tzo!THDI7%~+8wvogqFh{+m|0(?Rib}(dg1r( zdX55xO4n(ch<*-QKR%FvuTk0tuoczh>RKCvz)S}7DSXF?^cz#0q+v8g<{SY=80vel z^^%8wJlQFFcwVx8?U4(44V3m5VCK z5s;zsz$8)P)QQK>#E)2qeIrvL};rWc5o&Ao|SVMY2-{?u|Vm5)NA>bk?yfIFRbhrHJhMJeZi+W5&`iVLEXF zuOTqaqj6%>0s#C_Wd0Hp-2y(VoHBr>P$pX_?f> zp`Q6KJ+W;~M~JYnndn!26h#=0!J9?Wxz9?6aM<$`Q55Clz<|(xn9SFw%!1nZ&?{-{ z@~6+ys40+go5A5|v0xep$hmevW95M5c9rgpYx@4SE6dsP4|&;*m`6sf8R4t{YJPi8 z^FDXUurhqyAE;;pfL3Rz_a`KmH3?pu>=S#PF+Muo^g5jvW{m zsYb%z`$~b6_A5FsICJWU92FIqMO?hV_xjI^tBLmW8ze5#%bPI(asRATY<)Yuk%0!3 z!|8B7*mr5PuRN{u8(aXLiIZeF(gu~xs7=Rn`yTbETLF-LzH<>1-K{UgDb}a2o~Gjx zPN9>k9Hz`}JE`YjD{;X2^|qTfN~{ zqRi;@qiT01KeqyOVTi{gR8zbW>eyG6qV4YD6XE@vy z4hsz52l~qMK8E+5Jfm5+X?E~lrl1~h9YDx}`;qfu+LJW&g*BbaaEJ*s49bbvVTx0o z>ezLjemID}4pz&IOK~z>G70|HqeEa=_sUAIm7aW6ES?u8*%3vn6V!&Ku*M?~0kt`` z%~A2eO9jK|6oG5xTg!Jrm&(SB3_l!Rc;Tg7D8r13!&+6L3aJ=ky`cjjz~UmL87pE^Md>72W!RZfO`IK$YGRvng0g zPS)U3R4yoQ6fBTT;R7Af5I|a56P-w*%)Ibk{imOU5S%Bz-y$)0k%g1Gj1dJh6c7UX 
z0{KmYu|l>wSKEzyLsVZ(Nx{&72Rl5ztI&n;KjHtH zeRp6m^BF5XpvZMK%pwBYS&39N>Eh>Mien8UC+4~_@{<-`ijU!Y2*OgsLF?oy)M{X7 zI-E8!>WDmEii1E{SjQk|KfNXsxv*;ublH}-9`Ol35|ev`C*$@yeNDz zZ9WEp_2qPf&EROPcj2Tx=(;GLpm)hmx7v$&TPh`@8FdLir@#n%vh5brQ_+pe94G(g zpgJ`i@MkWta_dH48J`|X5==joP=p|%3?c`6ZzZ|xM z7v&DYzhcMD4OFK*g$B8?3Zwa(_mXf`r})O@y61*sa5CK!U3Yr3{saYQ+W;5v)W^s# z0jYUihs78H>iCZB&2F_CQO?fTmx~_v{qt`&$*g_%~nRn^nKZm`% z3Ek;{AS7P7>U!iyY9{SLDy3|oCJTDap`A-DP57dR2MIW?^|)c?YJiCgSVFSQXjFli zA{lZp3u6wlYV&w!zN#jX`JLQ`M7C^*&Y3-Pxk64ye}t^Vk2v}{r;~Cui!>$aqVTn} zHK6+a88;pnx0Eofej6n2a-EL_LRVeeuuD8;!7$7RsRF=BcsLQa-ZI5AzrYQ6M$_94xvCsr<{vW$d@$F(^~JY6w@w3LKfI1{r7YL#=!-;?^Q0LU49BQlJ0tOz zT;2=^Z*bG^AIHKS`QSQhV%f0IHVh4Cdb?h93F}i^Lzi8W4!S?}$lxysG))%*vTx-} zeVn0tV47vwhbT@H0ZI}p3h})$5@C&fFWL|p9UdZKm*Pi04lz^tn)sX4h|LgWtiraF zxY1bnGM!?w43V;Voktk`b?eyk^MmEk0xKaI*j$zF?UG`hnMgr9*!s}7thoGvQqF^r zoFOz6Jt()X6{#D*G6Nqg(gzXaAe7Jh0;JbGoXAfvey?G>`LAj9j5$PZ(5t2KvJm zK6-`+6N07Z_T|BdsEx>*&cE?B`89k36J` z;&$d0!U#icf}UC&%%$8u6~Oxt&N4^jPM?>N30@IE_4M*h!4`r|3zKLSw_vM9%V8vA zgOCdcjfFdFa{2<{ZE-3kg|$0Affyi%c2lf+!}Nwy6RzXwT~aCoGDRgWphjya8FryT zD!end`?HO@JXZF=EeEfr0OUlo`WT(5G)eJoEkAk9Te`~Hjmq9S%hk>#CTpV+(N(&W zWQ|b)EK%ld$R|hmIf;g2XWE+~p**CHB;7xj*$fDTd=#v$f+=tt0!o4p^s0rSkgM7% znXHgb)BE1rKmqM&e*LWQnIfDauuUX5fmEhgUP#hS^Ajq6-uuNSN8Z|3=ZB|VgeFkE z?<>ot3C@{41?CGJ=8`jsx8PIaHFz7^cxD3aVJ$b>T#I#0-V3x<3=&gr@1${xyIGj4WnyqX2ep!TuK zn7@>{)#F2UAOs!5QAJgd4;C*)%QOKxk25K9qdJkZ*L*(90TMM*h_;yVVJ!#@fmgr# zX0afLFpSsjF3jJe0RD!je-%bMG0T2D9f1(B$?ktxtsP&GEsRvgDEvqMYKd@{w_jdn zs1~fy)JBm!5MYgZJFL--uM@6L6X}vd$vqUW0S|{><6zx=8+$uL{uRRExae-0Gq6kH`sKs`<9(B8rXPuRG>zKhv?3x{9U)gUU zdiZ0|q~h_3jdhOlI}35<%iQ`mgbvH;^9TR&X%KMw5zKG?kQKJaAu49}i4~#wl>W+% zlOEWM;ndLTPatJAr+l}@tp+YXJP>m6S%$pDS#sCChAcT7*B3U;PqE{;t2N~vWeLU4 z=@hVq&;9*=K#i8E#+Tma5X^Lk_}iEIm)~FKTc3#qbZKKUCIyh%+QR1HF?Vs5M@42Q zA{;a%K@InxfYeUqUaAV@zVDb(sZM^Uw0h_OsX@ka6bkg@oy6K+YD)#!_5CvYLtJ!& zsx4~Q1XO15CwUo`l7aoup4-yhrL4WSkInvzFu-v7U)1l>gUC!AHLZjdu8J}OwyM&3 z2sN>9`jB?yxMy--?I^7jE(h?Ep8_;n78Nv%N0-pO8Df4~wXN~??h~+g%sDbfuFoYa 
z9O&w&;nyS>H8d(~9THPAd%Zk}1$nRV2PV~mrXv%QT`o-u6e3_?gb!hmrI2?t4t{D6 zo;f8GpNptr#d0&zTFa5PQJaamJsboh<;c-RlJj546sKhp1fSXlXFc%Pb(QamY4}ZEHq9XUEv8lM5+l*uqnHZ3h(r$qs;GbK7vIjlsPEb5RP=Zbnr&9bBt4L4pt^u+>f8 z#jr6^1Zc*Bft@%j54nQi1qiWaS??Uz&F6|$G15_sB_WTLl%?X9@8Dj%o8=uPZ zX-^mmJmJ)FSC|yAX)O32JuP205(nzqiQ_l?!tTj;-~8Ed!nxq5^uB*NHa)_GJND2Y zBwb0l%f4v`#@%O!GRVYG&2y1P8b4>^njwHM`|GI)Te5PF-`;v>Y_S{Hho5R1AWdgt z=`IQ_u6mFk=Ht30CXVjiJ68_f1(4Cv6c3VtX9k|sXj$QVGW1W(JE5pzSuze@(KO~; znM;V2ykZfhlLWv`)KoS77Dx2xP?epc5@u1^8PX*_?=9@O9bpDyzYx(hAhKe+|jD*f2^oW-+Fh2b+ zj3DFSU>G_Wkm8CLDs-`8wgr#Nrh<6iajWA)fE2V!F0!>H1#?Zh!2?C>{+%!X7n(L- zYDvVb4KN2FrW7+lPVR^VTuDA~l0LAy)ZTF@u5bBNyAd#Yy|PL)H=Otjm}A85spc9Cxb`mx?Kns>w=FPYUy6W?KbcY!IRU)#MVG56j9GTSBHE18bm z*(@OJyhH+UWnuF8<#sJo+(o;AX*(r@`Vfmcc3$>HbOYQ<{7An#X*8oBG1|%<0aq)47;U z*Y6nlKF!G89M}?#j}>tZiyMDI1Ic#s%?1)*25_z5yX<<60V;*LNQmXF>w`XeX(ImImvKsEbK%znN*QQG9HAd@ejs4pl->y*-j&E19BbK-L zVJ_cK$?JO-Jl}%+z%7CwvLyMORyB_h&fW)a9sj#ipzXf zsQjYsl~|7UVi0m_tdZZ1Z|!{-fAuuSR-b;R0=M1Gyci>Rnk^|8@Z+f?wYpDFF)t1T zoKWRU8(TEab3e$x7>7;@V_U;s0CovtbGgu})ju`*yus8(hhG5L!iweUE4k}!-J{2Y z?sC`ammPT<)e5e!&9&hz8%Y}>y*ag3t1Kl*t*TmaunUSSXE~^xgei&(ForhRi>uW|Y zk`b1=0Jj+&=I&;*4alJdLmGPl&-7yLJWKA>?e<_2#_WO-21sUa48S6^=2~ebHy8;kIPJ6e>P8E3Fr09;Pk_w zpb%QIobV=cdS%a@_kK|_aLCbK;N;QXJ+o8z#yL5`@{KajP5ZjQ{g0Gtuti8=EJK72 zNe=Qmr6Jmd(Q6^U>F*{I6`g$U~aX4ZVZRMFtQLnG6|_sMNwki~0c~M6mU3S#UN_h2XhFOAeNmX+b2dSI33Q zMr6Rk_vk64a|_kR&hsf6GUUc*0Kzf#ukE3^04{$ zOxaMHhC5ee=Pne*u&4QXL;Z7VM%Uqo1X&?-$d8&7DVJB(P8JcVDbmgjC6uCv@mMY5 zz<)ddoC-=2J41S_&_Dh$7U;(Ze;Wd|Y7Ib&FR7%B>AwUPeDNZ&aeFd_d<#$f$=`s0 zdMVf)tzkemM{~z`9j*l?WOx9frow}9e&JHrmG16~uER1+SfYP4gd)K>hc4kOZ5Hc2 z;Igg35yKEFqoZ82;Xy@p!`tqf3|?eqkJ%%Ir;{urb~z z!7a+gZU8h}4(zj%jo0IOpXc*5g9{Mh&}nMkEzG;9e_rCH`D)gZQ2suzWd8Ihy|>!# z{!kpLNDD-?igPT@rh@EI9!%f;Yb<#El;y?BT`_mVi!lj z#M*%+!l+*FKjN98y{W$f;>l|%IUw6iQ=F%TVYJ&~A=TH8Uraeovi*|KTpg}hdbtix6c zq4pgcW&^!F+QzSavnKtPvqbDakg^q+(2ijQH50}XElojU!;To!xpda zXfQ_IKF>xWy~hC21|UOw;B;O?*LcSw&*<$F=|i<$;E$X%1D`bDIoud??eH?(A-6Mc 
zOF%qC-WSFT=@@`hg%v|HeNEmg-wnpUQ~|T2n>vv3`~TNX>e5honK=)RUA_oHef^Nujxskl>Y%Q?!bx-YjRVQ^VcYes(-E4X=vKzR z(`vd9OY_P>-qN=L{-{;C$WEq-Sr)EMM|WipG>>x^dB39jI+>hpMp+%0UKc5{rb_|= z8oVV_jqx{zcO!lmneA|=0*X~^ayE6)Ji&8~;8^a7i-Ss+tP5 zWYNTSGr)OZ^Gc7@QbPK!_b@HohJJTA)F2qm@qtFuISDyEZTqqDu8-2L!pkKa=i*Q_ z`TLM&xD<_7lb6hl3=98n&^tm^W<5bQ&kA-p6Lxd^Igo2F4D2|;jL2}c&noJx`}3vW zsQ`?AkHHIvPWpXakCCf5J#L!!0+~w5DE(dcxg}i`9A!g(%2m27b!-LITI2u?8;DH@ zXj(d&=SKLVn!EGkmqd>hf#LnB5s_PMS*$m3o42IUTa8@n2)XDJ_lBt?JfSJ>5#5Dx z_1DMQ9Cu$x6)DFa@c;y!Q@a)(BG8u=y9^Vvv2sg8c#g}o8Gb0GHGL*Mb(QPu(k!PM z9)E}@WlU63;QPxZfUW+ieKR|`<`Lvku zu$j9gvgea>EtczN*4t zx|-fsQv^=iKxGrs4_Fm)A3Cd#K`cZiPTPzY`MLur)pGHLNNWhOX+?&xIQX~l>NgyI zde}8hedR&un|J_}IrtTI%>Oi>*1nBEdEtnEjT@k8X8&yY0L*fjUd+jt9|_cE>YR)Vd@)!EGcCYc(=h^+&uR1@j@1 z1Rs;$&XlGKv=wLW&r695`XiNW4V!0%I8HKrcK`XhStU#7-X=CLO2W~ zMov@jNz(tOsjd9cMJp?ij3#Q+Rccq_^OwM@s~dQgHvw zk~UrT)2)Iz4^fLlN@j{8yQu^F#x68=FvHT}nP7SA`q#PE037yWHZ!lT_;Gi#%vpvT zcaGLNs5GbxbeKi-|4(p6RNI;W({~r>qxA=WBjRG8Rf;bY}{6vSth;As+x#sMeTbBy(Hx_;4r3=P*D)s$>tlv#3^t|?zlb1u}aY#(e~wPhBIY96uG?u?!)QvCye+EBLxuy5U4 z4N3F)B`X6Ws5UD`rL)5)T2Atnx~0A|!X4HaR$15GNXf7|4qd-_udvf|rdFBO z>+L>-k1_5EI~+PqFcym!Fk+U9u*WcbIdDm*?XYJzGO5rT`oI5ZV#OlU)Lr0itva}b z$WWib7?Lprnr&m zd=`QnmWQZ9C&G6wY#R%z6977pgSc_FCfZh(&2tI?*7fzowofAwH@j|lpbL1+ym`ne zi3(JDCPaiPCYCh&>K|Zm<2a#8@iK~AR1#6*&89@nh%Qf)RH(cMp)5k2<+o7}RpPrq*d z0HB64Byi~88S9MP1kX5&sC(^aD-H}t1mCjF0AqxBzP1{0&}Jq2-@hTb`hcA<0a&{T zJlyTxT7b0v>~6OCb2qs1caVgG9)Ivjk9D!9w_7u0Y_0c7b5rp7$uB|wbR--XXLH5kuahF(+L zvCr7I{Fg8~a{kp#j2@-H5F<5r`_>fnJM6(Y4cI^AmPdOHtK^crMm8)mV(8fe3}ozr z9Qsa3E{pBTk7@Xt9rExEzsq=)$U-~)KY+2g#c3noK@Ymsdtdjpw}x#M5e1i{g5ItSL@u56n;mzNWN7apiGDNo77Zo4KxZ?p98;wa5j>h(nKJyMP8}0{X)g8| zE+7?(ccOUdsaA%ob-zA8+bQ&Ig$(kkNC0c0|L&CKlGP;N_vw*9V#i*^y~EiK0(reu zJ8<@;CV{+WAtk;ppn`m{KQ*7l#L4eWk-*2|^xI%7Ap83imzLY$TWp1%Bz z2GI@TtzU5d*cM=+xaea50vGW2CYy5M`TZM6R|R3mSfNf?IJ6?w?_IW5Qv%oAptR!( zfDZqJqKJ|{Sv{9b=ANUaqc-YvbG?X&%FDn~??U2rTxF`^ahz@5y<>d8Aj7dWCm|s5 
z{soQ1-Od_ho+YZ+&GIwB?kpFk|4*5VasSwqH*Fcl@GmXIpNwmc!PQHxAAdvP4^KzzI+-{uRUVTN~`<0GEI|iEl#7 zVR1Dj;%d9`(#MjFRWu{^oJGtcpDYiq3hzbPG;umak<|9A?2!qh>G0z4AQrK4hx6*E zRO*bvgdsldB7I|A`__H`Y^b*W<#`Jn1%!<4M8YD&MP^vuVI@!`n4(*z zwgQ%}PvU~aI95;ua#iE;V5pVK68!VdG0QMA5V^(wRo%1u1wjGs6_I9K3MT~s!f4mf zewVg9@lW-k#?Ty>4sHVx=5`040-=?2rAZV8j6Sr$+{p?!@~9$e428O#T{mE6JzMPL z_#wLRkh#~7ScRhLl-@gFi%S_=>{t|i?`osNGc03sUbivgZ8;w1*ym^}@ILROYeV)Q z2)S`xPjZY48cA;}Ph%qoO;WQxcrRtN)))q&mrrWO*k(Bwfyp`04n~JZi?|=PfFC+u zapr)1|FUG?w40B=tBnqa$DxOK1^NPp01p8hQcw17jjwNfxPCLB%rCJy4`C5cyK4V_ z2)`Ivwzx5G1USbcpZjI-OSWoi27TUSWe z^k*Ga`gHTB$lcfuvW%tx5)Rt$|;cH#c5YlK-3o@eUf?*)6v^-nsa>us*DB3kMTsl`8 z^FZ!?SKHQxOT=bH(GY+jk2@sV=v)Ue-6x!oTo0NH1gHP^xMleR=cNj#wa075aAX>- z_c%gZ7Je7m#}#C@XpLChE?(+xG%T+lTSGmLEeQcQC&iC=4$g;lluIRIfNQ&{O@ael z9_5M(VaVaegLUug|209gVU5{EX>I}PnR6Eqw0!R zWNp@_tc_NpM14TOZVEbT6?=5P`5vIA=%__im9|t^?F{d=2pTb#fu<;@u80b+mr``} z-+*d3rcTyDy++9FqTX|NKW%q#Of#7FXm7K5xBP?4jL2PDH5sjyKKpaZdV+nI}&XYe4jl!xmKqj z-JjeWcr#10cCjdb>1a>}2*q#-#ZR5e3Gh$n{`nSDMSO~Cx(pB z)_yj!39Fp^c=o!nP1Ea8T&6waXbtZ`FHqfRArtH&qIXIq0_#q|PR572F30h4K;IoM z?CQ;0{+}(Y2j> zK-VcU+hSo)=o8hjj0T54>u_?;2QbBNOMU{f&_C64guKVAz>ofQ%jkz6Qobf|cYi^P zx(e?X=~%Paw>ZC6`!@8_6qbLUd!mq}!PeUb)i z90~epaRn#eC)u(>V8Q+k*wHD*u6#=E(oH76-9S%awKabvl+&&Pn6TFmOc+0)(0f5{ zC(wZRlC+j!WLpa5{hk%-!nCV*#36||WA)h+{5ctJC8%#So_xyBBQ9E0J3Ki3;;r*s zUG29KZiQ1r;Cc1;+#90$==tS{qdr{9?Nb?{9g%!^M&vtRs_hZ07X`prRbTDDXA@SS zodullr>>m4j6Hh}0zkTp6UFFu+Lfe~x~kEjuUwK4;kuEdo&FrP`-?HJ_=BG@PSWRZ zrpWSYflz7{ld2ha8$7x-3T#m~i1TB5>)Q#M9I=c!^ImYj_kL=bkjEun75`tM?x|;r zH(YX8sk?30w~rhG!gx0bx;S14H_@*_D(R2bxL1$2zmzXc&pNYX5*SgtV-%x_mzl+= zeZzxUR#L$xLyLRsYbpCfz_zfy-+w6k5CYgy;)8->n@guh@EE@ZUfS&d_*q_j z@>8p3Kybm8&U^cmAt)hk2S=+)D%HM}GvHCSuu`*VGq}z1J_S-@9_tCyZ1D(ct1*T5 zE5&@$rzeY%!`gz}!Ra?J&^(WN+Vk&;M5&p#Ay5-$L)6+tM3ZWcUWxb3UaV!UdJO)KL_WN z8Cb!#dUL%&b%o22 z#$@zz>*%d(73f~{%-?EO%n!_zk|NZvlnAj8_h+zqzT4UWuV(@(f~SDxC`C`gBHx)kBrqa6%Er}}olacmz;>f2aJXU>>L zBe<#;c%SmV@qKU^yP6zbgSmtg>sU?HmINVYn00HSA3j+&#Ai(i5naujH3@iv9XpiC 
z6f5)~SK^#s8Bm;UmAansH2q56q|UK>252@&(loK5s=nR>AOtRxgG~(gSJQ_j>`u5$ zFa|dkB&s7Gn_?L&M_`%=acolR7Fg1$)W_*^0Q2QGnYQCnA>cc&%kekeMDWL8E?LzL zz1OK1rn*|cPcK6+%!Vl@h5xm7qG5AX)#6SU@>$Thp`HKMBkQ!k&Y#3cwytC`%#gdXn1YsU2h1%i><>x484_SrkI(fjZ=L;qOoH z#3-E8-GK@Wxd4`KjkDMfLgO z7~S6^&iTm9Aa@1-p^o_N;QZQgoUDM07a}CUKepa}?z(;evZ8z_)c8$wvVjy|zXbF0 z_ERkU;|T-ELi7flkW_y|4RN*%w?FAqSt34QM4#fv`0-kPJ@uJWpFP+ug(uGM98RWT ztKruBnSo50d?S%~y^b%{*ier&fc8(oPjNn6%wUQ>b^YX+@eJN?(GT8P47+@ky-a3l zX&~+6tPJ%=OVi6dOozW)Z}Dlths~>P3O|@(LlG9#`+dxpCO`z9_yVTH!?tXpDLs$Cj6Mzj05A1Utm5K)QVI^&|GdyA14G&oKxm*`0Qr{kqT8@*vZxR%LE zA6`B{&micoX7b#O4xl#_bZLQAGREb9nW-DBe~C_I?`pI1$o99j66{}|47O!)Wz#`a zKv}A@=L$vN&+eTG{w%az7yjASTXXA58b6QU>$PNd=RIg~8MJ54VW33+W5i5zdVyu%82euBZRv35}jiU1Y#?r zU~Tdnui2D~5GNeALPd+F|Mb+#yt5V$7$pK)$nfSM+9;6)>;)GG2S-)>Fp0=8W7JmH zbx^ilgTH*Oln60*It298s!b0iYBs@qSYP}*pItf^*(Edl072(?dRk8YEP~-sAWtwT ziXSZ)pFYf_u9z?YzOA9kYlpun9f!BVcH~JMd>+q$w~)WH?~xmIZ&Qvyb#>u(DYf3Y zS{+w!LmI#h2L3cfGby32LJ@8iFSZ&M9yO_V;45sfH9h7z%7Bz z1LO}cQ9C+tL+0wFlZ&@KxKFhu$kXHfwrIdZv?}1 zsGb=z1}^q7Q2+I_U3Fv5{A3ZGlte=o`7~HR_eIqyFSkwVV2$kwZoR{AjR z_{P5dU9=Euz!}F(_=gw`R7|%c)0)AoN}LE?Yp=tt)XsaCdLMm2EpcJ?z}QE#-RX9` z)DvGLo%HXs`(-tw5?`#5TelD^f%^aIG-S|K7S!o*rp_NH^Lt1qF$S4L!zy`WY(9K% z>FAIR_Me=4L2ip^ILb(#cV@e?fHP&Q)LFjIOY1+X_oh23c6;c~sQ7PY@<;p>**+)@ zbi*!CE|Iz+Len}eqrysF)D?L}y?!}@k9cpJLGpz1qSGt}rJ-7MpiP7}X;5e0Q5|b| z?GZ62nv6g?5PhU~@~#kF%6Q|I`Ie@2PW_m10H)D7hb-$zHphz?x)CtWe^TJY~?UJA@%Y+t#*W zB%R8oc=r;GTLZrz%3=zPsjf_p`hGL8x?_;tiXAJJgh-)+ zZve8p`BW<`V_g9IUtVk8CBgxmz{-;6VMmJ548p!Hyf7$6-T+h(^=j=s)K6x&4gG+r_(FQxi^#%O|5cV4(JWqAPY~{K|IT~JnrOD zvtyn8?cZUK#|@2*j#&lZ==jv)Z5|7ygHZwii+@a$-LJ1RU`swm7E_PzZaF0 zxvwEy50Uk=0~XgW#D&i(x+O1ob~rS*61Dc+^jtc}NXF7A!+2;XZU>h%&+7JZkXdhv z7u(ICKgB4epIy`5--l|w2TF8aLXEcB9iCM5R|W}1nR5AaS92DtJ}jIV6do9H^mpOR zNpHKL=B}{yIZ%*PG_-EgG^9Esj$(bD0B9&`c={k24n^bO&mi4YCtNY73L~GYw=zVm zvL{n7X{1;Ma&(7tz9S_^lM-f&#`xH{uK+rWS1P-13O{`cx|VJ?t)oP48*(FpgObUYe$D_;i1>ZZeTMIlh=teTPR_HSPW zsf_J)D?%bN)@w0|FOrbZ6uxziJcYfmr%$5oRR^15NjlI@?`98#Tt@1ZTsGr_|F3yG 
zA#m4qfIdn(PEF3Xn}$#PFC+qiP}XDeXTm-bum;k+M+=rj&IGn=T@|7rW2?s2+HBszUUW|C`OKQc1E=#|3+r42)NRQa+xx?^c#|Q0SB&QSO6cDY2_$#wn_jGrjba< zFR6Oya%#v1f^+E&b=|j7yCmH`n}7MthwuuJ?AjzVE0V~odPVE241Zt-dd30I?WP&sBriXjwb%m2*!e2 zaqwZRZ#pN$d^@?iF03H<0%sA^Di^gff!k`1lHwrJORssX0tsu_87ScEBoLNi1hN2;@5Y=uxwb)Vh7dw z)pEvE?w8vg4XH6Z;I{?ie$iey9s54UqZZbE?v-gKL|0*eyVcXaA9R2s&?iyTYa5+Y zKzLZy(dd&t;|Rq-Yk1mmJM=JWK`4xYEg;36QHs_F=bM4aw1$q0X|-9B`VU2W zt+5^IUO$U$wX)v-)5wonpT%oK^!H(dvo%j|AvIaR4wDMZ6w!$y#O0b=EDd&t_SfJ2 zL>NOP_J}jM!KClA{aMg7h-DJbz#3;`*#p>>b?zwwJ{xv%+GMRuWZEEDr-e|@i3I}DDI(6`ua5o6o1ZV#Ow#(r_GlIaZ+xzPO?Ka?S$4A zh8wJ)E&B*Ra4j3VbDg%8PBD$h5=oV+CMm1^&1f!bnU7`A!e!y?cE!(0tq6$}ryxhQBmE}VX0RaZ!T;qYCC_sE9;Cj;*Poy{Ezp(h013mk)AQ#IX>xWqDAf z514?=k|CBL2vC(VAXQ2YZZbuax!?X0E3C*y2HdbG=<}shYiFOSjOI#BK}GM}k&lTZ zPT8d~)sXRO1~Dc=*X)SlwUewghjR4ZjwzT!0*EaIUFd$|W;axVO*dtHcpBLQYP5=p z)qRxy3K1*uPPV>F8j`@RV=`BqEPKh8Em7W7S z{iD4m^=p0nYL^4wZ?8}4GTQ?&>MrsZTjTdJW%;lur^ZOBMVMn(&V?9zph@^tNnfJA zGT(yjJ06g*K(!iVKlP9g&*gQ&U#Ut@*4$r&k28n`je2pMkDTx`BP2WQMjE0(0Uxg_ z6A{djVx6_V=7cK$XN`AlhBKa~3+5FrXIG^u{<3-5LsX`wIydxmDXOD8(L&WSURh5m}W`spsA=tP#wQh)suPaZA|>gS>1O)`v* zOUKNOw*dy8<#MVkFBQT8x13NILDvzZjg$ihd5crg z;wtf?$iATXHG!P}L1G2ViLGg&i`?(1*EB3j0VrPkile}-_l3b^44mMI5$$6Z*lIe} z<@Ab*(o#8)+!AV}ThUKAcN}-;*IRDK5k6ckIJ7rGWlKr{7W5&nIazx|nco>rvcm`A zhM3mgA`iXL1&;Uzqc5hIwE3YdZz|L%H%;0%?z+u?vz=htPWkGSI$ za8M@0j6<_DcysE;OF!<+Pge090!?rv&Xd3!Bs^o4J*~(=MY!4LnomXFy`k7-4eEY`AvX?NR>EizD4kBkeKQgI`&`&ZWw|KNhRI)eyhl5(eQuva= zyLOZa%>`6?05AWd$tsa2%>Zc2D*94HanN|}EIkx+&oG`3j@<=I1ZwafLyB^}rgKp2 z;eL|f>-$o=OTLkp_p9Hk4yt#l4h{&0E1|@W^i--w{(4t2(QZQ6gv#UEqECH%=I+i! 
zO9Q-=7bk5ZRfWLTBU|dwry2GfWh%&-cz%|FP{0~vYcy0f_}gN8|I?U2V?`8=3OOZ35o>VXci;sun&1TXx}8 z1vAl$q?w+ktw(RZb{zicx{hJ(&IOxPFa215jF#I7xi7s#(Kb4!%`g?U;^J8B1TO9ed30}vy zWCWUEitD7sK?%@FU3P!2RX!fF1fLaaDbh*`a{B5nHFv-`HVv_W&Z(@%Pu;eGWC?#o zfbr8w!v-f&cIYtn$OvAvF!E1w=)o9mxg7oD|*u$Kq* z=%VG)zzx2oxn?V+Whn2EcelL#M3Kj^n7NMo6Xd}d{l=HJecN}{YWlY;N}|R_f?%ei6G}D)w`z}7BaJS+_e8M|he#d*ys_wCadTb#=^J^2 zUW1mtj4Owf*d8hf3L_tM&~iKBId1VApFX~DeP>CDReg!ISlc3p789cNVYdXmbL=N{ zkq18s59fG9AA_NnVhTY0s)u{Sn9k)Sf{j;g!66JD%%D%1k?F7~Gd-_a8;CL0g?HowT@qTm`3A{8pIGJ@!>9!OgMHY(UWMj3+oWY=5XSgAz)(dYy zPZ`H4%A6@joWkf6f$>6rFz{dcmj|=m&@w$JP?V#1phC!$rS~d~hGF z=Fyw;G%u|J8|6lIWAUR{*jLxf>Qk_#*vAX={EuYVqL$xlub|{gjQ^Q_2~uz8W#+Wp zqz}eBE;^)^`Dlz5n2EyOLr{i8i(gs=e3B4;mzNMh0rAneht@cROGrX;gxW!_LI#Wj zcxwx{Romk=f}L^vn`sO59_3GHmsrN41OSuwPH4GU3GEp9*=lpIJIQ}bWg-6S%w-|5X~$O<_5 zw5}{0daFKa5dRn=l-0hhLKj{Bezi4Ul|uB5>BXl<_LeSGyDo6;7i0@rucyhL`OLnH zffIwQA9D5zOO9f}V|!HpYB>p#{4kn2Pgpj9Ji&+iE*SJEIfq#Zlb35DRLQ`p1`Ii?> zt3L5F$4`}i(ZF%3TvPk%HkEXrcigIRhGbETl7(UP)+bD{Q0nhIP(ZDcEUMNL!LJ2V zd)cG>3J$o;$!JQ7rD5ci+6_jPi@hB|wE$oZYlXDG%-p6S#Rso3*N@Gs1iTH@GqcW{ z^E~Hj<(m5S?v?32Y9Hb(D3S=Mx^TnPHB$>Fg6jXLa!s!*>YOuIMB=hXWr6PI*B);R zV!)i%QW%)CITvpP4z|uOIx1!4aR|U z9YY&zg=le@r?W5%CEM6EBGn@ctiCnEzeC2iGE+P`>kB(KLz>~aTUl}k#87j<;6v^+ zn0X2YEHA#b3Pwra#D2S^6lB1hzK6&c39gZ%I|a{kS}QS1gwm$TZh^UF>BBY(v>1_k zFGB^mn?_wyeJT@O0+U{fq0un&5>fan8OTdhxla)pNZoc5AYwyTUVec#S=Q4{*)Axf zz6Pz0Lp&Cb+Q(o%syt)DPyf~6n^#Jl| z$}m5DXlrClIGHyk#WM577>&SviORqAJeJA4-q9-B7H^aywa@y;mMPT-AnjHjN6_$4 zFb|UD_M&f-e>^Np3S_XSk2SyjjYslG;3aQmws}l+b!>$DJ#@qt@il zzlD)kcwye?7}uxjunFb%EB9*ie8t)uGb)}o?97Q>G2&xxaP!ZLFJrrMA(X@N#usxQ5FY2O%{;TF^1 zTz9&e9y*;_5nH(ySj>k4SsS?scDL7CfsyJ`kncAsa(n()t>vtX5x}JLf|O@@6Uv%LIew;1@~%or)w6`qo^^7 zxE9@?_!Oixaeu#;y~nKPcetn($z(MR@rEgvlWrN-Gs7N{Zs=(*AjG4IF)vQ--%v3tXCE1jXC}@k$mYsoA;jT~&?RTvqcGA-!la zKm-}zZ2(=o=Fw7I_rQ{&OEDf1n*F!m=Hl-@OXqLBi~Zh%4wipZAyZ#c`LpJXX9C7Q}*XM;5<4!%f$b zz*d+}nrVbZ!dKU%uwS-tG!8<%)zfNS6kP!Y%=tl2SE+xl7^i>x}EQpi$_Z31|r2{6DcMWcc 
zVsk$-ow(cVC4DK-Y79a`lz&`Sgku*>W}ZmFTKF4b3!&wHV2h@wquiXYL=|u~9Tr-s z$VuQOxi8fWTQi@vzxJ^MJo!|%aMTfToHPAi+E8rYCBJEwNim2wXvrJ|`teqW1_q7c z!P}9HPPImd5ik`tTm9Y->hX?;CuU`oBsN_m?RrIU3DmfJiY5C!WuRRotezkJ!xVy- zq1%IDRQW;dF}dlR0^tr(7SFnVM=a{CIRO~7PfA$A`2fynX3NAe4pTf@FsJU*t{3)` zrI}h|*AZ39QxNPpGVe$m6wEoK!e7x>}px8+$wMVW5Zw{3B!Ef1V$8U49EfDKm~`}kQ2 zJerl~dahRlbiJ+@ym;5l^y<D)v0TKGjb@H)GS-deDZ)lA5XrLtu1CEyPFv7VBM zv9 zemAuwbvFgqZ{%z8;v32c>;=H7wb(ky=}>Si*zunh32XwZv2)CWx^&GRoPz4BPO9(d z+OecqJ*c7lKY>=VM=Jb!@2c#cErbHdhfssdofA!W>FjZ!z*X|ncR$i<=1P;}x;-3q z_;G}+N@wh)8=&PoJAr;1|rnaQ6x#vqv!g zv?)_;X8>m=%H`mMhcR&dH&dMz$M}DXqF8h$FvW$iACz|GTUM~Yh#A`1r5(|Z6b0Lh@ioyIIzuYFcuyw^g(WJH)=SSZJ6oYx`(`+I zDu4q7z-f_-EWt5xSb&ZVml!RpC~C$)Am5@QD0yWBJ<;_A=qhZ4Gr}4kIxo}=Y3Idygvzi5~q;>MT+h}Q#cpQAxtv5r{R4dN%*m93cNMVgOAX|9I zb$Rh9yYk`+NidI0Z*X=Yya8fDZ^Vf}m#B@FKA4gE!Gz81>s}Y?Te6{1CYoM7>6O~g zCv4USzzmXf)sZ{q-8vn@HX?wvevkvLC%t8YFUVxD->;YznZ-pR}>9%B?AV1VRKdn}-*l2rpFd zBfcuKqJ*Lr0m9Y%dZxoE$-`@?3=SS33TDt4qe@9bDJnsbX=5z3bLNyDO7>dILMzt0 zbYHb#+?RAG!Wa)kT4sK*A)Ye-%@xHwFQ7gT4Ayom zX_Bf}XLM}rMeaH0RPuzevp1xg789b3yz1`5igOv0BNW}fbpz)N;AP?Hhzq4~702LU zFdzh4-nA{RomS}D3&_95*Xa@4Clg$el-PWw!9*lZwE219iCFp-35eZ#T#BMH(`a(jolszv$9%7cq53H%ipaC<<4x~A@ZhB&6X#4(J}YDJDD zWzBW(mFG{$dI26hR>Q4&>UpbG2mQ1*r-VUR=V#7ZZH6*3sIMbO>N@D;K;j_yJL(-~ z%tb6u`du35@{ieJKH*(o_IeqbpRS#HevZ>8`4*n6W=V&c9&jV4%zX4?d}vY{l3XKI zV0aW?4*e?;+bEOK(}$R3`s~3t>%~Hr=*6K#_o5ZA%iW1xH35)HWi=fs&{bjkYkpy) z0L!{vYTF6b8J1{xbs}GdC44dllj;y`4Hx#}32kR6GnV?wB2gfyK<+cm`fjnaGl~R@ zRyi)&%wBY&$4oNiUs)D2b_eWbWtd_iLvWV3dddPPoD_v5Vg)>zm4jNUCB`Fz<`5N1 z`mnHo5BAjJTW_5Zpvb?5t9kdZTvVFqks;BL#iCqt&;yaadrgk_xp_&{d7rOB2Y`%& z%Fx|$NZKQ`>~R!U3_?qd)Z*4puD5mL=JiRV7>8aSb=*VzUGeDLyMp$_Y6kHRX3=lH zhPV`2Xlhp%5K8b%#+TFD?k{WLL`+A#`+s2ou6Cy#-JuGkdRO(=RVHalPVXwPIZdNd zj@U$G=W?x5AQ)WE*YG&rG!yo^J=qor(siKUsDb?nA`e4Pq`sI{)$dQJ>kCG(A2vDG zJho=Q8aZ6R@bkgY(cRAfVn9%Dpf%uMQyaIiuJNa046BM~=SrsYT&|;Q!C+2ZLK5SD8ple`|K<&LLgrNy)&kw zpu76=xxK&bmoe4Qxlm3UQ5Q$-pKW3C2bc6mEG?svfc7Y4nK7K%ap+rbIwPkVi0mKn 
z(BSM2xk(qil1wHBCDkydg0=I%AO5Ch+tRUc`gF(2KLM#^fK*&iWsEG8*R$Ih#TPh=QC`1|C630SuW=cv8QtOO;MNUQzw@qy`7 z%P!=Le$4j+Z!4BHT&lQZ_3f8^@1kjD&3w%VMqx#DOT2gC!hNd;c$NeOVJQsGT0pZe zz_-M8di&mW%a1&-nrLEEPSAAj-&(G%ha$W8Y_5XUr$>6qcpCzAQ0t1kfdBMn7H?*S zXPmVWQdI1#oIZXk1=`=zdYk2B`O2AkBgaO5{pMkY4C)p3Lp-I{TZIFhOM!wJ>qN~6@)4U$M`zfo}t zCrCal5$%W~ni)N9z{KmdniLe=y1p;kZ0iKP=V=Wl@xylFADPd*gz)&Lg*RfE3?FCC zt6-|uBV7f8fEbW=faWK78BM0VErv?13}|&?)Bdmhv60?YbF%3y`;_4MJ&{6 zeZkw0H*BW-^A?Q8Ca6af@vrd++jRh7<5WHpAzgo`1nG`XMyk0|MSbES4Dm)LZ5)D-w<*4iC=3=I>&blb!*g0gTVtyrk4&+=;~;@|{U%b7bb{CP=n= z)r7n$C>u6#k6DkgqT03JgV55&*W+Y@i3G2!c07Nk#4TgPksp&Qx}29)!@f&v8NjzD z+nU8O8n}Mh+8nA7Tf*7nu8-pZAVTGIXz8>@E_xGCV~qx5bD;`~cjk42;Oq%WN1$!q z;Mc23=$h zSh){?mApd--7DDG*EskT9XFfu_t#X=r_LgA+LQtHzkm_tMh-$)(O*bM!&{q=H#tCE z>?{r?z2Xx!*&Wd$qral0nVMG)yPojUQJqe5+Hr-MO^W{$dn^#6Uh`08X{)BGX99Eu z3}12;KTT*nAO6KPX*$f&jo?lg$Kl{8icAVN6V7x-vRU_n60T$H-;4AW5fyF?NT%@VweO9j%-TXGsBTQY7)G! zH-yYIZ0eCcuGJ}Q+%C0M;X0#mSw%r;5)%2|Kq&>{RP$&Y-0OKR8i1$4OX~ zUtFkwo!XIb#elP9puSg2e0_VJM-=Hm=+S|84ulx%%Io0b{Ny^op>1}pY(DG2OLu(3 zkG7Ch-T`ou>e^vL8;p-P+rK1*Dkp(;r)1Y8nnQ!dEIEju+qM%J#7e^o{kaV`Mn5$3 z3;p{or(FJ)Ra_vp+&BT8iedQE&-x&Y3?!deD6ooK;EmVxWRYkGd@CwggBemjL;HyJ z9NkbpL3NfUyJaLf1CFlycm!Im_pF_wbI?b!dVK`@zm``;Zl0$cXb<~Cs@7uO)gv`!z79rwzrusci8 zEdR=?jE=6JQJ*wM^MBjus;wd(a=wKmL*GS8LIR-vXC?>A^Dw(p5YNoeZFYli)Z`5M zAqs^wlTgru7WF?z=1!SZ`=nd8;JzXK-DyxqguY4>_nye^ya+)9Mr!emf+YP0jVBJY z*EhnHVY%f73j^Kd01PoZm!jb`al-LZd#E{)Z?Ps;eHO%-xtjSRV^+W6T}bvDMel-y z^N52M`E5{6Lk2b8c>x&BSt1Cwu9*t^_exsPke@z;>(e&xWQ1!!vibVs3*B#I>ndKi zJ&qj0C(yBigR%s>R!}Z){fwTpYv2wyiF@S^XDL~?cR0jF|0WT5} za;2V|=43PZf<%30z<9s-*azo$x~AB&=GbN(x~YENFZ(NJiIKM~7v7up_wHMnk_(OL zRvs?VdnSkUE#`I#j+%fNiJQrBP9cq>zY?}vv9e0q?AbQ$6A?w_;_QhhHAORR!M)*5 zeA@0lF^#1#NyOIVRk|FPRX*d+zWt7~PKd}+{tC`}dMO?F%qxlF$o~(^WpMe|#(H#b z@u$9vF{$d;QHa0h0Or!erzs1RDzENYq_)tpPu5e;<*QEQj_(gN_(`A*M~i)eZ0pHC z4m)cnzx{cRTdE7yXn~A2Kx+h_hnJjBz~@QCMF${KQU8BSVVvg* z=$=T;dYhUZbS+*>oxz~WmRHp~*_`;apKP(w#MgU*E;bBc@18wWlu)uZu`lwE9=2v2U>IyT#WMl2y3tQ 
z0;nz-*)Gulta5KLOXTFq@z~Y=mo65oX85X1`%^wKYAn#rt^NSH)ACdGmfnpSvVqEu zF^NS25g<$xFRSg9*ZSv4JF^)S6Ss_?K>SEI;hXWU(1);Yg4ne3)`z{pt>M}F{q zM~1Z@k7Gdn-#@@IRdH_&^}PvOW`noz%&Z+XMfAWy0?Ud0Ao3{&lPzr%;SZZq9;4u# zLEv$C_V~PrA7k0O0DMxS2Qq4bv~Oz7iNV+&PTd0o?4_&y3|lUKOB>92j<70EA^rDREuEQz-C$nyhoD|1b#$>WU%PA1l+U zw4-@CM{suE&-c3rzYPr+;S>QKS`4dSNi+$YdD4Cwedg1PL`WT$p_R~@5AJ)Al#NqV zp6BcUlP${&pS^US&sk}$d9mVyWo`+n8?qtzH16yId@TQbt(aA&IK|J^JEdI;TLl3VPjzPevD4+Q4R4=}wAkly zTG6TLQ_#uvh&MY_QH%r(Brh|-{9y9tR1$o9U5o=p)%{pLvyAk(00>F2FD0y~uAk3W zQDonGKY$`a5#&Yex{d6f)FW`xAA=R8|H)XlxVV;T}5+14~~Q zE$;x-wuu0NWW;9JPpEHp0Iq_op8>0Ul}0oc)~>F+NK_Q4m%9nsZZNzgeqA=qmz|AZ zGJTs&4vDM>mPW1XHD}#QA0gKi+&>PXppncF{~bKDF!2O#bJ)A~$-IlKKZ5l7#z*^K zEPP0kz$vEF3r;X*GSB2TpAe6p(bflU3Cfvwt zb=#WhZR8*)boR-ug>kw|gc)h|XrA7k12-a>elno8-;mN>y2m{&m#*{veEIYNBeo70 z;1U`OZw!J1J;{+{eoN8BKc@A%8%B5?VII2lVfxhj!~+R81Eeyy7uPDQ@tu1%gJF%O zn68k;T85CU4LrCt`?KS}LgSXHMOo>c|imlj)klm0>9 zp;-8E>uJ7pVfB>fNbngQQP9zjpu9LeWd{_X+LR_6Ox%G#4@=%liDS`KYN2CRA0C08 zb&wCQQ9tChjlyf7p~ITz&z+6VZ-|4Gp(uMs?c<31DH`Db*%%rYReD)JlxA++ z$^kEM9oJ4i8T*~C-!YW^A3VRC0)L2u13izVPpgpp$L| z?)4?3=b$htq-{?w67in)6Dwsf>1HY=V^-c(x+vzT?%X&O*0WmyUM+9kY;kV;y?sTZ zH}qeeDws{vTaX4Mk%kZ_`Axd?55LcsbmXIbU?2q!&FZ~WMS zDTt=@wSyXDH*!y)hhvGwZof?qbQ}?46HiGQj41@Asl!>`meK9B%b`3Kg|30edh@z9 zxNs4lZUv#fkj?A?c}*Px^84dDGpguyM%d)1?lBtVOiN=_I`T4e-K}a%SJTfR-j%C&LxE63R;g zfRqJuWCA@_sS!xzG}Ej{xhDHtF4sLoTy28673L%W+@&0)2~@9c9-@MI4st%oVicnX z$_PO(6W4d($#p!ZPIm&feae3S-cM9H&kRPu_*6R^K$6AZo;nAA50BjYy?9p_iTWRh)v>ncuSFi zBDF``)b(8&@LDw6!ta@h=-c#Hd9UWwt1*Dl!04$b|4t;BWe@<95PxP_i- z_QEFBQVd_-{(8RNDQ>lo+n#WT0cfFK5$oUBBh)?h^ptNDq$c=6S%#gc@NItSdKRv5 zQDgqqX|I;~mP^Ee6ZGbCixF)7+cVHbO>gM$d@S`S4%*mRm=N(u>hE<%j3B)Ag+gMQ z<9PDnk)F>eq-xlc(!&MqKSR&Hra@XLpyXnc2WO;MPMd-GOEUi-Xe8`=#@sPGB7@nR zX&9eiSsvXsbTk?Z?FCr)N_zhw?nt*hY1O*GUxeTbIF zJR$j!)C(hC_Om*-sO+!VOfdj#Ok@23<2YC#Y{L4O_c?qM~_%1Jjm(`Lu+ADY5Nk6XOvYorC;3g(3MPWavTdK13e*R|?1?ms^1nUujW zN`{Xa(VI^2Ndg;5=Sl5X-z=9^TC*3%u+dj7>lHlC!#Hf$KcDyH#WNA%oB>0t=#B7H z;Yix%@5%b=Sh-;Kv 
zEo`_k2Z#xlN{8+T4zY+zYU=^}J-7WGJ#j~gO=v!8s$Wk7}(`cY)| zOnCX!twu7=!F&CmN>gZ0ADhF8JN_(ASQ|a34)JHs(1-NT;nzwZT@uLt_lB@->-SGJIN zR&4>NI+Kd^vK1|edQ{$ zhKA1`9#FvZx9~z&k=x0eJ=lr}t{k0X4WKmR3At8LwyeQKlIfGkT3Gkl@!FN^?dPBo!1@1s9jWAK=W8;xET{)AGmuaKf)klTnmqg4y#79 zjQ-#O!Njl^y%sF=+s)Tk5{@n)LcthhPkoES)w6?9>`&>rY0ZBM(>ytjV_jebuE zou4e(51bQ7q^PK1#12e^>_|1V%McWHu+C~1zUH9xH06>cq zM4UnX*95}PMX!%?b_N9YMdpI%iSc1EnnehMe)8{ovwcF>Eo&bn`DiS9FJFccmM=_z zcx{tga3qJrHNvGUTQzwN0xsMItVvZ>7sG8V)BcUQZzy&BjE+`g4At;zxJ*9p{#X+i zVmqGF#acjV;#LlH!`>PFS&R0#nltx9bKb!C zm0K5GNM}#S41fz&J0G=yYsJV478;~6361DoM+Le{+kLSzUj>NzEK-6zN`w>xvtJB~ zq#DK(g+?TB#c?P;ntjS;NVygstG@Ad~5AWewE)nBK~Br6G;Uz3YQ}D%>B@< zPd=?0kOyLuWxX)IDrYu$HFn~LTSyL*oAJwp5z2eblm9%L>e{KrHv-%3=xMHcnh^Vt zQg{1hE0O_4J)1?OvDM@}J5+LB_uF?Yuw7rSDeacvC?cYoz%{MNgQwOCCs7M8c zFg3~iKJYbedwBQu!&o;54=765dJ>m0$ui~Nb=vZ*`o$j?FDptX=b=tmy~^U3%}#@` z8}HFLPpp@!pzUE=CDc_jHO6nedcjq78}ZdUx)Xy-V8yX}(}B_3HH&Yl%C)&WQEL)F9JCmy z*~7NlCeZ~zoKgPO=E%-9<0R+?hwO@?aOu!cF2`3Q*ln-ra*d-!F#mdni{Qe=hF%>Q z$>XsTxemRQ%ZDr*Tg*cNw;t)|^MOi`Es5N?LoI+9IA=Cq0y4!OsPEX&Fs%G`?qYCX zx3K|K`m1)W>(gXEuwWjN!2jBBvu>^e&yV0;V)udjUV63yAWJ|vus3ueY}dpNDl4?V z26)H)*BTu#G*S9*dXR_DK@#KTGR8zMgprNTAK@-ewVXGDoV=enH_?o^Y&Kd&5kng_ z&26I9a-S8clZ`CZ+z`aEVP=GToDSvi*%H;*)LB&)|3PJ`x|he%VK(K?w+qJB>2^}K zD#Y+Pvc)?4TbNNw)Lhq1OO>-YF*tIukrm4n2ndQbUUqx<_%5n z8dyY#D4puufCh4)goX8%bPs|mqXmz~$+)XgO`>{FMGW(?ud8B?Y`!mdTh)siEU(Ga z<5NgjO};F?XE*a12DOmiq&SZ@j`D8tp;VYa?ff|NWLu%_+KCHA;sXGMM2+t%uq=x! 
z9%mdpebR#H<0l4&f}PlOxyecxS_WK)-4?9_F32yLmUiJA6v(6n%tk!N$CEYc*=e)cnzVYZwQr4%V7id-h$Dujj%}sML{Wm$qgQ2GjrLOeJ;DA>N z!`EvrAA#p8ZQxLvO3U{zuE3}6RZj6QC+(>4nOA^6k1LrV3R=*27?%7xUv;vld>Mz( z-oDhEi{4agoMR(9P>NAExqAPxSN!3p&lW(IvBYFpqw`o*+o7xDdDvFIVuT&?J@uGz zm{X^0l$2)Vm2-1^wDMFd0Tx%O=e`?bNBmhY{%sj@8$|T&6&^6D-*B?BT{_CQvBqsYIi>T8AXR@rIP5QrIjkG%3%hu7F$Wqko_OTR5*> zC{G0>)c@DuwGU2h%+B6fRV=f?qL7?z06>!E0gETR`zG#7Y<>?`dX6B;3DT<$3$-K)`igUm*d0z16|k8=ZJvX|SboNQ$W zIX~70H!$7Wi-rh1in9*>8^$`zmq{e5mf2RdHz0q|EO_`gmmM|iC*w{o_Z+MQxD|B} zPUliBYE}Y50sw0flv!_mdEw!abB&9(Qj61hkBNv45jZRxB z>WFRGz-wL{>SxZ#Q~T8R>6m1MbPM9flr4(c?Tc7X&xEj4_fG%7U-(WV(O@z~RG9_t zjBWo{LUlfUQX$`{4Zg>DjNG^fRby%bpm>`S?kYNFd-h~?S&n;Xr2lB#ygcxA^*XUM zI%!2-dz;n9kFSC-o7lrpfd)YD>al1m94mNNyo^#g#MEA=eLDwgFU7t;^9%Epeea}v z?MCrOT6ExKE!MCjT7xMWi$%4-7)dTYdQm)!BshZ?x8uVXeip^(IVda|T=^wowRvK? z&4{Wqn;eVaIAzhsh>y+yN~IXF*RjFw_BJQurF|$OT1ufugOKK{lzy${g9lu37}u%; zKLqc7tMs8I{47x52esBx1x2pAnV#KOgXI_qKh};CQeKp2#Tm0PeTl*OgLMuaEOKQ! zGZEQ~R5ehiE$R{iQqwG?TAtpEMIKRcejfIpsP8X=_!Qj*Er#Tc7}Ir%WK-<-md_Ow zC3V31P5fU*#$-vF?qqM+mYjZqo6DB4+!(>|G0!SqlCQl$SHBroh3*FdmR4mwV5!H% z6gWW!*R6&+f!QY{<3mln*+8hKKD91a!x+_Z6EI|lFkML;0gv1rSgf-4pJoGcGp03| zfKg>^zIFz~IB5*Oze=Ql-B2dQU%`!76&< zw2g$HLhN$^Uy!1(1lWjlE-oVa!}G4@ex;m~1;}{c}-FNg3aHtyJ;Sd z=Xpt+)es-6N_+3aeBL(VXb}Y#S41j~5#%XQc}#1$YrWTB`WuPnOc(YJFzsKc8^%yD z=tm{8!}+af(A$jXuP*BN9rv;y_tBhurFUBxaYP!vdHP*qdtpgHN^~|pPRwl*`|Grm zJMF>}_fqy6^o0p=@Z2+J8j6sBMcd5&M};Y|rzF?@B?*p22~QMlIMWWUr0E1EwE0P# zT6dXBK1Nd|UcbLZKD8*vGJtN}Y&3}#p;=lLoy-XEzU|2K=wKXkh7o0;py`PU!7R42 zaJcVeIjo$#U72{_PAsOcy|Tjg21f5fw};-ay1RZHF*;C%wZyb}pH34hlUm|AD;_)< z({&S+qKbw8A*nq>&VJ>p6hjuB2c69p$8zPvCtaVB4eT!GIu0WXj5Fvig!O;WGyfYH zl@RwT)Eq+NSn39%9I>HpB~*Qpl2qwk1+_WILfd_$l~Pel0dKz zAI}3J7@J4L9r0iKKq6^zJ__$vl2|2Qx|9{*Xc!Is{g*UvYo{hbvk@hrazV;=oi7M| zAH-D2_dKPIL+_JRp*6#wi>}Y9!x8`AzF<(1bjPp%m@IpV!6lr}MFi%xz0O8e-R~T5 zeoBYb%pVVl2ht!p&kitkw9k5ArvNwCIDkMO?1v)B@uW4^S}M(IiE!Ln9&pyjlfzgS zuf&1TI}6Ex+-qbdu+((&iScb9lgRqD3B^tNa#|zQu?HPPwtNgKy%rTIqW4SlC4S@Z 
z04W}lHk*~NavxFa$`oCn=vveU zgQ;MQKT_$JAG36IWtR)&CQ3^LTnLYJ5$3kr4G0B-;C#rb%EBIYAx>eKo9E4%jR2lb znxqZQ_1RZ}zP^-=GY%eI--oD-8uAO8i2G1R9L{F`iL%Ekrxj78A4|X{x8a+`b~1xS z#RC9TOxTqKwqn%UX@A z__!cj`cy3P(H(wbtyVXVx(*F z#KnT$i?Ik=)3LJbCC6L4dO?tSL2Ehr1_+Sn@>wFF!ggZwCOO;2YK z^&>VErv=e;93MdwG3{Q$s3=|ngCeLeCV!Vzz z&~yv9iw%&J6y#S;fy-BD+vS#_GVRMC1ehG|n_CV;_@>!j6Bw5M;UUT8OdEgD6V;Z5 z%a3JRrsz*=s@7vwGW5B|;0TKLj!ua2){!sVbX=Bm>cE=CS>S=>@N6%lN(&sE47Dw9 zj5P+{Yqwbv$dtrV7eNK~%8fMp6f6Vm@ykC5(5L_UuI*U18iWK){rzO}5x_L|8rOb@ zarJC{<`LyM+5sYD3k#c_wk?yr2Z-O4nfTELp$9Ltv{(<>PU{0rwZv#Be^Ldfs;?ES zr*zyZ%7vx|^N}lkwh|fjaKm_6gI1L*%*z{*PS=vrnbg=G3%G=&&bEq|t!@^|ZvNc2 z{!x(A(GBD%9TBLB%^tLj4ltW1w9=tx)Ks@42mh&VhSb&=FqZg)-%KS z(u;gVJy;8ew3Tm`5gAy0g)n{qm;2~Y*Y5!A)@Ibqk^CZgE^A)*>R7VbR~R$tFRR^M zjyv15$$mqyrmez?few^XtEAVfzJ>G~O5qEAtyzU*5k(d?_bQw0U>p3afz86D$E`M) z2uEq229JjrEAx{Kn)J8N5C;Xq*!Y98XKr_VW8NO6SmbGgO=a*(2@4$MTbN;2rV->` z1}ln)nh5aXGE?r4I1d|B#(Nfr>iZZMd%OdQ77>LUie~<{R@8ngv8KtI-kC!S;>nIi zq^*&`EeM3NK1HkUwl3mgGKPm5o6+Fqj6!b0PgGm6A4Ui|X;?wim2HFYcx%W}p@7zg zVPEkM){dI?UsqS&+@i}kXCy~D>vRfrP|nZzC?%dlAG`kFyO;2_y*cZQ+YIUk0(3_A z-Kx@Q14AJE^d%7acw{MR9awru*f1s=?rSWL4G6M7k@m<4P5p9#cYDkOa~x0KXgr{7 z>TBHjTVQQFU6uE>L<<|mwoT*Cn6_dx`qYmj1`iY(m|)n=!2N2>A(Qk~gLwA`@B0YiXKpHyVWB#?Kgv5CtGnR+99_*qk@;ds>6ZraG zs4!w|UvL{0U4Z&FLL23Sp?`j^1w45H!EUWa)xlo)#H!2h&)i>b3gkA&2*NC^(&ZN{ znhFc3w`aehT`VqAC2wv`-#C>$8A)*{3qs}fLsJXGl~2Z<*BbGMpWf0}?(l2h`c^10 z=ctzXiIoKxx`oX3IK zL=E)-0f1>?h*uR0$_M)oVnMyRn`BpLSKG_D?iy0rtmm#kmY;hb}ZR3SShnHQ(bDKR3qZ?`N zu+(!sLK9GKhTAe;xz{Xd6^nny@HcNIkDC(coEk0_DYZH~e!i3v(cayF=~WP|H<0k%Pa`98bN{4#2)zv|JyK%NJ)uhbDSiq+;JZ#rGs$~F8K(fDwQNxJN z@JzGE2R6B)qFt>$&h$&{b=k#QAq`94GE*)BJFP(1rRW_jmYTJ6T znlz`&pUH zBKpx5ie4j0xxLpXtqXdoyd@vC5(jd9&kAxb+I=^@MN`%}R=oiVXT8=H5>`bpvcsQ_ zp*;&xCC*N%ya;wSfxwtP9 zM3l-Y8KO@Iwhk_6IJw^%{)5lie)ovp{@o|)Yz*1G*M%Jo~h(!^|8DBn(}vu zFYUa9z8ItahBYmss!cApnGj!jXyD~~sA=xpy+*KsdIHu$%I(h0^ zZ-a^EzDdZB-CK3(;bAiCLVos2o}MK_Y>ci?m75xbK*}>tbQ1#2Nac}QJX+zY$6#k3 
zyt>Us_lkRYy&1*@OuaawjyKnV&wsxhBV{q156x)(o12<3gl%68oR%Wk?DB}}f-lW$ zM+3nL^PlV?>RmGow#ne*2J)KS+j1ADTZ;%&ifaJ##mBjGp6hG4gn!%EQC&X0(jel) zUnuJ>EZzngLFfS5>{jsx7O`^8TG7aMD9p%nA`2P0MBT(V0;c!QxB=uCdSrdGMmk=Y zH~yR6{$>Ut`8Q?on@8FR=1dAY7@ySF;x@oQ1kJ(p($h=JQ-Lz`!o5|K2|)s;+&{gg z5FUIRBFK0;NnF9ahZqwUI!{zRFCdCqNnW@>42!+LY@MVN+7ks+?$8Dma$seNa{XU* z=U1TGm0AaVrz&u;WweXj@_$BH%w!|n+H^e_*$bl>Di|7J zG~e3k099G2>&c30psGja)upl6QykhWNxVAE;I5tOosD1t3z^%n)k{XOvA^4L*KK(= z)60s~Fxc8;{qi(cfJdzV@CetV4p+_Arh0&W6=9+UbgjR3+7hJ9p3b+Z@65_Kn}INf zxBkq#!_p^zWBI^Ytm`|?TQ2#>R(-(>o*HvXZZKPj-riD~UTyR?$AsCnRmac4fOzlrGp7 zbN_^9(GEa4{ouFNV21HTZGBVy2tDn^1bcPJgeV@Ry32Fg$}}WuzGrA#&FiwJT6avD z5oo~lkAZS>FEcBUxiEy5jITw>NpJ16Qmm&lWvMoa2PwcKNeu=SXPYYElOdS|Q|MkO z2YGIOBV7ZH7K=bv{belkbzY6Cb(FK|`Ya33Bl!Eb?#>W6{e!dj;AD$k^~KDRmJ$hu zNIl1jVQ!j>q_9@V@!t{4tKjYaP_`^dc>1@FVFl+Mf=#a_i}d5*HnQjb=1&vcTl-n1 zJlUJ5;P;h+3Xo&wN>|=dC80~2azT~2WgnpB2^wK`_21`shxL%Phg|mIqzYO8_sb&BH}hhsZwuHq7V%_ znYT=(KR-SQVPc^bdjD{F9gI)jb)IOCC58-_qIcKYh(7M}KRO#^o0^#@XI-E%iU=4L zgIbfJ)LM?mu&kW+`+65B=fPR*BF4kwxlpCLbDH+2ZtEgfrKjcGzgvq?Se@ss|1%Go z7>_|RQ6N)N8Uz-Gbb+jtdh4Hj?!C4F zq1Nick;e$y#ml~NR^b-k{*qxFDKpbHKMd=M15yw?k7hpoqpM}nDKAPZjQymI;cljm zum@0`>2of!El+|(O(%G1rqcdULU~O7f|-#eo95p#(6eV9CEmYdu0~^O(_P&e$F`HpUG2TPa*7sA&f4AcSH zD%?j$v*?of@XrGXg5;i!Q?*Jqi~dhgVI&pl3%v*6FaPvEwJ$EiA5FBSSS83Fzi35$ z-Cbp8ea6kD`~tE4Uj3pc$C!9G-scis5Ytg=XaWkeaFa0h|w(be#XCYL zU$wo^BQ*j+S!&QQ3P>J1k{9G=J)?!@2PqP0OyWgp~NckwR?PP zx^(ljKG~qP%Pq1`bY>AR8!ctQ#P^RPKf%xAIu_qhHc9Y-2CWh?v@-1_?3xB}!myD$J zip9cxG~)=Z3cSolVRx^-L&&0pV*jr;0Fi5FY-~$M-nUj|tDN;(6I2>2x zq2PBVJ?)zN$&!JB6NU;r)!t=(qjlE~o%o}?k?822d9v!8d$MIFDH-<<#}* z$NWB#!M!-4y+bUgl}U-Kg12kg#@76s3c$xXEzdq4wom(+@lL~kTdM3v;uN%iz+DoK zU)@`gu^#hK#5?Acxksd0oA=~+NM}958yK-ydJIsLiCb zpN2pMc(%wS848>V6GSrx)lhM=tIS{R#6=rjjL#5fI;FJ1^vP!@bW=s5yEOgZ`kj24pweE9doM_D#MKn1N^t z{OeXK^PoY@NaTjyaBWGNIgY!7VK1Q;pjB3A_F4^o*vi`3&EM;hL4cclCetnx7 z*EvkGr1ei?Lq@K1B}zZMR~9QT^=sAa6FLF^Y=G9&GwNN@*boD1*;3kprVH00N7rYnyTPO8SC;|NB|sv`iFJ 
zY+eDdiIqWlbO1-`Y@Hv9AY0Rw&Z6E<9f&*c^2;m)C6UQLmoya93}kL}#4CLdjV9|w zCRL=iuYV0nN;4WO2Uf9$ys@5+`H?gpv(q-GQHw);h)hKVQ$=6w?cy-IpQ$TE=O*pp zbcjw}-2^sD&wo-U<((ZqN+IAHC|T%SWtYX@R-LSopN#@r-ySv3tM}m-f%98+KD!X; zlA!O7wtdXQzv5rcsdF%{R1;&_ce(Q4%lJc_!WGFw^`~Yc4O!yBk=2Ug6bW|V-vRbH zwknNP9ei(NZO=5&Gm|F3;w7bGkc25?*#G%BS@NxIaHe5yhuDs>Ra9zVq82GHktw|* zH48qaTfi4LoHlFeQtoPfrk1(%`lxR;&jv0Ctw7(pk zW&QPQiJYrIoBTRJTtXeTy!vlRU|(Z;x&?GfN(PH`A)&5g^R3TrTz7@#jDMA7C!gjV zhN9PU@3NPxKsbfPn_$j*VuD%y&A)pfYprxv*Jzgyq}s5AomjkJ>9(3Y?IS?1FGA0}a6d7&` zH*IEZpE64B(ZDzLuAC+_=Ux+j3UF;Ijc&Ie`LlVP16-9j59jl``Pv9E(+~6%ob4*M z$;;@;`mMEE^&Fl?l37JUO;j{DMl>ov4D0j9&%xRgU#*0gP-VKSl_^MAlTkZBkp zh>}v3%zVbgj#B0b7gdC|vcri@kiv*75)S3t!F_)^;knQY5u$^Se}m!M=+24|XaDgh zX8^a&9a@Xl@JWWs<{?a9t@?BqzITCsEvE50+~4t9W+R+Ju+q$yJ3Khf5m#Dr z=e()WmH)X}>VMV3-$}M(d8ptr^1AKNWC^lj51Lk=8K3HAH3)LS!=`gf;_Q?2NP+)# zW!>%kRkn4%H4m{huP#4JfB~(Nq6l3=oXIl^O_;+P4|}FKd2i6o3nNA7i2R&iz^z&v z3U>&Kyx5tY@*pak%%~Q@>O(Vd$)5jfq6%oLp+memHzTO6*7?33<47OX@r2Es?6(-* zzx>w;a7e@A`1H)#KorvL>tHo8_sLiD1ufqk3c8pMKFw;gQB#=~_kQTxa7Wky-ocv_G5@*(>VHkG zUz-HT^t=e1H$@-iUZs9=o@UhTfeS z#zWwJTk`1irA)S>`Wbm6Sbh_;7$4c(N(;(wWqh)_FmCbvnH* zzrmjfX5epp%JUw?aqp;`U#uqvyYC_A*=H+aP@D3eNi1Bdpe@70g&!ZZ&aZlOFHu>* z-^*SssGhQM2iF&?>M0ieI*h4a7bVL`Q}5vTstEJ0pQ~4K!uuvNK&5>PeEQz8GzbaP z*IZ%5+D($Nu3Hx(DOob+qhDbQ+yRRDr}`jW@6rhL(*~tv9tGeRNufUe6ojl}M?whn ziSz(w@xG|n{QZ|7Z|J1A(k`+mZ3dhCbG(F)Pd^fKtPRYF3`w87ljbv{rt4c%tiHY} z4sJc>l$5)olUKLn12h{D37ORJdBH+-ex;V^OBS@2@xD&5-8V_*zeBH4U=EZwogg_D zWkh-N{Ocaj7a;34M6yGODbsDs0J~YW!@0r*5WIT zLp7Qrz`M9!9-_%ZVD3}cPvs?Hr>srU!_zqfS>>clh71w#rRrq4_!5C4UCbaq8VIvBkN0P6o*P9B*+D2I7HIUvvj!D%Osu>yQ#0yx zgpT@oDm>_tzy)vBHypg=A^wxH-?n(`tYDk2P;Ov^@kYTi9T*VD z!JOm>g$;TAJ-(Lz*xch!fDhWhUREODy+;cJ(+xlk(ZRcvQPrbxkh-g733bAhi#Zkn zqeKB&i^kL|1+Lrh_PeVWM=IV&y##4#TvvX%kmbsFsZI1F;a6m6ILg;d84A4v3a={A z2@jpyAVz>>Bg9)E$I^n2CgXd%ZLCLA($QHPk8>l&2KHy6^oCE29gR}_dY$Yq4+$#< zZLa7w1TxfeWKRXn!ZGR42Ft|cg-xa%sBP5cgV>a4@nMVW+ z;jrq07P8?V%g_(%HYBXI1)__n_#!h@2N*<55|l&UIvJU%6AXZJ$S!h35m*4{8f8Ex 
zt7n5YE6pdnnzluCm_p=}noYGEtLQh{uB0Ugo(fozDEvHYltlU7oMi>~=YqaJN&JNc z{M<`7{CjX%@G&k@oSV|F_h7_2S;L$W=Ov%ySp-e6#0P*0b?MJ~ zMycS|oyZI^dTJ_La+A375MoUF_R8kxbZLdSV>yLgkp&hkdnK%Utv5Qe#r`-|&=IPl z7Y6cYe?zw0G9;6naIJbeLk$M}+GD&g;frz?POtA7$Wfz!D*I`~iM^xu)XSPD>I6+y8rjw*a7zurTo%4EaJ ztazh#nd%+e;%)c9{JjwxWygQ?ORuzi6A>XP2NJaTO`DqdC#D~Nls4RFJMmUr0f#959u zszKh%9A-ea+)6Bv5h) zN3F6bQCFR$xO7?ohfNc=*8T3TYaU;CFT~sR5LH^b3x)m!i#Q(e6>1OQri@&l4Ytko zef73)P8j?m+d`!m$`7#?v}uN6wT3Ys%?;iDZY-SS3QP!Haud5mb9v+Za)x5)I*KR1V!5a_ zudS4m4syjl+32(6Xm~3Q^(<9 zXr$;VKd}017d6)+-SyzQ*{o!M5aQ$zY<$R~yVL$V!xe4UI7Ja-8SfYY1UK%;`5&T& zZ?z+phNA|f@v&9Nb$BC3%`gYWcT@?7M~qAoMPDt`MLYF1pWX-t>xyYB@eRw-ANPV- z%6ERE5AINLHiD7&A#_?nttbv*W=rx5G-ay}r4BnyeuGKWvw(N~2# zUt8xT{6)|HVgJ)kEo=>1`rp#~1-!{$oZY&wZesC@bj}R+LPD>%vE;d|S#+AWqu0_{ zPHLTeUrF`IB5J;}M9w@YFSQE@ai1sRq=&uu8W2LW(XYdJsEs8$74Jo#w*G5^yKKWW zw4H}{fJ*BH>|6Bg_ro^Jh5yG0w>puRIM&g=IL-`vj++F7Ec*RjAPs2^@^%8lU# zOnKwaR$b&f;fiyS{<&yf%~+u|FhKeb&2dX~59s z;79P70fy2m+DCD}A&hv@%TeSSff=96F_T??Iwx`m9$Ez=Cfa@ki131Z>p|Ge;}Zyy zb|bY@0P?e7E&OR{NYReRg#6m?)7x3Y*Tlo#?fW7WavKU1a|$(XPBy&6Agu2_}zdOfe zB(^f45us_sNd?REzyW36K;@kqf7|csr&B_kyC-XpuUJmS?u^3S#Sz& z=Uw;o)3L2q2D%}_`l@c`t<$mW{9!Ay*O0-rST(k|KowsPE+vi;fZ3#-tQ$M?K~t{;U{C#6A{st2gNj7pQc*r$)LX- z6ki5+v4A^VeB@F3XltTu7dGym!ah~(t_R{r>q?gf|4R+ignLyWhV_z%Bsz;sW-D3o z_&!LxHJt*7;QXa7OLe7{NRU*IG^V*9?RIRe*6fn+qPxk5G3!i8G;yIA3pP5XU}Oxaja!0HnB zU%nHT?zHgRhv$jHc`VHSyO_r^f#{iCtxeF6DXIdwccWK4Bp60F#3p?Yv3h#;?9^*k zlNjYwt6%NB4CLjJy?7=ovZZxSut1ApM_5)uwKEraLok2rl=#r{JMy{pq=9u`6t@95 zvv-zJc3%Y&2g9DO+VcAIfEW;O!hVFf8}?C@OVAsYs{91TnK|d^_wtB9x_ara`&@vx zz}|3m{nGP2I$=qH>dQKrwP#1Ln>U&phH48`9AUg~Vz$BU8GBTRm4FCz6$$xG_L$1f zK=7hF5=+=U7@z#Ht8>dlNnxobVgVtz-&qWo8$^6dZ<9H{aUu}Edyy$7&dm80jDvsE zfcd{y5~eHeI(}Isid9qZiqrMb1;2%C*UqPe^tMASRKahIM{D?kxBQ=gvZme5W20%% zgtJ(M%;^r9xMeWZ-?gr)dYq#Edu5Yp0{eC@z{CS@OA=eE2?!*Fqx)P@8iR`0Q02F# zgB_{VuGItqTH-pHFxb_%kWgOhlJ1QTy!*Bj_(fpx`4tPXGC}xF$ zPx|2FAWx&C=97SQRW$??=_T59?z;F 
zg<*-cMMDF~o%RPqn|Q8#bG6KS44YRe8iX9+{ZnsIa0yyJq|c1wge(*anuyS>O=ovcnx6^V5}V^$A(TMQrYjWvHTB++*(tR&MGS-K zF%g7QWih2-yyOfbAm{l}fUe2N^?cf+u6Ly|pfm^y7j{#10d+YjKd$&-v5+&M<`D;g zI`u%fBB|cPbs=BS|0I}!Y8}Pzg$Xn9on5m|pj7xGgV1V>Syi^b|2NywGs+x7CI!!_ zF2(3QAhQ~#2837z0M4H6!u#qNwdG8Fjaa6USEp7Im?PAS;9c!kXNtJ0J>}|jx0xgW zYFFTZlfJNJDb2X4C*_yVaN^-!4yuRkfLyy3s%UE6&U@_B#HEJgAU`^#7dI|Ctt36x zSIk4O3H=EinR_Wof1?BYdUdMLPhaz{2Pr3=vm&!XPIsbtfMZ)G-0qE;J@T z$5{!%5${4NyB6Kq#ym?uzyb>_5ZWg0bXE<7I8=VXSr=#g%;pgY`n9mxb@J_`fRMd% z1s*MInPw<7s^?j8oMAy{zbz{;o;6$VZ12a5eJiE(T1#zd1Vp6c=tg))4T8uUgt_}- z=Pxp52(46)@0qD?wkblx8vr~Ur~&BbKD2Es@u~{S>cCCvNZz&B$<+XmVGWF>uJF-V zU0FM8eZO?l-zqW1=GU8|Q<1fTQAD{#X%y!VCdt(G@EvL9CVcf;0(nd1*kgp*r2fd7 z7_1pb;KDgNNJp=qY}YjCH=`Hx=z%7is1bI zLvQ8)g^^Js^dc{;ZXFR^OYXJULJ)FB9c?Q%eU^tUU(?RCQezrPqfp|i7PN3bop72N zsnv+i{sE5-BpHc(b&Z!?tS5U*0RU1qo%|m>2TmS;QGRxQ#c#oT(gHG+L5OYtjk&PB zi{-+hGxq^W(JvJ`f&0KX1wLCCXW7Yct*Ol*r&63NS>jOvAncc`*p)PMo{|4Ct2Vdw zACbD;T8(Wnasl~Z#CX5z)(nZ!<8kLSA0kz<#5$*2f#jJcy#T`i#tE#%fe$;mB?YgG zL-^Kvt-=pI%#op2B5UIXxFKUbyWD=5x3DL(-$D5leyZRE_pp4-Y{p0@Z!1w5hjf=4&J9n50QNmvu*Ur^XKQK_K)D!?}gU`sI-mysqM&WLaj?CDo4iR>Yc6=Wdt--eiF`7Yu z_ss1VM5Xr}0TnRV*x zFe|YFqQt$6jpyjp6jg&}JeNtkvq;Th-ceBlQsRv$J3^4KL zW%he5{?riC(8V1HTXwnhd#8NOG1Nths>KPeg|>uYPQwi7woNilgw!?OlscSymy@+I zkQ;to1LZF6H__ZOmrUrAQKI|yxFHURtVdoVyLqtZKt*0?u0i(EDnQ(F_kQB zu)h<)79B;T4!s=OCD20Nje5Akbh}t>oiZs-68d*!U8Uuh^5!(5CV$hrq~poMuipA& zK*8g^<0Q1;4ru6thISit%cCYMf){ys1YSV+U%Jeb4MUM;pZEp5fiDo*SSB(`p0R!sUJoh(9r`Zzf>45q2H57ZNNg(LNNdkmcHW2~Er@Q) z$o;r8px)|coH}}Yx{urzB#NEA{}lj!R1+cRaB2MKN5%+UBD%O<8g`(Kn-a_YxKLys zH+U*OrJ!!g{?EiE)q3k3*|5J?BK}1SBpK6&Cj8%s#QVD$Ft#b3|{PHCXUBN(7zy+U~_mc7wDrz zb(apD>Ge|#qGME$C#okFdJ4b89Dus$I;oHSN2v75qYZ!AycUJ5W}{P=0sR+>6WxwT zpLqoAYB6Jc9@x^dDufAp`14(a3i1|O@;#AN7IsqGMc{wyfE#|+)q}N)11MtY3}t|F#l%cIu5}_ z0J8^+haE(w8ECsqCUVe8n#EfZ*81>#wJWt|AGrqK`79wFy&9@ zi_%@^PBZ5;x%#MhpLtx4 z;Be%%#I(ZRJF`UVnI?z5gD->N%6{COCe9wbS5ZFNn2b6yXcf1Mrdso8Hu%$|Nj%!R6d=cAxRQRv6W 
z2zZ9h5pOz*tH~`hee49w;Q#xM4-5tM*C!$7+2}hJ+L86SBErQa27pbhETIn7$Oiox zwRw_N{J7fGJ9-2`rnR)wNy_O|d#hqU=jXpVgR`COuO_7jc11i1&LaaO9hsmj<3FD$ zN zwzUl0sqzLI@7g8|nBGg1n>k%(FY5cNtRIGOi51SVBQ3_d(7f)5OTA;`ieCQ_hs`n= zeHZ}#oV=}lGG)NdNg|g*f<2%py)*(FN3yW+3bWYD1_%jo_ISDdDlav2E}a4~lboHD zu<=m;SNzh~C9P`#y{P^kqM4`ZjzQ6{&x<7Pd6s>`E30+M-?&`AwUnqq>#BZg?@)Ce z3{v^}D~a-o3%$rfY?{KE=b}*^IM31=c`>Ai0_P{aIqyN_AhbcuwD_ zh1U<$%W_?~Itf$SQ1+3mfzQiEfcZzZ9!85TkubOc(f*kglptAK;R$l=xGMiu?T^L9 zAo~1+4g`;4Dtc)ozM_WVkpi30i{Sa2yL;wW*2~1+iiX#Y*HPQ!wTkw6BRFKqk1%)# zv50?x#!y^A9?aq^pd6;o?mLB8nfN5SD60x}I~%t%9S?$LAJR*|bC;fct(TL>JCMv1 z=%LgbGk-rU5OIj(Ufi!z4^=pOwsjpL;b-L5{7wI-N54>2%ml8yGEJALe2L{2Xx-PWK#a=*IdJmjgnbZz4 zU?hW2h5=$OKiS3sPfv+oA6~ssAzKjV3I!5kc_` z-GUWXKKjWQ?o?oAS^DVKMKk`ixz$kcA59K}c=L&8nY!CT2L4inL77hP6b&C`%b(cu z4(=%~CWaL-oq(~XVsb!0H6UzBL-CV;L(lqcR$!KMPAhFWk3}Ks5cg?pB#%g<5WQ&^ zLY%KSAFD$j|3-E4SB-@&dDoJAH<%jjtjQ+mVU=@4Oy9V>`y0;0^)+%o@sR0_)a%07 z^|0*qy{5qRkzbRWn#@RodUI`N&%Dmhy_S_TS{`J|zkdVqM&|$6(pluk3ifE#9hZK* z>j?BoD)2F7-i2NC_9h#bCd~;H;?w+B1WfkfLWm}aurzy>lXmLhjR75PP|+-=AVkj- z9HLJ*cf88~s8>{)%xsRa2L3TJTKW3Vd|9}|EuttgpwC2=olD5H`zjY2c~bu<(TMDY z8h}#I2;}E-o{eTP5U_406JrrKP9)6^%oaNEp05OWy1&Pow(e49SU&^vV|r(%f3OYpzW?b&Dv27NNJ44 zKn1DFRtA}5TsKnq*K@aAvE92=Ua(0*Io@+QIQkoiizSu%_N zmy0qTx?RPW*-dz84DZ}t7r0>!dD%GewmKvM>ikTR4sr>*sVxBeT!63|B3?s87qqI$ z!WdadAXhVw^LiuXO7Z(?HRqsuBI$L`e?k5$`NY-Xk^i|>h)l0igGSJ(r5eJd4B+>V zt(t^uRiTeO^%Hzaqw0WGJi)`7*Ag5Xv6&A0oAj|GYttk}YlU+ZhwdkRyi4;$ z3EC$UHp)+$F)L}IUi)&Jr&~T@4$SQ9f9>v839^4`;(LYH4p9Z5wMlZTh>5E#O%0*1 zJn~_jcDYXyZlkt-sO$8N zz5B==j{PlYu+hG+)owgrmX{M^InroSDUCv7WZB^1nT)6>^oJ2_MHCPF#A;<8Z7Ety zp5^p2dre)4sk2DP37s_W#%ZQD@~$@DL^i*H=?Wp!B^=lEEfS8$r8;R0z4$Zm$I7c_ zYMJ8G#y9-+mUic<9exQrrPQ3Hi>Yj`aZ{x96OPL|-}eK$04bw{-U=pdXozp`=Kv`l zc7QcBYg4z_uy7qV^9K>PNxB;v@2=~&9-0S}X+6exP&CmNKC50w|MrPEU!uAV%JSV7 zr2yU^`-(1EYz>|DTgDOlmBjB~LtO}zJq*=-v{h12AyGz4$k_BXB#Y+F-9NRQ5;EwW zeZkP~7Y4c1!=B1^#uvX=c30Cq@Hl>gxAIbw5&5{v%pDIR;SNOLAIEhC9JnZsBUfV540=(+jugn%kf}1%lJrpj^x#7oH$M`t3tt0I 
z-~0*(St_Gv+AYzRw;5?V;p&&d3WP|K`I6@y+RuzGmA*Me9#2RO6}}%OSU&D705<3T zphAz$Jl@#qLJAGKXw`~}I0x>I>g}KAkEwD*`tW|SgngP%w{Ba2z5U`RWmFD?t2y$U zL7=EO>r0#~y=C0NQfKt2M>EHY0HpsgAeBCpYbBBPJ%USB|HN5VqBfaX*u0s=Vx%|! zXv?aG)9h9Fcjrh|nCQV!N49j*wdtt1pZH02>oL-q=w0YTf)aB`fc zxy76YyOZmtGxPmuySO0>(q1hp{fYD7NJ22>==Fz5`!NT5=1h(RV z!?UfJ%1HS~L@P$#BV zCiZcuQ13UHWX8wPx@*P7mTQr-cUr#m`%eEJLyyi>&W?(&%4^=zWHX;||E&lGZ&^x_ zVQ-IznOA0Bx@-X{;ieKW&$M*F_{129U;U*0J;8x82`hdB;;z1R-$p&LB||n03@ynu z>}yw4Ic{%F8@GzVl?P_dMCqtLf;U`S>xi07#!2^q_2nt^A`MTpSJKaBN*Mm%F{T4v zY4%&1>PNK9M4EETICdE;gcprqFPr#$4EQ_=Pf~1D`S<7Vz=cO#bV?vC%W_N7FE|vO zPgF(!s!EA&duyiu!~46PHPY{Bs?w$W!vJmk3(0^Fy9WF+?i|v}CT_J9 zgEa^zdC-%U(I3R`-~k@@mEY)uvY%ydh0@{$9=;po>|nY+SGP^6A!^trEz5*BQ_z+k zGJe24sSfq(=yz#`D|4txUl@HI1oQmIxHxOOyx|$e@ZG6LZ2|A14F<+&W^z+c6vm6{ z>uyLG&thu<{DqgY*qqdz*~hq!jHX8}J5ujqKSHcO$;eWKfY>}!MMSu~*+oxx)O^zr(-j5QrH87~>L6&De z{2qaJ`Nmi>%ew9WzLz-|>XbC|3ek;X#D z*5u3<3B0;s@FWz zORmJDs%TglKAYR6rn)kwZ`QFscs)L%8do{%8{mHzn(6wp5vLU%#%%6agu#xsE%MAP z%X+>~^Q^vzcl1*scj4N!x*ryKH?wP_-S4AwG1Z=m!aKDwhz4k*@*M&@5{~^X^-WvqLA9=Nox}nUb~A5~Dbxv* zRR>ABucFoQ-N@%{r1*!zNSp&M)}}ks!9sB*Tor?gHc@8iDYHz=#gh*oPBE<3{}aja zfBww-f;t?$X@aN;rAcj{!RuyGB4>VA-s}_Hko05S{iYw?*0hv%ySc9XXQ~{Tg1CNy zU*q@Nu-m@HxH#zlnAgYP%^e#ZIF#dT`IBINxBFUpc1w3fVORk<_uLudUKPFvgBTg* zH%)}rSr?_9RFv5eTul!QqM_iJ_;5M-S#AmQu5aPVrWt|@ z{L^*XO-@AqY8=<;7hd6KCjTN9<GBC-BVw?=EHV-a$=x-I=Lpgpnjlf{`X6&VPyj~tGjNnHXx+eOo(3Y zA#2LctNWS@O#;9`_20LmT>Y|#31VrdCamO7FuBm=&EtwHk{~Uz^0`+!4dF%iQT?gNZLbs_}JoLdHWS;+1vMRWQzC-dMPp3Jpd?K zySVG}UZzvDG<_^KQBl<8lWIs`#gsDM0Z+qyjpUqkOASOwV4|5JLT1@}(cuE+l8#*b zWVTt$g;KljD?w%Q9{Ap;_pOmMUCx76Z)(Eqy??om=hzBv|6n#0;qW=?AHTrArQ;z| z#UM(z^pmRfU1HAkK-n+sWSrMUp=*STVn`486&Q zrZFTwR}>gx+ZaFE0t9B4J z*wxE(cr%^tC7ugLZCaC?bK2YL!Y9L`O8hd9;T4f(V{yw7Nj%{xj-HQ z77b8sr|StDX|Z)wN5#~QDuPJx*57r&t$JQ{+(XO~-VnEtdZZ^dPYIWLrnt|}S5tNt zm7Ezi6ulsG6Q7vi{9K^6#wo%d?Lc7r*dG0)1m}((XleNmIMlKjB_L)vtbYGB zF@r>g|3DVGlFE~&5DtV_Q-FeR{}u$?o(-Fv$!8xmVAWsR{v(I()VIqL1Mr^fLKbe= 
zYTO0UHAqwlf{ap8>U~ClFo1|9+!zogE0$DYiZDsVdxQvl*Cw7~OhhSV-hi;l9TxL1 zWisbWrMo~8Zl{a`@GD-p2-Sshmyr6Wj z`dD`-p0xh?kgq__VKy+g82H1UTbMwRw#5XxH=uk*LKi4Flmmz;yvt#E91WI`*T_q_ zppV#Z3v(xSv_B&2#z&=id|Apie6%P%oWBSd1%n{uvc+M4HZCm&h0^E`FgXP%%yp+~ zIHXIFH^um&XmOzeb0oe*Q(kA*uytwm9|4;GsqTN_tw@dli>*$|>OI?wfWy$_+I}&j zChlIVdd~lVqs%X+1jxWq6}RwZ9siknjeFn48=5}dFfzq{G5r-)gQ;ZmdZxyXyX6zX z@V|n?e=Hl-1DN2T`!iNnO_-3vhzD)&bLl*DPP+3T4r_x#Q9O)k4k(N3bTEa4)pnsk zVS=+SpC$!lhki$)EBfEUF!^D^inK%8?lk!_L+$7K=jsKSqhM1jbsQ2u3fw$|PPcTh z|7);W^i311B^))#m}#5Jx)ka{XHGYW0EK0j)8jUxJs6Z_`5CdhRPbsKfAe(JMY1=k z29Q2H5xm>#M=t?F{3)Eo%hNMDSZ5OH!X;uEGtkzu0XF({9zR{GUQw1PdT)^2g7@uU zT|b*i?;8IfHEsI_7MGMMGIglzQDC@B7_v@;36k-2D9?LC#<)(iHvhcApC1q8G%V2l zhDi!MXEtdB79`2?ZGi6Ue=2-%l6B4iAv36S>C%NV+yx)0@X;<=wpnGn z&5*9CZ4YSwH>kX~*Z8|uLSRa9a1JE)Rztp25DMf_{^bB{BD=x#$-C$gho6~z=nVXM z`zWUeOUZNx^%ub|r-Nv4S%<0(*npDk@{Ln8R{gRbMldZw%Pv8=4{NfoPk_32wNm

hZgvq2y)tE(k#OBM)jjw{VJ7{DkmozTam_6#JM=njn(OOZ>6zQS%|+&_tu|dh;D${z2V$>UE=C)JZs&zn@6dXC84s`=$0$)k6<@a7 zsGfwRUU7DFxpXg-deBiMTCUu(lt=q#S@t&f27Wti-k$_)$%Zo#B$>-1y_Z97X^BIy z@igap;%AHeIhsQjz?k>wjlu@tm4t>R)TzizZ8@1vn1`p@+$wpni9YH*rYP##2ZSM@QM2~G58G>3LM3G2s zQTc=yN9^7rtt~{^A&E}sZ|9RwWdQnZ6_#b-<-RpPb6(YMi0FSm@V{`AD-rs)veJnH zmH!x6@wt2il}fP~0hk%c7h*w*MPy*UP#1YtyF z97J0d5cR1P#+c6|Ug6x`i#gEWRP)5E*(4QC9@Kl~I2@nMhc65v;L! zPqyz&+UT6Js6{^0dkb+;I2Mg*4OK_z@a1S`9=;fm2a1E1H(+0YGvpm5lS^PzCnMOF zT4NzvEI8mCw#1?r^t$?PCMX!GjX&gd$IAHgc2E${n5A!!{!1b~ZE!ntnW%iCk@z=c zc+z<0M~e7nJMi%pgBSSjDO@0GjgRf^Dt1iftQ( zU)e>Gz>SL1(H8GI@x^RdHg$sz^WC(N;VxqwFoK6Hx16+4im207tx;N~Q!Vc=W`j0DF;`-mZf$1c09uYMH()) z^_*qEX$5&g4_jm;^QT*U7fx442|O%3HKU&f%JBvx|8Um+@KvfMZ^r^V*0&ja;4n%d zad;*=5_kNTX@rFN@Hb`%QFmV9BXt$tY(cGc+}(aUvhp?+AyKsoNsT=Q(!*QZ@<@55 z>>Jm5%ETE&&nb~tj2($yV)Ohr0oMy;9lGbB@+i2=e7vpvVo0wBBf{VB(rg93*wH5b z+Q~|`0rbJO`k}dwv@#48`2W~a+fOp^S|Pj_C$&aOBjS}I-CJbB#%b2wr_k?IugO3H zM?|#FKy)}E`q$6Xfs;-B1YCuYTH1+q-?)LqE|e#Pxf?WUrkK>HrrkDPYW5=y%HzwG zqr%);#RhSAAojDX48?_V3&?fFp22AJEHivyQ^8Xt7HNh&zi469Q~@N0#hSN$Ed&m> z!_2cgcMU=KRnr9Gu@F@xY3Mg3X)Uym9G#7uTPmtSIimZPs)hax&r~1Gk|EU~oMjC| z$?5DX@gYtrzZ1@kt+~)~VNmKcoRU}AMRie-m&jKo=hBIJ1E2J~sz*lS3ORF?ni^HT z9Lz3xkb$3{pSHI?x@|I!s4(^Yf#tvHCkRbJ=l5O0)<~&Y-ouvdeudAs#R-)Q6G0l)AaNP-GeO zMoCYi+d;jtE^T!)#q6+U5vMu58bQK1K@{VE0(RvzYN}W<@`s$7A8x>8FnhF{oW0OC zBgM{x7|chaZ*9JRbtA=-`E;$c{4lIJV3yC@TyS~>rapz>qs-44B?)Z9^hl*XAEcW9 zAb8xVTPPE-9-n>uY%;TDFhTf7#?AW)S3T^w>mJb1H~FOspk@SxMiSCuii;N(YUd*$ z4(T>=*ZhBx@(=>Hf%ObB!u@Hx`PhLJ9sbCUi(#PG(2gxQ)cG4ty=4iM5k_skI7C1M zo5pU!BUxw2?GJJEOQ!QkOmO}_vZT#4I#x^ z=tbMLU2mA+;V9I~E8fWva5?LFN-LY&J){i#_!2bUpa7ZXn(d8pB)S=|T7twwYgrN8 zg76J-AL-{$HGBr3665mtA2p;lSsuSBpfFr+vebqEI|hNpeLG}IJYFivh0oix68rJV z2RLf3E4Tqh1wYT8mNjT#mwNKRzlxfLz5BW`QnmjmXX{a~_coxP>!VBB)cZ`VW~HlC z1?)Hhpc+6+r98+OSAVI~k^vK(+{#1oPPRfv;8noh$-e^LHb2tegLSd9$6(|{r`XsM z1Y(M;mD_?_KTx+rC{c-qeiggE3ZW*SZ?8uhXqLAWH8vtC#*`vB3`e6;Cs*LUqWsIm zVrg3taEqe&7-~|u=~SrJ6=9If+%89+*k8hVTu}ShzKodi)(ygXOR!{TXLYHAQERLt 
z+sO|orLwb^|2RfSZu5x;A?rz|I$m!-!lkqBTqM2tC6DUrLP??+Y!`se^<>kNwr%I` z#30v8F}(x0;C4Th$acWPCY5Am;6UpDunTz!t#6&LrMZN1p^Y@Cr$-a$KTA1TuNJi) zp9{rVnlrjyp=FPE)>LM&pX)brOHY`&*73CFHy6;9r_D-UW1*-NJ4aq<_?+@-lj zds(4mtWlOPH1~A^NGj+*dOzwumBX?aEW}{dGeGo1x{uaJ*2BQcRY{BE4J$O0x{Z1u z0^-tV>|5pvf-<>WLASy%qoQ^;M(TDUq&f7nz|Jmf2FLU1wvc9*d2+n6h~L?>$zCAp zy&Qmj=`Hby$hUJLru98nR2Xj>ii?MR{kHMytir@(+*;gKSfyP5Kj2w5+~eo-Yz(*U zkecI^Hjj@HKu<)!-c8#~3JZ<=#1=KVD&Z`3hbc#TP!DZudl_ct_xB*^JhEbsW->>; zm5nd{qi-YzQ&=UT3bfS6`o)gmr|taxX$_~vf}XB0e^4n7^%=~zk;qx z(UI2S9YQsq=L55F7E+f=pg^;8{*=AxDBC#e4R+w>H|TIraKuoJ2hRXs)J{Q~k;(*a8*ZyaqXgUF2J5;QsO(cwx-+My@tr3Kf)-=6K@H;u;K*_fl|mt$iH+KYjBG#MGxnR5^`~r#GGqdLZ+E z<7nS!h?LMT4tiKBuF8P_7$B-t{@rJWRC0S6g!_C z=tY7gLf33v)v5oX&^wKIFXZ16z?p1@I8sQg`QUWfmzQ!x*~MLj9tw;^!YCP@-A=w+ zKXv}jGDU{aR;oYFar^-ILqr;oi?UH&`rQg8_;`ac*mZ*IR4gB)&0jwTV3U;*NJ0nk zXF!pOkOgi;cWHoD$teL3)-?-l7w`=09DO2%kPh5uUDUIO+7w0szZ8OF{H2m!$NIH# z?JAt)*4Z;=?e4-=OhsENf;aO5VO`+01ZA1mtvxHeI(=KkF@ZsVw$;i3QIGJ7{N%zR z)IA*XIsBV9nz$}QYo}MuzeH&5i3F#UuC8!He^%L}wsR%|yS2?N zJBTX-Lvcj`De+X>Fc66%F`gWyvsF$%c+^%eq~>}?ch)4_qbP`eAX=YKiIV$Wk7pTq zq}Fc5cA-k%E|0rcK^~vz-pfq*PW?c28j#}7$Rt?DE9^#EhoOyUSKEx=sTu|x4R5}> z(XnQ{jpDwu%9nNMg;I;Xl$-&v3T$>^u5n)+dB0(sif=F?x>|!B1O=)>axmyRYM$F4 zAZvFs($QK8kDq`g%t-ZtZ9HebQqi*-aG}zffs1$-W8}M_M(NwL09gjHspcRTIAVGS znJqD2h8}+v$4@}pq5}h1mXxJs1(dQ=pw0Kwv2XZs3kCh5aaINSNlE@z;<$XH z_W$)3aIY>k?_|vj7Ar*gV*rby!xe7*<2lvLyg5<+`lz-rU-IK7B@(RgoJ~w=o|h&rzhEX){q3Q~vvOFBG9YEONC27Q38`o$oylU*lCZqe3bE z2vRYt?9QkepEnXNnRJHOJVLdH74d~jl0=BpHo55pGyDj0*-8Y1+|%FnEfT^-sd=2j z8w7!q4`93Gt~s4yPsPM`2WVsu-+7!TVDTlF7jrW8s?pJSfdk|qeF#%Gdys)A zkHiBxHA^66LIewCn|vnJ;t60{^v>Y+*4a=8gqKp0V<{ppm5wgofYb@?LWMt>O?CP% zecl~4tjwQV9yhOhF~E#^f|rGO*RvYn_0Z@fj~hNe!U3r_Ta_*uz+>vylba^vghw;> zE|8jtxz686szIce!SwPW$+9bS+#K8s3plK**?Z{HZ;ADR3_K7X!cu+NmKipzk+7KUrquJ~R6)FVN>< z{f5&ef_)nZR%a1SA*1Al{usi3?~u<#6)1Tnjb%qO)Rc9xlw=+;2@@Rh$=;#+8k#hq zC9i$O0xxqAil;Tl>}L&n-OSR661{Z?cfod)V}6H^=pzXcPQ=C}?13BGtDBgMd4)WN 
z=YM9B-Txs{9G1S#u2gQ{kjNta?h-;IlmTH{8XK|3E*{T6dXV}!rqz$>r!=mB$bW@1 z*^vtw;uNVTrvC-sfm9HVdO+`DkHD=wZDA_j%86_c`spJQ zi17qS$3V{h1KSYT)2_K*WK)&XCm^wlcE%n#f(e@$fQy%%hrFog+LcbgFm?1ZWp6~A zCq0JPX6rAA90?jLq-spX0@`JvoIo)fM|Vb$A2BH`iYTg1;mM=S7JZT6ji}J1-*%NG zOMwVqgyk)I?$GfTNs@Gv>y%q-m5DR9G2J zV~GzSx-yP52p$;ao0A))*XnyKIbEa-H@_B8)qI0^**;h=qV^Q!8muw|8&}fBKWSO- zX=2w7)|zV6%d7lr4TxnAZI!07tueyUZ@8`PJKY+~amQR(0?47|C*r%SxT)c>cJEnp z3u#sWG{=(&nd0M^2CkS2ORzYQoKP7Y(LR$%3tsy3?yIO^iU;6D2qjYcOBaAXtK8qA zWT4A@E={zrtx_Q=wd_-Niv>CrmO$K$yTKN->cXa8xqy0CK9ou_#0Qx|Oz5sXEKwUs zdyjc(X8^QBPoku7L%BqsR{$O1>aTaO?zkQ>%d?+!dG)ws36ZB6U%MVjFfXEHt0Yvu zpqF~|?j>coI1U|m@8Mr9J^i)AhfZ! zX$i6Y2w&UV-#s&CQZhS`!Ode|vXI7O?n5t)!p_3N*+Wa$fb9zO(gss#MlKgl3S0Zz zc1xSBQPlF*8U>eVbpf75RVr0qV7FwK$#DyM-TfmAjNrk$coj`a*m4Uhk zEW|x#;k_*QDk#cHd)v~j=5psU4(s$FTQExvj;YKtb&q^cDk{V9MFJd2HuN#-#$2oT z1_Ke9hbBf9@Kf!B`{vIO>%VI-AgdzRCy(7YR=-m$R&#$OeXB*>iH~>QkKIIbv7&W?-2q z^FLp?A9&Q?PqpBKc=Y@zt~}js=mUY|)e#H`7D)XAU`wnBRz=q`FfMwlMHQ&IXFw5r zcJiV=kEyYucrrdaVtdJ?^BO^1dp+UI5HfoBlj)jHAu>o7cJou0Ig~QtHt&y? 
zg>j&;uit(6^|ZmH?8Hr`BSXN>f+qZ}ba*iP$>8+=n1Q{4g&lQnm?a~h-LHJF!!O;D zUFqd*0G;oz28Xl&4SARs7g*=yYJ&E47>zCEjmTc8hoF|a1eaAR5R1M&&0@}&Sy#|q9nl8eS z?Q)e99`a7KQhH2Evs8SpUcV02)=8@>L7k8XM~W;P{zie+XzuuA?FrqLlzR3X$5Xaw9XOx( zwS6XG*Bu6s6B$HupN%4bdB2qr`PDgSmt(6Z&cl<>Ate2}OQ1XaH^xQcO6^Ueof{v( zNq542bk7=e@=!x&T2St&;nlptpK3oSt5IE`6^y$N3lELA;nPi@z6pSGD(99zjY=Dc z;9n~4`ce8*P9MEkO@eGlPaXLU0*MJct6hj!xsF2_43&l*LM$Lg14;dE>b;i(n;+jr>Q;Htw%N}>)C(%n)JTkQ$4eJe}^6_NJ zPsH!lB&CMT#EFVM6(skg3Rr`&npz|W5b(SCPANmkxQX7H#8tWnpV;m=w;C$q^0&_| z&3ij41uzWQJOF219$QD#ny;*0sP$97aUsXo8w6yNp*YpHk-I;pNdboRxkex1dr_b9 zSXi!^dYBx=F_~5*JgM0RA9*d-nnJ%oG6$n+mQ{-2QW8ewi+AyO_)tqx=bN2}klT~)2g>+hagdbM*&aOwE* zpbN;NFqlv|3Sv=RNgpH1@g)>arGxI|^MMmLsAfAZf@RIczPJt%n$`QC_X01z)`QZc`Rmc7>nV;U;Dn@&#%Z9ES^&T>bHmb2T6y^ zPbK3wO<(R43iv8_c)FudR?ybVm|@IkX*n81YkpR2C1cT&uH8zbqrPHl`(HA~yM6)W zm2_@xb>r8d%8j&k1x=oA_2g-sixjh`xU~{sN=Ks~kkzLoN-q2O%{D89s@{bf>|o3g z18KF99_j<#avx9TicQ{o(surR+nW37m~(POg6;I~0^}T2lw@*7Po{ezLMB~#vxs-g%+)2Isua}&wW$opUGh=Pi)N1FcCZv`iWpHVp(@6 zEVU5bG-~ZcuFzOAV=FWwAb{E)Qx#0x8?_uA;=}RJXZv|;a(_G#9pM|PDMsScEt(`<6L z(^5FyP^mqmB=m}VADI+4bxmmiS1F4G-ES5HH$L5+7#`trsC#svuB{)xp9NL z0VwP1{u8qm7dpS6bB%l=K7dgrEeIu|(>d_oK7mY%)9rOJVAfEPzUMKgYuQ?Zdp==> ztPtBRt$;AKycbd#6TrVf^2!wNlqv0A9gM!c9O+q222yfZ32_+y25)D!7EZT@y%O@^t)zC4&G3Wr` zh;UR}nC0u+E6OXtzx8Q=6Ad9Id2BN&Y)cm&n10-{NouVH1t)0zu`pGJ$PdmvM}E(~ zCy3}Gs%xC^B#Qc7jlFq0F=_?|uz3YWDE2rf*ov&Q@cWw&>6WcwV42K@ooFqeyzHpk zYG>f9Je}>FlOSP-tjaljj!_f#vuo$Y=uMHN@DQiz*I?gLFY|hhWYN^ikWNMf;;H&< zC!BN!@Qir7H(};Ttp&;gQ~Wl!u1^P+P7d-!>5r4SV&>6=(4!V-c@TdovUC6m!{zul zQcTZiePIt^B-Em{S`$pSozKt>8X*WtDrj0cl@P^7DZY+w+wk&VMCDvNgv^UdrtMVg zp*ijyE(L9}`x#&^j*dK#%3dS zQ_LeNUGGrNOR|*(W(c%nBh}qj=;B>@cT^nbLma*sNCU0%4^4B;uw@GBre(F;vmI?r zBRO3*3K(^ImVB7AmK5TUW7CoG{SA;DVmb6 zau(vjd&7A96SWqN#QnYvK^K|%ZaZ8-2iEgM{01}NRc%a{e7PHnDf}eoI%ruN+xZiCjOb^h zD10L^9~b+>@t#T2K)i_Vgv32%^`@)E22K~haKHk1WLrnMkl$Mwm;tbIaNN7nLkc(# zBRtRAlkvpr>h%-l-hc0$p(e3k;Bnb9BbY^@>lj(3g23`E9sN_?CkPc3WjcGN?C>|R 
z`Z%Qw_}pkuh2V95OzH-R+vVs2QncsRmMg#`Yl|~m;rNU~#x1ShudOH5#x0WF&zNbZ zwjfu)LySQFUi?+7(F%?etvHZ>%I1j8xeZInRj!aiDIT*}sA znxKLzVX2_fD}-duawJYi_=hKWSI-QS+kWW4QBMpLN_k?_OLCHAFf#ev6Bi@Wbr$^w z(RGQmQYUhpZaxa)ZWeAp)OlqNI+l!n7>LnT+BUjl9giSU5rEbx*$%bs_;;4upg>0@ zP{~t*IMdVCf7|AlN(hkshSna1Ccwqqokhpja4%})25L%IGi}lx3mPN{Ty^#%2)WGC zeYn@PWfw(NwQLjBUm=qMikI*Xas%l1d(Jj^A8A(vqXnxNm;F6D9n;zYCrDs`I3@q; zrTo}E(tG~$CX-X6uY%CQSn3tsGAofqY)1lRE#-l$vs=NSyP#9sk9h$F_0!=^H}QqY>O;dw;nC&cq=fK%s+L{3 zf{6s{L?(k$D`#^$cjI2GRgUEt4; zyT(EMI@BoA_{o9*u^u|3iPlcj6LS7KcFuh;HT0Rh=X?-J?P}*PMXtr`G)k?F+Gp%)u;|)N$YA14&Q1Aq86pejkLn7rw?_8@MS7LR9TGe zeyu>2(f_v7j+wJVQtmP#Pkyz#cH<=ZD2olcgR!H+y~YANGh)F1zE~OcBYh}CRdj{J zQXKNvC$E4|7ZO>bnbJTDm;flZZg7mVJahP@c=AmA_cGHNYKi@i$Vcz-Z7Ta2rvx_I zd%k>z``}zlO8-Tg$vob!(&peGLkh|VK`@lz(5%dXoIc_P_D}|(k`=`boYn8yDojwN zW+=D{+!bI3NG6oa9^SW6gt`pL-r6jaS>1D%^D!hG*{y2&ghy`w1=_jgk75D<3&0!} zi*30x8qW#|ysM|S4=K)B!(Apr@o;p#tSzv9}Ur@^sATfT2HM z7o&MZmP+1DhIDy0zp10{=p*kKXfl}PODUa+U|>=htzsn!*#A3bF77lZx2WDcECw=L zS-X-fxE8SX8`3QRZWvCo_|BbHyAcvxVe&uOvp|fZNZw2x-8|jc?#=dWQRAJWrX?~L zF@gnH%K;+cKZ9MUkrPoFsM2@_iY)51C`RMZ^>jIHOu>?ta#l*@z&lKKl-b-uo`XY3 z{8ClMEAFvbjOKAtZ6>pWGhP<~=W<24nbsjAk?9vzDPFuxy8wn!fIKf(W@7uQZ_Rl! 
z)}96e+Cc;0qL96#&brEM56+L1&tBys8Du7VVxl<{ISsl-fFiF`^yo>lFi#bii|4+% zNRTUv=?q#5QM^D|5;R>#o{^?o^ECNyqE^)@W&67?J~SNleY4@FB6ep1KYdeu9qr3= zS#S=3milIrCylikze;)*<3%0y4^uO`m(OpE}bLk42=}b;r_PN_66JIC%2(lC zANl&?u1iWh2$%O1j}_!OLaqUWnOj+llQ{F6!S8&Y^%3Wt%gr$x_NCBGlc^^TFHg>q z5+STNlTp20&%UAH!v)Au-BWxNSM_T8;^Zi+b<`$>s}*RP(kENa=Fj1FEPt7sSz=~g zyifwHR&45pvJbO=~irE-oqhX(vCvEjVc zW%Zl+i4}8qJZ3g7Oby`{WJl0NvXvC@Bhq^`No+W`=oF2uN*U%4xERmXEt&rYnE&Ao zH51D1G8BhEOVM%{S2!XS#h%NT(6y^fEj}#kxU7FbS3J?*Dj7#!PWn28;hBxEFUd%) zW|{q|jew&ZSb2Tra^qKtJ<;=}^G9XDEp}6b`wRt@K^Ln)h-TE!n{r(mSwx!^EO%Kp zZ_`HoC(|xOQPy6XamKQXnmc*IFGX{}b@XJ=`uOzVn0JhQCM_54oScMHb>c518`eIZx)GgW;hi#4NczFm#m;rD%%p_Td*fr3)**V z*wLVI`-z5VF`JYC{D0{xVYl{GZV><}2Z;x4iceBrz4WyMj$@iZ&Hvi$^r6&FIBW)JzZrWII)F&b zA0dfC0Kb?TiQ-b;rq=J+uSGSEEg+%lq1b}sIb82LE2Uv*RfF)uyJx`eL(dZj`>$mV zo?=BG;luKhC4NAdC{I>1ZpjGHr4K_zw~$U)My&KKmJI?H2hA*>J{#Zf*jxL-qPsdf zaAa3n<&v5ARz}R)IMssVPUDc#FfW1G&eF)>SMBUKC-4&ydQUV7MumTBoOv;N;Obb` zvR`Kk@$6=gO>gFK(YrX?g3u+BEfX%m{q_Jarq6s1k~2T@>V@r;^iYQbHM-liXYpCj zbyZEW8XE%N1%j^rV#Ro=qTh`Wtea+5%wZ!uH!Q(d&R@T7%9wOaRrWq4Nv-Y6x~0F; z@yC{MW!fomzcYmL$~<;aAe0GwGTxvQjD=(N&a}wIn)s|B9!dK7N3Y3w9bK{&TVSvT z^zF4SAAH=1b3a)QEvo_1K$kgjE;t3m9?3Y4KI4%^BD7Z)uj@OM#2VV;eb8ovcgM^@ zY{xRlHJ5`<<@(u1++}3_EJo!3X3#Kl8x&i>Wxlx047Jm*>lVqAC@npQB#>>-h-9Io zinniX#6L)y0-)SJ2?(KiGUssS#{f|C+<_NHJ5+35_>>dO>+KYUD|+wF#x2ol%&?E- z>&!d+8@X9K*;B1Gm>pL20!N6SPxx2=1;hxuPE|J<=g!jDWUxGY1I3MlPQ0Gq=>eIP zGmd4dS_7QgT;Zu-l=e~@H(RcZk(p*s-4e=A{>& z9GjYqd+D@V{wRcdTmutzpOYJd#MB0ahL$hwpeN%Sq@rM>hwP?xwM=Pgram|6e8hY* zXu+p6Mz;Wx&=by{RUTFA{Rifsag*t^Mcd^bP?6y-N;gKpq^9cyld1_Dxxx{QL`Y73Y;Y1spn9T z9cO&xjO>E3SX;DdKJO9o1y~k$$gT^jh9?IKY_F)?Z*NF`Nf?csA}Pjzmg*JvbtmQt zJF1VbOY0#OPdyOE_fG^7py>PTMpv+gK;?@UPM?u=C#l^Bx>6JOTx06vzHolWJ0?w7 zk*vMiMbt9BTQs4E5O-_&FLHi+cw%$bU4nh3?Tgg~*!|EJ5}l6XWj<7rRQcHa_wG)X zzSUl|VX0!I9^T&JVU0j1z%ziPNhA)2HFWnR>;Q#*^co?$&3J#R-T`Oe;K+jZ-%EujSJVmAK_iGf`%TJ$zB;<(L{XjWph1Q#nL#* zma|94e{m{p=Q9~i?)is*wGh6PBiUXU`AlvsJ 
z*JOzH?LQzQZz*xsr!b(-Tg?H*pgw{@D`k=?p4`unF?aH;m}rMR!dFTb()&ifrUkto zI9^_5jzUl+wQO4t*I(KfnfYnvcOv@%Wv?x3_KSOI{&U5xJI&jA_6 zD0>`9Io~K)U6H)P@U1l=-*J+W#z>4cU5XoGqL~)MTOl|p$TFHfAr9RwBfaq9!qfMp zlVYxxX57_@`M}rwvU|B{0}S2vo|c=IjO8)`Y`K|QqS17&&KGZ>SuICU);C4YjH&-r zZXDcRs^v!ne(pI9v-hNFBc1vHVfw15-@G>LvL4i*r#uAGIq~&BJ<9w=W%GfP=twGmglgrWCQ}`t(|H!>B7r& zj2x=l!vU&(8Oo=$!4Ibp9?+*qV#f_E2qRPmA1&dJ7srS+F~4?quHgprwzJh$PG0A_ zC@KSVjZa(KAtLYYJ7QL=NrD}ji1&y9+i<|SF;jbxyT+n*J$10?5`B5B=-)~} zgJ0TDRC8B93dD-yDS}%k7bmI!(xQ0IaKJ$$&Wp25|FaQpU#_DPb%59T8d?F#^0{o~ z1JyK7)hF_fTTXD{SO#tSit`I9W!j9=ap)T-tDycK8^fQ9de=A$aC5AIXmdzDTaix# zZ@5Z>^F`dVM*@~d->i|lsERq%3nN9)$D#TtEZJx zG>G1d=>(p+jq(VB?_D18Pin|A%B^@Zdr!VWQt-&)z1Mh4vV{5?ssNEb74baE&3CEy zQ}Fsep_b&r+p8SQ{heYW)IivIMyEL1v<$Y=skMKh(Dya27|g^3Z>!T-!SL-I;?9jP zviyf?dDT9t%Ovx^c*Z&61mckb|`9(Fa{GJ3&Oyo4`{CaPt)@~JnFLs z1fo8dGcfpu^x*`vC^}h&TcKhYHJARwqqkJ}q~)uf(%vVoLhA=wU6o$rSU38Si`%Mz zN%2<3DOxY@P;8Ud?s^rsNj=xQiPt=Lcn?7%nf5E(i|IprcDigXSE%{RJ0!0;XK+Ls zeeY$dOH6(jx=0o!TK`HMxq&n-U?$%MI2)YhGoVc>g|>o!KPzbM|Gf(f)iu`+6>ZjvZu!7J;(OS{(d&M z%uj{Ljfi_**0v0{8l*qL{L6IfAU0tHPME|DEkcQCo3;aEsHnOJ|Xf*YWoPmVN| z&>Hl-sA)8afhj-VFaA{LVdahgJ=t~RkhWy{%CE>60z@laVc_p{>3991nfP7h!{coL zKEU=OOoX)acD-1rhkPK_5@CYiHFMRg8Z9&}mtQw&%r;sEeJ@D#XneLr?1lKCw)9Ct zi99WuHNthcBI*Z^%ta0@LpDFDMjt@9d|Isga9Ose;lg95Z$h@PC#~5N?jBX_l`I%S zcMvG|iRHMClD`a`g9DjPtaVd*dc#*f)P^3DA@_wPF=l>RDyj@PHR2|7C>15bc4-`B ze6N^%cqx6Vt+?LP^3EXnh1T^0A6=bFN!rh6VL)?lWH)TPQidW>cTfl>P@usT#_-FY zUWRLNjIOdGjexh=;q6M;k#K9I82DXk8q#hCTx-I6gHU^a<}prPkfUbd*A%}AjL8zb zjC52;t3xf(k#|pGP*3p#*>mHVc+%%Xp}JIRPL$1TA>C$MW}jTt+nL`7uk;bRzmZDR z&Wx_iQ465I7@fPs(Zv?wBcIqtXc=nlDe!8>@$K9zwrQ^6U%;0`ctty;W(ARaBxBIJ zm8ym*tAHh_!+-j?^~T$h5Q)q)3KYDpo*#z@%_1&!h!xIIo@Pds6VHo!6~n0OyHTr1 zXi$Wg8JDmBVkcUW|9qsi(M(lyW<_m``}4;t?u90S!t;>&<}%DbH~kV$NLuivfRI_; zDQGRD_wjRuN0Q_e9j62HonohhPKLDA6kf1bd%TrR1|`;P0ftx7XKSv;#J$aS&7!P|UA=s7{E*0VM$sllHQyXB z@&p~xgW-YH`tTp%>v{GvIB*Sr7qY>HnNk1uHX^R7wL#~iFqtZKFr+XDtyEjz*7!E#6**?5OVk~>cbvDHgLaQAIG 
zl6-CZXOAOiam}Jf$LOHC-6su?W{*)g`6+TL*YNU~=bTDSZXO5Mpd;Fn;;_t(w^X7& zQA?93IU#g6{&51!LjS~oh`s$Nxjf?kR~QlQ-EPy*c&guOtgs4}WYnwc3#Icxc^Lh` zvbW$wb#w@;h{PmRkT}(S4ublE#5q6xFB?u<%}JjH zPq_(n9`$8UTVY5WrQv)T^T2pry@ z4-tPC6Q#%TKW|cJOwD8HN=<*|RUFi!-Bet3_n``;5V>S#UB`~Cj}23Aukk75enbx* z^-5_Kel;+mT=5~DiMAraY<_%2qGZbKIs8wa1C;>p!?S-oygV;{e_I_j5$*zw)(CPb z))Faw7VCXp)coq{Y`vBz4uZ3+>h+GSxdnm3Y-3NGTdCU-$#9MPv}Fyh;u}ZY;aDCw z9ktGda=B7DQ`4}V(+iN;CqV&qbu1+Lt(7)fJW2O`cq_W1Nf|;_K_+P@#^1TpK2JSN zzH2*EC#`3S044OivLL!dSo4hS4Mr;+jGT4oy%%|X+ylPakA9EV`p~jzoQxXUTfE>< z{JYi49*UVU1sxsAqsHoI%8eeb-)N-Vj!Zu(#Bg#4^URXt4TbGC8=i%R9x_W|u>+o; zB%Y)5IsvDuy0W;>dfcc57IY+@*+Zj8;#UXg{wQ>#B~!pfq0fni7cE2Iy;DJ{_u&$Z9{4_fOl@%m zc0f`1H9sE#DcW_|c8cs(W$>=Eoc|k~$+iP^0s2_NN;bYnAT7)AtonmzR53^NwxiR$ z=ZL#Z`=+?CpAbmHe`YE6pw~$0rrj%ew7BC*o3^_eh7dUV_bf%%1vx!CEB2MNjyV!hDhUrvJ)W z=q2+^M*`;wUa}ezXG396($#50ZNK9RDU}EZgN^_R#6F$@4w9~O_%h<7>RmsW6}#(W zaw$`SMz+FnQQ3h1GKT47FaJu9^Lr8DAFvYN%VsTu^egu#$cQi6G2K2Xf%PTd^_%?U z5>~;4P^95b&GU)&NFIdI;JOyr{BAByDnEzWDlLu5!U{sryhsDPd!3)5!ya*KV(f~c zvgWgJif*ZB!ISjb6`e`{RYS`QmKaGg&?vYBb^1vaqeEkdJ5)@x7uPZIOTsF{<(dhW zGd;f5B{0JM;$h~Y?E`i;cq=Rke3346%t-M0-p-ft`O^|+@+)q`w^+Glc!s7YcAFY* zt|%w(2a-`9=FV`1^dhMp9uDOLTBulku8bJ_OA;@o~f}#jh4W@mhFN zEooZQ{CLBaw}f#KW;P*&Z5nC@K9xVJr>0CafVFOsM{s5hOx_XYb&B~;nsw{(NbT+n zRf&XgZ*;!bCrHM}wJFV0UeB}+qm0$06UH8?*J8?)?)U=23k}sUL5(r|n2axW=T_5NPr%CH;+nI&9 zyvv~zF;qS3(0{bb{X$d@cAvbIHig7>RUsg!;N)L>6(NOTvpTyfSMuByUQ%dunPTG( z{XkK$-?n%ufvbY0{OuK=_?%gWCBU;HeF?eMX~XNoGC*8$C#Daqe)UNExkkd4HR7-asu3VUOJYs_&EC=ZX~00W*x8Mds+fQGSN+eRycmnpO9@fMu`LjMhfGkBpW>C)wA&3{E6x zkvU|AP!xgLkqsdplw`9R^J;=v^8}6wn@m@}vTR3uWX0tPLb4+4EPv&tFgfa7J;X_X zb=7`3jXYGLlvR>on&3{y63vkX!hy`d`D!17Jo=ecg~vIJjQo`ImBhQEcoCvCc}GD* zXBQkG)v5Ao7Q;1Yyw?Eh!x$F(p3jMCDQ)3w-`3)8rSjtM{g|bCSjR+EE#(30d+POE*t+MFS`rn$VHh> zE`|Kmbu*PE)%FpihpFwMJBv!Rwt~)@LRCMkZgW~K+OTl$=XtPy06jp$ze5&Ey51(1 zpL_S~1{-_1l-}%=$tg1{|JJ0oJ{T&HJN80(+})>9o;M09kXYFwAzk2h?_~z ziMSFfcoDP(+!e6oQ~^Unp!7Q*!O2bh%RhVDdWl4P 
zB4ge`M%$`+b~;ku`=k;vHq|GsY1*!+7Ob*6&jDq9$fMA50SQhU6_~2p(8-O`?lf_< zmvL=$4^j~)n#jQNJR1Y;Xi!V8oRwdo!n5^<`_l+G?20)OVx^ahmwSh`sOGse*K@_6 zH1dF}(IYQWT$+~}ewFSWBJio;5z+)zOsVqv3CQ`BIW8AN3_ifB$;kE`aY-W<+|t%M zzyHhgmlD^Jlf!@dpXGUqn?IU}A{Pw;ZP>p$cnLL#;HARWaa{^ZQZ((l|Njb&pR2pU3_)E6xKP7C+ZuhtgR29!$E&b%_9{cEbRDjg zd@Mq-K=_b3;2+u*T8 zW$OL{+D%PN*4Y{gUM`<{58m_JiZW)b96-k5iVl%nY#fczXlLCC`_pQF!Mbbj09k-! zC7>JATQPdc>(!Ed6{*Gx#UQ)6gR;rX(Uufpj-I8x7_uH83r3ZIN*WsjQoce zP<-Fek*RGkx?{wEGX9dLUwT#tE3FkC6Q$kFRj#Bo390|~Rc7L?w7t8|poh({C;H)H zgefBB(o=^(#rTSr-VLFKkZ_qxeOnHk_?so5@!e;i~;#xfR=kK_(sIi;Dnewek6MWl_c5B*_aHXwa1GuP&;$=HBi*7T ztJ;dmG3zlA!3{W=iqZGwrR+cBZyffVd)_{9MJV@22;r-MloYa}v>C-}FcS#x0LO znP&wBm4oAdn1N~%fxqS@jMpNR`2X!S6_JD8~rN7Qy{A<>#U&AtFVbngjV zMUc}hdg@Fl*>CL$9xz| zO{Hm5GWa)ohcWnGptDt}^oAE&>E6*NJQ0okb|eVR9_z3 zB*woTn)w_H0ydV^nKP!IK#q?h#kK;DyR31qe>q!YZzMjx3 zXUz<$6=$TVt{@}#`zO#}seT(5nGOWz7KpF+u}Cu&6->k8&l46SmlbKs`W<`fU=r6Z}D&Vj+z~g@u zVELf4vs};jYK0fx-l%u$dANAQRAJHicOGcOansD03F5{VW@@9d1)eGO-kbLyNT=If zTrB1*I!aE+=5rF2nvj$c+8U6PHI}3Jqht(|8u2`4C%kJ_}~q>DG=(`)TXiT44E zoo}m5%HqEQ3Jezo?O@^!?l^`F?CX2^+-HO2iEGkNZ z6YT@bMxp%NlV#8a6WYshU%23vO+WCLYzONmPqfw;S;#Z-IWY%hvMaCaOkS$4-2(r6 zv*<8kkpqyc>(|CP{7`Y-vRIzIAXrMR_>#I-YZ{G5Km7*lKk(!g=3nUDIC45VCbucI zv*;WI4_wi9M1;FK4?O8lv{-M5dA3$oBpbNs9Q`T>Z$;0pfz1cNpw6Z^u6!H!oE$W! 
zck`Loc-dyLo$6e;NOyR_*g>gI*xc3jzthe7fCNdEt8Ur`j(lw=LxIVUFHhV3o{t*4 z(G5eh(+TA_{wFAu=Y>bU=0_2y`W#dt4L2V2ub;Ik2`2V=7 zLqR2praTY6o2|DZeGJ!t`pARc3U^eNT`|k1Pb&6d>q|S{F9=&b-A~ma03*26T!3aU z-AviWarB7h7ho{K`@Z2HX31X?`3pqD% zRvyWb;`sx3v;iM+@o3%bRV+W5bcmhQnRvNht!$Km1ak+@vkkCBxCoTVQU)ub)y*8F zSUE$_eA`B)26?}AV~6D=kp$@%;8vuLIu)sKM;OjZm3zMaUQ<)I(~)=jlp(h6D*Jpb zY+lSaUC?0mAnD60Oe@OgKP}ntB7N-U&QEl+`rfSth zYe4xpxyS1m%r2t2&vV z-3Pd=iEp?0-5_TY{E3*1YO$qD&!mUeQkTjI#T#Out^k|F9tWW@jBS}aREe^HhSqnd zc1upKUcyt2(H=RIF?z#Fp;1j&c80%CDkA6$OorZSL2L`tGn7FY`$TehEFH3lQU=27 z6Ubtifd{u`|B{y5o8eCjex!fDZ95cvCUO!YE0j;kk`t&dCo0Oy{V!U7)A^Rkna&#i z=syDdHKr=smnZD!2TR$*l1+83rbAJ0nIGUc{q6h1>5H7VTUXw{H+hpTDbMn#kc6Om zB<*e*JP$yIv?i&3(Nn>{ME5d5)UXAsq@)6BB;ifNEoIv)gxS9nV2)mFVHi38Nl)pb zn;&}nk=9|K4fu6MC$J<>VOoeFu2RbJ{w(J9bRhQ&jM~+~=`7nmBMw63WyDIjZ;!tZ zyQ9QFaYcv7RAf2R#%;23hg4glRo(hmYiMZ7a^H-(FBYXOKg><^sel{NwD@}1I+F`x z3@26E6<-~6KVw65tUS<1kP(g!2&q~gA-a)H`MA^?8}zQ59pFNX?fKi~w5CA{4eoY- zJteBWfY*x7H=PL?mb{R;mP*CRj8N{yrzOq?=BeH41V|-Zq1)2eBfq5QZDb*KE*7xt zIx3ss_ZVOT*pR&U!VxLi0AsSmyH78#Fhz33I$jQq_WyuWYnpm|Hpo)~KrMS3{oZ8& zATR*!Ak0h5LkgVqu|+6Ezr2AnJFK9CPZgh`0lu01+=k=yMu?+heA6rofYs8TMHM|R z8Ka-!8Lo~gd7bovVrgh+Vo=Y=)=vRI&3MO?&4dCCH&t`${cwIJ$&lAkKJe^* zzM;jgGb=C1gSOcnHe#`kcEj`;(zba+Kc@N&qQC6NKZZhdGEiF@PBpX6Y*~_K z;_hSZDGOA^ZbxP{Ept^)^_QoDU%tHTXMG5&D=2hf8tep-hNM-@BhrFd-)a#O+inAg z#+m-U&A#nH4#KRdWL}>aa&J(u7g84hFtx8a-Yg_~J3QyqpkDK;nAw@O7I`SpB z9OhF!`x&ym)>)8s0^`mQ-mwl?y@{*jnwjQQ&lnNxT+833@`T@0a+|;qJkUQ?m(}~X zx1wHK4x@abwh8RuasM?&W7fZTbbh=WCj16_W=a?(nDu2Dab;^52`%B0)xOO!*HKg7HP$(xn}f{NYp!ND`!8T5s|W01!$^idr70 z+PavqDgu4q3kJ4*5l&l9%6`=J*G>E<)m2yb9XG;jF}Ib>kA9OuFu#(2Cy4MY*l@U2 z9r?VpWJk3CW9i-&$T-J3+q%@ocT=HNhIdd_k+|IVjseU`n{)eZZD+%C;PPpJH7coV zouVgK01HEpKZ)zjk^lA$k1Z$l$ndlF0g8*zjZz7>iT*S{O0qHx3!qG!u>%|nERNcE zoAa&*&U)!Eb{8IVs`ab=Qn)rp&_>8tRgv^e0Y@8R%0ttK>+4E_7q#ghlW2PGV~rsn zdJU25$pKnJ1T_?ed~afFE_L{AAJ0QIbylzIH`5K!5iy)P{D>%Gb6yUS2TGwy~7P$h^2L0UiT-c8|BGx?*}OE(ZN6O zueuBD7rwA#bdsiRR!@ 
zV)!9NFt^=iyWu*BFry_-a!{mJzDquZS=0(5xi$X|`*+Az##=$1X7^0hR7_5DHy<&1 z8SySgfdUI{vj0t!Gt$)zi}(Fys|X+meO)~FnPXsa7+wNF?R^jp15vLqm8NN{=|u{v zOlP^jWJ3hQ9U_mKBR(V}W+dQW?(~+b*bRVw?Sdfj+HrAqXQ3OEMe>r!v{#D`@4Wxw zJnL?@Krx$0?uiFauRCP5j-=J>GPrCzK<)|0g?az*m#&L)0NmrAYIoN-y~l9vCtlR) zji|&!kl7!Yg3I@BFu5@|(l=-CadMV@l-LjiDo0Ujxl=^K$;s1+kldre_wG5v z{(%JDAs{VSDJ+?hhR#D$ROe>1M;joDNo~czkKF3(&!7+|jonE2tjp>TfI~hph)!$D z)tYku4v&(AxV^#!!NkVv9)Uv)bahclGI0jW#*vG=$6L|UDNj+9E~{GW!~A+lgsxt- zE*$Qo%bm7UDQfb+fRB7#PhHUcQ-&3>e~1-LViGc?M((}SfRja3Q=p7-4iF}`MyVAw z@6Y2nC=$}G{(tIRB<#P*p`HVGNNR0N%(E;N<8AiP`MB_Rv-s=Sc6(#7K#o*M9Jb#; zd@*g)SGhz2QF#wR9s}BK{|b_^dqNTMD;sU1nV)TMnW20Q4j@xEJ*k1zzx_z(3g@*tGgle z9|^GAZRYiIvwh~NSs#UCaW1X+?%X03-)>`>lXpv3G0)M85E|KiF){Bi%`LrvY zpRmtfNDtFrram?rwzMSR6ar}~urPZZ@9+y!pauS)qeK4eWO#TpRPVJW$l_xks&L9m zI?dYVhS;S^xAWG}hebly4K_A#bafO%ILJ8~@iDz$7zJ9A+7|&&v!0Gq#xTtuo(FO7{#b(lF8>;rR)e z9_4IkrDGupEy>`=fv#zn`x2;9<0{@V zYf69^!9+rFC^t)xn}5f!A*zKH-&pHqOXsRvR0ns88k?wwCsN4}N7%ls393>Pi--S} zzEq$8Cc@@!&nc+xUdX}F&*d|sdk&M5;#_%LwAO0ZXVX9O|!o_r8V$ z25+V0(eq^)g$KAyOx`{Cw$V`g?i}r$frKtttMc!*La?2dA4QslgW*dUy1KOY3o$ov zl^Uq9Rz;}B%gwX6xkl=A8_$%<`8u^CXk|*KH!I7)j`Dqu56%8Tce9>IJGP1zOayot zZA7vRB#1%?NC&s>>iCCZ2v9MJObyAg33AB-D$o(FG1XL%u%;eaDD$5>wg#-va-&Yy ze=DT^p;yOdADisHYQD3cQq~^Qz9<~ewoTos)=pX<+D*>{r&&CmgTL{m*FpD=a26j* zsGXf(&F^7ziTijz$cAZsTatbuKZ^)E3uSJyKz~z%&Mb0Fr5|rfg%6ykojqewAih{{ zv8HPxymd-cql`pV)aYsI_|k@&ia+F*{Jz>sgvW7KpGA3u01n_RX1(^rA4S+gW>}?< zy5k>M=3v)HZC+?mAmx&+Y~a94dM>H&=e2`$N4=te(DCLDF(zCEuamdKN-UMsKDQ_jHAewywo$jgktN9TuP@zZu>PCtbMK#*CvIS*A4w}g@M|%k;|6CL z_rn>BB95fwKj0dpVPlS?LY0)Whd&LE8k6p0j=p@bpC}NI zT;j^HnOd$8Yof&HfBsIi85)9iLtY#dGBo;>-@`nRO#60i>dFF`!{{grDyd^JeA z*JIzjCV*S;W}Guz7uu|wdWYT7we$-b?zc8;wvUAx7C+wQTPgYcb0_ipbCOjlc_Be` z7EY|!e+}?g*@QrsQi*Q`@o|uskE`A>X-_J2qd+xtXDk+^*`zp84^a8-rEL}QL2+P~ zzV=YkJ3N0LAJfD8+H*D=3ok=#l63axzXP0L{Gt7(X!g7udA+!y4|HalL>9ETkhXWF z?LD+h7QJZ@JS#^?nHx@-!3`v>^MxZ|Qm-dbRzp_3x9;A|V6s(!6w%PySh!DJl2SOZ z20Mg<^U}ilAZOh$Iy<-_gD-m*I@Pud;V2M?W;2$jAd*Ry_{unu^uaGu3HB(f`r`f+ 
z(rmOa@Q>E2`(Mq;b&>=;o}(4=O`r!&+H^6zOz-QFj&Qg2O;#BTCDk+GZ!PT1DSQ(d8DE~H5~R(!$1UZ z;VO#$h6dd1);d=iQrV-8c!ce?%;j(89>9UvXk1IOgmkE?Zi?pXDV02usukPobz4kz zg$5BLKOuFc_wx4;v1l)t{Kvj@JJWv5pfzS6Ogi=l@%Pq0LjhrBuGjNUbg~WZh}K;y ztv^prDQ$$e@GI3%GJNh~{juzb|MYfY&yEPzKOvXQ$GU)Hg7RZwcU7IRQ#7wk!aitS z$z;28g0%d=44UHhisEV@@gOLUaM?r=cHW%_2>-^BIDyGuFkDre&;+^pZYYr334=mm z`GDTtoev7PeMC5@Oz+>~u&6V!HWwOj$7j=XEBg&_R$7rI?i-U3K;wB!%>ebA2LlyK1BdlMw~!PNMc zt+}()Vp6niec6)s|;=qDMF?SaGpGWLx{BrtR|fB8$c5#1>IyMEC==_moisK+B}28c?Py+ zm_>R!n^Y>eXg!-r-;PdDwo!*DZg_oC6Hi8{5rX>h-iy0GqJRnTSC)k|9F_hX_^Mbd&XhHdogbnT?l$M06-V`OQ3Z!2Hez1(08i!9q$;E|lhteVULMws z@ZSHS#7M|!5Rd4$-=*NPiefnVtcXA;PQWsj&k6@WMszEzHUvQg52z}3OOjyMkr3%} z1gZ6$#_|ko&DSDgJYu$}J6<<;_{pgG!(NY#g6~A?IAdt79t6RHn(T~vRyQ`yCgONdcFbBeCGbYp-4-FF(yirfkA_IOR!QDI9?YJ;^ zDW?+^0nFoE*-WPY3zdeQ}adGL6uADE@C7gPA{p_R|&k zg#FBFIFg0uwEs>!O=ZP9wn72{guClXCikICN<_$F2N-ObU%c>+;wLr<#3Tw&SN&A$XNJ8x@TIMm}P65B#o# z-&4#nLqai!{o`Mm)hSbZ1QbgPWfS*pAFdpLBY1GN^tFl`?TO4(5Q+zD+z64bv2w;u z73N?!D`Wf}-lR^BXM!MQWi4{A5sFsO{(rW8vqmm?+34HR+OmQ!><+rSKW&!m{7e!t zIvnBdI@#Y6uVD6`RO>i1iT=j9m;SZBGV{GJ~Cb%-jmv!_jfuZwQXr z8vZGyOCYo(?lke%{ls>Ai`?r!m+T8KNVC39Zm-0~gCHXxu}0_uREf&GH$omPk(Oqi ztrh!AX(>V4AQZlBG)WC%Sha=6+{Z?yLUrj@MKgRV=4S0rBv^pf{l^?)M%{WwQ|%T$VKJT^ko9CiSoN|DKeU(7jzCq9f>{rPBFTng_q5v` z(7N_9X{*A?*sLjowS=A*UHSr4Nz{w>$f9V}k-GW<+0;B99iXC{Hu{Pz9m9Pxp8jv{ z7^-AVK_k4k5F7g^efuE;C{}gR;e&5jK+z=<+hFP^R5ECz%Tw9?L98c{Qk$gRxVg_w zh>0j*t5CyK8FUn*E?~qFtEyJs*)jdtlUQ9F1?TS0S~5l{)&t2gP9^w^)yD$qCvmL| zqkxg4EscQyxbE4iDUN|B#xBJw`2_+t>d+p5p-86qhG^*R^|bX&nUxhZc_gD_Yfxuw zG?>DhDF6MyEF4BS*g8(1p$b1&8r0C8>Xc~0dFYQ>O|!!20F?piE9`^PE;SMQs9_bN zJotoBIu|52tcL~J9h+X7k7}`#J&!HPVay}ZB;1t~b(dYVTU*e%NE_T_P zIo}f=aM*iF5YF%$ejY4YiUeL2U|O6>0m%jtD4n8^x1R@PaQ+F^Pw>iUPPeoG=ARd=h0 zJ<$3!$VW)ENm2Ht%N=%mR=PuSKX_EWV6k_$+*@)ME*5TWN^!*o)E)MaLxx;=?+g7+ z59nX&xh15p)+XT3u|SA{d^a7s;%~q(C!E}UOGSoSaKjfvgI~IW-=ifXlYJmma0})S zY-C^(dWUD3eb2)Zpy_=?OUAZ!YfvH=9jL0DVj^Hbz`2IT@L+%z`JAXQxY3h3{a#L8&d 
zZ~gW^lN%V!Kk)o;v)0mL{@1c3=m=!0MEJ5czO5-VlZ%9egioqrW3})jaN>3MJMypJ z6GD53{5VojYiKFIG;UAM59FBfQWH;|OG45hj6=3*ZI4Uoq`G1uz$Qe6j%{D!LH8sm zb=yOYTtn#-x$obWa$s@2lhxfoy^`?JC`>rf+oU*jEsOLOepH{^=EHe9PD3*5K~YF z&C97~D;=Q*j8}BA5ONo;=wpf7EU8qLAnGzyRiJ6#sth(`Pk+rH#k%d->*$&x@x`%H z?{R(NKtLHy!e+IJr_6fVAFB+*2S_`tU(eLyeZ9l~CWBoohTS%h--+GAnvK9v<&5sG2&_U$ZQ(u1tBL*8?(%8=`1F(FpHE2s>EYcFlIX8<^9U~= z1WAe5Ep=1;mAd-*sLKZ4&0I*duh&prUZ!bXL64e3W&mBAm09sb68skUXahc?w1XEN zn0}~(H8+L^>x?Ah*GVON#n5Lemc!he_G5-KwGWdCc%I49Vb0Mphu&S{hTSbuZ=KG; zE9%SStXg(b0IG_N0ui(jW(H9K;b57xE8E_;v+Z)!q}JB8G=6ejh#_2N~iPw?Fo?xU=IIG7@_8!)nce1=>DH90T!l& z#gaO-bg1~Y3*FshpN!uMLrMP3q3lTNv~RLgqWtND|97~s%qX@Lsdx&zLQrFEv-%pV zE2;27^~{h3rpsRs6HBRZld5sINc`91GTERUEvKt#)8B%r&6t%J`Jq;ctSoL`My0iP zqx=1+W5Zcz`}{c$?dm|#i(b^#Qu#rK*y9#Q`#$+b!pJo}kZKeaj|qw*H3Z#~v>-tepxd;gK)nZTZ(LAUucGghMw1x6RaHZaVwfoLXf zkwyz_u!55=l7;LPK=h+-$&dW`ys+)F#rK5GVEVE)nIqpK${dqUQd+H%W{JisRX|J= zUR_r$=HI zi4=^-D)T9_Ymrf2>=nPVa^l@3!DbXtHHG#>2xorYgY)4-(kIHWd?#d8fm(y;@Eyjm zISnxfDG;HD{|USAg{bEljY(q0SBlRZYw-;+CkR%iBxnIma#=}LnvLi>D20RpDzm4P z+YulxwY6Hg(1K%DRu57rRsW#>5l`fbocrl@Ho#jNz+lv3Y4sqRAdA8|CpN?}>- zE6>GAiA0l5@QBWu9J;Xx3Fz|nHjVU$w$XV=&%s;G0(p}e{{+8CK6Gq-6N0#bGNfxP z08MCvWfvB4RIT!28rHd=nTJD)@Yo0j1McR1^Hw->Y5eJQDqyQ&z@c;95Vkr<0kO%X z6gZhhB9gcjdrd@OZbA-qS` zm0DDEp=nSIVs+WLtzEkzF?A994fVnE{@oAwzNlpKkx!)(eaevuSi~!SX)Wn2&$08fSYc|hv8Xh7A0v#Jg_Pz}ZIrHDbJ!U&njgd{>7#*ITj{X+5+WL}p zYC-=cixcLclCiBRQxkv9S8+SOtfd$yCVubEID8Z4@`oQi>6*dmi30W9Ff9r`~*svxyo*tpJd*f7I+xqB|M;L-Df&fUatI41y#Y@LC!OyFQQTlGx@9-CX|d;kQC2EEQ7IvYZF-Xxt<}X+1tr1WiF2PJW8;%t?Lr)ZaBR^p#Y8LOS zw)N4B4t;wEDcx6dTdNHM)l@_G0*xR_C3j2G4^;`avDZM82y#$9i{Os= zp-K~-k%7nJkZ*F^?NG&S8(fgM=LM#le5erSo;ftu5+W z{h{6=g0C)uyw(OH&*z~nOT=c|EXK1%K8je$(VmTJ5B89Gx6o+E2NwbcIcDJ)Mo!ig}J3mZ=AmsUXwHvssS<56%ti=f!5VQfXJW616*VW)D+_&oMqj^ zdjWHNX5^A<367_52$33UU0T^Tq=VAR8cd??zx)+cf~HVR zAQc8T7@Xk8pF%yD)8vq~{Jg;|X;k+*bU0M%4NOV1P4*Py>-Zkd7oqx9ylZZ=l=}^# za(f~tREH}Q<3VG14w9nm6@`&~bi@b1>6@C;7oDhxe 
z2K$F4;}WxH-4&WmH=K5|Ax(}h+>~;9C6p`Bi*eMIOn&B6DtyA%3jWd!i#&o#)3hW+y-B%|~8uI;4Q)*Z5Knz;~b7kK%x1>X= zmcG31Kfm~hY0du#*vEgBw*GAygWdE@kNJ6EZ^d$m`6=$LwRi*Pb%56@!7-0Xd{)F%>Alj=Ly)@<+Ov2f ztlK#P)jh;(YrJR9m%hQjYp1|g&Zo<~@lK(3jm`>~ELU9Oi) zG9XW{M|@OV(ptk82P8c$QRu#Rcwsk1j%9cl`zO;%#@d@})G9vr)DnKza77`2#CY}h zyfGv0l$En~Vu*X6`8%@mZC!w(+H!or!%bFilk=j21(KQ{K8112K?k=0g$0_)fVwjV z1Sgk{0^cd^2_W>Z+=3*7Dj$szy0YILAz?q0%&&W|_YA~nAn*!ATxfXsQ!S(7DOGB5KKud3RE1_(p43mMJTInFTxt;UrLz`k|_a>+&TvoAYRz3p`yqx6J_N9J`R z{0*YCP5VrmKO}tnZB?nWIzv|oQnC}ykY*g$HtTzb=T0BJCVTdv7BFs<=kTqgL8~mG zN(h+r504Bs15c5E9Hl%c)oRx4{r;v_+6D<~+{sSVo$ZEc$6>tCTgNZn8XT0e5ZRs@ zh>0*P$-BUzxF7K{eNmLIPy$tGjT3;g;lWTwbKw#XS2i&{5{S%QNq+*lw)40v`3}Xh z8>-}$B&X+_vu9>YI9#$q_JlfXGU6vVYqXO5%WPe$^Z#yx~#p1jvhsnJ(4+Nz>qKbl{99y1*GocGR)scO8H(2p}0ro{+te`!!+jPszDgR(=n zhfvD;umvj-)L$U&S zg$;~?x_&q4ukiVYJh#b!tJX&i#f-x!PMV^hVBf7Q;|kh*xFAd0DDis5gz;~S<3uF+ zfGPJ(U96h-uEGXEMr^Oe{~P%md|0`g?g`L50Q58OnkxR{=|zRDy)QLjTN(0n#H=T< zboARc*e6>o6zWiBkz2S7q1zr&q~|?Wj!r+HeX3NtY{lL7MmB=fa>1BS-JJ_WqAe!O zfJuArg5g%|i=Sv_cU`br%m~kol0aBQtlyombttn7R?HRDunv``X(|diKAksz6g9Z# zcP@;sfEb2hmDA?p09R35C-a+j-<=T4tGGe~5+i2~3Ekt1@Z;@vMZLAG2`MCw2+44o%LTD>)PxWHo3fcr=Fb&e9ntdjrW zW87(Egk`TV+@zVgAL=pWpFo^Y86QFi1_fC&^zrKQddsm!KkaVZKo)~N?|XS#AZ3_U zJ1!-oCv8BjItFsg*yCY)^Ol$b3gT54fpEXmW@>7%YwOVYqkVFEzF_|~e-ocTEy-xgqC2KjdU^S% z0GHMi_3Cm?c^i!z^0{)K<9v%x>kZhC?xMLqW>M`2|uS~nl8SJ93na&A%JZv+)Oqg$MC zgYZa;EA-&j5uBmzrK^As(sV&&z(K~?299c~i|PGwpk49jVc|WLDB;wM_-R$DipnlefhVi%b zV`js=_if&|ic>AU8i~Xg!qoG+#6^HsPg z(q&ra>u-Me{TM*!`+?g3GDSFrj;tyOcM|rhgkqH^!s*uw3zvIY4^|~wyW}J4=#r*E z;bN>JRM|NfbaPk#@jg#)gh>5D0r86nD$;KJp^%CYRTi~ij{znz@h1a&#W&E6Yd9)I z2|Rm;mWUL}|2ciSaDb_P%7D%$dfsk(eR(@dMDEjsmk0vhWXehTmIA|GpV$ICO8O5o zDAaraK68^UzRiw~0`q4|0yf(#qk0NagX3+e)t)mwo9_ImXLDs=d$xNq>r048SYD;$C%K60b2Aji2R;z7I8)y;|C zhtc&lLOJoq%gk2<%J&QItsBAvC^!sw@9oz;^&8rn?c-44*$!>JnA00m#R%7==+uvT z(8LYTHN+9K$xnSTrzB zia?r!LK4*fujoKR1I`aSDS_8UEC|U#)K!rckc(JJaBl4d=^)d4MM~EGr#XhojI-oU 
zfz9-UaFiY?M%zOLYP4H>qpuBhFrWX$KXEFfKP(kKb6U`;ftPYIEGQxXR*dIiJnYq zVaRk{SZf<8e5eg4a_*S8Z$7(`{QA>NaTHNe>jJ6Ifm;qvDNCfSbwGelHq`nPXk1-G zXXRcgnozE3N~68-*|F{e`Wr90W@{8BzMMK6QoX-cNvolg*0C9Fj*mJ_hMHVV$Ye3y`gH z^)5WS8TJ%G3&N|o{3>Vl5tx)yDxH-gE41cXV$nn@f!SvQ#wsZ09@{T(6c9ER7XjYHBnaR`{BH{r%gTQD>iMZh;FoK89?9 z(J{ub1+IcYoAK})V?@cJZgy>RSW=3YT;aR+3o>U&`@8dN`|z;y6eS3`d-5l~3Pt z-fKq$;vOj_$;aigwMEw_@$NL(plmWh;t#zo_xNG0nf!#F@tZ`zz?*qhk;qgGX`H1K z_#+QCG7u70xRcfcd(N6IUv0wcl)pJ>Hk0bH?4#$icCLu{!9Uh!S^JubX}?yY)EW{? ziF7^El7=!J7cpmg$6J#@$4+@AG=#qOXj?N6X_%@qMC>One*UBy2|_N))?NTY9YY^z zMkgVt>mszRlD9?YQgjXgKmyvh&;dd;>kA4WHHwP-UedJxpB_*X5}$r z$jK(Lqp4#!lBl+8cNsQ;`=ZeD9iLFY2QJj%JwDA;esq9XP3G_j#Z46e7_q}G^1XXe z_o2Z96PA%0yXB+;f&l6z>Uu)?4w;8=4ID#wPVVjX;fsaki{qc}Szx$=5+$LGU7a7H zut|_8W59!2^FDAri#-?8!5j^(62_8q6*3M4>`b+nrPS~C#ed(S@(@l%%HxY_fg@q| zuGrp6!5k0n?%NxhVH?-q!$HDk-U*UM^fnQClz7lMQ~YaqpJbu?TO6g{R0@h<-DVVe zZsNANY;0TLTqr|W^^@CX!9rf+B|lbCt|u4%^`pE+MVLC7In`(}1D8z(Qn5V*=OB%j zvWrfz*!#l*aTpbE_aSbe|dz z4#bA!i|L;xiF5GGL<<5?qeIXxMFqUP>f@=KN&1^9_CT^DDYiC?it*i$Yrv^>vaesD zy5D<%oy;R-PhhoLg@7?R?s2hu_+Bs#zpw4SIT>;OB1mbvv#)^Eswxx^7z2IYSEj_f zFPbwoh#C~?_Z8!WDW3z0)$AaZZF>-CK(RKOM9`>M2jA6Q$Sz+Hp$#z?>wuk7pb7T|ZPGIHVKxoX%nrg z^U-P7%oLQRv_JP4jo&cQCr6hECUdlVPM{Kjx%u+Uc8R8)bK9eGWKa(y;xM*x!4#r6 zgl?X=;VV~Oi;*-x#wVfn>g>Y!BJn(>5^bd45bA)`%A<9XDcU#)tWjPI1!ZWX+qLS6 zb_$KF!$Bx#C^VbMXCjT*{%F5Eki_`F_Sa%=uC z3yOvWqbU|Bug}i62g>XOdDz&P5>}Dpl(9PY>$*{$yqT&izA-`i;G_c}7bD!gPG*Yc zAT`Zh_t53OrMyFGDGZNeW=5BO5~>#Om4V?8e=a~F+A>JbdBm{42Fv1e$WhYdJ}KLy z0dFklQImzRgHfpB?w8R>cvUClkEoyF-w%zKHlgwe^0z$yR_eVwy8X3BID}PfifY0UhN*kx zjFR$!J7Be!nVbVFOh0BJg8vt{?^)^M4MPp(jZ42$R#q3R8L`Bzj!&|Os{X?}PPHK* z4=6oBbpDQ{>ISsMDljCzSPfVRQIenKj~OSNQARx5(GQ^)sI@bZaSQobbSuD+xY^pg z0V3hw5Tc?xDe4rri2+T~vGTPG!m6Qj(l<2OmQ*CV)#R;RJH;YjtEAb(6#R+2*ulTw zy2lFHV;%UtZfY^;B$?%_GgDSF5{Z9y{xMmFgH`I9&HRscCAJxoHWi~!-W2;Q0WQ%w zF{h{&qm=N**YThm+}U{#~4&D)2x_48hKr1 ztH+hCnYNS#Bm9plJ?z8pAPOAJg8Ca~Z3$BlxHZgXjkzV6gq&us29GwIm)wfGN{lJN 
z(nh`4V8Kf?D*eKN%_VNv#sOx*QCAMp^WoFppt#zjdc|b|5D^%f$$#up^8*c!PhF?x za+N$}Bm!xT3gtUCfq0`rdA1-cdyoe@!MW0HM5Te4XkSAO7IaPQ>&SeR`(RSCzdacjLTn{Y7|@H1}o$vQ2F0Mm3=Cc2`{ z{l{Z~;PS!c{@RMnO|CjT^Y97{D)Ce;lS~=j8rX_r*h~e%iiP84=b~S;Te0EPdWMqb zeto-tw)FH8r}oy`5R&Si6kFND_b(Y^Z;g^-dFmbdYIT~l?Aj&U+GEz-Mxh!@a;eGQ z{rK_Oz9!zQoo8v98X55kR)25EPqMSXd4!Sr3f>WV8@q9}&y@bPdyxcd;<$wrm)9do zcZSoysX6Tw7?bW5t}OQG*dbq}_g`|9KG9q66tn*TbF8dDcEwlXJec!32eCpcB`2HX zJpYY`Nl5{K`o99Z_=xX=I+h=ma29?O*@C3wgGwCHMW6x+Vn-)@Lh*+p3}9u%Ps!Fy z)IsZVJ0g1KrB%Ppb@srMOjscgW&Qbq^8x%OGgO{FU{O#$GeV>mf4L%JAA*4vx6={@ z`|~kH;uf4w6dJSt`JW~E!t`!c>%um(quG>Q(+D_JnenW}sD^>~*SorRV6y~lpQc(~ z^jcxR686IHME1c;kUnGvf`~;gpr7-S6xeJ5n*BUYgVA@cAE(D*7QnMf-kEW=~oaRHL2 z-{ThjmaWs=(Xk^76{B~3qhfdIDP`BM6Q(xZR!z5m&x=B3&23ya!nhQGD0g$gc+`F7 zMtf;LVeIYe_0=tPVl{9Bbfz1uuOR+&))iiu(?wx0dI|s@`dg}gs<)(5TYSI++8DbsA`olBeDz@Cg-aoz2kg8qVfN^;O%&#+iRpyzPhN$FFDy2s;Ag=Kk^bk zWWT=%3X^=>e!JR)*+MJUKv7?WY|kk-ZqM+F3FFil8&55(Sl#zYpUyc!E!*Q7fE<~0 z(3UF?0V?4){kk7CG$YVg>i5*VNeqrAc)BN@Ov-?#AJGnBE_&@7T@v{&2Ui+1nR1VD zVxxkD87;3`ibNrGqN=URTUl-xRSwm<$t~{(NennfhKYBNyb)OAjPyj9608$lu4Frk z>*^)LdlpWOo%LIQc_c%}Ic!A>@YV;QJTpR`b5m!COR(pqZv5q#xm(X&SgbuLE|-$U zxMq9doWYP9hLR7GivitnUa;8BD4IT!k4wM0ZU|yOnOzsttXuv5$wVI<^`&0LA!*%J zA~&iC*>Tk{7mjjY6}lFfNoQ%S|1}MHY{PfHbKi8|6nM2?C@!Aci|UAGla2^-7(<`s zoS?QmT_R@$hYq;V#5e?jWr;8=Oo=I6$VZ%odK@NzZ)j7ar8i}nx5*0gpNOIO|H8rV zxt!X@(#dCP^x)^bxR${`MkS!6K}Z__{qfG>9O)eW2~h>T&=03h48c9d`q_;k<567; z{#li^=2dv1z-V1ay2seCxE9OTwAka3f_DKBm*Ol^m7XTu%v#J}^U~S$s%`+e+0Mgw zyX(DHk_FurvtXusjV6!@$3+IR-ANlCPj8G$ zHR@ZBg3F#QTHhTnN(v_&D8BW}p%{U%2Y8u=gB>>g#cVbx<_OMvKN01k%0KqZ)tB}1 zL>Bjkci_&(6hp9p{#$wgWDUrHcA`FH9z;6og4|K-BYB0(0?HwwggD_vbBG60oD%&t z*TY6+YX8-mw7OHS`3SD7!QcWfr#0~S!`Lv6vkyoVrZPPOg1771`N;u$8f8dtf5Ni2(6}K3@Y#(s3(Xf zIsQy_(a7YuAGYyZS)@UmF7_&v1TES8GhDe9<2EWdqSHu9yglkA(ftHo~ZNX zSjhSn%J74~cR@8!=D(q*^K!W`v96U*}U>XT+A<0xw~EX1^4z4&E9q59Zj%1b`b&nGoa=<^3?yyOkmr%(i7 zvk$`&K`~wB9(khW5id#ni_<=go|m3E@AM+vtjMdGw1n!l+m=nwAZG(FG=t%g3b 
zZc5&tTgvtVr#b|BuBO541$h69XeQ6&7wRo;>s*Gt{3M=>I8m?gl)m1OcF~4PC=i-= zVEmS`?b;m#Dl4ZYDYCH6D-kSSkH&&9GS^f>i!-iIO{8yTCqBW!^mJ znr+tn;7)d|G`W!-7QZ3f)5=fO^GuBdYfwrZ9|p8C8jAZrXa~Jrm`V_1Tlo&KxX_xO&veBK z)-5q5;-$Imw}b||&r6vI9BGo<2MEuMVySX=9d!$DeAnpq(NlaUL_w~vSYjBbtg>nQ}~O}VYUI1T%57{p)^!!BFHH9N+LG?!SG7LyewtysD;pV zW>9-UF;vuPSo4yyzPPJ^N@n1%7j446?zv8T}-!ldZt zLHmgtCTmiw*pUYj{(AGbc;HT;owhzdj~s}uGE;FJ%HC#%BL1a@P^kuFr^oX67mP!j zNg#i>9I3p`cK05prh43H_-Xq<^X+(vE7n#A*Ko8xV052G>_w1prw!*%>u*IfX#{;*= zvK^(7mjeOkjZNqNhleet^HhKCaEF~TC-4Ey9uRiX-a-X&E*mi_J?w}QX@US&hu7Yn z(U%nU(Z?M;<{L0gDznJNip!fMyB)zgw1h()$6Q%vb}2oYc^hYWUjc3F?q+XwjC?%z zVH-{kz;4{2wf{x?3IBPTQsZX(I-ECowbN+=QCxC((rlutJ zm({Q2Tb3Ere?mDFwaJo47sxT?t9H?r0Te2L)7!@(X(qSj|H zNkduigPK|PrOq|`At)I$`hnvQlu55CltgRtpWx6tVUcAp%&v-N3!RU5rSk8}GVf&1 zvNS@9eTq_vZwu-M8k*X>y!C;_hbX6t&*1vEN*7p9qhr9SuGf z4xzpc@ju71)jr=bwHGB3zG|O$YJuIE_YE-Y9Jz*?d>NY!4bs_eXZh)Uu~}k>lCB=| zv(Tyd#QxKpdEa289iY6AvAtf_Y8W|3Z_IMZa)%6qH)N|sL_wFfGKF-|U%v8$ZR}J) zR-&ch46RY;=%y^7nKrMs^FLISKEeS!#zA#If{&$T#*8K~0?XWS<=_27w?LvcWES~V zgz||Emh@PDk*INpKd+CuP;VKUgC*7EX5>+H66d?zFDjH_YQEDQaOfD$UYxLCtrg-(~I+3w3o6%`tJN za>XcUT637#TqgF`^|4qy(B;9zQc}A}H~;z64UB7UB?@MVFw8>|MqLG3sNUl?Fn{ z0IH)?!qhRAu>06VE(>=ucwJ3xWeSr3NvO{;_>-m+2E#Z{!#GZm9VdIQ;4v`LHEbXk zURF!4wo)wbo9;xpW9tQC8Qz*g*IADV7Qju1eRcwm+}~R|0Bz;aZK8*o9%3D=I!{+^0*3NA^t$^kcYKjj0 zoXgf1rY~ss2K&CWem}6(oU$hsREcnN#)ddbR01uZf;etgOU}TzxVgNCw1xfQ&iyfT z{?qRgKgi-%J=9e#iJ+gE)D_}j!cE&ys}!u+hIxBUj)jG z!g_*1z(VOz7vG<_Kb2(R4Iw6Lc8)m_YO`4g?`wfcEJLB}HG3-(1u{oyb5MorWK8RS zW0rf>V1}wXC7UBRfoJFmeD}KO{(3*G5)2(1rJnXWB>-qSS>T`+@vuDuMoDphoEDXS_;%9tAfXKS&)#5 zi2SYRz%CzE{IJxWA&4Nh4%N3>Mp0jaGY|j@I!(_Vegus5^yZ124&hDhu9)vfQvMI1 z2kfQIF%bg8lz_S##`Pd&tM3He^+zB38X;Bc3uEA=?!PVIk|A~@+W4T>@_>EFhE znZ^iEp-m%i={lSo9V=c9^|`*1nT!IRl?}Z3>j+5J@lW)Bt*z$UCUCSI9ta^(F7Z;C z+rd747EsS4un`?ba_?mTmg;iEv3b&4pHL|Atj$73OGqW zD{Xe{QKyl$>xY29+CF<0ThqxCKVaoKSmEngD8(1anbgU+nh+Z={Q;ndhFtA=3(vCMYv1F^JxOd(A#qZnW9rtb&B0?E zDoXQ^qHom)} 
z0o%_J%}N;p-5JqL(cml&=;zI?x)XB*g-IJkS&59MpKWTX%axjZaGX94%CsRxzoS1= z;uQ$j#DJ%eWhij{{?`J@eha8Q7IWrpwn=f*WysQz4=W~=Mgp4+uw7bWc_A+l%ZGuE zUSJDRCaf5ZHTn{Ld01<2V;m!bF>Xu=90#i73yDR33)EKXnqpZ}$# zop_l+j`Hq>Vq$t1OT}z{hb#+HLaSC;i@8N{8Mzm2P?)FZ6aUXE#q4c(!;)WkXi4Y^ z$)%%+ndWK33C-TF!#F*+3n~HCtrd+*K(G#%0ul3snqAH*SbPJku>c3iLK%^hoS(IZ z#HaU-$!i$OyKN@(VD01{a_81hJ?zZlX()u;HYSo}_1dq$lm7}_<@4>(xX}Bapw4+p z-0}uDuVPtMc~;T*`2JZ^6uIj8z(+#z;U2BmXjK+X((<3?AcGCTn604%3;>@UK`%M# zbkTDG3QosjnSciDMfS07M^>24p19wLsFdR&wdJrKSrr9D{-u<)d|S@!r3s0jLqfet z?ED37f(r6!Qx?9g=@ojUL{Htu`4!v`!1+8t{`0fx9cpk$YUYi+1f&(rvpe2u!+;Yl9 z?l*_u))Cr802)PpGIs)pVKuw-)tij@vbKXD%?fyzY??Q^0KtzV5!k6eLCSFfT5-b1 zlmw-cew@*A)ocQnqi(7MY{|E&*UITWj}j~UI+_}dFyvU+JFUm&ZsKs=a!RR=OEFGM z#HlVU*uxwF5*&T<9E|Sm-RT#7n%9B`CZ5T!Rj679luRG3c!yE!TgT*^&x%#)Br`MB zgWOYH_|ldonER>Peo@_v-90IUg(fjCSWAR`M0$3=p;9oVs~H-pec%;h39y-Iq|(fV zycmtA0H%TC0n*P|gdzwXUc*EjZJkn4MYq$nu|BY#$5WiLM0!9kpf znp@Y$i@<&ZK*swC75mczH>KoKL{&x6M<%JklQu`g8*x5<^8mpN30nVk69fRo+N`aP z9Ed~6vzzCmZnQXQQZ-w-s}W0kXrU8ta8`h)S)3%dMv4$|lIHAY4nha3e9>fTlL(}i zOwS4m1=;>Xg)yFvRZyKZ-%<>7fO~YSA}S(0qkFdBWxTi374D6@qVhYr_$~{AT17Tf zg+`t|ommZNb{U4;C9s=#9^3a>bw+dhipI3}Lp%zsqG{f1{5sv`MVKQpG7wZ~QAPBt zTQ7{jqrCSOZ};Nc`jL)#eWU0Z^69=c->{d_2$!m~IisjfQzugZAa4TXu9aj2doMC6 z%dGVrpl2;cjIJY>TrR2-=sbhHMZ5pQ0MAx^&=^+qt=* zF&cc=fxs3PBu`$9Rf0w8u@8F0>=1Q&ff@La7nahLqXVN)CY}lirBH>7A^mC1K~8|Y zM4kaT0yP8^6aUWzlq4lZ7f{gjZu}Nv}S`8s2Rl)7{v_CV)EZc3{Eud)t z)zVac^^f6Fc;>3vC#u=j$SMiQ&R?V*gVzzRGrS#9kJZB2jexS@V!9`Qr1I`Wiiyk+ zd_%cNh$Pth#_~~RE;Qi?0^4k0$kfuQ?@(fbEpsm_`E$kd_#vh3tf-MZRw4NI-8nFNr$v`;*nov~}7tmhp zU1{qwFf|ocpU_9WGF;<@hT}t?qkS<30csp50an(E6`UEcRoPov=|D+<@~;Vcc01uq zZ@@=gFAiH2G1Kws$8VzQifvzE(~vS6G#b=c1dCCLE|^5W)2~MVQ<8dO@)@hh3qz6R zHoWf}Atr`jeHG6@3%lF3V}(CF#^s1IHm>)#CH&E38S7gUEN7_7FZj5{atdg zt*17=3>KAt0Iwy=tYwHDuWK_VN{4`Z>!&k33>twnmKOP~lL8tlw?z{XDKrrfw^lxfX1wCwJ zj7404=89QQGOkj^12Z=xYKS$U#txVko>*|QL3Sf~=!w+;9p8P>IJ;1^TEL*hWZsb; z{;rKCV#;~S#s``@e!ueiD{G>mu!i^bJ#Et+Eh|%x8bjplDxSF04K#q{$Cgy!FViIk 
zrWh58SH`Ir8^Uu0@Ony9PYOiovb9TtGD#WhYDBkK))gkDZlSLvM2$g7W!i)UfI-@ea6jVA!v@oTZV!dLC`daN=Lq1C0h${FIx#tvJ<$T5E8sSgtff52Fb5xE@-^UGod zd3;4ToHnA zLHsv4x?4!4ay@@oH4c%kBgV~U;#vD&K_abw51 zOTnC5%Xu}wIXW;QFrvCNKZ5V!=3w(_VT33!q()&ynpFLNdGAjTciJ1D%dCTDj~#TT zI`$o+w~ycR!X~pvIc#Gi+awQoj|jD&M?N02(O+3ZE(emCy0*I_XWN=0eq;d?V`yISyDhH6Q|ES`ekii zkYh}4nUDRVI!C4u5blHn{);H!D)ot8o8@;fwvj(1hKI0+$ET#L|ck9Xt)-h=- zA9=S63OV#W=aGfQXmH``^SXCJfpR`X4JqGFGeSY3q@Ysh+AP!jDFH>T9`s?{i^Eae z^Lg2I+a+~Y>D_SGb}zN+Gb9~ot|_X?i+^CqDa(JzsE$z;@`a=YC35f!&^T(I`CL@@45~#5QY#&+l~vRcL3EJ%kR; zzv1S!BA%Z|ur|qA!XXjolEMP*%X=00%`~CqHHP~}1iYH6eb;U`bm99sDIz+Stn-1& zrw77S=-{g>?}Ul$yei5kX+c^aULqFO_MP@#7$4@B-so{-?CfZh?AsDS(h24Iu8i1U zG5v*G(zjH@$)a3RMs06OJP`bs2bRG`qPvL9?qAz`fa6zMJcw-$DEkaZvi3DO z`}pY=7KO3l-3`gLoz*zDsC~oJ`Pm|qBbCyH?3G&Ak?GF(nwR_u6noONDxZ*J);=)7 zQf;ssOjx1&hmKDXyWQQlvu$xI?^uB(xftzJnFsw&4$OYoZ?iPS8v0mfgR#p7HvzYx z0OJ&m4?4u_m#Iq_s5Zf&g&Q)sMdtLL-$$o^00I(t%a{_yMw4)?@CXGavVsI_KU*5a z__adiacfof1E4ff3)-C`Vv}JG<8YxjM9Fw|sKK{WSBh27CpJFcAyUjh)km^i(Ug0> zS;m|%OFkcVH2!WZN4wZ71R59+W8}DnK>-HZ$bP3DYBGlQ)rh-f>+9f<@1W@jpn!z6 zj=zKkj<#@TA}X^@_(|M`LF&X)Lig@;?1sASnz(}ag9~U-p67Bi!|2Yywo?zMEjqpx0YvxTIqvKQBOUTQ28ZSi~PF?q(Fcf3xF*Sn{ z=6pDg+{ciYNp4bL71qlmpMrqer)>E!K+AD_ynM=P1|IAl6srIYn54g# z*gF|-NCo@8YE-;9zKpK`0#Z5f{xB*;L&CH-y{QkFcdZnTY;_IhrhkntVQ^H7AwOVn zw^M7c%Ad|%%C%65r9sCmdyWM#hk`>!e{~s4iV&o%C_X1Ei1;Mky`GHA-M^Ha!83jY zrFGRF0FI}8p{uPScBRF)qABS}4hBy6otz=DVQ`hHJvJNDf*nx`$XJS=SHkwKG-_L> zSA=Bljw3Xfs+cs*ImV!V+pJLuerM)S6#psR+p&_B;aZ_Q#*2jD+&607^fokzI8U)V6U?NHLlH_WOPkos0XJb{4E6xom+;%a3f1|(UQw(f;{vmy41;N3v9 z+jQhOGdxBP5;}M`RtHZ;EU8Wuj)WF;(Zt`-A(feT7#D-7_sOjCo&-!~N=B}_!;LtV6+x`A<4n&!n#VeEe=W@Skyywe`a-YBYOcU|+u3eMZ{wcZr0 z@z>stif#3@!}g_>zh}j1$NpM%iH_3tMn~~aOa~d-1ZZK~5h0rI2D$j4`=}SInYstc z7C)%90+^dG`y=$L#H1LTKfiZnlBcGFs3iXqF1sWbkoS*fteS7>4k`LHZ{rId ztUF*G$5oPCj>)0T8c1V-HW%|!JusOdH=HQc^KAEY%x~?os=idm-Do>;phWR^weA_S z!g#07RzuJj^dPVuUL1RRL0|l};j9-{@u0){k@7Gglg#`3JjGfU(_mGp%p?mPsr{6F 
z1~<6ZnDP-&K_)xHbj*7qgVoy=vFg1TM}RO`Xa;T$pn>V~Z5FqpQ~#FR@5tp5vmcHc z`AezmM^lSnXEMxPta5`V3^1qrbXielJ41nx%V@h2#|}ily!d zVkyg1b!0`Ayd9EC8YIiC>Tkf-08b-i@WpuU83tZ=4i$z@X`+kTXAZ2>6|ER>u+28xf&wR~fT=8m!xN%m5oaiTRpWic|K z*;+j-xqwJ=uQriu%#A1L*j{4gB!*3T(%&^pdzpfS*$G5!y;b zwaK{ATf#lgmaEHLugeoBA#nzzU$IvPTMP+11K=;Z@^>2C%MsWAHHVmcN_oD&Z=WvfYgs_7*vO>`LHl@t>orXc)p>+#>jF^Kkg8r|(-xv`8X7d^&O!p?`<%}p4 z77SJX1z)?Nv`IYpT=v*AqGx^NCe}SmR<4jaExE%uXkezQS#P=IiqtEGNqtn974orj z%BSEm{Y!kUUVzflkkx$^wa)o?ooYb+&YZR{293?Av1>}u=WeK=&T0zfil??%4P|B^ z=~X<6f1&N`6Bo6>IRC@4aJ6$h^&)I%1OF&l4Yi?lYil0Yp8n@ee};L`(5s`dAm=VZ z+!ER z^p5hFp~6L>gNZxo5WAHn*=^{t7lgq*~68-8n}%GMG#VIB6q z)V34>p~5Xh7h9SA5_G8Z*4EzkP5(Z&ZmS@+@I*aM6a^Y0^iJn{sG#CyHf+?B2@x7o zxdzPXV4d$z$TP(5Q^h0S3(xnQeOnCwK%+jJUb~4e_f(Secu)?L@Hloac7y{!X!h#YnLEMh#=_>@>{Xsh}0vm2-4bsP}aH`O>qq+AX9urXW&BY zrq^_hz~t{y>?`%r?Q=A`HWzVCzq=?h2}qU5()wDHLu6p{Ad3bkTU_FwQRpeah0~KDR+e)Mb za?E0=-D|cd(}4L=A^HPh$PLbf)lx-~ZqtQX&e*?*`jcPg4+`Koiw{bNv@}-6W$U9g zhF#L%9tCqluqSew&EWGM22F&(u}$K?g##Auxe_=TbMP}E%Fs7Nf~ant{LB$r1JGiL zx-#JmxQM{iRvpk zz}o*iplDrOuWqVa4?o5!lcWY)!E-LctB+=Aa!3!{xut&F! 
zflT8cq08KdVpp6t=quBs%K>Cb{4c^;5h?~>4VTNg%`S}%vhYuywbtK){Gp&NQE+-k zGue#r&em5KfhId?SoXKZfQW3FH8q*WS7+_>0a$YiSWQq`=PjTR5Gp`Cx})%2XkO?Z zyUTGmM62s&q%%~!vwpWcV+&4by?}R<=q0VGtHS7h=WNw{+<{I@Ud7g9PdRuW?p$Cz z@{Soz3~|Z`*)MaHceB(QWN<*Nj~s3Qm|f0USKeULFc&lAA&vr`S>qdHgCUO4m92W= zu$e*%Q4ihAf9eM_TGNq&N;h^%(B+|^>#6gPPo9fD$dR^Zy%bojjt?0YNISt6PD!gwTf$it1#!Ph)f@{W zqFj-f$SOU-b4^Na793+zXe3Z)4wnm;m@uhbw$6Q56)!alE+9sXC;_j~M!6-DJ!(vF zqx1rRoVP{~-i(5ae!i0$5&0lzMo%;nKY`!%<^9`kR7CPV=Sycdv4QE21Zt}|wAML( zdIwzy6>}(oJe5qZ=-jd5M$z+&Q{Uzg0j_pI7Wa?&gIgb|njTg543E`g%5;%?rqPpG zgNM|{fQ;rsiRlZQC{AO~q&J;#)Ak@b4Kd5EE*Dkt0CNS6y89fmnbyX#sRl4 z)(s_%U1cRv=2TlC_B|aUic~D|RlD>%JD}F^TI`9S^RVn839-5)V_wm@Tof5ey!1xC ztxL1Aj*U1BPEu5jAO81S9PpLxGj(^Kc53G={Hd zqi(L-hZ0dAy1SR&9@{JKga#2>^SCtirY08I*gkK3`Pf_R2LqxZ7mQ3xfMQLa?DQ|o zE?FJCd8khZjgmMq;{E2TRAteOmH`yp?K_rhMKJ^XO7Vl>Us{GJd;4#LM&>X{02>$*rJD$`+o{4}u|6Z77uF2e{V~vdp{z!FG+n<`@kv&yBiGrxK`(_ymYoEDeP;1huzBc#dNM|HD5Q~xj z2*&hv*5e4|4Ks9~%_SmfMB5OHiX-8Tx3K=m^Cbhp@JBC;d1Z$kgk!m8D=Ptwn*(7b z5k<|65zV~qOQWVFe;n+zJPZcvB7_WAudY{JCjgVqy`QT#22(+^sTr|l3)72eqa}_i zZ?}(P=si#}30$~2EeSyotRKD&7!PoV9kqHuivQi8?Mj zMKgrU+kse8B@vm&a@vjqTU*)I}B`a_AVpW*jsdwp=?0O`_nSlUV zDRIi#p>jbf0?B+cUz+jE3a31YaUjDU?3t$)`U<^bPO-X(UQZ^vxUF{$;9>!G=C=SN zSpsFDi}W`Yj@(%{`%h>^njyeOWAW_rOt$H@^(>R0RBPSJS+K8TFI82ZFC$dKKU0c; z%NV{g)NqZ_Rmq1Ofe1t8=Oz1fjNIoV^%1mxXR0?=-gKEi7dn(%CM`RT*GVa<(?}M_ z2)$iEtdjo+PQ3U>T{^d-%5_o-m_P=!H4rJngueStO~*KyS02LOZQ2jntzdl!dKR{@ zxmjGL0c!%(8bsrqeZ>|&_?EdCFEej1X%gLl_Ff6NNBz#FVoVSZKcdZQTnj~Imu7Cj zv9}9vUiZ@UG(e6Ce^r{%A?H{KW2tx7T!5EY2PWi@>Y!6E!k768CgqJBTAVPV+8CXK z2p37t?U6rk_Ka1p3{Y{v|6$|z69L9TOi|4@11?St^R819ZEkD?UawMT&|y!lPk@+3 zckaQ(poUXbniz{tHcqZq@jm8?HvGIfj+W3Ogb#Y)^|m=q-^cw1U)QgR~`;N*IS2PM$0~ z*qKqh71G>!L%+vMYxrp5P`tOF_yHti0rfFWh;E>6iQ9QRui!&hiSFqde7{-r8RB2d z#M94=ZTY5^`;jBqXR5vfpOIf%5;oAzC!MAiDA7Gj=c3-e?1Hx_y5QWbj-2R7_`A)V?ktgnb7}s!aO9$88J$tAmmMkYM^UA4seks zneCJ@DU5f301q$sYsy(0?eFlCkjNuv9NI0I-wlP>rA_1RZ$1lbZ%)Agr}5tY&*qHn 
zekIuOQa2I(tVrThHg9?UR|lyy0s$`-{*_5R+-E}8LjJv_xKi=h*`Y3?^yeNw);+H2 z`FF!atXMWJbChuktk@{lr#ns)<*J!iPL5sM^V&R%aFA-b0X{>J9v#N~!YXq$^+IK* z2pp`3o+nLflW^M25c-zldBiq>MP|oUHkt4_G3RR8>bkgoJL>k(2?<(uKtatt?xqOrePKZtaaZju)7MpU9up~e% zQ394sj@ME!<5eQuV1noIpq@`;rKZQ%l)?nfl)n%u*6oDJq{-FQrZQJ{exVW+V$X^P zkYVH`!QK{VV{UA1Ps%4O&Cf9f1qQx6?AJ7XYwEYs2b6=l5 zZ=i0;EaCUW_|3OzNf_TppSoXdt>twCA9?Jn^7ikRIp>}uQ z)^R-z9Q7K?fRQ}p&kKzAtVBTopuXoLpWH!03K{J<`rw1Zb{yXr0-`o5b)_*8Ese|n z^-#PBRF;|JQqR#{luxXnN72Kcs|CO#AbSD{rGF%NXd)x0QQjz9D{fEUAa%d(B;>j{ z-c(U``39l-EOC{YKY}rskf9=@5_$7hzJi(tWMW7Q2Iqx!p z!$Is(S>5AgWRIv_$8Wo9!Rs+{l11neM$3tO+sE=R6l1^nuO;&_CdCQA5}TGLWGTzI z!iB5;M<`v^dhrW~d4LosxyZ>`1;~CljB+%7n@{iiK|DR(jshHOb zN_*L$U~_Zacm<>ptUC(g;OO!q_URe$;#Ksv@7`KRh3@tDZ^O0!Gc{Q{jgc+4ZO1?e zkUka113w4#ks^WDBp)KB_O)S8qp76xAEmoJGDmP%w6~dhKKNo}ftE-Jk<_AG3b%rr zS+IEk(j{vU;2w*MFd2cG6J3L+nTpX;WKk*zCE*YZZfigw-bdaS zw!^=OOK+}X>9-`cF=fYnR(?@h(mnB=0)~Zyq)f}VB7@1m)-66HfamkqS8kL}~Y z;|$bUQH@kF$jj6Ul)q1{S#Yn8P^Z=27&o{vX;C|6`qFN-9mlKxoDsd{T~ixx#87Fo zwIuv)n)*SAkvRZ9E2^L5b|AO{F1a_)E)s9J8mE zcp_vhg1a5Xq%1|j>#* zEpqAoof4as7k|f!c6W7W18k75$i`Nr9nDiNJD&YFC&QrKNUVXY-rQc~&=Q$K2ZdXK z(p_W-jXg!FbHh~eUq0fF_+UwxaeoQMs<0e5yS)sV;DM+`rQ-KJ!|O;(UFEe!aEr@& zU2>c9u?rS9#JU|mqoa1~L)dlU(LD)M+v zBBv*H_mL?2-S6SiOjHwX0w|u)Gp0v;33t7BoS%jO=${YIKN>g>+W@TkFHtnUo8e;! 
z5wy}GSU0_fY5T&&dG9}KcMPCugqL{bTwzVPCw$;k_Vm@zG2Lc32vDp;6Fhq(yQmLi zgOomML{pMth=&X{M!4J${YWHi6DyXy9a=5b$#iJ2*FAUdd|{vc8z$_( zDU2md6DPrWm30#QDhm!M!u;`rWkq|IuoDxCPQ=q zp`b-}Cp_XE{Ig7_A|Nw3;GOpp*3S%MHglN-xwQ0+_k)J|(|la6>?~d+hkzkXA;@31 zS#o=NMAE&zZ&6H!Q~=+$HAW-z%I2Yc108LIR~ZKW*Dh@@AWH7*i1tOdZsL|bGIi^H z4upu{!QNg=`7K?>Bwl^V1ONpgs#2W)g1Hf%z8WH5TU z?K~ZkU4~}3eF(Yi>@VE5k%dM^wbF=*wVcQ-|INYYtSU?NgOu|MZ`Xrh2O^yGq3KmR zyh58CHH6!U`iwG-|%nYreO z@Bbqv$+G9VLzLLX<-gwc>D0jxlpOd`X3s-`xQ+HOyaD<6BA95^ndBV5aDZpNz1Pbx zY{l!jfLLRN$YDa=|6w-$NZk29Z&e_5qzJp__Rx(aRm@=)Z>rh-> z!Omq$AKw_N+OVP6QL$vO>m?s(QR01KLG5=KtP_2-wdEFDum(Pz@>{*}{&9cdDhLG3vvB74fz?-i*z_isZblfdG=QtRB6RvUpp8Zp+GV2|_SwN_%)8;`za^dVRcZmY^hUM9bGS zQQd2m!r5mmRwr(+wc!kc2spA12lny>nZT4lk-@tx-DAlOOIm!ku{&s}6P6g0cv0ro ze#PA|@bWS}1R!?;{X=up_TEU0$de9a2SOSH@31Cap&o2q#)SNB^}7C}Ze+ zI7@DWeH0-D_><0QXRx|bY-fOfC5Tw`Q7$(FqK?W4uUEYly09d9J)`)IaHpe!C)0MP z=|UBhI5C^Kp&k%RkHh@hX&g{RKZc=kA6Vss>Xt1WjM08~kny2g&fXh31vVYgjKISC zr5wMs0;l>M`7|r!?TSzPz|19Yrq`$d<&;FG)cS-M%Bi^q&xgMD-`Aoe)w>+Eo;t?F z{!l}`i;r4<%e7bqHT8+lIPee+=1W1Z%1#C**SF(V354Iw)Snn^8e2>466S`o$kEPJ zL7}qy;n|9NF$C=OMVHT0t*pVBSc0T(ES^joi315gc38`bJ+?j6wacg$JF`2UW}!U z-JByq0JM)TtGX^i*Kc*-c2pYzR$)^l_erqYsFw~_$VQBq9y`l646-9Ufu4e0H1sdV zlHx|3+Sv9-a*@y6J?gZv-}~QaQ?g4?-Q07$w>w}m!&f~&az9RO!g}7Y8=PZG1c(gntZ?z_*<)dbA=l9W+Qc$JE)F@Qw9}|=e@AP!pg**u>BrK$FW%EQY~(f>syS} z*pERv0JCtMVq1Jij|iCdl!O{`zHiHq!ChRR&O48a!3eI>tT!?EEjb9?(WJ3i&~O{u zf{h55N>dZY7%0MHSBJ^A(rVRNKM%17H~a5GgyF&MZZrTpAf+u{Cn+v38$S zGSO2KfFt47VudK?AQB#%y!-I}L8t8`$&@5kXY~U|BtV<0uX6DQm@hx6&-ls*M&stm z$bv%zdHOYRqQXHI7;M>zy;OH|NeGfM8U8r3ez=BKF;^kock{dxe9K5FRm27=NGrHu z6nuJ+5APv^QS7!yT6UX#svdiP$5_L41SZkQ9rN1SGSIR0qsI0fuuId3AoakaIU>x< zfdwWEeY@y_fPV35;k6swstr6+YFzR985C=Cy?P;LuHhS<`?B*jQp9&{GO{GgsV{VJ zz3}?W4fKZQsZ#tA(bKsRg9(MD)R8%;@O2i!uB&?i**i%u(_OIBUr-<=X{4$CWy#Y{ zE1B(BI2A6$FZjeRV6Zj7ekf-K3H9OZT(Dv&wrC4(X59lTI!MlhB2%Lb%c|y+BVd1M zXJ73%QQFFkb|8T4)KewKqkp6>eqTQo3GZm0y$^0ZxaHUk0!7H4w*D?uVPJg)I4yk3 z>5h*EyA+m32M)=jszJr5@(TR1jyD|3!17 
z-RqyJ5D)Pj8=1Y=8)H=1wS!+k1t-opL_{h(3t}zWM_br-1iU7TiI8t)(E@-1lB-&IX-R^DQ+D{4DrD7yS5j^=B7( zyqJM#7?sCBet5y zAsxZWXV}?cP_BbIuL@?Mu^~%Tm-bAobQa0Y&YB*jRQ7@Cm}tU46c4}Y*2XMX$cMx1 z2aT5=r86Dy{wHm1z0jgFlP5JCi<-!thuV2q`Zz?Ae!Ghei7<$JzzGaltL|Jc;koET zs6}nY=S7q%dUlO2raKW}!>FHLG|v8nufA&ovBLXQAz#xSvD6G9iA+!< z;Y^veaFYGkiF!FA5#sXn7H@30HZ0!cdgMXevBBq8lZEMt>XlHy`bc>o_@ej*vHuXfI*Kew#jk=mE#GAID`CeO z^5eO3Wx14EC?W#EfJ-0$Y@WB2t%8zFa{9C}v9^Yp&2m$Y(RQ{iyhaPd(f!G3esj!- zyY(7;IQiqGZsFv&Uf~M~sW(bh#18Cr2$pZ}=6;}cqWh_0jL^yf725j?70mwP_znI@ zDil5<-1IHitFbMg18Gc4#mXREE0s=Z6^&}BOii>|EvLP*h0!D!89TNYpB>t_5g$jP zOJ-@~5Bj;dUJy>9dVN6=Bb1rp-4`N0wX=Z+D17Y))_*|kjsbthM3y#!EhqO}K`XVK zlM2uqnTkV!EB0gf563+cRH6*4il(C9nCxWe)IJ$-lYrx!E8!5QiK1j7&d7g>i))P zAVoJ_2#jS%?Iwh}_JZ4PMV;m_mq^e;s<0D)Lk(B)ZWw+a}Hv* zphIvF=1e1lf=xCj=gYFx?Z6f6!!~*|%i}#Z3%=kjM&wCI#JfTXfVFj@5Kq0x1sC;L zkY|C-O^?%?eBQaXx5C*>io3dmr-FKAfQ7LRYm1Iy9}L1=Yb5lQ36c;Rl%dcy#0=iG z&6tSg3XvKn_2pij4DHpOi;+T2)RZ(pfjkR<+?N2%YzXNam2@Vf`mDr5iB?6 z5u#V<)_TDFkv%VmddsOR1>TG|sKHMT8$IqtyjlG3L9#~yK4zTDx3e+IUPV(poq7$X zl}hcMi25BWyd|MgBqu=nv(6#EqIfX1)yuCT_RTBq7|O>> zLi(VCw7+8zXEW?EHvYPDhtlPYLjaelO9Vz7qB-h*vh`2keL5)VsyR#TdAd~*;LuuL zs}`nQ`;M)~-q^lkFuJ&Mk0Ei5(h@No0HarcGbuZaePEFu-8IRAy{bZz=3r5cTw`ga z^!rz~Q$c>#vULz3h8+M%y&d?=Jl0G?f!?uC$(Na`UHgb2f$iR)c>#S~^iG zM|}k)Qyx>fyi9L8Xi1V3L(yJF5@mZs!bW=4d=YnSS==1$@4h z?9Va+$s0PYhwL^L^?$NcjHDdNI?L#tzhVW`11-^f2f`KC%FEqx0*pCFsGyp&p=d(mF&l%hdpuFD1>xu&v(1+tJyatlGl}6VD9XE02PXenCgv ze&`7OHZT59!m>xx#C+0zj0)#&i?K2=L-p!}nM%hM&Gu5}h6e~`K@(|rwBb;PwIFLK zfg+~*H!@KVT-#`fg!a28*18RBHJ~LKIbRqSn+kFjb++UNzIrLMsR^^T$Lf!6>m0-1 z3QVls3AnL%RI$}eqw>-TW?XpBRsI5sEfBe=t>}JdwEq5)0B+NrMt$^hhoxylhReh+ z(|O3|9A@^`PmS)a!jM5&D}0x#VL|&#Nm~0 zv22rgq0W9=*Hf{f>y(EZr?|*NuxJ}F*XJnw zQB@%dM@1rzol{7xpw(SJp9u{=0GL8ns!G}%lQWyj!17{_;dt3u-Cm%M#C|6oLe9BI zsrsd;BNXt1k^ne45RebC4j^5wt7?UxH$ka(q zw_~Y!(h>uZ*n|(2HnpTPiN~lYxVkX}fatNd>l@={qQ4d0vKbypOS~(w5r)VF7jj4# zb{!JtQ4ym$BHl1NNDQN0dX;mA(v->V-^Xl~k{ef81rbH>0nUj+%%|3e+!|tIBlF@1f`Y 
zP^VrLVntVvmncRTU?C8(R-f*|UgaKFcUWpE2*1WH>`@7>S3`}N4ax*DE^VXhnHPE* zbJ`{m5C*87l5q3=dl&8d1q?2ikYr<{vAnfL-CGhgw3>fgj!7AiTaOus0GFU9iHept zTPAZbQ7-d}wtcKtIjo_OT!M%=(S)L>u@D<=PXMt#muV0Qz5TmfjJ#3GP`$}tg!wg3 zvrF7FB>E(xBW}to0EOCxuZ-?ozPn-{l{?&;?!3JU|J1|7*87Tl4jWTJN+$`#TT296h!6Fa@H2;#5^DinBG{T3^hgtU%D1 z;g{bXd;we9#Hu{w9+;!Gix>hm6$WpC(ow=&U^MoesNTb-I+eLJ=`ZQBFhPx?ljlp{ zQn>er&^bH^8O&?G-;3e_rrGh#*_@U!l;6}q0qv@L@hXc&fj^E^_x{@`JqT8ul|BQrY)0q6>}I>=2oV2#~E z?PbmVX0YD1L`{rQhm-)kzMch{i?e>ERJwE^B>&Q9`VFZJP13=?cjakGq=>SinUNtt zq)079(Dz(DOOS0Ui$P%%bG4E;cE2X{R>RS#hlEE1kTa(LJU8AxojD$$y`bg81Guml z0}7s2Rb0!iKTV5L=jDU0a2({Bk%YBryFAidJ&3b*hF9p|UO^k5t&}n2mY6NiEHz2= z+g*E1tz?0gxNNc*ngLlkgN{erYDRJzOlbmyoyyK$w1z$gBjbgxOvt02ah1i_)ZyB_ zDUmeaZMptS5WL=uJ=Xs4BIORiKQr@7L913^p58I;%uE$ABtH+T2dDrX%qtxiWp+5* zM6ZHA{mOA<{dr1|wUOdRjL7sdb+VuYVGb#Z*byfhpF+BFB?$YfhyX&jUq;Vbuxq!U zMCzK`5@Y>G=dk92g>sx8`x_?ia$?@MkT7lXZ!J_yy2{gDc}yMTpGY*%zN z9-z@jqfP`g1Gl}wQtup9>FHjq=+|>bqJeozA z7h(tBo=)nrRS=3JZ=Xw*CfQdb(S5~^sJr&W+x$*{&c_uSD1dGx zVMNhfCKUSCxX&+`;RSu#ypKJky$#cuuHIQ+hd3vd`r{J8d=E;;7{w3 zhuRvkl4adI9|E&<6V*VG%_OU%Wnd>;aQQ9XE8~&%=JJR4h@xzCy03IY2*Fca*-D;! 
z-!LgKOo1dYiCRSSXDHW2t=l4b{e!73$K^R&2_wkW8cUS~^l7McL2gXWyC(_RDq5Jb z@WLF*@$484<^i!Y&$dS~K2{eZNo9@6(L7EYYEj6`rH=iw!5-lAVlDy;-@Dyn>T#5< zklNTSG{o8_4hqBRNG{bxApVoZx8Q6%9e_o|wU~8=S$Rp|uBJaa>-zXu91&|jH=mT` zQ9!=~f|fXy#PQWblZB!Y_bc|1F^r&K2j2UqS%Y-d0eaUOTr!d^8Y1 zMt{?{-Uhwa6%Yht(lWI-Hv^E{pF8ufnp*kg_}HOSc#S|J*vd0af=z)Iz7Hw0;!5}6 zWttL{6sGk?yC~GB_WUoHs%d-t{&-iJ0?NBqW{!Ub&Cog~@&mDixSl_m(1}04bQMb7 zP+cE(%w{naLmN>2f?_PdjNwpjq9-Wq1E@QOCSO3~cC(3P8Vm(0MvJL$b(NN3C0Xy> zk?gbVhqLRwL3vnGtMbI{v#{0aGm>0}nFjbnK)Uy5TrTQ|NcT;a7?=>!-HeBu$4lj* zF3~n6e)3AAT$djM4T?xBfP_n8oi)Rha76s|&Il|;m` zx%D?FPErHNyYRga$8krmoPBpl9VKIan0ep83NY1A9;L6HlpQ4_^kbhXScg9&%cT1Z zP;2RVmgTg!DZ3eHEb$`^rA$x>nlEve$|AEx)wSh%w0tDX9RWyS-P*STrk@4unelsd zs$w{lCO)eHX?2Gg6TbDz$8^KCwsX+znT$i3tyF7>wU`_GAtjR+y?-)hcMtq1CJgUN zhD^Ba+Z@S$p-NYyUdzjUF!wu4Qc;42wMD}7C3eV){^$hs0c&s`e92{Qkl}wPfKccp zPIYaUsJXdUQ4|=%C0oi;Jo!bAX?m*#rc_J8`S5y}MeQ!w&4`YGY1gMj*%>#$l&C>T z*lP~H{1b=RiTxKufMfb27P48E8P=@g0?Htp{v)QDJ`MAm3*~r7stCZax5o6h3;!dl zrpNc7tb;85Fep)3x5*sfrXKhH?r6KFHscDlx!6GJ9;Ndu3Dg;uA#Ul7{A?}06F6?y zc4UK|5ZaYkHNr+cU!8%${;(0}o35=stw8YlaoB^hQL06!uXTf4)?dNG%rf&#;E*qM z`q8S`sDSOt%QHErEykwrl=FymZZn&W$gNSxVKag4pv^Z!?JN(6M;J`Uwd2ojJ3;N; zVEMJJtN4ZB0#g5NV3~+6{~(kZDM7FhJ9l61t@Fj2&Dx|Hs0LA2KdAY>%yON_ot1yeFCU%o6hNOz|($_^{tWiD$+ zh!{wvYv)tH1xi8R>UIHmvn0`f6Ix3r15N*?8z61sIz4(2fhXK|sfbRnobXpaQkvku z+Y(y9e57sMZeg-KfNDu`(7Px)k%%+@Obm!LQdVGMv$1?8f!EKW0~&k>&FW?Q*Ia)u zytvJ?SG2X19@t;9nHQ&tspeq0B23T)c$4JjBSI0`o2Jz{lK&p<*2`yE&gnF@>4tUF zxFDq8EaKSDSS&d9nF!TY++24`d{*YRK@%-Y%+Y%5!{AaFt2rL|(8_@SO<&dE7l4mp z$=>WURvf`957iG-ukRl88zvf{YZ<%xe%_Us!UBds67=`BF!d5`dfFO|$%!=kS2tbp z@ZKk2JS8FP>i;rcFB26gaW8yJ+4wNlTqFM3CyKCmguitDu>S#^zK+uu1!%uchlC;!X2&6NBbg@MBL12TQ&pRg9w4D|V+PPv{Sr&HN27i2c$X8RFj2L5H? 
zw-z>Tl>gk4zi9oYVaS0&nBO8^+WETj3Kagmgk7(r7ICtYK?%LZKrxng=W8?)`4xR8 z<N2j~7}%mlP4(=8cO>q?AI1P<+GmWF6scwgRdyNFqyf~LC@=3D@4RI4<-0Eo%T_gQ zoQE(c{>AXMuxCHe3$lb@qocsLZB3hX?!`(Vo(Wq!pky3NIHqdee zvTy>{rqW4W+Olm2!RgIQPJN^efb1wW;z@Tyhede(3+_hkH1WCb!)l~1qrBwF2N?EC zXT`atalG3^(e$fi1^x@OQHNycp5((v6S;N6iUUQdaeZq=f7dm=Xp~s_q zEvgGho4wKBgo)w&m*{NJH?|mdF2br{uY`3K2*Sh|>AVsEUX!TU9L<5;pOwV?zLzk4 z-z3sx8}AIp;wYJPm;J9f3lS&(rPPw@{OpjR$1?KuGQgioAMHet4^H~zAq zg}84tch+&9rTL_^FS)M8R5aysoxlGmnWd8~bjYyG^GweITiz}D8{e6SO%Rl2@S6}G zCk3>vWr%R;m61`t_o>h}qB@29cCBnC^ZcM>Jo<6F5EMv#63|@DEF|yfsMZ)4nv$)_ zu}l8`vvKXwV4eHxqP^a5qJwyb!PT+_D7VX;CwLnM;jEiXG16C70KKB*_z2$Q{o=#M|7>LV%%E#UCR$R5_`VQi!&KL1&6k+AFNo zJdAsg=uVh!Boo?UOQ0=-=7pbJfa-v9FTqpn5hr zk8>+Ib|-$pkaDKu$A-2#noT+}1casS1u&j;O{XBOvnn@-ZbD=5B6P|%2+fqM(+if| zmbnpTW&Iv|PZ;RvM^U{rMSHf`bVy?NOo|VRdBbvLF^uEj0Cz_q-bNjlgKwBr#d?=NGk& zyP+%grE&z{JusUt?9eP>BdW$TKjK6hkVDnmSaxos=?in!`SD5he9E3WC9cb|t!cCl zW3}j}=yei56%pup@em3z)S6MH%F+o>bz-(B{$kT#G%RaJZoJm6i(H71CrH1j)E{eC zW&13<%+$!|mKMg{`@grFdMl;uTgRxAobBrz z0FIs)zB)j7Q4>47PfGx&C1=#>jmmk-F{X-i`veJiqYr>V>;`~V`gHP`2|K>DPcm;) zt{KOYOnVM#DQ%+ZQJ(PZrixg$zJsfQ^}WfYPxTOChiM%Me}Zr51k)Ey zQOjB@WTRv1is=i1@-{oZA+;x*5SdCEeQ3cQSS2|sP-g0_5Hs=MP|Kt?4S4vCIZ~^( z&;$+4BV~>2II8LmfsBXfw%wObdEo@-s1u+5-oP_R46b!5zK zhidG*IS+m=oV=S}pQjw@R{>G_VwjuFqqi+}?RMlSUDLnAtHW0bw2!|(+L{7^e4OiT z$*(Umo0s0HNu`b}ywj=5j)Ftbf8tuA};l)en~l}M>k*iSDWJm%QMWxzm{qR2NhvJ_*Q$V z=Q@wf{23xDSGU&YHTGU%N$^5{OQ)KHtZv9838N6F;sLFoO=?^7Z-z~&!*Q;yA>S!b zIu5`R4|n3c`!Acd5ySNQFh_m;{5E`;zQ!=c1Th*gkZI5AK=G@rIbL=xwa1GHx$`{O znKfRJcSLFWV;^9x}Wfqt>O70HNPuqED74ty~t!Tv62yzZ>w7u514sY}*NhFb(V^7BU zQ>tCoq|PZ)W3P#3%pYV(!XrJw&)KY8aDj>nnHP#c8P|XV&{N@d2sxStAN+QXTJWE| zY^lqGI8-XsK`R>dU{#Yv%3cZrP&lO_K}2A19H=<{qU!ZOxr7oYPnbrl+(h`+8GdWy zUm)-^AcVb?(SAW!?)u&ET7IHq#fYauhdk439ifW5;>1uFBqrMJ7+Op40x1v<5#!~L zU-Fwe6uxbZUFU+lb=T_CBL#t)VN}JRhjvu+fu8pX$>foMYJ%Fx^YE8+HC#Ij%!k0K z#lxtN;GNGN34Q?@RHMd)f2Fof>;B{O8;V!nl`!)un^>^g%KE6}!LXV#TDuDmpnWwG#0NJiSGaXoCST9|6^3)%CGz4#> 
zYA!!?PaQfeYEtRK-(>8*xUxr1Vo6Su@nglM4$c#%^LW}4)a7T@k$^dB{Nb@HwCiVA z3H_DGKKX>p4Zre%rlQUu5^oUaxC8;Tkd|&01ZmB^n#r{c+!r_k%8(}ggIL2|jk(y8 z45OvcQQDO*oBJ;YbsV~oeh;->7xdOck1iUKfmy<*knG8@{&3J8h9+4`ctu5@S(q&h z@3f>TjdBZBK?gLv{oCvQ65}5qRbH8aD{|4XrK=f`8l3)a1}&?p6mb34nRD^(_jRdN zzdB01_-Q>cZkmccku>gL{9WqIH}a|Tnk^2f&v?`qE{&mejmt}Y-7z)l?FIn~gVSl4 z?>kg&*uOXd5p(0EaY8|&H=dcm&URzmcX$@Hr&kZma#Vp8#$Ht3eUz+Qpkfc3`5wDI`cuSG7tZyi zf8>ROL9@`%C3%IastNQMGf;BMzkjT6-Kn1pHmjYVM zAI*Dj77#C~6Ci)rIw*|(a`B<~9#xWg>x(+fq~&P?ItC|lqz%lJAZZ9Z7AghEp`>cb ztRz&qa=^n2pV+_2^!Eyh>!1u58&Y>3E97o+c`GY_Vsv|i)CaTI>%wuf;3^~z;O_$C z!;4IRo@K~+{EU4tbZ~WRLNgP<&v4bsBs9wzdV-yx61JD|kt{~jGZxlW-|Ae4#XjB- zy+}~LFwlX5SQscJoSFyc0P9_2`#^Qh{1lvN5J0fmkJdz|nHJLf_pk<8HKE>Nwk z^Oz>#>rJ#>?qR%KZKkOnaY!TAh~%EV_KlJXpSe3mJ_rBR5&nPvx>hc`w!6p2R6?)F(|w+OF|4LktmKYb zDv0igwW4#$Fj4Myg16}C5kBKd*EoEU*TBuqOr4RSK&izMdkeRGc+NW0tW~86jmw?X zUfO4Sn7Ki@o0<6%W%z<}+|||F#npWYQtV^8&MP}Kvr0G=WC%F@XQY}v91uXoJNmf3 zumuAPUwZPe#Ri9G$f%0h0YuF|$Kve6Yyp2-1Ebes=a>67(0ar{&JE+G-5km4{cMr7 zP)3HVu)c!7ONSf$Ggrx|bbwGahRMiNG+1L7elS~=X7O8LXP~@xIk6r^K+D)K@EW80oi-3N-!H;qkp$9rz^OYXf6YzI#H~7c(dK}rd3kEjFvE?S>$$SI ziR+14%OZ=%a_X@ZCrXww1NEQ^K^`e zcaW5*1{R%qzXQ&iilX;ZCRy_zY#P)%`9`~oDFKmIv~H+KXH@+4azFm2-WH?Oiyoug z@=Y!4!ANG1!nF#6uvfpYfL5=mk;q`MXPl_q()vvVl8LF*0bghi;UpCTDUyd2G2G3I zIUF!jx>@SiG@0-dKk@>GCH=7lViK9LONZGs)vY1BW^mifQ7-WS(&H(r6Ydk=r@FxO zvX8}XA4jGPw94fP6wnp(yIHXtPQcL90<$#&8Az)PX(I<3{ANbL3T2tUdt1LCR>qSg zQ=1pF15(VSI(fOUYq2~GaWN={Vdl|RqOLkJ4IkXdN4c0Re9hr}sBd&3|GqY6Y~h-Y z+G?}vIt$ow^fc4_TEVYyb2om;$hcigm4s$Tf~BX?XJ zSaV)&+?vo}(Fx3Do*M?5Q7oa%s06RH=joiHj+i6|ZlhA-h$Bl1&;;PF=0L=!HAM^P zf#ZRoJjtxv(gTEEqB1hU&`Pc7zg+<5)nJcmI7Mp{rnLu|ZeQDMV~UR2UTNA$HE6}^ zn-6RPh6iOBXN;ffEaBNoTed>rKDoGE?;YAlGnB&PRFJyMRXzcaTVe72-Art!r$F!comr0_Y+AWXx)1vPH zHVEw1A;!718+llEk?vEALz*)&X}6mh;F^idP6kwCY7DAVTej(YXZ`$+tec>6qo|HyLv~{v~>Hi z_|idNbe3@V$)R`}(e9}h`LG;x>VcQzsQm0q6JtvtZJG6=4ee@KAx{7t_w$G#w@B&tl2|NEo3s>93;^hhzZ_gC4Dl&_;Od6H0wtV?)jI2a-gPFoQ`Orcz!ZHge=2SgzC}aTa7}qEJ%ML<`A%h!FZs~HtER>|Eu~_7ZlVznD$1vv 
zo5b<+IYeTNf4GR+%Y`cX?l5#8KM|A#e;Kk|Ba2$FZ1tek8fM&SmYpm=%!0uD_G&Xc zZPY`;Gj^8_1I6~_2~~D02qjIg5r5usgv#|QTJW}xwUI@?-wBXIn<=h45~UIJh+c6F zwRRZ&BLmyOoa*-B8q=!-_$$SjmP~2-m=;~>IfvV7x1%mK!yi1p`f7Dv6q4 zHpxG;5c?dMxnv%OqUqEqo>b||GsP7MqUd9~8I$_*RajI00kZ_Rnl-dqB0h`UB~hM zAF?I9hLhh%#N>YwuhOR9OoQ4dlM?KDlc<(JKN0Qr0MOYTC6Ad*)dUtmD89$Ubz=Qv z5!L((`sNHBnHr{yYx>>I5PkbIzqXeMGrXRKM)ra{Qy{H8rF^>m>TbXEIm2%PRbAzj zeWBO{ZMQ6-@veg4HoT(7Y6WU;F#ToB6^d{qc23svf&>&tqkp2GsVih-UQJyL1SCF{ znv3GzGJ*>;bU%E{?FbSSgtoa=Bx!@EurUeIeZd(iMv=+JmM=FDkF zLPSnZ!-xFs5VHy^eXuLS3sRtk0}Sjk){%z=85}F!`I;n4vvb{fMG_ALdQAlFN4SB$ zT%b7 zrn12PaT5RR+uWaBc31^HF~APxQbgpDyp}AHuWi@2smvz(dx77{|}n3vwo4TNUGQ zznh_DhRvByU&cRefTkdgp-9opru)LZ~la*Tef(aMOnK)&9l-17T$q znI<;5Do$@Yd=aDNC(>!#uNn)qXzs!auT%b;eOgjX)O$HWuQg{1*KW$spMgr`zPCbvs$7@LavGNk z!r5(SHZgt(31Ov~Cy6IbF1-EDeeHes)Wir6L751CpKeNL-d)IyN$`6v9RD?k(XW=v zdi>P-g2MrFH4gEL&;Wo1NZm5LNjwnXZ7{QKJDiHB4-;P3>TAMXqCfNhguFsB*8JZ+9s~TWVeK>MuI4^dyr*+QnuKn&eBVfv~8SXU^r&Ca4{x5WE0vQ_B z=6v^r`~vtKKHk@ ziA{F%5+HqlG`1Q$?Md=O+VA`BQ~cDnj&h_&P>W04)Q{_ zCIUP>gmk>z#ssgCLTP5!m3UTj!q)#LYflt zw)7@OY}RyEa^j2vhVBxY*2>nll9uxFrkeDkR_?0K^ismARCMxmN}5uVPWEPko^I~U z%C7PjBJ#2trp!{xc4F%4ax4;Rtp8i~|CFU_VC`bULuklmr{qE>EXd{ByMc%!Duhw?nb5Lp(5aEY3e3Q&+h1KCTedlOsB3SNae|4&7o{#Dk5zw zsU~M4ZfPyaY^y1w#>ns=O|48kME*-XJ4Yv8?*CrIcX71fAvAY(w&$dyb22hFu`zI> zwXraAv~#jEb*444v!SyyHMKA@p<|?{XQlh^P3i0noSfY39E}Nix$QasfBXH~4&TY% z!dA%6#nzdJkb(YxUxfHh2G-6>7M{PFGO!Zj8(P~LS^eMQZ?}RL&Q3gpj4Z#E<_1pY z|5eAzOo(q}VQ+5YC}3@7_v;IRq5nSj96TRMVVEsO(p3ZIOL5S4XNY>B|T(V4UCxW?93H~ z%#9hS9OW$7W!=>r&HjhfU%UTDn5mj1I~}vMfVr}#|Mw#N53lla|BvGDbfNo?Yc1?-d1tFjM}u|11qF#j>vIDx?VXVL zw1A?xYsXRD-)`j&=>@5iRC8t+@)Xk93G_jxZ;&O&0k7*=kL0Rz;8(0xEIra=V-Bc_ zf0nRgcoj2CvPP@h-`?IF&mWz3G%o970JOEtg<(88MRS{lVKdymvR)+?GMUK$ni=P% z#W=Ky^gSdHfCQ;B8Ie=YI3qI)9J45T2&#lJ)4)LFg5Ck;D!y_UD+<*8S0qjHm zdHfs#0_yL_508(Je|`)PcAn}X0?z;>02x35K()Wh|6>uj2hRfo`sS$*pZXg1@B22O zqo2YUa5~5B0qj5J;-?({ z^xRML$s4R3Qeedq1*?%1Se-b)YQ+UsBS~`fd@ 
zt?VrX$<@`Qn0UxZU0jXqO^s|F?9BwpJ4a5MkKpYScBmjv(5|9j}0I5J4kPc)3nLrkh4dehH zfm|RD$Oj65Lf{in1QY`$Kq>GUCVXEJ5oiLMffk?@Xam}T z4&V#W33LJ7Ko8Ih^a1_A0PvI}hJayU1Q-Rr0%O29Fab;gQ@}JZ1Iz+*z&BtXSO6A* zC14p?0ak%EU>*1lYyg|U4`2(}26lj5;3u#L>;nhDA#em711G>Q;1oCm&VdWy61W0> z1J}R}a0}c4_rL@2$3hJW@)iIEKm#xUEC2_<0|)>j@C-l#kO33`6+i=?1Lyz-fC*p$ z*uV<_2fzjJ0DOP|AOwg2Vt@o71;_w$fC8Wdr~qn!2A~D#0D6D{U<8-|W`G4?1=s*~ zfCJzJxBzb8CBOsl0(<~JAOHvgLVz$J0*C@)fH)ulNCHxTG#~@W0ca+zasEg1pbP^UlI5#0)Iu|uL%6R5y1QV-$9zJ>Jye1lDp1>+IXw> zde0S;(HpdXUy86{-WDgxX`%llS9x65O-?i={%VxDl|wYBUw9*_s)r0mO}fZ3=a9)H zVffOq0Op;3g}9+h?zB^BS+mNQgxmZZbII!pp5Tvm`KsXyu}1Xrv^Ly5vE+K5jK~?r z3pK@i+;)1-$NI$Mp$8~9?!+gUqQ7Q@2dE1yDQfJ-k~c(H^7PmXuHKPtPD}h$6h`-Y zp&C8ki8GV$J~%kz@k`1+MEyu(TS0`2E-&J_PhyMQL?hCp|6LaBPNy>I>QeCIvzaEM zj~Si&L3!`#zbeN{d_m#uHDHzDJsNK$Iy-!3>gE0Tx`pp5{R25wqF z2Pyr9kU$O+J#x2c5)bEQYdo{==QDya^`f3jPJCX@5BJwp@wlz1m_ix;WH>>6w+njC z7Z8oRUOVZWM22!Vt0VjN$l5H~l%bWEyt>92-xm(_yzDC9Ba7;cD@dMjWt=~84&zFNwoE*qqzX8 zO}u@~?cPYe+f_C4xAvL7Bcyj!6dnyy{JAvU^5<=>Wn>GYAFm#msSy=!Jn4*+#E0!5 z(j%dNlfS96wCuDb>><&}#- zqa9wQlSb!~u;NG%@paRh{l;nSu;!VOue1p%w(|B$Vm0DY=W#oJebJYI&dZJU>q8qjzN4Tk~zxBjET;%K5 zlb_R>JKL8pO4q_0T+@vV=QVNYVZ^slelueHW|YFSaO-f!)Mvt$s#L>d$Lh9cc6;lRmyQl*pZTD+sa)KSd(>Y0*LkXpnMyR<;(H)4Pa zmA%Ew{ZOpPS$#LN#giyusbQ2@4(rnt?A;*Pk@SkGDt;NsS0&%ZuMei*e{MX(YMhs) zaQoU@nP2Yn0Vg3Tovl&BbEEwFUtXhY-*N&{4v2%kq0`^MBJJ_Mr=vmRNxp;c>#0$X zDeIW)7B;JdtF1YSp%ahyn0;sHH)r2lk zQCOpD^pL9bkO+eMom|za!nstv&Js16weFz3G}`3hGg`8(+yZy&Fo;j$Z8N9L z{pJM%vsbMVHAhOAA6e<{!|zZw-bv&>lt(eTS#P|A$*-IV=CJqp$TTxx?>ASThbC8b zgC*Cn5fFy|E@lud>;?DNywcC4Y#TF-h=w}5Y3!f7WicL?w~CRZ6{i(V&b;$XrVd-Z zqT*f4nUM5#5Ui^zFn%Rg#8s`2ZOT59Tf35l;!Scv5_&(2@~;*sf1z{JGY$zoa0sO; z;BIUT?M%CU)%rl>33nihZx)XB(=Rp@TP|<#Qe|`E=8L8teV`eSK&g!J{ycrj97nWB za`FC`BxA8r>>tdsSuWmuG3CB~5wB}P$}p}!1k z+xI$c@zq`5c9H8}cDv-vw^hTRI-N6liwb?xTstwhu&`rXg-q@J9s8EPCktJ6XwCc4 zLM5~0T3y#bE0p{zzvv@|S!na#fcyYfoCf0pS8b^nGc^iCVg#(Y48|gfUX7K*Ar*8# zP27v?=Qu|eu$B+%6z>M?HX;uiSknqod`PvR&|mwhsViA*AQ8<-ZQm>8eepD3wpdP^ 
zK{(!vN)~rYvdQc4C!%#kd0siJW;Ka}*o>^N!%l5jIrUxMED znS$z#el@w;y?rdLPWIyu%P!vpC`&PXYl%r3?;lb9$OXMF6v87A1m}bITFWq)dW--*`PM9wHGaoZxg@0{%GLHDxcr zDuI>S@f~u>1VLlRXG*pk@3BgQp@%3p>HKjO`)ynC4|NG7e|jWCbCPR~HJ^v}J1HC( zd6Ju*&rxB#pGq#59g#=qtMEM^CXHo@s{0rWJgNG=9M$24!1StQ<$0iNyA$2sg(vT>evtb+eZ^)fby9-<4o$C(qgn4b zP5nLdT~=Jw&Ri`V9nG-vTH$o1eTa!Uy8!+v6<6M)tKY>uB>r@f`uw$R6ly{PEA z%(8Yzc|{92L%+L!c~$WF8=~u zla6%UNq{hLX~Z0VinR#u7Bx62ryJ@!Mlr6dcD?uSs1F&$^|-=&<@0T0%j&vTmv9-l zK8}mS>@6LXmeqc5J{cfMox8DiKVAy?^@N3IHC4%CCDqZsi4~>sR_`lb6(2$$l6Mh%er^x1|5Y0Lb zt+%(aPUg=V-2^-ub-7fahslUPe&=|VXGDYevDo{TM_#_A1|5L_t?2!NH_Gxh>i0-_ zG*`s672(@26Y@KGe*f^(-tmbL9MDpAM#QN7rXl{5XZR5NZW@1* zy`RuGi=dmiwUi+U^W{ZF3q4D22&3kXQ0|ZCvGcV=+yz=+i};rnYoLSAuwVj3e)al7 zIK4%h80^%T{lYj%l{ZF=kv-p)7)rZr#p%Ez^QZ&o!$^wPTl z|JDsQtYAQ_5f$;*6oDIin_FM7nq4j@iUUu!M|aSWqR+6zZLE1P)c9R#U2&OyTMc8V zls`Atw9^_kjp|7-4^nl=DPzO;!GN^)Y(0+}wiWGjsQRPz3j9Swl#PVJ;Y zW5+@&4mU%i!IuJPg`)~ne(2+`kaT_L|OaAb=75O}qDIB3Oyg8}H^ILu+8{;^c9hYOtdjgwV)&a_X4(la*PPj{nSFyY5#^z<|b{M}ioL4b( z^p*1q+}vorNcx~R^bjbeA79V~?e}AMkxOZ0`YyR;#nhuieOhHFNd7@s{gRp^*oavgVJ(QlpLBkN?WzM; zY(iY5BK!XEviJMvX0PC9%jOxHBQHx%80)e5i;fm9{l6=>D^isSc%Mpx;L#=j3UcaRcK49nkR71D#Q$H=hokQjlwzTtJC4e17BEl zE)wcTE#%9;-@Vqg|Lb*J}YxG)ZgLm z&R9wL@025p20usdvPWN1EK*tGcPC<&^lJN?d}wM>9T#7D!=XbGOuWn*3(t73e%6n^ z{tQaWcp5S5Qy@&RdB6?j;C@&ova3EtbZSVyB|~mV!ReQ%ush}#_6ek)(SFKL9|s@f zJAa#>e!C;#>w}@A{o7aT6auxGGukfXjEs&2+49}EBpIJbVe6zaYO|zL5_#PP>+e~g zcPCu(#-mmdJjAyMhQm@BF;KfO&x|mT-yZ(>xl^x{;^QZ?BextLg;F+3Hxz(GS(Cq~ zdcUA~z_U3fmO6h-!Gw$NOIV9%N*w~e#1Dw^Ozv*prbvQjr0 z9OAT{*&F;Bxm-uo;G^LAp~MFMfp2*pH?fmkk>meqzt6NC{>@Zwxh-|85#0{$-T6;DW7992v#gmwD|?p#1O* z<7(2k{hOki&lB_hOxmuxKdtuXOB8jnMI1@XA=&y!ug`uq)?!q8Frw6Ku4h+-V{Tcl z@S`3tgi|)C&8?rj{dPE|hFdO7^6HFye&fT<)~S^}Z45i4(>C6G@un_)fx4~IOxeBG zYw3dE{NGpcm82Ry(zT<|khYrbHubGp1T*5_&j(iEl%phdA1i25-)Q7tMfWGyc3D$g z?!|kvUo6#3nsF!ySds3$fwxS@aB1HUf&z+30 z$7cj5IdMCP>=28z=#FTneP5wsb76>`fyMdr6_58eF^26YRN{>@eY|AL&~gN^H_fm* zpN`vxod$`=5a<#eCatK_Z`9sLH%0lu%D*yushcx(C8ZBRayR-m4fe-RadlT1zg>|+ 
zF>!yD`{}_o4~#Y$3HxSG)B?L1_$YMqRTqd@_-#0+_W=~|v8bunwMsnNU5NrLuqyB6 zzgq8ADM7x<({2r?=D1jw;K$H(3B5#VuJ?W9qN&Zer_CAWk%HOBHsdHoS{#~f8$mmk zisyazk_G+EuAaqj?Vb&|UlLl?K(lh~MxAj^>^unkrt0#8`A0e|gLI z-fst8u45%-En)3$ku6>a9?XvM23jv+)M=yAVEf7G`+r72=T_@I`%vD9NA}_TSFp+O z{hal-4Rw0>vHRw_`+=6V2Jd^Rudnn4@RA|WJKyMHRXiW~;9iXNS@4&+iS^?JQUX>A zX;t>EyIYgmgj5k4K|8ep?JN8^o~+!ekT-VT%Y%iOcTAZ(BpWP`opDDfXv~eP9usa} zq|ul%*V;v0x=xQs-gLhLF7+_g)Z+}f$~-^Vrcp>;yvTCgPj15-st68wxIFr0kO%)l z-c2|nXpj|Gy3^zH@v%+o&x0DgR&yI(g$YAPIjq2k%+aKhK&X@U;zt6lXXW|?&Co6; zyzHMvSV;r|AlUTOgwUD%COa;1MhdlDUaJ*Bh2BD#&W+c6O3WVAxFr%-n_H!pt)@PWe1C+^O5>Db)qyy z&}p2Bwg48>D(TLjSx4YEU!;i}xJi}F?vszcmR+Xj?KojE-qPbDgaU#qtA>cDeIz2G z+Leoj-`HV7Sn6WWj(!*%{IIp5pUw>txM0A>Y4DCdANsXt@Z!fftlh1@%qb*Nlrx0z z5B?85BaV?}Y|a{7v5nv4&K4KrW-`Wv?I!H(J$IYgxu|@KHKn7Abvn5Tbj_-Nyh8u_ z&cl3z{LvPD&Pvp0DdxnY{r*Khrfm;pub|XTpBH1-fyxW2$ZLb0hVwX+w_6C6#WM)g zlwnITmZFq?pLX1>=RIe_Ayh8}GtqgqB2Wvi1yk@YeRUNAbGAmcESc=sCr)QI>hq4P z4wDX zlq<@}1PeDpOlVCptl&0>05K=5xEKT-ktLW)*!2T@4+B<#w!ddOBp+78$0;pJnh*y{DmVH{XKB@FC>b!pbkqAO<#x91!yB(e z(jSzJ7aze7ghBB<^cLEwZ3#Zs1ZPFdjNf31UG*KLdf}$Kizvk!_G3*>!+b=H)l0}M zrl-w0c6}g4wbfwL>hAWZ26 zd$Ki;`=!$1+MHQyQD88}8@=VD=JhNL{LBi0YP>|~+YQT_ z#@eEdxGKpfWE{NOeKh zQ*+R=&)e-XqvDF28IKIZ7)TJVXXZ}n$%l56v#g~y3V(vfOR0;W%sCEKHkKkPsATq< zLH1p$UFQe0S}Kh%Jn3+~{ITn?S{T{iK6FWaZ(M1`S)3WP)E0mkZA-UhPKRiBI=6hs zEGruci>#)IOy{%7``}z(r#?;1?VFgu+K~S>KawF;-E~YQ(VTWkY6j%EX4#(LfG}0tuD14gwczM36U<#(*{Gj#c9P!Aw?6O#1rnr4a z=G-Jm|JlRO{P!B^-SXnwqQ%C_%F|@9#StmFAHOX0gi|JJ{K#03(slVN$I(ls!?sIh zjB=7=SaiLi_IuJmc`!!GW0S4I&(g~x(G59#&=@|kE;Wjr?QwUbdIoRtw>z<7Qfmuvx|ID|K|IxyVqBo{Ya35Z;%nEJnz&SJi+~ zQ#G8c{H3}h$C8f({gP!TIOZ*fi*A5m+)KZA^p;)~lC4^+QaE*&a~cpxiJZBrKXa|+ zcg#4d91awTCRLx)*7+k*xtPA+q~1T&xoi}3emUed#i~|Ef^^(CqjCC1MROz(Yrw**?xCUiOFG;m)ac^ zVlrYHq{F}Tc|(39E`BQ;rqK}fIDX|I@jAcHhpUQna|KS%FFB9zCQF%X_}LhY9qK4? 
zy-ys~Td#0o348UrprCHw7urJkIJJ}fsr+15m9}0HZsk}e$jkUxo9S~F?c2kGr1s|0 z@$d%*F#_Bl8pXNAlG9CGNv)Ryg2_X19e=A|XTG~Ni#H8uQ$*fSra*H({>WFW^B9g^ zV}sY*m2-1v(Bs7L_QQa$2JgWSY-=G50ry`U@iI$P$TZ>1-@Mx<1)U0wB7-*a1nfAR zhf0;LehU+Vo@&{# zkz;&m=W!TI`#Uv$1PiiMgH+}o_TeJ3+;X54vM*y{h>JwMQ{TpL4)yRkUS`BYGx$njlUyr8K0w3B4f#~?O?$8|$DllM2}6w!sN_V@`u|5>&K z^J{*;wcTd4v(g^|muMU+@B_C;di~p(KT$;V9^ zlaK>W)V5U^bD$)r;u|{39)=EI+$H6F?$W)n3d(!4FemsGJ*7vcq(6iM1^=zxP8r|r zEFoHql&SDnBGQFK-Jo=JiBItMp$S82sd>AXO{RpVbOL^pCbrDL8Hq9%FfSBMP z*wG7^sOMpkX|ho4O%dWTjwUQN&Cp@R5KSN!7hpJzL2`X@!bkq?JE4X9LSp_W8!o|K zlT!EiVb3?XoB|YB3&`6{_JwBw$~8HpyZT=vD)p3d_v~~GY&q3NpxR(wZmD%^p|Rvj zWnR9*XOkx}+o+Pbbx=)%p)#RHI(tcIdnqskoxWfEIh%8pEKA!yc6@vWM^~0r9!gz7 z6T#$}Nv!%+NHl>*ZCzKpHC6E{WM?(;FSMf5>ssn24=8<#I)RzzF3-e&hAM}`K2MfV zeB1kb5uVewk?2miGc-}&kPoe!8zV^QEINLfWL(4M`zfG~t(OqEW2 zFF};6VbG~0u_|GpcBEW_K3@X6?rw8VPVu2&_l$+*IA25qaa(ot9^KYlgz%!2GxauA zs%KeclooM2NO3qiT6jC!|6LtqoG^-abjX%_1Fgiq-Dip7(C-ps{L3e=NfZ)zM`sBu zvt;XxuK5k4`Py2{vgQMILi@Ja9g}{pQ@zzO+s5LQ&hxppOR=~ zZW9J+S)$Qo3coR?^6nsA10`b(k<=K3% zFfTn9gYH1t?`L{ksrN|UuD-r|?rxAfGl~1kv;;i$_inOmeTQPP^5@&*1lcMbZ|Dp^ z%;{vsKSE1~Zxxcd65%2Bhb$oAUe~$l3*N}isy6Tuh8?+o%I?7EAvdCsN^4|!k^I#l zqGnfx>YQl_c0lc`<-|wXMaPqu1FKf|4TJlGAwS+6oqyeJz3bx8+HkaW9I+PQkJn2* zg!--#+NpO4J8OyoaUzBT5n&+&2_@=>`8^L2v+JNbN(jy6nYSjQjT^e_Nk)60E~E=w zB6&owZ+RZC6rt-EFV;lh@7p(W7Dv z`@x~l6|v^yEMNZlMXR0$<@HCe=vK$R+Y64Q89#r&XDoM=h~(rWdoj`Er%a)iAF`7> zT(MC)7y>_ON~94ljf!TVNg=w>ohdnpxp}6cur&CXrxuQOXmONkGqyn`MH>hG%w6Jc zm1y7jAkfKqC{%Mi~Y;CEO`rtImz6eP7Nmn`)HCr`4``*&{#{g*X)4T?|np<9RQOC?_uusnp`jUaFFk{(U3VSYZDTfYfl zjM5u2>?ZI@_WxF`8cQP@H&#i|1Ldq)o5Dcj>2qC$`mXlDHH>`3*k-;}F4Nq~RUv4a zrIblLyy1#~o}VkuQj2ihdcR@CYZ&DH1-d8{V=e7gnymMyVeO?Oua^&qDCQ6ClVtYzTYXD z)?*DhgqvnLo{Y%i9eTN46g6tY?7OvimE+02zc`lX; zi1l(>BqMBm9Cz$52j@!CIaykTaiH4%dmCj0V8Dr&+b96hp^8CZUAtM@$eEHDz z%zlJFR@o~zvixq7ZkMHZY(Qv$Gqhy`R$o{b%A@$SDJ5Xf-jesZTdGZY?|eGTJ7|Iu z8ATgXk|w^QTsxxj0?YijmEFQ~*0y)%$bmmUhrMxLOso2AzmVi-<#Z0oX*|;w7}mmU 
zJUryimVyOA%X+m3M}4R4-fVb4RQ2`cSsEU%tW(V?5+ZLmOMZb;$m>E<)HWRD=$48e zV|KUNcTVB~JojQQ?(?^6#R{323>#h>`^e22L0k(Q-3)an?jJqlcM}jH(s3g4L|6Z;BuAsmK!)V8UtrE6xZyb%ZaSKB#Q=xuCAhB@YW-Z^GmD)~I@dc@;VtJJ zpZaVRp)PKk(P0(BhRWF$g-`cw*J9{JwVKca=$q|*_}mw(%W__YmUxpg8dX+=L^c^x zu6Rh?P;c0oWg1#5T(~%cf4(J2E(rQoo~*=&O%=%0^Fu)vu304FxSponbb@2$cS|rw zr^?8#kY7oNLJtiL`wmUwMzXd6X*ti`@+quZA=;tbiATzUWCCg|QIcvTi{{YW%U_~7 zeso7`#tB=uch>MQ0p6{7p_t4Lh^X58X6-{)bLzS26uw_+xm7+29=;Z(5u~mrSNpuV zkmJ~Tm><2Fd9U`7O_}C8r6h=Zt>%GL%LBtW36q#}A;CTrJ$}ZuHMdirN09HBndg;0 zG5o}b@?{i9jw~~p7vAuDh9dDK5S`IIsJ|9gs|%CR9L;yzVDs=^Zzj2I7(+eZM$ZTg z^WKzD5_M=5E8wl(RY{r6Ume=`NYK#q)$HJ|J6XTPWX95QYftlbp?Y8-_VBUMde{m{QH9+y8l zr{nB@&KYK$QrB{*GA66J2X)q7@Wowkl0;C>;~sC>FyoO+{ce{0{A#X_42{H)$vn?! z-P8C?e@3m2+)iS}VwGL0qJxTuvIWjJ=a9|SNygkf)btK5t9D3Ap3G-h07=qJZme}h zi)d>Eo)_=g1ez4mgpNcaK{eYcFOw@({goDUrrD==-bl-wabHGN;)xcq*Qoaxh*&Kf z6bCdDSGfoe4Uur0TWr`R9d!=i6yBExuq~N3PaO9yA=h=RLgsN5y=++MQT_$f9`H=} zOOFR4yy-e>Idnq%c5q@rJF8pS`8OMETmEuXQ5eNvTeH%d1$dGWqLos`^%^zN2F_ZK zbL{esD*LsAj{ai!xxcGuVBD(Bo_h#7C$8|~(5)}Z1%@i_x|w<_2d7jR)W*b}epyu@ z*rm>#;jlvx=V&BY@%b41L9(Q|rK7Q#T=N&9U0=;x)o^a?+Mm%nuU1D{nBaOt_sw8$k%J6)4D09`VwZJE~#DutqY|_+>W$gj8w?nTg%tNR%S{b)gUX$-$l6aMCL@7`{ zq|_}jl$aOt6Hgf0NHtny?WXr#J_pD3!bR&K6qII0xCY^AgJsVYL-lP#f=vU8Y7bR( zDjNIpKm{6fe+J!RD}>91JGr9Fu4wpA_KzZ(4Y5n>a2;PGQ#BS`UrlR_HY~<0Gqk}Q zgiU_E(BXtvTs(hQlFW7*(V=qjzA{W`b=k8?=e>39YnFG^+vCePH;J+knt{Cp*e_^0 zUe^#J#Vl7yi$n=Ma3ol#F%D@)Uih-TUhLJlG0M4bLRi>DF3)QBcaFtWiG9z|uOEXn zpgVQa?OLq}cllVUxzH4&EiAV6!7jfdL~f*1<&bnZL*pBS349JCo1t3ZP%g^4DndDG z7CEF{qO7@1pCRdFmilJy2lKvYmAa9c^Cn*C1>_nV@g|3(bfVnVMeM|9T)RYeESr!v zLc!{kJQtY5cYb(fbW7sP%{o7BEmxuRX-Dy-2b)t} zP@3Xvd)$lXh0>dWj`Tt+;<$2BJz6CP_%B;uUT2nJ*4Wag8OdH}FTu8xQa5m`AJZp( zePz8BjgU}&U?J<>#@>8CFq`o?w};}n^q`A>g@~!H|4#E4)3)!);*V_YlkzO(QAx9K z)c5Dzi)U1mjh=k!A{ZVk4!7c&v_8f2h^3A8gvdOQl8Jmg7#pUjw+v2l=`->P0ogFeIymg1Zc*WqB)K2(>xdCY@yKM*+sM_ z5|J?e!|6=Xi;B)03hS84DG5|d=~|yp?w_Zd@UV!Keonol4#Lo^pSyYgaKy=yIXqZI 
z)-$VCm%$uj{0RfkU(a3l;96R0k91y}9<4>SI8EGiXyLmi)|8Vc@f({ATyN3@VtM&EA`@%c|_q?ZV{vH!p+EYz%9 z%dK)!vJ{M}ii%CMxVCxY`sP+FI=9n({8s>Np)^^fvDhv={GGQ`KUL3_`PH)B(2{pu zgfEsIo#PEbMAL@EqmR*~iMl&+xe=eSqzk;AEjKoIHU{$oUL$lB2^Wn_vU97R+XI`N z@^^89YH{WBjjm~}$WmS@-t7{#xJicR>TPi|)F%!*J*Y*v-feCVCcK3V{_BYuK@``bw7CeZZV$RipA^^KH)UP*3v%rb9X> zZPV#>_IuSdnFPc)6Cb%xwbr;>A+4SGNV-)YS+vsi7@te6M*pnRUPlq!6a zz3Fey|2^nkkUioR)sHwJmG`n)0Y9y61}Z0+CQ73@L@-?58dVi)wkS zy=Ijhb8Tz_S-U@vVYPibxtz+wda11=4m;1s2Iw<59>PU&wy_eKRwK4Kc(gCjRxBHl<(M3)6M2tex4-ZNedsHUcA&X>C2HMJACPg4 zpT7(XE0Yo&wlcnD(~5=5XFuIII%3oI?QE94-nMXbNs?w<3R-AF&iYlov30pY&;DGW zApZd$GPl%MW}%lwE1EcLcPn>7TszPgRHI`K?{ZMb2#+Sx!?)hyxTr@mhv`lpVnXPd z65y%)W|n(VRCjV@EZjX}p62-i@iE+B@feA|%J-sWp)0PBmW>rlN;#vUS?b&{@vD9{ zzD5s>toyh1pq(7j=^6I<#2qA7#@cr&8|(U)HXFy0Y?RU?hv`0=WO069(YW;F0z=dv zFP)*NqQjN9d5hc<_J&b@W_=J)CidaN!kG?>|G6?o5Qh?F<@Qdo1e;zxklP@@9dpse zgQWe1!Wik7(uPmUSP95@g6W5@QU<=t`q9G1DY#7(mNcGSVhIH83VhMqCD38Zw$Au` z0WpL#84V!=CGw**v`IAU7dtpG$4fsAbB`kuI&R#saN(g|i40L-a%-T|r;Rb*Y0wB( z7`oQD$*@u#2UXJ~n>iH_gzdx@&Y^yyw9PB`JfA)&5H3$9?0U0O9enINP|Fo0{*XsF zz-Ua@lU?wMl&_?4WoCM}?qttRiq0%PS51XfA*Sx`*uJ z!XC$Ga8-W4R9dOMEfZnfXIRPk zTNOKY|4WRm$6sFki0LcMv@YnY1NY1LIKzj_+*`(9bNAS&`z-~B+~YWn)Nr|X4`}Sm zU+Z?tJ}UFPDyBd8uWa3=m|!XvbV79@cher975RjQMU764Rd{BMkSIoKDj-&>*vF{F zAF9MbbKy|Q#hdk$xQQAwyqvjR2t{Fqqrr+D0?%m2Hf2tTVJ6Wh#HR4AA0Yzu596A{ zR~y~w1sqi#>L_<3v1?p12b4in*F!;FM|MK^%38}#5(hL{Bsv2SpeRQfAJh_d~o zc3^_ik`tzUtFC5}hY~T`U#4^F(T|C=`+W6=dFx|~jwMe=0HK-XF8Hg=CW~EBqp7zn zV@0A+Nq?}G{YTu_v5qS`dz|`&4ABM_4$Nr30@C_|@Mi78RcrruinrC4F9tY98XdXR z;$A>G7O+gV#(hCD$>M1>9o|zpBH?w-jcau~%0Q5YSLf6Fq{uQjCz3WD%R7#gPKxcZ z@zWIP=dDJRS>y$Y25cyS5EUzhw>1(klcgK@#wZgA9JY=+_<>#916B4 z$(+4#CMYV(J&64u)7ee2^&%g+nIx#w!y)XB}pR1fUcc-aSudWFLSg%yiLF#T^5hcWx52zsZK zus3|MbYs|*#yHagK0+WfJ=0uluHn)0yRWCDG3*dP zH$U*JZ#k-Ka;rEL8xS{BDwk71jxoQI`&BaYUQgaH%>6YVuOIE1%>x=k%s{Tfuz@7yE zcth2uj}#%<>ogBj$9vWJzNIl4Mc}^}+S$FGZL<0710w`J$o`ICB5PQ_>)@NqA2Eo# zMYm5MArxLvW>k75T$>ÐcUwd1y)CqhBH;w}9C4d>rv1MR{9_0h~)`W@l$EFLn$w 
z=Dl<7hpA2(Xu~y4p@7Bf8R=U@90UGM3W&> z>!m?9R7f}T6n3JoTCU+ez_k*n)?A;YNYZC+6w9d5h@A9Tgim&c-w4zNP)z<_ap_fQ zwk3oT6n`&Mkc^zG)!I1UQ|Pd#6F{?v+Cmoj-z@4ymZq>3n-xVpJ_un^PDE#WVZ01w zEpo>exKyRj{yRNCLm|`%oP?XVbjgUdTf9TxE4K{C1$wROpUi#Gu!PocaER*t*}4Km zIGr}Jp~jnRQ-ROAN&H$sExed)Rq0tS5rk29+M5zcHTktk!|?$WGZE<%VGo56l6f=) zrCqZ9gn;HiU#5K10GTOHYSy~vl}49nc2E%Ze%f!Cz;l0|ZhnwtuE13KwwPO8>axVE zzf|V2O4lkp`0fH!n1WfHcmHNe`rn;%@meHl;MHtsyHsxnUV%&pr-{!C1Nchjbga|RMsRS!5oti0iC~~zoL%%N=&=U}(pq{mSOu&@XvUs%|!lGvec4{QY+MA#uDY5m2E5`zK#x=ITH1x0*Edz~XUdmPyK))7Zkj zURZ2YG~fe61JKN;n8C8GuNdCu2uYd_Pz`$%I4Ck?|L{{zduivB*-EE5$8`^}gO^F< z)u}58@(nqg-c$mx!xH^Wd8p^zf^~u!499_Hx` zQXrl|f@@8F5@@BFX{RVDC7x{og@fWAhl+p-_j(`7WV#EP!7@%IPSl((aY_1bIgKW{ zKq>pp{YPS>ACHT_+i{wOx=dk@n>rsC9T^H!MwsP)@MA{2-1S09klMDxlqzXDvndD2mLeuf=eR_SWCb3DS{+eQ$iaxRFETmJ9Gx5K@3!qnI zXK-3B&u*gM4q(?HmzF%L;9!#Kvcyx%Qhe~`YifJpBztnv#+TDKk6n5`+E6I)BMf9@ zQ(*Y9Re*i939#OyCxl1%NQxtr9cauBu=ZyEcjbD|6LWiH3aoU5w2k(Flq%8c{y3b? zzs&3I?kO*C!~dqNlb`!Y#4f6f0KU z$gpCW4v&)|AR2Rh0pT4uVENE$w2b&dJHxd{`HVxl%ijYA`egu{7;nZcF}XWYgMDAN z{xajtTz(Lwt*AFAiP}}qy)ID`16LX3l<6H(>Qsn=bIBNfONj8(cJ#;KzP|#Ty10>L zBA-YGK=azt5ONpdB@lp_W#C`~!o%Z$Wmh(ws_VSZI;lY{yRBCWx1iphDIAB=wS&c0 z9fqG8Uz(pj0IXB8Hxy$h2S7#XK$c3iodtOdJz|v66x;T-JF}oUDy797fi)xvV1xq`I3Va%AFCByVeCi=wbmvFyEwImt$R>!?`XM=(rY;hQ2Aqn*>#*T^F; zhdSGu66R1Dbo40R$L5)|4!sGjJ0t&HQFT%fnsGrTeO%(WHJv!>M(QLbHnE#mzzBq; zRMfxO6!5P+JG|G{m!il!%0SUjV%ep}Mt1B<}vO zO0*I4d2?6MqUhz-9;&BC&n>-F#CDt77W z_D5C|+Wxfqxf=4^gp53TxcyYnbl_be{Rz>V#uKee>#_ofF@LEE3c_|IH|@|eD?sQA zWb!m2Hi~ImSx@e)b_pSYHx4RUo}<-X)3?_ivp?h~8w%<5!1oOZq@1czk9$WR_BLYh z$$ATYLDC6R&1ux{K9w?1K?2UsPe0!}=m+vAjUGKibXLT-VG{hrTSkU)@Pt|zXPKAk zvneNR$(=#>?xU<9m)xNG3pq4vD$hCgK)VKC@CH-+UEcMec0M5kdr#3S2x1=ee?j&? 
z5{4Cv9%H6oXW3Tju)!l!q+i^OLZs@beGJ=0P4+hcuJ<-1VqUc6jmw;#gsG$J+xXfW z+LR+gt$YDKqGk%}T*O@^&(DLk4B=-aD{1f1z`g-LKO+S|59f)Mf5X2H@pdI!C`Jym zQTz;SG2Ya}?MusylQHUpx)4&eC3n%ZL1x|U-3w*WqJI#Xl*YSATqPdi#_N#{B|$ye zJ*$7{;mD+@Gga@X7QGX6Tfvv8?sK!H;&Yy^KLgg%el5FOjnAQg!&!_#T|5>(6=H0^ zB|YOf+7t?N8Ljf);z~CLLh*gKQAgBJp)rDP3Pr7jSbZzJ7Nc#@;#f*Nz~v=1zy9sZ zwcJ0%Y>mKB4Q=8W?BTK`3YUmVJKvkoss2jE-2}$U#Ac6b3`PfusTeGggo|>QDXAY} zV@b}P;av|b$F)9~cq5sY^hYA005231l10oLpOcXrpU_nWR3R%w6Dp~L*P7c)8l~3C zih3ruTgPWq19ggx7w_<^;-%=idnxVXeX5K%2v>um-axj=&BFS~+7@m*%^|e? z!HLu1goc`A&mKUPnL;qWd4)&9F-EJo_Q>>L$QM0-$0S}npG0}f}+OGo~M z=WB}Jd38Gr?DfV6aKj{4`Hp*^3Z9{F{!yw3TzP3bEk`Mwl zrz=vp3{-n*(kNb+B+TZ%m;8T)@DLR`VYcyuLPy8~LL+$pWki|brlAT3EL}@bZ<&Ke zqOF87up&0T*EQWLiXwX{<(XzVkQHacs5X_ZvoQ=s^a&&EnIzi{?w6ulUz=V(Hd*7l z>UlaPTq)i&NZuqwnADa37r2n&&R+2SH{kZ-1sJ;Jk5i_E0snNyFcx6G)+iY8=lJE-1D6gq3Ibcj<9nwN2RTQh=Wtj_i}sN zfm5^Vd<=j#GlqxyMP2i;3d;WQx+^{`LaFG_oj7uIl^OHae*v2-5+Fcn4l zsepWj)=bN26^3<3E-QmbKOM7I5V zF(l5^kxA;%e(z@VGnN>D{uwr@LiQOOikug|{WKhLu#mC(k`WLz%VM5e!En-WKI%LG z)8^lR>BM^-LUnOH(TG)Us3V#xo=C#0$`RNela2_YQj;{4qQ0nFMo*e5b1n%=yds0B z1-}PwK{&Gzhsd&M8(;o3$bLi?X4c}c_7g;bn6l zj#0y!K~nS9NL*v~O7VEAL*H~W#(I4?DHkJlBHw-^3qsN=PX$~nxN=L_a)05L9NUo1 zd-vJsI*%MGHn5UezY`whf01Zm$BwY#j%Dq$U(SMxnh=Hwjy9%3Vws6bl-m|9jVDzu zpnF{k6HIl9IWwY}D7{xiqkWKQ4YM;Cqw_9WjF_g)$K;9k%FnS~^^Z@Z1BB2!A9Rm0 zahel-nzl5C$*F<>J12NIuC-vn8SQC{30d5m&~|gGU>ztd#KxdPhaHu%{d2v5Xc0GM zU{fCzgXU1dB-BNAD>=CtamM;`>uNE(?TmTHnlbqNokIGBnHI(G#vPEgvLjn93;N;D zFwH>pV%JLRr|F_KO5v6)%GM8H7o>yk>CT9wRi+&gQ|?*AK{=94x)?$|s>*&OU!(!d zOGhFD8>RgO0MACawvEH@zLGW0PK(|mhI5k+<606nhe?fVwZC77h^+e;#*h^cV^;Ta z*GcA29Wl(yMv2L}9j>n92W+)yZADN7FB3!|A;NQzBGmH?ST$0pC2AUC8jBKn7pW5gmZi!!f(s+nuj z120EL$vpZwXYh(nWvLej0XAUnLMLO$5_oelD3e3afj5uT`E043}rGXp&kQz2A98b;TcJPZs_5LNph#hzXwgWW6JV! 
z)q^j?1CsD&jZfFKZ+Jm)ir@Q#OeQ08Go*J zWLxii=#9&z)7g0Z9+sg4f+GGJa#^96ytWW!qWTm;by2e(;0lW}>HA6N@x!OMX`FFO zOm=~Vn;NNnPOcyKYs?d*AZx4mhz3luiKLZci71xpSU zu8{nt3dY2VKi<&GR&FplPD2l$MPT;QL4r8#O&WLh?w-eJW;{>-3*_Toe`Bt&t3Gba z^k+7I)Ya@Sj;%VQd37s;)?AK*!*eI5l?r>k^Y*XV5mj=nNTdcuO>cL_7!}5(?Mr3d zF>&S#Z6&^j;diW|=tT@G$bg;_&V+(z0f$y?2%o8k(wM-pFCMP~GCGg_e7Nc`Y znY#A4;#z;o)Mx!jbGdbkd`5(UoiOgo z6Fenr(X`feR<62(dx{ruk)L7_@jIj)r(wfAU0WN52SC<>tH)cFZqWQ7N&PoWel?C; zP6XT)r?3rCDL)2HTYS$e2PSb!>Nx{xgXLVudAa9o0^8%wMpWqL+6gdnX>O8xZIa{!-scaLKfnyg*2N0@yJ0jCDuIXV| ze-hRD@4tOBvGdB~+G2(#p}ML#fg?t3mMnr-$nh#`mhwGfPDmPSPY@{fEp9uDLb6Ca z(I`L%(c|Wac*_OO;h+dMcTW+n(4gaibJdS&f$_PaVyF5N&cI4|1fexaGsCZ2yTA%k zb%hQ}&hqkcM^_~cB`L#BxFz4s+qOw_QZVKEhjjI;#kdtdke7Z-z(%&i_F;tXlW729 zdvi|+wNtpo2UB*p%$t4*o%AZ0N&!%f0#X^c`KmgX+TC3ceH?$BFhT0Zz47StLGi|@ z0!-3M#34E6|nKV!3lFGs&5n}8`{3GbDrs3Y}vT>xfcpLR-W(lbDe;(d3UgSVz zfi_6aQha9e0J%}fsh*m2%rHC;fYEDeK|xY$vMzD!>Y9=R_!^6Lwe#w6LMiMtsBFvd z#|+Y2=#wZORWbU#QbhKZZ@=EGW^x9)>=o9U-8)9y*2=RT|GpRFr`^JVV#AgN>ok!- zg19kNsot%~O!y?cmP}cOm-3*^Y5Nw>4-8kv*L2>x=Ek)|w+zhDwA{Oq#ph!zxjKB3 zDihNb_3fu!Y;XYWxO5klqm+(BZx!H5#WMV~0i z1~fn=K>iQEt@>i+_`*Pjy=ln4;3j*xm$vYa!d3}U`KJrRzaKNqfh1`~%FLT*1hL@5 z^co#4W{DC2@II3+s|s3o=%S-W77PgaxiEXxj9HxI!x(4^x~d&{Z6g5JP>FWmumxm?c}Egs4;)B`xhI zO1V3=-l&3JeCU71FhuD&A8_n>UR& zd}3kO#L%Q%BW{o@O;aEnEF0YTv%ymq0rr+kz`Ov;K%RoUbFLjQFTh~!+A?diBI5Hwn#Ok87I<7&zPb|dYgLD+TXS)A3?LpeFhthhO`NZ~jjr&;`qCC+ z_50jGy?d_gjq>A%mSux@SRu;1`1gW0S9arl`mzSmtVm7jj& zeTU1b>yQ4MX$R)b#GlT>VjNLWxzPk>iXrVVh$k0Ptbji6Yi|oA2(Jf$wz!la$@`~o zv-(Wib!7Q71Us zD-2G`GXf%NU~S=0<3(`i8sTb7J++?OsOJsmj96PgX{?}pGaJdW|2+MBn(ND1*qsKA z<$M%L(^s?$1@4;DYXH6FwDyOSfUo$E->fldN&G3h`4L4a$NU*?p zNPV%tpeA_WHBa~)(|;g3;a-%!|zM{IpJZYxSK@lVs`n4h1 zZ4Y|`O5>&ItQ=FWv)Yw?e<^&`kOO8M90s!0X_SB~b|2B&+d!C!g#e1p?Qs?jWhy_a zxsC|E>`J3eDzcVR&+Al8d|l>Bb)M2`io(~sCx`b=@04Imr|f`>7=3QZru^&i5Mnj0qm&M`Kihn4U=oIPU9`y zp*{rl2^RMlu<-g!tA{le5QoI!W@^JjMaHUeUAu#qnEGs*7zbrTuC0eez?aN)bnCA* 
z$&mR|f-o0qa~!OOeSl~;;TUL~h(DZt-VQ}-@#X@EP_Et-cio3tJjE*FSYbcPRR662Iz9W4I6a(@e(DlT()^mOyN$GTYYl!hPlKgrzIaoX=F02pk( z)F8v8dRF^u6IdQzMJtJ=YEtVUYXBN(97`=a;XS|mEv-)ZFn-Yc#CQMb-6~w>wfVmJ zK+M2c$ZSYBe2Ls`L^6KH8VG}!D5~1Rv|GT{53!Oed_6*xtW}Hk@Dc~Ppek9DOWoli zht_W+n*l+l{QaOE<^xE$yrtTq!mY06pDCAL;n8W*tHnadqnkXc75SZvd;byY*`yTo2EZo(3!PjkmfS)bn;3+zOq*~%* zNZX8oE_guO``aQw=g!YOLwJ!iB-#9vc=JIg*j8^!}HK?)LsY26z3|1)~I>;8pc9#sgz$n`xvrR+_Cgn6CCr zoxwYQrg$T5#T7OIjd7h|c@PD8VnCWdSu#m%C39m&GW|b#Jz+A6vo-k;tz1#A53Prb zCt0%KVON3&-eIare7LaB`S=x{A24YS(ZBC`#>4ifgjIMM7zmb6+rhU48ktZH^lN>p zf9)|*-4`wW{1dmro~);8`R{?0DgMpyLjxNJT=X}9>Odtfk>l?DSGpyZtqulvSUjr# z#ed_PXhh@=uj9&rI5pqHQ7ES#En3F{%`*77i_U-USXnbJ$fe5<9;O6!I7+}k><_jX zSu8`(jzz!jT@)C|*(X#AvBl*kEzh_y>Y_>W^kBPfdi&_!?OMiL#~!HjR&l;48p6qu zBM3g@!*TACe)DUgG$g$44F#sJlUfbU1<=1|lHf?+u$6Lf>n2u}zm$?cd3Tvpfm}`l zLFVY2+DZW@arB@6B9HE-ol3xP6GWp?PeXp}&XoT#mZ`-}9fk;Ui+ky-PiT<_EovStZ3!;wFJ*11#@s;I>a0;2!UiqtxhWB zLc@$9F?>7~@S(3Nn2ekka$YR^o?jz@oL+u~&8+%?V?uu5*o&fc@xSRc=B8}W<&X|w zn6n`Jp!CfzLa2@kS9)GvYXO_D?E9F{itgVQF%Im1C*8?&@-!X}1^ghBN$)uQm16E43gV^a(=IBBq6tA-U zKFu9eoK#a4zYiazwX#eNqgo7n!wHVC9g?Ya(jAJTh3oeJH3BvfMn_$EF`^uQ!-4I8E)%XpkWZhl2RvkRQ*>+9jl&z@qhI zp<10E=nBygUg*KWG)76M3_%3&PHtX45Q^)Jc`79Dr@1V_OE0UI5VCyiMm{o0LDQ3^ zni$W)NptKieUT%VF?WI*{PeG!BaBRHNQ{C>QwSMq0#e?%dROF@*VwKO<%Fsv2 z-`eK^H^sOG_bM%!J*&WD6Gi>VMEKdkX{7NOggud5pJvZ;TE9k(d*HV`K2xs%J5G*2ySB_tG)wVoB z3@Nf18)2;5p%vh_CRcO-X8h2ok34I33+mb6#XmP2TQs{56w^to-E8eR+QpNIUgeE7 z(+gmC$X~Vv$DV`;*cr+1B$7 z@WD#DgHLwDZ{m*c)NF(&B2w}6eL%9R`J-FpgU7o(s?O|cw{m;byGd^;w18d|>}QI* z#ihl^y7wC6Mr(Zi;nD~rhiQdv5Z1)0CpdHE9hwVPD{%1T3bg3OV@Bz8GZP$JG@JMh z0yzD~38JdLRzg=G1#%cJ660O)9IhhPPX|cgrj8p#^ozzII-<1kmeiG}PUG*1Thu1u z=X1(0>le^Nw#ljXwN8%i5*TVC)T^3i3G(TkF+ie_8MHt3j&qFHmu33}zk--gQ>PW| zanBz6$Y$Qk_TRb36@nU^T3|dSmJn30E{A5MOx7ROrE^jTTcH(eSPR@}teG6N_m~Oj ziyHg~ig0ZT;cP^&S8DQphmQc)_|D|ScR=&iT6p8!*dYtIY^K*@mC%+7cS{~`E$3ui zK+u`@gIarP+l%4)Qfs5vogwy|vz0C?prQ1=$>E3Xz~^raBr>t~L5P1oDUjRP{D#L% 
zmM)aE|G0Eg?i$>9i+zV*v6q6k;J${d-?y?m@mEE0$~^hhea^!EUg54~#m@PnjS8;} zIgoDaUpnb@$2GxKOUDr8uuix+SHZwExgdB;fY#i@*5QBq1!l*oy`UxlXX4}*{0(P*3PVKOjo@d~YN?APriq5v zAc4yi1MhpAG+y;`EGHy6)?e~L6!#02u-Z%|&rD}=c^_gjTQ4Xl`!^+)C zl}T2s+GF1AeZJ5Dr;m4~Z92h~)G~XQgR*f~#tX?tmve{+JWwC3slOt7$&msE=I&{5 z;=S&qEO3Uri>ZP!!`x!e2eLY=WhL0D))lh3ujVJik$D~Vy^Q)LP`#j~OGQ;R|L)U> zg8NTKs5#Bj&d;d+83fZkbmg%(PXPi>FKb31(E97|i0bf5`h6+6oe?%-_j}y7GyS`~ z@3c}KsS))Q;Tpy&CoBjV#}tV^ZUrPRV6JTN^GI*cryk|%OBFXUDx6FFpQ+xeqno<; zX6S-f4L!Z&k*%$$N8|5^&aqaY;ek0V%1S-^y}m^hW#jSadKGQBglk~=Z`;Bhz9^KU zEZs^ZuubYHk+zYrgx;~D0~^d6A$E9z0{0sHP&xmKOZvVjDg`)C8Eaf?c$OnAcaMNc zi<|K8NPu|s7tHI>=w`2fx)5Ttthg?6b8Cu-@{z~>mZ;3%T5+$Hk+ir6(W#_wD(b8=O+A8+`Uw8}1>4k-5}+qW(A%QI$<=ECOzp5Fe2j zX^3c`NKA_SI+XT>X#;wx*;U?=yfsnHY=?jc;+s#8`5kzU-+3PaDGj?+SQ;6nfI4!s z)AhC`wO4P4y8Gtb5;k2_la7x%%7cZJ#dp4BP%ofX3jdKVjp@VD#GkQcJuLJGv3aPG zNG(Qe@hzM`V6xC&{eO9%&_i2isFqU#8)z(1c_ISc{=a9{I1}XRNH!2*w=zoIq?&5( zWi!#!RE$^TVh)Rcdl@r;m851qPpfh2oAk{W`;7AwH;?hS$5Qx~n>-PB;YdIKK-hz| zGuL=AFa@0DLuBWCGW9X4ZT&y~sc^!Frxv!|Rtw&#n605#K?fEXV>0$^4gd??Z4&u% z>Pus*+VuANDNjfjpAq8vU!B{){Wxf@Y1}O=9=Ypp@t|{|w-Sah3OniKr0tk+{P$h7 zjEH~2*rB>Bhy5yI-#n}l^l)l10pGnv4lSZVh87Anhsur!KE3uU-&L1IvcCd2?BewN zW0QzP$}v=OKBLf;;)rdr?sFL}qBNUn(fd&iz0rv`t&3K z@g=~H*>~{*>TS=JWMI8n`l#@0V$RSQd0VHLbsZ+06Wa~*-R{QozMbnT_`AJ|4&4M% zij#{!Gx1gLvzJGZr7wsl#}7(pB)l^7X&c<#NI@r2(nqw&ZRs)7fm@wFYCz(nggh|u z@V}3sT?%>zg^jIA2tU;&M!bWlMCieBw)*;q4F(EiD#rO(`W5Hdn!FA(^}0hNx&~SB zLYc-wmv7pgd;Jxl=435V8^w@Spt^p*r78b$88Ox^*N4a2>HZ@Hk2sxHBBjkJ2GV>u z@u?N~i~TI|)#wcCpudB)h>`o4p=tZ~qcMNGOt8y1T+>F3o~!(eMcsMIGB2>~qc&1rVxef* zb9>_t<^?`a3a|D4N&U+He5E+I=lsKuW;j?7hKthcW!TPd_izRfNBZ%JS)aef}X4LHsVrdjhY$qZ_anfKM15yf9h&F!(W|Hiul#=3EXagMqfd?e#Ik zM5w)%@sO-9i)#kC<=6}LK`ve%LlZDH)xrCi-&+}#Z&aK!TnO!@WQteQm86=V5kf@~ zR0`=TPi~X=J&^o$haYumRrfbI=%lV-NVn84au{&Gt5{41N@?@3Ww-lmL#vGyBG4^2s_3BmRA!aq0Sbt?FS7KV=F-!AvVfg>cts_7#&s_3UF5cFl%!ukf72 zr4As+tP2)I%|UM9*O1!Cf6i`1>UTDr_L92J(t7ZUJ54?oNTySSgKIU_@oOj5{9dZ+xzyxi4Sn}mmF8P>Sc33> 
zd;2fXbgCPrs^Pd12r3HXJ{SH{7qzOrJT3?d`LH7jLjl58hQiUHS&OF(jeQ_@637!d zKP)8aI9efk9sjp&a;=XqHcIx80TCf*4)h@WFK<|Gl?>XbTkMxD3X)yGUX3#xxm8U+5x&)S%s?@8#PTgUH-4Nn;n?IZkhFeO|dwp*-$u1eDxS39?-%U8Y zvR4(j33Zi9J?l%PkF5V8=cX%2Y|CpernyF0aZqLWB)t zE!owi4~YH+pv&(uc?5%HEe*61omotl^{EzkJuH|2X$WDDE6Ge?zN#H{R)E_wM0TO2 zWdjrxB^%W}IJ6kMWw>X0rG{B=-07yLRnlrwx$e`~SnvWwlm3QG%9D~WhWXK@*nyhB zr~+>167IeBPSAM-)sh)cKM=?VE{=rtnp&;z{JHFC+ypN}#oiZu^h*JldARRnzDX6J z*t(6v+!x5-+dukIEOv|C0;ptLx(iLW?i+7KtEO@!_aatpVqbDvt!-Hp+%@Cf=-&>)s0)i?-~rg5 z%pWcHdQ*idPBaak&O!~b9Bq|gDbJ|Ab{GLyHpdvSFJ z(Dv~rTBZV#^>)vr1~h<1pSR4{fhjMS2GuymY%t{8T7rCsiG$OB(1-~H{+t+kCA0$M zy<@EuFliPfO#-E6fsPbcZXIx{WF4{>@EAEvYbu{#(MnO^+vhM#bbg)yT?hr9mnK0- zAm)#m4IwD=!Anwu6s!C`n3@O&7!JH$-#22^?Gx)aJ2LTLuk6L94_Y!5ySG12vsNdV zvF9Oj+o`3MAZV(cMAUQR)Np)fSH4|V>?dbV$-keH*_Jz!4ad{~r0H^J3&!HC!}v<= zwJ#-vS(c&6Z zo&=okS+iTEPjbUL^Gkk~8GbDhTU}NI2&N zFG{+?h~>6y?x0+s-S3^9&t~f!#!Oq_9T$X{6=^Vy6`4s)q<2W*j2QK(#l*KIEF%(h zMnU+elu9doDvLf7WxWQK=m4p(_j@MBRAwlMb0d;11=qs?IBDH+z-ehiRASD2=)7cSjyD$Y_d(S z1GgLqc~}F%!zuF{9rQ+9GJq+D(ZxLjZ1}~Ypp@rQl#;&)pRiH+n}gGU0Hm`Fcd-|s zPDaEElax(r3*XA(%^lJn%&>@lns_-6J>)te{3W29sv4>kP8y*Dkb7_HJIq~VJ)dsF zmzS2h{*jm833RXY4IvvjAw z;XS%!m(d@bAA^YqyqDY>I{(?nJGUhBEmF7FhtP;*kqfp~arP9>)8ArZ!w8g+nA-PD z-Su2%R1i$(@f~B?q_i^^vr|S!CdcwTWFqg=6?AfWo^d{tr~rm$Cr6+_knrE1o%~Ye z4-MnOsVRP{5r0$Fm}}ct^-<3ex5akqMHxEKW36PTw$NesxRuxMxds+e&ephHTfeYT zspG=oE)VGugFO{c9RmhD?aqNWG|2($xgFs4mN-%PkZj_P?f2E>Gr~(Nx1etHFM*-+ zQzd(!vjRZ)Y+xME+6gzjT%N z0j2k=;RHAv7oe6tH5j>2mB9ea-&jNDK|@g;>AJYOr`a0h?;fTZbmSw7>tycyF+|y$ zeaJ1LI~dixQD`5&O6iQx-W=Q9au=PVWr1H_O>Ku{iQl!{xF#H?d-i^iL6a*==vz<@ zzfjs-_|W`u^W$g3flmC$&t8jgV7C}Vw1FE+u@AtN!@#k^}kxX=@erKaI~O61i7kAMFJy~KnN4RIXL z`llOvOe^NlZD=Z$c}QGK>wrIkv$UHvyB?8k9zgJ24)dYS*` zLqbo(47Ie6?<9PzPi;niho-*<;rU}Y-6B0Y$fYwOZWuiMv9sF`C)tvfs@?Lf0YI*# zldni}oB+?`DYm}Q3cjxZ*{G9aJ?15LBl+>_RjS2LnTgHq8T@~MpyR$VsN;dG5h+mLZ7*PJ;F0>Jd=S1@D9(uzw?u6ld zXjbZeGxgA}0O-l`zsNWJO`^sn<3`q{$CT}ZJ{U9Zcr(R>z51xJ{;DEp+9`vvvus<%UU2 zDf%ITBb=YG4Qxh40qE#3O6xp=fp7tW0BW 
zbZ*inxKLBJ>5##65MqlTAG|YI`CyvEHLsOxva9HWbRCV@{(0RPNV1*{ZxIBe6O2 z=-jnodj-j#=)LhnFm8Ta=5CYX-iXk)_waas8UFdhjTv%Bd6vLH^o%Pxen^0)RJFvG zf(y!`lI$P66x0(_ZcUhQdrtZE*ziHonclP`k*cnd)+GmjLb#1MGz8^Obm1i%zT6v& zG(`PAC;^cM7DMb5{^8?LeO61CPuX2afK`9xn6B z#p5puXn(C)x4UZ~Pv{NQs$S|kfmR>z+nScC#B&L-aSVG(#88fKOU+?5m zl699hOLr_NA`vH&if7RE)m2xry7kiaIdAx6=hBM}SEJ&-D`~vC7i8cbP+d%Rauhw{ zJ3k!H0!r&QisbWvYD-ZEg&w5Eb}fVr^NGV@3~b2KPZ6TDw91BFG3q2!F!CK6K?j{H=z&2a6jY{wDzCDe1*wS7b} zl1O+V?nL>5%t8L5>5>x35Z2ghCHroA`sRYn<7jhWX`j!J@Zv6Cs-7s^vuV7&g zU0-sf2>NHK7Dy}MG-iC8Za}F2UZ2t!sSJ)2%cC!25!3RRCmCr%{yhCJ>=1E_AsHHX zN=y)|QSOZ3?*$go_TkNEYPUkI%$a^54}iq&bC$vI5p!4@GrpVck}cUPGK2|4kIjGR%XASB}J2 zkR)j%!X1p8e);QQ;bpB5Tw9|c_O{#xu&4OZEymQt3drV^^N3vr)c6_oXq+ul&vF{d z-(0_UmX@&EzHap(bWxNEmR*`?4>Cpb)qJoEyQ2-t8OljA_%~ZDe;ujlEs)ynT*3~h z#NzJHX83i3yboyRnl9vSl&H@By?+Z9sjbmq6Dez+-Z8&P8dXc?(J zas)$-%tPQIIJ z<^5v`@R`g%)9%ZsF;_a!&(>&kb1>=)x~%D`t8xdllp(`!o!PdJhoebwUg2TewPmN+ zGIz3>`*Id1{ACoiC1ZK*Wp#E1glZ#mT!oY~MW{estRf|RH&)?YeZGaflunu8>be&e z$4ao(9)HI>OAp6U2fWZ?zv5fPQELF|v}v!52BsrhJsCXl1x#+r3n?Ut?XQ=oLDfk@ zMu37#zR|HDqFC1FOgUlcGGG2=8bOB|m)@F=H>bLzjZq(+O(6J48Lhw_&^Z%j3JM6E z%j4vD6Krwx)1C6#-+|-;)K36Tw`UyrS89#&4z-&K!xUVyEt{31bm05vDMWCQd8hNl z8S(Od(q_)ghO@1~|Afsw)$dCNOk~BeBz;ElmgUH)8Atm$5@l8<1vr~8rLtl=8ok~a z0E5uq6&@H}{KtN@(+uJdk$R?tFEU^F^!5UC2Lc46PZ~ul+z_*@Ogv{Z+KWxm_ z!;alJRRe0vDi93xu-3Co{VepHNR=H#QKLWUeIJo%VR514S{?LC=kT?`NJ%2!+(z|! 
z^22EbNUr4SA4Et`p4R{~K+L}-XM3*E$F8k@^(6YUy6{>>db2+Rna#8Uvla8yNYg7J zcAG&T8j{}e2wvnFTYUg!V`AaQ+s&of%5ETKdW5{yPytB^H5O89&M~L%nS*=LIQdQ)1d(sjUu_(ziaqaZrPnRM+{4SjOAIzpCMLtu=LScvp*I z&0al&gT<*CKYuJ8H)^RJztH9UfRu$Af&sCRSFpCXR0_-J++ej0AqwZl?jy`9>`3Bz zGF%N|GqdOX+9O||PfRGHaM&sRSrT0&fbjd_9Wf9zF#;OI!4;wg_@ZDXJutzJx@gEs z*ma?UJM@#43=si=`J_154q|o(2~=$x%|(f)EHRmx#UHu8KP9?iX<4fH7-M01S+~_B z1@rcpsfodquyOuav5ZVw_e+q;sN=*j zu_OibW*2UAM@P((LT_y?#+a(L7v$4{NaH!uOnxDJ+7)@zFq&k5?nF-E0KxxSX1eQ# z(nPH1(tHk2)6-;*UCELs03 z%K(zoXLu+(x#E zQ|xo_j)KPLZnu-Ds|uS(I&ey9%iLLzX(&6)chP{8EP!e}xb!dR<$IDY@nMqvj)TXC zk;EG-{rD)7R}fUCCX16TLg>u8o~;P>9ZMkeB1T-T%kPV{A@SN0A)(NoA$W$ERRFjvsEkD-Pl(YCxR zuDiKF&W`CnQTlk=W~u*gl~aig@pz9F37oT+<6xQ;32H@n_Q#!jq%Fw#ir4*;Xz13$ zImUa?TUbd2RP>Sq)1I^AU!$g_*Oi$EV>0+wUR_glo&r&60)8_Dn&B?6$L*w=rsd?0tPw;B4 z*otqRiC9K+G!v(g>T;n*qGh>*E-2b~x7+!}5EFYa}Wv%AY$>4jYZ5 zGV4zcS7YqUJ2D`*5#>V%MW`d)ZAwglg^3b+E zG4?!1^I_TG+C(L`>%Z%1J$YKlp9=4gBtm^fZ-S^dpDCd3PDW^gJn2n6z=;c?hxUC% zK(_F$+9($qdp$=}JJm-+A%G(mBt+0f@6P$D`F@_Ge#F$pgO^_ry*$r0P*TxU?Rafh zzhtN;LVjEP`xUht2x^3;sJ=&HbYn{0eDw9IWkNi@)3lh!COy60F=N0zeNYbj^bi(2 zQzMQ*o1PZpW~aR0R3WkM@+n~M=|k{_ zTM8#}E+rTXjiUR1N_6h~Yo|!${1Yk5ZkC&=QpM3^JTn5vnluLN2D9wByW8`OlUMX4 zkXa!{aAjslo+{0rog9eXTn5u!BXcWg_eBLQDd=-;Z|bI{lqeXqlmJQ4Hln?tLUg!p`!PTB~me6wIq=M z^TF~>&o{sBmq_PUL4Dg`A}~X$VuJlJ77;Rbrov*R81Ky{URQ=3*#G!WMbMul_t?jX z_-K_IS){G^Po~7K7eeH(Q>*+-s6bToucyfp*K)w3WOWscFm41}+tplbbM!Ihe2{K0 z>V}5Dy{SOJv1zbfl48NAy@F3UUa9lIp&UM!LzpsRt2iU&qv+j&+f3*2L9z2)NZv(N zL*goNRRQd-SMnQ0?Goq9i@X!lHFnNn}Qv=zK`sG@DiC) z{%W);1Yd#mP*8|%;}hygauIn62~G9p8)LC!1GgQ>9ehv?HtSdNBTN|KS;mOYn;qOf zGE;PD#SKV1@X@SE7>MtHUy zM@S=;q<6W=n9HS7S4|{?zyG%mdaMC=W?cLuXj}bGc>Iy8i0j}4$9v0i1X+L&Qxh_8 z_F_IE@9pflx1sx`(cs1uw@VTMD6ThRcdodjg{mlW@6@s03%tzz7c3bdI*H9TwOW)Q zPBjo>QXMbTwz6GjtI=unelp8r)6OMt0&BBlpWszG2Yz%7P((}L9kSOWoHw0pP z%3Mw80s8C90xck}=3XCS3Hqv;s$Bg&y`pF8fec!XJo`gNtthB63BS2dHWnq0K>J@jLahHg`yPW>-&QQLyaFeq z>riuryH4j??AE0Ak}&)#>&0tDb-Tq{K7>U)?Nh8(THFlsE*4!9=E4RDt);e+9L1xS{^%2*|>-4p^uq)5n7Kj%c 
zxgz@JZ%WV0KWQLQk4wpa%-OX7fk~uUFRwX7iK3_i>n3~$m$wv(jCcrMSwKe-h{(KR zXv!SE=@+%MoR_R6AGTgE)O z=Gr*ZO~O7^@GG@~sNd&|jD{5z9p0}4$kwmgSVaf{C9KYoL-}vMF;}#-9NzG?)Id(G z{eJ@P1jlFv&}wQg#n$6J?_?7cYz+|{dPuQG($~Ovv2{4!W0+YbX5`IT1LuoQFyPZFh-#9p>{bc{6_#`;kUeMx zpO3Nm>O{$n3nMX8l1vsu`Rd4UPqiyM>7~MNi2ZSQt_zVNe(HSWo+OjUWw!p=tTSJ% zQjZp~uU{S6Y(Gj1)Alrfuwe!9X;Nt;xED%D+Er^H?QlHnCpo+1D`ZAFKV;gNr4^B} zb1K_cY~;;EJ{}2O($I`e?%b65;a`}8vlym>hs5GXFKx|%4_mYDGmg57_1rbQT^@=- zqJ!3be!KqikBBWTwjM&81ClT(Bhf@P2FGBQ3Rho%Xs|8C2kMSBXP>F20$Mt4HqP9I zvDd5SG%Gsa zvcLObDBdspXiWyjsu0h<(L+Z&I_I_pRz?nPv(e{tutK5ZpHh63f&0Zb<-I{eg1ha* zg|IU&1+x2N839Xve!x-l*|q3`U{2{Mg1fZZ(}jaK`bP%B;fEf2Ls#uhLpAMQUUux3~?YV7eO!!m*(VjV$HRSd=MGf$1=|w zE0nq;k!UGq$?uI;D&@W?Y!?U@jz)!>+FmlTlE}Dohi6fcD{DQ@(i=7}MQf(9xTpn3 z;_p?QC#>TgHo?413i1u2pxm?gstOJlT=9^Rg@czHpP*OwpsjepfNJcWb8Id1X{fey zR-eb&P|2pn3U`6kq&!a@BFHpNbz;9A%bwbNg90(9f#*kIB79sE7*mr=htINrVu|(q zF`$*r?}bAI!g#bU@v>@(&m1**^wKM@G|t{~c?N1T6Atx>Qt$#U1c{2b2z6m-I9t8q z_<5r)$zQy<-4u-)PA$v~^8w7&nwqJJS39Bs2Y4)N(sN-FG<7X`s)d&{{9B$tb9j|M z`-Qg-M>wBy=*x{LehgZ6-^8l`b|{BuXqs~+LPn2F%@k{+bCm`G&oL+PmTQ0@wq2Fd zVgN{(xknN-nK8L6(N*7v5?`nYis@|DOFnCtclZg(JvfmGpEfYCH{E8Hp8dofkdBX; z_KJ*xrO=zLCRQ<{J`p+hW0sDYEW-G}z`EY{%R2Z6(1q$5evR>b5AuMq#P^P3a4kJ$ z^i*f2NeWEj-*ni$RA$VmiGExIXEOp4_E7<%%FV;f=?@5FJ-noe1D=5fzF7}{T}bAP zaxf`fBrT~EdU9ftNpi#VA%rkPU?ggth%>1K7)+3$8}!axU&ymdJ3mqt8k4Q>cc}T^ z(h+*&f?98SRUtfY-^Rs*A|N!G^?v(atDT&)IUqaD;9g;Nd~@h*?uWati{VbaQ&5GH zus`~$Z4@I^lIhRsR2zr{Iv66h?pAQjo^6S%ZxvMlF7-DFi`P5{a){X zfab)$_DVvBo!3KHeF6xhW{Uxg#Vg(bL`HkEOo`i4e$KH-k46h-KiT*Ul2>K8ZC5wun68}(h|5kOhulSQ&u1|T>uFEC3Qbx09Ev1{4z zO(h0)_;tOAAC>v*uFVxYp!e)`;b%Ch>d}~(X`4q|Naa_NMh~h#6U15#TJHgG3PE$CgWt8bZv}ifcydshCBa>Y+>&0^NjK|2jFQX zdy96c9I#y2-M3-|tlwFy*OwXj-VKws?@lJG$V0wX9w0j84O@CM2&>~;+GP2{F7w+bXg_^vF-qMt>Gd2}gn0 za~=`g`g3QAsdE(mv%aKm+NAi)wVZm60<^Ar1jcZYd}QLr_1OubNmLt4s-nJVOx}~m zLJ;rT88>Fdw$VS$g4l1xYo=_-+Vgs7eLqnIZr3@=2^RFASY!|Qp?jh>OGyan@qE@D z0tjVhpnYGWa^VWbQnNl$F@>!c6z2bZXZqm+p01Xn{4We#({9 
z!a3KVz)!#jL!Tn)~9`HYrgs0?k8F>P=29NMkd@Z7rRZ=%tN!kK=sDTk~ z7Ea3Ro%hy$SoF0J%jS@r77ys-i+CqH7_VAi1c*ala!xU{p+9Wk!#DkmxL= z46SQC1m`N_lno)#HT!4BXMT_qjwfu89abB?6L?Z62+k&Xk@mmkW)iOu&!1~BFyyBs zZ+s6W)do~$L1s+Hb%c6OhR8s?wG+2;KG{JS^e%7{Wd!jbDW;2){I3eYbD``z`FVEB$OJ z@$>R&RNPK76qTCCWVpKT>NDVW&eGcm*rYYUdeJjp9lF0|wI+kH6eWx%@s9oD2~*=s zIc369cc@`1Q4Qx;lG6J+wgpqFH3w)SL(H|1)$17Lk1XIGBz^)sqy_S7=#vSCN9Sjy z$=F-~)oNMf(jOD_OXUt5fizUeRfQAAIZ zts&s6XDt~kXp)%PyiOCAmUovyf_E;$jW$B3S7kr23~)`&-=l}Wt0PXpzE16;;R5xP zQ0c*Q`W$~;i@#OOXq&1|_fKK$(CorA%F&2PCsC<|3UOh=dHeLadI9KTN@u{PH5Iu9 z^7!095oPe4$JyIqKrgGI&$wlbt45x<=?`My~Mn_ZN(Bu2rKn zp6%)r)FsiHG4MxyyL=`)dG~vgqtdVfMnDLprS(!K#1iwe?zfGJR=6w(VSePvU%=dE z=z*ei$wkXIIZcaGFP2&mdcxI?%A8D}OdKBEkupv(_&M`mMf#MvTS}wdtOWgRtBWt| z`eCT5r3Fx0Wf55bT`~v|p>rd~1ba=%mATc7qFI0@NMZx(P7dwr4TD9Pq)<3W0NA+R zy=m;{){=V1g_78zCiTAMz!I6Wd=(67jdcyy52l~@ar-UFJMruJi0w&wr&=6Kom6Ai zo?yNXrt`wdwCtj?=n#cjxBEQi44j=WiCA(HvO-Yei9=_H=BD;cI}c$PT#H53E++b> zF~kF*ef7b&laWj&#j6v(z^w?dk`Oog6a~EX?r>Zm98YAi8kg%*%Kd3Dp4(^PXPY&8 zbn{81M$3Xy^6!ZzIUlKvfwd>ap5OsLwVSk+~r*6w*HH=GW%< zx?Hm@uZ%Dx5)t9n@8Smr$VVNmr~$XHm) zcV&4Toym!=UPJoT!Rr47W9ofga!E}4mf9PbFugxu^uDk^XxEU}xznedbm2ylP@PD> zfz($6u|?Q(^qy=h&0>(=lKUdXY^5&GQj=?_y&)&HK+H{`u2Z7xBLWO9ru7w%Cgbfv z|ESM6<k3ncZKOSvZ{kgTl{m2BJK;;G32kWP>b z8WOo~t)l065<4GG-#EkzTpyi4=wj8g_#BCj9fM?^QQ3cVzc2Rufso<@g`2VFqu8~C zlR{>*X0=j33*6Oqx#BtjFQ|u~IA&^O*QeGfC^h%Y5! 
z#RehhlN%U{g|vNXmqIIj)25*9&8`%~R08DL;*CQEJ6=5@er~#NS{P5Dx<*xch|e7p z_iRjjvn#%0JCm~ktA54O`%o9_p0LnS!rK^$vR%In#^cuFk#TNi67^;0oTIAc@4iBz zc&gCQEle2F;MQ!@RX^$F1OXG}bd#~7Ge3nLRnHO$*-S&VX`wF}b?FR6X4IH318Bi} zJR9-E*iA`sr2&uEzRgZs{E*Bm1!hrF>xp=DvWJYfcPg!3+4o;0J~ zc|LK$uEi@m0E=5jRu`kIA&_A#vK9O!=sonIxj_~XxeEG+e!mxkmnNWukb0e5p1z|d zz%Ww8MH{@{n^3b%2dfds?2OCo$9vpSC&n-6L=V$$#`K~z!WwU)iJH$EajqGt(j1HNVq{lPEYt|Y+y((N9J@{;`i5hx z7-La){!4c~0#8+nN*ndrLW+;QBmi1Gy^kA-L*r(8KQ9=&F-E;hJjeN*oc_Xjk`OyP zjqZ4}2h;J;P~eL=a!o^)!9N5B&l*n4p^gJ>NnAj6ifwPcGji5QkLpZ<9zI!}Yx0az zL|Y%1Lke(#FsxB~D!M^U(wM9w!5$d(W&rR~9BP~O;I#)o_Is|g#|Wk&T(vmp22STI zAR(q?Ba-IFM%`m24}p+!b=TJ}gSRf~Iz#$N=jXswm@pmUNh3EkH6Ljwxcc3jv=@Oz zTO*8z&uusu+&WzV>N*EO{ji0QqZAO4z9Z56u`VT0nb-}7A}Sz%&!7begYIk6;A9o{ z+#Gx&qSJn6?X)umOi2~pGHLXMglnoM)FwF27|I8pmR}sHXV!%qd&-i&+pavN#mKlF zyg%=84kZc+mN_AwJSg#EQ5G{E*?o+AeTl8Np@=zK-srQ8bTogT?B`l(&`{>|x{&1crjVr{Ub#)p$NageZ5Upmut3vs#SQ z7nW_MjOqZNXyI~&z2qIXmb~5KU{}ym<;&l*>FZ(dE@|>)VyetoJGYZS3-bxyynyyk z)c%pHNK8fd5x?%z5orTi@+AfR7iWEJ6XJ3_bII5+TRj?liMF-fEN|yriQkO_Ul%)C z|IRiCO%e+R!8{V^-urqBomuTg^WpKWl=9NVH2B@oa(%Q=+uTsKkf!^FlJaM5CQ<3N z5<>OcxkNlvZ>JwyDLlAl9Pf#mp5ZdRcUrDQ^Nx~8bz2g}H28GD_XzHya#C2GGX;^w ze6@FJQ4KDh?1P;>Fn-skS>+-Teh4?j<(YYHsFRE>>Shyhn9_f}3yM`8e|IP{x#`%d z&`Mfr(5(OPQ4Qv46e~G1>qYV9caDZN8#n$ljUz`p?LFM-da>G<(nFJzspz5w&5A6}NHs530~Su+?#vTGs?>Y3+;u)^rnQTEso+s^YO;Q&IMZ z<^Oh}1_uB?7*t86W60d(Eqh9A?l>OsK&)xuiF1BeM&c%~eL>re818MLC#tS2kL`Pv zL-Jp%S*k|dlk9QuMQhfY2Rg&JBbz~y6YN>iU)>vgfKHZ!hUr)eq% zy)CK-S4DIm_-ddLqm-?XMq5D+iCS)8hhqOHMjViDtS*A#%3g%mZ13zhmp-P62^R@k zk1<3+tW+>CTO{4(2~}_Qc`76cn`9<5&x5y1%rluDl+a+oIiIV3GTThT?)Dwqj(wDP z)v~TyneCil$Rg1myd{K-8ST}1LBm6w)RX0DdZ66R#<}4*Zz19m!nVslc?i`lQ>j*P zG>2T4Mi7C+2u^A{2cZ9y0%*bmMFZQ2xBSkkiu0zM!f43myEjK3=|=X4rwY+|?ZJfG zMU@ZU&XiK?I8|I@@!?w5`?6Mml~tLjW;(&Bh-(Sy%xlA{ql0L+GO-nANOP(Bfh!{m zZ&Kn89nI0J}CGALq{jp&g;3?4qz_F*&f@7jv&`+XX#^UnNxHrB-l?%O*A zT$sdJP_t6mBbJ2QtQTGVQto2qJaakiXPe#<5@1B6cS#C=5GA_b+ce0QKfR8113t8E-F4&`kp zg+yX*M#93jH7Qj*Muj|x`A{%p)t267KBhtX8=90Z8(duuu 
zzY-+(st38grT2fvu;*+_ZjB-5MOZRaRL6NrU(~PTKch)WSza8!1b60pz#tA&HJCn) z!aW<)Q(_Dd${_|sGTE?ScYRDpdr%81&{J6y?@^@7*97k$Nb^)ZK&t3TO@a`=!A2!T z`}sjnvcgS$*XT3qYD6`HQT_vKIed7-=qxU%?5K}_@0}2_!H^1X`dXdt#;2Oh#sN#| zAlq=A%mu*^Ea#B5$@z_=`wu zvPAha`amF}<=)!M&2#H?s7^2`N}qZseshWIB3)Fa24nLQ-Y)hN50N9_H_I}Q-wfa?cpBofD(5QK;?`-v&vUNWtE*>?^#JvJe^Ytfb z5PCL%sC+j7LSr~+8pp?b1w7AVS|h>rk>ehh_hidnD*1;ko_{K3rbSfQRb+Z9GXW#bNtt3}8Z<=0rycG(!(Uz2en& z%~IgT=64F(_d~zdObHHA&?spXDKI})9V}@=?*POS%2(4|e-sbe8M%8SzAcW^ ztDsO_hSFBQ>2#y@9>V$(QUg*RN=vCuZt0XW$D{~@163TXnD$CCzgeDg2=(Dz%j#~k z-I*NbRg(=1-`L|8&YeKw&9}HSs8&_|$Oo{De{DvaEiqb;^ga764*Qh@wI9hegf3*u zAfQc8R5vO8q4`d@!j#hJFv=JZj5!KC`~z_g$|=z6YVZz~sa5;9V5NkvDmA8LX-O5H zs_de}TYX6UpsNa;C9{YX<+-3oEQl`OY#XXp#==EQ1Y*>D&}C+80fWf0CO7Xjg*d5O zkcsLnMd*slu#y)zf1}nzu>_N(I-<%_dyCbWjTQ2jWYJE7BNO+NiHRqoFWz(~!!kH0 z(*0yR4+N7jr7^6fX)|0Z7R}c;pLB`cf_i!h`ujV!P8sVgx4zTzGv!{qCx^tCZqci} zSU!|_|D{y->}}#) z(|uhg-SIdXRA&Y>j+EIIxoGEEMO<-wGesmQ;7K`#jh&~NPhfZC+uTeWs#s2zotH^| zvJU1;_jK7eoVch{yQ5S(b%p8#zsEXrvv@I#f#3x~v#c&MD^DEQ+*a ziuu|w-To84PP|l+L==p#AeFt5{hGZ-^=~OqBh6cD{z*b%^SveMV#Ifli9tJb-S;LK zCF5cI_INHT2I2_IqpN!R zmuvnee6@-+;n;`=cPnjp?2NJiz%~zrD+^uP831IL0JOE3%GpuNN zp4@@KQe5=ko-@}_eoe;Pt^rM5;o~%VO9lU+2@@k+3?H3KLU%FxlI0K>tmPCXvpQlB zi0cixcAo3V7ZzdetG|dSYXfo0gs)+TUx0RQy-h~cK=7u$n5!SzjB{eqFWZNo@@*g{fA8`c^TL-i+j&*ioXQUwy^FSn2p@dG^PR#3! zLAkt$DWW*f9Y06As~rwi4lDf&N9#)5Nxt-7aLqtupzt2nulIG1&&jgM^JD=hWraPP z1-lh8QTqYQS%Nau{=Ob|;EvbTbIbv04>d-6kk$EYv9A03_MXco2bldIqI7GGkh}!y zJL5ZZ#5K7|O4LxpauK(k;sjo?MWo8PZ&39Y$31Cs$RDuh5vRY)icP>na^iD7ftmKZVz%HFnwR64M;WnbZk zkZtyv#H1}0-EOip7r>z-_k=5D=o(RN8|AF`NZ5}(h&sJ7R~ypbXqiNw1H=FBkC4W? 
z0|VLYF9~fS4N2iJfX(syR~QBwb{gAOkzk1kHm0gnb=BX?uVNX52OojfB`ZlAQJ3BQ zdt#W5W6p=*O&*P(57@-NJa2zs;^3K6xM8fw$_=_!9eS%aP|L~UC{S~ehiH_!Fr2fP zdQYscMi`&%t9;Zo?aRb6eN3&d_a7+P;+?v!~-H0Is)5<>B*^ zLxWusf|?;kq~HLyOcoP;_sj*MbHADVt#0z^BlBV*ja-zo1@ZB;>#P2h@8Pf*ajh54 zToAWCCw=TPLXg9=*xFrx1R>Bmp+h9Z; zSm3(v$cnZJd#eLOZcHHF!#WYY;XLvY7RDxE)rdhBhR5R=4m9=pJ%?)V0=aK|zm{yI zSyQxelTQPZKYV01-=29H8Dp>EeyaXj+YX2E1@3Jhq~*(Ky^swIFGsX~ z`B{0FyI%%?DaPo|h{88d0wPQw3ISLKE=4XriQ>E!hn~vNA0CY)G1uZB!=0-x*j@M$ zJ}W0a6*4MhuYq`#D@GdKuU7ydd+sW}3*W1Jj&Hzlcy*bPxN4Za=9{^xlXai1QJEew z%+-iWEu*pQ1Z@zhM@ZT}DMsS@7e3Go^7Z@?h{*37C<8%@b!B}1f`P}>zyF(%&4*bO zJ*U&Rb{BK*WR#(9tkc}ABQGqQG_}d|EId-y4H~Cq1)}|k{~^uw7S2~r+Whk_ibJC0F8_l z60LOn`;Y*YCz^-NS#l>X1FpXXC)E|h9*#KkI@J-k)(5Btz?08TIq^HlevqIa*Dh(U zw!yT{6WTaWmzbyRYP5k`wRI%!!qQG>WFS}X#iWj^yd~dNUKb3!=2*Hw>rJDz8Hcm& zoQz0F)cQa$ud}JOgJVY8OY>hxm`s7zK{6Lund$Aw9IDT{+U^X90!>TT<5VSvTdqgz z4pPtYrxte&jflW!hAy7KjI-rvvp9xq5Sz;$D77?On+b+qUy2(;?=))6ceG6eN^^Zj zzvK(Hdkx;2kl;z=Xq26FFtkIL3JK3<{%rvrU{SgH-fm6@+z&@co36^?IKjUZpl?U1 z=Siwpft(Abah(RWtxcQgKgMpyutv%rZw!5wYzt{oqVfz>{n@-EBuNUqVa>wq@OG`J zRYScQfpV5J4Q&5eD);SsMhwEb1p73YggH45Lx>N)ML=d*b%Nn=>$IK2M)Z>>AE`li zVq^%nfPK5si6u&K=P0$vGNpCTiEl4%Z&VTuEh+@*bMFFoGe>4Qh68p!q(zB+X9d7E;dI_K z_oe~Lk#bU*n9v^zr{HNi=HVXsi%ABJ>q?DS&E(&(hjLR1RwMPkqm z>und+*eLk91=KfSeC9Wu4~`Dd8GRG*Au+70115c+pYK0!lNko(MMGLRL+dOtgQQq~ zGaBh{`zpHfS{ng!8#41&b4~#B91JY$(6up(zpV;nwX`zUFhy|q=2srpNdJ*V-`kW_@2 z?l&+dGvPR6PQS62MLb6|{Q$e@#AiFl3+nd~ykMd#LCRv6c&h+0u_4NMrL5R76q9lb zkeSc9V)p^qDW6iyi)hxst9<mF`to6?gANY3;^4W11USun z`kbZSO~8XJx?Bi|{da1&JqruG$&#Nppu{&HXcxeRmO8!Wo@<#E=Ka^i z=axSbRo=eIO{30tdsgK61t9HWdJYA26MJa0q4 zP{5zBfusqzmx@v9K&}GX0CMgqiL$!G4^EGR<{E2GnTaG7f%PTJYAkCB!VoOOqyva= zy&cgFrI`{R`rn1(e;HnReok+0X^#4^1v47uqTk#b3+5?^O`n<=8tju-LzvhM@8u6oll==HBf{y5lvTgVb3yb>c&aE?dv zNXx%1bU~Mma8s&q_HGJiQaO>OG{og270nT>w5f`#?fMSNy#mmvR|X<}e-47r+N&ZY&n zZ4n1`5j1KfJ9!31CG3ZC&xhTE)RB6m3|ZYQ*M2T&vp-!((KnAASpci!G0SasLQ}njCT?92~bUA_ah2sB>&?Owd 
z-|#OS`AJSwrkWu91HNx|${l;_ZIYRxxKp7r&yi&vUf`5Qwa*dfjbt$PRLZoDGkhah z(G;=IS-RlwX0S*klcQ7ptt5jJxM_JDS@M1X*p%O#ODfqu*R4-Fv^!X;zG6+macwG4 zHb<0J2Wbj7y8=M-r75`gT@l@&@b|n;(>$R zUW==(nl)&v^B*;&^>3E^4S<@qic*Fs~5ce-S24 z`A;vyTK}h{D6CX0loR!VX@zJRs*ar0$PF)W>h2U(e!-@C!MC2;);}@=&kB(Jf2Q zD=I*ZYpDd=3(=BB8Lk4z+PhwdoJ0@n`NBfW7NXWuZLHeKG1&>M&`?}??U_x9X|g6Y zksbIU$Hp`^1|QZoIMX3Cj{|2lbgol@;+k1T2osuJ4p&hf#&l{3c&JMp=kkutUzzX~ z1RUNm0@pHCeK5{ON2<|OklbJn^m3>*mkNXIWv@|F3V1&fn6+W(R%}I4 zNKftGknw}2QEGL_2Vqnc0{On_2q zBW}x_sr`^h)(=DepxSXi98BCyw8na2xpV9gu zR<{=E>r5zD9X((xtuRDuq@x=6)Occ;`2N?yUtqYQA`$-rU3r6QZIZj%#EF&@G&*Se zu7P^*jUVgtN@>z+m4u^cFYt0yHd^LR^Ybn3Ccjz6oD+WQDMJ3M0NVSkWase>1zfst z{P%a=Hi(Kqlot|d%|vCHC6)p-=JCew;_%}Luy_O1kzmCeN(w~w3kghWb6eF3gwL;; z@brob)%sY(LUspZ-TkKf&6G9a3gh&wmf#qvgaaGDYe)d8?6!QRW)5)7IN9T-FAMDW zDdha}mAjYr0=wu-3JitNvV45`h*=ZqcE9W!?3E0C@O_jG)VXd1fHr0cPIl5R&TQVb zCrvsN7Ex5}p|CylB7p}o zWO0LQW>KRX$zWY;uM_~1X=@Shf4fQFhA=?*HXYa#o-PjSsR!rxtHhYssZB$G_Qae* z8`gFi%L(l%bJxPtvr;H80}TmD{Z(c;OKT5ru@|EpoPs-Yxaa4QmDiB2Sd~7x#2Vxw zSmEc8XJh`P-h{z_aDv%hGY|D!Y6fwFxZ6gz^KUV)jHo1bc;k?ZgMBjHl*rkpK&>!; zKO4=G_#ZP@HUMlNZ%GHCYtY|*PTmh?T7p%PQ*9aiTyH3=mU8dX*010nnIBAu8vP@C z?t6cq4w)1>g(k;oe{tFAom`cijrf>%H9)%9DEC-~a<6#6Lo-*}2>zQUWQVtK=-9)PSztP>7eMViagv;V9ZBdb}6QnLp;O59OyBfIst#xZoiu!?E*8_(Wj;Hzn8iJO*hJ^pGU}Q{;{k`}3D8a^oeJMYO$?E5r6%ZK-qKBd z4R;nZYwv>KRSj=&a$*ebui__72&?t~HGAq|e9uAZl|_9%ZD#6?WrB>ce*TOsI|YM} zpHP!15JN06dRTwgqtTpwm1}XQ(U23S3)$RNZAfs2-Y^TJlfJ#`TU%kfywpb`=pjY? z@|M44D5^O~O;6fc*eM_$k?q{(`Jf;KSZy8cfA%62Jxe_Qs#I-gg zS~8}^7zSrS1#it5vWy8thqepy#A=f$u&=p%6@GZs>?{ZJuYnanEOK4s%psL0+@GS_ z311gL!57MPt9Qq!6>%yFC^G)jltfT}0uTDw8MexHc1_LSFTb*ejOF^tqsS4y#!}G? 
zAGzg`xvrTsww4nnd`CzL(IB=lsG0ydK*qlFw#FmpvK!%M+%j4)iUSV(&V#}MWK+X`gjIIM1EOqW=eW}!Oy4_Oh?(t|pw$9SZ~ zgUb`uEdHGUqB+Rl%5AvW7oi(ae-<05GXuPm(muhV1a)JRtA#?_)V#VV>~opCEBF66 zLv~@)Oy5%FkoIgXlhdnqi!4LOPR0RFD+sYH2MynBSd&C9(puC;`!4>yWU9kg7JU)~ zKjok#YBi^oHh5!^NrYqKssl}iU_3}O9dps+$R7-xpy2IqD@y7AhSkIkn(fngRFk64 z&EL~)Ki*j*yiMc2#UB>LUnJxfR#|&`-$Rag{84yalcFPAW{44?OEM1;NZd$p_PRXP zNt96--fkOQ5bzdhMx+gufNpVdS-&WW@`=Jo$+ldAQL%terCQzyc(e2?#0mPf`zi7h zMNa`1Y!T=N0g8+BZv0u&RN7@XFa8Z_s`-Bd3@|?jK6vppId2djkrW5%#$$gm>{%B{ z04de(>U+6lIrtDgOXs%RB&q99tbGx+=-MaC6epLk>9vJW(LM*9GF}FR{qXpn=tlpm zb?*#3{Tr6ZXlez!|0(FX%>mvQlAAPDjsBFDix#tJP8!II%ZrPmlB4k4(L+68^K!Ym z_I#0`cPmB^?tImsZeM-ACpTPlY-1tclDX4y%arFu_gMEIC<``>04lT`PGWbw%ti<# zu3j6*Io57jS^=J&B9(YAefF9fOmraC@}&{2EO#C*?xd20@dKh8xq?SJ;p|+#Fj5t zpW17F2>XmC<66i=%HqV~T6${PJB0~OeOBGa9gFQReg8S~HS(z>T$hik%p35uAuQSA zt=1{-EPX>(J#qn9gNcxNkrzgx?B0FzEj~#M|Hl^JrVL*e_(~y(8=+=IN9mjYt^6I{ zV(YP;!ZD1#b8O(G@S|yO&IqGIiQ8CF@46tz2q{5^_iyh_a-zPfa#F`Ukf65tCZED^ z_@~s^cCT^Oq@1Ym;{kc7@!HKH>$kJp2{YuGF6;-e53B-Pe!mKG(ugB$MlK+n-@Dez zK7OkyoR`3cl$$kjGzX({l+#!YWO;a*C|lHkCWI(-PScZozR|m3#?q1%-&=yq_ZWO% zd^{u?WxC5OvQF{P9@PA@ zk$D_oVaWvP{%UAO&5<-yDU;7iyFbBomSm5a+%-O%3c&U<1v{ZUWbtL{I=p^3hG=bp zAm;=83i~rE0u%zJZtF`X2+YL4w1bgSVr#h!weB!&6*^32(t7al(5gI=6f}v=Yyiot zrwy!g%%?(8`Fk5>7c$uYQ}_cR6h@T784HjeN>#{+&PY;69pI8M4OhE4k2fAc@Iu!w z6Q7rIlC7d6Br>hkZ;Z%54Ky>KI|)mZT#?Z%>Nv=JJ`(gKn*!yNoCtpU-Zieen763F zzq2K2?lTYOODGeC`TtTnj$Z>&3dNK;D5i{9!p0`0I5)@m3y6k#S{Ivoez^jC8a$iTC|c6b-hWFt8)vBoN(s=oIqD^Tm{+<~0>72+ z<#a2lq))$TWT2ILsyi)1g)utwIr-p$=#t_NDQ4XyF{#3}pQ%Hz99*3N(fR!@T1Yry zG8=rMYmu|#Q_wKtEWveC~#(8JcI4aAx|OQ-YRP4P3rnH)FP=c|TK) zm@tNin?tQq&k3|~deM&e4lYEava$L4)Py~S30KmNCLn_;SE&mbm%Z+G4P`;hQor3VE9AQww z2VHwb+>lYz{I6+p))~X_FQx-a35j96#RNDrc8N<=wXz8O-u&V`2h_myc=a3 z0vAUZSSeQb^Kb!BO_tW0$yMuYpj3VpOyXf0ryCP0g3t)e$Nn^%!4Ln8zebCQeq|W; zkV{kWD0tX0Gf6qEo02Y}o=q>Z#TJ=jblMCOICTN#&WakIQ_gE&*^#G_7a#@Juy@Y_ zoDNLyW(i^FLtvxtn96d;6s~MM61`PU3?V*JpMM9f{Lsn;WobN}Cq%zrd;9KsVeXq? 
z1t7fWm=QOG@+mf%*vXpI%2~U$%#E<1xt^0R=p-FJo?YBnk>c zryl4Ik!~Rhpjd0iU2n^-30-FB(Ll7=N)?8h?>mn| z{>4^_b{v2vgijrutO&|99j%Ad4#63Ryfpm19IKJfx*Sx}(TaeO!gu^J^;#atl?}9$KlQ zl@frT@@0+MuwFKzYqHDfEX9bL*~b}B=o1DbH3UIk%$VDAn>TfWsJ)^W^%&_A+VAc0 zVBZ$RrZtuU2A+P#L-}53vjvyS$w$EU+%Drg5PcW(Or3- zg{kJ?ihzD;9y3yG3!O%9xck3^=}AN<&I&TTHw~0nn9jq2sWm4$@e2i4JPC0SA;i# z5BMD<$S-unm*E}@qH_kBiuvunR%54LHQrf0y5AGa;yLSDV)$&8r06|?%m66a?Ie-= zMBnx=(+D1NMIO_WGSN96fY)Tm#hgwA$2s<6;!k+69~owa^G^jr7wtfpNeXL-!XvI} z>V!Z~RKVT=r#GE6x?#BuSo*ASLqPQ?)~{MO>yTNRvYr_%4o-gt4doLNlWYGWd5RX@ zu5y_?*%ZYk_C9*^LE-x_-?J&*EP^d@+)@HOv9X)D*LnCY;eUAUa=ibDZCZWZVy9N1 z?&{#$P1lX0;4os<fP8%cKYopkr|ic6S-CVM_^Kh%h>{1!Cbb1x%|s2AHsorJ^nSVx zAMCVk3o+lFye<`wFXs%6IN%p(f(x7Khsw8j!J4fHqL!^t`BOf&Va?Buv2o(=SZN8B zD#%^Vt~;4FqOV>)k^6D?4SPiAgDM1@pRI1Mvf!}1MhHV9)~l169j!u5Eq(5wI2#+|%ut``7+5HOww|yH;Hb;>f7OCoCT|4>RJw zR<~H>ImwDN%95xuhhwPA7*x4yK0dx(;1#M8m%HKT2@Laun8$@_ug*9H8qe%CJ1b|R zV)ZlPj`6P43~5qk{O0T4aW=Uv9A>?Yr*-|OvCtEEAgU>Ah#}z0E$Gvj=SmyOkx9u; z2?BT<@Ya^RMS0c`IMJNa1YjzF9^-KmFMPnacPZAve4Bv+g`yaYzfMIJ~Y=NJcxVR`F|H(Y*Yo@K=B9CHg$(|F)PC zqM6wnvlo-rGmH1J_Y92{Cw8pgIN!!U9)|`)k7X@WOn&s<)}K;$_OY1zTkqH)v;j1 zECiI^C&N)hrJF$psU0H+clAG3^epUP^A&zLN^RWI@>Zx|fEKcj+vfA22t4dKS17i< z&<7HAbKWhx%^izTCUU<6xja+eArGKYfNWr~g-v;aE=`Izg>O=ORxf=LiXfIz0_W`Z zPIXTod=jHpv?o`J&`E%5{wMy3NoqvCOMuUlgC_quo#p4|n8=!iSFh%k08pKJBqNRI zM)9ZwV6kx#a^+laqlBEdeDk*8OiU+j!X}=WhO>y@{VpPWxJa&MSI+4v?Y z1=7mER;H@hg)?~Cv%uP^Gt0)W79Wh%UKeKF1--?!_iz)>4?DgKXL{Mjt6x}Gl{@0g z_nRBw(t|J`SrFgX6;ma;G|*EVBl)_oy?F-KoFNXCDqwETQC; zL77shzWG$*bAxNe8w(8ds@Ybpxx=Hm zADaE2Qu7|pjgqVmR8O@s7fJ)yO~%gh2W{bVh_KjxZb|(q&tNWX$Peu4#V5AzCo}|n zcTy|Lier7y-b0MMaYN?GScvROYG)$vVi2G)lXo*8`~4V7YU;slmOGK0b6~E)NrmJDrTmr6i>6g9JoZ?Fs0q$iol$^h2G(T0n06ONP;ew? 
zX;WY+zg}`TNqQ}`j(3zK0N=6=SFuBT{zqD5a4hm6WPzha@EED%L*j_s#`a@=Ex4kZKN$H zfRw@Xc=J-1k`}NiQX`o6NYmkakwTNuCY*;*oKaBk)(~)RJ+Ur-UmwXHArnDhF?9{H zF3Vkj4={skN9a%iba#5bdM958YcJ+qBy#tK!E{?yMlsUltvf={S?lJBqIJ6khI~n> zIXuy|aPlkuctL`YHj8`&wU(Du3e2)>`7U7r)oMO6*aTg16d5~tWDr-;KmZ1IEx}z~ zztM(bG_Dpk4N}FGVQ`|I-Q6~8C|ai6e>Y5Jm>8rDW74UWr>;kru(Q00qx5YTR|f6N zs9Qe}D=fLgJ>V=Yx#=uPq!LTlH@$Vn{Dlu#=hM2NEkA!y1~mSy?|kYe=J%>6HTY1C z5dE(6x5U7m7B%h2DH21?r_TS9-W&r6qT-cH>xaBNlQO;kEBHHatuZsgt)8wKgYrw_ z$U!Mz_04LOpWLl@1eQ}Es1erdjYPuDlFUAnV~*SW5IgGPfT=Rb!p*b&n51z{C3{r6 z*!R@4erD>_RM-Z7@vu~gGhjOz1gv}}jo&RhPiS4uTXe$kDE|kUAj4+TgMNpBIL+7p zut0*h1Rm{&)VlARN^EF8j{@4=IzkM%c)h67ub>nR#CJ+q1GZpbE7T57f*hqkV^cAh8>1I1(6*PATI-{#gkx@4TW>ka}v;C+Drf7JrG&{=;K zY}x+cat}ykQnKq*KFDM}em;M|%}cC6SVp8^;7pvYX{­hkV}lNG}YPoLOvQ>45T zlAW^jtlo3n0Yj_QE=UcSMXyHG?+4}4_7x(b9rk-V1cx{z}977&S9%$&1bOIG(A&H<9HlXnJc&U?duVFAt+YA_CQ6Ig; zp?JBs_D8>ihR!021RGGcb(aYUt=v5;@57HE0$4M7ey|5&=^{IO;a4}8 z#6K~JeObeYLW_jVk(Xe5v-m%Z#%`a%O_eeLrz~_TW%kOr4zg9!UWn9YbC=vWhvI4~ zLM$=-cC0L!+7156fqI|d2ksPXh}~~Kq7}hDBJlq75w!KhE@B+W8~`ew8Ft|P;(II; zA1&n6H-DSnjGEJIbkfV9<2KF0Y&_qVT$fufwFt1@6dnsN^8(!ri7T9RX^AMIweeHJ zcOtqjJ=UMP7FIw9VI+ygvR*Pfc}$J?2JgS62`P92?UDSwMlVGgSjv%IC^tFK(OD-R~-^FH8fy9*NODQCRZK3cN z1%weueu*0DZ3SV_347yl+X-;1e%^?VFp>rw7?3t^HWk4(Oj(>_G_1M1PfpmXNklHp zcT-mlvWUM|fx%=GHgC3^OXITiKz1NLYtalO^XU%!a8eaZd(kG_JU=;2_D5zgdR2mt z3JtN6aX2Rki-0!g2}pbRW2StrGj7ab!6;5IdUrG#v9V4*V5U0IwA)Vu%2&pW-Ly@d zU|9#QKeyt$&C!AQG>J)9CNQ@8mj^TEZAdTallBm_Ir6*bAXVZl?*r}{re(HD^={R2wag3%`zi?g zLDB6gY>fn?$dUZjcq9m??p*t_WTC{&s5bf!@t=-M**r476->fQ$Dne(eZro)9`H8w zLQaSU(eVC!GO#6O-#Z&wgK~pum9JwdG{FkcP@`YrgbqbeWwM$FX2%e!yaQw_WhD~# zNluBzjc*56c|mJcLUtW6r*V}*_tz_zS z8&{G3e=_avU}!K{p-N_)&R4X?N|Q+v8d0>vsihPfJXq;?L=FwG-q0h2@O8>ANMae} z(gXgV8l7zbTNo>y(Mn1+e}V;Sv%haeE~*}gT<&f9PicSS87fzySD)BBR2fea#-gz? 
z?!R9177IU25m*-IP_g9z$uxdt!}D3Ac+3^N@!82?;* zb9H+840->MH8{p|=(a~RvB$TNm!kAwUZBsNMKoe4SY7Uc&N!uaBF)r$(y1%@vWgJp z#LSNcPQ;o{m+HhaUDu?d=UR^vl@iPsxv}K4$}sn<|4_?E(=1fg6KI!F`Gtru7iby5 zPPs8Wp(oZ26cP}&RsvnSfl4DIODSGWJkF8sE_(uT5#nIq1|Gn3_!^pT+qdrQ>6Pn! zJg=nPq%w5oq_&76Fk($%NHC!o`b4Q_hIcb2SJ&!ldEo6_n-A+A87}{a9N8z}h1?A~ z6)b5{1PLJV)~l=-d8cJ($yXm!2*R@p5h|yC7S(dh-O!d6PsO^9E%8!UConh4@Wn|5s~Pu3vV;7^Y-ZQT)MlYW^#2aBtmZG z4xsjOK?US-+;K_g3U}-=PHr=@3EjUYv_wMB4XYleaN)o*u+2-BKeNS%|7xc*mhSj! z%j0wih=nhaqIO3>XaSzdlsEVT*zhDOV0{gB1gsZ#>c@4@NZ7MCC7#_eZ59v$ zMYuE$y3VepaNc7va^=`Rv19SBe%n)R|l z9m-B-aU%@jIADoY(xe>8W{(i5z7T~^&Lo{YW474(@^WH*LYto;?^JD3-kL~uW;nQvn|h{hHC?=YKL2zqP?7YWT3ADJ<^3$zY*X92?6+5!;1AXeuWr-0mAMv{cOrOA*Ls z3{A7(taHS!srV=B<@lCN27n|jIEHS>;?I$zOOu2ObT4;oruPr0B8v>!2h$X@wS6F zE>AoDWe3CwG1`8&v0HF5TfS>x?N3+5Jix;}xG&G?oaT3|))o+NDn!49N^>STY zmrkKHM{qDh4i-?Zfm`@Q-43P05{8h;!aQqYNaMF!Y{93viDHyMcKC=x8g5JAEPN-( z5K?ISxN)MWG^0h+(%JT)hR6LkEMGmN%38^I-kp=!*Qb@gLs8wXUN2@B5jnvMd<>r) ztVBd)X4mzHHwg@Tq6Rb| zDHBpU;i(1Va{Vaz>klUGUf{g7Y6pvcafIrHGUo-ZQ#f=on&ij9P~Z{OQJ?_KC-p|F zjE67z@(K0Ir51tg>X`$jLM$yKw?{vON()($bZlUT- zO9~;@*k33PBp;$$Pz+2wf%$J##S-0!7&wNh(5W|`i}7pugosb>wuNNGUUd^cU7r4O zp8420<#F75CXY%PD6fx^~<^B}uPy%WObB9~cZKEuh2_%@^x9Ih168lgh$aII^W!5{=iM?YBoR zR=*LE1O~U_%wfc+L^(#&m*({oS-9%eGpoZNi}%E&@iejQLA3N>JxyU)DH6yL9g!2azc1IFNUBc}_`g)&s?7zo*2wBWuoTRuN{a8u>>r4g(x z2ygk~AY?duz)JbI4jvj*>MH{~AY++2Y{tEuSV&AsH*ZD`+u|S4@iN7WCOg{Fc1mYT z^Qx{iVKKocLzT^{+e1jfz5+R=nap%HgzXf3g-}7~6>8&&i#+qEGZ#N}I|Hq(J#3vrJyfh;_S= zlMsjLg?uZ4xgz-t^&J_D^7cWdiU%97_~>!>_|p=z9c#lZrL(V|PSv}@VmNKae*Hf{ z`5FxTOp$tz*IB+1ZM75rAHDUr`nOB9gHO3ZPx~J?nW@o1T&joC=s)=xdhkm!T1n7DeGeV=AN-(jEHbsPeHd6(|#p1Xh6?KR~iHZeYT_2}W-fdt%r z*46mQrQ+5agoH3>vYAHIy=xjn%O8 zCU;JB3nuV1a+9}1rmaCdP*wiz^O!WzJNF4qFzSUK1;*<0Xfp<70&(~XxrE4M2I)Qb zvw9w;6VDzSK|#XO{2H9ELW-sw6H>bIAU6)xSm&dAr)WD4E1a|B$k5iZ!Nb=8lz3~F z=Ekz_j_8zSr7EfEWk8CLoJoJO%B@aANO5kZPZWA3cSFXE3RE0B>=>YiPL*V#vo`6f zQc2~y0g;GD@cff)d+=F6F{LrfbinY9M4QA!vALO$s`V3P`PI3NH0<(q&nVNn5=%8S 
z7>s^NrQ{XH`5AVSt^|jCR5}|dq_Cc-Wm79guhK3z%@o|G#YA!xHBYw!xb35 z>m4vajfF)rCs+w!xW@$f73~rJ8ehK=+ST;i!@8G5S6mxF_BiKbRIc(HS;8GM11*@DE=a&Q=uyX+bHM}_VqtB?D zmAUf1w5>-xIp45r#BB5WTcKXlPwDQFTmJfi55nkk)8|VEY;VW_d>K}y235ooa(-n);>*F2x#<~Mp z^gLIgJe3vvgsu3KA#hM``9y7sL=sIJTVq(jPa$Vuk+r9>j4AU zN*5?M5Hw#W5W={>m**)rl@gw8kX5J$&l*`URHw+H*|a$p7CQNpg)B2p0%eyBa)_FD z&=#+Wh#PPAU4JDIa{C6YSor;_M#76QYx#0zjKTAiX`+BCTyB}7#LqAo+29pps)Do{ zPaHHxv$<@q63v0Oi;WbFBY|59KQ}yyf1p|J{8Ei<9B#EKgq8{=Ot^0Kurb>Bcl1et zHl34}^rWkjw$G_NvEi5BMm|KcmodJ8p1sRx6uLiYse!|aaR_E0JNLD#x5@cuTaK(I z_rN9ONh0T%vqAm=t}V}7zjz4z%;0~m;P*1vXuc|+)R$T7L_Ua(ZZdm7toq6eO&Zfw z@3m~Z%d~E_mf^F{+VGW-j-?n9ryxV0_E40l4zEHHkv=eBFf3CYngW2WIUFg}YnDL` z>f(P5IeNoESKz~T`2}Or?-|&&4>6v>g*d#x+<+{p3SC94)A=h6P%KOLi&}f*6YjWq zl)=GHct8($| znkjbwdLFw7+2ULY<9W3U~T z%=DSvUhJNPBTkg6tVKHX&lhR#UYPw0-=;N&=b)`@>*AC7%c6&*iNQD6d|nMtQmhbm zv(N7jJ;Nr$liAK~>hsHfU(~p5eF9_i;4Tp|e?7wyDJbO*t7F&>R$wApZrIiY-DWXA zP@BhCDn;qA7C8#$8!of)k;;ZD*+{wef%^qS*@>LEjlZCYIQBj|z4xY9<09++)4<1^ zN`I-LZbg2EdytctdqUPJ$X+Z@AQfR?{CzNTLH*G1QWz$CK=R^QV{lCRy=KLrH~Fdw zd$rqqJRaRSp(4>PlGl{`yG?df<~Lz@9hHAK8QCIiL}nCHaLMMs(n5H zj~^0TUg7eAV4Kx8`4b(~0byejnkis9qnJFr18qT{pe3Olir&^xo&sV~Huv*jkQN7s zD#wPdl5I<_vvr}K&UJ~iDkTmeX*nZaTW*N1sYMZ@ma9Zmz$0Cdv7q+?a5Au4^Md-7 zJhU*f+V&f+@j31*8xF@(%95Y@I}TeSbG?U%HZ3NV+diA3ZHpw$TdAMg( zi*^v(e>RHj(dQw~g9QYcgJbY8C(_ehBqP9BLAC;EUdb;;rbzbkN}l5(jVrxY1=&y) z6MRkD?|~?;C?Bn}5i^0_AiC)h5ju@3ieUzH4KpXL&C~vXd6Re)I$Cx^h^uPJ$uJ!B zFL@U{33E`i%&M%T$$_IiyGEdJtvMkPaM*S2lAGa{=-19q`^=^VQNaf&%S=2-Q4wtvH@-?iSm! 
z8d?T!J&_o72$di-Jfvw8B{$3i5T@ic&$6YvJ+1;NIb97qcFki6c(#&6z)UE};?5u;H)Siz-Kyb}7PdQMPo{(}0J_LX4y(0FtId z(C@u1*l?coAN|4G&sCu*#+2nj(AmrV)ms*;@&}VbU6_RNc>ZuGvsuPvpuX&D5g-p1 z{r;4=)K=P*`QSw$SZYq-YAYzcj$?4 z$-*+E{w~Wx78A=`Y<=U$TANEYvr52K84bQiCVHeq&q;k#a`&&&TM6owlJ&Q8#W&KS zc?deS>b<;BJ(z;yQZWRZd>y0Gz-U^_ASc>YF`H8~Ol#wm5uSmf1}0IGj{tB)*|2vQzXEB_B*rE&eqtZg`Ub>DlK;{D6eh zKo&XSRU`nLK341P&@ls4LBugHLtK?%LK^%v>%6Hx!?U`jvQu6Ph|NAMxj%qwFgE6) zAQNQpw_MyNapSfcIL!rtZU_DQEZI^y1Kqz&d+j&Cimt7if=t!lOfPP4#`@pgR$YAK z9;|Wr2DFH7IsYK}m11+afa+WN(>H+fgusu$&=*WcUQP z&xY_%L7}o5MiVPkC*Je3kl{|~O?|(M*_7<#*jPqo`cd~A1g?65uk!vhM!&eOSNOe* zD;}et%Mvj~E^T%KESRo7oPF@)C&DJD`sMSMGRR!2+iE*%)0+xSgFow7`Jgm^*0?lj z@rAKQ?TSOw;i|k-y`PO*bnHyLkJ2Vz0&pYN1_G+M@G@QKd2Oz1~|i7~dna;sjZa z=F`0}A+%Lq-AU#geFk>7hgt}A>(~|s-aFX%b$nU?hN_9KF4hyNvy=zoOX$EwP2J4m z5+wmpZE0dK*8Qqr z3tO2Ub1x)5%WjDYb;iShP@7CoiQNLwF4%g;l3P|7L~NeSckcl!FfEDuT9Ar^0Tn7abQ8Ta4ckP zTKC{F#m;py+3?CJX>kgQ7zRhD_ZT0gv7fZ=9}>wXJib+GqK&uLU8anMqi2~4Rp#`? z4YcW`)_Z#cyl)p0)}P!0Ha0H2aFPWxxCFb*`4D{ycazUaDedCh`h$nY;7KewMaG+$ z^M7Suna+^nca_Yrs*l&vZO$#@_Das}9D1M+BkuzDacsPa_wtaIF z2&SGL6DzNg)?l-Rt6+&`(EYaJtk2}s91<;4R$`~1;pKbu92QWyggt#P3XANJlOmoE zl^Y*847+UA@)*y z(P_63M6RJdu%S}rvOth08Pt(+u-MXfSb&>jHmV&%TkR~!qgg*+}v z9rm+tkX4pfy=mNPe<%CsUvN!_-PXU_tUq6r>c{;ys*@?2(Iu2XHxkohOC)nn^mKPE zX*(HO4^iLS3i#tJFQ4$#1DDp*oo||j&sHqxRg&Ff2aJRl{f?Z<|4Ymya*V%Ny$qrt zH&BVU_se{v>YAG4Ri@j13*P2yS1i(YduNfQU%BybnGCCyejy-Sw}yboODXZ2I8N5` z8)<>B>i+;7aWV=*JMmu`iklLfPc@ZpSuGn*Qav9y%*NF|`}En%tRiNQB1mHenV^fy z42r-AH^p=ekf9h1{%LO{ZS=RR4Up_rFjsu< zT#@(#5#W^pYGG%w%VZhZ36`z8*zz-!QPQSj+Ya15a4j6l#(N_}H>x*4tN&dX{xfF~ z|E^@Q8MibTATc;;2w)uSO25cEd|v^QH3u&9)HRPNb2{j(KghfDv3#*PVCUj%ewvU} z=wD~bR@5JbXijyziDJ1_|hhlrzFc+I&}4pShF)YkYeFG+_yr*M7x3ZLCG)1`|ky zhhmp;utqxUuqHl%>wx$epk)F9gm)Kj=#}QA?q8Zn;*I63xjbwyt=pC8w}Do7}PDth0E%#|O*tlmA=4!>x> z0^nQ?8~O(@3Y{VJ!}Bc^9Myeq?dkJLqB zFd(*BWG3&oIP3*cTHpp4PGNBhd(_UhKOt9tv?L1~**QnX%Xex$#n?k}ZeVy4BGC70 zpJO*gq7B=)6}z5mUqRsaovJeox?o_Y%5-6g`Ovg~E937>;h7xS|3@nd5-v++oMiVf 
zXHd60y8pTJlYc{HVRt{w0Lejs>59Zc&?EH#)vlJz{(*84)meuV#~-R;l+m#2ch@*uS&2AvwE$ z;IV;gaG*0WI3i`G$l>EKs3cYV!E0RCQUy3>L2p-jE|ks4H#G|HB}O*meN^b|5xAIP zJAnTeckkZz|HRs9saARg)xPpRf~f2F+o}|X)-bM6L?#9Mb-SGo4242IcHAAQs`}tGy8$N}a7}ym!Ojg=1!n*@D_00N)leeyPqK`EDV$d|_nWD&A2Ba< zIMua?qFB+WS6TBF=1Xt}G1b)+XQq>*9icOXM%LE|L0Z9*;kMXrY=pStNk-6qPIXNw zn1^M3ifP-vqcA0%D714lFq=5;sZv&(=a*n)g-W!}i5+hwSg%xlFlcU8s~-|xJGC{1 z5L{)-cgDMWju)l<9^q&D+m5wDoTD^IUat;NQG`4M6iY#9xK$@6JK&$Z680gHyFXU| z|1^D7j8aAnsbLd0a*Xbx;c58RR@7&TnsK*ltqp4j0=p+Jwp*0nji8+G_wb>RJ|v!1 z6%$s&%-G(m#V~9=WTa@hPoE%9+6fW4L>`RGL&&f5c^KTf*oLM|@CN7Z^H%zWtA@N# z(-=N1_22ttmwY^;Q~RnTthmU$n{|8>4+uXV=eP1cqmF(kfuC1*yk?*UpUZ(xIV8h{s1 zw9{=yw6UojOY3r^j^X@3GoFCiCJm5mz>O0Id1*g^t&ZXGT(MW6u>|-j%_|?sfEoDB zIM}ospp5ozRh}?5{s+@cIj!N^=F1nLKx!D6^@wMNM{eWJ`;@^BIw|-;XZSSeMfD?R zBui5I`g`muD6|7=yA_#cCe)F)^mgKZ&eqx%ag~$L&M8E$$5)G>$$I7}%0Z~Q_l4t$ zD5x5)3nUh8gS)D$G5x7`_Z5ymzJ4IuSL6;D_Hh?X%HcKx&MHCFY%9#5!Y%lsC(J!@aP;0>gu{Bs506BrY~gDiYSg&{t7Y3o3PR zgMTc?J~JtG3gCeIA}$+{PNJJDlJeEdTUNWbSsYC?Dbl!;#Biy-uhs2ZX*tf=*A5R_ zclGI=5=KpAtnGFERtZxBA3Q57O;zqD`No|nW;W(A_PzKeLCDxM9UNmw^YDaqAL-GL zk_E+*HVZ1KR)yz_Wo58t%PU)g)K`HWcFknu_fP6b)LkWdQ0`=>6lQ8l+I>b0=#pp0C(BnA=NhG>tnR`jHOpuV+p{o%^Ap;>ZAG|zKj1vvr1gn>eVd1 zBAaK7qP^*dK;ZV7GV|EbxqDVS6ru^%GM<~{vYj_ifqsy?=Z%nJX(wc_4((lE~5ETYrnW__fZdm{3M;@v$e4SG23zHSc;|D?fr#Usypb;_FYT9X|5d zif};-96K+LM=YH|FxD!{%A9@T;Ngx6F44WKMTM8axmU*pBW`s~f_%JiOyyY5E3V{Clw(EB^adLDfwW*On&Cfk4nXF?04Mykj#L+G33GyB5v7R(80njV z+~$Mc1^wyxmpex@P7{?+8XV}vl+ktHF7O}4horx%uB9wuyEv`iv1_K@%OX^;MxGx} z4xrV=8?=f6st9z3&pGYvY8_ANnvTZ$%mo_Bv|9?6AEWKuCD3w6CpEUW(q%Egn$$JA zE3RB-I3|txhBq{@9FXAD^oAQ?3ee((r>gh4 zz>AQ}V-wT8z5sEDASuJbs2xWB!7rLK%oYl0k$xwAxuM9%+{F2ilE{=~O#a%%S?SPNAwwmocb=d|RFjyAT9c{|2=O ztYQW0Q!V-3{sgZj0Uul= zLC~2R|COhk2N-IVB_{Rnnz);NjXGd2cK;YOU*V<|{*_EOiWm zL=-$}EgdynEvl3R4m*MXgsMHxM&j8MRA312 zs94y9@qei`A8%>dA+i!Je`s^|LDXi~o^o#_J>6qF`J)|m4i;V=JWy?bQupdK=LUq| zmegB8kLVYW%@rpE_6rFotDFGYm+1v 
zd2q%ZSK?1{nqlw+a~{8dE-qTUZ)8JdS81I1uM%$N<8Qk;r1TT5hParzU(jjuq7>9bog52JYL6Aqp(~{M7M!q$0cf@$E^cjb!@p{Qf`9 z42!|cs2|GZzK_=Y%?DKtAjqt%)?l#GsWu&4y&u!{eZ4z_a?JfUG0{D9CHu zMoKw3>n@+k0do4Ig0rAbNwz(Pd5T9;aw(B;NCO&Ow&NBHq-}G7g zYt8kT9^@ToKTE+vX*b%Msl6uO zthyZ>HWA4ZDKA@0U?|2L=*T2p@P-O1p7SS5I;DsM8*}@22V=U63eI*=-DVE|nMQl+ zKl_LH?4+SI-OdNK%lfQ9%a-QWLUApz@*uvTKW0L{T(w;N%*MrGdhKia2yI(X+S1oh zKtsq!Wl70yfR3$NqAO5}s~WWaJ|R(t>T^)elNQty2xlYSovK~F>L3hn7*p&#CJZ{F_NJvTo#HfCEraya2?;FdcQ8UFd)_4-m^KNoSD zKNv72smruS>{6Bd#_qGy1=tB|DN;I`{@g@uQp6i15)n^H>u@AarD^3}n8$#)pj0c0){V^wrM5+5F)JHK_WS zv~k6FMrI@9r7twaki~7>KP|(}qMUw!s@Y0FtqW5ja@G`N;uFF))xR{ z&Qh1+vg6>nDDmf<=aG=Q>~WuZ!Kr$-36(X)R9Gsd@(O7+kQ(pKk=62Uc!|8p4$<1> zF7G4wb8B9MqEHf*RLXYSjT501fwikL3d?-A7bRRy7a(JcMmN9mf*7es%$HW+pBDxi zYJAF3LP_*8R~R`iJq)F6@74x9gA&o$6{C2{0v>}uDI~OtSC!p14y(BxZTZTRw|z1=@ZEG z69hB�WMIZTdt~E^g+9-%U{-U4m*xlHpM9%>V}SW{0Q*v6II~Lz^nPj2ETVsU1M} ztyiA9b-Ei9v*-`Db%)a=J$&gf(<_=J{kgS{h$99KKWa|sbo~L4iYO&pvaAGxp|{Vd z=HB*a8>Dvs_H&#=p18C7_ZCUa!Wv|&8?`!XV0i70U1@(qf(>D%?z~$0#GVU% z|D=y?AAd0npXOIPeW^4{$GxrM z-xb!n7TVF*`^FJ%OuOk_>bPj^6l{+bHQP2dU>g|A?z3Fs^P7(le3IfWZ+4L+1p&P| zbbIh^0P-ZN@jay9yj(u3f%BCOsrBJHkcYg7(TRn*#a&sqab~&2-V|RDYBrrnP6*Sa zw6OOjc%5igEjTl9?rmvHG)J8f~N6K^5nOA_(HkmsY($q5KpcY^9#D}*i z2etr{R!&iUB#Jl3T*FEg&%tMtGUnUjlc9#sX)Is{3PT1BE|2}qV9pTiD!-ITJU@D{4SO=`%aiRb6vV@T zs;62EOAbn8{y{mMntsDQb;nMk*-uYe+;~bsWJcg^bGx{8o57B-lW4S-@ncO2};*Wn7!M|7=Yw#+=IwOnak_n1tASzE*3<{|4G(d>oj)wP9 zNQNQPG_V1r0u2XnJ^SM@!L;@{BlBT9%iDB5tnquzuP_6cl*uf-)0?bL(!U( zsAmb~{${plF1mMtui%>^*}gM5F=5^_P=4Z-$*neX*5QwNj3D%o*eg$e_zl8qZ6S2+Ga22UIEm9wLW`Q7*rnXQ@Zl80skxDxD zRU~@Ncd|{Wo)sC|uBct?9XKYnWzfH6{}eV@I>BRo#O^7qw{P^b7FxU2F= zyymW?Mi)td{^LHF9tvxBNS)?I!GMi9;VX!oow)4)LReso4jPOSt&%PG+$~Jw5b|8I zx(<}y6u$GEnCL>OnaFkJk$Q;B@%IpdH8a_}3TK`FC ze4~1~l6J3##=oe4?c|GGs?vDGl-ty)oqc8a6Jl6yDi1MGXVkUvKR}KEZ@LMB?-S9S z4r2#Vqa{#)KqKRXFHi58RO9gI(tI;wK>ueSgr# zKxT$&tKX3V%(wyPkI#JHDK6ekq$U6auA#OMLV8}x#>?L?)?B8qx>Qvk_1NgzN!ZCo zWjX9n)_ZKZcban1EWfPM9s>0rgpq9yios;|C~~{pOpc*F(4*_G=9)F5s|YN 
z?8iHWJaVEkNT~X!lQId2rnEYT+BoHE>e4wPuO-9Gtf}f3pC0U#l1V3@EZ!E)Jng69 zo_yAld$AW%6kCwbfvm}BF5MMVf^SnnWFgilP$WOQ1P7jzXH^DQPs)IP3&`0umNC|6 zC`cI^O!NufxlJd)&hOHjkR3)A#~>ua&th;-qao%K7GzIQCKPKc*iU za&1a{><=o2@_Qf2QGVKeS>$8YpPiUWVB+HCY1y@H1?l?2eP5T@ zh$8YjwJS~m-lgdBGNQqcwA0YTB3vVp`qqRw4O4cYDt_EQpSH9C#3EK6W@16}$Ra1vqbZ>Y|E}d4OOxmmINK2?#xs&hr9F zDGO$I^l~12YSJ$}T^_lvGW0lo#e#s^z)+pi0Gd$5!P!6+k16CJVR)6cr9Y%RxUs~G zXV|4`xc6)vc|EMpU5t3YWd>;|NIqs-T$(d_sD8E_q_Rv5;1aBaUk&vyd7vk}Y5~*e zz=j2o9q{aI(z@*Dcc3Pu)8JumcosOewTGFuGB9k+>VinVj~}#ik04uYj^ye)WSfpn zh4R1=)WEv%Z{FUy1^N^A_$wzT$yz<3UveW*ymz(YLH>K*ZTOhSNFTZk*_wv?WN(P1 zd#B4PU%pb)Q56lW>$#GLXykK6Ps6MF=Hg+Xoly_(XO$PJx_#oWTWOG91Xr*+C(W1I z2jk1~t{_!cD2&GVy_zr;l)6ysY3itM$}c5W1|UFa1qK_u*xrD2aXt<=*|Oj^(@F{X zKr_qH|8GIz#umh5wG@oex4Z=4)Vb?`YoUlz=J)A8T3xx#xmvRd7J}O}_b&!vQ=YcX4`P~6sdFNZP$mwbwChkRQ3etNbw)|XuDio#UgCbZ$p_ts~g^5~*o$$kIX-jG|+`1r-M*3UKv_V=_in z_;@DtpQg&efJ{mPn`m5SACE-aAu;HBJvSc-s^ILh46BRGw_+zb*Y7oT55b!LR3d&Z+8UY&Tn}qko zCRAJ8jsb8X+Ihlx7%*+QE8+#LaFm3-JdlXTUoqs(Dt0^;nrUApYv)%0^SoiK*)0*BeP?9*N+|t{vW@5@~ik<{I9LCb(Q@Z_^6sSP2c6t~{l6#Z4pf zAa<;FCkALe?&iH;f~NB$y0#Jw9Q)*8XBp%fShx?6%GUYf;FIM=$*WvS{TeXl{I zk+4=u+s-6J0fJiC-b8={AgV=rA>+bYT3EL_5Jiwc00X+1d5B=W6goWQiyu*lhMyCC zCQC6BfYQLS`l~=FdSXQ$OI zMvPU;WjWjq8+MS`Y;&dqbUm%9e@)c-shi-7q#~|+y{lp-9+XZ7EB!Y%n!#G;^xGxI zwFs8$AHH<&M8@NYVR>Yte3uN;zBB6HZ6j_>jRKyu77<(K?2fm=tDg?O*rQY+DAG#9 z_=EVpcRd`HC5?y(s)+bS4;y|>WQbDTqFrz;^qC*>V&B5cGdBcj#p<eP>Jv6*AY}{02P%F0JiC16|{CPp8mUQLKFzme}ZbFw?bj zkW&xCMiPFryuA0vCLjm@!t(k&n$9yEdTztm!d`=il+7Lq{md|qqhLhLBtidouB6o& z^M~2c8_`S{=1#mc?(i?U51T`;Bhjb=yaqri7m7R8^Q&a(3|0Up>NYQ zVJ4hk?hFC3m`_hI<{zuHS>#tthsv8y9h)2Y)<7GJGWO)F9nn6 zel~lKQwvb$t;mVtI52}$%rH`)V7dbZ4=G6ir2MtHrRNv+!*BIhe&3x2gBM_4240`? 
zqpQFX(p0btK=B|RiO-`9Kc*KwFVhLg*i5sO99~364Oj{(#KL1Lox!o`#C#;5Z$pDA z_^m&NA441ux4_7Y!sDC*Q%$T&To^2bHvXhDYFX|YGoK?vroo1(8gMa&q?QO3S1^X4 zW(Gu76oTQ1Xc%z2x(^GgTy4c^|1GuLVdF>9d(fh~LZK&!#Hb}zeo{5Vc*5tr*?v;c zvTVg-w(uy8LLMSH*@}4a;oB?l@>Kk3U_rQL2-Ni9$a$=XRx`XMm$^E}Nb6ZGCv9~6 zzI*2$sQq$3fSH!P|APr+_P=Opyx>zoM^6ByUtsUY6qx@Po>EoKFms+Py+j68 zPY{SX-0!%q$KJwxM}szowUp;92+#5$8(Qe>!dBGy!GQN}1>0NHrup#$#U&1_f57#%&X7)UQoK zFrHij{yv>d4c-&_N8nnG_4E@fRej)7LtfIeI%2($C(Cc`_Es$+LMUL1Y_w0Vxm2RV z@jUZ%UnF(+8#dl_3-hxW&*P%`PW)%9%jKnW=8#bNW5h?PmXAJP759Ezyk}PJO2FdK zAbtoCHI|v}(V2K&w*?v<^w-FGXz1{N_W+AW7tl@(VMU<6W^Yk*Og2-XX5A)Ihd2rm zU+`?xO^wt3QqR56oS}z?jcC_?pBWJ{-$HZHJ5`Uk{k{jFIHfb;znfY~A84v`l^1U2 zis;qLC#R{YZShL8jYnh2tfSmbmOOy2MwK4e-IMwL%O@&iW!!p4du+HtLMufOS#rN( zhUwoPhUGH&s0gpUFKCwe!p-&vq)`k3e(1S~#cWS-oH*cnMPAO3yVr}jps$upmF`CK z4@P z;LMRZS@=h@|4g=PF^^n{0!~0;YSLD@pAkktTt6Sys?!=)<*foZnx*hlGg(PY6g*6G z)m0xRTeiJfX+B$rnNZ3;j*C%UmeaU*uekArcbj2j%FNSAc}`QidNzbL^cPM3O*i_M9je&)fH;S>Zn_A zw{Ar7ApTJ(w0vbt=7>}IatbI?otCqmb#^E3HBH2zoJ-~NJ(kITkq$9I5RS+{|9*Ok zxm-FMx{wdv1CcYdv+eu)18Qg|0f^?UlM0x~IsQTUdE82PBZyEtZwHm-q}vQD51dbr zI0~Mu9Xbl?u{p5=D(T^o0J8N!*s{PKPG+cf+N*(l(c?z?n2&|*PblU z(sKaP>ke}AX~i?I5)Zmrs8ao~?0w!M6F98%pO5a~Wgn2v=9)HZ>_Cv9m^(BDxt}4nX_~j0{@33p`ty!Cqdw+jl9#_4*q0o?(DB!8 zO@k%a&R@FT{kFFkEeBEr`xc?oxhlXlWC7L6Z7K>s1}jL1Sqkr%ofnnX93c6OXJzgn z9y7`YDbfUmCIiM2^T-*N?<)uoV9hjw+p`R!q*~;pM1+h!3jI!Ya->9t2v?c`_7m zp%LoGR?An3gPp**4v?;}AXTrEWW5+Z0x&i)%0j0K-<^f34q#CEy`||N?*Yb3u_zUE zK(}|0THCw801rCTcOHf57^{L!@s4<5(V~qZi$0e*j_+@wAj55tHBgp{wii!^VTm!* zPdO|MF*|}J#!!T|N^h3G`@S=<-rqRQN}|FIE1`y1_1@^FK&6T?FUuMMOclst%wcdZ zHMgA+5P2rEykyMeu>){(#@We*i$$CNmqnYv8tj`)i^k>7`924k_h%I^Y@}-=U{DfX z`nhy)f>tbDjHxWECJ{D5c{!>f{U>!=vAFU^UU09loi)GT!cX+pJP_u_CNkb40+xDE z))SlJ4t=_DXxz}zqQ4J(R0NCwc1o@+?l>4))B!Up)#Bvwu#H_tF|K8_xBUqiak{!{ zE1;z*RHa3^N!oNJ1l&6}5yNpXi|1JMZX02?r{oSRESDhk!}evCYty zC--Q@TkPuqT*0M_|0OU+*y|;){!$d&v4jw?v{Bi5Po)$|Xk#i8^jwY<ywauNbuoW{0?qO-LzBm^KzZhg|H>A)R;U}0s$Z%}FiCQe_uEWFKQQE7JpVVt 
zUGM%-Jt2Hl0vtcU|d2OHL!B)Kv5X6b{=LqHgfaIhx@+m8i-1boQ+3mmG zor6doZ<|nrN%Z~Bgz6f6Kulg`3y-g9QSgvD?7Y@82NR?~U$})5BL}{U&}rb@@-d#{ zrjd%Fz3bqu4GK3)nnZRU-2@D%F6ULP-H_!?k<&6&v9R#COm5jMciB>>qli2`Ygacs zOm-uWhn9$RrxgvZ8gq4FOhi$9bmY1^Oe($wRm~^Ih1M1%(>$op{AZ!HtSqBKs zOWtD%8)@xdM!LB}GP$%a#UQ!*s^lMaz11W|3c_|-yqhgXajb6?UE#~H77^Kn2+%Uk zFcgaVe@95+6~)|Lgrw3#9JMLoX(M?}oqr(K81iLglYUhBW7)zYZUdnM(o8)z>oP{? zPM$I1dgIZAL22QI3Vibwk4du!mvFFNYG=PI=@Js)PfW%6>+fI^=Xl2MF~vB*D)BIu zokA2*GXK7~2Q!4HvgepuZ3oH~Z3Ow54^_hm@vgzJ4D2w`!H;)9aHD$+szk8%D&1uf zMK?*))ZjIG9U}{Jlu&+FZe&y?R=8flPOcNdK7rDW zXjizujJU(%Zx-2#AG;3qnm>ig#!Pis`xpb^;y4f7%NG}Cn(%h4Mqii?D~1j!{o4@d zFLe`wPZU>-nzM1~jX9BO8}@tj7HhipLuD)RbMtMCB8o(&TyvFmWo0j}AZ0x*g?16h zYj&i=K>}pvnK$Cm#Nc=fdhgo5rDtIt)w1q-D2^DSK^-`-ug}T^XARLMZdp>x-I-Ib zJ{TMaV*|ahp8+frBE&M*b1nnw)lH*DC^>7TdXb5^XlR1n1LDT%14S1@0Grw_?h!sT z1y&wnGMoWXHu>fneQQTLpj6I zXe&HQZ@iXIuyjOrUqUc?dfKI?vKd&emQqNAMHjH&V~2b`pBNBg@pDef&ss_?-@Fvo zlY3>x)OyO~l|vd!_NDfR^i99?3M)KB9JajQqLTM*mXZW z%d>#LjA3R5nRaNbCzqHQx(=-qjB>{@Uh6K$3Prn9f!I-gB0Nj4)y3~-PcH+Tk_tFr z&7_KF_0>0lC1$>hFEKOVy-WvnNTvhJB^vg+Q{lPS-?HkGVwJ)pMN!bRMiXE?K5-zF z-iTK-vMNrWpvNTPAAZ~z4>zGvH%Ynl^iio1ysKF>(jIJ!WMrD8i>yYlXu<~?wM%b$ zU{kci_9Ok4-nMJ!sj6>BK!axrZq*ab$FORfQK#7S8>R;d2XwDg+$^w?O4cb?6P!%B zZo87I|5=Nv(OnYr3XJ)ewa+?8Y1ejsB+XSO2#i%R-u}X!-uTX<1#NbRi?U8flq~ai<>5tRlswOVfw!#zIpy+Uy}{)E}K@`JPF>p8+-&O!z*&! 
z0I^kjE?9h=GU5-if#3%N|inFeYxKXBCW7uo$#%;K2;QMt`{Ak{){jwQbZc$RSus!tc?BvSLNldlF zcMlR=F|`3C*4;p`S=V)g1;*Tay-%i*wdQ+gx+i-d2-HN^EkYbfRE^wMEjvXv61E{= z5^)^wh3w9V!P079@z`BU(vh4e1A9RRP8hlT`a}b!JTjdAbpgr!SUS3zi)+v_f=j4% zo37dFrofr5aqq8KsI8_L4jzX&x5mfE+C2ZHgA7T+VpfCrAodnWp1Q=1lmq2-*z@u( zNi)sbT{dIsx93WT4%BcB^y~H~q|$MLS-oC4tJOobdn+jes-g=B{8+(n`aje$xfe~H zrnCo89oTJsS~5~x->5rV>gcIXH&e*cP^>85CspkaQQObJ+}1QiVbxv;M|>zCX&L1+ z>Y!GVZgP=xQNE*UU}FM{>QHNwFBiPdw!P^4x|_(C%}f+0_Ksa3cy3A+SYXm3j?U-v zH{kint!#;*T}40KXh5v2R~kmr8feyKV%J+=Zgh>$WCEjdm8P1Ws*tn+Q==tspCM5H z6@7aB9Zh_tshr}Lyil#m`!{?)yAH4YGRWZcDmMTOxI_);)Vnl@Sm3K`+>IE=qIY%G zOK9zAKM?$--1JRjlx~x`6$V~96n;U6BJt!Bp4v9&jA@yv$(X7>VgBO!zW*S8l6O7xn;RGu zlGyf1m_;x(eQ>~bDH*JG_{m3BlAmI&a=#vg%q37O_}Tv8bOCWzWVV|a4YIWnf)~LJ zQpprdq)2E{!p;XUsNb^MKXHq}JvIx4`<(W1!Ls;ZwZI3}6>v3IlhyuNJI`fR-ts(Q zKa+p2csDE8erVZj34}(z+vX-y@MNqeJcz|5H^vhsaga{!4URV1f;L}#AW({k-80hH z=o>SW3r#XT_?mS!Jhaf(jDtQ;&QO@J2b7OD6pV4DcRV0wc5QEzvuxWz;$GY`t=pse zN)_sZefV8CJSi)m1zx%jUIEp41ObZ7Xj^s?;FDbJ`qcfY%CTDL!Wfr`pUy{rh6qyd z)KeELVtX_tk8XC2qrHn%DDg+uBP&&ha|oc{5T`Q#KVqoW;s$a$tY|CL(@`wU?6129 zJKIwD3llpHE0@m)KvL_2}G4;+Zdm~Td=-!pp0XfyF7@5e_jHz#)&YHI$;~%o=u#wSD4aMdrpaSPY?D7a)9m+>FW}ly+%R&yr&9b~= zbyD>tE#4Kg8W)e~nHF$7fVM>UjDonHvm^5FGm|}?yzG5aE5=lCLpZMU4wM_18#uB4 z%cnytZ4i#oBbw4R&0eeW zsvtul_8sLQ=pveYdy8NwEN!AdA>F$jX~T5&?boAif01 z#)XLrSa2BKjNxllu9-*S~CHX|JT4W98x_^6dDQK$}^@IsIheMLfQeqYh=i zj=|Eh-cHXjI$>>xF%%<`D`ggB;r>C|vAEODduASNe}jeqd2m3?4I2CaV^_@=Q{jNl z<97FlGF2da`*-a5{DECKF`x0$Tfsd~LUBc%a*2atfeQ(djz57WP)DZZTfu*%K!6a>)-3+{_@KhG*E291XjCIFPW0yiINuFxxv z;aSo3qiAW_&HFFqWpMT=M`(BO@R|qKvr2Gyf{4nPI?HWN({M_eg-w{~Hnx5=9zVE= zSs0khWzj~SM~Pcuvb^Cr9e)p5O*}$pI&@bq!=7PtUSaOCo3JXw?3h~K1>>NPx09v#L zNzyH=6s9Qfb{B`SbZnDMNG%g#XuB;}wVbSTXmqRLnjYULF>`b*Gh2)b7N0NG)*MAP zo-Gs{a%Y(5PNU`X^TgK5Rr|xh9qu0p-`iW*WIs;x+3B(|uxKZ}(ji7!LtRls*?mP6 z+ftUp8dpC*VR<_`AX952%~$&!@td?7o2#Xu?e-1Fypt+DkzRSFeZgxuBk=PhWFfuz zX^1`j0W^5iODa{c zF9pw?kG!mrI=iyN_ZJLN2!?pXU?N~>#ed6dWW2_OJARL0r1qBq87+2RF~y&NyVc}^ 
zUdxVR#8rLUmJRU;wNnSnj+~f`}$ZNzxVtv$1|sOmrSP?w`c~bv(QK0vsK6Hu@`EoYu2a z_Mm`C9;47ysm|Gpeb>N1r9Ax-oA74;3gwDc`cp0Lye2LXO}}nS0PtF$nL2xHbxr08WVeV$iD<)q9Vp0j~jQl zRgXY8JX%6Hx7^lm=XtW8%C2)kZ5VwRSz~{0lQLY^?cWQ|hGBs?18Ev<-u6ns zjK_1({G;jnVq%?jBhNYhebkc>W))4vC1hZGhQYdKN>TZ(3AkClDE2!yXheNCAKm_FGft=X3)oNAbk)8Xh8OGOir3<_*| z4(VvCxA%{ zFdg$Ln$9r?W{u?PPo`E*f_ep?%k<7D{)Wtvkk1d+l+7~4rGcD`=QV|4UF2>n&At>t zGRmH4a3}kzE(bL8OP4Xk(#7+H=C;zbx@tW*cIJLfVD?06_3km#X2uP?@0YzNC@J-9fl=d{Vq+a=qhBt-_P_gD5wQa(%l0{+?jy^><6>vdzN|1Cx2(d}#Q#p} zU>B=a;#tDC4mj*_{bH`wgXYrcP#qd345dA+&4KO44S;4$y2@#}Nph~E&vn#4{rBum zSS&R(_onjD$i!aqg++|!Cj<$H1YQm&OYch%7jHH-BK*YWpFWp;=IqF;*c=vrcJa+P zZjuZF=z~KsnH~M0WxjnEX=h>M>Xe)`*9D50)5jJ;rhs*>peG6L&$+@}go9KZP@6xI z)dS7N)kl%Iwakx&6iYryak(uNm@>x?S~P3ljWx|d?cFC3Y4{$__d>R0TvEovdHaX=aH>}r`R&!a{vMGL$5kT0|tQew1T!$hl&0x9=kka|zR?wwcl`74r zjlN1rJ=E>o{z8_p?M_ahGUZN^J7Q*-(epUS=9Ytj=X28-6TTkO6MrSU(Xz5H2GsyYLW$>DI=YhtcL*aRzm~Kl0-Oz9pYcr?pmJ5}{T7kow32F1>u!|IRP7%6V zEsuK_;60QmxixCbU$h5dqKyBWmbRMsx~`@%aDehmeN54kjhpj8&HNQmdp=S*(3k-i zRd15`c0%pimT1PM_4I=2*zEEwJ8`4n82VJXinN&H4 zCo8m+vA3T<}FDO5veodxW&`njl~9r)WYN zj(kKT{cl8%T#>A93>E0z!L(vTV2Mg-STAsvvVeIA&}X27a1!VyYe|`JbW#%e-VrHu zK4z5tAoA2iiVbZd3U{_laq?`ho@Ss(j0%;q zC(n`2m|efvyX|e}rGon+Q@k!~nGPXt=u8|JtoS`=f2>QlP-a7N|7P78zQ^q>Sle?0 zzIFlbp$v=W>LG6<#s9^ek?pWqC#Ez&S<14X4SKTLM1LlDwvSuL3yRLHKo( z{-V@tm$2}l{GL~>zavzIBtmg86GPwjj-|K@Vy|z-C{2j(fU8$)%8S{IV;MEJkZ=Oe zR6RR~RxUMcug@E8VV7#Ka(-@;kYFt9;!T|M5}?*BhDhS}U3wcEy-oz~XFWIcXUHnUm{N9TyOr_MJD{7y0f02K>>% zQ^TWMbL%cFrj}5Lx~0@$+1*R$Z98X$@We245b4ca?B@30)_kSObF(e9;ZcD_&)Jmo zk`QW+5r4)SsUY1UFaU}GF<&W@5y^I`zu99ICjUPK`W^o?W{HH9F(!E?TnQY#-d>$} z$JO^q8cb}5*LFrkTPj?sJDGm<5D!~f&;ZUGs-f9fx<@fn&!LCYj62Z(EFwQ^cv5;w zJ26{X95~>xx5;lNSp3A_r36VuDOj=nde8y(4LT&x^8eu5o^H2w0qRha4XDz${nQ-G zRVZpeqz`{BFeIIqk8mA$19J_}0KF$sUKZ(&dLU!Gw7TQoD zP>|EygXgI4Yk5OVZF7VCuAv!b_5IbfDBb5ly9?#;9+!FSDc<$jR(2S%**>%nP*%EI zR~-wD1O?}oH7lvoGp9_X za6JhY)&JobXgYA~9&3M9^`nRtDlt&Tx4H#jqlhK(=hIC#@YaH?ku0&QgfN=}8W0;+ 
zKRDK;4Mc)J%1`&+4o?)Tzxo}BulGkiyk)dV_eP8V0A=T7;c?xn+uI{s=H~CDaDXikOz(maqwCU;*%j)u7 zx<3u=%HBR+aBh;~>j>Ex{P~S-yYpJmgiIx57L?R`eZtA{_U*to)z{ufae8VAaC0*u z^DdUPdx1!M8R!nr_S!bDVl0*aYfS+PdMp|}nm{s_XACFF{EsT5D%!>F!qx#XjsV88 ztJMw*5BKym4nrAGjUAv;F;(5X6m-{U*6{lJwFZ*ee@<;twV*yK z``2kIDQj;Rrq!I$593%^ZPYI$rgKd@<%LV2M$X~PNW7N9*_S>^r5PPTkhBvfb3>#%mta3gMcZyz62v8?+F zB{k*GEnkoo@eD5PoyT-->2NJidRsk*$V4tn}P)ucK!stH>Xpc}Rx-a1z7jjPA4uU2+re+Rday1_Afyh1(X-`9eu^?6Aw zZw{-vY&A4&-K}lM8#N3~-eL4ji?7CmM5AK8Eb>S7XDFCV?;zqg{(XWl!H_VsG7?bb zKTB&afb>{)QMHDc{~}pxgV)G_E?%tI^QH+zSe3Rx0KM=3spn-TyK~WV3Km1GvXREQ zQj|4;D9-V}&b%h`x-R2fY(B!w3&=ryYOyVe&(1Jl6F%?=9Tlr8JB=A5&>AiwZ325*WFt zFd4ll3t_p|Kazg|qj+7Sw0{x*7;P7kdafLQkRjHiU$9LsaT2zp-nSN>#|RmN;TGp; z8L(Dq@ydhMJ5%iY3}(i_FL1%%rhKgvRU++djRTw`@_w^cfeV)rnTCc1fYe7iz0lid zvHE#i{IG-iT4|R26vgcmL}Sw%rSd!=0L4+t#RCv^e~iE1BfUzzZxk8iA;QYEkEL4Hi+PF3Iu~^Wj!_dEoES6^%J>H! zfLl$;WVk{M5R`h)G2Dj$vFMc@YZz#T+(n2d<5wu0~>j3p3#Y! z(eO6@f7;O+vXC%FBE(EKkFOg?jTH7dF5(2t*AGu;ZF8D?L30mvew<TuqXTnG>{)` zC{?M>L4teu6*k*Z)MB7Pd|GfY#pwN5HO(z+SW3W<7B2$Xp67qLTx77J+Wi%>15? 
zgWmb&np(P0!tUvtdxDU9xIu%}9}YCA&*i)@JO(BAW_8M}tyzHp&RZ!5$e zFqB>HetQYV(_2cT&f*%68Ow@S=k!o#;(8Z{UW>e|zPa}maLViihiqZEBj~QOYts^S zU%Dy)7w6TW17V-e$H^8^+RoJoAHcYl?cnHGZNsLzJ|h?;4&2$9(NgNU?&9DVGKNbo ztVjj1n>2K(K=>iR8n(`;8e%3<>1uiwF0&&fs4bxp`*tQj_f2B7li}J1 zc2*yi(@zB%fCt%~l{4(IBt!JXS)yw2vS{fxCEDDw#gnNp>Mqv0=J~9%U098La`5K) zFye4J0iwSNry}bmuuK(`gLCq~JWw2w!?*kM;AyBjh~m9}Ff{7vO9cWoBi%4rA`P2h zrGjW=vHa>)61)tKJ`Qg7w`eRb?pS4ibeQGoK~njuEB289rLhxLG-nH3C>gdjlEYl{ z=aHXO4px&N4xd@(NY5~8beB6QI^_0kYr;IqQ}GU4g~d7|b)1d$?G!0G|~_y)}-u?1mTO;*uMkOg^pOm6_YQo_}{e za!K#BwU$D;)7~0_mb;z|AXM6x>k}w=3Nah?f2Mho`;fk|WT?j%1D1xd&{EhlihnTd zCp(W80qnpq;_GO8ULvhk-+2^h!+fy*n9KhgR{I@vV@gH19yr_<34%$sQn0St--s); zo?$BzyiIUce+&c064ze<4B+$I!&R0m>YJGFK`9eQTXqV<$jS4pEsswe0#tiVS^x*3 zeI)QhX$f!I>bA8$zVWm6FD`@c$wli#jP6l`ZR6MNYrLMXIi=fChOwwz_mnlOI0_Ql zn#_=H-UdN$w_kWQs#;R1&DkW$$dvvV!ex=(bP%Yx)anqTl43yTMe&HiiF#57?L;d* z3}Gl^VsIp3-48;c@_E_+H&11CSsx+o+_Locyc#1sf(jnkaD{=UQTW1&ULPT8vUjDV=0v}T1#U6i z;Wdt5f&AFRPz%d{ZTgVy&eR5N@D&KCjMCGZ-IGcLrAVxR|TX8R(zLdP0V zk~fr0RHCxOy6ol1r=DzBU9I(J*Y})HEKVQI#0$r05O($)`w7Z{6o%h{t}4CA2TrbJ zz{8Y4Sd|s@asNkyRACUij{_C2y#Rs;D}Tu>^9BQlc=$L}=@-s1{+~tLe*|BuYn-le zSnCYxs)60#|Qpvhc08sdm{ql8v zJ{#rDKK&RW%^GnIbLG`cYtyQtHP77X)R^nroXAVu@icRyIUpoQJUSb6>@pBKCk0dn zhNTf>JYzs7?mL`M4SBOMbdi3kWj2G6LM~yx zQKGJ~WxS3N1T}zSgJ)ytS+!rY&>Z&O>OoymFES=cR2Dgx-{Jr}#~8fT&@hDJIkt2J zxo(YBw)ebi0!V7+%AcMpiZ5*Zfi_Ug&`XaQw0-U2m?E0RGD)E29DG>h5ifyrOT>d4 zNvrZRihps#hPTyhzfwhq-?-#*sms55jv@U~7Q0ri_;*gj*#WE1>Ur1^@mNTNki0@D z|1^?EU**QNddFHnLbl}PH_GRr*e~N@xt8K^^_B%&ck2ydwVSp31Ltt`-lS*f9*M$p zya3nnwe+XC9ZPM$F;08a>_2?TRUp}0<%^$eY1mdC*mASc_vo;!adM2E&gea(h5O|1 zbTolLM42;lb&j5^HrnDN)gDs7NM{*qq%D(uu7rL?T3?`;GLx#5?>n);phlQ1(m+Jq zZcJEiO|SU9bXgX5{iqPvUh4+4yF_iCt#VAT30tU|2dB?9SEw}?uUV7}Gd)_8))sut-g z2I6^?Xr5eQCng4Ry7*w8_F6&>9!*B(iel+WT%--sQ4wc%2LNE!utq z6}!Vjo|SfizDD_QR-=2i+lsU@zZ?C<6~oY$x(?owm_!=hA2?wFKcq__(?p%NYIn-T zbogU*r`z6pdF7Mtb`QtAgP4S4IRn)FTq*Qpe z(e)%2U1^GGfTfOfK_yR zqegHN<&PApmVk_{B+iEK7lEyZxpYDu*}2UU87-#;ERE| 
zuP_wXuS{}aL~lF8cRxEYTIPXQ;~^f5He!pTaII8%oA*5(7E|>4;PLh!JFg&<6X08< zwo$Vl{y$;%GGb*1yS9>q_wN}+$XZ4P#p8JHFC}D-9ogLqUbnZ~pQ6-*7#5Y>-H6D{ zw;HJBr3z0@p`0+1&5fm=8>tW49!hH?Ooj-#cV-Tb(fUA_;8}YeH-K(> zD)c$;*$$%=0lO2gM8T7&CG(GhKp&rhCV7pICE(;o}Z9f&xWx)3Zpv7fGW5! ze{}vvBg#aO@;ov+lP>}^{+VAF%P;%p`Gl@s&F5?5zQfCv-Wk1kw}h~c3rM$LN%adz zll5*fH4e2@@Q4EW7R9_**B_vLB=Uawt3GH8chY)BUOD=|zaNVwEWHMi7 zCn)}D+-rYW#}u4x&W%U>AY#up{4f@!wZcgw)N*b_(3^t(#BB78;ciqO6bLdBT1OXxgqFk6`P}dVOXe@yLWzzeC+`IWhx#UsnfgU2^(pbzaBL zejp2~5S*QL9mc!eTxH#bCJ3lbOW2ICne&wI87#i1iK<*NzjQ4to1JhwR=^_6PHIdt z%WUaJp<50X<+;-8%^tihRw3T^5Pi8Nt#7H8%3 zkIn4zoE!5%p*e1&Ir}$Zx?01x&|(gC-Ve$GC64+b50!@1ROEasnDb5Ydo#B|`@|x& zlYGUi1tq%U%tqfUj~*D7o|OO+R0nw>mz936Q(67;rX4u@mbPPxnqOh@%_Sjk6PGK5@|)fpu-cg_U|BtA-i*07!@nR< z_3EKE1Ly+qa|V$~2$p#)fgof83U7eEu$+m>%CdC-koxRswuipGRB&z6q`9HB%Gl<(X+AU>G!&dK67ip-I}b%lzn@YBp^We> zdbIVFgnIbdyrqY0n1YRYX{k*F+)o*?IG}bO9L?4cYfDQwa;4KuL&2HU~)^p6E z0sa0qjwahge!neiG=bC)z_-jzx1y@bZ{iDPuY^mpQGBksaKKyXFl=LxFbUw4<~ji| z%EA#`vne+|9~>OjCkY<@Ux>YHx+UvjkDqX@{gNy^J8YtD8{)I|5p)SsVbXi}y%SfG z`F-i~XcN3vd87w5&GB0d9mzejrW}=}9pVwUatQ^TLc;5NS-od@N|$FV_~X4TL|sjP)98h8_Cvq zA_H~UBu+uyrX49iBhff9P2h|gYi=UR_>)k__vLkmOxj!J+qix-?^LaG+S!OMw(rIz zhLK8a`lBL$W1gnhK|rcN;`I3~9L9)mKr4PtxcRu$ibXi4TZ^TyLy{CY!TFkgd!~PJ zYdouBTxphi$yJ$X8LgBO;G^mAh$92EC)&moU3{d-QaNbS$1FuK@)9D^siwC|8xcn3 zo2<)C3qiUnb%JanPa9gP+YitCHrVZ_U6c^JRH^^|vzd1V{=ocrGMVYWB-UKTf5!#C z3H9{2QZ3iMpR1S+0jq_BkhbsiJ3}0{st4P+2e*FtJ2-1i>~P#Sx=fzFf#Q~h%3rT{ z7LK1X)3b~$fT_8zY#$~>=O;C)yGy|D4>Osp6VHqaWYDe1=_ zj?E#BYzx{YO_vcHh*YirZ+oc%qCgwrm8#V6AfF-I#dW(H{S4{{2=7HKP#$f@2xx;l zMu`z$-~)rndnE*if$Ha?{L}y@+{$&#jo`mwUi8<^! 
zE<4xwj=tzEnFsqrn6tfZb!i|wv8dl!Y*=i9%&gwF`!6CeIK%> z%O0uoCMy>D2f4|rD}2}LUY3m{XoU7d{JGg(?+NF=f!kU2{YKG4PU#|B2y&>67xNEw zL;msw0ai8P4mglxTXuR9jdzHQkAh8Q4x^lxcr_-R!hZ#$K+|_kxlzWR0Ds_SLGSSce+P|K z!t|4VR19M1RxVOh4>?14Ev88_*LabsphH__LH6wLfOoPKyscKS=5JB*^+f;Xg@Y1- z&of9GmYuS)lYz~W!t6u-4%{VH2`&`rjDfoNJ>o=hqkQYvMb`%)2eyZt`k9!~qQcSw zPaG+?c|zc0AWsShU}*)i<8G6YlG?g>$sqfKo@5)mKtqs<_?=~t{nQo^>-FqY79^eJ zg)@zA2`G8Z3ubkaaa(gd5sqE4(02*xe2r0lm(qzYIJ|-w@H&_sv1)%{vjBrNEB@yL z*ae0M4!eM1l4@3Jt(*mB-e}pMi8}L~)S*CHKz2YAzX>qD zh4SJm`1E?=Z=Vb7G$Zlp6U}Yh^{}3_dzv#^Sh=DcXCSY^3?;k=e7i`CR}I9rH6U z%#(wPOu@AF*lMsLs7D*sXm3%1+3-He`PcK_UmVWj}Kvny8Ti$T-6m zDbDX}^OInIt|c|J)SOpVUu*2PgcrZ9!^xk`HqS(;&yqgpRT$S=Wl>Ym$45 zLGqba*qqsi3JTHq$IlJRnAKAx0+IF=)sToUWYjo^;eDd{>9psH+~azD3T~64XPowm zUd~U|JG1+u=h2;6eLS>kDZPj9!OfV!MO^;nLIRBHr_6kvv@AeTp!t1}`=?5Q!2Go@ zYNUl_btHFSa0s`4mu5*zu3&`DA-)2{480ng{nd$_ECW`iPuAu3Ql^=6tMQssl9v00 zdYsdXqD9aOh)66i_0fih2!|~1g;3HUnEw^a_)h(SFhoCtus-087q*Q8Pdz7Rr#cwo z+LCT_tZ3kd4&8zo|X4Hix-iBBK&R`WoqQNJ6yt6JCJbJ3J$?M`&o|b^HSb^wyDd57; zOOEqaDrx*p45RqXb(uMH33Cp-54S*SLqfOM)|JCU&R4xWSb(W00vNTJKp1M;GW6H{ z?W(kTsPhGED*n88&`E7cHoSx|(4pd0W)Yy9loc0;9O&c=AN?8ILIK4X558wFz~GgE z{kD1Y(jrym?uoERB*lT|SuOf7Bu=>4l}~EvZGMM5*pbW&0lteO#r^*@H9uufG{HYh z$(KI}jGo;8CCuXF5w@aO)vH8kji&?7zgl?!Y~k(K5z^IC{s7ryIrckjybl>tMbQnF zx^Z#h&I-Br37|=*%e%+>1ixGoo;6DDap}>|%V4mt$=c5HpZ1}UhjONF_Oc=gcJ&!m zYg!YEwJTTTpx0&dg0@<%noL&U%COqm;vR5m4Rw>j6I?D(oS_;97;(v~5cz^S|(k&)PpNLFdsIl9nvQuqapsLMCTfzHqvvi0onp-T<}MoiM!|5E(0)>`N> zm?^B5W(^cd{OBtzBKLOyrc&hY1h+vA#h}HPBae>YYqR2^?a5OnNiw=O^ZsS@7Uq!{US%K$ei&Kx>4ia(SV|Z1AE_ zsq`y0Iu_@~ny`^Se(q@iK(H?nachEGUy>F2y&E{C7YY$Rb=%D z|37}Ncv6FEbev&Gl1?_*ug}n}xo`+HE0-FZUm6D48AiS0Y&Ab&8mya=543<^ODh!~ zz}r?2z9wMrN2yEFhk@VJp=r@|EFbA}1X&aQfepgv3rpWCug8%{al&*!$*>7ir*E#5 zk}#G!z^Mgl@U{=Itx{qL=Xu=4yi5}WRzlIK7*jlG5b&AMrWfxT$?7EfAxu6Pt{~$C zBQj*fkkcd*5tUje4&-sB-WYzRc(d_LOW!)XVdRM>dA@))PNzi#m^7Isd@<9R$y`ET z1;$!K6!PCfcuoc9A&+?mB`KlNWf>bsOV@|Ahu!E06uDP=+(ZGz&HqKActxheKv zAjVR7BO!}UWF4UIC9qkRMh2#=BsFyNZ`_xs(F-vcej 
z-I`a&6uzV=8hZaD78Yb=WooY|HmI3&n?Fg8B|7(=^wd^8*MEuLezkoP8BK9R5(Ppf zIcAonk{{dCudv2*JvNBS0K3%UpU$387TCs~?%Hn2B}0*FLKekH}AG-FbJ=vlcRyAK1H%zq`H$)2LOQ}@V*8E5ggq;XC zwI@oC%i0IRs%_J%ABt`9YTH}l2l-ErXWRRHm2E}*n?2rRVv?cmn*L}wJCtP>fKAaL zXxAq$*B@w4Q5fG7GWK>=gL+^$Cn9^gLqr2@n6d}(#km|}o z0?F;;pRUGO&t3M!#o?ZYrtEV2<3*UXhQ*qteb;cy-v=+`2g3~=h-8NJ1I9@89iv%m zj3p_`ZD5AD(EShzWLCkSnutf15nJKvNfjV@mM2Xl#doI~)?`X)2LE~TGO0T;aU!*n zXQus;IujP;nhz1=oC^Ey{5D8}^UW@1z+w{JtRPr1>ZFGa(YU7gP4n(K=!w{X_Twom zynVPKDgIUIyZSkH(U-HBWeTlIA-v&2hZve9$)W9ueBPma!ps3Z`Y^F65jPVkOKN(B|xWQaGo?mtxcYRLvL?Ey8=;Ks*fu7aGrk*z4m3 z?#u<|C?+=`{a(}eRbb{VRUe8poI4CX~#!+t|4j9}Ub`pT=;FD^l6_Hx?!wS2U( zM1!HF1-y6h06)F;9hBUQe3xAI)-HU|Id&{MB=G)C-;crj__MVqzwIWK8+5L{B3!Yx zO8MRgmxydo^g-n9fH}0f@!YuenMLIR)+d`UxKxY;7!F8VAAxZ z53bGCNvZeSo>VyQ*cIvX;Dq%CgH7BqK{0x28w2-Jk4`Q ztql+vkF`p@&P(5X1AXf?TL)GtIU;RHB_ua#+%Pk`Hocc-5;cOVrhZHj%0uo!|LWy( zC7?gNpd#ODakIIJsoeT<4P-~Iocu8;&Pc%rPZ<@E3n?}7*pk}9{#Y}TI`TbZl59rj z(eDwD8|e&LvvP(#iR#VCD_;x_S?LY;=72^pMY|IZb0zeXCapuo{;1qFFKh;hWb$M0 zaB;H<*`zi-GLW_UQFBRtJs!63y)X`}6c_criEsSDjY&n{MKYhmW_=kftBh7Lxw|QZk=0{DwDra|IZrd8>V*axfIPb_E$~fdMZ@DMx3?mcp zJlMz{oFDp?A0vYQB%v#(xNN2it5!X!I|>%|&DR639fI;(u1ilHZs41*eu4Q&ybkt1 ze1b9@jQvvQ&x|aK^%Kl7JDI936+egaDP3ugN!;n{!Z0Otv1aHG7y7jH3bLamgDm$n zbGl6bJ(^HTLuZ)Y&lbSL6qu72fXP=r^c`l9&64o+j)N~|C9z=`U}}c#?6xk}lTj3J zQD=yp0)e$*Od}x9AH5WaF#+0@WW|yPwQJ3JIZC6t)L9m`xQt?gaDZNv|FZL|f9g0n ze3O}tFrr<=KLs#Nrzc_a^U*AbPA?m%&JwlI_n0^O%0C zsfcLNnD*=KKTJc^T)GOHtqlL=of2<$i7U}8qHEq2wAdT><+aSge2y^t$kLFMN7@Y3 zy`9oM+zPMwg46u-Biu>PPoe{bbMBDQi7w*hiX)Aq$zt(#oaQyj`iuVPUE-PQrzKkq z!gKvj0!;Et>HDIE&f3WwdxcVVcFm^Cb{E{>0>N5p(O9`cRCWb80=E`wJWiFh5&Y9S zldgcr9$7sn*?qkmJ@MAo0cF@sIV`6+Q_ihtKd4oHY>uj@lb=D6Wk|;a){J=hI5p9W z?ZS)y1$)LvXpY%CJ#IjNy(_-7id}ZSeMw4^VAFi8QVZdfJ7R8P8j)d3g+O_(=;}-U zM8X)Ro6pM6g=B&=Bql&@4}04%kAQ3(Uu=QVfK3#Z`b)#nn4+rf^&OS%qDu=|FJVQe zU6MmFr|u{*2QFFlhVU(5aab-i;nx8ukFi@%0)g&AV>o$c@$#XHyIC5LDE9Bcf(fpiakL(9FUa-OizN*KL_NuoiuSR4>zc0@R2LtbbD 
zwN`gj98ZkjihymLqqlkN!kd7+)2ul$Cnru4C>AB!{iG6=eeA;CpjAu(=UunVdiY&T zP=#s}_6I%Cd=!aAwY3DH*BP_W4v3(gE;Qzh;v#wGXqyDDebNSe>js1hdVmj{U$dSY zZGd2KYGptcEGZT{QI?st5f%(@)7a?WVQA2U@LY2W+0nMzIRTxfU%p$n>g4s0CpfWy zBnEUz+CN_j%|l)f#IA)sfG$XVn0}BClz18o*6^pYA>%x-eRi1?jdOacOpfI7{3Lx! z8Ito8%0=E9KhQexi!Z}FH|Y|IUY8uH*O|w%rnr9Uh#L+KPi&sBPQ|;=6tVFugYG+} zrF8vXHyakyG60VL;NUh+4P?n^6+v%i^ABg zKVGtTcAxDQAKIgzc$tcaI<$)!B@Mer1|$=w@Q37Y)>&ANmyupgI=y+kqrg zGX!>1S!zjh8pJbn&ytj+g#`H@Ax*Rq6uob8PGvd>&mV*fZ_&fMxUfq!>lCdO%ND>mZxarvJ3?LSKICCWR*hXPFfWE+=6>wp z?JKKReMXL-(2|!zz(2bYy7TQmxL)PzT zx71n%lQ2a(bjzQ;8M=-Ty<9brHBZ86qQrpLva>2{STVJ58GOuP!6JD&^%~lt?jy?Hq8#K6^Z{%<$ZFqybwefs zalP^gI2uwBa$@t`-j|8Pi<~=(VfQ=Ucf1%Rw~J}8Gk=BwMLhz>ue@nrgpSM&X6?Nh zg&fZ0X+8z6F7K8CsZB(~^_g3v+5sl**~6HvVP+w6;iKmTiF zW`*9Dsgk&FN)o7Cvvk^gYyF(ll%gT~aNvESMwzV=p91j$5A9N?yb7= zU|q26I^n?=0tJnvwGbxr&hLT57q{?Vk$}-z4F}36GC^;_rFiWFy~zKIlt}P+0O*+e zx_E8xEG|9&iNVb6sbQEHwh_*cFiAdcANav8JYMTV5h2Chm)t~Y#V_oEBy(;8M=#M# z-9Ltb0rqxpa9M8Ebd@x3PmqY=$0oq4NN6p|HzI|xeE_4yH4Dj>6U#{QR1C)?h)&&i z%!eoiuKwkkQWDx&C&30Fi;MAC5#4CP@~GY2R%WBu zhR$*m0e^oQ)TSsbaB?JryF)k%k~e07{e|wGnJISid^Z4ed0;^4$BplSz|g-?w$yEU zN9vGguCYItt+q?t69J$EN2IMcyVKjFm@y_{!3^z)ou`x47s9_nGI3oFpSL z@Lp>ye$={0;ghHo{bMm`RVy=&DZv8=QX@3b|3&O4 z)k>S!99q$kRn!?m$L~?|$^u=JPZ1$N0#*f(tPuZ^2JrF9zVg#_2{bd?W}uWmzQj@WV}K)B z%|#?6({iCvl~Vp3dU@&C&SqrDtBVO}_&b?zxNIs5<0Ab4+Bc%RkwGO=K&!RDdx!_v zxCYxOQnKZc_9CnJm-%hxDapP?EMBawi=g0X79KMI5hSyQ#l%>DU|#ym0QLVsV4G}p z;}UaJ3;d=JEa+$$xuV@-hSKxlayTOnrH-#|CzaMAVQ@p@v~D8YrB{;wBe|;{YTm{C zhK;X|<__3091Cu}{QQW1u2K#OGAjM7Nw5v@oktC7+%<(}pebe=gLLDOe~97=#o7a5 zL&W0t;DbG@#BZJF?iYSnnlk)0>AK(wv33OOO$3qm#+4uf{H??j8Fs9s8g3$R+d=n1 zb*S#3i>ojgq@h>Jn3R@M_Y9w1GS8~s)Masmeseu-LkyrB3WcPcw_7#%ERv3R^5}K` z?YEsB?vteNFh()UTSSKv=c8q)oi^0jIsydwjE%2YC`fwTGF&l+H*IuD&w^=yS{47D zOdwC2D`xD5zXc&FjJx47;yUh-2a8^EDSwIoRsJJN%!~=SJ1OF*?uv1Fn;`*55kr6A zmrYbrKTWEVD!+Rt20O*xL*LO-wC?<2o0%nOMY7Tadi}pmCsY76F*GpFx9iKQ%wzMc zc{8uA@IAkJs%s9OpeTe-*f?Fl$+9yp&F0K?lij}89=3#5$6730P=wn|Uu-eR&sZ(X 
zBj3GcFxDWn-Y?==pz;|X|0eb>jB9~h=3{%+NQGh7t%VrtM8iFL!zwdSLAzL5;6vbFTKnquURwvi4 zZ&qWL5_@b)XdHAGM4B$h;Y@ppv=W=-aZ+Isf1SmTzIcniJd#}B#_>dey#3dfT4Q=7 zaolOOdJmaC=Ugl74G{qXqvoWKJkbCF8h}4c7Y?%zF%;fMmq|3>>vUB4AJ8notxd$| ziNyp$5vUwN@6Y+O7?Ur{@5nS(Vdiy))h~ajz8zwYy!m@!)ITQpeWFZDDs4;nJELkg zHE_NF$~|AH4pmnOYSb5JS+{{HxD!IpKuc`*b!#*J4MNcr$wD7}>hEL+2&uTKqpk6IUn5ZSEK2*-Cx{X)j7^S%S zPy^KT0gW)**B;59vb-Yyo-;)lXHfOT#v78?i7@_ zT2Q9!ZTkt^#NI*%;*4VVyb(*9+oI(LpKhe)UXpN{;I%r=n$zM?ss(eX2}WzkMa=lJ z6UA+mdQt@?2Im1-6ZNK!w9adIox*=P;FrU_8uEXW)SmcJAaEdNRkryzq|J@KxYhVT z8=5^xNQu{PCqNtPHy?BSW(Ookg&^&-n2r7+a(?YI+Ah&=d+=I+b9$P-GdFZb&R_<# z5ewZMA7#G~18K~$0Rlm4sGf;l0 z$0#s>Qo_&DZXwX$+EEc#xhW#yHqSp&jMe8SPQ=86rlWd&yiY&V3dgGgJlXW!T%k9ar_f5!*zHa81$hO$jS(^yl`cb@jF5{c zQW2gGO?m{q{W@VDX42Scp&1doY4=5RUMPPqk6GeD zl#$(4b58VMm3m3loI!Vilrx#C9?vGrGB2jJhp|eE`Fuub4pX5oW1<~m!x|ZF^=zc2A z8SI$||HVI^`-jlP76MSTj@bH75QTH^WJ1YLFxNi&$@!3Z-5lN7g6ewU3KV9}Byp#F-@^cpGcdB}>$ zSb-)gV8gmPe1-?NT@@2R0O3tJN;;Gp4|YldtYDjs>NQ47SVL@kr<<+JTCb5)7Ww3GXZ3NbN)nd?S#Fx;AhE-)9NR`js^|Z*6hT;Nze#v89dJ}IxreV zRP!7jEHH^W`ntYUPbr{5GvC)Uj@0TzPT`QP{{7eqS9aT1*XFi2Yy3?fttifn`7vU( z4xfk6d{D_DH?3`11@NLSPYO@R8A!reZ^M2?F49I!xjB3CgwOUT>Ejh`y&j%uIZp`x zAT<8sww)8PRtXojREEtx{FWsEn?btGH=%Om0D6TvaYOR>dPtH4lo#QgT8z68!Jv01;6`GPZVpx5B@ z@Tv1F7fULjcJ3SH`4j0J6Z|g=uLraV?YIIL-+M9|R=CpfYcb^8LH-wfC1>EGf7=CS zDao!CW?67>Du$>MY|@6eUpubl z84BjNM9urG>{tV{Md6XAL9tu`Xx8--+-_nOzch#tkLXT&GbO!2uc-`ct78nXg5P1Jlos+`x4gFupXS++ zdk~h=gCl)UiHM6)f2l<~RDFJH{x?LobH1>yF0^f!G4=4-YPvDX7<_UW%F-8&T{j>}jZ9=kIi3TP`$0$?HCU%? 
zbLsGG-xW*jJ>UOFgshcl8vHNtfG7SnFW6w@PnqU;w=t2Kgs~z@9h2}{u5OJuVb~?;!zH#7bw6cSKMw8<1H3rH<*wg@Fx$Xu3Q`AzOEV#1}0QTXFu& zt}>!eYR!}j%qL>lpz zmt1PZ?c^pIrk32A_HBR~^bePBjc$lX;VhJy3Z8BeM4eri;jw+;6N1RcQ7feH5Jka5 z1{21LjMFTiVgzqo$Bvs?c}oG&VT#P@)nhN4cDISm>u|Dit$GHgC@w0^V4@0K%5_Mj zn}I~Y9ighz-2KsIYw|Zh1;TGqHV#!GASnC&W&8Y)o8r}37oC0lBAehgqbivsUKE|F zlpR!*Ax{LJFl){vD@@xA$p~lLs;X^E&SjJP2iFI4@DzvRrMU_7H=mb<_~s+ElUm$y z8(=gnG1T2r$bQT9+!>=)`Op<8bGFm;vJN-yQ4XCq^ypm%R59AN4X%%~tRDX^LH0qQ zJE~6VB#|LsNJ$lh7?C(7;5t^<{i3uw7w1<8O|)APy9m#NvB_ywF^dGfb-+Whx(itv zZZ*f@oZ5mQ!ZpaW3A?AcEa7d)`>SP$z0R!7fCdWR5%i!~`6Wm(ewtQHr*Aq2jRMVS zBw2dC1l6*UN=IO)wFn>h!Buj(G<=hcPY+{k{@|l{An)Bo^<)dY5XuhG$9j!Eva%Sg zf9^-RDNxWwId2-b@a6Gw#-vK&z9~MXM$(9ZZH60@&|;b%#Tqo2<|8UMX2MHk7H`J& z^Nb3`hkVIj2JpEF(FHCh>$6IdyyVGlQhfh&+*L0G%)F zp{Oq%T-I`s1*U)ksok1-atD+ZYvTIR5xvW^vVc@ zcCGR!wlHni>t;8^Cl%ryE2H2sM7g*>Q+_>B^Cq!sB6lXADFdzJKL1%#(sB_A z8!06PpNM2u7s}P9*PgWqLa1N!n|jmU>8@_5*m@7cS;Sv$M+3zN&HE~`BsYfLVf2`R zLRMKJ&UDn0dANI|srOWWYdD_`$@ujkAqsOcz0#cX~chO)ZK=0t6GNo4J!_x>W50%E}A^rlIwN zcasy{iPc-avN-hM0-%NWYxSZip-iq*=gO~Hu2Uh>rpnVS{CAUdv9`GJo!n}mtJza; zDh1L()O;Yn19l#eJ_%tugu}u$p|Yv}Cy+I|SD&dopj;L@h9-_&gmU(FVYQA53By30>}J%|4=23A*lOW4D)Z<; zFo4q`XKDl4BJ@pA-NDauxHc#cWf93JSTO;^fKu_9$6cOZ;MsmxLOaVNa_QV4XUPBn z-i=-S5gR>V-h5mGWEDuj*X9X2WHPJsMwDD0SOFYFo#7MizN>lmsJk~g`&Hchelq?? 
zv||K7H?UoQ3Q`wdN%Ztq&~%P5c&xY_hxjX>`3Hqt#@SQDq`XL?4G?h*HcT$E`4LW) zWkjZpq*_>=3A9;~wQHWcE&2MQg zJO8+Mi`bBT)ju&%C3HzGNCxu|!wq=07uGiIrqrQBj7*EMSu!{%m-b@6cXI04Q8UOd z_Z@GG9_)L%Lu63s^nItF(z5UXbU6tKd8?y$*!naWbkLihWd9)5_uZQ+GHZmvEw3KINZn)ualUZa3bK$TlO5n{AiTc%6E#RN>i4{32{41lXw;sR zilQ2|6V7?9W7olU-Bo>}I6l&_qzarDd!$RzkpVc28O9hQB$2N*cr|RZ%0q)AA1h_3 zHQ?}o`BYJG@TL@KT`kJPaqOI2VKBn8Uu$2lWzo%W=-&vuoR&w~E&bQ!NF8k_dK?lc zyxM#hXXnyCxDcvFe>0_m8PP1vTdI1YC97#ISlFzN+_}=8F?~dZQ%aEeG=M$tpD6yR zXYM8d#?7{N?dFy!nbMmqchTc9kv8g?yV~6pTGVzG@qTfp2=1BVRk96Y9*sWa`!zV% z@-*7GqtECYR%#Fb*B@}y-I`uVzKM!C0x(7|LcMSleL_1N#?C92Yb;C@o(H>(iw4-2 zIL@7GGb2ZxTUoNoXIlk*CXYBJboV{gJSHs_87!>$q)6?|7=lc$vnIKzj1AFU!F-f% zFSblqIZtvWf6XC$UY6@O3K5WQ9)e$Z(=Q_Se7bRgzLLWy-=bJ(r7Pqz-!XsuP+rPL zt=MLa*DbI!7TSjjunZSQg%a-ru&TYB^#vECjTw*3<;XAol-!T*SSAbw;7pTDlMPdq zeF>zMxc{s~$Os(Ae+`hR6lS|}GJ$^)j;`RejaF+Ve!e#5fC<#E0qrTnbgg#mK5ExA z=aABb?<0@mg0olT-u_KJob0`OW`!Mh$2Yc!fllU;B~dq$Jmf7qzhTPuL%wJbd-7%bhpzaP3J zK{3ZI61EuGbawL($AfG6PM0c{;GDBNWs!~B=zb3lxpwl51cbC@cDBv08ZQ472{6># zOw4n6F>@huyWCUn8ZyDyTiR{*0m&*inHhPPkynwMtiQxK* zZ4QMzlb3^CnL|kK1GXJk9CT9D{5K(M{zVj#DjkGCG92Cy_xm-)M`83KHTM;HwdKC{ z8Y!Z?z9IPYfCtr~8Neq&QTtnXa%H3rqP&iPJs}FRx>N+=&c9hHZ%j zzci{CK>$VIQgJG$GwBz>&2N%ACVCqzQS>h(i<(gYiX5w9hKgM=8;wl7f99E?Q(tCA^%=DWpm~_Xd1V2X=ug_T=X%(B-NBr~e>EMP<~d zBXmarBqXGB2}FZQCfyi_XOR1gzkCZUP>B+XvN`}=#2S9C(j%!@jprf~sT~qMXUel(N~<48l57>2=v-9xvLuKMvQe{tx@?+XCY2*ZW+83hLE)g;I|@&5H*U`0k9pyl8XH5K z(AwXq<6y!B4Y#dPn9|K@aNR&qNA(uzh8Awe`le%SAyM3*oMD=3K#-T3`6){`1+uJS z@5M*5`A|xFqfAT8TzM#}l8DpEjY@;OfS0XkXLsDi0HD;fx4!nV#H591LEN$&a)N;I zx`w7fh2oO|8cs6O#iOBA1nrGwJu{Jy?Z_UAhWQS!R0q9{0Z{kgP$Q26^tG<5dTaa- zcwoj-@nAc759qv>Sau`gWdly7BFKS;L^QP;vP+45eXj}Cmdt~0Zq0|bM!y=fPBM_X zgI^Xr??Mt8xPnjh>-^nRGsl+$%`(@!L`X>}GpnUFzz{vfjK1lDC2s9oe`o1Q&OJM6ezlF*^KI`=-4-8b=3$9iU%O;L`x!TG99!xMlFKKLXb;LaK` z4%<#;)>TL(@0lcI)6NrzG=X@$+&I1uGp-|$v!dPfi)@WXKwbXeaJl|Dx9dq2p`s5Mm zAaK1gj_I=ZVx14(yeYKMTSxG=A?S$bZjZNg45UB9?}7wwh*g$W_n#e&KTJNX9*}~= 
zP?%DmbTw$=Axb+GGRGyAj^zvBbI5#kH{(NT45M`8(zYVBZi(<{3T1G}Y8#_V>V|mz zbTMP1v`XA%y28$KU-|53uhX$(@b=t+lkBv@hVp`lO;&dktN|bF5U+iH{YY)@9ycld zRYgzd)&m{m>xJjXSm$KxwybQ|deI<(JUN7qz6H5;C-MtGf5PuzCa-IrHEScSHQvNe zt^OfW;sHfWq>Zl?S#&u?g|($;X+IV_3`g2xohd+7GlbyB(vNwv;+*?Co%0=`(d#;4 zk(D4IR*Imt!fl4^xjm1$hrjnPt2L2wm081wZtv9*OU)hust)x zi7SaAi3KT~VA;)P20VP;-m^wSN5$EPc&tL;g^#9W#}tNywofCUF)K+SKs7lxTj0P5 z8_c+ES!G{~J938(~Bf4W60bffv zPGo)6T+AgGpU0t5kb!z9m6 zHodtj;p)_jJ(>^&E27{me0Lp_YiNQWS95#d1bcE!7vD#YhVC=ah_nr2_ z+rpmQ5J1@OYxJ=q#yH}dXzad(Of#vW@C5J)hpX{alP)$gs( zKNd=&3>zmY8`03u$*0vUN$MqFgt>x!=Gg_5g{&zMJ3Lxb;20GCU+-vMdQsUe|8uA&KA5 zquG~xDJICOl_oS8f5X&6U4*4Ue)}JV#lQ3&$DjB|5)RrsKa^}Kj!z=hSYoQ$JgKG9 z@WK<)GZ$~q6{eYWbsTK7>Yq1;gBo_rO4x})Gk-aegw3*d-8w1`f!l*Ub?fneiX>)! zH*#CCn1L)L)snWC-Kw^lPyn<$#nq)loF@nb1rl&mRbT5n9#U44Nj-|Cq{SdPQ4L8y z<%;Ux6Wtkz-F5XNW0ez@eNS9=q@9F>@6L$&=jwgHyFI{!%DGxn0}RU6-gw9qr_G8_ zr}lTU`{q%+!zGr_IU%6bjMQ@i01Ls6F7vMI#blBy zu6w~Ll5x`1_K-=$`TRY9Ur81*d<`PAX%I^`H6_5-0kCMqmB^xA4+ng46Yr z0}OEr!P+GMVNbo81Sbwvk#&Gh5B8AT`J0 z$0^Am07XE$znHp=of4JA?S9(aYo){#m>bIp7V6@qkWN8N z*1Cv16a6$H(HnqgW%(v=pcgY-^)R|@s{jA2zOKG#DR?{l|7GkWEy*ua7u2+)&L@!G zexU$ZO!nFFtt^wDLYG7a)+zhoE7yB+x&WUa=AT1(ceJ&dLOr=hOGpE?Qyu%2XAt|G zs$9l6yDq@h>TINmXqjRVxz|OiqAY5d@u)j>$=AI8sa7j)UMOz`NRbVqbT@wRJii;F zKGZnpM`7B$irjyUh~zx_gcx z5lM_ykOwXbn8;bXS-qh|*{HGa8F?8t4?Nc)C|+?ORJYl8S@jp^#P!T$qK*8N#`J?!~mm$;dw- zu=`-HVvD5$*-B5qpk*Efq{JY)Fs5qcLpU#HBj30b0dBR>p~}U>J#6D@i(WYFvW=jh z!k}(7m>c$cQvJyXWFH^(9%44^E+wr5tFL_49`?u~I~4+U+%PJQMheL-lY7CA@n3y; zeclVPr!5QbLt%lZ?A(?sFOm$cKS576QM2U-AoR6;76c!ld(lyY*WVKm+CysNy+Ngt z0Ek9ea!$^F2&V!c2dD~xt)DDW~}PXPH+~ceS}K>SeO9pkbN6m{)Daj)2n{PR=@{l zqh2-OX;>RU&}%-aTk7|vlFPJHMml1^wn-hRd*x>#EFgq^ejI(wiZ5$IOf15U;67$I zYcQIqyEKZ)L;wI+%@?z&aG`%1mbWt;Gq%<+y!Ib@p}lsLqI_wy41lnE@<9@(a;^=J z^4v)3S+`0iwz}iufsOn7E49E4k#UU0sya1Stl!-{W_iH@-Z)xc-LYPX!3YRJ9#)p| ztx~6q-lLmHoEpsPAzf@kU_z(iDzJ z$!v8U$4$nb6m<)xLukj}av zo$TM^r6j)f{!Bs)@Nt8&ckxRu!E`-paak*zX!0)Km}s9E&aeoItG#&v7(al7YADLfTIN?m+VW`0m`Uq9f0So+PwK1<#PFgHip9wzhuj 
zv*nqvbBQC0`7!&q1!kx9dlYdnc|~|gik^78o@WzJ zMlyE17D^=C7copdE*Qm|SZog;!$gkGilw3JbwQyCWoo{;<*~)KI@YNV5wn75II7E4 z`uSp!5MT1>Gf&q=LmIcLTu87f_U-JuW^O1%mLfF_yp(z*X#`lY=yJtlpghNu!?7?M zw);ofe9eY8hzEZsZ{D)%v-$_yDtem}>z38+F;JI;i;tBKw+-G457|TnGXjZ}pme;+ z%xq+EC*QecPQCLSN4COgCx3{Eygv$9r>R^nnUL0J_di3#+>P!kpz(~-IjDEg!S<%v zr0_9M9=UF08SgX%JwlIqp@_k19-pj$Nf1o|&s^C4L~vcq0u#uLZe;W_np}~Aulw?Y zVOaU@Xl7>VIc&#%?+B_(n-1jW5kE9rKVS)PL|1*lEjb!jcJ(WniYKt6?Od@y%aNOw zXU&Uiz)f3N+sms$#5B&77v!;1A-!65++L$s0w~4EYsnW_bnR@lZfEi(xgz|FEIwA2 zr=f=!C4b^pM(z50D^nA1XZu=QZog^FxXiv9qpL;$%n$TMUNu#&05KfNy1=SJHCSX(W256mDBYig8E>`z# zM}7GpQ}2%iX6Uc?FKXq6P><&qNO^j3f1@1}S#dhM(*OcsIU3RvE2O;dR{=arUdw!7 zE|3csEi*4_<-1*_I`LLC59X3x?3?vEmJ}s&+ zt!GP6baGGM5a1P|F0Qnr?xlf6L8o+~gtli@U!5pq*YrLAQ#Td3{Qoh8E#;ZNsC`fI zl5AW!qKqz73YsZ4&1O4~C7Pw!ZF_FV^}|)M^X%==7VZ_PVg1xrTOp6Jd(ixk=&dyQ z(I19F{@ovK_@qS`oCQHpXiLd2U41Y#BU?k{OO5Xc@Fs{X0}0J>o|4pM0kBnIe!M{+I^iqte$&d{aL~JD;Md_U}7lD*O<&- z4g;Py@}CyELn3sS?xKM)+%y4GRAeD4DVJ`|{!W13E_8$ICo+K{u=N3L>Q1#u`r_BE zxj54=Vbx8$R6#|lk9h(sz??n-2g(_=?AOU4He}+{D6(}T0GJ6SJRCBZ8A)dn(1>tS ziH*n!ga7{+SREHr`*cdS%fp9aa;2(x7}xtO4Sjkm(1Qo*>FKvH0j729xXPu&BU4^1 zJfgbkR6SQ2Ex%QLeR6Es9|oYsZCrbOhw;RxbiAEV;1773Ef-3RPJSmzdn#BVdkDaZ z$n&4xIIB`=OG8!mdl4>^Qr~(h7Sw|DS?;!*GNDWfXyHjC2(}#N2`{sHg)&09j&)^s zYwZr)WKO?ndwpT54q19Wr(|*Ekm0&vc4=+Axg8pII`;N#92Jm}4xEgZa)u4mZrvsM zo{wR_wIkk%G{OGV)UVfC-KX!be1{XJ&Bc^RemJ`MNlNg)6SK5LUp0+2k=W9UTf`*K zSG%Eh_aTB&8HKKNZ*BVrlm)OtbvR=x0nYjgU1Gttry6yA9<&F`{mjGD0Ee(#aQtL& zQ6H4;@KO0#(Yu$2=S|ZNTyN9fm&)#iIvZeiKX1sAh^f-8gi2gp6u45iS6RF1-2I6U zp=?XQ-({v#agtwSzXnhI=j{XtR9c}dS!5XH$!jAG-iKKEy@hOUwz-Vb3f%8r; z&k6Ui0375tUap>(^AE>SU>R9DzOrVv5D?8UWphuz=!qB{^wSR|sJN*UV{?JP!Nh-= zDb(}_ctypRqPKuInF+qv1)NfGTTEAh#jHlDtg4Q~x>xHBmNED%hgFZYe`%u1zu19&)I+JJ zBVJf3@_8o$r5fAWgC1u(U1jUs-5pO3F3NU%QL=t#ZoY6$tZp}Xs;U0Al|<^m4|R*( zf+Uqkl7@}9bAr!s_o3|^S8qOtK5gzMoi%n_P0z%8$hWD%gZmcGB6N~dg(t_j`yxwB zIqh4NjeL`Z$bJyh8*irH;!3~H#a+e7ufQAVcEGO#D2F~x5RnI6rB&XO{+P@wTcb`s zyoF2*ZjDzB#;_wjB`abwlwzA64tw`)Pf|)EZOibhcR04JnE+d>$P5Ff 
z))mM-5N?M>3Cl-L!_nZ!NE2qsNamHl>tLw`7$mX4+;Z`xrU+KQVM!b zp-W0(onuI@{^p%y#lSKD^*XLOiCWpJiH*4LfBcn8FIg<)~QS!rF4Y= zL={?4G}BXQQyS55*J$>7M%%Z_5255qhUno166PEgLT56^O*{#E@258^g(iNHEUCle z_gK-a#MA)iuw>fz2r2!34#I@Hm3)WZ+~+%M&^+nKI>79BD^g+WwdIUf&mJqpkYwin zCqY8^?LJ@=(j*@#2hUZ|iZX|f zZ0klGNs8&3Bx9_-fa1TC%?CSEeJ1puC?W52CO-l|C#lF1lkbE`RL>Qos4uTl|J7>C z=3ao}u<5uwmHNN0h`>+Q0QgL^-w@MXxuC+d#1&%*x36}?09|JKOKG+)OV|i3St1wdtV?3W(1&nBI zNj6$mJ_LakACGEQ>$THw0wY}kl3~e3wSfvq0|;y&$@mtkUa0je7MtMCq!Lgun1STx zyIW05uJGBp8;L_D{ZMD^XAr}-ysLRqekgRAo{e~toaS|SXf`sNq=fVYaT=Tw%B z5B5sW8vkCyg@I|QhDU|rr|&Za>V+W0ssQbj0mrF(AD@G7)l@>wh>3ux`2F^)o@G2Cv_dwqRR?wWnz z>EnbGjXi5>E;rM5dyny>ZyP*NfpeL+lwm^^wp%nGvb;LbzIp(1!8AfD<2ventp(vq zIQ)UMJL;q}L~PmooP|OZ-W|Ju_=vVzya5{OS57hhbck6|mdYjW1(o>L{H%&jEYhNp zczK#pi;wc#xwyfB;pWTYNUQZCCi9VK+BLKM2!u1lbaB}Gf1l!qcQ54ea8_u^V;PaH z@_1<*Jn(aF*Kv-7%qVle`hn#1Eg1u>RHzKsXC zRv&Mzpdmg0n{@w7d3@5s>AAvoSGn`OObjb|V>_-DLtfvNW}(0*QS~^r0o7W@KRCwgsuFYEE)VfheTdHa6!lV494_=WWem(a!JSE<}*tFNAc_^4haaefrV z^i27BN^T96{rPEcNLhTnNgOmgERqLk)Z0be<2=O3 zW1=MD5F}!90jmpMse%t>_wgQk0IGeH+Mv?G0K!ogGHl4EzS6)>u)hv0ptt4()_GtN zz~qi*0I$ksoOCQmuv0BhkH3&zWRlre(mmyL3Sl{4!>gK0ovL{(tnJ5Xj!>g_XdfCO z<2n4UR##F_x<&njCgr+FBq|xU$KB8qcv108<+jfwdx*j8E9EXAK7u!b`cuJqAV<0C z%lMC_=F}^=4CTFe3vy>^%b%XO^fL=*z500Ke2YdtLAr2zgmZd)8tmH6{=>_4gE47t zUZ6r~@6{;#Y}@_o2hzwnQ11ned|V>av0=&%R0e_9`=sUyI*K3_YB zV4^;nxAz`0x4AIk#pY|?acKRHUNFlh%xVHOr>NgXZHKNe*T4(Mndw7{?ht5DG{jiR zlc!x6b7hKxxIf~>?g2MOd6h|iQig6@{46v5LzqgbpsDxI)QF#5UrmeBkAEw% zx+K!&DvJNByQhZ_)UcxmEXGob;Xx)M{9zH%tMbs&9hMfE`2l=Up(;dA#otz=1!Bid zAOg7@Fv53-;{#rH2;4)!H3Wvc??z>EA8FFeMb%5t-Lgn|S%D)v%!#(Vxl#tn>Ed$_ zXI3|&8L--`!NY8O7{CKb!#D*X{3|N35M)R}ikw-%R6e?sUx!Im3!Arw_=o|Tp-J!4 z9}$j_U<;W!6A?GNAyn6%a7#1o4GNm8zYG&M0#T9oM`0IgN47zj*1X26Qx(LCVQ4x0 zXR~Zh5ZvsU-cD&Z)D|dyy4yX6h zd6-+?Rbt3Wn9{Oe9PV`jIl$#<4^BS(b4ry_Tk7>Hd_{Lx6k!yEnk4&UcExgI+_2#) z5BThN!^r7O8&Y_mTHZX_=-~BTvE_$Py)HnJ;`~VX*px&)E(#6|SkZd0rm~wvuEhvk zoD?C(s8d)5$uP*<`2;;!f0*Uv@)il;b~He6iI+8ZA>RN{h)UD5{24JEno5i8KNQ$M 
zM#)MGrk@z0!;zEv+^!2IrpFA{%QJI+Azb1fcV7}%zrBdL^=czz5{OI>-;cO3*0*ST zF_K0;%rFDb64l+R&?PD}ik9gT;@H7-Ol|h3Xv@L*145*~1I-!6{~w4G3l={|^}4v_ zH$CXS2qruAj<7Ci| zmK%p=0-oseA^C>IDnJCp2%LObu$y93;Bc1G>A+c#QG+$smCUwQIWNLuI#Bu+I89aFK;gaz=66>+mUg9?Z&eSHI?sKBq6uA1cOH#`Q>KeR{=cCstsx=C$gt^_dn&BM3PzuS?nOpY8 zHx=dTCsOdwvu4=TBtd{XTkyOM5UjwpPgTp}?ha@>a+;k!)L*6xA?KdEjeb`<$2>wB zqvMq<7>~51$bUP)^!e%!_r|%t0vv#J|3Zl$ zTUp4Yl8j)?ict}q6_!lL<2(;@XZ}tq#&mp72R>z#2gyCf&&RI<;;>$jrO6x$*X%~f zC&(W{9g6~}4F=yIva1(dPoo3X`l*Uq1W)xZA~AS-i9$w$YHZsy8vHR_p2*csGLQ@E zVtbh2nc#fXhb0103fig?PAdsaK)xw}C>)xqxvEqpjg2SRK1k8>Gk!|&RGeURC7M*P zzS>m<4ONar#=@rj;;q=qDK`%T#!nYb0roz{5gt*O5H$!AbN9=EQW-07eL6uyAjZ;; z{$?TLU_6WnnDTqI%>kwPK>ofiw3t0)RO$9tP1d-g{VcYt3x_|`0k-9Y?$g}$ zMnAsDgQ5m+k5q+{H(jHGE!>^jNcfN<7D@)5_dT*k(BzT~i|Ptx{?UMX!}lFN+``BN68+a%z}C=7x76};mbgM*Vkj%NhNb6m#Z4;G@0fJ5u#`H za%v|pdxFQzxL*jW(Eb2uw`8}~Ofj0p+1mayAOiQ*p%{DSy1)5JBe;Lj)+&{k(wN); zW`)xkcK3&<6fUIbBysiH9)rnAABsI7mpk>`Q0A1mVh`2&nVN`pCK@h83Sz>FJhn~+ z807--PuR6R`%QW0UZN4JUy=`PIm?lwzeFzk!yAwI^Gf=4V>?lup5F#2%9-2H>E8KS z9Su_B{YR6pd)%{oFGf?EMv)tP(Wg-?B!BOnyVWm5b_ID8>un>jVXQPTjxXpAegET% zH}(4kmr_xHug^D!%S8z9#hqA521gQ{5?= zBW41xMS@TYdS@He4aglgz{PY^DaL7({qKk|e5|d%dnjAE1a@%d&0T!wE0m-xK8OlO zxzc^>6kpl0=5W^$n&6(JrVUypOiKMdlzfcKZmJu-T^0#RWG!5Uf)LCMd#`Td580%Z zug-HfF@ol^2_B1;H%rsoqqw5|8XFtY*YaoF*Yq7Dwgi8yxqdrX9wz^^Dkei2<4*K% zq0RSTMYMq6Z|5@?6CDW$zTprP>gr2?W`$ZWcU^)}pPe@NYFT3vO&5ADq2q>eWmt5C zD%74J@(HWiLa8qW2F45_d)j#DY<;7b z{6tl4$1+{LuVWBZn<@|fB;?4Y>Lkst(TbyC&^D0{BHfZHP(^7laiatdFbmh#v6_%9 zVJH<;Xhxor^P7lX9hoK48KJwH|2SeEQ?95>;Idax*%ca-VP02J0w?b{;)OY%MgE|N z1hVbY)KJW(PJa|vT0T`p>MWJqB6JE;UIfVE9xcqCX1(=mKrS6bF5yYkMr4zZII1q! 
zk^9Rn{`f=m`Oh~Tl9fhH3-C_YNrH&CAh&BPtf8%^hZG<*AnN50!?VQ4iOrLK)r6ok zcHR-@<799zk^v2%Xn1s(>O*g)5rh$RZjZLfVD)|wpOO7}2sxU@!0c7b;qaR0J~mgK zyE}qP8uww~QxF903F@+&m3B)s!I09@oPHSVCQ<7G*WU@uW<_SH20_L5mmO@xRhc4X zDZ5}O1swuk69d`Ya`J}?p%rkwOPDL+s7Q@Hd6+Q}U;0{IWNYk9?yM^z~;LsMuvDN4ytNe!?O zDpZu*P~Y{fVfDSrRxxM;(fAugJr2#ol`)cb5HF*8vV!LBCg@J8!|BH7>u5!7B*dj1 zv)wYEF?>ABW!7mbydU6u7YDwK3PLdi=D;m4fuHs4Zp08 z0moW8<3nmqSVc3~DFBWQXz}bHv!dh)=5rp>L~QDpC?h%RHrNp+SoKV^Y?j5L!Nd6- zLS3_23{`OZl@^e}t3lUVu><#^O*0(Tc_NxT!A=GhV&|`5=skvvTzi+Ib3e({e)FZ< zn;gyWR}#G4vEC{PiXNz->9=owf}Or}57G6O6&R5Ss&6de*}f&NYOGr(SR_^Yy4YP{ z=G_Js{8?K&t*ZPL+w+*}UkND?mvl{Ai06d8GVm`;q7zh!e=m8AGa!yub?rgu68-=d z*m?v?Y8&JlF6UKJxD(WCDa3!8GAN=e0AlnV8B9A$uakUIPqzy zQWXlvUI}4m;RJmnw{ktn1K=Q>-lehbFI7v*xwW76zfRe5_lrl<6A+o6e6fX5x|5kb zhD6i0aXKGpDfYwHH*(@qscWm|;f7Mx+5ub(4aul@u+#$FIXIQ^O0b?dfw{BjC>i5W zo4+cqU8^Kl#n=&q#x1t1$2-F5BZXx1G92Z!Spi<%s34ZW{lO{17p9n}xx(xo=!oI2 ziS$+~Ak^qmIhF9&bip~)%itAeP2yM+yX6ogx&1arHt)12Ju{0U_YE{Onh)%~EE{=C zu`WVs6Z&Z5!tC7i{#EO4? z<%d31T*N+U7R!UQV6KpCD)22{xg5iT4*wcf}piT%MgSo|4P%Ky~4B zcSGhJ`4M8l#f2zs0PrlZ8dFVF#N-<0!ZjyaKHCTyhEcYlFO<*5Lx!yeJjHhtu^4xelrRZB$gFf;#Pa> zP3Tvg=#SLQI>4#$p0R6p-IVbY&B|YaxHYj_pK?r@(}L03oXtqc6{!vr)@;Bt@9EAL zKmJ{6RH~RftOQA`61U{Vw@VzlQyd0neugf!f9?-epg)5a6d>oa_$A}4t!(>^5Uk%J zD@4#IvByGX@!$=AU}!@h2qs0N+Txd_5FQ?Dy~{jfWrREn+UqU4n?V;unL|F`%LRV2L)sv&z(uW@^(Oya`@_JVe>J6^*;YthNCdR zfLdK;;o7@Jp4Yv|JhTBJb$~~9S=k9bt`ov9+0H_D44QB~G2Nudt&!if4Z{lftLM#R zO;d=lJzaA&gTX{5GIMpr;$$ zl~%fk`zE+)&!uWth7w^{H@^4l^&o3sn-*}RLembd^t}xs#fdj4;&=e69Z1K0(dHI!dg>roicnWJmHJJXT0(^nj_Jb5IX7)UUlGw*- zw_t+mVSs`6_oLe?!55Qj`K9$PzH?&xT?m)?RN@gr>N z<=bVY?n8$KN|)=VTDNV8?%St)M1F&_bMV{Thi~% zW3GwGg%C|Fd9Q;=W-42}RFl2>J({1dE3#^Sdy1Zw>x8nz$zWLl!Jcc7N>K0KrsxvC z*C6AettaTc1tH5oAx*fI33e(Sn%L-zu>S2*6b|ajHsK}**C_4Ogxgfn)9s&#c!@ig zCqgLldCa`cQtcoJ$?3og6SdyU3_tNVHk`P92I`h_)-65MZ}UoqpdXU?;mQ^j(C-(v6X?u|L34j19{!u~p+?dz`MF4^t&6O;{G#uq;UQAwgO z_g>f>_`v|`3ax+r4jUYmzc(T=-gAnT*$-|`I-df*r 
zTnJXU5N1VG8Gou9$(}aIRPE$+L*{oS)sHPQ9&Aa$i#?j3Yda+8-jQOTdU#Jb}f_G%itY#@i^X~89~y)QWjh*RA#9e zu6=+n(oLT3Gp!i#*eA3^7dU@ zYHX3mW*m%Fmxpt$QH_x0BtvQfcUND8WfO*&Pv(<NX)arWN;<&NeOOUIlt;8#{}Rk zGMQaWp}-s0rqJv(RDWo8x;Gr6DLimlU!NjD0082g@eY|;g4P=o_+E!W{@6zcr!n9E zYwR(vSHA~w4lvATY#*R%lt}=VD1QIQA&^gL8=z!eLb?!y-02#HlMNr%dej)zneu5s zD)|SJ7`UXWvIo(}BR`7CAi;D*xXfR^Dz{{%vOEUPaBblK2ErP>Hq;BuDAL}hrLV|5 z1Msb(;>X?GxUGEczB!U=Fud!ceND<70^~|Dni9X_R97j&^h}gdlw6ol!RVJc-dTm!ZQiWU+-1B7Y5!uDqjcg`7% zdn!{F1~IUQ#dV9#<0(~ppGb4~w##GXAvf_w&-{?=>)vdlLRAH1*P3Ob-=Sk4ocA;k zBpHX;efy92A~yz_+$2m*OHdAInrr2i*!ZL1%&SYy-Y9}kSgBk?gXv`sWz+^vXwR-C z@>xcm{P$wM=vxkXc^*u4Q<$-6g-E^l^$NOM*n@W7I9?r0H{j$gHfAn}wW=|h+dR7W zJ~#{Ba!bGbsO61t4{fdd%hvtVL$_3Acjv3$B%~#lnMBPNW1Q_wqQCU_%&!Q@fFu2P zE^t~k@c>IMaUa3@Vc{~e?%8JQY%Se^T76GL<7&_UUM@LW%GrCnFs-$jR=o zUAznQ8*i2Zb!yZiI|1^A3Gps~tLEhZZ4U-mV7A)}A5F)qJJi`Vc{GKe;DA^6^+W}d z##U516o7aQwC;mv=;92I#iz#)D8O4|1?cB#v%#u>n@fIdVpFyssHTDXT6Hi5>na9Y z{Kc5D6eC2*w{FwE_-S|Gf1a*X+fdse)ViW38d-StSWWFh6TyFD?4OfyGLXW{k1{JQ zfJ5!2{>H)VIo;Z$qT}PI{)+^}pb{f!X^+@I)i}qRNa34V^vtspwDCgK;iaVfNM*!O zo#getG&5TmA`zB5Gm_s;m0_UEG5=L#FV^wWbe@{6lQHiQ%kQ|#Wrz+O$n$dA2rG zazZBPXFR%~Z!ye}GSTJJxOT;=DsKNAZ&Ik>vqGjbmw(){Og& zR)_TOY^?JnzfxesJl;hXJBh(4+na-?lc&v-tcca$2UZ}i1jSW*lnf=l3YRQb*^7E{ zBY^ax+9Q`Y=XkoLoJrcjdh$tAayxg?E=0}C5sdeT-xvrW?s!%*f z^ve7mvKY&Q7K!}Ox-+li6YWZGuD(XGmv4yBrhA(x4$#HiMNMrPj?ZiD=5J*Va-aEl z@d+_@OQ51eL&!=Ci{^9jLePMR5kR%JFq2V?LImJP2Q)fGwpz+DtgolHqe)`+C~v5Y z65r+2G;uxkIPh83>26X#_+=6`cd;vJ`cMFkDLwG<=elpw-oy-pdPHUp&1?^zpElPe zOd*S?9D9k&CBAM4;8HpGsO^Di(&KgD$iowEWx+TkvN|BH3SNzzHTMB+y$*!|W82w$ zCL1e^uevlekOM!&RkQ{F1`d*LcHr6;9xns@_jc1>Dl48Mw78YWs$gjL}k53 z`oVLv6tfd}Ag#W*qklPlie|wyj)r8zCKU?8t86$m_f@hVl}kh?>3G;C^Y zdsi4n#dnzaNN)_5Zc1I5p15}T4-z(QF1)n4ip1H(66;bRvsnMAIL|0np5m^xvsEO{ z_dM1@2E-9fGO|4Al3if%Cr*$1n+Wsx9)-{Y3p(B(F`v{(UgB-1`k8tA`U~r^Q3osx z_jYG6#^%+^%6Y5d=}0{2DX*dTyFX=$uTjonly32`W)Z;gUCmiFz1DOph8B$UWA@cd z5irSb0n~Xth)Ex13h*vB)VaBvmrLI8ad;QqKb8es`LF0pmk*xfhi;C3KEHH_3c^fVy~7K-0&DRstn|?p5CvPpIkj`SmPI6tPQewQQ#r 
zO6gZg=uh6@FN~}@D)gD|O2FXidk3$yWPf$;y;NGZ)diU4!S8)0^mQ4usXpJ_+l#Bn zPCIq+-WL;0ZTp$&!y`xm>?Fptv1!6+Y-roPcx08fvZ#z6nFv#re^UU*b)TjrIn%1p zxYFVrG>>26dJvuYYaqz$Tht|?dvJ)u=n1VemCY55C+T-+U9vddTr-f@iQ5KP_>!_L z;rP(R8C4VeBWis%`-BOMyu5<~`Y-Zx>;PTQFuWAv zm!rP-hWqwSMd((Sq&UhKy%E9p8qvx+-251VWCD^h?l-ACz^v6Bz(|~9@&HT80;X7m zM#}}vz4EnWpQrdo#7~!nU&qjq@23`cb7VvYa}m|`{ysl08&Vso%u38!C@&tjXmEXM9*lnZI zuDiwg6~aWgnii^meoOMkfazgO!KasCRA@_hNKNz|UG#cwFUXV@J{9u-Lw)m4tAfhT z`VX-nobELCOT}b6gqb5{i+C&LdhkrRK`9T0SzuQh7mV)apd_JyD)G+O&%0Et7gi|X zfC*2M_7b%EC!W%jX^^jXU5YRa@u6>Pv`V||w{HODUaeHdy@i|ItTWBKV?DL{aJv0B zIlz0Winvjm3)Q)%;}tTI6NF=FzV&KU_D3Ou#p~JaGGIDPms9v)ZY zh5Xw=a}HuTOnho_;sfcsN$(S%yf#h=7XkFf#({MHBFH{_-M!J#L`UA6!gtejPS;9K zhjr2zWg5swuntUpNph5rX(^{#*tpUDi%<;ov2lu`Us(&M}I7Mtml*Kud|LMVloAV^=t zZzF*7yi?ac|27cXf@%H2*Bk<#f(sxgH+jE@=^)4}-qO;wQW9K6qXjJ}v*-7R)TJ7t za>i$b&Hefe*wJKQ6Fe)*@15eU_|Vj$HGt#A?>49vkO zxJVUk1b`*%d(rR*FeCU11R=Y2Y?n0J4*&}@mL+DLZ@O{M71Y)|SC3mE+ejuJcFvmX zc(i$MaB&@>A`mKR$@`^y*Y;uABxaGj<100fJ*+sC(Pu9Iv1&Cny}Knu%{oR6G59F+ znIH51>O3GatcCDc^o z8F;YFJV44yt}?yF%6rb6L^4jKz>IY^NY0AAdz!~c5aVsmMMLNEu?pjOW!67OLr2-7 z+#lZ{2UM$1)*A!>*VhQ0E-zomnn8+1n(Ln1lfBOIIGy3oIDb5o0q*QG@Y9aXtw?BD za27ROGnDbto7}a#3<6vobxNZYOvVB+cEC{)G*(mit8tE=$GC$!6SWXfInisFhUJ3EIlTqw- z_BO)}jUw*OQ8D$H2mk+sR**ih@YzhA`;>Du#am=P16-P-VU z$p(F@&hbr{H&>Q>lQsvdC$=|=5rjq~P_r7>X)TIRT-uQgx&dY7XDuj&NC$|W%wjvL zJouMMP_p!R9B^~gpniDr1W#?M-A2tIvdBlFUbkU%)p^;RTmy9$0sJ*xDzmnucw+zEE*XukwNG zh1fMbXWQSO!#o~}_kqrLsM5#>Ot#mD z(8``zVPqCVST1JJn8<;KmPMNMQzy0!LNpG`RzL>*31Oq0ozOX7G~9C?Ez#2VsXhY1n;MvieIVXEh1kl_s(H3C|vO_V6D z2Z!E@eJL&I5w!e1*xF0swC$yZhHKO@1IKwB5Zb_93#iZ1`097fqWVVE<7w!=2_Be! 
zpA;-QP@;4-w)7qhGZTp6h6wGRYDYezamksfoZb$0-)W*fEj+XkrhLd?=?ibzI*|T2 z3aw|$k4O0)g@L<6p%bF|Edi?mhh!GYXus4T2>kL#?$22`!pKI*#$O7sgqDh?ftA+w z`JyjRuSBnmD3TALlr@+tAFWFcKo<?z{@WAOh;6_&@0c`i~Q+@kW;n;@0pY zn3o!yx6?PS)AXDcn+d6jnTObU_T8lbl(Iki1&U@C%ka}PeSf)IwoR^mRfMwbyRtzi z_i+$rYH=yoN+S3Y-3;(!%f`6OfP~J5jfFr@ulLs$#(qnn(xl0ee@A4rC)yWcqyG&w z^EZvn08=lJY8oeoD6r1bux=!WX$g$6~mG48?>GlAdpnxi-euD0(u5rQM7jL1fxV#NIZ^USCHlJx7(&; zNBqkFn5)@wB+#wd`vo^N?)S+eFfK3W6vx zhg#x`eNtv=MMrh*ITWO6A#lX9L7-A|?%ENXf|rOOi!x1``e>%~*L2Qjk{#P@GC!Z3 zI}F|ztRR^_SZh>Ry`e7;izvIZ_5nNyL4% zOID&r9}}0U+${{>1wFS3zwY9rYRR{4DX^GwIrZtlKe4(6&js_60UrgshRz>GRq-ov z!>Wf%$+o}$Y-M=APAh{=P%)|efE`Z^H!(nP4~tSgj7|{jBCT2Q3&b2ox9R6U%+Ev|z^JqAVr&G3wTH7l^CK3w`g*Prv@j`3$qEb%Yjao_0h#+4N9* z9t!b^_Fp<8NIy8;`Iu#yIY;+NS{l(&rU@mf z@uFb=Mgj5xxC5!dD>Wj{>pc+L50ZdQPh0^JONn#j|*_w zQ`+VMkEd+TP4tjbB`*V0?3Ib4Cirks-XlT3UqfUApVp*-LZ-4iDv-Y~Fd*WPe3`89 zYgG{_n`)LeL${(GL))`BRWkd<5QAQr*SY1S>*xh4BABiGqm5{?3TypdxkP^M%?16} z0TfX4ZfPgPkDN2rC06mAoyRXy+!sAl&jKDBlH`SHjF@EBe^iZAWTTzN7ap99%?o&s zlXVYAsNd1vAh!@v9F)b}NUV&xJvj_SnYS3zV-f%(A=dXLRlWEh(_p1^ipO5dv&RC; zwa_%8h)RJtHb4r9l=D2+7H<`7SSxX-cK)&JoUOI*M`xpg?F%Y{V;PDyEo`u&ws2J4 ztPa5LHx2{tqbrL=zBExrjjTj*{3KL9mLwtIWK0T0>DiVmt2n7~`{IwkZpPOnO`aY4 z7ZEV*g-d~XZ)>UV53BQkIhfTW`xa%&aTDt}e_Z)p!3JBYwSn*ptI;pPm@HdX-U-VZ zA2w;9lGC^O(_^_-6Ygs@{)7pcE*iYX{W}mm&bS?kJw-OCk0GF9N*X?Cj7lnM z9__Y_QDgA-#LRd%S`l$j>-hc<`3?2L0Ev!+NMt3*f?4^cym!ns2>$t*(@3*d+PL6% z7zX%iIO8ioh23)$k`oIqxcNT{p6L9TxZWyMk}v=}K*Yc4Q|hiuY7Sk)^k&Xyz-nVB zR@uo2k;ebxP)qgVQVs@U3dYYZ?RjxyH z5$F!dA1gAK+DFhhqqCht<#|}YQqQ}ZS?-lKD4R6H&I&OIAf-N(g|0}1HF4WvB2BT+ z@Ck&8!yZi&0i*fyf!Mb37+^R15~UkiAJ8MCH5!c@@l;FtHmn@M0vDzFEoC$r84Zpu zLY{*$5umyXl4#G!l+>x=w#>iCPK${-V4AQ_bvNiS2ZO*sG)E*v2NIG!OSIdn(eD*aBHs?QD{edcjv;Nt zJn$nPSI9Mk9ucNMwL*rW!|D#>d*$%~h&Mjn$V3=YkrnB3jb;T$VD@!v9@Z-J@`6Va zRXcNPM9Uc}nbFNkXtzSIU*}y|EIGV`;S`N#SYw0vVoh@4gsT28rU5V^i4gJxuRjo$ z>o|>NIkTf7t-c_ZB|Xe?JFixzZ(L*F^{;y=TEbQR(VY|3k&H#SICeCi#LDO32-N0H zEY^P*S0N+G?{Rm^4|L@~YD105%nv7dZcu0=8Vn$%Oh-fgKl?k6dQI~lRP^dJay%rg 
z`ID@x>Z_Lph0A0?NUAeWK?})8Q z!Xgu2?eW!FVJqxap5898+@GX{fNtSr;Qfa~8#7U-hYWE-o;TBqA4ld27xHxj1JfGd zf1{JB&(ky+u`V-m49wYrd~ZE#fWvq!nT#q=Jg29u*(Z4?|ZgZWq}q?)kFXJ{9tMb>E(t67~6F+s5R-FFSV z!c3&BTP0~xD*bA~0Ig-+hc^Rm`fab{#fB|A>X+m*KeG_I+n5f@9l*%9(cMl#gq}%U zz`*ry=xRFC%}G&RBOoy|LstQSzNd>L%2Y^6_{ZJ*qEpik0`(gA5kVN`Qtp)Sn|i*t z;vGpUwWG9S=11sD@stk-CnYjQ${Jzy=xKeZNmhe?HqmPyQ$kLj(01t~3rl3Zf?VIP z*}XS^9F6o1Q~GHWki@OtW}0V2m@6vO;!JD$?0y-Z?SiqE8Mf4&jb6Ze-pxCAos<|j zXGRUvpa_5te@V{8!UB-Q(-!1XW%;uJGD^`ZL^*v9b1I3?M4sNnkEdwl6z|7~)?HNc zi80tUimMlwa{j>{o7(g5;gcwgmXuN(v<(*vF!8;x(Yv_@EwvG77fP=Oa?EYaS-rV6 zQPSF2=I6?j&2Gz4R99D22gBQV%xb8XzH3h%l-&778p%*tBh@6(xJbat>*~bIpKTjJ z(+@Ii=2tq(=V^=uSF0^wS#xaB{;TjKJs84Ka&VjFD!Q3Y1HQWbba1PZ@p7kuUTGD=efM61`lONtuE+rN!SRc4qTc9+GYTrl&lg6XT{B=E#D;=gO-2RaL#_WEk4mV z<0`#_6$2&N@2X&fkk@hA%q%4QoFI?_Cp2(c7}|?*F>H`QEu$%@Nw_Bg9*gUK$u3bv z#J#~|#6EMg;z@xv!qaU4t^yMXb#a`g2!CSw#aSJcx0ldPwV5Ms%o8}9_Q=i&!Ce?r zFP|vx(ADh)Sjv}Qfqwchj2me4J9Or#J1|mTh4pFWkvDh}@2yS%Kx`D_K=s_1(Cw zOUHQ9qG+gIEK{(;0t}=ALt~TrsQ}+SOCytYG*qW-&fJT=xBN4DDxa!YY z=d-tB+}FnZbA1QryscmDuQ~GMrQ~B9ko?h-!xRO?alLaJFVxvejXEv=bZc*^{oD6V z4Uh7*0UiH%Rt1)+i~uJ|xx{}=Q7L`s^{K}i>)}saGuCB4fU`;W`Ew9{VwRr>uvsut z^bis^&M7cp7jxmXM*d)0Th296s-pjOtRt@@{*dtEbW0U7(@foRgSn`XWLduwYzEb| zk%>k-<0Y4R4)D$vfNgURM5wH}{dc`_Y4AxvI2rZZ<7-bS3e5W;jqjo$d)1JJ80kL( zwgL{*vi_tP8%RarJX7d%mw3+t(1O$ox`$N3q9$eNfYB9D-7j?iHlm+WW594tn9Y`I zrAO}msA3hjs5s*jS);m+*E)tX`navhAjWLlav`AkLMnMIo>%e~jKo@v1$S)ROY2a} z!_vfqB9@Kz#7o0ws$XfINO)0lRZU}{KJtlhuU*JOVVGMB{PN|7WJ5j}REvmpY^VCJNJ6UQ|ty+jeGp*RsZjVpW9IOs1oger;m*1 zN&ZBiWr)-AZsHPNs+>U4u%!ez8I?{>#j@w`g+B@DYI;mn3Ki+D`(b^)ZcvFMXp7T5 z$9=Zz@j|+ajgPGI)7396mecXz6J-BrC@9#N;c2UTSS&Y?&x)~9Fcs`H8d5Bza;wAG zvp6uNyMJ4ICSSp)ydp0eqoL#=BCR=}uS`FX+i6+x50^V3ZomC3&)(`rJFo}abYF^2euHxqkDS$#7E`V@g%kmSl;SC1n24IY@ylu`EI$R3_|6Qumfm%{~r30z5U|gL9`3S=Qoup{z`7{A0Ue(dj`dz>=mv^{8 z=OKk|BBo!|G^^EwW-)%C=Ghfp1_=l-R8q}rc?46NybFMLB#cI6Rr1pVI`baN2`&3c zKC@vyuy50ya2n=<98*3SqVMCd=o;6ZG@BjL9=Jg;Fyi7Xzce4x)(uKjhtUuNA>Yo` 
zds&R2)z*oX9PeguYL{M3Ej+)T%8aXv%Y`E&|3S=9!@2H64tfTT!r!Knk5p#6rbPi~ z;?AE*K+iVT3;y3+CAkZa)l7$n5DImn-ywY8A}RgKZnCb?R;cjH;EK9A`YZimkF3U* z^=9mr=m-OF4X{Px=gYVM8;I4l+;MX^iLneE-AvrJ1Z-bXd*X1a?*sJ`5Y$I&OK1J< zkQWiJVdXtHks}AEn8^2wPBnatkY;y|*SNzN3`4IKUcDn1+q4ST*7rj~RUHH}KmXm+ z-o)genD^ULL%REi#`NdDbM#w*vzWJ6(;wiA_>ZA&qUxlb9kg+%I2Ga`Y{DH93#bOR z83w9+1*q`*$n-aExmpJAj44@@^Eor6Bk44<4LHa5@jA=I*J0tR09i%wpOCAaN{An> zwI9$=!1?W}2c4YUjLF1fMzvhC6x|9k2&+%aO3{|5gIw zaMo9Yy)}!#qK8gvzLfC0*%4$EFHVcYwjWq}n>`E;r38+eQU$28@i93%90cY;9TEHY zYF*s%@Tyx1ZF|Hug0-}3$ZEIS{AeT;_m*Cb=B3&@*Gd({n3%bZb#fDX)UB`Pj0 z;sZb){tV;bc_~rwIJTN_Nbh!G*ErS5hDe$slKpo&vTgy`(ot&DPvF5Jjk4104H3v~ zrmrrz!~=PDsaIwSqf>S_w{R;p$SZ%I8-t<{`n~lGTx7;Trxlb0w`7H7j*(~si-sG3{s5_fm&dO{4`kE&e+D14>RpzELEA=Y$E0tVH_W> z%`c}o*uKz(a!%^xs6q5SdSyjT*LHn_^iC(PD$7)pN1QCD<<76XV%#Dc2$|OOb+93HRaZFuLyeFUzYaZiGq)!-C5?lj^T13}WY8)HX-6HX-M(*SD9vF>Bz}Nz zT4-g`JYrH%`>AdK7W{HkAwYCdCZ0Q9Ha3M38GHLb#S@5I)Tc9sakG$5Qy_fSYJVRp zMyfc7=EY%Ec$MIqRw4?^a`m$!qi4dH`#>u=#%B2AgFJB2&aP`J`8u^tI;qXd>EWyI;W5B#Y2}!*V+ht}Tj8|`=e2OVJ zYbB$HDA_52GVL;lHIe3!C^xP>VE44sS+iR>egjus5uGFO&dStH5r^MU&q6S&A&Ydi z(Bm$QHH^v^uo&c6EM(4SHJR9PtuB>8!+1EK$Wx_b1p{bCH9eH8@H=M_62u4*Qofon z5DGqb>>4=ZYslEL zSG${oKT&5@rRsUH`Ik|hf094g>rj6mB$T@PUd}P&( zXB0Qz3VT`oT6qW7iGG)a;choUA}TLCHLOEu2P8K#A~pBy$WdZ$X*NzDx+=*v_LWVD z+kGn(=MGK!yrBtf=N$a*3s-)4$W6{3ET}E z5R`>jIawYgOVRB|2xd%R)r$O74pL91H!A_&sxxNWn&`w~3ofJR{vD_X2ldY#9{KZxAS6xy zi49CdBH_FVGZ#Qc!%g~Eo^?_8N!I`QHM;MYol;2hT7(b4gP;1|oc*YS{GB`CW9aNX z%aDM;r7s}DCJW=OZU=e_^R51gT%5#=_()QU1)VrA4MZT$V6`q#xs7! 
zN!IlwWJAmSwElBWEMx%bicM#q#P})X`$h=Nu21Qz8H4|qC|lc@TqEe7K8?Kb#pSQ9 zF2Jb~EZJ*)WBppV`R{N>Sds5B|8RD~x}6mw*ns0P7ovPxwy`xz9*tU+A>6%ad1=1!>BcGeo&En0Vt9$jy&}OPI z2EAQw%h&d_#p}x2zhnp^pX)z?6AR2#wB?qz!mLrIMbIz~5p$V!c3<@~o>S+^+w951 zD*qi*i>2PdNZhwGQ2mQ^k}bek5#`oMv4<873flk*P+ZgV^woxNa;oF1>fsS3OE;_K z(t2r4eaKO2E(sitb6~fk8)F`hQQ~*0v5pDhWzI5Vhx)lA${eoLqF@y4UA7RI;QTV) zYmSlM+WgTo`-_gclXP&@8JvL~0~}ptvQdSw0gEdj=3?#=JI9_*^E)4!ep?u;aA)2C&-J+rui-dzf*UI+{CU@vXjw`qWy-^y`*Br#b4mFiZ0K(u9z7+Fs>s z$>Wyx0kUQ{o|KQ;%7_qSblN2{6x`fIP~g<;%9mLgB}}Mxc2n?48{*DjX#y??|FWHN zj<)On8W3Sw z?N5DNwi9GN>%|?;QSI`=&cposckd=@n*>ugC3HB^KnW_L1qkT9R0%`Rf+wS(Re>kE zcia;wkFHUe)#JJEYdaB;%+67X`Rr=yFvy%L-ExU7YfE+0 z zS2byG*|)~C^O6fZVf;CbV+?KU*rJ{=uZp%&em-OkCs}%mq`CL<>^Ug$U5Fy^jk?~y zGdUa%wJGWcuyN1Ue;Xn$V*}H*M6t+X!JI$;M_Az!tA=Qrdi}M2Av@OFkYzpm7FhjC zO*XyS8`<6=|H&l;uACNtwM-U@M=&NG>EJ-Q6=y;$`DntZC(zYucqa>nGs(`!xHh%k z=Z0jS8F-EvN3x4cnV>3uk{wkwH2bR2c$Goj+*=C9d+I>0>dqzj!PC134aJxa#O`ds6_Qsp9v*DhG=YI zVM*KDLAC2b;+|;YAGQm{j_>NF87nzNrT5`3pWu?C6$Fi>>@jN!za6P0${Ye-W|SA zPhyyWOrMYE!*Yu{V?+k>`H$pN4obcr`1e00HGS%XWuS378i+y^=6C!T6y`O#pH~B= z%Vn50P0hvTK=`HW5_sn@+V(%nkv5xFw!?WBBs~M~m@4o8>$q#1*Lk{K3x?F)xR_??oI>Kt1w*;Zv#zIAw4#8iklHD- z$Nx}K^cjmcZq1YiyCKFjRKRl7>Z#wwSB?&dGv|0k1JEu<0=i6iJ4$NrUOE1r4pV?_ z%?Medy`vuW>7Bz{c%lJFL+XFSlO8%0q1c!t;@^2BOW0 zDWj`KlYOr5H?P#0uFm}ULHsjtF=FTE8mNev%CihrY_O;W88l3qhfG~} zq4x`?{Z>4R(>mFG2~L88J;>$c0_O1)u1x2)sD64DGP}h;i|Xi=gxxmE>tLK7yYt8C zo$17+_JMrg60X|{*2;F15ORkfv21O_~jmxsw$svtkHO|RR`o_ zoa!xPa`-+|Vg;Tr*0yE+LIkV2ZXRGlQ6C7(=j^Z&Y%Lr_O3GpmcT6=i@oB+QkK?b) z1jf&4c3%45NyGAxiR~$=A|^qUQ~8+6xYaCAG=z_L{7X+%7Dt;pPCednrhUc`P){p8 z`1#1U@BjFCqcd`Xnd)*ZrB50S>o5OjWpUT;dDLqX2#{ORfJ~QESY8a;L(2n-VSq4U z`B>Z-!;aRYawbc20Yie3WGa;v9~q8CjSKfk%Z~caKqhH6!P*fFns!GAE#{>}vqC#t-DzDk1O~V#t#hXFWwBqR0cNItnK$pz73tK&Wv9 zv*=^r-;@i?Bir?n`O*1(n3i%CHEM>u;yMp0u(dV&XAV={lmLcQLfxp*)oYvySc~gU z%;kvr?g>G7-IjSP(N?~;DGoGp=k^AhX~-JDOEAC^y`Xt#~JL^h{6D1ywp7TbXc9_u1m} zI83Kw@6cz{_4Wb4!;eK*yY|GEwE=|p?Vjz=2H*h$V^t!0HjruunujRli*S|r5(DF! 
zfKh9IjEk~s42Vkrux1+RgEPk`L|3t4ZnNOo=z+@KX(E|(c$3*g7%=xt9_h<#a9=0fvp8Pd`~=w{CFHAWbhugZON~zfTNWo7P5pj&)wk*Jhhby zQW!k?Tcx&iaK{jDo;4qLZCe-9D$M;4uwY|(zAY$kmypp<(U-?-3LG-kFE0kqe6xH^ z-b$f9&L!4!dY+wR}U`ngy~Iu2**tV zvoY8b-eqK3&RcSu8XXedok670YtZD_+ zjr9wpbICkp=P2Ez8-^1kFy@Nr$|8Vq8iz9k3rq415dE7!K4#}m-y*UT3|omxb|e(y zKVn*qpuaj%S;AZ=rav}XNfWY9k}NfxMri{{=)H~YV-sh8r~hXQMr<*>)a(^UAbCzE zliJ|>+Yn3_3KKSggchB^Pv(#qe>bE$B`4CH>@oi3xO~ai zq1&Pekmj*f3Xb})4Jw3$P8Dw|c(vmLUHhd#KlJq5bOK;!*gkgDXMuc*wznofANCpo zhpL(3uNd-hV#clvH+LBuQQ#%heQTj-@Jo9GI(lM#SnKX-jOKa^L3tr zX9x&4$aGXnL{w&oTCOToj}*GSYDuw9b80nKsA!KdxX7rmh*|H6Co2Lm$yAN?&QwHJ z*$^+Km$97Iu*~p*w)iGlMubl2Z!UFau3Fk=8+Anbiobh4Z9^J`-UCu88fa;ku;L+@ zk{h0JGyz%ma{en~4tfiVbs8gDBH$BocG(*?=pIbTRdxKXcuj z-q9($e9tW}Fvn)>@GN`+Xu$9x&N|dRWkl45@lzGWF1*q57c|pRIP{Bma{Xlz&+{+! z=)#ZV*H~MoC1F?PrSvKWt8;l7-!Ie*Cv@(S5f7FFsjlft_cvFts70&=21o9VaJQwa zh-Rbzzv^(7BD13F4YagYq;XCEgN~=T+9;Bf+zqheGYTVH@gD@b(#sQ1(4BWP$*}@H z$@Or6*unfPr3g6n3Hin6AkFE+!y$E5n9xJ;mz@Zufhc&OzesG)XeFyyPdxT!AYz9x z{iVa~I%``V1&wVaGi|zL@Xu&7*(hI>*H(u~N~hxfB83c8Doi*XXx)BM222aXY=;je zNP>95Av;eG)r0~CkGNz+^??@}IkTj1Z@av?syx-Gbu-tk_tfinNGxOCd_qm5cTkj? 
zUP}k7%ZsUyq&A$K4`oy|XNb|kIc)h41oPuZ-x5dU@r{-ru|KctSVomi`$nbq&}JLS zh7Sq-M&Z*@h}K&Ku+Eo^_say`kkjvDJn)Js6Q6PvWGn@oqL`sW5FY8JIETOG>&$fr zeGYyE-|L!~)_2M-cb15r9Z&cMoLv0&`Yt3X^p4OdKCiH7qO=&9IA?=C|2uE@AFQ?x zmwN?SFJ%CI%u}NU>cZBy=wAs5hnx1(0eH&?fOL~9vu+%J3olYz3jY-f_qa==;51<# z5G-zU64KelcnR3(ha(kjuY(I)e5s3;SshkF8{rgrEtY!)76%+qY}T=%7Ou8am~3% z@;(6Us`9KFPIQypa_v&i$!8yi$Obk z3_SVZLc{(BpDiU=zT)*tC`w9Su63g)v_2{)nBUUgN>-SVkYRv`cY4wtB)i29y;8|i zi(M>Y);S$RAuX-tRjQEfgYC_84%Fa6zC}!Az%vIXVa{7Qb;``iZ2uHlu`>hF4{y34 z)A=a->j1YrR`3tuh+V91Mf^7;Hk5#DuySz5Fu&z0BYgClGRo#ZAs>)6lZ*39&u`j`vN2dN~MFGn>*ytw;&y6n2 zRT=TEs9#@wdn}b(%io5;ikpmR26FhcZez@X{CP`M1x|lHxoQ;<3fEw^%v;Fe^KqeA zw&mF+rt$%HXszFNOWNxwQsc3Vhj@jW-0S5i4y0Rc%busWyk51Pe4~F zCX#fh#r>ZTPw8a_;u%`ga^_7)G7D$MA{!wPmk!p0f4ZlB5Db%Co()-{vM#g^!;$Tv z_4vsmNdmQPU%b=LvFQ z;w=hDV4+4v&u^i`bGIRp5HLb{p#p&hBggc;dm~445%U*9^fWe~#~7#It?_ZJ`6b?DZ6Y2f5Rx^<97HP>xLJmJPTD z;cupSv;ziO*GofLtZS_8v7A_t%Lb{wqp?zSEOm<2fIogchrtE#yXK_f6;8y6_$!}* zp>FVq>>lo+@rpzr_lst3yiB+)(a+!PC9l==eEmLwgLp(M&jUK_4_+rL3)jbTa=cOC zSll%Smr$f6bl}tl0Kpt2W@m6NS=A)H?Q3i{wo+Be0pSsHDUJcq^!&B5` zxOy>RW^Q-0q)pH~pAh0+5$G`Jv2PfKsq54~`7+D%AcgzoTRC$_#q2rqtKCI$^L%0M zEzn^cOv2LynaDOm%mBu)EG0XDw>ft3V)=CAohJJ`N6NH)d78fDERbL9yt&w~Y@?fm->`o)CQ>2cEAF60y#F0{J}N z9>LIoj#RC*mUZ8d28i{<1QDn#W82-ck|LvL8C{39=4Uq*aVBP75GDD!31Vv6)Pu?0 z;CacccS}lT>aVx+5VW0kXSNRs%WY`7>DTwqxL+)t@2y4pz-x47o;_V~y(&q+Bd1eL zPuI_;Loz>L^B4hL)9Wh6LLHbZ;y?ZPUWS5uw=|Qtsyg)l z76)7{q)P13h*%m@8}SBb+eB|aOjc*>NVhS<`~k?0(L>p`3my67_L4XlmR@q*?!)c!ZCHHA_hWSDKogKZn6QCv^hrW(bv zHH)S^aH~PGpqSe~)S-*~_SohmwbV9LvO;mm!o}a%pj$MW*In5Qh1vo5l&W~SRhYCB z;5)8o>n=VL73LM&CyMyr0lNs(6uxtOID#kQ7y zFXA){d$E~8q9qDes6=9bq=ox%vqIxH2b)k`4^XJ8F^o_!adPemN zgdO)1QqFjYhRV9#w1XV+do>u{zLpCsd~#EW@cf&+ zN4b#K?9vJ4o`Y)qc2__eXL*rsNzzqNa~#8yrxza(_YK-T&EQd_EfrF z0dr1ED}1{k0`;?^_e?7+xLfN19is!R69VTncTu=R9;);JnSH4Jny3YoP3XeJW5_VX zaN30N-?mab(rfykY@}%qF`098RYJo+8PxU0DUvZY(2eotG=>d!#L9BSW`um#$e+## zf0dn)_JH~ypSC`YGi!^%5`6A9-tyRnAx2{CEa1h-Ax}%{b*GP142QfZ)d%CA@Ii)A 
z)-4FMgC`ar*gtNb4I^(6xWm;6jacSvKCQuYVLr-lm5jNWl)t0-021$}>rE*5u1Qa1 z)T9TU#A232Zz|^WSf&L5me5N3o@!?6d<7{*q8(*%qfb>X(bxMhUA@4ZPJMXI8#2g) z+K7&vR3pMCY|C&!ddd&oru3C-5jw73suf5-;di_Rv{qhOOye2OcjtkE?1{eBlgT7- z-k46>Hw@OrtLAA9uA5(pGKhoYxgi`hr@WT({uIR&yDflR84FbB%tx;ud!`oBZB>L# zE&X>j6r$7lg_TrUB@@tho)rF}p<=sjt2(Z0O}?{c#{z9bp3{QKRqYHlr9;NX^i=&Exb#BiH zVanbU43TU?j;5g0lKUF||}g1L;$U2mXOD5?eny2uJ^IA5i(M+@JBCDQkh z=3TJfxUsK@X*jh{g!$)E4WlwPawzG!47E!j}Q2G{KJU_c z)8aEa!a!U4Kp3afOMGqB@X$sUj;#0?c)wFU#3*ck)NwahkA4H%R*2Jw02~5Em^d-bgs|eH<{>*lx zu1~$9hG{`|4MLfLf<2O}kta zzkOFl>5>4><=aA~y@fmxG^0lGuWI>K+@#75wzVW>((#h1OCpQlp)Q32C+Z|OWB&*A z`U1t_^a#_bKx~k`Hv}q1EMgkz&ped`Jb1y53Pllwxb89_yX)G*jWkHP$tjty$7n&$ z3l;zdlIC0u@%I%^<09X8gDa-L~Oq7BqZabS287wInziwrlq>PW>> zw8q{0HCFf4f_0zha!r*)CE3~ow~#UrDbWuLnh$f+*O>I>+8B&_pt)DpkERjS6XB== z+D0r5o{-1;Ig=23_oelX1-7#!1yaORy$_TTByuU7fth*mGEVxH{S(JhF}0F5=wr6D zZ6NEq{V+D_vs2H0g>v+5xCfg5*9~vq`f;^kc{+f@A292<)S{S8^~PFa`V_=V9v7r2%p{go5 z$o@Te|4X}?odx_MW@^tuu31Em#;P<-P-Zv4tvv4F^#g6*Q<%}nXlHtCey}3sN&=nXT z8=F_pxDp@9C09E6URvgQ)A(usldyN?e+2_8q zn)dw#5A?9HS$^(S1Y08?y&BTfoR2nboFHSHKsZ&%6OV<_ z5{D%~_H)W=LZ5WbMxrgwcM1jOmr0Pg4An`#HORyBmV#<`wd=`oYAHKHtwU?=C2r#X z+vA8c^UAFa$rMuaZ!3TSEe3XVBLZS8oe*}nx~%$@O({E1*Ozg{q;^jViTE`swyH$r zEHaiz)TOIhxw02-uwS>woc@l zJDgkRQY54UZL(KyzORk7uixHwC&jGAb>;_Z0=K`PIKIu@+GN{voU8IjS17{FI#2J z>%z2%>dGEp0ND+HH`wtpv!W(YtJ)l>O`E; zmgj$dZvdOTJrmUn8&{-Op<^EuNm+0`iuAJqPW2fL&TX19<)TDp$&VL3jvET63R;@sNE1?h!{u2n|rzD&>**;LtEQ9pWw7(bA!r z$Hi$m5I>`1OfD8@b=mWDs;BvtoffJQf<)80vAUOoHpvG)H@efkNU~y)?s?Uj{RAZ$ zmz+wF*cgDgTP#r~S3jL?il6H#WcR}4ka%$cIXXbeLl+hYM&Db8zv`D~Df>6tKSREn zw)vOgWx=KV)y)XXWE5Tl)25`?^!9Om9>}Qxa15%>#lgZ|6{M@)ECc?N8CrpslQdaQ zdILzme;Db5b^m>F4sVzQ_1Ru!);p=Z){Jsrd>~KkH1CJX_dPQNwwo86r7iu+y^rs6 zhVi)~Qum4=>F#{#MrFM^{8GTTzxYXz0Pd~_M)U?G`qLwtQ8r=2H2Cf?(1&9aql%dB z;L77&o24w%1yR)u0gA~a7c^e0ac2beg78B^O4$wJZR%Ou8cWhrL)VqNIfw?tX6n%< za4JzOxivCqN|GJ8oV4n+%zNg=$87KCq`oH2^WTe@la7VbM2F?&Pj|mF$bC~n^;WMv z=E!V}y@iQo$Jy`!fH&@F`!uO$1@aD5;O28{GdXJ@7#;~=>)qa6Td(xX_3`zvHMQ7| 
z2FICO9?R`i7eLZeu4PssX28?TsM9PJ8)~$#5qmD)dMpdQKwF&8z-+Rue1;$=F7$XO zJQ%v`w-Kz?C-M?xs^UaiWD%~F*41@S#mg66V@+&}H1gE^n0pZJTJ0Rr`QDwhNODiy#dXjT@ilF9eKoD^t{f=7UX zL!6dO9>?3yG5J-zfp`g^leb9l~60N1IR@=KYZGHSJRUxHhpGNbb6Fj%QJjQ79T4zi3WDs{ySZuF==R_^A%%ne# zAIi^+DbRj;p|I;ipOPF1XYi~@?ACz3&VW!)5LFFK+iIiP&*Uj?wHEc5>QaKY-oXXO zc#D_dK#0$jdcdhJ{JG;T3EI})^1JVNH>mUt9@^$kM-D3|Ak2C3aXH{kLefaW>IIn2 zXR|@KuRa4~>aT*W+3gD8(zY|NnmAra-;8NC#6lUvegHw0crtDSD0_D zoqlN32+EVLaH_A|!L;jzoQgqm>iGV{yi-%hOhM##SFS1J!u_VL6~On9@>Eq1`*H); z$n29jHaZ6;vh^xorDJl?UagUlj$Np(3JNB|)vn5|c&P(z^=eh_OF#E@1hZBG1PZ(A zq-iUCH#%`(*uVK?0&~?*zPa8~OTMh#HYERAm-{C!JZj2@(fDv(_1E8|Xoc65=m2>K z@1>wd+z{>?p6??Vx6F+$-D$;*0)vVD#aYuMa~gt&J&o>r@05dShz9}%?dUD~qEk3t zSnYWN3KW7ow-A9Uv8Xk(yCzaNjFNznV3JJm%i9EvQG=Q1-rE48i*#r&idxxAOAfy3>3!CQnju#6Oo^S4Sm3st3abjk+WSF=+wGA@Yrl{2z`Hi=a7j zm?ng0P?JsFMN`U6W`Hi=XFTa$j9KqI4?sIpgiCq0vJfw@z9d_Uscp0Uh@LYNf$vj+VAasKrrC>bxI3=($LST+G!-c6 zxKsn8fQc1h%)TS@O&6L2kB`22ie&6_NK%^KhT=XDAvTMz-isL=m6Z4H2^bhqgJ&*X zI@4V))@0+xd0UiJAkI7tJ}~~jw+d@b42KsTLYL-BHyn!;{<%75BjlX%j2TQRGNe=Du6A7mJ0Ahf2Oq0s-VQkoJ3;{ z#eEf`~Rq8vtK_ERnFE;EXnb2e=<1#Zx5BQHEe-%|K+t!~xB=J>0%|A$%>owppnyN};&2 z*$;BTln=$tw0f$@tap}bq$Gna#^RC%i1o9I>QrRj`4OUoiM41!o;H|4_~4;)CTJ|B zWB3!lYNBvmEmdwUly?1HAg<_4VuLGPo&WOVQw_8P)lt*0Ha{U`vM5??1S~Vq8H60q z4!-XO?;k8H5!$;wrdA05#X9z5B4M1%t*16|Vvu(Oix>FyT$WUgSs&cP#FcV>DVmbq z%z0TH+KIp6)VFL8Hm!`1{wcNg=cMFc?|_x@wIMVE2+cv-!^w+&Pg4DP_Q{eVrfpi? z#@%p&eCkywSD5a(t9!cVdY&T1-rTB#Qm^Y2)29ax3PK-zCy}2#t|Y4$1Prt%Ca-j( zAw(-lKg#z%vIlK<(NR~;jMjm@3)EF6Q5l-w!X7vy4;MVy0k`z>uy(s zIo|OxHGe;+W}!q$;`nstiu5tOSWffa?~_KSj|>My7yE!91IgrwdI7$P=~(5-S>EsP z!k@s{Ap?2*>uHssfj6vs1l{X=9~<`wZIHDO1{Fd4jXrIEAMGlp? 
zQ312Lp}}H_waK-Z{PF*U^~=&uOnX|e8J_!*>Evfzq6bH~-$E+^ zH>YxD*SAf6Hh;>gGH;9xzfQDpsIP}5fw^UzJdDuZRxQ*(s5lOJx}DU>6pEcid6YPi zI_^FaNEwZ>`ZqQ}gT$L}T8WwVpzIvW-_qmE0eY7%mJf^sA`zKFn!{C62ec%9`06Re zc|>^6&hT(Y4m})G^#B{py&i4f2%F#s&!8TIeWG2ty#zjG8#dmEs1iBQgOu%DwRcK) zV^gSn+^R&-rhDhgcmS(v|6jR@tnn zeO~1?#Lu zz8q$HT4I(2ojgEwv4ekwBfMKd`%iw5Fh@HMbKOw~<)w>*Ej~gDU2I?(h)s%Px-F}= z?@}GkaT2JQwxwA{2Hfr^TZZQLGubRp^Lo_-b9BtaEJ==!vAZuIpN`3G9V1cX;N7uo zR8M{DvQ2O^$Ku$?WC3z2ox{);yv2-A2WaalT|+uD_0nuO?+4gT76xrU#>nvfeAqLV zVfrZ-@5(PM%0&*1`Z8}GJw?^SJJ>XXM(j;bD9DG%s z+%mU@grz;3Ky|3{S68%pX%N&`q*$IDCF^YnM5nE?5ttlYb9vPSFyS&g>Ab zl{!WOfawS_S#w|xU=Tlf-amCKMtWG>fXJM*55H&jZ|#WWda;m!$k>4Oo;UY1@`sU) z@#~R*C+P%gX#V;MtGLPtIt4Y2UYi)zn4j5{^#nW*`5wW3g#W-TLMMHFU@g$}hTC4d z%Z9OXVXY$4%9NTs3DNneWer{qedE7-ssf$4UO+2{?I@g$ze$lzbs*r&eG33>?MP54 zZDL0f+;s9~H(rrq88HyhDp1xJ+_1c6Oq)=L>V!nsTE-6CE2n@Y6@)!|DBM+gEBIVig$Gyj|O(iF4 zdQX;hq7%n!Vdbvh^ZD%)yy>(ES1p*tm+fRr6bOc4tnacQlF}?^>J@g zeciFhs3x=0r5mDPNE0_h0i+LGI!a-sV^v1jZ|Jz zCURP|&4PZrz{$MUyJ_K`XSzS0k^?m~_iOJ!BMj0x5x^tvyOj4+{*jh{np| zaL6uvsFMQ3=37;)-T-RvU&znSJP!r$k4qc=r=U{7`UVI&IG@de zM2b?E5fJKm1s3(ogj0Sq5T-+<-hhY)9e^D|)93S90(ErFk_1(p%Erws6rccrh$(NuJH8-oY zMZ~BHiNiZeN>6Z4kznq$<3L%8Jaap}PTJu@`=A_C5Q-*^5^f78xC?gTQM?@Xc@iyz zf=q1OcFBB1Wj%sy??}ewskVekPin%OTA)O9-SP{%J`wCKXGhZqpHYiJRFrCp!I8k2!hs>Z_ zJ(&cWE%v(6$!R9&mh-u0oZ@zu2}^6f|Gm{BeW{)hWir6_)zTe^AGJ-fk4zfLMh7=X z>8QwOX^?g0uc_iEJ|w_WKNiZi?muo=`NB&E*W`;F@VAN&I*5;#NfV&+BxO<~tq0r) zS8zGCg}BIpvGyG2p4?8H$bSOzToS5uJ zBmw@;7_a6}LWsrgdVf3?o6=~%-_V5{8rCNT&Z4CZ&4f(VU$A0TBI*ih)!SC=p___A zBVcNN0DWr;J(GWeptSWdy-ZxSxQY?P6d_=7gr z(IdB>$rh=FXtUZ$oTx&X4$BgWQ z+9_XOpn9<~FTm|ZSr2zhPl~d9pAGXwC>vR9h(ZoCko)_4y;XFd-o#$nPsUX^Kq z5^ZFh+~2Gt?{%z>r@mE(z=Ts*3aj0ZHB@}*u@Iy&^3Cm?-Z*C2^&kz#t;=B%4XDGe zbb;J81Vf>LryTtuNi*T=Amo>*zzFC?heux*71XsKKzamd8sp~d4&VuybRYAhUxDwA}D0GXa z6P4WqKtfYx@R&?JFg?1Y879AP&uDW9@cbjyU`5TL?jgey1&=73<9&e!T!ASC=>fV=n@FY=CAJs3{)rZoN6fwHqh3s2_Vs*hD8x82AVLUEZd~K1LfP_6$q+_kzA5wj~}>C{NYRIt>gI#W9YrPkj8 zUgW@H2}N|c=14WpS~JG)+w&bAL6tKEA2 
zSqbYWERZl+J|$&=A%6a_lRVVf-rz-a=Wzjw z`dN4|*n~3e#FZ2cN%KFI`pYzAnrgD3#vYV00#B=o=QS@tAY+AI7Bz(#02SM{`$HxPtleqn9<-q-$U7%e3M5zO|_!= z$$p$vROjz86?y#!71c#)jx#z?MR{DP*Ea`*gdNdwc0Y^hYJ(y>4Th9uh{Q5R)Bi9Q z5Mpue!e`$|IbhW`>)NU&F)o<{R!LJzPjBHwR8CqFi;zn{l|!G6HmLH^U<|X@{{4$t zw~=o7qYI}`qU)-d_R}fGyKmi@sT39hE}K+OOA%3D1P=fgdA4ES03OBI(79$Z79J2O zU6JCuh)ejYHt__NIrrGL7iZg~$hbN2!93DyD>AM>ZH29vz|MyY5TtNcgmMQ&)(auG z&Pofz??QZs7g-xo+pDnD841(eDe|8kyf#L@rNGThGqk^&ux3L!M|b1yJS?%-LyN7;!aoQUbh;Pp&pCmOzd9-kbqA1ke%??Gi`Pbg=H z+YZNLqE`*x&Z&IYc`~28&>a>k`|>MTdBB(K&Dq0xrNFhp(d@C5djO+kvSYZ}f(ChM zQuJ_D{+c?K+zgBm%i^NYhGdw$z=vG=Cf@1x5r7JSFi>aJcarIxPJ$z8+Q6FNssR^t zwjDmNOU<_o;u;|!jGQo=$8N&vB%7O|M{Y_E0M2tTVhDEI^(!u z?14eB)PvE-_bVMYSCeA5h4CxsVm)!bn4Z?u+UX_l&^r!}VBBY4;AX9^CpFG!s4M$5 zj_vr`usizqaLPu5y~bQds)*o|NuZ{Gzg3EDo=yy)?@RVR1$sz)GQm@@dUlBnb@%KH zX^oY($@{#!BBH3hS&TUY7_bKTJvs-FRz1|2m&C9Da2 zdX-coaA~8qccbdrBCDCOZu>6`6#4ws82oAVgmFITD&Elky9cwT2fAE&qED}%v^Ihk zX`oZh6t>Z3J?WT4W_X3x)JPJ6m0OdNPVstRAEt~X{ycrR9s36$HIN zJ5#e70e%Wx!vLCxhCf9!RJAWp3sd{aN>pxNMb(#kQi?7e3_F0Q?Jf;^CQQci)#^nO z^AT@;JbmH}%-Y=0z6a-kZ0d3$N7E?i637MMl#;;V>h5iUc>2gKvB<9#k23$uRtzk0 zC3VVjWp^y)%qtEWWq(1q%NK#3OV9R*5qii=SW0gRl-0f&I+`$Y%1<--5-~!sC18_t zRa_e%5`N~0-j8s~9HCPDUQ5EHDk(yA`*-(9ZR>|D`?vf(Ou?us6b9w2)sxdez@Cg^ zfQ2bU1qn#t0!J-$b$3~=qj8ew1rtBm;2li^ZO`WOIrf_0^8Wp?AulHcdx=pa6)BH*2>J*Vr4_xf|Lh?XpgcM4PEE+W zr2gESL&mK3rVmVMiop9VM4hbErN%_fQ!Gg0w&gBvto7&#)#tE)Iaw;S5>s^2!!uZ?JQjpTA%N>eQ9KSqC$>|jzR)FE$Z6Oe9XS0&>>oFPBNcvKvF-jH z(!ELGz(}Go|K+`lWG_evC4^nf{SoSW^fKVn;?{gfW*6_+}4tI zHb&N=FQ1(97b&3&9sP}B!|O86(Or;vyRDSq9WGJQ7Vc@GS6*cB#19{F8~XM+vPRit zt!O)99@+nZQ>?EbxNQ(5e@}=GSt6t4&mqQf*y1LoW1QN@SrzzUxjnfCpS3e8!#fkj zl}x!)49C<3SCgp^@q3%FwqQ8~@XrW27FL-&S2}M5%kzPNox>B$8URG0u}(a>#-qMZ ze1Zi*GAYF5ItaYsNHtxw^tZRk!S#-c&IjlJljbSmZH?$sjzrEMm=u>tN+@)3Iv8b0sgXbqpi6R+Rwls|%BzUD*O z9|2R6oFNPR&3@=n8})zLl-NVd4iK6AhBMer20@Dvc0C{Yw_BG2-;+I)nU0R0Q);wm z_e|6loqrp%Ny-AvXU_<8I8j(Lfo{Z>+fRu4Nf*u|UfUu>14xrEL~+7XC)0c?MtXHl zbl>M%!@(EQsE>(-al9~#gUog(bUY)KMS 
z_)=xjJ|$=QnG{*?Dpds(3*3=6R(s5oh;BP}-+3VejM%#RWaw}zdcl81&H>q~bDNB- zE+fI>#`=TT*R*L;gpILBA4J42r!W5Sa)qhFm-%9Pr*ae`2IJ z;3YL4DbPoRUBBaAkL!sERL^OEJii{u|56|tkqg|0rT(}KS#{$$?<;!QcixFU`rABg z6ZY72;AvEXy{W=)mY4>$v&?havP`;N)W?Ty^jTQwt?rx8)g8}W>rvAj>#eSmT9^>p zs}?hR;=rjEA#@iR`v9{7t=b7djT`;2Bv~A#7@7CJFZl!F>sIr1Us2$bK@JFy#uI!as}{qm6Z6`pU#SrfS3V96v((+a!giY zhrkB8^^*Rj(>1>lI~(EjG)sa9_~hILoY;G&n3Seh)+*!|l%>E~3qK{g?KbTypNs(< z%t2})@Xewq<`DEZh0aoXKf>zfM2K=7b@)MV*Bs{c$-}}+)(JWfRsQ-`RVcjqLBRC0 zJSUfx;F;n1Bx$C6{girB|H!2Z1sX{gGm48u_NadU8WSthk8i;#0idYSI}^D*MbU=9M_t z{z@Qqbb8&6w8FU>OUtd~g;1`wDy=0ZsdZemRwiG;j8ino*C*v?TjKZsh>3% zf{gGZxY4#75_C5$=x8SypfljqOb&Z-Jsi+K-9@avctZcR`zREJadR*iETV+piK1_YkQm%IN6M;&7y*a~L>fLO`|H^W=V z4K20_mMBbabyoSo@dDw8O20qh#qyD}^+5-NdL8^s*Lj|c%A(V$iV(cF)N_lErX7fA z{;9M+Y3?%xUAkVal(&5Yio3Y$E>&&Mk-i{ETpH-2BMDBH@Z+>Ddfd)U&#y8(&Tc2i(V^iB4Q2h=9Yi%o zqPQg9yFK#`!nz8_r1#rKa<-=}p;!z>4_)e}UpWfASBWeWGa4)9*XxiwT>kKIN6c zdV}|VG$akblNX)f3dh4;Q=f^0Mbw;9U|gMQr=*AGqMg&kU2|vqU8#uKSsD$wiZsLM4lvs>5H2#zwYA=&00azxgfF$Qyu8sYk8V-bf+t5=V64GDNrc@su*mzI_iIUD`r%KP@XA$b;#8y}HHY z$d+V@_!s<*SNK+Jr9>GLW(h~N;vN{Kn5|ch+w@E~4SSI82n;akvx?h&ciGo^?~rkv ze6AVysrJ~6c(!@{)j3t(!IFB>*azMSq^Rk$+p|++)%2~Wp?R*rM{&x-u;G{uP$b~l z;WicBjO0;;T42>Bw!=SF7P{dOEJi2BVK6JkPySAbtR|HpW;Gvg%k>v@aEbZgPlN;8 zuGJ*9VO(T$SJJdM&ss5nSd**9IanCp$yMA*Bq-?YtdvO0J*}9OpEn#4>?j%mP++<>yO_Eft-&iv~gh#2(D z)-%052nW$-b%Ycnu(u%sVz~!!nTXJ{p*uJ|li$&cql0}VZh1HoZazylOXJROl=@7*m->aV8T$Gj2?yG7g@C2xzzrgq&lis-?P!C@B@ znPAYU#m;&Fue`gc>*L^mVevN>QH#qn$3D{Vl$K}yq%o+bD+k$lad$6=4>=z$BJ ze_p!F8z474Enf1Ut-t0c!eUe+PySmYt7uRBW`)A@A+ZHCBa#(c7d*Yg?{xH5N~R%WEbheq)lL{y(u*|L%Ka_n_cl`!gij(Tm)C!cJv^%-du7 zHZ*xe%jlDX~r= zG2f=Ghv5O5IPATUryobTZxZD_RZ*hbkUkQ6smt@K0n#CcC9;bM z3;WFNDXaiK*H?ys%MSy)z`OvhqI{sMN!$+z#?R0n!U$Q%Gi0}$Xb%KavX@uw=hmkY z9M+XBcmDCHxunOETdAtrkbpOywBH0+s?!)ZbB9CN_=CC1^95~q7Pz)U zsUVPeMk{MxBF(Orj;QXh} z0Il}P-u{fsNXM2x3wL;ugV)?WnE|!D+HHHhuIo92Bv7NqHm%7nVvT~mNeHwa6qB7cT6^MvBVO>d^xP>Gw&VZI|n>aZ8bz|8CvBuJ31 
zM1vQ-zo*fItA(qK&@RJEWEc1h@N;T>|2{bHKnXQFa(?S{um@J1x7JSAQ$8Q~2TCaK<5w&ZW+fXZTVpRxOW#RMy7XKu~&@?yR=j`Uml9V8XtJ z3$&OY%X6fU|Hk)B?5*}j@}NgWTia;6InV0#q-slCrE)gwPkTVshCE<`F)70RNwvA~ zC~|x1AV!W}zdQm;#0xB#Ez-kFwH6~67}j#EesR8EmGrwm6q~45ja@OHbuA~pXMs^; z4C`k_G9pmSofgbhzn7x+T%5wDeu&(8rMu!iC z^I;u0M59tz6m}qmhAH1i41)lyDle%ETOgifJ5UN9t7yqA>(;umnx^nR%QMV+r-t-j z1!mM9-B#yF3r`{I*@O!p2x5n_qF+}w%Xd+;XJG@Y3Yk~pU0SC#@k1k6bBEVBrR?@$ ziVz3b;M-x7ZmIO5y6UNK*TdL&f9;>(#(M`~V^QMm?bv5U>nG#j<^cHiVUkSt|)W(42M{DW6FIf;p*2uaGX#i&w*ax*{= zG2klpUaDjU`14l!i!tCdLDiEZ-2lr+6Q??sXNr* z&_7cv*6Hvsr!0sM!XVuQXP8rb5{cPoudHV*T1x19@eYln z4J9vHP!R(c>??VO(^e7j;6~<{5kVcx8hk%TTF9(k7AB%e()UdJ-$ln=S8!Z z`+_7y%pqJ;)TKSQdTNOb0ihUmp8K99tyiR+FTAs$Yhg*YC?jID@Jw7l>Mwkl5;Up` zx~*8T(yjLAfCt*iPC)e7@*;7QkCn2G!6!L+`A3#K;VCTEX~K z7ik1aXPcjZSE`#;M-qVdyEn`mxGi6a56A!pV{Y+DSx|jD{!xkXG(+Th!DjX;0pdkHbHk@ZZ6LFA z;8uBz7Hl1ETexJY0UO%J(SC{L(K&3_H*1?-YF6SGEw?sp?ew>^&X~Sy_`AQK7dq~a z*U^zR$q`~1o*X8na8qoFg@%;?OyCC}2`PBLO%+ENZpjhTYsJr3FSU@QFNuPP+ z)v-#%w0yP?1Nro>?`a=*xkzqZ)(*ZhDYWM?dd&(q;u>)Ll~Eh4;67VH@ZQp;dDe<4 zg+TI^WwR%pA(N;)h@d|)amx=IOA!Zi`|77pkBoC>7Ollq>t_0h%%ZjfcwXG{=nvB< zMU(K4SaA>URjFef8l#|m5{Z<3>bDjdg6+;i(%*cjpj1f~N5Cb=DaDXV#?%NkOEw>o zcowFij*51uLx$D+m3+DLyKq44!Z7AQdsAN42^Zb5RE1+!8g}l#XPl9LQj4wQvBtZ+k*<@t}&>*QJ-q6C)TI_-yW*T#Z40VpQsNoS;$UaMb< zI|?xRaGfRN(_0U3MU6RQ=Y;g~ymO1Sm31-zkkmu{ZpJ5|?!RNZ9T%^hM>YDKgw(oS zRW}p6qV7xl{ggc5z=A#0g?Z|A6~c+bRfBA|S&TdBCf?-&bYgj}okE}70_E;dcj30STqNtW?G0$~Zb4MrL{84goUgz_xjWe=7##K}{t5)sfTShUo#uzC%ikoW zZbpJv!DF`u>R0vxYqFkwK)M72rI?ZH^qz%tpzB-s9U~fh#fZbAW*>Y8ABaXU=;5&0V9}a*-&9&?#Kg1drKcnjKCN*CY&LHDpJG#Si;}x_ca6s!`=L|`1MSnghl0lrt zz?>jwhPsx2GU5hN@&r*(oX`#vVo!%@j$sq>#4X@#Hw}?CHx5Oiblui!5CcA`X9ZhZ#!sniAeo?}_-$)-B6qw++4X!fsOMSLz~V!zSfe+;*t6+2urL5O4jW~!G-&uZn! 
zLHY&U?qc7=in=gfjKZVa_}!GGKx8>C>nrmTH|lM8 zk`xi@8*`s_P7ryMrSI(2)n%Qmf^&038g@I~Eb;AG2k=w1T*OETQXC)BA^nqHWoHxK zp+kP9$d7=%>kg?$hotBT9LZ&s<1M5|oU_K3;e(DG-za(c@Ob~t)Vtio^)GTBerF5A zsG_DVVM-O_h?|1j%34M_aX1+ZaGKw;XX$4MVpbQUfUyNoi=U)ULDMZY)%Ayv+SYw)j*ZDi%8yG1VYHZV5Qv*Up2+~0{7d0I=iSp< z^Zx9E76>|0C{+4 z`Cwbpg6Og#8BNvrfr_{7=n=Or!G%@_pi19?hi~%;CfS}C(A=Z^lb>(fBo*`OqvSl# zN2`b*Kb2|(->uRFz{MjS4PVf??{w~m!6LM_m#i-8o}(@4?0!b5kM1r_XR34>?yfh5 zL*Fk(-(fA#5cCBhSOaPWy|Fu<8t^92u%}ZQ=s8g@9_EP=&H=v!U1U-a?I{CFFRc%5 zr*D5TVDw5x?ma~+PGN*F{SAE);&nSY=OZl3`^lp?q2O3=8ud+(-)$uJoWchBzOY29 z)|)>uq;8uo<{pBR^#z4p7Rtt$OHiMZ{mqRy`BuHQF8tmk+WF>y;GyQHlti5nH2+d+ zf~-zGg|h-{!WT5pHP#f@s$~DrLp$+rCQQ_tSsg=nRhw3I2M2qFiC?#E!=35#w!^Y9f#>D6m|p)?8}pj0Upb5AhyIucnPQbjKY=} zJ+)-TzIZh%u8?Q9@g#ou?phEF*JaK=>$}d^OHehii(#9W^4CTS{9!(n=mU2g)j&_- zJqI;BG7735#;!T4ov`3Uv?u|V;#}q_MMvcXB3Pf@W=fApd0&dgMs^Linee>x#Q3Py zld!>CB`>^v*76dwKbiIS1v(nf$`V-_mX1-WZmqmX?DU$ho8wP7cvWyc_@C=Z9vsWQ z)~5fScYHzCZx%K$6;eLVfVhw8a~yKbZ8qI&4)?UkTChkOxowi6^a(_fN^=QLM3_A7 zV2u-RHC1pJvnbw(>Xk&2SkrDx^W#kiuk5Hp`s#mcWe7Xf~ zRK`Hz7*@e>vW18U&fO$feOvNed_fY~S*B&b|?y=a}4}Vn(5}iHk>We=_>T5htVT z#Yv?&Xn;KY4;1&Gt5+W?NWK?knL^kF_Ga}gl2ByobXU|D5PQy)n$urjNCox!1a^^PGY2 z0>B4jQW~=G{INHG?|B9>#h%MT;i{|bgAmVHP#-N197eF5)w zgQ?DyDE&X(6Hy9eiSw6?cr-H_X8RhJfVjW5?bacY9RZ_@HKt0|puxE|unwFfpx#}_ zJcbrBbKa6m1G7C%S-h;o?h{&38MLaCSW6z}77FIS;gZRcr^TZAna$erGJjzZ;r~T+ zf>km2#J^E$R}q=NAsg+NaF6C*9luW*qu0{1{^;Cu|ps>uo4KBXa! 
zjf{@uwqF)gOJC2Tgza^a66k%BvYot}6~rxa?$CYS?dj~v|7u5_%~gOO8DXT__j<-2=YU&gCJ0vA2z+@raLNsKWDT zjhXKj0sGw}@Sm;wbJtm3)^B~cj0%d@*o|~1)f4;51Z{Pg7GQF}F$6bMSgE$Qfq;KT z-4X~id;OhkVVq%hNOO1T+6qBqymzUeY?JgiF@M0ve9csE_Z7Asr3rq`Q&jhV3{IkQewLV<*^Na z=n8~fCE07x!!PS)jSD{=Vaav0iAef2v?xs?{dndB6YvLmM<{wYA61ipfOJ}b0l$eNq0(>WwKSoNHk~Zu#E)pSm`qEcw8kK zgO_>@odOhxbO}nqfJ(`Z42vx|3*%a4Sjq}lvNI?NqXwElop>|j&7H2P`!h(+Cu0f^ zv42Ra@H%BLa1}Jb#cNq9Kx4b2Dk@R!NOsn$P9aw@9w5{e$Dd(R9^mE%@CrF~5h9Wr z)0nI`Tq)F0T&4)s_z`mnU_9K*)-kCp<-N)>6O0gwaBmi2XJ|FPFQ`!CvM=bSHxu;` zHc9iFG{TTpp@52~Npjb7VgnZC_rvF{_5MQ`?Ps_kKC#xKQNP8MUpxxt#@T!j^Q}Z$ zDy|hn#;2|%@nuJkf5$kt8t(DAX-`?wl>7xG)vkW-uh_VbYo=$3d(Pw_ZSPJlfzsP# z->Xnhny%B|$Ld8v2nXv5E+Pkw32#Y+#-w6RV28a$er}bL7gOHv;6z&nU>1tcFkLz2 zQfwp0PxizD_ikO+d=4Mu+_6Lol?O0r&QixmU7@KmZlrS(s@>^n6yVB1HZNEZL1mo@ zZgGvEqQ^92X3Bo}E<6ay_{pn13R#X0(D3q?aeLsPP|rXrOs_VXSsi^leiKhkBtHW) zN3F#Ukd8FiNckued>ZB+76W)mh+dBum=(J3+^@tNROl6ns~Dd=(`lDr<)$!jK3D~F zTCD2$cYe+jIVJ4z#OnNc=d!FTCqhV_Sd1lW^hVfjcIa?o>V{uk+Hf$e)T;ZNL1=u- zE1V?j7Zvd1Qi6yn!Jija%RY_dzA`T(=(T}OKAG#01>7?|hU4B;)B>{?THRl(T|4WS zo%p4>a8d0UM(^I)Vht_DdCIYz3n=CmmqcQLnW}9gM}I1J^}H6m$3w!%Ja4%=gbm+j zXhmgZARluYc6F9fUR)6M<-b6jCXDl`e`w1^NdBkxZ~#{R6+&5eF3{!=`Mt zkjgZPUfvKMnK~m80=xH#UkIwvpIeDSdqgcEaHS zl3=l~Izxd`2t@Y*hK@AT^qNy?_%S$?M50b;)?4)8y!+ft(Y|#fv%vTFn0f{#_sZW^ znF0IkA7#uJkMC5LNyuME9FC{USe@My-~edLNXRNmewzvKjwzNFY{hCqGsa#cKnl2f7rX*;?UwX_+}&K7>_dW3&pju>$nF-mIuKAzQX`VhSTgxgiJZjxStw zlU0 zWZUiw*)F?fxH5@Gk}1Q*RQ>#HNugexD)o3oTUzjdt`Q*fmv@<`7O(v!CSO2MX^Xg^ ziRwQZ{dkB(VCl1Z$T@8#566Rjg={$&VjSxkuZpf?#M*2t(s|59pZs?sSW0eXA-xEn z8A1>L%ny2@O|k8)JjyC9RuQ@qGtOUXw-x40?L8-6VCoWr^N(>Wbg6I^X@!?+OeE#L zFlzZ9VLmuqLlwij!)>4B1dIs1V|pakPmWKPT8!XYdu0Bo>5d2L?itP&!ioK}8L2U# z*wj>*R)cF%?&W&kZk=Y9CyUcFg^Seq`v5q?%SrcG z`o0FiEvQxFN=Lfxw8=2G2hej*BcdcV?&f|HluChkk{~n*YG;S)*NA9)kwmR8w;u@go*TT>_PgdEW60u)i_XsbJfr@}`Zyd?88 zmH{DTBxhS=^fNl9Bn?NN9o!cwPc~LqyC6&&gu{-n_qi2gTI>1lwW|_Luj(hBEp|7+ zgilX_A2r+{E&z3AWhuKNe5ZYUvD_?l;`cIJY&QHS3NXDyBmB79BW@D@6pF8sxa 
zgp`T^o~cScItH#3?ghyv0I6F5~{H4cu6@A7J zuIF4%q!t}6yinaTAZ{8#Wth9@_o;w6u?{6%aPUo@PLGuhN)1Qy>rG=7)flWGDo>Jv z?}dI_!ba*<>_*kZEMi|<_A;b!I~rCR&Rff%ukR5t0**4b?X#ZC|BNO7R*`MQjXU(RcH=v-t7cA<0| z!XRazClNbJTmS4V0UuS|V?Pqfu;fOmV#(* zM7&&f5#}DUA@u7WIJs0?wpY-4)c?Q7WY-JwcAYp=uDCTi_8h6&8mJ#K+Wl8Y6~ArU zC_^swaGASqIdYTWxS(5EZ&JM{2c{U0w-7^4?-{^7Q*uBBib||G<5EUzcJ7#T7V(}M z(!=JMj$8-hlc+d6{^6^7KV6Tp4sAllV%hZjorcwH!=cuajv%2{Z0DfaeYFL2X+u#i z5@Pn2gRP|tB$wgW`{2h(L)1>TE$1MnI61tqifq`>BKk-#y?C>o>!Yy@bt63%i)l)z z=c*&ggI5w1&RMm~gp0t!JjE;_e#y>Ii9YPy`*cjQ#b2mufgkgiot)f3k<~MP^2C%8 z3`W4^alBIcsFbJv&fL9XZNv5hTmgT+pqKZG= zumo^m7km60WtWdH*}Nk9W%aZUg;^$x^saOkZ&edOoqaz!1)Y$a7|vYntnZctn9~sd z0;4VNX!mJaSI0$h^sr5r(q$*X$RA?^%BkJ@`?WxBcV|4*Z}##iL%{%g48c4T2iu+C zy}?g30{m%0m>dm=F}IRcWiWHIKllp$ZE2l3va3hEyVV!gG?KK=SYOWw7RJIQxg5;rb^( zsr))MrGd1iu8?tfV{v^UA??}X(FzDd2P>*t3d_pWPM)9eXA$d#!tAaR!pjgmJDlB^ z)upimq(Fau)7G(fj)oaTLZvTT69mp!@gWLEBDUrg!748G+DrCq# z<7UAE!b#@9O?b3iVANrqa0DzaI^Sf-@JRR{g3TzH>A6kOcsidgeW*GksmG#q0Gb*& zG1XV{EDKnHz=!g8wbDj!C?33vJr+0=S!*3T2KLy`lm5LHI(18#?QJ7isl}wMpgx)fn`t9j2w|3g0Pz}PGhjG)Y=csOmmr@UjKoQV@92&#yW;w+y}hHxl%Pf;L2()WZ)5#=HO#P>^K z6Z7gMiBqj2`nKv@dA^M6wJE!zataQtx*ni5N9=LBy&jb*yO}^(G@;(FVG_?p_UisD zz{<49l6!m|<&=O>?6abXooctQKRqdlb0Js--2K-Te(q9E41RfQ>&Lz)4gEHem}-K++eZl3h}Z1i>#=B4nL0PB7ZPd5AjM-kYOqJooj)g@EJmL zLwCbg2xYVu)UZ!7lHZ|jWK2(jS)Ij`x|MlgcRknJV#fpn)6>8ET9$fKZvT*r_oNR< zP`YTes=MSI8RYr<(2H@WL?~jDkw>vK$LrTPH{83;BxQ0&_xOp@yDKOsVU29#mpPH! 
zt1OyjPBJ8vGHUSHk~LJFvJivk$+k+ZVls-nmWm&Rzbvs9*y6DlGA4MJ$&}xx(n7j7 zT%c*2KfP-x06##$zg=*IM#`>Hao`-U&+@2KtO2diEy79%*C~<4e-%;RD{p+C@t6~) zR2yh%L3NU--F)PodPx>akS?zHpVLj~kUaNKf;XPW_FI}X34n!_L>q&TyfoV4Ys(*) zmB(vlP%tF+wTpRdj7h{V*2vEleeyF%s6s{Vd(;i0P%T1x)9U|%9+R$GQy+#-gI!Ne zAguq$C4+rkCii%gVos|gxciWj?yv9!VaoH1Q2dY?wooz0?DqyeNIMn@e-by=pFDki z*)4;{>ss&p{D3^;OB!lhCe2e%|E8r-9CEvvU3X&uWw;eOqPtgz$DP_^fk)4u+Q1h3 z8;vn|zFunTthMaDFpJq5<~T!vWos>8@<<1+zUk^7$SUIMDu^MvNU3vXZ~e7UPJy&V zoUbw`71TC=qM}9(WmFGFv~pI8iT|HlnCHA=3#q@UZ)F;8CU)uI4Y_!@+SUxtBW@F^ zHlA~l=bmI%@y)n6(Y+wuaFuiv+guB+9ie>)01In}5 zhgQsE-;%(`>4BSgnOZ5OG54aFnTs7F0PWB?x01%)0#*VHbn}LW?G&NtE>O^@CL168yKL|kp<3?|K5B^s2RpoPb ziauRVWDnjZDHTwADG(?;TlHuqxRQc0R{`-#{~&esUQes)qB5bLet6>UIgZC zkkV2BxX{dMAjEZkD7r#RZ}m-j#2SK2J)^-=pA!Ho04YG*08-=wuRPO5y$Yg zZ{_~%yF#Bcnu!6fI%PVV=9&$a;<(m#1cSj*uXtN!FmhV}YxVa|IGsu%Fh`~Br97<5 z6*X!1?ACHq<)3W$F3&-|J2In6C-P&RR`(eY zJOUxt-JM@j1f>>95T_@AgFSpnELTT6xzN+DGXb4FxP-lZI<0KyOWXf_2m)Zzov0^^ z5quTjUDx#78$1Vt2x041@ZnX2z7c0Zc7v-L=~F(ffT_v1iPUu9r(yg%dux|We9g&5r3TFNorG{i;Kdf~F7P?)ST zuK!x|bV0vKBzZVao(T(tCJVf4yk5+bRLW5+B5-_Fq}8xUSo+3>-?ncUp%x)%) zf?oT~R?+q-3Hp`NSobHjQehtjh~8c;F+n%i4Noq1Bqye5P7G>JaxHmveSu<-R$T$N zG%KJvi*)Ju0(nY6`Q7Cst7vC^IKsL-yN|JC!Ggk|3gFt^ww>Z5Q5a@bBgwviD-;>8 zd0^{Pv}6;HAm_}yq%~xU3*eA2gfb}|&_FOBMB=Q9^6>Q93B^Fy(xyMe?R!ov*lB*)fCIeeq?=k);%C0t&+%fcp!5CT10#K<7-K za+utkxWsXU>?HvO?swKhFNgyU#l2P({tinguaLmzfsz7lueN;k16wYy0;6IgdW@$t zd3gz+bbEK!`^~AuYic_vs!yP&9pg)df`CTs!iFc)#eqbo18@3K8TpZizGd?J|9|Q0 zusVE4Iq`tTMqxG?L|n~yVS5M_TP6vfFo3i=LQycdUtz9$;Ek`n@{rtrZK868MVkymn!|L-kQs0uIG9T=A$XP*<2p1a|5(y!=FO}oc z)Xehl-uy?!pA2;UMxP}lqT5RzJq;~_E(HYzTUY?*yu~eQ@)uEp_{7LHGhX$x&_L~y z_F(B&M+9&Be_eytWOUCPW*^UX`Bp9MYS13F-(Nz#YGZ9^ukpW0XVV$jOt$Bdm34b7 z{c;C4-lCi1uOZyW@o;fWc#F}z6E8YwfMH+-g3eknt%soBpk6`X{d13103`#?TKEeR z4831+C|DdMuTdg;zeI7?E%=aICtjv$#e{c^3@a(KRS|9f;kB z>JL~Fdj?T$F~Ubkh4tvBrh>~^{J97NJ#H)dy%!LU*Syi!r4Wm6mBdj!l&F=qnF8hs z3x;DKKs{iuok}9y*S6sVR=p~(Qzy`4so|+#1ssCigbDXXE5qPv>Jof`tp%eA 
zNafur8_ln6yX`WUPO7F}OF|Qu4)RTmu*)AfTh?f@#Fc#XNm3I)40X^d$Roc0(vs}4 zX&q^DJc;SipHqgti&_l8aWRWwIVd})w)#@LC$LUfI;osoH_!rpT|5=0=-;V+Ze}IN zKUKEJH%P%0;Wdhp3wqE1I6Aew zWZ-9p%o5m!3Y%brzsTE}#GY-Jm0VKY5(47k5%~_qYl0aj0eznHA0PO(*0!~>Bz&e| z+5g&00u<0FDML164N8*JrSn0ZXgQgX`U(C%`EMdr7OnqLj?p1R(L*Bp`zW7VU^qey zbx!5@jZrSaJ)Gz5x?S!)Kwz6uRp`CzJh3PQq4VJ_2vlxa{#or9V+pc6e`&DbPmSoT zpyyj{FRIPu)SZX^T9J=w1{FYQp!I8j)d%(;UJCQb)>Yk;I@a0@T|jBTHHPc z#vhN6SHzSJ>$7$B$s?Qt6+fm!#*T&jeO4ILGieLX28Q9gpJ|r)e;wG2Cn>c-ftt(y zPHrSSqbXSsUPeba)u28yj?95bFO`avI!h83>lfrO@n@-(e2A3i-akdBjc9$hTZNwi zM3t{K{15#O9cgx%$Cq0s-hUx&z(5ouJzlM;Sl`J#`@HhddvUW3%D%TNl~a+jaBG;s z-n!)LvepS1qbmSY3!ZnoRUDK|GRx^j00GTDiQ}sLQ=|hL9@&WDg4Lr4O(5}o@03Ps zj0vy4#4r#*0Wi{jMC!E31!$qGfZD{-95@lUEOCpzX?aU$v` zEh!F=ErZQUugxM|h{St43VyditBC}TZ{1tuay-NMeCb@t> zdEUI9J&Mc{rNWIHRwO+-!zIbsdr~)5wIcvsa}nWwWwS@u+Z9ExjsfN*Bk^#PU+^w! zx8rE_+QB$W+%exq-lg_3iZ&PdveD-ap84urc|)7??{!@m~_@( z$>}y1#(Z1n`H_u`B^+J~r-j%}B9+kH;U96>2WF?>roQ_mD=its1G0u>V+@V|)Ey9t zsALi6$~X5EGYh()=&3=Lt~#`B2)@+h>Js zRH`o5p6xPfrL`F(fN^^LGtKQR>uZ>E!v6c>-#U12H|4$}k=L0JhgGFP%f`2t-=6%w z#)EtebJG4t>Wrw{uNPPer&qSBU%NXElkZ7tq5D3kYiHW~QMTQE;0082qg1$bU$X4O zp}k)m+8RZUgbH`!(xg2kYLucf&&ob4W-cZX*mE>sr?A@|`>Q-3_3ZP26cLFTs!0?f z-0jl_fk6-PKNeuxZpL;yM!fH=K{aB31-Ci1sf5dkhK6xi>*6_Jzq>BjyU5pcB*{tA zizq>&HPI8ztIWv10n@PXDq(84nL=GCPLK)RcLVI#czg}$If+uktml~|J^Qhwk-XK0 zbi{PlY!9E!RB&8IuAm|%f>S`nd;Sh~EN2GEt-k(L&YqWT-3ttn$Ao@zaA?sZdAGYh zEAg2j8@UI+(Tah;R$wS@oqf;gAx4jieW<@O=UR9Ehb|)AO#jNRyCDb^Jo0--spr6q z&SxTCqX!;^zU0VKn8koo@dU0SwG=B=!ep%o{@vjSkA@lFjI?D^mCd6ZNZh7M=q!=1~W3npSTIZt=Sk^tkb&X zi>CAzGkZQ}L9=E6lW4%`UaY4qtIb)E8fhqPFn(3}8i3x)UNj{mH?hzr7S*R}IJU2r z_!b|sOLUFAx%jM?G)r;Tba2qT@ngO+)emDj1}XDB0h^oxX<^X>Ab^1w^`w<0JCeYVc>_{@Ig_f%T+j$mC6|Vc(wOanWcZMWdi}ZCm@b>6C9a0^UiKoKB^YqrW@KsIc9iV{)c1ZX zgt1di|E2UbUzop}zIAEQvWWFkTS?2G;-F94PTR1t0LfW?=Xa9!)B1Jf>li>6q)F;` zV5U3Bv+c8r9QgnjdlgGYq>{qeMBtt#T&{Zrml>$;aAq_RU%a5Xb&E)fPzImY)75(~ zApMRC-ra^ZgUK$E!nq_(bAp1wTsOct#a6&O*&y`2cC6KkTb3E4G7 
z(Rqx0-*Rc05vP5rgoW@uXlnb~7JuH3Rk~+@k^9PjLHz=N6JAlw7QWnh6hL6!y1ZRf zJl6L9j$2ioeB!WeG^0?z$Xcy)AcU9#e9j1t%vwsyon~CZA%SR!L!DjcPz1(1=I_mS z>J3ln+5&}{YB_0e*EKG1K9xvtCVu1y`(Dgs= z@NHe#`HG+x^MghT^x(mC+cdtv`3RHPSN=2ZZuCOjj*4ykWn3&=s7j|QsrcQ@`f2ia z;9~dz;%FG&PaNvF8@?s)UJlgJvgrzBGb)h4soK{f^3*!+_QVQ zGKjXu>szAZdtzNWfyKyF)<5M3j-K!6vnUD4V-X-gkeT@!^gQ4*NO`3d3AN6MoiH*o zvFWo>Nlm-egJdl|TeY?Ap~XOke<)hr$bA3(cYzO7lS;5Bqz-f0|DIT|B)@WLy_v=&|j_V%-OV28i(qz zRcb=)0|yTe*}0s)6y$hv^?32sBD@kANdu)YY}YRvGO?FAq12NL*EBaZ?0|Ocbf79N ztc}U%Cyai<(a5z*?e9{2Kcl(%8VkWoH>sd4LjESx=%X7FanMqZ)dldO!*j!pf~%aH z)Huy4wpYR!+??^)su~nuO`&Ax??>V=^{RAbrlUEt1}&eG-Nrz*bA@Nl%g?MD`|b)h zsv3QidF-aXe($KSJ*`Q}W6S8!GD4Uz5$Us4b%m>oMLfYg<(bg|Vy3f%y22yecEM zxX491A=gmUs3Pubu9sNT0TSnr4YC?oJB(+X6!+crTvmgQLGndfYCrOS^&qomu{!kD zn|5et@1lq;zpz}Ei9|-XGm?#Cg|03Rr;Kg#(gQ=SMuGG5jnN{XQ<_IZcNPuVeMtF^ z&XYW4cuuFZM8bw6f^edgijT&HLjptm=ML+PXRdF*#C){wz{y(e$OT3D&RdH0heWyUt3xBR<3!v_!bt!~8ye8u`=pz)_ zV0wTj0DiG7>XI?-wyfXUcC&ZSB)*V3-8RDG8yvJjErQ_#TLhvSb~(Y5kF__qXT_C_ z=p61#JaohwmVH$SgV{O0zsyH62fU4oL8`u;UoBUl-m6OIprjAJEL*&)8!6=Z;?)FT zvfccj9@?TwLbRc{y{<19Qc=M-p?stUCO(>PKwzs`6g$Go?LWANytV8@4ST8E1k%W+_<(9M=DqRFA=#%^^QXajp{9f$t3`oH4pLyZB42E=AzfYA5{d zbAp(Y{M6zF4ibT~_>xt@dklw49;KccHwG@U$!7`-n&T&foi7v0jH35N$VAM9QxSwj zxBowE1XCn7nrPCL#5Vs?DU$q1EBP*7HT9BMj^S3kg$ep*KGFvNm~eR z(s?>fMW8U=<~e*7eAjQs+dOBzZE~|awC%H&kTCWQFBS;HnNViu8E)eodcUQ9*o@N) zcToNG1WHS5=!0hGx82YnvwT)Y0d9D~wHg;8Z~mp9aT*RHdK30Bab|T(fElBIG23Jn zRiep+!8Af&42&g5-u@C<%?;!&8H3#F^|$<4<8$@-GM(}wAKp!&HBogOuy2F$zocC4 zEpGpE`<})4T);d5kj+rrA%sb;MFblk57>G?ffmUpw?8YzwW**2!# zKsqbnOyxEZK*^#p;-1mQRdIYW%cg{}=TlK^W+MJyCWygH9oR<-R%2ODLjEU3xd#{T zUvzrWW&Pmzu5r$^Zq+P(en5X+xg&ztRBj-nh?Q5YvWCYCWcOHYG9~TZ36AvLucpqz zp~6;yUw?elRsOlCy|+d!{+*7PqU#JmiF1i5i{diI8mA8a6BaBmONHyR=Kt0g@2!y) zMgHPnfT3iP@`;()cjq$439Bdkvbhq7-=2fs#{Wc$xBLyuf;gPq`_0H-)3a?S00>U| zB%qV|_wHO|K{xQzAo^RV(vy!FdGx?Jsu2OPD&l?lrt$9;4-?UK6NDG92{ro5b(Acl zqzu9jyta&*rF_@(FrV9PbU$`?mn$xTvlsGGvjMRkDHB7#8ctT zHp4G!84eAeD5n8o1S{xSh(}*wMtf59i$Z9@G8aN7+iQ!1*a=*K 
z%d~!R`@p!5N@Yz7g%0`<{k#P}_RmTE6uzwkwU9oWW8)}KV7mTfEZkCZ-H%=b(1U6q7ngLfpJY>rnnz_TCgtY|`Ufue7 zG_9QiHX8)hbm3R_T8w%=X`D1!kyD8-|4cDxZ+4?ULuU9HvO4i6MMLM#Yf7+W6@tNd z{jlk4;BN4-7TwSY__KAZTXR5T_~7x7IMLG1A@!CB#&DFQYt?~ngR3KqFk?s(d-Xv0 zjfzT7ldk%kM*PIt5-59c9hYyizR)l3U#<=6&A9&2GA)FUE}C(DYCUt2&Y=&n zT4_ZVgiG&kh1Rp_?BS32!W8V~vqSGW8E%;ogM6REc)E!tuRq|Y_?R`~!{Y@7305X} zLpO%6PNXZVu{SN|ZtMHIJ%9+vxGVo$t4==4w&?wT`tL5LOmCH0HXG+^DdJpoC8tXm z+6~hC7n012a3{q_HxxP~HQYpRQ6`*;P0dTR)QYDXG~6_gkFkezi~a_u2}Q~E) zTPFi{ZJuan;V`8r&53;WZSdj`*zHVt1Gu%T@B52k1Lxu$p>{wW3Ft?O(^uN;%{7kV zppsQLJ{sq|KjNdZ^)5zo8)of73H4Cq|F|x@zU46<*VRLkTltojX&TJxUDeP{?bjG^ z!VXB$X~|0U;5NH1sMCbTjagc-^V)&Z+s~q)h}7wuoV-Tj9c$@fO6=f*4q5EYK$0!} z5>yqWpG598Gef!9ZV_vK4Bbq0@!OZ7Z(>FYd9YdN&rGC0rC9SKLIq@13*_=uoDf{_ zWrTVgQ>x*Q$}X#^g!PbPLLDjfQGi{C!fhb4o~ba*_W{<&r|Aa1nS}kUT2ssai2V7&IQw$TL6uyt(VIzxCUR@(Tg!HG zBMZym)HyhHzD!}G4WSLs3?QjVy72}O`S>+$xBs3l(#RNomA&@85upRix2(8|Ro-V? zuXx0bSw05jYl8A!>V<8o@;(yX0X7T}^*?q_$+790}ni`6lA!Zx&K) z8)FK~g^Z^%66|)WA&k@qIZL=Ierj_|PGu7c=h`V=}m zWV9|}p=UqCoxYjXDa6bfZGS!qiD^=?m6RG3fxg40c zEF7Fh1?PQWe)b9_`f>gEM@ou4T6z{ERwe5S%(m=N$rkReyKovJPbM)8n|ce(U73P5 z7OK8-Yta=@S|DaEd6ZF|Zx)=xH3kY)7iBCJP}Hlx>XOu|fAS$>amL)sNO{eqv^xzO z_|pc|RMG?nV5Hj<#rxisqZN>(!Dyq0c9KysBy2a#?xr+d&wHJ81bogt!#T)UrGn0I zfBlu*&a@rn+jI!N*#Bo?cNr9KZX@_F?TQcxYPg!9DQ(-XwB78)p$vx%5z9!6jYBn$ndbmK+;`P^wYK#E7%o7Ffp((ji z2Q-rDIEMLO=R0{Bba(TTiDvpa3C>7f%p#T52`yWQ4Xlhy)W5OR6@sQzP~3!n;;p4z zP#badsCa8X11|SbDGGZb)!rl~4D!w3)erk0=vxOeebM%DahUs-CVSEKTBcobMeXzR z;}i$n|I5dbGO!UD+zeO{y!=fvqG%XQ0dlzP^f||zbo?;iF@ofUacxQq!OOCoI)tn5q{H{va}F{ zidi>2;q3*#{WvVA)U`)LrH_r|iXnu%`+TEp17_5zru!+uwVgiwMgK1JYriNrp-L;4K5a>390I{d8XnuRi_c*N_=^6oFP6q|pCUsY zJbUl(pWqw_W;6vV2A%EmVwh5%hcJEe(T@h zk_l{GSk`U>mi0{HJBiKLFtlbR6i=zeNhinH9)W?n9X|e9`W7r{hNNpw+Av2@hMr*< z4rFQO-UftSP`k0-K|S&#&*KQun#2o8LuZ~f!9Wl)&P+->5Sd4;Q_BW@&R9jn%nO+% zr~ANpdcoq`q-1Lt2O;=j7ek1bIq{-+PdQ93S*5D$Bn-Y6RdJh->t*?14#D~gRKy?!GE*(oYj%&OO6SnS{>^$4G%`3V6XjKla%7_48578lfY 
z&IEJ19LZ}+3Nn5f%_|Xv2HWpS9qqA4@gArfpiWN}e$O2A#bUTbTENT~XRz;us&Td( zJH;i-kXWPW`kHE;A)y`BK)>vU#h@YJD{ET#o&YI&jBfmB$^J_=29C=h=1>U}8$sh@ zDs$W0befW`p*)Q`Xb7Xy3#GLB3myElcH#eOrmOn;wj=QoH$Jey(uNjZ`XkWjzY|h6 zgCOQmj479m3>MT+ND!DcXZ@AwGmw*N$R|#Z82m42#-&r6dFSTQq}veBsM$4h_OvSskUTC$S<&yQudG1iBH`u zt1Gd>0e?MKyCcVN(bwH)Ali*^`7kjOij^r&NYlEkVV4Tb2q5lPEB!B_PQlG?gZ7gb1kk)e`o=I_Ar4ix$I<5nUbxSdo3K6 z&;_+H7VTqD%9c0Erou<&wv70E2Bbu--D+vgTBccN^l&MzOti6+8{UBsR|t{tj}bG* zc+(f27Rd4F$`31rsdeukSCL)uSWAdVLvP=fZ0g&i>Y^sTJW-XvwkEASA?pGYn{42> zEmYD5gakF#&v>}>Kg6232?`f0!k#gQj80a&cTuD2P9198-vaZRDfw_Vl+7i?3WO(* zgaFuyS@a9zOr)-Bv6z`T$z^_Fw)AW|AGp7lSa;9tDdx!^ri&3#pLkJV@j`8YE#X5@ zOQdeA7qZ7SsCdGDD42<pyU>iDS*i(5k6TXYrG&_tLV?#>Gn9h#dut26jBVyp~h1*vP9!!NB zK|A5lD|Te?>?)64a{#y+_Xyy$$Yf~!^pYGkPBB5pZp6Nze%I09zBp$(DKoyfkuU(G z8(VrfAO82UHPD(Hc-qK1_bSrdXp6Elfqa$Ad}wID&e>24Fp%~9752yHItnX+yAT!^ z!Ha1kmWBZid``$cW}wN%jt6l?S7nv^XgjB5XEzYTUX^H<$eVEdvpTeVTJ3{P)?rAz zo8nq{iIDkBYf1D|))~TU=ws4*ktoXEpE_)vEQ!u&bUUDmo=fdWkk5kODx!aN>J1Rrg^`k(2q6 zKkz#iCEHZv9S1@fiy(}}9CBHSYl$0+C=MsO9B=@-^g*$halC%N(>_ACPfLC;vMi_T z1ju4zVoufa{kXwiRyEdS+|Cwh`EecEVSP+4Tp;N#%0(jp!r&ML7%W2~X zA+2EeDttO6sJDF!rPL7Y4vmSsm|8<7DVE!YF;T z!OrWe2arqgc)ikmvI0tSh?F^`BpJoTB9JNkUAq6d%D`adYjWuU8%f>9(&7V>>n1Si z{NJn1JhSDzS^`Ih!Jvxg5=~QlPH7bom&i%}%Xn@z;>nne_;f_ozk64Ut>|zIWk@86 zJ35qE>DHZw_IlSEUFi!>EZ=d;ChTwKzICv)3Msp-^@G|?z7()3&h3cuxQCp+w|#h_ z&*)^_+zw>VVr>crPfh2fS*->x8Hm*WEG1sRO%OtDQU>FA0WE;&tQRYE!ZeY0DFxOE z5#@*#kkjm94)+dfRr-MX&IcRkb*BZ6LE}$hMejB|<3oNNLW<^7C+oBUPeGCfKU$JZ zV|nF^dF%B)8GeMJZEoa^WVTt!!feF!AE$MVGJ`x zaN$iT$O4#FsDMW&e{Jbm3tVXmQBqs$7y`&Gkh>}-4__YbY03E((O$Z zvvMF$zrzM9>B(87p;_+T!bkG1SNDkHBg>e7fZ7Xcp$!TtqYD7Ksn3lzA> zxSUM0cUaLXVHhmM)?R`d$sK?v=FasLS!~g?42cN!x^dSy{OuGoQ&Ji#j=mrYU;$3- zEk2lE6{lNBoCX10Pf?p$Hb++gyVzutWo3*9x53-8g`q~KCinLHFk}Ug;b)vp-2cZHOm`sD5X_M#ia45 zq0t}8tbn;+kd?E-c6KzSA5-HUEoV%(P?_l~d5ONmEAkzE9DFKqeo4P^tZleAO_Xci zn&z8=Dh`aN=SSu?t(42zpp5UjDE)RAh$Wm> z(e!ul?2MtQ(31#5ys;owW1Ed?bdDpeH}9A9Hf01_15{FbQDIBj6FR*yHkjwMrCDi7 
zq-$Gn{UCCOwCRta)=a7C5>rgwnuQT<%}{pwmQnmNr%C#&#G4i5jy&d4HyBh({H?Uw zjQWcR$8TSHzv_$HavvLR!cO6p@xbLdhe0D^p62@iaKE(O~bxMK!lau`QS zCGAC6*vU&KM~6CbV&o!3a;Kx&VuQCK3(7rYNS4Z^Ko!Nw5)7D?O3o@gtPmDLK#hLR z!>w=D{n)U$$2?rg|L(>E@R_rLc);qS5dP}gj>%shw7udg6D;h_X>W8rSD^@t*(LXe zp}g{aUo6^b*Xw<|U1Kq@V=a}WzTWSZp<^tOI0_oX_=OdEgk{eA7zb1Sb>8YnR6^3F zohq~5Gq1F=tppC~t%QTiWQ5G>o0gG6V5@?wq{)RLcwFXimZDwkje_i8yKN}5L|zy` zTKrBduaDMfqFEuBQ*3(449X6aAjVk}Xr|0#%u_37E}!X2Cd)T3j5el>I5voml{ zeDyQDMlKm1#Z#vsp5l8`w-as~Zof``=ZHc0W~pw~3sfqdwV~p*fgJ|xkIk9GJq(7P zNXB2CfLcL;DCOTCUMQXiX5Py#{`cwBFlEnO(T^8G@Too?p?m4}PI8xpr89IF;HIc; z_p}nY>4@yD6zL9S3ZK9q7ix^VrjuK|uOp1i&piW^i%_DI7>A)j}zjIa)wvjeEeY=5a-F;7VU$>U6qd*mo74tg5x%rZwdWA z5)6t;J&$sL|Wb1y(WtIfoxyNW>bL<1al}BIg6$*DHfTK{*R4Fh|$+mlQ+Wg-Irc$ zk{z-R$a`-_K2x?Zz4l^RwdsS8Ld(Shn5u|l7$F#-LQL0ksZ?lY-AQ_&vSphHeUgJVb463r{{l*c#{YGJK*ttUxfgT z9hGqni`9DZKYRr3Vvpz7S+C?*KA_x&MjILXGIkHWB>EKcQdX1kvrj{L+dqpPWPaSG zSfAIpjQRSUNAyQFm-|<_V;)OkHxXD0h`^(MMX`56y>@m{OSbto_6q%cJh0r^lgZ9I zgfgzvKdKL+E%QL28#3pj7PRWNi?wg^9072}?zLw92E9>MT2j3h4|tpJB4Lg&^3Iyz zqOHh=)gXvR?uOsvREM^C5X7%ZKX%%|VM^nX`D!vofxrx7s4(uQnZ~R%T26hVM;yqm z`1^#cbW1~5hmP?|utW=N`TNck)_bg`eO|-gp3pWZjBUwgnn_S-%tAAiqOmMbYgBAH zWaDv*Eb5CPxeb}Dw@)Pb#(Pn4G4&IH1=21|Ix9&riSkwTXYz3wyFGRjkb9$Nt2FI- zWBJ&0nZK3vjfSKQ%)c(Qb~EWo1VFKuz6ah-F5)Osw0E?5wAz}Vo@O6zS5K~Bd6JZ( zu~AzPvn8$n&UrT=F!MD4ZvvzBNc<}spiF*@w@>Hy14ksi3%tb(kCe zj;qCOLEM%+YA4SwRFtl*85a7Bt?zE)#i67sdq6zburl`8dX9kDphFzXtr(i=Xslo!~{eT);Ok zjk^_Z4fYMJ0uuLCSq3(q1qlf>#d@`Y!Tj(SUAb!Tiu9ar(SkNhv+F8XKX7-R(a}CP zHREb8pZX|qPp;6DD(oSS)(ru6tg#EZ`V9q0s8xg?m2g_b`^&4P`5HiOATLI*0ozV6 zM`kD+r3udYhV)BlX6EnR-wDVTr7dD~AtBiVpi)Dxmw%EU?8QmsVH0S&SsY)C&!;BW z_X8pFwmJF=?06%8dZXevuHC|5d{)mN#aT+w%jV5!u(JoHHhNY{KKJdBf@pq^Q>YnD<;bJGFU?-RvQvO8kpBn%QQJ_|Jyh!<(tS%nm4DDH^ICLxV4>SA2OJnBEDjqGm zLof)#qZHL8n7xz3`b}-lc(|%LqxG992_%U;Nxw5dwKpBYu;TY0^Lc#L&)gt;9#s=p zU+Yan3+K7=VV~cP6c1yJ9Y~`ZKKhOv6LX-TP6LSNAbdO*kvb~Q+1x606~Is)x`fS( zJ~Z|GtqO{uozv3RLTmguP7mtoJz4$z4g>CnwSL}pgXu;Ih6|Nf3f}p`NUVNfnHymR 
zDx#yl9$2kyzo)zsoC@)Bwk6h32gT2;94vq`Z5VuyWwD$*f9%vGby7Zvi3Tl&ik`D! zs=yY97LPHjwr|9Qnf0}FH#fTXCB5nLt_WCnca1Ag+8+QqwxiGOK8W=TfrEn_w4M>K zS7-(>Ew{Y5BR*2CuiHM0X9>yP8wQ+3;YQ`=MqjWC4Z~4s{@-eJ2~-)YugpsS=1yN~ zqo(CB$&rC<($T>;kFG}7d6Yau=b$SMT?YH4@S5@G!y|jhoZqRWMHX|#OMDV8z1iEr zS9De#O)>O7s`2;ke7Tsw^jmeocquoGgSkn`5`VoyN|XP6N5p z4NKyL!azG|G86IZo#bgh`;lp(pJjetz3fTh$9+VuG1h1XOGqdH$n6creRksS-|E$I z4%3oMxw~C%7mqH~897*}Ea&(SpMEl`+!h2%44kMsL}s3brz0}M&iep2aK{B1vA$X^ zA7tC<=Rt$ibMt}lDTY@aG|StOaeqYxe^`TzxVCuAH9oqjg;O$%=|DEE$wa>F^{n69 zaqw}$8opId(;d(M{*!=Lol;4y=)=kZRMpp8_evh*Jew*6tF60J^2oxMRqA zyZ}qLcJCg7{SCa!e+Rohe{{HTp>wG0p68yiV%lnS?C}##A-ss7v=4?aX_FmBzlcyo z|NcloROr>T=2iJ=vR}+}-!DvuFnLU^OD^Hrvy7Y$4v~VMVATMX6fZFoDgjH?Od@vk z_K_`#ySDCymv$Jr6FX$KO?w-%SXbg!IxmL&+R;Ke*E`A{D619?M!ub>(%2+hdwM7d z{1$0fcWu_uj}w~-kkxQ7F2kD8_*^9UqK6i_mRjC%Zj$WelFn3H-0tOOHi_jtxzibo zemt1$LQ*K6r~4)Haj+NeJkov;=)aG$@OuD}e3W?ZGyGkyB>@lf7X(ib%ri<;Oa}?l zXuBt#{=$HlzS{ZuPk$|_f|)%!_v+X->mK?CZ_!O%&Rg9!X3C=_|ka2a-uOS96aq2 z1~oRytCdV*=ilC=>8oF%@sWw1BD!RzA|Kr&vK%-6IkCN@TqIi1dU3UyGC+%n>k<-0~xA6Q`*R?FTKk4L74YD zbuvo1QfFIeZ5LFOF1}D7dxLQEV8&jr zLLMyiv(VW5$&ugq6DnLd>~ztTAUqw4z!xamxq z(w=y9*Ooz<>~1#&xBf^gZ{&jHqid=0iTre7kYFYS;&{%TiqplDH11@yg(k~+nJyQ1 zZLnW{G+tEKVvvC~#zHWOFV9u_*=d2w<@YsC#uQN$EE+eW{d}23MDLD*;1m?+z;3ll zz1Xj2FZLDS3V+t=VOR;!+~9TqP;o;Mx7Uk{^WNsE9ICa>|6PT1DCf0E(b?>#14(vp z?B2461i24?{8R~_TA_F!{xU~)fg)y}9^oeiFi7^dlySlmAfksl!df@TJwS*PN@%dB zJ2rQw@V){twgk;?%@QA;LQ+p6nyi5<${;&~o=gM8|2Jx*J2MK4(jkA%8}TfhmhF5C z3P${dB^bHo5z^R;s-=WP=ZrcUWTtS@5!Do}?b)x3io-fek7-2VZ<$U)8@CD&oWZOIxOh3eUwpIP_p6klAdEt{>lCzPx%p>hZ zu9|9{O?CH6LG+lc8_;h{(U&%xumwd>p;ZhuuKzyNy;@RN2-V}I52wmPPuO_&MYT0B zlFCA9}A`=+ui5B?tAJhqjE$>M$D`n@ews+))dk3Zx@9E>Ac@$HkKfE&C{Ra*%u3uU>6^2 z*aHIVM=b&p%vLF}Q|CYNCW{}iy@pB!GH!gOtes7K0pz%{%4l7;f7Wgk@Y)4sGSGL{ zMUs{|)_Kyq-5FG-69lM4MVf$E*HYUk_Y%)M%nAfF2;w7)VClD*0Np;qzS$f6LgA#y z=*d;p*HcR+9Yp(oyVd~6ew-DsG0`oT($X%SzTp8TOEp2k^0EbCqTz^#*a|9%H?K)p z_yX-c^lp)j>S#Bm1e&YjAEI5F%Z~2oT{PRbk=7y@NdtX9$;AFlu5gZ6Mal28K}IPm 
z{F);O`voWCU%_2gT-jS1sN5nyb}GX|C=JL4h!#A&9U4g-@YoC}V}W~4K@DK|V!paTlY>|f=~I#XX)#-H+q2U6Ys!1(D9~3o8;&73 z{e5>;fj2f!21g&DVmFagGuPa?$k2y=;*tObA?g3gVl=ne$D}OQ}V7hHuIlsz9VN zg0eIhu9#i)#VG*lZJC`1r3pKk0^*GTtYbT?wWd%XuEgpZE4xseV+4n4MS4%9ujJ^n zn>I3>)HItVE84)dxMi9sL-3`5@ADUG+*tnevT|Qg!w2HiCmOgP@~kQ)xyyBO<3(Y7 zz(=dPVBqfe)w39Qo#T`RN}wX11h}vtnF3w%J%#Qrt2iC`q*PD&?Q(yFAaEe(&Z32x1l^l=&aut z;cV(pCwDnSH7Xv?ERKEweE=fn57q3~hK}DuLNYn6v3~pq02ABj$uW`;s-LzKX8NR}ENIUdq5&AK1lJh)?jIsmGPvPm8BdtDp<(?qQV8tF^>O=ke?l&Kvv0ndY1 zi5^ja?}t&yp4;9T>x9(Oa{qh`g_LO+j3uAw&@W{VQq7p-EUX=}EkPmn{j8IpACEI1 zxFfaGp0(qWb$2WbxD?>+!ic=}8pnrRZ=Y~UvQ7p?`g@pWxi6Kr4*HZK{ch%0IV(!_ zuz9l~U9Ds6!9RNnJa3PagNad3aw zW;3wKIawC*^n^5~Iotlm*8Q)&CeWk){w2!OC&M9f=K^>Q34s)x)H`~=HMpE|p#^vo zE+K^bmXO;XGq?LdHcOttjSgt(3(JMMm`wn~sM^Ch4xhzYxfvBLx~U(pZ)8DvXAkDHd)c2}z4OF%V{hB9`+qvhM9s|fV)b5o(>qAXJ^)Jm zvj%A=1oyrCM0xmXN(1y04bHN|=!O~ZuaH@vCghmYCXooa6~uRM&K@C>{wiyn%I!l* zdRvo7;kT!fVK4G?z&sEgsR&Xd1qP$Y^?Mx}wmjV`s{OeK!7)=`x`Gm7vz+3Nd-jo+ zS(-?#5hBHoTX4O`D`^Lwj3WvVH4B{V zR9FT|H32yDtvd&bG=J2A*vo8pGk~eV_{CIw07JOz7iZkW6~q4c78ubri=ipLeKk?+ zN^+EiU-S>8_4djK<0aS}V3-5&>qrRQ=*nub!row}y0@rvb%F!7U{m~d9Vt_04?R!m zIi;!w88%Ulz5pf0FqUhS!KuhD` z__S_(8Z+zNy(2A1FI(p?27=A6F+Md+8!>OaHgf@jL)8<{(AdrY)XaA8EyLonIyQ1zCX~q7LLI&BipStZjuc=SC{<8N3Z?R6E2I`49FfsNxoLqssK41F zJ;bC8dIZ=IDo$iv$hrfbtJ0~PgW){j?Q#a8nE(%%^)zsht)4g+cgl6&VBIJ^}d^ZD3Dsmv?hbi2)x^(xoK`}Skpm=n?? zKhmd>4)mmX{#FC*`;cnxatl+dkLxD((0#Gy6CVjLPO1BXC8yYDOdP)1v zlMOLrcv^c!3ji}M+AWH}xGU7$ZZ^Tz86XKK8MXi&IQT&5Jn!o5&{eQ7Od(0yHOPUV zHD>B8xi2{R%oZppADZ4G{#`X2OUNJvSG8(DFDQ>~f!;Du@3vEb6Rt!nQEB~{pYKi?=SS&J;-KhyJBZrT?smH5hASQGtzzCJ8@2bJCNm9p2OIRR~{cRGiTz>v)WFdi8u9Whj*(K$l@ zANP#V?q!2B50R|23%DL_(F!Ahc7LuVtIo=TEPmlxxvu$hjdN?Y{nvc-0q^1vO^a2> zqO-nT2av@c&H)yqJ`M4GTER;>x$Qk2!uCXxL0-VlndtqWZ;~W4trBz)*{vSU`CdE4 zXGE%XB|`&vKiHrJQTz($x4p~?!Ws%a4ZZhbh_{BdOTaC=>Z8@RJpcvI1a6Vbw7)HR zUutb5TeP{Iy%obAo*9RSWQ9mzT*u|(im2znsa|viM4Ng~VG3dQW>0JTy-^y&F9OT? 
zI7q@`BKKiF6X^))u~DP&m4Dw!xCQrx^jWq6c0ZOlgb)OH7)6F$|0)^=tCeDl>6rNJ z-WTI*;V2xR**{e(?#?W#df+rX6F<=K-K_jD7-|}7vmk%prmnF|HzzVNH;2OYRS0}G z@oy9N%=7=?p2lkF(X(I@7#=u-LY8ShAF<9sp4m$ z{9!7Yc&dxKVx#bs#5Q?(kH7Gj=UR`1C)c8XrLSa_aAE;!IaeE^Say@4~4#rpx$fVrcljV<}tp zvGYMXjH@g9SP1Qn zaHHu*fMx0x6R}I3g#+`Qu{^{NT&!NaG8+QWyEg1DmN_T604}x3RnR}bMt>TutpXof zJ>yz-THv=MN-ygBw68-nk6cjZT; z;-kXg6vuo(iYgbaw)e>a+WOv))}Ayg9SmKbLVgr<3h9+dm(26K6Z_@Xss!KLoTsqFbC?Lf?y8>b6$$60Xw>yu6!Y!e^laV%d zU8CM74&S{?$EhP$R#kBh3JSraTj(#=smCg)9D^$s6fRdvLj4(;5H1fq&~2FkbKB{vE1k#w#kINC#C4hLE79bGwW-N*UMLnsZ1Eqkkl_hLnBx=px))gLK60W&D2_SwIApqYktO-l^2 zBG;suMaFX0mm2zjGgS3xod-6e!xFw{#r5`mVvZ!0JD&G^n$3f8o^%WFS}|hH*xN~u z*UH-ph8#W;L?Izkj{my5IL%h4De5_3w*(+gX)Oio3Ann9R(+55X~)vW$tNwcfw6y8 z&o^M_9K)`%eH|}u07YI5+tug(6uf%fjF~kCT2MRvBaAta+oa-25-Z&qG~j8rE8V+7 zJPYm#a7U5!Bovxm2LA-r9oy;oojKKVLN&en64UNZ+#LgA2%4`w2(TF!VIe3Z!njx= zjO8SDb>)Ui&riUPH2*BsO17PwX=We3ENNGlP<9VLTnc}Fl$vnI5MH;I$P(Qh*2B?Ws!@MQ2D^=z zsjZ6@Jb8Axi`<6XCsFsPP*Sq$e4!VuDhk)gR7gg|2myh`7^|e z>?hagqoQoAk>p(tNB@WlL(zJW!LfsGnWPOp(}=WwThOe=YPz5^_6&Gt zCh1XRLOWtrJk=xAq(@MjtA7N-IXU{^^I-Pi&@4)OKq47U5d`a~qE2^1xe{peirp+QNk#eVWP646aWw%2DLE zEhgjeevI%=oIB+*EOmTZ>dTjk?=BC^IKJg+=?P8Ra@NOKvrkOl)>?0B&vDt$k19ku z;NG8r`%Xc^DsQlGW?uHK{&bJ6$wAy$NuhQ0ah}(*@a4^SICkEF5L9`OP8>P;cLnfGS+S=&!BvYx}L>Ite|7dD=!PWf@M-)|MtNGqtPt<7a@f7QOZyLYt_dX7s4 z4@@k}iE<<0#ZG}Xk%=vVml=@0^&i;2;(~JAX7$1iC0-LmIjaa_U@n6scdyMAdvH9_ zh=&G8q%-9m76GHZB(-Vv5V*}#?ylzml{(=Wf1zFoj?QP|%TZZ%y2Dn3!?*^^>zSFL za6hm6Y-cqcQWgL z>A?$EMev;r6=QZ?`wp(crcr(`eG2V&D_FH!>jUeDV1Q)pTEoVJ15~C4bke2jzh9_Z=Hy! zvrp+zy|7>Ua_N3zlR8A#`x2#BX;jweR0CkmyA`M~vx+TRY#IcX5a5YA=X=~-`Lxr? 
z9B(!#1@`YoX3oPh1E%d>xhy=Q;_G|L6rc_j=c7L%PJQ@GMW%9yb>2&M{^gszZi*q~ zwsd>FiZWb`hn+SuiL9aSiNUV_xa%JM=}#B7a+DKfa6Io8SxbObXVore&*Tcs$~BBx zS*u4~I=yURXQjhqMU2zezK3G@O-gk)#1;p&FV<`Ac;y(r1K>%}E2l2^nYvslv6Yjuq6(O^BOq0O17U9Rk}}|WC+TtPp4x3mGu>yO*8P$Cdals4 zzcJapYA#W^rdFeUwMr=N<28ZAoISEnX}U7NL{A`qSOH?5K!rtP>uFjanc(ln8(=%6 zy}g5TBPXLtiz~0kbPb7e(D%-8>K!;`SGWq9&mVVyw@fcX=Q&~6V%_cAZ5eJWkmQFG z$9ClexXtrrP#2szC zaPLr+8+8on+~txz7`;{X193yIf%&teY^}8_mRtz+7f1wM5>{I$Yameqk=9W zQc~o{fZyc!1Jt#XELsB`xHfZA=RUc*aSG`NM*#R=*pyNH0c4(_cB>Xf;;u4_pL2)j zj*xW_&nW4iER&E0^?br78ja2f-!LrZ4b8_j<-RJmzMFh~o=_W}S~H z`h86n5=1Q0r}Jvn5t%QP2m;RY)IE&b zA6-!&jq8OvO)ES0t%L=VLzQAtQE5!j1(a7K3P7XM{(M~#O_{8e2>IcqaKti|`@tGb zpNz{fDMY;C#|6C_4(qH9pz@XwTb0kgdz0-}j!7fyIDCJ48bMqRPy}@PfR$5LnXU~5 z+0*GSd?dP1h}@UowkLIw&!B_(_kZX;8;%dJywk`m$X#{YN!r2grZZ*!0 zPkTtTwZjc_==sgf4wL0Ej~fuN$Y1%aVCK3DQwrAl3A)+h8Qc>RP7y}u1y~1D4@wN# zOI3&E*Q6p>=8yKMZ^-5R6RjL9pMkFR=|T>T@;_x-f2vgW z5GPZYRQ-X*^DGcbwHf^Z5KRCXW%i04F9>Kqk{%%!3!6{S%4%+mDYR%V#wrjN!W&1W z4q-F5V`y7WDF@p%(2@9xn$pD zPH@fO6S?*J><@!9Qb+jhUbsP<#SUEP98si;#p80Obnpru|m<2@92f$3{NRqQgmV0ZVW zeSU4-&lJ=(4T5Gkg-yM@xFBI72aDUl#OmAZOJ@wswk^1Hfek&Cfa2uq$x*{xNT;yN zm{0a|rY4#vpuLvA(vgUIUgidBxDeK}p7DSyojGs_9_8y&=t|JjlXq7es0KSOn59}r znhK`K&lLDuX{=5CrP31qknF-O>P9AV@xsYdRL{L=C{>A~J|{4Mr9={9mM z+Dv8!xnRJ;^=DKeDT~r1H_&nV+$+NRb^^l@b>Cz3T0Y$q(`2U!iFc1Q^5paO>D~gk z(swb(-?t~V5ua7F&%!S7EBuvnmSepF85U!Lj^)G97ImP?r+&eIVuL0dGK)MU!7}Yl z7_>v2HT0iG#+uZmlBJkF#Hxi@h{y5HoR+;HTJ4b>$>Cu#c_*b)QABFevPx#cSE2ig z@+008lcx`1C;A#aO;PEB|3tl7j+i3V3!cGnUKNN! z?TkBkcFBVvO~%|3=kQmHn+`BLoooh6MqS1GWC?4{?LxWE^t0ttgyiN39EFtvYz&K) zLQ7TU!G=b@bA^08U$~b%;J_8w=nSrixWj7dF6GT}wtZGqVa3ptn(^Dw*h+SBf|Kxj z;8DMc%z~asa(Bc*Id)lunI)B!8_E~^C;F%s-wDFKAwOq#t4TWrYja=%Ehym! 
zN!r?%a1)3~uA2PWqJVtDQ(ZFF9cQ<323%uhnmiy5O+9HDNz}}6EkpwrjfaJ-qxXUd z6>DBaIqu4g`XcZ_jX09RFrV`jDPHCGUOIU)d-}kBrAJab2H0G&Ih;#DT{sv5Ao=B^ zO%3e@MBY-^3u;9P70mAkfY1P90rQ@;;vwDlAAI=%_dFt}#FvS9?eC)pINLxiZ{sOf zT4GIb21xp8Hrgo5%MIhzCGqibW%0d7cv=8BZgf-2@wi^m_q?o6#=q>`WSJ}^YavoxwRFIqdTC%J?R=vbsx!6;x^`A_*f@+72U+of^U#E74 zxrakQ4L1@s4TfHS2aV@(&@)rTm}a`E#kR2R>2fnTpLznRSyFyANIQIVq90^@IEhua zdnP5c+Tvqlq?=mI$d=-_X~NIET#1%zJMq8o4gsMR=dB%2tamwu}}lahv9U3nH7F|1XxW$i+T9Kq#+ql zst$$pK%^u{I-GR+mZ%oX+8i;wU0)i8Dv?R;M`3mQ^Bc&hG*?bNS3F#cL^Gn}ZG2Z> z<+J6$5~VZJ&hFj*n2TP-<>Ts1=M5Sk9~w3Ld#GNQIuS6ou02whX5zK`;f%~j-e67s zrX{{5q?7=?*RdStIpRT@xN;qq=Wc11_lw8C(*WX%(-{4B)6JPFql`4m<;!0($haEE z$1KAS7C1o0kE4!1wwKfrE-fj1WjsRV16-V8O4b#_gA}z041Waf$gFGo?jkXQ+i`xZ ziCPeWv*20l+rWk3qPXNQ#kV?YOfB|@jUHqQadwk}$fH!$9)X;St3v4PMTldR1CVxW z)tj_TiA>F83xs>ttyx+Jo#p9=QTqgsqualp7qnvBiQ3kAy(5N=V>Y zzPLk*dIwOSeGkP(96ZNG9s&I`_+MVm+6cK6UQfp?x(I5#kdM39)R=Do&z6WNG_BSq z6Q{Jz9V3+l4b9Q^3sj+{^o5{qc(QwlM~lvBc9TRdya6!T2sY|SHDZT_^>sqxJpg;4 zFE4eOe^DsMn`Eb9Thrq!mUM*L8QF)@UN(>nE#|j|luj8vG?XCE{yeko?9?{J7~pMY z+d-&CqzF36x;;&dByw;>aKWth&Gvk zIOwo!O_l+_cE%5AE+wU#!&U8BLtA;};#NJuv+yRRI0*Ym8n!*blqdm|t97@J{p!s? 
z9={g`06Ab{vhlXj_H3d%h@qR}DV7u8f;?9@5dQC$R<}ecDoT9?E-&8sP00O+CP5Qs zZeZz#62`Ix5TI9hNVX_dtua4tCcJeX{wAxP^5F*O{nXFuc^m?6XFwXP6^7)g}a(D?(> zNvuHB_a`nH;hJdL9gDo>^p48~lvQ?qHGMYQX(=|&$X@z^OznVKSrR{X{2t|znZZpv z%df*3jL>W=uX6I&{FunkEV0qPouzry>33hQs2Ci_B?Mv~Ai(FKVOOlI zF`iE3aa`G9%MeIXp&++eVKXopc=oHNMqw+y^-%G4IXt< z4UVD;$wIoJ$EriXoEc!4uz1aPi}6{%KPl#j5F08k=8iqqRr7;mr^CU8NOU{ZYCh1W z5*d3cIWxsqnD4M=v=$U?=MEtCy68q;CjeBMqH@q0dKt6ba`kZ{by&@qJhI+q?*BSp z20Fr?7Gs;1m%k-d2wS{yj1iDNRB<|&S>p|j9K?N6TKql?%D;Wbr1SR-+Q7;EAaT|3 z&W45T$qH=4wCqo*fe;i_(r|sgNk&?f?F{80!Ac;2cIpsVIL4cbVs+ayo%yA0z}VaS z(E~Bp5D1dLOQpqF!`d`B+QRLH-zw9aYV(CjEU7pjJ^Osmcj61rQH9>6e*yA~*;Z3P zwcb$MnKBigOutgG{c;?V-@PcVo{aK&6bCDK6gDuFW#^sX)S?c-kd;^}YsfwQpmbvz zlq!6kS0?K3YhG>BO8laDph1nA6b#*}PxJSU(uFaXaOk36kN(hj9T3!H%PP|S9Q^?@ zvAB~04)4{hIObgR3``yvlkiq_hMN4`N~5kHCRW_e99%hq9iOlaBiZE0RNwa@!YDSA zo*M*uH*aC?9h=!92ycE#9}sQnWe2OC9(piv#2XSL!R|BiWJf}rG&nEU#4NIBwY?H2<#h>fA{6%67igBEVYU5UmBehj;-GDKDP;WmnmC zE8jE~_D*>`q%S00SLMZst}u4!_21Wras*ZTelXGJPF&e1bm@==<8O*bXco?=XX|K1 zot4kaH^x(mJ+}8XoAa3ZegoCHrqZ>|Wek$}$`{yswVI}{eoU4nESgVY}<686BsAk?6GAMS0#Qn-o&d_NGa`rf6`pGwkYK&CY}# zP6QzEnZ?mheG_;?7p8`gd_Oy5TmoLMiA#U4Z1%9mik?;I9-=5+O=HI%@Fp994gU^M zdoyqiJ?SRPt%dK6!4NQ2XyP2R*OHP>GFKp5pd=;oAI(3hAt%E&nV3ha7nmKUKap(( zqIA%DJimk>pIb|kmO5M}ukDO!aFIY4kE$2}7akigo&+1AT16VE=2}wa_37co4gl<7 zKS~3v3}v-+P4R@(RTh)Kv;^4halS?brGs38~RqH=)ybW`VbOnQdVtsr$DpKGBRom^!mE&}&ihVZ=pz06M z7LI88+O2=|_wJH`=C*cN1844!+`37bXhUwTBU0}&75V@vX5RmK365iv5x)ghDPK_dkL}kcA7l}I6>4IXkEt}Rt)bOAD5Q{r#<=^HlsR{Ih7oq z$q4Q$I(z_M5!MeZL7U0$Ie>Gu_>}Vzw@D`qf6|&8Kk*fC2xuc{NSWPROEF#6fNgG8 zv0tc{=1g7X$s!GUPOan6jxM*NfE8`~SF*(sI9Bm}Nj3c_@)whs1cHiVFEfyOXh~dK z7bco1U`+)n*01!*dec-j)}7-v^L?1N#osL0oRGNLB14u=@#i~ZW5-^C0aVBZVBwr* z@4Lmw+@Xm>c^poSP7T>Tr=ca3yg5cjsrp9PnZ=6}RvlGVYD|U(LjqWdh8!f4xnUIs zg-AG^bwGZFtXVx;k_4_pBNg6 zyiDY6JtTO^SB=4^-+D_gFEq&sJ$ivQ?!w+Vit($2n}a{{bsS1eM0Kj8PQ%??Q#q58 z;QPE*$YPyuj%~*{t-lD3X_X2tI7HX1dIj0UdHdk+$_EDVZHCyO1%86tKox7B_)Gub 
z3FW0d*C;mn5N>GnUI9nyORE9EKPG~j$_5!vSN*C9!T}5N&8p1`vrBK`PT4g)L1Bf< zNG`u|RXd_am0^FUO`+Hvp64Wc%F+Nm;*S;;=Ct2!k7e#e^$V;hyGV9X+{#k^veAvA zEdoCz4R9~^UQe`pGD2CPr?5!?StTU)@MPGSXZ6fusnn|%s1uK?Jw0>ZAO_9)fTNs& z^T@tjE?0A8!rJCA!SX zSGvLpZyU(9O@^?h@7-UVKATWgXl&ikZ385iNpLNQ`PdSL+}JHzKwC0JpvQ+Rt?@5C z7-|U-`aR+aH;C#iNy#B47B(n7)9M;mKRf701gqj1C(4Xnz}4v8eL|}=IM+T63W5&N z;>Vf(80ymWJ9?iZk^+Xqn<^u`Fw||j6j6X|_yVFpP`9|F6`)9KJbL#-Lbbu6iao7; z;Y&BxBCE8S>$9HIYqIudI#PF^VTEh1Yl_63ig+$B-zYIs5*q)HU0fYBMCMe~uADRS@ayKT|$K zaz5-ExZc^wcCB%Lw(d0>bi@7Y!Z)g^t&jd!H>2f@b!0%G;(XcJ>TmOMh8-4|X^{EU zO*{IqLQ2iNfXr0leJBH)+$?Ez7&H$i4XF?>g}(P>3>e2pZj&?=i5f1A#Ot5Z2*;XI z&uQVqg;%F@6s;d*3_3VwVf$dAu&B!TZ_ih;qVE%e{SDa$Vmvm5^HK4?}!oh2HWA=X-TL%me*p zjBis6GJXaCZ6s~^Vk9kdVjZ488LVAm!f@yKyH-he%6P_(GVc8mdr1e%p70o1^nW4G zq;*aVZ58XNM=ZXYE+!y0hw>NR93i0y8~h0;_=Lrj!;J2@E8tyG;7!e;m=iGU^+GnA zw+!#-tc3mHgoxtw1E=bsHe5U3TI&roKyWx`^{H2NQ|DyXe%Ji442~)T#9Jki*1J5- zjkuiBFoTnveA8J46!fgZQlGM=;&!PAR>ZF1$}F)8c$qm|*=-G{3t-~gmXyV+cbDyj zIHlV4VxTkN)?$Q7?K+)3ZO_PMi*gof%gs~Wzz_4nBCQ*63>k5JjzUg*m!2969UHSs zEKT0}ieOdZKNq%USBI!Ml-fgzm#95DIPOy-TJs$#)Y*X?J(a_{-E;Sve5LByET5mC zg_L;d1x|PkyM&yXKi)8pdY$|E{hyEoo^f9h`*!%P6n># z_mg;FJx&l4=8& zQTTaPJ>ICem4Sg;J<4mk!A|wjaSk^H&b>{t1k#puOy$bi@(OsmtHbVFs^SIUWrjH) zMZ?x}nk$~Yxm-__5ychn$y-LtY)Lu_Pxhfo_&v(s}njS7`6tEb)+aX=F3k_VpnzZNO ziJHDbN>F#PtnI28;3WFcSIWS5CCt|Kdga&r?+MUP;>~-=>tw zY-VzZs3Mff!}+}DgcuSp_M2oY{e-+>Xo%&%(vPT8fMLSgFMD#M6VDxwvfEbKtE3!B zH~1LVAC_-5yE^_kwnYF?{a#fPc#Hqed5aw3@oqvQB4<|VF)`7U@G*KYv7X*te=D%A zSB}n8EgR(q+@;q^^%_4?YeBDu7|(=gXQ#>!{s|rz*Zk~wp{-Bg|Ja|w`1``4IRX+QBCS6w z=)0}c!%ee08-!gOi2#}DCn)I+AbWuxuzNh_P-np?YRnw*E-kYl=e##v(FXJlDabQ!2!$PG9 zbs8UXU!vw>FGSSq0Rnk~qF|!c77fQmdikHvuFry7j(K$aab0thY*;B@TKnWFUtovl zp#U>3jo)goV~okXj3#=e&{-)Gt(}bFm!jT-+-LsyutFC66@Ody7u?wBC(F`QBuU*} z`y9ERc{%?`zZVRFFFSroG1o>&grwyWbex;Tg3kATn!E48)fCla;YqEBuc!}E(KEj} zel21y(~>e(jWu6k7I^5Gy?Ko=&Ty~9U+m;e!tl+drp6B&`oU;-WQq$VN>nup{mMGx zbSPsPqrXuY2SMh!{F**nr5gUlrG3NfGC^!uHyv``H;b`N+Oeb!M=sxM)|$JW=2=~X 
zD!i(FL)#I+=qLWebttU4kHLkWvr`&-bmoAgSrbULI9u9YC7fsmoJ7BqdHc|2wWG4w zomtB0ZxV)5k+1K-s4dMX!&k+mLRp2pHHeO{%(;?cbKT} zyy3|yCVvbL_RjAG$-P!5onOwQVe#n4R=v!Y%QZt&oQ6iOCsbM26qYn`yan9v{?5&_ zDU!0e&-0p`+Mp<#=D?{Um)u8S3a$l>r+#b!O$j~WO>zl%NSNXl2qay1$6M5kqtJ5z zioZkx1_E(rdM_`eBLJbnc5(m;(e@U6V1#7I`YOKr1F;CK0gVFm@tW;g>Z66JY_CjK z2ZLjA%{e+HdrMnP{%u%mZb%niaVH1`AdGCUO_%h~n^NyL^B0s8E!m#<1(DI~)nH_u ztd4-Vi3(c1k(|z&-hu8IzKYf;+M2_7jqURm0~VX>r0pc3v7}bw$;_-*LtpL5EgiLw zN4805Bxw*1e`)Gh^8*hI@>JT|8`x){X9P+ENH|83i-z_A! z2jzBPubvg-U*ocTU^Bxo*!3W&=tKH5R;0g%qpExKpW5I0VjUKF+g^8VMl}n-j9T9L z#N~5H^8i0knR&sz%kZ?Ec_Oyet2i8Q9+T@-sJY%-&+&PFB@JNb2IS(4;H(@dJ5K5D z?>#{ydPyQFAp5_I!FkY4P1$uWD1LT2U^+4#VYV?}VY#$vPT4)?+^Bs(YK7WQ>a9qGYExG*)KbxcpBSx6g-v>f zYgd#gGWC~K@toVE@pqtIC3CTio?O{ADI{;N5+=*lPvsvP@%`bc>+ z7+D|LSs~}&3y7_yh`My(X6q{IH*3o-nkhD0UpylY(hCbk<||xfL0AbYYWTa4257_{ zV;b4?kk-#veV(`*?=h?x1TG+YCh-cEzR&zbVw?+sZ9+%+Oj)ZR(o}G205E_+15`%!RMkqnMG#8t*Vs>_%N9dk&#RRgtH~$~ z0Dzdb5*@2SzYT~u2^1x|j-NesZee&jCG+B-o01zMkFR$@)V+-n9N5WW61X{l4T?A7 zWMaMZ?9{3V^?GIpP`4K5KQ?{s+b-~N)tt$nehGaP5ycizCn06h3)aX2vB)bfe0pfE zQI;Co1&l&9BTbQv={`Yi>Ow%z+rRPUT;(HCX!)y)HYEhwFuojHsG0_WJZ3)qW7)=< zZ(AL_jYV5keIek@5z3uBSqLz2+AI0h72?-DBn`QVBA>AwMXe@`7*@D z6YuKZiZo&}krraSkDpH)XP+e@aO^zb|Lz7uG|ATA6mfkyKpW)I;H-Qei|Kj)lV6*K zFos4hw<9Fy6@C>H^TkKqaI|BAzlh*_As?%!uoN{0kc9{^z^4N)465K(Y-^Mn)K@q} zY|2|Q?``J*wTO%%_>#V7P!d?kZ6<=(O)d!C)zml=rSJ|hA;BpKuS4W@ z{>;eVEwu5kfjo`_F}$h;>t3G#YZj>5GulC2)AYUZ5yFmZP-~<(k~_{-KSt(GYZCO0 zqC(@>gAdTgF$@0%UFzSm+ycZYKY#FR%q0F=0l+p;)CC?xed{opV2`VY=#-*gc56 zD5o+cs3I{aAcnwaO0GE#URlp-PxX}!G{KV$a!@SXn7>H|v!8eIWTLr0G!Bo5koXuP z9q=0T?wU=)SaXT#`UszBa5oy38WN{OT%ZLe)t}k|Bg63?N3*o=ZXdg&z(tzp$NVLR z9w6lNSU}DUE0|SMC}rDoSQOrb9w+v2-TNhtyB=#Y67oEH z$X-AV-)qsR60qgs?@KZT^UEt?wpQP3L{Y4vb>W_!y3&DNC?ZRtt5MpuNCG?XI8DeE zhd}5;7flV`Cz3(#R%7L9Sx4Z#$4T~loDF$jKE?dnvY6x#d%-}j7hw|NihcV`r}`Bz z;`}~3u0-h-yZqK&4t~ z(x|w;4@zvw(b4T(a^7SHyrEWXsw0{f*YmwSyDn5YT<7+Z5gE&dx z?MCA6B_$kWkM$^BNt)mq&mBEGL4tq$s$$n9vYhaN#tuOrtgsni)oC^Wq#MvJfIEP*KCTfr4rbYY?K 
z;NK{D|63c*!nf*M)Txe|?h=)q9cnZ`Mo>T&^;eIptY`&(;_!4UJsmCJ#O(ej1=myp?+X2 z{_r0ZWSt2(#V>};Yo$_9s+aRY?+S=QNP4KXFPy<3@eS*J=xfym51|Glh+ZLL`Xx68 z>uX8UdvqeX7c6Y7cea|49OjObhW(Nsv(?r_;Up&Tu(!`B zm2|sETL(Hx4Z_7>6={KviQ`P}%JTS+f$N~249vRE=!v|5GpE_LS4GZNKe6mmABHx>Mk zr&cB&qW^Kv&e4gF=YNX?E{+zwMCQ)U_FVMzPDbV?HU>^~HWo&Xc20Jt&U8k0HuQF; zrWQsf^h^v4Z1n$ImEPXK$;r*m(U^#j$DZr|i{HQ45jfdf*b3XZ*gEqP{R`N?hls$* zz}i{a!t%hSqjQR{y{FZ&@J=XD41FCf0wG<_1pY|CwWBAtErcus1hx6tp(8 z`-hXc4KI5`y zr)Xy*swtvi_ur`gZ>|5ar7mT|;Uey6YNz0;V96@!DK0Hzrpe%}%IawEXeB4dBw{VZ zA?NHy@9rco$ZTqCt)O5it!BW?W^N{B=)|nTDC}UzLNDeiYb7G9X{^M`>>*?(EBD`4 z|Hb`(jG3xSanQ5K2%4)H8(0a;%9+?Qm?<3j&G&|28p@>#*-M}5)|Z}jaCr`fb}Aij9F!aR1fhmmdeGkvWot#Y9J z#Y=~}8CHU#f;4Od=2+UIY^orARPcR~bQi<7_A9p>*E&|4T6LlPh>Wa+z1b^jF*RLR zDp{`6lm<^1@15D$|5AFcsLpuzL?W*lL%wza_Mq_o*Fm$hR;ws$ySr>Ud~Yo1y$OX^ zV(75%z=5nZB?ocvXrzgMZdN{)fnCeFJ2(+@y}x}%ZKn`H + +To perform the dense matrix-matrix multiplication Cm x n = alpha · Am x k · Bk x n + beta · Cm x n, the full-blown GEMM interface can be treated with "default arguments" (which is deviating from the BLAS standard, however without compromising the binary compatibility). Default arguments are derived from compile-time constants (configurable) for historic reasons (LIBXSMM's "pre-JIT era"). + +```C +libxsmm_?gemm(NULL/*transa*/, NULL/*transb*/, + &m/*required*/, &n/*required*/, &k/*required*/, + NULL/*alpha*/, a/*required*/, NULL/*lda*/, + b/*required*/, NULL/*ldb*/, + NULL/*beta*/, c/*required*/, NULL/*ldc*/); +``` + +For the C interface (with type prefix `s` or `d`), all arguments including m, n, and k are passed by pointer. This is needed for binary compatibility with the original GEMM/BLAS interface. 
+ +```C +libxsmm_gemm(NULL/*transa*/, NULL/*transb*/, + m/*required*/, n/*required*/, k/*required*/, + NULL/*alpha*/, a/*required*/, NULL/*lda*/, + b/*required*/, NULL/*ldb*/, + NULL/*beta*/, c/*required*/, NULL/*ldc*/); +``` + +The C++ interface is also supplying overloaded versions where m, n, and k can be passed by‑value (making it clearer that m, n, and k are non-optional arguments). + +```FORTRAN +! Dense matrix multiplication (single/double-precision). +CALL libxsmm_?gemm(m=m, n=n, k=k, a=a, b=b, c=c) +! Dense matrix multiplication (generic interface). +CALL libxsmm_gemm(m=m, n=n, k=k, a=a, b=b, c=c) +``` + +The FORTRAN interface supports optional arguments (without affecting the binary compatibility with the original BLAS interface) by allowing to omit arguments where the C/C++ interface allows for NULL to be passed. + +```C +/** Dense matrix multiplication (single/double-precision). */ +libxsmm_blas_?gemm(NULL/*transa*/, NULL/*transb*/, + &m/*required*/, &n/*required*/, &k/*required*/, + NULL/*alpha*/, a/*required*/, NULL/*lda*/, + b/*required*/, NULL/*ldb*/, + NULL/*beta*/, c/*required*/, NULL/*ldc*/); +``` + +For convenience, a BLAS-based dense matrix multiplication (`libxsmm_blas_gemm`) is provided for all supported languages. This only re-exposes the underlying GEMM/BLAS implementation, but the interface accepts optional arguments (or NULL pointers in C) where the regular GEMM expects a value. To remove any BLAS-dependency, please follow the [Link Instructions](index.md#link-instructions). A BLAS-based GEMM can be useful for validation/benchmark purposes, and more important as a fallback when building an application-specific dispatch mechanism. + +```C +/** OpenMP parallelized dense matrix multiplication. */ +libxsmm_?gemm_omp(&transa, &transb, &m, &n, &k, + &alpha, a, &lda, b, &ldb, &beta, c, &ldc); +``` + +A more recently added variant of matrix multiplication is parallelized based on the OpenMP standard. 
These routines will open an internal parallel region and rely on "classic" thread based OpenMP. If these routines are called from inside of a parallel region, the parallelism will be based on tasks (OpenMP 3.0). Please note that all OpenMP-based routines are hosted by the extension library (libxsmmext), which keeps the main library agnostic with respect to a threading runtime. + +### Manual Code Dispatch + +Successively calling a kernel (i.e., multiple times) allows for amortizing the cost of the code dispatch. Moreover, to customize the dispatch mechanism, one can rely on the following interface. + +```C +/** Call dispatched (*function_ptr)(a, b, c [, pa, pb, pc]). */ +libxsmm_[s|d]mmfunction libxsmm_[type-prefix]mmdispatch( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + /** NULL: tight fit (m) */ const libxsmm_blasint* lda, + /** NULL: tight fit (k) */ const libxsmm_blasint* ldb, + /** NULL: tight fit (m) */ const libxsmm_blasint* ldc, + /** NULL: LIBXSMM_ALPHA */ const type* alpha, + /** NULL: LIBXSMM_BETA */ const type* beta, + /** NULL: LIBXSMM_FLAGS */ const int* flags, + /** NULL: LIBXSMM_PREFETCH_NONE (not LIBXSMM_PREFETCH!) */ + const int* prefetch); +``` + +Overloaded function signatures are provided and allow to omit arguments (C++ and FORTRAN), which are then derived from the [configurable defaults](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h). In C++, `libxsmm_mmfunction` can be used to instantiate a functor rather than making a distinction between numeric types per type-prefix. For lower precision GEMMs, `libxsmm_mmfunction` optionally takes a second type (output type). 
+ +```C +/* generates or dispatches the code specialization */ +libxsmm_mmfunction xmm(m, n, k); +if (xmm) { /* JIT'ted code */ + /* can be parallelized per, e.g., OpenMP */ + for (int i = 0; i < n; ++i) { + xmm(a+i*asize, b+i*bsize, c+i*csize); + } +} +``` + +Similarly in FORTRAN (see [samples/smm/smm.f](https://github.com/hfp/libxsmm/blob/master/samples/smm/smm.f)), a generic interface (`libxsmm_mmdispatch`) can be used to dispatch a `LIBXSMM_?MMFUNCTION`. The handle encapsulated by such a `LIBXSMM_?MMFUNCTION` can be called per `libxsmm_call`. Besides dispatching code, one can also call statically generated kernels (e.g., `libxsmm_dmm_4_4_4`) by using the prototype functions included with the FORTRAN and C/C++ interface. Prototypes are present whenever static code was requested at compile-time of the library (e.g. per `make MNK="1 2 3 4 5"`). + +```FORTRAN +TYPE(LIBXSMM_DMMFUNCTION) :: xmm +CALL libxsmm_dispatch(xmm, m, n, k) +IF (libxsmm_available(xmm)) THEN + DO i = LBOUND(c, 3), UBOUND(c, 3) ! consider OpenMP + CALL libxsmm_dmmcall(xmm, a(:,:,i), b(:,:,i), c(:,:,i)) + END DO +END IF +``` + +### Batched Multiplication + +In case of batched SMMs, it can be beneficial to supply "next locations" such that the upcoming operands are prefetched ahead of time. Such a location would be the address of the next matrix to be multiplied (and not any of the floating-point elements within the "current" matrix-operand). The "prefetch strategy" is requested at dispatch-time of a kernel. A [strategy](libxsmm_be.md#prefetch-strategy) other than `LIBXSMM_PREFETCH_NONE` turns the signature of a JIT'ted kernel into a function with six arguments (`a,b,c, pa,pb,pc` instead of `a,b,c`). To defer the decision about the strategy to a CPUID-based mechanism, one can choose `LIBXSMM_PREFETCH_AUTO`.
+ +```C +int prefetch = LIBXSMM_PREFETCH_AUTO; +int flags = 0; /* LIBXSMM_FLAGS */ +libxsmm_dmmfunction xmm = NULL; +double alpha = 1, beta = 0; +xmm = libxsmm_dmmdispatch(23/*m*/, 23/*n*/, 23/*k*/, + NULL/*lda*/, NULL/*ldb*/, NULL/*ldc*/, + &alpha, &beta, &flags, &prefetch); +``` + +Above, pointer-arguments of `libxsmm_dmmdispatch` can be NULL (or OPTIONAL in FORTRAN): for LDx this means a "tight" leading dimension, alpha, beta, and flags are given by a [default value](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h) (which is selected at compile-time), and for the prefetch strategy a NULL-argument refers to "no prefetch" (which is equivalent to an explicit `LIBXSMM_PREFETCH_NONE`). By design, the prefetch strategy can be changed at runtime (as soon as valid next-locations are used) without changing the call-site (kernel-signature with six arguments). + + + +```C +if (0 < n) { /* check that n is at least 1 */ +# pragma omp parallel for private(i) + for (i = 0; i < (n - 1); ++i) { + const double *const ai = a + i * asize; + const double *const bi = b + i * bsize; + double *const ci = c + i * csize; + xmm(ai, bi, ci, ai + asize, bi + bsize, ci + csize); + } + xmm(a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize, + /* pseudo prefetch for last element of batch (avoids page fault) */ + a + (n - 1) * asize, b + (n - 1) * bsize, c + (n - 1) * csize); +} +``` + +To process a batch of matrix multiplications and to prefetch the operands of the next multiplication ahead of time, the code presented in the [Overview](#overview) section may be modified as shown above. The last multiplication is peeled from the main batch to avoid prefetching out-of-bounds (OOB). Prefetching from an invalid address does not trap an exception, but an (unnecessary) page fault can be avoided. + + + +```C +/** Batched matrix multiplications (explicit data representation).
*/ +int libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, + libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], + const libxsmm_blasint stride_b[], + const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize, + int tid, int ntasks); +``` + +To further simplify the multiplication of matrices in a batch, LIBXSMM's batch interface can help to extract the necessary input from a variety of existing structures (integer indexes, array of pointers both with Byte sized strides). An expert interface (see above) can employ a user-defined threading runtime (`tid` and `ntasks`). In case of OpenMP, `libxsmm_mmbatch_omp` is ready-to-use and hosted by the extension library (libxsmmext). Of course, `libxsmm_mmbatch_omp` does not take `tid` and `ntasks` since both arguments are given by OpenMP. Similarly, a sequential version (shown below) is available per `libxsmm_gemm_batch` (libxsmm). + +Please note that an explicit data representation should exist and be reused rather than created only to call the explicit batch-interface. Creating such a data structure only for this matter can introduce an overhead which is hard to amortize (speedup). If no explicit data structure exists, a "chain" of multiplications can often be algorithmically described (see [self-hosted batch loop](#implicit-batches)).
+ +```C +void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, + libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], + const libxsmm_blasint stride_b[], + const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize); +``` + +In recent BLAS library implementations, `dgemm_batch` and `sgemm_batch` have been introduced. This BLAS(-like) interface allows for groups of homogeneous batches, which is like an additional loop around the interface as introduced above. On the other hand, the BLAS(-like) interface only supports arrays of pointers for the matrices. In contrast, above interface supports arrays of pointers as well as arrays of indexes plus a flexible way to extract data from arrays of structures (AoS). LIBXSMM also supports this (new) BLAS(-like) interface with `libxsmm_?gemm_batch` and `libxsmm_?gemm_batch_omp` (the latter of which relies on LIBXSMM/ext). Further, existing calls to `dgemm_batch` and `sgemm_batch` can be intercepted and replaced with [LIBXSMM's call wrapper](#call-wrapper). The signatures of `libxsmm_dgemm_batch` and `libxsmm_sgemm_batch` are equal except for the element type (`double` and `float` respectively). 
+ +```C +void libxsmm_dgemm_batch(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], + const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]); +``` + +**Note**: the multi-threaded implementation (`ntasks > 1` or "omp" form of the functions) avoids data races if indexes or pointers for the destination (C-)matrix are duplicated. This synchronization occurs automatically (`beta != 0`), but can be avoided by passing a negative `batchsize`, `group_size` and/or a negative `group_count`. + +### User-Data Dispatch + +It can be desired to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch). + +### Call Wrapper + +#### Overview + +Since the library is binary compatible with existing GEMM calls (BLAS), such calls can be replaced at link-time or intercepted at runtime of an application such that LIBXSMM is used instead of the original BLAS library. There are two cases to consider: (1) static linkage, and (2) dynamic linkage of the application against the original BLAS library. When calls are intercepted, one can select a sequential (default) or an OpenMP-parallelized implementation (`make WRAP=2`). 
+ +```bash +LIBXSMM STATISTIC: 1000 multiplications +dgemm(trans=NN mnk=32,32,21 ldx=32,21,32 a,b=1,0): 8% [main$omp$1] +dgemm(trans=NN mnk=32,21,32 ldx=32,32,32 a,b=1,0): 8% [main$omp$1] +dgemm(trans=NN mnk=10,21,32 ldx=10,32,10 a,b=1,0): 5% [main$omp$1] +dgemm(trans=NN mnk=32,10,32 ldx=32,32,32 a,b=1,0): 5% [main$omp$1] +dgemm(trans=NN mnk=32,32,10 ldx=32,10,32 a,b=1,0): 5% [main$omp$1] +``` + +Intercepted GEMMs can also build a sophisticated statistic (histogram) with LIBXSMM_VERBOSE=4 (or higher). The histogram displays the call sites (debug symbol name) of all intercepted GEMMs ([example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/wrap/autobatch.c) above depicts an OpenMP region hosted by the main function). With level 5 (or higher), the histogram yields the entire content, and eventually less relevant entries are not pruned. An application must be built with symbols (`-g`) and export symbols similar to shared libraries (`-Wl,--export-dynamic` even when linked statically) in order to display the symbol names of where the GEMMs originated (call site). + +**Note**: Intercepting GEMM calls is low effort but implies overhead, which can be relatively high for small-sized problems. LIBXSMM's native programming interface has lower overhead and allows to amortize this overhead when using the same multiplication kernel in a consecutive fashion along with sophisticated data prefetch. + +#### Static Linkage + +An application which is linked statically against BLAS requires to wrap the `sgemm_` and the `dgemm_` symbol (an alternative is to wrap only `dgemm_`). To relink the application (without editing the build system) can often be accomplished by copying and pasting the linker command as it appeared in the console output of the build system, and then re-invoking a modified link step (please also consider `-Wl,--export-dynamic`). + +```bash +gcc [...] 
-Wl,--wrap=dgemm_,--wrap=sgemm_ \ + /path/to/libxsmmext.a /path/to/libxsmm.a \ + /path/to/your_regular_blas.a +``` + +In addition, existing [BLAS(-like) batch-calls](#blas-batch-interface) can be intercepted as well: + +```bash +gcc [...] -Wl,--wrap=dgemm_batch_,--wrap=sgemm_batch_ \ + -Wl,--wrap=dgemm_batch,--wrap=sgemm_batch \ + -Wl,--wrap=dgemm_,--wrap=sgemm_ \ + /path/to/libxsmmext.a /path/to/libxsmm.a \ + /path/to/your_regular_blas.a +``` + +Above, GEMM and GEMM_BATCH are intercepted both, however this can be chosen independently. For GEMM_BATCH the Fortran and C-form of the symbol may be intercepted both (regular GEMM can always be intercepted per `?gemm_` even when `?gemm` is used in C-code). + +**Note**: The static link-time wrapper technique may only work with a GCC tool chain (GNU Binutils: `ld`, or `ld` via compiler-driver), and it has been tested with GNU GCC, Intel Compiler, and Clang. However, this does not work under Microsoft Windows (even when using the GNU tool chain or Cygwin). + +#### Dynamic Linkage + +An application that is dynamically linked against BLAS allows to intercept the GEMM calls at startup time (runtime) of the unmodified executable by using the LD_PRELOAD mechanism. The shared library of LIBXSMMext (`make STATIC=0`) can be used to intercept GEMM calls: + +```bash +LD_LIBRARY_PATH=/path/to/libxsmm/lib:${LD_LIBRARY_PATH} \ +LD_PRELOAD=libxsmmext.so \ + ./myapplication +``` + diff --git a/third_party/libxsmm/documentation/libxsmm_prof.md b/third_party/libxsmm/documentation/libxsmm_prof.md new file mode 100644 index 00000000..f5f7b0cf --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_prof.md @@ -0,0 +1,45 @@ +## Performance Analysis + +### Intel VTune Profiler + +To analyze which kind of kernels have been called, and from where these kernels have been invoked (call stack), the library allows profiling its JIT code using Intel VTune Profiler. 
To enable this support, VTune's root directory needs to be set at build-time of the library. Enabling symbols (SYM=1 or DBG=1) incorporates VTune's JIT Profiling API: + +```bash +source /opt/intel/vtune_profiler/vtune-vars.sh +make SYM=1 +``` + +Above, the root directory is automatically determined from the environment (VTUNE_PROFILER_\*_DIR or VTUNE_AMPLIFIER_\*_DIR with older versions). This variable is present after source'ing the Intel VTune environment (`source /path/to/vtune_amplifier/amplxe-vars.sh` with older version), but it can be manually provided as well (`make VTUNEROOT=/path/to/vtune_amplifier`). Symbols are not really required to display kernel names for the dynamically generated code, however enabling symbols makes the analysis much more useful for the rest of the (static) code, and hence it has been made a prerequisite. For example, when "call stacks" are collected it is possible to find out where the JIT code has been invoked by the application: + +```bash +vtune -r resultdir -data-limit 0 -collect hotspots \ + -knob enable-stack-collection=true \ + -knob sampling-mode=hw \ + -knob stack-size=0 \ + -- ./myapplication +``` + +In case of an MPI-parallelized application, it can be useful to only collect results from a "representative" rank, and to also avoid running the event collector in every rank of the application. With Intel MPI, both can be achieved by: + +```bash +mpirun -gtool 'vtune -r resultdir -data-limit 0 -collect hotspots \ + -knob sampling-mode=hw -knob enable-stack-collection=true \ + -knob stack-size=0:4=exclusive' \ + [...] ./myapplication +``` + +The `:4=exclusive` is related to Intel MPI or mpirun's gtool arguments and unrelated to VTune's command line syntax (see `vtune --help` or `amplxe-cl --help` with older versions); such argument(s) need to appear at the end of the gtool-string.
For instance, the shown command line selects the 5th rank (zero-based) along with exclusive usage of the performance monitoring unit (PMU) such that only one event-collector runs for all ranks (without rank-number, all ranks are sampled). + +Intel VTune Profiler presents invoked JIT code like functions, which belong to a module named "libxsmm.jit". The function name as well as the module name are supplied by LIBXSMM using VTune's JIT-Profiling API. Below, the shown "function name" (`libxsmm_knl_dnn_23x23x23_23_23_23_a1_b1_p6::mxm`) encodes an AVX-512 ("knl") double-precision kernel ("d") for small dense matrix multiplication, which performs no transposes ("nn"). The name further encodes M=N=K=LDA=LDB=LDC=23, Alpha=Beta=1.0, and a prefetch strategy ("p6"). + +![The shown "function name" (`libxsmm_knl_dnn_23x23x23_23_23_23_a1_b1_p6::mxm`) encodes an Intel AVX-512 ("knl") double-precision kernel ("d") for small dense matrix multiplication, which performs no transposes ("nn"). The name further encodes M=N=K=LDA=LDB=LDC=23, Alpha=Beta=1.0, and some prefetch strategy ("p6").](libxsmm_prof-vtune.png) + +An application that cannot rely on LIBXSMM's build system can apply `-DLIBXSMM_VTUNE=2` during compilation, and link against `${VTUNE_AMPLIFIER_XE_2017_DIR}/lib64/libjitprofiling.a`. For example, TensorFlow with LIBXSMM and Intel VTune Profiler may use this way to gain insight into LIBXSMM's JIT-code (see [here](tensorflow.md#performance-profiling)). + +### Linux perf + +With LIBXSMM, there is both basic (`perf map`) and extended support (`jitdump`) when profiling an application. To enable perf support at runtime, the environment LIBXSMM_VERBOSE needs to be set to a negative value. + +* The basic support can be enabled at compile-time with PERF=1 (implies SYM=1) using `make PERF=1`. At runtime of the application, a map-file ('jit-*pid*.map') is generated ('/tmp' directory). 
This file is automatically read by Linux perf, and enriches the information about unknown code such as JIT'ted kernels. +* The support for "jitdump" can be enabled by supplying JITDUMP=1 (implies PERF=1) or PERF=2 (implies JITDUMP=1) when making the library: `make JITDUMP=1` or `make PERF=2`. At runtime of the application, a dump-file ('jit-*pid*.dump') is generated (in perf's debug directory, usually `$HOME/.debug/jit/`) which includes information about JIT'ted kernels (such as addresses, symbol names, code size, and the code itself). The dump file can be injected into `perf.data` (using `perf inject -j`), and it enables an annotated view of the assembly in perf's report (requires a reasonably recent version of Linux perf). + diff --git a/third_party/libxsmm/documentation/libxsmm_qna.md b/third_party/libxsmm/documentation/libxsmm_qna.md new file mode 100644 index 00000000..f4071048 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_qna.md @@ -0,0 +1,58 @@ +## What is the background of the name "LIBXSMM"? +The "MM" stands for Matrix Multiplication, and the "S" clarifies the working domain i.e., Small Matrix Multiplication. The latter also means the name is neither a variation of "MXM" nor an eXtreme Small Matrix Multiplication but rather about Intel Architecture (x86) - and no, the library is [64‑bit only](https://github.com/hfp/libxsmm/issues/103#issuecomment-256887962). The spelling of the name might follow the syllables of libx\\/smm, libx'smm, or libx‑smm. +> **NOTE**: the library does [not](https://github.com/hfp/libxsmm/issues/103#issuecomment-256887962) support 32-bit architecture (64‑bit only) + +## What is a small matrix multiplication? +When characterizing the problem-size using the M, N, and K parameters, a problem-size suitable for LIBXSMM falls approximately within *(M N K)1/3 \<= 128* (which illustrates that non-square matrices or even "tall and skinny" shapes are covered as well). 
The library is typically used to generate code up to the specified [threshold](#auto-dispatch). Raising the threshold may not only generate excessive amounts of code (due to unrolling in M or K dimension), but also miss to implement a tiling scheme to effectively utilize the cache hierarchy. For auto-dispatched problem-sizes above the configurable threshold (explicitly JIT'ted code is **not** subject to the threshold), LIBXSMM is falling back to BLAS. In terms of GEMM, the supported kernels are limited to *Alpha := 1*, *Beta := \{ 1, 0 \}*, and *TransA := 'N'*. +> **NOTE**: *Alpha*, *Beta*, and *TransA* are limited to `1`, `{ 1, 0 }`, and `'N'` respectively. + +## What is a small convolution? +In the last years, new workloads such as deep learning and more specifically convolutional neural networks (CNN) emerged, and are pushing the limits of today's hardware. One of the expensive kernels is a small convolution with certain kernel sizes (3, 5, or 7) such that calculations in the frequency space is not the most efficient method when compared with direct convolutions. LIBXSMM's current support for convolutions aims for an easy to use invocation of small (direct) convolutions, which are intended for CNN training and classification. The [Interface](#interface-for-convolutions) is currently ramping up, and the functionality increases quickly towards a broader set of use cases. + +## What about "medium-sized" and big(ger) matrix multiplications? +A more recent addition are GEMM routines, which are parallelized using OpenMP (`libxsmm_?gemm_omp`). These routines leverage the same specialized kernel routines as the small matrix multiplications, in-memory code generation (JIT), and automatic code/parameter dispatch but they implement a tile-based multiplication scheme i.e., a scheme that is suitable for larger problem-sizes. For *Alpha*, *Beta*, *TransA*, and *TransB*, the limitations of the small matrix multiplication kernels apply. 
More details can be found in the [description of the xgemm sample code](https://github.com/hfp/libxsmm/tree/master/samples/xgemm#xgemm-tiled-gemm-routines). + +## How to determine whether an application can benefit from using LIBXSMM or not? +Given the application uses BLAS to carry out matrix multiplications, one may use the [Call Wrapper](#call-wrapper), and measure the application performance e.g., time to solution. However, the latter can significantly improve when using LIBXSMM's API directly. To check whether there are applicable GEMM-calls, the [Verbose Mode](#verbose-mode) can help to collect an insight. Further, when an application uses [Intel MKL 11.2](https://registrationcenter.intel.com/en/forms/?productid=2558) (or higher), then running the application with the environment variable MKL_VERBOSE=1 (`env MKL_VERBOSE=1 ./workload > verbose.txt`) can collect a similar insight (`grep -a "MKL_VERBOSE DGEMM(N,N" verbose.txt | cut -d'(' -f2 | cut -d, -f3-5`). + +## Is LIBXSMM compatible from version-to-version, or what is the ABI commitment? +One may have a look at issue [#120](https://github.com/hfp/libxsmm/issues/120#issuecomment-264498939) or [#282](https://github.com/hfp/libxsmm/issues/282#issuecomment-485390494), but in summary: +* Binary compatibility is not continuously tested (only manually for a subset of the API namely SMM domain). +* Major versions are likely breaking binary compatibility with existing integrations (that is typical). +* Minor versions may break binary compatibility of recently introduced features (may not be typical). +* Update and patch versions are binary compatible but may only be released on request (issue). + +LIBXSMM's API for Small Matrix Multiplications (SMMs) is considered stable, and all major known applications (e.g., CP2K, EDGE, NEK5K, and SeisSol) either rely on SMMs or are able (and want) to benefit from an improved API of any of the other domains (e.g., DL). 
Until at least v2.0, LIBXSMM is not able to track or even maintain binary compatibility and hence the SONAME also goes with the semantic version. A [list of public functions](https://github.com/hfp/libxsmm/blob/master/.abi.txt) is maintained (but there is no distinction for a small subset of them that are only meant for communication between LIBXSMM and LIBXSMM/ext). + +## I am relying on a prebuilt version of CP2K (or another application), is LIBXSMM incorporated and which version is it? +This can be determined using the environment variable `LIBXSMM_VERBOSE=2` (or higher verbosity). It is not even required to use an input or workload since the information in question is presented when the program terminates. For example: + +``` +LIBXSMM_VERBOSE=1 exe/Linux-x86-64-intelx/cp2k.psmp +[...] +LIBXSMM_VERSION: release-1.11 +LIBXSMM_TARGET: clx +``` + +## I am relying on a prebuilt version of an application, and I am concerned about optimal compiler flags. +LIBXSMM uses JIT-generated code according to the CPUID of the system. This is independent of the compiler flags used to build the library. If LIBXSMM was incorporated per [classic ABI](https://libxsmm.readthedocs.io/#classic-library-abi), `LIBXSMM_DUMP_BUILD=1` environment variable allows to print build flags used for LIBXSMM at termination of the application. This output of `LIBXSMM_DUMP_BUILD=1` can yield hints about the flags used to build the application (if similar). + +For concerns regarding the code of an application that cannot benefit from LIBXSMM, one may have a look at the build recipes of the [XCONFIGURE](http://xconfigure.readthedocs.io/) project. + +## What Operating Systems are covered by LIBXSMM, and what about Microsoft Windows? +The answer here focuses on the actual runtime support rather than the supported compiler tool chains used to build the library. 
All flavors of Linux are supported (if the library was successfully built), which includes installations running a security-hardened Linux kernel (SELinux). The Apple OS (OSX) is supported, which also includes more recent SIP-enabled versions (System Integrity Protection). The BSD OS is likely supported, but building the library is only occasionally validated. Microsoft Windows is supported for non-JIT operation, and for most (e.g., GEMM and MATCOPY) of the JIT-kernels (prefetch signature is not supported). There is currently no support for JIT in the DNN domain (no further check is performed i.e., crash at runtime). See also [issue #71](https://github.com/hfp/libxsmm/issues/71). + +## Does LIBXSMM has some support for GEMV? +The library generates acceptable code when using `M=1` or `N=1`. For example, building with `make M=16 N=1 K=16 AVX=2` and inspecting the assembly (build directory) or dumping/disassembling the JIT code (see reference documentation) shows the minimum number of load/store instructions. Given that GEMV is a memory bound operation, this suggests reasonable code quality. LIBXSMM selects from multiple microkernels (specific for each ISA extension) by using a fixed scheme/heuristic, which should be acceptable for GEMV. The sample code under [samples/smm](https://github.com/hfp/libxsmm/blob/master/samples/smm) provides ready-to-use benchmark drivers that can help to compare the performance with LAPACK/BLAS. Afore mentioned benchmarks exercise streaming all possible combinations of operands. + +## What about complex and mixed types? +This question refers to the following kind of element type of the GEMM interface of LIBXSMM: +* Complex types: complex numbers in single and double-precision, +* Mixed types: e.g. real double-precision and complex double-precision +There are no (immediate) plans to support more types for the GEMM part. Please note, that LIBXSMM indeed supports lower precision GEMM (wgemm). + +## What about voting for features? 
+All feedback and [issue reports](https://github.com/hfp/libxsmm/issues) are handled openly, are welcome and considered ([answered](https://github.com/hfp/libxsmm/issues?q=is%3Aissue+is%3Aclosed), and [collected](https://github.com/hfp/libxsmm/wiki/Development#longer-term-issues)). However, we do not seek for "feature votes" since the development of the library is not a democratic process. + +## \ What is the purpose of ROW_MAJOR vs. COL_MAJOR? +This build configuration is deprecated ([issue 85](https://github.com/hfp/libxsmm/issues/85)), otherwise there is nothing one cannot achieve with row-major as opposed to column-major storage order. In particular the choice is not about whether a program is written in C/C++ or in FORTRAN. The ROW_MAJOR setting is just offered for existing code, which calls into function(s) that assume row-major storage order and where these calls are to be replaced by LIBXSMM in a "1:1 fashion". It is encouraged to avoid the ROW_MAJOR setting since BLAS implies COL_MAJOR (and LIBXSMM is supposed to be compatible with BLAS). [More...](https://github.com/hfp/libxsmm/issues/80) diff --git a/third_party/libxsmm/documentation/libxsmm_samples.md b/third_party/libxsmm/documentation/libxsmm_samples.md new file mode 100644 index 00000000..a99e5c2c --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_samples.md @@ -0,0 +1,706 @@ +# [LIBXSMM Samples](https://github.com/hfp/libxsmm/raw/master/documentation/libxsmm_samples.pdf) + +## CP2K Artificial Benchmark + +The first code sample given for LIBXSMM was a performance reproducer exercising the same set of kernels usually generated for CP2K's SMM library. The code sample attempted to model the way "matrix stacks" are processed in CP2K, however there are two different code paths in CP2K: (1) the "main" code path used when processing stacks on the host-side, and (2) a code path targeting offload devices. 
Besides the host-sided parallelization via MPI (and perhaps OpenMP), the secondly mentioned code path relies on an additional level of parallelization (which is obviously necessary to drive a potentially highly parallel offload device). Also, the additional level of parallelism is not exactly "nested" in the sense that it participates on sharing the same resources as the host-side. In fact, this "artificial benchmark" (cp2k code sample) is modeling a code path as utilized in the secondly mentioned case (offload device). + +## Hello LIBXSMM + +This example is focused on a specific functionality but may be considered as "Hello LIBXSMM". Copy and paste the example code and build it either manually as described in our [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm) (see underneath the source code), or use GNU Make: + +```bash +cd /path/to/libxsmm +make + +cd /path/to/libxsmm/samples/hello +make + +./hello +``` + +Alternatively, one can use the Bazel build system. To further simplify, [Bazelisk](https://github.com/bazelbuild/bazelisk) is used to boot-strap [Bazel](https://bazel.build/): + +```bash +cd /path/to/libxsmm/samples/hello +bazelisk build //... + +./bazel-bin/hello +``` + +The [C/C++ code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.cpp) given here uses LIBXSMM in header-only form (`#include <libxsmm_source.h>`), which is in contrast to the code shown in the [main documentation](https://libxsmm.readthedocs.io/#hello-libxsmm). The [Fortran code](https://github.com/hfp/libxsmm/blob/master/samples/hello/hello.f) (`hello.f`) can be manually compiled like `gfortran -I/path/to/libxsmm/include hello.f -L/path/to/libxsmm/lib -lxsmmf -lxsmm -lxsmmnoblas -o hello` or as part of the above described invocation of GNU Make. 
+ +## Magazine + +### Overview + +This collection of code samples accompanies an article written for [issue #34](https://software.intel.com/sites/default/files/parallel-universe-issue-34.pdf) of the magazine [The Parallel Universe](https://software.intel.com/en-us/download/parallel-universe-magazine-issue-34-october-2018), an Intel publication. The article focuses on Blaze-, Eigen-, and LIBXSMM-variants of Small Matrix Multiplications (SMMs). The set of sample codes now also includes a variant relying on BLAS and a variant that showcases LIBXSMM's explicit batch-interface. + +The baseline requirements are libraries that can operate on column-major storage order, "zero copy" when using existing memory buffers, and an API that is powerful enough to describe leading dimensions. Typically a library-internal parallelization of matrix multiplication is desired. However, for the magazine sample collection there is no performance gain expected since the matrices are small, and nested parallelism may only add overhead. Hence library-internal parallelism is disabled (BLAZE_USE_SHARED_MEMORY_PARALLELIZATION=0, EIGEN_DONT_PARALLELIZE). LIBXSMM provides parallelization on a per-function basis and no global toggle is needed. + +The sample codes rely on the minimum programming language supported by the library in question (API): C++ in case of Blaze and Eigen, and C in case of LIBXSMM (both C++ and Fortran interfaces are available as well). For Blaze and Eigen, the build-system ensures to not map implementation into a BLAS library (normally desired but this would not test the library-native implementation). + +### Results + +To reproduce or repeat the performance measurements on a system of choice, all matrix operands are streamed by default. The file [magazine.h](https://github.com/hfp/libxsmm/blob/master/samples/magazine/magazine.h) can be edited to reproduce the desired combination (STREAM_A, STREAM_B, and STREAM_C). 
Whether or not matrix operands are streamed is motivated in publication. To reduce dependency on the compiler's OpenMP implementation, the benchmarks run single-threaded by default (`make OMP=1` can parallelize the batch of matrix multiplications). The outer/batch-level parallelization is also disabled to avoid accounting for proper first-touch memory population on multi-socket systems (NUMA). For the latter, the init-function (located in magazine.h) is not parallelized for simplicity. + +```bash +cd libxsmm; make +cd samples/magazine; make +``` + +To run the benchmark kernels presented by the article: + +```bash +./benchmark.sh +``` + +Please note that if multiple threads are enabled and used, an appropriate pin-strategy should be used (OMP_PLACES=threads, OMP_PROC_BIND=TRUE). To finally produce the benchmark charts: + +```bash +./benchmark-plot.sh blaze +./benchmark-plot.sh eigen +./benchmark-plot.sh xsmm +``` + +The plot script relies at least on Gnuplot. ImageMagick (mogrify) can be also useful if PNGs are created, e.g., `./benchmark-plot.sh xsmm png 0` (the last argument disables single-file charts in contrast to multi-page PDFs created by default, the option also disables chart titles). + +The set of kernels executed during the benchmark can be larger than the kernels presented by the plots: [benchmark.set](https://github.com/hfp/libxsmm/blob/master/samples/magazine/benchmark.set) selects the kernels independent of the kernels executed (union). + +## NEK Sample Collection + +This directory contains kernels taken from Nek{Box,5000}. They aim to represent most of the matrix-matrix workloads. + +Please note that the [mxm_std.f](https://github.com/hfp/libxsmm/blob/master/samples/nek/mxm_std.f) source code is protected by an (US) GOVERNMENT LICENSE, and under the copyright of the University of Chicago. + +### stpm + +Small tensor-product multiple (stpm) replicates the axhelm kernel, which computes the Laplacian with spectral elements. 
+Usage: + +```bash +./stpm m n k size1 size +``` + +The elements are m-by-n-by-k, mode picks the LIBXSMM interface used, and size scales the number of spectral elements. + +### rstr + +Restriction operator transforms elements from one size to another. This occurs in multi-grid, the convection operator, and, when the sizes are the same, the local Schwarz solves. Usage: + +```bash +./rstr m n k mm nn kk size1 size +``` + +The input elements are m-by-n-by-k and the output elements are mm-by-nn-by-kk. When m=mm, n=nn, k=kk, this is half of a Schwarz solve. + +## SMM Sample Collection + +This collection of code samples exercises different memory streaming cases when performing the matrix multiplication *C~m x n~ = alpha · A~m x k~ · B~k x n~ + beta · C~m x n~*: (1) streaming the matrices A, B, and C which is usually referred to as batched matrix multiplication, (2) streaming the inputs A and B but accumulating C within cache, (3) streaming the A and C matrices while B is kept in cache, (4) streaming the B and C matrices while A is kept in cache, and (5) not streaming any of the operands but repeating the very same multiplication until the requested number of matrix multiplications has been completed. + +Besides measuring the duration of a test case, the performance is presented in GFLOPS/s. As an alternative metric, the memory bandwidth is given (the artificial "cached" case omits to present the cache-memory bandwidth). The "pseudo-performance" given in FLOPS/cycle is an artificial scoring, it not only uses a non-standard formula for calculating the FLOPS (*2 \* M \* N \* K - M \* N* rather than *2 \* M \* N \* K*) but also relies on (pseudo-)clock cycles: + +``` +$ ./specialized.sh 0 +m=32 n=32 k=32 size=87381 memory=2048.0 MB (DP) + +Batched (A,B,C)... 
+ pseudo-perf.: 10.7 FLOPS/cycle + performance: 23.9 GFLOPS/s + bandwidth: 11.1 GB/s + duration: 239 ms +Finished +``` + +There are two sub collections of samples codes: (1) a collection of C++ code samples showing either BLAS, Compiler-generated code (inlined code), LIBXSMM/dispatched, LIBXSMM/specialized functions to carry out the multiplication, and (2) a Fortran sample code showing BLAS versus LIBXSMM including some result validation. + +**C/C++ Code Samples: Command Line Interface (CLI)** + +* Takes an optional number (1st arg.) to select the streaming-case (0...8) +* Optionally takes the M, N, and K parameter of the GEMM in this order +* If only M is supplied, the N and K "inherit" the M-value +* Example I (A,B,C): ./specialized.sh 0 16 8 9 +* Example II (A,B): ./specialized.sh 6 16 + +**Fortran Code Sample: Command Line Interface (CLI)** + +* Optionally takes the M, N, and K parameter of the GEMM in this order +* Optional problem size (in MB) of the workload; M/N/K must have been supplied +* Optional total problem size (in MB) implying the number of repeated run +* If only M is supplied, the N and K are "inheriting" the M-value +* Shows the performance of each of the streaming cases +* Example I: ./smm.sh 16 8 9 1024 16384 +* Example II: ./smm.sh 16 + +## SPECFEM Sample + +This sample contains a dummy example from a spectral-element stiffness kernel taken from [SPECFEM3D_GLOBE](https://github.com/geodynamics/specfem3d_globe). + +It is based on a 4th-order, spectral-element stiffness kernel for simulations of elastic wave propagation through the Earth. Matrix sizes used are (25,5), (5,25) and (5,5) determined by different cut-planes through a three dimensional (5,5,5)-element with a total of 125 GLL points. + + +### Usage Step-by-Step + +This example needs the LIBXSMM library to be built with static kernels, using MNK="5 25" (for matrix size (5,25), (25,5) and (5,5)). 
+ +#### Build LIBXSMM + +##### General Default Compilation + +In LIBXSMM root directory, compile the library with: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 +``` + +##### Additional Compilation Examples + +Compilation using only single precision version and aggressive optimization: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 +``` + +For Sandy Bridge CPUs: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 +``` + +For Haswell CPUs: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 +``` + +For Knights Corner (KNC) (and thereby creating a Sandy Bridge version): + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ +OFFLOAD=1 KNC=1 +``` + +Installing libraries into a sub-directory workstation/: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 \ +OFFLOAD=1 KNC=1 \ +PREFIX=workstation/ install-minimal +``` + +#### Build SpecFEM example code + +For default CPU host: + +```bash +cd sample/specfem +make +``` + +For Knights Corner (KNC): + +```bash +cd sample/specfem +make KNC=1 +``` + +Additionally, adding some specific Fortran compiler flags, for example: + +```bash +cd sample/specfem +make FCFLAGS="-O3 -fopenmp" [...] +``` + +Note that steps 1 and 2 could be shortened by specifying a "specfem" make target in the LIBXSMM root directory: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 specfem +``` + +For Knights Corner, this would need two steps: + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 OFFLOAD=1 KNC=1 +make OPT=3 specfem_mic +``` + +### Run the Performance Test + +For default CPU host: + +```bash +./specfem.sh +``` + +For Knights Corner (KNC): + +```bash +./specfem.sh -mic +``` + +### Results + +Using Intel Compiler suite: icpc 15.0.2, icc 15.0.2, and ifort 15.0.2. 
+ +#### Sandy Bridge - Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz + +Library compilation by (root directory): + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=1 +``` + +Single threaded example run: + +```bash +cd sample/specfem +make; OMP_NUM_THREADS=1 ./specfem.sh +``` + +Output: + +```bash +=============================================================== +average over 15 repetitions + timing with Deville loops = 0.1269 + timing with unrolled loops = 0.1737 / speedup = -36.87 % + timing with LIBXSMM dispatch = 0.1697 / speedup = -33.77 % + timing with LIBXSMM prefetch = 0.1611 / speedup = -26.98 % + timing with LIBXSMM static = 0.1392 / speedup = -9.70 % +=============================================================== +``` + +#### Haswell - Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz + +Library compilation by (root directory): + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 AVX=2 +``` + +Single threaded example run: + +```bash +cd sample/specfem +make; OMP_NUM_THREADS=1 ./specfem.sh +``` + +Output: + +```bash +=============================================================== +average over 15 repetitions + timing with Deville loops = 0.1028 + timing with unrolled loops = 0.1385 / speedup = -34.73 % + timing with LIBXSMM dispatch = 0.1408 / speedup = -37.02 % + timing with LIBXSMM prefetch = 0.1327 / speedup = -29.07 % + timing with LIBXSMM static = 0.1151 / speedup = -11.93 % +=============================================================== +``` + +Multi-threaded example run: + +```bash +cd sample/specfem +make OPT=3; OMP_NUM_THREADS=24 ./specfem.sh +``` + +Output: + +```bash +OpenMP information: + number of threads = 24 + +[...] 
+ +=============================================================== +average over 15 repetitions + timing with Deville loops = 0.0064 + timing with unrolled loops = 0.0349 / speedup = -446.71 % + timing with LIBXSMM dispatch = 0.0082 / speedup = -28.34 % + timing with LIBXSMM prefetch = 0.0076 / speedup = -19.59 % + timing with LIBXSMM static = 0.0068 / speedup = -5.78 % +=============================================================== +``` + +#### Knights Corner - Intel Xeon Phi B1PRQ-5110P/5120D + +Library compilation by (root directory): + +```bash +make MNK="5 25" ALPHA=1 BETA=0 PRECISION=1 OPT=3 OFFLOAD=1 KNC=1 +``` + +Multi-threaded example run: + +```bash +cd sample/specfem +make FCFLAGS="-O3 -fopenmp -warn" OPT=3 KNC=1; ./specfem.sh -mic +``` + +Output: + +```bash +OpenMP information: + number of threads = 236 + +[...] + +=============================================================== +average over 15 repetitions + timing with Deville loops = 0.0164 + timing with unrolled loops = 0.6982 / speedup = -4162.10 % + timing with LIBXSMM dispatch = 0.0170 / speedup = -3.89 % + timing with LIBXSMM static = 0.0149 / speedup = 9.22 % +=============================================================== +``` + +## Matrix Transpose (TCOPY) + +### Overview + +This code sample aims to benchmark the performance of matrix transposes. The C/C++ and [FORTRAN sample code](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.f) differ slightly with the C/C++ code sample offering a richer set of command line options as well as build settings available inside of the [translation unit](https://github.com/hfp/libxsmm/blob/master/samples/transpose/transpose.c). + +The available command line options of the sample code may be reviewed by looking into the source code. 
Generally, the idea is to support the following: + +> transpose [<kind> [<m> [<n> [<ldi> [<ldo>]]]]] +transposef [<m> [<n> [<ldi> [<ldo>]]]] + +Above, `m` and `n` specify the matrix shape, and `ldi` the leading dimension of the matrix. The argument `ldo` allows to specify an output dimension, which may differ from `ldi`. The transpose kind shall be either out-of-place (`o`) or in-place (`i`). + +Running the C sample code may look like: + +```bash +$ ./transpose.sh o 20000 +m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) + bandwidth: 18.8 GB/s + duration: 159 ms +``` + +Instead of executing a wrapper script, one may affinitize the multi-threaded execution manually (OpenMP runtime). In case of an executable built using the Intel Compiler this may look like: + +```bash +LIBXSMM_VERBOSE=2 KMP_AFFINITY=balanced,granularity=fine,1 \ +./transpose o 20000 +m=20000 n=20000 ldi=20000 ldo=20000 size=3052MB (double, out-of-place) + bandwidth: 21.1 GB/s + duration: 141 ms + +Registry: 20 MB (gemm=0 mcopy=0 tcopy=1) +``` + +In the above case one can see from the verbose output (`LIBXSMM_VERBOSE=2`) that one kernel (tcopy) served transposing the entire matrix. To avoid duplicating JIT-kernels under contention (code registry), one may also consider `LIBXSMM_TRYLOCK=1`, which is available per API-call as well. + +### OpenTuner + +To tune the tile sizes ("block sizes") internal to LIBXSMM's transpose routine, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`transpose_opentuner.py`) is provided, which accepts a range of matrix sizes as command line arguments. 
+ +> transpose_opentuner.py <begin> <end> [*nexperiments-per-epoch*] [*tile-size-m*] [*tile-size-n*] + +To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an eventually unrelated range of matrix shapes. To get reliable timings, the total time for all experiments per epoch is minimized (hence a different number of experiments per epoch also asks for an own database). Optionally, the initial block size can be seeded (`tile-size-m` and `tile-size-n`). + +```bash +rm -rf opentuner.db +``` + +The script tunes matrices with randomized shape according to the specified range. The leading dimension is chosen tightly for the experiments. The optimizer not only maximizes the performance but also minimizes the value of *M \* N* (which also helps to prune duplicated results due to an additional preference). + +```bash +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 1 1024 1000 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 1024 2048 100 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 2048 3072 20 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 3072 4096 20 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 4096 5120 16 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 5120 6144 12 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 6144 7168 8 + +rm -rf opentuner.db +./transpose_opentuner.py --no-dups 7168 8192 6 +``` + +The tuning script uses the environment variables `LIBXSMM_TCOPY_M` and `LIBXSMM_TCOPY_N`, which are internal to LIBXSMM. These variables are used to adjust certain thresholds in `libxsmm_otrans` or to request a specific tiling-scheme inside of the `libxsmm_otrans_omp` routine. 
+ +## XGEMM: Tiled GEMM Routines + +### Overview + +This sample code calls the `libxsmm_?gemm_omp` routines provided by the LIBXSMM extension library (`libxsmmext`). These routines are meant for big(ger) xGEMM routines, and thereby provide an OpenMP-based parallelization. + +The driver program (`xgemm.c`) currently accepts all typical GEMM arguments (except for the transposition specifier): `m`, `n`, `k`, `lda`, `ldb`, `ldc`, `alpha`, and `beta`. All arguments are optional (or will inherit defaults from previously specified arguments). Matrix transposition as part of the `libxsmm_?gemm_omp` routines will become available in an upcoming release of LIBXSMM. Please also note that unsupported Alpha or Beta values will cause a fall back to the related BLAS routine. The single-precision matrix multiplications require to change the `ITYPE` in `xgemm.c`. + +```bash +./xgemm.sh 2000 +``` + +### OpenTuner + +To tune the tile sizes ("block sizes") internal to LIBXSMM, the [OpenTuner](http://opentuner.org/) extensible framework for program autotuning can be used. In case of issues during the tuning phase ("no value has been set for this column"), please install the latest 1.2.x revision of SQLAlchemy (`pip install sqlalchemy==1.2.19`). A tuning script (`xgemm_opentuner.py`) is provided, which optionally accepts a list of grouped parameters as command line arguments. The syntax of the arguments is per LIBXSMM's `MNK` build-option, and expands to "triplets" specifying the matrix shapes. For instance, four matrix multiplications of square-matrices can be benchmarked and tuned using the following command. + +```bash +./xgemm_opentuner.py 1024,1280,1536,1792 +``` + +To start a tuning experiment for a new set of arguments, it is highly recommended to start from scratch. Otherwise the population of previously generated tuning results is fetched from a database and used to tune an unrelated range of matrix shapes. 
Optionally, the initial block size can be seeded (`tile-size-m`, `tile-size-n`, and `tile-size-k`). + +```bash +rm -rf opentuner.db +``` + +The script tunes the geometric mean of the performance for each of the requested triplets. However, the optimizer not only maximizes the performance but also minimizes the value of *M \* N \* K* (which also helps to prune duplicated results due to an additional preference). As a limitation of the current implementation, the multiplication kernels are not accompanied by copy-kernels (and not accompanied by transpose kernels). This negatively impacts performance on power-of-two matrix shapes (POT) due to thrashing the LLC. However, it has been found that tuning for POT shapes likely achieves superior performance when compared to tuning for non-POT shapes of the same range. + +```bash +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 192,256,320,512,768 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 1024,1280,1536,1792 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 2048,2304,2560,2816 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 3072,3328,3584,3840 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 4096,4416,4736 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 5120,5440,5760 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 6144,6464,6784 + +rm -rf opentuner.db +./xgemm_opentuner.py --no-dups 7168,7488,7808 +``` + +Above, the series of matrix multiplications from 192-8K is separately tuned in eight ranges. The tuning script uses the environment variables `LIBXSMM_TGEMM_M`, `LIBXSMM_TGEMM_N`, and `LIBXSMM_TGEMM_K` which are internal to LIBXSMM. These variables are used to request a specific tiling-scheme within LIBXSMM's `libxsmm_?gemm_omp` routines. + + +This package contains the optimized kernels for the 1D dilated convolutional layer. +The C++ implementation has code for both FP32 and BF16 formats. +You can run this code on AVX-512 enabled CPUs. Ex. - Cascade Lake or Cooper lake. 
+ + Install instructions + +Install PyTorch in an anaconda or virtual environment before installing the package. +Use GCC version 8.3.0 or higher. +conda activate environment # Activate anaconda or virtual environment containing PyTorch + +cd Conv1dOpti-extension/ +python setup.py install # Install package +cd .. + + +A user can either use run.sh script to run the torch_example.py code or +he/she can follow the following commands. + +export LD_LIBRARY_PATH={LIBXSMM_ROOT/lib} # Set LD_LIBRARY_PATH +export OMP_NUM_THREADS=28 # Set number of threads +export KMP_AFFINITY=compact,1,0,granularity=fine # Set KMP affinity + +python torch_example.py # Run the pytorch example + +In the previous example, we compare "nn.Conv1d" layer with our optimized "Conv1dOpti" layer. +The example shows how "nn.Conv1d" can be replaced with "Conv1dOpti" layer in a neural network +without requiring any other change. +The optimized python layer can be imported using "from Conv1dOpti_ext import Conv1dOpti" in python. +The example checks the accuracy of the results and calculates the computation time of both layers. + + + Limitations of the current version + +- Keep padding=0 in the options. The current layer doesn't do padding. Explicit padding is needed + for the optimized convolutional layer. You can use the example for reference. +- Optimized convolutional layer code can only run with stride = 1. +- Similarly, apply the nonlinearity (Ex. ReLU) separately. + + +To run code in BFloat16, set enable_BF16 flag to True. BFloat16 code runs only when the parameters of +Input width, number of filters and input channels to the layer are even numbers. +Ex. - Filters = 16, Channels = 16, Input width = 60000, enable_BF16 = True BF16 run +If any of the previous parameters is an odd number, the code runs in FP32 format. + + +Keep batch size as multiple of unutilized cores (Ex. - 28, 56, 84, 112 .... on a 28 core cascade lake) +for optimal performance with the Conv1dOpti layer.
Each batch will run on a separate thread; thus +performance may go down if some cores are not free, or batch size is not equal to the number of free cores. +Keep the batch size as power of 2 with the MKLDNN backend (Conv1d) for optimal performance. # Deep Learning with GxM + +### Compiling and Building GxM + +1. Install Pre-requisite Libraries: Google logging module (glog), gflags, Google's data interchange format (Protobuf), OpenCV, LMDB +2. In Makefile.config, set GXM_LIBRARY_PATH variable to the path containing above libraries +3. In Makefile.config, set LIBXSMM_PATH variable to the path containing LIBXSMM library +4. Set/clear other flags in Makefile.config as required (see associated comments in Makefile.config) +5. source setup_env.sh +6. make clean; make + +### Running GxM + +The network topology definitions directory is "model_zoo". Currently, it contains definitions for +AlexNet (without LRN), ResNet-50, Inception v3 along with CIFAR10 and MNIST as simple test definitions. +Each topology definition is in a .prototxt file. ResNet-50 can run with "dummy data", raw JPEG image data +or with LMDB. Filenames indicate the data source along with the minibatch size. Inception v3 runs only with +compressed LMDB data. + +The hyperparameter definitions for each topology are also in the corresponding directory under "model_zoo" in +a .prototxt file with the suffix "solver". For a single-node, this file is called solver.prototxt. For multi-node +the filename also contains the global minibatch size (=single node minibatch size x number of nodes); e.g., solver_896.prototxt contains hyperparameters for MB=56 per node and 16 nodes. The "solver*" file also contains a +flag that specifies whether to start execution from a checkpoint (and thus load weights from the "./weights" +directory) or from scratch; by default execution starts from scratch. + +Optimal parallelization of Convolutional layers in LIBXSMM happens when the number of OpenMP threads = MiniBatch.
+Therefore, on Xeon: + +```bash +export OMP_NUM_THREADS=<MiniBatch> +export KMP_AFFINITY=compact,granularity=fine,1,0 +``` + +The command line for a training run is: + +```bash +./build/bin/gxm train <topology-file> <solver-file> +``` + +For example: + +```bash +./build/bin/gxm train model_zoo/resnet/1_resnet50_dummy_56.prototxt model_zoo/resnet/solver.prototxt +``` + +### Preping on RHEL 8.0 / CentOS 8.0 + +```bash +dnf install protobuf +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-compiler-3.5.0-7.el8.x86_64.rpm +dnf install protobuf-compiler-3.5.0-7.el8.x86_64.rpm +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/protobuf-devel-3.5.0-7.el8.x86_64.rpm +dnf install protobuf-devel-3.5.0-7.el8.x86_64.rpm +dnf install lmdb +dnf install lmdb-devel +wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-devel-3.4.1-9.el8.x86_64.rpm +wget http://repo.okay.com.mx/centos/8/x86_64/release/opencv-3.4.1-9.el8.x86_64.rpm +dnf install opencv-3.4.1-9.el8.x86_64.rpm +dnf install opencv-devel-3.4.1-9.el8.x86_64.rpm +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-2.1.2-6.el8.x86_64.rpm +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/gflags-devel-2.1.2-6.el8.x86_64.rpm +dnf install gflags-2.1.2-6.el8.x86_64.rpm +dnf install gflags-devel-2.1.2-6.el8.x86_64.rpm +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-devel-0.3.5-3.el8.x86_64.rpm +wget http://mirror.centos.org/centos/8/PowerTools/x86_64/os/Packages/glog-0.3.5-3.el8.x86_64.rpm +dnf install glog-0.3.5-3.el8.x86_64.rpm +dnf install glog-devel-0.3.5-3.el8.x86_64.rpm +``` + +Make sure that the makefile follows the OpenCV Ver 3 path! + +## DNN Training with Incremental Sparsification + Sparse JIT Kernels + +### This project contains code for the following DNN models + +1. Resnet - ported from [link](https://pytorch.org/vision/stable/models.html) +2. Transformer - ported from [link](https://github.com/pytorch/fairseq) +3.
DLRM - ported from [link](https://github.com/facebookresearch/dlrm) +4. PCL_MLP - A python extension of the `torch.nn.Linear` module that uses efficient sparse JIT kernels for matrix multiplication (supports forward, backward and update pass) - ported from [link](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/sparse_weight_mult) + +### Features + +1. Training scripts for all three models located at the root of each directory in the form of a shell file +2. By specifying each of the four parameters (the pruning criterion (magnitude-based or random-based), the pruning start time, the pruning end time, and the target sparsity), you can apply incremental sparsity to model weights for training +3. Additionally, by specifying a tensorboard log directory, one can examine training logs and metrics using tensorboard. + +### Data preparation + +Each model requires an extensive amount of data to be properly stress-tested against incremental sparsity. According to [The State of Sparsity](https://arxiv.org/abs/1902.09574) and by extensive experimentation, using a relatively small dataset or an overparameterized model may lead to false performance implications. For instance, when a ResNet-50 model is trained with the CIFAR-10 dataset or if the base Transformer is trained with a limited sentence pair dataset (i.e., EN-VI) it may seem as if the model isn't impacted even with extremely high sparsity since the model was overdetermined to begin with. + +- For Resnet +- For Resnet training, because of the massive size of ImageNet, a smaller subset called ImageNette was used. Download from [here](https://github.com/fastai/imagenette). +- For Transformer +- As a neural machine translation task, the transformer model requires the WMT2014 EN_DE dataset.
Preprocessing steps are described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing) +- For DLRM +- Training the DLRM requires the terabyte dataset [link](https://labs.criteo.com/2013/12/download-terabyte-click-logs/) + +### Running scripts + +Each project consists of two scripts: a script that launches `sbatch` scripts for experimenting with various target sparsities (usually named `launch_pruning_runs.sh`) and a script that runs a single experiment. Use accordingly. + +1. ResNet model +`./launch_pruning_jobs.sh ${TARGET_SPARSITY}` or +`python train.py ${TARGET_SPARSITY}` +2. Transformer (FAIRSEQ) model +`./launch_pruning_runs.sh` or `./prune_en_de.sh ${TARGET_SPARSITY} ${PRUNE_TYPE} ${EMB}` +where PRUNE_TYPE is either `magnitude` or `random` and EMB indicates whether the embedding portion is pruned alongside the weights +3. DLRM model +`./launch_pruning_runs.sh` or `./run_terabyte.sh ${TARGET_SPARSITY} ${PRUNE_TYPE}` +where PRUNE_TYPE is either `magnitude` or `random` +## Xsmm LSTM + +This code may be integrated with Tensorflow to make use of LIBXSMM's LSTM. Support for creating a Python wheel and a pip package can be found in the [directory](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/tf_lstm_ops) as well. + +## Dispatch + +### Microbenchmark + +This code sample benchmarks the performance of (1) the dispatch mechanism, and (2) the time needed to JIT-generate code for the first time. Both mechanisms are relevant when replacing GEMM calls (see [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation), or in any case of calling LIBXSMM's native [GEMM functionality](https://libxsmm.readthedocs.io/libxsmm_mm/). + +**Command Line Interface (CLI)** + +* Optionally takes the number of dispatches/code-generations (default: 10000). +* Optionally takes the number of threads (default: 1).
+ +**Measurements (Benchmark)** + +* Duration of an empty function call (serves as a reference timing). +* Duration to find an already generated kernel (cached/non-cached). +* Duration to JIT-generate a GEMM kernel. + +In case of a multi-threaded benchmark, the timings represent a highly contended request (worst case). For thread-scaling, it can be observed that read-only accesses (code dispatch) stay roughly with a constant duration whereas write-accesses (code generation) are serialized and hence the duration scales linearly with the number of threads. + +The [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch.f) (`dispatch.f`) could use `libxsmm_dmmdispatch` (or similar) like the C code (`dispatch.c`) but intentionally shows the lower-level dispatch interface `libxsmm_xmmdispatch` and also omits using the LIBXSMM module. Not using the module confirms: the same task can be achieved by relying only on FORTRAN 77 language level. + +### User-Data Dispatch + +Further, another [Fortran example](https://github.com/hfp/libxsmm/blob/master/samples/utilities/dispatch/dispatch_udt.f) about [user-data dispatch](https://libxsmm.readthedocs.io/libxsmm_aux/#user-data-dispatch) is not exactly a benchmark. Dispatching user-data containing multiple kernels can obviously save multiple singular dispatches. The C interface for dispatching user-data is designed to follow the same flow as the Fortran interface. + +## MHD Image I/O + +This code sample aims to provide a simple piece of code, which takes an image and produces a visual result using LIBXSMM's MHD image file I/O. Performing a single convolution is *not* a showcase of LIBXSMM's Deeplearning as the code only runs over a single image with one channel. +LIBXSMM's CNNs are vectorized over image channels (multiple images) according to the native vector-width of the processor and otherwise fall back to a high-level implementation. 
+ +**Note**: For high-performance deep learning, please refer to the collection of [CNN layer samples](https://github.com/hfp/libxsmm/tree/master/samples/deeplearning/cnnlayer). + +The executable can run with the following arguments (all arguments are optional): + +> mhd [<filename-in> [<nrepeat> [<kw> [<kh>] [<filename-out>]]]] + +For stable timing (benchmark), the key operation (convolution) may be repeated (`nrepeat`). Further, `kw` and `kh` can specify the kernel-size of the convolution. The `filename-in` and `filename-out` name MHD-files used as input and output respectively. The `filename-in` may be a pseudo-file (that does not exist) but specify the image resolution of generated input (`w`[x`h`] where the file `wxh.mhd` stores the generated image data). To load an image from a familiar format (JPG, PNG, etc.), please have a look at [Meta Image File I/O](https://libxsmm.readthedocs.io/libxsmm_aux/#meta-image-file-io). + +## Scratch Memory Allocation (Microbenchmark) + +This code sample aims to benchmark the performance of the scratch memory allocation. This facility is a viable option to satisfy the need for temporary memory when using the DNN domain of LIBXSMM (small convolutions). Although any kind of readable/writable buffer can be bound to a convolution handle, LIBXSMM's `libxsmm_aligned_scratch` features a thread-safe linear allocator mechanism which can help to lower allocation overhead. + +## Wrapped DGEMM + +This code sample is calling DGEMM and there is no dependency on the LIBXSMM API as it only relies on LAPACK/BLAS interface. Two variants are linked when building the source code: (1) code which is dynamically linked against LAPACK/BLAS, (2) code which is linked using `--wrap=`*symbol* as possible when using a GNU GCC compatible tool chain. For more information, see the [Call Wrapper](https://libxsmm.readthedocs.io/libxsmm_mm/#call-wrapper) section of the reference documentation. 
+ +The same (source-)code will execute in three flavors when running `dgemm-test.sh`: (1) code variant which is dynamically linked against the originally supplied LAPACK/BLAS library, (2) code variant which is linked using the wrapper mechanism of the GNU GCC tool chain, and (3) the first code but using the LD_PRELOAD mechanism (available under Linux). + +**Command Line Interface (CLI)** + +* Optionally takes the number of repeated DGEMM calls +* Shows the performance of the workload (wall time) + diff --git a/third_party/libxsmm/documentation/libxsmm_samples.pdf b/third_party/libxsmm/documentation/libxsmm_samples.pdf new file mode 100644 index 0000000000000000000000000000000000000000..19b257d36121ff946268d46df2f321b671ee346d GIT binary patch literal 223171 zcma%iQ;aa&wq)D3ZR2a(cK6q|ZQHhO+qP}nwmtuy+|1nMoXnh+>}0>~{j{oTRV`9^ zVNqH}IuxuxMXC`J|n1_E0H3n*@GD0*pQ8&fAU0%m3wMuPucQ1qhaR!+ta1oWa- z`cB5e#)h^=#!$SxP>xOx#`@M!ZX45>ma$tc_ntn(2M>fALzRi*Jb^8J96Y0|A6CRY z0M-cubJrHG>x+`u3Y?!`mt6D~#oauel=1|+Bo5+m~29Q<~#x#hrI^BU`g7 z7s)K})tM>#p_9N*z_OGyu_!Q`$P*30sGS8mP#rB>tqtnEG@z>Q^;W4P86JaDo!9#W z4KpdYff+2nwbh*mkqQX4#RDsT>HRY#I13i zpgP7jn+4u!rW22+krYrku)$X4akOc1)n^?Jt@tfxSUtMWDeU_*IglTJR~T88Y}GX`S(%^tgO0+WsP2JN3PBsl|2mYlTaK$^$H?z*^6kUTNM=Z)ZCI9g^zuGZKA9 zroJs`fK}Auia4|}+~ChOOLPYwd3M4#bcJE~d$Z&O%o0OFB#{jNFnvV9hc+1_m+fil zY%VYuFFB!B14v~$;SX)0$*DF!UKK-VmoP&E+r`bp#(!3>7b2p+iB_E+suO(;I|%PK zOmWml+j721YB)9o5G~6v79xKm8PgUb#qH=AUByW~BD#urDDF>Ye#q#>QBat_G=qB96$xlOBR26S&qN<$i_;6rZv`kiO~)wwArz>lx;+ zx2}d`k3Skg6w@Z4KRGx)dX>r%jO_tV$Nj)dMmHV5E|c_7$gS_uO;cH+4?gM+pX+Fp zmwXvcj(l2=U$NxQB;AgZzJR$iGsf3!8#7-gQglGpY9+kE7U=!X2yT;&Z$+PtlwT8GAZNfJ&a`CrN9k|@#VR{KY8TUx( zVg%DI)|@r0q)^c9Y7Wu{6rFK7LSGc8Vj(a2RNrptPPltfaiSXLm={WZgLJ%PYe&D* z9plLUp{!BY3cNPi3ov-UBoNyJiGi{e}=MmkWee#Q2g2-eVWX%qd%%?`lWyjxt0iY(Qp^7}t9enN1UfK!4mSpTB7ED%P1LgNoX%2qXITd@@b4)k6(& zkCG{2g=ScNuBL1|i$MKHJt3PUL2h-R`ylE?(wx2(`gSb~20j*b7f2YF0NQSaWr#88 zr{KcxRp9SXjSzPM&*-4@crp4yZ~ix)|Gz}8x)l3`UW@oB+fI@V7kkok~+###;Si<|);)}5?Gaolw* 
zMe?uh3aAu=u{JjpDB<3>Z-I-%ZUjNdMoQ1v8ezF(-zKsS^9C>;EwlkQq~8qWkF4 zHG@gl2;*Zotn4}N?41KkCHhJ{*neQsm;xa?*iRuT%#PYTB16oaJl*Gu#vdsb zSd;n`l5_SoY1IEJgXDvR86XDzkp@H;P*xK^_j=yyd-!#-37>2~2|?2d`TP0ShYaC| zL0L%)M{AA|I)8x{{K_#Owq57oWdE*V{aPj%3u~0xQHo#ooUW>l?s7>orcyoEfy24TD|zCe{|9YmL!Z?@T*}U+bhM{Qbml~ThLcX43^@Ax z1CsCg(H^rT!?3MWz=i33WO&Q=6k#&9fiwM8iMB`6GB&k=ICY;^woK|_w>1#j@o}TLwtibRd%aVMcc#-GNUnYuKUJhFw=C`=5^ADvYWws?2~um)bFDtTWn&mSsILrUyS{7GYzM9heTtb`O-2_t%1;E-yBW{HXR&Nm+Xn;3@*Bi-iaISe3SPImRN z=j%Inhu;>~a6_G=wIb8I{_qFN7rxC47qj4^rB8?&iVpd>vrFMog~BfV5u*SQl)AsEpaeqX3c(IC6?CUBgGDr z#`2+L=7tu!Nz{jKsSR1jHT}FV_A&BhHx??~OCdADH;sds(EIn~N724iw&wMCwq8R3 z1cODi&TM5wer@#U67pfm@hl(uhze4XdWUh0Q|WSrZl%inFB#m_q3>#NG(58iF8~ij z{A(_}(lK?kHU~pa?{OGIsPRdMcm1;caG`?=U&VFOUSX&eA@he%dl?Q)hGHR>yFOw? zQtj1~B<74q zVf^&_C@*}AD0e)vg5^?OawT-cNope@{quI&h6~9G0kXuIX}DGf#mZAwe$-Zae=wCb z+qk)CU$c`|ZUpyN)9+>zO~ zeuJiKN@7qi-lOO6^T(iB=gY)f-lNRJ`+jygceI>gHgnohi#{07b)PVnL4(BntVp3w zZMo0DIsW6Xr#`rkf%C5R!)|q65qMb@{K30<;k>$+i5mU^{4D{ zv&Q3CC^SWmyf6vWEd>;cX@Z;VUI#&u6LV{`{Vg>Lc8SSa)Ov&kvNndmbwE`Fok^rg#qPQ6ZPI12Rlo{Pm5BX zO;ZqoQ6zxD;)^uKGm!8CbPg(9@aQXzB7Bs=b6;Gzho6>3$%?hEPfwTu_3da)NGIH& zkP4d9UqzWHQS7++0Sbi3F&ZGo@rP**mf^&!R?K8yk1GZzFSZlI*cX@so}}c9pS*Ry zoyc+)HdpSvD)?oywWjLha4GIFn3iMC+ZThvZ34KwNfS-V&Ru_+6{`w%*a-5PTJ_JI z=61jf<9%G(p@NDDX`&?MaEA{xYI(Sh(%^1dM zMhCg<&SxsG0q?~tGc`-5S!Gl=yf$-f&o#~U3>@4yosg5`yx1Yl1K>BOc7 z8!48As3Kf-uzqyuV1#pHeLS##bPn8X1_S|Y(S=BLA1H^MI$A$n_JjyH{0KB!w~^uZ zA+PnzL~CC;at`>jf)W~7!0iqiMg^a_gAA)#FGX&yM>A6kV?S3cA1K%1Eg?<%4G6Z5 zqb>qXQ@UkBIQ|UVvyC+;JV#?6>ZOc_b8q#97c8ljg%`FF>a-`d5$cnrVQpkkV(-vp z!L|n2sH5d|Hkk5B3QZ!a-jme;jhvA~Z$@U*h_lC{p}PPHykzP6ujsYX#R5l8?C~q} z%D;<~R^{fm8{kUPM^*6wPO^`nSrL~y$Q5%ol4_>Z&jTw^@HJA(OrDtBM@fTDM6ec5 zf!Qn3zXcYWDXHsOmrsR1NIxfBz%HZ8hWVZ{^kPd}(K~lx#GH=CS%G%q7`hSYBT^aY z@24^GL?%FRfr2UGjizj~R3K4vI7u1`hY(m*2_f-0$G&8_daEm6#GV#K_(x6?ozQX? 
z0itN&#P}#LO{;LdmWS*_eAfe3pAizc5Vr3MW7-Wz>es^%B9QU!@O(vjc6Ijl zrQFnMpela^e(CQ0c)Ga0u1%%3@b1D3R1X_L#q#s3Z}uO%C($>8vzLpu$!T#1(8N!R z;k9MM1Q@f$fCIqVgL+U_W>rR0z9|#VB^vbi5F@3z5UOd>cR?tmA5@^C(xmdddYh=b zB9SpK4+29T3i)7O=3wqb(rx&?CPTHZDvGu)uPNL4`n6&jYJ)2-X-IiE^+u=DjL&7y zCwGq_kUH}}RZd&3D~YV(7Vi!`D*3bOn4jM$g}O-Nvn!v44F~d2u@7OTPlZir*g1;)t-I6P@MUE@2neUKlpQeu zC%Mr%Ww*ZsX^M#6Zh_q}oSh!8Z+C3f3MHb@6-EW$S=;g2?+l9i2HqUN;ymo}a86=x zf=v?%l!z`DuxFlU^}&W~DM2tUfeA!RS&ecUWg5^nQc;x^3=J@9$rA>7D*l!pQ;Q3(Pgc((Zup&Xl`8GI0rjj!|uzkF;3 z#+ws^70;e5Cx{)5G1+TK?oonC2&f*wu5i~m3pN(&S>hZ^J%dmhX50w?kOz$47zjTC zA0Wb_c3!*#R+`T1AOTM!AGinuwtGPfLrA6yMJiA=l}g>U^pZ<}6F?42qvnlSE#Ra( zluM7G1hT&n8|T2BpF&hcHm6F*9YLc=yqs)vU0@haWWmGrv#y^=mfEYk%o!1YliTuN zH@}42O+GwAp*Bc)d`);PBM3K3ZI{028oo6W@_|p6^)Wvl?S~^L@iRv*;3A z=HJyyv`@U$J<6Nj{t=uhG(3$;nz&RZtZd}oH;O^l_`INabx!^%C^Aq8Gw+r+oxy|cJw7uxZ=uJli(->Q%uznv3+Yui?>j1ptgEVE|52WC} z@A{vj&*nys)$+}J>L4Y;TKNW$?fyOJv~qCcxP1ha6j!J-qS*j?ti@5!_MZMUwp;r- z-CDk_9j)COx2eun)z#M5RVikrHCVmTZL8l>A0**;Q0Il>UM@@`Bf5Yb=ur40SPUs@9G0>(S*`3LJDerU(8EhXiV}7*8RuovpP$KE95F zO9|)-+hp`bx;3_A@!v2Z=3WJbI@(83<|Q=Y<9%|I4{bBJM-1$`c!Isv&bzKaj`5k6 z-(p_UJ3O3ivOcS$2)@FP$=wi7dlE$7d4571JyOPB6w!0Ux9`IgLJTWrUxos=el;RL z_FD|d9DxX-qKpzoW(-jv`?A&w&7Xuaq4FpzjbixiFmbaN{T z3UJWe=Clj7E}H`3UY+g7a(H-+%y*81Ms0V0rWA@j-6Xdu50vJM4o35^ym&o-(GLTE z6*9p?i0UE18*QaDEfeS$W^moMZ@j{A)EeS(ZT^S>^J0f6Tlf35 zc^i$-vWDE|I3@4v=7^&8R$Y?LTF_yCbOSDZAN$ZR<4u1>!2)&IgH~KUSTM`Aaj=OU zN|=lw<3x{gP&ncT+2uw=!f9B<!Jlna=;MPQ6#)Y%48~aej|J%Ol4a^1faMEJ)CE3^PfKt_1L&@98 zd(%MMX`qA_IW>8BM)TLxKs*{<2nj1&whA>r;?LxMkS>tIIzuyJH}RGSW~k&4GIpsx z4tbXkPj{dw{MI*rBM7tI7GE#Rh_PI4r_jshxE=D%R>~5lK#liw(m9v z1T=dQhV2Ir{?PQ&7{RR$4>aaF!{3Wbc%gvn;`leH;>47uYwK!HgWFCSXB;vM+eO#K z^cB`OQl@x6(LB`jdKoNR^AK9={ulFrLm{7;ZRX09Esp)sK`07nC_Bi=@}~ZX{9sSd zO`FCBuOorP-4mI_&eujXcRs6+^G=V8nrxTc-br&0h`n3*g{lSTR&i(Pw0?mE2 zHC-dTzXzEQ%pCS!zLUZ;#LnU6;N zo;PfXR(m%>15WlbsSfmNh@N9T$N|hl^9S8l_s#eLV)Q4@hT_NMSjniF?Ty^*}aG>XOe zU>X9;R#*w>{vlGAl+BTGAk}*dcC%Kfw^PU|6n@AioURB{J?(-xmB=jB0?~P^#MZRX 
zH9z67yn=(%&&J5u+?p5-6chIB;lJRcH`P;!3CXw}5hXweKE&HkQGqa{S>#kn6Yguy zF&c7oOb%DoS~9g;^2o2pJpb3{jaj}6O*l{yUFOGIUVD8-8aLOfJ(E?e%^Rf zI7nAHpq;8;yAqn}yhoMO%@TslaW3s8A(jEl0wf8cWz+L|rA=2ToTyo$V)UTPR{esK zwose$&?(!P{~<9ZODQ0xW4GUucd>qgrD1a7!D#^3P_J&tjY;*9+H@mu^ef9{%3yi6N{V<=kl>DTrow|Bi2eEY`5Of5#;MAjOpz`u_0?G& z1yd9@6*}`=L`aU7z}+=1abAWDasoVY7W8)Dg@x1U&+?T75MfOmMXuHeWMgqhfh3+VG?0%Dhb zg}7~-e_E?%{sd+)17*vCr}1S1&`o4nfyWwyTLKKMorxnTSe3MPdz#O)fz zaUYhFvg8zjySciK z@z>o>mcvWxBI7ot&WiBAs@?vDR9bC;KVJuL7jd2Bai4;u%TqdIAW;bVR@ve=4^UKK zvKx%n&s%p0?vxb`WiO|UDEclzUI#6L;OM81zB<}Ec(jo&Lc{cE@sr)-E5>SRaHzeW zRjr#9smw5x;6*qc`CD+0R&XgR6nJ$+|9ONO|aFl0rleCacVsk5&q4*7Y zd-3C+))A=2>NMp)5F9h5hToxYfcC-`thI^6E3|v3AjVYbHg=F+T#;1J8-7@gCi#66 z(CpSX*{Vv?U4*zZXmCMDTheD)KG&A?upOohQPUQuAgFVzh@Xv#r&!F-a?&X=f{LfulsY*W*lQx&UNEh6sKHh zVGgfH-=7GWvY``WO2O3CmnP|KyIG5moA&F4-V@sFI^}2EHhmN8=`icRdsi+)EXs}| zpg#3yaBd#t)#=Rn)<4Hy>3l;g=coR&?7QW2_xoRWAS!3La=~)k25U8LR0pA9@7F=0 z=WKlO-V6S1kJ+6dU>*<8c<+1TU^r(4HymywD*pK)3lsH`?3o4{C)&4CHkT~5#p9Oa zzX7X?zpY#J2T88Gf1YL1YqM2BS=P#<$nP9_pqo9vD5m-dCfHt$vuS@%R`9*q9nhxx z@p$}QPb*;ryvPSBvTtX#qs+2r-x81KmT_I5E9?Rw26XVy9$``-E0YVB;_1SgWG z1`|t7H*UKYn_js-AxgEwPx{ta?!MnWcf@iIa9jtGD}hHBy*%z2{*p7Ap)EMowPz+*%9WCvmLBS&?tHN5E2~aW1TS!4{-(#lNd}`xg7);zL`+PZ ze5u*2K1m$KhrXZoOSQ^$*wM!1hMl2@P}y$dy()c7DISx?a-|BY z0)Gj~lJg;NK?;Ar-M*sZpO$(`1^jSo7aa~rl1N~b-td0ogeNe50=Vt?{Vfca8eC0J zuu;3q?_7==SR#z&NqysR87hOtnPDsh&}+ddWEiQm)lt9Vb#joQd~bx~wGU+iHiE`e z+8VDrHw+)LWqc;`!Ib42E9Fs`LgQ(-1*Bq249r- zV=)U`my)k?Nce6$22k_hwI1`M5f-WnL`D&rNp^UvUSFvY4gU$#wT|Y6ujetJZ*5dH z2e6$k;yKp1+48G;@GvOHhl2qo<=Ds1O~{J(i#%)^x^hfoQsfspi|cMMKq28uP6z(6 zM&9aCR{ky4QOnWFVb^=)_~_tyqj6V!n5dVnkR@%x8qhVWols_~kfr1MiodaGj`*)K zhv|RV0m|-n#su_o1{O+A)=>1a1dI#}|CPIPaC9PI<6!wesVqh|R`&l?1x>4O+F-LG zc(1A=oJoD^(a{eQs$&2;2y6h+A`DT^tuO>6H?@FDX$W*$WO;edR$Y;}EQQ2pwS+Y$ zxy-_xj?!nsUpA;^N_mqVpNr~RC+!7U+T4x-91mZ*KCbE~xlmy-j zLk8l*0fb)P3=F4<@j^VokPIm)+~8(L7$GAOoYxg8k5xi~LCFu2g1U=r16KEE4xEet zxe7|pnkkxqN&RYcZD8Q&iU2EW2w~h_UK#<>=}jEZi2?Q<2HYPX689~B&R^Hg2c@4L 
z0H;4z0R6uYc&a>zHb^f89at}jsqmeY9vmt~W|i$pD+EgAfb zThB%(jiFsiE%N>XtK!JCkL+iJSmM>JSfO`nZ`EZ*-;8lE;Hb!LJU6Ms6Qwm?j9!#p zQ1ZlB(AHLrj*bq3EgwqoS*9*a zJYP=c9{skadiP%R)K)~Q`?h0VN_3Xd&8jYYvG&}cMOtY0mZLeZvRzti$g)CH=)&XJ z0MvGR)Mv0|>&VRZ29kYJ9*&a{#MoVMcK_Q`Tq{hSOB0_a?* z&g$2cNf3NZc8n9ZE#B0nV&6256pU*&ijLw(!{&^JdC>Fc%4t2^9P19Bx48RxsUWXh z{=r%cYw{Dk;4mw4NJ6!aFW^~0*!rc{s-7>KZ$0o3zb*c{roPD`<0BDZUr|iLk?25{ zN_<%$tedA`(5(?%&3Uzrn|y{r)#oi+nD&6+ww}41?y8+JCHkq*S$%Y8=G&k#^<}-R z;mB~_F5!4^QX44bFQb0a$@L2oWaTa6oqjL__TFHdbk~$~Eo&&Fzi@3l2G5S!kl2mM zd`83?c#3zxhq|=w{ml=VI8-qpLH=VoIql{1!Zy|=Vq>}B8~h}yxs)_ZY$#jC2@g#& zt=zyyEL6CVOfXImo@y4e1Bq7vM1sY?)JR0aMZ0YeDOYKXN69Zpd>QHDs!sfxLV#F2 zld^_9-QZ&O!xvjLMWIy=5^h6dSVt+~f%Aq$0Q+1I5>91XK6H1a_oj?Awvg9OG-x=o zFtR~Zq+E=&jSG`&?V(q+I7BL*6)jir$N>gPIymZxX?Q)oQ13)99{1Jc7+6Erx2+Nr{9RSfl0r!a(JPR>|fP(8GS z9_PZ)tnap*rqyfVX|Fl3<2#oc>NGh9!j2m6oB)VrDXdr!K&D;8_tn59K`@tk~< z@#6=!te-c~M}v#64^U_JthN)U_nh8aK{t6)p`m5SbXDrA!HZLa5V0A?HRoIN&XrG5 zS`RT%HviyL4#5fH2FaAK>umgaSjFjLEMMzgfMh1FLq20Hj~8-_L_r4rB7z1HwL*>t z$$w=(7ZJBWcYAj1A}EnaiL_h@Hknz6R3@ewuZ6ip3=#peN2EdeVl2-AEJC;!2h2c@ zDiHr(MJt4L?HvzZ70a0tocF928Ao~fDN9>6e0zC2>g+zNk;Pj+d|s!K1y|X#VEM3@ zp4ZqFkQ_LQK}wsD%tx`P`{(-UxD`(=-gKGAYD?bV_ z4`dvLQ5~J}m;l*;Du77-m@N&dOsFD{h=lui;>mOnW629r`C!&D9)M7y21Z2w2%m^9 zz(Bc3`G{teA~k=kiVnddk3R5bMujWS?svwa9h&O4L?&LFv^p4J5(s<7L4?rnooW@_UtyJzk?nr~>wm|Y z8UJwn$3FYD+P3W$D~#{0uKtq*m`4POFZ`NN0-1G63uMNnd>$>>XlW<%K_h&Mt@`U~ zY=O~ek9K2`3^d9lv`AtD?^~g`JLd6@`KjQqs)3nY#Po=Wit>cx%;JK0gS5ZPwBbsHx2_Jz6=8B zA4$Qv{WWsbC=`&Rg8!=;>Z>o-cNJ!oOL?T85-+Qm-945 zZB~w_LAq6_lqI_93?D|R4X&89)CHOtp~g7KSP&UdaIzA0gw5xwFG%SG;%-4GA#!zo zJ#z`koium{C_}J-FDTf^F{Mr6ya*7)Iflo65j|x{uj@g6NO#abq^D!O!gxpAl2hz+ z;hB(v4t|MvgxN&HI{BBwq9fZjt@Bw78N&W-kQ~tlt%Kftdq%8E)fJU+%8`aP!hh|a zRC_d16?fr`k(9^+;3n<~{6<78E%fWlDjynAvC@ZWNiIc!go77uQschPfo~e-zcJid z8Z*4=&5BkgUg{=1wkwi9uuHdK?U5AX*@F^uLG{)6jYP)}(@VDcq@Z{@>7{V$pn zBa8bhmT!cYVT<%Czs87MK28v@e2C8mNs0zUbuxUn`)RNs@8?JM`}#~huDRR%!+XLb zdraQMM&0_DVYooc+z_w|12EGir1p>%EUL}OQ1`&to=_EuPfhgpax+Q8?-OLfMR~s} 
z>f$42&_z@e&spgwcg(F+ii;dYRq`!Yh+`KrC&y*7L$IP)ogF~~(?Pmlb#GAkIu?Q3 zlwP*C%(+R~;STTJ6b+F>SoO`2LzI}CfE&5$()!C*2_hlc6;k5B-l>bQNNFE96ep>Z zR>xFb!g$n>(op_TvNb5D_4>C^jK|Y+QMpNzos7=4VOZ%BR(s@dxyLkh+2&_sL4*Xq z^ou4lYs%_Ym_EEXaGfNHX*csoVl@6Jnml&!Bu@Tg?B^dA zHn5xgLDJOk;!iW>f}@n9K(}YB7iDlPqAs#5B2_e}!ND+#%4f-Nh*l<|ott~~H!|K1 zx8=C2VYm}Yyri3C#In^T_W^3ufBIVC*7u^v%yiyWtM8Q<0EMsmnlc7OaCPO=uGR9+ zUv;ZZ{b#xoP**1iF1vYjGjUq;!b2pGMBmi;GRuXs{k{&0R7$hfvHS}|P)y?z|6xaM z;d=i8x&Y1W{YTncP){$gwlVXnUmol^OIxnHS>pz1w<0lG|Zpa}S9F`FB- z?GGUAquXl+ir*>;CJHC@+E-?Wh^rq#?-Zj^v2d$Hb#>h42s4r9DXE<2u~tov#xhm!gDsfNqY2?QlFBme1|>NC z-}AR&s&@DK4V6bbQyf46_Q-BO|9Aua=tnpF9?kp2aQZAh|DT!A(_0@(p)DR`?n@o~ zWW2v!*oW=Kw_R)`$xS(sJkdWJS`AwFQ9i#2&i93uFxrQoTg82@Hz^jgc7BdHSGtq8 zQd=x34M^txPB9MD>V)5gr^qx4GPAbczuuNRzHB$ytD?+$GttVdj~k(aP3p%ggxAQw zOKhbQt*t2~dYJKZ1T#Gqtj4XXYADvNruQA1E5lZz))q@|pAR+h0HwZl2lt~^)pD;m zx2$_EP}+ApGXc;z9Fa?Y**)-Yw@$xyc8ck0s}nc<_R4GL(H8kWnL%~--`RoZV@H$YE11@X+H(O2QNOL3Q;4b<0NNdd@T%UL{xnIDW6+q7a1}WGW z{tZ&F{`t?1CJk-dO;!}|+gb)Nm^l2YyUe8sz)96i8kuX-8~HqHFt)!)hT*vE4xKlv zvD`_@R4&_pR2Dg(C+ATq<5N>pk9ax?svFU)dAx;fwOT)7S#uyZ4U&rf7+3$8M-~rB z&NoCM&$w6eS#?XsfFJIEx`>%2TFHnFHcm})o(C0t+vjM1iJj9!tDJ0Q|dHH>^ix3tSZ z=ap>lnps1b$n9vBhg1xa%+7GIvf_2S23L;1=Nlnq*6wJRUD}oa_9+ zdgMJe8ow2M^<<(U%G0jk#F&CNa_nKiM0+7pwk!OdTjqbc0~k@oRllq*w26+vOe{LtEH2yvbf%s#fUbIL7I{I;grpvzaMb!Tr<>0r+vPm z?*C!#>;~Bdqyxar#~E++7CF?2^Srgn?$ljG!x1e>=N&7HS{sAiAerh0kId*IOUM`c1@xe=yaRrde+2PBcO)~$6&L%xwBcr9E@*c+TKG;odKgmkQF^#2N23dJxRMf4l;RY-7bzJ+r8ROP3etfEz5w9n|;%PBPk$
`2N{+g#yMzTo4=)18<|@s5y9xc~P!?Yx z(`O&;o)m>U%zK^iBubMQ_Ds-fA9WH8!D%0IQr2hvIcW)XX|Mfa_?XklVTsi3x>LXW z;>0J&42AlbD+4ZdJ>=p&N39g~?&nC!p@?6in7-`RGvpOX%9@mL%}DQ%&d4zQjtrOwGD#&vjotDl0=Zwl@A1kTySA$10Gy0OaleSXu;5HUjY_L9+Ly;X3@E{5vN9dWvDQ}NRy9r zBFLX?=4^Kekfh|xM#=%3d1(zj={3trCmA7pQ8)WZdB$fzFB~eo8M+4r@)IDU6Cejn z^t#GXQ0~Ml1LvnYH11nPLO1%?lhOU49}VUHv;sCA+=AhcF1Y~$TRDs@#*3hQ>}hqh z>yl9Z*;;i>*bw$R6bzz4$_aWxcwRgnj|OOGjVQ#NjgO^}ACq|S?(n92`>IZ9y+kG7 z!FkBnO%GHgQ-<@r4aq>nK{5-oXF>tAnc*;a0}f(J>Zv=ZdvpOBQKENe_E3mKHXSwF z%F5NX3^Q+@0ukaRk*@9x;Yd-!S{i0ZSGfJOKW{)*>9$i3KrE6FM!-)$g~vj9noY7z zJ%1n{3BgVjJd%_{y&?=>GfO40^EKI6;;Ne7OUU2M_ChZl^oxBJ7->b z#=`a-31*U7^s zOZ)~|x&`UTX1pCSy!u39i|*_&D(C+(K4fQO z{lDWw#(y%%|GWPGOP$rKlMz~CFx$JeW%Tl@=h_Tz@oDRzQ$vv?BhZFygnA|eZZI>u zr)8s%?or_cI!>)O-KDe1G0IfS8oVcJ)T0^Dy`D79^%$>6thN0_Z^!+JZ^?vht!kle z`&MT(S@0|zXrpu5Z8#;tn}bN~>4W<8>Vv>u1H@={dvM^gPDeCZ(MtpGW3as8^~H7^ zE@*JGWI7!vaI&GR&ne9qV7!w!A<8(YVJU?V90efJ9^DSOQjyFNr-BN5AMAasmw~5X zCY@gX>#ThtqdIgLX*|$To&BI-jv#5U=3%^GF1Jg~iNG#?<-1+Vbml5shd1A+Heuqsk!qy4vyl-932(FHS2y7%+;3Iis}qFdW8($fBMKEyUHR zZXkhWtGmIN02p@2fdE%TWzL@#z(heIszvNQtr;|r@mWYy(AVYNti(vwx{}iLVpKl- zISAP07|3e~Xkj2`!u~Qr!<^89f<4la#+hfUoG|OR;YN9^1k;PgX>go2>-B*enNSGX zjeI6GDi#R^Lq{bOB!kmX9Dx`JIjJ0FA<_;k99~?ocYj^4KC*YaJwCp(dwaTm+Vyz9 z`8mMS3}hZUIsVwa-v3;;+pHf}-_G3rlZ&3-F1@!n0Go)zNSD@5`g{f~@sx*2GSAQD z>FhdscWb7*{4mM#dAlb+3Pidj@K#8kydHhM)%8W}#IL7OR+$^b%75`(D_7nw40Sdr zstt^Gs`%4V$4FVHr>O>SkfaV>DZ2A45C|ky9Gop3@Vq)mmA|5)<^()+qKn@dHAj@O zCcM)Snk~?n4jtHASsvIfFG-JXB8}Otr3YTroxyuoU}L6+HnB3qWPLtf8IE#GrYx>X zKZqr5!Q-5ah~*NG=Yq_(tfGvmDaZ83zUi;6LG;PRJ9W*U>QZ`Eab+Ro$;5_rXN;c; zM`UcH>6*Y9znRrZP-=%QA*hwQr-l)nqYY}OD2}t=ujl##lJ%H zI(BmfJLCx3dPH@4e8bh@OOL%Z$ss&A1>m@Ti+Zn8yHLJsi?Y66QkgdXdGS%MZf|xQYp9GbX z;2c@&GwpV+^?4**7Qvwv&m~Hlsl{rSRIbjuFDYpbxedBeER&CJXrZ&#$BSEOKsbxc zkUP!UzENe8WM_5D9dA>}(5j2d=?xo^FW>vh%d&G!aLh;-Ds^3?-=<4uTE;v`$2lQK z+wjmr0dr+`uxgr9p+@2tAL+~Aa83W@n*D3jV`Bcd$*?~h|7laZ#M9Hm0}SMmrL#0{@ll)S$pjI 
z`nr2<;rK8~hS#Uckxy=}_w$(wj*L|rS$IwsEjbfZqMJc5OmR|TAMs-AM2t(}w;&J@}deBNTINaGYNY%mw_QrVt z8w7-kCj5~JiRmc%R4|DTa50vF2zi8Oo44j7C{WkvYB~UCB1Lavu|04uJNgs3nCtjL zAlkfJ#zi9zV+)VvspRyUnByzKx2=d0z02&Vda?+^e*$y`9Iy(TFiY_``ek-mfl1p1 z^Z{CgO>xL53Wb=s6eA7t=!W^N>ErluwU87*`(ySZD1T4)+`N_oVpJ77uiz<(zUqXL z%LHVuVVeaSjS~Jz&S6qw3OCt{$884a3CcHKeEFC%7*YM1Rv9u7iCPryc3>#Y0nHKK z@<55hp@9>9;b^jFCQLTlif#4;7L!SRk*%-ysKwK1n0Jm!-ZOhMz+Q9ASc&@rj8%ICxf6LjO z5-~3lW*tZ+!uaPQrJxtzi9ZjVesQgnVAeY}vN(#mM@cUYubCXhMKTBi^lA%(*?Q9l z3@PVn3cl}+zf!&{pZ>TZ0w(-SGbLv) z4HcjY0cGVjj#qL4x&`c>rLU7|_|G!8#jjn5@c*Hvnr657+fr+?UL zPn6!&s~b-Wv31)?v2*5VE={sk99K2z-(9cM1Ep{b^*9+yK!+Fhz-GA%&|`JHzMF{) zHKJrW-Nm+R<3bV_%ai-p&l~YYj~xBdH*kTg{wA(P$Qzc$;N&@Ka=-H0K%R8Gfl5p5 zNT2^N%FZEJl&IO#$F^p|6^_PjFJ&h@RV zqK%4LMj>-MMt$*jq3%SDyB|M})yITacyydqGGC2uE2n$oKFlixh%N1}+-1Do`YI9c z_(P0;Rzy>oeO?|AKy((_Y1u(+N2WH>qie_NVL1Mn=~ zYi5W(F5mOlt;gnMut(8S0ks*fce<*3Tnd4+DliQER?^}`Iu5>;?(eDxOr*yz9WKR%P@9mj36>qpx+#a_EtI&msbRj^d!nrU8szxT=V|J~4 z*<6l~a4cfoYrNOYCp%ay4zb#fY5()3ot!vASi){ZBRi$bKs!s|hG>Mk((6_lj{wie zfsG>%U7E=@UZG~L{OD^w%i6rHYD(6347Y8OrUeE$E4nSns*`=4DMF~`^~^PN6~rW$ zHk%i8ns$X#gRGH-f<@Y|c-}!_P~~6>GPkH>ZGfx$>!!5}or|SmhhM?x7I;`Hl(BqX z13&^=)_$EU^AdXt<4sawWdC{n=VOkIi(1>RYr6H&bv8Aunj?+cnFGzp(e6~qt{bcu zK1Nu%&aTS{!Ng)Jy|ioRe$RZNxIk)=5wE*c`GjuKJ@lEGi^q@R~8gke{sbo^Jq=jp>;!8hMTo))m}P_U{wI zG*>bKE~55K*EYbIlkpc)arpM#R50Y5-&mCp1eN;}%CU*I>P!jj*JzXvf90sJO5C^k zEly9?kA&FkeE@R6hLHy|q;d)as^4I@iee>}0TVfr6#ZhlC3?2}XmW&OB6=53SE;M? 
zHI7xeN+L>Pqk}+*rW;KKgJn5~0WDd=zoB0XeDR=_h&U!l8tccC6Fe4ihqqT+N5wE# zlq4fDsK%b;;%8meZ95$Ei)(zt??sfu07PZGMm6HoR(J*Mn}{`|CU`2Z8CaruAT&BR z5P&UYu5wTMIPj8^2*S!#bSad6p}m?=3j*MxX?pxH2heKIP!7!V`B+L;J4}kAm)v!B zn2!?ce0(CgtGDLqYO%54^<*#Kp%p8QDs^hgX!eDZ;5Zx=payL~o*gfIa571%;+17= zJh7NibcBXNM;Nr7j^k!ZfOkSUnZ!fl-Amv^0cDzSM{YvlUH`~+PBJbM%@||=ArmBO z@VdyP;1HkbA6*HLbz{2Fr<3L!_Tf^mvndT1iA`pEGRUFN;kx(S zb&VuJ$}Yo|8Y}`rH<9*fpGE*cUfE)abh7;DoOXd?F5khN=~QBCduML$R&-fB&g94H z=~3_(zNFU|$dJtj>*wI;81fmzmW|7>66(VCOS%{x0dQ}Y#MX=|Y#2w8_n%C>dJ#U3iaLV{Z8eqD=7~Rn( z!D-MR+JVP=HP!20U<_mS1+V`TA4B2H*8OR-iv&5 zuoS+gM$4Pux=HQm`z<38oC@8PYGxn)q?1ot2o}?hU7vrIIP?x<(P!q6tmsH2)l@6U z%R!bS4U3HNI6-Uc$P&?9N5JpROElu!EfMFKClBB_gsy;-fk)NIL0qqk%l&-Kq9*-1mE0;^VwZE(C9JTl8ETFOiPHc&25e>Qe|^c1G7)QbR}3SDpAyH#|Q zsLk|LXGaYpC3z%r2^ro3g@;|pFBHrW$mee|CbaKE#dAn9?AMLDE~Poa43kEZS#T3K zW9A!D0g)XZMnO!bg}18$4TV_C7LErNKl^&YcqEYGVJcCeEKyinK0+7WWE}!vFk^BB zL|48%fG(`JzQp5RG0)Pz(J6z{V;-5hbMtlmc)UzP-{tRV`x+J7AB8w?0V)mECH{cN z!opW2aXp^IhUz(Fcv{aYTJM(ssaT8IucQtL`d=Bn+We=X_O?hThce@WIN`nD z1wI#;%w^Qzta>-}j`(p#gpZR-M>zQX+^qnAhZ?Xk43}L(+cn=$;jCnQuDY!oS<}WY zPm1b>CK`~SAc+dMr3aCq39+`GODpKH=F+XFch_610$hsq-(+7D#LsJIg77!l^!!K2 z`fOQRc^X`TFwTtR=UcUp7+z>wW#ba3C|JMv7z~;Yo&#AQ1XZ2*$A~#z9WU+c6b|eY zy>KVDZ=S+~kUxSHrR;&p1qu&TKHDlByK3<`bXO4}yfd5@fO;$alVeJ@KPJe)s5H~s zaN9&F24}a1!l3j`q7FtlQ9rs#;d{SVFU?5*9v@qMbsXH&yF-<@l!xFfc2f#S8Did! 
z4aU9fnyAeHel9PTNu)^JYSBD@&8Pm_nEmTP-J3*;(-RQzw|e#)IB0oQ#H3|0EPP(6 zu&Q9pC&N6G)XRHi+dDcp5sAilkQr%eJDBp6#p4~BZ1oG#=|Xmx1J}KC?K)lKgQ>mc zde@}?xYbn>11Gk$@kE~MPGeh(W8?Wg2LOaO5qQ2NgzXsfE$wh1ke$pCZ2KG}9~72v z6-_5?@gh|0;Xd*!3XhCa1SIp`M^`<}W?8_rHuht=uFh<&(v*quhG;nS8j5(a6Wnn& z(A)L?HJL@Za^XjuzK6l&)W#R8Qie!+@(XC;3P**5?X4g6eB%S50M3RmyqH@{KP3Y{ z?dAEr;D1#e5B?7d=pQcrAF{gszu1_#!`6Rn%s_jZWmF51RIvysL}V8ccIf=ztLQ5r)5A;=Ds#`Gn@&x8>ikP4 zKBG3T?OhrpZ1y%CF3+IN##c6rGi1v*f_t2p^IH@CFShv|9&>JKZ}jj|n98nSJIQWY z7r1TOZI-PJ2G=Wrv&u4do2JzK>NfLr2ybNF8kRMItKTCn@Fu6mFC)5NmmhW_h&|*H z%i~0oHtD92i@~q!SD}?>lck~K5y!lJUaOtZs9R~%_6@Q*o1^5Y-?{3jkp+!cX2NB==oMbj#7wTp5%UU0x)i zca{`XBg`Z|ezx9ri|D9qv~bCJG$wQYKK4PNOxU1I z(BWfqhrg2?`{duomQLc7gKZ|TZ(njCA+vf!$Tm7Xi6K#D70DHI#-SC>bbb+uLoI<2 zM)KDNQPMkO>-%u>W2de9LEF~dhX%0U*|r7couOeU+jZN~N~y;H>!JSmmOtmmkggT+ z29kK)I|b=U-TPKg(0wyEX(D+NYIN40lIYtLV9Yp5zusP9Ii`VKR)GO(U2_@U)DUl^6KHL z7*%iQP|p_Bf zfk-j*?jwSr^*Rk`q%h5Wz$+1K=vKgXeA2R~7xPRw;Z`6E2@t5B^|Hk>l1U>Y_^gNW zQJkivn)F)zup=y8xlb`uHCcatR`G6H-HQ>o0iAgjwBXY&F#uk1*r<&2b0LvlRECPOwnFGW2$tgMlp`M4AgE7ey_b7+N^xN1cA^ zDaC11&Ewc90%EFMaoICI3BGGET1hnEyDD`&sIz2ft3u`2^TtdWAS0hUjX_6u_51A< z73;Q~%2fNsnsJt&jPO+nwvZ$(9RF|@*UlQ?bbIP>4vceZk$sfEFW*~<>rP+g9j;r>v90OjORe@3wSL0G{_@E=O@oXGu<;JU zVv{7rPB}3qsXO=OUm*wmP=Mi{0z02cip$MDd4TzF3Ho6K#XbUZ!T6L%fJM7gie8rH zVLUltIA_DpCCG6-nHP$I4gV6uH2&VgaLs_9jgjG=n`igkHs#ICj2cWkzzKxqI3E>h z0<9F2lS1bfC%YsEM=&nx1Z0v3s`k!e_x%aAR z6=d!qW>GX(WEwqex7GQ6?O$B{ql@+azMo$&cX&w~;%>;y8sN|#f zX|v1frMIU@c!eP|aUeTuyJ_q9_=5JGc9N&gP*5Y-pOkZa(>`@uPO-9Z ziach3&NQ~b$dXx*@}_r?0F^#xm!z3#`)*yok=y8OF-VLR*f2et+z4r^U>K%DF*h_a zlv5ScESKrjAOjnt@9&R8kAUiZzqq)7UPGgZylf;Y6-NGbyvzgm1$uOs?gsTt<#Z)G z(+6V`P)feP^jh6XyV*w=e!a|F8)%UJN?9pQCh=6ac1xL+bvp3mEv>Ao+%)UVLl-L9 zf3o1#8*nQ2wrLBQ=fx-$ znGbNf)!bx+`Q7EOJS;fT!uq@q(3%swITC?(^~v5>PSA{K3l1ly16ZMXo|18$y>q?G zusa~+uif{1PXWyX?#&b2?TVI8d>Kz<^1(S;ei~4U(#)R#8=XrrF6G8}Tlt&O$E&8u zI#K*3b-Pseyv9^(lleg7j><3Gm9$G?)T@@7%t_igV%B$>QMg_y?H#>0Xqq=LoSO~9 
zNql^L;yIqBbDTS3DT{V4P4(FNXt)+7C;i-Fhk7h6zwKf6D8|yq?O5c-2Aw5B#faQ4 z$6?Ghja~ zBl?DdNaLnWWec=VryAkM}JerYc{ha&4B2k_5F!Sk|_SY-%1)d7gzg8TOB_>Fo6dWu05PJxFXrRL=Q<-*S3%?vVQbsv%P7O6r*By*yH-M zAZR`%MV9p=Iwclw?|eB0;_xg^VRdT*Eb;yn`m*{|mn;;TXwT_`XTDyiW_sS=0xY0U z!dFtcLx!10OuXNmSEgl-J>(MeUNQmlQml9+tjz`#{pEP4Xl_C;seKVmNy+eGCgAhb z*FF({4-8>GVVw{YOeqBWltZqJ1|<`u>$zR_B*dRumeEi3)0|473DwX}Y}bL%)LZO= z_rw__dV*onI)&DXVeLvVauW<8pa<5l*lC0`6N!uwjWI`MnMulzlYAUgwQ6Q^{|V;$ zmjjxfkDFt7X8152+QV-s1kHTo<5Yt2szz+u6OE~$Skwc+u>2!0HH^h;;=)GWcKzdW z@rj@{4=$}fz#AVHH3@I8@5QV-IV27UXqD3M?!yE_m{Mz+TaM9LNka&CvEtZNYxATq zGPzSy>`&u_20(>rQ;iz^ihRHgZOeaj2g~GP&Iz)29&}o&V){UAn%4X@ z*Qg9P%ymC%Pw$u!g6(<6DMrTrT_mhjAB^lu!Y7+%2v99D>@yJZkq? z)6mG7{-HSIQ;czKtQ9W9O#)g;(?g4E??D9g6#-7)VA@H48|&&(%?J{oo`i-g?j;n> z1GZL$%6-Cibtpn$f6UZu`bG=Ncfi7YZ!U5FAjNTrlI@A{vu!n7GL`U4g+qgqf(Gky zeNBY4`8g{&LlBf%OnaKkYLvE}*>SR|91(PY@Qsm#!k?V#ssRq~BeKlJ8%dXA_}{l1 zlVMESHr&B_c(6Ju`SFC@+AeChbZS|3JV1Pi#3^e= z{BGf04lH?N?35+Iu46mSB;Ip3 z4NRY9K;Qc46gn~6CzlXg14i(>e>@fCxoV0oe;s)ba1d#jmL&sY^bVH-qHkq)175n7 zKBU66?8Q=3V&#aH9epd~bXgv1SVB zJ2PMGzHL1@D|qC(gl84vIDFQq%T@s7y^t1dmue5!+)tbLnf^mR+T;f)`sHqt`UaIlOXRd4f6FL@vi95mfa% zk-x;G@+5;@T=tC&?M)TVPGQf&s;LTXc{b+eLf20gZep`Ksc-&PPJ6cjjzEK*OPF(U z5h=JKcgxRazksZdnOpx|OEEJs{wFQP!1VtyO^VxWzyJJ(I=mSi?`cs(Ax->1K=JaD zq?NSuidALNKNOWR!E4V>&q5MYV-Qr%307G zc!~n+Qx~=Qe8Ssu*#U9YxG|8KqN}Lq6@DeFYt!M>i_njxw(QZBJ<6pK_9&V-wpXaT z`B7_r^>v%(mjUSF&w(o|LuAIqT(FQB4wQuW2Wu{(YTGxdO;pve>|n7^-)0wextGvd z)!s@v3geV^zY^?7I%tFKQ+9I>0GAEcmZs1mkhMmltB8Wg6s!7y?e7fs2s!<|Kabn) zo2m$M+pd0LR?y?*29Q=3UL6;`T?CRcRXPibgOEzT(}aFjn~Ou-HHrBOr*waI#Wh4W z2s$^3LNB19zd+8MRA~`RWTSbMw=hF3yFcCC2;&*8ta;?F$3>z-T0iTp@Tb-!5<$>z zA&bzFJbuaiRB(n8BvdB~)s!*VXon6Vd{m4s!P+nyIez(sio6n{CGpoq5;1nU0;LZ` z?-$M84T(nwSEGEjaMAR>m>s_naefL!+*N}FG3rQDNKr;WZ2JQqmw)R^hHNbm?nt0sw> z=D&%J;c2wLL*uRjdsGf!Om!<-u$$z!wsfP3Msrt&=AJC}TBFS$x{fqjY?!0sQhW{F zO}{If!3)rG05$AQwsNcr7~fj^>usJHpikSIR?Z9pt!ssp4MS~zt#4Z^EV`=HWR~dQ 
zTnbBTp@?XwNG&d8kd5Z)TfmXqSOmtkAlAT^Cx@f(Ij5~(z=KfD!jUn)u;mJf9}z!#F+F+6g8<-5LQ^K0{+t+sLf&?5 zbmLn{RSuZk?!vm3?W-??x9F}+8B!MY&+SqTuEVk3eq&_AC=lm~Ia^@vSgVWx;{caE zP#V1&D5mfw_}+t&6|^xqEXY7aP%~KhQcmoA&&_wtX+zWQWdHuH)#tv)i~O~~Y>%)D zwcI%Yat&=9Aa7_cBk)h!aRUo>@5JsK-NUE11PG(?QSl5Cy0N6zMoa`6jeF`iNx)FvR*txn%Ux6YM;c(P?lW3lz%@r|DF_+jJ@F^#jD1 zrp-_1U~>j|On?wNX`uaqGNAMGdt9z-tn@luk2kt(V;w$C;gAm*#&zPt{L_mEj2fLb z0o7Y1@A99d$I2Li&?_8J{Oz5NM_zB-PJOY9L*WFT2sNg3)T7I(leNzxx9u)`Rr%vV z92sz6)#r5f!$KB#@_7;7SaTg4Aeaq$AFkjZrbGr3G>ETb9KGiRfvBAi#%kQt!J_l1 zn!AuNrUxsssC2b)YgcaA4{3WFucS56;5Aa+XP!jL4Ax~0V?9%cfm17Tiw>7d59#DS zcJ(vf`Asu$>SesxoZVkNr&|E+cv^*U_7%lpEW0D^DZTyN&fv zVqq>548A6k=#M42?j^aW&LO}F!_be;IuYhHTeOd<^mVluLhQ2XN(+B! zZC4?(tuSsXB`o;&o012KA~B3Nl;k#Y%X89#*1~`sI(8)ZV?`q4d9Jwced$P(b z;#Wm|djR9Z*q3i6=!zJaOktkalkoj~zQ5j=j}OD{KEIMU*acW(_wcf1^>e*JA~+Td zGRg3T4~Fr@gt8RMjUz!!R%6N5dD)nh3Yb2O)_A^))+B)Om0}Fwn3FYjDrnOJag?>S zooW?amT^Xiis=!`PV!BY0+$o0;9Q2zIpaZ=rGbuhEai;fJQedO_Pbfpi8(L_BN9yR zwMmjl=_%L<2B7KcIs#MV!9@PF5T?cJ1m=Skp>y;l9V@=j<4q&$D(P$o%4S0fLV14 z(W|#sm@2c!MdgmB!>q-vg@~28eiJQiskz556#42cSC@XzN}@kElC$m)i3>VK_0@bd z52v^MJGPw$R@JXBe9oCIc4FmPUo@ARI>Hi!nmK`9|RrF4kQn{e{_y+`d{@Ap^@ zolR^h?9e=>$cLWT;R1RDeQ4w=MY2TufSz|B4A7PRyC%*+4jQB@3>S)6RzPVbH(5N% zim5YWv+2=Gq`p@Ty7z666FP1>k>BFug-8IQ-_@Ie9U^p5*9@EqRL+{4>iWeQGD}%Z zGs6A~LNhPX03|sD*;c)hZy)g_t&7}?py)}#OGxZm7_-8tWE)aSEpy_FY{N&*Ly#| z55^soTX%aq4`|sly-r!Yc>7n&_pH4+;C1$Uj;j6r*YR?c9>K`l`|@B@-XH$(X~q&n zv5*SGD@}$!+NoOPOUyajn?u* zlFp9M4m8WKgo)-k>3n_q`BwwBoc+UqhxWjxm5FIF$zq8MGA;Y%YtEVXxU!BkXS&en z78px?p`A`#6uPX+auk`x253!MJp1IlFp4gAo@Y^lIdiMqTYtqBd+$W>cT-K=<L2Pg#^_56`Ve(uX|ta6)b#k zQAwX|{)&piqnBoz!aS44sa$G%HuH%?-4T$ z<$q~US(WF5)?Sn>xM0s_<+dckz@9lNTE;bt2zFxD7jx=`0)@yat`@1 z>D4^I=Gv5D!PrIs`?Wvw`F#PT$NBNycf$a!9M>SvEJECDy5SXl1T-dxr$UlrRtRVA zj?3noIG8t)53R>*4e~>LRIbtxNfO2HCM3i1R^kc@mv~pFk+0b%1ZKW_u0oX<4+`gt zsQYP%FOZJlPLc+ySS!QQkL8bd1!r1+^Ayyrio_>M-fi=l)i(lX2@S_yB)tI)=+|DH z;N>;GXH6z$Ebsnentgj!ioR4hdh3pGI0yHXVQltxv=?P6qE3XK{AAu+l!H^)Z9lW- 
z_h22npY$Imp6|VPDDXn^ILt$CPvhJd*X#wzJ$w}Ul3$cJ>A>NW?0&l<{-kh05Q3eV zy8U~n!jO^c{gUR?yVB;;VEC^T(;?#ZIL#XFrOqr_CzKjQo(gQmnLXkC@qM&2Mw@m1 z@3M)B`M**#|1ak;2jl;B9<%&oZT^SO<7G`urYQ3B#U5AzCq^dtzbA|d z#2jxBP4&K2z)ICy$_=<7aZl6*M@2Zj$Pi)a+qHgE7w0PX-f`NWnKHQWf>2@A(eb~xDdcE;VDbC7A!$-WN84xBm|bw~0#8pI70 zg^}Z57`u}kL@YG3J}{6r^pOTM?`6n90<0b(j3aC^+x%Kr8oLvHS?lHgIcr-}@W&h+ znCWe!#exoae?b^{2IYb%@vv!#7?jZIAi}1s#O+?E+3t>IgIq7ghP|JicS_ z=lU7@iBD>1Frmf*n8Fn>_0i!%_IU{6B5_J;P5VxC8-;ZnKL||EP3gT&GtZJX&l^YZnh!*LL^2J-r6qCT7He%pAP2`^zo*mw8(tcLH@2A5^dN@6rbfu$f zCLJ3#itp2D^rfeh*B_2fjaxI*`o7PP56kECMbBN%3U@+brgreU6t529+p%m40mVH+ z{5mx&3hoST)@@_jFJInvHEz~^J51((+S2iA*y~cWJjxt7zBzbv@DAaduZ7#LLF5W1 z8uKR$=Z$a2ENP1-l&KRXw+hkIM!C*z)U={s zUO#9})cp;^&pU;3xr>9-Pt9thm@v2FlCu4!rmsA)J=3e1p4GBkDP(5cI_7sfK6=#c z7i4OqGn^^%u&V3WuO@a_<(o$4XzUd*(Wy<%>fY(juuTsU+e?~uAwF{=?px8kIkU$% zs>>9si&lGrtygK1wl$vT;fzeeB)DUYKDmUkN%Q|Q8E5sbuK~~9`btkl@ z9n01bBWnKl#$ebkR3BkN(#%yg>~*;A0Of{s>c_#5W_~e?D8URh<}s=W%HVy)I|Nb> z=Lyu&&lV$&a|rNYOw-qle^d)vn_0iLCaex<^AhjRBHr~eBHXyA&|`th{5C>i`A$}h zccATJh0gSzp9-@=%jBUu0ksI6q+^eMZO2x8Ub@zM)o(@PPv;--<>1h_f;DV8A>s|T zk%*Zm4IPfQRtfS%6h6jKbI`>W#0F_FiZQ_K)`~Kqp2(vKB~AVdb{O?Jh(R2@OxQ6r zkYE0@V5Cy6M{gAkB`T!^y^&A4ic}=zu%;HFSioUo8K+oaOc8~Fs%7tF&PW9!OQDgm z0Lnf}u);AJK2HXVju)#K@ZF>p4TZP`LMaBixy|ps6|sysBuG9?VYHQQxm*_~+9C}j zg&Wg0Tp0$2ruvkH$fAC^U9_7cV+<`8P$xI`=_}KD3SN9!v43KGpMeab z-pMO?_r)J5pG#1y`$00-`Tn~MSLljZWO^llJupV2AoMnqIE3IbCM)ymLp;^TyEL2d0=+{oGrLsr$MT88nXVQcl7?1*SmwLpdy;=^1j(X8Ckkoa>b)6mJAHk z4+9K2x|5{5?!12ko!`rYO@pm|#p015l_N7nFx{b_%JN7iyU8veWRJL)GDQ@RcSlGU zqEGCo;FpV1x~Pp4oB7+Yk))fvAP#re%Uk)D*d8aZ2VNG6n~N(ZyT_e}DLz{Zpe%-Z zdxij5lipbJ3P!%0%D^0*tE@FEeR~X#>q;@AC(}8A$?4a!iKZ=;r70(O6*iALq%G(4 zK2P4)O|HLG*T?$_Qt9`n1d)%Ri#~%g%^F>e%`VJStXeFOCyfNwrHAtbLEN@pUW439 zxmTf}pdh)OjXgHpC)Twc-|c^DJ;`>_9)C^{B}Lu?7OWC5aAg-*UW!D5m2Sf-+A0-} z)JY&W4{$1EtnTbPovSeE>rS`y+K;2z2RUsYI$KRfb7(*h*=)L*PNhTm+5~o_@_dA+ zZ^}(qa}}Z!?lGVQWISSv44YdV(`fbr4KtTs=8nCk#y1n zIT@xta$2O*Ise6>dxgTPG1pP=Xx@x*+ZJ5vrT8?^(6+@Z_!|wA7d6sLVQ`+tYA$MF 
z@~*{jD1xA!!NKmRW_maf1hcOY=#6CX8$FNe6I8H0NrzI@?A4gvJ!_5kBM&y!b!HWd034?{J!9 zEMwrh&pl?Y-o2wh>r`Q#(oWqXG+*+zY@XA(uHHn%&@!vn&YInJJxk=eS#R;UR(&7K zrpL0)W?y?#L!%XVuES53N9ao%USvg`#_F;Hh@6Hv=sE8=?reEMOo|ckJ=|*$##WGX z3GIJLQ~kpvDF*Xb4pQ>y)tZ!!_yUUhwh5c?7>FhE9et~gJc_+7Il5H@3P2By9NN-P zV`@Eg4f^v(Lh^gAqZg39?B4>a?*q`N@5qMxOfvE&I=HmH6r%Q(oz#7p;2>sM zhueu1^4<^nsqYpX9WU~=j5ZfVdYli>faAx_#cTh8Xy8bFrWiwhG1C6<0}EwZ!E1|L z64bb4e+OMu(v7%3O}PWj^??FIu#yCA=yERfl-&$({ee3 z(jMl4eiZgaA_8Ac!Wk4m+-C?6xyUNCK-^8A2t3-mTqlAB8Yu&ctl>a8B+~?S$)Wm# zKfPt3^?R^{YrqP0vBF#xv1DIXqslXne5d=M2ADh$Mhtc6O!U}JrlYJg8>M!&YO5^h z^lV^R$e<9+E?OT-3X1&SWkX)%WqSCwQchHYM7~C}MDT4QKZ`7IK=IWAZVU|w@dyj1 zvN5R1R|g5uP+t8@@MUS_qhY7=GmKnc_YW}q@-HzjbiD>&J?nA(m#v^5HjFmYsVTZA zaD~ij5q$JG{{3QqYFM9ODG}S!ZcN(%hQeartLMpisIwo#)XT~SF^M^ioNuGGW-zw- zi%ys`r53$|6-CP^6JKD3* zmMdF`yOuRCHMqcjVMNsZFEhK4)l!ayCrCot^TE|fK3{teN4VoQS*un!?b=J09~K%# zpS!YN<sE!jOe8MDBM2+&)j1(^I6<+u>T|R_~*EA z>ER(~jir5s*O6tytJ6|D)iy3eUxJeihFZt5ab@eKzhu1&@IJWt`UUjV6GyzT$dW+5 zJx{+dZ&~tSG~sPM=@ET~fExJS5*)c7mG4028iVHuNqyK>KzPX7V||VkfRYc9D-RF< zni}-N8*ZJgDIqVYb`gEFj;^@o&`wgFW}h{xEvW{M#kuF;*Y1kAQ=0(9@%0V<*{FwjhF} zIy5qjR7^JMNzklG*o>$8WUni%Jb!dySogDSiU}ce*l(_BKV;)^<+wb9x*N> zA{M%1MC#AAml~Jn!%s;}YtOEiF+DQOZ7U0Kt;gA%z z^btY~nf$hn$^xg`llAiE-`H{Cuf5AMkBBo)LDaV-4NgI2#}aRSrjkfC=TSDq+1XW# zcyS=pVtd9@xA3yBYjbNyBm`Z!Y^qEpIRWc+9;>L*1Q^p#h3z5E&wb(!86(@y9Vl2vr8eO2oy zJt?EzI{rc2RP?#K{F6;lPi-M>OTouea<0Z{4*0RSaT(h9DxL*z z$gh#FKM_=Ne1Kaq78y*P9A>iPhV10N)nz7m%Y%h zWbaTb(`{6!r!f8mn!m7z=emd{>QA$D~>&LZ;# zvHjo;v3b^0Sh+@_wDV`PMx7@@<qeOV{QtRlH9%o6Ecw35v4O1^m$83K7>VbA^r=tXAoEyAqFWJ>LC4FpcGR32@KXm znD$se+&KyB7Gw#Pe$YHKGCjXpWkuygMXi<>235c}Gcbm{Cqe<9X+kt#N6DhXTAcR1 zJmlPRXx%QBiRvqz*9Sogh#zuMHXdgmIsrJaXPVJv%Vn483Epv^qS&(i97BCv0z)88 zL=WlPc4%==PDx8X%+PI(%0*?R2u(wawApH2{H;E~X5NxXTdbCCF9DOyprlMn#-vY{ z$8jbFRh6n%MWq9~X;?>6p_*_)L$Gj0!dBe+WYL{z+;&DBsh$#5NWTg4) zr$&BT_mwsiR8+$FnS=CGZeL1)4$%Bwt0oM*z3R( zY6>+~-TH{U$CmYwYDd=AIj#N;!wLu7K})z)-h{hIQG}GRVPwyeOF7Aiq`*!|64y@t 
zFMhZTT~EoqG!-X$=1t$VokI3r9K}Si2&SK{Vl1h}Pcy(TJQ_frI|Bfz%nv^iu5=fF$<#zbN>)@h-#HPbx)PLSLAEkvS^>bD@(j=k%CL6i!EW@~1M>)T z0>E%A43Ais;zXi62wR zFutz0gWX8txaR*LE;%{>gN1{EndN_a*Lba|?X<~;_D@{S&0I|>H6aQEbidHrWXED{uNPg4A4vv3$5&`p197s|X)0bpVJ&wcgOgpEB?j&jn+py=5YF^ZAN$=X0 zG9yyoLTA6`3T7EnA53Y>n`!Q$F0pypTz?R$b5_xU`sw7JAjJJ&HZ7DEbukKuD0SW=y{t+?bQePn6l2DKoa&GIYP zl9oP}B71s8C?&K|2>)T(L-@7n#d>HM<(g2i(n2jq#ao3(%M z_RE5VlMo`)hcS*Q9DR4MhTRrN`nlz2gZd(n2^*iV=*>Iq&sh*a7$iQ?-gnQrE?HVC zssm~3bRwfYX8llYMoYrX+nHpY+YtxE!Kdv?$+^uxtjB#beqmg-6!%D3dgvufmc{}Z z)q@MPtj4|gq>Fzp;0DUUuWT`xoJqYw;#}Z&cK^oCo#=yn8MJG3Q8dr! z{+6}KDgQOil0v!pDUy8?g`trWQD+D}`(A18ku;5>s`hJ+h6_()xmQ!JMb--N)*rZa zcjUrr0(%G@g^bpaA26Er)2D{k(B_K0HG2jCE%Z+CMc6ig@6GjYfCd`iX0S1#fitDi zU%oO_l{It3W52z0dEeIFOH=6vzD~%k=8K}W@IY|RIZ!gM>ZQwW1)h}=0xMW`Dp>Gd zG)7R4t5539Lw!L&P5(!?`)7y=LOfVd6Z0uBB0V^Q{$Jfr9FkY%$*41Ue^;-g)BDRM zen82b;yHUCpbtF1RY>X2LWReC%dOl3jqPS4#TdSLMDW~8G0kev?zoR zo7QALyG)-*>>x&1uk>FJr?2fgTuS3<2WaVwEh5;p&G7~(o#Wyq zun)$+Ap4VC;8=pL3^2~kK2_A&7{2~M)^QW<6k-GY8O1H&l>B2>9rKN{(IZ2`xiZGN zl6K}CeM10%b1RY<<{i#zd|JV{K$B@Wn5ZnrUQjl<0Gd6bQNUrYAv-|m0g!wklULA) z?vC|GNxN-WA0ASmIuTX=NE8^Z!`Ak_rJierSNn1w%(YM#bF>^bSa{H9tu5edf$(mC z$=(MoCx5aQ$fE#*fA#yjpW-14}^3!wE9^P80PSz~*<=lL_bY1P+ zHy>0(*_;h_=tYaZs&MnCo@o6_Uh#}&A|(w61X+3Oy3GhUi-E4 zAySN?4Ku72X*^x~OaSMb{kb7>M?j(QwU*0HVfh8EU(;L)=%lIg%f|^xKsF>PRDJy} zxyz?>i>$9F$pc453uP+xhwmxql{(xl@G&tHQwlGRV=(?$Li<5qoO1tzXRO)OBTx>K z#?&=S3SKxLl+f)29)Gklm_j2s0nXAJ0IDEOc@}l7!BK{NWA$}qj&5;a8yI#ujC>-A z1M=gaI%xI{{MG9vtw7b8D!>@ohWet-7e1XmWl3xcPdep;1^#%lozul?bG_lJ`dsfJGl(gyPKLHOL+ijr?C06)e1X?gtEXT^O|ONy7@Fp z2*h`ao2BQG_6N4{6nkRll-ofx0-t1_;ahN)hLTYp5Vn+jevAR?vdx{%d8Z*sqHFyZLl!( zO)!g{6W;M!bdY*-OTCP~OelEg5t?G-c32i7S7D<%io^zt2#no^aDnqpzi+<+7&VaO z#k|KTE}JzilshMFxbuuTd}UxL9>T%12wK{=4>@#ffW0?lVSN3nYf*PXZB4cOFQ0ew zNHwj=BnG}j$5Gd7fE*FiV-raSG%V4)nqLDVd~-FrkDNJ%_wrpk;f8onyRNoz*bbEN z3WKSZDd7jI8jxk0EN-}01n(r=D#Ic!`I~O?;6L+p z8y+J4XxSXlXhF>)SG}L7mmM%7WAx-^Y4mdK+|hn03dwG89V*JRaDjp)zGT+CO#lej 
z6TLIm)`ivEtVXu=7|n1*ekf9WwSc>Eu$|BEOXl6$IIp>@+gu^QG1s&JW>uV+7uGgM zh0CK|xbUwFnAll(P3^}LP{h)Q!V zIL%q~zx&lWjfX(96IiiQSNGu%gu~6}Up+OiFoBHRHyv-vz?rn?P8LueoyTKPi{U_4lOza~9F2`n65ybi5UF_^K* z4u16B`T-9T3r}Yf>SXJR$?f76hysZz=TMh!>8AS;eN+U31t^#IhT zkYIZSe9wKpjjT~VOmUvgG4AHf_WhhFp`;04K`JpW zbaIDUb17{%HFR97V+kZukrRKPK2u@bd<~Qcby=}E_7V1eO)4iQ^=xLCq&3FqN2<^f zr@nkX9?nb%axIPiB$GW|5mW%~t` z59(%Juy7&*55(si`1-gzbN_sQmJsrIeVs6l-4l%Zlb{E)!dCNk7Gm+LkUY&Y&U@n# z6BH-^+PA|D%{vuN3ehBzxB*<(qRlk;{-T0k(uCviQ5u@GiU0+-JK9b+@GrH zS7;r{<*=KU##HP0%wNLmSMB@PkFhQe9CCl0^%zF^Uu0mVka`9QP{0xyW+C(fzmb1M zS>oP&F$sw<4N?&>gDx{trv%eoNYV1G-rtsKWWfE^SrQE@Iyd88;`J1~m>O6A9X102 zV4qYJ!nx`Gc2b982#)zZ29X9ieHFqDoZ$r`V1gkUjk2DvRE)OCuW+jH4SPGu#P{En z3jbV!{%dmf|0Y}DWc@GM0_Q)C`2SG0Q1k!cEg z$p?a!2ZzNoArT-hS6m_iQ%0ld4XN30OH>8ZfYKU?^$<~-5RX;+1BIuRM(=p6O0hp) zr5jXS@T)*PP>S3HKf2PpT2y)k%q!4sRhTF^kVAw-gSjG5qAZ7YKVF!7&eZctvj~Zl zqJu$1b~0(K6PYiy-Vsc}6%IlHjEO8p8e9RyC|HQtOBSXddOA}MNTodo8}9V5AMQX` z8V+;-&gW;24Ax(e!Z~VYmxA#G}Xe?ycHJM@& zs|~|Fph^+{xye1#7{=SygPe|I46z7k(P{VRtXqIjSB#2N8O{dnfw(Z(g59FEBEnr% zovq!iYv~s`tM=cg)gbmwqmV($=B^DrOggByL7^Y)lL(6N)k}|Siz9PTT%wVei>{i8 zd_ozZj6+Q+oTMViMz&4;q9T6o>VVd&w4{b`%ta z^o=7^WDX!ZonuV{XJN%c!DCF<8lqt9G8_GKuuC!|yf+NT%Lv4-qmD1cI#3v-h=zU) z^+QuJWy)ZsJ>!0+rjL)uIQ_!|UC?yV!qWG1M+qW=83{wqeyDg|%aOe_vQ+jHCP!@* z^F=a*agEeUVmrM#^hiQSetqDB%prCiqsXa{<&pq(8GJfJKB!&CtxUGAnTt`05wj90 zg3Y0ZA2N;3jDep7Bc?f-rBVrnn=G$%8vVCeUhUs5SmxSTRE+iks+{u2FqelQyfx#^~#hNW9_U*byl>h5K# z+gdk(pW>J`GyXnLZ`Jmg`>=5v6daC*Pq0>lVyGz7ticrudRH=kr8i45@1xN%H#-oM zi&)&^T>oliC^3Oyp?l7+Q;Q6PP@)1|)R7mdrssh^F{9<7942`+A;IQ28LGHN;z%w{ zHC<11dlC;n*+%I>ikZmE?Yubjpl7%-oO^{R~1=Lr*%pEG-3W<#i-Gj#|JUhF)TE& zkeQ)reS@BIlU(b84FutFv=o3%2r)2GwrvNk!D&@935(BDIBx%*SZi-Hx?GU zgS}dFR=;MunfrSDNAdxTrRUC{3mH>$DuQ&3u7Ww)371L2N6Wc(SsTw)t=LZ^%{!k! 
zMub!j5`f?bzec>&C|r|qozo^h?}?8opWZx%+S$j(J%8k#Zoul$`nPvzB9nE{R?>O% z83)1Gi0nW9FfJ$mukn|i#@wf`h{CuU6Dy5uqYp==g~tos_NV3t;mBZda}mfaqdBnz zl&8b5KO5C^wPV+-vZA73u1CudtnOP=a=0^d$qg2Dr}$1pJavRE2V17ab^_!ou(aH&!hTe;dP|b8#sYb4& zs+L1jju>!$&*YJf8T<^m$+jdlDVcQHjsx1M`- zhk#p*KE^tK&uETNuS4X05K7r?)-Q#YXZ z-E159_Xq8s3oGM;6L+&R)~&k0O-YejG7V?k;EEbJt{#3g0tZZLlig}Ik(+y#V3jYt z!OQ1eSh#6Pxl_s7n%(Xt{>u)=SCEdzC7ZMB4*NAFV=PXN_Y|q0iF>b>LKl24TNeLM zZYR=ejq`}up>aQBX&ZMd@0HJznZby&x45b8Lm-o2Zl(`45HaTUksT2@d! zo5*R1>!>Atq}6U;q7YGen?MM@U}cw2Zsc&;rI@Tmb-GCnPN1us=l7xnfw|`)j{pAh zv&1UVL%Gv2HvM?_Cj%sYwe1_f7x zoPp|MWy`KrrW_7@U!Sh7U9DTq0+Hr`?p%x=dCbx9Q*f*ni-j0gB#vmvTnNN^ymgoU zBP8Mrem%$^k~1^WQiznl6j9qHN4?!?_}Y3Gb(d7ZT}d6zj2S^xW>pDhy=EyvL2ZHt z`p!1l>1Gz?k=^q7s=FFq(x+mCKiXyWMhvSJTM@7zWuSVD<2VN)B@{3GMu|>noxhsP zR(y7>e{!u?Y=+b~9`#KbXLYchoZsy9ltVi~x04l?1v{ zad@C9gw~xp`WXC#<_aF;p80(fpMQ3|7+$GcyENp~8gM`u_t4oFGilC>^hxuDu3e0Q z;fhrCd%B3|TWVNfPa$6mVV4J}Z!cxGU(sH7i%a2R8ks_fv%W8r0BXB26h)Wvcsp|O zaok}&qG9Pj#;*Ow50>EtwVgil@A-k>xJ^MS!v_b~?%w%`WuBCra3z(un#+W1#}~84630w+c>w zpZXWG3^ceYP%bXk=cZ?{Moq1^jr$h;b00VW)%8>6hzt-IAF~)ZC~gDT!9Cvr%LBc1 zUOp%mvc7xF1?&#S-@hwnlXajI;5!fQv|HQ#duyjsD*9V6IsHl2x%7!wj<4G-qI z5O|i6XeFL5@rYK(6nUXv2(m!sBfMgHD*LB26ekq?%12>u#czWWAl1803IXF*O2_k= za&jKQLB3rpdXlYRAtl;}#!AHO9&@?%hIoIk-|>7*i(9cnRhY=i^_g)CK{X+lB)<>y zQeXzy0x^X&+hWfHq0~^?{iki)?PByD%*kS8>J=9(1yOcKW;Z8Lfm~qr&jOw-V-~rv z+6yQWdYV-6!3#%tL&RKnaS|9c@^X5dp@ecgo+r6{IKm(`HG?)wjBM*~LFtgiV9;p> zt4qcC`w{%c)?bHpkI^LK$94YLV~j=9ZQ;Sp^53C+XeNCQ?BvTp&mO)Fy8w^tbTx1# z{nFu^(L}B63&xBR_-H@{DaH~yyGD^cgba}19Sj@5~2l9j0)Cb4UDGQ>1BrefU z@;#;j3yD^09~ecLc^b!miKe_Up#Hu6JM1{`ZF&vDzk<JRHFG7A2^r(fk|6^Yc=)O3iWF8_U z?RBPb`S@Dp%x~$zb@x!Lw+Q5fq7bZW;Y9UtHe8vf_3s2xT~#L`cN%V)2!$=4`tA-^ zyK7~4ViO(={|I;%OAv$9=M8x=Fs2&r*eaB=SBOf$7d&fg)!z4a9%M>rz*KiH!6=9% zbwm!GfM%2|R)-P|)^YxRI%nNK?a%x2g!VA>2E(B_)<-QhkzzZ(_P+WD95y6ME$*jw zr(|MC)XdedhNDRlU0%=TH6+?u6tqNwTVBO^+1n9%+SVxRC zy#fmnXMB%XOR!D&^OdzCY##MuFt)T#Szt!8tKXYGMVWB>u3kBsoK=H=5w2lw=T?MI z`wpW?EddgU=H#CvK%wwccS_iy@KQCrf$eruhb{ 
zO+OI*v&LHg*XT>-xZOK}CR}%{$B{k_^W#<+Y?=XS9E^o8rG^YI;8JwO<2t&sgzQo*$6 z1rkF8n;4=gyO*O2R_9$N88yGBpihYF9N`k17g8aVyV!g2(YCWn~aDoW|_VVE?mr3c_P zYLQ-+etNMU0^g8cwoTSk(rUv7;PoqFO_;z6(fa+$4_TM{Y>*YUjMA;5287EdR&dVc zf%PBV{QYxg$pmLpuNt9LEC-$OOV%9>i~k!#BxqMVtQM8raGowC8!NC-&}J;dX&CQ8 zl{%x+x0i4oeV^8)y?1G=aIP=)w9pD}0!D63Ue&_LI@-WQJp2B{1_TjZV@Yw&ocl#H zWopZ&wnE|P{x?utdLxzfO#Y* z7cA@3sk=3SzOO)osOZbN(X@v^z_)Md{UNQCK({3Xv6cs4wI`DUwO?hZ7Qx*SF%9k> z2H%j?+`+gTqjb2EXnjx%qtI4LGz4?%BK6((=sT;o(Mc!lbRLW#u>^~{;VB$uMWX{n zaiZ~umhm1gH=K^IL$H}dO10g555IF`v=A>QBw?N}lh~>l_%lbwAM80o1~jI?Jt-1L%EN7p51(*`BH44e zUHy|o1Jn12{b8&;<>>tK#qW}iwj4g0%Lsg;QtKkU` zf{!NeF(FHAbW3#rViH+nP8krNF>OQda3=d9gklp8k?49<&n@WL>IBib5M{1uCS*Pq zW>3x5NL_(#l1JkQI__V~d?+BDz*hz)$~81iWDfM1G7|0z@Tzf1v2xymwFh|qr~{SUwB1BPp&d@JQ%dw!%;RIAqnPE* z{o*K*xbX1`xB$>FxG^#oP+UgLa)wb8Gdu)@P$>X6_J>_L9fec}uUm+neo#{W_&yE& z0NI6Qi~cwD566E~|1fi~|BtaiX&QF_us}%Py9R|0DM6aZ;uFdL48F3O#W~98F%WU* z5y0seIFNg=lJO{i{9i;il5IS;Fk|?V$GRf^WhZ_X*RQh?oZk~K*M~DtKABu1o>pk! zv&JWp%Q7P8WF)u2Kx7>cBYx93D`wdfsD5w7lpH^!0T<>6V<~N_a%K@l*A+Es}WkGaH;8+yRTl}EW&?HpK!gR4ktyIG) zV-xLV?2Zkwc1M@Q&JGFbzpa(q7+D%=)H=W2*=71uiG)=sZh>90D7$(aBwM=+wN~UR zDyL~`G`g+Z2IxZ}s1MSVL&(kK0{3UYpy*hMc1>}@7y56~CS{E}p|R{0uxu(0&zqE{ ziu(Ly3PAKjWB5Qq>d?jpQ~bxr6Z2WX(Rh|>^}PnwhYA*X&XC(J)?6nT*5a(MzLj+( zO%HDd1U-2jYY#{CVnBvK4r;x&4x+mf2)z>5{;)j54M6U_eQos-5E6I>A1wR^LQ`P9 zMVj`Mkh(CAAXt*aqJc6%6H0Z<_HNrnLK%DrK_x0$k137}`p5K>uoVqB75JCiBWnlm zNmSth_wI6*nv^JYZ~Hy{S%#luY8OI`HG#=lziwM7*?rPRcKf99sV$w#zbj)9nn1um z818?HtGpL67$A%Emi4z@sJeVf0+8wRWQV0jtPVJ0_<)F!BEmNLT20+&(@b|RzO}7V z{AH?P_=72Cay22ur9#|8CCfdcq3zZsVM6pvI*5vy6{*UqN40Y;6 zvjcv!tKxp!dnOH*x`8FXS!-4yQb#^d(f=R@$bJ~~FQz4+_b45Q=G$J=sx6Y%0pJt? 
z9Nm4x@;8J_2!z=qTg~xQJNPGvaa1CUQ0Ze_RKK+=7W-p@SOZtVO-Z1&Xx{1Y*Ac`j zR{Nt)m_=5Jy`8_oxgW;;v#1bPE2hZn!DT=BGd-@VCvJbjsYMUnh1C-c4xOPUQ zzMg=jOdp4dC9)obkzid~8%&*$-6ug#M8;T=Ee)ee^9j^cYv7%@ZQ(FPANSRUGq7~h zz1`j+^0xP!Nb-0U&cA@PXWoooIK#RcqqF`P^3&0YYPAL{8@9(s+y)zh3>qE)rJYI?WNygfW7l+!glP{xzI%hD|J2;HPVx42=Ui0`wRtiEj z1h3~R>XDkg;poJ#((Jc|Dm7JA20&{woOg>;r*h{z4pbFGq?Z zfs+seUNsw1M}_r8dudKn6i8tl1A$brzXYnv<7QY27Ys?x$t#4n^vmG5jV=#xPF#)C z%8qtj6zSTs59++mByEDT!p10YDKJOW#5!WbQGx%9tqgw!DL4eo-{d$h=otD*wqr6h z*BnFCLLkIzC@RoOoY4NdfR#Z7-_6z9kJkmeq%1>hOA#r6Dwh8(@JEml-euEc!%^rl zI=y1~5af`^XA<8sv>hHdn2>^$1Vb^T`m4Oz>uVhL@t>v4G5Dv_k5)iQ z&7H-!2?CvzmH0REdgTu*I(qYWBvz z&GP|~&$`~S4xHKOq!EDemWS_9z%gJeBVE`X#F znp6(YOzSyLD{ON%81TO+*Op8bxjedI(08`DD#d0Mb5{(jIdpJ4+fm2rQQPtet47AwqKSe8_McgDF%}~AJxpFG(aA0qGvO0_rNNj(A*gHAf&q!zDMnMmi#Z*9w^=?`yCaTH1D8V9-q}-7-TA2!F>q(NfoB<_E=Z>8zMNdXi zuY^m*&a~)=!jWN#0?~L!24`9&g*Cpqk;Eb7ew+o~NIFE9$S;Y6gm_-4r~|}tlX#iq zcw1Bg+9*zxT7aGa-D`u3@}~paYK)oKBd0%f8@Q7ND$c#Xp>rISbcT@1_<{W zAuy6$YhkNmH5ZPId7rf{XzkaAB6SY?)EXDeUCKeSwifEuyG{~+PhPNvd znxwW>QMe^oG;$$CG?V}~!XFG_Cf#Hw6jV75ZdoYaL|x`!j(!Ky+Z2)5 z*#62;UI|Otf(tfdTS(E(z_<9KS{5)NfM+DOUcSnm~}yL-#7q} zLJA5bq6`}l6yrpzsR0boU2XZmNiubqEH2q`M#W{CML2V9vvG_h&FFAo;A-LJC{Yf} zEKOM{%;1}nGHEHiav&dW3aKn$;#`WeM8h>P5Md9AB}zO*SOEYaVUcqJvIJccz$G2< zb7a5;I*oeNZU(jO%bx3x^L0DqS43hdLFt@WjX3=l)UM2uFpijvp5s*F7g-GtXW@}p z9!!O`fWl9A=GNGkUlIYu?wG|NG0slq#5z`p!Rb=37g+Ha{B^$2h?jrtc26vwu!~1= zZso5=iTwU}Nir63MMC%oQZZ%j62_ur0RPXJms;)bqi*Y8B2AA1`Dh!L_cU+)4Tix% z9UQ>;nTYuX^g19Ug@f~veOYrdME;gA+@WPfw`Wg5hh!G{81FPqGCPfp7I_%WjcvOo_Z&xhws$0o11f zrFG9QxbWvyPI{>ik7QrhulQg3x~oq`?*ej+(eKV~qoXHuJ$wcwONUpLSu=~?nP=y8 zRTsXQ>KnGqGgYq1XP)L$Ud(+`mk)DGm3e*)m@6{v_e>5*=n@+Z>dYhbEBN7+=>-L> z6FY4_E+zsxryC``)08Le#^N3N-*98SxE4$2yXj`Fn#WTSI<>NftAoyQN0HhgI8(8b zX^qK+%X|?h2}~SJmHzEv1Nb}_@o%U8rn-MAi>Ei5`MTsX)%0XzI*IPlR!FMG(wzop z32zej!w*=7$gh&XX`%_V`t`23QiMPIbl zY%-stAN@@Wv6$RCn;52(M2W_}sqJ|%&H|vH{C;yN*Sx5?T;W`!Et>jk-QkDcW>!~c z%!pnaTiDoczD=YQVK3y#F1t#F5lToAGf7iJKtT%+yU8CyH;5ZPVa=xFZaB?`5Xs_P 
zK<7AApFvHIt-T~%)DBkeLr4LO`pD$cWA@H5!2!SLZql}Md+-A^pw@O|(-v7C6(m@R z+Pot_FVKiu@jE?HF`&wO%nGiIepmbF7lPekMw8RUo@#RB~4cIqsH}`@??i?SA30(UwX0a#459Y9#d_%;CwHKZ~itue2wi>eh3;KaCDnwLy*pjW6nlrnX#C-q}X$Hp_ePZ(pmK$7*2{q1~2Gv z=`B8#9G(>(9R9$CJEnyZrwQ=Ir_YjHn6h-maQf-q)8A@&DU0Nb6YZ$+mZSq{{Oy9- zlZ;_Jyy{zc8tv&ObcNNpWw;fzdR1_>z&40!e>UTEM9hxobS*GKKT<_3i2QpnUdQ&i zUO4upsw;y(3@5h*a>UVqlk{dE;lrgQUNRcKVV{`C$G9am^)W=|gYsU0`jKiBJvzY~ z%NreKyNo}&76bu+tG+Utb6~E|R!Q}ETyg4HXpSi5YAuXM zuQc0K`tZyfgQMFTu~j+I-cs12Y}mDH@H;%Je)2=FiLYhb?z~v?U!iRs0C`*FXOP<}os38YRa!>G<}tVROl%Vl;0WQ-lSXxZFUvCb(qh`ij6WK!voU<`kaSR?4 zWVGGE_3YVo?P+D5DwIq z9v&l<;L8~!`s@w2o~-#;Yd6qN8yvRHeARzH*Xq4+4&*$wuU#i97;iVZzbveoS@`ky zb8PNG3b*}w5-dV3+nN$=OZe>Gu?N(Wx1NpXkM5%{{BG$LS6%UZ0_62t(DT@T=)W8{ zT%_l_cK5lZ04zageX^Pw5BA3JIOrsM7|;ag8G^MpP2-#qG`e`y zGqHNSH$mWn4&|qPo5He!l)%T;2-U``V*rWjpa!xPtLH3s9(VBXaF8Bti-9z@=UajpmH?&ATf2?~Hd zvbDKqC`;kK1%Q9!K?|)8Vp(27eWs8hyl+`OFr4@-cI})S4DW_HzBiRV?HvF~65R?; zX5jLCr_olGQ^D3VH`lv?Y~pO<^g;4q^KkQm^nRN8{6J(We#s$0Avw7_zN(1c1n43B z&^+`lhY16Cms*}5Ib6R0sI1Ly?vJk#96ud)CW$r0VZ|v&d$b_Hr5LOIf>-!IIhkSC z1{~bPUer#^=xPufFQZLniJ(1ICR2ZpPaicN^PfV+%)EfNr*nWX2;6}ld{Uw}vLXX* zZFjYJNj{SlmJjtPDHB6@mjc{c(&)!mRv-FmN-eP`^E-##~kZUD=|*Wd^3VwCp3V1g+~ieeJ= zPYhuHy?#>|3m^f*UMj0Q54ox%|CNdifIWN63Q_V2%+O6qftY z=L_huv^9fubp><$&V}$@+wXhrh5d=350qBY)EAYCz3JOLsUu?6F-eRHtHBOT3qdvf zyKne6zW;LFc&Na6R3ItOs)*9sXBg@n$(mxbw{-I%~NgmD&nbxF+00irk%EbT+a z43s`Lh>XGRC;N67hA>Qi=k%US`TCFTgYyuAY2pXgsqd`f6ZWoK66YKD4k#n`CyZ7WS4KJRwK~8wmZ1I1XdT@&#!M$&~p6>I{@Y_YJ&9tMdigi`n!I>1$#9 z;4pr#|3b*7xqBJ>@91*#3%0RuY!kaTfySnwVyUl&?#JY->+Q$PUt?(Y-S6PkTH^0MT98)??W}6ZygDgPRmdtR}0=Q)uk7Yd9Svvb|iJSuLro&_N|1 z;1?z2CImraE^NSalV}Nq>oDuPXJE-PInVK}L3!SYZGGi14%}Rif>MxtQDM70#(j5# z$+Y*{@+SB<^2aB``a6U@e2WFgpK{>dm9l+l3sv1V)hK@+=6XB*6Dfz;3ae|^p3Sav z*;g$j{9=TVf=P+JClri>X1P_$+WwpUWgUAeQB8UH3K{`)a6HyjKIC927`1xf4|Zch zv-hga6ZokB6G_wC#jJjqdD(+l5GYd;1P&N34Rc=JvZ`;{ww?qTFkyV@3LMc1?U>i9 zJ+gr_3|{X@DM&wN$T72(lf5Yk{yL_=fK~q)+C_}75KWt3Akro^@MGg51~9y&WKq14 
z+z`JvT!XA5NsJhTTv%8aEcC11;H&g}d=E{keb+jrgsBou2c5?kUgEksKAfN=GuW-V zjnJNS?VTXQ>mRhs=^$EVh`l>|CnutM7{z;jr=@xO@Nw9oqo)<7bY(5KD}|NpsoqGq z#p4%HE~nAE-#+4ilpzTk2%Kc)2N$f|q8{Uw4t0s&4D38ee5Hq5+xvyb-Q^D-J{L{; z?NPWS6$`KzgWP6|>7<2obm!ZIVVfhpr(LPD$lVPn_S?lvde2V;KoLW*O>5hD4=zo} zz<7cP&+*iG)ylXipbS#y<}FH4>9v};7)v{4A!nDk1T>$03>65^+@}ZpeX0CLH$@Jz zaulQ*$g^Pmbm+o!+Xn8fFNj+67I-I<=n?KE`+@r8N}S$R1Z|qdqwdxV<*hAj@}Uq6 zJ~w|t5nPPK1UhCstJZ~u&*c;OTQ*-oOP`anVhoHm#!=4NT|ysR(BNdwWI*miiCVZy zG1vZ9vbU&F4)#}E`A8=qBI~v#?S^{f(Rki01Es7We&%EbH$)j< z#0@3wUmpvt62Dn7%t%b|>1=PRLb;8jk2dOcc9$JJljsm_rw(u#rC6Sx>!|H3AafoB zz)R=wAMfU8NP>gwYbD6>E3F9&Dx_gn<*{=39Ok6lRf*zmnafH4_7zx7gx5bjOG*e< zhg;N*aaSc2)|V#@6>2WFO0+->ScTG;Sku6Ek4g_JvIv{$M6Nq~X0NfpBD~|%QXAiB zlr70(zxbo%kb!@9%QiZ2({Vu|VyE66PNg?kFr@TIk!^`&#qRtgeSSB?srN+%Ez9-! z2P!XZCSGBStyy7K=_dN%edN~DbYBlUaLDRio79v~yFIb@j%(*1;S9(FcI1;H3pG6y zyJJqaN!vG>4R!XMvoNVEqs+^dMLGd6mjh;lD|qWdpznQ@7Z8tTQXhEZzg+7i^SpqX!qg>B7{qFE(K?KE?qixiP8Es*&x(r zh@ONI^`1)l5z}-2@kIb;C*BghIwcT7%NuwlrMn}=Am;whbtNLW;+>kEG6N0w3n{24 zY{p|`M{PSCMFYvHFz00k0C9YLKmh7vOY@d2 z`X5|q==|`%)^w)}y(4(g=5hK$Xdn28;?ksjl$OmJ`Aob6MYfd&x)t%DX^o)Fx3rHI zfj+1FlX03oG6QrZ@66kiNO}6rLZhcm)Pg1UtZiNuVKd3apiNMkw0l|}*aHxqln(CX z1Y;)AC6~|}lLMcUNU%N+azQ^_?mD&eVS9!aj3$8GY&Fnc4r-LutoSp3F^SGd$kA(( z3s;kjA4o@2++tR^P5mo}F{^mm1{!fuX^-5FTz%YD7?2GNXO0VHkBARj4pYyj{J@L3 zCbK-}$CkZY|0ztP`U3K6b8Odl-*jd3TkHZJJg{6I@bVM(TO7qjJyx$gUs%TsMnar| zh$TXwBTwh3>_=>rt#zkJS`BJxZZ~z^}V<) z1mKBjxrbk0ucm+DSW<5^T>oQElGd|pjxz5uiV$qG{Qky{2MQSYd+0cRDkrVsbw`4n znS}Saqc$%Fv`16qoXygT)26hDmf_*jpE}BGUnJ>tS z#;_?xvX&^0&2LDWclmSmhfe+}HaZK#cAf-Mlp9Bv@Nwer9Wv@M)=XLyTHM(^c z&{)9>x@4q_}1m4AjDk;dHm zkjubq#`0}XwnSw`-~{#jv@wzv8EC=DJBdcjXlY}m&*(xnUqBx$Q)?$VsiSvh)b)!p zt;xfhvw7qS1DE}S!Vx!Pe0w{Gqtizguq#iif<*}>1l}_%e2Hy-r>%#RYR=`n3a{q( zUa?p_PGteJrv_DJE_7CP{t4{Zjh0RRhGw%dqUN6VmMMotk`jk^ux^SA-EFr&zA3OQ z;nR`S#9a$GF8C5l7^Xpw7C|u-vA6-fK^S87{r&Wosq9jTwIxVOw?4= zXuPg>zk)ULOuF{Y_(ywlWAS*XVncto3WIlqsMbP`{e?3J!M>lYlRbs>g!;w;Sp$Q* 
zEXsQlqCGbsqD*9Z+WbcY=b!b^flQbU8K+(1s}gwaJxyH&rTJsqZ}vs}n-c3iDNZYp zObQrtaujJ78K|`y@>c@+?MTTOLYcd|Pn9p;4gnaRFf7STM<-;%X@$iyl1?e9!RvAd zO&!mueliq|u<}hAhn411C^@O6#lovWz0LF>Z@+51e1}ax{5oJ4HrW08j}oh?QV1i( zFe>ylPy|+Jg(Rj)msF^Ci<`R6NKRtVm|i3&XQ{Aj2b zpw__zeM^TtT(lXSD3YpoYZSpxsV$=V;hAghtD@@GYg}oLuomQKyb|MstB-=7_pcNl zD_^wT^4@erWAsI8Q;3(y{31Q6;Kw3$YG3KIM3{TKt_nz*-FT6Xp2lOj>H!&ppe=!m$YOmPOR_yDa^r2Y7tvj;ECtSaz+p>S#941&=UOQvA zn<(URJJ~#_VyWFh5_R#xyjJW2Y@thT?ksFWP1>>2kLlrZ|D0m?YTB-a)Isizl?HG_ zw8Yf-pxn)8-%_G$GW(?~QS+9bE0t6KM89?+z=9M_ljUd(K%xX^M1_aKP_qQtb=rB4 z8>8N0^vARYWW73}Be-`8SYkGrDqZ2pHt1Z}vuHqyphw*Csad+`(vtZXl2fUJYv;K$ zm+@F^gPq!Sd#zG|hJcT4L$XrYM@JDMFpv>0G=r#>%Z%Inj&`S6=oDos1bsKE&h~Mv zpK=?MZ#uTI4RTdW-J+qy7WiG3aCy*F!}yh<7u7l7w56%d$sIHcH5SEts<*411YFh|Ka5M6r?05cE-q=W#Rjg ze3s;^MWB5*&X^YY;6DW0VwYKy;u~2w9Y)jt2MmU)Xp{3mrEH!ynXV@mA+Jh*@{s6+ zXJKc6>t65a=hM+|xh&^t6XRc)XLZc`Uc2e7Hx z#Z6^@&}*_xXz91fK=mn9DoKh7fE!nv;)BiYVAj{gVVV^4>p&cg6oD(4v~~67#89g= zX0B!XSR=<^@`-edk%*`bY^E_K6QKS$?hUJ^Apa*wdxFydCA*W7`8Ucgl?Hj zks|-h>cSLUz`}%j%Si%DyQa~|0n__x>n+_l2aJhy$T&H^N zA(u;1ywG5LvXG~GQ1ImLV95AVqdYXZ`ELHPMoYP~@oRfQZv|F9?GSN(;KJug30+BO z=<@xonaAwb%+pK`-~r@0(&|OoJmTJ6la_@LgPolFw{GH2Xt-=oJ1xFk)Dd_rP(?(y zW1E1W>ivS~F0)M=+|eV#H!*#d^>>Z@)XFPZ$8xv?96N2%vIJllB`T29K`iADrn3=& zd=s|vVRJws^*f)xWB#sDz?ie;{an3Ss7!RdHw_yzN(9Dvf&|R;)n*opIFfa)%UUil zcRP7L41gDwl)=%R<|>_n`st6gs3I|T$T6+b_$h9`B_)YrE$f-|i^rhKPW3OWqpisD z<{WTknDVS%1v&9jtzRF7_Kr3T+n$iFQXkWEW6fdg?Y7TH(K(*&wSgOd_DF}?Ua>M~ zbQG_S_T3;Am#HF>vz%ub!0EHp);aNIK0-HdS>C10%%Fukm}X3mX|^2awvUObov*>! 
zevLTQC)m8Yn=!jlIlw6jG)*cY!{K`Vj$Es|3{&<8Pq7M5)B$nP3sT!^2NX&qx@JmQ zTortJhdsxDTAjwJ=E?~3#LjCryWPW*H!`!Y=eOwBFy z;`SmoQ;;LyR#3~XqmZQOy20k5Y0wf?x53>I^v4}r&U)RWlz}U$zE?SE=yX#Lb5yZ- z3+di~<(8Oo6Zvqck!+4A_;yLRi5KxRBfoI3*jW4l;=T^+#!#!j740q$4bVc?-uK&0vwXWK$dIGiOryn_H3jK{DO#62(YZAkaYV_6~)VsRwq-G6LRrixU z`BTcGidmxrnDi5Nc(Mx!Ol~Z_@;PKq6IITmKGU+RMk3qAf)@B}0ZmQDVZp%Z!|PW7 zn;}FN-v@RFzRO$;&!he!gnU(lo~t^CALo6-Ja|Y=w!+chblycXYi8Fjpg+cF?PJv{ z5QL_zCzquQ2dLv6LtoCPKi*cCS8bIQ7vdUrGBM*NNY)^YG-RkPEcMMC!W`gUBKdTe zG86hw>wMFHIi+||w|`x#uQJDaf9C;9-=nh%^my~50BR6O4=Z2T{e4(C?Jw5HRXdOL zrY`(Xc7~XCV4j@}h0&m?#%+Z}AFlo;Gr!`wEkz)K4*2*#h8a1agIUTxgL8+e2pZuA zuO#vedKq@V zj*{L{w@~~A$(A<7iTNg^sE9qx+s^pc<#iwtTZNdZdZo4~n3q_VHkx9h&1Xn1YxN`sBcubp-0Q^AsLM%M)mB1iwd=zqG`{6W{LA9#h9e zv~=3ywgbT=L{oroMF+b{)KRB`W*5RuPiLHkjdRN6bc0460!MW*H!&hXV-W`{A?PV_ zau!-Rb0~5u81uuvChkIhce#iSL(}tT`3(Hir_3py5n_s6Eu?zsk@Fjt_H$_{k!472 zwXNv>W({rs?aBdD_jW#M=xLXBui$V699nkkLLtvdI3>m`snw_z+op=Q34G^)}N^8>ImuhnaEGj zbAY?fWKhG(Br<$Z1qw@FG0=A#K6Q+%FR6u^xlrHM+_+7n2AIKL=VN;#Zr#L*+Yw(y zf-|%xFA)5eO|IBh9gGR2CMKBt0fN|y;R;aZ4UhP8{W+S!4Y~HuOq>aiwr88$l@SHw z+1?uxE_6b|%npqxRU~39qfiEAK00xw=h;GnYQt-3x(^B^6BWpMMogXOW3TSrca+k9 zgR7Rc3W^o%zdK0Y9+p|l2;x0Imioz`mu+&1`j-0vDP2fgYP%%OWcke*uV2wF>w-|g zviyjM_=Z&aOK+?+_rEUs4Gm`LKjLmQ{2ktvUdZPFI@YFLDgLyQ^cF?SA<7MnK3>-k zU^9(Q5u4XV7KarRka9c@b65*asY)01o5lTymobSuKzE7f`_C@!poCiz4wjw)$a68}1}6!EtHZPcRY zfLKobc(O=zi^)brX}3^I5iOx8n7n^5Kp=lvz4ngol?W-xMVJq6(VY_KSw>sIzB>K7 zkQs!onWcHiG$N6~d&XO3I1Sn!*Tp4Hc-Fd+MuYeB#17F&)GzzBDr*w2@2mps6#DsbhB37LwS|?O*D#P z!@hIOE-dY1bNO0+2q_8$9TpjnW#()ImP z&6RJGszo>S$vmJfS3dv+m_>+Comv;jHx`%H4of>%C6nF*AII=|JuDy1E*@6%7iZ?;O(=m}?%oM~v)ZSo*|bVn`oP81l4SUSlJ zlnhFG3n=+;xO-01MD?AVi|^)Q%u9hV?FeY}>6NT11RGt3QQtRA8!cNh38=G@`F>VD z9TtbE?pPRvYgWry9Kc%dGlO%rB!*`njhHR|n$Vr;6Uw3x-tpV7W48MoXrm1yU4Qf$ z+@4d-xs!=!p{s@Zcq9$ku~KE1Kq%I0Xr5n$U7ijy`0&LHhleo?7B<-|`-k#oEBQcsGdP(=#3aZLp4 zT84|Y2}}&^=pYQRc&5rLnq0`$NDYMrtJ|`%nk_nl&c0Y9f%8h*(-@PvljL7yr*I=* zYt9evO-JEjb_#a1z6WugLi>nHKMu#fCR~G#xzoNc!>aPQ8JadSd}283vT_DC?>2XX 
zjC(b`;HSPRv)IU`95@M1kK44w>O~~?USsRMSxxpauB^7t)Pd8AXMLKPG$9NG8%zaX zrQbYZ8SGOC6(D>nhg=hIUX_WMkg{I@7YY4lqKFji3-g4W{5@Nz*r54HR_{0st4bO+ zjfdRi74hONyhtm(Bm{8*LISf_eZp%kG(s>p8KOQV#4E{F8YQ0r}xF7u7ESZ-a`p@egs$3QiwR z0c=n3AGHMEn7+Xp6_FKe(M^BEMI6=6RXD%cqGlAuvA5 z&*EgL)pmQN8spF^oMe<|Fwa$mr){eImsewFvWNI<9qKZ*WJiM#JaH^CXm~I@sAOE^TPUGaR5XKS98OH(;`@HsUAL zcc?xSHUUN_c*#n_@X+vlBv2e&%YTjPT!u%xdEaNoN;?Nja$HR(vRY{-N0|=LNrFYg z(GK?Z2z$)R(|>i5LKN?8+YGoXuk$GTX5S*k1K~$|3ougYTOTfIHIyD>j;P)hR?)6N zh!OZH+CQwgp7h%5)+&i6M3qnX!HWkNd8kni)x>kdipL8)Po8WV@9RqGkBW7p8mvust_UJ?|n{o zccSKcqUPu@z3~BIix>GEkj9%JsHjIh$OV>ZgKwK^RXY@u!%-|00dO^;Pof zk*C6C?2xOOcH13{suA$RcM|k~`Q+i2U3++w>*?xA)}>$4w0vdeRKTH}_H|WHi4`B8 z6Z;`|lk>2-Rq-4-bgTilvChrbwJCelKSV}>I*i^KK^IipmFYl{zJ^D7FcVqqc3ZBS za-dTinBK6L2SKf`bdSvVZY>{m%p?6u3CCuX&0>LxnQSK`!to* zCn4VdlIKew6d{4G$xX>o^QF@xVDAYxf2*MTl>9C_LuJ=(Oo~k!5u!vjA8R) z7V&d6k9kGeyfjKISvQLQ-3l>hwnM+Ys14L=q1tuHfD|;9Bt)Ea{%dMmKzL}N`7} zv1(h^v7qb8Lk?WXj>3rg2GF7QZvZ5bvXmu<1=Mf8HNS*NEuh_T8H3$yVdi`gdHg* z5xqY;#78twSkFp}^61_(jv9Ia3^Qq!`Vv+Vvb$)Ch6*ClgtO<*9(Vm`rCbnk>hQlk zP=H-$s?to9)|>s$PPy%;>p`7HPpp<(gq^~lNusGK&(QUf9Kj4JupB-HfpzTvt_hkz zV!0AKzpIiqx18_w-&ci9n!)c2+@)MH9hWUd57o+~z0p{W9Q(aH1+}|YYAn*WsvFLs z9Y93+N)DU}Wuwx@;|{Z7=!FQ-t-ql4nbW$I6!xQEETRiH5%eZnJFXcfdE!6omZuDG z@ja{(6Nnl2Y{iHb+`7`#$U$b~HNYu+%E%vR=VuWjS8qLbYg_qhvewk#Kdoy47lDG8 zCtaQ*IzbeSkZl;j_BC}*C3J0f)N4Z%38rLyL4EG!r^A3-HxHQ)6X*99H`$T;2sl(j z!54Pdb2?`ZTetxIOH=i7bo@{HdLx548#Z$L#hIx<$KEbSTlo^TJJZX z8t1Oz+_yeZZoCmmG@q!v@bk9i@I7DlY%=S6%)NvMO#?~9MW>WjOTR?+H>Ks%Rnt`J zK(bLpOf3%$I{Bn}$hi8&w8~N%A{?@9&P%BQR@E%=qz) z87S?UpA^by?H}nBq<)Q2%CPGf$~}VrqF1u*2mDqMMtUT&h^%0LKiNmC$qrMFWceoAu6mqqg7!duMTNxx zYoC;KB}f{0Ds@^Eo>$5Fp~`jm>ubn?Oyn9?T&2rf&~EG6XA-xH;w8<>33Dn|ud0+r z{;zBPUxq@6qdb2ZSU39}CX1cc?_!0(Du!LyFHZ9^Ds(B%HA>25626rIy9x-1 zzeDQM8kwt%pz40pjh%%f4P3xa7PEB0QJaRrj5;wYLA7|0HlLQ|9!y&{vU*(_Gh@PJ zgwaXuvQ$#{nDI*NOV1R;VfCP-D!NGd!hF8i*yH)hWMPRU==*Zo$4%LELBRHy&E!IJ z^Rqt0{?>uG`oh;+b25Q&6ves4AFr? 
z1+`qwRxkRjSYNpg>R4<;l{X2-d<&%EcQh1Wa1~LWdD7)meJhOwPR~HtM>n^TLIsmG zXfMwGo$Hk1Ik71@1NdA3-PGIN)#Ea#fqJl8=*R!-Lw$PQ9UusDJzgC1<2}X{7U>Oq z=7f7DB@ofdCV!M-8<6|5-Xj(5q$Ryq`Ryc8_Xl&@W6n(^^Ri$WY?9v%N|UJjwWa8W zj>mAgN%AU7Z%NvNWhI&-`v0_#&3(!N-jk=O42+$fRq-#%QVIelqZPBR*YI?>xm}CU#E0WL2 z<+;{waD^@kt6f|1Q!YK=iKh<_(2!J6ah~05Q zBJW;T(vd|B!sX9$FdS$l@HursyWA=V-LfOCU= zVc3}>`t!_$XQ>Jftx?biEx2|8P3DJ2N-k59yZC^mO$mheC8KIH?Fd0i+V4%DHW=BD zfvFQvbmMS>HRIOv^aP1Y`cSUA?K*3a(#PcOl``|yDF%zN)9rNEJ3)2=%o6nHXw&o>p*ZY&ykYPdj*#T3iH9_bB*cE|;Xqw6bY@=q%eZ z4TR@jnpr^z`1v&A%J4>o0uR~2l8RB~-4(5T_+TFOZk_x!x>bZ(`3?`!Z+=#E-b~n}q?9V~(Qyy#{BB_C^K+?&o^FPUOgjNI`tKFRR|^_`+_qN(a?@Srmcbs*&9of%BW6 zzl{SKd5JI%acUg7FxDJpUS~f*elAAlHoNUi0(-Ut``DMO#1`@)FvO=E&a6tYxpHDP z;;kM}X) z3n{J%(ECJ+mZk1?%m`qtX_f92o7A%)#VpatXP@!i|Q2V^&8s_0hNvh`im8r8{eLetP3a+iliL59nT7Z3^9f`XVhq} zE{VLjO$btywQ(;Y)-BY2*TNyUEbk~|t5DIdh8*n@fmU1wsipB`!zdYbQ++aht|G*u zIG(iDnr$K}YiNH6Gs=H-gC%}zx)*v3G*(z;hLh-du;omDqu*@htQxZ z^7B`AWmqdWg!~c^g242L6p@J46fsB%H1fe85AHD7Uv8%wR=`m#}s^HFy=w1fDTOH?b;`NAt1*t`ud&H*;piBsY;6kVtxCa=Zz{Bkfj zKY#LhUF{9rxk;Q3)`Sj&9`UN+*hiW-z=}k8Ny68p5Wu!!Cc1SQ3|kQ0DDml$u4QWw zt4(`KjTY;CP|;P(&XAM420lE8Ohf!h=s9}V;e!|{X8Vk9h8LeR9qrLbXOA=!zZNR$VxQ$rYF8Dwo$6D zx6ngTC*hQ;BTOGppVNdMt0jw(a~v>Vmv!4G8M*A@O*E<}IWjb@1_Ko1WIRl(u1hlQ zTJfW-GV=nZJ!=)Xf8(32%OZ1F)~QL71e&i6LBR#R0?U~%6b&dqq;ploro4B*K(gi| z_<4-b-w{nP2KKq4QA*bE+I|#7`qX0e1dL}HWbd1p^O{O6pWb-gC+qWH@)W@q^RO7e@1jvt~EC#Z6_*5*;roj928gK&buemd{hy$zIg0?&=4HsbNkFXSY=O&Tmf*NGqrV?s-34--JEc8${4} zBq$G*De=yHQm5D75v$;d)5Kov|IvidvsAG`#}z-fLlE{W%F^)!nrH0w4bkK*eAMpQ zJHdxDf4}%f-dQ$!`Q)Sh808ZI*Q_&rx&adDmF++-F^qRDraw)<0siu&5R?b4aL`h2 zZ1M$D2D#cTZ5}vH2$gD}8h-@r>zBG8dfKb#9~*y+`pzQskU?03C`B+L z#6KqwGs|!N^5X_;oOAw2;iZe438Ryc(DCdg^hgz2C4B(C13v%7Xn-kZBG`7VvWbeYl#_8AfaoMS zafagUKv|rOB*wJ5o=>sct0A;J=rtAG9Oh%`Yh|lohy!n0L%T$My1Rb=<{1S0i=vdC zoSHjsn_$4#4k_h(>!^v3=)*8vE##sGhd`tGR)qaAPh4F?h`G=fczF}tQqN*LO9_2d z05_Fy6-r!`WM0cl+WraWv`6UE(oM&}5VOWo^QP}DYkR+lSTASdnOyRhn^zECsni0e 
z;NU@O7mu7;>905Vr(L+&nU|t2-$O+)hHPH8^`g&>g*tEhcX>X_oQmL|Lgf83f8bru zI~*Giyd?<*FQJ47?7>Bv4o4W`U>|Uc!S5i-yc+rt@J;YjXtV6SF*Oljoxnbjg##bv zBAHkGd2u09#98@Lwrq^c@y^;TYZQVqIxy&En%-){wJw|ch1sW@#(o$~I!iGnIVr)B zX{j5u3Ja7^`vfy?OoQ*+mGMqX;KJIDT(3QOg707U$XEwJeF*@+lk3|>6pTnP$>kjc zR>zyYqlFWB(_a~Ne z`D=$ol#^o{Y%O0iF0!(2YF$__FpS4Xf{vtCYsb3rau$7MPWI93!N?@on00b1W&Mpj z0eMHd>7>!MHB;MuhEMcA3K!clnroB9I&*T#brqdXah2GPKKjewT<8?wP7UTen_1l+ zf#CE~zKbI6tNQ2j&~_u#w}mu4xRx+$FmpH@N$#Xck{l_RuF@?cRI$nr61RU2Z+1vn zgC^%_PiRSwQnV7WY~1McBY7U_@oV3YEt!D269(B!w#Syh*IP()^vf*|&vy!qzH`{KO@z6uVgmfzY_IelT-jy}%$If-1!W8J z&G{sq@wnkwM)y2H-!+X90KMZKA$s2{fM}U7_Tk>)=K*9O>QIJO-`6zq2V}aUu{sk= z%Cz0trXH?3su_tydh+PW_)tlWUTQAxO3|wEZnSc($F$J;EdRYw>=EQG1OH#Z29#hA`jj7cCll8)S3&Kh@!v zRnpS=1=WdnDU?X@CaC1Rx|o!fu5|Oj8hSvF`v1-l3Hk|2$6hvUKO>$bX%Fj~cijib zweX6$!=2#{D0r40)PeQ2TCQlh2-FH_{9b)ZJA%%>$&@~SeQVn?oZ0Jq{}A25>xOx8 zCLHb-T^pj?oJ^#z#Aa2fdpzQuYK`JitfvBmpdQRi!bT_p6U{$PT{F-8VPK3=T&92+ zIBE!qGQr)~L^l>*;RsDXDMopFN!Rt=^K&*sC7H8dMMt-M(r`-%OZ=njR-51je$p3B z#cxR3U=^XzA8&bGZ9m63%1~h-*ssxD=@-x@*@K|I23NaV9m!Bq%hg>LS{p8*IT?=V zVD{r(g)}I*1JxGw4#ldn;PdIYIaoAF3BV;I2pxm{)H5P6pOIuLElGzi z-9O4rwOqIl5;_XSB3Bv=>6THDiCGFliZ+JZX8sC+YOyB~o2~xpvEha|IA}$%tI?VZ z`2wT~u(GbS7>AMBrPWE!Mvv%iF34=(JJQRAMAvn~=}V{T(pj51@qZ;M@*8ht+Vtu! z(L9bT@oD#eO+w_OQL!1b@wZZGPze^5-$PDGCqcAQ{CV8nmB^_-;U=14xSnvMxwS#8 zdsyNdVEkqaAcWtmA~&xv9T1jFfs$gHKBUyjog3oAN9^Wd_}Z-Z35${mJ8?X<8LzGy z*ga^@u)|7U^cInNtPGIDo{w^*W%bzn*MmJ8BFD!4^XgkTmtGM%qB5X~0zY0Pm8N$? z=_&zp3EE5PPG(BaluY!>q=Qc<{^5ssqX)F_T|{ce8xlr= zmvFPrZ(2J>EKwT=peBP;V^M4#x^e`35G_Bm;v6e`Gw$jvIc@t1_NSHOtIw>rSW@kX z1+^H0VGadsU+7|-Jb#W>==MO<$Za)DiQB*+oNQ2nOx%yOQo*WgbP<1=psy5Jm;PvD z!T{Xw*h79y=?t@9B?{|-hYsRM%|N0m3ems~!(Hbj#v|A|H*>Za5QgmqiKtl z_Drzn*XbHWzDw9KO0wQiQMH4`0%q(SD21Vq^X{ILx)i%GrbGyui?sfd7>z8*L3dM# zYkIm%aYUtll?q6MSMy0A@30%y{}3Q&nV~VWMR?|l@k`&9`sXqox6+SF1re!2>8dSe zpP);c`kWcqwQ3k3dHR~30ISS0pu+}qHOw1`vbRY0^otK)Dgi#%xpWM`l&zPT$K{pMtURSK9TG)as`CS?fKyPjJfweI7Y?L{n%1IY7q4~geqL#nr3Z_BkOp8zo-)7%lcuqWP-aI=*x27? 
zuAF?!r|w!bBSZH53LO<0^IUds)is&N#Rlog>@W~^jDAZeVZ}-(vIxGqN_5z-Gj<@B zIP*bNs!5Olf#qgx)%m#6NYokzBSnz9;d;y*<`b~aGPxhp9#3TbxCZ+7bwJxD)eS;% z2mBL82CC*!Q~vlc$yORHlx^I&;p8nvk-QO_H|;7Ro}f-aRr9P_5n@b?~e#>(Yy>M`i=*X50bJ(WIfd2k4Ix?OKr>zIWbO&N|C5bag#2f z^_+z{cmqP5rRNt_iv^T~2ibQY-uqG_f9-Y>meOnzlFwWiEC%*Wjz0Btp(Rb- z3a8U7tWU7IR3rdZ*uV@HmERl|@x*8dkhXVmKjGh&MX*+Qq`7XjzhK;SwPk6N?R{W! zey)=gWo3cTd4bbRMP55Cinb4yZnLWRrUO+d&cgV123Z=5<)-v|SrOKvd~J0!N3E+d zL`GpDjV^VA^zud$tJ&PVV_S}}WpQwrANNr2QE`&Az1~9s0i{Q3d>EcP_)7e?GEoW0 z`K|dQ+Xz63l%%Uj!|_$hq0pwKX=`eS&Xfi}+-n|P9YUgES}2a`RNOUe&ke3^x@794 z3{&chJ*sTjD#i;sh#i!1oznhu)GV})r!?XVdL;wVjpJ~%!IER#PbBO6zz%ZUj}POd zxaCkeA=KI(S!ov7)FUV~Z4)U)43dbxN(Fg$oyq>Up9-;_`Y2{N4ivYK1?BI{4>-Si zsToxokimaXhhXNL9s#d0!ASvb#rD+(gnZJpe(UiR<0BKd;?TtV71a1818nQqbJ@`#}__#Q?( zc^Q<~yS9&P%}d)KCbSgc59lWgl)BXwf3>G2o5V}C`sfs8i5E&PB8h%jNh$m~GuV!x zx4jkrT9ofAuf9Ut6lo;zLE3@D2+Jgt4yXejqbV9_hwMX!Y z3Lt|y!H{F04WHvH{_eRH!Uc+U^+}#5k5E|K?QQrsJXL1$2^>tDqbQn>F3zyuwhi)z z<(<-@=I%@v=goLD=jX4+_~sX@)rtlpt}Tbb1$3^tZa)#rzwcN=NjdE@VpWeH@dpVpfqj(?&aio#Do=2kLvtNS zvmPV*itxh;9fELB$!ZTBSU(Nd&c2kjP{68Ua{x*6#icT(jYD0*1|^iue(GP z{soIEdPSU}FrM1?nC{`T4$j|dbq;QGJ zoQuLmM_o6!Gij6V{3In{Asxe_Yt}A;bcl^j$JiJZ;3f^9Y0~5<8R107cOj?~hhVbX z@q8&QbR(e3J

    UO06nu3oBMG->L~7JM-G*LFW3*FpVuDp|Pj79O6D@baLQ*YmnH zb{Gv~^r3v|Qudf62Ry-yd^G{tr8VN+J27pF_BP8ogF~bQ-CqGnNdaiJFcl#Mw|5;G zV6X_^4ZJu(bfk&_c3v23QN1=hbdb>LPzD` zX;+eaDe!EgD?CqaD6Y*x3p9T^u7ag#C*5b zh;zSpQ8UxdIe9$3#PrYH{#lyzlJYlqk!sa5IYicDxWR`v>@TXl9P#T-IGjOrfgYYD z_*#p1-D=_%A^BrtJ(Sz<@F@R43%}Q1ONLOrQk4`|9ige}2xRq0GrqGpXOt@!TL_*= zfoIeeIw~n1MD6p);!A8hfe%@K6TI~ATy_^V*c&zwcuPnXxX*Jj7)O!I`(Il5XTFJH ze95$Lzb|rm{c0K-Xv7&CyTp1rO_kKu7>>OW8rtzN1i4$bq4^mF!lnG^C00oS@6e7c z;fR*XAm2yQ-s`;Zw~~!J{WWA;Qw&KI4x~w#h0T+htPfs+nb7XJbS|*=z8$9+C%go% zvKQy_;U(4|uo1|=ntAl!4};Y*$;F9uUJ9`pT`whfjG3TLg0a4hIB@pk1G;-cAez{} z+TdiFDl8PSKN#2qizNGJ*$wJR4F9%zgtf0f6x``{R1UBWmfLxkT5rxHbNM7cm7-SP-{z{x?!UC;J!&;_JCwzMTmj zG|Qexd7xp5U6eH&KC!d80Hcc+#_7Q9#`zMjzq;`_o32H&n4O>mSAdoF81M8ltMg+_ zioTYR9ZkqKvH$Tz@t=MTIfzV#CCeGC05xs{5B(qNzW3US-Gwmr>mAr8R}yi4Pj-=L zkK2j|Ak>0nfG0mk!lr)!#tG=1?k5p|HS?VYwbC7aE#6U}>fO8Wyy=SPS2fzWvySB{ z5C+gquez)pz_Jrss75Bg;TWJT3df#7OZ)}}UIzWJ-z{9xRmxHmMg-YKqP?|wAs+F! zz3bU{rw%E4bf2|qfTR#x7L8X6@F&8{MNZ-<8{51RXdnRm@po1SIti=#c`Px*ddbEu zz~W)9`qenLM3gIQm*;o@4dU{wD+3Fk#u8@hO8UUje!ebsq}9B;2vt?WmZF=i zFpO_Q{%o)4V1Yp`-oy%=zRUM{ld^v!#JRu-Q0eg4X>yt(aY|nfeY8B|5*47{th&_v|LDLFKZ_&6C@x+z;t7ZgN>W9l!L3H^9m=9%;!TLC1v`OLeN{??Rl47 z!CS%GO=q(e#8qyle_}dFe8yej)?w9!h1PXOer5(^E1p*Hs@@268|Tn(ni=L`U|-D_ zU1nZs6~M=)2ETEmp7orbmuz2FFEPd_0vuo^*YLxjp!DgQ8AP)3wCQ)}$)3$y_Q22uI97Hrkc*c`5Eyurj6l{#lW_!c}VNo{tF*pF)b*9_EZN@<{_?`uXwl30tZx z)aQwTkA!rrz8^zBCS*H5c1$;C=15NtpHyv+AzmlUj7q*!%>9I%Ldk*Y?p^K2yN9D+ zfZ6f-98U0)5a_UfDR-bAMfh}}N;6Ye^^J}Z;E@O&;u`TR59AVKUUIK`uku_e%onb$?K>j#I=`K zUcYO96u5|gS8knLpJ86eFrx~#6=$XjFHAQog(nzps=#Eb8e2SbVd!vK$uWGv%`M>O zEQ6dlsm=V^4$fU5pq}>>UOLxaBa?fvntYPgI_g`n$b_6g$)(K9>23ze%CMtIS0SzkIL1?np z)OAN119Q09xoiFS4DZ0t7-6sDmy8*SK1v&MDgqiQ0nK0E=R?%b8j*}&Sl=BsPAOpz zQDF_d=cjTP4Kb>;pUV0N$F`o5gRYAvR!J~N<2MHPqiWYZ>`RVNFs54&|B{$eZ# zM>L`0Hwp6eWAj9ym1i18q}aqqKyXg1VceGnHB>xZ;X*URy6L6dGU-T>WI@~ZO%>T z6uzZ5y-X24wq<^&s;^K^JJ3JWuWJ56OO@997fAqQBoN$jWc?8 z)(Shs-QqMopdJ&0x@`dpS!}(}T9*1U72bYANG!Nt)Y46`J#{}O3RqoLEZm+RxN6nf 
zSLHUIZ$a^3KBOQLaz7*fAKi{@l0za^cfKl~bo2=+hL-JQOt#ZZcxX0fXbj@4R^C(b zh142PP=Bq=1#TlEW$vi3O-3YaVaDZ^MtzRcjK)a=BZ2=eSlyCs?P2SEt!i#mJaY(% z$dR(Q5vkLq)0Yriw2`by5!?WLLr=T+_ncnyH}DCGFu1=qG+9tk`HV(=_PRj2dkhX> z%Qby+wznFjv7dl9hpwZqZOxVEIHmRKT6isZ84KfkXFWc}%N=uiWP3uSTBYu9q<;&+?L0i(zWFYBLCN7!({4+iAXw30H|u{Jk0c>vO7cr{e8HU5A2NORD~ z_b5XE3S}Sn^&nhz?Q+$D2goK%tcy&BcYa=$lr~GUHr^G#zeh59a5+CYZtG-Vz-E@a zlQaIQC=bK9t57621@$(EoBo$_wtsnlp0%_Sk=lpmt&lCm2b#L|Z>$u|868)MC{@Fg zoVNO!+DdBM6_<8x&6)z?R49ggR35j5WpGydzF){U8^r9-@2n_Yh*z|*l+sB6{>x`5 zb%ggw8=Ao3I~JYS5RgfmL0Z_V{h6-mygu82Xj3u8#hII0?11IQQcaghhaoHWl1rLo z#{>{N+aT{?EUYl({+>ry(bEYD0db!PAz0r)s(g6SD6crJFw!KXCW}@QObkNI{fI2_ zf_>1rdFTGKAybHdXztKG;g2@@`#IdXng#3lF&<)~sb(F!rh#>)R#8MH>229?*d0jw zss4SzYQgC_fuwC5Y!mPIuL7qc)w_7x;YiwZB$}xJ6We#JZ%HD|A-$CLJS6MQ&=-Q| zO-0>IJ&5z{VmUCY6S0Quhs|4Q-(rqYE;x)KX|pVz(U=TOwa1WfnJTLESZ3JPH#VW? zNo;nmc~JJJn9^4qs811ng;=cCx&WBQ>#?mY2&rsEi<`6jxQ(?{GTBM>tPE%;_tmQJ z=wYD3Iq)6KRg?-edhpQgzbV1y-E=Koj@o2c7Z|Pe``0IaSLueUrGK$Gs+0PKam@-mmD?z(i{&M>aCG2Zx+Z}P6Qc-IrgX>fm*vCF{qv&A3rw| zF<)^nahW5X8EEO=`G5CBAmJ6A=L(J`J;3{Ct{bWT6beB{R$+~Q0GDr^k~OE*QY$UK zHLhE1>nkceDo};H4u{{+xeCO*z-tgllCdNE8DjFqJg9c@@F zdX(H)D*ixJ;^+CuALzvSywhZ_lDpGn;=l(OvwOxq7yp@purRo>W#C}7_fK!fx18Ma z2nC2qL*F66_sfzYF?@+ao!aSe?IlZfLfM1{!!ZyjG=JW|<4C0u*67@~sNhD!YBXk@ z5B|$&l;R114gTRnHvFBZvIGJlX7Rz?NS$9JM5!>H?J7KTKe^OjfpvWRj@G)hVBkjhmxGY^Dz;cyxLHDXB+#FLF&UDT8sHQq96Tf|H@KJ+^5a0*Ihi`hmDTR+^={L>eo%esC zyj%Ar1xkHsaB5#VQz5Fi#15f8#%9vht0aVTCF8&5ocrr0z|~HuE0d z?3+6IRbCD=pr1l#R`$Ovr59(*<73aHzSz}Tm4zdS4>jvP^ zKF_dgdk7GWoSL+R(mNf5ARLmFHljdoZ5&0pOE!#gMwmggS?)pi^hB+126CPswmXYF z*jNOgNO=pIcipaXDh%r}bt{JSiK#of4}+i~x#Q(qI=^kyt!DaC8u8F{adsIAs_>sK zHGm)2ZnTwbDA)w2LD?ILyx_t46Uw>fbYii`#^)WGF{>3Pj}I#dVaw>Lf_p!np78OR zp)}&fIVMXQ=M+Hoz`l#$}zGG{aMqN#m`ICBakh z`1|CrtTvEvv+lq}N!->9x3V!NvBD)f=vuRgiLz&5x=@^dV~lLPO%f{Qs1Vlq_m4#8 zC`cEgNW4d8r^4aR>f9GvD%CU7$fVAQfsMWA zSSu`GdeL~wBzL~D;0N-n3?g0%;1r$@$U6;agLb&@Uw!ectZ#42{trb#^`MvvR`-o2 zwlG851ecOk^lzA7u$4%mRBL(TGS)&q2e6XZ;*LuuTI%&Kz(Z1yB@xTh_rCBddf|-a 
z5oBMUcJJn=c36j7#BUyYQJLE3yngFUTtyz*vtAL(M-eRg=QGX$u7hUMNRoTRlfsk` zybLZ6KNBw5M3uZ4lDlY(Px6cJ=?9ZaNd?4$v5?39uuc>FmOKP_*DGQl5{hf<|DQK8 zSfsqqxF#dQbG4MR_>VF4*q$QOPQ5EUkndfZ@Sxk(6kiXkDL5>Z4y;;`D= z4bdMgzp;AzcPS@vLBZZL0bwS>%BijVZMJkP#0P=(tF8dKEwyl9C^DE}&s`-G1}}Ut z{g^EnE{3zqW65Yk2$ggjn9MO&?=f7%o4h-ws0o->`UE}{KcS_y@CdnWCQSRFU}dW! zM{7_Q#XT6>+}JmMjHO;{e?%J5AtsrhrykGUaA15-`Y#LX&lm3+^?NXrO*6c?axn4u zP(Wh@ zat5jkWTuiNA2D2TYGc&sXubp8Q=i+VyW?=w>8t{JP(s<&Dj^AJ#uHQkcWx?W#Gq)y zwFxgi5;U{e?<%7;<)*(J3H=P-P3S;}=?dSvE1fB5Hd?6G5GkETpCM&MK1%s387YQ* zaD_pG@ml3ZybZH84(@ouOXY;a;pX=9hvT#NDHBV?q=U}S_oLN4ZuUnH!y4&v8ofGB?9;a%j|C8bT^T?Mb7B+uk!=A=Rt(!i6Jm`cN=f9PUMgq2 z3rPX#y!%_+3_W$QdB6{)gtu~htl2)x6@V}B0*K_Iq=ghUo*FD* zk}M!0Tdhq?VsTVDOU|XuR8r03;Xtp=q~5jBjG2$rl#tSK@QT!iEyoB+d5H?Lp6cMK zRHu3+Ov4y6yyn|`_K(ooNH60!|8Zioj&SV{rgt7R=2w{cbeTE2bbLd!XfK9~U{AFL z-Bw}_b%7u}ze%@N)GwfRc7JhiQ0>r@PK>Ph>F%!fySN)r_Ub;L)<}_$9O0DWtVhw< zO~KoSZQ6W-?`~vludWecSaI@tp@NW>ah6)hgqu5vTYai$+QUbkGYQ^l6Lu{Y$3%=58OV?6o6B?e-lQk{{NGOrD`-lhTqO*I zr|nzpy{%8LvJv4LqyE+x#WF0R1QmIcc2S|k60k3kI9~UlXQ!F3w>V& z3xJ$9MP8%-SV(~H#2bb-mj3T@<53dly53I;1QEA4H(9<)V(~W*?ci3<;$?$|mhsE- zYYuwZunRNM*9RZwwh#&xx+G_5(|{rI*AYNm=#wc^)|DbMn`wtBrak~a^|Xl}k~W@` zdCo=l%je@4i5KsR1jaCfgV7g)S{0Q>ZL??v9kH`r9_01CZJ@k+;&lYT({5Z<7A`;H zVuq(BLJ?{Aq8oMdNr$>vY;W^utK3OG#(TeyzF@ZZNgK?0;!_?M6`_mxPkE2c(>GRO zn)RB?ofo64>ZZn07;R`rg{aP0JXDsG)7{(n%XEdyj{P@Mh+M#}LB@-9JfL&OxLGht zZB_1)TWiRvf?j+u({N%{<9yez{dIfl$WnRSx>rW5#2A0TYYHpE7 zJi_kD@{Fe<4QXAAUGd#!i{Eyo^mG-XKP&l4`@V>pRc$!Vf1LNCdd7E#rF!HPKC8S5 zP_y}3hK_C<4Ki*ysWUvPaz;UBWP*mpOUM6F6>y|a4$1LJ{c2V590a|grn~az5HoMg zq@JvWSYw&;or*$i-xAXOjpwY{8JLCB5(?FKE$Xmx9FMzJ+lUY=cN%dz& z%J?>4dFis06fz0&x{v$zVced`jA)e5;m7inQWXqg-6F(|#4;*q|Qk?H4}lUh+%yEf+#%IKp&fB}mMU z*)m&aMN!?fc;@^LXx4|o0n-M_RKdpg(aZWr6QR%%3Ose5sT3*Hq6AsCmo(|z;A3@t z;;fF|=mKhDK>!g z6Pz||q0{9gUTrYV=R>yq3Lq$ar1?WTg)c?An#PpPW%T=0@$>1!dQ!5JGNTBh8 zwg0gK*AYS)NQB#~dfyh+FCii#;o6r<-0;%Z-fi?&nzZ-Pei@rZ#{hb4Wan^UH$I7rK z75+Wq>YCAC;){Vn@u80cw#Z0)i7KX_oS>W%;WItT5wf+>=F-|m!yXw>9LJh^Hu->3 
zqvSegUF2ZR`)gBxpHRf9+(W+>*b4FS5s}H6-s#3Q;*b7Ve+oTwO>M1l$Fddk%O%Px;*3O6U{i#MowX7S%&68)4IiQxK4 zDmbAL+E1lf*$>w(lQPR?SwRcGaz$HZ`v}A-q1dp}ZAMp37tc@aX>1)-E#(FBQr^#K z%1B#|;81Hvt!Jztk2aS9FO^tfXGSV)0ZC-Ahj+i?8wYV>;>JOS5nb7x*36;6^IlR} zI7^?Xq{S75XUW_}`gP`sB;%P$G6Nvjt9#RbG3bde_lCG`Fu~_+77Dko!pttf4>_IL z8_uklR2@2cEkfx(&dKdNgV4*SGX{t6E$jqEKv;g@q}*h(n4Ft~7G6yHW#9!Wz)q|g zA|I=$HTA>4F0bGV%8EUZTOXVXl*3<{Aro)sX!TDVfvwkTs>Tb%Er4_cicF2rMY{R) z1b4RU55YuBcR4X(HyOu17&-;A8ghmZk#7>yO@HIR+*9%S4fE|eVps$B?Qx-eI1fRl z(>R=m(s&T9Ke+HR^Yl$rVB+bR=Oa=-gPkppDKh#CUK99|OT_CkM?pNwZZeNf->!!1 zK-a{QqHnSWuxBn$lq8eFf>#f&LUy2Xk^}Tf;SX`V8c{K7lKgEvfQn4RbY@E~3o)NM zirmcQ*nUn*88Zr$mw**nq>7nG6<8N`Wa`su&BTGRX`#{uukfrmw;E+)etoXP#K1kgatstN!D+K$F)ybY=Hdj1kAfW(w@OcWn_fYbF)wF zBqOJHLy{z6-P<;w?yyB(P_bnip7h)o_;_+6uEth+NgzI&Ynw;8Jk1iX#|BiFK$z6d z4o4EvJIY3%IvaCe8ie-JNqKu*?5o-)DSP9q4_>+5;PJ!8nf87TV_IrscFElS99_I_ z{KA7}@Y$0)CC=t4MdAnU`oc|z2~_&`0scA(lyu>NpV6JD-HKvEC@mm2P5nmD3jX{9 zu_&qZy3HeHvbW(uPmG5lki2s;(+h+dFTya&V}BG2;p;JW4`34N|G7@aL_8rB-|s-U5xLI1|_g+8aS z7DD9W+S)}qB)fKKnm*`;D2OPBMjP+cs`Hu}Sa{c2v9}`KM~faXb*2G!0tq(widSRXTzvglG1-*8Sw?krxa z7UbODhQ!sLsmq<` z3S}*^cBv%^rLJ?3`?b=$e8SWrYUQGCOOI-a=@p`?TTdH_H8e6snl0T~Hq}Qc<)|S(Xh3@R(zU*RNyjcRR{C2*Z4Y|68k#gk-uxM1Ce?={T2Z4)p>_WI z7KAilzZ-EQY~Jc4>w>!Ze}Gu9)Bk^vdwP6EI%c;2MpZE2Gcq$W{a-fL{|m8TWMOBg z`#+ql{{_UN-B=0bVueLY%$1e8sPo_9M4K+?jK5Ln=qwWE5auW}qE zdA(&tuWK{Waol-?1ywq#z?3pbkvZWrCyV1m!eXT2<&ziR9-oIE8ks19ZF;ezbNo}s z*x2Aes30%i1~k28oz0hEa>M5X5ZejEH#~rI1=XTwd@NW5GzP%Wr&_mwsCNS_2h@>i z{=o*k4$x%c3#d%S>PqU!$XbU9-q7*~@Vt)}QD%mxre%fYrq{rZj{f`KTCrvp2EMI{ z{_WARi7gZzusYCDdU6`zgp7_}K8@cB1PC42MjE>sU{)1C2A{THN=Q-%D4&S7u%vQe z35jtRk!9<3_Kl}s{pNOR3hZW6(x` zEKaPBuXey`KP14uz%=GRq~IW8ZfzYu6ZucN$(Ow5Ka$#85L)T^fx3R_tG_jA7-NIe zudzE{omNm}ahcVbotwJ|{s2^(V0(XGY|FV# zZgy^b)qf+%NXy6I_Kc4~>+bD=)cOS}|1>1oIXHndx(I$P^N4HOTL#Q zb@BQ9!Ug~1SS(zq-qLVqsW6x3WxO?ukg77Pyw6)0eEPx)qP34t#*AO zHhv=pVj=9EUL8T#F*w=*gRW@+_Tz>5*qG$-`E^!;(e>TR4)-DuV!`z-tPJ3AUF=;4 
zys0{|$0Z~A-P+Qpjr%J9VgPOQwou}@o{e)jw4*=@MzD53_M&5V9r-b)efgI((qWiRxe2D1Kj{hSa z;|)Qb6ntR$x0ij3=+R#EBBVu**n*rCy#JHT{v&PtN1FMM^!ZOZ`wo2T(|YnHs>@yH z4MUn5hW7es+vr&TWdQ_Z`~|D855&mm%ux56wWNCG9n$(m`0eihu{ZhDw1RW`P36WL z<$dtwtg{bT&j@^lzk@TZ8#}-o`g3Bqe*}!*iyxl7#Vaj6Rjy_ZY6w_)X@3VG8I9oq;4{}lR1?H>*v1kLmn4CHU!Hy->} z*KSl>KM0PG_4}@eowTpg6e!DEb_5hQ6bKKc4=Y3#=Fz^{&*fS3Paq;zTeeKEc%UK2 z_x|t*DDBdFD`wWoNx{kt^c z7C#ys* zPA^3O*6E4fiOHj=p54!`d3)b-$7UPQZ<@TF>Q;Bwj~&41-CH+FAN#KdMY}p%`_H<@ z-G|2yV1H2ePnw#Y+$}FYTjeYsd>3_3FMA--JCOFT$=$6d-}9>GS*>Kw^Xu?kuIPcT z+Tix`IBuDtC4>+DuYQ3M=+4y+Xw@E81`IWr@AoTfp)X0^$9LP?pr+Q=_-1T4QdfD} zT7M`y7Fiq7p8NsCTWam_?rn3g4%i;nRqxxXAp`)32QUqMDry!r$iUSCuaFX`1d6O` zzuY~2?$RFQ%P6F?u{D>+y=Rk*G z(XuA^xa1y9H!?m&=bc*5J%6|>Vdj14==O=?p?^=|4*J0(%a`_K0)>|pCI$ItTPTJ_ z72Cmik}$4(#4nD-9e9yj;sohDO(b&%JyQrGPdJV)ViV7+@O1hUzuF%jDn2d^ery(3 zo(cSJ9TSz}QDuv7)ShFkAs%(UX_Kcop#g&6Kktf9TxUfFZ2!ZdTF*d)>w^bZA_*!mWUwFL~2DtPLm&j>}( zP?sH8}FJKm?EM9QrBFn0xr1?Rt%w{O4T`88scqxKso^-(pCtR!Z_7L`~?skgu}oe zITylqm3*!JtbRc7Lc`&Mf;qL4wQ$V#%U3wsIJyzP@F)|%Bt*i(ULZVCQ!$tgcKYxQ zw8!Ask4Wno+_6Qd)QU2yWhLN+SC^8e;IkMhTY zNij6Zxfnu@lXz+vN}%8d#t4Kwwk;j5qdQ`1lKPL{(}BqpE<3t&hT$G9=~-~w8&bh+ ztG{xnNjW5n`vZ0yRRVh;o4`0ohJT-6y!fn|6bClK9L@+zlg%P{x*~oBfCg+;4>Z_REj)y zS+7R7Rt=L1u3cOK^E_hO3XiD=N04bzDrZyWl30lLu2Bqh;JXop2$%DiK=B3p`l#j8 zxDbRCBiy-XC5KJ-?{@OYUG@f@__|Q{Q$`^N8HZ~0imXFh$6M7n=CeC6iG9iMm82k) z@jD~SqUoHb#(EE()rRPyEL;qv2?(Mhem*gtz!H=?S|1osx>5BGf^mz?vS?^FqRj_2 zKfyW(Q)UN(<%a6w-)mF1oXI&121cHq7(v2F5O)LGbt)+Z5O@=gy{vRhy;gW9(l;9s z?4seOzF!sw<}kzqd%PZ3n3VaVzGA7_60_jt18$>pDS7h|CHUs$es=x+(>b_m9kZIF zTDvQi-@cYp;p*<;*a8_`8rAh9R(YmZZj-m>q;<(?%qE&v+C#)Zbj@`H5{l^BX?sED zEqJ|P$+j1e7;CW7G3=dNiCC}B^5TW=$u5oUCGH&I{hV~htR1<)Bg`O>v?O{4K6Lf^ zuMe^U1-0eG$ZeSC;pQ|a698^LnSv~}|K;b$k%j$qM%^{n+nig#&oUu3^CXHKZlzS~ z=jM71Y+)LHHRMN7@LWG?H;ht-5O$W$z&!~^fO-LS4r#uQ;iabK3RjuGQYSaDQmJtC z>~TF|DYXk%FFmLzBel~MdUMYf`|;_YN8Ee9^hopDjjy 
z)N!LB+RPI{TO>YLp)pD01Rw+@C^&^&tY#g~?;=itr4xq3S86de8(azHHD>ciQ+Yhhc&$B#n8zvwx1+?uoldU|p-}afQuSMCo(fk!0V<1N1(25}nVB?q>WOn2M{oKbr|+ z7>982=c{-EGQZEv0H;sbw@Za_PbAyH$S-;1tGHec^ic=o2)AuuF6~??X{#&pu~8C@ zAJVT~5c&bwRr^Voy*i4h!>p((AfmgPFo3IMUc+tXWGj+;m zQWhH$vo^%I;{r`KNpQY?)FG3)9o}I~)BxB8)}Q~*(ZapR6~m9uI%7r@H|_S2as_J& zN1CxBX7=gi+vJ9>Zt_0bDxjD(lo;LqDcH5N%z$|r=^YaaAYP&Y2`vu zVtYb#9E>>t99}8-NprgI=*dNcdtYK~c@kV5h{C)m{Kg>Z2S*r|R>+W_Hk&X{SRdpY zUJ?BZws|VWrnUG*Lx?ZNghx%xZo`M@ClyVJFIZ zgCL>Sh`~7+o>1)lx!_H4(@NcbkEwA6x{&UEn?z{FJiLddAOhp`l}|b|QG0fewx!)OV-JuqaEXnId;ny?}dWPaECnUP!EisNHOOE-yZ6qg@F>jXVHA0nM+IbxP;JDrXDu6Rq9wGK~Q{Tk8+=}}~7 zZ1mLVkmDhY1>|bza$D6CyEjubugj zHDM06X4UQaJORT{TL?JBa0JUcELiJSGgkXai^sw=eT68Yms3PzQJC{*>S(AfUqZxZ zR8hX;DJjXgHOw39eS5n7kk}GCozTP4%hu$;40m%}JuzzsY&rLL7mxP-wvq3?T4z=n=I?XjIYyXBhhIJr@*j6vIMirz zXFVf5<#~V*VWBL(RB=ig&X`@WnyiX)=Ai*I{^t*6&X>OZE3XREkeI%~IDx+ow{XRr z_P0DJTbdkEQZ3#t?o&41Y!4UhjU(H(f2 z?Lw;h8d0sP_hO@-Hyrk~B%pC5vCa1@i>;@#4mWY`_FV_*3@)}_Tq0V|_DB?@^7(7; z?p_-5&*fBD2xay~_f2Q(gcJl;p=18M*#*ly%K_zCJn>j$e^!)JAX9Rm$9mQJlL$UY zZ_+oM?K{$$TX{h7%nanU}fxrI9W)1r4AQ-2C^?nt8X`_v%>dY5%G6=F_DPvx{@^$8&pJ~hnCzvt-$20U4N`goZ9#O z1DE4QPn@-;B)H*T5kKswma6-Q9~ zqH*daJXX#YEx8{_&kv)aJNOgvi7`bp#xOuYF;)-%&6Vm5=@i)NI}};EkN?n6On&~v(Wkx(*x#WnNwA?k#x4y{e@*>A>Q_ek zfgiIQc^!4o{e`AZOWqj#@Je0pQJYykL^_W9Q;4)HYJ^W8C^x7ms&ya!pWarg?>DQJ zZ)%l>w}0H*7eY*a&=t&GYD8vn?j^-@a*E?|r^w{0$vOyj@ zTUg+cNcNp+w$_CmbX-V`Hd?x>jd>=Jf}Xk<FoBQK#RoaWy;S8pGf3%SB+ zbx%#qHmK4Y3Q@Lvn*M4}PgAxWLazXc{p6kV!JqM2Ja*==^d*t#WV*jNYbpT5b$N7a9@dj?HZq6`V z4bkixrsTQsZfBWB?%$DuVWvQ}w_etj^hHYZ8pz{wx_d|QPq1{SFMq?B)5IutwI%4;dm3ryK=Fh3HBMW z3FtHs(~C{nIU`km9x)lS1lzdi2hlklwgT^tDXq+8t5?_TDwl=laLv%Sd0iu4nSPg? 
z3#&5YZ^UD54TJDuec1=+!Odk${QF4QojKa6;%zBP1^J2;vlY!aRtaVCU>0nLCai+5 zNKmx$o%h00wC8$q^a{~QcyAY;J5d2jK_Sv~1%q^E6aMHk=EL=KmzK{?*x6O?_%1j*Npa6BB+2w z)i`z4Ik!Jrx=RgnK-53$m?)FSyrL?F%v2e9GUSd~N&cc^LHVacHRJnE2ZG=>)pO0Q zsk_K=$~8GoQy%@SVk4MoUTslQQ87p{0*>l1y9|(YRBvmfYq9vR1+j%#gekpCB1agh za0TKK`@2wnI^R<^3!S{lZ+b~dthx9b&1Pq8uyvYjx^xSFm1`*`eCKw=F$iABe7Pv@ z@NsP-N21+p8qm|HWuvn^9aUwx*_HQO=@s^DVehM0v0JIlly}ytLs3XEtnx%9zl{WZ zTcW8=OUzCBD|GYdP7Z4XLgFI3EEmsLmLIF7)UtG3@-pVMRiKzY<--PMEbm9v+`%F; zy`B2H|M?t*x5BR8yweLe-r~Q`!RJ7RQUR3JwUUlu{Mzj*!ldT7Dk}LXu)JPYdRXql z(cr$FNx2@1*{zKZdvS>xAZL}#)e<)xp=y6S(}=$wRwcb%dmKyO8fMkR`_>5Wck<~V znWbC}I1sCg)`s?hzqGFakP1S8O{1S-9&V)lQ|fBwIi&=|cpe->>>o4?ED`+S{q@E74rPU|=2#2?3Av^U5dqVu z#alG~Vf;n%t;asIK&6KGNfsMUjg$5)54ERdfs>Gsf62`hUuuM9en4Z_@PTzCSK=fR z21qjhfK5jzAH#dzwkWdFf5jVVfJ)7;Yfo0U+M(2}EY;G!fijL#j3IGG+$m)Zl`>av zl?zXqzV@W;g2~oJ#>g*r)*R2nqTD4h<0HrDPb<$)+YM%tL$b}x1^#!r@l_4b zwDMZk)JqPp$5z%QvC~kcd(R(HC4(Y-+S42}%W~8vHLoN0y<`R=C~B`{#;9D0-(zf<;ULgHA|5ii5v0QH z^#295aj+Ue3K4eMeM}B6UkR-5E$I=)RcDEPZh7`;!MG-0*f3vmDCM8Q!)fFda42Q4 zgbAp8Csrm|QEShn@H`6ptjAox4e?uN?Bo&&8!2 z0Gk(H7B!f9+GPLpY+FfP_SWudE2v?Zbu>n_cmwWhJ(r=CWOp-XuQHn8=!0Zb)IyRj$A7l&Z$adSM+vZ(0twUxPDX(#-w+qO&n|DyPID>l= zt6YZ+J~K#q`iRs*jvcmap*$rr)sQ=xPcMTvX2DJwH*GJ(wlFax_|7gdW;(s^a7QWL zF~1miNJ4(tNmIIl10_(f6f+6?mQ6Nn?g@mZWnot0~KX_(awHC)_{L;@JX zb`I%}{LM_|Gz2J9YH-ZXwAyGSYXzAF? 
zVfEQs@HI4iek(I@v*W(Fjvk`F@b8J}xuD+B_Y~!P?#w<;i9OP?$*X(OLC|#K(xx4K z$)KzgM|at#$d>$hisB7?NNt%Ju+;IPCg1rHNqF zGdSr<3?(nGw*@6;w$Q0^zaR=Jg+93~&82@miQsjY!h|Qx#iHgv<-r~(Uk-KM{wkYj zkfT$(_p%9r>k1ur>&&czaJ+G*B^KgDVA2jenhBs0i=6F~GUyIXIQJx?v9;)Z<_q%$ zpH>lJt2(1Vd$ZEwIh*~{Y7S2vP{U^N#~c3HH2S~()2Tvt>?Q~#jq)2tR_i_ z=PtW4XXn<$(K;v2gXYM5BR-j|igcTkll%s{s3q?SD&Vj3|Fp z@aPzB^l(lU^2NcU+kkPB;Ml%CoB!ZyK=GcsG=gu1^^FwvKp9Q=IG_5uQh%zG!j3{W ztG`vfc21`Lnu4$v8+xQri7lF0Y9Ex~!9?ihmPCL0os z50D8a^^`JOCoE^S_&}AVLvW+MF$FSpyrr~`2KKfm`qbtESe|mkM|Up zGNu3;EN|{0FK?-gqhCSU5WDG zy(=kPMaTyEO-uE*C{Iwa_ zj_W$0f82f{RXLbgX+4o?=(t~O4|4W_nsMow((&V4Tn&8`+xz6QqN$yVD&>Vpi}CuA zV1Aon-BgmHc*WeOq=Di8#|n5DCVT&+?F*UyT$w4+-7=XQlw z=8{6X0DZf3-zUo>q|=zPbXrW8?Xs=HQ*$!PXzB~XUW_>Tdko%y0<(i2ieB*NR!A3y zca#~hTx51K9ATeO5=*Hz@*C127O!QcV67L#2xr-K@I1u>jo_&W;-*4GOh34? z;fE@wIhc-BFGX@8@w>1Kwc9{*K~aG^x5-{C1jwP8wV{o4<9!_r-4ZWbj)yLaS5rV1 zflCZtp33V3)W<}zxE6+Hay6Ap4(PTlZ^E5FWIguMq2OA{hB-lm$eem11|2AVCjE`_ zLqcZHu7WrRtrYqdcS5FxviOiqf%y5zy-d`kJ_F zjtmU6IoQ1ZAL2O)#9AMC59<}S%UQ&6A?mLg5Zr%~;iS(ASqrEc) zQ(J5HZoSibvd=CHFIq#5Be@#B;;9n9veDTAB*`h?avPp;zdt*78*_?H_zu^!C+I5% zyzDN^8pXFU9}e#+Y~x4KjG~(e_hM+IU}s%MfxgqYNEAbqwdnu=z0-%43&QX2tq5T2e~I}8|4|HGyhlLI%u49;EINu{uwqUe0WOwM8fOsFeOm2(#ml-W?6 z%*F*Ci`GFs0ERIrz8-)B)Q3h{uOMW$G(=mgOxxV-mP42(* z)qC{&Mhby6;p1%aA`FLZWba|L=xM%BlRhK}ea)MiI&?mL*YHQF_rAmO znwiRXI0c)1jO9vS$*jl}p!=pTrrTSC+BNVTc%`}jekw~_&YlVo@oyT3P@wVrku5jA z!k1Yq!KaS0IgNqsaUiKZZh6GH#)P3@1afceSJa#0v;+^;yoG~1xks+SLmlg-r=q=6 z*`BXEOIgX;7*^7NG&U5ikn9VoxG9CX)7n?F_~Y;wma?y(2~^NhgED3q;jY<^I^D zzoS=TUP%)XqK2-F0C;TduS0tWub?ii&JsOrF8rlS-{UvOg zX&FXC{`EF_4*KEZ67|k$#H%T^>D+Z{y|l*)GKqt_O4zmGl!-p88PME%;a#&kkQ`#d z?EBab*#wZ1@A+8J7NYp1YCE&HZk@J|HWqx8FJ{k$rRQ@5Awct;*gRIeEI%&c-B7nz zCkD*s-5xtdC6BH=0CCm7&YAUSAg7?^u7CQDQuJR=dcV&ZWh>7py3Auc2@xx-SwibX z`5LXH#}@T)`T1Gav;YG(lT|){jW`^q@a*4kJ~CXSkLo~$TYKLPUAoJ*%4RC{*C>9* zL8wW0!^ErLIynPI=c4(0^U2iYix)VWi63j|AT+G6e{zlX==r;fkA~* z{d_=;qr6bnIkpsq9cF_996pHe(n{WFVNSFuaa(GzmxyFODe}QGl 
z8%r3CYDWFjB8ka>3?0^I9N2VfAE>dHFjbrZB7FL>3xWVo~n;jBQsJH>hwWG>KyHtB67@Q*r zO80pjyf!E9S3x()f%i20+<-xvM%uhW7+prB_mqd@_Urtdu4Nd}H~4x5H4RRnY^zuF zj*H12x;1}M3N9{cvORH!HhQIW>nrcCTOE$38vYocGZYcjXFM1%0T`qw!wjU7RGs(* zFwcWG*J`7#sC;oeS8yqu zg?XnlnCqyDjG!|i-Gcb|RSt2Ngvl0D-PowP5*>b4K9H?v_|&1^W0NI!} zxBBH&ON#4erO)@LY&+>RlE8K>c+GYabGdFYA*n#zso=$a-DQ#I}8Y5)S#dc_3q@65k zuc^t$lRb+2OS(ulySXGZuz5i4FWp_g=HnxkcKh*dxD&5HZ)j8`)eX7OyE@&rWoHJ% zp&9TOB6=byIL(U@9pxX_#a=1H{53bD1p*JJk?*TEr`y&Mu!$ zg%#%{+@G||(txcR6=gf*$DE)VOEGC+`7f5x*xH>{%G9GxqZ=@fm{}<&AF@!4i`v;i zVLWU1-fL&K0*XL4i)LYCZyaY%QgT{}*EN{}z0Rr)Xf$4^Rg_z#zPtJCnmoVGu1@oO z+~W@SGz$M1fhQ~Wkz!q%Ko>{iip59uD6pWd7GrGzF;52$_jjg%w~kRNL02GLMT3fK4ri+7NnX-ANYwtzYHN)V z$t%Ry1=AHePM+Sx-m_cZ=i_K&lugaydfWP4n3v&`%0<8Q)4=g{>GZ9UJyF#mTLy|d zEoO)8UG#+^K%oeC=|N%&)eR=ZXx;JI>exTp7YtnEbyyG#BpIOi4#&`i`{n*A#M?30 zax7;{alWRf4z{3Gh(1S_*6>Uip~Q#V@;;T^-t>rrUyXI}aFS*$p5W6XFfH%`j60Vs zA8D0wD_>vY%5#o^Vs0UXsYiDNK)uXe*1li_SF6FUf##Vu`wZ>Vwy1qPhD_v4w zEsEZlD&t!{N-h?z)HO4G{cn^zU6$q(>(2|q?Ajvskfk?KSrT|T0q^!cKPt;V5TN&7 z?+%a#t&=0Wntme5Yf5$$9|AeJ@|N=Z-+Q!ViJ=fFEKOChT#YAZ#uv!T`D=f5Z-3OXVM?ej7(qQNBf3U`Zrg$PD1zC2`Um^x*h8hZn2&V>yw zKyuzg$P{fUCese)ig>|BQK%NaK73gM$KD?t6E|Yj7|XR|15+wH9zcJ3aKSgKxm&1c zBze?tfCcA)+ug{L5hvQM=W}$#!Mdvi**dVrea<^Q8yAMoi)ztZTch7731O9XgG`xE z>}Cb@7&&g@&+(q@4%Z}nZBn$7Bf5URyMLMicg`)^4Qd!OR`gnjh81~P&r6EOTb1D9 z@~E}3fBMtMY$ztxSZ`8j+^D3$r<+rKI!~stEMrt52GgLZwiw(TmTFWQkn!3xsQ;=c zah#`xEA zJjfb3Vb1wf=`DCOIG>;)?FFPyWlP14UZe))I*HiofYUoLl#4z=G;Q2ic1bTwqzA4xs@vlNJ)kS0KfMtili1T*x8`F{-*S-(?f^4&2KPE$2?5ltE<=pHy)f)F*I|s)+rH&KuTS~ zYpZE_Z^X?P@^^W!WRXIJTz*BvAbmR;i`xzjTo+)(6L4s7m94j%* zW(OMKQDhdw!=j2Ij#27MrL;@VhgVO8HhL;yJ*ZA}A3Jiz3wn|8d*sZ9{9v8Zaf{0f-!{s1g*!n1x?kDyvJ_P9)0 zHU3NAjHaFs<1)TwBVKex0$FTq!`RHL19e77ALUl{m_M+hcjSFw+Z@fzJ=YYhaCE{i z1~{IbFG9)FvCzC+>qg(BX$}owmE%fYAq~IYRfI5);53x#zh00n-@An$m!_p1Km4Sy`Lh3E4?_&-;i`)Qx9I zOLPlbrSY2t*;5r*tu6jZW_TZBLh_qm1P5F@B`em&Xxozy}aZQ z&)IyV&fVg`Hf&FM6*0h z($s#K8i*tMOPBd$Xn}$jMYC1pGP&B})r@=HD08(Tx~)e}M2vOpyg%CbE`)#WV~850 
z|28Xq8GkT;a67zbtnEEJW44>Fp)7&crIkXbPvx#>%Qn*ab+|@b5#tOl185R2iwPZ< zhg&H(t%ki~_x5IsWT0AG7v3^~Py#|xhIMp@`jVZjfMAt;Yb$FvKj7k(^eA(0c+yt+ z9r=-og7DXpn-)P>!I~@PzdBwRX;~5~Sn3OgqEv;h)sv^kqj859o)`;l`_YWUn-(q-G@E(ejw7no4-BrE+uYr72jHOxQQ(0I$Wf^ASo- zBvszuo+zeGwW6xS5rA?Mj&qL``Q?!hlgNlhE*I~o-%GPqFG;w6C}EtO&k~D z)9Mvv%aX3LBAAdjwv(a$S-05!Mp}!Gh9ILf$#aR@ZN)<2^Lu|UM)-x|-5^RIa%EqW z8SOzft9A+tXRTBmunW>wAjRfwue-0EvZK(<^3W=Dooy^tyM!A3 zAf??EndgL^25GWd9koY_(dk?Nx^`fe3)TMAdjuFOJ4a&pko)Q%So=JJ(8L;I>0RJdEBa>D{ zQH!rfOKvU>lEz{3`vjOa-G;&8j=1bj&z#&EE~M-BE*-1L{4W4bK(W97(d5UCb--v4 z@!KSd4_cSB2#e}%U2_!TZ>)5i>0hl()IC zstwC5)yuHCx)pgV1vB>9KADEnz~_;k_-4>;@dum0>+=NugljVp$%(uBXI}#4#+FOt z@)t~H!K4#8n5;|S&+)9WsF_0l)o(j~K3O^+A<|M)GLnv)&iia+vYlFWwYQoW4+-id zJAI0)5;x57ISuIdBZ>_bJ>1?plPKQ>z3*>Ap|=p#5`N6Ma2oJ)8dzHtNYPM|@=x7G zTuI4;FtX8ssu}p9iHAb;deR&JIJ4paf#OU>TT2A)q_?Q^q@BCX|D45nJkYHUFaJeG z${q1LS*Zkw)hcHh=*~QyAg#dx5i#>Ej)AzvLO8m7)Zsy?xi^fx^ECFz%;4tr*iR8JnR!w_;(aSy9%^WOSkF8w?6BP#6i6wk@W)oLjsIX?Ke}vQLVJwezNVSwn>lT-)vZr+ zun7QaKZjwp%3C$7m&YmNPbxHz4O&iP_q<=n!VxALim zJBCTSE116Q1-A8_nER8$OUV@<>DD=BUI9t?yL?Sc6

    cpOAA(?D{abo{P1|0TD=9mEYK<<=YY^9edPN9MDeL za?%`v_3#I6V8fC7bwx9%;Jq^1_8IfY4to0;pFq@bj}yf1k( z8{|oZ9&icLa-Ar|O|#slrCIwIWtDf)Ps47;WF9LdJ2rOjX>ffot;XEmD6%N-`$csg zgN*SD=eT6e19AwSWJcs13=wtx=GJni%Nw{`=Q?nLc3&anZ@^I_#gcp2)8TIz)#Ppf znXd){gTUHA@`LOg3$#xt960h*;nxq}ffPXsa=8@xKn)H^klV-=M423c8rr`DBs9Q> z&VtaYf18(tU*|z{kMioFQn|2{H<|VK@Mmah)8fE4^%30HJZ%f+hBUp*rtjcAWWiHj zHIq`k%)rnUPUD6m@?1&EJ57Cd4iVEP4!ZcI%4$Z?;W35G%p1`3Dm`K0_OcEPleG)v zTNWZzuIi&|e{lL*$i!C@kFHCogUOCu z?>LEXdfBPXx7uDw9*=auz^ru>V4;T+!Q!p0jxqO7FK;6F-t#PBmMA9_oF2>j-6mYS zi7=oTJf8hpDpLsr#k~Pt1LzgeGVf!>51=?NMcY~II|~+HF}(0q&S zft{OX#Tx*7%lS8_Iojl`CLF}M!vc2dfd3eZTOxL{Wy*46hpfKepwFb zN(7RMC@J+M{0@95Zi>rmWKHXd$@=pu>6?PPd%xuzQ63yX$E5KSpYHK&kd$O=t_+=p z&fTVLzuuSU&p za`Yl8boMmoVaP2M1XwD?@g6t(%_hil~Pi%VP;5;q$*tF6OafM3>{@ z%i`8X{K{~SBj(-G>zY7ktqu%NRX#(sVJkCgud zpA3mc1AYY$TBq)&rBR*1V%^o0uicW>3*+XT&fHo6>p&l7xO3^BEHG}uWg#9Iv(X|E z$RVO9_WvxP`dGuOZ;uk3Z8j(|V5XZ^E=E+=BB)tYanKV2spy(h2)YTF7FD9jS|NXq zO__anB^9A~NKib=J{S;d>OfcpFw=Zg8l#{}(v0ql=|WA9v=FC;*VtC>5(_;F9M})A zef4Tete+)qp`#g>HBpU%U@J5wy!Re+ub2=2HfjBUht9NT=SqUk_cG{@sxMGs&uv)j zzPnk3T^mYdevB}QvmHp|67FPuOx5=kn6GPO^=YE7>?{2;AbN~+lsiA0RY@HIsOQ|m za?^^S^vP!lvUnIv3AO*bcS(a8Dd-A{_fICxDT(lp$?dh3_6k6SjSK&#;PN&5fCB5W5yu7b;66V@GrQ_fK@2=HxoY64^ zo^GcQs1k9{HTNTi{3~g*4+n)|aRjcX!~N*Cl&0XpfoLkE9n0gv-hER}lpUl65Fgb5u!5%%@kDXA`b zpR&Qh%MJ0b=8hgHcD$b)X6+h*)(VBYAO#}5Ch+2#82W&KKtUF@Gv11vIl~+=*dK-^ zColbsQ4?=@;VtmT3$ZirdFX(sDm2+;QsCuv$jsO}+{dP*pnf;Ck}?}Ty)s~t+ZTW} zQ(L?5Rt>aL2L5@g&iUqxE;l$G{cF+HlF~kU42`|UyJT>H?V{T~^K1LIMe%y2Wt&Y_ z`1f1co`v_S%#X@9i&}cv?UXw6oJ*!$N=D-`^H6FDU7!=^sL2}50a>aIwOB%y*mRp2 z)r(TbK^;t}U&}8k$^3{ow0j(~(YhWxVtn){xlAc=gflehwx z3B|?XG@_Gjm5{YQzTrwcL9lQAG0#?JfMd4AaotLdc$a`!a!I- zd_84buen=nFVcQl{0Zk}o^`4Ss>CqxmFKwNwCN zv~BW)AAanrgUd}3$iii@jj5~xEz~9aCz}BKNkG$WvPU4*g)^ zBHZZ-X&XD=#IF%&bjU%3i6P+yzvSSLd93Q3-u7CTo`|i}K8!qa2UUS~P0)9RDmQrpP z1Dr%-O_kKCx{ePDWv8^JAA(ObIXg@hYkRW)gx`Kh&}vlsE2ms42X2&UcPtGT#wm!& z2)e+)Oe$&91oom%%XcOulKsk}ATNDylF7Q;scX9IFMrV6$u=;_){Uu7LzRyJTWb+q 
zeCL{z8jZZ$(Kt#{9tLd1dZ-}PjMXNMd(y0_$JUzD&#DtUL6*bujqQN1FZ3Zj83Nv2 zcq>|*iX@~m7sc%{Rw0$kbfji7UB~3#w=od=gxd^XGkQNgd$#t2T|FS7PcBmS)Da>!@(LJ(x{YRo%k0BSMBh5+Sbr~&hI}UIbUWwH>D_+J69*oV-1RhkA|_v{Zs?JMxS2JO3L;ShP&c5wQl2 z%pF*JpPgKMrC6}_DL^_;0s3JsvddE74QV}ef!$6%L`~F&!Ib{XSeLPrLA}}@B$LvT zs<(z-Z$d5ARu`LrClA7cGyy(2aok2kp^6Yo;J1zs_lS~=gd)7FKPO4#IRCQAi0rN; z>vZF9H&}0#>;SQrs{oyeNls{ zO_!U^WSnldH{1d}GaSXEVRl1l3WjWuT%d_-WS>_goPhmGNl3GW@ybC133Sn~RrnZD zB-UPu9Gb{N@3**WriX?{Hz4qn`vORP+^Eu3TJjhEHt7j0wB$EtqE@S9Fx z4KX5D#yw(QQ|hxGff&5kgMz3F^qwF4bR8^O!00=`llxD+jQ2QkjX|z8BJ*F5OvT4V zkZUsgt5-8doA%B#5N5)x5N_4qY?a0g!brCfjK=}mAaL~0`n$%?=Q~yGUFde3CMG9^ z5=$8mu>#X|C7q|p(rEw5Wb&io)6N;$9O6m0GQa_qw>eW=l?2%q1}-YEqq5l!AmwsQ>nWN5hSXAd7Ed4SU^TBTx3y)-NGP}#9cp(16rK_ZOQYt!1nx(>`0W`u1mdijhm(P0h$6z94AjJ6Ei z2q{Ef(Y@gxUz-*LCn#M$Jz2{XMlGt)>bNt;>}}~TMJa*tDl$8}$E&+hH1uP_jXVx1 z+M2Rv*_OJyo(DjUbyv+`OGT))8xolF@sy{OA? z*qhDU-oQRSPHVDh5QxQjk`2>l-WDZu7DB>E^J$#d3ZtpUUua@HCyV%8QeU=AQ~pa} zE+-#dN4?_yNe{)iFckqwGdZQD5|e+A2Ho^xrBxqPgGHcte`V$F$m~;sqMvqDwt59( zn@SZOBDcd_G$miG56SN!ppY(OhOva#81a3ET%&^|A&(Y3@jtI z?IiQKbn$r(Ic9ExH5O|w*AGJhEh{le6pb_a?bvVwFk&90wCt72q-(n~3|wFjc4&r1 zjpmlJX!+k>*cE^TGeMweVb9zkt>;Et1Ob2h|WU!!4Wif|l)^1hAP9L3MxAdii< z?pQ1McO7(a%W{ke$Bk^2#aR(_94_w@RhVEcnmrYd!@1`D2n7Cu4@MdX+?H|p$ti$d zHE2Z_67-83!?C*{hgCSbPR~9TGm*jpf>q31c?4+GADfReOI!*Kn(QR+#4atCYsKAM zGzN@wqH7`9N~{UjqY{(9rvy*b&7Zo4+VTowY@~$LOV;Z;V?>}(gs6*nzmF0^je{R- zu6=bM43ad%SnC=(B_`{B%0@~?I@VhH-^ zC?{>8-y2B9+dWUB;<7Yd=&vaA>{<=U$K)Au;UXQLv7|!ziOf1CyTbUkT^iIZ?w105 zCM7r+F#am_t5Xnzx`1&|6^sHskCA>=JlgN^p!~CU@ zdl=!7$Oc@alY?sGDqMljbBM&V+iH^d4dqPe{d0S3DA@XMMo9kMRIF48fI66Y-GSaE1rhYX6ZtD?g4YolSVm39AAO-YCc!Nww5c=?_0acpPHyKXVthi)1yXc;nd}T^z8; zahx0GyBX%%>D&j0&nX{@w&=R0yj1s<-xAry3*VHRUy9JuLvw6K3`Ur*n^K*by{c!rpyMoM`rr}_D4*Xk$$gB7va~@ zMTt>7(V2Pg)5YV8MXL9y?kBf=3U(hH4XtyX zbUQpNi@5%C(q_x!A>)L4X(gDl6^muK0SIIW= zih+jcGTGGDKQp!JxxsU`i17;c2b?SendRQp2OICDm4Lgr(9vK1ORLtBy_PoR%cnja zOd1qB;XSF9#2v0C^W$y)nzIH2P9KGn7Wm0&*J*QGl%sa$ 
z@G}h42EJ6QcTsY~NWPA2VtGNKOx5Mqd39F4B*zy!ql;LMrn^u*Rx#^pIIih~=^LaI zxrii?4-D%v+FDEX>?5EzE{%Q*S#DA%9be%;jAo_JS(Fd2*+B0S5;^|1E{zA};6QXN zBqXJcvz?D&?26wDCkeV`A*_Y%h%cM#5pof(R~&+JXm$;RlK{)gmN!#}QfW6UdO}(? zJ!HPyD~n|F35T5aJ0vLY6Rb$#Va9}9sX^9%-+4-;gSGlj!1Z{h4y&i#C+Y*~xjAeQ zvW2)=%d^L-ETEwp&|EzL;d5w)%`WmHsL`rz!PZQ57w@CESDNiNMKPWyt;z%?9+Zvo zWuJ7pkQ1Z*CGKQuTc=vCzt%_Vra1`(soV`5VoUB5T{$1xyepyI33cp=w^msYX&3|} zY^$RRA;b&!fU)^Am(wUhkS^>lHU)CcPrSUlGGA$J>#~-!oVngu=Ret04kJ$qhU@o` z22GQv@$*|_`lbrN$~zPYe-sLHNDXSJj+bSi*Pht@@2iiKoswtSWp;M-^1I?qA4-vP zh5i)iM8h0<+C8WzJc{`1bJycJPF&U+OI`x41jKE_q6!Ex0(NAdA)s3*go-N$7Yg-M z98yoTHh-#w0_SE}lo9k3X5-QTYUp5Z#8ED@g@+;BBFxs_R~{MCe4F}D zBPwG@k~P--a8d{3#=sNe(17l}f3a-&wB~|oCZ7y+r zV%PuRUlY>H-;qD6&5OnzzJ39wvPZ@<4`Tj&UZM5zRVhCY_v-Ex$(~skdiVIadU0<7 z1QDm~X!C675?kK+M{i&!s!&DoY^KZS>rV4Ncj9!q6Y*P@AMIw$RV;ky<|a#Z&3uBq z210S7evjm~3yyFdduwLK6@rPA3b5*FM6U52a%X;#owZcnzSkSx$udX$&Qf$9cFpqz z`c%J3jNO5>58ji~56+n{`nug_Yet8y1+5oF_OtDGCY$AC#^K8{)ehq#uAZ!03{=*CAi$v+igO5iKZh9)09+;+LZRCl2q zKv8qGIOAXS(yKh5(iDbShg$pv4pIUe{Qkmw=Mas5ZH}r4an0ON?|uAA)kI-2>>QN!fr}EaDLwie3I}|1m?y0R|K^JJpB))XBiLk6$vGQ~>YmV(d5v8q`1ibs4@Rfg`=F(^ z_*TIjYbjoB?MabTBCLs8ktfM_Mg4zvtC0bgM9J_uB zP8DGc;Ii0VhG$7%YQ zc47`q->VKz0gFyHSAAUKvj_mhv=aj6JrmzdSZVGEhU|9T+MGwqgpvEdF9zHUj% zgcB)pi_;JgBBg><9rXqoqoz5Ag1K3B#`O4!?Lwc8*Eu61w$_+HFlZAYt(qwTs=mNC z3G&-6C+#x?Ql_ow29j##>IOFNpR=sfc~lPM!uNU1c=5ax1`fft4;7dvRB~o#WqDB6?0}c0h zMkWjBA0M3v{eHDk$@rb)h$-Lh;jLi7;$IztSHg2C`RGUyt?AwM)2R}?=C}NM9N=WG@5|HbiX#MQH00bqB@&tb4GwVZ z-{^LN)o&EK?!%Bysg4R@e^(j`-Enn;=006kQT0EV)OE;23s&R=pJHib*4?nX8tCP(Gubj^rrKG(6huKC_gd z6h-gQlc}BhSiGp1%65(OHl-hMl#)ge7%G;r4=S_DD>_RD7JV7nRtUcn#@vB%R{YDs zM%0-hj5+|nTy#4ut9+K!(LWR0=ChD5m;x5$of{n7yLPE51drSi@r*|}qR*=6=>VU) z@v72_fOf}faFy;wBbJl8D%vecn8?GMiEZjIWGpFV^C>ElX@;ao+@6rj%vV%tD3sMy z(x@NAG~xGD5mF68%SHR25^8)~moMU^Mgk0NRiA8q;SSds2qSW(*`;5etX1Bq?~LiN zwPvdmLS$>g4t1$Pg0yCsMpd{HJ%C+gx@-G$Uim||z%qo#&L~vwUrX%_%lXY?wXa47 zpre&P0q)~C6WCcy$T>4aq=1^Cc>J@;zVMbERlLA7d=}9%Eijdb5eYB)f5#G50L`v( 
zns{rDwXC{!)aCo#N6JaE!kZ8fd{6I7MDA>s%~#-|HPmgd5tEzL7`-{c&GZa&i+VK+ z8WIhjEd{ViU8Fu#`RK}vgke_6Nt@!@nwsMapwss&OYiIJB@iOw#g5Eq#R6RtlSFnn zY8E+ZQd6)a1&o?YQsGUmDp>7_e$lm~rDORJ3S^-)S629X4im-;M{bX}pDm>)h{V z1UDji7gfNQau{etsmt<8G%5lOkNs$7X+LVCHBVMsaHr@;vz%V$1H-QhD@^>_hjo=f z13tJrKCnfRc0SYN?@vP?bS?Qb?Q0aRi1PMLGGo{6#AGfGQ+OAK8o+g%k_J*NxR>aaj@7n>j&u|@^`c*kc!kT6b|XVgwXe#LYbZN* zD(ZZ~kDV6omEcxp$A>!2wJZ=CLRiv4URO4eH;Ewe58SJGo-K%etZruGCYI?_;wWD% zUmPT+Y+et4`+SMH&Cqt8k4-A-j1=i1& zEGN8bakjg)3LPJIFfg_#$Qzc|h`m{DIN~OZrO7@8R&NWsfTNq@W?*Y6ehGK$5Vtp9 zW@##@xQq_x;SHYr2HsK6UmQym55?5L-1w~(+mA(<8 zIY~a4xK?U9ReCnFIOEOgJ*Z}Ue?0C$1LL?M$ZO*|)eV<`4U(F*%eZ<_J2%A9 znN@8qDwjK0Wx?l<2p3oujvF3+pxF1}L(`ZN02(Bu3*356E@2kqcqzKE#D`j{K+ivv zTQceRxC{sHo>EfCuOlohC5B)^wbpZ5T(f~Fzs>U|_lS@Wc&|Wo>?IM+T zl2xJyp%#6;-EiJ;TJ49g@+UKeck6C;goR{V>w`4y)00Ws{}hmILPGs}=>Q7ai-VV8 zyClG^Jwi@hW}DgYVP__(ftU8|k{-(pgQM8MqB3JS2?qlgFjbcMWJYt@(AgX~TXILP zE~=Ly{M=$&odl*QR|Zz z-vTAx6lQ>;C4KyFB)|oB<~SP=&Rll-g}NkOM*&~INe{RH7*;cO+u5GBD#Jf5MTaLj zr!W;oF!@;U>J_3)S)D;2gM@KZ0#5{3W`axDA*3-{<$@_iN6c&z18V3@TeLY8&Go&< z>-zSlVc2mG+VdrdofB-Ne9yAg+_h5TS@&&tCy^DLEF)D#doay6ijRQ%j=$I@|HKGV z4>SQQF}{mg3h(=Mjo9~6Y@c3Cz+LrlP}vd{1?x~M4k_K4Mi`dpzp&uyztZ7eV8`)A z7(bcYp3yQfvYucvgo-+U!%Xr52Y4du(yZZcjqk)_VJO~28~UH~C9U+_@v4(`zpCrC=@xQdK> z=tbZ}P4MSY!qA`Kdgi;_VZ&Ez_1jdP{i&;ZoeXR2st@auF@rVBZn)hVM&qe%gMvEh zOHqH^C!i`rz=KK=@T?aB=PE_|;mcHL=~R0w041fB+!W^*_{Tf`RcK&LmJ>Q+E!PPu zSpL{UQQ~?Qo91`jsjzy0o0B;kk}ycXvu`cI^y3->iPsD{fin2RAU>Yo<~PW6=-U>w z^#ui_JRZ|>0PMB#J8SARzsum}DAIxPRL0$NF^)tGEvbE=E0pTTs9J)LRZqzvz2!Yt zUy~UNxegoDsP{*a<>fgeWXq_X!>;TwyKkubnadtfCjm4Rn-V??`|_ zfe`^T8rGtSg+@Yx=8dqI_IpWlNx>3piL1Kh1u97h{1(Ljamv-F`%dwlPEoaXNn6#k z2s&u31`h*HRZGvIagS}CVXcZ;XAd8^Lk*UQmKqmwI_#o@rz<3Ge>^7EqqxUh zaseIYuH8o~VB9#*bKK5CSnG2`9@`Ic!78Jwf3-jA9#vLhE zZU^G0-N*AByg)ti>-DYt?NVK6F(J#dqY!SlN+0N^NW0io%O3cuW=f&vC+O%vUTmLTL&8n=tKxiIAnqo>U7 zR}(~Cr7+{CXMhzH#TZi|by{uTL8m7Y#gKFm8P4lmP_y~5)0LgH2d0u9)Y{kL-bv%+ zl|$nj_0&W8Wd0VKXuF&y0y}k+E1v!?8@BeL5Yk2U%PjTae7e3)i`0lxK~$+6Jp3qV 
zdRDX~)*^YMI{tX6q?`0`6A_y{R0G2UzK`+Vxcj|Mm%ro0EPKipzi_#C3)9jSSyA4+ zNyr4!=qat#%Ts%274u~E7ACvfYZ*e9{Qgk0EU0Or6Nq=OK=hK! zeLnYUIWjjPY&EH4{Q`Ov(OY}HZTXv(eiww>@7q{2Z4Z(DrSF#d0X-OI1jU-%B}7G+9v|YSV#+zJ%j6(N=)AQwN=NGQ+lKjPqtsqW};lc)N2YwN(D%xWb!jTaAf* zp6{G=xfI&pO|Ebiai+-HDb1lIVe!V)O)LxRTH$7t|5n^Wn5oMcg)o)JWf(ZoIBPkv zGPiQd%O#15@{hrNR4zEStw2X*^dVW-;5cohat;gf=0-kdd^qo(Ilt&wMaOG~eh)1s zr)trxxvXGGTAFv+Z2d3<@}2fRXviD`6$-3O9Fdk{X-LWjA7c7&UTZlIBzl7rf$T7h z$1&Q>Y+3Dk^mK=&urRgjvtQ|W=w^52L;48lTG>D+blwhw4QE!E*41@yN(wtN8rX9M z8sHZ7z=w^;TDaa%M>!G{yI4F>3u%x9eYcWT4t1&@w_M_n-Zyn-#5;521bT4i(qCTW zW@T2#My--a9lQWizZfTO25h`_@x&6R{6uYHOW@5~)KL{lF{3Hv8n=)#`KiJ*M)Ymu z6CL`&YS*T!=1dRi65<5Yh{q8XM|<(=IU z8s;4*ESJDcerhJB{JxeEb1hoEVDG9tjO=3iSm}!=(B7ggM;D!cOYZ#rnE0e>H`WMjptDk1nW6+5rXYFj)W#UnmI(CSy}4>;?E!B~U}pr9q)h%-5B~x; z)7nbBMN7pCzN?4CTpfpUf0w{}E~3=ak;F0%T_>8HsSXsD~NULYn zhY^8X@95`gQZv;!y!m-V?Js+ z@oW=ojh)$yukh)!&7#v_V-%+v;@bKOA)2J4h4@W5x6c7YVWj2b;Vb<2(-{HZSFj$7 zWzN6pbIj)iEWD!4nA9>CdQnvx!coR+tylLszDqlwd_(fx4ThcC$_@Kgi)h?ItH&zQ%);E}Re z_MHc2YG_*P)0kfkDz|5PhpzfmHG+M*#9vBeX5f|QcHOe1-bQ+)EO2e3w0EcDYHp*m ze2K9(v?L&FgLIM2#mdIkG-0Ry6xV=+9!(ryNuW4c z^y^d|&M(!RD`xvdtcMn5oz1+=V9GG3Zqdhw6xBUm`SSE)n_loq*c*Tx)tU$cGdGgr zC%nw!izt=G<9{HIllt2M3+{9o0o==mksbKyUFyzsB!9O^jI<%l-{u8Xh^Z^BdtFpd z_nU$0<#08f5eTt1&aBMetPGJcTvJ@Jyc60BT1fB zsDyWc_GV?1fXWg=iJDLX+Zk7zLzV=gxz$Xj`i`sVOg6OAB;vcQg5nq$k4pD3z4gRy5FtIjXg z(Wf3p`r*VAvtr`$jJcZw`*h!846ZxgSUKfZoN{z%k#OaX ztC@xU(y=t;BNc9+?LvPT4~>^!FE|+@wa^A*!WJ6zjmwwzHLH^SwOK z&=(~6HtHyX|7_MpPCGe^b}-EVZ|bW+H?jzc!=UkYAJRJm5i9Y)4nxnt3W_$;69J?=#5T3b;3)}f7Qm7DTe65`!c8~3ftC;? 
zmjTwhXS6Y@Z1}1li}zQeN@o46!e$1;1cpUHjqmsFYyCmlo}AS;d^ddZ4 z@u7Iv8~Y~wxOMB7QG5T+2z^H3wOWbBHL^`5#7MxB+RyNBdaW~S<;W67yGMj?M;@fY z=TFL{kF0D*RwDR)>mQedPPfB#ONAxn^3{@>09*;$_pX=b)32hSUR}UZm~B)*)q%+6 z4=h2N-2AeKOJ(%$M0KJ{wQlY@)FB;m2{HU|Kii=WMk;!4J6f{UtR90bC+xLj5Xi_J zU><6It}TE6+Nb$MR2CUkwf5}>8S$t+ZBMs4MI*{b>V-I;@SxgJE}MS>#T7y{8z z_B%-dT_1{c_VY^LkSSUYWdgOSzH449zI5&UHfeQrWvggGWxo}zNEiM>5kA<8PRKRe z{73YGRi#n!2ZA>Q>8>)?w zh}}+6El}ns`%23|k{FHbD`vDA)#fjuz`&k)&3kNhE6@in2HnDFikO?Y%~y;BR>u(- znY;$_T?8r|V9~VO@Z~#WL-WT{a5Dr)r`^htA)O& zkxP>dWXNgOQm#LJ4txzkH@On>>@4)F zbnL%$9fzNT$l0;Ec_teG#wO1F>}EBKIXJ#z=bCuZl8^3hIO=yy1z=!ZdIu#F$B%Kj zgOOj95wYw`vt2H<=UeysD^|slqGs94Tq67^KHNXm>jS5{0~N@-awkTC>h?MIYn->& zN09Xuht%{5?=exFV1l}M-f<=b%UuUj-8anrsR{v72|j~Q!SyJvX~;!RT@9dyL2g(6 z4;AmLN`Dd9U0g#sWf(aQ^wB#l&{@ACpV_>*cYfF*b8#jw<8d9~xyt=zi&?E=jM{k& zI;Mnxyx}1&BhWSfeW>Ih_0kn7y%|kR^5HFk5;!9}My6cQt*m%VEPwmJ3YikNJt3;t zRikhy%aW%@q1UOu+q2*tj1ly1!)pDHATTrLVU`(yK{E{D6DuAr-{&?fZcl;NwW-7< z4D78UF_Kl5Nw9D!^~P)*^cd`B5dbHCH(*%IVK&k=CQ4hotD-gw)ioe*^x%32dWazl zdX+bwEq+ri(NzW;3>1WbO%S&N3N1G21Nen6G&Blwe8dqH?hRsDu4YaU@nh<~dlUY3 z)GH9G7#@|COk6e} zBt4pVgp3gK5n)C3m=(J%x7W=P5{DOK#GUH#hX5j;N<}3S;I zImC$?>>mXi8fFCkdU!RfkTYxiph9aG^C&PMc>c%Mwq%)>sTV)}b(G zfdCX4_$`L$0#&Gvu#i(s(z+tiN_S*XVKgDpDwcyO(R#?JK%(OP{@IMYon2+)sSaPW zuxh)4KkACN62xFF1*?0GT7OTIIOIQf_L2*B6GFCw1xp}$wY>Hm%HS)aIg9c2{lZBe zYgabxsvg~rBb8?qn0>YWY0dlyTkJA`Z}Rp1sz40(^Xq0!d=RBsrj5<4f*|HQ8PhCe$s2q98M2wArUw$(y~3L3*K++rI29* zL05<{#Mp9NN7r{;)qF`!*$3g~+^%w+8K(K|F4kP;%<$58%YfveE= zO~IStHh{>{F$sRqa?N00US#~S9sT#qJ7tE&v*PFEer7(2cH$?p>O5dyX5@+~%V@4L$A!QN?~pnOQ=#J;W1_m4YY%}zaB^o({mtLA_r2+fRnb76l8 zfeqfE6mFhh4)m85z0IrC@1VV9!b^G`J0cAaif3z*iipr?^XCdY;OYrO*k`prl?Wm# zCc=&hUbKd#H?QbbBp)5-lhE$wi%IlL=QT%7 zifVGNlTXnLD-O_DmQ2%_EZ@>`vEkt65PA=c?)T8;aMo;s>PPJplPBL54Rnyk)R>{z zvSuo^T5!Hny|9p+bf8%3`i9vrQK0%!1V~ZhCZG7Q^v;Co*x^3_4HYgz+Z-O%2}6uS z{dV1M)h9I6N2T9${6^`ia6v@TZD0U-Cy5!)xR>ZO*bsSj1rhZo33TX7S&#s_cHT;r zh_a?H69P$A4d>N+&oxJM32hn9vqeiYf-aU1E?js;Z`ODdrS&~7#E&?8j-J;tr`El^ 
z&F7F5fpv#p&&J{Gm&Om;!-zI`G5W9ewbcHVgz85Qf0--|T*VFu>hJ2cQ3r)l(@FAa zZV-0Sf1Y#K4-=E6`|;95$9lwB1-v+SH>dMrW#@@e$;{?_wpWmNNZ)@w`dQYqoPJ~U zKwIcjtM6~nJ?VNyCwwIsDtAGR_2Wa6)*CP!n6Z+K$?uKdjD$V*e9UuscihBb;M??Q zUL+%PvT{*QrA=aJe!Vc+Gd<-Kb8JN#%&id_{N8=O1-32ASan6%Y99doa8%vW2VBA3Y z3s=HIhVhfZ_jj-`=20x!2 zMET(D$I*Kf4M&U!reQcLaD9H`MlXaDyo7T^y5kcK<8d{(3~yJl&SVjIDqbFG#Udlb za)G*hap-cRZnA1CkYBjqV5^7tR+g_IEs((Wmch>|l8}3sI59hFJG4?ke;tRs^J$-* zX|>qWx(^QoaI0)*Dz%hRUU$35rW>z1-fsD{AX~&cqs^p<QS3saqv%{k0+6H{f?IaD zt8ye`b1R6$e}tpxzVg*3dMx3{XjaQJoPJWKh1$l5eP%uT+y+J4|DNWr^pKj)wfZUb z5#;N^x2Do(`(?RW4drK=s;n3U$g3cP%eyI_(^1XL{^LQU;?kE9wpH)J&_l<&nRrx! zrFZ_`!}FBbEXZ}Ss`Ka3NX?P&juXt8(kToj-*=L}k;XmEL@XNcX0W?S(?@HB!tNvvs+H?r}kT#fG}N z1*QBaf_D-E@2t}}7uUM$dzDGS80WProSwk=L5^}LC^Tb7Y7k@`P*!(p8=N+D2(z`8C^iT00mSng`;``BHT z6!2nJDGOjtlFhfaY`(nWH{m*bx+XA`$8r(D_Zh$ z@rPoaxt{D=ZNEcr6W3)t@?-TmK=??Q9X2-zjdJsSM2VEq>{sLQy?Mik?`d4_uW;kC z3|ImGD=IJ~av`X83)^{LCT5Z;2_(`;B=gy^!aYr7Zc|~0=l>#xVV%u;0-vEeEUbe! zfa-zCH6r;~TmjFuFck?+M7EApV;#i?+VDo_zsRjP%eQyhgH!}eU z&51&h!^5JARAZ=~AxF}YlOi1mNEzucnPS*lG0)liA)@%%3`Vc-e{@018uJJQqs%mu zwl*(?mbXJd>P+YKqw@07yBYaD{ZRP z7rSzerEX&pUm8nH?Ht5Qww=@+=_czjwoDl$xu|JhJ}qVoK?*io z>CB+q-yKlULQnonTanD!gyou2&zNmDKeyZvX}+ChDk!mB2VJRA{*Vz2t0$>kU*HW9 z)0HqVlHg?d>#$hkvH4S6OSDD){<|JmQp?9vlyJvmTtb7)`FKS~Z|S7kX6GJ={x5s( zZjiEs&}!g&lS8(qtblG)M+kZ+1wO18gF(E7)$`!5KDC@WH2{XphFFT7yna8){6wwR z2lf(pG#U^XC}P|3%W1_YBT!we$C3yGkBK9WtY+e?@Zp|0w#U_$8)Bmw{Tr|^ai(j9 zj#-`bZ_$EMcb~<;)!Usov*uOr+P@qv!$_Sn@*aC2=-dqtHSyd!pL&#;GhZM&lMB`AbGpS8Ybw?4|nrPyd-9B{S-&WvxBU+0) z7njH}38b6Wo_=RGp+Qsx&-B4SEM04#{T^+aeG;^d_ilqn>puC-KR`JhoM08xxX%wO zM!nPRQi4ZyaAV~H1oI_P*$=(9JYO0WRj#QMlJgse?d8$kU5M1{Ha2?QC~dm2jJ{ z4|%r#(v%?3^?2E=nrDI_P~*Bt6k<3{3v0?;ca%aoG2XP9#~Qs&d)CSLBo(GfITN<{ zTnL^2^CTlAcIpx{Q_Um#$Ktqd?kP6h{ci<16^%Dmb_X5rrP<>yH&@b?G1vU^DT#59 zEI2`Fbe9>0GHSb9_XqPH&&W<}wHRvq?D_945u1pxZdis(Y5>9j$mCX6i{4pDPH&Ga_iFcQb?+!z-p{;PDgjOY{NhO+^Ct=p|b#JSWQ zN@@e&`frH|Z1Zi1on@~;TAano*f(~ZVom%x8|vqpiC$vu&_2!gdeyG~QwPODS+N52 
zDIWAt5FsCDyg-+qn}*0_eP{Pp)9DtL|IQLmCFmrrA|N35 zUB<8As)FxfRZ^bz+9b$U*%DFHR~wB%!nyNhPU2}pnhSdlA0&_*hFs=A(d z+2ki*1meF`W5uDwakjP(SW~v2f0q;WA!&bg^q>)CMyc6i#(dflf!%s7^|Lt(FB$4f zEJ%aW5))op?6&&^GrZSAOUl?Jt#{aia3P6X20cE3XfDj!N}|t#o3~6L!=4p&XH$rr zbzy`3w)fS)-SyfThmCrsd zB-R)iyoD8&Ag#ul$|mhymf$I!rG;9l(cBmw=M~FWd`nXn*dBgUuKm!oC!p*eH;Tv_{Pyy8pkRY!R^B{01ykl%!_zf|u9PF&fMh!o4b=Xa zU93D_o?124L#cP|2Hq$!gZl-4tWd%y(ml$Ra+j2HdFyVQ^f3&xN@O`0NX$7ObFtb- z3ryx+$#e8zkk&Ep2E3bMsbA-0SDoO@Tz%dl6j8Syl50bgpz#_3k!37oBWjcXjt1$~ zM;m%B2Tf(1TLodGN-q`#%s7PvU`__H8OjQy+RcK7J3Q_)H-jvpCGtYbckIP&j73k&ghA?ij%qbO-~q-u(8n$ww*jKkJZjmat`Ww%iGI%$%0hs4=EXb+ zMA=nF+Y$OQHJiJq_ylWG%S+Drlk3C9uzhb`bALq9zoXSe}IS^IZ-HIs(DVQbjs4~q*&)v?b{+2T0(q`)$|fs zBtIJYY_FO{qb(#bNl26hh1b>gTO(u93m{L9l?_x|lcemg|4qJY>1PP0wKprANpwwr zNXg@#YMHI)u8@r_Lpkr8D2VC4+Y;aG!!%Ldeg!P>58MicP6timSQEYv-EDAAzY>9+ zv!{^NA*XHuIr}?|fyNlLe_;AbzECo^tKk!DO6&xp9TJVkLg(74tU;~9u-uJai(Hzd zYlUC6m0?#ps8?_+DvwC@W9_5&IU!>uk;7cckEj>5v5(?WcsbP&Ft(Mdi|5I9iTyH_DeH3$S^CfAq5X^uAVYU|*uteqScG}zx>>Ynm>s_13 z#N|XdB;2IK*(~|sQVF==TM0#4l+skZaY;NJoBY^jl0~D(5WRoZuPYqkX>m!{y3Wu3 zUJv{-R2YBB8s#Vr_?Xwc8 zWK|4#lGwTnOW(A=>oYmW$v$l{LDdbmwc&l}5d{qiGMx!NADNI(`l<~2UKA{N5OUA2EpnJaF}D`J zCEQ7ywqB_0bD3S*@N?UiJN0RAq4nk=PKey_k0>~+REP0x@^2c>oGXQB%fVAeFb>1n z<^iq_1`>T%)pN?QpaQ()$Eq`2s-Qee*?@}CpHItjG6FsP9eqqsuT|ku!=LYqN9XP{ z#9Hyh{Gca&qRl}_%z>3he1!uFqqM?P!x6;Bt$h`22m@7@dGDIN2My9o!l>3e2q&8SwKSyi%Ujz<(Oli9-W7O z5ckNy>>YHXZEEc;!fYs{1G~yJHN*RPap|RDaL*a;u=cV$kWM5HlPHu9Zy=oTnJi(8LyQLL_N4-2lLj@))o( zRP$CH`&{b0RqZX9+68)tZ?j)U@206qr(L4c@=3HHJ+P^cK&iwTx+J$C-g7=L+_`&Q zTPOsVw<~L&ZUW@~_lLJ8kbfEEo|m>Z?ck(Nf6GuQc#UFno%EA(E@9hZ_10=pJcA0$ zL$YIW8Bi1VPr!bLA(*RHPzK1nTEwx0;XgXMN*G9c5QmPPfDJU_#Z87%+N^S22INE^ zKsuceUPn05Kl*k zngppb)0EcC;dRBf^SdL7mTz++vRN{{r<|Om~Zg{q$=@E zxfpjXe|PzkyBOF1PgC{v1gkSI!G3joXKPOJQgCL7AL*8>2EiJG18S`ofU$dVd%|Rz zt3X1xdKp$EmXD#771*jma&HrEdkj)A>E3QBz_|;?3uj1eHdnfEr}G|fBZT)uzH1ty zH|V;vZmt<2tIPa^ABZHCONejkV*9|^0K4|~FWT}h`5!*h@7{ol!)fgov};+C*?|Ro 
zFA5?#mY)@OaKyxP0sqykO(cMcLFL1~>T9Ra7pO}NgEGLYMM6FoU2J?zN26D*!*#gr z&ba=&2q&6XTbzT>l_{*I`t5(j)jlS~@DKJJ!!MOpC#m@rkU=~o2BX|daJJ{Blbl2B_zt>Q+HwH-R^jYA zTi2FY#b#Uc<*#g=;IpUr+-eH=)H4d9gvyFu5gg4IG-6Wgq`+b=D-MP!tUW9FI|njd z_=)%h{{6iawLMe_2_0ZL{7HA^CDpsKr#-WnINWrJ8h8M#*;sO^V~1+Oe^!mbNK{k7emM&Ew&9(+7b_Pzh5X84~smzSVCVFg(`G5U-R)YIZ zh{vhgx3=(p4IW9vf7{+EL4|TmwR=Wc%8Kwc>Ad~)1|_-7y0U`7W@$fN0}Uu7G>*?^ zhFo?^QEAFQGQhVyMKAlULKah>L5vv>W`Ixtel2}3Lf&lx%bO}bx@!`+jGr}2Txxl3B=N6@ zo36a+6wOIXC(~4ri)5o<8u6YkFF*)L1JmjI3#^pp9*F-{hAjs~NB|FKJhD)1;BsAY{CJtNNQWo@)4)I}sKP*^Ny0758#^S#@V6BK}w{s87rdqGiBYKbeOJmWl z-xRMqoU&KRE2T`PFZmPt$Q$or$UIUT704r7 zK<}VCg2lM2iXcz?J)FP7PG}~VlKyaAFa#^-w)h(T!8JVTc(Z1VK9S@*HAkHrgTr=v zk$|?L&9U0TR~!*M8eX_*)kxp{`noNMO*q zqXG2~Yahix1hSBT%iaV%|CtkqxHP}n%iXA9WsXX%hY#AZM26(}*Kw}hCdQxcD+fL| z&lTu+A(N04P|?g&89Tc<1rZaoX%1_u!%)EF;`#1d`cqb z!U098@?OZ2eSsDnTx1W<=-dap;hI{kp);O$bp&^ri|68vwU-zES&;py`RNhvg#T3p zkb^!{Ftt7WwnM4C`fs4V2jc6 z1Kzkv06Z%eH;v$4k?Hz5z3vjn%T|>u1Q)DV)J>wosh|qIIQ7OuSH4*v$Ka6h2XzRX z@DBWvjUYiWRg_6fk}(MOV$~oxWahE2#4mH7HGsmMT&SkNW{z&k0AH!Uw?zKsu2?x% zT6uQWq?Jw7hyUXTqPHv}2exb#|9B3XZk`_dY8NN4kwt$8hh{F_Op2(8c7$`L-|5C= z<5i$Levpm}Lg=2;*wufFQzW{)x2J>qYr$cMJYn0}hJ`!SY#YtUrIgOZOAp@|^^Kb9K`syzSVm^Wr}FkiuU z0~mZiTK_!}Xz)d$7Cv}quP+Yls}4-?st%Z|9!FOyGKtMe=Z^~PR^mKI!%7**C}BPO z+k_{J=kp0(Ze%t}oJJorBimrj*+KO^%JZG1m2m~VF9YY3kV(TaN**qcM$!4y}y#l~SeH1XoKPWPZ2dOYC z+(H(1v%UG0G(L!32&nhznSuyUwk!?U*fp?0>Zw`-uYw+2-ukf!-l}F9i|6p8Y4^O3 zMKk75v;~Y!+U@P)wCa%QS9U9nJTDB(eyfVH1|XJr(4tyJMjq^Irz_rtdbz_dQntW? 
zX$Xkkj&b@)cmP9=Ut+`U<=bnpP^T$|l8>L(Fk4 z13b&su)aD^haR2K4T!Sj`JZ*m^X0f0+}+>~Q^`wIOV?)5DAc832(}U(oik~;t>CeajZ|2TSYJtR zTwI~v<3i`fZ&1mxeQIqV5k^ihflquJ#brXQGU^F`DD5%L2Z|pC>$($NvPb_ z1!}(?C4r@YDy9(!isc$M!h%|MeMAU8IK6%;wyh$3`ELAd$!mlX_paDW0x`2Xq8Amy zL-hsd$OE$MMnHs~D<`XR9^N=n)f*ywgvCMwES+Jl2r-Pq`35!4SvvW98?lq{~%17cOiBZOTXG6Ynv!HT)Y?IWy9o`w3?Yofq@#$8F6?^z*FL*1 zpe{ohE~4HJtdKncJ?G>1OI{V)9`Sie@V%iJYXAj%DSjrG5>5J`&teX(-3w==l+8a_R7SMR?c^YzzO9F5HqL-73JeY3L zq7rmG-iyR6<@m-cd!h3YJ@KHHfcPgpTDWQ5hbG2Xz9c7*&KCF}O{pZ0-Exr8;8%7b ze*Qv+wcvmRgOq!*4A~xQD>vblU8`D#)C$I-ZZ~vN5?2dc$|B@u1Jp30ti4Y>#=HFH zlkYtz-EsTjaAGT48jNtfG&?}@MUnXbKC~aMK3?Wbjf2GW!L8j8dl+5|+t+oK3-Q-$ zhjj#7S`9#o-Ga?_j?c&Jf37+UVOXY%OI<7dKimXG!x*Stu%>I$KvqQpH9}1VGhJzv z*UnLxr;NdLz`)<{*`r%seY(bBv&X?%84x2kdNB3!15aDXSLe8SzyrC!!E}`zX!Mwg z!zg#VmFk}voqtrhrS6>&l5*^a5T0kAV_ z8;zV`W32~}h4jnhZK_mq~MV9MYSSH{6ZLW@F2uP89?(hg8)r(GZN5LPeP{pX!ZN)o=M*jBrDw5wqE6 ztbH{=E|@YS`ywgs=~S!8ioAm=;xtIBuRXA*veW`+`rm)#vu1Zp(Ji0dc6@YaaL+;d zDAFv$FW0xX`(6U%-{s>uCC3O6k0rIA-Xhe&z}dP1AT;b|Xo^b*FF{L8=epUZ`w970 z#$NiWf0z*^=h$59ZP5|`EOAJ@pf9`4XN4_*hC8XoK)1j$zNEx32fCwyuSB9pj;8pi~U4x~PR>KRA-IO0;lQXI`n80B-3rw<6_{YTd zZ7+|}DtT|8)<*og4>SSf%2H#y= zb)V;U`*F+=^3-x3K_b4{pzP0LMmj($VD&wHj^yJiIz2&1!`_OX&<0#j0A zr1isK%D9R1cn3@}{940tEO|kgg=53QSXU*GuEnDIRjU8~{G(l>tHK~M;A|NaqtvH3 z3?cYb%Y419>JSl7a)rbysuxpsJdIO+qrprauj`4vj_#!i3#>?gnt>d4X&>4daM~L+0T3E#)?pdr-VnMFE+P{piwa3 zfh;57g@`_6MeNP2q)DLnbGQ|$j{yLOCNG7AWr~`9o8(<3lIS+uCwAhBT>FZ~XTMZ1 zS98!08&pwW2Iiq0eUk=cU+fh@*#+E0HXSRS;2_O^=F^GdAb;xvig+Mu|Zy1LwLlJhZUYguRf$#5y zfCGXF!v`|>lI$3}mquJVt%SP1DKGO5y7C2@+W2srI%3z)XTrUueHQDWqIQ`N8})*J z;pl4YELYjQj~px4Vu+Zp>gdxkHmjU^P5f3wz>L8j6XK}&nx$<3y`V=h_AgUi30(7{ zSL5IhMRz^I1b}AF8V=_^irFjL45}KS!AWk)yPNzY1dKa6l(1I6-_w8S+wzp|dNwxD#94+Pkq2gu^eYnUoPCqPw@5 z;}Q<=gCt-L0x;xs+eGo?y`WXA{Ou}R7W5%2r4t9w7CUGM^J?9xH2O`#+Kf*FR9`bl zL)}p}@1@bAq8J5RazmyjO{#86Q8S_I_7V9S#(9IX<-t52vbxa!kqdaa0wKJci|gpe zL_ut{Jy5XA5BC}gj(oCV@00UP!&B|wo0tWJiybGW#EeTi^Q6woVJm>}{7>PV<_z3h 
zn-9K)(ndX=MM;EIIbg1S&?*jv!p14YIK`u6!oNH2f-}kUa%PGAL{9PlUB;3DQmXy< z!{KORb%kLT^5jy@oA(|I!6K6ppRtRLOXz7K0w_5Gg!9J5my_?Toq;&VknfVTmraj9 zdMOG|(XnpnnNtMjL^RzSMsb|4r7Nd_>}9qlovTi?-QMDE3}J$XmV4o%{!qPn93T}eeHBiF8W21Lu1qg8&k17E&x2ugM z8hsv#w{6JJ(W+U{s=c;DRUysztsVP3Q!`lI2g*e92nV(2llkoXH?+vc6gZ#q}@;SYZ4 z__U_~8UF7_buQJFDo09=11L9hH(q(z>6!*>KRmmi$20G`oiT>{2V-5Y0OXuminiBh@NR9R zNDPI0elLe5)V9(F^5|gW?Bam~_0Q9TeOzHgEqden$3@ew7 zW`3|0FzuQsh~*%kbAbtz&n3L8fWtI93n)rN{4($&_`@?v@^L^DCYot}l~N|TGY=it%>7sf8pbc?2r;vT%rsJlm}Ox5f?lxjL)A5)F`TXf)lgLN$c*rOf8I}v=#Y0+pFth zuQ{^V ze*qh07fsZBzWwX}WyIazZ!X^<*p|>VkN~va3T19&b98cLVQmU!Ze(v_Y6>+qIUq0~ zZ(?c+JUj|7RC#b^ATLj1YEyJ=3NKC|F)%O+FGgu{b95j%Ff|G4?5av(28Y+-a|L}g=dWMv9IJ_>Vma%Ev{3V7P>TUm1(#}R(Vuh>U1 zDPZS5s#2+vEXh(V$+C6Wv8{(Cuq4;QC15Gj{`KkZnZ1C)l5~{hB~`{?c6z$|>uY)z zl(lvl%d~-yhcA(N1z$4bO2fC#xRda0GGTDtWs(W__L*WP)0SsSarlNcAt&0x9+0Q4 z&MeBYHgf{RTAz6a>yBrh1D_KamkO9bIF}l9I|UjgusBfZQHKMa!itG@HWSj|ipzv& zpw6=lxIwNLnY1)fnUV|;dy^?c6PGC`Y-SikYzihILK$Fp3?4WKxGpmaY~xI3MoXCq zof!j)L4k3A&-2W95Bhm#xdvtMEt$%s$gBcGIn1r$un`ul@e>UjJV=q;WsU)x0nv`r zM8U*@GpzxqNSsLY4q&w`a}s>14fsnb3lrF^83u&I!~#|gB!s3lkSw|bbU7!$WeRk8 zG!sJRJc!o_ok?)G1jlm?x+EASIOqb!UVvR%vdl|kCmNjOm4ZDW%WL!`CLkGoq`^|o7||HaxK$Gc>zoM)u!0DIn}J<{a+pD?C`TT2 zkW3hLh88JI4Q9B^6eJC2RzMsrhRA^fl|>y6LWVKs%xg6<0DyCt!y0Jg0^(vI**Ih{ zgca5~hhS!g11iA)k>P+qV7g%FfPo_wBtyDE;Ngi2FF*%a%_ZdL#6Z$=i5Y9S#f()* z1sz}p&OnC*EEzhDD-NMF;2KD1u+RvB#pH2@Wd+#)Gti+KWUhf^fy`27^57R~4Seh` zjIkea*yFHjhhLat`27NhH#oe-;pN%nZ-2}3H(CDm=ItiSA7>Bt*XR3kURRs-UuIUF zWq<$sQ2JvWs#e7ohXRKhhdB;cX{mj|=2-B{Rxk zNhF&=({Oy|GpqFsx~q^gY*T86Cp{dm(#)_iA|JF!7>(yMJbdX3w>Tq_-QmVCcSe9H zBOT6hhf^~f)pG7=IbfjCGBI;Kd*h7ogdNR_k3u!)pe)XXg}Kx`a);IE3tGkFJ3M>s zz_XFB-HUW+0AM#9>^M?Xhz60e#>^~6%u6srSOpRsMiE;uAcQVCGjR-ilomJ-IJ7~k zZD^KDj63)fvmn?(8sPpzqejVL9;pA$7-iB(nHP11d0)F#m~)mAqo&vR7>-z#%MaVBso*-0x&yhTVDXtjBQPK|Ca2taap(x!Id4u+0Ohxd_|3Ckg z&wtM2D49;;kaO`olDHB^%t}F(B%wB3Z#B$9mBk%&>V*+^3~01F%#n82kLby;N29V- zX<-E}%9c?r3R2fVZnik88vmrH397`v=L#eHt(8 
zqE)Z)28^ORL*pzkYSwoFttq8i$A>xGt#PH@5giy-Mdywf3O(E%c1Y@6GBn%}kRCM{ z5awZwC|D04r3M)tx*kakqb3*9!34V-$LZhix?XVXWUmA}c0e4fuY%gs7p@LA^gRegrQ!E|%I%P;fGU0DDM2;|l8 z+pBU7@%)t{x-9p53hTOBm-+g7bzbiF)g^{`0}H1@4bV4>5b;m>r|V5!F3y)>5_V7? zl&dO0--7SZ&NsU<|D6Ap-{imNf8>9ZyUiefJikx= z{PgU}(`WB{`4eJ&B7L-r=_8$?UwZDHK2xl&kU-kpl0e^3!VD5)j!~s}_9E1)H}W^? zEqCRjtF)RtNDdqNO5TlK2UHW?ww5AYK$;>gG$BAB2^|q6^xk`uDjfo$gY;e%q<2IR zLhl`<3W6e4dhbn&G=Ud<_rL3|_rLq@Ta$b_d!K#w-fPaRnLS@JIepsYSw3ech2>9- z8{yhwltCqj$t>*OG>hHAjZ2-YQbZrJ`o37l`7+VX5|oafRqmf2muIz}6C%CUk!LGm zWx48@gv7Q2v`xM<#IJ9^X}Ui{SS;}D-OO5(43o&Dq^T7=OYIVK8lm{T1`h}$Ycd0c z71NZASEARR;SN^Q8MphMk$pOHuAXlx>chy_=-(x!&;#PLv` zo-u5*I0|2&XY$xN`Lvk zJ*N5y;Suxn6Xb^+6_0+#)pX)#k+7AjUQe=o;_;`krR__*m)w(uVh#_!SA4GevSVbh zXc4UDM(bFXjBvkpb@IL@^L};9V@pbPYd0#tPvnlrNPYaXF4S*A*lLn4_A7qT$j)&Z zE&O5Rv1%pP8|O=qiPM*F*=&%{M}^}K#%9)C-UCd2s@>SJ(hEf6d2Q39#?sb~L0LCy zslxre{xJrhi7F$+?LPgg#GUK1Y{#q{oop4i+U~bEy{CAR^N|Z}Z*^~*I8{)h_s2I| zB(~yfTq~9`sj!Y&Rh~K7d<>eM|m1}VpC*Ah~R)W(v!cm8T>;5c)-`eCqQXoLQ>*{LLw`9!hS zpEQFOs+ijklQ{f(wKNk?3Z=Kc?#h0{Wf#wUAAcb&>_ElvJV@c3Z<~+CfColJIm_(P z)>@YQ?fuzE>WMNn4zWi7gAN=eu6>@+K z(BPkIda94oeI}{$U>`h(=!O!R7dTN)euiS`g4`1e)MlGjlC1Z0W~RvMHyeJW&`cBV zZqmU)6~tXqs?kdt)J0>NjnmN|HB9eivn=imd-%Te>7rR;X?(WAOSPg^7AH~oDaVIh z{)eir7l1uPK2<46ZV+#FprFB`Z>aA#``En*n&6rMO4t)c2UszoW|dX}LcAm9ZI0Rp z8(GwnPp7Q)yBJ9#%q1mopp?Y}8{rsKYpKsXLd?EjBPNP-D@1lMa+afKuAnjAjD9U> zRn3(6JC-GmHNu!w&9|tk$fo08R(;2$2$;uT8l~yCWbfFBeZ>37Ymro zO=SU;29Hem$`ZoZW}vFCk>qTmR?-}*PY0TnCd3>MeE{+W4X%Bl(b@OO z8V*iaL=IP=3Xja6`K2n2bqa}8NwOtO#bdY~!-!e7A(}6)oIRjt`;~mTY0Yj38oHF7 zOZz-k#P?a0scSyl(7o4r3L5U@8ii$n0W+6m_1g_$7N%2+WY)RCYF1f!0WJ2y=c8-} zzI4L#UkXt_&eG1dSXvyU6)qnfkIr6d&)w##s8$GJEY=Np&PLrl6xl)k!}v`D$-(kP zl#S*%w%&{{9bxYMK%uT|!z%{fGJ>%u23sZM;cao6%Yg?Ve;rEBcwd9k0Za8KiBjt# zPlND%WhgQWyzvPiDL8hnzz|NAIvVFYMwJ=E{3Z6a9Ea9=^JFD^RkNtrp-Qm(fO;rh zs?%#j(4$cc*Vom?rNf5{me^z_zU`omM{C7P*r`!R1h*H6DKd=?{pA`Rt`FUTM9KW@ zCl!(Db;~PcdHRML$9PVe$vYS0#Z$h*gK0jxWUi38{Q>(*#+H<)BgRoS?V#-oa*hyr 
zn}uDB!?|@v?ngVPHS^@-Pkvr&jcbqEJYRUn&ice3)1dkJGX6Z?*R_+$;;h#k?mN!7 zM%o-td{rqJZy>!nE<2YkJduZ$~J&dm(m7g^W1c-ym0_fqP43Z{%QwGbJ`&!}ZO| z>J7vdARe*07w%EwL^oHj8%1OD7bQkNT=8|~uMIwLaX7H;;7k8>!Qf+>Y9f|`8Nf)- zEb|hp8dHp6sb5lhlQ1=QZ)=nsZyeWNrOY4#vomAl_N$9Rg^Y|Y<_2zDyFLLfEv|H7 z4}F_>mTJ%;k7BpjOKL=T%;H0#;5e)&EeTEyvb7*}u#$ogxlp!UI=@go_|Gw&q{=At5yuo{?70%Gk+g={kMUh`t>=sTkJIP7G#E2nIr0 zA^#W7TQ<_RqyPqWvh9fNlsxn}S=I2q53FM{b18k zi9xb+COQF#XC#*)r+TQhe3D|g&8pY$8fD^92!C2YT%N?aw5$>%Vxm(dSguBfNKg)j zjv1V{Zq=AzOFx6hJfXmwXikJhN1dU3I6anEJq+#ghzvX#u(*uhX1Fqj2pr|4+@pG-8IO`M!8%u(EwiJglD9+!%SqzX ztK%6(ZVjKpRHDKpv%KC&s+r5wMk;9ZF1o7?|8yJ_oj-IeHSKWBN2K%k=@i&gU33BW zVm64%5l-iT0(Gl{ek==w^X#X5A^QtPp!iePkBFN}vwm43Y z=emsfnul~(b)&9*&bo;BT~}xU13TR0lmo=FjCPf@$tj=DM66*WGSl!_`Bp~aDff{N0FswhR@G$9BEoV51 zIWu6=F>%41UD7tpb)Z8{vD)ky;Q?umlU`0~tgq z#D6#N^-rUC{cm4&qB*>9Z=x8`&m>2D4~@uLu{R_~JP!>G4r#1Vd#$+BMfHSwqKip@ z6VQxQ(VG$gpx+vV>Z;EG%k@{+PtzI*pe}liZYia8VB+AUQwB^RyF7i39Wi}(Xbi*Z zy`jC?vgt|s1PJyV4X31)suMNU?Gc@M?d;MRcz-P(zJ zUYzsgvk_@QzsyZWfRx0daP^`vxU`$1Cg=dU`K)0$vMgu=xhW_x92pz5fLyFMXwuU= zoEfwC0&Y3a*~)?~%DhV}LY_SX1e!X$2n+`Z2cMD>zh6e$q`pEu>z+P{?@#K+^nxND z8H8aj*^lAOAmQJFbN>ZTVRlK~p_qegXZRZnZrSIe+g?7wkPxMP9rsWq;)pY2NRexHL9PDn z5pIV;D_tQAW&aoJgdJ*oIpm(!*OCRktJu-L7|_P(dZ}Qx*42&2X7hTB2zC3ihwk~F zQC8Z6LD!-64%4>BFY9%)3FGEoUsvg8zD85|Fy-FPb8X{3>o#;*G3T~(yk4;}l`>Ng z{Y>@topYF3y=eP>GpUkE$f~z~!cOa3(JsgT5HgfG9f5NuQl^VO+SB2$6E7r%|AE$ZJuN!Bw((3msM0s~sY)3D8RjjBcW>l_zA5ryd zcKV84yGQHj23nU5?<8AJ?h&E}+B82$YRl|=&=c*Y8hhP;br5QMWKG2%duBuzlfjVt zc0C2(VY+{WrZ{Btmc^4-t#tQUKaix5vsWED5lCCzT7&dX+n?VV89m}kFXrTI&L%S*tDoALcx`)sipa-P`V{d=!xwG}9?fA%JaH=)Z2GP^@&2Iva^8~W zsP|c|+dW&+wMQM(%C5LM%Y{e`ozX)T6@Mp3i;(uF7GGUE)jB-+n8+_ui5eSClG>z-0PjzF=u z82W}OSCLDPJp-f!9}jTWR%(RaC`8^M>C}vvE)a$2CDN1C^8@PAE9ez;@YM1wlMSZ~ z#Ii{bJJFp9{o7^YxFDp`VCz zuKbq8{jViMZ5wfgWlJ!T$&+kPN)aEiH#apKByV#;+lbax_s!GhzVhf_(<}Y++0ACh z=e)O4$Yf!Uy^Ht1YlOcDdm)tdoo13;u7V^!Tc=VJo1dRBGTkpz zyYSdayK3%4;p9R+-=0?%Xi?NctJ?T1=PgKMfb;uzn$f-v2Po+{DfzFp1@9titRe~M 
zJE4f^)O4$(7mj5&Chxy!@QJadgWjcO$r-796teyW>}=Q;6$t()hg#?}#(y9R^@rbn zZ1vaX*Y(%c{imb{0=i4a`|L%+aMg=jayD(O)syx1(~c?`?1LBQNdvcvb}uy8)Y4Wm7)4$vB+*3LYrs9_Nq#-s|Pv2wWq0XOk{x zudqfSV^N-e`IuMN!E&W|HblJNJ?MYC>-l>{tD|ONiN_`DU~b_7Kz%*LCm>*m$E9xVWpUTU60~el-k^uqpijFYAiW`^98R^c zsP7ZzeIDJs0zPiODQ@fY6?Ve+u6T4kPrLk(b_F+UnKEnTzR~hKx|tA>j7g86dXF%{ zmOgBsZebA+u@njZ1@%>WGQ|<@MX(P{gg!f7xu0c~^iDK_apfj|lShVXz}PAY*>K+l z%fXl{DW`GlgZW@oxiR0QtQcz^{)^af4Q^MF@CjfPz>5h(+0sYFOJ>l9$@{r&j31b} zQClXTh~Y3{lAz-T1iyd*=;MiH{;gOEf24+s3bc&YO>Bw-34RPCpbsS$52x=YE(Zj} zzmNei5tHEH24lj|=>s-#i>faP?Arh5mK4|Zwd@yN^?B8C)iJbIPmpl(W~!Yn;Oz_9 z93~YKR@_8PIdta#T`Xp&NhFlRE(rQAwv~fW5n}y^c(jw-@@Swutm+!&vP4Lf@a$&H zQR?4$`M+cQy>|?|9YnDX9X^ft8rQY+@wKCA(bu*`(Zt4wq$%AD9KB>oFM&xWf?6+` zKUCtCRASjy67E!@QB~rmL%0b=Z^p8JKdFoDA`(4W;j`-_-0kMv?WNuA5!vlSqwc1s zmTKQ(6o@nvh$R(>P#1{dPU+Fj=<+n^kTm#YVbGb}5Lxbo!4JnbUg2x&{Pe4jr)f8# zMTD^Zw|x8fffKenoM>T1gm#ghb_tTT2+Ud%G+JafTH>8w1kIZe9{fb3gTyI+6Y~=L z7&Iof4RDPY>i-BozZSi!pL#w0ubV0-O_=fCt|)ydnRq09PrRuN{1D7)KRK}5We49$B&HSCW;!rS&QGf*GJ0S4ymNt%MUPX?%Bk7ViBKz*;r z+t=#yzf_6N*T-J>T_!BRTlS0FzkA1C3trmKxs%!cq7+8#a>;04OsS3m%Y3${j9Wh) zHRhkITV8yxTqvG6s?Fz@#p7HM`qfy(xk25Us^;~>kMfj$n`?unbxSSy;>S;dQ25b$ zJC-)%rN~oogt|^}&*otICz6Z~H4gEG6K7-x1lg^mV1})g>vv=rRJ%#%HED*y7u!Ww zcB)-#O^E9ow}Ru9;7cC& zt^N?x7wzk}SGVZCyVu9Y+2>RzChg+c$KEFo{UoyYSsvTzr))*_8HsU^T4oFhq|5Rp z*o96JRHhnpw_75HHq$=xV%S=ae<@DY`cc*VIb{Wj@l-o%jrbt# zo0mwPk8vCKg4)@q{~TDM?Ko+7-n3?r&9XV4^JEJ?7j6}MveZ`}K6Thwx}M?uq+f4H t%aToFoHG8qZOM;AX*lQa7a|vgi8I2(8Fl4Bu|qukym-valFCwe{{`KXM*{!= literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/documentation/libxsmm_tune.md b/third_party/libxsmm/documentation/libxsmm_tune.md new file mode 100644 index 00000000..05c793e4 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_tune.md @@ -0,0 +1,159 @@ +## Customization + +### Intercepted Allocations + +To improve thread-scalability and to avoid frequent memory allocation/deallocation, the [scratch memory allocator](libxsmm_aux.md#memory-allocation) can be 
leveraged by intercepting existing malloc/free calls. This facility is built into LIBXSMM's main library, but disabled at compile-time (by default); build with `make MALLOC=1` to permanently enable, or build with `make MALLOC=-1` to even require an environment variable `LIBXSMM_MALLOC=1` or an API-call (`libxsmm_set_malloc`). Both runtime settings allow an optional lower and/or an upper bound to select malloc-calls based on the size of the allocation. For the environment option, an extra variable is introduced, e.g., use `LIBXSMM_MALLOC=1 LIBXSMM_MALLOC_LIMIT=4m:1g`. + +```C +void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi); +int libxsmm_get_malloc(size_t* lo, size_t* hi); +``` + +Querying the status may return zero even if there was an attempt to enable this facility (limitation/experimental implementation). Please note, the regular [Scratch Memory API](libxsmm_aux.md#memory-allocation) (e.g., `libxsmm_[get|set]_scratch_limit`) and the related environment variables can apply as well (`LIBXSMM_SCRATCH_LIMIT`, `LIBXSMM_SCRATCH_POOLS`, `LIBXSMM_SCRATCH_SCALE`). If intercepted memory allocations are enabled, the scratch limit is adjusted by default to allow unlimited growth of the scratch domain. Further, an increased verbosity level can help to gain some insight (`LIBXSMM_VERBOSE=3`). + +Intercepting malloc/free is supported by linking LIBXSMM's static or shared main library. The latter of which can be used to intercept calls of an existing and unchanged binary (LD_PRELOAD mechanism). To statically link with LIBXSMM and to intercept existing malloc/free calls, the following changes to the application's link stage are recommended: + +```bash +gcc [...] -Wl,--export-dynamic \ + -Wl,--wrap=malloc,--wrap=calloc,--wrap=realloc \ + -Wl,--wrap=memalign,--wrap=free \ + /path/to/libxsmm.a +``` + +The main library causes a BLAS-dependency which may be already fulfilled for the application in question. 
However, if this is not the case (unresolved symbols), `libxsmmnoblas.a` must be linked in addition. Depending on the dependencies of the application, the link order may also need to be adjusted. Compilers other than a GNU-compatible one (as shown above) can induce additional requirements (compiler runtime libraries). + +**Note**: The Intel Compiler may need "libirc", i.e., `-lirc` in front of `libxsmm.a`. Linking LIBXSMM's static library may require the above-mentioned linker flags (`--wrap`) in particular when using Intel Fortran (IFORT) as a linker driver unless `CALL libxsmm_init()` is issued (or at least one symbol of LIBXSMM's main library is referenced; check with `nm application | grep libxsmm`). Linking the static library by using the GNU compiler does not strictly need special flags when linking the application. + +Linking the shared library form of LIBXSMM (`make STATIC=0`) has similar requirements with respect to the application but does not require `-Wl,--wrap` although `-Wl,--export-dynamic` is necessary if the application is statically linked (besides LIBXSMM being linked in a shared fashion). The LD_PRELOAD based mechanism does not need any changes to the link step of an application. However, `libxsmmnoblas` may be required if the application does not already link against BLAS. + +```bash +LD_PRELOAD="libxsmm.so libxsmmnoblas.so" +LD_LIBRARY_PATH=/path/to/libxsmm/lib:${LD_LIBRARY_PATH} +LIBXSMM_MALLOC=1 +``` + +**Note**: If the application already uses BLAS, of course `libxsmmnoblas` must not be used! 
+ +The following code can be compiled and linked with `gfortran example.f -o example`: + +```fortran + PROGRAM allocate_test + DOUBLE PRECISION, ALLOCATABLE :: a(:), b(:), c(:) + INTEGER :: i, repeat = 100000 + DOUBLE PRECISION :: t0, t1, d + + ALLOCATE(b(16*1024)) + ALLOCATE(c(16*1024)) + CALL CPU_TIME(t0) + DO i = 1, repeat + ALLOCATE(a(16*1024*1024)) + DEALLOCATE(a) + END DO + CALL CPU_TIME(t1) + DEALLOCATE(b) + DEALLOCATE(c) + d = t1 - t0 + + WRITE(*, "(A,F10.1,A)") "duration:", (1D3 * d), " ms" + END PROGRAM +``` + +Running with `LIBXSMM_VERBOSE=3 LIBXSMM_MALLOC=1 LD_PRELOAD=... LD_LIBRARY_PATH=... ./example` displays: `Scratch: 132 MB (mallocs=1, pools=1)` which shows the innermost allocation/deallocation was served by the scratch memory allocator. + +### Static Specialization + +By default, LIBXSMM uses the [JIT backend](index.md#jit-backend) which is automatically building optimized code (JIT=1). Matrix multiplication kernels can be also statically specialized at compile-time of the library (M, N, and K values). This mechanism also extends the interface of the library because function prototypes are included into both the C and FORTRAN interface. + +```bash +make M="2 4" N="1" K="$(echo $(seq 2 5))" +``` + +The above example is generating the following set of (M,N,K) triplets: + +```bash +(2,1,2), (2,1,3), (2,1,4), (2,1,5), +(4,1,2), (4,1,3), (4,1,4), (4,1,5) +``` + +The index sets are in a loop-nest relationship (M(N(K))) when generating the indexes. Moreover, an empty index set resolves to the next non-empty outer index set of the loop nest (including to wrap around from the M to K set). An empty index set does not participate in the loop-nest relationship. 
Here is an example of generating multiplication routines which are "squares" with respect to M and N (N inherits the current value of the "M loop"): + +```bash +make M="$(echo $(seq 2 5))" K="$(echo $(seq 2 5))" +``` + +An even more flexible specialization is possible by using the MNK variable when building the library. It takes a list of indexes which are eventually grouped (using commas): + +```bash +make MNK="2 3, 23" +``` + +Each group of the above indexes is combined into all possible triplets generating the following set of (M,N,K) values: + +```bash +(2,2,2), (2,2,3), (2,3,2), (2,3,3), +(3,2,2), (3,2,3), (3,3,2), (3,3,3), (23,23,23) +``` + +Of course, both mechanisms (M/N/K and MNK based) can be combined by using the same command line (make). Static optimization and JIT can also be combined (no need to turn off the JIT backend). + +### User-Data Dispatch + +It can be desired to dispatch user-defined data, i.e., to query a value based on a key. This functionality can be used to, e.g., dispatch multiple kernels in one step if a code location relies on multiple kernels. This way, one can pay the cost of dispatch one time per task rather than according to the number of JIT-kernels used by this task. This functionality is detailed in the section about [Service Functions](libxsmm_aux.md#user-data-dispatch). + +### Targeted Compilation + +Specifying a code path is not necessary if the JIT backend is not disabled. However, disabling JIT compilation, statically generating a collection of kernels, and targeting a specific instruction set extension for the entire library looks like: + +```bash +make JIT=0 AVX=3 MNK="1 2 3 4 5" +``` + +The above example builds a library which cannot be deployed to anything else but the Intel Knights Landing processor family ("KNL") or future Intel Xeon processors supporting foundational Intel AVX‑512 instructions (AVX‑512F). 
The latter might be adjusted even further by supplying MIC=1 (along with AVX=3), however this does not matter since critical code is in inline assembly (and not affected). Similarly, SSE=0 (or JIT=0 without SSE or AVX build flag) employs an "arch-native" approach whereas AVX=1, AVX=2 (with FMA), and AVX=3 are specifically selecting the kind of Intel AVX code. Moreover, controlling the target flags manually or adjusting the code optimizations is also possible. The following example is GCC-specific and corresponds to OPT=3, AVX=3, and MIC=1: + +```bash +make OPT=3 TARGET="-mavx512f -mavx512cd -mavx512er -mavx512pf" +``` + +An extended interface can be generated which allows performing software prefetches. Prefetching data might be helpful when processing batches of matrix multiplications where the next operands are farther away or otherwise unpredictable in their memory location. The prefetch strategy can be specified similarly to what is shown in the section [Generator Driver](libxsmm_be.md#generator-driver), i.e., by either using the number of the shown enumeration, or by exactly using the name of the prefetch strategy. The only exception is PREFETCH=1 which is automatically selecting a strategy per an internal table (navigated by CPUID flags). The following example is requesting the "AL2jpst" strategy: + +```bash +make PREFETCH=8 +``` + +The prefetch interface is extending the signature of all kernels by three arguments (pa, pb, and pc). These additional arguments are specifying the locations of the operands of the next multiplication (the next a, b, and c matrices). Providing unnecessary arguments in case of the three-argument kernels is not a big problem (besides some additional call-overhead), however running a kernel that expects the prefetch arguments with only three arguments and thereby picking up garbage data is misleading or disabling the hardware prefetcher (due to software prefetches). 
In this case, a misleading prefetch location is given plus an eventual page fault due to an out-of-bounds (garbage-)location. + +Further, a generated configuration ([template](https://github.com/hfp/libxsmm/blob/master/include/libxsmm_config.h)) of the library encodes the parameters for which the library was built (static information). This helps to optimize client code related to the library's functionality. For example, the LIBXSMM_MAX_\* and LIBXSMM_AVG_\* information can be used with the LIBXSMM_PRAGMA_LOOP_COUNT macro to hint loop trip counts when handling matrices related to the problem domain of LIBXSMM. + +### Auto-dispatch + +The function `libxsmm_?mmdispatch` helps to amortize the cost of the dispatch when multiple calls with the same M, N, and K are needed. The automatic code dispatch is orchestrating two levels: + +1. Specialized routine (implemented in assembly code), +2. BLAS library call (fallback). + +Both levels are accessible directly, which allows customizing the code dispatch. The fallback level may be supplied by the Intel Math Kernel Library (Intel MKL) 11.2 DIRECT CALL feature. + +Further, a preprocessor symbol denotes the largest problem-size (*M* x *N* x *K*) that belongs to the first level, and therefore determines if a matrix multiplication falls back to BLAS. The problem-size threshold can be configured by using for example: + +```bash +make THRESHOLD=$((60 * 60 * 60)) +``` + +The maximum of the given threshold and the largest requested specialization refines the value of the threshold. Please note that explicitly JIT'ting and executing a kernel is possible and independent of the threshold. If a problem-size is below the threshold, dispatching the code requires figuring out whether a specialized routine exists or not. + +For statically generated code, the precision can be selected: + +```bash +make PRECISION=2 +``` + +The default preference is to generate and register both single and double-precision code (PRECISION=0). 
Specifying PRECISION=1|2 is generating and registering single-precision or double-precision code respectively. + +The automatic dispatch is highly convenient because existing GEMM calls can serve specialized kernels (even in a binary compatible fashion), however there is (and always will be) an overhead associated with looking up the code-registry and checking whether the code determined by the GEMM call is already JIT'ted or not. This lookup has been optimized with various techniques such as specialized CPU instructions to calculate CRC32 checksums, to avoid costly synchronization (needed for thread-safety) until it is ultimately known that the requested kernel is not yet JIT'ted, and by implementing a small thread-local cache of recently dispatched kernels. The latter of which can be adjusted in size (only power-of-two sizes) but also disabled: + +```bash +make CACHE=0 +``` + +Please note that measuring the relative cost of automatically dispatching a requested kernel depends on the kernel size (obviously smaller matrices are multiplied faster on an absolute basis), however smaller matrix multiplications are bottlenecked by memory bandwidth rather than arithmetic intensity. The latter implies the highest relative overhead when (artificially) benchmarking the very same multiplication out of the CPU-cache. + diff --git a/third_party/libxsmm/documentation/libxsmm_valid.md b/third_party/libxsmm/documentation/libxsmm_valid.md new file mode 100644 index 00000000..fda4ff30 --- /dev/null +++ b/third_party/libxsmm/documentation/libxsmm_valid.md @@ -0,0 +1,97 @@ +## Basic Tests + +To run basic [tests](http://libxsmm.readthedocs.io/#classic-library-abi): + +```bash +make tests +``` + +Remember: a set of key-value pairs represents a single unique (re-)build (and test): + +```bash +make STATIC=0 tests +``` + +There is a whole collection of test targets available (`test-cp2k`, `test-cpp`, `test-nek`). However, it is then better to rely on test-suites. 
+ +## Test Suites + +It is possible to run tests like LIBXSMM's continuous integration ([https://travis-ci.org/hfp/libxsmm](https://travis-ci.org/hfp/libxsmm)): + +```bash +scripts/tool_test.sh +``` + +The above command runs the entire collection ("scripts/tool_test.sh 0"). However, one test (of currently 11 tests) can be selected by number (1-11): + +```bash +scripts/tool_test.sh 1 +``` + +The suite itself can be also selected. For example, some DNN tests are described in `.test-dnn.yml`: + +```bash +TESTSET=test-dnn scripts/tool_test.sh +``` + +In general, all key-value pairs valid for LIBXSMM's `make` can be given as part of the environment: + +```bash +AVX=3 MIC=0 TESTSET=test-dnn scripts/tool_test.sh +``` + +Please note, the suite/test itself may be comprised of key-value pairs that take precedence. + +## CI Tests + +The `tool_test.sh` script is included in repository archives and releases i.e., it works for non-repository folders. In contrast, the Continuous Integration (CI) use case relies on the Git command being present and the folder being a Git-clone. + +Functionality + +* `[skip ci]` as part of a commit message will not trigger the CI agents, and tests are skipped for such a commit. +* `[full ci]` as part of a commit message will trigger a full test even if the setup uses the "Fast CI" option. + +The "Fast CI" option is enabled per filename given as 2nd command line argument: + +```bash +scripts/tool_test.sh 1 .fullci +``` + +In the above example, a file named `.fullci` may contain path/file patterns (wildcard format) triggering a full test if the files changed by the commit match any of the patterns. + +## Portability + +It is desirable to exercise portability and reliability of LIBXSMM's source code even on Non-Intel Architecture by the means of compilation, linkage, and generic tests. This section is *not* about Intel Architecture (or compatible). 
Successful compilation (or even running some of the tests successfully) does not mean LIBXSMM is valuable on that platform. + +Make sure to rely on `PLATFORM=1`, otherwise a compilation error should occur: _Intel Architecture or compatible CPU required!_ This error avoids (automated) attempts to upstream LIBXSMM to an unsupported platform. LIBXSMM is upstreamed for Intel Architecture on all major Linux distributions, FreeBSD, and others. If compilation fails with _LIBXSMM is only supported on a 64-bit platform!_, `make PLATFORM=1 DBG=1` can be used to exercise compilation. + +If platform support is forced (`PLATFORM=1`), runtime code generation is disabled at compile-time (`JIT=0`). Runtime code generation can be also enabled (`PLATFORM=1 JIT=1`) but code-dispatch will still return NULL-kernels. However, some tests will start failing as missing JIT-support is not signaled at compile-time as with `JIT=0`. + +**Note**: JIT-support normally guarantees a non-NULL code pointer ("kernel") if the request is according to the [limitations](https://github.com/hfp/libxsmm/wiki/Q&A#what-is-a-small-matrix-multiplication) (user-code is not asked to check for a NULL-kernel), which does not hold true if JIT is enabled on a platform that does not implement it. + +### TinyCC + +The Tiny C Compiler (TinyCC) supports Intel Architecture, but lacks at least support for thread-local storage (TLS). + +```bash +make CC=tcc THREADS=0 INTRINSICS=0 VLA=0 ASNEEDED=0 BLAS=0 FORCE_CXX=0 +``` + +### IBM XL Compiler for Linux (POWER) + +The POWER platform requires aforementioned `PLATFORM=1` to unlock compilation. + +```bash +make PLATFORM=1 CC=xlc CXX=xlc++ FC=xlf +``` + +### Cross-compilation for ARM + +ARM AArch64 is regularly [supported](https://github.com/hfp/libxsmm/wiki/Compatibility#arm-aarch64). However, 32-bit ARM requires aforementioned `PLATFORM=1` to unlock compilation (similar to 32-bit Intel Architecture). 
Unlocking compilation for 32-bit ARM is not be confused with supporting 32-bit ARM architectures. + +```bash +make PLATFORM=1 AR=arm-linux-gnueabi-ar \ + FC=arm-linux-gnueabi-gfortran \ + CXX=arm-linux-gnueabi-g++ \ + CC=arm-linux-gnueabi-gcc +``` diff --git a/third_party/libxsmm/ide/_vs2019-configure.bat b/third_party/libxsmm/ide/_vs2019-configure.bat new file mode 100644 index 00000000..8ae4e7ca --- /dev/null +++ b/third_party/libxsmm/ide/_vs2019-configure.bat @@ -0,0 +1,17 @@ +@ECHO OFF +SETLOCAL + +ECHO ================================================================================ +ECHO One-time configuration (Cygwin w/ GNU GCC, GNU Make, and Python needed in PATH) +ECHO When configured, it is sufficient to start _vs2019.bat or _vs2019.sln +ECHO IMPORTANT: due to zero-config, configuration is not necessary anymore! +ECHO One may terminate this configuration (CTRL-C) +ECHO and simply start _vs2019.bat or _vs2019.sln. +PAUSE +cd .. +bash -c "make realclean ; make headers sources" +cd ide + +CALL %~d0"%~p0"_vs2019.bat + +ENDLOCAL \ No newline at end of file diff --git a/third_party/libxsmm/ide/libxsmm_generator_gemm_driver.vcxproj b/third_party/libxsmm/ide/libxsmm_generator_gemm_driver.vcxproj new file mode 100644 index 00000000..32b535b3 --- /dev/null +++ b/third_party/libxsmm/ide/libxsmm_generator_gemm_driver.vcxproj @@ -0,0 +1,395 @@ + + + + + debug + Win32 + + + debug + x64 + + + symbols + Win32 + + + symbols + x64 + + + release + Win32 + + + release + x64 + + + + + + + + + + + + libxsmm_generator_gemm_driver + {47EDE325-4516-48DA-862B-F689F12DDBD3} + 10.0 + + + + Application + Disabled + Disabled + v142 + + true + + + Application + true + true + Disabled + Disabled + v142 + + + + Application + true + true + Disabled + Disabled + v142 + + true + + + Application + Disabled + Disabled + v142 + + true + + + true + Application + true + Disabled + Disabled + v142 + + + + true + Application + true + Disabled + Disabled + true + v142 + + + + + + + + + + + + + + + + 
+ + + + + + + + + + <_ProjectFileVersion>10.0.30319.1 + bin\ia32\ + bin\ia32\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + bin\intel64\ + bin\intel64\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + bin\ia32\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + bin\intel64\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + + + $(ProjectName)-$(Configuration) + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + + + $(ProjectName)-$(Configuration) + obj\$(Platform)-$(Configuration)\$(ProjectName)\ + + + + + $(ProjectName)-$(Configuration) + + + $(ProjectName)-$(Configuration) + + + + Full + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + Level4 + Fast + NoTraps + true + true + StreamingSIMDExtensions2 + None + false + true + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + true + Console + $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) + libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) + + + + + Console + + + + + + MaxSpeed + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + Level4 + Fast + NoTraps + true + true + StreamingSIMDExtensions2 + 
None + false + true + SingleFile + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + true + Console + $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) + libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) + true + + + + + Console + + + + + + X64 + + + Full + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + Level4 + Fast + NoTraps + true + true + None + false + true + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + Console + $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) + libxsmm.lib;libxsmmnoblas.lib;%(AdditionalDependencies) + + + + + Console + + + + + + X64 + + + MaxSpeed + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;NDEBUG;%(PreprocessorDefinitions) + true + MultiThreadedDLL + false + Level4 + Fast + NoTraps + true + true + None + false + true + SingleFile + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + Console + $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) + libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) + true + + + + + Console + + + + + + Disabled + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + 
__SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + Level4 + ProgramDatabase + None + false + true + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + true + Console + $(SolutionDir)..\lib\ia32;$(LIBXSMMROOT)\lib\ia32;%(AdditionalLibraryDirectories) + libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) + MSVCRT + + + + + Console + + + + + + X64 + + + Disabled + $(SolutionDir)..\include;$(LIBXSMMROOT)\include;%(AdditionalIncludeDirectories) + __SUPPRESS_FOR_PRODUCT;_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES;_CRT_SECURE_NO_DEPRECATE;_SCL_SECURE_NO_DEPRECATE;_USE_MATH_DEFINES;WIN32_LEAN_AND_MEAN;NOMINMAX;_DEBUG;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + Level4 + ProgramDatabase + None + false + true + 3948,10373,10382 + HOST + true + + + 0x0407 + + + $(OutDir)$(TargetName)$(TargetExt) + true + true + Console + $(SolutionDir)..\lib\intel64;$(LIBXSMMROOT)\lib\intel64;%(AdditionalLibraryDirectories) + libxsmm-$(Configuration).lib;libxsmmnoblas-$(Configuration).lib;%(AdditionalDependencies) + MSVCRT + + + + + Console + + + + + + + \ No newline at end of file diff --git a/third_party/libxsmm/include/.make b/third_party/libxsmm/include/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/include/libxsmm.f b/third_party/libxsmm/include/libxsmm.f new file mode 100644 index 00000000..828e3205 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm.f @@ -0,0 +1,2087 @@ +!=======================================================================! +! Copyright (c) Intel Corporation - All rights reserved. ! +! This file is part of the LIBXSMM library. ! +! ! +! For information on the license, see the LICENSE file. ! +! Further information: https://github.com/hfp/libxsmm/ ! +! 
SPDX-License-Identifier: BSD-3-Clause ! +!=======================================================================! +! Hans Pabst (Intel Corp.) +!=======================================================================! + + MODULE LIBXSMM + USE, INTRINSIC :: ISO_C_BINDING, ONLY: & + & C_DOUBLE, C_FLOAT, C_DOUBLE_COMPLEX, C_FLOAT_COMPLEX, & + & C_LONG_LONG, C_INT, C_SHORT, C_CHAR, C_INT8_T, C_BOOL, & + & C_F_POINTER, C_ASSOCIATED, C_LOC, C_PTR, & + & C_FUNPTR, C_NULL_FUNPTR, C_NULL_PTR + IMPLICIT NONE + + !> Name of the version (stringized set of version numbers). + CHARACTER(*), PARAMETER :: LIBXSMM_VERSION = "1.16.1-1534" + !> Name of the branch of which the version is derived from. + CHARACTER(*), PARAMETER :: LIBXSMM_BRANCH = "master" + !> Major version based on the last reachable tag under RCS. + INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_MAJOR = 1 + !> Minor version based on the last reachable tag of the RCS. + INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_MINOR = 16 + !> Update number based on the last reachable tag under RCS. + INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_UPDATE = 1 + !> Patch number counting commits since the last version stamp. + INTEGER(C_INT), PARAMETER :: LIBXSMM_VERSION_PATCH = 1534 + + !> Parameters the library and static kernels were built for. + INTEGER(C_INT), PARAMETER :: LIBXSMM_CACHELINE = 64 + INTEGER(C_INT), PARAMETER :: LIBXSMM_ALIGNMENT = 64 + INTEGER(C_INT), PARAMETER :: LIBXSMM_PREFETCH = -1 + INTEGER(C_INT), PARAMETER :: LIBXSMM_MAX_MNK = 262144 + INTEGER(C_INT), PARAMETER :: LIBXSMM_MAX_DIM = 64 + INTEGER(C_INT), PARAMETER :: LIBXSMM_FLAGS = 0 + INTEGER(C_INT), PARAMETER :: LIBXSMM_ILP64 = 0 + + !> Parameters supplied for backward compatibility (deprecated). + INTEGER(C_INT), PARAMETER :: LIBXSMM_COL_MAJOR = 1 + INTEGER(C_INT), PARAMETER :: LIBXSMM_ROW_MAJOR = 0 + + !> LIBXSMM_BLASINT_KIND impacts BLAS interface (LP64: 32-bit, ILP64: 64-bit). 
+ INTEGER(C_INT), PARAMETER :: LIBXSMM_BLASINT_KIND = C_INT + !> Integer kind used by timer interface. + INTEGER(C_INT), PARAMETER :: LIBXSMM_TICKINT_KIND = C_LONG_LONG + + !> Parameters representing the GEMM performed by the simplified interface. + REAL(C_DOUBLE), PARAMETER :: LIBXSMM_ALPHA = REAL(1, C_DOUBLE) + REAL(C_DOUBLE), PARAMETER :: LIBXSMM_BETA = REAL(1, C_DOUBLE) + + !> Flag enumeration which can be IORed. + INTEGER(C_INT), PARAMETER :: & + & LIBXSMM_GEMM_FLAG_NONE = 0, & + & LIBXSMM_GEMM_FLAG_TRANS_A = 1, & + & LIBXSMM_GEMM_FLAG_TRANS_B = 2, & + & LIBXSMM_GEMM_FLAG_TRANS_AB = IOR( & + & LIBXSMM_GEMM_FLAG_TRANS_A, LIBXSMM_GEMM_FLAG_TRANS_B), & + & LIBXSMM_GEMM_FLAG_BETA_0 = 16, & + & LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT = 2176, & + & LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0 = IOR( & + & LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT, & + & LIBXSMM_GEMM_FLAG_BETA_0) + + !> Flag enumeration which can be IORed. + INTEGER(C_INT), PARAMETER :: & + ! Handle recorded batch unsynchronized-parallel. + & LIBXSMM_MMBATCH_FLAG_DEFAULT = 0, & + ! Synchronize among C matrices. + & LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED = 512, & + ! Handle recorded batch sequentially. + & LIBXSMM_MMBATCH_FLAG_SEQUENTIAL = 1024, & + ! Only record a statistic of potential SMMs. + & LIBXSMM_MMBATCH_FLAG_STATISTIC = 2048 + + !> Enumerates element/data types. + INTEGER(C_INT), PARAMETER :: & + & LIBXSMM_DATATYPE_F64 = 0, & + & LIBXSMM_DATATYPE_F32 = 1, & + & LIBXSMM_DATATYPE_BF16 = 2, & + & LIBXSMM_DATATYPE_I64 = 3, & + & LIBXSMM_DATATYPE_I32 = 4, & + & LIBXSMM_DATATYPE_I16 = 5, & + & LIBXSMM_DATATYPE_I8 = 6, & + & LIBXSMM_DATATYPE_UNSUPPORTED = 7 + + !> Denotes the precision/data type of GEMM (for weak-typed + !> interface functions such as libxsmm_xmmdispatch). 
+ INTEGER(C_INT), PARAMETER :: & + & LIBXSMM_GEMM_PRECISION_F64 = LIBXSMM_DATATYPE_F64, & + & LIBXSMM_GEMM_PRECISION_F32 = LIBXSMM_DATATYPE_F32, & + & LIBXSMM_GEMM_PRECISION_BF16 = LIBXSMM_DATATYPE_BF16, & + & LIBXSMM_GEMM_PRECISION_I32 = LIBXSMM_DATATYPE_I32, & + & LIBXSMM_GEMM_PRECISION_I16 = LIBXSMM_DATATYPE_I16, & + & LIBXSMM_GEMM_PRECISION_I8 = LIBXSMM_DATATYPE_I8 + + !> Enumeration of the available prefetch strategies which can be IORed. + INTEGER(C_INT), PARAMETER :: & + ! Automatically select strategy (frontend). + & LIBXSMM_PREFETCH_AUTO = -1, & + ! No prefetching and no prefetch function signature. + & LIBXSMM_PREFETCH_NONE = 0, & + ! Only function prefetch signature. + & LIBXSMM_PREFETCH_SIGONLY = 1, & + ! Prefetch PA using accesses to A. + & LIBXSMM_GEMM_PREFETCH_AL2 = 2, & + ! Prefetch PB using accesses to C. + & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C = 4, & + ! Prefetch A ahead. + & LIBXSMM_GEMM_PREFETCH_AL2_AHEAD = 8, & + ! Composed prefetch strategies. + & LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C = IOR( & + & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C, & + & LIBXSMM_GEMM_PREFETCH_AL2), & + & LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD = IOR( & + & LIBXSMM_GEMM_PREFETCH_BL2_VIA_C, & + & LIBXSMM_GEMM_PREFETCH_AL2_AHEAD), & + ! Current B into L1. + & LIBXSMM_GEMM_PREFETCH_BL1 = 16 + + !> Enumerates the available target architectures and instruction + !> set extensions as returned by libxsmm_get_target_archid(). + INTEGER(C_INT), PARAMETER :: & + & LIBXSMM_TARGET_ARCH_UNKNOWN = 0, & + & LIBXSMM_TARGET_ARCH_GENERIC = 1, & + & LIBXSMM_X86_GENERIC = 1002, & + & LIBXSMM_X86_SSE3 = 1003, & + & LIBXSMM_X86_SSE4 = 1004, & + & LIBXSMM_X86_AVX = 1005, & + & LIBXSMM_X86_AVX2 = 1006, & + & LIBXSMM_X86_AVX512 = 1007, & + & LIBXSMM_X86_AVX512_MIC = 1010, & + & LIBXSMM_X86_AVX512_KNM = 1011, & + & LIBXSMM_X86_AVX512_CORE = 1020, & + & LIBXSMM_X86_AVX512_CLX = 1021, & + & LIBXSMM_X86_AVX512_CPX = 1022 + + !> Generic function type (double-precision). 
+ TYPE, BIND(C) :: LIBXSMM_DMMFUNCTION + TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR + END TYPE + + !> Generic function type (single-precision). + TYPE, BIND(C) :: LIBXSMM_SMMFUNCTION + TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR + END TYPE + + !> Generic function type (low-precision) + TYPE, BIND(C) :: LIBXSMM_WIMMFUNCTION + TYPE(C_FUNPTR) :: handle = C_NULL_FUNPTR + END TYPE + + !> Generic function types with certain arity. + ABSTRACT INTERFACE + PURE SUBROUTINE LIBXSMM_FUNCTION3(a, b, c) BIND(C) + IMPORT :: C_PTR + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + END SUBROUTINE + + PURE SUBROUTINE LIBXSMM_FUNCTION6(a, b, c, pa, pb, pc) BIND(C) + IMPORT :: C_PTR + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + TYPE(C_PTR), INTENT(IN), VALUE :: pa, pb, pc + END SUBROUTINE + END INTERFACE + + !> Structure of differences with matrix norms according + !> to http://www.netlib.org/lapack/lug/node75.html). + TYPE, BIND(C) :: LIBXSMM_MATDIFF_INFO + REAL(C_DOUBLE) norm1_abs, norm1_rel !! One-norm + REAL(C_DOUBLE) normi_abs, normi_rel !! Infinity-norm + REAL(C_DOUBLE) normf_rel !! Froebenius-norm + !> Maximum difference, L2-norm (absolute and relative), and R-squared. + REAL(C_DOUBLE) linf_abs, linf_rel, l2_abs, l2_rel, rsq + !> Statistics: sum/l1, min., max., arith. avg., and variance. + REAL(C_DOUBLE) l1_ref, min_ref, max_ref, avg_ref, var_ref + !> Statistics: sum/l1, min., max., arith. avg., and variance. + REAL(C_DOUBLE) l1_tst, min_tst, max_tst, avg_tst, var_tst + !> Values (v_ref, v_tst) and location (m, n) of largest linf_abs. + REAL(C_DOUBLE) v_ref, v_tst + !> Location (m, n) of largest difference (linf_abs). + INTEGER(LIBXSMM_BLASINT_KIND) m, n + END TYPE + + INTERFACE + !> Initialize the library; pay for setup cost at a specific point. + SUBROUTINE libxsmm_init() BIND(C) + END SUBROUTINE + + !> De-initialize the library and free internal memory (optional). + SUBROUTINE libxsmm_finalize() BIND(C) + END SUBROUTINE + + !> Get the default prefetch strategy. 
+ PURE FUNCTION libxsmm_get_gemm_auto_prefetch() BIND(C) + IMPORT :: C_INT + INTEGER(C_INT) :: libxsmm_get_gemm_auto_prefetch + END FUNCTION + + !> Set the default prefetch strategy. + SUBROUTINE libxsmm_set_gemm_auto_prefetch(strategy) BIND(C) + IMPORT :: C_INT + INTEGER(C_INT), INTENT(IN), VALUE :: strategy + END SUBROUTINE + + !> Returns the architecture and instruction set extension as determined + !> by the CPUID flags, as set by the libxsmm_get_target_arch* functions, + !> or as set by the LIBXSMM_TARGET environment variable. + PURE FUNCTION libxsmm_get_target_archid() BIND(C) + IMPORT :: C_INT + INTEGER(C_INT) :: libxsmm_get_target_archid + END FUNCTION + + !> Set target architecture (archid: see PARAMETER enumeration) + !> for subsequent code generation (JIT). + SUBROUTINE libxsmm_set_target_archid(archid) BIND(C) + IMPORT :: C_INT + INTEGER(C_INT), INTENT(IN), VALUE :: archid + END SUBROUTINE + + !> Set target architecture for subsequent code generation (JIT). + !> arch="0"|"sse"|"snb"|"hsw"|"knl"|"knm"|"skx"|"clx"|"cpx", + !> or "0" to rely on the CPUID (default). + !> There are some alternative target names as well: + !> "sse", "avx", "avx2", "avx3" (incomplete list). + SUBROUTINE libxsmm_set_target_arch(arch) BIND(C) + IMPORT :: C_CHAR + CHARACTER(C_CHAR), INTENT(IN) :: arch(*) + END SUBROUTINE + + !> Get the level of verbosity. + PURE FUNCTION libxsmm_get_verbosity() BIND(C) + IMPORT :: C_INT + INTEGER(C_INT) :: libxsmm_get_verbosity + END FUNCTION + + !> Set the level of verbosity (0: off, positive value: verbosity level, + !> negative value: maximum verbosity, which also dumps JIT-code). + SUBROUTINE libxsmm_set_verbosity(level) BIND(C) + IMPORT :: C_INT + INTEGER(C_INT), INTENT(IN), VALUE :: level + END SUBROUTINE + + !> Impure function which returns the current clock tick of a + !> monotonic timer source; uses a platform-specific resolution. + !> Implicit FORTRAN 77 interface: not available. 
+ INTEGER(LIBXSMM_TICKINT_KIND) & + & FUNCTION libxsmm_timer_tick() BIND(C) + IMPORT :: LIBXSMM_TICKINT_KIND + END FUNCTION + + !> Impure function (timer freq. may vary) which returns the duration + !> (in seconds) between two values received by libxsmm_timer_tick. + !> Implicit FORTRAN 77 interface: not available. + FUNCTION libxsmm_timer_duration(tick0, tick1) BIND(C) + IMPORT :: LIBXSMM_TICKINT_KIND, C_DOUBLE + INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN), VALUE :: tick0 + INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN), VALUE :: tick1 + REAL(C_DOUBLE) :: libxsmm_timer_duration + END FUNCTION + + !> Deallocates the JIT'ted code, or unregisters + !> and releases code from the registry. + !> Implicit FORTRAN 77 interface: + !> INTEGER(8) :: kernel + SUBROUTINE libxsmm_release_kernel(kernel) & + & BIND(C, NAME="libxsmm_release_kernel_") + IMPORT :: C_FUNPTR + TYPE(C_FUNPTR), INTENT(IN) :: kernel + END SUBROUTINE + + !> Type-generic (unsafe) code dispatch (trylock: impure routine). + !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: gemm_precision, flags, prefetch + !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc + !> REAL(4|8) :: alpha, beta + !> INTEGER(8) :: kernel + SUBROUTINE libxsmm_xmmdispatch(kernel, gemm_precision, & + & m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch) & + & BIND(C, NAME="libxsmm_xmmdispatch_") + IMPORT :: C_FUNPTR, C_PTR, C_INT, LIBXSMM_BLASINT_KIND + TYPE(C_FUNPTR), INTENT(OUT) :: kernel + INTEGER(C_INT), INTENT(IN) :: gemm_precision + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + TYPE(C_PTR), INTENT(IN), VALUE :: lda, ldb, ldc + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + TYPE(C_PTR), INTENT(IN), VALUE :: flags, prefetch + END SUBROUTINE + + !> Type-generic (unsafe) code dispatch (trylock: impure routine). 
+ !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: iprec, oprec, flags, prefetch + !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc + !> REAL(4|8) :: alpha, beta + !> INTEGER(8) :: kernel + SUBROUTINE libxsmm_xmmdispatch2(kernel, iprec, oprec, & + & m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch) & + & BIND(C, NAME="libxsmm_xmmdispatch2_") + IMPORT :: C_FUNPTR, C_PTR, C_INT, LIBXSMM_BLASINT_KIND + TYPE(C_FUNPTR), INTENT(OUT) :: kernel + INTEGER(C_INT), INTENT(IN) :: iprec, oprec + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + TYPE(C_PTR), INTENT(IN), VALUE :: lda, ldb, ldc + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + TYPE(C_PTR), INTENT(IN), VALUE :: flags, prefetch + END SUBROUTINE + + !> Generic call routine (3-argument form). + !> Implicit FORTRAN 77 interface: + !> REAL(4|8) :: a, b, c + !> INTEGER(8) :: kernel + PURE SUBROUTINE libxsmm_xmmcall_abc(kernel, a, b, c) & + & BIND(C, NAME="libxsmm_xmmcall_abc_") + IMPORT :: C_FUNPTR, C_PTR + TYPE(C_FUNPTR), INTENT(IN) :: kernel + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + END SUBROUTINE + + !> Generic call routine (6-argument form). + !> Implicit FORTRAN 77 interface: + !> REAL(4|8) :: a, b, c, pa, pb, pc + !> INTEGER(8) :: kernel + PURE SUBROUTINE libxsmm_xmmcall_prf(kernel, & + & a, b, c, pa, pb, pc) & + & BIND(C, NAME="libxsmm_xmmcall_prf_") + IMPORT :: C_FUNPTR, C_PTR + TYPE(C_FUNPTR), INTENT(IN) :: kernel + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c, pa, pb, pc + END SUBROUTINE + + !> Fill destination with zeros; treats dst in raw/binary fashion. + SUBROUTINE libxsmm_xclear(dst, nbytes) & + & BIND(C, NAME="libxsmm_xclear_") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), INTENT(IN), VALUE :: dst + INTEGER(C_INT), INTENT(IN) :: nbytes + END SUBROUTINE + + !> Remove key-value pair from code registry and release memory. 
+ SUBROUTINE libxsmm_xrelease(key, keysize) & + & BIND(C, NAME="libxsmm_xrelease_") + IMPORT :: C_PTR, C_INT + TYPE(C_PTR), INTENT(IN), VALUE :: key + INTEGER(C_INT), INTENT(IN) :: keysize + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine. + !> Implicit FORTRAN 77 interface: + !> ARRAY :: input, output + !> INTEGER(4|8) :: m, n, ldi, ldo + !> INTEGER(4) :: typesize + PURE SUBROUTINE libxsmm_xmatcopy(output, input, typesize, & + & m, n, ldi, ldo) BIND(C, NAME="libxsmm_matcopy_") + IMPORT :: LIBXSMM_BLASINT_KIND, C_PTR, C_INT + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + TYPE(C_PTR), INTENT(IN), VALUE :: output, input + INTEGER(C_INT), INTENT(IN) :: typesize + END SUBROUTINE + + !> Transpose a matrix (in-place form). + !> Implicit FORTRAN 77 interface: + !> ARRAY :: matrix + !> INTEGER(4|8) :: m, n, ldi, ldo + !> INTEGER(4) :: typesize + PURE SUBROUTINE libxsmm_xitrans(matrix, typesize, & + & m, n, ldi, ldo) BIND(C, NAME="libxsmm_itrans_") + IMPORT :: C_PTR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + TYPE(C_PTR), INTENT(IN), VALUE :: matrix + INTEGER(C_INT), INTENT(IN) :: typesize + END SUBROUTINE + + !> Transpose a matrix (out-of-place form). + !> Implicit FORTRAN 77 interface: + !> ARRAY :: input, output + !> INTEGER(4|8) :: m, n, ldi, ldo + !> INTEGER(4) :: typesize + PURE SUBROUTINE libxsmm_xotrans(output, input, typesize, & + & m, n, ldi, ldo) BIND(C, NAME="libxsmm_otrans_") + IMPORT :: C_PTR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + TYPE(C_PTR), INTENT(IN), VALUE :: output, input + INTEGER(C_INT), INTENT(IN) :: typesize + END SUBROUTINE + + !> Matrix copy; MT via libxsmmext (out-of-place form). 
+ !> Implicit FORTRAN 77 interface: + !> ARRAY :: output, input + !> INTEGER(4|8) :: m, n, ldi, ldo + !> INTEGER(4) :: typesize + PURE SUBROUTINE libxsmm_matcopy_omp(output, input, typesize, & + & m, n, ldi, ldo) BIND(C, NAME="libxsmm_matcopy_omp_") + IMPORT :: C_PTR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + TYPE(C_PTR), INTENT(IN), VALUE :: output, input + INTEGER(C_INT), INTENT(IN) :: typesize + END SUBROUTINE + + !> Matrix transposition; MT via libxsmmext (out-of-place form). + !> Implicit FORTRAN 77 interface: + !> ARRAY :: output, input + !> INTEGER(4|8) :: m, n, ldi, ldo + !> INTEGER(4) :: typesize + PURE SUBROUTINE libxsmm_otrans_omp(output, input, typesize, & + & m, n, ldi, ldo) BIND(C, NAME="libxsmm_otrans_omp_") + IMPORT :: C_PTR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + TYPE(C_PTR), INTENT(IN), VALUE :: output, input + INTEGER(C_INT), INTENT(IN) :: typesize + END SUBROUTINE + + !> General dense MM; MT via libxsmmext (double-precision). + !> Implicit FORTRAN 77 interface: similar to DGEMM. + PURE SUBROUTINE libxsmm_dgemm_omp(transa, transb, m, n, k, & + & alpha, a, lda, b, ldb, beta, c, ldc) & + & BIND(C, NAME="libxsmm_dgemm_omp_") + IMPORT :: C_DOUBLE, C_CHAR, LIBXSMM_BLASINT_KIND + CHARACTER(C_CHAR), INTENT(IN) :: transa, transb + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + REAL(C_DOUBLE), INTENT(IN) :: alpha, beta + REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*) + REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*) + END SUBROUTINE + + !> General dense MM; MT via libxsmmext (single-precision). + !> Implicit FORTRAN 77 interface: similar to SGEMM. 
+ PURE SUBROUTINE libxsmm_sgemm_omp(transa, transb, m, n, k, & + & alpha, a, lda, b, ldb, beta, c, ldc) & + & BIND(C, NAME="libxsmm_sgemm_omp_") + IMPORT :: C_FLOAT, C_CHAR, LIBXSMM_BLASINT_KIND + CHARACTER(C_CHAR), INTENT(IN) :: transa, transb + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + REAL(C_FLOAT), INTENT(IN) :: alpha, beta + REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*) + REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*) + END SUBROUTINE + + !> Process a series of MMs (batch). See also libxsmm_gemm_batch_omp. + !> The kind of matrix operands (a, b, c) depend on index_stride: + !> index_stride==0: pointers to pointers of elements, e.g., + !> double** for the C matrices. + !> index_stride!=0: pointer to elements, e.g., + !> const double* for the A and B matrices. + !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: iprec, oprec + !> REAL(4|8) :: alpha, beta + !> ARRAY :: a, b, c + !> ARRAY/VALUE :: stride_a, stride_b, stride_c + !> INTEGER(4|8) :: index_base, index_stride, batchsize + !> INTEGER(4) :: tid, nthreads + !> Otherwise arguments are similar to GEMM. + PURE SUBROUTINE libxsmm_mmbatch(iprec, oprec, transa, transb, & + & m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, index_base, & + & index_stride, stride_a, stride_b, stride_c, batchsize, & + & tid, nthreads) & + & BIND(C, NAME="libxsmm_mmbatch_") + IMPORT :: C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND + !> Determines index-base (usually 0, 1 for one-based indexes). + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base + !> Stride (measured in Bytes) used to walk stride_*. + !> In Fortran: index_stride!=0. + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride + !> Number of SMMs. If the size is given as a negative value, + !> then internal synchronization is omitted. 
+ INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + CHARACTER(C_CHAR), INTENT(IN) :: transa, transb + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + !> Arrays of indexes determining the position of + !> a, b, and c operands. + TYPE(C_PTR), INTENT(IN), VALUE :: stride_a + TYPE(C_PTR), INTENT(IN), VALUE :: stride_b + TYPE(C_PTR), INTENT(IN), VALUE :: stride_c + INTEGER(C_INT), INTENT(IN) :: iprec, oprec + !> Thread-ID (TID), and number of threads. + INTEGER(C_INT), INTENT(IN) :: tid, nthreads + END SUBROUTINE + + !> Process a series of SMMs (batch). See also libxsmm_mmbatch. + !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: iprec, oprec + !> REAL(4|8) :: alpha, beta + !> ARRAY :: a, b, c + !> ARRAY/VALUE :: stride_a, stride_b, stride_c + !> INTEGER(4|8) :: index_base, index_stride, batchsize + !> Otherwise arguments are similar to GEMM. 
+ PURE SUBROUTINE libxsmm_gemm_batch(iprec, oprec, & + & transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, & + & index_base, index_stride, stride_a, stride_b, stride_c, & + & batchsize) & + & BIND(C, NAME="libxsmm_gemm_batch_") + IMPORT :: C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + CHARACTER(C_CHAR), INTENT(IN) :: transa, transb + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + TYPE(C_PTR), INTENT(IN), VALUE :: stride_a + TYPE(C_PTR), INTENT(IN), VALUE :: stride_b + TYPE(C_PTR), INTENT(IN), VALUE :: stride_c + INTEGER(C_INT), INTENT(IN) :: iprec, oprec + END SUBROUTINE + + !> Process a series of SMMs (batch) with OpenMP (libxsmmext). + !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: iprec, oprec + !> REAL(4|8) :: alpha, beta + !> ARRAY :: a, b, c + !> ARRAY/VALUE :: stride_a, stride_b, stride_c + !> INTEGER(4|8) :: index_base, index_stride, batchsize + !> Otherwise arguments are similar to GEMM. 
+ PURE SUBROUTINE libxsmm_gemm_batch_omp(iprec, oprec, & + & transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, & + & index_base, index_stride, stride_a, stride_b, stride_c, & + & batchsize) & + & BIND(C, NAME="libxsmm_gemm_batch_omp_") + IMPORT :: C_PTR, C_CHAR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_base + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: index_stride + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: batchsize + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + CHARACTER(C_CHAR), INTENT(IN) :: transa, transb + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + TYPE(C_PTR), INTENT(IN), VALUE :: a, b, c + TYPE(C_PTR), INTENT(IN), VALUE :: stride_a + TYPE(C_PTR), INTENT(IN), VALUE :: stride_b + TYPE(C_PTR), INTENT(IN), VALUE :: stride_c + INTEGER(C_INT), INTENT(IN) :: iprec, oprec + END SUBROUTINE + + !> This function is a no-op unless LIBXSMM is built to intercept GEMM. + !> Pointer arguments are used to filter intercepted GEMM calls such that + !> non-NULL values match. Otherwise (NULL) the respective argument is + !> considered a "free value", i.e., every value can match; + !> libxsmmext required. + !> Implicit FORTRAN 77 interface: + !> INTEGER(4) :: gemm_precision, flags + !> INTEGER(4|8) :: m, n, k, lda, ldb, ldc + !> REAL(4|8) :: alpha, beta + SUBROUTINE libxsmm_mmbatch_begin(gemm_precision, flags, & + & m, n, k, lda, ldb, ldc, alpha, beta) BIND(C) + IMPORT :: C_PTR, C_INT, LIBXSMM_BLASINT_KIND + INTEGER(C_INT), INTENT(IN), VALUE :: gemm_precision + INTEGER(C_INT), INTENT(IN) :: flags + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + TYPE(C_PTR), INTENT(IN), VALUE :: alpha, beta + END SUBROUTINE + + !> Processes the batch of previously recorded SMMs + !> (libxsmm_mmbatch_begin); libxsmmext required. + !> Implicit FORTRAN 77 interface: available. 
+ SUBROUTINE libxsmm_mmbatch_end() BIND(C) + END SUBROUTINE + + !> Reduces input into output such that the difference is maintained + !> or increased (max function). The very first (initial) output + !> should be zeroed (libxsmm_matdiff_clear). + !> Implicit FORTRAN 77 interface: available. + PURE SUBROUTINE libxsmm_matdiff_reduce(output, input) BIND(C) + IMPORT :: LIBXSMM_MATDIFF_INFO + TYPE(LIBXSMM_MATDIFF_INFO), INTENT(INOUT) :: output + TYPE(LIBXSMM_MATDIFF_INFO), INTENT(IN) :: input + END SUBROUTINE + + !> Clears the given info-structure, e.g., for the initial + !> reduction-value (libxsmm_matdiff_reduce). + !> Implicit FORTRAN 77 interface: available. + PURE SUBROUTINE libxsmm_matdiff_clear(info) BIND(C) + IMPORT :: LIBXSMM_MATDIFF_INFO + TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info + END SUBROUTINE + + !> Calculates a hash value for the given array and seed. + !> Routine suitable for FORTRAN 77; keysize in Bytes. + PURE SUBROUTINE libxsmm_xhash(hash_seed, key, keysize) & + & BIND(C, NAME="libxsmm_xhash_") + IMPORT :: C_INT, C_PTR + INTEGER(C_INT), INTENT(INOUT) :: hash_seed + INTEGER(C_INT), INTENT(IN) :: keysize + TYPE(C_PTR), INTENT(IN), VALUE :: key + END SUBROUTINE + + !> Calculates if there is a difference between two arrays. + !> Routine suitable for FORTRAN 77; size in Bytes. + PURE SUBROUTINE libxsmm_xdiff(diff, a, b, nbytes) & + & BIND(C, NAME="libxsmm_xdiff_") + IMPORT :: C_PTR, C_LONG_LONG, C_BOOL + TYPE(C_PTR), INTENT(IN), VALUE :: a, b + INTEGER(C_LONG_LONG), INTENT(IN) :: nbytes + LOGICAL(C_BOOL), INTENT(OUT) :: diff + END SUBROUTINE + END INTERFACE + + INTERFACE libxsmm_ptr0 + MODULE PROCEDURE libxsmm_ptr_z0, libxsmm_ptr_c0 + MODULE PROCEDURE libxsmm_ptr_d0, libxsmm_ptr_s0 + MODULE PROCEDURE libxsmm_ptr_i0, libxsmm_ptr_w0 + MODULE PROCEDURE libxsmm_ptr_j0 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b0 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l0 !! 
long long + END INTERFACE + + INTERFACE libxsmm_ptr1 + MODULE PROCEDURE libxsmm_ptr_z1, libxsmm_ptr_c1 + MODULE PROCEDURE libxsmm_ptr_d1, libxsmm_ptr_s1 + MODULE PROCEDURE libxsmm_ptr_i1, libxsmm_ptr_w1 + MODULE PROCEDURE libxsmm_ptr_j1 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b1 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l1 !! long long + MODULE PROCEDURE libxsmm_ptr_dmm + MODULE PROCEDURE libxsmm_ptr_smm + MODULE PROCEDURE libxsmm_ptr_wimm + END INTERFACE + + INTERFACE libxsmm_ptr2 + MODULE PROCEDURE libxsmm_ptr_z2, libxsmm_ptr_c2 + MODULE PROCEDURE libxsmm_ptr_d2, libxsmm_ptr_s2 + MODULE PROCEDURE libxsmm_ptr_i2, libxsmm_ptr_w2 + MODULE PROCEDURE libxsmm_ptr_j2 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b2 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l2 !! long long + END INTERFACE + + INTERFACE libxsmm_ptr + MODULE PROCEDURE libxsmm_ptr_z0, libxsmm_ptr_c0 + MODULE PROCEDURE libxsmm_ptr_d0, libxsmm_ptr_s0 + MODULE PROCEDURE libxsmm_ptr_i0, libxsmm_ptr_w0 + MODULE PROCEDURE libxsmm_ptr_j0 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b0 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l0 !! long long + MODULE PROCEDURE libxsmm_ptr_z1, libxsmm_ptr_c1 + MODULE PROCEDURE libxsmm_ptr_d1, libxsmm_ptr_s1 + MODULE PROCEDURE libxsmm_ptr_i1, libxsmm_ptr_w1 + MODULE PROCEDURE libxsmm_ptr_j1 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b1 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l1 !! long long + MODULE PROCEDURE libxsmm_ptr_z2, libxsmm_ptr_c2 + MODULE PROCEDURE libxsmm_ptr_d2, libxsmm_ptr_s2 + MODULE PROCEDURE libxsmm_ptr_i2, libxsmm_ptr_w2 + MODULE PROCEDURE libxsmm_ptr_j2 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_b2 !! Byte/char + MODULE PROCEDURE libxsmm_ptr_l2 !! long long + MODULE PROCEDURE libxsmm_ptr_dmm + MODULE PROCEDURE libxsmm_ptr_smm + MODULE PROCEDURE libxsmm_ptr_wimm + END INTERFACE + + !> Deallocates JIT'ted code, or unregisters/releases code from registry. 
+ INTERFACE libxsmm_release_mmkernel + MODULE PROCEDURE libxsmm_release_dmmkernel + MODULE PROCEDURE libxsmm_release_smmkernel + MODULE PROCEDURE libxsmm_release_wimmkernel + END INTERFACE + + !> Construct JIT-code depending on given argument set. + INTERFACE libxsmm_mmdispatch + MODULE PROCEDURE libxsmm_dmmdispatch, libxsmm_smmdispatch + MODULE PROCEDURE libxsmm_wimmdispatch + END INTERFACE + + !> Construct JIT-code depending on given argument set. + INTERFACE libxsmm_dispatch + MODULE PROCEDURE libxsmm_dmmdispatch, libxsmm_smmdispatch + MODULE PROCEDURE libxsmm_wimmdispatch + END INTERFACE + + !> Check if a function is available (LIBXSMM_?MMFUNCTION). + INTERFACE libxsmm_mmavailable + MODULE PROCEDURE libxsmm_dmmavailable, libxsmm_smmavailable + MODULE PROCEDURE libxsmm_wimmavailable + END INTERFACE + + !> Check if a function is available (LIBXSMM_?MMFUNCTION). + INTERFACE libxsmm_available + MODULE PROCEDURE libxsmm_smmavailable, libxsmm_dmmavailable + MODULE PROCEDURE libxsmm_wimmavailable + END INTERFACE + + !> Overloaded GEMM routines (double-precision). + INTERFACE libxsmm_dgemm + MODULE PROCEDURE libxsmm_dgemm0 + MODULE PROCEDURE libxsmm_dgemm1 + MODULE PROCEDURE libxsmm_dgemm2 + MODULE PROCEDURE libxsmm_dgemm3 + END INTERFACE + + !> Overloaded GEMM routines (single-precision). + INTERFACE libxsmm_sgemm + MODULE PROCEDURE libxsmm_sgemm0 + MODULE PROCEDURE libxsmm_sgemm1 + MODULE PROCEDURE libxsmm_sgemm2 + END INTERFACE + + !> Overloaded GEMM routines (low-precision). + INTERFACE libxsmm_wigemm + MODULE PROCEDURE libxsmm_wigemm0 + MODULE PROCEDURE libxsmm_wigemm1 + MODULE PROCEDURE libxsmm_wigemm2 + END INTERFACE + + !> Overloaded GEMM routines. 
+ INTERFACE libxsmm_gemm + MODULE PROCEDURE libxsmm_dgemm0 + MODULE PROCEDURE libxsmm_dgemm1 + MODULE PROCEDURE libxsmm_dgemm2 + MODULE PROCEDURE libxsmm_dgemm3 + MODULE PROCEDURE libxsmm_sgemm0 + MODULE PROCEDURE libxsmm_sgemm1 + MODULE PROCEDURE libxsmm_sgemm2 + MODULE PROCEDURE libxsmm_sgemm3 + MODULE PROCEDURE libxsmm_wigemm0 + MODULE PROCEDURE libxsmm_wigemm1 + MODULE PROCEDURE libxsmm_wigemm2 + MODULE PROCEDURE libxsmm_wigemm3 + END INTERFACE + + !> Overloaded BLAS GEMM routines (double-precision). + INTERFACE libxsmm_blas_dgemm + MODULE PROCEDURE libxsmm_blas_dgemm0 + MODULE PROCEDURE libxsmm_blas_dgemm1 + MODULE PROCEDURE libxsmm_blas_dgemm2 + MODULE PROCEDURE libxsmm_blas_dgemm3 + END INTERFACE + + !> Overloaded BLAS GEMM routines (single-precision). + INTERFACE libxsmm_blas_sgemm + MODULE PROCEDURE libxsmm_blas_sgemm0 + MODULE PROCEDURE libxsmm_blas_sgemm1 + MODULE PROCEDURE libxsmm_blas_sgemm2 + MODULE PROCEDURE libxsmm_blas_sgemm3 + END INTERFACE + + !> Overloaded BLAS GEMM routines (single/double-precision). + INTERFACE libxsmm_blas_gemm + MODULE PROCEDURE libxsmm_blas_dgemm0 + MODULE PROCEDURE libxsmm_blas_dgemm1 + MODULE PROCEDURE libxsmm_blas_dgemm2 + MODULE PROCEDURE libxsmm_blas_dgemm3 + MODULE PROCEDURE libxsmm_blas_sgemm0 + MODULE PROCEDURE libxsmm_blas_sgemm1 + MODULE PROCEDURE libxsmm_blas_sgemm2 + MODULE PROCEDURE libxsmm_blas_sgemm3 + END INTERFACE + + !> Overloaded MATCOPY routines (2d-copy). + INTERFACE libxsmm_matcopy + MODULE PROCEDURE libxsmm_matcopy_p0 + MODULE PROCEDURE libxsmm_matcopy_d1 + MODULE PROCEDURE libxsmm_matcopy_d2 + MODULE PROCEDURE libxsmm_matcopy_s1 + MODULE PROCEDURE libxsmm_matcopy_s2 + END INTERFACE + + !> Overloaded TRANSPOSE routines (in-place form). 
+ INTERFACE libxsmm_itrans + MODULE PROCEDURE libxsmm_itrans_p0 + MODULE PROCEDURE libxsmm_itrans_d1 + MODULE PROCEDURE libxsmm_itrans_d2 + MODULE PROCEDURE libxsmm_itrans_s1 + MODULE PROCEDURE libxsmm_itrans_s2 + END INTERFACE + + !> Overloaded TRANSPOSE routines (out-of-place form). + INTERFACE libxsmm_otrans + MODULE PROCEDURE libxsmm_otrans_p0 + MODULE PROCEDURE libxsmm_otrans_d1 + MODULE PROCEDURE libxsmm_otrans_d2 + MODULE PROCEDURE libxsmm_otrans_s1 + MODULE PROCEDURE libxsmm_otrans_s2 + END INTERFACE + + !> Calculate a hash value for a given key value (binary blob). + !> Conceptually pure, but C_LOC may be (incorrectly) impure. + INTERFACE libxsmm_hash + MODULE PROCEDURE libxsmm_hash_char + MODULE PROCEDURE libxsmm_hash_i8 + MODULE PROCEDURE libxsmm_hash_i32 + MODULE PROCEDURE libxsmm_hash_i64 + END INTERFACE + + !> Calculate whether there is a difference between two series of items. + !> Conceptually pure, but C_LOC may be (incorrectly) impure. + INTERFACE libxsmm_diff + MODULE PROCEDURE libxsmm_diff_char + MODULE PROCEDURE libxsmm_diff_i8 + MODULE PROCEDURE libxsmm_diff_i32 + MODULE PROCEDURE libxsmm_diff_i64 + END INTERFACE + + CONTAINS + !> Returns the name of the target architecture as determined by + !> the CPUID flags, as set by the libxsmm_get_target_arch* functions, + !> or as set by the LIBXSMM_TARGET environment variable. + FUNCTION libxsmm_get_target_arch() + !CHARACTER(LEN=:), POINTER :: libxsmm_get_target_arch + CHARACTER, POINTER :: libxsmm_get_target_arch(:) + INTEGER(C_INT) :: length(1) + TYPE(C_PTR) :: arch + INTERFACE + FUNCTION libxsmmf_get_target_arch(length) BIND(C) + IMPORT :: C_INT, C_PTR + INTEGER(C_INT), INTENT(OUT) :: length + TYPE(C_PTR) :: libxsmmf_get_target_arch + END FUNCTION + END INTERFACE + arch = libxsmmf_get_target_arch(length(1)) + CALL C_F_POINTER(arch, libxsmm_get_target_arch, length) + END FUNCTION + + !> Returns C_NULL_PTR. 
+ PURE FUNCTION libxsmm_ptr_null() + TYPE(C_PTR) :: libxsmm_ptr_null + libxsmm_ptr_null = C_NULL_PTR + END FUNCTION + + !> Determines the C-address of the given array. + FUNCTION libxsmm_ptr_z0(a) + COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN), TARGET :: a + TYPE(C_PTR) :: libxsmm_ptr_z0 + libxsmm_ptr_z0 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_z1(a) + COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN), TARGET :: a(*) + TYPE(C_PTR) :: libxsmm_ptr_z1 + libxsmm_ptr_z1 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_z2(a) + COMPLEX(C_DOUBLE_COMPLEX), INTENT(IN) :: a(:,:) + TYPE(C_PTR) :: libxsmm_ptr_z2 + libxsmm_ptr_z2 = libxsmm_ptr_z1(a) + END FUNCTION + + !> Determines the C-address of the given array. + FUNCTION libxsmm_ptr_c0(a) + COMPLEX(C_FLOAT_COMPLEX), INTENT(IN), TARGET :: a + TYPE(C_PTR) :: libxsmm_ptr_c0 + libxsmm_ptr_c0 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_c1(a) + COMPLEX(C_FLOAT_COMPLEX), INTENT(IN), TARGET :: a(*) + TYPE(C_PTR) :: libxsmm_ptr_c1 + libxsmm_ptr_c1 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_c2(a) + COMPLEX(C_FLOAT_COMPLEX), INTENT(IN) :: a(:,:) + TYPE(C_PTR) :: libxsmm_ptr_c2 + libxsmm_ptr_c2 = libxsmm_ptr_c1(a) + END FUNCTION + + !> Determines the C-address of the given array. + FUNCTION libxsmm_ptr_d0(a) + REAL(C_DOUBLE), INTENT(IN), TARGET :: a + TYPE(C_PTR) :: libxsmm_ptr_d0 + libxsmm_ptr_d0 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_d1(a) + REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*) + TYPE(C_PTR) :: libxsmm_ptr_d1 + libxsmm_ptr_d1 = C_LOC(a) + END FUNCTION + FUNCTION libxsmm_ptr_d2(a) + REAL(C_DOUBLE), INTENT(IN) :: a(:,:) + TYPE(C_PTR) :: libxsmm_ptr_d2 + libxsmm_ptr_d2 = libxsmm_ptr_d1(a) + END FUNCTION + + !> Determines the C-address of the given array. 
+        ! Single-precision variants: s0 takes a scalar, s1 an assumed-size
+        ! array, and s2 a rank-2 assumed-shape array.
+        FUNCTION libxsmm_ptr_s0(a)
+          REAL(C_FLOAT), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_s0
+          libxsmm_ptr_s0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_s1(a)
+          REAL(C_FLOAT), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_s1
+          libxsmm_ptr_s1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_s2(a)
+          REAL(C_FLOAT), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_s2
+          ! Delegates to the rank-1 variant, which applies C_LOC.
+          ! NOTE(review): a is not TARGET here; presumably callers pass
+          ! contiguous storage so that no temporary is created - confirm.
+          libxsmm_ptr_s2 = libxsmm_ptr_s1(a)
+        END FUNCTION
+
+        !> Determines the C-address of the given array.
+        !> INTEGER(C_INT) variants (scalar, assumed-size, rank-2).
+        FUNCTION libxsmm_ptr_i0(a)
+          INTEGER(C_INT), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_i0
+          libxsmm_ptr_i0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_i1(a)
+          INTEGER(C_INT), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_i1
+          libxsmm_ptr_i1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_i2(a)
+          INTEGER(C_INT), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_i2
+          ! Delegates to the rank-1 variant (same caveat as for s2).
+          libxsmm_ptr_i2 = libxsmm_ptr_i1(a)
+        END FUNCTION
+
+        !> Determines the C-address of the given array.
+        !> INTEGER(C_SHORT) variants (scalar, assumed-size, rank-2).
+        FUNCTION libxsmm_ptr_w0(a)
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_w0
+          libxsmm_ptr_w0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_w1(a)
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_w1
+          libxsmm_ptr_w1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_w2(a)
+          INTEGER(C_SHORT), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_w2
+          ! Delegates to the rank-1 variant (same caveat as for s2).
+          libxsmm_ptr_w2 = libxsmm_ptr_w1(a)
+        END FUNCTION
+
+        !> Determines the C-address of the given array.
+        ! INTEGER(C_INT8_T) variants: j0 takes a scalar, j1 an assumed-size
+        ! array, and j2 a rank-2 assumed-shape array.
+        FUNCTION libxsmm_ptr_j0(a)
+          INTEGER(C_INT8_T), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_j0
+          libxsmm_ptr_j0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_j1(a)
+          INTEGER(C_INT8_T), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_j1
+          libxsmm_ptr_j1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_j2(a)
+          INTEGER(C_INT8_T), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_j2
+          ! Delegates to the rank-1 variant, which applies C_LOC.
+          ! NOTE(review): a is not TARGET here; presumably callers pass
+          ! contiguous storage so that no temporary is created - confirm.
+          libxsmm_ptr_j2 = libxsmm_ptr_j1(a)
+        END FUNCTION
+
+        !> Determines the C-address of the given array.
+        !> CHARACTER(C_CHAR) variants (scalar, assumed-size, rank-2).
+        FUNCTION libxsmm_ptr_b0(a)
+          CHARACTER(C_CHAR), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_b0
+          libxsmm_ptr_b0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_b1(a)
+          CHARACTER(C_CHAR), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_b1
+          libxsmm_ptr_b1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_b2(a)
+          CHARACTER(C_CHAR), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_b2
+          ! Delegates to the rank-1 variant (same caveat as for j2).
+          libxsmm_ptr_b2 = libxsmm_ptr_b1(a)
+        END FUNCTION
+
+        !> Determines the C-address of the given array.
+        ! INTEGER(C_LONG_LONG) variants: l0 takes a scalar, l1 an
+        ! assumed-size array, and l2 a rank-2 assumed-shape array.
+        FUNCTION libxsmm_ptr_l0(a)
+          INTEGER(C_LONG_LONG), INTENT(IN), TARGET :: a
+          TYPE(C_PTR) :: libxsmm_ptr_l0
+          libxsmm_ptr_l0 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_l1(a)
+          INTEGER(C_LONG_LONG), INTENT(IN), TARGET :: a(*)
+          TYPE(C_PTR) :: libxsmm_ptr_l1
+          libxsmm_ptr_l1 = C_LOC(a)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_l2(a)
+          INTEGER(C_LONG_LONG), INTENT(IN) :: a(:,:)
+          TYPE(C_PTR) :: libxsmm_ptr_l2
+          ! Delegates to the rank-1 variant, which applies C_LOC.
+          ! NOTE(review): a is not TARGET here; presumably callers pass
+          ! contiguous storage so that no temporary is created - confirm.
+          libxsmm_ptr_l2 = libxsmm_ptr_l1(a)
+        END FUNCTION
+
+        ! The following three functions return the C address of the
+        ! handle component of the first element (index LBOUND) of an
+        ! array of kernel objects (DP, SP, and low-precision kernels).
+        FUNCTION libxsmm_ptr_dmm(a)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN), TARGET :: a(:)
+          TYPE(LIBXSMM_DMMFUNCTION), POINTER :: p
+          TYPE(C_PTR) :: libxsmm_ptr_dmm
+          p => a(LBOUND(a,1)); libxsmm_ptr_dmm = C_LOC(p%handle)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_smm(a)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN), TARGET :: a(:)
+          TYPE(LIBXSMM_SMMFUNCTION), POINTER :: p
+          TYPE(C_PTR) :: libxsmm_ptr_smm
+          p => a(LBOUND(a,1)); libxsmm_ptr_smm = C_LOC(p%handle)
+        END FUNCTION
+        FUNCTION libxsmm_ptr_wimm(a)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN), TARGET :: a(:)
+          TYPE(LIBXSMM_WIMMFUNCTION), POINTER :: p
+          TYPE(C_PTR) :: libxsmm_ptr_wimm
+          p => a(LBOUND(a,1)); libxsmm_ptr_wimm = C_LOC(p%handle)
+        END FUNCTION
+
+        !> Deallocate JIT'ted code created by libxsmm_create routines. To
+        !> unregister code generated with libxsmm_dispatch is unnecessary.
+        SUBROUTINE libxsmm_release_dmmkernel(kernel)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
+          ! Passes the kernel's handle on to libxsmm_release_kernel.
+          CALL libxsmm_release_kernel(kernel%handle)
+        END SUBROUTINE
+
+        !> Deallocate JIT'ted code created by libxsmm_create routines. To
+        !> unregister code generated with libxsmm_dispatch is unnecessary.
+        SUBROUTINE libxsmm_release_smmkernel(kernel)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
+          ! Passes the kernel's handle on to libxsmm_release_kernel.
+          CALL libxsmm_release_kernel(kernel%handle)
+        END SUBROUTINE
+
+        !> Deallocate JIT'ted code created by libxsmm_create routines. To
+        !> unregister code generated with libxsmm_dispatch is unnecessary.
+        SUBROUTINE libxsmm_release_wimmkernel(kernel)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
+          ! Passes the kernel's handle on to libxsmm_release_kernel.
+          CALL libxsmm_release_kernel(kernel%handle)
+        END SUBROUTINE
+
+        !> Query or JIT-generate an SMM-kernel (double-precision).
+        !> The result is stored in kernel%handle; m, n, k are mandatory,
+        !> all other arguments are optional.
+        SUBROUTINE libxsmm_dmmdispatch(kernel,                          &
+     &  m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(OUT) :: kernel
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                    &
+     &                        OPTIONAL, TARGET :: lda, ldb, ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
+          ! Optional arguments are handed over as C addresses.
+          ! NOTE(review): C_LOC is applied to possibly absent dummies;
+          ! presumably this yields a null address which the library
+          ! treats as "use the default" - confirm per compiler.
+          CALL libxsmm_xmmdispatch(                                     &
+     &    kernel%handle, LIBXSMM_GEMM_PRECISION_F64,                    &
+     &    m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
+     &    C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
+        END SUBROUTINE
+
+        !> Query or JIT-generate an SMM-kernel (single-precision).
+        !> The result is stored in kernel%handle; m, n, k are mandatory,
+        !> all other arguments are optional.
+        SUBROUTINE libxsmm_smmdispatch(kernel,                          &
+     &  m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(OUT) :: kernel
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                    &
+     &                        OPTIONAL, TARGET :: lda, ldb, ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
+          ! Same forwarding scheme as the double-precision variant, with
+          ! LIBXSMM_GEMM_PRECISION_F32 as the precision selector.
+          CALL libxsmm_xmmdispatch(                                     &
+     &    kernel%handle, LIBXSMM_GEMM_PRECISION_F32,                    &
+     &    m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
+     &    C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
+        END SUBROUTINE
+
+        !> Query or JIT-generate an SMM-kernel (low-precision, int-accumulate).
+        SUBROUTINE libxsmm_wimmdispatch(kernel,                         &
+     &  m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(OUT) :: kernel
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN),                    &
+     &                        OPTIONAL, TARGET :: lda, ldb, ldc
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: alpha, beta
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: flags
+          INTEGER(C_INT), INTENT(IN), OPTIONAL, TARGET :: prefetch
+          ! Two precision selectors are passed: I16 followed by I32.
+          ! NOTE(review): C_LOC is applied to possibly absent dummies;
+          ! presumably this yields a null address which the library
+          ! treats as "use the default" - confirm per compiler.
+          CALL libxsmm_xmmdispatch2(kernel%handle,                      &
+     &    LIBXSMM_GEMM_PRECISION_I16, LIBXSMM_GEMM_PRECISION_I32,       &
+     &    m, n, k, C_LOC(lda), C_LOC(ldb), C_LOC(ldc),                  &
+     &    C_LOC(alpha), C_LOC(beta), C_LOC(flags), C_LOC(prefetch))
+        END SUBROUTINE
+
+        !> Checks if the given kernel was generated. JIT code is guaranteed
+        !> to be generated if JIT support was enabled at build-time of the
+        !> library (default). This overload belongs to libxsmm_(mm)available.
+        !> The result is .TRUE. if kernel%handle is associated.
+        LOGICAL FUNCTION libxsmm_dmmavailable(kernel)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
+          libxsmm_dmmavailable = C_ASSOCIATED(kernel%handle)
+        END FUNCTION
+
+        !> Checks if the given kernel was generated. JIT code is guaranteed
+        !> to be generated if JIT support was enabled at build-time of the
+        !> library (default). This overload belongs to libxsmm_(mm)available.
+        !> The result is .TRUE. if kernel%handle is associated.
+        LOGICAL FUNCTION libxsmm_smmavailable(kernel)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
+          libxsmm_smmavailable = C_ASSOCIATED(kernel%handle)
+        END FUNCTION
+
+        !> Checks if the given kernel was generated. JIT code is guaranteed
+        !> to be generated if JIT support was enabled at build-time of the
+        !> library (default). This overload belongs to libxsmm_(mm)available.
+        !> The result is .TRUE. if kernel%handle is associated.
+        LOGICAL FUNCTION libxsmm_wimmavailable(kernel)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
+          libxsmm_wimmavailable = C_ASSOCIATED(kernel%handle)
+        END FUNCTION
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        SUBROUTINE libxsmm_dmmcall_abc(kernel, a, b, c)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_abc(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c))
+        END SUBROUTINE
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        !> In addition to a, b, c, the arguments pa, pb, pc are passed
+        !> to the kernel (all three are mandatory in this variant).
+        SUBROUTINE libxsmm_dmmcall_prf(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: pa(*)
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: pb(*)
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: pc(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_prf(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c),                                 &
+     &    C_LOC(pa), C_LOC(pb), C_LOC(pc))
+        END SUBROUTINE
+
+        !> See also libxsmm_dmmcall_abc and libxsmm_dmmcall_prf.
+        !> Uses the six-argument call if any of pa, pb, pc is present,
+        !> and the three-argument call otherwise.
+        SUBROUTINE libxsmm_dmmcall(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_DMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_DOUBLE), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_DOUBLE), INTENT(INOUT), TARGET :: c(*)
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pa(*)
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pb(*)
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: pc(*)
+          ! use .OR. instead of .AND. to avoid full check
+          ! NOTE(review): if only some of pa/pb/pc are present, C_LOC
+          ! is applied to the absent ones - confirm this is intended.
+          IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
+            CALL libxsmm_xmmcall_prf(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c),                               &
+     &      C_LOC(pa), C_LOC(pb), C_LOC(pc))
+          ELSE
+            CALL libxsmm_xmmcall_abc(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c))
+          END IF
+        END SUBROUTINE
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        SUBROUTINE libxsmm_smmcall_abc(kernel, a, b, c)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_abc(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c))
+        END SUBROUTINE
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        !> In addition to a, b, c, the arguments pa, pb, pc are passed
+        !> to the kernel (all three are mandatory in this variant).
+        SUBROUTINE libxsmm_smmcall_prf(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
+          REAL(C_FLOAT), INTENT(IN), TARGET :: pa(*)
+          REAL(C_FLOAT), INTENT(IN), TARGET :: pb(*)
+          REAL(C_FLOAT), INTENT(IN), TARGET :: pc(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_prf(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c),                                 &
+     &    C_LOC(pa), C_LOC(pb), C_LOC(pc))
+        END SUBROUTINE
+
+        !> See also libxsmm_smmcall_abc and libxsmm_smmcall_prf.
+        ! Uses the six-argument call if any of pa, pb, pc is present,
+        ! and the three-argument call otherwise.
+        SUBROUTINE libxsmm_smmcall(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_SMMFUNCTION), INTENT(IN) :: kernel
+          REAL(C_FLOAT), INTENT(IN), TARGET :: a(*), b(*)
+          REAL(C_FLOAT), INTENT(INOUT), TARGET :: c(*)
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pa(*)
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pb(*)
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: pc(*)
+          ! use .OR. instead of .AND. to avoid full check
+          ! NOTE(review): if only some of pa/pb/pc are present, C_LOC
+          ! is applied to the absent ones - confirm this is intended.
+          IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
+            CALL libxsmm_xmmcall_prf(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c),                               &
+     &      C_LOC(pa), C_LOC(pb), C_LOC(pc))
+          ELSE
+            CALL libxsmm_xmmcall_abc(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c))
+          END IF
+        END SUBROUTINE
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION3). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        !> Inputs a, b are INTEGER(C_SHORT); c is INTEGER(C_INT).
+        SUBROUTINE libxsmm_wimmcall_abc(kernel, a, b, c)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
+          INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION3), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_abc(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c))
+        END SUBROUTINE
+
+        !> Calls the kernel with the given arguments. Alternatively,
+        !> PROCPOINTER can be used as shown by the inner comments
+        !> of this routine (LIBXSMM_FUNCTION6). The libxsmm_xmmcall
+        !> routines can be used in FORTRAN77.
+        !> In addition to a, b, c, the arguments pa, pb, pc are passed
+        !> to the kernel (all three are mandatory in this variant).
+        SUBROUTINE libxsmm_wimmcall_prf(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
+          INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: pa(*)
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: pb(*)
+          ! NOTE(review): pc is C_SHORT although c is C_INT - confirm.
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: pc(*)
+          ! PROCEDURE(LIBXSMM_FUNCTION6), POINTER :: xmm
+          ! CALL C_F_PROCPOINTER(kernel%handle, xmm)
+          ! CALL xmm(...)
+          CALL libxsmm_xmmcall_prf(kernel%handle,                       &
+     &    C_LOC(a), C_LOC(b), C_LOC(c),                                 &
+     &    C_LOC(pa), C_LOC(pb), C_LOC(pc))
+        END SUBROUTINE
+
+        !> See also libxsmm_wimmcall_abc and libxsmm_wimmcall_prf.
+        !> Uses the six-argument call if any of pa, pb, pc is present,
+        !> and the three-argument call otherwise.
+        SUBROUTINE libxsmm_wimmcall(kernel, a, b, c, pa, pb, pc)
+          TYPE(LIBXSMM_WIMMFUNCTION), INTENT(IN) :: kernel
+          INTEGER(C_SHORT), INTENT(IN), TARGET :: a(*), b(*)
+          INTEGER(C_INT), INTENT(INOUT), TARGET :: c(*)
+          INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pa(*)
+          INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pb(*)
+          INTEGER(C_SHORT), INTENT(IN), OPTIONAL, TARGET :: pc(*)
+          ! use .OR. instead of .AND. to avoid full check
+          ! NOTE(review): if only some of pa/pb/pc are present, C_LOC
+          ! is applied to the absent ones - confirm this is intended.
+          IF (PRESENT(pa).OR.PRESENT(pb).OR.PRESENT(pc)) THEN
+            CALL libxsmm_xmmcall_prf(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c),                               &
+     &      C_LOC(pa), C_LOC(pb), C_LOC(pc))
+          ELSE
+            CALL libxsmm_xmmcall_abc(kernel%handle,                     &
+     &      C_LOC(a), C_LOC(b), C_LOC(c))
+          END IF
+        END SUBROUTINE
+
+        !> Register user-defined key-value; value can be queried (libxsmm_xdispatch).
+        !> Since the key-type is unknown to LIBXSMM, the key must be binary reproducible,
+        !> i.e., if it is a structured type (padded data may be uninitialized), it must
+        !> be initially zero-filled (libxsmm_xclear) followed by an element-wise setup.
+        !> The size of the key is limited (see documentation). The given value is copied
+        !> by LIBXSMM and may be initialized at registration-time or whenever queried.
+        !> Registered data is released at program termination but can also be released
+        !> if needed (libxsmm_xrelease), e.g., for a larger value for the same key.
+        FUNCTION libxsmm_xregister(key, keysize, valsize,               &
+     &  valinit, keyhash)
+          ! key:      C address of the key data (passed by value).
+          ! keysize:  size of the key, valsize: size of the value.
+          ! valinit:  optional C address of the initial value.
+          ! keyhash:  optional output receiving the hash of the key.
+          ! Result:   C pointer produced by the C routine (regval).
+          TYPE(C_PTR), INTENT(IN), VALUE :: key
+          INTEGER(C_INT), INTENT(IN) :: keysize, valsize
+          TYPE(C_PTR), INTENT(IN), OPTIONAL :: valinit
+          INTEGER(C_INT), INTENT(OUT), OPTIONAL :: keyhash
+          TYPE(C_PTR) :: libxsmm_xregister
+          ! Local copy of valinit (or NULL) suitable for pass-by-value.
+          TYPE(C_PTR) :: init
+          INTERFACE
+            SUBROUTINE internal_xregister(regval,                       &
+     &        key, keysize, valsize, valinit, keyhash)                  &
+     &        BIND(C, NAME="libxsmm_xregister_")
+              IMPORT :: C_PTR, C_INT
+              TYPE(C_PTR), INTENT(OUT) :: regval
+              TYPE(C_PTR), INTENT(IN), VALUE :: key, valinit
+              INTEGER(C_INT), INTENT(IN) :: keysize, valsize
+              INTEGER(C_INT), INTENT(OUT) :: keyhash
+            END SUBROUTINE
+          END INTERFACE
+          ! valinit is received by reference but must be handed over by
+          ! VALUE: forwarding an absent argument would require loading
+          ! from a missing actual argument. Pass C_NULL_PTR instead.
+          ! NOTE(review): assumes the C routine accepts a NULL valinit
+          ! as "no initial value" (implied by OPTIONAL) - confirm.
+          init = C_NULL_PTR
+          IF (PRESENT(valinit)) init = valinit
+          ! keyhash is forwarded by reference as elsewhere in this file
+          ! (an absent argument presumably arrives as a null address).
+          CALL internal_xregister(libxsmm_xregister,                    &
+     &      key, keysize, valsize, init, keyhash)
+        END FUNCTION
+
+        !> Query user-defined value from LIBXSMM's code registry.
+        !> Returns the C pointer produced by the C routine for the given
+        !> key; keyhash (optional) receives the hash of the key.
+        FUNCTION libxsmm_xdispatch(key, keysize, keyhash)
+          TYPE(C_PTR), INTENT(IN), VALUE :: key
+          INTEGER(C_INT), INTENT(IN) :: keysize
+          INTEGER(C_INT), INTENT(OUT), OPTIONAL :: keyhash
+          TYPE(C_PTR) :: libxsmm_xdispatch
+          INTERFACE
+            SUBROUTINE internal_xdispatch(regval, key, keysize, keyhash)&
+     &        BIND(C, NAME="libxsmm_xdispatch_")
+              IMPORT :: C_PTR, C_INT
+              TYPE(C_PTR), INTENT(OUT) :: regval
+              TYPE(C_PTR), INTENT(IN), VALUE :: key
+              INTEGER(C_INT), INTENT(IN) :: keysize
+              INTEGER(C_INT), INTENT(OUT) :: keyhash
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): keyhash may be absent here while the interface
+          ! declares it non-optional; presumably a null address is
+          ! passed and tolerated by the C side - confirm.
+          CALL internal_xdispatch(libxsmm_xdispatch,                    &
+     &      key, keysize, keyhash)
+        END FUNCTION
+
+        !> Auto-dispatched general dense MM (double-precision).
+        !> This overload belongs to libxsmm_(d)gemm.
+        ! Scalar-reference form: a, b, c denote the first element of each
+        ! matrix and are passed on unchanged to the C routine bound as
+        ! "libxsmm_dgemm_".
+        PURE SUBROUTINE libxsmm_dgemm0(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a, b
+          REAL(C_DOUBLE), INTENT(INOUT) :: c
+          INTERFACE
+            PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,      &
+     &        alpha, a, lda, b, ldb, beta, c, ldc)                      &
+     &        BIND(C, NAME="libxsmm_dgemm_")
+              IMPORT :: C_CHAR, C_DOUBLE, LIBXSMM_BLASINT_KIND
+              CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
+              REAL(C_DOUBLE), INTENT(IN) :: alpha, beta
+              REAL(C_DOUBLE), INTENT(IN) :: a, b
+              REAL(C_DOUBLE), INTENT(INOUT) :: c
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): OPTIONAL dummies are forwarded to non-optional
+          ! dummies of the C interface; presumably an absent argument
+          ! arrives as a null address meaning "default" - confirm.
+          CALL internal_gemm(transa, transb, m, n, k,                   &
+     &      alpha, a, lda, b, ldb, beta, c, ldc)
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (double-precision).
+        !> This overload belongs to libxsmm_(d)gemm.
+        ! Rank-1 (assumed-size) form: does nothing unless m, n, and k are
+        ! all positive; otherwise forwards the first elements of a, b, c.
+        PURE SUBROUTINE libxsmm_dgemm1(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(*), b(*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(*)
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_dgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1)), lda,                               &
+     &               b(LBOUND(b,1)), ldb,                               &
+     &         beta, c(LBOUND(c,1)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (double-precision).
+        !> This overload belongs to libxsmm_(d)gemm.
+        !> Rank-2 form without leading dimensions: m is passed as the
+        !> leading dimension of a and c, and k as that of b.
+        PURE SUBROUTINE libxsmm_dgemm2(transa, transb, m, n, k,         &
+     &  a, b, c, alpha, beta)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(m,*), b(k,*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(m,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_dgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                     &
+     &               b(LBOUND(b,1),LBOUND(b,2)), k,                     &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), m)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (double-precision).
+        !> This overload belongs to libxsmm_(d)gemm.
+        ! Rank-2 form with mandatory leading dimensions lda, ldb, ldc.
+        PURE SUBROUTINE libxsmm_dgemm3(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_dgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                   &
+     &               b(LBOUND(b,1),LBOUND(b,2)), ldb,                   &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (single-precision).
+        !> This overload belongs to libxsmm_(s)gemm.
+        !> Scalar-reference form: a, b, c denote the first element of
+        !> each matrix and are passed on unchanged to the C routine
+        !> bound as "libxsmm_sgemm_".
+        PURE SUBROUTINE libxsmm_sgemm0(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a, b
+          REAL(C_FLOAT), INTENT(INOUT) :: c
+          INTERFACE
+            PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,      &
+     &        alpha, a, lda, b, ldb, beta, c, ldc)                      &
+     &        BIND(C, NAME="libxsmm_sgemm_")
+              IMPORT :: C_CHAR, C_FLOAT, LIBXSMM_BLASINT_KIND
+              CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
+              REAL(C_FLOAT), INTENT(IN) :: alpha, beta
+              REAL(C_FLOAT), INTENT(IN) :: a, b
+              REAL(C_FLOAT), INTENT(INOUT) :: c
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): OPTIONAL dummies are forwarded to non-optional
+          ! dummies of the C interface; presumably an absent argument
+          ! arrives as a null address meaning "default" - confirm.
+          CALL internal_gemm(transa, transb, m, n, k,                   &
+     &      alpha, a, lda, b, ldb, beta, c, ldc)
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (single-precision).
+        !> This overload belongs to libxsmm_(s)gemm.
+        !> Rank-1 (assumed-size) form: does nothing unless m, n, and k
+        !> are all positive; otherwise forwards the first elements.
+        PURE SUBROUTINE libxsmm_sgemm1(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a(*), b(*)
+          REAL(C_FLOAT), INTENT(INOUT) :: c(*)
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_sgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1)), lda,                               &
+     &               b(LBOUND(b,1)), ldb,                               &
+     &         beta, c(LBOUND(c,1)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (single-precision).
+        !> This overload belongs to libxsmm_(s)gemm.
+        !> Rank-2 form without leading dimensions: m is passed as the
+        !> leading dimension of a and c, and k as that of b.
+        PURE SUBROUTINE libxsmm_sgemm2(transa, transb, m, n, k,         &
+     &  a, b, c, alpha, beta)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a(m,*), b(k,*)
+          REAL(C_FLOAT), INTENT(INOUT) :: c(m,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_sgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                     &
+     &               b(LBOUND(b,1),LBOUND(b,2)), k,                     &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), m)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (single-precision).
+        !> This overload belongs to libxsmm_(s)gemm.
+        ! Rank-2 form with mandatory leading dimensions lda, ldb, ldc.
+        PURE SUBROUTINE libxsmm_sgemm3(transa, transb, m, n, k,         &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*)
+          REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_sgemm0(transa, transb, m, n, k,                &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                   &
+     &               b(LBOUND(b,1),LBOUND(b,2)), ldb,                   &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (low-precision, int-accumulate).
+        !> This overload belongs to libxsmm_(wi)gemm.
+        !> Scalar-reference form: a, b (C_SHORT) and c (C_INT) denote the
+        !> first element of each matrix and are passed on unchanged to
+        !> the C routine bound as "libxsmm_wigemm_".
+        PURE SUBROUTINE libxsmm_wigemm0(transa, transb, m, n, k,        &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
+          INTEGER(C_SHORT), INTENT(IN) :: a, b
+          INTEGER(C_INT), INTENT(INOUT) :: c
+          INTERFACE
+            PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,      &
+     &        alpha, a, lda, b, ldb, beta, c, ldc)                      &
+     &        BIND(C, NAME="libxsmm_wigemm_")
+              IMPORT :: C_CHAR, C_SHORT, C_INT, LIBXSMM_BLASINT_KIND
+              CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
+              INTEGER(C_INT), INTENT(IN) :: alpha, beta
+              INTEGER(C_SHORT), INTENT(IN) :: a, b
+              INTEGER(C_INT), INTENT(INOUT) :: c
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): OPTIONAL dummies are forwarded to non-optional
+          ! dummies of the C interface; presumably an absent argument
+          ! arrives as a null address meaning "default" - confirm.
+          CALL internal_gemm(transa, transb, m, n, k,                   &
+     &      alpha, a, lda, b, ldb, beta, c, ldc)
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (low-precision, int-accumulate).
+        !> This overload belongs to libxsmm_(wi)gemm.
+        !> Rank-1 (assumed-size) form: does nothing unless m, n, and k
+        !> are all positive; otherwise forwards the first elements.
+        PURE SUBROUTINE libxsmm_wigemm1(transa, transb, m, n, k,        &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
+          INTEGER(C_SHORT), INTENT(IN) :: a(*), b(*)
+          INTEGER(C_INT), INTENT(INOUT) :: c(*)
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_wigemm0(transa, transb, m, n, k,               &
+     &        alpha, a(LBOUND(a,1)), lda,                               &
+     &               b(LBOUND(b,1)), ldb,                               &
+     &         beta, c(LBOUND(c,1)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (low-precision, int-accumulate).
+        !> This overload belongs to libxsmm_(wi)gemm.
+        !> Rank-2 form without leading dimensions: m is passed as the
+        !> leading dimension of a and c, and k as that of b.
+        PURE SUBROUTINE libxsmm_wigemm2(transa, transb, m, n, k,        &
+     &  a, b, c, alpha, beta)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
+          INTEGER(C_SHORT), INTENT(IN) :: a(m,*), b(k,*)
+          INTEGER(C_INT), INTENT(INOUT) :: c(m,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_wigemm0(transa, transb, m, n, k,               &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                     &
+     &               b(LBOUND(b,1),LBOUND(b,2)), k,                     &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), m)
+          END IF
+        END SUBROUTINE
+
+        !> Auto-dispatched general dense MM (low-precision, int-accumulate).
+        !> This overload belongs to libxsmm_(wi)gemm.
+        ! Rank-2 form with mandatory leading dimensions lda, ldb, ldc.
+        PURE SUBROUTINE libxsmm_wigemm3(transa, transb, m, n, k,        &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
+          INTEGER(C_INT), INTENT(IN), OPTIONAL :: alpha, beta
+          INTEGER(C_SHORT), INTENT(IN) :: a(lda,*), b(ldb,*)
+          INTEGER(C_INT), INTENT(INOUT) :: c(ldc,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_wigemm0(transa, transb, m, n, k,               &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                   &
+     &               b(LBOUND(b,1),LBOUND(b,2)), ldb,                   &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
+        !> Scalar-reference form: a, b, c denote the first element of
+        !> each matrix and are passed on unchanged to the C routine
+        !> bound as "libxsmm_blas_dgemm_".
+        PURE SUBROUTINE libxsmm_blas_dgemm0(transa, transb, m, n, k,    &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a, b
+          REAL(C_DOUBLE), INTENT(INOUT) :: c
+          INTERFACE
+            PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,      &
+     &        alpha, a, lda, b, ldb, beta, c, ldc)                      &
+     &        BIND(C, NAME="libxsmm_blas_dgemm_")
+              IMPORT :: C_CHAR, C_DOUBLE, LIBXSMM_BLASINT_KIND
+              CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
+              REAL(C_DOUBLE), INTENT(IN) :: alpha, beta
+              REAL(C_DOUBLE), INTENT(IN) :: a, b
+              REAL(C_DOUBLE), INTENT(INOUT) :: c
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): OPTIONAL dummies are forwarded to non-optional
+          ! dummies of the C interface; presumably an absent argument
+          ! arrives as a null address meaning "default" - confirm.
+          CALL internal_gemm(transa, transb, m, n, k,                   &
+     &      alpha, a, lda, b, ldb, beta, c, ldc)
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
+        !> Rank-1 (assumed-size) form: does nothing unless m, n, and k
+        !> are all positive; otherwise forwards the first elements.
+        PURE SUBROUTINE libxsmm_blas_dgemm1(transa, transb, m, n, k,    &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(*), b(*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(*)
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_blas_dgemm0(transa, transb, m, n, k,           &
+     &        alpha, a(LBOUND(a,1)), lda,                               &
+     &               b(LBOUND(b,1)), ldb,                               &
+     &         beta, c(LBOUND(c,1)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
+        !> Rank-2 form without leading dimensions: m is passed as the
+        !> leading dimension of a and c, and k as that of b.
+        PURE SUBROUTINE libxsmm_blas_dgemm2(transa, transb, m, n, k,    &
+     &  a, b, c, alpha, beta)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(m,*), b(k,*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(m,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_blas_dgemm0(transa, transb, m, n, k,           &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                     &
+     &               b(LBOUND(b,1),LBOUND(b,2)), k,                     &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), m)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(d)gemm. This overload belongs to libxsmm_blas_(d)gemm.
+        ! Rank-2 form with mandatory leading dimensions lda, ldb, ldc.
+        PURE SUBROUTINE libxsmm_blas_dgemm3(transa, transb, m, n, k,    &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc
+          REAL(C_DOUBLE), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_DOUBLE), INTENT(IN) :: a(lda,*), b(ldb,*)
+          REAL(C_DOUBLE), INTENT(INOUT) :: c(ldc,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_blas_dgemm0(transa, transb, m, n, k,           &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), lda,                   &
+     &               b(LBOUND(b,1),LBOUND(b,2)), ldb,                   &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm.
+        !> Scalar-reference form: a, b, c denote the first element of
+        !> each matrix and are passed on unchanged to the C routine
+        !> bound as "libxsmm_blas_sgemm_".
+        PURE SUBROUTINE libxsmm_blas_sgemm0(transa, transb, m, n, k,    &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a, b
+          REAL(C_FLOAT), INTENT(INOUT) :: c
+          INTERFACE
+            PURE SUBROUTINE internal_gemm(transa, transb, m, n, k,      &
+     &        alpha, a, lda, b, ldb, beta, c, ldc)                      &
+     &        BIND(C, NAME="libxsmm_blas_sgemm_")
+              IMPORT :: C_CHAR, C_FLOAT, LIBXSMM_BLASINT_KIND
+              CHARACTER(C_CHAR), INTENT(IN) :: transa, transb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldb
+              INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldc
+              REAL(C_FLOAT), INTENT(IN) :: alpha, beta
+              REAL(C_FLOAT), INTENT(IN) :: a, b
+              REAL(C_FLOAT), INTENT(INOUT) :: c
+            END SUBROUTINE
+          END INTERFACE
+          ! NOTE(review): OPTIONAL dummies are forwarded to non-optional
+          ! dummies of the C interface; presumably an absent argument
+          ! arrives as a null address meaning "default" - confirm.
+          CALL internal_gemm(transa, transb, m, n, k,                   &
+     &      alpha, a, lda, b, ldb, beta, c, ldc)
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm.
+        !> Rank-1 (assumed-size) form: does nothing unless m, n, and k
+        !> are all positive; otherwise forwards the first elements.
+        PURE SUBROUTINE libxsmm_blas_sgemm1(transa, transb, m, n, k,    &
+     &  alpha, a, lda, b, ldb, beta, c, ldc)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: lda
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldc
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a(*), b(*)
+          REAL(C_FLOAT), INTENT(INOUT) :: c(*)
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_blas_sgemm0(transa, transb, m, n, k,           &
+     &        alpha, a(LBOUND(a,1)), lda,                               &
+     &               b(LBOUND(b,1)), ldb,                               &
+     &         beta, c(LBOUND(c,1)), ldc)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm.
+        !> Rank-2 form without leading dimensions: m is passed as the
+        !> leading dimension of a and c, and k as that of b.
+        PURE SUBROUTINE libxsmm_blas_sgemm2(transa, transb, m, n, k,    &
+     &  a, b, c, alpha, beta)
+          CHARACTER, INTENT(IN), OPTIONAL :: transa, transb
+          INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k
+          REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta
+          REAL(C_FLOAT), INTENT(IN) :: a(m,*), b(k,*)
+          REAL(C_FLOAT), INTENT(INOUT) :: c(m,*)
+          ! Nothing is done unless m, n, and k are all positive.
+          IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN
+            CALL libxsmm_blas_sgemm0(transa, transb, m, n, k,           &
+     &        alpha, a(LBOUND(a,1),LBOUND(a,2)), m,                     &
+     &               b(LBOUND(b,1),LBOUND(b,2)), k,                     &
+     &         beta, c(LBOUND(c,1),LBOUND(c,2)), m)
+          END IF
+        END SUBROUTINE
+
+        !> Re-exposes BLAS based GEMM routine with an interface similar to
+        !> libxsmm_(s)gemm. This overload belongs to libxsmm_blas_(s)gemm.
+ PURE SUBROUTINE libxsmm_blas_sgemm3(transa, transb, m, n, k, & + & alpha, a, lda, b, ldb, beta, c, ldc) + CHARACTER, INTENT(IN), OPTIONAL :: transa, transb + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, k + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: lda, ldb, ldc + REAL(C_FLOAT), INTENT(IN), OPTIONAL :: alpha, beta + REAL(C_FLOAT), INTENT(IN) :: a(lda,*), b(ldb,*) + REAL(C_FLOAT), INTENT(INOUT) :: c(ldc,*) + IF ((0.LT.m).AND.(0.LT.n).AND.(0.LT.k)) THEN + CALL libxsmm_blas_sgemm0(transa, transb, m, n, k, & + & alpha, a(LBOUND(a,1),LBOUND(a,2)), lda, & + & b(LBOUND(b,1),LBOUND(b,2)), ldb, & + & beta, c(LBOUND(c,1),LBOUND(c,2)), ldc) + END IF + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine. If the input (optional) + !> is not present, the routine is used to zero-fill the out-matrix. + PURE SUBROUTINE libxsmm_matcopy_p0(output, input, typesize, & + & m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), & + & OPTIONAL :: n, ldi, ldo + INTEGER(C_INT), INTENT(IN) :: typesize + TYPE(C_PTR), INTENT(IN), OPTIONAL :: input + TYPE(C_PTR), INTENT(IN) :: output + CALL libxsmm_xmatcopy(output, input, typesize, & + & m, n, ldi, ldo) + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine (DP/rank-1; typesize argument of 8). NOTE(review): C_LOC(input) is taken although input is OPTIONAL; confirm that an absent input is supported by the targeted compilers. + SUBROUTINE libxsmm_matcopy_d1(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(*) + REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: input(*) + CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 8, & + & m, n, ldi, ldo) + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine (DP/rank-2; typesize argument of 8).
+ SUBROUTINE libxsmm_matcopy_d2(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(ldo,*) + REAL(C_DOUBLE), INTENT(IN), OPTIONAL, TARGET :: input(ldi,*) + CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 8, & + & m, n, ldi, ldo) + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine (SP/rank-1; typesize argument of 4). + SUBROUTINE libxsmm_matcopy_s1(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_FLOAT), INTENT(OUT), TARGET :: output(*) + REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: input(*) + CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 4, & + & m, n, ldi, ldo) + END SUBROUTINE + + !> Matrix-copy (2-dimensional copy) routine (SP/rank-2; typesize argument of 4). + SUBROUTINE libxsmm_matcopy_s2(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + REAL(C_FLOAT), INTENT(OUT), TARGET :: output(ldo,*) + REAL(C_FLOAT), INTENT(IN), OPTIONAL, TARGET :: input(ldi,*) + CALL libxsmm_xmatcopy(C_LOC(output), C_LOC(input), 4, & + & m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (in-place form; C_PTR with explicit typesize). + PURE SUBROUTINE libxsmm_itrans_p0(matrix, typesize, & + & m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + TYPE(C_PTR), INTENT(IN) :: matrix + INTEGER(C_INT), INTENT(IN) :: typesize + CALL libxsmm_xitrans(matrix, typesize, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (in-place form, DP/rank-1; typesize argument of 8).
+ SUBROUTINE libxsmm_itrans_d1(matrix, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_DOUBLE), INTENT(INOUT), TARGET :: matrix(*) + CALL libxsmm_xitrans(C_LOC(matrix), 8, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (in-place form, DP/rank-2; ld is passed as both ldi and ldo). + SUBROUTINE libxsmm_itrans_d2(matrix, m, n, ld) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ld + REAL(C_DOUBLE), INTENT(INOUT), TARGET :: matrix(ld,*) + CALL libxsmm_xitrans(C_LOC(matrix), 8, m, n, ld, ld) + END SUBROUTINE + + !> Transpose a matrix (in-place form, SP/rank-1; typesize argument of 4). + SUBROUTINE libxsmm_itrans_s1(matrix, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_FLOAT), INTENT(INOUT), TARGET :: matrix(*) + CALL libxsmm_xitrans(C_LOC(matrix), 4, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (in-place form, SP/rank-2; ld is passed as both ldi and ldo). + SUBROUTINE libxsmm_itrans_s2(matrix, m, n, ld) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ld + REAL(C_FLOAT), INTENT(INOUT), TARGET :: matrix(ld,*) + CALL libxsmm_xitrans(C_LOC(matrix), 4, m, n, ld, ld) + END SUBROUTINE + + !> Transpose a matrix (out-of-place form; C_PTR with explicit typesize).
+ PURE SUBROUTINE libxsmm_otrans_p0(output, input, typesize, & + & m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + TYPE(C_PTR), INTENT(IN) :: output, input + INTEGER(C_INT), INTENT(IN) :: typesize + CALL libxsmm_xotrans(output, input, typesize, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (out-of-place form, DP/rank-1; typesize argument of 8). + SUBROUTINE libxsmm_otrans_d1(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(*) + REAL(C_DOUBLE), INTENT(IN), TARGET :: input(*) + CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & + & 8, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (out-of-place form, DP/rank-2; typesize argument of 8). + SUBROUTINE libxsmm_otrans_d2(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + REAL(C_DOUBLE), INTENT(OUT), TARGET :: output(ldo,*) + REAL(C_DOUBLE), INTENT(IN), TARGET :: input(ldi,*) + CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & + & 8, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (out-of-place form, SP/rank-1; typesize argument of 4).
+ SUBROUTINE libxsmm_otrans_s1(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldi + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), OPTIONAL :: ldo + REAL(C_FLOAT), INTENT(OUT), TARGET :: output(*) + REAL(C_FLOAT), INTENT(IN), TARGET :: input(*) + CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & + & 4, m, n, ldi, ldo) + END SUBROUTINE + + !> Transpose a matrix (out-of-place form, SP/rank-2; typesize argument of 4). + SUBROUTINE libxsmm_otrans_s2(output, input, m, n, ldi, ldo) + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n, ldi, ldo + REAL(C_FLOAT), INTENT(OUT), TARGET :: output(ldo,*) + REAL(C_FLOAT), INTENT(IN), TARGET :: input(ldi,*) + CALL libxsmm_xotrans(C_LOC(output), C_LOC(input), & + & 4, m, n, ldi, ldo) + END SUBROUTINE + + !> Returns the difference between two timer ticks (cycles). + !> Implicit FORTRAN 77 interface: subroutine available (bound to the C symbol libxsmm_timer_ncycles_). + PURE FUNCTION libxsmm_timer_ncycles(tick0, tick1) + INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN) :: tick0, tick1 + INTEGER(LIBXSMM_TICKINT_KIND) :: libxsmm_timer_ncycles + INTERFACE + PURE SUBROUTINE internal_timer_ncycles(ncycles, & + & tick0, tick1) BIND(C, NAME="libxsmm_timer_ncycles_") + IMPORT :: LIBXSMM_TICKINT_KIND + INTEGER(LIBXSMM_TICKINT_KIND), INTENT(IN) :: tick0, tick1 + INTEGER(LIBXSMM_TICKINT_KIND), INTENT(OUT) :: ncycles + END SUBROUTINE + END INTERFACE + CALL internal_timer_ncycles( & + & libxsmm_timer_ncycles, tick0, tick1) + END FUNCTION + + !> Utility function to calculate a collection of scalar differences + !> between two matrices (libxsmm_matdiff_info). The location (m, n) + !> of the largest difference (linf_abs) is recorded (also if NaN). + !> In case of NaN, differences are set to infinity. If no difference + !> is discovered, the location (m, n) is negative (OOB).
+ !> Implicit FORTRAN 77 interface: + !> TYPE :: info + !> INTEGER(4) :: datatype + !> INTEGER(4|8) :: m, n, ldref, ldtst + !> ARRAY :: ref, tst + PURE SUBROUTINE libxsmm_matdiff(info, datatype, m, n, & + & ref, tst, ldref, ldtst) + INTEGER(C_INT), INTENT(IN) :: datatype + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN), & + & OPTIONAL :: n, ldref, ldtst + TYPE(C_PTR), INTENT(IN), OPTIONAL :: ref, tst + TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info + INTERFACE + PURE SUBROUTINE internal_matdiff(info, datatype, m, n, & + & ref, tst, ldref, ldtst) BIND(C, NAME="libxsmm_matdiff_") + IMPORT :: LIBXSMM_MATDIFF_INFO, LIBXSMM_BLASINT_KIND + IMPORT :: C_PTR, C_INT + INTEGER(C_INT), INTENT(IN) :: datatype + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: m, n + INTEGER(LIBXSMM_BLASINT_KIND), INTENT(IN) :: ldref, ldtst + TYPE(C_PTR), INTENT(IN), VALUE :: ref, tst + TYPE(LIBXSMM_MATDIFF_INFO), INTENT(OUT) :: info + END SUBROUTINE + END INTERFACE + CALL internal_matdiff(info, datatype, m, n, & + & ref, tst, ldref, ldtst) + END SUBROUTINE + + !> Calculate co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). + !> Implicit FORTRAN 77 interface: + !> INTEGER(8) :: coprime (OUT, declared as C_LONG_LONG in the interface below) + !> INTEGER(4) :: n + ELEMENTAL FUNCTION libxsmm_shuffle(n) + INTEGER(C_LONG_LONG) :: libxsmm_shuffle + INTEGER(C_INT), INTENT(IN) :: n + INTERFACE + PURE SUBROUTINE internal_shuffle(coprime, n) & + & BIND(C, NAME="libxsmm_shuffle_") + IMPORT :: C_LONG_LONG, C_INT + INTEGER(C_LONG_LONG), INTENT(OUT) :: coprime + INTEGER(C_INT), INTENT(IN) :: n + END SUBROUTINE + END INTERFACE + libxsmm_shuffle = INT(0, KIND=C_LONG_LONG) ! avoid warning (older CRAY) + CALL internal_shuffle(libxsmm_shuffle, n) + END FUNCTION + + !> Calculates a hash value for the given array and seed.
+ !> FORTRAN 77: see libxsmm_xhash + FUNCTION libxsmm_hash_char(key, seed) + CHARACTER(C_CHAR), INTENT(IN) :: key(:) + INTEGER(C_INT), INTENT(IN) :: seed + INTEGER(C_INT) :: libxsmm_hash_char + libxsmm_hash_char = seed + CALL libxsmm_xhash(libxsmm_hash_char, & + & libxsmm_ptr(key), SIZE(key)) + END FUNCTION + + !> Calculates a hash value for the given array and seed (the result variable is initialized with the seed). + !> FORTRAN 77: see libxsmm_xhash + FUNCTION libxsmm_hash_i8(key, seed) + INTEGER(C_INT8_T), INTENT(IN) :: key(:) + INTEGER(C_INT), INTENT(IN) :: seed + INTEGER(C_INT) :: libxsmm_hash_i8 + libxsmm_hash_i8 = seed + CALL libxsmm_xhash(libxsmm_hash_i8, & + & libxsmm_ptr(key), SIZE(key)) + END FUNCTION + + !> Calculates a hash value for the given array and seed (SIZE(key) * 4 is passed on as the key size). + !> FORTRAN 77: see libxsmm_xhash + FUNCTION libxsmm_hash_i32(key, seed) + INTEGER(C_INT), INTENT(IN) :: key(:) + INTEGER(C_INT), INTENT(IN) :: seed + INTEGER(C_INT) :: libxsmm_hash_i32 + libxsmm_hash_i32 = seed + CALL libxsmm_xhash(libxsmm_hash_i32, & + & libxsmm_ptr(key), SIZE(key) * 4) + END FUNCTION + + !> Calculates a hash value for the given array and seed (SIZE(key) * 8 is passed on as the key size). + !> FORTRAN 77: see libxsmm_xhash + FUNCTION libxsmm_hash_i64(key, seed) + INTEGER(C_LONG_LONG), INTENT(IN) :: key(:) + INTEGER(C_INT), INTENT(IN) :: seed + INTEGER(C_INT) :: libxsmm_hash_i64 + libxsmm_hash_i64 = seed + CALL libxsmm_xhash(libxsmm_hash_i64, & + & libxsmm_ptr(key), SIZE(key) * 8) + END FUNCTION + + !> Calculates if there is a difference between two arrays (arrays of unequal size are always reported as different). + !> FORTRAN 77: see libxsmm_xdiff + FUNCTION libxsmm_diff_char(a, b) + CHARACTER(C_CHAR), INTENT(IN) :: a(:), b(:) + LOGICAL(C_BOOL) :: libxsmm_diff_char + IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & + & THEN + CALL libxsmm_xdiff(libxsmm_diff_char, & + & libxsmm_ptr(a), libxsmm_ptr(b), & + & SIZE(a, KIND=C_LONG_LONG)) + ELSE + libxsmm_diff_char = LOGICAL(.TRUE., KIND=C_BOOL) + END IF + END FUNCTION + + !> Calculates if there is a difference between two arrays (arrays of unequal size are always reported as different).
+ !> FORTRAN 77: see libxsmm_xdiff + FUNCTION libxsmm_diff_i8(a, b) + INTEGER(C_INT8_T), INTENT(IN) :: a(:), b(:) + LOGICAL(C_BOOL) :: libxsmm_diff_i8 + IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & + & THEN + CALL libxsmm_xdiff(libxsmm_diff_i8, & + & libxsmm_ptr(a), libxsmm_ptr(b), & + & SIZE(a, KIND=C_LONG_LONG)) + ELSE + libxsmm_diff_i8 = LOGICAL(.TRUE., KIND=C_BOOL) + END IF + END FUNCTION + + !> Calculates if there is a difference between two arrays (arrays of unequal size are always reported as different). + !> FORTRAN 77: see libxsmm_xdiff + FUNCTION libxsmm_diff_i32(a, b) + INTEGER(C_INT), INTENT(IN) :: a(:), b(:) + LOGICAL(C_BOOL) :: libxsmm_diff_i32 + IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & + & THEN + CALL libxsmm_xdiff(libxsmm_diff_i32, & + & libxsmm_ptr(a), libxsmm_ptr(b), & + & SIZE(a, KIND=C_LONG_LONG) * INT(4, KIND=C_LONG_LONG)) + ELSE + libxsmm_diff_i32 = LOGICAL(.TRUE., KIND=C_BOOL) + END IF + END FUNCTION + + !> Calculates if there is a difference between two arrays (arrays of unequal size are always reported as different). + !> FORTRAN 77: see libxsmm_xdiff + FUNCTION libxsmm_diff_i64(a, b) + INTEGER(C_LONG_LONG), INTENT(IN) :: a(:), b(:) + LOGICAL(C_BOOL) :: libxsmm_diff_i64 + IF (SIZE(a, KIND=C_LONG_LONG) .EQ. SIZE(b, KIND=C_LONG_LONG)) & + & THEN + CALL libxsmm_xdiff(libxsmm_diff_i64, & + & libxsmm_ptr(a), libxsmm_ptr(b), & + & SIZE(a, KIND=C_LONG_LONG) * INT(8, KIND=C_LONG_LONG)) + ELSE + libxsmm_diff_i64 = LOGICAL(.TRUE., KIND=C_BOOL) + END IF + END FUNCTION + + !> Check if location is SIMD-aligned and optionally consider the next + !> access as if reached by incrementing the location (in Bytes). + !> Optionally calculates the alignment of the given location in Bytes. + FUNCTION libxsmm_aligned(location, increment, alignment) + TYPE(C_PTR), INTENT(IN), VALUE :: location + INTEGER(C_INT), INTENT(IN), OPTIONAL :: increment + INTEGER(C_INT), INTENT(OUT), OPTIONAL :: alignment + LOGICAL :: libxsmm_aligned !
C_BOOL (GNU Fortran issue) + INTEGER(C_INT) :: aligned + INTERFACE + SUBROUTINE internal_aligned(is_aligned, location, & + & increment, alignment) BIND(C, NAME="libxsmm_aligned_") + IMPORT :: C_PTR, C_INT, C_BOOL + TYPE(C_PTR), VALUE, INTENT(IN) :: location + INTEGER(C_INT), INTENT(IN) :: increment + INTEGER(C_INT), INTENT(OUT) :: alignment + INTEGER(C_INT), INTENT(OUT) :: is_aligned ! C_BOOL + END SUBROUTINE + END INTERFACE + CALL internal_aligned(aligned, location, increment, alignment) + libxsmm_aligned = 0.NE.aligned + END FUNCTION + END MODULE + + + diff --git a/third_party/libxsmm/include/libxsmm.mod b/third_party/libxsmm/include/libxsmm.mod new file mode 100644 index 0000000000000000000000000000000000000000..2e87ea7ec1156b6f9b2c25231541d20eee590098 GIT binary patch literal 16591 zcmZvjb8se4)bBU8^~83vv27EMwr$%sHhlBj_ui`e$DQi>&Y7v{=c&eY zpHI(e{7A?j{|=B#Z%x;A_PR$dU$E_XpH%e$T(`H4@b;Kx337GWInyK0*?40Pd9h?K z5{{J5$722yU_Sy{g&F~m$p?xik%+bSUnNvOlzXnbJKNvg+@DyxwRN_qi~lyCTJUK8 zyu%3V93d}1bz|zBvQ-&h_u_4Tt?lakIF@<3JEU`u`Z8_T^%>n!b?fwEz}(!RyJ9WJ z_!~KE)}#|nmS4WQ=h5`?PBd&sz46O2sb}$G5dFaH-I_Zirus;+#rbV3GV_S$_?57| zz3OQH@#OVhNtb(r%pt852Q7)~0~=`!X$ol+X%T;{8Gmd;t`eKq#5XvGScpc9TZ~9- zuGQAXai3fEuw{f{jKRfLbNe@20v%mle`V@H>QO4k^X68-2syTy(_85Zk`mOUPwYtT zMD3VF97-ZeCQ3X?`ZE6~qmFNF8d5S!HcA}Ianq^^i%x|#s_CtrQOa4*re;4nS)o4!w?q z4up=m4(bQUhsekA2XkLjedr^zIf8^G>O6SY`zwE2;znODQx81p>z5YeB(;bPBuS>Vp zh1WUP-HCM?quu41(;Zrz<;{)X>G3@5Nn4UqHqx6NU0tm@&vJ4i<-LD>oT4`h{o>;` z^<+ywlf1g+{b6g%h@tk0SflHL_<&N$SBERF<-;)c<>h+Y_O3EHK==QA3H zZh!&B$zOkk>{S{|+f&i_W4*~L>xV0Ic%1IjpZA(%{G-2-Y?C?!=|n0d5Ij5MZuXV5 zmFu6c-J9FD2Tkq!7mMZh2>5p?_`Etk9*?*4g{0gscN2eV&F+*nUudvDd`~ED>5-k$mWhxNdjPk!`?VSH3Q9TWCcuO*4dPz|y$=rida=LVcLce+{Jq^{bAz#Z7_J}Rp60{quSp?5XwbDwFKdv2#Q3d* zmkR8z^;4kjPkyswHj=q-ig%wZR%4Ii0^ML*a9O^IXn$YCWU18J~`B z%csTdaHW)X#@loni_BW8Lg(O@>6~?EWfWb1Nnx!cxo_X)hzvfD z6heJ`SDworB{*49EBz&uaaSPtoq@E8`3nVKeM}T_E911TtIh-!O=8(4lykA zSw3phepWq#n$#bLF&2$(cT=ZaXaO`dUGUBaL69VH01#qHmxza~MApzU$bpB5h)7LR 
zVUWlD#kzFc-mh-APuyI-j-L=-;M^ykxI5XMe{<@6eF0ft;}1((d2yDT>Q`=!`R3;V(fwZ9ZHS2UR{|{Z1Wh6pIgM0yV>0#;O5q;@G|EPQxZDvAq^nw=S>tLqjKD61Bd7FNy_f*#zOGUh8P5Mo+#2IZ76 zYZhZUDV0S;R^_E$O-&jf!z!Ll8jI^^`6f1U^otcIt)*L&zprFigu@|* zl$elN7ke=a$r%=-z-3qrr^l6;@K_h6xm6V>4}7H?Ez6+F*cOK?mop8@m8Y#AZA6T0 z%iZHfR2)&8zVlTN*DvBMubdg_vycmDPM+S^EoKH=vm8$^syN=_YhJEd{2Vv%fAru; zJ1uJqVJYfF^rCLn6=5`R^Zq>hO&s{~_}(mMtLxq2_S~do19CXozD=j&FUr47g%RcI zYk3f6iE@Js^-I!!?2ToI2dV<5jFF_~$gWXqDq|)`JNv^PKF`*2f71ZwWq+@~eb%LS zg7NVXx&zLK{Hex}f6>|P!NBw=3yn2^MI?_>wks;*OfoxYuly^(*8S1`dYUMXB%NfD zq&nrfXn~e;(wkG&GQ=T8HjV?9VIFqXpO8Iuz5VZlm_-q1?cwC#ZdpQ2bJ+@J^{UaW zsH^1qL6fzT7U$%_-BmqRc$Fo41p$PBAbL@ezG^D5x*KcaR5bV(V|6iSCf#VZ()E&d z);TQef^>o{6fg8Kamz&cH%N7?8Hh;inE3(3(N4>^k9XRE^6Dqy`ug$ldfym+o@qL< z>gnyAZg20;r^W8Th_#Dmru+=!`J^J`SLHKsM+i21Ea_p=cx$M!*&2=|c5Z6|V{-yR z)Zd6sV3aC*RifHX=Pc)UxXfqX8njdDt7?6S>Rvy;Fr7RX-I#H6Mi^CeHl~lo0P%`p zA!T0LV+a`Jq4&Zcw@xmSWhXYVN-NL;&YAg4PR;RWp{e393~}U02`UtdXS`l91!5FM zXI>x1tyoQ+L%W|b%pzeaCQ_SfbiE9fCXtU;cjJL@N$ts;YOCG0W$3gC!J4A^4Pt+> z*lNi&J}8&<&PteSkpeaZ)TI$Mfb%>7`L|wA?($PyFFh74+XPjZs5ru>vRN!toh(ps zs$_(y)Wa|l{nfxdXwLWiga69q^e`fMRP{Ja|j zfAvXSMzna~vu;#Gxu6bf$ZyaW?nqUVm%W{KdPFdo{T>W z*wTN-BtJPoJoJ=9Bv!UnTa^GI%fmnoVE~YbiA2W$vLE@JG!S^CO%%p@yB_6XX3&?BM&dXj5|}+Dg}_U!H3j z3RHgEw~5AJ66UIgw0(jCo3$*+fMK`fA=EeD~i58in_9g)@q(O^601C04P(XEo zOGgGyRH&n23*Jw4wkHzIZ4x{$w89?);T9_lk5yd=X^#bwhXk&}C@cjBly-|>3e0Si zZt2<8b$pSGo33qtxpIl4hwdJ!`G8zd0-WrnNz)nt@+&~<6Z9N+z+RMoU>UGy346;D z@`459CgCQ4Q^kh0%=J6PlYJSMFNoa#GlmC>QVdVnHcZw{^db2K6o8^9?au%GMV<5$Y#K7@mVST#*&?hdyy znY1ao+nt}y#3}c_K>N+6wq8KSES7`AV^Hpq*21Zug#2WJIe;IGNCYTAjBx}gP~XA@ zRhRHB&k`+i(7xKU6(TFST|YUtirk;{cwntNb2{-v3r?|-SIYWmfZn8{TW%;8K~;G4 z3&zq^^ZrbtGDJ~rs{f7ZBUIJ}V8SW?agRue^7C639Wh0PEqWCc2MEqgCNjzdaSJx} zR&p>C!#iDmNs%Qp5C(Sq$+1acKdtNJrx4%CGS!YZM)u z@P2l{vq)YaA)XbVP&DF<$j*wl{DoI-Z_e2~_xZSyrxW6qOqD6Mr+xiF>m7hzSzD!~ z1a&9+>IHcCJsHeEVXWt`2P>I2RSgECRDUFwW^6Z3xTrk1r9y{bFd>pwJ5Ik>1Zw~T zC{q9wE5Ix4w&_6>0Pib}J<40^rRZl$ey7$kiYP_OD8*90H$7=d{TEUF3(;WSgvucw 
z7)(^<%^yph#?-SFtfUVVaVDSa-GSmvaz$0uD2Hx2eW^*`Nm5%5>dv2mI{05odY$3C z38Y`khrSBTMGm${R<^^3Wu%CrF9~=f;4MbXR4jB*2{Pn0Io?PDo?luvlKf`WK@*-= zMoZb$i^(W*(ST6o;AY`l2I29Jet9!N=5j~AN=Q?<&tlyD3Zsg?a*q%OD26K16~5<) zv*QzW0igr*asP^}Wij}Qn-7uE3T|A7g;eDAjOZ|n6g;(F>=A9a09!lR2*{CBOa)$-4Y zmMbk-RX#0_JFono!g8%4^^~xxe0#Mm@Cu>lOfZ;thPaWIhP;Aur6I)lLW?n8erJ

    )a4GAg};pp=S#`>7S?Cnt4S-u);!IE<$t5J44hVL@oh;e&=#@ZPPq!(T1!B zHsf{rDDb%-UyM7anvO+hn_4LNgv{!ISJc%^(YqAZVpc!I+*0ic&Ynl|O>O&7K&5{y zw^jaPE8f-o`^xpoQu;X}_sF(x{qEx9u71TwA6+AuD<0sH990a)!c4nU5=DHEfFr(QGT_Z$C>5NH3ey+fRW$)9 zFUY<`As0ilpzMXj`so%Dl6Vmt(78(G^vE+oFSlp^ zVtl5fGAYk^#-}Y>JaIlxOgQQANwPlflg!d@&1$gI~Qtha^S@YSb67!l=N&< zF+z$CnGEm|6aG91roJ4kF5@oZ&=bG$cV~*UfHi*^$tO&j{T*Jv6}`scr5PuY%pV60 z(UlkiWiI=1PQqV!j!fd_cNeRn3b-fvqfDAg4#i39-vyeLynBH~rk>=eEg0tPO|VKJ zjpd#4=Nj266?C)&Scd=;EWWl$)h@HHA|5Z`krne!+CQB-n*po>twx3FSOka;RpreO z8g?|uv3?Pm4)R@+N#}pP5wxF1dCdIo{5)A0#>psRGi8v*IQlP84@kkPD~|RD8f@Bl zv!1_?!~xmJKFXNGJs8ApK)SGA3^$dd^G7XXuFPq2ovp#?Tp@ZJEotF9>=pYsEtPPFy$K|UeDirOvdkDOBz~BdQ5;02(t$S+ru09%L6b7OaQfZ3bT;CWr<@!J?E#BFFw3c1!s9KqluAakM!?L^Ix4gKUwST zYfS1~QRV#uL-DNCBRG>YZcUoDCPHc-e#2mGi1%|+pA@As~dibTzTi-PK?p!|NG=gxx9%Bk9 zLw3fSyPC8+Djp%sxe@e`oNX~kT6caN$8Wyu3e024)E!*Lvy-2GE%_{hP@E4m zCbbd63&tzsW8z6o!=rh>yl62EGT~A3HSwVTzNH!!i-G>JC?sM~1i>^P7>ti@Pe=1w z9ur~I#>BLl_NI^Y+;*?)twb`P)$E?oF_+aUrN1R4!Z;ciVB21=i`K)ViuiL;O1js{ zwCi#zslvUQD^$u4MUj5fJtKbX{fefgSmHHm?S%#&J8;Qwasw0rZv4JC8$|P2et*(w z?dgE_l#70sqb4jN|CSP-jZ<#hFj=mjJ9q zg2lMkBS!Z6$Cl6|#>6jr$HP9?_CCkcy!e_ee9ad}_hqooGE2Bm1)O#Zm4!_h6*#SXM5mOa_FttBF_9 zyFy!0Jnn@uCcMoutR|^**mPwol)eF!+viT?f;Jf))-;B|WImQ?rZ?x^sg~y7xR%Us z+$E~9pSyVX<_C{wrDoQEP^*jI%^|T_+4++-`N#;ob=>3@OKt6~r3$-=thBRuyR~U9 zbCV#_t}~;-6+#=oaZ`M!3Ud87cM)hy9DmD*{My^ED*O#gBV6z5II%*VHcfR${!ncV zhS-FDQo#_U?_rz_DL4mK2S*232U&+IrR6`4*$9Q|=sS}O`*z>)H^}TUrzK&dYDY-G zhXDV1?|uwGL;O;L`;x%%gm(Bqqp^0%sJ+>sufBE}B%|DU)38$kpLQ10*oJwstm+$KNKEWVC^OimhR zvO0ery(6?>7tCvzy2$5L3`Mmjikcc>z9njP^i~g6$_<-Kgu~rjQUz~QJS~iRn0;R} z5fkb+kdvy}L197f8HnayWSlL`3h%<(v)7n|#VQQ2DP~9fmOdK^PN76XyyC)% z79li9pTV>Mb}ckio&TzE*ABHkohN9&L5UDfF^O8q2wIRC>^4oX2e8NVeGYh$7Vs(} z;8mo!NJ#25K5DxjHm7_u6R~6m1vwaWVu>ZHdJRNN>@+Nvx%Rnru6@U|O5B<)uVztt z`k9fjdQi?=2KH%QA#u)Hzc0#iGdvjTnqPV5u4$B~;^{?f>v2*e=ESdFD)a2phNZn- zJ(RSi>?*Z@%fdC_nR0yNr|qZsnM+B{gsec1tgxb{D6>%f(_1HUbb8baiyP_cAN(#jp`d;R9462WXuHL19@!qGH5Wrj{6i1D$%`;!gK?B!RDD 
z$)wdzMTS2UYDIuwCj4nY#^8w@zmE9R0?pXNLi)XNg!}uGYF$Y5g=fB66^!qmQ6a{( zj3s{<10{m}Lg!ciXM`1*V2HO<&CXnX8I>|}@lgR)(&&p*`PYGtAyY{Gq4{St2n3js zY!ve|mzjw1YXWw7Z7So9A#}we($7Eh&2a?=1T?nLO z$#@xhkbm2Z-w%nIYr8bm&kv(&#*$sKM6Q#|Fp-T>dPDUrfd$SqI}jC;`kU-N_kM)rHxAe6Q{cj9TB%5hj&{c13SVKOg%ejn|QG)jMqohRMTFV9L z!#Xa5xDv622(re8>Gqk0rE_yTuN5(!f^dwo8>}H9L641@f$?aGv*$0?6a&-u0of-Y z&H>PArQs3g%-`@BR@1*$Pd8id5D$qXA@R5-q1eM|9eI%dc&JT0toR2P)9(Pgq>AEU zI&HkVgm+P}d?*RXf`Nz3(a>Vp`NRNy8QOso;xKNa2f%U7AH5DxWO(6J-S2|*;w?a3xxs`9eT?rU{E>6q38$IfeW<6nL27|AxUE* zF!?7Sd}UCQ&%$I9csY~=sVgaL3AVKT_X4_zd1PKgVq$VUQ7^1o3qLLpsJGb8>yNWr zKWwHv`L<&UTKvMp{Nl8@Y!gwBbWBl`SNyy$AX89q^{=5vzTI%~7J@*=UIZvO1ObbY zg(QL(uupysL60)@(^8FsVvD%!GLi2K!rZSo0^|bYvSYTm>}}x|lQX>4`WC+A$0{oh z3e}K_@ZlT9NKv^1BA&R&2jQQ68qC6xAxgMj=7q6LDJ|(f*_CYb1H-^VGG%qmr5j^LKUX%V{p(2*A?3-*<|aJT;1+U&AIA8@La44y|g{ zC(+MW4^kkoRbsw={`-EStgzwh+~OXiTUe6TL1*tV^SH2JrX(ugKIC5i;jZ2N;-@#U zG-EY+j$oRCPv-*y=d4-P8cih5R6VmGle?G{GXFyo4EMKR_L-yg0tI z+H%XJmwEm&Z_U2`ieR|fg>UEaFdTte$wCYwx7d2N#|~n8TN2xR=43!;kq*GthCU?ovJo`*s2JhNY&x39M!fP zCS7%|+9{T_=$r(oWCX%V+PV*i7zqy3SW3vVS`S09bYI-<9qez9a04H<+Bg0)1fCzK zF?>xxJvrAub*=|>KW}^5I=B2633|v0+@5t{;GFhJDD;}i{fuISuD3sf;{8&0iayVC zu^V1LA>IT5!CMQiAI4s6JOa?UOKvlG!MhYZ=Utf>XVS1lgKI>u_ZO4Jj@YvEE%{LJ zfUi}9y7Vnsj0}_;ZW6-oo?Iwovr9MLw+=JEhnlM%L>_oDp1A=|$0{847*{_#f5VZI z)(1j0ojB#jOOe7hqF4x(8QRctAVJCA={Er$Q|^Q*P$!&*BWkSD0EizAk& zjh(vM<+kE*SY;_kt-9O#r1T-WR<#LXpJ{n5k3!9+Aig%xb}$%;D$YPpSH4`(DX#B^ z6NXFK5Q5(;b(WqWq5(P)jT%p|(-de^EsR?M8?9&j1i)Wu5w~2GGE7>#5Q|+*XYAbb zkt{y%>`*gu=ZKz2SvKUT(=SiG3w&m8>U2iTqn6Ipqd1uxLxVr=eLg8 z#PR{P6-m3SP5A!#Vc7uwmm|XO5?&|I!Q-w4p0sNhQ&njfk7e>B8@R%se6}j-Np%fyODYr$Scrg)8FakA@jh+1UBRs zFWnfL1=Qsiqe4EIkAAlszt01zSe#^gW4}Wb%c65WxnD^|QS+FY#+b)KR(&oL(X^b-_r3_sd22RHnROLpb1 zm>o{Q$`lh6um#sN31sRu_dbP!B{pHEawL4~bK`!j25L+rD^<{Wfh z>0{|mWCLUYE+vV;4#{M*Y974GkpwyPVH3qoia1K}SXzp2$X$Hg;UPPtu1?J1NJynY zQs&P8w#NHixDvbCpdmFUy=}4Epdms`U=5Fp+th(bB7 zJIy)s_!V0xj4gl=f?zkHD5QA+2p~Mqa(Uh%qXSnI5zG<4MFR1c?-^nIe=pENw#Uhj 
zJU-IpptgG8IFhwS|I1T<|M5lKR`TR>wjjAB+Y3T3uz?;IYJvq^ZE_m}<#5yooxZ@k zx{4BE=>C1b3-Mf*cSs$D6d>9;A_A9!SWuuwTqjS`GxraqaIG%U)J~Zh$3)eQInO>E zv7_;Gj>MKhRfcXHj(BTZ+@O9X<RDk%5tcEv>K;l%~Hh;|GAp;Af^7EmfPe{>fXH=_qW+wM$UBtqGZ; zouMojPo0f|d7DeQ9`1N-$=*qc0&kU#k}8%rPHozvdt3pHhAEiFmJ9fS0k6oz78J(P zDu_G#hH9V~zy2mZZGwyS*ZtMQ>tlRP(t9HtGq(WZ3HTtZ{CgV6)r@*&$VS+Ww?3$f zV{M$qbvfMUU@X9NJ3I6Wo5icuSi6TZ4l|1a!!RT{{qTsa>ltvaEf4PMU-9XHSe_UK zLS|{liWs|x#_o@jSEtKm2##nu{~5Mkg9EcKCW0W`OFmldD!R-tlql{{l{{J4I~)K5 z`GXsLluL@2XCz&d^$BQn1EFf^rg>@S49|$(w`_AMPl-l=R9yjiY5_r-of*Wh`sB<9hdfE zkXNnRBN^X4_FDu<#ciqu3bn`Nnx!_>AFSwpt3Ao9Qmh7ckz5+!+h!}E&*55;#RcDi z%bqGJVJv3u)a0>5@?W`eiPWD^C4C3Gy&5{P=*YDTjz&xcGQmKv3aHzv)SpM|)dgzF z0)vn(_5dCQjaTueDD?%I*}V{s6G!N{5y3F}o?CZZl(IqnyHms<;3A9(3$ql~ z1+Z!b@k%#JxJBC8N#UW4b7QfS$m77C^6s3d%BYcK?bL452YHNnW$J76l(vv16HBx( zut4+gRP=~M4qxpuB$cv%85#a6y`Xh&HKDe~x(B9bFp;g&=f=SQ=7M}zDSo$5d#g?9&dB?$Z~HMeZ9uGl;QeXBO+sr{-6jf&G2 zGoBdqyRcK5Op&>)Q#&1bN0yKqzU0k`1C=Dj5r+sSe(%{g;Cxb?nf*w0Eu(zItH z&%)pb5+ZH`0$i2CVK~%DdU|xVhJ^|Q1ljhI{{9}UlwjcU3?hum24wT*Vm8~dVIp8; z-fR6Fo9Yy%KkHZmAUX3gTSrJBYt}Ur^ayNhWdi(_JuE{zx#g~~$wrTvQFHCesBw@^ z%E(6xE$`Wdj+Y<}D8iVWqUWhL|J>x+y;*6{k6d<`_ab&<+yf6IeC4>gB%gaL;g;{z zMqsD$ad4SDUOr98UaxD}7PqkrG%>KKv3qkd08Sq+8}uHAGJUpH{wtN8))*4g0`YGt z4nZ^!hrUwBEVz+`2>PoFNU@b#1-YEaDJ1&}C1}tECRs{)`f0Mlo?p>Lw(tb8X0~Bv zy&TH!87G`4_<<7}7gkcb8=sfcmhsyJoCMs(+?3*0f&7pO{3NdRL@ zC8|nhgMw9N0{db`6;})$H8{M##rAOBnp$okdC@rUeY-{%j@$DDH`CtNe+1%|j7eAvaG$q9nP6E*YiZ zFlE+~*nykMUU-mMYeDHQnRK|QaF=K&+?dEsii1f(Du*1NtoeJYD#69P)9}RL-k1ef zw>kk@hgfpksvw3_RhJS0WPVuVL!b2V*iH(pHlgRPM;xYo1%s;~=?g=5bx! 
z!?ON<%2u2MnGmuS>|h}i=A2Fe5_xoCGgS(DR5N!9x-= zSHTIqkOM{CL85ggGD*Ed)=GzX73hdnLA~0d3GZ$3tSZWy8zx!#?c7|e42p$ZEp?>L;=XK`4}!7DvzLK z76A_$T1O9t$o-d?-Fa%-{b(rjZXoB*WR|R86yQClS9r{FF|*a#;roGk#$#r;-hEJ@ z;hWQxWC=0oN^(?-zVS8lnjfH9Ko$aFIg!r`j(<6W@A*#>c|zAGem;?REO4P{zRR)C zm-BWcGWx#WDA=+P@EDBvQj(BPbJMLeJ)9 zFTNF5#JKGQn)sB8eELm}&0?~t622MQ1xWuF^k?f_p> z#!Xai+Qy^DLch4CCzek{A=SdA#9Kg8M_*rNJ_ErC>zZ-+mQdEod(%gv8}&w-%?%55LKgk*@%Tq+Lp&tqU^G#JFdN(IQ9R_-4FklyX&p8^eB+vL#XiDy^E|3 zY=`as(Vlku2HrP$4_>StGDD{R!|yKluzc~$_QI^B4Its|%Qji@X!GT?b`&$LLwT2% zyZ5yy4EyycjH?6T2-bP1y=ZLNg>8-|ArAciaN`9@6RAM~e^1tn*2B_th66C^E9O;0R!Hx2qqEur4GX(5vY+NQ!M#s-1T|GBw~bB(e0c@!f8ld z$34W_Wq`81a&6-O0~+Dqps`B}Q#gZVZp%(${Qapxz$UQGoIiWi1gA>xUzLsJ-Om~M0}Mf#_BqvCm`EM zx+<23Iu;zL>1cAE!0uRo-Eag;@JuBYo+_e*X?g`gdLnJ`41$bLjO1q-YeK$yxLAv0 zed(_FwbzypGuFy?^vi=u8;2Fj42+(!^`_Nb<4V3oPLHl8$+k@br}EMW5c&cLP21FX z!Z;;%tdod86|#h1o`!O6h)Ud_GG#Q713gvchERM$8@n>9Z}=|%7QL>o-}&O&yDN@v z_Mme%#9W2rKQ8nbE7>o=(wO(`5RBxg;Z@vv)isz$vZqf~=p8sAFNPk>Nd>8^k{Y+T zscJRn%h? zm*tDmem3DgG9mu7-}KSgRhYgk7U{QYz7{&o=5>Y59Ja8&0!K1Gq^1@SK9|{1i!^3|4LEDgT%BH|6+vxpbGa*}|SDa$_0%ga2RySqUyq-u1N6e%6FjO?!( z{O)Ke$1#+LG(Y(8HC(iHf|o{1Z%&I8IU*00}Gx(vyYBX%;|_YA<#<)AUNM^sB%%`9Onm;K%8BogW7 zUpd9V<$qA}Mznt&WFivH^4UefcImb|d(T!b=Xms)j!08{ey&JWKKt6~WC!!Jo@8!P z9rZ?)V?_G_xNyEx0$e10%LZo;JjGrL2@q8?at&R{hDJ}t!g$1r3U9J%|Z&>V`_1!?OaP+bX7jK`A14SSse&J zIxwEL?9B}|`T@$VhkqgB0`G}2b z*-Z=1v`W))5!^g1pg}w+>QiB{%KHcdVKPaMS$SyK5{k^hJZ3;_v)TghIJHWO0N7J< zn1TV}R^Km48lO&S4dw}5P)Vo|)np%qJfvK(oMgW`>qJBlL@Fv-$Qu$nL2=W>$p(-z zkUQpl`hxq>WbYn*{K*XoBAtajcqaX87|b2tTc!2~ zuXDD%U+rj?tU{H)Hk4VkvO7Pn^-#`}GIBCb%{tX5tBRdrT~->8I-E<5fS}fuuwp_e z;MY=l3Vw4g-h#ufn$=k)x=2^J1eAQ*zCHJ=)h;`#or7JPkQNRv`CcR@B2^#1gb_Vt zKK)TEJg!;7ZPT2F(zjl8bP_bKIc|aVm~|A^o_6Q1$nuQ7^OQX=7&am0c|B>!ia%@k z)qL!<>PN;(p>@j9;T(FkiA#I*BM8rJIj`1ZoH)`9I%uSUf(djPB;fb^Kn9A3`TypdotxKWP z`E7HB;hM1sDuGwN*M>)^5^E3OBZZn##jmDf=bA#rQ z1y}jHs&X?%!u@6CIn5Q0mlaD}?lJItXOx$xCV*dMs)cRiDp0UxB>n2yk0=Vu_PICZ 
zek_01suM>>RJDHa(du@!L+-(`%X&x^fj&Uf{IXJ+wc}|E8b#gHG8l|={Gb>!W%6Bj8_|Q8uEk)7Is)R*&TyM)ZuOy9CP#@-DsDWnvZ17yB z;XP%Gdpz>$z|%fgs?PJ0jsyEi=q(Y<8ze=H)F$Xe4RbyAdLzQ@1cmuH3Y=0S-#=3< z?he(O6QF|Q9n7nnKG7G_f<^bhnBcR%$BZf0C4F2!sI*Tu62ET55!!XXJg25>1aSeKtcLzi@m>$EYKa~s%&SIwAg(da-`crm zr1I@S)fwv^20l-64})B*&6%${E!QI5DP zZ=|+X%{!NuFY9is_?Gn`aMPJm-*MtPoYTOnTll}LCCmA(2iLXRH8)c&`~Ge`T6wc= z0S@~o{-#>g=0DvwwM2cva=nOH>zR z*IHviK%09j05^z2YFYg+q`9uKxPpIpLj$WwUZ2-rD5u7jz?cb&Ieg^kUxVxyTQ#D--_N3J$pbGR8MjUf%@U=sLO5Uav{y&UV!Y z>2ArDNo?4Un0lGWE^8*8Q~Z5pcOJtgiMy(4SR?5Tt}Yo> z)$vEG`+2R#Amt_8YpX14;4v63%ju8j=8!GlnWZ7k2+oXc>!dMOViR2Wt%BsD$*4Nr z6voEiBUiwzN0kn{RTQ+yrCMqkVrYvnA<4mzdkdp?SH+)C((4&LX!nmD9>$ghsSN|Q z?J5T2r4>FFFwtODJIEl;RjD6%5t+m~?!Mpgr!p`O6ZK<@+82|xWj^0)N?kdLE>Q`6 zpBD&ry3I%45Th}(mwV;z+e1F>&4-yQfQdIf8NO*rzJ87}FAHWCLKKXG*U>rnz8gy~ zzU|bEGF?j>4Y{vpJfitkwYTXg7#&x9Vchyh6IO;86qfDQPbk85i`E}}Eh46kdvhu( zCcQdqV`FxQD4NE*E;XGS=e5z`W`}(qmL_EwO=KT-`nKXu1X1RXg66gl#h@nswrkcLjm7Hv1%v~MUWTkYV@4%=CYa!3LbAmMO0sUk09pIl%*&ds_RmMx;Pu1T?d~j=TNNTuu zdx>NYPM%hE9o=z@qS!t;8o$5|KpXvi=yrJwuO07 zbzRTmYTrpsUH`!5VkZe&?fb>m*~jLi8xu&Y*%dgHp-*zVo7ZSs*pGgR+cnTId zmt{_&PNLfjolWy(g}@jt`?gA`D(x55HfYdQRL}L%lI}T9WDGDF*9fCx`?h>d7{3k$ zG0FPSv|51;t1pYiPp}3e+lw_`t#x&)iq^5!UVvQNA#Qd+x_;U;FN@sVH`iCxGD>D67oXK+t#xh^o55#|K0cZ^S{L0(xZFJT7n~;9u18I{ z^=Z}ce8yPQ*k+ShHFS}!PThu9i$BHgg_etz)UYhHnWf{c7QGMO`Jt=^_Ty8BjW4%j cS|60mEd_T{EUSgRlD_ddj3ueT-vIykAF(WASpWb4 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/include/libxsmm_config.h b/third_party/libxsmm/include/libxsmm_config.h new file mode 100644 index 00000000..bcde13b1 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_config.h @@ -0,0 +1,45 @@ +#ifndef LIBXSMM_CONFIG_H +#define LIBXSMM_CONFIG_H + +#if !defined(LIBXSMM_DEFAULT_CONFIG) && defined(LIBXSMM_SOURCE_H) && !defined(LIBXSMM_CONFIGURED) +# define LIBXSMM_DEFAULT_CONFIG +#endif +#if !defined(LIBXSMM_DEFAULT_CONFIG) && defined(_WIN32) +# define LIBXSMM_DEFAULT_CONFIG +#endif + +#if !defined(LIBXSMM_DEFAULT_CONFIG) && 
(!defined(LIBXSMM_SOURCE_H) || defined(LIBXSMM_CONFIGURED)) +# include "libxsmm_version.h" + + +#else +# define LIBXSMM_CONFIG_VERSION "" +# define LIBXSMM_CONFIG_BRANCH "" +# define LIBXSMM_CONFIG_VERSION_MAJOR INT_MAX +# define LIBXSMM_CONFIG_VERSION_MINOR INT_MAX +# define LIBXSMM_CONFIG_VERSION_UPDATE INT_MAX +# define LIBXSMM_CONFIG_VERSION_PATCH INT_MAX +# define LIBXSMM_CONFIG_BUILD_DATE INT_MAX +#endif + +#define LIBXSMM_CONFIG_CACHELINE 64 +#define LIBXSMM_CONFIG_ALIGNMENT 64 +#define LIBXSMM_CONFIG_MALLOC 0 +#define LIBXSMM_CONFIG_ILP64 0 +#define LIBXSMM_CONFIG_SYNC 1 +#define LIBXSMM_CONFIG_JIT 1 + +#define LIBXSMM_CONFIG_PREFETCH -1 +#define LIBXSMM_CONFIG_MAX_MNK 262144 +#define LIBXSMM_CONFIG_MAX_DIM 64 +#define LIBXSMM_CONFIG_AVG_DIM 32 +#define LIBXSMM_CONFIG_MAX_M 64 +#define LIBXSMM_CONFIG_MAX_N 64 +#define LIBXSMM_CONFIG_MAX_K 64 +#define LIBXSMM_CONFIG_FLAGS 0 +#define LIBXSMM_CONFIG_ALPHA 1 +#define LIBXSMM_CONFIG_BETA 1 +#define LIBXSMM_CONFIG_WRAP 1 + +#endif + diff --git a/third_party/libxsmm/include/libxsmm_cpuid.h b/third_party/libxsmm/include/libxsmm_cpuid.h new file mode 100644 index 00000000..83329b82 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_cpuid.h @@ -0,0 +1,76 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_CPUID_H +#define LIBXSMM_CPUID_H + +#include "libxsmm_macros.h" + +/** + * Enumerates the available target architectures and instruction + * set extensions as returned by libxsmm_get_target_archid(). 
+ * LIBXSMM_X86_ALLFEAT: pseudo-value enabling all features + * used anywhere in LIBXSMM (never set as an architecture, + * used as an upper bound in comparisons to distinguish x86). + */ +#define LIBXSMM_TARGET_ARCH_UNKNOWN 0 +#define LIBXSMM_TARGET_ARCH_GENERIC 1 +#define LIBXSMM_X86_GENERIC 1002 +#define LIBXSMM_X86_SSE3 1003 +#define LIBXSMM_X86_SSE42 1004 +#define LIBXSMM_X86_AVX 1005 +#define LIBXSMM_X86_AVX2 1006 +#define LIBXSMM_X86_AVX512 1007 +#define LIBXSMM_X86_AVX512_MIC 1010 /* KNL */ +#define LIBXSMM_X86_AVX512_KNM 1011 +#define LIBXSMM_X86_AVX512_CORE 1020 /* SKX */ +#define LIBXSMM_X86_AVX512_CLX 1021 +#define LIBXSMM_X86_AVX512_CPX 1022 +#define LIBXSMM_X86_AVX512_SPR 1023 +#define LIBXSMM_X86_ALLFEAT 1999 +#define LIBXSMM_AARCH64_V81 2001 /* Baseline */ +#define LIBXSMM_AARCH64_V82 2002 /* A64FX minus SVE */ +#define LIBXSMM_AARCH64_A64FX 2100 /* SVE */ +#define LIBXSMM_AARCH64_ALLFEAT 2999 + +#if defined(LIBXSMM_PLATFORM_X86) +/** Zero-initialized structure; assumes conservative properties. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_cpuid_info { + int constant_tsc; /** Time stamp counter (TSC) is monotonic. */ + int has_context; /** Context switches are permitted. */ +} libxsmm_cpuid_info; +#else +typedef int libxsmm_cpuid_info; +#endif + +/** Returns the target architecture and instruction set extensions. */ +#if defined(__cplusplus) /* note: stay compatible with TF */ +LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_info* info = NULL); +LIBXSMM_API int libxsmm_cpuid_arm(libxsmm_cpuid_info* info = NULL); +#else +LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_info* info); +LIBXSMM_API int libxsmm_cpuid_arm(libxsmm_cpuid_info* info); +#endif + +/** + * Similar to libxsmm_cpuid_x86, but conceptually not x86-specific. + * The actual code path (as used by LIBXSMM) is determined by + * libxsmm_[get|set]_target_archid/libxsmm_[get|set]_target_arch.
+ */ +LIBXSMM_API int libxsmm_cpuid(void); + +/** Names the CPU architecture given by CPUID. */ +LIBXSMM_API const char* libxsmm_cpuid_name(int id); + +/** SIMD vector length (VLEN) in 32-bit elements. */ +LIBXSMM_API int libxsmm_cpuid_vlen32(int id); + +#endif /*LIBXSMM_CPUID_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn.h b/third_party/libxsmm/include/libxsmm_dnn.h new file mode 100644 index 00000000..c100cbc9 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn.h @@ -0,0 +1,132 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_H +#define LIBXSMM_DNN_H + +#include "libxsmm_typedefs.h" + +typedef unsigned int libxsmm_dnn_err_t; + +/** Define error and warning codes */ +#define LIBXSMM_DNN_SUCCESS 0 + +#define LIBXSMM_DNN_WARN_FALLBACK 90000 +#define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING 90001 +#define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING 90002 +#define LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING 90003 +#define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING 90004 +#define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING 90005 +#define LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING 90006 + +#define LIBXSMM_DNN_ERR_GENERAL 100000 +#define LIBXSMM_DNN_ERR_CREATE_HANDLE 100001 +#define LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE 100002 +#define LIBXSMM_DNN_ERR_INVALID_BLOCKING 100003 +#define LIBXSMM_DNN_ERR_INVALID_HANDLE 100004 +#define LIBXSMM_DNN_ERR_DATA_NOT_BOUND 100005 +#define LIBXSMM_DNN_ERR_CREATE_TENSOR 100006 +#define 
LIBXSMM_DNN_ERR_INVALID_TENSOR 100007 +#define LIBXSMM_DNN_ERR_MISMATCH_TENSOR 100008 +#define LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR 100009 +#define LIBXSMM_DNN_ERR_INVALID_KIND 100010 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW 100011 +#define LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT 100012 +#define LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT 100013 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE 100014 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS 100015 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL 100016 +#define LIBXSMM_DNN_ERR_CREATE_LAYOUT 100017 +#define LIBXSMM_DNN_ERR_INVALID_LAYOUT 100018 +#define LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH 100019 +#define LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED 100020 +#define LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE 100021 +#define LIBXSMM_DNN_ERR_INVALID_ALGO 100022 +#define LIBXSMM_DNN_ERR_INVALID_PADDING 100023 +#define LIBXSMM_DNN_ERR_UNKNOWN_BIAS_TYPE 100024 +#define LIBXSMM_DNN_ERR_MISMATCH_BIAS 100025 +#define LIBXSMM_DNN_ERR_INVALID_HANDLE_BIAS 100026 +#define LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL 100027 +#define LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS 100028 +#define LIBXSMM_DNN_ERR_NOT_IMPLEMENTED 100029 +#define LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER 100030 +#define LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION 100031 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN 100032 +#define LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING 100033 +#define LIBXSMM_DNN_ERR_INVALID_FORMAT_FC 100034 +#define LIBXSMM_DNN_ERR_INVALID_RNN_TYPE 100035 +#define LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN 100036 +#define LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER 100037 +#define LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION 100038 +#define LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION 100039 + +/** Kinds of supported compute flavor operations. */ +typedef enum libxsmm_dnn_compute_kind { + /** Forward path */ + LIBXSMM_DNN_COMPUTE_KIND_FWD, + /** Backward path */ + LIBXSMM_DNN_COMPUTE_KIND_BWD, + /** Updated weights. 
*/ + LIBXSMM_DNN_COMPUTE_KIND_UPD, + /** Backward and weight update combined, useful for RNNs */ + LIBXSMM_DNN_COMPUTE_KIND_BWDUPD, + /** All routines; needed for some init routines. */ + LIBXSMM_DNN_COMPUTE_KIND_ALL +} libxsmm_dnn_compute_kind; + +/** these are some quantization definitions, not sure if we want to + move them into some main part of LIBXSMM */ +/* @TODO check position of these declarations and defines */ +typedef union LIBXSMM_RETARGETABLE libxsmm_intfloat { + unsigned int ui; + float f; +} libxsmm_intfloat; + +/* F32 masking defines; NOTE: the sign-mask macro below is spelled LIBXSNN (sic), not LIBXSMM */ +#define LIBXSNN_DNN_MASK_SIGN_F32 0x80000000 +#define LIBXSMM_DNN_MASK_EXP_F32 0x7f800000 +#define LIBXSMM_DNN_MASK_MANT_F32 0x007fffff +#define LIBXSMM_DNN_MASK_ABS_F32 0x7fffffff +#define LIBXSMM_DNN_MASK_FULL_F32 0xffffffff +#define LIBXSMM_DNN_MANT_SZ_F32 23 +#define LIBXSMM_DNN_SZ_F32 32 + +/* DFP16 masking defines; NOTE: the RES macro below is spelled LIXSMMM (sic), not LIBXSMM */ +#define LIBXSMM_DNN_MANT_DFP16 15 +#define LIXSMMM_DNN_RES_DFP16 libxsmm_sexp2_i8i(-(LIBXSMM_DNN_MANT_DFP16)) + +/* Quantization Rounding Defines */ +#define LIBXSMM_DNN_QUANT_NO_ROUND 80000 +#define LIBXSMM_DNN_QUANT_BIAS_ROUND 80001 +#define LIBXSMM_DNN_QUANT_STOCH_ROUND 80002 +#define LIBXSMM_DNN_QUANT_NEAREST_ROUND 80003 +#define LIBXSMM_DNN_QUANT_FPHW_ROUND 80004 + +/** get string of error code */ +LIBXSMM_API const char* libxsmm_dnn_get_error(libxsmm_dnn_err_t code); +LIBXSMM_API size_t libxsmm_dnn_typesize(libxsmm_dnn_datatype datatype); +LIBXSMM_API size_t libxsmm_dnn_get_simd_width(libxsmm_dnn_datatype datatype); + +/** some quantization helper functions, + @TODO need to be integrated better for all different ways of quantizations */ +LIBXSMM_API void libxsmm_dnn_quantize( float* in_buffer, short* out_buffer, int length, unsigned char add_shift, unsigned char* scf, int round_mode ); +LIBXSMM_API void libxsmm_dnn_quantize_act( float* in_buffer, short* out_buffer, unsigned int N, unsigned int C, unsigned int H, unsigned int W, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int 
lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ); +LIBXSMM_API void libxsmm_dnn_quantize_fil( float* in_buffer, short* out_buffer, unsigned int K, unsigned int C, unsigned int R, unsigned int S, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int kblk_f32, unsigned int kblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ); +LIBXSMM_API void libxsmm_dnn_dequantize( short* in_buffer, float* out_buffer, int length, unsigned char scf ); + +/** some BF16<->FP32 conversion functions + @TODO we need to find a final place for those */ +LIBXSMM_API void libxsmm_truncate_convert_f32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int length); +LIBXSMM_API void libxsmm_rnaz_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len); +LIBXSMM_API void libxsmm_rne_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len); +LIBXSMM_API void libxsmm_convert_bf16_f32(const libxsmm_bfloat16* in, float* out, unsigned int length); + +#endif /*LIBXSMM_DNN_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_convolution.h b/third_party/libxsmm/include/libxsmm_dnn_convolution.h new file mode 100644 index 00000000..0c956546 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_convolution.h @@ -0,0 +1,93 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_CONVOLUTION_H +#define LIBXSMM_DNN_CONVOLUTION_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" +#include "libxsmm_dnn_fusedbatchnorm.h" + +/** Opaque handles which represents convolutions and LIBXSMM datatypes */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_layer libxsmm_dnn_layer; + +typedef enum libxsmm_dnn_conv_fuse_op { + /* we fuse nothing into convolution */ + LIBXSMM_DNN_CONV_FUSE_NONE = 0 +} libxsmm_dnn_conv_fuse_op; + +/** Type of algorithm used for convolutions. */ +typedef enum libxsmm_dnn_conv_algo { + /** let the library decide */ + LIBXSMM_DNN_CONV_ALGO_AUTO, + /** direct convolution. */ + LIBXSMM_DNN_CONV_ALGO_DIRECT +} libxsmm_dnn_conv_algo; + +/** Structure which describes the input and output of data (DNN). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_conv_desc { + int N; /* number of images in mini-batch */ + int C; /* number of input feature maps */ + int H; /* height of input image */ + int W; /* width of input image */ + int K; /* number of output feature maps */ + int R; /* height of filter kernel */ + int S; /* width of filter kernel */ + int u; /* vertical stride */ + int v; /* horizontal stride */ + int pad_h; /* height of logical rim padding to input + for adjusting output height */ + int pad_w; /* width of logical rim padding to input + for adjusting output width */ + int pad_h_in; /* height of zero-padding in input buffer, + must equal to pad_h for direct conv */ + int pad_w_in; /* width of zero-padding in input buffer, + must equal to pad_w for direct conv */ + int pad_h_out; /* height of zero-padding in output buffer */ + int pad_w_out; /* width of zero-padding in output buffer */ + int threads; /* number of threads to use when running + convolution */ + libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ + libxsmm_dnn_datatype datatype_out; /* 
datatypes used for all output related buffer */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ + libxsmm_dnn_conv_algo algo; /* convolution algorithm used */ + libxsmm_dnn_conv_option options; /* additional options */ + libxsmm_dnn_conv_fuse_op fuse_ops; /* ops fused into convolutions */ +} libxsmm_dnn_conv_desc; + +/** Create a layer handle (non-NULL if successful), and pre-build all JIT-code versions. */ +LIBXSMM_API libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer(libxsmm_dnn_conv_desc conv_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer(const libxsmm_dnn_layer* handle); + +/** get layout description of buffers and filters from handle */ +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_create_tensor_datalayout(const libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +/** scratch pad management */ +LIBXSMM_API size_t libxsmm_dnn_get_scratch_size(const libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind); + +/** Bind/Release buffers, filters and bias to layer operation */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_get_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type); + +/** Run the layer identified by the handle; may use threads 
internally. */ +LIBXSMM_API void libxsmm_dnn_execute(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_execute_st(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +/** some helper functions for framework integration */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_filter(const libxsmm_dnn_layer* handle); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_bf16_filter(const libxsmm_dnn_layer* handle); + +#endif /*LIBXSMM_DNN_CONVOLUTION_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_fullyconnected.h b/third_party/libxsmm/include/libxsmm_dnn_fullyconnected.h new file mode 100644 index 00000000..4dd480e4 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_fullyconnected.h @@ -0,0 +1,65 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_FULLYCONNECTED_H +#define LIBXSMM_DNN_FULLYCONNECTED_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM fullyconnected */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected libxsmm_dnn_fullyconnected; + +typedef enum libxsmm_dnn_fullyconnected_fuse_op { + /* the fuse order is: 1. BIAS, 2. 
Activation */ + LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE = 0, + LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS = 1, + LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU = 2, + LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID = 4, + LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU = 3, + LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID = 5 +} libxsmm_dnn_fullyconnected_fuse_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected_desc { + int N; /* number of images in mini-batch */ + int C; /* number of input feature maps */ + int K; /* number of output feature maps */ + int bn; + int bk; + int bc; + int threads; /* number of threads used */ + int compressed_A; + int sparsity_factor_A; + libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */ + libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ + libxsmm_dnn_fullyconnected_fuse_op fuse_ops; /* fused operations */ +} libxsmm_dnn_fullyconnected_desc; + +LIBXSMM_API libxsmm_dnn_fullyconnected* libxsmm_dnn_create_fullyconnected(libxsmm_dnn_fullyconnected_desc fullyconnected_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fullyconnected(const libxsmm_dnn_fullyconnected* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fullyconnected_create_tensor_datalayout(const libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API void* libxsmm_dnn_fullyconnected_get_scratch_ptr (const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API size_t libxsmm_dnn_fullyconnected_get_scratch_size(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_scratch(libxsmm_dnn_fullyconnected* handle, const void* scratch); 
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_scratch(libxsmm_dnn_fullyconnected* handle); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fullyconnected_get_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_execute_st(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_FULLYCONNECTED_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_fusedbatchnorm.h b/third_party/libxsmm/include/libxsmm_dnn_fusedbatchnorm.h new file mode 100644 index 00000000..e94a36a7 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_fusedbatchnorm.h @@ -0,0 +1,39 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_FUSEDBATCHNORM_H +#define LIBXSMM_DNN_FUSEDBATCHNORM_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM fusedbatchnorm */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm libxsmm_dnn_fusedbatchnorm; + +LIBXSMM_API libxsmm_dnn_fusedbatchnorm* libxsmm_dnn_create_fusedbatchnorm(libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedbatchnorm(const libxsmm_dnn_fusedbatchnorm* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(const libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API size_t libxsmm_dnn_fusedbatchnorm_get_scratch_size(const libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_scratch(libxsmm_dnn_fusedbatchnorm* handle, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_scratch(libxsmm_dnn_fusedbatchnorm* handle); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedbatchnorm_get_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); +LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_fusedbatchnorm_reduce_stats_st(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_FUSEDBATCHNORM_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h b/third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h new file mode 100644 index 00000000..6d1d90a6 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_fusedgroupnorm.h @@ -0,0 +1,39 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_FUSEDGROUPNORM_H +#define LIBXSMM_DNN_FUSEDGROUPNORM_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM fusedgroupnorm */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm libxsmm_dnn_fusedgroupnorm; + +LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(const libxsmm_dnn_fusedgroupnorm* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API 
libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_scratch(libxsmm_dnn_fusedgroupnorm* handle, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(libxsmm_dnn_fusedgroupnorm* handle); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedgroupnorm_get_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_FUSEDGROUPNORM_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_optimizer.h b/third_party/libxsmm/include/libxsmm_dnn_optimizer.h new file mode 100644 index 00000000..cac46f40 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_optimizer.h @@ -0,0 +1,55 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_SGD_H +#define LIBXSMM_DNN_SGD_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM optimizer */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer libxsmm_dnn_optimizer; + +typedef enum libxsmm_dnn_optimizer_type { + LIBXSMM_DNN_OPTIMIZER_SGD = 1 +} libxsmm_dnn_optimizer_type; + + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer_desc { + int C; /* number of feature maps */ + int K; /* number of feature maps */ + int bc; + int bk; + float learning_rate; /* learning rate */ + int threads; /* number of threads used */ + libxsmm_dnn_optimizer_type opt_type; + libxsmm_dnn_datatype datatype_master; /* datatype used for all input related buffers */ + libxsmm_dnn_datatype datatype; /* datatype used for all input related buffers */ + libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ +} libxsmm_dnn_optimizer_desc; + +LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(const libxsmm_dnn_optimizer* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr (const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(libxsmm_dnn_optimizer* handle, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(libxsmm_dnn_optimizer* handle); + +LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_optimizer_bind_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_SGD_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_pooling.h b/third_party/libxsmm/include/libxsmm_dnn_pooling.h new file mode 100644 index 00000000..0a973664 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_pooling.h @@ -0,0 +1,65 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_POOLING_H +#define LIBXSMM_DNN_POOLING_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM pooling */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling libxsmm_dnn_pooling; + +typedef enum libxsmm_dnn_pooling_type { + LIBXSMM_DNN_POOLING_MAX = 1, + LIBXSMM_DNN_POOLING_AVG = 2 +} libxsmm_dnn_pooling_type; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling_desc { + int N; /* number of images in mini-batch */ + int C; /* number of input feature maps */ + int H; /* height of input image */ + int W; /* width of input image */ + int R; /* kernel height */ + int S; /* kernel width */ + int u; /* vertical stride */ + int v; /* horizontal stride */ + int pad_h; /* height of logical padding of input buffer */ + int pad_w; /* width of logical padding of input buffer */ + int pad_h_in; /* height of physical zero-padding in input buffer */ + int pad_w_in; /* width of physical zero-padding in input buffer */ + int pad_h_out; /* height of physical zero-padding in output buffer */ + int pad_w_out; /* width of physical zero-padding in output buffer */ + int threads; /* number of threads used */ + libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ + libxsmm_dnn_datatype datatype_out; /* datatypes used for all output related buffer */ + libxsmm_dnn_datatype datatype_mask; /* datatypes used for the masks */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_pooling_type pooling_type; /* type of pooling operation */ +} libxsmm_dnn_pooling_desc; + +LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(const libxsmm_dnn_pooling* handle); + +LIBXSMM_API 
libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(const libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(libxsmm_dnn_pooling* handle, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_scratch(libxsmm_dnn_pooling* handle); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_POOLING_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_rnncell.h b/third_party/libxsmm/include/libxsmm_dnn_rnncell.h new file mode 100644 index 00000000..c3402f9d --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_rnncell.h @@ -0,0 +1,79 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_RNNCELL_H +#define LIBXSMM_DNN_RNNCELL_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell libxsmm_dnn_rnncell; + +/** Type of RNN cell. */ +typedef enum libxsmm_dnn_rnncell_type { + /** simple RNN cell with ReLU as activation function */ + LIBXSMM_DNN_RNNCELL_RNN_RELU, + /** simple RNN cell with sigmoid as activation function */ + LIBXSMM_DNN_RNNCELL_RNN_SIGMOID, + /** simple RNN cell with tanh as activation function */ + LIBXSMM_DNN_RNNCELL_RNN_TANH, + /** LSTM cell */ + LIBXSMM_DNN_RNNCELL_LSTM, + /** GRU cell */ + LIBXSMM_DNN_RNNCELL_GRU +} libxsmm_dnn_rnncell_type; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell_desc { + int threads; + libxsmm_blasint K; /* number of outputs */ + libxsmm_blasint N; /* size of the minibatch */ + libxsmm_blasint C; /* number of inputs */ + libxsmm_blasint max_T; /* number of time steps */ + libxsmm_blasint bk; + libxsmm_blasint bn; + libxsmm_blasint bc; + int use_fwd_fused_impl; + int fwd_block; + int bwdupd_block; + libxsmm_dnn_rnncell_type cell_type; /* cell type RNN ReLU, RNN Sigmoid, RNN Tanh, LSTM, GRU */ + libxsmm_dnn_datatype datatype_in; /* datatypes used for all input related buffer */ + libxsmm_dnn_datatype datatype_out; /* datatypes used for all output related buffer */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_tensor_format filter_format; /* format which is for filter buffers */ +} libxsmm_dnn_rnncell_desc; + +LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(const libxsmm_dnn_rnncell* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(const 
libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API size_t libxsmm_dnn_rnncell_get_scratch_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); +LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind); + +LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status); +LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr (const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_dnn_rnncell* handle, const float forget_bias); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length( libxsmm_dnn_rnncell* handle, const libxsmm_blasint T ); +LIBXSMM_API libxsmm_blasint 
libxsmm_dnn_rnncell_get_sequence_length( libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status ); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_execute_st(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +#endif /*LIBXSMM_DNN_RNNCELL_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h b/third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h new file mode 100644 index 00000000..0e9b9f55 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_softmaxloss.h @@ -0,0 +1,51 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_SOFTMAXLOSS_H +#define LIBXSMM_DNN_SOFTMAXLOSS_H + +#include "libxsmm_dnn.h" +#include "libxsmm_dnn_tensor.h" + +/** Opaque handles which represents LIBXSMM softmaxloss */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss libxsmm_dnn_softmaxloss; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_softmaxloss_desc { + int N; /* number of images in mini-batch */ + int C; /* number of input feature maps */ + int bn; /* requested N blocking for NCNC format */ + int bc; /* requested C blocking for NCNC format */ + float loss_weight; /* loss weight */ + int threads; /* number of threads used */ + libxsmm_dnn_datatype datatype; /* datatype used for all buffers */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ +} libxsmm_dnn_softmaxloss_desc; + +LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(const libxsmm_dnn_softmaxloss* handle); + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_softmaxloss_create_tensor_datalayout(const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); + +LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr (const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(libxsmm_dnn_softmaxloss* handle, const void* scratch); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(libxsmm_dnn_softmaxloss* handle); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* 
tensor, const libxsmm_dnn_tensor_type type); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type); + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid); + +LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status); + +#endif /*LIBXSMM_DNN_SOFTMAXLOSS_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_dnn_tensor.h b/third_party/libxsmm/include/libxsmm_dnn_tensor.h new file mode 100644 index 00000000..c33185df --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_dnn_tensor.h @@ -0,0 +1,199 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_TENSOR_H +#define LIBXSMM_DNN_TENSOR_H + +#include "libxsmm_typedefs.h" +#include "libxsmm_dnn.h" + +/** Opaque handle which represents a LIBXSMM DNN tensor */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor libxsmm_dnn_tensor; + +typedef enum libxsmm_dnn_tensor_dimtype { + /** Mini-batch */ + LIBXSMM_DNN_TENSOR_DIMTYPE_N, + /** Image Height */ + LIBXSMM_DNN_TENSOR_DIMTYPE_H, + /** Image Width */ + LIBXSMM_DNN_TENSOR_DIMTYPE_W, + /** channels or input channels */ + LIBXSMM_DNN_TENSOR_DIMTYPE_C, + /** output channels */ + LIBXSMM_DNN_TENSOR_DIMTYPE_K, + /** kernel height */ + LIBXSMM_DNN_TENSOR_DIMTYPE_R, + /** kernel width */ + LIBXSMM_DNN_TENSOR_DIMTYPE_S, + /** sequence length counter */ + LIBXSMM_DNN_TENSOR_DIMTYPE_T, + /** channel group counter */ + LIBXSMM_DNN_TENSOR_DIMTYPE_G, + /** general counter */ + LIBXSMM_DNN_TENSOR_DIMTYPE_X +} libxsmm_dnn_tensor_dimtype; + +/** types of different buffers */ +typedef enum libxsmm_dnn_tensor_type { + /** regular input buffer */ + LIBXSMM_DNN_REGULAR_INPUT, + /** regular input buffer, add */ + LIBXSMM_DNN_REGULAR_INPUT_ADD, + /** regular input buffer, transpose */ + LIBXSMM_DNN_REGULAR_INPUT_TRANS, + /** gradient input buffer */ + LIBXSMM_DNN_GRADIENT_INPUT, + /** gradient input buffer, add */ + LIBXSMM_DNN_GRADIENT_INPUT_ADD, + /** regular output buffer */ + LIBXSMM_DNN_REGULAR_OUTPUT, + /** gradient output buffer */ + LIBXSMM_DNN_GRADIENT_OUTPUT, + /** general input type */ + LIBXSMM_DNN_INPUT, + /** general output type */ + LIBXSMM_DNN_OUTPUT, + /** general activation type */ + LIBXSMM_DNN_ACTIVATION, + /* regular filter */ + LIBXSMM_DNN_REGULAR_FILTER, + /* regular filter, transpose */ + LIBXSMM_DNN_REGULAR_FILTER_TRANS, + /* gradient filter */ + LIBXSMM_DNN_GRADIENT_FILTER, + /* master filter */ + LIBXSMM_DNN_MASTER_FILTER, + /** general filter type */ + LIBXSMM_DNN_FILTER, + /* regular bias */ +
LIBXSMM_DNN_REGULAR_CHANNEL_BIAS, + /* gradient bias */ + LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS, + /* bias */ + LIBXSMM_DNN_CHANNEL_BIAS, + /* regular beta */ + LIBXSMM_DNN_REGULAR_CHANNEL_BETA, + /* gradient beta */ + LIBXSMM_DNN_GRADIENT_CHANNEL_BETA, + /* beta */ + LIBXSMM_DNN_CHANNEL_BETA, + /* regular gamma */ + LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA, + /* gradient gamma */ + LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA, + /* Gamma */ + LIBXSMM_DNN_CHANNEL_GAMMA, + /* expectation value (presumably the per-channel mean; confirm) */ + LIBXSMM_DNN_CHANNEL_EXPECTVAL, + /* reciprocal standard deviation (presumably per channel; confirm) */ + LIBXSMM_DNN_CHANNEL_RCPSTDDEV, + /* variance */ + LIBXSMM_DNN_CHANNEL_VARIANCE, + /** general bias type */ + LIBXSMM_DNN_CHANNEL_SCALAR, + /** Labels */ + LIBXSMM_DNN_LABEL, + /** batch stats */ + LIBXSMM_DNN_BATCH_STATS, + LIBXSMM_DNN_MAX_STATS_FWD, + LIBXSMM_DNN_MAX_STATS_BWD, + LIBXSMM_DNN_MAX_STATS_UPD, + /** pooling mask */ + LIBXSMM_DNN_POOLING_MASK, + /** ReLU mask */ + LIBXSMM_DNN_RELU_MASK, + /** general type, if needed might cause API issues in copy in/out API */ + LIBXSMM_DNN_TENSOR, + + /** regular input buffer */ + LIBXSMM_DNN_RNN_REGULAR_INPUT, + /** regular previous cell state buffer */ + LIBXSMM_DNN_RNN_REGULAR_CS_PREV, + /** regular previous hidden state buffer */ + LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV, + /** regular weight (LSTM: wi, wc, wf, wo) */ + LIBXSMM_DNN_RNN_REGULAR_WEIGHT, + /** regular recurrent weight (LSTM: ri, rc, rf, ro) */ + LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT, + /** regular weight, transpose (LSTM: wi, wc, wf, wo) */ + LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS, + /** regular recurrent weight, transpose (LSTM: ri, rc, rf, ro) */ + LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS, + /** regular bias (LSTM: bi, bc, bf, bo) */ + LIBXSMM_DNN_RNN_REGULAR_BIAS, + /** regular output cell state buffer */ + LIBXSMM_DNN_RNN_REGULAR_CS, + /** regular hidden state buffer */ + LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE, + /** gradient input buffer */ + LIBXSMM_DNN_RNN_GRADIENT_INPUT, + /** gradient previous cell state buffer */ +
LIBXSMM_DNN_RNN_GRADIENT_CS_PREV, + /** gradient previous hidden state buffer */ + LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV, + /** gradient weight */ + LIBXSMM_DNN_RNN_GRADIENT_WEIGHT, + /** gradient recurrent weight */ + LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT, + /** gradient bias */ + LIBXSMM_DNN_RNN_GRADIENT_BIAS, + /** gradient output cell state buffer */ + LIBXSMM_DNN_RNN_GRADIENT_CS, + /** gradient hidden state buffer */ + LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE, + /** internal i buffer */ + LIBXSMM_DNN_RNN_INTERNAL_I, + /** internal f buffer */ + LIBXSMM_DNN_RNN_INTERNAL_F, + /** internal o buffer */ + LIBXSMM_DNN_RNN_INTERNAL_O, + /** internal ci buffer */ + LIBXSMM_DNN_RNN_INTERNAL_CI, + /** internal co buffer */ + LIBXSMM_DNN_RNN_INTERNAL_CO +} libxsmm_dnn_tensor_type; + +/** layout descriptor to allow external data handling + outside of LIBXSMM */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor_datalayout { + libxsmm_dnn_tensor_dimtype* dim_type; + unsigned int* dim_size; + unsigned int num_dims; + libxsmm_dnn_tensor_format format; /* format of activation buffer */ + libxsmm_dnn_datatype datatype; /* data type */ + libxsmm_dnn_tensor_type tensor_type; /* tensor type */ +} libxsmm_dnn_tensor_datalayout; + +/** tensorlayout handling */ +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(libxsmm_dnn_tensor_datalayout* layout); +LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b, libxsmm_dnn_err_t* status); +LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); +LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status); +
+/** Create and manage buffers, filters and bias (non-NULL if successful) */ +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_tensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char exp, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_tensor* tensor, const void* data); +LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); +LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_qtensor_scf(libxsmm_dnn_tensor* tensor, const unsigned char scf); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(const libxsmm_dnn_tensor* tensor); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(const libxsmm_dnn_tensor* tensor); + +/** + * Copy-in/out from a plain format such as [N][C][H][W] or [N][H][W][C] + */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format); +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format); + +#endif /*LIBXSMM_DNN_TENSOR_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_frontend.h b/third_party/libxsmm/include/libxsmm_frontend.h new file mode 100644 index 00000000..afb98498 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_frontend.h @@ -0,0 +1,590 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved.
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_FRONTEND_H +#define LIBXSMM_FRONTEND_H + +#include "libxsmm_typedefs.h" + +/** Helper macros for eliding prefetch address calculations depending on prefetch scheme. */ +#if !defined(_WIN32) && !defined(__CYGWIN__) /* TODO: fully support calling convention */ +#if 0 != ((LIBXSMM_PREFETCH) & 2/*AL2*/) \ + || 0 != ((LIBXSMM_PREFETCH) & 8/*AL2_AHEAD*/) +# define LIBXSMM_GEMM_PREFETCH_A(EXPR) (EXPR) +#endif +#if 0 != ((LIBXSMM_PREFETCH) & 4/*BL2_VIA_C*/) \ + || 0 != ((LIBXSMM_PREFETCH) & 16/*BL1*/) +# define LIBXSMM_GEMM_PREFETCH_B(EXPR) (EXPR) +#endif +#endif +/** Secondary helper macros derived from the above group. */ +#if defined(LIBXSMM_GEMM_PREFETCH_A) +# define LIBXSMM_NOPREFETCH_A(EXPR) +#else +# define LIBXSMM_NOPREFETCH_A(EXPR) EXPR +# define LIBXSMM_GEMM_PREFETCH_A(EXPR) 0 +#endif +#if defined(LIBXSMM_GEMM_PREFETCH_B) +# define LIBXSMM_NOPREFETCH_B(EXPR) +#else +# define LIBXSMM_NOPREFETCH_B(EXPR) EXPR +# define LIBXSMM_GEMM_PREFETCH_B(EXPR) 0 +#endif +#if defined(LIBXSMM_GEMM_PREFETCH_C) +# define LIBXSMM_NOPREFETCH_C(EXPR) +#else +# define LIBXSMM_NOPREFETCH_C(EXPR) EXPR +# define LIBXSMM_GEMM_PREFETCH_C(EXPR) 0 +#endif + +/** MKL_DIRECT_CALL requires to include the MKL interface. */ +#if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) || \ + (defined(__MKL) && !defined(LIBXSMM_BUILD) && \ + (!defined(__BLAS) || (0 != __BLAS)))) +# if (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64)) +# error "Inconsistent ILP64 configuration detected!" 
+# endif +# if defined(LIBXSMM_OFFLOAD_BUILD) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +# include <mkl.h> +# pragma offload_attribute(pop) +# else +# include <mkl.h> +# endif +#endif +/** __INTEL_MKL__ is needed later to fix some NOTHROW issue. */ +#if defined(__MKL) && !defined(__INTEL_MKL__) && defined(NOTHROW) +# if defined(LIBXSMM_OFFLOAD_BUILD) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +# include <mkl_version.h> +# pragma offload_attribute(pop) +# else +# include <mkl_version.h> +# endif +#endif + +/** Unfortunately calculation of INTEL_MKL_VERSION is not stable over time. */ +#if defined(__INTEL_MKL__) && defined(__INTEL_MKL_MINOR__) && defined(__INTEL_MKL_UPDATE__) +# define LIBXSMM_MKL_VERSION3 LIBXSMM_VERSION3(__INTEL_MKL__, __INTEL_MKL_MINOR__, __INTEL_MKL_UPDATE__) +#endif + +/** Automatically select a prefetch-strategy (libxsmm_get_gemm_xprefetch, etc.). */ +#define LIBXSMM_PREFETCH_AUTO -1 + +/** Append "_omp" postfix to the given symbol. */ +#define LIBXSMM_USEOMP(FUNCTION) LIBXSMM_CONCATENATE(FUNCTION, _omp) + +/** Helper macro for BLAS-style prefixes. */ +#define LIBXSMM_TPREFIX_NAME(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_, TYPE) +#define LIBXSMM_TPREFIX(TYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_TPREFIX_NAME(TYPE), FUNCTION) +#define LIBXSMM_TPREFIX_doubledouble d +#define LIBXSMM_TPREFIX_floatfloat s +#define LIBXSMM_TPREFIX_shortfloat ws +#define LIBXSMM_TPREFIX_shortint wi +#define LIBXSMM_TPREFIX_libxsmm_bfloat16float bs +/** Defaults if only the input type is specified.
*/ +#define LIBXSMM_TPREFIX_double LIBXSMM_TPREFIX_doubledouble +#define LIBXSMM_TPREFIX_float LIBXSMM_TPREFIX_floatfloat +#define LIBXSMM_TPREFIX_short LIBXSMM_TPREFIX_shortint + +#define LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_XFLAGS_, ITYPE) /* ignore OTYPE for now */ +#define LIBXSMM_GEMM_XFLAGS_double 0 +#define LIBXSMM_GEMM_XFLAGS_float 0 +#define LIBXSMM_GEMM_XFLAGS_libxsmm_bfloat16 LIBXSMM_GEMM_FLAG_VNNI_A +#define LIBXSMM_GEMM_XFLAGS_int 0 +#define LIBXSMM_GEMM_XFLAGS_short 0 + +/** Construct symbol name from a given real type name (float, double and short). */ +#define LIBXSMM_BLAS_FNTYPE(TYPE, KIND) LIBXSMM_CONCATENATE3(libxsmm_, LIBXSMM_TPREFIX(TYPE, KIND), _function) +#define LIBXSMM_MMFUNCTION_TYPE(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmfunction)) +#define LIBXSMM_MMDISPATCH_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, mmdispatch)) +#define LIBXSMM_XBLAS_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_blas_, LIBXSMM_TPREFIX(TYPE, gemm)) +#define LIBXSMM_XGEMM_SYMBOL(TYPE) LIBXSMM_CONCATENATE(libxsmm_, LIBXSMM_TPREFIX(TYPE, gemm)) +#define LIBXSMM_YGEMM_SYMBOL(TYPE) LIBXSMM_USEOMP(LIBXSMM_XGEMM_SYMBOL(TYPE)) +#define LIBXSMM_BLAS_SYMBOL(TYPE, KIND) LIBXSMM_FSYMBOL(LIBXSMM_TPREFIX(TYPE, KIND)) +#define LIBXSMM_CBLAS_SYMBOL LIBXSMM_TPREFIX + +#define LIBXSMM_BLAS_DECL(TYPE, KIND, DECL) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_, LIBXSMM_TPREFIX(TYPE, KIND))(DECL) +#if !defined(MKL_DIRECT_CALL_SEQ) && !defined(MKL_DIRECT_CALL) +# define LIBXSMM_BLAS_dgemm(DECL) DECL; +# define LIBXSMM_BLAS_sgemm(DECL) DECL; +# define LIBXSMM_BLAS_dgemv(DECL) DECL; +# define LIBXSMM_BLAS_sgemv(DECL) DECL; +#else +# define LIBXSMM_BLAS_dgemm +# define LIBXSMM_BLAS_sgemm +# define LIBXSMM_BLAS_dgemv +# define LIBXSMM_BLAS_sgemv +#endif + +/* Construct prefix names, function type or dispatch function from given input and output types. 
*/ +#define LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) LIBXSMM_MMFUNCTION_TYPE(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) +#define LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE) LIBXSMM_MMDISPATCH_SYMBOL(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) +#define LIBXSMM_TPREFIX_NAME2(ITYPE, OTYPE) LIBXSMM_TPREFIX_NAME(LIBXSMM_CONCATENATE(ITYPE, OTYPE)) +#define LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION) LIBXSMM_TPREFIX(LIBXSMM_CONCATENATE(ITYPE, OTYPE), FUNCTION) + +/** Helper macro for comparing selected types. */ +#define LIBXSMM_EQUAL(T1, T2) LIBXSMM_CONCATENATE3(LIBXSMM_EQUAL_, T1, T2) +#define LIBXSMM_EQUAL_floatfloat 1 +#define LIBXSMM_EQUAL_doubledouble 1 +#define LIBXSMM_EQUAL_floatdouble 0 +#define LIBXSMM_EQUAL_doublefloat 0 +#define LIBXSMM_EQUAL_shortdouble 0 +#define LIBXSMM_EQUAL_shortfloat 0 + +#if defined(LIBXSMM_BLAS_CONST) +# undef LIBXSMM_BLAS_CONST +# define LIBXSMM_BLAS_CONST const +#elif defined(OPENBLAS_CONST) +# define LIBXSMM_BLAS_CONST OPENBLAS_CONST +#elif defined(LIBXSMM_BLAS_NONCONST) || defined(__OPENBLAS) || defined(__OPENBLAS77) +# define LIBXSMM_BLAS_CONST +#else +# define LIBXSMM_BLAS_CONST const +#endif + +#if !defined(LIBXSMM_NO_BLAS) +# if (!defined(__BLAS) || (0 != __BLAS)) +# define LIBXSMM_NO_BLAS 0 +# define LIBXSMM_BLAS 1 +# else +# define LIBXSMM_NO_BLAS 1 +# define LIBXSMM_BLAS 0 +# endif +#endif + +#if defined(__BLAS) && (1 == __BLAS) +# if defined(__OPENBLAS) + LIBXSMM_EXTERN void openblas_set_num_threads(int num_threads); +# define LIBXSMM_BLAS_INIT openblas_set_num_threads(1); +# endif +#endif +#if !defined(LIBXSMM_BLAS_INIT) +# define LIBXSMM_BLAS_INIT +#endif + +#if defined(LIBXSMM_BUILD) +# if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC) +# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_APIEXT +# elif defined(LIBXSMM_NO_BLAS) && (1 == LIBXSMM_NO_BLAS) +# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_API +# endif +#endif +#if !defined(LIBXSMM_BLAS_SYMBOL_VISIBILITY) +# define LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_EXTERN LIBXSMM_VISIBILITY_IMPORT 
LIBXSMM_RETARGETABLE +#endif + +#if defined(NOTHROW) +# define LIBXSMM_BLAS_NOTHROW NOTHROW +#else +# define LIBXSMM_BLAS_NOTHROW LIBXSMM_NOEXCEPT +#endif +#define LIBXSMM_BLAS_NOEXCEPT(KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_NOEXCEPT_, KIND) +#if defined(LIBXSMM_MKL_VERSION3) && (LIBXSMM_VERSION3(2020, 0, 2) <= LIBXSMM_MKL_VERSION3) +# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch LIBXSMM_BLAS_NOTHROW +#else +# define LIBXSMM_BLAS_NOEXCEPT_gemm_batch +#endif +#define LIBXSMM_BLAS_NOEXCEPT_gemm LIBXSMM_BLAS_NOTHROW +#define LIBXSMM_BLAS_NOEXCEPT_gemv LIBXSMM_BLAS_NOTHROW + +#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm_batch(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \ + libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \ + TYPE CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR STAR, libxsmm_blasint CONST_STAR, \ + TYPE CONST_STAR, TYPE STAR STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR +#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemm(CONST_STAR, STAR, TYPE) char CONST_STAR, char CONST_STAR, \ + libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \ + TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR +#define LIBXSMM_BLAS_SYMBOL_SIGNATURE_gemv(CONST_STAR, STAR, TYPE) char CONST_STAR, libxsmm_blasint CONST_STAR, libxsmm_blasint CONST_STAR, \ + TYPE CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, TYPE CONST_STAR, libxsmm_blasint CONST_STAR, \ + TYPE CONST_STAR, TYPE STAR, libxsmm_blasint CONST_STAR +#define LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_SYMBOL_SIGNATURE_, KIND)(CONST_STAR, STAR, TYPE) +#define LIBXSMM_BLAS_SYMBOL_FDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \ + void LIBXSMM_BLAS_SYMBOL(TYPE, 
KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND) +#define LIBXSMM_BLAS_SYMBOL_CDECL(CONST_STAR, STAR, TYPE, KIND) LIBXSMM_BLAS_SYMBOL_VISIBILITY \ + void LIBXSMM_CBLAS_SYMBOL(TYPE, KIND)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(CONST_STAR, STAR, TYPE, KIND)) LIBXSMM_BLAS_NOEXCEPT(KIND) + +#if (0 != LIBXSMM_BLAS) /* BLAS available */ +# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) LIBXSMM_BLAS_DECL(TYPE, KIND, LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, TYPE, KIND)) +#else +# define LIBXSMM_BLAS_SYMBOL_DECL(TYPE, KIND) +#endif + +/** Helper macro consolidating the transpose requests into a set of flags. */ +#define LIBXSMM_GEMM_FLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \ + ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ + | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B)) + +/** Helper macro consolidating CBLAS transpose requests into a set of flags. */ +#define LIBXSMM_GEMM_CFLAGS(TRANSA, TRANSB) /* check for N/n rather than T/t since C/c is also valid! */ \ + ((CblasNoTrans == (TRANSA) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ + | (CblasNoTrans == (TRANSB) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B)) + +/** Helper macro consolidating the transpose requests into a set of flags. */ +#define LIBXSMM_GEMM_VNNI_FLAGS(TRANSA, TRANSB, VNNIA, VNNIB) /* check for N/n rather than T/t since C/c is also valid! */ \ + ((('n' == (TRANSA) || *"N" == (TRANSA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_A) \ + | (('n' == (TRANSB) || *"N" == (TRANSB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_TRANS_B) \ + | (('n' == (VNNIA) || *"N" == (VNNIA)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_A) \ + | (('n' == (VNNIB) || *"N" == (VNNIB)) ? LIBXSMM_GEMM_FLAG_NONE : LIBXSMM_GEMM_FLAG_VNNI_B)) + +/** Helper macro allowing NULL-requests (transposes) supplied by some default. 
*/ +#define LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, DEFAULT) LIBXSMM_GEMM_FLAGS( \ + NULL != ((const void*)(TRANSA)) ? (*(const char*)(TRANSA)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (DEFAULT)) ? 'n' : 't'), \ + NULL != ((const void*)(TRANSB)) ? (*(const char*)(TRANSB)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (DEFAULT)) ? 'n' : 't')) \ + | (~(LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B) & (DEFAULT)) + +/** Inlinable GEMM exercising the compiler's code generation (macro template). TODO: only NN is supported and SP/DP matrices. */ +#define LIBXSMM_INLINE_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ + /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ + const char libxsmm_inline_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \ + (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \ + const char libxsmm_inline_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \ + (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \ + const libxsmm_blasint libxsmm_inline_xgemm_m_ = *(const libxsmm_blasint*)(M); /* must be specified */ \ + const libxsmm_blasint libxsmm_inline_xgemm_k_ = (NULL != ((void*)(K)) ? (*(const libxsmm_blasint*)(K)) : libxsmm_inline_xgemm_m_); \ + const libxsmm_blasint libxsmm_inline_xgemm_n_ = (NULL != ((void*)(N)) ? (*(const libxsmm_blasint*)(N)) : libxsmm_inline_xgemm_k_); \ + const libxsmm_blasint libxsmm_inline_xgemm_lda_ = (NULL != ((void*)(LDA)) ? (*(const libxsmm_blasint*)(LDA)) : \ + (('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_) ? libxsmm_inline_xgemm_m_ : libxsmm_inline_xgemm_k_)); \ + const libxsmm_blasint libxsmm_inline_xgemm_ldb_ = (NULL != ((void*)(LDB)) ? (*(const libxsmm_blasint*)(LDB)) : \ + (('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_) ? 
libxsmm_inline_xgemm_k_ : libxsmm_inline_xgemm_n_)); \ + const libxsmm_blasint libxsmm_inline_xgemm_ldc_ = (NULL != ((void*)(LDC)) ? (*(const libxsmm_blasint*)(LDC)) : libxsmm_inline_xgemm_m_); \ + const OTYPE libxsmm_inline_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ + const OTYPE libxsmm_inline_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ + libxsmm_blasint libxsmm_inline_xgemm_ni_, libxsmm_inline_xgemm_mi_ = 0, libxsmm_inline_xgemm_ki_; /* loop induction variables */ \ + LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transa_ || *"N" == libxsmm_inline_xgemm_transa_); \ + LIBXSMM_ASSERT('n' == libxsmm_inline_xgemm_transb_ || *"N" == libxsmm_inline_xgemm_transb_); \ + LIBXSMM_PRAGMA_SIMD \ + for (libxsmm_inline_xgemm_mi_ = 0; libxsmm_inline_xgemm_mi_ < libxsmm_inline_xgemm_m_; ++libxsmm_inline_xgemm_mi_) { \ + LIBXSMM_PRAGMA_LOOP_COUNT(1, LIBXSMM_CONFIG_MAX_DIM, LIBXSMM_CONFIG_AVG_DIM) \ + for (libxsmm_inline_xgemm_ki_ = 0; libxsmm_inline_xgemm_ki_ < libxsmm_inline_xgemm_k_; ++libxsmm_inline_xgemm_ki_) { \ + LIBXSMM_PRAGMA_UNROLL \ + for (libxsmm_inline_xgemm_ni_ = 0; libxsmm_inline_xgemm_ni_ < libxsmm_inline_xgemm_n_; ++libxsmm_inline_xgemm_ni_) { \ + ((OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] \ + = ((const ITYPE*)(B))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldb_+libxsmm_inline_xgemm_ki_] * \ + (((const ITYPE*)(A))[libxsmm_inline_xgemm_ki_*libxsmm_inline_xgemm_lda_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_alpha_) \ + + ((const OTYPE*)(C))[libxsmm_inline_xgemm_ni_*libxsmm_inline_xgemm_ldc_+libxsmm_inline_xgemm_mi_] * libxsmm_inline_xgemm_beta_; \ + } \ + } \ + } \ +} + +#if (defined(LIBXSMM_INIT) || defined(LIBXSMM_CTOR)) +# undef LIBXSMM_INIT +# define LIBXSMM_INIT LIBXSMM_ASSERT_MSG(1 < libxsmm_ninit, "LIBXSMM is not initialized"); +# define LIBXSMM_INIT_COMPLETED +#else +# define LIBXSMM_INIT if (2 > 
libxsmm_ninit) libxsmm_init(); +#endif + +/** Map to appropriate BLAS function (or fallback). The mapping is used, e.g., inside of LIBXSMM_BLAS_XGEMM. */ +#define LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, FUNCTION) LIBXSMM_CONCATENATE(LIBXSMM_BLAS_FUNCTION_, LIBXSMM_TPREFIX2(ITYPE, OTYPE, FUNCTION)) +#if (0 != LIBXSMM_BLAS) /* Helper macro to eventually (if defined) call libxsmm_init */ +# if defined(LIBXSMM_INIT_COMPLETED) +# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch_function +# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch_function +# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm_function +# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm_function +# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv_function +# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv_function +# else +# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_original_dgemm_batch() +# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_original_sgemm_batch() +# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_original_dgemm() +# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_original_sgemm() +# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_original_dgemv() +# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_original_sgemv() +# endif +#else /* no BLAS */ +# define LIBXSMM_BLAS_FUNCTION_dgemm_batch libxsmm_blas_error("dgemm_batch") +# define LIBXSMM_BLAS_FUNCTION_sgemm_batch libxsmm_blas_error("sgemm_batch") +# define LIBXSMM_BLAS_FUNCTION_dgemm libxsmm_blas_error("dgemm") +# define LIBXSMM_BLAS_FUNCTION_sgemm libxsmm_blas_error("sgemm") +# define LIBXSMM_BLAS_FUNCTION_dgemv libxsmm_blas_error("dgemv") +# define LIBXSMM_BLAS_FUNCTION_sgemv libxsmm_blas_error("sgemv") +#endif +/** Low-precision (BLAS-like) function symbols. 
*/ +#define LIBXSMM_BLAS_FUNCTION_wigemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + LIBXSMM_INLINE_XGEMM(short, int, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +#define LIBXSMM_BLAS_FUNCTION_bsgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + LIBXSMM_INLINE_XGEMM(libxsmm_bfloat16, float, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + +/** Short-cut macros to construct desired BLAS function symbol. */ +#define LIBXSMM_BLAS_FUNCTION1(TYPE, FUNCTION) LIBXSMM_BLAS_FUNCTION(TYPE, TYPE, FUNCTION) +#define LIBXSMM_GEMM_BATCH_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm_batch) +#define LIBXSMM_GEMM_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemm) +#define LIBXSMM_GEMV_SYMBOL(TYPE) LIBXSMM_BLAS_FUNCTION1(TYPE, gemv) + +/** BLAS-based GEMM supplied by the linked LAPACK/BLAS library (macro template). */ +#define LIBXSMM_BLAS_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ + /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ + const char libxsmm_blas_xgemm_transa_ = (char)(NULL != ((void*)(TRANSA)) ? (*(const char*)(TRANSA)) : \ + (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & LIBXSMM_FLAGS) ? 'n' : 't')); \ + const char libxsmm_blas_xgemm_transb_ = (char)(NULL != ((void*)(TRANSB)) ? (*(const char*)(TRANSB)) : \ + (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & LIBXSMM_FLAGS) ? 'n' : 't')); \ + const libxsmm_blasint *const libxsmm_blas_xgemm_k_ = (NULL != ((void*)(K)) ? (K) : (M)); \ + const libxsmm_blasint *const libxsmm_blas_xgemm_n_ = (NULL != ((void*)(N)) ? (N) : libxsmm_blas_xgemm_k_); \ + const libxsmm_blasint libxsmm_blas_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \ + *(('n' == libxsmm_blas_xgemm_transa_ || *"N" == libxsmm_blas_xgemm_transa_) ? (M) : libxsmm_blas_xgemm_k_), 1); \ + const libxsmm_blasint libxsmm_blas_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? 
*(LDB) : \ + *(('n' == libxsmm_blas_xgemm_transb_ || *"N" == libxsmm_blas_xgemm_transb_) ? libxsmm_blas_xgemm_k_ : libxsmm_blas_xgemm_n_), 1); \ + const libxsmm_blasint libxsmm_blas_xgemm_ldc_ = LIBXSMM_MAX(NULL != ((void*)(LDC)) ? *(LDC) : *(M), 1); \ + const OTYPE libxsmm_blas_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ + const OTYPE libxsmm_blas_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ + LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(&libxsmm_blas_xgemm_transa_, &libxsmm_blas_xgemm_transb_, \ + M, libxsmm_blas_xgemm_n_, libxsmm_blas_xgemm_k_, \ + &libxsmm_blas_xgemm_alpha_, (const ITYPE*)(A), &libxsmm_blas_xgemm_lda_, \ + (const ITYPE*)(B), &libxsmm_blas_xgemm_ldb_, \ + &libxsmm_blas_xgemm_beta_, (ITYPE*)(C), &libxsmm_blas_xgemm_ldc_); \ +} + +/** Helper macros for calling a dispatched function in a row/column-major aware fashion. */ +#define LIBXSMM_MMCALL_ABC(FN, A, B, C) \ + LIBXSMM_ASSERT(FN); FN(A, B, C) +#define LIBXSMM_MMCALL_PRF(FN, A, B, C, PA, PB, PC) { \ + LIBXSMM_NOPREFETCH_A(LIBXSMM_UNUSED(PA)); \ + LIBXSMM_NOPREFETCH_B(LIBXSMM_UNUSED(PB)); \ + LIBXSMM_NOPREFETCH_C(LIBXSMM_UNUSED(PC)); \ + LIBXSMM_ASSERT(FN); FN(A, B, C, \ + LIBXSMM_GEMM_PREFETCH_A(PA), \ + LIBXSMM_GEMM_PREFETCH_B(PB), \ + LIBXSMM_GEMM_PREFETCH_C(PC)); \ +} + +#if (0/*LIBXSMM_GEMM_PREFETCH_NONE*/ == LIBXSMM_PREFETCH) +# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \ + LIBXSMM_MMCALL_ABC(FN, A, B, C) +#else +# define LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, LDA, LDB, LDC) \ + LIBXSMM_MMCALL_PRF(FN, A, B, C, (A) + ((size_t)LDA) * (K), (B) + ((size_t)LDB) * (N), (C) + ((size_t)LDC) * (N)) +#endif +#define LIBXSMM_MMCALL(FN, A, B, C, M, N, K) LIBXSMM_MMCALL_LDX(FN, A, B, C, M, N, K, M, K, M) + +/** Calculate problem size from M, N, and K using the correct integer type in order to cover the general case. 
*/ +#define LIBXSMM_MNK_SIZE(M, N, K) (((size_t)(M)) * ((size_t)(N)) * ((size_t)(K))) +/** Calculate total number of matrix-elements; matrices A, B, C are given per M, N, K, and emphasize (S) the C-size. */ +#define LIBXSMM_SIZE(M, N, K, S) \ + (((size_t)(M) * (size_t)(K)) + ((size_t)(K) * (size_t)(N)) + \ + (((size_t)(S) * (size_t)(M) * (size_t)(N)))) +/** Condition based on arithmetic intensity (AI) */ +#define LIBXSMM_SMM_AI(M, N, K, S, TYPESIZE) \ + ((LIBXSMM_MNK_SIZE(M, N, K) * 2) <= ((size_t)(TYPESIZE) * 4/*AI*/ * LIBXSMM_SIZE(M, N, K, S))) +/** Determine whether an SMM is suitable, i.e., small enough. */ +#if !defined(LIBXSMM_THRESHOLD_AI) /* traditional MNK-threshold */ +# define LIBXSMM_SMM(M, N, K, S, TYPESIZE) (LIBXSMM_MNK_SIZE(M, N, K) <= (LIBXSMM_MAX_MNK)) +#else /* threshold based on arithmetic intensity */ +# define LIBXSMM_SMM LIBXSMM_SMM_AI +#endif + +/** Fall-back code paths: LIBXSMM_XGEMM_FALLBACK0, and LIBXSMM_XGEMM_FALLBACK1 (macro template). */ +#if !defined(LIBXSMM_XGEMM_FALLBACK0) +# define LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +#endif +#if !defined(LIBXSMM_XGEMM_FALLBACK1) +# define LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + LIBXSMM_BLAS_FUNCTION(ITYPE, OTYPE, gemm)(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +#endif + +/** + * Execute a specialized function, or use a fallback code path depending on threshold (macro template). 
+ * LIBXSMM_XGEMM_FALLBACK0 or specialized function: below LIBXSMM_MAX_MNK + * LIBXSMM_XGEMM_FALLBACK1: above LIBXSMM_MAX_MNK + */ +#define LIBXSMM_XGEMM(ITYPE, OTYPE, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) { \ + const int libxsmm_xgemm_flags_ = LIBXSMM_GEMM_PFLAGS(TRANSA, TRANSB, LIBXSMM_FLAGS) | LIBXSMM_GEMM_XFLAGS(ITYPE, OTYPE); \ + const libxsmm_blasint *const libxsmm_xgemm_k_ = (NULL != (K) ? (K) : (M)); \ + const libxsmm_blasint *const libxsmm_xgemm_n_ = (NULL != (N) ? (N) : libxsmm_xgemm_k_); \ + const libxsmm_blasint libxsmm_xgemm_lda_ = LIBXSMM_MAX(NULL != ((void*)(LDA)) ? *(LDA) : \ + *(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? (M) : libxsmm_xgemm_k_), 1); \ + const libxsmm_blasint libxsmm_xgemm_ldb_ = LIBXSMM_MAX(NULL != ((void*)(LDB)) ? *(LDB) : \ + *(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? libxsmm_xgemm_k_ : libxsmm_xgemm_n_), 1); \ + const libxsmm_blasint libxsmm_xgemm_ldc_ = LIBXSMM_MAX(NULL != (LDC) ? *(LDC) : *(M), 1); \ + if (LIBXSMM_SMM(*(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, 2/*RFO*/, sizeof(OTYPE))) { \ + const LIBXSMM_MMFUNCTION_TYPE2(ITYPE, OTYPE) libxsmm_mmfunction_ = LIBXSMM_MMDISPATCH_SYMBOL2(ITYPE, OTYPE)( \ + *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, &libxsmm_xgemm_lda_, &libxsmm_xgemm_ldb_, &libxsmm_xgemm_ldc_, \ + (const OTYPE*)(ALPHA), (const OTYPE*)(BETA), &libxsmm_xgemm_flags_, NULL); \ + if (NULL != libxsmm_mmfunction_) { \ + LIBXSMM_MMCALL_LDX(libxsmm_mmfunction_, (const ITYPE*)(A), (const ITYPE*)(B), (OTYPE*)(C), \ + *(M), *libxsmm_xgemm_n_, *libxsmm_xgemm_k_, libxsmm_xgemm_lda_, libxsmm_xgemm_ldb_, libxsmm_xgemm_ldc_); \ + } \ + else { \ + const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \ + const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \ + const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? 
(*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ + const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ + LIBXSMM_XGEMM_FALLBACK0(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \ + M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \ + &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \ + B, &libxsmm_xgemm_ldb_, \ + &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \ + } \ + } \ + else { \ + const char libxsmm_xgemm_transa_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & libxsmm_xgemm_flags_) ? 'n' : 't'); \ + const char libxsmm_xgemm_transb_ = (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & libxsmm_xgemm_flags_) ? 'n' : 't'); \ + const OTYPE libxsmm_xgemm_alpha_ = (NULL != ((void*)(ALPHA)) ? (*(const OTYPE*)(ALPHA)) : ((OTYPE)LIBXSMM_ALPHA)); \ + const OTYPE libxsmm_xgemm_beta_ = (NULL != ((void*)(BETA)) ? (*(const OTYPE*)(BETA)) : ((OTYPE)LIBXSMM_BETA)); \ + LIBXSMM_XGEMM_FALLBACK1(ITYPE, OTYPE, &libxsmm_xgemm_transa_, &libxsmm_xgemm_transb_, \ + M, libxsmm_xgemm_n_, libxsmm_xgemm_k_, \ + &libxsmm_xgemm_alpha_, A, &libxsmm_xgemm_lda_, \ + B, &libxsmm_xgemm_ldb_, \ + &libxsmm_xgemm_beta_, C, &libxsmm_xgemm_ldc_); \ + } \ +} + +/** Helper macro to setup a matrix with some initial values. 
*/ +#define LIBXSMM_MATINIT_AUX(OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) { \ + /*const*/ double libxsmm_matinit_seed_ = (double)(SEED); /* avoid constant conditional */ \ + const double libxsmm_matinit_scale_ = (SCALE) * libxsmm_matinit_seed_ + (SCALE); \ + const libxsmm_blasint libxsmm_matinit_nrows_ = (libxsmm_blasint)NROWS; \ + const libxsmm_blasint libxsmm_matinit_ld_ = (libxsmm_blasint)LD; \ + libxsmm_blasint libxsmm_matinit_i_ = 0, libxsmm_matinit_j_ = 0; \ + LIBXSMM_OMP_VAR(libxsmm_matinit_i_); LIBXSMM_OMP_VAR(libxsmm_matinit_j_); \ + if (0 != libxsmm_matinit_seed_) { \ + OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \ + for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \ + for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < libxsmm_matinit_nrows_; ++libxsmm_matinit_j_) { \ + const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ + (DST)[libxsmm_matinit_k_] = (TYPE)(libxsmm_matinit_scale_ * (1.0 + \ + libxsmm_matinit_i_ * libxsmm_matinit_nrows_ + libxsmm_matinit_j_)); \ + } \ + for (; libxsmm_matinit_j_ < libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \ + const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ + (DST)[libxsmm_matinit_k_] = (TYPE)(SEED); \ + } \ + } \ + } \ + else { /* shuffle based initialization */ \ + const unsigned int libxsmm_matinit_maxval_ = ((unsigned int)NCOLS) * ((unsigned int)libxsmm_matinit_ld_); \ + const TYPE libxsmm_matinit_maxval2_ = (TYPE)(libxsmm_matinit_maxval_ / 2), libxsmm_matinit_inv_ = (TYPE)((SCALE) / libxsmm_matinit_maxval2_); \ + const size_t libxsmm_matinit_shuffle_ = libxsmm_shuffle(libxsmm_matinit_maxval_); \ + OMP(parallel for private(libxsmm_matinit_i_, libxsmm_matinit_j_)) \ + for (libxsmm_matinit_i_ = 0; libxsmm_matinit_i_ < ((libxsmm_blasint)NCOLS); ++libxsmm_matinit_i_) { \ + for (libxsmm_matinit_j_ = 0; libxsmm_matinit_j_ < 
libxsmm_matinit_ld_; ++libxsmm_matinit_j_) { \ + const libxsmm_blasint libxsmm_matinit_k_ = libxsmm_matinit_i_ * libxsmm_matinit_ld_ + libxsmm_matinit_j_; \ + (DST)[libxsmm_matinit_k_] = libxsmm_matinit_inv_ * /* normalize values to an interval of [-1, +1] */ \ + ((TYPE)(libxsmm_matinit_shuffle_ * libxsmm_matinit_k_ % libxsmm_matinit_maxval_) - libxsmm_matinit_maxval2_); \ + } \ + } \ + } \ +} + +#define LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ + LIBXSMM_MATINIT_AUX(LIBXSMM_ELIDE, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) +#define LIBXSMM_MATINIT_SEQ(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ + LIBXSMM_MATINIT(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) +#define LIBXSMM_MATINIT_OMP(TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) \ + LIBXSMM_MATINIT_AUX(LIBXSMM_PRAGMA_OMP, TYPE, SEED, DST, NROWS, NCOLS, LD, SCALE) + +/** Call libxsmm_gemm_print using LIBXSMM's GEMM-flags. */ +#define LIBXSMM_GEMM_PRINT(OSTREAM, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \ + LIBXSMM_GEMM_PRINT2(OSTREAM, PRECISION, PRECISION, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) +#define LIBXSMM_GEMM_PRINT2(OSTREAM, IPREC, OPREC, FLAGS, M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) \ + libxsmm_gemm_dprint2(OSTREAM, (libxsmm_gemm_precision)(IPREC), (libxsmm_gemm_precision)(OPREC), \ + /* Use 'n' (instead of 'N') avoids warning about "no macro replacement within a character constant". */ \ + (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & (FLAGS)) ? 'n' : 't'), \ + (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & (FLAGS)) ? 'n' : 't'), \ + M, N, K, DALPHA, A, LDA, B, LDB, DBETA, C, LDC) + +/** + * Utility function, which either prints information about the GEMM call + * or dumps (FILE/ostream=0) all input and output data into MHD files. + * The Meta Image Format (MHD) is suitable for visual inspection using, + * e.g., ITK-SNAP or ParaView. 
+ */ +LIBXSMM_API void libxsmm_gemm_print(void* ostream, + libxsmm_gemm_precision precision, const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc); +LIBXSMM_API void libxsmm_gemm_print2(void* ostream, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc); +LIBXSMM_API void libxsmm_gemm_dprint(void* ostream, + libxsmm_gemm_precision precision, char transa, char transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + double dalpha, const void* a, libxsmm_blasint lda, + const void* b, libxsmm_blasint ldb, + double dbeta, void* c, libxsmm_blasint ldc); +LIBXSMM_API void libxsmm_gemm_dprint2(void* ostream, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, char transa, char transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + double dalpha, const void* a, libxsmm_blasint lda, + const void* b, libxsmm_blasint ldb, + double dbeta, void* c, libxsmm_blasint ldc); +LIBXSMM_API void libxsmm_gemm_xprint(void* ostream, + libxsmm_xmmfunction kernel, const void* a, const void* b, void* c); + +/** GEMM_BATCH: fallback prototype functions served by any compliant LAPACK/BLAS. 
*/ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch)); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_batch_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch)); +/** GEMM: fallback prototype functions served by any compliant LAPACK/BLAS. */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm)); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemm_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm)); +/** GEMV: fallback prototype functions served by any compliant LAPACK/BLAS. */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv)); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sgemv_function)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv)); +/** Helper function to consume arguments when called. */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sink_function)(LIBXSMM_VARIADIC); + +/** The original BLAS functions. 
*/ +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function); +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function); +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemm_function libxsmm_original_dgemm_function); +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemm_function libxsmm_original_sgemm_function); +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_dgemv_function libxsmm_original_dgemv_function); +LIBXSMM_APIVAR_PUBLIC(/*volatile*/libxsmm_sgemv_function libxsmm_original_sgemv_function); +LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void); +LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void); +LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void); +LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void); +LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void); +LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void); +LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol); +LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC); + +/** + * General dense matrix multiplication, which re-exposes LAPACK/BLAS + * but allows relying on LIBXSMM's defaults (libxsmm_config.h) + * when supplying NULL-arguments in certain places. 
+ */ +LIBXSMM_API void libxsmm_blas_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc); + +#define libxsmm_blas_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \ + TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +#define libxsmm_blas_sgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + libxsmm_blas_xgemm(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \ + TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + +#define libxsmm_dgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, \ + TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +#define libxsmm_sgemm_omp(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) \ + libxsmm_xgemm_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, \ + TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + +/** Translates GEMM prefetch request into prefetch-enumeration (incl. FE's auto-prefetch). */ +LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch); +LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch); + +/** Determines the given value in double-precision based on the given type. 
*/ +LIBXSMM_API int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue); + +#endif /*LIBXSMM_FRONTEND_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_fsspmdm.h b/third_party/libxsmm/include/libxsmm_fsspmdm.h new file mode 100644 index 00000000..46f3275c --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_fsspmdm.h @@ -0,0 +1,40 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_FSSPMDM_H +#define LIBXSMM_FSSPMDM_H + +#include "libxsmm_typedefs.h" + + +/** Opaque types for fsspmdm */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm libxsmm_dfsspmdm; +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm libxsmm_sfsspmdm; + +LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + const double alpha, const double beta, libxsmm_blasint c_is_nt, + const double* a_dense ); + +LIBXSMM_API void libxsmm_dfsspmdm_execute( const libxsmm_dfsspmdm* handle, const double* B, double* C ); + +LIBXSMM_API void libxsmm_dfsspmdm_destroy( libxsmm_dfsspmdm* handle ); + +LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create( libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + const float alpha, const float beta, libxsmm_blasint c_is_nt, + const float* a_dense ); + +LIBXSMM_API void libxsmm_sfsspmdm_execute( const 
libxsmm_sfsspmdm* handle, const float* B, float* C ); + +LIBXSMM_API void libxsmm_sfsspmdm_destroy( libxsmm_sfsspmdm* handle ); + +#endif /*LIBXSMM_FSSPMDM_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_generator.h b/third_party/libxsmm/include/libxsmm_generator.h new file mode 100644 index 00000000..b08d6abd --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_generator.h @@ -0,0 +1,219 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_GENERATOR_H +#define LIBXSMM_GENERATOR_H + +#include "libxsmm_typedefs.h" + +#define LIBXSMM_GEMM_NO_BYPASS(FLAGS, ALPHA, BETA) ( \ + 0 == ((FLAGS) & (LIBXSMM_GEMM_FLAG_TRANS_A)) && \ + (LIBXSMM_FEQ(1, ALPHA) /*|| LIBXSMM_FEQ(-1, ALPHA)*/) && \ + (LIBXSMM_FEQ(1, BETA) || LIBXSMM_FEQ(0, BETA))) + + +/** Initialize GEMM descriptor as used by low-level routines (type-specific). 
*/ +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + double alpha, double beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + float alpha, float beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + int alpha, int beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + int alpha, int beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + int alpha, int beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + float alpha, float beta, int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + float alpha, float beta, int flags, int prefetch); + +/** Initialize GEMM descriptor (generic: double-precision 
alpha/beta). */ +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob, + libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta, + int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + double alpha, double beta, int flags, int prefetch); + +/** Initialize GEMM descriptor as used by low-level routines (generic). */ +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, + int flags, int prefetch); +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, + int flags, int prefetch); +/** Similar to libxsmm_gemm_descriptor_init2 with optional type-converted alpha/beta (dalpha/dbeta). */ +LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta, + int flags, int prefetch, double* dalpha, double* dbeta); + +/** Initialize transpose descriptor as used by low-level routines. 
*/ +LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_datatype in_type, libxsmm_datatype out_type, + libxsmm_blasint m, libxsmm_blasint n, + libxsmm_blasint ldo, libxsmm_blasint ldi, + unsigned short flags, unsigned char param, unsigned char operation); +LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob, + libxsmm_datatype in_type, libxsmm_datatype in2_type, libxsmm_datatype out_type, libxsmm_datatype out2_type, + libxsmm_blasint m, libxsmm_blasint n, + libxsmm_blasint ldo, libxsmm_blasint ldi, libxsmm_blasint ldi2, libxsmm_blasint ldi3, + unsigned short flags, unsigned char param, unsigned char operation); + +/** Initialize matrix equation as used by low-level routines */ +LIBXSMM_API libxsmm_meqn_descriptor* libxsmm_meqn_descriptor_init(libxsmm_descriptor_blob* blob, + libxsmm_datatype type, libxsmm_blasint m, libxsmm_blasint n, + libxsmm_blasint ldo, unsigned int eqn_idx); + +/** Structure referring to the generated code with some attached information. 
*/ +LIBXSMM_EXTERN_C typedef struct libxsmm_generated_code { + void* generated_code; /** pointer to memory which can contain strings or binary code */ + unsigned int buffer_size; /** total size of the buffer generated_code */ + unsigned int code_size; /** size of bytes used in generated_code */ + unsigned int code_type; /** + * 0: generated code contains inline assembly in a C function + * which can be dumped into a *.c/cc/cpp file + * 1: generated code contains assembly which can be + * dumped into an *.s file + * >1: generated code contains a function in binary code which can be + * called, when the code is copied into executable memory + */ + unsigned int last_error; /** + * 0: no error occurred + * >0: error code + */ + unsigned int arch; /* target arch for the current code generation task */ + unsigned int sf_size; /* offset of RSP to the beginning of the stack frame + * we track this value to have RBP available for general compute + */ +} libxsmm_generated_code; + +/** function to translate LIBXSMM Generator error codes to error messages */ +LIBXSMM_API +const char* libxsmm_strerror(unsigned int i_error_code); + +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_gemm_inlineasm(const char* i_file_out, + const char* i_routine_name, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch ); + +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_gemm_directasm(const char* i_file_out, + const char* i_routine_name, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch ); + +LIBXSMM_API +void libxsmm_generator_gemm_kernel(libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc ); + +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_spgemm(const char* i_file_out, + const char* i_routine_name, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch, + const char* i_file_in, + const int i_is_csr); 
+ +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_spgemm_csc_kernel(libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch, + const unsigned int* i_row_idx, + const unsigned int* i_column_idx, + const double* i_values); + +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_spgemm_csr_kernel(libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch, + const unsigned int* i_row_idx, + const unsigned int* i_column_idx, + const double* i_values); + +/* @TODO change int based architecture value */ +LIBXSMM_API +void libxsmm_generator_spgemm_csr_reg_kernel(libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const char* i_arch, + const unsigned int* i_row_idx, + const unsigned int* i_column_idx, + const double* i_values); + +LIBXSMM_API +void libxsmm_generator_packed_spgemm_csr_kernel( libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const unsigned int* i_row_idx, + const unsigned int* i_column_idx, + const void* i_values, + const unsigned int i_packed_width ); + +LIBXSMM_API +void libxsmm_generator_packed_spgemm_csc_kernel( libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const unsigned int* i_row_idx, + const unsigned int* i_column_idx, + const void* i_values, + const unsigned int i_packed_width ); + +LIBXSMM_API +void libxsmm_generator_packed_gemm_ac_rm( libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const unsigned int i_packed_width ); + +LIBXSMM_API +void libxsmm_generator_packed_gemm_bc_rm( libxsmm_generated_code* io_generated_code, + const libxsmm_gemm_descriptor* i_xgemm_desc, + const unsigned int i_packed_width ); + +LIBXSMM_API +void libxsmm_generator_mateltwise_kernel( libxsmm_generated_code* io_generated_code, + 
const libxsmm_meltw_descriptor* i_mateltw_desc ); + +LIBXSMM_API +void libxsmm_generator_matequation_kernel( libxsmm_generated_code* io_generated_code, + const libxsmm_meqn_descriptor* i_mateqn_desc ); + +/** Initialization counter that can be used to check whether the library is initialized (!=0) or not (==0). */ +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_ninit); +/** Target architecture (libxsmm_get_target_archid, libxsmm_set_target_archid). */ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_target_archid); +/** Verbosity level (0: quiet, 1: errors, 2: warnings, 3: info, neg.: all/dump). */ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_verbosity); +/** Security-enhanced environment. */ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_se); + +#endif /*LIBXSMM_GENERATOR_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_intrinsics_x86.h b/third_party/libxsmm/include/libxsmm_intrinsics_x86.h new file mode 100644 index 00000000..59ec8676 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_intrinsics_x86.h @@ -0,0 +1,1022 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_INTRINSICS_X86_H +#define LIBXSMM_INTRINSICS_X86_H + +#include "libxsmm_cpuid.h" + +/** Macro evaluates to LIBXSMM_ATTRIBUTE_TARGET_xxx (see below). 
*/ +#define LIBXSMM_ATTRIBUTE_TARGET(TARGET) LIBXSMM_CONCATENATE(LIBXSMM_ATTRIBUTE_TARGET_, TARGET) + +#if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_PLATFORM_X86) +# define LIBXSMM_INTRINSICS_NONE +#endif +#if /*no intrinsics: tested with 17.x and 18.x*/(defined(__PGI) && \ + LIBXSMM_VERSION2(19, 0) > LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) \ + || /*legacy*/(defined(_CRAYC) && !defined(__GNUC__)) +# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC) +# define LIBXSMM_INTRINSICS_NONE +# endif +#elif !defined(LIBXSMM_INTRINSICS_STATIC) && !defined(LIBXSMM_INTRINSICS_NONE) && ( \ + (defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && \ + LIBXSMM_VERSION2(4, 4) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) /* GCC 4.4 (target-attribute) */ \ + || (defined(__clang__) && LIBXSMM_VERSION2(3, 7) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ + || (defined(__APPLE__) && defined(__MACH__) && !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && \ + LIBXSMM_VERSION2(9, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) +# define LIBXSMM_INTRINSICS_STATIC +#endif + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif + +/** https://github.com/intel/Immintrin-debug */ +#if !defined(LIBXSMM_INTRINSICS_DEBUG) && 0 +# define LIBXSMM_INTRINSICS_DEBUG +/* workarounds removed after LIBXSMM 1.16.1-1.16.1-1268 */ +# include "immintrin_dbg.h" +#endif +#if defined(__MIC__) && !defined(LIBXSMM_INTRINSICS_NONE) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC +# endif +# define LIBXSMM_INTRINSICS(TARGET) +# define LIBXSMM_INTRINSICS_INCLUDE +#elif !defined(LIBXSMM_INTRINSICS_NONE) /*!defined(__MIC__)*/ +# if defined(__AVX512F__) && defined(__AVX512CD__) \ + && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) && 
defined(__AVX512BF16__) \ + && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ + && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) /* TODO: check GCC, Clang, etc. */ \ + || (LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ + && (!defined(__clang__) || (LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ + && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(99, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX512F__) && defined(__AVX512CD__) \ + && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) \ + && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ + && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ + || (LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ + && (!defined(__clang__) || (LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ + && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX512F__) && defined(__AVX512CD__) \ + && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) \ + && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ + && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || 
defined(_CRAYC) \ + || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ + && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ + && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX512F__) && defined(__AVX512CD__) \ + && defined(__AVX512PF__) && defined(__AVX512ER__) \ + && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ + && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ + || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ + && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ + && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX512F__) && defined(__AVX512CD__) \ + && defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) \ + && (!defined(__GNUC__) || defined(__clang__) || defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) \ + || (LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) \ + && (!defined(__clang__) || (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__))) \ + && (!defined(__APPLE__) || !defined(__MACH__) || LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH 
LIBXSMM_X86_AVX512 +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX2__) && defined(__FMA__) && defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__AVX__) && defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_AVX +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__SSE4_2__) && defined(__SSE4_1__) && defined(__SSE3__) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE42 +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(__SSE3__) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_SSE3 +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(LIBXSMM_PLATFORM_X86) +# if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_X86_GENERIC +# endif +# if defined(__GNUC__) +# define LIBXSMM_INTRINSICS_INCLUDE +# endif +# endif +# if defined(LIBXSMM_STATIC_TARGET_ARCH) && !defined(LIBXSMM_INTRINSICS_STATIC) +# if defined(__INTEL_COMPILER) +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) + /* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */ +# if 1904 <= (LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# elif 1801 <= (LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX +# elif 1500 <= (LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE +# elif 1400 <= (LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_MIC +# else +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# endif +# endif +# define LIBXSMM_INTRINSICS(TARGET)/*no need for target 
flags*/ +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(_CRAYC) && defined(__GNUC__) + /* TODO: version check, e.g., LIBXSMM_VERSION2(11, 5) <= LIBXSMM_VERSION2(_RELEASE, _RELEASE_MINOR) */ +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX +# endif +# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ +# define LIBXSMM_INTRINSICS_INCLUDE +# elif defined(_MSC_VER) && !defined(__clang__) + /* TODO: compiler version check for LIBXSMM_MAX_STATIC_TARGET_ARCH */ +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# endif +# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ +# define LIBXSMM_INTRINSICS_INCLUDE +# elif (!defined(__GNUC__) || LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + && (!defined(__clang__) || LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ + && (!defined(__APPLE__) || !defined(__MACH__)) && !defined(__PGI) && !defined(_MSC_VER) +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */ +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# elif (defined(__clang__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# elif (defined(__GNUC__) && LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + || (defined(__clang__) && LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__)) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# elif (defined(__GNUC__) && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + || (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH 
LIBXSMM_X86_AVX512_CLX +# elif (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + || (defined(__clang__) && LIBXSMM_VERSION2(6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE +# else +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# endif +# endif +# define LIBXSMM_INTRINSICS_INCLUDE +# else /* GCC/legacy incl. Clang */ +# if defined(__clang__) && !(defined(__APPLE__) && defined(__MACH__)) && !defined(_WIN32) +# if (LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) /* TODO */ + /* no limitations */ +# elif (LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < LIBXSMM_X86_AVX2/*workaround*/) +# define LIBXSMM_INTRINSICS_STATIC +# endif +# elif !defined(LIBXSMM_INTRINSICS_STATIC) +# define LIBXSMM_INTRINSICS_STATIC +# endif +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# if defined(__CYGWIN__) && !defined(LIBXSMM_INTRINSICS_DEBUG) /* Cygwin: invalid register for .seh_savexmm */ +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX2 +# elif LIBXSMM_VERSION2(10, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# elif LIBXSMM_VERSION2( 9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && !defined(__cray__) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CPX +# elif LIBXSMM_VERSION2( 6, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CLX +# else +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_X86_AVX512_CORE +# endif +# endif +# else /* fallback */ +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH +# endif +# if !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_STATIC_TARGET_ARCH < 
LIBXSMM_X86_AVX2/*workaround*/) +# define LIBXSMM_INTRINSICS_STATIC +# endif +# endif +# if !defined(LIBXSMM_INTRINSICS_INCLUDE) && (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) +# define LIBXSMM_INTRINSICS_INCLUDE +# endif +# endif /* GCC/legacy incl. Clang */ +# if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# error "LIBXSMM_MAX_STATIC_TARGET_ARCH not defined!" +# endif +# if defined(LIBXSMM_TARGET_ARCH) && (LIBXSMM_TARGET_ARCH < LIBXSMM_MAX_STATIC_TARGET_ARCH) +# undef LIBXSMM_MAX_STATIC_TARGET_ARCH +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH +# endif +# if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_DEBUG) +# include +# endif /*defined(LIBXSMM_INTRINSICS_INCLUDE)*/ +# if !defined(LIBXSMM_INTRINSICS) +# if (LIBXSMM_MAX_STATIC_TARGET_ARCH > LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_INTRINSICS(TARGET) LIBXSMM_ATTRIBUTE(LIBXSMM_ATTRIBUTE_TARGET(TARGET)) + /* LIBXSMM_ATTRIBUTE_TARGET_xxx is required to literally match the CPUID (libxsmm_cpuid.h)! 
*/ +# define LIBXSMM_ATTRIBUTE_TARGET_1002 target("sse2") /* LIBXSMM_X86_GENERIC (64-bit ABI) */ +# if (LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1003 target("sse3") +# else +# define LIBXSMM_ATTRIBUTE_TARGET_1003 LIBXSMM_ATTRIBUTE_TARGET_1002 +# endif +# if (LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1004 target("sse4.1,sse4.2") +# else +# define LIBXSMM_ATTRIBUTE_TARGET_1004 LIBXSMM_ATTRIBUTE_TARGET_1003 +# endif +# if (LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1005 target("avx") +# else +# define LIBXSMM_ATTRIBUTE_TARGET_1005 LIBXSMM_ATTRIBUTE_TARGET_1004 +# endif +# if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1006 target("avx2,fma") +# else +# define LIBXSMM_ATTRIBUTE_TARGET_1006 LIBXSMM_ATTRIBUTE_TARGET_1005 +# endif +# if (LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1007 target("avx2,fma,avx512f,avx512cd") +# else +# define LIBXSMM_ATTRIBUTE_TARGET_1007 LIBXSMM_ATTRIBUTE_TARGET_1006 +# endif +# if (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1010 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er") +# else /* LIBXSMM_X86_AVX512 */ +# define LIBXSMM_ATTRIBUTE_TARGET_1010 LIBXSMM_ATTRIBUTE_TARGET_1007 +# endif +# if (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1011 target("avx2,fma,avx512f,avx512cd,avx512pf,avx512er,avx5124vnniw,avx5124fmaps") +# else /* LIBXSMM_X86_AVX512_MIC */ +# define LIBXSMM_ATTRIBUTE_TARGET_1011 LIBXSMM_ATTRIBUTE_TARGET_1010 +# endif +# if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1020 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl") +# else /* LIBXSMM_X86_AVX512 */ +# define LIBXSMM_ATTRIBUTE_TARGET_1020 LIBXSMM_ATTRIBUTE_TARGET_1007 +# endif +# if 
(LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1021 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni") +# else /* LIBXSMM_X86_AVX512_CORE */ +# define LIBXSMM_ATTRIBUTE_TARGET_1021 LIBXSMM_ATTRIBUTE_TARGET_1020 +# endif +# if (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_ATTRIBUTE_TARGET_1022 target("avx2,fma,avx512f,avx512cd,avx512dq,avx512bw,avx512vl,avx512vnni,avx512bf16") +# else /* LIBXSMM_X86_AVX512_CORE */ +# define LIBXSMM_ATTRIBUTE_TARGET_1022 LIBXSMM_ATTRIBUTE_TARGET_1021 +# endif +# else +# define LIBXSMM_INTRINSICS(TARGET)/*no need for target flags*/ +# endif +# elif !defined(LIBXSMM_INTRINSICS_TARGET) +# define LIBXSMM_INTRINSICS_TARGET +# endif /*!defined(LIBXSMM_INTRINSICS)*/ +# endif /*defined(LIBXSMM_STATIC_TARGET_ARCH)*/ +#endif /*!defined(LIBXSMM_INTRINSICS_NONE)*/ + +#if !defined(LIBXSMM_STATIC_TARGET_ARCH) +# if !defined(LIBXSMM_INTRINSICS_NONE) && !defined(LIBXSMM_INTRINSICS_STATIC) +# define LIBXSMM_INTRINSICS_NONE +# endif +# define LIBXSMM_STATIC_TARGET_ARCH LIBXSMM_TARGET_ARCH_GENERIC +#endif + +#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH +#elif (LIBXSMM_MAX_STATIC_TARGET_ARCH < LIBXSMM_STATIC_TARGET_ARCH) +# undef LIBXSMM_MAX_STATIC_TARGET_ARCH +# define LIBXSMM_MAX_STATIC_TARGET_ARCH LIBXSMM_STATIC_TARGET_ARCH +#endif + +#if !defined(LIBXSMM_INTRINSICS) +# define LIBXSMM_INTRINSICS(TARGET) +#endif + +/** Include basic x86 intrinsics such as __rdtsc. 
*/ +#if defined(LIBXSMM_INTRINSICS_INCLUDE) && !defined(LIBXSMM_INTRINSICS_DEBUG) +# if defined(_WIN32) +# include <intrin.h> +# elif defined(LIBXSMM_INTEL_COMPILER) || defined(_CRAYC) || defined(__clang__) || defined(__PGI) +# include <x86intrin.h> +# elif defined(__GNUC__) && (LIBXSMM_VERSION2(4, 4) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) +# include <x86intrin.h> +# endif +# include <xmmintrin.h> +# if defined(__SSE3__) +# include <pmmintrin.h> +# endif +#endif + +#if !defined(LIBXSMM_INTRINSICS_NONE) +# if defined(_WIN32) +# include <malloc.h> +# else +# include <mm_malloc.h> +# endif +#endif + +/** + * Intrinsic-specific fix-ups + */ +# define LIBXSMM_INTRINSICS_LOADU_SI128(A) _mm_loadu_si128(A) +#if !defined(LIBXSMM_INTEL_COMPILER) && defined(__clang__) && ( \ + (LIBXSMM_VERSION2(3, 9) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) \ + || (LIBXSMM_VERSION2(7, 3) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__) && \ + defined(__APPLE__) && defined(__MACH__))) +/* prototypes with incorrect signature: _mm512_load_ps takes DP*, _mm512_load_pd takes SP* (checked with v3.8.1) */ +# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const double*)(A)) +# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const float*)(A)) +/* Clang misses _mm512_stream_p? (checked with v3.8.1). 
*/ +# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_store_si512(A, B) +# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_storeu_ps(A, B) +# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_store_pd(A, B) +#else +# define LIBXSMM_INTRINSICS_MM512_LOAD_PS(A) _mm512_loadu_ps((const float*)(A)) +# define LIBXSMM_INTRINSICS_MM512_LOAD_PD(A) _mm512_loadu_pd((const double*)(A)) +# define LIBXSMM_INTRINSICS_MM512_STREAM_SI512(A, B) _mm512_stream_si512((__m512i*)(A), (B)) +# define LIBXSMM_INTRINSICS_MM512_STREAM_PS(A, B) _mm512_stream_ps(A, B) +# define LIBXSMM_INTRINSICS_MM512_STREAM_PD(A, B) _mm512_stream_pd(A, B) +#endif +#if !defined(LIBXSMM_INTEL_COMPILER) || (defined(__clang__) && ( \ + (LIBXSMM_VERSION2(8, 0) > LIBXSMM_VERSION2(__clang_major__, __clang_minor__)))) \ + || (defined(__APPLE__) && defined(__MACH__)) || defined(__GNUC__) +# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_si256((__m256i*)(A), B) +#else +# define LIBXSMM_INTRINSICS_MM256_STORE_EPI32(A, B) _mm256_storeu_epi32(A, B) +#endif +#if defined(LIBXSMM_INTEL_COMPILER) +# if 1600 <= (LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ + E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ + _mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ + E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) +# else +# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ + E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ + _mm512_castps_si512(_mm512_set_epi16(E31, E30, E29, E28, E27, E26, E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ + E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0)) +# endif +#else +# define LIBXSMM_INTRINSICS_MM512_SET_EPI16(E31, E30, E29, E28, E27, E26, 
E25, E24, E23, E22, E21, E20, E19, E18, E17, E16, \ + E15, E14, E13, E12, E11, E10, E9, E8, E7, E6, E5, E4, E3, E2, E1, E0) \ + _mm512_set_epi32(((E31) << 16) | (E30), ((E29) << 16) | (E28), ((E27) << 16) | (E26), ((E25) << 16) | (E24), \ + ((E23) << 16) | (E22), ((E21) << 16) | (E20), ((E19) << 16) | (E18), ((E17) << 16) | (E16), \ + ((E15) << 16) | (E14), ((E13) << 16) | (E12), ((E11) << 16) | (E10), ((E9) << 16) | (E8), \ + ((E7) << 16) | (E6), ((E5) << 16) | (E4), ((E3) << 16) | (E2), ((E1) << 16) | (E0)) +#endif +#if defined(LIBXSMM_INTEL_COMPILER) \ + || (defined(__GNUC__) && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + || (defined(__clang__) && (!defined(__APPLE__) || !defined(__MACH__)) \ + && LIBXSMM_VERSION2(4, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_mask_i32gather_epi32(A, B, C, D, E) +# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm512_extracti64x4_epi64(A, B) +# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_abs_ps(A) +# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_undefined_epi32() +# define LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_undefined() +# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_undefined_si256() +# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_undefined_si128() +# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_undefined_pd() +#else +# define LIBXSMM_INTRINSICS_MM512_MASK_I32GATHER_EPI32(A, B, C, D, E) _mm512_castps_si512(_mm512_mask_i32gather_ps( \ + _mm512_castsi512_ps(A), B, C, (const float*)(D), E)) +# define LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(A, B) _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(A), B)) +# define LIBXSMM_INTRINSICS_MM512_ABS_PS(A) _mm512_castsi512_ps(_mm512_and_epi32( \ + _mm512_castps_si512(A), _mm512_set1_epi32(0x7FFFFFFF))) +# define LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() _mm512_set1_epi32(0) +# define 
LIBXSMM_INTRINSICS_MM512_UNDEFINED() _mm512_set1_ps(0) +# define LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() _mm256_set1_epi32(0) +# define LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() _mm_set1_epi32(0) +# define LIBXSMM_INTRINSICS_MM_UNDEFINED_PD() _mm_set1_pd(0) +#endif +#if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= (LIBXSMM_INTEL_COMPILER))) \ + || (!defined(LIBXSMM_INTEL_COMPILER) && defined(__GNUC__) \ + && LIBXSMM_VERSION2(7, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) \ + || ((!defined(__APPLE__) || !defined(__MACH__)) && defined(__clang__) \ + && LIBXSMM_VERSION2(8, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ + LIBXSMM_CONCATENATE(_store_mask, NBITS)((LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR), SRC) +# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \ + LIBXSMM_CONCATENATE(_load_mask, NBITS)((/*const*/ LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR)) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(_cvtu32_mask, NBITS)((unsigned int)(A)) +#elif defined(LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ + (*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC)) +# define LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) \ + ((LIBXSMM_CONCATENATE(__mmask, NBITS))_mm512_mask2int(*(const __mmask16*)(SRC_PTR))) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) LIBXSMM_CONCATENATE(LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_, NBITS)(A) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_32(A) ((__mmask32)(A)) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_16(A) _mm512_int2mask((int)(A)) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK_8(A) ((__mmask8)(A)) +#else +# define LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, NBITS) \ + (*(LIBXSMM_CONCATENATE(__mmask, NBITS)*)(DST_PTR) = (LIBXSMM_CONCATENATE(__mmask, NBITS))(SRC)) +# define 
LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, NBITS) (*(const LIBXSMM_CONCATENATE(__mmask, NBITS)*)(SRC_PTR)) +# define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, NBITS) ((LIBXSMM_CONCATENATE(__mmask, NBITS))(A)) +#endif +#define LIBXSMM_INTRINSICS_MM512_STORE_MASK64(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 64) +#define LIBXSMM_INTRINSICS_MM512_STORE_MASK32(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 32) +#define LIBXSMM_INTRINSICS_MM512_STORE_MASK16(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 16) +#define LIBXSMM_INTRINSICS_MM512_STORE_MASK8(DST_PTR, SRC) LIBXSMM_INTRINSICS_MM512_STORE_MASK(DST_PTR, SRC, 8) +#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK64(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 64) +#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK32(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 32) +#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK16(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 16) +#define LIBXSMM_INTRINSICS_MM512_LOAD_MASK8(SRC_PTR) LIBXSMM_INTRINSICS_MM512_LOAD_MASK(SRC_PTR, 8) +#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK32(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 32) +#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 16) +#define LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(A) LIBXSMM_INTRINSICS_MM512_CVTU32_MASK(A, 8) + +/** + * Pseudo intrinsics for portability: software bit-scan-forward, i.e., index of the lowest set bit (0 if n is 0) + */ +LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD32_SW(unsigned int n) { + unsigned int i, r = 0; if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r; +} +LIBXSMM_API_INLINE int LIBXSMM_INTRINSICS_BITSCANFWD64_SW(unsigned long long n) { + unsigned long long i; unsigned int r = 0; if (0 != n) for (i = 1; 0 == (n & i); i <<= 1) { ++r; } return r; /* probe must be 64-bit: a 32-bit probe wraps to zero and never terminates if only bits 32..63 are set */ +} + +/** Binary Logarithm (based on Stackoverflow's NBITSx macro). */ +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N) (0 != ((N) & 0x2/*0b10*/) ? 1 : 0) +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N) (0 != ((N) & 0xC/*0b1100*/) ? 
(2 | LIBXSMM_INTRINSICS_BITSCANBWD_SW02((N) >> 2)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW02(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N) (0 != ((N) & 0xF0/*0b11110000*/) ? (4 | LIBXSMM_INTRINSICS_BITSCANBWD_SW04((N) >> 4)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW04(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N) (0 != ((N) & 0xFF00) ? (8 | LIBXSMM_INTRINSICS_BITSCANBWD_SW08((N) >> 8)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW08(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N) (0 != ((N) & 0xFFFF0000) ? (16 | LIBXSMM_INTRINSICS_BITSCANBWD_SW16((N) >> 16)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW16(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD_SW64(N) (0 != ((N) & 0xFFFFFFFF00000000) ? (32 | LIBXSMM_INTRINSICS_BITSCANBWD_SW32((N) >> 32)) : LIBXSMM_INTRINSICS_BITSCANBWD_SW32(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD32_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW32((unsigned int)(N)) +#define LIBXSMM_INTRINSICS_BITSCANBWD64_SW(N) LIBXSMM_INTRINSICS_BITSCANBWD_SW64((unsigned long long)(N)) + +#if defined(_WIN32) && !defined(LIBXSMM_INTRINSICS_NONE) + LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD32(unsigned int n) { + unsigned long r = 0; _BitScanForward(&r, n); return (0 != n) * r; + } + LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD32(unsigned int n) { + unsigned long r = 0; _BitScanReverse(&r, n); return r; + } +# if defined(_WIN64) + LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANFWD64(unsigned long long n) { + unsigned long r = 0; _BitScanForward64(&r, n); return (0 != n) * r; + } + LIBXSMM_API_INLINE unsigned int LIBXSMM_INTRINSICS_BITSCANBWD64(unsigned long long n) { + unsigned long r = 0; _BitScanReverse64(&r, n); return r; + } +# else +# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW +# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW +# endif +#elif defined(__GNUC__) && !defined(LIBXSMM_INTRINSICS_NONE) +# define LIBXSMM_INTRINSICS_BITSCANFWD32(N) ((0 != (N)) * __builtin_ctz(N)) +# define 
LIBXSMM_INTRINSICS_BITSCANFWD64(N) ((0 != (N)) * __builtin_ctzll(N)) +# define LIBXSMM_INTRINSICS_BITSCANBWD32(N) ((0 != (N)) * (31 - __builtin_clz(N))) +# define LIBXSMM_INTRINSICS_BITSCANBWD64(N) ((0 != (N)) * (63 - __builtin_clzll(N))) +#else /* fallback implementation */ +# define LIBXSMM_INTRINSICS_BITSCANFWD32 LIBXSMM_INTRINSICS_BITSCANFWD32_SW +# define LIBXSMM_INTRINSICS_BITSCANFWD64 LIBXSMM_INTRINSICS_BITSCANFWD64_SW +# define LIBXSMM_INTRINSICS_BITSCANBWD32 LIBXSMM_INTRINSICS_BITSCANBWD32_SW +# define LIBXSMM_INTRINSICS_BITSCANBWD64 LIBXSMM_INTRINSICS_BITSCANBWD64_SW +#endif + +/** LIBXSMM_NBITS determines the minimum number of bits needed to represent N. */ +#define LIBXSMM_NBITS(N) (LIBXSMM_INTRINSICS_BITSCANBWD64(N) + LIBXSMM_MIN(1, N)) +#define LIBXSMM_ISQRT2(N) ((unsigned int)((1ULL << (LIBXSMM_NBITS(N) >> 1)) /*+ LIBXSMM_MIN(1, N)*/)) +/** LIBXSMM_ILOG2 definition matches ceil(log2(N)). */ +LIBXSMM_API_INLINE unsigned int LIBXSMM_ILOG2(unsigned long long n) { + unsigned int result = 0; if (1 < n) { + const unsigned int m = LIBXSMM_INTRINSICS_BITSCANBWD64(n); + result = m + ((unsigned int)LIBXSMM_INTRINSICS_BITSCANBWD64(n - 1) == m); + } return result; +} + +/** + * Target attribution + */ +#if !defined(LIBXSMM_INTRINSICS_KNC) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(__MIC__) +# define LIBXSMM_INTRINSICS_KNC +#endif +/** LIBXSMM_INTRINSICS_X86 is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_GENERIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_X86 +#endif +/** LIBXSMM_INTRINSICS_SSE3 is defined only if the compiler is able to generate this code without special flags. 
*/ +#if !defined(LIBXSMM_INTRINSICS_SSE3) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE3 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_SSE3 +#endif +/** LIBXSMM_INTRINSICS_SSE42 is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_SSE42) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_SSE42 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_SSE42 +#endif +/** LIBXSMM_INTRINSICS_AVX is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX +#endif +/** LIBXSMM_INTRINSICS_AVX2 is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX2 +#endif +/** LIBXSMM_INTRINSICS_AVX512 is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512 <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX512 +#endif +/** LIBXSMM_INTRINSICS_AVX512_MIC is defined only if the compiler is able to generate this code without special flags. 
*/ +#if !defined(LIBXSMM_INTRINSICS_AVX512_MIC) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_MIC <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_MIC <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX512_MIC +#endif +/** LIBXSMM_INTRINSICS_AVX512_KNM is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX512_KNM) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_KNM <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_KNM <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX512_KNM +#endif +/** LIBXSMM_INTRINSICS_AVX512_CORE is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX512_CORE +#endif +/** LIBXSMM_INTRINSICS_AVX512_CLX is defined only if the compiler is able to generate this code without special flags. */ +#if !defined(LIBXSMM_INTRINSICS_AVX512_CLX) && !defined(LIBXSMM_INTRINSICS_NONE) && (LIBXSMM_X86_AVX512_CLX <= LIBXSMM_STATIC_TARGET_ARCH || \ + (!defined(LIBXSMM_INTRINSICS_STATIC) && LIBXSMM_X86_AVX512_CLX <= LIBXSMM_MAX_STATIC_TARGET_ARCH)) +# define LIBXSMM_INTRINSICS_AVX512_CLX +#endif +/** LIBXSMM_INTRINSICS_AVX512_CPX is defined only if the compiler is able to generate this code without special flags. 
*/ +#if !defined(LIBXSMM_INTRINSICS_AVX512_CPX) && !defined(LIBXSMM_INTRINSICS_NONE) && defined(LIBXSMM_X86_AVX512_CPX) && \ + !defined(LIBXSMM_INTRINSICS_STATIC) && (LIBXSMM_X86_AVX512_CPX <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +# define LIBXSMM_INTRINSICS_AVX512_CPX +#endif /* CPX is only enabled through LIBXSMM_MAX_STATIC_TARGET_ARCH and never if LIBXSMM_INTRINSICS_STATIC is defined */ + +/** 2048-bit state for xoshiro128+ RNG (state/symbols needed even if AVX-512 is not used): four arrays of 16 unsigned int each, accessed as __m512i by LIBXSMM_INTRINSICS_MM512_RNG_STATE(0..3). */ +#define LIBXSMM_INTRINSICS_MM512_RNG_STATE(INDEX) (*(__m512i*)LIBXSMM_CONCATENATE(libxsmm_intrinsics_mm512_rng_state, INDEX)) +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state0[16]); +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state1[16]); +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state2[16]); +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_intrinsics_mm512_rng_state3[16]); + +/** + * Pseudo intrinsics (AVX-2) + */ +#if defined(LIBXSMM_INTRINSICS_AVX2) /*__AVX2__*/ +# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 +LIBXSMM_PRAGMA_OPTIMIZE_OFF /* avoid ICE in case of symbols (-g) */ +# endif +/** Generate random number in the interval [0, 1); thread-safe, state needs to be managed by user. + * this is based on xoshiro128+ 1.0, e.g.
http://prng.di.unimi.it/xoshiro128plus.c */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256i LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr) { + __m256i state_0 = _mm256_loadu_si256( (const __m256i*)stateptr ); /* eight 32-bit lanes per state word; the four words are 16 unsigned int apart */ + __m256i state_1 = _mm256_loadu_si256( (const __m256i*)(stateptr+16) ); + __m256i state_2 = _mm256_loadu_si256( (const __m256i*)(stateptr+32) ); + __m256i state_3 = _mm256_loadu_si256( (const __m256i*)(stateptr+48) ); + const __m256i result = _mm256_add_epi32(state_0, state_3); + const __m256i s = _mm256_slli_epi32(state_1, 9); + __m256i t; + state_2 = _mm256_xor_si256(state_2, state_0); + state_3 = _mm256_xor_si256(state_3, state_1); + state_1 = _mm256_xor_si256(state_1, state_2); + state_0 = _mm256_xor_si256(state_0, state_3); + state_2 = _mm256_xor_si256(state_2, s); + _mm256_storeu_si256( (__m256i*)stateptr , state_0 ); + _mm256_storeu_si256( (__m256i*)(stateptr+16), state_1 ); + _mm256_storeu_si256( (__m256i*)(stateptr+32), state_2 ); + t = _mm256_slli_epi32(state_3, 11); /* t | (state_3 >> 21) below is a rotate-left by 11 */ + state_3 = _mm256_or_si256(t, _mm256_srli_epi32(state_3, 32 - 11)); + _mm256_storeu_si256( (__m256i*)(stateptr+48), state_3 ); + return result; +} +/** Generate eight floats in the interval [0, 1): 23 random bits form the mantissa of a value in [1, 2) from which one is subtracted; the state at stateptr is advanced. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) __m256 LIBXSMM_INTRINSICS_MM256_RNG_EXTSTATE_PS(unsigned int* stateptr) { + const __m256i rng_mantissa = _mm256_srli_epi32( LIBXSMM_INTRINSICS_MM256_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr), 9 ); + const __m256 one = _mm256_set1_ps(1.0f); + return _mm256_sub_ps(_mm256_castsi256_ps(_mm256_or_si256(_mm256_set1_epi32(0x3f800000), rng_mantissa)), one); +} +# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 +LIBXSMM_PRAGMA_OPTIMIZE_ON +# endif +#endif /*__AVX2__*/ + +/** + * Pseudo intrinsics (AVX-512) + */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ +# define LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( A, B ) _mm512_cvtepi32_epi16(_mm512_cvt_roundps_epi32( \ +
_mm512_mul_ps(LIBXSMM_INTRINSICS_MM512_LOAD_PS(A), B), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) +/** Round FP32 lanes to nearest-even at BF16 precision: adds 0x7fff plus bit 16 of the input; lanes with an all-ones exponent (Inf/NaN) are passed through. The low 16 bits are not cleared. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(__m512 a) { + const __m512i vnaninf = _mm512_set1_epi32(0x7f800000), vrneadd = _mm512_set1_epi32(0x00007fff); + const __m512i vfixup = _mm512_set1_epi32(0x00000001), vfixupmask = _mm512_set1_epi32(0x00010000); + const __m512i mm512_roundbf16rne_a_ = _mm512_castps_si512(a); + const __mmask16 mm512_roundbf16rne_mask1_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vnaninf), vnaninf, _MM_CMPINT_NE); + const __mmask16 mm512_roundbf16rne_mask2_ = _mm512_cmp_epi32_mask(_mm512_and_epi32(mm512_roundbf16rne_a_, vfixupmask), vfixupmask, _MM_CMPINT_EQ); + return _mm512_mask_add_epi32(mm512_roundbf16rne_a_, mm512_roundbf16rne_mask1_, mm512_roundbf16rne_a_, _mm512_mask_add_epi32(vrneadd, mm512_roundbf16rne_mask2_, vrneadd, vfixup)); +} +/** Convert 16 FP32 values to BF16: each rounded lane (see LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16) is shifted right by 16 and narrowed to 16 bits. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m256i LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(__m512 a) { + return _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16)); +} +/** Convert two FP32 vectors into one vector of 32 BF16 values; note the order: b occupies the lower and a the upper 256 bits of the result. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16(__m512 a, __m512 b) { + const __m256i aa = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(b), 16)); + const __m256i bb = _mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(a), 16)); + return _mm512_inserti64x4(_mm512_inserti64x4(_mm512_setzero_si512(), aa, 0), bb, 1); +} +/** Expand 16 BF16 values to FP32: each 16-bit value is widened and shifted into the upper half of a 32-bit lane. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(__m256i a) { + return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(a),16)); +} + +/** SVML-intrinsics */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_78(__m512 x) { /* tanh(x) as rational function in x, saturated to +1/-1 for x beyond +4.97/-4.97 */ + const __m512 c0 =
_mm512_set1_ps(2027025.0f); + const __m512 c1 = _mm512_set1_ps(270270.0f); + const __m512 c2 = _mm512_set1_ps(6930.0f); + const __m512 c3 = _mm512_set1_ps(36.0f); + const __m512 c1_d = _mm512_set1_ps(945945.0f); + const __m512 c2_d = _mm512_set1_ps(51975.0f); + const __m512 c3_d = _mm512_set1_ps(630.0f); + const __m512 hi_bound = _mm512_set1_ps(4.97f); + const __m512 lo_bound = _mm512_set1_ps(-4.97f); + const __m512 ones = _mm512_set1_ps(1.0f); + const __m512 neg_ones = _mm512_set1_ps(-1.0f); + /* Horner scheme in x^2: nom = x*(((c3*x2 + c2)*x2 + c1)*x2 + c0), denom = (((x2 + c3_d)*x2 + c2_d)*x2 + c1_d)*x2 + c0 */ + const __m512 x2 = _mm512_mul_ps( x, x ); + const __m512 t1_nom = _mm512_fmadd_ps( c3, x2, c2 ); + const __m512 t2_nom = _mm512_fmadd_ps( t1_nom, x2, c1 ); + const __m512 t3_nom = _mm512_fmadd_ps( t2_nom, x2, c0 ); + const __m512 nom = _mm512_mul_ps( t3_nom, x ); + const __m512 t1_denom = _mm512_add_ps( x2, c3_d ); + const __m512 t2_denom = _mm512_fmadd_ps( t1_denom, x2, c2_d ); + const __m512 t3_denom = _mm512_fmadd_ps( t2_denom, x2, c1_d ); + const __m512 denom = _mm512_fmadd_ps( t3_denom, x2, c0 ); + const __m512 denom_rcp = _mm512_rcp14_ps( denom ); /* approximate reciprocal instead of a division */ + const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ); + const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ); + __m512 result = _mm512_mul_ps( nom, denom_rcp ); + result = _mm512_mask_blend_ps(mask_hi, result, ones); + result = _mm512_mask_blend_ps(mask_lo, result, neg_ones); + + return result; +} +/** tanh(x) approximated as x * (1 + x^2/27) / (1 + x^2/3) using an approximate reciprocal (rcp14); the result is set to +1 for x > 3.2 and to -1 for x < -3.2. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_RATIONAL_32(__m512 x) { + const __m512 c1 = _mm512_set1_ps((float)(1.0/27.0)); + const __m512 c2 = _mm512_set1_ps((float)(1.0/3)); + const __m512 hi_bound = _mm512_set1_ps(3.2f); + const __m512 lo_bound = _mm512_set1_ps(-3.2f); + const __m512 ones = _mm512_set1_ps(1.0f); + const __m512 neg_ones = _mm512_set1_ps(-1.0f); + + const __m512 x2 = _mm512_mul_ps( x, x ); + const __m512 t1_nom = _mm512_fmadd_ps( x2, c1, ones); + const __m512 nom = _mm512_mul_ps( t1_nom, x ); + const __m512 denom =
_mm512_fmadd_ps( x2, c2, ones); + const __m512 denom_rcp = _mm512_rcp14_ps( denom ); + const __mmask16 mask_hi = _mm512_cmp_ps_mask( x, hi_bound, _CMP_GT_OQ); + const __mmask16 mask_lo = _mm512_cmp_ps_mask( x, lo_bound, _CMP_LT_OQ); + __m512 result = _mm512_mul_ps(nom, denom_rcp); + result = _mm512_mask_blend_ps(mask_hi, result, ones); + result = _mm512_mask_blend_ps(mask_lo, result, neg_ones); + + return result; +} +/** tanh(x) computed as 1 - 2 / (e + 1) with e approximating exp(2x): e is a degree-2 polynomial in y (the remainder of t = 2*log2(e)*x + 0.5 after roundscale) scaled by scalef with t; the reciprocal is approximate (rcp14). */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2(__m512 _x) { + const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2)); + const __m512 half = _mm512_set1_ps(0.5f); + const __m512 c2 = _mm512_set1_ps(0.240226507f); + const __m512 c1 = _mm512_set1_ps(0.452920674f); + const __m512 c0 = _mm512_set1_ps(0.713483036f); + const __m512 ones = _mm512_set1_ps(1.0f); + const __m512 minus_twos = _mm512_set1_ps(-2.0f); + + const __m512 x = _mm512_fmadd_ps(_x, twice_log2_e, half); +#if 1 + const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); +#else + const __m512 y = _mm512_reduce_ps(x, 1); +#endif + const __m512 t1 = _mm512_fmadd_ps( y, c2, c1); + const __m512 two_to_y = _mm512_fmadd_ps( y, t1, c0); + const __m512 exp = _mm512_scalef_ps( two_to_y, x ); + const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) ); + __m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones); + + return result; +} +/** Same scheme as LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP2 (1 - 2 / (e + 1)), but e is built from a degree-3 polynomial in y. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_EXP3(__m512 _x) { + const __m512 twice_log2_e = _mm512_set1_ps((float)(1.442695*2)); + const __m512 half = _mm512_set1_ps(0.5f); + const __m512 c3 = _mm512_set1_ps(0.05550410866f); + const __m512 c2 = _mm512_set1_ps(0.15697034396f); + const __m512 c1 = _mm512_set1_ps(0.49454875509f); + const __m512 c0 = _mm512_set1_ps(0.70654502287f); + const __m512 ones = _mm512_set1_ps(1.0f); + const __m512 minus_twos = _mm512_set1_ps(-2.0f); + + const __m512 x =
_mm512_fmadd_ps(_x, twice_log2_e, half); +#if 1 + const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); +#else + const __m512 y = _mm512_reduce_ps(x, 1); +#endif + const __m512 t1 = _mm512_fmadd_ps( y, c3, c2); + const __m512 t2 = _mm512_fmadd_ps( y, t1, c1); + const __m512 two_to_y = _mm512_fmadd_ps( y, t2, c0); + const __m512 exp = _mm512_scalef_ps( two_to_y, x ); + const __m512 denom_rcp = _mm512_rcp14_ps( _mm512_add_ps( exp, ones) ); + __m512 result = _mm512_fmadd_ps( denom_rcp, minus_twos, ones); + + return result; +} +/** tanh(x) as table-driven quadratic p0 + |x|*(p1 + |x|*p2): the coefficients are permuted per lane out of 16-entry tables using the bits of |x| shifted right by 22 (clamped to [246, 261]) as index; the sign of x is re-applied at the end. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(__m512 x) { + __m512 result, func_p0, func_p1, func_p2; + const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 ); + const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF ); + const __m512i lut_low = _mm512_set1_epi32( 246 ); + const __m512i lut_high = _mm512_set1_epi32( 261 ); + const __m512 tanh_p0_2_reg = _mm512_set_ps( 0.40555000f, 0.11892800f, -0.00972979f, -0.02740300f, -0.0169851f, -0.00776152f, -0.00305889f, + -0.00116259f, -0.00041726f, -8.53233e-6f, 1.0000000f, 0.99999800f, 0.99975400f, 0.99268200f, + 0.93645300f, 0.73833900f); + const __m512 tanh_p1_2_reg = _mm512_set_ps( 0.495602f, 0.88152f, 1.125700000f, 1.17021000f, 1.1289000000f, 1.07929000f, 1.0432300f, 1.023010f, + 1.011620f, 1.00164f, 1.56828e-14f, 4.49924e-7f, 0.0000646924f, 0.00260405f, 0.0311608f, 0.168736f); + const __m512 tanh_p2_2_reg = _mm512_set_ps(-0.108238f, -0.2384280f, -0.354418000f, -0.38240300f, -0.34135700f, -0.274509000f, -0.20524900f, -0.1511960f, + -0.107635f, -0.0466868f, -3.60822e-16f, -2.05971e-8f, -4.24538e-6f, -0.000231709f, -0.00386434f, -0.0277702f); + /* split x into sign and magnitude; the index is derived from the magnitude's upper bits */ + const __m512i signs = _mm512_and_epi32(_mm512_castps_si512(x), sign_mask); + const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter); + __m512i indices = _mm512_srli_epi32(abs_arg, 22); + indices = _mm512_max_epi32(indices,
lut_low); + indices = _mm512_min_epi32(indices, lut_high); + + func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_2_reg); + func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_2_reg); + func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_2_reg); + /* Horner evaluation in |x|, then restore the sign by XOR-ing the sign bit */ + result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p2, func_p1); + result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0); + result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs)); + + return result; +} +/** tanh(x) as table-driven cubic p0 + |x|*(p1 + |x|*(p2 + |x|*p3)): the coefficients are permuted per lane out of 16-entry tables using the bits of |x| shifted right by 22 (clamped to [246, 261]) as index; the sign of x is re-applied at the end. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(__m512 x) { + __m512 result, func_p0, func_p1, func_p2, func_p3; + const __m512i sign_mask = _mm512_set1_epi32( 0x80000000 ); + const __m512i sign_filter = _mm512_set1_epi32( 0x7FFFFFFF ); + const __m512i lut_low = _mm512_set1_epi32( 246 ); + const __m512i lut_high = _mm512_set1_epi32( 261 ); + + const __m512 tanh_p0_3_reg = _mm512_setr_ps( 0.466283000f, 0.82850600f, 0.97437500f, 0.99882600f, 0.9999860f, 1.0000000f, -1.50006e-08f, -7.98169e-06f, + -4.53753e-05f, -0.00023755f, -0.00125285f, -0.00572314f, -0.0227717f, -0.0629089f, -0.084234300f, 0.071199800f); + const __m512 tanh_p1_3_reg = _mm512_setr_ps( 0.500617f, 0.124369f, 0.0137214f, 0.000464124f, 4.02465e-06f, 0.00000f, 1.00001f, 1.00028f, 1.00112f, 1.00414f, + 1.015570f, 1.050950f, 1.1478500f, 1.310130000f, 1.378950000f, 1.07407f); + const __m512 tanh_p2_3_reg = _mm512_setr_ps(-0.16133200f, -0.0305526f, -0.00245909f, -6.12647e-05f, -3.76127e-07f, 0.000000f, -0.000245872f, -0.00341151f, + -0.00971505f, -0.0256817f, -0.06869110f, -0.162433000f, -0.346828000f, -0.566516f, -0.640214000f, -0.44011900f); + const __m512 tanh_p3_3_reg = _mm512_setr_ps( 0.0177393f, 0.00253432f, 0.000147303f, 2.69963e-06f, 1.16764e-08f, 0.0000000f, -0.330125f, -0.3176210f, + -0.3017760f, -0.27358000f, -0.219375000f, -0.136197000f, -0.01868680f, 0.0808901f, 0.107095f, 0.0631459f); + + const __m512i signs =
_mm512_and_epi32(_mm512_castps_si512(x), sign_mask); + const __m512i abs_arg = _mm512_and_epi32(_mm512_castps_si512(x), sign_filter); + __m512i indices = _mm512_srli_epi32(abs_arg, 22); + indices = _mm512_max_epi32(indices, lut_low); + indices = _mm512_min_epi32(indices, lut_high); + + func_p0 = _mm512_permutexvar_ps(indices, tanh_p0_3_reg); + func_p1 = _mm512_permutexvar_ps(indices, tanh_p1_3_reg); + func_p2 = _mm512_permutexvar_ps(indices, tanh_p2_3_reg); + func_p3 = _mm512_permutexvar_ps(indices, tanh_p3_3_reg); + /* Horner evaluation in |x|, then restore the sign by XOR-ing the sign bit */ + result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), func_p3, func_p2); + result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p1); + result = _mm512_fmadd_ps(_mm512_castsi512_ps(abs_arg), result, func_p0); + result = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(result), signs)); + + return result; +} +/** GELU forward: x * (0.5 + xr * poly(|xr|)), where xr is x range-limited against thres (range_round with imm 2) and poly is a quadratic whose coefficients are permuted out of 16-entry tables using the bit pattern of |xr| * scale + shifter as index. */ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512DQ__ needed*/ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_FWD_PS_MINIMAX3(__m512 x) { + const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x40879fff)); + const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)); + const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x406a0ea1)); + const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000)); + const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000)); + const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbd877b85u, 0xbd7d9780u, 0xbd4cb70eu, 0xbd08a1e9u, 0xbc808857u, 0xb9476fd2u, 0x3c36f765u, 0x3c924160u, + 0x3ca7b1fcu, 0x3ca5732cu, 0x3c95af63u, 0x3c8079f7u, 0x3c55fa4fu, 0x3c2fa86bu, 0x3c0fbb00u, 0x3bec178cu)); + const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb7c7fb58u, 0xbacb9740u, 0xbc3e4b3au, 0xbd0d292au, 0xbd8bc5d0u, 0xbdd9978fu, 0xbe0f92d3u, 0xbe27b66du, + 0xbe328ce7u, 0xbe3125bfu, 0xbe26dc9du, 0xbe17a056u, 0xbe06bdebu, 0xbdecc593u, 0xbdcf57aau, 0xbdb5ea3au)); + const __m512 _c0 =
_mm512_castsi512_ps(_mm512_setr_epi32(0x3ecc4231u, 0x3ecc541cu, 0x3ecd6c48u, 0x3ed174c3u, 0x3ed9bd5du, 0x3ee5acd5u, 0x3ef2aeddu, 0x3efd5384u, + 0x3f016724u, 0x3f00f778u, 0x3efb389eu, 0x3ef0464du, 0x3ee3014fu, 0x3ed50a78u, 0x3ec779dbu, 0x3ebae363u)); + __m512 result; + __m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC); + __m512 xa = _mm512_and_ps(xr, absmask); + __m512 index = _mm512_fmadd_ps(xa, scale, shifter); /* only the bit pattern of this sum is used (as permute index) */ + __m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2); + __m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1); + __m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0); + __m512 poly = _mm512_fmadd_ps(c2, xa, c1); + poly = _mm512_fmadd_ps(poly, xa, c0); + result = _mm512_mul_ps(x, _mm512_fmadd_ps(poly, xr, half)); + + return result; +} +#endif /*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/ +/** GELU backward: 0.5 + xr * poly(|xr|), where xr is x range-limited against thres (range_round with imm 2) and poly is a quadratic whose coefficients are permuted out of 16-entry tables using the bit pattern of |xr| * scale + shifter as index. */ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512DQ__ needed*/ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) __m512 LIBXSMM_INTRINSICS_MM512_GELU_BWD_PS_MINIMAX3(__m512 x) { + const __m512 thres = _mm512_castsi512_ps(_mm512_set1_epi32(0x408f5fff)); + const __m512 absmask = _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)); + const __m512 scale = _mm512_castsi512_ps(_mm512_set1_epi32(0x405d67c9)); + const __m512 shifter = _mm512_castsi512_ps(_mm512_set1_epi32(0x4b400000)); + const __m512 half = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f000000)); + const __m512 _c2 = _mm512_castsi512_ps(_mm512_setr_epi32(0xbe87047bu, 0xbe6eb875u, 0xbe2210c1u, 0xbd81727fu, 0x3cb9625cu, 0x3da2cbe8u, 0x3dd1d4d1u, 0x3dca0bd0u, + 0x3da47dd0u, 0x3d6f1bd3u, 0x3d216381u, 0x3cd2618cu, 0x3c89f6e6u, 0x3c3ca672u, 0x3c08ed08u, 0x3bd26a14u)); + const __m512 _c1 = _mm512_castsi512_ps(_mm512_setr_epi32(0xb930e738u, 0xbc4b28bau, 0xbda4212fu, 0xbe5feb0eu, 0xbec8b0e5u, 0xbf09e61bu, 0xbf1c403fu, 0xbf185954u, + 0xbf03e1eeu, 0xbed08a61u, 0xbe9b4508u, 0xbe61788bu, 0xbe257770u, 0xbdfc542au, 0xbdca014eu, 0xbda8d7e9u)); + const __m512
_c0 = _mm512_castsi512_ps(_mm512_setr_epi32(0x3f4c4245u, 0x3f4c927bu, 0x3f5085f8u, 0x3f5d7bdau, 0x3f73ea12u, 0x3f86142fu, 0x3f8d3df4u, 0x3f8b4b0fu, + 0x3f8022c8u, 0x3f5e5423u, 0x3f39ceb5u, 0x3f199bedu, 0x3f00bee0u, 0x3ede1737u, 0x3ec59b86u, 0x3eb4454cu)); + __m512 result; + __m512 xr = _mm512_range_round_ps(x, thres, 2, _MM_FROUND_NO_EXC); + __m512 xa = _mm512_and_ps(xr, absmask); + __m512 index = _mm512_fmadd_ps(xa, scale, shifter); /* only the bit pattern of this sum is used (as permute index) */ + __m512 c2 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c2); + __m512 c1 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c1); + __m512 c0 = _mm512_permutexvar_ps(_mm512_castps_si512(index), _c0); + __m512 poly = _mm512_fmadd_ps(c2, xa, c1); + poly = _mm512_fmadd_ps(poly, xa, c0); + result = _mm512_fmadd_ps(poly, xr, half); + + return result; +} +#endif /*defined(LIBXSMM_INTRINSICS_AVX512_CORE)*/ +/** GELU forward based on tanh: 0.5 * x * (1 + tanh(x * (0.79788 + 0.03568 * x^2))), with tanh taken from LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_FWD(__m512 x) { + const __m512 c1 = _mm512_set1_ps( (float)0.79788); + const __m512 c2 = _mm512_set1_ps( (float)0.03568); + const __m512 c_half = _mm512_set1_ps( (float)0.5); + + __m512 x_half = _mm512_mul_ps( x, c_half ); + __m512 x_sq = _mm512_mul_ps( x, x ); + __m512 poly_x1 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c2, c1)); + __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(poly_x1); + __m512 output = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); + + return output; +} +/** GELU backward based on tanh: with t = tanh(x * (0.79788 + 0.03568 * x^2)) and q = x * (0.39894 + 0.05352 * x^2), the result is (1 + t) * (0.5 + q - q * t); tanh is taken from LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_TANH_PS_GELU_BWD(__m512 x) { + const __m512 c1 = _mm512_set1_ps( (float)0.79788); + const __m512 c2 = _mm512_set1_ps( (float)0.03568); + const __m512 c3 = _mm512_set1_ps( (float)0.05352); + const __m512 c4 = _mm512_set1_ps( (float)0.39894); + const __m512 c_half = _mm512_set1_ps( (float)0.5); + const __m512 c_ones = _mm512_set1_ps( (float)1.0); + const __m512 c_minus_1 = _mm512_set1_ps( (float)-1.0); + + __m512 x_sq = _mm512_mul_ps( x, x ); + __m512
poly_x1 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c2, c1)); + __m512 poly_x2 = _mm512_mul_ps(x, _mm512_fmadd_ps( x_sq, c3, c4)); + + __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(poly_x1); + __m512 out1 = _mm512_add_ps(c_ones, tanh_poly_x); + __m512 out2 = _mm512_add_ps(c_half, poly_x2); + __m512 out3 = _mm512_fmsub_ps(poly_x2, tanh_poly_x, out2); /* q*t - (0.5 + q), negated by the next statement */ + __m512 out4 = _mm512_mul_ps(c_minus_1, out3); + __m512 output = _mm512_mul_ps(out1, out4); + + return output; +} +/** exp(in) approximated with t = log2(e) * in + 0.5: a degree-2 polynomial in y (the remainder of t after roundscale) is scaled by scalef with t. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_2DTS(__m512 in) { + const __m512 log2_e = _mm512_set1_ps(1.442695f); + const __m512 half = _mm512_set1_ps(0.5f); + const __m512 c2 = _mm512_set1_ps(0.240226507f); + const __m512 c1 = _mm512_set1_ps(0.452920674f); + const __m512 c0 = _mm512_set1_ps(0.713483036f); + + const __m512 x = _mm512_fmadd_ps(in, log2_e, half); +#if 1 + const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); +#else + const __m512 y = _mm512_reduce_ps(x, 1); +#endif + const __m512 t1 = _mm512_fmadd_ps( y, c2, c1); + const __m512 two_to_y = _mm512_fmadd_ps( y, t1, c0); + const __m512 exp = _mm512_scalef_ps( two_to_y, x ); + + return exp; +} +/** exp(in) approximated like LIBXSMM_INTRINSICS_MM512_EXP_PS_2DTS, but with a degree-3 polynomial in y. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_EXP_PS_3DTS(__m512 in) { + const __m512 log2_e = _mm512_set1_ps(1.442695f); + const __m512 half = _mm512_set1_ps(0.5f); + const __m512 c3 = _mm512_set1_ps(0.05550410866f); + const __m512 c2 = _mm512_set1_ps(0.15697034396f); + const __m512 c1 = _mm512_set1_ps(0.49454875509f); + const __m512 c0 = _mm512_set1_ps(0.70654502287f); + + const __m512 x = _mm512_fmadd_ps(in, log2_e, half); +#if 1 + const __m512 y = _mm512_sub_ps(x, _mm512_roundscale_round_ps(x, 1, _MM_FROUND_CUR_DIRECTION)); +#else + const __m512 y = _mm512_reduce_ps(x, 1); +#endif + const __m512 t1 = _mm512_fmadd_ps( y, c3, c2); + const __m512 t2 = _mm512_fmadd_ps( y, t1, c1); + const __m512
two_to_y = _mm512_fmadd_ps( y, t2, c0); + const __m512 exp = _mm512_scalef_ps( two_to_y, x ); + + return exp; +} + +# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 +LIBXSMM_PRAGMA_OPTIMIZE_OFF /* avoid ICE in case of symbols (-g) */ +# endif +/** Generate 16 random 32-bit integers (the basis for random numbers in the interval [0, 1)); not thread-safe since LIBXSMM_INTRINSICS_MM512_RNG_STATE(0..3) is updated in place. + * this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(void) { + const __m512i result = _mm512_add_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); + const __m512i s = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), 9); + __m512i t; + LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), LIBXSMM_INTRINSICS_MM512_RNG_STATE(0)); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), LIBXSMM_INTRINSICS_MM512_RNG_STATE(1)); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(1), LIBXSMM_INTRINSICS_MM512_RNG_STATE(2)); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(0), LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_xor_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(2), s); + t = _mm512_slli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 11); /* t | (state >> 21) below is a rotate-left by 11 */ + LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_or_epi32(t, _mm512_srli_epi32(LIBXSMM_INTRINSICS_MM512_RNG_STATE(3), 32 - 11)); + return result; +} +/** Generate 16 floats in the interval [0, 1): 23 random bits form the mantissa of a value in [1, 2) from which one is subtracted; not thread-safe (uses LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32). */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_PS(void) { + const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EPI32(), 9 ); + const __m512 one = _mm512_set1_ps(1.0f); + return
_mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one); +} + +/** Generate 16 random 32-bit integers (the basis for random numbers in the interval [0, 1)); thread-safe, state needs to be managed by user (64 unsigned int at stateptr: four blocks of 16). + * this is based on xoshiro128+ 1.0, e.g. http://prng.di.unimi.it/xoshiro128plus.c */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512i LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(unsigned int* stateptr) { + __m512i state_0 = _mm512_loadu_si512( stateptr ); + __m512i state_1 = _mm512_loadu_si512( stateptr+16 ); + __m512i state_2 = _mm512_loadu_si512( stateptr+32 ); + __m512i state_3 = _mm512_loadu_si512( stateptr+48 ); + const __m512i result = _mm512_add_epi32(state_0, state_3); + const __m512i s = _mm512_slli_epi32(state_1, 9); + __m512i t; + state_2 = _mm512_xor_epi32(state_2, state_0); + state_3 = _mm512_xor_epi32(state_3, state_1); + state_1 = _mm512_xor_epi32(state_1, state_2); + state_0 = _mm512_xor_epi32(state_0, state_3); + state_2 = _mm512_xor_epi32(state_2, s); + _mm512_storeu_si512( stateptr , state_0 ); + _mm512_storeu_si512( stateptr+16, state_1 ); + _mm512_storeu_si512( stateptr+32, state_2 ); + t = _mm512_slli_epi32(state_3, 11); /* t | (state_3 >> 21) below is a rotate-left by 11 */ + state_3 = _mm512_or_epi32(t, _mm512_srli_epi32(state_3, 32 - 11)); + _mm512_storeu_si512( stateptr+48, state_3 ); + return result; +} +/** Generate 16 floats in the interval [0, 1) from the state at stateptr: 23 random bits form the mantissa of a value in [1, 2) from which one is subtracted. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) __m512 LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS(unsigned int* stateptr) { + const __m512i rng_mantissa = _mm512_srli_epi32( LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32(stateptr), 9 ); + const __m512 one = _mm512_set1_ps(1.0f); + return _mm512_sub_ps(_mm512_castsi512_ps(_mm512_or_epi32(_mm512_set1_epi32(0x3f800000), rng_mantissa)), one); +} +# if defined(__GNUC__) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) && !defined(_CRAYC) && 0 +LIBXSMM_PRAGMA_OPTIMIZE_ON +# endif +#endif /*__AVX512F__*/ + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif +
+#endif /*LIBXSMM_INTRINSICS_X86_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_macros.h b/third_party/libxsmm/include/libxsmm_macros.h new file mode 100644 index 00000000..43f3f0d5 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_macros.h @@ -0,0 +1,983 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MACROS_H +#define LIBXSMM_MACROS_H + +#include "libxsmm_config.h" + +/** Parameters the library was built for. */ +#define LIBXSMM_CACHELINE LIBXSMM_CONFIG_CACHELINE +#define LIBXSMM_ALIGNMENT LIBXSMM_CONFIG_ALIGNMENT +#define LIBXSMM_MALLOC LIBXSMM_CONFIG_MALLOC +#define LIBXSMM_ILP64 LIBXSMM_CONFIG_ILP64 +#define LIBXSMM_SYNC LIBXSMM_CONFIG_SYNC +#define LIBXSMM_JIT LIBXSMM_CONFIG_JIT + +/** Parameters of GEMM domain (static kernels, etc). */ +#define LIBXSMM_PREFETCH LIBXSMM_CONFIG_PREFETCH +#define LIBXSMM_MAX_MNK LIBXSMM_CONFIG_MAX_MNK +#define LIBXSMM_MAX_DIM LIBXSMM_CONFIG_MAX_DIM +#define LIBXSMM_MAX_M LIBXSMM_CONFIG_MAX_M +#define LIBXSMM_MAX_N LIBXSMM_CONFIG_MAX_N +#define LIBXSMM_MAX_K LIBXSMM_CONFIG_MAX_K +#define LIBXSMM_FLAGS LIBXSMM_CONFIG_FLAGS +#define LIBXSMM_ALPHA LIBXSMM_CONFIG_ALPHA +#define LIBXSMM_BETA LIBXSMM_CONFIG_BETA + +/** + * Use "make PLATFORM=1" to disable platform checks. + * The platform check is to bail-out with an error + * message for an attempt to build an upstream package + * and subsequently to list LIBXSMM as "broken" on + * that platform. 
+ * Note: successful compilation on an unsupported + * platform is desired, but only fallback code is + * present at best. + */ +#if !defined(LIBXSMM_PLATFORM_FORCE) && 0 +# define LIBXSMM_PLATFORM_FORCE +#endif +/** Platform detection: LIBXSMM_PLATFORM_X86 (64-bit or 32-bit x86) and LIBXSMM_PLATFORM_AARCH64; any other platform is an error unless LIBXSMM_PLATFORM_FORCE is defined. */ +#if !defined(LIBXSMM_PLATFORM_X86) && ( \ + (defined(__x86_64__) && 0 != (__x86_64__)) || \ + (defined(__amd64__) && 0 != (__amd64__)) || \ + (defined(_M_X64) || defined(_M_AMD64)) || \ + (defined(__i386__) && 0 != (__i386__)) || \ + (defined(_M_IX86))) +# define LIBXSMM_PLATFORM_X86 +#endif +#if !defined(LIBXSMM_PLATFORM_AARCH64) && \ + (defined(__aarch64__) || defined(__arm64__)) +# define LIBXSMM_PLATFORM_AARCH64 +#endif +#if !defined(LIBXSMM_PLATFORM_SUPPORTED) +# if defined(LIBXSMM_PLATFORM_X86) || defined(LIBXSMM_PLATFORM_AARCH64) +# define LIBXSMM_PLATFORM_SUPPORTED +# elif !defined(LIBXSMM_PLATFORM_FORCE) +# error LIBXSMM requires X86_64, AArch64, or compatible CPUs! +# endif +#endif +#if !defined(LIBXSMM_BITS) /* 64-bit targets yield LIBXSMM_BITS=64; otherwise an error for NDEBUG builds (unless LIBXSMM_PLATFORM_FORCE), else LIBXSMM_BITS=32 */ +# if (defined(__SIZEOF_PTRDIFF_T__) && 4 < (__SIZEOF_PTRDIFF_T__)) || \ + (defined(__SIZE_MAX__) && (4294967295U < (__SIZE_MAX__))) || \ + (defined(__x86_64__) && 0 != (__x86_64__)) || \ + (defined(__amd64__) && 0 != (__amd64__)) || \ + (defined(_M_X64) || defined(_M_AMD64)) || \ + (defined(_WIN64)) || \ + (defined(__powerpc64)) || \ + (defined(__aarch64__)) +# define LIBXSMM_UNLIMITED 0xFFFFFFFFFFFFFFFF +# define LIBXSMM_BITS 64 +# elif !defined(LIBXSMM_PLATFORM_FORCE) && defined(NDEBUG) +# error LIBXSMM is only supported on 64-bit platforms! +# else /* JIT-generated code (among other issues) is not supported!
*/ +# define LIBXSMM_UNLIMITED 0xFFFFFFFF +# define LIBXSMM_BITS 32 +# endif +#endif +/** Stringification and token concatenation; the two-level definitions let macro arguments expand before # or ## is applied. */ +#define LIBXSMM_STRINGIFY2(SYMBOL) #SYMBOL +#define LIBXSMM_STRINGIFY(SYMBOL) LIBXSMM_STRINGIFY2(SYMBOL) +#define LIBXSMM_TOSTRING(SYMBOL) LIBXSMM_STRINGIFY(SYMBOL) +#define LIBXSMM_CONCATENATE2(A, B) A##B +#define LIBXSMM_CONCATENATE3(A, B, C) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE(A, B), C) +#define LIBXSMM_CONCATENATE4(A, B, C, D) LIBXSMM_CONCATENATE(LIBXSMM_CONCATENATE3(A, B, C), D) +#define LIBXSMM_CONCATENATE(A, B) LIBXSMM_CONCATENATE2(A, B) +#define LIBXSMM_FSYMBOL(SYMBOL) LIBXSMM_CONCATENATE(SYMBOL, _) +#define LIBXSMM_UNIQUE(NAME) LIBXSMM_CONCATENATE(NAME, __LINE__) +#define LIBXSMM_EXPAND(...) __VA_ARGS__ +#define LIBXSMM_ELIDE(...) + +/** + * Check given value against type-range (assertion); the checks are dummies (0) if NDEBUG is defined. + * Note: allows "-1" for unsigned types. + */ +#if !defined(NDEBUG) +# define LIBXSMM_CHECK_ULLONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULLONG_MAX) +# define LIBXSMM_CHECK_LLONG(VALUE) assert(LLONG_MIN <= (VALUE) && (VALUE) <= LLONG_MAX) +# define LIBXSMM_CHECK_ULONG(VALUE) assert(-1 <= (VALUE) && (VALUE) <= ULONG_MAX) +# define LIBXSMM_CHECK_LONG(VALUE) assert(LONG_MIN <= (VALUE) && (VALUE) <= LONG_MAX) +# define LIBXSMM_CHECK_USHORT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= USHRT_MAX) +# define LIBXSMM_CHECK_SHORT(VALUE) assert(SHRT_MIN <= (VALUE) && (VALUE) <= SHRT_MAX) +# define LIBXSMM_CHECK_UCHAR(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UCHAR_MAX) +# define LIBXSMM_CHECK_ICHAR(VALUE) assert(SCHAR_MIN <= (VALUE) && (VALUE) <= SCHAR_MAX) +# define LIBXSMM_CHECK_UINT(VALUE) assert(-1 <= (VALUE) && (VALUE) <= UINT_MAX) +# define LIBXSMM_CHECK_INT(VALUE) assert(INT_MIN <= (VALUE) && (VALUE) <= INT_MAX) +#else +# define LIBXSMM_CHECK_ULLONG(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_LLONG(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_ULONG(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_LONG(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_USHORT(VALUE) 0/*dummy*/ +# define
LIBXSMM_CHECK_SHORT(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_UCHAR(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_ICHAR(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_UINT(VALUE) 0/*dummy*/ +# define LIBXSMM_CHECK_INT(VALUE) 0/*dummy*/ +#endif + +/** + * Perform verbose type-cast with following two advantages: + * (1) Make it easy to locate/find the type-cast. + * (2) Range-check to ensure fitting into type. + */ +#define LIBXSMM_CAST_ULLONG(VALUE) (LIBXSMM_CHECK_ULLONG(VALUE), (unsigned long long)(VALUE)) +#define LIBXSMM_CAST_LLONG(VALUE) (LIBXSMM_CHECK_LLONG(VALUE), (/*signed*/long long)(VALUE)) +#define LIBXSMM_CAST_ULONG(VALUE) (LIBXSMM_CHECK_ULONG(VALUE), (unsigned long)(VALUE)) +#define LIBXSMM_CAST_LONG(VALUE) (LIBXSMM_CHECK_LONG(VALUE), (/*signed*/long)(VALUE)) +#define LIBXSMM_CAST_USHORT(VALUE) (LIBXSMM_CHECK_USHORT(VALUE), (unsigned short)(VALUE)) +#define LIBXSMM_CAST_SHORT(VALUE) (LIBXSMM_CHECK_SHORT(VALUE), (/*signed*/short)(VALUE)) +#define LIBXSMM_CAST_UCHAR(VALUE) (LIBXSMM_CHECK_UCHAR(VALUE), (unsigned char)(VALUE)) +#define LIBXSMM_CAST_ICHAR(VALUE) (LIBXSMM_CHECK_ICHAR(VALUE), (signed char)(VALUE)) +#define LIBXSMM_CAST_UINT(VALUE) (LIBXSMM_CHECK_UINT(VALUE), (unsigned int)(VALUE)) +#define LIBXSMM_CAST_INT(VALUE) (LIBXSMM_CHECK_INT(VALUE), (/*signed*/int)(VALUE)) + +/** Use LIBXSMM_VERSION2 instead of LIBXSMM_VERSION3, e.g., if __GNUC_PATCHLEVEL__ or __clang_patchlevel__ is zero (0). 
*/ +#define LIBXSMM_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100) +#define LIBXSMM_VERSION3(MAJOR, MINOR, UPDATE) (LIBXSMM_VERSION2(MAJOR, MINOR) + (UPDATE)) +#define LIBXSMM_VERSION4(MAJOR, MINOR, UPDATE, PATCH) \ + (((0x7F & (MAJOR)) << 24) | ((0x1F & (MINOR)) << 19) | ((0x1F & (UPDATE)) << 14) | (0x3FFF & (PATCH))) +#define LIBXSMM_VERSION41(VERSION) (((VERSION) >> 24)) +#define LIBXSMM_VERSION42(VERSION) (((VERSION) >> 19) & 0x1F) +#define LIBXSMM_VERSION43(VERSION) (((VERSION) >> 14) & 0x1F) +#define LIBXSMM_VERSION44(VERSION) (((VERSION)) & 0x3FFF) + +#if !defined(LIBXSMM_UNPACKED) && (defined(_CRAYC) || defined(LIBXSMM_OFFLOAD_BUILD) || \ + (0 == LIBXSMM_SYNC)/*Windows: missing pack(pop) error*/) +# define LIBXSMM_UNPACKED +#endif +#if defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__) +# define LIBXSMM_ATTRIBUTE(A) __declspec(A) +# if defined(__cplusplus) +# define LIBXSMM_INLINE_ALWAYS __forceinline +# else +# define LIBXSMM_INLINE_ALWAYS static __forceinline +# endif +# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(align(N)) DECL +# if !defined(LIBXSMM_UNPACKED) +# define LIBXSMM_PACKED(TYPE) LIBXSMM_PRAGMA(pack(1)) TYPE +# endif +# define LIBXSMM_CDECL __cdecl +#elif (defined(__GNUC__) || defined(__clang__) || defined(__PGI)) +# define LIBXSMM_ATTRIBUTE(A) __attribute__((A)) +# define LIBXSMM_INLINE_ALWAYS LIBXSMM_ATTRIBUTE(always_inline) LIBXSMM_INLINE +# define LIBXSMM_ALIGNED(DECL, N) LIBXSMM_ATTRIBUTE(aligned(N)) DECL +# if !defined(LIBXSMM_UNPACKED) +# define LIBXSMM_PACKED(TYPE) TYPE LIBXSMM_ATTRIBUTE(__packed__) +# endif +# define LIBXSMM_CDECL LIBXSMM_ATTRIBUTE(cdecl) +#else +# define LIBXSMM_ATTRIBUTE(A) +# define LIBXSMM_INLINE_ALWAYS LIBXSMM_INLINE +# define LIBXSMM_ALIGNED(DECL, N) DECL +# define LIBXSMM_CDECL +#endif +#if !defined(LIBXSMM_PACKED) +# define LIBXSMM_PACKED(TYPE) TYPE +# if !defined(LIBXSMM_UNPACKED) +# define LIBXSMM_UNPACKED +# endif +#endif +#if !defined(LIBXSMM_UNPACKED) && 0 +/* no braces around 
EXPR */ +# define LIBXSMM_PAD(EXPR) EXPR; +#endif +#if !defined(LIBXSMM_PAD) +# define LIBXSMM_PAD(EXPR) +#endif + +#if defined(__INTEL_COMPILER) +# if !defined(__INTEL_COMPILER_UPDATE) +# define LIBXSMM_INTEL_COMPILER __INTEL_COMPILER +# else +# define LIBXSMM_INTEL_COMPILER (__INTEL_COMPILER + __INTEL_COMPILER_UPDATE) +# endif +#elif defined(__INTEL_COMPILER_BUILD_DATE) +# define LIBXSMM_INTEL_COMPILER ((__INTEL_COMPILER_BUILD_DATE / 10000 - 2000) * 100) +#endif + +/* LIBXSMM_ATTRIBUTE_USED: mark library functions as used to avoid warning */ +#if defined(__GNUC__) || defined(__clang__) || (defined(__INTEL_COMPILER) && !defined(_WIN32)) +# if !defined(__cplusplus) || !defined(__clang__) +# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(common) +# else +# define LIBXSMM_ATTRIBUTE_COMMON +# endif +# define LIBXSMM_ATTRIBUTE_MALLOC LIBXSMM_ATTRIBUTE(malloc) +# define LIBXSMM_ATTRIBUTE_UNUSED LIBXSMM_ATTRIBUTE(unused) +# define LIBXSMM_ATTRIBUTE_USED LIBXSMM_ATTRIBUTE(used) +#else +# if defined(_WIN32) +# define LIBXSMM_ATTRIBUTE_COMMON LIBXSMM_ATTRIBUTE(selectany) +# else +# define LIBXSMM_ATTRIBUTE_COMMON +# endif +# define LIBXSMM_ATTRIBUTE_MALLOC +# define LIBXSMM_ATTRIBUTE_UNUSED +# define LIBXSMM_ATTRIBUTE_USED +#endif +#if defined(__clang__) && !defined(__INTEL_COMPILER) +# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(no_sanitize(LIBXSMM_STRINGIFY(KIND))) +#elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 8) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) \ + && !defined(__INTEL_COMPILER) +# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) LIBXSMM_ATTRIBUTE(LIBXSMM_CONCATENATE(no_sanitize_, KIND)) +#else +# define LIBXSMM_ATTRIBUTE_NO_SANITIZE(KIND) +#endif + +#if defined(__cplusplus) +# define LIBXSMM_VARIADIC ... 
+# define LIBXSMM_EXTERN extern "C" +# define LIBXSMM_EXTERN_C extern "C" +# define LIBXSMM_INLINE_KEYWORD inline +# define LIBXSMM_INLINE LIBXSMM_INLINE_KEYWORD +# if defined(__GNUC__) || defined(_CRAYC) +# define LIBXSMM_CALLER __PRETTY_FUNCTION__ +# elif defined(_MSC_VER) +# define LIBXSMM_CALLER __FUNCDNAME__ +# define LIBXSMM_FUNCNAME __FUNCTION__ +# else /* fallback: predefined identifier standardized by C++11 (same as used by the C99 branch) */ +# define LIBXSMM_CALLER __func__ +# endif +#else /* C */ +# define LIBXSMM_VARIADIC +# define LIBXSMM_EXTERN extern +# define LIBXSMM_EXTERN_C +# if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/ +# define LIBXSMM_PRAGMA(DIRECTIVE) _Pragma(LIBXSMM_STRINGIFY(DIRECTIVE)) +# define LIBXSMM_CALLER __func__ +# define LIBXSMM_RESTRICT restrict +# define LIBXSMM_INLINE_KEYWORD inline +# elif defined(_MSC_VER) +# define LIBXSMM_CALLER __FUNCDNAME__ +# define LIBXSMM_FUNCNAME __FUNCTION__ +# define LIBXSMM_INLINE_KEYWORD __inline +# define LIBXSMM_INLINE_FIXUP +# elif defined(__GNUC__) && !defined(__STRICT_ANSI__) +# define LIBXSMM_CALLER __PRETTY_FUNCTION__ +# endif +# if !defined(LIBXSMM_INLINE_KEYWORD) +# define LIBXSMM_INLINE_KEYWORD +# define LIBXSMM_INLINE_FIXUP +# endif +/* LIBXSMM_ATTRIBUTE_USED: increases compile-time of header-only by a large factor */ +# define LIBXSMM_INLINE static LIBXSMM_INLINE_KEYWORD LIBXSMM_ATTRIBUTE_UNUSED +#endif /*__cplusplus*/ +#if !defined(LIBXSMM_CALLER) +# define LIBXSMM_CALLER NULL +#endif +#if !defined(LIBXSMM_FUNCNAME) +# define LIBXSMM_FUNCNAME LIBXSMM_CALLER +#endif +#if !defined(LIBXSMM_CALLER_ID) +# if defined(__GNUC__) || 1 +# define LIBXSMM_CALLER_ID ((const void*)((uintptr_t)libxsmm_hash_string(LIBXSMM_CALLER))) +# else /* assume no string-pooling (perhaps unsafe) */ +# define LIBXSMM_CALLER_ID LIBXSMM_CALLER +# endif +#endif + +#if defined(LIBXSMM_OFFLOAD_BUILD) && \ + defined(__INTEL_OFFLOAD) && (!defined(_WIN32) || (1400 <= LIBXSMM_INTEL_COMPILER)) +# define LIBXSMM_OFFLOAD(A) LIBXSMM_ATTRIBUTE(target(A)) +# define LIBXSMM_NO_OFFLOAD(RTYPE,
FN, ...) ((RTYPE (*)(LIBXSMM_VARIADIC))(FN))(__VA_ARGS__) +# if !defined(LIBXSMM_OFFLOAD_TARGET) +# define LIBXSMM_OFFLOAD_TARGET mic +# endif +#else +# define LIBXSMM_OFFLOAD(A) +# define LIBXSMM_NO_OFFLOAD(RTYPE, FN, ...) (FN)(__VA_ARGS__) +#endif +#define LIBXSMM_RETARGETABLE LIBXSMM_OFFLOAD(LIBXSMM_OFFLOAD_TARGET) + +#if !defined(__STATIC) && !defined(_WINDLL) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)) +# define __STATIC +#endif + +/* may include Clang and other compatible compilers */ +#if defined(__GNUC__) && !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__) +# define LIBXSMM_VISIBILITY_INTERNAL LIBXSMM_ATTRIBUTE(visibility("internal")) +# define LIBXSMM_VISIBILITY_HIDDEN LIBXSMM_ATTRIBUTE(visibility("hidden")) +# define LIBXSMM_VISIBILITY_PUBLIC LIBXSMM_ATTRIBUTE(visibility("default")) +#endif +#if !defined(LIBXSMM_VISIBILITY_INTERNAL) +# define LIBXSMM_VISIBILITY_INTERNAL +#endif +#if !defined(LIBXSMM_VISIBILITY_HIDDEN) +# define LIBXSMM_VISIBILITY_HIDDEN +#endif +#if !defined(LIBXSMM_VISIBILITY_PUBLIC) +# define LIBXSMM_VISIBILITY_PUBLIC +#endif +#if !defined(LIBXSMM_VISIBILITY_PRIVATE) +# define LIBXSMM_VISIBILITY_PRIVATE LIBXSMM_VISIBILITY_HIDDEN +#endif + +/* Windows Dynamic Link Library (DLL) */ +#if !defined(__STATIC) && (defined(_WIN32) || defined(__CYGWIN__) || defined(__MINGW32__)) +# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_ATTRIBUTE(dllexport) +# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_ATTRIBUTE(dllimport) +#endif +#if !defined(LIBXSMM_VISIBILITY_EXPORT) +# define LIBXSMM_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_PUBLIC +#endif +#if !defined(LIBXSMM_VISIBILITY_IMPORT) +# define LIBXSMM_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_PUBLIC +#endif + +#if defined(LIBXSMM_SOURCE_H) /* header-only mode */ +# define LIBXSMM_API_VISIBILITY_EXPORT +# define LIBXSMM_API_VISIBILITY_IMPORT +# define LIBXSMM_API_VISIBILITY_INTERN +# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE LIBXSMM_ATTRIBUTE_COMMON +# define 
LIBXSMM_API_TARGET LIBXSMM_API_INLINE +# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN_C +#else /* classic ABI */ +# if defined(LIBXSMM_BUILD_EXT) +# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT +# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_EXPORT +# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE +# elif defined(LIBXSMM_BUILD) +# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_EXPORT +# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT +# define LIBXSMM_API_VISIBILITY_INTERN LIBXSMM_VISIBILITY_PRIVATE +# else /* import */ +# define LIBXSMM_API_VISIBILITY_EXPORT LIBXSMM_VISIBILITY_IMPORT +# define LIBXSMM_API_VISIBILITY_IMPORT LIBXSMM_VISIBILITY_IMPORT +# define LIBXSMM_API_VISIBILITY_INTERN +# endif +# define LIBXSMM_API_COMMON LIBXSMM_RETARGETABLE +# define LIBXSMM_API_TARGET LIBXSMM_RETARGETABLE +# define LIBXSMM_API_EXTERN LIBXSMM_EXTERN +#endif + +#define LIBXSMM_API_VISIBILITY(VISIBILITY) LIBXSMM_CONCATENATE(LIBXSMM_API_VISIBILITY_, VISIBILITY) +#define LIBXSMM_APIVAR(DECL, VISIBILITY, EXTERN) EXTERN LIBXSMM_API_COMMON LIBXSMM_API_VISIBILITY(VISIBILITY) DECL +#define LIBXSMM_API_INLINE LIBXSMM_INLINE LIBXSMM_RETARGETABLE +#define LIBXSMM_API_DEF + +#if (!defined(__INTEL_COMPILER) || !defined(_WIN32)) +#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_ALIGNED(LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF), LIBXSMM_CONFIG_CACHELINE) +#else +#define LIBXSMM_APIVAR_ALIGNED(DECL, VISIBILITY) LIBXSMM_APIVAR(DECL, VISIBILITY, LIBXSMM_API_DEF) +#endif + +/** Public variable declaration (without definition) located in header file. */ +#define LIBXSMM_APIVAR_PUBLIC(DECL) LIBXSMM_APIVAR(DECL, EXPORT, LIBXSMM_API_EXTERN) +/** Public variable definition (complements declaration) located in source file. */ +#define LIBXSMM_APIVAR_PUBLIC_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, EXPORT) +/** Private variable declaration (without definition) located in header file. 
*/ +#define LIBXSMM_APIVAR_PRIVATE(DECL) LIBXSMM_APIVAR(DECL, INTERN, LIBXSMM_API_EXTERN) +/** Private variable definition (complements declaration) located in source file. */ +#define LIBXSMM_APIVAR_PRIVATE_DEF(DECL) LIBXSMM_APIVAR_ALIGNED(DECL, INTERN) +/** Private variable (declaration and definition) located in source file. */ +#define LIBXSMM_APIVAR_DEFINE(DECL) LIBXSMM_APIVAR_PRIVATE(DECL); LIBXSMM_APIVAR_PRIVATE_DEF(DECL) +/** Function decoration used for private functions. */ +#define LIBXSMM_API_INTERN LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(INTERN) +/** Function decoration used for public functions of LIBXSMMext library. */ +#define LIBXSMM_APIEXT LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(IMPORT) +/** Function decoration used for public functions of LIBXSMM library. */ +#define LIBXSMM_API LIBXSMM_API_EXTERN LIBXSMM_API_TARGET LIBXSMM_API_VISIBILITY(EXPORT) + +#if !defined(LIBXSMM_RESTRICT) +# if ((defined(__GNUC__) && !defined(__CYGWIN32__)) || defined(LIBXSMM_INTEL_COMPILER)) && !defined(_WIN32) +# define LIBXSMM_RESTRICT __restrict__ +# elif defined(_MSC_VER) || defined(LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_RESTRICT __restrict +# else +# define LIBXSMM_RESTRICT +# endif +#endif /*LIBXSMM_RESTRICT*/ + +#if !defined(LIBXSMM_PRAGMA) +# if defined(LIBXSMM_INTEL_COMPILER) || defined(_MSC_VER) +# define LIBXSMM_PRAGMA(DIRECTIVE) __pragma(LIBXSMM_EXPAND(DIRECTIVE)) +# else +# define LIBXSMM_PRAGMA(DIRECTIVE) +# endif +#endif /*LIBXSMM_PRAGMA*/ + +#if !defined(LIBXSMM_OPENMP_SIMD) && (defined(_OPENMP) && (201307 <= _OPENMP/*v4.0*/)) +# if defined(LIBXSMM_INTEL_COMPILER) +# if (1500 <= LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_OPENMP_SIMD +# endif +# elif defined(__GNUC__) +# if LIBXSMM_VERSION2(4, 9) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) +# define LIBXSMM_OPENMP_SIMD +# endif +# else +# define LIBXSMM_OPENMP_SIMD +# endif +#endif + +#if !defined(LIBXSMM_INTEL_COMPILER) || (LIBXSMM_INTEL_COMPILER < 9900) +# if 
defined(LIBXSMM_OPENMP_SIMD) +# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(omp simd reduction(EXPRESSION)) +# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(omp simd collapse(N)) +# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(omp simd private(__VA_ARGS__)) +# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(omp simd) +# elif defined(__INTEL_COMPILER) +# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) LIBXSMM_PRAGMA(simd reduction(EXPRESSION)) +# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) LIBXSMM_PRAGMA(simd collapse(N)) +# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) LIBXSMM_PRAGMA(simd private(__VA_ARGS__)) +# define LIBXSMM_PRAGMA_SIMD LIBXSMM_PRAGMA(simd) +# endif +#endif +#if !defined(LIBXSMM_PRAGMA_SIMD) +# define LIBXSMM_PRAGMA_SIMD_REDUCTION(EXPRESSION) +# define LIBXSMM_PRAGMA_SIMD_COLLAPSE(N) +# define LIBXSMM_PRAGMA_SIMD_PRIVATE(...) +# define LIBXSMM_PRAGMA_SIMD +#endif + +#if defined(__INTEL_COMPILER) +# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(vector nontemporal(__VA_ARGS__)) +# define LIBXSMM_PRAGMA_VALIGNED LIBXSMM_PRAGMA(vector aligned) +# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(novector) +# define LIBXSMM_PRAGMA_FORCEINLINE LIBXSMM_PRAGMA(forceinline) +# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(loop_count min=MIN max=MAX avg=AVG) +# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll_and_jam(N)) +# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N)) +# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA(unroll) +# define LIBXSMM_PRAGMA_VALIGNED_VAR(A) LIBXSMM_ASSUME_ALIGNED(A, LIBXSMM_ALIGNMENT); +/*# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE))*/ +#else +# if defined(LIBXSMM_OPENMP_SIMD) && (201811 <= _OPENMP/*v5.0*/) +# define LIBXSMM_PRAGMA_NONTEMPORAL(...) LIBXSMM_PRAGMA(omp simd nontemporal(__VA_ARGS__)) +# else +# define LIBXSMM_PRAGMA_NONTEMPORAL(...) 
+# endif +# if defined(__clang__) +# define LIBXSMM_PRAGMA_VALIGNED_VAR(A) +# define LIBXSMM_PRAGMA_VALIGNED +# define LIBXSMM_PRAGMA_NOVECTOR LIBXSMM_PRAGMA(clang loop vectorize(disable)) +# define LIBXSMM_PRAGMA_FORCEINLINE +# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) LIBXSMM_PRAGMA(unroll(AVG)) +# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) LIBXSMM_PRAGMA(unroll(N)) +# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(unroll(N)) +# define LIBXSMM_PRAGMA_UNROLL LIBXSMM_PRAGMA_UNROLL_N(4) +# else +# define LIBXSMM_PRAGMA_VALIGNED_VAR(A) +# define LIBXSMM_PRAGMA_VALIGNED +# define LIBXSMM_PRAGMA_NOVECTOR +# define LIBXSMM_PRAGMA_FORCEINLINE +# define LIBXSMM_PRAGMA_LOOP_COUNT(MIN, MAX, AVG) +# define LIBXSMM_PRAGMA_UNROLL_AND_JAM(N) +# define LIBXSMM_PRAGMA_UNROLL +# endif +#endif +#if !defined(LIBXSMM_PRAGMA_UNROLL_N) +# if defined(__GNUC__) && (LIBXSMM_VERSION2(8, 3) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) +# define LIBXSMM_PRAGMA_UNROLL_N(N) LIBXSMM_PRAGMA(GCC unroll N) +# else +# define LIBXSMM_PRAGMA_UNROLL_N(N) +# endif +#endif + +#if defined(LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(optimize("", off)) +# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(optimize("", on)) +#elif defined(__clang__) +# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(clang optimize off) +# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(clang optimize on) +#elif defined(__GNUC__) +# define LIBXSMM_PRAGMA_OPTIMIZE_OFF LIBXSMM_PRAGMA(GCC push_options) LIBXSMM_PRAGMA(GCC optimize("O0")) +# define LIBXSMM_PRAGMA_OPTIMIZE_ON LIBXSMM_PRAGMA(GCC pop_options) +#else +# define LIBXSMM_PRAGMA_OPTIMIZE_OFF +# define LIBXSMM_PRAGMA_OPTIMIZE_ON +#endif + +#if defined(_OPENMP) && (200805 <= _OPENMP/*v3.0*/) \ + && defined(NDEBUG) /* CCE complains for debug builds */ +# define LIBXSMM_OPENMP_COLLAPSE(N) collapse(N) +#else +# define LIBXSMM_OPENMP_COLLAPSE(N) +#endif + +/** LIBXSMM_UP2POT rounds up to the next power of two (POT). 
*/ +#define LIBXSMM_UP2POT_01(N) ((N) | ((N) >> 1)) +#define LIBXSMM_UP2POT_02(N) (LIBXSMM_UP2POT_01(N) | (LIBXSMM_UP2POT_01(N) >> 2)) +#define LIBXSMM_UP2POT_04(N) (LIBXSMM_UP2POT_02(N) | (LIBXSMM_UP2POT_02(N) >> 4)) +#define LIBXSMM_UP2POT_08(N) (LIBXSMM_UP2POT_04(N) | (LIBXSMM_UP2POT_04(N) >> 8)) +#define LIBXSMM_UP2POT_16(N) (LIBXSMM_UP2POT_08(N) | (LIBXSMM_UP2POT_08(N) >> 16)) +#define LIBXSMM_UP2POT_32(N) (LIBXSMM_UP2POT_16(N) | (LIBXSMM_UP2POT_16(N) >> 32)) +#define LIBXSMM_UP2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) - LIBXSMM_MIN(1, N)) + LIBXSMM_MIN(1, N)) +#define LIBXSMM_LO2POT(N) (LIBXSMM_UP2POT_32((unsigned long long)(N) >> 1) + LIBXSMM_MIN(1, N)) + +#define LIBXSMM_UPDIV(N, MULT) (((N) + ((MULT) - 1)) / (MULT)) +#define LIBXSMM_UP(N, MULT) (LIBXSMM_UPDIV(N, MULT) * (MULT)) +#define LIBXSMM_UP2(N, NPOT) (((N) + ((NPOT) - 1)) & ~((NPOT) - 1)) +#define LIBXSMM_ABS(A) (0 <= (A) ? (A) : -(A)) +#define LIBXSMM_MIN(A, B) ((A) < (B) ? (A) : (B)) +#define LIBXSMM_MAX(A, B) ((A) < (B) ? (B) : (A)) +#define LIBXSMM_MOD(A, N) ((A) % (N)) +#define LIBXSMM_MOD2(A, NPOT) ((A) & ((NPOT) - 1)) +#define LIBXSMM_DELTA(T0, T1) ((T0) < (T1) ? ((T1) - (T0)) : ((T0) - (T1))) +#define LIBXSMM_CLMP(VALUE, LO, HI) ((LO) < (VALUE) ? ((VALUE) <= (HI) ? (VALUE) : LIBXSMM_MIN(VALUE, HI)) : LIBXSMM_MAX(LO, VALUE)) +#define LIBXSMM_SIZEOF(START, LAST) (((const char*)(LAST)) - ((const char*)(START)) + sizeof(*(LAST))) /* Bytes from START up to and including the element LAST points to */ +#define LIBXSMM_FEQ(A, B) ((A) == (B)) +#define LIBXSMM_NEQ(A, B) ((A) != (B)) +#define LIBXSMM_ISPOT(A) (0 != (A) && !((A) & ((A) - 1))) +#define LIBXSMM_ISWAP(A, B) (((A) ^= (B)), ((B) ^= (A)), ((A) ^= (B))) +#define LIBXSMM_ISNAN(A) LIBXSMM_NEQ(A, A) +#define LIBXSMM_NOTNAN(A) LIBXSMM_FEQ(A, A) +#define LIBXSMM_ROUNDX(TYPE, A) ((TYPE)((long long)(0 <= (A) ? ((double)(A) + 0.5) : ((double)(A) - 0.5)))) +#define LIBXSMM_CONST_VOID_PTR(A) (*((const void**)&(A))) /* lvalue view of pointer variable A as const void* */ + +/** Makes some functions available independent of C99 support.
*/ +#if defined(__STDC_VERSION__) && (199901L/*C99*/ <= __STDC_VERSION__) +# if defined(__PGI) +# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B))) +# else +# define LIBXSMM_POWF(A, B) powf(A, B) +# endif +# define LIBXSMM_FREXPF(A, B) frexpf(A, B) +# define LIBXSMM_ROUNDF(A) roundf(A) +# define LIBXSMM_ROUND(A) round(A) +# define LIBXSMM_TANHF(A) tanhf(A) +# define LIBXSMM_SQRTF(A) sqrtf(A) +# define LIBXSMM_EXP2F(A) exp2f(A) +# define LIBXSMM_LOG2F(A) log2f(A) +# define LIBXSMM_ERFF(A) erff(A) +# define LIBXSMM_EXP2(A) exp2(A) +# define LIBXSMM_LOG2(A) log2(A) +# define LIBXSMM_EXPF(A) expf(A) +# define LIBXSMM_LOGF(A) logf(A) +#else +# define LIBXSMM_POWF(A, B) ((float)pow((float)(A), (float)(B))) +# define LIBXSMM_FREXPF(A, B) ((float)frexp((float)(A), B)) +# define LIBXSMM_ROUNDF(A) LIBXSMM_ROUNDX(float, A) +# define LIBXSMM_ROUND(A) LIBXSMM_ROUNDX(double, A) +# define LIBXSMM_TANHF(A) ((float)tanh((float)(A))) +# define LIBXSMM_SQRTF(A) ((float)sqrt((float)(A))) +# define LIBXSMM_EXP2F(A) LIBXSMM_POWF(2, A) +# define LIBXSMM_LOG2F(A) ((float)LIBXSMM_LOG2((float)(A))) +# define LIBXSMM_ERFF(A) ((float)erf((float)(A))) +# define LIBXSMM_EXP2(A) pow(2.0, A) +# define LIBXSMM_LOG2(A) (log(A) * (1.0 / (M_LN2))) +# define LIBXSMM_EXPF(A) ((float)exp((float)(A))) +# define LIBXSMM_LOGF(A) ((float)log((float)(A))) +#endif + +#if defined(LIBXSMM_INTEL_COMPILER) +# if (1700 <= LIBXSMM_INTEL_COMPILER) +# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION) +# else +# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION) +# endif +#elif defined(_MSC_VER) +# define LIBXSMM_ASSUME(EXPRESSION) __assume(EXPRESSION) +#elif defined(__GNUC__) && !defined(_CRAYC) && (LIBXSMM_VERSION2(4, 5) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)) +# define LIBXSMM_ASSUME(EXPRESSION) do { if (!(EXPRESSION)) __builtin_unreachable(); } while(0) +#else +# define LIBXSMM_ASSUME(EXPRESSION) assert(EXPRESSION) +#endif + +#if defined(__INTEL_COMPILER) +# define 
LIBXSMM_ASSUME_ALIGNED(A, N) __assume_aligned(A, N) +#else +# define LIBXSMM_ASSUME_ALIGNED(A, N) assert(0 == ((uintptr_t)(A)) % (N)) +#endif +#define LIBXSMM_ALIGN(POINTER, ALIGNMENT/*POT*/) ((POINTER) + (LIBXSMM_UP2((uintptr_t)(POINTER), ALIGNMENT) - ((uintptr_t)(POINTER))) / sizeof(*(POINTER))) +#define LIBXSMM_FOLD2(POINTER, ALIGNMENT, NPOT) LIBXSMM_MOD2(((uintptr_t)(POINTER) / (ALIGNMENT)), NPOT) + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(LIBXSMM_INTEL_COMPILER) /* account for incorrect handling of __VA_ARGS__ */ +# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)LIBXSMM_EXPAND((__VA_ARGS__)) +#else +# define LIBXSMM_SELECT_ELEMENT(INDEX1/*one-based*/, .../*elements*/) LIBXSMM_CONCATENATE(LIBXSMM_SELECT_ELEMENT_, INDEX1)(__VA_ARGS__) +#endif +#define LIBXSMM_SELECT_ELEMENT_1(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E0 +#define LIBXSMM_SELECT_ELEMENT_2(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E1 +#define LIBXSMM_SELECT_ELEMENT_3(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E2 +#define LIBXSMM_SELECT_ELEMENT_4(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E3 +#define LIBXSMM_SELECT_ELEMENT_5(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E4 +#define LIBXSMM_SELECT_ELEMENT_6(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E5 +#define LIBXSMM_SELECT_ELEMENT_7(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E6 +#define LIBXSMM_SELECT_ELEMENT_8(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E7 +#define LIBXSMM_SELECT_ELEMENT_9(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E8 +#define LIBXSMM_SELECT_ELEMENT_10(E0, E1, E2, E3, E4, E5, E6, E7, E8, E9) E9 +#define LIBXSMM_SELECT_HEAD_AUX(A, ...) (A) +#define LIBXSMM_SELECT_HEAD(...) LIBXSMM_EXPAND(LIBXSMM_SELECT_HEAD_AUX(__VA_ARGS__, 0/*dummy*/)) +#define LIBXSMM_SELECT_TAIL(A, ...) __VA_ARGS__ + +/** + * For VLAs, check EXACTLY for C99 since a C11-conforming compiler may not provide VLAs. + * However, some compilers (Intel) may signal support for VLA even with strict ANSI (C89). 
+ * To ultimately disable VLA-support, define LIBXSMM_NO_VLA (make VLA=0). + * VLA-support is signaled by LIBXSMM_VLA. + */ +#if !defined(LIBXSMM_VLA) && !defined(LIBXSMM_NO_VLA) && !defined(__PGI) && ( \ + (defined(__STDC_VERSION__) && (199901L/*C99*/ == __STDC_VERSION__ || (!defined(__STDC_NO_VLA__) && 199901L/*C99*/ < __STDC_VERSION__))) || \ + (defined(__GNUC__) && LIBXSMM_VERSION2(5, 0) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && !defined(__STRICT_ANSI__) && !defined(__cplusplus)) || \ + (defined(LIBXSMM_INTEL_COMPILER) && !defined(_WIN32) && !defined(__cplusplus)) || \ + (defined(__INTEL_COMPILER) && !defined(_WIN32))) +# define LIBXSMM_VLA +#endif + +/** + * LIBXSMM_INDEX1 calculates the linear address for a given set of (multiple) indexes/bounds. + * Syntax: LIBXSMM_INDEX1(ndims, i0, ..., i(ndims-1), s1, ..., s(ndims-1)). + * Please note that the leading dimension (s0) is omitted in the above syntax! + * TODO: support leading dimension (pitch/stride). + */ +#if defined(_MSC_VER) && !defined(__clang__) /* account for incorrect handling of __VA_ARGS__ */ +# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)LIBXSMM_EXPAND((__VA_ARGS__)) +#else +# define LIBXSMM_INDEX1(NDIMS, ...) LIBXSMM_CONCATENATE(LIBXSMM_INDEX1_, NDIMS)(__VA_ARGS__) +#endif +#define LIBXSMM_INDEX1_1(...)
((size_t)LIBXSMM_SELECT_HEAD(__VA_ARGS__)) +#define LIBXSMM_INDEX1_2(I0, I1, S1) (LIBXSMM_INDEX1_1(I0) * ((size_t)S1) + (size_t)I1) +#define LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) (LIBXSMM_INDEX1_2(I0, I1, S1) * ((size_t)S2) + (size_t)I2) +#define LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) (LIBXSMM_INDEX1_3(I0, I1, I2, S1, S2) * ((size_t)S3) + (size_t)I3) +#define LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) (LIBXSMM_INDEX1_4(I0, I1, I2, I3, S1, S2, S3) * ((size_t)S4) + (size_t)I4) +#define LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) (LIBXSMM_INDEX1_5(I0, I1, I2, I3, I4, S1, S2, S3, S4) * ((size_t)S5) + (size_t)I5) +#define LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) (LIBXSMM_INDEX1_6(I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, S5) * ((size_t)S6) + (size_t)I6) +#define LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) (LIBXSMM_INDEX1_7(I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, S6) * ((size_t)S7) + (size_t)I7) +#define LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) (LIBXSMM_INDEX1_8(I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, S7) * ((size_t)S8) + (size_t)I8) +#define LIBXSMM_INDEX1_10(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, S9) (LIBXSMM_INDEX1_9(I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, S8) * ((size_t)S9) + (size_t)I9) + +/** + * LIBXSMM_VLA_DECL declares an array according to the given set of (multiple) bounds. + * Syntax: LIBXSMM_VLA_DECL(ndims, elem-type, var-name, init, s1, ..., s(ndims-1)). + * The element type can be "const" or otherwise qualified; initial value must be (const)element-type*. + * Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted! + * + * LIBXSMM_VLA_ACCESS gives the array element according to the given set of (multiple) indexes/bounds. + * Syntax: LIBXSMM_VLA_ACCESS(ndims, array, i0, ..., i(ndims-1), s1, ..., s(ndims-1)).
+ * Please note that the syntax is similar to LIBXSMM_INDEX1, and the leading dimension (s0) is omitted! + */ +#if !defined(LIBXSMM_VLA_POSTFIX) +# define LIBXSMM_VLA_POSTFIX _ +#endif +#if defined(LIBXSMM_VLA) +LIBXSMM_API_INLINE int libxsmm_nonconst_int(int i) { return i; } +# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_VLA_ACCESS_ND(NDIMS, LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX), LIBXSMM_VLA_ACCESS_SINK, __VA_ARGS__) +# define LIBXSMM_VLA_ACCESS_SINK(S) + 0 * (S) +# define LIBXSMM_VLA_ACCESS_NONCONST(I) libxsmm_nonconst_int(I) +# define LIBXSMM_VLA_ACCESS_ND(NDIMS, ARRAY, XY, ...) LIBXSMM_CONCATENATE3(LIBXSMM_VLA_ACCESS_, NDIMS, D)(ARRAY, XY, __VA_ARGS__) +# define LIBXSMM_VLA_ACCESS_0D(ARRAY, XY, ...) (ARRAY)/*scalar*/ +# define LIBXSMM_VLA_ACCESS_1D(ARRAY, XY, ...) ((ARRAY)[LIBXSMM_VLA_ACCESS_NONCONST(LIBXSMM_SELECT_HEAD(__VA_ARGS__))]) +# define LIBXSMM_VLA_ACCESS_2D(ARRAY, XY, I0, I1, ...) (((ARRAY) XY(__VA_ARGS__))[I0][LIBXSMM_VLA_ACCESS_NONCONST(I1)]) +# define LIBXSMM_VLA_ACCESS_3D(ARRAY, XY, I0, I1, I2, S1, ...) (((ARRAY) XY(S1) XY(__VA_ARGS__))[I0][I1][LIBXSMM_VLA_ACCESS_NONCONST(I2)]) +# define LIBXSMM_VLA_ACCESS_4D(ARRAY, XY, I0, I1, I2, I3, S1, S2, ...) (((ARRAY) XY(S1) XY(S2) XY(__VA_ARGS__))[I0][I1][I2][LIBXSMM_VLA_ACCESS_NONCONST(I3)]) +# define LIBXSMM_VLA_ACCESS_5D(ARRAY, XY, I0, I1, I2, I3, I4, S1, S2, S3, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(__VA_ARGS__))[I0][I1][I2][I3][LIBXSMM_VLA_ACCESS_NONCONST(I4)]) +# define LIBXSMM_VLA_ACCESS_6D(ARRAY, XY, I0, I1, I2, I3, I4, I5, S1, S2, S3, S4, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][LIBXSMM_VLA_ACCESS_NONCONST(I5)]) +# define LIBXSMM_VLA_ACCESS_7D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, S1, S2, S3, S4, S5, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][LIBXSMM_VLA_ACCESS_NONCONST(I6)]) +# define LIBXSMM_VLA_ACCESS_8D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, S1, S2, S3, S4, S5, S6, ...) 
(((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][LIBXSMM_VLA_ACCESS_NONCONST(I7)]) +# define LIBXSMM_VLA_ACCESS_9D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, S1, S2, S3, S4, S5, S6, S7, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][LIBXSMM_VLA_ACCESS_NONCONST(I8)]) +# define LIBXSMM_VLA_ACCESS_10D(ARRAY, XY, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, S1, S2, S3, S4, S5, S6, S7, S8, ...) (((ARRAY) XY(S1) XY(S2) XY(S3) XY(S4) XY(S5) XY(S6) XY(S7) XY(S8) XY(__VA_ARGS__))[I0][I1][I2][I3][I4][I5][I6][I7][I8][LIBXSMM_VLA_ACCESS_NONCONST(I9)]) +# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \ + ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX), \ + LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/) = \ + (ELEMENT_TYPE LIBXSMM_VLA_ACCESS_ND(LIBXSMM_SELECT_ELEMENT(NDIMS, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), *, \ + LIBXSMM_ELIDE, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*bounds*/, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0)/*dummy*/))LIBXSMM_SELECT_HEAD(__VA_ARGS__) +#else /* calculate linear index */ +# define LIBXSMM_VLA_ACCESS(NDIMS, ARRAY, ...) LIBXSMM_CONCATENATE(ARRAY, LIBXSMM_VLA_POSTFIX)[LIBXSMM_INDEX1(NDIMS, __VA_ARGS__)] +# define LIBXSMM_VLA_DECL(NDIMS, ELEMENT_TYPE, ARRAY_VAR, .../*initial value, and bounds*/) \ + ELEMENT_TYPE *LIBXSMM_RESTRICT LIBXSMM_CONCATENATE(ARRAY_VAR, LIBXSMM_VLA_POSTFIX) = /*(ELEMENT_TYPE*)*/LIBXSMM_SELECT_HEAD(__VA_ARGS__) \ + + 0 * LIBXSMM_INDEX1(NDIMS, LIBXSMM_SELECT_TAIL(__VA_ARGS__, LIBXSMM_SELECT_TAIL(__VA_ARGS__, 0))) /* dummy-shift to "sink" unused arguments */ +#endif + +/** Access an array of TYPE with Byte-measured stride. 
*/ +#define LIBXSMM_ACCESS(TYPE, ARRAY, STRIDE) (*(TYPE*)((char*)(ARRAY) + (STRIDE))) + +#if !defined(LIBXSMM_UNUSED) +# if 0 +# define LIBXSMM_UNUSED(VARIABLE) LIBXSMM_PRAGMA(unused(VARIABLE)) +# else +# define LIBXSMM_UNUSED(VARIABLE) (void)(VARIABLE) +# endif +#endif +#if !defined(NDEBUG) +# define LIBXSMM_UNUSED_DEBUG(VARIABLE) LIBXSMM_UNUSED(VARIABLE) +#else +# define LIBXSMM_UNUSED_DEBUG(VARIABLE) +#endif + +#if defined(_OPENMP) +# define LIBXSMM_PRAGMA_OMP(...) LIBXSMM_PRAGMA(omp __VA_ARGS__) +# if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# define LIBXSMM_OMP_VAR(A) LIBXSMM_UNUSED(A) /* suppress warning about "unused" variable */ +# elif defined(__clang__) +# define LIBXSMM_OMP_VAR(A) (A) = 0 +# else +# define LIBXSMM_OMP_VAR(A) +# endif +#else +# define LIBXSMM_PRAGMA_OMP(...) +# define LIBXSMM_OMP_VAR(A) +#endif + +#if defined(LIBXSMM_BUILD) && (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) && !defined(__MINGW32__) +# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT LIBXSMM_ATTRIBUTE(weak_import) +# define LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE(weak) +#else +# define LIBXSMM_ATTRIBUTE_WEAK +# define LIBXSMM_ATTRIBUTE_WEAK_IMPORT +#endif + +#if !defined(LIBXSMM_NO_CTOR) && !defined(LIBXSMM_CTOR) && \ + (defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__)) && \ + (defined(LIBXSMM_BUILD) && !defined(__STATIC)) && \ + (defined(__GNUC__) || defined(__clang__)) +# define LIBXSMM_ATTRIBUTE_CTOR LIBXSMM_ATTRIBUTE(constructor) +# define LIBXSMM_ATTRIBUTE_DTOR LIBXSMM_ATTRIBUTE(destructor) +# define LIBXSMM_CTOR +#else +# define LIBXSMM_ATTRIBUTE_CTOR +# define LIBXSMM_ATTRIBUTE_DTOR +#endif + +#if defined(__GNUC__) && !defined(__PGI) && !defined(__ibmxl__) +# define LIBXSMM_ATTRIBUTE_NO_TRACE LIBXSMM_ATTRIBUTE(no_instrument_function) +#else +# define LIBXSMM_ATTRIBUTE_NO_TRACE +#endif + +#if defined(__GNUC__) +# define LIBXSMM_MAY_ALIAS LIBXSMM_ATTRIBUTE(__may_alias__) +#else +# define LIBXSMM_MAY_ALIAS +#endif + +#if 
!defined(LIBXSMM_MKTEMP_PATTERN) +# define LIBXSMM_MKTEMP_PATTERN "XXXXXX" +#endif + +/** Below group is to fix-up some platform/compiler specifics. */ +#if defined(_WIN32) +# if !defined(_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES) +# define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1 +# endif +# if !defined(_CRT_SECURE_NO_DEPRECATE) +# define _CRT_SECURE_NO_DEPRECATE 1 +# endif +# if !defined(_USE_MATH_DEFINES) +# define _USE_MATH_DEFINES 1 +# endif +# if !defined(WIN32_LEAN_AND_MEAN) +# define WIN32_LEAN_AND_MEAN 1 +# endif +# if !defined(NOMINMAX) +# define NOMINMAX 1 +# endif +# if defined(__INTEL_COMPILER) && (190023506 <= _MSC_FULL_VER) +# define __builtin_huge_val() HUGE_VAL +# define __builtin_huge_valf() HUGE_VALF +# define __builtin_nan nan +# define __builtin_nanf nanf +# define __builtin_nans nan +# define __builtin_nansf nanf +# if defined(__cplusplus) +# define _CMATH_ +# endif +# endif +#endif +#if !defined(_GNU_SOURCE) && defined(LIBXSMM_BUILD) +# define _GNU_SOURCE +#endif +#if !defined(__STDC_FORMAT_MACROS) +# define __STDC_FORMAT_MACROS +#endif +#if defined(__clang__) && !defined(__extern_always_inline) +# define __extern_always_inline LIBXSMM_INLINE +#endif +#if defined(LIBXSMM_INLINE_FIXUP) && !defined(inline) +# define inline LIBXSMM_INLINE_KEYWORD +#endif + +#if (0 != LIBXSMM_SYNC) +# if !defined(_REENTRANT) +# define _REENTRANT +# endif +# if defined(__PGI) +# if defined(__GCC_ATOMIC_TEST_AND_SET_TRUEVAL) +# undef __GCC_ATOMIC_TEST_AND_SET_TRUEVAL +# endif +# define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 +# endif +#endif + +#if !defined(__has_feature) && !defined(__clang__) +# define __has_feature(A) 0 +#endif +#if !defined(__has_builtin) && !defined(__clang__) +# define __has_builtin(A) 0 +#endif + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif + +#if (0 != LIBXSMM_SYNC) +# if defined(_WIN32) || defined(__CYGWIN__) +# include +# else +# include +# endif +#endif +#if 
!defined(LIBXSMM_ASSERT) +# include +# if defined(NDEBUG) +# define LIBXSMM_ASSERT(EXPR) LIBXSMM_ASSUME(EXPR) +# else +# define LIBXSMM_ASSERT(EXPR) assert(EXPR) +# endif +#endif +#if !defined(LIBXSMM_ASSERT_MSG) +# define LIBXSMM_ASSERT_MSG(EXPR, MSG) assert((EXPR) && *MSG) +#endif +#if !defined(LIBXSMM_EXPECT_ELIDE) +# define LIBXSMM_EXPECT_ELIDE(RESULT, EXPR) do { \ + /*const*/ int libxsmm_expect_result_ = ((RESULT) == (EXPR)); \ + LIBXSMM_UNUSED(libxsmm_expect_result_); \ + } while(0) +#endif +#if defined(NDEBUG) +# define LIBXSMM_EXPECT LIBXSMM_EXPECT_ELIDE +# define LIBXSMM_EXPECT_NOT LIBXSMM_EXPECT_ELIDE +#else +# define LIBXSMM_EXPECT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) == (EXPR)) +# define LIBXSMM_EXPECT_NOT(RESULT, EXPR) LIBXSMM_ASSERT((RESULT) != (EXPR)) +#endif +#if defined(_DEBUG) +# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT +#else +# define LIBXSMM_EXPECT_DEBUG LIBXSMM_EXPECT_ELIDE +#endif +#if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP) +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(FLT_MAX) +# if !defined(__FLT_MAX__) +# define FLT_MAX 3.40282346638528859811704183484516925e+38F +# else +# define FLT_MAX __FLT_MAX__ +# endif +#endif +#if !defined(FLT_MIN) +# if !defined(__FLT_MIN__) +# define FLT_MIN 1.17549435082228750796873653722224568e-38F +# else +# define FLT_MIN __FLT_MIN__ +# endif +#endif +#if defined(_WIN32) && 0 +# define LIBXSMM_SNPRINTF(S, N, ...) _snprintf_s(S, N, _TRUNCATE, __VA_ARGS__) +#elif defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__ || defined(__GNUC__)) +# define LIBXSMM_SNPRINTF(S, N, ...) snprintf(S, N, __VA_ARGS__) +#else +# define LIBXSMM_SNPRINTF(S, N, ...) 
sprintf((S) + /*unused*/(N) * 0, __VA_ARGS__) +#endif + +#if defined(__THROW) && defined(__cplusplus) +# define LIBXSMM_THROW __THROW +#endif +#if !defined(LIBXSMM_THROW) +# define LIBXSMM_THROW +#endif +#if defined(__GNUC__) && LIBXSMM_VERSION2(4, 2) == LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) && \ + !defined(__clang__) && !defined(__PGI) && !defined(__INTEL_COMPILER) && !defined(_CRAYC) +# define LIBXSMM_NOTHROW LIBXSMM_THROW +#else +# define LIBXSMM_NOTHROW +#endif +#if defined(__cplusplus) +# if (__cplusplus > 199711L) +# define LIBXSMM_NOEXCEPT noexcept +# else +# define LIBXSMM_NOEXCEPT throw() +# endif +#else +# define LIBXSMM_NOEXCEPT LIBXSMM_NOTHROW +#endif + +#if defined(_WIN32) +# define LIBXSMM_PUTENV(A) _putenv(A) +#else +# define LIBXSMM_PUTENV(A) putenv(A) +#endif + +/* block must be after including above header files */ +#if (defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__) < LIBXSMM_VERSION2(2, 26)) \ + || (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER) && !defined(__cplusplus) && defined(__linux__)) +/* _Float128 was introduced with GNU GCC 7.0. 
*/ +# if !defined(_Float128) && !defined(__SIZEOF_FLOAT128__) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) +# define _Float128 __float128 +# endif +# if !defined(LIBXSMM_GLIBC_FPTYPES) && defined(__GNUC__) && !defined(__cplusplus) && defined(__linux__) \ + && (LIBXSMM_VERSION2(7, 0) > LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) || \ + (defined(LIBXSMM_INTEL_COMPILER) && (1802 >= LIBXSMM_INTEL_COMPILER))) +# define LIBXSMM_GLIBC_FPTYPES +# endif +# if !defined(_Float128X) && defined(LIBXSMM_GLIBC_FPTYPES) +# define _Float128X _Float128 +# endif +# if !defined(_Float32) && defined(LIBXSMM_GLIBC_FPTYPES) +# define _Float32 float +# endif +# if !defined(_Float32x) && defined(LIBXSMM_GLIBC_FPTYPES) +# define _Float32x _Float32 +# endif +# if !defined(_Float64) && defined(LIBXSMM_GLIBC_FPTYPES) +# define _Float64 double +# endif +# if !defined(_Float64x) && defined(LIBXSMM_GLIBC_FPTYPES) +# define _Float64x _Float64 +# endif +#endif + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if defined(LIBXSMM_GLIBC_FPTYPES) +# if defined(__cplusplus) +# undef __USE_MISC +# if !defined(_DEFAULT_SOURCE) +# define _DEFAULT_SOURCE +# endif +# if !defined(_BSD_SOURCE) +# define _BSD_SOURCE +# endif +# else +# if !defined(__PURE_INTEL_C99_HEADERS__) +# define __PURE_INTEL_C99_HEADERS__ +# endif +# endif +#endif +#if !defined(LIBXSMM_NO_LIBM) +# if (defined(LIBXSMM_INTEL_COMPILER) && (1800 <= LIBXSMM_INTEL_COMPILER)) \ + && !defined(_WIN32) /* error including dfp754.h */ +# include +# endif +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#endif /*LIBXSMM_MACROS_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_malloc.h b/third_party/libxsmm/include/libxsmm_malloc.h new file mode 100644 index 00000000..3f978fea --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_malloc.h @@ -0,0 +1,397 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MALLOC_H +#define LIBXSMM_MALLOC_H + +#include "libxsmm_memory.h" + +/* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */ +#if !defined(LIBXSMM_TF12) && (!defined(TF_VERSION_STRING) || \ + LIBXSMM_VERSION2(1, 12) <= LIBXSMM_VERSION2(TF_MAJOR_VERSION, TF_MINOR_VERSION)) +# define LIBXSMM_TF12 /* TF_PATCH_VERSION does not matter */ +#endif + +/** Can be used with libxsmm_[get|set]_scratch_limit. */ +#define LIBXSMM_SCRATCH_UNLIMITED ((size_t)LIBXSMM_UNLIMITED) +#define LIBXSMM_SCRATCH_DEFAULT 0 + + +/** Function types accepted for memory allocation (see libxsmm_*_allocator). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_ctx)(size_t /*size*/, const void* /*context*/); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_malloc_fun)(size_t /*size*/); +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_malloc_function { + libxsmm_malloc_ctx ctx_form; + libxsmm_malloc_fun function; +} libxsmm_malloc_function; + +/** Function types accepted for releasing memory (see libxsmm_*_allocator). 
*/ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_ctx)(void* /*buffer*/, const void* /*context*/); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_free_fun)(void* /*buffer*/); +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_free_function { + libxsmm_free_ctx ctx_form; + libxsmm_free_fun function; +} libxsmm_free_function; + +/** + * To setup the custom default memory allocator, either a malloc_fn and a free_fn + * are given, or two NULL-pointers designate to reset the default allocator to a + * library-internal default. If a context is given (non-NULL), the context-based + * form of the memory allocation is used. + * Changing the allocator including the function for deallocation applies to + * upcoming allocation/deallocation and works correctly for pending buffers. + */ +LIBXSMM_API int libxsmm_set_default_allocator(/* malloc_fn/free_fn must correspond */ + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); +/** Retrieve the default memory allocator. */ +LIBXSMM_API int libxsmm_get_default_allocator(const void** context, + libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); + +/** + * To setup the scratch memory allocator, a malloc_fn function and an optional free_fn + * are given. A NULL-free acts as a "no-operation", and the deallocation is expected + * to be controlled otherwise. If two NULL-pointers are given, the allocator is reset + * to the currently active default memory allocator. If a context is given (non-NULL), + * the context-based form of the memory allocation is used. + * Changing the allocator including the function for deallocation applies to + * upcoming allocation/deallocation and works correctly for pending buffers. + */ +LIBXSMM_API int libxsmm_set_scratch_allocator(/* malloc_fn/free_fn must correspond */ + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); +/** Retrieve the scratch memory allocator. 
*/ +LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context, + libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); + +/** Allocate memory (malloc/free interface). */ +LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size); + +/** Allocate aligned memory using the default allocator. */ +LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size, + /** + * =0: align automatically according to the size + * 0<: align according to the alignment value + */ + size_t alignment); + +/** Reallocate memory using the default allocator (alignment is preserved). */ +LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr); + +/** + * Allocate aligned scratch memory. It is not supported + * to query properties per libxsmm_get_malloc_info, but + * libxsmm_get_scratch_info can be used instead. + */ +LIBXSMM_API void* libxsmm_scratch_malloc(size_t size, + /** + * =0: align automatically according to the size + * 0<: align according to the alignment value + */ + size_t alignment, + /** + * Identifies the call site, which is used + * to determine the memory pool. + */ + const void* caller); + +/** + * Binary form of libxsmm_scratch_malloc, which + * expands the call-context automatically. This + * macro is intentionally lower case. + */ +#define libxsmm_aligned_scratch(size, alignment) \ + libxsmm_scratch_malloc(size, alignment, \ + LIBXSMM_CALLER_ID) + +/** Deallocate memory (malloc/free interface). */ +LIBXSMM_API void libxsmm_free(const void* memory); + +/** + * Release the entire scratch memory regardless + * of whether it is still referenced or not. + */ +LIBXSMM_API void libxsmm_release_scratch(void); + +/** Information about a buffer (default memory domain). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_malloc_info { + /** Size of the buffer. */ + size_t size; +} libxsmm_malloc_info; + +/** Retrieve information about a buffer (default memory domain).
*/ +LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info); + +/** Information about the scratch memory domain. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_scratch_info { + /** Watermark memory across pools (size), unsatisfied (local), and library-internal memory. */ + size_t size, local, internal; + /** Pending allocations (not released). */ + size_t npending; + /** Number of allocations so far. */ + size_t nmallocs; + /** Number of pools used. */ + unsigned int npools; +} libxsmm_scratch_info; + +/** Retrieve information about the scratch memory domain. */ +LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info); + +/** + * Limit the total size (Bytes) of the scratch memory. + * LIBXSMM_SCRATCH_UNLIMITED removes any limit, and + * LIBXSMM_SCRATCH_DEFAULT populates the default. + * The related environment variable LIBXSMM_SCRATCH_LIMIT + * allows units: /b/B (Bytes), k/K, m/M, and g/G. + */ +LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes); +/** Get the maximum size of the scratch memory domain. */ +LIBXSMM_API size_t libxsmm_get_scratch_limit(void); + +/** + * Intercepts malloc/free to use scratch memory allocator. + * (related environment variable LIBXSMM_MALLOC). + * Optionally set the range of malloc-sizes to be intercepted. + * The related environment variable LIBXSMM_MALLOC_LIMIT + * allows units: /b/B (Bytes), k/K, m/M, and g/G. + */ +LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi); +/** + * Determines if malloc/free are (and can be) intercepted. + * Optionally gets the range of enabled malloc-sizes. + */ +LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi); + +/** + * Calculate the linear offset of the n-dimensional (ndims) offset (can be NULL), + * and the (optional) linear size of the corresponding shape. 
+ */ +LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size); + + +#if defined(__cplusplus) + +/** RAII idiom to temporarily setup an allocator for the lifetime of the scope. */ +template class LIBXSMM_RETARGETABLE libxsmm_scoped_allocator { +public: + /** C'tor, which instantiates the new allocator (plain form). */ + libxsmm_scoped_allocator(libxsmm_malloc_fun malloc_fn, libxsmm_free_fun free_fn) { + kind::get(m_context, m_malloc, m_free); + kind::set(NULL/*context*/, NULL/*malloc_ctx*/, NULL/*free_ctx*/, malloc_fn, free_fn); + } + + /** C'tor, which instantiates the new allocator (context form). */ + libxsmm_scoped_allocator(const void* context, libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, + libxsmm_malloc_fun malloc_fun = NULL, libxsmm_free_fun free_fun = NULL) + { + kind::get(m_context, m_malloc, m_free); + kind::set(context, malloc_ctx, free_ctx, malloc_fun, free_fun); + } + + /** Following the RAII idiom, the d'tor restores the previous allocator. */ + ~libxsmm_scoped_allocator() { + kind::set(m_context, + m_malloc.ctx_form, m_free.ctx_form, + m_malloc.function, m_free.function); + } + +private: /* no copy/assignment */ + explicit libxsmm_scoped_allocator(const libxsmm_scoped_allocator&); + libxsmm_scoped_allocator& operator=(const libxsmm_scoped_allocator&); + +protected: /* saved/previous allocator */ + const void* m_context; + libxsmm_malloc_function m_malloc; + libxsmm_free_function m_free; +}; + +/** Allocator-kind to instantiate libxsmm_scoped_allocator. 
*/ +struct LIBXSMM_RETARGETABLE libxsmm_default_allocator { + static void set(const void* context, + libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, + libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun) + { + libxsmm_malloc_function malloc_fn; + libxsmm_free_function free_fn; + if (NULL == context) { /* use global form only when no context is given */ + malloc_fn.function = malloc_fun; free_fn.function = free_fun; + } + else { + malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx; + } + libxsmm_set_default_allocator(context, malloc_fn, free_fn); + } + static void get(const void*& context, + libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn) + { + libxsmm_get_default_allocator(&context, &malloc_fn, &free_fn); + } +}; + +/** Allocator-kind to instantiate libxsmm_scoped_allocator (installs/queries the scratch memory allocator). */ +struct LIBXSMM_RETARGETABLE libxsmm_scratch_allocator { + static void set(const void* context, + libxsmm_malloc_ctx malloc_ctx, libxsmm_free_ctx free_ctx, + libxsmm_malloc_fun malloc_fun, libxsmm_free_fun free_fun) + { + libxsmm_malloc_function malloc_fn; /* union: exactly one of ctx_form/function is populated below */ + libxsmm_free_function free_fn; + if (NULL != context) { /* adopt context form (same selection as libxsmm_default_allocator::set) */ + malloc_fn.ctx_form = malloc_ctx; free_fn.ctx_form = free_ctx; + } + else { /* adopt global form; storing the context-form pointers here would pass NULL for the plain-form c'tor */ + malloc_fn.function = malloc_fun; free_fn.function = free_fun; + } + libxsmm_set_scratch_allocator(context, malloc_fn, free_fn); + } + static void get(const void*& context, + libxsmm_malloc_function& malloc_fn, libxsmm_free_function& free_fn) + { + libxsmm_get_scratch_allocator(&context, &malloc_fn, &free_fn); + } +}; + +/** Forward-declared types/functions used to implement libxsmm_tf_allocator. */ +namespace tensorflow { + class Allocator; +#if defined(LIBXSMM_TF12) + class DeviceBase; int DeviceNumaNode(const DeviceBase* /*device*/); + Allocator* cpu_allocator(int /*numa_node*/); +#else + Allocator* cpu_allocator(); +#endif +} + +/** + * An object of this type adopts a memory allocator from TensorFlow.
+ * All memory allocations of the requested kind within the current + * scope (where the libxsmm_tf_allocator object lives) are subject + * to TensorFlow's memory allocation scheme. The allocation kind + * is usually "libxsmm_scratch_allocator"; using a second object + * of kind "libxsmm_default_allocator" makes the default memory + * allocation of LIBXSMM subject to TensorFlow as well. + */ +template class LIBXSMM_RETARGETABLE libxsmm_tf_allocator: + public libxsmm_scoped_allocator +{ +public: + /** The TensorFlow allocator is adopted from the global CPU memory allocator. */ + explicit libxsmm_tf_allocator() + : libxsmm_scoped_allocator( + libxsmm_tf_allocator::malloc, + libxsmm_tf_allocator::free) + {} + + /** The TensorFlow allocator is adopted from the given OpKernelContext. */ + template + explicit libxsmm_tf_allocator(context_type& context) + : libxsmm_scoped_allocator(&context, + libxsmm_tf_allocator::template malloc_ctx, + libxsmm_tf_allocator::template free_ctx, + libxsmm_tf_allocator::malloc, + libxsmm_tf_allocator::free) + {} + + /** Global form of allocating memory (malloc signature). */ + static void* malloc(size_t size) { +#if defined(LIBXSMM_TF12) + return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), size); +#else + return libxsmm_tf_allocator::allocate(tensorflow::cpu_allocator(), size); +#endif + } + + /** Global form of deallocating memory (free signature). */ + static void free(void* buffer) { +#if defined(LIBXSMM_TF12) + libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(-1/*kNUMANoAffinity*/), buffer); +#else + libxsmm_tf_allocator::deallocate(tensorflow::cpu_allocator(), buffer); +#endif + } + + /** Context based form of allocating memory. 
*/ + template static void* malloc_ctx(const void* context, size_t size) { + typedef typename context_type::WrappedAllocator::first_type allocator_ptr; + context_type *const tf_context = static_cast(context); + allocator_ptr allocator = NULL; + if (NULL != tf_context) { +#if !defined(LIBXSMM_TF12) + if (NULL != tf_context->device()) { + if (0 < tf_context->num_outputs()) { + allocator = tf_context->device()->GetStepAllocator( + tf_context->output_alloc_attr(0), + tf_context->resource_manager()); + } + else if (0 < tf_context->num_inputs()) { + allocator = tf_context->device()->GetStepAllocator( + tf_context->input_alloc_attr(0), + tf_context->resource_manager()); + } + } +#else /* include tensorflow/core/public/version.h prior to LIBXSMM otherwise the current TensorFlow API is assumed */ + const int numa_node = DeviceNumaNode(tf_context->device()); + allocator = tensorflow::cpu_allocator(numa_node); +#endif + } + return libxsmm_tf_allocator::allocate(allocator, size); + } + + /** Context based form of deallocating memory. 
*/ + template static void free_ctx(const void* context, void* buffer) { + typedef typename context_type::WrappedAllocator::first_type allocator_ptr; + context_type *const tf_context = static_cast(context); + allocator_ptr allocator = NULL; + if (NULL != tf_context) { +#if defined(LIBXSMM_TF12) + const int numa_node = DeviceNumaNode(tf_context->device()); + allocator = tensorflow::cpu_allocator(numa_node); +#else + if (NULL != tf_context->device()) { + if (0 < tf_context->num_outputs()) { + allocator = tf_context->device()->GetStepAllocator( + tf_context->output_alloc_attr(0), + tf_context->resource_manager()); + } + else if (0 < tf_context->num_inputs()) { + allocator = tf_context->device()->GetStepAllocator( + tf_context->input_alloc_attr(0), + tf_context->resource_manager()); + } + } +#endif + } + libxsmm_tf_allocator::deallocate(allocator, buffer); + } + +private: + template /* break interface dependency with TF */ + static void* allocate(allocator_ptr allocator, size_t size) { + void* result; + if (NULL != allocator) { + /* no (useless) waste with alignment; raw result is re-aligned anyways */ + result = allocator->AllocateRaw(1/*alignment*/, size); + } + else { + LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: memory allocator is missing"); + result = NULL; + } + return result; + } + + template /* break interface dependency with TF */ + static void deallocate(allocator_ptr allocator, void* buffer) { + LIBXSMM_ASSERT_MSG(NULL != allocator, "LIBXSMM ERROR: memory allocator is missing"); + if (NULL != allocator) allocator->DeallocateRaw(buffer); + } +}; + +#endif /*defined(__cplusplus)*/ + +#endif /*LIBXSMM_MALLOC_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_math.h b/third_party/libxsmm/include/libxsmm_math.h new file mode 100644 index 00000000..f6514228 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_math.h @@ -0,0 +1,140 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All 
rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MATH_H +#define LIBXSMM_MATH_H + +#include "libxsmm_typedefs.h" + + +/** + * Structure of differences with matrix norms according + * to http://www.netlib.org/lapack/lug/node75.html. + */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matdiff_info { + /** One-norm */ double norm1_abs, norm1_rel; + /** Infinity-norm */ double normi_abs, normi_rel; + /** Frobenius-norm */ double normf_rel; + /** Maximum difference, L2-norm (absolute and relative), and R-squared. */ + double linf_abs, linf_rel, l2_abs, l2_rel, rsq; + /** Statistics (ref): sum/l1, min., max., arith. avg., and variance. */ + double l1_ref, min_ref, max_ref, avg_ref, var_ref; + /** Statistics (tst): sum/l1, min., max., arith. avg., and variance. */ + double l1_tst, min_tst, max_tst, avg_tst, var_tst; + /** Values (v_ref, v_tst) and location (m, n) of largest linf_abs. */ + double v_ref, v_tst; + libxsmm_blasint m, n; +} libxsmm_matdiff_info; + +/** + * Utility function to calculate a collection of scalar differences between two matrices (libxsmm_matdiff_info). + * The location (m, n) of the largest difference (linf_abs) is recorded (also in case of NaN). In case of NaN, + * differences are set to infinity. If no difference is discovered, the location (m, n) is negative (OOB).
+ */ +LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info, + libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n, const void* ref, const void* tst, + const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst); + +/** + * Reduces input into output such that the difference is maintained or increased (max function). + * The very first (initial) output should be zeroed (libxsmm_matdiff_clear). + */ +LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input); +/** Clears the given info-structure, e.g., for the initial reduction-value (libxsmm_matdiff_reduce). */ +LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info); + +/** Greatest common divisor (corner case: the GCD of 0 and 0 is 1). */ +LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b); +/** Least common multiple. */ +LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b); + +/** + * This function finds prime-factors (up to 32) of an unsigned integer in ascending order, and + * returns the number of factors found (zero if the given number is prime and unequal to two). + */ +LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[]); + +/** Calculate co-prime number <= n/2 (except: libxsmm_shuffle(0|1) == 0). */ +LIBXSMM_API size_t libxsmm_shuffle(unsigned int n); + +/** + * Divides the product into prime factors and selects factors such that the new product is within + * the given limit (0/1-Knapsack problem), e.g., product=12=2*2*3 and limit=6 then result=2*3=6. + * The limit is at least reached or exceeded with the minimal possible product (is_lower=true). + */ +LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower); + +/* Kahan's summation returns accumulator += value and updates compensation. */ +LIBXSMM_API double libxsmm_kahan_sum(double value, double* accumulator, double* compensation); + +/** SQRT with Newton's method using integer arithmetic. 
*/ +LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x); +/** SQRT with Newton's method using integer arithmetic. */ +LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x); +/** Based on libxsmm_isqrt_u32, but actual factor of x. */ +LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x); +/** SQRT with Newton's method using double-precision. */ +LIBXSMM_API double libxsmm_dsqrt(double x); +/** SQRT with Newton's method using single-precision. */ +LIBXSMM_API float libxsmm_ssqrt(float x); + +/** CBRT with Newton's method using integer arithmetic. */ +LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x); +/** CBRT with Newton's method using integer arithmetic. */ +LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x); + +/** Single-precision approximation of exponential function (base 2). */ +LIBXSMM_API float libxsmm_sexp2(float x); + +/** + * Exponential function (base 2), which is limited to unsigned 8-bit input values. + * This function reproduces bit-accurate results (single-precision). + */ +LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x); + +/** +* Exponential function (base 2), which is limited to signed 8-bit input values. +* This function reproduces bit-accurate results (single-precision). +*/ +LIBXSMM_API float libxsmm_sexp2_i8(signed char x); + +/** Similar to libxsmm_sexp2_i8, but takes an integer as signed 8-bit value (check). */ +LIBXSMM_API float libxsmm_sexp2_i8i(int x); + +/** Inlineable fast tanh, such that the compiler can potentially vectorize.
*/ +LIBXSMM_API_INLINE float libxsmm_stanh_pade78(float i_x) { /* tanh(x) ~ nom(x)/denom(x): odd degree-7 numerator over even degree-8 denominator, both in Horner form on x^2 */ + const float l_c0 = 2027025.0f; + const float l_c1 = 270270.0f; + const float l_c2 = 6930.0f; + const float l_c3 = 36.0f; + const float l_c1_d = 945945.0f; + const float l_c2_d = 51975.0f; + const float l_c3_d = 630.0f; + const float l_hi_bound = 4.97f; /* inputs above this bound saturate to +1 */ + const float l_lo_bound = -4.97f; /* inputs below this bound saturate to -1 */ + const float l_ones = 1.0f; + const float l_neg_ones = -1.0f; + const float x2 = i_x * i_x; + const float t1_nom = (l_c3 * x2) + l_c2; + const float t2_nom = (t1_nom * x2) + l_c1; + const float t3_nom = (t2_nom * x2) + l_c0; + const float nom = t3_nom * i_x; + const float t1_denom = x2 + l_c3_d; + const float t2_denom = (t1_denom * x2) + l_c2_d; + const float t3_denom = (t2_denom * x2) + l_c1_d; + const float denom = (t3_denom * x2) + l_c0; + float result = nom/denom ; /* decays towards zero for large |i_x| (and is NaN once x2 powers overflow), hence the clamp below */ + result = (i_x > l_hi_bound) ? l_ones : result; /* clamp by the input, not by the quotient (which never reaches the bound) */ + result = (i_x < l_lo_bound) ? l_neg_ones : result; /* a NaN input fails both comparisons and propagates as NaN */ + return result; +} + +#endif /*LIBXSMM_MATH_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_memory.h b/third_party/libxsmm/include/libxsmm_memory.h new file mode 100644 index 00000000..53d4ed76 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_memory.h @@ -0,0 +1,85 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.)
+******************************************************************************/ +#ifndef LIBXSMM_MEMORY_H +#define LIBXSMM_MEMORY_H + +#include "libxsmm_macros.h" + +#if defined(__clang_analyzer__) +# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) memset((void*)(PTRDST), VALUE, SIZE) +#else +# define LIBXSMM_MEMSET127(PTRDST, VALUE, SIZE) { \ + char *const libxsmm_memset127_dst_ = (char*)(PTRDST); \ + union { size_t size; signed char size1; } libxsmm_memset127_; \ + signed char libxsmm_memset127_i_; LIBXSMM_ASSERT((SIZE) <= 127); \ + libxsmm_memset127_.size = (SIZE); \ + LIBXSMM_PRAGMA_UNROLL \ + for (libxsmm_memset127_i_ = 0; libxsmm_memset127_i_ < libxsmm_memset127_.size1; \ + ++libxsmm_memset127_i_) \ + { \ + libxsmm_memset127_dst_[libxsmm_memset127_i_] = (char)(VALUE); \ + } \ +} +#endif +#define LIBXSMM_MEMZERO127(PTRDST) LIBXSMM_MEMSET127(PTRDST, '\0', sizeof(*(PTRDST))) + +#define LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, NTS) { \ + const unsigned char *const libxsmm_memcpy127_loop_src_ = (const unsigned char*)(PTRSRC); \ + unsigned char *const libxsmm_memcpy127_loop_dst_ = (unsigned char*)(PTRDST); \ + signed char libxsmm_memcpy127_loop_i_; LIBXSMM_ASSERT((SIZE) <= 127); \ + NTS(libxsmm_memcpy127_loop_dst_) LIBXSMM_PRAGMA_UNROLL \ + for (libxsmm_memcpy127_loop_i_ = 0; libxsmm_memcpy127_loop_i_ < (signed char)(SIZE); \ + ++libxsmm_memcpy127_loop_i_) \ + { \ + libxsmm_memcpy127_loop_dst_[libxsmm_memcpy127_loop_i_] = \ + libxsmm_memcpy127_loop_src_[libxsmm_memcpy127_loop_i_]; \ + } \ +} +#define LIBXSMM_MEMCPY127_NTS(...) +#define LIBXSMM_MEMCPY127(PTRDST, PTRSRC, SIZE) \ + LIBXSMM_MEMCPY127_LOOP(PTRDST, PTRSRC, SIZE, LIBXSMM_MEMCPY127_NTS) +#define LIBXSMM_ASSIGN127(PTRDST, PTRSRC) LIBXSMM_ASSERT(sizeof(*(PTRSRC)) <= sizeof(*(PTRDST))); \ + LIBXSMM_MEMCPY127(PTRDST, PTRSRC, sizeof(*(PTRSRC))) + + +/** + * Calculates if there is a difference between two (short) buffers. + * Returns zero if there is no difference; otherwise non-zero. 
+ */ +LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size); + +/** + * Calculates if there is a difference between "a" and "n x b". + * Returns the index of the first match (or "n" in case of no match). + */ +LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn, unsigned char size, + unsigned char stride, unsigned int hint, unsigned int n); + +/** Similar to memcmp (C standard library), but the result is conceptually only a boolean. */ +LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size); + +/** Calculate a hash value for the given buffer and seed; accepts NULL-buffer. */ +LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed); + +/** Calculate a 64-bit hash for the given character string; accepts NULL-string. */ +LIBXSMM_API unsigned long long libxsmm_hash_string(const char* string); + +/** Return the pointer to the 1st match of "b" in "a", or NULL (no match). */ +LIBXSMM_API const char* libxsmm_stristr(const char* a, const char* b); + +/** + * Check if pointer is SIMD-aligned and optionally consider the next access (increment in Bytes). + * Optionally calculates the alignment of the given pointer in Bytes. + */ +LIBXSMM_API int libxsmm_aligned(const void* ptr, const size_t* inc, int* alignment); + +#endif /*LIBXSMM_MEMORY_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_mhd.h b/third_party/libxsmm/include/libxsmm_mhd.h new file mode 100644 index 00000000..ab4cf174 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_mhd.h @@ -0,0 +1,167 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MHD_H +#define LIBXSMM_MHD_H + +#include "libxsmm_typedefs.h" + + +/** Denotes the element/pixel type of an image/channel. Floating-point and signed integer kinds reuse the respective LIBXSMM_DATATYPE value; the unsigned kinds start at LIBXSMM_DATATYPE_UNSUPPORTED (U64) and continue consecutively (U32, U16, U8), followed by UNKNOWN. The trailing comments presumably name the matching MetaImage element type (confirm). */ +typedef enum libxsmm_mhd_elemtype { + LIBXSMM_MHD_ELEMTYPE_F64 = LIBXSMM_DATATYPE_F64, /* MET_DOUBLE */ + LIBXSMM_MHD_ELEMTYPE_F32 = LIBXSMM_DATATYPE_F32, /* MET_FLOAT */ + LIBXSMM_MHD_ELEMTYPE_BF16 = LIBXSMM_DATATYPE_BF16, /* MET_BFLOAT */ + LIBXSMM_MHD_ELEMTYPE_I64 = LIBXSMM_DATATYPE_I64, /* MET_LONG */ + LIBXSMM_MHD_ELEMTYPE_I32 = LIBXSMM_DATATYPE_I32, /* MET_INT */ + LIBXSMM_MHD_ELEMTYPE_I16 = LIBXSMM_DATATYPE_I16, /* MET_SHORT */ + LIBXSMM_MHD_ELEMTYPE_I8 = LIBXSMM_DATATYPE_I8, /* MET_CHAR */ + LIBXSMM_MHD_ELEMTYPE_U64 = LIBXSMM_DATATYPE_UNSUPPORTED, /* MET_ULONG */ + LIBXSMM_MHD_ELEMTYPE_U32, /* MET_UINT */ + LIBXSMM_MHD_ELEMTYPE_U16, /* MET_USHORT */ + LIBXSMM_MHD_ELEMTYPE_U8, /* MET_UCHAR */ + LIBXSMM_MHD_ELEMTYPE_UNKNOWN +} libxsmm_mhd_elemtype; + + +/** + * Function type used for custom data-handler or element conversion. + * The value-range (src_min, src_max) may be used to scale values + * in case of a type-conversion. The handler receives one destination element (dst, dst_type) and one source element (src, src_type). + */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE int (*libxsmm_mhd_element_handler)( + void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, + const void* src, const void* src_min, const void* src_max); + +/** + * Predefined function to perform element data conversion. + * Scales source-values in case of non-NULL src_min and src_max, + * or otherwise clamps to the destination-type.
+ */ +LIBXSMM_API int libxsmm_mhd_element_conversion( + void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, + const void* src, const void* src_min, const void* src_max); + +/** + * Predefined function to check a buffer against file content. + * In case of different types, libxsmm_mhd_element_conversion + * is performed to compare values using the source-type. + */ +LIBXSMM_API int libxsmm_mhd_element_comparison( + void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, + const void* src, const void* src_min, const void* src_max); + + +/** Returns the name and size of the element type; result may be NULL/0 in case of an unknown type. */ +LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type, size_t* typesize, const char** ctypename); + +/** Returns the type of the element for a given type-name. */ +LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]); + + +/** + * Parse the header of an MHD-file. The header can be part of the data file (local), + * or separately stored (header: MHD, data MHA or RAW). + */ +LIBXSMM_API int libxsmm_mhd_read_header( + /* Filename referring to the header-file (may also contain the data). */ + const char header_filename[], + /* Maximum length of path/file name. */ + size_t filename_max_length, + /* Filename containing the data (may be the same as the header-file). */ + char filename[], + /* Yields the maximum/possible number of dimensions on input, + * and the actual number of dimensions on output. */ + size_t* ndims, + /* Image extents ("ndims" number of entries). */ + size_t size[], + /* Number of interleaved image channels. */ + size_t* ncomponents, + /* Type of the image elements (pixel type). */ + libxsmm_mhd_elemtype* type, + /* Size of the header in bytes; may be used to skip the header, + * when reading content; can be a NULL-argument (optional).
*/ + size_t* header_size, + /* Size (in Bytes) of a user-defined extended data record; + * can be a NULL-argument (optional). */ + size_t* extension_size); + + +/** + * Loads the data file, and optionally allows data conversion. + * Conversion is performed such that values are clamped to fit + * into the destination. + */ +LIBXSMM_API int libxsmm_mhd_read( + /* Filename referring to the data. */ + const char filename[], + /* Offset within pitched buffer (NULL: no offset). */ + const size_t offset[], + /* Image dimensions (extents). */ + const size_t size[], + /* Leading buffer dimensions (NULL: same as size). */ + const size_t pitch[], + /* Dimensionality (number of entries in size). */ + size_t ndims, + /* Number of interleaved image channels. */ + size_t ncomponents, + /* Used to skip the header, and to only read the data. */ + size_t header_size, + /* Data element type as stored (pixel type). */ + libxsmm_mhd_elemtype type_stored, + /* Storage type (data conversion, optional). */ + const libxsmm_mhd_elemtype* type_data, + /* Buffer where the data is read into. */ + void* data, + /** + * Optional callback executed per entry when reading the data. + * May assign the value to the left-most argument, but also + * allows to only compare with present data. Can be used to + * avoid allocating an actual destination. + */ + libxsmm_mhd_element_handler handle_element, + /* Post-content data (extension, optional). */ + char extension[], + /* Size of the extension; can be zero. */ + size_t extension_size); + + +/** + * Save a file using an extended data format, which is compatible with the Meta Image Format (MHD). + * The file is suitable for visual inspection using, e.g., ITK-SNAP or ParaView. + */ +LIBXSMM_API int libxsmm_mhd_write(const char filename[], + /* Offset within pitched buffer (NULL: no offset). */ + const size_t offset[], + /* Image dimensions (extents). */ + const size_t size[], + /* Leading buffer dimensions (NULL: same as size).
*/ + const size_t pitch[], + /* Dimensionality, i.e., number of entries in offset/size/pitch. */ + size_t ndims, + /* Number of pixel components. */ + size_t ncomponents, + /* Type (input). */ + libxsmm_mhd_elemtype type_data, + /* Type (data conversion, optional). */ + const libxsmm_mhd_elemtype* type, + /* Raw data to be saved. */ + const void* data, + /* Size of the header; can be a NULL-argument (optional). */ + size_t* header_size, + /* Extension header data; can be NULL. */ + const char extension_header[], + /* Extension data stream; can be NULL. */ + const void* extension, + /* Extension data size (passed by value); can be zero. */ + size_t extension_size); + +#endif /*LIBXSMM_MHD_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_rng.h b/third_party/libxsmm/include/libxsmm_rng.h new file mode 100644 index 00000000..fa0ae514 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_rng.h @@ -0,0 +1,57 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_RNG_H +#define LIBXSMM_RNG_H + +#include "libxsmm_typedefs.h" + +/** + * Create a new external state for thread-safe execution managed + * by the user. We do not provide a function for drawing the random numbers; + * the user is supposed to call the LIBXSMM_INTRINSICS_MM512_RNG_EXTSTATE_PS + * or LIBXSMM_INTRINSICS_MM512_RNG_XOSHIRO128P_EXTSTATE_EPI32 intrinsic.
+ * */ +LIBXSMM_API unsigned int* libxsmm_rng_create_extstate(unsigned int/*uint32_t*/ seed); + +/** Free a previously created external RNG state (rng_avx512_extstate), see libxsmm_rng_create_extstate. */ +LIBXSMM_API void libxsmm_rng_destroy_extstate(unsigned int* stateptr); + +/** Set the seed of libxsmm_rng_* (similar to srand). */ +LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed); + +/** + * This SP-RNG is using xoshiro128+ 1.0, work done by + * David Blackman and Sebastiano Vigna (vigna@acm.org). + * It is their best and fastest 32-bit generator for + * 32-bit floating-point numbers. They suggest using + * its upper bits for floating-point generation, which is what + * we do here to generate "count" numbers in [0, 1). + */ +LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count); + +/** + * Returns a (pseudo-)random value based on rand/rand48 in the interval [0, n). + * This function compensates for an n, which is not a factor of RAND_MAX. + * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator. + */ +LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n); + +/** Sequence of random data ("nbytes" Bytes written to "data") based on libxsmm_rng_u32. */ +LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes); + +/** + * Similar to libxsmm_rng_u32, but returns a DP-value in the interval [0, 1). + * Note: libxsmm_rng_set_seed must be used if one wishes to seed the generator. + */ +LIBXSMM_API double libxsmm_rng_f64(void); + +#endif /* LIBXSMM_RNG_H */ + diff --git a/third_party/libxsmm/include/libxsmm_source.h b/third_party/libxsmm/include/libxsmm_source.h new file mode 100644 index 00000000..645cae21 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_source.h @@ -0,0 +1,144 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file.
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_SOURCE_H +#define LIBXSMM_SOURCE_H + +#if defined(LIBXSMM_MACROS_H) +# error Please do not include any LIBXSMM header other than libxsmm_source.h! +#endif +#if defined(LIBXSMM_BUILD) +# error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM! +#endif + +/** + * This header is intentionally called "libxsmm_source.h" since the following block + * includes *internal* files, and thereby exposes LIBXSMM's implementation. + * The so-called "header-only" usage model gives up the clearly defined binary interface + * (including support for hot-fixes after deployment), and requires rebuilding client + * code for every (internal) change of LIBXSMM. Please make sure to only rely on the + * public interface as the internal implementation may change without notice.
+ */ +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include "../src/generator_aarch64_instructions.c" +#include "../src/generator_common.c" +#include "../src/generator_common_aarch64.c" +#include "../src/generator_common_x86.c" +#include "../src/generator_gemm.c" +#include "../src/generator_gemm_aarch64.c" +#include "../src/generator_gemm_amx.c" +#include "../src/generator_gemm_amx_emu.c" +#include "../src/generator_gemm_amx_microkernel.c" +#include "../src/generator_gemm_amx_microkernel_emu.c" +#include "../src/generator_gemm_avx2_microkernel.c" +#include "../src/generator_gemm_avx512_microkernel.c" +#include "../src/generator_gemm_avx_microkernel.c" +#include "../src/generator_gemm_common.c" +#include "../src/generator_gemm_common_aarch64.c" +#include "../src/generator_gemm_noarch.c" +#include "../src/generator_gemm_sse_avx_avx2_avx512.c" +#include "../src/generator_gemm_sse_microkernel.c" +#include "../src/generator_mateltwise.c" +#include "../src/generator_mateltwise_misc_avx_avx512.c" +#include "../src/generator_mateltwise_reduce_avx_avx512.c" +#include "../src/generator_mateltwise_sse_avx_avx512.c" +#include "../src/generator_mateltwise_transform_avx.c" +#include "../src/generator_mateltwise_transform_avx512.c" +#include "../src/generator_mateltwise_transform_common.c" +#include "../src/generator_mateltwise_transform_common_x86.c" +#include "../src/generator_mateltwise_transform_sse.c" +#include "../src/generator_mateltwise_unary_binary_avx_avx512.c" +#include "../src/generator_matequation.c" +#include "../src/generator_matequation_avx_avx512.c" +#include "../src/generator_matequation_regblocks_avx_avx512.c" +#include "../src/generator_matequation_scratch_avx_avx512.c" +#include "../src/generator_packed_gemm_ac_rm.c" +#include "../src/generator_packed_gemm_ac_rm_aarch64.c" +#include "../src/generator_packed_gemm_ac_rm_avx_avx2_avx512.c" +#include "../src/generator_packed_gemm_bc_rm.c" +#include 
"../src/generator_packed_gemm_bc_rm_aarch64.c" +#include "../src/generator_packed_gemm_bc_rm_avx_avx2_avx512.c" +#include "../src/generator_packed_spgemm.c" +#include "../src/generator_packed_spgemm_csc_bsparse.c" +#include "../src/generator_packed_spgemm_csc_bsparse_aarch64.c" +#include "../src/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.c" +#include "../src/generator_packed_spgemm_csc_csparse.c" +#include "../src/generator_packed_spgemm_csc_csparse_avx_avx2_avx512.c" +#include "../src/generator_packed_spgemm_csr_asparse.c" +#include "../src/generator_packed_spgemm_csr_asparse_aarch64.c" +#include "../src/generator_packed_spgemm_csr_asparse_avx_avx2_avx512.c" +#include "../src/generator_packed_spgemm_csr_bsparse.c" +#include "../src/generator_packed_spgemm_csr_bsparse_aarch64.c" +#include "../src/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.c" +#include "../src/generator_spgemm.c" +#include "../src/generator_spgemm_csc_asparse.c" +#include "../src/generator_spgemm_csc_bsparse.c" +#include "../src/generator_spgemm_csc_reader.c" +#include "../src/generator_spgemm_csr_asparse.c" +#include "../src/generator_spgemm_csr_asparse_reg.c" +#include "../src/generator_spgemm_csr_reader.c" +#include "../src/generator_x86_instructions.c" +#include "../src/libxsmm_cpuid_arm.c" +#include "../src/libxsmm_cpuid_x86.c" +#include "../src/libxsmm_dnn.c" +#include "../src/libxsmm_dnn_convolution.c" +#include "../src/libxsmm_dnn_convolution_backward.c" +#include "../src/libxsmm_dnn_convolution_forward.c" +#include "../src/libxsmm_dnn_convolution_weight_update.c" +#include "../src/libxsmm_dnn_elementwise.c" +#include "../src/libxsmm_dnn_fullyconnected.c" +#include "../src/libxsmm_dnn_fullyconnected_backward_weight_update.c" +#include "../src/libxsmm_dnn_fullyconnected_forward.c" +#include "../src/libxsmm_dnn_fusedbatchnorm.c" +#include "../src/libxsmm_dnn_fusedbatchnorm_backward.c" +#include "../src/libxsmm_dnn_fusedbatchnorm_forward.c" +#include 
"../src/libxsmm_dnn_fusedgroupnorm.c" +#include "../src/libxsmm_dnn_fusedgroupnorm_backward.c" +#include "../src/libxsmm_dnn_fusedgroupnorm_forward.c" +#include "../src/libxsmm_dnn_optimizer.c" +#include "../src/libxsmm_dnn_optimizer_sgd.c" +#include "../src/libxsmm_dnn_pooling.c" +#include "../src/libxsmm_dnn_pooling_backward.c" +#include "../src/libxsmm_dnn_pooling_forward.c" +#include "../src/libxsmm_dnn_rnncell.c" +#include "../src/libxsmm_dnn_rnncell_backward_weight_update.c" +#include "../src/libxsmm_dnn_rnncell_forward.c" +#include "../src/libxsmm_dnn_softmaxloss.c" +#include "../src/libxsmm_dnn_softmaxloss_backward.c" +#include "../src/libxsmm_dnn_softmaxloss_forward.c" +#include "../src/libxsmm_dnn_tensor.c" +#include "../src/libxsmm_ext.c" +#include "../src/libxsmm_ext_gemm.c" +#include "../src/libxsmm_ext_xcopy.c" +#include "../src/libxsmm_fsspmdm.c" +#include "../src/libxsmm_gemm.c" +#include "../src/libxsmm_generator.c" +#include "../src/libxsmm_hash.c" +#include "../src/libxsmm_main.c" +#include "../src/libxsmm_malloc.c" +#include "../src/libxsmm_math.c" +#include "../src/libxsmm_matrixeqn.c" +#include "../src/libxsmm_memory.c" +#include "../src/libxsmm_mhd.c" +#include "../src/libxsmm_perf.c" +#include "../src/libxsmm_python.c" +#include "../src/libxsmm_rng.c" +#include "../src/libxsmm_spmdm.c" +#include "../src/libxsmm_sync.c" +#include "../src/libxsmm_timer.c" +#include "../src/libxsmm_trace.c" +#include "../src/libxsmm_xcopy.c" +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#endif /*LIBXSMM_SOURCE_H*/ diff --git a/third_party/libxsmm/include/libxsmm_spmdm.h b/third_party/libxsmm/include/libxsmm_spmdm.h new file mode 100644 index 00000000..1f452dd3 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_spmdm.h @@ -0,0 +1,115 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_SPMDM_H +#define LIBXSMM_SPMDM_H + +#include "libxsmm_typedefs.h" + + +typedef enum libxsmm_spmdm_datatype { + LIBXSMM_SPMDM_DATATYPE_F32, + LIBXSMM_SPMDM_DATATYPE_BFLOAT16 +} libxsmm_spmdm_datatype; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spmdm_handle { + /* The following are the matrix multiply dimensions: A (sparse): m X k, B (dense): k X n, Output C (dense): m X n */ + int m; + int n; + int k; + /* The block sizes for A, B and C. */ + /* Here we fix A to be divided into 128 X 128 blocks, B/C to be 128 X 48 for HSW/BDW and 128 X 96 for SKX */ + int bm; + int bn; + int bk; + /* The number of blocks for the m, n and k dimensions */ + int mb; + int nb; + int kb; + libxsmm_spmdm_datatype datatype; + char* base_ptr_scratch_A; + char* base_ptr_scratch_B_scratch_C; + int memory_for_scratch_per_thread; +} libxsmm_spmdm_handle; + +/** + * This stores a single sparse slice (or block) of sparse matrix A using a CSR representation (rowidx, colidx, and values). + * Each slice corresponds to a bm X bk region of A, and stores local indexes. + */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_CSR_sparseslice { + /* Since bm and bk are assumed to be <=256, a 16-bit integer is enough to store the local rowidx, colidx */ + uint16_t* rowidx; + uint16_t* colidx; + float* values; +} libxsmm_CSR_sparseslice; + + +LIBXSMM_API void libxsmm_spmdm_init( + int M, int N, int K, + int max_threads, + libxsmm_spmdm_handle* handle, + libxsmm_CSR_sparseslice** libxsmm_output_csr); + +LIBXSMM_API void libxsmm_spmdm_destroy( + libxsmm_spmdm_handle* handle); + +LIBXSMM_API int
libxsmm_spmdm_get_num_createSparseSlice_blocks( + const libxsmm_spmdm_handle* handle); + +LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks( + const libxsmm_spmdm_handle* handle); + +/** This converts a dense representation of the sparse matrix to a 2D array of sparse slices (FP32 input). */ +LIBXSMM_API void libxsmm_spmdm_createSparseSlice_fp32_thread( + const libxsmm_spmdm_handle* handle, + char transa, + const float* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads); + +LIBXSMM_API void libxsmm_spmdm_createSparseSlice_bfloat16_thread( + const libxsmm_spmdm_handle* handle, + char transa, + const libxsmm_bfloat16* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads); + +/** NOTE: This code currently ignores the alpha input to the matrix multiply. */ +LIBXSMM_API void libxsmm_spmdm_compute_fp32_thread( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const float* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const float* b, + char transc, + const float* beta, + float* c, + int block_id, + int tid, int nthreads); + +/** NOTE: This code currently ignores the alpha input to the matrix multiply; the output "c" is FP32 even for BF16 inputs. */ +LIBXSMM_API void libxsmm_spmdm_compute_bfloat16_thread( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const libxsmm_bfloat16* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const libxsmm_bfloat16* b, + char transc, + const libxsmm_bfloat16* beta, + float* c, + int block_id, + int tid, int nthreads); + +#endif /*LIBXSMM_SPMDM_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_sync.h b/third_party/libxsmm/include/libxsmm_sync.h new file mode 100644 index 00000000..1f40fab1 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_sync.h @@ -0,0 +1,816 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library.
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_SYNC_H +#define LIBXSMM_SYNC_H + +#include "libxsmm_intrinsics_x86.h" + +#if !defined(LIBXSMM_TLS) +# if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_NO_TLS) +# if defined(__CYGWIN__) && defined(__clang__) +# define LIBXSMM_NO_TLS +# define LIBXSMM_TLS +# else +# if (defined(_WIN32) && !defined(__GNUC__) && !defined(__clang__)) || (defined(__PGI) && !defined(__cplusplus)) +# define LIBXSMM_TLS LIBXSMM_ATTRIBUTE(thread) +# elif defined(__GNUC__) || defined(__clang__) || defined(_CRAYC) +# define LIBXSMM_TLS __thread +# elif defined(__cplusplus) +# define LIBXSMM_TLS thread_local +# else +# error Missing TLS support! +# endif +# endif +# else +# if !defined(LIBXSMM_NO_TLS) +# define LIBXSMM_NO_TLS +# endif +# define LIBXSMM_TLS +# endif +#endif + +#if !defined(LIBXSMM_GCC_BASELINE) && !defined(LIBXSMM_SYNC_LEGACY) && ((defined(_WIN32) && defined(__clang__)) || \ + (defined(__GNUC__) && LIBXSMM_VERSION2(4, 7) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) +# define LIBXSMM_GCC_BASELINE +#endif + +#if defined(__MIC__) +# define LIBXSMM_SYNC_PAUSE _mm_delay_32(8/*delay*/) +#elif !defined(LIBXSMM_INTRINSICS_NONE) +# if defined(LIBXSMM_GCC_BASELINE) && !defined(__INTEL_COMPILER) +# define LIBXSMM_SYNC_PAUSE __builtin_ia32_pause() +# else +# define LIBXSMM_SYNC_PAUSE _mm_pause() +# endif +#elif (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) && defined(__GNUC__) +# define LIBXSMM_SYNC_PAUSE __asm__ __volatile__("pause" ::: "memory") +#else +# define LIBXSMM_SYNC_PAUSE +#endif + +/* permit thread-unsafe */ +#if !defined(LIBXSMM_SYNC_NONE) && ( \ + (defined(__PGI) && (!defined(LIBXSMM_LIBATOMIC) || 
!defined(__STATIC))) || \ + (defined(_CRAYC) && !defined(__GNUC__))) +# define LIBXSMM_SYNC_NONE +#endif + +#if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) && 0 +# define LIBXSMM_ATOMIC_TRYLOCK_CMPSWP +#endif +#if !defined(LIBXSMM_ATOMIC_ZERO_STORE) && defined(_CRAYC) +# define LIBXSMM_ATOMIC_ZERO_STORE +#endif +#if !defined(LIBXSMM_ATOMIC_LOCKTYPE) +# if defined(_WIN32) || 1/*alignment*/ +# define LIBXSMM_ATOMIC_LOCKTYPE int +# else +# define LIBXSMM_ATOMIC_LOCKTYPE char +# endif +#endif + +typedef enum libxsmm_atomic_kind { +#if defined(__ATOMIC_SEQ_CST) + LIBXSMM_ATOMIC_SEQ_CST = __ATOMIC_SEQ_CST, +#else + LIBXSMM_ATOMIC_SEQ_CST = 0, +#endif +#if defined(__ATOMIC_RELAXED) + LIBXSMM_ATOMIC_RELAXED = __ATOMIC_RELAXED +#else + LIBXSMM_ATOMIC_RELAXED = LIBXSMM_ATOMIC_SEQ_CST +#endif +} libxsmm_atomic_kind; + +#define LIBXSMM_NONATOMIC_LOCKTYPE LIBXSMM_ATOMIC_LOCKTYPE +#define LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) (*(SRC_PTR)) +#define LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) { LIBXSMM_UNUSED(KIND); *(DST_PTR) = (VALUE); } +#define LIBXSMM_NONATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND) +#define LIBXSMM_NONATOMIC_FETCH_OR(DST_PTR, VALUE/*side-effect*/, KIND) (/* 1st step: swap(dst, val) */ \ + ((*DST_PTR) = (*DST_PTR) ^ (VALUE)), (VALUE = (VALUE) ^ (*DST_PTR)), ((*DST_PTR) = (*DST_PTR) ^ (VALUE)), \ + (*(DST_PTR) |= VALUE), (VALUE) /* 2nd step: or, and 3rd/last step: original dst-value */) +#define LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) += VALUE) +#define LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) (*(DST_PTR) -= VALUE) +#define LIBXSMM_NONATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) - (VALUE))) +#define LIBXSMM_NONATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) (LIBXSMM_NONATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND), (*(DST_PTR) + (VALUE))) +#define LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) ((NEWVAL) == (*(DST_PTR) == (OLDVAL) ? 
(*(DST_PTR) = (NEWVAL)) : (OLDVAL))) +#define LIBXSMM_NONATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_NONATOMIC_CMPSWP(DST_PTR, 0, 1, KIND) +#define LIBXSMM_NONATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) { LIBXSMM_UNUSED(NPAUSE); \ + LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 1, KIND); \ + LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_ACQUIRE"); } +#define LIBXSMM_NONATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_UNUSED(DST_PTR); LIBXSMM_UNUSED(KIND); \ + LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); LIBXSMM_NONATOMIC_STORE(DST_PTR, 0, KIND); \ + LIBXSMM_ASSERT_MSG(0 == *(DST_PTR), "LIBXSMM_NONATOMIC_RELEASE"); } +#define LIBXSMM_NONATOMIC_SYNC(KIND) LIBXSMM_UNUSED(KIND) + +#if (0 == LIBXSMM_SYNC) || defined(LIBXSMM_SYNC_NONE) +# define LIBXSMM_ATOMIC(FN, BITS) FN +# define LIBXSMM_ATOMIC_LOAD LIBXSMM_NONATOMIC_LOAD +# define LIBXSMM_ATOMIC_STORE LIBXSMM_NONATOMIC_STORE +# define LIBXSMM_ATOMIC_STORE_ZERO LIBXSMM_NONATOMIC_STORE_ZERO +# define LIBXSMM_ATOMIC_FETCH_OR LIBXSMM_NONATOMIC_FETCH_OR +# define LIBXSMM_ATOMIC_ADD_FETCH LIBXSMM_NONATOMIC_ADD_FETCH +# define LIBXSMM_ATOMIC_SUB_FETCH LIBXSMM_NONATOMIC_SUB_FETCH +# define LIBXSMM_ATOMIC_FETCH_ADD LIBXSMM_NONATOMIC_FETCH_ADD +# define LIBXSMM_ATOMIC_FETCH_SUB LIBXSMM_NONATOMIC_FETCH_SUB +# define LIBXSMM_ATOMIC_CMPSWP LIBXSMM_NONATOMIC_CMPSWP +# define LIBXSMM_ATOMIC_TRYLOCK LIBXSMM_NONATOMIC_TRYLOCK +# define LIBXSMM_ATOMIC_ACQUIRE LIBXSMM_NONATOMIC_ACQUIRE +# define LIBXSMM_ATOMIC_RELEASE LIBXSMM_NONATOMIC_RELEASE +# define LIBXSMM_ATOMIC_SYNC LIBXSMM_NONATOMIC_SYNC +# if !defined(LIBXSMM_SYNC_NPAUSE) +# define LIBXSMM_SYNC_NPAUSE 0 +# endif +#elif (defined(LIBXSMM_GCC_BASELINE) || defined(LIBXSMM_LIBATOMIC) /* GNU's libatomic required */ || \ + (defined(__GNUC__) && LIBXSMM_VERSION2(4, 1) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__))) +# if defined(LIBXSMM_LIBATOMIC) +# define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, 
BITS)(FN) +# define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8) +# define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16) +# define LIBXSMM_ATOMIC32(FN) FN/*default*/ +# define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64) +# if defined(__PGI) +# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) LIBXSMM_NONATOMIC_LOAD(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) LIBXSMM_NONATOMIC_STORE(DST_PTR, VALUE, KIND) +# else +# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_4(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) __atomic_load_1(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD16(SRC_PTR, KIND) __atomic_load_2(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) __atomic_load_8(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_4(DST_PTR, (unsigned int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) __atomic_store_1(DST_PTR, (unsigned char)(VALUE), KIND) +# define LIBXSMM_ATOMIC_STORE16(DST_PTR, VALUE, KIND) __atomic_store_2(DST_PTR, (unsigned short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) __atomic_store_8(DST_PTR, (unsigned long long)(VALUE), KIND) +# endif +# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or_4(DST_PTR, (unsigned int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) __atomic_fetch_or_1(DST_PTR, (unsigned char)(VALUE), KIND) +# 
define LIBXSMM_ATOMIC_FETCH_OR16(DST_PTR, VALUE, KIND) __atomic_fetch_or_2(DST_PTR, (unsigned short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_OR64(DST_PTR, VALUE, KIND) __atomic_fetch_or_8(DST_PTR, (unsigned long long)(VALUE), KIND) +# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch_4(DST_PTR, (int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_ADD_FETCH8(DST_PTR, VALUE, KIND) __atomic_add_fetch_1(DST_PTR, (signed char)(VALUE), KIND) +# define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) __atomic_add_fetch_2(DST_PTR, (short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) __atomic_add_fetch_8(DST_PTR, (long long)(VALUE), KIND) +# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch_4(DST_PTR, (int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_SUB_FETCH8(DST_PTR, VALUE, KIND) __atomic_sub_fetch_1(DST_PTR, (signed char)(VALUE), KIND) +# define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) __atomic_sub_fetch_2(DST_PTR, (short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) __atomic_sub_fetch_8(DST_PTR, (long long)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add_4(DST_PTR, (int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_ADD8(DST_PTR, VALUE, KIND) __atomic_fetch_add_1(DST_PTR, (signed char)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) __atomic_fetch_add_2(DST_PTR, (short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) __atomic_fetch_add_8(DST_PTR, (long long)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub_4(DST_PTR, (int)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB8(DST_PTR, VALUE, KIND) __atomic_fetch_sub_1(DST_PTR, (signed char)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) __atomic_fetch_sub_2(DST_PTR, (short)(VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) 
__atomic_fetch_sub_8(DST_PTR, (long long)(VALUE), KIND) +# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) \ + __atomic_compare_exchange_4(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) \ + __atomic_compare_exchange_1(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_ATOMIC_CMPSWP16(DST_PTR, OLDVAL, NEWVAL, KIND) \ + __atomic_compare_exchange_2(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_ATOMIC_CMPSWP64(DST_PTR, OLDVAL, NEWVAL, KIND) \ + __atomic_compare_exchange_8(DST_PTR, &(OLDVAL), (NEWVAL), 0/*false*/, KIND, LIBXSMM_ATOMIC_RELAXED) +# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND)) +# endif +# if defined(__PGI) +# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ + LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND); } /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ +# else +# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ + __atomic_clear(DST_PTR, KIND); } +# endif +# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() +# if !defined(LIBXSMM_ATOMIC_ZERO_STORE) +# define LIBXSMM_ATOMIC_ZERO_STORE +# endif +# elif defined(LIBXSMM_GCC_BASELINE) +# define LIBXSMM_ATOMIC(FN, BITS) FN +# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __atomic_load_n(SRC_PTR, KIND) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) __atomic_store_n(DST_PTR, VALUE, KIND) +# if !defined(LIBXSMM_ATOMIC_ZERO_STORE) +# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__atomic_and_fetch(DST_PTR, 0, KIND)) +# endif +# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __atomic_fetch_or(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __atomic_add_fetch(DST_PTR, 
VALUE, KIND) +# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __atomic_sub_fetch(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __atomic_fetch_add(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __atomic_fetch_sub(DST_PTR, VALUE, KIND) +# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL) +# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (!__atomic_test_and_set(DST_PTR, KIND)) +# endif +# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ + __atomic_clear(DST_PTR, KIND); } +# if 0 /* __atomic_thread_fence: incorrect behavior in libxsmm_barrier (even with LIBXSMM_ATOMIC_SEQ_CST) */ +# define LIBXSMM_ATOMIC_SYNC(KIND) __atomic_thread_fence(KIND) +# else +# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() +# endif +# else /* GCC legacy atomics */ +# define LIBXSMM_ATOMIC(FN, BITS) FN +# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, KIND) __sync_or_and_fetch(SRC_PTR, 0) +# if (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) { \ + __asm__ __volatile__("" ::: "memory"); *(DST_PTR) = (VALUE); \ + __asm__ __volatile__("" ::: "memory"); } +# else +# define LIBXSMM_ATOMIC_SYNC_NOFENCE(KIND) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) *(DST_PTR) = (VALUE) +# endif +# if !defined(LIBXSMM_ATOMIC_ZERO_STORE) +# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) do {} while (__sync_and_and_fetch(DST_PTR, 0)) +# endif +# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) __sync_fetch_and_or(DST_PTR, VALUE) +# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) __sync_add_and_fetch(DST_PTR, VALUE) +# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) __sync_sub_and_fetch(DST_PTR, VALUE) +# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) __sync_fetch_and_add(DST_PTR, VALUE) +# 
define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) __sync_fetch_and_sub(DST_PTR, VALUE) +# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) __sync_bool_compare_and_swap(DST_PTR, OLDVAL, NEWVAL) +# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == __sync_lock_test_and_set(DST_PTR, 1)) +# endif +# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ + __sync_lock_release(DST_PTR); } +# define LIBXSMM_ATOMIC_SYNC(KIND) __sync_synchronize() +# endif +# if defined(LIBXSMM_ATOMIC_ZERO_STORE) +# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND) +# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 8)(DST_PTR, 0, KIND) +# define LIBXSMM_ATOMIC_STORE_ZERO16(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 16)(DST_PTR, 0, KIND) +# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 64)(DST_PTR, 0, KIND) +# endif +# if !defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) /* matches bit-width of LIBXSMM_ATOMIC_LOCKTYPE */ \ + (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND)) +# endif +# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \ + LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \ + while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \ + LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE") +# if !defined(LIBXSMM_SYNC_NPAUSE) +# define LIBXSMM_SYNC_NPAUSE 4096 +# endif +#elif defined(_WIN32) +# define LIBXSMM_ATOMIC(FN, BITS) LIBXSMM_CONCATENATE(LIBXSMM_ATOMIC, BITS)(FN) +# define LIBXSMM_ATOMIC8(FN) LIBXSMM_CONCATENATE(FN, 8) +# define LIBXSMM_ATOMIC16(FN) LIBXSMM_CONCATENATE(FN, 16) +# define LIBXSMM_ATOMIC32(FN) FN/*default*/ +# define LIBXSMM_ATOMIC64(FN) LIBXSMM_CONCATENATE(FN, 64) +# define LIBXSMM_ATOMIC_LOAD(SRC_PTR, 
KIND) InterlockedOr((volatile LONG*)(SRC_PTR), 0) /* Win32: an atomic load is expressed as an interlocked OR with zero */ +# define LIBXSMM_ATOMIC_LOAD8(SRC_PTR, KIND) _InterlockedOr8((volatile char*)(SRC_PTR), 0) +# define LIBXSMM_ATOMIC_LOAD64(SRC_PTR, KIND) InterlockedOr64((volatile LONGLONG*)(SRC_PTR), 0) +# define LIBXSMM_ATOMIC_STORE(DST_PTR, VALUE, KIND) InterlockedExchange((volatile LONG*)(DST_PTR), (LONG)(VALUE)) +# define LIBXSMM_ATOMIC_STORE8(DST_PTR, VALUE, KIND) InterlockedExchange8((volatile char*)(DST_PTR), (char)(VALUE)) /* 8-bit exchange: the value is cast to the char operand type (was LONGLONG) */ +# define LIBXSMM_ATOMIC_STORE64(DST_PTR, VALUE, KIND) InterlockedExchange64((volatile LONGLONG*)(DST_PTR), (LONGLONG)(VALUE)) +# if defined(LIBXSMM_ATOMIC_ZERO_STORE) +# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE(DST_PTR, 0, KIND) +# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE8(DST_PTR, 0, KIND) +# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) LIBXSMM_ATOMIC_STORE64(DST_PTR, 0, KIND) +# else /* without LIBXSMM_ATOMIC_ZERO_STORE, zeroing is expressed as an interlocked AND with zero */ +# define LIBXSMM_ATOMIC_STORE_ZERO(DST_PTR, KIND) InterlockedAnd((volatile LONG*)(DST_PTR), 0) +# define LIBXSMM_ATOMIC_STORE_ZERO8(DST_PTR, KIND) InterlockedAnd8((volatile char*)(DST_PTR), 0) +# define LIBXSMM_ATOMIC_STORE_ZERO64(DST_PTR, KIND) InterlockedAnd64((volatile LONGLONG*)(DST_PTR), 0) +# endif +# define LIBXSMM_ATOMIC_FETCH_OR(DST_PTR, VALUE, KIND) InterlockedOr((volatile LONG*)(DST_PTR), VALUE) +# define LIBXSMM_ATOMIC_FETCH_OR8(DST_PTR, VALUE, KIND) _InterlockedOr8((volatile char*)(DST_PTR), VALUE) +# define LIBXSMM_ATOMIC_ADD_FETCH(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) + (VALUE)) +# define LIBXSMM_ATOMIC_ADD_FETCH16(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) + (VALUE)) +# define LIBXSMM_ATOMIC_ADD_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) + (VALUE)) /* ADD_FETCH*: result of FETCH_ADD* plus VALUE; VALUE is expanded twice, so avoid arguments with side effects */ +# define LIBXSMM_ATOMIC_SUB_FETCH(DST_PTR, VALUE, KIND) ((size_t)LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) - ((size_t)(VALUE))) /* VALUE is parenthesized so that the cast applies to the whole argument expression */ +# define LIBXSMM_ATOMIC_SUB_FETCH16(DST_PTR, VALUE, KIND) 
(LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) - (VALUE)) +# define LIBXSMM_ATOMIC_SUB_FETCH64(DST_PTR, VALUE, KIND) (LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) - (VALUE)) +# define LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, VALUE, KIND) InterlockedExchangeAdd((volatile LONG*)(DST_PTR), VALUE) +# define LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, VALUE, KIND) _InterlockedExchangeAdd16((volatile SHORT*)(DST_PTR), VALUE) +# define LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, VALUE, KIND) InterlockedExchangeAdd64((volatile LONGLONG*)(DST_PTR), VALUE) +# define LIBXSMM_ATOMIC_FETCH_SUB(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD(DST_PTR, -1 * (VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB16(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD16(DST_PTR, -1 * (VALUE), KIND) +# define LIBXSMM_ATOMIC_FETCH_SUB64(DST_PTR, VALUE, KIND) LIBXSMM_ATOMIC_FETCH_ADD64(DST_PTR, -1 * (VALUE), KIND) +# define LIBXSMM_ATOMIC_CMPSWP(DST_PTR, OLDVAL, NEWVAL, KIND) (((LONG)(OLDVAL)) == InterlockedCompareExchange((volatile LONG*)(DST_PTR), NEWVAL, OLDVAL)) +# define LIBXSMM_ATOMIC_CMPSWP8(DST_PTR, OLDVAL, NEWVAL, KIND) ((OLDVAL) == _InterlockedCompareExchange8((volatile char*)(DST_PTR), NEWVAL, OLDVAL)) +# if defined(LIBXSMM_ATOMIC_TRYLOCK_CMPSWP) +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_CMPSWP, 8)(DST_PTR, 0, 1, KIND) +# else +# define LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND) (0 == LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_OR, 8)(DST_PTR, 1, KIND)) +# endif +# define LIBXSMM_ATOMIC_ACQUIRE(DST_PTR, NPAUSE, KIND) \ + LIBXSMM_ASSERT(0 == LIBXSMM_MOD2((uintptr_t)(DST_PTR), 4)); \ + while (!LIBXSMM_ATOMIC_TRYLOCK(DST_PTR, KIND)) LIBXSMM_SYNC_CYCLE(DST_PTR, 0/*free*/, NPAUSE); \ + LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_ACQUIRE") +# define LIBXSMM_ATOMIC_RELEASE(DST_PTR, KIND) { \ + LIBXSMM_ASSERT_MSG(0 != *(DST_PTR), "LIBXSMM_ATOMIC_RELEASE"); \ + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, 8)(DST_PTR, KIND); } +# define LIBXSMM_ATOMIC_SYNC(KIND) _ReadWriteBarrier() +# 
if !defined(LIBXSMM_SYNC_NPAUSE) +# define LIBXSMM_SYNC_NPAUSE 4096 +# endif +#else /* no supported atomics were detected; consider permitting LIBXSMM_SYNC_NONE here instead of failing */ +# error LIBXSMM is missing atomic compiler builtins! +#endif + +#if !defined(LIBXSMM_SYNC_CYCLE) /* LIBXSMM_SYNC_CYCLE_ELSE: wait loop that executes LIBXSMM_SYNC_PAUSE a number of times per round (starting with one); the count doubles each round while it is below NPAUSE, afterwards it is set to NPAUSE and each round also runs LIBXSMM_SYNC_YIELD followed by the ELSE statement(s). Rounds repeat while the lowest bit of EXP_STATE differs from the lowest bit of *DST_PTR. If LIBXSMM_SYNC_NPAUSE is not positive, the macro is just a single LIBXSMM_SYNC_PAUSE. */ +# if (0 < LIBXSMM_SYNC_NPAUSE) +# define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) do { int libxsmm_sync_cycle_npause_ = 1; \ + do { int libxsmm_sync_cycle_counter_ = 0; \ + for (; libxsmm_sync_cycle_counter_ < libxsmm_sync_cycle_npause_; ++libxsmm_sync_cycle_counter_) LIBXSMM_SYNC_PAUSE; \ + if (libxsmm_sync_cycle_npause_ < (NPAUSE)) { \ + libxsmm_sync_cycle_npause_ *= 2; \ + } \ + else { \ + libxsmm_sync_cycle_npause_ = (NPAUSE); \ + LIBXSMM_SYNC_YIELD; \ + ELSE \ + } \ + } while(((EXP_STATE) & 1) != (*(DST_PTR) & 1)); \ + } while(0) +# else +# define LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, ELSE) LIBXSMM_SYNC_PAUSE +# endif +# define LIBXSMM_SYNC_CYCLE(DST_PTR, EXP_STATE, NPAUSE) \ + LIBXSMM_SYNC_CYCLE_ELSE(DST_PTR, EXP_STATE, NPAUSE, /*else: empty statement*/;) +#endif + +#if (0 != LIBXSMM_SYNC) +# define LIBXSMM_LOCK_DEFAULT LIBXSMM_LOCK_SPINLOCK +# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \ + (!defined(__linux__) || defined(__USE_XOPEN2K)) && 0/*disabled: the constant 0 makes this condition always false*/ +# define LIBXSMM_LOCK_SYSTEM_SPINLOCK +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) +# define LIBXSMM_LOCK_SYSTEM_MUTEX +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) && !(defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP)) && \ + (!defined(__linux__) || defined(__USE_XOPEN2K) || defined(__USE_UNIX98)) +# define LIBXSMM_LOCK_SYSTEM_RWLOCK +# endif + /* Generic lock interface (lock type, initialization, destruction, (try-)lock, unlock, etc.): each macro appends KIND to a fixed prefix via LIBXSMM_CONCATENATE and thereby selects the KIND-specific definition, e.g., LIBXSMM_LOCK_ACQUIRED_<KIND> */ +# define LIBXSMM_LOCK_ACQUIRED(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRED_, KIND) +# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISPOD_, KIND) +# define LIBXSMM_LOCK_TYPE_ISRW(KIND) 
LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_ISRW_, KIND) +# define LIBXSMM_LOCK_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TYPE_, KIND) +# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_INIT_, KIND)(LOCK, ATTR) +# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_DESTROY_, KIND)(LOCK) +# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYLOCK_, KIND)(LOCK) +# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQUIRE_, KIND)(LOCK) +# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELEASE_, KIND)(LOCK) +# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_TRYREAD_, KIND)(LOCK) +# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ACQREAD_, KIND)(LOCK) +# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_RELREAD_, KIND)(LOCK) + /* Attribute type, initialization, destruction */ +# define LIBXSMM_LOCK_ATTR_TYPE(KIND) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_TYPE_, KIND) +# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_INIT_, KIND)(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_CONCATENATE(LIBXSMM_LOCK_ATTR_DESTROY_, KIND)(ATTR) + /* Cygwin's Pthread implementation appears to be broken; use Win32 */ +# if !defined(LIBXSMM_WIN32_THREADS) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBXSMM_WIN32_THREADS _WIN32_WINNT +# if defined(__CYGWIN__) || defined(__MINGW32__) /* hack: make SRW-locks available */ +# if defined(_WIN32_WINNT) +# undef _WIN32_WINNT +# if !defined(NTDDI_VERSION) +# define NTDDI_VERSION 0x0600 +# endif +# define _WIN32_WINNT ((LIBXSMM_WIN32_THREADS) | 0x0600) +# else +# define _WIN32_WINNT 0x0600 +# endif +# endif +# endif +# if defined(LIBXSMM_WIN32_THREADS) +# define LIBXSMM_TLS_TYPE DWORD +# define LIBXSMM_TLS_CREATE(KEYPTR) *(KEYPTR) = TlsAlloc() +# define LIBXSMM_TLS_DESTROY(KEY) TlsFree(KEY) +# define 
LIBXSMM_TLS_SETVALUE(KEY, PTR) TlsSetValue(KEY, PTR) +# define LIBXSMM_TLS_GETVALUE(KEY) TlsGetValue(KEY) +# define LIBXSMM_LOCK_SPINLOCK spin +# if ((LIBXSMM_WIN32_THREADS) & 0x0600) +# define LIBXSMM_LOCK_MUTEX rwlock +# define LIBXSMM_LOCK_RWLOCK rwlock +# else /* mutex exposes high latency */ +# define LIBXSMM_LOCK_MUTEX mutex +# define LIBXSMM_LOCK_RWLOCK mutex +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) +# define LIBXSMM_LOCK_ACQUIRED_spin TRUE +# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 +# define LIBXSMM_LOCK_TYPE_spin CRITICAL_SECTION +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeCriticalSection(LOCK); } +# define LIBXSMM_LOCK_DESTROY_spin(LOCK) DeleteCriticalSection((LIBXSMM_LOCK_TYPE_spin*)(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) TryEnterCriticalSection(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) EnterCriticalSection(LOCK) +# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LeaveCriticalSection(LOCK) +# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) +# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) +# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_spin int +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX) +# define LIBXSMM_LOCK_ACQUIRED_mutex WAIT_OBJECT_0 +# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 +# define LIBXSMM_LOCK_TYPE_mutex HANDLE +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) (*(LOCK) = CreateMutex(*(ATTR), FALSE, NULL)) +# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) CloseHandle(*(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) WaitForSingleObject(*(LOCK), 0) +# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) WaitForSingleObject(*(LOCK), INFINITE) +# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) ReleaseMutex(*(LOCK)) +# define 
LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) +# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) +# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_mutex LPSECURITY_ATTRIBUTES +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = NULL) +# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) +# define LIBXSMM_LOCK_ACQUIRED_rwlock TRUE +# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1 +# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 +# define LIBXSMM_LOCK_TYPE_rwlock SRWLOCK +# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); InitializeSRWLock(LOCK); } +# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) TryAcquireSRWLockExclusive(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) AcquireSRWLockExclusive(LOCK) +# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) ReleaseSRWLockExclusive(LOCK) +# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) TryAcquireSRWLockShared(LOCK) +# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) AcquireSRWLockShared(LOCK) +# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) ReleaseSRWLockShared(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int +# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# define LIBXSMM_SYNC_YIELD YieldProcessor() +# else +# define LIBXSMM_TLS_TYPE pthread_key_t +# define LIBXSMM_TLS_CREATE(KEYPTR) pthread_key_create(KEYPTR, NULL) +# define LIBXSMM_TLS_DESTROY(KEY) pthread_key_delete(KEY) +# define LIBXSMM_TLS_SETVALUE(KEY, PTR) pthread_setspecific(KEY, PTR) +# define LIBXSMM_TLS_GETVALUE(KEY) pthread_getspecific(KEY) +# if defined(__APPLE__) && defined(__MACH__) +# define LIBXSMM_SYNC_YIELD pthread_yield_np() +# else +# if defined(__USE_GNU) || !defined(__BSD_VISIBLE) + LIBXSMM_EXTERN int pthread_yield(void) LIBXSMM_THROW; +# 
else + LIBXSMM_EXTERN void pthread_yield(void); +# endif +# define LIBXSMM_SYNC_YIELD pthread_yield() +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) && defined(__APPLE__) && defined(__MACH__) +# define LIBXSMM_LOCK_SPINLOCK mutex +# else +# define LIBXSMM_LOCK_SPINLOCK spin +# endif +# define LIBXSMM_LOCK_MUTEX mutex +# define LIBXSMM_LOCK_RWLOCK rwlock +# if defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) +# define LIBXSMM_LOCK_ACQUIRED_spin 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 +# define LIBXSMM_LOCK_TYPE_ISRW_spin 0 +# define LIBXSMM_LOCK_TYPE_spin pthread_spinlock_t +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_spin_init(LOCK, *(ATTR))) +# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_destroy(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) pthread_spin_trylock(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_lock(LOCK)) +# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_EXPECT(0, pthread_spin_unlock(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) +# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) +# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_spin int +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = 0) +# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_MUTEX) +# define LIBXSMM_LOCK_ACQUIRED_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 +# define LIBXSMM_LOCK_TYPE_mutex pthread_mutex_t +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_mutex_init(LOCK, ATTR)) +# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_EXPECT_DEBUG(0, pthread_mutex_destroy(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) pthread_mutex_trylock(LOCK) /*!LIBXSMM_EXPECT*/ +# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_lock(LOCK)) +# define 
LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_EXPECT(0, pthread_mutex_unlock(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) +# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) +# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_mutex pthread_mutexattr_t +#if !defined(__linux__) || defined(__USE_UNIX98) || defined(__USE_XOPEN2K8) +# if defined(_DEBUG) +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (LIBXSMM_EXPECT(0, pthread_mutexattr_init(ATTR)), \ + LIBXSMM_EXPECT(0, pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_ERRORCHECK))) +# else +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (pthread_mutexattr_init(ATTR), \ + pthread_mutexattr_settype(ATTR, PTHREAD_MUTEX_NORMAL)) +# endif +#else +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) pthread_mutexattr_init(ATTR) +#endif +# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_EXPECT(0, pthread_mutexattr_destroy(ATTR)) +# endif +# if defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) +# define LIBXSMM_LOCK_ACQUIRED_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 +# define LIBXSMM_LOCK_TYPE_rwlock pthread_rwlock_t +# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) LIBXSMM_EXPECT(0, pthread_rwlock_init(LOCK, ATTR)) +# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_destroy(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) pthread_rwlock_trywrlock(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_wrlock(LOCK)) +# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_unlock(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) pthread_rwlock_tryrdlock(LOCK) +# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_EXPECT(0, pthread_rwlock_rdlock(LOCK)) +# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock pthread_rwlockattr_t +# define 
LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_init(ATTR)) +# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_EXPECT(0, pthread_rwlockattr_destroy(ATTR)) +# endif +# endif +/* OpenMP based locks need to stay disabled unless both + * libxsmm and libxsmmext are built with OpenMP support. + */ +# if defined(_OPENMP) && defined(LIBXSMM_SYNC_OMP) +# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) +# define LIBXSMM_LOCK_ACQUIRED_spin 1 +# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 +# define LIBXSMM_LOCK_TYPE_ISRW_spin 0 +# define LIBXSMM_LOCK_TYPE_spin omp_lock_t +# define LIBXSMM_LOCK_DESTROY_spin(LOCK) omp_destroy_lock(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) omp_test_lock(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) omp_set_lock(LOCK) +# define LIBXSMM_LOCK_RELEASE_spin(LOCK) omp_unset_lock(LOCK) +# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) +# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) +# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) +# if (201811 <= _OPENMP/*v5.0*/) +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) +# define LIBXSMM_LOCK_ATTR_TYPE_spin omp_lock_hint_t +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) (*(ATTR) = omp_lock_hint_none) +# else +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } +# define LIBXSMM_LOCK_ATTR_TYPE_spin const void* +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) +# define LIBXSMM_LOCK_ACQUIRED_mutex 1 +# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 +# define LIBXSMM_LOCK_TYPE_mutex omp_lock_t +# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) omp_destroy_lock(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) omp_test_lock(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) 
omp_set_lock(LOCK) +# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) omp_unset_lock(LOCK) +# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) +# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) +# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) +# if (201811 <= _OPENMP/*v5.0*/) +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) +# define LIBXSMM_LOCK_ATTR_TYPE_mutex omp_lock_hint_t +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) (*(ATTR) = omp_lock_hint_none) +# else +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } +# define LIBXSMM_LOCK_ATTR_TYPE_mutex const void* +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) +# define LIBXSMM_LOCK_ACQUIRED_rwlock 1 +# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0 +# define LIBXSMM_LOCK_TYPE_rwlock omp_lock_t +# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) omp_destroy_lock(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) omp_test_lock(LOCK) +# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) omp_set_lock(LOCK) +# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) omp_unset_lock(LOCK) +# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) +# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) +# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) +# if (201811 <= _OPENMP/*v5.0*/) +# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) omp_init_lock_with_hint(LOCK, *(ATTR)) +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock omp_lock_hint_t +# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) (*(ATTR) = omp_lock_hint_none) +# else +# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); omp_init_lock(LOCK); } +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock const void* +# define 
LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# elif !defined(LIBXSMM_SYNC_NONE) /* based on atomic primitives */ +# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) +# define LIBXSMM_LOCK_ACQUIRED_spin 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_spin 1 +# define LIBXSMM_LOCK_TYPE_ISRW_spin 0 +# define LIBXSMM_LOCK_TYPE_spin volatile LIBXSMM_ATOMIC_LOCKTYPE +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } +# define LIBXSMM_LOCK_DESTROY_spin(LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) (LIBXSMM_LOCK_ACQUIRED_spin + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) +# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_LOCK_RELEASE_spin(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) +# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) +# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_spin int +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) +# define LIBXSMM_LOCK_ACQUIRED_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 1 +# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 +# define LIBXSMM_LOCK_TYPE_mutex volatile LIBXSMM_ATOMIC_LOCKTYPE +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } +# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) (LIBXSMM_LOCK_ACQUIRED_mutex + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) +# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) +# define 
LIBXSMM_LOCK_RELEASE_mutex(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) +# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) +# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_mutex int +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) +# define LIBXSMM_LOCK_ACQUIRED_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 1 +# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 0 +# define LIBXSMM_LOCK_TYPE_rwlock volatile LIBXSMM_ATOMIC_LOCKTYPE +# define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 0); } +# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) (LIBXSMM_LOCK_ACQUIRED_rwlock + !LIBXSMM_ATOMIC_TRYLOCK(LOCK, LIBXSMM_ATOMIC_RELAXED)) +# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) LIBXSMM_ATOMIC_ACQUIRE(LOCK, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) LIBXSMM_ATOMIC_RELEASE(LOCK, LIBXSMM_ATOMIC_RELAXED) +# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) +# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) +# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) LIBXSMM_LOCK_RELEASE_rwlock(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int +# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# else /* experimental */ +# if !defined(LIBXSMM_LOCK_SYSTEM_SPINLOCK) +# define LIBXSMM_LOCK_ACQUIRED_spin 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_spin 0 +# define LIBXSMM_LOCK_TYPE_ISRW_spin 0 +# define LIBXSMM_LOCK_TYPE_spin libxsmm_spinlock* +# define LIBXSMM_LOCK_INIT_spin(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = 
libxsmm_spinlock_create()); } +# define LIBXSMM_LOCK_DESTROY_spin(LOCK) libxsmm_spinlock_destroy(*(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_spin(LOCK) libxsmm_spinlock_trylock(*(LOCK)) +# define LIBXSMM_LOCK_ACQUIRE_spin(LOCK) libxsmm_spinlock_acquire(*(LOCK)) +# define LIBXSMM_LOCK_RELEASE_spin(LOCK) libxsmm_spinlock_release(*(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_spin(LOCK) LIBXSMM_LOCK_TRYLOCK_spin(LOCK) +# define LIBXSMM_LOCK_ACQREAD_spin(LOCK) LIBXSMM_LOCK_ACQUIRE_spin(LOCK) +# define LIBXSMM_LOCK_RELREAD_spin(LOCK) LIBXSMM_LOCK_RELEASE_spin(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_spin int +# define LIBXSMM_LOCK_ATTR_INIT_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_spin(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_MUTEX) +# define LIBXSMM_LOCK_ACQUIRED_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_mutex 0 +# define LIBXSMM_LOCK_TYPE_ISRW_mutex 0 +# define LIBXSMM_LOCK_TYPE_mutex libxsmm_mutex* +# define LIBXSMM_LOCK_INIT_mutex(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_mutex_create()); } +# define LIBXSMM_LOCK_DESTROY_mutex(LOCK) libxsmm_mutex_destroy(*(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) libxsmm_mutex_trylock(*(LOCK)) +# define LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) libxsmm_mutex_acquire(*(LOCK)) +# define LIBXSMM_LOCK_RELEASE_mutex(LOCK) libxsmm_mutex_release(*(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_mutex(LOCK) LIBXSMM_LOCK_TRYLOCK_mutex(LOCK) +# define LIBXSMM_LOCK_ACQREAD_mutex(LOCK) LIBXSMM_LOCK_ACQUIRE_mutex(LOCK) +# define LIBXSMM_LOCK_RELREAD_mutex(LOCK) LIBXSMM_LOCK_RELEASE_mutex(LOCK) +# define LIBXSMM_LOCK_ATTR_TYPE_mutex int +# define LIBXSMM_LOCK_ATTR_INIT_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_mutex(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# if !defined(LIBXSMM_LOCK_SYSTEM_RWLOCK) +# define LIBXSMM_LOCK_ACQUIRED_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISPOD_rwlock 0 +# define LIBXSMM_LOCK_TYPE_ISRW_rwlock 1 +# define LIBXSMM_LOCK_TYPE_rwlock libxsmm_rwlock* +# 
define LIBXSMM_LOCK_INIT_rwlock(LOCK, ATTR) { LIBXSMM_UNUSED(ATTR); (*(LOCK) = libxsmm_rwlock_create()); } +# define LIBXSMM_LOCK_DESTROY_rwlock(LOCK) libxsmm_rwlock_destroy(*(LOCK)) +# define LIBXSMM_LOCK_TRYLOCK_rwlock(LOCK) libxsmm_rwlock_trylock(*(LOCK)) +# define LIBXSMM_LOCK_ACQUIRE_rwlock(LOCK) libxsmm_rwlock_acquire(*(LOCK)) +# define LIBXSMM_LOCK_RELEASE_rwlock(LOCK) libxsmm_rwlock_release(*(LOCK)) +# define LIBXSMM_LOCK_TRYREAD_rwlock(LOCK) libxsmm_rwlock_tryread(*(LOCK)) +# define LIBXSMM_LOCK_ACQREAD_rwlock(LOCK) libxsmm_rwlock_acqread(*(LOCK)) +# define LIBXSMM_LOCK_RELREAD_rwlock(LOCK) libxsmm_rwlock_relread(*(LOCK)) +# define LIBXSMM_LOCK_ATTR_TYPE_rwlock int +# define LIBXSMM_LOCK_ATTR_INIT_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY_rwlock(ATTR) LIBXSMM_UNUSED(ATTR) +# endif +# endif +#else /* no synchronization */ +# define LIBXSMM_SYNC_YIELD LIBXSMM_SYNC_PAUSE +# define LIBXSMM_LOCK_SPINLOCK spinlock_dummy +# define LIBXSMM_LOCK_MUTEX mutex_dummy +# define LIBXSMM_LOCK_RWLOCK rwlock_dummy +# define LIBXSMM_LOCK_ACQUIRED(KIND) 0 +# define LIBXSMM_LOCK_TYPE_ISPOD(KIND) 1 +# define LIBXSMM_LOCK_TYPE_ISRW(KIND) 0 +# define LIBXSMM_LOCK_ATTR_TYPE(KIND) int +# define LIBXSMM_LOCK_ATTR_INIT(KIND, ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_ATTR_DESTROY(KIND, ATTR) LIBXSMM_UNUSED(ATTR) +# define LIBXSMM_LOCK_TYPE(KIND) int +# define LIBXSMM_LOCK_INIT(KIND, LOCK, ATTR) { LIBXSMM_UNUSED(LOCK); LIBXSMM_UNUSED(ATTR); } +# define LIBXSMM_LOCK_DESTROY(KIND, LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) LIBXSMM_LOCK_ACQUIRED(KIND) +# define LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_RELEASE(KIND, LOCK) LIBXSMM_UNUSED(LOCK) +# define LIBXSMM_LOCK_TRYREAD(KIND, LOCK) LIBXSMM_LOCK_TRYLOCK(KIND, LOCK) +# define LIBXSMM_LOCK_ACQREAD(KIND, LOCK) LIBXSMM_LOCK_ACQUIRE(KIND, LOCK) +# define LIBXSMM_LOCK_RELREAD(KIND, LOCK) LIBXSMM_LOCK_RELEASE(KIND, LOCK) +#endif + +#if (0 == 
LIBXSMM_SYNC) +# define LIBXSMM_FLOCK(FILE) +# define LIBXSMM_FUNLOCK(FILE) +#elif defined(_WIN32) +# define LIBXSMM_FLOCK(FILE) _lock_file(FILE) +# define LIBXSMM_FUNLOCK(FILE) _unlock_file(FILE) +#else +# if !defined(__CYGWIN__) +# define LIBXSMM_FLOCK(FILE) flockfile(FILE) +# define LIBXSMM_FUNLOCK(FILE) funlockfile(FILE) + LIBXSMM_EXTERN void flockfile(FILE*) LIBXSMM_THROW; + LIBXSMM_EXTERN void funlockfile(FILE*) LIBXSMM_THROW; +# else /* Only available with __CYGWIN__ *and* C++0x. */ +# define LIBXSMM_FLOCK(FILE) +# define LIBXSMM_FUNLOCK(FILE) +# endif +#endif + +/** Synchronize console output: ACQUIRE locks stdout and then stderr, RELEASE unlocks them in reverse order. Each macro expands to one do-while statement (terminate it with a semicolon), so both stream operations stay together under an unbraced if/else. */ +#define LIBXSMM_STDIO_ACQUIRE() do { LIBXSMM_FLOCK(stdout); LIBXSMM_FLOCK(stderr); } while(0) +#define LIBXSMM_STDIO_RELEASE() do { LIBXSMM_FUNLOCK(stderr); LIBXSMM_FUNLOCK(stdout); } while(0) + + +/** Opaque type which represents a barrier. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_barrier libxsmm_barrier; + +/** Create barrier from one of the threads. */ +LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core); +/** Initialize the barrier from each thread of the team. */ +LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid); +/** Wait for the entire team to arrive. */ +LIBXSMM_API void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid); +/** Destroy the resources associated with this barrier. */ +LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier); +/** DEPRECATED: use libxsmm_barrier_destroy instead. */ +#define libxsmm_barrier_release libxsmm_barrier_destroy + +/** Spin-lock, which may differ from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_SPINLOCK). 
*/ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_spinlock libxsmm_spinlock; +LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void); +LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock); +LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock); +LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock); +LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock); + +/** Mutual-exclusive lock (Mutex), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_MUTEX). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mutex libxsmm_mutex; +LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void); +LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex); +LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex); +LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex); +LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex); + +/** Reader-Writer lock (RW-lock), which eventually differs from LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK_RWLOCK). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_rwlock libxsmm_rwlock; +LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void); +LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock); +LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock); +LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock); +LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock); +LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock); +LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock); +LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock); + +/** Utility function to receive the process ID of the calling process. */ +LIBXSMM_API unsigned int libxsmm_get_pid(void); +/** + * Utility function to receive a Thread-ID (TID) for the calling thread. + * The TID is not related to a specific threading runtime. TID=0 may not + * represent the main thread. 
TIDs are zero-based and consecutive numbers. + */ +LIBXSMM_API unsigned int libxsmm_get_tid(void); + +#endif /*LIBXSMM_SYNC_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_timer.h b/third_party/libxsmm/include/libxsmm_timer.h new file mode 100644 index 00000000..dcd9cb04 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_timer.h @@ -0,0 +1,41 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_TIMER_H +#define LIBXSMM_TIMER_H + +#include "libxsmm_macros.h" + + +typedef unsigned long long libxsmm_timer_tickint; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_timer_info { + int tsc; +} libxsmm_timer_info; + + +/** Query timer properties. */ +LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info); + +/** + * Returns the current clock tick of a monotonic timer source with + * platform-specific resolution (not necessarily CPU cycles). + */ +LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void); + +/** Returns the difference between two timer ticks (cycles); avoids potential side-effects/assumptions of LIBXSMM_DIFF. */ +LIBXSMM_API_INLINE libxsmm_timer_tickint libxsmm_timer_ncycles(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) { + return LIBXSMM_DELTA(tick0, tick1); +} + +/** Returns the duration (in seconds) between two values received by libxsmm_timer_tick. 
*/ +LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); + +#endif /*LIBXSMM_TIMER_H*/ diff --git a/third_party/libxsmm/include/libxsmm_typedefs.h b/third_party/libxsmm/include/libxsmm_typedefs.h new file mode 100644 index 00000000..dc2405c9 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_typedefs.h @@ -0,0 +1,878 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_TYPEDEFS_H +#define LIBXSMM_TYPEDEFS_H + +#include "libxsmm_macros.h" + +/** Check ILP64 configuration for sanity. */ +#if !defined(LIBXSMM_ILP64) || (0 == LIBXSMM_ILP64 && defined(MKL_ILP64)) +# error "Inconsistent ILP64 configuration detected!" +#elif (0 != LIBXSMM_ILP64 && !defined(MKL_ILP64)) +# define MKL_ILP64 +#endif +#if (0 != LIBXSMM_ILP64) +# define LIBXSMM_BLASINT_NBITS 64 +# define LIBXSMM_BLASINT long long +#else /* LP64 */ +# define LIBXSMM_BLASINT_NBITS 32 +# define LIBXSMM_BLASINT int +#endif + +/** Generic prefetches; similar to LIBXSMM_PREFETCH_AUTO (libxsmm_frontend.h) */ +#define LIBXSMM_PREFETCH_SIGONLY 1 +#define LIBXSMM_PREFETCH_NONE 0 + +/** Helper macro for type names. 
*/ +#define LIBXSMM_TYPENAME(TYPE) LIBXSMM_STRINGIFY(LIBXSMM_CONCATENATE(LIBXSMM_TYPENAME_, TYPE)) +#define LIBXSMM_TYPENAME_double f64 +#define LIBXSMM_TYPENAME_float f32 +#define LIBXSMM_TYPENAME_libxsmm_bfloat16 bf16 +#define LIBXSMM_TYPENAME_libxsmm_float16 f16 +#define LIBXSMM_TYPENAME_int i32 +#define LIBXSMM_TYPENAME_short i16 +#define LIBXSMM_TYPENAME_char i8 + +/** Helper macro for type information: INFO := { FP }; FP yields 1 for the floating-point types and 0 for the integer types listed below. */ +#define LIBXSMM_TYPEINFO(TYPE, INFO) LIBXSMM_CONCATENATE4(LIBXSMM_TYPEINFO_, INFO, _, TYPE) +#define LIBXSMM_TYPEINFO_FP_double 1 +#define LIBXSMM_TYPEINFO_FP_float 1 +#define LIBXSMM_TYPEINFO_FP_libxsmm_bfloat16 1 +#define LIBXSMM_TYPEINFO_FP_libxsmm_float16 1 +#define LIBXSMM_TYPEINFO_FP_int 0 +#define LIBXSMM_TYPEINFO_FP_short 0 +#define LIBXSMM_TYPEINFO_FP_char 0 + +/** Helper macro for type postfixes (upper-case symbols such as F64 or I8, which are also pasted into enumerator names). */ +#define LIBXSMM_TYPESYMBOL(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_TYPESYMBOL_, TYPE) +#define LIBXSMM_TYPESYMBOL_double F64 +#define LIBXSMM_TYPESYMBOL_float F32 +#define LIBXSMM_TYPESYMBOL_libxsmm_bfloat16 BF16 +#define LIBXSMM_TYPESYMBOL_libxsmm_float16 F16 +#define LIBXSMM_TYPESYMBOL_int I32 +#define LIBXSMM_TYPESYMBOL_short I16 +#define LIBXSMM_TYPESYMBOL_char I8 +/** Size in bytes of the element type denoted by a libxsmm_datatype enumerator (0 if invalid); ENUM is evaluated multiple times, so it should be free of side effects. */ +#define LIBXSMM_TYPESIZE(ENUM) ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_F64 ? 8 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_F32 ? 4 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_BF16 ? 2 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_F16 ? 2 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_I64 ? 8 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_I32 ? 4 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_I16 ? 2 : ( \ + ((int)(ENUM)) == LIBXSMM_DATATYPE_I8 ? 1 : ( \ + 0/*invalid*/))))))))) + +/* Get input precision (low 4 bits) or output precision (bits above the low 4; falls back to the input precision if those bits are zero) */ +#define LIBXSMM_GETENUM_INP(SRC) ((SRC) & 0x0F) +#define LIBXSMM_GETENUM_OUT(SRC) (0 == ((SRC) >> 4) ? LIBXSMM_GETENUM_INP(SRC) : ((SRC) >> 4)) +/* Get/Set input and output precision: OUT is stored shifted left by 4 unless it equals INP, in which case only INP is stored */ +#define LIBXSMM_GETENUM(INP, OUT) (((INP) == (OUT)) ? 
(INP) : ((INP) | ((OUT) << 4))) +#define LIBXSMM_SETENUM(DST, INP, OUT) DST = LIBXSMM_GETENUM(INP, OUT) + +/* Construct an enumerator (libxsmm_datatype) from a built-in type (float, double, etc.). */ +#define LIBXSMM_DATATYPE(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_DATATYPE_, LIBXSMM_TYPESYMBOL(TYPE)) +/* Construct a type-id from built-in input/output types (float, double, etc.). */ +#define LIBXSMM_DATATYPE2(ITYPE, OTYPE) LIBXSMM_GETENUM(LIBXSMM_DATATYPE(ITYPE), LIBXSMM_DATATYPE(OTYPE)) + +/* Construct an enumerator (libxsmm_gemm_precision) from a built-in type (float, double, etc.). */ +#define LIBXSMM_GEMM_PRECISION(TYPE) LIBXSMM_CONCATENATE(LIBXSMM_GEMM_PRECISION_, LIBXSMM_TYPESYMBOL(TYPE)) +/* Construct GEMM-precision from built-in input/output types (float, double, etc.). */ +#define LIBXSMM_GEMM_PRECISION2(ITYPE, OTYPE) (libxsmm_gemm_precision)LIBXSMM_GETENUM( \ + LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(OTYPE)) + +/** Maximum size available to store a descriptor/blob (GEMM, MCOPY, TRANS, TRSM, TRMM); defaults to 96 unless predefined. */ +#if !defined(LIBXSMM_DESCRIPTOR_MAXSIZE) +# define LIBXSMM_DESCRIPTOR_MAXSIZE 96 +#endif +/** Size of the descriptor considered as unique/small signature; unless predefined it is 64 if LIBXSMM_UNPACKED is defined and 32 otherwise. */ +#if !defined(LIBXSMM_DESCRIPTOR_SIGSIZE) +# if defined(LIBXSMM_UNPACKED) +# define LIBXSMM_DESCRIPTOR_SIGSIZE 64 +# else +# define LIBXSMM_DESCRIPTOR_SIGSIZE 32 +# endif +#endif + + +/* Support for Bfloat16 and Float16 (both are plain typedefs of unsigned short) */ +typedef unsigned short libxsmm_bfloat16; +typedef unsigned short libxsmm_float16; +/** Union overlaying one float with two libxsmm_bfloat16 values. */ +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_bfloat16_hp { + libxsmm_bfloat16 i[2]; + float f; +} libxsmm_bfloat16_hp; + +#if defined(__cplusplus) +namespace Eigen { struct bfloat16; } +#endif /*__cplusplus*/ + +/** Integer type for LAPACK/BLAS (LP64: 32-bit, and ILP64: 64-bit). */ +typedef LIBXSMM_BLASINT libxsmm_blasint; + +/** Type representing sufficient storage space for a GEMM handle. 
*/ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_blob { char data[128]; } libxsmm_gemm_blob; +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_handle libxsmm_gemm_handle; + +/** Type representing sufficient storage space for descriptors (GEMM, TCOPY, MCOPY). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_descriptor_blob { + char data[LIBXSMM_DESCRIPTOR_MAXSIZE]; +} libxsmm_descriptor_blob; + +/** Structure storing arguments of GEMM-like routines. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_gemm_descriptor libxsmm_gemm_descriptor; +/** Structure storing arguments of the matrix-eltw routine. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_descriptor libxsmm_meltw_descriptor; +/** Structure storing arguments of the matrix-equation routine. */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meqn_descriptor libxsmm_meqn_descriptor; + +/** Enumerates element/data types. */ +typedef enum libxsmm_datatype { + LIBXSMM_DATATYPE_F64, + LIBXSMM_DATATYPE_F32, + LIBXSMM_DATATYPE_BF16, + LIBXSMM_DATATYPE_F16, + LIBXSMM_DATATYPE_I64, + LIBXSMM_DATATYPE_I32, + LIBXSMM_DATATYPE_I16, + LIBXSMM_DATATYPE_I8, + LIBXSMM_DATATYPE_UNSUPPORTED +} libxsmm_datatype; + +/** Denotes the precision/data type of GEMM. 
*/ +typedef enum libxsmm_gemm_precision { + LIBXSMM_GEMM_PRECISION_F64 = LIBXSMM_DATATYPE_F64, + LIBXSMM_GEMM_PRECISION_F32 = LIBXSMM_DATATYPE_F32, + LIBXSMM_GEMM_PRECISION_BF16 = LIBXSMM_DATATYPE_BF16, + LIBXSMM_GEMM_PRECISION_F16 = LIBXSMM_DATATYPE_F16, + LIBXSMM_GEMM_PRECISION_I32 = LIBXSMM_DATATYPE_I32, + LIBXSMM_GEMM_PRECISION_I16 = LIBXSMM_DATATYPE_I16, + LIBXSMM_GEMM_PRECISION_I8 = LIBXSMM_DATATYPE_I8 +} libxsmm_gemm_precision; + +typedef enum libxsmm_meltw_operation { + LIBXSMM_MELTW_OPERATION_NONE = 0, + /* for fusion into AMX GEMM */ + LIBXSMM_MELTW_OPERATION_CVTFP32BF16 = 1, + LIBXSMM_MELTW_OPERATION_CVTFP32BF16_ACT = 2, + LIBXSMM_MELTW_OPERATION_ACT_CVTFP32BF16 = 3, + LIBXSMM_MELTW_OPERATION_COLBIAS_ACT = 4, + LIBXSMM_MELTW_OPERATION_DECOMPRESS_A = 5, + LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A = 6, + LIBXSMM_MELTW_OPERATION_TRANSFORM_B_NORM_TO_NORMT_EXT_BUFFER = 7, + LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_TRANSFORM_B_NORM_TO_NORMT_EXT_BUFFER = 8, + LIBXSMM_MELTW_OPERATION_TRANSFORM_C_NORM_TO_VNNI_EXT_BUFFER = 9, + LIBXSMM_MELTW_OPERATION_ACT_TRANSFORM_C_NORM_TO_VNNI_EXT_BUFFER = 10, + /* standalone TPPs */ + LIBXSMM_MELTW_OPERATION_REDUCE = 11, /* to be removed */ + LIBXSMM_MELTW_OPERATION_REDUCE_COLS_IDX = 12, + LIBXSMM_MELTW_OPERATION_OPREDUCE_VECS_IDX = 13, + LIBXSMM_MELTW_OPERATION_UNARY = 14, + LIBXSMM_MELTW_OPERATION_BINARY = 15, + LIBXSMM_MELTW_OPERATION_TERNARY = 16 +} libxsmm_meltw_operation; + +typedef enum libxsmm_meltw_null_flags { + LIBXSMM_MELTW_FLAG_NONE = 0 +} libxsmm_meltw_null_flags; + +typedef enum libxsmm_meltw_redu_flags { + LIBXSMM_MELTW_FLAG_REDUCE_NONE = 0, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD = 1, + LIBXSMM_MELTW_FLAG_REDUCE_OP_MAX = 2, + LIBXSMM_MELTW_FLAG_REDUCE_OP_MUL = 4, + LIBXSMM_MELTW_FLAG_REDUCE_ROWS = 8, + LIBXSMM_MELTW_FLAG_REDUCE_COLS = 16, + LIBXSMM_MELTW_FLAG_REDUCE_ELTS = 32, + LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED = 64, + LIBXSMM_MELTW_FLAG_REDUCE_NCNC_FORMAT = 128, + LIBXSMM_MELTW_FLAG_REDUCE_COLS_IDX_XOR_ACC 
= 256, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS_ELTS_SQUARED = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED , + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS_ELTS_ELTS_SQUARED = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED , + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS_ELTS_ELTS_SQUARED = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS_SQUARED , + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_ROWS_ELTS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_ROWS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS_ELTS = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS, + LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD_COLS_ELTS_NCNC_FORMAT = LIBXSMM_MELTW_FLAG_REDUCE_OP_ADD | LIBXSMM_MELTW_FLAG_REDUCE_COLS | LIBXSMM_MELTW_FLAG_REDUCE_ELTS | LIBXSMM_MELTW_FLAG_REDUCE_NCNC_FORMAT +} libxsmm_meltw_redu_flags; + +typedef enum libxsmm_meltw_relu_flags { + LIBXSMM_MELTW_FLAG_RELU_NONE = 0, + LIBXSMM_MELTW_FLAG_RELU_FWD = 1, + LIBXSMM_MELTW_FLAG_RELU_BWD = 2, + LIBXSMM_MELTW_FLAG_RELU_BITMASK = 4, + LIBXSMM_MELTW_FLAG_RELU_FWD_BITMASK = LIBXSMM_MELTW_FLAG_RELU_FWD | LIBXSMM_MELTW_FLAG_RELU_BITMASK, + LIBXSMM_MELTW_FLAG_RELU_BWD_BITMASK = LIBXSMM_MELTW_FLAG_RELU_BWD | LIBXSMM_MELTW_FLAG_RELU_BITMASK +} libxsmm_meltw_relu_flags; + +typedef enum libxsmm_meltw_cvt_flags { + LIBXSMM_MELTW_FLAG_CVT_NONE = 0, + LIBXSMM_MELTW_FLAG_CVT_VNNI_FORMAT = 1 +} libxsmm_meltw_cvt_flags; + +typedef enum libxsmm_meltw_cvta_flags { + LIBXSMM_MELTW_FLAG_CVTA_NONE = 0, + LIBXSMM_MELTW_FLAG_CVTA_FUSE_RELU = 
1, + LIBXSMM_MELTW_FLAG_CVTA_FUSE_TANH = 2, + LIBXSMM_MELTW_FLAG_CVTA_FUSE_SIGM = 4 +} libxsmm_meltw_cvta_flags; + +typedef enum libxsmm_meltw_acvt_flags { + LIBXSMM_MELTW_FLAG_ACVT_NONE = 0, + LIBXSMM_MELTW_FLAG_ACVT_FUSE_TANH = 1, + LIBXSMM_MELTW_FLAG_ACVT_FUSE_SIGM = 2 +} libxsmm_meltw_acvt_flags; + +typedef enum libxsmm_meltw_flags { + LIBXSMM_MELTW_FLAG_FUSE_NONE = 0, + LIBXSMM_MELTW_FLAG_COLBIAS = 1, + LIBXSMM_MELTW_FLAG_ACT_RELU = 2, + LIBXSMM_MELTW_FLAG_ACT_TANH = 4, + LIBXSMM_MELTW_FLAG_ACT_SIGM = 8, + LIBXSMM_MELTW_FLAG_ACT_GELU = 16, + LIBXSMM_MELTW_FLAG_OVERWRITE_C = 32, + LIBXSMM_MELTW_FLAG_ACT_RELU_BWD = 64, + LIBXSMM_MELTW_FLAG_COLBIAS_OVERWRITE_C = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_ACT_RELU_OVERWRITE_C = LIBXSMM_MELTW_FLAG_ACT_RELU | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_ACT_TANH_OVERWRITE_C = LIBXSMM_MELTW_FLAG_ACT_TANH | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_ACT_SIGM_OVERWRITE_C = LIBXSMM_MELTW_FLAG_ACT_SIGM | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_ACT_GELU_OVERWRITE_C = LIBXSMM_MELTW_FLAG_ACT_GELU | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_ACT_RELU_BWD_OVERWRITE_C = LIBXSMM_MELTW_FLAG_ACT_RELU_BWD | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_RELU = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_RELU, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_TANH = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_TANH, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_SIGM = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_SIGM, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_GELU = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_GELU, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_RELU_OVERWRITE_C = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_RELU | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_TANH_OVERWRITE_C = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_TANH | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_SIGM_OVERWRITE_C 
= LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_SIGM | LIBXSMM_MELTW_FLAG_OVERWRITE_C, + LIBXSMM_MELTW_FLAG_COLBIAS_ACT_GELU_OVERWRITE_C = LIBXSMM_MELTW_FLAG_COLBIAS | LIBXSMM_MELTW_FLAG_ACT_GELU | LIBXSMM_MELTW_FLAG_OVERWRITE_C +} libxsmm_meltw_flags; + +typedef enum libxsmm_meltw_opreduce_vecs_flags { + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_NONE = 0, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIN_VECIDX = 1, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN = 2, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY = 4, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_ADD = 8, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_SUB = 16, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_MUL = 32, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_DIV = 64, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_DOT = 128, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_SCALE_OP_RESULT = 256, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_NONE = 512, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM = 1024, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX = 2048, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN = 4096, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_INDEXED_VEC = 8192, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VEC = 16384, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VECIDX = 32768, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0 = 65536, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1 = 131072, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY_REDOP_SUM = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY | LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_MUL_REDOP_SUM = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_MUL | LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY_REDOP_MAX = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY | LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX, + LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY_REDOP_MIN = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY | LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN +} libxsmm_meltw_opreduce_vecs_flags; + +typedef enum 
libxsmm_meltw_unary_flags { + LIBXSMM_MELTW_FLAG_UNARY_NONE = 0, + LIBXSMM_MELTW_FLAG_UNARY_BITMASK = 1, + LIBXSMM_MELTW_FLAG_UNARY_BCAST_ROW = 2, + LIBXSMM_MELTW_FLAG_UNARY_BCAST_COL = 4, + LIBXSMM_MELTW_FLAG_UNARY_BCAST_SCALAR = 8, + LIBXSMM_MELTW_FLAG_UNARY_REDUCE_COLS = 16, + LIBXSMM_MELTW_FLAG_UNARY_REDUCE_ROWS = 32 +} libxsmm_meltw_unary_flags; + +typedef enum libxsmm_meltw_unary_type { + LIBXSMM_MELTW_TYPE_UNARY_NONE = 0, + LIBXSMM_MELTW_TYPE_UNARY_IDENTITY = 1, /* this is copy */ + LIBXSMM_MELTW_TYPE_UNARY_XOR = 2, /* this is zero */ + LIBXSMM_MELTW_TYPE_UNARY_X2 = 3, + LIBXSMM_MELTW_TYPE_UNARY_SQRT = 4, + LIBXSMM_MELTW_TYPE_UNARY_RELU = 5, + LIBXSMM_MELTW_TYPE_UNARY_RELU_INV = 6, + LIBXSMM_MELTW_TYPE_UNARY_TANH = 7, + LIBXSMM_MELTW_TYPE_UNARY_TANH_INV = 8, + LIBXSMM_MELTW_TYPE_UNARY_SIGMOID = 9, + LIBXSMM_MELTW_TYPE_UNARY_SIGMOID_INV = 10, + LIBXSMM_MELTW_TYPE_UNARY_GELU = 11, + LIBXSMM_MELTW_TYPE_UNARY_GELU_INV = 12, + LIBXSMM_MELTW_TYPE_UNARY_NEGATE = 13, + LIBXSMM_MELTW_TYPE_UNARY_INC = 14, + LIBXSMM_MELTW_TYPE_UNARY_RECIPROCAL = 15, + LIBXSMM_MELTW_TYPE_UNARY_RECIPROCAL_SQRT = 16, + LIBXSMM_MELTW_TYPE_UNARY_EXP = 17, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_ADD = 18, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X2_OP_ADD = 19, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_X2_OP_ADD = 20, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_MAX = 21, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_MUL = 22, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_ADD_NCNC_FORMAT = 23, + LIBXSMM_MELTW_TYPE_UNARY_REDUCE_TO_SCALAR_OP_ADD = 24, + LIBXSMM_MELTW_TYPE_UNARY_DROPOUT = 25, + LIBXSMM_MELTW_TYPE_UNARY_DROPOUT_INV = 26, + LIBXSMM_MELTW_TYPE_UNARY_REPLICATE_COL_VAR = 27, + LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNI = 28, + LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT = 29, + LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_VNNI_TO_VNNIT = 30, + LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNIT = 31, + LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNI_PAD = 32, + LIBXSMM_MELTW_TYPE_UNARY_UNPACK_TO_BLOCKS = 33, + 
LIBXSMM_MELTW_TYPE_UNARY_LEAKY_RELU = 34, + LIBXSMM_MELTW_TYPE_UNARY_LEAKY_RELU_INV = 35, + LIBXSMM_MELTW_TYPE_UNARY_ELU = 36, + LIBXSMM_MELTW_TYPE_UNARY_ELU_INV = 37, + LIBXSMM_MELTW_TYPE_UNARY_STOCHASTIC_ROUND = 38 +} libxsmm_meltw_unary_type; + +typedef enum libxsmm_meltw_binary_flags { + LIBXSMM_MELTW_FLAG_BINARY_NONE = 0, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_ROW_IN_0 = 1, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_ROW_IN_1 = 2, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_COL_IN_0 = 4, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_COL_IN_1 = 8, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_SCALAR_IN_0 = 16, + LIBXSMM_MELTW_FLAG_BINARY_BCAST_SCALAR_IN_1 = 32 +} libxsmm_meltw_binary_flags; + +typedef enum libxsmm_meltw_binary_type { + LIBXSMM_MELTW_TYPE_BINARY_NONE = 0, + LIBXSMM_MELTW_TYPE_BINARY_ADD = 1, + LIBXSMM_MELTW_TYPE_BINARY_MUL = 2, + LIBXSMM_MELTW_TYPE_BINARY_SUB = 3, + LIBXSMM_MELTW_TYPE_BINARY_DIV = 4, + LIBXSMM_MELTW_TYPE_BINARY_MULADD = 5, + LIBXSMM_MELTW_TYPE_BINARY_MATMUL = 6, + LIBXSMM_MELTW_TYPE_BINARY_MUL_AND_REDUCE_TO_SCALAR_OP_ADD = 7, + LIBXSMM_MELTW_TYPE_BINARY_PACK = 8 +} libxsmm_meltw_binary_type; + +typedef enum libxsmm_meltw_ternary_flags { + LIBXSMM_MELTW_FLAG_TERNARY_NONE = 0, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_0 = 1, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_1 = 2, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_2 = 4, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_0 = 8, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_1 = 16, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_2 = 32, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_0 = 64, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_1 = 128, + LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_2 = 256, + LIBXSMM_MELTW_FLAG_TERNARY_REUSE_IN_2_AS_OUT = 512 +} libxsmm_meltw_ternary_flags; + +typedef enum libxsmm_meltw_ternary_type { + LIBXSMM_MELTW_TYPE_TERNARY_NONE = 0, + LIBXSMM_MELTW_TYPE_TERNARY_MULADD = 1, + LIBXSMM_MELTW_TYPE_TERNARY_MATMUL = 2, + LIBXSMM_MELTW_TYPE_TERNARY_BLEND = 3, + LIBXSMM_MELTW_TYPE_TERNARY_NMULADD = 4 +} 
libxsmm_meltw_ternary_type; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmelt_flags { + libxsmm_meltw_null_flags elt_null; + libxsmm_meltw_opreduce_vecs_flags elt_opredvecs; + libxsmm_meltw_relu_flags elt_relu; + libxsmm_meltw_cvta_flags elt_cvta; + libxsmm_meltw_cvt_flags elt_cvt; + libxsmm_meltw_acvt_flags elt_acvt; + libxsmm_meltw_flags elt_meltwfused; +} libxsmm_xmelt_flags; + +/** Flag enumeration which can be binary ORed. */ +typedef enum libxsmm_gemm_flags { + LIBXSMM_GEMM_FLAG_NONE = 0, + /** Transpose matrix A. */ + LIBXSMM_GEMM_FLAG_TRANS_A = 1, + /** Transpose matrix B. */ + LIBXSMM_GEMM_FLAG_TRANS_B = 2, + /** Transpose matrix A and B. */ + LIBXSMM_GEMM_FLAG_TRANS_AB = LIBXSMM_GEMM_FLAG_TRANS_A | LIBXSMM_GEMM_FLAG_TRANS_B, +#if 0 + /** Alpha=0|1 */ + LIBXSMM_GEMM_FLAG_ALPHA_0 = 4, + /** Alpha=neg|pos */ + LIBXSMM_GEMM_FLAG_ALPHA_S = 8, +#endif + /** Beta=0|1 */ + LIBXSMM_GEMM_FLAG_BETA_0 = 16, +#if 0 + /** Beta=neg|pos */ + LIBXSMM_GEMM_FLAG_BETA_S = 32, +#endif + /** Generate aligned load instructions. */ + LIBXSMM_GEMM_FLAG_ALIGN_A = 64, + /** Aligned load/store instructions. */ + LIBXSMM_GEMM_FLAG_ALIGN_C = 128, + /** AMX hint to avoid tileconfig/release, it's negated bits, so that 0 is default "on" */ + /** NOTE: the values 4 and 8 used by the two AMX hints are those of the disabled (if 0) ALPHA_0/ALPHA_S flags above. */ + LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG = 4, + LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG = 8, + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS = 256, /* Batch-reduce Ai * Bi. */ + /** Batch-reduce Ai * Bi. */ + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET = 512, + /** Batch-reduce Ai * Bi. 
*/ + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE = 1024, + /** Aligned C matrix, but using NTS Hint when storing (2176 = 2048 | 128, i.e. the value includes LIBXSMM_GEMM_FLAG_ALIGN_C) */ + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT = 2176, + /* in case of integer GEMM, if A is unsigned */ + LIBXSMM_GEMM_FLAG_A_UNSIGNED = 4096, + /* in case of integer GEMM, if B is unsigned */ + LIBXSMM_GEMM_FLAG_B_UNSIGNED = 8192, + /* in case of integer GEMM, if C is unsigned */ + LIBXSMM_GEMM_FLAG_C_UNSIGNED = 16384, + /* in case of integer GEMM, if A and B are unsigned */ + LIBXSMM_GEMM_FLAG_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + /* for low precision we also require up-front packed formats "VNNI" for best performance, this flag indicates A */ + LIBXSMM_GEMM_FLAG_VNNI_A = 32768, + /* for low precision we also require up-front packed formats "VNNI" for best performance, this flag indicates B */ + LIBXSMM_GEMM_FLAG_VNNI_B = 65536, + /* for low precision we also require post packed formats "VNNI" for best performance, this flag indicates C */ + LIBXSMM_GEMM_FLAG_VNNI_C = 131072, + /* combined types (each is the binary OR of the flags named in the enumerator) */ + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0 = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE = 
LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_A_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_A_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + 
LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_B_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_B_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_ADDRESS_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_ADDRESS_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_OFFSET_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_OFFSET_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BATCH_REDUCE_STRIDE_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT_BETA_0_BATCH_REDUCE_STRIDE_AB_UNSIGNED = LIBXSMM_GEMM_FLAG_BETA_0 | 
LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, + /** Marker flag; do not use. Its value (262144) is the next bit above the largest flag defined in this enumeration (131072). */ + LIBXSMM_GEMM_FLAG_INVALID = 262144 +} libxsmm_gemm_flags; + +/** Flag enumeration which can be binary ORed. */ +typedef enum libxsmm_gemm_handle_flags { + LIBXSMM_GEMM_HANDLE_FLAG_AUTO = 0, + LIBXSMM_GEMM_HANDLE_FLAG_COPY_A = 1, + LIBXSMM_GEMM_HANDLE_FLAG_COPY_B = 2, + LIBXSMM_GEMM_HANDLE_FLAG_COPY_C = 4 +} libxsmm_gemm_handle_flags; + +/** Auto-batch flags (can be ORed) applicable to mmbatch_begin/mmbatch_end. The values are multiples of LIBXSMM_GEMM_FLAG_INVALID, i.e. they use bits above those of libxsmm_gemm_flags. */ +typedef enum libxsmm_mmbatch_flags { + /** Handle recorded batch unsynchronized-parallel. */ + LIBXSMM_MMBATCH_FLAG_DEFAULT = LIBXSMM_GEMM_FLAG_INVALID * 0, + /** Synchronize among C matrices. */ + LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED = LIBXSMM_GEMM_FLAG_INVALID * 1, + /** Handle recorded batch sequentially. */ + LIBXSMM_MMBATCH_FLAG_SEQUENTIAL = LIBXSMM_GEMM_FLAG_INVALID * 2, + /** Only record a statistic of potential SMMs. */ + LIBXSMM_MMBATCH_FLAG_STATISTIC = LIBXSMM_GEMM_FLAG_INVALID * 4 +} libxsmm_mmbatch_flags; + +/** Enumeration of the available prefetch strategies. */ +typedef enum libxsmm_gemm_prefetch_type { + /** No prefetching and no prefetch fn. signature. */ + LIBXSMM_GEMM_PREFETCH_NONE = LIBXSMM_PREFETCH_NONE, + /** Only function prefetch signature. */ + LIBXSMM_GEMM_PREFETCH_SIGONLY = LIBXSMM_PREFETCH_SIGONLY, + /** Prefetch PA using accesses to A. */ + LIBXSMM_GEMM_PREFETCH_AL2 = 2, + /** Prefetch PA (aggressive). NOTE(review): the enumerator is named BL2_VIA_C while the comment speaks of PA; confirm the intended wording. */ + LIBXSMM_GEMM_PREFETCH_BL2_VIA_C = 4, + /** Prefetch A ahead. */ + LIBXSMM_GEMM_PREFETCH_AL2_AHEAD = 8, + LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C | LIBXSMM_GEMM_PREFETCH_AL2, + LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C | LIBXSMM_GEMM_PREFETCH_AL2_AHEAD, + /** Backward compatibility: AL2CL2BL2_VIA_C is an alias for AL2BL2_VIA_C (Eigen library). 
*/ + LIBXSMM_PREFETCH_AL2CL2BL2_VIA_C = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C, + /** Current B into L1. */ + LIBXSMM_GEMM_PREFETCH_BL1 = 16, + LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB = 32 +} libxsmm_gemm_prefetch_type; + +/** Flag enumeration which can be binary ORed. */ +typedef enum libxsmm_matcopy_flags { + LIBXSMM_MATCOPY_FLAG_DEFAULT = 0, + /** If set, then use zero matrix as source */ + LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE = 1 +} libxsmm_matcopy_flags; + +/** Determines the kernel kind. */ +typedef enum libxsmm_kernel_kind { + /** Matrix multiplication kernel */ + LIBXSMM_KERNEL_KIND_MATMUL = 0, + /** Mateltw kernel kind */ + LIBXSMM_KERNEL_KIND_MELTW = 1, + /** Mateqn kernel kind */ + LIBXSMM_KERNEL_KIND_MEQN = 2, + /** User-defined kernels */ + LIBXSMM_KERNEL_KIND_USER = 3, + /** Not a JIT kernel */ + LIBXSMM_KERNEL_UNREGISTERED = 4 +} libxsmm_kernel_kind; + +typedef enum libxsmm_dnn_tensor_format { + /* use LIBXSMM internal format, we need to copy data into that */ + LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM = 1, + /* use NHWC format internally, this allows no-copy operations */ + LIBXSMM_DNN_TENSOR_FORMAT_NHWC = 2, + /* use NCHW format internally, this will include shadow copies, not preferred */ + LIBXSMM_DNN_TENSOR_FORMAT_NCHW = 4, + /* use RSCK format internally, this allows no-copy operations */ + LIBXSMM_DNN_TENSOR_FORMAT_RSCK = 8, + /* use KCRS format internally, this will include shadow copies, not preferred */ + LIBXSMM_DNN_TENSOR_FORMAT_KCRS = 16, + LIBXSMM_DNN_TENSOR_FORMAT_CK = 32, + LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED = 64, + LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED = 128, + LIBXSMM_DNN_TENSOR_FORMAT_NC = 256 +} libxsmm_dnn_tensor_format; + +/** Denotes the element/pixel type of an image/channel. 
*/ +typedef enum libxsmm_dnn_datatype { + LIBXSMM_DNN_DATATYPE_F64 = LIBXSMM_DATATYPE_F64, + LIBXSMM_DNN_DATATYPE_F32 = LIBXSMM_DATATYPE_F32, + LIBXSMM_DNN_DATATYPE_BF16 = LIBXSMM_DATATYPE_BF16, + LIBXSMM_DNN_DATATYPE_F16 = LIBXSMM_DATATYPE_F16, + LIBXSMM_DNN_DATATYPE_I32 = LIBXSMM_DATATYPE_I32, + LIBXSMM_DNN_DATATYPE_I16 = LIBXSMM_DATATYPE_I16, + LIBXSMM_DNN_DATATYPE_I8 = LIBXSMM_DATATYPE_I8 +} libxsmm_dnn_datatype; + +typedef enum libxsmm_dnn_conv_option { + /* we get default settings */ + LIBXSMM_DNN_CONV_OPTION_NONE = 0, + /* overwrite results buffer (set it to zero before running the operations) */ + LIBXSMM_DNN_CONV_OPTION_OVERWRITE = 1, + /* external filter transpose to bwd convolutions */ + LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE = 2, + /* compound types */ + LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE_OVERWRITE = LIBXSMM_DNN_CONV_OPTION_OVERWRITE | LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE +} libxsmm_dnn_conv_option; + +typedef enum libxsmm_dnn_fusedbatchnorm_fuse_order { + /* the fuse order is: 1. BN, 2. element-wise 3. RELU */ + LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU = 0 +} libxsmm_dnn_fusedbatchnorm_fuse_order; + +typedef enum libxsmm_dnn_fusedbatchnorm_fuse_op { + /* the fuse order is: 1. BN, 2. element-wise 3. 
RELU */ + LIBXSMM_DNN_FUSEDBN_OPS_BN = 1, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE = 2, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS = 4, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED = 8, + LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE = 16, + LIBXSMM_DNN_FUSEDBN_OPS_RELU = 32, + LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK = 64, + LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE, + LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BN_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BN_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BN | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + 
LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE_RELU = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU, + LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED | LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK +} libxsmm_dnn_fusedbatchnorm_fuse_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm_desc { + int partN; /* number of images in mini-batch, used for all elementwise computations */ + int fullN; /* number of images in mini-batch, used for statistics computations */ + int C; /* number of input feature maps */ + int H; /* height of input image */ + int W; /* width of input image */ + int u; /* vertical stride */ + int v; /* horizontal stride */ + int pad_h_in; /* height of physical zero-padding in input buffer */ + int pad_w_in; /* width of physical zero-padding in input buffer */ + int pad_h_out; /* height of physical zero-padding in output buffer */ + int pad_w_out; /* width of physical zero-padding in output buffer */ + int threads; /* number of threads used */ + libxsmm_dnn_datatype datatype_in; /* 
datatype used for all input related buffers */ + libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */ + libxsmm_dnn_datatype datatype_stats; /* datatype used for all stats related buffers */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_fusedbatchnorm_fuse_order fuse_order; /* additional options */ + libxsmm_dnn_fusedbatchnorm_fuse_op fuse_ops; /* used ops into convolutions */ +} libxsmm_dnn_fusedbatchnorm_desc; + +typedef enum libxsmm_dnn_fusedgroupnorm_fuse_order { + /* the fuse order is: 1. GN, 2. element-wise 3. RELU */ + LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU = 0 +} libxsmm_dnn_fusedgroupnorm_fuse_order; + +typedef enum libxsmm_dnn_fusedgroupnorm_fuse_op { + /* the fuse order is: 1. GN, 2. element-wise 3. RELU */ + LIBXSMM_DNN_FUSEDGN_OPS_GN = 1, + LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE = 2, + LIBXSMM_DNN_FUSEDGN_OPS_RELU = 4, + LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK = 8, + LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU = LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU, + LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE, + LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_RELU, + LIBXSMM_DNN_FUSEDGN_OPS_GN_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK, + LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU, + LIBXSMM_DNN_FUSEDGN_OPS_GN_ELTWISE_RELU_WITH_MASK = LIBXSMM_DNN_FUSEDGN_OPS_GN | LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE | LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK +} libxsmm_dnn_fusedgroupnorm_fuse_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm_desc { + int N; /* number of images in mini-batch */ + int G; /* 
groups of channels to norm */ + int C; /* number of input feature maps */ + int H; /* height of input image */ + int W; /* width of input image */ + int u; /* vertical stride */ + int v; /* horizontal stride */ + int pad_h_in; /* height of physical zero-padding in input buffer */ + int pad_w_in; /* width of physical zero-padding in input buffer */ + int pad_h_out; /* height of physical zero-padding in output buffer */ + int pad_w_out; /* width of physical zero-padding in output buffer */ + int threads; /* number of threads used */ + libxsmm_dnn_datatype datatype_in; /* datatype used for all input related buffers */ + libxsmm_dnn_datatype datatype_out; /* datatype used for all output related buffers */ + libxsmm_dnn_datatype datatype_stats; /* datatype used for all stats related buffers */ + libxsmm_dnn_tensor_format buffer_format; /* format which is for activation buffers */ + libxsmm_dnn_fusedgroupnorm_fuse_order fuse_order; /* additional options */ + libxsmm_dnn_fusedgroupnorm_fuse_op fuse_ops; /* used ops into convolutions */ +} libxsmm_dnn_fusedgroupnorm_desc; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matrix_arg { + void* primary; + void* secondary; + void* tertiary; +} libxsmm_matrix_arg; + +/** argument struct for matrix-eltwise: reduce */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_reduce_cols_idx_param { + unsigned long long n; + const void* ind_ptr; /* index array pointer */ + const void* inp_ptr; /* input pointer */ + void* out_ptr; /* output pointer */ +} libxsmm_meltw_reduce_cols_idx_param; + +/** argument struct for matrix-eltwise: opreduce vecs indexed */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_opreduce_vecs_idx_param { + unsigned long long n; + const void* indices; /* index array pointer */ + const void* in_matrix; /* input matrix pointer */ + const void* in_vec; /* input vector pointer */ + void* out_vec; /* output pointer */ + const void* scale_vals; /* scale values of 
indexed vectors after ops */ + const void* indices2; /* index array pointer */ + const void* in_matrix2; /* input matrix pointer */ + void* argop_off_vec_0; + void* argop_off_vec_1; +} libxsmm_meltw_opreduce_vecs_idx_param; + +/** argument struct for matrix-eltwise: unary */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_unary_param { + libxsmm_matrix_arg in; /* input */ + libxsmm_matrix_arg out; /* output */ +} libxsmm_meltw_unary_param; + +/** argument struct for matrix-eltwise: binary */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_binary_param { + libxsmm_matrix_arg in0; /* 1st input */ + libxsmm_matrix_arg in1; /* 2nd input */ + libxsmm_matrix_arg out; /* output */ +} libxsmm_meltw_binary_param; + +/** argument struct for matrix-eltwise: ternary */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_ternary_param { + libxsmm_matrix_arg in0; /* 1st input */ + libxsmm_matrix_arg in1; /* 2nd input */ + libxsmm_matrix_arg in2; /* 3rd input */ + libxsmm_matrix_arg out; /* output */ +} libxsmm_meltw_ternary_param; + +/** argument struct for matrix equation */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_matrix_eqn_param { + const libxsmm_matrix_arg* inputs; /* array of input args */ + libxsmm_matrix_arg output; /* output arg */ +} libxsmm_matrix_eqn_param; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltw_gemm_param { + const void* bias_ptr; /* optional, col-bias pointer */ + void* out_ptr; /* optional, pointer to output after eltwise (contains mask in case of ReLU); */ + /* Need for some activation functions, assumed to have the same shape as C matrix, */ + /* may not be set when OVERWRITE_C option is chosen */ + /* If OVERWRITE_C is false: out_ptr contains the post-act output, C has the pre-act output */ + /* If OVERWRITE_C is true: C contains post-act output, out_ptr contains the ReLU mask (only when act was ReLU) for other act unused */ + void* sparse_bitmap; + void* 
decompress_buffer; + void* relu_bitmask_bwd; +} libxsmm_meltw_gemm_param; + +/** Specialized function for matrix-eltw (weak-typed). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_meltwfunction_reduce_cols_idx)(const libxsmm_meltw_reduce_cols_idx_param* in_struct); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_meltwfunction_opreduce_vecs_idx)(const libxsmm_meltw_opreduce_vecs_idx_param* in_struct); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_meltwfunction_unary)(const libxsmm_meltw_unary_param* in_struct); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_meltwfunction_binary)(const libxsmm_meltw_binary_param* in_struct); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_meltwfunction_ternary)(const libxsmm_meltw_ternary_param* in_struct); + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmeltwfunction { + void (*xmeltw)(const void* in_struct); + libxsmm_meltwfunction_reduce_cols_idx meltw_reduce_cols_idx; + libxsmm_meltwfunction_opreduce_vecs_idx meltw_opreduce_vecs_idx; + libxsmm_meltwfunction_unary meltw_unary; + libxsmm_meltwfunction_binary meltw_binary; + libxsmm_meltwfunction_ternary meltw_ternary; +} libxsmm_xmeltwfunction; + +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (double-precision). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction)(const double* a, const double* b, double* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (single-precision). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction)(const float* a, const float* b, float* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (bf16, fp32-accumulate). 
*/ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (bf16, fp32-accumulate, bf16 output). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (low-precision). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction)(const short* a, const short* b, int* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (int8, int32 accumulate). */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction)(const char* a, const char* b, int* c, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction)(const unsigned char* a, const char* b, int* c, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction)(const char* a, const unsigned char* b, int* c, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction)(const unsigned char* a, const unsigned char* b, int* c, ...); +/** Specialized function with fused alpha and beta arguments, and optional prefetch locations (int8, int32 accumulate, int8 downconvert). 
*/ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction)(const char* a, const unsigned char* b, unsigned char* c, float* scf, ...); + +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_addr)(const double** a, const double** b, double* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_addr)(const float** a, const float** b, float* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_addr)(const libxsmm_bfloat16** a, const libxsmm_bfloat16** b, float* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_addr)(const libxsmm_bfloat16** a, const libxsmm_bfloat16** b, libxsmm_bfloat16* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction_reducebatch_addr)(const short** a, const short** b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_addr)(const char** a, const char** b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_addr)(const unsigned char** a, const char** b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_addr)(const char** a, const unsigned char** b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_addr)(const unsigned char** a, const unsigned char** b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_addr)(const char** a, const unsigned char** b, unsigned char* c, const unsigned long 
long* count, float* scf, ...); + +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_offs)(const double* a, const double* b, double* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_offs)(const float* a, const float* b, float* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_offs)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_offs)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction_reducebatch_offs)(const short* a, const short* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_offs)(const char* a, const char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_offs)(const unsigned char* a, const char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_offs)(const char* a, const unsigned char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const 
unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_offs)(const unsigned char* a, const unsigned char* b, int* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_offs)(const char* a, const unsigned char* b, unsigned char* c, const unsigned long long* count, const unsigned long long* a_offs, const unsigned long long* b_offs, float* scf, ...); + +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_dmmfunction_reducebatch_strd)(const double* a, const double* b, double* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_smmfunction_reducebatch_strd)(const float* a, const float* b, float* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_strd)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_strd)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_wimmfunction_reducebatch_strd)(const short* a, const short* b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_ssbimmfunction_reducebatch_strd)(const char* a, const char* b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_usbimmfunction_reducebatch_strd)(const unsigned char* a, const char* b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_subimmfunction_reducebatch_strd)(const char* a, const unsigned char* b, int* c, const unsigned 
long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_uubimmfunction_reducebatch_strd)(const unsigned char* a, const unsigned char* b, int* c, const unsigned long long* count, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_sububmmfunction_reducebatch_strd)(const char* a, const unsigned char* b, unsigned char* c, const unsigned long long* count, float* scf, ...); + +/* GEMM fused with elwise */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bmmfunction_reducebatch_strd_meltwfused)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, libxsmm_bfloat16* c, const unsigned long long* count, const libxsmm_meltw_gemm_param* meltw_param, ...); +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_bsmmfunction_reducebatch_strd_meltwfused)(const libxsmm_bfloat16* a, const libxsmm_bfloat16* b, float* c, const unsigned long long* count, const libxsmm_meltw_gemm_param* meltw_param, ...); + +/** Function type which is either libxsmm_smmfunction or libxsmm_dmmfunction (weak-typed). 
*/ +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xmmfunction { + void (*xmm)(const void* a, const void* b, void* c, ...); + void (*xbm)(const void** a, const void** b, void* c, const unsigned long long* count, ...); + libxsmm_dmmfunction dmm; libxsmm_smmfunction smm; libxsmm_wimmfunction wimm; libxsmm_bsmmfunction bsmm; libxsmm_bmmfunction bmm; + libxsmm_ssbimmfunction ssbimm; libxsmm_usbimmfunction usbimm; libxsmm_subimmfunction subimm; libxsmm_uubimmfunction uubimm; libxsmm_sububmmfunction sububmm; + libxsmm_dmmfunction_reducebatch_addr dmra; libxsmm_smmfunction_reducebatch_addr smra; libxsmm_bsmmfunction_reducebatch_addr bsmra; libxsmm_bmmfunction_reducebatch_addr bmra; + libxsmm_wimmfunction_reducebatch_addr wimra; libxsmm_ssbimmfunction_reducebatch_addr ssbimra; libxsmm_usbimmfunction_reducebatch_addr usbimra; libxsmm_subimmfunction_reducebatch_addr subimra; libxsmm_uubimmfunction_reducebatch_addr uubimra; + libxsmm_sububmmfunction_reducebatch_addr sububmra; + libxsmm_dmmfunction_reducebatch_offs dmro; libxsmm_smmfunction_reducebatch_offs smro; libxsmm_bsmmfunction_reducebatch_offs bsmro; libxsmm_bmmfunction_reducebatch_offs bmro; + libxsmm_wimmfunction_reducebatch_offs wimro; libxsmm_ssbimmfunction_reducebatch_offs ssbimro; libxsmm_usbimmfunction_reducebatch_offs usbimro; libxsmm_subimmfunction_reducebatch_offs subimro; libxsmm_uubimmfunction_reducebatch_offs uubimro; + libxsmm_sububmmfunction_reducebatch_offs sububmro; + libxsmm_dmmfunction_reducebatch_strd dmrs; libxsmm_smmfunction_reducebatch_strd smrs; libxsmm_bsmmfunction_reducebatch_strd bsmrs; libxsmm_bmmfunction_reducebatch_strd bmrs; + libxsmm_wimmfunction_reducebatch_strd wimrs; libxsmm_ssbimmfunction_reducebatch_strd ssbimrs; libxsmm_usbimmfunction_reducebatch_strd usbimrs; libxsmm_subimmfunction_reducebatch_strd subimrs; libxsmm_uubimmfunction_reducebatch_strd uubimrs; + libxsmm_sububmmfunction_reducebatch_strd sububmrs; + libxsmm_bmmfunction_reducebatch_strd_meltwfused 
bmrs_meltwfused; + libxsmm_bsmmfunction_reducebatch_strd_meltwfused bsmrs_meltwfused; +} libxsmm_xmmfunction; + +/* matrix equation function */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void (*libxsmm_matrix_eqn_function)(const libxsmm_matrix_eqn_param* in_struct); + +/** Structure to receive information about GEMM-kernels (libxsmm_get_mmkernel_info). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_mmkernel_info { + /** Input/output data-type */ + libxsmm_gemm_precision iprecision, oprecision; + /** Prefetch strategy. */ + libxsmm_gemm_prefetch_type prefetch; + /** Leading dimensions. */ + unsigned int lda, ldb, ldc; + /** Extents/shape. */ + unsigned int m, n, k; + /** Set of flags. */ + int flags; +} libxsmm_mmkernel_info; + +/** Structure to receive information about matrix-eltw kernels (libxsmm_get_meltwkernel_info). */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_meltwkernel_info { + /** LDx, M, and N. */ + unsigned int ldi, ldo, m, n; + /** Size of data element. */ + unsigned int datatype; + /** Set of flags. */ + unsigned int flags; + /** Set of operation. */ + unsigned int operation; +} libxsmm_meltwkernel_info; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_kernel_info { + libxsmm_kernel_kind kind; + /** Number of FLoating Point OperationS (FLOPS). */ + unsigned int nflops; + /** Code size (Bytes). */ + size_t code_size; +} libxsmm_kernel_info; + +/** Structure to receive information about the code registry status (libxsmm_get_registry_info). 
*/ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_registry_info { + size_t capacity, size, nbytes, nstatic, ncache; +} libxsmm_registry_info; + +#endif /*LIBXSMM_TYPEDEFS_H*/ + diff --git a/third_party/libxsmm/include/libxsmm_version.h b/third_party/libxsmm/include/libxsmm_version.h new file mode 100644 index 00000000..1c0bdd90 --- /dev/null +++ b/third_party/libxsmm/include/libxsmm_version.h @@ -0,0 +1,13 @@ +#ifndef LIBXSMM_VERSION_H +#define LIBXSMM_VERSION_H + +#define LIBXSMM_CONFIG_VERSION "1.16.1-1534" +#define LIBXSMM_CONFIG_BRANCH "master" +#define LIBXSMM_CONFIG_VERSION_MAJOR 1 +#define LIBXSMM_CONFIG_VERSION_MINOR 16 +#define LIBXSMM_CONFIG_VERSION_UPDATE 1 +#define LIBXSMM_CONFIG_VERSION_PATCH 1534 +#define LIBXSMM_CONFIG_BUILD_DATE 20230510 + +#endif + diff --git a/third_party/libxsmm/obj/.make b/third_party/libxsmm/obj/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/obj/intel64/.make b/third_party/libxsmm/obj/intel64/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/obj/intel64/generator_aarch64_instructions.o b/third_party/libxsmm/obj/intel64/generator_aarch64_instructions.o new file mode 100644 index 0000000000000000000000000000000000000000..a37798f9410f85ea3e963ec8d1f06e3578e95d53 GIT binary patch literal 33008 zcmb__4SZD9weLwLLWabg$gN5(HlxBTRESAPD1OyBI0K2sihL-D7!op(DEVkI30Q4S z?j(^h(u==-+*`f37ki)gdHed_rR^;e+NT-P9|pCGwDnF#D_~u3 zZE39A*51@qQ(N1%Y4hUpn#Sh#a9hWwaAQkzO>KK)Q+-WSOK0dRUv1d8B~;rU_BktW z^4+?m)Ylve)wlb?Exx*tue~+2sj;CkRPU<=RNHn}DC{GCm(0kRe@Am@8@PnQgQ?(a z?r5qDwO!?#+c87xpQ*kr4GrzkqM@bD*U{YGcvmwR18XL*w!XeC)ZX6Me3!4Or9R|q z=?LSUudTNEF4S~x;{yDkx=2%y*BWYD*w|bj+IB^2OM6&mCzLr^d5z6m8k<8`w05+G z(CF=o5Q}wc3*Q#*w^0C+S<~F2Gy)ZeO7fpFze8C%L?@0F{jY(yDi(Y zx=&Bb@m~|3XZJk=lJ3)<%8l2AFS7f-C!VHR*0te$&{+O!?cNx$$ODC0cHf^c$^y|N zf#_>a>eHQ5ck{iyI;4gE^96d40?}5KXM|YpH_P=EP2g)B;XzP z9*TY#8J!;a#M4n^?>Xp155|t>I7 zS)QM>e#kB&rStvQ*gY+0&YT%5ZYZ{kdLT(uCf@>n8*r+M5BVEKep(LU*C8MMuyRKQ 
z5m)!0L$}hws{SQ!Q!et-8F6R)-aK>W8E~Rz=%{ldf6lgho&=@c_qVv>ImhnVD|6HA z-WQQ;2=)#-J6fkZcGdA<>1z$K(;oYoR|vJY7oyREJ+Ill(~*gd&gdSsKE2LzyffqV za_nagH8|aaRu--&&t%yTk~zml^FeyNh?C0tLW8W1+m%c^Av0&}A-gC*U38!!_MRQc zI?(X)X~D9{vHyAKSR>lOF7j(_Jya5^W($(*1j#ax>=e&7xCm$wpi0?j`_`bYPeV7W za>uo}T9y5|No`o&e?GzcYH;CnRsA!P=qmd~aa4R*bxY}BW&d@_G_t>r-FW)U8DeT> zFt5mY>@26R$;!!e8k3d4wKDR@TxF7W-?IR^PiGJM&n1iV3=W=a_dWuQEUt4?F#0?! zc4GNqeGbemAEkx)DO(GApLe1kMBelSOAaC*^bR@RUq?=RI`XBV z!NMHC(TiyGRYrc9JCtePR`#DuHHdzQu996-*?(nnkICLwV?*|T|BQVND3wGv@%-9a z+t_xty_m^(VRAC)#YySS_=z=jYh!(QbMo3YF25}ussYrhW-eG6t+T9tQ(ZjJdCsk? z*pTfH%<nYc@Fq|viE$p~oE*#f@y1ypBy8pu1 zSa#$j=Fj8ci`m`oc}Zll!gbO8@nIz;>V6#rp-}p7@+U_9H>p(0g!)V>>^``e)fyv4+^m90!wJ*8#u1eLvMRc60_=8BEJx zassyYd9t?$K3~Sz2u6o6;TLw<x4zdeI{X7kw1IFi=3{LIa|Nv*n6Ij zz6d*ty?H)tCl{4^)3SV)_XF>95Y0rv9+vNzZJ}6p=fQ-fep@d`Yf)wHLYS_Gi%Ha> zSdvNLdAxDd-Wf`_&^U|Bm#zJ4>~|9UGaB;HHdB3Rn^PSqU3uFAaSQlbnrS=G(%u+u z?8F8~Y=+vgCxa<$+0wE#u_?%`JyQA0?G?5fd@Y(_9ZJ-n_^n#K@`~!fN~Ij_S`1bh z)j!G0Q)=FJMPLEgm*PGF_les#iT@NVUM-f3F#6S&s~Ag8_vwg#5liX|HOeVyZ4KrjeZ!27Dh%rv|r1M zJCEL6P+}A>PXBJxOoZ;evS#p27_B#b-ld@9&Nc2^9OV;EwIIk;8WCl$C8i z_&8L+oR6x{XmDbG_E7y%1f|gw*f?K@(nA67tYf8ccdDhgon|NHO`6{HbV;tj_|s*Q zr$OXtJq@)z-gkq+ugcizX}LEWbg9#{aOUG*1&6quUSR83+s zmy_X?J<)s`*Lwr zA1rxIMpxh-U{|QUgAqh$VD!&pi>SDLzJK+cSq`EU4q^`&oUzdhkg*0~rH4TkjLwQ1 zmEpV6!kczP>5+0P2*w|w6YVc2YUkE?L!aFM$y`om10-{PQyU;`eAsz`ugul5v0PG^NzB3KZil^gfcrt4ecX_{pXkgZI!`~!CHPk~8W zxfdm)#}R?U*p&ge*NaL9&SVt*2vF6uwDr zY*a0OcKec3x4{Qwc}$kAz$Q!2hKB5~SZgDUy{y>?Fr{D86l(g6QsTE2zn5S~eS zAM)!&ep(LUHz6P2K}{~cJ7sd4Nd ziUP@TY*khGM@nXD96B}@(KEyb|=0U zLEq^;2T5b+=cqu};3>~B`jC=oh(eF`gpZv5>&is^N?&bj8k?F*7f=1$E%cvo+;cgZ z49Lmo&G@x%ZfposoKF5X!FOe~e(U;AX<$?3k&?#uZvjMyyHBHESB5LZy)C>@+-HX8 zs%;u`P^yfNBG?~BKEe05@9yt9mF3uf9}D(8*Rj6qlxOR$(hU9)0)#(yG%IL7H3)NX zC`kT^&%S;*dI-MOITS)0IU~mZMFB5XAe89B7gX=@*wOR7hrGkrM{^#Hk9ZX~2-tgK zrGwFz;|oK|_qF8Mv#aOiGK$whiIg22Fr>=vPiO+Q`^eUvTlc~g&IIhI_B(5;qQ7-k zSH<2fbfW*crZW1MRh7|SS6;clGgK9Q1(kmn?RcrI2_wT6`5%Kt8I%Ix@Bi!x|5W{OpOJ` 
zlRr^WGfY;0&bO)QiWD=REI&jqeQlw;5Z4T~;g|!?pr|7#p~mp$P@8X2nGd1G#pQL4 z;j86A5*6H^WP40`%1D|o=))HIlGb_Br?hUddtU}7KdRdgK8C!5sla}azVdc`Qq;Mz z>r_$adN~>V4p?+J2;;`)o$P+_Ol;&LEP*20PxAwuQ957IRo#!T!6P#e)>r!~#K;if zKMrTqJBSJ4Tk#2DCxmQ0hlD)FCIhkArii}z4Oi^A9Q&zZn$)04EdE(8Ir`;6VU9}`LRqhRSl zXYU~14_t@KKDs=OUcCjgmZ;N-p{$PP6GPc{FZJk&p&YxHKINYn%C&nx6q!Q1_hQUd zCx+nI_(jHV_ihpygjoA1^K6#g`)?uxJ}-z2_`Huy*IVE-6Pl2_ynX|Yzxc(v%eYY?w%%fqferGHgwiGx07mO$3Uag*-J;8@Y)&36J-W=;g%>UYEEg*%nwwAe_O>K%a?CVV$ z)zQC1Kf;#;jQ{@$1%(|Q5gCmC95Mv5qbo!Pc67VQz>dByGO(jvA_F@bq72#5n<4`{ z@`C?~XTfIyGQy5-Mh5nrXeVkuVccZW=ZRq!`8u6&YXfEfP5G7mYpmF@>}PWx8uZk> z5(&zqGQte8Zxt5hZioJbf??bU=PBD5-w0JHZG@jhh-D-iCw6Fo9^K-H4ID<*u3C8dMX z{M0E-(kIjU3C}WAUjygxtXzO8$>Xh>I6s}_J)bq-Cb%|Y`z!ltO8lnmJx{S8rm#fl z^T~GKe9YkpxQakbo{Sk;H_8P8J~KY~mVkhK=5lgpufXAvBKyJrfguxjMqR_7*oUKx z{oNrdgzflHpy##lFuBG<bXa?hT6ETV5PtYDorUopLi9>((8UQcI>zi5;jhlTc<5{FJ1+in z9r!01N4%ED&Zjjcd3M^ajy~)7nKO}-*my06_$M&#-f6Fgh$eE5zf86m#RTeQ=F$bbf(rUce5FVpgnzwwy&bPEp?ECkSdw)t8;?D>wDgv9fC|cxvH`h3U5RKajtpGAsH`dJ(;* zPkmuPE0ODq7PRrH(wg{61clggxJk@(B82w!!DrmjOm-d%;$KJ~h z%>P5|O@voR;O}^kBWUtSJZR!FCDSCE)W_O&qq&-onbdPIBH+RT#b)FM3{)Dgz1{5C zvX)N^!JZSqKFUz|4kaY^bF zZb1Dnw^_>$aD2YtVzGcQx!My4N1-_*(6wF%sD>T}R1-Uom ztu4qMI`G1;UR+B=edj!!v%~YvY*F9;qxjyUWj+d-sMBVMIjP?LMfELf=f~nr8_Ww7 z%v)7Zvbtd2(gNR21@mqxC|RTPHy4!LRxr;I_)0;8yvLyqy{VvZg{Zk@t%a<5l7Uo3 zeUsd$`{S~fDOtB@{u9cwmVn)$;{7lQ5Myg;yzLxnJBONO(~N?7RRz8v+D(kL$~+p| zpQ^IG)E07lzax80>OGtEMf#X73EpUmhjS9V@0GkEHds)&j0VQ$f+bZ2{*?twAlt!P zv?%nSKActy~G=1suh9*dG`qMG?xBBy+5JeXFq^fRM;g^6>_W(@WmuGd4y4> z!&L(quRN9#o`UBXhwxSlnWUwfmq# zN=+V%R;bkEu_$bunmiW8vyv0FaF#_OUYUe!P1y7ee zI9_Zf*#bG%M&pI7QI18v08bQ2qnXA`egWiIyNwq?h5HOXHwb*XSc`ZOI!kO>sq+S{ zlNRO&F<*+jysQ(q87X}3R5&9Q*-Z+sH1ugyxb_2dxfYQ*H-&$P!cBedQ@B~*ZiR~* zyF4iJg(>a7L*a~6WWS~GLSyRwp2E%c|7V47H0t|==40UhUGp*U9}7HnK72~yW;^^Z zg_Ezr(~x+$Af+D8m5WS*y^%FH`tn zf>h)v>PGaYKJ@v8a5GgXT==@|<<$aDo&Ozyr_S@s1)e&e-z4x7YnHet1gui{b^~9h z@YV#W$YF1fRAQi(6;k-H@#Y?ZFG^{LuPWRix3&sAb^aIo7px1WJ`q8mI?qQHZtC-Q 
z3OD(X9v(zM%=8`5&&*EvV`A?HqF(?nG>P#{e-V@pf7Jy)>Vngkw{(2o1wJ!7Hz66% zls<>~Yi4%BuM>Mi;L8P`@ZV%RTGm$up77JeJ`eQY6?noQ)ARS!0#Eo|VqXXPw*;Q> zuf(1Xc)o}9N%&EETrU>54-_(q{Ts5i0>4zlme{-DsbAo8i-3HS^wu`{z1{*2h)0AC{Tgx@0eF2J`4JmH^w42;sF0#EoMdR}=6c)I@j zy$k+6;b=QE z_4TE6DW^*bUCQXPh%SriQbCt1>9T|_vVUT|0>DFvT$!J%Bx zlnXxPqHMXWzTi+U%9cxd!J$IXR0#SC$wBa}5TzqBjAR)hG*W*V%}w#Km4u(b^*vNGVy z{+u!c>iFU03~1Gd&T%RAAD~W1lqoa)S4QJ9)PGJgAyJ0aO*-8=AyfQ=M|7BQaruOl z>{Br;X1u^-OKYfEowpj7jvve#moVi^b;i`}yKly2CT-pe~K?tl(Zj59#k{KFH-cFo-Nv| zX!=VT-pcTxn!`;#wVH~D`<;dCoueEJw4&L1I6kw`A* z{~d*!{C6pwZnQe;cK9LV!`tU44CnmCAzqZH*L28C^LYe$N~YW&NLbMGcKe~i&32=| zZlGk=Ye2#&{C}x%(ublFTAvpbPJDR1e#3BHFS##D)qhmc6Mym^H2?o(e0aZn%5cvA z9)%MhifL#*Z3;Kr?S6%u`u8!M^OyTc*)Qa6YyKP5I5+v%Gdm*hN7HvQyWsWu2E(~r z?O&PvA7=cQF#g|Xd^rDqWjN=*oAIGtf!34Pm%J;D>+xXf^CD;{neF+Cgj2@b7nod% z>1h5X3OD)7{hMrmie72@DpoIEU+wpjKD@qbUHHp=n&eNr8qHtsyJWpM|9hES&c6y} zDVh4Omawo>ijHaicPQL!w|a(i{jyrf^gL!wM(<%kb0uf2VM>eMTA1`5#lb$zQJ*#D6({nm=DJ z_&Sy2V!uCg;eSBk#J`g9=kp$KAGdk$_b&Y3X8dUm(fS`(IJFPg|0KhC`^b1->iC?d z>P!0HfS=~C<6CC?>$s!Yzo(R3;&UV8qvL;Ozsz9m%=MqiaIXKe3OD&2Vft4yxg(4Z zZ~wO$&iTLZ!r!6_A<^w7{B--|D%@-z9g8sA=R!qq>VGNY{}slc&WBSXKD>R(8P4^; zn(?_AWwf3$j-1->9};sUDz*YYP5%jjNT%Fumv+0{rQOODPW*4dPxB8jKD=M_83nW5 zWLz~>f9^-EWc;_d@TY&*h?2>_Si&ju9{D_!NdHy%Y5g4nk<9kd>$}N6CdpIy^LdYa z0L`Dzd%WLA6(5to+q`#N(Ubmb7=PZ6y#2X7@^N*ZYBzJ->iL2A-^%#Yzs^L7^yK{Y zcrf+9O;AhvuOYA7f$C0)=sErafu`aIUGO(ta5}L?$>jfc5>COj-)qwUyP~Idpt@-N z_bA+Kw?TyyAKt&i4Cn3tlEO_sqY5XvYw^=^-%+^9U)ww3od2H{J@HwGpXQ_2K~t{$ z4kYd3c1AD1>&W)x{XUnC2QGJk%ecB$;iS)c#y`mTaQ^apjFijym!d2ssxR&7biIB> zAQIv2_-Wk3_;Y-T3(n^mPX8#QznAep;DUeXf`6VG0EzVEd~R^TJ6-S}xZv8qHT(CN zqNf|}L3I0k!1(a?KgDp~{x36p8{_jh@|4Ve;p3CH=RDRv9M}76vmYOEp?`+qygkQU zaJ^oe?SDP17w4n*M<)GaF7(g3;Kv!x+fAPdGyBWK{D0nG=QBT__t(V?=lwND;U@nI z#(x9)O^=6b86VC+$Z*b|<|s;Lzuzt4l=ZPw;nW;-W-vJgXVLU!cF~ooTQG? 
zYyXPcllS9##fNZSuR0h0Ivzs$Y-0R*JcPGfw+sI+l%+&+@54{m>-z*E5#EKL#&;u6 z$*k|sB`o^49(heaq;TTH^?9D*yuR8`Gx@)&=&8OT{51bD#)r4(M-1ouX^o>q`aFQ2 z=5sZHNF+CcpT=pALy7Pn{4`EJ7$wr@a{M%Y4}nOg{$vZ3h@SpxL(>yaN~90hUw`K% zoa_HBB1SU#KPq9t|1RV;{~syb-G5k|Te=);#{FCU582(EFkmz=a$f=83?+Cw);Xa1XW%&IJ z{{qARk>NBZ;;Fd(-llMEomGra)$+bs=fa_FVXL^lHe72J*R{7>#ceJ0P^>=#ru)8^ z_?%7*ON;R#zAdz+wwQQjSa1?@*yr^riDTs_AvONpdP*|mr6(a#eX5?4JpFAylit$@ z1|}&;7t5$dL_D?nu#y=|(kB+CP$OM5BU$>;!zAVCVj0VbkDe(AikHn;TH;?jn}WDR zVaUsD4!}{5#QBX(2CQ+AqO@!ZdM2L($wVqU4>C!4tYk)=z)zG+QWP(mku0Mv%gQGy zFs1yYrLw~1cR7qRNflhU#5FJvJi^|Z zLe20j@PBO!72}UK7q*A-H`$=4Ng}klrlGC2iS|5yjYK{fxnkT$CBA;Tq)Ezci@zaX zj4hg-M^$Oll-qe4P zgi+Fj@+swP@5{i`>EB8p+mL*!yn{T+*7CLf^xc9;O#KIyLjR)5gG3}>AG`j@Cpn2w(iFG{*RiixyOl{fX@ zO~R1At_kI1O8fr|@6+i&aBFbSC+rW7{9u_R_ijnr@X2E zE?bt@=jr8B>e+CLlc&>v)-AHcE+wDzKMy}GU+e!BCEwKl$V^#YpP!dciSkj`WT7C#}CCh zO#M%|=zmfLMQ%6b>+%||K;u!~Z2yfac%uI<20>R*EL>DqtyT1nrf1UY#0;iu*6 z_P@tP{w^X$GVRxt-{2y@gg$N}4T`Hgdv+=Eru@g0eAE7RUm^?sHcK&3H(g%C^mi?k zH`{OLWwO96noyND^*`dG|8b>1AHNib((-ltoriUrM40-&Jx2TLqCBc#`89#MZ>Nu0NL>GUc-Hc@{`a})zk-O7)G(3{xa#s+|Gh5y z?^fmY-#uvtX8ZrtMSmZCv_d+m1UblGj-QsV^?%Dne~TPTq-uGKUv+t{|9mt&U zm47&1Qf5s3=b&J^_Mf#;((hE|N&jN}w0y1q1|{DdKO;)M{=TZ%#LH{_pK#It5D7!l z=bYtJ>iF5^qW_Z{WeGlhN&%Vr|6dpV*Otig6ZyZFOi$nbJ`#rXvAo5vX8X@Z`E>2? 
zSB9tk2WtOv{7n7VDEVgl&r-l|;*w=Xm)Gt8eHZT*e{;Po!u7uj zkg5NAB|keZL1sqij}4I2O+Mi2i|1sfO_f)}E&lfW#WJhRAdc&L>-&AGe4IdK@VAL+ zlP;%U?sU<=QkCBg|9~fuC#8M>>Dq65NRkaJfes$8!%vsj{ZFsWwxg0KucWJGJ((}> sOL5zRpIP1u%urDj*tJaDtR%33AcckW*`X@8;}AD)=L;NGZ4&WoS6i$774jI zAE3NwEw$D1*1p!Z*4o-?5?YC3!TMUYXce{frQYgPt602HyySh>+H243?97a|-yh%a z+rQtLbI!A$z1G@mU(Y`KB-d5f)nsR7IXYxH7dXi~jXI8ZbMm^9FDsq%o%5W!+SyAN z&70>A`l7xjcOV>Y4bO5nH7;KhX>P8MM8ho&p|x&*AQEkC@kJY3Tinarg26!8-57C4 zTU*_cW?xg&7sn)JdpqCCAE?3ogJgww<0y*qZa$w+8}r|w^UfgzzH*$+OFro%X6MVF zM33xzXKdME=TMI8mcd;_M9`+J2t@1GR7|g*KDB;Ds6HIHy1vEN9B?kRcWclc_O+}Cn3gGdWMAiy%hlTnt=`Q?5gHrvHstnuvmPDVzoqkVwyWnx#O=uo zmrZrNosZ=2a-CM`Fuq#wY0jIkyG~nAX&J9qxNnW~<`X79SK$r7?>F%;DcleIb`w8d z;md*dnD`JB!JbISiC+cRCG7dD!j}VIWa3W>ztM@GYvKdF~g+JOC(S4Z{-zoQV z=)T;E|4i3U-US01h*olmNoBGrYs|5qVJeQIn$J=fJq zZd7fWIH7G4)$|Niit1CxP`IeM*wr;q=s2#Pt+20S=qv8g%UoSGM+51dm`zCE8KX*F zU1JI9xf2E5G4xDPH;}!YYAPd_p41Cu8zV1>oKcW7{TL-P57&nwp<}4lJ$iwwYd5aF z8&xGf?A^?+EF1K0oH@*QR9`xVJZ;e26=nVHc_d?EQLfw7HJg!KMhZ{N1Cr0kRy5s? 
zp%c#T2J!@3RLMx;NxeXB215CshkOr$-!XJd5%BI7)SJ%3qg_3#iX5l5zrDG&-S2M; z?XJmp>S6`)pN@tRRBt+l`p>4^=Z>Z8rJV1lj->0pGk|xW!E(m9dQK-fJKBTIKEJ>?~Idpjk+C%=(>PW=z>5D+PH&%qER+As!2Hz+3egCXc zFT1*aa0KbQlJgTjWAp)6S2>E=)$)v(-(pvyKBZIZ5hk9R3ZocXHmXyhE|jh`M7&4OM&NZfD})E z1Zm+q$U!#dE8qPdRYb~n5nFUEr@xi2-F&@}EvjKz@XTzosINUVm0i=1;(#%E!Pxx# z_>Uxjmzs~Z*XXNciyI^D!BU`yl90=?fx{0qJ;TC!6 z+1>kH-Pf?jBV9f1q!FdOJlyK@H$bB=61^J5yt`(s6aPi7s;_r&`c2&4wz9>uI6X>d zI;HP#U*U_c421n$H&x*jpmOHNLy*uhwDss-*z4yU*BqOBudAyxU*V&-ySm^0h;pCn z>iHWC^bC+Y_vBrLR){jXBsYEzL`eBu?#>Ijx;BC4>Uo=Ud8Ke+%M-hrq;v^>&A4lRm5hejadRmV^+w}o8xPYqwM;kh1$?qMe zoI~;f5HOmF=ryg>$dv3anRZ_3&P8tTW@%TtMTC+)@1qMB zzb#?AqMWCtblrW8*4$2-2dGUY+J*Efze=W`SVTQ4<*Q@pY`*T}c2P*z-6h(Fvmgt# z7bVvw)%I%0K!3Mp;wYql25DS9r*i)=?kqyKvXrHKJ)N)DP6FQjr+<=PP&@a*S+!_s zX6;;E7t7Zs7W9Z9-bZ4`+9GK}Mk4Jn>SMRRLb>d!D)dfk!J)1|lF80V@i^wC79U_lljM|NK zwFBhimZ%?TX^bk0AA#gX|38DZ7gO#xe9C?MEY{w|QqXGGlaw8ya3C0nHmuwkEmd7A z#wb+k{P+N@2$m%K_f_*L|L@UQ(Q_wTIE6JlOi1s^JO*AxobJ&>C~9U28OuBz4?fmcevTUS97T>>N(woI$aIY zNf(6y82T>Bee(vLuah|4ChDtu?&n;b!pPSM>79&UZtdBmdxvUs)L5YVO4Zsum_nev zT*Tge1o~YY?qn+#aQQv*0p%LCXAnh;+OsmOYR?!XGs~-r)BgY|+ru40HGFpgTCZz^ zo7?bQme$VF@azy~X{(w7PJ9rfKW0#D_2cZf6Y0A9Lgu2UssXo8%|O_6-B0x*VWK~s z%^I6H{T$XchSOu-o`>|F9jjI(=Iy7(>o_Srub^hWjz)&@NR1~~W3=~Mqy4F1D}VEd zc|Y;{j1lu{_!XmZ`~!^Enujs_2Jp~s&_4#triVN`sA{6+kn8i~j}bRf-#ZVF#WaTa z{p}6y(Uw3c;u)AuRjekz9Ldo&AlW5j<2NB0`Uke7MDdd&_tmhRI*}H`4)`w*Jf6a! 
zyKQc3qWa1+xVbEAJ+wd`dZU+6*)fL6Sh2SG9Xb_wy&1FM0BTeP0*+zh6Au7kZL z;HiGe#~Ck*w{bMvi+mT5J^iZA^VBsl|C*QN!S2`51@x~@nvH%Vel|0ZZ^+q=cu#*A z4CH%F3yqXAj~kVggMRXM@Gut1?3MaSnnwKvLpkz~_;rXU|Ehece)QD%Ln=8G~e-UbejtL(?=B&Xk;>&x)l4x08O<)7vG*>0XZ(ACgwgjS1nN zyj3megU;i)O5o7{)R!3#57uy>DZ%)ZGmV zCh=tdCDT64h#_K?$KAjr>E)&RZ^rO=eU(5NsCJ=5_{b*i%Eht9& zM0pmFo__K}B-$Dt2%x;o=R_~d)a?cSeJR)U8}jLomhrfLsuIoi!Eg72=jrqLRe43i z$#(tz`-$>LzjOn5d-58ZLu$sfk0ZONIU^s4Z{c`&O2zjLi%0)5AMv~dg6^fQ3G-Qu zwl^^e=@DOoc&_z(^3=R=U2Z(0;~~F*}T=lH+9hvG2iH#Gi{ydCN}BIHkue;`61hYb0?0u@`~qbfmn+)Mv*6YC=_fT$f3#dR92o@i;{^sEa%q{?JD~ZH z+YpGaU_CuL+12|jY*T9_(NkSL+xU)F-=l>cL$rF+cA2gp z{4#7;(jT(k*sIn>XdN>0C(_wn>gv6VtDOS66fl@tSexTX;g<{H=SQKIGyN^v}4BpN(YNU;2H)@k-Q|&Apcp z_YkcAIdO;Zqt+pKb+~gCcpXE$#=hYd#Ie7joEBw!x&UmxvE`X-PND4g?!qmbhjt?C zR-Kmv>lixqOgzVTEx>hqOUDqcqO6`Ts~es9tU;~He9UaM0&+U8<01TrKY6{c>jh@; znqc=LTx)$*o4<7%mjmVvy|Z)#`RQcR-TQq&VmmLCbUuowRatDkB!1H!sQe z?!i3NaZtPV^nT2j(A4?zA@8K;Hs6?wxw<#@+(Yr4gWjL)at542w^Q~(LX6M7BG3{D z>vzxY)`o_*a5&(1Pvm#e?(uj9_eBVA#*h&DyWR;=e!`wjXqT8-c}wE6=gl392S ztY6>Bo9hN&$k)&qT{~QZFBA&2_}!e`7jff_Ao>gUYFsuzC-el-4=7VvZZqjwy|S@k zrQ6>ch`6cSh_r=5t>LIUx)SdUE&JPAu4-vr-QxCz8&)<((XF(F18#_HYmrn?k?fnS z#K?cj^do8DCFPMWwi~Y#+!2h74N+);xps2l*GsU;eK)8i!cZSn>ll^_yEGcYjUgJa(x*+4?sI5!92q4c81<+b*(CS*{BD0EZ3l_C zgW=X@cM4LniZ!-m%8&NseD`v+@Tx`ZP40L3z^tR6(YVt+V$26nCQ4{lAvUFNm zxs_?iB`&tTP>19GYRx>4IX6KYkGcbEPzia0<7-C`>06GPIkNgVwJrX@8u!`mMd;oF zZp_WXfe1~JsC!`f+zq!xo7QsUYHSXvVu#7Yq+M+EH>=v3L&NOJ(7@)m4%=14NntF? 
z8F|ZKIc+*>_O-w&)pDC!TSJk_$Tc-DZaVViJYe3~0ZZLdEf!+x>hp6qN4?|hG z;d5UTRq3^K>vtO#4a=}fZuAc_Bn8|gur=(%C?&;CLxyhB>6&t~pZUCU7N+bHl5!IL zRkG>LS+o%4qI<5C{`fchr_{#4M=AOlyMIcVHbuH*|CBO?Dv%Ui&%e_@rPW}LtA6a2 zi>?i+F*HpbPs#UTT%b1Yi(;^7Uaq&)QE!q)jHCOxxz*UDhY=>&$Q>hEdRquBh-MOY zlVpcb-BTmrIn+l`A2y43>~Z74sD+V_aF10RR5g+3oA6=oVt_+qY>BvMCo?)3TM02&@`bUr?@I}n4(}aDh{e3*bIvbXhGz^; zP?NM!xV62J24ZABIaRu3{`^{9G{cn6{x{RnghBP;m()QO=9B9A(dwgN)bNfrUDt>1 zNlyV)6G<~s-rO77{^<~QrH$)6bhfLvnP%Uczd%*O+c0*6H|q~Wf8pm3*^%=}|*{*N>BU%^Y#Vfp=0X8wIP|E|pZ zPMiP3%=|?*|IW<(@izazGV>4Owdt_@{vb1dm(9N;Gyf)=|NYGT2Alt%nfX&}{`SoL z9Gm~W%>1YEB6XO5ZqLlW!{*2<}b4Ow`AszxB1`9%s+^?q{H%ib7uZ7n}1Vg{!KQ2Yi53f&Hs8HauL5BiDms_D9cRPwerFy|GUg#SWV8nn~{loaph@-LJM4 zn-MXNh+T2qWmkAE_gvw*Qs!TE>idb>jTi8gs&3;15>UJGykfqe*ZueCQ6AU*)y{dP z(cG@Vwi&Vi;Rs^rmjSVdr7#l7w7B?MOp$oLXe6tP{VkF5AxZh>Bcv8V=YNT3;r!b9 zbCcf^sRh=0T41gBMd;HadRjLYSotXd(_DHiUn>i+cz<zgk;YW@||b##f#{h zi@J?x&g*^^`(P1MwQ=>>+SvQ>=Df}8vb^0NxVrub*xT^Jg`3YiS7kT0$J;&V>iRqI zvb|MV2VGkqo<~I5AKAUv)m@0$Z(ZzlRLq3mj;xElt?%n%e`P#&#n{eWS#_~*II%18 zX>X)14o^Mabl^Qfo%-UbAL1NOXGK=~m^$@+hG*juGP{0|x&dAQ$NYA;<4_s>(9ref zt-w7yNkVnm2P$Kfscw4jAzxGt;FW{=#H#E?sU}|ex+*>zw8Z;Z`M!qj>AWDT{RsJ7 z1lGWMZ|t21snfVjOq06hsb}*z$IxSjjdKgT2ct(Ivt0~(JKoC2+u8xVvBon0>w9vG zzcFTv!zsNTk5H6)dr-2XsdYJ4jL}9)_}O^^R&6SSZ7uv(ZOnwoEbQq*Xm7=G>V(Kd zJnG^>Gl+d2(X-s1$@J)(yq`T;$IO}J{yHc8SJPP%J@jIb_wQN}v(rV)Nkp&-!}a@H z+pz9t7_I?Wt}m78|3*4(p%ygKb{#X)cpYO|Z~Hy667Mk#^W`4aH?{yzc>P~Zr%l)L z6F#&eMjEd4CiuG89rOmO22w|{da5_}t~d5k)aSZ?_9>nFUFT&**L4j>&pI#9)%7rH zSJ_@~>^1fLOKn?qg;Gy1y8@NkS%1yRsK>9lx;sJF4R(kgjbrbq9^m0Oi3d(??A_Yf z0eYv(&(~^{OR{cbJtD8|Op7J`X6)g3n3`*1KZDTTd`{k*>ia!%ujRRP#69{soQ-(% zd(?=Gu<@e8?%%bog_~j@iKjNs9e4g)Xs_*8o&R^7TwT{Ad*{=h8CTc&Xt3wJgUj>x z(|ynRl=F=&T6H0_xC@y^8a5WdgF0I@buwE$dlUU3e6^m)cn^6Bo#8i6Up<=a0KWgg z-P_n>R8?H)nsjpU?yBNrl#BuFcc(bWM>-zST$75`i00|1_73)<%oq_N6A_5xKL z6uo3M=@ZE&BDhF!<}7%3*49y3jY*Wd7xT=}0BvmshfOC3;96=yqUim_g_Lo&iL&9Q zT)_ToxoE4nXzP#zH*x@Jj%qJ6t@g5cOA(%xI}X2wM$uk;ew^Ay?-!|Us4R 
z>pSxA=?hRj$DTz8!{Ycr6SjP*>YfX)FThy?b0CDO4EY?D zbTQ@Q^5Rl>u9DosbrO8#E%xj=WDOv8j%R=xjcQKpA$;AW-a&tPpq51!Lo^DR%b2`V zT8D;IiRQGp*lX7Ke6VYGNR1zZe9$*DQuUI~F#0n!KQ-qcY`{hr)dA@_Q*<6s!yz$W zzIajhP}|zhPqX{bML9d*dxxh;uDJ((LpPWzRhPY3UG{2q*j6NBGv^Hz-ax?c<>9oxMR< z%iEsV-*CFT_+!r>a53xjr?PaRRL7nUx|;ruV{h3YDrxAT6RhS`iF%-G^WhIXov(~` zb)AcxLQR&diz+Lwvhls1y2^2NeN$cKMuUhM6`h9PJXiPEkZEcHzYBU?t-I;!h0eh& zoDV+{bbb0H(BMP3t0??4w14_E5#kBzVWFaOD(Y{U`Uz2=BkDtn>S}#in7cJ|FbJ8i zf%EAzMErlKzG~)E_!JwiGweX_I|uWFD(|g_$;>BFFpm-CW%_v-KNW?asO%xPhmqyh zKLCW=#AWaRuonn>7C)b2u&OBhy-ERJzeqe1bND6r*L%s;xiy3uV`&D`tiX~czUmhVwSPce$4dgeb|onPT9P3`qPYOkF~ri=Io zHS$y`o=+DsC=nZehzp}n=zIqYdn~HmQcfEiQ$$48GI-4}cEryh?fsOD!T!ia4E78M zXQcus5V(PvSue&fhe@u}Dh*Mt4VPkOh=1e zDp&P~@$&$0F|(O3+1&6el$sgw4i@@YyB+atx`^LML?@+)FM(mnc^!8FuT!;pn0KV1 z3aU=P72J<1Nfn7|E2{NV@d#%V{i$irGOhLhSa#0U5j|D}eI?5o*f2a|!#Xmr5V89C z)X=k`ogxon(QgF!5`kIo#h=9);|Gr-FUINpf!Bh53%hr+%U%|2{N zh(vrVuodKVy+ZSq>V@;G>)dlLnzx{~&a=4oqWSK`q7B|ohZ^zTeTAFEmgDIgn?kTv z1wv`tQ%hhqUIm+5*=D&7dS zxyF`+b=CFb)!S2SKhZCzX^S6cH{hR)jU#IBk>1KjuhHcNt;(-1YBX^EJ3BNPF%R6X zA9l>0GiR22;@tTcPjXk3Oe-mMmzS28l};<0>Ylg|Z=t=|3&8j}Wu=pl5PuDUL)JA5 zovby5StpGhoxcUjXfA!S#Ffyws-TdtQo&A;^EiruLb*(oC$kMuJNTN z2eRQ+x&9SRCNC^-XI}~v<{jx&5KfLopibSHT~K&K_UwXjn{%oQ+?#T%3MTf9swyb$ z%)5F_!9-7i8&ROlF36vIB-wtRr0t+{T|wcEl=g<4*#+*+xw8u z$HD(SR6haTPb$}Px&2!7Pa)fv6l}@8tRR0PE}s$nXgYfe3VX6W1>-t%<`!(v&Gw8b z7zfBhWl%@y$oB}7F>Q6c3GRi&4QMKi^aAM2>B-K1FCk#|kt-9|H7IV!i(`Svqi1|e zUV|3rHN<_C5b>2G+cW`5&%?p>0_XMgNMn@^+}K0@?aa##S%t&4AVtbRX5~8X^0Svs z$kCO-ipbGU8iAA3Mur?{NELD%U3-(U)DE%@oZFg2=Qy&tIZ5ZM9EU3M$dKb4F%krl z<52003^@)z|BZ;qaSBFY1mtk58yS`396b_bKCM8E3^@)zZ;pt_aj26R8FIKMNrj&{ z5(JXtoRkJXWdwdUkUWQ5kcyM~OE)>rX$rG`a~!vYIL?=Wrvlb_k!2Z(Yjd7J@=rv&QlYSVE8|7t0?M`a9jSpjGu19 z&i#y^>Fl>+)ZFt-3w4}_g>UEUQNfp4DV`8~9G{bsY7Tp*v)#g95dJ<3-_Q5cM$AxN z7u=l`%~x*;ZjVU+5WLdj9}?UyNBR`l!T7Sw;^%U`oaTfqe2n1hExb_hEf!uR_7F7F$|1BJU-xFb)FjcU#lo z-%f*XPlNv#@ME%*gO`2|e~j_u;LFeA(EK{%$@x4#hXc<-znsd>BY+=6fAUU(eg>Pu 
z_^}#JJex0O{0j+O4_9j$PtND{Gy1KJ(^I($`Z;|E@Kp9ZOnhTL&(H8k`g9up%W3d8 zNlvy~w94~7-C>+drFV21{OC0JDZr1(9-olUYXZ1CHw}Mw8oV|Qz90>LaT>fK4c-R) znCuA&JM~O@6XVHVo7WD&`Vr%2CHTBffV;;TPd*p$^FHu*7*Eds`PrRPCl@ra`qkXp zQqqw4Tg%v+MSu3W1YdnbOG?$JJoU1GSTePI`m|D~34htLo|n>iM;D<}r#toaw4tZI z9=B5}W|+6rr<4!Bo#~X+%_*gyGWsc}pDFY+m49aN?{xm1#=neDVe(W(G)_B>(@y79(^piBnZ@stQh3!TbuATrt(DpIdj)Y)@VNMKv{xsu$NU_RPlHeJ2vd zVtCjIt`6h-XeY3yF`B55YPp^M{5SvY?-lh8^i6i4KH_T+BuM%%gbRiKJotz>`RmRU z6%1hmVlpxe%qCPc=K3}YFqVFE-&!19=uf`4HlVyh=p=9o(a*YoPPV1}`H2x=bn|MCb_#wehmb1o>lYF`@T+d&NEqt-?CtCO#!S%?g zCBhk-O2Un=Qo{hWfvqw3b_ma|>oZV;UKWP>iRZGuy|Z!+-h z25y%7^)lrnIcB;4NcgtiUkFb8G8|ej{S{jZw%&&o%($tScoawutr}`M;#THU9yOki zYYLQZD-Mkp5r{x?=vw2nPE3L75oJ;1y1Yov%?9qlH3gDGb1cpG5{N)E=H%?yaV}z1 z+4D_YYyKsI+j=iE_@>@~;I{lo8aZpy$cYI~_D{v3?b#x@E&p4BlOIg`?-1O!|1N`X z+OO6fmEWd;qxB97-`2awkYnn7T5wzMK7()SRqKUHFFn<1y$6JE>;2G>W9rS33TNxh z7u>EVYMoE%onh#83*XjzrXk1FJ0*?Y3WIOvSFO(}y%mOD{d{ffU1G>F^){r@yVBsB zdeyp`(mT`8tDn1Vy}gDUQ?DMsY(H-^_@>?;2~PESmZA4v!EL<{7;;R#y9BrOs`V|^ zeoej48+y+*^zKii_pfR69!#TGtxJv6djw?^flf3Q==!YInMU?M#|z(XzvBhB^Q+dA zM(Uku=%w9+T5qM`wx8#w(Yq*(UbPN1Qg5T7_X0z&TGttw-_>dKcBavLy}>ujm)376 z(23pw>iljO+|KV04LN4Lx=(Q1&%FB-a_xTOcfzMrrJ?sJ!EL?I32v9~D}vj4-!S-Q ze*Y;rojitKEQWEg^^O+Y)_bhrw%&0D-_$!vaMC;5&|50Ft#^hY$Mo|Y!EL=>gKz4+ zRB+Ne$I!b>a9i&RLyoC8BDk%0jlnnd#snw5RfgUzg4=p&+c5>(&vyuJ>(%dZh;Qoc zH}qB`PS@wbG(M#)d6l}fw48EzCwzyNE{MO*mdJhP0=l4THj_Kzd$`FFBSHI7) z^Lv8uN$*@7TCZDhTkn~I)5+93MQ~fMe!oW1W`1d%j{@nXIic1|>wFY!y-NhAlc|^1 z`6$?W_4_`GHubI*KIyH+q4lm8+}7JGIGs$rHwkX*)$boE+SE(yh!jZgg*dd{dj+@k zJ|H-qOuf4VxAi`1@J+qX3r>2!V(8s3xUKiEh8$DxLBVakhYY@{_lQwYhCq7jaOnIN z3U2E?UT_j;>K!k*t(X3%5C!6!dS?nwdgtNLdMgFD_0Bcqn0glpZtGoY@J+pqf|K6) zhTf3iw%*l-98+(n;I`iD4Zf-O+k%r`dWO>Z-7dJT_lJfYQ}2C(+j{#9zNzp%gd^DX>R7>Tz2gkNsdtj# zq<0|>?PtCJ!PYy&kYna|P8z*lgKz4+)X=-g(7Q} zfZ(>?4-Gk{-W+)~Y3t1w-1hScf|K4QhF-Vew%#)hIi}tzg4=p448ED)TER*0QbX?o z!EL=u3^}IW2ElE;D-FJ>cdg*0_fkXedckeIy@ni9?@fZ+dbb&TQ}2%iC%s=Y^xiAD 
zt@i;#j;VK-;I`gJ4Zf-OdBI69ty$@MwO?>s?_Ui$rrv{s+js@WgG4=KeZtLA-@J+qn6`b_e8+yMlxUKhYLyoBzNjZ=lO5-+o6rpkYzljtiI)Ur@ zXTBF8Io}n2A*V{p2^sZ?qO|-f!N*zrae|xg1xWr|(hs^VzV5G!E&OKTPqgrrf}8IJ zNd9?(ms^JH9ZPCUwR&6z84_*^#9H%RPt|P+s2bkZ&pONRrl@_j_ zhnpv7~H0uiVhP+m0tMFJ5hPsG$e_f?uw%_FIawmIk!lC8dD10iHWd{Cj!HG|GK=Z$E;Kb4RkJI2g z4BXWFJHbh>4~LfXw81y^&N1-s82pHVQ(e+>It~0b1K()iw;T9)bEE${&s`E%8+l` z`3(c7u3yW49@i8|?=3hqzQ(Y}#Lpx$0`YIfq4~6SO~J;OF{=8323%`?v*5)4rh)ee zPX6&5IMpQzHs5UT0mN(hW<50J>-t9Wg9iT-TPix8u%fb$hh+R-~!Vx;@(co?rNO zzZWv<{~3l}`kNILNUz!NJ;$i>PY15G{T~|m1_L+ypH2gJ8~i-Erjwq(k^g1N$gy7j zReHzBHJ$thew2Z)H*lAMoAcp91HaqgA8p`+27ZizKV#s>8u%LqPIFBiat%M9ph2AV z{F?H6vVjj8{8J5lpMkp#Tz#ja{PRTvUrd4#=ro?M)X#PUpJ3olNn~wv)VCbxXjq+B z>Z>UJM~G8Ge{$a`!GDBvO2VylQ=&IXf@kdiCIMw7*x!r)t>-JDv}uUW@W_mtc5Rg@ zHt%N8n}RLoSvg5f9G+XvN$w5)9Ga8l5#IaEjl>zme`eZLrI2<%a*7gD+%#L0hnJtC zrr4mIR&0vB%62)UF7f{33_j45)Fo=f4lhP?ce9d?4d^4QtVtVDbSeMnN(NI?@>6NG zHay#2Nwx7ggW6Q|R9aK*hn6kVDLj*kExV^vurq9=*10pj2ANKuW=Pu{2IKR9kuZHb zQW8baRKj(|`t(G_r2YU!3u}l(=|c?qvIYOD$|=Er6$zB!Z?K#biQ*5uh_3=G>x1-f zP0U{N-@ZNxDXB|0$3PNRNk1uqbpMGM>gx%U=~4o(Ok54Wu+!7C4vn+_Zu-ecOESxG zn4wZ`=9(a`^_Z^hSOSc$?fmO?t`aM~Ic8JTLL4+lsl;K11@-D~nSiwZGYW^+KTdF( zH>T1*DDmc4O^~hsd%#lVe?O5CJnEGA)%qz7g(q>O(qAeB{rw-4@nh?!d1k8o-z)jo zex&>#fkW%p`TqbIsgmgcu2eW6K&1xcOuEeQE^bokuUw{B>m?tg|41CR{u2>T6ubWD z{a{r}OyXCk7qJfQf2C`?{;Zb@t-q7h0&M?PA#p1E2Yo69J*!Y4`wMVr{n~%D&O`cb z`wOXIBQ&T};#a4qG!*WV^dsz-^!mF^#U5GycOZQ#`zw~K6w4$b*-!IYtzX;!=QR4= zGU=?A^!jA`kLIJP^luaWS`X=`d9tma_P0XBi1G`GermI3u=UftH-z(|Ien$Gs(%AoK3j4{Iw*9@L->!dyqW>albVktWDbEz{ zPGf%|Jt!jR?{wAO$o$hj*i`=CFZQpIhaf6{s?u7&t{0Drew>DdMIT2I{;#LeKX{!=qV&1w~2n+e|L#~{rs$? 
zCDUvFJ(0$KXQoP_zrWV$ZTo+h#{PX`zy98Y?4O9kw*P}P`rVWOL4W71?ne6W-8B05 zuU9GbK3@_?U4pH@7?n3w`7JnKrN3S~O5KhqztfODRsHYlQVG^edeTpGOt2f6nx@AD3KY z>wikp%P^48YUh6t=_yY+IGf>ZNpB0biqm-^P5J|p-V)1YZR+^|1d@4-3F2C(*ZrT4 z*7oaj?kJVMR1@U9T^>B_=Lc=SJ|89Nt>n%V_JfNMbUMvHUDA*6zog%1Mds6}BpOOB50VC9NkN^Mx literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_common_aarch64.o b/third_party/libxsmm/obj/intel64/generator_common_aarch64.o new file mode 100644 index 0000000000000000000000000000000000000000..60e8e5ed6433244d8e7ef010db0762d355559720 GIT binary patch literal 14736 zcmbVT4RjpUb)MZ7M#1Wia781Esnv9aAB3r>QV$Pz@&dhXS%G;F=R-5`H3+*l^Nd5@Xcw zd-G=Y?P|wR-Z?un@4ox@?!E86`=0D=(Rgbx5YXHf(7vK&doEPd?zkcQ-7G$vwJWu9 z&1hd@eY|{Gvp#+@b8JhaK0X!eIcA;t8Bvm&b;7WwlJnx$yT(3peez~JCPHNul>(t- z<*L<2*8zQAGjN~WhB52JMBr3;WpbV&y3zXm0ms_vs^sBW2I9Zd#>F^o=QWqNC*I54^uN{uuD zH8ZUbe-jkx=|E~_dODb@A={|EKH#uT1|j0VCF>wrTSu*l?W(?g20sF06%2aH&z)5;x4wg3S7ZD zsX^!0=rAhj8JDpVZG1-``5WnrR=Yh%z%fCcC0J~mm7X^y#UKm~VkKl2gNO76*D3?H z08YG8I>YrUlq;Hn*CEu~(G5qm+FNhL+~103exVQFOoLkQ8R?&f^x?k{Mm--$bsFjO z!PI=tUalEAc`*UbTr3=KnGmtXUdZSZ7WLwShRRObTLVST8gQ*YNi@)^YarVAwm$r~ z(wc2{Nj9?SSrHAvQjsmP)>JMy(wyJQc6t#)JTg0xB}$ zPml%MBX!Au=UXQQBH!vGYk`y5uMacfq(2IzdeR>SQ*|Or>=gmQlP(Qll1Hr5g1sZo zc#4Zqu4E@zN*1`f2>inW+ht*-f8;3OtkyPr?gEFAbvhZqJf6?fUt+^6W+S!;ePkcR zQ4Rtpm-289rO*iZY(YS5d(ivEeET4dXc>XoW}6Fw0uwqM5}X`#p}dNTh+GhIRR-Ia z+?lDEm}QaRD-tF3-Vw`7lz$J|oJUq`SYQ^@BsBOhV_z}nNe?1h3J(f?Zno?NwaAwL z05s&uKx%`?mN$@P)NYSBJ`*hAGLN0(xvbCJOo%$P}|=e5HZ z!$c(1gp^32b+bI_E|o7+>KT2Y#M=mf5n!ZdxFeATmSAGq>R4EfuG1Kz$(=76l02N3s)8 zUBu!6b(B8rt`ztdD}`iIPOfi-&0FmD1p!Za=;AP!?u2bfE-~YBr2q>y=Rj5w6%!@L z0>|c#ABjmz7si$=yhP79D<#fC51BA^6^pzk{2C1FXHZB(mcx97G9p2n!F7?FkZ4OA zQ%um4zhOlp67pU!5k#bB(gxi`0QA}9?Ucc4{Ts6`Sg+L|TV_3f?w4?BNB!$8e#f9Z zGjk1o)uRtZSfxa*qfDvgEQ)udkD++x7}9o;{>XvA0aM@mJo0$d`jI(09QiH@7(hD_ z2BPv!;={1sh}M5NhCQ1_aVsF!$@DvN!$w_{>P&xv?K&C4&iIP4ZyHQu*4gAXvGY!Z z>UI|?`!&KU8A5)lgqfd3y@b)^Hc`~Lgz7eo4Y5)KV|OgMP3((gnX|Pso_?wi{}V(f 
zL)dD;Jxc<$u-}r#+p;@Qj#dU%CPUcE9Zac-tT4_B&vha?DuZjp#9mezl8+z|XbA*n zWr*{Fpn}|>tdnY4s8Q3*U0Ak8`nfs5qM{2+1x%%H{a(aaC)eZyWkZRmRsSY6oY^}Q#p zgXldI)t@*YwL7L`ILK6(7;BjSbPVk4Tpub?;mQo_b~(VnguFEWuX{^RECs-Bc>{+|dDW-4p4zquQ*24GyV*_ z4Lc`SvvnegP&&)7bN4t3@d{*b+b^pU<3b*=k6_=|nsAFGxERR3!g20xK}nT6 zE+!6IE~aA$2VKlkVnS^-YxA;WM;l_0R|yroN~qHx+ymjlz_J!T3;VhS@(YTASs`Z+ ztuV54q2M?T7Ybg9yk3;Z@>KJ6%-g9LMg8g(YQi{*)7*1PSEaMAgc4p|3d@OUd2YxS z)Yx5xk|bbAIKpNUxc3X=(r?`Hu*DX$i-07pEXGav-aH4wOdvH#kw^8BEpE zp@^WDWI=i&>6L@h@Lt4mw{haQs0xM<`iQ#s5FN*2GS~9n>S}UU7wqm9)edV}RC}w7 zH&`f>pOALPQ55Tid`ZdlnKc3AOS7BV^#YF)JBtPL5FjzJUK|7v=ld`hGcMN)uy`Co z&d4&BnBIDEmN}|q?qA$k=5nKnAw?RW=eDTjgLzOigQhV2dZ#P-39Ou>Tz zTfZNd`@+#Y5jAgr+q7e-G2cWn1_H#&3$hwR5IWTuZf1D9J=?3stSOSB76T^{A!;pF zLSfkAGash@Sfo4~>FKnnXb==`Zh~Bw7$;PeNTsq6W9kt5yHhBsqH?RS?*LJ(Ks$al zPJ==Kw0C!g83JStb^&BFS&Lba8)Sf!vp~1u@`0>{mB9Llt1<7@Sb?$9a4N9d@x{WX zTvT8@zp+x>(FJf`{e^Oh^;ZKtvoO`eN-BF4bot{b=#*bKz-uKa0U~;)QWjqkr{oeu zqV6IK)?OekRC|p%b=UV$+zw;vuo+6g@)#uAIAd8a$Ft1v)Wus`+i) zBwejx;)h@oVIk(RsfD1@=b@YgpSa+2A*cT0nMG@BY58(Rb=z&r=T|h;-dI~#aedwO z^>sJaFRZA(vv*xZtUKkvudA<{FVxb?oR=2((>t}mmeRmgC6^YDA(rBR&y`~Xd@l}{ zJ`^;=qcf7;fD03OTya&!G8|#3~UIG1={dl3_vg5ar9TpzKQp3 z=--96fpNFLT=s9585@IeN>mxUWJYa?6@D=ZF|!zyr1%k1DbAZ!GtI(Jg8mW1wH zSLOBHsd;n%Yapm|@0JAa+-Vb4RWeZf6u`cxIvG;+EqK#dsyEnoHOSY{)96vjAhX-eMF2yeJHp6wTxj}Ku>~0{$K-eV_Yhb4`)*xtCuvGJ1Q@7AgCY%-=hw<-$P{tL# zxB+iBuJYZ^@r!zq1dJcLeJ$SXyK$8tsO+l(+&<%%rKJ}B`4ys42ISM0liNR8Au z^Ww@4Oy9WrHcVtPp3s(u@DTPwUi(HU>gAxNUJf#;)r+uoFKkXP(pgBWxqcMUDcW69S&~ zuULzKYwLSA_HOP@4Q@{K3=RwoZcKD{Z|=E!(Zbput$)MXEkgqXi7d)cZz{1VvAK8s zhM`pNW*0RvbYHKBEXflaHV&mW-`kVgK$iXYCP=zrU|`Y0i(-OPViOeEbZ;ulGAqA- zaBx%N?%wWoS>YFD=V_X}cVJUuZFkQ-iT>`jz5O0h9_zlrL1kVZmfxJDqMTR#^%uqN z8Qi!okyRo!co99i*CnoB2hG>^4?dv&*Lud54X1RXFd9*z)0{KJXv-z)$+X zL&z_M_}BQr@ArW}?E`<$2mVF5wdT(&<^%8dfsgsX_xZr*P#{{q+&}VxulIpJ;sgJI z5Bv=u_@%PO;;`!XGYb4ljJa`Q$SsukDtyof{+JK^WgqwjA9&TJ5Qvs9cexLIj}QD; zKJZGp)z9GX1GjwO|KK%#ivO?zSMfg;2Dt;H+!eC`QG9%Lg7-?5W~6_`v&o;Htk=K9?$XQ{jtz;Ol+hkNLn4`oKT%fzMOy 
zuj;ei2hM+l&6aOZHJ__|%6#(nSA5`i`@sLw2Yy6>tMPNWlDAd(6^h^0IOIPncITzW z`PUV=ivKSPT*X)G3Kf5r5+^GD0tK$(Kdiu2{i_u}s`#(_;8!@)5+{0TDgoJ~z@@1K zWS0V$Ta$o1tH9;fAt3KdxqQjgF5u-dP6(H&SHNRZ3BqR!h zcE3`+TT)u>y6#lBR=aj+NUPo4+uvPFXzl;&Xm?S%LPxSsC&)W!eF{P0BZhM#`dp+e z_%BNMDQAn%M=PY_&@<4x5q{Z-qh>As6mi{93g=R(PVe1`zRleOy{d;l*Y{PR;!;y% zl#H7BDA44x^hSOkUxdkJauZ#JBM4PIPI z%}&>Q9=gW)j7bR3RK#=FCTnN54fqw3|F{^Hf1{D#eC5ku2UsEbr8Ep$o%7NtpL5KY z|0rOE%&i^?K6q3J5%HPgG zG|r!SE?@pE$SEjaUXi)?W8}e>KY#wCwnFmXzt+LuCFeo;WSK9Y|F@%%{5pBCckk64 zZZqY7*+>4)>l}&>CUP{&=WSiS{IB`QzvGLfl6UFOr+DVi|2`l2rxrLw?@L7|)muy$P1pHeqvjJo=} zh=)O6$p4daLf@5$oIh`|U3z!@Q=fd#K0WvJ{M-kSw(k%4#Gaow Xm_L7pSssd@+rq?%c07+DudDqZ-Z-&H literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_common_x86.o b/third_party/libxsmm/obj/intel64/generator_common_x86.o new file mode 100644 index 0000000000000000000000000000000000000000..96e1f116d41a4cedbf12a5ba171b045ed798dbb4 GIT binary patch literal 51272 zcmeHw4SZcinfJ*@OMr3{yR`(Z67YhlTWZp#B}i%S4L8j#P212WCD@iEO;ggCHX%t% zkh%m5rPs93SG&qCsKu??*I!rTZWW4chznv-z- z(iHdg}?9j4xwelh^27 ze_i2=Ii6RytUnPg>|YrzO7z!8i~HN7B?;V>Up=n!#z!~4FMn_-k-}|?@DvDeQKKb>oc`P{?yx} zL;5_d&m;PLl+Syj)QRMc=>fu7w4;Ng){Hgn+HN*>mYf2Yf&-|$eqm4R8iA4^}P2^9#A=;@FvU~vB zqkRByfm*AJ+oQYH#h&PZy4V*TQWyK9!|LK-bOaY9?kLVjsL;tBJ<(WY>S!|c6K*iw zf}B^GIugM9qkT-y_0icQGJD9%j;C{b&?to``frGqz!e}Oxnn;YG@}4SQy$RtK}0A` zETEw(XV4_CzmL3b#j5zq_^S97_Zp{e@S3$(<4U`A1FAtOepx*AMCv!>)(;N-Y8Osf0a?5>1iuuhq-@I;Y%q50dG2Rhri;ayj zX&3{oo)=ps<_u*D|sUy{?Kaj-*1?Mc-aW$B^!`WHYsh150br=t&l4=Rd1GZqtrXoWp z^U*mDL%-=9@rHYG2me1Yo}-tkPK}EbxuDe6x;UO#ZKRbP4@`kBrW@>iT+*DC!rwGe|vNU=TuL$08`mET=pO2 zSRd=7DgFDShY8sg-AgppSXW`W6h#O#gnOF%D9Wd$R2eU<`u9a|<%~Vi-F)810Z{^+ z#R&+zi@}4@VVogoT55B&1hoMF1dfnEH;LmV6sK^o@O|T0-yA*fB2b{{=~cy?AgS8U zb#2#BPJsv+O8Tp#d9aomSd|@rIC)$N+cAk|QsG&fC*m_wNEVFOlDz8ZN=;OD4(^4s2UX;{$Cr~Grhu#%3-4%uPjnJf5Q@l(TorjG$eWf( zl?C|#%K?^wgm_e#M--a7ow#3y=-BZQ<+KtAJ%X$%XUKyS9M^{!7aV^b+bq=!Cm*LMNF(-E1$eX(KR{_des!_r|3M@(I|Dy;{2^2M&D@5W 
z0k;&|Nt9yHALJ-=8fOOWfpVV78zVmv2O@4b8{1*L((}{QoW(bCY6Y&maxUfL)e|Nm2gB{Fd zR=h(~N1mEgDC|+W6xjw#3XXR0C}Wd({?XvIcU(C`Kk~qi?@aX|L+$v|e%H(N;No$; zIYz6e2YD-*mQaegz7xT7Ie(~F>OXnB!o~+r;&M!Vkp?Z5iT@T~Jik&nH-HFqlL6~}7d-At3-z6siYN5o8zSNb58&J-@w!2nRA_vj|v!%vCmNmif6ck+23KN|8MHx%oqO4^6v0Rx(BNS}xK zTo$E=3V86fnI3#$9gOD)iD-;}YkWs32%fz(f-Pg?njSDUYFDv$uJr>Z1@DjUM(w3L zXm2suBUZNd7L#2B?UFm0`Z>8LWs^e}QaRP*Nx$qxjP*o|453X>tO;Ua1aQX@==9wY zibNuRdxTb3chQtj1It6HM|ekOFt6ma#=2zcjb*9t^SITPMazt|EtaQJPti)`*NR(F zd4nq^tJObO0K;BUJoOYzn|cbAY6Sd)S#!rzPcj@oa5~hhOn;Mx7Y$Eh7iWcGT*Yfe zwN6y4L>-@`9-=ZohDTmB{2?0K@&c`&Td@|#+7M$fmVtfHfZ@227IPC2wkLpX14d@2 z9)M-UBrXB(nKWQgngDw*EdpUz2r1wSsg<7!_^8ze{iOgFL4}-41CFYTztF)`tkg59 z=Xl+gZY?siFe=D6R^+7y%U_9-ux9w7+R{?8{S6YmWYl{D1(xRYWgAQMOQn`=BPcF~ zw&{5!bqZRqIQ3lW`2?O3rG8?3NG#FG{K>`^=U@L zU2CF$bF?0J6S#71S=1z{XWD3cL|cd!gV@{4n|<1dun?PAZz1t&QW_`-#5oRU(Y^)f z(U%p#tA%NT*V|T>GEVrq_1h>v3WwD=wwX+;aABsMJMJiQ8z~p(dVQ{;Gpva>B+|Ay z@ML$5nGNu4H@Kb|Tz6~;LW2!0S2oCd>c3-ppJ51N#HjN{Y9uoh;1?OJw+jA@(LnJm zvSNPv%#bkQjgA#?ui_jlW}tT(`Y;gESxq2=s=7npwJg$ic+Ar zm?$wLe4;_~)qsW!8%pD0gHFpCa?#`kG#*5T$|(qFVlJA(fQIawAqO+z5OC82?_0RLhP)3%oG1FvUu*xNMwL;M71H%(WYe6egT(#OAuHPo={#6P3a7qd_aD0eVf`nJ z#_m61G7>(V3Y?{#ib6I8Td(?^WAdg=>3ZgK?VDw0$7n|&TFd(M`02qxaQ-yAJK zdr@GAon~y}V#^+F1i8gLrvslx!36?5+QctJzS!r+96#{4qk zsURZC=i--9VUvC_TM#$tI;^4A0|^mS8-j1&z%Brjdx_K@4%iV+)ejP%8PP>tN_3Rs z#y@{R@#*h)w}ckb<{1)Hbh48oH~_i7Bv6ZC=s^O{8Sg60!PZjeqk$a+x2cDPyg)#iL3?`QLmtH@)k^2-K4* z^Y8qvq9Vg5^6z|BIWkr^u!LH;seF1iU;+I*TC6W4vO6y6r1BI`RtY|rykwjbsb$| zf_SL1^Z~U}kL3@(xUB!`LM+!|Ol&P&inl_@^mV~fucW^ledHNlG!r9>Y0F|V3?s|y z+G3_0#~`h%Ywbf3#zu{HP1e-44oL=6O|B#|Od3=Dc;kfX_qOo*Ws(2+_4Am_Q9lQ% zqkiGk$5lUi-z9~?GwiKmwBfJdd4(9;CLF-KxASO8U&BUD@Y|kziAU4jg{U4@_}qX8 za$!8wxXrtp-XVzB$riWyhz>6R>$YbGdaVR3=-+c#zCV=li?Q&@l0kn*0C$*U@j3zx zb(}wV1?EH3nwb-ZoG-Y_@-6C+`mXuY*OzLTUgw~`7EX0FKCkS6N0qNBJNz1ZaG7>6 z&->_=EZX~{frX~LA+#CIbWh@pscQm;gFajW*+vM0BtNKA?_JOYhYd9K_5}nS2AO)b zX?t{pSTM;X@S4+J#_`SrO_Z2g4r40`(_#@M4-?tuXcW7D350z7;uDRd34qf`G{=6m 
z(K#FifxRsC5>LHcW$fLRFHfX?zKp|&I9}Ak3li~5RRz%n*3`gRTQRV;N84!&vyOKA zI_dqJ&Cwo;PO785#M*-k3K9JGTd+-xEjrqlq-{x#`26XS7|M`OnRwpg6VTk1X>8O* zjPI#ogebI&X@+**d;L$eK)oK?t9P{HeG=+yG~HFFUbF^xx>X}0@3@YL9an$yL{m^3 zE=2>e<59}!hh)M?2li5~+LRsbSU&L+7<+ajBu0Lu}0aFIT62!Pz2q_RxEQTJznR;rhj95A<#(KmLAEZ!-SXV&)D%L9Yy7TFKsbv2+aCirZZ%cXB zrjwKXyxY}(iuRhpoef!DFblIiP6xHJ$7ytB?D3x2JB|u_@Ee%J%TnLPL`7cB_E^ld zLfP2kjXfyaR97tl$ROMmNGJ^Q)qujy%8}nFres?io#gUHXDFj>$|qCLt$;08nW70( z2H&S0uRY^+sRtJ54{KB)6D%h3xOnX9>J1=xX&yz7KgBFIShmq@Jq4|UhkB_8k`Ol= znR-d0q!4RT4bfX`VuO4`u?MGt2dGuOWsWNDlh z6PYR)RMO)IPE9Kn%>%HG$)sKe>zs#W@~bQoErcl*%VNK_E7%a}`}@>hkK&bQbUF&5 zDly2T$sWWK6tl-0%dA%GQLRRE=LBjqn)a|Y(i3e>O^T+Tre&mQJl=Cq7Kp`sIeATL=5im`sCeU(L8y9Oa-wK0%`?(r} zi=kYBiG#+2HL;T-6lwXiy`l>j+DInT(MYOz%2;8Igr+r#`t)IccHriD&pG3YQcrV9 zc-%=njS*!<{6GPCR&j@kuk^cgUFtWf-%%`l-7@|YZxE|`8!7$|9zHW7I_xkk>SGH_u>1yif=xS(Px1qbIs}mpZZ|a>@I>X`_o!;~4 zljXDnU*Xc1xaOdbYvtkLd)f5En+KKtQCjif+$psW&P}#IH22<~N9L|N^wiwP_dPrJ z-@p8uxi>6(V{Y`yta-h!c=J9}{`%ZS^N-G5lJ&E>y{9}r_lMVfdv5d3?w(u!`nT_l zed@b+e_-LSqN*MI%JgTL55WAL7HN(VpvqjLwB?>>F- zmxsm=e&Sm%9Qx8_-#hey=f8I7#%pgn)c2{69E#joedxJ6@(x|Lt?%HY9aRTfUYR|o zG5+y6zyInN=S=JV{G8^m{QaC)uIZbzB+@6{V?<~@Dws(BNi@1FPmNn7Xr_M#p0@)!2cJA8KEy!X^zG4E>+Uoh`i zx1BL>WYC*;{^>uSyY;qj&8@E7KezlBB?s;C@0k~y za!>8$h4;MpOwK(`9~&ObtGsQnzqe_y^XhX3-&gzlLyNP%bLhDz_8z)6_ol&(cWoH_ zX2IEm*L7WgXyxhybB=uXa~D1G*drGWpZcAPhCj0BqVK=6FM**tk%H|veL@_77U(Oh{{)i+-xj|;za*F1U1wx?^_Xq3n2 zhXyLF9eMn6wN4^QCc#qw5+IjY0H|TWK)mA z&nqpN3R_IVK`HCQOTDb#f~Qf>G@krj1 zQ)7VXptLU=KNXRJo3pDUg*$RCi4<+mt%?+H9hZodY#Dz^WFR+ZQ+A{z9x0ATiYg+7 zAgzexdEO0Rql41xTE;EeRguD*Nyd)c%1H5clCgDsGE%hPp4#}HKr#=eEx<>s79 zvT22H(`vG5RU}pwNmfK^DkAk2k;Vm)STd4a6sf5)x4n^MRgfEMJ`QcFsr``F<=}^P zqM>ijfuXfd7=G*cxYD^TEBiZL%z>ehS>LFRoBk!&5_=P zDE$&>t3b^bL^`u;BfUuhEsAs|BfZr?Dm2m3FeJPFlt}qfE|hfOMYQapat-Tp-Sy`v z*OH8KC8=B#Pqb2yh;-%uFTT+hDPQOqM{0Qs&h!42`XG;q@Qocg)ifq9pfRyZkBK=s zAIS^vhgI)^ET`H-I5*j^I*JY-(L`E%)TYkS{3Q6 ziS$;%xe54qxamA9^G+>0mM%N#&jl!xo(nK0t&Y?uBaJnzU_qoNdoXlxk3sGcEq9Lg 
zKi52UbX?B(97mf>qa84$wOFTqg02*W%kp3GU8HcWPE`Py~8?ZEbE>=CW==In*!$MA{mxe4k| zmDHaQA1FVLM{1IK>M8-a*m!f?0antS7f@D4YCx&LjBtg^Q+6^H#Pj|V%0qMgeY!l) zQlWx*m*@Jxr#OCE7^!I#eI_31t>8{sv&f(okxq@Sh}0~Ol-FiD8|f@Rg_L>T8}wuO z0qf+s{swF%#h~$`q}q+Y~q-TeV_Hur>?pTdXvzb zgx(~a{u1b21ig!(cMV4Zs^;i^`1(4>nO%vME;NVmHv-T3;)l) zCQ=SFEVwsPoVXw3_JbI>A407CFoJE8byUk*=#s^A(;_8nKg*i0WnCg=l|j~=3HU); zKtJ$-9Ta13&q+jzwo<>`Li1d0q1T ze=djmT#=EqCa7++J%y}BaOie8QR5fOb&q_Ilh)>lOum+{nD zVTOSah{jT%DHt!pQKeu(q#k4N%1HfXk;W?{<(G!_ry&%7a|Ay~d;Pj_9KqvO&?crX z^${vv!ciB50d3|~x*Ar3(t_Sc8TzAe4TsuexPLV=p%JC*QbY@b^3L zQU^ZMfzNW_=R5EV9QcI}oIf4S$jjkv;&hliAQkvT&AnnUBks z%keJuae0yBRmxlcI>)Q>ujNIKnsFt>*E!z842*yrZ;=CE;=nI;;L99%odXZ-API7W zJ=5{a<*k36AYYw>Ol6tOdVNsSHwL79)u8cC3tyx0J_}!`@tqdFLF2nEoZ?t2=YWONyE%jp z@j3YERtr{P(@a-BuYT+N(xIIcazghTX zO+RGeQ#3wo;b&`n#KMdD-u4pDo381PTJ&Waw?m^3XuKesEAIcqG+t!kyaq&nDY5Ws zO&_!HrHsGBvm+Y%xCPsNWExeQIGvkm=8rR{gUtiwS z^WK$Fo{wt1eU|)f8sBN*H*0*ih40e%fQA2~#)m9i`4h$;3;%+qAF=R#8b4~`|DthB zhH2?8jnnsUG7W0H$inYq+#Y>A?>ic|$1&x{=VnBdKhX4c1otD2m&n5C>m2Wc8R$;YX2uJs$ef1s`7YpL_4>5~e-}(MF*{fn@Utn%N&pX&v)+M!lH~-Wx$)B> z#)EYPKX-xr7a1?qrPj|qaCZhaHYa8Wy`7&C0AIv-urAONBU}I)tcCe0k zR?DG%?Jzl)I`FGlPO$FaeNf2y9OJ>dLygOx_bB7Rxu_F-VelGKjRns_yT?PRfn9PGJT~_KV4ttVaN#6 zI}`ZC>?)srfX6$}yMl3}&GUGl4piG257sp*j`zH;FdnRHwrYvbFdnRHOguR0RMHzn zb!uG3-XQR>a&C0sUt~GKI*9kKp!-?IgLTkVriQ_bP|Y3d*LQ4a=q;Z;z1eGMSi5mU zb3?=QvYE4H&Puy2o9RujzPR*!{-vK8^uu5oUCpAO68f1e{;Wh_D2po}Gyv4k>~P{tCY;Wg+w0AVEY3NuBIe00rX+ux< zThrxp*_CcWxz?>;KWhH^mLPxh8m`;0X|!4%H?y-?n>t&%)~{(OoqcuFHDpV7t5R#C zBX@4>X=#|*g+I2#jn>`NLe(yJD-dimh%Jpes7P4Dj!7t$WlV&m-6tGfF}S$(RM&}7p!v9*-TJ<(+NVcW)*c9P5vWkVC)=G83 z3$Yj(n%W!AFSmPaVBfLvhLmg!TI}wc;mv7oc}!~Z{jnD$gwm>O_qr?BcdT3UR!!w* z4^hn{VQ1HdD;qEnqBEn{+ien7|Fn@jXtZv;>Dzp0Vie+}PdP zAiJ(zxXzBww0yTV>Wv|_4H(>Z4EJ_7eTd!V>)>8PRrQjJcy+^)`SX`m);83}E2=B; z=S}rHNfIw9Q27V5Dln9r8Siz{mW4Jcup|+qz^l=;p zUrHbn;lyijv(rkr^ux8dr$o5)!)`_uF8y$`z@;C4OyJTFKOu1Ghj$2E`r(f?PVIgZ z4paUd;Xm61PM`Wyvgu8KAzaeeI_R$vxTN19a9JL;|E}~(dedL*^88xJk@PuacqE&i 
z{=~7tCH(@8(@FaCVnHwa?;1f*yG6#HS2^gn3VP}P{SJECPp3pD@;xK}3l91rK`;Bu zw;c2%f}VD%jQnRE^m(A6MA^qX@6g;~4$RO zk$x!WokrBb`1u{UrbP9Uet2TuQGR%0-cfcKAqPRC(^%#m(~oE#-;Trh8~GR|8>h7r zCBk>$F!cXSAd*f0O9eA7*IDNI8qwc^!^kPmQVfJkdNV%T^c{kJr;u~Iz@?mn0+)K9 zaNxPxo_2Xk9r#j#OZhSm?-%7X&vQw>#Q#Oek@%wmm-1h9;8SuS7|AaGg#wpyS{(S- zl$eb6eO%yD&aVV6+r`!H?-%x$et5Y9-{QdkN#L?P_c-t$I`FB&p0fP&1up%+*@5>7 zT$bn44*X6BF8xr-`QO$qX9@erc9|=1*)A&_c(1^vJ^$W;f5m}k30#)v#JH#CqraAM zuUU+HvK>#;;}|vbY8+;sDiL_Iz|9*iM88Jha|L~iz-f!1ICs9R_=(GWc=@%0;ZubkbUZCs_L4TfuUhZFY z3VPT6m3iNZ^vZaK)`pa9{8|MwF6X0L1uo~cZwOq{Q{5?%e2LRu93>myqu>nu0fEc% z9}&2e?}>Jid@)4pc>wChEEiW1?n{3%8=_3jq9EdN1) z%kn?r!2i>MoBb%eJaXSp_B%PhTm_k?zH)x)61bdSx&?0RLvGz8@FlpWMEFJ=Mn3KJ zQ6hX34uk)iKqSJi#$j-|AJ{AKKj4}Y(O-kZ(2v*cNce{ZK3U+@7Yw~zm&tj`JeMIk za{T#E;U|*bJeMJQIsQx}0EyZ~(wpZplr6^}x$cwn@_7yI`Iz#L<$29V1ig%xqv@e?M^O1Y1pPrreSfHNI?3^E#6kb2pubMY&lPc$EdP5nPHcUGzSu#3qF<=^ z`9#0STt`U1`rpNF7^Jqz$N`_ z4&21~wms$kn3SWwbIbIHR)>9VaNu%1CFOrz(93!a3tZB_EO6;>Q-nPwK1bkEev`mu z{Pr~mK1JJ;vRC6U{y$ydtpcZ5ixSycjwk=XsKV*m$kEke`hOJoY(f7yfj2pDvK1xL zE9r*_M6&V!*Yf2$MZPC6PZfvD^LbnwJCF@2+2y2|kP_ivz+vb=OCXX>?^?e+qsWkGz@1UO|`pa#gF?PF9;L;9j9r)>5zU}{01up%cVqi+tF4F&LZ-Elw zQtyu#RrqIcZOZvOjoW(vOVCUCa^8{h&2xNPZ-tN}$Bh*NmvS}>T+;6nxTKf+=aT+s zn*Mz_@55p0`&)tkv%ufbIN5=G)zCkoal2k}zwXO|{&?&A5g}ie|C0ij^}SQzlKy)F zm-I&kF6rN+{m`~gnZTueF48#JXC@9~pNS5Ct`fMk&$~r=cGu=N_Kk{i~34A zX8+u#Um@sad9HQfcL-d{e}^cijF-vrT=ti<1^#(l8$XxvhNO=<=xYTo>8}vD zluvtBl*m4k{-ca4T+&YvxQr_%JMbb0UgE%)2>fnb8#~Zm79}dD#HkFF2&Wjv&_6~X zl8wvt*XNLL=%ie(*e_i0RzOo-lJSoa4>$Ohcviy4lF6pH|)7)q5FUOw)0$(r6d6&S8G@gy~IDJj0 zguW*HBz;Y%ZTgzfhDFUz6S>2$uWJFkX-^du`A&(2>c8J zbSlxtB{^mcq^ur+zf;rFX;|P>1YVY>Zb;6%1ink)CdQ!by#hZ|(3k5*B{@X`KP>RG z1U^SMHqoCg@Bx9pN8lye35Y%_@Vx>*N8rB|_fhV*R6Fx)W-t_Kk*7r26#<{0U zomL%M)A7xErtaev1sQ)tS^lRAa{M(3`3jjnc{^^4gvbdKVmE2#H|C>i4*e4RxH*nl zLD%=$Z<%rI-eG-f4UKQbH!`KXj{n7Ya zv4&gP=ryp$Fo&4YWjL6Q>BtVg(Un;Rc7FkVO!Z@yGkH`G0^b4Dx)*7@#daL{t)00eB`OQ8INI1zgDgm?>p(Sh#hfIA} 
zsXMd(Mf*`irHoJm^d~=>sh{@0Z2eC;^w;RkGX8hkjB=dq`akB-Ke9zJN&C}SXzM=({ew(oj~{pH z%{RHnL=v?*{qs`IZ;#(2n*U!712H4n|QyZontnaX0@uY`z^%zH)Z zE>nLV_`~#%q!fo923da*AY1?W4*f^9{>{cvx&U_j&2Z?i|CnMps`;q?XXCK-uW{(_ zEK-cIW9V;i=uh@5{vmbZ`bTkX>%T?oxBY+MEX5ep{N`keW6nQul>eyaH~aC1-?rb= zj`HViR~&NwGO-rrP+z8W?l_14=#G0V{bzw6Met%IOr)Y))Q!|n`^UEZ+8p{jixvMa z?f*u!UH=w`{_>A2jy`Q@@*modw)KC`p}#E5dBX$ z^bZp;(mr+af9?7|>d;^F_ll)ab5i|jzud0>Sr|CTME3a6r#IapJ}ue$r+_i6{sUUS z@iWp-vAV6l)uDf<*8fPF_RR6C*`fb%Nd4)(5nKOmhyG6eaN$_`Z*%A`{-i2+qb@(y ze-;i~|5FbAH6@DiST6aPw*I$)Kg|DHwf-T^PxW7f!`8pdp}#;^ zyiD_(ldXTTLw~V;ak^16lKv$)Z2kYop}$7!AJ+b(#bosV?GF9hwEjHdf0u^nA9m<3 znyWPKJ%;}8I`r?{t@!2myDUWi8Q4@J754ZuOoJuTRqEvb+V-CW{?PWnL$M?^C)vLa zhi(7)TE7+MdW|u~c(ywEzeWC)x>)7l-x*W+XIk`j`Fk|~M2l`f^Ost@u;q( literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm.o b/third_party/libxsmm/obj/intel64/generator_gemm.o new file mode 100644 index 0000000000000000000000000000000000000000..6c33deb33929b9f6143ce05802f30e740d441dfb GIT binary patch literal 10912 zcmcIqeQ+Da6<=8)QS9I|0b29bC?Fu=Ls-W7=EIlBa4E{iB%y_&R4DS9sIetiIwx)m z#YQr4Ra4S7kkYT|A1#xXnU*r8C1e1{2?H z#PucDarD}ihW)Xz;c^-n`F|J(w9G`brA@eeCWy=DTeO)ILfXt&s7af-DiqXaelFDV zHnDd^hK-?NYwDx*7(_-5dpC!5dszGREupiD*EHLQ^KWYH1BdIhLbQ&t{FrZDroAw# zZ5kdN`-h(~UAuRCVOT57fd?Ihr?o;DonkaJC0d*k(zii`Mdv-0TdGZCvEr#T=T`Dn z&(HU*t2c@l6H;8lj1BzR29nw@a|EU_q$8@UB)^iw@S03@!$d8WK!40!4jp<;Mkiyp?(O{j%K0Q2<}n!{QF}y-?6@v ztS>l8N15zY^?3-ila(l7DN z@R6Gj&b*}q9EtM$32d2UO&J_JK#?sppis8>v>`w4d*0qdyJOkX$I5yr@34Oc&Lyix zM|{@F`nDiE!X{O?7i^^w_<>WQb`txj&-L`g zZ#L6rHeqG5=wX;jcc;>3BG(g4X8O82gXxSFT%S!@X3)w6Q)yf}&774=C#+N^9n59= zvPm=8mFhO@nZ8PYgm$3C_QA#Wzpgdx=M4K#;=IJs`Ap2--le@ajH5Gn;C}5StgEqm zLWnFgb|X~O(KZjCiR{vbHex^^9esxVdW>}Km-$}4|Fhf`XPG??Dq5jahDQ6o7_;9p zHt-bSaa?Tg#F4BO=o~PLmxT;_7stWogdUjF$?pu_aHIaY9rEb6pNA*#VDpZF7g^!O zzBy#*yD+r00vS=RZ`j7uPn2w>GVEI(eY&=JL?3v8P9E!`@y~YXm<{Jl-*TFcPMOl{ zP94ygcJ{J*GH0d%IaX40A;TyF-Gi8Q5uGHWJ~|N6=$!S!p* zG)jWB22;6UuGdVax>9E6bb7C=sIOz{|1Zl6GoZe6*P)pG^h!5=`>3|M 
zvUZ)m3nla?bHYUih6vbk8fVQ9v^Y?o^0XuxWyTZqo_(f2Dlxik399_ z%pupwy2M>ycXGK=^qoVK@mi$UKIdE|h}Yqas(KMO>6cQ62dTWow~g)2<2qTc!+mtP z+E15x`ATfp2mXRL*T?tdKk{{)wij>vr|N?vRE+$kzw`r0VHql@#tGyUAWonP<|>04 z^6m-d6{*3$*372OZiSh)OiouXS~Nd6qUabVr@u*qrw-ZM>=O+4{z2 zAbMk9VT1pnI(HOpsLLvYoZG*q!O__(bbbpj|7>An)7R^kG&bJ>ox%KsrH$=%57svZ zF$;7rYz*iPb%r{I_FL4k(ZE(g%}$|nxKih*rOsceGJ3;x>JTbdsbi=-Dp8y#uz!uq z3lj2I{Zc2K3>!~=`XH@Be#}sf{EY9B6_2K3&W_iCa%W>_!p%3LpSZ$v-OaW6Mxor1r- zVjN&8fy;V#P{z^%=clX$anA`{?g8>qzrceM5cga8eSDnr93P)2w!poSyPM-)ZyCEs z;5YGm<;BlVt^_dp9@ zysHNOb;2=+m%dwr|A`v--+@n6_5r?6L+M1=Ud_&PYT#GZz*9BwZ`Q!K5bkvf2TIdqt7v629?xl3$k0boiEE> zW;UD2x?p8s;)$OAxOjW1VOTC_#uMxMskPE&R;2Y95()aYnce2)O7`}pI&0bCd=Pm> zY8o3id-`grD!;r*k9vA&?djKscrJA_es}20npLDzxkNm<#!Rk_uQ3yyX0@1=S-LVA zt11is2_(_k8Sm=O^v3atmR9+a-5JDFn!~y>y=Izq_4Zjg*2SN(uB>Tdr7?e&b@87~ zs+h_rQQmUp$%ArWpFqQpd7S1@Er95Jy8un%O9=!Kej%E~A0QA!yceNK{4oMSgkOv% z@!bT1XkI#PB>qPNL8L#7Ch>=`pAiv$37W(&5U*a|c`s9Nb>0qvd*@vtaPPcHMUU#2 zrQoXmfg1Y1Rzv?51#eOOdQs7@&P$)mMBaFQC~&V|b>tAp>(|Kw_r~p91y}uw2;8fG zfxzjeu2+wuN45W}3a;wEOWv(;f1AL)`gMVO^?z2uBbZy}^PqyaD)?pvzf{2=6u8&U5d~N6{H?&fcK%t>uf`!m z-;5x#U$y6U1#iPMX}^5`B>kUMa1(tZ(lZ-P^7jx3BAoUdi97G74n9M{2hb-X{<&z9 zzepg6@Ofww|2Baj!snw&{3is0$WF3Z;y)u0MEncTBtAkQi15qMB>oJ6Ai^(4llVRY zL4;p{Ch^w^1kt>wp-KDcUq2ELD*K+gE|)6&rz`x+6#NVYFAALUgua2K{_iXN z^o=0#9}Ar9Z&z^sXIhNO&a)JJpQ4ApU8SCri4jD%X=oC^Ou>T+PJc`!qT6&d$uGZO z2v_rp&LASf&qkB{NdiHHpMxgxy9fl4Uuqtzd7|cWIHQ!{Oc;|gq;XhBA_m4H^ zeO=*K^X;I5tNFHo0tTX+ns1#79>g>0??wfmuHeK&M32Q*$Hx}GI(WV4)9on*Z&Yv= z&V8xJO032m@8JBtM)oYT-(um;gq2|7)wvuCXEQVi%ePO@ORV#@O?o0>d_-i;?nIbW z9G(AITHTl4vf`s!;o--Z%P|dcMUG{Q`(ApSws`oD#}-$_wMG5Dqc1qI2r7*4ylODM z1|^)#SrL5BB{MxeW*Uo%Ki!yN{CInD&PpWLf{hl-TodogCVJ#isAkE3-KSck5>RuK zl(f^O3W;)YpHQw7CSDs|yxr|pcD(ZE<8c7(f0d_qHquG&6e_ig!4wrlU(PS@^t&1L zz5Xv3a`JZ~F&y{X!1Z___c+Nz_buWDW;s0s(OGi>8u>~j9<$rguV(+7!l3gwcljdi zmiLEh$Tx`}(rzk?@}8Vu%D*M#z5e$KdHH*fR6}QoO0Qv1&HoR@L3coRIhW{|6~A?3Zf1{-+^d&Hv?e zXn=YZ|4+nN&M*C^-{Z&@&ot~u#0;A5+}vx>?Kz!a){uYoIOU(JA>SkpA376N^7{XZ 
zke{kDq2D6pz5H@4@9zrv7LRIFkSI9Se z1myZjz4SY1we=sK?r<1_k@BDZiXi3X`d6EqMDEh)ia3hIB>ov_UU{ky@7@WBJtF>O X@t(&qk30Mup_+W)Y|;Xok0JjbJ>F;G literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_aarch64.o b/third_party/libxsmm/obj/intel64/generator_gemm_aarch64.o new file mode 100644 index 0000000000000000000000000000000000000000..43e65ebce710531a0394c79aeec3b14328790209 GIT binary patch literal 28768 zcmb_^4SZD9weQSiz$j@>w20{~_nFpFLq(ccv_w&J;0zgLkRSnqAR$5!A|DY3>a}>q z&ICLi#?oHw^WNLvEw7*6URwI@^QP#Ji$TSJpGfOR>uZa)RmX}}t)TTI@Bd$W@0p!p z4tjgv`TfqEbN*-Twbov1?Y-CD`y{u-lT#w$aL63t&}E_GlR-nFMcu{wX1Q$+)rQUp zxpO_g-<{*mcH3qr{HJCm{Pz;+SKG5URW=^-vNuN)elnIgcFeii9{37+r8 zCi?F_nFrc{`&$2?mu@Sas=^D`m(Ae_3vO@9zcU6{y1LGo7R$>^R4Xhep)o41hK9TqXdv%_sqW<6CXo=e8U zC?nKD5_kFzxakMOuG3co6-6m2Hg91fyTO!N5Yw+f{z$Pf3p#ygbd#ml-)mYDt93R# z0zH!16_MuC7ha~coAY83C<0>9162Eq!ay|;jTKB4&$Y!WEMjQ( za3HapOU0r>3@%{%xKJu7qKD>;3Pq(NZo4~ARb}(VgM))kA6)Ncm&6*p9JEIr$2(s+ z0$sdpGB&bcUNA4Jpxo-F^I}_fSEYFJ`3lC_NND?5_i!cZYIi=oJ~HfO7sRTZP2UvP zKj&ojNP+jlsr7F9y-4cf1ik3=AwZP>$xxIxZm;Pco|_R-H9R1Sh=BI13qjM8Ydagp zWyylQK(G8bs(ZMwQ(Db$PaxhU)LL>$tCZqgD0SM9QuExaX`PYIrY9i7Lu;x^79tjb z*^=|l4xw!KaLBiut8@~{Ex83}(`2IxdRh^OqHxu4@D|$1xeaGCVl*v0`$W+J$I&Lw zX26UBUWh*>L&|8rg%DWSp=z?Z83Ev-5>+U*0>RzK2H*I5b_X@_+=^l0c<&)6^9{Jo zO}|j#WbT1>Zu&3b)LQ&TQWqr=H)2OR9m3sRQBV~1gia%4-BGI>aAC$IB1=k5&ZTI5 zmO)`DMt`Bysohb=a?xpM>Xy@#rAzEAE%r1>6NX~Mc?((58YdIvvIWXzQLwUHVR4`v zw9BYwIX4j3E=wwxX*ZNw18%TrJ>@b*1imU{XhgJB({c;+?Gk)7PH)j==z~T}M>+yQ z11F*tx5Y+#x20l}2fSDnvd(4FA?3_`9pCm({Jt&mTeyM%x;wPp4UN&Ck z-oprn2WPb7XKtmNeh0>l?`6A=Hy+qikytd*ZAHO&W4{z)KzL~Wp4SAWoy@0DKJLE= zWTN*uXbp6Nvv~|c5K&Xxcod#`{9ZgUV!YhUK?G(veREZ`*}3B(4|N(#NsR~64@Dqh z&Yp^zjgAflHAv}E=vi<;?Z);VO2r!o$snHl`|!OSWWZ@a)J2m6kMuxkFpq8ve)MT;yQQqlVeM6^f^x7K{wqWVZdp&ZmY2c z)D}hBVqxcwy_0j}N4e?y!(_IXUBMwX39$34VxmtYhqs7m)P8lm>7_5-1_CI+7Tw@A zrPz~Sj2fYy?3Ew&ayJcv%~Njrr3wmiGFuQ|$}{~+nU3y?LM7oC>Ac-NG`F+qbzI`T zgHC229^Ld{ICT?#BdI1gw;>je=i=fAsPF`Y9CR{u!xVGP8PzBR2usp1 
z=)^yR+R$es^G~sMXKuvr32ni$)N^ufxBvMh-F~Jd>NknnvT{ zm#uojK)0Wu7@I{rw=jZwj*f@_$b!0-FwptBV4!6y;!VfaPnrz#qKhrq=m4~R+UY$5 zo&FGQoo+TJ6WRGsGAmH_U?kPVdyhJq9L>dCVI-Pi4Vp|#Zv4pf^B5C;0K28%4W~NO z??zH%NOB&zn|NZRKMvnRf_onWN#x91SRh20U&h#ofc@jT)TG?Ri{11^5LTIwoUxw-6e&?s{gM|t5FL^n+i+9x2uaE<3$ z*vuLaAS{N*!-s_wiZhQpho!(3S zLf<~vKK~kbfqRX+(Cv7@IkS0aa9ZwezdHZ;2ZMu4P5-aMLUij6Q7zSZLV^4$5S}P( zPRD!6^W5~i7=t+s#2s&MychZ2gLse}54t1U$R^sIzShbqxwb}jNDt?K^DaZKpl4b6 zJ?{+;77{KDB_P^!QYbV64CYkg=gPM3kn0~*zEHUksQ98)=wMK2Q6OQJwux9zHD2X; zhtW`I4)Ye~4?t|uK4=eh$_SKy&*fk!;#*~4Ls=bm?!x@JU>;~MwC?46dh!2%tOd^YtW49oXyvx z1-rS~MsG>kFaW!|hc|ERRRb*S0>KS1{3y?VvHKRW2qvRdMDTJc5{R2C_aHt_g@61f z+|y!#e5J`tc8Yu#sy?qEsXXDi)VP3nHX(8O~c zlB`2Nuu!R_Vu7&Z92u=!jnwcH6pWWqu)m;Syy*jH^CC2Lgxl!eqp1#d|L*Qh+Yf)A zm2z`8hxcsS-u!)bE21rJ;Kcu|qp3SAIw^ZGPEmU=HkQiAtHdIrc_CI8AVH{QqJJLD z$SezdH^i#qx!LIZ4xm22k}jtv$vBAnk7M4)tg#-my|JFZAU3|xKA08OP}yCZ`kQ+p zdC#W9{k@L&CvE7ttaf(@R%$$;bmV08l2mN0lR3=LV#){-%Wa5KZ5PI(io(x<6#2$r zmpFS*uGn9Q)1(S2f{S5Ic#KEpXI3Rt0z{uV9aO<8x$)kw(@*+q*oNf zG9g3|eVIXoZUAZS1YJ9fIb#(!RIgYwDs(?**r-t$MLgvnbX1ZwVW6pJ=2VOP;X~CC z*O=A=uU*NMo{l}B`efY?mkdQ)Ia{yc!UD~~j*}&UO)wpIp3MC>TxaKgNz}PSl{%Lt zYS~YP*-tU)cXqz%-I5FRX&XoS1Xu31$UfsD<{b%mwY7`O!wM-^!WVgQh zEJBE@V|586uTcKUw-Wz`UoMi>ByziB2T=wPrZ<&@fwW4A4YB6NLwJF8H~(#}chnv` ziZW_Z?a3ql3RzOC+bjcLr+1jO7-zZaA9r z0t}uN&Za4N!`jp@byh8a;r%C;I;&5IN(imhpWt=x>yf2S&jTT6Ok1qg8M8$ccE)5x zP2}cI=J`<=oA7oK-yTL0zFmY$AZK?n2ld+?zWtMa+aqt0)7>d=x5YNfi!CuOO0xJy zA;84p$C_ZPCaBQ_TZLelllezYuw4_>X@c!SFx<)XYJ#6?f(A|SQz1CX$=m<})b?z= zhwN>!eofe{3HxznzEQcs!in#wPOaM_!$A+rhV%t4|o|N7|3MId(Y1Z1_lw*oOxoJ3VX)&FX%)Y51<7_m1F(A zn9I4+^`!f$&X!i@M!B(5b87~n)D{Y@#F>8aq)GmcEz|`|EY7CcV8qm$6;6K$kDe?y zaNaLT{UQAHuW}Pj-JJe#SZZea!{MnhE*6BPe^SCSA`D`rZWIFRK}o|CAAg3COE;_^a)bmpX&*GxdhtR{gqy>xgvLCbimst+`U}; zFfX@g_&{cx5#r^Ros{mc^mlBdr~yx@F_0-};qMS73f{E+Xqqm%#?c z%V4uAV{Cp4{VNxayj+@43U;12P2x4J{XrPW zsd#TVA;=Y3#*j&PYcdIM5jY?McWZ$X-b$c^w-We(;`A-UNy1xmlJF*(*M#}{Su~CR zAc7;DTNd$hJr%h`R6IiRTjotee6d0@7fJKci#CD+D;V5q_%NKBAOYPw 
z$07SR4E2~-r1uToH*)>F665rprc-I|7@sQ5O;Fm|)Cb+TLT*)p>)~E* zIt7|7sUim{jATkkBBfPD5zHl6-yw7rW$4oTqQ{ZaHhx}=W|gqXkP$Z>aW=Pc`3JM~ zO8OBq#4E!{67%dnu$*{@_ysFKoSH@BejaRTL&aBi`?z>NJf7=OdxL5h0$odmn_H>p zCG*>n+eP7<7Udf!lR>S-KbI^2d>J`T5@WkcF-3~Co#A4oA5Md%J~?EEX1;q(!KtV# zYrLxxoW;Vr5PfRz1tGs>$YGecSG=`?Rl|;&&TX+@Cycs>7u^I-Sl=LHmfSU-dgjx8v4B04zsI6CFn0!t?OFFKoBndEy&^8204Oqf)8kBendx@k1lS*q#8 z7bfS%)?ubpk-8Df@?k98rr(J;y}ulx+ZyW}_n)_FDmy8(9Ge(+lYc8UnYXau8<2~c z+0j_#P(r+;b{x?Zn_hJ?syV>G1>369X2+&jzGyGm)h*peQHG_+p@gvPP(6YOVcO#e z=~c9bBG-lO)VgW#`9NDMI0vYptr|)Gimpp%%%RLq+GY;%I0hVPYObnp@o{IkdPe?7% zp*A;Ro{H69kj1g<%URk)G)r`s9A2Rpr_N??M0!T3!<9+qG@6dS}y*qvV|><_N)0C6P|c(^f;kGc1)enprOViN-zHvl$>eS23Y z0c;vUbzy@kvlK!BE(T!P`5cr7Dbh$TZ0~0gZAQ-@8IwX-u44tU`hE_#l2D+6b974K zRuYPc0Dof4Xp3b6OUwgUE;t7*9R;{XfU^7jSGYO}tvr9U64I`!xA`1|Cn!+A7H+0u zv}{1V(Nlj$g#>ptA4i})Wrc#1X+xNsVCb%ZAhn!Ys5cfzRs67@R>ANjf`My9cjC$& z;Pu=9o)G8ofOR5KpT>hI5M2)bv~G8W)*B*BW#Q;~3*))yIBXl8kFa7apo(0|a5C>z zn1Z5|)Y&SCo`n9&EzkvNS4hN`$Tv=0(4F57vqYWEoI2%)v6DR0Gol!JG{F#QE@BNI zVnEP*(Gj8^=9VUEw-lyTwL89;hix$K-KdrQmreDkB{Im-LrbD&B2&VqBX@HIJ5;%< z%e?UGMkKdvV3ug1TseDI%v;FHt#5S2s;Q!JXjSa^oQ@z7g+uuo+%6qw*8AC+VGkLT z%vm%Ez5$4Gzz_q&DQ`{6jcveq;bguJvvJ61$B2;{Ltfan8SnXT!UDye8`a{uUL-Zb z*BL@Ma=h3{=pNQ#{3hyNAcS{FiOrY-OGtmX=#e5N*Q1(Vb28tBLaI!ZWw;r{C=%|F zW@5(8%~8HZJ0EHMi*6qFkY^4rQh`4BdNvl+;#Y_1^d{q(Q%A-%0hX5!2-)wNu0R3*5UB?OwWHe?!P4Q+#FdE zt1Wy!weH`iqT}$;@~WK8>*4GC7^tP>B*3PgV?73Z%rd;JTg49{ForyUsQVqhI*JWD z+QKpa6lr%?A@6!ehJC0Yo}`i@)VpK}LPpkppmb`o(G|KuHE3UjBKI;SgqosxYf)Kx zrw6K;_O3p}kp}NVFc);I)_ZEA<-bAJasN+<)GgR4KO9y`Jz6uezj!V&9D>|go^xMj z%VCI4ueh7JUE**())Cy5PDZy!NWXbG&jiCg!5}ra;wvK z1>#w(VdA;O5WUAhC(Fq46_H*;MmWpn))D9>wJ00Xh%xc|`(#0v`|pau=z6MxAnO-P zLS_2ay-KRbkBG#{Uq`dcdx*`#`hH=$QOsT85qum%e}D^S8?7{0j0V%pkYQC}G4GeA z9@IpNgPx1V3uN00UsAd=-Fsj-^etfUd&2;Gx_j8;wC8Lt_MN=Pf#&wP=0Mz_R6Zl&7Rau>GeuXt1U zHBwZ*bA)2nN1P&8C4`*2sQ{N4{S#LGx$`C6qbGR|MLiquhxX~gL&)W zP5a>wcxdd;FMVZj5E)$li2d$>{jl49*l9m}2M@`(;xN!o;`~cPIo8CIz+4%^<2A78=T8>UeU(`3d 
z(#d=kRY0kl?6k_Hv-3b4`?e_1hK-zhl%J16*9bEdeZqWVkY~Nsc$Jk+3R$X2UmRY4 zaw03c71@)?Ad!=|d+z86TjQQCy^&0`4f8)kX!CmjXi}NKQ93YrFI7$-ejmny$qE(6 zdeUgAM|SpD#QSJyDqVzN_c4eP*d(g--$Nl`!B$aOC=R2g{}^8PLV~*qU~vyi4B7HXK)n}*=e;&437;-|AizzpPP`#-5vWWUZN*SqFG@R8toEuND``Bzg#P=JM+hrF}T9}*fEIJ zlZ-~G;|;z|z;vJz(SqTlj(HZE{fAT)iq!yHHaT1M$(HQ+Q=z)@Xm(a@B0Djb^j{Pq z*b~V{V`TGrbShksZveZ-!TdGo8@6Wej(rPa8Vg?z@vEt3bO;!!oW3n$T(eg$>>#oS z@1Ow&wPY+MIRz-1?~&I^Ik^QDzggL3NSm`;L{@g2a1|PnrWU~P?ZB28*8wP&zTCr& zG;{|Rv$_*9gVsh>r*E6cQY&0$d%rD~Qu;~vp%uWYhpDbo%-6rVmmN;A7g<93*??PzpNB5nST8)pUxEzT=rW^BGM8S4E6coBU_B}ingMv<1?pCf3 zcDf7F+M`J5ddc_bLsL1Bm)z}(OxB^j4cA{{{;QWAq!V7W)V*r%}ZeFf?&&4OAR4Y$PZ z0wbM%nKnVHLLHPyNzV%MRMVNa&NMd=>L8)9i;7vfI8Xl;0o|yJoQ^J1W{3?c!8O8q&6n^}d4< zF#n{?UN?dqN*njF<6LpCTK*ut<~sXMY1Q++JkJW4D_u=zMEDET6O1OCb+Y;^!0O;K>s;MZKNJc*U6e4%}m{$sUuZen2 zh9&CbS$td1WSLA#hddMZq)f0VlIGrbA%@k$B!7ntkYN58L_G#-(}39JuqRYTVNZgV z_FKqA<(HxCSk=*lv1s7=Z(t3#UMLe-J*tbkvT)h^PsauahomR@i-73ww3FEj!9Jw6 zNYj^v*PoKi{w)zVJHjMD|I12%@_#uh@t(|{ZIzoClY){sxy8(h={n3ID&_a0puK*m zl7jpQzxn}XQi^g|c-?e5{*9vG8bbM6#6{}bG*ku{B{{!clIO}kpiWA?Kgg%LLBu-8- z@L3rC@->9Ne+`AugyhQ-u%u>H^U zHS{T*e}nVWI7j0oKl>z&eU8EN=Ww2f^Aen&#d#Lam_FH>&cS&u&U&1omdTSZuB&gk za`u?I3F9st*HCvs!v&2E7dDQstG}viNnN5drSS6`8_1h=e|~YCRNwvqLZrYw+wTu8#WF1swbpRYN_5Djx4UeGkj(B z_V5y1T>#5*?ZERiTqogqDz1z0oYe36d@Y`NC4C#NSK&GGUEHw18eDq-wCLxxxX!`z zY+PBO977|3<>!Hp3Prx73q)FRvk=I1T#ZWC;CY7iOqGp_$8)$Jlvh|BA~OL@1;AQT zsUBQei!1c=I$YcF9GR?vn{iEQ0_p&(D!|x*cC-ojOp(Tpv~THqKAZBLCGX9%dB4nh zPmLqAFRL&IS7TQ`FVfGfN&?T<>gTI*osQ?ozu<;dSba+M1O$UyU60_HR9%Z;nN(dF zAK~G>t=Uz0wrfI-Oie@+0ye>wc%BPMD{!SCKHCiE;&~?C+x{~Qjel+W&^rj_KO?qz zG_&zaTqhXU@v?EfF&x=geOvfC2=Bqwt=My&yijYY``@RU?r}T zP$04tH&*~i8lW3DY%^1EB}=$~q!!?5fXw5dxEj}15KP0BB`BN)_)Hns;7X<1%6WnW z_?v!oT7P24h2t`IW>WWLnKtdRO+(>m#g9*_zB7t86-Agw;i4#lCyEvnouZ%Nu_4dx zct)FxMz(5#n{aIfuuwm*#FY%t#L$aujgkSHVBr}}E{foaM()53C9;J@7XTq80bZ$+ zQ2|k?QrR+0BQEA$R3`j@IB9i4<;akN#MgU@`@ND=mJw+FoG2+X(Lr@D#vPLV5 zPQx>rrHJA)-_w$;4LUn{AMrKNUADP;BRnJHihx_GHJxJ&OIOfPyi(LG!(qx04tQ%y 
zk*dW|p@%5mMrd{Plu`PVWN3JIM$sEXc?GP-wG+?i|Am0I#dsjLlt?j=K=n+`F~Srg z1szy)KAzECh@~*%MA5x~$(>O&N2vxaX#~39c8OCUuS@XENTfWpF*K_-CIdJg zVLu~n3|-=J^r8@%fJX;$ZX#fXK#rytj;CP#!0gY7OYqk&0AIT_%% zX}E63{ijEcm34VyL`6%adjIKe^}*9ys-Hc5QuX1}7gvLPGr(y8=ixeuKs#<)07Nnx zS&nNffOWXKptug#1fCNXn~HE4XdLJwO%?eKZraIbu}0?NIu*bys~oBnbTC(nhTc$% z%Oyk=l1Y_-c&w0&tsJg9!;)7OlG&Aj!iSmO@kfPZe&ujxfbG4#a-Jf0TeOhO%!YvY!mF4pKgMZc2EuLJOt&@jr;^JC<4co*=~ zBE@)?TnN1XDS-aI0Q^V*emZn7CwDUN(;}6HDoGxKr<(%k?-KfAJCZ&X^p6G5zfZit zy?ApPntD0BE&#uPICzyFEdlt-0Q?_-pB7;!tq$q0aeGey{ci&B!2ta82%U0z&JDnG z0rwaA^z$(<5_uM5C`5P&}&fDZ=Xq4Em`;}(ZHI+m_mwYZ~W-1rMFy5I{# z9xuEgG%h)L0)LwL)4-n#`SS(-jOWiq{AuKm5Pwk~g|Jc3#tT9pvF5OH`QkOJ)xt$b z=i22fmvpS^TD`t&&DyR)Nwd7MeAU|2nstj)%U7?`q$@#rVzL$MI@WfjI+m|odC~Y2 zQ;O`xt5^1{OMNs?P1mt@+47~St|c97x~?m5FK_D(yP|8?cXf1LG=Axgg+k@(NUD!j zL!%99X&F3-<$tw~Lv8mlYNy=z(e+bw%BMps8X)9)S)cldfP%Z%^>kd{0lv$Zbgd0_ zv?OOta+4i1rc9X?Z|i7tCne(@1-q|WzG``@@PxR#cJF^Y{xuj^sJb*${{=~=$27a|_LequC95Gg|%4y+}QAIh&< z!>z+THn-$>F95#)c@7T@hoL&0hW@$${QCj;p#Xd^0B_JaNU7X^4#1iJ@F=BkME+0? 
zzb*j(w*dT^0K9?%ag@sCzeg&E|0Do^F#sQpalag&g#q|i1Mt28{KWwLt2#xdTA#sb z{C=N>*ID?vdR|WY&szA}0Q`{ve3(v=OZl`1;QtYTzZQUx;YJ0HQvOQ<@NWm;PY2+R zPT5QOPY=NVBLIIn03Xc_mNNQZ6M)|qfd8KW{C5HPM4if))@w-so(aHzAApaPuNReH z&O-f6ynQhMUtr;Oy#0fP*I9h-ws4#NsQ`Q*%JN`+&&Fx=d7U5*;#`w3_&BQ{v+4N< z!8}UoZ?$ln-tIqb{!awZzhU7veUsH6+Wb2#+@}Avh1>KeTm7C*Ki9&~Mg5E&ei(rN zIRL+ihQYy$&FAI-e7A+4g)&C&5eu)k@W*k_gVp06r=fqEAP(Z(_cZu}+MdL@3S{tO zR)1Az;hxpc*!Y?Nd}jduOaLCS`Wc(Q-H+M$*8}MD0r=lq{fy0LQ2@@bIeF0BbvTWE zej0#37l2=7^)oh~uLR(`0`TVp@KdaQ#^(RU0DQp0KWEizY(W2TdjP&S0Dn6GpU6#e z9JJ?WaT!`*lmqVH1QxQ;K{+nkWl>0@ThW-`{x9xnpg-@{P<&W1j zpU+tMuPu5zZ+zdvn=Ja1s5lPlb1_b%5C2gj58^g`n}yr{z0tyL`a3P$)@O%>+j01) zh1>LYo@~>`7IzVBPOZO`#EBo1EA z!)fGpTevOv?*s7fS@>BNpGN}lw*&AqhoLYIzP9;XWZ|}*msz+?|Fr--Z{hsh*3@^@ za7tBj?f97%fTt|nj)xrqIREa&@VDttv*r=}!!4uFc@}Qlf0~8ce6A0`@3e3`9(D)d zuLahS4@E=*Y9Vh!OyxHRaoW_sq zS1sJ`SC46&&D-u*qo)AC!ML@@kxy#8l>a#vZu4)@cq#u&HD1cU+rsTQ`KpE6aq^=8 z{D6hqe*A@n+x^Bn7H-qq>rpm7!TqH`S4ab`EExZN>iAIRRJRC{xU zt=PYgNpYL=uPd*VP#cLKpM%;Z{4da}uja?CSxuvjvv#6C8W0 zE4x;~?yIoLHxA#IpT9PReJO4Hu4Nrd*L1GzQoj3d{4%!;muemAI0tXZ(?rHe47n{zBbHJH~Y20D4%sKmEQ-focu+4K5xGF zQQVG~|G)8~T>amft4j21!%{wFm&$(!SULII&r{{i_e1LGc=>PPMLGGq^n_-aRe$zZ zrSj|XzMT9XPBw9v?|;QeZv zE8M9Y{=0qZUt0dKE`N3jeO<}(S9JN)O7I#^25=lym%^`!e{<#Eo=|>6fkNFJ*5$Ps z3vWrw!(9JSmse=i;&!zat8j1E?-*z8eRTQV{l!d+a(!L8&=-Dgyi!~BmUOg4Kj$@_oKIrq+!o4J1U z-S_;Q+MrfHYd;T_u0wsyU3S)vQeM!?VGdrRpPeFSLU1UYl@oh zHAc-3qlrHkSl7g?g%#ybyhi!a#NL9a`DWC7@v_S^frMKi2s{vj z2c!dwb!}VpG|LFr8&-FCY1FI;o4*`N9Kg<5KEz~>uyKTk`&4>PQ~@rD)gMXdSj zbZbeGuHE-5-C8p%qTTmcB=tl*5b+Nhr9-g8FKa_jT;J)B&(IUQlQ6^|F9Fc)k59&> z#UK9)F3bJ#lOxvRcqDb8zr16bHi-y*KHXYd4>7XTBt%R8Hjm&NDL-)x&o|`bnZ#+y z)9KclA5T4@CBFiDg?KHwT;47R#n0p|jG5{fbB5rKPZZSgQ*ePXLED&=sFbXUMy>H=BKV3gyIi(*y;(G)4&jG^_eWQ0 zTlb^>t$sK~RQ-EY{iW}aeO1Zh{ehuFdA6EN;FIIWwMkXJBYQ__$@}1Zct}fr4L8xY zUlIpo6x!6Nk&SQxe0d5v1|CW-I)3~(8d~`Q&{lzN=%qaU=$Z}S%8!~u((J_U0l}3z z3&gyl;F)^anRjTQ5Y$#3nN4kqB&G|~$nXti@|6chGB6=UxrLy-Zf=K?o0GXoIAee` 
z4t$WOCAWQa{P@5J!l9X;iactuale+D_%W5lF{U7^82#>YpRj&Oz3?A(y`jSabiEgb zG6AFXw{BY;=FC3m(nBFTJGMiNrnkY`nmNpCWenb7lf1vak^2eum zG&4;|9e{m4iY1w6vh^0RMvGt}q>6q&Q>S(ilf7S9DthJwP^1eVBpKAuMMlKZy8Oe! z%uJHgK>Uasf`%eSrV|Fp{_X^{YS(!TF5my{$9pog)Qne(z1#Fz1E(|NosT)R@jpv!~3F!ln z0bgbD|26rC(6>DVJ8{Xd+F3uTs!kV*0p^QFCc|3k67w@EmfH`xMkBq$EDUbL9zmCRtkYm3%TJ0ttg!@)9u>fN(8&i@c=?!ZY#~rc7~6`KmPK z@1!Z?r72&ESeHZ4^Zi9Mz74ZO+h&9(>ehyEvH57?4<8SFkRQ*VRoEY?INgys z>9TO>Nw^X2{sD>rA8E-2R2DSoSy?qaSl$8(||t#Hy*6a}zPg z=#PPh!p#I0Z77&Q01eJG7cD@L!-S$VWB1*N)m-1@r)i0jp#l&lDHAf^MA_|m1q2mQ znQ4B3c|?4=m~)HbXI{j3DCnpt|Bi=Qqg~|v&nO4l#mL+PvMob@%7ah(qvVg&0%*J8 zRUbS-dg3RWuVG$sw0{F))8V-?-a#q_A2XwR4C)Y`R&LfN3BRT;hF{AMn6Hr(;+HZZ zMN9q}Dj)-s6T4MEOzjkd*$gplbw?LkpkyZ};3QvSwSj)`h!A6$n~*}9z*twdnR9S5 zs;mi_AEU{rawo2%dBu|zLfnk^(vY|R*;swnL^bnF@RGzl;%8j=1Va%gB21Be~6tdPO^T} zfq~MF7g=~X`KN|eIf;`IQ3M={s{MQthsWXNh**<&ImTxTk4UNt|3wSpYNeUeop$bS z+Y_=)JJTtGv24$9we3X=mjh_wh$1y0$P|&REj0XOlJ7g~4u^P{Yd^(Ty7etS1T9hJ zUc&27HSA9rf)-#1qIM#e%1Q=JeEWw@e7{H|aQp77h%<=&*k3v7?61^ee(7+(}}Fioq8>edX=yO!$d zCjTXg4>7r-{TU;GjRW&H(Zpl6J}-#ks-$S#QGM$HE+3=JQGObeelo?bQVO) zkHe5A#J0@-{6O^3{PBBk%^i9-P`ISbD^1jT#tH!psk zzC>)bd<&Ltr$1dj>zp06q0!h8cuPjtrIGX!zkZ3H_!!!!3330UFG04$NIZgBFbb|K zBI#y-v*h^zVk2m5O9ag@E?ZM1eU)E2N9hqODD%}wq-?8ipgAsDeQi1x&flLO2Ayv~dU+D7)L6JGW-bp$ z_Z#67g3*e~sQrA5juL~^nP{{n$SkJJx^zRn|J@jRVG({6F?131nKyJRKx>6Jh{Ak7 zV!ox?Tg3dB=J&c)TNHz$0zL7kfDyQ76RBwob@>pg)8UKZVp!Y-1NZy&rmkxv)*R|L zdyN20!XBw!Z=It*M9omRT-)7P4^Ndv%-0Mvv-^xQPYBrsrYs5T+i5PhyYJMMVLMZf z3qnqpe@D@&`NRgTP1^2x^$k$iG2RF)50~geFX!otP;#gh3P~At=pu4Qfl+ypcGDTC z!aDOQ>RTnU!=sgnj;%M*(jx?Miy>|i$~^RkJbfWJ%c#36kdihqw&F(P%EDsl>k&x4U9a{`)7u5R7=Ad z`pog*Vz{X&vBxh1V};t7piW!)LjM8t_c8O2q7EX~NwtZge8Z1+C>mDmf*6A2#u#cT zWLV?Fk;)~DwB(IqWjB93l?io?VhOL@a0s<`@$h}y3#dn7#K`;v1JsrT+?W@$mWQh% zR(`lPu@|cH!&JzsSS9+3mUbEU@24lUJDqeSX`>)Vg%P=|Khga~i=}pn z;SPa*JD5amp_8Gb#xS@Qi-vnYbwa~DR(^mY+zzU^n)I@%KH1S=dZhGS{ph=>zZt5l zqN^j$brmUKAwky7EnD6B1iUfOgMbQPj{BzRzVop!uwcYFMX$(P+M_7`kuufPL 
z=-!oIizs1;n0fPgs7D%*i|qy*L%Bm#+rkTQU4aOABEX=5M9lw8yy-_XEubbz4YveM z_yN&+L$&7bsOJ`_R(lJP=;`qYY}bDyV!mMh7mYKRr>K4|+N@3u7LoX|;Vl-u+o(L; z|4hU@9D_K-znQh@qJ9K=n${G@&`m@A-=)@Cf+kHhNEIG$wktaxF;LSH!b((gMk6c7 zARrsrS-3OIXJL#eXl8=QS?2FQV3@zQ(Zi1sasPwP^F4|6Ai0fE+$_cY3)pR{%ny|; zEp-F#vc#D_kwI#&?ZJV(rH&v0^k(Kj*%Dvb=$Qr9C)aWShXY}qoW zU^YA=7G<761Vn{b&>Ssu8N^X7p`Mm>Xm1Vbi|K7$QzIskB@qkZcCuEAkc3eDN+k8N zwz(PM4|B{~^QmatxAQ6HnvX@>zVlZIh_-!iJOI2^r0HKlV&~fc{k7)P`;Dzd$nc?K z6L;IL?}u?sY}i$VxJ~_2Pke+sk``_Ielg`q-dLM;ObAXC0OXRArvpHQOy$q9Hdz&T zh?&?we-1tCfrNp_#WXp@0Pv=$TpB78A7Ml14k7OQ6G4&H@sZR~2ymE=vjJEeQuz=q zc0iu9buBTh+l3arQK%6*XlC+JAg;sc5Ep+ZZJR~g66AvF{l+HYiaPUgE%gl`bKACB z(i`xRmfT6SiM=J+n77AX&s*_SzM%?>lESFPhP73g0qgXI!U#+zFU|an){I-CZJU&z zp;AkI1hDd#hPGgcUSJ#MU~Sv6A41+v(ykxd%=QOHjub|0as;e|~wamd4x% zv!CNbujJLXwe6wR{k^?Ceeuw20DUW`UbJS_suiJlPpExeyuEK#Yj^0%{*I3JzEIDq?rTG9 zJKI-*vZp`Z+aC|DSWONkHjs7DB|dH1v@t#s(cIhKx3amludnsmW}mo^w_e%Z-YoLE zdq!fND}c2wZ$E!#^Xe5>IjqA=!{aUHatOKRe@1Wk^AA9!ous!!_v^jHuwP$C7j4^| zi=8e0f>~qGp$YxPoD1&`#YgX-9Va(N$_MYxi+^eV>`r>pAELi^S5bDixGkXDUUBP> zPc*b`^H0~si-AFUaWU||zaRO_1uqsx@q+R<66?GD{WHXpfaX4f>(mplqJQ;a zBHN!Jb1BK1TfYu5jM`jcXA*z$K(F2sJ2%pUE?D{})b2pY7e{aOhcUTCjQKusC)W3Z z{(6Wt#p1>=@7V%4dc$7QF)Sbs0DxmenXUQ7Lz$Bjhx|&VId7eo9EI|b@h~TK4aQ3I zo~_{M(0aZ^oVxj1G#wAu<5{#sTDzb`n1KK}jVN$Eldf7RRxr`Dq$*t$ZqZ7Yg?qKq zyTSv=m1R~XO`R(Ayab%&K?g)a&Qt(s>zIM!;avG34;yI(J&J)D|_s6VGSe-l4%mL z+=m%mp^?5k4_hh31cj@BnBRXIe4O#9_Gcj{YvD@ zQs~+zzc;iJQDm(|6j&?W8Xf>i2l8IgN@Uqjy6tvAM!H{v=1*sUzdl1 zZJ?r_bpI~OKv2d>_0j%3jVGuL<>@?o8?gk{*t=jI@*qj5EoD_HY#O%32@GQ2@ z>9Uu`v&GWbI>*{6Sn0%!prJR~w1VDIeD(=gcrSq@Y#0jF2Du8O3?}tml0DlF?E!3I^5LKUvdmvu;yD4VuB>k#s~u+ZBf<3HOWgCu2yLyB$2tV8zN z>bx9c;0W8ou8GP{n-M;FTyTPoT&;>GZO-~6q}ZfSKx)=ugUVhhZ6 zkE~mzL$TY9M9!GpjRZ}D`cEX??(g#RdLG_Trbxe-_xujTqV~*>4fD=}c*;B@UV04R z*0+CliygVKmIj`Q;19i=Jr~!gCAyLtbaz}1hNu-3gRc-DwM5Sp2mcZm``3(#S|W1c zl!S_ZySHHO4<`xoOgm(f)@-8v=%;A;AdVH%a7ITZORQDJ+P1}~879__x|z|Jx3z7@Xjy@G 
z+D#J?X+#9k_unK2Iapyn4WZgeH5(EyeVjP-;jH|PvG4@!M(iE%k_fQ?%uCjJOsf<8CtG3rmhAf(T!=RNhmh?C!@hGW* z3t_BA#)>lw5e&ft8^RL`wB%frHj;ce?;z@*qB={5Z50`%(0KdgG4Q_Bhj-t^vV)kK^6_k)c`f zQHi086Nd)r)p-)!DD-Zc8px)JheG~91BH_lbRMv5B0f6jlASGxEhDi{rfa{r?n0c4 zsEYTa0>p$UV%<6bYEjkaz&Wg{MYGan#VKm>kQpsTNkb45GgWXz+lHei=GqOS8KwRM zkkCdA3ktQ=<3v<`q$71)sf2I#8Rpxh`NrF5;eh6pczrZA$(!!N+hAC-Q3;pRxbg#; zTR?~X6Lb^w5iR)<@>OjfqCGlMn-j?OvNrD`ov6*^3|PFq08e7x6*E79mxTK=#e6-5 zZa5J{cFV?jg<;J%Q!3O~!8GYSU9^4S6y~f9?I#I;i%=qHm`Y*LKS5Ha%b^pdksIA~ z*Me^QiO9%QgLsx`P??_!GvI$&Bsqa}iX_OToNOmhUb?_(^Z$-J<)z<4fH10%jCu*~ zv)7l@q2Q^k7}4mJYllaps2!1 z`IMAThjGyp-=fn)cZl0ZHj0c?KlW%al#AVX#2oRqrE82~{=u%u9dy`CS|)}OVHsZf zt&Lrpp}ka5AIiRzZQq}%(d;7UXD)+#x5)aUxWeio^Ai+LzJC^RE%U1LOcsSeozo&= zhVqs3vi}mYs5wLX{%FU0;0T!N{`Lk$@)ZXC-5|z`^4&tXVXGbn8Srn7aSJY7ER=b^x zb}Y{X$T;LFN!l$;5Hf`E`Peg+%84&?P=-hY_I{Qesv03jhT_F=jqqXrzh>nzR|znu z2{#nAq5)B)<}liif-WC`y?v>(zxcyB7<{zU1~k#!&cfjYl2}q_z9YSlF%P2Z%(vlz;nFgB z=r;EGvjtRX@-P_8Kqbz^QrXhPc7_n74n55xxTZj!cp^&ddlaCojw_sDa)o-$)>#0w5D+p17asq98W zEJuw|xva+GE*u#cZ_71=U`Q2=s5J8#(Z+1$71XS3FLmadb>_#l=8G}&FA=JNqZpL*BXhotg*I)Thzb{j@9ron z9J-~1r$I*ctHy*vU<(IF63Hu~8G%x76tuE6klDKBv~i#*H&a{*V10MB-kbhU^h;$fsLIq%23tWJrZBP4evwx!1dF8TJjj?UzyRO zat4SVW>SA*#z(DrXnPZY?5?uAK+YnP{bVE9g=t_s3lK$6&dJLu=n@Zagkr2O%F(p# zUXuDL48z3ODJbr_188epa05rAv+oz+80j!0P+vG>=!QD+Wh|Jo2hXj#4N>jBd6NKw zx?b2DW#6835{gVTDy4Nnw~|o=;{`Tc#-d?m?gl@ZndRqJuXRe3xnH~-UogTzlw*X# z4Ani#?{I$(UnrxU?|`hd2q?02!%QYm4mWuL&Ik&(Io!h)W(~=liw;N43GeUa<{Q48 zQD559Nzj>ezNuG?__|d_gM3kCFAnom;nnG^u=c%2BjwNQmHK#^4iw{!&;ech-X4kz zQw*~K2XHP~rXQ^mBT77EHJ0JiW!WFqGE}~Gwj%jhn_Ys~inE`o!G0t2Bn(pL*XU)a z_$V}UJRV+0V{1qZmeh~rh8EbbXf`U3tzE7s_WCOiYRS!DH_SssQjC$3q+z6de15v{ zCf&NEIAUEp-thlLH-E2J?qB_5l366i$cT9iN2{aOESxjKp%T$pC=!PE$vHA%{sr&vqL`f0`G?|YdRJJy z`p39YL~knU#p{X=+Ra0SAc)Ae+*llah-T2}lD_>(!9t5i+|r>3c^WYprc(oe=-@&D za*ymoFMbrmoBb+AaU#p+r0FEk@rpw+)8tnxy&Xs9;L(R@{z(&3vb78hwL9pE=Bu?k zX_8G#=jg*Ue+D^1bTV9zo0z#ZTq3TwW1=IAi>;D-!}Qt_dgvuONycpyx6bsJ4ucib 
z@uDIeR)j`+6G@+_EWJp&e*cKGxdv^>--3MHVcKvt?xJ)ol1_ci6sOj- zZH*K09YxGME<}>~IMjLwAFR+J!6;q@A`87;=F+kOU$PJ;04z1`#>O}8%H0k5K2#eu zQUsG_I1sm0v~Ji=yLj}OLLnAl^5CQ3Ca4HhbT|Y#iqJ(c9t6}2auQH%#Fqfbi=rMj z!ZkScf>V1Gj_`pz-ceKi&}>O4TzaW~1V`UsDQzx9QC2ts(M5QI7!lyHoN>H?_a2KU zijhPX7w^(3Jh3=N=kY`vMUT_sCyRL(8mR1K)Ii@c%-eadVqJ-Hk}yp){mF2h{r)#9 z7P%K93)IwK=r)RONd=)d(}TEXekV3;F#q0%I=&wW*iXU57Zpn{3OZmfMnMbYZ3wqA zd~H>M^#KvaG8Gt{$&(a|#S7Z>)=v2X!dS<1m<2jdxK)dg4$4sy%fp z2*p>7K8&R#XnP~LXfS)imLuh61zmDQ<~k4`*@wD#5c4P4PLb0W6}wMldla#Uf~;1$ zNm!fNtoGlYIUK=zveFl+zH-DYac<`DSbYET z432XD#wdL(cd(L@k|J0kR6h<0nHtDKtEF~FJ@2_koZW^DeG&&BQjN(@>1=3bzA3Xv zf;+33vYfsSfz;by!_Sxb3bNp(sq|30i#$}0pK2g<8#C`x4dh)bng0+|pzak{u3NowW%E_-tJ?co<2`-2!(;2pb@)!*CP(Z6~{ z&#LAXt2+8xA;IS>LIghveSVxT;5*ru?;Gzs!FLKq+*7fDo`6&$$oSlwwaT54L;GYcsuY&&!@Q1;FCiuSw{U)%XTZGcFwSk5%{$`g{WyWyEWr(vV+wers#ADM^F5MF?@!E3iXOctu>>oW-&DlR+&FtCV2$j^#Zz>JV zoObTCve1mO8Rcc?mRE#I7Pc=BMO)(%pITm43cm69QSyItq0hgr$p3eRqYJjcs4y_` z;tJQZi9in)-I!+tixYvGV5lKjToWwPg9Y#=x)S9h&80up!J^H1kzjE;5DA7hK#&+Avyx#e)p)22cFqm<&I@+xc$k;dr?q1ngPnE3-WtKNJlNS1?2QH2)ddGo!7VF- zovp#%XmH&;JU0YeI)rp7(8wuo&e*xZ&Sq+y>!MOdy-OyFLGdR^K@4e#{^b2INU@CC z>g@`DlmcO{@``>c$~!msv_Eh%I!}(C866v#o)2^|M_!H6 zM|q?#s`&nY*b@1Fv}GyEPzO6}RM~U+r5<@PmtX3^=i7qnT1s_>^x^P0BigEnd)c;H z6Kn~r%tO`HjtFye#|8%Tf}Qoj-qv7e1TA+NdqC?#2Ey{3U`tfpud78%yFA#^B3km5 zLHk*ZXMa{U^^sv>JgZgXS>D26ajh88UPIP+8i0{Lq-!=tX8Rcob;bt-kYA#&QnZPv zPEMUWkK!H0#YS8y4(4rgQ7s0z4p;<43&FVnaf;$o-hO}!0Y;U$r2v-zTOy=M!UkaT zfl+9_5_F5eyoi`7+%5vx3a|m-r2yCF>=u!+-Bd^#1CdtJxFE`k5n9ERY(1&5us5$U zxGn|{&ke4t4)$IeTsIdFOM~n5U~k?xgX_?v)Ke^1M;w5PUZrod<6G20?idEy69QB7 zgBvHnEfX%sRR@w6$ypj>EUIDSgcvB7;fnq{p&8eCcwVYFq1`)C@$-G}$W1>NSzyGh zO%eoXWt$@f#O!TkMnKF0bK$CCy0QXx<&40oQ;kdkF$>Lw(=2sl3W!-_E_~cb5J(T7=Sl(0%5$XvX6o*ADgZy$19qBb-{2a+j6PQii1o$DoB-Av zxe~2Ea-{&)Be_z5*~9Y#ShM6x0XefCkrlwY#+{@_8ZC#Xv&G{``O_6nV<}G+;-SPB z;vnuT6<+4TFA~oq>t(j0uX52x6<+Vc>%{X{ur?c>#M~YJX&DatmWt;RU#|;&MQ3d>pk!vd*J&#@Hc^v%ge@p zu?~ejrv?ZwQdYtbs^~L;=d$xs4}84`ewPP+0Qk7P?D!(qGO*%35BgDP3c2i=0DN3t 
zcH9|KB{`euol<3Qsyy%-4}87{-UxhLUUobZYZIusf#~6DclxOZ{(^`6!yfp@9&*NE zb7@>&cDxhwZ5VMe@Lcw^33}RRlS#~lar-@iPqtw>_r(((mCEJMArJX~@W6xUin;7L z&jVjZxMNNBrWN?OyfYj?%vmT(~OxukL}^kc6_JV z*WA~>yuYozyQim@HjEwObu-Vye(~yfUw>Pi_KsV-`J)d3v^zGrowKIB?XwBmdRF3l<@V+opXMc!#0ja*GntCD_Mn*<`w5vI9hV}V>$fa z*K@6E$-_nFAio;lx3_mUi+1Y6C+yulZI1N5cGQ)ONFo?D$NSJb+Oy?xI~YD?QS4I# z7?oY=zM6uQqtq=^#1bcrbo8}nJ>*>+5l+NAt=RZGAmvto@um%5YA9od^CO9{A%7k3f~(ZhyycUf#DE&h_Sx&f3Y#Tg31= zOg^p8C{eX>y|Wn3_0qbH647(L%Nfq~_A;F7y@BCe?=1}HdLLjo*ZVlbx!&I~oXa`J za4x5?Kom=qi_0lyIJcj^)Tcy#n2n!ZUmXM@x&3pugayvaYwsTtJ(r(g@tNEC48ysc z7d-H{7|!KT+Vieb2;<@6(y=KF6Xxl=k*vMLL|Cy`8>{Z`BNA@mmg&~m$QuFTuv{; zxtyCB&gI<6a4zQ;4Civ5WjL4f2E(}={|UkkBm7pua8AF$1K;9-KhAJ&&kGFa^50=N zFIUm1tesrWSq$fLE?_v9)57owMB4qR+XKIz;aooL&ru?`@_za=hVy>9o8er}A%=7M z_ZhB3hi&HxEKbg0_(>kP*Emwa=(!)}Fr3@jz;IsPeumdDdv0Vnm-ADGbNWXa&dd92 zhV$}1=YeYlFan9{h3h@b17E~&?uQl+ypQ2r{&fuJ{`n_{^YRWcoXdHI;hg>xhI9Xn zIZ+f>l$ZPGR1f@Ttbff0tKF{Xvv5k}w-|mlE>2nJ!2jw&KZT7WTu$5r=j#zpPw#&y zx$Sw^18-w-J_GEzYvYR;KA+*&Fr3zsHa)%fpyZZA?=ftg+kY$bGq>k% zhVyzaW&NGYiL!RiU5LGMwB0Jj1#D^E}#Ns|S9K z2cGc2Z)G^|zd!fDf9Zk0?17)8-Xl^qbNQ!x;1vw#{@Kjxm*anBIPbsoK?Ehz+kl_l z4rj4`$MGQ#ocG6tOwK>Be#G%-8P4UO&-xdq-|T@GLKh_xr{QPYe>;IlXXDp|pN-#1 zAQI7^hM$c;NFWmVVKII-{xE?^gfGF*#tEZDa_B8_6@HY6j*^Wpl8?@bBpdHkIJckV z$nz4}cX&4a7DaELWU=wP6ke>ZHa=55PjunbXDB@-FT&nSaBr7ujjCUgb2`IsWH`MA zwdsGr@DRiAVEAN)-^=i?GJKE1$$ytJ{53{@2E*TDc$ncIG5k!1+xuEnuCFotO9UX% zgcp1Y#VR$*i$Ep5E*)teU|A$jbgr9?-ZO?p#yUXh} zo^ECId|Y_K1GkU9kbF+>H7@h@5vTtN2|#kodCmhrnT@ktPMrsy^uWbGp_fDNn;!IE zXYrity++~gxVKK>R4;Uv&90a4FgZN#eV56(jM4uy!ohI4wpF6Q(<_MjJ^@hU%X`hy)$pFA2ulpJmkU*~fA zIuH75Jn&mR@a+tbLcd)vel{QG_Du4?bq~Cm;ZY|4n+)gj|Bd0CemBE8{W}cj^rKEB zZ;ouwQy9+ab%t~Kd_K?VcQSg8Kh1D1=XHj2`r{1e^j~K4a88eZtUYJBVhrc>tqkY# z*E5{c4|?FgVK|p_t4CZKC(RV`VGi!?c4eP*qcFj7nv+m+>%D;CoW7pnygx4Uz;9tVuXph;_Nnr6`3D$1uiuv$ z&gqYO;Dw{I<>GQCF`UbZFgyZQyPY?9;Fo*gD?IQG4Ci`p_Q3CC_^^1(a9*zeVmO!o zD#N)P@sYanKbOPj6s{)BuV6U$Ki_ZX_(n$0?Yy1gT<`r1 
z=jGkUa4v`MxAXDq6-Lkf@U92W-ydDY{I*uQb3}jN#_(1~Pyb*MCFN!*{nhx{_#%dPGrW}qAQ3%%YhlyV_X?EU z_(d#Ea{6Wu{JRWa33;~s`xt%&!v`6@l;NK+oY(s#G7yO_9Ix`gdl=pc9^0OOWOxh1 z?_)U4C2aarRlFiM(U~qAZ(=y@nc4Wy6;9%M7|zd!aQr0I-$_2rg={&$WpRM}pYO+V zTx0E-<7a!|=PR7_euL=^;hqxVswm=_`X(j9m1*KRDgH`ai4@2n!wb|sU5+vQM23t1 zUk{#1zM6`PT>Cu^;bR3N|NLCF6T%A_zERa9;j}Mmr#l#K&s`~xzI&iV^!6SpJ=^DG z2)EBD5?;*wb}9iX<*Pyx{RA7tHO}y2hWn{;^9X32e?|8w(w5$i^rbpLEE z?9lsX6Z2E%)B1X-j+{Elo+YO`EuV%@JVsVrmPqA#3PJvG`K~(89@yspBv4lM-^`4U zRYuzVxs-n)Mt;ov^C`@An*8%|4m()>g;?a_?JvUO)SLSwBUPPktK;--5YGOuY+r?b zz6u|^OvAq)G*vi6xv;&nS)P{XA>(hkr!VJdO2A__$*H10Ig*+_I60GXN0su)Q>mC>!A-bd!rKM0)CXX(EcG`aLg+XVYC{iNHi zpWgT7(qBX%63sPua+m*?$jqgGOAh^GKfrQdE? z1j2I zvA-j>ad-JUJ@g;zl^pcXQc@!Q#Ov1o4<7nMrxBO>w~^$n`0Lhxi--P=amh5Ocu7AM z$*rH>@8Uo4u27MR%V0=8P39o(ZGPgRMEhZM?~WgDhbUM58%Xk2{B_&^Cotz~KlG1% zQ(7l4!hbYRwDsHVrvMW|(vQoq^uT2K7#fC3T^yo)yIlGQmHt6RO#Y*Jr(6FSO8*Q+ zXI~B~{j?_I$>t}UDYba;pZ8UnACnj7ugzy)|5@?7{l7)=KkCYG`~OGa&sF|`HB#jO zWg=1eX{@vDxBdUThyIpQ;!^)!lDrjv-TLW$crN|E^^$3$S|pKv>Jo1KC!>FnFI>g+ zwb08{q&|6Z{<`%C!JkY2#sSH4x8fxIwD<1TU+1C!C8b~dn@dUtUEKPk9{S5}kWBV> z45WWDes28(UixQAe*1fJ`7}~L{d)(w%D?U=$yhm5*5%Vk z{q+7dm;QrF|3^x5e4l6BWOU-M3ruyK$S(#!pxL?(z>R{mO{Eu6buY?N6lAHhk0!Im;eETMBktE~8fwF16i?)Q1l%V)7JeUb6>`{W!gloM4N9t6Sh7o zd3Ijd`ZOGSCqK~{v7X#V1mW1u{IK<2*xGZ&l_}<+FJnQKboOJ^3s603=g*_sWG14)vT$N?u;$iJI?oz8VkE zVL>8?#RNm0ogL*V@cf!A4Ou%D^nDIbvTf`OD;2W#B@$#ri^gfnGYcVusZ<(#R|rJ; zT6(z_hF8>(%gB{fLUsn=99c-2B4uu4j$LfTKd%NJ8(dE0v^;XB^!Qy=Y|7*F%*12t zyoK3IDiPy6mpsdbrKHp(eoa;_Ur^JkFYu&C8L{{6I%Uj3S&35G8?s(^N(oAB(mTi*#@sX=WCudlUX=ZMB$!r; zNN+kTejrT|vJRlXAh`Xc+g@jw9@35fSRc6GQhZljsk$4Zjys%4;sR6zZEFp_R0vh! 
zv?}3eby$m33h}Nt-)268608U z`Wc10t`@XO>!9BET9tK#3+>@vDq+()IB)LOf|K$Btk#6Isn%ai>y5DWK(H6afgpl* zVr>g{<7szKa2c?&1LLIi{I7ETGxX`|Sv~mWgX)6zaLN{yLZhIqLqNn}qARk+CfZ8r3N(X7@$Gug!vOB{ z>G8{lQZX(Gj-Z+`zRG$Bb)%>SHXTZbjTYaIw}x;vFs;u~Qx>$`5K7z~B%c|{^%rXa zws;QPz&faX8p-Q@u%^YwFmoHRT{%>{7VLO)2#M%n5uxPzViKVu2Q;AFmOPu5W0roT z$6NdR`yF|$+KxAdP~8rrR0>1!7xkWt zF+z(Z9=;Fu!X?y`?Ac0gHnvf`Wla(5AW*&y#oyI?-T*B*WP}lWD_8G%nW$s^KE3BQ zb(a$j8Fok{8GK7(Z@@#}`{*5vZ8R)ofkv`Aj}t&(pek9#N4@6=EaXfg-es{OgvzZ{d6;6G@c{?K53=T9f|COhmNPi_$IP31 zY`pMJ9-9S|)4B(-mVTzk^QlTDembC1P+Mc}i1j8FR;GR=R-jilcu>^hGd2^U-Q@P5 z=b{MNqP5&$BEn8J=CKHB1LY0J}TRFpv)$&TPETn?F;@xI7_G9L|-VuR{+wU!*c@1`2`&3iB~j_oev zERZ3T4MI1gL1s^1`?)l>a#380qHttvAtX$b!aZRXB#bQ@8nv;x3<<^) z=@>!LC*aJVDwEM^nlk<{I78M4ru9MJqLcEVc(-abG%#lh8r|3fFgqM$m`J1)OA$@c zGdWeJ4CO?lA=EJ*B0KaxyN*%pML!nXsY-}S$GjL}N{?TQ`W!}2U&zyizR4noGys^2 zAu453<#3DL%dpxQ;${Iz3UCuByVVSirtFtSf>ZRhPk++ik6ynhRPuM~G=2_`P^Uqm zpQ>AtiQZy-3kJn|xjC5DdoX>+-XU-aWs8~w6IUcRbvrukuGC$WqY$usNA_FRt+fE>vRBIjo6^BaSvVCFgkwdaQed!($%cz#P>J3vKz0xpAstt9n#h30yIae*JAR*P@fgPd-$BplCXL{?ClpkX* z+VEn>pyC+Z43{rD+~1$lK88r19;xaBylefsuJ6R_koCdVC%4v*brQg zrwa{`UNm9YW;{KC z51Pk@;70r!;|>^y4ursCqMe3H-qjz(z%i7Zy46euC#KhPDwAW*7bR;KDlcoPhGfix z>M7}3LH(Nu6x>3n@lnl+J&lPVH9psn%q+#wZsqsT%t!bz7AvFi{0Otu435{=1`$4P zg;!<_ea^z@-0%nzV1s z(j~~})9{1JCbB(#<rPX>o6`ikV5X=sS)*6%i)if2IJs!Wdl;sQlz zkd;6c=mxt$ccVZTqd;?Ev{|~CN_9FinVNz|-W%&bZsjN%a<4%}r}@a})Q->k`we8c z8wh1IwFv<7#Fw$K$~r`gs4w#>8J5E4MXcYVOIu_lrX;Sfi9o~@Gf|Gmg|$Wk^=?Ek z6yJ_XQ8C(XKsR3oK|VF(gGOP29)AZ?^+$};tS7g>ou~KgLucRp3D!0@JQeDBO^?5Y zXR40*kfGoH7@mySK3^o+m=~tD2Pe-+E{^3#Fi|OtSRbH*gssKFK(%!kb+s5H_S>IB zpM~-)4x`+rl)P4L?XI$ZZ~ER0C)@Loy8d36^!OONVSIQODkAn@1b4Hq}gshoPd?~#lJ?opp^%-h%n$Hf+@fj)9m;5 zlfe^4(?V-SSVVfEVynpu`Cg()asf^MCr3b15Gr}iNaR!hb^t`x)@LDWkBVm@>j)YK z+6sj}E(0U>kK9o1FR5oY5(@$tboh+qm`lmes`Tc!tPaC)j1gyzHuVR8Yv!V>4*8zN zx@F=*6c6Xj(EAqDU~EbYZ5RjkopI)I0VmfYEj<;YnS=6)-9=J4+rvbpplIiZNTe~B z@?4}pxM+0l;$ZQP|IVTQv&Lx`q$>;;6_};p(ASPdNmg5Lfmx09C`eE~7Ld!PP?%VZ zrG#W_&_^+o 
z<^|B97wA0}1siDEid7N}ez3fu+=h6K&~`m9gV>d;7L^MXWg`(Lug#0S?Tb**Ma{Z{ zA}>c;va%5UF?pnry-=PU^HXpjVl6pNL)gA~IznI-jaZ%o9m*u*%ZPPoKDrU~$5aT# z_&Llv_AG~pp=%ifkK&@y7|hJBe=3aFsVTkoz5E`6vyX4qwH;qL%Bsvi3L)wKi03{mZcaRBi3z{?}b^F}<;??G)9-CkI(Cd=W58 z-|hNCwRNBxt3@y)kZ$9sM|ccl3SHE-Hex+(7jJEJDVbXsDxF;ut)jYxs=Tl^dX7`8 zQO(Xl4`ElcvA8v@zahu0F{P`d#lEgTpxQa`{wJBdYm;gHK8@W240PW~18Z%WKYhT+Ko8eEjsuZg1^Kc9M3({QW8LsnOX+~mYO{y@HekB@8N_CSk2g|sKTmxiX zL7~jaUoIxMgQHXG2}DMe-DLD`MX4E(NQH+w)U3b1UF~-8RtLsx2rQ@5q_F@2`b};Z zbrgQBKUQIFzVVxRctBx<8^8502naWRcQk-1>!UClPJZk$>hXM4*7Li~4FyQh^!OCw zZrpZ95WlfC+X^s(>Ow#m$$MOM6Nj+Z!fI^PvO;4;(&{Q*m z)?~ekIo(4s6ORjM)QR@}9+z~Tu?RYH^q!3@?v62_$msY?=@JsO_EjWKdT`EGMPyQ129Mvs33$c)Af3FQs= zNbh-!Xk$BzG9fU1{vZ*J=V3m~nT0Jd3Y{rtD&T;1_I{gjnQ?jQ|Db%Ed&72FMZ_71hZy}hdO@DCyHG16`v`aIJ3YWPmbEl)&$l((QfvHmVwGPaO6#B2}K z7yyI$bbC$xj62zGI1pKU$kz7|>3g5_{S*3j+4`y*pZ_5o$TZLwZv2Gxp%5R>qhfoE z^esrNw{^WYhgBh)8Etn_#+>y+Og!W0#mN!KKP2BnMj8te53o<>XG&(#<%3*`c|qwX zFm89+xL3%yyKHBK8~@41B_a>!QT~6ed_jsZVcfuBm+Uf0ms*|54!yS}Y$*#LkOqYYQLHq~)jYkQsM#+EJv*07}MqA7JN zmR*yUrA^0{Ut9aiz{)0PGcZsApmX_omyoPLv^~(YD%#Z9*3cTbx~sXlsZ(tgu3XyG z2FmuXD2|Q!Wtw&bV-bC1MZNyxCwEt@BFMU99m-dKWd5br3$eXNx z*}>g`=+NC2Q9NsS$f1X&_CpyxwAv5(^w2>MvA2BDvv3k8^fjaVFyh0bm@eSX*EMo? 
z#S$XOtoIv_({stY%Cv8H;~~r|>`bP&p9er|*4t_A6LW}Y5HrUZZH4Q7Gcmb;OKnMj z;6$5{7YX%|_tUO;ILRn)ZW@mx@7m)SZIZY%iCQ058?4vIEe>|*@-u{V9zidCuS(NL}XiJH$*SSiEW zXQg<_3Z6jsugt#@*2S;JV8ptgrm^HG5P|~vz5sx7BgpYQjh>rLz5O_qpuC7@_P0I` z{?^B>!FD|LR~`6XAM6I@06)cZS=Z~pg)YcoVN_-Xq=5^@E)ja>BA^>p#JFMOU=-T| zT^M&zy0XzW2{6`E@Vz8hjPF*Ai`hzi!%VhvfRSc0~ z#D(8u?7bm)bk>`%1W&g0X8Z7iI)D_r#CQF$NP{PvoU8;}U7VK7}pZ``bvc3EHX@E3S9~rw_{U#ou-(;5_O5T*CkGyLe9?88Z z_*U|0G}yuR!;&p@ZXs;_Ei<&|B~PR{Q(LCMxuE;+1EqbOOYJ%iJnKX(_Ns?CPeV*| z=qMM9!G_a7CR=!h;W09OAs@hj!1P(JJi$gdF9Tq^hOEyp$2Q@y2Fz$4w39#Jun(Sv zi32%t`0F9C8gs#WMU649WrWa|g^>@c+`kztwnO+FSAi!}#>51bg>=a1flPxPvKz-Hel$wt{#opZ&^@!M7suqKO z{9-(hK{^fRF(BttnS4s8DA87^KeFI7+UhAZC@}Be@$T`~3piJc!#(@+V|#ThXyuat z^Z4h?Lro{(%0-VH)Hc=#R%Zfza5!b~BlP%X%In z^^uipVy_>G?frCW-nvNeWULkJ?EYi~@mz}g?bgd?a_*<0<9Ej1JYsZz;?r;I#$+Hl z<4{@m=UCA`0}q&omTbq#oo+10tbLHC3cJ_#Kt8E44u+G2B)O{{Fi2LD!uW02J2 zM;tP&%32e1)-gc3{KoFeT2AK>Y0whkDMq4tvJ1DqO?ilYK8T}_A{%x`$a5o1%$*AM z0U*_pEq2n@JOB<_wxvcfWYGA#ywQPWC~Pl}fZbTr%7?KA_zbAjdsd?B?_Lu;8Jk7x zDf>Nt*W>$XOad+8-8;6U|JtfTo7$l5i*&J3YGsI0lzRP~iwhuPoYyqU0f7DDi`C&~l|Y7ugV%hAXJrDR!v2KBOMU zIGfiiKupY3RYGthw)t1C;R+f51t8W&RrKI))V=LQRC2I6jx}gjY3SeEWm*SF^KElz zLV$`HdwVEV?zQW&ZV2J)z(c|fHl^gn)J)b)%?)i$?-`9myIwpCPpNv*574d`)EQE} z=-CIpE%|ubiV{HQO^#;grGAf;UNqw|z@4^%^Kb(iK=B|A zZ}YSa<3pdSz_yMAp~?k!3S5fV2V5DsJT-)i$xLS;?%4MZRgP`s?9>%-XRj(BibTRc zT&>l914qKtAvjY#lSKhgXSImUu>G2T)8I0)s4jH|mo)a93@(za{9t7fsFGXBM9vWw zVIaV`EcG&tFOTXGsh^rX^MeEY(AP4|*is0TJt_Q124fWwN|}Fafx}6BskzWf>*&bw zvssOnN@yFM>RWJT$6g5}qtgY;0^y}1KRo18( ziE?Hz^&uAj((5C3eLl~rFHhjc*v8H8Txws+9;)9$4mXV*raYX%mK%3G-FG+d>apT^?Zt(Fq)Ibc!b1nB`l@ zJ(o{vb;f=!x;{LADRZI^C~}hPVLg?yDukQRv=zC_PVdbAXS%S--(RTV1|nxqxX6_! 
z#&H=QL?%eOcxsYf2|!XA#C@p}UN14AZvKWZF-VKofL|mki|)Cys|l+1vdUrMWKcsM~|;Xz0B?`K8OSJ zs;v)He&H{O!e$vfFi=`52W_Pn$8#p5CK{XbVKIDyji4#$`x1KQZ2%$xOlLI`mH8l6 zYXUUuB&M{U6rI5UVKH9sbi7OZb5lxQ#ABa{ISAA0k5rt0u;P@w=p{Wk+PWJ}9ZY$U z%j$KefzEuOBLE?Rp>(mIhGHif#BG5atNR#TN32h)t@o>}KkdJf`oDuuqjvoP=5_xg zFM5uV=o)JzZaN#MVrkThNz6ffKF5C5E}Hpaj*5L_ZUI~g;J$ra58uLZJs%wlMdNuD zCth5&5hMT5`BWuCr3due_Cgz`iiHQC_C>oB^WetvWZp*rY4DU7PF}WlAVYI_Y=DX| zne&yH$ERSnCvGI)avFAyBEgIgp&+#cg+=*F%)^2B>B-TKT$2SuDl?+cEWGrJ>Sc?D zTc?r_)sfP~JRBFnqnWrum7^ME8h9HG`V`B|{KUKwlqbc>VCqejtm+@Dt@o;}BURR( zi1lHJN?<5@WaHrUQ!o{$l?*O$?ri(QC~zoK5?)7N#q$VNnUDz@UnRzpSJHJ)o#0os zfy~CrBXip|5XKvcTkIOBxE4AJ22_}-WWUmTUs~_V$r;*}#m;@|F(!cBIKBopaeydA zMdvg|oUs_#e2ng!&)4*xe6Xj6a^Z9nJ&dRJB#aLyW}@z$45Y8jX#!cZNEM!J1UuUf z#q_F!z8&M5As18A>-xq&0dIrp=1QfGh> z@GFiPx}l|b@f%EhsGp@Is@H_|2WO522A9Vja`MpO4+h>fck4gI@MQCt{rT5}ee)@6&0LuK5VL<5O=#fj@8 z1LiyGt;|Fy4|B205L~og1d5N44@&fx+ zb!O?|l}ik~I#v3L-g7(H=?zYh6rz(PY3L-MnUx%|)<|4d7)q=jZTdbmtiKthyI1^} zkBM>j7@||2sc>Rykn*D%Hbvq<>UuOh6_+T2jRzHMB6So9KgPCuRm+zg`#kBTaeae$n^!ZJA+*XykkfQ4*YW>L8#eNPGN%=$Ju1po6>6hR0(lh?UaQ@JwmE2BdC@lLn7K zY4L|RY-Sqj>f`RlDOtp;f(>O;cvHsR&G= z4bX_SK3K!Q8}Q}>J>QMN4r`$eZyLoyVHme0f{p~Mqd*a?R2D_hNfzL&DE!j_3Us(Q zR)N3hfU~>o5*%xaSoa1CVOrV2U1Rm06f*i)0D|@pp?<;oZE|M6b)e+MLm%LE%nxE8 z7wBv6OAF+XwjDntjZ${vH+G~%?>PlF$Bs1b4eIemT? zUU%LW?1rLUs7W~FcrROE!fR!rcs~|;DL=9Iee{A3w$iq7G8UE1Nb(`}z^vrrAE3=+ z{4zT^qA!9ZI^`Tru4k#-{IP>hMh9>(Cwf6|VnrUU>B1({>Q^O`TnS@7^F>Ej#h%W? 
zA6|&yxMP0T%lq@Wzs5mZboCLOpeu|HtHRm9{l6j{16?|_-viqr>j~JYS+BA@tb>(d zkNhWa7GA|5FM`1O+Y?mll|nva(~NYe#jwF}+)9VdrcgyH;tc~HydPo~I+Ka5jJ+6* z(;>eyoUp*zM#$w<8l%%RgD?j4M-d&-im=;@Z|tKsql*6>GZZ5cD%3|l6x@k>oD&?m zE%*}dVsGT%2Rlwj$FR@~WgBP$l&n8i+nYx#Y09C4jrs)6V#&b=lRcRxDp$cTH1U zQ)ff8y%TqMY*@akZbehHtD~;DYX$z&Ox?1!=FWyX{N)*N%gcv8@i=s@$7{p2ub@X8 zshyymsGY2xqMfRprj6B3*Ur$+)Xvt%Y3FL^Y3IY*N!kV4MOvwLF)G%j+@q3HsmRq; zfcueJjy4KuCun)vXvjYiefCMvax!XEA#|OB>hV=*JQcm-80b9>UEj!~e-8N11^;;Pp9lU4;6Gn0(2AkG1llJ;`y^q|f=#Gt~{}qg7T^d?Qd)Icxs7z?2CWOehXaES^|Wd_l?NK+#{y4*5eA^JiKP5KQ z^q1vq_lK|Y*OdDgh5YrE{-x#qj@kb3e1A=-c1Uvs&C(Hwt$HN3EFn14!@zu52> z!B1&Vhp1jI#^)XKF|HTri+zgnls-i{rF~k{;$JkA?L=lx|0=`Z?aL|iPl4av^8GcM z7KQ8x1p-`isR6M3stp-=gk z^j0Ia3Td;EM*erw$p5pDmSbkk!Svxd&9=l@;3DTbrB+2;36Us8Mm*Fxru-dsY9qCO zTz65BKI=uzyA$QI5wZqiq+`0^t+aWiyvD39G=}Hw193S7RNDf|;mU<@<>fG9 zInrhzZKj=8iL^OLqm0b6l~f|F4hGf&%kk8*0o1c?(?bsRoU~1d*&=S_%tb=w;08ZD zXR~cAd4N1h#u_l_Dq*aX5i+*YHa6#G+gbyRCK#!FXb!oKIy+ot2_olAD{bZ4vKDY_ZuFbdtZ)!X^p?5fjaa0oW}t@}I^n1uU@gD~fYfJG1YDW5S%ikSBCk8CZ@{%2?akfBGUa6FwIgSq ze^mq?p5b4GVdCZfRWtCghz2+vIXC!MndzrUw%(e?QIn}Y;JP;LTh}-$Gd6|nlXDyM z{OeAR`g>2V$1lcFC$G+$nVGg&CEWE*Sb5GwA?|n3Pq5MSyZ+wmuDA)Q*wEh zk`1?mvnwf=XD2QYkGVVp%7$mpVJ^>RveDD*YVeiIvz=`CsKFo*nulh)axpW^cI9HG znC;5N%rV=QiprZ8km}k8K?XDDgYnmwazusxxsHP&x-|M-{xYTob94{bG9oN z^XP0>F6PzQu3TY{OwYyKJKL3uxp=lK7xQ!XrE*HMz=78Uex%v$z-tme4lZAse`>I& zjDIs9JoYofPwSyl{)v6?j``!GEdg7vg&c*{S{Es$X?85(IM2R7d8!VlT_Mi+j>#y7cef^*eM4^uc_xu6@BV^6{Wo@*ye0A7%#d!awok`2Fw zDR#97HUEPGqlO~l+)+R(Mis`F_z9!PwMf5d^zBu&)x|>2@ z7t&XWqGs|ErdL8rn4^TLOPF$^LRef03n(ci2@{nnmNSthOk_nWkqIaA{Uk-mA}1=5 z$~I=5#H^E8$s~odm`O}O=|at8%WP>AUfZYbGi;#rA0#ze zNB`LeHv8JOy2{AxawAeVd;0Wwq1w7yqdXF-(^f>0$xf|#W#_VJlh(9qSu|~vJCpb? 
zE9&h3vI4(t*Ee-Wc>~e$pPRm;YsJz!UCFx`_}N1erkNZO@dX%RL!8RiR_cfB8UvW-EHMth!|vPEKob z3B@sjW3NWCS-a$raZ6ADo$agbIyu<9EG*aJZ=y7{)-9)a%vED~Lu+e$W7^uzCK#yV zEQi>-Xea7>Q^rT`ETg82*nHHmYfxfbC<3-Ndxdc2k=k9|@@0*k?M_zFIGdMUgW*tXE7~lANm{l#QbU0x4Y{4$ z-qF;C`rX;oz^8CL=^9(xS2RgNHFP}59zkMSqgCRA<3fBnBTK@=0(VCCZhB|^(2Wx> zU4+lT$B{pRKwLEBrCGOwpK8y$X(g>%;9s|KqNmx5LqFNZN&YN>PZjuVf#a4hl7peI zR?kn$KDu@A9vdebx*YsI^~mRMB>p=acgFk<{Q(G|9SNqZQ2nIktM$dqSr~_2 zt>Ld!oQ8COLMl7P$1=4iEfX(Jo|sxKQBKcR2Fb3S8FDdpvMj_oR#bFX{j0fq(3Q z)5`>Ox#d)O;7uNQw+H?c4}6;k{#Os2)=9J3SxAQCa@+q6flL3-6u9*NO#+wYOYeL* z@}-FvO#;74(7z#Y>4*HGYz~*N)O)tTrQV+lT*^7%fe*6>KvX<3 zPSQFiU2a_ck*UFWi3j}^0+;sREO2T6lLD9akD@`OZI8syvvGGna*2&ovp*dlr(C8B zIkF#_C2$$H?Na`(N76J4*s0LX^qIi|0M890;jh=>7t*>_&D_M z2>b$pj}i4smdiJ6oY*cD^z^?y(&hHgl{QYkm-D+eA?G5bIrdyH<5-Db=YfA);8OlC z1TN+LMc|VDJ%LO5)5UmI($jgJZ2nv5LA>DKB&py^O<7zu=Y=VroU-fqN$|(VhieM1LJV4*qijaS`5$kAur~j1>a^ z6!&xyeH0&u-r2t+yi4F0i+-d*;MD?`dS(B0y`bMC=vN9{_EYr&m+Q~c&gr6jr9JHe zmvM>y_%>aXFNr(jYQih=ar{7kuaz#sSK;H}|1EG?jt4z(OZ02f{@)8++Vj2#K8_5* z<+i_6;L@JO0+;gT{*I*owV;>z-6L=*=L3ODIio%L({lwb>E*hyY)276FY&m*Wx4-S z;8OmR0+;3clE5XsT+fmCXM$eJ(M3Ni=`Rqt9RC;|c(n(9nFrn~aH&_WS4;d3K`-N~ zT*sFAdP>ksIe!+oltcg1r&B+roZO)qT*}b}F3a~+4}7w~rQWayewo0f|C>DURRWiG zdac*pBj_uUMJGNyB5*0^Hv*Ua-OB=(et5$JZx!`$wa`nklrCzPvYkG{sKVQE@7U>_ zYasd?1uoP<21`$7R75IA|_$4Av zN_u+FfiALB;@=hcH<9Mpxk=zH0^cFU_!K+y zgnw1w+JqIWm*fAa!*8@x{VsLxqWB-?vYhCUiEnHcTLNg2y?QtEWOn*d1c73p7Wd&id6HQiX;%i|4THf39HWFU z%Ss2M=~Uwv8SFmm{Ut1QvmT3y%#%KZoy8rlvy-m#*k8=xKAP>$ze_%y0nPqr%rbRm zV2OW80a?rZm}O*hsVB4hNcI;yi*oOEV*1#)2OXaNQf!$czh8^ydo?bfEHb$mixMz{a<{i)#e;UoJP5>`fESuUcL*Xej5 z%nbCNJ4{5l-tH)U}o2Bd-n`q~fBzBf4mB ziRhd@*XfI{0hUcaty$0&Q9tQ#xBjbK@{El17aYOfR|Ml7#a^916WVrn| z4b0i>57-N1^>)J`|Is+j(eKp1n>_S8e_!bZN3s3u)=%eovgz->PBHY_e59Xv-TEK2 z^}GEy#U?Da`JEq!A8dTl-`=BpxBoh9Lg&3}n|#37ld!kIoX!3MT8zWB(H>Nj{U_k# z=y&`VKsja8|NKbew*PjVy5(=T|4s#eHvOCI3F!fwm-L^Ak6Zs75B-~n7?<;&w|W{l z_9iUip?{;@@Onl4p>efa|2IAK*91tq{ddRIEq}Z7Py1-u@?X40F>SYbDgQK1ck8GB 
zRgwJfu0Q+FP>hS!Px_l^>2iJ#c<_gVijK}E%FEI3@YCN5pnG@y+hp@Q@4-7V-1#p7 zf42M|u=8Ig>fdP);^=qk-wF@?f$>VC^WM9ce%klXroZ@`io@AYCjZkKzgz!<9{Ou8 zQv7x^QV;lc=l>TT`ZuptEce-*r2lk$-1^`3(Eq%x-+BMu7BfgcoioUm|N3=`qu4H3 z(tk!4{XsBiEC21ZNsY^S?_E6&&i`2OXVc$%yW*&~8A<<{__*_bg@^uPLoqt<%iCfG z<$r;P{sXrDMQ9&%k$z{aMbA$CTWjmL$ta1hjbZicZkGWCr@!fZA^C4q8c{c|+x*W{ zqPU#;@ULBGdk6^Sg0}&%uA=k^k3i{$9D50w$;Y z9DeHG@lPvr0Q)!m9d$$h0Rmm*|MT#1_#OXKvfKB7w2lSzt!ecm)GzsGIzDcG8dISO i47#4bT0#G2ub(&s?)+0XmyLhZcNND=lsR#^`TqwA*wIn| literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel.o b/third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel.o new file mode 100644 index 0000000000000000000000000000000000000000..fe853e4387c82efdeeec86f6936633d7c08864da GIT binary patch literal 29368 zcmdsfe|!|xx%ciSLX>nT+SJAt%GOS8s1y@H8kE*dm?0ZkAmj%>l(Gqsn8c8z$p)iF zjorjD@30mtZ?E^3_U6;q&waTqSL`eG14z@vU*OLI+SXdzM*BloMNwOVTFv|Yo|)M_ z*<@(%z5m^5vuEaep7WgNJkN8U^PDq-w;GY90l(kpUVh(Ked#9`_4#_5)Ayy~w$%4| zpX#eMkA%&?7C&AcHa`f*-^j5#Bj!V|k{}#^BqwaX88&}^&DWBm1%I^F73L|eRx_VI z5aMHvVLnnDHh)oVbuQM;*T6+Iwz_rk^n2ZW$pOL<^BuuVTd#dh+u*R&))?ktZM9)O zBbaJ6n)w{~{ne2tZ1t;s=*QZwZey`e-NE9JdN+%m>L3aoP@zcnsqn#Sua=H6QGb64|FdPo!5Z0LxL{)~GXe{DgFKy{hVHUTBy}9jxaX<|v9H zKnpM^a48vDcYbye8ZTCcYAZ1`F;FUz&meC(SGOYSOw1GmC+Wv9iL+$6^KI$-lAcpy#&0wxzu=}F3ibN$?7CNmB^Hp{( zSE(LVDR@*De8t#Ooh>%Zw;>9N$Z##j?eHSUDe9d-MEt>X!nEx-0N*jLR0ld@kt0|XXc*Ng`--qzqI-;CA{b4e}HE>+% zFT{k@)|f|);&E-YX8uyEHOxn~nhNu8wTZcBSL^1Jbei~RR;(?4+8fJPUpGSXFCv>$;{e>`;iQ z%~_90Jj`M%yfjxGIL`T7fVs{{U|8WAMhHMNF=zA8td#YtGO!;?weH#uBHg+sro}(> z>(*V2GrDzCtFlv9OLXi0emvWOp+=1`Iv~Ks)UXa-MOyp=ztVqhE?Z!d!?{H0$VY#r zRxkb{YW^a01zJ_&gQWwVBTgln3ym<&5atBIE8$hOo|}MY}~TXgVP{%*!&%QLzuI#S1s`= z1EU`klwnZ*#J**t|ApIHYrb|M8;uSw3r(wQ@kP)>30hN9)v%UgxbF#0Ut5crR4{O& zXJKvFs!?<7R;bnSRA9|#8)a`*zTl&cyd2P~*xYuitF>#Wr#$V!*;Mky>k$nXJ*j*n zOgrdmfj$GXR8w|7I`Zml?d(T2<$(_wWsQNymHvNWi%_e&AbBCE8+YhdmuoTGfZw$Z z_aN;hY-cTLrH-`tNB&e~hXG?WAaw4;^n7KY>Z9@T2>LIi$uz4YkLIGA#BW;fR|X>I 
zKsu})t}>HX06_mjvT>#Vi)W|w{{>h$LW4o!Ek4b$@7D47Yk8Xg87*<& zo#ze=!sF|-+g|N>^Tw!l>?0_nm1`kFXH4Ckd?En;y=q8H<97ng7bRcSAx}ElS&w#3$;moh2Pb{Hbm6|eyzD~oF{|31jQ8f`j ziyuc1wFf&dntrl&HPso?tgd`b**iA!=fG-fKIftC^g!<7$ru<``j4LF^Ep%IWlRb& zxw^lOfnjuL5(!(2)v=gbNuQs|4$Bj{m)k*G*}F1d&%zW$-`YqrYiVLZ01^8jTm~r= z?AaxAeFu)Sr zQI^n>gg$*dei}yjA*9w0&@d%G;LIG`u4&pW^=$4=qgHJB5#rQAYS z%HKBML)TdCg?OaHJrW=*K8kijH%p>X=Z5(K45=M^UC&ytZf@~ACN<3G!3CyX4TUpC z%{P?2OQ8^kw2{$(Zthb1z)!1$ZEI|Y5FOh32gY2Z8j6koF|*qi1WAfj{CJBW9wMBn z#fNCrTB}uI0T6tZy|?5;D+>}eC3%np@u!MC3Gq@LakWM9W`gY0_FQDSZO3+064BY_HlQHm6W|xYEq2K@^ zgt4A%WoTJti=Vm^1DG%sR;8wb?oI`&cY)#m4F=5V(2*KL@->G)=3xO=l5Ho@1Jw2$ z3iBN}7W|1>s_Rd>_1&YWqO)4}KV{P%5yf4Pps?bH#Dg`A0$tt3l@o3Dpjd4^xDSPO z|8Ca7Huv8rD&sHmxe7}rb7{-k#m7~sx9-7RmHFBlbdE*5dNo!ewu=vvqDqW`ri9Tu zASo&+5Cmp==?Y8&JI^KohXvAfWf#&uNrk9r@dIHvVL+X)TNU~6y`O6%F9oz!z;x(b z?ityj%NPmn1YKG2l&bEcWY&YWrtYVvpp;=z+(o5STK7|NR-B50D&n@@ENWv~$J`N>eWq6f=jczh@<-!DMTc+uW1?P~BW!;VL-59ajV1Os+J5iv%gQ`&0BCA&Cxi{dwR_#u+g-vbxIcnTOhdryy#Cr_vC z<%B~WV;19JT+K5EPAL6HPN5aD6m%D|lLw?Kn)#$Q=agoSX(O-CX0}lffoEy{C$(cA zF!^5m`t`Oxbn{iN#@J9p?FD6tx#x)GL5i(hChYhM^H#x3y6%4q%l|F;Sy)Z|<7?2k z0}2ph660YeuJj*=muc}w5#zZ`Kw71L7DUzJv&H$fqS%j*G0Vn5$L7;OwV2QjeM+_qv+*GdEk%&12&Dff;lOW^}+#8?^X) zvynCY975=3t}0ZhTe&Koed?@1oFX7`D1Q7{68X{M7QeHE!-bx3eUT2k9#T}Vh7+&0 zCm^%Imjl?at%nh@mZ!`GIZLKY7c_Ik%(i6`VQaoU$KiMdwTRJ3V?}@HVrf3yV!_fx zWO`mIFr-$D8h6E@Z=}{+BN2T|IDj_x2Y7(_%aK24W3tY9*GN?7C4Yvkgl&J@9_#-y zK91%+&{H~17l#T%%m`C9JWLyk`Eh{w!H?rd+P@{&zvI?J1?s_^jOt5K?Z(Hc>Zhmc zC-(dEw_U!ryg$IRZa$Wh>)TR29d9R|&1X{819)`$wcB4{`=6Sk$N?b6W!Ha^nN@mU zn*0==U44=N2?z@RO}BqhNRIetJN|o_^{=MuxmSBEv;HRlxZBU?_N6rWMLa@xxNyE8 z&Q#n}hNo(s^94-84K=z|y%vsAhk4Ot@I2wyioT*Uep8blY;rLd^1w4EHUm2=5e1c< zhp}62RcC4D32pd9wldI*$z{G4fBA#H_c8Bo!c%GAhXG~ha6dK}LwFQt6o&a&h56U8 z`5~eU$l4hz9^4M`kfrDAi`J_-O8Ri7Ok?0TILE;NE6m^FfDn_zj^F0yYkpMspkFi+Z&7HJ1yi*c>m)mi{RQb=wY_ z*J`N|hn1Mqc{)-!^Y^rLfjET34MI~)wU1qO@!CanHH4!)ECKer1x_{`UA`uf!eKbUm%u>10mFNs$Y!5H~*WM@F9jTbUQQUU(WoeCqe(4gMBFJ(8?I@o|l(ranM9%tetR7l2p8X8827@DQn 
zgN#2?iX;Q)<9D4qeYZPWBp8G$y3=aIK(c{>VrT(iyX*KzIm$o~Hf5eWQ%qnG6rT`E z9Jt74`GELi)sCeuz{`ql~6wewQXlQGhQBYfcyKPKH-$2 z3Xqr};=z0qkKzsuvJo|Sa-A{Nx!&Vt<70fKV3<$gP2-1{uTO?=d*cY##oO3ixBi{8 zHf%0X8Y$=C&#KDCB+b#S18)F$tY6)S1~-PqoZ=Fy%t2xi#(d(N1G2_VAXC?t2n=0{=_YkLiOR2;O zb>^Q_>sX5)4lAu1HB2_N_#^&Rf+>z|Qz9h)1FTZ%hL+%vdvsbQ z&K#Xn!xRVf3MwrTQ88KUCCh2t;#UOKRn~mLHq}Z?^r~5C>NXLz5HXwDLUL4H zIr7T1D#0itec@2y%C|yp(H?Z7F#-t2SzS4)k&)006|Oa|M$~bupB`6 zNCLYW;U##vSbQRav6O17$Q_2W|ED2<_9EKih5IWlxh=UoWvp{YQ>VpWfj>0Eig=Ie zW)rql_Ngv4^i$yswkzN`scM2N*s1FmK!Q6u!WsAo>)Y-iTxx42K8VFlm)NZxF!|t*a!*W$eVF^cFZ`l&DU3k7gKXSLJHZ zi*bl9wZzSi7yL;X_&B^E9Uz9}xG^!Ja3hIc1P3_=+0nzs!?!)c_?Hfrcpcl1b_gTe zMHmq;F2w5TvSL8_)?S?W&=rN}*`63qVCP;Pfyb6o-;~SsK#U4L7=aH8&z$6g>EWOg zF(O;jcwmf7J#a*s!$~fJK9t!|Dc0+ECPejZl=1{CX)X)=a=xnQR4Q=dhY;5QZwYRn25W2ppp$(Mn-La1D zSg5%x)Z7*-TUxwu-n@Af=fB5yOY^4A_Uq7TLu;s`p|i1lOJ`T8qrSPbVe^+_&8-bx zvG&e}F3}a&FJ>RkEAfc=Xf+lu=3<()2(Q5z^TUYfP53A#N8IldU(R6dnu)Dx>8uEr zsE)9;OFfA;ux=iR@EuG63N%yAhYaO`@Fl~qW$V_(B2&deavG#)R$$tn-!NmvvV9Eb z;(*fs0?Na91w}P9DsK&k6JO!2GN)f1bV{Cs^Lf}Qk2>Y3Q=W86e!4`uJf{pfWsy^s zI%TC();VQ|Q}#LK4yPPMX?qIN$s8UW=7$G{V;>jpOm{mhYMFiGIIiHM?mW0>zVbl# zC5hhR05{*(5-4I>j^-aR`_-Lbr<>c=yIE!}LkL~3b`aPFz*?4Vbs;~vCfENApnCvm z{&(>;K>%vojVGM-p_4rE_G=HGOL1CYK3f|4KE2aO! z>8QHxp$(zz4lO>!=!Aa$9fdahYPQnf#p=G(0ZsYe!v+pla0m^x!JHv}H!#o#!ic>$ zKX@(fdGz9QJhj3K+Ofs@5u?Bx&?SvC%oxgLiab)|imI*2B`yGMM-g0vHJ z2OJG^h*N_#dsN;T%DLbFi-YVFowDwbN{2_?zh&Fyz4>mxu0Qj zs@(B4J|WM}ftm`!iGAuIScembgblgg>0<7qWyCJJnG( zz&t6{7Cjw7!=rd+gsF=p_NaAe1s9>$BYnZ-E2P8li;J~J@o$XJKEyy(sc%Er;mI>j zt8wzK`JWg&7J-OG5lGfFlAO&1wHGr=oHL*zCu07V-)WREgDK@4tTU`T`p~ms&D?D* zzSo-hBTn@uubu_^96KRVD<0mpw7mQBR%&I9KZE!1Cwar2v)zoLt^~C zSWrNpM{m|O4-DGxEf;P3}r)a>RTL4v274ctHwECX=*D6rz_CbX;Uay7i&4( z^%dJBuvI0oN*Y8LN*pec*M8e~g#`Gfm?lC@>^(xs&w}+1$3>I(w4k zM7%=7_Xf|0H}$GbORe7TXGd`IiY~VAB3quKZ9g<^RGC4ne27C<8}X&Duvte$ZC??y#Q<2>ycO? 
zr*(+8-Tdx^y3RrNpk1QHU&VV&95^a>97ft6$D%>cQw9#;3ExS~*399E`E~?*66AyQ zO85mrNFVNmv5;NzK52qJwclnk!X-Dk4KIALj?>i;jd?f%W^{}^`01V_Xowdoc?eM7 zr^{LWJLwi-^PMG$#lJ>WgQLf% z8ksJ?!3aNyM<4%Kw;4C!G>FFvQ318#4bV=ti52z=$H!JYBBALGPc!Nae9u(j$Fp)VZbM}bk5p{c< zI%Up2vBWBK?61#t=DtN}1)4`dBVqbm>0^Xo7j67djG{VQ|01;5gc_qdTN*zZn~2+DbQ}-j9lM zmt(gw_%*(XKqP_~U?RR}Pr=N`CXN+lcY*b6k(Or zj6#ht4*?q4HIrl$&JyZe%#sjkI1NT^WP7ICjYRH^7QQ)6tY7Hoz?_+r$o(lAfW19+ z#RMt^FDx*R<6)ODjsMq<@ny@{POt2>cY7Eg2Bi6at@#gAR$3{10AYBXf9y^e3?Q^yqu=UY}~*#rOXEgaqODA-j_QI2xI$2?;O52%L{AYQv2d_^ zdj${y-Hje$G^FHg$QUpP@xz-A*{XBYOq{C6)Nn(^upry+!~y#sAJKW~#dNw+)UzxS z&-Vx=!6V<*6WKQ+utD+EAhk~t6xwv{yl|SilZg+d*EPPJir9+0LR`zSllJZ{l9a96kh1k zp!O2Mz(&VUo+CaHDaA%*80Vdn@(g|3PO<8Pybs*P!VhtHkL1O!OP+>RBi32Kd7`Ojg) zs?B*YkIha!q}bm_xbK-?$D#ev;^SlQ4jk`(hwluL@N(B|K3$A0Enb5ra=*t8!{%aD zn}cs-HmiBd%{TPK+$<3H6pxz&_dyZHqF$x`L6!>iHrnl(5lRrcaV%5rlQ zY8~tD&9XOcmE%qxh8!-;?I||9*n{(*g4qB3%HU=JU%(+f$E$%~bbcE*^5+l8$8buc z3@nENGO8#luh!;`JzZP2M6YdVYv`=UIV0}ySihwwDxFH&wEXRAZo9U%A(}dsOxM_V z*uwA#(${|eH%IxGjeA-wv*ZNg#TQrn!=Z@y<2`Xcvs@OMUdKP)^K$Rql6*7&Y)bpr zMbZOD0 z#YLADF9;P@HEa%r>ti(c!keW3+CM( z2nP$|S>?gdieN!`Fi#8S_)1WvJ9H~pmId?f2pGYFMAlWo(DrOSSa|zcdax)ytvOhz z1w&=Q0#KBprO&q(e|`Al^&=V)HQ!>}JF<*mD3NUh3%9fF?bGyNNql;Fu+Ja(NpPFL zBDlk!)#eZ0?XL|E`fm!BXu%>aSXd@==cIa(jB#0~tR;^j*dN%Dww54g6&) z=HZ7Qd+OU7kt zK;2ELE{>qPqiA!VKky@f2JeV>W)4`ab!cX7G z60Yj|Dua&jRm|r}zN*>2S{JxC*t9Izu{PMG1?#R4HkAcC%7Q&Ln1lxA;Bw5t>R{cv zV3QH-SRU+w_94?Y|61THf^{3F^x-8l0@{=n#*8h%mrs&=`3$&<)AJhiuoOL9g&rz{ zO%=h8aIj}7z$$=c05=8es)J48V8>OqIOV`M0*?gi0?VdUOT`R5*tA?USPn)N!8+ej zs3u=5`99xX>MI?SGUl`slR5$?f;}71dJDQ+itfr0m*9I4X*Wu`TSuVJmK5#@X?DQu zcSCuDkep+v5?ybDxhd=2aPUEt9TmZz3e-^9Zp~V|25nZNW~Ef)w1MkRVg1F3Q?EK> zI?Xa2r$n4h$0;$UtAc&mS=0T&ZQ0eq9SCA~<8Ls#Jh&U>ec8)UTZ6hv)LkXjg`GM$ zLG}vNEqCfF&~B+ySBbinsKc}uJXuGctH5(P>fls(zB^k(e__;Nf}x*2^s@~#6YI!x z1?t@XTv-gKziNjk98$;$lYrLl4>KP3!}vRbaUF#`&*Oen@Qa`?j;&G}n>%iJnrEFW zhdgdqet4o7w>!40AV;|)e+BBQoUu{Q%N!XwHr7!V_zUDq*U<*-Z-e7<40Od%I+T%YGB;%D}s^5Ds=ths?HXWa4`S+!YH3a8Bo 
zToXKnf&{U!Spjs%0BJO?gp-Y~R}= zf6FAu@;OOGMn#s$bTdFaW{IPZmpUO0SK7MWHURe{KT$KIN2`67!K5@n~sUpjF{v?b*mhY33@M0i- z-=|af7gBemzd(9Q-)2elX_WMC1}&q+XH5j0r!3!W4?HvpFBQmSK8C<2>)|SapD%K5 zaqIl76y+?w@}6*Uzw3jXAyKA()`{C$zM?cF?lw4hX$HQ@!7DTHCI_#}z_&Q~jtsoZ z!8kLLx_^IGi9fQ{mhpVmye5K5-Px&JvYz}=*MiM<Xtc4rZBsNlZ^8UaFPkqv z&d1}t$dV%d6!S+sF5ul|{8`AKFY~8_Ka2QNe5v@g+lWTRqE)eIRxFwni`K=WWwB^l zEZSb`bt>1?P`|mMGrGB9Q~Q<SikAIXlwn(hSpSr481kBx0CumBSM;Uto_W)WA$xK z(T=Vt4k96YeQWf~CDHn=J(oe5?m1=V__j7|ic%Zhv4-e;4CWsuq*hA)aq2DgUDx>< zn_F9>^_`vdH$)di8`@%>%?({X7^1Nuwy8;+i9+|$h_|#Ztn#o5K1E##S)ym6>VW4`oJl9j zt}4!C>Py)&S{_+hrbVJFmoBX~YN9n-S;UB@#^2W5)*MScZEfh>*xuC~yCH=%)wgYi z&Kf#9+dF-dKhM9kXa1#8ExNI_ebaR*>N5$0QAR0HR1Ex#ew9**F+&q z@xtiFM*QHjO02WKt*a6AQ8tQVma&XQix+NejyV%(!fC#%;WUdW5B=eM z4Q*R}2qv-iRuqUmoxaBFJDX$PUMBp|nUGV6n(D{?+Bte10~o`zyCc9dfner5R;AI~8wGzG%g1LJ8zaF@3 zmqil2y}z*aBjY>8XeZ+bJ@8+7;D7PJ&z1Be_gf+1SD>vsuDc|Bv4sEJ1OJYshp$NV zIaE9@7GK4m%fCg!uaxi?B)n9@Kag-)|2H5n;>Bv&55MDqzb4_bJRftmD4Be!C0wTe zBwys=BE5_+lW z?tyRcz`rNqa@_d6gqNa!cf8ro){Enk?Rckz%j0@Z!e#yuXRAg&vj6;(gv)%c%LO5> zOnZIa*}@T*{kF^lzgNO#y*=fDe~SMY7FTA!WfCs)`KE-+@+T!+13y=vXFFRs@|Wjt zp$8uJ!0RPk*28Tc_@D>=kc7)}z9iu?|A4cZqWm)bg&ugB2mYdmejWl}UYYuN*@gu! 
z$F~|OzRCJ;m+&QM>+1hMBwVI<*O4!R){y9{oj6H+sf5=#IEzn6_+K47(|?Mwzve|; z_J5uO@FFhz)xS!(Ea!0vm+ewa2Dn&Ujz4$2b0v9Xy~QNFT%s2Y96qvrR{`Ti{&K(T z1v(jT_P}qE@Jn#-%K2>%{9h%!NTOdP*|Ai@ub1!&2~SA)RT4fZ;mag^p9lV;gv;`4 zl0Id7-Q|J5?16t-vaif1D&ca!JTKx!yYP(K)#o=QT$AuSJ#d~Q@gg6Yo;fNnip$ds zm;Y9RxHA3WHXELV-{FCO*8|_}f&V`b{IG<}arH$Hyj-%QtmivD@FzU*Tn{_C@r3%3 z`S*Iz|JVc1lkBn_yj=ZPNx00P=Y+g6`+e2}pFzSY=nW4%?t$<3z@L?H4gI+Ck9y!I zJn;8D@Ux`+Q09N02X08XJdbutxUA@uLpj}1Ap8Df6fDcOTuM2|4mrSo{yiB z^f_Ph&y^CsO2R88{8Nsba@@RJ_+$E1CM%;$?9c)16@ z*#p1L1AkJ&<#BaN`wKZActXOl{l@_$+W3J<)|1CM*)gC6)#J@6Mi@G%ek`g1TC zTnrOo{JDPkriAl5KNtVHv+tmP&Xn*QBz%^Hn-cyR3IAsapDp1(knoU%KOo_smGJ4# zeuVN|DB+h$_~#`2D-y0sc%_7YUc%Q(_(c-F%gMvp?-wNe0SPaZ@P{N^?)OOvm-}_| zeDV)Td`g^s0dZN+nuO1h=xZcg=HDRUj2*6h+ax?B;p?1NR_ym;3Gb3{d0aP1xXk}w zC0v&0mGi(9SEe6kecZ;$V-@~fIrANy^30X+nGQ}qa-6IoAui(O_;dOEnjkLXEAZ#y zA>8v~v$MpV{lhuJi?|cg#B;y+wQ(o32xL^koe(0Bzeu<kj!lQ&8aIqZ`~2 zccy~C3moByJFY75HzeF~F@eu@gdn|R3W1-La7Tp#_s#3NVN0xjBT5`2i?YeRi{VJy zH*a%&tll?oV^^1NUS~sV{XC-cJWi<3@B-}bH$KCwtH0m)46mkaiKl!wB=L1${m8G0 z`;cA;Np+LnkVti&FD@h+&sPcaI@_szFKuW1BLG^kcplzm{uf%|?dIP#Zbo~*neDm7yigO;m*#DAv?dHhQ&=hU# ztl!dLd-Z?&m$@0IsvHBzw3I(nT+Xyk)qECr#GG_2g_ox8Cf;PW=bFPQ`(JHejJD+I zB|l@ld^yWP3S!J<%q&eI_RWz>yDW0>3sI8JjeP|U-iC6j{yUs{#$x%(?7tb6Q}ut^ z=|Ai=X8#;xX8%6|HdX(_zDHkbReU8D{Kl{$?{|~^X>c7J= zDA#cL${c?d8c)@J&}r|!&vJQWw*LqfQ^`NfMHAOu%KZV*d*M zGW);UqyN4O*vk3V-M$sSnfkBw=znyb-RQK`KiBEZ{%`T1Q`(d}CQ@Z?IKEc4$ z{(sVG?|#SRvdPr{*=RqN{hOk8ivlMwa{T-{%I)8cU!@-XhraBzcLlW{Cw)tGg-8E= z&I0B3!~W?~nf>#7J$8X&!c|&gKXQGLuaNwJ`vH&k9Zq{WH)PXH{q1tvJ8650`wpjl zp2zrIK0kEY7o}R-H-n4pa$W}RvUAJ(o%R`IzGs~FnYi1|#rd5gF%%Q7qfYxeIVlH) ztACgHVKkh|f9`hdm*aGJAsU3F_O4y0>YKcamhczz*V#nSov91Af44ow%fesgRk+MX dzUSosZoMOp`zUh%soD=V+9by_6qDKh{{uStgTVj* literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel_emu.o b/third_party/libxsmm/obj/intel64/generator_gemm_amx_microkernel_emu.o new file mode 100644 index 
0000000000000000000000000000000000000000..2aa07abbb2e87447578dfe88b9371bcde1ac0528 GIT binary patch literal 21344 zcmbt*4|r77nfIMcDpBIxRA=3&xQzCyNyV68)I{huH{1&ooZtZYCnA#&CMKAFO$H1W zC3I$#yO&YEZu*Ipt?W~`?e^(Y*nSj+#WVpGao392t)N{${Li2$*p?{PWPk6u=gyoA zbFJ;(hs-_qJ?Fgdd*1)&+{-#WFgM5LlI+VR{k@cavZ0c+wKm5VQwNY)T zF@CGot93QTpXxQ^nZ^F-2LqZ}-DKVuG`A=t7;MIr?YQ}ke8q1>lp=GZ@+6-1_=si% zl)@V0l)1XjSgPckOWF-h$pdhju~wOlu|7F|0Q_j?cQ(=tvmp}w%w>Le6MN|Hl!r8> zTsMCZ!?T8|1xB6X*YH!N0iUKBor9N|hczK_tS_wTkAFcSe*jJ(g8?V|BQSOXf@cB*)Kq*hr9z zy=XOQ(&K2Z#OU|bUAY*$`xf6h&1yqEKw#64zjD>QQ8#tJkz~mH(WiY_X<>X~;HH(5 zHOWY-4UGXXRtP+19V8=f3wlT2%vnSeEtKYssp*;tji|e)gPe9vol-zzZPzPbmA}&n zjZ9stC>T&!B@aD2`u_BDUx!S_Nv0Kg{53iDL(Ho3MU>LyObl)-k*AZ2NQ28P4O;5_ zzm_6@PMutr_jQ6pp9(dD9wQ2cE+vI_l0qRvD9S6x)$xgm0F!$_nXQ^@3Sc!p(y1D~ zZ>dX;&zi~cF-|OAeqKr?B~W?*;wpj;+9cJyyDoC|EHb{>@6bcy zg)TcumtD|h9&{0a{EAewuMRWrlLFhq_d~LtUnZ_Sy2;uPGJee`Gq-@K^X$9-hix19Jt}K-0Cxm~Ol<`ss8?BcP6+ zn7;lENm@4%{ir~9y^vV{f4&;ux<1+a;XNTpqFH_EbI7WbB)3#AI{^VTC4Z9xNlzyO zEYK49Q`P9Fvi!iZDUzh_E+9&WjKga2^Xk~gYINL{m~mdB)}_f>a;2n~9Z;WKhatNQ zF3R<-bgh)?8~r3R4GS(;T`+)ccayZ$(RXvy1`yCx^Co6Z5NI#fTMCS;rb1w0n0XW= zb$R&ipehfKqeG5Oag*@M72SMb14N*i3%o!8tpA~ozLA4qA;&-b3JLCmys`tk`>vXD zvcA5N=q*%t5nn4YMN>4}eoMhyReVB@eSk^aw{p;AUbw2)I9zrJBm89L)5ao7AYghG zJ&}JY;n&MX7)o8>DChEH}8hgFgI+#s@tvA$-|97_!zktc;AcxMsm3A zs`yFU^vD@OwgQXXft7)oQb~?|1I^R>04qm>^qbITO+-I|I&MN_*hSh%%$X=VRQ&1K zDJ|i>Sx@v83^Q*uUZQ13$EQPfjY0Lv2@ECg3~KVc)5AR%-G5X!PNjLngOLPhM=@;p zUV$&wIKq^vzLcazE`?HEn(-pm(g4-W3#!2Y8MCE!1dIE;Zm>T3UJixrwJ?ZAXgNr{ za7B}n$#R2`1+fOAZ)Xx+=}KABN*5y!1S=)DS9?9EhJxy14bHU|vTyK0p{3P{KtTa4 zU?CwEDMk0WW+6n$(X0P7F+sX7)M02Kye>%AVibN(U<48M-$|IHalQCub?o=mi5VAY zi7pqJc*-DiAanxAv3}@79bSVe$xfh znD;ZoDYb;ByAG(teI5AVPo zk^hP9xEsgS24MafnuP8rxCe}JhVgf3GmMAP0LF(NXPxhX*jBXCW(MW=GCO0;fbAxc zZ88V46gGk_&RUl}e;r&XWf-Xtqf(4PZpBQ^yv>VX@R~aMR*t%u$=MDs%+!pemcmVT zI7w<)e}>eUD2P)M1xbxWK|*7K7(s&sK|-?yL2_e)Ai`_#v`d77yTj#xcwprWk44jTHf7 z<7XxiQjEheH$NtC22Bcn%$$s4B<4i%t4Buz_zp-qZ>BVgt#EIZL0 
zOt1oFBLCu)%@<&?Wg~FBQczh-k<(K067_ZqGsQ|FT26|UYLj|fvI$(aF~3tA|B_fA z_hMoI5xoeeWXBdBzRX(+D89i})dq?R8Y|IM*Q+q9Abb_<>aIhiLn~>K4RN6z_;g}| z<%W*oBx#^hs(>^w{ImOxjYlNMU?ya|dRJhCjHNZ+Y6KWS{(up!ZTnKbs!=yA7d z98^b-=BmX9wc=-@&lDsd`h?~TLN}J5%fq(@VE`blHnPMAqO^`Qr@rJR;F0n=!X3k| zB_bDoq>jDI@V^YF0sfcu;@9;=LqYQ2Fqq4iqNQq#2h4mV-p9z8fL1qE^0j?n3yJqN zIsQwV6{c;j5|!s)c<$uP%Zejt0BqXApU;JljY5MTOcqh9b{&?tReRlt@l02)5jYO7 z7>F&>fEr!b>jHmr>{X~1US9}XH~wpEVj_7P0GmiHl)?+3ly=1RODL`<7ogv^AFDX{ zb1WzQTu)ZqKY`+s*&l5e@^vaBUiu?tlTK7+5oDAn3m}oQ6Xe;&Csku?^!RiQwN|0# zIvy}*C@-BsI<9y>vdK6`{GosVe}&bFY3B*)k4HJC%*9B3%XsQT0yz$6suzqB7&=kl zGe07+pqii!!f(Mu^cm!_{jOP^$b_*OU{Z}v$g3Yv6QPsL5XRnu2^g=8zL#74O7UmW zeFe!iY^gEKrA$*(PzmX7Lh*&A&Zf*h8?~bs6slaRJT#(Rfi0G{0b3$DcI@Mc2}^sl z*XhGv=XWS4nYu{{KD}~DP>%1#C=}sMD8ipWQ!`%25*h2sALH5bYpY2AE}kLJRFVGv z25$FNr=P+4eBe|~+&aOE)JoW9LPzwn^``}7ePkCzOHr04@@Ii~ z^iz~d^9~sKifSCtj3bO04JOZ~8b@^V{-c@R?8}WKj4X)0Ef7_WQ6pDjoUL?O$S7Jt ztc8o&_(fQ26SEnVV)uGs5~W%qP+0(;s6b}*MUYacKrZ<@uP@ZZ-5*)zU`2#<#CcXr zEGtOvUIK?(k%YF|vMRyAa(RZ`S@3CsX~C>HZQO+|Mn(;{bqXi>m=Va64R&re^#))%7hri*|Ui zWrx1hfIMcXDS2EBF{b+RnM^JJ;rLHr1+a+p{NJ%9vH9c&pJnzRPxsSib$@36P5`V^ z3ixBP7*CKHuGD!;jqyi1a{15t#eU;6q~ap8yG}LlYl49_BfhqiBj>BT;4@Nu!br;6 zf*^QpcOgj8A&D2=d69fFm*{k%Zz8MMn24y<8C;%u|N9WXH|m}@WgQ- znw^7Td-d2MIo5z)Y#q_@9|#<}IlK{cXmegz*{S5ou{ebjwCpnuzw!SmJWk`M6Dajk zCl0hzoM9bMU=)mZp@H2d)1Yd055Nk2IE0CP4HU69ue!@eq^?0Gpzw=bGHnd0Bul=6 zA@IlrNE}og--xwY>5}j^Vw-5qUx%^swr7yqc;c#H#8|eWNTM5`YR0F-32c<$qC*&s z`4&Ia%>p<))T~7oAYAChj;V1G^j~4+u@9rq)GZD@Th+ z!blGkNx*hP5Gi8XN{O64aW9Vm(2tgwJrk2H$8SV^B5y-@#wLt5UllfM;FNMVErRww zu|L3{t(4MWB|u9Cz(YkLq_J4Vbg#Va1=?Aa9k5b(S`x`qEFeh6$I&-G8$9D$)qpk9 zlG*VSA)^&pXGTFPr-~2+VO9_^>ImJhxek(mZOQt;SW!BX?1R%<{!=wLp0cX=Psz(n z?9s9PCBs#t6|U9i1R zRY>u8S{uv97(xP{%LR^gkjyPn-^}D}2@BiTr7$?AWK=`{o!WU=VzPp`@d=ED#ExqF zoTHe?mTcJv!$oqtagwF1TtgK>h3ZN|W>5m{y~~lt*(MQe*g`91b`q~a64cBUN}gUB zu9M@xfMVdY6g7T~Cafoq%Ox#e9;r1FAsg9?9Wooq{g9EMqHS8T9$o44Ip#XMKoQ~Y 
zW2}Y>o_cWWTZrjOz<8q}V0>sZ=u2J+4uW$-TFv4*QideW?dkJ%_uTEfJM8OU)!WbYe%mXwrGM)}|2?doXl>$wX=!=1j~a9>+bS6{!cx22;myyDtOM`yS{($g32 zXQU{;s`WL-3lv;`Qy-x7?DZdPhBYEQ`Y%&Wm*3o?9K^FO@9e$KkC2yVJWr?Ey(7nS z1Bge>nmG%h&L_u%aPYxTbL3bP8aUXSK?`YO!&^Mc#Sub$TLD!OnrO8C;y}PwY{MHO&}OA{eQ;7w1uXz-s6F;YrN| zSPOtTC)bE6F|^!s^3bBd<(`v|x-~~0+K0l^mjmcVKEANDl_>OEq#Fyi5TPJuv z#78nxIera>#c-PYHQW&_jlIJVwzVXAr+%5SH~OYLcm_(i0s{umOp)U?l$&Y`jbE>S zSMs5L9(U;%zoxE#OOjmD;5!9!>@aAk>-R15<@T!4rzlRr*T!%gIflgeb(|9cVU_&7 z{Z%nA@G9t$RA#UfO43unyb{0C?mO`_45|{>ca52E)pvd?qOlkkoy9^NGYHj`=|nY; zhqmEOh;vK_$>%_X)ho6c(URiZ5if0kdU`*&`~e`kdB;H6X>8l7a5N9C&PZ-x1j{Ou;Y18DMA%zgN$vrV;6CtE@ zM6ViV=<-*;_Xn$MheC3|7|SYD`K$3<2!f`^6f!;4c#15T02>vuK@9n&*SbHY;0|PQ zIv2$~q2W%oMG?jI2cM!Ryip>9RHGldv{^e?+n?B>3<4)=l>M;RW9WAI6B{iik)RZH z>>&D{4J5WG&@@bR$igvtY&!XGg?{;%-}uO%u$;~hSFQ>eC(2&)Cx)1N1%QmH7jGl* z&gyeNJi{LV)hb}~Bfa2!i)wPTaOAEj1KI`@>;ZVjcCbaS1uE!hqKaNx6q?^7Ud{X; zsAle?x|#n7rY^;#_V^(={t>!#bDE-KulvyVp?W$43!1C2?rD$Ee*B?-0ZH0+m0G1v z{$r}?{ef=ge!rd!s7WUk!%;Oe_b~z#GHv@?A%*?1AzD5DNw+P-`9>4Y;-8=e;54-o z2Pg7Fi=guNm+*Q#VBUa{(ryaxlWo~!;WV$Me#?~am0YW*1}q4Q1+Xm`GbL?n%KE1% zo~E-EUVdP?OO;M~Q5T;A^0e>=siB!%Nn4sG$#4C4%{8{MBn$5PfO`gzBP=E?yTsM>FM}wb$V-Wv9H6-qwQK$B9H#_bc}9~L!0ktBTMs`n25%*!A$x51w?6OR z>^F|19(fSS>s?659NM@E1Ohm2T4e5tQEYCUYi`^^7Q?trF_}C72Dny|o7kGGzyxfV za5v*r3z+?_uxp^dKZTy5R2OOH#*Kp(Awp|eXw#BpgvBcC;n9bHz8k?>N{94h5e|Li z=~c89K%H4zz_bC$LpA!X?(rKNtG3uqJ=am>UEpE3Ti@o4n*R3lHSF>N-Rb z9z^5^he03V;}ejFTq(kW%yiZGgcj9{*Q*SFz<85hN_s6}kV7+F!Rj7K&ZUCN*8gml zD3XWR=^W}Y5mWW>$1# zvSRBb`c^s#sZES}@)agKwoW)jAv~M>19_G0?_FKeWc$P( zg(jv~Rd!l^l7vq^n{eNS;nHr>3R|@LWG(^ehel9{+f5I{^utHMfhX8AjUWSJsb??m z#S1`b7rg_g-Vxl{W^tGx*+vv#e#Quq#jTij7K@C5TYOWRiG(kc0qWRg_mX>JJ2Rzu zn0^=oP`H;SR-sEZPU0=v#|*BU^=s)A>W%_`>lS4Yg#E1>m96Z)Ly6&TBJW`moWB=0 z%2xD9qaR0aDtBUsgcq$;z8wU(?2tRA9L6|=0D zWdkPA?zhmwk)AsWqJx~+PZ+VAuQ&qzZ`+60y$^5zja8V?R%Pf5yt`Q*x1qB zL^-Ao5piV7Cd?~41kU-4Hu>f&4!YXp&R6B)ElLoB0zmmQM!b84(j-<5-JMF|TzT7; zA>y%3#muB@ZerSFtoarp`9LoInbM5jCC-^oRRw+qdz^oreN?IQT{|vp= 
z%`BfruugIau@hKQZh#?Q-U-M;catX&$r0RXw%`Jj`U@bHZ2bUBDb}{)L12xUGa>g` zlTiQ)z0BgNx0IuvWc=33sk_SRPB}m=a{yg%fISKk#+N-uF_bJcMvs;Q*oM#S?Nr)U znVs($#01#bh?bbPp4%ts!LsaA+{RR6FE>gW3MIgoHh@CeA)@sYv^4C?h&GIN1^Ewi zJv_ii)WmF)8Q7ahseDwbk_A&_z%Rn~daROG9XnbB!Z^^}vGw(`n_yZ-cLEuQ|NzjK4kgf*NN{4ZtNO_27ft-eN8$f! zB-9e7W|%jN;-Ud(#AwTgcG)8=6W*I?;;eoo6W&W?wt`$XBlU6eDI?{`>6zwPM!fgS zMDVl$WB&ni0oc=4+^OIsD@t4N#$NWgk%jIo7F;dZCG`FsZo=6um<4sqf|He0X&@(4 zavq^A>{FDGV<_xW|uwwUT(JJc~b90uk!*obu zw{(gfhZtEbTVx`o&4}2Q3OWw@Ooc2ue!3a6J z8{4K-45Vj#E=H-^DnxX|$mb|kKiWpkeJVZUaB zDOwlKgMWqH;{=@w*BZ}Qr^57!bCG(w>f+qU`Pg!{Q!(6&J>q`oF^_#~rk>s_%duYp z2hY>d_g$KCuCO+ajMznOma7=rQUS9xl-*@RnLdLlaok!I|02y60ovzM~vkZ|fA3U{rt#`kx0-`N=srB2CCdy8_LRkMT^2KeEyb*1;4hewAjMqwQiE@o<)*tpulx${*=5;kdK5d>onfC z@~dc6dkP-PsrD2;=+-^HL~gaGXyaEjPwD;VXr7Ab6wR~U%m;@L@*?QBkZ@zr2tDPv^2YZ>~M01v5x=;UZ< z)B-?aB%K5Zcqf3j0!W5nE2YIx`+YWN7V2iNi3|kCLl`7x|K03O=^^deO&F}#|PG;3M?YW~!30bb)N z^-uPs;vR)jb1#;K4d1`rvaZ z$qbk4BK)Lf2aiwWre(+C$2`wLxBL6Kp2O~1&rx?L?ohBhXKmKyXj5~VF+#)0YR@tE zLeEKe1C4HD(=4D0U?Txr0fbuv?=d&LlZ}i3HXpDCHUWd)Mk4~i9k|0Sq4Nk>o<@Z| zu0o23jDJ7HzYPAF?ltH>?rGONy{aX%M#yX)WES=`*ElSqHd_c}1hf1Q3%SxM{`Jb_ z0dA_=vRrNn`=2bAn}U0m%MA+IF1JJl;N84vcH~k>A z+!A{=EkGoR-j=u|{5KGOU1K+JcT4%gQ@ZIE#;2h)KF#N3I3>2TFSm5TSwM|J&R$my zgP$ukWz1N>;Td4|=#~o30xt&Sf}FJcS2448_J^=9foLGUn<_#rs|Ritw$B40SZ zx2KnBDC>qPG%s*#wHgTBI(P13y)IOzRtNM@ip}nh?v6<636z%i^mjzoq#*4r-7ByM z3-|T)I2)44?@ZBDGK8FMrw3~dcdwRUm64uKsjZ{4Gt|=8*Rm#59tw9y`Z~h>=vx8$ zQrq2q9g%Q~vrKi+HZf%ww1@2N3%7+Mt?ibbcXWpWGo=+_G97%t)gQWk=D_tcLuzPw zXHV;05`A`7N3gORL(ALngD=lQ_~li%|m>ri!vRRg?Puy zt|0;bw*vfS2l$V83?w{-!uZR13Z>h{0^E-O1iwUpPYB~L72uci+K;+vA8hmar~toA zfbVjEpT|>BCO;(tT*MO*;3EGw(ZN5iOgvozT*Q;ZOBEXbclfjUR|L2?zS;pkD8NO2 zekj02Je9n}%cQ$hfQ#e5?EwE52lzVze7dl%t9fcAx}qLy1-PikbpqTc;Q6fp7uS0U z{?jvEgkPjPD8MgApDpL}dFrF_qP;B^;3EDJ0qzs<^qh|eT!cp)|8-s(6I`_aTO8m| z2yjvUzjc5Y@sc@{?m_`B;(1tri+s9xaYJ~nz@IIj%LTX?2dW(4%?|K20$h~C!w&FY zIKW>P;3A(N32+hrg?#IfxvuLS;ENsLCj_~PboWA5bP;}0K1l|(;9{I?5aOh0Z)*g& 
zsIR94xTr60jzu6-@67_-hat8ew+nEQpSL;u>lk~bF#favze<3gk9v_V>L!!4@l!oY zS0+C{6yQbZx5vLDz{Pd>2>~v;iF&+WfYTn<#&fBF=V}3dwJ`n)0p5-_UBo}B+Q#!o z0^*`+Bnunv5#&#MTN_T_sL@5^XW-9<&lBWEUkciA`Xrhzy3wxHhPQAy;VBW|?E)TA zFSK8xi+B|EvVp_tMxn^Y|6L9z{Gxo03viMD!!!^V-LAu*jpst1H;JyeE``Gh9sMsT zd;AOz&s^8_0-h^{@wxyP*HzEqnd=I1c;>o37vQ3ue^bcUr2^gi1h_c+l72B%16#;XwJej!vP*}fUk6b|APbkrw;J_4)Avz;9n8ygc(?h z?Jt)*z+VvZzBoSV0G}%4DRKNY4)9tBc$WkG9~|I6b$~zb0Dsc~{$nAJi~Nr|z~69y zyM#P0;iqVW$P_Q+qQonhf|#AA)G-Jp5F=1Lp6gp6N9)2&O-=$_6hkSpF#Lf zN&lLzNXv4mgm&rd-fp)d*e6RRD_SBgQpxiEeyOA{+}Tn>(2}fYRziQ)(~6S59zs@P zpGIZbc4xdRCZ@_t%0T(An^__9{N>4h$ysNe5!h@B$A9d+k8>}8=~S=;#}!>T2a!s; zdm`Zyy!F18o$GPx;r38lUrSe5RMubLradKYSwkrC`a`7@Q)pI#o1mk9xf>)(l zlN*`yuSPee@V|C`8RimBC;sjC4H&{M9{%h{mRrd+s!|ZEQS=Rd=l((NN#l6A!D?+9 zu@HaR=x^uo%RZl>v2+MM8` zJ&<_$#0RujILzo{uK!6se-rl~d%P__`;XtBs-#~@`ZS|6bN%Gz8AI5-PdNWS%$ShzDE$v@N4+Wl literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_avx2_microkernel.o b/third_party/libxsmm/obj/intel64/generator_gemm_avx2_microkernel.o new file mode 100644 index 0000000000000000000000000000000000000000..2fa5ae1c29c57c57b2f143152dd862d01838570c GIT binary patch literal 4856 zcmbtXZEPE79e>VFSGURTOcjPv2l2E+xM(v;$kt@-P#)(lajBEKd2LHm<0kH!EO9*N z99-7Y3dikEhiQkVX`e=s_`tVP3877(sA7HPZPL)tCQXbFKnWom6vQ%=HP-791m+BZeVJ9?9F(!VRGoL5({3iR`xzfoOUKH5l#lS? 
zk)deeZ=H%UDQ9@0?2*oV5lo7a^+okJ75!4%8~nrD#W$qlb?8TRO%CW+75$f7x^OG1 z$K=qfeiJ_~`=q54aYa{T|F&C3jzJbI+!3V3A2u+?3m=!I#Xox)gIrein^EJS98lX< zW7syNfVFv-{+^=$nj6qpjnO84Tag1EY4N*Y$6nVWrG^lmKR0&!xG@@-W@_7wX@S3T z67p#VKMGfbv3S%tE_eO~5Q`Qrw<=w$(&FBEPX#A5KHG*$N51%&JXd#GA)o=6w{(Big1SvTAnVK%TePqBs2L6 zJA{SJG2=-BxFI=o=Rvn}!QNkAQH&W`j_DsL27Kp*yA9G}nc}`fll{_}pJHl^G{ei% ziL%R6UbrdnC9-;&4M|J6_~|QSuSKs`{NF5Me8mX#XL#>%)ZyRQ*=rwFX;NyRz~dtNzJX{UhwrAQkr@>dTvO zI}(;!7Eu^f8pq|(E`619fJmdUt%f&LGgZiu4IDzeWz@u$Itd999R)Ofm z>rM)|G2}C;m!JK(T$Wn8849KuvA1@G{bUtP+6&9ZP@d>jtJAn4qIWB1kpQRIaoMlH z@n(($q35KQ_ z`Z~tNtLaI2s-S+bXO4mnH@gOB^ z-(4ik$rtP?-ZpLPrKDU#xUO5q(4N*IOMxJJaOha(m07%6n)P#rtzujv` ze!y>xz&8~B#0|6dt)M@Nh4y)fJ>Nwf9;P@f|KneDprU|c@FxH@V-MAP6E%Uc#SVI< zY^i2Zx)8!tQ<&iU@F5eVVmDk|&LFjLX8?@`rV$*MsD5)D8M!=;%^-e#`F>bKrE89v zsl)mC+1bQXsdOrv)aJ67;WfFCZ%@olO=RbuPG!@n8OHj0dY%Y|`UXa}1v|r!g>0j>va*{3#Q5)u7p zJ-uS!JKl&GEHy;L(6={mvGw$QTOI z52TMZuv^sva;@{O`^$_mFQf19e-1As>kThPte-(GZz+oN#gTGe?XZL9A z@B~XFj^xr4iA1=geS61t_TQce&boZ(4Z88{-SOL|hL3fD5O;7!KZI)rf)F5JC-sSbY2g}d$Vw~qqa-pzB$g}e9tSsnas7w+a4>?4Bs-8|b} z`Jz5|>LG|Z4N8M;viZ)!^T%ej^Ybj6P0b|3god4_ zyGK_;Wsda2HN721v&^Q_U{B9!sW86gkIrk!iKp!?Qb!XRv~ltoUI?O z!D%fG=XugKHwE_XC$_#Dqq(E!;Pm^Suv+_HvGp(2C^-H*`e!j&FMgNkP{nZKMDe4q zxufsk>+MZ?dHd6U-QJ(lK^(REw5+yv0I_abj3L(8U-N37sq5*#W!wLi-*l C)OF7Q literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_avx512_microkernel.o b/third_party/libxsmm/obj/intel64/generator_gemm_avx512_microkernel.o new file mode 100644 index 0000000000000000000000000000000000000000..60c31e1b0574a19094288590f5f43f52d6674b51 GIT binary patch literal 30248 zcmbt-4SZD9nfFW*H9|T!Dy!KR*R(sf$%>RHRZ!Y;!@VSd3Hc=9JA4HJ2~e2up;%*Q z2E3h}mTujBTi@-++HGIozHRYmw^VUUK}3b3AlibpR=}!^h+03wm%#h~Kj++;lS!u9 z?)8`C-shh4oaa2>=Q-!%oo-}&K`Z?{fvsrAm(#@?t+#=|{mSL;Q>rFOER$idAp&j}B8ylwW0)x}RZ;S%DB zoesthp7)->s|wrNwB0-Bq}tDKfAvDgYYmUA+#Wk$A+-ZOW%hRb4y&-9!sw&IU`ED-29 zRba(W;KuddcSc9U6;}M4=mzfPgbJ+{Q2{4 
zvH>&NWyMc}Z9=jp)^hiN&L$Eqq}q*&ctua_u?kQNnuT^We7KM_FS~rhA)jV@CMcSs zNJ?~Js5VL+#}F5|CJCd;1{CGCOvtz>b|p?^DHObVI3|*k;A=xtMI-wA%?DhVUck zSZ*iV$}k&{rzT=2CloJ+Ebv1KtZHm)w39dZ(;MN8o*ypm9t`P!9@1YQqV%C#XxY{% zL(JiLA}Y6PTh*>U@^Y)ryNTmg{OSU9^Ine~DO6H{R$B2=&=xygU?n!-Mh#P{hxq{{ zZ(Zd;zhiF{+o_SC#c-1vL3NuQdm;$(wmkId3r4u!$qbXHPjSh*{{}JaOLjnd_aa_YR~FLX6f`8rRU_7{8&BENii2kAr|4K??*gWC*J| zY!J|E#b3BELv*H7o10pi>2oxv54iIg-N#L}UK>E$0xBB^w3}T|wa1A5RHx1omae$K zpQ&?g07MmLIyGwMylr%vGR?U;8ZOf|0YVd>f+h;nt;7|$Rg909;5GuSE3Ji)%Q@lX z>M?e*9`fGQP?9W_x!Ts|$;tx1C>972K6mc?`PyVOTpCG6q+fE9>S8CGh7gBzO#c+k zM6PbKy&`@544-sc?TqvMBt1{t@zrDX#L7Pme{>cxC{>B4d{nsZHmBtQbm=vw;d0=#Eg>l zumayNVd9GelU-|TMQLG;0Wo2 zTEmUB`)>lRc0%HgfkT8L&6WAnX~oW8XeE|`-g@sfGJ#S!7kj(}A+Su`nv%o zsTui#*e!w>uLN* z8Jio5A>npl^aJxAv8}t{6{&1i{6*eL>X4;`;i?en0|T>S32}V5>XXaWDd%dM6-7oj zi-=A;5k1Ge$ZtV8br!~lg13UN>2>fi3pQ$xi9qXn%r&^p;>ll~mX(wt=+dc%;2^Hzf(aewkdizsb@i<7cwH zf5uF*Zb=Mga>gq?#Mo>9n!iO&rc6su0wSdT$lDO#b9$CjK2tBhjbVg@EEI>d+>)6w$MI`=KI4 z{d5>E#ky)N-FO89?L;^L_;z~>W|NJtj^*snIC}WO=B$5*r@?v(QdL?>Cz*-0n31Tb zu*IHk&q)8Bn6U6=rdl`^sR4g2TC;_FIW&14!q(Sk^c5O;4d60G9EUCOhAwj<}3Wv zS?Ld;3~c0ED$-@Y71m3_n^_&{*%&$-2dcIKL2`27VJ;81-rNmE(i0dZ9mLUEo7-~q z{9@4456O_eNTh%Ir9ZRfmn+g^WZe3S^#28!U=B0F*!By-j_v8Qc&GiHo9s~XI#0bo zG&19mYq@`190bL7OW#;&DL!qJ#ro(JncTFZ&=Q>MetBsaT8Q4* zO-`tR#FEGa*a&M~YCmxLdJS~KW@!HG5FYS39=~Kl%^lWf)UeMQ=0xU)CzWblJZPn127)$EcFG7DQ=}q?d`7> zx!$RaC6qH%C!34Y)6refO3aSm29J?!M*Ly|Sb@=TuPTAOQzI(K8Oi+a!*yXPb(cC*YpFQI;-H3ZGTKh_&r zZIn`D?2QU1xuO{L$7DwlST!3}#352(>_r8Kb;X@@n3X&3y0h#Y0-A?hHhLL~TTNHm zu_KnNyHK_kYkh~SU zNvJ253AS22rdZ!&K9?%0uu;9k;5O1Z>aI8v@y7njolBz@=q@sP@moqFa8U^K-9NIE zcM>ZTxB!Y;XhjN2t6-ZiDBrl11ZyS{c)~5?8qA81#S`XRg83GUm!f7o zg3Jdi>DF+u((k)Y1^{8?fmS?~yc8_h;Rd(k57tsu+p^=UIV5lNPTJn?*dc2S82&02 zu~6=^-p4WPHVoJMc)6AM9V(u&*FfiaH2DFr3-n&OLA$MJ*YhgcJqZYSN{PjLNkr8P zvK+2Jf64c-Zc6W!3QvGVzxU|*yrWL4^E`RF{Sa!-Ft@S2JA>Wxt9Ch~4_op3EmD+I z6~?@qz-aM_aFFCI#G{!D6|Wyk=Gv$|*;4*}nevnR?|Bhh6WbbPBKh^^}ssrPRNh=Z7(|QKE`>6oX>t=5W}GPeUccPDR2&Nzz0GD3_Kd 
z$J&Z?tl6Q&ppy7`g31mZPyx+Q0*X1f(*xp2d*!zuN%Htn@GJ>A4wO)}&U)U5`3^$cc(MOryLhJ!9plzPmE zsg<3)T(T-rFHpQ!a>H!@nDm9yc#{q`w{KhDlxnZ^Q+!K2i4Jw%TL><66lJUgoQC4W zjt0G9VJ!C{`68_2sC@{~Luk{nBLy)eTJfG@>Og%d^Y9OFPqKoCLG>=Cdt%&b)|h~? zc7uMP5YewV#}y3ZSSU4R2yp$yL{H0$u7K`z!!x}K&vfph=sf%`h|cs4FxuiKCzq%~ zHY||bdI-XT{*j2gPSfrj%cZI9NnV~!Bk;Tr+;xM`;I5%Wn3CE5059NDoYeC3y+z?* z)YjC^hRKMR_!aduV|#0yR5@Pc9>R>e! zX_pRG(OFQ3KOj(DV|zQG%+EWi$w;#NU}dcIFdR~?W-j>b78#iCEq3-oE8YoGNdxz- z+@yiM`#zw2(G9*#%*XySqi<*|#Wa~8SOZZx-ZOS>AB=U4pbFp{9;6%}g<6RtVMBvY?HNpL-=z&7uvr-v`ejt(nMMT=RyHQtn%X%M! zU0(`I>wT=e+mWY}U?rcvBu^g)O?a{r-vMoYtbGFaRof)g{uDll<;irXuyZV;g%dvM z^P6^=)ncC>_sR}m(ntgBDM$q3-P#1ml@GEegz}3l2z?ux0^Q#{Ib%ae>3UA1_bs}& z>PR-KZg#6_Jj!mU9Mdz~1a7hui;QJh4*ZEF=^^m`>%a~;UTjxlCw3m@2T=^c-avD8 zs%V0>>8MBoBXQ4C2fyT(i@h=26Xs%yHR-0wa55t@rXz`*^kY8x80yR&C^2v9fcpuN z9*9(ogeD|xU5{ang(xxm_2sNVf)({w!Xb*GgeeF=lk)H&Ia3HXN^0FnXh21F0lh=l z6MQX*7ent{q!l0d!c#^CWfJ{NuT^@PY5yv=!ZJ;j3X1~6v^Gf7IK-g*nwu(7ej5H+ zMAU={k$ZaeHgy&~ej)@*srO!Wy*IE8l^hnPUIno>WQZ!W63ti-q;C{8a}fBHS3g2) z8rVl?wg#i&p_w$|L1{lw6iE8(j<2Ihc7I!@$T&l{?xjU>+x^@{#Tl%?C$LXuf7nf7 zf7WPkxAg096qAwmPI72iwms%?dKYHmNn(HZ1yGTCtmbB5)qxNXjMhfmIcy%L| zNV&U$DzqmyVByatA(k$Tcg@9mk&%Ym!7Uc)Y9O^X=N^P|Tn((vZA6r0U3dsob65H- zmg`z?$QL&?i%D?<_rYbpElOvA!WS(!=+gUu$|$ z3+gzV^tF(!Z!qFI#O=4HPeSl$04L+7=Ca=RM*7&)oDtyj-63x+UKYk8an@)4n z42cQEiUI=)16*+Xe@CaT=5M!O$ zuMns_5fWv|qKHdnRF^Z>C>@%b;Z=zmk&L8e&;#her-!ts*i+MuXb->JeF0E)NnxSb zNkzNh*~Hzt!Cog>!%W!mio!Yo`YzXQ>*I5^LLEkEzz#ACOslaG4_m68_q47?a>qo` zIUTOY_MX&9lTMhl0fOOArS32?C8kI4#*!W(y2P%-@K>m~Gg9y%AFviq$ZM-M`FM4dS~|#Svtz0u?8ls2R3* zDt!c=T3m^*A3&mOQ-{^=EjNp_duS|9zeb@c!`tO|kwu!hMe`Xbl~e&}y+hFqNR^c` zbJEo&rgccD51&cD`8Jy`p~}|xNxuC;Fys2r*pV}=&?HvC9$Am-*P4enl*Ag)1gULB zx&fFW&yw_2h&nWm^dCQa+@lI#sofM3_)^;pf8m+paW7*Qql>t1>U4VzKIV*3K zSjAtgH|=G;sm?oVx7K;j;ecE0jWyQhYMizi8D_ztjK!mJ)U@PG8}thq(8!Y zThRu!g4nCrYrYrFW9Nd=ju_U4*Pn7t@O3ClJrj+ya)1qr+U{2fkx#zQB9PPy_o05n!wBU(*nvpYYp2GY$MH5f*(l}o zdm$_2>ms+ka#RobhPB~6R8FANt23!b{41#2@X*~*E4~-EYRKd=6h{5RWFz{-UO_G9 
zGw}6EC|cDEEL&8HHoPBuC3t@@`h`fcBn+eH?rIGZ;Z5gIr!B@`oIbffWX118L6XHa zU`c^?Xkl4Ul*5W2hP+UIDy~Itb*xdxKi#&w7~}#g^@4~|cPH*3eLByzc&CZ{v(RGQ z;XQOfbXJ{IzYqzY|9q;X`{XFPVbRvp~pX`QmJs*2C$lzCQS3YAm!RwUS z;zruxIGr$5{4J~?yeCyyZN{<`A#bGU3SD5DYjnw@aKeNe6e#3A9we8sWLX3mLxYj; zu$~@Y19r%ZuQWouvN>zT=Tl-pStfN4a%Ws>=%wC^j`wT~298QtFHe>7SXb|kV05)G z$9pt(1l;&7`KwsH8@D~Du|6)O=7@>M&IG$_)6ar4LZ9U61<~p;XfzkCfv5o*_jnMs z&?BK}mww+VSc=&|Ci1Qo-$>@NwfAJ~P^f#e4JP?MAcA8^pdQso@X|X~`IcSz{PDG!{e7q=QuFY zbLYzaF2pq9D`Z~AV~$1i{@$M*7V~**59jj^2jQnnk5wuDLr`Enm$6)LUI2m8Mt+LF z>OE@nJbC$FT-G2b2gAl+=^#O{ld}+=Uo1rC09%+rE9-ucSrj)Rq~6nA23svv^Krd> z!M8X|ObpErR32ytqslRjCN&|Y$Sp1XW^}t~GUnM`rdr;0?gO|19Fc<2sqw^g1OW#m zYl3NGELhN%W}HH`n;g6^R|GL2?6Wyu)@Vv)D78N0czUGu8!^&d=k~Jpzv1JkBB3~> zN_i(hNbk;aa2e3W2u;sO#>mWPad69&f4YDZIfnBGt01K`dMKRMGFdC4eO4D< z*UYjj_it(B1W?1q1eD@VpPrr#*=E)cia%7>-tm9mgQM0$^jWfMLkI4KJe8GgDFbDT z5o1VOft;#o;VJVe8S13Y2D)%)&)+hYEmZPWR}oFaw~uDVE1XH9emyKhN$SqMtWDsAkZgIkT$hIT^6dD3t6zMvHGBi5F~@jNka6~ z^T3H@V8%0za+r;9RiuuN2}-B(H>#d!dk5_xG6Yo{k-l7@6hPy%>s_Pj05SKKt|?cJ zs06yWP^%k65ZLnmywP$+Ol_RnlNhrW(iqL_tM^+F_a z20fx^zBmgN#wHNyuR~^)F(mXpfs)q0q3+T91QfOf)ni%mfZQ!b&@Bw?ldaYtSyh`l zOIm7Q2uej$aYR5B<~|9lS+LQ!L^!jIC4#Gc%y1vf@WQvnpOwS<+Bqi5C^VNoc6c`u zZBt#>R`yfZzM3#X3sRlzfwBj-zF_cM(I#GrW+x&NeF^G08&p;v5EW^c>H7kc)%rVF z3uX1epD$!+%s1EVh}HIWJK*xVFr5#l-)Gw2-~btW?yZ3pCwfpK$e70##RwWL915}k z>MXfORDU%`B>HgXE)By_#NZ2xoj?jD)Je#*%5qxBmoga(XD5zi>mEoZNN>=jR}lwW z5xN!CazOZhN^f7o2|nM4(q|NwL$TQC6_ruG3sQtcg())pxyCejItGoHH>PEzime^>IC@oR+00ptzFN7n?O9cza3j76-f+ z;aB06DiAP$7|v04YpWn(q* zAWAfhhjLz*8_mF1n=C?V91RZ>Qv>#=GaEHuLvxAqx>C-Lx)0N7^pWOGYC}p`zz2%@TCUz509Y%+I1 z(e#4&H5BE~Rx9OxB#`q)B5X>= zK&x7uyn-1rl6$T8h9ZF&iH!YcS?pPeyCa8=-=)-)m#)mk77!SyNK7yNBT(`b)&rU zp#Dq+S>`jG34Ekv-5V~#(*kQ5_JaCzhpslm9q6T1>5w}JRQ#p5lc^}Ae~y{ctYz&P zi(~SUNVJEByQ=J*5ml!YcLE2^izu4J$oj6R$peP3z%ACM;v zj6oCK7XuyLZZMmS#>I1z-Ilf4zLHDpU=*vi=wK<{A!T4ZYq#6#yk4FMtMm3uuJz91 z1Jq~1pHZyJh3)O>0>@)i!BaV^aUZkR+mIL6_~qbhIh0g=$#fn%?p7VI^Io@8ZV|dv 
z-0bjl#_h-$?bPHT8key#w09W4!^3*(6?|_~?>&d9BDcc2GlFk+THwRzvBp)5PYoOG z`!Ks52V^F~XtDhOgWQD}*Fs#_Aze&`37Xz^y)^WD>*=HEq zMG-v(7Fv1^PUSfq3iuo{hgJBNHCvb!tORCSN!t?Tm8a13V}+P( zpOUX4J>f!lcJL)=R=Kr-+t>IQJmS5~{TJ-2LPE?l)aL8E%(S3)VHb8)Y;9OirBns{ zr@#QdDp6vnNrk^C%&qE`?|Z+G*LLp{!RQU}0pf0H__atkT1Ih+EVV#P+pRq0R-Q7v zCM+Tt?cA8WJ=00|@J>0Euc0mLc)*i&&%n02SPjo6Rn3Sqj+(OyfgM14aY(V;JMGYQ?J=*079LaKvvU{aw2nK&>!U3A9|9#d;@ zP4W=C@>!gk#ir`kaa>{gg-4-3*^(_XD_3v@R0MVYf$&e%_|ra)>G)_= z@uxrjlQIGN_6mj``W0*v4NLdnXaUayiDTnj1{MZp=ZgaKf-LTZ_v7@iQHdGl__YaY z3GTpmX}seUUNB=ir7Yc*yF8l3yU+1{?}xZNn5(oLAm50}h;@>DMHC9@v=V)t_hq!I7kdhY}Iry+n!!YbGJ!<^9$lrFIu+qH1{VQbSwWMu~;2>aYK z=HesSHYB-HeJ?IwGYs4eO`w@FqO?lS2gU^(RgiiW(v*0a_#Vf}@En|`S&`8Om$fCC zoVHui2Y^F-(yDn-ka8xW>96oOi>xJ%R_%wO+Q}8HpevOV#?u0Pbwn{5Q(GLcw%&?! zs$9FZjO0XXeeD=Q%FEJcP~5;sVI=jxM;(EqF%wGffoui7>J8-_Hxyv}^k%fl)N}l~ z!6w8}F!*MaDLY35-fYxu(G1t0R7vzD9VvtEFU%ebHhiJUpL~qsu7>78d+Y^+&RCz zc1~2mudNzU35-KD{@`uRf#90b;3rBhD&7cbyp2q0sD8JUjjBVXUoWT$m3^(SI#ix2 zazhpWe1RPrvHn6kG%9vcO{gbW@JQ(H;Kb0zV0GwQ`1yX&4&5JgLJtI6`7(i*HA<2B02)C!sN)tMNQfKTpxmd{6A= za|@nZ0r&fx&xKb9@un6nTuj0Soi!M~TIj^^J$UKQkdvs^v`xa$U3#cl_)Ag>X3Nb+ z(5Ki@s~gwzCD}nt9EQ9a2-8d-Qzvd*G@hs%_hHFmjmhUaF_GXV3^Nz+n$VHc=X48G zoE{lMwv))vY-tGKYk4ERjtTFG;0I2nAOMIEN$> z4$0Y26D9zO$;8}tcc9$_10U5%X-tpcBw?He3G@5<&jr843pOzMt0+~(_zE-;?MU=^ z5L2t=;B&8F(V);M=(ruKfX-Kk%3uT4p<;J%H99pKl=&$w1Ew^% zU@h)|e$wC$nTnl7&;SJOK+qLhSmz^%NaF^fWoBq$b7*;eXyF7jnr4zu7TM^H5m$#6 zPRdM+nnvM*td#>QNbTS@kOOij4Q|2>(GPayuNt!8yT$q#{^2Qc(Q!Rh*$rknfRg!fbMz95bpHYmJ1h`YG}O$+T%`N_GNRuG-aWq}3);j!h)Bcv+7TZR6!!yt@_e*wHoK z)sY$-FDK#U1iY-q%UX1_K`Ljz3h)U2a#G9jw-L{-ZpYCbnbJc*jM?9!u{Y|GYeGlS z@$Fy>I+KDfjZC@IzJn5(KWK!=+VrS3IiuF-QR{(}ZCcTo{3nIZAlx(qK$XvOgEM9~ z@}D!#Md?qZLDRV$7#h%@bhcLZHY9B({!YYm zjnS6#_1kjHX0+j$1s-l303v8?V^9}^wxID0zTAhG{-oKOVj=sd;qQ1nuf$&$&kcGc zcIM36@Qza8J5o`rxe~#GMeH+ z!GECfd;okspQN8D5Hg9+Gj#_}ZH9hkdp>^)dFc0XW{uZ1{5zL9DB|$f3SH))5|6}| z9hHMFFSt!PV@4u6kQ#)&W`sQifWkyDw8k|&6FhXm+R&Pr0M)VAa{Qj99_*Zl=FpmH 
zcvJ8-dZFb7>r}%U>4o^wfp=@7mUj~Tn+jXsA${)*Z4RRmp#Z_oWtLVi#?>}y{Zg1l7?+g23 z$I%JAytE)I7g;j`b`O3^2N3Ts^1xrt1HS~SGZ3E9dEis?z;6Tm(t@m>NS=!^e!z1J5*Y&&`!CzH@k6!kWBlDeI-5pDp z=?Y4w^O{jN;C%hc=!))n(M8LacC6~0*U`0XRp(!9FmGAc^6qG7$8{?@Z|PXn)ivs) zx0ZwZKIH=q?&AjPyjbTf>E^I`-!EU$xu7#TZ(*jvhYDZ-&U|>EkHSlTI$fRl$tBPI zcGv%ie{cxb*`AZY~`Zp+6-jj zoTc-Zbar&ESg~wH2AC_81>H-Qh*m6FHfMgvvIU?N>N00(bY-T?N9L5O+kF)2gI%Is zV%kz$#vMK375Kq^gj|E+M|crlh94jPzYVzAUiSNR`@c8Z-v)c;m5ZOpRDHOKKdeJ2 z+kYHC{18&m!2Lo=lzf7kj#BYxK9RqIS*=@j0JfGGf zE0@lC18%loYrxHMcNlO}F6Z;W$LYMBn=kmbM<$*8OsdVmqQwH3m|G_-)cMP~m&y_kQCw_BYbq3s=*S{EW6VHDe za1+m)2Hd1`6l-0$a_9ApJn%;hxJl2+Jn(DuMiKFw?K|_p?>6{v+R=+bWq-Z3^TfgJ z2He!o?;7#WxT=N1EQ;vZ+kBNNY!dE(102HfQ5xAVXs zH{d2eFE-+f3IB=#H|acLz|HpSjrd~H&wpQjAU=#S;)02Pj{!IF4>J6`2_K&a{wo7+ z;=j!B^CtX|0XOlFG5og4=dL{P?-_6t&%QkH>kWTy($j6g&2jmEZt(eO;{ON3Kb!o2 z*npeuZ!-L?*?xrqH`{;5fSYtaY`{%AUoqe&o+`txO+0f9xQXW{2Hb3aSswp6$AFvj z`eq*Z!+GF;GT_)~e{KQjE-Rp{r_|E%G!O#D|E{>p@Z*MOUNmK*-Zg!6y@z>C9}@LI#qm~!X8 zu*@rWUauN(vwf-If6RC{#ekdin|{p1^E0Ep8Bbp~;3l4O!;hJGY7MyAK9L9h3j=QA zIbpy}Jo5~{XYzBS0XOkHWWdY8S6}X@47iD(|GFQqTzTDRz)k$S4R|^F`S?S5{9&^J zH}TwNz)d_`4Y*0?V8c&-(ipeWfSYpMV8G3B_Zx82pIm78RTDnafSdlIEe||yz)kvp zZoo}C4;XM$pKZh6n)Kh72Y$qWoAeJc{H+PE)bM`-FaOct=NJPXHsIqmoM{$W=*zvu zz<;#?H~qY+x4$>qoAkV6;HRth@qbMFZH`M_ahU@9<6dLHndhhf8S%oz-=VN)`Q;hg!;(ss?{Amr(wa3E-JPd+;dA+IO6enG#51(YjFO&Xz z4Y;X?+w#Cy8S%)R?>Pf*&g)Y~d@=Decj85{{R4h{IxjHBHT_08+VJAMw20(+h?YCy zn&kpNs2>Q|Cxh5JIK7UShPGI=ll`8|o zS9C6!Gn~-je%<%O%f$oiSNu(QvfCZQS1cnRbx3EL|4u9FAkG*P?(w?^Zph&V?NdR?O+@ zRHFT#{HD$_l{R&Y_{Q?r0GCd?GjEua5T@N+20k`(*Y_s3|7Y-;`un%`uR>qq%tyb! z|B5DZ>2T{m28Q8|CN?$$(Kl1o-*WA581vZ9w>9Q6yw>7ppz$Bj^&o#=RT|@0?)Xu_ z1{%M7w(9YKwtu4I9COG2Ibav*qKCOD{KYkTD4>bOoTU8JFHzjb(@tMNs zj=xfmPk(7%haB~EqY1=)dCv1!^YkxwRl{7lHZ@! 
zJ=IN*?ms*Sj^=XtfB7%^`uAqjpXvT4jQ6>1xeg~RmwylF{-gA|#>eCH-+z7*od=TN zh^4BI-iM-yZ>F59Iz`SxE1OFe-4g7!r literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_avx_microkernel.o b/third_party/libxsmm/obj/intel64/generator_gemm_avx_microkernel.o new file mode 100644 index 0000000000000000000000000000000000000000..0bfaa883e1efb94a77c43a0473cafbafa90a2674 GIT binary patch literal 5200 zcmbtXZEO?g9e>U-;1c)FMX8zzCbI1$!UR^#LP-ki@+5bTvl!x#knnCfiGhT~$((aA z8zo3iV5gf)wRI9-Hg#J2VW0A4Q&*{qD#6xn(?mt3G1^tzw6$tVwH2v_wG6!9|6ZKS zVYk!vNauU*_x#`A?mVl;qdq}k&Jx&@Z28GS8GAjnyl*qNZA@ejGd{=%)Z({Fd>`-M z*B2{%&|(~xr+DGMPrA?sL5eXH=+dt%`YVxZ{LV+ZBKzmBN%QYOgkto|{+O=G^)daP zs{cNw$K}SCNDr45XA_G42LJe2W`9a9qa=p9<34W2+b|+$~9SNBxZHhvh&F zN*j0U_aoO7eMAm%{d(LOk?R%XP)Icfg10;BecBO4S^#NB(Z7@HmsM%uO>V^H05=Be z$Cw)V&6vP{ISb`u)wh8k(E%LjPph#ZtI zd<-+VF<1v5i$_Z?r{QbiZvx<#$xtLyT0E5)7=!w_97F7lhsuF~P(eEX50K5jrR`OV zs$ffe8G6_l2}!8ZVo$;fC-3FPY!IrD$JxMO9GQwyWQ`r%s4WXEoRR%L>HHZLwspv=kcl!obc1(r}9E<5U1Y9}O6_kZOwL#ked`)`&e?^lt`B;fn8bjFT2V z88m7wyvK@ce9iZ6DRNZxo za8ubiE;n=*N48U;20My}A7#?BiX2iSx7GRExi#m`$RU=?qjTT{Ma>v&R`pxFUojMU z0~u5JH6m>LWem1({f4Ul#iZ3Y$S!M9eRcZ}Dy^Cv$UV;szY>(zaoI0j7=`gGtLyTy z@N#`Ba$_8-(1b|Jsa-~lBH9z=M#zl8(h%5}>3K0Hyox49q8M5iWM(y4okO{&(&gL%;{@@*QML8s+zSPGA;58e;a@e8Klt?Fy{)t9j*7}R>8k$2))YCTglHCt-a%4LS=!HUzKD@w>cbGEpkOe~;*eu>^0V_Z2 z{_V~WO3}=TGt119?}be?&g_1+o0(ddCTt5uo&0K4kkJsz)F0RHD#%vQ=nks$n6rn> zxq(Ry&|i4hS11Ae8A?FKdYvuzsAvKk3^tGt^heQXLI*Y+4Xj&_J zP_~Kq0e+CC_572_>5}grY~9FU<9w6pML*sT(!zCV;kPu4Y#XV5OfyxZFQ8Bm9b*+~ zaXP^Vdg0G5m$^zS`c|5cLWR+RJRvteZ8Zd=was=irje-ENNzb(UjQ*`<|>V5wYVWd zaawxgeyJ3vV1YL-2f+YNZaiWJ2Jv5-GYRm1i^F-ROF2%!KyA9t4k6|TjSd>LG)DUE zu5c#D=QKr<@8^Ykc(-1=TPpEIdJ8PgqOzdms%axNe1$bWI+UNDm`EN^XVTe}Hkri^ zkExk_a$+I-wqM@;K&%VtKE#d9qriN`z+agWdBh3wsd($Hgv6N=v zTO&=INll%tOZfI)CgelH`rw+tGq8%@PaEvBr8U^Ji=m6YcCr5HnyA=N^tX$R-&@7S zrt_=0*iu-dh*Q2_h3hm-Ef+}9Z^_Arci&F_2kqWd7=xHy%v_@_<&Llya>wNqlg9ZFPK#^H?uc8Wj` z^xH>{JI>Jo938W`oFgZ0-QrYC%nv|bh6tv7%1XLUNawcnXD#a{O+q_sm2XsVes-%= 
z>9nTe7b#;kjJ`M?Ts1hUv}&NiyS@xSV5`gUrt*&DFXJv4c)whEbue zToF93^DWzB)-tgDxIz`&I!!n(@45(}68@tq_yXY- z$#Co@|vV6Fm(t^r&{ajGRbiM#j_0bT&JgErXSG4Np!?<+OA%n?7EZE;XJ@ zj!sM@{G}RZEQkxE(_CMn!p8b4E)kc5Z+GF63#b1~v?xVXH4go-i{33qbK!0| zXRFA0-^RUme#5Q_FaG^1_>C&~nB96L=Mk6v7pmZIyYO```duUd%WKa?7w(q7Pz7(q zU&Tsto^s)Cy)U|OcOJPt3cU7=xp22V7pveOx^TDrP49}0;Lp2ow>?*? z;QL+mKy$>Yw+0Ft7Nyw^iTTXJ)2Ao2)DZ4k*1Df?c3L{GvG7PrOR@0K^fU`+)8nZy zp<$=3KYMC`W0Kgzp4kybH%_E85T2RT(qVi>Y@OCp!zXPG=@ZFg+0;bZGVuTVT@Q8& ziJhjhu(tAU<;D(Q8T=A6fwBZ2dGlNQRf6WO{1{5Nja@tf0q!&!aiBEAcnk_S1KTdwKc)13YHhEC2ui literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_common.o b/third_party/libxsmm/obj/intel64/generator_gemm_common.o new file mode 100644 index 0000000000000000000000000000000000000000..a715ea1e033887573a0b51525163b81dc7425401 GIT binary patch literal 27944 zcmcJ13wRXAm2S^y0P)E5fE7o?5Rc<=B4SJ=TfAbDJ1zB~5sYAv5Qu<12q7V4y&{c3 zc>Rzv8tlGp`SSVN*(94U@g}=j?&jX${cgeq?3IPFkxAkgP8>h7F*pt|eu0yir;YA^ zs;g#dTGK!_-}dKLSO0bDoKxqVI`!zm_l6>i-7c3RE|>B(CG#dwML7}6l!L4oROTwB zO5&ueWH>Q8ZR@1);5fof&z=LU-ujbx$CWrgZL5#h>gJRkq2!^S;;{K?BG^l^5?^s` znHV-dMSwXmJjiN>cEnstiXin`T~We`_wvmxkxaURlkTn;%-tqm<917>w5A#ZN?)Mz|Q|1yt75sdR^Z zXu)iTirKgJpkgE#xrLtU)aarE%zorF} zfNs1VGNMI^w}J!1YVtpzaLJ+M;qa4g)aB#e$JBjzf2Glz-@ku4)abwX4%iJ{M;eU; zeMF>&{-i!KTr#Tl|Dg}By1M(>p!)ELW`qkWACE0ll=Zm#N2qnh;KLJVQd6UI1_T#s zMsEPOzsjg%9jymt>YCBf(I(c9{)-BCT{Kr{9yj-1dENB=mE z5%llBk?4$|KV*ah{TJUSy#@v#v~u5`FC>fW(6Y*imZomM$dn&|WM(0cA5YSg^uLp= zd>lY{h*YCHt#De+N~1HM`bp~P#+!-bLG)|-;lx`f`k%6NVP@8hXn~r19WD#+@j*Ab zE1_TIzLQsvD<+-5dI8s~ICb!o(a{~2bICGvhxdFAuJGelAKo7-IYdsb+}8vbY$Av4 zzXmnjr=%q{(LhOl)vQ{`D5Ks#)< z$gL(n{CITKEGs8G{C zAA*Tv7ZV3PqvyZ%EnW^wgmLk&+adWT$I*I@a7WVIFQquwCDGq@QCJGt9K=>ep-ufe~ z#fbW!BKc%21OyMF_}cK$o-d3;0C|ex`O?%(=%HRa0O2rq;ylGA`jGS!+r-g4#tUz- zdTJ_S&B~WUYY>0rYsTXB3mmU}d=1zVYYec`)KeLpY|R|X9UI;wItj{r^G-xxu+TW4 zcq<4MrXNB_tUmM~?<2j?KQ;M%h_Br1Lzn1=Z~Uo8nMMe2!5_Y4$%KMBu7*n9=sP!F zQy)1{K6uwPZnK&afUk0I9XBxf*LW#$G?+N&nm&vHh{5Oa_rGPy3>K&ZkCXIVUJVaX 
z*jK}ns9tapWr!?Jy>OoCh*s2Fimb;G7zzaC$Gs(|`_4@Ws*fDh2iI+JL7R|K=L_!< z+AtFcZAQ}(Ce-OIh?nP7rE)K+5|vbexZ;#fs_e^{E9y^GAk$?-?>=RX1Fo%cF#M!Y zCo>E%Pz&LcW5LAHi;4Z7#6?$V`Wx_x40&lRflX&&qryK@QAABqEFP&~ zbC|A9oq&6}j~cK^Y@~wCVIvi63PauMMF*PZi1=4`SI;!&k5sU^D)E+TEH8q%UTaF* zstp`gllvh7GvZv5gy&c3Y0WwfNJNmk5s!!wD^{m&%~z-1Tlj7w!++p#k1nQDJZDnl zkqR~qKogM13Pq~_U&MFM-W!v})bWT>QJR{co~O_@Ycgj%JP=s{9?Ln8_=@n}yhm3yaMeV;X-BOM><0~9uD@{yq;#!{A!X+A%T zhRlzl>BGiSB;%(vHlI^69=2vtB;%xCH_CC>q-XTkNy6hads=Q@TbjBGZdR`)Ib0tP zO4j;wGcS9 zR9ZK0=~9G{r>uU5NH1zh>> z*EhCr+}t!t(N@-mjgKhC!u{{%!(}3}Dvh@z#xZr4C%|+?Ic&Vi8iRsm40YC&th%ef z7b&;!2B4U-ggRzVic1kgl`*%oG2DeA5&PsBKmN50rtj3V4hhFymwvBp!<%f2ig z;r^#d>$SIO>$KYrDB7w@(6abyvs-7VpxuiYx6r+JAi`Vpf;(PFlP_V7-$21JisYI`SNIJ8QWJ zE!k#|n`LF7*zD2KT0{3rt)BISiD|@mCv7T;v89gP{zwnCWZ#J8ha&ia@roF-3yPdw z!0lgU9Moz<#&e|k$LKR@{xV!psF~pcv)X6sfvDND2-Z2I-I$MP@)>$yiq#i&_o^AR zMERueoF_i9@B9=sX(Gflb8(?rHD#w;9l%Z&d!SvZzPGXZr8=`ailDT5-Cde4)|8q( z2qN(wF)oUtZk9(|6s*Y2r9=Z*i<(zyd-7oxOAFHrt)`&DtbwLM2I?rcxYDgGa%x*` zUd0L4MqzV5xV9)}O+H91BCW=f!l)R>MU@TDWwf~?!?z|s!?zZ#EOpe~%VrSg#L)|V z=U@U4dR=9l3ez}vUFw)Abzd?{O%bAlIuqemHd$~KBJ|e+&W>jF+Mp6Lmlc>bGpI?= zqL8}lxzLSX@|LBIxvCKMZ<}Jgk~rq>JLc+HpFG_&4rJ{<^cpRs&S@D-_5`_|G#>G?l-T1{S!WXfcurl4f)Uxgnnn~^tb3ZOJx#toN| zF8gsQ>`i@R^xQ=&&;|};JtR}7z6U9`{A3#+f!yOGaM|$@GM-PTs9)*EA)ckIA)@Yz z1blpuSe&G>Q8hy|?=Ccck$CHqzVn{=xW02!dL{)=k02&!A80(^f1D1S631ZNi!iQ= zm*l8A@CG6q!-NIT-vbjF4*`=d)|9fbUvfww2rN-#u`fmpA1SEQs#y~r7FGF(Z9-}6 zP(@Im^&wtF)amBhOrT#yglxoVH5n#k`6>hR`cW&eg?w4hZj!-7zFWuF4Vqb%&!nOT zA&^5hna#3Ia9(hBAQmXSrr18^e{xZ%Yo-B#p%kO=-IE0_FzHDjgo}%vMaXh<_b_3b8Xz zygI7xI;$B^&-&!FR_Id)S`fTA@>B`L;T69dIKBZ(=X6 z4s3^LEOO)3iF5Aw7ikrssYAbc$Y~*Sg4TcHW9%ig z{ zo`SGBB@i+z5GC_5_k3hl`Y;!4^@Xrp_R^Ye1`7%eyF{45&n9g1n*Y0^|IkW>ks00{N~XsE=dFR#a9Q7d7KdYA1lq zdT*#iY7sI+$UKYlwZw?aQg>MqbbUrM=OH(ZKxNG}46PSK>qGI$`og;cGt_}Ef;Ur_ z)O2x05umI5mi}p#D+WDSF%7MXrgh#8lM9_mC?h4d`RV}GVYyroGFRnC%o)gL_Y|qS 
zegj>g!EZHlsZVzuhFYt9@P^RkAR0-glQcq}U**4qDt|@@2~`+b>H{DN0$-gDCT)z7y*Gr>IJOZah?hP`*4Ou0k9gT-|33436;F9-x>(019t+@ z%;i{OSnh3cSq4HDQHQ#ttI*Q|^zAn=d5ur24kPRqn75>Lw$7#CRUe}2JiJ(G{0?$M z#+#{wxf#&PO+%!Tka;4!NF1k=f;t z#<{b=&XPf1_r?7sqfAn=l&z`m3KXTjL~E$@c~7PBX2kddjs83ude-o#J@1&|KLA~6 z{I3L$FnH)@Ls9B(P-1BB{FIvfJc^cgTLYsdhjgPA;~L)m8IC~11{ueAPB#Jp%dWZ^ z2!u*bvjn{le5;I)L&lk=9bs}fvt`6Y9BA-ak#0r{m{s8i@~7td6Ll5El>s?e4e-es ztlMZ+0{^959~vzEGEBTOfT4q!MkXJ+{)TZl#Na~)04D08uB+Pu7MZq}j!r}-&=itOvS`o1Gp)^Q_*?$oN&iln+gPxFK3G;k-TA(|LG7^y`_B3dHlA{~6#OALG;xr7b@u-HJ@WyS}B(1VYn$v4!2B_K)s z!4qL%gQM47vI;+&7yD3fFR-l`;lqG(L zWcPC70FuPZDBb46MpH;Nd#tT2WuTJ73)cm(c=~tn4-I-a>lUS2i>gB*9zfBMS%tk9 zvHg|0>(Ah-xjqEfDxbE3xI(^_Qzgb=>xq9BHH?2U#1chSQECy7b8Ju}20kvmfW^XX zGmO{M+m;JcdM45$F>%}-|B9A)8u1p7^x&OLXBi?aZGMV^|g(j{iVM6#2uHVJeLeppa&azB`%5A+`-V9Li>H z%D3h!EYsgKduD^f)4B^E?S^52Lz$1`dLAZi8mykJJcBy^aYk zy#U&VmymJLtcqy;#}ufk;cEhYQoE2i?!(uvx3D@{p@mN2}TCXp7}*hG2{p!&PkJ878Ok1&xke>%m=w*(-4ZV?hrT z(nR|ZtQ|r6#+SWcu>pg*ntoK!l={|0F)xasZ&6l*T{_^y6c?IV0V9@Ci4;8nV=NWsDno10eJdq1de$kduW2u}2LoNf zw>VF+=#{7?Pa%iL>{Y}S3w^fS_>l3VIg%}VzX+cpI~8eDO#?`YB>Df4 z3He<2a}>?my1mFO_YTo70f=eI+l-spQzulhNL*CQsB+bh!V;INr|jTVpU!3?mCc}3 zc5^;YW#mIFr&_b}83GmV#Fn=)bqbjgdn_8?;Q~J0&WF?9r^TS=N&#QamIPRoeg;fM z?twaNRjRaXOD6)>H3~7dk+7V(1N_SeXV0OH4m+D<4K+4)v^~hz%Dy-7*{&AJG8WZp zS_+ZWG9ghU6SlY*14a)nwQzDpG}_|JZehhX3xF1n6=QozOv{9!Yv+CH6fNIc3WcVa z0%#;y)QU_!-lBGa+9#;7W+Q6IW1(xYWs7shfDbyXS27`;qlGmzsTl?Zn<26~Ktd3< zTqV}yA^NV`G9IFtCPG^y)iOadPLv#C^;xxWg5OA43l@-I5mSX@DXBRm7^$IKZWxc# zTn?Mdiy~~LZ5+i;I(Zx^3Q5BX3`+5|u>A;G75N`MWO0%Dz)EVg^!xEyS`MzmtV;7_ z4VD4AxiNt60w*ht_mN$7GgMS*AnP5|67RV*{E~q7cv$WuQL;rKexvbC3}hPl6w`hk zTVJ+)n>`*OI;f4RL$y)kLzw3!bm$8cPC;|+2^yq{BQ8qnmT@vk$?_7YSfJjC9Ux_+ zSJYj#)-){ULI_(xQ9JmJJpI;<0l%8)WD9|HG*7${-$s2W*_v?*!kOs7(ic~V8r1ta->jzaxl|C8|)m%Rd^p%WSbe9aheGqTn5{v z708xu^|Xa@gwmXpU2pPvdW@7OK)ZOk&5ZOmod&XJ=YN+VqAu zt#3+P?=}d=!LPM^&(gZ00N?ze_(52i)-GgmYwZ%YOf7qrtyZl#`W9`1k;S-R@Py1# z=4PAGlAlFSrCwU%*#?f1hqao21>%3Lxel{&#I}*h9@1)?e<|0QmoQGBcDdj!zG&9q 
zTm*rr*Q}u&cYvX5wfXKkr=RntmE9ch~4>|3M-* z{=^n{FXOZ7OU64)P4n*x+3eka@+79U{?Q}&yQMT@vY)JIi3M4|$2H#MU(VMKJ%TTp z`1Z!kYMql9?+iUO)$=IKYQ>MbTMwkqSmPzw*v6-jKg9Vg7N|Y9?4_BJwY-m7Qaxjd z^E9@A?RhRkG2zj7e@DL?oY1r9-4Utdyoi~7^c3zBxR2m|759s{pT#|ln~p^aar z;obx6G2FSHs2+2P-$86@Y;WwUk9Ty%HZ|gB5u4jL$2ZrvZoacIR==fpZpqEDw)*Zn z#6WyyD`26zp3bs*Eie|+fv_^qooBo$lv7Do>GhD$fmi89sGs(^0#Z7AJTFUoRz-!=F7>85&7mfQP2JKj$>tLNqGH#fC!Y;5u` z(?eyo6_XUa(`yVLD3(=4@!r{0zp1U>-_h3Thxyx-it_TW_=_r*)z0!Sm@{`yvH#}c zn@fu4mX!L7sv9@@!}W0szp12n7IZGaO_%G=YQ@#-b6qoKLOy+Wq_~*DY82(u{7LV{ zyu6v7?uge{B_c-dqn0JaZv8DAn9n%B48 zt$Pa-c@kPtG5X4G8Q1){e|O7kur)Acz*kMjbdbpcfETPYaI19 zmwDIgqO4_?s~GNX_co&6XIXXy z4D8Yld=dFccZsCnzI+0x49}yS!%J|yS~ECi8=vJYaAFA!#`&!VdHwr zi$69Wv2o0N#K&=``!#RxQpu(j-fu&|qpn(S=Qq5)VP2y{YzQ^CSv9LHBWc_ZOHs2H zHFe%j_w7y|yx{F!CdpZhHWg@t(c@n3?G2%d*o7d7RA@leDsN}rk6qd7m7%T-bui;< zZ|@Scsz$2{v=TKDOAa}3aW-59>M9#`rMEXXOaVeQBG!by5f(btQVpp~Dxs#z+Z#d6 za^8`(=-bCu-BMaKV*|lZ6i}YS@RjnKA5zbBu7@_wXw1FSdSK0Z*?RM0_WQ{rFrF^nm zLj~p++dLAQjDZ-Hte*d9JhqC;g-;y|0?AY8m%?MOy!1*hyDm@pOg6yZ@)WY(*eg%D zW-JIKPnqd}U+aKh=YW$J#$I^}`zHZoD)JO8xNVn)cbCFeSgdA_RG%lsg9JnW41G49 z@ozSsiIZ&nCiyj8&QrdeF3W|y^r}09Mp~-z&R%X({4$7gjN`>Nd?Uw$ zHk_^dm7CJ*cv0QX>H7r0pWPhaVZ&)HsesR;?3wv<4=di3-baX<1jpA4fIsi&_+vKw zL5>gG@P|46q77&3dB<@iM~Bt$Q9P!(`62>ku}6A=;hDT)*-KH*0QcdxU!Owa%Y|R# zfZs$oU(3q8${g?wz^A!0c}eK=-x;3CKVm=j48t?|N9@TyX1F&klI_dzHXpi7b7%4l z+m8W{0ner9-GryPWD46EpG;m_!mIw(fzK}-@C%HO@CM(ff!DQ|5_0LY1o$*}CclXh z-p%k#eq(zq@cEttpC=f7ChxI53h0k8Jd^j>z6rPop_5CW8yxTu@M-Q$o)r8yGyFPm zv@W*4L2;1b*9+L%i{R}i4*Xws!2jZa&%{w!F8!ARpXScwVYa@8+!hCVI#SEU=VuQ1 z?;LP1hFmT_iyZKcz^A!0^TG$*QV%)ME4BlwhK{zjj`leXN_oxdmF3|%B@2~Utf{BH zAr_lcdh`6b^Riy&mCCOR=gwE=M9PclPYM0Gnf}~De@f{Ot6e~)h4iO{5zeLJJoe7W zOBj6#Yfw^>{`W{WHE|9W>)Oy6>uPMRk8j@680%++uYFA5xb+YtG%%`*3i-3w0RSp(%QPEu>nVu zP9hweb~n_w;_UISmDpV0+H`4&^&2;`e=i0dd)niT84>lJo%nZQfH&gcv>WH2U7OoC zDc$Xzc*dKYG-++znk~Ej?|Wu?=+a+FHM% zu{F)m=9s3A4x+xa&}hDs>PT+6ISB1NqR2w8oI1cIwqJp;Y#O_q z&Yte(EJ0%kI%ou5PLZ}NP~?&{tsV6nW98}Y+O)6tGMT~Bi 
z`<086kVu3v#NUE6Tbp0IF+v+Jp?+sqV^d?ip*h|DQqpd@QnlpFb18jXaKkU@ZU-ey^@DE6{9AYz!!?zSCuJoZL#{5T9+Nb!uhGh{uSI0MEcF{RHU3Ofo zU}zmQ54WHfI~l^~OZZ%r=^{R~IuZ2iQKk#a2W25{f&VuGaS={u(E|Uk(C2d!|Me)- zb*1_&x5!z)XcZ;&TrJ@N2{$DC1_>7{KeCU^=L-_O%x9T|%Y6Pp!l|v$=SLDws~Lg6 zD&chIAn+p+K1;%9V0}*)=}A5o^Z^N{U9P~_NjSxZz&A=b?M?+Qz9rf1AkISwm+cV6 zJ6(j!b~wcOlOGr17W`k6@LMIEVwf)CL%TRZA0iMJ@u8R$cpZVbh@N)i0%W)Eva5+xsAA{0Ga%Fsrgv)aO*#T!i7SQy*`BXSxNNt#9q`{uc9Y}w2UfS3{b?>1 z`W$q?-*dn}Bk`B{%yYnN9q?WUykF|q3Q5m@k??8>m+d3trFm8dDBff~F$tIHpOkQ! z{+NWz^xec5m!1Fn4)}_3x#`=-=f>&WiY{U&%l$VAr!!39$1@Tx>;LHq#APg>brLSq zZ+F1IC*iU_2P9nPGfVQXjQ_v^KkI^w376?BC0wSzUBYGh%@QutpObLe z&gD|P(H=_prN;sPk%ZHlT+qMdfaf{H)jS7$t%R?U_-}K-zboPLIQlmUm;KnA52m=t zopL<Wg5ZbF7UA>k@7)8nj!`y{-S&tIhHR0;pKgin)jvA-bt%vVzIm*@*5 z`cfjqMGvvpruWAr{AvRD^`3-(Mu2$M@%fRe3nlzxK2H%oUBZib9wYp-60XeY-qsec z-+*Vl%X&78QoOM@uFTn3AFo&DZ0PP*=Fr!wIfTv;hn%(}QK!#ZSJX253)khFjQIj} z`6iBEl%$VQe*+i(UFvV(Abp~m)73$ycQUp8Qx*BIWDdSgUe-zWmRHau>qFM%C1ibq zx;**VPfV9@!$nA6n=aob>%-FJ$+-yY8`I^Z)&RloQo|u1w!eC>8VNX_`qeUCJ)-w4Da8Ly0h+Zqs-$UCJ)l+kX+&lzRWO z?Mu*>csl7X&mkZi^BbPoz-USc;+C)B#nzjcJ7|4DbbOjf&nb5R%O!t3aJn{IkMv)8 zE+w!DcP{y_a``)WOOj80w9EfKuw3$A?Fa0m~&nNFXlp z`z-5ito?t5id^!~a`|E2kor$cb-R2zr^zXw6N=w~aW-T8KMjp@$q#bD`Me(_pVm`$ z`D+~HGi5<9esAU^f2D)`aHZ9vmp3H&v?R96|1OtL_3~B77k_>kL|L@w-BMn0&_BrA zpR~2N$L~uH`VVvcW&6>3QOFnLY9ba~qzlEid=>LW>Jk}5*)HFM_PPA`EN>X*?a4m0 z{Zf6Ewx(6@+lYDg%$ zr}eR2{y#X#k8=5RW*}d7`QLJoe}9ht(|X%3{{;v6k8$}`vT&5`^6B@Kx%~fP4*4|K z+2vn!kgxE~(2J;-FT4D64)O~lR;wc3p6pNiNxS@;F!AMz{}=h@>T>-rLi=3(4=%Ih zhpmS~@~_Pyf1QJT|1_(i_+2JvGsga_9pvB7<%{nIB>%b`^67jtSO5Kd;%5IAO$f)+ zZvX8L^3O)CMzZ|tbIAXhgZxu7toGt}pw`=1`MVtCH}ipWKi}k2|8K}4|Mw2^hi|kb zejo(%r@jBLImka%XEoT#+mU>+)*?vc-)RUQvcE0NmEqY$Wqrf4O4eV${DN}8p}qem zi*SyO-fw%q)uH`*-oD62Z*O1i(EiaYY)`*KrvCGAPfL4|w?C3btb(w=cy8tGi*0m| z@s8X3PqOF|&pzJXLNkAVZ*Rz}L_0zGh(rI6@b;Y|;qV5wS7is<<3A-R81m=0>2Mj> zV_YD`5A{*B7x6>Y_SgZWMCnis*B;*B7L@6>x1R_MAw1^F4_eUAYzqT>|0zx8Y9C%} Nwf_NiPP**v|3CNa_{#tQ literal 0 HcmV?d00001 diff --git 
a/third_party/libxsmm/obj/intel64/generator_gemm_common_aarch64.o b/third_party/libxsmm/obj/intel64/generator_gemm_common_aarch64.o new file mode 100644 index 0000000000000000000000000000000000000000..0244d5b7ac0e84be99374ce90376098f14368dcc GIT binary patch literal 6416 zcmbtYe{5679Y5QF1d{m~_RCvGV^k`Vskk-`EVQHef**O_3`G3Ql)}St9Iz7Gsq+j0 zhDh|F-IJSwv`*Uk$23iqrfE~dUubPK6kwE5O(HO{f#|w*omA3vw5%#5(1m@!_wMY+ zvprxt<=v0Z_degd@7?#lcg{gMy2j~n5Rn|@SyF5Xl#ucH#d?TUL*#43OO&(K*oqia z%J9LO;Fxmu#bcZwHBKt}2UAMp^w7yMz)nuMK2?4-G;;wM+Nz z*z|!adN$NxstnM#LT$UH;~N&}f0uFxx~0r46ihCc>Ph4cRO5rD=)cyeVg18Mpuo_c zp@X*_xt|IilwUwTmF9(NBBRn`LR_9fX|2&707&tIqGkk7OZ6clNR#Kx`GF?saNFIx zcMF1JsEZ8UW@dPXZZorB1%M9r{N_;*o(8*&Pc|BV;?fPltD?VRoVtd5<8lWZPdeVV zz_>6rQ6(KR03Munn3dpFwEcqK>>Kp~F0JuencYVVyxikx?Bn4)8TCL%s!vIKULCE& zsw7{YJR;Q(LLC)rSD8}j@Bo;jH`k29!j~wP>JMRe!tCC|I)+9=0Kg0;RzVwukRHZ9 zYK+sdYgO8QIymLf_Mu^O^l6`T=tWQzyn2seVtna%`vPiQp|CQ!#|G~>a*On7kCp@r zrt5OAhV!1VaY292MfKZtl6+hD)iq+!B^eayXp?2vT0 z9Z;07npjtmW4nIhSxcL;MFfrd9f?Zes}l=B$ZW@qZiyH(baTY`jBZ+M{9$v{II~eP z{x~#^?jRld3v?NpW-cNdZ!5;TQR8E_@6r4cq`!e@h3W^0WA_{-zY?Lm)DbmqZ9DNZ z)j{ZU%J7#Fq}%BB39JDBO;iV}O#eAc^J{AkpVM9sUK|duxGM)Q%K2?ow{8S48gCg> zw{GaST~vDI6m|SdhUGu`w^?%IIMv4-@`}s5J893`kDfqJ)lWJmtI_Yn#yM)7)2}<^ z#tY#USN43I-(R(MIJ&5+BRVWsMUD3qSXXqvw7ELiO!J^$!^p9%tt=^U9()}FzuS8RG9r?YUe75c;^*W_PwXkC0 z#`Dt149)M}q{RQM7=L9eLGxSd!g;w4+8}8QH+~qAM#h!$}0NrYC8DzsR{>= zD0#NNY#;Qe*)Fgkk)MNoLDz{N!9H~GZ75LLUf_D=_wjk5i%BDUJrfBu-!J@EhP9ia z`gGMEo&FX#o1epQesdkY+3I!XX35gXRu5#C*w60$Lw6CFc-R4GLvwxXC;9DA48VoE8Hu)MhB)WV`Of&H%!9XK$ zzXTs9$G#23G2n4LQdeDb6!r!6euGH}xx-7u0=$lg1XBZiVOAsL3YKBFM|Y}o-8FJ; z4)WEt&Mi=Urxrv$`1;_(bPF>}-!$(O1^5+|L0XLd(c<>J?3CT!VVCUoy;K!;H@q+_ z><;L&opi3dA?)_Gy1fy%rv(Re?gQw%ZeNSr3p56OA5{DUl|jmKeMeB=%cyU-N_IEA zg!*2X9dgMlo<& zdb$WoOOuOvOF7(C3IgI{o>3}tk=dmn>kl4RCb`I*GAQaVHE|*IbTIERDZ*c{rOUuJY*L@+eIvoN-l4Qa=sH!xS7ORJPOVE&f-3b^LUEk#eHOF z60j+T7x$5!KfoHn)#f{k`^e55;NJth+$v`(;0G$;M-ay#XG^bDz|R6c-&yn%c4mQl zvqFDj3)JyUDwRo#cbm*nn!e0q%2SAOlBD#(LFM@!q%(E7#W* 
z(-IFsFfTUCwA9w<`W6~h*RNT#QQo9(qAgKbEtr?aDHU3~6WPv8e^T39Kz7B_UA+l4 zk51L^9m( z&;{{yeGab?Ouw+;7###YV!`ol6!<$99A8KR|IC7a#e$2w6Hl(!f^Xsd?YJ1%j#tvN zzJi`63;uP>xX*C>n^0n`5c4GwKE;IdwCb;e^NGoh_nI)n zeNY#AKDXctEx2`k9<$)jTl!n^A9CDo*J~EsYL|FqmQi z7Cl(Ug!s13gpxE%3f)P{cqSa5vniS%P$!T!Z)PRKP2UX11d`uq2$ zv{)yUTGlLgiJF!e(1;%bMU416`}>JMo9K=C5%r4$W{a-(4S)ZYElz8hAb9_wt>iNC z@WB<7hQ50CvN z{aP&E!*?*TOYP3aQVH%7|5xz{u<;03eHImlFe{VzaVba^LJcuj0Wm8fTU#{k{KX(Z z|Nme4LExg!O7j=*Dd@tI{mt_Ppm@eZOj^vw`4eS;Lm2z9pE#=+^D*K5Q+9lAsCc)r z&XC#1|8)ia4$c?vyGne#=gQ4r!JrOFF$;q8$M|EP|33jMJAVv7ki$eEV<;EZ5ATQ@XA#UJN-Vyf?-;07T&OfTRkByWz-f;)_t0UduU1OHJG_k=Za(nr;(!+!%^`(Y&bJ8of;K?KtlOq&X(?J?=pg%o%?Ik=e~su=b%5>;@z9TVaIkDrp{NcK|}`IP;9 z$|7^*9VBx(KePx=WN4CVIJa_pZYOD8o^5&g3^>u2S_4q7=iEQ_9m6YB?7Z&YUDPlB zLV5@(C2IF!`E~b4-JMpFx_dj3jwm;$l>zdTH*i~be`D{!eUL#dzBn=6T~d-PKqB3z zFg}nTLDF@c0a)sv8D2eNCD%~~T8}aqg7ho8TTqhBNV{B6BD(uS_AC8u&w3X&jTGO# z;qm^0ypqoN%;jmNU%T9=XaF2fBreSWT{dODn2CD>rUZHTaiTDG9o_rE_Q&x|S`S06 zFrVYlky}t4do@N9>0{nRG5kSNhN<;4-VA6-%)F0<=jnC~-il(CzQl#6uvv9?0z%;; zMCGEi4gpKs*@jBriYQ1eO~V>SfOlvQFM|0aV>4|#7T(eEu+#qvPQ_s-U-1FztbX*-%Gjt>~ zlFf`|M``LD80n_t;ltTX8a(zhuY`nNIY&ZU$Z}lw3 z2i^=%$LSl9x%kycIMfr1Q!PHA#gqAX^y~wi->CzaBtri1CJ`WnJDyO4gt0yyBn%^+ zBn%NcNtm!tY~N6r^zOqD2$TD};Nnzh598-VI{0~v;rsJB!SOol+f9n&J8g(*!>sKk z$pAjW-Xr8r06)#~Bou|*>C_s=nq}KYwPrSo>uQmxg{h~p$zU^r%_(e7V>61)7&ci_*;v_Xf?-EL zM%ijqt%^@F>vgNTY7}d$mSHxHrqkG{t`TFdFrU{7#{BH;;>?nYhCysOMLv^MBFbV{t*fPR>I{x zf8{vl*(dSuN_^Q*jPIvl{*>eB2QMd)&!EJY{ftPsoDW_gDCj2+kMQ$C7k)|NCnSDd z;>&TjC0vgCuH*;zuZVk1;>&(Ml5pA2_g(P6yWmG~q9DO~p5i!m55Xhy$xD1W9|{~4 z9LDR&7q$7a?U*Y>h0X5u>q2v^Er+P9rehLyrP(BEqlPc)K|U#TmsQ8X#;TYqdODDq zAEVfGvhaNuYqo7wA&Y9wu~fLNhMSIAEOWQkx>0JFw&hRe-j$G4??K^`RXaRlYCr48hw%#K)grlq>LHsKfXo$sJx0Dq*OKJ4E#F0fzP& z9Fzr<0#UurZ~3&C4T2xQBQT!?ipuR}4D}%QbB^(8`t%I9U+@u65%U-Dr=WDI-|zev zL(~cDhxdiB7dYkiphf9 K^g6!{+W!aEn3_lc literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_gemm_sse_avx_avx2_avx512.o 
b/third_party/libxsmm/obj/intel64/generator_gemm_sse_avx_avx2_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..bd7ed313d76dd4fa798360ec1436a5b80e669cfb GIT binary patch literal 15704 zcmcIr3v?6LnI6dwcEHgX0$C-2qHZR^biuV8NbrWF8W>?wL~+0nUO^VNj17Ka%LW1@ zQY8U{;>PWg-KMWYd(s@T-Q9H0A=H$V7z{~BcUwa9Afd?uIU6z|FPg-F33$K%&Wv;| zYw~LM9%kmwcmMmp?tlM#@5nsh_0O`~Y=T*A!d*i8%|r#^`0eSshgChob;1}yzDH4? zlNZSgXR9x{kI9t_7x>j)U+mo?B^hv@Uz@Ka!%~&p`Kyti4h+QlZL#A6u?sd?+;K2= zId|RF-uN*wu@~&Ti4$V{LsaDMzr+36Tqze1f9IXJ3v%?*yE%d&Zv6>f6xA;kD#LT6V9zT)tOc zAqRTJ@exQOO$+gmyT3{I=%x2a(fDKblx#761pYKEiC#WQ?OSdi7#OG($17+^P9`>c z$>(xxvta8O;ZNR7ken=#NT#oI4887uIA9af@-@PsKbg8^4A>>7Na7P8Ci=vLik88K zCm~c(r%1EaH`I@Gxn#FbeFp~n)pM%{V7!uqV`IHG<^VE0d4K-mrDgxK?nS@)k&@h8 z&^Oa*7vuY&sUkkHu}~JD7?6_-^ONl!re^gYmw{W#K}^BgE{xA;g)C7uzxodGF7zQJ z@|A6slJjjcLQZayoFsg-BJKeg9iSl-xBLQa6d&BwF#2kAqSps%m^+OC9GvIwl zWsHdVXmO7sIk%yr3q;#$L9QV7E|Tk1RA?|;J91?VJy0ZmL`*Ee z5GFU~mt7IRd*I5zfYy&2{A~=R)_$2@JuDoZUiS&IXS|{=hu~C1tsa1a%$;yAG=Wpu zn;vh791I(gq*@I~Z)c=gTEPeeMyufh#(B9U1?0tYtT$IF8!L&4CU7S*C;4Q&b_NF2 zcG0M5US>-Keofi)OUzGZ51dP#3e`%}hx|~H2Qe{5N3XDXH4;PSmz-X5*ky35?NV63 z|7REIL(g0QSESrWnn_ES$gzX)z-V}&86H@TfL?|aaXb|e@`7|Y=n=SVS;pk9Ef&Ye z$vr|wpdn5u5W_WtJFsAnUZJKe(HUOE#As{TfXvC4C!Vul>tcimJ1vRv$RZ=vz6R|& z*BgneO4DcPyhslnIZK}$+8gM~Y@^$3^+WaL1+uuOVqEkJ%d%S@2OPaZiL8CCdNg)Y zoNg22eoXs#j~JKmRFC$&oh!!4=;_D^?=bpLK`(CgQuCi;Z#p8EhKcb!y;EWy>l8bA z37iu5;YCh5W0$@qZv7tW?jwD@qWJKEvo}-x>4DWgmzao=OOSjT?Bgfex9_7o>ptRF z4>G&eYCR`(&eipG$uqXS};lxQ|4y{4tL*;a*JF!Df2|^+4>@CD<*-e~s4Z_VE1a zd1Cxg)MKZ`_KD1(@#`KZ=jOp2?J!!%zpx`bCDNbQeq37zsEg!iyI5lc8VLJ5xB+?? 
zklg3FJ|ZUOp?dalIhnTmP9W|R+FGJfUqHe`idi&Hj$Why>hPn&mchh%@JdC{k619Y zcf>9tX08HZ>>`55jY>DxWxVBwE8IP?iv{A2o&zUy&dy|hOm+V!z5DH~`$Kr?MuH|D z;Tw$p!@`AN@-$GPdl+t__gTy!C!z%FtgIfDS=9a>Epp@nba(TjTE?f^%U<(Q7R z{ViI&i1x%A21DDfZ5i?;*tpZX3V1W zP>k=P;RaJD8<||~G1f=VkJK+Q6QaP|eFWSY>u&THwEHPOI-{w%w;t~)S1dvwaz(Yl zJYP(F2i2q_b}A3qtU&XFptj51gCSzmVCy&0tc7qh^bK#saEtMogLEB^_HN1)q;C9E ztkM{p{dgC;X~v1(XS6c_u?f=k_o@fsp-Xb4AF1gfP>`V>F>w$?Nr#NRZS3T?kYZ@x zP@r{#yRm;FbNAB_a|U;!lal@O))!!qu9V&eONwdJO-_oq8CZiQL6(ZtfuBlofBh*uB;S}&5boHq**N@3UFO?wFx znHsXLPoV)UxYD_ucJkfOSBPGu&1}b2m|z}3+;ji9*cma}&z8r#qy03k!e!lKLFkDL ze1Ss~<1)sh1b{qVN82ICgHcoW5rj$wNd`@loYwI@w?x$A+;IE*gtrr`faP zfVk&XY~05BJ2y!cepO#qu_7wcI3G=Qb{+zc3M69z&&8Bn)Pr>NsKVp|(vS~?WrV|5 zgCs7|LFEBlGWMO$zQaf!Qy>}*5r8x|I2!b?q5i}Ny{U;p^BTRn6AuS2U*}U&1j>La z4Eu!HzZG$ZBZb*}N8nCb!LRO+9;IjJZmEa8`=#|hb(56OVk_D1q~-IpUp>#>z1S}0 zD#=Et9HV6RT?kibGy0ZEp!UWtLY}YFFL`{OmDtC>p{TF;J7?$mI~B?A!&G(o z5ava?h)t3&F(4+kko4qK+|85CUAB&^eaSlsNs8u?W+#p`KJ^3E)fYWP1D8JG=y|*E zF!D<;W>DW@WZphhI4?d;K88o?^7Us6`A>a{xnJ8&a~1ntoGHfo)hnhP8EMbC3rttS zdApEx=;;;rG)Uv**#C-hP@^S#y3;N-J`8RXs~rBy4lIrop?0_ z<3N7?6YTg8tG!BDuNYqiDmrKf@Jv1>5!b{(7(4oVopSW-2k3$QElp~1%e!FA^z_of zjcIKR6m6cJUiEFiddjcJ?_4afKJa)r4e3qOzP z`Xca$t6=SJX(y~T1=68Fk*YALK;(+#gVGMP%52HQkpCJl{0PJ1Is*p%pqAZ?nCA;x zV#;{QcY`%eNk{^N9-Z$bGjgzmyTF@VgVvpTYeA|N*5&Iq>-OqVK=Ipm2rl|Ltn51k z7yS@DEJuZWr!!f=+yz-|Nc$PLfcmPu81aSx9L<6agRN%=c3?0lM1pKonr+Gv&184P+vW}i`E62VL!u)F=QTUBY@U+@sWXgI$}0SV=0p? 
zM>Jw2E7%OwoeB%)8zVMoC@Q78p;Y92Q1LFutuXm+qaki?tieF_KD25u`2h)S8*o zBH)P3iN;;1$a^oaor^Iu7MUYhDXj-ZDw)E(D#)Z6sB2Sb8jkNcCJO}=-NaL$WE2f- zoOeiFK*5e5Zkk~fR)!^LP5W}N-e}sX%T0%Y_9KiU_9K>Y%Ev61{$ISlK*i=styE`rtg#&&Mz9ssU@(^tYU65FRIABEuo~aYkgF zx_+t1n2FE_^OLf>S|DMfrblAQ3YOd3Fdi8Lwb};F8z3g-sg#hcxjYQz(gBtVy~Rtl zmoCCL`uUAM4XV$`>Tx-Gk^;ul^L}o4xF>Q28}WZZ<~4S3XrUd*pdB2`jyLHv5wtTJ zeN&ikkkdv5-5O{=p$+e&5F>XXvv&S87Zze2{u8akTUVnkz4_WtThOQJX0-#;HZW#i zI$qOxiVfQr%u_>>Q7X(nm)``I?J&+pnCii)u*{*rK4TdIpXXq%`6N~nP(jmNc zDz<_1=kOJ}NA7H~$rF3zvU6>t*|>Q-D{Qf|gn7%JUw4k}akaZXNu8%KtkZ4FrD?{p zs9$IbO!cdgJ^3bjj8++^=tQ z<@7%H=iEM5yB5l%*N?tu!SzL09XW%2=XD!<7!--^Zo}8GO)oz7B#4oF zKr0x;k2_dU7$&xTgstK%Rl+{71f zY+wV#b_Lp-X@B+|iiq(Spf=k?cK(?C&U48jZv7oFaM=Ye;Kw>+r0I32kPv8Dqq8xU-(Z6(%y1MwWcNCp-^-cd zaCPREJBl8>N_LcN874cX#q!D>RrX&w>b~u0S?H*n;i#HD+%XNUvZDypGaOFzp5e&% zjj*po!`v(ln;dm>94(6-b+V(X9yE%hL>b(Xv{n0Twdu>_OhNNwd<74JIPL7$2=L`o(t8J)N0=e5j>hPS4cl#X2 z+hZymAC3uv%w7Yu9B3t;3IUH9WJ^Fb7gYD)eGv#M@RW&hIWfKmNE;}YfMNx8TL-WL z;0AyT4CWhwkwlXf5qg1ro6+g)mg4Dr1`g(#74p~Z$Kwne_ zm#KKQu`e+?UdlD+aQbjC_|uo0p(Te!Wj1`|P!LFtK;L=>7c7y&D0|vJEQbI?N)-yT zxS764Wy6mWjw`dg1t1&#u?0R10g;XWjTZPF7C2VQtbD$2fj?=1zixr&!Z+FUyaV_s zdpdqt{(-HVE$Hdjhf(%)T(SHE`cEwACm}y&le-G|D0?~%%^dYhhNt6@ zH(THfEbw&}xbXQKnc`|85U6c$styE-r<6{cI(5)%X{qpK_B3oy^cfu*>#JLv*XTPR z3o>JS#Zd;Dnrqu?R#vx#g+RG~-VE6vm^W+I0&iuYQl8=W22vU})i>3LQ*Rxi)|Jg| z_2CUEq%PQ0(+~=TT3ef2Q{eh(w*~5(+QO~v)#3W)ra-WvJy74+m}<}PU|lF!6KXYf ze_z}L(*TPewav}pFTk}lz`4arL)xf*F*4f5a+Co>$-vs$#-J7VY+K<~p)g!nAFdBJ z1R4V?8=9-v)HkiN;y{$T*nl1hf$HX_+WJ)p!iI*9P&IblR$a3RZ*QpyhC_di+_vgq zL(o#DrMh*&hT1Q$F8Qi4xIWPIxqixKY{t}S4Yh$CMewSYKx=3fIJdO$uGa}@Rt6e^ zD?<%|aBHv{Oi~W6Zf%KWc$*F#JaR=`UzaJ~wcFpg({x zaoTTD$<${Jhx7V8sgpB$7wRUT*DdhtIemEgf3v`UV}Xz2^#6NKZVoptJWhZ0ONH#Z z374to91iE{BOK1tKVpHuXn|LfP!y7T3ocXc1kSDr9DX~8yEuF?hf5s3ox_Vb{HF$< zIlrFZaDINVzsx~@aw~oMF!eml@#o`bG7+MXT@!Jc^bc@2$u{xdbND}S_!$l#&*4p+ zKk56A$!9Bv(_cuN_%ArTn8W|W0zbgv^e5UTpByp>g&uV3Ht|vpFX8YYhx7JsA z{ZReOIh@zO!1&}te2Q?H{9PQ*^PgmaPvdaj4~hjITmBKx#+TcoN 
z{|RWOxc;jBDJKz6EB(#iOF+U(p;Pcq!>ySyX$9dvN2OW{TGI8@;wHj`Fn-IX2@?nbl%G*f14q{ z$LL7*6U$8bmw{!IUt;9l?M8p|k!e5uPl;^(dyM|(?;%$GWpvJ_zi+bM!}J5`PfJjy z{`VX5Glog%p+h|i#TQ>({0r(2Tl8li3((I0en?$ibmI!0lkt!3B{*0prHP3`YY<PMxagwC{?r)zSoQG%vYJT;eq8k~E=fd$@_SB#WJ3 zpIdjdOC~36Pq%a}AZCY7Lc}3*j6g_2Wn?N-}~Xb zx=w}QNq6_&=Y4Ql)0Y<0hexV!PiA-4p+P+ZN$gOi&yvgms2)B3EPcS|H z&V)3<_`qPo`k43fpUpt;hvX|)-so4qkSLvRD*Z*7{kLSMUN{>0JJg#)y!4i&wOlc0 z9Ft++l@rQ(jj@Eii8#+tEK5h3bZ~9bMY!F~5(|&e=Gh+^5`)y{r}y!_d>`M>jU~dB z^{tHChaP+3Q{^kr@hU_Q_UhJ$nzaNQ$v7x=41?yvF-IR955aNGPO2MAA4v#eSZhhD z4a)JCA#pUOte+#%OtbsdMy+L14J$8eYJ(oX-npPY4h=oMFkYX(uFU-aWV1)q2ByrP z#1uKLN07GY)>Ur(VXRD$^|2cld$3uzF6-7Oj(`hws<#VK4#ChKUV2;7f+K23Yq?f< z#f{da8d8qu%jI&y-m6C8=@SrWcZHDSo>0<0g8bW`TQM$!4Nj`u^iab3cImbe(v?M{ z9d^5)p*x_#`l;1svshxLwAu-WHusNop=h3C!p`t{BIOJ9cl%6bNk zZ)ri46xGtw;+=AN)X7KEeiCZj{PWUw>EN2Ay_FDd%YOP_NBWm;g9UP+ed)WOmdjMZ zQ9b^*q&bVMfDgyhA5@qVXdccD7`dVSt4*AJA|*|KDA z@ar|oOXpFqe>Jzm#~CN(z7F%%E3b{766)@EAVJ;5-_fm0nswO^&Xu22eI_Y&x;=>I zYL5g=vac_kK*u3>ho0Cm{4S-mx85;l$$*cr8HroY8YgSX>#*y(SuUxuA`aGK6LThPudxVY-Ub+-0-2`|I5w^b**5}VDbA2FC7H_sK z)NObAd?(^IfVe@gA38@pjxgnHp?5c;QE>ZRd85k-pRSzU*IT-*%-)jBoFBq*w;=}5 z5VthzeXn*33{mn!L|ur+xQR7HcSG=iUqqy%b0O5iOBdW0Hu#XyJ(*Zfe#oUyWnlQU;9V@BH`4_r5T?=%TXtZpBwWH1@#oYsX5# zpPwKu18!}J9$h)D+XErDqR>!q#@uhiTTC}lq=fH+&b8Ks@DZ067bG&J5d`qq4vRPf^9;dCc28^Z+Jpa`>e{n;bbAY?Gr4bsci!i>tWY zbnF2xHHo1Ps+Q3P< z*aMykxya?|qbpE#$i*&CF!u!aKyaTt-S4tz9QHs-F1CR#QDwj`7;suHCIs6w&$eD{ z8Lrt~9#;ppIHZGXk1PEMiJk7QX&hf0Xm_pCtN3@z(_Z*$eh^+($F43XSfC2l{AI#e zkl|vlF@o5sHGAvmTm06Y-FFT*!)#vJexOjg<*4iDr+1}4;z`OgK1`TCVSd=(&*{v=+pa+e%_YU zjf(W_cy`>ZtPZAg2d47l=HUu5mdcJyrj2wiH;KnHql9(~urBMsul5+BNY1d#?jK zg=+aGaOJhSpL^b3SS$D!F(b??y9(Y~ncY39=BImyZk7L)KMr1!xsv_<{Q?qaRNomJ zfl{rtI=joSQ)}J%39nvgNp$`V81YE=<4gKIG7n|!oM#aECyuDNz`THm_?xQm4?O-@ z74FB+|32^WSCO$t=&-0Ud_I5cdzcVbZGXh$|5g=)@4wGa|CnpVFG7blh94)2AN@-3 z`F*^WZsIF?gqJ-1bViVjYJQSgT|0o-&>DI(=HCnM%JtQs)0fVYTKr9({q%c6997|9 F{{do|QYHWZ literal 0 HcmV?d00001 diff --git 
a/third_party/libxsmm/obj/intel64/generator_mateltwise.o b/third_party/libxsmm/obj/intel64/generator_mateltwise.o new file mode 100644 index 0000000000000000000000000000000000000000..6dcf2cddc55bf3d851b3c19a3e032281d337da3c GIT binary patch literal 2112 zcmbtU&ubG=5T0$SZLz;B3WW-;DA`?!xXGQ$2`y z3_>r8c=F%yB0YHV5AmQkJ$Vr7%;vq#w;aIjFSxXwO7}c%exMXDSFH9}r8_}HcdYU%+4=aITkw^r z-KmRrp$EwvAgyMt6L}tgBzzINaS(FPjYTVd(u@S(7ojg&0Oj@d6??vX|KWmti!HOf zUC0-l{IauT&u@u_U2)?czv$!_oNLX+>rLUxj^7na-Mt)Q`Q=Egb6zd=dBf6R3tGLc+ z^{5`*Lw@=XZvb}y9}M7M5TDVWHv@RcIvA`DobNvN>zuQtLgA(ZCpkvET-{t-t@6!{ zjR&P|zP-9uE%7wcZ~DzRwb~-A1yM6TNSQs?Z?ps#VHkudnN=K*@Z`Gf4n50GAw9(S zfhaRQ1=mp1))}_8*#di^8^>36&6xAo{AKkJlUh!G) urnq@o=v%4Bk)s@DJmoac4pML=Kj`1aJ z%f{hcV#ZdIMY#2}>0CO~PU*CrVdmy8*P9*_U@(DjIGJAm;Bp*+xf?Rq>7b@G0MS0;QMhmNQ_M6uVf+jx|Q(9Lv#RmaIgYG)&s`9~O+JTsI z4}eB{PMo@4UwBi_J_>%IO@z+RDtC#~H>yj_1qUC)=BG)<%c1PR_If6?OG2O#nT>TqSVi+ zpNExA*|YM(wcu2(*R7M!5X17Vn{I3Zm$vw~A|ur{&njLyd&aSkJaj+v5cK1m1Wh%i zRS%xf5WIC~kBM?ttVJvb6knHhPP2a8W&KLa4J!?&yXD(7(z3gOe4*xcGz1H3?xDsp zkvyQrv{pnaBp+P4eOB?$AM^B+4Lw+PBj%4|Roh8Lm++GG;G4E6=0{?Tkx&GsktM?I z1BxFK)V@9hawB=Urzl5TY!gA~le0Sug+iD0e_hu5k=%gNOrdHZS-EL{?!I-Ze0XaE zuIeQpmJ1AI(G86F}L!N1h;v^=VNAc*Bmj{%=KDIWcC8eSdsVHX?q%71#VyTMq~R4yJ_JYY$Z);V7RSz zRH=o^kUS>X3*B_bHU2SdDD?M11Fh@&Af^AoCN**3qsX z#IDzn-SFNVtZ>54`Oo&)8ZY*#{Y3}vAP4?Cp8qJFP9b&C5c3x=778N)`L+cTTTCZP z$wp+8i1;b}S}Tq&*dZT$;r1wP$rEspy{{?XV#>yn)MoW*Dp2_!UP4HU(OFXJfPGk39u&aC`>7kGd2C%e5j-di(| z*t^N%s+Ru&lrP1DKNXHGQ6Lw@AiP>EK2Na`h}PBG^<)`2{ML(gz;MZ^n|(la~2qaH&UFcRg%TpJCnCF;5Dh zsr_Bnin>GHPM4>Dqs!Ctg4ZwfMXX=oBthlPD0R$^h_%vbz0A5RZMAh^g@TbIB;y7= z5vwhxVl!iR_}TExN?XJA=&_S$rEQ=d{V`#zK6ew7k?R}4xfeXacp%9wkoMIAvoz6! 
zvvd9>zneHw>s_Wl#>*XU0mEgbj}F-@1UU;~{yw`7G36+Tw6}9Nu@#vvxbSuRXd&yT z^S_c8A9+7CS3vE17o^tlqi@&B*_Y7`&AsnYk6@b%bQ#cdAJ%dY`t-$n3+khn=%Oj_ zCt<`nh`Uy)3#vm%pK*CW!gqD!U@qYSwW~sn!b+kBeYv z0r#>9r0I0SdkE3O*=tK*4|G|ERG6<>FKW2TG-z~R)o@q!F+ceCSi> zPm9*9(vW`#z>;mj7F=FEFl9@PobAOsSjhhsE*YrDq=uYr2L{ECpri?Kz|_y|F34&b zS%~ZmVe(1~SJzkwV{HrR8s>$Wa2C57?t2bPsL%Q#1C?1`cpM#Q-9Y(i!#H$m#iMz6 z{}4cU>t)<3f@j?f(C&PNT+mpuo{S|AG?MisN89{g@;(x5Q5eqsFI`kWi)C>et`+2k zjp)w5EvnXwC>y`f?tCr(A#ge8q`vv2y#I`?6cffa5hRmK1oAsO>btD-5$g?F)rYXC zeeCMoxGid(Qn%?=zK4#P78V=iS2G5+*|SskIra3Rk0SFreAcn~|CZ-2c{25IHARNj zgEZ{^WirEMZ;D#y$}UhNOH}T$8kC^4plgr1&93vfkM%&k9XrN^wOZuA%wlNQL;IKr z;6!%rGuCmWCYc9^xsJ46qihU=jcQ>7ePig>$^2QISoYx(YI#@AK8RsHh9COw*n*4V z3Zz13TmRgx|w%=I;vy>$UtlP(jD0X8B7D-uxEKfU3MDMh9mmlS$)?X39**GvjF^ZB8Uc zhvFI27#dF))A6(spPr#-Q*eWk934uJ?>5sZGa(2aZEg4Xn>w~`zti6m*c52?Z)n~S zY~B>y=x^#Zhy9Uw#>PJ#Y`)W7x54R%5BCbKVdv4 zPGL^RTsPifj6M6%i2`T`@iq|aAdZ4~D<&hD>%!c6@SQ-TH)PFTSus=_?I97I8N>gtp+odS`AjH)M^C$rKYSk zVJr%DMYOfpqjgI-2Lq=Y(3P!5pi9q{twx{?xVC);!}Xq$o@|c-%i!@=F!wuE@Z(kR z-&eu!KmaPqk5$1lRq*ds!H-qJFIK^Cg{_tJi2z^kDaDKBK5UvNdc4cqH>==JRFVHe z75r2c{5qssB|TfJ;E5{uA;Qa*$N1Y-@V~EuKUYQmY8Ct%q*^8Yg;JqPqDCb$4~s@#Mr5YW9Y-`Nbk(NjT>#N8yw)yeD;arQ?YyV>Fq( zy10??@eJ($fZ}-Vu4YEkOfpl9WPEZrDN!kea2WY&V$$)}2U4^wP$d_A;CqkJ5$z7E zQKP%Py-$xBF*O|34Ivp%Ad`wJq|He)V~kBDCyYdV&`cPaG(u>miv;D?le1xU)p;f= zGj2;OQkB%_B0QQJ&5XtqqYs;GPml}$7zTFF<9W(eMG!|PDd;8hgpssaO6nA8P+W4_ zOb?Dvj%M~0kzMiBaKbdqbb7qxkdi4CcfwG@jHisX8{rSI(B^SC1o~8>Miq=M_P7nR zMxU_M2$%4<_=_A)U#c$t4u{jZ=;94HhpCaAzrf?-k8wCJf3*r;hx3pc$ytxbmGcP> zzlp=Qa5zuj%i(nAcIAAH!+Cm%7DOxe!&VOG?fp^}{8bL8FC}-~PPz-CmCOG+hx2~^ z7Y^swz1b&8Z^Lcfnb44RLyx*R#65l8%=fmJ}^}oYbhuQm44sYi0TR5EVd(>!M z`d)M8e2qY~a{NgfW;pMM=Qy0cyj?l}$>F?w_o`hk{{*)m>Fd~)v%>8!`f_%0lk*S# z!_&pT&*6R!Kh5Dh{V*wlMz4?KapmlG@J|8z1c%RZ`0X72bq6PPDMYTEe{^tqQH)&t zB8PvH!=K^w3SGZ7CE8u+vBU00R!fy;FuJx;0x%5H=K+?bMP3W}$2feGNR7@xL_ zi@mD3%NR+=lcsI_pZ`mHh+MnIrx=IuRBBEbic=I*!kjP{@ldh1wo}gkmzbvf`cwIX 
z;3b*W^mo4pLBblHoqW3s14_jZ52AnkBR1f36ri39#c_6d(w$Nxs%S>a()VPdF}!dme`+T1>rNK9M{V2r)4Vf Q*HLjmyI{YHzvcY@4`FPJ7ytkO literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_mateltwise_reduce_avx_avx512.o b/third_party/libxsmm/obj/intel64/generator_mateltwise_reduce_avx_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..d6df904d4046ada1d70babb8bda0896857663417 GIT binary patch literal 123024 zcmd443xFI|mH$7T3^Hi2q68Q)$RI5a5OfBjCKA^v=!QWiFi9p4AdujOY#gIxVIo0s z4NiBG-l5m*{_rQi6(8$9G`m#|(?BXlu@H3owR_THEZQ+cwO1}@MpH;di?3q5T^Xg~r(!GIM zqr!pC#?De-c5dmxu2L@Byyt~dSGI?5vy0mn9~urUXbK0~+P9T<(57wA?&7w-)4B$Z z(nM>tfUy76%xB-)HE@8ggab{TfMPK&PJQ%n}Y;zc_I<2|1eqGcU6mDZ#Zio%}!XtfGYN@Ys?TSn1 z2bbC=@Ody}G<244$u8xSp}1tb!l&(AxLP0aR)xV}xd7C`@daxi9DhxACAWj)ug!kT zKCjQ-%%@?u0@=V-*#*VDL1Elm8X5-rvOSw~*-5lwx~sCICzqa^T>9_eCjP&30Oa-W z%s@VU;IU)RBc+AetHScQ>-k(Veb ztM3zjzHp81hpY9M3GA1R5#Fw?xos4Gs^?qtHtV3j_SMg5t#YT%%Z>~Bg-}~bGEM0r z<6z&o{HaXXzj1^cxfF%)TjbkT+KjaO<+DflOTI~1+7g!T>fg|~&jIYFR{f>Ld-v|` zzbZRB6D&Vf8`zWWDQ<%xSz?YKlXC1@8(Y%_+zo8?xL^?4=R&70z#EOZ(tYc<9~k!k zCX<`klkH%8TZ3!Y_dl0GbadyAc?dF?wW2Glfx2wv`SvFV7G}HqH;jf1#^;N>gW@X2 zivC8lcXY$jtFpOsm-PPR`g`M5uR=R+k;6Xg2HMT$73*GdLdtW zzVy)Gtiu{bS=JSxMl~#+9rka`^lxf$ds%o?v-V`iORh{fy*xIXD_`D}D>t5yo0!YC z1z*i&N9BuKg5n9_IJK0hgWr?H{2QY1cYF5+M=xcdhAw1;t%y-F%$qA;$j?p|<<{4m z(LCeBDO&Li;iEI0nh$Q8Gdi4BSz(t}IK{1SQfh^N#tL)g(+|p(UhoU6EO>pqE?cTw zT=7_d%ni)Tw)L~unOtUGwwqn+%pE^3+cj}vwkKGA;_nmEc^^9>_GB*$ql|Q*nj8y) zk#pIOT(GJq`xzFzo)wG}(X{Up!Q_v*`;7AVqE??j=Ar&5lVe{-<)D@p(46puz$-7|L{Y>cB5|&<&+I$cyYypRZmmPQD^KMn(tECGX z9G%ZR3EhlM=!RzDz}aJZGa$N07#;kM#0}pEt{4*rH|<{ks{<+|B%0X-hu)bvbs@}9 zcsEVUvrmFVpJem6z0!k6tei_Ay{D#zwe8TnH4+#6$jTXTa zuBJ2NWMP9&_KmUA*yt`X>&U3Yn}OeOP0Q z;|7)jr?gDG3R+$EK49q-1%jS|&KMih80DK9g)cXKaSS~5;bbo6wXtGsBtsfniV?IC z(>3%n3DS*)`huW%$jf{8PA)y$1vyEh<(o@)m{NQhp=@liIj>E=chBCvUa1Ak*Ym5Q zbE5y`XFPso>#>8h=f}SH0Jp+$&1eyd>?7Z3@Up``ou!e18yy7qCG^$4PLw}OD!Kg; zFq@68dxhKY_w=WoD!t$*@B`XF?v{=1ue-UEnXazzxy`cpZ@oYWZv(##cOokqqW5V+ 
zKVR>G(DJXtHAi#1_buV-ck{`3P(Jv(WEKMY~mgGw4+kElKpm-a!5;Zfmzz>Q!qja>#z*$D- zy(4D8^UICryYmC*jzOmWC|v*e0pXPBZLEv=+REo1xIP-$|I~qqr5p-vLQuHwKxxPv zdJRKyFtE+p0|$=`*Y6l1&g`(ohiegyG{oS^7as`hyB0H^nIMaBl?alPtGL$5G<0UHkdOcYz^g6C;BG+8#1q_ft&vt zEmeRspdZB#oSv@hs01Y6topXMdiK=6Ju(@zYAxO<%1!)5@TDuEg4pi)u9Q~}yJuZc zI0Y22;p44H9B;b(;kS$L(+~= z-DavW$L0scd%<%KEY_JYaptCWzmXZW-A|#inlo%D_?^eM+1YGv;yjr=O*HB%{oV)%*XAr-o8^C}r6ja) zqzDMYV3ssFlrP?kxmVnlD{T*huWS$j?C#&50V&%3T+1Ut@jzKhYV_A+6EcclTmH5YoopBvT)$sS+R)^}BQ(pte{i=;{Zfgt|_ z&^PgZ1l_vmgPIld{re6FLGE5Q$nM<{&4F)#s7sKT-WF7X&Ss9jhv7kpg<&>F4ZATd zicfxDGdY}xLH>97Ab%f0m5DFHt9M2}p}kn8%^7$WY6<%v4|2zEi8cbV?&ZR@O&X$q z{edC6Tf(Lz>U+^ov@YtWq4hrz?=8ynOKQ==H?{<({jpm_7_VXr(>64m`$_xL%Vn3$Tm0vJpS<)lE$4P$-g5C}m$ZbrT+79unb$HooYgY_(#u-<7JTw@ z*X)v(PhE1^XD(T&wKYt0Ls*l`|8g-sFMnU$&W5{G^IWmulFw94(ZX0vpq0%GXN1!U z;(bqX!D+W$df4*qg!5MDR-8Gd}cryH+Mhgcua(-Y*Q$EP;?&5Eq4|J3G;nO=179I&R1Wh0zQzQkG(vkH?P;+$ck?$t2s=Uu=Gk z24|ICJz{tNuU?3M=8xExE1&Uh;&ac(S7cR9${(?%e|t;5eE!jr3jL42IG?L?J1wGP z0h{tCM@z0P#8Uy0F@u#xjEls0h)m;+BWAc%&3^+z;tIxpLfxGIAAWJ%PZP~ z!uJv5y@=!dco${FzKj(z?U~r~4GRB)jW0n~_&Q&o9wC9;ylfOW*DJi5r7u57390@1Ey@WGs`-BbGZB zW%%a}eTVk%$mB|oV|K37jok6;VUe+7oq1-}{4=I$*#A-{Sa$da zQBb&@ehfdMbkEKSIHuP52&O&z$Z+?1`Y!=V&iCqQ8hsA!P_eJfwcQwf10iDTW*vnOvkvgnG6%lEpXw}s zRG6}r6@ErTLvZJS#cXYr{U6DE_MpxIi|Y;?l#a0*G|_6h$9{3I?b!q)Td|5o%Liso zm^yH{+BVzRMz&`+La|-hB~WaZEm#;{I7>0Wufp9nDm3PT=>N(URzAiBw~?E(+jpD5 zw6R3{a%pCZ0HTA@AGXfUfs?c-2mTitGM{~0m%9=UO!lGY!LO1f=)8L43K~|j1CjbA z{;=V1eU$Ozip<-$31*|`ps*A^fb6>aUxa3l1hcZ@Q1IG~)H1lRqRZ>4pm7x~6Q zVTPbE|3pWE-C=owRPF*<@>z&z0i5v@@t`r{*xd0yMY7Eke~dN$n1r-paK}xd5U;K5 z(q}Hn%ZX+BruJP#LX8{DZ9KZe=p8mw^a!d(knIgKM8keTEBsSU*eOzfFQj~jHf{vwo zp}KdXp^e6h*WpSc^~Pqx?`hLOTP&zL$!-LNZ`)dKy#Vgn07ypy5`d2A>yvdvzX^!; zrN_r02{U7mZYL3?cuMfeq#K=r5I6g#d%7Kj?9-xZC{a~s8<^mskEbHZGioZJ`0r5BXF3vt)`r#}PJhbmtyclyH5YH(0oJ8Wx!#4#C8jsA7 zy?N6;VwI`EAXFo9v`F~xWI;yZ!K%|8h{Rsw$nl-9goA9JP-mhB@SM0=!6g$ng^wW$ z42~GCOO3bwBp?;<$Ji=ujiW#ga3@hVa-$-TBzKACxa3OgF+X*BdDIC)Z%T{_;<3T< 
z|4nC6ZP!Tjliz{vpiq?Fkw}k@g)`aLNydMJtz4%0B>r>!cM|;f-}pOsmUi$3+?tSS zuy$gy@(h`X3*e!7qD5hU0UPo#7sm3N+}cA3saes#VEu}-(WRzsxeu>5G(dXq6^l#r zM4K@5Sl?_Xqh*oW2KEC`#>=UNFq2qzwr)ok(xZL1wTf=xyTX%g1D_%-dPN(i>ZM~4 zR}*m2$J>zBj$ae(S1B4CPcX`L{n z|9*@{D{DkXbR3N|etkMlZ}L^RZ1YzVNQ|yG42dzTUy zi5+Q+Cu!m0#K9mg8t{#4-!?youAMLvSS3RiA_`i$u%wH-ZG7os*h%a+W~2gK?N63A z<&M9{wDM>~?(sY1a~vksV8B~!ZAK$>0i~SX6w7;2PB>kXWM0@~y2=Pf^4?H4L~S1N z9z?zQ2&83X2}%Q!qFE>jD1iiHBP+Xt;@4Pyw2b+MKi{lb@QkFH0R=ELml`sd!hso$ zM&p92w;`W-T%gScv}p#Kpfd1c&@Oix{a0x8-=onL>+ztCkG~7F`vAn|^q>`<*A_D+NLMTP~(vDQ~!PUnd(B{Z^1C3(4b{}n7r8T!DyR8@vG*WaTluTE)t9)99 z_-toTIEj1;(xNrG#b_2j+7Fncf2@H6Xmh2fQ?g?>4IIB2oI~!@q7Ac77a9+CN}{$J-_c?M+uRVAKe-oMwnHROOPsgxHeDao zpWI0Fy2h9gl7Em6pDofN2a&f&eyRSiwhZN2Fp~?0Ic0IYrh4c{jX-eFMwHIw;&91g!P>j z4}@fIj5PtL!>Jx8IA+2V*oUwtLy1Y1fXB$K;?9aCtFe99rSMJ&&V&1<6u2|s*Zy@G zN3+;E!Loj?2(F2n3`BD+1Yd^$!}7#*1#5yW5gV~!>z?VvhQU@(a#Ub5O&EuS3OC0x zW3I$m{9UMQ6>N7p*wzSL2GGR+4suLxC_@@|+kOiyfg{jjWW|ojbR@L^l6hT=D=$ja z1ZiTz zZS=Kd9PM5`xGV5R|G_#}D$X;@8yie1(EM)L+|lQSjwF9vJHKmlHf4)$e=OP$iXqVD@9n*x8=G-u^{)}Ui9GeMnm)Dd0zm> zIs*mz!2D<>S`e&4e0ss@#iKJ=U{)tTO?1ZlY?2lESR*gTC0rb;1jb}$|Kqjt62X}8 zS%t@(&}2jR#ay1TP9S38CD28&0V6Jd58sCf%u^*WWxY|4@*Oz;%@>#(nk(QwLzpkt z|6a^rAlXzUH&<|F;wF<*ri?`Kv7|!r)XL3RO3nBKq?9C4BBmyxU_CZpR1_dqKezDR z5@O!gn-Hs&iI0YsKXLj+o6ff$G$J7%(}gia%21h>5&Dsv`_PBZ3=~86dsA>z3<{#W zuoWgpD$T-O8j)t#*AkwRO|-8k74}DXE}=V_t_f8A%)?GcQmTAx)~Q6?f-n6y0zfeV zLM3QTjDq6lnvBvWfxjh+A4=&oz+%m$yufjA>q3IYj5MwjM0KzQW6Pi4+yB(a`Re=- zWa{Lhi^3>KE*5N#iy$^-jIx7eU*Lv3k$XkUcp?+~_m1d0(#DuAUa&~MaRMa{7?O79 zIw8~UT*Nn0iOHQoX>hx9mCOxyhuq!REB-Qir(jy??tWF%k=`tS+KqSzzulefe9M)m zaqB{NOSR(emQ48a=_A7O;t>YM^yH3AgZ>R88E&K|3zq$hZc(=Sw_e~O6~od)rH7LJ zmPr`9!3r_h424h1NH32Xt5#>08^>t@vHNb468EmApI<@Y&(t+>^&eus)MO@H-4rdU zDQry5Q|Y+Hnx+v@*z3s7Yn83}(u2NzuC&!6qmqt(F$7AbbK@2Lkj_M}#w6=zy0N}H z%dzQFNG<#u9VoCz^O|SX6qeSjukTV?52?GUxUAB+cp{kwpso>%#KnTx58ufKC~eX9 z*nGA}%V*PLKKc`_!GAI*n#1xGGUoUrK|ygt5j87otq^wMB&j4EOsQQJxwda{8xvz* 
zGuadhQ#gblKo4uJZh100rN@VK0V#2%6>XyXrLevKz(BK|N;OsLmt@k3!hoY~EVpSA zgHdxFAJDqec7b&ecz)B5@CJ;D&*;Z-+`?`<)1~4+xhDUtctq{}YWw8~W_Nt=$=FYq zrm-3tj(S+IU$0-_YcWi8sC7cByr!LWs{%)U{MWXZu2S>FUiw4UaT%-MZ}?RXK-^G; zcO7gpm9OjMd!AMLS+2B6Va_JqYRi@9?A6{iM;A$^taL8v9r?nxzV@WpcKIsq^^Pn( z?aB^aRbuZ50-!2M^i_DX&d2#ieE%zrNfnr{GDiA$VA^zEO&-uXXf$Ws5E9Lw;&`7(Z^Fu$R?k)_AAzXQrr;GsT5 zHS_}2;@KPqrd#?fAi+GbWJ5G>yA!svG*IDhk>M5!5&FW+9a2+EAgLwL29GR2PA;Np z&VjiEImB5jyGdy$bwd_swRk3*7RNImwJ=T-BmW`sOmI9;iMHV!ZSM=5j1TLr*ME!s z0!0iz$9D^v_=sWg8_-U4HJqHXztz&>tKPA~nB`qcdX8 zx5GgD6B!gPfw1s2tYb8@F9&r~!e8}Ufk{WnLN5$TNVyp}T=f_fCQ@t7h+c?g&Z0OT z&7auR6J!>D2Y(5Ox`~4y5B{JEI>jdR5{P&L`{^ue?4MSXFN|r?_SW`C6eKG{c%=t0 zvxCCVXe*PsLniYvW|7|AzJO-bpEZoQ`cJjrbb%bes? z(0N+1GTRgT+4iXD>}N!|8N=Be zl;4^wpOwLWCXkeNwn2y|brl7Fyyf7JoBG$gYlD>$af;~eP+3=$*^W!6b_k%_RJ;e~2SE7d!tVv~L! zY1}(9Z9SJF>}Quk@)k;XUU+`B@yQ)oJD?}gsyL=gKS_wA4cB5h?JRYBn&AZ*3h zOI!8OaB(HT#rtLEO1?by1fHnt`?Cswd)RjXw8xEdrH68*hkHk8c6$yf4maa@+dOkZ zay_%fmflcqUq@TB?*%THOu!AN!Rux=&q!OzR+vPz#cbieFWT0+$H6wj*RJMEX8n5; z2$#LYB8D+$D}cyif4xZiJ42eMvEtbV2qd<-3TJHiUk0&9=gkh+x~>`&1rUxCWUL*T}8lD+rxO&DN&?jhz@PmHyYiFf(CHa1-Nnv1e^E zo=vVP^?=P74oq?3fq0JMqfmon0*z;(Kqet)=Q)j$dB_<$ZLTZjF+9dK(^Z5RaYT0|a*dM_@j)stCs}zijg?M>rbD-YF zx)D3D>cvH_rQ$4A4PB#m(8vmj%?Y8bFL6S;2d2j^$Ws>4^!x2ykV6J0ryLMhjU_6@ zZ{W$`hkhdm1m~M^s~iw=lGDrqSzzEgC{Co#0)N)|+SM+t3i{ona_3zX?M9&YQt|#U z{9E13d+DM~$HKD^G*D?Y2kgH#2}kdgVyW*)MjuBNn(KgMnkNFIfGQy{f>H4Tr&m+h z9dMUXRPkEA$yaML2hAPlhl&aHaYCMi)mAnL!9dL$pCRMcy zvty$he{36Q6@HkKN%>#&{jDP6w4!X^(mO&^H@9!Id}tN}Kzi>X=V7V$uzn#h0ou{fVh?B`RbR71|#a^g-d`X>o<) ze%7Oxo@@1+jXVb+~Ab zTOC_BS`VeTqL4h)g)~2B0zDC8pNHCKjAt}A$G{8@gn5Zum157GpRV{DL+aeA(?1Gm z8V26x>-1IS9{mbd@Y5Qk@i{hx90x9P<@y4gGaVeLvp~!FgpW*J10Uxes2V>U7?owm zCUE}2WIXk|B{U}>!*rI-$MWR`LI3(da^Yj^qsDOkE=q8=2kdoL^exu2${x}W-)m*8 z{$1pqxrUWc9Csc8SbK2Z>SrjOE3Q#$iGM#@GTcUP6BtwN#||;N(<>A5XCe zgi6HA$1!39OaSAaLY*uF+Bj=4&ABFw{@ptozh&3NnG~OsbiRZ|LV=x}OmbZ}+{xZh 
zcuxlJ!PunHNhGack~A?bpVq-LqfiXv<2aQN(i~dBiVmUR`D&H;UT@{I1LvSA0Ei&39V!CTS{>dv>hOR}HSPE0MzR%<$JwL9DF_%M+FJ%r@rXe@ znMvzw?U!NilgGbi@L$e$W zEhBVHTmsd^=!$Z<(UWsRN{<2eG@)`_I0dzf8fE5@M7$wtdpc^;Ow9lo70BZ_JIZkk zHA_er!kVJ9{!Jsp{#Qr$U79Sx&tt2=QzItIgY$TPtmP-F0MkmQjV2t<9786E2QOYD zl~5_4V=%*E)W>ySH`zwaZ_+Vd6j3ML;=b#8SNwev-$$pe|I^Wx_N$TDVsy1X4`y^s z9~8e3f7kVYNL_z1b^Y(-dM%AV7=Ks$CsNmMcvofoYbbH`<9Em3)qW#;ig(Qv7 zQy`kxOop69iL06pdCU68D~6Qj z>4f3@Jn!MV`W?ZZ;iBcRcS;Y_RhCqTp_w_vnhnP~&gwGGQc_#3;WHutH`_0aLknWJ zhCDCkxlQado&}K4?yfr;qV_IOI$km$ik}D3;v{j^ zVJYd*fMj7;Tg4oA9#-M}=ST^8&bxGi2xMWSIbAtd(>M3R?RF}BI6m~{XSIeca{{Lrw5=fWBWFFEdj=hItF19v-I z!#d+e$9|~%{N~2&Ib5)|7~~#v?>;`+dKhT5GeOp9{BX4fU_9&M+uw@y5aXtBvuX+) zjn=>?J93R4^ce!51IwN7=2{3iW=X>ro*!Ni51ax@gVqP?qbF2=de^W(Ay^yC_!$*GmJM%#n^!-7<4Q3s^(g788Q)CtE{aBz5C zpfpr9P`>ZqHC6gPf}>_xJtGBY&&R+oSRrs}+b}3#j4l*K))jUzZsm5^TCk^&a?cvF zku5aBBJ%BwzHAn208wh*_^uru=xE-i4b3dJQ9Vieag5O2eRt$w1yi-N+ibp{&F|9U z9srM!OWs#O$lZ!Ur2sQBw1Hy4#4P&)%$QZZe4|>lR6$jT_uNDBqPz}JEf|rJg_CRG zvh7X7B_i}w+5-K;V}u#CtCw$-hD#<rkfhObiUb&-LX?4%PeM; z!oO3?k!Fq~87^Fd9o`T#hulQlV#cz0k}c8W_2FvWU!z{WQJM%PGiwJ&^PF!lrIK*NdcPcCOp#%p@Ku&MtTI+A~X>BB2j?nxDWP%AsBp6W#5zVpDoJK_S zZv>NxuOtv+JH(X6493iZ=Mn0tUi%@|NDAC(akcOZibrw|F$c{Fnc->;yGGsWXbOkP zN{fp8OEf;4dtN-SX;d53@KBA-Shy6w)9M5bLZ`5WKx@tNB+axulWM~)Px{}f_&2gV zw`eUXTvJ&nlS3%U@|=W}<+ur{=!tW51@P3OaU^R zNtK1xEH0d5)|%PHEtPFKnTI%rZYi#D&FcJ?elCTX-BiQXtXhgdTWShnxK-5@LuFFTB-MPGl1b`SPciO~Op>7XXm3rS{fp04 zLxz`A4Vk2opR^uiCau)VH`=$fRP$>7hL+N(FzLK#lINQLp_2K+UTGa8|Tv z`Dmc?oI0O#pwe*JMyKHtS^FvLAx8IIuDl}r67{NZPmLa_Vv}6y+3GQ!$TihuJF+j! 
zB1JMaZBv%Tv$hu4D`C2dBC9vGq1o2_+EdqSyU(dBn^JFTTeX^`6!HpwR<}*9x~6Rf z>!sIRb!^sL5Y8bK)vRQB8{UU$Uj=Pb5hTozBWkp-vwdWY#3wb~`(Cqu;_{E6c>BaE z*H*$!JLCfY&hoHDrR|5pI@r}2QFlgb`94$#8PisoAkp%jG(37KgL^x`^o<#)&XK9I zV574u?Y)?20yNJJJ&1{fP~LQVBG#)zR1^3!4sz|P_VoVd;5hR zQVA3nbiHPwZ=&*z7h2J~Zpnb|%q?5Mc^3^`W|i>p%Y19F{4Snu^viq)s?zVLuBWBj zA6FsOk!}^?!eWMzZeoQsHL7F<*K9q>rZ_&Y*EZpE6KzbwR7p4EGvkr!;hbOLI+c<2H2?F(`B)YSOf4Fxm*c zxp1|R<>h=e%x>ci`|9Ew!7~-zGE}WrkUu=({m~Rm9XuGxIRS6WaS8_4mPAgoN`Jlq z#mDmA)w^AYy=(rW&bUwciyu5i{-U|Z5(wrhRr!nNO4=kB&8h1zj_9rHFWweOSNJGS z@zbz`mlGe+Ef-G>*EqQ!hgK}aGOQoT? zNoaKjHS1B40j*q|Za6RFV68`g-q3o6C@^n7#tSthSs}~KROr>j+dcl9VtDmAEdEI^ zHmuQ~Hw1639&$_b$w*GAyR%6v72?eT1J@)7~L` zD6@Ekzq``D2H|2F#}C6Hywai$!j&b6*74+&^84`$YGx4GI0~+v0s0H=l|OMl$;e!j zq^S}C8|{5e6Zr8BLvFb&DM4)lj<26&N{->kCRO?d=`n_p$5_4(72Lcj_(t8 zws@(dn_)RqK~9Twz9G)sKT1(a+)sb*<(}f=n%%;6EW8Hqy$9Oje5*=>>g-9vG`5Ee z8Qu5zBaNFdWAS&#?$2gi4$E4Yc&L9y622--?9{CJXZG%_;^9z2QnM5kGvsfR-PF?v zAi>B+EM?fKQq;dZJnn5T-fBw%y9Zfdw|$GLY8Td)o|YeBMNT%-c^4Vbe^_jLtI9}I zi61j=%;dvKHTXFXn7{2Ep4 zPVDgpwcyrV){YrI6@uXG)xRLW#nGtWX9KOu12|j}88XYN!w?r#e9Y_}p;ENY8Af!z z;EBo(aseVb|5)SV!z~DJb#&gGGn$C4uHv7)U*$Ix>)YY0e~-{osmfR#JHzvp98bP8 zLmiEk+cfQbm+vL;d`ZgKpz!B(qfTS>d(Zki9_KrKH02zee?DU16zHxo-1E3fQa<(a zSQWvA$G@t|2i3AP1%(`h8C<8mwZ+`y|Lw6x^Zkq7DQ)yJPx zKA}AbZMl@4FkrkjF7cIKbRnd*XFfaqO}p(P-j(Cu#?pvb2mzHlqOUyQ&v4%uy%&kB zU`;RAOi;VztJN&Vui~@GqO)8lJ;V`xJdxyJ&!)y6@@)CTN1PIIbjIQ;gzCut zPn$BHbv`noS;ys{f| z7`(P2f1Fq$Qm7^aR~0HW99S&o>yUlCIW<123FW#qk3Q^BX;IN<=XmOr2jz{oL)fW! 
z5;I#3>2(UPqPu^RK;t76ngptSGb%fLz$z^$Onjt8#YJ-8^7O_}btT+>q$P$@oQA7TMw?gN@?e4# zBL-uBOHWicj{XH=5WK^x@+Kyzq{t+{bR{j~%#hKF*ug{=3etUAu*MiC47+!lq<-+m zGe)JwYcR%r)RYMcF_D<`@b7&V?4FVmBI1HxhUui%@I*v2=Oe^XBnwu@ct|)@QKStj zeXd(lTnTa24N*B;UNE{yS)IW>_ur4#G<1<^s-Uw@4;lma;9ZMLmKd5*DvdFvLVuM;!eP&uG?1 zgp8dxQWuSN>UVnqtCnk4L(^tX`R(?unYi1T=#Hc-K4}c5ne2wS{pRji13wFg$AeUF z0jA!9N+nl~&U=G9kq|^-p&sw5PAzF{`0{^FncIbl(^OKh+CjT-3YuGKScADY$V#g8 zjSeJyQ8(B*LQfnG7wmL-#lQ98%5`}inzs{kCteYR;AqU2CYfmjg>OcVGA?E5q<0*7 z(es@Z=j!@sn*B;58|E9tDIl~v2^v%h<#Q6*r7PZai}hbG6jf( z;)OM4h-rVT^mU~_UX?|=$h*X+vqT%W@kqP4La%-2wU9a-KYpN-ZL1fOW6vxkXU}f( zc3+ET&fmJD!1KUCVa_u=Txfy5Vh>+q3SfD5jyC7C$eQ$_d`AYLAZk^&p%0Gj{09j& z)y_Qw8N>pv@Wmgvz@L&Rq9&kUE8g~0RjG}hjX`e#_)U6B@sz~lKa4D3Z=(|gpQ9(_ zsS0roW+Qo2SrmO}Ve|-2y-+B2TIgc2Rprr--whABDsQSTQJ=%&QOVOcUMTQ7@i;WqU$X==-}R?nveeV#8>}^{#^Xu? z4mCi$iIKS1iASqVuHtKRBi~hVBYmjgf7)8=WGL#I!F39&tY*&n!=_j2nZNRnjHTh{ z6?`AIVO=j6{pw9+PZHidw}EDzpNdQLmKHNj&gBxP&t0 z?@AG+#!MET%iT`O1LLKER5R+h#k3rQ_zDL!i+nT7I zkz{pXQr4C|U>a|3?>VXRG7y?73{i|Z;fFWH@c7|?|YnMTFnuM(S@v! zn7L}GLU)8eeGcpByg@G^nbIjWkz<8Im^|$;`qUF)k-}Wg28{cr=%Qjt(bX0eeVB$E zI$CpX&r!D$t0>*uzy3gbz51X^(AP(t!}ZjG^weobO;mz4+Yp?|ME!)FPdG~Lbdpo? 
z)h2)X+VW0N;K;S_QGP8Xj?txYcN>Eal}Ry$mEQgj`l`S%dm+n1UdQilw+Z&R_G zs^>V9Y6owHA4Q?tCf-}K$yQVPaX2uuk-ECRx4F}{3a=ob;wZZv4G8QyVSwOm4@;S( zLqgrzm3}dbcKw#Y++n2}R`Y5!-gGiTD=Y0tL9rho?06j@6b_&iP9zmtRzmOishH}k z^;ER&K=kuUm_1+G?wW-za#AU)+ps^44%@#8f5})n2D^1~>G!d8EPkB@#}pZ>&;!j3aNqTQn)|Ld^PMQN9c0GQjLAV}>`d+Xk}iC=KKd_;o%S8<+xA0j z9e1{6^xl7`d{E5kxGjzhIz;}0^HuF>&9(A$s_HhQ);qP3bL@cM4)y`qtP>}ej1?Eo zxQY9AUU+osecPgx4g6PNUO^okOxF$I` z-q|0&)?5y#j+#AM{P_+T0`3^ZYXWae-#s5VvG&-H6?{LtKCnB8MY*C@Z$~Rwj z2YMR%PPj1<{H_eoSvE9w5%%W}^RG013q=QsdU1A+_E{~>PaV!8coRdQbu0pbwqF)O zC0J@$idR}=DavoDIS31aY8DO@CiR`pan*k*lhMXhn*O^u48yVK@?QciC?=z5vM?aV6Fks@WnM(mJ;=U6__DbjRNrK}L zR@|$1gzSVPl}0TVh^&5!$E75xekkO4R?n}awzEYJQ&Z<@tmq2n!*8^wO2CV;K$nz zUNi+&m=dls=MJLYN{Dj-b; zq`0L!;t{Zi*b6^E(uo1Q?V-bv{qQN<<2p4<^OmhEny78D%Q0iQ%ZG6}J`hV>RVOd^ z>))w`$qN0c)8bjAK)!uj`|fBu=l<8y8pIk-F?7N;^~gKi&4VA1_WhpJl))6bSYL{? zp#{@1e#={;Z_^Bv>oCj+^y3d%C zJ8N3coO8RU=5y0e&(E2WKYiMqvvRW(efr|d zKQ-s#%PzZEZ0rApGxa>gtP>7sISZ{jEFBRgafuu>nd&Ny!x3=)*TKkY3wZBVxe}9zU;z&A04i~if+NR>+n^&%HxZTOmo1= zi&46qjhVkY7vwi_uK8Aa6}VF>WPiv$<$$*$v9j;k`bbERvu_wnh_?o75$o!?x0 z!TCe+AZL{}+Wqf0jBa5cH@U{8&7F+;Ek7Es8yRahU#HB`(uV%+Bl>@lnUvu)_D7E3 zrF>zKyT5z|XJd1~ZFu!x*}>Z!(A)t;H#0`C{3-roMs)G~%vIQjn+vua(J^1w@gbSW z*W39jOOCHM@F73#&O-JVd|b&Tt*#lw^YwYYO2_i`XM8Bm%h%uWApybH)qIFFkz-9w z?sJLBwL*3&KQ*<=JZ|IDJ@=d&aChWZbwcPJcvdSfpcSPFEBLe%w;8RC9-2V=p?^8H zVJV-{-}8s(DY>OpXe0TmRfrkmReX+10$uSTz6x|10DKkbF+K#k*hHX@cAra>!=lwv zJ~Xo4gLpO$M{eL+0|X!or|S1e_gm);x9NvE=tn1=ujjiucNVww9ky236C>VhJI-$F zw%%dT{C!P6x%f!mJGx4b#qC_9&F#A<7oY4q0%C1$-^~la?XKNJU^JKR1ms2P%?peB zj-^$LH9$N-4fOCWTab>;;CYceq0yKGJ^b9dg1`Pq?GIts0P;%4639zmVmD647S-ED zMhCa%kCdI?w|iqpP<(|3H2pEn^l$$99S8Ss&v2U);n#62FEGtM@vz1gJoI(p)$2Y2 zbBSd((zdvJ@j>~4$sF{~J4?@6i^XgtH$S+PsYDo;iUJ$L&1a4gg-tHu8>e1^6ujfi z9^N?-2Jbj`UO2G0jdr87dOrI^hKNB_)Zw;j{OZ}XX>H#^54+MiaE)1>9@jUTvl9xp=nx*IsA$M7z8ZwCPIG2xmPhBC*9 z22=f->@6fRU4mQBICA$!0D25szkwrlmjjp>N;G9>A0F0WXjx+DF@p|ZpX9FoX=SP) zieGQVDc*;NYg)PCQKi>ruz$CncBCF-VMB8ohlvU$Cgk@74a4VE_F2f@$R}9XyL;n^ 
zpm-VtvU?*?y#uIrZ$t)lPST|3G++JFEA$VREjrzhcP+iN@qqRqA0h|6_k(Z=Qcp1p z_$WWnB7W-5mA0Q=UN~xRxaS4l^KxzW78v4t_osix;wlt6zH6z_}cN;SgTi>^NTYH|H{U!bH4|;I90{yMq2-A|~6ng?pX{ zm>$UaE`}>^TYSvY+p{)P-{G7?o-6$zzQyVG>yF=meD3ML-P#gg`1Ucv!nwRR#_6Ok zy&cEVW-_xafhRO)wLhZ6L;9c3ya#NDrHwmJiQ`Ma-^$LP#Ij6MI;WN^ms31UsQSI|OA&9XkQ|2DH2eYxoe;Z*6y6^vnTjNic*ChW>~ z$RW_K&SmD%A4s_qwP9sUR+d&63mN50_i=3a;IZ#LfOn|E?}J~_G2fztxi%PM&29ro$0KXKb(sraxWNk=ag0DcshFDyM+ddQ9f_xO*A;x357A+@O2a1Y#W!oED} z)aXv+qtjcE(vKO=sEU2kXEag;%kla7o%H$^7voi*#McI6z_9-d4VllrwQJy`x)crs z{`DK*<@FmL5ZiF=kfiEtY30r#} z_rp^o0QwQ=r`ZF=6HtbXFYeVp0XFhJ6 zb(J{&hNVIAEx?kzui_71|H{<*qgQOlSSAd~Ynsk!j#*dmiJSEaUv_g@GDH6)7Lgff zZwZ9soTO+(QaoQJDXxhn#q;7ZUggm@{py$W#*$$9OJEPhE|CIjuI9tzD+WMAb z#O#@!Csoc{0Faej8^M3kkMJ4zUDF&nta^7D(kL{34@%9*(c}0{mGz2| zB{>rQ{Wo;B<6lUtJM8~WCNx*V$&daGiSUrDaKXOdjE7BvlqVhHJldHHBt-JT;07l_ zW|nR@`2l1V5n^wNa3Tc$4_2MwCCCv81S)JTyac@Y(p|aImZ1`aj!4M;6e1=_zjQ0Z zVP~Rgq+Jn+v*5Td9|dpZ2L8!u-1c>)t$>;m?D^u8Verj$GJGT|Y(j4coB+5Q23^po z1)`1ZzR4_Zpk_Zt6C-PE7Eu??f}7518tm{U3(YWreE$}=AYQslMDhzcI!Q!vA9HA? 
zd!^gIe^Ay3))Yfc;gt-q%WR1KXKqOgZ2$#aP5jXdtvv#1*eQJ+17WipB%RL3i6>a( z7U6)9c!#@q`2CPEu+ueWkt3%jbYU1RXx}AGcRQ29%X9f?;4K_b%v)z?Te>i5N9*~2 z4E)jHOt@1TWySSbLAUc1Cw~DjhyFW~JA7^7>))#}yws$;M`m7jF7G2G_i;MGOX&2A z%?UMjI*STu7W6+A)IL0zDWSKsAPhxk!bFWdkKeBa`)6wtcH+K;>@yOeGjl)qlf;k*03HczyQ z_THd_#BkY2qcsJ~m*9;sudpo&hQf>6WR3K61mE0Ly3tr%6xg$t7QEOLwam(H;d{QE z%kCxTlE2pU3Z&k?>pgmRjg$~Z(p|LK`F@={1jw@MKI-UN-p03?RSw8s>(Zu7IPsZa z*)sHuVR-SqQjl=xqFi~*`}OLNk;n`$oTvCVtNxtLkL$%HncbV-QZhLgobebq*x=HI zZNv(o`1UO(yJOMJ`%3qiw02A|#hA#XGIpNcI~Ch{aoi8EDt2D5YDRO(-UC|9C6MTG z`k*ih`v0TZd3PJNEd`k=J1?>~Li(My9d>k4Sc>-Lt3nQ6qi160ePAENAXFsyiGcY@_uD`Hb4c^8y( zACxhNT`hJ9?d7!e=|&!|(BRW-DS{)7=memz;b%I2H$2~=+|ls%G=ycty|zg$`dhw~ z(VQHv(T9u2N(eh`=k=f|MZ6rFAl7~>#(^(ZITz(I%M?<6;4e%K_M~-^bx{qNu?6i> ziigtE2)srg;cK1c71FR;@k(I(v9K2_ML4E-HJkBTI&oH7BX+b)Bci6#5b%OpVvEj3 ztEUm!=^iBrPetAdLpK10;z-hh?Yq!}&3rKh*25-DuoH0BKf#Ofp2_sTka=qt?@DCn zf2cN89Q2vh=wpv7uCchv&W}{=GY}^IcuPe;egv6;&TGo?j-m|4re1^9gMLI_F@Rd7K1RR`kALNTGI~Nk}3knw~RK(jd1vQ_G^vN1KV{xIqM7}!$ z)_Rw6A0lSO`UW9~Q>THy75tbY4scqzqD&n?vh$L+O%@JhPUwAt?230uapHLu*Y#Z{ z52)Ksytr9b3+7`yj1vOs=P&tTE3IuZG3{6mG(&YcT6am(o>o9V(hau?2yzRW{n^H&ZM2TA#J zi`(oeembpf^w2@8S^LG+H7)+C-a41+=;MKf=Q%J|QiH5S2Q5r?_m1=cT3T+c=IkA5 zzE-$K@j;D@16ILC3}&o_zGGvCYdUGd$V0}o0lATkNqt83nDKaq=<^sY@y9*BAOCfX zZ#%DQLv(KZn^z$x;wy1zickL4C~oxmRUzypJaK#qSHT1zi~wS|1_y!fmoAzftz`F{ zULY8|1a9T7{^AWk$%p<$MS-;vj_pkO^?J)GQpBQk$e6TW9~}$kR#xc>CX;-z_se_t zCEoVg6-`lG&)O3H7`)yR8o{z{K;GL>+8VCj3pl($-kZ`g=zqs0Y|b%=O&d@u$B?<$ zB_j+6mzyczglZ|%f;nlA(%w%tU>gEA$KXm$8UsNAI;=-F>(c4?M2=5+5gWp zU;U~AwO3><_8zP{+snt5vMFD(eNn)qsNXKCqReUI;5TePFsQ2e&ycY49lZV9O{vbP5=F~M4k!Fdj_-PlG69|lOG&;3_|A)>uM%AXA0CqmVtTZ@&2cGTUH6R)_cU_Y zUdsS0k+-$_q;NC&;@*a}R~~Qi*4X`$hMduRr8KHbe8TmK{8B)92?Hk+DOT%)uez6P zbkZUno|UQBt!7?Tc@5=TpHa>oAa%7NUs~^g|G*w6znp0st+uxcfqyUGj%yqLnumiiaX6{Aw)V zT(wuARIT@orD!W0Ch+i#2k+peS&Soc+s@7i_XbX)W4*xRK>lSPfSgSp28Dk*#lFg+ zoZ`MJEBq@gSr1+GSwKzbuZ9k;H%n<+T{>`abldruD1l0b+kTOw(ExFH6_nAh5?^r9 
zexm?_AR7}3I2hGGf2D0#=`LYYDDn8HAcNfEqS8YSggCT8MHbsVH&}KPSsag-whCU{ zSK;M9K9Iu8(U9Cq$)W`=Rnxx9dp5V3xr_$`E6uj4OzcEoXJ_JkL&wtJk5G}aOw=|~ zFDK%S^-aX>U*OYD|4HN6)Vb`qw`|vr#%Qc$IwZj6!Ym8SS2Q;qIy5w^9B1YjN2^o@ z+;mT`q6Dbrsof~DYZ%-B(|)*s*tfZac}es_`nUAcm{h~yE9=7kbyvD5H35MQyrmOJ z2q{d!%5c$x^^xLLvd?-C!J%4>9Uz04_&$95YGF&&|(g0C7w}d=sO{*n=;kGe%%};9 ztiPW2M{Kxg86!A5AdbR-6w(@h)toC6CRKDt8%2%K)qCyj>A_bCBiwOiWw*6F<bP!x$8njSaH`E*zC5FypSUw9JVB?@Z+y0a?JsRz7vWF$ zYIkF6@Djc|I2|CLopAMD>Dwo&Z?u zcb-cSTofHD2;u~4P&k%fY|mI>UQ*3+647B?72VaxvyE4w8yO2|_e1@@7w>?DdmFBP zRme3~5yD0WU)>O{kqHgn%(OO3ov@f&*6djjP!||leLVUlhL3b?Pf*+q^#wN(6tsYI z`?k`KeCZeA)}7&=-w^6MkvidZ#cibr^tSsJrHg-K$)<@Lg5~3Zgsp2X?)^S%qlWa6 z(n~?!8J^+#Cr3!jibHbc8I1Bv-n*RXe^v+Uc8I51$$253{u4=+Z+|l1{)jdo=#w>N z>>!H4(T5dVIFq9%ydCY*VC7|TxCmk>MXaltO!N)JIVFBusTJtE7<6l5>eTj*EM%AR zm1a%oi=%kquM@W_X|}N;w*HM7#igd&gx%ACaF#(~xI@g&65Zh>PMimmF8e4`DfHjU zb0b|mO@Y9^(WZ-TWFqn^j7-_7R#SSZ?rNB?JIqiTPRnpwWi@d?Ay;~8D=&QRIh6O= zcYU-W%)A^@8o`0cTS<1b?-m#!ZrO2eDML}F^~`X1n*s=$X=-JXcom<*LemBbIr-8P z;nt_mEYCi;;q-DJf%?q-@r*xrlNFDLvM}Qy0dU+A9nH@Qe>jd?AL-;1^MB*>g9q`B z^c&Qg=jlihM~e&Q#~Y#A)hd@Z$^#pCL~pBB%ZxV{-mqZ$aQ0htB^zV%-1QS*Fb39z z?7sT?UK<4&!~S$Z>_RT5mJI zaSgPrQC{RjyhyJOzBWWZ_*oiX2tWltEJQdoQ&!vw+Pvix4Bi;Kdgr2(Qm)?9`{(L? 
zld)FHptuV#EU!NA=WYSWd=~JRn3$+mO9SjJ?k4!ZnMjgQ5dEGi{6x+xo?!{xMe=3X z&nCvRo!ieJ)`7OxCpv*qD)b{VlrK&OJ1AU=gpDC8?yh#((|_!JLhEbhp3cZ2Wt z_OL`AjPmb@JifMOye$$G9&YvGw6NTLRc9;fsL_r@RJqf=WiDhce1ABzy;>yQ%=jyNl7Ib>>;4uNJpiwzc!m zcN(f8WUEN)EJe)T8K;fx7KwKiqv zff`OS!>5N){}^H}Adls8iOked3}g4X>1S)?x{NPbWd0gKr|N4M-?K=c6@JPDqM39E z-wNIS8!(a>K6c^k(pD6VEpK4vLU}0e)rB!S7AkCbMIYChd4?$Bc1A8eD-+9rh*#Nx zp{N}Ae?eHb%2+V^MV1~)))?G$p_Zsvncelihf6e6CC34xkET`EWa_CLg<`%D@B=|^ ztU+naxhQ%GqZNXsxLHKEtrCKPC1JO zNS>co9($~k=i$Hq0&RPx--j-qTO`zoW1ruo2UWH zFH4T6_gu;IhX}z3tMa}oZVrIBEE1bZ&!l?iF!OlsZRGxf;wJ$Cp178ExJVNu zSbC`2jgMqaLIc9L2s_#F;@cTt(KLuGu_5WPQM5K~l-n03wW@`};{@()2yH$cm?my3 zqS2rOQ~D}4V2X;0YZdc1IW9kXuH0pDVzmUb>EH{A#})>c5R_JRn}ePAEx@XjI?-}g zv=k%3#@!D&D@7BJ<)o|C){9@05@5~#xiYQHBM zxTQ+cDt$S=Sqn9DG~O?hWo%`of(b144UVs9vIOz)rkALPpp^gA64g8D_ce+uDZ+*J z5UYqGB&sV(a`=>Y7Ry$>#)Q(>j<+HtL!_$2sTOKqok&Hpj)jyuR@`pEyPaps4rXbR zuY=q$2^18RZ*@{#1wu~37VH`if0>xg;DpOpAPajEAO@=zy9BHc9@fvlF36qwPXLOPqLGpb3SWMgVONs>8YGvdWhlvggncaf^X0}dlKaJVRx|Yn ziT%?|>`(T2g$LP8{Ipb@{0@`(x?Yj_Ye}F(SoO-Ka+#-)`ik&21o>AgLVwtDsUIeC zav$>kJR_n+;0xj1`$)Lf&|vW@=72*-fo`YE6ESWK%}%MZu`r!xm98{)8`~F){ED29 zPNd}<$wwR!=OYM{-VuE*_A4<5uC@G!IQ1?NXu(iDF|sH+LMwRv`HzW$w>tk}`&wva z`)lIfw-|H$N%J4rDvh2>{Hna!pwtJHup|k7f6GeDbF$p_SA{0hw=-6ul2LLpY`aa1 ze%!R^QZC1Vp1UllP8(r=JD0_@*O(=t9q_hoZ1)i3#Ucon-8IVCQ}t^Hy~n=FDO+`n zeUoX~)uCj)q?|^MvT?Mp6jz4h?>o zI1Mob@teumNzR*SrEpM52SlfEgXnaq0UWTiqy{@K>#!hLb`XVp=mw#u)!{_wnbMM= z=Ra`c{d%GkEHH!cmTcwET+Qr6DAx4pMjdzjfi>9|y~KaSWQmy-Y0dnHV>A@R2jh*{^H(@@K$( z?yj&Dh5e5!%G)Xl*A`CP5fmOJPDX^cJ$_)ga-2je#%q;<(|Q{cd$Dab%TsI;LQ{zM z)0)30+eSh7CxpYL+*LeO6MVP@e~Fl#2#azX3*`1@z?F-GrYbzAnecVQ;Ilbq8LZ3X zGfzy!dJ2|}WG4BmWC|WogqNq4HbviNI!71n+gQ7s$;EedJWX%LBCh$&Q)Y{}ShChs zSr5lMqj8op&eka0y(v>}Y!7nl@r?=Vh7)%Mg|CfvLd)-|_Bw?jO+);<^pYl-+MOEM zDlwOxMt4q`_BK-JW`VtqU}Ij=3`Khf9Qm>qi2z33Oq#TG^*VEb|PB{gdm_zkoJI8c*PjnG=fp7mP;WA8d?w(=RyZvtXSZydPK8DQ*>6~(IEE8 z1B1f<*(|)whP~f58<)LM;x}mO0za`>Fuyi;ybp^Z$u6NyWq&PZ;b1<1;W>9aR=y^~ 
zykH}vC4!4HSQmz~F-%D{k;3AAq%K#r%YsVCAidhEOPsnOEbUV2$@$Tx9U%|FFFw5V zY`l3}pX~s_xpL!SAjHBs@JoFvg4gnyU&cgKVKwEQ7ynB_LUUEp1KP5{RH2`_qMvsd zn9vWd*9fXoB@Gt@H|-FS{Ph?S$&H3(AO_D%K}1;knJ7mn=dsJMv^k-d-+M6GzOB!S z3#gC}az8a8WO=N{7V=n`M@1-3^u$}6FP~|IBDqk4m?~7V736Fuxz3Q~QLViV$Zqfr z^Ne7eDq=I<;Y9nnLGdqR@(_$750OdqJ`yCdC%jPhD%jTzomS?_&FaD-tcBul?geMJ z7;pI=2Fv@cU#fld^+$~UVv$nS_>|hlY>fsV+$17#X+Ynh0?(+C>EUnuQ4D_)_jalI z=meBKbnl+L0ybxu`Jfg1IoFsG>Yw`>M#XhMHlspvq@f{|EQ-F18L@lgX!7^4h>95E z2a1<}xCxO%X&dRJ>*#agp{b%m3_dPD$YsK1JflZf;Xlro0jS78d}Ma#lws{ov2iyx z``GAAXY@Gus>cEA8fJLE!Kz+l`A45~dAoE>C0c1XT*?Slv6g5)FA0e08@KDC3Ld&^|tk=Y4&Q)Z+=Gx3r=~zt?aElH{y;bDT>m4ugKBALJlvKtu zX%G}{Bw)(dbe?ItyCmEA6)Z8=u=p;La@LUi8zCsZieeAos!GZg>6WNq8`(kah|lLt z&1E?82g-HZJ2c${qbi}Zz!QA*-KT+brJJ_!<)QFRsK9r#`>H(HrzoC^;p<16#qdq% zs^C++NTmPz;Y%f)RHj8e=s=|$w&v zKv%x%srZd8i2vD`+pk^qbQ~M&sIc(ux7$}=^;F_cIFPSYJ#FyCl^+Dt420^1e4-0}E z+~VrE*a0!M_1B=$WNRR~EyY_g!PEA4(!s-o-CZ7QP}V^AgEVwWEf9yEeUTM;*lfuL z0R~m{y@yQRrui>gs@E&U=L`5;P(s&wRxeeAu%Gqjhai2WpN8_u-@mT$W?@-rJd8fC z9(5uhYNQ3qY`}5z>XC;^8ls!lsX6-~J)@XN)Ox$FH1zMvm?u2)MtiU^dL`ZuS^Yg> z`ICFA?*7n~lroMn9~(0T^4*{0?3qf=mY$^t!C{x4txC>DH$P?YNcJWB?eeO%`mHJ# zQPFQlfsu+nLhLDlh76J_P0WPLZzCl5dPEB_xyZXzlS;Aba=j#CNAwL;_#b1EY5SO? z3)bG>IZRB&!d&*CWEUZ3d#QZ=Ih>T7gG;-=b5zM;Ll(kU6HVRM41f2SvrvUpME6p5 zE9_RimyM9p^75+CKHK?lvLiM?O#{t*)QxhVYqXS)SUGF34??T^7)c|T@fwd9{R5-P zGj5YxGeMkun3 ziDvPZ#+FK)C0$4t-Ud>!+qWYldoaFEMw`l4Y&L&<1%2S=xuEb?RT{DJW6h_MI%OvH) ziO)IzDp+-mds^ZaSABCaz#E(nZ$aU9G^^oF2n2VOIc|~D3-(lu)&%OP198uAu%Pz6 z6QAbf4HTyX(cv#%rmfFHZ|P!0G&Z5Mjcr!18sqQ`W;GnDIz}5Amp`_IrJvYo(HIqO zf2~TfK01U!hf~9euS?90gxuJFpw+aemm4Pu9ffU76Yor2yeiV;;EtnVHjsvh5t~N? zoLL@YqZQ|YNNBYhyFG^8rA(B>tHaXsr5D&jbN7tp_QS(#(*ckPpH%02qU}}r0bp>? 
z!$=_A#l%rFKvd*6(&GlMijN1|+RNVplr?(=pB>S~$%Y0K+tQj5V6caa#*>eZ8969^ z2+LFh#4ZFw#9jmz+%5KyP=<%ew?HTE0cS2sgxJVDq#TbEo~H?-2$KX2Hf}L{!|+5nWa#>c!({oJu!h#V%fXGvflIvblxqGyF)ybKXS;#WB`9$32H1 zhYrJxHcObQ!VM=6xayiTUM?KO>a23BOgYH05e$YD4gsjMhKoGdbE0MjVIAu9L&53- zgu+WWwLIzrKvN}lV}pM?O%1<8#O{~}lkwD6MuHG6giPh3tK;`Mng>gxMkgb!R|345 z{-wz3^5W_lt!baCWam!>g~M+=ebpXIqzFfBvfC%wAL)J4$!gJOENQWncBy@4mst$U zQh&>tYmY4)eji^9E4(H5eb!g|dplPd`UXDeX!q%@pH`sHwLu;0=^xLE#sYgHe^{ut z#rEU1aWG^zKA*xvH5?l8HDNJY%`%2JcT;ZW-H4-#{k-39<~LExbw+#a^^LX{8j&e{ zkxb#EzUy>Jvo}HFc&ModQx2{;JePSuCE<8MFUOKSq)rxvlKrTLuz=GdB` z@HW#LE~uF=Jw=EUqZv1-Huf1EPWS@_*Gxp)i+Tt^bZDSZy6F-DZ?Qyx!ng*V&jwfq zM!=+l{v)rqXo|qG=yqWR1m=VDdRS7MS&7dp4yy^H3vgIu1~tnJYKeZkyXxmhk%oxy zc}k)X0grOXC4O_>4_#WLBV;-5hevl|dAzxC;q0`xX^VxtgWQkgBO0|~N+?c_a?~q2 z)0vuzI|ujt5(j07t1D%#Ag%XxIhM!9JdYr_O@C2(_KL52mjMz_&&+_-JCe?}HwI2+ zZk-N(A(Id1=U%`)A4ZWB5{_XdIm6+U)61i%>p`aMcTj89GZr_Nh|?(UpdrTMO{9K| zS~(Zq*FdBf5qrX7MS&$&hu0p&Fo#zl z)rQA{=mjE1Y*U(GD8cgcC8i7m)-`0z>{t+e^0D+Nb47O5qwL#w(U~nK)JOarF&tY) zIK-vL8A)_)N-@!D5JRm}wBBnZ#M8ai&^@nB)rGTq7%<(}44(zjH<%YJ#JjVfHsOzP zNiY%9w$J|}z9AZ^vpdYHRk+Qmk4q1`!^N45^@jXoxxM5c_u=OMe-P6fn8LAV_dcii z?R}U$vGdv*-3FbQi;EoZh<>8@q{~G3rN^eC+i^>3CA$s@rgu&>6J1R@#H-cH#G@tj4uFhZXEq@P!Ay6u%BH>775^ zwX3#bR%2~r!{q5xYHAwY1LDU_smVMb-aI-!3pWprk8?gYKF*?t#>Z3Op_%5P@b2;I z6%7^jwOtQ2jPLqG-S{rvZojyqyBv>!?Ge|lV88`$_?Su*3_8q{dcVnse4E7js zOF)v39jdtaLHi)@`XW5N8+TOW8!M~!=^ir>_7B)my?`}>6?X08pXa@!+C0YFtb%q_ z_u>9>e!uE^{Pxd|YV+W3tUz+R8|)cPcf;K7_8yPx##di^KdamPa1e^=ZopOKTxj&` z-|1eFSv#~Ba{-v>u3eD1);tXYM%ehq>#6Eqj>m1|!eT}dem}#RL{0uoV$GuJ?y{na z4Quw<_ILAxXNaf*{^HqG-1CUp1Dilo{UtT~+dZ}k_%+Z0aH`pC$Ir6SUO zQIdI@^Qv1j&wuO4V+Ht4Zan{HAkUWP@>#j{MP*f8&sC3pA^(zP(8_NrzmI!z*gwkg zRO^GQJKyd&@De`I`gv@=KS5k|%#-LXxn`Ekzci1xm(}7?*63IGc_|DZrnr~zbb|>1O zBLi;IS-%#Sll^`myfPQ<_yR&>)g*iy3XU*8)*}sBfxF@Pb$(v|a8IiFi>W#)jmhct z*kfw!^-CLNJKp@6+jsN7s$x2A$)o;u59>lR*zgW8JpK7i3{7VL2H&CRZooH!cI96& z4#xAsPpJt2G>3iuZPyDJ7VL9xF{~L?-LvbS*}|X)?aHr_ z<=;N3zL3IOqKi>A4Y7?!#^L<43QF)(g!m5GF2qp@ 
zMDPw@9ykMW&)BIj5rZdg@CQ?%7I*R3pLCsmFTbtKIO-m53S#is0oVE9E%b2bH&m#q zyBc2>V$i$l+o2kxXhBZLc|F2uR(af)KsI5H{;J`xpc~7i3i~wmq)wL=3qY zw?pYtd)?HSYGQmkx{sT$VitDJM(3*PKE5c;Usq!sm%#^{a3)-qTve4B1HN(?zO%42 zRn@iAKE9)>>!YQ~o=@;F4w!X@8%|6QT7MFBmL>7E=Is^c$tUHV|1u_DN0Y1Y<2f0# zanqFVm-tEt9!WyG`3*7i+h=0CeGUn>q}}5|UqqCx+Xjh?WdHTt8B-C2!3 zV>XP&H*1{6tBpoly~8>hU&)Rfy@ysCk8>YB6BjljI@8duJ~H1+F)htAJ=4;cqPyb_ z-Ijg`?`=P%tE#)6Ga-kT#mF@?e@izVISm&k@RdMbf$kBBh;EGRoT{rgwDL(WlM1>o zK%~E%PZw9L`!~FH6((T3LwxZYvL@QQ{*505V_;V_u$+ecEtjyAsoBfea6yi%*&*0| zN6m2WGK)0DRdt`)2j5L%JL0i6D6SR7@c}pa>rcRDys6TL)`ISK`+RSn?y+~6VHyF@ zu+-iM%Wj3+Ijg$<<{uYhyy{0H+^Wm39DlThMkBjPOn(Z`F`iAGF4@r!g>>z7zw;?& z%1<_m!gCDgWkmzi{CNdGZlbj(m_Q~J!OB?^z zOQsW`%65u!7GjGDPGcwFWg}A~8>D7_GS6XIe_pOvzO|aak;Bi?^$iAJ^?FQ!{a}Me z-1cn;5#!0p-ZFf!baax-@R&Dh~XBfY27vk;ti=HJGyx&BXK1GZ4V9j?3y znD6p$Kc6P-$}}DePv<~wX8(Nm7Mz9>Z5)-QIu`KR;78%K%#5Ww(k*yvo@ZqpvgOc` zpC97XnFmVtFbBUQi?6rcj-RKxeTTm${{{orm~_idUygII^)5V-QE1f1R8FEwJOUQu zGnzA6Ls6%j`VNATcG1_l4a*d+@Cq;}lodg@3H3s_9b=Mx@=|wu0cL9?vEjPj!w(o5 z8?jX6jgRI0l~gVneG73pil6z#x6bzZkY}qnfx_Rwl3mps*5dN)PCNt$%ThDt7x9a> z(Hb|u^Hto=i5UYEsr_=^U+@`=lZt!j6r8@)6cw38Wd2=`@%!5KFg`ek8K*@{OXCQ8 z9ot;(wfpt9bhxJ2{bou(qY{PKemfucDceP26>faa#c#%-W4a|?=R-6KE1iM-s{i`& zNJ)PK*%xjsi)v5@T0Q!m`&ohfsxKM)w_)^uo{wtpz$A$H`Q`%^=2^X+Pno&Q4C(CS zt_EiI;F!&2vU`AZ9orZu#=Q1xT0<&g=l3^oN!0oVGdsR#kB^!7Hoqop>vJIMgDPaib93a_v}WhU(XD%tgkz{(is zcBtcLEsksZ*pBAASZ&v5*4|LfrGd`cW*EYg74d^M zTqH!x1Du(1L*C5YT~8W6xrRci194*rs)8j%kLjD0aE&n?BlCnVeAD5t#@`rj+;SZ{ zYrEb;l2KXmXxC%uKeLw|ZWo=;;+7K3@94>$TRY04)uWN8YQMeT|O%g%dnGKrt7 zW%OEFww`|8iJiD(UTPkTGS%!ODkyooafu1~+Y zHJzWCAI%8Xxsii#SFgoWdM&s=Kef}j{h^hB_$%O<@x#O))@^vSmbs5C)8|chV?EgW z{k-3E3QAYwVxl|mvXRt^J(uTujvbkq&M5L-H>T(s>@M4R*sc?zUy=5$n7$v6MnXIU z)Bk3kl*AnvG=|xUDaUx2NcVrom%MnG^@7jKPHvG23gF9!^ndxux$o0 z^L!v*6BGJ~dqjW5-89iBMCanw{#+b4pnM5G1cTuQ^)`igq3%`t@jk4se+2>j)F zd%@5AMtXF@jtQ|aI%#)_eBe*)Y(73 zU-C-6Ik~z3H)ffWt6Oj)YeIsj^X~i|x^U{@&ZeS4t;fk!cLRU1yWXB)`%_g@akI&D z3RW9mPazi73;tgif 
zFe}0>qwWigC47nw7cU@HDfZLsq+a&A3Dr=s}8GzEnlG#m|#sKM-Edn#Dw1% z6V~$=x-wq0_uQj`#_se>NOS$Qrt4$wlh`YWp;WC4Z?$cY=zNXilZmIU@A>t z|29>X+ckyTxWj0`72ba}nRCu|3k&&`nsl8RS*qAmP3PQ-sKAuH3){N6nH{Y3QVbie zK4x>`BDj;A6P3{vu@4qYxUKy>+t1unQQe*jcNtCn&T6zFW1s8a`3dS~PbasfKXmov zD&=k{;nEfRiEiR}hD+CLvHN6C@Z8BKw%+CBldXz5BewLjw{}ct_*67x7A`OYdkJO@ zYnOf}7~NP_qM*+2!zo!#(BT@t;#z;X_nTioag4xOd%EYD*Y?bKYnSK`re8CzEcbhE zW^C!5?{KKMx-&f`^Lun2RRLG#Dtt#P#2=xQ!gI6c;9 z#_A0X1$LD*2otfnL5mwV9ca;L2e5sB**k#+Blwu?Qg=_St?l%#ov_kq$I_zKv?hWW zP2_&QqjLcIdelDYN32ur{s!!3Y;mFAS;(uUy*Fgix!``j=SYUfb6jd>NAwr>`7!Jg znfc4dagzb%37mJN8&FZ79~WB8wy&`llNx&egyzTC`*As;-1?RCTJL8vak?uv^V^5f z^Y1=$a_e5@<~CfI=Wm?(m9(GNFk*9Jg8}wwILjch7`@~X+gvD9Hlz1+Zj0|!V@h*# zo7=iB*~+UTc(m>DMVMH+9(?ma@1IM|4|d{g@qZ9KZZt>4WJdG9n9&@M{~efFXm@ec zHsjGK`wqkVeaLM>JG0csFJXUJhndC<^0Mo<`_X+lmV~`WcNl5hgLx-6|I0HV6AkZd zO8AhMfjE{9v<=Oxd47!dt&B0g$0JBM%)jKHKS|@OHq_zENo#tXDwvLd6{3m#c|A`R zN>R3+DxkJkb4ufoj$7j}uVJy+V>hucOpnHw4Y=IEeIK6>#h5iJSCZ2Zp7Xc>^Rq|K z(dEr6-Mj!Dwk_6SXzqq(~$%Fl(dfSP~>>FPc`jV~aD4NAqaXLOUDryV`F9RX@qBmSZuB1KD;wG~(~d9RT`^jk zn1L0-0uwdaXPrNoB~W<}Kj54t(qlu+XmCpFvTOPWG}-6fyE^%)&}BoXE1hBEcQ?}@ zyJY$ubGBjX%|41V6FeH=8*mr5E4ZC^yl&^EqwPFgYB1Y*=*-yBd(1s0Jv|+_z-87& zWre8`K5cV)Ix98^@a8 z`36fcZlCT>!*AKl8@oP=T^cYB;xD(_alz@Uu)o>s&!5aEv&6B%{u*i*jU)cT2cC>? 
z+Qyz;uqsOnB-G8NWmtGGuh*NHH_D=jwbw`QHpv{eY*}rnd365K`^xoYxEVM~$2=Fi zK54a-;@iz;J?tCeIJqQ~I#@m*`L;k&U6DG`@bC(umSx=9`z>4t) zXBS)r^;iO>6IlA0W_Q3!NVH*CO)6YPxMRdpExjaP~wY;LlTE34o@7BI5KfmVsPS^ z#Bqre5+^1~62lWC6QdHN6Q?9jO`LA3fpeNfF8*+~W_~OQ%i#pJ&!H8^xG`f+E*vs$ za>Fr&r9(#!EiN2ZJglU6WXXuaA=6tH6pnA|u>6FQ;$t9=O?`9ZoHsp@v$i1Tkp1?_ z-H0{D5YT}xt%9!=c?FjyC*%#dBvqF;q%yA;*`c&r)`PyC;dBvCRU6EIM$v)%q)+LiG_RT8=s-UjI%De%{Qkj=q(RW$i zWy#9CjmgP**CgxnHYKM(IQ3r2(Mr$?(CWO+$w_&4B z8JVj4rj{g;R*NhZ$TALDs+^s5w4?!vlU?Fimsp96GniQJ5+}OqS0HC5@s(KOSXbl( z5|D@JLQ&-V-?!@LB;f9k2EHT;o2czMI6mnIO%MB+t(pBa{24 z`cc_NUOD7uBH@O|zybD-M7Fdk}=Xn=QbN*o>3!I3=xlB9*IcK@V3YRmJ*nmXV zgKlTyIOL?;S&!7vRG?5(%cV#R7TScwx`5tfBNE3U(Yu}IvQX9~)wVZM>Ak6VWmwI` zYb`!kITI@&%|@aTY@`~bv6(4*CSIqA=JAl74A~jfT*u~)HyV9tsxu#n@kV2!@nMC_ z=`E)BX>k+vuGtHz^n_2Pg_TG(;W-0|(?f|>uFz?Ct%NRiCED-9%12He(k?_At!G^+ zI~lK3>}6@w1e7`zY00|XC`vBeP0my|@M45W=mKFna5d>lZSZywSnNgDGpDlkgtr1na>RE*&!CsQZ5oNKdaWlm~sa%q~6 zX|yKQk<99oF@2Y~(lF0hpI~mV)+NVyQtb(4jGrLpVSVytMv8&<*BGD*NpJsd;eB~mYYKTL;cwrZLH z8B8gLWMt~Ko@7kL3c{F?s%JQ{6uJist(;^UZ)n&Wyza4k+97Pp%#`qDPcjZ;Sof4! 
z1;Tom;4%(t^fDP2W@LOQ`N~HT`E$)g(-~6iTr2U)&u6*Mt$1ZI>?R@qC7Y2r1(_M= z6y9A}WMayq*!QQHkCL6DPjPSfSfS^bM4}EUtw{nRfOf z=Q_0}=xJsffT;)j89^_t%#ty!hXh9*f*ukyHKT(P1Q`h*lPtD}VT_FV@G#CdY?fX+ zunuvC_z=A?u7uM_Fudq%cB}uYfxNFnt!ZM(haML2@Q0m>s$DEG@)>9Kc%`qBtMP_W zPC(bhHIJ8(GkN{KkTTBS0^QZnO(#r1c1+QA|NBI*bvfP`t9<;W_TnH$yZmUF`dJ?* zlo_ZpQR^#?MZL`w#qo7w?_}8P!wX9t2`3g5MQuC}XDlO;$>YN(`DY~6KrPO-jLSLS zjD^dmQsHJKa&q#0n~7`>ocrY@IV6P=QwL>V(8JjuBO6yJ9A@b;hADevt^53?&_ZfH z1X!M5Z_t==-k7(MXqs26J`K`b!I*}*3vcMZ1_yk&__}dWW8FB>axTEBFw{p*VH%yn z!H^~Sd|Z#woBxeGPV#N=)k*>5eO+pHlbG6{VdcwW9Q!t*ZHtgr6|*hDP0E4zE(4FF z>`S5#ZptWj?1nP|&6=~8YtE@W@15B5ymw;Wx>V{P*t)=3@x~NRw^J1$$)$Kx2Y}Hc zg+*oxdkrb4EMk!uoi)#H_= zrM{_pu(5z0nawHee^@;yV;dti4)U|{T1ol7B-32Eh&a&A&~!%%TP-Q9*)D?uYZMcw z60R_LBZ%DzlLHG>(%2f|g^Yiu0kTad;g!C?7;qUpXAMJ8rse{&!IU*q za51)%Y}UTqk}+?m!hSa5A2u zqggJa&$QVBwz&^5^hsD`&BSh9D%pWV`iL&TCSoeN!~vGd{K*^f2E!~IV;hMfQ-#v# z_O&jJ4x?&zC|ZIYwp6=IV>r+O>?BNMK6YH{WEe3>sne$=Z$ToX&byTvy<6GdL6KcQ@~!>zSScE(a2DZREhw-)5LoQNU)@ z=#CWAuwNR*LuSBXvunw|LOS4Z&ot9AK4+?!z6iF-ACTyS*Mw7*NVB8%=rrdekA5n4KGt6)leBW6{|`;IN8lqRb@D2 zQ4igrdRS{L7&GqYsK|v9mq)%yOk>^($aZ3I`n8VlQ>$GX=Q!V|e6({B!F-%He3<&Q zWE&D$Q$K_GxuF(TunRGxpBsD{$3jl~Ok-zc2lt)PEN%{AFfx^O<3f?sIa`oDhA~IZ zrIG(^77ZS-$>{*?pmYHHzdD7Eq9PqE$ruX0aE1$IIHoYpO~Bjf+fHj7JdJl(xLn@l z>;+uPP?IvVm}PtEK*(Ni=6tSD{S5EdQS`M*6dQz<c;K+30cK(3UvEWzC)$aUs+U9DHRB61lk0 zT*|MA;CQGA|Dx-EHJR&wScP7?ca^=~i=C_MbKFT=2hJF|{mj#6Zk}Vkj|&rb@OtP9*peUld<7uKxU>)DL6_Dwk#INjVP=Sd>3 zAW)0h_Bk(51kaS2hD)|My|#+l@CsjpMdpl!BFeVMWcfLpEYGQVz1P&1=bp5PmveX# zlNPN;8m;%%vd%sY7mTe%lf(jhDb-e!&Z^ClmOLLBnaJHmZck8ah4Vrf$#)8NdE z%S`u*-h~kBLhn^MbE(Lfdo>cVW<=%ec7(5s37IuW#D#5>Gp>HBGXWRmtp>jlOILY2 zSz%Kd#w%NiHOG*S%MVejO?Ljxv>N)cF<4#5YqH!vOMc5G&cQ3AHMt0HFgIq5WN`R4 zPF{sXI!IM{<9G#^#!YoH8YFR;9TU=nXcA{)iKl$S-B=r4R4N6>8CkzmX>j%akb*v>vIGco%^B*q;< zpL&%nmo8NP5ycq6W?~#kN3*hO@u16pjw;)OAph+LvBpoo67GcQ3TiOSb_s z0I`^Z4%?YBb-eoy>qMworh03 zxGT*cC;2ycO_@8-1h}`U1TM$w_TcS8{z#d-*8C^?dLdqNa6gXCc^PEhq;Lme{P>Omz4d(Gaqe^LaijTgaH3El-rwwaag5(#KKG8RpBw(U 
zgugG)XS>&N|JlEqFEM3l$A^qSK_Xs*M;#yG6}Z>mIX*GQ`P`TUw3M6I=-&(G?ZJtu z5i;*yHvbC}3uF8>^SO7NyybYj1>SdjWK7S$9j}e?PaSWHaehqU9?|&GCnu38$TDmU z3-CGM{gdWSQKQP-uZeeGr+i#FKHn7JKM(L7A}V8Q@1@h|gxvWbppUXuLJ|VWPb*3dsKtd6xFQJs|&7 zfWIH$N8|fD+3cJK-apB^lkH*d)xz5^2jqVk;7^UjG8^QZ0{axp7{{YUqDj=VTz9_J~siS3C$C@SWEsgG;e|$~A-EBX#Inn!@ z0ZeXL+tJq4+|jtasr~E`jSJgamz%exLsvGnbqp;|G&U|=g?siJhmIItQaVgO4jVa2 zKMo&0T0f3BX=Gw(&6r~TE8)Lk{5PEcM)2QB{AWZ)@m(qZjpn}+o5kcQF_}tCt`d{2 z#AGQkxl2sWlG04g)|Q@x??<$Ew5@9HSkk(JUzliYZe6}|RYyzX@V1skjZ2m<-!ti9 zkj|)$JnY+DVtf_9y=vuL4#gdN9r`0WO^IO}O zbex+Zii~?A>l`RWQ^%_^SGOEyPvY00=u(+jZ zK}%cXipFKFtt&I4@v5>h5to_t$PiJru(h?Lx2mk7J=@)M=8W!m*{fD8S-7NSfob=E zyEWvqeCw$9pdwZ~2Q0S^KsU@5O9&-K(Mtd2qM)&KRQ7He3`gH$8J$@@=A7%y>|G=0 zg3uSJe1^DlIimIq;w{$cvUA25=o@bGTzwP8{55GU?s2Yy~YLj zT><`ffcJA-l+>?&7#ZLTg{wUq0{kb!RexHz>N)U07=j~S-_Zenrf}7Nn7i~zJyrPg zk;&E)4KZ0iNPX z4vx6}CkOcI0KYB3pA@d`_(6al>n=dX?P(0~%LDwr0DnKg59i5S7C&4qd?M0)yuBj4 zO87XQ?BSsOD*qkfD!)%5rLDZK|Hl*>Z1gBUN4Wa)JHjg$8lP_o*LXPfm`K0MuN1EG z&kEOmbkMPp9+kgHxXM2uT;&Uni}a{HHNsVXvv8GvL%7P%K0eZ~{pC9W{;qIsuP>eu z=}}%8;EMwM8sTcsUBb1$zb#zl_a7SBqkgUo@Uw-h-@X>$zZ9i@Haj~Ds%0sg%JzbC+-4e(^KwI_)e4uRgzqaE)9ruNJeuJ!E<@XZ09q;qjlzqU(p zfWJyX9DLFJ)sLlKry|YUe=mh`uwF;t&+~@~aZt}V{CWOcLLBUVBki-b7H^?dd< zL#=);<-Gi?9GgJ>x(~pI?DB~7UkRUpbg$$tizzz_LcWT(nk2Kc-H*ZmFEvne2d z`@T`Rm8*Kj3RgXy!d1_$ z!d1`n!d1@~KOdE=dUU@+<=2b6@*9P#o@a%td@3(0SNUM!s%NEemH$qFZxyb3a`Us7 zJ0ig63RgYrg{wWc1^DlTtDc?0RnO4{+3lPlT=lFKu6n*7;138_JxBG6%AJU}KK`c& z*Ku-#a4q+q0Dn=qmV2V?^JuxVgloB7!d1_00sgG;$x^SB?Ek3!#|c;civxT^fa|`K zmirfx*K!L3`#;qIzBItE6t3moFI>z0RJhjn6xl~oJ}{kRnODHRgeFQ1m}I_LjwC|^}hzz=g@RiK@!XQFV`vpT@96|Q=o5w89H@W4Laya3-A;Liv6rvX0r5L6Tg z>#OlxCtT}$zHqJA0|EYefENrj#r7(<*p9!?+I7^zY(tb|07)WpAgtzoiAMVUoKqrZxgQiUkmX3qR0=b{}kb> ze}!<>|J?xpy>QjPOStMEDf`1J-y~e+HwssK?hEiY1H2%xKRjBv+S4Lj?fJHFm48sU z%D*pM-B|WqjJaNtsmdE3s?ER$3^n0e^h|46<#X(?-Z`~zZl@3 z3fFQ^K0dNX^(+kV%Z00+dxfi>*8===Cq(6{o{_>;Pg8(*3RgY13RgYP1o*#%tDZqa zBRf^kU12UkTT8-w5#iD1@UA{ztDcvHtDf9p#++XE3=yt+ 
z>V>PGbpd{ZaMkm;aMkm1fFCwIvQzcU6s~$M3GkbQtDfhDYy0jsA}Uw)j~1?angjf@ z0RM?_E%!y?T5j&h$Q~^hU#Bw%<4X0+7p{7)60Uk46|V9h1o#(Eipo_zV*`A(aMl04 z0N*NH^}jD%>wDO!$R5={UAXEwU%2YINx15HPPoeV;bev*-mgaq*K+3xS3O@7uI2tp zxXQmFT;&fQ9oeJh;!Bj~h{s8*aMg3QaMkm$aJA<>;i~7DlcRD~PrY!JzaYS`7p{69 z6R!4rBwY0$&&)XJ2h}qzz|Rw|`fn7j`kxohw|-vOhm#qOxc`R>S3S+bbsqeYaMk~= zaGg&NIW;O*^~@Ho<*pC#9}CxVpBAqA4?Qg^SLJ65SNU6ntDhecuIti|g-=0I-p}Jt zkIGd)tP!r|UN4+erPuQ};c9<=d8A)?sc^M_fpGQnWx`d@tpWa2fPXAp%N;i~5r;cEXA!qxr{0{r00?DmWiuH~L5T=jo1z#kE=dj2U~?H@2EyFI4{`1}Cx z7Ov$!AYAQ#J-`d9B70O%nQ*m#o^aKFd4T^c!2cv%%gwEh?9p=m z{;+V>|ABDTbHv!}c1{qkdR7TnJ>L)ThlH!1cZI8-f#b5iLs!)ss8HnA6LixaxUS zxa#>Jzz1;hz(K#Mo+{z0XJvq2DO~mZPPppX8Q_OaGUoKQC%~5k_&0=Wf8Qcp?RiPK z+TXV(yFJALK3lk!yI#1Kd#i9Q_lW@iNVw`Rs*UVa{k6hXe_McGCtUUK5U%<^60Z7> zWM&-kxEdee=LpwwuNSW6J}g|zeIvm8aq;BrQT=7YRe!T^)qh!l|3tXze^I#V?>9BF zNA(XD&Y{Jx)9QtDYW4he;hf?;zjIooU-_u%+4&OTYR}~X{<8poA;1UKMdfnob za9x*P<#-aZgGBzPBCr1WU4XwXyj=9O)Z+sV#wSmCy**b2_!i-6=VQWoD(3a%G~fdc z>R0(g1H4VRw%0F(tDdKYm!k|{Zr>UBfP>{~JB|$SGXne);Tq4k3ok|)zT9VpYq>iE z{K%QK(%Px|Ckt2o%Y|#ZcLn$j!ZqIR3-CV*XP5Q$I%bwJ)A*B9mgh5t>v`6C;p&Gw zg=_o1C!Af`>o1xem8MT=iTlT=i@T z@V^H5C&INI_n#BlseGhx?JqL}{7T_!|8In=pWhW;D*nvLiR?K^c$17jJVo>E`!^Xk zxV7N<0ai_~apMTbIUepM@|OxfQuq%X?}Jws<@NvAaq1}&{xiqp^7o4TQ6m3PK>jx( zf3(Q|J|O?Ha1LqS&Ub}#2=V+W8jgc@vden@VSw+SqI7ThQv>{U$Ej{G{=EKogbxuu zoQB}w%P9PL`D)=BpN+!V^}W1)UqJQj!^XxD*S|8r`5Zwn&r>(A|G|ACUM0LBH{#k~ zh6(2ul-JWF{1oB$2G60Y(U!c|_s7q9Y- z0r_^}Du01+)xYd;Tq4{eN)(NM8Lv zLAaLN8Q=%(AL-F@n}ut+>xFB%4++B_CfL5US-0yz0Mb|dVVZi_1q_1 z^}Nl&2nWm6`nG<-;9mSs!nJ*y4~pb99<~Wr`F#gO^4c!P3D<$czZlg^K?i5o(=J^7*>y-HUy8SWocV=tjoash zYusKqFw%3T=y_1Mw%5^zM)InEtZ>zTh43=bf4gwi^Q3UqlR7LaSM8Y?;MWP)IDbmG zmiwXb)5OlB4v)$$7JjvGt=G?mtNuR-SN(exM*6j0rNXt`8-$;QtiIiM3fFSWj(`x3 zcz?fBxSkKbE?o5&4~q0?|5_U0UBV|JtGDwW;i`Z5k&%9Fud9V?y&e~?_8(pp=~4Z9 z|EKoTF+3T-5%l`6cGY6W}9`iR{#J%LDw30Dn!m)~k~XWE}Lf@`r_MJii>^ z9}1rZUA`UlzDm_I37;qOw*~nB5w7FO zC&JYadBst=XNvw4gsc4c0G}^h<88HY)pMnAjkm{zYx}+%;6)r{aj?Ir{!;^dns9C3 
zmH_`|fNu`)ZNk-_cLIFxVaEJk_8%-2*e{8YU4ar>-rjn82` zImZznms*6+L%P?a_W>%uOXTSiFaM12(}W+*lYAU3SLO9SK$X8j0# zGWc+?TwUi37Ows4bm7|W^}@A(Z4}PX_I~&r7sohQuG)W$aJ7GuaJBz@;cEYF!nNE} zN1I|zxm9@U?O7vS%e_{(mV5BYR!IVTnpxTHV`7`j=+fyc7AO)&IC~ZO2{0 zRet!S$WE1S7OwJF3s?Ca!d3nw;VOShO;oP>S?}{venxF1&o1cu*Zl#Wnw(w!y8)gz zCA<900KZMRj-O8m*YUHGmBGRIX~Lhk|2x8In&*EH@Z+aJ2nY43{8_?P|4)Uh{3pWI z&l9H`^}XW!(g1&2xa#j;7wJ(x%yF(O_QId9?_`lL5w7>QsGVB^dR_?VDXNd`Q9W$| z{)}*qx0Z%TkMiV^GFI@ZYRN*SWF2HXVu6lkeT=l#zT;-3Q8QH0RSP|gYJ05SZ zdqiH_>pkJBzi^fedTG411o+JXo(}M%I7#D(_q!Rw z)t;XSS9@L;J_`DL{2x5Ws5km`oSZ0JyKUCx|4#;m5&LPC>zg{@ItmnTJ&MxKo9|Qb90e(=9O`!d% zXLNvH?Ku1CVEp-V9};5H;i2R$v*=4<+&r>lD+M{}g2l&hY|F&>0;l2K@6t!FZ zT+({3zw=Thd<+f2!51$5y!=YxrNVPLY2jcpL*C0z6@H5F>x8pwdinc=s~?^buKs*o zxUL(g?hU0lXiqc#y#7mtpC$Z#;Z?$??Sl_EsE12Muje-5D!+FwrF+XS7OwIy2v<8# z`&^_)`Ax!A&lcg@UVjv>{_pDs8rDno3=*z-P8Y6vZWXS2YWha@D}P%!hf?p)gZ7K$ zRsT}qDu1nTmH&@$m7n(cNWb#03fH*$et_R2oJ)9b|H8aTzt;D}{Omj}T{$@tj|x{mzaHQp3)k^K@4(0&)iYSQwtJ0m zZTA%c-Ys19|1iMyxp^8V+eKda;4eh>Yy4CTpC|r&Lb%G$JSe-K%Y>_*+k|Ue>3uI+ z-zP*~=c%CsqH;AJzAs$k^Vly&@;%~HxW?z#g=@cfG{E~E9O+kk76tfq!ZmI`60UYm zIV94rcK%qn`tAMze@3{*^P9p|PoIHNxpVN=kDo=tM+u)8;ERN-{&fNVb>SMfKM3%V zhemd4d@d0_Pwf1LaE-$s30FIRA^c3y^PX^RuiV2Td$hfd5q^s3X%#+K_-}=){(}yW z^v@Id$--5>MY!72DO~NjTe#|ZOt|WKOStOUyD+lzWU+IIaFstz_^BfQ72)ciCj)%I z5y*_A5B^ksws39Vl>z>eaMg3fAfvvQ{!@jkf6fuE^}Rv3`gx0R)$^io)uYdsQ~4v0 zjO;uMZ+-v0P`K9XyTWx``n7Pi^RL3S+&)E7xoT&Xa4mPfaMiO(xXK?U=Ml4|UWK+; zbH40E45<;G>)!LFENMUR#VvIW*y_$N$lVgs@Itp6+i52UKjg!9r!W&#Lxvz9ZiX$^V{1KL)%)GH4P;l>VHr+`hnct{W!7z z@%o`{t<;t!xBiN4;FsKKV98MY4*l+|*aJOKuqRgZ{&8(P^ zJNNIV#(VY~(R<|jJyW$CTeR&$zht~emh6e5j2(@Q3slKCGm|K>pXn|@%+FLpUuKDU12<^R={ACJG0uKaT{CEGVv zYZQOOlG)qu4wwH1o;1i2xBmp>&(?lVIQ!QgVKdWyt}o*DpX17p+h6B;%K1_@-^T6# zRlxoyoa94AJr%_5|3$$5@ovL?qgyP|eui}1{*PSwcFkmch literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_mateltwise_sse_avx_avx512.o b/third_party/libxsmm/obj/intel64/generator_mateltwise_sse_avx_avx512.o new file mode 100644 index 
0000000000000000000000000000000000000000..b1605f6c7176cf4d90f8fed41bbfe818ee5e015d GIT binary patch literal 16624 zcmcIq4Rlmhet+`;3?$AAtU!dgj2d@?)i^;=RL*V&CdmXxpnORW>M$e|2qc;8WF~;0 z!)ZW* zpwDHiFSMai`!uwELqz-6nwQo^v@gg|b+G=@!foZ@zAc}IvR``DN000sJ>yZ7UA|?0 zwdZvhto7Z4o)71<+}C=0_Z>vOD~|5?Hs2AtJ#@!WorI_k8@(-Lf(NM*(cXL3i!XCQ zAyawqfEC<(cRADhcOiAK?$Txc^d3^s{c~_0hCSSQsFcbT8VKF3Xz?2r7)5+40Npq~ zI#~dUI@n#IcPe+Lq&EAtOYIGfjg6T-ga!9jB!L7&1t%ZllY56zO^9ng~Tz)9CNq5!J7Ii@nnV~TAnDNZ= zh;~-5P@jwn_1376A4EmSP46#s1pyu;_ZCevxZo@siOqMg?(^|))PB7|o&kkaP&>sm zN3}EX8Y_kh>B;~ZtepbHroQAzXhI6DZ`6K=PX9(b7S@ieD`cuLrxD7e-PC&+x|12!9`D*2i>FfYJwiUlwW>uuI~3B+ zsSCZ8>cVhk8Af>Pu(uKuVk|f6>fSgUuJq~^ce)W3pI-5#m!VRsSG?(KQ7O|aQ_`2C zLURlPrlx13Qm$8~rG2PWP$i;$bdb#8`wg=i&Z*M(5n*3weOQBW7+vAUvfPO;(%WN z_vhH~U_hBX#Qg^#Tu6@`KN-=E=w_`7xxLTQSZSwp zYqa+okDrQaugj*Wy?=>)g2{k@X#*o3Fc|xR!1oP|oCcXvXejS{VCI)&V|y>0fZNo4 zFOz?#4%d6jP*4Y#mup9I@2i;`hE95i>&tuqhwFX%r@U-VK9JckTz|1%FV#O41fP!c zb@Z#;2PIiJxaSub_w1-!&Ht@iQFLi2khB2%lqcuxJE8VG1u}JkLR!DbUcjV!7ywkz z&NEzJN>&ZmLxneh{gR!GFZA|+Wkc?+sfRd=2N=;+(mj5 za}n%kN4#$Tp2lZQ`!~Y=Rm}ckeKZtmzRIdk_hb3I@56QZW3bX361}N{n#B2{>)Rc9 zj`YP=(Qbbq^wl$cdOvh4eY&&J4{t!t@4$2!>B0}&AG2^_{#w0jx{lHSQ(QsAII_i{ z8+Bz?6bUuvjyI6`6LVmVx#JCte89M~kf$tjr8lb!YO$6sr7B-bH@muLL<{V+3>AE0 ztk_~ADvbLwSLT(|Pqg%73T2KK>F0w}0gq&!F!l3+A$a5sQ-Ah?C%rj)VVau13(H6s zgkHnLzFPLdw4HWB%}2k&;a-2S-c+hL!4=a^jBG$lOSM?v%RK3;mO z`W5b5`QO>mS?TGuJ)fyPPXeja-U;Q71wwH0u9iCOcqsRBZJ4l^eRYL3r5Kvrv9i!- zwe!ltYTY{_^`DN`se_x!ca-b^>)UI>+Pk5l5pQTM?n1BE6(V4&^@h~&@!Uu1NrJ8n~$3QgH4&bQ1<14(~q)w7&>%02(zQJ)4icXm8>{N?b!wI*46y`NTa^>h0z7t zOZLSG*WMXf!@kEyR^v;G%oN%f+Q6=_`ti2+>{%e7`wRla8G~z5=d&~d0tgpwRq$#^tKq^1Z(aV4%#X77R?JWDJS-DM_C8Ox zi~9y$3E>C0q5Hu|f$eho@%w|~wWPf#FDHE%ERxuI)IMRiFXJ*7E^ICj?LAG`>(JgW z-p6Ox_tn7-mu5#x)t+7$pB=qO?fDri;L`sL3b|9{hVHQ4ZGhbmLg**@EDWLxfrCLx zqjoaNOp@-GQM!Q{29CUoRo!n#clu3pe%ZDUn(cIJXEx-A?q949W-4&EpRe{Dr0$%d zidWr7*LOQPBRkO$>04ik9qXP-+-uIzyzzlm?^YC-2p?8S?Y#ZI%=-Oh>3MRO_Gi;c zrg?;J8T`J)_rqnC#e*n48O59#`53!~EKis97Mx;h>vL$(Nq&hE*~rp)=st&h68Qx3 
zTga~=zlc19d;s}r=Ph)JqA@o#hBxB7lD&ip0F*CXG6tTe1xaZ{kGVbz9(fu+?;s)K<=!9_K} zB{ho!RcjN?fk-^9UFR=<&;t>@-g0fW9GrS+6_nFVA@z{DDm~$g;AK-?y^F9ru?vvhre( z{f=mnR??4YLzrx+<2KysuewE2r&NKI-tLsHrXTPyB1@(!BV#^GF0KJ4G(YN}N`{e2^ zCGJO1UnQE!PU`!5F7p5m4N8P<=h%%5QJ6pP;h39rLqpE(&XCzqh_Mdd?k~ec@ZD0< ziRS&Bb}q?;oe#Ou!#)p)AM}QdzOVET&36Cu41W+1CXnHThLXF-gLNhDp7CfHotPkz zsb1w1a|4xVJqmZSgi+C>aO20}+(F}Ty1W)ATw@`fe)V*_DB$Z+*d0+qaQ#5drO#A` z(^?nFqtGlZPPM4Ils_`SVtU*=Y?4QzK-2i-QRogeK6%(ou)}AK2Z4Bu-Kn_Qqs%D= zc+I1fJK*yi@c9lndz%!stwznITxDRikBH<^7Kozpy%QyuVtTR8XrX>xCe_TMN4dhR zNF|TLVkr`$DD9}Z*ecRtDSQq2<~)&^JQ95|ZrWLloBb`u&3RIcvz<|fSlOen+n)`7 z?Kq48k8#SF&G)e4csY9+K7M@ZhN?g>?LF4yQL3$wQUEf`ZLT|U<{e_VxxRLC&i};M zPWGq+?!(q#M^DkSGMnvzp5k`+YT&cn=K2-(>}I&RPPd>=iJ~-ia*jISA3ES3*f`5= zuJ-{ZT~V$DZYO^=qc`_~m7M-A2l^)&z3Cw_liqZocVX+C;Xan4@Na60kyB=Fl9 zZtffGyah=w!_9qzovpxs$#8RjVCN?AkAU0t_YzEnrl~}AL)6o=* zRWDw2E0PcE(bPws^X= zI}uB^wBWfh5q+XxBuRgaR`^sOnCpvX!p(VS^Y#4z)lVlAm;do0rtPwioK zgQ=Y@Xd+3qc2a<+X#cRa$v*BQx{*p?(X$z*Psvz1g-M@q-fWVci4LqvnA|Q!mka}~ zgWS}Xq}Y=qy=k)E+pVpv(>zGUf=xV9B`h{MxWk%~ZC&s~bHa#iP0W_;gEXkE+eeKSLKTwd=j z94Ea2WFcR?1FiB)xkIfuz2hla^@b!|*1MA9R=q!Vkbk!>p7}xEXI_wSn)AZ`Zb^=e zi}g!-Y0e0G@voCs`)4`WKVPz+UbjNd{Sq$AKkI-~Jb;o_@4ra6?6+Z#(?`~OC6`a* zBKzlcPH)BK`4a|@u>U70QzH5XWP#IOM2T>kV*>9d5Q%VE&JzsP@s%hGdU?EVk?@n8 z-s*=75-$5;s^m{vn?in>gCCYiIIVF(zl-Bmy_$r}dd2)Cx5|2-kmSq$e^J7tlHNBZ zoc3^G&p8RdRl>!apZvT^!mpIZceR9Xl5pDdh5X$TUN7N4m+(K9aHsisnA4NrWdFR# zar(&q`HduB_RnbvUn}YLdQd^4{;oq7_WLD#y@Ur)rbOZpvN4j_hY3e0#7M+TiSX&H zr2nLOnQ(E3A?SGtFXd(WoR@HMHzO#>$C2dtISqYomvB|WJ0-kK!s%bcD3P3*tfc=u zE#YQ#P*FaX@QWpSF9|@R&umuGf5bc?e6EDIN%WUUcviyYX!KJOK3}3&s=M~Ir{i1k zO{etlc2PyfRs?;=svdxr`773*}6f%ZOxrD=tHq5&!ts`XtUI%`>}sWjeAr zy1^nSG;3?B5vZJSYECHP_vAH#E0a|{*(l0n_;AM)?Z zum9cN*?#$qB1_~mk?It`pu0gWvYbv{q1VJ;y6CP)bYv?ffw!S-=f5pPgtSxt7{6Bi z+t6aCU#Zkt`Z*`*r!lhX9{^^j|25#0zDfU2L1U-C-$p;_w(9>iFgyLVe4w}RFUVN3 z_WxC&cKQQX>hy2vABFlq1KFy7I^^5wAK>?h0nRTzR{dTu+vz{g_3!75q@V6nMLa!8x`PZudW(WNP-_sc)oR9R={m-hO-lul@`?-Fa-*U3*|B-|KlU#q0Gn0Pe 
zwd$w;v9i-Y#P!Q}XcA@B|2Gc$%V}dk66=ig&qB89|G+{2Ym?Of9}fD1Hu~uvY}H?m zd1lxD6I}mN@W{#9|L=g`ZvI8Me%b#Nv$E=6>!3eCLXqf=Cnu}^TOIW8=K3R?nfy;< zVb#Bn>!)@(b#VRi+lSi8SCqu}6$ksXoL|I`bguF9hvFD^{u{FKADwkp`+XQEyYUO~ zi7Wmu!AZXd{C4`!as5kqN69}FyR_RJPt#_LM6pykS?ABg4*G-iphuF|?`5c4^?&G~{~XsZ{%?Z`!;e+}yAJvzHu|ry z(SHR7&u;w1zv_wq198&-UGUqD{}!%aoI~V)I#R6tzn$yHG%iX3{-~G#Z;=QBMO2DH z>x}HT@@F|e#rWkEkSi$X9r%Yhf1QQi%Kx|ne}E6v;&0&pM+g2boS#o)LyA@ZF$eyE zN%FtX`DYm#^ulYL->Q$p#ymXB`GXezbDWT3T5_VXq9nfOIlqpYzu%REG$w5D3Fsp1 zQ+DGY;QZxSA951&PhdXq!vv*P0OCtQf27B_p)`M~kAh#UYdhV|eSnORIEawS;7KFT2H*K4NVM4h^6$Nw02{9N=;PQvd3`(NmH1>Hz;Tlk5fzRQR%Eeahj0|$@BwBe-I^&Zpux!i5=6sg#kc6aD z?oP5O5X_wauycr-X-mmWOG$@j$V?e~(!^tR64zk6*nlZ65IS)io2C;YCldn+U}S^) z``+%}eYbZf9H)I|_ul^A_wRk*d*AnV#lyjHtJC3-tmcruF6EyrRFa-ql^FY5GSYNUQlJ zY3|+_%3Sn}91CTR*518Y-K1_-?@=R1;>tWtlET>;L^DBiP%}-A(V3y_HH2hm5`Ad= zI(j&?H{y`;ypx0(OSpCZ0IdDz#OBUQ8Qp6nc-vah`cM0xWxgu$I{pJRa#B6xz?ASV8PNI%C6hLCFY zmUawGK~RoIFqjG;N5z=?&=NQxGwC3OmOu%{!$apNaE4S{$OaP8ZBiN<*>s=vB6q<^|5tiPg8Eb`76M(oBipSV=z$s=m!f(wzUQ^z|mDo<*@YUKcK znJH&D`;Wf;5U8hpH3+4T25$z_s9Vdfs(lHI^DB^~W-hx%E3*&O=@0tm0Q3_GZeXkG zupay%a0kcNUpnN5EcGB+s4A@=tC=ZxcpTd!sI>u zG(h+BABFXg;z~j+TKtNY9MaFL2R$H-E5pa6iTef9Cf{;%ZO4_VmvOxaa_qUVuPLs? 
zUx+KG#lFxmKb)@zXw#tQlro+`&MD$oJZ)qz-z<;LZJUmh{i${ zt3IrMV9GEi1ctb#Bn(TH1)8suGDE+o_qywM`FrQ!CfH4$e6JwrX{ zBbl+v|24xx)W!`v9`seyPFXmTM)ziEWKd+I zY_OWfW*YX@g9>w>1315Fz*D>Hva)L*S(iCF zmS4tLg_qs&pb==Ws!Ge-aTSp>BMfic2quZ#@+Insa3d7eK!(D;YAfhLW$NdUGki+I zed-O|;%|cFw6Su6i1RtfP5sn^yV1Y+7RE1rD59?A!#?@xXO}O>8jpk)nzC39d4yna6IRw0UhuEATG#NK_y{f-Wn-{f0bBJ_H4n z7S{_TXr$)A31KozU9eir(8_aST+Oa?XQm*yf&`=R3E3wgTg`Sz@&7g&v9^5>FlFe*Fw2!W$RF zNQ5js z!!Myqu*?zG-$iY}S!wDORNLaV)FeGMboF=Nt?@0eH>DZq^?hjT#mp20b@Z* z?`Cz4ra|rZ&*!e~%Z%i<`#ta@!}5%NGMCYRg~7ticSM=jj51I;`pR@k-9vmlALWwz z`(&<_)PF<|C6l`AbCWvOckU2tPwGY*qr!!nuMFN{Grl#T);_sITnyF^jO6o){W{ty*pcN#_ffaR>+QvJ7zGKQUx_LbGKaad~ z5mY1hW^Y}V@+f=W#lx~x_2~L=R%~jK^3=>S$LOqZb_oWt^yIrC)ZG1>4?TtVRpU+9 zR^PJ^c$O|x_WTuiwCsv%ZRra4_U|bN*3{~!GVdtq<{h6nMw)j_&D*|g`HdsDuMuZv zRb)g=LG7BD2}a}~=n}SfZ?yKX`96L4oICwM*m#9D&0H$X6q!`b&MGjIUaOgzc)t&4 z#fJlv^&!bIS{2TUldPJp_7yPybs@8H&VkjBtg);#3|G5gx^?&$R&TgxGKjTIj3u5 zSaNVA9SbaOiVVfl+Y`zANH7+d9@R@HRCpY4Cd&`xgiNO9GAF`nzJ=yrF2?z!wGl3&1c(Fu{gD zyGwG6)i~x>S9h+alNexl@yCA9n?1@3^}~u77NXD*Ja=sB%-LQjn|N zwaJ~%$K>viyk(W#+aeFQ$YU+?4#%BxXGreukhipOv>n8I<)*e`t&((@bdo-_A5t)c z-Vo`9+2$NI(yP%N+vWWZ*If?zprb`TjO#eA*B#wJoy{f~1Q-PPFu*ooK@hg$SH<%> z{I=sc44O93tOM3t(obrY^H~sgfVcz1A(MDhDW5(AvyYl{c%BxOm_sl(hakmC%t6G7 zd>1UKw5`hdQpthURh_`^ko{U=H1DX=@al88_B&Fx=P}T901JYq z6~8K;olm3F0iXj+ZD862u&b0O@l6x<)sTH?e|u%VY%A%qV3qS|$$>kr$d?b4ccnx} z{vVh&zH083$DLTSXzOwnUis-F3IN-S+aP@B{F9zoU&tK z`!XIKxhyA1OWCkXWKjl~U61h^zR%M>arML6PrGZAN zA-rNS-TZXBm2LsLiGd~5yPa-L(p8@+Eyl5?+ai5KsdRFrC*7AAij2m3B0Y)0VH`sv zwoEZ7a53d#euz?K=Q|6Vlrt zTuTCrKc8!4XqcSJjJsl1p-}t@PK5gkG#0KG!$V{&e-pD@laE_|o`w9kE+!8*P4D3= z74!R;QI6KWcxiDD#s-mkL`#^zRC#W~bqVjS)c(T6{}jJohEqMFM)-VO7X9}LL_>NN zb;-j2RiJeClB6%bScd)@=6}AfzRmCj;I;hlV}^72#~IG|-Om`# z>FK;j&5eu8pUZH*UsQ&3Ibnu#y-|j9`dtj?dY@)E*W1GEzZte#e%Q`%E@ux-3XR0^ z^=9Q&!ufjpE)k;<&ezoufg1Q1(6{uy#&EuWdA{Q7a1pbU@6&dMbN_tP#K|7MPak9S ze4p-T^mCb=uau#`o&=*&-1)u^nK=D$zun1j?zasL=lyRAIsS-=)7tUkvixlAcf#j0{5$AVBXbp87JZe2P-C6YHwX)_H*vfCuQGf-qYs(5 
zU49Rv_cHoXhI2XAIm9mK1(Tlq)4=3d=Rm^0$nXt}p5srJ!M)(6MtWz6o^gB3>=Rze zaJMOk@RJj0y7|!!R{_P$)$K2OMU(M(bGW;5bOAV>}2h-85_)RB`-(IVij*X?I zhHcSwRBG6oN=XgL*g&*_(1z(=D5m{0&K1fid?}gkCAcA(APc9ps8HH;FOlShKm$HA zT-j8wT<{+v(f(?5RmM&GV)q9~yh`JW*^Dr{psiYAlhAB?wjRxGZ9R${N|?DF zb9i&ESJ<)hQ%#_+!B@*40Pl46zg(!k=o>B770QjxHoDKCSS2X`-vuIi0k<01-K^gtWeFnz`5{^dmr#fZVPv^2y`nQgP!}J&NX)Z>&oc2{v>667dNC8>!{ON1EUH_w|e*6@) zv*tu;fAZ$#59t408Gf(%X@9AW-p+r(vEi2gt>2?2zu!i8*yN|P7;mOK zM9c|4Y4UR#>i@>%x8oMSh5vV%`M+oKcT#Xzv;1#C{{{Y1@$=BZ7VSCXN337Uc^1Ew z|HvMQC~D2-pVW?<`l-&)d!?P9wwXQmlQ0i4pha!=+viWyEXBXywEx#OKIhX6>9Q0I0?dRo6?-7wImR4RZ?}8$5juJqD@Fw0xr@0{x_ zb=)QHVs}x4_l~=8(SmIEOI7*HQ!V*)YH1o5O?}OWy>zOy`>U9x?d>D-fJXic%zKX{dmhEPKDqm~tatCUzKUe`b1|~`N3YjdXeRn)aPZBBk+YD~T(m3p2cXS?sH zPEYOths#$dPwpY)fXxRLDSIhgzbw_t%4l)>roY?W|4zs8Zr_5#-A|)Ro>baLeYmgx7i;4| zTN~#7Hr!`=|0CJ+16;U$s;C2~lm{BTcN_XP#?s!MtdK_UH`%^X6Vl$jS#M#gGB@_& z^w>wT-tW4fD@peJKaefkeI(Yg27lun6SJtss!a0c_>8`a52n3=Hq`&SIq&s0C(~Ct z#?;w3RbRRM%2XXI4dkOzZ4Od=LZoQuz5_LlRG(J$tkW3s(L<1bCPbc_yfRgp?AZ%J z-Q>-sgHu98B^rz=$V5wx1i2KXn9-N}1Wt-_tdcQAJb)sCxFZ$U#5gJ9@!%7Z2gw0~ zcwwqU6PGCcpuD9jO}Mcef3qaF7NIJ?*TUZ-?|GMytfy@CJrue`<7O+2S|GrsWh5>8JoVhELD*_ zx!!4YN|D}syl?g)?}av)_){mlw;X|FIq$hPXZMDauIfi4ZgWIKeXH{x+HDyOxWete zyG*qcsO6+g-`vA~;`w9;hRK^7oq1^L)oJeyP{({m?&K#s_mfd6l=N#GB3CLPf71_Dk|f_5T9co>6b3k*BJL6*Y%@n#Ct&{QWYaA#O;2! zwDTJ1u>ut`I-Oses_MSG)Wbch)%Bi8qt~zSUT_Z#xZO|3(j^O1D9%B}cV^$jf1J@b z6=saOcDtWVqK;H;qtk^&BVmmYXHFcCRXR#?(8!_bCi-;UZ>KvD(rnq zY;$=o|4AZlz9L0(St&t%+Uhamne99wSqn5?xg(bBosSMW4TV>_-EWRcUOgSR?g0ph z4iJq2O@S3wmG$01<^3V!{U+Hnr^LZPG^#S~9msfhqxMF1KZPFgSCFRrjaWx#_Z#t! 
z$+XO>bUv4=)J1**8dBAu+5~DW_6F3E>AM1rgRRC_0Mh9sc%GE;5K2aE z7o=19fVRz7?Dm=HK+B(oaV#ln3}Owg-nw-FLLIiFB#`O5tJx-^CUy( zJrU^I?|d9~KI6d7Pl23{VG8dFX40h7P)m5w=(z0i#v#{&E>D-K=K%-}s*i;-B%cMQ zx63$9WT0|mnb1P2@fCoiE8=sw&g zCMYP>`Hs85mx&2P4e7c;TS0H~@pNoI8ee5*@>9vHeuxelbWchhrTZ zyAQ`Z&P2wGV@3za);CeA%$;wG7K^B=%{H0$L_sRQy3|+^uC5scsdPbOp_)5C7Ewqz zA%D~Sjfk6Xq{IH6`s-}ICv^?drt{aOw&9FZ z7f!kSO{s6EA~jgMh8 zhBpjDtI6R{9Xt7n*}KuxgNp<2rp)b|eS}?p7LTiuZX77a`16Y_yZ0>Yxh_-tGRDlB zbiQ)yzvFwz!@V|RE!Z;eex2dbDY#(e5 zGqviDxh!|pz|Y==56XE`Hm{~X%Af5%Ymm8QhZx><`tCcj!LWaO;dqZp>vgFekQgW0 zPyIC=sVTdwab-=&m`{4_A5it<()crR@*k34c{GkRYXS6)`rn{-ltJwU*!yy_XFJZY zA{fFrmeOI;P$yk8v8CNP3~Tsn$qaK z-q1Iyn<>X+y+=81tICaiE!%ytG`sg8X25sFx}QZydOsw{cE1}-_FSp2;vLg-D0HRg!Qn*I%SDDl)i~ZJ(Y$y$b^yfAHb@CBf|Uzi$aNq zNayCyVWK5STs&EB3ff?@=M5CW^&ZP%0$QE%4pPj@oc9)7v%&J@%fS(WfXaE8jk!w} zxr_axq~T@5-=QDvm&LS1PU-Tq|AJ5in#|xttSfIIln89tspqF3ZhDVN()sh=cG2Vq z3CA3=QjWtMOAmho?hXs$9rWI8<4n~3 z6`Blc!}y1=*Kwe()>6ciJvXBpLY6hI_q_LeG{0P6Ip{v~pHp^-DPEH`el}av3#xXY z@=pHzPEMJcD8utAYbA!U1V!hfUxcGx4~!1!IA~EtLQIEBfsi4(Xl8rTjK0xj(C&OF z5^jZQckG;+{7jJ(c_n|2macMx1dKDhsXYW!VDKFW^QdQ`5%V}RJsBfbviBq?f8ZA| z6EvVD80+pb0VYg#79i-O=KSc*P}Yj>XI@G6o(V0w-lMK}U&gyLQ+s6KB-EIWbAgzm z>T+Nbiz=9McHi@GUa0l?t;hmrjAIK^IV*fU=8BovE8gQ6En`o;c00PGZ-Ze0D>Bfm z$OoZ+aZE zaQr?nkcmCTQM~huD!nKZQg+{t;En1DEDt2JC90wO(bJ2vdQV^u8GD+A>g|VhMM$D_ zGv9LYSqwZ2@@-UtpK+iBmAlEmKoLTSdJ55dnfA#FV;l8_$_?s?%B5Zkcv_L_2mSz& zbUla5p$frj8C;GvI+2dY=EV5!a)3u1RL$t!d^z-w`Ss9oP{vm>-d&7;Hh6#8J)zTa z5SRAef}RixU5nWC<*9ZKs%TxUSpZwmv_`A$4WSIgu3pG`Pv-Ksrn*=;=rq}ZRjLxA zVd0L?cmMMTMs8P_zE{arUY5`>3JCL^HEHkF+WknaXmZ}O_7XTQHIIpqS2Koal}Myg zAeX_3C`gbf?SWqBz6f0do}mVEsg!Tj`At>xP z-ho^n0%B~G81tUx5L23?ulKC0s;YI}Nxe0#Yw<^M`$hWTks~0&^VP7)Pa{WAyStpv z^;M^@L0_D{W^L*;6t+6smrET-wr!;<<>$|3V+e|UyDlxu>j(dp(YNOEH>CRU6sPSt zA)C<*!B;AC&E>C8JqFaxFDiBO83(prz3UJFj$0|$`J0RuGD)2%!9lv^*&3T!HQAuxyWeOeJV4Rt{{mOZ>LSNYl5N# zQAiX^d5fiF(61`f`*7z37U>@t8>-+$vA0b5*U0%~2LC$yauodn&GO*`fs z2Q5OEXcq$GT0cP>2=e$NVu%@r>{JmCB1dqnz~q?iLnS~X;ap%LBHzgy@7()TJUw== 
zQWc2UkWk%EDh{e4fqEY%oAKn0_ggwf1hKyKH0ao4DB%=?feMTb$&i!s#hVne5!i@OMYBC{Elfl@{zuX>wJ^`NT8ycD%EX6SlWO<-3U z1Pbces-t#4#G^9u*QWMqot%ghX5>~U=~Oy7b|EH?5TvavONC$@esV~a&;X2Dl#x(l zqgCd6pDIzMVOZfwQ*Ny+t0zBaQ)*RhQ3|6+pcO?ODP@@?@S9gj^!F!1BI;Y0ZI57glfBwL+)Wj~dby^XEq82o$zlVfwKEdMOcVgzqOel(ie z{pguTkB6Yn6FJNVF#`O^9+^p_5}7cvzT9(|&N-Bo#5JJ^l9@kI~aG zji++h#&h6JPEy3d@xM$DiCn1U7KJew|I58c`sS5+_fLLeQzmMO)ViFQGe!9T1#7G- zuEYVeg8fvLYSZ2W82=BdNf*X*_M6#VbQ4NTk2wZwS~^zvcw1R(+ess8S}(*OWm513 z7+82SyW4SMvC`5JBOrjgi)PG0n^Qn|mIslp(2-zU8mAnMgSXMApz|8GkQ6f@^-y zbN9qwdLQmMdypN$l~i`5W-dbovN1k3U@>=74SKGMwD)WH@@4eL-$|pnVyPFQ95oH0 zk&RL{mh-SNMo%XpEga}qcGW|$pmzmY35MjN$qG{-3i%YC1oX+>KjqlVnFD6xn4m9o z7p75k@WH%kHg;^>c(=-#nl{8Bbr-S;V}z^&F`bZcARY}zWN)qZ3YZU6k0%UvK4|ky z-Z)!3^LT(!WsvlkF=RYM1)h*%Bj}NzJt}{hnsp$LXO;M1`wpag1?T(5!cWx2#p8=f z2T$V_?fCRH?okC`TY*nR&LBbw;XOgbF(9;-={Eb*2w@+}pI%CrxgV-V&A{L2!1kgW zRIJ!VX7Hg)Q>ep0R1^2w;!~uMBUZy|I1Qx%tg@g8ilvgPsTsK|AI+>%rW6GiWDBR?W%A zU*MMq^1R0Un0bB%{d#Se6aYp!XB5orP6#O!Jl(hP2wcc-LerRM1rwWr2bC8NKNZz_gQ`}# zAZ6~Z&!{r1Qv0Kt=!Q0FHxw;! 
z220eW^~m1VIOB?&QlxqWhmMXe3#0`NVkBzh1mg;>X{_X8m==XfLT9v9yHxX(OEm+2 z`5e0C3xf;`D_O^(SkF`adPbqFgT4MbzjW8?k|P+(vR>zx^$E^FbW6<9KZ6`O@1V+W zS0&1T1b_nDfakR`RjC3gTDwMmt>xs{l~i0?Wv&l_S**a9f@hP`o0%3WOMt1O3!vRp zD$dAZ?^AlxZyZyq!*Dibr*p2<27O4asl{L9vXHqbh50$n!B#D16wZUaV>LPf*c{4= ztaPR6w$yVIyw!nS2n0j9JgN5rR+h)<0y9-p!?qa9GWDzga{LES+=sfM5t+kv5y{K0Yf zp`yQI%&TlzF)y`$8fIJ&pF(5hvTtuCMkN(C`rG1DWzd)!w52uM7+7I^>Rmk!4jG?9 z4DsmksgnQK;#0d#d`del>^e~Uh{xQd1_gBKXQ22DQk~5+V^?UL^&ABebhtdV6n$12 zZAOQ#f>2c%LRBOB+se>JF@mFpLO}**5)rDRu5G|US*h;JGNH%|dy)U%I2Fe?S>M;H zQIYvxXCqW3M|*eW1>i=hlz(^J@1WEj;yERW$gG7zBnD-z>Bn zjQcw%C^eyiQaG#VfG``sVpM!e1@f$(=o!#pRIo~DWKfIxZ#5W%%Z6<*+Gn$)El0vA zgSj_|QeAmP)w`HV*hL#8sON+x%eh4peoLYF7J9=ND6v?{>0}o)y7?dMTvg3fgLo-- zx3uTrDztK!ZVN7!qS?VgP|F@n=4W}CI@4!LVF#z z;iRkk-AA5s`!2`i$~abneqOU^FloW#_&Y%5zKkIJ{kF_Y4je|bH|-XgEzypP2i;f zOqta*vIY^5pWzuIAkUZx$TKDa^0=x9L_mHXRAvKq;3>Fq@?SgefR0P8`B-~X?`uJR z)yJ@Cz5fL6rjj}uA?EdJg3FGijIv#C=e6r^FiR?&^DB8FCJRlx**}(LdjFX0{VQarwxFFJ1l$>>sA9GM6`G>& zhQUU~Bh(HF252w_a#^U}ou_A6%41WJ41bSAwOX`O-kM5lMKNO-VmyXt84CLlGF9{{ zEW*W~8OFsB(@Yeos`s_dk3b81Kp#}QDq@oRT+!KMQZJ)PjmRhnCRJkz&pOhX{NhSA z0-_V(3LiI2fc74D@6+0k)-g6!Nq#IzuJhi78NFHOg~Cb^Ra*)XD|%l`_Qc>G+vYv+ zS>-EtL1YzoD%D$yZKs66{u(?%XwU7Nbwm}#n#90jl>^GfcvY!hNgkB0-^FsWpN<`h zQIO$mdpbojP{J_8HrQ1aD^@vps(P@OP{ov=bmueDQjbUry>QNybLfP9t76w)5TYuB z@QSs`6;EZx>XEH;Jmb9BwOK_-76)WFqm!3=31YNJ8OJRa${ z#-UqJvL_DmfhUw<)vFF>upSFmmLrv@Z3hC{@?z7R3W~N>hplHNxBWh?^kMu>4D3(q zI7_I2M^~etU74>#_F{n&gRE?zg&Q=Asj;A-18>5=wC!1UG18F1MipYw$8DA05a&pQ zbyU2D#mNC8rx*!b58GA`uErQ~0y;Ci$+8ctrgx=NABbT++d0*&q|#~H9mm+^4bWMt z28{}7KDQbU@+sPN0h_x^Qy3YX&|;xpL*=W7uW~Zp1qv>sj)(Mia~8&YuBg_s5rH>X zRIwe}-=B3sGakrNfrUEgU&jag*H}-D|6SHoska92S+)8M)g;H3VjCA;&&2NESD~aB zwzeK@ix=t%Prb*fC9X8!If@CrKzC>gbdy5bQT>K=iZ`!czoxlkLo?4E&6`>~SFdf> zr3HtA{b*OdL}$QG2IWCwugCV#VNl!Do;${@*(1rgv2UcuzKhCZcM9TBXn0%oHlvBDEbOU? 
zmZko*@TxsIR#&)cBF1xN!;AKg`yq~|JldihsSM?zp*?Oq)|ATIrW49VIkSyU$pPnX zYpM*2Ixqmy=-h2hRpOS;>9wiDNZ~&6toB*?F$XNr%{ZbNz|%X>uwtK@kT42SuS8NF z5Eu5Sd@SNk&*yG>JLHwHe@H3M)q1i@6gsLH@$EWZLpNR$-ec>Lz9%-WVXNHnV4R&C#7!p?3=QSEZ3Aa1OC4_{mJ{ zHB}V$VyZ`Fo%y>^IpB;^PV(Dy5Md2m!#l*x*Q&j9us==C2zn((GVEAF2r{O8!*zBA7Ryb#Uk84d9-Pz~le_noI4b-mWeK>rRnMb^ zugYtkB^i99Tg=!c+kLRd4*6jr2K($xI&6%3RSoPA65C#aibpZ8v*o?sFYk4_yx2(1 z(qFHOjFLW+9BV)Q0XBX;DErX6t^*T6r!)>zTD>0XtBbQDs7mj+(@UN2Q%`t#)9SdS|I7Fh!G{LWCTlNhG`ntKHplJ&a2Jl`KGdb7GpblOZULb znb;os0c9LNwt!G=s>i!FaN%HrW)FIzZ|VtZ%BOfie&dW7v#NO)<5*_ZduH zbkwFGL90!m7Tx|Ccc!a^n^CXo)wmwUJyFcP7j|($(=s=h-<@tvWi3}wa(DY+$`RXv zLEt7q$=&V6DVJJG41blqFL&0Oo$WU0a}q{IRS1sT7?WA61!FSD&FFpY43?`x;H=ib zp(1}A#>iXBk@r-FV|Z~IJ$&yC*j+WHC1HB(0N!97ts227?1oXyV1&wN08xfA+oAV< zckiD{*<4DIOi{8UgdZz+v+GTc163G(x50T~Ox3vsCGr)7xn}$1oa5?H(5|%;x&GrnS$lSJs;vmQ04X*dd)M&~w>EMqiTy>(n5|y)*0(0;OeD5ZhRqvOf$-bx;eUwl9@YaTu~yk&Opf zpuuakA;RbiHCW8(-kW|nY|_9AP0!4^xi?w+>HJTY)t7eFfp{TF{UHv z38@O!6wyFc!^D1@qcluA(9_y5vUC59Ucvrbt<5%4;{~xj1@`0{T|^T$dOV7)WiWV! 
zYz+h02s?l&d-J?I;&%Fc*e*TAD@BKL zU8)Pu>^^CBl5C;uXgIB-L$Cv9_qDrYotf9Bm*P#{U$O(j_u-N~2XJL}OslX%H@@C< zTsE)v6!2pp=3Tt)%`s=-0nn=cNG0w4I&Kxa-mPTk{Wd=eVGn$i=`V_to~e3A(%${* za}pfz@ungcXH`*AfC_xBLXEk2QVBn3lydhRobe#wp~*Yo$*2ctS`~P~&G(?Q9H#i9 zZh?{R3-G-0$Tmk)RM|?DFG@NKOw%SoX)%Ce~vk5y3W|$9I zJfGE@%(11rG>ej|Zx&!S%R&TRfQkujGwauzmq4g72qcA{5B+@?Q>EWUl`?%OG1r;0 z-gT+%C{o&tnQ3(T*QopJQ(fwe_jr+S*6Yy=P4K<9uzuSk7=qrnXJyRjUE0w;T4~H` zeLdaR83TthS~=NcPuaV$6YRn7z_La6yB|$neF|I_@aLz_y!-GWM1JfH?&c^2tt#=Z zyYf0QxPqjLU;mguas*tSYV;2BTQc7L+HTR%@a2NVs-mAm#o;prndI)Pxb@a;!X|q2 zRyT&^2Qh-}!-t-*i*=_oSY5pY9+S7X`{fvn0HpM@$_~7&YzJQu>05hR zyb*7<-uk6edk-lF-ndll%P(Hadea{>8-VYQms@Gf_`stMIp_%nYNy)GFB#3xBIY;a z1!N4EF!g!vB9!N9Ukmt>5e*sw&*e7i`}t8CYGSI?#FTNVWY1)%p!@PvS!KF!9XBz5 zgFKQ~b)X1J7BnvP@h>Gw@WClIFx}QE5yV9sG>J_zSiO=xEWIjO0j(eeQ#H0(y{fXR z_R6xFR?1JaEdqzzC@Pk%&OM*!fnY3Spv6;Pf?e znwOtlVN^xK!{_++RDwXSI<2Yrb4j`ugNd}#+@J3qx)Z$`o}Y7Yi(&d^WcG^g`l!iR!vSW0gg#0x^)=+)HS zpX5%smW;>#1YOCqCo{H$aPv*6E^y$y7dxFgQq|}<@P3{W1ahdj_h{`KB*)%r=M6f9 ztt(Q~GHr|Y^YlaamXCq2B7qym85~|=KJK*)2Hjo`pP2F?{)!8?`<-O+s{aLIT%DY} z`v0KE!mxVs=1KT_PV(x<(V@_nba$N$noRAU+Wm`|(ZaUMw(7Qz!2+kEs|h}lp%{+? 
zlMEhJ;vG~~he&~2u07rZ>s|zh7gIL9Fs)n^{YEoL^!EfncQ5G92OZYAACKvNPYZYr zgl5~o28EY9W+_ppXi>j|Cy}CZnRPT)WsA!D7uLa*chEcngS>t9HWi~q{J?t@O6P^rAP~QTMx*mz3{&E9$4%sYh6(xJV55j?k__G@G z=e6Jud(ki6Ag0qVXO3K@QSzCYV>NXeI7+a>!2Xmu;-h5j`=Tq>JB;p+F0Ba&=#p1Z z5dZxJY?uhdQ3NW1l>?>b3-r5&AxN?2;2nmBMk}sRPrr7ywI~NLsnhf7a6PnD4yy{j zrDlbgmqSi9;%l`=mZM5feoUA}3or`lq2l0h7dLp1a*V;pegDdDsN9FP{>}pUvpMhf zT^x()QtW9f%b%b)DBWr3SbT}d-Bkq=MQ7i{OWr(vG{@Gyrf0g|jBUQ4FZ;(UpLz>a zovrx^Q#O?)uVxfr%BG6Vc(+b*DccA;JWM_8I3#G}PGx2oL+nV^$jOui;N?ni@ zzq&25GV4)YZ^SwmbiH}tB@N52Qv>&e8dyEoTPx)6)+!(sXN;J^ODwHsDk ztVM4wCOb14O8&xq1MjZ@L-Aj|p>t#N#4QtzZVI#0ni^8Rj!QN)*KMgAD*2GC67{Bp*_Krl^+3)l>$Kb2l(UnKES>|3Sh?4+|} zRrC5)P{+oL3o<`S`kFcxz%band}9=kq2#ZfB+^GEK1%*d{tGYdSj844tvLR;@j?BQ|^odr_-F{o$=uIS?tlNaVni!=L62^&I!&0 z=Y!5fXOwe>bD}fJImtQG`3I-Ysd6SeA9BueKI}|!KH^MuMmuLaAI1NQ@-gQe2NR%~ z)2E+RJ!a;dMPsY$#-B01ruy`n(`#$asGV3nX8x*`)!F488b7VJW-Me?A8}*Hn<;L?-`9BkE?L+OiNULEj}$#_0^JTiR!-6Ok&K|5pJU9s$<+lUH8az zqAC8x#L`@%Wk#ZPTB3bkqG?THX-3>GDZHJ1T%r!i-NYEM16#14mMEK9-jb+mNYrEt zVPtT;9$&>NrnHH9(CH}ocYc|n7Be7*#qANB((K0j9+K_0^CAQ4KedFN! 
z`RaMre?BMCIy2EeEwQCR(?{ITQv40SxsgSh<AqD(My!;d1H~MdeO*M|hIBbI(Gp*z@@YtHnFU=eNVLpOv^FN%Gg`L|iI&UMJ#?GF^Eru@ zoDvhdqwaCv5UHDV`SK{;?1W6;WF6zUVFdn!Y%e4q)DYj8XiX>D8xvb{x~>{j-Qm7b z-_KI_vl3gH^!?`(t+NyDa}!%;;vVv&Jad$7P~PZ!ly{zA-Z&TnKml3#a}zB~mD!v7 zCRI-`KHM{dg^89G^7%aV92IBiq0mO9U~><8)N{^H4`YpAAQ>RTW%3>wqC`Q4NU$=| zn(;Hh{dtle8R!gAgDk)zn<4HgFzBPL*^RC-$GHjBlw&7};~c+z2K~jLI_rwXUrbyR zo1NGe%O<`Va}(R~+Yy_V*cn>}30m-M0iHGB*);KNF`muBv#fcR4Lys`gQX2_ZnMhM zQ7yu8egj)8rH$bjKO9@z7Au()gHX&k1A$!#>?UrB%}nfxod=<2;@NCG%ZX>x@NBVo zmc_HQNoSw6nsl@9EF+#RMY=|#gc%Xfn#{8_o>6&}g*=(gK4UuhY(AcCifoay%a=#$ zuc3S?@?n{o56e8wlzAR_eL*~%X`Z=|fzMdEIi#CyWM~Ao99YKiq8`YLGB3k3R`@JD zqaG;p`M_BI8HR_=+v@E+Jfne3GxW?8bvX`y{*wbXj_>&PsQOnmeg*aj!{{D4Bhj=j zu@oH@+BJqRS9fI7U})1+{_K766hnx9nuQsPr%Tn-Orq8GJH^IClSZ>Djp-`UHJ_(w zfr1w8ogEHnXNlZ_O8!XG7+F~v81+z6HAYIcK5~TRcvL~>2FD0rU#k6PI36i|VPrPr zJ;#I>RS#qO#6TG>C>naAnT3TwmlwY(s%0WS=6#LH`*{jbbZl331z+QeNPSE%KR2-y zo%cK?*anV*9293Hmb!_iBJF<++dqRLI7&w1&(LGdOjugU+E`@v4do>tjKK;w!fIz4 zi_RqW!A>4RYup}Ni)ZukY_8uz(V8MT%_#pOXgfeV1GE{m49jO3_`plhjBF-k9-j~J zOA$FQiOBhUCYUa=U}HLJ7S1(LPYs=49Fy3{qUuTcEXF5niBFF#-^_COQfpLGp}(+3 z)fZX_Z@!?M2edLcl5V1#CX+%obJ=LKNRUSRRE_q0BxrpvS!AK6KdMzk$N36X*1&dxe%-Van7L^u z7avz>@pH>p7XqrEu2GX-$2r-w|Ke>9O+Va-rk6J)va1tK8P(3#B(j;P>&W(}?BC@5 z5XXw6_0wIklIP=*y-%h*zBlAS&PUrj57->o7o80@4SJd-7sP~5f>`1)D77EI=TSR{Fg`zo(ep7VyB`J+VDYa4 zoCa_Pz>FbW1#Bs>X@;;FiP3!x=6E5nCJ<%~;dvzdj^Jpsj@KJX%d!QOs?Xp40V>pS z{xF;}?}?W@TC~qUIhNQLzbx@kd}iV?bO8M@j^}Yc6kiH_1@JWnKR>Z29&ZQO9zpzh zJZ!^3mlzewSTRk@}J9KabSz_cFmAv4>+j0N}brOZ7U z=y_qwDiyx~1L^>D2!84CL2$3b2fWwuiQsty_%K5__23UWqhsQq2afQ>s*rBdD!wGqDSo1abZ9|{iAB%U3hlptu zJZuU{hYkzT$LZzqKO)gAaGVB?GEoaTZpOnIcsLsmVNwMg5uXVjhGl6JT;(%zMY)^s z=Nh%Pi)&*SUxjagF&c9(n#>X|lvI=4JNBLwBj z=TqbLa}(KViKb~bObDYeRuwdWn#%@JV#asTAcna?WC%kB43UO%N$`tFe!B-i}*`@mpXqg+*DxbN)We?I!e@=4}+hSAif`Ul&IB$ z!Y7Qi$DAe~tn;rh?^G)b0>CAo$DFx7IMu%*JsqIJV|c@IOq2w+SkX87 zNFjgYT>DQQXO*JQ`ADJs#xe7sI!=e8pY0=s^kyLUpJKO?qMzj>Q~WE+Z#sT~{j;KP 
z@R5oB73ob!A~0T-U=1aFd=~&V_xkWT{uSw~#9eS%g0+|Mu@gYdVJ+yRkk4of`{&zm z6?10z_x1i2^X1OC!s7wl#+-}~7M8c#$K^!{R-vK}u1ZB6C0L~jAHN3>bJ9Ls?_W{Q zG2$+`EWzql)KP-fuJG{^fS5DghYR&K-pB0)H|E5gY4+Bs_pc~VjkpUgOR%CAb(CO* zE$S%2id*>LSDLZU#)kumneUlCZZAHh=ZK5u+6tbO2)hbU@(Jv` z5D(f*As)25LOf{Kg?P|!6yibuQ;5sj3!aoXL4O#alt1WK3-O?TF2sX=z7P+_i$Z*! zy!Ee3oCW^1yeM%N%3F2K|Ir;=u?OL^4Ib{H>7$UoI+S9O!Uwmz&l~#i6KvX2eCk7d zRvJ89;xz_u3DK`NczX!nWbm#KzQy3%Lim>rzCDCU zA#N7N@Kmj(_?!+EpAZjfMXh0g{xaZE`TlDWobQW9(H|^=e+YAxDEc(;6XJot@jPVV(Orao zx1taH4Yd{l`WUJzihmvO6XJm%qT=zOzeM4IzoBB|z<;jrz^_m-Y~U{z;s0^O8%~H< zgF+uF=8Vf$g`Z+!N5x-p_fv(ZeEcis>NSO*>fA2KKT9>b1iEVPLHg4Ro(FX&A3U>Y;ZD*a8 zJ}T{yyZVMoc(m<=L#0&$7um{q)O1C*P}+1+RkUuy=2b`6VDY`3mUol_i)|~lX`ox4~{&v3xDIaz<>_wv590FJIe9F!oUoCAhzheTaS7 zL!~dajeMvyO3$jKYU?xfG@ODKs#uF_RJ^HuW5J6KLuEb$n|AC1|2wgguU!l!&EL9< zzpyk#-*OmAghCcIbw%Fv7>b3pL7o45eig#E!S|WIFv9{R6syi~C#I~C!?&`?4XZJX zV_aXpzGG8F-xRdjA##4N#xw-0;Tn^zNyVx_4eO#_sj)R0) zfp-`j(`OZRw)9^S{96S5K3ww%%k#E|2jQO)d}O(n7s0<;1m|Z@c!cwPsR;f-76=E+ zEA{Y6flED1FgQ;$@w0lEX>jJ-BJeNZnnzfkUubv`UQ9oqF!G1xJVW48&T|Y-v1Z_B z|C`9AUh@2>z2I_|rx36A@2};y__s@bQAb zb^R@jv=_M}5Wr^P*>Rsl`I6aSW zzKpwDT=KE@5U$5gLm#&1tp?}G#m|=aUj+XafiJ-|56W{fewLosJczHw&*ELU<`Jg9 zR>OnnuNS!Ff1|)zN0$E`0+;l1e3AGc1-<0+uD~Td*GqX&eyQhWMes`nF8S~~Z9K?F z;@=badOWk`QlCGMpnp=(OFl7CUmFCU6s~!M?ZHN%!}j26a#`jz-+i{M#-OMBQXaLLE^1FT;u|ILP;ryPE^e*Y@?tP=Rg zMZe0rw)D#czDeMoz@+vFiOFmx{xRn3b0+;eXBXG&*O@T|jok}5bg!{2N4J%ywCmDm&9?rqf%0JiOELV%b ze=Km>k3C%k|F~$UD+QmFp%2SBoc{K6g3oM`Z;QbHhrp$rcND?r7(6Ubi@>GcXfr&h zx3lrHdiyCs9MqevuiXmOxU@IBZW^Zllc1M;4hdZHDHVE_^rs2@5~Q_q%5hoR+vS2@ z^5Iv@d4%Qsk-()L^8MZ@`h!LAD>b>|--2r^XB8EUgZaw-`BH@rmgj2%m;Lki1uo0A zOW>0Jet}E+X9OhsqEm;6sR z;|2A95q`E_8U@}d@K%A#@_t|7(%wqQ7)RKyJ}7XhhcN<|a<&Rw%6W~zCH=n%T+08j zz@?lo30(4ti}6C*|Cs`pdib9LmvZhDxRmEvflGPp)@6z%`JW=j1u5re1upfO7PzGU zvB0IA_X=FnKP_-6|EQ6wdR6^O`9CXg$!Dg(r9L@U@?d!-{Z$IpxYWbH2wc+d6u6|n zL*SDB34u%cxM-)6p6{CQ2-i!Ez$Lxj4>*|q|L^|rPfdTqQw~4dZ~Ri=jRME!O?6Oj zs|3#Ph4LWYfuH5GQsC0A{z>3c&)*TawEuq>xTL?&;9>cn6u6ZC;}j4_SpMk(m;7bi 
zMwW~F<18Opu3H2y`8+6a*@?Uw~kAI9P- zgNOakNdlMt=X8Tp4^#28e5ey1tmYQ{EH3*)=?^~)8XlyVIQQN22;*BdtZ=D^?+IM? zlQ#=o(%&a=N#8GUN&lL_CB0k^lJw0&eo3$QDeH1cdanrmw*@Zge=2ave-SGL2lXcD z*9u(HZ!rBynErBsuLCbzkHz}8mjwMqf_@|!RI6Z336&`gsxj zA%ROj^V6c=kbEvIg4=ZvvXk=MDd=T+`voq`Yx|QhAMWSnL0szLLV-&;Hws+V%hdvx z^tTFJ%K0Axm-Md+T+06`(a%UZU4ctJO9U?ar7Hw3>3>`V|GB_rdG8ar?-x zMfh2JJ4N7Wa$Jz@>nla*_Z7hp6~W8IxFGxC zdVx!MekpL-et$3UE1*+bUxx)Q^-x1a;^0ZP-)@1+eC7Be_3(ot^v?)f$~i*x|5DD+ z30%r^uE3={Ul6$LPktnDS>A-`x1~Ha0+;fv61bG-3V}=h|0ZxL=M4gvdXw>FN&m@_ zs#;XNNIB)aThgx*^iuw71uo_1`zSnEJ(73tZ~wErH8?F8SMa@Nj+g6`}v3z-52(kiaGXX9X_h-zfTxRgl%{`G*4MT*>0H|B?1~yyypH zz8@C2)Wc^5F6n;X|Um%X}9I zT;}@)fy;cqEO4oxuM1q#pD5b3l;;eAOZqhemvVkv;8M<81ups6_s6Jbsh^Fw=MnaQ zx-~p#ovY8_G{?#KS-a{tIQh$b-xRpC+hXhBV@3Ow^sc}q|D3?3{#yht>G?h@56Zb3 zKP$iVLnQu`pqG3O7s02Sb{f{FozD_)7WuXq`mjDfV(@T%O%ZsP;Il>GEdt*!@UIB` z*SO{p)`N_@N2Ci%$yPWK6V$w&69l75@*g4kJ&DWx0KJga z);r(p^YaQd~O!`N;^5H2>xw>w+KFR9B&eMHLiIu-+B1id}|2e zAU+>Ii*qfN2k`~?Sv*4!2l0jYS$r`;9K;vlXYmyTaS$hs#V;p_gE;qSS=_Dz5??Ct zKj4}N?c_@QEd8Gd;voIG_*pzA>_^gHEbtydFZVY}{of<#B`({SYJOTg?;*x%xLN$I3uB|+Cir`%a58J~(3tZa6cMVQ^n1r9@|B|4;K;Z8RT=JhVF?d-1zY1LDYxg;a^OgJI>66%U?IIx#>iH`CEM6k)jeU{D zrwhDW;Qx+m9^})4pQY!2(8nW;pD6Ne!E;MrjcXo!Y)nAi+x1Z5#syP&m*Gzwuju>7 z+X655QOCK?1)dW4twxEY|D?bl68NbC&sFLh(vK1NPJw?$;MbeVAieDq_;}|Dnuz#m`cnPv zFqK67vjX2XO5YGaUEo_y2sZhG2ucZ%@n|;l9h*!mj$;p4? 
z_itxa>;?2;N>ZqyA>S(h+c`$P6a2Sxm9Ga6liP4!J0B*iLIoDSbu8WheqY>*zKlLh zZq{Ijc&~hzoP2{FlGO$dFGCI|L%ii3dR3i6SnYUxuV#4T)bJVq9XS3Cc07cf@57F* z@eT8h_wgH#UPD8$49xjGR?e_^4!2$+-j}!K5}%DyA4)RCl`uZ16h)1XE=9onG*bjT z5g%}hg7KNBD43sK8Y~Gv85IGajt@&k!O>arBUMq<_>@%yTpL}ST6`8OiW(o!ih`qg z*Wg235!5yKTvrr~k9t{n#iov0e6b220a~|eJzUiF_%P1+nR6DMhVM_TxEN&gQ&+V% zuim(P-74wqzW-~^O*jQJ(9zX2!{ChhkD{3;ur@JeWVr7|#Efz@qZ6HuGfr{YQ|gBV7KQfku_T?CY9dmY;ct z^FIPCD*tvfAHUAXuesq{c=p1axh@iw|4x(t9z)0Sb4ne~pZlGo@~Q#l!h?pI=n|d(6Duem9dmc!bN(b%Ut<_kBy#@6aceKd12F z{JFjmmA?&+*za%}HiPoNyGZ`+|DsduBy$`rKV=W+e**Lqm4A(yuiNi^k_V4)`Hx5X zsPb1|r|GLj{tOv}^Pg8F|A$V}>FxJJ_1)n7b4Bv6|F%vc%g-W(^Z!AS{QFIi=y{v5 zIfu)?y-5CtzM~UtGwG;*7Ac(nD@F2u$ONP8_eTwzLHd88Nd8^_rc>DOTQGkXDV+b= z=(wZmzrR|ix8F0>cZ2m`hxAePf6MoEiXA2)^Zzh@;ry>Il7ICnI^p9cy*-8X|CJ*7 zw_LANNd1on6wd$QBKhM#4OWN!KC8YPT>kru9n_!>)t}K$OBdq@misZlT$C`d8g~P%8 zS*79pcNNLMYXVc5|0_#hs=wj z`Mwv6STAUJ*>3XRVQ}VuDt_Vo=NHMp<_w+Cevj9%8Pxt8i{xLmQ>WNR;czhjG5Ce^ z-(Dnt`-yrP3nrg%`Tw;@{{1(RI*7G1|4-u=&i`PM{A)hP)L5H7iv97ZiFCu?SJ*geX?w8SbgX`}%MdYu!S*MWW2YvOh{GWlJ7Nx)XtWNKLzu5>7 z*59c}AEm$6UkuKlT|zkjD~se`-K5jo??jvI2ifoCMe^UiTc@!5V_1J|lHvRxERz4B z`8vJ*9<{z3tp9t8k3lMKgb{n}_qS20)Uq{oKd|IL7Fc|JT}$J#RMYVFm?9 zze#Vuk8K}@>+cIlA60)fzt$=Ci1FtvKpd^hgMbbOVHKFN7{Z!%f{0@AS{_0Ko zdOxMU2&ey6k@Q_Ay&1-RF5&b$i=^K(%>3^zlHR#cXL+>p|Dj0w`eCO3YmxL_CjHUM zUx7(^l>Yaa^izDz>5H)bK2apSvqIC?9VPu)Mbg&~GyS|G>AOsNwt0C}%Nty`7D>Oy zq@Nt352wGPNP6{h4$>Yi|F0KGU%ygkd^G+2Z`3;JF0&*%n{>8Ut&At$Q7wFC2 Wb9+UJy%Fd3d-TmSD4#sS`uo2uLoD(D literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_mateltwise_transform_common.o b/third_party/libxsmm/obj/intel64/generator_mateltwise_transform_common.o new file mode 100644 index 0000000000000000000000000000000000000000..8e5127af77bea45e86c5b94f05b16b22017dacfc GIT binary patch literal 2808 zcmbW2U5Fc16oBs}+uf|&&aSOOq>AyWskB29x)KUXCe3apl3HkYt5Dn9+3f6YXlAC& zOtWcGiUf*7nsy(2_CW+6eDO*2MN294Nht-fk3|q{Ek$31qOzVdx$~2oNs%77xo5t6 
z&bepKy>oJTp}v?91mcT8-Xh_XPYLNvg?h#C!*oXw`KMecsX)OFL48idld3r2%d=}_T~V&Q*(FHiff)OKg++oIL_v9esJ z#}*e)EG*F_xm;hM!A?fgXga}DTeDV7yXjmE$cAdHwlu0)mT3hre7o>-hHdBbg`u6F zmZc5#h<1Qc+eqOqoL_}14(#cmWzdFctToqZTWdgDX=%o~vq8uiI6suvVTdrG<$7Ww z1VKIv!{;9$1i`C-x6|jpA_T#kxd(>NpCAN5z7vMeYY0JjzBRxe$NTBqe-Yu&NBH*< zzB|HS!o~o?=acd1Apadq_a@1{s5|Nkk!;5r%p1PuXdQ=0P*9afE4ED}OKYhTQfc%M z!t;$DmSmaeFLAe5!fUM=5Zy2xO@eNI(00_uIkttiLDwu**P?koqrMmXuq1qEDvgQ9 z$``D@1{yNHLzV)IVTJeK2g}&~p4C4Oz8G^{{(c_>6HoZwb^s{eg_5@#{@h`g;17mo zU`9K?Q>PiHFysDw3p~qSG|TM#DdQ__Zo`bGIgWqK{3VVv?D^o*Wf(q-ntmMzV{AWR zjL-iY=6@p^6mxchlzQ50L8-8ywu({(cJn%(zIp{ydi&HbOq=>&% zIN2))SERWnj+6E4`?-ykmS#zx`!&U}AOFCpJqj{y?*ibWrB?%U)0l`WWvt5U2z-fT zOB?vfrLpoN0;F;?1lyvO&(P}hNtNfG*(L3ic1h1l>KtKvnSS^6*LYdK3mWocyj+Pr z$w{Jo-6j+*o$)Y6f%;6YpRv?As6XaXw^dmH6p0l?wDJu;+O;$27pEuZ=jVy8P4Ns- z#6W%Y^Hd_UKIQRXqAUk;PP1Zk@DlM&2*T2Q%%T0MnKP&08 zrZ70SvQ!|L))KPF+bZ7&N#^2XRl5EqPCW%_eLUD2MQVI=UhD*MC3eXr?4ehbj+vwlj$4A_W59lAe0#U;l%p3{&hh<_6t5QF# z-_w>{-Wl}Cl|CLjx<%oyDqZKq<8P90eHNjasemn6ha}@b)_s+@2OL%?#Nlt^apE9@ zrEg(Apl4>#S32VbW9+6@aVm-wSn0odcYeM`jcg$umbN3{<0AHAT?K<)ZAiyfv+aAS z8TnB3zq5o>8}Um6l&XGrjwo0=;dKH&MjR;Viv9|ct_A%&koneOSgy5&;=wN?hgXQz zZ|*XJuD;)~@{>qjwLeIuukn-1V&&C@NaZcmO77Jm#FgDl#PL?1wD;A1j>_wo;uLDD zG~i}N8@L;Zjyb8d&dGP?=Z*bI7SIZ%ez_bAwju~=JGSh7+BtjMvZHjIg)-aJ&TL78 z_7-PkEFyEmP^Kgx4b=bjzgqppk6;p&rlla>Lf!m*>?Qqol74QJej9CEe?!s#h_3#2 z$Bew{7V8n`V1;_bbJQa`S=AhTcvLSf>mN}8albqq)Bg*frPug4qocn|MI$Z7ACeH- z&r@R3B+7D=HgjWWf?SwsLn=)AJgk`Sdo>TRFvuLk9N8J5Vx^^cV+(TEkzk8BbsOQ> z#uyp01RDdW5`|(EXgF@u&=`$2JlxMyph~5eXFT}JXctf;3MlRs)-&Qd{U z4{O?vN=5ZBO=!RtvLNyADf%1zL}CCAqfcV({{`*c(FGvs${`0=sb#X=^d?n`A;`J^_HSG9aHTi831&!{7Zl84p32gk?N z@rmRfH91+NYhA}0+E~W;Sp9?9Type)noVhkCh`Xs80p=(@hgG0-mOEc0^!iQPtf5#gU0+l93Xgje;655YFAPM2pVp&LdzA2;?A-7d1Muczi143eKsSw^Gv`a!8 zG@uC!5uv$nSwcwn2)Q0hP8HG-=Q=KgVc4bC0uDbh_&kk+PnC~i@rl&%@qU>X(tScM zDijqEl7#2^Ug0D^BqT-{QAH?j1}@vU=R>B@yHH_=kl1M|kRemyDIt-o8{wv9GVt0E zBFkhU9Tjr0-|Hasw2RlIS?hk5fF_a0PsQh|HSnhstZN{A9P236sk{6 
zKKeZyg^?{nBFZ=zHv5ErnW3A7^k&P`cA>wAq2Sdk&(7Ckc zUQV#)&WV?6aW2`s55LPjcD(Fe2X+jXuTI*2OE@m~pi^2s{9*(AD&ckPH>1GIJ$63X zyAIh6!0XBHYk+@`*|GD?-Vxa0?8cl)YN>JU&{!em=+UDSnao61ErvToqnwIvogG!x z(AqVfoof~>!%uMkN2@kf;YJ& zWWt=<8`~CN`Iuin$ANN~?SG$xOS{2FDo@XupLn6bSryxW>BoVfMB zh{yER{IvQV&?TFV_?zboh_K;4GdvEIUfFK;)c%X``jUBcY)`waabxE`4KGxgU)xnBNfjr>OqL8=>_vQ2$+|JCzNDyNJi r{D|TIAz-wn-TFk+{q6y(I>T+295{w+$&uZ9`ab%&V)VHbTubhM6o&3^ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_mateltwise_transform_sse.o b/third_party/libxsmm/obj/intel64/generator_mateltwise_transform_sse.o new file mode 100644 index 0000000000000000000000000000000000000000..a1d22bcebce0e37da31df0559490a4d97ff44879 GIT binary patch literal 5184 zcmbtYeQXp(6rcMLN~v?@!+3%Xrzu)LxZIT@MTqHM@8}^{sJ2C-;&LrpddKY@?(LO| zLC{moW~rKJqS0v7zcew?m`JEGgn)lY;&#HI)gQxYyU;n5fj`@=k(n z`YvsK>2&Rp&KBs;Vl2mdDumR~N-SxekDumRf&t)j_4C9ohXKy}Qt{Kuv4cRC$A8l> zk?HIKCmS5zboKohXo6DSS5r$`=g_X9E~+Z_AO;tDbFdvKf( zeotFxgkXW7O}S|hCVl~urA-D%j;sJGFd00)23BAF!H3?QV1W^%hX~F<+~FGSJv)?p zl^x1`iuwa)vM$XdLgcZsZIksICTDb$^##7b4@cA1U&HqOv5a+UwwAC|{6xORUhFtm zY_l_#CkQQ2>`|i3hQU|bI_u~L` z#D1`4-TKhuFG4qI_qQe2^#}L89y&T4wmz$V9WsXw{qCzQICOr+zTk!qb92VN9F3U2 zb&fyBgJQQf`FqOC--$E9%It=Y$~(jFwg6Mc`VGQVMwD4Uj36ts3!rtEfSPELhZ zkc_$9V1jlXxhzRzhvzA@ zw>p1-4*3t$?P_NaDgs0slJui6RA9~c!)2ar497Ud_3jaP#5f-kaFKt;ga2(7C-N_chsS@d;8(<7^1vgE2jl0p9--$1 zym3nq@`ixVXMJo=us-rMZVwnl%Ma*gb_`n6sI`;4XKK?XiSEjpSrQ#9mr2yn3fU;4 zQGVY4y|!GaQeP6Gd$GGDD)+i~kyLUCj2b0ORaCmuqT?l9*NTwHBHRN}_*h<5HsNkz z+0-W02_vg(?DhQb2KN+hC2>z6X&?S^5w`8>GuJ>5G5O1{uddPDk?r9ZxU;`5)jkgP z$mtb7|1RTE$lownm+;K>aRt^+Vi5ruVgX|HO@K-0wBk;KS|BBg(XAt#e{|5kb z^M7n1Y7@f!pnqJW?EfRc-2DH1k^VmbdpG}|y7))mvVZ&@b@R`EBJl4@Fm*&;{}M2E z^Zz~bKgrgC>&I6j`@fy}$2uj-3AUm5y~F1syu;covyVymkIX(Q;Vc|y%{Y~3_L3S| zhKq6$ZqKIg_t*pupNgHoD!X_K1MW?&XJkYKJPAQF%yLgW(327`zV z-7Gj~TuWG4!s>GRO|Zi>zIvbKbOjgN3xj#iqCBhmzQ~4*~x-F*;0zEFJCKPnG;ESf{d{KlqN@W{NA zTDqHm$o0U4p>?Ml#_*-64k3-$-|<9fDM=K?dr+Dbjo29@BO{=CC8$17G*E0^1i@n` 
z0cb{FDl>+2aRuQcms9508zQk8ea}Boa1hD@o9}=QBp;X=iaj1N6F!vh5&0_L*hQYm zO|9|O-<AoDuuh@sSY{+xKSlCBGdB<|n>F2=#%2fyngH+PV*NTgJ9$lr}BkhC&k9lgPE7w(K|S?L)7i0=3srYR1?7W#h|91misf zZ61QVaN#uMXGbt}WRfvF6Qw|ZvJsmIhJ}Y%FJNH@>U6?I;zWM%Vr+vKd2A$ZKSt?> z$c-{1O*>piMHv4!Ot1$R`!%4rxIt_NuEe~FsiNpwNTXaV#~A(}JfuT`zE6y0MqkV` zhJOOE@+wo@%DjUGhbVecVeVE^CO5;sz*XXo_gwp!@w&fkd_fdh z3QdfUoQl7bjMz)yFL4Y-T{-Ki5mTvTV2!9F@nexe$*b)52r7uq?goo&co~hCu%+S< z%80YBXH3X5_w3KL>?4Vr@UUl#TRAaZiq`sQNMl<=exgzUf;{m;ia*p$HS)x%5Krkv z6zORdO*zMi{TDDKYD59l|F#YJ9_iBrxA5gUU*aNhWR3nIp{eYaSbz$Y0$!9J?vI3V9_KZ+dzIoz7BbEzD>n_I}n^<2F@B4N=qw$CiJ;*7 zdv+)C2By`V5NWpJ;p6DP{A;Jdc;`f)e-}t&|AXRtH&GcyxVlgAJ#^%R$eHnV$7_d1 z{8vp!^ol$iecq_?N1jkcNrU;0x~o-p(7WP;_vQ}%P4r94_Oa{SX+Io&@m=EDGIS&_ zQl;us2og4T@NZ&Wkex_zl00Nl}!qDPC? zo?%8`WvcYFJt?hBl0LeU-3f3~8;7hmHK$Ix?K5yDpx(>|tI{T10ug?Ax z+}dmtMaV!SY&vs`5DZtI9=5LumKfXiF~$~Kc73oadK9_u?s&(#>lOyfYNfNgb-d5_ zfEAp|9{2*P7hvS;A47}-B$$Hy5Bo2)=%7_lADoJ3<$PvFUqB~OA1sO7^h z6hqT!Ma2d?(O|#RVE;aG8|sT4IV!5$9la~K1rYzhX+T9 z$1SHY2p`E0u@^9%_B_)WDCzZ?jb^jyPnfNiv1?&4Y_{NeLlOI)Ffi}@0uYD=k-N;D z1UIT+k5d8#7;JHo3DOL4HBi8Yg!pae`Fzp){JATF#ns;l4v~T3IR0AbFp$P93>Jmr z!O0vh41bQ!zr8hB9(@rFwh#UF&^54$5$ncf^qOGMKk#eI$qR}Gnt<0Fy7qKq;0NTc zq!JRrxFP%bT}7wnm2Lb&$(yKU#tSDIU%wX&MDJZEjO&cVu9D_f>#L_8->kFWh~@?T zmh&Uw(F$2sH5EW0aMlZc|4Fc*EP2!j%r*bWWP7-AS>E|=2I!sr`*dquDej#lXWN*+``ZvW<7RWx?c z82%#t{4RMiW$%<%qR&$!MUnNP%7>E9dOGv}B6Ft?M<|~6DXZEFmceVU7q3Oy0S}D? 
z;i7k(f7ZA%c6bv#qI*a^e!-OLNU+SX?s7?A?2!f`mwr14E1Dc7mRn9OY2Fv| zzXGx|i_naOJt1zv=Y%@a%_KR93wq|ABp8YpPqC_7McnUXq^Yo(EgngT>0F&JnlQG# zYx(yh7++gNrbbf~d1I%-jxd#LJ2VDA)R1)R5ZpQugynm-U{aw*8kG^d9zDO?u)t_x zTDxMKavN0QGRYB!Cgf zlYD{Lr7FmO5A0CSea5b5FiD7hCLtZ|RJUJ5v+k=LseFc`<9R5HMvX4fw2vol0UpdB z6b9jv<$eskOO4ROxGwXX_QSB{6Y&+Nt3JgvLa$aoVGQ4l8n8aPP_)q@-1rAh1^%gG zdKD9qHlXW#k{snLhi)r|L&I^=LQLnX(q6yxkY+R57#b_c)`t9VwV}Yp#k6j%c%+pZ z3D!t7S_sL8zYbc?1wj*?-RXgN@%ct9g!al?#{X(wJFC0EV(d~}tA5_tzA!imt;3>D z{)WEyU4O)2S7R`hHDc$Zr2Rnc?l-J$N4Bg~0^dM^& znf7D$KKmcL)|@^5d0{ef0AT^0BH~6AWJO{5{?N5c z?a*-{`kJlURmizGY&MzB#a0u^m)ZLOX$(2D>V?*V`H3$AmxzpBh{GnhLmx^GVkRU$ zID#SjO%}S0d~#80zf*4qgAMlUu(iTZ@9C?2$W4wmLNLsP_W2_lXivsmv1iCWIxtC$ z7m7V3?fE*W9?E$DntU$o6iY(HUZ&IwQ^=s!o|#Z-bDkafJlDzbL?WsgL16L>Je&`U$q8{=|axh zQgNt|GpDfE-%bvYo;8BI-mAr*LjD98nOiJsWIrQ3KojiOQshn(siku>tDj0yz1P_O za2UR)6SbVV5P2kN!ZKaEU%Of!bjIp9cG%=#Q;j=mI?uT;&q>F^|Enh+Hdz zlkHcSADQ-ExyUtrqf(#YcToCq4qK5})AtFiIbMz)wFH7Z@5FEihkNXa8BP4~5*GCCoWC8MIk$ zz7mGzA@B~2(Y&ssayPE1Z}QCsgw780kk?tkGwB)Z#Vn@u$3ynZA^Y{$KJMBRG4)Je z&V{`;GuY=vluKM!@Pby+zl}nohh)N;o=F!-iQ#X*neGtgF$vqDX-;e0-c-A+x;$O2T}F zNeQc8H)1WYh-vRfM^p@n0riBA72rwQV5Vma!$fiD#Dp?TaC^4L$> zPwkq2YF-8A8i`+dH;KFn%f9waVsnzd{S<12oK4HX&aQT_F`5U?&L+5gBmS469=&^4 z$C!HOvEI}8BlXfbmexG?>~YTv#~Qa83)XuDGl}*PqE+HbA(Yb7RoA8?5*;T#B_FT1<2Uk0xO1`?|l_F()Yj@FxB9EkBvz#cF2XhYVS~_UPQesaUWv$Lp-_6$)pcBxP~3dRMXk?9Xuqkf@}eX=qsF{I_ZPa zK0dwx+%mS9&UR++m{D3`I`fvHQpDJF2Oy0Is~0pPk~cGj?J!%~F_38uE;gGHGy2*Q z0k2WqGn(IUGjZ_ZN4W6BSsX@-%O6@-inM1Igz|WXCds90cgE(QpxvwH^Ru(ZjctG{ z(JLlSVfAL(EL~@+ZG~Cr0RA98d}z)+J{uE^BwN6i((?^e4D?0Cgq%eys7*W^!Yx%d z6-rv>$wIYPSxDc@p-@lvQ0{iuR&)Ybg{nu@8md3!%!AJ?g3nObT%iK;tvJzHn%d_z zH$uaUso^ea7)l?spKHb>8TsK1R*R-`%(sye<~6AKGz0XhCC#D1(g{ZFHym0eU3%^^ z`zZt*)-^WoMHB`pWrT$YBcd7j1m;v38`T3hJj%1gRWk4!LUkXP*X_6L13i1*%FrNtZ6eT`|5t%DoBZX^qgu%9Le33(I(18=L== z;o8|%A*UNy3eDl~IJHBTse~f$zJW7NRMAcBD$QmzfHGdRzz(IjXoPi2aGnQD#W=Gr z=bfZ(Aum{$Yt&LVj~A@V%^|G&9_X>|VVv#fSo@#2f;58Q0ImUg9Vb_c*M|~ykUb@Z 
z14LkRtQ106#30im1_@IP5~dU+Od&{?LI_PtA$3Iz))g^WmtwFkrC?nOp_rx^9@7*> z5Kt0IeM3euzIb?KL?wY4Tk3s>&7lQ)%3S#cKz_A7VQFX)CA*XV3G3oC&YwjJ%RFvP!|osZuX_^+Y4TW4OH#qybMI z%oiVJzF21;lPyK|9}p3$!k8{ELwgyN*n`xtm{o8t<(U(ZVKiUFJp_tA1Z@o3|DKDv z5YAd)apZG}LU~+^d}Dii>Ap8Cr@v$9=o!ZFLcoWPjx&Z&lZ&7+{32}H8iNO8 z_z&N0L}t884h{?7DFn9QBT;BFbd|CAK#Y&YJ8u7nqkIY!*ct%%Lxo zo@ESQK=cSHrP2i(+g``Gk}EIeoUbU9aUaT;IrB=}u_S;8RiJ2Mtx@6ZFy|8;(%e}{ za}PsZcT+h-hoP_gn5YgN4jQpXk!o1ZjD?)=wT~#r;{-iW2^we4VyC48cg`)hLO`c~ zVVmXLxD}V|R=+*Oixy|bHI0@tXAp=YQVTZ=nKiPW!x=1v_$TNcP5VJ9;^~5$M9P{d z2-z_0Biti$kdnZRQgXq3Wz86qLCI?{<5B8I6&Wj*`me!Y0QSZ>_D*CzpgVfXqZaGE z(613Y1@gOOLpD~N_JInBcJ1o)I*i0_AV`fzsd$(!#IfgN9>e3sYgiQ>XPIy)_&XNJ za)}Y!1Ts_r_FlOO`F^*qr|` zq#X5`;9`7=KR$*SUTs-fAy*2q;-2OM-LJw>FrL}y+_D9iG~y20jz*+mVZ`WGoEx1x zhj40#0LYH^n@ywaO5G}jjMWAjK^Y|zW|h^c1t3wKuWo!yGaU4|o;Ot^m zdGknEVW6jC86uCtY5z_Jpj{T)wG8wBA;dl2e4>7H9z6_YJ?*>tT-2F(T+*#twUJ8{ zhr#*lDC#QQv%ZHgWqJxLg)p_St5L=So{`n0fEri1D~75h*qDkY4D|JAtej7GTk)=n z0T=-7xYv(~EUfN%ZI~vPO%WbTj8#R6DlfnH_>xK3GR)wi?ZBTCJE1X-_tb`pG?BnD z&?T$t?2079Eafn5Ja(^LgfdDmL4EPAC4 zrI&VKI`~ZNs02H>8gC3=4Y$S>So{XAFvo~_yT!(EKcfb=R8G2Q3%V`y9`;-2PIg)G zZGS4>h}A51YK%XHySAV)up_&p2>fyNs|dYW$Ot`(&I_www}&Php+nAKDYc$w?kq#8 zP`n0HkeJlRBs}L*FulTs=S%a=jn}$;-476ZsoNSLQ{sA@n zpHPZ^hhQv^LW);$Dcu*;Y#)W9+eFXaHc@m=*d~gO53AGA!yv`PCJke_S!{LWEF;Ek zWujDSGTD_-oibbCs$P|_4+E~c8sI$wDv9Eir7M`ZZ|)AhBO<__6UVtVU2(HHr%kAgkS0dR7cStLBmo$fVuVJrn@ zHNr|*rt}(Mg{oQ~>@w{H>ZYU~4}8TUxbkTv2TJ$Ud+a*uSVF34zak;kepP}ha7cN# zA3HQ!lh7ZOsjMQ!5K_664P^{ddbM7&TZufG5@TD+`wJ}I(Tm7_$dwwcFil3+v$hgH zElIs}^#<5vL-ro2t2!Ju5+yC$8e^%nw%At?l0vA2uJec|SW;7?d@bWqXPul|)FwuLWODvBpRMlEQIfJ~BJTGuIjLV6hc9N6c#2--0v1 z5xCtE7J5c=hB3M^DSl(`I!!?)wCn{dS!1>Li3$=g!Y*4jiqEk>fHF9qKu`7t#uKGnY~Vgm4s{Yf z6fm9dcQNF&VoK%^T9^v>upzc)VLtPa>3pdX&wIzG94h2&UI>u#D?4E)-1JlGEl|oFH=z zPsVmxfiyP#8NmTcEWI-yYO8$QJh!vP^1pKEZPV$jf##?-Fz){Yls(mBbd3IF+{y;2 z>kr8eHHe8t(Vhfupb{;5n95{>nR&1l3H)#&JdiVT7%Is4iLV>L`NM^5B}hFHC$WvQ zMH&iqWBT9P#c^ITq2f@2>Wb}yP(52Pj=47_s4V~gq2NH0GL053;-=`J!+^` 
zRSeSn10Mxayjye>(G639##i@X(;<4geJZ%EMMwNJRO9Agvc}bHwz6-Vo9f!69bx$k zvk*elxuJtP56EbYtwn-=(rr+T?Nl5LwNwL^AQauJ`g^za5v6ha4a6ypkgX8_Od2N4 z7OxdcWZPs8vt^!=ZkfjwQn22Tzj3|fPH20d4>=v>#-`uFdLgHW zDO7U0z&>tGdBO4@hW8$N3;sGY9y~+o@C34to~vXh?dIkavVj2GN`8oks`ogRsYFSr z`lw-H_7N-92r$O>2R+rg=>rz`Z(<-2&ufOkcF|zad%I{EO7NtU972>z&2VG%4XlVN zveeX@2hAq6v8c1x4?VJVQ(vvF)=|@hno|v#?T$K&JpvUpAm`$aakRO`7V?rjYanR6=7mHUm8h$>-3rc;=;m$p7&np7B>JRVMZe4PD3KwzM|s*5-X-!vI2v9; zD|mKggcMzXj>+vMfv0S$mRFozyHxP80#!`0xS8pSMcrIlBm4=AEl+%M<1o_6-tl%emqA!;5Vk)-A zoP!Y6!;J-J_n=1#;&Ned8dxaD6_%zEh?plaJYy>WUX1uD0!t6nIt{xwUWTAdn=40Z zCq#muti2AdWZLf~9)|5yp2Cv(nD)Sm`VYXOTmD{(YE( zY3Wdo-;8~~-)7U5{VsD-;gS7H2~*>7MvU**0cl%_Y{kQ?0$^b1XpYhqkk<|)LCV*; zngmmpa|WgQMZ{>O7=&qK`w`~?j0(3st9-S+7_$KbYBm7#nIGe(2@Fp#Su|rIcy&D% zW5YRW$BWa#S-=WB(!G^aV^DArYhaGgorwSr4$4ZUUAoh^R3MVz&?NIt4rbM06$4ZQ zbC(#>T2F}uj94+;pQ$=#T`^O40BFj@)-jAzU1?DIk%W|^KN;Kp>{eyVfm@){`s(gs z_@ps>J0LXa@F#JhG&=k<62Nf-f{% zj9pQ>D)(Eu4nbBD?}4mppJ*d=%0!Pa!gii)plo%O4*_GM6XO#EEXnYPj$k2n7_)Zl zHxPT^$2(|1nF{9Kh|0(75OH#`ji{cJ7?%Bq=zeTTEDR#V6mQ`i2SZfl;}EmYn(`zf zlyxq4!XV;Ert@290sEs&`;Vd*aF8@fDJZ(zpRq;k)7<`ym;12&nLz~GpYzrJeA!m) zZbIgYOM3^9a6=lizzWOhq}xLw&xHJcg91Cf9njc(=NW4BR;87A6GP3IV?Q3oqHkgT zPEB#Vh5?0m;sYw7v2wc*-!HlVv-*601$*15r9RAWSpEy#J%LA|Ef_&hY{9;F;FY)2 zxo*SAjuwO^Vf-o(e5+~#k%=|f3-dUNa@s?^mx3H_NvRoqJ=Yw1?~JQnM$EatK7JOl z)h*2%uJ=Db^1SE~rl>-2tDyhFt%004+&3H1U;p2Jp5(?;#Q!5_vmG;SHZr>3a-Z)} zcMxp9jpkovIX_;3#^wC14}UFZZB6At^Uyz0I`+7|&$RzsYkchi?rS{s4m;A=<7?@zh#U9iCM?JQK zO?e0iF?)!<{g7opBr%Cw~V&DUuKa|10Gic4gd=)nZ{BU;TPyPKq+yn<@~me)P2yB)_tsH|_foDa|R5Sk;HG!qV1*sC5W0 z?@p9?D#lP3GVl)**^3RVmUC?@#w^j}7@7Y9(l8vy9>RsQ2yqJO%mdh3Yrhw=pD=NG zlsolmL-t4M9pR8YfvRXU?Grhm@35;G9~avv(2&b48~gMSr$crHmr4dr`_p0f%@xa@ zR_~~tfg!x1R_}aAUZx7!XV&Arpl?vAHg@R(ycO5>xnUH(nOAKRx6~IU5WR)~L$m4W zz%D|v3A4W&vi|@^A5JvGJSuLb_Q&wS?j`AXhuI#V{x0l|uUxc+?B8&|{(Eiq6G+n8q&uRA{mxY-#!Y*o zN6%oNg`vWU`!*hzM!NX`^0i?M#;*f1tgBVw;|eg!oGyEP6+S}&6MP;8LUA+4Uf*Ha zGpeG;#zzeEo^>c5J%%~nXYtsK?#T_uBN(mPFI#q$3m8CAwUdQ&L_D0*_TwA8-a$( 
z2*uZrGvn*?%y>(#8K09AiqG()+IVAg0YnJJ=j5C5{u9g=`_3U$4aNIUG#l+3xyspO zJIv29b=!<~3KeiI)a0@O^x?;sT)?WiIxN0gABYc!0yi~;-j)+*z5Yd@| zAUvl=W6Si%!zTd|9WLHj7#}RK=PV1wmqUu^j|l6VV7IJ@hx6l&C)(@#ta$ASco}wl zd~loM zPBs8L`W^f=HeU;kHp-9?Q*@Z&eA$x({qPkv2ucvDLtn$#5$?g2aK>W7GoaT#*%LDQ{^rBw z@w3QGJe(KZJs~u)`oR%th1w05J;W0NJRXuhTN{ROx@vG4Z{DFyXkFg4W@iMs;-*5;l ztF~;}HNkk>yYYsjJx1Rps&JSmQi{pk4fq2m+L8f&ml-d%f(0-p4h}mqM#90>CZ3K3i4)a+ z)KpW8m0;H-nj_M$=t~F$ZlhEM*#8e;w3gWC?yJZgeG(hwu?5QT!+!@!pPh%PA7jb+ksgc!%4shT_W*2cyeqOnARB1)C)Jr;L9p_(vI0`ov*Mhdwdk z$L`B0KKl~ehrNko35Z{r@kU~n9-yVlC1*@QmdZcCj|B++QTP+*0cFNK!Uqc@)-#nz z{1!X>(9cJYVQXDK!fN6@&`-&)ydZI0hVQT^d5s>!$4R0PFM4cp1U(J7FGVHEr^tCI z-a$6IL;);5dEgO##l?snq_pr@bl`qk0v-2wdIef1JU2cAF%S;$ZBFZ>&)6!Zk!!z9 zH`zxw;j}cpL(~DUxRGJLeWQwMJk@ z<6~r3~Km%mMJ>0$B+?)5X@%7!&-Rq)9CPhv}y3UqBLu|5$`UWz9 znb5}=t?|;~4Yj;?Hr4QC|4AkmSU${T&j%nT{F!Qpleso#VP+zG>nzzjvzBGB&N9i^ z-dy6op&4njA5*V0*P9#PWuJT1X-){MpND?QK{v)lg_4uBs+$4x*{YqYBoxJgRZ)o`O$Pa;?FL+*c_ud(l!Ds(o^oaLWMKttG z?tZ#CHPU#{i6YSo`{P%l_p{)`NcHP6NYAn7)Swr@^!}Vib_b}=>=fFu`@@MUar|0N z0&_C&wc+IWXDP-J^&F%w?Lcy-rjI!~#l$R7Vvoa8MoA?^kHklO3aD)|CYSNCxaZ6! 
zaEqx=Bv#mlD281?UCd8#*ABIA;yNaHQhK(y_O3O){y_AB zbwgK|P8#^Db0LVvL40bmjL&x7jMe2wG^pIBdft*UeKywBO=&_=2#ervGRfq zs$i8wYeqzJ#WW-oH#6#c;j?C@m}8ZVfb2(z3JzI8*k zO!=6B5aiZY6-)_i4XzMw=9Hmuv0E5Ep!^I|n^y32kSgOW=<7%?YS7xdn3^K@LS+jj zle)#0QxWO~V;eXD8N6;v;$(wH7HJeGLX?}hT2o}H4Kk+;g$0m>iZtFP1o6Kzv*jy_ zu|sHN!dPiM{4Tz@uol_K;B)5CNUpJYIr{}s-$KkbTZ#)4u_1)nt``(LNlq>?7b%5q z+6I)V_C!SC9Cpf~beOH2DS$H-^)Tl|b!~df6ByCIwCxgx`@AL5BYbw}l?V zqz8;CJ+=b^8qg!2C_U1#s7JOUk1UpsrBH-lE)`{~Ld8kmK)c>kB0qR=nb5{J5Mul~ zqi|LlinqMWPvosF#QU7j)j)~2q7)HQYmgW7RJ2rW6G)2^$&2vNu^`P0RXzc{m|Zk6 z>4Tr@Q{EdO1$ld1k1I>UEl6+>(at_hzeilkeRB5boLv4XxQ zd7^5MSW={#;l5~I%C4^Q^S!OPor>E9iPt3mHlwdif=6Na_p+=JiRH?vYOn!3;J|E8 zd@e~RnwZ27m29p;ZHffG%ArL>@8$$kD11f>_>Q8)A}BCf0{Gzi=DY+K4`qJzl}UL0 zck|^sQM5f+14JW13q?1B1oZDs_98xqO?;FlY-4c7ZpW;~j19*=P-JZh5|yz1TXb0N z*Vx8my8u_pFrka-r?n@_3NF9jf_ws>0Y$awtA4t!dcuJvJ1qY9(%w*BzMP6r#HrqHXcgDRwt~q4 zhbq+M!hidSJrhFS{$MgFxDT!LxXf1U;V zzFU|On0IrndEYHe3ldM!3*s%$?hTbv4En;RcO?Kr?9HcudYw@F0v3R9WZFapy>|o>#*uS?uJ=3+{~$q8R>1;=BUi zP2*x8R*1@&eird%{o5cdxDfIArBnHwXfew3`z;uga1C}6>XS1pA1@3BQM!cB*#OEy zQ+^++{%y#Z^)@S4A2T-n9FN@mlUW{EZLpsY<1Nj3e7Om(|7ysOE&zL~M5aZZ)qy|E9i7Lb$6zK6s7gAdjlz+2BS3dq_h@kBB8C+2W z>Vyn$o(C&aI;1K{D45k$A72?Giv;Nxdi@B85>ThM>oO~G5H%vIe^aB6$*`J?NMarEV6X9} z_iqPLio9SWCVRK3k`W;YmI{HSqCKsa$^OH-J84{M2W?xq_@$+ZTm(q$FGiMB8gJz1 z71tKV!>YUc9*PPTv6m}eRuUT}rmFs9w4TP8J6{+}Su$~v)UEx1Rsy5z--tv18ddRo zJgSdjv|N^hRKGNaA9gHL&7?h;D5&w;SRaIqkyg6Sn#}Cnq!OJu;w)zi;WoHIUh*3# zRIG}Ae-ZlqcctH-htPWz{r)dq%ZZKNL?t!`bX-%Lg#=s$dX~H;WxRf1Ov$0V3I%C= zMwF=-g=lR_DL3U;^3_(xh~h*k5IOx7dqes31k9Ub-PYZ_*_+R0z9Od$(NKNz76rlE zPDTW$ja!>!vk0CxqfnALloBKkzOAye0%cL42?eCElg zuIt1%Vdhk(P=!ugjLtSd1-cGeo;ZXqTex>6glLF&!jZ_Q_T=jFh4{HCSH44yo_x?D zvOtC!c~><8E8}yjh^!EE=qP5^o)HM%mf;6{<;7Nhy?5v+RyH`m89MrLV|cefFts)X z7wMW9I*Jbgzkq2hml@gxoHz~TO=6A|&!cR@e$Lsz_q?P)yGZ4h7?_Hzq@GNJm~=gf zq0s~^y!~a0yV2Yp60sG(+Mmv2!bJ7xo0AZg&~v_=(R6Uaqz}@0Q`5DH?Wlm6te4%G zqp;k7iAA)GN%t@enLD5vUw9|}NOZ#cx`)GdL%F@7B5^eey1FUDV??;d1V724=}S}q znx$r==#2Usj1(tIuI^Bin 
znz4H{sq;dwzA8+n8p__+s3PBn()PrH%<@%GKKQXGpXS8?fxW4o17lLK4K0^Uj%lah zwp?w;!`^hMfKh%@lC}ybG?n{&I8jm~dvj-CCRw*YTd-MPwgvLk!8f|NN1JcHFT(F; zyf2)3I`(SC^2-MMSFY^5yn9vmnkA9dYdQy3Em^a!^Rne~TC#TV!pieHS1w<=X7v@_ zYgToyNJD|f<)hQ&;m0;ZQOoeF#cv2d?>RCLIhY)?9M-IT$ccT(=%g;66AzA|(Z8Y3 z1C;!6&8-kWrrb=uN!P1OTCms%g3}1%>j;9+2tv~c{C)&2X9SKg0v(PV#P1#a{)wM& z){GgSEi0cjw{1#U)wBzzRg|4qab9J`g_ReSl{a^Hm4%l?6#SEw6;ps;(l(s^S2z3o zgGK(+3Qx%20>WK@NrH#-t7K|=py*3Ep+LzExzht>@x1AQ^6ST$fr?Gz%|KQ31S`;( z<4g!t0b&NqQDS3lOIw)%qHbKZvz?f5#0 z;4rCJ9AbO-`cVQvJDyV)D7ik@3Y2ZivjXMOarJ@POD6=%%|Myyi8QmIBMmaU;F2`R ztb)GLm4Kc+a0tOeme+7TjNh;eyM#`Zl0Qs6htl;t8P#JJSv#J!;@N=vER1KS zTW>y|HIZB#59eyUOb0JB@o)(rerE!ndYrQAO!sL6ZO^rnA-9oqvhj%Q@SGcRLzF#C z*)7e-`f|tl1A{HVJj-Rt1WRGy?FjVc?9C{6xL{sjFoY^?fk6|coWNi^?xwp4W&r_J zNCcL8Wa80GSE3LeT@vUU2n;r(5}!~>tmHn)`Ey`!u8VKFTZw|s4fI_R7@UPCAyncM zYPKCu=7Z9lz~CG_3cL7dN4HW=eWpg_^W6$P-RcINGZ`pSev)dBx~||St24KU83j3w zAlv|+E(YP5;HeJG21^pT1VAdc4wUOagzjR(U97+k z|BS$H#mV(3L_K@{JCsE;fetSRR70Q|2Gv_FV?P)p6 z)@eGZ10gowuuG^8W$RG3&J`~VLSYb^9XRZ-4-Dnde|t3?Yye>wM;&m~i5?Y>FmQyy z>}=qu2aY=6fMEs42H>E^LJ*iZ>VcyUe0Vs*uE6stAg#&5bYmKhOtY}Z^lTKa2h(H@ z?#Z$8LbhofA5FIHRR56{*Jhzn{4+2}Rie305A-ciH)yoD$pMVsSO*>~@X!|M>r##S zQfbszW^DPPf@@T>?NBX$hH8&=X|~7KK;M<>IfIF3k693n?ncp?0)rP*bk<2UHxljKLV*+Qy>F1Jg##Mv&JRvSr_`u`3mi-q96HcRz*8dlCkU3K;QDf;A|AQMC`vRqk2OH zuK#$=tSQjv^SuXFBTO1`oRrL~-q-fq#Yi)i&gqK`Qm{R2i)P~*uMRq&scWbafSUu$ zW=aVQyf&a_J!&?Ic^3xy8pNV76kK4wT(8b3h*bc0fYqCcelh$2UM_7iRE%i3`$?SxY-}Hi&DRit;k2>tKPzQa8bmj~Qq{K%IjHL9Jz+f0m zc(H=svH*`7@aXc)UA7PA_AO?jgG2sFzL|ld>vNDsZpuYE8O`gQ5GXMvNuF7-K6yR6 zpygk@21bPjbi(-k(WUt;1F^J=*~y$LtzA&R=Vw;btO5(w%Q!f}A>WNjwXsZbD)%xV zN>Bg8;N^AjvCPGD3VBQ6k+zz$`3zSf5 zdsq7PoQ%$kWQR#kBQSYA7As9h7uiURx8Wf@#>+#z6r+U?kK!Q{Wt|-J4|W!f8OdR^ z<0I$uwOT|O(oz7TZgDZM!kJD8uTWm7lVaXL$q>s`n#PQTVmAR8&Paa-+NwgtvX-kN zO3MUfmLwI>?n?6503N@9Gd+DZ&Kbo9rz0F38!b-&3-xxq;f|KvgJE zA>;A8Tzs7Bp-<`fU>>(o%qUnCC|?+;sFU$nL!e}qG&xjxg?t#S?DKso8H>_#6g?K* zj8TnBk7h%oZO|1xlPMm(g8AW0Xm}=4u9--=>X35HM#?o4DHn!V=xndUZwm&P-^f8! 
z$yrFX(Y?Lgk4WkA^8lL##7sQtID|E7Feal-ESJ#*LIsz)jLkv)4wP+1*&e_y0nXWg zT?$wWU~2$t0<0ZpM!)c?{~3#?`0Q+ zhw+>_H+8ZQWmW;k2F4K#>vU^%D5qYBv5IvfD8tl*cnRwS>{<;*+=Ml0xIB7SWaQo| zxR|4L4=D?`pv@_9NG6>X58XPg3|ZlDK^L3#$THJorqOM{)dm=bGEjvYUouIc==0NX zv0l5*76~*Ia7;}tE^twCaGulS;v$ePKMz0-02xBtab_mUGBl5;HkQdjD7_924WkHR zM%NDDV6)?Zgg7|E!ErBfATw1=lMoY;(bd>8$qz+qMYRQPwb?k6i1NW~fB87i_tr?J z=jd88fiYj`PNJke&6SzNG$3wsWtK4$oI!Bxkt;Ktnc!|BoAxMIW;4@(xXqQB&P;GO zkf%M$m7XLGD0n801q$4o%^dwW*LUh@fP0tglSSz<&vJc}$G|E8@%#8!9Xh4lme zRGo8u9~%vD?{a-*S>R`6fuETL9?SwiI}5x#3;dic@Tpng)5gFr0>tmDOv2AgUeWF; zy=h>;Q{_j)_1dW?xxNchH`-OMPqs>H5T9?h+>OO^vB1abv(+u{b`eQj^IdpZ5~3~^ zx$rG%@Gcj=H4WbD!f#81_qlL&PCZ=f!YfiBxmxeSYt!IEE?k{c4>!5+#uP}d9Qk{u z@8mT27P&vqw>S;{HMu|2cUl^px4FLa$3VXY$iy5DaMdAe2)MilzoHD`>aT!L%t^%` zSqlJ++-rPO^RfdTQ={JUA;d$Yjb0eoUksvnU#O6cu$tU+bu zCky<_Eb#AVfj^i9p2z|}5ltc!KT`mon3L*-WF8Uxw+TGe-^tt<;6qvP{7lNH`azkK zLixW4Jk<}%Tn*rU%v(;(@m2;^`vll`uE5K9DTmBG;Hn<*O#BaJf%B!|i8*JuSloFB z+&v-iU=l9#2)sjR$b|p=Eby)@@b6}UKa~ak5rm#hbZ2FON3+0pXMvAofmdMeEEC;fT3oj?_kk zl}jSsD0J!bw+qvW%UKV&d#2JRZBZNr(JM<)u*d`)9Pnb z@XvYtQ^`N)%ayuU*B1yxipk}LaxaBH&CAd5kD#jdE$QkVz{d)f^hCPX=o=zBmvnWl z>AbwJb4~Z+JtWIleISOE9`KQbp5>Ph ztm$^8?p(W~dsTmuRHlLropU=cTd{iS70Xwpcv`x8W#2%gyR-Ysfh8-JU)|j`wn!#i zD^{=W>s-C2t9wn76Ke@XXfE>d!K5B7CJ zMdx=#L{CW$QY9iwRxM)@><5DHE?Lp}nW}V+rV6K_!)~Duk1<2Z{^ggiT)n*OLzB;5 zJ_GX0yU`3G*vjRrmakkgcs`}dR&4pIwI8-<8pg~j5ZW1#>ZLt9W^k1?U_Hs|2RX}V z)2Ek~&#kaAcCPGR`QdT0TGrBn(_{br+_i6@f7u5{%J<`|579^`;1SKDGeqIFNv_|g zY}cCAeX9o|$)YI-A6(tPZ26kimFHFUb@rTp9(_jsd`QE&h%@qWnc8Hh$AlyUqj`P5 zlKtJ0fj;0zaPNc&JkcDPdI+scMH7fDmwv#Db)yWNl0R!5p`!8Y^-|hq!A^D_DMnH~ znX;DJQP{Z__C%u`#dEsT_6)38A-#z7J*#`rS)hAevMQ3oo?@9%dkG@`($(EPJ^h_& zAu7po)(Tky;0IUym!eS`m)Qcu2UAUxPj}3JRm2ZfHyfuPs&aM}y?=GB9noQ3r~P2% ze8`xe$rv9hu8TkRtyy(>C)^R))raEE6DC999>fG@XKv2>K20QLTb19uKNG38_n}Z` z&4wtZmJ21>{oi9d%2OsNlb!zZ1o_YHJp$4zh z;OjL#&(`2~X>c9?lUd+3m>1+he&*ok(e;)L2(Q=R4`Du!2jLC)dF5Zx;5yx9n1|!R z@^kU?%5y%C2jP18?`MIZig`62EYCG44-Y?j$Aj=D{5<$|8oXJ9KdQm?b~jG*6J7u1 
z8eHd-Un}H6x-Iy5bombg@*rF<|4$9B^I56+m0o_n2G{-YKQy?`|6eq??uX;a2o9cf zx_VsI={9KPb-Gav&b2;I-v80yx*i_Z;B8v@{Tf{7XOiaUI=o(k&(rW+s=?be_;)q9 zPWJ%~uH!$f!FBvxH{d~bb^JdUsDkTp_!$ju;o8#=|H%m+#G}*YKU~FwaNR%keqYDa zua(!~-_+o`{&oM+`HX3PqVw}>4X)dlpB~^rK6U*6P5=&`bo~7qJcMVS9&Xj(mIi-L zgX`rNX?D@^uhrmszxDMl|Gipy9Y4Rkz=L>n|1Z?+ zuKWMrrL-!q+vOt~pSrx&8r(u%PtU70xL#hDOUJM0t2+PxrSY%pXS~M0uAhI`;5z=x zHMq{_jT&61`>F=l@#ImFIC#?ibBRFJJB|N_YflevYkJV}aK9Z7;?dzh)ZjY)-)L~1 z|7$fqb+|#qIMVU!{eq64>%*DK->$VgJr8_ZgHK0APu`;%T<3><77vQSRavk6Y65YP z{3ZB#@GlXFgK+LK@!;DC#6frmejfZ$0&x(|y&N8VKY=(1pMjqTKSm%9!WZJ_!3pC* zK0kt=-u`v?v%JCKrbWr}y9KJ=S>>gN=SMEwTTAocuefmPgog)zUEL;kV0rKZF1*w| zd+>j{_ho7D!|r`B4gQ{cU!Df{w$M#YgXg(^T#*LPci|VL!3$k@RT{j=h1aCPFLUo} z)8IYsy_E*d-@m;5B`|C6`#@J zzjEP=-LqGIlY8Hl2LBKDepwp)2KRnN8vHBneP0^t%;6p@!gXO#N^U9yD#bG^8 z^0U)CSl+_VE5Fl)6Wu2?I6qFxgXMKR0RnKO<5}UtiHB|3!ynh+Q#AM^8vGm$o~Ola zwmA=vo`+i+{BgHD>7J{>y?%jk#t;vWe>`q*5Y8`JdGO0zIN7>fgMUja&s_;#`JZUz z*>*hmaSgs)gP-D#f7AJ#t-*CZqb{8MFvfWJcWUq}H27f+&!;r_hz4iO^zeKlA2&FN ze<^+*e7*}OpOqSXl?FdggKy4)#~Y8Q>v=#McQdwlbpN8kS8DL_6L5ord@`nbS z@dq_H_ucRy{A~QZ@}JS*dV8Fu;n($WgH~R*%N`A``_Gdap4A%N@z@u`gZwjQdi;b5 z#6dXw1P}fIfj9_fEcM`e9($PvUxaHO>3G&Fu)x{3czE>ky!PM+wem|exEFVcpE2Ak z|7k5Q>F}i*{35OVO&a`C4X(%4S`8kg0dbJ-r}6XX&ddUjXz+Dfd3~IsQj&0@vrU zb^Mf(2if`zexAK@UAvzH82fq;eu@U)puvMKoaQOh;L}_<@n5UKJ%3Bb&rdb+AY8}) z#Vq)*&w@YC^^|gEbtv!;HPQ+qvL7H0{@~0AJF*M=Ua3<1zJC=!w0g!xqp@i+11M* z(BSj%%+vowt)JE5RaxLyX>c9?V;Wq?^N%d>VXZ&a%l|kF{O?)d=V|?^&d+uYuH%0? 
z3%q0kwWjo@m!F>nep43s9a-QnW`UOpRn?EXt|5@?G`MRzfgI4_u4)D1jThORTvZ9& ziw}gmq6^%c&nMhvP2j_Bd*sRO(gl8-+r9`d6o~q>*kz9JlQj4>E^~wjH27|pIl>JM zKIk$>c##fwnIrsU4c_H4M>yvVyyHm?eu@XgIp{LY@~3L>6&k!mgD-TMX8B1PyibEq z*5JD}_-PtE&lQq*N;UWaS4hH7*WkWs{p(gnmRyE&WQ{s6^R6P@gAw1ft|gHrzG;{B z_xq+{)7p}0giiZVn`g(Cf1iD_3a_`{Hnp~&+UMXd*{j{M8t(V2Giu}P2b0kD#%3Te zcAIR5LK*g^W+*j!H<-R*jGIXRwFEy` z<0nNZTjfl(l3RNIl}ag688!lCU@b}H!xc}p?@_yQ&=52gW)db##SGhdJk4uw6nLQY zHw#=FE$xp~;xz%d%F6Clh7p6c*Auo7zfAOB$v~enrqj0p z%S6A?%_F?`HBn5CbouWFG*kVpZeH=IThBYC*WZB$nfMQXS=I3LMEx=4N$3A{z%tSA zbMut56q@AUbow7dzO2&i{-o^t}&zdheBH zqyM=q^c&p?d2juL^qF#{(_fy2{$e*-^WI0zM!zcy{f-+|jSePkI7pwmOQ-*>EcCYo zSH7a#7W%h+O;zgC=uZSFo&N8#(7){*Ro{CrQ{9cuwF!GL3;nWhs2Y6E zgYp;SmrkGa2rQBwf1a-Z5Qq2vr@9+Ue?7tNPyiq}k|S zi29lQzwFy%>9Z}T)93tLCi#o1RDJKg)ok=vXQAKm9aW=DlbAFw>&G9m&@cO*DiwA+ zM$-Qne(CfLJkO;6=Uw{Vd%zyebonQsekS`b`+=&l*sVzVW%#Aj|8y4myG>Pb7n#$J zboz_k`doX^k85YByR&s5u03?#neV~#I=%hf=GITXr|jZMr~fU~&m{kYKTs9CeanU-}4dkm!0ABXyx zr~X^go-0{?lr_*1ag??F+qUgP^oQ-~0 z7W#uOeZBvint}cgv(Ue-T~YMjbIwMe`(p6FuhuKzo*%neT{Q9`4(fjze(CZ*?b1(= zKaX9;TB+}gtGluB?~N?=W$_S|(%WU3hsZs1U6=mHzF9wNm#VwnIuO^M{k`)#xGCKd4W(p`53G597J04;7BF zU)fJp3GbbB+V4XAy!u|d&P1CvZ@!9u@r^t;4`+AE`QG&U%&8GV#vEIs3UrW3t9GQz Ve=?qBs$a29Rr)OC(~tD}{~y5Cp(FqR literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_matequation.o b/third_party/libxsmm/obj/intel64/generator_matequation.o new file mode 100644 index 0000000000000000000000000000000000000000..1178ec41b530f0e3e43f343ff49b435c73dee5d4 GIT binary patch literal 2072 zcmbu9&ui2`6vtn>)wbGSibAD|@u0S1$K9oiRZzAq+wNht(AJwUWVcgy=_aAcv~ESH z(qk3$AQZfK@sIH0*}E5yUOjm5=0T|MWoNQO%u3M*$@|V{-g}u$=4H23S?IAW&}PA1 z=!`T4IOy%@VkC<&3`6jIckFux;PH*}{xQNID0y$j+8!wHpCzz=qWm%vAK!2bz7ZO^ zSZoW_%|^8qx-NetJP|n354h_{vAN~QhVStW5qP2rusA<|+n!jw|6tOdVFi}8^VxhZ zTgXk@6U$=FE<3W#r*hdzdIG?iwgckzK=U z2(Mpwy$d?MN0x(bEE^T8iv3E(R3J5cBFP?fltXS@rhW?cU|FOY0sB}VK+~O(y48z z`l8O)1IHD5f&b^nF`arh*(t>?WhvIQOE@5w?AW3iWawo6Rh*-4{MY;(=2FdW>(_mW 
zA(16}^Ey&G3&m)*bE~TbwG=vup5pYMV@&af=;`^%R(+=C&HoW0TXF3|6qK*etLE>O zU>Yb-v1HmmHFPwgZlBG8p>>`{uTc<7uK$Y)NUlr2zs`SwB5nV^s{PaZMEgg7SDmkU w+Ow&4zgp;l^7LPz8p(XBnLIm);GKG*Pc$%0m5a$KRB8D?6?_{~Xt$TD#kJ+3wkuJ@r^&yD^As*mK<7_MEmy+pg7Z&tjxU>}gf#wz9wf zefPe(^X5IkcK06cd-wkC|MTAe{dhOSV~JF|-{(_A@+o&H<(fbhrERfPX2mvTvC^U> zjWgQB?mEr*Rr8cqI(bH$uBFz#P;M5zwHwc(co{2B?`vdjin8Ze45ec;fnRHXr<$GN z(BUzVtzOdjqG4o6*CGy}u~7Xy%n9oMv^Rn@lX8dfj1#tHRoPkd0e|$&^2&tSjr+N-u^juhlNq_`wFv`2Bk0*OTNJv(I=nZv0ob zW}IdQ%Jx;z`!7uI?$7A|fj>TXf9{^7UQQZ6Qa5xM|Dtv!jF;iwzpckGOslPEvOF}4 zu_Hu?K{#4d2BSw#RD-djbrz^iADcRN?i}l)Bg7>dCQ}$k1|v{Hd%uIG4s(N|nxEZ| zV#2(y?bz%7L(4*YCs2)to_@vHdjh4E6LY%XJGNY}Egf54dSl8gMwuldV^C0zHEVEX$7Zhi;M_T890=7T z4Z=V%jhQJM(RSbD$I#Zz?2`u-pn_O`mtvjfzuDfpcS^qn-h#=BS9=A*3V=oTXKb=` zKTuvbCaI^GCTgt+qKfdc15fl8<2m((4Q2H#p3Iez(wRDadFhN#Us*ci*H@R$)D&a4 zZ;R`_nt8kNwxw(DRPh?k+*v2pgPPghV6hjE(dO`CUH1dWmxA$T&AjT=Hxa^EUrEqa zm4bc<#=2Actai)KSHMZVqbTdmH4$_V_JFfK{gb7$e*Lo2*_vW)X=l{0*L5GqyS8}d zIa&$3F|dmHSaf35+0vO=Bd1E|eENSgbJ43I^R5V^I`vPMpF5+h3Gm*j(3f5aJ^R9` zuR(}8TQg2*# zTd*svvF*L-D;)%-lQ-!Nnz_1elDsbM0Au1Ma0&XPgB~J2ug@h0^*OfBuIw~-Zd5~;*CwaiX-X+6bA4ES|NH86d!!E4 z4&&~yYOIcS810P-V|7e5cDJIfjY#8W_r`?r;6`(ON;U3InT1Z(80j?c!k)Rh3!=lx zrQy){F;FI#ZmbK9QwMbz!ws4dR?VxKJ1KZN-PV9Ugn6&2a5ibwyqbWsIMT4ah>t9~%EEN}5>+$IaDfJBzk&ur@^XD%5L2 zd%uMx#oXQa{xplJ@xMS-NFARsRi$$n?mD108t96RZI7~sLo8`0psNL zue5WcPrqb3L4GKFA!?uRtgG`a?_4-(X#+ z;KOqB1*Sw%y0&H``Rv2RY(dZVMo`G;{kfq?Z!TLH`jj5Yj`kPyrlzLIP)^_4KeQ#H z=OT9|yA~{1FxwKK-b0fqX}pV5D4oi*J-d~CgvC~Oubc8yla_H$Bejz1h7M_T6$FRjnt>i>j_e6 zQd;NO$$!K#7RNs(O55`ynIMHI{2%+RulP)97py4$eQM(C(J@#9hLmBgklMs|qlYgc zXjjyVV{Kwrv=ax?C^WIv!9L=%9fVDg=V-s74G8D@lo5{(i?e_7D7DW=hU1-(3lYym z$M6naJ{~=T#?a*lqIK9!r^lmVylAhQCOSlGeaF(Zk>W?tLWBfD$S$IIUa1qC-9%Tk z5j&n{yp-gNjHyRE?Jh98qIF5*m6Y*@dW81ul<_v&nt#GNC~l^rjj4(Iqp_56f3yMT z_0Zm*f@tQOWZ1Q1DB?I6OqeR4!IwaUHG^+KDPgWn#f`3Lti!xUHP?5d*b;}r#v~3W zrMGeP(w)yWV^v|i2wsq~@ns~aSZ`|rq>OXrIlxa$))xRj16kBF3n9-w zU&D!A?6@X`&8&mf<^51KcUT99y;Hk;&{>)pGF6KGSkhdQGHVm2))_Za(UzGREO31X 
zCHlv{hoG971kBbWHm#4|hCa7G874l1BF3-7>bjZ5zaxGghck~_%cb%pj6I0FANeWd zL&%4bpF^f^ofF94Lw*JMb>ugZ-$6cuJcc}qJdC^*c@y$RA{RHmd$i#WR&va=7s5P+5Av;ph}DjfA-;_bRnycWCqd&J=3!- zJ(9^+@d?0#f6E`wpXu$*R~yhUuO>p7KKvIWFG0Q)neGQp&zlvigJE>bL9><0mm<^F zP>)P!Ze`WVmA6G2SFPS~W2CidaZ@a^Ft)Hcwz#<^(zp&gj+W6a{Fdg}jUc~*c#wRL ztW$iW4Zf@9TvB%s(}|8kSF;vH$?`X0v~p0o(t6>B2FD;?-bL?3u;GAT4Tkpx)L^7k zGdw#O)`AVIf^~P+_Xfjiuwg}z>?YZa9`9|)ia!->SVIJn)kM$;=r0LJT0oB#BsWfS z<0N-oFq{;!i1q~0AYF)-^!|oXy6YW9+ZFJi((kB~q4#nBox$)#AQ6llsEG#~_sxn2 zW2M@F-#0th2zyrq!ytlNtLy#m23zk8wrvZxs=?S=M{}Yca#~jh+tvnKA)!6k*e)?c z75PDZPic%E=&gO^b1ASY7}F#@`G zIL{#GDfu6qO++a?h+t*UPamF(MJ~D94%@ zVqph3mgJa*_fs63pf|et34bCOemt-u7@4S95o|m_W3jJR4Yrmpi3d9ai!dhC_iC^a zB=*Rxs;9pFC6~2>Qs!zGe1}o*ywQ;!+uvbeNrGB1Q3GiA#5fNY698f-93E1>k zdEm5aRi}XRiE0o?K)J>Pzt#hfc;NFr@asHq_MK4G7Eo@e!n%R@lyVfrB#M9(R}u`r z%Jh|Zc^oV8a{N@{Y?qKBRt_lajO2oUx(Xv8VDE!U0*bCp2uJ~gf_*a3?_0!46O56B*+JpWiqZhjpKPP}^E^L|WFRvRS zGE|0_Jr?r&8D3r=*ckzG{v5cQUq?Og*O(k}`r`XNkQl1fp7AVv&RGfDw9)Qm&D$GnBnDh=~Bq1^diH{>yp42zOls3)CnYFV^o(b9@)%aV%fk_zHmTNf!!sg*H$G}D7sTd1^9 zarsn}J{9|i`t|f+e@{NgKNZtGxuL%PEnaLx_~g{n`OU-We0BhrbR*exuCEW@jcyF| zNeObtCtZ3~YRw8Ym0r`{zCO{F?owBz5^0-#h%B;eBia1sT%ljzVIx~JL%jppbT*&Q z^n_n6wZgb^lu;@cKIfL2r$&q~52Zz)B z=TTWn%4(kERUhiYLdg?pltEq`=3mu)`OIL}HpA)BaIvuUV!AHfzof+$;;yt8m$~`e zj*C;9qk!6?^z{$*rcuu5X}y0C0y2ZcN`E0uqcW81&89c!vzcwU0ZtF+`iFFCDh_4x zJHXRJ&3ZmFROrj)2gSvQlOUh%E%szZYjy}%&P7BMzi=+g+upBlO>geW6qt_9{Y*(E zsN-sdR9B*wIgB%%$!|$#h7g&-s`^xQJrkq6;l4)wfT=Y}`@h`$ySe(kf#2 zx@ga55vb{b90FhR1V_8MX0o{-3mRp(CgWH9*xdk z=oz5#cj<;ciiCpRi^Wpu+n(>&vtlO`Cl@*b&^I0~Ia#`a6A(^c6#~CZ!Xpwc&O<~m za<_{gwKfw@m)p9iGNYT<@lsOc?o*i9{TH&z-9W^BwVIHCE>FD^d~$aN5*GS zAR{^Lr0+jB{8|ruxrEE{zgEI!JHH{}vR~hkaGCz65-!WX&cm-W61^-ZYPnSvZ|xG^ zhO&sO^&a>p4_tg_Qh&A~3ptN?(9_(f-7vXHZaKqM!=(}JsT;-y6NE0Kkq z`w2uMJdP~z41q{a`43w#!^s9gUy|?+2|wt8KOl{lEa%%2ewQRC>bC@uJ+vkWz4uG_ z8VQ&Cvs1#!cSK-wc$b79 zmhfc~F0T(8B>d0F5G3;VZe(HSTM|xtgup)@Kn02D<#phb621a$g8o(szemEe624Kw z_e(hK-9pYm3BOmu*`KgcCwnON1pR+X^fLaggv)VzJqbXfmmKG}bDYMo8(GLtah$q| 
z&ME@mAjzRQF7UsV^xh}o5tJ#B99*5)iDHiu;keqclg3Mg*V(9|j7j)x34cn$#hn*H zFH5*MTM~3e!h^g_uP~1*(%T^6Vm~GPatWW3=*8KIntvnVACc(k$1jxVbpklJJj8_*MxIOZYAc|Codyl<<#B_}3*|tWnhbj)Y$=(O=ESotn$HjHomf zb`0v7&3Nm1>%CQ!^z5jvH1%fmjMB8ZP*9rarl*O}rt^REc-|#lSc3Pp()oGaZZ6sj&j6>cYi> z?Mn#hF2!!CzI?Hp#BGTi+xaeHYys!JjX$^Yv4;dMVFF}Yl@H05(N|1L1! zsxDe+GB-E1ULY)`NU4$lR!ejFQd%I=*I)Jt%8?FQq9TfjX9y%|Kf z${Nh#R<8*^^UOBk$2zyLD-n><3Y6WNw(c&a_3aNW(%MN|fxNeotTuYMB))CfE%Q+# zj4!V?-e#973r_w|p_%s2Ka{^2yd=}he({|Q5|-pWlK?t1&{3fcSx#e7s}|!Ron?tm z?Db;rDgblSf0*;hdnG|m{X@Xq^dIK>g&xvRK05V34a`md_kdH{WxedDQ~$Tn;->!u z*MEp}l77Zotu8~lY|&&CjE47bLvld=oi1Ex{&@=9{OW9Sc+x+bPjatr|%N{tBU^!KL=f? z|Mz(4-^BIH{ZD=8)K9+ychfJ{Z1Y!WWDUC(|^N*KgRjR z?*@bbC;wl2@V8wg|D*>$`%eN;N^@RLPW?aj;NQgci}hQ`aPpt?;2-1sO@fiXo&48h z<8&Q=&M$r!A;dWO*Lv`affm1?@ZwK_-z|Qh%fpU*Y@?HOg1yOB|_D@CoSK9{xYY`Ne&mpcnBY>i+2X& E1%t$jQ~&?~ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_matequation_regblocks_avx_avx512.o b/third_party/libxsmm/obj/intel64/generator_matequation_regblocks_avx_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..1b1cfcad1a02c7d94f3f160f0489223c1e1fcf5c GIT binary patch literal 39864 zcmchA3wTu3wg1c{Kq~2+NU2Sg)@hw;s7Mo~Evcw;l5=1J6AUCk5G08tm_SHkG6?~E z#Lf(OdYG2mORu)It>3+E|K7HLwXIdO7K0#&^+|nz_9|MhI-(-g@)Yv_t+n@=*_q4{ zZtwT~4_{`^UT5v~+H0@9_TvmUnUU!}uh-*TUeD#8Ov!-F^Q) z6z%nU@ag?vo4%mRI;_vHww}`GRa=khbye2K^TO8iwPEYe;n=$+hCRR3u$N4+YXh-k z1zP+Ms1`fs)#5g|#*X>4_;>gwUrQ{*&CbMjqKm(X{h6@6wjgZJD$?!lkUb+*zC+va z2Ryp;x{#J^_IW&@Ur=qm9k$+ySZ^72B-k3UPU>^2t%Gla04OX%^|1|ly>8cr%IAc% z4VR(1sC5t(?Ydy=T&{f*1h`HZbrzcTf>P6NAeont(1}??Lqm2CU^jRYhqPO!g13<@ z?6Z4I^{ubv8Ra+hYwQ0Nqz!9vaDrJj2x<@91PKi5`e4Cwk3JWix(ur>Si(#oX|Z81 z4o(7Ia!Qq9y<|m#!L4uRQT<|p(p$RRYnJUY?CXQ2Km$>xs4oC1K_wj99x&|Y;P`6m zwVBB&|4k}?sC8Bb_1IQ#Wa69E$tm~g+L@yxcIA|6YkHBrXqs*vv92(l07z7>B0Bkg|8inzXqk3yoY;DnqY;K2>)FPit`DP4icp{0 zWIjTkKyEZX^&1EZ5{B@Wi1jQ>{FWUJwnAzMG9U6wacd#&tOEA3HKFp= zFjNHYlJPRsCXs7W--lcq55J(@{fEaO7>lr*Tbx80pPbT19#X(m(dy&jRj3Dnj9_tJ 
zscs#!9zmTgm-Ko(6?WtY*GJOWir^oD2g{wDauZQCxDs5;0u=)vlD>2=yFC~HQO7>> zU=HDb`a=CG$GK8J1yMF8KJ*3p{3`0~Rn`f;ZeA_)4BKtf!Vo5)-8a85Y(;}p4f~roI?Cf=xg)TDz_jO}vn`xL z1}S6{CEnBG*P+`bO;~nYfm!xO*?VEoF1iz;WRSX^y|nOw0+2HkZ$!WR01?(>AA6&p z)01_-ofc(SY@~S-z}aTlXO!59UTu96im}~TpL0=J z(dP~*4Pl^=AvF3PS;czSET4{m^$w8H-i4LPDIrJ`fi+9vD6Exk_mwX9aQs<}=8(Y$ zSb8n0Okf-sfjp!HP}pRWVZENIiqJKv*8z1BXmH?i^rfg3flHUlxIn5`CmX$nU0J9< zSO5Xd#9(xy{-9{VBi`s2%w+ha{vfrW9)qu}UkJg|{hRcT--Pa}$A;iX7VZp+28oxo z#6Hrn9sYLrg}Qe4JE`_G+5twZ1=?;c@$4|PRBAUKX6V~}H70_7VkxlL$PWtiN!r~Y zZKcB0hlEocu;-13e&I}LO+jeUKGM?7umkz+NJ|4jH8c$w!p^uUAp=uQbdJz4EAOK5 z!D$~F8wiat$k5z#MI3|gty6rLTbuE?F;czInf*w#fgU@Ouf^{mdpQj;N{esgvS^4S zqqX<|%4XSaAdk_tO^>9y=sEi>x*qDcqFbe%7xnA8Eb4!Z`eBseA3&!1TT@d}9&W#F z8@XBIqfv=GMSlp!_oSW%qV%7#_w?9tH2l3E4-KWhjSmHmG$V04P*aC9aN#B(sxxp52G9UN;|&}m{Rb&lPD&j=MmliMEIZ^hX1YH@=Cqekb0ZXq<~~Jl z+~xE}kw!SR*h&6FC>@7X9*@#|dm)JvdLaGdRLnHqA5j!-M@{#CS!x3TT2Gs08jNxg z;EpK=j>0GtB!Ig2z)_GLD*0L4Z~>B%{LVvz+e)#eV<1RTQ>*^kUfWDZ+&3cKc#frZVknD z2JC;^j(bE&El%-1v0K-EuwB>g+ioO{QC>Z1oH10k+nD&OVXb=zi2C8bm+iJ5wT|eA zKhUl1difD;{k5oPl>c|%85~gK6bHoJPw*giqG(GIAlmeQ80By3+Ki79hkDOLJdKkK z)?RB5y1cc23-!dq2lqFK{t|oNYrS2z`|zQ}?%oe|>t&2PubAb3)naD>PoMaqwRis* z$}>&t&!)9Emh$Qo58`?Efy6E?aVtn6#>US8E>U@aX&=(9ZPuO$yu36_eU;LjFomm- zZhe))ZoPGG7-@8kZXrD?3>wxsmPxs75e49FLM|g|`VzYh&3H8R8vJ}?IC&YEc{uc0 z@2AeDfuHg`j8{@8X1@Ox8AnLl!v_<)ql=`TDX#buc3aQt){7zS2Rq7kmkk~M z2l`k6yvTaqdI7+lNE-{G%1Zs_HO22!acVwN5o^hTgt zz(h7+{XJs6s88Glf)fnu71KIwSi3lV(_=e;Ru?Q{6Ht>D(FCZN$4$=okSn1doYc$T zjn2w&ek#p5b|Mt@gK_Lcny-}tNALktV`7S=aLD7%unrN^Fx;gd?2twNo{SK-Hv|Xx zdogBYU|(QZmzq`?{uRMbj zV#$OO%H^RSZ@-gqSR%m3!rjs zE;=g_NDM`%S;ugjABeqv9MkFOMadZ_V@JGwf!NUK=sB^WF%M9k&7EhB*{7LXw`KSP zw0_^V48hF{FHG!{^oqYa>J;?)OGX8<=&hbODxlD#`2jm&*gAnVfWN>GOTxDO3^*F- zvWp!o{)t*__lv11|l z+)gvuu-dp~F^peciOl*CaO~*6UJtJ;Htp%f6d4nv;JDNdU6csMKZ_C?q}9l454K|N zP;DKicm+4z2;!}VwI6B7g^Po&NC)GT&>Pm<6Zg>6;xWji+c%AeS+#f(IGJTHn`MV$ z*9ZH(y%+GB1IahRoS4$Rd(HC45ovA@4p1%h4ilX~yKj$aUs<4Si3W@G%>{U3YIi>Y 
zi+@Gm40;}5U;jQn%2{)8xUMG{s4U(b-qjnYef}kvtReV zY?SZSjC+Fv5Q~*zO{NzZ+7`IAoS$fSZ#TV1pv?s$(IagpBo{bS6eLUQe+AYVe(w6r zw{}1wUMmSJ`rJtR9HK!4K4IixK?RC&bIbuy6`xv|q&FtM?|r5GAR>bz{x&qk%Ww}D z@dFw0KRaCfKY|P5@;@i{5>%#kAhEd}MoC3Tfdu8xWkbT3K8rxrYD|3FyQka;PS6Z7 zfwp^#v3#`oCN#l=mwaVZV7X@jT%dOO8q=O#Kz?L;kDA_H`W)yTh7YIhmU@bZlMo{I z5xDV>(FVAKB~$PW+#wU2_$hR^?1doDKP3-TJhez^BP5PPHgrKfwjGv1k2Em#?a~t0 zAxqB^XxT>O8!v5H1O1^lYnS>6IbX(~Mn2yF>Y3-lPh&$+xEv3hW=7)qRTN<|#7cPg z*3>yDqMa4|IbfvVSD>iiUxFg=GxnlDGVvYnOXUa$!`cs(L=!F-jVPN?w`UY!>apEB zSVOIsX}U_-0m()jN{jH&ZBTqbXjs+9!O-;XP8~p&i}8JmVc%FvGkBPX7Z6{k#U#0g z*dylf`T|4?xgPntV@6uwNj(W7BMrC>vl1uh`cmtF!{&?f&8X-b-1&MiGVQ?n%WBw9 zOLVg|h|{SBxT8EzggRk&&~suP*I=b(t5+eJChO0p?M7aB~l$cNz>(VE6t z(|f?Mj>FzGE*9wa>hYqXIF9lph2yA^?zFY%WC6+G|Wa7<(;Gizle1j(VeWVn=<^^UHSM zd;_OWruBhgrLS?Q2dxrhLUK@*NNk&8TP(&&6Pic0?+iz2TTW4ywOFRd63@uX=JT8;*~*nTf9 z*B?UMrG+Dm`eCa*SY(|@Jq!-%{`V+i6xM~|6>s9C zP$=!2Sehii41q#S3R#X|s>Sbu6hd5+>5gHNx{re&z-%MN z!PA#B@q4rfDNvLSIIKzP#|($zsZC7lU7aVEy7#Y;y9*s{Ay582O#i#Vo2&l;?NY-!>Y#B4+&QuXelnU_ z{;?Ln9g?Wrv&cHV0`uqOm0s?6vJ)D~{IS`65IDZmz{$&vqsMlpdm+h|Xy#tH!z;QV z0}NoEs*Af?XiA+{0t4wF3<&1`%z`;^0(T zAkPfuvXhce1KvgRi(3vM@WI!4j6+y5y!&;`6DFSIrR)}9T6>I%AH?>-waKB7z(pI7 zyG18SXVGrm8r!>dhq z-JLiOH-Cg|k_+lRqQkpMrq7|??o+barpLCW(f^VIT~em83OekF?Ev=M(ecpX1$60Q z>Sj=VVh|(D&p?W7MA!wJwHT31IU*U}<1+VLLtWwhk7yXeI>zh6hoS$51LPW>D1$WY ztFS+S&4%B?f6|j*+7`n)08&EQVTg~%Gx%^LuNG8UPgYsaVIKUQedK6)-=QfmZJQNU zStny}V`#<#5!Otp>v^!eFoDMh&}yWq=Me2!&^}J;-{C3CHi|w;duZ|N07$Nuql;LR zrul~$g?RDxN)&0)H9_+!>|j&cBJ(5dzN2aXj!ZL~c^}$9hUJPm&3=4Gx>BV|-68Fd(Mf{HcAF#m4S$2r}-px01UIc#`#LCfX%C4|- zc|#%cU=w@wNc60uu;_U29_$rriF46T5c5$IGZaVYUkCte7T*LAlDz@S@@Tu-izTqF zsd79Q`JN8;jWEh@fJgdRZ(FGcX&$kq`~1=SL^mOENiG`JyS%+;iy@LnbhG?8!cASU zAH%tJ-v|2U)u@UdFU{EA*>GDHY0%HtINdWc9#Asw)Vo*rOb@>AmkJW zccY?H@i{tr1wO}U@z=o$F^i`-14l>qB3KrBQfC0V4a@9W{9=%x=SRrTrB|fh7X?p> zydTQTEA8)hW&1mhG}Hy_+25ZdhcJRQ$vpze4rL0zW@zH z={L->_o(A^fKY_a8erw(06NaHjWA_C&$$}0)II5&&;>qNs2PadhFuthH<^W)7*`iq 
zkC@)q2H$~E7J>-;BpNi56({up%!k)K1(2AbV5cT}=D;!ZiQfT+{=|ziw8Wy@v&Y*# z#ml|Q419D6CU39KTfTMnYMq=EJmr;U!*O`{q2w*9NPKJWR)cWLpd@IsDb6LDiC z7lOcXZ{HPS*NC5i#HaaYbhLyVche*LE3uE(djb4LET5$+!2&iK-FsfzKJZgd>Se&U ziG4O&3EGyT*#Z!W2y%(tDUuq9m0C&hL*zZxlwIBr<-$AY_)5fQJaGil)|ccWjhGe%r>Or`#+{r5p(aSa;+5fbjA(M`Mi zyZi8EdLK2h-4p)^_sNQ**7M1^*dd4n^JO$Jt(~wtMnMW2y7x&;{IPDo-l(l z3vv5Lv}^3dSS|iLB&e|ygPM#c1VU0d6x0$^U{Q3q;oZl8O%Nxeph#XsKr`z~@Q*aZ1 z-y3Y^?|85g-zJ_f=I?m0p5YG#{6xoO`X?8hA~{_C|=iqdnwyPfa!xeH+B`PM5n zPJ4$^5lF#tb;Pqksua(N&H&X0-x~hj6pRRJvgK=Z`(9=|U&k>n;<FAVf7uEW-FXe{TxjLEkJ6NhunLMP#+eTCCP~+^|}MxRGi-gxj=Qn^6#S z?xj06&pmVpAKVc94em(ZJMrx_Urp)>tpAb&<&ue+kx1MOED<{bNhJ7?NP-W^q_eyv z63a{CFbR@I%1h#;<;^Y8w;&P1qr5WXQ}QYV&c29)Kd0KsQqIH311izHwoDu=v8o!A3pyI+g-nqI{cA zPVWL^-1(x{n90Roz?g*;{t&9`$^3J<`@Ba5PDXjOlH>kaD}y-diJ=vn){B+H2w0G# zv#1z{3I*o=vqs5;W*c(sh~&`29Hs~k%Tb^iYnlsD#te`NU^MFnGxiCx%PAOKFB!Z8 z1CHK1_%#v)7Z&D9#owb**7yG*qA6wox26RzO)ro9N=I$|9aDv;fVsX^_H zlfbIQ#uJC4E!5OBb7ypLW{V~(eK=-1N-#P?&X$}uh8Z28f;DJu1q-Zh^K=JZ;M9(7 z%j1@(sa(2k1SduzPR!Eb#7z_50;*|G=6Qy0PmdxKnjf*J1S#d(YT_J__0|YGCvL+i zjk9r4-8$&t>(*mzLZOOcv&0iXHti)*5M^ZRC{7|FHFl&?^-dHZTfm|pD1VTle3Z`Z zLJV<8RybA#5MwmS=i?3N!38{P5|jKdVvHIw$sd5eh@A7&Dx&R7>sjuGEOlx+58{mX zN!q0zqH$8Om9UgT=vzP_ZkbEQzdvoJAyu5;E*2qBoH3)gC&J%j=#0hQz2rAg>Cbe^ zWL)f6zW%4*peiqt7@GMGqE$T|mX5*)Me>fASA(claA71^gV={~h}D^s6dfiWx26PT z|AQ~o&|DyZdNk)Z>}d$1a2(@8O8%{9E0U9Zh?Y9`*P^Rw^D`R8r!V>?G)G-9GCf&H ztLujK{`91A8IJ*&@3Q&+_XtM$w4m73bE72~VHF<&(D=hLVeMor4&i-T42QIS|A|kr zeU_sxE!;D879v6EKin(d3){d*h-F@kHgG*m$v4BHv*r}?eEtg&>k)b%hi9_%KF*U^ zk=#CD>mj=;OlyrLRdyMj0WOLpG59g;E`d>-I#F5v=RY zi6q4imp+#Ott=$XF~T@u-#j;By%SDiFhffERwRj&H+pRtwA*XKK)!yCwKtrM2K(pJ zfY1*H1~LbtAXG(RBMOisjxWPbK7=nU3hptJbFg=qf+4ZCM03SPa4f|vuGwq-WFL_F zK7&K~!7!XIP`1nI$#`Dx?JLHU0QpoMKVckZSN1aevKW=&HV9w)&!zblpN^PKk(5tI z+>SfM8@>G1?y4%h_hY>xB@G3tFf+%ZeD4!>7+Z3X7E&>rSbv~Ei_fHinq;EZ&BUzo zS9|WSwpU^U8!W?gh}`of{LwBzJDN2@6hNjfSZCVR;i~d^0WE%jNMQ{Ji-9-_{8lYm zV=m!E=pc9r3q%bd)a@PmgE(bn2ErDO!~K0Z)Wf)rbIl}TGo5!X#zN7P&Z{dVH<<`n 
zWE(N&GpeYMP=IDOY!i6{y&xs%^OoO}D(_}2!P0wK5T##P9TMvY5xY2S*A#$$Bluy5 zZf&5zUIX2J4CQsZG8`L%&AtIev9cJ(v6e8|ZaxA&UJ81j<1O>juhJK#R`8^305x%t zuZU77q8VF5X0HW_YI}Zx-HkZGkpE!Fk|NWt00?}w_+c_Am=cc#h?dZ54!zIwzTi=> zFO=^rK^mlQd?1Ppj{1+FtJejmu(%f>{juj_3H;pxPyjfDHbS;qGk*>gt(i~FGnH%_ zB6D)niY*}|#_Z=C*w4w8p~Mt0z-9sVjxekir;b3$Onyi_KpbdY3katk48duRUNa{{ zJHwClK)eWh(R;%N(sbfSosBvcF-$emJT-kDafH+0oF>0Z*G2NngCvn)J6Q}*GLEuv z2~i&eRZ4pkL!g?b1KVe^=1iSj zk=p@t=&UAZ_k?jvw8Yh5fe^$Rz~Q>)0$fk8mo1os7O!8x!pjB`ZlrQpQsl}3-cEj7 z3VKtt#HR$CX`~EwF>o+oICv*`ijY#T3%$}R!H1X^1>I)WP_fzwoycIsa4Hy1OZ^0C zhaBe(3X;Nyp@--rI;_X!)4c>d5NOafxA?DgVQ`~u9`0@`|d zua-`&i!<%kSnF$0SDaXJK7l}bL+2pexGd1HiqK$Lu|4HY+_uyzZ z!ddwdL!0q5PNDYw#GLpzIgj9ik@O|&^9rX-|JReDkaB{8XGKiZ z?w$~gq{Y>7#6BZ0!Ej!0f7Oz?q zPJR|Jk98=#H2-&D?=wUD((8wt$$IMfXIisY%e0WMl(AwjUc=D7KNuS<*j6^U4G91L zE}U?uO^GyD99inU#CkT8%>S-w-|_|oLAdS3TgwNn7ueaoH`yX_8oA%=>19Sl4n{)tS z1!YEwUjZwOm$Ny&VG+W1EygIyc4To;k*L;-_m^vfGn0`YAQgGRuzx7zf4R5V!SyCs zikltb@%r%o4crfA1(7f&J@yuseW-VW6db!ZSd>X}wRMj0+43wZNL6Ayg8cj{2x}%j z=zaC4_4zo%oqk{aIo|21O8)E(WC!T`#Z`6@<;c+KDWTA2s!K+yver+)*q@?v8N#ot zaMogmz2SbL83w}rAX1f_@5QRtzy%GwN*un1`i12GTHkk$~l62~EK8 z=HWqb2E|cWjukK*jHi4Hu+dnBl>~S98jxj7RqSLa+FWUGx}TWS9_m#v1Vvz-x0diq zIWohWXxjnXmajNFL7ZOEuC|Ja=V{CXM4YWre--iQy}in2Tj=)B$Rw}~HHqBr($=j5 zOy4=91~5GEvy#PoR!m;BaEnW?C!O&b4%cVGo5&bDD_x@0~zenPb1gL<0<^!0bEr^)* zDtOhWyt;KIX7;ow( z9?+Asv1GlVo=v3N57RRvS$73aJu+VEEa8u2UtxFA9@4<_==PF^)cKs_k{_m9Y*;%8 z26TFtUVh{J#A(@)c=cDM?gAG2!72H6XYkzV@u^`t`J9fgL>C_AR7v|w+Or_Sdz=o$ zVG**|u!do~tF2=s7pu)Z zlx`2*zF`ubQ%TSvlvE8qXu?^5^u`{oA6^xrh|o;SMpi=|^*dqT)Es~Y@M|22OHl)c zJ|Vg<9eVa?>n8(V>(krkJsRF z?`!s&l5qLiTKpiO;qt36&7}egx zaTI0}wl59ifcz*~SX-c$@GBdMmwTUEw2f+IF0=<;s{Dj{Yf*0~TCj*;ZC|$0sbAOH zg4buS>utgNv&{kAG)3DwI|9v}Ej{!iZc9I2ti^k;fsRhRhTCy1y$n0Es&3MxNoQnu z(_LnN29YHk!7QbO-!H_9%@lf$@2T>88huyOB4S5A4aWxt={IMr6F6O`oyi#m&K=Pm z+!kMS>VUpPDPMlUO8^`%N_0*SrJyJUMTwT$0h}UAhA0`LR3}QjuW;tg&7wpF1(!uPEIsO4~(gyC^*;O3$H0-5N_bVlxy!PJ?xsvR%tH 
z-5wuAmd9f~;tMyuT{#VDAcZHqU$-;7^(`NRDb2@P-%Ane%_{5BDr+mnOEP$PXKav= zabZJ@667k|FDUhtGG%axSigP-PsHg$)Wp_00^|6(Bmm>04x(Vd{_RGf(c9YbYaw>* zq%Zop*h$3FJGq43`d!@EYmNPdJ1cpP@NfVXWU$8^;efT)U}defSZ2Xq)$7(;IL1lv zthPQ#Ych@39A43D0H;|@h-R6yLZ&?f$&|)ucn>dMU!-5n`*hEO37n}AUvxf^kC?r< zY$Hwdry)7wpB*XE%eQIkeh8x*-aW8Pp=p1SUnu;xs63u3<1jFO3dMR_eNsptZQEX88$i-DHr&D=lJB-V2HO)sq19!8+>e`aF8eB}CJ2 zv1~+kG|MmVor{&e@yI9+XmN^WM6b{+@4y)J3lwpH3+XmaqvC|KReHfNIl6iV`D`D3#+Opv8h zG<151cKw&&5B2$?{TR=T%F1sAN@mQSH!(1E^4BMq1}-kWxUBT+Wm5tra}ia;O;G{A zsH}9Ntj8{hd+(aL9&dk<_ng8p1sl;`BK>?U!%XZ*56g*Z>{h*&hfX-@^{txTO)X25U8uq-#^bk z;H~yIu06v)Rri~<5&D3?C&SD(}M~;KqLFU@Ae=-c(41QZq{K~PuZS1(yGYn=)k>G@;ziS@5mG2*pe54VHF#}i-Bd^EjZ-t$q5k#Z@$K=~qidoi4S!dK&@SSy@qL%U=;ax3y}zbAXG57| zeZS7gRFfrBJ!E-T%JfayFOn3yHsl%pz`A_HUlJRI4&}SnKXs-*G{YY@{WTR*suFZJ z^f98TmF;xWhZkeG@>n(O`bb50X`rhhZ23{ov$yT5)WO9Uo}1+&1US~8i5 zSU3|*fVPB15pu&_Nt7yJRY8)R!YD|*jry&U)Gsdn0vH|2P z@m)a_>1GB15eewRH&G)FIymUyZ-V~{AUES#7|%#v;$DGgI-d36**saV72i6_3uJkr zEE_omnmM)t&&f$tjfmDPz-DC0&;`1+65o{149GA8GDIa$6ZOJb^;Y58V$`cuXi~ij z)a%8wCOjj(RGsPBHIf&}VM5=qv<1oGiJ*DwJ|+BS8u?9x#%x0!vwfZFkd3g_q?Tu) zF){->kGDUU^llVn8Max%mMX9(BusuW-ac7aF7Zn6{gH${PcZnRln-$*mhao?qkEAH(Yd*tY0HflwYV3S`&K%o_&oJLtjE&&l zke{<~Z;qXf&kFe|Ao9~+0FuS{n*v^RT}&U$ql=$WzFafnSu5zx7^?&089eWj&tE)s zeB!+GBFYU-L^|JEgu3ZYE_t1dlJoIRS%oi&n=sJAGKH)qpqbND;jt9;DTi4sYcG)H z8dmQDcQl5vB-;r0Ihh~T)uRQLdx!~hQ0s+bMIJ33k zkMlfd4F||l9((J^M|qyJM*t@S;`Mwp4R_{#1jtLz1k;cCCeQQ5;Q(36^8{SrUv`0? 
z=K}wV3;Y5XI7PRSSDxpqBY}YAc_z8Q$r>ZCy!7mM__I6_35Ekv%JWPa0V6=3=j$%; zjQv%FJWsh=OyB2uX!&6HB``n5xsdwgaxRSY^Gb%h$;nHlDjB9b&X?(TG)JVX8Gqw^ znSRIX1Awi=Ur~;3{Yw`(jY+xiDP-h=2LK=E%k*oW&x6KH7yRXnpXt{;uLk}P2p`|s z*RNgR80x_{69;&%jmj?&ehz)l04x{$QozUgGI3+N6o7Un$N4gG!+|$4JR>R3w^8T& zF7*HC0;hRdE_sGr;3Wvb<9wO8#B)sWrI&AV(Yb~3Gx3V&fxv&z1)uz7oG%l{c#a1A zbKz>a=wI#vZv=dtk49;6@q7%ox4PgDy1);+z&{Tc&nb@!e1!}A$1df9C?<=>mU`=?6ejT%7CT_RHvsx#*jK zkMn)mLH9Vh>?($zmxeEucNW7hNW-1Tbvwf|aelePc!1%VIM2B)o;}C#Oq}OD7VwjR z=aO?GlDu5-MSzd1rJl;#1$C9-$rqQFc_v3H`JjB4}m={7J8PkY-n z{XHvJHe^^W?d)3J(ALq_8*OOnzP2Zg(LePY0QW?@dzaESVnb7VZ^O!#mFedh3M*T# z>j1%MQ*(1SFem*YG*1xAE+S?`Vh@UeibJ8yjXRs7|LF)GCsfywSHhw#4Hti+AM`{!(Udtg$nfn0Yc!kuzdYvtAz7Cgmc)xab4azyOyXzE_w(AkySgrB6`C2G5z3Xxs5p{EHBo7)^M z{ap>v>XPOtJ4X)HvM_RM7j5cjB|7-^aRjiY_J(gxmF=B_Vo%$(D?8hoPaFT*7Q{mk zUx_ZfvZ?}*3ox**wWRUCb_2V6WheKDe(*H|FeqdHA?WyVIXGJwv@wSLu*S@ zb4#~pSzAXl`Abu@A=3^D=?Cx?#2mMmVvOuH;yU#z+dD z%_C7VmdVgc52<1hZRl%n>FDvKy&h+08t@bJJ)Vrw(;z2y%Atmi*|edf3Vi3`%BJoW zh@L$y-F?}QGC~Y{-p~XC4j(5nm_jf)8ug)v!P{Jh6xX08#$!$ix|+IMI-(8ExUj5~ zp`sXF-GzuFd?p(#tdR|6;XF+}=rVc$qCIu zf6vOMb|gWbW!){A(I!JKr~XUZTbeqg0eYhFS2*md?zU))OzG$+@9Dxc#d9J4aQ=?3 zKNApFHoO$`QU^}iu!CQ#!0AYW1MgGdD*jl^59!LLPxIAW@S9xVbRfyWSLuJOz*W8G zVt(l0Qx@*XIiSG5roexvz*W8eqQF(Yg^Ipq3LPiUCw*18Gv6n;N`JH>ze@iL3S6Z# zO@XWU5k*dweun~A@nZ^HrT+^BuHrwbz*YQi%roiA)@zdjSLys(fva>LSKuoC*IfAa zNc)pH&c~nAzWqwO&?=P!|C@q;fdYTe1^*u4&_#NE1Ah)3O~F^;2?b88at=QIXFqfi z9TiSLCQcW@Y1PcZU!}k=Rp4Jz+Fiw;p}~CTGMpo*`dHyeb2{y zo-V?twN3{=;sWn>f&at>KIj7fn+yC5%+Kk{mM5&hRej4buXga&_Ps`dtNOmCz*YP+ zs3vIJN8{!+nL?e?S#{3925K<;a0>ub2cTV3EAUEtdlxGI0Fkj=(+R_*XB z1+LoRuL@lCt1+L>&{6RR6u4@iyIkP^0Ee>(mg z_`3@H3I%?F;%~I8;oz(DojL{H>Vi+Ns?bGzuf(52XR6{qH42>m4_3MepLWL`d&}Q!^8Omn=Iw#|}Iy z;emE1FvG9Gy+h{>7x;$?T;)4jY4=8j&RH(-^AxyB=Mn|3(y4TT z7bxv->%SEuE5oN`&tDaz&%|=M|L$XPvK7im#G0m zK33o|HDJikrTvMHTvB5A9tD1etV>@(*)H@oR)J4d;Dri2qQD)Wq{q)lJJVyiM8UNd zEB48pe)D)fP~b(3DLx`nd7^Ws0)I{_Pw;UHJRp@P_;>~0uE3qXMvv>IQ3(Gm1^-S3 
z{y7EykOD7O;Lj-VvlaLo3f$?N^!SJZpP=9eWn+*$Ur^xBDex~U@Tszq2>)CKUZ=nV z3OuU7zpTJLlY3ULj5b|^?`XI9ZguXWE&Wl?Q5m|0Ik2pm$xx14Dn(UCz>`clU%7(ExEklZxO`ht>!S2+RbJhCfqcuYN zk!N2d+}Y{O9E!;yS7r}UD&=|@=5#9OI2&?0)m#sjoKEFWJ~eVW6rAn!^iKG&BQ*aL z3Ynerf0~5b$%WGqBkK^usjG;e$Y{to=dj}j?!C)7HkSKDfxqFR#;*okB#w{?U0YfkmUTC+Y*EL6|HVCx zZIn`|ISG*q(MOI;rXT4iU&0;1WZIC1hthY$Z?fxOfX9?u|FiWeHzk^G@~b%s&_Pxbo19*^2erLbX zd5^=XpDq78c#uo}`VLW}QC1}RNhF8AW52I}c`p9X$fVXe41^jUnCJvRiE~;a`A7J^_}-n95J%_e;bu^>0jI_YTQE) zagqJ0Znpm0UHI3?q}F*Kg=o-~&7Xb+H5dQKdIWx>_+tJvU&!Wv+=YKgCdtnGEt1R# z{`5a4=Hg!x6*a;Nf0~zM^QZY0NQ{U-fv<>)-=v4im976JsGo~}m*l@u)+PJXJSUs~ z8W;YJ=ZlKYdp45H2>ofFI2V6Uuc+~ul%M!hpUdX|kPCl4d8 zQNwu$g80+?ESo>=ljX9%6VRM}TQ~m4T=?(l7d7sb6^Z`@{AKg6#K4t{|GrB^Md$q< zNoIuoFGu}c_8(X+YJ?ykUBsW}kJYpn9}D9Xe`)44w<(v2 z+vYSx+)!Jo*JuC;7E*z@|3LD0*2PF23JKZ#zke>2Ny41f%)?c2k zDQ>ds)BAe4h4j29#4XTBonUM8GX{7&eb>SZx5H$h{|H%Ml z^MA{Q|2|XH&(7MpHVMIZ|D1ESzpDW`^RPdY`9a;fq&p4|1+|F zh>~XIa`bmV`%yoa{cec~j4o*yvLEfQJN2EoMDk?Yj%pUw(k0eC>W6)2%vg`jp1gfof literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_matequation_scratch_avx_avx512.o b/third_party/libxsmm/obj/intel64/generator_matequation_scratch_avx_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..1d376ef16ebce4565872d27c7750f98407fe731d GIT binary patch literal 8256 zcmbtYeQ*@z8Q;4DJOOuO#Cm>I4tlLe3pW=E&PA%;o9rfgaKJ$>0ZDPWBpY%ucNgzA zAvhMLN3h!^LT5T{|FKR-XKMfG==38Xb~F^K&{6A5$F_qX9j9ML7?tT1rB&(kyt{jO zv$u=bJ~O-f?(coR-{<|zk`B>m11)KTH(im8OBp)kgK6XD>ZykFZP5lnIwl$GvXar7A>i`h0^#t~ zAeS*lvyhWN*rvkNlo`J`kvdDN7ks;JCeJJZrPb+cMEy_Ilga7@KM`7wpLxXOt6pf- z-XPE5ke+!Bjl+1oOgdBgzwGgPsiV3(92E8A!uUa8Fi3oSYHYw5jSq5C)2XK)K%tD$ zC^bz{Ka5V=F$Kdm5~sZ|UgE^U&H8C_?5#k}SN-&}b3jxYKd;Sw_6lfq1D`d9)9}if z{(bNY5uO}B3Ge!8V3=5yq#Djl35y>@E`!Kt-oS=M==F=8qW+r>S^tkTeoDK8dP<%S z0z;j%Tep$w{gJRwe}}pNp3Da}S^sl4ctX7Z{lb&`(LRosZT%FN9Zd_T38*+SF~6x7A}^wfqZdiSSy9srn1YIFGm9mku9VPrbo83L*9Ef z1j8WfS|l#(*+?+0i;-wrPe(%X3t}XRLhHv*3Hz>uyvc?b38nPk$woR7l=Y8kE}LGH 
zjRw)^5nmvQdBA-XZ!}%Tw@weqH!x3B5D!kA(C$W#dj$uMte?&x;1xDuT0%kZs>KU=xjjt@2UO&7lh^ zV^3Ia{-ZW2Q#TQon?KNgEE#QKQvDFz=mHsN`GxVb&?_4dJ=9iO{{)<2=6cH509iqh zvXG}i3?@ww?$jNSF*}-HH_^0y&Qc?uMAX#DBvG*IHcLN+J>_7F%#fxUz#<7W(lEUPDH&L$fDdFH#Gz!Qo&8RT`1PTa~ zV>OP4zXM3x7=p3IoP!z2t(kN)&n!swrwT&pp(-1&h5+?Hti9v#r z{tnS$A)0z;4q&GxK{_+N59In|!s309l}qFjY|vamejqnyx-i@Nsc%6MK;lUCEguaH zO@>l*iC6*}*Be)_Z{8Ljxk{K6BjJ5h!uSutMa(PBsGVFpuhWms@`u3UUwv;u^`cLB z=qU6&*8nQ1ndh#C7SSK#puA0>(FEqyfNM-3SOXTBz``1EtqC;P0O(r;ZEM&XKfZWz zTAbw%^&3VqVf+27$Wx#LXPgXA(-RymnFxoIH#`v>S@A@JmNMrZ4jR_V#HIutMe6qS ziyy%N*!jogIp@_TbCjna1IDy*JDM>oxq7E>bV1tqA|k|C7?CG19`4ygiVthjRR1;v z!H(|=WI1{p^1(Q8@&r{V8|zKqr;h+X2pT*XpK~$~|his19*ldu!`$;b{AY z&Sl|5Y;`OiUKwB66kpx6DjeMmYp0ykOngOCd>Mx$isZX*Gv^y?@ZGrZir^uL3-oX| z(H@R_9G@+G!^3`w4^0FlK73$~$Vd0j75R8|UN@g;=aUjIxA2)3eyhmukoWR-=?+WBxhA8p~|YxzVQ zpWMjHZG2`U-yrhAD6mZ-FVG`+qcZ+BJ_JSy9|iZ)z&IZb%g}umd4PV2H_v$P;uCAB z9g)u{e0(#+ATlwmTfnUX@dS=N&`!L8t^CXLZhcKDwTduj8YH4{x2%hrlfe zURsa=)?rKTAj}e!JLre(JRE4@!xM8__~?PTgpco^NBBhbigx}Le_)Y+KA!*#!U3LI zCdmS{dkI1_bGUQ$1qOJ){X_{kZWZ+ufF3sr;L_nj0lEv-cLX@B|9T2=j!oKP32+PR znE?rKyaz7S;mZN>ak$i()M7W_8DMY*1{@c@dJ5EL%S_)I+Vt`VSJc^*gx5y@Oy20+ z0&Enn2AA}IA91*x^iu%b@D~AZ^y4YOBs!O%8GwRl^v~p<&KJOM1Kdr|R?0s!E_BWS z|M!sJ#m`9({7n!1O3=~hpBY~|k3hdn@tJX?^91m31MX(`=N|aqJn(;e;A^17-Spq( zf$#Oef98Sz#RJFP(k=f>J@8eCLkvzzdGKdE@GTyCIQNaGe!GeFa*EPFQtVZf*s8Ct zUU@4QOSi_kOYdL3WfgWaWu%xZ?^gO$(3&5HJ&wy)EUYx#Tk2C4t(+@X`b*`4y{Ref z>5VNUt!!*->yWZamb9cLrDm+44r#j>eM-KV*H+fL-9xf`>B&2txvUV%YTrn&%Gj(~ zb9JWH>+VZM)xhWB5xiR){BW+ED?qzE(yP^^#}c;!hY7;xkCGc2Q3`6I*3W2H?*&b& zHkuo<6prS~UVJ-ME!Wpqwixk{k#}Qd4kGAqX+-;Uj~R~`@yj{KxC{fvG}~AApv?NR zM%FqV74p61l66Gt&lmfY(y)@#6fIwXPs&_j*dt<b9eK1*K3GRz)hkrDA`6C(MAM zp;5Kht%$7>v250dGgIWOZ}oii07wH@%00tMSsj9J-%(X5_4mUtxcF@u?kx>f?6cve zEm#8O(ymJi6meSB?2W4Bo>C>R?XDpMxnkdts;K2MOzDc&2Mw-&S2?e#H5R5EtihaJ zpxlkt!UoF|-G$-EPH8)X<00L~*LvWc3?5?m>0uq-Q9u4dv-RYljpW21GhvG3dD!OP z&*12@jenWJz4Wts(hNVVr-#88Gx{H8@GyfPW$+~oz6|zXB#a9`Mr=P*9{3Q0f1cqV 
zXYkK4IDU&Fq5h?C+4@f~_!k)b?+lKoESvvoYfnQx5e6S&aMu4v8JzX=4F<>4m#u#u z?7v8;AHVl({5lUj>VY?V;A=hbZV&uN41OKs&&v${B?kY1!Ea&k00snvb%)VT5Y=L9g6iZzGKwXATP-c4PJQeqr3I(+Y1p>dO)foJiSwa8c zSjN=>r5_G@DpP|0R}<$DmM}XvP+<~pE@YK^t!D|e5VMw>8lJ4RW;dMjxHjQD`(Ndo zKo)g+`TscdvE=-A!Zin%lqAc9ZR?XhZsE9xA)mb_;28m_440dIdmXXArztbsoc0TV zx!I3f_H8?8A7gad{{>)f_A^$ZAAx?Boc5oDPB;6qi+!~1w0|BjH~a5f_U-=wwhvDG z=REB1uYw1D%@h^#O->$2rkSFxT zN|$Y>#@I50&{o*--fYRo9e9gnDCWTJxb0_)CGTM4hHXb39k!eeZ?WXzF^g>ZDTjbl z?Pd^k8~-34vVabm4-WH(DYfP8{GocMZ4`-Pu>#@$6eLuK=S!zNo+X@XA3)rL)-S>b YEP2F`9O2MEl5X-pu>8+CoO8deg+D&MfcI=ejiwlf0do-_FeYeovE`g=2vLC<(xA zXnTqRgio~Xm}|yh0M5X2JaIQU&$5dwo?vC0)oYbaW;V=n#mrR%E7mO*UwwEJTa-Ri zktib+&nD8-lgxCOQ^KtEdb#HGQQ583@b-)Izgw+mzrF?m8W$(_dH`nk(cTDsr-)lG>$#r?>h(WKe%2^89nl)sRXgZn3e{y$9oLhEMS@i-x0-@Kj`y zjb%lFrEwrGpNNFVDG`R;FF@c)76Q#sV5GmNcL(beza;;ylCBMer0c6_NXOwv%kO8< zcclS>aLg|pIj(|_1hEG{34*q1T0jtbHLq<3;Z)m^)~KsH)yJ*z$W`>uY5lruge;9M z%n>O^jihR=zYn*>lsZDQ&t;MZ*iJ!M78EzE zvLiq=%e0KHM5MlsdJr4lBv)o6r!AmeR{W0Sw8ONnif1)`R^yj7-mme4TSI3Z@KtEn zVQ7<3zb9Qwez?6^b<7-a+p!LGNm-6)I=~C2V*=0FHgKy>4)0<=s9J>2V2Y@ioCJV-9(x`D1kdf`3|BlR4?}c>#x$6f0pfeUz z0aAaF6H;|fBd6C-^*=1*UZSS{!_rr3Um9EisnxiZeuZ4wdgzcJ^oe~d8Qlk6hGo8Y zTx$F(_B9%Q^M6DTMa`4uPxoBqD^A^b`$hz5+5bDK>26aEZ$8!Zo*hK+Qx@GLM-y4@ UkkDJ=m*14*e}5QxV0iQY0MND*djJ3c literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_aarch64.o b/third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_aarch64.o new file mode 100644 index 0000000000000000000000000000000000000000..080905596cabb6897c96b6ee5be7a64495c3749e GIT binary patch literal 9760 zcmbtZYj7OZmF{_96kF+@7&~#miN`P`BEYPMbzJh|Bx<=w8kvz_l0eADt+70^Bs7oA zv|xCf8i}gTFp2jETU07lait)Y)c(+)jU1B25idL5l)+GWIjK!xw~`>1uow!1A&d4q z_c3!lnoi|$%j)j?^*P^p+;dNNj~|F9yZt_&CWlY^qE??tRMWy6>(`y^vQt~5U8b40 z^tmU^o6SLUpx6EBV8T7!pKw1&lwa?t%tQ`DFyT6pu;nHr>s>3-I!0HQ&n_4BN4CKo1`HY#bunWKQCG&jex{xvZn@d4Ze!az*-xE5%BG|b#9y?_j$E?Ve zW0o;zh9-SaJOGur@uP)Tan^3#Z$3%ho5uV*=J_6?D1yQ`2BXAdy5|z^Z^w+eKBzK6 
zNDf3omhq#f7!zYL_nQ07@^e1i`#;Vy^Y{wxh0_db=XjogO zBCW>E|H22j=|nDHd;{JX8#-AO5NB%8ne@lqvljjP6d*v1>1iLwo8|mmW*R@M#dM)Ju5w8 zeB7-QlmfU`Tf_S5z5n6T5;Dmt`07alM#xy;(TQ3K3=Nw|Of4%juYo~i1nF}xO#0pT z&YFe-|1H5aZQ8;0$F%)n#DtVWZk3>A)VlcEceyn`0O8zne(8x z+&8dqtZFv|=HL8|&zOBLplOx+T4V6`9A7533Cc@BB=sbijYpm)o7Lp%AX~YX-SuL} zjw;{R8ib!=(|F`5@*}ojx#!ICQz-J#VZ(A;BM@SqAWup6-bma36paltSZmnrQ|v;y ze1VMe%_PiP)d6YO+{0)ZY{D9=Y#1Y}U_p+Na@aNIUtk*8lD35yk zh=iStSMNi~^LTrz`(U`A5yTc`l*S_qlxZGid4UHB1~(YtrfxN{u=ipg&u+lNCM$^d zxE8{?Qfkg~^Ef6W1M$kvkaca6)^)D0)pXA=0sJ0fn%&pY_|Oy4 zbI`{wXQZcOMZ`mlKEy&}9ZK739w!Q=zTr@)yo7GK3ryuDzw<@jYlsANoPpIBnw-fT zkg3wP){2TXx#M7{cSoer%?(h2w4l5W%w?KJQLc*wcImtcMc0M(Fd-|Xa1aNB9F5s9djE-;KWZyoO%jLs`i9_Q~ zIGejOoXbw#6)sNa^SOdk3}>dNoQeEYx){!lh5LK^tngGid&n8zxRRSUE!wyQi%! zKQU%@0;vq{v)og7hgzS+0aZ6yHHy@T2V`3*3uv>a|am|3&~I!?+7*ChkYV!anC}Du~JPCGv=%`j5%fjgy3Oz zr1223k}YFye@NCpgxLKA^)qYB1HtLb;8$C{z@MS_CKy0;7Z@EFKs3NSQ{5Kqkhveb z`&r(t_d)`Dh3lSHwhZ4Vlr>}v%d^|uO&R5xqRfy(EsK+E$A6R-@ds%fSQk!Izd%55 zk?%&aHH22F8rsB{V?>5^N9CTW{j~SK@dTUEzfxpZLLNaoNVERwSHo2O=HD zHCAK@B=hL88PYPG@mqQZFXE*_{LYPQPC2*7FdiJ2$gCIBv(}Hg;!7FC{R6G}v&YJ0ol&?!W@> z2%Q0D2b?!SY>?f#M=yXATy@khD>$o>yWuFy`_8AFn0mrks(Au413t@ zCLUiOb6h)&bU%vx)A${oS?gZg+v&-al2Vc#f#N8toS8k~r)a)5X1 z?G|&cS2}moDoY=S?_K20Vc!mY#DUrM^tOx&};-W{2J0^1T_5C(qsg*k1Rv!S>@AK)o`{~#VnxUH33`%H1cnidzn&k~6M)^1zfhAjKkk9Q;DNv6fnSA%Z^qvX ze6_!xf3{D+aSzGk+&KQ(1E+n~O#Yi5_+!9V`|In=_6CfdA^9eGIOl<{hHNwW?H+iB z@W!ApzsCcwdhmS91OE>XTwC_xu>3(39o1}mY&tt?+tE!oY})dVS~R(>oenYY?~mm) z{PV*uPGk;>kCd9W{6Lw=7M;TMs56nv+9@WO$=#9u3&ciqnf$aI(0Bz zkl#Uno^NW}7HsxJCbM}{O}c?|ESGbdaMq>7hnQC)G`lCcv&&4{JG;C4;{*19*_Djj zTG4^01#Rrk!UW<;mnNJh4tP@6WDDtH+ObO=o9#n+yO2I)XHxn6MD|dF0L#u)>OgwR zb_(?2Sf~lq17Q(NWG5UuGcj7o*^tUYVKkQ=n>e&AsWN%#FHfhYCho?k=z*!+=N@F4a%4U)eMe^S231HZ!q|Gt8&c752NG#dTc;epfsZzj)v0TVo` zpYJO2Dt@(iRFYkKSCny$D7fn9lOFhM3a;|Uh!~?Wu2BV7;KIGvQY%c#H6#{8_2sYXwgHS1I@w1&=8BO$x5o&)pvQHx&F63eTK^t99~o1y}Q| z_GcB3Qa~8wzlz`Lfzx{)4U$*w-mTzQNgcr1De&{SymHbdM~fazIVmlA 
zpjgzRh4fS^N@(=2`1z#C>t?i&Bi6s3g`A8VFousgC|u{TOtK)60MP-3t*m8o$ZtyPo!LC%aoew5{XUCE*x78%R#p zytu7R10(x4;7=WI3c)cI#I@9ydH5l)X8webk^i)h zrG4rDd%&96-z9$fkl)J?W24dj+d!Mye^dMznHT+n>{CdM_CE*XWM7EkijSk>Clgv9 zb%fOku1%p&??~!6BlH{nm2-Kf|GFak?W!`aUl#h&2ApJR$n%iU=V<+?FaK$W-IRJ# zG9&aG;|~e_BMmoX{-pjm^qb{>gdX}B34YQj|3oSE<@%F94JMezQBuG-BI=*=OHiXe it!d-l1*Emp!xQ7}2J*)EiL;shcZC0UHYhYqwf_hADA&#a literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_avx_avx2_avx512.o b/third_party/libxsmm/obj/intel64/generator_packed_gemm_ac_rm_avx_avx2_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..ad9130465696eccb9d2a5fe266c3369199617195 GIT binary patch literal 12008 zcmb_idvH_NnZL4~M2#I?m%>(QNKjI3V>;m246tTP8(ni{U&A%n*fI`}2;0cmMt-a< z14+XGkqx<3ob|R%lRwfflig{PnYMIhGds{}DN9J%bORk|I}6QD9!;l3q$EvuVjd0g z{=ReWl|HU@lTF&gNcY^|`Of$L&O>H*)T}G=`4p%5lrJgyHwRUeiqGf!5z&n(pHgNi z+B&UGi#1p$wblmfm$4SZ`m>S#L%U(6!okef!%u?LwDfR!=DXno=ex3vp(s{s*x#)VY@zSL zIhd*7c?&&H=;5*jp&N#})uH1AE|h~;k9Dh8e}?z7cwXpMyU*eCf;xX=xJsR`ha<*B zYxo{xA{DMW`~YSc)~nhf6`k`mT{}Wv7{55WjC2V`tB;yi*0g@39VQ&Kf-3uOMNoO3 zE_I-J(dw!3i|W|<8({0iu4yyV8ZOt=BaKa3`u9HFdMNDIrfP%bqw6#piH=XHW6%0Q zo7oW5Ceq}GFA6v8?*iJV^!e})OkdOv?Ur(zt|!tK+{CIw4kC<{_x5 z9&D7lJ|gfH!RW*!>8kC$DuZ@PXlq@CO!%fx?ejImZ3_;Q(&wdK$3t5Bgil{~P2JZH z$(BqA)4CY7u9+ky`*RTL83@&dyv#-+`J`}C$xTppMNo>hrS9K-BfwZlx=D-WeWY9G zv}RZiFF4YxwDb!;5s0aJvhcul!Zb=am??$rC%#_}>J$5=k(Z^B5x{+l^(V(jEmqLT z7ht3`ZhKE&;MiT$)6*0k3XYYHTIaQxj14I!w7id~@7IKWW1ByeynVeRXUCHb)6|wy2V+_@d?^xd_H~Je+VfLJMKhbq zko(AaABtu{))IvgQTD$77FG3_j_w^m3w8=}=OZ_=iI>Ql$>fIT{L z*RNx}^GB^$vX8?T>b!`kP%7O%AJ#`vyU<5)(k2FcS~EG3XQ%!JFi^*iI9Vx54h1>d zBo+!(1X6Qa9eWCXMI9`B82nYF&om zHM+Gpoc;0p)6@C9L~T$g)q)#BHhqQZS8%(s-kGsmWuL<=bzW|J6E4L$!z+IzfUQ!p=DpZ#Pk66%;nF!1=f6n$YO%Wi8N|^*k{guO zc$O{12`<|+W%JEED$a?Y8OR7KW^MOG}a&DIqP+>FIF`>-?t z{&UVpxpN%V(5NEwO31kH{%i}#bKHGz9uRt>(x-EJ9({LuniN4!HCoR%SU;Di;9qI2 zQR^i*|5^(KRp4xEAX{`8v01LzuVdN1^<`};f>Z7IW#n@DjS}5@t@=#*`I1Zjt1o*) 
z9V2x7j5=0?HJO0@762T6=W3Z9COP(PsMZ9322E>Y*e`T8GK;a8ye;}P3ex4Y-AF|2 zr`GGHbwO(b*3x2N)u^ClE7V~9r-6`Enbs9C8RsVT;2NyiJFqRr;H~2!hl?C42aL}S zY^jpDj^%T%ZoQ6oTe7atKdnPyN442u&>IeYPMIqjZBj7eLK_r9w%7_?rqro$$aO3r zHKYdgI-A!Hdk2ZL3skBkF!f-(+&U+0F|oz@qsvB0QoD6jn6xYA7c+M(w|#}JAG2)? zaw}|*gWg=~rGKNQkKKZFlgBq)_i*(>o&SWch5_mw22*jT zHIC4AMBbww3LB>Ndy)He85}aK=fZn13br9!HRxG^<32r?`{h%orAOF^6Dt+s1=E_A z@4HHn4rRuK84l8|vfM;(t~mh<#Q6BpRK1ZA*HUqcP47~C+e=M}XqiUEm=G7*BM0I> z1uK}CWb8doZgS;~`tPy1rZwddq4;!C<<-n{;s0^eGp*n8ap%Sgl9wD3Jrga_zW`^d zW1U!DM9`w+FREiPta4P+v+*%F=P1^s@%uq5l;a{`G!aWBauO(&byQ$TYr%B~$0$)I zf8rBc!|j#>oQDC(Qq9{wr83l%igBGn}-LMg?^sbr;o#frZVRn)JJ%@5U7 zE~%^vEvi~nUA3foaj0T#VoS*ANJ)HQb=7>#6H5x7z6aMTzMbX1+sbb8PeOqZgp2hA zV}B+gwSn?~EUFCzA1kg6geFR&fr>|O&;wQbZqx!b>6_{UlfGiZ7kJWF8#sW^Gx+?# z*BChD(*s9+%@}ROn2E8~f@acJvD+!BtSW9y%BptcAE326tgBhKVMgI^MhS>P9(!hv}X$1I?jWy)F0m9~+12tNpN()rLjc_O2S{v}!m)6?EhD|KElIRv&g<5bN z!aEsD97#&Otu9F;M?^8U(q=$o2Gk?JuLkZo$h;YVV@I8k>%0+5(`MNUmJOVAWobVq zQou-pW8*d=A+e3N*gDLofH`85LINpGNkP*n&Miy_N)!q+B?^VPkxQ*Fjb8s+=bB0S zN_msM0;ahIH*sz&ON;ip*pb)j?VwUBt+72vskDmEtwqs2hq8#+MLLh3qDG7YHKJ3E z=z+~enLv*b7}5hfwZI-u(l>wSk?r5Mh+=5(4UT^PuKVd@kF1pUUa&s%<3cz8i(!i20o2yL}$=ioO?kW)5m+ zPSGB8G|(39M`sm~COnZ_bB6I;hj)neY_#T;DDMim8yFwWQAIEZBcGfJ7_YRDthDWLnR4BrU}L3sbM<$59c`KrHaHM&&B(DyM_njpc+UdDc8Dtw~8D@!r=0do9d)S z;vYtZueZcIlY?E`NPN8BTvMx=@ildIEz#C^t5$19UV z#IHm%{q{|UbN{^WfmhjECyC+utnt7thI9Uh8P56qw+BAOaBkOabV)`deYl^S8P4PT z&mQ@A-7b3R)c&h0wFa8Ca!!#V%Y;D35hBmSIE6T>;5r#Yejn$nZNb*WtgB;au(>hI2koc;G+uz-O`bg!A9QaLzx?aPGIq8P4esF`V=N z1;aW01rPk12Yw4%uejfS>tSz_#U;f2oU(BeL+4b-|EF!7;(I5zsV+BosQo8cQ6UcvAohI2oEnc+N+Ut>6@|Ahzs z3d1>{cNxz4e453Pe#dj{TF7ute=oy%d^a(i)9?1c|HT7;!2^HU1HZ^{uFrc6pU>>N zljSG3x0>Nx|K$wl{%m15rypTB=l>OkbNZ(k&h`He!@1le4Cj18u#p;>ZEv~ao&Nt2 zHNx$!PrSd!aC^%VxKcUtK!2*E6VKGJeD*lKRAOgJsoc_$>QE{>M@E#&;Y6~dlF-V( z&SmSb!SFBC3eyM4!bp6mqe`8IiObdL>0a)|6W{&Sw6v*%a2=#JX+D_uD0O3iuRW6zgp z{`)PrLiUg|&!N=q`EL9FXwM%a=d$Mb&w=j6{6hYptdJCjO-S}qNS*mk{*gXz*)(M{ 
v9j?)w{QV^Qq>_6+oi*IG2S{n6g#m5KHR!gV__|2MytB6ct6dXZugd=cMsmm% literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm.o b/third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm.o new file mode 100644 index 0000000000000000000000000000000000000000..6d29c66da8ca8ab489ebf59e4074bccbcef96475 GIT binary patch literal 2224 zcmbVM%}*0S6n~{EhzM0Nl3-*^Oavoi+W;XZn3f7%O%+p+cw4vals5K*>~2c&VvH9P zg5l<$pa&10Jb2*d%|!VZJP`N^mtoNcwzzu;WT9Lv1E!ZFC|&oVf9*NlUWU`T(NQ$!HRX8C08Haz!9fuIu>UH zC9^4GW{O!Zb4%#8UN6_&ei?o%J+l4s+@DtK`R{LGfX0Q%y+D2qqqvFLPcij8MQ zff-nfhfl_$6XXcP-V6|WnuSm^5*i)o?c2flgkO|vtHf)A5vc|;D(M*fYy~Zb15X+t z3_U^Sh~p^)P#7}=Q5dvK(+tAUr)h053@6))l*T;XDL!6}kFKD8R?F+15n=|8NWiw- z4)|NbarkK$fgt=-2mEsf9M(il*p^$j&2=mPL=?<5QLUP}ylGc?9(X1dg-SWsbP)WH zX*V{eW!d@C%v5)_jizZensJ)&zkr#}EX^k~=F;Ng^1_O_lAO;hnBcetVcSsLu*(-El{j4i-Lk^(N}O((&Q;-A4L_sdmo$7p!w-H9y>-B@(2m1U zCZfD2JV{R6*{r%&4!Gmm2fU;tS2SJV1d#%Iw*R4h(`y72;@ zuUD(0hFNQMS8(hTla7neLdFp#vuImYq1S&zc@+JqC;W33s3&j)5!vc=85P3(ogkqb zX|mt{Dz4G*KdS$A^re__96BYWqTZ8QQ=LZDQNRU~{rlgM4ys=$fB#=Mae>}gLK#T@ zRc=VdIfXcNKb3z(+Py+f`J+--Y+njo0g|h_6@QJml6BMJATYr8oy6!r=rkhz{dKAO zmF*i8g7*KIKqR$K+CTks<*#r`;G_Qc_8@x{{oZr61D&U literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm_aarch64.o b/third_party/libxsmm/obj/intel64/generator_packed_gemm_bc_rm_aarch64.o new file mode 100644 index 0000000000000000000000000000000000000000..a1260bfcf46162524a496f8a1be8fa13c1addecd GIT binary patch literal 9712 zcmbtaeQ;b=*}wZiS!ld>g_O5wwF_J|NZCnvL#LJMrcWo^#LH zJz4g?ujrY%x##}w^ZcHVbIxnMXLQa)@@T7l}i<4 zhjEk9V{~^qzwU`UPj8Pqe~*_=HI+?0A9rkhdBQREkm)4!MklCG9@ZvK`xPbD^4HP} z0d3+95S7mQw23dVj^B>O%iVhDO#AY%HhJViMKQI5rrrp`(6FgZ8{t^XQ@8jGZKhif zKXNy8W|sup*2T0#hhHX_di#u@k?-;Hr}Pk{-aOMmM4ceg4#ETCF`UzJ=lNl6x+{E( zkEy$KrgrF2M#-3rT}HoAdfb;NC-hKEn|anaV-VHXmN1`gcDX^K|8pQxr00~y+!UIDA+hOC$fSG zg1l6dGVux+ctVhHa=~vpzcQT@ESW5mlmv`mH477DmXu8AcZeT)q;yiGpJHcy21+=F~PMb>{rHecI%+ 
zH1iI2HMrrQljlQ-oMI~p2|TuVR>WGKEA49tVqG-CkE64J@O}-elDsV)MIO#W*K}6s zES8=9uiU1s+z(5%l6N>Dm9~JL4?z$JunB>r@;!k(wv`JbJpn~ zVdNI3g3stR*EZlnr!k>4O$h_8UZ`Pz5~V#=pz3j6+0* zITb9OM`@*XUpnu%KPDDG(}1>EDF!oc970{j`m7yqp3&XzuxT^LnWmVO$~vPoN9*(t zEa-4$?66|yzXHMol@-p4HkRJ;YNb@xfea>}sMHGW;8-{|wb{>bs`40+rU_r2uDXiGkjzC3)XJqE*Rh47#GPrmiqbdp4pb0bZ%Z;HL8{(=?lwM=%ZeL$) z?LV~1CMsQ6yp3iV_0)NO)`!t|mDz}+SWp_L8+432bJAV^w47Zx6Z#gW6f1Xy+&RMq z@i5jvs0fGlG0Y)X8nj>pJC*Urns)6rZZ>vfr?Tden(97+62__#$~@Z{P|rZe9NF1F zpFg*-FdWrZPLK|F+nvM|1_a3=cj{vQkb;=s@OQWHu|3Cmz0R(KS(%@e7gisz zb^BYS!fIH(^9ho5_dnVbSq>?5<_}8@{%qpw2%*o?I&)__6-`k=tIKD82*sJ=8p-S# zFJ`mW-qdKSkhF6JE1w*^H8lhP=$=8VkhPM@!eDxJt99#0E|-rKGJDf@WHfh2BsV&8 zXQVim&*utuF_Ik{u`~IRR56kpj%@GjG9x3Y(Y<#1ii=tP`1uEipJ%rt*yV_!?E@yf z+fsT#Tjw*KCog{vmF$mimyQPCdSmj0HaUff-0A$I(>d1V%*C8vnrnZdO>_e+_uOnc zPoYqtP~sRAcUs*t>XzsiyPUJAHJEUJNjQJoic*Th)#>fqDG3pGDa`cXd~;Hp>FUND z*@PnaScGK^@?s`xGd*EfhGyxeaP52e5XbK0oP-V|#WAG;N+C7_iCUX}9%PL2Br_vy zhw-+TaBaFj?5+vt8Q5a)sf>qWYdeBt%iz}X>eM~zjz5@QATwkE9Lz16>&}adZ2%xM z%y%LZvb!L>cscNSg*_4(GqzDm!^T!)D|3fZW0r&Ze1_HW_s~%OcnE)HsmjEsK2AVq z`64DZo{->ayH!Oq8j{1J4U*dw;YIO18OU2W}UQXBqWoAGUl{Fqnb(J&#cqq=9AYdRBv=kbx=|Mr*?P za}^2R06(;eRW$$C>5~igy_nBIh^@f^X7b~JC2?8_3FlnQnWy;_PB^c-FO4mo&dY9` zSQb!@OG;+>e*LTPf_9)L7^Z@1i-P(z<_#5^Ak!w%hq-7o7agxJ+S|u=G$I>-B%83L zem~S%s4??tCXPug%s0BuSCUh$Xp0g!{~#TC#rFbY1|O2Kl_laFVrDe7 z`C6FQ+R8S?Arw;V1*H#4$RHJ1DScZ(;(GAT^iej;T|v-X!>W%@E9Q&57$bVG_Acr)qqrJ>KZ@%bT&oD_Sik;bk*1DKJy%E8M6ZuFN3LtWuBG|< zmexqqw$xB0p0r*3+LmV0Z6qBu-(A}j-+0*f{)P)f2QbNoaMApy_N5U1w5#FI``gv! zp9{3Bk*Q$2+I0T~rrLb(5>s7My0AkX@NZSqH>&wQHEpN^d(?E7n%||S+gX1T!_gmB z(+M@djmd0R)17MmCf47frZ=kj-D-M+I^gyP1W#PegDs{GKprwT3whY;Vf?_|KJ5C3 zn9xS1Fv9x&%|aul=KZcprA2-GJgRb1G4^m(AfxNGw$sG`nUu$=<$bA z5mZjIvrDZ=pzbd+iV{$G5H$(`ys6g$uu!iB6#T4Frv()Ja#E)Sl#AYh(%$4#mR4}K zHu+OP!Ycyk1QfOd$`I=Z6!je#0Re?Jlf`W{dwi$=H!$3bpY8XHh}!UUiNBg3Rs(>2 z8dtbZn!o9RANRoD^uVvez}MsN1b&IXnt!%d!0}0v$Gx_F!2^HD1OK`Q{te)l_^ac~ z)(4C|L-KX<@TLcT8D#6pZ}PxzBfK^!Jm2ksf5C(2TORl^4_x^tJ`kdVie(LtjSgB? 
zwDr2ywg0I^6YHDlPs_Xi&ajG^>=6GLsc4HYk<4h(E{qM@ncS$AWP;h;?Wuo4Y%rJ2 zkJ(kpYE<-XW2IBcp;SS9@ciF=BV!h4vohK2>eh;M4d-w!XV>AZO8q|{T#D6^*xYU; ztj!xXY>#zY-9~#NW+_D*o)(nhI|>=ZlN!(1bsX@duE-Wr#gr`qw)W<<ay-P3H5N z(Y-Z7EJGv7J*g4PF3?9}p(0QXhJ7|>Ml-gR%?uWD7Nka@Fqj)1&g>O-DtaP;soTbq zBbmGK*|}#VH+ZXTdz8p33b&^Udve8$eP;zpCr5`yQWkQNtNK%Kp5*eWQ47;1mCRO@ z>&Xp{Ymm7d&6|z<(p*vfU42eNwCC zztsc(jtBmG377r3jBkm=FXLSjPG>{m|K}uJ_VbhnemN1N)$;FD%mbT*Cj~a-5bWo$3Vsg2eMa z34d9_-!I`0O1SL*Hpoz;e9{jVg1?VIG_p&lMuGp3Ks1v7ATEI)ArOu5mAC}{B7tax zM{o)J6#CRC-Yakk{A~i!2)`1Sz(d@B^591-#QN@8$jO%Int#iae^D7~ zuG6(ga1IX^?UvO_6eq51Y7}W3#i=xkpM$O~+BnN`qp7qtTu5e9ZtDM?zf_ZGsQI>4 zp=zu27I*lxsDM|XM;M=86}+v|TimJD{|H{wJo;DlTcAsvUjB>kUWl+(`%ShADAj;A z_O|-xEG-hWaqbxH4J0S#{BnAUHinDrUq=s8`z;r&9U`GG^6(Sr)b*c#vWo9yL0#+r z3|`dpKTiXTwuKua|7jfy`@(d0sEbK7~|k z|D!NY_PH2(Zrjg)GNJL2TSR_9-{AUmmXq7}xqhv`;#u4`bNyyn8U4dtKU#y6uhhi7 zm+QM|_3x`(zlyj$p(pSx*RPHLHLkzE=7q?g(5G`vz5F-RrjHhPZwlp~&J99ej6eBP wV}fbqNdfHupZ}C!f@<|?Ol#LJAa_&6?X4Q}+W3jHp8h!Z|Bf1knn&e-0oG2$m_E9CMmO39;ki1~8PiCP`zkeT>)6!;nT`97p~B zduMikjdnv`bI$C{eD}Zq`~UBqksb`2wFN$(;vPQbvr6{KMHS`9yzG6oxUE)hQf4Z8 zTfKEsZ`E7E)^WYL-a6CL6tUiqjQnIx#7b#FdsKS@)TULWEg9AJ;#Ph5Dg4@P!SOHM z_^Zju;rGrJ5~$sP--8FVeHe_`OSS4RrnEaI>TXlju~Ws0VmOQ{e;OV;QmuaFg>dDu z$mRbX`LxB&1`(=xTH`pZ9XutL*n)~eM#Z~O`5yl7|< z$Ww5gwIFm|fBGOqb*gg>2{}W+NP#DXZsD;Xs-rIesysb0DiEJKdZB$N~w-G3bC$?({OI=T}E+slIEdxia1h5dV$64(;zR=R!NKD|wU z?m;k$Sg)=!th4$mdq{!7Z;>LAk>@MIP(U%7TUv!A{a0Q3L<6rQZ_AEzK z4MdIJG&wnma2MMx6=CbWuysXmgSKEHCh6<+jZ?H8TTS%V&kVThkPo~evfjEJu`Wg+ zE@)b>Wv1F~vq+<3@CU4u9_dz)YwF<#8{0@pMYk8bx>bPDc(fpFH$b*nUi8(vruVB-P*#-EIj&h8v1*pKWNt zMkugaU&X>v2D(LI>mu|*0NM;tg@~nVNZ5{+#`HD%cmsm=fjU}DtgVxV-4qOu|1||c zAAVm(VI!QQ4`2NmmBi@Z;};J<#bxLy=?wux&>Jz8dOJ0dDXf=^oEVCrk%JJ$R*ItQ zq!2|)>gnj*D%1rnX)HrVW{6O}3&W}mYpIrA0KH^gMC?Tx7&J6a)Ka1yhe-PYt(de5 zj7_Sev#z7eTL#O%C@quqz2}V3jgni$G~?7&)+q{O8$6Z`W%?_icI3IKbunzctG6<@ zu34Mz{a|uZox2;M_2h0_j@(Uu51!uZ)|irh9FLuuXCK1q20CZWr48 z6u#80e7)Gt1>!>HgIwnG}eW_t1XGI)L 
z@-(tp3?SyZy?v>UK7w;)Kv2Mw_R8X@^)9N$wB7?@xN<^YMTJ!6=I5HV)e|rH)UjDe zIx2_sUx`8xE~i+axE#aLXuzRGAMf))b9=GNfpj+RPHCm+dzgy?3qU||SUp@Pl2J4_ zB;;}GZC^!6Q)27GZ~t7TuT1O~;N8A#Ubg6sl$WU4OllhALq5UWfUWA`WFzmKC&rGb zqqH#DD}v*Pc;osEQnT9!MuIuISGP(YYi%L^WK33oFkVqaORNI#=`+XuXTqr@LqwnF^h7GgT%A%Cn8s zgc=zwr&XwIDCvKApK?7LHzmF7*T`oX!xrllnbIgzn8McU9L?)F)iz!>iCR-NRYW`TZ~`npKA`6N@+Nw;F1 zlG>5Ru$%OSo-IM4r_*~V*V0$zB&lC^9wx}Ct$F5!;Gw-_x%3HUUL@Y4s;4g(=@#l~ zS$g4E05?p<$Q|*5qlvm%zc@zV$bgG>% z;5@6&Gq9PNw_dA=j5lj{M8;EE#i5;;M&5l+#f``-ck==v#S|Ob#Ejj284EGG4FYfk z-2**DF36aQ^;yh=Bc?CGWGS^<_3(DAh1oUODVwDv8ZC`NDVY2=eN4A&sgFS}yeXAw z1RpUUC6+d+R|~&UmNwguh*jFOj(++P;VeN&E+mh+R;D8punIC zp|?e?qk3z-_1}6+J@zEcVe9)X5$kvp5~0kl3r4I(nt>H=(*o*ee|{c6xBO6_sHTnS zTkz$`>qUljuJZKAcZ)9n!dUV;ss|`-vo7IEz@DAB!KQPLJ!Fz`X!xy-7IqS}6D<~+ z)1rz2aZjW6&{FHH3s6Ret@EaJNpAzz)D*$arHq<@|rqw^h5Bxj3v4mc8q9P{{~j# z%`dKVluwc4HK5Lo?*H?0R`!S}dbtF;b<@0^bPdOZ$MaO`sIIaRKBp5PSGjApj1o4qNGZ z>n+j3QDR)LHFmT}&U(Yu3Puor_R@#W{w*M2RV8W9;R#uK9K~NGbOFV)l7cp9;4qD1 z&05er2^wk`L<7A*m+=Kg_el_~`V9`A zDr$aqP9~es7v@`~T6&>q3q4)!I|h4uWA`Nb5(Dv6av;_p@7S8y3;<|*M{J-s7T>mm z&V}@IXXQfpXzGD9QmDGIcG9t%pGA0}20*n-b&L{FN^rWBuhcDD_Gh88x|J>SLRIB= zmRE!pRxGTnxU+ImsBCp&b0`u|Nqj+N1%09tEeKcNeXAAUj)3ph(iz2jp+G5wi~fVL zgCwNJANY%cus`@{VU0gDUKIA1J#wAluXy-+-Cs2_qt3tASGd^cf5KPe--q)VoX`4} z`=9fL{YQKa7%lib23I(PItEt*T<2g>F>$YN7p`H@m=1$lhk;29);PprXRro?HQ;Un zUx`Vr4wHf{xQQ}JQOq`OaHx_cUl4RzEu-LLfKMaH8Zbsm5>Fk^E1hSu$W?j;v2x9< zD+#kFuhHgIQUjzWxYsiaB9Kf97Ldpm43bC^z23u@)U0)#S%<;pAl4mDq&qo}%qIVk zuB(V^!wvo_-Cv>m%Mi92e-L4;@fRB=m@PH_eq9o_`#0qdVz^}O4T7-7 zzoP~sA|>|=0d;cTCjTa=nZo&oqqm=$htL~|!0Q+Z^6>RSZ{*bM!cJe#q^Tv3Ju^G( ze|C1kC~kDn7VN>z3LuTRB2=>naBapjB;`D)w-qTD*&9WctT5-7bn1iD4W}Q4Zin;? 
z6+%L;qfm6s=_7?o@ig!bAU@Gu<p~h(HeaAIgKjng^eOz~|!g z={$H-9(*tl{&*hzr9Ai*;4=$o6_tmmY1}S^t-18<$b&x$d}cv5e?*Oe&(S>eujj#w zP@}ozel8E*PPn5xzwF3^Kb!}DA`kz6%!8lGgDbzohNrwkiN!jH`Z{8<@yE~QNV<#2c+LKK7#|FE4H#^&|43`~Mcg4E;22%q=9jUHlUu;{VBi5VT zmiWyC@xiUvrs_!c_79~Jv4O-r8E#o`h<)FdM0|5%K&qFa$)%FZt2fb`moDBj6zl5E zj(cQvCX=aa%IX~I=@CBYNyazFlAW=RB%BlPOAThkdf0aLb)~xEJze)DoNa5Y7v9N1 zea-#uHQWFm9^@fj z5;9ZW_$Tfi629IXYwt;RY$frrI6bH{(?664|K~jT1%~r@eFXo{kW0^{4o>MsyEfPV z2E*xFn~QH?IQ{*=#h+sQKf>^{4Ci@%J-uV&AU|+>Kgw|K|LQ#WMuziz*u(H!A;Z;k zn1tdWxt#yFKqY<~?p^w?F`UyMWB5!){~E(N{pCFP&8%;5`cE?aql|wu!@1mT4Cj38 zJovMD@B-FnIR8$DbN&Mi=YAVuIH&&x!#RKPc^7ti<8p@4b3Sk8!GFnc?zb~}?CoK3 z2{AvX9GuwEzSGTzQw~mgevIMg8UAsGS2MYvVE76L{}ixKGJFHWZ)f-x2Pb`~q+ET5 z82>+I_}?(RjN$p>{;!Ol$MHWI&i#KQ^E1ciFr4%G48u8n6T|8In`>7W!#Vx^4CnFv zD#JN_zPx%h5514AkDSkk^WgIs&h@Ec_&jD;oZ;Nwdl}BLTb-p?JXbjMggZ;9z~5rHvs4OPDIeU~n~JyNni`PT zE$&?^u_L9FZ;q$pN_qR>pi(}N=!usTTK>m*dHQ28`~$VZ^dwnW?tNo);Eusm>eGaXwU;fXD=&VhJuLVENM`lYp#S^$q7F|4Ma9EZ~`G9w?u z9brzmGkA68Zt9J9KJ_%(()_;jD>0XN=F{K(?gJ8Wc)u~s2THxZT8w6Y0(0g45l1na zM|95iOYAL(;vt;5$v>yb84+eb$@a>pe;p?I1(H1T8=}P@$0B*j{Dv}j@+RoacgQoNn;f7z zgUB1#f9~~mXMTl;Zl5#1+=G*@bhy_$ocR*X{+{qEI%-r57C1gSCL YnD7jG?I*q-5;3pe(SN09g6C2Be*ssf5C8xG literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm.o new file mode 100644 index 0000000000000000000000000000000000000000..f38340f8a284a8911480a8cc367dfe1b4993ee61 GIT binary patch literal 4048 zcmbW3U1%It6oBt^SM$@<-NtBzC|O^$RXc1pS#%qzyUFfmk)_4jNzv#&feUaA)8*$ z+ZT$?QoNMAmw5jU=oWLoB)Z>#P<2kA>DM?wp_@}W!&T=gBn4-}AtmggriQUekin5<=o&37^A>7M$aO z%mH3st8H1$rnX|V3{$t;rnaHiuNn=_+E_7w*R8sC)i7JeIywoK!|hGqetvH5T>d4DP}HOagzF z`o`%%AWIYbkLsujP2%kDhhhD(&=ba3j2#P82Of*y@cdEQ`3DMEdO9= z^VN&9g{pS3Tz~gc0xgZ5`LUl5kkQw z$KA~NDncl-pKT8Z{B()Se(>4EBKyI26^k6t&mIoM^AB;^PaH2QD6*du#N~JvJ$3hl zj)KhRdp&}GF7PJ>{a*zx`f=YG=))TT?`H1*6vcyF)PDimSmb%V>fyk=^2BjocuR8s 
z?+};$@MlTRPbb39M-hHL6S%m4zKZa3PtX_l;qL+${rnZ-=S|{r-gY85-*2-1cY^-Y zAoINaDDYDPe;{yiy;8(6|L#WzwWkF>K(bq@)^^j@YfQCl)2mmxWE)#HQyaRiGqq+} zOf`*lT}7%Meo6SZS(r_z>VYs{1db`;)S`clkZ5t|lqMWsC_zt3w zja6;g)SHGl??;r!!48j#{L<$m{L!)Ud!Gjhnf%3(_caa*vi*zDjb9CqYCj3K=rbyQ z{yhLHZjs;qSAgQ)&AYw5za(v{FDc+BpbPadRxHnW6`#eGAmeuohAP|N>eK!Qvghw0G~IHW>{Sk^rnI3{v+_?_I&-ra)*|A qK0XDO0ERwfdwf&my$^`(&}H|bG-0Xfep`gu-=g^cL}x;g?f(Y^r0F97 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse.o new file mode 100644 index 0000000000000000000000000000000000000000..5fc2dd3aedbfbdb0830d44a7409b03441e4d60d2 GIT binary patch literal 2272 zcmbVM&1(};5T7(^YwefXLZO1sldafi653LPO42k*P*!_g3#l#+As)b+lE3{9o;EE9*jq>qkW;PJ+HGz$Zg;l z@>$l?4*YxKad_nu5`O%b4*d5HJZ#B|G=*I?jhe{amwCggZOL-k$XPifYt@8lNuC3q zj)$OB%r-3i^&cU}@il}nbA_3ywwt;nq&s7FpnyZMJklBU?6X;CW#S;91K8ZdS?Q$2cd- z8s@VYDocV>!sA2(&sEE1S-~nQRa|Kh%PBo^!}k*V6NkD!=EbY5tJ1wK`CMYeH#Vx7II^*Vyek>___e zzEK>#54Vh}cyC?0e(n1Tf#3Qc5s9MLN$aO~uj4gO(|G4b3Q}tRmrB!HrySmR%IUp3 eNWo8)^pFgfsB}jR^%lSQXR7{pN1iRw`H}h!r|bo=HUK_TX?fjjyw1}3*#0x zY~k+be4My7sOQ8jVpmXqOY98lKkriYw>p&XT<)&mI3=4Artr+ZC>+K^)g_@l`x9Z7 zp#8!@qR@{B6JV!csvFctf$5!sKX5ktJ>dskHzc9j!DpU?nMD0xZ^+d? 
zezMIC){@|voK3m2<3iJ9E)|4ezsMizaznPWQ}EH)Cqia7(?m&McFX+q+j3x5RZ=v|fT*ObbRS?&ucNuRGd| z0YBRuliSJU2Fl0%1h5I%jI%pJE(nT##!TU0IMe} zMEza;UD^zmu+`n9?{=neX4Arwv!G_HN-f#Wgr)vG4Hs~bceLOv1<8TKckUFbo5$f{ zlb?Q;%c)SboxGz~&MI!$ozIN9^3J8gFNfeIit(Z_MTX+1;gJfQx-q1Gn7-nGf5DI7 z@+bbwC8l3>s7B{%_{Q{U6*NA5zwkUJakv>G!W9}98nbZiV@-I&d1d>B#_P%tL;44k zbA0AKC$1Hg@eS~MXG3)tg6WIC;E9W_V0zXerY|hjhJv^SS_k?HO-Aq3zJcDQ)uFG{ z9zwrFyHCa8HHv(lBbYt`RerOlx=s~*8eEK)%}?MdPEmJ|y2I4Lzw5*^)Lo$NBeo*- zHaxY5bZ?XcbN@)rtpe$mu)%)G_+v-jzeQsY-1rpkNOt{?#q65z*4@z_61QX z-8huYd<91s(m!o^J}P)b{k)=ort0ryyExKLW7=ht1?A-I;~swco1IWlJWA^E(;a#3 z$ZqM|E}55JWh%Bq0k$7GaLUq8NjaZt@Sc-sG`ZE_VeM}la$d-j*2uCC%k#zpRENn#r$wspkOcgSe zOE5K1aD26jF`U;ySr>(!Cg<X4v{baEzB7Wb2xEvB?r{cXO1bHd`SN|`%1n#`OLAeS^3UuR4DCu zT-+%>EtmEGik@< z+oFDkt+I&O_F0vYKR33}PRNQSa8wZFCU9KG=;Ri$AbB<^GY%5XAA0L^MG>L3TXR8S z)ANwRqJO)b-HvoR$4~y~RxUSx76zj!wCM1zvE;}|crY>=N%U&5L^$5tw>#1wPR0i# z!26PY;a$miZz359_x2|GhSsj}Cx!=yH2-L9pFcJleafFq#pAJrmh_LLqS|mg8cF(N z1OAR+Nb*M`ql4Pes$~e1TL<{7vsBQEon2JN-U7%^w>``<`1A16^aXxBlEWJc620%i zxxiTkX~CWT_d66*s?)E!sLXA;d7ZCA@e9qBlNn(GPii!NK0oAW{vgJn?w>rXHK^I& zrkQ^jK*wr5#rQ)4g4t#41pWz*mak#?l`RPUs^FIOcd}2pCUbn|JgN`MRn__(QNIPv zdMI3G>yJ^IzbcIUk}1rffev~84ip>mcB(UR5 z0Ib|2HNT@hEirDSlp#7~V_OB3J0l{vx@2S6DI0rSq9IlSe2+l4XnbplXatu6_!j~^ z7`#sdpJc9Ej=@<%P!CA&LLWE1)PRl@^+RZCifVu!#N2+>_&z&A#n>;931_HuQ1L?b zTF8(Dod3U(D~z#Iqpy|;%p6~iPE3DNq1_)qk@|_Ng+Fl8s6So}SicA-0tfOIi0Bb- zULyuWs_`Q85Y-5}R3jsNAM>hymg>Z~{yJlidrETgMKk#<;)r7WL^zE1;LPu$|2_Pa z!-tusV3`%Xq70GdC6e6~Tw=uc9J72Z*oRFVL4E zJeVZ-@x$;a9;@=hH}Mei!=K~v`}nCiGd~kaSB zo8>G@3&%C_TZjx>+jE*QhF=dAK|QK@!o;du!OP2}A@pXK9}2{o@(?EJx0Lj25GOY0 z5-%yp740r1{WF(-8~LG!e=dp(E<4uW@i?J#!M_5+%n|bSDSv@B3&Q{VQI!RhU2TP0DB2;hrv|-ww zjOfRzg=9c1=(reP1a*p$wiHJGtipV8GRe@*8)T)BvD+Oq6nDr_kU}nnj13;sh37Z2 z4=PkaB9kENr^R)Tgp6G-RPpB}e9!mH$5Hs;%zGe{o8SAZLRR3oIv|hW_Z9r=@#8jb z*zgDb+KrEO)%lwmzTVL2U){Jm(D?Ph8h`EPNWWj{)$;hGfks+O*OHp!>CK#D%;$K> zv(!BWYplnQeH37>7u(}ny}m!KkiFF}I9t7b!zFub_g6~Z#%GsE-lp`@joztB=PHNy 
z<;pJa!OBf|wgRcxhCwHQPCP{dXoi3e6L=iY$MC)xPZbbx}?As*v+ZZ_YU;_b`4O`^9^^wwgxt=?+v zzSZj%@0CCpvJ;-j6Do@d<$K3rvYoIYwZExE$L!ZuuwN_L4-%eTBIfOPxXO9e={;Jd zdS|Ozy>C>>-ZND}?>USw;CHb~^nO$&d2iu2v0|_Ha88Q~@DaIp-24`@r#KT-zYRR;fV8JznK3gw1Au0h?y3nYmM8E&j;ZFn#q zPDBR7BfatX-2`k4(cWE=Xjn_o2UMcKVPOaCgKBtmSPPE~_a$OsP>o_iUu<+>c<|Rc z$XGlw8pgRsdPfTUOC5M$G=}drlaO7_SIi4ak%meP~HaZ$;63&wGBUL!ymNas?9&YWW#@02B+^rYQ+B`{4BW_O`IN7 zby#?ZSrQRF)z225vf&Tg@ISHP^iPZy{mVAo9tXOJF&aI*_*wMt+i<&`-L`vS$5+{K z`*}6k@Rc_Hqc+@bH$P~oksVAD_TFR08NwH;5xg`cpBmA6cj2ie^3Ne_q(#OwuA#qI z>*X4DC6io3A`@>0r?gn}SGxx@VSr zYkSv$5eelodz&={70a{Uttb802wLB0UySh?lc1ppH`QTlXlj|czKD#wV+kmlu-04f zN^?EMkwuM?7}~j_1=jxavmEy&%$KrX3k6{`YhPqPisROL%YMZEF4@4&Sq3%zhhjAG nC|*x#vDo(kap%|{yE*RfikKH|ht+hY*2hi#>xvc>y>kBp_rWIf literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_bsparse_avx_avx2_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..2c6bd9c529f9d755a5e7f0eafa542662ce2d39a3 GIT binary patch literal 14176 zcmb_ie|%Kcm7mEBFqJrOM2c~XW!!y!8a~B1VWTEO+uZOTc@ti6LJ~qXIE0V@A&@kg zK&-SvX9l_Zcw2Uht-Bw)OIx=e_fuQ=gSE0EO%z02tyZ@cr3&h*jPlFc21I4{d+vL0 z@-k*Ci0a##R2%7Jrx zy|6ir=--sT;=g7z#&15hlFf8 z-zb-sAC-1JB*(0o5t_-&^;sR$dUfy!@Pr(aW+)y{`Qee2nC+EP+X#&xW72+IjiIj2Ka)9)S)J016q-?4 zbRw;KUkCGB1I*|nVW=Ncs#eG$NV3n8BrBzVK8Yl0aRiAGdm%~#f1l*I*)0Z39`Hf| zx5iY{Ki$^BgD$=r7M|m4^nQx%4vXyzQu=THuWXg72ZRa|?SeN6VF{3$l+9+f*pZp&yGcJwn28ZF62byy61zn*`bROE=)XIY72=-6V*Zj~xsu9VVm ze3FI1z@Bw%cqucq0N3S;^kFIWB{H^+a;0Ll%G1^Ik$V1#ZtjqGLQR@^zZ}BW@r(RI zjj7D=CeL_>PZuTM_*jP@Rl^N}}8gM@?dOk+kKi5Y4z>-UO6RX??a+Ix`~8+#|%2X6_SW zQ5%MM)JE*K8ZAqHtp^JO55JTmyC)vU?HM7Ah;`U)#BQst850tV3Hz*# z9kWN5*=?`gZnCf^CQ1q0V{_eZkM6YF-F7>KmRx?LQl-e1iMYa_BnK>o0aP+G<~@|! 
zvl9&I=~SilL)!4$epQ<3pI!Ch`q_t4J9dg8TG3FEHvEPdo1#=bvHr6)hf*nLG&+3T zUyE5qtm<_C4Q%sV`2$$mz;)zvz0$+lHCuH#lzd&<^dW-P@_=Q8>8zCcHpnaRcnZzs zM^*D~ug(z#$a+iR-0VNGJRq!)D|Hx1h|JfDm7Uw?nSQCpY`Po~q9VzRx!)-tnt3rX zBNuPX;z{b9gy|WWz<7Y4O%8dhp4{**OIBu?q`tzE!`^6eD5O?Bv*ABIR5S{=_fe>!Ae7#J0vV!9*zzk|yt`56Du5}q}VFFetA z;+~Yc2a|0kbEhNFF9h2Sq^Ue^Qu-D|RZCr|8_-o??{n4k3sP!7R^R32zk})({+lkT zMd++_FJ(7JW#PjVPf)Qp&U^wVD&f=KD zGh$XkR%}>8)-+>>sBAFeCN|BXRZO(1LH1eqa2~JGac%Pfs;zEc%39e{b_a_;k<)9M z0$naodoxd;C#AQN(vK!uyFC1N$a@NZtxIAtsJ6^TO# z2U8AbPQe>p8*zttwTlC9=CyO6VzF}g5RE)u3&G2&S71Ym98<9k#q0lCY>8srRic{r zT=6mFM`jDL&F7E9JEScyeMTg!xn4MY1s3a3uRhAoy~xZp67QBH6v3!sQ&5~OQ~6T} zRd!iNHI)vORj|{Gm>iDRmJ#X41pLJ|SS{9E{zmvqtr(%d@a~ z!o(Y_m53Ug#kyc)Vs3T7Sx-dp&U_bv$>OVFU&BH*LSIN6M%2;JMp$*H}ZT>h&bxAP&S4s2D*O>%_sml;3OXsP;~wHvpyc&6(E3P3PhwfrcOu@CH#xWh0G zqFM_3jOf_EkYX+-j~*najPf%II3Sib$*oj`EUV`Sq#4z8nxvV+%lMn+FK9SZQ6Zvu zk1O7H6l3~kqhc=-%FNfK)M0WB{;Fh0j1%+K*U+W zXV!V^`LJqEomN}bl!hUXU?BM+FmAXEW%f%de_AnXikLYoIIpVvUe7>K^S-R`Lki!v zEd-lSX8n@#B>cmlm7EvtCrf?>@iq=w{)rGiLb>)JIxKm20Nw9f-C}hA5nZybYSJY| z2edJ2=N2sUt~EmR9mZcaSX5hzI_AxTaGi1Z^8&5|U& z9t6lz27w%-GH{s)ry}<5L0InP=>{LN_6`DqoAISpd_g3e2%AgU5*2Nyw2kb%YjujLelTUbA167YPQR5uEQ|8zY7GCQ$@4`6uQF7-(*EWcYts2e zTBj_I9p00yks=p@2%>&DavH=eKfM2+XH7b5iB&pQtT3?<^apl|f=q~|@b}cJx1{u= zu#qS~X$KN(Lv)Brd!-oP2%ymS(PmU+KJ$CnS3Unw>}quQb^m;0s>G5PvI&j+b`=F2 zqq)M0w5*9Fo@GVaMrr>*Y%wHlE@q3m#2L6!HRcDPXc3O62B!tB(y(V*jZu7A^22PR zZVXk>Lm*d8=Cv8br2T3vChe*(ZV+59O&2zk6L-NF!Xh!);xeQ6NSHmqOwbBY(R?0ukyN1QAO$0ll*a@u|Sc0v=J+Bl@rU#^hmX|iPNT%p)v$95)3&AmfJ{B<{p_QQ}I)XbzDz;Q|oh>rsk`8h=@Xogr7^ z{7ThL9T`X%o;gg3xDKZS)GSWPMx}@M6}Big#yKhNiM3d@j3{Ku-ZZ#!lpl=_zv-_r z64!W=7s2^9a1{ep$D~v*B&Mn~RL|eCJtS;Pq#9GP-?01|qaH#umq4UnksK5nM8((z z=Rp`?-lMEB#ui!8!m)RVqN1d(FLngc_$zd609mI(j>MO3s)&j2=TSw1t?(Xei7mba zMctf-3xZ)%T~5%ikc2i_n6z=V)FD#%bW&%vyz_6itxmLU3sHlH~Iqu!jQidNe4W8f%M_~F7sM**}o zCGW&6Ho~rvaI=SQ`4A2;bguA54?ThOm6g6XRDM)PmY~F`ojYO=7fIiHLc_IPiGUup zp*{_WVSuE2+G*fWN-jl94eRx7|<^{Zv{aN>}-p0&F8L9^EMy^l)nCdUMQgzKC 
z|Bq=_W`=;^;=?Mlp*gbPTkTSkszKPwk=$N5+dIDPW|oNf(DPX$-6{24PAv&G3xah@ z>mCp*$H4U#!iA?hrQQR;yznkQk7M9Sr_^~6pO>YXDBNhFGi}Ylzyg|S-FEK16Z185 zkI)%=M8cbJCF~U4&Zn23pexiSxmPv10t#HEg06tlL+@s@nJ=8BS{(5W8X{Y@PV!&e zCPXv(nrNU`P|HX}1_~z?=hWMeJ)`2pQW#mUWnAQ8#)8ojyu#ez#dVP5VFeWS(~nRI zJ(qbA#R~rwYNa^{V5A$|+v6T}D9W>JG5tV-Uq|={_<a``#e7Zh2k6jiWy2e_r7A(`oXXl^t&mR{)CW+y!48aNlINl5t3m>DK5%Yb4iwolO{{P zISKbINO4=0Ik*TJRweZ~j-RNbR^V7&bx=y)2JPaiK}sEiAmTze^>eF-i{9^AJzru2 zUCJhI!}o;5m+(C?5vJqCDrL?U>@g9O;c%s@r2GNYvF8}VCaq#PmJ*7jyh>bgL|BFW zb%B2^zyM*DF7PJ;3=m~^2M+4b5}cH~3QKk1png!`14Pl)fdgM~rIQJ*wZD%l91>YI z2fWN&S_F)@nc3`P<~RJxoCd#Y)=yxjI#H37H}M`&&83sje*^tuwC|wBotg4F+NpRy zjus(ac@xVFppV--#N#khf0y|R9QAE8=QH8)P6`>E-;n8B%m?q^zM9Tyh33-caF&S_#TvFskf zE1RGn#4NL<+mR)7klzVDB3@HYLy^93)HaE$!q20SiGi=x|MRA@2HQL-(+DaTkGZCy{JMCq<^+Gla+gL>iPbx5k%tRy)>t5rH zfQb76#Qls9c;u`)&dr;EOCHJ2y-#Uxu-=RXqUM4?y}6*c-mEUx`3tslW_m#Ev8lk% zOhM0Kl(`c*PTATdZ^+z-9)_INhE0D>uyor5B^cQ^vJ}8!J&yUj25lP8?3{ig27v9u;~(mofy+WHHwx2xxs#~x8HqJ{OCD+ z8Ti(KbR9@H0sGFpv09rgjopNYiy@+om~h)=t{Lj&qK_^MRWQw zMHgKH)dA=%PpJH>7*sJMCXj&L<-%5NYz+%;1iTrD-$Y9)F4EAUnYECv7Krt>W{Bs? 
zF=AnG0R|l^^_X<)IrFh(mU zg*r6r$1Wk}eiaf=UTjl#jE3|z-cA?&2#Ai!(!P7SYMrwD<{>SDO{nw{g(w`?F>4Eqx1s~r8x2J0^@oCgUH_uBbP~mx*DgzNa zq7*yeCU)$A+t{%Ko;uBU*dKgqT3zt@X|=&4_#B_61y4+?37*C0{4^HabX9F|>s3l{ z`&Ao&UG)06epg3CERvHhETk>%ppNRBh2W}Rx0Z(Ww*dzX(H4c4dl#VI2c!+0&vgDj&&Vj&vGo4#%=pIz4ODbweb(^rJPX4BV=p6=Dl2m3Me&!8Az z)7{h4o1-vTc~gA#y8cApKzm|!&$@V9_dtC0+O-A8`#M&x?oV{|#k&U9_QscE9r5nA z&ikFV`n-*HKV_MWx91Bs3}s9zFqZCn4EOH&&aE79{ONaiqJ+fjJQv;r4i zeD>TLsj?;|pp0*Y7o=(h!cH7n^`g02#-88ZLtJkegtZwUG{dKYV*z-88avbbG z*=TZe9jQCMHXsxlzvT}L-*pmX)gIB5oof%)gA<{Mv6Z^ybgY_X%Q@NTqs_rOhTB;oLP z*?v!vkYX?jeDQ2?!LM+^>3=iyB9C=Q- z@ZIoh=n#R2@ZEgBUI6||0eCDR$Ot-grf~QkbHUyE@!2UQUk_ImfX{Nl=^E9czrY1| z(_d2nKIDSCL3 zKK@S&z^9PVHojZ_TMED@iaYjC%l{7r;BOXy-%N|K`MUMV3&0;IrHq6BnhhuSBU5th z_H7rOOvZs{U2rl12i`z;g?LDB)OFzh>VnfT-GP7Lg1hD4>52oNbKy_1FB^!y8?L+H zbRXc*Z?fS;j!evf|H6fTjSGIv1^>JYPB*Xg5Z@X2ICLfw01q{8zrMl+cgNLPF1Q1$X1WUI6}40k~wBenj6Lm#%fe!!EwNTyVFZ35Ooir_D;dx7zC> zxIIw0X*w_xF3s`Z~JX$_QHam)CCpKg1wY z4+&V--k&JH$y0`6VQt4cNWHEn(NTumtLss>w6C#6?^qS@>}y-wVXgcx`X-%IsvVc>I@5E<3{yQTU)_u0Z zh(E0{pZ^eGgkq-`^sJ{kY`6Bi(>UvQ+FxRT=G*KXG|#56)h)|GtPbJQcpLr7pZ~u{ C`NU@c literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_csparse.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csc_csparse.o new file mode 100644 index 0000000000000000000000000000000000000000..566ce3edd38d07afa8e86c875e06961db014cfb8 GIT binary patch literal 2128 zcmbVML2FY%5S}!xt+m!F3WW-;CtI;=5-e1qnlz>f7;I^+^!C^!+cucwh4+^B)r*41 zB6#rN$*bValSR~{SN#p16cOrOs539Sc71&T+kx4c`F3V@XZFo|R4C8&BobgK0XJda zGAO{8-hDkAs#zF@ldzD>-znT?`K3Io`^;@MwwTxu^@gZ4BwKYomb<@l6*)u8R64_i zVrm^-+itz324TB6I{7mRF!L3c&A#uHdZDy^j%4ZS42h3)pVIc=yIUx~5%T2oUMQiM z&);C<#kC{yE8XI4fHCDpILVPlvnxH^{vmF76hl`L1MiOH~{R7YN z0K(7fz3|lK;Sp`d3c6D-{Mw7#OvZK)APGHj?ucXUF_gp;;wTAyaWG~X=zp9`bQF@I)9R8*YzS9MVHQAD$P_E~+MfHKKIevRh zHk(e>uOhdF=SyA%Ue2eWQLh9(Zv98dVSF925fEo+!A~2UVtIZxS9a#-=9UWgoO`+1 za=`&#)uiXa>ZVs$5@b+U)RR$KkS-$+Bmds(&;(O1fv=F=D| 
z8-i29!$bqGy3M9+;VoLOk{kzq(pUKGI-gv3Rz1;_c0EVb$1sj&`=8#Rokfl#^0nzA z8iYkBI%_*wdNiKqlBW2m@ieEDGlOhXRvSi|J@dyf_h{e->CyTh>i{$N+B`7JYq&w* zT-G>f8R&unZJ$6%e-m@QnTMB%(fKK4n?7iR+aT)Z-bTK%Fx&NqZCJXAGthff2&4J1 zX;&N3dzt$i-?ym5)xWFjH+53|G~XuP;BjYDZl*8#LQ9%y$`OsHoY8LwA=uFcnLZLm Yq|#?I< zRN2@CX9ecYMz-oKj~G`YnGY(m4GSYi+V2fzAMq#eyXaPws1fxq{QA3iy5mFr zO3floJFv*3D4Ab*MjJh#m1RTz?%)|+n`&x~7-#itl|ESxTlxfD|0nG))3X}_&|R7+ zH%`4rUg()eOI;hc4OQvcR{s(`I}{C`hNz%LvaoKwSoBo@CMiI{lKI3XMO$wmO7uce zcsbOpXQTeogc1&(PPp`meNcPcyTId1C<$dl_Idvpd8m#55&W*geY)N9CQKm$#CtS5 zBKRVimnx1=eC3uvqI~V>s)>Z6uZuu?wLVE|L)pXr@8TYI-yX_7?LS11+W5AzdTso! zKMtPO*Tadd$1mf~yc9rS+QGx%f{HSjZYqN*ib2oLlU#xzgA#P=pi&`EPZV#+xM2B< zw@3tVt~*WX8q!MINv$^invlmR;(f@Z)t4DBYU6Ji|Hur`plJvHM!MIXb59#@B*6k7 zy9vl!fsbHs^qP(~mVz7-p<(z21)cz>lLou|D<1z3(IcEqs?mVj)WC;Hf`uA-IQ|eW zQ!g(l3qVa1o`srl@I;PHJV>^jH`iB_TO~pYx3*@btmPEU_X6SU%B1&hvZLLn1>XPWpbO!Ry8Hp@g{Mi9q`VPmS}3$x_# zTzFu5dRo}EE|lN&HksKX#wSQ_|A3iLZ6-HgDBJ5FHj`V(zc7SFzaOF^$x*s1z2l6T z=~ACVFy}p)7d;4ONyvCFc;?4sYtjpTLVxlEdFoYsw38KK09;C3Q- zoCvo#5oS3NX4?sR=4CVgWUhI&1NUMiqDUnR^~`XoYiK!j&ys9PttneJAR)!x#^=RFZT z-6AWAs0n&@g>-7P&P7EAE*mEyl=(@Co_SZ(*GavJaulWZX1yhv-QuqxjbZmJ(pU$T znPLhlkd<5f^Rxq-XsnE@8R~BB;1d)niath*`jkpT$oNd+eN>rWPAZOQqpI^Xyc2pX z-=OC;{MMSKU=2zTdsO0$wDPX@Q$zAjdyw3_zxr$%C0|wL6$u%=D4-{3c-G^W5tHUTBI}#7FUr) zdc$E9Jw<*AC33M9ShH4TF~$Oe`s9te)N-5#Q!rbO7t9-rOXMI@X>k>Frr4}G<)A2b zPz%NWQ5$(18ghF`?0-%t%jn{eg=IkY34=RKQ6g&=nD3awDQq&_?#Mpmdtt0tZd~})YnC+3I$JxvUO$IXZ(Lf?zpQ^ z+E3x<`JpV7db9l2%GLVAqP9~t439qS3axyEO5!(uGd+FbeO$Ei{ZM0uMf+CiZNeqL z6cL#=wYRw)<|r(hQtm2kJmuYB>ZiFzx*=$$QZbDvmxEaZec{z>IXgUomr8|5;AMC! 
zp1_GGPnL7aHdTnlJ~u7XeD#SelPeWcRnh5LjL~M3Se$HjY%=RssW$S$$J5ifWd3^) zf?2C#>Jzz4Cb~)bDia-1CW+rUnG=hX3p{MG*r<9jRYk)t#%K%sdaF|3tZy{~|C~1R z01sS+^@6~2d3Q$OM42Rh=B>>MV!4^XMH00Y#G-*0W3<`4UbfXY>eh(gs*U^zYbrP5 z@*9|qcrN+R2%IS58`wE;V!5yi`f<0K_5Fuqt;aqXM%QT1)h~(|tNhr9N03%ZS7E#U zhnTL}_JymBk5(H$3mdNk_dfP(I`Vq#x94!x9uI>-X(>oowLC!@u&)!S6}?;`lDRN*-6eU%yK$ z$LvBU&elf$9tlsrN3x>7L~6`byM}I!W*5N==;h<^~H9_`{RS1>4Cvmva{>K_|8}=xjPQLE7gTxvU4yMk97_W({CwV?hh^%Iw*rt z-m#Xu(ok3TfUlyVsco^Zy7K zMR5&#U3ZqvE*-;kI|2MIllGKpZh)rxQ55>f*PXX$EnW_E+R zy=05pU8^P=)ovXx`_%3*y$si?`(3(v$knQjxngQ}gPN?DU@yR0fQ{;Q_Xd~R-KZwr zb8v@rliD3olg*N^7JMz>Tdh{>YCu;jYE>WnfIqcr=^Ax=dvWINiqe5N&Y3ZMM7b{? z19XgfYI8A!WDH67doFdj5gvz4=Ni=S!t*1pZZ(-whohj7JN0m96DSR!w5r4F07n2f z0N!B2&4>n34L5;SvQZt@)#L`rS+ZHYg-lvk!P^dsISCr+bWfKEV!JvV!D|EasRfqT z73+{XH3^T%DzS&*(JpvY3-)$33GHi`qA)0=Xsw!j(DdyMVX(#Y?QP*r(YIFg?JC-C zpv^0sANjt;c;)8DvH9+MTlHe*Q&4L_w)H%)noHZ(zh0iqhU@5nA=Te9n7`r zFYr4jgm!|~=s*jB76DC%k&sR9dbB1cS7=Y_3qhY9Uv2tOXlG(R;gK2q$KZS3H?FG_ z9U=SD>5A@FJj!LhwaBr$i~q$4O9=a%H-6*&&r6ZmYmgL7vP%4l6c z`5j7kEY;J$yEmRwF*-IW2lsTPdj|SrLvd97fxhIPbUZc~-|fiP*}Es!)7R%n zhip$O9UqJ(_VguVJFv5e^>*%v_c{uY;l6>PoAazM?rb@4Fgv+AdogGFclGSXT%*_;wnAZFMVvkMjI}E3z77IVh@VN{>&T!6u#sNReaIWVghI2hj zkx!|SpP$Fi^5;p0(;nKwpJh1L^M(U{-U0tFhSS!-(yt+(QX_x3{(u9%(E%TJz(*bM zKVvxe=On|qKfiLo=O8arBmWoRXT^0d!*6GJ3&T18Aj3KTpE8{DPcoeIpJO=Zx3)s$ zC+Dvt0F7>ZKK!Kv{!@l?J)=U)%yB%*@H_Bq_0RJR=X&09!0CL98ky((v@fA%$9)d? zN(a2f0UvO{zr%1oPmVI2uSanpWBSSYX@5hF{O9;thI2jt&2Y{iWb>1c-u&! 
zPdMJhaIQb@fIsAb8w}^`>nOvy{*w;)2M%~Cbr_nx-|lk2moS|B6J@dmZraGo0&rnc-Z|MF;#g^Am*p;E~!dAuJx;CGsz&i4KfF`Vn^bHJZuIQQojhI2ia9Pqj3C%fIBuQ8nK+3A2E zWH|Td1jD(W3l8`k^C-gZ&vJ%yJzE{{eGKRRh(9J4nOEl-KiB^tO+qyC=T7{reL{*r zG%8~xYT<7Zh(`Q$u500!2}C2D{_tVp*1vQU{`(By%<3tfJzM-E3@>N+pPTp}0Q(}t zpJDiy82+M(lmE0tEd4(-aq{PGhA(6FdLhFPIpD_~@DCjDFSB~e^{;opGYzCdbB|LJC?SjkzLdZ;hmxkIVMcbvR;TTkiua9XL{*_rNC zDtDw(O66d@x3iMa%Fp0z<1^A`exiRi+7RLZ#an4V&oSYyR61CtRN{Q0FW!&7=^seP zEAh9K`%-CqiJKi1?~d&n?Cgu1Bk{Z4XzkL}=3_}3Yv@v>nadzY`68YOGpA7wugN_X zKG@~&0hvnOrf2sbG`D{8ya>QqrWrfxvHTt_( zG5bDq(OBo7f*BXP{fB`Swf|))QDYh;If~J4|21Gm?NgCPv(8iH+f4hff>6x<@O@I^ zoH<}*pKROhYoHSgt$W^TsJNi<^LKjp(S^0InDX?^#M^#T-rir0c)-d29X^vrQt+w6M{(oA&8jiRz_Yo@$)UMIefDjEXMWh>b}0cq6?w7C}B@`oF~{ IV0%^m52BL2aR2}S literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse.o new file mode 100644 index 0000000000000000000000000000000000000000..a492f54fc80aee4708614982db85b27c44fc1e46 GIT binary patch literal 2272 zcmbVM&1(}u6n|;d)>_-DEfgv^o~&ZmB(x<6m85B!prI{kExoOiWZDGt#qN@(UQ|5R zLXV!jdi5rVcoc8qKj1;ID0r|p5h1>}ooU8(1EvpV-|zkQeaz>3n9MBn1OlKW0JorR zDHh<~$u^F;Fb0Eg7MA0QJIQ&LSjn=I&1%*1HscLmD)U@fu#K9<;%n7D-9i>SX6^W;Oz)sRXgZnBBg;@zt(YD^m;770bdq3Q4x zo5+d+OXDJN3NrNzNcN+-6Q1I zaSZz`>uCr6J@Gia@(Bq){!0h`dj}piMO9ebsaa;7=kJSxY1cPJrDEo7%j9;QTedLr zU}O>@D3@|g8-M*rNH@MF=T^QrGu2&8qiOO+GeRGHPGF`oOLOszxwNpbl3X=c<8zs$ z3AR%ZmIWJIR>=_{nk7f=V_1S~sE2UiLvtl6c{(Dx1&Y5ddAb3#?~2cA{5g$h8b6@% zhp&p>0$|r@=V6FRsO~8j$&X=gR~()L!*;AgT~x>sO$UqucQ_b1+Xlm`kzsUkPLwrl zL@`v9xj_lLi3THIt5if4XHl&=!oa>VX*+n0WIjvgKE7`x zNAE+IF&XcjmzuxweTBep{f~%5QR}4j)4Nykil;K(+DJiK)_*BAy>-grji;R6yMq+` clt~ZB&_t#?VyL(H#Xpnte|O}02)yxs0G5F&_y7O^ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse_aarch64.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_asparse_aarch64.o new file mode 100644 index 
0000000000000000000000000000000000000000..a45aa25341da9cb4d84120386e5425b41746e9ed GIT binary patch literal 10040 zcmbtZ4{#gRnO{jRQHjy2!Qn(nLA2ZAltX9HB;!z*YMm8chjrp$JNRELgd_jMwsi9P z;(X4Pz3HGfqN6+%UySr*}YRI+>%>Rs>3+*8rtK z_xs-NN-v6*mhfiex9|79|L^<0eVcqEs@>&wxdeN-ggb@O%0>lY>eHp?2z!hO*9t2H z`QB!uAa9jB$@QoK>V6 zIg@v7o@w7*l{xBaF-~X@5ZT=>eF4Hc&v%PA33BeiD*4tsA+&z-t7>ki zn`ia!Yq?R$o9mFkrsi}>m2)iu|pk+IIZFNKj~0tW5A1zb~7A5UM2{cV=m)p=6|shpdAf6i{f+= z&)8YtD@o#XBX~80uvUSMLD~3)3dvpyOGJofUl*s_t`Y<}bHa7NKe>YeR{@C5Yyu!= zzXin%906z?jn4GBnYvjU|HqFpZjXYnK-&wGmJN->Y7DUZM%P)&Jn z$eufiv_{U=9I~XLj@m705PZfGhpc23L?@?+bG_0RC^$Gf=tsG=BYfTw*Str)uOVu@ zq@d&##u;9e;Z$`Mk?lJIRSLLIfiH8!J@4`F5{+t!0?UXyyYoKC9}sU+PzW7pIVi)! zuo=l;pt@yLX7Oe>{Haigg0))7ZI>G8_+UqCH2X_2dmf=gq4ZLfw4x(a|47X|J4BH( zmgZe(7(LtMUU{3`C&%&-jwnOlhw&;JUL>Q*9n(GL3Mqh-a88DI3OplU5q!%zR?FF) z?-x)%$6d<$QK??ce*I#h09jHLC*oIfV}2_B$I&H?7m#)0+|M(=_ULzH3NAg6DY(V# zXUR8erszHEC}obh*kKz>?6@B@A+v4VNylxsjT(j6&ewq_06KY!9kw`ooE(CiBQ91Ys28DukA`(`N>ddll(R44febD0TcmcK5MI#TR*`jIYxP7;@r=WDrN6t76X!;Ufq-?DmShHHm%dUJ34Ux48RtVw6 z5qJ0u^AYS!kA^;rjD&l+#z6jw3-kuEa|_i&T5#%C-zl}GNa-BeOPMYzGHz}8G$c^P znuG4YFC3zmjJ&nQ_^I5{Y+MW%=ra6&_GP$M&Q-GpT}2mRuMB(W3YfwxTwY&eh|wbv z@zLkPXK*+57z=M-$o$cjd9e^aBO6DtmfwEM28?60|NHD2{p0fb--+LR52_Ud z_n<{__7uzITt}^DyrDqXDg_tlfO*77*%sr?X5%*s?%TV`-ZtF0>0zs8{HhJTq;3oD z3Jsd^?!90yJRyAtd)B1FX4R7OYy^S#k_PZhA#|Io*)^IP7YYj}c8XD=9Z%5_ff7BZ z$xm-vx@K2PW;a|iPJs5LIQ#qN)o-#=HQ739nHR=OU}OlrpaS@7QBmVOE2JK z^32A9%wF=)oBtD%!F>NyN@QMj$xvFwoZiEH6=z?DxAGP{2nfmQm*loBsIxkF-bNAl zbMBW}fGA;%4&OgDF*X((OpGVeaXp!irQ-b$B(}#UQiBQL{S)a}d?FQ3Pb6aTc)EXR z-I`!}cyLG$jwg2nljEab2~JF=QpvPF5geNw)rV7~iHTrxAlTO2q69}1}WGPr8>$mYV#@`yoPKFvJei4N!(n~$tDBpQ@7xu z7C-e6ebar0p1{?qvbv+|t2*&r(yh>kVk>?btE{HYe`kT#JlgzpObC|`|)6y=? 
zsJi-1@Z6{t>*DrvxV)WQQ?h>zwq}k?s?WeKC)IrN{K`ZFJjH;BBlr;G55*H z0F2~if%p<}5sQ}8Rgz`x!(8Gd8_Uf71k)wX;UgC}17^$Y9`;==&Y5}6D$;(lZ}!?ZUb%UjPX{(|3V_>B;9%RpL#=f#9<~@{VS_r5V2+p|gz5}jJzJsoYqp)_M!(DAt;_z6bMNSZ}u09hhaT-5c@H3;{$y zfXC&+Q!5a6#HQ)J+E=gn>SSLn3~BTQU|6HiyKy=3GRh{K@)ZCH~6vWWJ{LJANGA`i$iQb>WcJz@r34S+1_T7EB{+VMm&-3U^$Wh2)1 zS&+y!th+IjC>-g2I$!i6Hqp$^cq5&0l+L)R{EY8&@AeI~`ckTIO7ZQ2%e!&Jd!gn# zEMA*@_qiX!>ByIzj=a3rH`L=xZT3ycU{t{vy_6AIbU7PLqAGmcvhhZIt(i9B*sVN_ zUTR~L@8C-J#fmmGE%)@ge1}%v?ORwG^_^VV=v!K;_+G^C+)4xnKj%H@br=-KD&fsS zrEO2ee_0d+j~()A9TR~)dByx1708;_Lh&8NX<(4DhK>C4!H0k zE{vgmA*5}nqd(y*xp2nDVx!4qs>o{HLu13^6MA~GUms46$Ks=tvHs*(YEnFQ6Pt{$1jO$zbF}LjPa`2a-v>f`KJB z{_F)kHhyWmJ_Kvg{B(j0wWBhailq~Sv9Wk6H9S68Bq|AJ-wMOy!+LCNxIdkYJ&;I` zCq@yw@qyt%TTGG96wZ!gG=4wm^fY~Rq>BW$UjBrWsl<2;2cC$JeIQ-`XcFHle*s}x z%7kzKK(R!fiVvq_?380o+U7=Ci*3H^uC{1ofgR~#Js~8fhIQc= zRsm8cq`Jeu8nAIA`UMTbKY^c(uSea|AbbUWHs0@mf7=0n-T_~0y_!h=wVeE&98Pyh zTkmNHJV*jCXyWO+Ih>dOO%CV%`Z0&Ege+Tc9lJQ2@_9MiIh>cHS}mN!@$?Tm;D;UX z_Z;wgd@9i(`TRMKa5%4bABXdHKF{Gi{WaFBwcP$@2R!S5ALel0&VcnQE0^ExfIs4Z zAK`G`uRl28>*$*Uqg?ObI^f^qa9;j74(H=mAiopFi&viTBuH)*F zm(${a$2gqN+w&aG%fFq|8{qWbVc}G+bc?d>`JsiAd)IULIS&6MhriC@H*k10XAgY` z+Vbl;oR3c@hkuHr{~Cwi$l>30z@Kx#f9-%TU%fE5>b)5Zq z2mDv17;|HOn4N+(9+Awok}@XvnsE$kyovF;|J zbdn6a5<|$-aH#$xx&wC8g8m6TyiN$=@;H_lN4&;y{R-jVU2dDuaVfF1B!*%G>G)W} zjQL0SOSMTQz@IINi_o76!z!2J4&@eMR$3MDNb%{?hjRW~uub`MrTk&=l1wN2?e9F0 zut7D;kH5x(dRl}B@qB=q8OD@l81WIEo$J&yY3#tSqJHa>)&72C#Q0OLKMkys{vb6q zjJ@W>_{hg{{fB^&e!HJrXK}AR*~) zbO$JyiP7i|d3`8Vrg%bD~$bzHxhy?y(8-}}DregEFum7eqm8cS?8$!a#~5o!9#LM7>~`=00SsPm$)Q_~S(ezKWkqB^~u=Kae)aqCD-HJy}{UfkIx=(SY z14^Z)pJwvJDkn>?g&y)mu!5y`)}NtyY|39*CZ|3vgGQJLiK&y+SC#tIq?~F)zj@?X zSzuX0@yW^W;ZX(GdVlI%Vjre?n zWTXVE@SgHb9knH%)Sv(~J-n)A%I9**sruWX(^{p9fxk0#N>2XJPMR!tvEefpO0kUL zkKe;h-rESmUP-^~_a66qCwt+U)4ftFW_7;wq@{s2a>T8{F3u@NYaa(I_A===@~KTu zzJp!>*0<9_EA>9b6(FIi{$}PpXyk=^@C$pda>8I${`6yt(=Si_OvOe1`pK=(bh&Hn zg*$$q%?{t-T8%yekfl4^y;85O*Ve`kkuFfJRn^~Stt|Z@)HUoka>T=?8c8h*oROn8 
zZUjJNg?BB%u!Ns{nG+0)JOmkJ5&zf$63vD`F)VWV7P1J8q3Wl|8gg3!bC@!JXvkcK zMa#+8@E}M}-Ude@;O$t5hv^xnpJ{{UV97P_TyOA5Y9NDk=92d-Ar=IPks5A(OJA&u zPZHTL`fThs`~mFwI#P9*K`ykzpDsk-tE%F{(qNKv+|NP%M)x>Nw^zVQwlnZVuZ^YW zDr76E%`YGRRtu6}o@kuQT3C9i;k%r(D01&a#9&%bzv|adoLwO=RLK)OS|9WySW~MI z+T3#b8)S|4wpQc44k&{1yh7_cB4PoY1^)mr-IN7umF)`6G zH@##O_d%y>o02CkleO(kzodROpdcx(Wfvk;N86zp&l1X{{J>ZTyC8ke8|m|3ls=<= zOrWr)I&cYnQEO6dw&n>xnqg-Yrqq^CvRM^AMq!$gQ>Rg; zkmQ|zwt+pyHn7K8=qIqGhues=+v(%nGV?L5oBJz3c?=?}RvO_8ZC0Xcnzb_X&h>28 z%zqRJ7Y(R-uJ&>Eh=a`2SRm?G6&28r$c>aV-1U@v`K}iXi1OA^=hrV8Y0=IO$uyht z$FT9b$1}hFlvMB`qk&T~Q6Wi1Wb%^Is-L8C(%}~B2{Bzl`ARXs+A)o$z}N;)wvMgO z>;+fD4!lwjYq{4kyG~%qA}{b@ha85E+@PK7GY@kGjf(Tx`1Cq*Oma$bT@7QX`V!^r zGe|C;!<27NBQU+k`9={@;j=0vX%xAMD(z1lmy`D+l+$Zm!ynMjFnrx8!4Dw>$uh7R z6=6xg1REJvV{N1;MyDg0Z&!RfSO8&ECOmvQVEPH{2=;*fX|sMBAybw44wf%3l+tY` zXV$J?GuQ_7EL+1`)7z;yaPuH0!6xYBNiHAun>u6XKz%up`jwo#0&dXKHTE>$KCT%h zd;(~%O~1?w2H5t>IEJ)fy`8A97H79nI#A~l8%T&MAKm}~yNzAK67aWNK$A-y8#i_d z!#sQIk+l$U9Sn`KB$N8>Yz;}tiO4)cwN(FvT;XZfFQB4gn?%J?aT@Vt(pPi&A=={- zu`(lcmOV0lUN0Ff!fkGbc}KOvjC$ITt=9UfAJcCgIRk3m;!ork!q$>> zcH4~Y8u8E3a|L`xsPI6^1i?6GeYmn$Hg+{cOxq0W{ym zzce%8rq3hE7_7)QPt=zhOtM^pY1|Q+RUpt{dM5~cK&ft~gLYI-?AH@0sWVDn1|By` z5cX|@9T#n}W1VbW=G>I2D`kh2d6y0lx${cK2Nw|+p|OrbNDL`RC*q)fjI}lESJ>KS z{rBE1os3^c%|qb{;+*6!({$AJbekQ}$GM&E0sVZdroY{W?5XLXa#;s@))Le&W)CU{ zKw#czD>Akc*xZLi7)rsb;OCh;R*yzmtiN?8Y$V>jQ)-3Gvri!-< zSLRoU%W=LZrGsTyJ65IQMTOSQR;%g1G)SD5)*%)0xlDfivyM*gR4J_H{5;CgH_m~|nddJxV#UQkX zmT}f$?9cQ}@g5j^UKz)DK)=A{GWp+eGz{odR$7nX6ldFcr=Q+Qwt?{BKBY_^4WkEN z)0-wjo|0(#-3@NGCYvO@paBEvT#y8 zkz3_K#f5%A|G-j{8}b-&f5J6c{ZHUEQa&Xk%-YZ)jt9|9H5kXo|*QMqxMV~=?fyt zb$Y*>WFuLd_UbWclYN;WCOP3}IYFfVhA$L{Nqixmhy4nY7J?_Px#a_v4G!m@muL>xw@Vuw?y<54N7cSLs>8GQ7UrlMxwXkL zZY$kya~!a(a6D`CJD#&K#|yUAKud-IHUMle!4-}}AegkRqQN!{uEZb{X7rg@0Cz@2 zP5HoP02*dBo3on0vI#6nUcWhJaa1v;VJt|T{=yuz6to6gXmI>?M;&u`n4=0SY;d@+ z;s%GEm6z-(!mY~nlJKMaeEPhuJRI+ zVOg^QGJ>Wx1K_)(z*vK6>0KBLn7YwCzT)L3Naiq@joYGmOlT2gKXO-q6tko^mHTHt 
zHefK{m*ih-0!4)A{Gd3!vAj`y^p_JiQdkpoTPJX{7EcDI#h0`cbf3RmH5rlB$@(j2 z%4rmMBFvw~RbeLDdP+JT_bVvw{S^0piu*<(?l+b^=h(W^5nJgPQXPBX|EI9Xaa;%B z|7YRXjU~GZYNnNMc5Geih^=r8wP2!#iT;@r)q?ZXa?B@O1J6$K`(5yt#^453qzMzyR= zx@4aIW)hVa{8CSKj~p?(m0A@sZ>SU!3bj&RC1;P6O7__xkWvX7Wr0@8&xErFN+rJU z&lq@+Kh5S-Z{qG5)YxoH+?j_rnz$zqr^iyhW6T^_iN|>*({|uB9N4#UISZKcUy9(T zi{NwM$U=PXErLH<1dkTMzgqw>(|+W60h|FZr01q0_+J&l z4;R6IQv@$Xeio9us0iL(1b?CkE^UbnLS;j(ooMp zJTbVvE721j2!;E$hkE+^i;fRQw)Dgkk-<>!_WoFC6BZKc3vY__6`imn(iQ3t$G3(0 zqdOuur~1DLx}yED?TJVTIuvCT8;o>E5?x!15|BO?qy9+og-)+85zSlT=P~=hOtXIu z7P%D-M&gk~h#a>iMz(-?W3irrX`4(l=ke7u(31%D_jCENxv{VRFUt%g?A3#7NMR| z%f~$eJ`dv-eWQSjd|ngqI|ceK)F*1hpU(OgpI-{NNWTE}h8oe++pI;uTEOoT@ZAC~ z(*LV~i}X%njFzv@MgjjLj9YRS@UJ`uy@(G8xM+uwBKUp*r@snX{7(wF$j3S=kXjQiYRrHGG6-znfC{iuM8armDCF52No0xtUf z@45Dd{v!UOfQxy0M8HLQ`kFIKNBCcTsMufQxdM zkdkPm=a+F=dK%|e<9h^s7O?Px=u;zl`fgz1HG)111pMm){wD%X=Rs=3N7QGRfQxp1 zM!-e-zY*|-;AQFa&qZ+l3DlG;(jP5C|2qM93;Zt^!EY47Zx`$@^0`mI#dWDgaQfg& zE#GcW6v4*?T#S>Sn>gL*%eQ4WYri0z-f}Gbw*ozV8MJUZ=Takm{uGymcMAC30^Tp+ z_XzlY0T<)v?*v?opKAgx@@J$78r?+vV*y`;F-uQ3`qXH7<`%)9T_%6R&63RVR|MQF zr5u-P&t8AoW= z`YmWF<`=Zzy#|G55|aPOCDwc^en_8u*)-)TIxM4YH0@9JqAPzs9S2Pk^rqXHKQ{XR akT;mWe%eFxNchBYQ-Au0II-nDO8*P{7DbK# literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse.o new file mode 100644 index 0000000000000000000000000000000000000000..f4369b88fce5bb177198c9ba7c85c833e0322384 GIT binary patch literal 2272 zcmbVM&1(};5TA6_*4i(%g+c|NCtI=4B($Xnm87vv&|oF~pttoUeQkpIVs}YXFDf2u zp+`?%y?PTwJc>8*AMhYp6g=3Qh!AHs?=|bX0n>rE^PAt!%$xT$4;RvlU7-+Y3Be6$ zS(*j7cd~_JE{wqd^g~tT?#m69SkAD5%_`;M4ihy|D2i-RvdxOc;%n>Ia75`d9f>lc z;+aHhW{L@iIeCn#RtjaOx0!!8IeK$%|NQSpBmVf;X9J)%GWoF^VD13%_P{st8oU?- z7u-$lju4UBn3Qz6!zP^R}D=|+#G+YhIMB+LdPu^LbV6%LhhgmcnjfAHoQ*1ng z<)^R|m!FJ;Cnykx-7i4sK?Xwgq0m@gch5fNC;o!^w??`=Fr>^tMJ1hp!$#0gBXFky 
z4Co4SM;`YeKn$EANDS~Cn#aL_9*@<62Apmg3SD+}yZm@CKDmnaxvti{?j9kxj$?W(YB$#cNdi7*rk z*}9Ft{v)IlUsDJxm!FyHET&dBMXesC4?ZU_lj)`Tc-mZATwGpQF<0XA=>-#PXG2;R zY;IcxM}n*u9KDYb1+JhT#DNdZm088p5z#HsJUt~6x&gHBn$Hl4Li|}AnrDbfi0{Lp z`R1!?^8Fs3Ats@DPq|2axV=+yL>9R1SWTVR$dPpi_=a!<@T_eEw<=`t4$g_PM))j- z%A(+uu#;%uxk{-d%UDIZ;z*8tWzu%A6RLbNZ*E$mB)#i7CLY5$>LGva4cajrK|;P> zxr7FB{>~BeI(e`^p58xoi{r*qKcJjZ9A1ej%I`U9nm?>;tqv66nowHTt@R7!HTJp=2ay53Zxl!G z!z-gI-d~rlU;Dm75VZbBM55?*()#J$>v+x6H2%4ff|Q#7rPB1)DThCva{BKMQt(qH aJtV^;D%}x7y(K9AnX3QYk!KV5;zfjKdxO9C20wa3yD()8GFEWi`}e;0ecyXeqsKJ8(Itw4(?sFRLVo0+g0Nv(zF)_>b;8Yp zUr^RGo3AVPwU|eg_Ga_vN?ZHtaO#~=D=nV@m9;~jz$|OxQxYAVN2Loez3&Z`=GF1?Q?_N@6k0_A<7i|DZVT4sTci z21cW-U6jj!J10+HWi=J0gb7i#O5* ziW`4+-RN^_!VN<*o3V_UDJgRW{y=o=DhpH5kn2Jxr6>LlOQT|0P!6yPTISbM`UfDS zpe)C4LyP$V7N(pQ2J7OX)GyOA$EBS}4r#Y8*JRhu2H&O3mX>WLs^ApMYoj}KD!FLu z7FiaOVLN-VM!GqJO!6YWDTzNke*?(hnOtn%uROpK_$69bDg7p%C|3R!^MY>vr-Diu zdc#rJin#<}slPs3`o+eqT%}l>kuThK!s8;wPm4r8BOF!R~mCNVXK$T(!vgZ_#PE?7&3T8z_=^zCvWs;Ps!Fn~D z?;tZUx8EVJ%0(zbUbV^PZcvD->{UZ;lvmracrE5{X#IMW)CZ#U)OX$398!ly*RF%n z>y-6Mxn^Yo&yf!{!;x`(s$lR&r|{bxQJn4Nic7>SHS&?Sfs8DYA0YzJEPSb()wQJdLRF zipHNIr8TQ0tu0^a37ZG%q;DQkLeq89vq$UFSB=G5=oiuxB`!fgBzVOiQ6v}}m@SI6 zR6J$=8!E$IW%*+sMf&D}&;{w)Nd?KhSPh+#o^XR2RH=iacuFy+nk>%_ZFv(>PAZ{u zjP$s%KntCWSAqtG%bWZMW}A{afVF9(wK>BIS;Ra2&mYXr=KK{#td+qfZOlPBTt%DH zYTFN(oB!p7Fq8byX2na$72ecUkEG48;0sRNM;-~YJ%ePG=Nt&X2e2lJd6K?D%!+Zh zf2))lE%7iaMcR2lb1!VLJZaeHLIA5~sa$dHQjc8fqHvMMjxk9c4=^uJmm{pCWp%=b z+pFxGGniRr)!SBIhO@K@+w4|mcHUyD&w=12^NeG!l6s`XQzz{_qPV|I`DJ;2oI2uS z3}%hYW^)5`SHaV{{pzRJZ$4~dNSgbnD=veKe9orDmLRtlh^e_%P_5M^FI_t)CG}g6 zDrr-sf-MPI$hP8eeoC74>EQ9yUzT{Jos*R8HFL`Vw?Chnv?o&{)0A^_Y0Q=g5QY3c z&Wz+^e7fm(v$LAD5*26n%93%M^#mk2mJ`{3FSVeogI}Qf9EnjN@I<;hs_nJQ2+ik3Rvo_v;uo=8w!|t?31U1 zQ(-#Grrx|7MimUC4&vOZ%C$4^OBHE(J7k8Brsb`8Q=|UVxvoIpzlT1ZjA={tFTfJs z>($KHt;%m?-j&kp0c4Z2IB7^vM$lKx^E!ei(5sP+ljmK{lW%%j%%8Q0rRWT_ovqaEl)p!>9(fO@Tl&A`c91;}vFBm-Ji?ww znf(*cm)zb9`iqJ=-76*j!>goz`9W^31fK);h9@^y@O^zk4ZUXEL+6x$X1=E25ZPd@ 
z_(vslS(WOKh7NC9rdXv^6BT8*h`-c(-J6fzi!hQ4o!?e6#j40%dSj7AKez~7(0$3bd94>i6K1kY5b4z{0t*{f!!Nb3Ua!! zhRwb51U~5I3$h<$c1Nfo453f6N-B~X;S}0|H>&F;74xZ7$%z_HFX>78 zj3PQ3Mjh^;6HC8pl`aWqD;Eb(H;d=OI5myXA)%boyw&=v++9LEqgfT9Gn-=ONwzQ2 zQ82q@0lRF!mbqZ)2x#b1YA(pc`V0VvO}OJmz`B4Tx2mrGIEvD7&af~yiAFlLOz zMC(Wlw9LeepDvd;+kx$W+D_70lD2qeNwy?Bg6J|gr> z1ca@{jPGU4X*OpWr2-w8M(FI&ugqMeC2h$jyv^BWZ%ekJq$S&2qMQF^$Nsb*`zOL+ z14*4;hZ7j>w87)k61t>aMElz@6212CMH1 z)&$B|MSB9_E+dD3Ay`ekl)BW!M^_2rs9*e4$s+G|XspAJssXhZY5%GB`M=@Pd}YtL zHD4g>(R}6G7AU^zCl@Nd+SDS|w?}l}A^P@-t9=vVO1u>yu6xm01)vS@dIJ7}1Bjv% z;}#IC!dt_bIBEPfyy?5U$%URHlZj|A0KSef=^$1{(0Y@v7M3Z#@_JvO-d6_8>wVtv z64%2vWh>rHxQh%{!Rw0HkZZMrtg#WA0$DUCUIXw|;vrHt%l%N;$P_kn3biGpa2s1a zycf}djQOyUd(@NZ4Gr@(w?Iy~ zzPW`(*MmUZ!r~(M{1bEwCD+k^ks;U1sbP3Adnn+qZ6OSAapmX5aspTaKmR=L`A!l1 zU=jT7B6um5WIp-zMeu&$x482DWN`=C&ljP8xd{I2BKQK#bv}E(R0NL}!Ji=^6aLbU4zL;B5FhihKBC0xiMv8V5lcD6x$U2J;=IaLnBG!LnxuzrYlgNcO;um zZ!BiaGo3uuhu(kYEOzuoA5L}+4m^rW)5gJA_d^52eRCx7DEY(jRo_S?9_@<^b&ZT% zPsFSn?AjO|j2LmcgT?2h?(;w$>p1u@$M6FOh<+X$J^rIYp3kV#0 zC{G=F9=~Xp&O-U6>|CDH%gZ0&aNhnbhx7FR!r?sq1rF!wC+*T-XwO?5&eI>aOKKs# z%bTAsPp@-0?}tY?oVRl?hx2l-a5yjLPwXv??BVg19L~#sio<#NuW&doXNkQ9lKhY1 z=dACYMes0()0xttk8wCJXIByYX)b=2a&q1-f>+o_5VG?V9Q{2uPHc2qaP0XOM_yJaX6o+&yyi&^zz~7=)H}LL*5Vf7Qy>CocE`};h*BdtTt=e44|@ID8q0zs2GFx&%N&jpW-TY}{kl8N&0Y5e%vlj|>@I8}T;c zxp%+QGoqu0P}S3AbO}`(6A7Uz9v$qeBDCrw{F%+(m8;@0lK4?1kcq*n+W(^+V9#05 zoiKuT3suuqOc->s@+D;+=XYrXz*hl3Lf|q0#v)?&~gM>Azd3>WgG38<%52D{nNlwi+ zjB=Ie>~bqmZJ_orex(0S8t~1&g7oni`VPP2hZlj(Xa9)J@0`yaF@^T;#fN$Aw=d^= zY(->0<(Z@3vHyKwL_)QlZ(C^-Kr7VejGZ_AYX|XHGwS?fyUkyfhgi=xhw%__lg(d1 zCXCzsg}B4-;LB`&sv~@xu=!6GFgWvfXm7>$`TQqP(L;0QMgF7u=O literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.o b/third_party/libxsmm/obj/intel64/generator_packed_spgemm_csr_bsparse_avx_avx2_avx512.o new file mode 100644 index 0000000000000000000000000000000000000000..1cf33353cadf8e7247cf400eba67b2bc78ad0e8d GIT binary patch literal 10768 
zcmb_heRLbum7kFf2x2sX%^E`k5of2Zvkg8{NWccOXzUq0cE&-D?Ie_d;y5z4u`Sn@ z6T@cP1W6&Y6LoX=KBw$CET#W+=?S|h%Sm?AASWat(CxaWKnNcVA6+8zm5?P4DI4wY zzL}B6vX%DGedo-~d-vUUzwdqTNk3Aw+gv=)*^TG^hRcs^D93rflkW#vH^_N8fsRD-p)ZVNgklNZ+{k2wAe?{H=$rC^_65<(rRJ~1f2PR}=r+AE@d&J`m8WUGd?G%Zf zrfZ_A8PAC0n2;At=^MpbVaFaehyFZW7Te-y`1O9HJ!Jeq)AVfMFwx*t14J-n1DYJJ z-A&@>K-)w&#)o9lN9f7JLUK(RM3BYWWLijcUyA-Cq8BK}HpUH@B4GwTm332GN!amv zJi}|cDIFyBO8Rew`3XYfl)&NB3AU0K5)T3(>7NOEmxiR%N`)89=0G2iK7koo*s;`2 z0=s4K_bU8*va!NTP_L3qk6b1jPcmVZYMtvdIJto(zko?&NMY)d6#V zGwIG%nI_B&^Ml+hIPv{FLtFKIi~7jZntm#dz~JSQXUt&HM6u*q#=1sBs3yPs-)uI% zQS|YW(eGBSOy^?Rig{^jT#Ba!8G$S(KNS)$!OCPEGx#?m8UpD?{iq#%6bdARgus&# zan)URyixq9QYyqFr-DZ9W6M&tTO{Km{SOQF3keBSCH;tGG?xY@r1-HCA(3QM&hW92 z6hGsNiIP56Z!F$cb5ar>ACqcMOTuGmB{3F5eo8f`gva;A)4YCAG8XTgfiedOd%=Md z8eRuhF-iq!HHU=6Kamhc7(3DBl**HIn*P#xx!ZVu49PXCs!opabO{7Zh7k=E^yf;* zEwUJtQKYCU$oxi8lxmI(55&)9v$C;HtksY~UQK^hNq#6KZgn9|pJm}I#110qVO4HEUM7WvN&&K&L#-R3CW6!C~?|EAJq#~jO8nlrYv8j7Rvb`(iL81QH7w- z7mZ!aqlqSbbW_r1{Ud$9jABe;XMTd0Fdwv6+N4`ig^)}$h~LPvhS6r_;C3dmk}8{r zssNF1cblT$?!9@nL-L=4L7uurmlc`3+|x)~St%s8V8Xf_T zS!eYRn)R`hbF}O^Scb3tJw7LaNaxx{al7R4nWa zW!TV!9cG;{+Bqf^CTY)L2=hiMZqtf~+?ze%#yWtcXGgCL3R?NnfI)1g3mOawe3KLQj!y~fnHAk z0^DT%br?7V%eF9sGoA<*U*?6c9QK7nz5q_I5NZL7AVXy?$>V>8TATBS!v~#!Z_jO6 zd`1P`?clx-+-=r5nt5{16Zl5t~91KW5yBC~I=dCD;b2 zSB6xqW~tbURFqH?WaJXDkc{PXQS<%gJ}zS)Tx=yyYe9%F(ohRl*<$%M*sl_m$Zu~2 zfloj7!0zz_e9gx~;ueITVh9qSk@c4qqx2EwW~EekXiTX&5fmOi6ij>-drhf1GCWTE zh1Xb7k^1|~zRqShe{~EMPS|n{mTx}$8TAuiu>Pm$k33DHs^IE&$X{Wo?!5S`rB3UR>9?(G(HPd*;^2JgMmDq)nv51W7BfYC>lWSwj>z_*{x00I_qihe! zQVo@9K^wzX7MwIf)xusiq(azg6vS;Tu_XSate1&~pAlL0ecNnUXxb81OTLV>6XtJ% zUpQ=FHRibuwG&|4w=+D8UN&zVfX)ZXf)nuLV(?0^N>7X>J}rfy*1{I(RTp9>FUuq? 
z5{ueij*dOmBy|wFDn>nW`n)dA%+Je*D&VtapaVlGyhO|WEIstkAVVd*BFM1sLTNdx z@&4(F1=Vfyo57gbg$%Lur<#!SGKl$&CY+U)S9~*1OL|5r5s=N5^+xSBc(2;@`+_Mt z#nX{8=Y4)^C51!V%ot#vyW{Q|k)LrhvCZ=~iAqVuhMT?et+cKGjE^41g-^i4R{>^(Tk&eTGl66v1rd1A3Gy`nMXm& zDo>_^h4z&csN`WY>{vPbDF{o(hKkLfeU6;ke9p``5jK#8=W*|lx5{D^r_`&3kwGOe zfx|o5hi^PIhCL*UW7$3>@D^Q&1{il~U?m!A48^MOUC61qpEzu!IGx#X&XICPvj%OdQ;OA7C(_v`avT*$oD!dsgy^FC53? zE3;E?#19F$*U?cp`6bMnnS#;S{+ZwYCPC_Z=Jm}OF>6Cc1|3b`#uT-c%-j#NfuY%? zm0iN;I7+iSDK4bAjh@17)Q#(usxKDd7rKr5Q(LGUs<5*AmiOQ$4RNJji8~+8+GI&O zT2Rv`X++x}HB$U3>-!wOr|Yj4gyhAzDaGF}jsM=0YATH%@IdM6z{J!^@EHWzJ{%Vk z|Au>wX3P^cI3UD!eayYo3E9)whmOzdKFu!EpxJNBz-bL@aoMLH4uvi|l=Xy$`bYAtrJf zd`6z=0{RJAAMX-|-*v~2ea)s067)Dg@06O_;r+ugxn?|elX(f%$K+Hu-(ZM_T=SkJ z)cs1Sd2RTb24nF*%XFKl`*l3+0{ZYA#h4qHYkmXJ%c({lanTLG4kDNz-M{%9?VLg3 zv4cV1yTPq7af6VEmvY=MDW|m5vn*av6#Def)YLW>OU)^zRMRIl3?BiKYQ->15*%SF zWc*Y@F}Hb=6^*l0G&Ypel}$*TLLa@-7b<`t$e-)SwbYX$#b00oT)L&H6Lu*ec28k~ z_&h^j{iv2oh&$1xSRd67GLCepAu$yCJ>*W1S3xjM_$i$FaePtr->Ld)v65kH`a01E zJ4O*Zn0kSZ%k(tvB7Hps+lw{qXCqbLBl=LzH2ot@Pp~79Oj$!`Ull*7i~|n|mBt4b z7V5r)4}zV#Z{y=OyC33%-)YP$q8bZDs;Gg(YHE+@XQU=o)Q=F8Pk6q4o~plVmwo(( zL8}9Y0l@8hBhW21{<{Fh{-p?5eKwY%Az=>-35~d*#H;}+>7`%F)t*KU=H_* z4LCggC%SZ3Bx_3l!ElpVS&&f}Wf;&aV}s~ca8ORXgGoBhtkEFgRyJ5Ij!MSd2h`NU z1GG}*KUQ%~*n}=hM=d#ERp2ezXf}W4!jCqxaf4Lz=K8;r^rQB9l{UYIRBbtJr$L-Z zaG+WV9okwH<1S9&KfslyOa*N^zni8C;+zGB^D{rvC@X%YC}7N<7z`^6Sxl7>r4=*vF)l+i=wU38MXn{ zdhTW_*2z{J6cHTVAkH0>5ebT`{i7$$O!0>quf*-{Zwj&1vt^0#V!+Y$dOt7 zhirE1YH>s6K6CwIcQ%!FST@Gk&5DM5|De>umGdiigaUF6;l z0d_)wdFB-NAk~mMkM|#3il^eoC3PNOs#Nh*Zo5G8__tgrd#d9XHF?JQ5}EfL;p;rd z@Epf;f|opJ_+_3=u4R~Yg$pK|Fezj5cA9KG&*VDHX`tGKuLR^~Yphx0lILCM74mHY z>2_d`6wE1Lt75JVa~m9U669^LM4~xY3nq!JtxOAmYc6rDXqw|XdWokRMvy#}FiM@L z0!FIyxaB#^r{j^NO%^E`e8?Ulz#0IPU@PLi60kPsvfJj|HhoxUtssGj%5svj+aXXh zfMjyA>K!z>-7(pKNz$*GO&;Of!P~VK2xL+hNq7e!3TQ92Fv&_`nyk0TG+3gx0_ZfI zlQemG9SGfJQSNr?pv;j0CQA^%I*^&BF_}S`vkX&9fnpnm)NBBZeBp$-9(Iy8*K*uO 
zgz9f8dC)GS1YvR8n2X!GT--`#+@{M*-YoUJRKC&t2sU@1Ia%m2AUvz#ba!=22EHV`C9qAtouZ14rzK*q#zM>1(M>@m( z(e;t@B6LRk2S;L&FsPpwufJpX-t$w-ixrFh36j~2`y<6w?i%Uq3zHgs(T;WDXjiy1 z3WIeF#D;US=Ka7N+B47->*?t0xu5wHZVZ#)vta+pagbYU3mo~u2xG7=ytXged2i1^ z-f%;a;Ych@j=g7)EDGj>gEJ8*3``4!V?z{=Lpdh1s1xq*=^ToNfi)26BMWr(+!H3t zz%YpD+|t5$iVj8w!U(TOM{xsp_C*o-781w(t(C`A!KilP7|GgJ2mCSzoc`vaR)|m6 zCmVh_IyRkiMc^^y^KAIfIpBZcpz}r%IQ>vD8=Xt75=C-4<>@X0Kk9(fS=5$4N_#h2 zp}yOS!2hub{1*;5{j_G&Kj?rv?f<(X@SEIB943!b&fX&M-9_LbYbz_XPldJR7Q&ky zaHn4$DgwXWI)@OQa)-VTJK#?FU37~_E99#=;7-1e7lFU;fII15d}*G(6aKFjoWhG# zvHkMz4mgQy!(Vm4iIok%itZ+8B>xq7YyiNzrq1`=GR|4;PV}P#~g5{oqfQeM)tHsVB;f}zvyEv zWpJ))_`d#F$69=2L*}>J?!_XbF|KM|N34UZT01<W^OJj9;;=OYd;^aS%ORn2x3%60qkbXo zCl<^~>m2$TYkjC7Fl_y8#E;OK&3?y46T{Yv>__{Dz25d8$y3OiRz7znz0oZGv!!3M2#(jSe-W3PKdMKr19fq#{8iM5%<@6k3k= zW@pyVV{h)-z<-RiJ8ypT-kbL^GoD#LnN0OA3Ir5U0?J;c))FYx_txk*BjW(AQ!2g7 z6P1w_GqaP!$(h%k;Y#o7nOWyRV%JEU`t+N?xMztv^(xe<${i^76I8BG?FyV~PgT|+ zqF36DDx)qxZw8bbh-IT{$21ZOD&1yOb|B4Ur}i?@#R%w?HX-^gA*yzq$1X5xHOgK| zR5m1by=C8>-1U3)E2rn?=G*~3p}6L$9pKJ=I1G1Z z4-;fg)V%qo5JbjQU(eye{(dc+cG44?W!pu2x7MSTPNwaWrDcn@wr_Ai8-rf0bz2m|7LV|vm5^niOR8-L%G z_H!(r&2quq(JRY@+g?DkY6tiY;RN+p_w=^e5APXiQKv3~VxB=B1UGt85WJM~yeuG< zB;avH=5Nf87v@8kJ`Xls#tFugXjDH34QMU>3)WMw&9NHkX8HIpq=T%7|L5b&U<@4qVt6cf5IkMyBr`Io~+`L~0{ z)L+Mq>d|mqy)zLuj8_xk2#6>~`Hx7)7!5=>5>ST)5a+;Ato5l6g6i}Zn1*b!Rgx{0 zVc5q0q<;NpjqdF9h7}ZobHj*m8XH?IH zBagz^DL}TQs$daQXjD?+?qjAgbIs{WRlux%*l;5$MEe2TkWha#6jqbduRE(!<$oIG zzkv!S$kwrF7!}ZN%*;89xzcmtSbej*)~q<;p_%v8nMG(V2*$B%N2>geQDqx5Q_)xc zxiK@VPJaL~aGAdN?$$MvK7nvcx%-~f@iKIP8EnVnJ0=yDFZ|wd1_U+ap`wt91sC7qoL}OZ4tgAD& zqjS3!8L-AQBkj2O*3Q_2+``qo4t!}q37lCGxOaJL+jB6zRsg=9_l2R21$o)xgL*q+ z!R3JvFd}ZigN5QJp?w@dC>IvNt0%PL*+qS!wHJfQkXCI;h9b`_PK07lElGsBr&<%C z7Xyo)3Vkoo7kVjh7-}7X+4fMk9*XIqNKZ)X39SVkJ)ySbvH~#l0^=}eWTW`M(A*Z~ z-MMBQ!R99|C8(g$jU@;@%}NkPkPF5ppsey>wpa01P+8*Pln5%Vw8`jA@D3;|J-lAe zQV*v@P-)n9l`f{|PLD0(QIJk$Ice3%pu&!fHGoq=CDefFfCSj=al7sz96^E;7{;yH 
zQ;*mDt;cKetjF1*heC`FDt9$thZxcr|6ztV#`7_TH^#?gxLS9dnH3sjMWS)q% zbpdc#N)6@PKKS>2@XLrdnYM5E;5U8n`(VMWB=eB1Yv9!ZZ>B$sIP}XU?}Ix&_*uYL zl6fn}{SNXs@#`0izcvrqJ_AO)?Zb~tx|yDJK6oeMvQyCB>4QJ)gZKF0eTc&&7uyF{ ze>P^qy4G^dt+pLeBA8r!0H4SjstPJ!Cvx7@M$6cKz}|*)#ZQ%$TF@&Ft1p z>G_##@gsSQA5z;Cib~EH*rsIJ*;cX65NDcLS$6-A(NiY1H92ozPawq+^V;z9yErb}Gr+j_5+OS z<%749aW&u(`fn%icnTAG&I64_&euZnPW9YQ-l?A3$vf4vki4%2zeOBwC+~Ro5_v2Ij))_~M;x~6CI$A0zQQMkGM-h$Q;xy%b z_K4a=l)V+NIQ`+^ozdfHeGC&kUE~_UZU%e^ z0(ZgvL5xgcxMehh;Vu%*6!Uqj0O^H4P^>6?n%Y`&(wQfKA9HMtn^`-Zx9CLqWHkOQ zhD%ny(Z_w%@#XFHZUhRkpoh3bTxyG^ERSmm*T*N7$Gr-5`q?kO695Y<@|_g-aNLvP zthe_1F=u><8;rkU$S2k^wy_+62U&0rq-C7TiS;h}ggoz7z6Kpg%l6N5262xU^0NIG zp`)4odP=_TR*f2nzlsMT?NBsLSEz_?UBbu5rcn_48(n#o`126oIN{~xAi8X*7x literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_spgemm_csc_asparse.o b/third_party/libxsmm/obj/intel64/generator_spgemm_csc_asparse.o new file mode 100644 index 0000000000000000000000000000000000000000..3f8b3882cc5d49ca6dde0a43aad917842687701e GIT binary patch literal 16088 zcmeHN4RBo5b$-&WjUqd{8!$x$5YzrS5kaj&A^fyFdSr~@Zc&oY*v#RZW{llB27d2_xHVL%S z1@r`Z$ozM7Srh{%EKr%EVP8Ed5L^d=<)S3SI=HDo%5LFC^gK^v}_(o*&$ms zb7r$#--oGPE0rr`A7A2%t#8t|ynIP5VN8#^r(%+q$O(CsU&5$7Vr~@HM_Nw|^97?< zBwrnw)rYJEtSf9jEz_@v>Gp>y-J&GSJNXp0Xq5~Z)0Fk7l&Jlv%7#SV6gJ;P0@Uqq zpo9o{N5qe00$C939t+w=+hOfxw8M`D?>9j(~xL zZTr5!3~l_%>ikAU8fP(vkE)z0j6hgfm`LNSYG)eP&eOOsrExO9?56y3yT3*>7RqdL zQ=WOf(PwNSV?*YmtzMsYf2EVcDzmU0Z$s&eiztQDJbkGVeb<1Y zRz%IW!blboO=IHOQbbgxj3Ti4QQfjL*mTgp*gIK{gsc4t6;~GQh&e7tSzcP{T`E2r z@ZDj>DV&b@%P&yx%+ri+x59kF z4sj{%%;yhl$U4?3&<1sqZBKH~_~77Boc1e#tEpzSUUQ(P2K!)6fJu#wBOLkYt8UT7 z{;sdoa~(n2u9YKDBZ!HJgUc1iQ&Eds=1-PyAC9= zk&cWHM^lMZY$!T7lo(A?YnTB_3bV4^d?CAJ7`=3Wfzyk(v8Pk`pF-950~K;AM*7yzHnbc z_cQ&2-laiJn~uXtsY;l>5%VSi1LodT=2&{o!xv9M7aW5OP_RJ29oqdHJ69F+VgUyjfwYMMkeBa-#{PD zwVP+W4+Q;7Bjy`{|1iwwVO>Q8mxePRnF^Y}2sQq4UpyVsjP#Q9F;7R#ABiY*V53IZ zD?u$hp(_I0najvwRiJ(7NYHTsK&i+79VVy>w<@E3E z>2V)C?G@e?=h)Uan*_yRwd&6ZrWyro8uOPngJw!~@I@8GlKykSg9WDE`?nof9aPjs 
z8`&e*bL`XR&~e(Jw%G@+j?uCiA(tL^vX#^6Dl*MSFHnSGn=wt>?)0@VPkkkZkBFQQ z=^hLhpM-s|fxZf{9R~fKFn_DHOJRkWc&B841YNy^R{hjUJ_z6uGos9V3UtEd*%!{w z%@vkB^CDJ8pEm7-JrGB`>T4C7jmHXD2WE-Y-uTp+BM@Xo?R#q1bLNlPcbN}Sev*@E z+X;4Z@9;pmC4cas%0IQX_3PK^m2De)s`Q5HHPtnGZB1>UW=)`8uiT7eJse9}_>F;@ zD%d49f80y{XtP#wz*lnJ;w#Fgusl5q?0RYUi|A9W#pAobwAE8_pR3)YXWT8G$}cSl zdTPG7Fz9KRxT4*2(%tVl<&JoM;x7G~=T&!`=d62^=e)bsbN2$OEFD2@8)`S9vK5sC zjsn{NjP3^B+sZu+hNs5xRJM5Z7EcA*Z1I$>Uldf#kpj%ONfOKElmEwTCVLMf-c(+} zHfhDVjp7`lIJaiwY*3sxP$+l8<2xwO9SBEQ2D(iKn!?dRmEXWa3&bgSaRS2gNT(n~ zJuq6Z2ZEvpM3cA9L!!EoeCneygYrZA(MgP*jpP&hRgN9>vm84cV7V*h^4z9ThH31}6`v0gzu@XrWMXY2p{e zyegMQLC-HPtt=k|;?l6f7AP)lQ2~_X#V?k5buP`5hY{e?d`0jpi{Mw~;krQbJ8RZRo;~7jU)D)^-G;Mx zZizqWz_&>Jlmp)>@ce!skhn8GcS_vp@2JFQ#NEE^lep8bgA#Z8^#zIF>#*})iJx@f z_etF6m>ORdc>ehOhQytIJtT3bU*DAYqvCE~9+CJ@Y)CvkCh-b~oz`}aAHW^>6GA?J zUK|&=PwNzS`*K3cpSL0LbW-BZ@$jz_f6ft~mn80t=PMF#aLB(Z@$(M+O@aHg8gaKT zzm)P1+mLvASK@sR+!`M~t=Wm=-KBjj4_$~jURi3dV%=ECJ3Ttrfa;LbuMcM`+av$vUB1Cv@wCR-MqQvs$sV8Z>o? zWx35m5Zyb1_zWcyqtRr4Y$!ISMcX2qT8v0^Q)_Eis5{ziv_wKtEjcneh6CflELCiD z6#t46?N1EE5l;O0h3ehe+IbsF?N3Da#rx4GA5MNSF+S#0n6F!@+k5GPLPLwjBR-x) zi4urHJ5m$=6h1^C3gI{7PvM^>5QTVX7AQPJAc~X6THo?{{>Bm)JhXc%p6@c8c29-> zOyW*^-e&Uj4WQ&{K2mYoa}C3}Uu%ot?Gh({xt%mmsSr*FeX8C4MR?TyLGr)DS%&cBcZQJnFtlQ`Wf@TdH`naOjWc7}61Ro+b1zsq>; zWb&M6Kf^iCy%ML}wT$QMOrG<6o8g>?ex0R4x9jky{NnTCdWK)lyrBOzQ~bXP^WqcC zp5Nqov6R{QzcDX##(z2U;s(ZZIrHL^jOTLZ#WKcoIrGBLc-G;W3e6Y3?u7+v;e37m zjKrPmiORQ#hrV&tyin`EbA6sJ!eix`R=X=0&r3|6uLH9T=j*@)iIY8_!k@C|qea%q zPcWSGG)kO!DjAR37o2|WEW&eF5qygA^Z2XpC~BAQJ5Lnhd5iJz^^N{1n2OWR#Y8M| z?(g*yryF12s+l~Ge-pzwPY2`S^F7MsInOA=InSRl9vwERe*6}b=RAMMaL)7NBKXzv z2!pEWd{ObQkhn8$bqweH>lx2V#?!;(xt+Th&UsRdXBp%9B9rGl4=|kb(65D5$S?Xw zY30`mCeL~Pjp3X}eTfjyYR2x6V+|9=GjgV3Hcd_xz?cB%Y ztC{>Hljr_^mEoM{n~aCfUR1l^Ve*{kDTZ^NO>De1GM@Vw&gK7@^%s59s&@a9^$TzJ zQHFD#tEod!=ynVK6whXfQ}diZ!Q?sr=NQiUj}_rL&g41Izc8HhyjFzAEkCWPn)6)6 zaL%(*;^bF=`P;E9Gkoc?}{ z;heu#;?91=|2Y)o>0vnMxvvQR5aX`{qq6@ICePz=l;NEJKN%1A_YEe`dER3<=h5X6 
zA^BU+?Aai3iW`^jXYy;9{1}tx?H*z{=gBahwT$OanLOwDdnSJqlRw7fIlm4WD&&`R zUA(_2t`@G=UV`{{B|lx%K=lrQ!!tTJ=CI71OGB zC6ik9Sb|=vFXFU>SOe8Kq8W=1#j1($Vze)=Mf=#~Qd_kdF0N&E0CTBLGm5;n=R3=p zr@i^qvTgJyQ*;v7pBNsFkD#|l5~+AKer3HenTqxQ0mP`^;(MZlW3k~lA2*lyd>!;? zE97IDsG8|gpvY}ATXO@R2$S1d7H`fzTdQ@3jAJxqf|+{Vzkmko`v}sYc=UlkHCX>354l_V>xfZk#GvQThtrA@!a1 zpOi=D>bzW0IOE?3{X+Jiq=A8=QzZ(;pD0!PD*lvDk}b~hbHob}rOz=il|JcEd0y%} z?f1!_7PII2mU!N84bMTpko{+A^F`^Cq1+*^_C>`{ z-RYc>e38v%1v_bBK%tzR7m}r-?p~>Hq4xE2Qr||bC#9!wztqPqr`#>|D~rTm2``0W zq4>~XeDDi71492MEIbS6yS15W*q0CSiK1p0n))82MC?06Mt S^ab^${p$POVzc9}-v0-gQo}9) literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_spgemm_csc_bsparse.o b/third_party/libxsmm/obj/intel64/generator_spgemm_csc_bsparse.o new file mode 100644 index 0000000000000000000000000000000000000000..e31c00cc7996792d5372722a2e8ec3b8a78693a1 GIT binary patch literal 7352 zcmbtY4Qx}_6~2xW5)AVTjICk7^0qiEhT+<2*g}CdFZdBhb0DKox*hFTKN+QeQctWm| zBU(&eF1wdfVVPXpPo;$mo$_~WNSC2-W|T9dGG|69Q(m#54h$`*?RPGWXweSjxxnUj z+nwl#sR8OU`td)#$osouLv6p|$HL{ZlCLa~n?deaN_n=L-f*ld6qZ+c)#i0ASjC#Q zx?Nq>)^4lmO9Z;30XZJ+lLM)Ev^Qw03B|g4!?s9o=Rh(V)nffoEfr4%B3d{S?@P+i z6$Fp7NGu*m1bfJ~WRxUhountZjckcUNOH>n2}a=8H^2tQUcQ$!BfK_cz4Cyvu2uho zvUZ(Mzk0tt08t6D9&jCf+H>wQ2Mbz|Y zYQA!)6mf$C&3^rL$~deH{tXx>rQ!R)yG$Dyx?V2rz5!hgUAIcR=UP#)NV}(jSzq>Z z*u!tfpmV=I<{35m1qZ+*{V=V0GwVuU^ExV!s1Bc$(oX16hR$21^gqE9ul{>A<9&{J zm#P)g&P#y2jB%j4LmyeB^f7>xp^r*Z^OUS&Y1Ll^`Y@v}g#aLP=!iwh&YIn-{}CFW zW{np?(Xam-+uQVi0HsyGdHJEROR7$z+XRp$8_}%;w7)i@SpMqOPpFyF7Yt`I5Iq&b zw=(OUW@*!3ZON3Tjh6RlO9yGrdgqi-mw3xQrZaTJs${2wN3VmuJ*bs@D$_R0D;-p) z^I)^sdq&ZH>EMY1p&x>L8( z!S1l#2KnZIjQKk?nejU+q=RiZzv6VWxdtsDvZ?m%IEop)p+niIbSRq??a1WxI-(4o zhV(iurGI=2dY_nnke_o&dQ?ge7iv2Ui3xd`o(b-n6d|*ZD}!gj@HHuY1*UFt`%z`^ z476XA(kHQ9Kg%dI@1=c5z%BIZMV$A{BAntwm`cjfc4FCT^BEA#CoV4@h^IA+u`Y8K zh_6)bg(c`^%M%5-9ZY9r2oz^XQ)i|flG?fUgioS%*=Wygh4>~+Ttd`ZL4 z0*;pEL5+;CwN71lNe)DMyJMlQu)fyOI5HdB!R|)3WpLxUZ-W7;x(OC?k@UG;?&Oxj 
zJcQ}EKAryEcc}j-96s&8O}`Ae+oqq%okwhMQGdJ;j=8rRls5x}+^HLP5|{I!UDSiy zb)L(Bra#kw8>EpJBkaJo`M_~EJsJG^2jg3Qrmb|+lBcr^7J$f>9^I#5SAHDRAfwK@ zrp}X6VV+6uA5& zIH-&UOxuwe@Bn|qioUH9`c@?8T}J0~XakM~6lk6+ueDT#x@^RE2(C0V%Uku;m7q~g zHDIiKh|=(ESa`Wd;K3rWs|VbLW>`t*M!~YO<3oH&!|gZ>DP9W|x`s!VfB=pFMprw$ z7h_xNA-=E_y51ybmqV7E5riGakPB@EmWss~0 z6MoI;4#h$VxPc_JWM6j(?j6BoQ0q+g1ro^+C9;Ps_s5cSeapSElpKMLS>~6RWN)-f ztZxkkQ>Ipg;)zHo)}899YjoO(SZ^@Z>!CibZ>Wm=dyvqwV#U33UCXLA(nwk%be-u6?Kp2JQCzGuL z&fD>AB~!&}Ubd3L5wVi#c@7uurov=9bPW{n|6a;3BY2q<%YC#`N`57RAgc_2%Y=WE z;pQp$7RAdZjo<->OC~!T82(ukzM0{7nQ&uql@Z)+#PSfWlo33d#nMMBQ;AyyLEg&n zb`$<6!wtRh#+_80oF}^(-ZM_6(jNMpN}5dg4`{uV)S2*ov_2KS04J8`7%oqQ$@2^! zG~qvE_+}H1RV%?Ad}28W)tM#Z`ApX`V5i_$SzsAmK^%VOQUMFK5I&;_UR?zDAYPzf za=Q)J6(pJ#?vDjE&DBs}@9_|qe}$(3pZDN{N(`;1Z4_RN;@9x;b$m49V+lSyZu+FC zha&aVB!vwFlx*}v&3LGtdfHg;{$$4lKT86Aeeeyd1>;>IEs)fbsYGwAn`kZm)y;}u zTfK7S{pwn6tCmS}2i-Ct%X( z83}I;F@Ao+xkk$xkg;Hf&xaq!Hz5ec%zw;)C-HYN9Qk(maXUX3aNGkpeo4SFe>naD z!_gmHD;zI_{J>)Nvy0^nzVM2|`3Z)b?F!7X-fO=eq(f+Mmjf zAZ*4xkNfEqa6EH4t_nEb!Z_Y3;GY-podS+~C+Gh}!0}%Tj=#e21%Tms&vE>?!h$&d zIDSsBgQpJ1@hct+#(5$9IDSjuHsUEt&1&-sHz_|FS` zF&};@@WuK1jev{&UMOPc9f2>}xhdeHof>wP!10RyxEPLVqMc;|F4{>Ju`?v_MLXXY zaM8}oMeH0G_@bRt0xsIQTEq^&BALf!p8^Agg1Bf$ErPFSIK~a{n|$6k34GDdEdoAY z;6E|q?fg{0MLWle;BPV<{S@>3jDUYp==U7~$2$x6=ayh!^rxI%*wDT# z@M{@vj%TC5|B}F8UxfdRz!&}hg@Auqu=ApTi|g@X5&Q28e9`_b0k0A4%OGRH@iGdn z6Ib%#Xe!VN?^MEg_i!l{8b}dWS0EK2uFhnVxDs)!xcE7H+baYZ@wnhr|G#zM5|9W* z0xsW!7Z$}P>&W)1fn4$D0lwE zImDxtrC;7)Q(l@Vuwyt6VvZrkw);HZl$Rz7X8o@~a|Qf9t^QX*7j4$VPb^KWvf0!j z_JxJRo0)9Z-_Hn`n_}U5{QV3xqHK=e8;rp3)!c^puKYu2z%|^&3GBU}5&3?EcPi|I z`;TWimZR{)u5i7GK{o|DU~WJk_u`=sohknXeu=XDoBW`U|OUEsREgX>wB}a z=egsyNUhY7clXV2-uum)_ukBn{B2F|u9GCkFMQlZZxoO@E$VDXpFc>#e z+fc%BI|qxlWARu--Zu+|;y|9(VnIdT7l5t~nk{NU>Lrr1xOv zTNw#g*`v^tO~9G38mM)q47xfAt8t1B6V;5})`ZgeC=RA*39oi);N45n&+(3q76|3u`O zYl%52$@^yjtQ9ZF!!NSoTavLIsOpT3@D3Oh-a#V@?|S2YcsIz!S6L~KQgcM2qwfdh znLYQy>R3MkeaIdIPN70#^fCkHVDMav57A;dSe$khK41zDfheXm#gF2=2~0 
zq((N~Sr?!V4bssM`Bb_?>l33c%^U*y4d@o?vx&gC5e;FsCeP?WSUx!Ex{B{P2rMVD zQ_me7y<8Xji!skS>*OVfWRa+`kMgt3Q+3ce2A#9{dXNs2F%9BHNc7so+D}R)3RMYv z!>pa$4?$jO0EIj`&&E$zqUbLuWnE`s>##xT&R*$ajTE0 zzeXDKa-(YAl;nL2N~KcqZKD9CVSLq`tTU)N8899(CxgZUbF$u;V@^T|e9WrsUDZ5| zA##XjF}d57&m#`|Eg&6VCuClzH=U@Rm#@AWQPx z4u~|14v*fWEP5=Swp z96ZKi9rkZ4&Z}9cRoD?L?JpnEilyzLgx#Q7XFBagFTqjw3Z9|Q{S~=e@5t5!W-}|E zf6a-zR~jB^P-!{%MZ*RdvRVKrx=S9Lu@EFp~`+(j?-a?aP|-t zs?&6flLg$KR$=zCXU4o^@to)Z0w(>R_s=xlhTj0#jCNCnkFR+?PQ3ZAQVFW zWMMFsFJzLvh2CU7lS*gugskl9dNka+^6|9`!|jn}k+$%Xwk6TFWznVK)~7P*FijaQ zzBt;pkj!Rmgh;!eBGS%g>46!IP5Aqq2K=*p>{b&9ouTFz>$FhI3jr+@c7mOu)`RtG zsBM3P8frHiSB6IF1A+Q!p?1L3P-|x>+!<;Co6b;Ee0t!=Qq7Uar}G)^{NQmFBpZaDkathef@HAJdvR-SQc3?2mPqJGwMu8Lxy{Y~z=}qNw1HB}h z&tyopH#bnolv(??^=~uEy&akS=7GXCV^aJRYub9QE%L@Ed|Y9#x)x$wxoSbAUW})_D4s z4}R1KKjwoEAu%-k;17_;|GW==$Ok{-gI^TzPYL{g<2dHw(*k}&z*_{oo^L%|2JTMJ zPnqLr_89^HJi8kbmS>TFnShJ@T^z6GU(NAq{;YtD>$)o7Vmz<=;FCUhkRO#eugDV@ zaB<#M9IuY&1_2k>`z*(+*SlTd7vpIPJYqb*C*UIgPks1*?!*5p0T2o~ynu`A+UC*pfYLyl-93W}LrDuU+zx@ETNpQhe{zP?O9Ox!>P_==c7zXP6HjsF~!hvtEVSoKY;qpVRWRNFrU!)*tUy!#aI-f^c|3j?@EJ3Iyc-Nf$ugy8?Gu;Fhoio9$I z);PA^@ZkJdmtxz*ubAR}6vwOYbl(9C?b~orXuG++d;gpo@Yd~pkHC;8PiYbIO6@D| z9lEb``)dC^`yr7OWN-Z*{2BFCG?l(*r~7>XCCp$!GPykHYYbNCFMR6dN8*hz*zGTVW4 z&^3?*_GVnNtu5^zlR9nczNArYR4r-8!pG=7P{lfJLeRn3I<*m#R$2{6*OHxc-@VSs zi_N-nrRR6g@BYrY=bn4+$MfU9z{(}t{8+eL)j-&+E>IgJxRmp3^SUDO_Haa9<@GgeXhadE z;OaHLRn2Q`^SYCvj#x-d#Jbf`IuYw?v&{>~+q)vRXjf}*Di#aIdt$+KA{~kbBhf^6 zN`U%sv^cBN}>eS0@+YaGE-wOJXZ0zs zvyQ!kq^rIP2PoWub!G4*DERdcJm~7;%FG%>n(loN-+1I|qz6ltDLmmuW8UA1hV%-F_Qm{bN>T-@R6>Sd@L| z0r6+=!XA!}L+1f~+%sbI3l4xs`f)C0eC#!^vkZBIBg()|p!z&lhqUb1E&fc}>G10B z>0{t`MIh^ZI#c_EFH`K)^eaBsF)u`+Tp3u38rikup7UD&->k|&5%}%ZM|_#UAoDJ5 zE>rd}KwiP|bKU+AEXu%Nz)h|HgQD~tE!%5p(ys&kmx8_?=*a9pVbQWPDw^~&(D;;S z{16la`Y5(H>;C{slYaVgYh(y&OE#f9&+~qKOtYNz>Bkx~<@*gMehSg@!C+TrTdbC5 z{S34`X|zlSlfKMm){xybtBAW&{;JXPlF;ctVb!v=;K{qbO!;AKOC88G&+saTnjJE@ zBG#S}bXz&}dY(}CL!67amxGUtkB@^hFa$3Ixk6iVO4*~|7$5g%W;=&*;$ebdT3|DH 
z%m(S@gao;0Ojp2J2HoOx(%BVL%X9Ohm|6eslbFC8H*1@;&Ds_%cw%ZcozVKuLpGgP z27dk#^gcP;ARTA30Z59c^R*p^JcKkH@PNA}MM&gRTHggQd{r5kp11v^*7qi~zoiUZ zf$&e&mkU7|_ye}<7X*bb1Lfcea2vfojOmbB1Oa15FvGO|$Czci&2K=NW`?Ux2I{Gx z#aO1Jmmt&;Wv&;Nq*ttOShjpQdX04rYQuAlc~D%toYh9~akW{Ly+8PHeB7I9uF!U0 z8o^F?kKs!h+z&V!;ma^v)me;1rueA>yzm#x2XRj27t%4S-%X)-K@VkdK|H#g8q16__J>|C~pP` zqp$wwR^swI&@Sk~%{m`Vfo336iyNlm3c&6?EDms+ALosq6aoF(#1Q~FH)TuU$paRE zDAp#Xsj^RpQ~cYjTPZ`o3Yp?2B-|qf^BbfC+>Y>B|U=9Dv3%o32(lThgHK8E+B|&JyTrm zPE~P5#WES~;aQegER^`NRa|KppAT9Pom%8Iq9Ys+C*g*X45qp}!f>x>OC^J$RCg$u z3R5jr?TM%Of>pcXX*CKvu_`YqsjgT%mA8l6(xz5~63J*d-jVLCs&m;Gm1|9MjY;<0 z@3KwUGfc6@6)PT8s~T5raH)0f#qMggrn<&cz1UN$R;>%StNu{hz!!O{T`-UtVlbAk ztz(v6hh}s~yXRq2{qS>qW-sZK}^_Q-; z|Fzh9tjK1s15C450SCAZc1MG~OuJp!$@jv|xR_nZI~&%#H)qDI434Wj#R@I?iWM&w zd5V?s8{3o?D}y_csi+`ZSXmC`nJ6SH!E*$tSlKix=IU0KKO$CEp5w^M7kFi=9iG&A z{C|-1OBhZWEnnn9F?)f6Q1=P^tO@^;z|B*zmg6N;M)0t}6_cHf0{@f=-zxB#CfrzD zB@A~7TE4=C5;m8DQ12G_JQKd1%hOo33Ew5~H70zI!23-2H@G}Cf3gDaH1WU9jadmSs8BGlgo=B-dQw}=X{L= zb`Bm#p5=8Nad^zd4huLRKC=K`Spa_k@jU&qn{CAIHWmyvdc2q)Z+=mYpuhd)a=sha;9W;k5Kks7XSR5`<7^D4LF;f88>PcPkPFLpnq8_Gma5P9_scK6U)$gc^n#PkuNEd0EF>aW7`<9)fQX zxS9WW0sdaXw-Y-*Avo?4vfpZu{~~tqw37TWfn)sdCHNf(K*4@L z3yIP= z&D73)0>?N|e;y)yY9~N&YNx+|o$nAnwevlKQ#-#eVCT<-Pwl)#aB8PS94+Q?IRuVz zpmy#iIJL8(0Nz3DW!0>?Oi4jyTzmhh>c%LxDTgdZuuKS21@|APddN9?>*z|Kj+ zr*=*g+)3=HP{)Gf5)>{ock0ntI@Aj9bkcZt%331{6=E!*2MXTa-$N z+O~<_!kxiLG879NLAmu`+-vb@73r5VSf8s+u83ne2V(9aCbs(=UZ1N?u9)@jf#x!J zKB+#QEoid_9$M;!u+`Kd_Ju{lTSeWh|C}J8f3!$F`F<7}v2Kpv89|WuYiYxLXMPeI zaBbI1f_OhCh(8jB@lJ(R=|7(3SWd!&UEzA6Wn`KmRZ6%;ww&h^-MH)B)^(6qCsB Q4*B%A3je2@6ig!fFYh?E1poj5 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_spgemm_csr_asparse_reg.o b/third_party/libxsmm/obj/intel64/generator_spgemm_csr_asparse_reg.o new file mode 100644 index 0000000000000000000000000000000000000000..31a58a5873312addd7080de4b91eaf27bf485c09 GIT binary patch literal 13232 zcmbta4Rlo1oqv-F5Fz9Ri*>p}8M{vn8siMbN)-3xg*WD52L(ygB07X*0)fd#XC{DK 
zT7xr#JcnoP9=f`xY?nRD>2gl7aE`2tN@>D3WmhdKTB(Xkt5f+hXz-KS-~YaQCpW{4 z?z;D!dH25G|MULu|9-riJP-)pQsQzcqPUd5Q-zdMXZwYRRVr?VyA-qkt!ir9^#8TYcT_EylUTDm%8ac^6st2utPSFiRC zd#|q+lo^w~KVpLaSGhcmdZ;@ZZVSg-7UvP_ixFnz2+W1y_FKDp7DmF7;{t%^qUpH) zHRObcLeYi33(AGg3re2%iqbUKTy>(XL{SnCRJxYeo7M-W^;!G}>QnU-)u*BpOzY2y zW9oEQyvwvNp@FJP4)$ELGm8{3t<(O#_Xx9gnbs-OdXv4FtM-t(d+yz>PBOsdh`s+` z>ZD%cg~FNE2Q#fR{yt6;%DAfcpOocbF8(v2W6jdeh_bY^Ly0Y2p|nKs>t3P6x}(bC z*mC8H9#XI9cjke9mc6T6kyypRPzoE@0 zF>07jBdPUL1I0}5(0Wlj{YtOY%v@_WZ?Tv>O%oz-9&Hg`^IK0;Bnbv;Oa-0#63CD#GLCr(2 zv(=}5IMEuYTNBsH6K|_)8nuR5sjQl;%Vx7?0xm0Raw@e`eQNR=H1zGIWXQgm>;W@ViNP7?VFXCTDwB zY7GjyQ*N!)s?=o3MF?V0TdD6vpih2IKliGE;cV6WkCp27)-yy{)prIA zKo<Rev%O97>RnWt)24o;J1WfSE?@_6LZhw5g4U26T0tiZJYI zu;*W#H7t*w_}rzgRxrT2-Kd>sV;t}woN4Vb6T3?Eo#c=@NrAoULs0k;l$;Hu2G!() z6b1XvtiR8n4O&MMe;K7FD-aJ@sJs;B2bNdjd1U!$!wTBU??}&fdYFmxE;V@^O|(kX z^(<;n$KOHX~&q$Z zG)Nvv8cKoe>|B^$8DuQe0>F0=LzYHIlx*(^<~a?F_pdhPSN zI^%hxwqIB459qbesSo|jI3$(5UfV#)^;=2|mT=aarnO#MgV*;c^fbd?z%)hFyRK)z zGY_Dm-^}f_u2tY?TI)0g8ulldwo!o4+ys4a5!B7vR3Cqj|;WU=AH?U0V>Qi$jFwnRO>zVq{5@=wPYAv{fc2av3YIM6Z8}J{e4$WpT z+-BMht+H-#uS-q+8g9`V;mzy=!A!?BOu6yL1J*IqdQIKAR$G7`z@9pJBhxs`j%zM@ z*#LH`xAM9VHS=d??Fp<(u!e2Uz#i%BYW}{Rb%G<9Ua$4y#l$#x24-OR&kiaFPvXSbJ;m z78p*JVztLoc-ZN7&^j;md_bM|lGE#bhtcD8EP_-YgC?rkoJ38Tl8ZpdzigC?NCitfr*Z}I{{glnsY~S%8HLz5d3*Xb**e(>XOFe@DPo@cy-H*u0N<-)@mm3IUpQE=fFnU!1`z= z_V^6D#T7_p<73YSJiTQQO#U7gjM`RhrJCwNlk#;MT=n{o>S_P`6WiE%C9&I8J723) zllNc-1>x~D>kZxQqimQRv?u%r#l%RD|4%*fsjE)?`Chi+)u~VIRZ~yHJPn>{m#4O0 zneyR?QYpH7n?3#8yvxp%Sp7373Z$2rdyM4`|i`gKin?m&+l!HoXDAO;s5uhOjySzQcL6NP-mGvB*%gb&8OK*Sk_IJ!yL zfkXP;lnJbxcVkZ8YFAG((v4b`o;W{gxjT@))J&WxgD~`sueZm0@6hjXto*tDnYbI*4=p2bUmfN$ph{s79?WH%bUsbpZRa3p;MA3RD^( zubS*Zf{NIQRIy&lET$9C1xKD;dhcsE@*Gx^y|8^DeYf6w6!fEN>PakLbR_y5gS(R+ znfC^9l;YpUI!%+~dZyY1Qpb+X-CN_c1CfqAkc|A;jmOVV%PY zXe)YRpNq~mQ?r?8KgnhvWfRx1UNI1O_`;&Tb9xVX*QoiB8nl$FTXs07I(5tUD|Oc= z7(j!@WCF&74FQd@FS7>0rc^f#dpa_Agil1CCUGSX|Lyj44^Q1A*rsx+TZ+@Fle!ND 
zfU{(!WWj1>P)g}@4NSCnz%C!h5;1c>xZ2=9mRHQYa}9iUdf_ikCfBzPuXDVX;rLv4_16ep}~S|A`EN504K1!-RbOZU-n^v z@2BBk2^mOZbl%B)_$eLfDMJ0EZ8ipLwipoo2l~8>&Ws{H){D9Lq>f>b!07{L=3TaY z2eB&Ca){xF9{5KGc(BmM&GG>;_)vF!;G_kOJFNh$-^_=$fO@*m*~s!X6+@l3KcMdG zqa8lUwiMPlMjar8HWk*$?JAkek(BtPaQ0KS$6a9b&joef8|uEJ*uvKHEp5Ga3@Yf|+0xbzZD~54Q4m@>DE#xs@Z+dyZ)3Z0 z(00VdS3}U=$j%z{9a(9;PRZ)cP)POKQ}{?g!#Yi2$=U%rpU#_}J!`1roO6j$dqz!! zu`|MFoZXsAFdgTW?mw%JOKLB{exPaXV%}vwnY(x_(&3;K;rAQ<3jv7hko1+j3RN3 z#5Za*;!J##Lp}`gG7=w8;_I{%O#Fzt`lk*Zoxh00r<3>&7H;NzJCP}8AU~R-gU%y5 zN$o?S;5N>I?vKF-0CgV>Yc*Qfoc?nLa-p15zy)xu(>@m}#&C|Kc3>>2n7~vxgUwXD zgH@HhI1UvRq(WmVHfp6vf4|FXZ=x5_Mr7HV_?StHT4A6IL7-j`ekM8Cib;6 z{rLfwO^>p0z|$+?2nM3E^cBMnlnhWTA7_C;ri~*Sj-TcG?It%!Yqbp^8I)wFk*6e9 zO=6{>ZPqpkvDb(Ov8fOn0n#>h3e_nt+qu{T5*rEHdTl!sLt_3L7tOZ z2BWY&Wcc?k@(i{Q!oXVAXV&1h*RozSM6x~0ZO2B!wqYHl^%G{92hQonm)Kj%%p=^R zoB)$`1VmU*>@H(|IbHhxyiDM6R@;te507oRs^IxFhNigK#IT8QvcPP6y zipXyik>4l=f8$W(?H?gw95b+avjR7~bd-44%CP=0okIpA!we$Nx<4XY&a7n(o@m>I zEgZDODD%k$s_vrJ!}LP3_LW&}MFEGdHxE8`?Z%h$ZgrNZeW0NAxZ2%WOIGbutr| z&Diy^8kUK~+$JaHBN!FeNy-uTB;ly4lC0*?#T-sTWAWzp`PC2}WLba2yvv!%CFn`F59cT+C}B z^ka?}%2hYJ9T%KVoj`SQ$x!~VH2~OV{PJy*#I7RvA;Jr!Gu6v00%f?ow63;9V`#D(zgBKYG)aJt+rq(}Lx z4X36>35DqYBtoH@sZ(yaVVY7Cobg?HOr?j99)5aEp~v;?l{Nj0?q|sN2)=esvGzrs&GDXSc-Yn#Y6#w5uLnc7-*U^`Kx3#;uMY-7O02Ux+TIy& z%js$E?hbdhhFZE>NjwybgS;q5{i^;dZP9Qz$KBc9*&ffmEel5%cE#G`_vets&7G~0 za3~y&c14wr=18QgB}W^w#5Qu$6zXViiFSpShNGR~2t4d;YhQ%mMk33?E!gvOqS0_H z91pFS_Px-eZrb%j9q5$Y3x^@n)zuw}G%tj%c(l0%YI7P$wjKY%676CC;1cRWOehrL z=8mE~!H9 zK8k;nDe6&6qzfk{-cv=X##%xJ>Cbi-e2cMCM<5EVzh3+VJ_~IsgkO%Iz@7El(L?_W zAn>1{O@;K+dL{5QfhhU; zys^wIvn%`aYYDHG>^{x${P8^?@p~oyDt@G;e#v_NfffsteEe4&&)5H?#P5Zih{H3q zh@j-_InVKYJ(tqr!uh?Zg`TAnF8ldY377f*BH=Rs7@D*wZxXk}& z377e=;s;vNFOR#w2!2x$yq@DU*EPw{MhU+{!tdia#gkH7^lPc4|4IqpBH^3&4$ zd4=Qo`T156{QV;M0LSy=;N~B0gv)WL;COxLv7LKjxZ1N|GT1Gr6ZF(hNQHP9t8?NUgap4qeYlIhd7P8ym#@MFHdJc z%9F1vPiqV5oPwWR>iEkq^9+id6mUEEsGZ;cW1Qg$u18dSdlj!rD_S%+>T-zFn5H>( 
zAIC{<1b)IkpKi+Iz{pg-|29oflu1sNdkVir{W2)j&i8*SmlyXWLP5U&FMv?U|Bt!< z-P~c4qh`MU^w(&z&&{Fblw-6wqD*$G+|!$DIgQJ>JU=GnsFjqMC|>p)dIc?TVO-pA#K5T_;N1m3{q^ZUPm%bR%uB7Q=?9`r)-Uwo~@ zA^JuAr6Zz{7x_c#^KFylCVm;em6W5><;&A?J%8>2QtrN-o{I8Dp7OlzFujH3 OALsth$rH$XRsILtUlDHr literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/generator_spgemm_csr_reader.o b/third_party/libxsmm/obj/intel64/generator_spgemm_csr_reader.o new file mode 100644 index 0000000000000000000000000000000000000000..ca0b2b6058ade1e02fde9718820ccc6cba453949 GIT binary patch literal 5952 zcmbtXeQXrR6`%E;?F%2bhLpyER$QW`HEB5c2>R;I(({-EeLqunAD;aro|l)?d0? zK1&*#^8u^PCP58VT8T*cBT7Wu1gWA@%7<(cWQfvGK2$zhg~Ux$oIp`gR2oXSzBfC2 z9-qC9)Qo()@BL=xH}Ac9Z+85&8f`2Q1mZSC; zi8B>6rd_H;cd85S)G&8+^XiD<2?zz+oS=bNPBqv*HFrwv)2OBr6?bAGC>nSXI_8I! zl-htIHUtzifR3tS13b(mv05QDEGqt}z&-DlKXw@zzDBQJmkd5F<&5&KSorrIsi z7jpLYf5hFhA=V0#rNqvtq{#!Hl*o{!f^07p5S{}A=b zq}kE+fe-z#fdE9!E6#ww2`x(8=y>+iEb|`b!lK>dr28ug~RDWFt;4%!D<1VgXT;F@4;wzOeT(bbSNyj#U2rw1})7j0% z!sdxlYYrl^)l7+2X!bPJkG&BS9CNhZxAEBf!5VqByhi?^9D57TBgGt2OqIyLN2csh zxtzTN#XBpX&*#n>8ORM|CFX&KY4Balf^T6GOrtsQ51MFyl(WYL2u2RL&M?`ADxosO z&IbGf!;b=f4rLls#rzY)4-vKYytwmqC~L($E8F#utdC)=_d#Mm=9BI1a3-!QmF;vH zT?tYdEIkf&3G{3YpT(GkmZ$J3c)tSys{rIYbQGEGQRL&JhlRnzxdXNPLS^TXAl zxz%$c)$=3sLRCwXiI5gI9DGisdN%nQ+6hhAzLW@CD}))7$^*}X@d?1bL3;?f5dfh+ zSn*tm8mxTQrv^i|Ukz40Qz{3mca_P(nrwMfu)oxI%ReDl1DG7Fst<0>`DcD?4*H_^hE?z3)q?CJ6AH2`gsCvro<|ry(H+R%H4GomV;{i{8{P0TGPPvHFS@6IJqf>x z(L<@$;els7@NFJ=j|aZn1AoQ?|CtB=vIqWK#K#NH`ebJ^9XGntP&(_AP(dT56K*51 zSZY&eTPzlyH+ODi9tlSqs`2p%J|h34=weOL$Lr;2?D59NWok>TMXrykF_P-)PIi*i z`lOK|nM_-}Gez3s?d@G{B$ZAkNvf^AE0Zj+c5dw4XcV-~$@IFe%tm8d0of4mOtdFs z$#l9aO%{NE*w97H4LB2M(-DM*nKT7{F8&gN&=8*rKNqi}2bIG!iNhNij^pp;@LSw? 
z{7|?&d5`fawg?=T$Q!0L4EEMp~!-S>0&40CEW7d-f1^5B1s z!+F0BaX25(qaOGP5B#(TK9wEy#qqqK;l=U%4u|vpE@XIdJXMaLkLP+0=l$yPz_)nd zy&m|l7+!4udmPT&{}+ZA+y61g&)fg5M}CGNFR>vH8$xvw&TQ*2;_KjTq@DK$S2B`Y z4H8bojW`Lf%VbD6-Gxfn^Nxlbk#GbCCfnm-obElgz*hWeYU}FgNOr=)on1yU4A;Fm z86)2IV@8_X5KE=w9ZARdul}3?6nMmsiV5g-@EfnO&q1N<0gw>_y7pw#qTw|%s_MFgz=jIKX<;1qwU3O z14JruFtl#giQ6Hn3w%`7@&9hJApwIoDk384q6;O0NDwI?YA)P`jcyPKFBJ(P2`@tulTCsOnq{B2 z>$TKcwY9C*+Saz#+G-;LLIN1Tw`hGA5EZ!wQBW}~viW~!&b|8x31Zv-@Avuq|Fmw- zW6qrSoS8Xu=HB6SPfD{|E#^O~bj4NIr-1EpJ=ivP`I2R99=St*Waj zEw@)xm6mvxR8`s+R@K;-RC>y5DoZNtOY4?ZFR5H)uc@l@EU7FXXCJhrR~(niw?E4j z$)LCzb!nGTz!e#j*SwEH8$0liLlsLFEU#U*tawRft*550lxit1si-SnR=#YUy{fX} z7JFT7x!qkbVNOwDVey^3a;2ja+QrlZ zMXOGmQd;&aOM!L2-0{)sR>g6a=FIzOtK!I^amL?%1Sg&Er#}{1{ipBKtY`Bn==(U+ z7tK`Pm8E;P`r5L5doq1TGu0W6?0o6oR@v8{<~i@#Mhp;wHqROO&3cwmtZAl1N!h1O zwYp2+c9(7{w7yyt0hnf4 zebJ|9$-a*=eMd5-ds`H>%rQWTlsR&GzO3|&@~MmhO2j?=xp$dp-inz|WMPF9kmHT&(U;)R?TPjVR)_?r>Y*l$!c` zk8~g^h0eo}`ZnjwzOB}9e`9^u3T)bI2pGsPMD;$fhGzk>R zmwxsfO+-7!P0`9OR+Vs><1BeSO_07c*3q$WxH&&5()=NrzO#IXj1AkcH z{~Z^h_H1sHwaHee){y2@U%()#&pWk6>2gb34#rDr+9|i_{V;wuIXob3a(KYV$>Hn^ z^rvVHxIdWxh})4vYoO;r&tNQpvzvXpZJ7H^7Zw9gzw7?v7XYRkbDxGO#@x?!WKY)0 z{$)|DAIsq(m~5LA=!q%{OSdZOOSDENw%XDhdN$Hrkt=A5)I)jAaq@5#lLX%@f;1T9 zW9`)d+d#wY(zu8#l9d8SX#U6lEtIO5xPe5-v*lR@8?XxU8}Sd+<5>&VJ<*UzF*e3+ zHf^HSt$G~0l!(VMM$v9a({9R8)D@0boq1T6rRmPV;f0D?;rJJb1T8%&S8?nEmG}lL z<&$QDd;@#9t``ree-nm6-bmx&J0eM;4-vp-!ma}85WCB5eHDbkIq-n2{tE+s(H2?# zT5kC)8z?0O^TM$l#C*>1P$g2~$e$P7a4FW{le9`(O+jAT=otSI^g4cBlJxkMV7@O$^)m%i-*cw3~XMCv^^||8U*;H>g$F ziQT>0nb(}R9pk0vU=K*=ThKv+SZFs|W%Yn8HJwpteNzyOhU?US`JT_0!)G|dr=}t@ z_m|Yv56wqAxb^BgzU|a@Y2phqHEU0%jHbywUkztXM&tByR4Vp~@A*bW-KJf^8@5KH`< ziGxR8M+-^)8f(m4Zbf-_LV+TwIMJnABh)ll^SCR)Y>NfWi*b9i(rfO3z1kd`uQ@}>>hI*PP0qe?vuLwCd!h9CVR3lM>n(PRg^$pD1$#iW ze-HsGs@t);;2CqXIJMezvupU}D8Y$Q9*e2!9aJ(($&UNgQa?2d;^php! 
zRyg`exgN(5?6s$xF^px7M-cpdKJC~)AntZN9M~cG%lQ2KN^aSI77kD=hQ;G}SP6R^ z_sQy4vbr4^Ag>>MgG`)q?Yvu$*d+fWCOl}|?A8ykTeCzAyY+*R5Svt>_u&zu#Kyu=&tc^hp!bUUIrR)_ocnrr*3 z-1BW`*me=-G!r9NH-)G}0f+<6|NP_6e`nrj`gL?nf+qsOY(HNL-if`1)0xJ}f^A2$ zVbL3qD4m~=1=)_4&2`+Tw&CySpJOVtLaY4E(ZG(n3l*)g5rt%LJ8jMeuwpOg;~Wc* z4yWt43Oa=HPW6bQm1EXlZbTLcfmVle_Xy72%RwFb5lsm9=|$(QGdzBzE)y+Tt#0r6 zE-F;A-&V3L;lBDnMwbsBjn-Hej}-00kFx5<`_?BgoU%jzqN`ZCdwM~Q~4AsPZ|QbyFIjHpS4qn$0=M>I{5FoE;A0Ol9CZ>SW`he-gd0Dd1WUs7SwxGPm zl%2i@zM_H&_L8!)n)2FO5*?RSm6h9<)Y@yxZ?0QXgGz*e8JEtEujnM(U0brO99S1E zsfDm=uSBfYURqLFS>>@WC?}D!%)YFy!n35hqTKFTf?TsNCyBqVvbL_ex~hf-roN)Q z62(hd0zJv{a#7(WODZcMHro-TA)uzkarQdd`I+pT;+!tK?at}bubpm)z`V=&BnCS< zW2Q?zR-}FjaW*G1!-4bILp)EJV=4l3yhnd(&!~|?J5k1abTz`YlD6)RUC|%=I?|=k zo0Pn=-+qX_N@Qk^6g+?g*%!4+!2v|lBR6I#LvO@Mb@U7=I1VY^qr>bO)sp{l4qr^+ zzygE=hu2&}5$D^^y!RTT%RNqc9g;1L(OW#@qzyX^2_UZ-$E}UdT<{FMXBAa(WEDil zGQ&@YU$z2*kOeFV1JdJ|s;Hl;J6-DMbacCj4a&!E$T4?=j#?muTQpzwu3I%Wwhbhl9G%CK!t@Wq4Bit}KQfQ(6po1IkF1A{3aA{1p4A$7_UcTiU;=|G; z|1WW_K`v;xQW<*?0`^vA><3^nHYr(e)%A3a{Ynbl27q|)^<#VtDl|hIhVu;V)UQi{ zcn>k4HrMrWwBq}biSNLJjbC1}j3`5`rwT;EUQ$yov_K{GB}*zw7GS$JEH1CK*Oq&r za}r?mx2l@;Q=Te2v`FOx6Z@y@_7;M|Q8Da0)-ffp69JNE&ABXZB#?Y(zi%zCsk)fR zCCC7`*d#Jl)#WuM9%{QCnyjjtTkK`!wP-P`(FpU2$MZiXliz`VS$WN(ayvDgMuC-R zCHDG~iX|Y6NQ_g?TKkYIZyJ%?9f>BDV5g(N>{plPim_JWWG;zxes{SkRrcx5Irj3( zQXCpJ_V|>Ed73RxpHeVo@;Li-Rdu{-N=qt0UKiQn&wyRE$VU|9mOWX6Sc=) zTU&l9wjOI~$C*ogaY!*6S0P2so<@?v)2}y0P zJ*fd*ci-N9|BViJQ?ASY(pu;SzWaWDcRk1TtCt0Xb1S9=^%4L6yrQ-^)oq5xocWB` zUEcjJND3r8hWYPPjz{s=8_jd{7~+{O1#bg0;f+@H4ea(PQm8k_u6JbY_KcQ-TNyHE zP>WN#6{+>&+v#SRNLVXWP)&a8bLont?)1<%|z^=aCvhO^Bzy360 zI{bdHGl6EwzXE@K?ZEax7Bz_aBl$O;M(x-RY?Km2$`z-Ia0ubQWs7hC;W~s}f-_h< z9@CJj7cETXU`W38M(r3tDd55g(NOv6JWuNUdWd_Z4-lPSZ*-pI5_G;p`dPEAZqE02 zc(VM>QlK|l#bXir3Rt``8b7yIflCDSt+fbSr2CrP>H)X5!eN&OZYGtvjrpUs5;EOt zOx?v%2tqR)Hm9^xHYm(d+FQ^QGWd=s4k_veOvP zbr(|D$>A!b*FB++LBzPf@J4@nQN|uAG>Vt|Y6|<#7UhNpP_8$D2KO3RX{j8AY%K*z 
zA4_eg1*0Q{>ZI_$5Z12&cN)jv?;_d*&rzw+EUrcDiYFOQHmBq{2hfvB&qsNQ}wseeBm8rUNRALD*JMB!C# zz<8s-<@A>T>Ww~h(a-is{uYGwOSmr7evC$E0n4qot;-lrem@W(> zM85zcgPuims4m}`NLI98eDXG33m)wu1#JXZlLR-A@qHFW@(s!DxcwYz$3Kzojh;P} z!rM7qN$G(R0M*wZA-+CNqjrbxrM4&C*0(!$GyP;DXVSNP>AqH`ZpuJVIS{^l zXn<`XdA$48T_&C5D2@`Ms)Ecxmx?eQC--b3UFto}a@Hn62c^Ja^hk-+ffC{@`w zGRKI#<_&a<71 zOa6OsXi^w-G%)c9U0=S(y$piN^Lgq2_7o0TrND*AU&o;5Tk;Oejo(5~zy}*##u?r$C`HW-XbAEZ#2 z&YCg6Kw+ZM<2lUqCNKicT}NbJAB>R6r{(nT0sQw6{)Q_D$QY6!x0*d>QIXIRhIN%#J^Hg~z-nS-1PhI#Mg{T=vxq(f1Ig{1O&qJS1Wm9aD*Tk-hj@`c zK}~AIFeK(T@9?^Xm_|&7AAc}Z*lkkab{@22SPdlzVXshUoKPdec7@tQQBqg?KEc&Q zG}DP~(N0z@vM*-!?d%A=2lf|J=+sJZJ^C^mp41|)Xa(&!Ct!238CNn^$$tQe2ti-v z;0bHTP1#xz%8~*DsTFMOLRZa!p8(ho%qkUHn~%miRot^3&7=#NO}4_)olfm`+^=o1 zPQtw_SwR$*e&cJ+!Oe%eblZeh+epc&&Bmpm`z6bS@TgugY$RX_kuF=exk`6xNGxfF7>AScPo` z2W67ECYq<bcN&1iaH)G|m)A zaK|J0A3y?xC@F9^0x+p@gyqJLG%556uw&)C<$<^{Sr`jUJAVGa?AECfmGeRGT@ipfwgl-6exPZ0~6U21~d47Yt z^?Pv$h}=2x++R^H2|oJrcy6NX9h6J#uYNsp4Q62NEL5E;iuUB@B;=uj(iW$7GwhQ} z(qVw4e}J)P`sHhe-O}||S*uB-vzhh@^_2edM$vACSiQ*xUgr@`M87$&wZ zFgy$vVdF4E~X(G1+*zvkOf&=aSYpiXF=&Zpbh;&8)l{F zhev^M)Tg;iV-v#T#|g`&uh7QA(j7*PfIZy-SeqV0`f4J7QmB%SUb@acdxz}X1FEt~ z=88j$0He?2PKy1Di)Vd!Qe2d8LbKM$_z}&e6;@b^3&AZQXRX!=VOyF$iBBTX!+d1w zZ3+HLoS)aD1q~C-^YBgdUz~@Me?R6&FXI!B);*Rr@ne|`Ax^qloCx{^hCseJfrnw> z2?NgEzz>jxeJArVjD56I&qs64S+#N3(EM4MenSo>WP5*>X(?)`q*G|B%elPZ%sDcT@M#>@Wi?EL3 zRsPQdL>R!t=ud#kCIR>2Nh9ENT4fCUC;`*P*L?s6pOpmP9!LIXQ4&1Gaz+jgMlTZZ zXms0eL=)2EP}2#DWPkl)Y63TqR8^wRQ6d-HQ0gMLFXf&_%|G~+S+*cvw*52443hhL zJU3DAyCN3^_8;-wf$_5IDVL;L{l0kad!mo%*{?-ejKnQQuBS+}*U#ms&&9|w5fh6k@vaK81MRc2MXJ z!naMpVIVh%5IX)8g?NDeL?J!`9-PPD(aI2ecM@ECmoZe<>u;xCTr~iwkJH74*MgfX#ZZ?u5>Td%%>k z3ba{b3+Z(z=Z-LrwQ{F&1(8!ziouqI&KCuEgN(4h6VpF{*%4o`*-nsN^pbbv6t7>RMjNk z=Y8LCD5%6e3M|}lFH>S4X|Js*#a$-u4pZ-^>_Z0C;g)1cF^#0SaVrLBp1rcXysXxa zYfo4;Bpb=ka!xFs?wnjabBf}e^j~XVRrSSm`9@|NHOnqRTW}lYDX%QU?Wb`Mwxr5a z`JXoyEAHQ&m+lNR{#|@0zIFWf+ey11VfM1%7BPUs>V{DHP=JdPw*|? 
z*cF2_MtH85LQA>!b1&;X#N(DiFGA+@#%!1MwtKFWf-&^O8?#)g**znq;6ofEeXZvL zDKwp9HGK;mo<81KP2Xh>Tu9;X#SSS@4<0Du?01E@Sb7xdNK+mWS4dw%BN@%$^cUXPG~!wPw>(Z0E~GksSgO>27eqvfwa zr6UC|A&`eiW{AC&L-#i4Apvp^kMDG9(W+IPTSVcY1-aBxg8Yi=E@$itjn#GidyJ4m z<9VOWp|EcUmt2qii2YB1!Iv5DpD&`6r#NK=r>x_YhbbjAj#K{1DXSU7GaUAF_%9R= z900GW*CHWl{1yx$`DY&C2u@+TByRWIoRZBcHG@e5IF3?6g4*!ysh@5KyfM0p#!D&= z($L-ZAt(w=+y(Uw?*QkV`b5_a4kA*|N|0k8LkgEKH5eVC$`QP{T!S>D(=S7@mJO#%zO$)k1lcuLvAxnns@7gBmI zev;R>K;UCt`QEN&af9!*x!R-8yMCY=Tk1H z^aGR>yq#M*n^WFr3}eRkZkPP8a(Ei02gV_+ZzOQOKZ&iRR{BVD1==>QH*h_Sa6`Q} zb`@4vJ^DA5R+;}!hUmo*bjRS0O{Da|rzHPZA;G{mk=r?sA*OP8I>P#9fN}q>Q}M^gT8&HTR99{+hrlR`#+xXx@Y(VKHERN`ioz<#)#=lK94-qer|CG_21j3+RydIPDT&lC<$=Nc~Nc1)roPa`u<|5sc@^kO*D8_>R6nR?i{ z#BI5h(}zj~xCGO;9wT~JOMYsJ6uOJ&yEogy?ng?)RB!C;5xo~n{+)<=iDhAewcF4 zzz>htv@;3QU7UYChgVVlmqH)ojn((XNf6w^{f$vLL<8rIeM@2AEH3vM!u4)%>}yK% zPvEpDhp8pr*yj`uJOq^bAS7_T)qPJzy?r^qy03(AFNF2m%zE3XjPEd_-q?GL>r*O! zgPCvP^X^SV>u>eOMssW%LyzU~dJfwt92ktS{%8DPJUoo|*PL72*M|Otxj*wL>5RdOw$JV43Z-Jzow*Br{Y(;tNoQj53==vO2AQQy%YOgyX!uxmuwSmqTKh9 zEAI0*GPf`biJ+^*hxo7N@UzT0JOwy?EE3}T<^hHuOi4kulaD>fDSf!qA32=G;a@R^ z8p7}mSI2bWmprR`DIwUGPs7JJ8oqnxq{Dc;zhINZeHUsE_>M_B(IffJp0 zuWtZtzL%oGH+f`!!Q7mmQ|{!HKXD4RgwDZVbF2(8Z|qSH&*AVdDIC~@3lx1a6&B~t z2CjcJL;Re3cM-z+0|4XtA7>1GIe!g@dvW+N#^xbxM;NvdO+kBq$S`}6M0{%w#a2y0 z49}M+9DI?(xeV2eu%3q>toJjipT>IXLt)>;47sUqHuWE2+)IQ1rC3n1fTD z@_kLn(t_*W!5XF$?GFUrkHKk&(YRJn8&=tn=8Y|*aBvle-=MI+o(g>O1&xA%|9)=6 zJcjuNho=+FW`<$Ag1`za#w&#F47JZxZB9ac$vV)^s!iHgG+>^dWHJ~SkD}^3e2Hu-k?k4NC1F+HaqF!fFWKDLuWND&RX< zuI2cSoQ!@BFGPw*#&zWXef@Y>`2U;w@viXyJNogi@TBO+^Zs}BX+4ePF|^xwfpa`uat59ZoNLguKJHJA^wB*{wI3-7!t z*4=vzdKWqb)NV1ot%QER%u!8B{^dCBaCklCXheCkFbnfv*n{D(H^z*k{}deJQt-E& z!pbB65rn1CYd9jjF`fq}pK@4S1B4KcA6JGo3cckN4@S*%mHAdf?;o5lss?9EdjBn1gqV0W()%A~ zZk+gg=>0>Fa*X(4=>3CBIY!a|>HUeJ*<4h-<^g*h^`^!o@!%vxucz2DL~Ck1W=D(Kcl zWEmpxbO2vKY7fey4G;LJNr>i%nG<}FCyuh3iDKFrS` z(|B{6WSNmePjT`5mW(q@(^?cDwkAhB#zi_Eg|M#Wy8ymIN-$8yt$-c30v&WKunVog zOX6%-xctqAe^`Z_yuD6jkq2%_vwRN_3eSHs7wo&X~aD9S|tAc 
zaU_v`Kxpb8q8RiPd^_O38!__+A;@9ATL^p!MYLXr1g?J$>+=_KzL2l4M_7LuK%6gb z@g{M;xEmNp`H!3VLWUfSG$|y;Q^=8hxP*`+v#7)hq5me`1MWq5Q)GB!LSM3z z&<4rso9@aA9g_b=P8rWB;_bp(PU*!d;s)qZ#z1#;c-hQhz9kCK3lKempCo;t(A3u> zQ3`J0JuWoVWt0-~a0=_h{nHudZXRu+G0#Ved5bxU!>m;g{Dw5z7a{@jA8B2skb|j! z(3SRLNTEL@qcAB%_FSfhdp}Yds!jd>PKr%{=8v`SuOdcpq>0~xus#Mq#yS+Y439H} zxQ}?4Ao?-{unTSZU7Rnp<*O+FFci={uR`zs2De1$-4`&d(8mhBztG2$-k%L>xZDH7 zpmhy>#QpmPg?$%sIiZ<74`C^EFXtAr#x$FASwHKy5GD^=kTf4c<4$%xQc#QwY3=Kw zzZRPD7m-u%f}V-B;$+w3jR~#z-#IL_;^!l*{{}yX9IG|?-ibxgSKS0gUN#P!s$AwP66-gKa)}*f`nBfeAdilP? z(0_=#fL|a>Yho=o&WK=yz&oIY7X24m^Li@2N=K>|eEjux=)<7_DOwr(r;Q_}0YV7a8WwN-qMsog09Bac6KCQGF?=HHvcL zUS%he>M?%eKI>J)=;n%Uy|!?eZ(IU-2GBgAOe zlAf5Fp&x=G+FU0amcMc9`P`RM%6f~lXq^Z>`&QOj3qAV;#=|#y{?P>drvx5TZ%;Q6 z)PAA4&*2h6<8Px9%ejO?<0SNb2N*x=@BDiZ)@LyU@>$O_#?{(Rs5FHgzBH#FZ4KfQDPc?;_FH1ryk?En#cO7rN}llWb+WF^-H&~M2Ex; ztcxLpMsz&F`V*|>=6(px=3s^pH?sW+f)wsdFXpoDvF*lmlP((P znQteOaTc46{As|9VO=O-QMZIt{ubkTwy(%nu2td+!l z%sgb8_c7B6u$2MPZgH=2CFd`oG8Nuf3Ey8_#A)Kb<2-~L#%aMn>PJOwy$OjINHCy#`M?zErzu4L5@xg6i21pkh(*2MQ0wAy}4 z@Mqyi^h?;B+{N(1=42HC9ESFp=Rw@CRU)n70&h&*jLm0gzGL&xAn0uj&FvPqa91*f zuz49t5Wg4r*u0vq=?%TG19@&?$G!G=y>PaJ6>>>(moJv?$)abCeNa%{xe-#HB_Zbo z`XMzg;yj6JV80|>>3sxoVJMeju?8L`TMqQ`?ahSuN;=a*=OT}kx(J?*AW1s`3g~n* zv2e9pxED3_$yXicP-n=A70Ro%Tf_NyqtS2`oO*p@yw8aAc|tA{Vh-ENO_!%Z#W|*n z7Z^5b1Fs7RP&tNN%3@YrxSH!205O&1uYROk?n5G0Hjf@DkdX`dDu<+U7~xSi>6F_V zPhOMEy7RT2HN>4&_l6hNB#m=K>dI0%E@SKW;l3adQ&VEMA!h2Pk5@8xGGN9+NcsmM zG6w?`SJR@;F4iEsgxc377jY7g5X40nX$_M8C7186ERA6TT`DG{Jn5Uqus%8!I2mPX zL8A>x<@34S`J&zV-0u9Ti%D=##hHR9x&5R?Mup#9sQdnRZUa|;qD~~We?+|h=!4Pj zxfr-EC&1J{cJ_l~+I~Z=J8C z92e^K{p6p(cepKS^ zC_%Vxn7nZghF#vU5Wo79Kt$(cRHz!+kx5i26Zb)v184Q(3j0xoVT2xN48n$Q59G6q=qGKJH!5Vcew06yaUfslfNV&E-5p~T>eNG| z&xs0E>|WC6903UYm#{n$Br9)kR49D6Ay*3S;IcwH)I=$vhd4!OhaO>ULOXOnVf!;x zNaIcVSNOqU9L#S<#SM84r+By?wxICO=al}u)P>~1UQ0tG#FECw!25E*U|DmReOOEs z^=PZuc9o=Fa+{(LGel)&%lqC!e|o}m9l zO=Es9Dr79OQpMb|EJWHW*l$LKJoO8Tjqyf>=8=3@V3fJNjR@;MWlU(7kTEJK?G8?B 
z#Bf@)zds5y1eP|)ZHn=bkNUwfPG}g91+l@S_+N!3(`;HWIy2 z)_nRc1aO8~zmQi+TO|cgWvs#i|QvRt%yQm{jhbBuO7LIwrW3EV}2MOEGN|D+k@td=6MF=61r9wW!_-c zE9_K8A+2E??LUGZ%+RbY_LG&WPVQuk`4aLVyG=QZQ+b@RuXs-OlMW{;Y<+ektbd8( z+&@EJj|#gCwo;8w;@a3s^#v17$O(^7*{3e z$$VwVQ?wrmKQBk%=cR~%7XsL%J3_V?k7U?QMp2P_ivJ+F zJwhV*mUK5#B3=6v%TdQ5M}0t9U!fe!8&Q^{?B1x5qqdO#Co1gP))3$e42X3mq_gxSewS#U;>`dw{=Sg^cn-1UYR@eQCw&E># z*d32siqdTTk7zorhmOVE*74ldCEV6+=(A|6LWkgPxa(+XKVQawE&I-u=|62nhGnAG z2hOdzxzy0DO7143t@sN=6l!N^CeYG*XLl){fNaWFv=TJA&)NDkrs5>8b)3g%;GH~@ z=TkV~;`&eLu{wlb{S~HZjR*q1Kf}KWc&wL>b2v7bOAp}Eq{WjrlBXH{ilpZUo@xz)q);k6HFENvIjdR%HzV;EvZ)e-s>dBiLeH&Ts}V>mM0 zfpah7qLSX3bWvT~k1?NJGcNiBsI{1Gh(72Sa_cevCNIIx7=)MYMrxfHL9tf#YZG)_ z-gq753f|*~$?3$C=r3>z;}vV2=TAz86XiMmn*`l5>tmi~S8}@Gbr^nYBHanOnofs6 zCUEP&!jGUQF7a+u_?^9yt**|b{zQ5iY@FfWN6dl#TXP%=O50@Z8mlurDp$V-*u*L@ z`gv`lUbFu^UO!Aurxhn};C7y$oKF4K$r+is{kebH$2l|J|p!R*2+=09Claf15 z3XkDA`ox|tsZ}SIV^Vxf*20fsvfs<(@OR5I-TROHet44nNy2Mot!R!YoWFibG zi?U=oBcl)tHHdTW$WSy*2!+zAXj3bQZaxAb zoQcj_w1^V*CftDy9VAq_GT*Z zHpJgc`8Kn#^i}Czo5Qy+tI*n3X#F&B*n_XyT&gwTvs_0C)z=HP8CQUf*-Fnd)q}oc z_$b##STAUG?YklPDoUki8>w{@Ry z6Ta$_DYvFciq?m}v=xSLlVJw{mnLgd!D}?9yY82*pBI+m0~#|jER(}4&b7d%XF0k7 zhfVbHkyaSt!odf;(K!cxp`7C9JZGTUBa_h&0Xp$%6ZNe^t6pgR+^PMvkp@j_%9XuQ zwnPchc)~pMF$VTMm(LE4a-HEz)S_%XJ;8@yq{RgZy=v5V0F5|*kS+w<2asN0f~b&f z@1!%uG;$!hNdGmJiGQyPd2C=Y8v!9plXXM=7(~TIC?87ZttGu`Y@1K~1M-F7K9rIQ z(Fi~AqEp5LNI%I%ZI`V^To_QKBaVoWj}OzaWIk6QH=BqEDf)dx@VPZZYhYN~lJelV zN67K|goycD6qJI$0)quyLaL@$jd-0Tl7y&AZZ@Uhk3~aC86^3aA_+x=;(^?3>MNK9 zi{lr<#zZdpl0YV8CXyGW(4Pb{A!eP6r20~z6EgX!6dljlNnq)~;fsintLRmuewxXT zB3TI5?@@9T6CR&eA+&9#^gYSxLZJLJ(xu>EM7xAsc|VeXPl%rpD)*=4azfaz<#N9h z<%DcIpOeV-AxaAow@{!G7C`xgs+iXzK?r5WP$cqt6MBmQqJU6bkX)qG%iomsCWQPi zA(+%(2Q)&S{0Py8kuqHjLHwp`DDqmOAH=E={G&G})CcP*`CTN($A=Xx{@+sirsQ;? 
zJ|OD`DfBxrNkVTxUQ6`*7&G-lh|}a384)2*lXU|k;)6b5Z1==DmZYj?52f*#z^t495ET4dm_v7uSO zUWdeb8Bw9*d5okEoJXAZAQjkv0x9DyF1LJ?M=yWl<0b_5n<)K9$>}V3tZG0K5Hnd1 zPNG`04ZrytT@dw_f}^R>Oh8coSfKTj)5-cgMEQ-FzYj&HBPvAc3_@Vuxwq3zzt9~} zzgY3%|BwV7^Jei)5n)w8;4@l9WZhuU9U&l-b%VYGS)xDjV=qLWoD}t&eV805Tj%2P z2Z~snz&LVtk~c~_Zi%wp>Hu-YW~gZQnkD#t$Q{XcK)8Z61d44Oji0TGqhFqE<=Hs>?nlBjH@@2hZ)ARs zv6P0}vlOMSOFdwo_IhvhkiI8G6_0^m=K&Tz?0g^@q(ttcer|Lt+d<+TwS_w!nV3Tz zfS2sRy+*VA|3tHJHDR(9$2Evl1~OVoPFeHOVBs)r7P(9FJJujmzZ*YThlB*uCPWBD zJ-n1y6H{<}f>2BYKShD*iW(K(q&Ye6O`01B$ueKRge60ZaSIN_k|GjCxQ*Ezd7&l+I|A95cmfHFmDHHVVU1guQ%I%X{<}3pm_l0q~J;vxq4c;9xoh15gjFB@m&g)gDrT1ZW!A|%(f>Ef8^s3&`b`2L5RsAEd;D*W09Nw-tnje3tTwpl&8zH$=`TBFZoAl5w!r1z{_}u^)c{Z#nBd- zMa4}AeN-Hu+QD-b;hNK^i=Gyl%kub5lnyWg% zb2r4^q^%I&1LmfnI;j`O*Es@pPT)GBLJ^LUX{*l!l6$yYaeu+_-LD@5>B4@(PE4W| zsT6`*LC*XLfaDmn0me7e8qpKfC!7-}rhb(hjXQ- z^&FL&UV;`5x;uCo#E1RDSIJ>FanQaio16yTT-LH* z#>aRiu;Ag-wb}UaA`pP>hOPE;w9H+)$7}+8lTK_^23Z{XDr6Y*4?kViXdXDcSbzK348+*KYk5DtgQV4BzteWqw^N+dmks0OjWSX&CiRyIapiInS0MaA zo2qVw1e9UX|3!S&YPb5TthN`b@9Lwd+ST&Le3YaQ)V4hrU+=X3jry_ox$?P)%j+jl zz2wJK6w{|8RkVnxlm0_Q@t6LY`kk))a@8)bTez^IycnM~p(i-sOJ8(S_=;RJCrvF9 zpBQSWs=29_zi)=`xFo($bnLWokgzsYS!*_VVXP>j^3GECox%WbuTJcH!&(h*2^~r=VNN9 z=EA_>egS@#Lv)V2N)P**;YjpY*4dxo+lhG#eIFn5J*Su>G&N0g;bX4p7$kg2+;#tC z9;fiITpA`CD{HG0W5t&V3yt&UxOVDA#`*5+KrOzHZNBGZ{bt&jn4lrJpzlrA0z zM}v#1`BmZq@=eH24&k8AexgE#am$H7#^D=U^j)l5;5f0Y%3g2QHv(n<>u96rq%{(+*_X6yTj2a2B;{e(OW+1M@)4dc9kbZnd? 
z;_Dia+{M{}&p7=O)p37sLVx*V5{yKJ4+B;c97vL@kx-}}!L6xVJ!nt}d@_m8mxLTo zx2?%id;ywB`kQqVsHSLU1IDKVoc4P-p|LnN%kI*{#N5Hne+wM%4;Z1Bh)xOqk=ynl zI{zJQgH>z1ZE^d@}Nn3mUu?+jQ?#5K}m3|_(n5$)$?5}yI{{OG{MB`*zq z5BCb=*(H42%Le<@g}rfV2hhL&nSZ`Ja=^nGryttA*{HUwbEc%Cj=U1nS1$Qmi1w07 zdsUt1;;Mxgi{k^CLORVy=oU%)l~qp++K?7vZ18XLk)>zrURJE zahrx4rY9||6}v>ztx|UlApWbZ-l&xFSZ&|>PdX}(=o^dr|*0k{15yW ze?|U_)wgh0{(^B=PH*w+N4p|*7JTu`1+}-o6>JS1+Ao3S0Y`AI6pA6mpng+IvsV>Z zUz0mN!1+fS>jB33TtTATVUqBK!rIWmAAq8siEqT2Q>Y#+P~Uc|x`8XuTz45Cv~IP- zpudm?L0Szt?)a;5lg}4SSKB5dF5c`LUnR)<4C-|{qe#DRnAQ$$jt^|zyti4l7L@oN$%mX!2Sl+mgd({(Rh=cOF3`2z9#oHI0k2=hw^m*hZ^8y8(cmd=57X8BDb*@qLtMiCZJM>+Anm1Bd zjLU`HmhU_RR7QU=Q9jU7cb=@Rcf5$5#9q*T3jiz}H!qxfDS{<~#e+|QYSZv7m$o}5 zX`Uglt32Bo9>GiR3n*o!rdQ-}zXJ7L`0=t8w0xT5y5H5()-RmhLz@6?DhwIirBNl^ zr^kfwfGhQT6ZLmCMZtH|2T>bka6~3IBlI@L7wYKB5c?V}s&L>luFqvS!?~=OHh3^a z-IHYcpQ@!WUrQyXrt8;IrXQ~aoOT5@svy#}AFs}E$NBN1!v8z}T=)t$eT1Dp6wP12 zwwKsPkGgolEntQctO>pYo}@@fde#${>pSZ?4W6rb$>>q|I<{c57V(Fv@dfE#C$ysj z;WrhV@nfPpxt#Tw1~?rP@qebb`_$9I4*DQGx9kTboft~QnQhX+Kq0mXOU#QA;2S6C zAEnR}fOTpvYeDHY-{&r=X=`EWYvDepI<@L_r#9b)M~Z!+wM+v@hD&|NUAhZ;yu#80 z1*LDeTJ#(MyWr9t3;R5L@MpMFo71B}{TI~dnc?idF7;jL^|A^}za%W!A3)Tzpye~W zyL2};$QXC&cEZ>4=@4w15it4b1ueg(HV23Ev$>8Q2L3HFFni34JWm4k@ot z{Q|rR?7PO8v{6WU=2wzCe<8V(iW}>u>vC%UoJzWwO9yvMe7>9eaLhBn)y_^mp94DIC>S&hkfY9QmG!6eQ0Lu?n( z`Y*8K)m>;jb<4ZI0LL6Ur`aw?Z_$Tbbn;s~q`Mf~t*}%VRg#*7D}_@1qA&Hc&@AA` zHCAeS;_T%A*Wns8x{YDc` zaAy{%ftKahXGQP#c?5BzOL40C%6Dg)OC6?QS z_>?HQ?R+C)=xf$_fTyiVUOzUDe=6Zu--G4H`dq4{GtI_(Uks`Vcbww`CeT`jJljA#I2UPN}BH%849_GbR zgip6ZJ|oM9`hmW!Q{^zc1{7ucCJxBLxlXv~T*sN-h%&hCvdV=hsBT5MG?Z(5k@qR- z?acGN3@!So7L$9vBo5a#_e|aB@|>@(;!CM< zb_7mt6?_MtKHc+M9}(Rn^0HKwV||^6K(ZPvYxi?4?wE9?fL%y4k0EUk8rurnrqn z?V!^5WgQyWO*!WC%t9y<7b&`dH|xQpmQukX)D@BR~nfvAa(TJ>2U z7w4AxcHWLb7M#IX&<{RR&yFRtB!3YQ$*K+;RD5m9G)M4mq1srUm?syLJVBUkz}$}3 z$&9(-G18O`FswaQIFbZ|N^GnhMhXex0-UVzz|Wbt9YzD>R@T9WlH{6*aE~5Z_La0qHS{u*v5(L;ZG8o@Vf_;De!n;}(c- zp@SHrbDRK*n$j~2jvuJR$Y!8HtzcMalrs@tYlLkG(-*0X1s?Z~(uMDU`1=L*WsK?9 
z`okvwGSy;8W^J>eg{Jc2ho4U%X?f^%6aTl^2=C7@rV?3ZCf_UHp4vKRFG% z$M_D?>eKuWNvi_9tA2uT{V7_o`oOKlup4Wr5$YEWnc(zkRnoWS4$9t7h%Vs{SomLp zJ6VdtXqAjNVV1KQx9*$C1ZS$)$3A`p+{x&G>C|epr1t09z10Vf7UM;1{auE8rj%`a zoc~~ww#|F`6?9K;gYo(St_$Kf338_RO*{Ga4kco9c#r<%G}}X-84wBYkOKWdxS&2E zQG5hQlz#`})W1SihtU_FLBn{HI%D2~8#~n$&99rMZfctrr$>ad?t1vr6$r`dKS2I5 z2qVS|=*W!S&yNJVmf7X&KeRv0hK-Hseqek+Ke-H+O)R5KaTBk12OKp&!TR^*3lfkvPXW^Q;78l)=(3hc& z>yLS<^Ps7-DU1x?#FO$d1C~6K{%6Oc(WBLWj_{5AF z7RLrskl!PEE)Q@c+HK4eO}lA@af;(E@++q3tvv{hR&J%nmBKg8N?K--0gXlbnckErJ&6wb%Z&rNgxGzBz-6x zU!GxmLJGZwfq?T%gljMN#uSi7&!tjmJGM2&G9K~_GW5* z+0g4q5G0fsr6l`A^}W0u(00KiHwdTfJ!-IdE&B%e+I!XI#GUZnh4#R&IGozS4vZZw zm>huN%=^akNyso(sO+p7IoZ&|o+0Rc*rm&Ch z{wO^I>BG2_GKGEPk*?j~*Q6!y8%9BMb6if8bc{G7tUQJk(2 zjDHBi`eFRU`{|^yTJ<5rOr~&6PHtmmI%YJ~!6S=;bi{$h82c95*nDQ9`jPcvZ zq@{*=SoNTR1L0sZ(zRgI{@bo)&%u8q9CvX3h&&DLtL5;e6b{})uuU6jp2$ySfMAQ_ z^*O1YRTB-|2nWjm;Ei2P>HawcJeYxbeF!?-d3g0-0wO;5dIt)b{sos}0F$*1$K9Ek z!o6on1&nK|Dm^t-6^ZXAcjb~U$-=cW86qUb29n|S8DzeL|1T&hGX=Bm=*iM$)%M|3 za7krdd0DXs!G*<1B4BsrhL@FiO5)(-;tX%M@O&dTV)>vEBbN`ecl|ex1Vh&?p5xn^ zsQd)wD#~bxN*6A&r?z|uakMo{s!bc(ZtCr3n7Y-=rB*Vod$^Mr7M{Xx+SSF>p|bAB z;RM4#gV1d|FGv&=d_69c%ps%dx|^@g#lBO{sr8ui^u6loTr6G>Hx56>eHS^aFidcY zpp%$`BLa%cJ6r|>DBaWIBJunHO8DVAa4(hg9xK{xu}Fa-xLF{xJt_DZ=&cf2lpUE3 zzkqvu(QBpPpNRz1S0kW>0Ohvk!FG7ABW*)7=;cPf8dw8uKu3XfSAlh>+?u&X)@C?x zw-1AX@#hd|I-D@!JJ#0sVCq|tA0BnN?+DH?azy7lLaq?DK)K-zY)1&b5O$l5@;?Ls zXU}bt|2_oZ3BjrTtr3lON=-wxTcMZTg~O0UVkvYr0l)(S@b3xL|D(0n)2-oJ3Op7CbKi4;7>8X^p$UPEz6m^2Z`58 zAs>drFs2aZ;o>z-*Uw{x9tKPUEtMOf$d+%ENu>nia13T6DX|rG8%7k*_HbE_M|l`p zxkc|6c3eIooPCB8&b}0PeQnRS+Rkub3TfJV=vUy_NM0?xR*bbFMn4}G-TD4k@KIVD zmLeYGyoJZ*Aok9_CY7bG8PznqYQ(p|(Zz!KRxwxvKleD)4G_iNvt7Bn#p zO}uJ~wxX}KfP7!z36tdi14f{L&S1Y+gwi}q5l{Ebz^~171%5L;gYnzLBS}rC!ui-` zxPY&>hO^Tl#$GvDn|m3`w_0iKB4PAo0uTwK(y{dV;!TrCfJw|5fH-xc32{;=3dhDg zD}>&UumQmoE2AVdRzz=4!%uxWH6)EHPp5hSI?G02Hc`ox3UR2o!FXA+SZdOV-L;YWL)MtT!;=`PQKX+ zWWZf5`?g$zq{|ENb7=-<{AmQv=xDX2l|!#2uZU+O 
zuQ_RK6{BF3M!1XVpjPmFbSjn2)y1gPvEzsLx_R!gN1;RdH|9gX3B)iD(+Fy{FN>WO z;Ry_%mS-RKZC&o$YnOAsP+!(t@rr@TdWgR4Wt>u;8U&{E`8*q*K*(P?JPWnMd65fI z7NT(1XxN{yZLCnhDUlmt4EqIiLpFLk7g&W2WGo|N{!M#WbrUpaSbf}S{SX{(FpDq- zD`+_=Bg4D4>tz!|hxmNo)J!Cshhcy>n9!>8GH%o!KN@63{2OQMfyO5DPrt z$x_w{>mQVY1is}A2XiM~e{xOTj2G*p620Of;UB@T zv-5F7Utfe*`riF6;^=5ydZX9aGv<5xOTpiu0L6OD#|0w(X3m!aKS!ESGd)SUEBlfC zJoNj_53bBohOUHdZ3i5;4M2+bXn{R*s^o7+7!O`SJ0w8UsR=ipu#8c6(x(PVK@2h= z&oUjeJjBQp$1vUpq$~!TNbS2J!6wG}Yj0H4u?YJ$t}rlOLAfHwy?uf`9klps&qCwX zUBB2DErPSL$|vI))tEI+;U zrtfB@YLKy=`%Hzb1m_rcIz&%XymXjqf9kfriRw1KnjcO(W{~Ea+QJkXsx9-n7j0;h1;1#_Wq{sz6OHGtQn@h7 z0X2wyU1>5cO`u7=Y#0Fq(1&&T-DhUEI(r~B4v2Q}cPQ6&6>Sfg>LFs~QVCqS(KT`ijMuI3?Q}422bP;)BqGQmeJjw|sy)xp-;8R{-wa0saPu#?{k&#; zyqim;zKbkSKVs@YxA1msX$HXqTv(pEY0cnbi&epJjFq|+O}|)B1s}lWy4fC(F6w70 z=~4t$I38Eq-wn1P`F%AD&|SNiYJ12ab!(4b_o)sPx;ZB<7*uDf`MQAsfN?a_aw2;+ zJhLJlj|JUwl?E+YWIKYe#}k-F7# zw0@gzsheaxurWq@yo2BV^WZ1Ezy1NkoV4-@m00;O74p+OSt6@w7G3H;VT?YJ{lt7D z1v3GsL`tDfnwaI=XH^Bv{m$Io5!)(A9U1rvFE|IcCamKlXB3w9Q46il%N=`+CpSi` z;E*lJF@`a^s^hX+Hv_3m|Er9wXBswv4oSJ6v@j zNPZWxpvLGyep_WBW|P{bW3T3Kzo5)J#iQ8JR)8=do6kMo{b< zc@5by>rU#1;sB*#M>x7C=n;0fclw<_)3b$@4@9YXr4v?8Yfm4q=z?8;y_NG zUj2HYT)4WRbcptdZND%He`FNpZ=R8^v#{eGCAUy2>Lyej%}G#M2tw+Bju!#O2vUxJg8Wd zQ@U+`i_GTmEJU`KcWopg-mlC^nWSa0he)illM3g1^upctAJLia#j}a`6@0_r9U%Ed zHy!-ywmh>arEoOmen!kB@=-26;`^Am)@_h5D+|0H% znkH6XJ2T?c8gnB|4YGazOrB@>z1%R`uAXN4sR+~JY^uYwVyB1Nr0y*G0@8&)BDnkf zT=R9mqxjaQYjbtioA;ZZansv_srXf=k^Cj5J*(+@uCn_IkNDoNzJ*@;>HcDQS60_P zX5UV}Pblwrmuk?#oSH}XG+mWp>wEW6)eWZOtLdIbP11d)t)?cYvATF>wp?ba4cqsu+lkyyN^CB5sZ_^7{h{GHl2{)*qUo)xk5$s`k**3%yID|tIz z>exul8}FyGF?ZA^Q~I7R=Slk=xt)j7%zb6&muWM+Sf;U)=1DVZN_XU}x}M@osj;0O zfKz||Pi)_;Uc0}h=`T9y0R_D`dN?O}IVG>w-Fr=k;Xk3^B9J&^|Z;`+(Zk zQf^;{nCO*J@=Zvi6i!Q<+|!;$=~$B%p~Ot{(~;K)*k)&74kaPAk=#T{D8K9aSP34JO}5Hdy;RHX8m>o9Hbl&_Me)b`FeF1K1he zeg$cu%4EkMm7TOBl>%s3C0t9!6-AC2{Lsn*ht?v%_q7kz{5E$@11 za@R9mFV%EytKPJAFzq&9-819@YPE)`T>B~2X-t^4V+T6eSf^?GH$o{DQo;mTCYS+R}E?QH<%iJ@wBuT%0GM4jOWL3 
zx=DFV{xAtOrvbxxj)PXuM=zh;^|q|av@fzw?3i#zTK73La`dB^WF z_$Dux-qM?Q;b;CEFjUST<^<6>5VOItJRYF|S&}xg_hBAJ(vfC+I?KosZeQ0BRQgoT zDwd7Ymwe)-Z{)0s&`EW}Bhz|lJJ4x8@ys-mxb$I?jQ5;C{q!@Hqv%5|pHL(F6e*{= zok?}ODU%x6*_nRhz~BArcx1q+=V|1S({pC7sZ|BGK1Dkhd!~`4T2+&oc|X6-m^PKi zkVz&J>E1*?dv9um;(Mo0>p5v}YJlQ<`#N~P-j2<=G(O2O9Oa#b<(sKw(AF8|5Z^n~ z+i|O3pGG|&V(Zh)vweVk!N_~q3|n93w!yT+51U~pud0`GGc3I~ac+jK{9Qe{Zicnj zB+kvSceQ_>I-eA@BLEIyX)gSA3NG_B6yNRlXOhj_m_dZMZ!j_xl}wt)*qpXKhdwRU znMExsw`84PETb_XwL2#0_Dd3c9gk#=dVa?q$+mZfz`HPxcXd@%u`e+f0 zwl8U|ZLVL;U++x3|FS#$)A+7f^_jb8d7n@3m}6s0^`1GTT^r#n96Gp{_q0dYg+m9o z^)lNr0v7%xgefb+>^PI2l`L&(lhscLxB2Uy>)Qc&lb5?I8hABN0 z^2>XUO{YZ}lRDe;(|FhVOHZJhG=+8%8G3Y2JU@MM>4(#LYI7@lj{dAX=c-4hmA*=c z$1Ahm%~>^`T0iP%qB*NalaJ=v-Y?HtGLRa|jPjg4_)1#);S5dBrSod)0zOV}AI`v> zJOXCq5fEw5W}vo!fVl;gT_4a+(>p7ACXA>o{UB%6v(ysPDp_92kQVQ$rR{ZInV55L z(Lh@1kXu>$$DB*~`)Sh}o~7*$&11rC zz!CJ#m`R-(S(T-K$>}-WTz{>mVRJbxkjS9xqYsGpOsAa*N7EbyB?hlMs%dzc-Ztlj zDw!lOd><0Xq&@laD@!-$Tsn#bCexU{;a9xaVgN1U*sD0+vt$765HYN>^tqf%vy6&O z(;8mlcnmZVpvWX5@eXB2A`)2|30m60k@#)SrPL#kfJS1FL}HMQ#LGFC-fFIEBnE3F zc(V!{i3>>}5s7S#M7BiYyoHGf4ABS-kqDeOby|ZO{{=Pv3z7rpjj@K)y>hhUiKsQ* znEm#7G=20|b;DhW;jKZ0-LTTNy=0+?rO_vo`oVo zi?HYh)(pw$Uge^UYI;OCY+c&Y5~|p9%$f`G+YWg}lE4C4`?OH1lkzQd5Q>mhD9^ zL?)yYDVi!x!!BpqU!OY#&NnCJL4wyeD=eqp4aun$oKqA*%Bk_3qA4kE3IbIx@HpK_DGoVb2Q)oMl+nT$FKGOD_}Jt?Dj>CwGMQy%fUmwS&+ zr+<^u5=s7mW#ApJO`^^{K{BeT&lyE|Wb!JJQQK!G#$}raezM(uj?mr@ZXWpO?N`&1 z6D^<9x}KsBUzpW=Z%||?mIv518=bZ3v~Z7>_VK`$eCh4A^gq%*mHub67t{aD_Jim@ zE$yQ~ql@F6O~vUCo1q^y?Q|M1POql5eROdooz3^ps8Q~m?Vk;&vy=QYq2X{lj?;XZ zT}=C~&OU^5fFxNwgS<_}v{DE&lxCOx87!yt9kxmV0(_7I{x9rhDU) ziU$!dpJo)-^emueiY%-teQxOu+y6vMd^-MTpJmyb2mZd}54qd<+e@1V)_442*!G3w zl^5rz3ho8t*yLB|WPh$|`%3r&wUlVp-BFX2CS&!fo7u1P%$kO(A@nu0>h67VuHDq} z9DN^PQ)SN#+7G5Iy<<}~t@oran@y%2%c&>XNITil>c9PR%6rN)2^~Nu=}!<&4L|J- za|V4v^?BMUn0y22&W_s9jQ!f^oTdIw|Aww&gr-2J4|#M3JT>d)W}I5lubM-5^20O6 z#2I@F`AJ}R)o^;Kdq3a*okrh}>8aV7^#Hfi5A)u6CXVzBf0*vi@Ti0j8Gq^i5v}u< z+mTjpaWra?zd&sw&Rd*iL`zuhj| 
z&Oh4e4O8T31z2KwgEy|TCCIsZUvPQOD&7}+a@SvZ05HT(V9@6xX>Ljuu-Jzk^m!H2 zsM2hPbRt3X85B(RmXc>AeGZnp8`@|n(XsVI-i`Y-n!ctloNT0+kZ(ZFh5X@SGG?#c zV5MNPPiG+Bsgc%J+8yKss zqSlxU2YQv0;YF`}#Hgd}Ad1?= zykOVQc%IHWTkDte|9R$rgZV$-{$Ip+oB3bDUbeW^*xMR2A&srA#!||9bCqL!woMy4 zuoN9QN_5~D(SZf&Ad;LmbjE2z2i8Le)Cda1KIBaY1+(xS*!T@>s?^H|5HH!!3DJ6`P6%N{4XnWeCx7X@%c}w`UjUPnh&7; z#N-=&<_>Z$O{tj~mgerNA+#iSaQvv+tYxEWx@jZr$LLdk<_oHic2)5`AIqMYct^<0 zm)5{Kt@4nCTKIj(9rxMiE40lGKULv*7Wz&#zco};-t}T)nuT6MY#$jn_X>$g6#GUk zErX>m3ZAP|B+u2*Gm|`#qzTjZkrT``M>V}>pKotA=mB6^VxHpV8oK$mFC}%KJ5r`Z z>}nQTsydR^A>|9fbNo2Q?gd^$&j4$>&mAq|v}G5)H)z;-wI{zsSlvzgK6bqEVGX@j zIIM!s7_4GaF*AB(=~ zFhfOaS!jmGu50{cK+dXri1>m}SyRqMb2(*c2G#1BA+sZ-Xvej*9~jw@JikJEO+@Th z${sk7YO9F5JjQ}V^T$HzOX=v!jN+m}+(C)VR>xB?sQfesR;K=JOc#JdjJ&L4NP<=f`P zvbvPa{){%3*gnLSqxntEFH;sYi|EN6S;s|$!ueb9@%N>F8y1hf$uc%c;t`E^&Y^hZ zZ2uuI)1&-14;%02=RhXj_nGNYmcPZsdkpPGaXtk&V*3gT5ZYektmb_|vCUXo(|t%e zEkK~J#KwE7Y5Rs55t?hFZ!`2PA3z(lk1nURt^?yeWmz2?2hsV2TKYGnoc`@~3QaN< zmD9f=C)2+Pj1`?k|1wVGtR^Vq(RkO1fh`ctd70&tQXsLaKB@V6K<0`6 z)BYryg+8=DEC18}`0e87Yk$Uv)!YBHKSAwJ$ha}_Jg>!{>fv_Jd481V%#ZI=O)tZ( zh*Ps==Evzd(vr%ag&8~IWcYqO zQGO`>!JNM3MQ?%HQ4j4eB2P%9ss}xyqM-4v4^8O) zhWb-~L(u?<_Num~_CuMQh<4mbcv<73 zQx~=;&gm2O>$BI( z+7^3l&GWp4ZOc5*ylzi#dSuCcbk6EuQbSzRz0Wk>pJFvP%H6fp+f2+psw!*QS@aQb z^JU|nbJD4!C@in+nMfPx&C1|jqqe-~^i2A4H@)PQaU?yHZ5v3U^xdX&29?=Hp{tf1F^O0t>O!$6}Qtv~JCYyBP$?`sU%6q)@8q$}@I(jiH zJ7i@gEvs-L%i8&vQ|h~$=zR6M}%d{{kS?yO2D6$k9N-rlEGcl}p#FG78$Rcu~N`S-sY z);`|feJ#-X)G@2+MJC@5y~r@`MR+nLH)mAFNa{p}%Plj1Cn}^r(Dg;*`?j7ZF&?{z z%Tf10+LEfhse8_lo+)NkPkp@Sw9I(VyaDtPx6|lmu_BGu{m!CqSMqLv5gJ~3bQgI- zM#m$(R3VE;V7WX38^)uueD>whS_Qsi%$-9E&^oG8M_}$t*!VVH9|mBuJ}7kjMH~Fk z_>0G3vX1uOaS%p3$@hB@wwq%&+7V|44S;t;Vd|)FB^4PO`TnmT_XXV;cp>#m$6NJ@ zp;Oa*IGKkyoYc3#&}yDN`9C`={IgucIff#NblLPdu+!)15Qz!k@RdQqMpQ zKRuI}leMg|yz7OY)->u1XpFOOdC%FAo(di}#cEBTQ89pehJih;SsjlIB6JWBj!JpJ zl*YbMJYY)WnZGv8z9+v%FuVb>#@0!I5y1rIk8Va+^8&V{d0$P0hDbRCa5a+AlZ^p=Sk z?U~iC)W;H~IXBZC<`oo?|2L81p3l4#oOqs+SY|D=P!tVgEiIMSp{+zBX!?pq34Vf^ 
z^y+%VJbx&o0b_N9o;o1oC@&n>}i4 z@WxD@pE~Ug4M@$MQ8_)p2reN3_hlEHZyz#tZOvKzEz+e=dAQbBcRfM3KC~Tm=RURp zAl9zUCXJJBu2z|NZuaQTvw~g+nHk|0lkQ1dX?nCGixY*eOrb@?XAdZ+!8u8BJG;jz z6h%75xRo8m_o1bxz2)aOX(!TmfVnD31%vextvsS z4m8H{y7GUKlu5*$Towb`qxt-91hqx9tAG12dbn1)W!WAn^+MhkGX2`_v_tJb{0G)l zQ_}1|m^!KM`smD-n(hOpUHI$vd^<-!V=(vp`}5p9two^1HJ0j-S&P7xfWMFE-p{3O zPi7zg*_}kBz(1O3f7(xxx-+?J>zRdEZ)%)+Q^T7NCDWLG*S7Cmo+*TuXL_DY4_xUW`{hNBrN=rmd55j1P|R`Gev(>`?5uCjjsC8@UH%l;GyAN{S8Z6 z&Pq{EKp*2D7Cbb3lK%GA`sTJ4+JrFGg1+eAMesi9Pt}@0!t^KEU%!~Y3=-(V!bAQA zhEuQ);cIuHJ~aMjZ%N%&Qk4Z9!}RMHt?ee2W^+mtdQmfd=DNAQsg^eYVI{N|S^%vN zh4CBvlJ>szgCu#v!YA8jh!REuLs)q8eqqCs#q`)Fpj0LZb{U@cHBX2J`gR$fH`Ac6 zNd*QJFf4q1)6!b{f?a6->uU?U;^u2%{AV^UuWjUi`k~|yqVVtywQUQRE?7XXakjNJ z@@Llr8Fm$3vWLF_8yJ?szJw3c-PiD>zqXJ_Y+CSl0~6--MgLeoY+HRJ=UN}b_n~<0 zGX9!%SVot(EUcS5qq?l5AU|(dZkBhsxLk27#c2%d#FdHDc{{zH<5xdHFNxi`&(-M~}e|j}`E9lD&jD?d+Le+Vb4A{f7+9 zx|{|8JTBnz1iO6x7mwyv(}CBevHK~n2dIeVUY0&NI_%O&MKrG`qavDrNoIMpVD*6V zXi3Mwc(g8ZW7^hw9$VI7h5o@j17nnm{v1oK|i z>_+gzC=1q$E$@Na*Bfel?(zIbC=knJgLb zZR49mW9QW5IGcQpbG}ZFuFQ;_lpgKOoEg15vm&}Cb4qk==81&P4orwi*^%F6ko07d zKAw&hB$@s)d1grPatc0+z*B^BDp6JlW%@hV8tFy%5) z#wndw4=`ylDLNxECpiBnXGij)lpvF%1yq=^^jBpskItPEt*eeMtcbQuj?N%8dGxzP z^qP9d^^L1b9@VM}&bKThvtDFIlDw&tMy4{FPvuBuUKAPg{FQj9hNkd*Br-AKAsH$c zRYuoL(KVdQKe>bXAMqR$z*8nN`66R&CVR+c<(V%sd=JAsq-!e1>k@)HMdr#}I@xvk zn?&u(#PsOGDbbdS=yEDeX_KRMO@T!(o*k(l5|{^*vX750tnwMkqn&9~q|%Oy*4g%M z7#9X|yndH#=ESiXQ5h|%ik4MHt1F{3D8V`2agm*nkhK|76)mYWvQ@rpi^!V0I^vzI zyd+D!mC@>i>9ep)pN_1<+V zEGJ}^oL7-l1xX!47}fiLUZtEXzoo26c$Ixhb`7<2JH^Iw{`7aKjizo-A!X@54%9R; zJ95*&=tpGF%FJqNP%5cGIg!QZ4JLFtp_PPAA(We*Dr#n`shO#uW@biqoFt}-1h-<` zbXAi?r8Gpx5KrWBYUDo3m`!38Ks1TOW@S$k0{6->qJj-+h0xVt249=Q)M`05$}_U3 zGgL|1rjRx@id)Co+0>>dr1?^{w9dzwKD;c8(vjRf(kES(PAx$2c%X)|I(LQrlO$U z^5~56=-i5E-ElOQ7;O5-a&FUjNHQsVfrHd!pF+sAXr3=qEmWj-A^j%MhcDUP#%))k zESY{HQQjwVpUF{6zbG*7<=K&yX%g$1bmW?lzLHa&J6ayNaFe)BbnU?k$yL+uLBW0- zpx=zeOnsZg`b}B>UZDPJGq%W*Wv)M_I>K?0u?5BDk_;MltAfM(KZaBxRGm>I9{4w>5qj?d8M+?=}+bL 
zh18U-&Q5=vdWxx(qI2jNC$VEvrD*gpoZ4Y4SCj3vkiG)*IK}gR!gVi^woaWl{YMo= zG;&yaU`kS@U!P75>2cJ?rr&Q`)XOu|A5`Qa@^DsAc_%eDi_Y&R=Vy>kZZgtOqohBU zLUZ54jref{24^UhuvzQ+JiW^2h{IoEWzqum7QLO=ke@icv?yIriN(t;;{j_uSvVw!+jd|FH@o$Gt#Fz z-35*J$5LtGn=Ze@sLE!(`NCHf)=nqxt*WA1@*FXx`}@G2;Xgf**EurrmZ&j zfyBpcH5vI>YN@WsPM@EOQj2t3Dq3#(+mYM>k(-f4|4f+=z2~o@tc!dc;Y^xExp`7H zkG`gJyRd{j+=8W3Hye<%R3qtTejDZg&Wz(sC=Q?AB%#ba#T{bDEo@Bsxlumld!@$& z=S7*`lxXg>=rGEJxqLhCt)kysa(&-k(~CNMw-K(+UYZKh_^+Ar+LfJ>9T^w^b3e#- zG)d=uR3ahJ;3SIvUxtZ+vKKTFX~9m*NMReEdB4PqLk)` zr=^rSFHX9j6dShjc*M+WQ6tn5p_!Si^s5GQg~%s;6=lj#ohZ736l76`|A$Umd zaS5mw<=HgucFSl*cBCbsR#1(9E=AVFa8SyQ0nAQ|4DlNY6kWW?jEsV)-ov^e5%Axd=-(h%e#o=i-@#V?aT9q@d(7V!qc zS;8CfxaCiFJlBy9xS78ek%%WZ1j#|j`XLTtPa~cTI-IK!bEoEjbQBYXMM0(eAgK~j6ctpM^D5Xfv3JdVrqqd>U$y{rlJp1 zI$$Y7{LsV+_#)nDI7@gV-k8J*_#)mBaAv(X(;>}!#>X`ser)itbcp%#=!w|bI)scr z;*Cqe7>IbPE%|c9Gb5^G(8wQ^f-w-G4@m~Oi1+m%nB>b+aQ>o7n)kGiV@^c83Li&L z#G8nx#?K`x&3nqnlX)tA96b?l5}q3W>-0a(`?-%}PDjk_hJVR;b{P35eGp4b#GB&d z=!tk!@znQ6ylK8)J%1-pn)igBdRiYDhz!&k3hqHt?;+>E<0bj&B3CvTSn8`;o z(lFN!28oDw2An0l5wA6IBECZm&tpE0EpNok(D;{p4}FEy_p4{T;dxX~?XxC1i+Ia? 
z96b?lIiC9dhWj#v)%?H zztP9BM~`?beH=Xz?*csa{SmLj_p66@H%jwe!bEAkdG&~ z|5y9CdTuxJ59(>sh#}%#m%LJd{;42Kim&fBc zbl(shG4@|1_*zH4M{wTD58YLQyEXVa!Cia4Yw&%%I&(}g-6FWz&>;bPKM))se+Wc|+tU+eJjm*PE|KhbqL%Jq`qD;=I+3;wFZ^LxQz zs>)EkBlt&3EXO|xUhUxT3%=gLKNftgIVLWCCivV)0%KpMIp4=yue?|#D*RlTy!8al~`Ns*q(i{^P`Llx_%`ep@Amb|+e4WG3 zAI4_+b&j3W1gAzQ$;}cx%aK3c;5e$X!DyuPPsZm@H%r$1X`we z0y4gH1$X1sq4G{TbP4Xp>oUPNm}5f8H_iFiy~`7j@m(*t1A8|r?xdS}MoIHOZjbas zk$21gj}2bo(tGy{zRkJvGr`?-cwF!T$DXGJpW&p>^MbE&_+Jsc#o_sl$~!!73Qm*F zNp8E~ZhZe_@JcVw;s2}PTa#Ek@4p3iU@y&-r^=LmZJ^+8yuM)YDc%}K@1BB}IQYJT zyXD~k!RI;#9W1z;?}rKQmcy9fZaNnW?$CQ*HTW@}+dfPbT-V8JxXFTVQ9zE<1urw{ znz(qZ;4Mx*o+!B6pVkYWV6eVZ4Srn8jKyNX*Es>&1b1NXEWyj13+CA<&8Mw&@C%Ln z@hRn@TX09*yTah7q}aJeaM#Z73hwgUBDh=5e_-(X6#lyfclpgHGXv)9Hj2E<|AfKk zr_9JcC%9|pOM<)o?CXNBbOO97_%;WB*Wju1et#Bxp;M1O5`2ck^O@iq9DINoXPoN2 z>hOF)@J>g5FTp=@@cjjM+rtrpyY0XzgQw0<76|Uj7YjbuvGZ$!yYa0Q+)YpVx;nY$ zlyW;ua5sHU5Zq0NdckRuAjvHhyu`unhqRi#%4ZUEAL7pww>~`F3xOb0vKM1MMm0 zZIHp2JJPg1!r-a%()$?v9B-`?;>&_R>EH(oKG(tdd)HpTy!2RuuXLn6??{7Rn1YuJ zp6^_#Hu%NfMhBlR_)Z5uPH;D0P8Ph{$*=i>U+&MkmDfmZ@{0)NVJNDcp_*@6SUGPo^ze{k3-rJ!3&V`=|?zWSg1$Rs0 zvx2X6cwRL47O&31e{JwF34MVq|;MGohc%b0=UW|sx7remqw~Bh+5rS_t_9pQ022Y)b|Ej@L z=g;GUJM`XEgMTL_-DZlsll-06Ec)V+p(U!TCdUrA@Lgvh@i zf=8*`2ida^;{kfdg~%UC_|Wt(`}UZ39O>)~gXj79aPfW1;3Ir|r1+jRxSsJ0i*)EXJ2YKG%lmV5t-|1C zzPx!?j?PXq_(UHs&#~Ss2@lHO2aWtBU*5cbMQ7U#UhU%_i*M)`*q*6Aew#EBgD2K+lnMWR2Cwzy&3ij^_Nu|>rQjbKywS(a z`!#g7A650C=?i^)j20z`$&HEs9w#eX1eB8XRL1&j5yw%6eI~8=c-r!4pT>JBv z41Si6Zxi1D%AcX>D}3C%D?n#4gJ0m|Rgw-T8+^5o+j%kCv(4b$K5o|7)7iZS@9}YM z!Cy1@Wj?MYlAFAt=~w!=S?f+`hZ;QbyxXijC;VuGU+c@8wcvEN%;4AgxSe01wb%x~ z!N<*7XF7Yz;A?%{tUso+j}3m4kDK+xbapTmu%YR<`nXx+OJ_3;zRt%Z(xzQt@Y{Xd zth=SNy9|DZk3TNH-y8f+A2(}Y`Rvrj=Em0g_9d-mCzVCbv{lj4hW15En%l-SctN|Y zjA^NFZ66a0-SL5U1fiWOtXQE}Tf1Osb3<+In4-esk`g>EDJ}9Q&YClOV)dA!k^;Cg zws73o#FZFc8CP7IxPs(kOJb!cSUP?zoE8;~ElxTehr&e#N$QJYMQCSHQ4EbMDjHXW zR7E9dMp02I+FLX(hF1&67bWQ(SD0kgxT0~1$R#Nm9~+N4$B#wUqVdW0jxSC!VSH(l 
zlHw$bii(q|ii?tk$0sQ&E>6Ch9M_V9LT^mX!~*_jd@=To<$p!|Zyf&{&;N?~UkU#s zW9WaW@fx9+k&l_6l{cgdJ@H}d0+7Xt#PF9Gywu``nJPKUm{28F#^@?F z%v9&;Dis|5Z+NJ9vfoIS8irVbp^Ft*DZ^K4^u!8GaN4(p!^CWXixn8=Sb-sm6&R*i zfdnw2W40*8q+HqTkJ-W(i&;*a(=jc8Rs+>LRwQLdii9m6FwZT8y|9kazdR$|k|7Q>h=dof#ZW44UPN^HKDSUYV|joH#0v&A%KOLxo`*O)Ek zFaLkr%+HZ&xtI(F#LMq9%RW;Kq%4=$;PntBVa(3

    rY@N5HZBhGKKC-aBc|Pso zMM<-y)dzX=HyE-|LJ&1?hNe8HHnyA8I<E?C;sWO}rwCH3=b zmn@)KLU-r&&AaG(TIaVlHqNi*0HMkaTYc-PwX`eit}+I8jx{^KhM5U+p*mA`u?-_b z!B95PSew|Bw=a}^5Jz&_ui0!k+$ZYZgdO5W*-5#t+`Y-W^t#Vs z|M^?XpU}?O>}dMRIG%>$)$)q(XJNClcooR&OgK;X@kh8-ezeFl4^0*Oc)QGBx_ZwS zoaKK4@)rrt{HWKi-?J;7QSUV&{5tQ*{Jg4P?fen&uLk??104A&sBz3gQ~uuN>>(%9 zoo4wP1yMYXM*6E{>*cM*s+%*f1%*R9NeyVOzdH*{5>L{=g40n zxaOnE+jWq#2P%b8{5p}x{AD}8EqH;$Z`Vh9UXg>}BJw2;zDCkPpI4|}yKd6+${hK- zgeUIcKM}mz!R>lVyZ-=RQvQcTeug8zN$}YYZr54Tem@TWbCEyE!Jiep&cW^aOS^Xv zUs8Ks68VLW{I3LWa_~Fk9LwQ-glRe76XX#;CtX%AR0zm=mr`drB6;rBz?xjbJ39NVR-g1bB?f?j^>NA;ctJlO8F1CIRMN3e7G zdw_o# z?{~A~biN4qbzKhQmjZsC$h+xqnc%F3%ZBP*BRKP5I_NrD#xWfZ6+0Q{wM5EO3iuU( zcM8sj$LOc>cYyqrAaCCvvGKhM@HA;xI2b=uQl2cq-S`d_obAMNz8~NiuU7_;AUWp$ zBK?#2n<5Sl$`|$MnBm zaOQss`0oXI*3A32tPntp1}R{?(p@D{+I1^imTp96dbJr`!j{QMMH`L7b31^tv~gy77B^_QP7vtt~~dGSyKr|`!GXMW7zIf63}=I@&_;B)Q# zKyc>SLO-?he*|~!%-oxV$hme75!~hflHkn$JpGjaVBo=YI2>?Hha%wlCGeDj{MC>z z<$xp4bl~B)9o3!_1$XUl0{Is~ekt&y{pSLXdM^T=mw@Lg!Ck#KgZ#@Ne}~|1zT5*i z>U|z~UICup0sboB&0&knp&O*Gch@rXD`9|@EZM8??HmQ z{CXe8IIo3K`F$iIT>gB)neNxXUm!U1V0kM69P_;c_;LJm4dB@C=$??QmsfqLoqrH{ z#;*kY3$#9j9ous;#4BHLH(o`8GY_WQB;dh(sSDvb6?k4JF17P4;KB6abuaALPE4Q9 z5dQ0c|2Gsy`ELXse-rR?*;D?ffCu@12{`iq8hCyOJnsnZrvFDEzZK+l zOEtG#WkElJdiM~V^YQn<^8l1X9G~cYBJ+O}%E|t8&d$ZP9dPk{E1n`hJOn==1piV9 zzEyCJ*BkWHe0fK3H$DF>INOQk_G7>?-40=5axTvp!TIne{Zy~sU%33uBF{LE)27q< zI(9C9o!}hoE&3@xuj^ySJXqe&037XqANa97$&_&s>&5ouOE8Z5Azf3wWq@OQ(kM9F zgX637fgk4;yMYJO?Ka@Se0eH_=Xv1yFR=4X;KB6S0XU}5UqbjJ(0;xR{Mk@nkbh6W zkzdD?ZvKt|elDw;&SQZG`Hun|`QyN|4R~e(z8&y7z~2SDNpRQBb3uLw$gdXMEr*u^ zj&>df>5uKqDB@(tJl}?PMprDm_-G@YDqj?Whw;ZkK0yUZuB-rFco zm4CPl&>6@3@zH=|IXq8rw(~FaQ=VHu9`l{w%V5X+nD2iA9Q6)`{C%IoC_nF2&5n7n z{@x_Gn=khOj`{L1@MFF_4g8qz5h)LD`VSS{&9D6gXFD-p8svV6i&qIe0K#qC-MnB~_MQ~UCVvzq3mB!%>=?)N=eo>}d2Xkl@_fP|Ikq3uzW~}FEQd1%XP*D2pYq^5Gp5@)z=P%R@4&-0 zC{F%fmA)Opb9Z4}5RVj@5jhe#*mRLUzpG z4gA{;np#dy5uAA@&`){V1b6LO1vuK%BRKP{qo4A8OK`Ra%jXs-SJ;ki1su!mK9HU$ 
zf0*Fr0P+1Q$YXh~0UXQoj{!&iCk5w28t{kpFTaHTC>`WqgZ>4}$s2%UIkfw+q}HRq z1AhegBb>nG_<;P`fFu7F@Ph#-Ot39&rIN%3-XvRcK;A7 zKLF&v5n|_6g0r0iLH>H+$8@*}aI}+x8^=6H(eG`@55A!IBZ5o!>*Kn=i!}c}-X!ue z-1c$Z2TSkAl|L@>I4{Tc93pn>{kY28eQG=}kBua!xV|Sf(!xGIR(SFq-0olF`TMgf z&ru>@;K+Yna2?O9yxrG^zB}c}>-%Ngwy;zAslroc&;)MxyP z-3P~hXMr!N-dd5L?Z|6>&2{ioMgAlQKV9%T2VWxid7@rImnVJ9(Ah9N#Rc7kGb!_3p>OgYBV?Pnd_Fy{ethGCW&B%$V|&Z^cLc8^KRf0Bp5XH>?BgFu`Yd$t+eN;~!ADE~tL?4wFBH7pk-tOm zgQVOsEw;~N0mt@P*Z(ko4#{YHb)3jEo(p&*@L;>$3OKgg+-9<4o;`qvW!Smx_IH2> z+qYW*$NKw2!I^(3`PKe%!*2C*U#7UGznebGMBc?O0vyxrCg9%_^xh5fm_EFQiXHP~ z`uy6U)=ut|)y{VXXL;n&@w1z5d$S-pPKUjLCr@xbV0z{Qj_Ju9?3iZ~{dV#RIT82q zPo%tM*~>nz`(k2y$UIs;a~=6%LX7Pp%j>uw+e5}RKG+^IuKjBs`Pr%c8qbjy_VLl8 zH{Zc^pBrosnZHQn3mo|p!LdDL`3Zt!d&qd1;Mg8A9v8ff{Or{JYQf_c_VF6Qs~voX z;Mg8A|7^i$IP!Y`gY6;9pCt0w9x}dD@*Uel#x;NI$j?sg*YU@E3;X!Z!lU(3vs`d&4_RKvH_ILQl_I~w!PWki4!&0GT;+c;7{*3Uickr!(-{asf3ckU?>jZz$ z!F7HAMhDmQdCb8zJ)d-N-N*D92OlYVw>bDP!C!Rn*@D07;Cg@kx`Thh1(@7c^FLAk zbzS;f4t|5kZ*%Yh(fgi*uMqj24!%v~^*$?uu4BKy7UaJO{Q}RKvt!zQ0Dr)sw!O!G z=Lx{E|I+7KZhwUH!^41|=dRhYoj5+{IZSqpV}CS&!m@Mub)OQpe_!A~K;&KiLjXtq zzN8!L(ta7!ZGXW38R<40`2QK{_9fu&Z@Qgja>eEsu6No9IF`d-0FL9RVzPl9=i`^b zo(jP^AF&+H0vyZXwSv3lMAu6*KgUt?@pj$U$aOTMao&gMybM4gi*nIgw zkS`CBpCdRQMu7a{5czIaN{;p7`jCeWnp$79{xTikBWOB2FY?TT>7euZjAJ@rImGNp5u$<0rLyX!w*0n^Zfp^W2msKX(DZ>4KmPL33u^=~I3F-Qe+TkoLH=!!$9#DoaO62KldVj(XAIyde=Ok0KM!z}?*bgho=h8>F{;nIaYAa zcWj4SLwGuXrwn+yLwK$N9NTkTpMc}2-`i|7^|~DRw+ZgXi`P)GWBak+^Bbv9P_s!1YaRIrzek@wO-#0@|f>$0gn6wnSdN0CIU~s;A|)I)PZ~)-{V9V0qScEN*}L9pQ2N)7wDrEyShq)%{$VAN$isLB0~~{AGxp_Ip*Sa-K8F;>9 zX;RAbt$<^^HUW<1S?34X&MCmpYgXB@omj3uF{s6{obVW!9rN(mRnvd0;BNWhGRe*@ zKhuEcX1cCCCkW2=VENH{=cZdb@L+k+`p7(39##QArrQZ5$ByGwLo#a5B7>%;&zXQ@ z`sh4{n+{w@*tzMT&nZ~%P4rW}Ixpv@13rJ}G*tOFn4BETV?G{kP^%Zm$z_1!IC-+* zY|r=Tr~JnW?%MMskZ%C_M+JA&;U&OPFP2Z#yF=t%y-_xhoXdXz;K;A-Cttpiej2ZF zBJc9w2sqZGrv&H2(UAVX0(ne-T<3u0#D1^MmbdA^Gf&zdHy_^90mt&M>%KS|JS0^4eMR2Yd#K(2PY!o*eZMHr!FAo?NC!Vc 
z^yWMGs}kSQ4*t5}1rE-07VL`b!B2-G;V*IU62T`p_-Mh)96V3(xP$K{_yvHUN`5UT zZ;D-Ref$7$tdE}w&a`WRXNc(G{KxiR)79mlAo7eO|0Kcra3k=~3gLerc(9!u0QC;} za|P#MYk~huP=C>W9Y3)AG>W^X+qEE%?S?)V;&i}v<9*PJ{S?-B^=LfOA zGp^+y>pSCG{;|F@uH^>nJL6?+7&)x(jO+V>d3=$a+Fvd5BQ5OXHG*S(XP$KuU##zp z-yt~GcgA-LFV=U)GlU=OJL6e`V|{1*6X7o-KRXsP{DQ{~n!w)@{4&5#C%=~4n*?|3 z<4*v``mzalz6(503(onE_2oUlvH#5!yV(9&{=(a3y$@U<+los^^Wnk1jl;E_%^|@-ZB23;8^b%-zhlOJH|f{ zJdgbB_>}RF1Ru#DImJH_9P1s+uN55Y9pn0(0P7v&E2Vs3y<_}A;lX;x_(s98-ZB27 z;AP}z$ES>I`otL|r?}cb-oggIg8W*p?00OfJU=5)e4XSw=O>nP-A9daEa#h{obVbm z<#{fI=huKE&)B5?=LmKP7IN^-_c8z|Axr3J-pYM#_Ng@p6>vT@mdc&Cjrku zX~+&Ge!Oqe{he4ZKT}h^`2Hrgw58JkzdCZ%wGrm?}I$@dnCh-?MMDS07w3V1ZVzw z;MaLGw;pjFWasjq3OMqg1^n}XU;8VUpYte)|2u#qzm8AY{s!QGC`A4V!C4;dc^Tv} zJ@r1(jjxVdnP)!e)%$oiUV8uS^5}S52K-3~kpf#*jck32sE9C@Axo<+c;;||yU>rE=8&a>SKIJN`2 z{)*Lnn|>PK$3&j_u^otz4eS`la;VQ`mJT%pyJ41BRKP8J1`66 zk)LIh2h;Nmz>&WL_^}zA88zXE=&cUyr6`Tqbo^6PpkmmljRwgX?1 z^5e>DedNn%&v=kWdnN*o_AC^f57-VY19{}R2yoiDE=ZAnJ&%?lj?SMYd z;&ecs7eaXSK8Y`5JFp$(kw@<6O&VXEnQ=GlRc!g13GT!_5s_0hee)wcr2mzht2zaCh~6m z%@O@Bj{QE?>#`7ee7=nGTxZ$2{Cp1>gollve@uVgKNFJ3UF)(o&vlL?Ei1Tu^ouA zA>{ag`F@b#9Bpg|biW71)gIkPfpKgH8i0rK-r9kCC_Fp29oT44+n#)j{F*OM3C?P8 zJTL(23)WM9hlL&6iT4EufIOzdXuvUlCjgH8+P^b@9m%Mj$AUca^H`T1+mHO5r|cL< zer->fAMXpgK_2*uIP&QCpZPM5=dT5M zia1yzZ~*C7s?gSWhu`QfS(OGuhC`4Jd970bDra6=fB-hj3gnSz4&cax_vtMZ7ft8IAdfuf0**YF3eJaff#)iaN1mGiN1mSm50-ze7jAln zt<$@i6Q7*hf8AwaQ-0QvU*r2z!I>Y+|9=CH?eiBT-`QO3zxE0#4+jE{@r?nF{1bs6 z`>!b=kLfuFaOCGUjUC4e`EmXk`>&NCkNkWuz|Kv7t#>Z}dXeY!!T#$$;6eV!07v_= zo}&Hl06)%a{SkPOAL}XdV}FPIS|8c|^N3T+Nxqa%jxX|u^>;YV$Nu;8jPtSI{5<1) z>^DEpI3N4X&oj=)ezU*h{IGW57Ro1fyJ`onrR$o%VeP>042$h|upj&gaO?-QzH@Oo zAIeWLlus;&yat;c%VU4X_XzA5=entOUIRGxYqtpQmd~F6j(V|w#D48*kjHZVYrv7` zJ;B*d?AOBXm(y5~oZFu0`+$5I`?b$AKh{FFX}a}yeympPcPi_Qx%rEWl+;n~fc(6TrB82A^z%jntfCt-?u=f#o zZ&-G&Jy9q>*q-buxLckN0vzor0v>EnNbaS z-+A~}OK;kNZ<1fjN!UF6_dy=pjj-qJINreassrMS<^QITc>NUQv0Zu!CKv=ii!$0Q8J*)3Q2emb^GUzB?7maDG-j`kD_&IfFlCV)KJqw7Xpdrk%(Y?tPR 
z@U#Ms_8cYo>$WpBfb*D5%gJ29neJ-(DLzkdwiDYKT`$Twmj4??o_VnRuM^x&Pn}P8 z)1klZCyTWatCHzaV}-;Jle^J#MaYU%-xea*WUZTMzgifDd6oa(vj^`0PL3zlZVt0KXjO zEx!c#J%E21@K*sp0Ps%$&jUP<3kW&Zd!X^zf4Yws<0AmqeXtlm2=GgQ=U~8x%epw0 z*R@i7xf<|MAm0u+*D-at?(Q%J>G%xD>)KSl{1M>#4j1DiVV&C;2E@$(T-UPlr4Ilf ztHN~Db!Z$fU2Dph^}S=pb&V_IWm5PU*EWRlI{^P01L8IUUZx-&hYYqS%o7KEA>fk$ zzX$NifPVz|RKO=>+Y{!W2KcLhPY3)#sXQ!y4B$mlc^KC*7++oo_#BWw3HI|p0r1BF zKLzk%QrVcl9`J>LF93W4;HLqO-!E$he7!8lX8snyp9K62z_$V326&59e&$&UIDX%1 zIpAxD+7p&P8}RBq6ZpA+KLhv$fbX?eLVgwCYXH9p@U4J%0gm4>>;}A38ZWl92k=(` zzYK71OxszD+w14ivAxwE7pkwlae2EpW`2Epy*FlFTbnngbqSx0S=8L#*xFp*RJ*)n zeC=sV7q={GK6Ok>eQW!e7_-F2l#pm+Q~ekw*p;-;81}LF`lh9|iyIeP^1=Y}z6BI> z#Qc7RF;NH$ac1N4+J+^ITb8yr))uxlp31gj)cX={EQRY!n8w{zmK0F5*?bH)uc3Bv zecS1^iLnc< z>0J&XWQ5bJ(JV&=s~yjp0o_SokYx`2e^l1!)!TP47pQCT1fW!2ST~fFowOC?c zZg+asK^eEGjcyao{bB9A`i9eMo9gE^N-A@T?K+ir9llSa|1gSPb@MH)HMxC>Yqft{ z8&=p;1Czc!G)#fv`jbXZ^7d+Ia`KqaeX}$;;I2(IvD@9$)Np+>FG*FedA(q1Q&TP9 z`86%6pI^IV0k;Ni?e)#=ZT+Yj;Ur@6E{6cE^V=F5=Nkk1B5n+=Z#}h^l7+IO*BCBo zX>6{g#=5b7ac%wL<^7136S6PHL6BYzYiL?RgT&pk%njNX(=bf0wn02<+gt0K+gixh z-Ap<^e6L#3#2~f1u4B6j-KPdKn4njyF|8KYw>K4I^Ab{C8disjhS@nZvGIkqGgL^5_?iRYxCtIQ8rer;}w|VH+3LE@-V^+=vfDcHh501?5M}@>tR*D)W8G9+pLS zAe}HKkAwY7Wxg-j@|*`Ir5|GGIN&r*K)emP$$q!bmT|Ks(+Z^=g~39{&jMFwH)=J;nn{8CA@2Y zfe8H6Nm$qZdnkO6{aeKTS4A<~&$?B=+P{M^R>jZl(7nf`QE*8ef>^t<-Y;K-8G?@Imok*J?FfF2o%Fed(w6X#97GemW$%>M{0ocoJ%LA@p{L z{t|gHVx0)He(vjB{d-aS$d9n63Uem8u!n^5515V@ilYF#isr|K z^=}E$zcxVs2qJRz^ZT}Jshj?5ML(`_V>?{^gD5=xd1ZVN+=c-CBk8)UpTC@F zlEr|Ny{`TW3QvD2`nLq={|d>u`ny8(dzGxs^A5HL|G%q$Rfzs=0s8Za#nt~%i2ef6 z&tcHH`Zt8=H!tf@KrDZU(z&buZz1~YL_en)I#>UnL-Y^hmxaioejaPO`VY+vZa+Fj z|4QOVr{OI-t>KdJZu!lfVnbc&80?&SybmF(C_^RWFA)1TQvG7b_4f$+seX;ueIfeS z%c42AU3B%|9io5d1*z%Bb-~ro-xUg~KRZSL1PX)B)&FjY{;Y4L>gV;Au72J(F-U*G ziB{`4R8D?Z{~;7UDE;#S^fQ;Me^H437SaFJUi8lo(O(jvzc@hu4?^^>7yZ5E|E(eV zX9Vak4bcB)i2j|T|HxkA|JxA#3j_2Y6`((l#_56i&o_qTxXhq)^M8K|AC&(q1N3uV zy7@m>^t84-aN&?C8{UJIHuW%NvOJ~2EemgI) 
zp>E^E0q5Euq3}WJw?*vNi6ypwD*aTy=6{jsce2S_v)U?M2)fzpnByH0rvDNf!xkZC z{k;C&)juyJ{@z8b>frw>_c%Zv0OU(O-6%)vxhn{nP2^>hBc&ZvG7u{jbRQK@HIG zmYw$B4zYjd#Wuumodk33zc0l8xnln|NWYl@_P-*CZ$-?O}}cXIL|Q&I1R7qS0&-y@}GBw4fJCb6b?82DkywV`Op2P z4VWe2IsbXCQuV9-XN1`Q>XlaEt)f>C8eYx1CB*)fSJ@C{PJ+AkuMM$(n7kNW0`~Ke zQ1z?*TSUK`{~NEh`ae*^<>;8>JsqNd#R0>k472|Rt6$&m3$y>N5c{`&+lDyXDOhg$ z|1QM-<=?d_v{Dj^%TFWyT>FR6JSmH~>0h^&KyteOj6F+)*Yuwx;obJ5<$E^79H;$u z?JuYBLFMP8wN?YpUoIp{*Z$TJ`=7kU>VHBFl?Z5fg?|)c|K+#Y0Ij{)e|w1iS@L2t zzdO#3)Bkk(seUa#zYVcJ{RbS1zFTDv{(rSwk3;GGGxodrmnY$Me@P`!{ThC%gip!8 zJ8g)cQ2&I^&A&+$J}CcMZnhe>K>M|Xh*ZDE|I85k^EX(nJJe7yLc=S3M~MA-57+?T z?Ir(i3$efKcClaTALm~y{Zzl&|6+*!8y~XzpS1`7zlK-)zYq;+e;>9%I#o~@-1N_) z@ImRH`$HR0-`i#Tm(ox5tNkUS-z`7GHd=)v?7{!9;nl3=A>ogd@WcI()}!I{_)tjv z>m>X(C%hZ~4I%O0c&F9ys)XnGpG7~lU*rEli2c=@Y`}LUydE^X+CP@=583b5pMpnh zh_7+xfOG91L*aw+Z_SUahP6=sRuGTsSNj)3JtI6_qv34)359a8-(Y#(YfjON{Id2WZ+r`>30DUseVnr zJ&BNQanrB!X{)uj^ovsXp#00fH#Pk_0`ymjemDKLgy>iMHT^D-@NW9$J!9>^t(W$1 zMTq^c-e)yDDT=uMtPZgMZqe_i-;=+v`g=>iABN~(dw**Bbq46)D*D~@%iUu2PjL)$ z)9(}ksV?+6MXl>aki1CuN%znp&E^iw)5e~lsf*S}^J$}mQ> zr`~_ph3KEl54OqGng4eD(L;V$|J5P->wawm@|-C;O70|gS&04(qF>|5@xP3IuKq_v zzngz+MZXk7U#EtbV(EPvX8&(&kbQe8{~w3gzeViF{JS#1{@Bpq^0RWQ)qhqm`bSgv zpz`zTqc(=9|LOq!CyM?;t1j{XE$e-d?hnzq>9;1t{*8aMp^lL9s~m3rUln5iRvCDs z{d`NT`n6m<6QY0TU#!OSd(rX3XgwnxT*GVmE2aGk z*zdL(TLbKm?j2nIcMh`pBToP8>K{bmgVL|;H#WTPlgawm z1?aC1(eDkh;m_|y|HKge3x8+BIye63h3H@RdmDbE1b&H* zchFDsU(3(KA^K?B+mB6h3PWrj}|0?>OWcFHeZFsl+(C}(g z))#I0al^0dWBC0;!mpL^F;yhTJmYwhU*O2Q;U|ZL&y$J2-on>~gs+qENA#k9 zSxES`68`XB!e1H^eyfBZ-AnjeLc-?_wfWzh{SStOuaod&deQ$vNcgo9{@`B1zZ(*M ztArobOZYUJpyQaji<>&s8@ZR1w z{v#ZFUHi9$gfEluz3G1^B>YMV@9J~UKMe`LR>B|Hi~W03N5nC8%l}phe`qh^M~8&Z z`=U+1-pb#^knnX9zPI{wLP+?veGGp_NcgQ1zPOk8cZP(|+sDSgxA5N$3127Sd&|Fj zL&C3>@V&Jk&q{c@jY@J`Bz#K}nwY=PcHq|%-t8yLhS~VL?Y~8BnPcj=B)p@>YnJdX zuHhB`I3)fp68gPa^N*%K0>f|JW+fy0CS=*?hW~&23Xi@3 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm-mod.o 
b/third_party/libxsmm/obj/intel64/libxsmm-mod.o new file mode 100644 index 0000000000000000000000000000000000000000..edd73c49b852688d23c54bce742a27b50be583cc GIT binary patch literal 48440 zcmeHw4|r77nfIMcfS{O3BCXNVGOTe$#W(@9p|z5MiQIt+21KNaO+pAH7)Y87l2XNN z`8&oa+tR{rTbIwZTh_YDZrv_xZ5!|pXxS~geFSaS(tbWGw6+3TL9Ot;@44sP`_7s3 z&bkTt|0TU-NKM}4GpC>&QbLUwm&q;zJ@U$ z4jjL+6^^chx_|cSH_DTl6L#hWJ9EtLzq4XQD)a0i(6IXERE($^So>n+>jU=cr^}@U zl(#b{?cIrrB9uavA|Y@00YoIR)UxXOhS&93hn)+^T|Plpq{yRP|kr&Ul4z;dzSHn_GUS`dQ8Z z--jN*U;S(KlT{huU!w1#_}utS@i{l$SiANYt7;N`rL}8cTvZ+Kv(^r*s;aGe$6otR zJo5EttA1gxJr()-Q>je4qSMaIuV}sXw)p(`ZSgM%labMDz?x+#P<3#yk9O;8`%2KR z-4_D0Q?OQQ(IT3DsJ8FPLew*AwPjyqLQ&i_b}B-h-PY&d{m8kh+n= zq9T=fDQjI(5}yx~d(_;4OfsoyImEE_h z+K$+}>?>BqQhmkt>fIr@eAk|IA`KT$mnSj(oo;&1B|j z=d?csD_~d<4@+f!W6Nc9BJzVo#V|Ai7QjxHcg}hkoe>VE;9&xs>iamlmCEb}A9iM- zzXKugjgTa;E*GnA#gMh`j^C`7R~TlhZ^hWUweO^psmxLP_{+)FCqt1RG+n#i2`PFM)Ojn`5smy-0x{5c5c7phyz$&zN?>q%5{$srO z-S)`n8W@+0>+Q^|sVf%SDRenC{J-qXh>A#NHOTw~<%IWjYhSbbuB{NGFqwHKvSBh5 zP46G^M-Z@P_K)}gk5%u~L8XSCN4v@X!uM)nkgWYhWCK17r220S#aI8|VUacWgJg2` z`=N9<Nv!0)a&a|E zT!AQD&B%A&5kw4isBiy+d*e!=IkzVif+iHcf(}oSf9TeS_z#% zfoQV7ei#;y^@?NQXHi{d;(mM5E6cxauYNXUhu*9U{VX}@?MVNBo;-Cbm3dRtEgbF4 zQ%)y==Cd;gyyknVu0K%`675h2?VN-jm5kJ$aD+*N9PR{z*aj-u+IZ5hKZNlmdR5i{ z6Uga}GBwy?Xo(@_j4y{GdyoYiVcFvgM$%a5n(0~4_3h)Y+RzdUcU{ zewyB2_^D+7$OsI~$c8WqWu8x7_F{bXZ(xXh6?l`Y-wUOCATLNaLmp1gguF043GyN7 z3dmuI!5jxShF#Dx4u%k#zP6i30z-9{Qe# zzVD%b^3V@F^d!0t1k3*hr~kUp@9fawB>Xq&WTgKkG0aYh29sm$RKBrxy))EO(BgG# z2i%cHo8<5^G1Bz$q+fps<11@u4R%zS$CGxy3O1C)6k{Gw$-#`Li7=kvMvgZueJCy3 zm^+UCG5y4?P>%&xp7IgZJ{;&ma2rhcTb; zJXml>bGqZX^j|(4`8n=n8f{z~Rfft{5pTN%(@VJYQ#$NaOp6p(cHTdBJx>7-UDdgewT*wR4lOZ3H z9t-)68QiT+VfUCQVL%nONyZP;Q5HnxHJaD9=Zb>JWn;1r$epE3--S(d;WVGkaa7uRmaE8#+Iq%^U)5HE!T zaGBa%gqMOIF6V3>ftP|DF6VE?&!i=>x8Qu?=8ro{!j%{R#l=EC`HL z0CpDyMkxS03<4Djz%GNpI0eZ5iWt04q5TyIRDXpfi0H3)R9;-WAFqT9(Ag6Ei||r7 z0GFx#C3q?5;d0LYa=aAea5;Z}6fa}P;C$hJg|_d9N0wbh$FRTdKKuQZdAsw!pues! 
z$bQeE-M^dmkCauH_FP-M+d<@_31^PqDU zPTwWw*!g0P{j8W{FTyz%es-aF!c2vZycH>hX|*(2h;u9a<`p{N$QJ;e|3V>lH-Mw? zYn4S6Sk&$o!OoD99$7RIi|&*~(0weN^K4NFdpZhVfXmX9S1hJPhY<>Z5^}`}inxgc zSYftCpP&qwA(zx*(msxpHb~(X6UPgfyU}4MO3+L7-%!^@bbUPe=;BadP}|7am%6+1C+-&rU|-EeW2fY<$@H&dbK>L&du z(*N=YXZf9KI~au86-i&e=-2Fe=9qo_73cl^PUHLg(cb&}=hXZAU*)`6wEh*kIlRB$ zh3^&F_xJPV`}>jbUNK`HZy+Mx-`C5}efwFq>Y4w+r~qgZ)UjMioz*$0DV(k3e?I2`E7e zty<{=I2lkC>_HLJi-Ol9e8(iqIXnDv~arv){YY+ZYR89eGflmOxhdi-+$U8u26e)aJuE;CGLpR$5)m10q#aS^FF-O zED>AZ#9M(ykq+E=X4&GyVQ450m#IZKUhGWRZier0a9tvXQD$njow>0bu48x|DV-XH z_mCyGl5c!_M!iyq)`~DYja2tiqwwZ*^`EUu2z?w;r;~pC({>oVr|eSCf$KJGH`Lgu#XjN6^gv!{JRm;U?)vEF1D`OKcld#YmDRR(XIok^Ll!QJpVp!2e zFjfNiC|M@Hro%a6F#`s#*+kkPk0>`!o-1ACX_wgLcuW*WTZ)S;#PI%V>AI#G-IceZsQ6dCCsuDJv8`)*NS4meDvT zN=ytC;I;85vCqS~Ra;zgU%`~(^8PS1!@Y&^;<4+7#EWCAhko&#;<54KC=^X8E}2qX zRCg|vQ9$&sP1M#c#pQER1H@v9;_5_k&6Hw0QCuG{o;RhqaVEAg^6#NKXTXnbHZqJs zIQwbwi1x=y>}4JJOD>YPD*ct6-AZB?KX@_vB# zqDLfNi8xFk_ZCdYn*~lJVy@jQ%Rh#?l$fe3#NIYwMe;^Pbw^2z>c$LM!k|0P zAlGiddL?!aIt&T@+hoADY1mkhds1QvRD}D}jN)jbcx-~qlQ$IC-&kCPvWKMXb&hPw zjN)>rSyy~YaSh%{VV;BENIZe$CyTz;NLgA(a|~Fk0lQOTa%hU!HyW^q4A>40i-9kD zC5GEDoc=MZxO`n0Dx~8a3>P~};@mG6m&ctxBi25{Q5f#sm_dycOe-#Ggwt|~C-A-k zeeHg{pWTP|v;IQ38QqKbvvot`#nr2a!OiHdbMVd-gPR!$xOX@E-0;I8V@dp6th+lU zM^^kAhH*V4u`O`U$5jwlfni(+q|Qv#!MKu6Tt^fhD6UoDeKIGm>F0(Y%@@}k*_mlv ztr~`L-6^sA&_^+zP3^Tph2^aOq)rlbFs_6X*B*&a3LIB%drdnx{D1PrHBK&qXk0ZK z)(CFRkyvveU)a9$(|o?{(0H*w?bR@>?>#H%b|R}j5-f%!Lq0?F%4^k@`S`bOkcKXyx87PY8d*mPhvK%@lJo< zdbA+Vyfv{HY(t{;Vry6Wnt^q91MZ5bsK{>FnAM#H#2w(gD+JB-@)KL48D@V?_}`17eu(c@%Lm+H@gTXg5xOlS4DGOVFor56| zw!X>W>Wy`FaoBnw8<1CFYb%34$l!m=;14nQ-!b@i8GIXqKg{6YWAH~9{80vfjKQ}v z_&+iD_Zj^09Q<5??8?D$MN~%Cb;G2@6oC(M;E? z(&F22L~cC|fdy!p<*oNBq`<|1o8%`7d2b!)#A#Wx8Tn=*@2wBVNc;G|$C&ItAmqLE zVN}ZFhh>xezYF>ErIh@}09StyxVLUpc8UuyUT@u~@JfMu>qZsVOo4mrM*I$fX&K-q ze^v>3Z#}8x9}u{=o>cgc1wO_x3c12x6}Y#aR4sZ^;Fq}aQF%5xZ1yJxc$sx+j{Gcv zd+SVPr%T{hEd zTHsT2@VNr7&B4aB;jakXTgNK=xWI34<>68! 
ze1~u1y%_K^%Uj4w3?BYuPF@d+_;4cV# zv5PA^ZwlO7KPx+>FoBd=x4ZHRpCE8={jBZ;Hw%23E3fdC0`JJdzb)`i7f;Bu0l-c5 z_L7kI*45%$0O*`xvQnq0^i``TjbdVaN{Vm?s4&%^6b+B_tyD% z-^GOQy34HloD0Ym?-vDre-3`1z&EZ=J93F#=anT9$Y>4|c8-xVO$%^0x}yTjwjhN8sK%UwOP);NChvCH?<_z_-dP z%M$O-;p#sG{-BF1`$q)+w>kKE=V4qAxwv@O3|E&6{O?>`*_kHr@8;kO1-{M2m7Tv3 z_`@!)>^vaw@8#e>6!;@9F5Wl8)d7J&>f*}I9|itc4qjG_acy^T@va!It`hh^xwx{E z68QIB9F{90-7au%oj+INe+js0oIVVAsr3UX1^apA*)IkDV;6_%PNcF3%I|P-B|lZ* zKXGv--y`rRTwKXNEbsvrR~8Qn{7DyA@}=N>skPI^m3^zdZ9z}>(xsJ)tjg5XtF49x z2{hE*kZ5RVXzu9zk_)9A*@hKuOP4NMzN|Uj*0C&@1~3dxVQEu(VcVib4Q1tTk{H5mh zmTp%m3+0$P(p^o>`{eJzXt}+q%;&U7m_U zI2F8sH8-`ldm7GlLuc2bh8(bIL9?GBRAk1F;%Wdhj0oHvg*VwZU-v%b`IcWl4+GpB z^DxNLJV7W4J_Zc4>5ith_NE1J&txO<69FPW5oF5kB_1}OYb>bde0rQ3xa6vtIWR||q`*U|pPvX^$}88I?h?*0?SREVm8TR4@|3o7qt{uo zT{F=eLIPn-wKjFPHndS`fiTkObo+Q~DGb|=W@}Mb3)jnpAh#C5!UcKVZ*FbsVq6gJ z~t8m z7@cb4bVXr4r`WX{YQj>`JR(J5KC=tCSW%N*4vNBjW}EYwZO-Ryb3U^x^O;?l&+N*4 zW|u98d#$+w7q_Gv(oJ1>XzKEA`->X%f^7AMhC9+t3kLUOy08VGC)?74(=zhF?8!dQ zvNZXWj4bd_z&`}DGYedrH8Yyv%3*R!U z`Z-g1X;$ok#&bXWe)6b~Nw*WE{?~w>it9|}g%gm`_R449PoB0HwG)zx7%#rvR{kdl zo+0>j2Iu6d{ria>Dt{lrseOub^3*=Q4LAA!HG)(76sPtnPVN5{u|wsz5S-elI44i- z<6Cl*|KBD!wNG(spW<}B!nfzj{-@v^Qz87m13!hU9Sn%$TXKb82kZ2Z&)9gu5k`7uIfj9p+~uYhM&S?n!I{{t#%Bc{J-RphtDA*aq_hO z@ol@}9V~vEa^b)a8#}UPYj3X*P z*l~1>)F-v`0l{g!my>$n;-z-DdcK<2`30$m8iG^ar3B}AshuFaubw4dYKP-}pZNc4 z!aG!cpvC?}6gGy z;pGTI!tV7uapq)Ed5-w)@*EFT{`Ulr%5#)&CpeXVli*Q#j`B+hPUVjhJSxvoewhKs zxMbCe(uq^i5$aBPp5qvAbiwOQW!G{xk zFTr7ph?@=(T&*b(D#QwaguJjt!A((ut3H5**AiSk(;;3@aQI~JriBDoYeFpSA-H;m zLi}EWpO1h{TM4e7F|p`zf`3d2!*L(MRbR#8HwdntxezbH3V?(M)o1Wp-FFbb5CNI0 ziTp(hgky@}pCI@af{!BjUV?v;;N`NDVX<1nWAQkGk0$ax1dkH@5rU5)_z8ml8NqR% zD<{4#>?Ah=qS zVDVmpUqR%{$9R(+Pr)S0Efm z$ia_(P9S)c;A$?y;)w)T&j^Uy1iuCWndTGx(+Y%RC&8-;zJ}nF2)>2jpCR}o1ph3- z2MDg#idg&x!9Pdj%jIN;^>7`*69iXlAuOIp@ERiDPVhLvdkH>;;F}0OmEhY5o*?*> z1g|CdK7wCQ@IpCRV_ee+K8oNr!FvgwB={zRPbc^`g3loMFA1I^_}c`pBlx^?oD1}G zCc)bYegnaK30_a|#|b`*;Cl%^o8U2d(?Z@G37#PM9D>gy_)P?VlHhX*zK`HH6Z|!T 
z&m(xbyjh|@YR!+uF@mdS5X3hT{8j{H+D34-Ho&4M39g*C2ta=p5*+suz0beD5%RcGmzp0riDuqoYSRW9i6wki>*)W5^{?+_e>9{ofz z2(_$F9)r-M-!LlSBYcxdSM9@4+XsCG)y6?zLFu;TE10z{5X3v2M)C7lZTuVW&!9xB z0*k`8OJBdGRrm%PR1~(i`HFh`2?L7;a6s)E3=ozQ4Q@3^SkN2PwnN{zT-g9FDA^zl z=E@GjZjji#ErJ2s*At%L-x)mN01f8}2Z&Dyi_L05eU^LTd|S`69MosIXU^BMv&Arg zziJ<2K#KwVRJ$8}Es8CO0bG=u9D~^l;Go?4=xZ-%lLu{WAs4{Epl$4Ki1Z&E-quLp zsMR(_f1^rLY!?h{S8Pu7m4^Gj*rym&FqjY0YA_dIO%TilDHzNJC5#&=^DQ#6w^8Pk zgrK~glKBLInh5%>8M1a)25AO&Z=|mq`TqLR}y1@&6fGPl@Nhx+^4!=v!ySah(PSC z^cD5CVET%pgA4p)Sl}N6?9B9Kb6TO=U(jiU0D|a0Y~A#$VA#OvCj#3zef>u^*wN{0 z%AXB(clxq9wFNsleI?yJpT3e90c`*Di=fRv0@w-~hz)jy`muHUvjurN!q{;A?lagH z`m@2t(Lk}m9#TIx*h%Uq0{ck=vBB0-UsL{UVwH zRo7UR%^gdZwk(6UUCTPsEtS(|-gISmx~chgS+gyz4U4*(meTif^cC7*&hc9oemaw% zEnL+a_bmGaWoa4%7l_HvPHs_+dzO8o>t71R_)7hB^~ZrO+T{F~pCLfPN%R{gBKXeJ zq_5s}`{{p^(QgDC(+20@<~l#_F){HUks5yd5l}Yi3lMBv?lL=Wk;88zsMd{m=5xXZWo~ zM@7Yn@#8Cfo&O$&zf1o=8ASC)t9j~#i zX~I8KiK6=W*)H{?R2a^6|DOaoQ~!xII*Ku=kM)Q91a$xLyS<5Dedmu#eRa_JUuXCm z8yyYGe?CAu{{>KSCjKJ`LW+vq%XI!S&^N`uvC+|?{J4)n=bysxm&gWkT zKhWr?P=4GOq4VSZEK~e3$?rFQS{eQ$lArcp+-ITlt{qJiGe^Ha8 zLF31JozDLh!`~Q?AO9USQ~OsN_;DYI&i@C7zc(QNVTQlez(3l+e<9p>P4&MuAV2<1 z8B_dQ4E$pZ{0WACK=P~l0#%Xp{y&-F-y``||H1j^V)!Y03h!X}_euV!VD&Pce+k2X z&=CJ62L5j{{6_-ve}myKS>X6g`~Rf|{@o0J3E-HbBKI;q{#^`zje-9%Fr(N1TMR$` z`+=CEBKI<#{|^j*r-AoWy^!R%i{!YnH_eP)}I)69Azt6yr`=oUK?=bxP0`hNV_zxNQ zCm8r&WcXu%WAcmt-xz+Y+0m!%kNddv_)jtX8zsMA`@hHVmmBzTUzpB64hT&BAL|v9 zU;LMXK8n_$Fi9~3|0F2a`9II_tIf53{Bsz7+ra->1OGaPzY%aue*AyI@HZOxaUYx> z|MwaG0m&~9!WYc+@%Jdh-(%pPY~cST!@o}w`^CSX;ooTBj~n=hz{G3ne=#{B`|;z% zZR&s94E$3K{FMy<#(?~nG5iAt{#pb7%?$q$$uFA$G@Yck|7?c;fPsG+n9%#*-3-6l ztnOF;`0u5f>i>v=KWX6qF2mnA%!${J|3QYoXrZG|=ieCy{+}`YYG11#|MLug)WBb7 z;6KjrM`c6#wf`}OzuLf$`#SaZ9}V?v9Dh>KFaC=_-_-u~27cTps`F1{_|@M_^W&ew z@V6THZ#3}Zzl&>%|46|2moWUj27cVfs>i>X;jfqbcPl`LCVl*WmEqrF;KzNjI{z+) zU+qt>q~vg}^Z$t9-)`W?w$%B5&+u=T@#8ZyO*%jR{l7vGsG&k|zDM%MB)%Jtx57{7 zzW@MJ`|p$de&eqU^iBLH4DsVWV4c5~;XfFVzlPy2X>kIl^B=xa)%m*_{v!eFe;LC+ 
zR`S#OZ!++Ii{USk`^f$3e>1~B&k#S}l0aM}RWHvm{Lz5?Pc!_T27ZKe{&yJunB@2C z|3?}A4U(VM|6&9GXlQ3s|E~`i|3#p0>i;_o@wXZHQw)D&K>leA|2_l%?FRld!`~_S zr^k=~p1LXi-hlj%F#OvL{5=N#*BJhd0r`K$@DCXH zR~q=sfX5X7R>_ZjizdDP@!x+p>2D9He=DOu5K#YaMt@&G{qHjR2c`a3Y4C8a$NwP6 zna1A%S^sqW_kzCO|6XADd*%QCl=9)nzwK(`kK)Y=Qp`Ds|Hu9-aIW*e&+zY){Oa!x zNHaO{zr*m?OMdM0m~i~9fuGL*Dc~{npJ)WNoYbEiaHBx^IC5LmJ zeVJ=npSJ%7(AVR~|1ZuIKmNP#m}WW$H`n>^XZTB69TnRC_W-2x|AgU>Nh0;{ zd?*KW{wEmz8p%)F|6T+CZy5gGGKUfCmL{El2>gF}ruy3$Q2$Csf2-8LkCMZ=&Oa9P zP3^x!#&1h~?Ef3#r`P{nhCg<`qnMQX>Y($_V)(5#MJ*B)|55hSA>_Q2+Og{z0kVT;TYja=rc!G4XFR#E)}{ zUjHSqDaF)(>&qQRzxXFG`i%kgZ)Nm5r9So zzF+*!jDBN4{XRy&Q|kN0zmAE2gCYKHhWH<4_|>1P_KW`oM!#fCu=;zK(T_@f_4j)m zYq{Z`vy#Kpe=^s$X{_Ozi{r?_@|KO#LKH8=U#Iw>rF#1QNemF}yxBd1r`syE>q5nn%n$r1S1i%#k z5gC6>^5gvXIQ;bZi(peDw!iLY<5)+r+Bvv+)ID=f!k};Bk1ln}DgO@u()p7N|JH!~ z35LJkz`w)5pJDh92IODE@b?<{pD^%0&+tbtbL!8p{-0s^w;TAMH1H3BbvCxYw#>5T zNq*H{jyGQWsq3kXeq%s=e9k})z5n$}eQYzDqVxjJf5`AFeLw$CNPReDrHztbA8+bf z9f!g?9qUi^b7f?!)E}n+c|=)E>KKvw4(jE5rM`zaXG%}u6;fYszkO1_2<{Iw>Gd%h z^iAXUpzJ^D-|fNq;~DrV{c>p({n5Fx=z#3F+du9I?tuf|l6C#zfaz_I8XIoF+_J7$ a742NQ^(HUHo@}H33P++6;-g8gzyAkWfUj-< literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_cpuid_arm.o b/third_party/libxsmm/obj/intel64/libxsmm_cpuid_arm.o new file mode 100644 index 0000000000000000000000000000000000000000..3e9ecc90272cff0b421bb18945b6282944b00b50 GIT binary patch literal 1360 zcmbtT-AV#c5T11_t%NKJ0=w9*O31@13DHF?Sm{N72wjG(x}oH1xF@h6>Zbc1p-1Qu zf-dSYdW3ocn{m(T>9}@L1G_Wx&3toa&T#Jb4)!CO27zd>14A!B0Zt-A+v=MZ#9{0ljrEdKGm19%@pQ&a5oeKpDB4X4 zv~EI6#-p)EY=H2Uegt4n=-(9eJON+5upB*fA^;tvp(LV?kucHmgfP)zDh$Rd{c9PC z=`cBhEg`ZX)gR2z1pRSB+ZP?zpT9BN%`oD$+zhsx$A}<#q*_$2-2R2Nrj_ z99YffHn5840V9Bo7H8GA>pCsWZMC_>aJgxZ+tmwT9j9qCr(Qnq*sdcw_@7E5op!0- zs5Pc6<;t>(4q@sBEjdZDRsI^rV)*@)zkyt;Ij;Y~UNMIx;j6)a7cJtO1H?Cuh0hc` z=s(bh7!PbY^;UKyTg|NqzWOrm1xM|R?n;(3bm)~>GKluOIA~em)2Qin1pEM}H1%wv pJkt;5(^oKqnrf(g!qnXnXWuDybAHtyDQtl|NFcs;U%WkwPsX1Sz8JAEiYIN|QDYg_xia8h%{g 
zcXnpItnXT>>XUYN<}>fjdowrlX6(_H_SQu{ACux^o7sF#QpU#aoiB%ZG0Yxd_p?k7 zt2Xu-)d{28(3AbC;gM=Rl}Kmex~<>7rRk|{Teqs;(z-gfbhK?!dv#Nfn|iO>{LLL| zLXVll8QoB08C^}KO?61m4ECF7?%AiBeY)DY?coaaw6LG_-qh|)td~^d>0Vvkt*7)% z%~2`-r1qrsmbSxsTid>4n{`T=${$g3KPL|p>)}VUj%~4S zQ+rz5 zseN}wSsh_4nlB7bT$)WNt8>%|ubWGwC&-?$6HD;^fLt8L%VnAfI{K4*1kuTFK=*yT zqSmo!{)k!MZhaE9jvhjAZ{o!pe~VId{DyD1B2>`EH`j!fAD_^y%~9(^-fqv*?8kQs z1s>lgM@hd*$t^Dw3Yv9N%l?(>m1FN!Dfvk%Y1x0FuH-7I-)YQ7OXzawI_CEpHMCFM zPj9A8Y|0mu98ANx>q`C%Z{mR;qB*d`v(AY@Y?%n~P}(Q>_**7cwB*k!`IjjqYC6;` z4cud%{QOv%^s(H^DIUZ5^X86R^<%ie^s-GkC`HnQ<-So!I3P3b%vQQo*RyM)ZFbF)`( z(_BqoCe2H)YStNAIE*biLJJ2Ub13^a>Z~$*6K^H=3Vobt4$&yI=?}?rb~i%E-n>KQ zX}mP@w_nx9Z+QkwL9v@0FW9)*PRhKl9c6nrilI zSz5qhzn1+JA0?M}$5G4UfZN&gR8&?oZTw=THvVa)8Fhlahbw&qB_6j}3i*%u@Na=n zK}GU&oZkk2*3FM`ehob1=D*;a5AEB;DL#@=VHThG& zO?{|NUDrivI2to;zCKj*7^&X}8xr65x|naI%J)#^((>miN3;`L#akHr3;AJ@K-F`L zB7qf?{zyPQuviPMduE9is2ML?^g=~oU2{Nf3an@fR5b<4TbCt0=) zGT4}4bK;A5UKC~8KL1rHRJNF16|3%Ub$`(llrcYBbho0Y`KhIt_{l>t@iV^i7Bu;3 z!+Qm5%f|u^cK7IK_&pbjpYbDiL4%(yTfhMP?4A;Q-mhE1&wl|ovCDqux@3svHT>(2 z+HUxg;NF@03g-*wCnR{(Hh10)ytHt(Xtw(u84htW6!u{Kd)5KZSii& z3I64~F|YZVvVcBIWVvhI@;y)NC-iuI!uVGu_~8PUj^nCwl1%S>ezxnSa(}{XOq_6 z-Pzi@tz~=ncCD$srQ6Xnbe2dD=_!`T=&@dwh$qv8WG{R4Oi$YAH}^5a?A0?Fme`x= zH|N{=a2;)X@Tu6KN1=O9QcrEDrTLR*1~M$N(JeZ>Aojjz>~Ug`y7>304Dsr}V`GQk zu)8VXwhDnhes9@7-Hy0`BYR0M4+i7|>{-cI;ou+vUq_GRUw3hwkCHbD?v3+Df@55G zr2gwJjy)(jzt>Y8{VaF!6E45Vol^fDS3mA~yCOLH`K(L-T8VySe#Us$y7c9Vfqszd zq~A4EhJfSjl3f0d;19TXn930F2kDXeI{-<*kpm@v8juA1bM#0)07wG9k{-zoKoanW z=#e}JNCJ+vC;7{OB;cQ?NAgzyNx;8AkL1W_5b)LXNInfn0xr(ZK2d8G4W|#On<+79SAd>3!Xa4D#6jNw6ld^}IK~aEboK zHg#IDhQQ=Ll?`E8@O-`Y$ZfbY{IB-NIp`BP+$~{I!T8)F{mbuGYAj~IOBfw+8&g@{ zZ=^l;H_G1mljpL$kH~J2``v;+4w2i!Zb;>~#3p%3I6|(359wdt%bq7z%>QY0O5)zL zsC)fS5G&??m$-UMKkyIVUjMHWgMaV*|4I1opf)I#?Io`k_Q=a_84>ocxgnM1{7L*c*%zDt3*xH2ODJOg zaleuFa$O-FuWy*BV&RU73Bwt`mL9J?&K~dF6JkFThtH#)0eh-EregMC@k4L)7qK%la z333}2=_|Fo*4O%4+uE;Z`)Y+&stKrkJuL)3_)z(>l?4F-`4UauZ|0o4dvdc2ZU66o 
zb@$#oXJ*cvIWu$S%)J{P35Ld|dA%Oz@Or-LN!&S5k7sF9;<}J83q3iW8$9b1V7_wS z8&izCi|p5o$wl@X#-z#isk`j+R^)?htMwAXH(TcfCPwnTVo4o(wOX$rV9-8bL=K!c zBA<9$M~Ic}O`V`fvUPXrAUx;S$5X$CKA)*k3m694qot_Vjy<8OA-v zy~fP-;+jIRItm#csWD#kgCKJLnn&vS%060f?2%`UKNFk0)8#Adga2-v`M9M6y_^Vo zJIsQ&9{zxO3O`O_)nE3PG$3MRY2oDn^b*ax`TksQ(Q+79f62gCtfs50OI*{4OOF(p zza!_rwrGXw3C+#a?6G&m`}-LE1*t!&FY(yP&aSSe&SQ|>;8Z=JYRB8#yn;VNy6w^V&|6t8emUum}@2GwvpI)=*fPJ1NjQtisBE}Ia8a?%{ z;$pu!e!M?eTU%Xwi+^&}eO1*DRQb#3){p-mD5zw%_8v z6HH3WDl7eU^|PyM!WHvND*f{-sw!regiGi7XIEC2-UmKJ-mTvY(pOZ~)QA1$WhLSI z+OqCE{wY4y_2COK()AFRXY3e%WyynOwg2jPlF|Pw<4MNnpBRst*KuW4Wwn?QHLu!a zy5>XSTP0Of(X zD5(Lml_Lo*yEvT-nCq?zJZ^eI7Sydgp=^xf-ks3r;($OI5YRXD0 z$}8fdNIHJi&z$PAx~dz({`u^DN)r`ff8D(5`pP-1%L>iWH(tNiK%3slpF` z=Wvm1-p7xzrufUNYat*R3dfeJaOHz+GNkGSWw@!Uuc@i7C99hKpkK`zBuFxUldBTA z+H^TxJTMRQRa#y1AcZor&{X5l`A8GE0E>&rp@zqOlU=bD&Gnc3BldE~m^<%sdj7xA zk29Z+O#i4og-RYou=(fh7NL{q{$ab-^e`C%y!pw7dI7qck~u7Q zQUxWU!+KigM|`cp0OLOV)vx$Rkde^ZJ3MsPq+-@qQvXo;p zuH;m!21_-mw~mcUvA%HJ*Zt=La#dD7hzMN;--3Xg-jHlb=6in0LVq2_tg6y7e`Q(K z-0(b_2iP9MF%|=?3q9H&DD#(u!)5bp!n7#Xm4z=b?u*X<+}i5;8d%f+@%;ZYY?JAw zyW9!8DzERR+6%jcuJ1&?iS&bJe|AafeWbst60Ct(D&|0|b^dTk?cA~?OJ!o!(^p8n zgS+ZWk;kxSH+B6M+XB(t9~+?swM|{m^WACeVZ;*4i!EJOJ403yIs zxM4A){!DLpksB(|(A3q2Yb!bm%?_a@g0HN{W=QU*n!0FPBAUa@|8DHon!3I_lOQ9Q z@h6ZW-s9uk;Nylr=nYRYk26gDpUHbv`@1-=wM_C7ll+BAcvx2SD=e!BZs2`r9RFp; zUkQA%tgYviJPXb-*J1G`Td^x>1qH!QbS=v-jzI;CgXn(*N(nY>3;>|_Xoa9|KH(; z^B8PX*T3O9six5O3l{Jb;E5$mbp8K|8~zN?f}*%Of6YKL>!z;X@NlliwId~lvtH`I zr7P4;(MoX+VpG@CAP&!J>iQ+lR#evsn?}6DrwA`!-PR6|!+%C_e^b}bnO>hG(6_Hw zco0VV)Bmq~8ay45W8O8G15c!@*8`WS&aBpqv+KFlBI^seD3aHKdXUeuH(2(L>DHMuY&Uj8 zpf9(!;^OeHRBxzvOewOvjJtzW@-pmKjY)?6`eX}bFUVn8tsHo$elDBBxU<-U9K&@U!C4&6%n-D106hvIRxX#<~nsOxc3F zjSI2}W{8|NvG^KDN{m7U?V|aY5aA?fd}!YWnq5q@8+SXHW*cbG`EJZ2q0P!DiY^4# z;roF50plLV-9g6P2i%v$;thmr3f}{ScN}<~jCUS)XBqD}@Lmy%A0s^Bo06Go#N&Yw zijM0`g0@Vd!j>TxwHxiRE6t^$z*U%y#XK9QF?LftHv=F{Ko?I}WWN`aI4)XwV;`}k zo(hq}+QqjUFxvWuf_aTC46K(dSDlgu&{i%MZNO)} 
zX~&HAgI6}a-_`Yq#~j?=^jD9k(bIS?TRhqyof-DW;<PD3nOzuj z6OZ`>nOjpQoCb?u$z^;awNZncpuH{jc?cw?@G=#f_NlBFk~YPZmsz2`P{xb>iVJ)m zz+zX^CC(KLlJSA=^Yr`Xk8zi%FP58P(V6m~SackNkprRWA@B(~9CPp{41@-?1ku*B zhE*;H5xzb?fc}_4sO(XSP|Kg6aPriw@Ci2n*0Z^BuIxNf4gCy=SevRMnQXrc*E-(S4xVA3g(K{Qd`ZQd+$&);E#HG8jmTkdQ2cb4h^)k2<2h9NTL~O3^u}6n zp--1+{t-1fqq`rxUQ{y{yMhHMq?lOOTNxb$2?zQ#qXDCP(+dGN|pDl_U%qVIbAkA`TtmGQs2ujh;al{=rJZD?c)KD+dx+c9^>(O7k&m{+ z+?>(9xh>voa`Pp6Gbt$E`wVXH2TMoCZALB$egJ4>S>Bvjj#)kJt^R4>rx?(Y=I+JD;g9&EzGTR&muIxi!aGfS$AZ} z8M!81OK0o{j6Z4XhiLvX2;u>0BK&TR3`iZZqiDfCiq_i?vDTjHw$>8#c((<2$pPeX z-Gkkda~kiTYrNwj{og5I*jr;WfEnK}J|x#yIODeS7*ZATSI9x@Xn(rIc`xA3`a}0u zGfFe53wY$+V#x@oFm?utS9~#qHlzwm&3M*Re+%hD>iq>Mr4iJPlm(ypyVZW^nj*WS zsC~b`XmCep@LT%q(6SM1c~3mHg}NwIdPSPJJJ$7ayg!m_=c%qP%w2rh5kA+dvnMgn zM)Z!(KO+($11|J#-*560*0$Jao(0q6iB9O=ybBLt+hXJ6^NoFiA%1#ZM0Nti%g|O_ z9R4h21GE;MQ*t`@{h4v0=f&fQcyRZ?P{f2l4$N(UTq@-QQ#L(64!-Y1d&i9&_NEHbxE984ZB}M%3^sqXp#NkMTA2 z6D$T_Ayh+*ZI69j$%*|(K*^>=nnWU+fgBd5yIcY4F(DU{k7Gy7Rt$$EoDmqh{qJ-! z2cBe#AqVj_?Mq5FBRazH|?tL5i(bRV!8*f7(+xpmEOjkUo47O}al2nor zI{wtsA30;>pjZ5KgNR&#Yo3sa>Pj+=hEyr9&!q3eK{t+hGL!v@DwMrLS=ly7h>C+5%6RBkH9Kv zo@sv7^bUO2M9)0CVn!%2x#Hd}C&kp57|uq(rX$8~27ONK21p5sW782hl=Xqo5h;hA zIhjZ~WcP8o$R}WLCIo{eZlWI80*7qGA~2N9kIM??z}J*;@QmoVxMLXN-m^fO`d1hq zBZ{#bVg*nY>66|!Q9JL+cUgGXr0Iif1g}u(+Q3AdTbl#(aI#khRsz`q|6J34&1`)l zFcsk2qy7IvOp5JJ1j7EUzX>cPcvY!exfK{|ZH#Qe zwKH4SGB@$eS{Ce%=*X);zuvN+4Xgo)y*BVHPIpB6VhqayZNvraA>x@chP@7gJp~Xb zwi=P!0;n`#^2Cxc{VVpDMAIJ2o;DT+Q4!NVkuY`Q3(Rc zHA)Z^4hM?i`)PBJToYz?kaA7jV`KjfNN7dciFckDeu4`SFQHOH3F&1Fc!h{i=Ku~& z$O{u#7q3HX!kw%`tdn(!CXfkt_VcLCv@h}R*`;ZD{ePRKgMRx0rbUpaE@JZq$w;#I_^GpT8!lg)#nm(1g?<2r9VVY#X!ImRp`H2h=w}u0Frz#Zi2klcEnghQ> zBOMkzDklVZ?s#QhwlwMx!hfdKel*Q$kEL1mJC?VLjB78^z@m84sE`B+D58Y`+kvy$ zx{Q4y1o(%zFShrzu4H3FePj7ZNMhoi9KnLC5e0;K8YenTxZMhzE!gK)x(CXWIAfD5 zmeb-Mf*(>tk$p5|cbfLAAYO%gIzsjnfsHt0EDbH2#PYj$%0{Ru}Z)Tn zwm5oU+N~IiXlVs*tQo>nK8QW{Bu5_9Ft$*AW24c2_{!^89^7p z!ZE@(V1-yR88abAVCM9XRH3{3lkVIQ&VcZ=;;>^CCP@<$E@M|S 
z3!Vr(iF?sf2U^3971?J(_9>8&mxHW1(1r_gZCats)@QlF@+z0dnLN%cc$T9EcJRoB zK>a%aLHz_r8xYaG<$Z!fh?V{u4mP(w%Rx`H*q|ZbufaSSYh_*ZDY-}wd5R;Zx%39B z>7{J6yXvDMtiXk|?&nkaJr|L_AM*Bt=nc^aO|PHQ8}?=P^R>EH@)bG zG&(`fq{Ks#Q+Aybs{^&OA6o;F95)cnvnkni>RsPKliL+qJ_QC6fIP!2B(JXpP6xrPja3d|vJ1OoxgJ?DNLouA6Xo#`vVvoZf%2!;- z)Waj3v9|OBGPY#mr)Ndva$Iw|+lMSF`j9u4jtgb)&b|)eVEcXp`D)8L5j+>$#~HMl z*!Kma-=IW(9o)sh!?0K;vnYg}K95Z&<*e6$qCc=_pTNaoR1DPfDqyk*jod*Y z<>$N`S)=Y#%>~Z`O^CFpi%kaXY7%Axrx|iO-qWUIFSO6o!Fi!R9Sim8z?KpH$@##B zY&&hpUPtP|&-BFDv$3@pc~5(Fwuo%*J?s^&u|_U8G?OhL{Om%=-?se_QjG_D;Tb_% z{1HL;rucOJfl3*XLsaSAE+YSpJD$*A;=+vH>5csq7t(()g`LJ<5ac|VXe>F#_P+&N zu#F796r&rp@4G=s|^nlRGumL>9DfV|L&p^sbXn>&TyB`ojD=OG5ntuaH$W$@C!(WEP zcH0SeHXaMvI!=$?3sj%2P~z8_cy>^ zXJOwI!?!XTfv_Ji2n|_LX(lA2ku%m&edJ>Cv&a^TqIdKqSz9I{)LGHBfxb{VvPU}6 z!2yx7W&*s(PYlF!%hbN^h&lMLgFiI~A46_TixW`7n+okZBNv4qP zWYx$Zr&2#5{GP!dqP;R@$h2_MlLpSS0n9CG{~)c{+d;ORN$K`dBC+QL3PVVZpT-TP zo;6sJi|lu4<$VgEAZ3;Ydc}%NnI+@x*1r= z{6eMc945uyv)H2hkv1}u$C(L?muvs`B&D`WH#^iJ@41k7pXJ>M+nkoMBu>nQr%q(e z{Y5hY6GEW-Obo*+{W!s+E98CO@@`-rFE9@j+o-Z|iUg50bRP;v!LwHBi3C|^$oqH8 z`wEjir^wRcWC&e|bVN4!V@+wr-t)!Y1ITlExIa>4>3JQI^%>Nm7>ZEIgm&>Kp~Ogw zz2|!9n$%1x=5e-U!{In7r71u*Dz`hB=4{K0U6AR8t=uZhE28D^$Yy|?gJta5KqE9n zzyHaxyi|s{bq-~uOC*NOS=TUz%30@`x2|Oq7cHZJWZH+oX(DplCphDT{mX>2GB7cO ze0Y`@u3_H#8CdvG-YPv^v_yc4HSbxuS9Q|NGw+&9 zZ;t%}B|=tS><>qgqfA*J+j>l%Kf;-|@PP3y{yVsjjrveFKEO!lxbj{|4Y~NDm_<>W zct7Bu3WDq3wN8I6}GggiZ?yrcROOVDIQGzczvyaQ~hpq|PTdnrJ{?Op9*5D0{ zNGnzBZLnuXt0bchvFC6>;{FI?bITblv3zlbL^&G!4IobaY|>=Yei`+%llGgapwXt7 zGWiwc$ISuCVJV!_NJhH0=xTs07Iou1%5|o@l1ymz3qEw_n@kV z+L^tPsB`a#-WH&cyb(w!ry=Un1X9z~@zD&%M+Ive1PI0fmI|EWPB2x0b$D>(TuC8BC!|u_C^y$ zIUg{6IMpqfj*TC0tQ4N-n_Nf}6;vsJ~GnL5t6<$dNRl zb817Sx9E(gGmE1WGm7n<#e=sMMPjJI#ZZH5M-6VD$|Z^-_!Zhje5==hu1!^pYKN3X zk#+nU&|$XHqZsCa3A1>7AS3Q$|~FoU=)f@=}Usjw}IqCG0fj>P&|xx0Om(-&F_MM zh$J+xRE@fa-gs(1mPV6h=q!B!cEQewWp%>c+?W45GCp5}# zNLpkf4a5qWz)IE{)hAaG0urI@VtX6sPgJK;ilB^5aK~8<(*vDD;q$DEp_7 zf~_=Ovw~^`|1MG~EJy?UEixQTtPie_EN&JiGnYKJ22M`lk5;}MJZg{$3UaCw#nF*j 
z#nIaqgD89O*{9X(47khXh1x1TWUWGkIVdxlkoT13Js9$COjH3M6-#O<#iuG@5t3jk z01lwqli#TVuH!1;ey+NqCKwGBpa{5`)(BMuBzlA2zEDNCYn?*Fb7=W^qW+9%NEVQqB*rcH0;x;%obw_!kb2mN%2$A_qn@zFR8;3q{Ly$eiHJXlg(vB=ae_3WWwLBsPS+(x1FAT&g9KXNs0*kP#KzFWaYz z?aipwk$WP8TN&6(1_1vhfaP5&T@?kwHMkcogFt222SLz=dl1M+3A9QRz(Q9XLJC&O zm9XU*q(8;>F}PsReg$L{yEtE?#g;NlT9^?D2p&Q`l7f@zZRW^OAj^}YUPwPi20`=NW;C*nXl$h-#vB5FjjlNyr-d~jS@m-Mfe0&=hhc_#KyNGI`Qb)h6S~kC= zx+N`Y`bDt8Gh<3?WkIfSMblE z5Ey$iF++^UJY$wD@|i-d%*N&pAF&*dK_k`FVpFkgNX(CEzd_}~BY35_SIZGp3*T{K zyjp!)Y~Z6*h_`V%L}V#$SgDWT;_$WQE2uok?_Wgkq1g+zY{#;M3Emx>1&q8SCg8Ej zxQr{l5mLjH$>K<6GGQsVre^8zFkv#*ROLa7cvFc`GIOeg#=xDFU4SPlgoUIv>d`3@ z!kXPh!gpeSM17FviBd7%lz<@U-9~H{A~G*TY})}4PY|Vf$VBMg@;=MS8FHhiaf`hx zZyQ>&QD)=^E9Btu#~t~^e8^Lrnwm@fc-%1+5dConwtIcCUv*y5{xe?Q=vkrUeMLPe zDQo3@1#E5k3s?Bc@TQU~Q?B5b zXWeB=JcZ-UMKK|!DpZaEse_331f5c42W3UwN|l2YVsXJCRjS0kAqeR&RI6MJHnhK( zjQz#K7pYbLm~f$#xfiZgHW3XeF5#7dS7KLf#GGDv_m1AHPN^N0`PtW_w*L;^2f0J zI`&WNhLg5q<)=&|1bZhz4DTkaJ+v%@u(2Ui;!-^F1lk!>;=P$uclpQ=OlklIu6))A zCTj={w(6DkdJd2}J;(r&vASMkLTm4H>GN-A7_k;qEnk7{v34~0a9>`%Uj zK%1FH05ISIv_(J;f%V-fGjM71S7|iVO4OV%PRW{2t#s5K@k;MyNa)vS5g7( z87G1i(|}zC#eKXY#V^>QtU|Me$}3gR?DcAo5yX#s)cyk1!12+*^`Yp{N97wf=O7~e zofXmipYkd#Jc&4}5^6SB&=i3l`wn|OhF2NLM4G#&INE<`k#|?I_XL#%&F@Jw;@3#9 z37|(=LpNdR4#awI~HRXl@Dx0FT}SwmiW zUOCPDi)6!vR)tT3weeb(O~qDN#0k)c6ZBMV4S7#9{U05AR9rRs9dY`5P-X7GF!1t- z?7!H%A5~_Tilbvtde!J<@s;{7&Y(K8rUouFw}-sPx&Nr_A5~kbC_9x{)<(+-B(3yt zKqmenuD-&Od}hI%K(1(6j6@K5e<4;K`3RpfYO+O;6w)@qqFU_$tUBZ%qJ`eb1U^+_ zpWsX%fdnrPNRPmycyd0`@@phRz^Vb3ER>l*s|jHiQ<0W1LdT-|98PI*` zhh7Y9u)Na6K}FHPywyqOk0T2^JOvJvjg%zwAlm5tpKNrI8T-Qrw6v{{y>UdI&*4nf z8^E}L^EmEfw@{Uq_VXBnT#fOh+w^XSAL9W>Wgr(zF@-ybwU*-<`6upab3!4CY624> zj$DYZggDr#t>w@gLMUm&2#hm(f=h1;#J;5-3ie}pWUY4>WQ=Xb7X^K)wsPQ~j`Ss# zCd;lHV)eK~Q)lctT<{PrdUdA}y*Kjf)~IE^iRgTdg;$(V>$L`d*sa#P0R3?xml83) zSbz)C%+2VS0nb7-zMuz}xHuZ4x8czNq*p&s{1h*!+;wITD!wcN$Hxu?C?jeHx*}(L zi6xsT(%Jj)_%H8tdn{eNvIz^sKD;IOb%=#$m(Fv`--0Wr1oJp07UL$))gdwcfS7(tOxq`>9T)uuiGFoH;5Ll%poWk?2tQz}+mp$cZw1Bljrib6 
z^fbKVHx2LX_3K-QuNiedEAomD9$Y;KWI>R9fY)*A-p=IfPe60Z5YyiR&2cep8z@HB z;pb42$|`tGG`|J0&?%^ii8;z=MHl%((XoEItsLbKMaScba{i#VA&ug?QA|H9rp3T} z8?}5Srgw^I=cwBqvfER1{f@le0klqZzYVbbL3kiK{Q&OHiRt@AzoB*LxpGu40Q*G0 zhB_b47!Fd@{6X)aKd`{nr(!x}>DT{T7=tOMZvb{X?&o~VH&*n!ZOmv~j`87p+{-ThY>EZ5aVZ=pM`p&w_{Vd$wGZ&e{a!z1VW3|LCWo3WYD7d zP0|lNpQLOzJv;{A8R230z9js4eB<{>uL5JNSUu^nH2n0YKi+t4hq_nK;M)OwJ6g7m zR;{=-(K_1BaFFSCthzlsm(YKlRunbTilbl0OS`MhM+v+Cy~OtWX!v>tR*ekYncpY6 z0e1uMlH%wzXoo>|#@(Rq4M{R3FJgW-7)?@)K;C~WkRdb3yexO9U{iey|IV4XD1nSW zAp&?}SHre&RtM?tNu+zx(K1T>2d)i!EBydgRDXt$aK4((_$6vOuCJi$qV~N=ye$Nd z7y?JTmmgA49&6ztg=i)%;wmjX9Ud|qp9g%#pJW87ei=OehEHd?*xg<(IuGD;zFaK8 zb-s@p@M0!4)V}VM(B1o~1?aVJ0kZpJ+}-%T=yyG=r8;8*eTOFa;GWfOFq!JD^tF>& zn4%LUN71>P`5?IXwh|h6gTEnaJ7fo@*iGe+eBVw2ZpWinv75|2|Hkd)1zFI%k9rAk zUzM`2m#CYM=<8Fbuanf590_szi2A}{WnXW``ywC7_kBuzVLa&TWA5t=`Wnf771NKd ze*#^?v8#tppzrVmAFRR$JMiCnc$9*cr5~OmR>^z~Jgl2dY zxIL+cqNaemo~Z_ubT&xeEZR&kXAGJhnng83kgF4sDUgK=Oe-`)<923%rakp6RKMi3 zVmgRvw<(#F7vI*xQl8pEc>hSSe;K zXz!#S+>C9_cr0kIGoni=3ox4YBNKSUX!@8>!ra&%^C1NE9bd3cED3RxF< zdvh?_)RP83V8L(N57J8k8DsI1%#vg9FZ5y9?Phd^#NQ|Tz*eI15mT63jrK!?-+w$9 zrxVTd8ckzTnl`5oqRAkdM;)4VX0oxMAG|t@!I&5fXf7$(ykH8;1$@icB{+D1NQM zKG2C@f5f%DO@=QHRWQFV5h79^Tyn(?XO3u;{+w?2iRSO49Z~9Ld>(Nk6&&X^AR-!d zlA+aZBRgLQORU}ilUzrjtsHg`D0ja~?z0uBuaT*~fmj7}WDGuvqQLL`L=ktsg^Mi? 
z14JO{6d>A8_&07A{j!P*HsP<;z%J-H7mYk*&)+ruENUQfRI zHVO5Hl#?ABUkFbS78jdIG7dY4MLdciIVEUM6B#_j8RRJzicsN)V2f89F>pibp&BCs zLs3f+4da$-0BiE^>K(Yx_YuQtQdQ^c4k`DJq=-pbl@u=XZ|p?ayDf{UdZ=oTt||lq zQWbvMD|YWD{2RBxAwyX{IHaF6*3#i}NPf;^?;z)bW0^MoHU}Fc3|9GKI4iQxYebjz z!JGK74_~?6cP#lY`Q;qtm*kW7M#qQYgbu9_KRw4^k7)K1FSmAAH}xC$+Fv@<>kuH~k|#!u zzW0H8S*E1!?!VR2e+?V`O5b&lS?`y;nfoSB9;rOpv^PsH#(1S8L(%)8=#~AbN6n*6 z__OZU#R>V^cbEu<2>hX4|*dmC$T_O$QymFKlNNgn+X2_$p);dvw4 zaFrgOjqi-GiSJ8L0Ov%H<`=n9|8Zh9;@*Cr*=H+wAdOp6gg+~QmI1Qm09tlvEy%%` z1|;1m0Pjl_?fZOvHfs2(`-nXU@m7Fr3X-`Uc>5y_v__vT<%n>mJ>YTbX^J6MZz+yW zO~*~fOzLx6dEO?h9goAT#OiHQ@{aPn4#dbG0Wm|YUPq$L)Z`%eC1UkPVyN>VcrP$v z-ejh%28~&7FoQNI@9fUdQBP5HqSt8GRm4kU4l+zyWwGS;E`iJ__KV+f$2hF+oe9bd%%$QsJIpfAp9_-H_ zHfmU|e?;zxyxdWFBmFsd;t$kX|59G=U=Z}jhYs&UcY3@Fecr43_Rgdt9%aD; zxFs=0f>9@QIODc(4fo)qfRr;j%lBB?=&bDJ>A@_2G$WXmv+NQhD|bn+FS zrv2xoS^0n%SvjM#{J;do=&a1(W!&cq#VbcCl=^&1!9K6few=A%NXK0IGO~P;G&9Se zW?q`*8<&-7UFMks@Y~deqk-PTWx0ag3_$dv!Ost0TGbC`zs^=vZ5ncj)MJ3o0r=Z zY#VYJItY)7&2;QhwDPX!H#KK?j>F1(9_exG>fV+f_dYjV3D6zvNk814~N4h673Dg7rcoQngk?z4;T1k#{Pv0I;GEJ}Ni*a~<{EGNzX?Kasbk=m@-nmNm z;AOESM|#}LyEmqLKGz+f?$SNkDd3I=c5hDiT-6;A=fml1kG6Deyn180=NkPsex2^| z$FKE^bkDW=ZTvdjGbn!TTzr8qGClKLQ|VXmjV}203eG3(>qZ6lyQV%*!6|m?$Cnj+ zhYNnIf@dm!acBx1{N4^kp2sS9wo6Wm7arK>JeSGcso)!3@Tm&!r{JI+_wse7=R=nZ zP#h)xIWBp=so=9*@EQeQLp1ojD4!fV;;;o&7cioL&@7hQBQZ{9V8=*Va=fFZ_t% ziS^W3nfhS9FW1)1xg-n@dWG?Ft!K^&0jF^!lcz8R{LU2cX({0JcW;yNpPvF=3;5;u z&p9}!#7!&1oowCX;oK1Lrx>1C$2rFXoc>bH<=Q&#66<(8udA$>y|8Zn{F!sAs)m(%#@sb!(im%4-iSQUu+W&BM)CKT={uJ%Z>Del z%BQOW2HwQqBk6l2(~TU#*9;#aaU{9q%8-!^8Ns|pFtZU7pJ_)hw-MYwf3XZBC6v3P zuRNy9V}2u;A9s+)$a%~$0W==9jhk`TXg!tEMG~V*>XwT$-Vqi`jryB|;cP8~zM*jp(9EG6tPXo|D zNzx5SsC5Ki!YN|38!dp^maz z=<}u#GiR5>3LFglqW}*jT5EbXYqULD9j?TYSWAy~JZuPVC*yutZp+!X10T;&38($L19##e;gdHz@G1?Tb|DV@7aE*aKL_5X!TlP1 zrv}&g@73S~HT;tr{8|kz*KH~PAPr8>PU&#VKS+ZI(C)}7*H4Lmod%z*;p=kF(BRi= z_z!FF8#MU8Y498kPX8Pj9d0@QTZ0cqyCdg%4Ni0Kz~9#3x||0zIPFCo{F54ds0Qy# 
zg5Yq=^H~i}&m9~-gEaUs4gO^fK3s#3(comm4xbqsT$jI8gVUbL!LQTc->MJ!AENNYc)9ej>G>A4X*QfTZ7Xc&cXjwgWs&dFIV{|HB-*-;NPghb^i2M z{^=n60(>0&pa$3RLmK=R4S%`@r#+>^r$&S8d>+u?w`uqh4gM7k{%;!ms~Y@=8vJV- z{C65$?{~EZFVyhYYj8t@Z`a_XHTVGyK1PF|)ZnHDPgf}}={cyuKdZsVYVbiCe4GY% zaya5c*_NYcXFKkW=T4^Pj_0H+I;fjz_&9v-As`Nd>v7&sKpbxVl@iSMW8vE2bCLwa zK_?x)N{!3S|5pm`=HHeAzas_yJhf$T_e+2BfettPwAxA#yBYX6@?YAUu%#YICl35t z4NkQR2i}+h{-YG|wJG3lr+|Nw0{#Ux@g%=4PcQ|1W(xQtDd0a&0e>L{{GAl=PgB4L zs(rUx{%IOqx5HTq?zWR}rNCd20)N}3P#6yCcP2j0Jf6|uvov_8g1h^@S=C#Jk1i)l z|9rUd4{P|kU7gn8C0f5j)z*^iwn&2q72M5#iiUr?h96OIl2h+@xdzw!-Kg==``xGD z?tV{d_Ygs-{Tf^!SAz!E$Mp*ZckA;71$U3@ zRSjPsR|XZ#lgKGFxZdwL1$Xy*uY$Y#tgCoBw_d zuJgY_*_B(*uWE3;-yIrU@ArgepSu0$tGMm9|2H+bZvRJ8zIb}|1UN8ETA~{ zuh{okAHOKsa9p}W!>1Yo9W-Add>lR_2#AB=#rQbzs|kpM;5r|wUC=@B@%T9Sxdg;P z@Co=haOZiGo4+48bP!ynU4uTj6s^cYUR6{3USDm3!_K@F}D4X#Xsz#h-Ax(DZnOJ?I7u9fF`&Q-WT6L;^rN440QFa61in!?h$ zFg>bB+~ILVY4!a1_*Z5y@)_}}IHK%0s z6R8o$#pp{Fj=TR^M2y3Et}5@k$2x-1no4U~AwK$%d7r#<;x?@t)CcK`j%<1IEW?>( zbk-pS_B^TL+)h1sqtT&v#`XIY^be{a-YrHdTt6Q<6oiZo#(qmLx)@cZvvKV z{1esqYt-UKk>45rCxDSG?*7}6`lnRR-Twe| zo~-}9s{gr?P5j&4|CJz4*8dLG|13pG{gX=F{nPsc$@-tD7DMN`xMI^|{9j4YfByZF z!g();`X|}l{m)L(|JG|Iz4N?V-u2Y~Jt_KcTc9XZV4(g`4xIi(9>H+xcC2#7?`m+5=V`PjoBu{N{}WY*H2;I}ap)bt zO4c_Ca+Ws7ICiM|#`U=VIzDcCYE~fvSBYb*3j8;@8r|ckJ&p^7nQUDod98Mh-%bC& DSxj~t literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution.o new file mode 100644 index 0000000000000000000000000000000000000000..64b703c901876c27560184cd9e74b5957555c8a2 GIT binary patch literal 40672 zcmdVDe|!|x^*_9uL|rt^1e={D--xF(yTw8!95;FoeSb) zwCZ*78^g(W;(b)(w0|i{P?d1wYV(NE+^POGAnR1WwoTRZMb+z6bA9VNO6QQzT6q{K z%=O6@fo8PQnXRJrX~Deu-vHfZ^)rlS{HRO61@Pyl(z${80_D$NuAcc(m3de-k1p!~ z;PKDYrc<4_p3=H*(^6&IwCeZc-*(EIhYj2R%<+@SZS`N0NF`a;($WH&mE^aXm{#Uj zk{VKEjgq`sz=D4DYtFTdsNoX7bMjGPG6djDrAga-M75{ zNn=_Ue|>}Ctv$cFr3D$d)ce$X)qB)|_YBaV*uNftG#m0+F9vl~)vQD?fV(v#SSTJ} 
z2nNOd#b6hL=LWlp#}&Z}abFVbA?|a7Rm@HVL%2n)=fqtDmi~AS9a~$VB(LoVb{{l8 zDK_K5VkJ2akKyV=%5PVz)!ReLZ`XBrKV1C@9z%im>(2{U?}!%~`~1cUzw%V%Vsy!z z8mxGA;(0#bN?`Ow%^|@qni&lSHFIFF9Dk*F2xw+Oa38y)o2&I?qmua7MWQc*P)D~0 z2D_MJ#za9SsnV>npoR`bk+!A=1ExOg3uVHRAUSA_3zq6;F`ichi}7sg6Z@okO~Z2{ zSdQnY85$H$6^@MvIRwCmtQnFgX8&<`m@7D-betGaI!-QCI)=I^9rY@uV}BjZ4@3>k z>^Ie{iPxkGV%H{uC4fAmnadl1>iCLa84IL_`Fp1dCLp&|<~~me?VIWrM+s7)S@B>w zcwE4=xnKrz*&!0FSwn)ssA_?&5L|&0(bS7UQeX{;YF0lznpzTELa;e9IH=a#9W`y> zCxngb#{NLm92cAzg)oh|J8B*kqpn#sva)%^S`u7CY*PxO=4YtM2i7zflO-evq5`~5 zy?#1M)|!XGO_tO^MeBN~VOmckD{Ai1%n~VOR!yO`ocJ_IysTF<`I+M_n)xidWEy0= zRa5%R@#U;)8uRe0-mm^xed0tjIWX(hD^OrLxoll)v{WgHQAH?Wo-xi|pd|kGeT>A} zJk5Om1rz{R%9ObUL1xWuGFpn1SwBVLaOFEsOeyu{`FxLG2o>U25|>>M6i=LxEUxWA z3If(xO=jkw;=aba!NH$6-{$cUVk%) zeW_b8Y##5hSq;3dB>qxPI$xJ1-NuGpDdi;bI&g)}kIlvNfk;yp9)x&Oh?jw2BJT9~ z_m>sOWLbSF$j(<+h%80h^m#z(tfAw!-(3I9@n-5x%^F**S+PRX8V4jA1|EjwY$s*r zN=$Fr>A&0{`bLI>kMRyhOPP{fhU~Pl8!%q#E23L2v11j`m}mhXJ_B`)C}4^4A!ifa zI}s1Y;N#1DzLx{YtX{Q@H|361rV$6C&pW_mkj^I3sgjfRUue}^Rpri&m5rt~5oN4? zJ=ixT(HB@ULvmReMPWaXv4vU7@u)s=#s_{ViSbpWcrWsJRc>PjhE|#NKPZLb^;Ht# zv72Pa>$|um63^cz@qo(uiye`G{n+Z-Ew|q{u=zRE?#F84{`j{g|Nc=4ZdC735^H`) z3f4;ke3gyi%56r=O^QlFpHPcaY3Wt+Yhcr>#4ay0qLm~!3JdNunIcuMo6;;mqo3M zBEqJ^gn)>$U@hCe5kjE9N(;!pNc9mVF}57!ES}C%*0+sj%t=bEYJLH^g+TOTc1=LD ztOSY(V}x1rBGS(spQ0tVp%3B8MliK#+9gJ>pkI`zzsTwpG^UjMF(4#RsOI>XcHyz07}9z6<2N(sME$8v`12t z$CO0#k3_5J#tYp+@G-6Wz54HH*0Y=qmB(OXLdx8AVqTexHvy(Dhge~_p5I3V+v3V> z)mV*&cZtU8j~dILLq@T>0tyGko^3!w%1awG<&Ni>fr?WB0cYpq$?acapXlF7D3Je= zDd>tGU;z1{B(FhgsnABt`HQ)I2E&>daw|}G246lG?F(rA5y{$TC$rte31CxZPQm~| zS$7L3H^dbV`s1*DgDcRv`tz2N5v-%kG~hT|HND`aBzIv-iX1V*GiM=3Jen5$f(+Sz zf*+JN=$`SS4ICX-4z;|>zGG7Cb0&qRbX>j*X;q>WE{d3_4}I?;-$Q=giU!Bf85%~O zLewe8y0NC*vZga-8ne-Y(R`sYD-^(}v=}EZ#K_Zxgp@y|A=nV9wNwiyF&(b^j_7O< z0uOPyp6oQ?%6{rMm!APPbi5nHLz%S}^UJC!3s)Y4-8D`?CtBAbyZU1#aU~LILx>1! zpakZRaq=dJR3qKL#P7Q?|4ZcYGBJ@_0PTS_gV9_jdZ(GjVP;uN@T=UhPO7xqOHg7; z7k|81H8(_W!MVp`_j45!;XO4y)5ZGqE2w2)! 
z3Is$Vnl(p|MwihH_@^Q;mau2p=urQCK^HrIV8A1Wg!DY*UhXpqp&3?;u9v z3vd|J#{*i2bX*CWpTM|D?VyB@Yv$s2fj4bT2lecLP;89?l*oWE4b}ssSyRf`X^itK z6}+nNAlmDF4clwM40#@f2ul1Oi6^EMHsGd8MOOr^1LGqg%5Nbpz-b`co`WB*yEJp~ zNUp||#LHi`wCM1(D|GXN%6HOX-%C&Wefv<%r$4cVbm-=;!9P)ls6SDEst#T4*fAZx zdPVwzKcVdBJHaRTd?1!`l2-vLTzO!pBuQeFJ}GrZ=_dg(N-wh+P0>z4EpntZtEW~y zxQCMXEhY|7R%ktIAQ4aDDQ%2#7=+G4#$!Q@;vv|g7UBG0vK>Y#LASMh3&~C!yWrgM z!erWlWU@s`4#Y&UELt|JXA8m`Wp%h+5CGF6Z?z%;Uwo@lb2Nry3vM zrX;$eO63M&k}*pn#_CXQ*U$CTU7=`dOekuKn$fP?YLnY^*!|kBU)3g$MHOwIZq6Yd zJvO#bkM%3PJymj9)LgI+plEDBSu{4K93|*~ME#%X%0e5cPOyC1IEN32n}ZSgf3K)N8BzShQ-${Q8`?B0MlK!7PDH! zY>K7^g(4|EWX<^!XwgC!w*~V{JvBZAi0(jOf^Lg;{kS%{7u`Lm8hdW5?fP16as#@& zE85krO&-u=HH8sHTklYXev#gY|8u0OMCvtURUoCCYjx8qM+#XX+@SH3N4`z<$kB)C zO0GtK#DHCcl`CBlSiZ^@xZiL>`lAK}A4)M?xj_e0^;lgwNSz2cuV{?xpPB$b^BfSk z^(MIXrdC~}X-aL4R-5`+C=%NT_zwJS0e{%h2nYi^C;)oXhj~uFlyFmR*EccHwTgD6 zHgzWi;jL)u?oe&=9f+K)_XhG_kEW(UG8U6Q)%fT(WKKr1w(Hv{88xjzY(WWctL^$J z*ci1&1}nh5mWY3yZY?JpP`7~<{QrW83?smQ6shSVbxN;Z9zaT0YUJIV^E5Ne`f8RJj~gvhf#`qXJcuYaq?8oBb-#Gg!0E{vSZ)x&SsVx#MI@ z3*D#0CM28|gobjOX+Sl%nSY{)Jpt7v{pkx9K*d$QV;@5E&~s7^iFPQ3$;VC;<7BzY zyOfs88ZgybPK=thcm{6PoZ0xb_hPcK{sjvqzCTKQVug7>Wg06?EG^X)WhgDf159zL zS-An?ZNGBF+kda6rkT!KW*ThhCHu!Gyyf99INk%@2BVqY!zwxz(0GRnIf$QW4}}xD6v`a)Zm+pu4H9q(9Z$6K*f9-Z zEFgu6v1i~XTR(llKP4aj1N;VG=%6Vxe{-y*1!zOU*S;UVcGgjw^#Jl@sc|CJ#jniz z2~swHW?Lmet;12{L%*V>=}sg+$D&g>4~b=0Aw|cc3k*NBoEUlg_vjAVRhMHLU{aoi z33w3{!r6`2)P4+Agq(qAmnq47@Z30y zL<`yYDpn0X`waO8fB95i90nXh*g-aPLEHdJEOI+w&F7KY9K8;02Wx*dSJ&o&RQ@R32VXA_-)wKklcXTRL;emA$%gRZut+WnQFS2F9xZdaHDzx z$!pmub4id0%tVmZK*N1H?ng!vd*4 zU?IT5a0X_sgP?0zI+f!O!9zr}J_YhGn-1lLQ$vDa&2Dt%btg8p`wXHU%EHZ~HDx?n zNo&DgY)D!76_Vvlj#ehz!aFLRC4p5aIspqI;rmTIg`%4o=F-fyD6tt)PIPN(uuH^j zL7aPHr0LW182uioqw3_Sc{;1#*yIpYgkFMJY8IMC_ln_BPW(vrWD}gaY|&FV*8#lD?A-9ENX| z9<59~2zmPuOT6nLAra#g2EP#!bs7lAqeAU9Eem?d^e)Qw8V9z{w=tq3c~rgCIV*a$2R z)vu!`Z-MB|tQL9~Emd9B1ghU6_5y`u9|7_>kgH1b`+WWg+3y5INqCLHB6#>iByPR> zEhRAx>|!I^1i?F5Z3kKNHd^{nz&;@CW0qOPo(c9{^){aC-exSI>Z)x>SB2J~-7*D$ 
z39CU1y9X@WStH8}rx%|R<2qzjU4;=~$I-!cBz8SfU9}oC78Z%5!x^Pxw5S*qZ9?CQ z1XGTJ3nZ%QsxABt9Y>o*6(YGwU|xeXJFe8dEAa0CeVS4(D1MD??FE^d62fm(sX*pP zr9WDD6V))_O7m9&*LIc!$H8IpEhJFmx#}vEse3;lxQTjISG^AuQDqpbzlr1!Wg?-g zPzr2fS2uD{hoM5WXrgY|4P<}v3G9)jdR(KLwpzU(@d?PNX4+H)foQDRScmoSLPVAp zZdLuKYGd2(G)tg#O9DvVVV0Z-rylc%t3Un4eWHTqxAoXD)o99#rh0t^OjvBX6>H6s zV@}1WS#mgBz2_HKQnLh538gk@{w6)P;dV1x6;S;rBe5g>%=oRy{E(rq{Hf3c`5HFA zdX@#1qQgI*xttD#X1*YFf7sloIkrLsA*5OxEyb7*h!Sbc{tG^H4Rme89B55trI((#2LQNtxJ$KDSuN?`+9N(AR2SWn4;G2)7O^vD(HbJ34=6Kl z0W#?JQev{|$|^T#=7)kG=5d&&nY5L(i13dce@5ut-6$P4Au2YU1=T#OH+`J1rS3<7 zasB6)NPZ$5TxN-VJZld7gg^y!ez92+gff=)3$jy6DW&bRo~3aG>2}nbLqd^!t8c2H zJIu)LFnP1^S=!y2RFA8bg-83Q3OfTNcD9s-&9yN*8vE4h8Aah*GkA?N;Ajun*i~fg zV?SzAg*{ax?XNYvU85Uo1L5j3%B;KKD2wC!Y*smQs*fo%E~HZl7WuhvS*rD1 zyaseAGv2|pQ_bzs)Cw`M)?%^wX@01ghqwti9fkCi^cfyPEaW71={!<9LkQ+wI*cK9 zZ0(1ffq_g$t%4v1=ma?La86)GA-QMwkPz?rK(F2fv}%7K_EbmJeW+^x7LqOeIs_e) zn?8p_dkNQ=+5LDr__Giw4ky+riC0hn&YJX~l<+$2F0HfY1Bk&)pearRh60a$4@(RL z?0oi07k^L8jzb@|w3sJ49GS=!UkaSHv&|J)O4gM89fJ-D9I^!pd*=FfaM56tXOFOq0t@%DEqgp7$}Vs#mb#c zVxvp$a&fCCfKf`8qL_DOoGMI00r+g6sIJiur~eKZ7BQ)k9jN59*Y6c;RCd9d!#Il7 zZ>n6IDzk>bKZT4mpqyqI6iLj!`tNF1T{mk=Mc-8LMt_Uu-=q4^YW`1D{|QKT4`4uS z68nV=AAWQNc#fF*SZu{mNYrCZ{&=UbxlwKUtaI4EQQdl~CN=QlJgw_S)xSmc?||IG zp;-7OiEKx%tW+eyEE%jG;th zV-ecvS1w|mwXv3HET&;f*tMv~sj&}qAckkLEP^|0Ye9|i;cZB!qtT^P`T147gD||3 zk@ddX*cmld@G~&eXj@_BwrH$hF#?)FM9H`ZmHc zq0dv$=is17Y(G|P_W^9{Pvcigk=2t)geOZOHEx$RA%dU}jl@KdCmIu{Gr+HXf=LGj zNq9U+c|@33RF8`eHgLeKDUko=rH~ffXi%W=g*eN;j14FY$Ce6!vTy;XC2lk+K(|{I ziN?-yTxy~5f7yaq2i9~5U@kO>QqtITy&(m<;kpWG%x1tkLSAqWq>{fHV7r?z2eLnv8|;;PJpGlMH{2onJ2bl708C2!>;wtZ!re;Ki+Zq zMLm=}3Wt&hk`n(VHXOf_2Th>R_X{;O^F20k66U~YxpdkkB0@608G2>B`KKSRgXA>TNNe*O$Uvop?cT^9%t)dVlg z644bAX+pQYl7SU^UYJ}jpg_as1`#%qj*%N7K@?>=YY&*}gjdWN!uy4*7_YBqP_BN$ zLr%=WHV+}PKo^Vn!sZF}UT|j>{C~P)gsJz@N17{^0P5ZDVnDqMs{>{eK!Fz{L^l<} zEmsBH(=*mo*#Cf(YjDG?<&7wZcuP6@vIdcb{xGyBL^L59dp**$2M+e@QMhmcfvf#^ z41zdlAcWb&k)Qc$0s4vaZ(MF69%IXR%q9_!iHdj(ZM!EPgQmHf9uKcM8;{wXiN|1t 
zEW$4+hJZ{|24pbgVqr0HH~r`xPDBQXzd{MDMX(q{L`Fl;L`VilP|Cz+Cmz!b!DT$=TFgutkKwq9 zc+9<6Bv0V_e8R?M*Iq z7^L13!I+0sEX^OV!WW>)iM6~2v^^9NToAZFW&VETBO1YIz?VwL zZj8mm-B`?PIAsG?5&srJzhKZR()9*`M^Ozw%7o8EE<4)%Iew}_jmQDR*t^~MmGP9S zs~SN;Pz0Qu{)>2r2+KIUX660FBe?B^Vt{3rz_rc^!ZaTTB1TR?1ZhHpB80@}w?$tv z91Fb%P8>q7q6pFe5rZjYHj-bt@fk##l)Cri0Q?J>5qx3(febgdaQKlCS3;=X`~i~V z7@BDn$^xfBCxSKz$*{+Qll@tKCvzCP8-ierh`XS`+l=aslp&dog1%v%ILvD}YV`k; zI7}DDVS+La)92rd!yNkmIu29wmTM=uEAldwCC|Iq!$i=&0Z%w>O5e;`axoN?z$2qS z4ZK6G={#6uiIbbE*ad^;gZqQCbZ{7{_ub$;@AM2+;YK{^|y3%p2ifipV;Nt zMd^3azX2K%T#6m{_?>m|JL?J^zjHBZ!Shr&VTYr!PXtz^ch};C_%APm=7$SMr_&O# z0^Er(60@@iBZwYZ&+_;b>NVo5b2)0!aENoRwdPv**YX4c9Y=c#JVZBs#U=E*9auQ` zD_70Uj*qk(4M@jqGAupTMJ-spD>H8`$!r3{2B}JPBYdNS(B3$li*lN|wdrK%Ci`+V zRdOLzaMz8(wcz1`C(r`W+v9MjUm%bt?f|~*;XbkL^DIDc5=GbSGjP!yI^FY}@4z3y zK9#vy?Aq7^#ezH}R?~%W2D{aydfn!4_Mbdw0evwL3y%B%(gB9TMm=8zhQYiYQ5G)l zg(O(Kfo2p&aZfb%W`rjY%#xtJJ-dHriIH=R zUds~cwcJZzW)a8kKG$oZZO31Qpv_uLiKeq+i6xrj!pKPiR5*f#-pogthp4Ju9n<2lum2Lm;NUl zVQwVe{1pWPM+=<2{hpY_*}eU~sUDqbQYF5y|EPbbmM<8cKZonIw88iY{^6WmIb8hs zWn3}DOC%g%lo-ZrT-djNyo1qvlQQcboGFAdVAZstIoXIE=uR?ioRL)^pjoR; zejJuHms94_MhL&eV9p8RW?B{eJtU6f&3>CpAp%$cXFESm8+>bn7fyJR274UJ2>)9M zva+yWA=N_zOWP~6?e z8pN(B37{SxA{@@oAQ}{(hoGxAqnWgfSQ4ZP!4s(*PV_f>&yT)=xiK448&VScmwo_ zHEUu6QW0Vhg7=H3m6BeL=?Hm^$T>>c?CjLJF)i zD7#V+Q36He-U;uls+n^|RVNO>B7(2rl%q>U03QoJ;+||r(!AJ~?E;%wzm3~14 z`XRw2f`Opgm<);3nn!9)M8a&u7el#9dn2DaA>JULSp%9` zN%VwS3Zdl;VF^_>W6B!^(GGB}_l?nX+Ay^fTKiVIia zkE={^q*0l9GfOZ)^Ae{r)u=eP%(n?o;K(KA1{b!bNuA%7{5E;=S$#7=S2THI6$2aZo zpo+^64AJGEfFIc5N!|kcvXr_N@jiQ}XZ+I!BQ?MZ4S`eE#dyGrCCO`n$NnLHH6dnlaeP^7Q{Eo8Sim5$HPM^m`5b5b}NuN9(q1!O{Qqx>ByxQ3{UTWqR0ivkPR zguk`X*x_?`KY2<oiIt)gpT*JPneU^x9y^S)!CI=nKbqY>+zrR3Xh{(2D}ck*Bp!6Gr4xr!$^PBw=044T05@3k z&2q4!F`OY@xc*MF@CJbd5he^0!b~`(%=nX<>UF8+NA&EFHKjZndk=+oA%cI6YOM9& zVV2z(G1dh%yxKZz5OzGnCOmI!ipuj*g4QrMMc4SXd^nHc>MxZU0l71Zmvlf^FAc|V zgPMOF_xPW01S^yoX)KB12+&AT7-%b|<8Cc31tS@-dqJ$k9s-kF@PZ=u>t8^=+Miy= 
zeAa+JT3U;D3hrW<4XGnp%TO+C3bq;y3X1fDfkYygo-jOWoE#`t2SmY!V#ETm94z%< z&~Z8s^%rI8OO)qep(jsr`on(~tc5XQOne?H+TYtqe+RQ0Z_XU3zsPxK2FH5$z|FT0 zW9B>d!&gA{s9D=K)l^N45k(I6ilP>8;fYC0f?y=%#dka944$?~*6mBez9PhNRT<`6VI1S17HlQvU zz>gQ=?G*9i8#;7UytR?0ZhBuRx)4ajhN`nom$)8_T6aN`Sd6twc6ZONfTcf zH4lEoTlPLY$Z^H)W9r@(ta+gW-c0qlS2d3_MjuzyrbAf699Ort^i55@Esu=Ft2SH& z?V;<91BIw@D(rtB$15IHt52v(?Fn;vIe37VO1Ox@t(&}c!OdJe1ixALkw=`x ztAsp+Up^4C!ZheI@vWl7$)iHb#WC-h`|;p$ti`AAFGgwwdo$jAX6|5>iQyM1QRX(@|SB4~Zk_cVI-hUPyNN62?-zoQ&6R zQnzD7)`P!0HPEVIyZl&V=XggPuvSx(JF80Mgz@QaBV8^A1qXMy;P^G%2`CS$a=PvQ zeW`-JVRNfmz3LbFs@bIB{SFW&henQ!xts%ox5rwK42~ONX!b+qvO|+L2C>h$rY6nRS+D5s+CXI7=?MS`eJZ305v-os|GhW8%NzQ5nn0Lci3#I6e** zL7~h;EX+xR_XkCZWK&jNqDtSiShfLShUspfVuGfoX)Jpp=6y5h9+1=QNbVH zhhMm}HuhnpX)i)tLA>d{-#&;9zS#`*p{z3VZY&)#Ctw`6m9Hq`t^K5iJbNdP`OHR> z2kk4cv*)hA>)__RBP9G7lp5cnmrME%o9)$&FzZgdaOVGl5#ZVQ!RR*TGxQj?To1z2 z#6BNZI(wPI-Yiu9Nyh_DDL?{M6H zM)XCpYVlZIfcJxLB4mcB|v7>jS3jV=Hqg;l(jvY<(SE_ws0_>l!&h%T@q2!+q z(7b_vG}BJRB?XvRAteo@vT*8h?YZ{XupWyIkjN@(yuc#+2B&p_$@))vt7*zXS{7AgQo^fUIDL@!kx-0>Z zfj9K*F#w5!2>oy-VXzxgNfmTZ{j2bukAC?gxMKeS(gb9ZejEfid#QXq1`D3Fd`0Hx z!kRV6{xNPf74LqyE{GvqkdR<$2uHk-eF6-T&dRMY^3X!8ipen*<9ui(O{UZmJuxc| zLI#9B;gKj;e{heWHg*8QOJ4_SGU!6f?$OlB-)U@c{sYsyp{d7Q-9v6c$nXTJ@j zo6!u{Y?cq0z{;!+IDCQ^)0UBmcrQ)d?TY|(89D!`zg5bM+w+q&WV+&`z%GN=V^o4!|sdD0E_5w()RY#3uZWc!J zJBNsRK59{9-B7e`;5>`>B#ONU@ltV?qUg8iUW)WO-a-m863kyoY*fJ=vf9J8z3%SJT}FPPvQ~hG-*&;Rfaj~LUTgJAvMo@ zz6d4M?jMj!ix4Bb%!qz+g&5tKwsKA|@|&!YSC_;1LQm5Bf};Y@V0YO3@c1Eq+^J8h z+m&k5*}QPq9r!BG0p$9$uKU#LHSsb>{&wTlv~X&0i~KafAt^`*j^OMLKrmXIz-Cjb zU@a?mT|d8b{IX=DmYRqYxRJv+!?asXJyH($Q|+2I4`Zg82F)pYhF|G*z@uc;&s;$F z(iA2=>rmR$&`~ly*MM+taX=ozTP`+6K#s=L;25mac-F(6apf|bQ}=6!0&B@VVkEqE zF%sn*iQQna&?HbHA4_c_HixX9&h`*)4Bpnp_R`6S#^lL0Di)snwk&D^@4;Pa6D$O? 
zhj*V79mSe7l{unW6==x<=ES$ zh$3+z5&x0g2?4hu48pg!JHt=KAxeVmm)>MO(JzE`IT!7hVRUKT>et$DLCbR6C;pu* z76-{A&|{+1$2bNFrF#@y+TlN}r3(8(d4#JsDl?+=Dd<>aPiXRd=J#9^l>8+c7smc} zBoLKavrMQVv6@YUPI}E%az66xRXYGXW=};2zKQ9;AQrGj+vq?=*Pv9!~d za6okC6!6xv=SPs>36kqj&zBAtlPWdcx0sJO7;?RMS@R|k?Vb%Y5q?_UfLzU-Tp&CJ zb5Ot%pY<~JLhQhr6N~Loy$}WQR=JYk*8?%3Ioo0S=HV6*A$w54?sEic;ZSO3Kr{On*xT@&l_#8&#v8Ju zIbNB8)vGVU`}m@^q=u}M+-FKsv#vGwry>L2Lx?|0&VL6FY#IYdj$gv@#khbVaX?<2 zDFx?F!+t!1)#aIhl3WQ+VLU`9;1JD2P1nH^Mi_e`n%RZpV-5?L{R=-Yl72}#?*+bZ zd;p6cyr8hp^Fa%lubYP|4+v7#blCT2l>L0WjCX^ixj3w*DKFuDz68SArl1c;7#ffO zRp%i-&G93Bi4vrQ>`6XQCfo7>-B%Lv78nVB_GBy)6&>9>t2Nl~x zF;$#?v>uN|?2(GxjQ?%*8;AKpH_faq!FD&^;|0}`tZZ8O(dzX|;yW0Rh%kVZ?`ZW2 zC2^3Tmt75m{m9eN>Mcs5NRZqK{y4PF0dN`JyiiFL0L%ePp~A{C;E-HoiUTX2IM?hy zql$OFoH14&@h@ADzwJ|59vdEVJ=F; zyAWUNgP1h%(VBwg7^gHP1~n#AHx=49qL%Epb%R-7YX1t)Rt+TrAwgGTE<@Kmi(ac! zZS`p-u?6E9$(S5kp`;JB!2=3NJ0tAAsi2!p88*BGw_LE{e<49Oh%+ZIL$GM}IcMm% zT8mfkcR4#2G4Clxr{*CEj(iI@*qU%DG>FfvfVz^7rS&XX1{T#~`_3F*c)VFN)nd(z zl$iIInrgtl1?|c8u0wAr%1eo43Iw91tjZOtrUUA5@RD87Px~~ZADyx7Qy|38yyqNy ziFYVm0pmVHAR~)7KmH183qe?dgfNi5a+JkYXg;efdb|4TO8YA`KYF&+ufVJ=wfCT! 
zcze%nK^!u+Z^ckyx_uAPLyQ7amn(^dQoDAR1_3a{jyrKkmr{7k@n=mLWB#HfYKBYO zwS#Tgkmr(q5>|gZzrTl^c826K`&ur=91r5_ z1xSqbdaPcBB&|dO?`2I8p$sPal|{sRaE{PZr=pL-Zs<~$Yt4I$!^*m*PB>yp!M;_e zqJIR4=EI-31Y-ipFnanS97^(0fL9WilxU`;M7PmGHwl8Eg}NglHISovz)wmCZMc|P zRrWjrV&7ODf_0et`rU5-ym^FZ49xbGc%h3{-F+@fa(^&KYUoi9?ukM>$=ptb6zJ zTJyeQr6Z2~4}yN0SYkg9d{EV6#8iwd=O|o`zbI(b%zH}Yhz0;@tD}U7jS^(RF2<;% z?!%Lu=ABUivOY$GmA5ZO9-hS|)-HVKBWnF|B8D0tQo`3z#q(41#WQ>>enqVH!pC}w zE{PudC#PdHc3*idT#smMz4a6)nQnfCUG5wC-obpVhW}M-g#yn&89qlnJTTbE&mHf+ zGu5qw9$T-+o+3N2F|#2WyA!V?uAGNQ6y8qtIbDyTS~L~xKxA7GusdvN!TH!8--Q|* z5s1JG=^dge>{0`Po+|4g-Z@)`R=4P>9zN_6ZS0pSxtROi+xsCDDL#I&1RVpaXmEOD z>wbOf=V>jt9*y#BE#BK08C;}q-KTH;Oy7Divh{QBUvJ@uDfa29!VZzG2T0e6$ks!; zSrSx@{rPA^=LU7+y^?=2s*5Ia5s0hr(>IX3ttb}B9!Nl;JbQkTyPC&9~Z<6CqsgpQ1V^$ zZ4bLwnD`jhopF#Ax&BD8}dM%44Yb+%sVI8k~|5BcOp5 zu~T}g*9FENysGgET4DfV92RErt0Ji$_-tE2=P16zl^+jIkK%<^<)0;w=>B~o8ai4v zOZu9J8BuhQ;XTWZx_^Dt->Ca{Mg8k_oK>IBI(_umI=qpM(O;wcPehry1!(?X!cdLg zDv68H*Y>DM-QjPap!TVOsBd@%S)14`|H-KTt*C#ijxQ_lElDkiE^gEPTiU_M&fJ;0 zBhM5VS}~UY6Ad&nZlOZ^gJP3_$VBm4W1Hb$#!+a&V2Tldg)x8)_ z-R0MMLX_eca=O20KZP7)TyT>g;=IW?jd!-ggh1N;QH*~8(_u|Dri7BDn+^DS!1>Jw zNZWc1O>S3`8=$nf=tOuX`w_*Ze}!+pdMTHrt0~Nobw(uq)LLIj&8AbcJ&GLNnnlf~ z^VIeOe2zKF>_4cxpoGey(8~B)Gjv+Sd{eg~0qBw>(MMx{q_(0m+Ko|L<)|xma|P$) zUMh-R(NuvbjoH5ioW@rGyNPF{S4C4LzZID9$~o`i?XNeaLOKy?I*_L~9n1@xpQ7yQ zNRy4-yK%vN5%a4^(>|m*Kh5o+h7$^UHX3|dZ`zHQz2m_X)JxQkD=@VeF&l6EbB!}E zoa)np`}U~0mgBR9=hbJbY%(@AxCo_}3S>wKEr@_O)Mc*I$k0{dJNGh6Gr z9;aHBnPY)~r*h`;qaIJ=y?1DnH=?LL|Mpbj96h!Rn@->f&PX(nT5;~`YSf9uwjsWY zZ!Y@Z#@EFV_V^65RZkV%tosjtp(@1%4Z_dGAv1qO9Mju~e5Bwt{T+Caf_gG#svtfR z&*#N&!|(a=%kD^(U9%KT;I*^U5&suo)F{P;PXke;0G1BqZI1Z2M({}!{}E=DOh;D9 z6FNRNAC2uo_InZk8`+v=tXbw{pIbY~+QFQ)yRmk+oV9zfb`Ozle+GiH`nop8_m4f4 z_619(f6)iu^mzs!r_5_Fzfj~CGC#k)eDoCcOPF8KUVfR#FJu0B?d7Ads2^nh`R(O* z6Zzeke*xHuow;-SfF1)!k7wFrb+4yU(+`gHdP2u3&g2PbD-|Diiu(72t9N3%Osb>V ze#r1_;}q1e47o&f%yZ;#`e6q$GDkAwg9w!WW3A z!WcuK7wPy=jnIaB@zn}s53X#?>Z-v)S!JK4)>7|X%J_owUsxFXi!loeF@kvErTo#( 
z`mzU~^Y4bh!iGP;wN4SyY8~H+DP8)1p%(OEv8xU7ej~r%ZoUykq^~f-Nc36!M*`;i z!efkBJ@Ef0U>2s`WjIb?ejA^gxG8G>Wf)XJCccN}=L2=rJR~ZjTZOHfaps?KIzK+v zIDKCHUgPxn@t+x|FN{w%P8Y_f8K?2Fk6*F_r;C*YRh)4ek=f}YfrY6h5?Ghc6bUR# ze=QQ_^)o=qdXi#bU5)e@q;1Cc7}vn_WIm6~J~M1+55zO_=(^gvOiyRJmrT!QxTNPvHvL<0PKO(ej- zjUoa5Z9oDihMTdje;i5OdWu|h$8hM;~_Tb}~tVV5)MEg7oNm(1r{F#K zhnMR3mlsP-`gX_?$x6 z{)&pmw2}U${N#7M0FC3o$35nb0pg4T^#gWlequDk=iqwTX(PY>0oB}O-8lw(hQc!$ z21U;gp!c(;jT6V(=Mc~msJ|Is-!ietl*cLE2WoyB=)`w1t)XMYXM~8~IE{v=XFCE4@{IL3t5obWxu-=Kh`q!=d#(|AWg4b!2s}=1M*XVkP6h}X!-}0+bZSTs7`vYBM2EiG}sw$ zcYNQKoekb#jdHb_`fWlnxficbjn@mK|rBB#tv)3Ta)d0U17sGuQcpD97J- z2VDqdGisa!%UibtriglN?;K|=`pzjY;eX2kf972H6yo7}H7D0H1buUfOi}c-;WqY{ z7r4k{7e4AosuAxWVa#J0roGR26lDoxr0_iaG0ple%zbwA-8|k2V?Hn5J226=rsn4I zE`9H~r)zmt_aAkyDF0!_4=XEvRN14v%iW_!mg~dg5`JA}MOPFrVI5rlpWp5CPcHU% zD!d>t8^kj9S|)OQK5k9$T7-^N$fU1(6p7wN#k2By7nRP;?_E@$D(GF*WyX1GQN@$z zt3_4D1$~Pe^75DF6;03US2Q~>Zy^%B3DVC;Y9c_ri=M{cyuAAf{dL}jMO9i+g<8}F z$bb~cdlv=ti}It2gCc_RVU(Z9@(&X6oxnoq?SMuA9Rb+wfQ6lsgypMjPxG`_OK6hy z?+5fQhl<;a=A-VSyfMt}dLD8~wzo|@OwWsZ5DD};5!C`Z2#D?iAugh!4ncj8*Vk#J zHyY`UMnvV^Pvehuf~`{px(=4o8hz;E4ivUx$si zgR0v>l_<5IJtYM!#UUs3K?mlFfhc)BOL7QZo)Ah}S=(DOuLi(TPC<6m+i=8b*yrQ6 z%wA4FTsQEiPf@XvH?%0wZ?#Y7vonF#bJ6wn@~jp0&u?yDO8Z{K=@4~rZRF$}`?Yg} zxPD$xR8@11z^kj*ZQA^=*M|Vba}GX!g~-?MYw_hzDY&qxqNerSQ7^(|?9&D&(fbAh zyRf!*rB_Tnzyf5N>p$M(lX#WNttrBCM96qOqV zc|W~yby2Ch4hm#bQ6O^BPw^m>NndonKTABBKO-;3LSd&+pNmF$AEOuD>wWBZQGf4a zF_#)L?_nb%3kf*SRs48@A9*qL7o#fj%%I?(FMS=cIs6~OktMh$+rhf|H z9ZU>FLR84{Lo@}xBdyl7Zc*TJldbOg!h*E|iHCf5^0djy7Z$8FuoE8hJaE^huvEUV ze67n%ICuZgz4E1{J0~OG$5RXEUispE*7o4y{J=RG`98e%+s>6QB=p>jd>=j{(9V_b zySN=F`-?ZF+qv>H*0^lp>nm>!_6>F56>Z=n9eAh>{6Pmku?>8@1DAAVGwU39Llz`b zk2&yRZQy*&_g!`llo6GSJ@P619>AW%Ut1oq^51j7m*s$`bHLvt+_76oW-m<9_S*ek z4)~Qh;MF`29KHPv(Fx$N}G) z1AZ3pi^VB>IaI>eK(Y)rwmmv~0DiGYj|u++IdZ12S$Os@UzF5p?c=i2ao0-n`-t{opK;90%r z+VTbgcUK{fJqP{&0eE}z|4I({M>*gfU_aZ#b3+dJPjbMY$^l=P1HLl{yan)!J^EbO zbq?vc2S-eTnokB)lqfiZDZnfTzS 
zx}o(GM`CGK^F#jJz@JL~^x)5r_;VwFs`+yhe<~~ZTh+RWNu!3xM+q8+jvM~is7bz| zeWQ2wR-;4j?9*pJc;L{1YVT-xs4q5r+_(u@>c&4f{=s+#QGEEM2S&w*4xbbo^Wew~ zlzk(3G+L;iG-~L>!zT_M0VHGVvPB-On>aik8#DCbQRCu|4y_+Qe9~jS(UV4v%9I!} zeA1)`M@PI~I@WUfr)RRVytdETvA)1*mdUUq& z5g35~d!;hXx9UCGs~?dARb6~imLXqPuoY!r(}PwB$=wG=L5wC&8Z~+p_?RIuXWv*K ze4aF6+_-Gv9K^Sx>1#25BdYeA`ti9N~DmFNyEq24V^UVfuSQtSKc^u^n>H#5N{WK>k{o`j(BkV$f2@@R+VBC z9-cTH;{5fEyB+)bQXIf`nwT_d+^FGoZWFCqXN|hpBq7a0_`UrRBs;mSYPks8%NsR$ zRIDBbTF{yU{c|J$Wv^>y(H4-b{MoUM|! zf3GCz`}ay_`uh!SXL|e%&4LwOS`r>l_0uCloL!QbF@4gP+BKNmj9 z1Hak>-{rtzCVgEz@M73~UTxue4tOEHO5m1f{LpQ8p5vMj4{37YgB;g{@M}GAm9HG& zBD@>^-0}*(ipGoZ>+t8ozn=qcIdFE5s}HyQe|q3t#klZWoS-K0clW?|IB??OD!?sY z{XIP3YDdA@I4;*^(f-DTi!WazQP0l=@nVH7ti0WY#(0m{r2H^ z`@IQIrUV4`2 zfFJR|z2ke!jamFmy6vv4mhiUo_okb(<-OzECkOm958O-V^*ytAyzq;9W#Km=>C#i{ zfqUomuRL&Xzvel+Nu(c3gbbd}Aw1wBKfUdmssNwE=N%rnw|rjjY;%W7MdlI_G=fwVv{C=74|XfqVOP*aN@QgQtiCh>P@i;hjA2{>XFb zAHu@8Sl-*;-+1tQ>0jc3d+B*lU^$nc3v<9f_Q1XEUh2W`<#S&T+)K|;58TWDM?G*a zpZ}Nx?j09zznVPdz5L|c3B1TRFF)V#z`fTSW+)>p6zRwvi-U0$5 ze+oQy*->>OqrV64s3HNM=Ycy@-$5jA#9t_r;&%``gp2TRiIn^q=Ybb_;0+$Q;(^cg zz>7U_9zEqn{2fI~{;cr8v%3;L-wO`2SU$TefrsvOk3Ae8J_7gnBzYg>rsAU}$9>%y zTl94wQCH{dPC$2WxXlx|&mEpe=xumARnG~#cW=L-8%JbYPJli~Nf{w;CoFkx>+yCY zqhD9fiI3)J+8Zi87mFvzd2Xf~nRKF{=N338;CXJ5R$kJ@3|2a z^XyFW$>)0jf9=)Z;t=ZY-;2Wd@#+x5M&OSsBZNQiwdj}fslidE^pUATTpjA3e3%Vb zd+qzE{Bb$q6#0PPw(VPhwby>duVj@9rzYDcDQ(+-8L;-+4>|3-`^*lSbK3t0GTLi@ znA83sM{lxy+KIO9e+*cA?H3P51}=9$TRxrJ|8Jpkd-ZqA10z0n-#b_RALXb&d!(%4 z?vLlHUxE7V^?&IsSwnX!vj4PWZTmkUNBg1CvZCwPIcUz||HK^i7g4Zrx%=<(>0JK( zI!FEfjw2OuD?0bK@<%@m|IV3z^PKulH!@h-)~BD+Uj5>SWR+VpHD#i0eSTG}z54x~ z`r^Ew2S?lbU!ih)<1+wm)dL(pFP6UWK=JDKUZ+P2SgDebjC=GU^4 zw|&Nz+O~hA)4tbNVNHF!tWVtD%YAn5H96|{cj`NA$e?iPckg#O^(!(ZWMa2dzk3_F z%MSPcW2b(bV!omYZ6Wg6t>@-WaO&gM>MC~X?{f5yo9Xh;eP+C?J^pX`jV#gMX^`X3 z_?BDW)!)o++csknicK{DrUC#;~>RKYiEs>X*+I N)qRh(A+Bxx{|&VmI}rc? 
literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution_backward.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_convolution_backward.o new file mode 100644 index 0000000000000000000000000000000000000000..243e077f4421d6e9af5a223f20b4e3fc61769908 GIT binary patch literal 71840 zcmeFadwgAGo$sHOrVypDQYS!^C{Z^~iyck0O18u?3)hCVVed35gPpO1ncwcg>U1FL zuGX>CLXyIAccpbiEdDB!IiiP|VQ5u^iUreq%cZ2H&~i_?X1A0ETAG$ZbKallSv$R= z=s0K2AE$j~?X{l!dM@AR`+P5-^_^7a{Bp-BEB$lI{<7@QQz^8ptbgpG&}I|dTy{p; zyULdSa;{Uh@=&_}n&pcDrMjCB_igFub-iQN@k^4d@a?pVSZaR1<82x_a}8hP`*b zUD4rM6V}x`d+cxP2ry4^>P)oE#vfLkpw0WP% zDt>AjMp!*zYTPva7QR-Qq46~)ET3rh_MV?F&;MDraQDrob>E@4?7J~seFPP5Gt04|!;aI1zooRH{vquK5J=J||jVXh`bVV=p!l&oQl=k^eEL%7l# zn(cexBOR|f9b1Rp#p{xf_-@O3XL<4wx8;#=N}|o3k@=W0-``XI~xWB~=QO8S8CcHRNr!_D+z5K=U^zxnM0kg8or`Ty;#}4ABCK|Kh z9tBjU!h8L2t6#_`rrj_!D^Zi`+Us_Io_d2qCQ%m@c(-6-klPd#7Mbo!6=o&se8yra zFT(s92nc9a{z{%6^_e>D+$`sBV%!&;P zexlJ2_oj2}z36X-Q&eUaR`@984Dz>5L#LqdDS!Ny6y&VCTPvN)CkMH;sc>UJ6M}$V zP%XizTTE)kXD!q20oATGopJ70RF9HG6K`$US!hah4ZGbRGG%2FRYC5JhTdS*GLzct z*mk1Z`2v$96E&T_l;5(n^T6&4+^UMDyM1@;3F(ggEX6fMa98ej-k}s7`$xH*S`XY+ z`ntwPh-)r6QACM?>#VJNwhl#X2nty`HKjBUDPc-=_n)qTrP0i*W zII@wSrYo-9eV#kE;_gf|v;27RV@&)Q#g95(JTFQ9YQ>T?JyeMT%y!&{a+_aROjs~J>Zg4Iei`(_;!QtVK)lM+#HbY$f zI!_&y?1Z@j6yIev$?5c6Gi!Je&=mKMsZQ zS=ibuypGo$xBHhKROA*zMOZxJ*nsv{h-)yvNbuUo72oG;9f@`VOqL>KZlF(WTps6C-!SW(RsvinA z&k(qvu*?u}5%q<{v-Zp`8Bot;lh1BO*>JlyqFxhpiy=pNw;KJ@&m-B zDLz=fOJS=65z0|^8_bNl&g?A#gfv+GTzL>tJ?l}lvRRvrs7;2g+iV3rJOm=C0mQXR zWnZhNu}KV?--{uWy!R6GFg4K@ka5~&zI(;T$Vj?;Vy4ggYT*3jZD#LvySM%sQScZu zeMU{=jD?Yt$!kta2uJvyB_s9_BcrRi_cV{ve2D#TiFWjO%Kt1M% z_ol;Mzwp22=%H{&C(-Q91LsE-<kggZ33uO%!Mu1FQ(e6?-Xj7(lYQFB{~4K-`J}t=?sJYU|uNe?60x_d&D-z@*?xw^(;tJQO zd%Akv?ln~`qVS-0S7yXAvs)Ev6CJ%sDEqqSse+%tU*H*@`G z^kO=v{eMbgat91)xjT1?t1#yIbhEfFuTC@!2g{!xrT47Kz|M@L-l>VY%iJqYrZIZI z>4|aaX!@zlabhn^x$T?E)T*h8alxpI6K%ZO<{i2`OWJ&*-qcTDG@eydk8C6Aiq(4m z$1BNv#T%EYC%D^Fyv*IRk|cW$DEDfD8qa!>*ugpiH}A9gxQla8!ccjfW%Dqb$*fP7 
z*#b>jhsrskgcT}_5)Lj`6DYuQ4{lJHEkPsKc-B5@-D+C5OC`Q=8DYIj|Bz|uJ`(NO zuFKOzZ{Ee-6>8qsUsX%@yr#hCxZC`!0{4{b_IVnZ_yl)X^b@$pl(yLoM^V=CD+gXYR1_fh8UCe$TINQ@v_4Pq`-5#?)I4jdp?Lzq4J=*P3&I1|R-|Tg)aQt&G zPBgk*pJco5@F8Q`G(|qN+5&&9A>G1?mV+x-L8KZCy;wUWJk+Ble1Jn z2PY8|LvnhF&=z>D61a<(#e3+i#58ThA~GV7ZCdSfs4MFn$c$QKXe!6Lg$=#K4;gBp zK{b1lS6F%r>71s7`X!aTSrp3cK52|d*`i@IA^fs(_7Y{*5 z_2!(?mE|k({|x(QGr^+oEYe$b_DL1aXZP%k4-PH9&b5%S)uL*uA;npkpKwPy*DdOi zF5IruVJh65D#SG{m4|~1B`E~5>HAJ$R&r8fv zi{Qr#=5WkRt`hT`8P!rrJ9Q2&9c;;1ok1>O&dpw7c2zX7Rh+|2gas4mExp=#QPU)M z@xkyQ?c5y>Lii}<`uYm`(Ot|z*HZ3aVJ51+T3xozRDMq;tWM}e_72{F*;DC9Y1mB8 z8guu3s;o?`4JQ~RHND}6VEOYE>F~86s{YHMadO?%48l$BL>y^f#T{l&`m$w$I`Byal@S=(0qY zUS0Zh>DOhD^fUVOKH<8K{@0W1@*mTC=n%;{C~9@iTU#-o4k$}LJo}7HZWHU7kRO_m zHM8FqEL>Z0jUJ9McTP&M7z#J)T%Rpm#Ic`oN$2M3w9c%XV}34~ z6$?UG@#~P9Cb1Nbo{Nj=R+nQN+E>#M+lB zTtRJ?0zln*TViPsGMYM)zmka(KTvYbqAG@$saZ?D$HY`>VZryBnln@th$|h3Or?hi z6~>x2OtBb@dNPyo$#nRtx?q|nQw?E~JGC3`#FS>mD`eK$d@zW`El?D#4jr$>fh5_X z-RP%pBLwzcY`ftHM6o4I2w<=!UO=+2gVLcJHh*zw%!9_hV~7abk?`_97jO zAL<#sdjq)dfE8P<7HVAS$A`jMiPn}VQOG(QHGWG7zA&mNXjRtHTJp9^Zt(?C=3FMx z2KgOFc&cGjEJ4f^D%fb&Q&xrp13j#4ncRLaJP>_eM8^)W*YV8jlpmNLungx#QSH(U z(uZ}_>4p2$U#&!YoLe{|RY+7JR*tTz_nnO$Rzw9c*m))uNHUX+CJ(Z~H6X9K&1^0V zR0TEePjl{6$ttcTYPwtZb5u*mGsrkPxJ!2&aPoi8wLCwW>!^H+YeoJeTu0~6& ztIF=IQZ9Gwi3l*N_#Jb40Kq_5p@-`Dp;iyI@dM)qI}txj(8GlIVWJ);nuknsMgB6k zs!1VDr~7V|UtX1hp*X#Y`%HNq_nFh#JgP1(ujl^a)9bmPQr^t{l+)P}s-~8=azFL- zR_>>lw{buH^fvDE!N6)n{l4h317o6X-(5@NL_+(Lv*3D`+VB?o8$wtFQBVCR+Q79Zws84v(8iMRg*!2 zk!48DD+@c1_#mVc2!Hjz=VZ^ zm}fW$hPEx^*z@tZ%Mv}b$?f_&d`7Bkr`!E=-iCJDle*88Zf7nwtk*$$0V!2TrA@NT zoItgtTj6PbO|jDuQ+ARl9d2pZsby0^%2eU9grtJ*;^&O0O_ae42O1vp!{;;yf6cQW zO?fv(Pe@Q$TdLvNbZ&Phe28Y)(``1I{;RSb6>N>Q9Rp6sF2@q1gX6dQ<6jVkQp*C? 
z(oFWe+!jN)ypB~41=#bgTD^}njLdcBP!~7@8DwX4Q)>`4U7ZQHH}rbL8(F<|nNd(< zAM!DcrQYr*UkKu3lRN+0!^mZ5L^|9K5o`0pl|}&bm^<&I#Q5QA1UVYDDqh=5-m0!} z=fAH0U(S$NLMl-~KM`KCyT`<@BOWR-O`A!NUkA zbpV6k75#}4H|qWB)z=N82=|i?QES%OrVXkl9qwdoT2-21fUVj?ETW+)46h`nHPNJD zs6_Y%S=vM+Ai*Uv@sQr9aLEL(aA|#}PzybJ&37J@AXojV{e?4CTX{CB{k}W*q<3p9 z^2mF0_v)oZek8hCs$XR9OkBjJ-%mqasF_7EM-fn#fX*}o!~}?dd?^qC`5f0#d5>#F z{=-~H=iklsg#1Zv)q4y9amUuURf*G}3mh0kJ|=KKA-1m9f#+WBjj~ofA6;v&fG37L_W2cnse(Grujqa5T5cj=+0^Fq8gB%@5(HG4EIQFw*GIh(;t&Wy}k= zbCsDL-IhncwTG<5@9Y~MZs_&HgT*PYKvx>p7yn_mN`_s^hC_w#ts;?M__-EH;f9&y zmoP^Q)W~JdizXne=oansp;KDiUbw?czEl!mghkl(CsWA>-T4o`q;io=_=TPxWhgA1 zLwUTzAWzP9R+_=KDJq?cCY+l}B6NHSp^oRa?D3Op-1&cLihy|dh3`sYRuYu4?9J5f zG(sdtd+$P`FNi9}22Q6r4H*(mn3wEhBWPC@;R*`wLs&Gwuv0YQX+L}t&Otz_h?|P-Y{+(mivIqQYYFL1s%#Sm{sakzzGeFxFGX>@b?miRs*> zvohgU=n4m!Cpk(PW|O(ToI{KFT?pS~Xwk-WuB8#)SrbXp!ybNv!as6+whUg0hfO>8 zka?x}8BGybuScPQF?)NHWmFDhk?Bd(9o@CGPHd)lPcM8WYC)o4r@GWyV#fO@&g*#k zbuq#VOzoDZM69vvG3t3i>~k}s09#p!-PM^(CD3N|nuwjcl~;O0yW_Yt)HHn6q<|?k zY_HSr!-uvZE3U{mTh$KbNQWo*NTV#=={PE6?B-p=XaM#-M18IiQ)DuEb2t)s~eJ2UJ?b=tju z!QK<(n#f~!SYBugjhe+4M3O32bE|}FUls4lG^-IIo0iF~X1$ZtsvZ7HIFD+l z-bXt)9?UV#S$D&bWgLUtKDgknC6p8ns3|p^ufLN}*doHDq1(g!W|x^+9xQa{p2LGk zMWgDi*9P5BfwXOiiU%`nDyk>#>}}W!KlO`jlurz%Im4zo!+IQI-Ojm+*FpAjuI2f^ z;W{dRK3Bt}a*%~d)j<}L`Ki0BNbHVHiX@){4Q}8l3~O0c%R{YsfTgQA2a;b=dK#mr zF{P(!J&k0WNVbV4DxKVu&r#4ALdG<3GyR+@_LEao_|$Xi*k(?FS0&VJGpE!O?4Q%j zesT(|YSpZB%r0{ZtSYJ7sAB3Vu&P!0bJ%EJn!|Jc6b{FGX8f3OX@gjHYPrx!5_;uc zQ8!D?I*VS6CZ3RS)})<-nNe&nyJ0w+?#SkJ@|6C<*_&}5)`l_bS7p)HoaFA=Uy(?Q zBg`NL!n3k6PtfG4G=&UjMln0cg&wS`dTE^Z;$M{|ADrEwIjLMQ4~$HC;5?jlh5`o} zwpydwv(A>?^z9wAV(gT=SdZeJ<{4_>Cz*0?*Gje_$*J@@Qk^+WvE=l`WVh?f+Mp^D zMp(9#SaVt{Oh`yh7Gy{_Eop5P+^&;MA(lOazTQk~N@8(HjIXD(vsywgtujApWhidf z_Ytm{xnA7x+VHS@X0w@_lc6kTZn|@g);790F$i~`jV6vkBLri!!s&SW#Qfg{NQ?3< zT+8##Tu0?U!nFdib;oO?^QUn=Az#U`y)%fuTE*{sG|%4ZZ#ua*uW3RiGxnD*;Nj99 zU6$z5t4p6Q{kkwQ_98XmT##Hj(UWwYt`;I&HNs+7+Jux2&yEN;zlxLujdzG 
z0m6nwc6Vez#}USS`wzW?QQ|QT>y($cR{a_Qh03`u)(cuWib%K4?MqEsYqdZZZINP7 z3P7vyL+9u&X4=POh!a4YDf1cfa^})KdxQ8A(Qu>j>!Be=4kJZijfgSYKsawzCfcPf zq1R+=LUfHrjC(OjeZ;B9uI1P5{vRjmbkJ?M7J5kw0^^Y$rJ=BTo#!q-g~E^pD^81V zCIz8+(?+F#6oej5Q!(>`2)s$*IKT1&AdaZ?kM+rBU@GBwD&HQQyD5nl- zj#MamVO5b02fc6u6=@mjUn-h)Dv2ml58>0P?^!o=o9%zY6z}P}Ovg6|5mD6u`-DW|hnjrT^lHsg%(ikICxc!w^|k>WpzCyATnHBt^Lr8n)ad}<`E>%l z$YbFSxSeN>=E0a2vdJ`4TqFE38k+Uq2V0s_A8ctJe^0o^W6;KLc-vODb1cbIh3ien zeDHed42+&*#z`FNlTW#wm6gP!@QH@7_j8_1IXetf3~wUcj%ZU`Hh1sPCf*8hnD4AW z6yj>()I(I=xsQxvjsn!Qr!Q6v}X;iRnN zb{l!x236!UO1=#({dC&Bq$G5Y^qZSk5{>TC z-7Hn(=3YH1@mdmpdN2zo(zSt|zz?@|Y6tPNUTb`G;>v1)B($3Unp9Pwd%4}j7c0>gR@fMBL&hpRLv{OQ^^%qems*~ZVD5t zi#B`=WOBPCOEtRGb3*@KI;yzp!tlCPYVn0(=PJ!goiU`33!-`Umold{^h&)qhcp?kQO=8kpR)uIF`x$D_}!Bu|5#p9^wOsC7i-$XWrSwKzYo{BCwJEY4nme zYQeelSD=GSSrJ`Krb}e8aQ6~Y_?#b`Wlz^ElZoVjPTl6Lz15tx&n9~&97=`H`-R1- z)-U|Ch*=Qshi(s;0<3}xnr%o))bwYm`50t#4?^Z(3^KVVhqiD#CDG^ndwgnWiXMmiT9c+F=X|ZB%yF8#uTpK?c7eE z1#H|q{MJy>gUI`7sqMkv-~K`H8qd>Cs^af%SRYKYvJ?}`kWM9>p7 z+8i_deB> z6BAf6n4%z`5I8lRyLw|mVGJAQ87c>_awL^waf^2;QM*j|REBjvlW8cy`_Z(n=!#N_ z54Z?sBdU0hVFV7EWs%MuFpR+DVByfi+;N2Qoi%~8PxQZ@!@N89Lu5L3QMefDXiYGd zd$QR0##{h4JcbJ}t72qHT(LtjaVr8}0_$$mxC(row#b@w2;0}LJF0iIIKqbiyC%`NudLT$kPbiyr|-IMlbM_PrIE@n-yYs0JB0~ zD=iKrUt`9<>FPjyvHUO=V9d2*0ZLOj`o|Ftpu?;V!vUD$hm8Oub22o1(9~pWIA&=q zf0L!b{=dR536`8)BG?5Tw*Qa$ZV*+>y)d*Z<9F`=#b>pBH#{3emUS(0rNs?HoX^)E z6TES+NnWD+C5m3`l9(di3`PRyYE3mfdrbJ(kX|%wIwtsAj1Ieh@qg%y&NjD{eq(F$ zkn{e+1jYNLmVa>#A#91-NZFd5EybS^fab5#+q(*B+xsiwVfL=Vmc5G;EB0I7oh;EZ zgnX|_#&^xT%gXLjK=JR1W@}*`8N1Q1KP+)ee14$~+WO^6WKR$mMit*fdlgo5Lgv8Y z7b*~8A`3Da`%3M~5WAT{&bavm*hwybp&YsI`Meh58N`^(Jk2_3yBdZetc^zDg@w$W zv;~2V%mh!XhC_*}IcTmE?X5{it%HXPb*3P;aq@@Jb219RN5GZ*mJ!`~AAdim?)ub;m`lXAwxO!mg)AQCWH z^+J}%p}jemgGtw4FuF#~L-%dvwT!eZg?^&zJ_fir_8|>Es^{XTw=sll^E$7dN#}Je zGm;RUs>qfpBM3-J&`gG~nWw4ZK#=Wq+&Agge z+)6*$H=|-lBOV&;YD^ofg|_84v)TivCJ5`7vM1~|^nL}8HqL=Wi)_fo?t}2~-MXo^B0>tZh 
z0h;&}6A=pbaeR1?caOP7#|rsIzAfD`jHXVf3+`|P`UT;g$bq3QECaL>16!aPVfMP$NA%yI>`Y5XQw?x%mW!->!d7ss3XYf$YgmNb0 z=wowFQ0@tDntP&hPkhtdP0HP5qGJ~zb?9LYIA0whi|G}W8-;6@=sKJ>;iObnkJK`>DQTDFUg!^ z-m&5uxd2sMb(jl~)x@*IF~q6lqi)w-=qJ36BI>IBc>PS4Vq=HXaUk{pVoGWWAp$`0 z(e8GAlMVkE_n&OxR(WD(N8Ku;ROv2f4v?v=nBFoxd>26+N?ZI8E!-R0h0nf^4Zsp2 zq{3SGL?FD2{PFCV59(np4`YUJ7}}2mh`abzj`M7#9PC#)FZs#mp;-JaeoD(&i&KL8 zb^NB1kEFsUNLKWm1F7V_+}-O#1hj&5DAzj|Z@HH?RqF0b(xx16+^)~DBN_8$rAt~v zHn;ne3gytgfCVCzfciml5D|8XSIf+=75U39M(K=#7s*OJ_turjs!y%up&TBU#lD=K z*1nF`=~ypSaVEsF?>4I?fEAgnB8ZmEArn;_wTNV{RVMihB)?w@)n+xVMrnKgr%(pue?4%hpI-m7U${yOQEB;GUOIs;-?gFr%VxyuF`BIf2bQCZ5bryB*>wgucHq%8UFFETTlRoF_>8Sm$%mB1$jUi|VCY8glCCljfqszfC}rz_!jD)H%tqMe)k6wNbvom*p?uFJxFh4D)!s zsR=0rWb+YKH(EEL*nNm2c>6=Fa#$?hbn_00(Wc@>#Qn2m!HmK3gB0C(mQe^8k=ZVF z3*s~+Uu$)1{cseIP(WTo9QWm_(|DHAC*X%ef$|`_0^t`oHj5F!+bo;cvh$p6%R^3v zAg)vhNlV$4vm~^~+~DjgrAOvve20wJ(0`q)(1$fI`l)bFuUh}sT!lW2<=*PLbaJQL z^&5^g+Uw*h^nthJDs*8q;hh=lDs+uGME*gpLVj}nmB(`xTFmAC}+1hsf)GXrHd4WV@)oyQf^k8C<*~qkt~|hCew{+ zTIh3Fh*@X1RGIapI>J>5x-v+##@6gLUSWx-V=5-9)=9|6UQd)Osy+uor8_VAP`DA1 z{{0{r%>M)e3@_XZWl&LaOq};`$k5I4AyxRE*^R7bTO>-f9tTZ#A2WzTTCej;Ja-#;Y- z7+!ItvlX z+Q}I)xz7vx+(~QwaAkb_;$@Tj+?IWxj{K8{l>|P{5QoKEz2p;K_!OM;8_r5UxdXa~ zorSf36=$vH6i>ce6ZP(e+@q}g_OB%cww{)F@8j9zB_D+_Z%R4a?cJmJWTy)^YBGxP zK^M!Moqm^Q^gKhTL`)2cjM!G*$5s*U5nZlV256aRn31RGWQ6}A;;1nR3&OCIdssA& zrBUl4lBo;(SQPEX6aY?IawckQ^C+!{=0bnOm}<=6Uzg*}7JXwle2TNi5Tmm%?4vXG zVd=y+ww9B}GRy8mq#8510b)6?-#@gUvo|+8CPfqHOQ{dcPKcazg+G4d_$PzhgI1a* zPrckDL2d)(tcHtSug$2Ef}hZ_Y#N4i;ul$sN6=_+lIR9!tMRRd z?l}YaQeR`uTH$DH`CDi_CY^RRs6X1!l4@#p-O3U!uD`P+hn4G}l#NEN`!nWZCN~w4 zE`3yQ9H6Y2m(s>DD_$?*l+Y9HyuPI%jQ>MCSiy!)Bph6z&P+p1hBop}LwSF5>?D;9 zAFxhR0-6JbDZ#8|-iQdlU?#C~k$NH(t`zZKLr|1|x0xO~6HSvQC-)N7VfZSSCF*)e zzik-UFFxcewcTM1ddQps;SG1Gr_qR+W0c{VA^rf0wl!!jY{oJKvZR__P%n0kT7E>iCA6Et=TAnf8_Qm9Az<7XO_xl7{hd_m$;%{;);5SE9xb# zsF%1J_0q?K==>_PcS^nVd$-z5vi~h@wWKp!Eqok^25#3C^v2;Y*>#5YQ2c~qh%7(3 
z%kBEDw$O(V=SeAn4HpZVo@U{s3YW~HBUw=*ZULLf_3C;3s)G8SBR$MU!GlSw|J{vvr1*5u0z_=7e1@)G`FO}^a6 zAFRojVG|!wfAyLE>T^36b7Bc{Td}5I>;ITn)Qorqvq}?uP;5eMSM5oFa+QJ?j#x{7 zjYkZ^ax!*=1X(?4L8i=QgjN_>=9~hyT@v#VjxR&G6GaHwl{vh0q zpN|-lZ1@~wye4+&Nr%HNCA2({|I)c{aA+_x(a%<9O3-|4zG(*}J_IpP)NN#6SMrY+ zEYZ!BS~9!u{yWVkS@|JMH7r56O9kS%P!W+1(Nxh7w;R4;7ZTVh*n!~1L><(@pE-o|9 zz~)QaT{C2{^E~o|gVLnI^XDIhh|TWrKxV`;xSA+21U1;FW}?I=@Fqn0`iHw0oD)PB zRB@^?RN}4vrjx78fyr>^WNKfaiw;cfOLWnJsl87Z9hk(VuQCUw$;L|fUCgXZ?s?vS zs(6z=<2Ud$yoa~8;`%VKS;w#2wOee$ zwskr~&mjO`xZ+268{L|JhwxbY02Ja$pd)2&=L-82#KgP&HF;qng}{pb*ywFb?tO5XWdysO zZD%O&5xA@@32~tY*=2;PpQL6(V7Q2S`u(2AGwEM2W}lAu#AzRj2bXYIB2S zdmN0`&x*^gYBhzi&~0HHhYT-NxU|wMT<)tlF*eXdL-%rXRk$cltN7Y zZ;<*+a|yj!%vaHblJlk8`9D9Zqn|;h;RHB`ga&&xHyhUaiLCXEgq~{3z*xSBbL1?M zSh122QR~Wr)tgjdI#rQoZF=ee28Ay}1O-)NRZBs*nnG66k^v3O%u>k& za6YIWDY@B{e4RP*K@y%I&a{AZ7+J*teKayqBi+Rh>J=w*Pg?Y;V@z7T#59G`q*n9W zX-<;a!d&S*&F&CmtA6t1ya*$UcUrG{r;+4xcGzlO%pkJjscdJCpj+Lp=`7(Gag`a@ z%z84Te1nH`FIcO7_%KWvkU#M2`%=j*9D|2EXDxS}sJE~k==79TN;Lz=ScEk{|e9c|0(dUh4DGROH4_nPsKm~Ho~Le|@2MZ0%Egn=$jkotY z4wv$tub8uxbh6R_u2ohV8Y%9|^Y7*i|4w6t5ja}{aJ-yd$}narX$8nCQ;Gvw=K%}2 znk5z|%WB11NY))+PUnVM00%MvgN&4bHz&BP*c%wg((*S+0)Q+@3~K5qWNj61l8m;H zjH5X=$5sYdpr3=a(hV!l~utE&|RSVZ~*J)Nt(T`PbE$4i;5KyxS1Qb z^O{;MR!sW!)cwqpa$Rg>5G#!#k_!~hrex$3h*YKhuQX{F znMH5#j>xa+qh+hbFCRs0hYSNN;$X-k`6er$Uc{t9_C=fsgO!C@A4M&DHBoC+q;SB* zLn*KlHwF%}IuOux$tRIBrY3)z@?q{}EDE&F>9~cajFP?B6baLec>#h2E03f69y&(# zT&w($+(Gjz-)mlKi75ak)fknHsCY(zN;P_ifJ)D)rYLC1Oqml55|t|aSgISaG*t^S z0Wo2U@SN=ueGFpKnBhdAps*;(f?CY1*hhf=84Wy;r7W>^8DW+FWDW_?zjqQzkFaB_ zg-p>#{nGrciVvU9Xs^_5Q^lyc>mj7lq^X@=rju*bg2GWqr7Gz#q|!}e zX$X2hr7bD#WyiCCWKfF#EK=$FDgIB7RI0`vg;e^(Ds?PY9D0d9#lwsJDUeD__5lpu z68c0Mu+y}&vuFJ2@k(!IX&!BV@JkqcoEj!1N}#2O!Iffre=?0?DzSGOh*DM|<_6Zt zAT~*mqeh!J0b_Kq!&oc4iT%O!!F+=#j0Ek-N_dsO1yi4a6%V8uqy3QZ+j zCb*sd_(6yahc8h}v;A{eRcs7t>V><$Qs#|+>MKy9H-vAhb8hqdm4g)$rm6=z5pM> zp&47%d<$y2JU>y@A#p^YzSU~H|C#rd#`|69-6TqmQBGrp_ce+wK|0~ksS{bFCIBK& 
zq{(L1#i*uJ-_I;4blb0${57M6Y2qPNQ;9j!IU}h@T%i%eev^r@PNumUrg&nOgtw9n zGbBdTkS+X708j8>`!w8!xiH|SwTA2EJ33iI4<$9oU0Rn7H|vDhqdGAE*SY^%?y2mx zOjPj$Zq93VsWOKJlkU_JjkrZGk4!hh4PYML`S$bnDoT&6MH3uOyIQuh$9@lf2bbYSr`Ln%29~z3-)EF>FW*@i#51 zW-`BUI4G_z1Ceib9<|CuBW$WC0yTff3ywyZx@uC%&4)vr?)w0Y3l`)*1K|^LRwav0 zji9l4+*wCkK^X+b33yoCR@#VxC!C7v!RV3osUK!!BlT6w#8^`Ss_?i@=go}%&jGpA zOZO^URPl55`Z(LxX0c0CA$EeSXoD$MQ$co*X+@?m224S#=LmG%Q8w^10om}P;1I}Z z$Mdh#;($ncshJ-jPJb=+VINhd!sY6;R)R|^BI5}5hyn_;!8m;imnLOKmi{B)KZ4wl zPW+YW=vzV}KMPwta&DD6=^tyftDfwHcW||QR82YnCWOu{s|;J8t(K_ z${T?@s+qS=H|&u|Ydsr+#9cj-Z{=d@1iOy;VO=V|4>~YLxe?!<|1<5}*;~ecNGxrwzm$ z0z4Jr905FiO=EDR`d+)ui28mF@sNxo0Z%O%V%z>cRD$IS;A!6IjiE(4-#0?Xn+-97 z()?25l^)6j?^JvV12{EN*%D3v3yo8=ga#im#Td`eZ0;!#^#G+)Ibc=z;3k7~iqV@< z_p#GFCJ#_Hg}M^pX}{j5@Qn#r7t{xaI2}wm_XorQE8f?!^Iiuq2}kFvQ(7*M-;YD;mR2iR0Z0SGz4&mFwd7yO(- zK7CSX%C-M=D9w~#l~GTPg+8q}PwiDH=AtZ-jPzT8jp}Cfy z4lA89NGuB}1em9{27_8pxg#*BdN~7wx&kvuM0Nm0#f*-Z9ERQyio`9vUm6ruKBB6f z|H2NaN~aOPLfkQvrVWyy4e~1;#!NjQjf2`t5pLH^vKm`x#KQfAAQt{2EN@8_mp`Ao ztv38?4J^cwukoI|t#P?_k(AhIkk znhyhWYS`2BXT=5+V$NH|Q}vKVb_zH1ixCnWuF;x7+9wMN6rFV4AuUn!Vm7a>{eIuj1bmCvsP z&$Pr;LzR0<%%3<5x2ZI}o!V*uQ$;+0N)7l#0Ryk59+EECbKDRs z*RRCvY3&yD)V?W7vR|ty)1?2YRO^ClctDfhAfZiecuWnfGi>7>Y#^B!{B$9O(F$Sk zV3jD#zA6@Is|3d1Aem%MZe(Hs5 z*|NIsq$s?WMZRvpG6Xe^LT@6}a4N6T(^!JcL<_XGgn)Xn+_cwfZjUq4WXfgy`)KFT zArRDu8#ZAVdPg?O9}R+fHR*vlHAww{pnjhFQTb1E1!$WqK-*jm(Dq-(K-B(Yg$GHq7x4BOzH|M_???Gd=y|Cp>A3L(;jIE>C7-V}a zn~twd#~<5FXbiDER@=^0v+YbBt8FKLT07)DKNgY6_Bj+k<6opJYy*YL?fj+bcA@fX zHXe?EI?f=Vq6@471NFG@ry!f68nXbXUh>s9M?l4pN7$!uyaGOr5m1$v-3t?!qYzLh zegp*681@vDZv5dnkYrVt5Kv*6+0X1oWlw4`0_tzsku=Mf@Ps z&VFrBv#zH20G=}LnSMM+KLeRDbJU%y)tZhjj)72{{?7rSW|9YurP5mkLM6QgLdC#n z#+k)#ai_)NLAG)_Sk%m7TYn1g9O;H*P+TEJNRjFcao|A^Rx;$%MC-ngGAwdLMqg>z zge_AEAJtefolU%~@diOh#VBFK;_n@VA5ue3p+;%DL}E3Zrblz^k5&z-@fX3=Nno0i z5WaxllHy+q5KHE7iY+AO88ws0en2|xgpKe68##nxCH3QN*7HkSktq}EXyuU5_b05l zgDUtkgO@7&$|@KC zqlYJH7EP%0N*6vd6G(_+Yz_sb3U^6c7`}w6Z;%`P(?+b+>M=EyE#PrLPoS>2X!8sU 
zbb`D`8o8IkcB#t2sX;HJTS+17fJUVURisz=v6vJu+(;c-x%%fv6RYUOYq^D!;DGJ9 zlh*MqFV%~=+sUdOPgwGW6BAf8rLdf^@1XIuH44Q}%T;AcnW$}spLQ}CsT7Rl8L99Y z_DVGarQ#t}R2ofC>ZG4pmEs9!(c0<+4N^)!d;qg%+#WD%*=T@KKS6u=n${{+z(DYW z5lI#>*YHVSIDh-8Lj_vJBIDSMlaAYcpn`rrf`4T5KjQCWb^#fQJG z!8#a$&TkU}Q2^Wzsc`3qMr*Rs7LAZjB+Z|WGG~=xKlDMQ?;pIRDHT3A{$X$YNSfbun;5BU}QBPVu1qivIwQ55;`gl$1jm(H-Pkb4{tHJf$&)>!A~?=2(<-L ztR2EheE~yvxh$7(Qe)WEKLJjv0XcnZIH@Uj{I>g?H5)bIxDG*`6gB;(C0w>jnk#Wi zfghcRP+~3SxDQ^2?n!ji8>4RXEGMV^XlXQdk7%4UP8V4R0$#VHgtZh z!%lLP5K=9dg{V)(z!^1bHS=*vFS#q+6&5l68%_?n^WR5K@J0bgMSs{sQe2xw)v{3a z5DgAiBvY!T7kse!Lt-kzR}0vz15q75o(hY6 z9|&T;l!piv=c zI?DkFZLGPNfudJil!X&|HUz=cLX{dy8SqgH;y2%?_&ja~5Hi3Pxo$cVG_~=v50&gk zELN&}#goE^ObZq|{xa-?YE6ug>h9TCs)+Ih((v`iK!XG>6F z-OUf_0Z53ize0%?FBLFUKvoTCq(NDwT^1So4X78$(7f4O#7l)BO+ErJ6@<&908@cb z6(ls|>poH&ILHyiRFET%LQDliRhZDE=3ATNg$KFO?b;^xHxy*57v)DwMhLO%cROFv z%w}UpHN@d6$PJ2qOqP|8^b4=?Sxw~e*Xp?j{?L3_RVhU9`uOgixX76zS2IOBpm^19 zZ;7h^iTeO<$$WSf2OZp+@XC)Qi8WF%WdOEpFGNhv8Jy{$vDBD$UZaEsx`!LJ*bEf+ zb`uyE2ZaC#w>-Jw%1=mRhEt>XSQ!3$Y*8vlEo?*@@8W*B6toaH4+{{DM$k}Wag-u~ zTJAk+Rb-9eElFHJ2w&650LXX@EiCS0>6lgX1(*`EVg@|tF}=|=Nls-B+lG%NNUEN* zj!gqGoEQQ+oOL6Qt8)!|b2&ZahkRg-$kO6Leq^S8m8LxLJir&Qb~eG~zhF29)>3Kl zn6;I0o+1@p!gV^x!X=o~m|MtbQ5~OK(DyZt!c%3vFl~)}(y;FSBzUTZSNJ`6s*}xn z`J?evj{;Tyz0AT9c&fjTS{|29N|C*&8j&m0l{x;sLtDINhhY}vPP~2$HE{;W0y-{eYn@S9D5Lu?S`0x1k8-P^1Pn0Hc@oN-Qn*Sq#e=kl! 
zXsSmiMg$@>)uV!ELE8{3G+42MHd{X&0!sC21=y`tlH@JLP6B9KZm9S}z$H0&ROzr( z$Nq`-dNMyOPw5mhg@j>lR1`EmM&#V~M(|M5(pt87$)VZ$n4Ph8 zG2id?l4}q9fUl8~-tr7%m7C5^UgddazJ7n7@w%Rf%_G%Ca(m;6v;8&Xo^(DuhcoIvxZ@z9;xKUO+2+Pai>~7|kt#F?tEz??} ziY}fN@Q_3{NP$d4d56HDr!}Dmk|DW<~%G7Lmf?=iZm&zy+h{mnhVAP zRzjs-9@2*-CkFp>iJ6PYV9BQ!jO`Qr!jF|zquI}|2GR^(U26O~lkY3!EBsjMA5orh zpp(c8Map>7nWL~XCocN>aDe52P#BPc%^}3#T9A(3ktfV46qDgKYb0r0VZBf`knEzU z0`&>9!v5nyh2BC6dJkTEFM?&%qNK0B1L^Y!7$_hV77R4&yl@mA=#IX?q4Qv`$~rS<9pzy>R)_luoGwq&QvO0e;*PTB36 zi`+XMzEo25vE<(rDs#_PPAR1?et#RC(J~|MEB@&)JLKXhTSauLX)R(r`D}L?fwcNCIUuhi877(gkXO~ zkXtL0?|#-GyeBZ$`~)iGPVyji7h3fPIPm-l(5l~Sk$dVaa?kJm89}QirLcS27Vy`@ z-xB_M`Rn7apTEPcbf`df&-gfY^Hz~LCQ>ONlU5L&gYMv5mt04@$%o=*~0R?Mu{ zVf_-m-+f6vTHeiiGK3EsS-@=;uPja@EcV-#-P^w*j5WK&zi6H1W=XQgw6Xhz)-99& zDG3BMEIdzx3fL77<0P*-3-2M?Rnmcq4mOLrVaLN4?Z*6T!}|CnD|`?37zC3iQrZZ> z$5R@yCn%xeG1;lC%f-iJcR2p&&&Xe>k$aoqXk?Kgxvdu>v1W# z9uErF<5F-v9vs2-z?U6dkI$0RPj0#L2wV?+56hgK?vjKY#8|<4NGbA+U_CyX1}FV+ zi=X`8Vb24&8CO09$Q1))t`+$NS3U*E)qV=l;Ci6YtdGIX-qNqoJ?d>H18(;FeD#I$ z(Z`k*qkMFp%m8NijCoF~4ofq$&f~yNfGu$h@UG4r34D%|&5cq7;b z`LB2V81EQwnF4lP=}I37YlXSFtm+T>1JoeDR?@~GYp z@Up~yMgDt$I6eTA(n~0`WSyVn3q|YQmJOb>9EAq5w3k_0TDqtruH~m#Sj%Eq1_Q`K zzZ?hQ0l*zzSG(pA5myRbFfAb_TnQ*wok3OB(`ueREQl^XWV5&*h7f?6r-0LvJF)LP z<0lWm9`d&s3;r0NfPJV}wSf~KK1qWHJ!ePCfZ?vBO=_A)&mBRhg) za146c*AN?6@P!!F3=$pvtpZ2T%O**(0K^I~3yh6`es88+K6up2kK6sP3Z!cmFjP_; zHfxnZJVeT+MH5{ep-*gW;>o6&w6~bEjnQ4QtyL!fFsGUMI1t9{^hkwqHQoo9NfR2B@;Nf}G0V4HJZ)(%u zF^A|2BR*OhXqlKYY&v%A86TFk19WB&uS>o}UYlUr=rz5QZjheTd)s6FNii?k`SrK) z4i>fZc;L+2btsH(F_JuEgGLY5P@Q_Rhu;{S`FHz9wvv#U67Vylw(!+N0TtuoZ|USX zM2{<&NuWg1G^q5HbB`#+QW~2=s@eUobV^zu(_MAZ#YUfl_o z`9TQv6UhVa{PT?JbI3E$%%Zghn)%;h6)YVs{HxiMtQCw*a76IY8rD$|g5^bK_hG{s zjo89AQTy&>VpGXJZ462XFVvSp^*#SIQ!bwEnCwXM;XC-n3Fhf873sHh4(zWam>h7X$3m=ectr>S$jaZK{ddoyq?TybPs{Yd^+s76o%yoPWEO6) zp;xCOTtFAE_riOluMR_F;ifvCdEJ~!a@!<71QMfrC&d^Wk^S7Hqt6Xy!Z%Q;oqIU8 z*q3m5bn>6$YCi0QLnMAqzz}|b>*)OHTsf~G+Wl}kx}pkd0l;3z>Obrzle|CQ5NF*v 
zM>h*}>Ct70F1@<+>C&$YV0-t^cp%&gG~b6TBJ-z9E|B+JiRJ%xnIEjJSjnBp1h7;2kDmwIS5o;&c91w}Omq z=LTF{9m39QCC<$NjLnh$(+^e}l2-&s8J>ti{3{=W<)TVkF1{72|)JYL@tOt#4}V*^`pGeFrNJo1F` z@N7I)564DcV}s|2ns8FVo)$DKOINBQ}+M-R-(TN5V@VF+=rJ%llfM zS8SmcufFn7XV_h{>Mc~ZwOBC7pHPx(=})+n^g}>pSC^o&ClF}#!>l~d3gW!N)m9S6 zx2p|IwkM42+&-N<C<3xGO*tv9m$NOR{CLhJimMVSmtd-j9Vt`3=K#tMSgfj`OB!H|xB_k|eOUYY; z{jVRav_{}3x!sUP|4}#((d-LABAc{O$%A|iQcPbXhnf3i$aynm5dHv*^)8hs=C%2d zgT#p&GW7GbjHXoN}avpQ&TX#GErF!@JG zt)MimAWY8q`xp{ZZ`k!72_P*3WOKkLt$`v30`gu%Q;6sXDM>HCe1=N1`vnk4%{fGftj{w`I4(fm@F>Ho1KSYxmhY3gtOR8FT{2l}cT5sv z0{#`nV-eD??~XgYWbaV+qU2t>8ToezU-ITEMrQ=c;zE}x@gC>;( zRsGbHwTO~rS{uY3ZHlUdaB!^XA_WGElBx?^!8}SVCb|M0-Y;wIK9ca+kOllu${+w} zHW$XJKyVw{7iZTo+B96u2(9(rNSQ*a^Vw&`}bo6dLR1*3OlVm=6nLD*(&Xmlw4Ou{yZ!gGm zyzx7Y2JKOJ2XTDO;Xpd~kks!ecMKK@HLa0>5V>QKE(mzCtEeog<`3N4zbnHWyA&!E(iu{+jj?Q~rPspDU zL>2E5el?%-MBT~9L8;Y19;CsN>}3+KG+*uP*Y`fDg^yxVOZ8P~i5##di9e8b_~_>g zaVz-fC-Ea+{X8Z=AH^gd#rvD&=d+*0k9_y@nEZSclX&~#&(fDYk4bdcmpS!i&tu~G zDyCp8^<_!75;6yi9Y-1J*nHK-%_#Ob*ee0$j*W2E|G4`S?%0mJPW9~?4Z_>BDZb~+tz zpq{SaUQ*Lu#@ge<2tMET#HDWrG64l^QI|{>ZZni-fRHj4?LvOgC%WCkjUq!9Iup6~ zI>qx3833P1d=6VqsL30igNWlrD9nqHV@>(esy&37`5nqZGTl$4;P|_oPpCzAzsU1p zsF~klw;!?R?tbu8t+QKV9glU_P3Q3QT^7r#>^33kh={r|qPjp&WHbb9y z4H07r%6s{hs;F7N5GLbCMpYtrB84L05{_x%LI16h5#(^P(!kiM2HG9g1EzUKA@6Gf}!5Sl7hBik@^R^pYOfz&Agd;(@fF^^_;$Q=Djz+ckjFR zy?5Vz_x|qh_qkG>Q87Md6>?HFDx0EZI{HIn_53Q)j|$H^Y_}4@J(mSq%@|4Ov0@7M(o8)?5?#OzY8S9Gy+_ z>q?bnjh%I{?VoWzN<5{Fhg*pmFEZH7x9hg(O^KX(vqo>i>poIJ{<48H%Sc=r>0H5? 
z+r}51IpwVaXKpCcb?ZaQ+p*i+S6UB@fGnm+^Dgc}#6NeCNDAZa^?i!~Y@wiJI7vf9 zdrCC;*ef<%qE6|65PxM z3|p*8<7U=YubB~Ew{QkkG732v?aK8}ru!alrcjw#ahbgb)Mv&tDB|>ur$0x}d5-!3 z`quMJ?of}o_sv>xPuNDIVBAXO+-=b;xTtQI{vS`g3CTvMNv zKE_TUA9O1*N4lbdN#LS=7qk1B85|j6$^zTPwr=2nqsi;bo+(m$=Kzw(os5z+LmYsxS> z5jd^|=BV53j*FwoNROHm=(06g#Jc!d+(g$x_(L@R7-jUOXF9qQ4?R=GCWD^2<~r2a zMkhk5ENSc1kGRH*!ALKp(Lepb1~A)U=BYIn$+R zx&Y1IAL}ExNT*U;_D-EA0nm)go}JD(rN(573v}q2PD>jfJT-0g=27U=>6udc3~@r) z4DDlU>6t$}gCvZlm2~4YoB1-MC|83VRs>hOnV{DC>p-5!nXMvcz6?2&_J8f#mlNPj z=E#^31)QlnbkHPYO_Fqu2l!7jyNZlyjA#*CiZ^C~Ym(qhGG2i^m7B@L9bRtcuJ_Pz zp@TN)$IU#6D>1gHQjC=0WU_=*kuVk|ODpL)&{H*7Zl+#rJ$3H%!zxKN3~y|?D7Zs! z;~;WWU|W=Pbx+Wr7?j;2x*3+6X|sxUR@IMQqzE?tavs35%ML;}#NO1)%cOzGEf0h0 zHL43_;8#tu8uqcq$%UNxn4WEWaY9?d7v|D;<}KPD8j%bS8$DdAc0tGH97x9yk`XW*oJIQGK0YWm?+{eIu9gMtA(FS(!IeGaLt2=1g;%v9dCs zv$aHStjr%AJ1etCZkv zGYlq^3c}epC^Bb%4suJ#Q`Lkj6rnOzO{hX~ z!DXsYgv(Slp$f%Cm*q9_`kuCgN+=KNu7k^@8p^}4W&7$|(RLeJrY;xlxF0Q3GYDm& zWg6U=g=N;MqIO`Jn&QWWWs+qtD^s48)E*EGG;dLYmC40qqB18^bNHh&|5fTtKC!8& z%!BAu(omUk?4M|&g35dxkIjVr)7u?RX4yD#GH*C`PUfrQ#L2wCa5DGuxmN4aKr){J zhY4D|zV|8GP=}55jqs)$4J30v(+SANS8bleaQ3F(jh&L&mr2Q-S*mu!_nsz+pqG+) z-Yj0hF;FsRGccbRmpo=lrd3`WO6IhGBZ~nVsf$u5nN|TsTWbVLruC)pQZnr^)4XuE>3TnCqZ^Fo9oVt#vTc8kQJWVsem3IQR)mu+WrSkRSl3O0GW-!H&I`5 zfA8}lWcHF77?{C$2kX4}V%8ze#tb2|lJ&%R32S<_;i9><-!CF?cyhIvk8>F%np#b( zw}^k!^lzH^rxC%q8s&2T73g1q`){iLCE`^oUZpwKSpCQN)tst8q5>tcVlJniyrNnO zw9YMIzqPuA)32CY!8uk`S0InftwA2Cu0ej?+$QALRkJ4i682lGSr^6UcCa6><^m7& zkj&~WJnJQzjCI|;dJ^G#sLubQVVU>yO;TZ3Z!i^>d8NKqaH3Rzw{ldYBvpO%C4=() z6e|;#;f)*!um3%RaH5<~ zbN(@LGXL;4y}WN7C-cd(QO_KOli5ms#=yx8O_V+`jgxtotdEW-C$rp~@G^#=xnpYI_t%Pi(tc3$S) zqF{}Qm-$ZPG=O6(Lf2nSX~Q3orAl)?cifiqs(52UVle zg51S+1YYJGqKcRK(JZ{oZ*v=xyvz;wPCMj3+PsGE*ARP>eh&`jp#K_U|7FpCYUt9< zbGU!Yh~@G!Uuckt)0tR(;3CVQAZ%~CunsSidGk_unJ@VAG8=l~OQ!NN_hKM0HeM#9 zj3p@WO0dv)9utgJZzV4?5}y94(eN_E)33}Oml>XZ@jfmpH(F+Rx|tg=y#AMJ0~=Uo zeSgD5+Hv$&#)p#|wR7OV>7GwV_`eQ24l@Zy*j!}T;x#zM<% zHjSK>48#3NX_>a2b%K`}9e_bx~qihfFgq(>BY_y-@ 
zem)J+{s)cC&hT~qM9tnTM9l@#i#VSbV+hO}Iy_Hp2_K?n6Uq`FqUM!UfG8lsd=?$$ zjwNa?k-$sTT#7(M&EF1pd5D^)^TlOUTU~avFh8NHXy_|V5;cz#{N9 z=h~RY(q*A6T^7pHWuYux7Ru6Pp)6e%%F<;#ftok?12qplp9<9cXacDD278R>xZ?wu z%t&V@zv{jJmr(yldtY=si<}94ar%_uF?~=>u#CsGCTnv{Mi~2EuX9LCEG^IQYP|7K zRRhxeiF%JEj3hk@2`Ri_S0s42dI**#<}BS}!S|mS>T8e*>+ zf&*^tGWcf3Rjv_3mLbIr-?LYCx>4!vHzFW^zjLurwQKhWIY9|ogjfZ1nw{T*&$vZ7 zVPbe(6wq!-ud6d+=qt_T^q=ApJut*8Xb90vK^6OvTx^Q#QKsEuQ`jEG{$250Llsgb z5Ck^Y&F1qaK8AOYeWuQ|Wp17yP?xD5BvD zDQT6RcRQ!cPx#sBh)hV}8B+wXjGHa*7*ygnJivdMIGl|22wpa7Q?rL@)_us?!^_6% z@v$>$<^RO^1*}hszjHzVgmU9q(-=G`Pa0I_IFax^;be1|9KT_lXkM2S4To@|@%Yei zc&G8fabw~`!x*#cLX#027aE40ox_6}v2jj8*Pk`^9L2`1WbI(k^{+`i1)6-1e~kQV zGLC!vW8`0xaoppdBmbI=<6Iyk1*P)Ok$+9baeQMqZZM(pugN&hC34*O-?;uY8DZv_ zFwC}#lM~J~8OQMvLeD+H5JL0A>tEv=;1WXh0viXLhtTRL9c(@<9sf%~Lj;|~K1EWm z5$-j3*K~*1ckwA|?0vygU1eo~zI%wEec!Y5WM9w7*n`G&Y&f**$PpS}s=vsBEGch; zQ>oE{jTA-80Ob|XtmEqI^&spmz{zY*kGw~Y$^&&N1mGijU+0C6!j2$L8Th8yplT)6 zNL7#}WTh#Xgi`nLKnv;6dH4~<>HLVrW`rTiIxl`c>rlL$^@R8ttS82&vOXz32{SCq z1I5jB5Cf_aHwhjnA;kl&+01V%zkB%&^4raCAHQ@SXbbAS(eOZLOeqq*odT1X2R*^>)_?< z;N|P!)_?<;N|PU&S;-ug1FqzX*|iCxt|B9db4mpYndi|;3#!hsI>%i_v^`1!Y ze?$DN4Kk0;4{@OUAtuASh(=3F?XTgT;%h!cGm|m0k6~!8BxSDfUF1%}PNUFd=Fz(T z$T`4(p4LBR)W*gCbn!+>>_ytKZ!i%qUJJNqJ{1Wbh*TdTUq^r$|3ZEe%B@E12DOz# z=kwtOPt*s&g}zM2UM7Ue*xfc`=b4P<6LxqhkJjo3n94S`+u?fxtB_au=TU&9l6l~M zu9h&t{k&Q_5_}uN>tE*CL}T}xtS}d{auObRMnZ1OW*(KkdsTX^=#}l}sLTD7v2Z}g z)sAN5X_*OghcQU@<$fB*C!cF|(KN1G##Ss)HDSAr%wd6YvZ#(Fxu5c3)A+0`YYfkm zyp-@iF<5UVD%`KeKBntb-a4Jb{iJDs*yVl-D87$tz{q^8+)tgphSSUZUAN~Lxu2UD zinZ(`Zudd5c69FNe1bP*;eP6YbeJ3sNBTDFj?zGX1GmpK@MnbyW5J)gnkD@AfIsK; z7raAd(zd@0_!D4lc;I_9J=B}=1Al&;WAanh`$IQK*HK^1)DR2y)m9%054?-R8gEoe z#*ve2)#Fo_N5#ekL`3?M+|OIpbl|a@drj}g%l*`qRuQkh@4g;I@5gLf9hr^%QIU92 z>5r28dCk$epRZCMH1ywXd3~54b<=$-COJRuC-+GAQ#Y`$pmgnbRaK#~x7PF&b`|c` zXGZ!yROW;V-}8^9R2P~$TcunzgTm-iltz7Udy@XSJv?w1x27?++i+jy$sziuZq+TM zV;wv_j-7o?Uic>^V!w6gq$g>OWGKh?5>XatKr-k?>Oo3k2oZl}`mnUDd3?H{EcDOD zlG@#?G5h 
zGPY86wq{cr%GXfM0jxKQ7__p~D0^v|p>A^Y`fk>@n*YhhNjoN{4?iLi_L9^9MtdaH5tf^Gm+Jj-_@Al;Q~96P{D-HU6aVuAe1}!Vi2q3|YQYnlZt*-6 zw~f8&{Lin_WEczoQ&~J3|Fh7Xmm0>)|C|Q@^HzOL!kg}3^wr^i*7yBWH)o@LaF|1e zoxn7jQ~EZkx(eC~DhvO!&W!4M_@Ad3K|24l58X26fMukD72zoOpJx)$@;|@)Bai*N z<$pdqR{rOEQu&`b;XgGyn8AO(#7-9cXA6pe5iye}q7D8NHYff5okh8B1^%3F1OJNt zN<00PcKR#r^jF&Hue8%&X{WzpfuGO#0zdbBCLQ>>?K4?`pC{829zK!)eo8N$@A5vU z|5RPs6THuH;(Z?c^vQ`HMXtQhFQIb-&H6Qi6tAVNC@q_M8rsCo)XM{U$E9jEt;rmC zpYJ8SsSMzK_I*bR>b@J)SCqT->yiGC(f@h{EQ~>Z;_ciJ+iklY=+mWM(^wHR3^8i- z2lk@w9ld>?XyY)N7C#5$!VW8 zBhWt2G_QeNP7Li6i1U8j`S;__zdgA2=ab?&Rr<6HKM?JoC#!Q)dUM@gRQ>R%=BOe3 z>fIXh6YaCzp?&HKy|hoJB*!s%6tquJbH23C(+%yjha$}PAt&DFMyb0D-PoV^xlG4> zc%MzkNu}okrR6R>n1J3NDYc|m^8NzzF1+x(lCpUVE}vdfReo-HMais+Su-onomp8@ zc1c@nNwg(y<>$<-m`>cO{Imo=bx9!DT@?JAf=T((LlhyKW{v~`bhphSS%tIjl|_7H zi+p4&d}KX7GF7=++%)cI|4t(b1a7#ToxDp5i#Fv&3X2Cq^9oD)C)5>|-8iwfuwuhW zwS`q{C(SF|w=S6Xo5B~@1se+wt_xmLcz9i~uCQl)uti7yK?mmPKwaWMUZ;+PZYV6w z-|@)fPdxdQImg;b8&0}$V*iAJ(5Ae_idB~yYnh33_1jJ=@XKa48xmz!MNnEjrA zTBl39=hvA?-ABo~$#eQjk6}XK6}?OuODNzxq_iU;^OmIv`5Q8Co}XNAV&)PGpkeV@ zLWvTddN33~E$6d@0{K2tkqH6adrXvxEW+i1{obQ%htX2nD8O z3Xy~YAm@CRP~h!8QiVS)LtbhmnfX^;7U2Ai&^A(y!(FPPtGDQ$|65Ei+p|-d2<$d7xF22$#Q7wQ}VyX$dfrW zb(Jn`#qy=yUG447tt(eH$5yRevuefa`0`aNo0qo4uDQ0Qv$Z@HC~vs%{aRIOHA}0R zKGD{+HFw3EmtNc299!KLU)Anzu6qBhW`3iHvGl5$vzyJTt$p=a<2ARmcaJ6RSmKPB z*2-nq#zu2LO!&^O*fqLyqq}>7=6MYl*VQ&OU;M!jURr-y^JTSl4fV~5tGc=^-h6c% z&%2|u?W(qTY*_$F+sZY8u6SpB)rv$oibU)3u8tN$G`B&}lhF{IdxE9G(`@8a+oL4+O{I4Ck&R^yU-|1BO>G@d>pGZr3em-eG zAQ|Znd*t4HUSU5h8R3UKa&P$SXWDeruWPSI?hSvz92-78pC1OfrX~G8++UlNd+*zh zNK)=S|F14e%HP4Rd!K({#Tn&czav+=`Z~Jdf9nbFEpMlI%1NmwytKnwbX{libI-HM zBlniK|H&f%L6-7X<9u4v@7rlv%3H7VsZ4(^-@1IGgavR(>F`*c09<8b;FL zkvm1hNY;7e$yRIgE zEe$8LUFPn^+q&a{^46AkOQ3vdS6851g7VCUcCy*y-(ntZywrB{F~v45+A*JD)K{F$ zxmxWbr|Z|A&M{U>tGc{%l^#>>&qhysX8FuOd1u>-mXT9Td9JxeO16P>OvQ6INW)%H ztSe4;E%|5W>_B;JReO8eN{Z&nRq?iRRG;T`#p&fa`D$C%d{t*ldz&rZ$GQekXPVO9 zcV6eL(aL9WUcbbVGuV^NX}%=nHHp2{o%Hx>L#zH6Yy26+)j6}}KV9y8eu!b0RyKa| 
zo1Eh##lJmE{3SWY|7Di=Zaq3u`hQF2{qo!L9(VOR_FrcDmBjZ;Kj1X!bonA=BfLO)-$PCaA5UDGk^G0|A^wb`smMbCKg~oSQ;!K6Vnzm-w34`~CfA zjwCyw1U|d_?)!QDvCiC{+dP;5<@Y?#Z~itHUNp{esw)4Ss!vpnURCZ^Rkgl#^lr1= zZmxPy)s(8+e{{W5b@ynv|97r0_`{2Hg~I1rF8bVMmt8Pg7|w$3yV}NcbbDTWi(R!R zS5GEa9m&-uUHwdNsK49)nRg1ZuG@PT_r1%#PkW#8W`F9F!IxgBBlYgP1{JXMl_eZH ztKH=nH-9CE$S%&R%z)}BG2s;DzlQ~GH& zw}WUwre*rcXm+OdfIF!t^NM1u%v6o5s@kQHLHu!x(37bixz#(+dG_=}Kc{#^UuVI3 z2i!$}cn;Skk_+NnEp*6Hg$d(jR+2DYX+b1)8Lgp>wBfEYy4V4>ujGM4c3mV|D1LT<+w*FHv(iu08w;iDYE(eATd3wL z6sNN$J1mF^lO zhrnIFUu9SME{UtZr6hQVM^w#2oRh?%FrH>nL#n9R-e+k>{l8H9wuS}4AM{JBGOGDY z;u#jSP`a=tpl_;b{OCkTf$A-0Po{;V-iMce#c}(tKeMW8U60k1Zsy$OulU(l{ph$m z>+ziPyqDeUxwG~%Fa~%^0n|4_dP9qM2MN)Ee)5bSbY>d6PdIMRl6R?N^LV75r@D#| z5~2HNR@rXP-IFV6bl<5)Bi^kP!$RdsUm7C89HXT7^V}rX3tT&;g;oN$XY0qRs5*GLIx?P+FB?b|Z18QfOV;CI>sgxV-(XE*_1uiYIO3JT(cHFGE zI_X;kZ&sMqFO3UU?;RJGmL~nD)MHH-tbTD^e)aBgH1&>v_-g7r;qt5Z5O$fhTd;c9 zIJMUx>Ciwgh2w9{G+3Iu$Ejg%(K*FaOXOF-z`a$O6}m^9+c~OFbnz^VW@f_ZiVUsk zXKDjvPo0uoY(XfF9hYSmD`=egU8+Dmp88SWbo*}psIsBsQX?Ha@Jm1Kw8J;`JQp4I ziJ=EM=V-3?VYlyci!dkC;`7oL1EJUGN89Kl>xrU&e(C#3T0idL>cV*DIexU;j}G)c z>h|?oeB_Kz#VCRoJsE8!ayyymU+IQ9$}XPqb}!l=g_#DwbgLECFI|B^iH_!?VWKK{ zS^|&m=w7zhi)WlgH*DnrQY!nn+xNwfsQqrW9v$t8UbYw23n@(ZT@(3re6pyYcMdJ5 z;b9VRe2)dV$vb>|KliB8dcCX)o9lga-tSuXJ#j^w8~DLx#ERwp_HC z9%A96;ha0`2`_t`s#a-=otb(yYrRUjDpSM3@}#RAVkjr=j_e&Ey?{Ye&roSlpX?{3 zjk_ay0c9GM-C6hg+4XMUyFaXymsn~6@7qOWp9WpQk%V`X@Dl3*>O2F=lFqZ9V?dR* zlu9=cuJ=BM`V6H*KjT`3RGuNqqxQXAqdRKH>3XFp6ZN^aFnW=+D;;w|rk!@2PbB?Q zp?7hfcMeNJ4k<0^7ivc7d)>DWa$sZ+eMmj#R*Q&onV&(#wA1Pf?R|V_1BLA;sGr^7 z_6-m?P%>(_j&*(};0bqV9y0JtEmJfY^^QL6*6fd8V(qGn)~noed^$~-Y1Yf0umaKZ zc~LceuJkwcRCt5WPm|aEk`r>$c-05obJU56H#^hhlXHWAUeZ@I8kUhrK$`}MXqtXV zWor*sKQ}&C`l|X@7*A{r#&>4g4*7@(w`VsgO~22N)_C2=oOx%b-R`Et3o=bgJKChv z=CF8-dSzf2_c_OWr_2RLMo@e>DBg!cz`o!&%+wW}!`_|bQdMxC4972^YdM=PiTyjO z0_Tx@AGf>8k@=l-Iv<^5pH` zk!gN+v?DBCnQ6J{@T(r3xF$!Y{RD~b9qC1{X4gmC-OE;E8z^%3fP;9Lwcd;UsV9ks 
zNVtXD?(!pE_NW&f^P*RgPG!$olgn=4Y=dm9I!YqpA^Bha(2>72apO&w8J`8@=DpBM$?~EA?pzF)lUE+~p_Sx5i+m?j`>s@OE5HZEeY9Z$DUL(zkp^&8`GG>q?1Rho2HKD&?Sg$ zs=9>A3!`P3ehY!lPt9COiT02dT+f4mDARh!vM3Ub}uS#v0I8#z_29(N;NVgy;E})-cXfl42 zJUQtxMKdd&ab_6L?yb5kp0ZMMd5Bn-#MON&x1?}L0@w}==3MVdx9@8Fnp~;3gG_VN z4TRf zW{O$0*_@mHuxxGxvpW3*gQavWhJ>GKQ#Wbd@4IiGooUno!!75mV>J5l=T1h)J^Gx_ zyVQ_lE?V9F^ea9Y%4Wd`JwvibW0YD^Tdkv_3PpKAiS60F%0iBFuiw+4C%?SPy?Sy+ zzDeu!hbzWGe)Tisd5`x~IBM-yv*UPUG~-pi+i|Q)g-WbD)vfx$C2@Fe)m1rn%T7I8 zY4^hMiCuY>=WcmWx3Q#%-*oPaq`j5EbDyPhcGmEG?!Ek6vxUk!c#j3y&+oy#`gul= zw_oL+*vg-kNZCBA=r1YKL8W={1x~lzqo2D;b9kK22HZ*2lQ@~XRv`{^e=x~_XS&sN z?tX<;3sDHtdybo1b}8`FSMhX>GTK8~2TA0bm&Z}K&APSAcPGu6RGXB5z=!mBs}*C9 zp1(v2bBA?voAR^V4_FfQdi<<%c}mv@$LU(9Th`{>NtwxMaki|v%H6VB7xo^jJ$I5n zxi;utQy-k4c%*Lck5Et{rgbZ3?Ykm`qucX{T8oCn5Eu-x+!_OfQf-J1Zbmk2zbww2 z7Z#tu&wOtXhjCSDTIP~y;bR(6&b>kL0mK0&gW}YO&T0!{&)9!*k@9&NIMIct2L4G`1M9`oGGw$r5^L%_-~G4&G9v~;<>lX^TJ z#xv|%LRXFEXA>OB?OTOtPt@k#ak*%tk6J;u%*iy%6_%owg7uJUl177E%O$QN8}q&~ zz5`rm$$quJ+F*ztgbv~Z)PM;tHijTUe%SR-e_&9pme(uXo zTF)^by(~XPbwrm8yzC=x93$w}OIrfkv(=9#B7B#a>n;OBu}fc3b6yfp`|_o6=R3%R z5lM#q_9~d6%B72oLbT4OEB14W_I3NtP|8I{7O6Morr#GZ0?38p6M|PgBLAv`ljQyU zl9`jC1B8ZC;>%$Rc-h-2f{@AvRm2LSQqDFxGuJaBG}1^j%(S9Zren>nOu~~gDPVQ_ zrB7vo;=G3G^89+NJ@D5yq2m`TT{$Yv`QkB7(gs}poZ%Ta@-*+9pzUr0J$3trxkgv1 zmeExoRPTQ>S|);YO<~-SFWb$YD(oQqpM8i zYMQR5*;RNx-J$au`4Kf2xwX}Axd;tZlXGjsNwxXzau+f_bQW_xds08=vmJhGub8xg z^D7*FYpnB8Ps1w=a8Z<_%XKl z{nx!uK50B7ixvo-Z6S42FNe1wbnXi&fJZz0P{BLnDjF1lYyCuiIWH7|wMYbIVh@A)qX-Jo z5oBvEZXU=a8TyPJRAx^qvl2tu?fr=9jAT%Eqf(sCI`g$W`h5*DbV0^vg!FuBk93hR zlf(%MO1~I!g3PBIfiQbNPS8T}HVNxO@v9Qng=k~G_-|cyV zWVz9Onr6k*EECM=E&T1D6;HHl`2rH&BXH?YOm^el6rYmY#LNdaW5u08WaJT;YMd&S zi3dYOHfNaYLiDUj?ut}LTGWTppk%S6b`Vd#gV0Er1k|j^Xaw^X$!JM<%Jy8LIGl@) zRD?8lwt3w{uSiHXasi%zJ0HXo-&u&B_PJS;kM7S!ugd9KK`y!Ya?GX#y2?c-H1E() zrk?q9&sEDLSEbObO=wMq4mb}h|(5h zav~T{?I!kvxbA!z?oGoux{ts0|ejk7SmR%_Eb? 
zqbUzar~PB0_y`S%FAzG9gw9b3+KCI}OOWUfTvwE2rbFIN)poBL7bNim=YWP_m$Fbg zX++-dX2qVe(93SUj(i_xkbNw4)~IomLNFe6#GGfQFLZX|B`9!aiAm^Gi2Eq4TDNzj z8SE7SzQzQ2TME1VQ7XMhWC!B7zBDZpI^=+67u(N(J}jcE_#lqhjY&TUiKAibG-kR=J69m z>ZE+fGbc}8wV0t(JW$?7h1zJlMu6tXKzo~wCPq@UhC%hFF$1dej~OmbXTa~c&hz;l z-}wQ4t2_S_zh`t#=J!pVXS$PWgSe&Eo%D7aH^2T6WOsHpSa5afKU~aTKYuIu8{lt{ zzajpH2{&&CCPm;y2fXadZqF}xfqZeF_4YdsdMSphx2{@mjGgg-+xJ!V ztKufF`|fe-md<`UApxXXnRH>gr-K0US)IEXBL(O9g=y^a<*KCVkM8;AUG;?Br3ffT z&B2#%9Ky{yv~GOo`TRPa@ACmBZ%0ZYuu(WPxuNPEmd;F}=9Kjql&BPM0_^ntYUhD(yLJ$EByuFKT{J+BF5UILZW)b2O;% z%pV3ef4J13D9tY>bBF)}f-f~mD0du`a-j_mVVH7UEH{LCzfy~^lz8_V`iH3}VTfJ! zfRzm#bER*q{3u#R8fImZloVeJQNSl9J8C)g45%y=Uy@a*DoudO1|Ff&Uq&e1YE}aN z=xuV)>>W-4D%Z=h=qILW9V?|&QN27k1Vd1+kzgD{@+b;^FF zjElqov6)Xs&Cb3^T1^G3@%NV0%2K<(hfoB)rTSLfLTo(oL9o(GFMWd&mtXch9)bViyF8jB ziz#08G7!+!U>>>X5x}309aZksM3w<@y@zOa-AJhn8uW;dwo=lAL;gcL}I_^rug5)zBH&qe!k=i!a#&I>a$a#J_@Q-_5RC{@k9_vA^|i~siI z`IyUY&)3h^Rrnsp4e9t3czL(yE5P+R-o$F^yj7WHWb6~Zb>3TNM?oTEp8FEg1SrgOg6 zaN5LG@fcACj-!hC~z*gpAcg( z#iNxxd9+06g(kZoMEhkqi&+Qkf1LxX1*~)a!)N_ww&WcvvBUn?S$|f)3g?1FQn81e; zcN(oHH=C5mGo?7drHy!OoH~XiBSIrz(pheAv!6ZWF3MxwkaJ`A8b@Z+5#q>PY67Qo z7dqy+A~^mV*-ND7P)!nqCIy`*a?aiyGj0*~$*esn;zd&l zUcmIAdD;0@9vl4ZGE2$rIUs#l`e{GOdZk4@9LgI2Cxi5rvP4Q-$297eaV6+i^*{8mA{w5a^gT-Iou@%S$Xu@0dbZ{Yz9B;!Kg1NPhJYT2ET? 
z)@14jIXp!6ra7e{x!%FGY)6T?e6&Um(_%|=nduh!nL;4xM1xwuV`>s_OPWinvy~%` z<{)z<<3jH5A{hxYhSz&Yrn{>Af_V0a6D_NP1mjnkGS^GB>lz!OgCkX3&mk9WxG1i!;}wQTIG0_I z3Ir!!!ORJp_Lz%L6P)V;*yuvx0Pfv*>mDW0ew9wVD1wRA_!Pndih1Gm*iN!#2M15bZH8h`VAj%4E_|cS1w420~ zsI!~*QUzppyM4D2TlH;yy7+Q#I_Or&TCfB~OB5%+h&`2UBYleQ_4z5f&@zz(n??no zAca(E#OZ3VUU)>StzVMOp!>2J`pzD&svrk$s$`lxm)S%f;1`KdY5+QZ}>!Kwf}>@Gmq&zPMN3 zUk4?>dPauVT>b2Lq2ccv4#v;P zv`gvPgyge??Zo_%KS0t==N?tnl%?tqt0Eq+;did-Vx|gnQu`?A1E|uwrKAt@({%2u z3VFy>_1Yw+yTztUHM=Oqnnc;M4zMDdq-QCOBr+ztVp<$505FGat-_$I6pa@(DcCw~KNU=y7I{%AbqM&y!Bbmp9ggx@rK=yRbWpdB8LV|QZYnu^zOl;79S zQ8KfOqf~xt=+I9kHZ_3V-hPySp}2SKdL&?S2DwZP$u0@Mc1;&`jG4nUCsEo+f=Nwg zGRI@BFxR=#ze;MNf(AGMr<)*h9%sPnESG&D7wtkL+~Z|gr9RhehQrcdN(|ZOE_?yA z!Q^AE^b=EKiTFkkDq&tz!W#H~m02=gJo#;blL+$=#ni~d?0%I_!Q5H<{p?HbqK}fM z;V^!ww2V}IXQ7EYts#}j^Rr*OfrGo~AFx6Q)1!N(f0Q2dHAR-8&FOWdXw zM#WOkc@TGKTzt{Vh-I2A@{jf*bn7ssyzZwEv(Hz{g)9>oX(>j5NP z5PzWBHBR?0&+Pnsz*?@(S^SRcZ02`-=kN1d4cOCt?2OKL^82RF8YG1i#GU|DVBLW? zd(_^fjjnUKz>(sD~oJM_ool22_7y)&pBQ)7jFP?jHOcfgnu!q3i zuTsOB+)HZs!9Y03Z&AaOg7^!yR0DOs_6;9{>;bD`BE6&<{`6w~sD?kiLO-hEPY>!x zHT-FyyaQIlx$Mz-)q6gOqhqz4Rx;knweJp!`>Ex1)wl6V4ALW~)iQG^x$L0Z`-(AX&CWU$4LdJWMS0e1sE@`T`D+Ny40}4$NlhKm^iUAm_j7-1{Xy zwW?_DqxtSN(vK*Al;l%7J7|Ewh$4IhEqt!(f$lKZbp8`H3VU2X(vnT*Y9-O!L%JT8 zmVI7Yl;uyHKBgy6>ujeU59!GP0{|WY0Qi$3J>H?4n-zSnrD-9LD~{PBsBcc6)6Xw0 z;bd_V(Tl^1{!1nN6sCzG01r?D-~omO@Uvc*RNZd0SpXaB00eUW=@x* zF9e{bf&ifW9|i%y;{rZN{qiwbK>QiUpa7is1$x>hAOH*?{;$FerZd8UkRMix^_8SY zzy&3%n-yRo`@pqL{{ax7^0EZF`pcK3Kmcnj`({N;Ul$ChKmg?ntLfk0lH8D7{>3&_ zX7TG4u`$SPCgy0Vz0ZWvwmeR7bQrwckFQFk21{pB!7N?GHXWqK%bbnKej~ zT&_>$@*6sa*O;V#xQlm{$qfJtyO;V#xQlkyIyh~DJhljL5F7Ha@^1o4f z^!~NT<=u%~PVSs4msdNX!>sosZf_Oqf2TiM?%8TG`1WLd`!@9FV>@)Gd?soYuc^(C zGMTbV^%}(ZUfd6Z7m@G?mEgw1+eSo5&e`ldn8=nfB4r2u#2W$^4_Bnf2)LM!*r|bg zqp`IyMHU-JP(Oz1SZEQPRl;QiVYFqThA;|Ubmwl%VqrDK{es33BJTLKvgHNek*2r|w$8S|TfjP1V>$atBA z${Pk5FGi{yb$cIAq>9$|`#5 z6$huw0Dp`?xdu5R6S^s61`4ra0sz4xp;B-jLUi0&ky1%Aqv#@{@GV@&$24!)4Nc% 
z3wh#4F9z|-Q#SXMC8no8`kLDd7;e~&t>t6SE8ZNCR@D9JksF!rdY#C-xh;cUpi24N z`x689-ygJfj|S$HzCJ(!SwN|BmVRtjJ1^QUOHnbW^vOk&2-t8`$wsFKy_?oUqBODjTRAvc2KiDuWj}*^**YI}wrI(HNT0`3 zc#Vl)cTp2}*fObIaNFcls5riyjPiO_4(YPlO>0T4yp`Q~i~g3UatxJ|wmS@RqGcnx z5#m)W){&X<9lD=QoEo>cj#xI=?RDQVr1y4vFQB^1U!);M@e);^ib1(OXRv$~O&pcW zZ|7j<=!aF}Qr``P(9VK-P#RkeBzW(LZc-7-Ub9wBf_s{2DikGa$c6O0UnNuH<%^3` zS0jkF1*kGS4D&Xmx?AOB;BT~H3@f3aqStnk0}e+UiftSzjVQK>rzke2vT-<=%EsYf zDjSD`scak$rm}H3n99cCU@9AH*kdXipkd&R9K!5+O=S~&7gO0Si}__Q`xLW$_`7Dtxq&rM9EW4wjt4E$(^;=jd~bbO2Mn6gwChV+}^iP>6*5X z&oHdIb*M$qdAIi#S;pU6XIgIQ$SSQ)29c&@Yps&gDS_gBbRV^doIgv2j@ia_A$mZ# zwmedK-GLofnnx-)T@%Sd2O8~u;sYnkcbPs;<&EZe2URdZ>|o}V6}yI=gWH3rE1vls)k$TA7SHx-Lil!`X;t1XrHg#)SA6;rHXRnARv&fQ6Cd+R zu%EEPR%J9_5y5{4?FcfItm{~4=z3-7^!+<%d9L(zjbN|TtI=B-UD4)ftwchH9yyQ5 zzilV5SGvRQL{E67?@JPa_exIapT>CdaDOxtEcJqT;=8qC_DQxb`=yx~Vn}KcqkSaC zGK(GUy*SRy^rGz*5wg^Rc+q{iXoy->s$dJ3t#*!s`nGZv?bMExTS26gN!!QS5Ym0e zM4m7~w8VnUdk3j$RQb*zOBC+t=hTbXZ-Q1J97yQADW2`sd521_GUX8ph(2cfRR4((YT$aJ6^i>&RaCYFt!AHElvc9*orZi4P0BR^CVCeTZo0#UWtqJfJ@Gb|>Uwq8}?*HvdMIm5cS)`XBEYo1BzY9n5K!Pzz| zuKtPEQ?yNgR8uv3%+O4dlDt;!jm*4LM}Bmu^IT2*HGtF4&RFTI*b{ml;=ns^c4h_` z{vmJb!&BEPmbHEKWNC6{-n-StYWWM|>IpX;K3b{(sah~z`O~V2N+a6hc#+YSn;?Cs^AUI z2_HYHxGvPz(?fowb=}CO!w=K&=p|BNe=g^YPo6M9^UiwJLt{Rk`TH8p4ajEq`um}` z05`1I$Tad?mPRIiw1@mK7#%Mvu}1WGJEF-Z=^tcCKXdo6JIHCZ$?y|&24H&ZWmL27 zR;K6E8H5Hsx+23oyy5cltc6O?-H8Igh}JBQR$@+tu%m)`pH$oT=4HL>`w`U(!8x&2IGwl-=H`z`j zn=7G(2lLKmjMN)-U5}NzB|+PnCBbYS$g^&X#s_isan%XNvB^RKy8H1TI8u2?yIABO z3e;|i7m5$F@LYy*ZRiYGR%5ck=O3GmbGPyzNfZ~p2_$_S8(6Q+hQ<37yGB+>ou!v- zP$p$IVY8+sh$nwFA6Iu>5-qu3CC9$LUlRhhgqiD6yBf>@a1>3dk;0B$lQe~5G9hQ9 z9(|1`y69Xm`0nbAQ<{?TvyZw9j{}Kob7@%@@ZeKsqLo&=+|!e_TvfOBMhQm!0x$BP~WCubL_nZKu7xpNZ?eKDC} z<+7cK6L~g+Rv7jeatxUxkigYmuNo~xP!+koZz5{B_GPVy^r!|^XZJ2Aixm6}{d-jK z^`5C_kzo6bO;_*qNoSr{+1Io{J{*>EA+nRjgS;BO|&7Bhb5N`7olq!?(U1d@Qc$845y zCz1kT)O0TLU@kabBGxvIkz*_%k{r{%OLxoTN{@v`*Bn5Qu%alDW=~2=Y~#ibp6oZ7 zu@#x|hsrp`4iL^l0k%m9o>XvBvz1DgkvRT5 
z_P$JeCIHz(NmgUBL5hvZhAc?Kz@f)?Nu*-Ihqk5NY2hqZ1W*9Yy-2_vSW_rc<&0ur@=Dt5JOT0}!$w%VtiKS_180*II#sL#bwkLlDHgA(dn!u!-)r!Az%u&k7(8AB zhfbGHOfyz~JsuBO{=8B-p?IiV`uq`0fCP^p(wU*`PwR1t$78s*8df$2k3XP@7V?1N z{6}~^(c6AQJl@958^zHa0-X2#jr>uiVF>h`>(ev@c_^I7Jv+wLpNQTX@ zvxKmtHf@hm2U)tpUq5m^cCxc*4RwhBU{DLoLH zb~NIjN-tNYgnui9W@>`aG$aVkuRWv)4dH~)w1d#J_wxrr(>}l-2u=GCeq>Ud`07mg_h^1~+?IUlhGFDee*6vn zI92i_cPR(@ACvNb+31)4dK!|n{o;7aHNlN+ODI2fo3KSK&@U}g%y9f7#q7SzNY4b( zC;_*3CE=t|SYY<$F7D(?f4pGNzb?icJq=@W<7t$io_`9?UH=QS9AXqzQu+M^s0kIa*{0u zqpXXcYbY&O2XvR;of0pCG!6g}X@`cpe0e|5uq8#K$W7tkr}Hn?j`AU#ugl@h?u{ zWP~-5$SDQm?J8dPJ@p}oMq4gf<>HmTrp+y0=@+UU4Z@mH4#pqd!o5$8C16~c=~6Iy z4tOkhoG#8oYmRa-&d$s!6o1gqgM9H$rI`>g-nrQB%$j+V7V@Zx(D{2IW^6uuJYV{W z+I?0$<@?;{?I+%$qd+`CbT=~e@%$cA@2%&&5bf3$HFy_;@v~|)DC9bL(LI;M)s}!t z)}Tnm>*WNJ(V&3AW^U>AN7)w}E+bxd!oFB2-q}UqeDNaK7q>#on6fYCOP?l>1pm~) zYYU6tQ;h}1zmo$KMlV&$jedd`b7^Ksw<&v2{Gw)oPK0swkGVzA9vgkE?zZ>i*^x?- z^=`5!Eney8+BT-QXrai0A!~IkesKy0oxLGd1LMN?G(IYCRw#W&(@GD`if35r0{lba zW2ziuUrdl%)mJ08tg|NUi(dqg0oPKM=^tBKL5vRi*_UyqjDYbuFZ&Wu+Dp8gT8;*^ zu0#%P2?3*D`sOn{Fb`j(V{Xq6*_I(e)Hbq;^j+YXT3^-H{fm@=@ni3x_$4NKz#jF6 zMEe>=(PS6hvu{Xc0qN?!M968%)#UcP_k7&Xg;o%NSQW(W`+bYXC?u7zSQc_t!zr)2 zn1lR0X(u7gRUBJSmfQElM@d(ePy8+(_@y7&x;dM{ z%At#ip!+#zlc3L=EI6%4wdgjzV}Nw@PnFrG6ewU5OpbDg{bc(S-qHJjh#_<|li@BB zF;PQn3=oP$>WHX+Ua4DX1k+<55RgYMDyHm(5$;6=MJ+1#?5i*@`tg)WR6+;We)LH8 zUKaj~2r()H>*hgrqm9y-ZM2nwc-^-rh$eQ(ym#{;l;zfjnWCpR9lo=l{CEQ>0F(7| zB3-PP#nN`jgMSK4dttJYenKQ6=k`qCZANGp2e`$I9*{>uyZC%cyNJ!R(nc3e*-l1Q zBt~UIyQoI?MrjutC=o076Ks2wc2O@rO1tpz*3qJt`x_)RYbp7{OZbgm&?E zrA0rSPxIZtx&EopM>tPu7l)NrTKOs$l`k>DaQ* zMqt_OZMNT?yKF~mS~S=&WH! 
zokkByF8h7quMOnXepE zzQrZtTNKwK^OuukV)Bm7;#w3>Q=@S$HfniuyW9Kw*wTQBi>VZ6gl;jVSYlI0Klfo= zR9|y@wVzpxi>+c@RM(Vj=+HXVTdn^~DfHWIHoS0585eIPGF6obZRMeU57*4MrRV0S zjEg9)F^r4W?<pGRKlT8I{nC<v9~jHx z-^jf{L@4jdtg>dtYEw%P6{CQ?QFCR$hg$E!@yCoGF>Sw^GQ zYyYxsW|m;IiDGh;j+V`)I@ILvd@0r)b^-T=rjUnKF6wWw-DFyiT;t?Vy`i##CLGR2w8#bo@GhS3V1y=9dz|!`YQexehdl{h zP_g|`f>;%GiSaCA5|D??hdT4BO$&dK38q{r?jY2!<5|3VD$Neh;^ensFKXwXR>m`P z-9QqfG>b|Jnng&+ZctTdA%-RZB`@cvM2k`?rd*^<;w8zTLM3YW0YqT5V9Y0jG9g+_ zb_b~nT1j0-PKzB^Su>|xi*CU0NShSF%53M+YlDyvW} zs_QW|t9T$F4YW?4c?1S+JNS#nO23uvYF?j3FQ-PQ-6(vrA_EEOQ$J7vL50oPtk1&7XR+QmZC&I-R zWW;ui`Kf^Mli7g;+aitu*y3X{)<)?T>lK5Qf}?bc8h@j7i(gFX7V7|}umWC(Z?S%4 zzp|Nu6iCUDt??}?wd}g z+fn4rtVu$FN_*5;6(Ck;HI35gM_beE=q(;P@4tI{HElh zHby7#K!tNrUElOON2!+^IsQt{#cqJp5za*up~~L`pg0#NPaLH-6z5`@iBHTf+cW(Q z&PO;GJKEEyyj9uFa=F}Dn|%i_E?Gk5XLrKTwVD3&XjuFltJCePCfeh31CM?LHz;vmJfIvdn~Xu@G@D+3V?3c2%z1_fzD&nrTR}Oc%2ZtN5^eIsla|VJR*k zHF3Z$UHUo_cbTa;NywHUdRz{))U{-rl8_H02UwMgE%hC4%gACC-!4|`t(69rl*$)h z>VEE(Ty&4Z-mdhuGcVes2A6zPGan%;$(6mhnf))TEAF-SWth4r7=>G`)SzHw-RvaM z6>1amAosKZwmvU+5{?h93ozwUPE{DTPOG*M#9u+=Cm0a7VEJG`$W&QD7!_bZVZSHK z{L*+P>wfS(3-N-t3NPu`YfQojCHc|~U?bBaO$QLa* zooc@7qqcMz#O#+h(7B-1#J40Z8U#lvOc{XHyxK3I_qm1hg0q&HIT($8~;^U%0pUg?o!%xVQKL_YaT%V;UW3}%XZLeH`%HdouQ*$K4z4(p-`XqB9^m}SGx@EZ zbM_$TbI#G@al=UQ@QMOuq|q;gUO!a%P^z~ z0BaO-qA4#~E%D@!7M%SBXN^%Uel09+rbUu56FS?qN_K$gq4SXVvS8+q|2xYvUb(JK z^Ipcb_)&tTJc|ub7mZRaCh-oZQ6b$G+aknV@P5*J1MxVjEY2p_9>ugs&HohAMdgD& zO1Ku)L(HnMV}fzQwb&8u40y8sfjr_tA>|+~D&ngh2_^pajx&p>)<3 z9P^sMNqakMPPzyS;bN=Xdmmb$5FI5-9gbeI#8X&jUC8pg9R$-FWfNDuGfiP6!#ZlFt;y8@SMSD4JI-Lb*LRn*3KBlS=YUcVc+Ne!>|@C7$9T32xiqMvr=Cc* z>>-HUS@-$Qz0lUodZ6OSVMVSWT=}0_WgQlOWB_!rTN)!Qmby5kjF0PJeC*}23bcJ; zg1*2WHljF8Z@}v)y|E3okq|m!mg6xdf~BHKq$ePewX}G?IxJp_Ix!YWLuqL8u=FST z2ojUFNid;DFb&)cc=$KvUrlth-}PcNlGyCIEI;HyNVT=7^JY9_A3S zXkfM1s&nx;(J|ON`PI)=htV2xUL&bx>&DS7TpJs&FfR2tIYKQJ?cl{1qZwYU=$oLz z7O6aVap@Hl$R^UD^gy0m=OSE#M8t~|fV=&c>dHjz#4{#Ax#^?n8uQV?%CdRmC(4QX z@kgdH5V*n2kOqlbj$z{}#jfIWL~_2V|}R#yfUQ-7Lz- 
zda2mOM5yQ!;tvr*v@#vG$n;H?DL|TO6Zt+=;0**(MJi^g9r1#_uS6I%0~`(F%cUHU z9ab!wPsvh&LWs0310`TR}&p3mTXhMl7VswbcUsw-C$bv3baHCb1a?J8s& zv5`RP))aOz*IqfHmeVU8erxAUsONl+!*6ZpghtLg9e!&UOlanOfg=*g2`$KfM2XaeJKTxp!XFJIR#{&jlkdg@h6EQv20iDeNaLX-fSPv?Y49sePXnpr94 z$VLqSMKWze&dj6zPO5QGQpW@5gj5yP`Xlt!q-jelG>{SjqJgYK8JHW9OwWkxKyCxN zWww-$2>DF0XjBamM~+rABd;c8la37_*86qw03tmU%PO>my6ymXh$lMVWhN33^0k8d zB+MWnPjt&93k`lYLI>&gHZrGOfzs%Ep$b}6eiwHuc&#-2=5h%~&^qq>z`n4M`U!@G z^j7Pb;!}jnVa9LCr`THSPC6^-zWa?Fv>BUCC1OI&#sI7sgZG$0}*er{in}apN;c68?k%QaqhhaRXfdzqB4-D~CfgrBKwC*QUxL=`QkA3urjHc3X!7e8y@B)5T!|5gSh&q zJZIo6v3wejm1F zp14`UNP*t8#`7T_7?Z5AQz=g7(-off9hgn@3(FMTbnbd3XS+J!H()WQ zJ|!b!M$puKdd|+{l;DUJ)mi~P_hmiPQfQVik68-cGQ_h}mqLq2^5CmNXH~Y8HesKl zA5yBe82V)vLw9H~^g$~ui=j6tfR=9??IO#bSrKg{3*5e+(0OIV`sideG&{A&vM!GO z49-0S_E^=q+UIPH@OAN=9c2bECf+qyxo^Wi|Yv3b?rPRIfagbypSyW zq?wtl!|ou`&se=-6bnKkxrQQP)ng-xB#euj#keSGN)n*1_vM{6v*PM;NT_e*`PLk{ zF?Z5)zhaSEH%`p`hKVAi1S|&eLOUk=Hg44W%4X+ii=It}B>OjV8uHi?^$vrwUQ$BB z+ZY!;zFoEGqZkl8)-0xZ`-vP6*?Q2F%RcBXeBQ7}Bu0)!3(LqLhwY((=q~Egdd_I~ z>75Y2FR>Cp9XyIA70F&P6=4 zBoS>Am9CVYY_-U#`^K7^L`5cgRkI#L6d~nY?6)*m*}Pb?Pc-3NOsE=}EM&4ns!fWh zog0?$MC+~-RY#@n| zr8a9TOPKI~l?UG(?y%B&UBPYFyhz|4j>K z7$nC9_o%)}Vg{pXX<~O^pSE#R8xK{%dZ8|kRIgH_xA<{&L#lCeS+Q#+e2WLPRRzPi z4USFqF0G#Uu5DjQ8h{)USC?|_vQjDWmDWfi5vM)W`~B>06z*>S{9El!+`g%#A+|-u zQKMR`tF)?a^qAgPY>R4a17uWUYbvoVu1tH5lB>jX1X!c@B^J_yuQ6=<`v+MI|AumJj9JYGWmPXqsO2*f)F5W^&X_|y}aVWh?SQpzhL|8hNn1?n6{7!5MN#r(e4iMOM zHFg!*pn9cRHEnqc>teGXJy2m?d{OJZTZOQ|nh5LS9L$e%niDJQU#ZXv-(m+iUMmKJ z_!eQM)2gF-+dDP=MZQ}%UONWqT1-0s`}r0>NS4s3yivZz57sI?y83@N-y&ZKn)7~DMgIWuC0&F#H6dMWc6(}qFhozB1%p79_}M{%##}7U0gK4 zO?XaBBsniKWOa5L?_#@I4aU2u#Ef_GX)Fg5lqv6`a*`xRCQ1wNs`Uf~%1``Hrbs^^ z;zh}I5ihoI%lb z8VU>EMJmpv@2MQ>F?s6xd=6}k(zg8`r!^dex5dJ%Wg zZ0_+ogIv&ZpuPl_@o@`BO#k+JPQF4`iHvb=`UNnlY zQ>Yim5~FzCqFxNy=fIS1@2gaLg?q7vcQ@|Ef8u8Ou7rE>U&eATegp2sT*>zCI1(&AbT~htV)#&#Gn+9=!+4p|F!~8>@X_C8RNzb+bP(-5yxn=i*$z~YFCIYBYe`-OE6~J; zhFJGWI_k4XwsecMgfFW8N>lilHe!1~)x%si_kb}m4)H--oG@J9?e!JdfQQMFf0+P` 
z5C|FBGfgrnK!F9P2-{M0Kv2P!Z3+u2qXQ~P^7Tc*1@iXiK5y)dSWFNyDa z*cqQzaIrJ$TN8Ywp#n1i5Nu)VXZp2<^*{l}&X^FXi8_MN_QhN(OrM-hKm>y%1_&&6 zMhG}X`!$Li_;L7FY*n8QDE8*I7bbYUTewG>v?xf~kh=&Jmi%+z3r0 za)sGQ2pa!+hd8aun;)TSS;C`P+8R_|NBJSb3bEAQac2Kg$&<*O!`l<7gA5v#JDn_4 zlhD~u_n;4(U%?igL>yI6OKmr2kUuVy6>+lPL=vC4T4k;w@+QSJ5gv)DpdIN10^maU z2u_f2V;-e~DN4*F(nuprTe}nDn9Zq7O6eTycq-DG({Rg#Ds}|7d?*z~$@i;I!7Yy{ zfk#YgnGi~F%heJ}Vskfekp1KaUjB!;Q`}N0 zynxF@0135RN3F}7sL8XyNl<(WTqc;M@WIJnlZ844vSiBvz2t82O3+6^k0W?xF8(7v zbFK9^o3H&sg~c&&b{J%7@FLTg~2)`4~Yq9SxoPsht{x_0^~QVHt}NF{Vqwu`;# zQsBxkt}b0tfhu*RZ;tL@NLS;5OW1lyS+2Zh%FS>(Oj(}IS{6~n$n1E^kDdW0Ah*{ui0j#5ZXzFCjM8ttQ@M;2c!+NZd!tuOIiDOuWgX z5L8p%zR?1eTSrM8+kOR!qudkQICKh$;||_sl*Cb2V@MpI*f8RCoJQg}KtSe)+es5I zMu*oWaSSa1%x(;e#t{<7COm}|Bmkh+gul~8Y@CP=4k?MFPREcq=8eQLWQrtu5lBcc zVpNWM#Mp?7qQV%p&A;0X7IM%%weUtM`xVG>(I)DkJ|kJO@1EDkj|G1;u+wPcI7#{%6~!Q!ryo{2eV^eAm7r@ zZXiHdWa<>SN6T=Kn{1@EEEB&8Q>xk4*E7hP1x2-J)4)dwp$2G^Y4d_ScVPB$lhv zE}um_jz9`;q4Fy{jy0-6cpO`dPT*#FEq|2qI5L>!qVgdCYwW(-l+LkU?SlVWt7CB^ zu!1^=9Ck|#FJ?a1g=m8y%3CaOW#;=<18Z64^&Jvso*KlS?$K<~lby23;wz!}iT_>s zY@c@18a$sGGs)WFgwk<`;-|Z1iN9J{?GZ~KC|^r=$M>Y}F6vWZ(Yh;CE{eY1uID7% zT>vGMd1W*Mvi670){ka2W_MX(*U%pd4Nel~T~z&UlKG3)lN0E>sAaV(u(>* z>Tb&3L1<^6O;mSS*str37aba9bJP&lKP|b^!(6mellRfyi}+-=HhBD5tuiqxM|4UJ z2y|B(#fy-B%Mu|F?a9TNDHUmwND@9PKg#C_#TiSySNqPB)JHohM8`zwxE$1**Jq)| zquLi)Utx7b7%0%ZT83cg=qsal^i#@cD{rMDpTg#th?m@ics)Vfp3B~bK*N0!grTq$ zf^T_ahidg=F1y+7%Me1Hs*$Im(B@myEy4-NL7fhe60Q53O((1;7)#u~WBVB(bhK4% zh=oS8sCWpW7@sd?@|p@z?L-?eL#|Fl5z`E-X$_1MQ98O)DJ*I8d<>=IVx<8AUEjOT zP;rk&Pw-^~_6Zk?&+!Qo$t$g1`#A% zH+fT`@w_Y9`1}N-wPxPgR4azU1RDI9OdNO|w{dTtlW8DS*AmFc-~tz@2~mG9=)9rp3j}Y3E%rI9t(=BAAN}(CbaSu ztiy6P8235y8vYQL5=$@ilaO$yW?^>vssg>@5iQbew9hO27ZMBNC1x+=qNge*L+W>H zDZiFQwB(_QiUzTr7uc5)n2e}UqSIl>L8kJP%BBf2&cs{SMy8Z(5r0@6YOcBSsBf9S zW7h&O`JgygvZ^$pKLc+^v%weHjZtk3Xa_N%=RRd*Pey`x{eQX$ed=WX~-mv=Kq~FmFa(^@{)sQu@a&=nfqrS z2%28~qE*%$%K)aF#hO4PctqMCG=%@J9aN#{9lMnm!7M@}AU$i$WMtQAeSmUgsTnEU 
zq;oBkvfM~c$=*~+4zbqMy>?vo!D~O7a5pL<>(N4IP(^3ipAzJ!aaLUYr974sa#;V9ObHx{H1%SqD(#NE`SN zj4#};>HYCPO@`c0H2~d3-_X6vm-~Jy^jj;mB^$l*%WokFY{4J#viEt>1NH&vvX_0) zUF0fO^jsl&1z{aM0^3c-(OpMjYOhd9KLgYGP3@ssYN zzXGHdt0Q7M7i|%4ol-gqS<_sGrI^eHV~a}LIOL}z%&>1`aWDD{l~vYvApgv;eCem? zQM9(bhgcma7&LFDuW1^wI`B=28$6koy!@2lE zecgdxjVqWhZZbi~`ZMQ2`yOMVIB1WsKD}LnoAc-cw1#rQ*A59)2PuOM1 z^r4s^zf6ZzFpFRUPRx&F%Q9KZG4C8Bg~d8Yz$N08C03K{rmjhDRKR`2cpc?^)x)<4 zp|a$nrKg|9faOyVk$x2`=bl8qExG6QK(h#cH0)^szPy{C9$QYkP1`!|5SYzWTEN)B2k#^@b8_n}KNvblA*h^Cx zM~0)-*+vfR1G>lYLbQXMv)RS8`Zh7Y3{4C%y5jbFR;UwfP6F!F&fMH) zVE;>r(UB5DnngfGr+m+hT2>SAai%=00d0`b)eZQcds2h+k;)JnLH>E3fc{t#qnwb> zDx4Ob9(-ERzVSec@br!IKz?pW`DwLk`dy~qj0f`1PvwDp9pvBU0Zji3`9FmR^0mnS zy;NUfEC~4*59DwK`Iohj({~hw`ojddg-`sd?|>XIoI2^(Mfx?vNxezKX_yb%um*3D zt{Y}SM4H$ObOau*v*Bdw--eUf3pSh?6D4y?j7HHcrJq0znd6Qi32E$rMu|*lgn^%EHJ#rEn$+ojuey5{>UJ^m70o zV#d3|DVfqqwgM7K{|YFA=2n0O7G{ekOHyV@EhTD`l~9@S{;ilLrzgykjR~{luRWy9 zl7thpWCz`LLP!4Q7%av~L;*M8phE>u% z2i*C!ypkWF$DGD1Iqb%D>PMBYB$fqs{R$BIL6)Gt9ss=%zDcr9{*CfWeg&?`QGUt) zo?MAP4}jyZboDy?l7GbgQ}`uoF2F|oRs5372b7lglb~O+)rjJkgjiCY6a-2(DeLc^ zNe;WzTD=4FYS`X{pUoqz(@}oO)-n8&FDM3UF*)o~$WeaDgCnUVhmKL6eI`B$Y`|P}}K?Yr!FaD&T zSa3Bewb(z8FJb;b(y~ze^vs*6ra{gzwhkAHpKU(;YQA)*h$TZl7{a=;u)^)fTq-ju zcMz0EEEwc%)Q6i?6V6V8ZVJa=&h%a~3jncX5LbU)p;fE~LZ!r#sG&iDj;%_7_Z}sd z{K3a~2@g_Y$wKKO@<@?#wNjzDNL5uxm>}}S%j&}jI>+jmo3`>- zq16Wvj}P*~VJKxU=@; zoaepllk9VTl00_sRA4O!vwgig6~!z4@EJYOms?LjNcE@eHIW(6+I^=Qkvl&$RFSUa zb^k17jNC&Ytv}<|ohmw^+9*ca;l4CPMB#D0bwX#mf0po00lcXQZqKK)^gz&Fl>}B| z95QX!vMOM$367(LsPoR;5jZ7`AXNxLcJgm|H8|)doJPITo{KOt*%+&a?jlX2 zht%rUdNt_{cBf|q6ldmn+S>$A3=o}me@-(%0q43mUr9zU$mLf9s#aDI-|hP%G>7JO zs2~l&M$-4D3 zp*@Hm*HZJXmnu6NRQ|ex6RVK|>7QTvhAs1pXME6$SX71aKp+(ps05(F~pEVgiDAa7wZg9xbNo#HpwB+VLrfm3r~0t=C;VoD_sE37yXxW%sb(DKF2ou zUMKP_Fk@3>B-V6PweOd{P>lI0s+IuG*2hiD%Glg0fxXRmXNmSerT ztS$Re2;KM<-k}N^-Dp~G;n32qS$!x#4A5$90UqJ!srxcL(f(Zp=5ENIBWX3w{ zN@iMb&1PS{`D-(S0#dY(DFdW3tFS*dy1nnl5H#OQ?2j$oRHd;$vSgYGWVhGH=;h{o 
zGljco0a|&!wwI`vLtmt>wkadSx;Nc)zV(w7ON^l4Nx6Hp zb>&8$$<`j^SmAx_NO>QzcgFBOYCL!q-pBFN)k(iPKpDy};bK*IA9q0vVuR5yUCK_5 zf9xj!#)fqm?=Z@Ay}q`UP(Mn6$?(svXZ~wFEFb^*B=QN;{N&qW%$g7IaY{epYshT+ za=6&R?A#8X5qeTOQE z9lD+J1Q0zfs$WKOtB$mZj-xxr60ll&)O0OFwI<4e#rwSk?bLAN9*d~TtDSh`ah5z=p3 ztX_KElq==0B1^Vhd(pko;7#nl8Qo${*Vi=I%Eai1Cog2L!;efrlahA5G@xw)eAVI= z_OY|1K*UH&8BoF@g>>%zpVgfUlvLH3@2k37Z3oiTGMX@ko3siE1jL45rdyqa!YZxC zYV0UUSa-YVTFz0W@T?EUTUwg0<5TlSBtd>^N(csKvX z=q@zB-OYMz)T20#I9;uH8r9>5CDQ zzq*t%v|7lzamQVqb`LxVfIi-)cV{ctAL4=W!mhu|ws+u;cIA8f|JrRRk`^mQb{8WbYd1L8X=Z%is zfhPe)SmJ0byoVV_uiStt=PTKTqr}#$v2K`6uu?rrCGtFy>&zy|BH!i@m=`syIDFHL ze(Ei*D)T6V)C{yQ4}kPTO5 z*5}sh+wxhzufPcde=N6zf6&WQhmNj%{bR)k+)nd>q(bW;%0XZ0mQ7%&SUrJ$n(EOk z*1}qVNb?vj)7Q6`N{jL8gLz7JOLv|GGh5~Bn#{k;d3bjPmtdFD!APll%@~CoH!Hdr z6F!zic6;LRU5L9s(T{?WYsT2Fb>=8#`4LnsnY$n1O3$lGeLxwXtI`EC;(w%b+?-s0 z3nE)Man2TToagj&K$*6($Y{cNUBOR)gSIDc9K~0lHBmKVcsU%Ag8in+@w#telv9H9 zmvuwiSAG|XdLGj9qw&shYn8OH$!-$iprsJSNwS|jdA8_Bm0;gHF0kN1h~-W|Q|ycaUf%Ys-c!dy zTkZtHqa2Euvb^_#%!W34hIU+<4z{C)4{x{}I7feV5|OTwqO>W~)s}>jR)BZ>5v!84 zoB_W(c1ggR&LAUBO`B#%!eDNlp1IwM74NjJ5YE9quAY%;dr5WQnjE{-6H>~vPQ!n3qYuq=X0x`t}z`7IA#aFQcXA-A|p~*brLT_W0GK{ znltaS5gTd$Ig=q((x|LUJ*ug80|g~h&(irFNU0c}@ezdDKCVesSU#!qyzIk;$TndZRzii=#WH#;=-g8d2>?E+7 zB(OzhJO>t5WX%aAugkm=&meN|i>$gE{~yu?E4fj%(3HdiBW(|8F75CpMw=9KeYh_Z zsnxnoGIg_*`DE(GR{p>NtH68O%aX%iaVp*769agT>4+XsyBU3ksr~Wrn@GIp%Rllv zA7G0qp$S1>jA(jxl;ww?zLre=4NzQnHNuvXOw5p8!cuLQy$C1(6C73}tD3}0BCDT+ zXS;0-2%lrJRGZ>~kq*gHY`4->3n7G30-J7BTv(-;2TZIk^O9yNu~L*tbi? z>@o;K6uF#ZSb9|qUf;#Bzl8sAK4iD966K-S zdDFL>Xv;JmHYCR+yRr$Ix|#&GRC*T&%tLBi=AnuFB^e6Ddv2-A{s<7QcLC9QEWGj! 
zK(sozZ(`-aaJq%xlJsnTOSOCU!Lsxeeve95@q2W7c%*W2^xjGmM~q0jG5&%1{8EE(0meKF~o@Bjc_M+#8^7=V+1lj!7U z+-c{uZ0sb{#4uuPLOkt?)#A^7A~NIosFtS?@G|3ZETh{P%HrR?x>oxtaiOsL=i=c< z*biGun~)c$@*Iek8%UY!BMQ4NmC;;MpZbaB5`YHlQvc9ugt`^pl#)k!taqr42KXpu zry1r3$2x`k=uI?RHPLM0&YQ#U`xDJWb<#h%y?9Z-$;JUX%JAC)LwE${1-c4`1q zT4gV#KGT(-U0$=6X+bqiFfY_+ zUemP0O*c!=OJpaQ#a;$dbvbk8vDLR2cZl4YX<%gPHROJqY`wC_$`T}-S|fBJO!AoP zI25d;nCwWwNXlv@UYE#D8CRH3=4d{-#UAxFupKp@Y_meZ1-{SPO3F!-ZTR|Yc)l6- znAp_#vVk(C2uB3U29-@=-4xc*JVESPbx3B*ZmbV)&knzYqFW)*&dbN;6e&Me^}O1i z(qW*|+TAJ_-m%=`kB_rEV#yQ6V2ED ztjFJsdz1M{FNgW)I#Jq{m;^>`2My0U=vI96DX zpad7N9!L3DkE04$kIE#ShF3I}KujO&k$J0t^*GwF9!KY~9>phwI964_deq=|SdXqZ z%LK@jbiO~`XvFO#jbuAhy0ijDBSPCdYd8S+$aM3s%pu}V8W_wu0uu(8fhxi*3E%Jp zXQzWl+K<)b#7wh|1U9_{U>|?bANDb(n}>4Wfx1?Bk6Us-Lmjlv`xOjbg?{@v%EH>& z_x~^6BV7l)N4m};p5Rb;k5Lk?mLc}j@E%8VT_)f`c#p%lFbMBaKI_~WzdLwv&qJl@ zZ|-?0oSvBJ^1&W)95M+T*yE$Zt$F~C7|MMet?c57KHg^Z0eC|w_a&)n4=_80a(~WY zyyHgo2jim$-Zm}ndd{OASFICy(&JbH+{#;Z4#k!1%`Rrre;*@Zl-Lr*ld#>hzwq{T z64`40_c-t(NfO!5hMjPdLP;<(ASHJm_J5*wo2}+2)6P~hH?GysI{j?WPp5vm_0y{# zR`YMuYJLEq-cOHDq_$-n!jF~x1CL38CX@3?9Ht16YDRxq2fHFnmnuR{D%HcRmN%qbZIi|VWNHUw0da0Giy?a z#*zc}65UzX9EK+N^YHvxk~L)NO?di|th-WsHPAKRN$QdU1?=Psx4TcPQKa%Q$BLOq zlu>0djs^_6;tdJ!5kmYQhCvq*rK3JO`fkY~y;kr%R{o#_5^%{Z-N;1P<|-unhWeI- z7=U(J*tz0b>fzLM^WbrDMmBb>Jwu3qCtHpRqGUSADy^2wJWNcN70h!8F#g6=+Nco`MOF`!R4huTs0p(fd8ghR7Vf zTu7#HhfzYcQXw%owj9@VFO(S78cW855>c$v&@Q(?lmDkNHjI1;GB+r*)<}-z7Mv$^ zZbNztj35t@sXURkA4$-c_e%(wHTRN3r+5G}o1^3L`9;SkM^72oHEyf43Kb|#zU>kU zif0!bjGehq0GTTY2g)8l9-%KW3^P`w?Q%*T3LdjY@R)D0U~jf;*L{@2BB^Xhb;!Ix zV4z%-)ls5dQ+o)DiiWjf648&}J4d&wY8L3)Lp2;ErYvo=d$U_xiVYV%lP)jP4s~mm zkADDF_C;*Vz71ZNWgK;jUS#aW*##&jFqn$xYa#~|F~9Q^FWY0=i}zF+wMn+mXR72C zZBj;3%li4eju(zD=eTA}7r#JkaFb7LWjO~dD{*1topnUTs*KHM4&I-yg$%Z=oLRcf z7Mq)ynDEk?F`ZbmyP;#9?KwyanWB3@^;O;v3vb zX5ey`8nElf&t_!H*1|}iG_Mb<(CwMvfBWdYi7X~U{sm)&RgGtt;ZH;P;IoFeTSpv2 zC#N>_5yBF}%1p?pV3UVjz2Z=7CL(=txjKwUhauU`DQw9eS+x^OZ5%$ao>F3ff8)K! 
zc@y`Iv<8(4^d;jint9t}@BlKlP^)C-0~)R+9^Q_J8dW{>8`SX4vDz(>HTy95qKZoI z6`x#nQ{DiK71)R2Y`cH^AOmnG24MN)HQu#9m_OJV>6up;>w6W>9vtf7t+qJiBi;c^ zz!hhq2GlrXVz4niGK$~QMk%r9%^&-t8?-w>ZQR&rHJlCG26bdDzi8fA7ygjccN+iI zNa(isb?hvRDq`Nf@z1#%-sSnB&WKkj8k)`TXlOpIG282dM+C5KUiR>A@)7Q&>(jjo z*fyv0R^;^xVA~kL#M>I71F>xs3g2UhHN7eY*9(bWV0iy`8{UBk_m4En<*9LixCW@R z$J{qNREBY~-Ijej*)OsP_unh|?-;~_&n=2=gT2NL_a|+*M|R8cR1Dh&R};g%9+-2( zwz1*PQcI?E2782|Kz~nhBHzuYkLps5YSxv8|L9RoNrh2OiH!6=?Wpe5Q+~>*&W&O0 zYSPX90?FF}Mg;j@?zfEKp;=Zjv1NTdu)cxew&GL z>y->5 zTXqw!TALQ97nv4SRickHH+J%98bv$#A8KyQy($fsd9IBFE8h*DiEl2_-uxjt*ha1- zvo20Ant9%zLKMW(lzU1InRPK4fsv$+o?FB)N~DN%qDYohcJaru^B2|ftIR$rCC>CC z^sXA4FtN0gO8QwP=Q1y9B9-1IZOJKo*O>gwa%@yGWN+WtX&as^@`i_)rNYS2V3`%F zqdnoAhgXxoTA?oUnDnycmNYHpPYyawJ$iaF`Qz3w!vVnAw)qjUL zuZ}YdVW$rY%UJYsyhX975h4=n`V9+`{VP(du|?E-pGDB5bR&HdY#`KelR{k)^E|h7 z_V_R9HR$PNc~w#rb{5i(l**$9iet$K3Lz$#gQqQ&8K;LtAw7mg0{s}OB1M&&l5e** z;=a9{>mxN)?^RJQdZqw|v#J?ceY6VTC96RaDHpP;_vdQmVvi{JP8VZ^3O`Hx9jHiY z-j({#j`~*xpN-@=-f^$gYrU)nT~hiz(>xGo)p6KQyZDV}I%C6JB_nO?t&v^yjJlZ0 ztjjL^xz&A^3RTEVmDFJuNKXO073=~iv@Kkx_96a=ZvuY^(q0SN|4$ltFI?opgR9-CezQH%&!m-esGh|Mto-#ut^T~4U_UcqaHNP6c z@FO(@p4+;=Rj0UPfRKoxe&paE=Ucid;9?7g2 zK3J^udq$)rRn$h}Yi%sZo#2BzHsZExz=pq+*O5rQWwA}-@#QRB1J7CYln&SYq)n#O zeF7dD+!C5f!F_=`PYyyi=NfJ^th`|Vqt^d%_{|oC#M)3>;mGXECc!pCQ#FOz+)QBWX0+7sn~Bt}Sa?77M`r8Eb~!M2!s*Szpv&Ag}1dl2~%s;;}X0yGIplZU?c7K%O#NdK;gq)bUc~F2{e_7 zH_0c(E^kDbXtG;J2?o#%n^suAIFDY8@f976ApxA_K7(k8Gu}Jx+-*=Gj0SNE3Ni*1 z{vCeD7{>6(jK_enUGrn4XvGVuRJdGZl@@ENxnYDG(h#|$m^#I8;17TXv1Fdp3-V;o z? 
z=Kl?ir`V>_6z{N{+i8wo%fD8&=t7c4s}>!6=CS#OKWiqFH5YnD_{`Q ze}MCSA8sh8K{5jz5L=f%^d`YH4~JK7KPLUxiIoS!={kN((r58onm&!+vh>OP9+f_U z-=ovzFrb>cY@#x9T&?Xwcj!NX|FZ2D$+a!q6|6tKVjW-BpI*_)m-VMt^zt>(-)+uK z5NW@Mc8sokG+ZCv$G(-3_8Qs$Z^6_b`EpF0AgP{qtKPkNy3VDIB_~?*^IFugm?Ylq zjJjAS!V+&MLf6BvK9j4qz=LJH1YWsjy>}gj5n>L4wV?$PruvWDNvI!w_3@ElbIzr~!s!|OmZYchTbiECZyB_=jC-h9cmjn?rh~%st9-Uz{_)M8hx_4_m9LVJSYc zdNK*vvyEFo)#weePCq+J)az9m)K*dbw>_O9lDii~M-MlIQBx0KE%*x@oeY|)i^yh@ z&n?&@F5uWoZak=Sgn#Ec3LuWNIUZz;o9fK zTXY!BY-X*QA>-U?AdU{7Mo(b~ju$|TW%f&2S7lh70lJN7%m=KFdqK=vB^N*(*E@?b&&X-90 ztHl&SgnIa>d~13bp$aCfOd*Qh;DzWA#VHz6{yp~Z-koA$eD|Rzg-a}z%DC2&p%(M+ z087vt*XcwdXc`7nIT7Th_t~=swk)=mW?d(LLXi$8BS!?$K`JizldI4J9K<4lX4o zPm`Ai(XKL{oidR!RZ9j`Wjynw7FS+lX}sALdpnDf_J^PGqNy@3)b5R}z6Ko;Q$qs4 zK_6={nObk^_OcPkAFK{jK;8Z8D475+UXSiR0B%q<$-@o)%)t%T_)LL{L~HjrxWQKp zZg93121o7yL|OSd{=HLWXLc!st(!x7GW%;&Cv7)a1Lhyn)nadjOzlVP|0~7EZplhz zA{?Z=040dt85!pSrpORL6KvI8*ve$}DX3+20JUz67I3?_meFBSFRR~K-$A0RA%Xz8 z*XvZY-dd|>91uZ}?p%y=B76L5+$P4FO#fO)IrXXM{ehoEkd=#+cWBRw+(aSj@hU8u zd6&I&>ch{(JHAtO9${osLa!#JKm_l!Mlen5AcHxg`I%RvmI=o*A7$Q2S4BIP2}bby z9*p4r==dvZqT}nQumNOr+!NzI@!w?TRd`_KvJ(~P7MoUp5v*i^AuAb$Be*$s?EXmG z|KLV(fCRN%v+XpRDYFkJ^H+}D8EKowS-hiNwTq8$U(a8(cBhXccp%cYpj?!vBiZ|b z{tJJSVChThImr_(v>{3d$ZoOBUUm;OOG7L9_ddr`hp3FIgpKarL)R{CLs zp><~|(e!!~u^9L#Yk=S~ojKb02&v8gL06Fg?i63sd0Fu#e?7oKPVbbu$qP}cH~1p#qs(}CLh`6eMQeUZ zU_u@a7fW4s%Y!RdrOH6(6gE(^yrpuFniti>G>q&9&Iq7E%IV^=V85l)J;O1(49F8| z_(0f)@sl<~w)C&)No?Vc=S^h@{9P|Z^X!^47vr(Zj@J`K5vA1w_}LU0cdq=X8zHkm zNc#Y2DUEhHfS}C*@E>VejWkPgdP`=!oE`SDk^EPPcCi`Y&hkwBb;(|}Fvh^$KQw`{Z zo%O7#rKAR8&{(mBdg>zrTK^E^$S(1=8+=0B4R$)zEz@07doVJe0df#|uGdW)Fe6&= z(6Kj{0WH{OOJmTO*jZZ8f|DJ}7Izw}2t~zp$7Hln=IrsHZmHv5V+L!+vDv3(e%!}E z_dTf6zZxCx);c{poLK`(Pv2y?TXCQTf7J;|9KnYcJleTUE46L~>^q823)HX%I-8S87=+iTS#(FcGO?SYhPuzU09~6ojW6IK0rSL zh)2^HJh4SADznSu@v-oFC*d#)iuuH{Q--lhFP?cHj!p#f);OlFHI0-YYpjFag}W*$ zS3|f)lO2qDgR-jaVpw}M=OK$V@@H#dnL>5UbftB&F=@h=p*=cOL9zI#Pz6sh#gy^O 
z?C8m4nuH8kdkcz+WHOAr@5C}&Vl=#NqTPIS=g68xBq02kyIv37bj>afq-%C7A+x3~ zNY@l=mabV{h*aM%Kqky22smiaD<9~kOrq3(`Y3YS4LkGlH1l*DS61%isrE(M&O{K; zbI|q&)X3tHubDyjfOaehzo5=17Kv>Rlr%aP7>S%>uP`UrHKSW~tEbu6;vV&O*dq2g zb2k596_bS>!WO|;HH#;lQH;j1Kp$qc5hTL8elV+5KFn$miI5=xLoKjB+2j%oGAKS0 zVGRR(0K}^24P(_k^AnfFw>+QN5IBV968jV35Dx7PI}9K~GR_hQ{F5OOc17B^(-p9} z5#0{bYBl2m2G|>Ed!Lu9$Zwwk_KA_z7#E}y1c3+#@joAk5Wn-; zM|UUy5niQxDXlUPWW5Kq(Xm7aL`VQz9*7WApIa?R0N0;U>{`FL0y{iIM&24(>KqVZ zfx8+rPGrxDfVy^%EO%sn)GT-Ur%rjc2>msmzhCuqEnXh8!yF@NtCU=mHrK*qW{nR<*9UnR&38V9i9{E`y= z5oH1fGgC6_1P&Ar6DI02n@yHTNG`x+ZgNHgzbr!FSFItOLZ-hy6Dl+uAK}76hL#r3 zT7XmY`q7d`P2li0BeHSCB&HE2hq0F6=Hp)_4m_!0)vMxO^fxosK16PVqTMEHp=)5CI$~W@K)#E*=0ZTxH!E8P$yI^#)kzFJ|{J3mJMJP{K{B zPqh`n9g#%9EF5)HzDSiHm8}QN!ZV!kp$XN5kuEpr2vj2I-Ng&B3?>g-=tf^Lp~Q=; z&uqN!t80}oqogCI^S{QbGF3ZqWtc=@w8 zxC|;M_XlK@ME1TyR)6ZUwW_(d8Q}h9!!~pS&|UnNyxqaP!Y2$SI8k$ME*D~JvJUua z?kCjOyo)Ws6cYF<%}f(~-06N8dw20xovj>x^|eX6k{}WZSN=3o^+zYvW_;g0h!(@H z7Vyz1+#H*QJZBBEVq*0yz!Nsgp6P9A(FAWfpkDAt!4r;i@PuP!&Ws#)$UhIBkaU74 zY{ER;gn76L^KcX9;U>((O_+z9^6-QWN4%v4ebIK{YXqev(mtFwL!6>4Gv%}#z(*|3 z;AQ0Yve_O6%=J>&OYr><2&Q3Y9^jOvASw<_2fy_r%#$vHTHt}^Qsba(~f@X zr0Q|x1s={m;U8WQ3NNn=e{uLRsr&>*~*r&R*wuir`S~crJR2Qb@FO)o?E2UQl>c@G%`dwGFe(R`G zr8*X$>|wQEr`4}Z^`lv)tHGwJ0^h6P%EAgR#B^a={zA#4x>EYpK>axHSHByJ*3Z>! 
z=wCH!S?Ik2kZB-UX~=8z08NLbHi`xijjoY3_)}`5Z78}~u?Afz&G*dymrAv;VTey@ zsC)>j>WRN%h)-#V^%X;WN<(zEAwH#{;X_bW{^yF&F&hu@DK+je_(Ew2fTSTlrFkyX z|6*zAg#HBYtTgn6BJ`1i=%*OP=VFD+W9@M8bDe7R!S|i)(Z%Bieo8~1A4m!T;TRfG zBJS%xK3`hE>4=g-fAHzQFuH$M;A1Zvz0e>06vxqJN0b!$gHNwBdZ9n~^dB0%&>wvI zKN-EyAAI@?MlbXSpI*-RA5l_x@5V=|<7DVVwN0W3{h}iD1x4sBMd;TSq5o46`h(C% zloV=ee3tsZVe~@(H2wxXv~a;i%a<-%G-ux8#dA{4i!WsU2fijr7d$wF=xsAdCOAsFS6sg^X4tdC%S0vk|hh~FPXDo@q+Xq;^lJ##6s^a z^us`-v zkM`3yz>oA%dda0e{U3|a7kg8*(tjGA5BO5)~K98U5=mYONXY|nYX9DzKecsSR@=-ZK`uqt-9`sy)eOiGY zOuuo;(DcV^3iM$6oK#z&2g`XYTA&Bh504E^PY3A1a<(OgPJdQifgVi%o#fE;Upexi ze*0UGUdDg$y=)Dn5BA&bf%L(CyC*;o_FFkq^ilc2etTOH`idg`c5RV<`|Uvg47T%Y z0eY}~ex^ixRG(n~f3--z9b2W;E`6|l{-sF2{q|^=eqevz6rcy|^LmkfyR=BZO%>_4 zWv3VJ8!Z3C+5$b;K0gl7gXw2X8#?`^v7zZ(0`y=x=LGs^@V@^Rpa;|M|1aKeFDTM) zPb<=IU-d-Df&JvJBK@|!NWZKPAsQgQd^z+vP z^k6+(&KE=&*b|_8UDzm!OVdZ`3t9(uH4*se z?>Ia1ekLxxVo`eTMWKlj2<^YgKT6MEo(@f%H#a>uH1VRPOG6Vym^fHLDTsJOs~Sg| zufM|aN98tE;4_zS#7|tPxzwE_y|1T;e3ooAhjFuj{GC@b4t=k|s>#G9&FT|}-x1Wc zCQU?VT{3^++(X~7Une@;Oo4|xqAaFo_)nBtnw~;#_jsI4`bKDCs(I0(`HL9`i<{H) zCt}Awb!mDox`}(3`Au^!UNU#l{NRZCOvhg&mxgYT7pb7B`WfQm4fFh!lQ#rP%zv;*{?5b9|8*uN ziU@gPh?grA+TTp?BJEZWL`H`6ate)k)y9!Mnpf zT-WDBzN(8a19)O$=;k8#Z@56YJpYP2wBO3Wu?~Mxi?g@kUSKfj5f4<25H>`Iz z%G(#+>7eKDDRTd+quu?xxvr0%e+1uw_t)xErPQ_HXR4>Ds%?WG|J?r{Q6ztZm%l2Q znPb2G{QoL1fAxTLEnfbK1L%Ig@&Ctp`3EEm-RR{XNcZ#k^ly6ky=L;sp>BZR>lBab28q&4@?67jgt23K9%gg5VTuQfBmzCcppX7k)XVE`L>SCG~W29 zxWR(=SkYhqT(PQD6P0dWLFn9M&JTG|!WEgOpnO{!?dbCao-p#5iy z>6#bC3ocC>^D_^+ zs)NP&*(kmvV?8QeJB@Y?gvnIvSs34vX(N3YPt6qKLs2sIU-+VNr=mE>G?4W1OOoop zcVVkD8+f6yuY^P{isGSS{_>W+oQmQNN=n&pe*0 zy6SE1KT~mU_B;N8ke}7^KFI$GVe(-;Xk0zB;DS1PTHiJE=#Sp2=y;ojb@upOmy@wj z{#idTQ@+(&Vo|wt=>3jw{2vo0sHzPu)1zWH`u zgS3ELx$9Ow?!ex!Z(sK0)DF5c2U;`tpn{fg#o zwKX?PRv0h>MvcElz6ou=m;3!FS)0kwm)wGg=qAE*ANedS(TpBif70b zwr{R4#2U;a*+cO$f9{E#cO;j6Jm=4S97s^Zm<_di=oV_f{qKbcHs#9~Ps^2W797!y zZXQ88#qq*SO|JbF8e2D2kv5}wdLJa0jb4}p=T5ls*l4^p-jr)U`EI}C2k&4&-P8uj 
zrK5-Z+DR0~5U%gwapSR5fKUB}N%cFF3ouL5-gxXpImpxmUSIqm@5I#PlIpkcnk#=@ zqne91=dNEeI9V%nE1SH!5rUs27wMWE!veB!f`T)LlGTKXm}!w0wD9eWCnytp_cT zAfBAz*&D8jlF45#RvB8{n~lorTSyU~vrJo6SgZN++A1TBqf^2UP_xN*r zbKY<++ZXurdg;UsB*~Sp@3xfSQu?IAYKYQQ1qO!)gX~d%?ykVw3X}zWuJG1@LB%v3F3i#{M3!PV0)I5V@?+=qXiJHUvicFhcnAdvn=SxvC&0->& z(inl07qP}{%(6ENsQr$o{dB~FG6mh zX$3b^bZ_6>WLk>24;ABkqWFP`^{pW)^zNM~kO>(8Ra%T6^w$q5R^ynD59QjQekEce z2KfY}3Z*VL&Zu}$3<_C8z9Y5|k?X%pE^a1d|FCl`$} z1ifK=zi`I@p<(ay6C*MTZHB!ECKBBr_6|_k9k!o@y@R}WWY#I3-74&TYGT;?0%5B& z-3p`S>UcbOoWzrg#4Od6SFf>$Pn3 zUdPY9{J*DIdic_0)|x`qZK5Xv&0!v*V&(M!qfMaIyM|YXD|vD4E5vt2N%c=d@BXlK zm!;gG7JAzXajycCSwEdQ+WDxzvVoG?@38()^snoF6N$>JZIHrv7>M{#*#1H-y{rz~ zH+t<)y}9*|LZGj8F2567oA|A2CCTw>w*B_wlUm=&@0(g{0)NVz!sMb_o?x4vH~fsU zyIUJwrolFzO{?_Nt)C70>CsQWegO6Pv^+`hf(Rtu+$GQJE#f0yCUzA z$a^62?#gYe*1DRI%=fZ8u9+2<9%%2Gc(w%DP1n4Wjos^uyxo!4A9;ObZBy3vZC+S) zt3hjad8W?q{PC&L(e^D~b_uJja}pmBV<-5GSltFj%veiUUYJ=X6t0aj)U!&X%&H5^ zOEQav(i^}XLg|Y`FkEBUI|N|u1+W5g3ldY17IcNZ`-xb{VrGRdv&Ad0?pCcV7L9~( znt@;nYk*+9?KA|&Th0K$PkmBY@#+fvfe2WJSJw#_@&9mCdhXI>T6dx9W`ST-zA4j2 z6|CqE9*iQy)D!%_yFy%vpi&CLw$Dvwq!{HETQzf&$&ToYcSNO!wAMC|GAiAxS1og` z?>?TLNBxyQ{R1~jVN|}w)|E!9DWY?tC|{1CAi7D)dz4v3Z7)P@O3Rr0gHdwDPptkz zGUJ9^d0r++=8l3HYDJT||MUIzgdMD4CX(M_#b4=veph{kcqh~$t}FeW_mXw?d58I3`b^lq zt-jFSQ?H8mtFXsv_+7e{@596Tez2b3+2_5efCIc9-p1R`{mOhq@kf;P@G)L@ZqZLK z)ecXjmQ6W-deZuoQ(^z-BC#HoB)p zqwP!h=?0#LOn4X8P}f@u!Y=;$UBW|)>Hk)`x5AcW==E!7(1K;LJ!~7FzkanXTjD^L zuxjlydqOb8Y&MnA9qphYPq-)xfYD=;@KsQF-{Sa^024mD)e=f_?n0qkr z`m=-n+`)KBre=_&6LQ)8eiG82eEE8{E{KoCyMv@OS}1p^)BgGs&?2mi0}ljobtV_@ zj>?~a?-_VSTlxhw9v%oJ_kD_B8LZAU0tI5>uG^#16e4QBePp8FA@K^l z=#u<(qf4^FSU!@t7n|L`a7n)0@3ru(BV$#>?vBx^G*E{S_-A7TC* zN#fW3Xb~y=<~uTNB=D!ZXZr}yAl{0mwpjd4nN=3=p6w$@LNpfi<ZAz-@UYt zz(VyC@7G?i%5Uz-Y_QDk**@|XzvG`>=D+GD^BtKUm)X6vkG$3I_^ONlEF@#|Et!57 z?_SzReyR0pzxI+Izj;MwzzTNH_L0f0f9Ti#e!t(mIx}n$?%6)_w${8~`{@C{d2L3J z=TCP}LEsF(_OrvPL_p|IchB|_zx5q{?T6Gefukl-&%l9YYgig8CNsWwh*B5+)`UZv 
zN`pGyrntA{kG<^PYc54v2B22bMlS7&v9#_D(0w1<1Efw&0kwfk)v=+fTovVzhQ-e3uRo=VxS`A-YCb_-il6pXA*^4yD47A$y7SOQe(kN6y->OrpqetPoNM3e zc~1t})=Z7xIrQ=<`^IZzhvAntOWr)Q=aZkxEzEr?_lLR5H_g0r=yD)KQWndbJi*2q zu#diu{T>>62XR;W?+uva^0oEpj-4-m zU6P$tIEpBwS-iALLsmjh)N{meCjVG}cib1>%^oi4sEJ00v-kOPpYXQoaUbMMJzPd( zzViXU^T4~vm&#&9b+u7W1pQX$cZ46;keNh3Dn*$ZruS{cBA(6|cNE3Dps*g4+LY>QvGHR>LcmJj{EoHrG<=^)#jbzK7Zh@179paa z{4V9vc@3vnV@(+oy3bz^WNI`P>8&{|&&d=Jq!eItHCDTo94M(1?<*#)jR2@FqZJ`g zA?<1fW!(geMOw!*0BGxjcvyUIZB*Lr*j>$DSQ^#`Guye(U->QC4^4}(EuxtXkjACs z5WMWi&{}JhTq2Y<_u{I6{k;*~UGNUUDCCpHqXlmhBP}Em#v2RXiGU^w4a`fX-pKOK z&wNyjBMf479^w7UbEy(wR0?Ra`Cy1bQW%cn{&~raDe(G;R(iemJ#TLPWZ3?)*IM9r zV(T1!t6D$EZ*}YY_?^^xF29IYz)!zXNG_=Lr!+{i@&+DJcCfX<(hauxQx>n{e>eX( z@V|%u{rn%`KOE8ERlA{|)v+Mk)4Hfox;rWjG9W8#hzs6HM70I)NWpvXf^F5pyYM-S zy?l1d7iSgOt9mA$HA%MTi|0z9GJ;Gc^oC&;wpF()bFO`p7gafFQ!cyL@4Oc*AlxO) z)#!KrrjSDCGyclU4v&t8NC5;tA%vM&mHA@#TQrHwLYR!uWF!DbL=S8p;X_s_JuoET zlH@{*LlV7$1T3+HuxkxAVdt*M)D$G&L+oN-%@F{#sL=bgNnyee%vGy;nk1$n95#vi zXX~t}O{Ic4kT9x6pnzqP#Hg_nz}DnvJ|k&rT@df#sKeltf*N!2ks!O(PwD{=z)5W_ z#Qi}$2ukYULm*odj)!C?!~u{IEegr3yhk;cCugG4BO)n*`{zR@jL|`m1k(kz2@iL- z;8R1AkMCtN!9fHb=IulXAE~u4KlG09az$oQ=!utHh}bEax=xhq?4tLWodAaMDtLpJCUdR}yg_DsY$a7OZiQr4$IPDG==NNdokQia zH>;P|ZU~rIwXPTsNYj}6!^}O!YE+_lBMT%1P&B0C%)?xEUX9=NkE9topB)62 za6bE1-WBP0+aShx0!6dN>bU2j<#jWU>M<>Z;~XJYA8evKBt9PG8_d?F zaOAq3w@(r#bJ!e2=*oP_Pd@&V7Qh{D01zj#e{L5{)oa>Af&6YtD8@SjAW5{hNev0^ zCs^{c4LsR1zQqGrg_2Vm`lay=d!J)P0Q(^1WfF259_GLy1tdX(iMemhv{=#mCIUbQ zqpB-_e`cdef^#jLi2Y%mRlsAXzxcTC&($@-0oW`qjp#no&hF!V;{3iils) zcBQeTV4F)qC2O&HE`Lq)SBwvY@k5l)fZdu=sjQ3DHa>Hc>VH+)^Ho(+ofXBji1&Q< z=^%b8A3wn<=#KN9MATn-wfX}RQ%h8>a-}`i5>oX-J9Ua?WE**2m`weReEe)NeuaP< z0zvsSBmj{5PvQn6xN^vv0CjRbpms?)%9B>&aFp|CCml(1O?Uz+36h912=8< z(wIicyW)djv@Z*z^+#2+GWqOh>aYE-)DZx+McUXr+LisoX@1w|V1f(r;at@%8Ih5> ztlP-^t~U{pPpRu1Q{iB3Pdb@-@4+1Nm)#NXVwC;4hXSNRU4IA8Vj~C4Y8t%3$fL7@63u1gIdU`?Z_D<^+nB$X^VtJ_SB9c<@lc**-D%-B zkUU5?PD&RDd4g{(SC!tt^P*(odpKEg5Ep=?luC675`*}Oyq)GB37DsvAbZO1x*Aj+ 
z8GPsC7Yp%263_{jklFZM6Nt>k$D?>OA3tA+PX??BgU#%Azv~OHz>dVn1Awy6G8|+O z35$0W;}>XOEj7}<4#P1b7+MHmFccnofn+YOZ|A$?2EX$=I%;nJk=85220|^y5BZ(X z?;Rc8`|;E86r|7|Z_9iALlM&^rpTdyC-UB1Lmwvy+Rg9$Avq(^!x{>#Ah3WXEKJ}h zU5e%CbbduhDjeP@NCbzw(}jwR)s7Yl@gs%aXCPgKm;uhkVecNWGnkf6X=a6&g}Px0 zc4}Vw2^01PfRWHTLMMXlpRLVfm9;6QS46;6EfhI}{KHD(n2B*+$2+99pi7v$UW849 zkirH+DhDPy6R}w)1(f&-n`#|OHHW0sxQNu=1j5?J&d(On<-I+^ zsj)CyiN>f41Eem-N5~kujMv>jM$q)32!=)oL`}>mQ>W(RZEEo#pF(_s_T10u3 z+I4|SWqY)dqrCYlPsmTRohv7rlVbc5MQ;tO90*eWt=7{*GPOt)hLDg%Nl_Q9s6F<_VPR@6n40~KZ^<9umeM`)VvpeK3s;raQ z-SNX}mf}$ykz`s`Nyj4e=k^J=wHP|S!$CcRzARultlVW)?F+Jx1c--Kmm!LcL4043 z?ekZbh=rWOjb$Q84hI6yy6rfT!1!!@nu& zg{+2O6#gAUz4cKtt=8zw_$kA_!{Q1mc-f8`W?gS+duGolXEhnn@749!t}DhI3qqb7 z`sJK3m;FSFf00YP7XO~hs?9Ol0)+e+{(YL&Hb3W`l<+0HBNsowpdzK;k+Z{y z|5XcwBDM4wktXojDiu;$>yfv@KqN?}zCC39uOl)9uh#*NLH0qX@R2TaO20RsecA8o z2A#$)=i>uG{Jg+6KFrz*fT;O-VcXwS^zuxYJ>hq41Caud0sKo-kiFOM`h*1>qaABp zJKUldh7xLLz!V;N9F&IMf=u)I?$F&|F$z*0`n%^OvvK=vDl#gR*0+%ZA<}Y0>NV); zM9I{j5CVZRDBtI>huMKu8K{ZTT@vLed~gr5c|CJC~m zH?p1|MwF7M+=`aXvC#@~33G?4%$<-i{@fwdW>_b4`>0Dh8l6r)P%R9)AFPUE2P+U% z?Iu$zRP#}xt)kMALhp;vysOUrp?Er8B^rHF~F8|8!9j z6v1td*{kf@`RQON2N4QdMP7h&kSrp?TVb+H$R*UMcEXZsV^&jXO~@lCU)!jbsVNNF zkOo>Uh~OHfm*h~)1xfW6gr0;M0pD7Uk`|xX?MCLQR>|6+ZnXv6Pd|n1r=AaFKQHpAEZt(VFf$yMj%qv{5zNM3 zO(X6rfWCxT+mt$)j5PP@H@7Y?w4d-=|8IULwtke~s@5F8)vX`ocT($n_W>H9R^r={;1-dV|(eg^L|>^;gp)%qFE z6{?YBPpRhMyEdA_@04hw-WN=%=Y7FMerp#_Y2j_JmGi#KcGhK(}L);$kvDcY^RXl1m?DMlqq#&l{B&FVx3A$ zrez~fBE<+ItnttVe(kJpqU;4cIwTr-4~O2+A%uOiRwkEtIZ)He1L}F{PT`7Wl+nI% zV(5wqy+guMDp)jKW11Nd__j)zvoe=`&|fKC2lRD-sNadajRG*05~7#Thy;4_N;IWH z?*(O;{I^5xBsMVqD_%|m@Ot$(i2z&#pUJf9@zTpbKtAcUhWt)!J)hsI)(`Mo-TJHi zPHLUX@0(iREPb$KK2pnx25A5OBg{S`;U&pmXye!j;pGM0{6~0sK@a~CUS2T3|6$T0 zymWkQj+3zL{;O(+e!`3*jV+X3DwH10C6{_O7ecO)w@IVs?M1ii+kC^7F~SMy!_s!- zk`pyV-a!V01b?dpurdD1B_t@89xZwZx(*e*y%(mrhdxS#j^94)lA3JD@7Vv_n(DQw z@CmdAa?qI^c!sxB1y}NS=>tnYSjzjk1@ZAzIA4e#F2sXj+$%AaDFCAi%ubt^wlcNkR?#pa-W&8}dlcuhC@Rm$ 
zuw9H0CPqdpc{lIky%6CNU}pg7x;0sp_E>5ELf$%AF6>RR+7w&dp}}*;xU^Y`dq@nQ zDG}FziWRLa#sn^931Td58Wz0PHf`}U!Xw%-7l{+HCeu8vYeu3T;v$1eGum`cc9)(d zIFj+Sc3m#+LD1jv#VJN(cQm|>B1?}T$6afMFU*Ab^h~Q`V~u9(Gt=_Pk_;zZTl|g~ zio=vTZ~rZgmvq<|$tF`Dt6?`52GbvmlR+*(-o~fcF3p-Lm?6l3NdUT~V0MuILtv7l z@#8sICVD@={n=M?5Hx5*?W&g^3H{b~OCo63LhnFT$XkDl{pO^bV4o zhMWLKe39minw@H#)UEc22oOGHgBSO80vH8vs7yA#ZvZ4r$uPN!r57Z!p7{WZa$}T? zA_~3FR;dzq=z>bu=Nya85!m#o3aPc4rG!MaJUT&1A!F(5dk9lwT={6lwA`gMv(MYh z@6tokGkaFbVNaNHoN1fwtVfM(BQdkj+eMRMw*~k2jx9}~Q zwjl}=aTIkB;b^yzbFM-%`Id-`w%DuI+E#r@yy8WnkN9P{zq^weoObM^ge6dS-m@q) z{KF;jEiX!lpG*G4Li z)qs)|?QMCzYJ%*e*IZ}?>#(7lw9?E=rhNr0sGM`-n>MSx-tinlXDQz4JO!#G^7f#X zw{mX0cC$6tdjzJbUv*KZp2n4mlvt$?BvJEo*&)Ak2V{f>N$a?vGaOhkxLn9wQ`-|k zx`7N-GNyH&%6;5nn$9gb>2c=;Iz<()*ry!cun@FFnAhp0&Q4w+it0UZOMMn1!S8&x z*tA!6vqi$-&U|!ej_vsn?Net`fmvgbi4!gvqEMXMjZNN=p9E2%gZziRv2`H5Q>so_5f$c+vmJED^mS~kz zi51@Irb;muybkkfh)FLfeLgE>bnpyhbPXE!AS0o&)5r*PbCU%L4V{7Vx~-tkxeOI_ z1A{qT_(+8qa-xYXdn-suxXG19pIaV!gV;zbU4wK6He$>SUmSiFHcB;hg^gIJ)l+F0 z2v;Q~_5{Ud4;2^=o&%r^8{>5;Ra{i~|pvT7HpduW~8pA>&hEg0fhI$+_8oF8{ z^YZ2O$)AkER&;*wm^cNn#!EA_?q%x0`V`i#CL!8#Kfk>`2l4=gP(R2d2fdhUf94g?hQlDw8Z9HxQ0jiLhU4?J zTE}CLRNMbF_K6)P1ckwB_j$i_%p5d4-B$DJuT(Gpzm1eC>RWAFW=Z zK^ZI{g-XwU=T?Q0#kO~sW#<-$JFGw}YOIuT`IR-L36wqnLy%#U5DQr76vVEL^(hfU zv(FnL?QTOH$5iw4((vz5)u`@Mg&~bYYVy;3?mVW>jVSu4A-EF?e2&1KQsPf(m8Vh! 
zLDB(24_k5c#0s%4(md-dD6m4OL5ZCWb$stX2XzQg{tHltVCLtb4n_ZBs6#pb(^1Eh z<4^|{o{(H&!L+F=Y5;c_4q>sThCW{3P#xUPs+8cEhq9lcj&C*ybr|QvsY&Ii#@0r~ zY*~zD8=a8!!xlHdAmtTPg9rshh9StaHZwpvwBMjVo{%_H^`X<={K%4s=ws>Xv7CJ!k0QRyjhAOmzQd!5^mLsSJBFo z$r5~y!w;`zZ-h!32sZ9UT#b|aOMkG!>^KIej&s^CiMQD?^xi{~-E9n6R3?THN%ABk zVbq9mGvn~e0u(-+IK?eVoyxQ^Tr9SObaa@f2wNvw(2D^s6bHsYBC&TtguFi^mSzu%?bXvDbJrXEoaHC;gcA&9#KEQVFTFu#&nS%J7o$C=&WoFlj=?q(%Ar$ zf!&fWD5qN ze@jtXxd9!}dSlPbwAs@gd#l0A6rale>=^A=Raa>_=MWUAv7)L09{NK+Jns~AbXp>K zqoenW;8kOEqSs6Sd!<$Cv>-w%-K|)Szwyi*}606N)zg-=BL8cy~e>30e}%truMOfDTp zx4yc-v26~E2EU_B83mmSw(vW#bs@i1ts%eFt+V)@)QbH&=Yo^{+F!|~N7r>6hR!($ zyg3K7xv>`Ors{LRz15Yk$@-dH`O4@E97jqWqAK64O58NXO|!4)dri7PWxtsl#0A3bkaNRU$ggK^FbXdC$n@Q9eJ$vRQy6c zE-R+z4=eYae^M4slcl9!bw{R~f`!}{OIofq7My>JNuT}B2RRNEP}B)WNPm6}X>;vp z&~jQYxZ)6gfJbn?qiZngB1b!m4E0S;Aa%CBPVpcfl#9|`gS=W1L3XVzK)>@wHUQR> ztwvUxR-Lp*gif?@Yk8`%5e1Bpv8XXYEjT~}w~08v^9Zsmt^1&zEm| zOtpexYlmLAT^Z8Gzu$QUN=r3~hEI{!Ec{a4Ez{={Oq!0y7-&0Jl@68wR_gf!l$FQVLh^Y~Qa2sIkPA7jq`k*u-+d*-!GiY?{&ThkHVOYZ<}HK1BZHW@WLd*;1h z59ta8D0Yu94u0oG3+W-mZM1&pDu-b(mxSeMG{;|xw8hYQ8Ifi(APQ#@CtNCpC8L!r zu?le!!M>}T4aiO&7gxj^@3hd3rhV0E!azJFWFo71Z?eB5F ziqcwkRyHiBu;h>$(Ed=xu8qmtWTJ#In-<9!pf&o)DWttyjld0}S>k{aef$;SuhCbMjri-4nS3>sjvwaCy>R4E{;VYMl7DjtMwlBSA86C$*FR2U2} zUJQh9YRCy)h5!LIb~OlmH)2nQ-=1hu@>GKd?J0x;!f2M3;s8yT9+t+-Pd+i#hJK9# z8C-2b43Lc_pQ=enUZn@#U5sLrl?PeHQ^Z&|Pq0t}1Fk5sGaB%Uc@QmFq)+d0-YfyQ z$Dm;47m?d(H&zKkeFI8I4I$5p!w^wiYZOmSt9YWSBSKO5Rmo^+BM-6t{fhMt>Qynq zVuLv3pbrC$fy!|7cmpR5d zo@&3^7i1r&2THCHRymYyx}m7SIIYOMhg$GWZu6_`nohA%5P1Vx^6|?wLLHGN&0@i9 z8y`&Wd>$;e79M`GWfDEjavU>4TU8bLP?_EqPtD}A{r<|&60Z$;6`iEN{nlFz-S;%P zqAA};3T?zSWY+eq<>#+_04_x9NEwX2Q7p42PzSnk`9Od`ZW&KmDscAXR%NdIhxIzU z`Ls%49F!nRoVD3o|;mSb8_xS_QZR%u3>I#L<3! 
z8E5fK)UYVm{`@P5e2x=(g^ci?pX4OKizIy5u7Y-MmR-D=8J}Wdh>bx8j}9&8@A;&~ zh_KlV)r0)!psA27{2J?Y+7EIjr+$XfVLKP!){Ino2Al0Z3jG{gxa3B9g%qK7NuyIC z7|%U$0@DqPB0Xt&M3QJbifVa~fpi^2W<)laMKK%d_Y(R&W}ij#mm(rLyA=2gVGSon zFVl>}c48#i(`%%_c4B10nu=Zxqp~0Je#UQ*5m^lfMka7_!kWFW71pGRJsfjMSo0gm z=h5pqEvrriOJ_aZI-{*L^A)xA-xVy;3pFbhEbXJMOMY`quq0ih*ApyB=p{K5id;Ll zsIk8Z^RlJBkUi;lu46;IFq!qPARftQxA`3}0)J&pgW1oPg{A$Ho-9V(9F<-GMrt4% zUj*qFJ&4E2KM=SeS}_edr;-Ci)Oy>+7>MAii@YI3*=4v))5l3dPhMhWW70`ARg*KQ z)JP#mnm468+qY^*xhL|Ffv1_OXw1(oGMDYSW=>T#1sx67kb85%+X|Ui12y{3UO!w$ zEm-TYDH}TTyt;z71##35SpJ7iUhZvl6ab=Qzw-q+1N6Uzw6(6qhBQMDvdryNn}u!5 zp22C1ur>ojtRoq=d{Rj4=Cy(XP~>Uj6S97Vy%aob*Ic3b>?QSnS3fKSiG&p3T~^yh zuB}D^A#9E?PjG%p-F7Y{wS>*WF1hj#WeygVQ!ij#@ek4>;yfU(5IZC+9$~bx zI6x|CNW4=bF|sxjV;!mIMMO&X7z@&;wS~tHpfyom+4*BD;XuN-CB4903~p z-2@2n59$f~wu)zEzrvhl4WHT6YNYE&_!`mZ4-+VqEn-CMVz=_7yF4vlr#z06@VkE5 z{F5vc+u<19fZ}RN;diY?FJb7Nx0)Vy5br|fw;!00?C*JKy)jbmm4DW;0dx(S;G-Pp zL+y2ZR|p%CY&H6p$vir#W(lRa7H9#hn>5lk-*imyS6NrhLdc$Yc`m!r7H&=gi6S!T;*p z>(`#!4Bhu7(R;&$RcuZ~%?WP(l~??eX%VmCgV7Oe3l7y=jhZG|%~W3RF3n<-)!^pY z&PVtbHKzrrI1yTnJb}y!dN|vOid9P9+pbq7L?tNsf)R8MhAQw#??a-;=y&E~4c|6r zJOgJ!NZXW3+`I;tDd^Lu*Tb74jMo%7r=?*lbhXtwj-1P7cVQ;Di)mU+2AV%xCGgZ| z;0*;?XG!BHtE?O+RiIxgUu4`md%EV~Do$g&h#;Dzk|_HHZMTqfci5uD=>mOgAxfgj zX3GpU#8V@E15X@K(b_)6K(s6tV72if5_i6yENEr-lg*R(%I@Y6UF}tEaj!oY=Mx=UFfZv3Pl(2Q6cH_@&flo^>>ZYa6$PxO*bSBgpEc=AT3JcZi1lTAnaH)j_6A% zJJVEdfv6)HYPAq`mK1B==O<~m-(GLjo@hHXvSeGn*kf5t|&8~Guarjj@^gd@~9p}xo;Z#%d7$^Y5(+?h0z(R0(^5Ir}ZMlynK`hP?P z;TI$5bUdhubSH)TE{6F0fNQMusa~I+8>g(ah2ziQ1ErTGc5zM+g7Y$+_+SA#(2W zH%!jGbOt#Wl)u?QKT=E{K+Xk-IziwFy11R77u=2s6TT9@eg-|~1hpcPdW2QUF0?Jf zVByiw<2+6xEyLlRbH>qg+F~U*F`~t?u69y#tdg}c$bJ`k&h1o~-a)>lq2&vMbG#pcEOBwpD!PEB6II{ca@6za_Lw~tW=f>-ur`k`|4BfGo z_I&BuI?+9T=R-G-weHP+=biS2?>fKpr}j#JI&bBT)Bc?dnc7O_13YM}8mK=U<7W?E zh$#jz^GJ5spSP8ksUz{$=}!#BDoFLQCHS4&O=#*yqxB2vz`-`1tZMRCZ_RZa^g9AF z*vOsF%eWC6x}Utsg%ke`>hmnExG@*^9Xr8>+MmNEJhGpSIJK2O$hmzIXKCC4s1foG zU1yD;o2|byZpeWy^-pclWU6hhO@~`2(Vo_hcx%}UIqvrHzWvZ 
zHpd4U3n&(EAUh12P)3Gk@6AA>rb|@h4SISHLBci+-G=rM7iB02_k&`~m=7%2hh7>I z)39&ol1KG?VCdlocshHDyB~Bx-b3#BJNHpW*8CFU4ZaU≧6N9XhdX44&aa!+g9w z@J5HO>(N0H$_=>AK{erz`NZvptAD+#|HuDA{ZI0f%XTTk?`+_Meu`Sx-_Wlq(%n^( zt6VqoKaK7}x*8xTulHAcaRa6N3?jTxakgxx_ASi((j z69{GQ-NY{@ULu6#F<{fF3hbX4=zxAuY6mF?G&`Ss3O?A)F?s{lI5`$J(~LA ziK(yxs6k)=mU@D=E>oFAsSH5H;nqUw1n=q0m}y2E3VoUwbY9M8HQ#Dv_ss_tV)nK8i8Z4Z+!PE;-GC;img#*+JP%=Qh0EGk83s5pZ zy+Eo4OH9>ZiBt_-g;%ZCiEEvB>lQ{nh#$s_nBUP$TcXl~Xc=hu*6E8B-FnIk*??YB zwF7T^5sUK+=}tMq@uT3SnUYeK1A&H)|0myu%0ZBQ?u(yAQirZV+T2y*6VC2bnH5U!d6l3qjy~sT;J> z93)o)PrvhSCo} z4o9_W^V#bizP|{9ZRp$|j*hN^ez=%jvf|IMb0(?Bc9sku_zAgl=-mn}U*AR<+#MsG zv|`ou(onlm$MLgqHQ#{yPTc4$7f!}!1CkJ^g{N<}+ZI*j+2v+bcDW}w z<}i7nhO?=B_44$#H3`O+iTAKQ2=+MiJJx?%lkmwJi7aLp<3XxKY5LmwLP?jS^s-ip z<=;n3kF#s+nSV$F0VNC(T9K{$>(S3k^~Os}r$uQCK#SIrD4vla%_h~T^>Im3ZOR8a zk!vu)K{4JsH<^}h;DeG23UFz{0iBJBtRnwW{v+$FT-hg;7Ow2mCQ14>r2&5|Zk-`Q zjA5zVbjQ$wht_ZAC<`&)xU23|#n@GMYyXHzfZQ8**WFhLI@?`$@`0ed0Mps;y7Ly% z_A!6eYKtX@ieJn-N*uq-)h(}c*WEYYMmxCe4VRkgHWv6=TB%!vv8(Rn!C9gym7Poe z)PqWi??NGxe&XzDbGz=koBjz#Bw;HQ&?IQE@YJsSYm}$E?mixI7;f$w%x|(tvRvvn zSt2QUMv%h>YdRa`ZmJ5~D|g);9=q%AFxJx$?vDYCH&8Is)x#rKUaAWRmf|{84|5%= zlJrp@g3iz4I<)jr!7_3_(ovxm0|}g5HHn%*QEh}vYJ}@h&iWDIh(bcmJjyI9#ycuR zx+#1L1T3|)tYzX2U_}$nRzJ*NC<1CZnFB9s83^rbHQ60ml6XVvE|$IbIu!&-p}7Mh zxio99^Dl~8<7w}qB0hwyoeUc_A^~Nc%40;Dcvd_$h#u4aW;%>2SD2tetO@dkq?z_^ zwTUCwMpl`LBmW0@k@@Q*Tz|#*c{xuM{>~x8lg};Af$}&|W$ctFbt2ji)*Dl}K=`&8 zcbd2?q@)27=`Se%wH$ZRx7`&#qUuRd@gus3kMcJh@#oH=buRsPA5pn$trU^58dnP4 z?aE|0-R472G!qoQQ@ftZisI*sD52)?7s>>zb;KQbFtoU}wr4vM)!c|a=iG=MVi#1m zQ2SEN%-S;2S2`QX58bnBhf2|griS3MP57f;L5%bWL7J8ZzR_C$oiSgc3UIzeo73C_ zB!$=?7UFcS>%e;wN)Ho}N3OB*c`J0xm8h!A#r@=|xDs7Lf>*l|%@B5mD^Wp)wp}&R zPAOD8;x6Y(RNj#_kOm^9BQqbB$GVDxKS@%t%5jVX1BxS7^la7Onk&tJU8EyJ=f7?V zKm^5p`uy7YuY&;BNs#F{{=5D|$9u4AH26k>8{9|pcu?q)Wz*SQhz^hk;zo01E|9eTG=6OgIPF4I z@ez$5!K^O(sB#za8f?6lhdN zWC}+FI$a&2_D;JG#qph6H|9RHhDcord%)J|C}2B4UNjE7uG8rsMmwk7hvu_~{jSLt z{8S;{Nds%BnzK)Lh@wTZm;kV$!gb`yIrvQmP6*k0DA(`ohQ!Okr=(p2t%6znj*~^; 
zdF*DPhdAEB+-MCsexaL%Zl`)f{pq&&zk0LK#nzrKr`4L?ECdDls+)!4$(drgxdy&H z_nM1^PQF(QeEMoLl+b>8`eGq@HahcSp(YC%f3eVf3*n}jA21|V&LRsLf3Z-Dg^a&g zsLetu7YnVjugb+j-S$$MjP)ssViuew;M55GNb5WADcddu9s=3=2nVwe_K z9!rA0Y&)oDeh(b#X?`6?~LaPX-mFxi)UvuWgLfw`bg+Ysa&Ba0+TxPgC zi+}5@FBa-?@f@aG{J4vS`U&u-qtIay<1Q8&un72Wix_vY(6B|UwYz4{xL8O)=ud~Z zN`dhm>BT|%@k{m_a^i7V{w z-@RDqm*FSvY8>S|eX-C)!x|x&O&1F_5{G!;RTm32xzJZ%EHs~B1R<}wSZI+;d_mKh z7Yntx#IL?ssLchx>SCc)u8db*EY$5nUwg681{ZPW#X>zU@cZ2WqHdeAi-r1K;Hxec z8X%O8e51#bx{HMz2oU<6ehYOM3&DRd&xHQU+-d}dTRN37Yn)g z*S%Os(Qo8pA#GBFG=m$3xLQa;gK-xN*}dI+pLh3m+hu?7WZfjLY~JvmH83Sy&Z)b* z7f#oPdG@=zw^Z)#KGI-&v+5mxceni4Hej4Cpm@CrjDy|PZA_oLEM2NIl0S&slXRhSr_RR_35#h|QoCDQ zte^}fS|Z%^)_Uuujp3J=R}b7jre(`Yosy zg=^61X{%h~OkFas+=0U%- zK9D(Z6Zh5JwWPbXJHM?Lm`+9MxNbm6l236Nw)69s!vaYsKdLBrKefW%h|1L=o?~;I zC!4*QRb2FMO?Prx>5$v?woSK-UPo2RHol~W{ zYXL?Zg$@&7nU0y1MF6`xDWw_cr2?b~6-e3Zz;;NTWq|C`Owt(oE3X2&u+tHvsA0&* zFG()C+v3i&)RF1=13D@=(^6--S?Zv;^S>>17&H#N{xXaK?~tik6bkAl4hi&g@`We?j6Rde-%}|)yi4#lU{7RY_C(NQPp~LYVT@)I zGD|9?@sehzP1DM$3qVv3Ve0aM76wWQG|;L>=YuL*Mmo2+U5LuoSmEyE(5ZE8v7DXEcuj`Q1d3FnK!!A#6t?*Z9D~93sEtL+kS7qORyKswyTSJf)nusJ(>DC6=WCqoOBE9(Piaa%&p_(Z=|<^!%DqXh{kd1b0Z!&DC+3EIQX3w~BCRT^ zCgP*c^gQpPs}j&VJskjZ{f)G2LHNsj1J_lB2#w-n`+2{0}){%UTeGdAZW4F?8)miu~8R(36Jog_l z&>5vpu8!x&E|!0_OR3|Gih0uAxlc`U`kXJq63_UMYpj2`;sn!*pmAq9ez7l^=WM2> zAHt>7*7^{kW#{*ijiS!^PHw2oW2e#a2-A>E<)-=P@Cpwezn>%TFjaXFKJ4wTSmJDpyumP8(mA0F-5cL>SVfpcPK}wj1#E{h&%vs!o#TSTYF7RXTb%#F zj*VwYLwTbIby# zPU*%haEcf=noE!hs?K|RUuA$3WS{W6dYsy&&@L@;cB&F3uTzU|aR+FgLz9vw3mf7# z!O(N(733|(o9IZBvTG=6c@e(fz*9(95fS;9*^}dQb%4X2`F#7GPo;)96O>wpI15ul zoE|0Du4dUd4dh9wjrQsE7%+*^R^@j+h!&<0^=`|v)=?X$jdvO>4_c2r(e^Z2pzF8D z13lIN0&HQQ9ly~KKB_NK&0sCwL8(F)TXB9%O+-! 
zVk$gL<)HuQ>VL7l?c|GM?0FoA;cJftyrkiiYUG(_kPP=U%| z1!5KhDC@|wTY>(1^KzgKRWqjg4AFN?bExcOmg>FZf9ex zbI#@vCv74oMch@0Izg(2P$Zj8?aE8Jmy%&#13bMxdfi|(;H|=* z&W?l;GCLf_iI3J$GcW@bC##!~&(r`%DJsCRQuLYzb`^mFtUjLf!GKTXn)-i(;vi7l zc%~|iDbYkd8dg@kC>(DWFUl!u;Q=GDoTNmHL$!{h)$&>VvCJK=n733&HK%2U0S?!B zgyg_X$jYhEd~R9T$u_ZSxEhBqJDRTaV+a62MukRG7h04GlRO5mnO8c~#6w40>Xc4b zl_H2=7FnsjhzK{RCC6fma+W>P;Z_NLvexLeCvw>X@nD72qyXqOOos2aOTI1;3BzP^ zihp<WkF;-<5wdOy8Ub{EK(`WLF(`#EI4?EHSWqJ)`8%JY}(QDv)V^RLQ z>9qtptjNi|wh9EUj%0HfYm8)b3i4g=Bxx#>O*fHZ%vFqzx2_OuC%Ly*SlFNYY@WL- zx_){F#pZTs?e`l%X<)Y5JfxQ(3@wUH*r$S^J3nm{+oACk+lbQ9uys7gQ*1*1GSyk8 z@G*+*ac62Xfk554*KC$HsbdRvjDEsp1g#x}&qf})!0HAiCzhu8T|13l1FLw|+r)ttBzT!Upo$d!#mpc?X*yGua2ug+*sqs6p@I zFoh!Ymt;Gc{1ZMPspJ-*eCwlOR~saUOy*;O%Oh7 z&w&hN&@P@_D0i)5H`IF_KU?T2cd3j-$*g}XR^28IW?9D6*)F?W&?{YZO$iKV>8_ch zd^a~3|=m1rl zhgcPS^rUB-@`|M^zkn*6U0tDRb~r2)JF)_59r&pnz*;p42D|O)bY+1QuIB;>PhZCs zZ#xZxDQH$;aKl^0UR*sH6Cbe7)pbHW*yjlxdBaZ%9&&y-p-^?Rpir1%F)O%(>Ts_d zm9{ags$Xjm4($niXr0C&wK_su%smelMD)XA3IvpMlgTa_JkkcfB}A>KAsR&MA47yD z?h4m0FwJaoSBCHIK5BfnnlXOM$!UCSsJx^RiJ}1C~2mO?p1;T0y9yET3zh|E2J=Y(%=S2 zExdsucz+KMk)arj7vt@b_Yjma1fn?xg4BMJX#ZCpPIzqXcqS78Y*Ea#7-MlsXykUJyu!&QA zCoq(^A;JbJeX(L`3tiZ$*owPoq9uJ?x=Vo%NTPiBuzt2%5gGw{fs}OI6&SkKW9LE2 z{G$OSvdc1d4l9ve^+At#Jxcta4Hvlg*Y4?xUuxg;3aXDC+nF0JL&#UruE&uP{O-IY zNIb;_-Z7crG!&C1KY@e3574|vEy{UY3#C(l zJv+0~B4fSgRlzqeBoM|ARsgy#}) zzt*Ci>9Ny>2sz73Y^QZr!HmJT3D?Iz@g zpZwc{j?5^}!Zql;Y0S`(v8$9#i-r(xj>aYc;$<-(N3Gnb^f>SPMa-ae4V{oyU)i>Y zWNB0^b+yr2-LlPngbjuA*JVLesB%88OP4LDnP_G7$tTThh%ZRyR9$@y9;~syy^-NO z2I6gDXkTX0hZVn}R9nyJ-7hiJpoUluoA7z<9G9ZQx`?@r%9mjd6J!>}eV8jbG6M=v zq7e~?W0YQx4&35R{@jPr2dQ|et|bL#q{&UR*xf9-0PokOO$fimUt>Lx`c?!eG|6z( zU*lnmpgx~`l-r=4fQyDfBWW8l2q+5lg`l}Lq?zS+;9)%$De!da@bn}qN z?2vY0hDFIVR?X1sJ-q7=;@-q_ylMoS)xz?1!tkN@5|s398UnE((%RBui%PUc(|o%= zM2so1_$|d+2R@?uaA(=02tWzP(??}HdP^$)b6q1ZhsGqS2y%r+IHZ2TM?FK>5Y)vxoQP zW)F(%X(ScS^>k#er=y7jm`W}<2Q4XFRaab32OtD92CcK{k})XWr@z6F$osU@S1;gd 
zt@N@uUBL*e_*j%o{p}(*hIn_$e6#wiSdK!8Jj8Y$c^m^BK>w}!$H`xE-t@{fAHiL# zN>W~D2m0Hj<7Qlv7Ck&wA5a<7ZYIyPAIs{qUf$$D?zpy=A7*?_^X}|CZzEBrj4^Kj z{W(B^I#J?kiq*DCI35u=`JKmMJay9uQhFqEytGLBR6Mod54O06hrC#H^YlAgNv>@{ za17eUwKSzW0Od=U+%EPz_rTPOOP_C`!7cvia-#(CaVZzu3} zTJkQx>tG`U;ohM;ZO6K}UdQE%;p$+aF^o|)h{9r8$3hQiO&9~bemJEGO#n-|L(V%olmTOve*sDevwP6yS|$r`K}5s0Wbu%G(icYN=ILN*(vKu`<* z0lHS$!Ah8*tw8`VUD2kTFV>`D#|L(UvdDb|bxKmc#*R+c$H%@1zX4lAf;7 z`wVgisjhyCFI@rhG$dNsQzIqOq#R=6$rGCc%2~g83i{Z_M{(q0=Fq#WwoTrTv zhWa?0V5_tqOn$)!$~np>d;M-+@GbR~e9VQ{2PItKGWu5v-bQeKj}mbWxOZH>5H|3| z-7{RuK;mBQm)azah5t9A07In*XE*!`^Pd!kqqu(__rkLKg0g!(ZVPDrBzJ^+tp$E3 zw$9I9##3L>l+EhCl&<(*bjWGQkbB9Mqt3%7|&~3rYj~OO^R+gMGnfOMz<+oWELj3cW14B$@S3h;k$hn;m%%vfl?7wL!b;+))EpaaqvD zyT}uPuvFnFf40U*_#OWS5RsQAb5dO!Jfy2Ho_EdBLx zLBgow#`MuFo=RgCCb!zDxamsTDlmDtV%Qklct+(6_)glCS|$+hnj&pO%-k^Xwl+YQ zw_DmJ=tLQj?iNqC1+qQpNZl5TEgJ5jNu0u%Aludp_i_UJsS6UUmbE@m4it(6iQ+gh zoilK3n+jzkS!YkDwWssMMUtQh5QO~IIjxQH*TSp0W7op>*@uZh)0NQUcbuq!^hPoD zX7<6Tv=Qg>Ye|N42f+(2I%@eSU&f8_pN1m9dDVu=1n9W)(EBt}D5_|2x4>8QkU{bq z7M}!eO&Yq{$OD!hfQr|g=#QO3!=}kl1+1U_$XeJlBvcYg2qn8g*yH49EVS*q%uJ|3 z*D*W64KkY@jECD$bMZbnn@J|kp(uA~jlR!2b(vz2T*WeQs)RZX7@RB1=JpWY-fTRnZNCODh+qrg9}+6j-_S zR0o>Sz3-qox?!2?6Lk=5x#ho)(dW*yv)!?5;+dl)_`DN1N%nLKA6Zjc-{It5KktrZ zlL`I%cPz`lAW2QEbtIJ>(I*oUxr^SPqE@?NIdcvz=bBX4=+lm*?V8kcP*rpHylWNu zoj(M3iI~-FUYyK&KbEta)E+)_S)73U3aS3z_Ra=As_M@B84@)Ln~54N+oo+=<2F{L ziK3gdcALQ&oxusFifw9nBru?;yyTHs={{7`B~j*y19Xcm_C>S1tkP#~WxKw0Yiq%ex6-Dacf<-+soq0-5jasOtSpe16* z^`Ka0PIV}fIg{0v!Lyo_WNL?ofDQh5h_KDx_*{I)sHV_Es)Gmk&)V|`v7(QPHavF` zRjO$$#!_#j{t#MgPfM~7vDafI)ACdf5Bhy=PI_okQ>OiLL|)EdZ*uy2Gs}yFwxbd3 z<2Dv+t32ip&5|EfIv&zfh`HAsYj`T$)z1!s?r>6Zg-VjZ#4}c6)|-?o)JvMTaI>m% z!FEGrbt6w14vbh6FM#~crl}#RrK=vfQ+Dd@d}1bOFBQN6WREgt2C_5tQzARYyC8W+ z;LuDCO*Xi75BptoT*O6U0u2~VAxWjNO)_SNRhs;sya)?cO@d8Th&#Yz6%d$En;6vIPqw#fF#x*Z&=gxUgt z=k}|jFVG+>sS{VkBEK`+pVQr57~zu7b4HC&tvB=41S(KjE9DhXbbCq|cJtamjQhfo z_y=J(^W440W3^zY{j|`8U76lMl zO}x%}tPCR{vf2|yKxB2r-Qe66BCFN02Z*eXL2o1&(aeiby+EJ)}iz#9>T^6 
z?wuQVNq1A2oV{Mq15~DDW zpz9v%!Yl%l!m%#_lQ!rJTL{=5hO~4`Ffb|_&CGf)6MHu|nVl|*hHi?bpB7Eg0B}SH z$2Yj4SY~Q9)&V}Tc^ZsXM+JHAQ<}2ItV^xRi`9!M^hjLWGq!1J=J}heAQq0rfji~d z?bUI@&w-~2;mPwjk8stLLbFg&UDvCbw?$C#;ydl7jfOOflw4*B$U8nS7>2Q;GG4d1 z{xN!D?2Yg)=D$c?K_qymSu$nWHl~^ByIwSUVvG%(0R%vx&y*BnLuvJ(kR+Jb(w1!| z=IntbC7hwjBJH(&HDvZH7{nA&AX;Md1S0c(;8*tsCF3QDM)$GBLh!4*imEJH=iW`x zwrS)(kdj_ViSyzKcU@=9jWY*72`hYIAZU<;J2&uGg0WtX33yX2_oWKNA4d|M2yLE} zDfyR4>CWdhBhx#f7`NvDOl70jd`hPJ>dEO<&pUdN3dJ(#A}oFWJKwfCuU5=tkzWKjc5lssp4Ru&$p!r-h0lO~ae)vOs9`H7?L971f~bQG#&tB=(jk?-u^AuSb%Dj76YjRLIEiN}Ft7rh~V!O&AP z=#OHcr~aDhSCoCwXB;KUxLKavJ?PTLs9>K`L!K;Rg(v{bG5x-5Pgq*0hFL?7C;JXF;b zdX-tM*RrWgd)yvI`<5P3wQq5g+3iBT{mLtY=CZ7qn^FWgoRHhEY~gv{HcIoFw=zAr zUVv4g2~2HGW?i~o=fM~%(C)V1X(UVY4Y}E&d8sws;}GuDDT}0=$MuK*o=oV#`D5B= zQ*g^oMr?vVdxz{-G?za`7FK$4Ji%Mbb``q7+_+tjGnnMYEt)3W@U-WskhhW_{kEpO zDanO`{qCchd!H&dda^_Jwv;+jLPgqW)gLbCWly~=P4@gg?xT6PeHMS7 z_DD0B8$Na7wq7f^ZxPF6`<9u2E?YCL?ZYPbCz;$1x83brR@vlsUz;#h2NTnaZ{N#T z20|g+`3Z*Xu#JmWqTt3Qks)*?Gst{h+z#fWor{x#4Bojs3@4F|SnT9!RluV+JQVJ# z#h=5(7vqTh;gJIr>v>urVP;->!DzW-2q|EiOocBG&wtw|BpzeH?BD87eMYn>giUmY z+&q6TfPuEOyP|pNZIB8LN84UN6+|o;W^Hd6`|i|6r(}qK5{1rq6hcH}H+10-gXqGs z2Wmv^(6$1xB%^lzdZ8?MG7Tt~8-2VKJOY2+hBp~vE2~`v6w9o-og}0FaN|JUq?Pj@ z*IaqhHVwAiJ*fTttNoOC^>J?kZGN1j`onE|vBMU5o82nuP}V8qukw^JgexQ&w9%$t zf4&+XdhSW zGWf2?N!v%CftW;y{LhR!ADYz4^WEnbWn;x1d-J5Sk@(^V4p7$hH(^<;&9>w z8A*g1p>^@lK=xWl2zpPS5tlgP>+l$Ly$K=_!#}HpiY7wuBtq*GxK)K7ns|4K!iO<; zXNO|wDVBbfc7axHdm!&b5#+d%MEVYyA_1-LOqhDeV>&?u=1{<=eZxSDVeZMzed83f zUQs-D0sEqgsCQbX?W4p|HcCn&d67PeQWNtQfdxv!KG#iTqdMypC#E!&&=k}CDj zR+~4a^iD%x`cbt@DTJ7n z;e;L}Ylb-1>rzahINPY8#A>q)hL-EtunA3?GNxaMHar$3hQ1m=P9ZF$H<$<5H`QV_ zeuqPmms{4yNNY+xQTIY3{kZk0K57yjmFp2FmXDyDtdx~euWm|F8BeX(plzw!!j#ic zCB}7+Zl?R2G9}COVr+=f`#3+YXm`BIHpzmp@4Dxr8BOUo66w2$pLt@9R(_-cF!GuU zwd|Gd4E1?imMV5D5S=VjCW0hc%B~15Zc8|Zmwi&oT2F}-W1eW(;Bm>+YL0~7 z$y9tH(m-5HBK~f~df~ws3&OHrvs@(7zwiEWk80}vg~lZhjI(>v=fES^GmPh>YYc_9 zIzVw|vhK@GL>;Win78wpaZGqS*6CDMJP<_8sP_RuYjRJ>C4sNxPzWXz&5 
z63tYUNi?&X^EjC0V;POn24Y>ZgOq)h3Q_y|7{;E`ZDoHm|{k}N9<={Th&l-)(-tMN>yX)M{Vu5!i} zQ${kjklqnKjz%z~8XgOu@*o=c$=jhUdKMEz4f7Url(y=3V4tss`E)oMu?7!E;}@i5 zAK(vzRz<1M1*uA;uF`kF!n=cm!c--Q9qM_EG^Ng7gi&NGREuHxHdLsQrX-2t4I%0y ze^a`j-ckD`ONw(OA0|`|w;dH>RHUu~ zJo)8NpKsmROnSqHaHhY6wwa1k>j35DPTXn8$3q!5e3 zmdH>kO7jO)W#;(r$;`BtYYi8A$%YGLO!GnHwT9$a`Zao1<5nGKc7gY*UztV9CRbwh zpQK);I(oBA_V=~I!VulouY2A&hdGnX>79GwPm&x`z&xi`hlG{$XioCKadc?a?M$ghd?#8r0zhrH= zH+sZd$R+iW(1>iKB1vj{X$d|JYFt03){MurOz4r=T>Q8xGqrkvK?u~w?LcijvtRq7 z#NxL@?X&nhs{J(n7Pm+EThjgo{_fNMN&fEJJ~~`}+^@>1FnnNbxO_|*uo0yg`ILvt zD>$$4&MP^u^vd`^hJdNJWfATuHjuU#}-AvGAAZFDXslX0rhyl#;&uFZS&fCm?U?kG1ntUjm&z|olQ{HBJhUwtT1(*{=w|jczG&S8+kaSh54W=ox_Zz0G&}c7eS&Lb@gMEJ zxHj5-(zrzEJ{0{IP0UOK*UK5y4(IBv>?M*K!$LJ9E(Yi7EG`A7m3dA~h$BGS?R{%N>07eD#w=FH7w|33 zA)ah{%*XnbTQ@x3LVbWf^`thj@j9YFlD4 z4ysf6Ylv4ks7~PA5U+4h9TP4;DDrD{d-y;@y!yBhug0>R>uQ{?#`#whbTz@Qa8#)H zIKcE3{#B)}D*dZ!UAbD4>1Yf5RVfak!19h~DALiED%N91y3|L%Pw~UQU>@_&IFD)j>WAWAJl$kdL#% zarmJ};>C_pN`~&&`aR}W&*g#j`@Nv6Lk{dpt1lIKcvl)zxIcKMD{hT8JoTNUZ0UA3 zD7JDtvu)oQBFo}keN!@YkDliPUfH-pw+Bpx;~98yjm~h_eUNhLEU^A{3$61|a6?eB z1+=nn%Dea7h9Cs~uhJGs~oCK}>nt{pm+pO$sNQa$pOrfT` z5`-^TLl*G?EgW${U|Ao8c6Bq!A_PU*CpKy)ev@Wpq~RvFl*EzmW!F6bLHDd;yCx}@ zkDvHuF4$?@lbv8?FYTagqWdc6qh{W@S4c)>C|3v$*z6)u@Q|+D_1Ns?_xos9oBve{ z#}7r10GPbs0KrR!JJ)jw-BF+FCwn)&n(WUu>UgAc|A-#x`bK8HX8i55@JI4(Bu>@` z7ZFjHX=l_y;p_$gK=a6(Z~vAn4~I0|^@3gDpFzYfPFWsn_|CP_^oy!S&!$=Z9P}eT zHJf`J)uKo!CchHmxRr>=cal@R%k$Le>=jfLco|&oaSJ&D9tqIS-$FjEg{t}rHCaX8 zcsuYEPVHNfo+bu7TGqvX0*TW?EEtaG0Mv6pcb)H8cEh_S9_=8Bk=PbeVB(tW>fey+ z+#IwhxB2**{r?ew8)NHX`)Y#^yAJiqF7d`505>ezy-ec6Cf0(^wOj}B#(v3lA-u6A zqzCZE41R;vMx!#Gau}HD9pI3OGvI}{cn%$=B0vocHE!RB^s{B2z9X5s@c6ha=~E{) z0#de|G8%sJx$O5a#k97|V4J1rH6MA5UiAUVqTN@iub?x05rKgh}KGDAAPKl1#}B^;ufGt*JY~ zovF}=Oi34&i}KyY+!SNis%5W7X_?BO>z;+i;Mv>b9)_pZ$<@lBNZ@?#4$I_xIiZ}c zkTh{QfHKFBP{;Eg=1DvaR5i`T#2RVaFi&Dp)uqc)5|p~hDri9Fq(eAp^821ql%qusA8}zj1yld#u)MCWs%cl z?JtXd4EitIwKEd0(MQwch{=$T(VGQ9#WYa)oa;F7T4Uf?1S+OO4pdARRfzzo81Jd| 
zL$Fz0%M4ZQGF#z!qUwp@Ql2>uuG?f%CV6j4WyLFs5fg8u@tl`cNyWgX6{8S#1!77S z&rS=MjWL8IJ!E%&FVB;%EwyD*Ur?sY@G%>oEvmi$`R3K4Q%6#k=3BrkOd5ttf36A+{3|_SAp+ppkX06^ho#5mryp+ zozdQbNH~Nd3eZh07X94t@WO6pfW=dec*l4*6ZfWHqrgTo=@)?+aeOh+$?kWMR30Ci znfhgGeLc^U#^gPqx)k zNI6U){#KK@S#>l#6H7nE7f>C7S^|()-%Zce=pr%9Ri8>szG zwQGs)^IgV@j24lpp-+gx)SyOM?Y%f5nWFDJ)oGqL-QQI?vFo2BZ4fq?eQ1~#doXqO zkuk;)bl70`>v0|{lT5u+M`YorAW%r~ija2Z# zdZqU`kDhOmaWe|(Pe=8QrT;)`Ppzf`QRb>aDiA`VRABFTVzuc%FbE^~iqn4(jejGg z*w=qV-ox!d_+S@RU3?AkQN?BF@WC3+Kmjt3MIQXB;R=wy;zl8OFz>^M9^xZ|H-BgP zZsh7RvR`&Bi~4YAFgF6>1od4I4>{4bL^vJvPG-s>u)9&Y8#_mB1)Ea$MbZy9;99bY zf$?&z;l=Q>Bd9%+-WnrckM48S)Y5S1=b8s-m=QJgdSj`iP0`moF*Eg(WV*#0gilBt z%j7L(*g8b8g<^-X21ak2exAD#f^(+t^kvkB3${O5z%giI1f`$T9gRO>MlAb2{4R_a z_-f+4)Q)Jw>)~b9B!ja>v_qOz?wWOrlk(H-7801m{^wvoFMHZrqqfQtBm$DVN4Ru&Fv=v?Xd02b>g}~1Y<`~ zm4^hY^$P3~63iTG!<|jkpUhy6B;J<&ncT8*+jbTWIk#;ribzkgku#Hr?ps&#+=w8- zlAiHH#DY!;6HMEa*@^VJ$Z6-o{U)VO&bn3E?KQnhN!6}K9EP{cWCPGhdHu8l1dbZk zwl#99Z5z2^@L(P)m{rIQN2|N4si%Le5X`|v4G5pASXMvKLp0Agv}{ud6gkzIeeRjhPTgIQGNR_Ol~>(~klocGN;kV{2j6nSXh>Grm9H7~WqX?}Ouww#L(o z^tsuD4vjbZPo)fF*fW`Wq19R+&a5Nq3k`gY9?{z%7bV>2H9a&n;UrmULSGTm1r<(H z`WZ9mwXmfojnDErHf57N z3^ABNqG3Bf{QeSpY8_SOLa4ICzL{%EKGCENowt!fW2JKqT#dYq9O*WUPM*O+1*1gv zWj&QJdSU8QPfzxp4Lx+QVxirMG$!NAFPViAC6Y+~$H%Ea9dXebf*3 zScf$d-FEgE59m(k3OIUpC4Zhvq+U&Aj$e}~zEz`IrW-^Yg-iOhrU$B7#TsRQN25Fu zg|xJKN%q)Yn{-Q!A}(DW1_75QL;W%=%^ZKJD!P?BD0KQbz##uM+`cxz>VtTiV5ikz z+{=KYUx;T;LP1@##{ozC_b~iPe<0v!xa(X*Jdwq{NdDc&RKO&Hf__$N_d7!kbL2=o zj~$(Ft)mSuwylbAm+3T-Rx1JXb8x6>Q%&66d4clk4 zSYkvX^;8Xvq~+Fo*IKV@MD=nlI$?tX_bG2s#9icBw-1fGI3@n57_Il}K7}Z>H!18V z6XibspQ2Epr${QfKAFm5VDXD47X*|cZW>?I#aciJ;GlN&=+q2L!6I=URtyVvf+&cM z0N54cCr{6lBMZ*wFD$7LgP#?JPW; zhmulpSIEZzP2ASMj~dsOOm9$@MYt>A2=G!^k8t(N2fU;L47!Tv+6#{@BS1upM* zM^b2KXIqgBJ#F?p1b9|x=kJrT$U-}Zm(E6Uiaif*4uvvd@VivuJ>x5FF&bb!av`1< ztBUNA+#-bgONhe=GWRRUw-;Tmshj9>k|aIx?u$#ae^d~+#YM zfKOP6J$Q4%oXk|1(j!3fp+m}9RbT7I4cvu5_gIsNcoTbVZa<-(GPeTpD#obLMVu4B 
z!F31cZ%De=5@brUJqm;8>~$2Zw2APfaCZcMS~plHEX~>g?|B7;Z7WB@mYSB>M5}i* z^xq_QY1~(eX(vD+Q#$<-_65=f?4*q4&4nvXV!i3Km5J_ia}i5r!65?d*h8qX7)x|L z&|+WYc@MHxnAUZk?jh1}PdL+IB%h!{7H?D*BF~EFlvq`yzs;B9i0D@N=HFt~S`z7v zwldO23ASl+ua-@VenbtDn`&G~(@$y+bkH=i{DHvmdMk-3Bbf#70i{$7h#t#VKN&RC z$t$Xl0?|U3Vo5}0CbGv6n!c=|BvCumhs|*SXSf@;qzy6(Y%!akd5jj#MfFFt+nh6G z&C|=Jei;3X)v+m4@iQ@ZTEEn&jT}{9vzpGaPuEB`bw8#6Vo$M+kp+it)V(gLvzkiH zH*?TW1`WMB@-bTUirtql;gS#!9mMsLxscdyH>`MIk7af1Z=BIqPZ5O5N|uH2SM)dn zm8y2QjT$dbsnd#R`s@9u&v}UG*=tB>3f&KR_;oU+$Yr-!Pn$8%Ix2_vQ&+MQUu&;2 zrMkK9^`^|Z81lS;A&-m+p2LtwTt$M|0qy7TcU1f7{56}}cg&{t9kZ!@$82gp{@Zf$ zh7TNzvB=TbiPW-WVpBVg^Kst!@thxT=Ll#@j>e|7#J?J&t1DvpawbNhLG z&>9L=>PDs6&K_DGU;JVRmoxWW&Yd}j_HaIDU;dW2AG(I~_I>$VzUa_i&KK?5t7Oa` z+xOjwaOhBsACvp`hs&oNIuJf^%Dw~km*KUg5{+p+?ADjf@xvXgkvAKqFrEp=npj+0l1N>RhCgo=E<278-D9fFtGb!gx-CrB^12s9wkU}79 z!A0B|A+Bb)vzYgZ$aAst0X0QIkvjjMJHrg>AKY0<`I#jiq@bwLA)vW>e~(UEChE z>)XY3q9K!m$KLAv}^TczgjC8+==_ivDcc^Hp> zdRp1N35=zLD3vt6q6gB*uhtQ!%$tUa5-D4f9+H{+a6KMK2={MR8XiiTk)pX`BYIR_ z8tlY-`IdK>Xh9+g7i>}*d9DyAIASS`$$nSy13nT-zbc?znvxJ0WL4sm1ik^72F$r` zXKpdM2JHb}j;y3%8;+TPlN2yJlBrcrhYiQG+KM0wIgFwqnnsV3WbMQLSIjQwPsO$Q z5l6OD`Qd4{a#Z!}>`1sGJJQE?f00PdBEcTJcgW*%C;tx0=hz-TOB)@fcT|U3&03(S zA+(i8)moJZv8+mj3LN2zS)feX$-c6mWItub`>X0S63BRKu36me- z-Nfotk?JvT1k3TbH^m~k*Vrz%UGSPVh+XD+Z> zoDP{ct0B=Sl^tE%A`N{^0%iASJ~j7=cU!1jyg?gBcsD+K3ui-EHw{o_Fof z@ld%P=r;Te@Z6{tr$8Pg!`0;X(Ynpq3uU8$b&F+CY(v7ny<%Y(tFX@w@M#40Yf3$( z9%(c_jcNZp9XK<9&yBLT$MI=ChbHZ20G}JDg&;n6GE;pZkN+#0R0QwRwuP1<=+Wta z8$LI#k>e)N?D<>ikXMb7axq~QDAL79tJ0ZbsSz2@kM4#AT6BU%cl(G>6KO{Z6wRDP zu!eQvrI+$~Lyw}ODXyqS*@l>yF!2nZzYsp{711@brSSMQp%lWg`Fxt^D$ueQeA+wj zk56;>+0iSUid(Ns zaJgsO(eWTTHf^S{X^O_*$ZccON|g`dwwfdBjMBxWiSqmK+(`DDxda_dOqyGg(4#Lq z(|9ywN6HIH?c?a9bL%B)tPU5EB`1$r)(5{OiC4dsLP2Sz9GAxH)LNlcgYmiR#iO;u zqoEaKk>1Hz3@_UNkLJL+Z^EWG#=H8y%=z&Dm2 ztl}lbb#!nq`|ck-Jwpxv?P`KHVAa-}{i#ltJA|`3A(iD2&JN=WjanZS;5K`R-5Y{% z_SMg`?rhJ#32LVAhjUUDXk}72-An^^3w0)?G>udTa*E8~tl(Z7jCK|?jI-^aopFY6 
zXa`dNDru`7+SzuNTi+}&+4N~z;;Eg^2vA=xj|1I>04YLqOfJ)k zMFje}8sSk%RjW#3Im|OgD^8oZ2;CmfOlXjYv^Wj}JIhplmk^cWAj~)IW|gbk!a_i} zl>xNc-yH$(OckK}?FB?Ek@@OC4(}`vFb>`s@f*N9BYp#TXT)y+?~M2j;GGe_0lYKf zH+%?qXL=*UJNsA;@654+26-kIa`Sq3kY{Q1f~Mh~@H=M&!iT-Sx`QgT`e1Y;O9A={ zTefl4E-Z;N4bZ2SjGc*vD4YCRn3ixPfenl3sLLfLC~r^;K@p`4Z5sNn0>dt>Bj z#t2(W@u`h6CN&r{42z|PYxS%ZXoGm#4l0YM#{u~vW7~Lon}=^^1}v!CBwWoXTp~PZ zBZk&Yd4`4dHnJ%!9W=DkcX{@O$<))4^lipw-lh6Y`38D;yt@*>Gg*{iHK%TA;~KN9Kni$T~#$J{|4#k-0}8dyY@{U;f+b%n6aZmcFFJ7pQkCo@D|;fNZ2 zg*k@hjO2gv!DE~C)57@TJF2{A><-&(z{}1@^03WjtmCDf@>M;#P^SL2-_Njq6^nAA z{nC(|9p{d*UM9X^b?H%M40xwz8Uv=88+aF@2O7$fDLV+ctKyqBUk=CcAScfU``H{+T(js=9;QS}fd$ zzTUn*??G2x4EXaYSkJnJ)o{!hjZR>o)WHN*(0;w)&Ra^^w*qLkP&>=vfM(0eMOg!Y zW;1+1vw!@oUJK%GG~X;o%OxaPP5ZnM-y$H5+;NGpALLd*aHb#bv;fz)v zq)#0s)Pk^+`&v;RVnD-)=o0cilY!wy|KBQ1 z$NnF{xQNPGk_bKmbH%h0O(Y3sika#7H6e!!JBX#wTq7zj(FS$7sv~WzHmzWUlGIC5 z2SN5=$?lE@vC5<^ZIn!e zn8w)EOZ}>0K?put2h{r$ioLRv6bY!)2#N$}sQgEf25b8CKF|wb%QR%vA57F$9;J0d zia>TrZ_;zmktC6`l`gX*?Tl0hT-M6jlV+gvCfyXv{2$n=_n;t0B6_7ED%6yEP8>wt zJ)g;iHospmx3=bg~2TqNooE zoea5I^%#Bget-t<()TvfziV_dNzKMUYHkBl$c4r6fCk(+iDaf6g}oPkVox}9GVR$h z(CL>pJ+4``j!(jR_9xiLcxK!&lQUDkEQ<3z2SPR}{gY?KZ)iHVD(;rXJprMq;sHi3 zvpErZ44~y7Ocw0mVH!)#h^=N21^>bD(ksZEEVc(e8$oVG*vO{zThYw%;ixyt)SNgN zvOM+>WX2Ugw0d};YzH)2b^}rNMFp0JF(%?ZVOp;)TL|+7m2z2h7;V!|7h)ig@O0Tt(qa!tO51kAWBdNC$ z90H+YMgtiT=;4|X$je5mfV^aX0zpZp9uu|NN|M<%EaKfoF=zWc2y?iIrNP(`orr$4 z@;EOMQp>8oEpxhX$BbbyzJbV5ytFhk>JnTp_{2NJlSDJ8hKwCid9fp8xtT1~&(2n( z?~`1?q4`Wm0ESF6_ZiHL2^zJ+1_F=TT;&rwXoC7iaYO7LTR>%!P$gDMjC*wI!&Xs2MuU@Kgt8bclf>~Lh{ zF})EhdUJ@`*(_4lL4rTu4R-8gvmT71982xY&1k_jF#R<_nf8}e_vi$e5GkoW2(r}( zcJmNqn|+R;3J5Zrt=hrOW`c<=@^%=WeSq2Q*wj`J58h!6Xf|tyv5ncT+k*helpt+y zoJJAYM(9E&Yly%=_&Wq`J=|^sp#>a-=Lj=O+9?bbO8gsZl{jsYiI0R5ic=_(_5lecb_A$|Lm-ndf}OvKYXelmBodNL z@Gn!bG_%`zCf*|H$!miUxyLck1<6$V9(lT}O3EOkSb%BVVdNSS~+C#05F zwS*<8aiqQJpx)%npx0-K>?M8DC-vY$X?hw(_WqO28{45cxsW`?%|JhkWpS& 
z^neRLiKUdkkMbbMX!K+8^dqb-jy%RP6~_}*Q2kpUd(aQk~N&$Gz>E44G6PzH65diRf=NKDE<=O;C8m^d_{*YolM{rNT?= zs9zgp-Ita?4+ecqUs5X27IjK5&oRYV`hKofas>)mT@ZK-*AG%X;4x*F4^K1R$q{E; zsg}x{q*eZRZRXU6YXBCZ)b|I!%5l$$T1>t$|!IH z#azUY)H@PLv>N1?3)s;_)XZWp#ztn$uhg)54GThVN-w9royaIK+ z;!ps9teH(z8`dlAX*A9>PeNNIFKLxqXWNLlV}D0=g>c6NXr^&Om6ug_a3af>UXJez zcdSQ-!G&`m92{Y$LjCJl&TxMT55XD&#W^pA!%rm)J9DM@bC9>!Vou5icjEn_qb zckI#dDZc{+h=5Ny)#hk2~go!xE_recUnF$(=l4gb$6X~1KiF%G_TM9L!iZg_H*z*>H+0_l_$a_UKRd(uXVo(p75);XF@ z%_2A&f63HC<{is$6!nl;bG4G8mAyJ+rXeBj0v`kFh*# ziltxH3Q)`AWebAeDv=x9a^}=}Kk*D$kN+MERf*JmaKFwJiGC4Tq^s2VlIq*o;kusW z0^no!Xepp!%0~Oug|aIVjnx3`w|4UY#W4_}*01~Yagg>>oq|2~{Y>T8=!mi0q?H3C z9`T@-q*|^zNMzP4Q4YG*-ouQ7v;LGSFelT_9`5YYq_EvdcD2cx-2LrZ&2v4QZ@W#g zPpWKH>LnL3T3G(B_uR~)5I8!cg{7R}vEilPXP8TTq^3pFn+$qvg)JnMwg43-Zi}RS*{?* zaz4{xkBx5T+`}Fl-O72p!yX&mHWYizndGNm1~DvvJ?47Lu*YhdTZTQ>G7Nic9UZxA z?6KK&E`br^9`;zRu*X_-oXO}bLuf0M%*%fe zhM>W?b@Z6uvW~904av=}aL4ldH+a7m2RJqtk+GI7xaw%=t?V2w5v62*NysAMkX;Q9 znXZp{iihT~XSQg;e(8bi1WtYpxhl&3>%VdTyBCiwVp$}*_hzngXfD^;m7%$;;*@6# z&83|1iT(n`GSW{i`UTKjUh&Xe28d>P7xa89w^_keNZsV@<}cQ^W>2o&!|2JPMn0BZn0l(6YNwlnZ;sRtAKq zm>}3RnwD^`N7dVXgJKi+WvCE1oV!o}7M5up<5rE+sQMtG*lMZwLxy7evDk^dgJKhP zd;d^u@6x-Fcy%@m%NNwwY_nP+6vJ`Yx_JV4YkvVaY>bxn1!h(AVSuplv~FOAZ!!J z#_qiw4`Y@g$<}N=7P~BuXYn)fklVoILD(*XuhC{e{EP!((<~nZ!ls1bAZ%~K-;4ml zmM8$iHW}e&_P8Jj+l_dZn5v=1YR9VDZYMZVth89@+4NX(J|G(&@6HfSIRn1A0Pd}one*CfH zYpaf~J-V*;*t!W-HK)#+Srwn&?$VE{tHnK`sKQlQ6#Dy9i$V*_L!TJ4U+HolP)>TK zcMtbc`zfu~I}fHs3#7FaNSjk2t)oENiXmzG)T_O;N9J)d>Ws4TWuu~HW0w|BEUU_t zOf0LpY@bM3?Irt0%IX*IcT(BFC86Tqj4IoCNoZ18$EEyPa%pH{+44(6t-5tyS?TR} z-M46U)jcmX$_E7rz!#p+uoVzbz?n>l;>I+d;ZDb#SS(bt{o8Gcnh=ujvL|QpunD zbi(f}b@Zv~B3d(27xmLc2N#!ZRTojkB-K4p4{a^et>?$^R&?pGx1x(ElT?{gb7khL zOtWrSmnL&JX6VM>`~7{Qu7dqt{76lK7EHGmEGW^fse>Dr$~`Ei8xwO6`kIxQRH-tF z!DYUoTk4c|fnRZvPkjd8EH1(tq`9#-0}7-nKN^Puzv3dS_zV1s zi^>Y5svc3&;lJXdgN7%PQe1TK2q zcRziKrRTr9pDyLtL8J2D-B16erRTq|tv@{TancKY#aBj17uIE={0EPa{_zpgzc51j 
z2_vM>9wA*36brSdlk|f|<;Sh9CA_cVv>r4nU&3f8=|ywSnXz!e+_}?c&YL$aHGkf> z=FhpH{hayprd>GeoU>=QPrIOP=JfVi<5NZBlP65jpE~_HMt_dgpQ8&@@U2a^i`n#imS~5}BBcP0PL2*|XZGojr>$-**12vu3rY zW*3n%Yu>kt7POz=K7URw-;C+!pMTD*^QWCN@0|8rA|qzzISbmRtC&B60%_CEpEdJ> z)T|l0KW+NVndj#+`L$g*o!aKkn!8|DzS8^<%#WZzf92{JF`xQLW18QojCcJ+p$_9e zyAJz(!%NpeytAJ^E0DiBkpAsJ`r(1}6@m1>38bGQMxURa`#Jvm^*`^WtC*<1pT5c? zp*qwA(szuIKG`E>sz9{}})M^f}&?S9MxL{Pd095;HV?##l1(8`{od zOv?0A{@{DP@kvV_mVO5QEBYxvPSv^Rj&lo^r3c${WUX=!f9_Acbk!5=pT|Z>zsfCq z*8X7sL?&4E!}_PpEr`SN-y28|_Gjmp^7RDk|CC!mht+?z?E>9<1@ljeB0QlPIBq$pWrxuXN2@OoAUXC{a^a^eEq?8z8**qKKIGi!ui)v&!-3T zH_dbDYG?4dXKJAF8`^$|{nHgl56;t`KzeYVeshF*dP5*RIBx&2 zzgp?q8Ek)Ig!#43OV{Ir^L-IulKSa=gX4L_28dCAzK419MfrntBNdnz!Tg)O^r8Jz z-stKdn*O7SE`4bGCkN(Hu>JpVV&VL6dGkf}1oPiNDW5+$AIne9rw8+&d`dn&*#D!N z^Xb9-Po0`i59a^Elze(H|MICWeQ1BCXXMj^<8x&oJxCv(Pv!Fm)4zFcK0R2E_*4D# z++hAmUb+s!dHS_L{%Y>|^Yq+6dT^cw=WlSHHWpKu-_ZB{Q6N1yPnV1^Pq%}b=GPN^ zuX9J3r=J>OKF%K@{V{zcetNy&cRXE#hM&@d-?95MmTd18Y|mjwy7Zy-ycS3g)^k^2 zT?;<A`xcT(XU~eY|rWykTPL`v%gz?zHs%0_k4!Exk05-sYW;;<$exy)%%0 zKp=f(AboToUGf0^RFB8H+Vw{E-})&%zl$v@8c+g19lWV**G1zOe0y&D^cfu4w%BpD zf7U*0VSCYd_OH{6#?M%=plG}j#{1iqLc55<`_=bbZt$-4&nj*E$-S=OeZO(O<=Qjv z{<(qOS#E#2houJXX~&;GU!Q5uzm2|j-FRdX=g*omefLis^j-I~P~hd>R}tHV`frq4 z(2m%|dp$1djw>3Unm>2$ta%L0dGisipz1klLHl%euI^=K&7OAF`P1jla>M)2{VVoS zQz|&;RcE9A6!>}5FLy`Fl+ucGDek0k$hChtw7hse(T}|?KaR4hbENhUP4|{_O1R%l z4u#5lWPVt^L(Bj32<2N{^}8+q>#l~&MBk&Z*4jotgLhncntQ@( zZ(xdNkZ8qU>pZrts9}WX_sn+{`SMk-+F>%;36wAN{Kl^jf4=%+==0}{@ci<}-4kUCuOpPN^2*l+3v=$b-#`9` zSAJ-|X0QDCA?01G^5f?gul$f~MHhMHho<}G{q#*;RW XmBF7&&8KL)Q>c8$G?#PBZp#0E6Rzu% literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_elementwise.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_elementwise.o new file mode 100644 index 0000000000000000000000000000000000000000..403a4c38fe0f1e2d42231a7c91ffd1f69821320c GIT binary patch literal 50016 zcmd^odwf;ZmH&MOv8J4JA??sHtx|8|z zu|(&gkhS`LAjxfNZ0lWUz24QmCp0r(c}uFhKXQJ)VtlG|kDcn?6G?S;1v~psaxP7E 
z9t=9?;~sMAa1T2ZagR7vxR*I&aUW%^x*G@+Ry3UMTyKx`Z@PB7Gf8EBWjX#%Tg%%z z-tOhCi??3h`gt2b{;PTuxy^~rorfEOo07Ggor&qTO^JMcaARZ*yQOyHwUwz`6P;b5 zjgi#^BmwzAy6xb`$RGH+-a2hL{{o#*x;K67*f9d!D{*n|U^=(0VM2Fe_HF3YMAPhx z4lIjJQk|Pb?euSC$}!9?e-ldP2+P;t8R-JrGdcJ&l; zzJvF$(~5hq7 z1K10I#avh%usXmF02UWm(y}Ata4WYPfUUubMl`sB;7WqG1FWzD5+k^V;1>W^h%>d% zTPuIY&a><|)9aXi0O|3lWhcjhnv9KVNo_KUx2ADbmYub$6va(ZytR*;WZ50|Bt>w0 z6mQPBddpsB*DHFRXgx3|$IZe$W6yHQO-A|NG_Do*tljET+Z5${`?$rpci4+vV%wvO zQ=Km_C$+1-0`|%6>Fht82zJ%p?=+^{_K+LjM1lCv-b8`;-FS}1?*bc6QjE#8y46gp zyN+pfE6Ahcz@u-X6aVmL!fKBE=1uhAe}DDTL^g40V($8d6Rp!$qF(}mH>aYH?`+KV zy*bTlF2B7oe?c&Ppf?dbn1G-nFEsO;s)ZX$N_(d&`N<;3bZnp?(7Z{8WU(tps@n9MxZqU zttmj`0*wnaUVzpKv`(OP3OXH~u;$Gf=!6^IoQ_@)DB5JLdG+YAV*uIZl3ad6I^Pm( z%>PxWF@JfuF+U#y(Ne}DkG*MK(zS=LupD8D zy%heFb;+hTt>zCB1`Q}#-?Zj-gNvfcUj}NQwbqOR=7v?rtd%byPonb}rlgzHQ^-j` zVcL~T7)d?qDQvC0MLk8FE0I#>Ovinca|Y|g z;9LU!(^VV;`hy&>x;s#R$eRPH!fcT{$ZOtIyl+E}UY4)$%drDF`k15AFUQ-+F~A&g zzZ|b3#~$XW@yns)4LM?xgI!gP9wNnHVleVjid(Ve1p$*bVFr%e0baQTEK^|d&1CUS zl0Jjo4?JYXOQ0JHd6abNJQcq{JrOUS9mLb8^HdT~nHSG)=IPgYV#G7bi${r^&Qn7? zCz*IKLZcU=U2hZDfX)~Hg%{5;rXPfKM(fB@H}?%?Z}{i%CZUi;4d)JdHL$_b@FuPN zIT2{tw#^si-c98z^2(2VQ2CLSA8yP?LaE?DV{IS2#Px6-Q?+C#`BIsGg0%q}8;MjNqnJ zGNl(O+nFLpnwwI?lzybVz!b65%#;D7yv3Ba=m8e6S=F1Yo(EC=gQ&h-z?s^omC!;_ zk~~EUE0yN~De@AEyd)IU8&uhf)FuIek5{FN_B_sdA4k1S@GjvIP692v2-^i%JKRe; zlz<%|j25H={}SbASv?PX7(q62b$)~MV=h2z$6KAr! 
zcD*wz-S!mua9bju2x5ka+zH8>CzDO&cI{7ghv(x067G;Yz#>%d9hsYtgSVnXZ> zOn9CNvEO`k{aVYeMrSCmzd-5ey`KQ(Yok(E_Ov;Hw1I6o3^1e4N0F0x%}P7YK|Mfan(Be3rn<0x&MX zw+M_25RI|c^rHtzTV!=(zB!oAr$XuctZ+Kt9AWG4Pv`5)8uO`9q&adONvo65I6^da zH2mO}q9okRuF)*;>0|Mf=lir(W(W^WYL@$(aTaV7?^}g)C4Nx16Z6{*??QreCzJdW`HTOZ zD`BT;BfGg@b++oOxsMe(ociZSVzt%$!kVK@O6!vQwY8!wg<@Z_BSB@Ci{~iVG`kVE zmW^aNOw{;ebLL(eM(f2w2I&!yJgkK*-3h#ELPig;Gqbzc1G|X09fl#O3WO#CxdZEL(W;6qYS&!weDLs7rMd9cB9d-vr5^Sz@Y3z zt7V5#V!bLsb}LbbvR$nfxOSScF#1x5X3&0gA4?8z19D)Co)(73wRxpZk5%aUQz)^m zP%TRIu$v;kbL$EVmw9YOW#h4~-Db%RQc`>rJPooyl_kd#Z<{3^P<7FEtAaW$#Wp8s zzvaX4(SFMo?YFSCdZxDDtZJ>kHv5`!+c7j=Gj6$pg`Ig~#+_%Haa+NCP_r3PX#29h6o^BdEUC>_2yhD=McUzGp_FXS$H9>|X zG4OUGNesMxBq?(dNn$U)fh4h@29Tuc5>t+|84+96TS;a08>}3^t@!O!6@khot_)Sc zY-^`Ax3$=q4%%|dV7L|9;n1){xPOw_f+Vdv*mm_M?M*c6XvmpVx7)J-T*h^QBE1f+ zxvagI^e9@I3|sEp!M0p87Cm({t)7}YIRuop8X9$Osu-GwCb#$@;?BqlmKixng1<&Ji5TO>oh&~-U@Bkp?b2uL=RlU4t@CmjXPMQ zku^bO-a)%JZG@aeBVywHGfX@&--6D^VZMc|RsV^RLBS_rl=M`rCNDfmMXu~HB>a&H zm6Grr6O`E{78aLEkmE(#SNQuHbE1JXweb2j%`N;r3Y0Igq*!qgW;+hU9kKF3s&fY6~TtuZfoUk4sNb%g!ESG(v?9jd5y4COqYtW zRA_{yYILa@mI|X+BO@#s*Cpf9##NOV^{-y4&E7Bn0wVY|| zhIfut-c|I@Cm7CoxpK~*8tR;Xovfmqb13%~yoqzJHF{$nE8AFZr_;k_JZohqP^Y=( z{!QFNPBZRdrvdkfa~AGp&S|)ha?14rE)>McU?3ZR)w-)?SMWuz;nwk`PA}f_MXlOW z$Ks{ZO^NYEFU3CXETjSlq$UdG2mct-g;fzeXY49GXQI@pP1LE;rf5NxMzgZO3Y7_X zyDY1AP;o{(WHHs@GHTS(OQ=s@b-Q@H*sZYMnUZRIimoou!n;DYI%>o-Qn!jWw8cY& z^kRK;Op1z7t(Hb3q6F3YXeQT9=d1!=Tq>A4ur+vyb9}{h@Lahr`UG9pzen#P?BRS1 z8qH~^pL!wP)GyFW{REvi+a9Yb4N)bb|s#ki(y*Kk=fS|jcz7A^6NMFn;%;{;Kx3rYPLTX?7p^xr9# z5Ijq)S6%};_TDY(vp!G|6!aM{Z4Zq|Uj8Ti-sI)=LGrRw%ZqCa`HM{#2!voD7QrFL zQ^@%n++hXb9&wVmmpNzRF54d_6#^mIN6-@xAZ&Ah>zaKWZ(4F_CiPj_X8=-Cf=-M8 zhDDPsey&gfSOLMP-URPv3p%)5Z+?go+6U+}fEEzM3EczeYk*>32=-Zs+4E1J8Uw@B zgW}x6#qGyS7k<6)$gru8G>sUF-4RL=@Qr~h^n<}9$2F&t_|9lQDQ4Tf3}=q zk!U#u_Y*W_21ItV5_Cblr69Xm+3DcQZhZi#WH|-$>q@Rw{W4fHTeCWNh+!u>PuEd| zAEgNY0zD;>OK$q17+*y(&Qgr0QH;-|7>~26pC$?|sgz`^`gwv(VYRB?AxO3&K;A_j 
zt?C|#R@RhR^!*mX2Fa~ey^Szznh+6n%RWsQY6VQ`2a)e_=sX~Ko{2G$alE}uImh2W zQP%PI9WC#%8$bhv-L19;lAj)hjO-U6h9v|%@j{2>N9(BDdpFD^EN^ zlVONtsWe+7l3$}Fe=a4z1Cn3dZ}Pt6{~koM-^+aYrJ^&0VD@B>GvqJ-k145#{Qrk~ zLM-w|^%QnGLH&S* z5^YhCaQGh-`<87`2wMu-A#qeDOn4OuCLV#02)pQn--CWv=MWoq#AP>0H2`Mm_Yk@R zP)5TuGCjlDQQ)@$rfS1XO($s*pzxWk>_Y;ZLU&J_hiklDinto;FBdv_Vg3m7yZqP!<+C9=;soh1o|6s$&XO$0jxH zBit)70)!HQUZ($#aQbJj`~lM$A`YlpMH*lUsCX!Z14e+WO_){1tR$Arv8vvz)Cx*U zbh2D@OE-1P5%jp|mWSm5JoFQJfO${5j5_9P)G-pz{unEYtsz2^vYVtlL}gX33?wicMvS_?@~FjAq^r&59sCQwm0k*;E=L@mr*zZY9SU zx(lX`3swyz7W2s1`UDZH^%c$<$%5M9}i?@Ql(_dVJ> z;z}0$gS$p-@7w`)geD7elP7GQuj3wezKnas`6BMHW^o_oSlUgn=1_IO(1FsL(~C!K z&GHa(uBP2`DRoM+Xtz*k z;%lrq`zgBdf(KFu&|!j&>k=y3&0*COgR_KSHXOb5muqJ3t*LmJwur5 z!~hl}Y&T)bktrB1L{c-vF+t{i1$BTz*p!8Q@jC(EPb9LF2_mayAMnG3ivtq5g`}e^ zu_^|u7lZJSbS$NuegBy1d%RVm;m`0*Hi+3Z$hwdH;F>M1l;SJJVHx7DBZ0V0O|kk++gr&qn_^SW&sTC(0u8yx5%x(HLI8UbwMP*4c_9(JABzUH zN0QcsuQyuv{PDH}M;dd_&CJJt#8R2IJ^-IF7-?DScgDUIT<7cf1V!PMMdr*fe z@|VX!T^Y>J2$GC5j*~HljItf;)5pnJgN$KfJgajfLOE#I!4$fPtwQTE!7fI~5M}u_ zTNaOk+*M!3Fef@U1k;^85%%}0br^DED-nkyz%Gu90)02p)G&E-!{lwVX)xv&VWIT( zE`)Bl6{2(3VfP#W_N1+O3FTyq{Q*2QwsoJMkF3x!nVI?6cpaBDtp9f)C;315z{;`m8(|+?2Z2&6VivL8NzU zvi1pURS^Bj?n3~%J}|lKmAg6jvHyO@{<}Dt|LbGel9fA>XoB1%I$t=@4=*mCVn5Q>(AVQ)9jrMn z>9;uF;`*(UZF9cvq9YL_V6q`w+C^zJHoAa?g-*;JK@E6c7m{l_u!;J|5djob9<7tuRF>-UuRLxJev7|J-yZa~aK4YSqUgK-Fd+vJ5eorZA49kmzKH5typmbz1 zcQ9Fd%$ePguZVCfOWVt|kK=d2KWUEt1b3cBdG6YClEGbIwoKcO0vk5h-73s@8(8kc z$zVTtj&imf+ixSnd65+fuwEkgSdz@w2gBLWzp@SmJ@U8z_wrxX!+#PH@bX`_6#r#? 
z_)kJ^CHPN5ZzcFoLU6HR_)lVTag+aq=)wFabPwh~p}Kt#{|U{Gm;Y3d(93@+ROsbD z6)+q*|7C~gzwA){1N)KlvS30;+5hYO*W%&7mQwmposPaT!f`mCt`)L9t~ zFX}&aSVqH(`cIvf5qN3+r;f{*{3k>Y=0Bl(F#oCJGK2U}9hdR)pE@q%?#>;=| zxXj4;uVr}tYZ=OauKsIz-}>(a?5R^ZUyD68QKnrS^USnuA3X0~m@l`}dd}@F%(>i3 zhj|rq?kYX!UM6#HM&{f}GUryHQ9=C1ifv`d`0*XDp&w znr(^aIRBnaV{b~q*wZ<_$Q*T&|L)D#T6X_?edUjrDhArjjpo8Lu-I+p?%2-V~6gl1XIPXb8=`9Bk;;R<6@aipCgFE;mp0;pW1w}-2c z4$BkiSY)(e4I=$nq(lEA#VMNv=rX!5EeAmR08$*P;cOWzYlj`$@(c=~zAU7`&Xmmp zxG6jf+jgzyM*xO22nDTxFAKv0ZNct-*Gi>5+Ej)bVWr|u8EJkDH8dGI`51f@*Ct)k zkgudo+Ss=Jf_%;QXn5j2!)mEn3mdih3T{7-UJ-zGn$p(kc1VY0j7!G&S_n|{*&;=d zk<-OWP3LP?0AklINBNez6~y%&!Sx*|(xSooDMMgz_DK1?doP3B7DR^r(jw~L*$tC- zBoQ?b?{_x5;LhELyI&1mGdZWJMfK^o_?pGr{VuXPs9I(o`(MmJq`a?>uFo*Y4J0tc9ovSseQ2yL~R0!{WR!J zKuhRpKuhRpaK3Vnv_e*TnqZ7$v_gZGo+h}cr&G0iih3GJL-aH&9Fd+5Xb(#1=|Gk1 zJq0Nj9Rck*DLoyicMtK4>dypbxlK@$KsM0o(ype1jsSR4`AAAnlWJE__ak89T;2d@ zys>R-rft*Ae8u<3LuY{?QzKH)9KH>DUc4l3?K5G?>Z8L-ih zN~L82Rd~jX4R}*v5}unvxZBx4J)WVZ@Z8R!kG6B729vbnxr2()b~xlvaN-i98pd5@ zsAp!{dO6w{G`nss##}}ir+DkW$07bKAPvE%Fv1AB*cHgI88H}q0RbaB4n>neesRGr zU&}a5w0uQzJilEC!Km?k<(9%=Vz#3`*~js@cybqY3c@YP{MU~$j3)jv^~W!f-q`kV zW9~)h9E78DJgsQ0f_u1BcU5Ve~Igsdn-cFv}o5P1ocwBKVV0eNK;Za&Vg`GvX zN1U(VUd9o{QN{7V&>SL)?p-L{m-sDIR3JAkGQ+yByjkEo0pBz=7x@Rqa{!Jj@QVNs z7~l#877`5b2_V_~Q&jE7msn3qs+W2iajXzXZzNTSZrsq61s0H3T0`+*`i+7kwcy|IYOPaBR9MVoLuq@Du zcYK>Cx-8TRQkw#cfgYr}wutEdtHnkag>WhLHMBK^o046Lowv<>l=;dWRRfj67m9kH zSz^~Ww&AQob3b#%z6Z_l5vB8~>}!?-7ZUd{u_jm451%fuhN~Zj#K#VjKZel{w8Km2 zhid>XsUMQD!-y4UcyGqGis&iO1mKYr^5Y5mVz+S1Q5fX^rJf=>Mhm?G-htkzF0D5d z>Lx*rv+&W2ym|y7WLBP)9I87@=@mCyyJS=Ssk51S#?96t*;LQ!Y^L6EvpJG&fCCWM z*-T~TW?LrN4l-MvWZOFtV&<=xq!{d6bSU)-jmu52T%+YmTe&JGnPace9F85PnqjtH zW>X5xP+rL#hXy&(Nz^>d*~grfqSH(zq^m|h>8ksgGbUQj)NqCB(O0_qJM4rIpC9wdn0PQp{jIgb*5lKv_NRxst5QsHpM6Dt?v0dfh$M)qJi>&?md*ZiL)UJDje4-CFqpFg}f5%SC0Q9ni|^ zfFGzF_Hdu<*F>hdJw!M_JYU~WL_i>0A&6kOvyq<>!hqm@BG^v^-#zFeP(%a42tIRx z!m>s_?=TCO=u7wHXtS!YaK=s5D?PF5ZLla8my_WR?AfhtkCj`ieho&&nXoCw8pnFF 
z)r-gD&+;MsIXVRfwS_^|x~cHzx2ciIn>cPVm^Z^2GI`C-!!sw0#K{;z#hmY;H_4pV z%7@ewSoco#6mcA+lsV_)KFW!?gRPhrT|DEkJ{Ib-O!RN0CpC9+9Nj$XJR{i0ABTIi zG`e({Jnn!*zd1}E_rovIwqf$PM;)VI9VU?Mf)Uqppa~Q z+ycp8L&mfG?)gU~EmiFPj=}EbWcSh5 zs>@5TyXfPelHV^zySH&q0NX#ZeF6L?qKacL#M21&1-v2b%J@i+dIB@wp`NfwM&ctl z3WNB_6p4?FhZ;QvKNUE%dNu{E+1v$tGGG#ri-oP8Jpk?lutvv8xHW()#ytZ_85|X1 z6cCH^R!kev3Af@dLBAsXPLSqxCcGGTME#0XXN+L@ zC&kc~9S@D%@qk_H42E;l!&GZt&26x!qcyVWVItz-*cD&c_b@Le`yOK4_b@ithaK;9 zx#z`Y0XkmX`f$5xMik<4h<~s3`d|CWhkp)x;(8!@|Mo#}hye zY!h_kg)rbpW>)p$6i6R_{cM@}y#(NDQ`)=7(B9fu`D!Wct?cOu$K(Hz+AF^&=e`en z1s2JFiaN(Gx^K`{aVDAkMZR+F47iYD0UF;=z=uBk?Gm6V{he=}#rFjF`s{ zgIJe9VNfB^ow5sZ#Etu!pR!`fs!v7@p1<9X&|!%LriGgw+Zt$ROcSG7a2F?Z->)N( zp@i>?oy8g6_nQd1F{{poM%EUtp7K8lM)F#?XtQzA-o1zX zz;zRC6ZBz{XN>D3fYBW2$Ga@h>4`m}=KG{)z85oK6Hb-SmVlMcU7;)hfEl+0R@Zf1s)%314``M&*@w?EA z(Ym4?aJOj(ti^iDTu@su0m#3nhQV*{{I52&GME7N96>mtVb!bFn*RoyCT~Q9A>Fn= zbz?SGmANqkk1by=8^`IsRi}ga)OXz}*b2EKlaE{J+|!K{-br_I1MJJP0TwHOP=EX~ zq~heCxSZ@<$a%($C8Rqa;HMeg8Mg2}7p-GT&i(iZ&P-1JQMucCTE*JW1CjS=0s!T; zf)aP|V8Id~%J*#vKo8C(+UQ(p+UKm5bG-HdYJnh(ERHa&J%F07Wo#mfJ39J(Sq1E6 zngvn0zi;CJq)n%B5S9C%!$bhah0?7HrCV3s1^te|TQb}pvuisT)2JzJCV&~$MR{34 z{XC4k3~MeF;%|_fzW}0i+e0d}K`F^Vh#)s^0FFTp1~?!vr=_t=CY^h#al)Z=CzdR` z_d;l7*%Cs-58Pl81L}Xddcy8|Z8wOp6sJNc3>`VLXC5AGJ+wTk1jMH(zzFW!5=KKo zM7Qx_B5(}&6o;(w_D2?$l?VPb_w|G=5bW3fkq$a&RR=50ChACHXN|Kylw6oVp`00OKezI2ngL(?{ zg8^7N`>HIReNmRqPMvCfHu6uh`n&FGXy%qpbY9EXVM#24lpxz#5s;K#wz2|SoQf%x zDy5Gpm4i}ZDuq#|XlzhQjY?4;1saqRS1BCwXnasgol4<2Me8IbgRg_iQjZV`?ui7h zE+Zq-XNpDw)@PH!NG9LP4;uR*hDL z`j~__Nh)VjdsVqg>Sq$(B*|t{N0qIT_Am)=l2pMYr>a6F4KN9Bk~E%4%c{n!q=V^v zGFHiSfY?U)q)P$|Zo%zIZm`(v8U(TqOa94_e=Y`liixVuwdik2_#SU@^%(iu7nrKm zdmyNIEd;I*v%&`t)kbq|kJ3!7?t)0uxKn3gFqkw)t9t?64rswhrL|ez59pu(6=PG? 
z1Aux$Q~!cCfDlBb_%%m8(;+Hc>uZmGII;HlPOJoW$DVInU zjzS>HB~Qhm5Pfn9Q-LRhoLrJrw25&h>6|9REWNZ~mO5^XX|J|O`QeBmF{Yh`7*p08 zV|s(OW_C!7>8Q&}@}F?%^9?^N7?V~ixD{;!1zSLDIUEHB zL{;_rlL{Ff4i#kXml-la?H#d(9qxw_BnX;;e09D|^mklKN2_977 z!j}ys@JUu2gn6-h#LHU6&mAZ~$>~r*PuT(Np7e^`21TnrR-k4OagC-)_;X9qGs506 zWHS!u(SR{^oQWjFEaEY-o#|k{yg3# z7N;G0ynhr!6DW25@8tsjzKy5j=|>oJI~A{jf3H_h_-~~!Z!J1oI22VqfZr4N9bIVc z=w4_o*=8@aj>7V~8ZSE@$Mcg5ttmEg>=1>C{#X-D;mpVf@Pn^@&d8&HpSjWEW4m+ z4gdZ_%AHvh4-;uJZ1YbhT*W`_;Yp96{Z$*f>-eoRPojBgejrP#n(l z&d8OorIP0k`l}#1*pYK$2q0+0QG4(^pX2`en`K>eI;Z3AVwg%>oT-5o( zL7KNo%rCG%)>B?P50A1|Z^Kh#+sh4;k6~l+T^Hu#mnGg_Z>#^MfV@RpbkVo$#Q)gn zUyjfKZ(Za7jMb7Sz+gLi@V4kt3}+A0C>_?KM*yZ=fTa@FQ4tFWnWRmzk+^815}BlTTP zbT5U8&BI*@)@CAlT1r#*;D-(&9TZ3ueO8KEM}-P}VupoL)tLujhSr@2JX7q=-hO%x z2nzkQU5U-pM7D$Z&wLP0MCLqre;cdVTdTG~Jh&(0ar9blPeZ;2=eS>;eq;QuhTMUM z+#87%IHnHy=-z85R^Y#q0+&KKc2YRTP&k;d5S)EkGJhGQ<8^DzO+eHTeEdCQ3E`s$ zR(|^X#1a--0`C(`M6mn@T>XApR0b^9Uh`fsbasuB&c#$bB^m6c-l&1zc&#D#dSbB^AZN;cju~sc^zY)tE-nSf!pYzeCqI^gR{^bAH?9VN9)9%q(7UOgZ;u6 zH%xvzFhpeelQ6M!n;y>2m#$jUo4x4W59R4j5vHX7DmZ{c#$?pUT zTZajkt-~Y_Cq&e?YG7f4>>XCvikpX-Fp8KO^B0|@uyLPU5hKZ(4S(sr?nJllfgcdP zw0_9`;b*PFa!exlM6&ipYt_fW0p>;{GK0F7>j(GsOW-l%7ht;aTMlJV{9(8$dxal4 z(){7=nE4z0hJvhX#RrOaEv{$9(I|}}p z$&)6q2(AkCClLJhR|3In?BJ>8CyibULO3y1x@iq03|Ad~ljJE7$YSmcOdVsd37tP? 
zd_FvVOw~=1#F*;UWr;D9I!A@S9s0nS>cp6;sbj_?)6_AeFE~Y&U;IAGr%?7@xBR`3 zy+Cq|eRHT`%=k6ohA~z7NP0~5O{{I$w`iSe-E-r4JRJewvJ>rL}8pR{Tf$KTarD*>qX`g;dcsyxOusKMo!f%qB_aN&> z|E~^RFlKybIQ&v@jGY)WTJkixc}^$3tE4Z!9|XSiDdAbfM?BmV&FhU~o{zE4hEu|~ z>O6P2d3Lf-6`l^}nQ=;ZO3202<>vW$5zkAkD|t$IE%DT&=x#U9E2J6hcyq|pS4oa@ zk~|ZBC^(c`$n(|2kLz6GXdGkTL>#NbnK4zJk+6*#$D^0P=MASMQR@Q_C6O)bmf<`Q zIG?gIow5R6y(yBRtV|PGS&PzxWJTk6_z}cQQS`?yo~zgn;JG=JBA=!UJYVF~a2y@v zW%ubWO%FfvsNmcGXU3Q0_~80;(8qrJp$DH&znzPw4(hj5p`EJVo^kQW*d3?eUMSIT z4X4bIv1Gqd_tliKl`iiBUp|x`GybMCG9wojF zE3koLJ54`!{HG|FSNgBO{~h(c$P~ zJAdlpJ8NY4O6z>L-`LK>F1}w6Zf7I)(?#T`>7t)5_qKDY=4)3!B}cQj0)d#z6K?;@ zJX-AkX=+Yh96oTocK*=Ccf;Uz`r*5DID9prYqyJ!@muo8P1K*ODYwwwXN0usX3qp< z{7+_U5JMZ>FVOXa^tSEom!h7<)QIv~7tc?I$J06#kJdX1&yQU^tp%AAew1;_7UjM? z?BU0RXPoxAcT;sO;diI1iv-?)?K|`ei?v(|PJ)JB;eb2wd6L4Yq0|))xXRL# z7!J4!X9(OCs3CB-;agbQBX;N)=Ntib^HEgXpZz(3~$|GW>p)(8GYANaXG@Gtqm z>wVw}ANVvMIJNoUOL21jU?ASA1TqxA;)HtftIa~6mX~_BTruUdE}XuHd8wYS57e7L zJgc6p56tqw&y)B1K&uC?T3R1i?1A$&9GE%;ngV2OsHi7(Okjb)i+WPx0)VX$cu`Mk z_`eCfs3*mb2kZg-N@=lYeBk|nj|~;|s`&7L9rVe6MDiE)t@!DH{bi7NiuzW3bHJwx zyr^%*&jy_TMaeJg%2;I9e1sBgvh z1$;cFw6US0z7>BL@F@Z>>Ra(~0l!$_MSUxNE8tfPyr^%*7Xkbhffx0y_;P^L1|AzK z>RSzeOyEU*tKo+QUWJnC((sxv@tm%~1%LV?fuB);>$t!&fq%RJ*L41yz>E4;d?Doe zqri*$_Cuh9*BG$u*icd5ir)fwM&Lz#E4~Qeiv(WOx8hR(ezU-f`u3MDe*V`9V?#xK zE4DuJKP~X0z7?At@DN00Y-kdm)Frkz9zQMcqP`Vd8t`ujyr^%*CIi zE3TN=e&v<(7IZAQV!@TpUthj-!Nj(}#LTqGyq(3{*}TPhJBPQ=@%DM%rtnr1 zPEBOyT{wOEMGZ~!ni5kp4f6sEu379`(wTqdWxk0^7Ia+Yo9Na#@A50JUa(~80^fYB z;kcPt({VF7OXgp>bn&8L>S|lG_*=e&v@N<~u^QMBWSO`0udbTE#J^R?YxU&hyt9(N z^^hCpb@&fD+c8`^7-qQSrB{7zxMY*1hRb*Pm5V#(w=IA;6j^Y%{5s#lE9MWEjl{Pt z>bQDAo3m)ia2dbWw$wS1oL4*ZzP51k=jJV3eD+xad{W%}_JVlPs zIStR6pL=}ZyM5pxqd!eNF&{X$t?(lKW`6E};N^wi-~<1i5BwP)xO998{CXU{_41=k zyu9!ZANX1y_%D3me=hyAQIRRlGx_8(7f$*U`04T8ZR9ukWSrY?B-g}qy3wB|p0kbo zCLTSGUOZR(;JMxhPp<(t>3PZr&t4{QL)>{EWl!^5!2EpZ>p2WfwZ>t!w$xdm+5^_xj-Z zt`DB5PyVbA+z(IEC;xAJ;3gg(xzp`>+UPe^9zSE?G2#3_jCe8Ggm)2$%PV)i3Or=o 
z`+VSs47iEkoR1R7qv_}OuX&Lkldl*LTMJLq(U=9-rx0spKA1*so%cj18?(z>v3oQn0WrfCx4$0e7_Ie z?0*x#jz@ay<+q@CdEtwF;5Ylgf93=KbLoMw+g?^<@(7{In!5FP@(pa1)RA7rgX5Y~(llS<9D~57(<|ht&JF3-{*Nc9fUS zbBumAJGim(@c zs>&|;O}TqN`MJ8M`OTDfuI_2LUU?)vQw(?=o_P_@RX?3SN+2%6P5fsFRKZ7r{|iQb zK5P6{1{_O~g^PW{i}hj&v2Z!=6XC^O0fE3T47j^}PeF`vFRuP$J=cJ{+tMWeVgp`W z{SO4RKgfEEtN($3o@WV{W4Ef-K%4+v{G-Q|@686>BEY?t8Svt6fk0rL0oP-}A{z|& zSd9dK_ZaYT2Au2ZyqH`ePwJ-EZwZ$(jc(dcjQk%q@*gzde__B=6bM|zQ`~hB2;6GG zKO#BQt>1tbcO&4ufXf8Lqh*9878`KQ+l1e1z>B*P0)hPoyto?ybDYae#D5wQxYsoX zyh;P{cas4>-GGmGnU8qRFyO5Q{No1vUIQL8-~pFeiRTjrJY~Q?X~1ta;GZ(!dklEB z0Uz%&J@FTJDFgzo23+kaQiXnEz_o3_lmP>-bqL}0uJ91Q9DyZ}*TkjYy26?NHT-p! zsK1x#CuhMmPGDmDd}n@O;@6fg4NN3pqTUctawUKGEL_hYK8Ic^l~w(TSiPD*e13Ov zEii(tTt_=j?OZJz2`bN$-bjj=t9T>HZ!X*&hw9?;|L}b!n*c`8QN;zlkq}zUt+xh@ zpbEW9V0ePnUV-8B=p6&Y=hFKIhR@~h9vD88yN6)-OePz1eRKHSe(SQsm(q2(y9$QS zJY;Xd@Y#p%Fc?YxLCdd$b~22l^w8Y~BPr!mXJOC52$)ezGso%p8b<{AxI-%9J2Hg2 zdSh$CNXa;{BBn|araXRQy|npq>hV#T3J^P*O7EZ%a&BrVMPd?>m zx%S|_Gzg8$TmKzMqYF@PFk=f1>fQ!z)H1WJ7Ic%w~K8^`!^z? 
zWc}Y)C6>GW$ogsedh353uu}E+yH1_nFRri+?SIA}OO>DHI%j%cxnFs%N0p+#^@plN zz1#n!pZ=V5lu<+%?X|Axr_y=SNrdVjxP z`2iFz)&G~3Xg^uX+kUR6m8}1hsz!Z|z^{JBIZM@lZ;ARDKk(MiI7q4b*SSuoKCj?c z|3yCaA1qP-_!9MB>r;R0XH|{*T!df!|LjwK)?`&epP%q6zsaZkS_W@%>GKQfZK(cu z&8K{8T$Q*A>E`9tzvG}^OZ9(0gM7G-SAL35`BnxGM^JvDPx<~Yj;#D0KIL1_Q|0yd z7u4I({@d(Rer-aPxaK(h$2eH2{=Cb~4ldR| z5kGJJ5Bt==?h=;5_iB#Qf17;jU%O6<7fz_M{z>?G>*vH-CO?!KJ~9{Rps@$E%i3E{^=-RihqCo&#Hvk|C|!M^{?@%zp71@*XP6h z>VLwgeCq;LV)k*yZ>taeqkpW*bIf>6$B*kvUiwdhQ&4LBy1vd*fxu6WQ-28MOZDHB z67_RR^Va_*pZXVnL#hwxbrtnCbo{>HQ~zad{jF|6wx3g?xBhSW)ZcrBDyYxjxoC#i zuZ~asn@ZGwZi)K;)2DuWiKa{|YJXqcIAn_J$5@!6aALvDEwn+E#a^4Gh5w8o>y zU%%S`ur2i(;Qp=tSM|^b`-B(yCyAdfujfzF{fxN h;#HMX&%cL!o0qr!T+1v~K5&;Rb(*fwMdB_0zX63qt(X7+ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_fullyconnected.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_fullyconnected.o new file mode 100644 index 0000000000000000000000000000000000000000..f40c25aa8ec3845330007da4fe1b9fa8abdc5f1e GIT binary patch literal 25464 zcmd^ne|%KcnfIL}z=)U|6m6)wMoe3ExwkDZ{c0+E(mG>CE~UfX>IlOSEz`N${NuUekggr-*e8LIk}k; zUH8AeA7<{k-*e9MJm)#jd7kH-x#36Q=)}B0!0@hs@m(YHd8g+s=D)>|{SyCm$(&6t@uW+i-U(0_N(CEPRq@#{Z8f1Y*Oij)Kx( ztPU9EFWb&jb8Pp;>uBc5+>5y99QYu16tRj=FWFTTnCyk1+?YEHVdIm)LG~ znY;B)fT@4Pm*gw>P}oFuPfw54dSj1sDqya;4i7nm`TGHe5tW0tc2HDJo(GgXJ3VmH5le{iwl#)T0^y;aO;~=`IinB{xt-ZON6z z-JiJ5I+We6Hmmz%*SQbnC(^o3?w}mD>$Kpu>U~PHs%JlCTHS@2j@C5=?Ygt=y6OUF zWocg2?Xun9N8CS~@dE{hF{$;U3!>)cm+(0{Dezik*I~5S+679-CX8$`8R@A~zWEd<<6S)Y}zjd~=Bz`isQjt|ZEqj@THs;9@iq|(@UjhR@;wXrhz zHWx?YdVE_uu{Mn-2s~biQshZ~6YFqZ!8ZsPQFjN|W2*6E5;B~XSPh9w@Q@snTsbIt zT~R1GVF;#WM<-gfxkA?No{%e)lQBpc1aYa!(3B(H|=i9s>*SLyQkz#Eid#LMVCjMd8GeAD-Xg z{1H?sekgf;NyxqbMHCm9%Z2M-5HgovfoCHkyoem+R<0~8#!V04ZVq-Hk++?=Si$i)d{13ZgCXslMjd$&Eu?9 z$3+32@8m^h{QNJmPC%Bqt{q@;^I73|yP4Q@pT0YdJdQ%6qr z^vJk?*%)q#W%Dq2q68gXAXFn3Kq#5)q>%1kq0O-SN35lT;jtfu&FEosa|&krmNoXh zkQsf`{bW+zQ_;E&oSn$7&O9u*_blf$mg6s=M9cjH+Wroom%?NJBWzCkkBGVXPm}8Q 
zUE4b3T03wcTK76Syfdlpok`~ASMhmeVryvyppWqBuT(dpg=0EuCRa3-fug3e(P0h-FR**U2AGc*X(trJdT|-P5zy!^>Yz z{ff^B9|1G*&nQaC>BvJN>8CJ?abO@~h%hnfcmP+zgy`qp<8(IAn0vg9p=1W42RuE%-!}(b5tQiqcP0@<&|+O*m(`&tCqWo)54dLNLK} z@%}Ib=ZU@)Y6gT76NJvglUrF#2QYejyH&U-yHzY8Ydaug@x>Oaps~dP(HE|ot}1z5 zkP%fx8Rg1dXE<7hpyB!aT7m^y0OySfSqO$D(w{>FTZCx{G%K2%Q99g)_DihkLR)a} z4kxJmQd_vGKhi zhDyVBJJetUP{CRN(N+-^gcveQLo#Qt zzXC=b&Yhrin!!gTp{Wjlh%1r6@0o;M+dAkssK>HNcHx$41~HLmXVwu{PzP34YSBkr zPmBkNBOC%8@x|<$_V2csE6x+h{T4URzvF)YBl7uv>EY?CzZ`^6p0A#WY+3ngGjTV_ zE9Yz`R^h?vfy;jWJb2=sVx;+^FvLtOK$Y8NJqD+bB!PND%n{jN8zk6>=k6Vy__jNo z2XLcg=HMiwMTheszImbHQ@C|F`|-^d2$d(1c7}(xJBRZvr?b#<+5_;!LysWHZb1X* z&3xzSfV+t!OuYf6&<{pi7X{#xU$6tu+PnUems$+}n9PUkum`n(31%7P83ib}b)SYK zAR(^*Xz3jHs5OlXtO6NSu}Ji+88DuyFu|zh?t&4|;n0{yRY()eo~Y1{%RFP8jyxr> zFN)N*Lgp=4xFIa;BA+cvY9#aM*!-jM1bv%19wI+a13k)l%;8CFG0 z7a9n!5V06$UJM!(1OvnZDHDqW@}v{lT%w@si|Cr_`2e|vdtBFEo2Nd^4%D|%sqc=` zImkbmbl*@44~o@jRnvn`l?nI~EJ$~k^riIlF>8iTog9(pz7A2zy>f93LS*9b9%a^S zsZwO+Q+umaXaz&BgdU+XJ;4XL3XDiPI62=lgfq3KLGMzzqI<7S&|=yp2GLz}rsx8F zJ|_iWF3Pl7Ol&qXm1#1Sy~(KL8~*fC5Lp}u>|0zQa^$(%(C#X|6B@}zw;@f}&P8|F z7o$5h7v14sjP9IVbT9j2bm!)xJK{6Zh4nP3lnf@b5X8)Q7%q{HLz-vQO-0>zlpzIV zq(}$FbU-aeGoLwkQZnJ0(SLD2)hDnxOCPX4j@FV%rq5Y@`IMk3)9KU>ELE<}w41~F z^p+~tDZM_;{sy^DtGPs#pMyi<;`FUYu0d}Av^?-(Izdl@6htfpE=p?}76CLX{3ezF zH;|y0SqO4DrfjG~?nFh(18UpE07;s1!6#c&C%K)yR-*6&T!X#Sq#`Pk`&RdPI~fwL zmd-&q1BfD4Vs8m5$i6p`b`lY@VPiG|eKiaJsro?Jx6{H}+nIv0Pk(D0i;J}x8K=L3o;cVMM`to(w%NPhDp6t5-N7)4XuaW4+?~UL z?$?~Xd9K*2Ih}#-7oGMz83VBiB55=6#t}wm1m##6=1_{Q04kNKja6tS&fG{sOOhdu#+75;|yy9d< zp5#Uot&KKppDDW>P$ukShl`j#^qBKzz&RZ3e%093=BQci8=DIxH-rTyGncB6?#kje}Z2UbtgFghG6fTgzX9=F5Yv5ljfNKvFp zdHBaIa5d#rT3)xyeMSr@7~f|m+QfDrawqc84P%O*gkd4m+RjWX@_2eOVNo*Oin^ON zvdTFUNN>>P9H2%SRg!XL5-*t9y-xu_zQwr-4}IBk-axXoCxG(<&Pyr*rtl?Ej-_?b zQEvX zaTbyxBW5WzQB|rn;hkvQPgSKxpfWSRVfRnguK&EGs&(9G%Xuph9@}lkgE%jh>^UW= zB0P#z`6e*KQfa;!!`sO@6H`VhuEK+XHUDM_pTr{Vohr-e>NQPUDGjD;hnlXB#dIC9 
zoHsFDPh+|^ak`w9#W0vRaQ5MLJwcwK1(9K`{D57T$x%Nzcf0!onI0if;BZ&$gF{J~ARz&DcL|GAXVGa! zuG4^dQT!Op7Fj`(~WY2vWvjbM{Hg55T}3nM?yKMvsr?;;#)gn6CFFxs@;z88X=ji|&aGaXsR zr*t@UDeQ*>&S!AL)?E3NL+~j_6YyUVx5J$X9TqzZQWP&sXp{Y=(AayeCpTazPu4 zhl35_EH603fjwjGhRJb1D4z*%+!+a+ z&U~SPp+}SgykiwLdIp)H`z(y!Ie3oM{gSf_YS@+MCK+q+GlmfO>bKL;CUf|di7svF z*rT5YsZgSYo{UMTxZ@cD0t+laMHCs%G9zH*LD*r;@uE#X9G@ZN3d2ap{D?Ssqo$z2 z`)P@C#x*;|Wch3p0QHqI1KI#BKxJ&o3h>iKYC4$G97S>#l39IZCs##*`QuCaVDb`W|mn>Xhr4p-Z!TH4L{iY zxB081M|*mRTx89Q&wxF=~_*jmXjupZL!0?>O}0r;m?`YMsu<~Duhv9XUi zljV5ZYpRAZkRmgS)}cB)2f+7U7*2n9@lL zCyxawE_D8Wvk{Z9A7zSSQ$96ABT*JJ^fW5vyGJ`vqF~gS6pSSbW`krU2%`~#ej5b{21^l>8$TNil>372I{|x1OUT7rUkeNP`{m%#Oav^qDwxdYGC*n2 zyMriqM;;Bh{0OpSoE*O0=Y2W+bXgj1wV~4!c;oICO5wg=enbdqb%YJRZTIf(%zB^^ zHox*YEQ_a%Vqhs0Xj$iD2pl2CrCZjQq)~4HDvbJ3;+1QvnzrUdD)nAQ=~P&OghVxYU!pSV=BCa9_zynA2ULneok0cLml2 z-o&|o9q^Dl2W#T8kUO_1)r=Du`JN(l`+q52oj@?f)p$?jWFR&K)(u;`1u&jbT7pu@ z4QI2bKFjz5v=z!e2L;%P2>HsIEOYQRQpX#i0iol6VF73F(DA9X=U7jE$+}=BVNPBc za!YL2F2F!iTcLjFIXR&W>ywUIx~&AQeq+##-w&G7s2VT;yr8mX6MEn^BMk3EdU!t* z(}~5NXl3nZ9Nt6TBKap2p!K!XZt{LAUkwsxGxyRDVeG=r@GR!QPUqNA!X6Ih)baz_}n%O040caly6mCH{3}DxH#6ZrDr}Dfrnh%Ea^_i_7*ixMUSCQ zIGrOXB)7hTZwPiVZa5^R{ll<~%+55LOwwL36lR}pS<>MEA@JY0RRu|h!<^(UD{8Yk zCKl5_bf1o5ok!g%L$L5uUxs)K@caA6!_34nN-jL;FjA5p^q`cVw4Y>mDL+e6{!B=K z3B|sOxDdNEImX=F6of=jI9BTb?yTLi)UR=BCf{$$u@ZYP!<{tr+u29GUvnSIsTZ*| zUE90i7B-wFiOcnhc_~Q!ov^tU@W#x%*H6sTnl%s!+7z2tILJe&R1Pxj7=z~E%Ts>= zn~eY3SQ)zu@1S9>@p^8&8Z=PCT=xf4>ApWkdvBU^M@|)6MDV;5eZ{K#5em>1XO>|t zaW}}z{0)*(zz%Wv3#r*ect5J&{XPBo^Z$vN9^f_v;n1;tVfYy&LlN9LUMx+Ggu?V-?J>MZb&y4^~$w^DVmB1o8i z-iLalgJWX`7dzihuIXagX|LqVrSaEU^7KXwSo;J;I;++o|Sgc`WvA zHXmiXT~6mU&4sr6@9b9Ij*gY!wztwAvA4Wk!2ny>SLU4y)^2+gOTuFgoTSdPI|OI8 z6J$bOduU?THU>Ss*C12WBK$xyhz*^S!*0FA-o9vEKu)*0;rS+$)cQfh=HhL?LN6o< zMP7!W-luo}z}F=PMVQ%s5_ONduL=D{l2?}E3C=F!J}r<$d-HjTmz%#8b?YnM9h}i0 z*lz^ssQbQXh95=;^zvAuO>j>2i;Z$l^^ZlIQv+kyI;RH3COfCh*mUPqQEZlT zYFP6&kYDlczh>4mo-5z|MXzW3-FtdHPks0A(P~cWHsk>)%uO}O!ot+|WuYN;6ADxk 
z<`>@IFfeJeoYVQSY0l|kx87iHo14*ES5uT)t8q^EgIuTk$C{nf1I;+C#W_95j60%W z#uhlI@hzlTUQnFMcDOI(-D`*EISQYv?hTsxaAZq3vhNEs;qY#^AJc|+wCM*FaW02f zhvAv=A44iJ#Gi`-hIpqa;03aO5d{qMAr#~=N7@}#EAV7^a)Q`-V!Rzy6~vX@d-jdy zm1!7u?2Q95;_nkMEofk;8(_V3R837kqCKAH`kv?bo)`L_8`Lvz{J|dMHRE)K&`lZm z-g;Sb_DH5;b*oOc0N%D6Gt}JQ`B7PAY=tM^wub>?{*qNAu^Wk%d?(W~%9hQ4a-R*k zdsFBy_7A`UlF;LNg^|JkcgWqXnqyp^ems6&CFyw={eP|xGc%ep15H7eyVsBIjP>5# zYfqoS{$4u|2P81>?$!JId(Wi7j6=V(tbfR#&)z+#o_Y7i;ql2=b$n)7f5xMD*6*ar zHk5~BU|Gk!vX1MRVYFOhR85$0b;Oussk#ubWU;g}P~|3!!d2>MYbvLEVk0n~1t9)QwklA=F)my1Z*qX`?cv zDlK#zkII>-oS_&_)auxgMDxOmqu0^nI2)DUQyjxSj@78kKPMlx*MX&1OZoE$1%QrM zQbu!(ZaV0Nwb8NX^-7M(sGEj5ZvgtpJx;8n#-ix62)t}1-w#1&Jg{lNqMFRq;55{6 zJd;s3Ue!$m9V)`BW54-mV{;_G;z(uyiJ)~vNlTSlh0lP({=gj6-++2b%#l)adi^X= znTa;~oGrogB=or!eW-y@mOj+POx4EIU={dAz_$u~C!zj&)K}-=I~jay(I$#Eo~=+0 zj)j`?G*4MWpcw+qzeOLK31#)P%<+U(8;&OqGgW999YorRM6nOPDT9`Wlws&H?Q;$-n>phpE-D@ zFVTGh=an=ke+IeoKg=75X4is@=LERGXmZmr)Y+(`yP{c4gjlSLpe}rNeu_(kUWZGB zbu<|+s4&>u%49-WOr{BRDOv=5J+H*2Ky$Ov_tVUc%ZiIMq{+MB3()%_viwi$OWkwB zI-JP?u5a>dVquY!S5p|gZjZs7FQn?8VH>WRZpH2Q1OnPv=Aa6pZsKU-BFI;RlS7BQOlKYEY3XBiDpFesv%ui=>y z%*Og&`67GDhBIU8d*zF)D;uta_f_SKzwPpYc+5BOa%Gk)-w-?7J~jDt}Gn?#t zDhlK76dgOMLiu72e{*Cn$Wa54RP*-iJ?C_*@@et?>Om z{Cf)T^x-uM_pgU~g_rr-FIMYR!7xRx4Z_sCppo z+<$T&%Y5zcQFx^fzfa+FefU2se5wy8oNrtpYW1%N@o=u#5lK|+Nl@I2PmzRree(=E z8~#oX{DU0$0LYh(&X64V*K^?0a^U)|vEj`o&luq6YJS*@mo);<_~A=IgV&D* zp7Fz8-27L8XZ)}iJ6{!e#t)152|5J`A=%`)JO@4%__g!_l^GDSg zqoNbazbU@?RVGTK`F$n7M5%(Mv7$!Q%jRPQAX=4|vn+tCSo)Tz62)@Sd9*6#({1KKUGo<;HPpuH7S=S@&u^}) zpC`&qvBnIgd7r6~M#@;wSYMxJqE+PPk(Od<{gT+NHOD2 zwGB1%pn;p4(oH^fBp}zge94kb<4+mT=V~j2UC`WAKfh16PaAbj?tUEV{CcQ=`E7p7 z@Xw`cmL64Kvscqy6RTg=w7juqer>FFN$qXRn`6e(-WHUlEA`z}*C?%F(e0V(JVV=t zSmUQjG;h(e`5xU*Yu8x6q`tPP{OZh zj=omj7|z_XenovCHVm4>g+fitf1^3w@xxK3x2hUZm`0E<3&&yRRg(IDD8l8tUT&Mrn9Qc)dl@u50==4`= zxK95M8qT+yym5V9C6A<|<4z9zho7tA7ixH^hF_%NBj^-x z`RSBvxK3w+hU;{WYWT&V=gBjGMvu#H=W7&B{pfz|k6L@~Njy3is-qavxkSS&G`vK^ zf3D%&3wd-lYq-AdF5m+$(jSJ8*ZzdUDd%@J{ANxDEBrXc&&xIZ 
zA9A$+We)r;jm~h5&QL8c)%7q=!*zKk<-k`doa6nvMt`f;UZ=k&2VRH1yr>@wA5Ra> z1aT3!@$v8n2;w3h!N_uEO28~(``#*wexU~HOl)!uN13a5i>^d;KdjXa&% zZhgbhPm9jTf@4;vH)m|?iQ~1;KGQyFw1b~fjmv2#qw1PsJgCk*;p`YkSxf7e!IG9O zkJXRD+i_Pk#cFW?B#%tw$oALup>)B2{ zrhiGk30OA%?WM9m_Y(T$=l?xaX5&A%TJ~b@$&36sMnC_%fMw$^k}}kJze}MgEhJvym#aVjZf+iWZ|QG;{|nGR+xRQz z$S!kLPmZ5QHvaKfDgNp>N3}MLmNBx}K;g8@>xtvtZ&v-&4OC&+RenT1d;PrH7S$iOUe|Edzrj_x*s3vM|35;}$*H#It|=JPyp( PeVkLi zl1VGT7zIg*FXQW&CVZPHZmJ}NZ5SoNac49eiIZr!JJA zuKVF7Ud`6y>Hf2SXFi(wNan+tr62xKaPf27ski@_=MVfAjq5*F>km9aNYMYqpvE6~ z-hFoR`HcH)=ksCr*~8}!_u0kgHv9ZH>$yL09ly$Fr=l=bA4ErQ{5Cy|mZn;wwp9Nl zl|AqVDx0kJ2fs?X{x7z8{$N=-zPnD(57mzhq*OvpP2c;=|F%Y9=Ozfd!NU0UzUZ3N zwF>@JTXaKeBVW-KsjKu_le*gdx<;>SQ=54W?k)YCXIV=Hr<^+ggdY!71&s{Qlr8$EK$C z+#bYhtQnuI`D9HPUz56>yqOP&@l~l&eJu@%S9xJPkh+QYZvJY@KeVo0uTBS1Hnk$? zzp1(WYbr~vS6P}T<-M2R%B#{`-leJLAR0*Z5uj}SPrTB9gkgMa4(0Y=L1_8Hrfj)8 zRi8b*JJqrw6K$b}g}G?gp<_Yu`t>Bpy5Y`7-KiB}@gLNdY_y{gJxxKGXeiUab1LU= z+MY}A$oWflXvMCeY>TkNMP`o?K9518TXnQ8QYr~;qWF}*lj3g)H z;fyz&FW$_LFizBD{3W;K{Y^K8G<`J{7NQ+t`EmhGmWk{mBtQIAEk!1S^0}QMrG)X~ znvfcMgYqBO6WA5_n0uh}4fD?WONMEc#wx#Ke^9=@&$17suHx`TM}rxxu!CvNKt>Po3r{~g(y zn%v+^{@S}$Pw|dIG`eUg^bTkG$LjonZ>qANB+eTiOGb0AqlTKiEk}(DU$v#To)b5| z?z&ojW=Z$a3<Xj~7#uc675#@nm_M-(hW9f2NR&R=e|SzQ*ME3!sX2_E4WmOU zyNh|Cp3Z`Itl<4TGqs~Xn`-h-i0?igf6uAnv&Z8{PZhuPc>LWb#|OvjKXvt|Xx<@O zINkh{n#Tw>rB|kw`2$T1OCg#FKqkpdv@c*CbP9gE!f3=g2^244;q9aIQ}+Y{YL#ri zLyrYLA`D29se26KiL53sc#~EL(TW7|&5Aq{Mh|Em6R8f)j0Et5=&k^KU^!<;7QDk@ zbeE8YMRw21S@4dA(PJUKv0|rNd#a{tpzCQ?6Ah(z>7|Uv;2gkHgTVTEF++IZ= zPdm&}{!|bOnvJsLw7q!ixUlx$)wkS=tk_X3LGKO#idB8E}?=>4y#j z_=fPa<#WOKD~0h_LTb19oBAD^e^(YX)tN0{E)*XWk42?aaziwni$*ht4rdwmu1s{N zLx7p+=4>>X@i*O^2A^*Q15V_;J<&v1{#1J=eH-Zewhg)HDESz-h1uwECYmBA)FodW z%9lS}pXq;aDu}iT*N5fr3b9kzts#{x6N>Jm1^ME=VYEEenm$bNhqK;}^kM(JyTEFP zD6>U)KcI!pLGgz|j$t`dAAsesELp?a0wQHZlK1ahJHi5j9t z723|nwjgRu@tzFgwhH@JUwQ9edR;#P?2 zL4%A2ICOFbqwL3Dp^E{dzARt7D=g1TW#VjVN@So*=rK!8bo^LY9!T}kwEXbnwN)yx 
zV_zoz(3AiWQpQT5l?Swbx`OB^==PSNE=;x3?TZbpgF`z5@BZM>-C;b>IzpFM)Y7Xr z2i`3s6qJuA%Cqg48Rx5;eG&VuKy0gpYqp=vB<}bi}KM+K{T0PaDTpkUm`!e zuQuPm&CB#ZT~`Y7@Pnmy^QH*|4y4L(9Ob3dYd07nXT-CiL1TRpJc&SB!H&~Q$dE_ZxDWN z6@ux33@3x)WLVq+5&|%pj_nZKgdho&7t|T)ZTx;vw=~s$ZrpZpX6kNHmWBSn_ZZ-i z1>X@er>($w3m8D821*XpX@Z-D7iYhhYb|?Fd_Gv8j}DMyJK67aIeE6F$Q~&hxfc!P;d-42mM}zO7_#poOz;bu| zdsuEECHPijGy7q=pMFJ>)8KEJKuuCm@+GyM;V_1>CGZ-;~4p7VBQ(%WIZVP)ZrGtn)^dhZhI4hqGa3*x>v zP#he%Lt-Gt(Mf9p3pY4YC;D>)kYx6~<8{>zMzz@UAT}t3VKbiQN@xllF2G8-s zpH^bYrvtIjW&dPd>3{M}lsb9NDJ|kTw}hOQ_ycm!yTlk~a_3=`)u#P{cOMar-y;m{ z4H*cFLo{t9?~NWj(@)O7goybTF)Kq#8B(XnuuvHmo}8gg8QLs-{CZmY3FH66;>|NE zTdcf`|0#%<)y|0c0KLBCVAgL)D5h?9%sh*kr&G4zSj<|}6I?3k3r zq!a@!8LFQhOV`M^@ShkV5_l&y{Nj9ii$A!)F#j;j|KqvzIKSccjrl*L8=O0M=PO_v zpr(rUuTAwa8jebXbdy5Ofxq@sV)~2lWP+krq5Tk|v6`fX=)pqtsB!=PriUHxe@MK4 zd%=6;Kg9MoIkvxfCflE!!S-7`>}54xZ2d{`An9tn#DbIJVScOeV*5{sH}=8B8{27qZ2v*y_?HVkR;XK(anHy;-6@I)qDRH{-zT=87}$RO zKF&*%Wa>Uq8AVo;A7{av6baWiE3(S=6R8f)j6BZvTh7^$$Ju_1bPYOQ=VV3HuwtiM zd#a{tr0NA-G?d<)ArPBp`pGLM`N zcaw`?&!rKFZm)3f+4AWG^uWD~AFW6Kk~uW#aDKEcnuK*o55u=-qHVD8@U90k-obqF zK$UgRh;^R?KQ~8*L51hUiygxpElwF1KLUbp&-6bwMGNi{au@3F%op!-s-MTR$V^4k zS7p<;WY7h%Ublsi4vK66?cT^*s>zo>E3$D!bO1t87NH>B-7{%~*z+#v&7mc6!ZIl_ za|kPs!p9mVMn5S6u`DQG*b|gL0Gs}(s8HY*cKQO?zqJP_JSZ^@@YA*V;U{Zp<82{4Hq}a4_3jD|?I^_aE*7U9HwyFD6yl}V zpGOiRCs$U`7?0fM@pYkh3$biLE2_j__;A? 
zTMYrwl{25>yWV}Dvzzav`~Ki_d^fr8kC8Ch?7siv1-@I{_a`U$Zgt;Q)kR;;qZbOj zsX}_xAN-!7_?x`2t}&%q>de}N4}PBT>dJz7Fj!z7nzQK%f9>lSeaGH+SJ-=F7CI{I zy>JLnutHk4ne4ro93Yeh4fPgx1ApJnEE<0w!0vu2Gj)%MNV7lik4&2Q`xWBvo2aG2 zk^s3TK(wpyFghHduv<@YD%h#)TCMOT<*t_85WtP!$mA~e77b;l?#;v>FnCp4Ybgnq z+e#|l4QuajdY)n?`O}(EOIu;merClW9DG&9)LAjEm4g?U`oUOwP?Q^`@s{zasa)a1 zVPyY7GW&@dvssk_MK~ZUZ4C5$E23I(`NDGDN*ZP;Dvcq`UrAy#&9TU@cel z6TK;T_K@Fv%J85Zh4SjnfKH*jPMlz&ynegAHjeVjGS5k4a!p$-WU(TQKtAG42i{2T ze#jvT!Fzt)ebVWF{Ui68@CQa0x=-|j&vd)bIsRaLiP|H46+M?(@NfqEMy7lr81Bhj zG?pu0uW`*tnuxA)lF;^C`G-#tm@AjZcx@0}2qZx;F53ynGD#D>kxz#7nm`^nRw%v{ z50o3KbLh+W6G9(zoQImbOQt!@0dp8`CY6oS$ziDTB55#1Tw==!l zU$Q$JjS>f&-jj*;Q$!w6M^D~N5oE}hKh)_g07JR-D1yiCtTz;v&#X^R`sa-xoeImt zs$FaI8C_#GjePl2E2vvq#DE5&NDAWlYef_~veDCGE0&!T&;Jokt6{MoI44fnHw)sB zM6%6rnVJao z&A;%CIDa#k;JZQH@>B;H<_)2DS3aJ1O)k2Z5zX{J_44pu$W}I0n~PVbykPj}35KmX z6Q9YH18;d0ro&^k;ewsHMGu7wwnP6k5Ry&LjS~nQ2#O+>XeZn+c(Z}jPNECmuqJ$r z&)neM{@NaK#qpWRAkHL1xCj68;UK+**HEs0*dMs<_q0-`YW+)w`3mF01bT$w@kAba zmNA0%5GCb}p#9K&u>jN#!s<%@i)W(rL7sC;@8dbQ)aWPQ?YE3xmn4bbeV>2+eSSlO zpIls<^;?Fre)mY$KcBybga-*v`pJcgK>F>zg9!c_=2apr$4(+1BZ9w%`7?9uCSorU z{533`839|={R$EMH7uSk%5T|8L6lQ1CIHk2Bvh#dehaFr7L22T`x6RXK&!S1av3+L<6khoN`THrkxc)N1xgZ|(E z@>&L4P8Lm6k3abBmqpUS_27i7sWK@4s~RmNRfAnkNFlw((1af^VvQPN&qogiD9p+? 
zxW!+*(h^9Zn|=Ws48h^uDlo_jz&Ynq=sWDW^(jjn?KOOqn%vU1~6 zYT^_A{-Hv=FaZKUZObw=we4x89#h*EOl|vqo^?`F97~km#dA*SEj;IzPV*aB!M(|h z-*8&Kf8;g43+er(R+njiAMdl*>9J9d&3X*!F{;Oy9uuU$Xf!LiG8f&JOAnV?^TmTG zySV=A?_BlzeDUZ=;yV4T$mLMjwXXxu%1AOwuz0Ta zX>}sM{Y^L6tcBS8#w-E4Ckc!24OiHJ!XEu0IPbPx`3IJONf20LD=^`r$8h=Ih~ohs zlp>&R&A=jH@(N3fg6Vbe|1A|etxe5)4QqZ7j5*rjUw(7IqGNoH#cNb?C)Pur;}54H3M{@#6oG{5A&>Pmk_OrmrS&pD;v;W-yYe6rq8 z{^pH_G`nv18|vZ8X;E_FyR&|m*8O~#`J0o3qHtDl&hPp;!RUBhaGQb`EBFw>Z5FH| z2u-{@<9F?>7U3rwDYxz2e%Jjh@oM&Yge_KGLOb&;G)({89}q252^6Dj;T$!gdrmVc z!ClD<=CtxG&FN5Jr=MIpr-$F+oE4O!^cTDZbh)?BSC_&$Aj;)$0a03qy65o4@6DB- zTrg)dzgNCxsM3YfoFRT^-vZpJ@!fO4j-|JZS+keU0WYq6%S5HU;hYIIcJM>86{I1f zrN5ER0EH#^2Mf#UpeXwj%!ULm3@UfE(93*=q%VTnFFAX2XhtnD35jQTH`tV!4Hx=tl&G6Ydw&BXO^V;4m{~kyc-ONv*Z%qq>`h`bpbkQ zS|;q8r-(k@*=V0(*er;CBfn9JzLM$3J7@5Ve=DwcO=?J?l5z^slO)nQtul|<=&l^) zY$PZwUH~VGp9P#`p?qN=M-Hvw4e%Rk*C9Ekzz~w;NQDK7CZod}EIitV9P=Gg)$~7# z-4JC1s_T83^ega+1vQ=$A1nulEZe@j4@fijsn zbX3kUJrp@c7~-7TPa^ms7VaBY(yjx3{28ngBZz+_smU#RDYs~EZqXESVGKSY@q+C_ z_=1V6$+U)dv>jGeC`N>=9a#?|P@xGhm_wwE?#y4t=bK`mI zg1R-SKGFtcRXt1Md4Df*sG17iLuLf5cO@d=k1J8*QU!!RkizuGZIt`sQ=kL@U=^Wdw zF1g02WvEdy@mZcRPw7KU>=FKkT}|kjE}bsE(uV3FU^icfhe#l)hf*_B zm!GQlwe)_l)4rHlJDOs`tY%Da#GVto@ z2e7m3&v}RV0&E|EV)tMXzu#YbeXlZ`o*5jHkM;*KtDAOJ!A0UnHrk#a-i=D|@t0*Z z(fsf;WW3Y_nEZrfJXL7m$2(+&X_f)AT>3>n{@)jBth!SjswxZQqIGj@QUAEdxE-xb zsKHLVM72IE1%YMH#{>CyI8MkW1nwSPof~Pski}%)@t&8!rfk>%*I{*D8+y;k!S;!Y3$l3-O!P`tZ;2+R14I0##?%4fxRcnVC8Bv3I3RiOrT>azhGTz_y28np2_ zQls_v;s}WcWG{}8C{TKPpcj|!o0RKj4jihlTDA$nOzO@*;IuI7bKSY}V57X6wlod-WQTY-~6@3ye$)F1h1E3hy~ zg!BifksXE!%0KHPDB5S9J}SAULJc8?MhhRBqb2OTO7XA3azO%BAYfIkDCRfv5&_Ic z56QNxx!eHRS$62~H09d`ainE}QhE6)B8-lRDn>)m?&uM+52bfT!=^2fzhqecKv#N~ zzhoD1Fq$t1XyVa<2Dmy&7g*KXm%pYG=_tx(NBD`dUXpFVMpO~&Xi?SsM$(3y2emd_ z%JK(-@{dLI6xAD+dr?E}WIiU;4=KJnY*gDdB{N8%c-OyO1_|8buT6dnoL2SOwa=Yo z);~g5yDuTVc6}nX#Wa@U7VK7)J*e+i$Q$C?3JVV$BOC%E#6b!))A{Imh!7%!5h2kU zy0C-4N9p{X=f(5BWIAHOhYHTeiL!*!KC;fHJn2XLf%^zmePBzg1FGjJe5JAw|0OwO 
zFoxSH{vdzCK99mFt_{)Lc1lQ7H2~*&N1-d!ZZ^@axM>8{;;F|uZA{lQGP52z{a8Jq z*UN-kJFRu1Wv;Tu^zyL_X1Z!ish6i_l@7ycCx z?l?~RtaP%PEb}*Q5%8vW`b&0tqr9>0Y$caKz!L}MuWcPtnXpfin*~tl!?Mw{kgEPC zUJhfVDjG5a1|(vmC#t2THN=RHNl;ch1XF;r(=PxOt%MQiyf$&?q_!4NPQco|ju=sx zA>KJmJJBN~6N!}X>`p~8{Zf)P5tc>Fx4QgI4wSMqJo?qg5sdgkxppc83>~Oz-Oq{`SElZ-4lV^tb{n^ z6t`36F?1*YhMMi7!CAzuWW=q)Ml2darr&Gu>^(qrq(|(Xq!>7f0HXHdf1||sLlZb` zA%>nYlbC%Ba^;l*p0%Yjc-EEP!!uEOC(k*hMxG}1|85YUo%9iJNS!%I# zyZiixW$XCc$lqrEhWH!hZ;U^Ls+||@%%ZSP|Gac=0VYw*X(z-h^zMc*u0df53yO7YJy&HSHK;TRo6R zJKrp@R?mUW8EXuD{lR}Wxd~-2X$0~48H!=Cb{nn6eQ;oS^IWne2^d4o2&@@DIw>ZE zMU^S%P|*KKt0`B$QG_g0j>mXyV0Di221Pp3B=JXchi5AefyG@59u1>COX9YeFo4`K zOJVt}PBLz{peqee7tEE$sg770V?op)XS^Tl%Gv)~F?+azV*rWj}M31uow&V{z97Zn* z6IA7eT!j8@ANsce$2j&s`LY(6D$p|R2s}K7Cvx%UJ+wwDA`^cKPoW!ZvI86_0kbVZ z@s*HeuQ_Mg-fN^Q7d;o?z>Q3-aTa2f{%uLTv>y%LpQ5_MIYEjzk;2W?mM=aak#kvq zrq0|-HJsO}L!y!J(LLlaEfGxf6{!vitTGNWm#EZpKUz+Q;}G?nSk)zQV>2AbOf`Aw z&ryEEcJhSnHsJw5N~vS?l!JsK;hK z(3v!xHLAy$9w_MUf$c;;mx z^D)Om9?YIj;VV=)i&V)(c{Fd*%w1_DA@AWa`+VTBHKzly0kruZrH{`^2KX=gmyeiU z&e<|X{DHrCo8eSV*hl#+#4AzE-GUH?Vr~S*+%qWVZlQv_$@d%G7mB$)_f=QAfS5$- zT%L0@eN%HwZ$Srzw2*xBPm?4Q_d+g^?E`cCS-m+_WvY$xq1cqUC&{Zs06hU zw3whnXyq)3d`gyhJ=&|?)>E6~g-{n7?hbK`xLE`{DY!Y*joSm{u= zw^2VgTUVBQo5|TQWSuB^L#X8%5OS>b%RMR!8xVJ_*(DFnTzA7nrM)XXh{)qr&D?S{ zb1SBGaiYs832I(7V7Nt8IwcJmo zysJ}^o-fQ-l3qD7yg^Cu50ZWlog5=3om}X>6nf8R(ntKkpFk{7u4%^M4z6(M0Ju|t zbaR5kEY=7Czdv}sY4ENwBuOrzQVPQ*;7Y?x(v~SK@UAi74?SZnu5l92~nXP4U2muwoLG| zs?ZLjePW0@@C$v>7!zmSWbM4j<)#@iYeuQDQszO5^hX6%=U zCuUE~{O?dc72pM0)ybjGsIqcGuFV_rC>Sa>PbkiP!m)j1LW^UW341Oq|CO<6orW%& z88Ji0VfpJCz4E_FI4MYnso)+2J|`K2Jq)GH5K6SU@I^M{U}cJz-fWb-m&{^Jg*o&O z;=Zi%){@0k9#qAVogyRch$Ekmc1jb~gxN|+r;lVJ^h3Obz}jK+#`rFvVJL2&R*m_a zcF~(H42}Gv?!&6PUaVwum4in;_LZP6D{geAGK}d}!Rbfco+4H4sa+N`J zCpA@TJ>Y6xOsy+bs}r)U!P3l~qH5E)VYv{%_R$7pE~Iy?GSju~lFTI~+$ewAx+g8< z9p}-Lke)OPgO3Tq#U)cKU4#8mfQD)6Ya+tO`#0EZLW*O>1n3n<9)oh2-415&@gI`| z6$kV}DYDT~hBS-sfRjw*)HaD^avgD=S;GCbx1oR~`6xk#E`oT?EG?T<_p6Cy=9_rc 
zuvrDYTmKmPwlLL;rBK#F$;ko{m;o$xxb=(V4=FX>#LYD6$ES<76y^F)mv$1B(H5a= z9S6k-ssV^6f?oy+{iq{Sd^@orhZkBddQ?u_t>ScmdAynTd_+_19YfS({LSfi1lN{S?W1pEqbX!SFO)g=X7b1`Vofhz{r#$&2 z0k2p#k(jTJD4PAoy! zJ^y{bf><&)h(D05XxLuEE1%vfu>^&j#FCD6{2`WfY~~NKq+^sn#FCB)(p|KZe#q5Qpy!FZO$lR2IyJLq`GdR)N6;sX_g$JrjA3BmHZfY7P^- z3M9!6gdf33p~BDbfixgvN#SVSLyovVz+x7q}N8vvAl;azhGg_U?~f72dP zJ}hRty=s?{w@;JPGzqoj%ekJg+=J(qI=qL$lf_CSmW%MO*ffHd5eoR-8E?{kvD*SO zzd5A56PGL;#@4nx}1c3ABUNr7UwBt!4iI{xpFX;YGN(>pC7DS9h5MN_>O?!5b z0Gzf^XhP&mJ&^`RI7J2Vxsgw)H)tJEPE#&hQ2b|9x>u4%Zx)q7haVT;Dx~HVA>*?B zN`cgagmwsmlSC&W85*5A4z;E?a>!&@Q?i2Ls!)catT5Yx;z7pvPAPnaGSr^HyVFdY zjOzB#(^_|`QhbfRlxI^TsHV9ph@nVVY6p-7Z!320AK()F_nE2tV4FJpflY4#q3u@H zppccIqT(boO_}@o6`U70UW3yyzF@Fu3^_Zdwimz=gYO27(3w{4L;DTBg@f3&au9sN zj_^&m>TKGIM@rV)%D61E%}}&|7uvMb^UgWR5Rlw#g-by9e+P4*i5_suBym!pEh`is(bUz8y>#A#2hfFUMuMU% z0>|)n91#zFUdjlE6ZX*kD^0s^I6GFhm%Z)3yk?Xe#x7+!(UtX zW=)#oImW+1xUe)xk;RbdzsX}w;ka+Y5_{dnJ*oyqT6}}!B!c3D{Jt$;WHnbfHcj+K z(y^5iTyk@OuL9f!o0xAsH*UOKyk~&F7Ew~`iPiIV;j;)AV(QQ#a>@|5cvEKTez|6Y zE|%g83v3Bzc8Q(o6yMQD!fCv52vo5QvEj!udl=&AHKf6oK5g*<(ey}&e>>lq=-F(v z6_oK3=yi-jm`b(~I3=6byZQ*)d@SqT$NXUOWRe2&a~l^<)8HmFzxl!&Pvb*Z=fsB@hZ5WCNb2 zbH&pNohzPJ=v?u%Lg(r{ty&#Id9AN}dYcdm$%Rl>fKXO|P*#9YR)A1efKXO|P*%X) zY%_$inHItpmvl(Rt=YJ(E?;~p8?V44WMAmri6&Lfm}qSdt$I(!dlY;^{Oq_)dSum` z9Xi?V(1|uAz;kTDF%KiIKX^OngpnR2R71L(_xQzzU0C;-XgsrEmpqqq<&(lI6gE7S6C~Wrc4>9WcNwv5vHAeEd{pBkzs=P zkaXL51B$HjUW`+Zkj}6QTT?(RV%1fvzi9#~YRNuto7@^4T~olLz{v#ZTw3Q`)G#UBB%4Ej}uj{yOfU8H|mw!BtjFT5-;UJ@q^d&0$IZTGO>p9W-a2doM1u_o8R3e70#)H~#2pcvk-R5auW7K%W z<9qyb|NNiho{T1scr%*A!ui<2)T-YM>#j25O2@a_2HSrmL)8Hpr{=`}&BzS`T!-f!oD%27 ztOAWm-8M-J5!Bi}tvEzFp$C8P3)3FSQs2xF%-{evDo17O_l9>MXMvJLEz#aNs>2#Y z$GT}-ru->1a+fQ;P@fiweBxAN)hCXhX3664BrM!<=3mlJ&xLIbzK$8$c$l5w4) zbd$agL)7sIIJYZ02KjY(EE<)JqPFLY$B4he@Fl~}sVb;4@?|?>NqAH%&yLinoFYMD zXEf5vH7rP?ArfWEWvSiFCyzu}9ut~P#IduK^g2G}TZay#B6yhJz;-xFh4c|kL6U|Y zg5AlNW4OM>>Xg?~S3E-(=j>Nxl9HyU@=oe4^<2e~ZnrAlPM93&zN9#~8D}v=ZW3do 
z-bqm>eiUC2oeU%W9By>T*|0LhAoHWVZ)Iq@C2_XlN#>Y32}LSfjGCCrbHS0)m_ZIs zP}ZsN^gcw5?tdn`Yo7;@jszW>vRT9a?vdX&XKWQ_8*}{E#M@NuDAY_U;5dQ%p14f z&TqbIORIR+mHv`vq7?G9{UYqM|8?cKAv4{Y+&2N{Jt>G z)GD;mLfI6r%@7K$6H-g`@$6!Iy)chL6AGPYp=_-ube=-zD3lUNO9_Np33C-X-$JR4 zP^*oN^TYLuNm&eA>WSf-k)gK?!NRqa=xG=ht`5NxgJ&~i<^hAEat@(b)sFa`Zd(J0a{-6*^7T2Bb3sNenlrVO$^}DZxb{arV|IoOWEaapcCj$D2^T}l7c1(B2*8w(`jp0fID2zj#(0f!wn^*Rqjr`u?q`UCPVw6*#dwpKfM&Mg@@%+2~Ox2Sn{@?hrZ z_dK2apb=TD$Q_D&K#_VhG`dK}VE$ZKEseq+jq)Ci@N`^CImfZvbw9`B{O6SOcx;3vK(!W^m$fIi&43G&}TgmO+L% z?E!}r>PpLqF}w|GXf@RE>uT$%@HR_Kh;}B1CgmwmIiXeTMp;#CPAIq=oCoqIw1T|} z1$`3={wB17z@;k@qFyW{-nIB^Lez`bh4CNP3Yj*pq_Pve1-;rWESXx0O5J%AK!JU<&k2`h}K$p{3{BPtMm!%m;b@F_f!e z{%T{N&(XB(xMON+(a`umJ}5rbseTIaUwYvg*Em^tV0BIg$e)kj$|(|yhR|V*KfhmT zooYEB|4-67)`qPLkj;FwZ+tmBz?F9U_(glH_{W|mi&IRGGpXW}Bztx^?>#x`RNU-h zl>bTq!N@MyogdzVMvJu4#9=V?2QFu%Q6ZYiGKltb6yOH!@I!pqss+^wFGxw=3`6CJ99x7~5$BZySS(WQ*@0`g4w#+l2~kjmDydg9A8{!edCn z=m6XrqPrU5#zYO6H4@P-x* z!Sj8EA-cpV3Uxx6#$|9fS{=vCIS;g%{M@*=ODqE%rkF!9`ML5Z@O=BRxV=ny2|DHP zNx5JuQzk~J!1k>b<_E%wwg=Q*{D*} z5b`C@a|RX-<_FGg|3X&uhD)UMWKN{Gl;`%f&QNttYO_^#O{$MKjIkvr>A1ugH02nu zE|o;kYstE00Y|UIsam}iTXvr6mEdX;7>k@iR)~9Er^UsvqR@PbKO1ARFd_07drFUo z#MEm@%HJBJjj}8$_-;Meqt7@Kr-dn{D1UopJyGNk`6NaHw9BM&9U`|BMt9XSZjwyJ z16tW!!YDmtDm%b4N-z#XCHfx5jCTX61vH2J_j4t%?o6`{4kRW}s6ssiKOdviv z9830JIhzIdJanyeWm+;Q-H>m=8w`e z09r9O5W_)IX50^>r-dEyt+)WJN4g2pq5FPUA->@EKpO|ErU1Fi2vR}}tP-a5dpqyE z_mHK!(p(uep`&w&g&1g5HBlP4Z2ak|ymx1qoqhE#J9%FCC@}u%9rk-0=|9u@3BUbaw|v6C@IfNl z-*tD!UwR)|+TXQ1=P%t)_)dA7{ej;=nS>5Oy_casS-OqCyRfif(Qow!w9Tio{}w{} z>BhG~zf0dp7UE)(ua)0u{)At;$NjZ8-|A|k@#7TPyv4PEwmnj>cD<`oKNZ}sHds-u zJQsh%ngctqd_4uy7_|YOVd?s$TEy49^*plcmAdq8tNPn8MO&L;#y-9D4eZYT54Gak z){1XiD^%@9HzaC>8nw~JMXlJVR%~o7$n*a>@Oc`Ep83J+FTq^8GtMyg7otZBaq9O4 zGX`P)ro9F4Ifa5p@uW-6|MppZi{DdoQ9|=xh#uC2ibVgMz2*Yar%BW>|NCb+W0k*l ziHb+twwlT)MnyD!TZIY5H~?>Zmkv&AI$rMGBviEWQ1ssO;YsuSJZS2`_Pd}j0g*6w=5z^bsIuj047D|25R+YFPVTFSC0#1Mp3;$0x 
zz29G}gFgyIy0oyZ;Jrvb8a$zP^)X*8_(ZIHo%Cn}vG`m0=)@B77o>GC6uxEwhi3QTYc_$U9i9)qn{G3!^o!|1v?BWDLR-EBn6<7ai z#WkL)xcg5kuF;A!w5{SKot!*;0+^GEOCXWKoZ9de9k#E|a|K=w`D_1L<*{sLFO|f% z;U3->)?Hw*4Hza6_5~}z!SB05_8G8zUz^U1cCg9Y%#9}$CP+D{FjNW$PAE)(Vucy; zQDJx4`Z%#=^G+&ET8k43V;1IFVMe`F*q2T&OdxVnVYKY%355wltS}Kg_Oh=^pU_2t z#z_Uys)rdU8!#xF25UYnF(z&_@SHm64Hvu-?MK2_*|7?E<>F4OBi?atID+~DC);|e z^eP+=3j59cB<>~gv`bAv*)vOF?~L62kG~>^puwvkbr4Q*qLQkHc#GFJ-F>tAf}fPR z&dOc^JzhtG@~7DVzdu{PbR$2O#EoW6T%IT}d*qyUzpB;89)FlPu}(ULFFMLz zcUj|FsQ_LTr9RcLY360;UVD?=YWW4H`Z5aJJ#wd&7p*)cI>{p2V4vyPNXHT6>a zTwKuP9`)n@f}aU|;xdxTlXg^Z{_!cCu76c7lKuv<(=IRADMP^3MB-3el2TN0UsD%daZa=?e)!W)Z}HQ1Z*B3@amIL@&S#6?m8m{5i$NMwZ{tLC3Ke}O{g^-S z80mC8Ol6ae>aKc0&aQ}!=M0wBMLjZ}oU*PUPOv9W=73780|sr9Moig;c5|e4R%+Q` zn>OG>TTNN%pY19{H%|@nUpi3H&EFB)u0l7(f+Y0}HW4q2ES!5?x3(Ly)l^m4R(^GF z@SUg#%=?Db1H50aVcE@}6!L;eW^Y+27s@NZ5df7?MOm7p3pwW`x5yb=o(Wuk__`skwR1*aYa;()>Kl&l?*B}W3 zppmyI)1vQOihBZyM>1iZ`koDXnd zwbQ!Cb5;Vm;$(c6)9eHMA&0bGEWpEQ)+`sU4PzfxXs z3C6JEHTw@>Aer|xEbLGZPtJr(a+hrZYc7>3113hE&!t&voi{Lfx1*jX|nz^0`)X|)(I`H>=@`ZBp|GF@g)fc-jvt7JAr}Rg} zqpZ&KebK8y^P2o0iHpP-NU_}TKNFejVNYscHQ!MqCNcX>tLP@9?caje_goaTl2h3& zqcNLpGvde=x@@;A+q@Z3WLsIu_Of`?#WUi_*6p%AY{b!`#$}9wJw1cAl6m=eT%FGu zgZqz#xI0CIRxT#?q{b?;R*|)q^@Qdswob8imUVW66`4?E!m^&wWW~-=>>SHFyU~iA ztH`+)iF=8gCDRF-3vd;ItttVrosggA8BL;Lu8 zzNDb-;yNcG_!c=(Xx#C|;IDyUy{4OuYqm5QU=ry9aQvD&nq?KYir6VZb3T1m- zQoYij6DR&rY?J0Gj3K%G`akf5DTj2fSVPyC9(Y%e+3_hGX{(8XqF2V>L&CCSXW+ubUu$oVhwosvs;wE^kJw_!f{Ez)#T!7D z4ObNNXyAprP~Ac7{PO@e=Vy~EUn0QFl-CM@i9gXA0d48L?=AoZPi{~G7+||TmbpD3 z>y+jY*Z6E>&YNkZa9Ch6QBZVXCuNM}kPx?S6$$K3lrIyEh2^vHE$XTVvDv1LLtm^Q zH>Tm|fTt6AxEzQk4yhve+VAkLZvPP6;frZHcCy@IN3Dj@9mjDVqEV0A{Gr}Hd^Wv> zmCZyrac??e4D9}__kdX1Lm-Y;!(u_5!(tzqBf@}msS!?cYZ1q(i&yN;fSzz>v=f(5 zvfJ*j3OF%i!;Ja7u_PXwb24)^>Qdo46g=5-tjgREH^DLp}Icaiyq3vXJf{| zrb>)PS!T9YKaLEQIj!;dcL`_Qg&sf}VDyR@0P+w*dr2aM(3shc2pC~dcgRRoabq$} z<|JIO7pxgbYn{+xCm2iJUcs;`6Pxi6ud4`5Ya2v(hj4o{H~9#uS>wIQe7tfH$8M}> 
zCX)-5ajp;w*js3Ze&xj$gTTV#Zg3Yf_=v1n=f#ODAw|7HXPCDbB=reXTLAkz_%@K3 z;h`iSmHz5ntX^2z0gGgugc9yxM< z(56W{LpZ(nK^SWscG@bYawBm@qY}hL1V;Wb8=U)+WkZxtP3Y@}X?W+!% z#>jD3+FGiC8hijyI;G~Jssb9{_CrxB%#NG_wg=8o0PeO4T*#(@otg3vcN4-!fv0$B zhd_DEQhdHiuA6;)%Z6Ox|0*8t6=nvl8wBlTl)VxUCrC-MQGPP-|3$>2_YC&%YYA#E;q7gd=~gmi?dMvXgw&c zNw`Z#uwP+jNWj8Us(Yfj;PbUW^qA;Uprnk@I&x%NH!NSkS%IIn>|hIMpZhoS*M0+pzYkVY`mma#;j#fk^zC_PErwAcV3*r zqP)hqI)2*;jYH((8w>Orvjg*u1sZn3gmmP9091AoA+K(8?aK4rd9t*x;%A7KN$^NJwcS);Eo9?q`iU6ZFDS_##}Xm#0n3)1&>SW!*`q#at{#u6dEEgHN)F>r`xT}!0$;u*`?~ux0|z)KlHG0nN3la6ZBU^g zPiKbicxfAE=*y@&3N-`u5Qbp(2qxbvJ2Wvmf!@r}cbff~jh>tVz#ccBjvkc#nY1gv zkqqN=lYEtaRr@nXhg!8OztKuDZO;Bo)idnRG{BjkM^GfKH{&x6So#JtK7SdeW^1#I z(O>=*E#TGy3RwAN*`C|JJZ*bkJ#BlwQ?}>TvOSN;_PkoQ=hdx&PI5m0HAr7$D+!N+ zgjTm{bs@o0v&Zw!&?yK4DR3XVK6Vhw*Qj(=2*79dE&essS>Tn2xV})dr<2&*@D}-l z(7R2lz4_n5;jNkc9PLB#@*L@*4CP-Mt57J`8iV9>qL*1-E?;bTU@vVSThbUErwVzy=REQ@P<_x(f zl!iWYv6Ut}ol1KaM(*UMg(sC}X66$LqiOT3Fkv(m_RW(ElT!4g!kiKOgwkkQo0TS< zsM5|mxikkPCybg5%qNs(aH2sIPE}cTrz}eVGkXRo>;99=YP5E3td#W=-X{#1)UYR& zWv1m53Zs(?tuP^VHSCio7bdOk$%Vg`E;K5?F8#pHSE|WUQ+c zCSK))Zql+-K!$9~Clodf8LKOW^`BhWDIr5GJ9t82(|{rJm40N^ADrAOFvkfKDO>!* zFu4b{mN;VFP{)r&p;prrL|LxdmU;OmiJeUW=P?)7 znz1e~qYd2fi56P?bv@;6ro0mj%uQ;jQw#d>drj+rNtxRz^3lUmM2f2lQKr}s!v#(7 zTYJJWDT~8FqL%5Smr~3hk#5{<4z>hdO+2XKm0UEeiz#%y1PV;2xp=x3!p+Hcf6d(# zf6Vq?+sfaJ2j0(;aAJYoJ>gAAQ6c;p79Vk!P@ov4P)-++cN#aIKZ)_~8tb?m_3Yp! 
zu>m#Yfy3;vBau`{i?#0>LwCm1y+|<_wRD==8-I&t#$0m%M#GI|F}G6;qgI)|@4)tq zcpPo4b1J!=f~}d{PJzQXw^QIS&g~TLoXX}f4xqjb@8ouw*ZeM|58Le&WE$@y^PjKN zW1}9M^%&A)RF5$|IQ;G~N<)kV=)<$@y5@2UZesq3HUqM?%+weGq9x0)ieJaK{)wO=du~OC4lX%e-%A5v;T3PXYoa?6Ve;)GJwiyOI_5Bp( z*H2GeAD^eZpF%oBW(A4;AKVxAw0`&XD)&>gz&us%r`X5Yp|CRvg6z%+F;j__%0(3O z6xI0d%DHmxDhWdDCW@4T?Yz8d>V=e{+a=sdc-7Fwgw9hadoLOmS3}zfov%>s0B|>4 z*c}zxwBjdI@1|N(ET1EeEjy>#@4Au~UV-$b>J=6A zYWW;?(yp{CD%7X!oQLFxbPl8OuMBo`SGI>T9)RB#TST~fhGeq2%kP}?OH!KyO_aJ>X1 z{}PVm`N|CyDZ8QKFYShk9(O~9(43N;9(pfUZm3u&_kf1eb^KK2h6+w)|2~=^!I+Q% zt~Y{fq8-cu^ZM;>s93Kd9$Z55suw!R1>Gd)h6?9CSh=Bs4UR-e2`7Bz6)JE2HiP4R^m6RJB% zg73vkr@Wa$uprG`#QUV1DQE=JpX$d-HAD&#>*Azssg-*Q+GTOB64?2M`NrenOWI4? zD~-h%`Ic0!rBF2mZ#*P;lHgx(EybiYM5@MF*HWC+kY8{ug({;xsQqc(jB6=&&uW^q zBi5GLcT)7|^euJy-+d>Ax(%$W_GZm`9&e;GARl{kjp{_1rUP~_H(E;~7$Mp0g? zDL7T~v_F_TteZyNFpL~#c{D;x|IY^Nrp^2=;)=%eXzCAZO%!zfRd#EEseMRzj; zeWcYnLT{r-*e295IHRNUh0iKfIvTH^BHbGD352K=*{V+>hY@I zai@CEj)!x$NO>=K>|L61viBBG2S)~3@@sVc!C~1Z>UoHJ3S>Xq&d@D-$+nUdyaWAD z;`sAjf--%JQ%1GkJni_?pljXfAt&Swtmcz1_!K);t^a6UX%)W{rH}JuzcNqr`QbK2 zbNcz@45y#JPW%jhNj0=t4T&G9ikn0c6W^ER+r^*{So{O?_<32bLW^faK}_darrVWi z-i#PBwYf|$C{x>vC^F5rOb;v5e2W<$q9JGYt^6(5%7x!J?wYg^56}xe;xK=aAvwF1 z@rAXD!qw;GX2uuRDGo=UliC?ym{1gMJ|{OczHpA>aPm2+rSXMx6@`lrQH|CKbH(D$ z-1x%NrU&wbn2%#^!l{p$@(1PFF#fUU?bN4J>^p5`W908w_nm4-DO3E~_nmgwc>y0d zeqI0pOx_6_{9(J2^H%fhnrFL4IR-%|0T9|~p^X;WI33DvP14S{P#iQXX;0p{YG-t5 zzvq10@5xS3RA#ml)EygOq3k;)^!UEhllGF@zSG$|M%gE7`%c@e1Wr$(1n5g;-zhs+ zb?|`gJGGsw)B8?m?|)SXs~c-==PBGLM&K2$S2j6}~ z%M{_ZLZT`s!S5!euI$^$tAB5;JQFUxZezs?#Ky9#6s{|?{{BqTxg$`xUDaq-xnH6N z)M1Dhwi`GyC=raOSx^}7siguQbQ2coiW5^ubAM$r+sQ1 ze3$U8IL_W|{O41$9Bm{Zx9BOIIVZoX7Saq6B&VxpKmH;pQBGHyrg1KK(a_$<6%?T9 zE}4!H&~@%y)k7hR*OfbIrH7%>{h7Bha9APv6Juol2{=??fB8Gjm<}$JkFIk9$6ul9 zozO>BlyIkZ1}r<#J7Mu9a!wdUqthan-4=H#_wCFLB)vsq1QecKce0!YVQ~U8vL=~d z?FFvewT-^~=&)nwoLDPT4%h3vm{h?Pc{E z&zP%szQ3>a#l^6tme+7-CF$&pxz+THw$p{O|{F6c6xCb30(5_HcxZ1G{{@JcVnOgH(Uu 
zj6ae9^4$2jCTC({uB_JzHUf!UO>r2Mhj?w^?4=ucV-Ci(fUA_>I~6?u0yu)tVkX*3 z!0V^$HxvKyJcEUVyrqTEsM61 zlX9c|=Av6&0R*p@*Y63m$v9~jGYd3c@zfo(o^3p#W3X%LtMr2uDrj~%t zYyzUKd;WW6zbPAe?OZ)cq#c|)Ap9q~h=`%xU_EL}XZ8#*45y+{hVV}y8#q3>OWzJ^ z7`o_&8N$S>O+j?Ouy2i}y~5^^D=irRdTYTN8*gb+HDXN`c9k_66<=1k1_SJi3{6H} z?w80SnU9xDaE5;&PW*m=iCdfrS99y6O70v9c-l#o+&RKYmE1Y1@z1KpuBxNi~q zm=#yK8%X!Lcta!ELCe6vhaM-6vI*v(2@P%_HB=hD@g)Q<8rpMN0ng;B3Q86`VOy;x z(v4D(bWxZ9U~O>k6uA0>{|Z)-3QCa3$zhNf-Dc>_V5mQIgPiWdyO|>- z&xQFh2gkZ_qEArBIs1K^kLK zpXKSJJ#?1FGG>=1%{)t^w{n(-Yti&s8nkG;Ytb$?$!J00a6Qa18rC8?TgQ!;A=z&> zzb%}Q*d>V9mN`Xzz@vU!gVjd313lD$nP+Kmuk{Uuy<*QvQ^ z*U_(}gqfY=t=q(~Ba?n!Cwt^PzGgGhi+~(hXHhXwyk@&u#B3S3Kb%o6 zZiD$qWgeik`*=GU!c_t#`+=2D7G;&!}N_(%ydLv)xs@$^C_`MQ%y zUU4$XA8^k34kwf#mux*N&i-CiDB-AjjuTumiA17%EuxVem*V;7k&N5>Ho;XL00ADz zc*D8$FyXeTqXVPQnv45IkU~?^h^_?XpZ4+E&7bV`JxC$f)+u-;=8@u^9QY1i*QCIaSvr?T^pA>!J>8ypi|xz$foD*ja*ZI;Oyei~&3*XwW@pGrWY z7*&cG!GIL`=%~A7$FWuvxKrARfG9?Nn|4{l+Dq3Cuii(-s`r7ggcvYYWo$9O#q(7? zk5$oiq?i7Jn>PM0BEYoABa>}=%bK*cxt&bvuC7y6wQv}Xl9-xli`&kj`ddS`s)}Wl z%N9`X`0`x(6+ixMie`{dFvyz($*PNFV*S)SIZrn0H#gEx3BtbCvE6Zx#_4(`X+aIHrJ*MjU?pjP-C*jqZq2U^TQt;mzJXB~6MC?FElSVzdNH*y? 
zy@Jl+hf_RVg2oQCvK7eQ#-{+GSreRuCC?Y{*Scjnnq^Bc?$)IC!`y0q&Ds?p1p;eQ zK0aM=yLSublYDH|np-BAr*#KxPbvOYtMhCqjGc8*Cg*l}oHrZ%-$loC)C0JoT@C5X zM&po%{xSAi@6y2O_=rCHs4cY#pBUohB!g!JyR1dEvlT_2<+gG%ae>X@0IOZUt7{z* zi+V{O>1tIKW`w{MMpJtz3=te~Rh-J{Q8{)MRuTdD7Zd~HKcSerE5#_^?2@dQEJx}N zJg92qPzsqlsck2ecaOw|9(A&pX3FB#q%ls09MFtL2GoW#J7ler&Fq}gConYh0^Em~ z0exgFG$#>0+8hk1Q50URcx69hErD)YUDkK%vT9u4T?RLfRH%8iCV@W~`RN`f#) z@$YbSCEUdipFGY^H~Ly4nN<0C$<10$oyQ)ZSUbf>_~&t_-el$c4{0_aE)!#3E5LXkZL zfFLf2Ve~j1oD}*Az(T+<>MtiRvAx1^5TebZfeiPsyY-^Fso(}9$Fd`6q_WX2u@81P z7i&RRQdDt}<4Ck!W5B4k6o|GQZ~_;ruv(2h&Ood&dRpZfwBJFrT>d^Hb2B)aZ$1tuB-8E$>~&k6s-38hSVDF{H<+9ysVV^cqt!ildg83K+c+<_7#JCFV@F!St8E zGDq=<@atbWj^fHHMJ|5ICF z0^Vo5gGgP#mA7IgfX(A%5R;#=67h#;auP}-tsM*p4TAKRm`1S|=m%F~%H-}`sG(vQ zs4f+NN}nO?-%{Zrss%u=U$uZ8DnL1AmtlmMOWpi|d+CSByCeB3N34wzXIe9(DR+yU zn}HP$XTJ`Bs&LrQ^pGGU`;aC;U3r5#JFGK8w7#y4a;Q)k?1wpBDlWiF#%OcC!zeTkKO}A zq9?ogs~J!4I?ff?kx(!GOmhFkf}Q9W{=}UY#vF5%SBX1>)#6UOjMu3chA6`H+t3}| zwL8S07*u)A<0&gP!g~VU!dj&EFPUy3qw96QLLE93Op+Tr1I2o{qN_~D%yJJsP%MgGLKGooaQ{C~Zj z4Rl=9b?;{+2TULu)uoM^kQXI&soYRyY@949g%^!2!3-VQdelku5*o>t?F_{qsx4!s z53-a>dU$FEoL7>X)Gd1NC9V4wtT+j2oEBDOjE#)J2nUP_!IlXj9-P$92iTZ6e!u@d z=gwS>MpzC>XD#W@oO?d@J@=fw_xYc__vsU<`$g*DN2CrMc~I(rR`PHjx=$6Nwr%D@ zoErTCQ23k!6xg($u*r$fK=Z|>2kanvjcs!7b(@?cWi~mhA+E5=c^H-8bw*5vrM_f@ z;=tYhU%n=Kc%zylDfd?Gb;8}p$b@R}9IAm$Qbo_(KpGJ?@8iPi-(Riw!ILu*WTs#g z4%^ydEDF(lLM8Y}tZ(H?-{B^rKyA+p(v^)OQoBZ3xMh2rT7jcHtAqD7g-UVb?DIlNGAL&1jzr^d zEcY-{JlW;^IkapZ;YKsER$ao-m0zS9p@M2IQ1yf~1~sI)sqI;T9F-1B5x&qzLCxJ- zF-F!HPBqe=lpf2@Rgu}Wx;2)&z*=`^FpAVHO{68)Gm|;zR1_%$5*#VcEt6V=9I4}{ z45w8?iet{$<-97L{bgZO6NrK5kf4@6*2mR0Ih|^z4cKiYDJB4pnf-bWt%cMtHB7tr zX{Yn65;~{ICGFr#C~Ba^YrK+AiQ4CkCw(|g zY%Vo;on1c_YFMVSkN5Nbgg#8?r?$~+Kp#$!PrK=3mOmFDch|9qLIU>DBv ziY!lS1&l`!-kK1mDNzG6t>z)z1o#&{a(f<-i#J^FWf)#b^ zq{i9F>|w73Xl^3lC{%;wm%V~26dSKun45NaOYYOrWNzB~S*v@|2X2v}2ieeoy zmL{~a8IY(4+}(nX;rc@C&J_})iXLx`9`Q;H&2cFDqX9dkt^B3PGR9a2%U={XNF^HM z4Kv#dqw#Wva{iTn#pn<**SztYBT&=4@!Jn`VN^=m-7O!GTHZJIf!JqdcO%$740Qf9 
z&^bq!Mvk?TX?eDKVs*80PBn zW4V7qut?_EV=xTb5fjZkalDxbAWZtC1P@iI3JDd_kjUMF;DLMz2rWMka#bPe58^Ay z=qsu0N$K8o$KVPCCGcv~X~rspNqwc^-fcLyYzLjXg!aW=&c8Z9%TBuwiAj%`wQF2e zr-=$!*BX6BR1(nTi)wW6xphS8away?&gwXlTQ>UtgAtkPIwRy>Jdn1W*&d-hGz=uQ z1A&foV#$5KjXrzt66<|FP_+LA zGt0r?KZ#+#ndM;ck68`||Cr@q@XrCV&Ka`N#rL$utzYZD(Z!1^vP)d`BCdK7SG|a< zUc^-|;;I*M)f0t5et9%=qOj|5QO#u>0L+=^MwelyDn-z$i# zQ(?25-I(R%6%yd*%w{cR)v=OVt}|T+<|~}k+MrjiPDFGKak0I*-ZbNMj9(N|Vp=9T zX=ITG8bJnZbPldT6yaJc!P3sjp%|1qbul)u9AT8sj^*D%4gohB>bxrz!>2QryGFuF ziYF5~?X^+OW+c*4d0I6v4`1a^@T}~L9T1(8^A3_CnY-5pDWN~^mkwOp8r?1SOxQo> zgmznC5E+5@ae3895+!D7vktReWRZLlWX#aBg#Q&iaamEg5d8Yhfu;@QcF=22vzqw7kxqRKEUQ(*UGxWhKyC zd5L>$PZ?Baq*$Gf5>GM}xyLJ#OpnlTJ}vR#UMCU+b%B?vcdJ?7ix6?+qFX|w>zh(h zoXkjU=bdP8uw7Tp(oMQTZbturD4ml|zVfRSawgA;8z6b8p&6J`n$?Kei{zXsmoypm zPCNZyiy4fJTUC(kMD7cpIk_?!}Ty+6cbg#EY;sByibz+9#R?Z>`AIw6eCa~o!rb<$5 zoNe^`d3?o95?Ma7)gdupPnGDzLPLSJENGR+v>KynW|)W+eQddt@U#)uh0XzO(EC$b zt!TwOPa{_0GwG3POsh!lx6#Q-)OfDq5;Av_nR2DG`ynimyTa9@gO z)pekS*i_mnfe}EXcKcr;=9klUuqauOyW|2>GM%Q_hE_#j(|LB8TRi~n( zPWK3cThXn5EltIyQP5q3Pcn|g=cjuKmkOPB86d^^@8b`~FcD*F-uRUfyKFHP!0Rn2 zEbhE)4m)beg1YZBQ2yQMEc>OiWOt*lyzT7W3UqYx4AzY3D~F>en;Rct-Hmnb^O{wt zE1-{$%Fc-sjO^ROClw;E!E}^iucOHRfK7ajG+fyUe04|>4W*ul$C!rakQZU#L(r<; zvWRV7-Js8&g4`3_x-K^@$Hf-hQbGFo&S-eHH9IOza8$QW9@GZ>Fkhus>f!0hK`tmg z_`x&Wcn>Oklq3RX-;>Dh?LYDhyaPpm6axidr0wl|bwL*0+?2@GZOuUIQ|WBdi8d%v z8n32lLo9bM(^RX_+E{M2%~;0}*o2gOLuYIDHC~<`M{2pKz-U}@T^fzV_b^G)dnC5g z90rj+#Nfz%2D!qHEWsspt;_Ch%|2qQ4j$zcC63*a&hD{bN$P4)Hye7Ue*;Z1HoYr` zSgnSrVlVEogb^K7KsN|d1-8sQDz7(zMRBl*Xz=A)T6@T_eePr%{2eO%pty*(ePWT2M1xali%T6 zlGC&;0=ZloLh1=9JaP&Y9ytXHk9snP|AxY&o`Ay39}$TBBIzewr+XBBO4k5|C-c!h z3MV(|-;n-o)4yT;8`Zxt{R^S+FZV2P+6vw3*FvVU;DsQ&S*a`-DrtpXR9S+11xw;n zgyO>m;Wib3SEr*tfk`G~g9gOE%PA-9LjFvOi$)8vIa;$%1S(!S`-4>WVU{v+*4j=B z)SJ|-FnjMLDrj3W>ISTQ_=zmC2yG;lTaOomQ*L4d|A-|Ox^Vt&-0=f?BS{)faprNV zYbT*5EGQ{&n)&;X?66Ef)<#$EvTulZUM$+Q*7nAG^_sVltf?&HB)A=mKCUciiDLY^ zHl1tyl=(ZJi{Vo%##dvOT;yw>mG3_iteM$_>6Xnnm21Gdg|kMr3!uuX8PTmfVlXnT 
zT9CW+@-$0k%~9LpJ|K-ixBw-zY&PXalZy>|TQ2im?K5Mtyqn?$~dGcATTqd!ba&oell z?JR@in`qweuG8A0;=`bJrJ^s}I(fgu6Vy=#gKt|Tt99x}!_c$17PLH$9%mn4+7j)8 zSh7eaU*VyqxVC585Gn9=%ivaz@hC(r^~PCJyi=V_^yNenluX@kSYe9<@Pu zv8r1tqYRX#{JWk)R)L|A1tnS>B9{9x@=I;;TsO?@EqI{-;^=El3z`ChLIvT$SHsPdSnmI5 zK)N5imIi4NuES^T^GMDxjS9)RSxC+YjdFUB{^c2v5QO#Rbuubp_mUN+Fj%f!Zxpw$ z_aWBiKma>Zj4cf-y{=9QstTsFULiZ6qrA0@>TDMco!~t7o=^Cq_R<*7R#YRL#|Vl( zsiFdg3Z-4qKS8^3W7D-Q(&DO5RI4WU9Iky+oY`k#qsB%K4`;WT9ymf9s={cqyQrTw z3vcnXT7HB2zem`WnQScUVR<&&0Am2!<4bFZ9tC3vcB6Dq2_>oDKa?Ar zsNfnLa6Z&-UYCs&IZz19X~UK4Aa`SfwHl>4WYVNI@warD^0b-k%^SaNg1(?j{kuS) zyQ%#YSAd)rwC260I$d}+dqA2Knl<20l|enP$z8OONGuB;X^uXD7Nt##W@yjw$W2&0 z;exc_P;@t!g+GB=_9FB9R51dMXwv9Uh6eoCQK;2c6aCcMHJrxPPxc4&Ra;B$dLG1~ zJArv2Ja$A&RVfxAKo4>e-+fiujV*Rz&Dx=m=3>wXI;K{Y3>9Zd7Vz`rNwnMh9SR%W z*PK->Y+HV7vUCzEU>jrgcY5x!+Cr0Wd$5sfD(d2EIEqq0eA>QXZMU1?7N>VxN~M|~cLeQ1*m=>+QYP3cPw zPG8dACCU<_$JRbTKDW@Jf_z{uTEsTR28MX#}=O9RADzV2_@2t5PK^ zxAR5Fj%+ly@mB1V5+)x- z${F7@bJRy4nPP*Br7=;XC^gsGv<}coMw7&w(9DYstbI%v&`AELzfjJlFsi$(Y@+e0 zSni|T)F9*tV3P&;)~+8pn@QBRbk40^7#>dqN*1%4wE!x(lq@aAy(gkJ?&2Db4dSh` znFNphn{@P;w&={g2S`Y;gF!j)ktTzL$l5%Mwczt+Cn?CKAH#-D88*D%)@ndO@M(5* zoNTFccn}iB;2`+NS>T{6nsa~Q;2_gd8pv)!LcF~Q2OZZ$YY>fykU~a-ZoJ$q!a-zA zp`9qNpGgi5GR@1d9yLKO@uI_I^;!|t8&@NXSu;W}`Otce9_8!b1snthLYm+po_`>TCb&YVKV5)w!uSrf{U%@?F{HAv&Ltlc*O*MXmcSMF>C+X} z+LY{u{F(<%ol&JN8^0)_(X#RXNsW<0KFk#?$nDbW1feXk$xoOb@;bnK3)FjaS_GHd4V`+Mf91a$u-C_OEw4~?|-whL@&&K#J z$)M^Kp8`(i_Nc^Ul78v@(EGWZQt;;u)zXd={SP;mMe;lDpC#)t91nE@T0BCYp z5I_(+K%l9dchsuI8Y?8jf)X4@o_)d=(Y%ayAwhW7FG9P5d$FvPRH+gBA{z4qB#2Ed zx=>*-oRKj|&=lBLZy`*ERO`adG z<3$DnZ9FFkl%5I*WH^qUqJt*RU>y$vT{0d7+J_pWnQBX2vzN^g&Rx2m%GA0aqwm0s;{|#sDj8UOC5sK&EOm*@DC`oa>EtKB zAF#Em8I|R#H;H|*rK(}y)RJqSd6orbTvl}k*1K>94m_6#8?hSPTs!vXf_#!p3Yf8Y zvR`e(Cre8miQq#BVt8r$(Pjw%8}>(n#-oCF*xtn+=hJs0<`+hY#IU> z3-l}q#S$DQZG5ai!|lhfIKRu{@^fTl-#%Dzx*# zxT%cdcvplj$DEX&lONg4ttAxh-hJ*%{N)Jyh?#kEj*yNv#etRU2_!k=!smZ85i=pr(V= zsl}<$O`RNF2%W`>=YG3jacp!CCK;gluMw0&9D+a-!#pEJdWokb6Jx+P%o+ANGvkY# 
zyAhFAL+EIu_&|) z51>-9CGYgM##dsuU$klx4i<2x*c~S;H6_H3yKb17;Z#J7hBZC(kqQNf3=>eqH26$h zsdClr#5YE4LETTU?}a~clBLsfZB59i5Q|gq3BDTUyS$DV3FfpUB{{Cim_y!cXnB*~ zKIH>RG8P3_5*`mqC)Ic>x14S};LckSOh?tVss`3K!hNyEI)1AUj0y2ub7v#wQ&6;- zuX3C_C{Envo>PNJQh`QoEWuBVFjiB^%KM#-GvJw)SD+6%)J6^3s+;If;>mrK7B8+* z!}Mflv$486I4hP`-0YAwOJ^dNo{44tz2-theNKv~8&y>5emhqIWu07Gvd^Tm&lPNq zX(&T#%6vGJ_U&nz9KY%i4vV7{Xiwuzjy>dx$+3D!;Rl&nw^D%wgB?6(!IiK${+et9 z$H;{F@n^oQiEUDf+2Q&U*Cg>_HLe%Z%~fo7KaT3gYcC?fZHm4_S#p00a(9AJaV5|Y zCR;9&68v>ao=N8546Cg6+VGUFfi_HLzRN<&a8mAOB&g+!Oy_7ZBJDoY(%*w@$pyE zgbuc>29Ja2l)hh`Tg*e_y@ZO6f0^0_pj?E%)e%_5tG5;X9hPE=78 zrz@C9nm-|MR6ulMhmmOse+VhUu~Tw}?=E5e$}%@R3XY$AVQw){P_C4b4F zTQzpZa`EYUVLdaOH1YzQH5o8*Qs?)QjMg#rFM7EJ9Q!dK3u`u+tZBq4*qCT~$-$dW zGGa_^wrl&*C^(gI2hbBeEJ)9hM1;odlS*rM25b^CF2I;suCca4E=#Ea)4P!A!~)b? zuCZaW5976D^nkjpE{SOvree1hB5TnpTIO`d20nhF5vL|MsJF~`4l8WzF(#*~HG708 z=U)-PzgeBAxqJ{pb|r~&BAX|=DM{KM8m)AMdE+p4Kso+w)Q*RQFtofoj4({agn464 zw_iqUls7%DZ8Ex06duj)DCsZk*)Wwu*Tc|LY8g8(B7|O()n29Be-W9oTk~Y)F8xA4 znz#0&x8`o}y=9Zu_0-@2>AmmcI5yrtTH9Zb&|NJX|Hc83&O#ZVaFn-G-Ywj=QN7|; z6Ktb;Z}4Yq8}%dr}~f~kmyn>pGyP|0L6 zf6rSa?~kUsoRfh@bW=*sA6@&vA(L_~aGDj5`j9MXleTJP5hR8xo{t^Zb-8I8pRzRA zYrf!)A-A%xZ39YX@nT^|$&=DHD$pWdwI0=Y=O{Z+w$GCSYjFlTEU)em6O@2yasCu1 zxNS%2z#I-*R);_8R6Xm)*?Y7fsBB`lV_X?komzKl3JZnIP;@KbgXx+3JX~LD6Xp%{ zUo6+kZ*9j~0j_GENEZOAZe(jEsJ02HAQfsmUuEu3=fcK%gnD3m&#+5ne-2fB+&R+3|4XPp;8@O3LI=E-uZECe`_8tp%yPnko5S43O0OK(ANU<_)OWh})JZe`b zUJQMF8+D<^_mR%ZRvM%;l2`1?IpMw1DXnNxhyF8t!LgY?CXH){+dB2T%mMy*7tb`A z6`I{ETA`tFvBVuv0VE_`F@fXd;B!o6>+<|NoWAw>mh3NNb7P|d8KYN)ex(6t7V}LS zSx5OL%}J>;na`y0_R?_E+ITb9*x*gbj8rax4Vs6222QKvvpe#aaYqpdI;GYrb4%mV zSnhk=6&G)lxjaMfM=o3*qKP^C66m8L{jjVN?T)quQihvaY!qzg(KA?trJ|4F)b-D; z(MQ7b;=j{`c1NW!HMC}bpt0dI@bPU;pl4do=E?k;*vgHC^$aa%ngcEvj#bG* zxtdOoX5v`#n9JCp<_#u=ShTg7O`xp^}|*eRxV23NV-b&gGd!R%GN3}LqW zD_eN07er?> zjQBQ_2e{0lYSd+!;98|i9|G@get!T%$;JxFEEm+BtHdMca%dSE@^xM}}Db+K; z)lT*`rjYlMN1*)&Bx^)Mecbh$2Bn1Trr=%sO7*Ub#M5!|l&8v#j_%K@<3_QLu8w)! 
z3NI1WJso*ELS6KXv9pbL3%hFBm>uSV7UW}rU+nNJjHVqz>l_Gw&v2}VEN0FK0l|T( zMp~j6%m8XF+<}+{rec<0&~IRtGukJ9jv4I>>FPzQYaFnfbo-Y$OF_dr1k#Hzn;k_U zYZR?XV|^>P%q*qUUtWeG@L?H*YucrA4tiRUn`Y{b7yfR2Ew<6bo5yVZb;jEnw;a&s z?!C5F=N3xV76(3=*U4QjKuz?A+V&oW1}SHPN~E7?<=&G0L8AXfqJ!UVt9&V1agyaY zuc+Nke3#ce+`(7_HNZz|xR~)7FT=WHcPys^RumIJ*#bLKn9MLa<%dcas5yDs32}h> z=VaE=soZELosjOL1EfiHSdLbB{IHROvRW&>OvnV1Oiv=U$rv`aICdUrms2~Nb06Wo zI6zFU@r(dHkqq5bS|FTo3@8|uIS<}q4*Rjd5YXrwU$uFzLeT#Q!t4auY-aZ+74nO8z71 z=#gS9K1{lZ34VDHr7v`+ahD6yH&z;?&m84nMz+c7LcOZ@sg4XB2K&!B71@lSXORpieog zYEJx|xg{ChDRYOKAl3y#uNushp}$GgigU3*0Ea@%+PF;2dK5Q#=bLyxm6>niYV%F} zfO6G3qbqDuDTkPYVSDH)BYeG94^-nk<*(9lmGAS`!)*2jp5U0n+fFHflh>n~RC7{u z#@y2j2KN)oxaKS;kK)R?GJb-63z`izLS?YxT1TRUHK7<;yIRsfXp;yPQ6|BlN8a zSmaix=46Iv*mJhTe>;Xnqlh}^WcIG2lNqO-FLu*=xmUQ$YH`&}JQrU9vPyy8yInU}_M5L#sL+?stwmPW9(AySytr-OZFdB8Ka zPo=&0N)Y;m;Y{B+QO$Qos96X)nQZ3K&TyqQN>R78PkO<}s2>Qdivkb$mh59%&ujfY z3DP=-;0J8Xu9g1-fK))`wrX5CaaB3QPjr|g|e#CgZDQ72~Qu({d)zo&TGT1 z*%u@yPg%Yd8VATa$(gto2CVi|WDbOz_CVBNB1cRX9Wy~souH(QquZ%)I`=p$MJuSL zRQBw#b%8yx1_qN9#MnzW zd3))2Y+$n}=tnh?1h z8?k}6+(5SMsr+kZ9?h_4Xk^Se{@5o}0a8u-r0f=8fn)(41G8vm@1E;&m)%D-5<+l8 z_?6!ECbg)MVN_~?RxRWkZH!UOjuV#Aw_|%G@xv@xq*>C=g9dAV=Pb_{JP*rgOmECG zT0z^)5df}H#xfcbFn>Din7Frgeo}_caQE8q(q9=Tvy8@3-YlchB-xdUg_r)UJb}Mx zg6d4%CZzg27$}T)yJtCRN5r)zsCTw?#f`yVPG2y<-O+Kv0}) z<~_#q-}|s)x;~em`(6zbt@oYhZ)#&`$=?ESKn>d;B#!(}zC`*ytpDz+@BZ)F`SU+i zTX*e(kIbrVsQ*BHymn4}&g}RHX3wpyyMEcy+T`LMm;T4ItEOoYq_>t-VaziZW?^Wzr1KBRyzi3*V)$>2v1o{F_%* zeP_kIs+v10=T+6_rp>FWyZtSRs`&rBAW_xOf8n)NeHE4OjJ~z1VSZISQB_AC@=|7A z)%4`WpRC$I(M=WCRc+>PsNyfko%U(-NSjC6GS%>lw@xua1?c!Pi2II)ysS;BkH}0pUTMFCMJ?n z8L65iU2kCGr!w+e6B9|PjLeuMJ(ykNZ&yZsXM6&`K42*m>HLtTPi)ss{>j???Fsd3 zj?SnkzQ8Zzg?4MRA*`syv}4KQuA9~`UbD1g{jwWB zmFekN*S&Oc&$9ZiNPT+#9Q}*y-<5Av#oC^ZCF_?`u%m0;+Md-bJC!wC%nZmzv!H&f%d*4*yjwzGv-btk6+VsMC ziFC)pYp(gTmPH+l67$k69R(5?97}Ip+r7A_D^o~YvUts!8<(x=xN+5uJrfewFAgQH zS+%T#5q0yjH9Z~6yLtKQ*g}yqgRVGY3Y{AH)Uuvst8R|0?OD^adc_nAXV!NyO(KKL 
zXMMzZ-oqcdv3+(-l76ihLCXK$Q2w8L)}_nuhtfamSy5^FeWCQ3q5Kbo(!=%4_bf{F zgxfvtSyXBLwU>~QPigu!q4aRO9|@(0+dUOZ5BJYpFA7wDIDM68MM@9TnfI)vl%Dlo z6qKevjU9`4V&D~t7n z>1p_2`Sj^c<x37y1sn+rjM0Rpa047=|AcyribbI(2d3P zFg;sVl}{h~%kt^-r9km1eP127mQVj!fBE!HlcXQbme2o(|Eqj@_vZ5HAO2$b^vnOH zeELmaE1!NwabNkAKJTMD%BR0LN%}wSE}y^hK>76lHc9$woruGyl%D37%BR=5WaC#w ztv3>x8%p;q#8Q%>bT2wA<%UptQR|IFx8QJ*d0o^>BatnkbkEXU#`?9NUfHvFNu*xV+pd}5ThFrIo=819<>E;FlC^6i^-8FJ z1BJAx8~2YXnT~wHF>iqiJ^i?8x~xTBzW>o?5zOKxfz5p1I*4XtVWQt5>dEwu-U5 zYBlR4u&+N}+r!$)Q@1SBv3$+qmCM|Gc$5F%NqJ4RFpp4`P5M{n=XlFRoZA#Ai8w<;u6&-HG*YbKQzVi$0sSj&6f}G6901o7V$FWB`ceP4E{Q- zbAFy{EzU`|6tn5(qMx2R_;}$T zezBg~UjLn5am#1V&dki5ojE5nYpwf%Y$EZC?~}Fd>GRwV449Ah0zcjSPl-%%bMepr zl*kpwykJJ!`((EGlou4zGd^89p}5|s%nU!srn|SQPNwv$8@XkRFLvesRb9yy*G+yg zm%nx?&t0Wi9mQS6Lb{pvH)e`2FV7TrpB^-SpFA?pugUDZyngV(Y|!E4^53`~V8-nn z>MEVqlXY+1lxf@3=}zC6an|sZai{O`-CLjRDs|Un`LtAVak?f~y1W(OQ+{zqy51|D z2H;nHwBi+$Y16xGiIUkiKV9Qqb88iy%fFN@&7|V+GgV&kCC|O}&z+O+$+@@w+N<`{ z^*QIynf|8_aR$@f=C}i`_NmGp(7FOv?G9{INBdt(x&znQrwN5#(i00usb}?m z@j0(}zgL=?_SDfSLpkS#%-F-~3!Ma#=^u7{=gI&76gER~e>fV^P-qm)a4n6S?#o3DyK#bj4{?mK&T+pG-+q-ONF zH^;`x`!QV3r9!&aEa%bVULPCl?D~|SnbA2b)78W3+v_KMk}JOH1s@gIV3y+MGmEcy zS3G|^<7(RQ28Rmne;rExlvf!#1Iv}Bw|awjy956x$sQ>!>F@@}+<}`+;?f>(@Ktx< zS`wMksZ$w9Z>3W>EZ4teUcwz%1o%}zRR*#M$ayN*za;IrgCDhzO*|gUW3F^V3%$se z7N@0G8DE z?{WO%WBpskifc~~lHZ~!+8RjranI$~fD<ETCVlk;@0K*gr;2Sjy{0u zYX2QK@#N4M^Qs%wt(6RuJFxVfAo<(OP;?gWE&kcPbz8RW-mE+Q1?PA6;ofXlS#yYSX4yx+&d9Ip{_Ww?uCEc~Iiw^gM5PTa{nD zIlU5~N6%NO&_McX72@X?Z%Yf$XIYhtvTcQQqkGLKX~EC0Hf6lhQZo*q*+q=e{fyme z^YxmlR-ZTYZu_+Rfj+gx;4BtSoS*LKU+ZY^wrUg21HRQS)uy>;8K7Y5?H-G!*Q+it z+0Jub(EgPvLtgb_W_|2|wwZKc;Qm7riQ*PD)#CH&)bea;M!IIFJ~+R}yOEZ*=YqN) zb`Czcu;`a=F`v*``p*%1(aHM0<_lwEV?IM-zM4-<@#P|Hom1}aEPY*TQj@WQ`+1%2 z_GkF2Eq>|dbRPh(G++j)v$(l4`0`k$|Jm1l=XZXP{BgFpdH97YZ}{0HTRN*ZJ?LIj z&4gz9$Ew_`>gcl9^LQSVTsWBILsjNHcgI@peLEpW$M++^s(r6#|*~@fUq01^=hIHAa%ZM&J zD8F!1wm90^wz+UxF8`>P7wO#MJ5PAd6YG=z!5}1pFF2XDmzOko`B(dgtMH52DYVR+sZcM6Ba zSQzCR24{ 
zm&^agU@DsOL7yRFG0ahjl~k-zSW){Bd@zF@R6#m5j4xZIjC79ryg@o<8x(&XLpsko zV4r)yKKF>*ObO|$5eHAeJuH%#>%q9z)9Imm3~sKWqiTEDe#ufZ2*{ z)|NJiup2!5US0}5XTztw;1-Lk@=Hw%!dZHcSG>jGnEn?XzN4EldX!lZtIy-oa6oRd z=!m{8*mk)mY#`jyIAzE$txn%gD~y$}k9+Gq!Y&K^Qk@XaLq03;YP}?Azjm5XPxb0_ zpVDBTMZ!KIg3F9Om_6xc^VV0~)RB|`Phasa1_9pK;eFkFP>VY-K_3Lj=k1-L)T2Nbp>CO#5Rh129jb#}BGlJ~9Q`3UFcYDr@x#1_1 zS=P`7wZ>5OQa027P|+JWQvjR%0H5& zx4TSEI}^M*T@P4fL8X0Xmq3kxw7{CU>!ZG_bdX7TZEG^_b?d#hr3RI{Hw0t*hPNkk{p+1f{~zJnucCOaf6OUdz_qGy7T4-RC)Z@* zBU~pGKFD=q;q6&Bb?oga3WMCHkZ7;sq22NNhf;2;xyqf?96nFwd8&D)R8pmqQL0v{ z+OS+Z2*_csup&<`@3&Y}@^X^K*H#k~Q|s z^?=zvJ`8DWnw7zifwRlIY60XJ_QY%(yY?W;X0P00mNp%n&sGcC zmq)wDD>bCcJ1dy_W3#))9vb}>9PX66N58RqY>a(W<77+Mn=P;Xn{}=?uV}MulsmX+ zr--)YM?!2DGD%zMV`B79TG@-MEOU{U*T5|;SUx>yHyB%-<~Y`}N!YzZtfhA#))JK7 zGY+LkjOBh9$<$xLNNR`=z(}qlONbk;H^xOa?w5-kqHG+Y|9YCitqcAV3u&b5Rx^G^ zn96~zLF?V1^~*fAc@f-Tkx=K97r^Xhlky(%k765dYH>AaYlB(J`o{j}oDk6nFU`}}_HJ81wvnR6r{=e6)j>3b0ek&Y zQs{fL7fij@AbPE+4j}h>&SSZt{i|AeSJSZgMlUnADSOf_=@q0a$a_AW&Gg^tWZUMX zTit;Vl7!S6o>FMl;O!nSzmY~qy`bU8or5p9gZ~7w?krt3LQ7ftxTte*$i3#>%^G-v zo1iE}z;5ROs1r?#dO{2r{;>E4?4&T|L`DoOs8X3_;_j(ZH}xW{mgdkp8e$8e5&4ClB9&XIlnF)uh3 zq%^5jc<*5Ee%ltqIZ|nBAC(p@(`AJ&t8^LCWs@!=y6m9*!cE1kZEqBEUHL7Z5v44n z=Q|Jj&SUG7QlTV*Zl~>$C4cYbAM783cZ}taVUFqdm};S8?H!bw~w6#kCu#DYtui<7rARjPGq5^S1E zUMZZKyoy10l&&@DTAK#tUP;OJa&Fq>rlVX6Lv&1P+k#h4rnUd#Iw^UFA1ta;y3VBQ zl&%&ACk-Jm>7?=-On!s%Cn%jZ>9o=lRi1pxoBTuQVU1u@4Q%Qi>TUCtlvKYbHDBdH z3N%LU;Kgi#(JkQGDMO?0Vmd5e{o}1K&Ds_T>ta~?Ov{q%H+dm*4M+VWpoTjU zA!O$ZFNskR2i@3}{|zlYNK2bLOFtGK6EltU{^@zjhkSgwW_cFIR2-1`EX!p^h`GXM zx6))BE1&Q|aN%u+YwaAzW>;i;Bfg_&(x3_t=q;e+LF61Ug?%Lb;vO#m#nL=@SyQ7Q zr~$bi4BAzEL!7tA=U**2SX_-EMYX7$+= z8NAhIcsyjGZV41U+dc~riCsK)Lt3cRgM9ay+Vj&r+IN!e?Ut+ww`8{$EdC`}*>WPM z2UGjK>f6j3$c#NCRGe}LK2Aq8At3rjq2l`GB?Nug=vE(%O@0@%f|S&7gAnqULP=g9 z(^)EwaLcl&5)4ZxI<9NZB42GR(zs~^Jor*^QJtl4TBh1g939;$gif;8MrvWc>V_;QB+dOu;f<)RjjwcfFB#&{<5Bgt=1h&lk{Ukg-aM;u_C+ zS!=mQU7zxxGBzlK5g*~6g%;~BErp7RYU`dW?&utR+`R^|8qWI*aNaL>xYw=o+n#i< 
z8|tW8cRO`py1zlTWNA3=rKaS<9WnKU<+V{lIE9P2Ru#_XT3yI-O%{&h3a`!8^4f<+ zy!P9`Aa!Dk>mn|?L8*o)6|&5h9;K!#&vM`5ro~JvC(faJZCeYURhw$kO%=px7aWQ# zu4;sFbsBjgj5h6L9XM@YoVM?D0G)3*?fDLzc2=Br)`8QWS+xMJ*)i-kO)va@7A6L6 zUd$XD)Xag%_SKA#Fk}0FdCoc^!}gzK+io&bciFKjsR>en+W!a()KXw(dckFNg4SK> z!DMT$RJX1_1L5v+M&YipbGU=&fI+iN+6Vuz7TUQ^D13Nj68xI+Lu6lGLTJt-I|4HJ|(swU+#MFIZ3oF|7R$6{uCRTG`De zyIJWZ*_0;Rl%7Dc!6X}$J5i|MTj!MCPZPq+6&* z1LfskFq&SmIxT*1eMAiO5&GqZpRIzktkSybBh5F}a|1a%1q%J6IPFGn_~|Ne+`S?k zV*Y(-+6u~009s9xJcA0idDZi!e@CAY2?S#QEvm8fNlG5!NU3WyTB4IJqBB>M$cC#p z8!G-f+n#WP{{^q)L2%Sxq30151eLamAu}5R9IV~5O9iTtzv#(oL${0#NL0kU5gs1_ zX|>sjj7<&3rA2P=CpzQj=S$0clkE}vY!yv0hKu-Sv`r+o9T(X{{o)@)vs$Dd!oDOC zqyH`k*}<=#!$4K@?QQd!yMb@gu*eeV`U8WnE?gh!kUC4NMraWY^OB!*minv!@Gz4; zZwmUv8KCn}6)h3r)z0E@XK`(3&^_j(R_iswKuA9^OTOM&{Eegn_XD(tjA8AOuOke+ zhqvc~oD~M@5C%pO28NyfCyg+$+Xw@@jWDp=2m`x~FtFPQ1G}v-aC|5Xv{>Xj>7ed@ z+a@CnP}m3q3nUCISfPuAfdxalNElc!q6@;nCaICPbhfQ6oa*Nv@$wrO^vxhX-+5qt zvcyOw0uMFvh9w_DU2H^wSXQQO$jSm+5eGt90Ns!0Je+BJ(;fUHn_P>(m&rs&%ixZ- z1DZ0l?RxfpE3C*(=@q!-{~P6oHN!H`2XPKc0PnTy##k*)EF{kJjY`2g5+ec zxJ~6hYAU*^qjdKsuLV&a7lKTi{Z9+?+ENrpS8m_)_UOS>bO(xpyQ!L2+_O;pHoZjK zXT6kho7VC72axVr>tAtu{-}pP@NkzN)^X_>i9kwO(325p4UG*``YGDj_=mD=WUiJMn*2n= zI56teCLg9+avDao2<>zcGhI3bj=9UC1!VlS!Ut7l3;Av6h6c(B(0DmHPgYV z_tCDfYJ1HcI6}+nE$_^nIZamrXZjxh(QRMP9ZIaq}v{MaO{~1p|Fzg-n8P4r?jW{ zZ>tnvg-+!LLpe%69w8apDy>4hbjM0Sk`u+}>}9cpJra$9ZG?t=ohP#A%L24Ev|i#n zAuozteyiA9G%@Hm$17X>qLeM6j%9msEn|)rBC=PC!=ci=Tz)Ns_8f!%BfUpMDOPfX z=jlP+F9o9s<3piOS3hv0g1_u4U5HAE$Mpte8PT-Sg@DVgoXsxdd)1l2{$p%*qW@WTO+BTRNYvf zjJkURY3V*Z`IR+F%4>0hpM-3z8Tyco^#be5kwpg4JH%hKcp+cUfUIYLRvXSE(?5z@ zU}<`VY1)dLQj+y-TSysnKt4?2)ym9|NCS>m)2w{4r;OrnQ6-@4_SsUhSOWRA@7&`R zr36Ry8LHP{l>2BRrUyUI-Tt6Ea3WpLz*S}2P@1{}qoBoXahG)9^?Fl{Y?rN}4jdwZ zLj@|Axd>UEr5}vYpbVJT{7Q}uH-;K-DZtTx>r;m)psv>N9aaKH`wbfmD`s@w=({t; z*NSgsa6v)sog02SDdy-;R6`fh0jv8-`jZRJ6swyH8a@)&d;g<{V^0AuO7z|*ajh;K z%{3`S1bXi}t`iG~glg|l>wOS64~2J=DJK)$qGkN8;BOUwL;P*xZ-hTwB(^NvVinyv 
z6y5HvGjUl!-R(Sxnk%zD`Okbq!m7D(kqFh?u`Kl5DWm81oGlc;LdB)%)*SYBw%zFt zz6?H=9$<(ZyAG`R176qiqh2Xgcn4Fj(fs$A`KRb&L^Nk(r0W*D8(H2^ce{p19zBgI z%+k7cmE~ov@e&gbIS zGi_I?{N;}ek9Ii^c+P8`ZD#3Re%t8Dyp;W>P0xEW zdn9%jiRxBvuNqqpf_Or?ZI2_VKb~=3VO&)=BUgtr9z_|fq01uiHd1s$XWKC4Ml;SZ zPhgedqLq^+t83$%(Lq)Z5z4Z&wDv_k*!V1g7yLYa2jE6wnG_1!UDVOf8~$TeSMedK zD0}fJ2@&%!i1ksqtqLuAOzK?{Z+DpTa;bQ+3fLQx8#{Qwwkge{BxdU2g=lrVJm2f% z=vt%A*hD6NN)qF{)m95>N$OINNRqEN_oYVZi=LoHebWkAVea-DJKNT~!F$29Uj8O4 zlY^S{;Zj^&&0Rvg8Wi5q3+dZKUiJ0%W!(EzMqKHL9;$CLZNXvWZu zf})9mX>7A}(hbfAUxhMsv^o61D#0U{|!)RniuDSZcylkuAsK& zzkx#EBaaPtQoVEq2XXV;HVB1MTqv|>8GoSAo>lyTLVGsx2MX;$J+NWnmTXae--WE7 zzYp}uf`xeP0ekJ=P#DXJ428RVXG5eMSchzcQuP&ga1U6HnbtsQrZtdeR;~TG_yl)Wv0}>qSGDy?w+x+Wf^NnR?k}T&JD4qls*4WOXn@?EL|hq z*Hv88IpuMT8B1F1{7P`C%DBf!E}C`M$>7ZQAlNuL;Y{?QC%@j=X54Boe;Dt=E@!Lf z?9R0PbcCiZzh(5(G_7H1l0AlzN;3vbUs&V|mJL=5VM$TXO+G4M1-Z3ju%<6%AtRgd?{>!Jypiu8~V z`LNg^sD^aTc}*I``e}jxtwfh&e8dq%3OI%!8lc`gg_X^RERt|iqgOPV7iK1J$`l{Y z^pD^H*vB_y`gb{nPjIa&991dh7f}^y?Y=S; z@%3i&89zWMX^(Wo+6Gkb(8xB59M+>=9|35XO+Wt_t{yCrdt{H*#uAlU#T^f{rQ$!Y znQky9^g?e{Yi0j9%L_nZa45em-A5Kn>Ry;kKz|mSsU;^qGZ$p*Q>bt2>92D)+SXi9 zyEGeo4!#u$AvnK^)g@=gZr_pq7T#`~aehNWm>$e|JKs467sC7qrF*9Z$y!k^VFxdd z*#JMhh?41hHvkt(9$>xtK`!ODp?JD(*h5}yF`jS3w^ITm-18KG@$JS4_au*9!L%I= z93F4Oc)UG{^+#{W^`qha0#~eXT(QD&#R|vOc)X$EmB-tOkrl3uM9h1Q)c@EfC$A9f zaMpEBZfuZ+8M4C_WQQv_jk3tK$RdX=4ue|N;2h<{kaovtwgrdjp>~&(3U; z`Iffrh}(J4)1-GXp=GZVCL6C4Ci@d+vOj7j`$JqqyWT&Bc0KByW9)i+d6LjBwmR~r zMxG^PW9ExenE70Jc))iv&{+8}Q<{|ZI%)r^yiVG$@dK|)gPr^;*GWmGtL0U~xRPJ1 zbW-`vCcjx}<8?xL@=f`PD&Jtz4azTjo$wCYY`=#0PsQoPqa*4qK0ou^LDA*WQ<T#yuf^ z5hoRL`0({lRycgw)P;D&;&0);mA{xX#@WZPpGA8Ni{F)hrmMJH3!{%xetw&n84;vL zSTpdBxcog@U)t=#%n(ue1OL{bBpKUa=5jK^IS=@}Fl6TMjGUrDWOFQsESZheHXl~L zUUZ^Yn6d}x=|SPU5Sf8=uZnUfGW==4=qdKI>ur~~uh1)`4=`*w zwmtR95PI!ZZ7^BW)N$chBf8X^1lY2xxK4T$2#=Lk-C%N3eX^{T;yh9mrSe8@LIC>j&QLqt3;Bnyd zg~J*oePOn=d_*@UL30aJzvTPbQcZQT?51?bq2G?I*WLOTYFRg z$m?2iLzK<15HN4EkPGoLs49GxYqhj8;~hgf-PHRdlJp)D>51SY>*7Ibs#4bZ1Ku`0G0Gy*Ou{;7 
zlc*zMy-7$+C1LC2+SU~EK-z({JG7Ub$d1O}It9?52vPED8UfFr2tmRjy9ID|Vuwm} z@HlfKWNBtiHzgKN6j@qa)626zaY1P5^lNtTiiO|GGB{0zGb7s6t2NOB*-6!XTb>_S z7-sxuLWR1Q9IdRzie>_&owm3t_qfo zR!@XFMrD(XbU+oK5c+%k?M#rN9VamUai5SmW>(HHvvML=JQlf5ETkk!oL6>P{G7yx zRP$A$246AMfMlJLb>-w#C8w5??Mk-WBwa{eB{GvV>1`MCZK>v;3ex;DoHeS{dh(}K zNGe&aWP_3oO3E6NYB#0Zm7btM0C=Eac-2xhZ5JPuHs`t*>F7Tp-!)% z!#64^VQ5eY@)%J{WIh=AS5`ukt~cKQQd@s5bgEXuW&KCcT0>!2t_fN55|q! z+akIg@uJ46#S$J-McgP?sI+shc;*%lIT9)AbmE$%|J71~v@vXYHR5TLwE+AK3d(9G zgb8W_*mDQY0mKM(OJYZCpGoK#QkwCHN9GCRn20H}bnJ|v_7^6O>16g<*)`s7G2^-N z#VR@_={EAfgd~MD9C*0J}JaRRIlcnyYf$U2LI>~s#%o_noabRVG?LY3`{PM zr!c{o(qIY^X!&X%_ugazjt)Et4jL-Ee%gF-UOq&GMIwBA1fn z*jOdVBF~mAQtE(FOop{Cj zX;WDaQ`=~SLM$r<&dn6c-zZWL?l$xi?5u0Cpxqels%Q&I`Xg>>BT78=a|^B5r~w6R zK!~ES0R}RR8+d^tOy=Hk18ruK|JF>h;s#DG6X8?kO@X+^BwC4#Hj{rTihz7XHr{7t zHua&wq#I%~Tr3j6L=MCv0ZimT(A`OfTM?Ny0$+v`{1LiC`9CdGOUQ3fq(A2)Y!DFn1)e%b(RD}NR9 z)0VZOpI40llfv{w$g`QrMcA5h#mBo)9-@5jCEoWLir^hJNe#Cu3Z{{4p7ve9o$%nbyeUv}*JL`o)^-xyx5ogG z@8cyf5h#D86eAL<17s*P44DDJAb{|pRwHI3c&$dvy61>g2$e{dG&mR6uo2QC1szg~ zEeHvOV%zguz>qKl)@yxw;Q!Lt-{eXuT^g>FmhUNJSgaM)csfF^Ym2)S79fPGQ&BJ* zoDt0Y8zIlX0eOl-Hwk^>)ZQw67}(Sc{2-AD{s~(Ys8;kWmwzr3oQ#6Z3M**DCV)6Y zg9S=48#y{HNP?~H+)FRkE!^72ccIPBIYY`CAtiZLb1NMCEk?;l81YSf=#b(=`^Yjl zc1W8OE!m}fDL8hB1UPn{`EEL$>)%Rz=vK64LukuJ(3V|EiJX+b;8>KupW$laLwA|@ z&|MP>9{^%-?B9eq_Ji!5m>)W+y<$bJ>wbuuX-zG!mj}WDaS{ zkV+y=Sq+v&fD%fG4doe^b)NmbqeFoc9txcJP@cm8QPOiPj%`}_P2t!%cwK1nMSMUZ z4gba*TPn<88^zfbpx9O9N_S?bMYuE#3O4!aD{unnz6{-&ofx55O^kFpAS48psUTVo zZU{p|t%1}XJjEc|`7CvIa*I@Fp_C7PTuFWUVHlSkJS$K{6y7i?d*j>oZ-8^Z3fDp$ zNVoxHVH?7k_I$gwb47&!;;-ZE*7-eb?jLA#cNN#mJo3zvmT(_RDsVg9V>a+GYlF-q z>~72>>y5|N8EkQD9(jbN(uH}1wpy_>kI>fTKLj;lz=5yBZs)m!^JJ~L1nhzzu9Q9K z>$JAS;R@59csmb9=K2_m37PAUBzdYT%;s8Mn8r0(IDzYg0@$AACT#!7GPeJ;uzfrV zjMcK2Hm+7F8v~~Ik;%lDb#)L12!DjaM_iFj-YDsQ zTU-}p-(xKDJ9p!OWOZ~%BzVv7!83gN6Ir>={{b!CpFkQ~(y|d1U##fqcE|N}tlRNQ zx<2Z_?q7g!il2X6ia1zVWEi>43olCsK%q417q@%G-y!O7h=RCm9DNX1wt2<-z>Rp$ 
zqgEk(zft0Wspc`?GQR`k9HQGSuSmY{Y?f)gS4QGqR=5l?p!HrA58ZWn$6Dd-8|hh> z^CX2H6>;ehB3wYZX~B%uDY3VUNQfr%sWM#l)h0aT!?W!(C0wfod5wBV`GKD&q=_ylr!hu~)Ocz%L^5H>ABF z`%%6cQ-K1jXq0mz6n@5Q)JoQv_i@oTOTr!a9(`2ll(3&>*~9+Zq4}PNg0RUQc(a;U zWAP&x;JVJzH}nlUH=&icn?qn8q=C-TSA?y^lD060H<335&;Qj{*U6k|RI|mQP67hF zpzu!o*oKWpkRUk42JmTWFB|k!p&zHuJ<(|nSJNzNLd~zfSAy9N1A2fFSj7{@Erg%C zYK_PcX^pcgX>p0FY9TjK(<^TD9Q+(^F>lHh?^S`VrohWl9KzRBE;w2GLX22GAxzf| z)kGGoAu_RGHA0yGlSdP!Pzg`41}skoEW*gJY0Ttm!pLO6;@O0e$$$j{KRnVF9xA<$ z^n-O(BDd1`DAf?9tR~Tr9c=TI_AkL{FaNzuwO;nwq+5|Y;G6MZhjqq4#*?|EpWqcdH zBTlfVFxShkm%L%dSrl2@Ukbn^I>TXM^K5|5(6UTRPEgUIute<*VF~yzS>iQim%Y>I z8#zEBB+fK2V=#dWpYQy^cb;ndy*oGqg3M~s3e*13s*$YH!9CgpX6;*2t%>(b))C^} zMz?5>R52Du6K}>yz5jMf89MiV&0iF6#vFdm%pvh+%prbw%;Dd2m0gNzk#3Pro@bPc z4bm4{!9@DHw34A6isC32DH+N=nje!6!A(s_~xpoA&(WS_r1R7*i{2ADn&eC^;eQ;x% zG9(K{Gy9ouOQA(LW*rZ+*1;! znR0~lfL64MM!(JSiHHv**$qylZyJG=Z~6ZhM0GLSqek>}R2G(-O{Q0mhbk+0E{M;N zUwoj8@TZ-RK>;2!5o{nL4U};rp~MEsMp)K#2!)M0gSu{tjJA?R)KAZmgYpKsVfD(t zL<%=;B4je!K)Z8`!ETT;ydos&pfA*!6VG|o*11oJvCmWpO-#;f(}KlcNlh;wBzKEh z2(-3IzTwcE#mI0F^j*{v_N+0pXFrpr(Zu0cZt!`?g_Vfd?WRMags77~TIfK?fQNQX zkzB0;dol(ezCae9UUgETD74h5F;tMh4oixK(oia6%l(&LDhm_>&Jaz>c zd?Dh}-2g4YP&=HAQ<3h_puujVL5oXwhlSGJq2iybg%J!900URE-^1srJWn;xluD|U z%^_8*RBe=3r<5f{A|pEcr$HdaHmBy)8!|$2PHGuqTRmiCW@?2ppdxcpt0+^bA0oey z;+k4qFEX+?wFwGT4;7h_8W9j8ve*)l|D9=jz3@3nWGnbyFTak(0lOu+t+`)d4SOiR zHD8JELyr~iGzBVDYYJ%s!a}uIBII@0$mHf8rUD8gOym27hpocCN77JTqiMyz9< zN#=Uaa80aMtwSl)`ltoR6rSs#vn96+@66K%ZAG*E41CX$Pi9I+nUX8+MlA#2821C< zDdA=gVmd~_T(kZp|20l(x)T9y*a&D}z<gz)$Kb3FZcur>%)A| zvbLoM#?_9J&tFX$M;|4Tu$QP{bwj(viiAc1=%%VIrF19pJrW4zh-p^n=eQaN4pu0~ zhOk08HY5}XFDVOzUoh?)$$nwre3|c_&t5e!IagZ;Cp3+7zY&Pj;rHx=n56G`o+6ac^BLtfKvI3A^Qzm(Z zv=99hlwe4oAeI^>cd#Hs+&7>RkW(qpMo^$VJ$l22GQeuoCf+S+n|0zBlW^)*cP7ug zR(oh{aKk#tjM0@zbIZ&&iq-9hj08?fXEUF8Fg%5N97urXZ$b z7*pq_x@c=$bYiF56oyA;-Jbi{aq;-bPPa$85d$(2cCuafY>Yr_OgtU+Ho8r}X0G-L zkUXK5MQryhbGBKIeBCdEgK>q57uhZN+%sgq@$%1afU%6)D3xo5nD15hsvL=D?z0SXlb 
zXcOA7E#s`^NdZ&ncxfL_ZL1cZToTcweAXCF3v$_%^^|ai{2M~~Dd|>PApK|~L}>Jr zcds-ifP_c&BTOCs%lZVd3wO%xiTiFTOkf7LRv2xW6VHl%O%A50(doCvDw!rf^#ro6urZ5fa(Fpzb3C zDwwFQLb_F(uZ$tyP8(>;JLppwlTa@{xY3?q@GJHz#}q7FDRsoWZ0TP{C~d+KW+gQ` zC^|K_O#~yLgXPSd+pN4o=QI;e+$#lz(z%sR`7=X{k z_`OLf6PIv2)$EaG#3dx(7LQAqP4L#i9BDR1=2+-ri}V5qd-vOh{>oDfhIEQS6sEAw zhAFJ`o#%XKO)N}d&t>KF%wjohyDl^5nQ?}}P&`Zllt7rmHVNu>1U2o3nP=45OvsO@ zo2-DYfmIj^p(mqLsKK#YqNGTiedNH}+-#*7T3P`B$-j@A@62JzI(D_9rQ+*I5W*R5Cxw>-rx~mglXKLsjNV&)VU1inLaq@ge@H`FtIm=NQES~U z6mxiip@yAku~2Bfmop0<^n%*!a6|Yy#i)<9?vGkBLDJ1xYIvM7z}}`}ZZhP@(l_pA zIkS!`2+9Pr{z$8mm+^jdyncwwjpvfW zNHFZ;-i=8LEh+nhJ&T+HqltnUH0~KC46U5R{~Lg@3}tfD4WVg+okd8y4buH6uMDGo zEOQyh)PrRUgP#v7|s@ah*ggYwaFZbF}-o88PFK?*pY4@#O z!0Vvc%HC*@h(`-9!Eh{t>TWn>gF`Y4TwQ4*`xl?VKdzxkQuCz+q6sk`tH$1Y}9?B zI)7we=(%E4_i5oz?DBn~GX;(NPON@qf5ld!S~T)sq)N0_Yd84xZ9)saoFv$hT~apT zc5ZnqFK^FbGe=m#^sVHJgbE1u0WVmtvJ2&xdn@~MTW+V$^&`kix&wA$qjk*)txxW) z_sNzbVkldPmI0+-jGXR_DbT&OfFcvc+zN8Npt%Vq^0$N%38#r3k{l7k2%3<-E|{?@ zC3qy|74HL?wE6&$u!Xi-|s4JOl)Y5Az?v&816IGf)M9u=k{ z6nB+M@sX;n{|h0PI@_@H9cn{-?jpb18p)*Q#9>^hjrW_+gex#=*#%<#dP8i**@gx? 
zxduC#IBa7sTgPcu;rL{AfK@L}?8C%RDPXEzjGr7uJU{<*FwMyZmsE8bzdIOIPC67b zs3B?c^P|GSxxmMF#Mz-fb%;6cuIzWiEA(l^&=h*a>JT&)DhkUMrxyB#fKrde>F;>l z$Ks?@rm#mig-f_r6?(Z=7rMA63$0wS%5$AqfUKuJm55AEN05?FptO3Hw@l&TLNXmC z3npn5hG_RDB%(Y_LrO#{#u-Xfw1wxO{u2Zrz0~3fDb##&xl3gyG}3sM=&6)Hp#>Y0 zTtaB51L*v!4i)HDez&^Si%L=MAT+Rm{KeMePSLdA&2rM#nX)&zw@S)jt29aX<9{Dk zeVDyHsrIWnMz>Mk=?u5>X8b71t2246c&7u#S6%Mu-Kx}ZrLiu(Nn~zT=B2qH0_o2T zVVox(HMAj+x{2R|rTUEoy^~jG*SDoOgaL8f=0WdS!G?xr1L2 z+R}2AGQUqXoi`M2TLaUob&l!R{(}+id~HpHEwlb%Uzd^e2)H%9KxelAcd{+O+`%ES zv+r6Z#L~f5@%g_R3aC4T#U*k)H(|AQjHj$usO?@*Q(7@|xmFjtxsJDQQEy?Wx1%2C z{jv}SA)Jvxiy1V@vbP3yI&wodJywEAhSn~o(^~1jXA4Iot#ga7a4n2 zHTl>L$u|Zoy?drfHdynPmWM6gZVg-j)E@OsgH~q;aE1le$3&&IN3B;q9Kttidy$Z?ZS& zs05ae&K$Xmzdv`G16r?_ZyO2;#np5LGY?Vh=OO$z6NS-agUcb5)-r@^n%K8)id*T` z(w@%X+$wpz>FAdQLUX;i6`v8bCqdOK=$Op-x9gtYH7!UU;sG8aDH{*R!VW2Iv9NW| zk|3-TSI4=!Y1!`O>h@U}dtBYLyuoi>qzucL8|Ug)Sb`$iQG_6VM`v=aE@ZeS3&(IZ zuEf^OE#^vGaS}!~T!rG-5uS7RI(RkiR9taAMs-Yi8-yblx9-B{Q1=3AaI}Jq3o$Nk zT5raMn8+I~Gvnko!#KIktP)RH*z zO@qzs4wx-3ge6DfAs!~R_v7WZg6t(Mh|szrYsr5pAN6!|KL(KA%igC<&52APo4At~ zy4`X@Yvkm{0YdKJbnum%xtjCeH9@@5y50V^o zx1m|-T>qV%+V;9T&>(dE|4EPj^&L@Uifq?5ub=z5?x8!T7_Xr~ZP6Bj=V`bMIElis$V?DL?sb;0(}GjdT!=UHSXF zI1CR8isnQC$Y|HM8pS)|$5LZJU5$za$L~;Qj)vrJdrtgn>-43w`jDBr9G&=%>RG(J zsOqy~&gX%j%t^o~=2shsqG>^)gwx{EFpSrF|4C8bBl?B})5Kj&9@>{%j4vNH0jZ4j zi<}k>0|6hm5?H2flN(%&MC#)lVsy*k&mCmw^F;~$B3>i|rhT!3o_UW|OY<=sJ%pSN zy_0*NiYppeL#7~P+Zi9J41su&$nq}bAhR{93cAd0-T5|=gz#*98&>58OAYoKyZC=5DS zzi3=|>itq(8t?o=6udPf-7D8bt<^HNemgCo=)Z{G=_D^7bytT1n2AI{A>a;dMmJ~t zhj5Wz_-zfD$X?cZDH`v@ubTg{bh9OgyrWUE+j93%-!(w`KL_(jcQWpxQUn>lTb-8i z+;}YZ7zt3}%LwL@0O2C}9~G>aRx9w?F!InzQQs;?Y6_>?61^Ns!>pBMwC(4g^zyjj zz@QO55cwx=lry5UMt~s-dBw6n;uSPz<-@hcAMxe5KjQx(lbzB8_w;QZ_mK|!)Rv^f z&l_IER?gNAM6Rw z5xkjaD<2v^M9>a^TDgzRFT0P-m-|Ro?ju>bj}+uSQZVi#S>ryEHSQz6aUbzho2ZNb z2=B&$gy)&{J9wTM^FwUK*D{m~G3;11fG}$Mo1tWwzE(bnS2G?@6^E}g?SZC$SoRztEBzI7F7u1n(9OxTb6}XXwzXumgtV0QTGCqK*N6UGbx4U1 
znv>ijM#)HKh9L1NA1MoF$1@d>q-X7v;b2cX66M>>WD(SC91p4Xzbb;+5(!7RJH!YN z&JH3Au)HEsxH5={gee>y4Ek3d4Ux%E+T3mPDLu7BzVR9~LRqq;d?SQat9-3V8yU-Z za@3i0U0Et?Fli&DO;q``NvD->NfJY@H#Fd!m`e7CE{8~C%#OH7{tK1_vJyia3B(eu zTQ$UTZ?Uqm&TlffQjyF)60y*khcU?PW?V^yX*uK1DC0xiS$qM0fzSu*nTUqF1}DV? z?Se*n+`%4k51l{6*RZUQ)r47GQXHW@Bbj}PCbOk0ge8P(^xpBNp$x%M#9VSj_s&g( z8GXkKl7keprXhQtbzM!a2vcQQU+#t@%}wUm;0oR|tSs(pnqI@{YB=pf*+0Utux<($ z;wIxpu#aiz&+wsZ(Nw<7560( zj2SOntAzn7Sc7e7qdPDOXcV9FR`=)*jjnp+rcO63W0fC#Qi|-iU{I`o#HaN794{3| zU!?&t=UKzBa^!(wSj}P^@4*&{z>E%y`>Lpzha^@ETMvox&}}#ETW}#; z#h#nSdepi%+;H&am3q~jbfYj+eOGa-0`>B8ztqx~8d+zZtf0J&lhc1u5Ep~ns@NLs zMHsB7wE6mY77KK{?#4G2=E5kRrHh3r;h#uClzDheaUW2{Ubqr8@*YvAFaq<)Su1rL zTotZpWg_RiqA}18n;tB_0WQl*s(i)@V01Ipw%rX*6R*UFbjznf`^^um+65BZCn-qNB*)o=XuNSp#mX%#X4&kE$oMp$U`V8* zhqAVik6cE3WqLH8fK$K6piAZ{d_94~Xw#C9D{GOFuddU==+ofXB8$JS-BPcZz;G(i zmS`Tt1T_g^gPYeH;Ah>#!)}p--H3$S-KP5md>-U;FO=ba zFjUWrf@}wZr<;Xrw-Bg?{2W!I=Q-iBQ3}Z`@SIUx$xIKvcsS9E&$?IrRzqO$_9WQU zp(OuN!yPa-QDYyY&Rg9h1>c$;@oMwG5!i;^$+vh^m|OJ$1OV-yRHacF+#!ZY|6&>z zbl4r-WvmERq=Btz#A;x@Tg<;gQ%22X}lGsUc73Ls(tz#j2 zP_1z>p6B5u+>Il!$Ox)&u|&syx2y*@)_sd$%z18Q?v?*L3qA?03Ot#!0Khn$<_5;W z85oSiX>MQ~^Iw`ysW7h@!$iW9f;U3rz)b=Y3!^%9woqbu@mVqtjh77Dp+wXb*9jX;RocXewwrXj(#93I&XliH+GthS_tbAz+Gtg%PukQkYgLk_ zjpRSmpCqop-Esv^;tFg`3g$ewb)xEQ0khzlP8h7S){Y(_L&LsMCU;-xB9#q|5*x@y zD>M51KA>rU%61CN_$roag=R2#-}i;Aph>c#3M3kj$FL33)Da`b)kBDDfe5;2Rui-v zvz8_Mez7498}NJKxkWG$Jg?U&)tsEeN%udZMeJ8k0No>zTt*4ZUGg_c$&^r&b9VXA z{cg+`jdbcpio1av$&J3*3t(7Ch+}B&Dn6~gzy+Yp^McxUNU72xW(7S$bTXke1;ac0 z#rstYBbsq1Gd?Lj@=ke53Q}PW(U9iSs_52wl!^ z^!fGn^!Y&@O0Hf&F!;2t;%|{~Rxl)#Me>9KDPi40@noEbanJ6WOVedAyBIQHv*lV50! 
zhp)Ifo^>~h{qGpJo~uTvZsQ$>mRw}!cY)s0E1z;?De*I~xKmzGlsTuLn|KJqjye7O z!6ipk&hwq@9s~K9^?VeCWr87)tv??d$Gbppo6R_y<5DmGcZ083c`cmV(+CQaWaxp9 zqFoW1>aO|bKA^-+V`GWif5}NyYw2L3@8ewfOP!opwxx4uXTNv#f}OcU-;vqGA=6G< z_R@K|j~@D+4*rG`r=4=jiH%LCP5;#7#@3^cJG!Ou*p_3bv>Z2OYGczU=gw*LW*02{ zs3|Rz)il34drLT9{$#>glycrzdq~YP)^!Rv|IpI>P(hiWU@hTu9G2Y=tY<&4`TK$O z?FS|wC;g&#%;Z_CNx$RZ851WTp1QW`-TZ7 zpOyCgz1u%U?dJ~PQC)p!k{9sKaWBYl`jr{Gp0K^>8`Spxekztsyfn7IylP4K`}tKT zp!ECsRVPpTRMH6Vi_w%bx9J(K< z%4_uY{Z%L4zAq3+b%K8jy`NvTnSp&XsuPp;1p=u~V7A%Mue!WDDw%(89{26fKN$F= zeMZ{umwn%BxLx|wRB75gofNh+endM$c&)itE}bDQUb*l;S@=y#q${BEQ+48q@^kbsmxm*&D(f)X7dIGqWgSNF4F(>i zD!AZv1Fx*ZsDxbBkEp7QS_GeO;FWb4!GB=jm30`wpE2;tIyCr(cfOr1H-3x(C;s*T z@DmOI_YVM{4g83z%6c|47`=PtVBf(vHNx$weS=PjH+ z|4aFcF1&DVzA$&rtn)6q_=~eIo^y0QakPKRvATRjmnr6A(o^+#oG!=fa)K_cy0qys zrNvyQ{H1qXQkZq#7w62%FTA91(FL>4`^d5SJNOFco;T(AgRF63{=)pM^Yi)h|BAO% z>X;Xq%iIeV9z_54Q}e%9Yu4-w7X9~Y&N{f>+Tl;kI?X@h34D(_BB6xF%a&!}pj^NSG zXS*2eAHf?*>!q3H}@_JQNz99w|EsEeDioxF#gC7%vzc&UyF$Px`qIOP+!Sxuy{TTen82nQ)c>KH8 z#^CRdmG@PdpS~*&zxM#}qg06B_;EWv29J;1c`}9xwlekL(}*p_BFx-`-Jy$J>9`X%%?9{j)!1;hOjOe!n6HkB{3|WAONT z_(cpJUk__z@c4STF9wgVhlgYE_kwkMjdDczm3H8-vHk z`I#6zKF&L0@c20IioqM`S~PCX*_D3A$N8cdJU*VU9{|4OoJu?K@~bT)Z}3ule10E; z$H#MP3?3i1_w`oVkB{5m#^CXBJ30oBkK6GvczoP4F?f93axr*(+&&qD$H#3b29J-! 
zjM)kE8;{Q)J^=jmb1UWJZeE|5c&ad=muX&8Y%`a5iiNE)di!1Q>`2Xb63Oqi}PsiZ#ao!e#$H)2A7(701 zV=;Jq{15wL<-PIotc$_p^U@H5$H%!8gU841$VHX*3*6qd^{ID4OmKeM?2Cog}MB%rJwuS!0 z;BKsZ;^<4hbU|VEx!jRf&3#_o}) zRmj2CE6d$wg~(rDb8ofC|DgUs4^pMqm3K=;@;`XDR4hhOa`eS>=g&UahC-dm(HCE& znLdc=E%#hYX3Ej1(f*p(m1`gTmr6SlbuWKO0cBnOq6;pVdm)SS!ix%XkH#Q$)Fp-4 z`SZh`%$+yuyo+aFAeHR@_$S<^trOq#YP3U_{rtjhR?hh#PlAoNTy&I6?ENslzQj7= zkNWq=a`$a_eEn}8pnmT`*57=9`Vk)ezxDr}bbf#R z?>or%f8qf3mmOsNe?CC{6$e@Wu>;g!71odR_hE7$9sXwOi^uDT3WWC_xfSt?=$Cj0 z(U*u;i8s*iZ4eTgNE~ZYmCLhglwX|oD6O*kMRy%xrswhV6q)8CJAkRgW&HZdQh$H_ zUlI1d_3gH%`Y(DnzW@5)1p9k`N72?B->>oSUm4av!WJ)Iy|BLaSNswu5pW9x)m0^AP$;W@u_(ib$sJOrJ z-|`(>Vn> zD;2q6la^%(J+#iqsAHuvR%8Yg!4r`lUTq3GdMY9_4$yG`3FV_8Q$E_}{Qcj)ciC(b z%IKTfegF4+|M&lT|M#((+#ZRx)p$ITC?4r+Qn4jaN%A)q=~hOzN@q(Al5(ALt+J}a zJbCR(&3H4YSyQM}wYG*@t#u*QSnL7WuUdLYmv`O_$;xFf%DcY?LJu5Ihtgo4E~mq2 z8>edI^eMEBQyw`}0HGKKQ163!>Ihy3$gbD%NFCZ3~|w^@`~WnTPLFKKLzTGc$()+5ibx%*-EA*+#}w{*L%s& z!g};D-W|UMAkmYH1j)S>X4v$b{l5IWkUdOF{UEdv(nL?x+R@x`Sx!#h3vTTR8bVLU zW&e;f&w~L*QTxDbylkV0BBL}@4fz?=VdenXmfZ#Js1*(QS7e@%)89a=Tkbm>L&zXn zT}n;~Qd5!Z&Wxmw%bCN_f>=%4HuJ!@fJfzh|CaA~6d>{>f;>K<7$17RJ{>H6IdeN) zQ!VQt7$c@^EeopF%BK51*iqk>ZRm`cF`u=pR<+_GK(pJji`GZXZGP4I8Krf^?5nlr zDP}B~e-%xZultd$A3~MJCL9Eh$Amsh8zH~xq3~p$1!1jhK0vmc*VfyiphjsZnCm`N&b`Rc&^Wroh{yGZ4324JAHXdi9$5x>%)3jIUG8e5d6t=7W-pNVHCm zJ}1v_m6RuF3RcMb{v{hdSjDQftWmX=Bg!{4K!2^Z1JS&oJv&=z&n{XQF<1K(^KyzY z1^4K^!*(hvn3-gwwIirzecK{tyAL8Td}VEWw&4m)lHW28ljml8tu;$AR|oT}po{5; zH9vBPV$9oRI|^YuY=u`&I=&k z-c!y1A_7QhXWK}Wy{8=HHu9A_4)zt9SLAdK1kCWmG73KJ)wH3gW_|u4L^W#unGKAZ z6I_5ZmmZ@SX24b&S_bz57C|fDPl}t|)3lqg?QIE6VL`iy*|ej{dX&eE0)*_us`xgA zP06*fUBzrhxDdb30)xd@*V2kX_yS9_O9l}(b30!#=Jmn0Y;CJ*cKGwppA_dzj;4Z6 z1bvQX?HVWA7;U9Rlco+SH{h=$tdQupx4FA**w3^<9JrR8>!2r5jBD)`~%bt4o>y0q9gmYf=@B}U`p zbGOace!WB5I*m=RyzehbnWRmDiK+ULpJG&n^VYdJLu#)6*b>8Ut&|*VJ!-w zh4jT;g&~>I)U3=uQ0M;!i7EPkabl{R zHdzbjkxlpnF;ft;1TjZW-_4lk%IV#VsF%~UPZ=l9l+)i~#94CsyNtkc?ZXO)nh!jJ zc0LXo`ytHk%&%tj5MXMr$+wrZ*XJ)~;}V-$0N=*BJ$`es#9(1f6v)xH;;&QSj+~*(={R 
zKJv&rzl|YA;oNP#9V3#op3QNGyzf}@Ci~t+GkuX!n3B3lv)Vh0`AYaaB)~Ml+F186 zw4`1Dy@sCy{G6a<9U`NM|0uMly1}$*Y&cg#>Bh2<^&5q$sUzrGFU}G0|HdfH)Z2{0 zEd6SuFh^fy6z1xyjY7S?)+n5*-)I!h($}|)!uZqztSOq6qJ8i&H41tCLY~z=2wS8{ zd#qg)^Dzu0{Zvz|sirtny_eS%-!Zxpkp}oFr+)&1Ef?sPcRh}l_QXJOfxIUhFA_$Z zGu_t^J~{n8Mj$2&xaX-^n9KXNY8&V0_k~#5lBzEVZePk{RShD zp6`IL!|-J+a59pL@EdWQg?Tz(%e8BS^ZkSG6O;E@ba(P@axtSJ|A4LG*f8kzoM!i% z$vEcP2c04#D0bB2i!$TEljOw`eA)*nN8$Em|}yRM1D2VvbXabt28=oF=b<^+?kFq-<#*cu&pJKz-J`G|+grPYpEfoT3Dpjj3v& zwdRh1R?{Bntmz79XsXi#%}St22{eKq@YSY;-G z#bHvAgphA26bmYZ&E{AeLu{;xW1r*LD%49jc93copZ3Z4!2e==`YQ#bW=%P&%^>}I zxZF-&6Skkz-*;e7a{4CpQz5=_j%{H`^%t@Eq<2fs2EpwWK@kA2z3k(}1)}L?nOF&@ zn?ZT;O6<~8*5Q@>Wf%coX?hiWMirdhP|Ete61${Tz~wTG056MZd6!qJD+d|>tSUJB z7Ax!XN@tW|1bEpBDDATIu3XON8QYwSmU^YRr9#kZ!ZQ;yFA7`RXk3AZ^2c4)1E_@e zR>4z*(+1@#w4N);p`)!5{v`03jyz%e0I-*<@Y7FQmE@dca*BDy_6%UnSOAsyF9kl+ zk#AyOh%>yHZ(?t-7+%abu|GV{@M6BPJpy`P1YSw67cr}ZF91GMD(0uy6Ru)-afH|x zwlln#pJH$LA<4(%E{Ch&?=m^Xd=~pj0}h=^a;~X@OItT@7) zdVEuF-#|||E`_7Zn(5V2!WJJ$bm<8fw-jrSUbR$-#;$5>TNzmuTcs?GMq-oWdN=oO z)}?sY)~$Wzss{G^gns6G?rb zZ@b$l!6L+PHmQ3ck(d-#?mAZ&(-XbPzJb`rF1>4O*Y>`YF7=dqP#QJnP%=J1QH&)w z-};GaHf-+Q7~{;JWX?chYoaTe_%y6cq-}SJ#CTs%e-~n0?vjWRg+YYzlesE*TtTCY zOZael&ct9MogB+(Vl6E~I{|dfml08m2H&(%S zaGWZ-TMIb>x~!wP{bj$NbKEI^+`;d}k2`Q@+~$&CF2{+lb>L1pTdLp>a@=iyp$fmu zemAuJa>~~nxYM4_D)`ze_$^iN-45KTcW)K^4UUsv&iReAsM+>6f)?{d_ZTYV_IY>& z{(S;b2>&7;fhz=}5Pkt3fnP}=3gHX!2s}a{3gKZq0>7C+6vDrZN8pX1sgV8L71kE^ z5bnHHkFZ{we;TK$@_FnA@fUAJ*b^N1XKWVsUg*G`xA0a6Uc43IyUl?YZ$*;ykOME? 
ziX>^&fd?FVpL5{F+YcDR$?ZLQ*9O%3fL(7Al%5#Wr7#_)Qg}l$DTQ$)bcG2GJI|oX z*NZC5;=n3B9IA;f<*I(lmmB38Nw=1gL;kcQN{;%|a*M-K+zqOD@3P8y_McV-xxmlj z)A9;)1b^j;b6Mj!#y_3;uHzAt*hk3{jwkh&OQbMvFg=N0WKeIPo(SU?lrQ0=j&I>} zo!AuX9_Z>xICAv=A?fU-6nCy!lGjSF3WYC+$)0mS5yp>$NxXHEDka?GY44)-|G$o> zbDCsUvtN8yf`yghmx|q{y^!{}R=YX*9-+$#%Aj3IboU@Wu~+eKiN0$p=|9FD72k_o zOxd^9^;~YNVBmE#U4l@W?TYCqAKm)z23ARbGi|IW;ycrBm4DmqLPsV4*K_^-LJs&y zw_E=qV3qWDaz^pp3N}{U@&6ELrSU_2d>bDpD);#FFu2nAaXwz$@2ZVoS7rP$K3;q; zR~x??<158)oE8>}W>;+f=)B=#Cx2NntLPu)hn)D%=3>h7=dmi|gZ$7F-{aNBzfooU zBm8ir{B{<1{9mo&e?48kQJncpzXrJdpNIT{h_dw?%6h5(6v6Z;;22{xd^`_CMP5#CWm)QMX$+An6evpdk{DLd$`^ p!QA8Nj_2O{0ZGmLC&oKnoi2CSFQ6_K8ZHf;%_`{*mvQd#{{^F7>B;~A literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedbatchnorm_backward.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedbatchnorm_backward.o new file mode 100644 index 0000000000000000000000000000000000000000..682c31aaece1267c0cb35fa8c83ec78c14a376af GIT binary patch literal 94400 zcmb@v4SbwcnfISb2S_n;&uD<^MhQ5pDMU?c)QruuWMBsFfk}zd#mR2K_)KN%Zq>j> ztyua5CP9W|YSgY<{8!Do)$RVbZmV0RYQ>~LTi#O&rBH!T-dYMJP-xQ^n&Yx{&g?h7%nTz^j+w;^bUDF8%`@L^ZUNp zP4j;5onFss-ZOodWLxy|LwosjM@#PmUeCAf<%c%{+0why>$%=at{JeBFM2&!+xs_` z>%EYSHTtUla!dT(pd~Jjb^5&%UXM>=-}%YH!BwHIQ~dmIc92`<^~@qUQMf&ke>&3j zHb4LMPSW0EX`2)I=V~mim9*)WwkeT+wmDUJ&FkHJYFSxfyWe+dyD+GJrvOjD8U+hFA`C^juh7DkhO zD=qovmfXZaD!HH->*#aswd9^IR-I2hR$VeyUsx8ap=J^jeLonYu0-Dr)|+LqQG2^A z%s^`j`;&!z-kLGM7Xd!=NTP2^T}xlOqNT67B3W1+>lSd~j+VYpRrn+`ra@8)V=aAu zOTrf_GKK!wDoWm*VT_GL4o+mK%qQlPwU#hLp{%{Hj{yfq`t{@h1H z87RlsR+d!$*&$+Qt zamHKooL|^Zm-MZDZ_U9LZ_Uw*i%qfAtfzgZxfO_gelh$peVSS-yto)XSMg?5Uf6eW zaoz{ZS_-dIbidDd>PglDX{6juiXAt|!(_Uf*gPdD>g^cuV1x zi;7LZAhC>ojA$T>=;02{_z;PHvGV)M^qISH!gT0pr+ z)=Xb}Y*F7bTdqFMYqaU%z2%v{#mq2EOf!@W-Q{oH=L6bGa*e)fH_4e?d(2O7-JRb0 z5-WW{I=3~G%f#yatu{?J0^#SnVinB)jzru3W()sVt8M<-bZ*(6&_Af^%g~oS5GKD zPMpb5L7%N=qi5EJ)Ew{nsy83pV5>Hh+v4Z$Vib?i>8<)6%PH&+KUIEFvFUpQr?iuJHhKY 
zt&}%X${TTcCwe`HnG|bpw3IjM@=jv4m-5C+d1EfG!t2>s$}1zuDk$TyVtmuRo}ZNR zBBi{D%RAZYxv7*_Q_8Dxd2j2w+N!B9rPsUk%C5h$^yX4}vrB(_*LjxST1szq>8Es^ zZRs7Q^bVKqb-lyVJ4@-Er1Q13eiQM{UG08uyx1Dr+_OPnO#c)GSpO37hnLLBr!I4`z{ zWjV}xy^)4}uuF&FyfrWT)7IJEV8!+ueq%?~`k_r)!RcaCr|Po(4BBZd6&R!N++zEw zLQ|tumyPvgIm;_-Hh%HhH~Dt@2uGxFMF9{(7nHCTqLtztq|>wRZJUMVo%E_}7lA zRZ@6<&FX5V!iGet(c_2a7du0f!}B7QlcQsdjIJ(EC2f25d)bp!f8F(}f98HKda^2? znTd~iz4a3l6Vuvb6`5&CaQw7PY({*q*MH5ii3vW8udmY@%EW5*g*rL-YBRhwWp#B(Q6}K^ukE~2)JxI8A-3UKQ`zh z%VL9oFv6A^9-pqs9|5U7P0d~{*xA7lv1%@_2jnyL`6J-AVL+&7Wo*b=0s*^fKG*C; zzto&R0*d>A)v_$+FdY3Kv0DDRm0G^qnm+=j`?{-z6U(*4)_2uh+X2vp9r+`myvwW} zjwS~K?zehcIsy4oXZ{G7?;{4XGFGM>S`yy#qCf6tqP*xW3i3xlf|UlgAr>((cDsT7 zR_CM_{f2`45%A$NL~M>~tTVNcg#gl8anN$2|Ew^7K!BztMdCg*C_Bq zh4~|uUeC8If%$5-wpYi5GJ*K<5E)){je`7QJ6n+ zO4o(-ZbPiYLG{FhP09Z4r~~_k!u%1ht480s(?P9{3AZwYF$c6pAqZ}SDT(*kY3}v( zlIG{ee2|eagOHNWRZ#GLFS2r-?Q$=&Ze2HZvNK{o+n?~F4L=hmZpq{}vZ?o<=S4Oc zwpFahkS(!j({=mL_oCtJ!0JoR@|UxvBK;1G*fIyk5Gt=@??tPw%R7o8LvJ-X_`h5#y+-Qy^y)_3?g-89q->PIP{&>opyFC>;X0b~k z-K*I6U9nn~DLar#vQY2*l7=0QY0Jh(yy8{NlxVg^Ngg#xNah8i9iH|B!bJzbT1qDi z^JASEy3@)e`NO-)4O@fdNBFzV4?UDFY|n5&XsUf`nUO3!22wvMRd{0f#qtcjXcPsN zD*oeyKdr~n`@_$KQ{1GMUP+aS{4q$Fj;r|+#m|k+x-&ieLO7kjJ;WW4ZJ|>;oj)4t zx{7Cc*X2B?b)|WRyFSYEgswArp4jzvruD2}Mbup{TEk!LWIe#e!;wGFn;+lP^>@ct zwWpi6`OyJAL-vdgTHKz|wHCK$bjafNjE-2`p3zZ@+cP?5@iChD^3AEj;Z*!c*TpTl z=M%-NL+5P{|G$i^j8F>%nqdEu4`*`RZZ|>{gs%_sx)rA4+m^huC3hF;nmU!aQ6b;2 zn?SVV1uQEe?-@enjT}mu!mxdfcaY(}#*bS0mc?twgtM1ZY}NsP_$AKL&7qcJ-O2gq zz@9gd?B^#!U7zAv-gTB2ndwE|ajQOomwn!gLJLLeQ(olSvXqxS$l&Tp2+!h3n$3|j zOhTAQq^>^UMdp9BjItE>0(n<8GM0l=o&6lCYkrkrZZe3TPDJ4`_X^5grw9?c82 zRL$a1J)4%cQAyQo>TLgLnQHg4dsV9!nRiMAn6A?z)Un-*&O0qamC27Paq)@7v*VN` zMmwj}kht))8X)iWq8UXZT_3HXpC{IM*=PCQgptLkP;2ruS{SF`X&PSUMD=stN7cV( zs=SLTGbi#*_j$dSajGT@Y|tZK&%e{_Ol~_jwtpg~E9)ox&}NWA)w$LZ3IcFG<~jKFN9_B$UBr! 
z^(!U3L(68ipF7}}E{esfQnRyZFWazIt` zH0t%<_4>qw&jRYmhhrf>^r#=&k?adYN9nThN58gtYT694LFGb9#Doq=^wK}l_n3@EV`()Vt^aEGr;t8SdNR7u)x z87|u+0C5w2BE9u_Kldsj}fdaV30Y6TG?UxiA{zB+_ltMF)QEVi=bq`o@ zkssQbm`Mjf%YJA?*U+_qB$>mSOvrA6-Ms5su4zdXubS`+J+UF;N$BkvoDu`XgFeG( zS~PW~+!#OT6-9xlPi1t*&u!7(?I3D2IfaG3HCGjc^lFQUc6C7b-dCQt=8!J03k(iw zh!70lZ?}c?@RPcEkLf0$-a)LLJhxZ{qo>EF(+}NuQL*}Mm++UF)^A82@F5}mA+=Gg zyxI>9`32U?++ui<`q8LSW-@ELbU zl34wfp4}Jb#cE~^!FpV}1UPpUI7oxuEvI<5oKzx^wKU#0JQ^8(5@P(0#PoY~?lY_I z1`u{MvWVsVUVDr7P@7T5!;0O*=WctSw$XH|X&=_>u9R21Tag#8OnOV6AAUY^Wug$- zvqTM_(an?gD=#axu4{mLQ2iZi{~q!_ZEb^}VkD`RJ&Eb}+FbxJ^`iZstQ*zeR_0DM zRs9|Hq54~7?$!IFUhPB5+;*k6Wb5!A^`NqqsxR-2P>TIEduzNHvmVG^`uy+^6T2p;rjIr@GoPaINy@Qt|lc4b65q7C^#gJOgNZ}k3t}j-%!jlf4UkrbV=4-WK3I_uwFb!(Prg*^v z%7*0&i#Oq|xm{FXN2+jiiUaI$XNmnQj<7z>&4o?EY~7R=w=V*d7GAJ;!Wo)~D48)E ziUO>Yck#8xDJeeJ7$N*x__PqXqsjQQslszfZ|<=~hz{&d#9vK#b6>qVlY2@Gaer>H z>Bkf>M(_sfKCHNI65w11Twe{6+Q3CeXsUrmeBw?t?Gv7sGZW z`+c9TpwMeRrF6)M)4RyRb%wZ%a|yt!V9om|^b1-Ah;^GO+z*B;>Ye=8HR=j0xWNt! zV-;(sbf)|Mlf?r1y}Bac1!wyF3Q^FV;umWrgQ0IE(1eZ>Dru4-{GABsvoH!Ke_8J8i{#A0W%mRmRf8d&-%Z zMon2gy@RN9!U!z{VSdtf=@8O1(Yq}!X^*j)VgLHHueGt3bY}FGme8w-0^e$x-QGyK zD?R*7d3yNSaPiWK#IJ#3!d5zdRp>q)uf7I7MuyKR*1J=)JI-6uyJVYAq?#m;1*yWe zBz2CGk;w0Y;ky@xFV}hj()tcUFhnh1Zb62PsE&(urfJs6)TzUGmD7a_UjTb11esvJ)VwBs;x?LQrZm9`Y`y_LBrLjdUZ@}eEF?Ux63=t|Aba_+E-IW0&guJybZ;H9=7$8>X zkjtB5?yd}2ACwz$c~i{Yl>y6xyiu1o#oS#PusX;ab9qzDUB~0F_Qd31d{fNbmB!ex zJaIP2n_}*+jMY$;<%yX=o@4H^{7dO#U6AgWyN*#|@ZwaE?wGqP1Lg#M6LW%e$J|{R zup>wpJA!n_++7(kAV?Pjf<@98-_g~c$?ZrMeeQ2PEA+ke&*Jn&|B~^$moy?4fj&5n zwX0|V5WnrM8N0^sw@Q++RytZecub%R<|9i}v$zS&K zgr6S+q-nA+EjLlRL8U8Ix-uyJGDQ#a_cDBAie0VNzoyo|HYVzva3H8p^hp)x2ToqYQnNp%Z?7lr|Sv z)YInT6S-)2og_v3jh3aC=I7u_6iD-xHFD#HCx7qAo2ttuEFQI@%IC(vEwNb{w4brN6mLI~1ua=wt_= zIw)^c2nNw<2rkHr1FQ*9TD_|CYSAg6hKq}pc`tezou3JN$-AKzAmt5M$ms@*7yTkz z>9Y|yPhQVSmVx>KA+-=Oh%0KSqOC^q2V?acH+ikWa`6R(>?QSH^s4&&1Sg$X{o*7K zxmt)=EpwXzOE>2y0#-k@^w@zX-t>s8<$_k^GOhWEDOSHb_XIlXK(6fYqKS_D1jnDv 
zo!IvQr+=9pf3)(IP9C4^%uh_R`XQ99IHNBEzkAW|cC+4q`geu-2@b`Rt=r&T^|qF? z(cu1bfP63dFADM#oR3fGovEpD^${_+9}JRCeK#x2PXw%fo2h!M5Q#s4K-{_(2rv3G z1^J19)vx(#b~O^Q8kY_M0DM7Vegam%%~-3|1`pZQcEbp{Ui4QA@)M5L@6J}uS%(Aq z$*2KcryxJ!b)7{`>q?w{#N-a@mt$np;I|ZtAY(!}^hOEba`zNS&E!UOek>qK=Z9#l zWU(WpUAnBa2_viB_GA;zD=+9p^eL(r6>V@d-cD_3~ZC$FGd*@d^0Pkw4E{CVsy$^Cv5Qshf&3-hiH` z+tVAgxIMkK7PqH2WN~|XBNn%(H)?TvdSe#n@V)=bH;Xud7)&B^N*1%B&EcOhu(HkJ zpDRelcP=@*CHE9d3{L&OD2eb#3RzJRr266O^b&CS#VF}Wp)|MjZ&)etH7b;8q)(=B zprx>1AL2~1f)8xIL8lhs8yYn&fgvZ zvg-Xj>dqt;z^XIP=a{S^rTQ!{`xGfLgKLT3Iuy6H^nd4hVXRiZ&9DkDYi z5Sdq11MKspF09h~!UoY9^DANNzW~#GVP!qdx7XA&m}V+zuNq^>JINuYPWN8yXp9J2 ziSZp?&q~IS&K>oOVdF3`%P_}Bbe1>r$Ts~(n4l3nQ4e8|ZTL0lp*~Se#zK7VfAE*X zoA0FtL;~TsA9@&=NQD3^#fXbmhz>YO^@OiiA+Z&sCsA^izdTcnSxJNnXQo3B(!#ka z*p#OQvGXG^J1H2uULA~&=zTfq5<|9%*`PswkW!?qG%l6c@iTxZu^#|9`+?NV-}^bojP`T$&?7Mot@(&5 zb1~yN=vKZ+s&WHVCB}jD!L|@D@hw_%`-Si-s5wy(kM)Y@0UKt6lqd2}9pz5+a=Nh9 z2RVxW-a+02V$P3H1$hrmeV<>E4=rizuP3AZ+;O;+lNB)?eq@r>~g9HS~x zcr?LK#rs;XGQ+#V$(i>Hq5{qqi?;UOl(*?l-6Z`aIJ{MOyu@Nae=HF{;1$bHa2KOS zM!?d6*izpzg=3=Rx|Jy19S%b^A#v~m5_W4A^|ff@6%l5g4)Bn!ildgk>3~Z1*afhq zSUt<Y$0n>;705wB-0X|Qr4@r;B5cX6fD>5dtnWt=nrvJ)e19yjbdQNMvkbRA5#g`54j8G_eEGeoKu^)2R;kA zXuZxK;~w}uDoi7w?gAx+*o^1cHIRsK#k;+Bc%u;CKdu_L| z{ciKxIUQ;bqjZDJKKcpZ!m6jqUAl->m2-XCOr$OZiaUw?6TLR@k=1k`HSMA7MsU;+ zx0Eej?UURT*}&zxH&R=5h^^f;jq(j;JSxk%BRrwqPHojEsksv=|0I8pQx|Fi7Jmdx zu!H(`dbLML2B$Y!g+4MnsLxCJYBo947@HP!xlZ+0mwW9DG15TwRjR(4-ZE_F;ZF>^ zshlE$xz*v+X`j1=&bpC#T>|hd;ku6)c{{eX_j~1RiM* z>4S8Y>6s}Px}UEU4Y>UYwRj>>ZW`Wl3Ko}BT{6UFb6#UT^~|59=fW~Q+spOr3h`{) z&J?IguBD}+zxmf*TyFQ&bbJ#!9a5hHzv_&5MXNP8Ci)W@E4M`SzPJ`Pt>qc1{JCzD z;dS7nwvEi}A!SwmOgF}+UxNACH-pyO?$u+9DA3BEBdiD|ngeme8rDZoned zI!7YaJxY0$ZnYgz;p*#%W}9vh^b<;WOpl!$LDkm>m(WZIHIRh}inxSEy#7ny zI6_E_bpstF4kEX>?Q(9P!Uc$xI=%kqk02rfMTXeP{+xBOZgRZd&4+bFIYIM{q#Q43 z-gEGoiHR!*Boew6kk?eaP=t#}>onHeI z5(*O@?Et;|edi-UL(7W20+29o-}wk|>Lj}nCM%4f&_6%6C~@$ZNP7VwsOeN(HBtp? 
z^yAX=5%kT8gRe`#qb5L>s#UuE=NFS-HjE~`s57uHFNh0@4S(NvX*k*UA49xJ19_A% zQ8YNK5Agal|hPZjj+X4oGP^;cVgOpp^!*>CE|Zd6p#}DGhWeWF<=CxQb%wzBT%8>Av5s zC(uDYK&aV7M026(>|Y#FhLi#zRS*q{i*MtUARVoD^!=K_dmHWj)%gODascqzk_k5EFyPNxZ7a)>=hNDn_327aT61)ENmtec%cfX7#% zn_K8PbkVLfO8TOqxHmrH504_d>2ad7&^D6p{1sqY;e!PFhTXg;iq}rGgq}+EVkyxR z>n4XQ?!B6h0dnUSV?~nq3TP$^y0C@iH%cu3ByyVFOHy_RA!}5hM06BAR#c=- zCpl7Y7_#?%`&uOh3TXr8o~L))8RuhKg}V64hv*KCYfj!$W^Z7@rTK%=0{{wHKM<9|YCGY%&!P{{uy zna%j0klBp?37O6KpOD#%{|TARaSHkHr5KitPYf!?_a|gF<8L9e8Gj3z&G=i$Y{uV0 zW;6a)GMgF6`0lR1`BU2X$@ufv#{Jx*9I?mAbQEQH`bQ1dzNIdH5)nnGcCz-WSNxDz zH0-5u-n2Cru~@W1wdTdE$?6@$7F|;6EzR++NONe32+EgS?m74bTMWOGq}Tk=I7sqz z(9S+T^aK^DIlox-D=BWaLd2hwDOO*{F0XO?9U!!=pzvpM0F45C&O!`#is6qy-J{{N z+E9KbG}Hq1-V)m37n8kZ7fF~Yfp?>%J@t_06_Dn>S3HfGkai)mBvd?fv^I(rT-!&T zSANJlqlfx~4f6nNKG-ntt7X3xzj~a6#)l#=NoY`~3mIX4Cq;b-^RC4_%e(#;WZ1_c zM&4Z#uRxN3IwDb{(xH_^os2=8yaH7ssfZJeBos@GL4BG01IpJ4XDAutS`?{7P9mYq zIFvEu%OT_;NLBdDewOgFl%}MppW{o`sbrOvg!nl^qp#CAgg=F8PNkhlCmr7mLz;nz z+^5L~)zU1f3>8={(C+P`{O{%O9xr*p{P%JS5aoZ^R7&@2d{4!C=nGh0~&F-=>>}AhG^1@&CvxJ`=rJ==*^*}l57fP7; zypR#K|NR(ys~u-VyUN*T1mYJOuavZ zOJW;(#bsIxxmQH((yf?GC)#5_hwstwU)0t1EENPg+kV)a149>{|bFB_l(sXCnE zkS_Dfg&UBv>dX?4UtDVR^KLQ;i*_S<735K52LbYxo1`A4b`$G=&TMDJHAPLKAQ&>>Ltq$s^BjnPs0%UH zD4EpwFAAl~APEZ$AqTatBw^_tb6`JH2vH+iiz8>0K*Hs3kebO2>Ev2Zl1?;{MB;Vp zhP%RT6k^NV7UF7Iowp0-H&XGv*P`TpcW`4}>GnIIfut4CSf4<{-_#DRF#*el($14@ z1zqKG$;FgmRDL7x$i3qXD%R3DulF^iEV^jzc0FIEabWF9U{^oX?NK+|o+!K!+!PaW z7%+2pr9y{^p(gZDl9Mwc__71%OU!cHnGA5wd@is>pZV75Jw#?YcYC7PG9ij@r&L@A zNYhUBX%`)%3dhuXNat`ZC-rWQ=uPR+OC?Lsz4<(~?KEk1gSU03l$*uoQG3r~bCJQ23=bi$T* zo7z58ID@}x)bgX%Ao)#^KhK*Vf6as~LB&+sK3i3{*V}V;jm7ObyWZmVoZW13d(LjP zxIJfgSlphoJ1ySn=l6Bfj4UUPq~b4irCV~hrHU8Ja^iZvfn+M(3RCfWmwYgtJCw-Z zR=znbvnC`f>s`)cvYXfj($q0^|f~%B!n4FyyW%#EB!vsXu>a6oAm-bS=ds2w%jD)G3sGGGzkqAA z$xL)*X;gJw>b0fPg@?g$TRACasB+Z7q|phtg1Nn73{Jt5Gam=Bfdhx_8ch*lxRU7v z>4!7LO1t?d3wvOdXiKGspDyRtpPG3j145M@MWXoS38IbZ(72c;hH_y9jHo)oErjA| 
zROpt1$ODG(rky&)2FVDfM=Uv2USbDia=XM&EtZ|2Nm}MEXeM7-#Ym`(YmMXF2=S{W zDjsDXMl_`zngbfh64Kbyl0Cgz@?ju)Z7ms02kOW~#AL{X9+pLPz0$>#(T9lm1*Qe- z21!DBtDy#Wy;FS?yfr9Y zwMQxuuSDvom7QWYzI&*Z72b9jJi-mI{eEh_6VV1pHW0CJ8)~ z%BY$OxxVUl+V&EEPx7bEYTVCx+2@!V#{B>d>bR(qaHY$HD-xohRHr9m6)us~n$g8D zvJjC9s&_=&?(*91rhhN+cemcM_qa)_l?-PX%xwZ0ldWV<5>lTR5-MM4_r-;FcSPDk znTN?+d?I;mk5SiAb%*#);?jfM%GeJx_Jb0VM7tzgnQyX{6T$P5WJ!^a9wH`D{U%#^ zkSS1;()ODiyNw$JyNiJ%OJklnC7f)<=Wbb&tu!X$ca(%H<03s9e#Obq-tZfVIU3eW ztWx)D;*rV?plhTjJYFNU4RnumrQvsoKPKI=duo1TDf9F2m5U^atUJR)Psvs!X^t zqT~s63pft8Z_v4R*`Q&e!{aco(WdKVR=}h-#C$CGYb9|FZBNcYyy>*_w38RGr8|U& zgd(|Mw)!C_Fwx%UMSGnG);Esz?Dan<>59XYArRXjE^VMJf=^%N1c<^w~Sd5~jfC{v5|j6{roWGJ%)Lzyp{p-d9ZBxfn3mcfM! zU^fM3G8j8xPa{!_x)YemI0bwW`m0Ek6D4O^;+!PymUu>NQ|6yntX>ct2AIsW z2FRY%wqAvWph-S2(f8dU3I@h8$`QV$pJ94s&X78_p$o4eev!=#{QW7B3&$lQnJJOW zLJ)7@9%+zL&S8?6?Av1AeDnGYPCmerVY?yi=CD_=lb~(eL)7~Ktx(?V`J6j5e zQ2so(r69LU_Xw^Ac(K#Rte=jAa6%fsLStYglY*4SB1h*@8uwF^LKm}@S!U;>&kQzG zm)f0_!2G?(5m?xpf&Rhnu=1zM;{@&|p#7&rLo^bRxyCL{(sD9HMT`=P)CeRo-8LpC zoP*1Wy8Zi4rXo0+qUsuh1*DsA1V|Hru+KDSc20(N`&(bqIjT9(m%fo4f*pEAgtGwe zNhN#L^wt;r90DEP7MaZf)qye1DuHral*cQn*6&Ld?oJi%l69+=G?S12vnMAdJdEcV z>6f&GhLW0F^-88RTc20Wtuk!ct(v_x6F$A;b-7;YQyCvW6M7PRa7}0hZ~9)&bCOo% zIB;kk9uWb7wA0B^l3bwa>SMMBG#YlJ=0Rkmrd>Kk zoi{^PRA5Lmqy$V@bGzu`9!B$feVr|U%lP1uV)FZRc7@aE4Ba6W;w>rx%z9Nx&xZ(Q z?0)r5RyN;-h9OF>;Z4Fa4PLY|XXwmj#xQdc$d^R^(3C9=Og0Ky-hj)SlFJ|@D&-Bj zyeYX16nZIdt;?H|%fKoqPWGQdNc1{Rmwvw%_n*l&(vN3h9=jN=#!a zu%~AKY1_zU&6~Me+6>(;^%eXNVdD&~cgq2!7v^QDun7}nh}m1tT2J9JoGne?pBR2R za^*B?!8up*d)=aF9mK|5EHcIb5QJ=|}!(tPPXxU||i>CKGET+o)R( z7ut+%nU#u!d0mLcGx2e+_e^*ma;$(UH5xK?qlBm<`8vjauCr za`ibQ4(dt9nwjLe&1S~2)+%x`2B4WF*P(?Rj9&CIXUNg#s=|$ftIEk4s0tz)s4F|n zmg92{3IT(I!jq!wxaerQ<%~JbwFYs_K%RPMXc$>mteaW$qBkfoV-C5?Q->l3xWa@D z43Vibz#k|ubB-5@*nHI(^W}Jd+j>I*%D9~%0^V>$arVgVfGwS z9<#Y?rb;5tpo2P@P0NZdRVb5=t4s0*n>|D7YFcZLL4Q}Fj5;nRd&L`N4piqHV*vM) zQG@fk-Ul4Q2G{HclR5zBlth8MbqqK#)hLsLi=+|Fk+}sj9~Vv!Ll~?=rd>E(G9iLc 
z-4Ku@^pu2(QG|-jX85KA9KlS+r1-ifE;Hjy#a5UcpW9rt>SjHwzo%Q8>^eYQt9!cn z$W*w}i5=);oo;T?*!5hJ^l2n_P)IMk3DA`e965=i7VRpPkRfu+b3v-`I#P!>ns#sn zpNnb7>u+Y-@vf_xUNo?a*K#Uq&l}a0yTfQjWBiHO>=Zp!&&b8xNfz#eNKEASgwllv zArEEf+$p*cs%n&Yjr5K=uS|;XR#{8vIh`>gr!r`-VNF>zO{Z5QRf;&%;m}DOM9~RB zwYW3^d*sja))IfP>+g=Q>VUNpL;pMkrh~iY!0tw z&iKNcC5IS7{&2nC3A+{Ql8)ua)85igvzRuAV@fo|3iQ2O3P)`fq$H_0jr`#2gg+om zeGvi)B+q;_IoH|)FK#%LDTd)xKMBPu@m(64w`(5H@~#*w9pS@=0$*At9l!)95`is& zpH%a&B!-gwYegDPA>~=VoV_*S^)OP2z#@Byq$cyd1&@o`s!s8;8~Ln6t1GCe4#@0d z>@S1K#JA#2gKnro)oebNVkZMwSphRdz{mxfBnp`)fyBaSgd}`S5dmnvEK|_{CZ7F0 zEfeE?u@gu{Ye<#*sjjHFSV`gAfteDnoM>tm7$l1UCm3QDSmRv1HszDW z1XwD1uE?^AC0|JA#`D9ta&p!pTv5yMnF4J9?L@y}RI$1sqcAh(k*)iUYOCc6S|LwP zccSJ~q;v_W#)$~EMFat|EF%;uuOQ^|D!e%(gIc1t3R7|R z!>O5g)8z&&&rFcQgDTHuzn*J!OYV@&FkrsGND>(hA-+*$^^FGPv^e*BS3FAI;dB9t z8}U8u0QR82b&qW*vPN}xFV=};f1hsT!i<=hpRNn3x{)e=9^FM;N0+QGK$GnOSVsz3xmka=&;R>$vNh9S7 zf=IwCihfjw?c8Fr1*cg+C7S$6e@<9r$e+_SKqn5|!F^*taNaE^dbfB1`K@)GDs%pv zrr(yBe&2dv%L3|NZjBsu&r_N_dbY`{89ML8#`KuRBRh*n`?Jcp4f%}`{^M8T&*^aB z&uQdYF{GQhhL>Vf9T7pG-&WvubRY{)AYL&(@|J%4FzMab7KbC z##CciNT)3<6{fc5sn+~hnFU!kEaj#4-6Y-(Z~hs=Cu#G^DPK;o(<62PL0XxdCT0`9 zoMuem|OnoEeI?0Z5`WI}=d1EH-1vo7o33!2HZ1;9aYou}N% zJ%jkulrcmny+L9Bf?rNd49ptZ^MTzH69l(BV!^B-f&+Vgxz>WGp9Cwz;*PtXc>SmS z+r-446{|^w;)y%$Xs>_ic2XaH%qmC|1S%Z8yCx>mxqDK@%MmueC`}7SG{llJziY@S z#njTuwH1qu(m&4OPmS#oA%{TC3gmF3_Cf`u6L%xn7u3XoyeO2x+miXG@s>*I3aQyA z(8O#s$(fAU0h$3#HCsran8HsEcz)Ia*ohq3L!orBI-bana%A^XVLCq^>T2a#-t|7j zWbBAfm<|D_#7l^-dH)OkbR;4plU_%p4uQY}R^Ub{ZRE2;(@-A>g&ugO{UJiBJs2Yr+_0d?}2SMR#n zew+${4(yT&!H?*3CU+m$%Z^Rd0PMI$^e6#P44uQMJiSF^)13|XC5v-IyhcrJ42;zWaqvcuWUZ7e z>`(RGIs^;{h)%RX<}&jL{aTZDRD(n=;|8|ciC!F+2ZSvjZRx&F z{>(3#oRO9Mk}da7$<}Ueo1}4E%Y?W?3tXZ=lJE5GD%4K$j82^_o03NB|=FjI>19 zXihW($e1c*^L4^ZZggLFh4odkUz>KQ%5Hj5B0kwpSijKy$LKR2@8Hl-h5JQ{rvFku z26<11j!2P2A4M&%6$ndm6`?tMzlDn=2bgSYe@s-iPMQ^cgPjN;Xp+9V2%H`~XH7|2 zUJ(lqS0k^yecOk|wC1qJrlCltSc}~g#6Q@+h zSa#?H+s{#y^nvMLvx9e1mK7x_3lW#@q%0){9Pk}3-AP$il%y;~NN1g3tv}v%IX_R5 
zEH+EJVth2dwnQh!LQ^~idm^6Bjfrp3W7_meRg~W9pIo26r2&lA1TY+I3%uebG^-_P zNR8{jI63LU9^+(byJ@#*yJ>^4&F6{Eo4DxOU#TDT* z>FyLEo&7XJ`;fJL2S?5VnWXu{&mr2~e+gQdy1i%@r9-c$m1AZxSg1ErRO`K#P*=SH z3z2!2wCT0@mkWq3m$l3uwpVX^NSkR7lCE=VE<= z++7?@MFs*t@>(FG?`?T{n*bRPvdRdNLV$GW z0S>%xf(}QE5CN8ssgIFpGKv8rUhPVgU;*)u+-jm%IwKkR&Ig9p42Ch$A7CzK}0M3Msk%=t0p+j=yL}kS9F-2%oPSw(j8#*cB*;NOFOCS0%s=k34?*~pr=iUPFl;* zYUwbfj1EJZ{eQG52>(-|*~{psM@(wvDqZQ0M=#np`6Da4hDwXqAf*~AxxdOkHz}p!H!P4+p@)o=YQm&c z6DFmaFe%l9lTw|P2y9BDHGGp;xgIcKh9sr>GrYCLH=2|xsESJGm#gaeW%evrqgZTO zdcDQ%S>9}MdzQCa+@9qf7Pn`4r^P$d`Q6<#gOsZ9aw>kT>q6vD#n0o9?B`4&5~Z~Y zlJVEC{ZNKQHjMU6STbQrptK&yI&|$jGr2o?m&-7v3qem>$oMRl&D{BwCc)GUek?Hq zJ53V9Q^v#jDA*Rp+;r0w@j3?!8t;Wv{w+%qVFEX zF{)%!CXxDUV{qw{y5GNh6KNDRMCF2Xo^wPP75@0J_*8XH9z~5G7G;l_%$td$7}0c2 z`OI%cUpR48BQZHM1%j$4apobp zDj88p_yF2C`@ORtm6mBMzYoGyyqhE+_xD)*nS~2K$OI40Q8XEJHg_nWn}V=crhu2m zb;h{hinC1~%LYl#KPSbbEA|Jxgh${gOI3APe$IqPx%+Gq_QdfS&Kkv8DOT5;K|eNd z=6ixl#2$aKdawa|I|NP@=^i9DN<`3fgQhZrsN zu^X5vuEFzhY-HYbf_6JJ1X()F3?DDg6rQC^T-O2(B+5pZH!fGc?rfUJ5#BW=6$%lI zMYy10KjXM?QkK;malqcjrOx>l!`$U#^k@ijwc!&Ow#<_4jLc-XGf|Na3hC1|5~f2q z<;U^`E)doU+6&yyX0G zjS|I&VcTXl9e&70nal@CG802^wu5HYLeo_$S|{V1O4ZmPS7ca(VBx|eEMu8!+?(OSx{a>1VIR_l z3$nJSiEjtlJq+}tTs*d6>*8V5A{Pwl`0ZZLJmx`t!dAvQSFJZ0y|iN+!!W3~Ql7}_ zxT0B4_foTZ%a290JYt>4vGhuG9#?-zV|FWQK|! 
zp|{jCLR6+lPkPyBocRs*J#B`H&0HokO}KkusD{d&7qWTfq?qwhGNinDCkff(0b9=8 z%>D?9sqJ>+&og?gam@Y*;}-pzl&odYLZCm3%}}P@xn7W=jEx#cWK@|sNza8csQDiZ zJX7*UcDvWR+Ib@bb&q>Jud=+}(j6H-!=OgzqarCaO@y0>MgpHDp zOy$o-*vcUph0SEfM4D}t-M~ev%wfYq4mjZVQoQXh6|X`rldTlR+rCHd+x2*e*JxGA zW@aH+T1GR*dU-fGr!MC8r|gi~P`ZI?1By5muEcW2w_m9d-#`(pW}Jk7j-y_^{>Rag zkg7t2L#{)v$vSXNHb7agcMWDUQmmAf=W4>woG~${56f7}%p5AHl&y?)1HkKkVa9RAk$Zx8f94l) zauP^24c{f~x6~8-MxXm>Igs4xK0g=0k1Kmi-{%NFi8bZ1HdAaF<3~w84k2FBL~<10 zH)+8rzRQf_`%BjKyYTB$BD%A%kfZPBGk>lOb{6{umQ++eVXHzct_F}nb~~4(s&W?A z;uFe93xg%H<1Cvd>K&3F+Dd`e3G`k-E6cIjntwtW84YD99O;T>kZE}<`6b;UrzylE zTuwz~H>vWBRbHja@6wu7Q=kmKR=fC7Su~=f#DJWCf>O;pe)btsq2#FyTRKM3CHIk8 z+(bs_MdXTh@d-*z%EB%?4(#GGjV)scU}1#qodtiv@scmv`6u9ucBt9#%orUfV}2Rz zEClx#l|-EnyC23oPSnYyo2Zjw*Xn+mZ8BX*;-px@8^oP=E&0M5#hsKlaVOp62eR0r z?l&xnlVY@`Ii`s9X(9N7_oSqxmhi+e_(DVyDyPnqQBo@a^_Tm#g5;iEJ)c3La52W^bFFqWOWvpoHdg_ z>%gS=5@Wo;+!+!ZUc3aM0kJC@y;aaJX8OJk)Tb+E?L1B>npHzIynwK!?^aQ)>S9Nb zJixnQb_qq3ec{-uw|hPJouV?kC5m1IXZfeV7>~s!4s|p8!Bhx+Sv@_encM<^EIP`l zRW@nT(B*Y`(1>ueR0WVUEaYl!s%56==Tzx?K)A=Q&@8iJ7) z94WFu2C3KV(e}x}UqecAaTru5auR)SeUYg>RucWm0Bw{!I?GQZLeZMq91!w)c%SHh zO}2jeDMG@rRN=0iOf(#yQ0sA(h4ZH@&U{O70#TOYy|(ow=+LIHwaN9hM)A;{jA=JbU|2@RGw0Yj zgSsZa!*q>MK?hZF$Vr#Y>2#ospkH%pR1>W5wUh-}y&=l zDsUsbYBR5*i15h}p@f)D2lQ+e4k%R(YCXBS^s504TBS@KG@ygE@3dd;QGOA|iYBMX z7Gv&b4ekAE$sYxZki&aT;khYRc>Kyj!-W{44Z$Nb(=@Zzk^$Fw((gQiX|Vna1fdF3u9DuA$pnKEiXDVH{Q?3 z#!N4-73$2s4eKxOL$7!@{coreueV*a$pLQ7hxdGj=zv%PAYrreLLh7Um9p_l<0^L! 
zv*q=d1EwE#;Me95(KBtrtQ{F_yKIj5h22haJ2Rn@fDAmO$pm1C5F4*S0*qL!XB4D{ST=$I%pDBKPckmhm93jF3S*13FVAwEPKU%WgAU>|Sa~g}F)ObZT??!T zKxv9o0T^1jgu8C^GH2(t#Q~LcJZN&r0ez+!pbMJK%j& z(+12h*BE%(!F;uYdX{vUjo04-0rQdz%Z;!I)VA3v+(-3!Ei>Cr5P|J=SBam zTc03$g90<~@(d1Un8Bf-ASA?3-s>B9(JA|`=k(6})Br~Z`%4nC-wl#Mvr844eb;X- zfoZCDpoq$-lll5*Z7D#1pwKM5zU@F!WOATRLFB^)65f{Wmk_$u!S{tlfkw{q|Hclfe71erUA>YZA)4*&M71}s1?rNBe4yp zpaSnA$_0dH6U>C$mYn4*yZU>yoZ0s!7j<;e-EXR7+O?cs%Ct)%-MmqK@4Xw-u7F4s z_0%j(BCe~bSzVtsv~=HZwIx#p?$lgZ*BnU2xvtIKmkMnnh86v*yU9jamP(!=^3dz~ zyEaPW_^Q~;M2ULZLcT8>FHRwe#D=Ea%B?iY)R}p0<<~0{`R6h7ddSSYKn%wE#mAfk zY-0C<+I7JPGiw|zp_hc;$BF8<8f4}b)~!Cv%z~}|(9G-V3gP&FM1c09`DOk8W#;wU z2(7Uqxid$@XX{Zbt&~5{TTA>gXXaI^_{DC%V)E1hJx{Rbse=}`=c#KgZqHMPEN;(J zM=Wm7Q%5as&r`=Nj+xhEGV_`rt4S4}lnU;onb+1tF)e>{+GUcB4Je#6^BOMyLpj_1 zGV}T)VVZ8Wi%cQBorSbH98)dgCDodl7b=e|1mmpy1^Hld8R6ZFaHFMoshN2#=L1Tl z=6-f5Q>?_yOFwORJ@x3P4ZAMk>CC*&I?l{X{FJlU0QKC{<8UElF@+qeQdVAdr-%W; zmcV=qRbeO#BhAM>id6n-ZyQqm3#F z;i{&Sjvo~~{E(zbq-&Nqux97A&y$@OV43M~SR#vM-DMVL#9)V*nb*Q7c*fajnVDDD zbQq|~5W==ie11YCZwzO#%)Fj}Lt{ci$6uK@-9%|(=+JmZW?qkYy_a(OifSpDd8zZr zJ~8tu5`&LbRWi7Gx8!K;$hcS_?-maEfD&T;NtOez(GSSJs$$8P@H%LgUX{Yw5K~pg zPhnW^0D)!aIoX=xsD_hJCr3ijnO62AZxF6I12g7W_A4e`qvcs@SwHiWZL(iBJlC#( z&ep36Td#XD_cB{A(<%R@F<1gEUKZCIZM`IIOT_niz56F^y`1PrIPMQjzU)@Z^7@w^ zQ5nwqBI)$r-VF*9%EImoMygP`?21MhyhUa>9R427hA>?{@4}=rWY1Nv-xD4shY_9q zFn2|+om%?-BxQ-LO-5bMqzjuxAk|V@Ltelw^NyC>D_qaGkFIlqHhkf9Fq>CAn%=rs zL}s%(!}(I9LR)pw-cPZ;mlWGS$oosFnOi}Y{Bp2k`0Dz$maecggc(i5=7(NP&3s)~ zN+DC`R$SMbBx=37nhWWRb1-P3Gx>$yYo&kKzc~afG1?gUncQ7T>#GGs}G2A7^ zx~jRw`KLm`)xDEncIDT59OW(XnNENV%r?2T-~v0Og2krueD1S|#5%z!Usa=x26msW z8$)Ve{^Z0_5!z#%H7s0AzHS#3Gp*D>IQCZdUS9(gjYY&=F9UqMz1I+n0L!K+d#^3) zfi-(Ck?uyrv-%;s5u$uSOQ!7{KDQG9>}F%MrIeUGo5XhcnMTpD_DMmEV*9kqS$jqI zUMs!DDLXGN56-i6^|CkEdC^kL483S(S7Q2>;0jS?R|x%dAoW(+uYPV*Q|*?LMRN`j z#4e@q<6*Tg2x{+f)?S~a;S0Q5s&uQ+&4qk^{Bz3MYouiD1%7oS)|3F%jxTmqhO26x zhmp{1;t${Kyz(nyNNC1+r`=jzY})SA1?bMe+$)j6e{b(}P@S$Le`xMC99%@?Z$e*u 
zkU7`YtV4E53zD+-%GfQ0hwgtAg9t>nUJGUG<+Sj<-Z7*rrcOky z-4qBoYDNt0`E&FNsS#Wy+%cXRKHOnV&FzkW7-9tPENbg$JDx%cm+z{95s7 zlYk>)>UAC5L{=ik3+l6pQ$kLVFHYi-K8NB~bc({J5@QgVOyOFN)qbeA< z;OUDS5wDWZ&j@(88i_M(c>-c^(qR}F^I*v|#XrtznA90GGrk;$$$=HRUrbhwdRdH7 z^GZWWExBjVQ}_$8&h5rMRv&_2aM`UmJZf=v$J7}NQ#1!c0_J{8r$%SG@4Gt;Fk3OX zs;-4$4EJ6{D-+TT{4Bg$ZEy&(?^caOl~3BN%!sH)rwXA{D69K-2k1*v0@i)DrSS4a z#fD#~49IkgkX}8JQFUjz5*f`Rn&B5qek`CCoc543Zds0)2QR|o`471dlR`I&6>$8= zQ!?OpdLPD~HNo7|lkU8vmm)MOOwT7BBS_%xRL6x?jdwMY(p&e*!509U32oVkrS6dh z^debE=O*ATVyf(p8ClNMh}A1ppX;|hP%^G&w8NV4<*Gryiy{tg!hX9&D{`D<2EJ94 zun+9xEiwx}OcK99G@ny~PKcS=657V*O%*>4vv!$r)wuQCFAlDjsP28@%`Si^th00n zwTd^)==HjrSLzuhki)VCgzURc9ab8yj(wYSkV40lR>~0i+w^#xzC5CjJqW1u{rPY| zRHL1|^IL`zYi_a0BNPaO9U(OJjE|hk8h31c$m{p=W>-40MbMy6Fw=2BCe`6h`gsGt zCk1Uoo){o5!u^!IDRUN(Nhxp8ull zyeV^*)g`B4L|opKIg8`gTDdWoH)YP^__da2yo~(ODW_qpOXe&@pz#>rlsU`lk~s?z zm*>n`9Ou^R6Mup-&YZ>ZZY{mpr8{$$68nM4H1vHoRI|2O!AK_V}%_w z6aJ*y@Qdg1i!PhO#KWHw>2$YRQ*Tq#$4i7~RV-E5EZVcm%>1`PXNhYJeOUEf#V<1s zvg0FP1`GLlN8&P$y7PHd@i(Dqc*acoU!T5BlK`0+)-1q8zTLY?*H5qa-*~2GO+d@t zb7iW~l%QARr5R8~#@)VK=A^B1NtT$7;j>K!^)VTBJt7hJ1l=QAcLVeB#ZvEJX3EJ~ z)1`bBjTe%1^3%MePeWCuUdeTl`gI4iq1XF6>7@**TYEMt0Xq8l`Aig^*po3Lb6#u) zOC*w+23s3P0Tt=cR#qs%HdH?wz5WdljHT;>7=Md|>vfJK;d0wL#l@M{vTjj&_i-*@ zi;#3%&y%jr5>zD*OPsQ;hSSe76vvOjFP&{FiyRudXX!W)*L#pq+-5D=ON!y2NdI5l zMdd9bkKHv&fHTaWa4(0W{a%yFiRI0}OBRf^D@(Q^S-2Fk4LJnv*G~ih1PSdTj_=K_ zy|V@&7~hiAJ!c(od~dOk?FjIqemxBB`urh|f;)hqnm}Zykp5B2f@VHlbN&z~!jG&H zPBI!2bJorS+JWVbS;!&Ig>F{~$ExcK!*aDWchHx)9r;5X5Mp`L5<*q0htR>acEW=1 z%pc;MXtb6}R*y`89TQ%3c{iVhntq_r#C)f?%AoV8#HK-~2K^o!aG<|bh}_TpN<9#c z?_FYh1G#<>h!WTPQ1C1DnxuMHC8vuH125XW)>Y}uLJn1WJ*r!C)(oURft@m5bk&g6 z_)7(HRJl;Cwz}GSN>(B3N62L?Hz>>>!YYK$vCteC7}Sm9#!(06>_QHCU1w$6-{(XHaaVGp}Mss^vf= z8vAw%4&%jQEjgo*bsI{c8#Z*)0RiFvnr9OZr#CNgzo63Z_}#ch>oFiE&i%f0_JVuJD}_4k!&d?5o&J>d~789Mj8G^?#}e#Ujv&7{A=VnPAjB2h6{BZk1E7D zPA^1|&n3;05{DZoh}Fb&m`S*MG`;>i&RQDoDg49as&@DCsk)C|uH#HQ9zASnJ088p z(sn$$Y-u|lJz{A)9zAMlJ03k|X=K{>Sd@DFr*HnkY0rOh()x`Mg6(kE08BU@C338( 
zSkQD5?z#iJk=yl2{U|EU4S!rRC)4>V<7uW&?+Bxp;Zu&deQTpb2$Xs$Viz%=p;vxPv_P# zt+C;WR-OhpRP`Qq>FwYUm8X$abvjyh78U+6vZ^dRJ=qCet9uXkK1a5lHL7GJ&dqO- z)p&Ryd+JTRheP-@FSmh@7wR;p=nv zFnui*&DX_|3*{pmT@3n2oEEgj9=F6Z2vt&|R7%Ehzy za;QmD^yyO|d^G3T>IZS1{yRmY`jMXixd)W?$L^^$c)2Ww!~_3c*noy9In9WRitPrT)$)GA#SFq435km zMh-20e6iNg*%{oA&DkvN3Agm!VDgeo=N9;et?&~$=LWnJMv=P7yj$vIah#sXpP5Oy zZ*!u7PNyzFkr%`V3{$+NcrSXqGvV5Q3-d424^`4E*|m4;A1O(An!8)%(af!*68_X{ z2N0K%E``rb!<4VqEpvV^X1{`0GV63YI5)wEuW!Jg*~3g>z3r%O?S%bs`e0&-WU^7q z+l!;w$z}0LwKQsrMA+4K;bzB^z}#qSEGsYYtt_Wza5(RDVuoQaHvL1k;PWnJftqS8 zUX|0O$c1{U`c#`kM{s?MT`y-#VT23L4rD>8a=KgttkFYZZ5C!>#LQ6{QBuH#ywY#D z;=9Q0VF%LoC1NuUL0gLlGaNBqMF}U--pd0_qCLxrcVA>!{F6B_!+hO~|m zqrag`oK{;X^i4c-y%XYBHi`8ysrE6xRJ%SkP5L#{r1KcsjB`avwTX&x8H|dsbd7yJ z8=QulF^IJ-76u;^$#s@fLXU%cq;OxB^M>AJCDCWrVv}aE5u$^8v8DRJ1FuXnP3t9R zvS}KdT#hVM`adnf_8)eA7Xi1?K-X&F>3m3$%OoeG$`0f5DV*$k24VVtj%v^X_5=ff6H3;jYxwcO?-k|}7+i3RB9Dc{(t|YauMFEHziHGKQVN4aX zT~_Rszsu@cOX+fOWJLPX(03qmODT-wF@eZ^s#GBIB?ck`t6~BjJ<~Tw!ZIdy8WFVE zA%IzdZRhfBWu6nB&Zony#ZDsHf}H|A`-O{%Lq>>=cRIT;o3L)FQYi&vHIJSOv|0v5 zCfeW+XoAMfbPjbG92rt=woGA610v|^d|bJv_plXLXay!klUv7YB;grgc5nf%e$t4AeWo`w&tTC z$iONebqI1rvPcj?@QY#fT_-xp&SeveO^E1KOeTyZp$F=IO){pW?+ff#B=H4?079GM zr;fOrR=~fT)>@1dTO{ug?&B6YiI}0=&eUF7Il!ZFSk@TJgtk%cz1)?6^^d(*Pj>;& z%wO5uWFO(lrZG;W*}%T(;+8>1vt_a=WJCAZR(U;>VAqI%85HWPc~5A=>tE2saF&fi zysN#Ty=3kB;VTjdh{ntP2KlH|8~O`Z`uLBaoaxEIDLJ9w(6i#>U z=Ni7q0R5pCBp{!yY%5!3*w7gF3tG|ghflOj@kt35cX9?PaY%tfr1RX%U7YVz4cge1 zr$tIc8_N>=D&UR2nvfk|jXt9QGlL(KMA2a7w4Tg;t^E3Mp&qP#Q=u4Huk=*D!sV&g z)}v*A9bQb*gONl_5!8dP6lmHDZm8v(_fnX*yN7n|;f?MW=rl}Twp3*b9wL4gQF%+~ zNzqMHNUGUeC;~|d>30zbjy3{(z53Kp@o5k=39PJ_Xt?j!;MupDL;Fy!w*^@6l67uC z?5+K|d2(`@4s zijdsU5;~}UG1(k~y0pkhNu3!fXQK#=_P+M3MMwc&(2$TuGLlP;fKl#F6<<;tMtNcq zziq@D)ae+fyw*;B)Sg=I(clo#sJ5x=hPhdAU%i0X4%5V6&s%JzOR#5ghTBqmVCels zO&Z}o496$H9;Pl?YR49S&7bzh7!%_Tdp+q=ai$Q2>N80DvNwmpLQCuyiAs`)s-CZk z?`yhJV!69Au8I$o=if8zuCRI5Qln~F<6l|ht{~WRs`yMt|4P|i8CS)}!o|Nb;;xLV 
z;yW>(Y8`b~kX~qvDrd~UXUtt0SH%a9B}XMpWY@PJWXGxEqxbG#k%W#nHC22Df}b0A zxu%NmWN)g)L~nijO%>nimMYieY<>Gp6(50`Uqh>_!Bp}4eN}vZU9PF(_xq~&{F2Mu zn!;-&8#V~X*#dEA&dA8ppQWi;f60aih2tn3IN9T#$t*oyWW!%l!uO3`;Oszh-p4(f z%UY*IRB2jGXeq65{=2|U4Zx+9Wy*)84<`6rHF{U zG}V%~gXL*~9;2laEpTy*agrLsIag$h;~c48b7?^-{2{8-)7El5%@aEK#Y*oK1%YX` zdxs1)(%Le@(|H*+0_J75SK3PfcEH)%=T?%dIkr$Y@nBWNKF9Lw%hqXUEzi`>sj@a{ zmQW~NqQQER(h7(0lpVJtdt&bwGhlT<5{jElLtX(`dd~&BY2GZVU$nlE6g+i>uPn|0 zZV2f;ut*&xV#`Ly4mg#X%Vc@H6_SOA@lnJO9M2h#7PY{cSL^|Y9$kaH*auZ~r4TjVBE+D!O z4CS@0gGzIkl#F-qD-pat&!wgC4<&}zr_{AYT%$od`C-leA>ycNFLw4gv?8hea&ANJ z?k}=lY_X7)`3_k*)?FJ4kd+mI+pRPNeC+bLAuFn!3b2!KrB?+e1vzN=*p&`k@x#-z zKe$3TfToF%p*Dn#i(6h1(daKqIT@@GQk$sD>UHeWh;7YVIjpJ-l$Q%rd@zi2Cqy)S1t;`6}8Xy1ll)B z4aAhuPoINUfK;rWRROfJa?HwnUq2ee^e^besJglT8uAQU;d_#yb#4p42Cd*a%ONXR z5(NJ2Vl~$@=VEmLE5BWm^??=42gY%geut}2rqMXIFu2Nwzi3EsjTk!&2_R(xWTjUP z9yryH4kKz@bgowT4pVWH2N#rTnu{BKTDQ3Y7xkAskF}04j1| zSn8!~Go3h;0VZiKKGR%ELC-P4hIyisCOl;-EZThP)e%mr0Ii8YO1TQy;Hje=B`xY~G*8gw`}-M3kbz<_u{xTrN#i%DdlRSX}rI>?xU*rO|qPW8qKq(+8;2f3n zgm|1y^m)JzJE=MgsG$aV$>Zcyot5d^Bo7F6D%VK4*U7D$2ObhW3&2BYMQ$CSNZ1Ot zG~Dm?-$p~YUUfEy%6VKjjAkCC>uSkL$Vt6{F=x?|)Y(y_893uYm|{K@j*AK#QIz^9 zUP2M%FlR|{nm=1GN*4~fa?g-Sf|3O}nU8j#Nah|gNpRY89L=I?pc4nq*m?HET0u0m zIAYk;L>mmCIS~})es4gE4$Nc3KAZSzUjHiIA=9Zis&|rtzZ@6)%w}E4RT(%tTVkKM z&J%{B#FM4@hR!$-BqG#Gw5z1VxPUY_Zr|+{Hb5=cTF#lY%w;ey|il zwDAb{A%b-(RL!NH#33@q;J#r=(TLpI0>H#oJwYpGt6-wC-CcQxxMJ0moP&5*G6|## zrmiKumqk8%drvdAD5WQu-oh7h+0L?FMExnxUl;~BfI`Gpnzv)E1}Rx#`~UH7e}sBa&TNH}J~sX(16f_g60 z_0vkc$bFa!c)2sdF`i}Ww1Z>3sw;m2Q`j3A2K*cHOGSBc&ocOg=b_kdb^){^(ki;1 zDeM3g>UkP)!(liUekNacy&eUC!toUy(BF{ISJ@6hLF?azpxG}7C%{l-7I3jtyPEtL z)tWRB1?F831!e&}JZx`DBI07si0<%Vl=XVI2lQet2s1hzqac?!B3DouW_mg zv(Zvv$n4-|Q^g#*0MALgmXb@E^|((m>gba|EzQNJA&{`%)2lIsoKVTRAbsksq-HS^ zq_8F!-usv@#Y*IT4r*D%NVai5fw<_^wUL^1FZpW?JfKDZmDZ4XRPvTJGJPiy7XrA= zB4dI?I@e}*BObu%F0?E9UW($EJT&2_6wy(?YQ@C5;CAHy!ACxc|MwX1!d{@LXUdq% zxYVY&hjYj!&=-L(+7QXmNkn^gB_PQ?6VOY7d_gbIcrhjAAqkI4hLF@4?vg%Frm4P1 z%v>CSoYL|-U* 
zs_v{ZC%g*I7Cl{e!sN4?pPt09)(2qNFT8;bOb(5#MYi!80%P0kHRSJF;Uh4oGfyor zWY^cUwQAfl6>O$;-lvUY5k9{#wlD&OxZ#QinJtdGA^(gE3aORc4g0(_hTAXHflU48;a?;C?=Fm8u?AOVz6UiEx0^KL~UR$^~Vg<9nyzz1WUscm7rRlPr z0+sR@CjY$Ankjt27o@;Ek%W*aeJoMs>S4T&SSDuOn+yp`2h3vcczTFRy+S+EY4tL zTcx#)$X6r(Fo&?tZk3v5$_rSfTwcUxLSdb%kVjQt@xv3I>cSJg!yjyA5T0k`qC>^#K|F!uJkR#&V<~H`*7k~It1!vsea&}9HQ8T z>fzv(3D{qaZXZ!xl_+x}*_nQYC^~3oeo3I)KGCo$EmAE`r+0uRE@m^(127Fsl^Q2` z^=j44B8$j=hgUCW6bqy&coEE!xl74p@%2=3kAoG*1dsGmak98EKo|uss;uotzOk67 z%@OXMaPIXs$6ljcG(ho9ss{~39Nn-t44!QnOajq%368v+CDi$DfIkN3T1bsh3hHaYUO7$$ zJ36VmnzWOgom`!wFqq(v6;Zf~;`4H$)l2RI-_aLTt~!y1?w9JuLK3_Ae^cx z84Q+MLT~AGwf{1sTxMo1_DXje^vIYM^Rt$0mffh9eDzmzXf$wm+tP^xb!o@41ym=C zcfs5wi+pD%=S^>Ykw8L%8M7e4nh(+j!=hT;#4x8>K5bdqG(rTm_KP6a73DUj86QHw zo$xq?aSNkw4F$2)&Jw3QaTwb zS5Q++=ssLTGaUA+gQ_W(8U$C@HhKv(YdSD4J-h9D{(0HiJ0q)LBL`MFCV|4jBBmo~D zcP%__apvGiD8*$erEY1SU0vbH(t|9MxLSIPlFcv&-%w1!PS|41S zLY5Vpf;@odO<0^cVA8Ich`a9q#sZQdT2?%`FYfMh7H15rAivh-JBu?1QQD17F5g+4 zA(jrxOFqr>oy8e*Jjm~G`Of03-^9>X&m!^}Et&`@2Sf{Ayk`3$rJYLD@7^4h6s5`vU=eR?jMxztjS*@&19pYMOE+^}$`q70oidwcrNo)WCaIp#HZGZXK(kRJdZDZS zWnvqC7;yn_-l10PP_`rr6l*Evo0+VebiR{(Euo9}Z3*2IsA_82UTT+Cm>fvcrk8Vm zw5*D<#1M@Q+bb!)HH&$x$z*g3Pg7A%(8an1gi~S!J%OOI{oL-BNQp{sAv0aN&E}Hf zm-O7cGG+-C$Q5RD5m2rgUYnoS)PbxkyA^PCyP5EF*`3$p0Qxt5TMjb9=1y+*?>jqmaHQ*~pj->ziw zKoVoI)VrQ{fadIwaMeSuS>TxKj3Gg5hp~kvTVtyZA5sk;KD>3njfepfX0$#Nl%67HjL9L zd77^^yvVl`*%n6ZIPQ*diZl{3yuurpZW0J2V=TAv>@?(lSdGml>?Wg(pY<-15Ieay z^jD_O>Gl%oZ)ii)$Qaf)#ClAlNF%?3k19PiJI7a=MoAWdghq~jZ6t+mw!xS>&ndfj zLp_k4vUQG{BuWboTul_f^B+ym2+~5E?sDqTl-R zOvo;_KI?GBSe#w48%JHSpX*mRXPjbp_&_I`Xs*;R$GA${^uq!drB4Amr>JZD#XYm* zh(3xcfS*>wF?tMN&59628O?@$%7JI?zoC~F;HW58^G&%!3xBJ*VdIhywselNW~jg_ z+mV!7Vf&E^CBCa;<&8;-IzzG>`c*O9<8A#`A8cKfJhR8@7P1GA1~ zl^m*70;4K2q7_k4ACtDKHdRZ7s*k!Vk%c5{CBOx_ z&yhoHepZ;(lfXQw24Rpbh`{GH^I;?9`Ft@C_wt4@eGLE$zRcgW%{6+*z; zTRoGu@@*^3Ra!LgJiiXqQ9TLb=^U1EcY|b1jWX|Gm*=6LJf%g(LlAi)@S;op>BvEd zc>S0prImL`QmP~l{dR9NHalCzE=bfTP->JI5XQ8p;Rb6;Xm)W&;-0xWcX(jwrV 
zzs-d1H9u*PpETYZ_!&FD$d9y!)QwDq8&E=B$h0}of@bek|wQW8Da5ry1W7CX&JM%c(JH)yjv5X63Aw z=awxzoZg-LNb{U6^7jIxxtc7&ooS1cjD=WB=Np1LCyD=Et8#Pj?cHT|Waz*kT2^<T_ILQ z{V=M2pkK7e8xu}?isd=DR(+SYbH}exX0t5FX#LA#O4JIh$&SHzY}}$u9k-y=O>j(v z;9Bh|e23HB_BkN+HmK)MOFX*6ozj_v0%_S^UBaDedw2FiqJ;E*SjX#4Y6R^89oolJy&_19)6DxNIt5DJt;u) z(Ft3!$Rzh@uTdaMtd|M@Xb%TKD2uCmV^&(@G{RbM+T{t&fxa7gQhSYAlZ6QmlNaN2 z81DPB|4gE*5mO_uBb%L?hMZS+WCV(tWJmUK<2#(B40=X0N*{EXUw9SpPY4MX7e>FJuT5sUhaqQDc91nv(>;J}N-~R#nX@pNdYmXB* zFex+((-W8vZm7OZ4tDyu+@FN`j5?}^0qtWxC@Tx|sbGTNtAV?ZngC5^ zpM`9<8e`tF1x!m`FB+geB8UE(Jb%A!wpj}=C(s8au)dwqNv8H}wldnz^kHiMA&yfE zrxDMJPkWr{e6er7;l<9yzL+(9rXU`)L%CX|BH{BHHrnv)3}rWV%5%xmZioD$PYPnu zMsALr&&Ha!>H2 zdMe1C&rV9bVb?epc)0Cc^15dzk0z3!To=5BGP`GJ!%dwFzPY(6*qdj7aH_H9pH$v?(BL+xhkf_^wExIZ&*_4Cu=rx% zJ)N~;mbT%_0P30LcHMt``>yMt9-4sKajwcBQmG@Oe8g+k8PZd0n=a8_juiUOkX1u^ z3|aLw%_9nk9P$m2 zCk&lalD~>_f$y|^f>b??{x?z;p04I(xWc+qD6mKTh0YphgKkDpB5=M8W@q~D5@K0% zGQ|_8kZpLU&cZ5LcM`63xi@SDv*+c7n*M@^QlRk^4pk@PX_o?3CxdWSoCSs`!qQy27?1?qWD9o2eF_W)p(tug z#1P6bVM)+>Nki129|s1a9Rpg-WYNxzsRamIbdANSIPIpavW0^{TUBR1}(5OX5`bDg#TgU5m#^0~y5Y1sVM zi&bc73ecBw%Rz=_qX7M?spM2nVGwGtnhpRXF0684G+T0I25eZ7zf5BwF@>yGr9WUL zvtn4>xYq#JO*OjdK83fWKujH?HIHlnkY_) zFtwcVQ$St7n(dE!w}&)0ycvx8Nu1lblk1}`w0pQ|zNxVGDMK068Lf4Fz+u$77Go=S91>grWar_nP>x;RG zAzdqLND7?Zq@doJtp$H*+~G>-&Us zz``2qqhuva3ykc#=^Pa0Gl$MV%pz}^wpgV~?j|eH7qeM$t6qkvoKzQLE#@kBG8r`I zc=+#_Zc-gXeTIQ5d4)Bv$8^OQ{>+J}W&@2RLBYUK-)|^#3mTc466aD^l%uuPQqjV6 z=7EG2ep3r(t?x9(+MKn%-{S5l6qTAg@FCj;_qcSFWnxto7--Q@QWR`eGN@{4sqoxyy;3WdxW> zQO<&Ws9++z0oPc;TnjDOk>X*rTg-NB9=-^F%SCK< zxrSa#a_~E4Qao^_G6t^^b%!WqF%?UnCfm7_DE1p9bEwAgZlF*jX+lR}vM+Ul<=pno z;wC|n;{RI)FKNY0f83TIe47GJX$18KN{GW3?=78%HFkcpZ^n6piE2=iK{*DA&S;A8 zAH_eScyllrq9Hr6{Xp9Ns_EXuvz`0l+-ybRd1ztiHP~!heI*;crc<&x#DgUMhm*tU zW1Zb1Bz24uGJ6+8_jqd;*Mel|0Bi(NophP;mS;Kdc=#`)b~VsyHeJUzCaUp<;HFyAy}J{P0& zin54B(+I9w-!snH{rqEF9;nUVVF0uAVX~nE(uaxq9b2X@Nzz)qW&aQn9nD520a3aI z)kYiQQ)$k$)JPx|C`tIAR5O0e+d3iy7hdAnA(*4ymCssb5P`2|=+raLQy%k94QBlI}xH 
z_9uOb)fAPu3~KDhdj2BaN;k2im322o%F3ciTlm$U^>=t{XJ->6(n3<*Mf9sV4%&7J z0H~yejq=?l!*wx~?b>p8Z9<6!7B09p?5<5Hu^{3LuB~y`CX`qZWd_&E?%ISB3*8f3 z8*$erlvn^WgKMMi+Jq7d{U2N#bJr%6SkOr$%icjSChs?)#B!T&r%5NUx9Zx25(_Y8 za9`YAb4o0?1xhR=T)tCcL9ZexFF2CAoD$1zff5S|m+zEV02+hxLL4dYlvwzdLH;80 zeMw})j>5;Jy&@a6{v&FyG`zB;KHYf`uyAv~nMN`UrYK2m`eQSjonE=Q-|#_YDCu8g z>S-pSpNjp*!lT?M6zL-gw5CF~_(zQwl-Ha-nWmtIJk#v}!;+}-d>(3$Bzz^md@Kl)Sa&`jNnKqydU zF;jdh#CcQSORRh^g)@DdL+HCPT|(57(S+;&E)63!wa_9K6mgr7DWefFgoiI#|6n-f zwQYqeIZ^aFG>1fI7vOp8No1#R>+MfMK79<1VvY{E4=MGbG0H|lq7qe|U(;_@)BV&G zK}VZmPnnvjP=GU#9z)&bHzJ!|r+^%}3&*@U7pk_5YRimKyoTBm!AEk{| zs<%;|ry7~V7F9lfh_}T^rX2}l?#pFbrbDh>srIs1t_qQ_vNBx1f!1w;^{O=nPpn)t zPgr?L2I;!z5i^6Ywc|ZL_n8rH|I7#qHzVGhVIiEJ%|@A|j&S`vKr)YftZ z&2d_5Ld70uG(n>tprQw;a0`_!P`~St`;t;$^1E%|ZY12}3xn7H0eTUOYFHSfDw@4B z!^gCg8jt%jomV|pI7%eyIiiiv5z~f~Iq2Mcjl!R_?|}mLEv-4L`G#I0Fu{(EUx>LZ zbc-6cd&@Ryf_H8^^5b^6$DOLjoeC+=)1<#3U)Ns~kiqkAaF*TnxJge7vi*m#UUkUK z!@0`RI~|Yb*b}{fUx9v)H?WK&Z#N2!k0XX?n?+a26LQ{be~@Raa@r4Gd*!v9t?-p1 zVnKGy<)o(7B;Z+FLeI5?Fg|kVTQ>AuI<(Q;(xF$48WXaAVS#Q{aE=iqA~Yr5v=o`N zs>rHJ>x2vngL5qsiol}vB}Ke>t+6OB0S{==n5RW!KMG4vKz14Jrp0m*Wwnd1wz_V1 z^nzEtD$&ata!D!t4YW|$vbk5K7E%gX&o^Mm8IG|pu#HFuRXi!>P*-vZmNssnP-~P@ zpNLY5ja*uLrqiO0T$)M4uF&GR7i)=Ah80R)NhrgrZ)HFgT^aenz6#sZ18nT~Q$`v| z+Sk>%GDZ0MO-(eYi6ZRpKXldbI#vcXCRfeER+`gF8TR>Wtr}bqSrPO*Tr~?jyx6BY zD8hE%WksM9DMG7O1&$h87x7x3!XoDYYsv-gS;4+fFKFNA<=6Dm{}lYXekltbaJ(yf zL*TCgTGo|ax!je#Q@@mjE-({EdLmA{tV@h6nDEz78g@l-*f3d>Nj+0brS!xV*K0SN@97P+^8+HfyCJjQjN_%r=76L!!u498J^8b+MC8}x^E#WGo< z3%025IOf}rQB%SGeS<3*+4`CnqdQ90N7ypqD)A(~{ zapcF$cNTMb&1F(M(GEaq37{DL_GXRL5dnApG(=l)tAn1AcF=upJ=0Pe{pDATe zKnm*=1+)xYL1}0QVGFk&3sLd^=lyzK;Qjh6WT$oVeq9%Mzy5DH3%p;CS7qL>3+Da0 zVBW6_=KZ=bbC}^?Fz?renQJUv)^VmC&HHs>=7{BwTD~34`*mR^s!GoLbuCoyh34CO z;oE#$ry8DLQl07ilc-(ivcXqUVhkO@D+2~}hU0vcAtIrkKD#_J)SF(?etm{hxa5C^ z+3>D`XX_|m$0BULgkz-5p=`-_$RLo^P#)hhjzN1SVFHtWK+PN^1)j*E)G#K*>J3jp zP!yGCGC>k#JBJK&B*LyHs}dv8++HS2-CP%aGg_sRjdRF;QgdhyGUkX%M)2gByPK~X 
zv63wf_j&!26i|7iZV9XMeOzzUEn(eKqgytT6($kw{(#Xy$6^_WPC=~uSV_9Os+HMx z0+~;f*>wU0XSC}BB6AAIa?Cx)1nfFihH-7x%FUmk-0kGHDHZMcKrIzykFAAzFx=-@ ziMq4Zh`Kj6ai!-3$d??HO)Pr?U-v$5;DdaF=(?Sdkc?f3+nt|n+2x?g74RDHMnDFmLJadbDIhFY6dhCP>Y(A2|YwHQ7ySD zoI63f(Ckmy(-P*5;Zsw5!QL}wQmHUfq349tR(k`Aks%4pVqX^t&4Q<}kwYWNfSwag zd7wMdInY(*;+?dVvz*S|+dSua=^9~?gq8y=2u4GQVkC8upkgV-q+&Fz z+d9#tr+~ydYJd!pR`ZMT8B5wat&2N2g@C-`VKVk6i;u(qy^$$yO&bpT>|5k+llVc) zXV!OjL6Pm^#r_;4L;`qQ;)e`>{EKJK!6gcES&^zD~@goeaN_>c57zw zX>DvdWy;Dzn;bWr)g2i-)t9wggx-8u%->p95KnW?p@X++n{ITXIC=3+uX`5$HWG!$ zotX*kk(T)=6%rQ6oL$l_=ZM2#^2>Sq~FoynayI4?4 z{fyDt^CLaYw4|)BiayT`Q!TmRFi(;QOP@^;Dl+7#S9*&5UezmC@2yhWtkk>eF*tSh zGxPt+7daErWwn;Y<=o~ki#6c~$M$2whWG-xFCiG{V7 z&nob4w7Xu%o~ac0uiw8AQau3k+-EE-7!DbLHdIWG9ukVHFQ_wH&$;!%J?e))+Xs2w zA;W0yE=0tN_YUWIU)T8U9FsbH)z85` z^Ho3P*j}{;1=eWIPe@0*@eas?_UQ09+3htyHcf$_pknqsD_Hw(7Dqgeu*25;IE82} zg`vx`?@)0|=WC)?sk_eVuCu!9tnR9x=m|a4O$`TC_baNK80Kgp)cu4i^9Ob5F&&;X z5?GY&0w`O?gutde9zVFPv6m823zra8#88TFQvMrC;Z+@k_G;C(S?BlYu!U2ipr@ua z!nHn|4gE1a4F|*Wo?h(@HL}Zy?DdV#oPuWvwbm@AGn(a0a!U`Z=g@l7NE&xOhlHZ{&bDI#~0jcC~!4ZT(6s^z_x$`Ml5R|z@a+(6W{^>Lzs0Agao0r?@+zW zW3?KKEOTyhokR6%bEsb1QhaJMR$FCQ?WCyO^C9vwrT>YpIiv@`8#|qmbuGAnGJ)LN z$hrihbvlccBdVGDwiYqGHH<}9{RtNcjFOrYH4J0WRM1&9F~(j3X7)X_pOjQv*&3V$Yk}X31Mb>etQB_A`BCNQjQMW`?cOzOwxd+IK z%%oJxoM@$vj;f82N~CB6eh)OMXENh+e=;&#cRDv(`J+)*FcR=+X)plN@>*>($utsh zrX*cmCu$NGj^^k{z@tUhwM8N8B&CbyXO5}eA_GOVzd26U;c9FB1X*_^2kTNm^ED9A zN{d!7@N8oO5Z%;u#>u87ufX?)ZB#H%E^ZYxl?SrVyCD$ z?6UZ7q6=qxLq8{L*MgU=)Nhy==4t(^Q{6V|g67H{0)7wzck5^3^FW@~$(j#*yQD@m z2w3fsYKWl@HhPf+Od_^(0baE4tyf; z!SI0q*`4lu1#nao2EwO6^o3$$!UtwWVo?J<*TG{02kR^=oA@6=J>EC~Gi9fw{41oX z>g%csLiHihQ=2)toO@$s4uaPOf zC?@7bp{}C0d-YuS3wu%T)~D$3yXog0mJQJ=mIakIkR_^z+=Od!*eEYBCQF48%m0IH z1^|8Cm(tW2BcCSH`Fk?Oz4VO;b`X~_Ax6c=#oy3{)$3jilfhnxcXX8yNadWeL6k$n z0|bMz+cTb;D!i9Bqmxsme>vn`dY8H>&FKeAjZqDcR?^90nWydJ7VC|jMvv#Xpbs_5 z6V&QOTvjWi_6rQpb?(~9DNoRLoZ?rX@=EV>|AQZi<;56Agjrc6AX5MKSv1=*kI14{aRz%Z 
z#(nx|e*k`1Win`&5L+Ng^J1=B&T?vV3A==N`4hpV;o#D+U4mPVp!pLa7lvTz5~a|-jq(N4YF%3J5uk6=l_93O&*(De>~Ab6h0m7<^yTjA-Na*o6+MdzEpSApRMVFw&H*E{ zL(gG4X#@gx{WM#;!C(MP!l5rIb^VBj z-ki`}Z3*qomROziKD4BlVw!nz&f&ekj#W}*4v4g-&`Pc;U@9~-4w1A#4AX95NlfUa zx{-N5%r7%=geyEjF9xXr&c2&ZXmT9&o(ie=RHH-#YPIQ9smrV;y3sYoNrW_GKclhX zX6$j^HGg2aS3_6w1bv`keu?kECt5^h@DHC0qj*@iHq*BOm~|7wFPdE|&T?)1%9K~w z06n+1_R6GJAK$|9Uv^Yjx_qTq|L!*^Q2*{7S9<^QXoc5ykYBDU3qZQbp8kqBfZ=uqZE0< z%l#|3^A%+#7Ws270hlFY&J_^Kxwgaz$9#gO=eiPOkcB-7^5>AB^yav5$4VHVs3h{3AL^MTrxq^5)BfMe(|+~iJeY@7$@yi>j?sjSzVeCX z^dT(KrZ8vi&v2k`?D>i7faXKW_>`{2l+Dkl!o18Z)w$8t8MZn(@?sH%B+Ex8>f~#P zht18$JAZq`)#&9eHLn)Ri(UFwP~8DnU87aUk?O)$UHGu-Hmf?tk8TeK)m?1WSxo7R z-=bf>`aZtYuJbuKdpU`M>@7>5!AVCo!Pj_Oy3$;Ms?GF6WVt#Ti@V&vn*WW+b$SjB z#f^ehuZH%_3WeMktg$a>$7t0L_0m=L1u64>c><|MxQqzGyqdOG{eT3}cnlY(u~S_I zSl`hCj(n9mYeC)eZ0S=|Rl!tMa6s+Z%Z-=Y*C5Or?fGJu8Z0CB#V{4Y$2H!_G1hn^ z<7XN#8R6w3l2=szACCFTVO8@8)zCfj8A;z&H6PV$k-<#;C>^%sMEeGWnqs7@-|N1% z>$fD+gLQbsbawX=GT=M68&$s0-V{=X3Se{u3>A`ghQ?R{GohCdE@-v$hFZ-Sw?EO# znbtFrP1+XraW2_oN80!HkwPq?byD6y^qU1dw?z+-U5lW~X=PNtR>~~-S~=wvL|c?M z^eu!IS{m!v*bH2M=r}zK@v=nMYgC6!B4~i-ut+b^P*zqFYy?XU8r7z#QmZMAZ-J!@ z^ONZtJVA=$Ip5twLr4f2Urup~T!>GQ1wOKVvMa-~Ze>ujb!EOBr(B$Rpn+bacA@2K zTv-}N83w`USqEbad}w{E6@oFddTt1y_0Kfn_@jw>po-)^M^OKWtLL^L!1RY(sps=P zwqCSCcLa_>z;)fjzT82f4|hBH-qNR&d$l`8c9qM0ak zb+0}^?BDf6eSqtgt|XMMmFy3&^2OU1W~Usl{jE_~?neDm&iM)HKg>^vdS{Hwyoi3- z-gnY9?E3roY1kKpL>SZT5>+k|rdHpewAi9ek3;5;VAHzWcpGyTG$7TmUp_(}@Wqw4 z4QV7ZwWC8ry|h$|LEVsqklB(n0G*GJn*(_L1_!UxD+HovO&^%j=U5HwUwu{tq7QS9 zFC*g+!qh?S;y%TO%Ww|oAdjOwy+=_Ie33*0`eSku@;E+8(2m`?lQ$lB4)VVN+091w zlk^?}LE#CDQl-vAVC)iFw2BrIZzGW15HC&JR=P<@yQv&Qb}gjYEr9LM09=gj#?okic)VM`aQJK5Ri{db(TG~8SGhshN`-^({I z9J5@<7UaQ>Zs8jRj09W zK2O~>sym@uDgpXU+RHNt@*JgEZ4rpw+?&iV7eYF3A*8!3gtWPpRVtdY5YkzTTD^`h zlZpsw?}^8$CuJd|EqFqu&I<}29nC6mbna#9C4@B3?{>kXxza6nFFD{Y9LVr}f9shC zT1+bV#-1Pf;H`8Zh4LdGQ15Iudim|(#@WbY<2ohqff>-G62jS!;B%S#qAFNv!K0(x z=LE77!K1nF$g!h+-^`*pl#Eu%6+y1bVn>5O4~7#4w>$U6lS3cWdIP^OxZSN!*}=+0 
zfVz(dk2Vh=GbW+#sOn8Z-TkNsUwP#qYZ?HF6aAK!z}Q+i00yF#o-zxY(?_;M1;ds% zWbF_^;k<8XajKB0nl?QS&@yWfq=-v8? zaC0qWl2fkXH(PvJE5(z#PSbev~Pn#H}lwC(||O9n@&U72;y zOl_XCK_H6wX)T!n*Nsl4$^3-E%dKWES^QnLRC&&&rR)b-FQ0iUIp?<~Gg3?7T-RDl z9EIh>G+kIez~Zh}tzPxZVDGP43Ts%)R3|@q%2?O+N7`)>G5xG_%JB8C9VJFsehgE_XSjn&U`sMtp$7UNIWkD2$ zyJt#0?=y>yFL59Iy?U8#E6^V_^?H0}&`N4vWW7hj|5d=#G) zj=b|k%0>a4N7d*c-}UW}f#Oq*{+}=~U#5%(;^Tp=eZLoLwCb{=oj%NWqNciOw1ud3 z4sl1J-QKoxDwh(fw=u*6rYey3fAP|$ao#gcJ4$|rmggSB+_46x3CJ4o%cN^`6s zt~RFfCLOD8X31;c!TLmOaFB4s6CmM;Gt^%kh!n{6?IJ=eZ%LIF0<%BwEz?+S>3q?r zTf{9i1*rV!eZehQ<*nsxLwcL}( zg+F&DdO1efTR1}JlCu+b{23*1jmJKR(s=#_uUcW-iE7vmvmUFwM(l2CT7kt{AiGwA z(91GzuJDFh3~EO-$UCrsbcmq%%k+tm@!_N^}*!~nEu3JB!#7Y$Rq+aZsKc8NIm*JoCoxHt51BKqvAMLCu@ zDL|B;s^N<|DK7MU8oTQEaQ#toxp*FFtGB#sres@Kr;q~K`x#9I#xV0ex z0Q<85=%UOO0G*BKkx8-7wYQ0`%ML~oF8(<&XrTRS3B&=Z?k&5JP1V^|JFI>ZZ74oF zQ(9tJef^QJ`f-N?myJ6dKn3%(ajYIapcumRs3K7@W6eNXV11)N_{tbF>P-m2#u#T+ zGa1LYrt{+y1aN(L9D85K=nAW3<4i_nSk>(#tI>z+_l}1rs8i(;RsJ%S*GUziN}ziU zssN@OAnLT+(E2Vz>zU!PZbR$a46QHBw6=;spQ#&rPPB#MXci4eNFHtR3FPJu{GDUY z2uZ{1QzvTrXOFI>c}7SIxsUeD6p5H}sJ$9a-95tV_d4Q`ss3N!^;Ak|0L4RxJgjBT zf!Fh3@Orv90PO$aoqCwuhw^2v#w@I*hAFK|^efH48rc`K^~m^i*CD0^Y>CA^ z-_^V8AXtCloi^0xgTgEBd7R;G7So^x8phBOTgfv zx+o?oK0ed}>$f`$Nbmy|h(wA)0;vXK6)c3P(Q{Iz8U^@S9KiCuii_@`$D`O|_@*a+0X6gbWG#WWAX^=GcevCQDVnl1ApvbkkrTJZsBdz5O@o!isJ z{ZdKgzS$I@-Uq0&q$6A(yLsW4#d=h$edrf|uvGXNFMpc}5wXG%*P~CY*BPTmz35K$i~PTBKG|e0i(n&u zMv{l>+o@yk1{GsCsg;%x%P|SqrvrjZ%k9zxun)2sTpG4Z6TrSR7qbf2*rf?ze<(2j zCSjK*fPG~CL51R&%qYwU_FeE^t8mmVO#u6-xd(TR*`*0!AK5fHc9(?IlY#xA!19}f zU77&)Ss#MCB!&)veK`FfTQX;4%m(Z;gM(}dm;+!Rp=FRQfwJVs0``Xjqi+(H?SOqo zeNcK4*^K*Iy;H+J7h-*}fPItf@_iMLz+}5hCmRkaWWi?0XtGVLt87bCqyCZ&oBtBF zzsd(Poru6IJ%|)UgHh|Iu;VBV(Q;6>bEk{L4Bo4O+hln+IA+_JbEjq{-ZrOe4B5ag zA$3L`^;6j|RC7fJ;1VTZDX~{p-wVl;YJA6nQh0i{h+Mdh?Ls7Gy43c!;66Lkii^no zTM1eK+uo0w{cYJ&zq5qYJk|4bPJ0dI=ls%A;RM0{$7h*&)tXwG+>$wHO-_?pn=Un; znq~F+5-3@5lH%0`jb%bR(xE495?`+o zgVq;V{}`Ks^u&0n?nLdpyY;2C5N{%x^H6>#9G%%LXzHs*)h>gH&3;wayOwctp5{lZ 
zmccr&Cy^~tNwm8}C3#ra{O=$H+e+}Ve}bLd43Z4j4GV|xiK2*z*HK(Dj4y!wqpqH7 z9_8wzUakf3{}W|z&KE|QfX#saI#K3l$;d6iRC_)rOUV2z+5t<-lp=G^eBgduDc>Y! z_y*esOoSR^T4g)2WXuhZ{SV5ASGUwg#kVfIo!yg@T-8mL0oLc>nQ2q+fH$HAplcxeoB%uUBV6 z&s8yBl8WKcH}L}h{1)fq_0si*_>)WJ^M(5#AA+TS*tOZb&&Dbzs^twTty+4CYKaL> zWm4eJeaPyildQdD&{I@NHb>W4Uu|ZV?Na0e_+i?9zqPjXF}=sb)*G}Mj9*84NWvbH zu(m%%+cWlzi6|Uij2x=JQfdHnw(E<^Z>omMZ>CEh(W^XRXDjLBs{c(gU-cad|5Yf2 zh~Y8njFH!#|F*vEDSXx#|EIW@ud)wTbzQpjkI1j8Wlj*QvUWwRU6Jcm(Nt+;wBM^P zXxHa#&oWLpxx_vfS+wsGsRb}^8h@a+H2#2=p%$6Vm$nZPTR0EC=c>`;i6_kobuG{rvl3J-?C=e}3-i;{{#8E1wpv7;%`J3!y zUncZQHiSawd0E8Rx4keZ=pYqW#y)BUXLBC3RG_fnD& zQMX8uJ58Rh)lvo__Z_yie6WNJ3)6S7Tr|h`p$~Aj-5Pkxp^oL$f#o~P96AA%fnu7e zw-LDPD5h#e!4#svWpNIliBk&V=>@gAI-s4Zp&6~FmRibu5`}<$P*Gc~47%C&RHqO? zHBC+*U?0TP_g!-_v3C!Js&dczL@V_yKp|is3X{E7k5dQ`oIYxgx~hZLbWj5FN_NTi z+GF;p{vb}sRf~8pbOH83XPs-cz~a~wcz2Om*pT<~xpStF<)jO5UhnN}>#)B~RU2tJYULWdN!aVLh%3un%2; zuaUviZwQnDAlj&sr+&L^S7QIJU+O%z@R>jvK)oW;&5PYR!bMuBAIcn8m~Z{kVGo7k zaaG+pYNf8$52erpkVeBI_YUX-SXE|SubvsB6l1K3zGd{c^3$+ok{$rs=5(0ouAS<* zQqJ@MRyfmn*4CTHZRaaxaTKfhMRUWGOBS?rKI02ihgS7crxXEJt?H!>+GQd)N=Tac zPf>)Ynv`^fQH(4BH;gL+$b&|#>RR*w*3d-s02q%Xq+zp#UvJ3lt)ktA<|E%UR3IX#|4A%3}2S_&j-dlDl-2>mAEPat^ zz;84cQSrPXF-k%0o>B(D%+UByF-K-~5AX833FUxiuvlf&g+P1^suSW-zJgG8RzK^? z>_8_VO@G>=&`APnDh^BZAIDN>>C-3#lwon%9yf;io@EL+g@6O55TK1UjpB|J0uGo$ zzyVVTIN%flPI7Ke`4o8W#U@aPsTm-T|DLm!hIOV8;8#qgc?FV)<)eB4=699`;U*0^ zMLzyzNH%Vv4Mp7PFuji! 
z&Wv+o>R9OA+!*3tS>DUup3H8ux0eH5?Cl|U&tyx8b{z}JEHAp)-SgOdbuGnD0LIza zgzH?s{t0-M#ne70FPcaX8xA@Ln)0XpHU$74R0{=>tNWEm4TXW7qyT`Ky;h7rXOJa9 zu-VyHaGQ_+J2U$WbQ=QRv~OhtuQexacJ|I3Uv~DA^hU1|-{Pzt*;xgKR=DsVq9E=f zs8~9*DaE!vtR5e@*WiD6A7c{D$t(~mCe!kN7uuF;c*Ywz*}(riwu?^UHW^SEyvVHX zuSUHw5GKjdzpZY|z5x3RFSV9|CtkIJUznnow&^6!Ak~V~L8hAhKAIy@%2IwDqF+cL z{Bsr_6qvhBl+9vFqv>QkGzGKR#Sksifxt3qk)+JReOkG!oFmG=h-~!PSS{0$MTj{u z?YP-k5&G+j-J+MYk8@#POHna7U*rGLB5|oA?NjU-*7%&WntI zZ&L!P+y69EtYH-|MJm&XtlSc-FzasZ7h{fB&V$CVCuwrACLyo~xh`u>_2Yo^ztd*c zOC2;x-61ltbwY;qGzJJ>>44VwK(w$@Z1`+?l=h|7IV-tMZ0s<<)ViK!9EG?cCtTo# zv_$ZvvR>DAf+qEi;QwkR5GeT2|3aN=+YmvD5MiC5ejs+IFu>ifqUkv=D`f{p<}4y1 zB3R%7Io%h!kBy=sTME}oo|d*O90EA6GMQ59N32afaGS^u7E;xNQZv*3C?R);TdW@OcGn-jMq9=R#b^M&hDM5|SL2nAO{we-9@IS`?Glm1N z2TFyW!4+gXJGVvjRxo*I8ea#uKF(n2oNgeuA=CJ6ZT}tj${8D)19`0N ztDoiE7Rzxj#i}qNC4z`5ltb=3-c?Z|;7~V}Rn!dlsaDzp=ry&E@P9AW!0Ri7cm83o z8c3Yj6OC49reTxUFMf{utESN+RaD9NP;L3U_?0&Rr%{lg#fjkHuQInM1C*aAEF`)F_v;c%@C*+5=`7&wyomj8p>9|x zWC_5@_j}b_$!AZ%nK(B&gZogNTC!7g#BsQKOVx+8#Aj!)uopn}CEAci0oXh+J&n%o zp||d%l~3`oQCdlGa#nin^S-ftS72;!9fgJgJ3+S@<7Y!`4edM7P2lHl{?({^@p6tS z1s`fZHJUp*=Y3AS>I5DsiLp?wc!dEb_#Zkl@dE((eBy_K;S;N&CuO&Q)S>{RNTn9J z=8qCrz-!J2t-D1`<*T}MfL=LBQA`|C!2it+FL}#EFie8~f546TZ~*@cG_|_hT;1EK z;4W2->O86iVK~+Nh-yZq9#)!~t8UU>lQyEZM&a)A7EY$U#-M)ye)X`ku~$T!7j$z% z*uQ~{`TKPKtPU{yvBcGabShHQx&;11RkCTNLyx4+OC-FZl-|Y5-?}(i`<0dxHu(?`kf-*<8Am&7LKFz5J#0DPBNU zE1~azM1k~px@u>^f&xPb33`?dkfUixp0&E*uLZg3HLObJ6kSv_ib%u!5HBMOUi@oH z;TK&f0=U_`u*-_s@)ae+C*;H!rw1DomzKhFj^G#lS}K@C#L3kDY*s5qukoS8Ze!kv?Pd*s#7t!Luc8c9uZNM zwuYn|!igZNTSRvxbO-d5tx9r=j&_m=Df5Yr1`i8>RDY~#Zhgp4;apD9{RTpgGqa_d z(;dQ+UgVxgb5w}PN|{IWeLTM>%BgdRj$aJdOoF*lur2U}9>oi~-`Y0(p{uDD)lD0# z+@a&&-|PQBbRnv_Z>Q#1Q=r8Z(*l`Kg@%5dL$2#iQ7z6R`kx`_xD)NqahK?dOM{8j z093+-@|!fvC+M=iQ$`#o#WJC7dV)Ml=IRWwm#0Y(dmd;$8HLzy02K|wcrm_fA|v_HYr)nt2NDAByF9_dAf9QMXGegp#`Oy9gtHniq|n4(u1?peZNwL*kAolW54fLs)n=s z+?guT&4j1;6*-x{{pjpz6C;I?Xtqls2zs1yx zzhT<=4Zrbkc$rpb2MIY7y!xoUM)2y}I79P~HvP)Kp{$(AeRZo932ykVe*?Ye-|$s( 
z4u9{f$k9vsjZ&;n_l)uP{LltZL-n%%I71_xc{ zS(%{ALhj6OoA$@_U@kiG<)2c{_?O@A&iv9Ft#lLPWUb%3Keg*?{TKP3JM&9_gR_>w zPP%sdgZGm){^Er%jeqFEXIlpUEgpQdLPzky3YP~jSID*u{!4Z6QibzL*+UC;r0ju( zxtYO_!#%q1Sf~nKqA-(`mB^GXs`m!uhbP^)Vd2{EF}*(iiDW+ciR2Z@zq>1#HG?); z=4@^0TwB%~bk2qrpPelJR(|}Yk7D{Dna(v^xSB2QXMbvACf&&yY7(~biC~Ww$MD}M zWUK-g7mI7q89`S>q9q*jH{+Neff6Y!OytjCziAV*ZM`nazo~ftPF2_3@T#}$-!!6e zcZOGp-QozM_!7Isj}D?eQs(^G;-lO<(o(eGYb2#;y;;$%n_yZ`1Zp1u{(~mShS0Nq z9KGAOu3Ro#hNGxFiLr=x%~2J+Yw1EG)_0sq2~nIi)NEtsjB2{)hw{+Q=EcHB)IzJ+R>oFEUL|=| zl&cEJ?l6k4lw^6<{swjHyS}PazLDHj${#PEP<}$?Yn1Oohh*jc^LE-!w&IJ#>Qtan z1sb8;2@{e=e&odp^~>4lYL{z!BbBWfMlw-9%vM-Ssyj+5Rk6m4rRvwPk=Byxijr!p zD0{KC`ZCmXEvbC8Ozo+P5votsbHsD9(%GoeZ55(_+v+(|T8^?~(p~i&d7RR0 zpKAEP#U}h9n8V{Tk}r#pt^10^pmolMx0JV}ltL63qss)S)eMS@Nv+mwYf@Ph(g10)Xro~wWkc9IwU^Jz+M36$`QR~` z!J?*l>bx%K)nzSO>1iAa^jI*+^9N9TT})ZkX*TmF8~?{qiZ`ra0*JeBDYYU*J%nRI zX25!8z|-tzBJ?s$-@+vvrxs2zmrC*EZ=&SGf_bqi=JA~1WZRwbPWXJjOr#$2$J8$C zIjd~oP<1X@+xcFV=i)*g!Dlp~k~J#%EZGT_tW(KGm3&=a*-ExFZ1(!K)bc6o&e!)O zdf@B&y7iny8aUJtU+_M1A_xqlsdL2kbl~;$ypM~vdzRcTrJ@B-$DUcreVyFGEF`g; zy@3`RyhICo+?noVJ0H2k_wjv6sS#V-ts=F~VQl8=gx})doAU;pH(r-2eN^aetU(rB zpKj^Ar*AWZtg=KmpcB!E;qEq-_Eb7z!P7jQ&(%30ow#!@wuJ;gJCrR&Y-nB&NXtXL|#EcW>ZR_$bM&&w2~LG*Me`gbOVk zk1D*Mqpi&+8fyYjE6;_p2u2RVN~1;A{L)WaUzM*#7N`28Ce~NwYmqgFh5q`=nfMj> z6jF^UKf%T*SD6!ek?D+4@({y}ug_(-KC3-y4%32*iK&c1c$SzMD&*@)!JY63{QdC^ z7h)R!ex$QvOekDSngC6t%}<{|1kzp8o5=5~;27<$SGqf@bYXfc=|Tm^|6AI%hQ?7v z;hUY<#8B3CjivO(N?S}rJMJcF2r4zZ+oWl_wY8>HV#@67s~ei#gx!frOR9ld!B<^ru3IAPIu@hd)fwDyV4xNEHMhKlGeC_wK!W_ina{2QqW-H|IO&%$+-D z&dg4BeN$c7p7QNSIF4u>L1<0+pcPv&=3XOWjknOqEta2W%JOqOmYjph=c_*r%e(8J z$M&W(*xm#`=GO-gp%+d(yf-<|@^dp)Yo==JsM@+0AV05x{0tGfU-kvRCJSxI(-7TC z$kDTPefaUUWutR;Im; z{v|`5%OJHc$5Us=?R!dOLY)>}EGa;Vwv-Sh+7f~!)~?vTNH}Kg>+OBKXTTZ=M`Arz zX&)1Ip)i~-Si|{YrvjX?B~Wv4*}=+ZCz2(uMy-%ejVIGq(XpK(ETsdClZTM=Eal-D zwjTcA8cAh=O*O>0V+PnaLlaOW;IiL~hXgL$-!Y`epJuq~ z_d;B+-;)uz>~~K=;IiM@r}X%S2^UxWPM^^051tgb?01sh;#AIu(5pgUj^{YTRllb$ 
z==J}4UEs3c#1(t}u!xa>E@Dlzpux?#YD4MfFT>s?&+d%j7p-+`Yl zFgvo}6@Fm{s}Y`m<~9g@c^n2d>hX&VSK~2S_4@xjAaFUJ>me6MJ90b=SXh9l<1i4_ z;GCMpYLELh3Ypzqam~+7S$9X9TLuK5QO?9TvStpu(U+&h-!+EF-c!k zQ-nM#;i8%%gq^vHZc>$ zA~6v#7b&F5uA60r(Q>I!dn}g<^~VyaQt<+{-15mw4_fk!VHeUP_R`$ncC@q(x;+Pr zltgA>6sLumEIJr&se{9o9L?p@d6+o)Q73J}gMn>OR3(qHy-H`TOu^2j*>wEhtF>SO z&v5zNL!;fe(1@E2#D@Ezi5N=#U7?MaVUFvA7r7n7?_>UyU0<=BoJr9-l>l)Y|{QtMK95D=EKp|4Vz7 z_uhYinlApXBQ8J4co;ukH`Ms?{TS5rGd;eF54wD`Eer4Sb+ih9jPZRk8QQ$h*N-ti zkCUrUGrq|Iwsxa(V@7C@@f9`l0pqJU=W~3N@nKaaA2NPiA@K3z%J`3Oy7AAl@&B4B z;`n2I#QA*wpn26dQhY2EW<0k*;K50yAX-5v!&cY9uUMwQ7VY zRcZ~)q-973gH!~q8nr5F7ot>)P$i|(KY);eKoP|V2rYj@k^Z4A`My8rJWtZ9xcmLR zetk9b+~?eL?mhSax#ygFpU<`CPpM2KD&l{Mig#8_zs1msiWw8r&=v!?RQ!F#%!(Ut zSeB^RFpc;B_j~$%>G!7Jlb-jUcV{ntdh=lw75RZvvu*vOLGjjE6&2ZmGupHLTY}<` zdC3pFZ*g1yJwfqWd-;4HT(v3kYJ30s)%N~SP`uRMzdLO2j|IiWyk`fNU(0*8lu0#a zm3%kAfm1WrCVG#_7H-)_Tt!ek9boCU!XLukqq2q1Ed(-0kPU@LTMTW)(f$Qsy0j}h zaA9M%v?NuVy+xI$ziji%LGg`5=YCentNu1+QZ3)467$W^2K_I-R^@;sJ#b-5sW;Wq zzboj!^Bolx<;D98PweYGCR1KGnJyei1j|pFC$2kFwF4iSIdDNZa6)F_BQ=?Ui)Ia+ zTwPj}nlbQ^+R~cTjpUHwHA`%&-1ISmbpt=x43x+|Jzfs3f00smoK&t_oGGnN)xjso zb){lzRPjqv?WKuHCGRknMV659hj zU+#U2QYvhlgrgxC$ab$t^%0XD$jxXQXqk}()7mb${=RItbKPcfXA=K@;s;Z+p=%pB zH#H+qB4Z+kqilK3v3Vj}lqltynWg3%^Q9H3tRUr@ihOBhN=*;;#w>8v+wrvPl>(NU zQkl}T`O>P?xKKz~&7U1Obw+mJX4QMPG>^^%yq)B>1E?4{IWth5y7uUx_>)&twC+Wz z_H5~??7#{uK)#&%Z(-#J=!BVpd8x&@;QD9C``#?I)s-#Xr}wsU^|h+BtAz>oN;uOz zFVz#2k2+Gsc?#0qn_8GJz1UWIG~b;`)n&InLz*3#(o4CmJ2Itxx%E#~TA$1Z*WZzCAy%+6qaB2-D>YCn^6zm>*b~>LZ);T7&0y<+eUUO@u&mi5;!|lX-ZV z!CPj47EK3dOsZq)y?%r0=4yhaJ*9Q2nFI|ejVl9wJqi>X>3s$-3fe0gPR)ZPbE zm-1{Y=WnP|*PWd&uTG6JR4BsiJar}PEV*@AdtI}8o%OnW_h`QRk+$IaF$(guT5&OW zx%H1!=E_aqSB*_jgluAW_OLZpX<7K_P*O15#gnoGmv6ZMF4tKBDuZRkZ&8k*evL6k z&MLLP!w#J?Sf)G-GBy>ksLG&tb_^Mx3Ru)(LGh#*GBFjfsH&j&FEK=ugQZ--tXFvH zZ-QcV92Lefrn%8 zuz`c#V+}kKgGT@l6s>b+n)d|7mp?%d>K-fiBu?5?T%mrXe-?&B|I*F7mo{g+UoMPR zZYr*eF%+Ca*tB$J_UcW=;&fy&j->B}&6(7~pnNvyS%!)FJj2IFy>+>PRX(*Gw%R&y 
zK5<)HIkkw6y70WW2FDeR3r2NaTWLI9x~KWqL3tNxoEIFIi8|i{O0u*$uUX~S`O>C* z{kCjjdsUV(lP!!U3gdJgHD|W4ClM5vd8o2CoetK{4%TjYp*0BKoGZV(Dp&qw)%qvG zBlqUYiz`djsTVRqm`V}d@iag4A5-W*EBH;mSwH_-5v)Bz^g(!Rwy=r3Zmr4|Hdd`4 z506|Qk&{R|=-4XqEhN`*`u-P!kCQsiVk(0|ioK|TZL?48+#6!ov7lQc@y%2OP zFuKnui3R&}&0B-wZz;pyRv=BYIhEY$D$ePe2ZuaOY2w4oQHvlpK@b zt|$CFxXF|L7*#-x5UHY$)AxV^9(DL@RmVNNp1()wzE=ez8nUyjgzvS&_5ai4By0Aq ziMGT5ozCdqj99ZYm zB)we|SpKl0O>C!?8!idzSnbqjfN43%1a+%yraYsZx+F-pR?xVqRAG{N>OXAY)U8hW zJXJTC8Z$`pqajH7R9Ki~%6bj3pl(HKoJ51Vm8o&PC*M<>4w48O;;Z;aqR3}mN>i8e zoA7i#+X&LRsm8)2li1xBw>ni}>8(puP^=*N(H2maq*@A-OlGS+onk69oWWEG4mH=> z4pKhVUYKNBTW;yBN!3~&Fji{`eXKHF`p4vlt~ash+|l zGu=BKX;rGlQ(B#Bajb9mQE#NSI;B8ilBw@l#~n8U~Fz z{^GbNMgtQlOa{HLS63Qwv{k7QORZnsjVx(m0(8F~DyF$;3wcn~be8QN%LBJXbifzU zOu0S500>vx9IVjbic!+lJkv0wp;3KRYx4t3v$^gknTZ*JD^{xu6Ii+05UGxuYZ*wh zu1hy>T3VaET8XT)IT@r?HX|#wIej3dmK6{R5|RvZ#*DpzRw3TixbwYRjIuphz z!_h=loL-)#g_QYzmCL!o_4h~=3Sl^Z*a{c#g6n@hw_J4G0vBrc9Yhf=bj1bk*uJ$ zC#{q&yv$s2ZUh;NpB}^fFgJqO`~dB^$1p84Mllod(_@$m{w{)408u^_bT0B?3fK>C zf`m~_=rN4@N1~Y8D5ln9810**n8qlk(PJ3m*F`ZcQA~@+FraNnkxzRR)9x{^>Rqbb zyCP_pLyziRC}>Xv?Qv*z@0o)3MbJKn9^E@f(4h!AIRm5|FORO>h~EF*X$kM8-%CSg3LsvUPMmg5Cl3p z=r^1$*8+zvkPZ5>1&0;JMtuZ;LjfS7?=FG}2HVMNX}cHRM|q2a}S zG-MF|L=TEpw6#wy&;llP7y-yk07w}sU$hDfuh4u6mtXTIRnhL8u8bB#>cSxuwB4y$ z4&SFn3NIdZ;_v9Ge7l@>m9LKE>wtO(kTL7yEYJ9RBTjp{KPhgH6A#*o3D+@%R_XDK zSu`J3PT>`!_V}7T;JMFgYNfBBaBS}|+~;)c!oLFn#r)}*5q?JxYHFR2Xxe88O!ABX zFf0HJAJhAB$bL9(WY_5d*%}}6@I35f5kPZ+lLfuA$!x;V!)1CZvk@;L>n~W!%mLJs z0gmefkLi9a-TYEe93hOKf$w{r4wYyHqFrFsQ_I2v5%r$wIPpwrcTJp^3En!kGf1+| z2g@J)a>5iM&qx?arEVf^kirOg0V%HE4q{byI{ z_kC5yR`A?oY%(6?Z!2=!9b)MYqC2;~5S(@E6eF47U3<<8-a1b5NYik#=}U0JY7Q!5 zdeu#oyn@K^-;D1}umFB{QkG}w79XwR=iOE6Ih|kh4)moc?_E{}`q`Qe>h_V^{C&K2 zK0xTu>_^FdLfKQuVEM2rf?8EQ%vzz0|2fW>{Q!Ek*~rFn@MbgQd zE7|HLB=%el$&SmY!xv5t!gGII@xrNW;no^wG#2|VZ=t=^%VT0ud)UtX;%w8w6|-GU zv{|enXpma4{B>VRph`L=K~nedHyJmP^68`}lTX$1oxy^l;qMI28nd#TOq;H;#yE+s z2KrVlHNcXe 
zSp!txV*L@uyupG6Yn&Il&;81B9&D1|UWKa)3T~mJ{*EN(>!(wFvRdox#-iQ?Lu`b$3%1^$(i_{Mv^k^EmO+r%u?ue z7B?@F+BNtm-ioej2a1jvCH4wamS^BJo89{CH5v9Kb!3ty$!ke6z59R|mpCg`=FHsI z?I>y6x}V9HiiU7fc~(0)cC>Zhlka{AMbrKKv4ex)n)L3y*=`g`r`}T`SeMFKK(!o-NdnG`v}!8FbJyQRm*7rJlBl4;n^Q zNA3^6`yZvp9rbr-Ft`5ql~l=5XOw5XDx0|f^l}ZFcvtu54;%E=qukUZn3t3yPoJUsGw&mD~z zxzG6mv~#W$?VKw_JLgJh_pIM8+NyTV$2)#btp+Z41Z%4-rtyqgvF5NtYO`t-jE-dnyAdph$##mXXgg@>xbc%gDFInIBwwl1fC>Z~=AI7nFI` zmCC&83T0k(r7|C#s)XkIg8oXhb{SN3GqQ&*NrmX{TAQB^2CLECsb^<3Rdi(%^wBIE zst$U`%zWb9)a-0xUaIkgC8_pcaFjU<^+ZoXvp`R#IWGmp@Apqm?)dHN;{GG+-?y>; zkz`Bu;$O(Z`Tn!ho$0gF3(_ChkY{SBZ5{Y*i_C2;>S_JAKw9qEUwC|9Z*_U`q?CKh zZvUA8H8fO=Zmrs7z+ag2>35Eps66 zGqLnvY zz)W!c&UEvxba48!>BRjW+I0=+ec)jSTw~Ksrt~trxG8W+s;8~{K%QLofHJq-@G~CM zR>6fDF-(YAGx51i5MyGt^gwJ6l&z4VJ{j8urF9u63E2np1K-m05>LF6D#v(dn%R^u zH(5@RVQ?@qSgo+Be9AajuIUG#1Q2(i+88Y$vB6N2qRnv!TLz1^JZu*oMnPEwuX~LO z(MvZ%qr?<-HBqGYu%`BQwXUpzzsr?=#~lCiY2})Ad77s~lhnctTDd;ZgYwDfrn03S z`O@xu_qnM$b-dZx^*buF>z}K{SlL>74(i{pf3`A>Db8jltyF4KO}{qzQGBr_7W#PR zx^HbwY?Z+fX7*$lNGkV)Fz6;>F0sG0{~5Gp_o+&|Xo%MGe@>R}Wew4qT6knXl)Pw0 zhfb0^T|Lw!uUw>uqi~3^S|x@BxjNr{J50)o$GWw`)&yDc+RC#oQ`=OKL_YDm{9&Ce zNA!r9dM9bk;II;0{`Vb{{P&~gx^p?hOhSw8;YLC50Bevbx)#Tfv8jMXO&jmlnc~5y z#x08ZJu=?82Hc`1ENa?#hr%0(MO84MO<@oj?@$NCQK3am8}HEb$5FKwHEq0OcOOSJ zTGX`hZe66(t|wqo)5g1X@gALkMNJ#;&{W3>c3G4g?=Z~9;2s0J@ortDjV6yi1H19g zx5ZY>Ap^Vdj&_U_88)yR@22*(1OT^|_Ov!X*ZaX-_gK1|O&nyjQ;-DoFWvmhr6=aQ zx2D;%q7wUycD>f-+m<2)Z6n?`v7)nZl*YiERI|@oLHQZvHi;bSr&*+sYr&+qo7j*oXTqv{Vl-QxbxpoJ>tgzE9mcwB>33pk3$x8Y{33z+BC9~0?45CDARa19olb=hwz+Q z@i^gGJxx}Rkrm1_Bl8Hda)9a3zU_49HwJZVUU-4UUC`}<4uBY`_Xy}5V58!rN-_LQ zOHeoHYT%B$WsXBDV$4$-4Hqdz^lnPDcUV?+E2Gg8p483)zKy&_$*X9JGxCU|DP*lt zGMK~{l^+kL@XRhKUxM-;D3RB}gEEe$N(C{E+D2-^nytn7J*NkPxY=5q42zO+x@vkD z4jEN=Ss%SjP1g>>Ax+l~shUm^w6*z zZyGhq$Rst@j6+5iL8)84OMiADnO!ml3DwH3hTINm#bMCM#bXE*O@&r?Ip}>SgkKnU zLZ4C&X~YpHoSJOdUI3XMk4PX)J3;XVQpk31mPonUbn0l-QlIA)6S=etLB}f7uLDWO z2uTMqTV+_H0u9M_ zIhk%Cb6Ln&x-u!WuVuW(!E4ZYk_TZ}<%o5VM*lND+|}oFI7) 
zlS1-C?S*|TuS-1-gg4`BMKnbu=XSx+ySfVdm>SNqWL6$xL&l0P6LPwzu#e?d;)PX9 zi={GUK>lVQJqfh$=~>vv>U)f1uCZaQDr$GOf27e2<`4BO>|>N`(5a%jEXl#hdVJHc z_a)ej(}BHkC$>~qzq49qTxzOuhFxxk8ym3qppZnY!cTvr3t?<>Zm`}v#{^z z-VdwVhCD%RSXOP{9CM`a=~>u!Om8dMvROC&R>bDx%f^YNAXn&F*cbF3r!hQYNRiSy z`Q-`6xmZs`N!!u0z;>u|n#Y1-0eHT9n^xhOz|=FXipwnt^jzU8S4f}4@N|tdy06qV zFQANT_kS|OX3dt&wf!myjAY^=36Kkl841FJH6W0(i3c8Ue1*6$PL_dBhc37yV=V8t~K!l zq1NDxz1WSlAyTm<<5J#zLHUPB<|q?-B#+{#R+VZk?Fg<%1Cd9xZ%-F~zuy!o!S%jF zV`ey7w6gE2?pjOF&MjAe)j_}Z{Rm?7LZsxe>|ty`Gw4(sDf^pnRCIsGY`N-7L~qWP zTnC1ra~spsg3YQKZ+`BEq_c-rby6x-{TVWLpB*} zM(_Ygg}u}6Z!2RX9s!G*mZ05IXH?@BH7!9i>f;0_ENWVUMjr^os5Iycdl~AH1idnr zpb1#iv;;l1=o7H0X$jhgwk6nTQPUFi%2Ch6ZtA@3aKXvKvQrS(HoA z49ytaV_=t{SH==F0Ry`Py)qhr^#ly;67J$gq~@h=vkdeoU90scvxrBCPZtGo^bC^QHSS zA2S;8mc0+tacg-g#vFD;X z>s)e%3c~Y3h(3<#81FjkE*RL<$t1owpxyz6z4W@5EV=OXT8t{S{SRCfqXmQTyLv#2lweG{P;0GY z01DV*lnj+io$|UNBMCq6PwJxGnRTfa+87ecxf)ETsj9=PhM^6@uiFDYyDU=|Y}GaF zordVLJXsKHjBMG6XZc+{)nGlIql>i?8XW+)Jx0m;_-o7Qx>2WGtEbwn&naE16{`-2 z#oWD!xx+#Ye==r|TkJt24jHR1*Gek3cN%8MGCZdDqcp>^acFpc-JdWz?AiJ7rfOmc zWgIrbptlKvt0tWA7JpJRBTk5T3mSwmgd7Dajb)m52F3q|%#Fn7x{Z;X(ZG9!e@J?t;Bh^5@ThCZZNhIM^Ws%2J* zO~mQ)r>1ShqX*lF8%aM`nvipM3&Fo)BbLtoyRG6c^jT=yR7V>O+6;wQzLgr#mb>mH z1zwUO`EE*hkl}60?j}db_F#E`Wn^mAZI`2;PMHADx@{`|L+#Tru)EFb;UNnDYGnycvbsnW%;RWDKGi`r{94xGc5jN-(Xt0kio|>SF#*;yWK!txUTQP%EQm z8m3^y$R>K{Vli?0lx_Gia!uA0Rlr4MFX!haFFu1|H2jaawR{Vw2768{EP`2DM8R1F zLR*76%(3(N>-^Q9`VE*_)*J_1j~Xn$BOmoqtkMmGarexXPmx_1DU52jUP}2gu%u!R zP_VGq`$m)chT1^R((|7W>NXPIsN#EVQ3cZ-(+typ(he^we~H8-nrxy|cscVp^20z> zR4RWHX(lsd%A=`}J)!9P*Z%F1UAP^wq(h?gLJ8^E_*U3H*rrcac?%U z!sL`}!o5r~&AtN(`uAfKZY{kiv#><&YzY(Xxl{JwKQ{(n%fyj+d~%YVv$hV+$aXw} z@+1=!TOREVGRWvNa;2wpsNAJX?2^KJac=$7l~{yVMD~%Q*-X;AO@!JERa^I@6!y|? 
z>orq)4sE@RF(;LqWE7rlM&XCy^HO2U4OF(_J-kfWhUb=NeU*38+9C@=PZm8qn0LuY zDs%8+3_LAu-J@m;-o`#*$`+iif5FYco3rc3u^LdP87elbV+c~nt6bTV?zYh!>5;;WkW^k}-)h-B{bv)TU=CJUvH&Z3SW+SyglSLNgX>0NAFmW$hU2pd@ z%GI;2y2R}1<*E~v-ZsMd#AF`5$hd;k{HUHpiPgE~s^)xRtgT%0s#tBDEmtj*MOuPD zTjHfG#*mKKvV3}Z&K0uGWGQt%u?@?`->O3jRSvpVfg*>deQvqw%UNn_wh|i!!{z6J zS*hmER9#T+VzySNAsz#lZ5GwTJ7|%*;5fH<{1Z3@Iyr*&irKJJUXq$Pel)lKxvE@Y zQvy>D9d($bb65+}xZ?uc`zF-MOe)9S7y@UKPxA6kmXbRd+qFz#xAS**&~aywJd&7( zN`9)3Ag(6Cym&a1{tjNM5JBcY$=_252irAcCz!D%ucs;@I)ZeYz;8O7`QQ)W&mpJz zkAu%&2fOGb9vk52o5wja03XT&;;KprXC(;Lh;lt|M!5m%ThaDyg0f1{iz-8$g^u6^ zLzqtxoOd{RcRo!{ofQD*?{M}yU@y5y2${-S$j&{25{8Qs3ilpPxbqjh_g2&f^KYZr z8-tFG%rzFBscghg=ZHq)^NB|AI}hi#a}znvs~|KV=H|m!$E{?MNnpVrZBcRhkH>nf z?NYk=wpiD($9$S@VxsPxa|_}5bAC#A{+yo^?wE6<=KeW1XUdBct<49PzMDBf`VE$$ z1|w-Ok_IDbFp>r%X}FmzmZp%o;4yUxvwmth!TBll8}n1k2zI3UG`Xepi$qL=dE2v?W{txGbb<|1cPVGv|UW~ z(P#scHkEEcn!CT9W}q7oRrP`=h(b=9KlPvbX&Uchep&5 zjU406A`6x_(PPnwOtuHbZ|4#f*z}5j+^bPQnpn6q1D~5UaB6j~^zy*RYX;sv@Bt?G zkI$6xx`Uaek6dKCPM01iq1@zc;N!Kx#lpaHnyr>I9{9Us!GDmK!9zPn))TfXlSK*sR)E zu{AYmBFUESmvLLgl2Mx}_r^itlQC7Dq^fP@nCFX z*74QZWV2I>v@l(IP>ZzGa4O=(96Sk$ym4&R5EYTTk&A0n&vs#qsS zz@nyga<0^{yee3c57x=W`giyVEoxdPw<^|A5U{9eo!qL}s!hP6rgd`Y7s%c6YO$zk zo!qKeCr7}drgd^SAjDK%7UeoQw*N7>$H1t2O}x zyH1V{6{kG{I8)l3Zho-$EKWvcOeZ&GvOdVFooRkZ2RimKzyIZvsLdPxib77hYGjR| z{G#^3l4{gDCB&#l(yQ0l=+mfJoOv_5%v`zZ9r+UClb9j0 zkFp#;+yhnIW-U$?#ZfN;6!tO(3r^*Fam>YibQTEXc58RSD4KQ?z_jK%IR=xFE-$=E zQzC?M({(vvtm?xgW`wBvl13PsQH%uKc6~_e^*HU6ReRmEw(nP#OKjEl0e?sX_Cc#z zw9gyD^?wDa`>t`)%SL&M?;$V_X~Q8S^$2T20SbGM={*O-{g_cjmb0lQ95#{&N)zh6 zus7&MkfixWtRXHLhxDKp907;qqi9;t05;78#UFvon!=_yEUwW?fDWr+Ew3H7A^ov0 z?^wNNlw`GQjuDa$VuRiMg3g2Nl9)*XE8_&3EP;nf0BuvpnyCb0grtL*r2ZYL6Ur5l zGB2F7Tvj>~W+TUAxNqu!uV93;P^2be6xI|z{&rfkDxJUiyVuwj~_ zA56yeEdo$&J%%`FW0rz^WyBEe{1=F5?5{G!Vl+g_yw6Kh7$pKezoRFpNB4eQ)5DP0 zOwkr^68wRNB52F>1nrpKtQvjT(R>VocEh-%eN9i$>_{|9bw@+u^-OP`a5U4*qmSsI zX6S@nJetRPmv94x7GlY4TK?JIh3nipO^74Wq-NW^tE8=Kz&`zu4nn^WjX=f~=`e&> 
z`Iw_Intp;#G_{to79E7fe(g9bd~|_Ry$qb1<Y|ANrCozlsaR@t~RfxWf)kHO`$-e%=&BsCO% z50xHHmwqGBqmPop@X7=I_r>N4!jU7xZ{j+)4!fkp7ZtmVT$NJ0c$@?@6}$*D<tEpe(Kds5rTCsPNSi$2o981ongxF2Ya9pQL-48vMQRVp^D_4F7o(JTxb z%!U07VL#B!$OTt1Hi8TL8MlwXvmmumOs%?8VLyXaRi&85D8|NWVLv0(4>ZS%Fi)pp zJg$gn+9MhbM^du`&9Pe;PlwupW;cZ?pB{(Wfo306g7!Jo4m2}#qI8EGY6qI#L?zr| zhuVQ=mZJ!F1SkVSE#ACMXPQSdI7(q>whjwaPMOwc70xu@ArW{hJ^vuL&Vyv+za|35 zF3YW4_I3F(q|i3polDQFX&0$%rfO$r-MY) z2!OaVN}0Up%%xQ5dQQc=Da55-BGs5W#4-uf}65QP7Qt5kRzXP8$-OV5;i8UrHv6e@c1Y^Qo(7o_j+5|gz+RK!|J zEuq%lDWTq>p^sNN&8a=))XgDg@9i*%Txvuy05L}ZsObN9QFt=BzK{L_VL?w7u7%W~ z^#P%>Ie?zo;M<+QA7{{Sud-7?~|KhTr>5J{aFAr)L&B! zl|rrB>T}`Xn()1+>knZO#+0$1{R__>;bWo=G-lgZ~x_1`Vo9H@g1llqqC(L8h$BG4>{e6 zY0*zVuZcLVJYJmkQYG3rC}}bJzN*-q=@xIwN_<=v@85{0Y1?jOapxE?$9}!??)X=+giU6u^6n8&O_-D;ORlV& z#tzoa2gFLvav8>F^~vbHL)&wa!WkzFqT134?{L^poO2-J=Bg`*U;e__WpA zx%%FPO0^TsV)-Nv%Pt{hm6zmqsa2@Id}@aHob@GTGDnAv+10`G-&lj|m~_mus7A?o zuekOjb$BKuYp$TCey5tUv&lL}e&`Wx&R6^4aql|8@*jOF>KNJb+{)?B0b`F@qfXV} zE-QFRzYUl1n}jp?dyBC~U-Y_HUwl#Z*@Rq7hk`p@6(nO&-RcuxS%^G0 z#%VxSAnX0>KIv6OouCY}vXW!X=i@#_4Z4Zk>O)(#Da&M~=_BZ0h4E%b%^L|G7g<}_r%qyiS#_o?9EHc*3om1PdEJf7Vc1?iWSKIz z`RPK=_RZ8yqf9+r(Y>Q~=co z#2G=!!O>Qs^m1Bvk|K=@wl3AjL&p#o$;E;MHbRahnc2XhPHFpfhyss8+9O zLkxl@yoO$Ojct?&cV)|SphxI{9_z?$v5^2bOc@aNh{jGL!n`UqH*g9b2oE3s zP;TJov?%-*O;ao(xgxg4%>6qex{)h@<;zuOgUXhk)O|r1gV;X8;OYemmfS{YqRnhna$TuQ1Xa99)=S!QtZE3N5`45fnZ#-AZw7~h}f zx{vJECpiVL8LCHFXXS@|sW90faz<4)6|v3hm`)^;(bXiRIc!&&+eIjTE$_Ll&+1&e z8Y@Fh)U}rOXA}2nZ^{YU3`^%scDi@t=AZ%%Ly&>+TAr%&I-aeiyE4?$kU(YMP^RU; zL4JL}WEIMlmt2ZL&#x-Yms!tQ=zP9L0tQKb0;()IGN^Z=rptCeX*Myoy7yCrU#b-~ zdmMbaF;t$3ZE_P(9jdW<1cJlGJngXg&F3QokSgzNYrex90wvA^yn6sv5bW6aVq|sD z*i^vdSdTv(gT|)<9>-SX^cXZT74SH=8*ht26@VIQ%nmKnnAq>-CZF<7@TTqy zvkt1>mM!h!lCga0R(!?t^`n1wkLhNz(tQWLpR>CTPUH^LR|RXeo`tnJt`5jyXVSs{ zJ96b*Rf%K&l06w2N!{=~!=|=7@U2k`nVw>52LJ zK3AA_aK))Pb0*KxTTb~WokG#clT^l>m9c-8U0}L_3rsn;|5VWb8eL$Dc`zYm%dq+k z-Jw07ki`*oeO)lP&D`?5t67(*wXva#O>h4w4R4DYohnpMIn2cyq#@Q700h{|sOOez 
zu2Za5m2A2CIyhv=C7Zc{i^;7CjG%5+q`!|21tiZwegB~l45qka4I{D#)}eFJd-6<_ z_w#GPV4|BKJE?>H=~@-q7v24*k%@FWoU{-yqT4!z(FLHQP4eDxr$p!W?{XCt0g>?WW5 z{si%)Wp}TkrM#4CK^*Q=?bvaK>|yR$o%j z;?io9L)5Zl|8!73tvV8w^4$znq%iI+WoekHO1K_DMtI0!TwkhJ)jC`rEpq88)Uq~g znJ3=#{ORS?7cHiA4>IVzl4l!LcNpR{Sn+OoG|R=MlA`pE#DhpjSr^=e7w8-Fhq-+r zOY{uLB;BZ|+Ds8pD`~Exw2MklxYv0eWua+ z_jMjN_H}XPzL8HD6~jHcemx%UWUAp_IKUW>&Y!t@->5>1VqiZ{84XfPIv%hn#`FDg zRAU^a0bMx2FpjQYXImCWX&e_0Fm8uqs`fZaBe!sXq1qoub;VI#7UjM!jK~MWL^b*n_@5UwWdAGahtLb)(kfx3reu z=ND-=n%e&*D;#QpH!mwlv}uil;n!x4yf;2s+w>@O(7=j5fWa z4-btXdPfx2d*J{*`YlGVxQmjU-zBDM0l*#NyOG2IXap!6psVk-iVa&>s=~SaOzZgv zZ56Yg(`f0`JH&=Tbo6XUEez$C{YhzBJj<2Q@F62RT{~u5LN|p& z7(-xq1VUC4pAGKz%N6q756XFY*3~kCr@qc_e90&w&Yh8(jNs8Nql6rmDbL6M`FBe% zXyc+%wEG-|sj?&*zRsGf>q)73B^I4wV;!D?MTgil+cZhTkFssfrL19ouLCN4k&^k8 zem@397dxa2>k~P=2~es}6*NFUV@^ncG>ROWFM?IdaUpPw16Rkzq&#IPadG*gev zkC8hqQ&t5~ZN3)jSz2M6ux#12^tr@NQ!P*p(k?aT5-+gVGSz+!H!Pu|&6T#w9#ALs z3);9F&VhMa@*pL@G^7F{HLEsJB$}FEhqE4#tgoNHa5!t*d7KWjLqIESRS%Yb{0=c% zmC~+mCR#<)yVf0|^&O!eI0!UOF0qmy$>j}teQQqce8pp7kPx$=Yx+Co{F_g;OLfYh zYzN`{t7s0?{Zp#zrwH=b$;RxUD@JwIs^6l?4nhsE{95C%hC9}Ai#46K>+~u8zkz^+ z!j@O|2q%yZn3G7!vrNZ-st(a5+p8owHL8rx6V>;S`bdMRWpSOS0(a==_f+67Re}2H z{5`~nUT!(23fx9+d1uvefiZ?G3ZEsfa#dTyGF^+hNvzqW zp2^h3ruo;k7m@39%&FQr2%B_WNW+(@vFG`dI{R61`z$GRvZ*~toqg~Lymc<%E&n49 zzFq<4_v)6QnL4~YgEIpR?{m64Y=8&j62#M-FI{%jyglx4n@tE;q4Equs^86sL(?j)s*da zW2q~IgXKTnq<+PHXVIv@Y=! 
z@uv%G5cb14vH|EC)xAO8BRu%yGTVaJvg*KB?lb!#VbFog%)zN-I|4O#!Oi=2Pny?xHviVqyNeH=$^cS}!?r%%{ zD$h|l30As2Ra0Cu4lG9hZo578T#k6KFZs&{ytVxhqwJH<*KX>Xbl_DmR$FQ{Gw(ICrD&I#G&vC4f8OvAM)Py^is!{f{#(S#^Ibu?pGjUs7-2+I4lJk5!VTsQCJW5xo+RCQ0AkeAYx;fQKo48Aw!>p&gM<@2$#UHk!9xY~z zvv##=i&n!v=v2^|>L9qYp9i7{1lMe|*gB&yIQ*ZeO3C-Bm{e!+^f225AI zaouWbu=W|WWx8^8L;A@3ZUn1hO7X>j*R88<)C_t+PDdTdL=bWtDC$IA#oDo%n@hL& z-J>&}`T+TH@92BczbRE_R@mUsyo%PdM%wpLr>#O zeLkUSml9Yq4_efTnzeU-@_Wu+h(jAS_+Mup;&ps4xoc>E*6lnopZ%C`6tydjG4FG9; zuok2XYDo!|zIp|U4({(q`j3Dj@d)AWJQ009+6prNt_ za#U9Z;#E5=5>pA@O6bp2f~59S^(pu)=m{#@pD!6IeTqa$ojJfu{lig5y;4t5kM3pn zNV5)E<|rvV^J~TwO^Wu^Epj=~Rz+JH>S@^VzA+9SIsZUUlEar5Jm0SP$PBb*n~UM(g^lZ_@;<|Uh6zI0 ztLy?EVXv|KcT5WSp0JyKfZgZvGr_jIG59Qkl08@lh~xk(a|%1&0+L;G(Y&6+kz8*f zP4=>Gbaz%(5vogw8_J-I+1_lYY2=p&M(ob+?ucO6#cUUZCYaY)ed2TI9l8!MoKmeCR&+4p3klx*Br(*7KB#!4>imT$0@R=4;>u#~5Vc&I}DTnq! zQ!bFkhcysRYq#Sr(ugaMT1yA>t#^nsyGZB#U3R?X1BNoC5J%DII45Zkboba{mJFjjZIeJAqERPR`6E9(I!@I2zd7Mp=DA1b z*G*DE@wT5&^?vnY@BQ>_LS$k+`J3w1P2b8izZjISGW59nubb-re*KgX=X_s8RX<2K zZwxN~FVOwO$VwjotY#~UmNNA@>k?M9WKCg53uq?6urHpis|va#OkEAXJ3jPvkFsk+ z?4A(69puA5L!L#d#QmMa$qO#Wzh!jD4q*F78wXDlXzP`LI9Knt3-4`LWfR2Oe92^?rybz=| z79qI=`yzCxDGD48k#OZoEjX+}2dr<#T4~yB!V@}0Bkw<1g!ZHQ(;_5}D#WPSrM#X8aa&Ra=orG zK}EGOm$)sPKz9w3ndWu2Hw^lF?vhBk$Touua|1uSkyw-d(gUp--H}p@sApRmN%|bx z{_x*eQ8z1 z3$9NoN4vs4-hEeS+rinSWZS`VwI*sVG~ddOP|iwXK$oLqt-9ZKho$XpR6Da_huw9; zeevvXR2=qF+cNc=*e&hOt$#uvFzy4I>)yj18~ScCH`TLgjt{BiN_XeETahE{@%}Jf zzj5I^4$Mp+=XQ+U|^)KPrtBcfUw3U1Rqk;`!qb^<1hEN?MOH5s98&{SbmoEJd zs3Pmxv`hSy?d-@(4nqO+a*p|3mTSATNwsI900?y;s zmvYU&D~%A~+;Yv|YZjl0k64qw*>N?AWD~b$6PuOvxRTTzDXFWJqPCAJd5-&0w5uKx zGPz_+zTH>5jR4be{TVv$)E<=AGb&UP!Q&mNxD~DN+NBA=5tj`R%>2V?d?!&;z2=Gn z{TlIV7s0mjg?4Gmvs{|8kxNq^r|15Z6msk|dOyK$WiKY0SE0qMh%ZffLlC|-=-iAB zQMZx&J^RW#Q5tGH$=@A8GSGId5{f3)gK&;6QJHf%&e!)#>nM^Ye&#^MVWt3{d;X-y?tPEp>;OLc;+s|auB?+&aT_>p#OhOpWQb8<1-3FDwPFeh;r z3)xjEb4%h6yy+H~rUdzp^YF#*zlJ%r!9^N&>o8p1$M1aKxpOWLK5f&kU#8dUV&oJJ~<0!}^T4(q=(TewrgP@~X)vyf;fNqVT0OZ=Q)*J^lor 
zjDo=(h%n6Bd<_-tkG(x&9pNu7>e_N5!NKD%(FHBhS)OUYaLt4Wy|^;95QL?s$Y z9vO3t1std1BWj`9Om%sY7S3@MR!HNHw3L1WQmq+DU-rl;*~pLCf?%>yp6{;8EGpT^ zR^tkiAb<&m=?|>;(sTGUT8r62f8mfCI?sgzC*TFIZ}FJJ{2W9bKCa^`PGks;V)ue)^#-gu8z^WJ zVEG^CfwMffA)#lL_d6<7+}GRXGQW*DdD>s&*AXrD>xkMZmt5?HAqg(}@J{JPbBq70 zpWa!N%KD8&2Y+-2B^deqweRlqkc%!S>WO{gWA%9S3ABuA(lMHV){oJg|Ec7Rjv@b8!@4En7waf`o{XIp8HRGtValZZ+i$?;{G zX0cq&xmjj|A#fxCUd_`lCpzOOHA1IfPBc4L+QB8>`oa&l6YZ$vQ#|r1k8UQ?=XjXh zv!psEIk-=WMJq!3M%a^x_PnY|Vp(hA*SehuNp6O@vvWR-8~-IPX~ob7cyMs%V89wP zJ-3!N$PcTFs5By^!W|0ZK+l3XLo%NdHRGUU$HmBtx^6GHek*^s#}_2YOI-|v;3AM- zhK;~%0V=P3Z{Tele`cq{!ti=RvfAB)n*hrK1Rh{~qmEm%hR_xrWs4am?M3U;2v3&jen&JkI^n znV#eHdK~V4>GX*h-sf=lOGh~r!-pL1e(CX+l=O!k?oM1xuQC1zaQ_~Lo!92(IlQxM z!lP)19M?{BBVBX0dpFz6O@5DvFcc&${nH(Ef6X0TE{)iDUc=6hE*@DG8DI-(Gyv{9 zqD|f?iD@&{lX){sah9u#v`;AKo1^=RZU?jszu>DhkTquY0jxSkLI{xw`gKZ1F{MqJ z=Fy=1$tpVITjTFpTTj&uM&47;{s%Q;_?Zc<;_9f#rB5Ki`Mp3YT~O~4v?4Y#0N#W1 zs=FY3lUDXIr$c_9{|r)TAc%P9IFOD~*Ti(HA5 zLUftXghj(p)f|Nn=ZIvMW@YH8{f{quXdzmW4NIly;Vbr5f;vE_W%O^ZL_jA<8 zc_eAe9UU*Q(289j#Ze;yK*l)o@A4dUb&>W!Fv+I_a5$^A>;@xos@?$*GRP91Up&b0 zH=}rLk)k`68a>N68kPlt6xm9?5^BC4)lLiP9MXI(mZwKp^9cYU1JuR4(DcJof^QaT~Tbdk#~{l|I@d5$Pyiuj)ueQWy9*>;tR)^9G+Ud6JGiDQl`jjXB~OPEA5>=HU2Kok0I4}d4do(Eow}H z+MLT}qK%Gyz`Y;F#`9zfB|4vt3#z^t_LsiQ(_m@br`_2zq}j3B#YCjbI2(QDN9AND z`_nJ;aFFzW{4x(0t!5L?@?oBgcB5akjTb@-8?ulHpOK`7$YN%#oi`a;iZ3}tH1HDw4|TuUgdr|dO%@D_zzEJ&}IXxtQJq_(mv~y@!<%4sK;$V3~l3TO>7Ig zR@EAPriVl$TadOfT_BR?lqUMdVNdg7Jwcte1#y(X67?ZWeOxUm6`P|TU^1ewBt&99 zYOU(}oYRp}i-`|N@Ts1U5KU`Gm8UUEiZ_mdHg!BA`c}{CXshLmK3La|8`c$ikndmk zRu4O@t0xR8{#K8qLxe)rHkTS*$jL=S0}aGMzo3ZF^bJMTKayr6^+SmW`F!`LNE02I zIJF`3Z5BG{2J?$*x>!T(^?$M%D^735%nnp_F(YS&c^TlkfDI_d8traHmD*bRO?KeM8z~DOq#XTyqBZe=h0fS#=RoyQ9tp@M>bEvO5nSGkS(VhR z_(&yvkeas{!ns^f8YQjQNj8_vGEOY>($PGau87swG@+INOzphs*N7!QQ>1ntE#!k> zqjR1wPYu-)8)e0#ZDmGVa7Vse{kMEb@Mc}{P-2`Bmnl8fR{B+2=`Km{ViYMoUHYw^ zj5v-C&|3ZgwkgK&vfqL(X1R&59=S;sVgZo2Om|wyUK5KU1U!xj;FS)4>U2aTWTSzO z)0}X+X(2lv?_{j;PJYuu_L^A8Cg5??LiUP(4s^^fC78)Cu0<)`; 
zmLleAF;Tzbo81Pi@GLASh5G2tnVFn9)^>4{Z=c&4j+2FRpQ ze~sl|X#|Ifg5ZDqr5Z%1@4kx+Cn5qIb*SkhtX<_DJk8(JS_%$L#0xSFGj@l{OpkU>Aw z7{pgGA>KKNYX{pB8}jZ@#_ z1`{GVscV_ULtdZvffh&52l?BIP_>VO&es(+om^Ua$N_|LJS+KPjC9mm--G4<@UP^G zUePW>+G6)t9O4)+ES%8KV(XH89myu!$SC=@xq?3Ga}*pAV&40_+8CK7v}=cqIo%FJ zeYfar#HV#&VZM%L@8uA}xV9_fV(*om!=1Z3-|7lH<8IeaDW#Z+yuxvfcmEk3i+Dle zTH4M9d_7~MC?4KPEd21nBtxtIjX~x&KumLz&#}J7fzMz!Cl=BmAD2p zSiX%0mJnrk=N#gpE|#UNv?}w(+}?)e3+zyG=t^ee?pw)`FMGG9$%m!4^N(6LSLNcO+2ak zdo`R&=9|TBOJU6rpatx3NW?F^l55j9tvhjkY00aXZ(o1sci|WPYg-+EpkUhbb)gb>48(t=cQq(`@G;H-7BA$Pd$Qpntir1nUiw4o%bwh%5P&EaQh-qI{5!{oXYj(o?Q2D+-I*`##o+LuoV+_oZZuPWJC8l=TsAQ&tNw7K zQLHWodLg_1Q902(qHheP>$gC8zkfQ2K9rRUOQvM5FvHMcftXuv`YIuP=ioKk_ly%h ztvstrh6@>S#IH;xMC~DHW)i>?gmdd3uOdONF*%o8qA~Z=&v1Nkmdtp;)qp|yKPCBc zM1pF$7OE_XT@9nV<#1AKHO%5b1WQJbil-W{Gx#d=hoRSQ0pG2t?9z^pJymdQdApDs zLC&oo$N6Ogcz5SH|1ewhBuL+nZ((9$X)9Nm)5{3C&BYUD+VMp#?2a#AQWA1}QNeTN zs^#%IMCWXTxmt)Huxx`e-v*nlk z-32wD)9t{$=GF)CRhfsYOXT7LT8)yKgMMp0gsDm(4N68_*e8mV$}95W9yU3AQi~0= znlU6k#1ttSm&V*QQ)2P2FjmL?fG|f&<~-AB&NK5W@pA~zHRqW{hbufUfeTIN1Fy)N zWuBaO1THpmoRJ$$Eoq)>o-~UpYe`~BqL#=^0{CC$D49P(h&a<2Q;o)nHcyMpvqlv) zPo6cM`mE6_SJk{D%<)Eb)&DCTMHq9l`K)qK{3e|u-@QjGQT0tM1D)0S9MOEjxXM)9 z2a2lov7!!sTZ0eWr8S~@>hnhb6@A_a>XcEZjG8dicsJr`lQL?4p?+M{Df`mVTTF3; zDv5QYK?mFB6XwUhbc7c@PCas^|G=}%{SAf8VJYU=r>6aFFggc=PuLfVqI=@e<`PF=Eul#B*bZ7r-Q`%tEx!p;cTuH4+ZEs|0Ye=!t zK)LfCK&ITK|4KPH#yioUqL<;Ff|^&G>_&hj#Igex0pCuO(kb*F{9$TAV`qJ&qs0oJ zIuv3+!1xq>C&}&vH1^Vucp`u1^uM%)JkL$FfW_u{onx(zbiGLM%r7MKv39WL=}Xtz zFkfYCvlS*IF<1){@!Vi97o0ATPz-&-lfp&78C+#MVA49hho5s(sC?yMV|RE{OD&cb zsv=M8IyD-}u_si(HowPNws`$}g2hKuP()NyHW0^s6<9MBp=;HmL! 
zgG!QPiVg4_vVeB1b5)1Ko*Y&gPj1;bc(C9;mL%ss$bWVe;5!qZ-fHjv^!dh2>7iCW zT3&o$7s5{WQ{|K8h|HlDs*eDU{$#}-&w^po~f5#WNF`lq$ZyorA!EGjBw$)>2m$n4DNV0cnqK87BEzJ-udas+lFf> zDitb^?9GVCLHXagh0=DDey?O{Pb;&nwkiwdz%(1FG+TCsL7tBz=%PksTZ!pg za$BFy>O#rERG(61xe&MH_6G^jYO*1(vz@Jpt+}mFrElUDJwBV(*3#ZIII5pofHGLL z=M#^k22oW@#WA;>zo80eod+_}4n()_;Znrh&p4n2mx``*UxsR3OfX$|WPiH;ei;X@ z<`qYH#q<$mVQv(2N>$RJ=7#wAFcus>I~Jb~hV_T(5ig0Y9S71>RBAny;X;K zRdH&{E_?cgp{Bk0KXe%Go>Y;sLA;JPRAFwO^Z;z9?Lb=OdebG}Y{9cySKaXKU5vkOu;(j;B1dJPTRJ%&N=pqnQT`tCxyc~Vz3{SGr8 zHU_(Gg^nZ}gI(F8FzNh{zCIlI|0XP`%cQqYO1HE*U*cdr9~F5ZTfgnk?wizh zM)yqy#Vb5i*?TH?PO_a@D~EvawYlP+fH zZ6J<0of9}ICnaJ>6TYEyAWW%%5FvvVu>_K|JthaERxVnG4Aa4?RU$^HW5LsMX{&;v z+zJd3ut38#fRu7;fA7y)&vTLjWoG{Ry}qy4H?LRD^E`X6z4zMpwbxpEty^V$GI{7N zTInqhSchzut@kJoG5lK3m4!u&P?FKPSfTVhDC{rX4y7{MKNE;`%L?m`Zj5rVB|cgC zEXGpfZ`}~tl&p?4bCf6m;UPSJ9>yrQV2tusGe${4^IuOLx+@i2f7YuUsb_@dB*8Yh z=`mp*Tz`xVQ@6fhRS0XyexnNLs(qMvFC_QVs?V7dQ8pZ201Mi7Dks(Nyb1CH4%ScW z5^zbJ_9t%_PB;d+jfjPN1bU{tS+!!kMc$_p=V55X8_JS7h?%B5*DBl>p!T2ak!i}! zjdd>ueNV&UksY7(boMI=j?uG_i&I$38hu*EHRxs=m-+u86<%b-VgJlmdSc^u64;$ z;>sb<@hUxy=)TSZsA8u>I?+v`*Qj@RnpsLCx%&x-NNzQXme(Nh!-9enKh{Ik_ENRe zptToUoPn4>4RRVakXc=DnxXr>JP0y_xqggDin~Zpk=Yd*Hp33YP%)sD6AokswZyKY zIk2a)jK_W^D6<5JwL! 
zpks&vm4p;CW7}e4c2g47I*bSN(m9!;-PJgFY8Rd3Q+BOUEK+UOH%R$)w}tug?^)X1 zsEAWhJl4KYI2t0NgwUUok(|TvlI{#R0J|zX;njVlIrd-Bl`Ptei5#&KY~7HL#w2Xpiz&Tq=Pm_bPz8dH%SK}zK_yPc-$l%`+$0?YZ!=0)<8hO8(8fp&gBOpRq=TYEHFa;&Dy~F=3;!Iz8O!Ab6Qkc(;c;9mJ%J!UsLv>7b3_jHK)#4|h6|oLWJea~K9PEsPYGCZ?!h%m>I-Fc|Bw=CJD!uMUum7wCM6=QW;!>hQbT zzkPhkMHSHJiNE-jxJ29(t z_aEc66{Ux)D%&7iJ#A=+BL3|mON+*Zfzm5k zD>e$CRTV(FFsjN)OL^bd^2hV%dQu)IK~Fpq>0(47pwQ{5;aurH1oiw*!Jn7wsc!7H z`mPDJF~V;;sM}Mbjj6>j%O9YXBWV08{$Mv?7FQc-M@pd0|MxL$oQMqfPnNX>2Xhr?4OOYiJ@`FJPffHww}V zL6j{2RJ)b1>=yvU%k|s3I?}yt#l=;av>2A~u%apk)Npfb6St#VvGoQ|TGCuxQzcW( zs}I5nv2ARCHAe?D5oV1TUBWLF`{^E48fv+OP{~L&*u(^?&@&GatZ-aYiCY31y6!z2 z?;7dYow&uAE*^quV$?L}wqpDkr4LoXlSxR;ziB%_R(mB}y8tOp)syVBZSwonehY6W z6kZ5g4+0si+I>#S1rsP^I2Scoc{r?)ykpMLJ3Go@66W)0ZgE9^EGJJZZSh6)G0M)3tJ z&p|!LdI7I0m6$Yy{uFt*JL*%T6(=X)F1llA+9lmESg}HPlurBGV>21FQ-@@UY3Lmi&xb zlAPs`3v6n?O(2fFl#9%JK0!N{-)N1(+bK7Ix2$(2%6>gL;Dh#;_S1jRa7X!w@y~d7 zbcSb4>AMaTxQF(IykSP0(dQa*fiQ4aq$4Q^TRg6ZrtM~M3Jk7xv<)nJg{lZf` zj`pY2XOr%+Xrc+BG^QqU7Im<4TF6m#g{VluCzH>{%nYSrM9=(=tP*A2WSzj2ogQ;~ zWQbDhNznJ>P$NU5PpeIqP|hHcCCW1@mrbf;j0%RE6k4z9*4vRH=sO*8T8iaLCjx0p zRD|c0@Yd%pb&okYf_}NIozo1eqVzrks53<^?s#0iySSY(O9ztG)$yntb&!UHszT+t=FbL`S`TDF*e7s_|giFg} zhoLX=IOh-7e3POE+=N3hTT=as_^#`K#Tk+gD6GAfGWLLuFKQ@66D=}fav7|%O_&KL ztKV{J*@cD52`p54oy@ONa|}(6wT@uU@lAI;M>1VseU@|>e?&9GZ_=TH@hNe|nSL!Y z_@Yy&N{+b5+7t76Rx)i9TZzbYJQ|u7*}#{!64=2!5y7eXBsQU#SE%|R;YYy6qr~6q z9G4~@X3=agxofIP$`-;&GGHw$?G5clGy=wzc#*Xcxeib}H^+7!o1gP88Ki8=*PkCr zZ_}|mj?0(L;83?PvQyaeH} zLorWlQC}gr_%p=M-JaZiD-1*l`$CJ8a1dwAEx<#=k&BsegmaiY^bbX*AG@LE(~+84 z-=InA%?V2#VXa3I*2ipc&ny(>zqpLTRw@M$l!0szdCL1 z_9eNpqh&2p7A?itW?585 zrB=OG{9*7@8T{=d=N>{qV^)#UFUZMYQ{6p5-=|^HrKX>P3DW6IqIB%xrr3_uQLIbB zz#63D>O75W@fKF`)Ckf9NMVe!RrjT2k+L-tqu=zitZ4D#C=#<;w8UUWv_S2^vG^H41M2TnGOcNfPjAd4J=!556vi}vvn~GbKc1w zIS?4XQQZLBN2E{02V|N3loq!A#PLjbK!9buGEazK90w46ILKJ2jlU(3*!7V-o~45j zAkM7dMriG0jFzK74o#ODRHUI^1<&50j?RMCqBn`-*F@s@kw_fBSmJnkiOe_?xj5cAf@0$} 
zcwYP4y8x6!ki_xUS}l&=dSe}s+0|Mwcyb2d1z>myFYpF%BvYZrS#k#G;vom-AixX2 z9CL2rO)b}HjRS|!k+@pJjP`LoV?a66eBVGf`Wc2^?sm`{3l2)R>qYFrb10s*%(CR^ zrLP!uI5*z_+^nu+fmi zU9HXE#hBXSItj69sf|M)EHqJqcrgJ-2QAIxdzrISb6o8?N((~qGQcn;L;skXVFVv{ z`0kwI>52ex2N-|Bb70y#BCFlb05w$#A*6z=Sz34rN1?}(ag+|sgrdf$!_#wu zAK#e+L-2`)87~G%@4Qn19;*QqCFJGpOL%&fhL)HA)+T*3<`LYRo6;J|Fg}F*gzYTedxFUv|AjVSzhvqcRvTGe1z9~W7`XUbY^LSkF%`HJRGyQp z;}b)+FdRfI=f^5sg93Z9U=RKtutoa}J!2IRh7CdQpZ$ee{(Ls_vu5}IQ@sDvss0Pf zQvGLD_8%iXpLzXfRgiaqAt=S3t$OxfP@XS8oyc_mH5QOtZ^sN$RiS9J6a^9T?sSto z>y3&}=I(B+maAE<;!U}|==dP9=)Y3=UfzGf91*e5KF`VHEL#MjZfj>1Z=qS8T|M{j zGYy}q3O;+lBJTPu0<(L2D^wUTG08ZY4OnL znH=<(A7~cl=iDF|(b?#3Wl)BpnWSI7m%{OO?#33Ia`!3-GkqH(dpfFwjsmhb9uM+5 zddi^W4C%k8(^%5U-G{|~(R7_+PqcDlwy9$`O$G_~7~&+%?ZjSe+VteKRQ?~>x?V9Y z8=<8ZNlKWidqNLrMOD3}+!R-vMf|Qi@PrHHD)~~kH`P7z7r^jDy5)?vByw0E!=WFhe|#^_Yj-FY;OsR(0?HZ{x8n&Zl)CXA`^+@L{!7REG$F-C)${pZzT zOiLKk;%VO1wOl!6!%$I{;Pq}K!i08)p`9K&v+H<;c88%xFUn+dy+Q}W&_Rct)%8~j z9STE-JT&NvD|9#v9VV2Kw~WTAy9q}+9qQ;&j9E+(OifYIsr%jXBk`{E4jMXGC447i zJQD8C2=c0c2AGXnrjES8LXFZuHm=}rjPb6i(xA%;8oszFf+5z-Na9q$KuwFWluhPt z;cjz~KZp!9ww@8B+`nS5YI`R4P^NnHE$TRbBx_g48SuAhUGq8PzMGTi&Zhi1XyP1S z3eg#Vmz{Ng-;`g-oWYp_TM}*?bS;3Nwu0Uc<|pwk{ou&;U}Yev54tB;d2}^zuE|4h zjxul$3ZXI&hw#st8qEpz`#Jf8jq*7BO&(#VOa08(`!5=}oJ0Lng-QJarUY$Y9`RAT&&> z8TK+Xa`6YV8i?F_(`dA{i{X~lQc;B(21aB=)U;84D90R?x=p-#{lyrTjrmarE+3v_S%Wh7F< zIjsh$b2x(UVL7|aBj2Kh6Pm#X2zkpofF|{DIEqtk8NuhQb`Iz7J)FP$41{AhnCOIC zVIXD%|JW=b$R|%WkX~QwXqFlSnUA)7;$LVEIP00^S*qHdZCwEOi0 zmXr6NIV}4^)Eq4iYic;a{=jy|o;@7Axeci1gaCBd^>*+kU(>_^_9sK0FX!%GJ1mC} zhlSAfe6Q8I0<4+B2L}$`x(cgoyRFviL!F$f3ItQDpV+f!&tYAsiT5&SKy27n?`uaL z$PD3=o9X4a{7*-8Ewh39h9#$-F#-4NF})llNRAnlj6wb91hF92qbI9yCoNOw3a>%$ z3c@lSyTp)cAzb5A8z`JJ>E#Ug@te2Re*D%Im!C=YQ1+v`J7o>!eyQaUU-WVpUt9Ei zSN^Ypth_ul%guxj&*!|2`1zbR=6ud?t*oI`Nij!$y<395dpUKBT{Z|dASZGZrfj#+ z74xexhk4v<_9kkiM7T%yl_7z>kc1iQx1q?>HHP-D7@(ZPzBTA?vqjh}; zgp?TGGbn}(7m3Fs^&HfWWf7%7c`P4?@t|#7SzT0>rl)8no2E?v*)S#!zaMJ!mj-E; 
z3|%J(D&~2%olIrxi`YZK=U!93^t;F9>;Eo;p=hUFaDc1qLkds}$4BWwO!S+B*t6pQ zPyL3jss6u0{l-T|W9|+vX?87c>UchxZ-U4%$+kwu6oY~gIiG>ZL6!4>ddMkYKEO59 zx2KYrD@(<=KdV$@vrE?rUTw}jj}^~|sBWCQJ(d4IwiW#?KyFoM_AHi;_+R{wwQt<` z$=Pva#pvFkW5Z05M#QMrD|9F9yV}3u`LO}LOUn+bH(d6wm{h!R&99)Evz&6EKea89 zg3V|LZ!8M)QMrjXN$-a18a5$Of|#&4l6gqAhmel#X~@$Z-$1%Hen6|lf>8G;0Sksr z6aqD>5Yn+@E%kIKw~+2j{Gc)s_jHIyJzW%1?IEON8>{hj_=}CVWl1*T8yclmc&adf z!W2LvHa6*QQw}El{ug>TvildR-JrwL&Cr?Dx-xS8SC*vomuM8?d zq;unY;H1h;_)U|xcsjN~X~JSXZ-{i@dZC_D@5y?rz}K-7qX3dBfFjuu|J>{M&0$ab zZGX~pot_w;8nARl6hfaUgw*Vj4|=Na7yS8e{-i~_JvF%oDd4>=p4^7w!KF(sRV!Uod#x`vuJ8FeKq2 zi0MF15tCfl!B!tr|o9-ge};WZFaz^(3W+BtwPSkkoZ(PZYGZs4CNS? zfhN+kQ*2xQSh#}q+@*j4FxPxoXlF+6Mh+ILnsUDj zdVffxf;p^E<;`9$l^bc0uw7FKQD$C5IvPcj5C`_YT0$Pd%17DZ>UQ*@CxTe~x%pSQ6)GS!={W7~_ii`Pl*XD!w8U7U*3RXtMf2eo8N;&_vm{Ifcf?{{XV4hoX8ks9G|_60gkZ zw_L_r@N>!!x9Y2;5Y%=vU+p_o7+)vcG*tm-tCTrLDHVROBiO1o%YROeVKrVb6IO2R zM;maKQ)#;mcQr|s8Vch?G)OsUx?Pm`$7y1<=3*q0YF9LyUQ)Z(V%f#if)dzcr_ne` zypz?P$mouvbrOm}nS9bb!)EAGP=lLze$+K;1T}~0w^XUIFw_FMUAa=f&gVufEFh>i zyaJ{15@a+wx8_B&*E#nTi;cSL&WMpPUPqL-Q_A^1E29{xEmFjQw|h3=G6^9cEhe?v7k3$f51j-H-6V-Ekve{K})&U>_ zvZ#(jvQ4brqYAH<{g4UM$YE)&NlWn1)Uk>4&V;<7X4AQQK;(r??B(O~wJ7a`tnb)F zhS=GZWZLqK6OOsi>D++oU}N0X@j+GD776=iPH zG=??4I^y)-FoNwMN8w|{=)=R1^nCO=ci z;{dVi7&vUFP7AKzI=A4gwci^{kjZ8WG$)lZ{@ec9ezW>*J-Kv$otC4g*;W#&&Fq|8 zaPDN7OQ?n;m6*0OP`{H7&Y-fH^BweVOf_a#q#C0o{T|u~U$Dn<^GWBDkt-nf2M^e?X#+siiTf(YD!xU(s$}fg&NA#Mea$MU> zwJxIS6?bbRbzbv73mf2oV#tZ`=IKq_s}A$wI=HFCaSe2s-=mZ zH@AAQYa41lYQmsY@p&)4BWz^6zZXA{%dN+80?X(N%ER1 z=?+0TG5qBH7S`jr&YZ+u6XP+Sa$d@%W(}mhxJpfRV7rB(P0l6b?X@PqY)9nZh|iCP z=beZ7*sJ)f{ro=D6Dnm&8r!kJc;f|e3d5Rp_5J*R7~&WnrF;b%FHPq{Ue?<|k@!=^ zqdYSX6!cMrLyh=Zkc`GWqpw!5VJ)rzh*{zXr>eJT30)v9p(Ql{aRj_aQ;AYa$Utng z5^yB?tOkD0Z2*Sjp8VlRVI^)by^)I07sa7Gx&;`Hc6M;0tPr!J!Hx^EpnY@}AdYgO zuR!FDj{oXNLFhOs2z>$GQRENDTBpIG*4p_DfGfLcXr5Q=2`uNe_eZVN;;^vcr~ok6 zsK0r-ZyeJPdWsM=de%WXjCoz{-XNei4tchm$qEL#_RZQsKO7bWj%Vy?=dVi)awOzY 
zu8x$0%nT)vmnbJt#bNEFYLH(YBzEPf=Vvx3fpTAgtl5qsTZ~s`)6?F@85-njdE)6%ZK#ub3{>+sNDKTqw691R5*it+H-~S{C!9xbDx07WY=?XU-xvgl*T5373euomY#8?SBQ+YZ{ zvUF^K``6*KrXmz7QCMA~_E`HixC=k!Ba9iy;adb5wacQgUA#_e7jHvRC~s{KVQZnq zR#5I?s~6k)58%m(V|7$Ksy1H5uxX>V4x89Db!f(70?wOZn}oghdsfKF9HlFx`2PmS zVun(Y*+0J9Y#nY&lk zT+Xk|UQU_VrF+Kg!cdCnDOc-&SH1`3Y54HZrDA)`1Waz1P@J92J#%@o|IAVdRI>%M zlyo!HOdU_5E|ST2eVi6M1D9lxddLPMMIp|Zv;fmELzZ4-ZDp7eQJ#>N$PQlpgbhVI zY5K(#y`?E)DkSRFd6pxxBA!sBTaJYs&Ger~hOVZqo3BHb+x5y{8X1~vnf~9X40!qk zaoLY|Qr{vilQF&LBJB_<3hp5UGcaiyas#jF{-bBGJwpS#K7tJ@Q__cLwe!8?N)5UN zm>bvdRk`VW{ZDxibiOpR;k8g48t>QKA;S^{i@94tc$vJ$Y)ud(HJ1{Q6>nyCkbfVi zBFZ&erzrK@pUgcZLo+y*YL{T+aReJrVqx)i6z@}TT6F9vlGwpDcQ;|(PWmXFd6lYu80g4CS@FpD){XOvKZv^xP1G?$hHSe zmXnEX?!&w#=O0#9`nd{`%^$0GY1&*yT2c=2Xv{s5qFI;mn(Wz$_6STry_>BWu0Zn+ zeFH7D(NNHJt^y@Riv0LD=g2@~y-xd$>+BIHMH3ppsNyeCemPbMsAuWRPeyNb30#4^xd`Pw+>58j{B~NH`XSF{2&>nd2HHoEgHHF^_?^sR=_S zqNm3cHAv`ILR5|sG(E3E#<&K_fRi!A#Mv~#0OI9Q>Mq!dFinNWK(TBMV??kBqQ+y2 z8YE1MFs8v{ph;YVgjFt#5k*2dra>|gYLM{sP}3lB&aU$B^ib0v8E~S4&~6Vk4HAYy znD3y6ng&TI+5~pULrsHZAk-k?NhrffBT=-#ng+VFW0x6uF&QP*M7Ux?O1;7_o%c@O zoZd-)M3gh;sv)0)`~avjzTIY~rbGn@FgKZDMPn}S)e~<~C%J=KxjM;({x+@a z?}O_<%^Ae3&%UoI|Ao?YzPWVM<0w?Tl+J$@yGeF+LIgZh1JTaBUS zhK#ykkL6Zuu}skz9=H zZ2f%cbsc(oU#1$aJYv>qRKAor;>Z6bvZc`{ zac&1gB%R@A$g-p0iTF8CqvWrc0;WxJ*=HCEjK4U;COv($#XJFL=7)zb5l@q%*I;gETNgH6NAngaSEMTmAUv@C~I$r3;=%m)g<_d1? zhOe*)9&L5&^+k}fZ4nHh4ZAfu6Mm(`)arDtTrJgNHD2qYGm-Av;lleI|&1vwyH4FG`G=}GZ;FC`;aEYTp-VoZi~ zH}PPp2V=LJCPJ}9n|9av==W(|%<`dKdMVG)-TRwOxJUrNbSuA9uHI8k9lLtW(hC

    UZV;`)^dB8quNwe5+qjJO_jvWB8e)CtSO5s zf?U3hi-L_XEBk*f(k)eoVc&%>rDC`7mJE)2QtGbDpujZSb^*Zs^=E^8X=vc3V5-dM zk>nnPM>7K$F&I|~3_TLmNFrlkROB$FR-+VnvLqaSD_ca~0_9JgAx(02n@1s9Ue%30 zo5WB=dJ?v%vXevT;;ryg|GGI}{~O&DEKw5Z8)YdPO5a*64x>TP1U8EsC)K$I30}0> zU)5&*0*J9})L|B))xcjYiZ$TGC}gboVo^-}`B8}GzNMP5C}#X$Mo5B^rZ)>;iov0c{XE(X@?G7&fiiv7n zxv7~$Sni3r1qqq3*D|4FfyU2VV3_&Pz2%?QWRJ_T^#|GBNM>D#ti0g$i$bbB#D>%- zsWFL8jg#s$evpy-;6g|~>dB&zY7a5C;9jCg_SFiKoyQLf7C!J$MHusBQAo9i7^83t zQON3!!PxWhHIed&P@NzmsCLy6se_P6h9|F(hK z;VG~#7C#$XEvyHYV#Ku_sf2x3`?rnfxt`*JER~s`l{Q5MAyf(~(n^T}Ku`h3VSJs& zr3H>-lZqeD4{8)Op2~TRHn2Q|`V?ouEuPF6jHJ9uT}@){CVFCLL6SHMp=u3QCiK3D zYRWW@BZ)WxubhoN}(iFC&{ zwwrkADz+|pFKP94iI>oZF?5oxRP4DP(o?`n;b~tDOb??JAvAJvvX;I=;uHERUyNOkd+^st1Qb= z$UrGXRaR$WhD9>+gD9>fve<%Nh9QYMgKiuoj-{}~d8H=0*J;(!A%mT2Gbk*6=d82_ z)6f4#d&M#Pd#OC2QV_LTL8aEN6k;4~T*bn#kjcKt7NK3*EtokMPO;NMV~Ktfc6I>9 zBnX4G7X1~>ebAXf)n*?2x4Y$(sht^oZVdY|yUh2I<8<8&a6%lXrk7?7Pwq;EY8YuX zpZrE8$}W{MgH#|KQyD_B#tTk(SktreTl$q6i1ai_fas4la$svQCqz{Fa*T<7zsMp(?&mQ8BbkMdcruzCH4!L*JpFQM*cs(5DF- z3d;UfA^9r!w!i!)J1uW+r{5mTl82(XHp(7zm?Jkn ziP{x!KfvoT#1k*W|Lp%<SGd~t4&U6zdaudr!Q{Ah~ZEZvU+8Kz?qtJS%QLEmaq{x}7( zDu;?D)-IhNcB$+nqr{x#9P)l?v3Fsp6(>0`Xg8dWOxI0>_rt)a1<}o5AE$8H?$e%h zxw^O|)3MgL_^$9VOt`oMV?2dgT*kC_@BueA-SJFg_1;X*bB%ZiI<<5XJE^*nyOj)9 zDGmN?qz7_;^5s{|G`JYx+H}(SnV%BFe2jO}WzD0^t$_cD!^fU9B@_fD8!VaFL(Luc zHFx|$p+gEK4l9viuCIIW$A&VuEY7aZ&Df-vqa5yY|V7MAa_OFMTL78+1n6ry&$aU zoFSf|0oQ)Sl@Eh!9*Oitq*S6En!4U2Yb~k?tu^RD=H66THNikW0w!KYxITgD&A+EG zt};gK!4Z=+aNG4r)JW=-QLFIhB)5m0sP!Y6{LJOj7FibY3<-2&(NZb#+74891Kk{Z zd||%!D-t)T(^U5=F(u~(`McORGy!2Oo`OIIie0@fDAlu@r*_wG0fm=UOuF)o8*6O)yR;NEYnN&Z_-TDN>JQt_JxtbHwwl5T zIuNT|^<+uzkfxr!>|+nAY4H8t^^fr0mU408`se2s<{DG2@ldW6JNwM;q_uPBjRO9$ zR0VudzpWUdM%lxQxWl$c)FW0M9WyErP!nHvADz1)=D+Ffs> z{8Hsw{#|mt&vV^jzU+kf%=azHm44B*4XZ#}wR$8eWtpPA{XOvy@46`sObh9&UV3K(g-$vTcrI>sp4JK5;KZ;EM!aO&Ac6=ni<-IEx|Sn#FQ$NQ50MS?90+v-Lc=2D z>DAFj^wu6LVo9FG6q5}Q=6NQ%0Y!gCQi%Eut{ujY#qzqmuT}J)Q;zGE{?ie^AAxU` 
z+5KOSQ&=|xB466cKHJj&ivk}x=bQGQUP-KK)ZepJ!Cksm|14^e5~9x|g9;$i^oyLg zxH2d9(=H&eg+M%ERKJAwATg_ydIG;Jmz($PTgHSiopt(Y-%Xoz1^Ub+kkt>!x zQ(dv3%<85ZmhSJ0rHTTJl6Q%2GnlK6n_-5I)xM*!|7@TDd24Zrx{2bU{*c2*Vs{<8 zX4~FMD%IRAHH5~^U~JA%*6xZ$1cnABTkXBa=1Y;py9*ZeU7Cg)B6(B)F(#c?&o|E9 zhh+Rc5;m6zL_2ixOND6y$tE6j&^8Byt2p;!zpsH2Nf#nF1Y4SUQzX6IUo`A z0^+sD8*`5}nau}mlJ4~DiurRHt==NK(T?~LejMc>a6c};=oh6rD6@vHA`_^vK6Aw* zJc8q}8tz&W)%Tf;7?Bq=QUj41h%VHGn=L8)It6ZecAA=Zn0NIU52zDX>fTT@cdfOF zR&4c!sI@_r+A)F0LLlQ*;#U5jP`A{OT7Cwhnua1CcvEa#GAy;8P+P@Q3J-%kb>S_B zb_|vsggpGor$0=~igBaL`in>%V?%}>TKtll#f^F#XYSJ?@Uh~H$3ZMDjDjYLFCGVT z`x8-62|<=;2`mQ6gR>#)4;B=U;^H2M;ObSHz~U;RxC)Phzx_}YR};n6cpMDw%~4!K z6xZN!aPF>-;##7(7LS9){mm#Y8^vWk?*AVL@vgLCQuW>wK15S6s^eJh$}bhiyH}m3 zKYIsZ{Uz(hm)E5`?nWN;_BSAtb(`MN3`E-0U-SP?ej?HvC7@y`-wQuNPqCn!7s3htkUED+El*54lVzC@)WbjS+T0vGF_hiwIu z>TfAJXmO$b9?lCFO!d#wF8?e~K)r5>{ImSgs-OlEc4yEE)ZhUlMlH38_TNyA7Qe|u z3vkaUGvX2_BiyIyTE&fNJ=5g#=ONtprd6h94kC2YvUmc>PjwS211$`4P(qD#1AAT?`qT?GB>wpcN)KSpA!NAV|(n!<5sbXsO zaIVKz#HT*Us9;cP>C~vL(3oOy*6=h5KhFjEvL8wyThPeXzy)?ezV?SfVp@>+`=m}~ zD{6^@w*yH{0$$)QSV&?%@|eUzg#3G11!oZ^p}Q7AiMPk}ojAUN{;U9kwNT0_Bu9AS zFKfaZxryUz$Y(J^{=KYJ9v0vax3`>*J6^!9cNM2TXWE{CO>4 zgn)l9>*V>sMDl%!(X~pFP1a@mds!_{Fo+QojHrTIE9s9;^t2J{?`6%LXAo>*j!dt~ zI`DDQt*$^|ttXJIptBsKfl#i$IuvyzzNsAxnYtc8vYzf#04_(GU0Aq3uo;Ma8_OPxc>cc_6 z)H;J!=WmV@Srq8+oz*p8EjDB@F3u;TF@p(Z`Fn$|51R8i<_{#Fk0sXh2-eH#{ z66LG4ntu$zK2v#e+*@DB#~srd7mf&*)Ypl#vZD zzIX7=s&*I;3JV!<*p-006AJI)(rE2DeD}h9WuJlNXvx(^+CTws62+s5jjNYb?nOw* zM|4qX7)aOFd}bPZ6qF#Wxs@XRwJ5;mTztF8=&gJ#$j|v}v7`8mlMh6BQ|?)@e=2FV z6jBwp@VYAW;j$A7WNkz#@dD6(lu}VB%ZtZNQcA7~WEqTk+$5#6HoBe9i^ok;O3|7? 
z!6l5g{V1iiwh~yLagUp%lvw)6%Hp&JkZzJvqSd0f8jqW#l-TH_xCW1#q?FLxiQ-y3 zZjw@38%}NT%zB(tN`8yN@aXh#_u&HjAK$js&fZ^DVuYz$(2husJZZv%6FJzz>xqYdtQg|{&+&YBiGpRdF-g`!(oN& zx8U`OLaIH)2iiYS#UQ>Uj<9Aa_9T43DfK9cG&n~!*{c1UB#ghfB{{c(Bn=gjPJm>~ zW0u4<0)lG)CJDpxLB+(ELsZ2Z&Q|SgaOqaKq~e$d2tEii{X#OTTl#G6OL#S8Q#@V0ErYpB8K>9yr{)lQgvo2 zYE7gZfCULJD+=LcqY%;Q?wNue$zYpCLsy}ns)TRc^{_O%SP>+q=Q~8Ds^6L6hdl+ zkb0K%hT^uzl5%LwQ?dnnN}ur{9ED6>18Pb6vk6j=a)X{!m8%Y((syW5MjkAtinmEKpe51_kktX1wQiF)igqO6`#U5Vl|6(VNQM)z zZ4AQ|rn|7nG2BU4ooK27pCrP0dV2fTaJ-*0~kNL zO&6at%sJ|L{>Q=SSz+|7Frq#8cek+z3H+E4azSCR^7qtP6x$3{5`4OTsV4i3;UhV_ z(9W%~2MQIg7Vrz?CM2WnqiPvupco-!nQG#0}iZ4 zF`N8?l~>VB;>FdZ^IzvQr<`g3ZIQs+cz7o97C6;Ghk`~Z|CN5rHds}tW9kYrp{;2a zsd4B8o|yT4Az#0SX1Y_T-N|SB_BVH&8UL72JNadsU}XXI2_F&p&lgn#)NtHB4+k8L zooK&7=*0h0?WfK$w=HY!{3**7PYXe;eUR~-8?DDcrfef`uYk~AAtXBz?eG-#Q!|-V z2=MlKlxeD=S?sm%S+NIMx0%ro!NJ6sE@ck&EBC5Z@6}?hGlrOn(Ws{KatOt8Qhu$q z5=h(!Hk*abX2Sm)vlWifCrrI0;{$tlI=>jt?ukV!Rg1)aKq)4$m&EqQr zyv#%BWYW}s!v-F^gh#T!(NqL>?%q48A^@j&hy{<;U1tj!mgK@6>l972jb*Qe{B*h* zBNC@AXdOvUAWf)jI#8f;*9omfwke1B5Cu2EliH}%raPX&;sbo;RQ2z)UaAzGfAXc6 zj(7%vziA0{6AF{;`OEpaE&1}R67H*>+=crTZdtHMCcd|7EYlOZIL3c-3`?l1tpWB| z9suBRGBO2%QOh}W;v&2_*v!&dM#zcjtO{s#nGKJcM9*bnk7YVe(M`Jb)@fqdQQB*) za4v1O`#3S^JDLroW&4xJKAR*K^e3wrmyjRk^UVfNKL|hrt@(gl79eDok>+k#YH@N4U-|h)kfVs zl_r})q_AX=(G>677F_&wl1Bbv4&oi2C6Z?rHRQX&^@k(Pu7o3Z=w?Zt@vEZMj`imh zOnV%vZJ_;QOtQAh${Z4IUOq-p>pvT@P$>^FkGQD*kqzW5Vf?B?Oxs{dGB;!VQi3M! 
zr%M37%LVI*^pUH-AE43}xnjxGy%6-?3A9Au;J1zcGOR}I7EUNUqz)ZF^d9=eFk@B; zt@<9JQ{Nln#V3fJ53hko=)2|{tn$m>FeL&EL!w1+h*XG{H0I_sP{7R3!N7!l$ZPHK zp@$g?f_od4MC75jsw}87%9^t}EQ?PHksS>lbJPms!O9Q8N}P*4|C#ZR@+N;RbDx>J z?Wu1{Zj?@OX=t8g=T(xO&u)s{(G=UBLh{Wup+yL^r9i0_C|V<;^IHlF5uNH=%~`Dv zof)*6x@oC27Hx)jW_bGe@tszVrI!T*1!rWo#+Wdz_>zI&a)}v}m|W9B$G95yCFAdZ zVXvJMZACkfUi}IJ9Z8 z0P)`FF4BM<%qZ<>j&X_O;92U3ru!7CdV`g=#t&fd6}xFyGBed~AUTj}C9zg&0n}Qg zg;L4NxRpGw98?-p&wiRWKt>|ArSmp`xo(IUo6p`dP%+K*Ml*9ZYc$z@) z;}dUc0(V0|hj5Fph}hIR7Xh&7Ab|05auA6VRxoT#Be}hG!fpvsiU8>q!(788YDTaA z`1w&C!>T#!oNgJh5>?o2N-k!okEtj|@br&ixYJBS^r~CTAIYPTr)&pY`fmvaKKe)W zo1WNZ$<1U7u+3@dJn^2ZE@yEwqGc7!}Q7E-OroaSv!-s_R*So(`!!~tNT z)@1ZEY)#(bk4E#nThP5sqqb?n?8bNXb zMV(iSkqJ*fhS;*KU3u7`U}ig%neYLMjTmsq%XK`&uKLz?W?RRe@G|K7(0*wgo2O); zqao)}erq?%=+DI`0o~lORV*r&@6cx#%QviUfwO$q+wxsHAn3nS-X=jp5Ji7zSzg_1 zkW_1?EyiWN<1+zcF+SH9U$!}cYRz5hFRr0vvy)jF#WTx2o{66YaQuqfQ5ze?7SI=d zZ}}ay)gIGsvDsO}?2HEkH~xkhxNx(@Ea07^iM&$~vsN*m0~l1kc9rHW2Vyzwn-L5g zeFbs9z0%^guOSr&2^5KlnWLCdz-`$N*C)E%F<)QggsDze1JQt9!O3kTf)91##$W#Q z{`swr@>>(;$FQ5s&p&Lv^77f1ynLEynOf%E%8vz<5h%QP+h`Gnvlct;8dGwFed}-_ z6sDlrm6esE6yu+%)z;LY_3y&>(J=eQA6{tfbMoKFSsU_Lprj_&$}=}$XJsp|MA?7> zR4?C)V%RoeGEP@Y#tzxy&?rj)O^aFTAg*LOUydL4#$Gjjr1Qw2G1N6_g=REp zqIg=w}d38jC(=?&o~gbgch? 
zdZ#S>X!gte4@d1~Xc#Wdx#5}qm0J}7KpbRb0tmfB+GHJ}@*LG3xqcFe@WILtTD&-g zI-{~GcegD3w77C%Hz`mJ%%M`4d+F^QK<&!SsdS~nR@56C`_C-;e_C&6;EXK%Kz)@^f}e4QbyzPDAPz$+_KMKnWART-lgB5Hnep6x6tE+@ zJ%QyP(k6QzeZ4vM5bl{U;TatFqS{w;B-LMAnR}U!Rz9!^#9Iq*o1VjB$b^VQWv;VEhF+t35VfdZBt?O!(x>O4A%m=S3u;S}aY@V&h-UhX|o zaXUf{>LrBih2>nl?-k3UkkR6cMR5W=F$x(gzE~6jzwkDCq@_a66Bfl;FT9P8%#IK% ztAs7eY)*u?(N{-OXkILegWdDg$bc)NR3ahFwnTUv-3bSSsj(;yap7%r$iOI7gGF&_ z8;UX%fncdxEDAxH2CIc+qo}M!`EB%Q@25_k7VNjtS#u-kZVUF?=#a-z@Sp|zZFF|; zD0s+%UG~f&DheJZIF%bs*4^LLj=o0ATXgi3RztS*7XNJx%%2ZU)}4 z#@Zl>G_|r!a!f-(<|zw>CUaOX$7;ig@K!Y%=kxT@`0NMN^zmgAqMI}rQILBACe1!r zGO!D29Lb~=K?iw&jndDmwSst+F)KXT-6VU4WRWNvE`}%b02AlTAXyM^jT@y*Y|wfW znrssPj#*L6B!yP{x0zVE+d2sBCM>sj8p^u$5z4}x1Cme;b5M(-t3AXFulzt+#+TNJ z)GN_@>q}rL1WSZQbr@Rh-!>tZT5j=%22q7^K4^|TlvSXt-cX?J5tQixW|gJJ)3vsU znyv73zat&4;UwC&;yi+M9$-^xkJYIvno7vRsT^84ZM#W05{a~o^6N9T!&#j<4%xSg7+_Jn3ZTwOCS*AnAG|Ca3(}fB#t1sHVg7~^&D{>^1 zVMo2#ZAFHmAgXWNv<1CaSyA!sf+d5z_c~5?~xgH9nc{knpM3#p4p-4I&An=e>VxXU&KRNG$WRQ$Ev}?N;E_EoU-VBLiaBQPK!`n-6ii}-$%%)B~T40QNRqJ z6wWZkM)iOV8{v?do~fLT{lWoGGtD$<;;(x4v1PiYrnel_zZ}N|Zd`}`-Wen&mvDJu zZjV_XB^oPqRPWix4*Hmvih8GpIo&_uASc%7oZhpKopiH<__{%MtlrcCT>hm2T)xoI zvk$K66%OR9geIrz3CuUp@m$iq0Ksm)|Rq4(@# zAAZylQd-skT_pw7`-X@J?d$gJgH!q|2ZXjaK%cMxUSSh8aj_mf`(Tr*1RBccsI^$P zBl*jM#kkR?o_&XPeTi6~kZQ9*hq_@Ds8GMNXWy)@6hO%JtWb7z2m0xl0bQa;&%U7R zBa^3b1VCQZD<%wPg&ypQEi?#wuL!`p(Z~r{K?1G@f;1LIx8*zC378+HIdre}qcq&o zjB9sRUdZG-Hr;WL&gX#)@sf;}5uY)1m|F=?)P5V1;4l1W-s|~sur8j=r7Q|?{e?*+ z`MN6I8uXn74-G_?lB0MVR7_XbN=-PMn_*t#%@d2y@4gk=bHNea|3nM z_2;b4I3*-EioH?$|EJI~&`lbJkVAWArv&&95r*cCwmP8Ct?~i!40o)qE3C6Ip3F%mH3Io8c#Cwd9d%Bg>*+NP4QOg{O z)G{L~HVUA~xF=#rf65Xq%aZ7XteUKX07{HV*eHNHRd^!Cddy(rot>mIlTFd6r$sdC zZLEj$z(c4O^I)6xOMF8&-8gBo34w=@IIv2lriqNzQ`$PMN};ze>#G(^3hnB}hC3dF zmjWo2S@e1JQT(zYOKIksRK2VvWa9%)gq?lUH2NkTTi&00k)bYT{HkFxpiH(tt3LR4 zdZAGWy`m6OvkyLK#jPvY<63*rece{7&r6Lg`<^mSgWmrj0Tt~YwS;D~i9#W(ETuE9 zY2Pq_5k2Tdx<2M*W1jq4e^SpON0vG6$VMR)st}NMVs>!CQ=0EgN{mQXhQzR^j6x_= 
zAt;cjGjh+@J%r$xx6pDEOzX-kkuS#QTm}RCt(KeSL()d}m?kYZogG?kB6Zo7FuCL9 zmU^{Ak`L}SsR=nIieU*4<7i$h%T4HH$|ga4<|-&yR}2>wJm7qDXjzwpLMM5Uh}m@P zxu%%t*>vnlIU+~eFSSgm9)zZwzVLT`fc_QBjg)7!k+K5GbQ8y2Rh!*ahZfnln8%P2 zFG%!kCf{odQ4_Aj^1c*h;APe{#a_g`c5dF~uy_rxTWqv>Yzee3wOM1y zBULe}XR@-X4MLSVD5H1sx@4xCXlr9`1uOsN6GD-FTl7TSRQgPNF4F5Wl|J#oT%|8a z_?2B66hF*C;(e~B`EF2iHO(Jb)#2YuDQli#85(fA}ivRA8ggAij($_E zRs9+JS>Q3GWPk?|`w9_xs0xEqXv;}!U{RUWk&}8Qy^9$Sp~yMRRVKmhMd2p9Tb58W{Rbs)Z);-W*)jcyW62dby)$6a6M(2Cg>*_Lf`s!k*n+>OJ(qur> zmEZEqX|X4T?UU+F(WEDADh{+%@Hxz4xdN`+S_@WQ@p0is<4oyIjjgHnl&1FPg=!G* z&6z7@`AOrnMuInat?Vr70!>|<|>@6sk&Nt`d@*y z33<4VTum0QUfr!@$855bf{OMBe0vR;kVe>CMnLlA*QJJ7;I)Yh_RSqT#jsb52?yrq zXUfGC4!s`8n3pX`CE_keJ##^RrkzXR$fBJoZAxX5Mg)Le3H01|WjdZqZ`#G3eXPtr z&mm=(EUj!142EV|zB#i@G*<6{bsCqVqf98xYwp<6wCOQjKikoS8C$|{j&uyAzA&iN zpu39a!u-sSunH|p%PzpCv1!G_j%+pLyrkTY>OYXzt(XxIlUGYW$Eoj;8q*TB2P^9_ z-sP}B;w+gMXxj96X@4m)rj3=|M2T;V@uTV5d0cbT+h&Kho*@95a<}Q^T16v^?^~nB zrmWc%i|I6%X-k_|dlvIq?#twv*^Ly?DtF=vtU42W4)`@zzvJ?y=-I8e{Y(g|t_$*| zAJsXzLZ!rZH0R6T-IV8Qq85~;&9Og*#+_E1OzPNt<_a|I@Q0R>gZR5(-9@6 z^Q9Lp$k%>HRJ?N4r9wT^v2C7Pj zfL;J}i+KF1$wWvsppnhvga`f@LMUR3GI%qukkyrtBwUF!T)E2s{1y? 
ziDRl_>p6ZnOUAvta~^2Bi@AzTlK~Dj~4#CMv##Y+=-H28lgF zVhc#5Y5+dvkP4Dbt00RrW>iq}qL_YDC4{s;P0AzX&OlW7=@3K{BcVDBQ5^ATKox;pHefSy7!gt3zZ0N3VOdw zkD<;i`m*dNY-H_avK7@<_9K~(D*G|-t!3+22>%0rLh#=VK=V{u^_nR?rIoS%vPyI;U=`hCL~6Sk;%k{|&_%X<04^E7V9VY}uWj!3@r%b% z9N0GA#=4F+!LL!AW%l$=-=M8Qxtw0se@?vrs8s*yp!0PZ|6%qQ$fy7G3f4ZYoN{i1 zp84sO`Pwsijrad#E1^Vo>#>(rl&QQ-(s^;Av?c*u#bRh+=T~-`VIRe5ok}8lm)IS} z-cvGeqcI zro-&MXkQ~IgKKsRE^YAP z*@CvvSZQLD=>LI=YU;nv0{U-MpU40Zst7`^g=EU6D5yVQf_iobRVaf?oKCJuOA%TE z``cLjUt)v1EuEj!ueExqZW~MhsYB50AYW>ifbbW~O@yjc?v=*;S&Wp`UWpDUNlDu` zP$WCIJm_dmrVHk!k=e6hO=IrWH1c;z?4|!0SEXa4 zjeYyLpT0%Su#}J#E+u)*@|rYjJg)iAPMONF39hxQaR9Fka}~ItWR>oCl7@VO|J&5C z+O(QB?Udik0risT5iS}Wp-oQJrNH3g3qQu#V5?;lIg3|z%Iy-}6o9p-_87_O&0{zb zqhZ9TMZgvMOc*d0JuQO0r!owfh@KV!gXi60KnVdVqlC@N#E~z6lmg;mMBE}^)Z7j#S`a7&vHwv%3CLfkR>7kOi8b8)qgca+ttWZnUxP-mVk5 z)F~r#yZbX3SDbFb7r7v$R^>umTn@DdKclmDDV+>f4+&_fku8-JF4 z$7RE62r-QmVJtQU*vLFeZ789U4Nbe8z6Cj=_<5ti7{ACztp}K7{$LWhWS;W?Sq80~ zRi;Sh!%%o*V^$91`A*BM$|YtX+ET$rDAHv+TfZxyP*xO8=T2Bk2K?oo5`Lw<#Iw)< ztxuB@`wVc>N~{9Z&7-S5#Ad`ld4hC9P{r|q588zLH=!~>T$QNy2#V%~029#vwWnK# zfSz^0(@l!YD=eKykPbGdrDHxlr>2NEWF_pYAsMcO&;}3-uEx@Yq1FCvl6hI;P`4qO zM?m65O?F#@CG!Z1;v{B8F%f$$9o^QVa-1(pb3|R&0vrj#XmC`(TjlaD9+>RCtd$pr z(i!@mDwiJ|($JL1tdnB*>%dMg(PJpOsDBgmYV)CvLz){TcGl&7ec0`ZJ%+?8hV-niQ%JU9)RL{Wb%frZ?8PBV<`E>@SQO8H91iN3 zr3x3CVkZt;l4$vHaUpVfmLG0llLCWnup>exUC6)M4IJ8AMdr2bhWKj@wRRDuyB;re zst4uIizVy!Nie@pf_d=nbqMI>`VC4NS+Uyaioc``ArS3bP_pjcNHC8)k-;H8f(hnf zaP2Rc599Y2%ujIg!h~4Rw>A{aUq*#O!8{8Oiw{zHUx^UR*8~Gu;2=k}@Yzx%Zxrat zBcb0n+m}Xdn3B*}_)zs-aRF3=4WI(>lF%<*kgq2s@sIR!;v;5^>92?a#;se}fEl%r z#6NTFNSqZnTTCuUU_jXVqo>0(S8z+H!i+IJKjO=Q$>+b?X*QVt&WnNM^Gw#*Oecfyb<+<#uhi3wFwJn#wKnsuby^&y z{}^;ob4_7Zm>#qM+9qq&=zL0piyUJv%hR+@wTILo7MwCn4kGM@O9t8Jgx+Zx;D?NryR z-}XME2o^SVjO$>>d<*?2@fJEz;%HX6UI(-@erlvaD2@*`<pRD^mR+gS_j6Ka8g84_F zhqE#uslMJtDroD+g8Xlxti>5Cu@P_$>dD3eP35q#7Wzv~Rt#GZ&C1W?_{RGccy+bm7TiO{57b}=w5eo?(uZ) 
zxw+frPau;UYs`OfpN)!oVBCt<#Cv!P;s^8pk5ED`(GsawunL6V&6to~)HJV6_`j!CHGNM*->u5%;+#!)-VU{0lB#4eHyfh~_DJ=?#2+-FU)2~u> zsu0yG-!y_2JTizm6Hn^8bvztmHeYw+Tn6 zB*w`gi|-ASR0E~c(RHd*9p?ey9nL)$tCQqk3nbzBA9_;i8k>;3IMGSVMj`Y}x0Q>Q z=#VG+*{DDBdXk3S%?oz2lf+R-M2NmNtLs!syK#&JApReElF+w46*}Z~4Fjgkb`8!I z6Q1mQda8wnt!@42EBfBPsimz3k_F@-<6aJWwWBlTlN^b)1LDU7@i>O}af8I=TpCdV z_Vq5`#sFb4Ea72E4$NKYj^~rwA0a}AwMmvG_mYqT0RYsGZfiBzb z#uSVFPiE1BV!tf66^YraR$>W~Xx(Kak`8I-jMVdVu8vcBIuA#cCwZsyBewR4iB;1a zyN?nPtxKYU1{$Yt+#&_NU!`}9Ah%SKZ1OLuqpw>;nVjwp)TmP(5cA zLQ@pQ!8?WRab^_vGzqlLc&#}eGE=ttc^hf9SU}bw;jn{5C?i7yAX7cDW z6e@W-B1B>7;&r5@PYBl~CPzQrCFl}x3M+Rwgs(|izmo7A3Gt-Ap*?ES{mkZs($-x< zX2gYxD+fX|K5DQBR1!iY;^PNJ*g(On%|K*_qRVlSgMuVdg4p;kiEt@0?375@Dn03V z`6c=-y;Q9vDRmptn5w5p-}V$=?YHo@pSppB!0N^bs}qdV$>1?_8uVwe6P_`FtEe;c zawTUQOlA|X#>={e`6naHJu_Jux<{>Cb_wlspWaHpqt=3w{vT;-sQOsa3fl=u{_)dP zX_Mq{(=cOzd20oT>#ecO{t}b3{@Yq1N-wdVpz)xNt>c9Rw?HFR47F=n5BU6#6fzv2 zbIsrpB1|>KZe2jdn}(|Yp{V*k+$0cJ6TB$uy!I0CNt9k{oi-^UCkMF`j?dA>)Oe+d z$j8-dkI-u`sn=douRX3_yLqbYeyTYyN9(1jXzf!y&+zedScKMqHIYQ;LhBWQ)UP1> zMCHnKU9YVqLEqI#Xtd5~Pjc~ie`%#I-(;I(f6ByO${;P@Lyl<<&RU^Tb?ZaH`mYhD zqydzvqd`GTF?AGLI(qe|5|f{Tksjoku+`fRvEz=2d9lJ<=q+jd)=;*jrX|5yHWqn)oM0s3&mot729f#$O^CrIAJDql znA?-=8N(|>sxq4|P0RSvO^o<+`seA>=C8U?n`=tnZ zmYS=io8ap|h%Dh*`0ZQMl%lSYK-`7zZi*q@v55jE#D3RNi1#hnK>BbK?u$CMXJW|e z&wxXUKtghSH3ga7t;{Z?c;26*kk`_i9!Hw*)~cFi=5FQ(q zekrv=kfj@jXH+hY!~+~F&690`Ih5(IM7Z%=$>h{B(w=URSY5+FmWPJPl!`qfLB>1_ zgJ$AM*d3Cpj1e!he8QKo`{ni?Z%B=3S5QW&TM?_dJ+AOG7^|RH%6#KHiC-^z_b{689wKDiZ^Cf5zerY;>Q$Me1>bW)ET=!DY zcLma5IwzaLw^;9cpiI)p*IAq~i`oAF6?g7|ah6s7pJ{_+-O!m}Rw`5pGQZuHEj7ge z4HQTqfr(BaLXm)hrliG!EyQ+^E>uiOdda4#6)K=2Tm2P5s8B8nN`aJ%Ek&S+$YoWs zqR_~#2-+e@f1mGpo_FTSbVe@w$1l)%pZC1yoaa3E+d1b+{g6|`JE)w}ze?TIyd6EX zQ&ZPB#ou8TnYz&~+WOQB7wpD>GI_9JS=j;UfhIZStW4d&YBaIpK1iFmfwy-(Eb#vw z&z{dOS{Oty4iB2`;}*FKm|#0@seYJVw2fPm_|&r(e2B{_sS3v~%l?b89M_|%0EFD_ zmuRnAbe9*aN_17CtAg%H3G?LSmm8K%l#fqFJ5N<1s)~Gc6(Xw;Sp{T|D!NB?Q_`|p 
zqpL7FOT;^teDnC(UY3g%eXGQ&sCp&Z%Antl0K5Kz62=|{*#*X+ddURDwe)!cZ6;)K(OsKEP{bcFI_@lVBj zEB|5}BbA6bKaffU{uF*W{87BdXwhRwe+1m#qq@a5Bm}_1oPg2m@XkT;&@kIRvvBBe z0f#r7a_(_F{8X75fLW7B&vyI_(bMh%9S+{F3T||ZIq=&OAbys3LI`&7P0;7c;1ej ziG3UKPx10_8C9zZ4Vfj|Oi^aO)Mf^UJ6{5-rRw9LHVs^WKOwn!+fH9eyJg;nXauO& zKxjEtDMUQo?T5J?m+{nQhvas>K$^)VS&7QI$sTQh?d@2(gAiD;-{n|kQq^F6wc2*P z-hf2{R@>~!kPQyK%uJ4QDw}F-hwCx>6m4RSXJMkzmCm0=CaXo;&ywR<9pNRRS6te$ z0tfvb87=m_#okpPD9Ay}HJqz#%N3~^CrDaHk2b1i85<5lK(TzNaeT5k?BFb@tLH5? zV(T3v&O#iP+lqzLRjF=J)wW+iZu3JmxT+uZhYZ>(TR8 zBD=rk+&agMd8dnVl}*QBW(Otovf3fWl!Z+oV(Zz zeloFQfAE=PYyXEC$8vY3NNByYR@C%hmDXu2=xn5W{m<~flE%Hp?efrzxJ@1;PxaDZ zF`7__*NT~NoX#%j0U>7<2GMb~|f=U9^W;d#$H z{>xocFYKe$v$RrQtQScW7}-}7ZA9ohYjrS=fG;8#j^h+bDR3PajMiJv5P8%o9)m_P zO9U}Qgh@xZYYd|wb6ue8a2RD=@`opPknYVh7mNG zBcN-|m4M~JDkOCNDVBHKMXt|v-kFB(w}>K%5ZFuZvM$ORKb+A{tt5O-Q-31-D621d zIJj?68i*U6MR+IC1)H z{*@$%O}NdJk=}lW6df7R28SGyd1yPm8YE5+-gX=~9B=_z99VGq3YsaEC*w6!hIqer zAK51}o@=Z-<-2*!oK+7JJDE2-q^Lh-kJ2?_bDMcGZZl;&>fC11j~XNTRo82r(Sh_$ z9pKQRYWyQrtb#!zu*P=yiSGC=erDk|Q&x1&ZKl&FQC`$LPB(Vbqrl8I|G}g`9qaf} z4J=Z(nGdHW>P11d&3;hnPG1H>jBL`osHrh75m&F~3p7906I{0MHxL4sp48!T;_PrE z4{dhh46g4}m5^?S>iSfo{=pzjU~7FUsOC1)tB)at?^7>VR|@ub?J0Xz`qbX&z!|p; zF6m2#9J2vEVdmUs`cpP4lJ8W1uuY$dQ%~k#D|ysC#a(oGHDhl;*9ovlyv=oe^pd@< z?_;@CyF`0hbbfVjC05*I=#Q#X1wHSe4NsdSGVh?ReL&vdR&;~z%)mWoI*wJ2GwFvj zzwCq#w}D5?nD4vQsG-s+?b@`}t40WPKECz&Ztb4Uu3L-dIlkL@mreqX5dqD3WGC_A zhs-Z8(mjF3a5M#t`bqW~auUXekj7vcSsI@_>Z@|vvVcs`0@_;hF!nwA(cnDjxNVIw*Uyc zeBY%?Y9S6k_vky!c6+12YI&}UzAfT-TV7OgRAaY9`@Wn`+UL4M9R4Y?N=DC01K_T* zfq(?YLR0cwxtUDsAJKv*hS$~a9%KFaa_?adR$Uf79eXm$KNDtwlPda7N3QR~yVZ^C zzXv9;XMm*%KDN_|<-@!UfAdpTf0ypYmn>hh=uj+wjI* zMNPryx#CjY^m7bfQI+$Nh!KG=gW+G}u+OF@KA+b|m_E7MlFf-2EC_S5<_LW1hmR!9 zAxFPl()2~euOT_daKvKFCqBZbNQFOq#$HyWtb1r_NhV$t*CH$fhL_%_S_9ln1MK?W 
zV{{jAWFe69TKgBtHD6nwGabWDfL-AQ?9i%nC2>RQW9>R%XD`r|yXF_WDYk&QBYDlb}gOUa$EajSg}`1jm?<{Ea0O~5X)sBAPFP;fJ~qBaEE{{`5N;@?uJ+zsI3F8v)gtv?7v4le(&UF&<$fpqqeKd9z`*R%9R0+`j(u@TdD74%btGyQk!A?^8U@pZTQs=MvbM!B{>{E6A;%iZO`<3Y0;+<#2tbsUHP zY;U3X9bsjFQNh6JJ23%dzAtV%-06n9 zP64jtX|DY?vjwmFi(gartJySf_&vuY?i>$4Cbyh0kUCTs7Ysr{q=^-a_t$!jY3lg| z*j#;e5Iq5>|Kb9S{ichp+GCI{xqy9QVtRT68OjCh6Vp-e&5t#G^l6)YVh-xP`Ej@c z#5l)BD=uc9-kV=wU*SRdP161&oF^Qd&(RPAiwMg%*BAh4RsIv zLpy>b$A_xVoO^Imquy5aQ#H4v_08=1onoLJ`T=^+HHo?iSB zRe zg%|*{CfH$L#-q4lL>Ef+>s4=qFyyHGw+U`@%Uu24>!_CdYM-1}jgXH>*BWxh5=R() zCO$#T)!#Bl$%>2s_w3d{@CVEZ5@_SZECCbzH$AP&&|pq9lp70I=lb|Ff zc<2Pyaz;W!+Sf=X>T8X7 zg>COAj%t3V8KT~=Fn3um*5GIn>59&byu9VQZ0$d4}vd5qlpx))+@61H< z+gpsnx$#T1Mdi2JC_I$>)O!!U{isTm-8##XONJl~ifi%-^1#<1j;u0=iDGRj|ZfsJCa^&<^ zPMmJb@$eYz2^mNo*P(8;xz?>?p`EHzr#3llU;?Q->ku7CrN?03%RmjNZRr#z{--9K z6fVAzt#BRlw#x7{&2=hxW3FI6<2xgcV({!Y2c&-2-PtH9|47_hfdRpB<6MUrqo(LG zAPutQD3Ke3=p?s+tr+U@q;!iD6yBZo*72h_8@dY-_y@$We_NCVku9SOBNk~*4Q%^j{ z)|-RBAAp0ntc4&aklW$;LMuq0cP{=G86z|$91gQG$7p^BI2U)Ak3UTS)Unn$9MlIK z&P83|oZDqy{_0$0?CUrrri~zxH@9|MRjgd0r+N9)RnE$zR)+rs#CYlJm{8>vRv$|c(1RP?|Ac&-8 ziJo%u$0d=lTj|hfoS>1E@2Ek7X8SvG^M}`cw#XgU`e<9{)***wYM1i!x1aKEox^gw zIuz8M+Z@)7dNP~sQ2w#z)3mU$Zc~=#gi)c}G51z8S~I&HaAc`nr5%`x%bV(+xcIYf zTA+QMoR1=fh*JAHsjIXNl)752tlIX;qb3yX=pfHTGbN~6f&+rX-MK|#@wuEEF%4ZU zNty9#iim_Z8)X$;-O%^fro>He^gWdBzen@-hPp>|VRKXbc|w>Hyv8gztz}CR!mS{H z@w>!|XVIWUQ3CH6Oaz+x{?yp_BIz#&Cvg)iGuhUn=byOA>2|Q9-b1AFVbRUAv2T?< zUr!wHRN9*wr2GCH85uBN8P=R7t=7qG8C}ZP@$rX$zfZLcOua$|27_?Xl1P&iEpPO0 zy}&RcG%*Qx4u`WJH8BU(TdMKflN=4q3fRZ>AGX*B=E_HSz)_{}H&GU=ZT5kA zva_jmEmM8WO?{}h z56q5Nc#2j}p_b^)3aFAE*kM33EAYoruuv1@_(?Xku07e)fM!G_BpqlO4fT3_h-NP5xr90nb&?$an@;{so z>)>WV)8fIziVK)}n$lrQ-wXI)k#&Kes^Y${Y2akeP+3T6g6fP5rd6pkNTU0Y+?xR^ zT1=tLtp;bKtm+E^KRJ~h%E2TJqEm26F^{1DC3mYjGBa>Q9x%brO?A2&cQjwCP9!g^ zhQ=KY*wfK?zyv=tm36_xd?B9mYLIRoKWK?-5FwD8mPQbw%2mS49oY%>OiAlOW~&HD zB0nkh0H!s$`gx|K`C=>H{#v0~5=!ztfJsf#N}H-X2^xm>Q#zdAzEyr|>L#I@ z230FsGA>uAb>IQTaa|={PH$ycKrWc!8g!R-Q(cfSI?a`d 
z0jO3&^^Vv<>^S#7U5vM{G@_)5nRM--BmTagDv)kR zOTgld0F;aXoCzm-uL2d)o3@A~d8s|TX@P3#N+V=IfRjyAjc!bKHZ=b=WHgu9gUnqm z9Li<2(fD_@d85H!AL(1V4H}B%+wKYWIwzw$L$+dc5Guhs2z|Hj{KEIHgPJDaIM5Jh z_)AuCBdSbYDVA04RGETuhsq4o5y)G*HZlCoZ|>aIvtg8Ir~U+5GcJhi3Mi>vC#Vjj zzUEMQ>w|FO3gVMh1F>SUK)EU8uN@=W988uCBu)o&M}|&rUrl>Ln*hu4kaoqi#MZc(8E~CYbNsFJ08>>W)y7`3 zIsQ&_e5d*B8<*17U`@L3u|)5DvX!(G1+VE04vdd6DAlT!0v*b$xrS1b(p zIV!F7d+%?En-CqJMkRA1CRO$tlMt5X3h#4e$dSalz-jZ#qtIr_xEXN!Fv|Q7VViiENn6U)fzq2Xu5i^&6i`Gkb+!+lb~J4q2$9&5kn**(fAi3ZaAQmP44 zuh5*U_EJgkCL+mY+D4Les%|}Q-&W`{%`6?ZkQHoxJArmHfn1ioj6_E$$IMBZ$%2~v ztD3t_U8&!Rmuogxdqp!Vp5YsjRa=)xg~t|OiaK~fn_j=TQ>$g zo!>eCa-nHKBl9z+q3-QV>kpEVLPg%xH^>)fQ9zBBF##3@w7Oj_3P_Ff2FQqPt!@~Jlzy20 zH#O*ABun$aiE;A7HB4&sGm>E9H}s3)i53G_Wa?bsky!i_ObLb)>a+Ll7BT0Hub|*! zosqW3ng+h=Z=!3ZBoL1&vtk~<#<~mTW{P8eh7;yF^$>vK7QkA%Z4n1k8_Q{pjRU0B z+>mj(Dq8e<9Du_&~nJ7cl z+ca7YMa#6LLt1Z>5m;r}^&g=(rd3+CLs}cHf)}D+jNBcjI5yM8cU!a$M&ZH5mo?3? z7N$B2(Hm&}FRHG0}i<0FR3@e!!X5X~dN^vkcxZ>}^%<`sV;poDf9 zA~SP`$PlX!m5cSTS;PhcWJ$Dl4M~?1k#W{OInVc@-Wba%<9eIT3CKP6RzXE|LJ!j$ zgQHU%v!`RoUaDc4=X*`>o5A9U3tj}A;uy@>y&9ctvNKY z(dyYuY{q6fo#KSQNBl}YWOPucy@O>I-5gyi?rsb(G_cXxRDpqw<`EoyHEh_51HQA4g`%V7A1c#GL_lAXBeFBwO{*76bgZ$w%kxMkZ9sH0^Fk=;*MmsqXp2 zKod3{W=m)Hk6-1ECa=`gtX@1Iox)T58!7~1HwzoJ3i-ZU#}h%HYW(+wI{TnU5s`K^ zlt7(D*JR>P>Q3>v-8ydby1uHEYMm0<+;^`IIh9$O{wb4m>e#_k%hNPZY$v?M4VN4c z|I^10lua>{n;k9jEqI5Pk7)8~heUywidyA6#5*Q9`;*HXKs;ffYH!W9upIk6942$FDz5yHRE>R8NwDj1WPLLPOiS@rHMjIhvqZS#E@9U6YXVi+J0!D<(6LKSUBb_tCD`0Nl}I)&ss;l~ z$z*1!%xO5JO13Wr*7hqxGcS>?%rk_hT~eKBdrW+nVgj^@K%?;6@ST}lxCH$6pQ|*n z(K+=Yu~;Hz8w6rY?bOZ0IL$z2j$xTI0I;nbw`; zH`7guDjU1yDl^$!w;{230meG237Aqbk!!+{Snmmp6fK?G)mx`-<+nX`b4%a$JE$eH z4?i_^d-L`uEpSUy{8a*Nsh==^X-nPO(DTA*S$|-;g7wZZ%gQD<4;&lU+$MDkU3+51 zU*I9V_ugN@3%@|DEqYvNRQ4!({m6pn9*b+F1rKw;wsAdqJD#1#En7I*+3{TBEpoga zPtE`+G~^L^735FKF6T*Smt)jb5p`tG6Ga}{^N6|v>d2~RXVbE&MqOdlmN@kuqycx% z39Nc@LWE{LVp9g2jS^y`7ox#MNV#BF*MmV4)vUC24|C79i5T4GZG3iSIOmZ(o)LpK zra%nK6}R%+UU4(Or&io(iYTtjFE=caA(ajE9`hDO$WnEj;V 
z546O$H6GqBCsM0y{q3w`)d#hbu^I<7)T$rR_|b?tk2-g`Vb)U~r-$^NO790M#!M4A;p_ zM^imR)Ou7As3R{OL7@E?(V>)Obi44+NF$$_fWDgcm)IguajiMY=P(1+G&jC05mz~x zXEU)9i!~9~wI*z&*J;rNx#5!xeMEq!sENTbisSeIQi|jHh@;nD(|lB%lmg|O<*yX- zEEC}v5fB*3@$avwOnX%xT1QR67=TC03s`2F5iGUf6Bj9pg2$B#CKl#OXOe0s>te9HG*fkzlX6y#wY-%=PrfQzOJB?L6 z7dEkoIs6PeY1m1yhrhiz)V`&BSvU!kkY_D^06%o+46=99rUe8a!fz zcupAOWE|XgPX@$&AKoXT?>3n!8sG9Z7)!~sM&o9D5FyRd!aktabWmPIhWQ~haU4RfsnR7dv<7iN8O`akzqr)6tj2fmm zMg0)%sv~06NAtl%E-OB+6R%=6PN1-i!~X-5f|S2a-W!IFcoM zF`?A}W7#YrqsyKX#hs-hwhn{4(up8Z!s4tE?O zhr1^sFH6(d#NvsVuH11AWszQVzasd;=?(1-r#GC@F#W#f{%#>gwV~fCfwUW^p!>ji zN>@Ywx=kQVc&wrC7ft=M%e1qr-GzECYRCDhQ(chb^=~!wUpHv1n#);RXB)OJnAp;P zA}Z4~Q?0gJ)orCSoqtE4ySX!o$9Ehk4xzH$Q1=E}6~AeS-%kjwis#X}m|X!`*=GjE z-33eb&u*Hn`#^Rw@ju{!Oy?`wX_mK+S8b!Cx&Mwkz(NgT(`N?CZtQ!crH>EHNoiWB zM{E@f`8YsaUK1-SMyqV5;c>llVX913e1R7AY+_*T&J`lx6oK_Dif499-@^?(uZ+qJ zG?XjK;|JPbL-m>6GTh=L)ZdgsEZm-F*X&Q3I8NW$8f)l#ITNR6#Sk9XC!-qKVNP#& z0Z%EoK9L_3{60$bn#8`1iV9}we*W)MVd#p2vc!td(p+^)Q7uWv+st1ZXJkrkJdJPH znw2`WuK>%`O85OR#W-;Cwx-EDxr&{8Htyw~jo))%z66%dXby=c6JkMr|7VuPRpRfSTb$UrvG8hJsah! 
zqv}Y0#!ZnH9pEZqri3|Bn12M6GRzh&hHoP_KGlk3sz|o;y><#QmDTZ6br>snqANR$ zorCs6Sz~iQ1(kebeWLBREa}>J-vVf6LciI)wj|n~2RL&ty)Ha-h~0z6-G^8d?f|-Q zcfDs4`kl5HzisR;RAv-{Gs?DHnjw{2ZuEey-sdFro83!L+(TE+*=w6*UYw!bH1=(m zZ<^jU$j|MGY#&+eIDTL%PFc3FicB7*wcymt3059O{Y;wkx3X_E4xC~uTNFK>xcKar z&U{md-vaT2I*Zk{>|| zDtvj<W28?+VDyMjtiVt{#QoVeKaQmaA-|m%qz#kxeZZNU* z6dWMbCA=`M7tu@Y+EVS#71aT_H4!XAM*2- z?6QuJkUKfnF{o}$b62Omp_qTpv1~5pt%;>uaD#PLV*GuGtDDR!FDh%e3&t_rsGULX zowfyGPBxcr*;E;Yo~+fCC&`NI-dFlIm5)fD8te^0+Gu0#KZnWij8Ys)Yz}1%ygQph_)-D?ZlCD?3c^M9f_r**rK*vBnr7atekp-GH|1g zJqN|U@ke~;la4>uB8=I7x~v?hXsnt%Jt9F*D?#^#3EGlay1mRND4!V1$lvD^;xcla zVi~PWvpZ^%wXVS4N6qMp(xjR}5~kkJGVs-SzFv^ZX)dnXwas-;Bo=jcwc|_*!-Yl$@V-}U^kHwZURQ^8 zXr!VMqtiG|=O}E0H}r5L7gM|A2c~UIYTztyjz64^Z*Go1s4<(t&W&jDJ!cVHjioU?^#WSZ|U#RtR&rk0liNeGpg2d-{BnS$KmmkqDEsK zx8W_sv4(~@e{c~0rYK1XX z80j^HkP*%L#ABeOcF)B?351Tt?2+`TC4-;wDJxx+RGJgPx70b zIi-v6%$hE6=ES#eX;_VY+Zyq<)w_s_k?GuG4f_gf zmQqGZmaSB;Qzp%VWYsqmuDY}lt!0hspW}`@bTieZRY*?;Wo4mpF#uuiXC#bD0^y05 zav;0E9muBePF0y&LKl>r+D#A}0K_bl;s*QH8okr;XVdZDHFGQ#f4VvTbUOZwjo~|K z>CqTYGfN+iW=vvnpP)3CdYWD~)4zW+3x8F1oAa_U_hJ0<`aI$?Yd3v8Z#@$imfuSVi6B3_O`}GGl^(sVYBt*@E7Gl>U~JK5Vn$20%8dL zzYx$kA*SSmQwjObV5j@N%BQSTvXyn!%Fgx=E?}gT(78PnmS^q?+o3Y7UxQbMWNa>hUMenpvHmk#+C`C)Z4XH)`$n7r*etSbRY;zE8#O zSIM^zX-gc+3=F4P(ZG11>a2k121!*xSb?qU!OAZ}u_`Aw;n1>f~ za*e+-R&C!${)~>Pl8*dSo7;0#zTBe{{SSlZ%RPeMXz=JQma75J89ZO^5r!tndydMN zdj$WS!Sm%F!Iv34U+&g+;jJcIm{!0s8EX`S|E2`|wG!~Z0^f5~zFtN0e4Hk<=cs(W zvUaYB=bU}!g86gjPM>-1xzo=&fBvkQXU@oW&OZ0Nd2^?qb>6%SX3U#8sWUbybIfG@ z9HgIG{T!^HL-cd#KVmUIJN>K+W=`)se}48n7=HR7``0=7kPkrg>4QE1(IK_(o#@Q7 zCLc20e%_aW0onTzF(7*%LdK>ampS3+hRpO6jy?9IrfJisH5{F3nx0GJnKS0iJA2l= z>1UsNc6Jxw1v3E7|MI;2mm-DkOYGd)7j)(r6hWIeYv%c#v!>6_&dAQ^Hz;LQs#t8Z z@3Pe*KgmDVAojP^S53k958zLDxX9EyMfi*TfGoIz5&V__{x1Q1j9&OF#6Kc{hxh{l zJmmkA03PE18wQcZ_@5lWL;ULlc*y^i03PB`;^x<4{7(zuA^uGPJmmkI03PBm8SD5L z=Kr&w$iu_(8ayBm56kOq4=*gQYY%exh57$@01xGv^y%XGrw8y5|Hc3wrguvK5Alzx zEzW;#01xr+58xsHe?BX+2#6RHV;{3mGih~#SheyxI!^3h}W%01xp$F(=O_#Gets 
zL;P8?DV5dV|_9^(HZfQR_;yYqZP{L=$? zi2uC+9^$_ez(f3x-&34_cK{FZ*9P#AfBm52Q&?W<`-{V$-I#}`2u9^tv#B`z!V>TY zO2GGeusEM{O28i~0k7IzoX@Nh@Sm4}@B2`3K3yf?Yahhxjf3Q=I?x0X)QiA%KVc8-AbX6XJg@ zfQR_E1@I7m+;e$8A%1HB5AmPy@WOh2(N@Q&5dQRwd3adQ4}7UO{Ocv)!zJKHyj+~m z$`bJZDFOfEt9d@3Bp9{FAHA7}hxP464{keNT zhhJEKUJBr0{+#~F;`m<=;35920X!_PefQ7v3GtT(@DTr}0X)S2Xe!Sq#Gf9(L;Pz4 zc!>Y&03PBW@_&l+pA*1C{5L$ju>Nd1!0{=Be|%CN9@eW_CE%M%z{h^7IG-6M;5U?j zzg+_UZCwaoq@K6e7Kg7b0e`0ie8IuR`8-+zezWwNy*y$4?>r_C59@!QhZolW(?9F* z3+w+k0(e-yyJw2yA0EI%{Os9sW0d8wbdHIzw{LcFXVqi08a(@ zZL67W60S{%Q#1pJ{A@FN!$ z=d-K?{Phy>lP)RFXKe}iJ0;-hp5lBiD*>dF?{AlYfBuHz_&+QGf2#!ilC{P8JX8Wc?nlM(FDe0lyafE?Hx}n} zdI|V7CE(jjz&~?SasD@!fbX-eIR3>Y;LnwSA0kBo{tEl;?h^3xRdM($Y`7Bn5N*2|uj2y2Dq0f`|3C0)J@wE3CH{2Jo=H-Rt3nd^QL0Fy4K<=`0(fr#|8M}mGJyYc0QcKwCTB$ee>=e6BY?*y&HstqDNuBev5;ylUC45mHcAhtP?yPgEMdzNE zoiz!q)C19=>HLyU;;h-z&zd)5?yRsre!%cP&}jq-+n`w0>!-+{8*TC*_9G~mA3ky) z>T{uyfkOI!usV)A#t$N06)P>g=%d!+wX3HsHZKGJ9Oue<0ju7X};UVVh%pPPOmsp#K7?f{zW zsvnQ~ey^Ss+f*X{)>|C?yXD{CO3-(B`kelw6JrggvH(7RN*IQ8j^p1uGrcCn`~gh%ude4(f3G2*Y=)29mvqI5^d7eH97{H}Y+5kBP+ zReqY+MD$Vp6INl|BH8qFM}M8~*t90wn}3D$nzI!4eSl(Jn;rd01ks=Qba zke9I>4~yGNS~8@BHucp9cj!5sUXWc|3}29_3Gz%1EM$;ty5mE>3nSwedpPvJQqPqWCh*O&;~w?NDdF*QDm&!Ib%)N6o(rLgUmGP^ahd zb%tzS!bj2BoSvUmb6z0-qmD}d zPpU17HR)Gd^q|@@5MqmJ%bpIM4Rq;+=~-gm6)!}?ZQPi(#cv(iaGH#3%ok=CKL6GW z#KX)B^i4z3%gpcL+m^`qb!N_BXnECVuc*)V_&itir}eY*Uf?eQ-=niv9@Ncqtq(n{ z?bRNh8&=mvdAvwL@Y=O&6vUU0d6dqJ#02IlK5nG+y~S(SgdDiP1nz2Hm=R6SUo-C5 zp|b_uIzwL3cnMdcv!C(4bM~RR;fT5}$0cUI0%dbj8h-88?x!`h7qHkwGti;tUIwYL z-$L{05HP+#{p>%4>TmNk8uuznEdL9uXS1fBIIo#kz3Pd7Ww9GI|BuxCmm#GU;^C|2 zJ3jT;2^yJKJ!=$FLv@~b@_~M2+ zrL&mQ&x*p6n13s|yAgW0Pj`T@_##?QU4Wc^8h}EdVk}7E^|QO!CN|(Jy#w9TWYiBr zpCL>1iCafH`tL5Pxi5iRf1X6R*xf%?^FIXxoTBlW8}Vg3MHD%uvuN1QA-D&?&mH<} zQe(x#7=1_09mi-`9lr%4Z?8 zK-G0!O02Lqyr7v2p2wDh$*<;@BV(3z4vZR8t)5`i%Cvv}ts|}7g^d#$JLI!^nxj@S z4Cuz*!iK>Z8~0l;P&sStiDql9#uCBO8)SanQG6?>P^EtO*k8_vZuNA;@18YQ_J51w zrceCt3-A*})Y{Q;hP;omU9EO9XjGjH9(*|?$iY!!$3E 
zpdLF$RKW-@n2Y6r*U$n}oR79gt)4J~^HaPr_L1(w<`IO8z zi0{K<6K#ObU)lTVlD+q|N3?|4r#gG1^H)3)tr$~ipI-hU?TSzCHeAG>N;e~oJBA(wk&FG34uS@S7`fW zj3s>57O2QHYlV&XV!iwpKYBFQ*KDoS*zRCyJFHdqFM0J3@TznVVCa=wUduy&HrT|J~_BcOX$dyS&Ki~k>44;-o@uQKzn6t{ugR)3_fE=y?n4!(D6_?Uc!ThC*NDqqeRy8jIyA?Q2i~NtN zhv}c&2A5&Tw}5R>mcKlp`q?Z+$Ag-N1V5~%K%fLrzkK7u_EKnxs?S=qF;UrJ>Z38Iq7&0IWd=CmdJQ8Dr{?dAQth4WPKjQ3~h}XG%-=O7d;%qhW z<*R(9csRAi)X!x8&X&VD3q@P=$mI+O>z~fSb?Znwf+5fCt+`e!_VUAxbs!7`^&w1Q z?kOH8dmu|o5nqKDMoa5#hm{{HlVYAc?FZ5vvahsb@P0ezW#hCR*Ay^kCLc; zPYzGd{4;VUl>sMJEA`?Osi4NyL?u58=>8eXa`{RP7Ng;2Vl>~q_0T%~TSt_G%Xm?$ z&-@^x^4l6NW?N_e7E(Brg7QQX=Mj53j`Od%pKjv!;_NrrS!`!m3+YbDX5OKH#nPxag`Ynuesn=b5rE!!V)8GE-Y1ZjE`_X+JsLM zDs? z^%wUQ73)!QiufeM$e(w|RPHpp*O^CyJS`|bpnS)%5? zzzLM+KXL-)`2$X%JYV4i%JW%Hpge!X36$qWPM|#JIf3$gl@ln>UvdKF`349(4S#?Q zPEJyjaU+i_u}-(^#Xg;Iaejz%qWr$b^njSA5c4|f57-`oO@kg3G=Gnj^H}VgbmHI} zag8Uc$2{)>g1<+JM1x-#P1^%8qLrX#&zFeqFqfmR_y}Qv&nU`c zWUM<7EHrfoT9139fzUCZ7HBt@L<3z-g8{v%FEG%Q2uCj___n_$Cqe< zj$=!;z$SB9G%(Q+{1_)=MTYuO>OF^j=tIDc`}%y||5{HlYiH5dV2zVfwGUZtfL*=AZsviC#@D+|a;pYIbpEclrs{wwM>#5WgKQn-JU<1^%dl&E(j(U^lLXzW^dXxLh z;&`RrX8%qplGpYTbN{?qJ#>Ug5k)#rd_jJ%# zXBAsAofP3BRP`D9C-YF&Fnicwm%`5-_MR@Upllma6T&3-ImB!@w=)@3mTpN3Jon?w5($!x|LAH5Faz7NEhPVLVo$?TAkxsGucYrOaejqr62yc0Avw?ALEVUB+o zwAAxL1N;*Xf1Lc=8{m@yr=yKdE2%#&&N}kPsVC6@|FpoVqc^bBvzi{%XzqCb&4D}h z&o{uAl7c#VZfk%~IB=)_PdC8-vjKj&0ltO*dCiW?sef++{0jni=jUYyzq1}LHo#wP zfM0EZ-z=Vr6o%8!I~w2<0w+J6^^m0GqmiBvK5~8Oy+V!f+whV27YRfo{3G~CTq6*T z@H_C4_`L+85xxN*iQh>e8sQOqBpw1yjr+KRbK*)R9AB~ zzRhc$a!Jzbr0O!iu0++9eqC;PDatoMgXmpeIq&%EnxGKGoqJtg>5k;Dzi=*Z9M|;+ zbKiA&ViCuwSR%=c(fM&Df|tr@Y77-LHg2RM_zTEKaU&-m6>FUuNerilMpLpt|DQeG zsZ{#TJxeOP=&Mc>+oU}6VNisL>!XZ!m8ojNEl+0@?f?I)Jl)@Ike?8jp1kiG$^Wu}d{A71RNv0#&i|_o;_skGIhwP6 z>DRqG{*aq?*q8g1>X+VCZu|7!Ci^S!akg1uKjZ{K%lKvg zppchwO7?jnACZ9Q$rd%)CxyI?R=xvOkBtg0pGbg_`Q Pa`YD7DW7pU=a&CpH}~`O literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_backward.o 
b/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_backward.o new file mode 100644 index 0000000000000000000000000000000000000000..280d07a6fb31649b8385193b89dc32bb36a32fd6 GIT binary patch literal 90264 zcmeFaeSDl%ng5?j1_%(iXQW7kD5H$KfmTg5YD24LV1oC+OdGW;>6gexQ`xRoH|pS4 ztu-(Sc9={JOLg0|``hSt_gB}h1g%onDoIP5mIn#2K#^SxAlQgNph{C7n&10#o%>GG zs_YZ{{p&Zc!?-hEzA?97UaZ0WDNAkUkMrV}{%$Rle(!@`&wcj(<$k>vqe))@cADD_8_aEGvfA%`((CzWi{qhCSG1Hq)0{4DGvAip=e*ua-&#>oAa@b3 zryXl)X|k>G$nk~ekN0{WJ2yhkS1K!yos=rPbduNejZ-5PExl7-&x4dMTOz_MTD+&9 zJm|&VM5-03(uFmt(g)_`ia+=2Um~r#meSl*=?k%zQfF;(X|ltT7Gd$ejj7Vdqf}Qc zb?8XCxHUP5b4%&tiF9#!vLRhunQY}P*;2Z!9#7S!-02mVYVox$bg{4Yg-fQXHJcN-hZcdJ2 z;;Gk^cf&}U3awXj#Wo&#Dec`bS=^2D)WNiO-qSqdTayz|9!zB8AOevkL&X$tN zCFS&+2(&)AfS4~)%*573v{PA^t_%EI`i%c^~vvU((%bm>tVWvV#gdp8`) zc5bu2;L{EE%DHosRd%*?L3O57QqHmztzS9yZaA1LeNR0nTk5lciqkG7+ih;|qan5> zePYR#K3wgW7g+(b<>))q3TvUsM2=9N-(n;E9m!fqOWUl`GNm7>^PNC}i~kg;Q@-qokY2utLx}%AK+_fXT>tEr!uey7#Ttsqk2Dea$QXLrhhtdMxFW z3rg8!EZ6y z_9T53$-7}#6Ku6bwJF&RQMx!xMo-FC^;xZFnHbjMJv?5SiHunMxA?;kN0oyOxVIFC zmz3w7?^pF$Yo-bhzo=BV$gA8umFv7KRZc}z+Vy_6x;b%*$*jO4Ft_a?r!Pbm)2x~S4;VGQw-N(@;9ag%Z=7q)uWg!dXk#_ z3u7-1|B9mIlg-320T~{X7>cA7f-K`TM#E@muokY{hhMgQNi0+TVl3BmN6Nc?r{DBK zvv>WTRQ0o7&qwbfPUbYIs=RA1~ zOh)p5$+I&55uR1~9M5Qe0ng*|=kPo}e`<>t&sA)vdsi^Ep3GbJIN55d{K8P|6@2v< zQUj!t|HM&|9_YS^BJy}Le)17PHrBSDDL}+!M3rEtY%dg^s zk?3hMQ%zHAYJFxRbr@oJyBvw(Z8DVR5T;h?b>l>;A=nS0lCH&MYp`8}{95KS!g)2{ z9X8ZhV{0?TpDT8aPPMcR=Ey#cst$8vZy}TS6`1Mby_!?&$*CsEZu8ofTxI6^WS5ye zY%tXE`fN&Lwy0;%wC=W_TF+9Stn{w=0dK08F(}oq?oNB>u1^l&=`2+gws1J&UBjlv zISqz>gU+wg>-kvdHx&8}Iln5eXL0B^8v2bozo^&q*3fS(^c!=2$9X-k4*e!VzX|8Z z9_e|i(Xv;8le4fPC7+CfheE$t=ofQ-)$A=pzeMPlaDFFxJ^vW`HH3Z*&W{b)wV_{g z=-2H0PR@VYl5Y*&Tb+AN{{NVJN9f+++)v40VD4R^dzW)RHGiJDcZcrX&fUwu%G~=y z_kP^zJerl#tVQ+drctlwHFw}h>-wD&<<`hZ&o(umezFo$zovWEEbu#TEeutT^z_a6 z_f7lnqIYi?>FJsA>k0ixa6nPe1u?wxy`-+;V!e)@Z@rGbt6qn*h^$M=oSRLyhAVrT zO_y)o6(3ENt5~ty$hqIto^18X>JBWltv<`Qb^?}$nOR7)$%J=ya|L^xdcVBGx~N}f 
zz)-;X`+A4I6d%7G+AF-XdoUFLaLT(WnrZQFdd@G5ME$~^D0|7+niq>ZuSgZ67vp)! z@FTI*iT8Oo%}EZA#japOapKN0b3>ZERf`y-I48ML*1F`O)ydj}EneFZZAsU>=y#6t z)?Rl#`_fqBdcPd~s?yDrmmXKaij3c~hwxfGfSjaMdU0u61$hkt0ccmA7u^tq{|m z9khp6Pvg^_G?8iI@#@(^K}xev=rw-47eBEu#RSo4;}mi?LDX+f4njwqxodQi+?Urm}0{CHPNP(jGHj(*C$6E&Ef{)d0#_eikV}?5p7G3 zxd=BU#~jgTn;}}(T$o}C`KIM}Thh%SBsKw!7yoQ4B%f<7Ofi#u$w*kjtq(Dm~$S(LN#nlHapTy$!16Tg8}o2ZI@#;_7c8P| zb(HKkp!DKD7<7a`lv$Xn@p`T^!k%P@Ws{Y~=&l)(bG#^{Fm+1)lO)@d>~fK9PIkFu zuNlRgB7Z|>Vd~WUdm-DJ?6#;_fgII$#_*-YG73{({tecW{Vp#0siRsqfj4noCzIJ_ zQq@8St59P$?e%(kaL;y*Y9cTa&P&W}2QTts+t!D@QEQuAHmx5ZdTjB=>-DU=uIaut z7x|sfGuN>J+_pmj7+$d*JB$#zA8E8L`eDi3~hamY;rRvX1C&M z>Prqm=k>mSa*Bi63)&^DRRG3XH70uqRfc{VjTJbR=yD3G`HFqT+^shGYG+_6u`{q7 zFqXGKrp<$LS9FIRpU~Ps1XDIdDoWF{Uil7OwS!S4sscr#saiQw1Y~Fo(z&ZOx4Tmw zyIZFA;FPC>rS|K7x(O1m+^kfz5D=%E*d3vcO2Nb~9!Ndhqjeho^Y=ZMW{>anyo;A~ zagvIFTNH2CuI*!1zn0!=N7p>>&{-^X9%nz6pUXDp@>$u zX8H#GQSW|aoAv8-6`e2}aihks+G^ug?-~fx!}InRr>4>@!_^$hYc(W%y0=dX>gM6u zo0(#-r9|jzLgym8a*u%niKzP!E{9J4dIR z)*E}n>)rZ{%C%Oxa`-6@%)fY%L-eG|Fr-`!KOW`G>?XHtu4%l(^5+AIA zh%pN)^RM+{XM3?X{Lt#U?ccoknO)Nkvxg5@GY#P(UUaQp3I}B z8mDob=2bHt#&DS<%0^M%hw~g!)}4$?HS^d*xYQY;-}H>vqarJ6X6KWcH z?R;QyIr_KcAHG^0T(Kc0%!LOp^x76}WEZhev-yI9@8$Jx(&c3lUgHbWoOtTjd_32= zi-uoZu2T~z^qDZ!2z{19*Ih;Ev$S5#)$6>gvBu{%WeNCJzY_*fw5(8swltwT2a2|2 zQFOA#s`XxTt={49ZnEhtRoETLL>|aQe&$Dho^EmwfY&=@Q|t~SVeZm|5XnYOof0D2r~!Cspx%}cV!Y2KJRM3(Z&4w2QTi||-Aexb`JV+K^ft?5b;bu`#V3I8xu zT^M0w$x!LAbjanDXs=7%ftYNw?wQ@O)2DF z4nQBpr!2HXK0v^h0DLq(U7Ym6N>or7_v$0UKWe=tyJ7QxpI328bueNvIU*i@!tZp| z#A@>pj`#cJ=#NDQ>~YW!_L#ygz&#e(nOi)CYQ@m`l3%>nFSS+^>}dw2MQ>0FaUIGM zR_m)zN&1QJ;_^HjwLUEh)1fAr1KUv%l>sfIekoP$7oQ=rLxMM41){dFsCt;yG7QEkiFI0DBBNJ5My}5GD zRapk)DxY@u=?G)$zhgv|D?Z?pm1>ChX_1>$^i>X{^CS0X`6Gj>B7gXol^_`>3(1(! 
zdt_2rleV2obgk9u4%=~ETwY`|a+<1BtqXGM4*a!$Ybov_>xY3x0E}M*Q4qADrcm`~ zI`@QtNIzjRG^t>V$V-0Y1;1)hQpYg0LoV{OCFQ!mS33_tCzN)tr9m$xja2!fh~LT9 zmT0Sa_cv&oQdCsP9coPVgKQ_;IHeqOK`K+l7ugRCKU_)E0CA*#0+B{otwlk`6#Tb} zrlt26s>B2yE#+)-s>NIK8^ttA2)e{;2`8-M`!cDYF`U!I31xE@W zZ(&7XU4jZAq@Fiz-c^49A)w&d@@K(8o?B1 zqxrY?*jhWq^u?e#q{-Lz4Tp#w?H2hlir~HT2SJb1ZbakW)N1`V| zDbwJORMX^|cWGCzE+lwcT_rUVeU*qX4gP3p8Wa5C5IP(!226_~SOK*2TvkJ$glsJ- zYZh$4WHYb|L%*!hbL=`UX{_KiBzJzw!3OgR26kkxx~GqmrVR}xP2g>2ZgxivdS91< z&^=d}>Ch5kl+8xhXU9G!j#{o=tlKcp`jR>!XOSV^R9WiS+BZzWYIl5PXMwAfVplv3 zeSpTPfL-wgBtzcTcjaDFq;#}4CI zT3#yHRgupb=wnB~t0bQ>=Qjg=z?vi~3zu+yGtdXiP3YI){AQpJ4!5CSv-5M%2S>ip zz16up=z}A4=-%Pn9rQtah3;L>-9aDhU_$q9=kB179m)RCy&rd4OS6qw-L=LjeEdAU zaSn$YU-rR1j6JCCBr8GnOEulGrpfPo#=wu?2acqgcK>1YBh7TPfmqT_Yzn>d&uA0Y z#tyPFDpxU=)X~)s)$y$#(z`o&pPL2^Udn`@PA(8cs8OIktmRUDyBkyYI95@f1#Ik_ zz4BZ|pHDXHm~Ld@iV4XoZ9zkPgLSZ01MY6cSq_-=W%@6LhJO}K!ajR5e(9p1h}df~ zg&{DIk?2O4X4N`_e8~7-$NQao#iU7>HRmrXG2z>(!Z>Fccw5Eu$3`WwhV|p=F%iC zPkHd*e@)0K+Q*aly|zj$aV+slOTB2R_syj4FRV#=y}vT9*lLa(v~j(#pF|oyS1c2G ziB5oV1c+#X*Y|wD71O$FH^#Dfx27X2lYSOH8XQk@USCho;p|ll0<`sy)tTXJ(ShiQ6Y59e_7BiIwFqbu#$mJd2l^6Rf%xlLa=mNL zheDKYsswWxBvwkjf>U|?b1@cVpj~JP&4$%kSOXv;nzjT)sRZk_*YgZX2q1EEXT3v# zh-ejub)i5C>(!oaKBvH%&&l2P2}c!TK?kT9rpif- zrb(}-58rI(0ZoSsRJ)q?0w}0&tGc7+T&?v7F965cIzS#^wZv<_1Ir{H{Z^`JbWPUb z-i+F9hI}*LWD_OnH?Ct8#v6;NrlB=&fWeLD)*eMH+h&IcK`>A3_;}I~p@b#7)sCF- zN`}ayag-}8gX}fPzZ^C8l<92Mat>6yD4#MltYm5!iTB(gqR3WTJ7j7tN#ni>b0HHc zae2M(JS1K(iw2I`D;=|a(r_JvP#d@eB;npEJ$E_hG&V>|T#JuYV{x<11}*b{!JN@S ze)|d9blBq#m15sEz3oycI0Fm91_+kDxvXqHCxLE)vAS_CT*(bH)~Z%Zg^5Q226)k0 z9ZS2#O@omshsDjUed6PDqJziJDp~FLNRZcxxv9Dd*D-Bu)5S5b{#jZqR&Q&QtrK6Y zfaO9%Ejz5odmM67cMa3DU>)XOm}abM!X(-rqygKpddsvpTz(MCQ@FRkq@Yu3-KHl# zeZ`x@^_7*Nkp#6QDbaa@mf-5|AC=;woBoGMF1nF)YLt%F4=p@vEuE$1FrB-Uy*Xc* zhF-J=8tAgD=4!lRpjckpE|qPLK`*}sdYMp|pUUkn9?S0K5ua0Zt4q=@7A#FRP|@`| z{3ci{s=-`n3yixK(wbi1XKfAIW@kE$IiNpVTiD2GZBd&o@FO~p(@oV0=%(sIUSExc z>^o|3s4rB|o;tiI()5xcNV9}gU-Usa)XUP0l3KllqeERu49Wq;+q=|Y9QN~cP&-&y 
zMlKt+~i;8XO`P(lTbI!(oK5^h2dMHkjGx zI&afvE)Lmyns#&D$|F2rJ+^3}OJY#0cxHYiDLtk?VQW4UAd`v#fW_ zFN!}2wS!siYbpL~s_-iZvv3P#8fJN3%j|X{2w)c2VgbgY`O@cVX%+EV0`1A#7G3Q6 zC~uDG|C_VPK>3psE%S$9sca(-`~(~EU6EY5Zb9ME0DkccFGTW7cvj{U2EV-CRlneu*D(DEUU>t%$lJZRK$Jx% zFz_QxYTTBOFN!d4)K%detpvi@k5gTx?sAF_*EtaaJ|EE4REdm`dm*baGQVkpeG=tj zTPylT(LcwUsCef=Unlw}pr2O>hO&TomsbP8ke6&V7e-^ttJvJdR>!cn{Sva}F-TS} zNZ{CBoglI&V0Bajy2R2|lx}qlpSBSQ)3U{uE?`R+%T^12xeM2&)eV$5T?GJ>j{(38 z0l@4LF#ybj*ZV;OU>>zL4FQ;47Y7%bv*o%73`Hiq*xLlTG+w2-?`@hW>-ElhKnnHc z3nNW5)tZm{8?r%2U=(nXQO)(jT!g;Ri41;mFv`{C3vK^iZ6W9T8@RgNtL{Jyukj=I zC}ct*lNK_eG|wURwxp$bjxO3E-cD=%SPaV z-pyTBn_GHM0d)~0bY>p<$@Dxle%%c6vYSoq_*J|s*%eGcQ!IpURCZfRUjP)jFzVIc zBB>GNMd}d>Nl0LX-N6tP92t=yL3*UlKo7)5Se`ooMvjP44l!2*+!YdpMQU+NNJ<2R z#7(YFNF;?MM``7>gvg2u%h5l}75|MS5}LKj^DaeJBvmSCGHoe+-KD$LW@TvP3iPo{ z_)C3D@r8|t+?ERAko|aT0>%44=Fl^)S)ASNw~_w0ZNd7wo&~A7vp^;>6|W(Rs!;&QUR090!bV;BKM}r;t_B0L}V08AZ{Ts5hPFGY(GVr8;HaI zeNrOub9XN(FMpHpkv&-id777$lkdouJ_X_nw1hX>VAIlS-iOz?=S?xAXymvsyR(*YcWTnNuI2T-`;|eD_y@IV;bdv z(=TKWl%$RPI4&_czRNGQ@V?$w`}t2w#AL`qLkg^8d0}9Iim-m;Auf8k=C|00dWXZE z!aEOvRLJ^Z^U1(g?loT2jK3=qVvWQ8K$+K8)Y_NAY~Y_ ziEcF0BqJFU5V@0Sj0U~$lJe3wae`nMk&O(gT-wzLpi*3FL7aE8?VdaRt=5V zrb8+V?+wb232_aliz5O;5cwE>8cce3h8r)DUEnTJ-3aL>0HPu1auY&Ls7h7%e&XX| zgdhvvCY6~Df+xtC_CG4jgdFj#ASq5L#;cYWwcdfVLq`HBDGZ6a_)0;*&)FT(!*NgNAw{jGs{Jtp(f53|1%?W?FZfwFI z2XH)E_+tzgQroYv#9799|?b4>4ZO67vI5u{y1?u8jlZzKVGTVP#(jCKV~8|;g5X1#6I#3 z_NzCIqvGJ= zfr8OhFk8e%JM9!V@-&u|cr7L{hb9^Q=I>K973S$cN3C(yAqzJfxih@)sh-~^I=Ws* z5ZQpeZwt^4HU(r_lfncsi+;dNhl+4@D^>idT|TnS>K#m5^eXj7(2xDR+0_nWv|AH^ z0?c@-1hc>%NwH8(qaR%74*iU!$5pLq!H>-$`hmrTn?XM|hv)|u=Qo3X07Hjyjyb;> z^kZ{~eqeEaGw8?W5dFX+E>hw78T4awh<;#kelzF?oDrh3@+F+#4EnJ-L_e@NzZvvn zbBKOmaefZ{0EG(UYjy4p{eYhqx_3Bthkk4h(GM)n-Ju_wL-YfSb9d;+<`DhBg1dGQ zw3aw32!8DKdLHrg#t!M_BtKG!ek{y(-ff4Q8GPh|qX3eZm;TTa1)8W8$16WclW7g> zQdguaufU}L%kwCy4=pX-a%2r=F$ttSrsXr3g1Ha8$w;!Sp{v+-C15`IUfCywek&zH zc;31*c%9F6Cwe!v8r%$iKdEw^_HB~g0G0M`dQ$ss_GpmceiGMOPkeUx;aCyzjT7&X 
z-y#WcR9`XwN%pF>%=CJ44!M^{+=UYqA|6p6zKzEsAq@F!M4}(dUSgAPH(QszN85b? z2W<_gu?{%tD2@wIgK_tQM0pCPZtRpN{iy-`<}$6_bmt8$RH z6L-Q7dWYqZEVJk<|3u4O<^U0C+o=;x^u|E^WZruD*3H60X2ef^1?94T$A8(pn{*JV z8^qprp2C|oexCg1+$7f+k9dsyHC{pf^^JM;5FSG&LSPFyTl0q60QyZkb{+mM~C=MNX2lKOY7)bbxL|1 zf-z!0!V2n^3sN!a=U(4O1dOot7GfM=B9e;vPJnD{N){tqQ>8kvre?9_)$+uk*bf2)js5CIP6$Ij{iBQ=J*#9g~Lp! z2Q48)$Q{izBJwE?&hZa46b>`7-U*Fy_T9u<@8n|e|MO-Zmo*m-2bc&%+|qC{vB4rW z{8OzwE@>?sW~%MA)Ev$%+6qlfARQe%F6=N-yN|fUwuP98n{kQaQ(Zhdy9$Q`OhhG1 zkOYYWK)Z;(rAmkFM=}eCnTSsd%Gcm1ry-FWwRA#xz0AU4rsZ)gR?22ax+%m&zBS-T zzb>$1(r)7Aj8m^_caZxy6bea_~NYb zyCgy576sY*re$fOUo#j5(p_uLhWU*6j6+3CT88{Gy&*y?QG zWSN}zE5k5Xm0Ow0Te^xtrWcJUpVQG`;Axv3xWigmbO&B54f{Fmu)N-Rz#!sCSLcdH zvZi@qDurWYR$wwWNiqi41aEVLBN$7Q5@BOCKp5S8@8>nkyV6Xv6xB)$ zUKvwFF$xzfakPelH0|`t8iy*PWfOOcQPT>EDI|?E2I<_qlaf&tunPJ*Lv>>};C}{h z5DXj2%e+(R!EaaW3Z|2`1%c*Rd6`E^XM~TX8qzo(^ z$jLnKq)-SNOMY*omO-htvQjQfvQ8jUpw3?dPh>Cq(-z z)!Tt%#AJpIQUN#9YWFf?GUF^<5|eQY9A{&}G^~3j9z)MDAV&kv`Vyx4R+>v{1DIal z=Zz2fUl)|gRyq7*mY|Fqs{7zoipikqsaB(+As8`YVW>|Jk}i|RC+Ts2=b_W++ zsvIC6(;_*I83CCPxj{$<HXFdv5OE?jbMSF(vICn$Xe2>VOH0j&(111sA~Zcjm?+75Dk=h6QWriNpSW#1dAqucWkN2(5y5WntMrQscfQPiDQ-=jm;y+$k7~nVFuCByp$=;LUeAv zohcM4K-`f(+@d7>K|ED@e=y z2gkQ*M&<$-y%=m5-af*2UXL~_sgV{U$K1{fZ^!G!7X!$o+p%#<(ZIu2mH#uI(R`NY zaryZ?kI%oF2$;JR0mFp|F0!`^M_3B=gOPTP%uHb@^1tCYuH8F~IhR@n9zw3%S_Om7 zrCMGU&6WM*a^+RWXUfedKv-68l$~mb*2zw`zA96Bl`b7c_hXq@qB=*2!UcmO4B{wo zFpB9Mgegy#m1HS+kR9QeOBev-JD-@Ea{Pg#ubK93ycgfEJUBHK-bZ2iv|g~9OQ?Nc zknd&tt%+aySJ%=%Go$ulj0eZZAF+O;b!h%`nc^J|{%^5{R{K<#SU_rojl3%1C7QV& zPi&R>FY>I)ck+zt+B~-c5X0nF!0D-x3#?Ip;#&AZ*WM%1?KEnU*%Dr5P5G^i)_^@TQ-l*iVi6^uo?7xGf+lT!e!6%71fi>?vVw!<>1HJln80PA;(Ia7};S{sS@p%@x=X{eecC*?WsmFU@Cthj8)kYL)7 zUSW(Nf+o~Luy3jeQgr~pbq}Iwpu>nYp677ZF^Jjx?`4K{LM?RDZrnn&Rq|)r$+rct zt4=xS-IG={2k3$+=LMa)IZiBWO>*fXFzmxQs;-qONE-Q^JQ}+A3?mJ6@&RdT3rj7n z;!ssV2c~qZbaOXh!JHr(diM@7;1*9tMnl0aUYG142OH?g#Y2)ui}{^XplJ_f1MHFb zs)-L^>pp4jN=V(G>I{7C3FsuHWs`;7?FJ(ITl^?Bocv7jfWW41-i78N+hkLO&XWVU 
zO+%CAIseS7y=zwit!d}eW?1?P{ZcgPO9NVnJZAzV)^zo)s217>eCLkE+0sT;n;%M| zu|z5JMFhZPA|uq6D_vq3tb!!uh!l{dG1d}}v{5JJU`bW_(Fp9(B54!QvxE#;awd|d z<}@hxfZT{)Gl^e}+J;*!F*OXUmzqku{=H^yvTZ+uVyi8ZSYC@eoe3uH7#6QQ^eWLW zrFS5C*08u-V|C0>0zq*+JLOr7hivgaVS!xkaktP+6b<>OG_&su>vz_ z7SL+0WxN}Hsut6EL&qmgv((7XC5^^~I8=XfV9F zAW9w2a?{Hu=ix^y)fv`Oo!OGu-jA6~eq=ux&2`=)?q{n>n>_~J>b4*%*BDtsaZsw9 zPJ*(c36Gw*N^y`~J3W=_u2ar7;HEC49{6MIj6$wR7$N)>OpYtMA~Gqu4Qc~xa6z?q zri%MOeU(}TqxHY(RBF&6#ZSlJ6-*>*Yo-lC-0CfA@^ z12~ZgaC}4Hd5%oF;VPY)&jFb=&KNXe;Z5KNC$NiG{Y*ctX$`>v_r1=Uqxf$1-7#L3 z3k}#Ty`@>P&|O+BHg_#FRroc`Om3i>UypJ6)K@zKS|3xRsLkFT=R|Qe@UFpQ{0{Em zzyaYVlmy(@^)RUw947=2_Ahu-$AVrM>~lbqaCE5cZAMj-iuk^Y|dSwbC)@vnm^Z^yF+Jj=f#bew!M%YFAL6eX3ae6<6L7OLJ0u_ zSlhI0=iYR=1AfGeK0UJ&r+z5>U2}e}^Vbe`HBkjQLt;V-vNw(VVW~5gSS?h@oB@b_ zg?tF=Qg7alYqkipk$$s+jA;Z|f22>VAI@cemj-feWmQJ#%n1`{Wg4`A0Slf^tYuF7 zM$<)sxBoN_g5*cmMc-2sImAG$fNQ7$y^-Z-5N z(u{r&>47NduhXuf7#hjOoJzln4Ow|{1U~m3*x#t|3-tIQ<>f7tSQs3RE?g1w0%eiS z$$8DYQ8`!%0a#6=Ci4eg1gr0IF)EfKhV21ONV%}q1e zGT{M(QQQ91Du*YvwO1)iw2!R$Ry&qAvj&+fTIL{`&YbX7#_iW>L-P6d>3nYcJafQq zI0u70vhyUi$^2ba%tQgZrOFM_Fsn@DFtY^4 zi%vqY)h-LO%Ohefhh3ULw0TNLI9Zu`Df409=ttblAV-b<^g^eD)9c$0f6CUcY^g43 zYgmGzj-X%+P`m+9T&t;8PXobeKg~#5MTL25R$wX{(`#Jv^$6~9r|9=%;&2B)9b@Qp z#7s<3m^cH2b0QWqi42<0!+DIaN|=b{6Qe(CIsUu=X=!gL6 z9nl5N5c$m}DEKW$#5TfGV+-SGKGsTV&8;RV*sEetstWUxg^8I=cWAtKn3!OIiHEn4 z$bC$cw-^6N7euSNrsV_;3U~2vAhUS>rJIU}s$Zr_3C?ku+7#-GWNYDK`Vp(E*tf6Z{KnC#|w7R+mfoUqUR0tWRN(K{nZ%WL}ucG z5)+ipNU$kK#<9`GcJ+iKTPu?uwx6nDWjcVNt5D()Wjn{*GF@K5wv9~|enPdkZXTdb zwT>;2iv$HX4-mbyMb7(wO;GR~Elmg3T$1U$m*tuxiqkU5lqj1hStb6ONi^OFAU-W8 zxY;4%Q0?e3DoT_5vd9)0CLSYILG-nxT5uas^3`YXl6DZfg05Yu#-8%(b6id6McI_8 zQ6*EODb!IX38{|YUF$!ROI;Z0rsc3~dasAytAeh=N#b~8M9KBN`yjGM zUx5s&NqMpNRl0LIGN;QQF|(sK#w1y7R{RRA>??rgzE+r1 z)!L7+FVoTPQfNbY!3Uu!9-tfDLu14$X51`4C8cF}t)9`ojk->y zq?AKSA$dXaAbdPj)!1n1&_7A%H-u7wN#`-8qi~0y+{4SVTX@8yfw8wi&)P=42FbD8WFLYl-@cK?5ygpo^^(jU} zGFsGSZM10gC27(Dib%qq(EMSMd zb+u?W84J=YpT#661llCE4OcXDG 
zo$KWoF|~|?SheftG>XWeVs#+riDbYpe@?)&Km4>bMMMg}3?v49^gHLfc!E3g1b5~M z?#vV1ndko%+v!{bHYqULbj=06%L z_RGsAq;hJ4rkp=DOWN#l`D=KWCb=b@`HCY6gX{GM=5<)Vc%m5Wj_(pF>P zPSVAbgu{vVqjJ%YJfvx<&FkG|H*K;u{b+!P5`RpOyk#S|W+QvKl%g5J>s>rc;_ujL zgr!{}^c!FQ9f5_WE#>v&Uk3`N53a?7f^!%4$4o}!@BfWqgzv`ATw|W%FZ2z_^I|U6 zfgJuVQ?dFsWzKc(PH_t1rp47t0kd0X(?M_DFO{jz`zV}lUHp}tF*SLz6M4d0H-dkz z^WSkZ-KChC_$$V5z#Vk@{eaX#-uED=2xF}0-hU2eOEJRsg>2$f>CLIqMG2Et{2;Q5 zmxGC|0Wez=V_%}=Ql(GUw3M!>W$V!zLSrqZPe$Pi!{VTBkpJakU9NKtOFxPomveQV zveimrvF(C#RM*ZMK(vAnTKN~0mwpt8?jMJUn;0}7r%GSji=nVFHXR*=YEvBIur%}X zr>Wu-F#S}(;wT{5EkvfTSnW^QXRKbJ|DAVdwkTc5fMi;^=p&a8v!`_29Bl);3C!B3 z5{1xN>fGO?N}qKq9JvY8QhZ!2Ma?P-AWF$9x`omJBX(0mxPp8kn`nZPJ6qss7x(KP zjkkFJECI!Ku7yzo2~SNDggsP;eYxtn5H=(Dvn@>zakIXqh@w`>C8mo%OE>LDz;Tzb zkeJZM5Oupaz`wHS;-gXkYyuPota1U&{9V~{qZoSZ^xPgn;(n@H(31%Qe_wt+nNg4T zJ9i5A64)ZufA?gH_hhHRHV1dfzd>amAe?BjY%3d>4Qy5rhjzOn0$vf;p$!H0VY!bI zKp4u9+)S3SUAhh6>_9kbj(@ksXgHt9WZweRc3E|%^WQS)mCO`xC#qfA->k(uSK@nC z-VKjtOJC83K`HDd5>aMLUmjEh^q6M9Tw}%eozeiNk==OV2YjV8%Te|keCuViZ98(M z4g>a-fbwsvH>6`h5G~qLEUBt?!j0@~)Ql)wpWb;ffKJCrcp4y%v$)RZm6svnP;}4{C$0s?q*y02B3&+J*Q6(k?E~abTBWD^8xxM=2(4&_&&kP#<@uBU09QD`T>o z(Aux467gaG=STMHLlzA_C>8S>g}v(H>F=7k5#YoZ>2l&zHZy25oww$SKhxD)s~V9jOna3QAQFhuAwM!KNkgl6Dn&J@Zq-kXT~dyoCM3<(-g@Kd3q_L( zNJut7l8b-?y0~+Z#}}8AL7&}h8oo*W5kK-Xh7ABV-B_&vGqjmH7)=%VG8gBOT;vf# z<|4n8fHz7g)pUhN4VD6Viz0TeMjljJ<+|&`(d&2KLj}d$)f@6sOAR2HPMcV#0zxx+ z$W3^8NbfBe{S%Em>SFaS_0{(2!^mLLmDo4X*){ox*yZPUuXnM!h)z+|~zx+a?1x)K|Ol*AQK0rsI5QMEUma7$zl-gF@OadoyEi zhjmquwocw8wX81w2gMy{DSKp;QZK+kSYQCr5no2!kxm+Z=jwOF9VcNx>cd7B)mRwk zn2Yme#2qJJM%;0hvd7KGiaSoiy5C97T6zhRaTbnul;LH>9VcH#+;Q>`h&xXH0ddF4 zKOpWn`3J-uC;yPRqoyF8hV_-J7tX`JS{HQ;cbf&f$x1=>6Y91gW2l>eHmX~{|2*c_ z4YkAEP9a97D6NPaaMXSgk&}Nn&W3s$yKD7A_3bXqx$Hmbm|AhPE!4o$FwhjgK;%q| z4fft=i-LAv&APnG*HN0tr5IXTHbtvt&uEFdo_7InSM-Ot+vAF@3^R>TGvB{*`W{7% zx*Mf0Ax!(wdf9B;A4FK75*NH#JK zKZsAQwtZ3{mLB2`En67CUPwC{^d(_Oj$Tb{Z1LmZCR@D8*lpvRs0n?q3%wsvIB^fU 
z9IZY#61nnt&StT?s|3HHstrq+DV&W9*N(EqS@>Ix;cuuc;994YWI3Gca>KcJ;;F&oTBQA$s2`PVZJEj zyb{s0&!e`YuC`PJYD>x=4`k6_pltj~(SOkL#S;f7d@1bC!Ey*+lD0$WB!?M`8vJJH z9$GW;t8iMn_HYWLPdM$5aM5ZFAl(?>CU^lrg&KDB*(JQg!Xjf@AY*Or`?>$6`v62qgodk*VQr@0>tMhT+mSGZzUQH zh85~_QctK3X%LVG4swIMQXVe0$N3Tzn9tkIp+3CCQ12#INJ7j<-v`sWCAqmFGl71H z%!_|4Kz#Ihu%}2*n8c`y4y_DE@@e5ctK7%Iq+Nk5amT~azy^ngl-}PA(TZmKIM_cr zqD`R?WCi#sS*bBBE)>_d)b+tE^xDwhzS4)=vH zjcb)FVezo|5|0=EC)GJIZI?OyfiRV>!7^MAhGB%??T3KaOgp4~9_*JmSQ(ogUox z$H+hu;de*zCc*)k`bHR390`Ilce^r=;iodo)F;9ciYY*QxH9L;JAtRlE0az(N%gS- z)Ls+p!PJ=fod?wEo8`miSvnzM`>YRiLb@%1T(&x$kVH1;TN>8$wl>|h=8|mZ&H_k} zJ|E^l9Em|X5XYt@O-#|Arr|ZOhmlS!eU56Hh=>p?pcv5}tq=`l3|6vsH}I_K3gk-D zc{QeTxgFC+6$3*=GwkXfw|4=;P0S!Y!A&FB5F?h8BN=Mwpcaw zjmBfYC`oFF=(PvK&`(2DyKE+a0~lI|G;-&*>xN>VCC(A??MBL4rRrq6uRP9wkbRyu zihAMRtsy=tjBhEqn^IcqbYaAr!EyKJ<^gkvxex@rq+bo?aGRcwj9Bc z;dR+Fdc4!YK1ehIwjjXBJdKJjLd+5B^ORf>aD?z*x!G*Nekr%#i11k+@%c}D?27(! z^ld>x$w8;LnB?+O4>^-kqtrvP(*F08YP_)=Jvm4xx>g{MX#?N`UO_q!D5G@&+*4yk z;`?0W6d;R~qXzewDoD^KhwWT`SYgC91<1lCG>hvdZsGaQ;XPjp@SYATU%TqrOOS3N zG=~cfYgn^@5p@QcYF1Q5s!Mr&l@?2%jS@!BS{(VZ9y?kQ)A9g&^RN=PRJBt}6D9$8gUhiqMltgOgnyn;Kl8hA)ixIsF9T%Ed zbyN`xV?VU8VE}xjkbqLxLg>(*@ngQjbu`+uSGhPA?McYdp*^EYkm5SDM?=M-J<;`zVz6|Ot~cfP`P=at`TaoQhaS04Ao z2M~EOY%SNhH&yQXolkM4nx0+rmTc!!DOLs;)^-aAQ+*ZvKRef)mh0Tl`%w&Sb^d<& z3xdw6x6Q_fsCc1lr{nJG{qyQw<@X^<=)Ryh|G`ko6XxYR(MZJxM*>U8a@yCpkn{u{ zIu%jEAH{I?+Ga!v!A~9!L;SW_j#M}q0_*JBuRLh z@S*%!N-JL_do@+{^@i>|!judCax?ggLDOP|JUIilTvP{kqGQo7Ql$?j zfLjnt0)PQ+Vg3Y&5&BXKN4u3ZUELn^C5abczyUy*C|FB-a+TpMtDQbb&1~s%-3kyE zht3@Rg)C4A4!+Z$$CCQ3ZbGn0X`^%j@WQs_0FJ5BHA6DaJ%sltHWE(gJ6S+Csp2gF zFX4x>kP$Gj#a#(!b-VmA);T_%HAX0@+-oKBVBiX3X(=8N=(17jAtkWnaFqdwwME*1 zsr}3#rokf!Kq;dt|Fl#R5DDT%+J@oU1XjY)1)KXwmhX~No_`l*(I<6S0_2%)0&M~O zR8-i-U!71)+wqeUy9D6Neg1~U<;EZ5qFQPu=4|nBmVFi<7z{8K!@FRHUp%eS(P?Lr z@RO<)hV^{_GKQ6ilf6K0#ryoyrCsp2;i+GMG*3^mn7*n3~ok;T6fF0iQu-vP$3#T%)MbHcWlcJ^8)+#N14~xC$0fk{x zNK+^y9MFu&!7`X~G1 z7ytM80ShDqW&mU0KiFnDrrZ#;)`D_##7uH0^rcq)Qiol&Fx#3T-o(8b#5@Xs8FLO! 
zCyTf4M*uKV9$Bwl4i&ju7>vLem|Fo~R3Fmn}x>rR$%lTxOA4=gUvD`QndfGcfHb}$d%_I zHY9+iM07Ve#kaL1aaHFwIx3<@1^z!rucR7_ucjgrgCq}%F8PEZx*tQ0k3M~7h!W_S zWPPWcKb@OyArRxVv=Gyfvb4pDx|01P{c_M=@+efHr)*agkx6alszH&Ei?R-GA>%d7vEJUNOOc{)|xYhP9eDuIE+XB^p&B_u*b zBo##HZxCo>*vk_cU=ei#IpZxpg)~tj>9tDwCY6U#(oGe|5ksOlw`PS=La8&WQ)nsi zCyk&9^5n1{=$=&%->)2_|FZW%#hp;x7lPm}*`<=VDS`lckx-P*nk_{%<8pMwD(odS z{6rDs&Z!^qsK{~w zru)mE5G=E&x%!~j6G0m#^1TNVbFGMZmGJH98n zNpJPSeiGwp_5rgGn0?UfgJvHx`;gg3%|2@OF|&`EeZuS$VOqd4?zSb8QR+AO%hUbST_D?N zfXcY>L)uCaTo*$>jowLhI5MN5g4dXN$jRMZ)2yR0D^;-cnJ%La$G#hj>d32bSNClt zJL;($L`OgDH+Puf_P)uO7Mkr>*n5w0CYk*b5+nHxQz0mgx+N&hS9#NxT78^3@k#Go zrYG!X!bSqZfRnhKc;qq|_{;z_7Y9B=fzOZ&!>+m^@EHw!#5E)~z?oMEK4XE;nDYUe zd68;SIun7Fuf-Pz)0I9XUT*KKDEA#;L|;VV4md^HPZ9@zP5!%v07B> z_ZUl!qSFJ7^Cl}DZx93J99`T~ATK(_Tfd^Ws$bbH9Q9LmF7?xHnQVEGvExDl)LN6( zJGER2l~K3qVp5?-1mWvgo{zw;v6}8!|yQDY(p+=tq zim|0Nxf{FF-vARbq0tv7Z#Us|D588l;fYaELE}SM`W=V*8eia-sLLCE$aEDi_7nfD(sDT_3=OtY^l1LJf z${IVLhv7L~<_ZxT9rw@u2r~3%b{rkZEzg5?-`d%t1+9s1}UqoH(?#1Ec#5XANj4m4Ah5{n{_qzG{@rrBu^On=ehhROel4sBSp>}i-HU~ ziqCQI=W}1vYAp%PH2_Dp5ivDCOA^8WY-+A|fDTc7s(}ZH&SB>0K&|C8 zqQhk<7ti~ei3bUuP_3mf8co+EI{m3H(`wn}v|4i6(3P)(@K&x0COYEy^9~*z9VX2B zVMnyXP2!X+=4d|GrIO94wX~8YTtsZ!9MQ&ZDxSEyWtx<$`_vaaP$(rH(*Pk6nOaLs zhw02xYss zgbx{YLOh7dega>vR9G%qG#u3rwW}bP_RTL-YiXtKcIk3TbYvUHTx8ucB}to51N2*5 z4%AVJC-78>Wzx46&`=zGwTl(1quC8qM^kKvT0k@EF|dx?LuerPIao&n``e7Wc+Eeh zz;$u~t|JkjXx&qHiHR|JwMJYUnbUH!fE>Z29wU#2 z@Mu4woQ_#e8DR__d1?eI_cFEL5pHWrE;Yd9r~xHTc(<8u9TURSZ|@weFw+{&n8-({ z`n}$t!mrVOZ3~VXtreKs(&4F}2I?fHVBR*owB#uh>A7&abu<~nb2iMQIYPN_4N5mE zVs>(xOVATV&1)sqThp*to)484J4LBnjN$I&c;JI$rwtEaIN-;viL8$?ebkGqa?EQ^ zDhrt&R#p{(_*q#UrEXYqc4u)mcxU?iTeu{|RF$IVaQ(cSRJhM7JcKpuT=j#6A-rMR$s=zEs!iv)}~6^t)HV45pm!eg1Rqxktxe2nGA z03H%D0p8dmr>_!j({xIU23=B;=lQ)!?Kvi?$=^+C(RWG7ARbb(>j1HuQXtM_DeY6P zYSy}x0)>}AJS>)0sO7Fu&nYFumMGLO$cstzjPTa3UB$B7dBj<`?fa5%2t;}oQ1h%u zY0ma+BhYk0m1ErPz5up5fOy>wO!y5jz!ch! 
zK{J4(iSlUB(696^{6>8z&G<-6e~!xy7T43bNas5LCv8pS7-HRUQ^2=i871l=qJTN- zxCpt(o}83BFn@>?R~s*pkz8a%qo!KPZ?!O5Ks92tfDYm;bGwY zyxv>5Sg$WGw;fWb?dlsel4L>hJzGM!RM+KlgsXChMp&T{}SZOw%?tibNj zhQ&(#|5qk?lrqUg{t`6=jLnwv$A3q(A>A~-W^ShQC25lk4@j5fg8}{% z$TMid?Z?-IsTR}ej@2cZFkom#K4kFJb`D%jlAv0e}{vYjPf+1#@2m zlg3AwflMaS5h-DuI@1w}%N#b$u^MTUlr+v9DV>apvay<7?H<-gBQ#}_0sf-^^Pn^Q zM=>WuU6KzI0|J=8PnTrgbV;r@U6Q6D5{zhNQ-Y}4Pg2zflQDoYpNvkCky|Hfd33P<6$B~upFD73GttYC9;(W@t+442O zO!=~^T=~K%>^OTzLDX@X@@G&W`2mzgUU))CR@fzdf<_YLhF8lKx%;wWBFAR({*~a} zOy2)GcsDa3BX~$|(U7QAL^?ZCRK1r;o8TLMZE{`W33d~SKBbDt)`z75GLHtxY;pIo z3OKZoQJYmfQB@J@cEm7;l?-HuYg z;g?q+oYyf+0Y@7OIJ%-fdONdaU%DNga*Tqh?bIZtO8;hH{=H0SOqmB}sH@Q+d)~hu z`2R~_8kd{!*XkX!6u4UfNmqrnf@e|u;}mdAjjA=$EVpjcW0!^d?zxB7IQW2#lQ!&< zu32H%%$YdeWU-b$uCMMT0L)D7qkVV}LAHhV2|X@aA+HYRF#s*!+vD4NQFhPv%{}|J z9#ly%H0_x5p*}%9w(gWJtH-(lXKmjn%-n1^%SBELWX`leHoyb$FDf^}oz?M%L7xPVK_!Yj-NVcsR^^h+Kbaf(Z96Znn8Hit(F5x0s znw=*ng9Xeevp>0q9!UG*-Xxgl2vuNnVkb8}Nr3EY=Si|dy#2lp?#cb=5AJEslOD(f z(XsJ*iZ~jGLy~N&3iw0che8(X|7TsQ{?vR3#(nH6XTolsPdOgtjiA;73{_!@15O0U;{2KsCiiGmHyIHLf!61Vy!DJ5X=r ze%kBDII&qDH*JODlL;{OY4_o5HCkSa3F z)&p|o#;^0;(QNT<`Di1bBN)+%WmhYGjlA4Nfz>b^Z}cW+{;M-1XDFX z$h+WmAWXg}cpAthiXa{C$yq@FtC~ZNNw#9@TpI|XKIzNsyo=BCRVf?VK?v4VB9jCG z9`!q6M5-Y2+8}0oLdYnU^nfbT8Xq)t`a^r5qb)@)LI6)cTB*+gf-LKkdqlCfCw#0`JP4_yTkb6?)&rE@WHdp_g z@AAzf$s=H&tyt7eR5x_Y=Q77?gp}$WbUsG%f622l{}G;5`5ez^9w~e@LY~7DoVE{jLiVB|B(Af&m@jA_ccx)Y&MJ4I-x76(YsG!(xT2 zX78{;?LGM34SjZ%x%U_=P?M0jaa!wG-j^q<;RrC1BqYk#?m$xc1CqX%H;SpQZn!%F zUZ7gMIZG9N^e`x?VmFt;&}-1VW|T&9VI}k$GB23%fzn90@S)eJdBKDaltyyHC-fRK zuNkG0Ox~f_gn0p#K0{?&HY#v&7WVwJl}0)~yLpMxZeNYk&dgjAVJNZJXDf|_j~;r7 zp>E%W(yD8rnnN!!&|#RH(nzMY(7Dx|ozlo1fqE+{QJnEmX{6(XTWn&5i}fv~kvqaq z^}f%oL*;NRyv42 zEe^BAwP}xJ#cAj=bbB((f8SSYr5a^1A4E9b)x% z(=d^h$yQEYrrFRsElg&0&O4B=mLkz5of0*=pfLZnV$VEKVbD zb{A0UIh#StKuXYoIq&=t9R+ZbiS~SRsCPgpA=rmO3CWrmjn^DABvg06@pDM1aJC^w zb2&#UDKiuh`ix7F+pUh6LzE->U;`2Pq{b|KJ48yuO~Pbr3>q(fQ8Of6%?5-XrOMEX 
zFyZ3)WGh4+t@eGt9*bvNsLPP`lzGdGf3^dXRUNZ&PU#5DFsw9UL^A?MYv=J)}Pwf|N*|Y+HbXZVGTvRsl!#|BSi_ zoiamx-j8J0hu`;OigRRN8*`DZl^NimNC>!c0Ej~6#jlxgM1wNvX|*&rQ?WL*$XWG> z8xi23-Bd2KYa$(f+%FJsb9)DdS#0w(4m!_p(B!w%O+Q_8amIbzZ?-1G9n+c&uV>)J z7hT`9Cj>!z99Bwnn;ce3wU{7oysn@%M4M}^AqldhJy1O+Y)Mr~M3|KGFN23h~doh}J&uw@Xu}ZvMnhI|~22 zn=Y~J44^_pHzdN=o&_Cb#c)Op7&K68ILE#~$IbuC1rj1Z&GLDdDDUMFk3aFTex_`` z*Jt_+|G)O$KR(W?%=@2drVvWto|GsNtBf+Pk;ZPSQ4*TP3`}4KCNxUbR09N45Tro1 z9UiK7YcvUVoQ#GQwOw(|?sm2Nth+`Rzby(fEzp*qru;0hU|2KG)}5*QqjnhBf=)pO#XMFV-F5ey z{gbNs841mq(q^bBVL$luQROc#0XLc_ALVs940NdssOJgu>@;1C?lQuMn4`|x2R387r-9TqC1t=$TeUM! zg%QTkql1IreN>D$^d1Z27(8?jwN+qG$IhWq=v{q4Z6wJtAUedAuHgl+r@xtsu>{mKTdm!(%fM7?DFFix37i5393L>aDbMu_&5+hlNPnH$I%djDPOD=I58Y_y zwbmze8*UTt96f;TFII#A&s}6}f0;H179?49Hi-J0=QF^GXEq9@fX@)FY>??lG-sWo zsKo)km_SeX8p1K@p}TN|k+L_LyQ3|4n|55u+!PFoD4S5v3Wvn{Dh;CqQJm@`U%>7V z&(A3N!dYlBd@xYFhhUp8Fx>yDX)%1&v=}1XmNMZsv=}1X7Bk^CyFJfITDjZvj$%Vw zQ>$ahvi$jPndXO`7Q-N_r}~(rmK-sxXN5hF7%{s&j~F$(J&!1w-JVB`ncbd8jGNt_ zM@*W1l4PXC@KJ-rvZqVV5X6(KrN49l;+bfEUH9kxCegoDLT|Wo+FXO^Hf6t6oKZixG=voAK0!%i}2zN#V zckW?=1W^S@+*$j7fji$R@wj=qI?wbW&evzr8)pW?Wnyn`R4Rizzj5ZiebNLO4Nu&PM6Q@kTKOlD(;dO|hJGhl zGwPHfF2l}1B?69ZIDZ}Tn5d9G#NxyX)*xV~>loWv5KN4caQ-hra_Yo;c`*M8loDQu z)Lc((GXIrG&TYc;msI7yBAGelT*rAsFc$LU|l?lgQnJNF2)|!p6P2+oUs) z^#P|uvvFZMMM4C^Utl@iNYLqCD?l6_%^JoR45zT9O-qO-RxBFL?Lx2q9ZJiw(-Zk` zD<0TaDnxPeQC5~41Hsv62+npN!TBvyWwm4CYWBR<5S&SZu-NO9!K{HHxTXxf2?{oV zZ*s7Pppy7pQk>Te6DRAiKHLIcb29f3S|z7pIlFu;$CL~5>1q)^3@wiTZeIBUO-JNI z9|EM4d&Xq(+znsN&w8i79KNslS+4qWb!b<5d5dr25;R!z4-;JNMptMtIoZS~Ag2Vv zaHxsd=tX$cH^7(w0ZC{++GbZYr*k(zDAtnjhF{1e%7?@lsUyxzZYzRS=1mm$h>4OC zr#)tVZlx*B`Jz|Nr1M_}x^c`v2kB(u=Sx8zu0E@fZ&qn*>97{FJ->dJUIB12`M>58 zuPanQeEF|jOPm%zj(<9DH+ZGP zPpN|T;G*a>x9Wua`DO+n4!KbkusFBmS_c(fj66 zQLEQsyqWZi80|g3J2jBKV`0n%URn{*W;T;j_(=^2t0WSq6v<#8hVUQ?7{Esoa?j49 z9JUCx5dcIHZKDqF*40zeJNb~oH&8r|m>Lj{RjevvmA9vPZ5PNr07*MdG*a=nw*wj||#6D@nM980fk10h#LXZ^+ zr|^Ba6;2Fe82<}Uj3k}f{B+BeGLb~0 
zMz42~5@XT*$=rhpv>767hVRP;8$+2aEJe%lqq7EYrs?d)!CQ=~O~C5_wGpAXp^4lU zk`7@rku?En40U3B?;#XklNb`uq&U@xb(n1jPU%8B3IkU~)9CsC_SSUs>rNK!c`h!k zAue|G5CO`Z%9|>yNZ5r%R-I3`BWfQA*t_#8!euym0xrW7T!tsO3{P+wp5QV(9WGO6 zdIDKcT@>O`ugCJ)fI7wKTlks+Dp7t7mkH8PN+GQ{m)9w+<)N6_V`gtKdxP0q&E9JE z4zqWdz1!^FX75e)8_Ki1&TttecIme@a;u0nC8|wvA_SK{Y?Pmx5qL^1{sL&M_SyR0 zy58)SA#g`GN}rR5mpFOlNDF*B;<3z&fme>S0B{9fV%e1=EihLCuU7MNNDE~fICq${Ls}SF z0_SdXcGq)VX}mT|y4Rc?(!y{bQ1_X$Lt20%0_S0znSREp9^7j4-j ziWb8`n&jF<@`uYOZem{b-wKL=q?freXrv6>^36|L+LRQHQtL#rj3jvS?WXA|t|OZ| zs7dngRT-GI`n{I?w=8+XIb8DYWc?mC398SS)*VQBQI%pa)oOJIcG2JseH04ny2F+l zVVw=pa)uzSDGt?hlpYzzk}keS1XN_HcI0H|QP}_$x^w9^hE6RxG>*~UGpOqcZe9@`k6HY-yEFAE|!Y+kwPlitFhG=QG>EJutBKk=HRJh?yAJz-a z$DInkMJ}QtUk4v-ITug9k8<>?zbeCY@Hu&{0AD718uf_mSL)KlXM$U%gioE@KC;#2 zZ9^ci^{*q2^dDsQO9|g~SoEoGV5IOgFcRs1j5^9|WSS1XZ<_-j+#P(d21`D;zvxKU z%ajtnEJt=9I{r2Or@xyl(_Qa`0~**q=fH1s?T8;m4hkt z!ISvvgJm+%PEvxjyAZY2yerVcHx{h?)lv(qObg#nt_<3u=3@1j4Gw%D<@#^)ULC)p zdGDIjQa!(A3})E6`YHtw;k8$dkPGW?-xp!a`Zs|oh|YM=(jiLzdP%IGH3Uk;uCl#Z zfLYMTVAW3H6UtSxg8a(SmIDaV!qEk_$49(U4dTKnG^oQ09i;%g!TaC{Xw4ClCp~Wm zu4P9`#B}r*Cpw0bf+{)R!4O(*C~7TsEj{R>e6coyZr4fp)S{H(43Z(NnDJiU>lm7-C-D4Zh9?IR@*uY`cCa>^gCBT%VyOkY= zYWpmR%kNYAEK~vRa6d7d+|~0`jFY=6;SP5H;!di{a0i$!RmdiFEr&VJNLt^g04)P} zcizgF*|h|BUE&Cn;W;eN;L&xD0_$?6>bXy?D9v{a=*VALf=>Rbgg;EBX!ygQmt>7E ze)T-*M-91-HU{{ECG?zf*XS$QGlaoDP7_q0!bI%jgfJK|0kQ5+I!nmDO>A8+;$`yK z%peYdc#n@j)QN#@6T|@mSWFcw4Nb@*a|Ia_1`1h{0v}RL8>@8w4qKM`<~|%M4>D|*SQUO>qk2lW$+R^;B#`0O=^%sSf!SX1R2wPx z0~t)O-dx-r$ndYC0QU*bZRip!S2O4@YMSZZCD`-RFa`!Hfdf*Shn78f{)kIS_=%zG z9wwq1Nk3Hq2Sfx1h^b#?@Bzb{P=-dK2784XXdF>Qs805A2Onm*Be-{|Q2|;MKp=zy zAOPxMh8u`C0?e??6S6P|y?T!T3C!UjNwmx143*eBAAqc@5|ZU!jJPs$Vnj9pt3Vcs zRe>N>#=Oq$UUjlk6j_h8S#%0RDWkXiGFLRHgHhGT>>pp2Ep$1$J`cI(niLRE)C(OG3e zRfj57oz?1`r_)E(^mRtE1B9y9(d4|rtvlKe_2C60Wf;R7dJe>4;!Lmw@mS;|9*gt3S6*ibR=9cM-x;St7dmjl_yr(CIyafH zyDUD%0F>C?M$k}511v8G$u5M<8_jg|UWom)lrJl-RQoz2cRU^j7|HU>$;(=iBt zvc15}8I&Uqk;`Sm%tFNFt#_Qz8kUHEphW1Ht12>uR>@cWz1|VzL8zw_L7xtUdO8v6 
z2?+a5s>g_M)RNiq`fHgd1^MJKuw@>Hf;=R+^G#MX8eLEOr31L?w-BJG7!XSWwVgXr6dlDE>bf{)pX?3-WSII1^IRWJ^leFs-(WZZ(4 zSCIuN)d)fQ{2;DIf^Q{NzS}Q^O!K{d)fAs{!iV4G{$qMiuGPBD!5?OW4Bvv7I2vwU ze5ePviDZ-cUvnJ(@G9kht?z%5{NMc%i>BuU{I52KC%R!#tY#{rjx>F88s_f&Ju(VlC=p-*U$ZNgb9|Y6rEv zD&i;FPWMX}S`m5dIHdH+5p~_Euznb5T+Nm)vG&^nUt&UYr7FY0V!*+L zz`+-@E48s!Ef8D*p!Z|&4E4QVE}()3f9DN6dz2P+zW7{`z2QeJLx|3SrOsn7dk26k zqI74Vn)J)jw)~fM^;@R^SqaSvLhXhyp2*K<&!z5zcPuH~HOsV+yBEgLUSX`<0JmBi z@-|rWII$4vg|N{m#jR&B+Btcl!qN$|-t zk$VQkq~i9G+G~JZL=7k36qM$6YqC66$F8{lZ4zo`t+RIM)otMfu+nJz@DbiBa5V>K zgM>ctaw5O7UZC(8o*27~<`XC^6<>QJZ|NzSIk$$_&T|)G!8~$Xb9qiu2nlC${a@v*k+BjVn>^G$UvR6{A8AcH@-Je_ut6^X)>*Hd`EyV}RR>QcxNI z4jH?&&24X#o1$oa5w8p=?Dj|;7giXrrhGvA;G<%FtB+p%%n=^^ivzqlyI0>I+7UtXaGNfGH8ZM8#e+9HWv&{bSIduqWRn)1S&+3%ITIdi6`zW8fM~HT-ya?Bs-uWbj{4hFLF`g5XWmHiLd=0;P zTA}4*?cAD69ZSP7{D*7p$Qm)dQ+yefe!#SV5e?n*8Pu7kR1NA4>7E@r-Bu|eRjF>I zSfr)Y3O~F9xcInC#nt}dHSz`Cuga6ALWdP0tuIYuoVJ8Rj~Af5trPU#p--S|rovAW z|4!Z6Rwu-J4+&FC-$NKLqN{sGl#y`JS7TbVw>`<+V~PIXy{33ZiI?l_bcb3v-E((p z%VVesmAIUfu)7X#`p<^=4lH5_HxMSKwAV?YM+jG;v<9jER*T4lf@Yq8@_-Z30DrvdxytWrQk(N6rYJ`YawN>~kt^%CA@;*WdtE0U9g&50*8Y9uV7YUPLd4w;bDN`Il61#U69VZKKu zb)h*#fYJ(+d+1Y~jiM>yUfN1!%2@>BL)#{_@SJL-dJHr-IEZ>t|G zW%bNN|C7ki!KcQqu6Dj?Xsm1PozckNuQk?^T4krbJ9Jiu5sJJsjx-1ad7O#|os|>l z>`vauuUfaA#R5SD^H8Cgr2<-XA6&i)e{&fPyhhE772KJDS26G^n%9&Z9PoJHHD+E@ z=qy6>f!DZsP07JA{svx?<~4=R!am2v@)M#;`(LN+-%D|sOEmC`npdSa;LE`=^aoxs z^FnLrND!(a@DfhZ|2p@$X+dcbwgz6>bM(LN&{??KfwMq}{?{EkdsVzUaPBr|ht4`l z2#ZY!L;vdzoxLhRXR(;GLuc9Q29XToOfwObu2v*DxR6J_^kRZ=-ERQTwsHAjbGqjb z2G7oyi4^WLw^KLC`x^4~hYDyZC)kw(f4nnH2 z7BS~D(=k17#z!o4Rumm-_%Ra`4+q3vO}6VIz6-s5fwId0L7#9EfDF8 zUemt#@kx4Cfz6AuSYVSvxh=ezVfI`9A4C(c^YD&NLOw6P<*6vk!)Qx9_rQ6HT+ON8 zrX}r+f1CDkC?0kX40n9dsIo=Zwnw%+8a*%Rt$8(Z$bBp|Y;Pg%2vd4XoGBy{CrlS> z@pK`gXsa(VLAdR3<9FaA6`b%^(h@#17)Ca3B_q3HvMg=MOXGLp7i4fta`HL2uJb;T<}y$eMRk&8cMQkh^5doVV~nz1*8?j4R9Wc+y+&sP~O`umRxm z6rM<2pxkZZ?O1grP6zFqbm5tJ-gW6PC#xvDbojO;Np^Tcn_;4;nENO!9l9FE)X=20 
zf;5$JS;7V@bIx-a0A%%mTM+`rUNsbN=wrmfGC~!tfktQj*&MXQHR8mn6=2r}VAszw zPO7PNi;hX0dJ|}7^+(zo-^e`65rV+K`WQv?Awyk#`7l-u1Tipf6cT2uQ=-tE>R{Km zIlthbNF9P=j_A`dh&~fD;8jB%5iq1XYBmCn=<~4fs=wrrSKWRlq)(%hkiT)4pHo^% ztJPrF{p7>+4U;aGOFD?ep{~B3%&P;`bs#`pFY6|y6T1y{_4Q;9?GL;1VT*eq`hX27`f(A4wuUEZ#)nAiQ!d>-gt>l}yS(-vf znSW~p3ZneqG7WYW>quYN;V8jh9p&|E2~hrAropaPngfY;J5pqkTnRLU>Mg-&zQ!Q4wV?4q=B<8y3EW0uj&$=_(&nIFZIQGLnkm!fu+iC*@n3F+r~e+>DQ@j-~frd4fc(vUi|s z|6PMP5}uW?a#Y6N;A5~-g}7=3qFH6((=1Vnyy&N{s`Asmsg{t^h1so=8CJCPca>0} z`bJ~9{{iwIWh*n-j=@f@=KH^kL_B=7X$iKQgKe zu!XDB0ip91GUxArSZJfyW|_1|Z?Nrzsdf8rzj^G=esD$@k>mnMxB;?n@RY-09u z`Eq`x>Fh1bW@Kd{d6?zR+tj$&P(f)t}jCsx6JWQ7E9AwRa_^F z$QHmvjorxRy2;WiD?JpF3Q|(3y5iIEr?a-I&@=&e{ZR$^`gx_&i=3tjwifV$dCDCZ z0^}EL+zK2VLt&?y4(46SMKmFA=urbX+$=}~9<~O=G7Xby6yfMDhaY#SvuQ;l(^@Jh z@;L9_&?P!~)CK1xZ!MG&nTAE5F5*j{zS{b@^3qF>kfU^fS_krxUFPhD40hM2o!VYl zv}2HZgZ~Juq5-4Bx)kRZFg0+b%w$v`CGjD`?7!3DZH#kz6KFh1c-xnwbH_igjybH3 z1?IfYH%Bl7_EX@=HXN!Xhx^tktA(Q(kOMh3Ej}8I ziPGb|H3GLn?rv(k*%w`F&%p;ndTsxk+QyZa;HI`-A74(%5#+f7t@T*NDA1v2Wz^xC zb4fl`_#*ObcXHNU%e!#@6D4`JeJ0Pg&*a(mnLOLRNEUgveJ0Pg@1U&7v;8uvFL6-w zT;SW9DBS$0EPwu6y7?9dH3v~W*~j#?WbUw@v+Ox{#O(H*J8E`&&MlhVo^!{{ZqK>n zX1C|uNwcHC^Oj2ymU&$AYz^MvURncfFiJ6Ljpp=jKGUB2W14HI7Cx={aB!dZ&9_75 zit$D@AFjJoBda9T)U|iO!yhtuxSv8o1`rW6o_6aR7q7kddMi8KZKk`==6g|;Za#zV zc}A^yCurz8o|W})nF!|{kAGwli}5CRHyT=gwzIs8#zj^yGTnO?7g@_9oT{2wW&e@s z*a%bHv|<-T;8UXz5rr5dL=Bc`BN5MQV$@*dsBUZm$UP41ciEA^!;(TfGD@u3MM|#g zVCLVMVd=|>( zM+zU#ECSBh-ir5D)UNnCv&j%;I$0V@uB~y`+>3KL=xL8<<={Lkw4!NpuxA)D5$XI*N?fu&h5b8_- zxqS228i9Kr-d~yR#6|_mz2YP#+W0^p?7R*uc0GZt1cK@7vAv0A&j9AUPDPN|o?sx8 z8PR+r^Iv4bwG#~cmbwM{Bu6cQ!Y3n43$NJBnC5GNfL@=f@#=1B%b&o_d@)_i$EAC<@FYe^hpYH>c^?QE z7z~A;$lbS|^=Xu&;5t*@OOQA!1-qd`<_7sFUfAZs45BW@?pLNyF0^#&b~~2J>)U(w z0bV5wNPaJODyd?+K%)1=!jdyXZTY`1LJw^A`L_Izc3>qN>OgSz^tCAQQL1%-UN?Y& zy>8Nm+|S3Q^2gq%>-oHD=6mSTK(Lh;cWp~>WP-JU&G@DkGF>T7x~eJhffsaL-+JlU zmcLX+It`L;i7yyTj2E&J!aKv~r$Du6vB}&6D33(;G1>gQclwiIU9aauUwxL_7t3)k 
z^baA5{;&(j38x_FqX1qH5EjAeX6C2GH%iWo!gS|5_;Pg%9ZXXREG{&1Ga1;6lq#xZ z;LP8s6~ELyPG$=8$RER(VK8Dz#bE3I=V@SUDI#jHh_;FnXE2qTcWUP_NxTe=|BS&F z232a9ie=Slg{qHf4>L(P0`1Y>qe*t$_J}Aip|;Nghh;!k@*S0e<5b~-ue7hsc=!^K0@gp^sS!_5D9m%@?}q1(f+rNH)-QYG=WEktc0$EvZm=g$FcV{uKjI!KI> zR7)lHgMEZ71H|6Ns{k>_U6%}L2Xr{NHs=+>)vO@0@GU@#b!JRu#FkK>YTexSR0_2{fYHt7(qY!>WDQ?ANot+BrP;tj3U|5JGs3NP{ zaRmWI(_M?ttDeqRbZAcOCYYobZ}9_ma;2V@o<#*9j_?-2THs`EK`V|N7?;CT2)Z?0 zs5(e1NtKntskB906WCWAh&pWA#R|V3cpu159Snr zX2iKeaR8oGYHdzfu(K;D-uGdp2FKgO|J5FTJojumydzVw!4S^ell@OPU5yTq;7Ykv z&vrnwh&I3>;-ki6Dhhxxm9iQGgo-6ZUx+j3h3qB?Qvz30egW+4X?6oQg-!Ld!_Hne>%B2;m@t5R1nCy=-@jzDLyeh?+;GH}GTMBj$U=%mZ*t2LcsEgYVH`9&@r^ zlw+&!*lLc4W|zsa!*}d3$ExgNId=Pw-R5{$_INq=`i{NkczE`Ga_sXR`^?cU)4STS zb@3d=kr8+|C88x1Y`$@CVErk2(+G#CV(JaButf7M5>;$qCKc;1+T7S?)>wcuWh%&LnMlBP42lon~HZ^QtD5lm+z< z0&BdCm6@~>8EEVBgOlNh5gDIN@2crlN`^q5`4^o-K zz)PHtI4lId#4b4ccGHF-24R|&u6UZsLtaf>4`m3H$*zM-cy0HwIO6p6cAYbbI36Oq z+<2v6w6uX!;>F!p#!)0{CP2QRR(tJFy=H8@E>ngE(3T1`dlw?;#0Fn89Bk=!$Gn3* z$+0-aXxb8;hDN78k;TW-)pj>t=f|{_{g`-qQE!laq2vLZbPsJS+_?s+P1CXg3rj_a zI=!Js*z-xEXoQbb0dB@j`XUt?$7~|ONgDrEsp-&R4 znxDhF48)|Sr4;e~v}HiAIzX?dsTpy-H7dciy~>BX*uu(zv>K=`JQ$YRHd|L(qlV@8e$8EThQx&x|DcRgJu%;Au)8DsKnJq_q#P)$R+l*zLq>Sb2<{~Ob(v5i za9A`K3QYqW1x&wliRVVmr}}#`p{*+25qDL9eF1_vS`#%S-XAMAXpuRz8r2Ypg|X1l z{mYn(bCXQy4$mg7PWzzKclSB6YsT>=molL8vhOy~%ZS`4$v00L%_bSNX)T>Jk?)bI zGXzdMBIuW6E^Dew7P`Z%K7qQZSapc=A=V(_16pv|zNg+C`EIiLku{n2p1TEo0hbLq z=nLDG4*CKntNI>3)ivLpx+3nX67aR51bms+x$5}kT3C0IYQLM;;f=HQAa&)U0aN4N zB&M>-SEzW@6T3x%sf-HWHgH;fOpa=SVqXuN$yOIrK=ueGy?!GG=Y&R8OoMaxh*!Xy zXfd^*Xj}L@mKhoZ^ivMUF&Vt05Y2}Lpk;*;V^v`p;?*+3npPQOycX~!ujaj8VGp0D zB#Z=8Bn;b~TG}WUMu&}%YT}p=+f0#nh=egrGet^vjl!hXiCw`~VQOAsTSXRl5$79b z9uvlKq$WJ0SBRPDgBe35tGI-?`1n_y@UMq)E9npZpb?CtPSD|B$AN#b<5gSun-C~Y zZ3zH-+yJmw1b{u43$QOoDlo?|F^4P(ypgipex6Gs2H`Xc0MngnK7#WiT#P?5!s}44 zgTMBc;4jcG-T5MYijCyUhrQs+{vxzqGV*?|E+hEXnNcrhm3U>@6b17*?$9MCW!Y6_ zS##)8i=j)uw;UTd_d3?IV7n~+4Pu!F(rpJ@Dhip)MOKolDsoj0cu`i&V#=0HkXsz& 
z#oV@aKg>&$-5Y>j67r;M$QGCu*@BBpi7_qSE$0Y2#g@LPT)l!D*}`p$Y}-_3e#Q** zVt(BArkGdCVO|JJKI3CveKY_X$_OQ{2BeviS+!H@4XyDo;qe0W1f5K^FbPkpv3o<8 z5{r)W2>O!JJXS{x$&k>!)t~l*zE;t8<)|4}eL~{G>LW=eZf<9DbEYIJ)!Jb%yAJz9 zab6i5dV^ktXXVwn12ciR;jNIP%uvCyq>9W;>$h?7R8^!uM-NUS=zxj-&3gn9|bcMtS$GL zuBfi_2BjEbLWg~*m|YkWV0p3ZDk!R?BtM2XMffLQ1*JtrNp`Hny6=-CTj8(DbfROC zKH1+Yw}5r8UQ2~{%;GvIvF@+YJJMpX?j=dqeNf6>1*O)fjEHq_tanTc8-&5!C>l&- zyTg*VFqlS4I)jGzT;+g}> zzHrTq;hM<2qD=EI%5Y6D$_eV|F@S17i2^nokLS^H0&t%$*=szW4G~ynv})1*z-B!j zjX;&1&p=hLFp(xAs#-+AEm~|%0h|9u9uy@sH4t{*hj)#T(0M{a4cIjGngE;21Z)Cm z2EZmUI$(3z!JO}QQmO|-k~}U(Q1DEm^N~$RmoEbD1kdDSZim@)FmG8wnvgf(HVG_h zubG2|ZZ2`?<}_SW(n!KJt&Z?ZlEu1Ve(NmR9En0c&v5wlnHXj89Q!ZnW;u37ir z$2IHzBwUlPcrWv3XZ#wT)$f()>#75AO*N<~Tyqr@jXJ>;uBo?axaO)?dZrNhK8OWs z<}l&GL9F0RdZBml(Gd640tu2q^(nv07<8xiIg1W-i4m z=qV}`t5*s#0jbU=($+8?Joi*0|J@=aNfF-n*^nSVv7mxV?GwjMdwLTWi zsiPJ4NR5Rv34Z{c7D1=m^(A=d@?*Q_0kb)$G3X!?m z9E_2Z3?I{nN$iGU?I9B21~4qQ=MWU6<%rl-TmjRN`z8%=!IP$CxnYoBgBSq zl0cF6{3;;ArI0N%k)qp5;2=90jr9gJ(Lpl#>s5{9_QpW0ON$ZlwzL3D=g+PLi%R9c zr7|_l4a{Q@J0X9l_Tb}O0@T*&m4LkjOcmzcWK?1J_QJIsORhcNL!KN+C(>0SrIe!E zBXNG{B}7i%sPwKr6+awiUh%718uT z&33Gaaay7N5aC*gevR`gTnnE*eR_`;z`5tsxtl1-`?N^qKq9qIg zd-yg=;#6HES21L50gwtL3U$}-&h$Kwe6v=-cI&g_9Y|&86I@~j4_YaKYPA(w_o?Fz zlaO+-n?$tkZlV_g60l8&uy7)hVoSI$05*cc`HDY>Y@46V-OLx&;-1XiVn@*)d96MC z5+!2cV|WbSxtVBbb=MlSveHMgB4Qdm&iDTS+Kbi@K?(P!*V z??KR{fS8lYD;}_9X^F1J5+cM2}Nu6h+@+EKJJfy?` zz7zw%SEtF0U2C6X>O5b{6iz{%=k7?lAGHOhDnUIs-H+n%3wh4UcJi#qw(*Q)Kg4r( z_Gq4V4(`LI%~Q9nv8JhDAuwAj$LOE^l0qMDlniIjEQ@M)9|_bB^QhCKc0@KkYDZ<$ zqqZoU9<^h#=}|i_n;x~3vVn2A$S?gTv42&HNe)q}RwW?@YY!XaSSQBz2wLGJxOJEe zG;!`+^HQby^4w^rpBqk zdK6Q*n6(|6+lalhMlgS$TLxVp?{?%w^L~@*jd?L@?MW>W_-mXuvZ@x%_OEJKe%DzX;JWB8v=SX}Hv6Io;DfkPPvG1r| zp6*5i{$e@dJBmr%|3`zr7-oFOZgVvFYru!UFwN26FJ=^9clsh$Hti!IRs!>R% z{8us4cA{eqHs)st>bRJW@uNc)3My`L%16Rdg~$<7WGy5N{^)NAiegc0 zXy;*X#w}}JrxL7|t-6xXOIvbE>I5FuC-^Yf(QLXT+|a=!*Q)DHQygtaP4+9KjPa_b z-|j_}t~YeI?KRvYPm{6cNU@;$Fq2`OiP2!97j1@8VK<+Pe`eK%sd^BHOx3CzX^WWN 
zP%H6DF9bu!l)B9(Ah65*u}=dIZKRb}%#Owv+@ld`0K878JZgmib8Qg6ftLS;4tUKDIaGcH!hYHF^HDNbld2_Zc-4~Vq*+w%9us7q5*}R z4G2hk13H>X0uKIC-o{-dUup#_6z#n-r#$CEsMV# zb3_Jx_3u4A+eTbqvB4K>B;-qfKJLiY%k1B4kQWO(|7(XWs-I8d$JffB@oFU{umeNa zfhn7ro}#tQWTC_K?rE|UfOV}6ifaCA<$}D|kb|gB0rTqTT@qSt&G)WZobI_(pcf;D z8){|SP*TbnF{PZ#zQej<+Z)Pz(O+Td^G)#g)yYokcX z5HV&rlU9pd&fL@0H~_@dSeC;ZmBX&79PYtix$IIdmp-Hn?s6H7{52WVePrd37`cl} z2+R;M^hBBv7`b?KIwA&>!HI!s+MJRCLqeNeF?TX8TCSKtNgW80EG%=`x(N~k5jBN& zJ)(S#D_`S&zS90wd83br*;FYhFin*)Zj?|%z})Fef&I=$#6~D@Mw4L-HKIODJ7+>D zPV^)JX>HO}pTtXd?`fX&hE@?i5O=0`t4o!7H}U9Q9o2dcXI7dd?WJy≻M#y{iY5 zI_gf4FXa(rjsB~=3|mKW90Ls`JBX8{#w{r(>KMV@ZdyIBPt>>;l}Py3O!#&UGIhME zCo52f{HVKY;Mn1tV`g%E5>o@nxS9Tz8W_EUcWnb{hd4CSMK4y^4Yx&Tiu-UKHZ1IRuQ-y4rAM|E!ZhIJ2>n(rC#{dUu8C!w!LuTp;2FuH z7y!Lw@8Wq-_E2=}I^I7wSHXkl-`Mnjjy=`2c(kYn8D;L#VYn#N6s~U1s%0<;D7*9D_AWi75#8+d}9cF@*o2ZAWJIr8e zE`k-oB5jZeVK#)bi+Rq<0zDk4Uvxf$tK_qubVy6RQyj`-*f10+0@ZzEjSO?0aBR76 zOhB;`X0WApPg0qmzCQ~?CQZU*i1*1)WvVzfR*O)a4WtmCAsFNNu1*69( z)hLh78dESDRV&a?wZD#01_!k%Z}3+(@Rmk1jR`*1qH)TGi0~_LCzFgmoirwI@Ewed z2A?pbxY1_{ZJD@ka<4QV>I@*VVK2O>L6p@Du_n2%OoNm+R3`zkPA5e6BT0`r-Y!{k z{+04iBS40V@`<0@ILz@|(njkcpoWQ*p31A!bC}qSHMG5r2VFxdECzJ01{)ieKIc}E zJ(u@L_EeKL{Y#z)Wovvjhl$N}7s4}4Ss(3pCUc&CX_$9(+_&6uLWr#&ZyM~F!}XCs zwIbRNeU*nPR33I5?CU5bE0uxrLqB8?VteF2l-cc^(lOKNFf7w)Xj%_pq!rAc5Bjz& zID98?urEZ>B46u&db^682Qx$5Ng!8#Sv-15p(Wc6Lc~f` z^BH%|ozD>1-QM%`)OB)g`R^Brk(IOBZvyq%2#$!`)}+Ziy8dN%T+9c}LPR6btUdPv zBSxU=P%8y`(OKint$o;;yw!_>JLLp$sRK#Lp)cblZ*sNAsM2UTG#sfte-fYos~73_ z6UT5J{#wKjW41Yn&YfRoK16g&Y7aqz9H6$R3{Z3W6Wel6z~a%Uj?9{`@USUkD$>AB z+Vd-W4VW`j4yA`rfTTU)lpd-mqy>?Tk7G^bUPI~O6HMp#GQ~e5=T>5sHaC@P3djrh zN7MOFu?1n>by^|$LE%kLr%1n#2n4P5!dI~-hZ;T8#8xy9c7sHb?m;~BiA-VH2VpyH zu%(M&uXOGiB1W9X0V0`vz3D(?AGC`=`q@2msys!)9eAiYOy}E>?stnmLf@%f(FmmN z`G4Jokoh{|J&|>?SZLD^%SeP{}xIN#hEk5@$K)CSkGllvRDj2xfr7 zV8Si-3Dp#FaB4KX!U(vSR+@cyv#p8>w$kD)VEPHCim52XE;rx|U5LUN8=iUs{pv*U{NT5Zs@vX5}+WpuT-Sn z(Ed>=N69gp^%MuA)Szh~@Xr19Y@tOHJYVc)4kaZ&+lUh!EbnVRp4O2;nu`xDLN1+? 
z;iBS4nS!k>lRBp9ua!x1BS`}I9#_Qrj{Jj!tXj0-a+;cnnAA$4dP6TY(JuB6-lQIkO)or;kurAY^n9SX$G%KMm zB*NHQRVqH9b)_*xqzn{ zK8I7SoWeJ=%9Dn{e4B3ab5Pf$H}E#@s!Om&_bxvaoA?y^c$y+8@yC(w zdJ>6+tzlC~tX6<>{`htBU(5S34h6T@_)iHJh+@ zW8GWuCOAsj%${2wyWK9Ai4Buj-LN=Lu@PBE%sMLTs99ONX6j6|D>f#pNp{6p@n`Bx zuq(z&GUG5WC35NagAz^xfUCh|d(`louu=4CAspMH@NCw7n@qS@`_`PG@jxv^0G8SX zRQ^gNIoqh(ntvnFuB)v@aIC9Qt^w>tph;e}Imgvt6u_57O_po-VI#}7qAPCH=D1gN- zk;hC@Zf1qK2~jBBc82=Smo8xYCGBDA;6|Cpp~gO zEQ(3{Fa=jW)+q>!D)mzZKX2G3p@PGB4OgQCkte&;`<5EP#9>6{N$xC}t4Af))23mu zhJ$8L8X_i`4UVE0Jy@@&2_Iv(z@WT6qU|6Wu-MCPrN6AX$MGmrJ|33LnVza>eH^;3 zdK9B2<0y;*{$ggvY3Y-CJfsJ=4Y8>-?rEpkwAE&b&oT|kmExMrqa>L@HNyVT^$4t}W3oZ$! zcoz|;gCHWJRWLcKKkhUw-ruysqVF7jw&Zp znikVU#+u%dF{Qc;ZXS1JSICs6#rHz?6MrP-do+?#fEyYz`q%)v> zex_%a#+CC`X!MdM##ljZZV*)u&}3`M7Tx7_XjzQ>oxr!2?RdYV1p4!Zr_K zgw>hIz3hV1lx4weMZQAs%BW0rESc&_;+C%!9lg|sZ}nQ=eFPIds+A??)@D>LYKRl! z7R#Z@?Tn zl;8P|PoarGpF#W@-1VVuZRBGQFhN-SEN8dL_l_wVclMWx5}-)3?)c6+r%U)mVx#uE4Y{x!v5nH9g-+;VHs5F% z)5uZM$9OU?$=nzX??!J^Ra`LK6yS9Ph!Wi3O+u5vl&JWncO9}5NGVrk>T&DiCWU6e zmprKlng_FwfGkZeg5#Ff3yzBd_QLLch=*OV2lZY&&$>YryG4ADZ1)LJ8Vis(iA0>Y zAh=&vc%1S%l|TQ&J>`;_i5#@}}|Ia9Ad|vkeJ|^XiP1#dzhUB<;TcI6ss9#c`aOF44Xlvf#!*n;z znhw*MFx2hS?*McHmL?3_+>^ z2*n@Hw)32oeb)gn-840iEyCV(_h08LtYHMIx!e8b>2|-TaH==-4d-c~8)K7h0c=Fa zPUkij1XCcL#$9~k4%H)?aiWROAqE5K?E1l~86Y~LZY*P?tHZ(&xjHaw0NoM;=$0Em zXRuv$45F?>^~{6?+ucw6U10#b)y*h*1eZf*3T2ShEMD9jpkby}WC{RGFsn~D=xNMq*|9w+^a^|-+GL;oVr zifj+hh%`RI4FrpG7my|643EAP=k_+U6XAf|0zeAZpyV3gQnv{$qTspdARY)nFG8Fh z6+mFu=^ckvvoqX9#(jV&U`$B?X)PwHGnMnwFU|8* z?3(@LD-l1ue=P482&5!Szs=XdFuS4j8cKXR@u|co6H7jMMymh!sZ8E$&tIWPblOx4 z+jbU@1B+zIuU8Sa!>SvY&wHwHN@cQe^1f3G^~HoqJ-lDFpRY~z|4I>ryn%?w`t5#l zkw?+71PSL5XewfrVS`C`yR$I}jyPK+dl^2nv#0YsDEq!lo{7nE^@-CaZfZqN$`4LI zAks{}F5Q2fpYD({Vl>zJBwsQTE}{IjPjW$t6G?x^4c%ociZ>7Jkun6#$YDj%hv`OA2pgF z)Bng@Cf|zc;!eGl$fp4zWvIsz{;nV>A(-x|& z@aerEP}|dmbmihg*UqRA+@`kNa}uSiOh}Di_@;K2knE*u)6x(iBwOkpGgzW++)Y>b zU;7gMkG&S`S2FpQcshS@DnF`1NacssR8sl>Rhi-fH>W82n~T*iwBMA7S8{{kMM>4`yn!oH 
zyyzib^mxwPa0DFgK+U^(pm3z~ZWsYIwY=27gGb|0oc6(`{wNlVQI7CN-p!+mbN!uL z&^r^U<*nu&JmwvR(_Klj9IEH)BgV)pj+rC$y5z&-JeO6BlVKc}ROog2p?YSI!0{Oz zmme|C?lbve)(O!}g;on&;7lZCb-t_8`reT?bu}VgKI9*<a(&O~LNbfCI=zUSL`Gz$gYVRqM>YxiY&r;>XD&Wy{+KEwAQ}ce5 z-sgQ)D~O}etq5qF3h%X*z#kLMzxM`rYT3{qVH=)jABi|Zr8nqQhS=fWES#YWQS)qb z5nGpoQu(#}wM}%Z0@t@SzlLT$TAa@E(lWDZ-mhlDTQcFTa*8>55VcD?f%ZDh@@&Y} zEEg8bS|wYJoqQ4Pkzrowg<7UJpyaXHwZN|8H;xQ`H`|1sL+RU^b- za}aF|Ns(wAc3ymz;6hp#j`ebE#IeyFS?EO$Vs#h^Q^HlOV5_SF_Zqp^$bFvN=LPO@ zxyR*RFZcSuy;1Isa&M7)i@9fXw~1B>N=M0%`aDvfH;dNR5KZ+`Sgf$x-CG*LKB55TnxtT4R3CW&`@+1oQ zGsM9yZPBif>|Ifw(Pa(e*q2puV_S52NcQDXp3!W>B=&42ces%e*;hv=mHEMwY`v_u zo;{NbTEr2Id{=zH$Q7)Jqw|JkpZB*(_=m~nyVf+fyEFNaRT&vNR3SsB0AwwHOS?VN zAcz{Y4OQ2uSw`c1SmR&gpd+o5rU{}?8-1m%`9HnETWOu?{%w^R*DzPBv9c?!qk%PM z^2h9Vq*W8e!eZGhao#WspA`8o-G9u{u!+U62tuL0kA3I*dp{gH<)o92kJX)W`X@gS zYg+J;1r4zeHGF7c!$%f2#_B$C?y^|woUC*I;KGIvP>VJE>lgmgCqm&>(eOK~4z3*5 zM2y=!^9hBH(V^B8508Fp)`^GLezW4l!()ZWiHFw>%}yNNaQQ)r!<+gKKIQQ7Ln=Oc z)SSbca7!Fs2Mx4@Iq~qyghU=S05A9BJIFWB$L#a%n=7%rc^b*X`@$8UnME2W&yYsN zI}XCnRgl6=zlu;$68rmB zgbv={!GEg=9r6b5Gk;@6sOkXv_nXUs@zne7Gv%}*H0OYL7UD8@))ck15!^n*zv_DW zuNQ}z+%?$G;^LUww2cp7Q)W<-1R(-`zBGe(p$3#Z#W2ziyw|z3t;u@syWOud48_AaAx-v%ctTg&(FF`rs&J_^B=zZ^!z;hH&gMH=cnbX zQ}LAN=cl%sJFvcf>$^{n|LwV{c*^tB=(|s+ze!sA_{qIIKSzJxoDXQXn|?U6`=Wnx z?$hIM`q319d487q?$ha)Zk&pzJU{NZ*@5NoBV&;q=zfBhSp25P|N8aLeR{v1Uv%!% z>0i@;$xrU(`MGq=oDV3Us(WU3|I!1_eR}+>ADW^sFQ2P?_v!SP|8^>#^8Ad84Zv^u zIP<|5W_JJbi_U#|{NLI$MPHtufAig^)4%VPsd&os^Y|fdV4a?y+0mKZ7x?bemh<*CW_E=$&kt1iuS=f_?6aX-Tx5w$v)9$q`P>1|)(o6c|n bU)D?6&yTL;qOoR5X<) literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_forward.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_fusedgroupnorm_forward.o new file mode 100644 index 0000000000000000000000000000000000000000..019ebba85861d5763dea9b9d67e8ab11cbd6e9ba GIT binary patch literal 71528 zcmbrn4SZZ>nfIS*qXwy*i3W{p_hEELcS8HDsdlB2t~*mFFjFTGwP4CtjHb(0o`OMx 
zR2OPs66hhBHfr4kKixID)$O|FY0+v67M!#+O?gehLZQ3_DA4dyQeN6p+UEcLUH3UN zDe6A@`S;VzIp@Bw>$=~s`@Vnob*}TB#@tzxB9V&lU!>ww73HTew4!3sd&;3n3r`YU4(d*h$Q&ExWIk!3Mk7v4fcwN8XIoI=r zB~9J;dtE=ZhZUVjYEtA6?D*qP+r{m2TNo+Tw|6F54d%dn!o_oHO zKDaf~aY`ot+uej$cwKV|P5FRiNMiojjfddqnHP^1Dx0||G+C(p zJhvIYE3WD&RG#1SgCa~k@8i-h!Nrx9&l*8r5x^D-%ImGKJai&Op8l@=V)iD+*FYzCA;I){4I8yhZACwV9qd zRhhPV@npu&=_eg;mDpUN^4&RP_aP(KMup}Oob$JQrchZy+^-0hyoW>+lRU1icAXbr zlp~RL9-4X<#;LV=bZxSy%H5uhSBW*}uZ}N+BJ01F@%K0N{6iJ;pUd>jtx^tRIe%4r ziKO-1qz<2f7bD&t{A_uQ_U>DgeFyHBxcI;*NBN8*dp4P91s zUg7nqs5?}#bfc!W@g{#b)7ERT>7HDC$vK7iPsM)`9W&Fj(u$wixnJ4cr~LT$QRu;R z#B)jv4&lIM*OoxY-&V3&`~?d8&&vbu5CfQo|{)mT=7N1GliL- z$oL)9DNU=)=V+c~c3mCxm9^GMXc&#%rpT72$Q`-Jwj2X#QTHBi`N#e*QhGL9W#)Q% ztpe$fwGdZG6YW)lyZ+Q=)ui&SQPPvUtlSxNPIHrgrwx)^eip_>!?>u6V?wz;jEjYFF&B4o$JZ?%^t5>5>|8z@{^{-*q zQq<3pd~wqDuFd86&87HlOKUSXZ13tS$908qY1GJ)bifK{8_#I;HZl=KKb|RkzA}>^s2q4UnmXzAOyLUD z!Qyz7s4Ml`a_CiW{y5if9c9}3IDg+d>TP_xai_iLyEFMglG@@#j38p0V<%2h8oicX z{LFuexwPf0NBOP(oc(^aB*lyVg%cn_NifBPRni_aU+4eS` zb(DF%Wvk^sEMJF}Vg4RO1l-3(^sI_Ae{6yvRTkwuC~4$x&KM$9`t3Yeep)VjtyD8P zM?5Ym-#W~1_53l39hHb3dY+@umaJ9PWhzcJbYAtv7Lc{VcTXN4A5Z1CL~@bm8XN9p zk#|hbTO+y2Zt0M2U`F#+Y$RB^*7Y;j#nge`pabQPP4<>Qu!c%ZpuOqqUSflFC)BJI ze#)!&lpejK&+9*8`m`c2rXct0da_*{&i z3uF0lI^>vXp#*B)XjrN_F&ETBb5?zR94&UYquCHI8qMl>(b0T83C%^x{5aZ786-cO z<4&6;(BF>e+s)+XtmgbUYAx@Gn2}A;+W3g0ncIq>OI!2fsJAacBRSG;ssK}NUiG)z zA^B!|ejF_~+lg5lcUq1DO3RVV&pYXYT>nMa{5VRl+N71`?w+htK!mt9@&=yMAQ%a9p$yU=EqO=x^CwNN~Rp6TpgE6gz^`fD3SRyUGw8p zysnjwa&z2iNrXyELV9bF7_Qgqnje2x$0Z2tjZ0~&Nu@K1y?&Tj1pZ%L^W&#eWe%$N$5Te$`oj#>zq)(Cc^D&~i{)?{YSFId0A+Fc8VRy%sxwgYv zTg9}XYewC^I>(EyG>xjeRi;q6tFF04zf*5V@n)RjWjrXJ=3-oVq_%G`XQ2|x7S+k7OPJJ&a$KQtrj?s9FKQ<H}WlX_C0Z8+`gaWN;e7f$N{qCvihVK2 z!eidxV@mS@Z}6pYMZAExJ$AQ~;5)^9OyPI(G%Tiv^mC7b9#9#(r%KcMucq=(A5BMg zry5@E$Y~*JEBK%>bGBk2Z!J?+=(omCH#~LKhjVR@uu!nzA6f8?wC7c)Sodt1a-Gtt zxtibT2HOy_Z7Hf%mAU!%>gPe)Fqjq^y#+hT>jhh0@mlEm*1TJ&$j^>kl=lASFn84p znb@xTU;m*#$EJurm|?fE$T|zDS1-P);+0@O{EBxW^D{gb-1F+$UiFkUO_6&=QpwGI 
z+5EOhx?!#La&JWk6G+y70*UKMH0!_E=}vFVn$T zr^;gK22%A31)_P(fm-3SzF0++8_2d zdMR&Xz19&of-(#|8MOu0J>JG#yeeE_vHCda0I65q0(%9WBIPZZ+Qnk$1N<%M7S7PZ zZOejq)kWUY-2?lg7p45j3vJ%U$}1$9#mywQUP=`zKL)1a7H%w>NYdK~Smb2ZNM_A1 z`I*n)P-Ru>?Mf!?Exmu>d6lImo1=-kr!&3;GbE|z z`rPCkZN;8D6M2Xlugyk=&!k1X-sQ)&km-sSRl_uBCs(y$pPperRkM$yee9++6X}RA zI@76WulFOy-0QFogh=1#U5|{YSG207d##72C)2FC$X!hlcKKbdi*QxH$wb^bs=xf-f+WwgF z7lq5~zH3b7c5q*Skz{**tE4nMDqfh<$I$BM?x*Z0DVlCGSKlF=^B=yLT9Eo;>cZ5Q zwl?;BEt%E^p4dj#f8BrcUTQ$d-#3@hcl{?)`RCrAvNhAymoc<<=SSX|Kg=@a^!Z`{ z`D>b{vbcOmrD#nb{X4`}q)TDZ@h^BhJR!gJT6PIZ0`Dj{O ze`w-#H$iy`f472WrTho2LkZNO`^MC>YRPM*@~V~v5Ny|GC<#-mP{Ct#lzMH*3)2@7 z`Hgg8=J{xh!?4xT)tmivSAq zyT`!_$Nb}fa1GnNbH^gX4TlljiQsJ8LGRq{;Cl{}qkf`-4U0rkEH$yfHx*B~9)H3I zf50P(%Yh++f!Yz!ZGizus4fLmT$O7(28wlD`Yn?|AdQVc>T+&j`fXwWrvX)5j0$XX zH7?L&ji>^NS>I5{3?)qF&>F1Y)JX z27$u;a#U9*VcCJ_C#55Mq*z)q1N$dwPE_g1&lbSGR`U3`5Vd~zKqg1#7N!=6*8PwG zv>ROBcV`Q;Co8d8L>$gW-Z-Z)^?I8SB$*zt%2b3JeHiY!g$1C3WH=#c&qsDxs0810qZfyf}%dZEA(}%30*DluZv!kv9bsWGMd+HAR{}0{C6h z=#QoHBS#sdO>K{stXf5{F!4eB`iw?YZq2~Yb&Li3h!o60|VR6xPNQ*8Nr z4TFQIlNgnykE@z#lU;-Pm!=hv{NOo-8tZ|=<`SA*Ei=KaEFGcluY$I6)r8ufiQJVH z7v6Q}?3`mjf0S{Ug?|cVLc2lLgIa6JdD#87e02j0X_%v7{(P{Tp6FEO2RnpXyGg4bIcK8 z+=z=S<8!OxV_}?aVe?1J_}r>^1>{yf?Oc&r#^>0hhHvTIBieibNJk< zczqbI%_!jxp98oK!?p7y+~IR96T@)rH3@h4+^TqM7;f9k{85L`(OzM=_LFIUcVok= zUe{@H`c>N)2sY^41~r-f=@x4JYiu~av?1GepTTpzWr7lXmueUc@wpOr37=zHX>C{> zPkMzXDw(iZkqDuyRX4JA1@VA#y}{AMN>e@b@*)YR{GHPlGclo*a{iH=zu(^unppTM za{8>>ox`1VyHEU;8Y000h75So3&vO$-Q~^S?G^-2P$EIQ-4lf@SOke66( z{gD;iUN#7O;U)kwfww_Nbpi0vQQrrj7yWkBP#LAz@(_rg*Rt8L8LtkkMxk}?86GXx4Q~ql9f&kVb~QyFN}rjFw|c#o5~yWQly0DHh%}igqe_%Q zDt{dGxA`)>DYg+3b-=!HHH$mtZ~zZ&D`+p+u+l*`L52{$`x_^n3}(es7>UQnMuf*!fc(ewthB*U%fh_Xp2 zzmm)!L2LXBB8@~-8i~zLF?iKqZbr_xo6U|PfPcG8Lr9s8I5DYKXujT>KT?8!*`twX z)x?-9*JlO)UfiBPQigw7hY+VqsGtLpk?(fWb0E7$*ZdL2_`6-kHw05E;-Zf9<~~SZ z`MIw7BaG|kxS^^y2jiBC(71)ddh}ZE!)mpm~<=|gp|7(m`M14IrR!--zne^n^PdXuoq zR=8<@f1`hQlfMfDxXFJat2KHF`~AxgkUfX}PUaRIQfpR&2@lS=KQ-{eB-Z$qse@Y` 
z`g_Vm^!G`+!h-;$JEe!7)Xz?b{_ZTHzfUQ2STqj(-3d6lBS3$*1n94Ut#?qfEwAWj zKr9Dy4Uc$TPf!XHoc>-xz=r-RZkv~Q2I$rsd`7tIQSa1U3gs{Hh+x>n))0w3>Tt zU)h;|h;j>cpI_u>fsO|+7@Pq7J}cmNw}9Uv0ly%G-gT$HU!Box8Jro2brhkxHR~XX z{w>+9;O0VLk5?LzC*)Uv?}MU=bMsJ${HlL@E54}{MgMWo+xC?IA~tdc^S-VQzK{t- zgC|CZ*9twGx_5{aH>V_CUu;VD$J1@{7&U96riFkVzxy5 zb0t4lI$W50ge=i-^|vYo)xuV@qRdev4gguhV@1ooewT@(#s@o&jIJRK)X2uE{#%H%>a_^pKSv zK3#_)^QZq(zl&jJfbgK-_54;(zd0!4^c8}W%Nd0aHWgD`B*wjdt(epOT*H&@{6YX0Y|F?P;7MQ&8D} zUPVP-4u}g*OUzV;*dhj8G(>d?iz$-RmP_J>`Erxc4T!Lg7>=b`6x5yt#_KFW6Kuw3 z6|e+FvymjSk~CCc8`|PKB;O1|Ro^_T7J9midT`(sEHPMOA+9TwSmgr^r0>6tJM0~^ zAeo%^G_`GO>iHbl9vgAOF&6E$V=l87U1{hR_M*x_$zt?yXfZlS^OY<{W4<$Sq)T2{ zELn_>hZduUCs>RgmL94X^sCB;Z#@?bN4I0L35aJLU-3YC8^ zg$4>N(@|_9=gAh-S*wb94a@<2yd3}!sEkz(ToBXUx%m|YXtFKm7N(;byIj{a`_Q${ z)R4@z3RzW^8TzYLTeDeyA_AObwc(4-AT--^o*9G+NNiOeogJvze6BR0807Ql+`{bJ zl&mtUURJgPJOYYKMUCd4BMtCqg*XH})5g&Ni%cH- z2+g)VPG0uOq&SfQTP8!x2o}EG&y|YVorz%CyVf-1xrM18$N~7y3Js7nCAUK-wbm*= zRLCN(I?r2U^)}2ZBp2DGJ{Ti{+3gXIC;NDsRqlit-x3x~cku6in&uBvKMb$3X6j42tl zh=Tm>janCd&||gksuYBI8c)*95C|kuOWL&xqGRl-1??!HFNo-~2sYAx7ew?25&agy zA|oC|tPdjATLe4ilY@w25K*)UcF9=sTN=Yb#IQwxo;@5yj06!Q7Qu$N7(|Q(5n~p? 
zHuyh-hzb^e%C>gF`D5&Qmj@BiAVPayUK{B6mZ)Mupmw+UV<&ekRA7A&sEus?*p!ZQ z6_^YHwP(#AdsoLN71$gEYKxjbc1p*G71$aCYD=0w=5<6B*d7G76G%T(|HshC>$>P& zv>(g#44CmswSoTWmU8J|s^PAs4VktBh8b@t6O=&ZRKwN~GcIx4UQ*ojyFM-$b+%#y zOni=>s;)%CD}wb^&Og-XztPz6oL86%0eCPyASm0 z9h}I&&rB3Cy7{DRHnpnagg$lBu596RHtlkzj|HclxnsT=nsBm``XrOcpJKx)0nB9D z423|{WuXD5SAE(4+kR85CV}PHlTBbh3;t$K49E;R;P9a*o6%ss-AiLW zvDx&B^j{}6$u5=~dN%9y2J-RurfB|_Zs#Sy?6^XK#&HyvMIre@G=B`8Q|;n-H^_)l z=+Mw%E{l29Uy8|8v}?a5xH>NFgBVUIotQ6!^H+bZK7R~Fbf?jP?mHR|x*W|{lhAxA znLidFckrWX9TEFSm)4h?Av(7?e+>O}wZ^Z&wh_w@S4Wd-^{UTq%^yQkT^Lk=G!>+q zL+tKb?TGnKd;Syt*aAugo_jni6G&dsj2s{d72(_&GVAlRK0NrKHLii8R<=+3MD zkw&w<3?<}FxtC@`58iJy0pF#fz%z8Ab5x*@&8)yD1W3Y`p@h7 zz6BsuN)S>^3S8=j+#z&D(MUn=hy}5;JUZl#JYGFatXF-VF4B_DaMOtjP;%hC5n>Vb zTV16#Kd3$-wTVddr_0;TV~*-ybw!VAewR(-4Z1drbu0xcR}W0ea5E$DgTb?*S9+7N zxXPj2WND^P-=DfX)$qX5&jfaxUUcIb||`*v)0Iw?Vp4VKg~Tb(sryzY6it$R)UidaI0As{9Fj-Y9d2ic9%(;D%rpFLGa!#xpt{71#F`oX@0949`q@Q1W}N6+nTsCL zjbV6}t>Xk4^7pz(UsqYXYYf9P6Nz+WONtKs8DV#7H?1AjqPe#F1-pRW&HlcSyqVvn zpI;THDR~P0?QmAY4R193rbHKzM_K@XU78 zAUviY@4BCTM!DK-Yv~YPQ=b_(6z?~G2*o?Ds4H({3vyI|sfPm+=@s_>8c6y#ja9fd zR#Fk6$w_)zX>ZP3i5&RDshV!#su$Bbp_Sz)vk-To1z%ckw;W)sg~e6zYhhu%p465Y ze2x{ncUoFJ3o-E1@M9O?fVoEh$yCGLUg0Vvl<_&X`7;j6#w16u-sFm=@_U)Tp0=!D zoOyfqwwVL+tT^{Sw2@+~c$~L!3Ywv0J(@b{o>0`hP?SUX z0<%sHgqmMol=7DT4ilp2MU6gaKz=YX`Z5B&(>nRB_;jK2Lvs-PPk&f2t38)Tz`r@5 zfB(U9Na)|E>HhygCJ(6WpZ2G;=73jOg5HCy%75~=NGyxblXj{?Wf13-`uU8jm)SxnTvQ*X59D##M%n#o*I$6LuKO zMV@YoJgq5BLqiy%Hpd_$Hl?jtb5!E68zK;da6N4HRzp$5gb#7mPIfm}raHw78PwPo zQaEDIPA3W>G@VEqvxsNftl;8xH<%e}2_ZZT&j|?OHM;~|_PW6hkk0b;l~=~cN6+QN z%v6}oeWQ0B5m}%E8ahOFm&iWMtURMQI*|w3{ zbfsVugn3=lvBo_CPnF@VH-jw?w-vb* zYR(GYvFFuYo*_#DfNsC>(3?K*&5b$J+B9#^ll#*Lne{Hz`xMnHt=9?{75K`V_3lml z&joMe^YS>7+Jz~#>O^LVKq~c3!LFLI1S6_1>OK5y+aVyAVNx%_slWdM{;!B;t-4S1 z2XDu_+}ca{OeMU5FDR6D*YFD8Y$xah!&=Qz!V{C3(6E+FjaVR9Hq_fCMDXb>15^u@ zKOmBBZip^&xJMa=VqA0K&jVQy}ebP3nBEZ1I6x1r5@Ph&(+f_#Yl`NvNGq6kuPrFy&TdYnFyoyA}^sDr*~dyLaXMVFcbaw=mNx^PKPv`=WBlfL#Km 
z&D8Gu!93pFkKB{<7XkreLQCMTY{V>Rm%(sOVdlCVw#}Ug(L1)AxHj^xEIEx5 zo%WBUuny++`o`{Y`iL_5f(&@0*?}h~L8Lk*Y^#3&=2kfJ&mP63H%m$cG!sA>S}dW; zZfLFBF5)={6gg(ktVg8Xm7JD0YFrI=^=ZulYWXx(!tP~zCbBIXxkIhH9@a*^T=Eon zx7tY=*J)FGh~&7G-bv%-?)g}bQ|0SA-e8|tX*FwTGjjw(xUsDl@Tas7^ug;wx}ddWtR8y5#a z5Lmzh=Z1Vm7Y8s9yrJ%l{VXp62FOd<0u~#dkZ;7r0R#kZsIz+uo3)l^zQ5-jp-dVuX zRYLk^7w#-z=~H32&`jhz3)l^z1uQp&(|z_Sv)aB1R(5s=a|B~jWc{;yMCo6u;UPhq zvO&jA}tx7bq++e1jR6j_2aHD5MszD#(9_i-|&m5kDj!a8qY%PzH| z$zNF%_}`}<&P$D~IhGNG{=R__CW&eDu#x4IJ?B4<;}HM8bi+=s(2i-W-lwwO`B@%Z z@BF({)LscjE?Eo=WnVbaP&SCGTjd}x>$B&ubAb(5*4TJEwPa-3NA=JHf~dOCOjfqlX;D+TY09wmE zCmPBIadkjxj*ap0b=Y{dJt^E~Sj9WROxCM=tCg(A&8J*a%w90Cs+iMEF4?OG*1RJw zmtl0B!`a9s!)mgzM!(*wLTYHS$#@)2XU91Pr$^QQfiWi#kfbhd_Y%$&7RF~_bcVSV z^kr6m3KGlMXu%=W1xn;nf=Xigjc}aLy z%<=L*YtSGu<8=}60^m7bfV%fPUNgBfz|QgVs%T&mTyMM}c`n@m$z#xaU5}9-g*78Q zL32sRI{>dU3QiiYiy&PG?U84!Wq)kEFg3&EMFXqZmO<)xf~jl}P4hOp9G}whe@O9r z!;agj5Ly+hp1?r26*`hCDVLSbdQZw_hK;FqOYesxCZQ!9R_%_>ne(Y_70|~^BaOVL z*Y#_NWhYD0nB37F*yv#=s{zJVY%e(Z>XMCXNHEb#_P(W8l&xeTbrPJF>~lzvp9%QL znO2PrR5F!KH$3EZuf!}?aDzGu z4ihpFmJ7SiTo7;edRxt6Hqde?G-)|a)lz6Id8EZmOBp&ScLvMf2Ou9RDK-mQ2vEVj zP9b^K36Ri}OfSiQg1U#PCB|=`b(88OHU!m|MQutw)eTjakTz85!~GB;A&F+G{%lMZ zyP^LCH@BMhaWw7q2%2AS>Lu`>fa*Yy#$eIWEMmQ0{pDo-IJ#tw%PEU#qv7gkmNY~2 zt!A5ZyBrNis4he76&+D)D;b*KYFfhiPXG)uIVA(y=62+KwLO2FIXLj2Kz+ilN~oY; zAz@i3O#s=|y4s8^16pafD1_7_izXFal-H|xrD z9m4R8StpcH?1bL1-ccGHZ(~_7yy`3IC^x$~+^g;{I?7vgW#o2TBJI@ds8)yevzvw; z)sJ<}9|wjfgR4$3pe27dj1WtazN@Q7pfqUt>&QQqhlF9<_Ym+34@#_PFgCC|nIiaz@NcfW z9q5gyH;|k=+3ZwN_K7Fq)89@d^8{>91|ASBU=X$)Ch;-L2E6HM#*3#r%TZDKQ$8Y| znwYg>DJKu8l+TG-ro=*-6r_4^45$} zMM}WrYb26?dhc*Z*J->D+F1A+W<}0uOJH0*wAzFo7J<(yk*l`V3Bnl&2__Q-H5~8? 
z3vGlrR;4Hw?dvGZGH$0`DX4}S1y0}|_VFRMz<|_p+W_-L+Hkk>!Zpr<3im@=!mDgX zbEd5OXY80@%sNChM5E4{_0XG|d>70!E6e%sA!a@={nLHk{i^q+ooE@%zh%q%K^053 ztg}*uRD^blCIe$ulaTfnq!B?Ie!5b>m0kR{%&DZRza+!*>a%4YdXf^JxeQ>*5_LAH z^JmJmbN){UXdqB!!()HkVUUtdte2a!P3{e?htfq+2PT(45thn^{gy873i&X7y{y&9vkWD7l#% z!y-;!X4b4hF`WggtXWBJx+OPV#T+HM=__&#qs}5-R;*Eh&{YDVV=`g|gg!&?G<~C! ziqNrWKi4!kG?Rn#Ye4JI%1I+pTVz0U)dYMj9h;l+R??brXho zMG1da7i@}b<%J1Zuxgov1#8k-2E)p1RH#oyAUcE3A)y>X{DH|O!FwqYJ!>*qo{Tn@adx*0(>3t2LTM*%d z*Zc7UfUxHt1S}6Wu)*--$;t)9wMY#FdacXC_^f@Br^!e#i%QABRnfh~T?81DHpfs$ z9TTANtkLUOMS@pz+BG^KT)R9^js7oUCOqEl@a>O+tus_xKzmla0>kL7H0DKr6ZU6z z*LwYg0(>39(L5CBpRUgLxt6}cWwEGC{H5}n>bbNw)pHj97%s#g!}&FhJ(uE-;S+#E zTrR~`!%}%@odtW)x}bEc-4GaK9$y%kz#Ca)80DPO|PfC zb6!eCc5z2pymJRBZcDapf2QpqSno|_8RQk{crjRch!M+3*05wUn=4H37Lnk+5naI+ zt=xc{f>C)x-${{j-v@!LaTf_R6&5GR66;3(2k+DeuhQ#zEFi8on!^M`H`fo~@`z`X}kPUn(Y*Hm) zQ;Qk&+`<%Vvy7G=DpZlUELbv)-PHC39#~0WhkSr`B1pxVC1!GG)eXdL8NgsSQk4E% z)KsYHJg1QUuwmj!d2vwr1H>8NN^>jjMAYe$NJ`zxk9l_$0NqX$ZA-sXB9Wn7WM8BI zy4ov4fb2tn(ldqGdBdB5%boQg?i$Qh19XRxrUQ@lM!!=h=l$Hu|0{*pV;_%{^9C+D zy(lVoII7LH@gf2@2jtULZ;lZV6Vp#(O{l`)2|H&5vHjW%& z&GHJdx2b!D$0r-yAvygiM!&E{4ShYqz54tzQbw=XAfh(|N^9QXEK5fsokR+OmY=Dl z);6M+VYwoFhedylJ%e@7c_keBe*?F(`$2(Ne~8Y-nyICX?y0~uQ5anpKtI-X=dwkB zmDQDFeK(wCkgm^hhOu=FZ9-^VI<{;Ona}07fiq&bS$mdWgztw=@SGw@8-pRoNUwj0 zy#jnOAtUPeWq%LrjVTC)N6#e2X!s9!NbKTzWwq!Md!3c5gKLY%S30i=@n{)xUTled zB`E+yqXo-bfItuMd)SzQP=<+~NsJ-%2}_Jpjfep_nl0vT&}cb(7=eoN1aZ`<0$^iw zjhhxou$G05%zmYny1biAIPs1R5ge0Mljl#vld=d{CGM!+=^Ni>WmRfWI` z9_<@z^cU(OG^E3w;XY2n4h7d(wI+zV8~|N8ZXl}9z>PbCKA6QDg90LN%xz_ZMbbhmh-<{8vWI2;RqwE>)*G`=9T zEzC1%$IarrI5+i-FUjV)zV`6PR0J3t;FCrFXzQlq3G`#6zz6bY?oqy(M}zqw_>_qMq$sKCusU?Bc-3v zJA`UT<(a0ElTl6+z@jIgtd6$Q9hfncJE>RZe619~Got*>D7o@gi%BbK==MQJWUDB? 
z>KyqBik8lE8ivu@?`RB(o*y$$zIsvCT3&jF8{UkD2+Xpzo}cUqu6$XiI3hM2jwqRg z=(9=l<@=vbk~_Y#aW?FT+M6K)jFvCoYao&~G`+!a8gV&lX(iX+YAw0)g&hpqO2(4f ziCNSR&1ao2-!n}Ndsd=U2^H#-0L-1#G}j;NYMy)@6us7_T8F%9k&`96*nqEVdD@)SE_#h5EmAl{4R%`cJ!6jXQWo zMBf;ZT+K3^o~H#U%f5atDSu1DP{%)K+wRvqs>Pn5X5ix0b}r4$T6@n11S9@0f-$}? z&kTa0C!)}+++YDz(HN?pns*>{$%VseH^vxky`^ClH_|D2p;~Ns+}SfjC}A*kB7=RK zvcqyV8*rkUQlww0`)=tYhG;TiYO*rDye|6m%2 zmB;fD(YUBnnXtfz14Q@;4=>3AdbiY` zx$u2dcZc=!Sf=5S^UO=Saoz%nR-T#3ZJ7dK0(}hnBYMV?J-YBTgPi4sSn1xO zBHs=7ixgFUfA^M(u}`^q175?`B^7IPs13rtM|Vdh^kMxxn$t<1JoDlkObGN$%(HiQ zG^#r+axiq;gN3uqGsEnE6#_jK&kRMg>P8S|xCoh=FPsHiG4#kAhLX+aXmw{~NKbXg zdT4ezexLt1IaDd;-+_z;s=^DlhA1>``+|_@VIk3DLZZ>*-gO69Od^m2&gz8)w!%jm z^@1o%F?C1=PAyl6ojXl#F1#`x<4oWJ8M;@`?qvf-dezgH=OPaX?R4gqi-6kL^k#I1 za-8L&*2sMtsW_SjdeY7;iCoPUv$!%53=s=Xwv3DZO#>0sqPd<}KNv17o)?M9@3jd= zs9n1~ciF__2S%guJQi-a_T9_-PsK^M@ z_!MX}qAR#sEDi3CH#T{5kT~&3oanWgHG1fhIv?9jAdq&NEMUqg!>Op){1DRqm4y1k z)M&IibkHzf!nD1KkoJ4LkM6wUZ5*r2*s;trTD3#e0o<1;aRJ;}Q*{inFISpTnoKNh zO|U?g&h(GAN;=5=xWT_|m1*yzPhRnM`p%Ox4knjf5it)B5LCB1_YDY|>c2&hoB$(e zYL5ir8d!a{2SE>;pob-BrY9Lwml+DalM2454|Mrs1V*RaJh7ah;5jF5iow!TOx%=T zB5sF?+d*lkoRKM<6G6Z`C3T7%H$*G_pkxjZ2MITTy0U8%HDNv%Dto+-4yeKgW)zpD zKNV4JRRV3FBPS8>l#H2W8N$W^#RDwABx5Er4oJoUlQBs$W=h87P{#BVYR6k3xE!2( zB3h1&QOOu}H6w6ca;7gc@O-aXa0lRdt-x~-^qkDN>1tl@HbcSzkSLXU*BY+ri0idN z)HMt0ymO5@61}jKXH7Heg?h`#dNF{uda1maYWnedLzfQtL2w;gQ%e;fyG9`(`zso@ z&5&wC+G^Jg1{>1fMN;-etU)B`Jz>`kHf&|&>8-fL?7vsWhlpVcY%w3i>y5$`cGZb|kE)K)9TiHC0k48d7seCPYpdd&Jn!SL1hpF<6(c^dc62bbRFDY>Y6jNm z+IS~iqy6CMWxQOt`^A`ymuKYBXWlsB@dMDKH=0o|v%KCPy$q^;2RX)YyN-MGN2@{c z$QWHsrwQJl08kJP5630qxQ6K9)L~FQF*<)%t{u%DDO?~sm&j+wjlI7YMWVxNV zM5^ZuYDQH7o{)>F->h(#HA5cZg`_KTofzqc2LMz+9C1nREtPUV0SNL7ynp*$6r@eT5KB;mg@P}AwAJsz*Y%V@4k_*u_-0h?BDSWI9&d0nvhT6Pw-i> z``cv_i&JCp`&}{^ta7M3Q2rJ?k*~E`#C=5V!o&XSW~?1R@M>~i)s&P!0LHt%Cbt7E zAo5gDD}=&1mFr9iS{D>)ZgOQeFaeQofl7#cyRL@FJD_(WDBmneJzp?PJ{NpHtyaLT zikL9)A9Fa0FPwe?KEF>v0=os|#|-J60Mcum5oRF$KHXEPNv`h5VxbnPVDz#^R~5TM 
z3^$t!A}#w@~>PQc9{IogYLwbEg!%1MmFYK= z!t6Co{$tLgGBCVKBEVcqi#R(N2w-_C2dv1Y#fzFaWY;};j)_NKUIVcgZ@tRkt-D9Y z=^?;PULTiXJI%T_o)kfZMr1P!FS;5NvEr#(gI8LN>BL6Tv2TLA+irH%{VkM+;QdwdOq4BanX;#5S4K@~q>6mK;mHS=qw3 zqe@CGBpY;`uy6v@r&pmK(|KY>Hv9N*(L_UvG1*C2<>;NgPjVh$ z*6F7Q2Lpe=KeHLsEm0nXDhbALH{{O@>Ao07Irsvaev1MhofAem(1Iv|7G^c~*I|@{ zDu^mt6hP;cFlsnVO8|x0$Q__cEOp@$?#80PEFTP`1V>nu@CdVr`%M^CL5M{OhcFYk z{xB*UMnx?Oq@g>EiiJ@zi#l0Q9QmyegX=ALO2-#0I2i^fE%;p>NegZcg9RDLT&>Q6 zTf<;s1fe-P+kh~*onUjwycORXQChrhC|ziefbttkMiym@Zupz-8ym*tlKDZiMK4*a zpWuI^_{VM0>7mjm;j3=um|3oM*j{wdg*b9+W$3SgM5kCSG?Pwlo3+@gSDQ^+8(&7G zzb#lam~-YsDSt;AbBe+u?aT1TLARUBt#!Lg9o)bA1h~KK;kVOOLl|Ah`j8OokZHC- ztnWiSKl-Pj{nlMiHpb|jiC{lAUx9^oKl0VML0<-wvjpkO4=}d=bgbX{c)zifk$zs2 z4p82rCF5XzVaj?oL(xlMp;!TyAN>lQ=LDFaPIDs8Z^PmY#|2DB-w$B^!zaT0L0p}K z`OTKp`DIRoFn;ucR<=BMQ59v!%p?~^Qz>JXOF;hZHbMcfDhOfy@|AWt%eLPdU2ulc zbq@El@m4U-62{N=5(zL5lI#f>zl%Z22E+Kx=)4)`KMUeFgH-32`2>K!>?m7fS2?XG z8zH!F#;JSh$bS~LZzksd7cT1K1Z*Fig%|CE(*%uvk49*=ilZ$Rc_n^+y9`{f~I3Pk4u?JmV$vw1DA-$UiH5edH!^vlKTpAPg7E$s+w&L;x>l`Gg5 zf_`a?u^esba;F6ofqoa1YA6Q2nb($ZemoAo73SxQzCm<=^2-%-PM)mpZx|Chk>Vu2&$v zYm*w$RI{4}8K!;IDkKYeMbxXl#Cc}^fDx78{LLYpuWf`^eO1gnGaqzAh#AqQD@NgZ zM1cdaD1fTT`DWJo0d)`_oipO3DWF;i4&zmymo(?hKX+6mM4x@9qhi+uRipFIEc8=HAq1j(^!YZErGo1pLSB) z=AHRe2u+hkjgs*UB{`?g)F9Q}x|nn37r23Pb36tmH>*NOpPI9%>VMPKyfgpS0;sKe zN4q-2^r=5b`%_)>^3VK7L49h`(f&$Td1&Shk3?|hM)Deyyr>d}a9(nTJcV}AhDcRN0>`d4ERX{SvOVQa62sopWuN`r}AcG)pawYBD;83cK? 
zI|{?C2M(A?m)GxwY08)E^yQ5WhnI#fngRI3Q!~9TB2pS94JqZAjbE7JQrQEeDptJ} z-iJWQ7ndpcdvW$p;2Lt#{5zb(>0n;~MNB^!oU@kHPXf$Eh4H#K$m(yE?X#43vyTTO zWZ@~SbQeX`%aAh;rsdN1)ig`cXv%*L_k?WQvGj9zrIx>1=CTR&k2m?-n>Yg~yJfzR zA(F=o4r}x@w<-UC&;~<{Uz)K)Ur*-sQQR^QL_59#6!#G@u)n?~O73Vde6~0Q9?c{0 z{{`@50C9PDxuXRA%U~QrB1&K6lcN=!7#m=|kJ@S3BHi_p?hI9bF`IuQ>W-sGa`q|w z#DxyFU}Tp7yT^5@il(WUt+_oj!AAgB=DaGux$t%`Y?QY*$I_ zhf@&cHOQysL(GpUJ>9&ZpO>=@JG}1qf&Tliu>0b3O0RU}!Uog+Py3ZzTTxEMf&MG6 z(eLzpy5Tu}lTTjC*i+`jGa?NaD4lUfa%@tCs&h!TP*Prsp`esJ`+htj+9eeACzP(VWmH za|N04R(uPPGK)`URz7s53@?KCht#Rx&+=6(;1Ly^9=w|?;8cQ({>cueMHSOefVN;& zS^pt{oHrPKuiNOcnl78`I-paKNOBtY2+2)mi^t&6mn)-e?X z$Nxm!por#;IJF6Wvane1=`W5aZv^G6)>0sbeQn_Cq=pYc%s)?>b3X1~7%+CnC0JAG~4b2XD-H ziB(T>*+=Ech4aP;a+vCu9?ce7Nb>(w-bewWU+4BKm07-wQ!XW+yP$yIsMIgcXDH@n zx^HQ(bakYY0SbG7!lswKgG8DNUqsBjI1aI^KandmQHoDnl1kc2(tQTXLCxDqA!rKT zN&zXv@||}f7GkeVYs#ZD=frAY-ciCo!NT|~P6ArJ-VcEJh8AaZz{7~pG&I5$eBbp; zhcVHQ^q9dji~5hQ>Q92{txJNb7kNGtWQW@HdUa+eY^|9mJF4U$Ai+I|JrXRClWeYJ z&^#*EcIKu$OM%$yT;ij6wZS!JSBlS79QwHC(*jLxSMsE|)Kuo-daCY+r&e7`ieMF*)5vS+~m<3_z@TyC+;m`@z>1 zV-vo%h_h{{Ep?h;XRXk+IA`0nsF8LHFU*v6zrJcUOD7T)YTF)Lp-%J#qjvEf2osq1 zmp`}LRsfyg2M9kL{hgA}l$@U1HC6$LG1TBRR@Z@LS>HxCa)%#g zhtEf+$+1{4>?b>iO`A2f?V*SdvY>96TbOE9ki|JS*6A6r>scASq)8*LA=?B#k&QeA z`Mo+g$TUT$wZ*qFHKVZPD4+hBfX%A(I*-)yu-Qm4fG{_X7M3sat~$(w|c7@z`qN&Ww7Zxi<~wD-&t%Y zge-OH;?#GBZ1nM{?!BI?vbhWa&Yiip?ob)kM@?x7qs(X)k!~*G?kGNW2ZE*dWOHwy zg3{upS&;rWfzvgq6^XSKBG(Yof9v-aM?^z?c5-bhv_#ccGkkB+I=_8t(Ji^8M5>e- zog0K-S=5q| z>O!YbFABALg+pvj7~!*e9PXv@5v6(P zAS}ul%o<)=MHtplw=3n}pW@4&_7TOj|4^#od9QFaBFginb*nEDWZmjn=O$*g^(p38 zGtV!Bjmj>NZ5Un0Qi>324Cw6hpl$3c|17{BPWt5g9j61-@J$2M*0MD)?Cy6d261(0 zDVHD}_9-o{Ftlft;a*E&kz4kQvK~6+RNa{@Ge0{<`m_ASfb?Ua&jv=lu=EwjqA>-b zb(;;0Baig-%UxpMPO60J?j&}I#XI(Pd`oZG_(FURMg_pgKnT987{GFTRuP8Bf>VMK zsK{FoM;;Xb8>8fHmR`bv^C14Munh%>|CnP7;^>MBfQ=z^FHKJ)gvk7#2HeAnb-w&> zQk`a2BEa4o?!$2dkT6oeCvAS!#==e_@Pp(XpDPxE`lUc z>5dl@eZ3|Wc!*JUn_(?~%dy$`g3!7Rfr&m^;Tt4pCi(&sTU~NnNv>?6uYxIpa+ 
z`g~9EO!%%JvFy@S+li-c%A=~C^k8r?+&Bn7*dK=~=KsH;e!AhnRiU9i8R~L?B8GD2 zb!IDj<2~SKSm%>JEiOX>8SJa9*cv;ZF{?`u<8<>GBc~1X^#${mwV~~Ci9}dF%q2s8 zeO}QwsGjp`0sDB1VdWy{pDkYR=ZVprER`18D4K$rgo2b`i z$|7#s!8t!^2Z-96X(aS%gi9?sg3p4>=@X0cQ+2A5unmI*A%SwiY(IvS3u5LWMt1pX zZl^~Wkn5eCl+5zyIu9}TxJ>*$ivQ&#M4wAebP?l}gc2J@5Yd-4lc5Vva1q0F>2l;W z0z|1+elBjE;39^1a;#t&|F{f&xg9y*KEXu{21vx2I-rXo;g(K1A!Ogv_4i!FXe2~6 zL;^SMTz{@bsh!P@a$|KAGm&omBJ#$5BA~oRS38gUqY%EYkqXOCbhSe`&FcsahbXP2 zAqvTNi;nacx&|&{!0kyIa^`E7OEbrZ+I?a%a1jG?Z#i6T&y=p*))F=C;ZKnWIGu@}X7$nhE&WIn<#Xm4ye(Jx5Usz;Rmc~}E8%vO3qrwQQ zo^qXbEc?hv0N+pi!eT~WSWL=EOiVxZ!1rsQ^t!h>_+C{mYr{^iTICLL)w#YFoPs&L%l3%|{lEKtgMQJ1atE+J zH*)d9w{@LCzgh4buy45f9m3d-*~b=NvZVOVA_>>Lm(OIx6o&_!ggn_i0A~Jzuzd{l zPvs%8MX^T+z_&}{e#GxFJNhB$?ibw_=(ccW!W+Ebq70zt(}~jb@EqdrIsW9sTyAj& zp9`GJSb#jfE7Rk~`n=#eK<(&)j~Tymq(4-23y$B*Ib-e#GGYe(_UXm#h_YhaS;3q3 z>BaW|-=%n68g!peGNpv?tNDu5fwZieG1s=6w@*p51!NI?fA)>Rrx*3ksRi2?8E50o zeI&H-cb{IIEni2`)n=neE~p7gUoZssr8J`v*|lYM_t_Wb}E z-gURJ7`071lJ*K*?xT;Ar|Riz!1(``%*w+Iq+tBd3geHjYl=LfLjX+NB9DgmbEoz)4NR+y%-~#lo;i7-Q<*c(gb|6?= zOWUtaW>VnUXEh8-c?Iq~FX&}iH=jA+v%)RB3qgihYwMU1oPZS%p!!99JjZl6uCD^~wZ*_i z&9{|*y?qg#6`Hy0dyC$JE73gw$!F>Zpqh^PN&WEZ;FsP2FaCRH#;@1F%#8oc!lz{< z9&JA1n~Nn+H=j3D+JWz%GgSJspw8y;<_ndD%9H6I(W%R@kXPc^*t>(v&}84eW^z45 zEmKiY-j^Qcr+TXMS))!Ye6djZaY>o7zAQzS`t-iQQs1OZ`GurRxmi;5Q3!b^;xmk5 zCigh4aanLi`V>9zT4NP}*BWY|^5)<3T@xf754kD8LYFk3ZX^S^Zzs{XB^|eJP2*XHzeb+j<6wJ-tXjVBQXn^?KJbc!Ezd1{04P3HYILup6_2 zM#kg?OcnYfdfJE)O!*l-PrMnco;3CBKUj9~n0;qhj4*-uLL=Rn*y0eFKa zw_x|*?)Cmm?qGhxY+1s}$45WJdUV3$b}(%CUULZf$uP)Ro2~g`3HV0|kc%1yylO>= zSSMEji2|zu$qm+Km%fi+!^+hX(l$m_)`m+Ra$DA>I3qV7nP}xk#tTLD#BZLS!ps6pU&l4Jvq@;%t&o?8vD{}W1e>EJj z5UApq`WCP)YxoeUmJ4$MUG0Lb?E{gsXgbD$;0?86%HIYU?5!I!1RQ_p4@%Y59k)YQ zBva#RtGa#8KOMG^2dQq(Y6 zeAmbwd0_r6xDLqMm5_)Dp3QkXP9KC+*)k=NIRF>_wg%W+@a(L;YhYNmcr`*o zOq~)ijrjL>A~+ zl(VRhAuQO|2ptwr8w^ba*f9@go^ut@P{-jenY@pZ=(d1ZGx z7tzA55%QQ$Hfo#NFo}N>aNZsA5Wdau>rUi3{F<+f>uZmT2%trma5*Pvby}gFM=c`y 
z?8I%}2YF^amGDSxQr-nT?4IF2~3y_Xx)r=T1tJKN;yrivkMFVhF^X&iEGM13xf-TK8OVL@)Y4qv~E z0}Bj%q=(yd;?}!3;J%NAe8n)mqKgC83w)$I1lxoUyEu@$BPyEm={DoKadE(Nfsb^c zw@~Joivxq(67p3LWCAL*#kAMp1NSr_*kPP)xbu4RJ#bIMXbt0Hj?cNLafr6bt9Rkf zJ&nV(E!^Q-yu?1o9-Biswq4e1>!(|IYZ%_@8 z+H0-P{b}yc$HBDEHDo&aNyCpQJBzO}=-Wpbyt)wYuHzJx0GgSu`abxun#Kfuvw=&z zY>jUf4_7Yz#uS9sxo3Fj!cR|9Z;4(I6CNV8iNqdY)#$**^~UESXhaqPE;0J|&y6v` zehwx%o&kvaf4Dy%r|W;iv8{H8H_1R75|AQ;7*~ zct#v&7+uF~=vWyv&K?~sTj-Yt4kavG=8r+Ag8hLJHDn{0x~wqixI!n*W7JLA1viBj3w~r`#m2A%>FOO`;($w={?P4ni?t_>`MWVUl(Hy zs}NE?rQ>V#xnB%BPKRB?IY`|)YkjE*7)q%2h5*&ZyT#vvG^{nH0V+OX68>l3=iX^) zgvtLodPTFn?9qYFN|vq8Wy~S%l}>aSQz8jm#sIzv%`RJ`V-rh%LKrU8<=KGI#&xyB zvUx)?hz=a6m!a&nCAiJGKbJw|Til;BueZ&xY^TCZ z8R?^$4#R$tO4UkfmWH}APEE4VqScs;wHolM-wWW^gcA{z&p<46tIevOqjUqAx?~UPNPnhr zOPW8>)du!nZlKr@OtMIIW4Zd*{f@L-R~yN}YP&)jZ1}iK8QvLy)J@GJv0D@GIbFJnZKv4XCw7PXFK*3_y~Uv9pdP(E}b^T}pSsR<$2*r0bg=Q6;J zs~s?n4SMDGg}2353c}6>5H@;Zr6c8TgCixin(s4Ys%JtvXdfIUvdaxBvqh&Iim%KA zTvSi((&}}Dj06bwHWuPopKBk(PeMPl1Q}?i4`J+O+y_?Z0R+IxS1VL432xvj7+YPk z8#mIU3tUK9FdM*HE8jIdHx|xgW%_!`=OcJ*Z281!qkjb7n0#?G=Re)#-#y`r zqh~t!I>DOtgWNiU^gUX-?&XW4qij*0tdu$Ww}F~2q0*yM;^7BJkAc(*@!iFa6N7CB z{|*4#nrrj{vq?Wf@Q7&mBdY8heSr3Sq*0#6xf!A6aG-^ly}C2M<4Kep@p&bFly0U+aq$2 z1ZX|=NBWHeTDxzJB4z4zk`=#t!nZ~zgc29+GB_Z_s|CH!x3`$oGOy}M&e9h=O+hZW zBgCtLvLBUQ`aWUShlN=$7G`}03r+Pi?b#3rYPQmLUxKG++NVa}MSkU$=E5AnH4@*@ zyB@PQHAU_bay|9drpVplmqz=b^j2JI$n}0Yw?;tLf3(qmDCciY`LC;^KTk(jbyV0u za9g|W&Tdu-;X<8JP~>hJ@Qyw`r;%cNGe*r}!sSL?7!L2fZ?>$4worOoTf<9la-b+M za@7O-_a)3&%_+Pg74!m>Mw#}PAu=sH_=I+cVG0Fu> zZ26*KfW)K~7k$|+I;qSInMlQ4q022gsm!!gaEmToyq*+Pn&~U#v!z@j^MWos8Bp0u zWN#c-Lzig!Ow0$p-ueGuXJ-Q*Wp(EL42d;VglsECEZKAf#V4@S9NVL!p;o0hRsz@AEt}L&Sae+PSXeJkL4zxzG1~&iOz0xepMd zJD*PTwFozk3KeeT4UtWY$_RQXv>4Vq*|28fg5yexW0d7}b&(;SCHyOCC3=n%@D3}^ zT0U2Q0x33$-;*??I5<=Rx>Pi|96fjmRS<)OnJ92St$K0@aF!P#s@Z zSRK-^oC9=VF;yojfiZSLj<6|E(0d<$yvHvV}YDSh+nmW|as z+gNt$WE*!W?3~M70rsdAdq-9-IOg?Vd8GjV*g&N?Iio8$*Ai#s;zrB9s#H^Iqw-#_ 
zT(2mJ$eCACr}~wihS9LuZ$vtB@28_fi`$gBAs_iZ$>{q{N6tl_?H^O8ao40uOmjo$ zajx$KaU&q&t8ACM>pNA?y6d)*AVJ!wXagRjb{xYb49K#8`zR8w5!LKwy_7-}&!XJr z@~=*vq3b(a$Zn-Dt$kwJ{qx;8sBF_UGDif9_YyhK-bS|&?3*69)j`cphgok=`52ixaf zh+gxpuE$MJyM+$_fqOeOJv9>`IQzoXx%ZPQvmF*O96Xx^w}%K&DQL0ax<7iIz+u!V zXNBc2Eov~~`iTg3L=tNoQ3ZykNz5+s)B+X5AzTTmPqKA$Q>_$W@YVR>0ofJV2APzM z1^N%vas#It40^MmVl{|;bvhEpSq{|w z%K2N}`pKugHpA4R@h|3l0_*D6C3nHX?&sVMI}qvnamct2Y^Q*{SioXa;%H51Fy8do z+|-i03)J-@N}c+T;>MEc+R&aDrvJLq!S`!#?y%IXRp0G!q<*y!h$TlD2queP005T} z^-iZkjkm4FNx9cp1E%9v5x)(W;C3A0@tfq*pEki3to;E&>cu>~84sn^Bk~lUnoW0Y z1<0rMltnFe)FFg>H=4WE`hkvVH77#5YD2%l?2WoX8n$-V=T89rv?y!GLoK=i7uY+B zV<@yK7pTr9V~Ujq9kV5#ztv41`m8Q7+Y3dl)1jBdHKW&px@nGw)u8&C9wn&`P(7Hi zW%!uTM)Iu+GqlB;KuA-2AywEh#;!hVh+WO2uuU#3i*$ny24TG}EQ@q+DcKf;ZF6B+ zq?}MyMcA^}EZ>QQ< zKYI=+GWOX0z#Q0{Zc~o&vpVnQ2W%amu*=OkCF#D|=(t9hEJ=n$^v!R;UXaAf=L#Js z3BJJa^$Wk6I-X)okjq5!>3H(} z#EgCMj^TnhU+LJ^g*t}lWI{>e>ZYA3_cW=MJLCbP>W?|R!mVhmWup3TuhQrqd648dfVV?Y>U zlQ{M7^RMBMcwP0b))Lp*|En@wyzr@xff$jKW0^3^uz=1nN)ZDSEaFHooMP`LI4IjG zQ$smf`Y7BqZLfI?n($>~?esE`#ejzyg~a8#VFV zPF+;6-Sam73Qvs$8Mi8=yGS&;t{C(}*W>7b>TbFMxe#0vRlhKHHdZp64AZ2`O8`pn zv&TpD)7=2}1W(Omd?Nq+n^gIHB3hwml7Hm;YT=sS4nu_`lm!I>J53S_80&RJ)e*Kw zmq#F?x4ua#UNn%1QKzB{7Fi0Ti%OwEG(KuHOb(cA-gxu;fyV;kJ zBqj+-j^@|F=<+(Kkd>0#%8*2*N;YJSsvBW+Wg{er%qAlN(sU&Q<20K2EwH(=1sX)> zS6pIzAvRG&@g+J*?Cy5j0>b;`LW?Nvc+LfhO2oQt5Jfo9XD1 zysdj-`h&FMbd1?Oj-4fS#&D~TZ*~nhhCee`k~uD3jgFTCIAJve6|8IV1`0>rZL7qIB5xOI$9XEzWOcP z3jRTx?9(?@^sT=@%5LE;Cdd_~r&B$av12-0MN``0H3_;;8r=>@2MijJnQC0o>~?K& z$H*J_X4JH#lu^4aUT`Lhy0>dWEOS~}W?*EatqRqnX0J;JYPF8RYz<%QWy`iIbvYQ4 zgMP{mNg;Out4~H$JN(!RJ(GMr^sKA}zjA^NJ^__`c})Jv_Hg=l5TH`J6fa>pd)%&Z zT#4w8c;WeZ0aW=ol`PxT`zbQUGx&$~3_SYxV?E=5A(f_sE6i}eg-qWMJh=a;H~oO= zxe~(>NMza5fw<<-z2s8JBpnB2lBpyud)ZZaqr6u|O{rBDGYp5sT85d=cnxfjq+dfX zR*JSVjV-&_kIxtHhwvG|XIPmubJ_+|di_H(eC_ONLp(fRQh(ft^v~33&0dBjWwEgX zVi=?f1z0)cIYZ#m+~vl|Y>9XpP}0K*p^>vBJd;m5723gA#nY`dT9K+bw8Y#Xx9KT$ zIB>{mKQvnyw*p2wt1R`XJi_XX>(z@!1O3^+lSI*OYQ=z;(pM%sm(G%}Y){uA 
z@8*;7i5==^@v{YfRZ05Y_Q~FMSe6~)4#d9a!jm8MS34{L5C{&y^BR_#zNS<@>4cxg zLiIH#F7wvE*!OC9Su9yDgT-m@Lzot;!kM_N$yq+76r`ClM$d|6#02Q?B5q%5K;C^1uZw(a$ z@}`n>vS6DkOG>0_i9G61kYQZ-AhM21mlfwUG+s}O#Z;z)C$5IVZd_B>6gsG`srwSu zlchr1?z=XQ;T;t*rkaQG_qy6Z`|N8IZ)9-@C*FOZvZiqL@m{(7Ms`zSo^WCMSvk_3 zCDmV`W;*gB03s;bkKbf@1z-m0WFbKwqw%5uRW0>iMw#VF93*8`Y8P&u=>S*%0k%gwIs@g?hDXIwIh%F<1 zSsamlLubmfz4Q%t$)Yg9AvidUZsBe-%`DWW7@bq4CO{!dTQETo$+ys46#}$7 zAY*q~QkVp`5u>{75a7>SG_z@d z1_O*_)v~GssYS>IkZ<-YYGBy4U}iW5y90l#l%k|n5*ac}z@cZ%LXgZU5TRM0 zwKSO1FeKf~b%_zpJ7A@NDB1wU2Zxo;#i@A?G&KU$B~t9FSRSi+WNuDbP2{^?OkctLw)+ZRo2t4Wyl+cMJIUggR*vY! zjdH1vdhs063s`PV=&{<+GqSvVn^Tqc<_>c2BcR%_k-M=h?bWK}@Abu^9dE6PDBz&< zZA*0{Bjls_D+gn^guPB0yTu{IN_Mm1c1mKOybS1rv7RLrG<{g+Xrp`a6ySHexm&Hf zFnk;1I&AQ}n$W{)mMDS3;C{&=eDz7D_sy1ju)cB%t@ZNWM7~3Gwb*FKFIdK8kHGy#ag9bNopW|su$MYU*nCA+s2zW!|C*xTL=*sIo~LK zGI?y$=18suu))>0PI>hD0lgN$rnbZn*yaapvjBFoC;0(sKOk)Z>|fs?&tf&;2WaE6+;bv(h|IZarI`b-rhvc}{Kpj655C&qnh!6AYW3&KZF% zcrx75)CdcKV(#8Uplc(=#i(Odsmc$r#!y|cQ$~~r0PkcunZoOLj1lFXA8tfhyE#-= zHMQUTD@g=$pl+jyRo$_aXLas!x%5OETZpTOWLB}J$T%Hr6DivAM=|*Xp($B@+Qwo_ zxdg_69xJCaPf$9^u`F=6H<9d5%oz9paQBSZ8gO@`djhy)WeB9Q{ZLJ+#h@})r{Pok zo(H&kq9%1|fu>^3g-i6m{tv>ea2LM|S<$J?U5kCl?GQ9)eE_%X527=_5bbTEAH-cQ z+-Z&eEzd+}ZQ)~nIgni@KFdK?ADAta?jP8fL3MM)Vcom&iomPo*i}H+QI^f_#QeUP?BG5x{EoE@U&GzRr3H7`?3&T`RfaEb>~(k^HvRpc ziL0Ke{n|$ya90d)cZHp*XOfU40AXrp*uhmY+bjAsJ>j&D@r)`WYFI|b`aq+z=*lnz zYr`Gm$TN?gDp;u(3ij-*jE&^m5w@4-MLNcjY_GtSAYeL2gJ6b+a$Y3B?yIF8<48H* z0yWdLBidq!9YmK`Lex~*F^ti0A$Du)I>rI)zUzqiVrrRoTKq8j zHtWNp^BOzG5rhpYXEZocj}EeBx=G52m$!6`gV%l0D9|ic2nk6df>*Uu(-7S$w__Zv zPO6A1f7psd`6VGi^E9@}Q-|D+alV1P$b;2F#8DzXp(J01(LwsXULysnbLQ=7l~PB# z#lSj9f3OV_NPjH1V;rzf?BU9%Tt|9W0M`91?MUyG+cAD}>vG6;_{xZ^TwL1s4G>Cs zI^=eYPi;L8w+m_Fl2%V|Mf1i@=ewR2w(n_04!P zHe*m{25x)+&#L4jTC8Y)w+Hy{*o*JhQplCGZaClYJ0U;(VS+81Mquik{1KixINzU^ zVx7*WzW_QG<+S;9AKmgnx)||l{)?GSjBkjg;En>9obXTP9@Ydy%}@onw=gO z=Z1C9-DeE?sh?(6Zl}=O&dXeXM{0pLCIe_vL-fd2*y%Ab$vsad8U$;q*mh2Sfg0el 
zz8p=%K7o36I^Hx5nN7T#&MB}3tzBkU(&KKw(?Vh&-B*j+C@#_5)|Q8?z9C}>xy<_b=qD8^g|<9KlE)i zpz8?Lhx2`S>1%mhd+17eCO6wC*CPBDv0x#j$w(>o;%eos;d`WPkH5Ua z+xm;t_PmCx4Jx$T+v@)}7Fn;E0#f*nL4-hXu?>ax2ozT_as$lKlN3Dn+`jt2w< zg~I7-0-Vm~2-2T+ouI|%!;=R9Km;wGL+=98DO83?9}Ey3N;=#Ehb`)WJ%i!&`@jI7 zx@N5sdCIQM>l-1F+2tfMWt~Bcz>61p+hHnfu^-MS-5M-!=8;ms9o0v;&vOXlJjZ>S z5XM38rV4RVpVINe^S(F@h$g?M$i z&deOZHz!45F4%;to-w8*IqF`V2tCY2a>P)-*QIspftnK3F{NWziOaYGIX{a8G%5u| zFu63dR^x1i<(ajF_3PW*_FY|#cPSZytNVyJZoTn#BQgu;jTN<_ID}BNhF@+bg z!G%6Oy^7&u+lKJS3WE#X_{hv(BliMkX5Zf8O?Szih$5tRP!1dY^&0@59Zuh}g+aBk zPiNX6qOmjmBR%pth;nva8~;|3Y9I`*&(nPQi~}VFND7#Aprn9E$poR2B{91qZis@| z6>+0HD6BXKhAbAMrxFt}!s7;`*=nW7E1CmN&PkooO!XK$R0)~l!>MWK;iHCvA+Z))2@t@ zKAZ+FImMQ#N<$$lNwQE$QWJN}W%#N7997*GsRTan@N(vvF!YhpJ4cQO0!|7T9V=!9@{)f*{5xTWv8QHF!mpEdx85QdCj2jIwG|kH~0C^)=z0 z>R_}|vGRpsgHRu5`t=#JQl`3oV5j;0*@j4J8v0)`JWZ|fkR~tHM?9Zfised^QJ+G~ zYeTQsCf}x2=*{dAE$ln`@S-IX392@A30*|po$x1SoR~9xvDmdA zEjGp*V5n-Doy;4&q55~1wKt5^pzgg8vcupAZNxX9P4o!&U_k%~-w1+wgP>ju0yyw* zo8zlN5G3HB<1lc+XM-HFp1Fx2ODW2o)FU4qjU1Vw@%?f1F!#%`NK|0eJ*HE(y@ z++8J=fp?{OyW8dvE`hf;?z*~WhNadRcx$V!%W6(B@0P$@yK|e=X%VhnudW#KI%fi1 z+YaX<;L^J^MD^1n1=cTC@w@e3iZ_kNIvxNo-I1l(5l~cD{9*lRwM|D*f6N5iv(F)r za#a;hPJy0V-{QNxZ?tv=_K4_zLnNo;e7ED)CdaCiBe9BuUP^;KSPxWp(cfO9xf98~ z@f3ZGG~%+|u`52~>3GKh6d4Z)W(;*aPnUTTMsany&go0!GDw4$5dWh1L7QZG$plFpr zpZ@teQ&WWdLM!9N2Ao2MRZR0(XvOU+V!t5dvolr=rwezy3&=Sg&0a6M#l(`pFLfRDx+vX?&GAzGtB7?RK|ocj$3=nUZS)go z6SM?ZgQOQNMmg~aqDq3E4g8LhUPtA?P6)m$v@;fgiY|EAL{pTHS5OHmlh7i3WhwlY zqn>yK`Sp88ghGrF0ed>4+DeETDj`CWX-XwMRz^mI>xkypL39cFiARuW-HwQ{WJD~T z9MLx#A-bv&BBa|gh*T32ZYqFe8^KEjc&8KoEAdI|281b z9Y?0NUZCocI92sXniBf|hFrS8G`Hi3SrX-(HB>O&V%R5SU89h}^ha`ag@*bG3qq|N zKm>Jyva^eznx@^h0@8_W%;sQyk@1410@_4Roz8dxBj#M)(pOZ_9jkbJ{oF*;;TRvT zP%{f}HDRkW+&s~OI!JoPuXV2UjQgr9ezX45wM}W!^>i9f=x%e!Cv;P865Z(?T6Juo z`PiHm+pf2r7v}q7>Fi+C?P1Kw=`kuxaitY^*_y=b`iORMtimAH$Mv{|yAG9OOOB<4 zRw)N5ZEbR>D*1kO@?CBN(}JVi$XRCK^3|X9?Eac0cQ`y;l^m|Fc)?4}rxewx1tlP( z&sEQOxq8M3jlqf{fqzx!k%YWPKJ?9;)WU{3%B3T8A! 
z@o|K(UY-RidMHCTXfePdF0WZ3b$2Z`XKI--F3$z)-(@sKcM-=Zc zaR5~@B&xU9C2W1K>3ikV?+{FR8#(MuF>!z*SA9Z?d-VpjK1NdB^nFT5JRXBB8%dJH zdZ@&+_KlXwH2F$fhEz`Z3KFazQ^C4Ei>!5EXe?FrB|alPs>{H79JWD|tL-o&pzey9 z&}q&p==aQ6XuRnm^-NezOX@$GWxlacd0lT_{$*hM;6n2oENYj54HDWgE__!HD)$6K zp#(h%JT+?+O!E?hLjmHvn?7}#dLSDS^^N_p(A%*P7klimg}27Tg1a#m8lFyT$!I8> zETUibyjeHcRueiZ5>DlYj?{GYV^ZWU>(SoE6*i5q*jI;5j?i_r6tC5l;%Gne$VFi_ zp+8Eird*)^iHHC}#^sEhW&qK8Nc0o*Gb|pgf=6++&@Qe?K57V-`MZshb~2k3j+kh{ zRfQ{Uncb?Zn(45d1a?NXA<_C=7swPEfzwXXRvk?a&<8x@xV^YA86p$ZhDa;Mz0?8b zLJdSwX^DReE;}yCQ6=fJumbSSSO1tPaDXk1#Nte6{Y?Oqm`ho!k?gf7?`biB(Y@xn zqi40r7xl%vMU(UG_sjI??Au$~k?^iG-BwzTRv3($dN3_F!2 zvfS`)RzFUgRo%$(EouhY(@0;4=>D#__NbLJ#-tGf1k5M)_?KW6xDtZz81)GM9Jx-DDg>6Cnx zRFr7Z4JRqZ@?}-Y6H~QZ)OFdUc0&Vz zvug4_33t1G%;~67Ar46{kWzEQ937!=DA0Xo^|1ot$0t#RKrk^@)zy`bM4>c&TqpN~Gp0SKO_3J{ftCg|afk^<=J?-yt$4glHa8 zSt4ND+g>WY*RWbDM&kfbS5JOr@;mA)YMNGZPq2@^%Z_TpS^8%erPY_wGBg|*ym!+8 zxFb_lb=m%r#|?t4(mb`7&l)3_8)VHposT{%Ik1VGFlaX51eUcmQQB_V@Db43(m4Oh zr371<&;fvR_)6&Y#1Sv9sRRN*?gKvRba%A$RR~XvA;81>BZk0)y`KPn0>$FAX7muTfz3S~MqE5~QW! 
z9^6uvedl`!)7ExlVsYWrtzaD=H=SFi`IpJ0@ z+C?aXt=mK=^Dp+k!--!XR~NosY5op25$xT@yVrFmhxY>{m_+LEJmy-o&wt9DXLl=y zk#D0en_WnZM0jQJkF2s2{%8qMk-G0}AFj8tUg!Mx5tOxOvDru{xTE-jD1L%kuX@Q{ zW)gO@%_4Tn|1eQUM(9hK_-c$>;H@ki?5-~EoI}h!e`IT(+?gDcJxznEvESldJQe~`p7sTv zH3d`#030?p`-!VO);6Jy#O-J5SO9`gkY9B}q1P%HUb4~(itHO_ZQ5bwfvduH368Tq z*FB%<%_fXFSS>|YS8t8tm;lmFz;+X<3A7HIer0m!MX7>3!tPd`K!-&ZH-I(gHY;gz zY*V88rmro_q@$c^h=S9F6;(*1W?Of>-3O#!5)!Q(fGMLuAa311S1%LhtN{!@`xYDu z#o@x#xh@lwG0#W?tFEftVUY;wQMK4!xQe?wRdrtKufY$6FH^Df9?t&Pa-!YuB)W%8 zQ~?ex$XP;3)d0GzY>&dR3Ov`2!aq{mw3kId+eN9lZQQwug0fX}L2s}HzVZzay%WUSBKQbe$Z)vx{H2E+I-^&B0ihTXo-Mk6Q9#EQ$jABbs zv~KZ5&^F!#G&ia@=1C#~G`EupTTl^5)jg)8@#Q>lX3Rk3ZG;C$<5}T!nzWN>-K2yN zt?D&Gkkwt({%IoA$R_MAgOHse4!a%wN8oZW=i=K(;`z5(qu%8 z)-#z?w0V4j1N7j@sy%jpY2LswnapTZI%6;1?!6GQ96Xs7EEE zL0LSRrU-%r&E(?A;4wi^+JdrpGHn|K35Mx7oW+x=(;#Tbg0grrih4niFqe+QSv;8* zVCQd{7t)f8C-b>F2okW8izoM#M1mjzDFL3$_95^tHE)L}qlOW93n1y>X7ox+u`ciy zAQIroJtd8Sw-As3Pe$Gc-Yt03`7{W)U!vk!uk#=nTW!$IeLtjC~S4QYHFTRXn!-6E#f_?dt5wh8i4Vq1~uc>2S`yym_exhK>eprFAIe zj{sx^>BD}#p~!E}F z(W1?-0WvoH_oeXUF@eblZ!I~2SXov+2m~x;*j1)Wo6n53DhB~BJ3fHSrrt+@vvx2Y z0_-5s^*Jnxjt?R`U0zn)Kpz2s4UEKDd_mX=Zu{;$sH$^VHuog^0P;H1q>W8LkQ+XT zAOq?igCIYqV*3d45{k}#;F)6(WP!Q$3o{6EaLC_I$CIf`1Rz7v&$DHsepv=UwyMMs zfLDKw!N&^?K9(&!)2A(p8Gu0*Q^I9BFb^4!slBhV!gks9Cwvj84Grl6ye?^d#MoH~ zKHi{?!m1pbPJ_1TcO6N+a?EY7{K0zG|N;)c3ujS^V))@IA2O z^XX>*D;#LIp&mszLkohpUf}T>z++5d8+hzRxAxLFa}LKYq!in>x0Lt2q)qwqeL9({X}aHNdHbQq+rLd@I=ktW-?FJA(02kKlh3R zQgb`r13T_>q|7tqK$Rjf2U{EfP%duyBH2KJK?_7M5p?RBqyQ27;;oMnA`E5JK>I!QBXk=r6l&N zQRs00U9P}mEfWy2T3!U;@eKs|;4!;RR+k1Id*Ll6Eg1;m&4J4Uq1X68?QRFSv}V58zR_@Sp_L1Nw*LkM06k9Rx(~ z-vzn~q-clJP5+d@m%vuhK?QO(a27Ory1dvRquVMDy1*IX3}9kvu20(ehL zVbAGIJMKy<9sb_WpU%SXw^HaP!aqy$3f+aeVe5O`DuNu{c43bxYM9aOpWakAd- z)dFFb1=IdM0p0_y;Di}KZRaKo!nPy$gDPg%C-f~ZIjU87b!ca-Vzjk3Xh3x$wQX(F zLsIg#GxXhbP2gnN^_0FXtTA0kcAR#%yr&-PO<)*=f$nn*zWf?d8)S3fZSvosV;T#d z;OVlRfx0b0&u8#uou=miZN$52BjZz$McOeuDAe?J@~9DWh8gxelQ(JHE?iL(kzvn+ 
zviZr~%Qc~QHBd_aqbBrTVA)eW-ZN#}8uW1(K>P5hNxnjdPxGXec0CDU=`mth@Y15;x^Om_5~JeQ=$E-#u9rBWm{>sh?)(E7ED^> zx7qt`HuPX}FJx>SL@hI&y1!I~cGLjtVwMm-x!J`)MMBB4){-Wa4jZ$JwrwCYrH-UT zbrc0H+dGLV9994<-~H#I=g4a#%p&NwYA1(YE$8@_LU)LrDG==+;cR-fXStK@z<&AO z_5|4CJMZZrk5(b?R88(HJ4A~5?5^+4FcjD)RaXNj|58nFTCerrFm2Zb21GvPj=VqX zj^~#dAp7#_O)T#U)?nudcgkAQOOuAw19ow{Q*83?TZ4r7QZ6%s$SprHzNnGTlzZ&I zv|#HKcUy8~M|!o7_Pr+ja}Ou?k4ksptE$J+iT0;F2G;ift5ubBK90F!0#R9a2uL{M1QH#B<=?p`MIw!y< zQMj!XvR4y2OuOJ5IR`Mr>-i?gmC(X%w1d8sx8vG}0=--&E=c0=$WI6aWM&M z%CruvgJ~Ggn(q5GI7FW}@~4xXy-F3H-H01?Ue6kkz>!D&Tci%!6f?3MY*ebR`%RBE zck|M7p2+{5=`cGZjT|&h+4P9bl5B!WO`5TXpc*Nl*%)EXEOD2VG}6+T+H^KHH50Po zA%v5BjhQ>7>PEgy>O?6QU{|Ubw5%M!87m19_Y;wCB!n}w5QZrLkuaDdBR|}K^r#9P z{O-LU3rG*>t^F&$U+)`FUiW49Jo=Tq3(h<5>`3&2i|5aXl+XJ7tkTF?rDv6ue!gsW zBzo!6C6V~zR_A|aS?LTSP4k(AuDdiZ)D{k%Hs!>^O=>CpPxr&#IuVaKQ^Ol4&6!$! zbN-yEkyOE)snMPjVpB_R{77tSdB=$tOih0*e|XwQrUSB_^>_MdbJcWMnmoJHXoL_z(68t#7d@YZU^UKdWeh%~VIKp#61eelz}$`IRnuFq@3OCnA3GKbgOP&r|+D6`xab{1;5;h2O+< zfAn$ZKQZ0kxsGP^x#<>u*3nN)_sgyW+DOSwx8#g0eQtU0vGth7?y=?Fcy^XPH{C~a z{Bz6O=u5YWHANthnMGM>2vGlH$UI+%JRw0+9d*m0+e`2~7cVzu@(_MRKmQQXyl-_lG z|Igi><&)c9zqlt$pW9yD4`k_c+iOpbe{O!fU0FW4?R9UCe{Mb>*`4K|o1d#5%JRw0 z&r+%W@|##c|B~aMo1c50arCO^-2Cj%@z2fAHQJl-o7i4o_>=RW*j}x#IRA z|A=vM^>c#%ET<*M|06m6d9&88TiLq!3f$H;&TS5!T9>x9=FM8NxOH*ftSi>8&6~Am z>59d(%pB_}YO??7Mj775e?hpEl%WKc8Lj TYHQ`=#lOYR|DzeoARFlQ|PD1JS2KiHQ(O%xUzpi?jmQVfa5G$ z`T#DXnhWa8ygdCD^gopIiBJ&ksZa!Eaa>*pKQ*A16QP)D#)Im)4}zyWrSVfSdHPit zFZ-f@Di{66ZIArJbLAs#Cya9*v#8!!GI#DZkL)(%sdBvO#EoS$o-QBhIAhzb#1G&D z=mfpoy9D!Uar^S}@~GSe&#LiHf4SJC&hAmw*`2!w%R3ZvF*O_COWjw@rcl{;;>Hos zV_8C*>Al~&{G!}-6!a6E$WI^$i#L~+mtp?=SvB8_UZgq}--QvP&MfPj_NeA9wft-h zADUTw-lJ`Fejh{+{5kzHC|dx%gGzdE4|-Seq8$*Cr>{cC{4cD?nIQCNln2bWm?A6C z+hEoTL^Cuyc+HG=;6kl2ZnwxYJ{UNq$kYD-Nu`_$1&xn9^7I!0H9qpnGrs`167NJ? 
zXFK3VmC~MS`c(4|%t!?S;ioIQ6C~A{EAn(NwWx>o_JP^mm3ROXrw=To`pW50y3!YD z0>F%n%6Xbn2{YD&9;))XK5`;q?rOWSlrX;;F#G+g@rIwRmh!k2kBS!q0x4EAG&*O; zSE=+7OhXVvCQL*St{r~`#xK7vcg28zDEBb>F9XuN*1Sm!-t7v@YA!TaF=5Gf6u>~ieI&)p$f$BD^=D&{PtLM9+VzMMXGT=rfxYOokw_Kbk534+uzf#fs2AsRMo0h`o~Zh)bKpUMDy7}Ay6 zVJx+jRsqtzFlQ{amDWQy!sb!&;-u|(g9d$pKC*nX^1iyRv&m~htU&MGXl^@L&^ti4 z{%~|2RdiO@uhQR*o>G-iS72#+@R^T0*AvDgj2|1T4xx$7#=lL`q=>bCZ!j#1*nTOnQ}VOX`UY8> z$iM9ZD6se%o@6lJ=&MrTw7Nc;iFGibn3475BedAo4fsf7e|@B-ju!zR!Ii8hAEC9c zZoo(A%c21;)nN$uXr$|hd}LicNcr1caK5{0hkb-+j}!lFEdWneK6cc3bPwfp(=Qx8 znZ`}fwhQY*_cUNJxG=Ar^e6z0aNNs{aLmU>_;W7!3y3#Jt6#bB{|4}OA%AEs0Spho zjqF~b{B~Z^`T`j4=|=pU0B;xaZ6oj^eU0LFzVZ4nMR7adc#Zf0#qE6KEB#xF+xbRo z1?ayBcq4y25SvE$y@0n1BfK`K6u0w}*M}U%?fj%Q0_;AI`r)dhiSeOB#au2kGC7&a zPwC^i@n^I`c$kD!@hILRMC2GQXjxse8A+x;wX0W2Wp*W#d-~Fuw9=dE%Tz@s$0x^i zGMt^5$PZV+p=_Zrt`#ztt}0AfMO#m5dS(>Nl?qxWmz~NCfs%toGFs5Ist|fTq80T* z{;=Q_*TO9^jvA@zT$vvkU7IZC3z?Cuo}I`Z&X;tOtM$7kLK*a~I9$l;!(*8u#LPvx z$Tp=HT)2kDCr7w87nXuHp=FEO9k{3>YbG^%hV!|pEX1*Xk~~&oBvSMHl>U z496{vK5+icE`GZC|AY(wj|E)x=a(+{pBaw+;9H9Ob6wyU=gyx8E*={87MVYKh&##lty0J4C`z zB&~Lg_jK)$PQqC5NO-7NBw^UD*)XDEu`1Nnd>6j@I_$z|?`)>JzTVV(VGD^Rl~&&W zml^BtO1w2ycqK+&)>!>_3!Lq;Se92(EELq0*tmUGR__>Qp~8j>d0g=Da8ZvwNW$=5 z$Z3<11(SJQ3&U6B)}o#rKFDUIjb%m)*_JhgJk}E2zn@kf&rzt;&3}Hr1{O-rvmDkYtWhyKYTxGAW}RaYz7vp-*BX4s zAw33HBm4O6j>ON$)>HjiG7STb><5{n5hjTCagI*=&jHrR{yekK&)ZCmaALKP??Tha z{%&Sp{2oW!PW$*h(8&G;wxAm=Gu1m&TYY}tMV>kZV>oY-obmxM-pGCsCk2$Wn(Eyt zzuiTCiplfu2Cl#<|CEdT9FsrK1zGEq*FYXwV-CK(>=Qy|IK~0rE6(_T-^Km`_HyOl zC0xDJK7J3uuWtWbVDkL?#Z7+EMLxzhB>&EFlYbxN8_oYDn?K)YIDh# z%gVBJl-UnjkJUS$KX310@{uZs)j7%Jo%~}d z!3d(0?0~M^n@nPbZrYD&(*9MWQ%7Mx7GfZQR^=n2)hL}NS|pY*VA-q;3I*?ZukVeG zaoVXz^1bhQ-uF4@JFramsqK1pp0GmTFLflrjK!EV<+A|Ah6RV zUu8d%?N}ZieV1pxQOj5)+wO^uPV!7GI?^WTWoe_-wQ+;2Jdo438s+S(tFr;8JX+wH zV)u=q<^6UgGN(Bw-4cl5$yt%8elJn46`XBfVWNYsh<(t1isb&W~F* z<^5a?zRse`O!)4Ml)EN|gt?#+bXoIUjJT{)`V(`Wcdr%mJeEuF{bvzxM||#IVj`fl 
zz8p@U5<=b&`PgNs7@~!r!8}s>GaVJ+IQAUZVS4&a}PibfIpG-59iZ-ID+h!k*#h%Fis>uNIACFQP*K_#Zm2{ABd zlBm-OiSN(pYmstuVxnEKmKIZs#T3Y`2(wC;dd7L1^&3pXtYWQYLrRU%(S-wqa$cN}B6n_@yz z9P2_@(C*So;3O45g$$tHN`(0SIiiAt>BW7cOj}b(vHlb)wzlW~Au9Vcibx`W2^GM6 zM3a6-8G}M91QcrlTiM1=p!-begUE-K`s-jDd9 z$>55c4d>GLO%gwHI+(e_GjF2agGx?TN5qJ_*%1+D!m4FV_MYX3PDQ+b_whq#ef9Tw zCWE+;atak15S!nXayL!k+@}^kJ&79d&&#=4i=3Vm(%0&E=5I(u)(-7kMGH!(Ma!HT zg2$5Tz_TW`3eVcqHzh7MaPgTx(*k(+IUbwg_GbXPfLd+5`x=kE&+S&5oX&}WTXcAy z$1WIfh>I2t#tu$OsJJqR!*E=Js z_ce49-dV~O9pI^kErf3=rHT$_?P}OX_%0(sm~wjV=d^;+mryrBWjcN1b4gf`yz@NM z{E#Z?yrhPNd;qeK<1*+KJ5UCBjKy#(oPl9y-dV2JAXa`uN>7={xAGuDw~JxbaR7y} z29Xc*+`cA=`W*ZC^Fkrqdz->qj%wFSR9*Q894;|L$2c|((;i}TZzB%p<;3sl`;l<- zWf@3J1$3tI#P5JTcC38b4S-9xop)Q0M3u{7b+3Sq=r|m{ds!0xDW$L1(E*+TXm=d; zKICKTF+phbW1eZm)@%V}HfN?kEOu z5et&AAkE)20U19cca!19Z3eeXb6d!*?T5v1V|utTJw|UO_d{a08gATZaJOjLfo(`% zq?16+4+;g!5?NrK){_x+lOU@dCiIZ4tnM~P)DBBTZLg8l_F7rpq$`XiVlhR5updoF zu-tmu4A)^$AriI#W=WuN8nI=JoJE1g$f=8SoYHr^gQqUeJ18ln6hKPjRF_R-nlJ$f z151P`aJdeW%YyQQC~%QQ`$2*nq!C-mRTP9=x)?_&6AR^5tV0V`%J^N}?Q`F`UnpqB zIQKj()*c9~aV~FPw{eAYwfjZ4$GOV0s@d~mbBlBN`u;wr+>VJX4qsy&H^Du@#dbCEEN4E`ZANoNZJ6;bXW)sIvep}<1*>7{I7Qb!z zYc-P1bD&nTtxngq+d9q1O;6cY1CngZ5rR;^P4wI9gHM}1W?TK~>G#iHxY%jyOxGQ# zeXRzk+B|Lwg39j(&L zi%LH3G_!`s0ljBtPc4Cy(2uuoqx*GJDIa5h{2E@$$G`_P+~_f;?H#ZK_|ZOBw%-7# zhW~L1ob4Fcx+gg}81EYzitpT&8W#k(YbBotqyZ0#8y9zaq9Lj$QNz}-Cn8T+*^1S!k$!{w~C zD|)FAPP!Pltd~kfe)M?!KN>4|8u#CYhT6_mUe?F82QsT9Vv7+`pse%NfkwyK```e+W4Z?M(H%aVfo! 
zO!f3`XT<^fx5X2~J%jz_tNY6KvtZJhFV~O;^U34SMVYG1}z5L0dbEks5$?uTOU$-5u}2i3gK2+-f#tVP49 zXX;We#tqh}$GwR+&F|6s#_L=9)#x?={cY%PypI7l?DEi1^{e%X*!Oh4eykUVkahm# zB2qMr{0(>tMz!_7Rb~Bjs~db{{giv<+%$H*{t0+PUmX2AgFbted`h#j_5iWp>fgv| X>Xfxgt?cksZO@q(w59EU_r;w*lK zl)xJ4#iehr?xKycR-g@7w`D6Dun&>J)Mg9RjO`DmiD?>$Av6i1j8O(_fwalax$j>4 z#r7)_SF-QDzk9y!x#yhwo_(+-(rUNa2yZs>HBy*yl#qrm7RDhu43Ro=1rc|NJKMDj zyLQOxzdf>EflK;;?|w;t(l-ucwaF$N`XNYEFWQBPAv&-LnIsHE^&*TCFgi5_l&4`p z-hmI1wOKhcBTS3}@_~?)d?EOb_&Pv^OVs9IveVZG-+O(dV$SKKEZM^o&;xkFcNhjj zEzw7va&~f@PUIS)w!^C}3bma9p|&d|?uuk3-&(3o-{RC$UVY3Z>0zI!IepqolD?(- z`1$9NS5r}(_7Y4{fu%ujs(BR|xSkw72k2+)1)Jd3!vrh54@S5Z2$JSEp zA;74bNPLGY5|1~@T6k0s2R4NpgstHQeShQEbNY0Kn0h=&!3k*f=Ly&aJ3m859cI{Or?L`^@ zPXSF!=3jbP7KMk-i0TE0@X#N$mNla5Z^FbsfNKf@s0xl@3ikLNAl6fx)kT{yu^z_i zqFu-=!J;}Ag3D;)wNpU46>Y@0?G=(VsBEFc&ko2*Dq@Tn9fhcq8 zb#2+Tx3y(^9w3s|>6A3+x?Eak_1yKK(eQsZ0s=pF2!}jNaCYj2b;5)jMzZ!^@Lg)I zm}?=w0VmIW3{v6HjPusRo7IoN0~Y{PKZ1Q&@W=H4Lyo}5e&aP2Q%uIXJ9bdlHKWB5Y0P}S9bF@Z4@k`RRN4MESkFqPKM`A3wZ`lB@PhSxm^3xxYBMA8|NkIsf|!= z^fAyEcIi7{L3lC+NMxmCzEDd~4g^vopfE|2xO!Eq@H1hbNa z^STT+ROY-cpNm4e3-!2P78pkp6_nyCgi(esFCkSvnVW#qPgKOdDQ z&@}7-eSj@jiQib*E|c{KjmkO`!k{wj&sEa6n}O0so18hP+=AqOSv#w~d28A&YtKFg zOTif^gdwP(XF~FZGr<`K12~S1Mt$s^oc=YqhzkQ8MXi=|4?c!O(5M)R)5AIKp2nol zm(-e^db5DuSB#&$9!$UAE15;L`SsuD47oPWop=d z)dh!Ar!G_|SE&n?$_8~|UHZ=;YIy#PkvA~znt!f$68g^hUl!sC=6&-|;0BFm+#ule zo7F{!63n(PK)Ji8LS0;!?!n#OsxDS2*Q$$^N+LLe=z3Xi8!nt5($s-Nt^v4Xws#V` zPB;_7&xO?$JiKw#2wKul0#O(9SI&;^r>8>KVX4n_d;~0OM9XeqmQ~T?LPKss)lF#L zEok0uYM#RD6}RgKIoH;Kidp<{J9xana%X-S;sWhE31AlkVc2J0hxsthxs9Us3aaV= zJAH6fD#G;7VLt``s0U@+^bA1Cm7qFA^7Hgr?PW%es#mxsK{#%5EnOkxKd7wPU6ZvpyFEu7lG}U8DY_ff3dtR^$KA4Bc6Zoyw;YCP zYn8hJ2t>CB7@FO#Eh{^VkhYb3my-|=h(FA9`LSI%u-;n;1jayLot0GJ3QKx$ahwh}MWvrHE59UEd! 
z+#j@7qoWwD(J;S?VLPGU$mq@?211Y8+uWWk+HnN^aR}{DE5RwlRaCpUOuHQ%fi{^@ znc6hL^Q^loStOZK89VYLsBPsKVNsvw2rcGAQ(#+S*6WEqpTc`2U6M` zgckXd1qX4JUgaQK!5{o@w+ zF92UMa555Yi`?EUMxwX3w(e-z8Qm#1M_QtJ zhN1qUeuczi0|Uu;9z-p7CzNO}2uhD6qJy#FXg3h-PvuFvV<-B&Njs zqN)D-$_wu9AL@xRsa)t0Qbn9b7?B`uF9z5 z%iP>ckR(PE@ic@+N?B$smym)JzqcnEPp6dRpcO@TfufK**Fr8~O}qfF71%mpj14Or zJ{NHO6oSwY$DhhLel_G3HpH)h56Ay#!s|>pudqmer3ptGY=~p-b2&i-p&@Rz=V6K( zxEIEp{-+lBiwuV}g?um2?>5 zLla)OH3@N<;^cEC`n4v!%7h0@c;VKB=XDcaxHSoR)PxsqO+ubF;ck9zqu9`-C7n@ykfGFWhIPPG$ypPzLx6eEO_rN zyF08&m)#)NWV|r*yT)Q`+yL{HVl5QSWR>2Uh->DAdsA5SU4n~5K8 zv%$m=oZ?VYN%-Nd^!k(%i|=RNPxM85M`D8sb3A;;G1hMkF>`L9s1SaYX)GM`E9+o{ z7%M}0Jd__T9~ARn1Iw6apOqi)8I);dKYzXl655KNJvRc1r7T2O3tt{~;f*o~?=Ym} zH38??_Q0o{{s0ynX#Ba|m=*4CKF9Zqa{4Ek{s1FHeYnSB{oe$voc=DRpFi(2F{USG z133U)IsG$C|3~b=P(SJ}*8dD(<@DFEqRQVNfQ+_c|Gy4sIsOpiZ)JSEE9Q3qb2*3_XjK4VlfSfkozs>kLAMbx)zRWoYJ?G=l1;)?Pjl<+I#y?ZU!1vDy@h_)x z{tK|5K+ZBk^dCZ;ALrOme6eoCyV%2p7llK^XMYUIVtzd4;@S%k8D|cDxJY=>cIcUw OHIaw`Bh5bcU?$FnS)D~%5yNXV!37#MK!l;B ziWuCik8zolwp77K+qB2mX|y6uN2uz>GM*)wBxrA z%F^!Px_@-pOwFv1B|DVX_VJQrq;705lKYg_AE02SZfuINunGn37R^3nzpu@W+UK;J zwK+H6WUP7TaSS9iGn_arE3M=Qfx~({pT2gjWQ?97akEG_7YpDk}VC?p0buC_`&k9CT zZR=5}a%yjtbkwjAN&T1%dg@y_cB2|J;wNT0Id|yx9JMKRf1z&wQr(E^YNvE|#6A;A z&Cp5?oK%9J2Tih9sg)6B%h{94yuDg#b*XM|P+KrC3KioWi=WV(@>O7K&{9vgKti`q z+uPJ0!c3lX9@1RBr2z#SbUW$cdgodyj7KSGMfYL#LHlr{w{>b!4Ec zIU4`SP`32z_D;1Inqu)YCzZT8YK0LGsIgRKb~N>HJLpC#nyuTWnh)lqrroMmpbg|t z8}>T25_uE!C={9TouW$y6z->>6J{OwR;gu1{H)=W|H5!elcy8EKtmdSgJBf!?&E z#^~$~7=B%?x!%cfC$hWW=se;xI2%zJy6313nDjx; zzKH!MiX9RAkEVPkYWH&nXR|TLau%IxsCgC>)12C-wt%qd)J|bx;5&^X*aq-Q{#9w4 z05Q&a3(8vJn5-qv_>>1V7%3x65YH;dM-D}~JQ7A5bS>+x&H zZ!3Ns`0dB97r%it^NJVIXHI=;ofuVUuN5*2`g&M4Q6Ag|+kyMo$N&RaOjljVKR1Gc1dvp}(4>pVK zVE8u+eQdzM7Tu)#Yt>?I&?04Gr3AeOl^5wyT%0_hv>ir27fn_swxR(v0pGv&2^vrc zds1opFJuk$tw2xB%8$VPw`~FignZLJpvmvS-qtpQqJ=(C+Lj^{p#?6Ylzn{4zQhr~ z(mD|(WxEnlw!hK&T2Q;_k57Vt8BkR?T_l<^@5o7|=F9MSUp5kFaWb4hJ#s#4lCpiK 
zkAhEPQl!paNI&BVmI}e+5WF)0$vXqe_QD%Q8%MzPXOF8~aGeBK`9t6;f5=F5%6$%$ zEm9`x>{p;p2o0--ZwjcSos1~+cEeTG1Rh127kJcg=J_aqe&`qVVgcyKa#Uq|2U-6> z)*Qi_LsmbU3h2sqwNC4NW2obh$XCh!BH8l=d;X))D*6sF`4giIi~i#@stL3I;fIV+ zpOUN|92|7}pV%YAU+AIdmE_&$1_w3lVADTG&5ne6l;l;QBw#BtEon9utr9C=VaTur z!;f=7#xhQyqlT5_n;#3#dYR6F-c^Uauf^MhUDN$wf7F|Dt82SZP;AN@gq=;_DRz^R zoDLPVdwN?vL++vXDKnk?DjI5o;zYoMPBuj2-H^o$n0B{;nW%B+X-bDM%+FDy{chr|u}bT{ z&vE&ye2Z*(2goouf|B2Yik#1yhtzLXBMn$rBOPNEA)kE+u(Tm{2E7$UX2Qp_i28zq?k{EMvx-v9P@ux{_S> z;ozW2$H3#BZWvQYLy@L*7;Bz+9HnpGs?}<@YPV^1yMg^=0R>kaN!6KRi*L_QgTJ%B z6@SuGb=iP=#W<7R!0&_+zduJyeY=zpZ;60@k<=mq+~~SVT@=((KPDKAq-K@T+2w1g zB?SP8>p|1)1!{e@quz9#Q+OA^89g(6%{6#Qx>LGROHCuLBP(#;n~~JRv~}P#prf$; zOJT!RPN@OB@xBg^{WxkP_J>+Yw{AZsCIGnZe+da`yphzeTJW+5$V#i;MCCj!hy-2e zr`hmZdx>8{do;TXuV$zXU4qH|PzOMUnjUO=OU)((jHQ0w z4mx2L4+FzPcqV$pi6=CKz$?-C`@}v?Xr9^#ljxv<={6I041m>)A2uD;!+yXhM%)Ex z>pPl7|e6l|1%un% zEo?t8EQG#=Bh;sxYBQDvn^kF3aAAmc_WOtr!!f(p03OtpEm&9fqyZxULP4$TZsYC^r3AWUQp>VW3|LRi>&%m-YEerAIRN5w@2VB&yn(Cy=>YegMY?$hm~ zs5er-bf-{01XvhI^nS!y+AErJ(E~)qpb{tH%K`>cWt5 zDs2$!u7}*ahvMuHa2u@?Gi$<_xOlo!ybD`sov=C+hf#4v+0%j6+?hAf9qvSIAMZ#* zcW8t+I5}U$YV;yI$u(iPf+j=mH08az2MRtP96(AI6eiQo5t)V#k{M;aO08-KEVn~{3`6#Lg@^GWQ2LSW>ASa6ar(UH? 
zYe7!;1B;sRSJAT~jB&Hrb+knI-W)aR#fjRhj7G&=-;C@t&`=?L}7f8{Knp=o6q2$eQ*`>_>%x z&5E4@psK53>a5h{?)6Z)wh5eCXus0-Eo1}?xDA=aDm5E+|FbV(r#bfyM5Y{o`+<#j z!N%VPJb0I5NeluKbh`32eYip0PJ${3u?a$f(rm|Z$zni z4!AM9S_u}8N1_Hn?E&@A)k$l{H6|ns-p+sspi{HDT5aC^e^0Q!Zp;p_i4`L+CU5 zk;Bg9qs8C?A5+U$#F1!9(Daodt@TwRUErIIbd|53BLvqRUlaPqjs|O|Lg%rT^gts@ zUafCE=#5j`(<20}(T7rS)zq!&p@Fu_w-rkDso2$GOxV)84+V(s_XYz}qYs6kH5EHv zi~=Of*9&Fl)B*O??NOfT8{n{#B>>9Uh(SGc%4#&@gC{Y(+NW?Y%YRaL?w0|<=e-Ot zUUM0eyvrh?bE~Uka0FdGGA{Tb7{lKq)9YmVDw)2jf$2*IRAI3jpOSD8K1q&a1OC zQl^`b*NNRfeVg!WO7hTIqW7KRY}7*@rg!SFPfUOVMzVMrZo#_?!+=&slQxJWp9e-0 z8L-Rg8(PI_%{!|&k+un)leUR#0#L~J7^yMgjw@sGu9#C;fuZ(5-y}Ce6P3?g@031z zBPJeLb0aDgXDhTDaqe**P*VnnTwMBgVmgIucu}DLi8+A>O!!0kFp6V=u8KK1)5XT0 z3r*g}-+C!$DqltL=2~D@r~C_1=jH*Rgn)s?OS}#j@BqXO;5X|oq~bSq&Vte30XV1>#^Hq6w0%ZDVKr1gD8br3X2jpSQa*lmk%Hx@bZPo7kc?p~%RV>g4K$}syZ1X?UPh^>=VgdHOnLA{w(2K}atG&m=#68SkY(qL`YY~*WYq`?JQ z^~f)fkp>&Hnvidln~cQK7BsNpCvce9AOlv|hz~f13>yvxeoadb{6-5szxpdiycrZ8x%~TDXwT|gV*kMN41rK-ztnQQh#V|-*CZ6Da z;}xI~G2FAM`%5+Z6cA*Gke^oaA|&AZww%HVn5{Xh?qj6b;6)=`?JQUb^lzVHsVtKE zc?)DVVK)>b@XRX!j-}b|HNf6w_ZacR0MLZ~r{(?P1^`X@(fBjb_{Xs9e#6Ok;j=*R z0-n!N%h1W1XzGb}sKy@K55}VejV&k}ar6Xe@nsNC<98fLfhgk;&f#Cdu&-r+w*aX8 z5=MXPqX>>dqLLiES9lkP(=l8iu6F_g0O2$wwBAeYgqEU}eDwyQ%th~jqlc)t2*6=> z`g&ksmYNyk;*K_*z$Ls1wNo10CMugCc9tr+JRAVGC>L6+FjUnb2$s(%0HqXXaTfd- zkQ5zZ1@8w@Od#^jsd5!zUx3toSd@Y4*pFl0z6ky@+q&mhg_z-4}}=K>XOi*mtmF@lG?&(02E`jLZ*!#HkBnG(cwk znDf0HOFP@MkD+)PJxsj*k}fUVN*d^G-KQiQTYWCkdMjT z>^FTmCy)DvmrGt$B2J?^2wuzyt|i3u9jHa*u$i+doEq5^uImP%xn&F{mV-w$-6-id z68%8~z{UWu0j{xbHGpZXC{iCbJh9AGNCOrkZ@@C2N1A12D|s<|F^Iw2T!bLjg}e-$ zN@dm+LZGij$&0SXuN}Xw_;uj7AHQDw5Yq+cii6%+#bqkScMxkDR~Q>+Ku09EOaE_- z34;%HuLd$_D0di{cYYF?V*z#83_d0-^0~{?#IvHA2M~>mry&i3fRqk~R#Fcu9MmTv z7{*q)6R@1vY5F7+APu&!Y;l1&YP>50mWL+FQ-;e#Q|d)IDtFCt%2(?l2;x>C7S{dp zf^K7MLDPN@N;bH5hp-BT47{l<>XbjCOl(0;nsOkLd_!rw4+>&-KM*M5rgVJDcG-xJ zM>O)V2n+5u9NbWThYA=-p@=^sfC_L}ge0#BNECRL{X@=YO-1zaJGd9m4N!sB`F7A1 
z$2>`qi_WGsAd(6V#ney?iSXC)(dl7I@F1iVcui+ja;QlJRRMOsM#ulPzVAcXaQvzY zk<9PRiie*b?(+4HPMCC$PqgO+R}UkiiP%e?G$bH`sy(wDz%wu-c)BZKBf<(esxrW; zDK(cO$-A@~abD4a*2zHcZlT^&jTupD3ZS+C-T1m04~t>AGnej$oE|nps~C0#hh1TY zjw#7XfJN*Q1j9@j-{@XYl8Zz<3{cJwKc^#9+Mp!!K!eeQVP5nHp$`>D1j*wF-V=}P z1+nA>6Ghy_P1VqB;+1VKoP&KaNV83N7t_B@Z1t%8*BhPkWw1M%QMTelxq!lavW4xm zK&9cBKZJ95KW#6FXTn|>&osSw<~-z;?HZzoz8%PV@yzRxMcnfLjc4ASj%O}oJaY}> znN?mq^MVVuh-V715%J7lGoHE3i)RiM7|$FkFrGP77?W9GJaecf9nW+Ls9BSaXI2r& z48=1$4f$oUer`O|UB7}z=!}wl6srfwkuP&?@;To%O7cf%h&WuoEI;7aV~IjCaNeogs@{d!BLna_qmjc$KTOyGp?1+2i4ZJ$C&u~Dt?B8~0`X2ib%DiBzV z#$QDIP)q!I5J%xV_z*MeV_kmJ3m1kyK-clXSDE!eG~N$$!ZG}qFvciCRv4%OORWd} zvg!9K2z2R0ow=H^1xPlgiKSyg7Y$X~O6Af4XK9Gj1%R?DO zs6Ut1q}#8g4`~i-E7~3c96StW>(ldbTflTGZH@RIjY&LAq=EA`8y>uZCvyjccvLmv zP^IOB2{?+~Pt@DQO-ds!sfCv)IY+or06g)mUh;y0pe3S?2E%~}Kw_B99A-WPI6F8C zkg1H=yU^heBld0)Q6aMiW-W2#qli;E=phEe6eHUCdVuJJCmD~qU`&3Uqodbh5EP$3 z@G%+MI(2+Z$C*})h#mj5;AP;Th;#EG|9YSTOMTAhd@aDx=4nb|O$z8JWPD9*6}LoF zM&QxCQE}D?MR-JJr+Bl%Tx~jq#nSA92xdo;hm|&}0SGAqA3(@_z%~vIA0FQVMmu1Z z95E8d0Yr`ii0qaVN5)!p{M2XNAU^ZWKM>WdIEn6N*6KT|kk!>+u5=nY$G~K#{ro@dFf@J0Lg(6k+cXXezVHvAFpAh|@YQ zLRwc~C+G-4?ORxQu6`~rt7Vu`W%D=?SlN%)>% z0P4`_Xg3WofC4yr-tRj=BPi+N8#e;*XA1xVi#`m3iz~5Io~2+lTELo)voS1OWXdn& z?)$I@KQ2StX}A-M@4}yJK-xv|re6m)zzbF2%DaYPvLL^Tolx3d;L_V?u_{XIkC~6l zfF`Zt6u|+Ui$irB4%J+&3E;&la^T~v()uuTA?LSNA`MtINXOvAJXh{ABR($>bpo3q zk8{4Jir;7@@!8CXm#>LBh1*aw0X2mW@TfuUgp7(ws3=WWOv5A!nPqWYQ!e65Yn{o69FDD34e?t^-x#83REtPM=;J2}EGV4n5Ew{$c z`pKDfS*+tH;hF&JvNG$Yv96qTV^}vWvu+0K!mOi)yct7vN^m@Sn$H?>&S6HzXH;?# z@Nt=|;LRUDcQ6`17%6#YrxO^MHcCgtyFMKRLPG^80+BqX zw&1P9{PPhTe%g=^&2kETCc$zcrnm5E>Y~TdLtq(>@E|bcih4nw_0YmS!qbu#n~(FR zl^C375B|F0LGgzT29ITO<{9&G!}7|%m&fia%D zk@|Q&LW~F3qfuf!$&K@%bl%vIi3rRs zK)Z69^|Q^p`pC1W_14DHNjWM?*`L-mXf)iXNO#ORP0 z;Tc%*pP0D{E%a<=geONtcybWo`BP?uCtpN(^1rW#j^5`*cnU>?r%*(A47oo-6MG2u z#2c&uEL8w3wLmS1$-7N?w;p;-*oKn)vS4vnGZ9ebRH2H{&um04#g}9E)bv6;#7$gL z0-S$gZd`5!Xj*P@Rc=vLZb4OUuqroOyVxJg 
z$<5xq=Z}AS{)L;tm>BzD*19oHpw0hXAIJR>8^M$Aw_6%L#gs?^>Ag=*zQ^jPa~S3E zOYkkDJbuYP3RMC%a31=Y^Uz1{xnBaR8|Cp!*`rX|F8&mCl*cdO8}%rUU&Mr(KP~E$Pmw5|PfzYAnNj)*ewX-$=Pf-WHwpUiyrt=D1U+3EUi&SAK0I$} zj#mYJc;3=<1xtR3Z+PC)^lJruc;18!L&tp3|8}&JwDLUkb?2e~ROla`|8%=j;~lp* zFJHX4Zo!fzbxli`Hr}!1TXnb3kKeU&{;~y=S}ok%XMFB$w7)SC?c3+azkH`BbaN;B(jtggN( zGN*2iR%J%&hQ@LG{AJ7TSh%e2jwN?ke?we3KZBULKHl6fX;>MDiHPT8?@iA^d%;&P z@yzyI2`wFexiCbZ;h7rg6Ef(pc=VBa>OIpLsb?epHl5E%J@*KJC5qq)L$g1xI^JRyjvxyJcB-DH z!gxMSelK`NlTZ7LPm@1)9{Co~`Fxr@|H5Fj_Sd`OpDNEkJs(ZJ-IGt_4zKt(*yBzJ z(-&8!8N7GdRc>a$E90Li1PV<1RPQ*D9yw3@_1?we)5ialCr|yEPlG4VzGXh?a{B$8 zCtsGygnXkXKY0Wl?BYqkU-aZhut|@4@+0YKxitN#C(rXB^J(|wgChje^Oxp5g8I?s zzsMUKQ;C$@kbvLb&Qn12(*!VA0(;aN~ZYNgAx57 VQIzt)Gn#y_*Z=oN2#hF7{|9uQ_`9n?}R&Y2htifY|#WKq_E4M z6bGf>LWoX=a=*(>V;5a~%C=c+_tEcRg?8N!+Ql}3DnWfTk&gmWH7Z&V!5~_KBKdy5 zbLS>wh>xHaH(iZVf5B~PqjYYcs-MllJ_@8qN?RCZ0)^G@dv0NL{ z>=`=7Y^ZhWZ&oz>u?<+7sNGg+tqHG=+RMwL_U+}?mT(am#J2ZiiQjBOgEL}&uWs!M zZ$w_Rf6^sMq0i%`%y#mY>ekY5NsSX&r`v171RYsJ*6Da+pEl*- zgF5=|Ms2NA{2lFV+f4Ra$|hUGC3u|?xAz-~lc^8X)S(e;w{8uKZM!2wVs4)b`hMny z46*HCEb*kZF}xWanPOX?Zk>xKMy$ufJ3&D8F#h__h=TfXb=Je{CYI(HDEw3GjdC(}eYhy%PV|fjnc-^TTp#4lEC`1TE_Q6-9TNgn+mFj-HNn>d z_NaBr#LR*a+qB&QO?v^RKk&|!cQxz9*uZfk@mv(*#RiTUsF@k-ecgu%ZUwWE*d%uv zqV*aBa#nSmv2OM2X!0tC8NH_hM(Qaw^=58zxdFpT-h$MZG?4m}Uq>29 zmLZ*#{7a-c$&iWRxxJbwxGI*~6S~y5Ictj)BYB<6d7>Tf?{qQgX0n}050ic-gG`1z zrd7{i4Y(pY-dtck(vzHJWJJnL-bnO0b?V-Ljd~?=u_7zHHeaF>oAPO3otYTb~gl)y-BvyY@-A_{jN7K_mAEddc0L` zt`3Q$MQ;i}0ycf>7O5%N^5EISw1kiWUtrLNnm}AkrRZn>Tcitu#x_M^iwn8NrPt(AuXrtXh*f~Ul3m0BChU09z8!EZpTZ(+EZ`o|c?5+hJnx1VI0<_x$BGw5;eI=NK!J5N*s$Ijr^=_Pr_IDcsC<36 z&h*!Z4RJO5RkX-^qVor6jLtW#BaJQ z6+UwP8cU`0z5&*g9+%{8SP}r+l7QF|xK;K(h$c^ZO_H+X%KOly@;)QATRl#fmzft> zR*YxSMF^`+>KR=$1Rh0N5O~yZ{?^CVy@Mfr@-glQ=j8Bx%@UUD@Q%C?1%qKPZ8oD(=zPiXjPtVR`uuDZ{Rd?3=_9vK;Fnda>`Oj?gT zC(_LqMn<$$j~a~}5@~@#N{0bqH&8sQ-`XX8y zPdOMD>4xempRPiYX6=&JBhqugNMoQ8(BIHjbPGF%jd#7Z3FW1Og63>={jGna^l^s0 zRk~toW>ZW(OfTC?`=MRVa|&;y@pK~}t=Ol-PH6$c31g=r5eu=zfN2e2piQpt%(&Xj 
z{#EoyMSBfH$F1{P>S(U$cpL@p&6P;zA8ye_+o9fLp%zQsi3%l_IuGUQLKZon==ixy zgMQLufEGwLA)S=G9%)Xpm_6MQ*h$r6f3LE?H+xT^zwh~2S9UMS$6KPJ?QO01?NEzu zg*5|P!pcfnd73*G2meg z_%bW3O3jQ6i}aVF{;9pHp}uIS$MwiwkyfSZ_o_U6`!N4#BB6vx-#!l>xtUGX=inx^}KF9EKR)D2`2zB-+K^;9#IJKC)^Ac z1bEYW$~qQH98A4DBDOsh=@WB@;olSQGuMYMyU~}C*b_^92!q^76Q4t7x4SQJ-p6F@7T(=*ma1`cptGV+Wo1l~@ z{plN>LI-qoTn~?l-h-45iyFdPw_cX|0S`8l>NJRkRHtsdJvdr383wrRkO92{A~ga3 ztzjLW@)5K#;K|zY)>4D#iBb)KGM~|V+^1uAv0A5qMClewjNW&A(J9XYj~*lpFo-;F^>lXXaa$s3URlQWP8l2edQN){u{NeU0PiGo=M5bS4v8NjX- zAbE}CIngc({=N%;-T2#%zaISc<8KguLtf)m`!Ojl7F-GE78y!bdzjptuaOuAWYzWt zeuW0WQvGdTG!riZwBE2c@SscTX}iqWj28E>h#tAwI1Z^l!r7CAJF{CrAPzGc}OQEuSJ@Z{5)_%F8(I(453j}H1q;*svs1STC9SyACy8BXJ-hM zFv(>kzX5XCBUh4KMe=fALQ`O!4ja|y~rhlLnah@Xuocog4WcPI z7vz?p{0Ed5u#eEv+;Wspp}aI$j`Et^YVGr0q^`xoVg8E|KY3#swnuT7Wt^H%1E4r z>EUenH$pUV214X9TOhQ^lWkvw4mg!5R$alWLS_rD$ly&lB0ci7Nbds9Anw*@!EQwQ zHWX=~?N~A(@H)VbjqU{&BGu_Cu6L%o-&69rA|idyd5-AXy8%O|f$xPZXsYO<^f1)ky=?OIF;J<~#G>M9Hx} zPBe#Ka6>zE3UnAZI+Z`A?U%&VJvTaKN?hH8Q~9wQo#LM|w6$6vy}Z&W%mmDqmfZ!` z3fvRi%QAf*-2en}kFdE;@qIjdN+HZ#r|L(D3{8(ad9$Q-Jp^LLy8c*1>#7eAL5%BR zR`CD816T9rh9f|9FfL%=81Re=N5Kk|{n&LL1sc*x$tg&4k_Fhrzsi`^{MWIKUjm^l zCo1NjK*3dYP%25`F1uC(N);&|f>M>SkU)+m=I_ZeB9@PgSx_O%`#md#LMGS2%1C^j zHVS3WK`3SZ3Fx{Sf(GZ5phVQA(h~F>Si<~07*AJWQ)_anQTrk6K!rtw%$z#ZK7(3Q ztwXs!rwQf#DA%h^C^zT8&gAr(RcRO)e$_pmEzB_zLyncRrnst>wq8)tB*i3rWJ*1x_z3N_Syy17zOh%Ju0EL@Sv18a3XnqKq zkD#2Xeq^d2>azf9M7mF6a%dkky&KI`s4d?(IR^0(>E5$0A~!OKyc){D{(#7jFL@8R zf1Rm{^zV<4jO_e50=~y@1`Sn5Pl7G+k5oH*cAjS6C8M@L;xdryjy2l}{|#Uq4`zEC zAvw@_&~60wb|W&nQ{0MJu)VGnNQh@?JaI&`*Ka_}8pm7`yClXU&c>)zU(?!8tc_suFe0|8!y_^Kfo{+d zX=H47SnZP`UNH87isK%EUf9tg*nz~cm=RrtY5ksOI{p~y9YTu>u$f;Yj`{@HBz~oR zBZ7MsBDpBLA=FsnSzz7|WK39WdxjCxAvBBIv7j5f0wSl8dSOapjUrg9?9_C^lVjHN zMgox!!uWP6M{~H8qnq~M1kJ?5E(if|+#o9#!1;-7gB-_@1h@*kA;YtCoIp5Qag>Og zP7ab!#!Q~$tRzN<#1gq+!cAZSI4H+U>IQUtrty1- zI*6VDlb?}MVE_v27$EzTxP%-+Mej);!@#|71WaCr4F6W?WEl#MtGKFck#L$yu^EC8 zca@r$W;Y7F$Al=Y^*$okQR|JE^)@8n{4fu8PE^gU(X9PeFN4CIoQ|bnrG~JuCgZ~Y 
zp35n_Ad*vOr#KY8h(=0$3gXdEU4*Qc13M!)j1<9PqzDcpMQ|7?g2PA=946r0i9XagD%6pK_sR_tmk-cXu-Qy-TOBx=1Evt9;a zi!=>rbr(iNxtDNyHTPcu#~i21rkm|TK65cMp(h5v(gZ=<+acX8SmoZ+9HLX(N~D`Y z9@4Q}-X73cIDZXbjwR~`jS5X7df?g2hXK1s@TT8tFGAvK z>>+O8qO%sAu(u80Kt{7~DP+$T$OCSKaHnqI-JmX7(F6J0(UGq9o3Px)8NA-gxHFay zLFf!li<}XihcFw4tNZbWTPWvut!iEE$uPk*(VmF`7&1GWK8nepscF3str$tYhKS%T z0m{4spc*0TkHvivqtUDAs9T&Sa5Xm zM{a}{T8p?e_`espE6=%=!NP*?1+1F`+HWyy;#+>6-pvAFe7+}F>v*d(MWBf_+cJkhz^aY!ycDvsXcInyHyp+ z6OG&+5uV3EHQnDJR5;rdO%cR zMn%*nH>QnPIc?8WJKV@P-n5`uTQj((5g46tm|h@m@tocykF%)tCL;t5l6)TKzfm4a zP;V1ab9oX_&ru!2iGjg3a}-!rHAs-0^6@+7;<-+qK`R>sL7(b6DWEFX?B}qt3tVD} ze$(0~#~(sd11Ox{5%!mfpYzjP^3TPNl{qp@&d9jfk-Qf@xZ&R(`C%Yh{(4z!o?!|4eRYmuRH6L2_5742Yj5WVys^=oF0D8($bC1gH z8OFjzBmE-%4`cS4i@r|{g$<<5OBE}rI-pt_Ha?q$K#Ip|q#nSJM+Uh${5>EPf z61t5BQ_n%g?dACGg8Op@;9C5apru%|=T_2pgm6gkV3_T!xmmH-0rx&eeV~JE1(=N5 z4{VUdho$eu5wo2|r+F3p>J0p9fgo73Ys)fc30(5K@b=WO3}6K$o&%6WB_c}ZfdtZQ zEK0dG#5gopK(pS(xrkpfNL1?rNC|1J;aK7T5Ur6w1Q}>n7jyg7zD#&nqFU%X0*vmL zRx@z`;bfkar5Fana+zFMJA@;ZQD#PX-^JfqFPIAu{0oL=SAzp^_n*Ka&su2}h=5YO z8@*sMk)2`fCDxVwVPf13!wICx=VIOtc^WftR0QD;5(JlU>xfvnpJrm08^yL;gR#W# z3|x(aK(7ahf1B7F!>SNP#+oTw9f&v%rLB#IvNw7m?|#mA5TgU`N*46BAJV<(0d9uc z=p3ZZ(7u_`XSquzI)5T#%`e3eE#^1u#Zl`xC8fXMWuHAqt+zA=m{H{R$9j+9cM7it zOe_y~+IoqhW(Mm@2yKJJy0OIB=oEZGd+j_1q#MJUUvQ(;4@>j|2@kp9XA=P77cm#o zxz3f`?cnwhvuT|HD2`b#!8LY+&be`LLSk;YHK1Ae7&sz2uV-iW!?IROy^W&)-}%=6 zB!{l zw=D=vj6zy2Uq8CgBM(aj63%7vO5y5dJkbMO44=0hO?Z+~-x-9G#sZ^gmE~RXrX$q` zXP8`fdK%Qh%JyLk6lABmhca}b51qjhdSFZq(7kh1fFiU2Rt@3B2j7byWC+PXnP_h& zZ*DNtO%iZ(hHM#~y0(N^meh@M!7y7~aEFes5ycr};gyZUB+ZzPK zb>6oZ6FeoT(CoD#Oq!TMn;H+p8yHl(FR*uf-NCx-L=|wb1ifDePRx%cISX8qMCE4+JtI9 ztIJqDItNs{b3k=~)s?Jv=iqX44w74blCNTQRn{D;ylVMXzL?d;S=Gf}buO2;iVNg& zyUR=N07R<+`bN-u1c=9rgYQ)t3Rh2Sv(^@hjgAcQK}%ie-K*lqNk!3Fw}r&UD<$;C zkD!3N85ajU#bV;rL0H@a-hDCkS;Pt(M7mo-X7@n$z!iOY7qkqfOCO=D4@)yQ0JmwU zeA6reyXmq-`n?YsPV~GGj31RIBXT?)n^?emJKnTQ>E))n*IgJzhs4-4IDWrIKwmb8 zMdLl!Df~f9?Z-j?^|(5W&q=?;J#nrR{(+9~&JB9R#p%L%0&Qc7=e^I642gHR0W~81 
zat;i)g-^+@zmbj^M8he@t>3*0m`>RS)2aUkAOZvA_y{%+EmH1}G;x^yJBM&?8p2no zZva&?MjmtWsxmS1XN`4V(PV&!0C|`!gMZsh!e-89L z6YDNU-up6Ch`jsd37U(%`?(2fio6>?hfX5zM$UnrlZ$Y*4dmcD8$i^1O-_hW>=0;X zP6>#niWCv-a^&k(q=;TuBi}3|-Ai-okY6gJ+{v6K`88g2`_QT%BXSyz zoK50eju+f6{sm*y#lO@d&$WFe6W7*6DDOUe04m7)kscXZn~&oSq`*BS1v1fW07Qne zp?kKxLtXG!tdsNchaT21!kR z=!d?V{Gv{OO@4_JsL3zgI7!PdU!SAp&rIdc&TkL+cU^W_{!Gxc{8F?*TkzE62em%~ zJLh*4e|$Xo+|tA;rmh5()9#{=YvNZ}d-3B}a7Iq>_!S_42_8R2nCS5=5VjZ2muZT$3+&-i|p7md-g$NiVT zaOvzH|Ew@7PurDa^v`9{|HqeH`uP5D`SKV&yZ>K@$NI_czw0Yw^z8nxaN3>S5TD7Ec&P|fo5dU$Gn1~(68{}e8;?kqO?k?OCR$JiqerqAM*-|vN?;M*&<1y zzv09F$E+NMX?K2qS#seWcqdo7@5Np&xoCA#nby28xloyQ$DMa7(@2;$kwf@o8^2^8 z*w!WUz`i~!PnK1}1N|qsESKK@K?LYs;B>&T4f)A)&DC*p4+ zGoDZ5??ao3_&vYzY5W&HfxjJeKA*;4iNPltzsqI+RDOOmnuxzU!|&moOz|R2L;ORZ zT?r^E3F8lC{QalRpLcc+!87Dz4*?{e@%rMGz4tSp zz#q!+d-A%yxpQTnbLMj>!(Z+eN7JGVf95qxr>uOu_p2FxmpYc#j%UnNd3;{WKV|rN zu4F$(hJQGli2V5ZoCg0y``2`x%h;3Q=l&6|c>Lb}QJ(Q*lcjn(Q#={R!28|j@i(5I jXX<$00g_UEEmK8#dfbG^6{SoQ@t4eS`Ns=4p8x*?^HBGK literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell.o new file mode 100644 index 0000000000000000000000000000000000000000..acae445562ddda108a67b0d6334386ab7b5fa90c GIT binary patch literal 25824 zcmbV!4SW>Ux%X_c$`;~ILaD_oO0bJ271P8<3zq5*%qFvDmc#;*UbIPA68Q)rH(3P5 z*3jJrr^{A)Yj5TDi`&{;@2f5Jy#< zeee8!yK~NSp7VS^&pC5uliLGkO@Yg$*q2MWTFLZmsG@9Hm}&c2)2~cbe2TU}yFO?Q zYxA{v^XD4TcMT(P77go(dwiR4Pu%a@hFf&HEAcbm18Ax1rxYm4j6pSi5`2<#N1~@( z>V_j|M^6{1u|J`q#va6XApVB>gEJtuqZ1Pz1wdLzq8*RHE zLuO(l;}(1rGZgv&VWY1H4c%(?qw_Q07If5-hGIR@Fd?yKVxn`BTH*=B52z)Lr3u5M zCmPEFs=hxyVkVZ9V#Ym;qNeIY@#DI6cQL4Hz-;U^DRrBBLcPx2k>Xj*b*6%DUnG_*U0jZbb<`Vrv|l@sb*Q z7N~9Nd%cOT?;yJwebI?@=gqrCQ!R;>p^2NpZ;kx6gx}Wi+a0(GyGNIR32eWr4nqCM zdKX;rx2l)f;NWz1au1`QIEQZQW}Jp>-#sr8^KC?TT+A24?Os)lO&uN?Q70>)@GDTL zdfA9RZWz{~g*R$9Xg6x#)|z&ylXbx|b@0r{NE_?JT#4mopYEGZ}7@nJLSh}z5#2F*;4Tfa3`$~0dvCn6jzG*tfDgmk1tjBa~VDpKSn$@RU{V?^= 
zx@*XumfNSs?t-`U#7~Q$sc!98x26(nD}i!av!2kIUrw)EI~^<)kaMfC6_}@k>O^LR zgz2jWe$aXx96j7>(RV#oY8hf?>(fTE-Dl2UrYuuTvcJXdp@VewW=sKx6OT}GkX?kRH~6M8Q6Z6T-H@@7Edgw@kE}5g@lEH(XxAE;Fia3q zRWd+?tHXJALZazp0OH)9v4H8Tv{}Lh#e526gJLpQYnb?I^UUOJzIiBN%miR<`uyP9 znKl8Ln|hE04EZ$-x=lUMBgr0le0IM`6y5KYn^- zglH~PLRKnd4Vczz2IAD;K!hP*gLR7WhG3;IN+af`Hers))3q!sJv}JkRlQYvmnqgM zYuHR8u(ay})L+KSiJ7RY6eOC7#8n`{>jG7l)SIm9r#kTns}rf`iN_{NHttpmlxir1 zv}RU8ej?C>Xz8SThVshP<`G}Yzj;Wyb|l*T`RPn+V4+W$hK$R2A}i70@X`&jCpXjZ8S?df!HFrDPHI>@<6#N_DcLA| z{AnLko~pA*v&=E8`llhM0#D?c`iG$4pGkNi}WVOdrp+ z#-WiBS_DdoDgu-eZl{^N0Sm%h(jv9-6Uqv#huTB5&>i17c~z;gV=@TR$M{k=>cg-r zaNb@m`dX2;ZYLp;VP?`w=|(5@m6I&5I0K%})+x$IzjhrGoVY;L1}3V=$4bh_NvC)j zVB*=L(ad?6`Ax@ZbFpGWvh-!dMS(4nOT2h_p087yV@$$$uTx( z`#s3^UFqp;f0^D~PV9qZv;`SaV^K6U>myL!Q3@E_9|o<1g7gz?i})f|D=qqo54*J8 zxL2rm^=h8oYHS}SBxXy2j|C))}mBAVNBcC_66HodC zXY{J^X|&)c7FIqw0TA7~kuUtLXIKGv(9J6|sx!Kj`!Yim`$! zf!J|1b{eD;-6nQ0MxRzZ2H-YzB`eMrR+?rvHa==eIrfQ~RqIL2_9PY-V?XkVW;GQ9 zQk06)HVyl!=qFdH>wkcbmW=pbL{O;f>%b&{=sKuH`%kFx`-l_M`h0MK*8kN*b7P-_ zL_taMy2{Xu=OUla-crWDirG4{;!!tK{5wK%>XG8u^9a)iJR$7dW8VNi)4rNbTs{p- zlX0~^w~rV@dJv9o^FnwZ*(aj~B~{JpK#2}Nf2vFU!C8QpguuqZ@5%g2wGYQG_kPc?xF4=~DqPvJTmBGF- z1jc@|SnI3xkbAgWbZd!caBSPAz5)u^sv}69iH0>GZDaX*XgrK5O7ph&$C&D`Ust~u zM}ELpm-yAY`f#sO@7n42>{DZJLzTqrnFvWQ@jyuW)%ZsckBBf73){~|4V68XOrUSySt*)v_EP?{J( z4uU&Div_`YG}H%YQVdXu^2ZP%2S5i3H6*(6G+J9sCT!D1kwUvd(TA^74E3o=iQ=B@8^$adAQ)4Pmgv-^~S!ktdu)6~-Xw%KK2_ze#&8mqy zR%NJj8tTlaP%H*>t`4=FV!9V3nL2+&12J7h`jFg#7)r1X&EPNgpqf}~YHEp!x}jTZ zpaWc1wE4sc*T4-8s9BV{0CMN1XY4%K#n?_M$7> zRH-HF&sg&(G6zkEAbB0MTR3PZWhqnSC@6&DSTEirNxF6Nhwzbg3}V9Oy~57XPs-KU ze?kc8NIpco*Kp;n66lHy{TFw@(ma}_72#8y^27Gzl*m|oU#`Zhpib~lWG?z3#89h4 z)+N!C$oB|zj-EuiFGNGLF4U}VyHg|BD?B<8xZE?;a`Mc@5T7YT7PYGlosiHDuZWeL zRV>_=W9K$k=VH(*O%cZ)>#4#tu4KDS6ECNGwJ&K=4~h z-oBa%RL)a#ATZPTY|?nuY}9vHG~6JE_f#OFgboC(i;2Z0SabtchdUD)#1RHWRF7bf zWtT0}I+2~=w~QJ4)c8xZ)9S8dB^wdXZRi1W*@hGdtX#hd zY-Yf}qQNt!&Mhi7l!5EN0~Lc0+wRjXPssYB79D~O)?oxKc#o{1TZTKOL-f2Mm{lVf 
z!{uPeL|8pZ0Si#bFhxjes!^ikSTl6~ozXIFZ#T!JVC% zq5NJRIgmUDo~t(6sgatQL+>)sqmW}qASe#TLu61Q8?@?+tsz9j*O1W- zn>FjZZd8eYb+sKCykt;du*i$Ar%H#&BLb-xPK=D~{Cfd5IZvXMtoGA-saaR19-)E$ z0^U74bw9eanW23QRSqq=`V8CDeTd}DTLtZ!(aS>xrQ%`e#60&0G?;M~>Y@(9@TYmS z+zNWww6>`o(|o!$i5k`T;;UJkdeub(t~L#~4_$3)`^PZi#rRaFiXfkC|1(sUX49dZ zU3Fj?FAmw%k};Y!CvElA^T$U~UTKnxZ* zx>asGl5zXDiy_Oo)-6~if*uHr(y2v9kXALA)2H#u^3lgwz8|45rilp0qL4LNw@O(8 z#&3s}w4JXNu+-ZN)I^;LZmh0pmY){QfYpd?aLLs$pNGyU{Y9xQK+Kz?w#zfiL?0}} z^xlC6bNO;|IV}S$*O-pKLXTpbEe_;+UF|4>^QW=a1RrAiY{0I%Xpf6L!Ez~p?m<^u zKy7~x!zEmCI^fr^<6&8Q>Lc>yToaZF{)r$ok2{qWfsoaPqFqi7y%3dIx2_Y04aIWQnwD8>GPpb zc!4JrUX9^tK11F51aSk#2c~NP-Cdb8#89}_!$fgCj`6o>4(`YJ&LF(n1JV%e_G-*} z)wBmfU@U-#f*^o_0Wsrt%z#wt*7pb@JwOkpCf43!17|W5i78M9Ldi*=6KF31^>BUi zu7!RiO?|TZa?`5zWl3WcPVxmwBG)UnP$W>$b%bPN>}{DsuA?LtiJ6ul80Mk{U9XOk zE*P$7Mqo0zj+(BIi8H3Y&&U!R1IIX?P;zEP$n}Zo>L0w2otKE`vR=GR=%EO}0 z8ge}izi83hs|(cln^>gD>BwHnlj2bL_9D}IJ-G7?H)MJ~XuSiSPJ`S5e9p{Cx(`xR zh06EWC-0B-Qsg8bIM@TwoMdE&-*g>I{Rk9sJ`hm;58{rZ4hLQGORg9ZfOTp~*do)l z6MXi7c_|21a$?ohlpsOld8d5>ZJcNviDljc|;JrztY z1=8Pv^hzlHGLTXr0i^pXNxblKPcUhI&L6a%G?Q93>z$zXCRiIJGm6Qz$;cGHnQViy z$G|mYy%sFri7gDIe3mI>B^rRrLfr`0v2uNF^0ukO1?MuSjpPC$$ze!hsQQbfjA8wV zUhjgzkvYkEU-82+q2wY`kEv$8mC+8GHK!S%;1OLUikVyj`FG-=8HRt)EZ-N3?}fSh zKnX7_=xwnPp3S65i$+5rfVyQ8jbt+j>|$23UN_6Rc!9?t6U2)yN%IVMdzdbm3JQ!h z&3ue`%c^1gw9c}!%gS^heXheOC*HGmcDKFT4Uor9#F>3RY=ZX`o~efmH4%!m3X3bR?5c19i!HUq(JHLq6@NEC%`V-N+}< z{Q%}4M7U@22_hhuE>l<@)r&lOiRDq?q+vZmIRqQBn5jCRI!sTAY+YyNYb8~+O!@i~ zMwwMMVCMv%6$PyUDqSYjqpTAq36<{*T2G_2Aj6-5@UNRzVmkN)t>;44>rnmRoa7BZ z_M6r#!DP2T`X=mh6l76jJ_PoYke{kn$l3$cC+d^)qR8s!gHSlD9=!wHeqnGjU43%# zuYidCZ3HiLvT_R4p?+H@j*X)-Y8q=JF3v$f;<-HJBt7EmIV&7Rd7ysRrRg z<%i%Kh_e$8&njA|8h8w(B!<*#{0gi(BELnc$u3qb4F?RXkBU_EZOr%(zND&UP&MK; z29Aem#2$nvu0dIZ8Lwl;A856rI!Eg9ipbOL9bRJi43tokij1koI~WsGtMEdaLS+e4 z;K#qSDWVhwtzJH*A5$(BTv+Mn)dW*u;*`WCOOq7#5~K;CSo|`bO`tnzLdq{@OzJ#TdohIy4%g&0CKgQxL7@<&@n9t&ep!yp zEkV(>*qX&Gkj^PanfHY(zb+x0U~7QFV}|R<0Ywx`d~XmeKSp~4Ri~Z%bBGhlEoi@A 
zWYtT-1XphLW(BHTwAgoWZl$rzQQ^9ht5dgVsqi*~GlgQwZpP1KglDsy9*mAeRndUaoyrq(cn_>~?4q;&{n_qV|?QXBvQPcHu2K?dwNAt3C7!L8H^6QS1)f<-+LL4-F+MG z2AD3j8(r9uDExN=ZJY2se>`#` z&HV_t?`7N@1+rRlXBne%r-9opU`Nu}k4jQvVy{}Vf#YtN&ZcajDF6~J+?Md$8h*Qj z-?sDH9^9&kz#-lr;I|>%Ag&*0jEwwr^auAte)JD8ISGRh}$84BMQ4R=UI^} zxJM#i==wT&!rNEC4>-jBtQW?=`MkaY@$E88iarTEMJ{Rr`|ODvccJ(cEv z>eKkA*`AMBAU^tJoHZ)c*g23YdU~oF`-nllqQ=HC$i-@GJZn_qO)P6%VmI&xsfaOW8k@%;SE{jVSmSFp z&U71Rh8k1Y&{eA@nAUGRXb+Bq9mr<)!isqG3}>H4=86vcBIiekD#Q8i=AdSOZZs$Qp>E!>oZQdW|&@Mg6RSDEb>~ zAc|gP4Mfo^tbr){D{CN%USg9Ac!JP2T}AigCL5Y zM+15eU!}$;z;HO+qFCC6JNX-`^Lhx=sPR&cK!IV`A(rwwvqm$sQ=)c>xOejVhV6t7 zYpoIeUf$o(BkmLw#E$}k?i37kr$C_l-QrGaQGbiLlT)eRFYZI)PL3xy1xC8UpF5Mo{8Kjr;#?beVr?ocZ_93%-?6L# z|4v~I_?HjCzhw*p|6ax#@bBfU0snrDHQ?WwtO5UOtO5Vlum=2F#~SeOw^;-Jtz`}P zcL{62zt^z_{JR(p;omQk`}&xFNin#?Pum0XGhJ|HCjWVT&h(l8={Gl+K>nwTCfM-- zYc&fPTj$VUlGE?Z?z6ZMpteoGUn4r;&`V8R;DBBaYumj$vU+>6dYiI(`?Gp?XY~$c z^|oa72C{nnS-qjG-m$D61sq+Fn&Kt$`R$LR63Cq>`uC}s5x##Gobh7{E<|RWL{gMOeoNooqA%YGhdu<${n&6D$L~aRpC%?{MkfO|EObce`eJx8QTHtJb^CWqP;c z-s@V8>7Brt1sqMnnFX9#z?lV{MZhVTD8r+D94%<_ zdg@SX(A^@&uJ)pni9y6yo)~N>{6+o{tQ~h7roSs_mC_F%uA_JkP&{X7)lxj`6wfu@ z=E7%OUeB)GPe1eQz6Nh|bjBf)K(Sp`R1x+PK2y#C`Utks#h)6@z6v zB`U5d{HEN1dyjy<2^^)tTbwK~yc0JT>fX}zZrxiJJqt1w?k)6|&GwdRc19T_mf@|8 zp0(b+u`pRsxXL{?8>5cM5zl@>|4uSCE`og|U|kHtVIEdIGO`igBmuiw!mAXp>m`^j zVCM>0Bkqj?)=f5t-+oYVt#@LQY`xJ9!>y;<5rwV&&10FpG^t?d;|fb%qrrx)(*uI` zAxWn_0=824nB@PffL$o$BG@qjJ5Ru<{<+x?VHD*Z0sFKYv2-i%XK%8_ghE?g5#;OZzC0SZ|( zjUFmgN=5@2{&)xY=N;ho>XbcPsIZ@BWdqSGRK7S0Mu0+AWkwGbDs~iPj}VT{P6Hz?T{pExnp(QTOO`H9 zLl?DlbuDe}YU1>jwyxIJ^cbTeOCPzVv$cEatr?6(&};G1?#`A-cu7-N>*7`6)BHgvC=>?-QrddU3p8|1h!zd24XyGjxe{-t|`*m(Y>;( zX>m)WrM=~rm8&AkifkK6E@~oUb%(pSI2Cd0 z9Fk^fN2Il@qout&LZOi52$|n#)K8Txi|VJ!_t}`tnj~D?N)~L3mbP@~CCO~U-H|mO zUb&*PrK>fMr)+nVrgkWd_NR4hm}2STmMr~x+xS||7BGxlDDb7wd1D9k}Roh zQY>Wt0ld~Vt>IPV!bn~+-I1n6s~4~8T-?+V?g%$64~LgK;kB*)hmo3`sie96LpYx; z4;NX~U3-Q3GWtGz@Sc}l9bz116aUvH17D7Hk}krhy_JpsmVgufB@(<4>lj@GFT=;i 
z|Ga?b()pHzPv5p~kBKkzj#-HQ>UoOFE4`}1xA;D!jKX-uBxH&oK!u(>Vt} z=%XvwJ^=|X(=i1+SFf0aFVp{p1N<)%T&Dk#1efU)iLLS|IVHGECoI8bIz0~XKT2?! z&I=M;rt^*i{KC&=>?zZkD&R!pVtj1B@0IZ7c(5PsNnaUm@0$t!6^V|$e*Ox!3`IgMbqqI*qXDyzPKLCSw91-O5t#p8|5%65TZ%Ob{iN04HHRRHt>;S(}z;o&Emf*7d zBNANZOEr)#;w#tZB!iBIKj;8I-~bUJk0e;v4K9dZL zE7v|94)8xY!22EGpE$rTkm{;zpDP{U^$ze&4)6yY;QJlm?>WHj_aeFWxs;o2l-<7X zVCO*x_*4h`G&#UGJHVfJfS==_7oC&Qm8%#1i-3G^_9xAG@ZWO4m-peaf5%Acmkg&r zF3rbxiv#?C1KcgGBQpIf9pFnH;6HPKKkERuU%}+s^Q+RjBg?bY0X~F+>)#>bmTtnO41SLU7ba)$V-j4Lkik1e9MZ5@ ziWxj6!N)QP|DykEL>J+UC6V=qB)C}07`%uS!bSLEsbT$62~Mx5?W&t_Ifuj!SoZ0eIJI&jW*=$^qNC5+ z9LU@A*kPD_emYv-e5iC*`5Bb64pg0~+6Q~$5La==9DQQS4p;wwCNQx_ImA|wNt}Ir z>P+00`ya*6d+7R)m{6Q}I!hv*oH`@f;(jKze5a_+Wb>V+euh%gv1&!vO3IdsaCf8% zZ&PrryrQ)ODb%qt(prI^YF&mibexBBf~`xM+PYd+*ae>@b*)AR9HOpY(>PrP&a)ALY1{{6y0_IqYdZS*_YKVTpq|NaI(gZ_Om zU8Fy$l*|7$!1D38e}XUt37fh_?Lc}bF|;DJMnLEn7>UtfOOdt#XVPk`rnrM_*YV5;F`j3>F?b6k2%b57V~dT z&*Y8V`R_T*?-%p0|BU&Q5g+;F_g}^*p8FZ|=Q_-97W3`j6WC;O<)?GUeE#1=KWxEe z@EeO?+E?ZJ{}u=S{o-K4{vCoKGwL@Dzi^n}ON9}a{d)!88=Zgj|Bv$7ud2I6AeP1Z%SEey|W51YRkpm}P>9X&O#e5FUd=HEH83=FL^KAGn zVm{oEeO<}B6Xk~~fNcG3Ytc-sU+vR|n*-(LS{coc8ICVm}XxDJVh`AcXM p4V%9`pPt1a!l>(*#z7}ZIO!7Nq#^VylyCmTMvfBA8O)vk{{RYrOb!45 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_backward_weight_update.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_backward_weight_update.o new file mode 100644 index 0000000000000000000000000000000000000000..215bc074e45e63d4538a6ea7764a2e110833ef98 GIT binary patch literal 172840 zcmeFa4V+cgdG9|5jyP$j*@L0RBs5N&V+)3x3<+sx5|aNMJ(KK%`yeVw(GWvC))>Q; zQO8`g#9(Gd**R=NTk|iZwxKP(e{#z$rZp)^YxF!2CJF}bg zzvBb^)=xEGe^nS{T1w|n$%YU6;cMZYnN{Ta;a!;{{Dw=daL+xRbGz=c$CTH-<%+tx z_-%~W{d7=y8|!sn_MXaH+Ux#eYo$LLt1G##{hRL=AfNqQc5e1_*-NvZ-TrSlu|r7z9NmoC0CU%I9v7j|b_(p3FgTYKq-AQ!I9wDB60{-i!AeYPI>+A@G! 
zyeD6}c`;q&OW$3?Pq-*k@0TvE&zF9pim0W z;8BgK$joSZk80=}^w~bsr&-TwE`v;)AGz@f0KC!at<4Ooe6-d^H2~}m(hK20KDw0soWp8F*F158i+{bgyn^~q8;b3@MrXS$xA(!c(xp_3TI~$gB z;cnXMdM(wpdzhxef%fomZ{3Dm>uxF!r25;3KC%*V5-P2mxqv);`6f%*1K zn?~h02%(|Mg%3E0H6FRvExEAQoB3ciRnE2^p~exRJHH zMy6)EYZ#KT`gBdr(u}gQUHiu5XrPbGY}czP@0K51?wtOpfIw;+4sPC`Hg`JtE zT$KLA%xHdNDiE?MKfFBC5=0Gd9WiD3-e<;S!~3rA&df1_Jx_bf0*d*?NBr>NAoZB< zt$W#z(ti=8UYQY1czclA?}sn>leha_`_g{zzA=7RZ>nqWsb2R#@TtB&ulpAFG^X=v z-p6)+hQH~~Pw@AY&Pn_o*Lm8ZoY%IgCrWAZMqF8*)UlD|_F|&Q=N5Q*O{@B}$ znnHlL`O&mgkm~bOTl3MxJ7+}GzEOzg_CP=z{z&s4?hYp3bJEm*w=(rw#Z2*-&JO;L z?VQ8kbZ3^or*xjpUroK&`2HDD`eXU7-t?iYH~vdDZIkq>;cT}QCL~FqNY=2SCL!<8 zAxWXm#SR^6Qoi8Op(f>;qe1Hfot*%120f9{r2a4NCqek2moGeBK?WMl4v=NYK-M)j z|2EBJ-7r|e2Hx_*0K*0fUI6Ymbl~L%^?ChLziT63cRb(vQ?KWH%yhP^%-naodG7h7 z;l*mB)4A5wMhd-UcRPaacK_RK3-`kVe(!-XTKSFH@NTw|`mTYLzk->}W!m!5w6Et% zx4CA*K4BU(B$sIf8;*uM!G+EI+ubfo@o5?lU=o_zqbcLEVQ)}8N;zliA+S&s8X?!Q#5}ljkfrS(l z%gSWj0`l|ZpW`)7xrJqV)hKzNv^*m$qgG6-;yI&Kq|pZD%@`$*Mkn4PXf9DEkQk|n zp&>J4nU@{Swr=#g#d5l?n-YL2GVPXJ)Rrm~%SuyI*Y?MsGEq5Wk{tSYI?l)_V{A3! 
zvH@k#R$3Y9B!jljc_FT1y_}Nd(8*w2WxA@2OERc(&!o1hW7 z(7vl@(y*|%H`U&Hzt^+>mcc{xt;<(WKFC!`%k|HrSuot zJ6k(59p19&5x3<*;R2|WSe_t!Jy%+7Mg88NJ6zY!=a792k1dAq-Nx8~^EccNgJ!kyueqpUyl}q`2*jS2 z$qVnl;sf59!qvJuwX=`Bd=bXzL0BB{J~w#}6mwUM_g&9{_Cpl4{eP=Hfb^^F0i0iK z59!WJyM|Bc{6yFAxXy|B3c_y`!h4O=Z^;VbnU|w}Q5%%-o%Qg#pL)>N!%y5|I5`u1 z{|WW38oeHQO!jD*ppojGGfF*Hg0f~PD@OJjP$pWFmw*(#UiX81n}eyTrby-|(ZZBp z99EM0zpOw0jA>+ysmZwZX=SjwNFSS|vjhfQvf4>klP`NAZYE9oDM`BUf6$dz^W&0q zu>Wveemdzw@gz(5ev6L{qxXkEVTKM$|C5 z5Dpq>J|D6KP}K!#aj%s4APrjyL85lsqjUH~*n6-0LsZDYw)kRA8^!ZC@p%F_pq*y% z7K^Dt3*n#x&V-8W(5o!Pjr^-X_I!9S7d{cB27v9BnbEY#s@p;Hz!IePjRet+2ss8G zE1a&M%Z2;h7&`z&ftpA*K*B)*-5x#qZ1<}JS z!3rcm#;8pKoL^?p7?W&Qd5jyeo(Djj8rmo1@Jm0g$PFkI`bkQ8Jy+S#p1A$bCRH>q zD8ldi;eMlF0feqi9KK|4zjyx_ZU2c1V+8{96Zzg}$FR9GG?hsZKbg_w&KA4iu&mT`MYs}_R z?7XPq-Pv$UK782^?`Hr4W;lF)YI%5Pka{^cc2UL;#$L|)XhkLi4Zrp?El}%|k5E5d zfS(d>RJNn>ENsTTk4&VnHUCv#vNYM8+ywowGC_QOOFTq9Wp7FIeog0G4ZqVV6vCr^ z?~7x!NM{Abr8W%t-WM3$JwY)Cbm4xVHWu=^kNQJ<{Glf_5RLTBJl)T<^e`q&;&Wqs z=8UvHpH}W5UwV?dG0J<2)}SjqGpF4)>-pZjWAdq=`qA_ynnn~gOYN(JB6}dq>04^m z^dq#C7o{0{yD^W2aGUA|NQKRTlqGa|UwicB;ef@As^_lEAT^ZJBDuVgPFdlTNJBzh zLZjZYTa*U6Fx*B4D!VAV>q3dUD!o!TAf1o~Y8DAjbZzn^Un0-v;1x`GZ33sZ@Q+=3 z*jx6SwA~&}JJmMn3GWDsuQ`76r{1!sXe3`8%!QAHeF!0wA7MyAomcZw5FvQ8?;fN( z+jlqG1N-iG>|t!@*Sdz&ouAX5`*H2L@9{xYL=EtAlb@0hH@ypb!9I6`+X8#zE%%t8 z8e&UmobY393>S=&$(H+L>V0XHEb5U*z0oq+a#b%oNZ@I_l`C_(QC99oy%T@9h z*mA9zH0kk{Yt3ZYa#dc<$6Kz-U&EGb<)@Qg-EytU4Q#oV$EN~e-ae;h%iZktoN5wM zFI&&jn2+4V1}mX?VX_IclFH05(n$Bep@Ef9+|PTiRh!kawePFJQyZTPe>Xyv^9o^HGQ-o20fvHff1l#(oN|Hyc(X zXPpjKBVK{n<8skNVYSn<^nlhc8d|@U3-`om%|zZm0Q?=ifL6e%#YE#Ca*zxLIJWl_D;WrJXNf&}A4P_0(sh}QW43FzvV+c+i zh8G}jurKQQKt*nP!0R~3}Vc?QE^n_(RKyKfwl7ri_+BRbEPXS@D?R9__OF9+t^U*spMlaTeL3 zVYgCZ)_4^q5Y1p*^C%_q;U<($|C2N}>9x*+k-A6Q)50Kie?A;=ETCf$dv@G3TTF>D zICQp#TJ2LI8II`-QU}->zX!v)-aB)#H5oq7{v3ouQ_I=mUD?#0V63*qvGd{DU6Z-i z*S_|sz?ZUhA^q_gW8E%qJsWqSU9c(lvAN9R(zh|gg4E4|8(2-KMqd+ZV@l&S6I!cB zuJD81=e&zGUg1){_-<8yLoUUqMez!JsZl6i#V+_MRC>%(%)t05n5NqdQM!%{dp++v 
zGCaKR(q}|eV}R~*yLx!gTgM~(AYa|$t;4L;UK%55WR`vPzCYNfN^9L$7^`rQZ&U?U z6eHGNY6svu&2VIOl%Upfp^B=AvX=v$w{G~v66$JUT+H>jb3i>h9YUdaZbPdSUb?6rUZ`Vbw)9!QP&#i8JW+;zhKBQ{FV7S8p2S(qFAoEpo)B)u|>Wj!>e|^V7}-Ks2UK8Nwgo zOQEX$Rg}>j+n1GA_^Mi3Kgn8v2cUP_on?jBIyzd!+_%(G$I_CPxS93(EiG=-Pc+ep zl3FeEo{OfxP4l>f=b-pl-gtILzVpRB+0=uQv*s{FQTjAVKrO`&^ zP~3{n9i=?lSLMT}`>}u!ZY`u93>XW+f=XYsK+UP_68(ZAs^hQ7r!WKb!{B3hRYIy) zR@xQS$5qwGHF~5aeL+68%}?E%7jboLkycwH8Jd9seiK;*fnkFn?k7#^x+9vy)J9c1 z!`j)HFNJC`NbOc)lOBt+mJFeHW$SQXGCceQ|aSLfqf;Vg78sZ ze=0CE^?ng?hfNkZ9a%3FH#v!Bb!N934S%L@pU30OSm=eL+8#vHZ4Cz!dTFm;6C#pH zQ(J8mflYlZBrQZUxZ1ct?KCis>}o;OZPY|y^$Ct>sZof;oGV&K69LJA5sqAD(f|o{ zni&$*pT1J&)*xDKL?(#tGMaQ%U3mN|Am3XEj|AafvnAhV%~=bT47Je*d}^bn-`O-D z_W~go6tx|@+N!GIaaGk_O)uKq9IyaDc@=J}6m~^(^q{7#EFW9bX|8IF)|c`HIhnz` zjV8n$zUEqy(R)%M>@@)Wq8plgaY!g3?2$t3O4D!T!`GR&<9WVlF&PDZwYUmwam%gX zotZ1`_qL4B@-7HGy+`SP)2M5$%rer@Z3}~HzF_m#JxIx%fys)b#7aXK9wd^^fw-5Z zWrW{Cw)V+GfU>b2VCM6B>-OO{ zGoNd!+TZqQwrj6Pbxly7>CiIwZgbONn303_XzOl|{(ks%xa6CU4pY>-C3_8%aP#YR z^}H;*&P&OyN7&KGjOW4wy~Ahr9+=o31s{jg-RosrU+sJkt_NLftx!tCqteoA_3XI* z8M2t0Rn%QMbeH9H?LEfKuca@xc(Y>v@FyBp$H!&6UciLEI`cH(X1k7}Jepv+@YDKw z6eNxMkVtF*_@)m~mtrTxs0#_i^DI$^`a+dXm{U!mAOW--+(TP7nmH`h9yH8ph_JRu z3)BYuB0+2PJM^fpJwnPF=5G~AEUz}LR5hiKQX2b_=~li>cYaZx6r0<_qhgfHe69{4#&XbWTIEYOIs7I6Yp5!0Bsby`A2ODq_M4#K;NnRDtPh zuw+XHn?7E)6f!5N*c7WW(W*>Tm1(Ln?VUU|SPgG{LsG#^#F_E3w>7L$e+`XldEyE+ zJJEWXlJqjgDg~{NdEJi!uF5s5$~5aQZT+RKdd8|}tonqc`h@YSVIP$npRRt@qDDb( zpEdiPX_cvb*CNWrA2ok`y{fpcn%h{Pd{*Z>^_YBDKq;)<8imWpH&nkWXlzi!?z85; z(_SjyHGg~q!`G+}>-htOoM0>#fW-Py@AbSNb&YJvv!p#1@*o;#aC;E;pX7YQ z`uJz{3fbzsmcL^T-BuXtcVVKSRVE4NHDXxRwu1{Jy)_oq)h0@cKsZUmG72NY?LG^oxez`JI&Ki<%*)*1)Ti*dJLNWFrXlG3JXK zX<0&{^cL&YJL|>d4|?!ZudDUl%BGn*HeIpVvR3%1V-+bPnqX{VA@!=C>dU8g>hl}b zO*HMafafDTw`05EagfJs*Rx0UT?0IeIh<#|w`cnK6*OxEg-8~^+MU&VG_ese5or?# zx((7z8$TLNS>vZRsNLPx?f|}9w`cnFJo$)()vY|{aZH2ClzBWaTJ-g_z+TUb>@#=p z%bb?!0?N9a;qanMm_@TIsSG_W<~K(d^~Q3*Sy=j=Ch3cOHE2{)h{#B+mgV#V8n8xH 
z;Y)=d@rMN|jK~j5tJR@)NmZ{wYG)y}(@c*}&;@Z9YU*sA`-MD?rqIL6$5u78w2HWY>3K(dYC^)h}B4t5n z8dIhDnZ{NSie|j-`FOH3iPDC}-=6puNR80UuwxBDz^AZ;mo@xKU%QF#l^5ADC#C5zc|1K2IRKct&u}9Gs^rSXrr#6 zlCUIJhX&jbk1Sl7U8mEE*FYh4NM7mntxm)0b=UC?hJ(0}k8A+v?YIrgHw@9hvPMss z!$LIugCLkeqgmFP1EdpYGN0m0(RT!yju>!o#t;LQ;z^By7$1sT#0*x!&t$E{16Xb7 z)f2lLk);_CiH$!oNDXmn!H3W5sBK6~p@aDfPgp4l4ni5%&HMX)1Slc3tEC1IX1e{T^IOuKy%)#nH}IJLdjVpyX19A9 z(>b5_H}sgUI7GvG;Sr6?saUqy30dzg)%P3C{k2kn8W*ZW#p&97uHktq36wb53p??b zuBQ{)mcR52^Ou$ftvJ9-%we`Ee!FbI7_|89+85~13Q|s{b}NP-*kP4o7qJ%3IYN#( z+2X2ovcI&P3Y*OID>PZfIEgT6UTO?zFTg#a4-kn#J{jJ3_tS_QPU*&~T|!}R+5dP} z*oPW`H#t0u*g-^^w8551qr90taGIq9St(F&V7djrn;?D83kEckH?^eEpg4$8$S=Ou zKJ^|D$mD@Fw5iQ_v3}XiR#A?;A?>aXl~&tIht>FzbWXk*pl!={_#Ge-l&OTx`Fy!u z9t8#EU+UjJqs>fYi(0=V_8Y)bPuG@s{$nvCB%wB*q41e%C zwgKt+O-8C>X1aAH#vo}X?f~?hn&j6VNic_VI?&(^>cd#li)m4?Y7pD;j#>5-RSV_P z+W67-sXOw;ZRVHpuzJ>ch<~&P4A`MPjP3lg{4xqMUVl)28RHQ|=1gCf{Soa*avuND zeDR=W80HmI>jRek1Is8B?^OoldLKILL!VN;IOF*^14r2VE#sf!rZ&V)VHn?PO_{4M zec(cGeP9xQ&@#*q@Qpifm+ysKnt`G{x+;~8Zj|fJefi=hzc>W2cMISOTy)nS&A@vX z-sJ(x$%lOWM;!#&=o~E2_}Jc06|-l%j2~p9YflM^4*-!`b~W^s|3ru5=!n+UQMIX` z0L7_%2%e2Y^e70bwU%l0s=Zbp-Dv8AdOip$WV<$+Lv!w9@#Nc)1PSA9%*lx>Cs{S_ zj0yU$GbrROJI=_kkc5x&JQo2L9cTR!*zUw_uV{3>tS=f*XE_+!rX<`fmYgan8Qg<%0MyQZoJkSCj=g# zmr2M0hX4%@M~Q5_*zV%-3NSIn#_eR>a^Xflb-WH*TPIbJ2|qk+pJZb(%v8o%%R{i@?k!T}~>tic$AOV_nw56)qU z%$5GsS7E8!SE47Hhm9EboBR;^q>JWAN7=;YF8QHQx~yIsy*OfI6iE^qvVM9DN`JqY zA4s>UZ;Jy}1MpT5nx_5~^X^3*zO&~Br3>N3Z?{R2si~g+C8p;~pTaJSR#c*u>3!*b zp)X6XUpm)PHIIoQzEHfUQ2IMDV)@dywI1@NKkw(wH9-Y(2VJg4O!zM;NsSNkTU$(g z7ma8Us95t@SFDBK+G1r_Y-?Ohn}b>tw$v8uQ+nwVEWe1%5=u1jZEEKZ9%YM!88-2w_)~#yU*fIXn@_!#s+ptU*xUZ7sLiu zxuZ6Q2O&H63ao`>3;Nv}({OEO74I{mDFzZ>^O$&_Wrk-m7BrDFnZa6#WG<$z zGap{XhO!6?M|eE3vks7#7(Wq$F-2;8Y+k#nhE_j9$I7Z!E7&*l;h>&;hyn+8tqHSw zCe>{!sh)Hy@#ad%QjN{iD&^6O3Bp%@-yp}e(znudB$#LDR*WR1u^Vw6|Tl+cb zDqCHywI8cpKmCXV%3h~&!Q^;3M|B-RqXMUO@8&2A=eR0%KT@~0w?0ZUy`1^29@X6| za=I7*rJGzrTiNjE6+kMczuoPvPf_*tTxxd=L2PI|!)mTJ$AxV8f^3cV)b`n*ZM~0* 
z_vKRk6&vGHGZ#=w6e%B92dRq^K`#7GP8sJo=oy|R>{JTs6RqIp5ZoHh$-DHus4$%8#MF#rIbt`|8d9%XUb6tY#%?%^32hMnAup41XriX;}xgTv7Zuq3qmf!bB;&t&^LQlq7|#&iLdHlQ#A@$K;^RLq?? zJ61mdgCr(FD{W@dOk6r+jhhkYJdLE32sPC(uVQ<$YSYxo^tc+G18&8>$umRG#|F-N z;|8CvosDUS%rlfX7keeoj5;ruW3b`HjXMQp^5nf)d7YSPcxJ$PS%ft+8E|iWjx5Gn zQy7f5X*SqqH|kkRdQJIU%ZUx66Ubm#sW87$&u&=NiQZb2v1t4m>W{IczZ}5w;zkp| z0+-pSyxHR?Dvx30Wp1OrxPheL<>MzxtD>Ro`M#{Em;kY?Zp8C_F5Z_tKSMia$Hbuw zq7YusOj20+qKD{I>y;LrR;3nD31eZ=N-s2v(8^qNJ)Ejrh4N-lm7!xg1k$LZ(RsNA zW4Y13E<&LU?50mh&xdvOSy7W7(j~ayz}YXGX-iPtt(7bLfy|}%Y7?si6GSbHMP6hz~POjb}?sX+(lK z1D$0{{dzu1OFkbDPV+wf+sta!ui*cXl{Y3U<`D5IhMu+a7U>HmW8iWmU22YWdj`3mlQCp(aKoVrpVUBB3VsQbSGby;*bI2{oX_gqny@IPxuOqPLiu zh#X9asfnJe)I@JFH93RFgqrBNN=@_@Qxl^L2{o~o8fp>~3EVG!2$9H3=ZjilAuGAi z`kL2s$fyK|Q3)L&!r=8>M2f@-Nr+J9CK799YIxdth|@3tWv0bjvhxSZ4|KE*-!v@< zDlMl{P;^=rA7|C@;X=4m>~OQ_1q}RQA6|b5?@Y%$_(PBBgdQZIpB7lb&zew0OJ%)7 zw45M-{TaGp3LXsFN9ZNrJAiF^-x#b$b`(uZenylwiA^ecU&|H;b~>sPvjE&n^hHtn zTeQsY#J>^XTCj*BJ=Kg6)40o;X0L(s$l1p=X$aYNa-$dZlQbu%+tmfw$RzW zxioXda%Ufn&*92zbDqpalhnn2`ZU!_G~r!{pdTdNsa3X{%?;MjWFkS^Y~}5gWLzmO z?5B2f{`UR&&T#%#B2%NQBde?orz10g#=b#khl>6 zl4LZjSnLSGnUh@Wd`w^6L#u3(Ys|CKay8MJ))k_HGtW|r8FhO6p<~WGTaP?~dG;a9 zvp6ut9WuX91jT-SI$7MQr*xncQ|f;|AM2w&C5I}~NI=gN2#bA&0k-dKF(YsbA|M5c zt$E48c}8I|DOv3OL80_%9R22S-k>wn5tP1E5B~Ua)9AB0PAt$05XYgKWzzGdb5LO6 zKBzLRO2Sv*3=@ZWzVS>1!~h;=wM0z8wDv5 zYd<*3EIDdDX=+u(CDF==_UIe600gX-UxD#EgI_^oQ%IwYa)ek+1M-q~c6Mnz%IA zPb%aX3X;1kUeEn^|a<-3QDL8%-_JNX@qpxokbfKzku&z9s?zBxM!D;3Ag|kZG2>r}t`Z zoHUR`F560`i(K}Ne&sqfz4R56$nm0n{w9M94q?b}##bVjT}$M$*PQTB1__!?fWTa8 zX8<8g8{~btjQW~;p*nHPq6vg7ySzCSjEGY9^cc1oUpAwQRE)+N6{O5tcTiEvFh&BJ zn1a7Rb3mA~XWzi+F1F$XqkCilbeP>kvUvGv_%7^m^JzVS0paSjU=`7Cp;frdcG21>T&GK4+oHl!?&!`C$wsoR~}^lc8rUD>7L# z+tg9kMKvjunV$BIAezt> zMAHR?Re5fuva>-S8tQ( ziJT_&cD3eQI3SI~C!bCbfR~vz=QgpFyVT~~rk1xW1}hlA+gCK}R*-6!f<2X3XIH4x z>q{;`+VMi`y2~t3oDI=B;zIBNT87HEM;Y+o`D|(@=u)rc zE(Ssx!nV9DDX5{DNhmWbe|wO6o|t3H(O|bv?t^bfIXDI=vR#TirbFaBA^5L#PYC`B 
zO-Ne0v&i35I@|a=uJgV5B0(WWL>@~MiHWcfETK1u5;jdSv929GO4xMsF1T^@Ou~!N z>S*!BaNmz3ZH*eUj8;W~Mh!hiJsNc(y1FH2oe+AArl%;vS0(fq&OZH$?!~zV%cw*i zqnWFc&|?;1tS0o>IRmOfTNW*>5_*iUvB+bzWl_v3k;iE3k|gw)g)yrQJ$6|Vdd#Ag z=?HyI=&@^)&|?H#qY-z6z8ZS$KCfpFEt@T`+-g3)S}IG88J5JSD9tQ#%<#~VZtn?S^_+6ZD7_RCENlqJ0qKuic5NAXg6 zC4iVPnQ#v(uSj7v5yZ~12x6cy$&~daZp&T9zwpX_cL(!}3(5dLbx=xDgsU}3mknP@0)@3e zGiSL#VbWKMJxlgo&>i^+Z7~=XApanWIniV#W<(1Rre>kH3J`_^jdg&5ctsJxPif3!qjkuzED< zV-Sy;tzX3EGD9W^Sj8MnzMR;FiejlTbc1w4U@$?>aCO^o)xf2p5(flaW|XUr6=Ukg z$_-G@9yZeonQ|C#LPW3uDiaZG20!AmwWHGf!ffj+Ui1-ObOgE?Y>^(WNqV5N!x`Gb2DQ7e^oa zL|Q;K!ZK^_&^I8xGitjm#;oiFjGT^7p-PfMu|wp}x`RDXlLEQl!E*#Zfh|Pj_-BIA zCYF*`Gl zcwdU^)hy$HhBA)z#b-Pw&r4Hn8xxCX8}YvK3aW|=8a*q<7sUqiI<@&C;l8BbbOD%c zKG-?TfeSH)@iP5_jkMDE!w1!fnO-rN!F1fc;Y8xu%d_iT*|8CiY7U!KL&{^e?{iOBNo9{gU*W z@LySrj>LXRawYl~v6?J6lJ-l|E78Ab=Hev$m%@lu_e%rm3+$KHH={?mie($BiQMEn1X9lx)`~HX1yfy`FEfgKCX-zj!uCiP8({WDP~UXxe|NU}<~wNjq7f z3~5=)AW?<+OUNPW6fCa7q6rM9fMej`<3v4lox|o3Ht^5T@`bJNhsO4{d0^HbPQr8>K6=R!Uu{$ zjPcrP(9$IdAIQM+swmLCUeAZkv09O2H#)r5P5~KoSR&LeYd%3Z4utYo@#|m_6;D=M zLCrqpm#VS9+M`Q3rQ$9wu=6(+^A`<@O=0W6ON*9y7ycJGMx460ka|EAe~AjPHQ0vB zmT&tqSh}2awCOB3$LWcu5pN@rTXn!w7&r4vZXqhD*fBMcu$2tN-7b0;W{zf}cWFzb z<3Mp`;&V9c>duf8cOkh+vE#KT3k-d86Tk9lc1PCaC(yx|HB0TSb%=7ckox(|DE+TG z(zBSt+Cf(E3%B*{s=uN0<;;gTTW-YPTRV7TAT{?MouGIq1|XYyA>eoy!9<&KtxtaK zQJEXd29WDbGqsy{aczT^F4_R2!CK-e@cmq6ZE?X3Iw1vY=v%GJvGnS7-r7BnY ziKJ4%y@NMb8t+|dqO6z<<>EBPR(rc;#Uy=KCFqyRb4QD6TQ$x18N3(*QA74x8F>}s zOPm4+H2oBQVPs{yjuY?egNpZ6Tv=s5abBJil!{mSrGHsW7V>&o`q7>Ko3kp0R-m5b-uncqL#(b@N-F)8K3#|K z<_R`U2H}D$hNQ}sG%i-2IIH@qQ3q2k<+0z$J@s~5oqmL-n-ACRF}q{Sjk&h&(QY(L zK|q}OCLccP>;Igr6&%<-{K%=^R4X5r^}{l4(;g_?<|v-wN426xLlqNSjFAf!w`!Ph zlJbBO_oYv-tpX3+2FlM09}D+bY1YIXz?UY(3^;>SfJVrG(?0p2Stz#h&FrePIP?); zH$%WQXWJ=#SdC-eI5CEf-8s{TG|`)FEz5WkUpupXr1b`@SxoCAj5niQJ0r^tu_OuH zgJbE=6Rw?crzbVi3rAXR2ogxgY34EmYYVtSUiPDgIR8B%{I8cKH45W zhNWnGHpS;>TetBo+iYCwsc4oPq48U=+z9`W*jo6hl@ytQRW#r%H|xnX9^vp7P0C2i 
zjS$CKZoX)_cbXDkmK(wDEqk1NvGo~M#egf~udl;$b6tGr44Q>L=D%rF%Z=_85%ty* zljWx26EmW@vfQL44QeCWEpZ2S$}Bhfp32QK>==h-w<%LFALZ<{6-`M2ZP#qt)eC6Y!0iT_N)uA-sKlEYgJif_K+}!%OY2SW zDi=zL-y%5RTg-5yOz{X84v-^#KRH*K3g2GHxk_dmQ~U#n6!`V7)C@D*m^PnMcAd@j zf5z=Hxz-20o)v60I?blr7i@BS&Lu^{ny@a%8E_$;w`@OeOt~nK#9Pzi%V4KcSZ+Q$ zOmM^K^Ek0otvAT~Wm#`@5b~I_8Xlf~@QjJ=U{mFC}Yfh;>LYskP0HUw@L zT3RvSyfpG|8Jq2%b7CAAWbmB$wCr1O2wL+XfR?V?+@4F6A5t4>sW*#RtD11?(SdiN z_7R)fEJ~p=iV6>%l}Mz3#{Y$evEZaOa(hgp6yVtVMl;~tAz0pm0cR)W-q3(^xfyV> zZ)CuU#UEQHC!13+RV25yjGBpchnl0AcNAmE*(6S`NFJ>U>rT$2M=qVAQL`t-5)kAu zxymNCC;fuEW9;&liUEg~ZcHpVW{#<~;LJ!YI1^;_sj=W(m{@SEu^J1`jfn-vYSdbA zE;I{HVnC``aDPDDw?#e7?>S58u z@yf-gu@z>*TXu7~26JySe$wnX&Zj1^;|OzOJI*BSH@Y4vR@v!>ByPL_FD-I!yrs#G z0}eycK;l+I;xZ^aHn!uy`!=Z^DCv5@k|eTYang@w zf6GOvf0Y`xOP_4l`MPDLJLl;%baFGy33i=^#I9r4v1sEG3RQ20Nv>n5iuyH~iAXwX zW@5KEIa7uZCa+e1kThDWKZtQT$%1qx`U4A8Un9>aMWjWjD5_qki)#~kUUGiLup<;V zd7dU^s;RZ?%!n;JQUKIgb}o$Nd9_nx*|{;X?5KFHW#>YZ=$&4ms;Cl3S0(cN6qDz# z0)tUajgaRR#am8tLI!X{IFcYVGW>1M%(I3}+hUD%c7~qCN~>a}E{8!qYi^EE*Vxdr zl0r42coUmyvkX04fYPTOcHT&93Ituvo@`NhlZ#*$1GSpQWighCb<+ycL6ZS22-I$J z6AWe^*&&cOjen5btktH~1avG^zJ=vA&rCdcNtFS}Zh)yVJ&k$+j8Cg_o2o zH|EvAX2u_EQ^_qbl0#bBL*nQU#bC)Pt0}$)hWZFqayucI1Khm;vI}6Oi`M@2up3NG z$Hq}DKJ$C(bc4M zxIOv^Q^3cmTCKr?BvCkNZDg|edgx3R+!;gfLHLZWC;>5+sG4pjk?{f9VY%MJCt%BV zn`?AMeIyG~$VD!AbaDnn9?>8}L}{vXAb;pF@$#~2h>vIUauT@Qod-#<%SKXr+*w+L zV9C*|sB2ae;;;V=uBYEpV={VdwF;bC$9~~BkS$gp@y;|D)Gbp4819Zfh1r71l62os zn)(~CU*uiN)kCqt4{(^ zXq&)2_JBI~AM63gl$-4#-C4cQ;pD4I8e&^Tesr5dazBVs@)2eQ2x!LC?QdY^NObWh z*OYP}YIyM|JzAUG4WcaD14dj{k{C`#>U4XXSZ1*~LwAN$j1^b{*aOTkpgTh<#tQa; z8`uLZL*|ECYehb`R>=RgyNLt74hh5N`=}LQhtjuukMj_mmubn+Uoi(@xL3Yb6U=X()$M3+C-W$k%)xNt>`@YZcJp5}AltQLX5 z!dmg0X00ILBMK>77eFe`#;!M#=2b`^zO4IqWT$wITKVEjVIPO50bpYxHGr=5R^kC? 
z6Q`?TD!_?Dzxv!0s9nA8vCE~w;TX^UapTz&^p@V(-Nci=)!kN-b?mcYu-1Q`%S_~S zbSqPmA0x1&enyw7->q(UxVAw6EK{;aBlbk#p zyEQgJIpWyk#x8(iw+w4c411h$0}Q*hWi?`kJ!$R&7x z9KvlOQ=-&MEbi=d|J90BY!_~)tJ*HaDS+p(TLQS&DBOhYnJ465Oq)inNtDY>;>-ft zIfI6nZ*?3m5NFsB9!vcl8#uHj(o0jt@g4%-XALJ?&7JP#3EO7k*Pz8ChN_8jatldr zDQSUS70y8bXOpmj&zTF-H^*U? z61Z;c-G~~wVh}Z@W=jCJ5o|}*{D7TeJ#DkvzjA6VxAM>Rn#vJOPA#`0Rz!RPbGOPq zK6YzJbsIMzF2?5)IBXcJ1hniHXUT5ykBgUo5c49T zu7GY1s;Dw@;i0a96z&jz6LZ{yw2k+`VpCszpQgA`T%NzDP2CX`pUQSUgAb`wJm}=U z&^Oz4403j>dm7vMCEnAW=kfQH&JXZ+TxTQXY0iYdb+M~Z%)k^JL=gwl4i!?*TITnp z06L8unn-60+!CbF49%Hvvn6wRo-4kkK(g9O(jRl_dkU$iEPZ9%jGCvJA>7>6jB6dL z?us8Mq@JqAlnh6l4~(~L8gGUpMdT)eZR);qD3=O>6k=u# zDYjv{5@QnkeIh+@#!&*9<6prl2|ISNHfSi!GO5NcU~b~kIkaQyFzbapUc}ZBhXrSL zh}v=?8A0O0O=>kC5~wXUU3u#cB>pd2c4Z1y%t`{s$OtsDkLKF(5lC%P)ZJBTJoP*$ zvv03ka!>n54sBypV0-0MknloM8WJXNGwxXOJ7X};Zv9^D|aM{B6&S; zXLJ<^&gV^m;F^@WY}YcJ7PZ>kNFG;+VKFl*RXdc2Bn~#$j3u_xM7V9L4CHJ)g_s66 zkgvG|Y!cDTp==%}eKe38#BKkbz{DoVK)TRM1efF9NHq!KVePZexzioE^44+w)NxK4 z##Q8TgF<3T#%8=MpK{UWgK!f|%BJFW7wL$BED+9BcS4Oeik#4M0J~9C(;CGxzit$p zW%O-E^={+Uqnjr8eJ67I_r;Ww?iCHzZ$Tm%27S7=9*Hglw{+hjM$hhWNBWozKL;8z zCi0WtVCr@xjOQ5c_NW0F?6E{>v$MK%z~}unKL}UOJ4>DhZQ&O7cTUc(w(W+S+&^sL zEye8V^r&E!8GSdu$;CC=J;QC%gw5oM5JLn>V?O(E7R5+INWy5^ZJe+f$Ms-uYN(O7 zy%}n1PuP6IH1@=uwA>*onQBZ4&)W={YD|rf7GJ9R z#MEP^8q1&*4$!FTzlo`an<>1WzcFoqnl6D-y9uvEmTSw$*n zHN5YyUtuviVuim50`CoMljfr+i_0noJr-ombC6oXxC}q?&|7N641*;nupEU!SL3Z? 
z)yBasH)+yVD7uk?^4(^8Gi$A^6^|tMot`z1HyIGS`*jq^UmhIuM zh?`ca4igXXZdxsQdaPmy?7Yg~liD8kGUaFUMa9RscLdf!`>YvUyG6Qqf_1prJ&o=B z6W-IEK7TP;@poLOr_F83N`=p?0|(Fci^qw`!H^!s$&u}J$`6$q6kj2grF^)Mdctaa z&n547$uDqipCy0SwXxZ?akmAhpEKoVSNH)}_&)BPvcjP&JX(jj*3Dk`24JG+Ty!-& z!%RE!ujrG-S0%Cu#0oI&q+RK=aSTxCMiODhXRkTTZ%Ni*NHaH%D{j$mf5Y%!*vWQS0L8-NIm7&gJ0`8(V z3rQ+7FjX{n>To{X3^C2geZ?UuBQT^T-yvw?6;)X@QP2Y-&X@O-L+vblF4r<*G#68$ zC=gnSgtS!L7M2mlbl$lL%iWipte>6jsROz!xkEQ6S2nV@ynfvFl#EBuV9aZ2T%6<< zGbK{^37#BEJZ;Pxa*4)h#ts?9azi+mA*}5FuZNqH5ga9$%p^U`-2XE}1r3WaBLs!)~1Z z^^BlM%!mQ$Ilvh1>pLj2T?x}6Sz|<)jzyW4t^+A?GQhX*%aqn8p<+<-S=rW?z35yR zh?r7|X`93=`~)I4%!JM}VJ#RiMncxwa0{Ls_>g<+h-SP&mf2@C06-wR2P;gosEt== z%5)+AK=?0D+%_CLT7#-jM_(W(54pB#dWGF$WHiqDN};i^U6>y}6yJA?9?6Bt#vxC_ zL2r=S4l8jBJVm7VjBkmb=vM3h$MK`;_+i|{DsV|_S!1rfu*|OzMYAWlS#xwUHz7h{ z+{9lzBr{e;*uR6)wZ4Y!(PxIWpjCd~)JKGaOO0g)ZDx^{ZQbrgQV}LQruYOY7PMn_ zXvfqZJt9anUK;PSAki=VvO%JA-ki+$;vmuZM!c_3v>?&iM)Ijw(4dhmFAp+mVnM%b z8Q--Gj(Wx+pH1ti25*itcElOG{M11hzB!pci!*A1L`xk!C-cL&k2mPNCP?)2Iz}1? ziGG1lqilO@V7nZ^F3%B0IlgJDgXDkG=lPc1>)NL{s#@pzQ z+HLenHi4g3%~$azAfhe==X(f`o_OgJEQPMLDeTrS1gT6JoQ-Z3dDFruq#n`^AU;rW z3vKNv*gAG5?Ev+Ka94XQ4;mZVD7m^lwO`xlydZTaQ`H;$h*`nLe#iMx9!JRd`CsL*;eLQZjv3+XTs(a@Sg`}7G_9MPh za`W2EVm;X6vGI+*>^lLQOn{DutL5l}0pD-i!`P6Y8*s>vm6Vc{mnXGY9{PxZ{zruaIc>{5jcl0@o*vlRZ$P2 zXJ(XMC)~TkM#f=BA+4rIABG246Kcg~GEKVH}M?5Vld<6ifZD0<)-suw3> zu%Zg%bwtrl_iZ5G{OP%Oa;(N8$kbyb{?NXI-*!{b6yJcd?`#sZ{nRtu+K%kzA%?J# zcN2Q~MG{k-RF3AYoX(&#W=DXHN@d$XMi|I-HzOL^ul|7*5Ky8HubxKyRwj>XisP!? 
zOaCcm6+Yc}TCg#(Uu7l2k1nU47yT8r07Wl_;^Te<2PCL;*pQo@D2nBr5h{c)qZ(|< za}rlB=X^J)r07g`IF1Gs*h;O7C({sFjC)q2JPe48I1qB`c9UOx42I|S%wh_`WJL{K zIxyb-(iy4?mMEERvKe6PSe{m|<(j>D+`CBj);VQ?6hrk{4C2DeMpdA5;UcWwEW2_i zYIrw{+6Txs*pTTJ0pXUATg0P=Ka8e-pTd0TlR-EroV&uidM*?%n)vpE-f8o>x*9g| z3h%%@q+iZc*8Ax1{S=ka?U_|p%}LY;bKYr=p}ivKUCrPp4F44re;`F-$J6xjs5cpE zdWCo0%N#A;LyL|*yuw>>iUBeCr7OJo+bF%@@vgm1i584LJ9gKpjpKC`?Net?0O9Q{ zRiwmqJDa?2p$cg`^U)`p@(~&{Eznh#c)X7WTF!c&&L6A4qTO5Yvb}Gh>Vjjc|8Ntj zhkvej6+Qg&72aWGEEuH94$Is~65K0U?cOf^S#Ur#UumNJ@qIiVzmKO~L;9nJTh2PB zk_T1j@p^&Y?oFPS(d(cZY&nZgj_;!Vp)pE)WsGWWRvlH}^{Sp;q7Gbp)|>nZp1#uP zbGd*%$?Lw8*{+@+qxsipZ7073Jt?=*RLfZh-RFA8k0S84(cA*HzF_|qmmtw~FX>Zv-J?H@<#h+u#IA?clL*=(gY5v(9(d^UP>a+91DQWw!JBl0p7}UhnlL<2STa_`!n>?$Zd3Z#5_W&L! zwBP}s(9}Jj5pyIb6%@D+=Sb=PZ>MX~qsGE%A{#Z}&TtQl8%=Luw{;)YwodwK++R_= z2jMm3n9EUZUXhQN_p4vM%rCy0OFfEX(Oa%bzh8a7UmR3W?iBHL=nrLIkShCO_hI#i zRf=PWH>3VIte#{=>j&ky@&Tb+AstcIie<}h!$FS+x^Uiv1~u{Rj5~xytHWr37PrnEdy{92QdHJ=$It4b>3=9fghc~^5hiar zAaT5)LSn9VNsz9Xq#dAspTd)?B>jB!_g+l}lb=(Mj(!My%gcK63AWKxC1Y~0u$*?U zV=px2>b3SN1G189NDyS{n21h7*l2yh<4n8}q~~163%KnQHBmnLj8@U{Lk_)F2-F6x zMN3-F681rhs-y_C1aqpC$neq8sg1N_84pnjS`;HV1jy)7yLX(AhZ5Nuc9e+TAVU0Q zDO2Yl6vb$fsE`pX5k4bDMu}`qQ|?t2j){<{l?W04zQXOV177zXqCwu7IsN_LgZ@As z{)6;q$f%6X51(A2L9=Yw-9(B0K;TZKKQmP1SD-)4o+ckE6L)-ki;)mDYP~~*-h}=@ zV^lh!KTxnL{ed7jC(4Vj3ar74G&-jG?eB+m;KPT7Bwr;tpLVxakL;Cae3jHy< z1O0GB$A}GNh@cOmOiiGmG**4AwV3X3mSj>PO26BfBk0Y;F}?YJ_*SsvEz%nZNe#hC z=twr2{^1izPSju&rx5;}f$mF4hX~GasxbK}b(s*Hn5>B4WV@akL2+UR>O_iD<*?$q zzmVcQ^Pfd=SObulk<8EkVv3V*-Q;!u6Voj}w1nP>*D>a1B)_9I`i8vDg2PNn!s~d) zU((up(9uek;xEVV{C|<&Fg7Fj9k`WRQu9{$oj0d9nypdj&0jaXA-&yj5%i|Y?-aSoc5A}&oEN2++cW%*5g1OKI}Qg?9F7O#1HF-OH32XVVSdY&rb*Gq|!w2+!P*0EP z&y(ahp#cVl_eb;x@#OIHl-u<Mc4Nh*PZA?IiMqqbYldWJ%~o>2bd^@u_f zO^`xY)$UAPe4F1F-J01%UJ{Zj;CT+fMXAj3+{IidDm?he~5rZ#*;vL zuO1-vhc^gL@vaLVQ`J{&lJ+Q3;ME!>5P^nm?p`A8vVQ(tJE&>vxiFETHnZFgnGp3ICeQ>7i6+fX zlERaGdlY=6E@#GL*7^qc2_r9L_))`OM6*%}ZOOJia?_=#Npq=tO^~`bnw?Z6CR(o* 
z({ksQy=X$o+R4b!vE`;oTz$d4NU8nUbupnKLZ~vKusFPv1#aW0?#Qt`9ay^$1597IT2j56c5O>vSE(Nub&RtqB z@0(Mop;Iv(RrT5ZD8R%XNhbnZB=uRowEjwd21t>v%Oc;Cbp$ngy@-x6;q8Xa2FeUTwFT>AnFOdb~(&(7D~U6fE}Y~) z?|quHF`Pw7Z7W0*39&W7vzQ*_Gs^W+Grr8Qe&8V}JzhgREa42aQ zi}08}bgnnPVJfzm#))IO&2;=k-A&P$8G|)zeCCg^_B2kzv!R^Ud*?@RA5l4cC%LEy zf7XmWb5jv@(qkw+KK)LL#8s&A+w{sQX_}cnRt6vL=b>nWJ#!rrO-vYrC&dKSn(jK{ z-Wyjd-+H&#y%)e$eS%ftwjioZv0k~2h^D4kuTxY%t&o(t-xiF)Sc|P2aDfM9=JH10 zbGh7ux68)}p7EEz)4=LX*<0str>Qn~tht!#LLj=i+`Z{WAgblY)T>(Vo%Iy#O!1em zQ&Quiu?=c|5mx|FE^u!F*8|bU>@nO6G=BCw`P;ZC)xbz{vj-o@(}qTUV9%N6KC&sL z0CRQBc6;jq=ZwYr6k?`Wp892BcJSl%oQIb#!6V>wX{`B+s#$3C$d~E zzjX^%5g-?|(+wl1Tfc$pw*jwbEdeUng;~>6dgRV8iqB44`PrungXf=Oh(G@{L+<$w z@#nJ)KwvPPN;y^mFa~L1aB#X|@Z3{`;^XI@W^clK-Y$1HUkTND>$K}=8t0~k&vQ=~ zK3{%{P<;I5r!{C$at`m^MOd9Ns*~N>Xt=H~?0GM^E~A2`HH)93EPTRwaAI6Lh9Qf} z2Hp!3h4=ae?}h4K&n4i!j4dN}9}9M5yU)ITvxbMA-l*2#aR*IQ3?4n+a{_>an`{`` zNT9q6t+uQn-+aw(@18FmzbPV7O(y0zom%1^i05_g!dY>i8{TQfNVI!vpOfwOujyebm7BbV?slsi!{AT=&76AXJc39fmG3j0nu z?E0I>hbMrkgIRiKhj0oHZ?aQi##+`s%}TA-gEnIwr+TxxQNsrcC_v(^*YqIPvpRxK zJiPDB1Dc?UUN`>I)p1PDMAeeOVm&8d1ZYh)tR@e+m{QWe41y;WNE|a(H9J8M-c6>1 zt`?F9_MP?o@bJ_fQ_K5a8Q_6w-1i~8SuJJPOYy^J;csG+N&HFOVCnT-Hwlz*Vi=@V z9EI2OdD0dWj0lp4f#d?-zB|l;6tCyf_dCWz*eRo*kkyMmTo1k#3Fk*@jKO~7EPpcZ z)$7i)HW|e@7Yg?w=1aD|q1P)zVCKC>*^W6n8$OO(A}!G z>#Q~D?cq3h7kQV_xJB&>T?vteDUnq8(q4`REN6kF2ss7y=5S1Qc&FTR^f}smT5!h^ z2h4S@&v7gQzY3}H1-~F(ufBA#qdtN5x5-6LaQjjE|IXoq%W<2x`CVIZ%kE{+J_X=7 z`KU5(3<8{JcZ(hox-B#)kx++S@YW?vl%Subbe*?O0AEi6P6$ManQ*fzVcSQ}j zJ{6lP@M&P*Dm){G?^l9n@d@y7mt#E98n3qs?0OQB#9Z9KF1{#gyaPm8YYUE> zwG3&fWtarA3K+zSe-tj~L->;|(2QPs`BVj$gLvt+9$bWX2n)!einK!p%JLW#yajl| zc846=m;#*|lTEyP0=>jkHzu_E4!s5;zSEdn+?^V!=|wN{w}y%;v6hN9iR841iY9HX zt$0b89H-%PB4208a7$>Iz<=h`}p{y~P3k6M|)ZgBz8&4kec&oWMr~bdY+`53kXQ6&{}; zDBYr`DuS-fwfcaEQo{n(!L9n&o|iFHHcn@|ak}3ZrMGcnp5t-SDyR@J!KCpKvS_S^ce*t(pEa>I7U)_e(A{ab z>#+cQRtPts>~~(F|G4sLu{gR+?V z`P8HNDE%JB>D>}22otkStu>M!gxgfRl|OSnN^oY4fvWI6UWtuwR}OfJPHn3a-%ii( 
zRL}1yq_)wyp1J?hoAa@NcUo^zE!`44=d=^rlVFB;IA;>oKA=(`6wL2es~=P=C>G|J z2!@}Wx}yWy{MdYa-MinxIH*bjX!_R*DE$eF3V{075isgoM}QfBtaKr71MJ^W3*Pv* z3B>pEXU-Y%aOtJ-5A>byi`&o((?gjLs}V#$=8D;7vT4ChtT(5L=jyzUrTF;jylR-6 z_dSA6RmW#pD*W5bzb*U&t+)@kY)?)0w=o^SJ>LOZdOby!!oHP1XFc3yYDk^WSi~z% zcm2$|{+r7Bb$265SGg7Z^`dX%X(usBZv`06$Vn5JC)mGen`+#{SqR~vo%5E0a-sr+ zD$DN28q$ISix;wj+zse16b4K>J1lcM^m-*GF}$$+G&vjz*UC{U$R!HBADKe2hZ6|@ z`VJPF5&AL}jj;E*heV%&Yda9pmGkVAR1k+|>R|8C%?r9V5s?Z2XPIZ+BeaRH3R_gg zZ?0ZkH^vthD_5BXllLZ0DqVX?Hs3fWChA++VmNTxPw~!76K0&Xc03uOsp25xEX8uB zYpOchIg`QFv5^?F&tsTqS^jf>+;5X$ont5FFgdCUwyZ@q5orkg&H_|R=PAC{OWlm)04#V77_&YO-!7rW6p>IT0dH#$zJp|}!j z&Q^|S4N()gIw&4BEPmKJmU)CQ?}S-PTn5Vl_+gqFKC07JCZT~ zUe=izF0V%IEP(kg%Yza;Uh7;JSAhaOXHdsf$*4|+Mj3sodfIv2`_v6h9p>1&IJHkV z3F8Ni9whc?ko;HoremFt;@e_0#SE{nfvZ?vAtPTEsg$2DWW~^3e!k9~7Y{OJy-cZ3 zH!7d)z&z|;8AH8K<2;S!ui#KegCiVVepQEFK)R?(kpv)~0Sg|`ob_A; z>ok%iedafQ5lPx%)Q8m$Nvi)PNzyN_TT7C9N01~=nQez8LHI_JBqKpaj~pF>xf1<> zIAw?XM({imP>ck@+&L2Ta4;4}<{A=oKc5Y|!bv0wz`a1Zq&sh7q#Wh(I{Ha>Y>bW` zIQmftQKvq~4Xut_62ZY|E@o8ZBNv3YMX++9ZmS(RacqrfO+tLgt06T~&&G7;qrdlL z3S5#?3SKYwfodI3n)|LRUSXeTAfZ`~7*z?6>I|b!em)^T`Dl^IPvQSh~;S$ z6X%`T&39@7tMtd1pA+d%?|%;c$(D`s{d)MJ3yem?F4pivFkB<)k2m@JUo-i+;Fsrz z@`FYJx3Zh)JOLb4l^=pvg7|30M&XCBVmg(Pop~8nRk|>X-HAEj&{yT zYk{2Y0>}SW*lQf5;9k^~sVeqk|4?DHAm8V_1mHn_G-6mCcX9aMtBku{e>Ce>} zIBH-QZedQ^iv8?xQ?pY+yB??DpXjVoI>VEcR663KFlfuw@g8-_QKWL4aWwio`_iA1 zKElHei9siXDjs$WiReGyK;j;G9k!e`tOPmPvDL@UY384&6GZXHTamZwWXE^D2++l{ zEjfMWaXw4OeVkaj>j)k0I!=*46)X#mQsnsSxTRfLrcBFOl2w7tP9u@+*wOwIk?>MQ%-i~ZFfv9NdT-g zv4^J}17hxX-qTL9xKEj0gIL>Pt=t1)U$>TCRb`FSRo;2gS(F+McDw;;T|Un2MU^+7 z%2z4+=Y#yT?kth3T??*ugezTmif-dP?XZGquVSZn3R{{=@X>>uGwvI`<2uu%L?)X0 z?a@1~FW8RjLAT=?Qw-s9YB4mM8MfK#NhN^(qR4jhwBuZ}wk=j_h`6|Lly|QWPltZ!C8@1eZ{D`yVJPJ?vLJ<>Gn=@V|Qi&I2$z^tO;Jn5%oHFBJAHD=0g7}6?rsxWBOpdXg0aT{Bx`G9R~$E607PfX1x za12muqQ=bkyyTHFh)?HbvKBqDtB2cayOA6J*+2=K<~h6ZL6{bQ*@{buJvkd+4g0bl z*9mGC_uzQojG8~im#|*d!sq*V;pl zEdNkwmS<%PUF#s|W+$!C##5TKq0QiafWRsr!Ls^-1HNiNEwU($` 
z1jb36X^G3nPn>OuSB#SgO;ta{&aI2A`$dbRH>RiF3hEBNl0zhRZP(GIILp)@M$7OS zenF2!{6&4fFI6NU-^vMd4=I5L=a7KC$tCCl7yn!}fzc5^*A)adD9R81bh#p3LSPlm zBmszyxWIDor|o$y&m91DoTN(yEs2rPbtLLf$1FUx?9QLZgN|c5pMsM#+3VyI44|!X z4_sq+0b#y(a&NhE-ePXP3NIfJ2Xo@Y+XoM713VN&SC_~Vd-3AVVr-3E636j;Gwg`b z9gjlsr4B_8PzkVCvSPdYGv%zPdG1-Cdt5QfZh26=Z+E0G7W1Q1)g?iAjB4MLEezPb zuo5-Rq+Jlzo+)NuN0Z|G=J<|{(@4~m5cg*bT(i*)0eiEFXFshAHn?poJ!4O@?Md+0 z668p>9fQ{&q;GtK*?;7|!R_tt8|+RBmu>KOQfuu9!|!r;XI9*{kt_6gey7F#ST=3OfKgit&OXsXK;6nIbNhY&@f(R2 z0!Hmp1c~SqtTJ$E;sjkDV|7^xu&B#*T?Jr|jpa<$<-n-pQo20G(ikq?KT>gdObGzR z13||#aI@$66*}L9Raok)6Py!OBc)4>LD2CvG7W6L3#Pu!m^kM5;UM~!o;p_NsW_c! zgSbA~_5x7?oB&IRbH}yhcd$hmi(B^V!0Q0qCC~Oo_%X9F8Sag0e^dLF8eF-_W)cRfy3HgRZNA*}kK9eOptRC|a za7~Y-S&`GBX^zfn!X7SiK%C{wJa%ZD4WlaNF|eNG9!&&~uQ-uy>S_He4T7V~u3)RP zNjPP$;c?mzvR-ybHPk4HUtFEWuL=Iw+~=Vi>}Z|5US)Wk?mKv#m;A%!3l@p#!j+T+D}i@b`2Y?g-O{9hy- zja6dj5a%QjRsB@TRf{o^b00xB!V}^Kjw11dD0{r%J5u5C(v8Op)dM*#rx&W0Oh#u8 zwp@yHp;TpQSd#7&tmd2!D6jYD6*tz`k^9rq!lS7l!Z+qre;4LW*~_JHA8gHeuT{hu z4WC*+M6&e(V&%3`_I}}ZCVW56Dg9c1myJ+iil7|Ppt(-finO=s?@~!onTmLrq3-SY z;EZyPdy6#vI)9gO>3bXgE}O@WQ|#}eu^R2~A}$ZwgGb78ao{|Yu{iuQBXkdiKoX<_ z3|x4Jgs+QlK(wl4vgwr|+GleE+&Q%xxpRK$1{`>aE2P$Kk=Fhp9%Kgi``L2KGMAy19B$+^}-^#SY5f~e}-Qkxo#T+x=; zW>eOAHp|V90ftDoNy`+DGoQ-jG3*-dJYYfQM1k)^)p`+`3qOSS2t@7zCT%VQe2XX2 zx&Nk`x>wUanx{3Z@ub=ce_A(JaVU?+%|d$A64DVxiNar&&sc6yAhHPKU;n^@^N>s#&|?CNDl2b@kaT~mU)C0G$8oQUaUQn zCnMv@xYcHov?+~;+UK6-XwJ<{)Xi=_orm$UOvcEIabz?NdUx3|RLD_~7f)l3L1ak6 zU6!2|Dnd1hm~w%_j8#Y%moV@_Y&LRZItu5c&`G&D_ho)s00Z($Q7!4>85*;NRFMjw zAf&3Fyn4znms`1%MIt8Hax$88mvW5GCZ!_IC-FQtnq$hQtwp9>C$zo$s0U9%HrQXVTM`OSwmro@ z?iXYuR4i!!XZwPNyWYN>*1EFocvb7dw&SO_p3`=GQtNw&7h7EWKa>AXJr>dR1^moj zl4gfowEH)d{!=-=eSy7Wmr@|eZs5Qdv9>Kv+3QpG=MqoyY;oo}|n(3Rt|4tE^~ne94`FR0JZYUPuBBX)sj#Aol_13I=O#j`bFfidB13=Bx#2l5v>XmB+UCNAQoLbOATiq=B<9dbU>#Fo zA38Ruj=QixgxpZO%kXEv;f9^Fa`uU4RTJ|QiRG`8XsvI?fHt%sYP}s+d?AzAsq3jX zizsx6PV(w5QoE!Ey=5BDRW^KFN+{A%`IrQa0*-LWsnb@V`1qr&Y~p@(Ui0Fw|1;UP 
z13a%yHYDy^Nv(yNdb)F?fzKei=r2#6l*gg`-gDs_kgfas;(<{(Gt|CVIWU=!yBr{s zu~rp+d?bn(AU7o$mlNPX_j}O)45MYYZS0UT@R#|*Ha|M)a}MReDtGs6;EWmLh`Isd zPDDRCvRaNEom}`Lqhe#>hNh29`NXR%_=H=R2Sok1(RiTk?gS&F-g5{63k`hkFKp{2 z_F*_0m9V~cM9wS`QniE)&YXpx$6ZT?!dZ;j=I}&aIhf;`0eTj!@1_o}YKLQMCY|&1 zYyopoA7bWFMl0E-i_5mUZwMNRQl6dGdO3fqS{Lw_tu%ipwJOGWF@V+oL2V{%u;Xnf zzLj>(bfnadIANbl*q_0ep@i?a1Slo}o_;)w6E71n^}V>nE3O3iy!Ts)+v5^yLjp)Q z?y5w&jW=mrH|(}Q1dV7?TBlnJ@!l<6ec;%)v8pl#;B zFL9G$7;reY>spCZ;R%=MS?$3V#0{GIu1r+>Z}=8VdEG!WnoO=)^mWvjNd~CPZ5aEt z;i&PmN~%cERhkc zzp*IO2799Dt9S_xYpJhMW$X_GSD!}FpQ-ytZ$?m<#l z%P9WrE-?;mm)I^#AJB&-iIX& z2zRD&$I6#^V7H~3u7gOegpn+X;t~nGJ%ov=<)Q>A;)Nc){hDh;htE9EdY$06V76DG1Scx7Xf4*>EI{G7>o#>fH zXNla%Iw?AW2h0SoqY8Iq!ko}CVEm#UCWJRU%Dd{ynfY7XypfT{*0on7r#UHk(cXY# zOIl$HQq_4`*VnHhNmh@oe3q+UflEY_eVntT_9Mpnv$#3;k3WRD3<}!>@rdWlvXv zss9uv&KD`Oly@(A1cdKSHK7ay9h_u5VGEoOT6$D9)*pNjT3(Wa*}K#nbw5-Wv_G$} zyqb^cnFH^Qt~6&DAw&uN3C{%=ZXhqd#_9LTpFy~+&Ur%VD{<(-gThB3;5>m-1wjHw zgy!twP5B?SPa)@b_L(DPm5QJ|;lDs>H_pmi@#>CSstS36) z0CR_h@VQc0rMM!?okKwp@`v%L#3HW@&$Ln=Jov&0!4{_!A^G+`Lw*7?Y%E9wX_#T4 zib}XCje#7tJ;+t>Ar*WMy1!HsG5nF656_6r=-sZ&xT5e=(7L@x$oCgLdI>M9(LsI! 
zIVDe+VdZ=bPnfwq42>H@jXvG6x1}ynOeC4Ri@bVp0H%=hS$cuI_TIM9>;qJxZrJz_ z3lf?B;Wn+e27Hu;M%CgJq`ojddOh>|M8A!L`smzf^zHRQLRAR4-Zdg=5 zE1TGh8e(u(xx(&wQPn!!UjHQ=f8HJp*Q?_M^t#?-mv zTnsr<{&*mzV+}yGjn@*gAWPalv4hBwe0cVi-wgHl#{j%f>x-Si@xJAnreW zk^XoYXma0dK9HCTUkYFJ`FuLQ&kd~3mQDQ*+RqY|K9tOfX|0IOT2>lJGP88Gj%aqU zy0SxJVP%2jwZ{Ztkw%YG;I9_Ce8dKbyQtV<{0`RyMRRoe29<*gqWUJxf{wuO37+J` zmpoOzs_;ci)zOQV>)6tWp0bWQ=127(UjV!yTb7RJ;76_3S|_=1`J)3b^1!d3zA_F7 zWtQ)wq%G2V;U#`J0<=I&0n(Ek@1&2sDTzA=6z4o zU8EcXd_Y&tuD#fW6uOlef%u5`Nkvmx0t9w0L_i-d@g`ZJmmuX{nMfFqP_hvPddhN7 z5RaDbodoY(&gR3^7QOm?a&$+9i5D@g;VNTb*$QF292P(%t}vxf-&&4}XIdUJH~N%i zUio`ajvtunA?d^fvH{N?w7wc=NG6#s0rppGYo?l@{y7x=MOb;LE|SxW&ygi==U zS5-8j!1;rfPpIaH>iFx{4xSwmk|7v`EGb$EY>67~;~8tHu(3&0t&P*4=kM1aRe+8y zCKh?~2bO{w!Ghw~LKbw2$o~|yLjDkIHe&&5jZ;_nF;lTS?^2KXVQcyz{!_e6@frE7 zvwZf+)0KmpW@<52V;GeRms__&SD-yw$P(b0&?rnt*L8(XDu&W^&qI9s?%6#B2D|76 z1iiop1ZoZT{;6Qb*Gj5xPy^x@cy*;>S?6a--BZ8act|-iZDaTaCG2KIc|*aBY!5%B z76rquYrG;B*%lR#H88y^u=$m3bCZ{-FJ0!sn0-0^tYlaRh?D3@lp-!Ov1#hTu;&-FMAWAoxM= zscsvE!T+m^CUI$EQB2cKld=tQ)$*K_?G=NNi{=;*fqD3!AF?h<@61KnB6RJM`yS{SjWg zQSHCVxyt= zrQWa!;Q`xY8nR8A+cG_;vU~NXi&9@T_j)?g;7e+SB z4+Dj?Q=G{dVs}TAKc09MMP!d5ZCu}LLZPr5Jy7zf*RBLv$UfttYcA!o%v8b!H1CO~ zQViqHo=oE4yzu&aRSaZE{65yr3vawv#UzUw%)S0VfNed3O(7GPrW7)LQ_b1Gi~<*=ll3Ah&1zU?e-_%L}!>^et2 zEOGWL_8INAiI9%I7`GE?kW2U!NUc0jQ%C`6%=+EUl*lEV{)wHQNU74iil)Bt6>d6| z!Bc0dI)jsmTm+ zAO&D-n8Vu(HHT_cXF5ER4hQ5lSchWQ$NziGK6jPq7Cz@K^~=a+TcJ+Ad@7px+tTp2 z@*M*vcJTpS;$&4vbKZ$Y$`zUK6zb+3t{CQ&T;jNtMN>83Nxk{P4lNGNYJwqZW+<}Z z&)KT*j6xVB*JKktn1=KLA|mJXYYajr!(FV> z?psyRe!cs4dg~vPGO1N4owLu#M1Xl>#GlRezcrKXgmSV{>T$^dsg{)b207(T7U;|r zurv8yJ4xR`t>ybnmyh2wYp|3UROW2UJ6kxLRaPT7on-Y@l9l)2vre_FQyr$KRcn}@ z9=?{==eVr6y$Ta$ipx32a)N(bZl5IIKIB@5H9uP!Czm@kIj*waGCDv}x~~2i zp|#~c)mqj})ApcUHcsoXQ9RQqrg{ol^#JBVLsgBkEIq?8a+f;b&_$E%*CK#JNfUA? 
zWrQ3GitQP8=(`%MP_RcQ*^fmIDs;7j3SA9UC`hDd7&_<*&_RJuon${Q1tM4Ax%LBK zP=4gj5DM^=+pv|^I|S~10bwI=LW zYt2+DNu4oGHKyJ(&H6QInu;x*V!sw8?AM|i`;|JQ*826HT8gcnRI6gEr|8F`L@lgT zrw5B__`0o5{ioSkw=y(<%Q5}XL=yVcXof&%EU~$=)%}UyR>Z(N`)sJFD z;8t~ER6i+<>aS5RKxPsGg?^o-wvf@}z#68J)VC53vV{{oLl8(PWK<-ozl(HPgWYtt z<8T&+Xwkc{LQFnW=N*WLZVG=on%!p710AqAt6)a_wyGv^%taqKEx^6_*Ua~`%kzDR zHr;&R?!GbKBlj(y?=yKnHFGA)c5CpxbAFpw!u;iYa>`6Fh-y6B&t+LoArGQa49o1Vx9S1a^6;A_hWoG%%H^DWDUa`R-YRiA(pR`;w=vz5`y2Ijxj zn9(|ak>)$ufbWJTtC~@uYDR%cLqDOd_BVM_vtlCPOWW%7V8ywI^h8M50{NbJh}t#f znM3DU%2Ee?z4Y9E|q%;a!Vv3 zWG;;BBeQfT*`l%ud|y+aPy|Dr*?DAHL~waSqxp1Wq}0w(#=uUB_*#5MYfS`;aOo8{ ze1h0VDJy;Cb(_#iJy5$hS>U9xa!tyE1N&bs+a=Y1wGfw5wW`I8)4>c>INAti+-Yt3 z%`EW4=*+h{A5vepM;lgE+Q(|#Hme%HpJmVBs^E*OQ@Z_~Rz*Il(#66W0=B#ZbA&4V z0x^C!DWXrv!xIOQ`?opcB@ubcp&m=62-81+(xv^aIb;f9E{&i%Nt|@-pN>v7I zCXLpvs4WK!V50QkW-JlMbA|hP@9YM+pbH(;v_Wm`GaKbukInIqEMl@pq-pIqz!%yv zzVNIpkN;-bA7f{SPI@nZOvd^SzG9A?rtqn?k5aFN(O*u@SRT2P?-;!)n<9le#)0GQ zl$ns3L0EIkvlDn7my<>2-X+)~God!slwZQ8eUo{B^5!1ZPQH&)eCOzvPs8hA`}PVJ zouy>PSb3w%;Y^e|d9?z(?VnSM&Ew_H49u=<1y{>^BR5;LeRju9+34+V6a78460|?2 zwy5HTOq9k>@VPpEc$_mkm~sb?N43Wrq~c1PGdrl0vFI<)!({ zNzK?`a6)Ex9QS_EL&`0ajhUMNPwof3*Nl_8A2fU!Z$&0|Z@TIEp#ASSNjdd?KJR)x zay3n*eKH(lpIkQPiqN-KSXO~DqK7k3A~l#N7J0wz%@EEc`&b)TUlcT9hRQ`8Br#3h z3gIq;uS7UQhDgFlnoFwpQGtYzJRv%HhJB#KhYmxg1#7>~aKHzbqszm|CL^3o$XKx% z2o#>%tmbXBJTJ!we~TX)I>DuG_3_w4fDL@m1e7F}osg{Tkvjwp+-YJZ7DqgY57nH2 z8WBVl66m}SAJg%QYl7z8En80@?_FUP8j(N8H7AjVunyKflV2vKg7!Z8s|T-U02#_h ze`?ZbkhwJi2LiMI(jEwq%rGR(ZpYj-5oLCDBob*o{P9N$HhXoFvX5tAKaf>8XGg4l z8R>L?Xs>`RnAzkQ!8!Xwema;l?;fFAb$l}dcBVU=5JI1GLWr&@OYakgk!Hf^iwGmE zm0Hp8lOaQeJmQRGv^??vA^(r%5vDlc$JDZ9v78;=#(l-7$Rm3HSsQHw7%B^-q)eEV z0B8P0#o?R%7YHP^E32ftEELI1SF~pQDv3lHFw9h@n~X29x&l%(0x<;q5EgO0FNbe7 zN+g9$yZ>KA5`w5wQ$;ArY&Z=bL{IhlkGre_(bLJilR~+bPl=y?lPO$R>_t$p_Bp!F z3}-{m_>`rO(_dKJ&`@~pqz~tyZ#XS!B|?d#5-Cdl=z=rb>$LzOxZd|*yRBa(+4Z~ewUyD5(IOtc1%M5G(?kS})QpN<)I*&pk=NguoOZS0MI zb{Jzf{(VT<%NPD3T ?a~u-XUl!=@`-8v#w1zkGMb?rTUvO%WwryNG)6V$l3&$lo 
z2a+;m+K8x}^!Iz4oRY4w1)HQU&fp zYYx`woV`5{L!<@iDbhKK=FR-v&Pet^$rY7nBvP@!AtIN~-E`5P?kMCNYAg?*S8vPr z{4F7;f`K3R<3nj zJE4u=uN8cH_fGcl(#ktCw=o~6!(&WaRB3#!weUqzGpO#&@FBKHRki7Tg++=Hha5C@ zPyypTGD1lEvp4EVi#0*CzW2wPAT+q*MobIUN>Aj6Yja?VPMqkZ5o8KzRO27T>(J}c>o7aOclk>n4fx01LPQQF z-8^(W8SV`Gc@1@#eus%WV!sZ-yJh`dL=Mh(YLNfG5$?|y4zew_F!`-i?l6B^W#hqc zLrz&<3}c1C*`RJy6u6)Jhs1hIylniEGc9pE$RA41 zw!|yOFFD5&uNnvQC#b3V5vIerCy2BS+IP{_F^E4HWq(uw_ybr`pP+TIbxoL08Fa9#;VUTwy>z7@T=L8t3OK!}+26 z^ny`!-RNk-XWoE*_6HqR@F}*UYK(k<{lE`$!(8Ixrkp@o1K+|h>*zZ;Q|*)9WtOuR z(0v+Vmmq4usSMV2r=9L-Lw3=PFq{O{Mlu0bahvmUm&s@YDVU%*R=76AZH4m<;x>%G z9jetfS)}0by%y~9MRJ>udD5VLegpqG@$C|cj5_MR0LiSQQ(WzDbysJ~QbvLdV{rEa zy5?>DFb@phrzacU!|C^<*~D`|{Ctm-)qB!2_MtLyKzTZul6FA(5S`^4-2D^x4fa;l z4bQ4tm-6@Y)+B$8!?)>p0b7{GX))Bo_?c@QJ`Yfx14?ZxmBX?GjPv)7{l`tLwL}5= zj8n4S5`_;mPU1{U6w1#yiL)(HC_e`vMl@kzwm2~enq!Gq!Q-@0KCTJTrx-I5CMyAypEzM2AU~EcRSAIn#0eLH@nZ?KN&w?0PPhb&A4{lL z0vJDW!ewCmSi(#tfbkP2EC=Gp5@svGLHxvNR{-&2X>Rn;HYJ3j6 zlzEkut72v~_O4)70a6(k8GBbS>%5p*jlCCsbHis5ZjUwX1(Fh$ucne9)z?L!f1k0Bd(l^Jd}2Zj-gY4p#3^? zdB!naY#PH2M4uNQ7Pt(FPxOOo{fuCFisU3g%WlvlsydXAehfj~%?~~v z5mGILIt&n#*^0iqJX2twtAOkp8ST-fbR(JA-p;`E4z-)U#b$(5lunXz2EXzIUMfGb zqYRx%ZRYugRN_D`JVP)h*~9^NWfiCt34EReKt%n7HV6%j!O%&#fov=yv?6@TJ3R{1 zWQ`hS#lbP1<8bwu{jPe~;b|Cy+yRs^JrlWRT@0V33&49@7f`XOAXBi{gOVn@ zfc1c?MKPcep#4lzVL@l&yU;t?0 zk)vwVgAi@&lMgAnoG-1D zJsS?(8dyQ4yIE;u9zx($sLUumjP&@J~|gSF<+t#I4Fr4za?UDB6pjt}T ze6{?J#d%}ivC;mP-*Gz?@Fsp8|7|t$M)99I@OJs{?a}rY`0tL8 z#aGh|H!+5~WBMYk-8#*bf5CZokW77U|0!=d%}7!f&v`e0r<|uua&uZd^-TRX&x}u^ z1umZRjtnl}%~A+IDPa`Xc(#^4nBU|LpK*ON->LFvtG;Pm&s7`fC*Nv*pD&ANbG^K3 z3#RJ%k(9xE*ZrnPuAEEl-md=#%

    E}U90lWqOP%-LpVC;Of7sJ8OK7E+}^+CE3#*9uZcj-fuL+0g41 zHuDtQh159&#ENPGLL88ThrBri`+KE|@lqt+gnj@IHp1{^>%k8bLH13h*60zD!32iC zo5eDNgc=9_@!!>m!9U*QzD;ah&F|A%FXL}jYlgq4xBd=)C$*Xj9+`#xh*y|iH$+LwBqj^~~WzHjH{U3hvRj%bvdJ<<=IBoag|Su?^Qf zvo=h*s*iVzbBzET=gP+kRYBk9_4A#ueto}4Adl1B_ekJE0a_vMW)H$`eu10Es2r5hTh18PERwCyJ{9iEJf&HuY&z( zFke|Ns)HXiUjbXlh6msZhsi4uD76KUukf7RFxbz4`a*QnyX2uHp0$_rS%Xml)fH^y2SlL3!o zK;uwpzy_mt{zge);5_m7$NC{qedDZrqFQJ$?lx1nt1cG?QVb2JjLS@gZ71SmtesSo z$0*ZkYYx{^v1F0Zl1x>6cV=L2i*A!-)5@ZLjdN@C=duOp^8&hX-x&HdwwdrTe57yC z)zVSz-#Gf-=TPN#g7NTq=<}K)gX{*^gSnL|=o7qMjc9|PGCVi^zztaCFqsS+xkE2mE^;Thzs~#_z8pI&b(1v7Bu;x{xZfH&i?f+w_ydU%1uz^pSYv7y1P2dGa z_c2!8@Xfnq5qAL$5*Kw;=_936;cNYLZ|qM_7F_^s$A9iX01e(>w(F0Djniz?_D}<* zd4Rldo`eo6?o-j6i>!eWR&`JrUj;j=D`mVN@`+@UklO((dsmfS$+v|&M41=^A73v2G{9S>gl%Yh$|PSfHO z9Ima!=;~q45(d@QPuYMgT;nJ|q|Q6^JO#KIHYj74E>_ix_D(9b(t0qG)twGFd@q3Z z#^z8-iDan{RA$Koi4Pe`n|Us`%o=-|it@uciEW^rf!|DND93{DRg#r*xO+0Vy5hyt zP9;iuLQE6|d~sr`Z~h zrY&LQ2PDuH88~b7tNveu!zZaV{+BV;8lop8KbO2X`A5k`fArbRwa-sW87nLy;RkCI z6EaN~&bso8;XTC#GZELg%;ThUOqfe`WMBl9uVZ)XrOXq3CD8nv3(X}+zp(PIo>TsX zrT1kl+CC@l>A})+Qz8ncSE0F{ZdNds3?D%@n3JgY_umdHh?PoSrw8XVCFbB_;|TSYllw@@wV zb%-1a_3@P!dh~>p8nV%Nx6c630ChX6*Dr=XCSX++iApEC#OloyUq}-1OgU7Y4c3rz zUzNjF3rQm2D4+S_-)H>KHTo=cA&E`9l?i4mkEmcV72vy;FC4l#M+YTvhER_lo)^vj zN43j3y-m7L57|Y-1U}I@dUMex>$Hf_5~8>Wk~g?o#YmX~*bD4nln7`M`cTPxN~`;v z3v>g#YUH3+c(SkuP`NcYNG4fKa`nqHw4pHyyNU7_1d#69PKD3fX)cxaDZ?P`6KwzYipLyYQl#r#8+t{z=F99A8pb1Pf(=xoX~P6OF+ARG2$;izb!v0U=S z zS68y_;7QemIgh%7`90K@YSK4dSn<3(ahjj^v+52Tu&ptdJ& zw#F$t#G^`I`Q`pwAlMxH^UW}pmkKvFq9RYgUN#%oa9a9}xH4Gx*( zZU$SxaB{yWY?Hq%2i~PT=aRs!oC|{3IWgFaRxn9Aev=7S7439MP2wy6yKdkdqokt0 z1QO4!@j>)4h=@Boa03K;LVX?B@h9r^$pR4;k{1v^PqOXj;7?iEg#k!p@BWzBM?A5F zan5YdNA(R@m}kVqYFko4tfP_)ANOqa98-*k zNr$GTTvEUOGMGzOLWl+(d4tTbsJGXZEgW0Bsf{v*=Jq-8ccpD9{AWO%?cE9E&-tQJ@dj_p4TEOegJ=S&VS4c?|fdfD7q@q zbolB&&lL`H5kfV{V+JC2l3JKmxx@pOkAkYu)mB-$>DjAiVKs8#W8k9bKxUE3?E#P= zuV!!TRkMiP?*cB+Yx7p@x(a|Z&TNuxJ1}6QS<(rai55ymYNCnGC7Hyauuaz}kqM^B 
z&uK-Qb{YV6p4^urVz-%Lb6gU19AqOdSC@>e3k~?lX;!mKBtoAr?7Q_ll~`I|4cz1` zz6TwbSrLv6_{B8Pa)(v#HixpjwlH0@=1xnMx5>J1o}sxFyR114G#B0C<~d~5X?H(p z^m0a(<+?V^wOYEYW0FWhWR`@=4zZMN@|v!rEIydZ0`iWQ`W>np|>WnA|w^O zKxFdi*~CCD@raaGVhg7_O*vt*TXIyBc$HNDWhzPpWuLml?-De+<=%~@PME8;PwOOO zY$lazzu9WpWKvmgrh(srb#@xXHK&`{uTdsMxcZbq-0RKDH`!W)L$nA=wA+Bvx;O@_ ztzOj~DpJ-j5Zh8>wS-7n$N`GSnp~5PE;JdthH_X+ugxbO7LA?!l&o2(g+eT;Q5S(a zN9C)O0>9KsI5vocHDn71!`F3kx!o4i<66=Y=#5}G*lI4vGR0_TGKGVg#B;jVWR?IK zSX)0p4CnjNq;2Jf!^A4HbmZI^tUs<*LYwVwnuvGW@MtGLCqj)xI>rAZSo>RGt7#UM zNYR^{<%KJk8(?ylP7bbdYcCMaGMEh6U#FBR_A9ti7X{ntmT*lEJSiYS{;v= z1QN3d!=U}$j0TT*O*F+AROch44Rf$@x;=}@Ap3#pOf*+X9>&v2WF)SnB*7mhmU-AL z07%d`O)5Lt92_x$ypzK2o8V=Rm=pB?nH#4clrTLgf8xgJhN1EczV`Ov_56@haP~Ga z?n764f3Anf?l|J%NhM26gM!I7P{Dfl;SgRAZkOvVViz{SZ`%5fkp4h0<072IZ?Q2B z+Qnu5{8$})owo`{HIr1jQB&A-?Kp^!J*JROsZB2^2nbZa3R(Z^OEKw za{B5$%lUQ?m+xjRF|1Ae!N;2~aRa@N$|1bBTp8#)B;hgdqXli847;Xr`ok0l(Tn~* zZUaFd|5$IK7ys~H1^hVlpGSF4%!?7K9t1NMmQ*QpkwtVHu}AQ$ZETj${;=zsLr!x< z3X6y?K$;Lk%Y-xJ&cC}C{IMk#{sR*>PPf-VPrm*LQ4^L4k}ao()gXlE5^t-dE?5|( zItJm;WxzVUkMsII)ko>4)U83~+b1V+(fl;;Zy|s>+yb{iPLWQ0C!2N!?SIQ#(bSU2 z#zkBfOyawwJM9;$U!t390E{18P;i*#e8@m@iKnm6cpk* ztCcHabySw2)oZ3_HYQ${-i3Dvn%)jumqFY~>?URo&kdeP7&FCtL{|7LCVmX>er1$g z?Blr{g(Of-HnQ|oD_+HI3H;XFud7~Sp}|jc8-#cZ(=o#wv}3V4A}Z_PZd&i;BkSOk ztzf)925T>TpRJZs*V(206l^qEqem4R35}POmq1UpJ%p#~>g{Fc~0Tm0cs`$)4@y^R0AN}2ZhHP+H~B3zZP<7dA3gKB=d_HM1; zU^#3+ez;k5Xd$*|-`eUcRO6lY=!aV0Rlk8f({#7}{wjlbT}fYQW01;e!zjFT`znCZm}Kg`(d&K+$~hx{SzTd>g2(PqnEdbfU+T%j0FLiAH)TO(RRJ1P4-% z;1-`M!bx3asm=~T*~LCpH0yrRV9Qgn&V%nNWGwXgH9TQAJ1iw1;77L5n@$|kuW9_k z?uoiZpla&9>_`Eo|3@RKTcs1vssjsSVE1(*Gg@&vQfaR`y+NIi>P;sUm_0QPY8-H; z&L#RY^i_%$cLA&0tCYD=l0(p~&Z!~`9+MP2Q?xac$_W%%^jkGYP(6z z6!zzf-zupC;M%J8HDnUhe;uX3%ZKNcSh!!i-z8`%+hXFRKV^8ZcYch8N}V-oG1!95 zb4C3!*@HFw&bPa4*;U=(<3q97QB7>rXLP+yB2gv^(Qcte^Sf+ibgG)ni)Cx6gb_1O zS!*biX_Il%8Rc_)Y9Ea)6MHFE&wR-6VV%V)iat-1nj0ets>6naIx3pf@tAMr)M9v| zRZt*~69p>mY_ydnXdWUcp@teOvMRek#zYd*v7$k&#YQ7Xu8bJO`^GR}rNS3L5>i#o 
zzADa3W3#O{4rdB(bW$H(ilz%Knwnz1#aXV5;RJ`h$6YoCc;6|#Swo0?a})2n4g7@X z*YUyB+h~dC-Bt$pmKbS>ANUT8V z<|J+h3>7vFtrQ371NMq;8FM8#J08T%moxaEONtW+;i z4%=*|JxrwTRy1sd!Iq<#bn(LrQ0>U+%#tFuA>~v_Sb$Y znmwuQ=^)j#v-Q8pG|kN$WMcbui+zznR>!dWSB3-6!BaF){qPe)?Iu^=EK$6*r%{A6pqVQuuQyZQ?sjoVm3UH zVI9X%$LSyoCSvLt-3pZtd*p^GS;?bg8C~Zx#6^zYzJ@f;4joW{NvHUija#`nOx^H5R>x%GefG|3m;Y3QW z!w|osnX=TF04_m|f0 z#M)#hyJ~I?byHAsd0w>oybiKA(x`PM0|1%d|dOH%F+mEE&Y}L~l ztY)_uDkA{(qI(%niPu?Y$D|YZLDzd|fUTA$ZVC^C`p0 z20mrDImBlhhYZZrPD{GYR2i)+{rslIql`Y+X(YCRuEZO6;yst(dbgb#_l;HxY;``> zJ5eF>Cy0fFdU>k}L>REeUbsR@v0VGX?Gu^waF+e%OhSz@$elcw7k$mZrAnb*OQA!v zRBt$t!R``ORf~zM>6RVwK1L^`RGPGxwMDfc^25h`4xi2v;Ie2nBUS7zc|1DTf@luM z9f1N zO07TQDT~5Hs_2Sk95(?>;vo5$ZxE4jSm=AQ!t_%*AGQ9N+4V*~am3lsmZEY-lg}1R zm`SSk92982G-Si>EK-822Hcp6A*D0fh^p&IXMXyIxZdy&JjK~WKSM9QYL*`b%!?4L zm*Fg4qep`it>1-t7*m;kmiXbB$FK+>yizmG`Z6Assv`*6iNM<#*ByGOT$vQq92 zC5%P1^Xoh1CFY|!ABV$y57?Aw^iJhm1cvUkVS25h3d^m1Z#O^2>i`EAzdsHEPCXq> zs?}w6ygrtHMkVRJNVNLWxx`Qw9fTRZMh`3IpWc|BF{)!Lj2Ti3pm<7dza7)eD-0(I zH@{Jb&mH)nS(`C8z@QapuE}eU>rD2VS`>oxnUCV`ln8Fz9{lSosptaK9d|hUiIdo# zubHa6`tr`ynFlr#+W1@e)IgbZ?`z{fz-llWHsPK;*&jM;k*pFEi+Qu)Dw zd!E4#f?iIxwr3LzzZQq`u^8rHSrwpA8T604{WK;{2s@*0I=X;Ov7247@l?XGk^ah7 zyM|&K51VUFiOg%ifGwQ+no~{eaD(-~%|OUhJZt>(EpAD(<1KeUU?*ILjyvjzOXq(r z>9CL4VW(Vdzp$fxUn)XIR+iXg8mwJ=I-T^VTPc45s!XMBanCg@*74t0|US?lTJ1>!QgyGG+|!0Q?ADN{b8CMb0*vWKff=rRYk$WNVh%9_Z`bl%R8_)7{ct3w+N(9qEMpsOY z&SWWMeavf*E7_lNVRO4geQccm3#xU8T#sRl6S}@X`7-lp#id^AjD)G#?2V7QiH2h(`yOq&BQ7nV3w6Q;53Ul{q|}Txykv=Rn?%e}XSiD})ISVLI_r`ZPX}BI7JBG> zOi)3`(?|?4_$*GdG#NpBIKq;qujMnW?^&KrqitYr0>&H2ERt-QCMIPyK}`gft!Ur) zDpoSh{$@>9BD!oXTYnafWFkjsuHg26c1W#?#NZPB3(2JWd9*wtha}l}YF4ylc(n3v zRe6d?t>L|vP{rls{6!+SCGT5QNrAf*FXi+@5qJ~SG~KRdCf^{fOj?*62(~IG?#Av6 za5p$nDHNyh-RAe4z@Bm8QN`Wm*$?8;{*;apIZqKf8JbL^jj+9pL3WYDVS2f%5xrHh4Bj@u=ma zs$x8n=j>Lz;@clLHeagv)Il56Yhlo)jT$tmEq>5!csg*;JvlJp*?1ND2#iaQn<-3? 
z1aGdVssu+KdZygTKJ^7CUL}Bw;SzWbE{63HQlnKMW0XsLg`;arS*nz(ij+SC z9m7&`N&z3ETqX-ZhNaNCXcZ6{<&@8Zk)ie%DZznc#3XzMkPIc!4ukb)E(&{w78&|( zAwEMEQV84FD%oKe6E}9!0&JUYawI&semy)AzX~JJdaaLNjwe{PyL36#rOGQ~D0=)f-x>!X5yj@NZbE|K94r{yT@I0Jt`DgMR#xCdo3XE+D z^(4U8<%BWq-D4cE*ZHmwUb|_Ck#A6%nO@=J|55t%ig9@ zZX4I!?qzn8t1rasTFJCs(8hbT+zxk%t@(f0^!O*NL2tlkx@oV3^iBhRXWY)F>3h^6 z)7?J9*TCE0cP-*|jpi8YH-0tFT^t)6<~fc#$ga)4=7h8zHxp7!5c$UC^;C++bBM2l z*vYc!64|Ks7UCyQ%O!rE2@j4JKlu+-N^~Scb)EY*v2_K%PiwuHzg4Y7{S!Y)KqK*! z;GGj^s8C5Q08?D7ql=TY*#bHeIb1NM<#3Udjd2kq>Q%~7e&Bz*a7imctfb-^%WVLu z)U>blN*)r(ILPxl8or`We5(gyqu=T0=F11sEdUzoinKGZ{WaCUH2%hv3c~hY!JqSsJL2AqYbl(l)RG z)USMjTjz0+Q~aV7Q)#g^{K9}^e2#sY0pLcko5xX7wfksrnkuW3R8WknPEXB3?x`cJ z5i>oQ5sn8}qp70WrPufojtBfLjIb6>w;6EPc=zEag%Z0>3tR}i14rX@TPheC?MQ+_ z{)`6Mdw1zR)QS?lhiY^a=X_Bha2ptZM>i$>YY}n;AhKck)=e{W^Zi z@FdR0lh2Hld{w76kdzMhX+xmlh?y!+xgl9G+b$x6)~ooGAtE|KPr{ z>|g7?ohC4>V^yse^7r)CkMeg?>$??t|C2kYO6t^mIAKPOf*J96=?5`t@zk|?oGElm zvqW>jDyb6dUFIH_c|RsU%Y4&+RUMqRIkg3XqKpxg?Y0x)yq2m*< z65tn->oR+Ed|Ngogv~{Dt4H^C$(d6+5TxVlX#@%%Vw(RM9@= zKwfE*fz{@=QeqrbQ7;EoujaLj%Gk^=uzA|FL}c(BeP`NQFdEv-8TbiJ8B?pM-8M+HnB>OHUVFK%0SwLd{t6vnM=F59;9aWT3Zmu zPpq;;!D5Xq2_o7Y@%Nk5I@t;eGHaZIG%0ie(lj%m$W0m}he^Zwe38>iwdHa()Wun7 zw^p17?p4V%=_SBkbqk`cUJ1Zn#R->zdSwZdl>leQ3G@1W!b~NAdKD)u2ldKIR4D<} zt2kjUs8>dxm4Lln0;CF~%~Gs&2YXdc6z-KJ)>^kNak{D7a&=*cyC>*yC~d96yn=5k zwp^h!fm=+oWSw$5towgi!PYLoNbK!)Sl(GH-d)N4cr05_WNS4IPW0Su>wh%+8@AJ8 z_4dl>c9tT8>i7fW?wO}~SVAlb$WeKqL6;cXTN0eXs&aoNd`*W!m3iL)(H7@6ZFR$1cQaT2Z8H?4HCm7Aw>o~>hYm}EaQj!!mqxmCtB z?Cl6f@6?}-HX~frr;fnvJg=}KT*5|6kL!6{wjrQav(eJy$viF_5J3N6o2ADyd0e(0 zICM=tuHtdobb#v17RzW+_DaivxSTDPrPW(6FN2Z0f{x!qM9X%NYPvhSXb19h``eOOz|@W zxH&KLbTN-h;uj|CynYr;nZw5~T} ze792zC$$th$``OU{!tGcEXr1L>9D0)$#?i#L?D|Ax&P=Na`FjOO6DQwO*pubOXCay zO#C=bR}TArkD=J%O>VnEL>OFL$^#}r$p$s1`))i6doBj-!9P=WgB|i^t05ENxr$NN zo2%>!Y4b+m5=o7DTL4h*0^;shuUXGg5eL3~}5<%cxwU3#Fot zDtt}-8g}Q8Hadh@k~(~m*kQFX#t}{D!4^QQ(NTIP!tzh0s^8l9zYZX-j2~7&FCCUD 
zPUxJEA7(GoBlohNRJb2cD^Za`iBtsf4w!`q;wM-nc7QNL%rV|p>BLj0S(-oTrhPbF z?Nb!7UaOBHR=BH{F6u2=6{U2z-8p=rEm31d9I=5b&OeY&>`6Cmqb@-$KB4K^Onnv@ zhZdF&pNNZ>thkFLRv>U3N}OtXhAqJ}xSLhP5+AiXT!4}GvG>Has9VXlM`c3C@2pG% z3tfM*iLLJZJvhtVV%r__JUd_&{km}C^$&g-nh|#XXnP5z_)p$scC`HgI)COs+Y@6B zww0}Xu>D4Su-(5Ib@0|_ZRy`g4jpVOCmQR*bo6D5a_-}Ivh4(s$zNj|qfM0!dz16D z4WhrLxX$B6g-(H*;Z(`VWZQE-E*TFpUZv7U)L-L7bM$gX0=?O>LCGKI8QgDkor)2I z-er|7o2Ik^YKheAqH-0-mhk%M7~coOmlcoPLtJg!f~a*m9wm6o4B8x$b|XPZf7C$u zA_)Y~eZ*65d1yx!r=qX15Z&fArFs=|T^xwSfb`vwxD@=v7Zr}|lVf_3O**m>$&!UI zCq9;ANZpen7$VUPrR@u%71P-aPyR51>di>)KZR6BQF*u)X}BC_h&HK{jETOv34j@_Zw{A2bWpRP&6V5?HjS+r) z!BsR8RlNr{fx{T>g)#F#}RE8gHw%e}mqRDA=PT1uP`RsNtVMonlxwSBwrnhNSyxMiFkhRz1ErURPw`7=qu$9#Xj+gg;O6 zlW<`YboGv+k9CbY3A*(Sw+d2qfe6kUIAw4PNACl{j0LKrak{x$SkT$J zM{JjZIA;uQG$(9|JL28dK6mqLC$|mtDGwQY@7C4hSy>~dqw0qH<%ZUPw=0Xquq+J= zxt@tvm5e}nEOg}xPxES*avvc^y2Nl|e?HNR*Tej9lf^VC)DnQT$FL{dHZQ!*qMDFd z=TptqU(FSErYLf(tqX1cH8{JqzAl$|*z&<@u12{~j4ppZEPFL4`rOO$x}uJ`(1Wrd ztSGb-fM7YpUT^*!m=|tAGc_L&f?h-nsx3}*yn35RYv_o1I_fr1@na_sB$d=dWR60Q z-Q;n$SteP#gK=$=2TwdlgN}ze#>PEElRlBcC~R{|zjSf<5dGD;zw}HkUN58 zpmsJT5pmikx5Gisb<((Y@e)L9AWR#JK0eLO%m{9`BO>jRmMJe3Nx?*x-$6D(5tA#3 zww`XHOCv0iYt6(?8Ie0HpvXp&HDp?Da|k@WcJ_9%`)U-8M;0m%FUzF1v>WA3?2WF` z;jkK$a^FdA7e)L~jtvwm9M%oJon;zfu@H2Y_>c{~1qdr`EwrmkhP@;Hc_RxpV^)SD z(bCaLQEeoW;V~CvjA@Q228%I9s6NOVjnUPnK})vqNG5TQg>UPeBp=OuKL@?j;x*HU z7&l}JoYDmACu1n$b|)Hi|WD~BxD7<7lD z6A(rUHGU^`XqH(PA8UL85^*JO*HPQ2vMGe$d-X)MKk8zR<%xDg@u|DY74XFc^Gbg1 zQHv6hD=oN`Ta8RAn~+A@K0s#TTM^Cn?KOSdFEM=BT01~)u1~`U?os<)BHf1^@6l7c z1yZ$pPdE(mPIybklCIW~<9xdhk!xRcJ<~hzh^xVq?t}b)0kOtz2@cxDzFKFXPIn<$ zoUv}C;)+h1aB;>jyIut=-AEYu))BWfFp3J89qu(ZKt_y!V#*r2g}aFpH^JbDmZ>8{ z`XlGQBNkq3^*2sepBPa} zsb-nu)KS`PgNTgRMCSi2{YLys(d7gVh)e~YNX8*zkXmlr;&&QI5Fc_J13YxRk8mSt z%PsLbARWn7;Yfz%iTe|os zB+thT=vJ|8u^BVqL$ zp2?d4MRARagbz}~!#Qu3q@@nEBsy%>b9Hb$nxCk^3mlnGGNLS`_T#oiQ3kV9{S59DMRqmevA{{uHsqacii7!klZoTWrhBlN z-D4PhqTbs9;f@kRfqEvmKq6Z$*wm_PdmJRcUCAcwgH+dr;jf!|(oL*U^S1|E?j!0@ 
z_WB7_W*c53$#4iu6$ZejE_|tUJInmNLKA!-nE!wxvmKzCtdRYbr-)E%>#iE?D3M( z0!6NhG37e7_WB)4bq9Q@7)fRQb=>tM`@%x9#jeVJ`5=VQo9ck&WYm>q#8kII1u9H+ z8vNK)r=b^q;P3DZ$?NQ3>=vcL&QduV9OTnXbz`stcT}M98Gg^xW`CM3`_u2{62Ned zn7<^D_?-%$_6r~ODaCO~XSS(8JEF@>bvWONt{|E&s=TT0dY*U+%e@wGptmcESdJ`I zJKrR!j6>g$@pe2qffQ;3HsE6d20Hs!Qp(yq3O3NUULGyNyVR@*(5RJ1%iPh;Qa6eQ z`~ZY?^mHcC?;rzbo(%~feOn|XG1i^$=1p+@Gox*FcRU_r7nDUOs6{KZmrlyUfan6xg-rgjZ|!B7 z?C=ZvXNHeP#IMr(bO8Vrh)~u%7PDZ>>w5cZ+=CKZL$xt+K~(!~_Of-s+Q0ddxO>i- zW>-@{AaFG@xy`yDn%YEXFX!u*02B0}fpcV<)*xJt$p($zh5{{I_BxeP3f|a!V0c5{ z7FZzm8V#j^TDhT?B0eMyzXhN`wv4-lSEbTy=$^62Nw+)YuPQT(-UcONs1s0N7fp|2 zsAHE>hFuyDCy>pA;G)sO~xcQW@^hSmT$R0Nv;$16#*icC#Uscz|^vpLiV4`uX9VF0hFlfbZH)xqP9ch7zNgY$m`OMF&8*_y{&V;9! zDkL2A0X=8;G=UpDA`-}~Dql|hUIZ(#D zA$;CnlF825^2~6@5KWLgl|fIoJ;4NeLs2_N$#{Ba_B~y22;C@HI|03Ca;Wn8I_^+IzAb5Et!f!BX_XAcskEK`~eG+HG>h) zy-VjO1fMmMq+;Uw<{ZX8aSzr!jhR+yILztd8jlTBEE%rBYOvm%R_wk=X0S zs;qkI?y`IW(;hNN$1@0Qw@(ZolHV!xO26>yrwpzH^s#G4Zw!xV#ojLUOQ3)|I7nzf z_u~9dE2THtHIJC=sw+%(|1DlRb?8E*9qX}h0UPY3Wg?xUM?{p<6K&I*)e|=hWVibn zkDT4^(Ix7t8+rAl8h=@gc1T9P%UJ2`bz&t%I4V+}h_?hYT7p z7w*?R^rSEz$W|nux6uXd)mDu&(B-2G>ax-2WDD}xfSPIVf?z1d^_5RbV@IQA#g2jq z=!0U9?6fkHb_iryt2{y=B=yR|V-cysro*nIyT*~X_UGpWseUBx9Vs06&-ii;YsYl;f*&6w!^LK*Q+I;)JL!My z?@cWuxPbl!d0KS#x@|ODi3=Fp8+Vz6_ppHEHXB$O7m)GL3d2bf`30|s8ISI(5EeWB zJfJ?&0t11m&)MsM1f=sWcvm(}cSC%P5lB~M7Q=|=k@U16@z88xJT$Y`+>#@grrB`J zxWr6!!s~-P(Znc#5g4qrh+$b0h5N_gHw$u5W;(-KjOi>R2D(njC>VF*0kS?tZ5aHh z<4IZo5s(+Oje2BaR7KVq>jh*wv`1QWrp34~^wM7CQ*lEaikDhtLmdZJC2?}cZ+kl(>#38m zT4Xk>F&eGolMm_SW9)RcAO>qUGj50@Dd+6Mc7035%62+eY?PgDD{VSEUCEjlvD3NV zBX+t@a%u%piCp47+38q|-0I`(bay&Cou_2re@fikSV{7m+38Mvcf?MYFI)gp;57gR z&Ic#(FPhoGr;GEkyW~Apb(o#bw^Tj_|EPTKQmOEK>^2Gv1HD_Ss1vuN< zCQ~2s#eZ4@>8vkuR>SU8XWRSmZbS3-pnw-I%N8%@a*%)RR4tyVVmSD@DAA(%*@A|FIZ%rUurg+^TzeH{xZ@^q} zF)eNKH_87%Yl|8-#GT^5pxls?#zaI7y;S|wXPbrMzew>=^V59)bKuAENYOT{$koGl zSKwdz_^M{MiUJ5Tt z{{1TZxM`WMvg8sqRHC+%pZm?~EZ?^pFt;nA24FfC}@hPoIS+61%z94AspZ$PjuUB2acq 
zvt7!_Pp+}Jf!?ojmI%ioP+4paqjt*j=V1KMqO6WZ)Iqb0HLqw3^E1cZPPJ9xxGf#3 zXQak0Msn6e7NS>%Q}DWdNg*b2Z-;-bIu@%Jz7BWuN=6k}ezF;S9>*sd>tqP)0pZ6A zCg{Cnadv&`qip-kq$f@T-6$PChQ$!zJEATyQ(Eg4_DQwmd>=Vb#%mF?XIDCWi494r zNp{9Z1Z-F0ONmA#0d<@(jkak6M;mI?XrtfaDp|s#k|-u;c65++-RY*iSU3-IPc2I; z)$l*psZdAr|8DPX;G?dt{Qpc6CEAn;mbz%!n!2$K7BRHc5~Vi5NuA(CDPm<^TLJ-# z7J)*r)>4-?31zB-*tK2kvXFX~!a@LwiIWF*y_K&piiUdu?z9b7o4w%8KO7kulxxQ}41MGFSbhAa`#BC9 zqok6}$1%R(_7&JAOX`<1P7GATt(K+`gDwwobgwOw@8d%9No#P0ZE4uw_R{|JQj3$P z`&iny{Re{;1kdXPHy1qbVxUa?`2eu6YIx$Ahxz1~b zZlf&kzIAfjJH@T1aX=vP>muFOiuLr|QV!VkD?Lj$(3bz(u5;r3%4_x9|4zaCf03}P zZ7X8%p2w51os_)v$2t=?uta&Pe#B?Ls+gV!F9bZ8Yt1`3rqH`cs}&h$MMhYW5ixGP zttefnlN_a1h8uq3?N4d2IFA-I#$k^XMaffMqWt}T#Am<$R(9aR%$GU9zFv8iV_+Yl zM~O+U#H5l^6&Y+rrpD~YFv?)T$(CvD!`WpkG{+U1Q&Og`lp2nSnEfcHP>G^P1m)uW zQhhimtf3fd{Tf?Ru0jK?(4{f^QKi9{)Tf-$s8G*3K56GEu;5@ys%WTvE8cOQ9jb5y zLwvX97;Li{TYxA!-hngm0jNjQ7N<_l<9OyeJ_GU=;X3+e6@$2y!4WsRKTvjsoQoVq zqr1YinxW+iopWo}&Ag(KPfAfNO+Oi2!VgbNb z@_(#z&`M?o&r8Mj5{{)7noV)sjbGdeAzzjn*0p(}QZ1(xS=X4}2%SQnMzWmwGYVZA zQ0SD9LZ{|X=wPGJVMd|Dj6z2kg^tLfP)H!*-e##0O}SA(xe-l&Be?!~h?G(yY!Q-` zg0dksC|fjSr5tJ3*FF+&-*^z2ioz)iiIX1^rz|8cQu4{vXx%7BWDOxfqvV0CnMGrv zQIU~Pqw#*1Iudnc4M{lyg{mnr)e)$ux;KHgh6F0Ij`y1bfr1uP4O$5Zlpp2Rr<9{m zb*kgj9)*@Q)b5UVe8ngfvT)%C(LW$hQDnU14#-oywUT!V&DipN-EXIm=2i^MfNf@| zAE4~>=iVTN<0iA?=>UqLa!9Nb!ATBka&UA_dLyAM58jLPGBTxX?Bl<)%R`@A}2!?v{Y+e zDlyK0R0Omo-5Bv1kj-x-Nq)H_b4#I408Dwd0{xV;pNv+bi!OvJs*XW%dCk zA=&_qsexcwNyeU)Co(r(jc7~7?$tr?t(Lsl;?~mXb&d2$L8fL1M~c-@Z#_e)D!PEx z4?;?X;|sIcT0~+K8KwLVC+GQ@R9mSD&sKc7g~io)RgR?Y@prVJFmN1`$JQ79&0pqM+}1sKHFY3mr}2NLvKY-?694XxSxgX*!W2c=9sW!63&FRUYDVC#0h_(XB;_RaV$PB)j5Hv(2t(M7C z{m@AUO#pWe*u^JJZ!`QwFeYb!MIEKh%}AX^PdFW|>cjxMRQgE^_y}*ea(V%y$uGae z(E%o5M`D}L&kP%)<3}qP9*NEst}O*dl_cIfe}G9el1^a^YGWi%4e|#FCuAz7nk{*& z;q^JfVM$^au48bp*I7jW(bGQNeL4VpjaFUvyTPI;Dw4mHXh-&#F1|6dbW)7{^=m%u zXinV64l0rbII*!hs_JT8bL*LbFj{7o&7 zHe@cc_%4oy5yfQmm=c_W4JoCSfU%CM=%y;Q?MJ&B$AWJ^k!o3GQLk|ymuHRkY!)?f 
zt%i6S0$nFX7?k$Kv}qv}PUnGq+9{=tJ4p`e*OcS{BJ&j_Q$&1YN4#wgbCLJ4AU^MA zbuu&Xi)=r3X3Bq0Ioo)IMD2Jl2MA8$f*h>gu<*t~uOV7u+MOmTLH0C$Vd5UESZ5f! zacY%dOm{0V8*1~>wlFDh=E-lz^!Z>+_Cx}EJZr>65J*|EXY47$U^!#0HYhK(vNN_| z_6}~uiOdMjqdQ?OzU&B-{3*F4Wnu1NcMy-x(kDp{P$iX5x)d}IF?B&o=-SgA4g35d3Njv-$MX3Zqo>qg^bRBY- zzF#HG&hL6FQ~mBY)E_f{%l^DurzD&c>j6EKPkK~{&rQ=?@b zvTxBhy%wl|PQOMfXF$NNEBlIoxYzxv`fdXt7u;z()EdF_2aW3DVNEh2Q&K~#C7qy0~Xi831?rs5j9#qH(Hkah=LcngIlburB_$bcZE!+Cg)kNX_O~Chi z$&1mAn9?^aB6!1xx}1>AcgwlSo17?kWW9*;t?B=OclFFf`ua5Dq7HDlPYxI zMooJEsLZgx2MhQ-ui>?3^Y;~Fl@O8QrngMF_EMcJL8N0g5dwB=J!1EOh0TQ+Uqmgv z*Egbd4z+AffLNsrPJocCP)FQkw=!r&*Gi5E;dW&eg_~P^<)w$_YO=$tAr`pAAtA+= zEj7_Wm$-7{iSBVtWkz(UCpcyew}szDK_@^nzYdPkyu~n+d@i9tZ>3ZCIq`)6+(R)T ze3nx<9o_a?FnTf~EXSYee%;nzjU|VOY9x)L6E5y8=0~rM6*b>~3nT*;K7?63PMi+sl3F(dqhXZI<9<(QNVwYonzok5pyls7L zhAll6c>l8Kj^_AZUtCAEXv3XYjz`1pb`>WF;dii=&W{$9F>TudpF8v92UvVrz(;Gr zR?_sxwN~pnSY)+kkA@STdGg*EA{DtO@f**g2Tjo7NheNNsh=%UNhC$p@;OGJC_c#5 zIpe(|anbdx)vo|n`e{l}W*RD+GUrwT;4q#R&G6<=xV){+uTxGAJGM&>7k^%5S<`m! 
z;=k5xt;`4keD2nQC)jRVgm+UPCmw9krTQ*%;IS0nu#E>+GCs64zFEmm=IQ(zeu|os zT;SF$JN}}Di;SyPi#=^6_XQQ<$S7?`e9)CeEIGlNJ)4z!N4))CAj*YgvtIJurNbtT z?!ZZpPKER!HW3s$-Xq_n>?%#KC0n@Q(`psh06P7u^_R|B$8_w&f=?q&Sck#FjzZc{ zxb1yAAfuxryR{jtqap4tS9%dH+dFJY<|ijTsG(CIkMNU)ktfM0({)&<=MDXo?7+*R zuH;&N+YWh;>l`>gVolg93r+#;ex4H?$~pWLPDt*4IOxFityzq<-rC{Kz#%~!)b!6S z%y#%v-V(NPzhSl7?GnCg$ndO0H;tc8W`diJqu}ioY|C?=grCIzaFr7 z{cPxTNz?Dso9eh2&cp=2I=zw!mZ;r|m)FtEUHmr)BcyQE+0&7$M+G;{)o03&i|tej zp_L6-#cWKnI(L6}5{EiVD9lZEhh3CbzdJbDvZF+Tbxe_#OAIcb@*F zg<7`1jYfbUJ1hHKO7IqTa=!5ek?b_5g$K(#uLU=SX>NrlRqB9D3$E03u@iGz5d&HY zubT0i6oRR8z;T(hootz@*+~v^gc5`-LF-_;6UCT_2G+IhsZ0}$=j+0zN8%rI(&s)* zLAi;DczHC10m%~r)sZYm#Kvr%(odTZa3GTJ(}!fj-Rns)nG*jS<_nVw{X?!eJ(}RA zAwa2WLMPz(ZM+9xinyM3DEshJgae)%0L;rKxY5|oO6kbhhSlUhS4zTH^+uX1nrh!9 zjvK7U#jPXDbWYm!Jcz$0%;&y30K-pN%W&c|1UhnfaW_`NV05sWhMWv-N4UDkzS~-d zr;t;ps*V3z51pbREx~+CbYa{&EUKnJW1hCuu&RUS_0|Hu@i>@ z8OK}J!i7o*oPSQHytF>H8*zl|FNxlsji`^@bpw_@=W{w5=Je`!$Kx-Wh&^Sz>7jEn z{Z$ad(nG}1gpHZIC@0#I>;;+kC`?DJX%MW5+rb)Q_uh`OzF>3 z>2_mL5kEqCfImrNn1se~V1u*tVi~}l2W{c|Z%bQndjU>c$ec^u=;fv(urmhTubZN4R0z_+o^|QbC~4J!%o}Xfu~dwf@lXPAP#**RDcO`v=5x|Hyt*k< zT59pxOs>LP(dv;9kIm<0X;MhQjynbQ_)3CM7U~ifZ+QGI8O6hs!{Yk#R5HGAD}>p%@yvQRcegGOYgn-AxIWm4!l%pR#5rIhrW7 zuk9U9AWrBesMvYmviP|R0qT!vA5vVGP*fJwI>7NvLtpi^-F`K-Y;0t$%>=_t{dzOW z%=P9Wiz;(<0*XRy#bs5F`?%jcX5&wJv;0CtI-jGxebL#WZ- z96*OSNxDQZzuJfqIqzllHZ$-H^is42ykK@@M*Lc1BQxLVk(vGTz`hf`qP3d>_lOcd3n8F1`ILnkRVOrdG#^8uEo@a?V^W-%#DGXtR0{1P_ zk!@_ocCxUZcYy{!r>=pug;jn|xT$CWf&F5$LIZh+GShK7d|rj9=#55PXqz(QmB>0T z1}*VW;wy_nmjT|=BU+8*P-N|U`B0z{$)da8Rhqecvc8kJi+W7Mc$@^>W68?Z&pN6= z{bqDSg5K~RaE>;U;~hRQ`mun0k*DFO$Rg$(b~mV2w}N2`B`dC(*p$e5afN>FQ%Ceb z@T@^+vxq;z_hl#-UGer`GVJo3CJPp~o{fBOk$wRZ!%xwFD3f`D>57u+iO}knLuIKIGrw+QPllZ}Q@harPyJ0m5?%>BGdSM+G!jZYgY%6w{ zMC}{#_62C~wwB`%ThHp_>~X=xDY2DE{AQ9P>Dbtt@(;-LztMoK$V3FxQa%N1UbvLI z!yl`l`PBOq)^a|?E$0a?4|D-*UBONi7mQCh>$8@NQ%TpLJuSN8cN|q$XzftpVBwLS 
zPRoOql?rm8X@s1fWrS^o@EA!Dm{73<*`1gRp0Il@?m8QBLVVsu3)eREC+**p7z;Pz;j??uNc-o57Lkb8E6X4DB@7@c7uxIn=)Su zo%(y^pAXSPFisLi?-$?!0NCh5j`W%+6k>`vlFblj8nD?bTZnNal}Bq2Ig;qVC(Dsk zt>Z{LB963QjT<}Mpju^svCSE>9v(+(s(pcoMp_6KGRj$gWXz{GKhk!E@cYy9Be(_5 z_J1sX6dyDummN8VM?XAvBv#;t4<_-!`H^t{;QUC(*X$KKb|h9B@gv8qU`Tpj9bG9%6afW#9B zOM$5&)Bp3xNPz%!leNo4+F9JTr4n%pn~3on$9yzTL-Ahby`5#%WF?5c9EA zF(!E7t@g|OHB|=gvO-8Z`$uRjaqPy5$w(4!f6?WgzQ>n>U?jE~8u1-S*==$@q1kB$ zSd*q~hZ9kNg;QaOQ2k*`Pl;c^dicFbABY>Ivf67qZn|uRE_e;LO?b~LrRd-c%e^?8 z9gHz^$}>{iPf~UkWFX z>&>uXUYK6FEZ_0VMRtv~oWC`@#=WVSWS~)*fo&nnF*64;P^4_bd`$h83Bbub>zrvL zg=qtMM-ME-(}LbI=N?qbhS>IcT}2w+AJqQImlz1P?nvvN^ClWQO7{$khb_bh zg*M$#(T@*u!R89cTZdwIyd3m#iCny}tY+pK>^_yvR4I?9)tUu4{dQ;q5Eh0Nyj1IuvQ0dakg8P{Wm_EQ(uy(MP0M(KCvVao{tF7u*rvHhd^al>ExFMtJ#gh4PmzfN@ zKZY^KA96Ey+!f&GWKB7yxrX0X{G@P*!ofT6cy|&)V~e=6yGj-ZJ^UCugFSqY%bGw- zwjlg-CwMhgiWjlDr`W+i2!Hp7|!9Dg68H)HfyoAwec3Mw^Y; zcx0IhZSCg1vm?ENNIiS@(bdlS^g50a#FydNEt5aE$WD;Pi$)jZ%+wJQb|3i3RW@>^ z(f)I%jlNUywJSZvomLyIhOb*{*VT^7jQDps-*u4{qZGGQu$_&+2*$hM-kel)Zw_%d z?A{!?s}WCqb&~=Ym{CgD2hKt+IH0rSnZtk=D`V^etl z4_6Jk`+ul&Gj)c>E)hkIrm4^*l4Xrfs|O?PO)}o?+!z>vR97) zzph;TxU58|>7@zH`U6iuXd<}Qa!?h4cHb?B?VR%nt*R$*LQA!PF~7+(S77Wj5FbXG zkQ{@Gg83K@flhBYf+Tnb!`JRk zs|S?GhV?B*eIYsWHo3_uu;Uh@M*{l%^$P(#uD5`r9zDWZ)IIpr?bb~hmpx(}4iKt! 
zzP3>1jrpna@_;JoqW8{&Ble7xb*TVVa$izJm6AI+hm)5ctsi21s~o1bVr1!2m>z;2 zv61(p$Gc2+%?@_~+_)`ExE&!quFb{Iqer3X=y5(z@s3|RI^4<+Xmqb&nTxrHR4h{5 zOBUif`@YFshJ~@5jFTJ-7`i_BX(Chc%kBu-%`9(7KSwYcPLS+}nC#?D`z`LNxb=Kd zjfjk+5ViHcBe%WxC46f_OP0q?ecXtn@j>UBpNivjM^Ybt9H(;{XXA!GZbU4}Zfc!; z=g-!g2x=Qt-?ox~2JSqg#fJ#$8Z3YILDY33b>XQW@5e*M8ER%&A6Hep-x(fW$GDoD zA>7BRTv#1L;ghMUN?4sNtojP8@(8QG4Yg0X{Ya&r5aLL!M6}DnkCFrFd8_Ns2`X?Z zAL9m>@Yzp~U`nQ8eNyfYx-)2Arv%28s$06s2W%)%Wd#hM<5i%4Sm1bN(U}C5pib^v ztvD0>M`$E?s5oA=x$N3K0+s6@p0uIXqCtNeOx{5IG$4gc2}x8 z*!Dna@~oT8X*}NXG=$at;jN-1<~T1~F5-1YUk~!h%hL+YnAq7kQ5jH$gGei~nC1gM zPl^_g+Y^iHYAz95Fo!I1*1{L2|3}iBxCRbGwmG`W`2M9#=`o70g&)#l+ar+Geu`eM z7Fm&nu>ls@iu&}U5q~!u{C=`<{~@8?8`F=l>-KU}><>)m+Zwode$XTJZ7*{8tdu+! zS>H}{*si*}g1hZKDEuL={>uLX$1z7Qn=Csx~_)z(Z8)eQ*ZY$jp5=W zJV?TOm|kv-J+g~?m7r`NBPeanPx6xdQmVkkK!VxM>r2w(O(H%#1tls;UXqW z7^nm;VhR&3;UXqWs8RwKF@*^exroUU1}lM!n8JiBxQNLThADxIn8Jja+{0uEBb4Co zVG7f(;Tk3zWctJ)yH43z7Q3&JAVJ&25TPn9SQR>($}%pBL-3yxIzNh>;~f_%r*o3S zIj()o9LSk%P$J1Y-?j9*tUL-qI`0g3uEEp{k-O9gWoFpi0f#;Bo`)MB%d{=k;Y8pZ z79@*gshO}&Ghqy+q5Ycr^jda!KGMVm@XQN5MwZ)F>6bF){S=I4E;Ok<%xbdBFJ2Qt zOZ#XhfZKWk5MIypFhu%HD)ugB>BWik!-?Pm+ym5M7vM_IXnU9Inf}EdAo+HCC~cjn zOK|IS3GTzN)5=)K(C&%|i!Mx@OFz0=ln#ACJzaf@EXnEZ(YEs;Al^OdZ!#EJP6 z2V3G5`4fj(;!Gt5^zI=v!V<4>mp$K4A6?^>VHRe@E$%v8K{PkI)D=!i0Y=4rIslL2 ziY9^k1J+%*0eB2oGzmNqu&%-lz?X1ElfYGgbrEg=p2$780uKhPYj6Yb6zSbN8c{f#-bOZ#R2+SqNL9I9y%Lc(e@2;Ku6UV&K_EOGWJIYqzuAN_B z4O7QEUMISh4bD8ed&E*VHx>b)m1e&92V-p|arPo}L%0)289?F$)UVigQvHhVKgXq~ zZ`+nWiCP?r-}xq0q<6FtuSKPGK35NN^X^zb`=#!uNDuw#Wq*-KFDH{S3;>_$o{Ou4 zwEF7wvs-BcZSpDsmRD`2-ki_Jl#spU-$u>DHs?%RK4cm!fM%&QE1q^ekgt$+VY4z?#GgqyjtKGSr7s=$Ee zH1N_&@StwSzv9FcCzI)`FltO=&RzCI+dkYZ(*4=K9T9p=3XVExAzBROch+kBj>`p0qriacoJe4TmnzF;?#t$;~Y6>=x5!0`K4F`J&*{e{p0 zBe48V)t>*17v)khV>2@H7Rq9Fqw>qxBbARu@O%oOUQ70KYRQ`Uo^D+1lWWc%2h?QmcGUL+% z?IWnz+r_l~Rxk?LnrO~;uf&4Fb0oVW9t4pcRF(HT&wiEFnn-q>inMZ|~ z(?)O)xlj%h5)ZPuOpbv645mW8|n0aGvD!CDfY#V znD01Y>wo#q|MHzs4(;t;`u{<`6P#enm4DWG@=uoU%s(u?bLgVI<)7Ys2YDhZ|FEN* 
z%Xd7MBOap@4ANh*H<`S{CQc~h#GaHVS0@W1eya2BBm&~j1A(~Hug?*)9lW?IOUZ{Mrh+EoMCZ~1VTJ=fq#(@5DOsXR=pu^Pa`>)VuhM9)tCsG_ z#8r!WOtac$?U-hj*H!CTRbj4L1Ep6*zFKHk5|88o#Habqnv0k7{?UzNj4g8JWF1WV zs^cubGKmJe9rDapx^}O0Dsu{zj!0*nyOx}Yj@Vrb)j|=j58q#F8+Z|ALM_WHiwGI? z!-c6Zx2!|@YjNS{$hGtWeYJ<{TmNLjXmAV-|Mhb0f~$3mEQKdkq+nHqM`W+_7>tg{ z;;gkV5qr{{wQ|cL)f698E^^i)4CP992#p(MILwB#7PUL}5;|)kz8Ft2N03)^>Zw&+ zsX-&Dn49DqBEuI4{oyh$@)CW+frtZR7m`P&^e$S_aOfnS z8xxo+XM>jqzEiI_-ziCa;b`T9bpJDC06y_Sx7s;6%ezxvVD7w~_;n~cz7ij_xN5Go z3!B5|Mkcdn5sPtjjAt0H8e=;HrXF&1CX*`$YyTU5xQcmjEsD=~CoOac{6lz**1+im zbbv>lemm*aP^jUzRA@s*y+)oj>z!0>14@N9o%#Gs|62ldv<-iEk#VTSwm)X5sBS6N zl5Y(qHQqgvQ+e*3ZP-qTPpHjXxD-U`5MPZ~7Eg^+h@?EKOwT96hwQGV;`?}_>fek9 z-N${;Ixxw>F^ij!?LKu!aMG@c+cxaTUH6{gVal%HVbeAqI=e6ZNBjEc1v-+auquD{ zcK~pf>%Qm*vS1T4iXP(+6ko8H%EEvg*mu<({mFf9hK33EIqlEKGb;j*CjdzgQ0DI5%WR{5beaxnLY1vJA9(5#;#i@ z+Q|-1&7teO%bm!LvArlG-;)C#S3SHPUr>eFHMuCT+J1$Jk&QIT%cj!c-3RuEtbLe4 zd;;E5wS?|r(EVN*Itv5u0AfkT=hgKx=cdy8QqGpkJ}U>dI7T-}x78pwRDeo_lPlV` zRGL6gAscTOH7C=7eOEt7(uOja*e{)Ue-{()G+g?Wim>pK7SE&jp|kBv%_TE3`?3^Z z-i^LF{&7HU+X*BY_tCNtlyN>A;BU= zsa_Ya!JQB@!^*RSg~B>32XeJ73^aDMbkyGUD}_%-S7}`Q+DuhsKav3oU7=d={FPUF5>MxqIjY0Ly78q@aYup z*7Cf(3!Rm#uYKa$i8*KG4x?m!?E}|-y7xgjMd3FO@j|EM#L?uGNUW^wT8f%=q`F$X zF7%k)!7TIbDlCQKDn%3&hZEZucao!rRy+l!TmlTaV=iBumMfpnDsWPK-me%DXlNQj zN+Euby%!qeE*yzND{LkyEZVRC6G@;ugf?g45{|g3QUE*vzNZL#aIiL$rp>Tb(wHGF1ZgwlmcA1dX74dU_*Aw5VOio?>f1^Sxm{85!^;h2o8n383;ZuQa~sbkwX%&WY{HqOA1P1 zc3B0rhJ5IJlSa$2^`0zSuL&(#He`qAD##7pvndVS2^tbh2o)qYCOfHVVA;lw!McVL|R21?ec#xP( zAQR~nkmMd^smL-W?coOkQe*6)&{!b#*5gGBQk!R?j-9nJaHWOuuz(&@Bs zu<4DcU>0F#k}y{0bTn7}jAvj$FRLZ#O})jSKb2`1W+xw=_dUfMn1RtWEhV3C>Zl$*@bmDIma4IUtV@mOpxii zL49JU)Ed)rESv&sSs)YY9IvwrBZ=x5@1N2z%u7;?oc||=Ve`59@~K%sg8^#>d??;=6-HF7!m@D5Dy;7WcjaZR!X0H~=>G7g$p2Dyz9P(BP6c%K(b2}! 
zqz4APPIkX8+B9BhMoC;3EfaS-0MGJ1hB6ewQ2aPN>x3`~WC&nCW3sr4cxX3sC=agv z?wjlYt+TO3$OOoRBA|G&ERw=7i&UVjny_dmdhuc;T>~ZVoC-8jv*Id7e1NUEx3t%T zk&+&ZchZRwa?|awv$Nnv$OYCgUq}^FOTv)~ZLEfrJ2)Y;`dmwQNSbjp8QT;t6O!8k z88JNcj2%xGyUH=s5K`Z@nsmR$Az_VnjXG%>d>}aoI=u76vhzZ;=#1pOX(>Of`v zuYW$ku4lYs+?UDBc`9g+vs2Axf4#Vjp0sfDB){ZhD`?iaHnZ6LYUO0hxlRoA7MGu( zWZ3Fu%9UH~p8eq=YcsVH0eN41+1C2>>s(aN&F9x%+SIZ^LpGd>8`GPIuI6kC%`kwn zWo&i>3G5kBZUq%`BJ*!?tmFF30h|CUBN}d5GX0gkO72prlAB!6joQ_7Hgm&)?wqL9 zECIu9YG|*$>znKz25q)tF@6s2vO>gnJ4mQ4X2v4swITzu(ZgPHcsx;i6g6z1T>G`g zmJJjn#I=qkGVH=%M)$SCA=`!cvio)KweW^$ev%+^`ix|n3w~CVCEFeZrTV=}7vehB zUd=b>+~#mI5rR;Nwb}vkDb~VGc7VYlj6pq%aJdjxcT%y3pl-b@4L^R#y>ML}jLM9SNw;sTO@$qm`VPU z0saUubL09xc=ejJhlLjt#YRj+kR1}nt`iJ3iurNmHmBQ?nGqVX0IyW~lPFSUO|eIk z>1%l350s^1&t>CfXm(_ozwpvHnO_452R0%TSKu*$6;JmpXf|-)pw6-qP8afLY7Doj zvGZByZZM8t8u3p1zNu;k-%#ORzD&~4%5|sGm>x89ebp5EfflZns9D({-n#|lm{XE( zoEa80)2>VfObR>7)WijU;94>CX^S6l0@TxXfU`)yWoX(C6t_<1{Mi6IfL`?38G;VlrrXRSPW&S;FitMjPC zbp-xQVkTtkB>Qls9IKRL8e{LMQh>)&M?bYxp-SWZ`tkd0{XR=o5u-qbDpcsWoa!pb zd8%>_3Tg`u{0~y*>HtLrE;w45N?dhE*S;2S*K`7|yC%mS+jcKu1=vG(9WPUJC#r@M zA{=wz2I#q7rbys6{8>PEL`&i&G%CA^Tlcnqn3dL6z~2NjhBk?$b6Ew2CklM5gh~ z1LzUS7H+HKS15I-khZl07~Uu*fg5f*HITZ)1wso$0~f=%rD4xLF3Dz5c%BvEp#&2K zzVGC6tqjwWEmLu$X?YLNf*QCxJYs2fDUk5vmSC+Nwp^I11enF(j&NaU%9P#>Pj~Tw zvg-nz%v%>$u`Z|u7yGAWrRGvIbw>x^#AI#pmkmAK=Nv%qxn1`AS{laj>=uSz;em)> zDo3D}_l(WKutL8V@dZ;J_|91w+zCn;+&yk(u!u4Vy7+$ULN6hgF4(#tBoDDJIK;$j z-OWXZz$?&tvUqv?_t5LrB2ehXe@)Lm6khMYF60}#BV8Akt#N#AbB&!Yc;A{_Wm0!R}Gg*I#HFRLOdzuM}vj)0a#^aW7~t2(ycl@VM)a& zG>;H0*5+sItg7ZG;-X6GnjLPPz(YuPSnS@SpE@;4Txx!9VAFmD{}qtXIV#ILS=MN* zxON}SJ4#VZ$snAh&Yf;BH(8E)%3zCKd3=|p+hH*m@cCIK9ptjSNp&gjhRu|^c!w;! zU3^@!qp@~FyyGJ}a-M!D{S+_m@q`xDe`SCeZ=$okgQ2NKM}X&WQ%mfzQR(Ftv-zCN zh>Ci)Dq0S(N+mOMeqUrMRqQ5QL*VA^-<_6fIZ#3kT;HY;%mu2*Q(jS@hoT~&50xc! 
ztTDL54YL88zfG}+N2PDtpvcXmGO53(Rd@3P2qrjX-14^O-y*Qr5qZ(&T%N_#mhi3p zQd^-!O3By`kyNvcK^08=Hr*k!N!Ps9aIqIQt1xa-l9@Q=I_8oljV4cV#-%@lKZ;f2MqZ|^}07#_~bg!V9VVCh5gMkdSSj9imfJ#)aPH_mnldSTh zMDlRuQmD4WQ^7d*9>{R)7|`?s8qYmFx|_5`gZPlfN*q)JIW-$LHosf|zIaC&gW0`zlDuTs<0ZR`z?tInEJtpiAfD-`!DTi zyT;DKQ6IXPud8j6)0Dw zGzI0!Q*NNr6wN13+AyUlB+m}vA(1`8rN7d~xDZs2(>qszBJvovx}p58%nU-G=;q8_ zQZq(%sT(8=BZ1H-*@T&dKCy(sByh$dn{WlePpspris&aUftrYZVg&{QC;Umaz$L^# zu>z$e5CA1CFp>Z$R-g*Fi-3|XGm#J|mR5R~unYoFodyLGfU3Usop{H7#4KC<#c#Qm z^M2NssbbPMNAed@IodyUs)*3J^+XpEzxG*=QWrZ`4NEl%28xwbce8~|f#?Q6WPA3@ zZ}4uOBzly=i5A<|(RrEHzgM)AY8ULMOvGI+;)bsx?hsd!dl46$PIZ#ZhJPtDFqUe0 zHIYd{&uK!oXir59REvnVkdQA#*RKEy<4rvVUB9Q$rJ4T!kw`z57__PF73lgc5wA3V z6+`onx8G_H(Dht<5M3Woyr~0GyeX=BFpRI3jW?x`QgCCVYcd>2#vWtLD@NR-kr(mi zCPMS~Kxd-)gOey$s8p4bMDYig8mJ@@{N8}5Lqs4IzU%5fJ;-LP(1VZd@>6x zn4pXCOAaNO5O805GqY%hlyJDRf~P^Xmyn7m?V?om@0V)X3<+boLVPDBktsKJ&5Dg> zgr+Q}pg6Wx3o+|v15V-`4A&Z+#QOJQe8Ka;8e4WG(vNbvZu-qZ_w&-$5X6=f;5W3r z1X=GE?e3?O_uB(x{WE(gY5i{7{?gXJYTJKQ>sf94k8b@Kmw;JFMTFC2Y$u~W>~Fap z1OoUbWb~?%6zwv;?6qXAOD(4j0&9pGEKEOP}^{e~c*R}WRcrD<6O9IAG3P@4c%TS9*l zpz3VGm!WD)P)$&EHlZG>)`*M%4poOOd>g73SoJ{EA^6MCv{j?*eXyp6NwocKs9Htn zYP{d&)|rmaW@&mYA_FunCS8Wvp#s{jwn%|8sy1WkgE&Q_@s4Z7vYmj=DVz33kB3%W zz;1+JiP7=QH-2MTynPBW^}P_H1i-B#WSNqb8Rm?13S`gzAW9%&M;92(Fww(8v2m>q zL=LANOFlH>Y7517=|>?>b_wQGVUt~Mk-HTlP?X&)%9c>+Q#Nr0xnCrZ$Be#TB1g!$ zABDEJC1cMZkSv4X6N5Gyh3^U}{JxOF!Lir|hOyYhDYr(V z$o6FHQC`EaJ9rI}3=U2zSCZne9h_99Byr{3BvJnzqW*m4ME#a%s6>*8p+e$dOLW1@ z0=BLCMc`qUVRvMv1Qfnq&CG~Wla_NJ@b24xWjdWqBF-?>39(53unUJWP6^Fp0CFk`N~$wxy7OC! 
z+Ei6Cz93^7+gRPEhcAm}1BM_ex(aJ4ay2=e?}p4J2V_KANLo{Rqe(NC4FNth8cFI& zAeER(yMxsDk=TKLLTR`dvr&r-1E{Ovq|!+t6iS*wbNTk{Kw903n=<9g;`4ri9Vj#S z82k`90ng4VHX*YIF*)ln-At;g%ZwN#pF>?@G!2Webf!vKi0Wjaar!7`j%j0+|ICwL ziAe{pllf#vu~~!a5(sLl%W9muLHBUi$o|sh+kW1Z$FnRz!39LzswOj#H8QrU!puN* z+Fzm&XSihsT3JagM0h%+9mvl9fJ|rxicepG_qR1(|&)v;yh+--C@nmxM+jJBUR3v$*K#Y*KV;J-$Jw#y32#kEb}AG0L+5 zC2KdvJ5CTUPveu%KzU}MrMxvU^}r0YJ+S(~?(mM0s4BIpwoNVfhf1(2&_Pw0HGCV3D2g2^BtC zi>Mr<#Vs}~11nIGq3GgCJL84hm+dNO2ztSU1(_^rWUT=A&=9oO+Tm?PwhMJUFx`cQ zpl0zPY!?!g14EFFTE{U^vc_O5g|Zfn1&?E8lOafB=^KJHrsBGFG6cDSWZ1&;G6Yrs z7Y#v^9GOL93ENzvR>jm9zUfE*NWFjsa|5gyfoiY-5wtL} z0)0_^WZ28n51KJZRv;Etts5GfhD>RHMe7vFh5E$X8Z*xT^wz-!p!djb2B1gm0gd7| zdkEv-_LR0xkO8Pp2B4D~Tf#u(W&lExnbX0xiHk%oWjNO&6R$}9THEOok`7LrV`=%z zO|>*>(Fd2i)Y2qd9h}x`Y56QbV5>yQQYe^6(wpe1t0HRyB2VJiMZJwcBayt;AOKlH z)(GT8uQd|AKsBEch?Gf6$y$MEc_PA>m0?(V+kvDuTMC`?3_+BU-mLb=D#5n|ISEWj zw8J1vZ|;nD41x+nJJ4H*R>n0H47WP81Hq?%BBjFZipdTXH-dEjf3gF0GBoi8_YL-_ z)ooX4SefKg14TZubZPf&Z zpoUC&e`yzWQlh)SNZN&1X-MCGeCNkt3h!bFVm-R&e~KZfT*euZ{=p`lLl}Y%VLm!c zL(n?LD#s86G0L!$wFCtk#bcsh8DxA@kaUX2pen($mLTZ;(1su?s-i&F5+rI5O+hjd z`KF*zW(uN&Y(%~-NaSwDAoL9l-WX>JI$fexXbMt%XPJUna-faS6x2zd;~jqoSz`*C z0VQE2`MsHf?lMzQ6suSgg>5Kx--9Vgda;C+`nK2E3BA>n=I#&QX7QIr(6nr8V>QoO z8`aLWivu2BpJ}Zk^)4sWYvl{Lc-EAkelWcjO7G$SeP+zxKs~yr8FgRWyapoY7LuE6 zFGIv%m&%lO&_>bunM((Nk>UV{#Z~q4xPPXl1?7r`w zid(1hyQK9ie3rJ>@p)8hHJ?Yf%A=-IS~lXKF&~jqMF`+Uj(Jaz;U46g_b5{q0yH)F zoy{P_D8H#atn&)}9uWP$gdU1~-EzWdF5QNN0UUn94Y*M300 zuPa5utMXl;Pw;oYChpd+vf$S!`F7E+OS492UuPV3U%X>+pwS|N@v_8Lc|W`p$6FnJ zIQu&xP(lLgo&sHlQl_ZLNvk7yP0~#Dws?Ja^4-|-hfvl^>McVem2%ZLfi~l|G26Dr zk}KXQ4(m%vNv@&_MRa`4WFzrj#Y{PzElsodV2fYJZXD5Uf+qYO6B!9DMneIVmmBLuTwJS|64+UaY=)bQGIXSKl zpnlDX&3r?zH%CJ2`ovjo@HBxc;vEB-oy-miuGq`g@)ci@V~dY4NG3~KQPd+n z0VftgGb4WK2KM`m`_Rr0GPX}~Lwt5@olxt>_9i>iB%@$6O)<7V)z~^g)^>(9gu&Pz zudzjgBeGQbhOv%QjwabS%+Z&c{BS#*2g@~8E@iI3~WA04Qc$m zuD78G>8rl>)kOS)XJY#kwOAd`-`LplaN~+?#Y5j{T=5b?Evl|dr139!F23L%>LK7B zcSGUi7GH1+KO54knBKpsAKlpj`6v 
zKvQh3)idvpRE1s5a1SVf4mU9vIxY9Ar4Xx3vY1AbEl&z5wd&eWWa>&VuY#q@7+lzL z3qz`f8!XqUiV(UWlVqiSl4=gBm62rWWv2P4a61oPH93n3g!R@{ALQiZ5G_ zOWluY!^wzd)d{jXiX_>HMRhnaa%o?ovDDC@efb5ZpA-hd1d~OL9Kp&Ou4|wf{7YP+ z2^11Dtbmm2+q#kThcoiek*uqIO7w|2SUQ4CxQn8&VTnii0n;QnLbv!smGmZyK4dDqM{0sMlQ9(lo zjO(8G#IJ;eNUoN7_B5gm@V`6!AAT}biLU;QKqgp%ad4TiMrk_PvS63)D332 ziQF_swfSo$jasE*Qw~&wk%vyHRXP%7<=%MvX1eHVmo#G-Q+(Jw*fY1AbqYpJGr8T& zx-vou+-~L)=+!k`Z)PdwO5u95Y|5yuO3{sIO5uL9u(C1SZ)OQr8TXro3D{rABlHtk)p*WC2Cl_<6rmz z{oPH+k-`Q*0P)Q1fga6Dt$|>ZqGLBLY(G*tlPg}!CD`J$+!7tpYOQF5XxXC-sh07R z=Ll^0G)J?E%-}vg$sThs$(B(~VUpE6fe;ZJ@{w$NNU{@*WY5keSu=&9Er;8WBYV@< zy}>w_i+*y*+0qJ;bC{TyoWsPtSS_Q0%>$ZX=CkLs||< z_FmC)NU=tK0mWL8Ym8#)q0w?cv6eC_Bw3@(fMiwK7$aF)vC0CHRl+q!vD9THB8s)@ z#u&*`3X+xcjr!$L?4$9H=ONN?-%*qr@3L)gn#93$~TLz=6TX*9K)Bro+K0_Ksfb6 z5_&0PuJgDKqKrM4h`o}i-Gnx_sj+2OFKz5sXk+!Y@5VEm>JwBx zZp1Iehk*MUncz1Y+^x-_PNvGf7v%rr)MJ{NJY_-5X=ZPu5;@H*zz;W3Q_Y?=wMxiZ zr@p6^97}-H&E7Vw_yF-xJA1;kGb>;nwcUYdL;b8*MJQ-_E6S>9xphfNi)ye(Q`0_Y zE4rtqogS)b-OOhZSG?oHQqyJ(7kw?Yr5#CG>=WyjUKBMPsIPr1p83w^tj?CmG?3dF zMJ?7sj(c-x6D5v-6n z{TVoniPD!!aFx(7jx-(089hUoZ}tpfo?+8KTA@rK%2pN_KP#i^C2W$%nLmS4($Jo0 zy~=ifNvk0@+80(3SwT&y((@Rzfnv?ngzPuUlr-$4_G@Nn1+`yTLCx%`QlrcUiq+Fn zE|HRk_NJ7PYIXJxLnZCoP(*0$bPMTU0P*Odd-@Y17hK$V8_q9a!9VbBJr(A5d*xV5 zl|wmAg>f_|6~^s_%6uu7#WE0xgLC=Sm}%`O3OwauY{4ceS`6`=1BtLcGw^5v4fV&H z*sN8dMA*eU4wRTBYB^M5Ud>fvggHD3iBRe$PjIe)hoQu@IT%}`#=QzDG5te{Pdp36V)Dvvp}vq$Tny7jNXF4U%XuoLoL+jKL=)U7Xu3}%5QZw%VdyXeJ&8~{%q5{f zSfzxl4g=+e0-=d&z6vuYRAEf*@Kl&9LV3`{PEUmy8LBW=!dGFg2n9lGJ`xBY)Cf!r zb(m09>?IKEsQ^7bFO!n^OG>Eu1fJ8qLEGyDuzV7Ecuwyl`Ms{^2w?iXL_ffu^~ ziim>A`r4fwmb)msSR)OZffc_l46L|bYRkzk4zJf~t+i}!^u(93cOO_;6*oaXi+3{# zu(ED4gDPHO72a~anPY~-W2wMLkyfk4n`dhZfQfj%jHj&N(jFkZwW-)Mud>-bq?Tn$}679^P`Id5T1!hWIVoScaPK(zV z%vo#0`dHu*7qmi0;sRbNwL4q!hQoPFsl&tdTT@L7c(~xc=q<}zdB$353;XLl+X@XH zTDpV9pY2OC*(%Q!TDWbs*QB7m*$cykdzLNSD>1c>6~{YClHFI^wNzhkuHm5>QgU7Q zFV@(aK7;2nhL|@zyyHM3b8(+Uym3?gig!xn>A_9KQbe3d5(%It$gP|gDCc9`TTIUD 
zO*!m7`h_0ZiJE!EQn{1*mNwD(9d`6v@J0ck&B3f*m|1M@zaq&g7J&?RLi9<@s)WZ$ zLgG@${ip~H=_fGZ5?4D2jG(Aqf7#fwCZ*^&ic$kRUnGi=K5K+15rbxRZB%1=D^xgt z=<1S`Hs|Aw7FFoMvPO$4^bINGFP&zI3o-+7db{472DNiLIu(1$!sSGv$Qa-!QY~!DG^U?+K@pZ{h$*CzWvCslIP1-0 zh~X1A3d6?qSsLKRwtLH((n%tw1t$myBj^Trt;2A$hoPd7;gHR1yvfGX3*r_H6$-p! z&UMOE9wMSK$|3>=#1Gn*;zbCNrnC){kV#q0IrpYhY>YZYf}vWKgUDz@WO3k5X*-*u zOvT{Ph=lKqapkSu8H`B4@!W_&3WV9%o??jIJ3HQ@F7eJ%>G8ZXw*wx=XG3Mvh=^iB z8q?-&7ig&sw;z!EYMMCRtyjZ_RAagubHiN$*|SLtN*bk`jDpV344lUs`ZtHj35{(E zX19Wo{5FUz&LJLqH9oNmk+lILp@ZrAHWwp^Pq~z7w#VByek;53i0x?U=|0(fhTS%f zHhtJ`AxWDjQtd>c%xAZ4tWB5ntGHA*ZUuX|h})A%Kk0&mXihQzJIPR|td_~jkU^{? z-0PW3VKV)dW$3O>XamRj2|+aZH*lsjTz9ngZ?1M_o;`<-guKl4E`iN4^DakmtNO{sY}`vL6iXj*H>xewEhpEf3Uc9(C|Xh&^Yy%ZdDsQ%f@rjX$5xi zTshe=T@Re+VX}o79ItvkAby4QV4#cdHG&h}>Om^~@!rgg5CztQ{4`rK=|{8jxo(F_a*w52A?D|^P4I63!_cRNNrIcaAfG-QLje^n z<)gd_;Krn(_F)(Aa&8VPI1Lyoj#`Yf=DwW$%)o$3I`fSQFXx_UOq$Xkaa6(!N;l?t zIS)DpU#&Nzw&|!i>k<%i%ZM8j-OivP>3I9vuUObbmJ_w!@`f$qw+8$GmMlU^6HQ?| zFXvwfDf5qRI-TIwnH9|K3$m|90ensU-7)9MV43NK#r!N50f`uYha*<7QSG>U&o8sw zUDjza_v3Mb2i?(NBCVvgYD#z8r@6&P&QkU1`@83!ZTH7@yphOU)CUoAFB!||Onp50 z>Wcj(_31adze*OUe-SBU5X~z?u8fJ;)cu8zT4P0CV^{o{6L7^bLjkvw^zMxtO~AdA zXZ{s+FWuI2fbZZhdAET2|GR&j7b{BEo?CPG?|Fzfth_hOqg2L3VZYSXUjK{EJ?ES= zE33~v|Dr*a!-srsNKNHuYCbcx=5s@bRaTFgJgG7{q1C}p8(K4nvfLMDf3a)E6veKt zh<&WQPuW5mr~t}Ns{Ikj-u?uu37&meA`fg_9@x}8u(^3)H|2rdo(Hx(4{U7?OmpkS z05)5(T2g&0ZYUnvcff*@k$o#Or6c=RUw>46-_ zO&;=&>{~IiZ(03^N*2cY&QHfi_FbrtMfzBhj(uHaMr9KkvI&#t)cfY8`4KK%-_iNCa0Ul7g{q94#?BY6eazUnz=}I z6%`F&qAIMU{?0h^-a8?YO~@%RGMf;Uur{I+7ptmK**2mQ#j}-C^4dr8qnV^J` zc}iT>J0WVnV2R>$RK$93jd+;du9T8vPRX`2N(h{r9kN6=A!o>HrZtOl zPWxX`Lf#>>UY6W&T(8+Si&JtTUqu=3EiLI95ujsqPR?z5o_*7ymYWOSqE-5-H`r$p5 z6ospy2jsD&h)DQ({z^=aI5?rC=$M0nKuU^^Edc*e0eE=<_=gL?`xby}#XtD3q=amC@UqU;;J&*0H}40F6~aLI%2FNB{8fgV>Jy~hyVZ1CtkhH%MR#}!BK zF@)b|@aR2;@SO&a-eU+q89C>;;^;k^tOIIa0-obal*e5&BNjq>}(0{MSi0RCJ7cuxWNry;3)c#bLnpIHFDtN{F}0`Luqa9?jD%Z=HJi#H(jrdFA*?)25A|Ic-|YY(tYZ95qPiq<9@{RGAefBf@ 
ze^_}ZUN-cM@%AzK%GpO&;>dE3pK#^X?_c5@E#QA-IML^vQXinDBgX9T+c?~}5%%M7 z^Yf;Q#-E$IU}Sx2`~_e7(uECU$B(TanQ9oH?F-~J>GD}KCbYIp$-*X1m^t(E$uq}a zKJD_>gMiHhqdFHIiW6>YdzIpZvJfr;8!VQkpC3E!}>FUh@j+8dhlvoPV^`Er}!V{ALYTTJ@~^O+|U1} z2lw-THCUo@>;G#H?&rV5gZp@15G-lA_`fj#-@d%ib0CYubqs{8SHqj|cbL zeX9U`(#N8DKJC@Br~v$T1>jEKmYGN zxE{m$-}c~s{!?qBdh{6PPkL}a|Mec+um5fj?&p8OgX=M@|Jb2XyMF$`1>om;aDUup zdvL#=9~Xdctc}|B=gaOdL~x&OKYw<9_|yXMpB8{0R~Oae)8U}T2=3$a?eikIKhAe} zaDN=0_u&3G7p08rtwZ{?dc{^^a%F&;KnC?&trh2lw-L zdT<}l`maUx`|$2b`Qg8q9KrqZ@3<_2%SI8t$6tAHzy4PXz}HQQ>ha&>v6ttEpIHDt zr2zcZE24V*_xSVH2=3!Kban(E#$(v8u^!wX=SMuapMQ@B_w!$VRsQ;~_uzj1|M1{G zo;wS`|LMC?{eJ$RT^qsu_jtlzNAS<_820ya5AN5qqyT*F^-(>3{;%ASAHLwn5!}b; z4<6jFXW8Gn{1W><%Ks40tseY%5B{sa%b$Oh2lw;;#ghE_hkI~8{|&e1&wr~2_w#@3 zpI!b^zWjdOyevO_lLtS8EMb2)+~e}+jzjH#MsUB~o|O^YAGiMZMsPp>f(P@%|IUN^ z^^`pl<@ezg9^9}0ch9^08V6r)+)@DE=Y{h}R)5r6l zwnuQkU%xE?pS~l1{!@2FaKHY`0(dSxZ1dp$I{4Aut{&z0;Wv12e;xl<5ANr$cq^*s z3?4(e8WO;B$6y%J0L!;K5I)#&G;se#qs|9nb3W z2=3$gay)|j{p#`He*Q`Q^XH%A!TtP`Yh3=^{{Bq?_&*ka@AlySeE+kdQT;x=wgCK_ z1>i4uaGyRu@tOSfpHl!nu>kyP5AN6h?;hO8e{%u&t^)8u^3C8+n~efG;lq-&X)W?Wm|-U%vYO(GlFYlicRPeL3g!KKb*H_27Q~ z-*|Ap{--^-pTE2;s^5Q)e;2@W`NpS$Evwx5*i`Q7$<5z49@X#5GrtJnx%J#r0KUEe z{PhCxn=2xGd_HiS2lwaojsoz9`$hHm`DcG5>X&aXy2FF}I+2lwm$cx8S(&n^Ie&4Zs#(U6~1eIkE7<2-ncm;Wjc?&I@25AN6hvIqC`_dhvm z*RTI95ANr0E&%_I2lvNqsR#G#d8`1uTtvWMG5`Jf)!@PX`Esq85r4Vs=KoRW^T2Dr z7{PtIon8RGxBz@j0eDXV_?dP2+x>lg1o!FxxN{=7Z+|$$gZupU_a5BOzr}<5`M;3J zU;hLT?&trJ2lx3&WrM3n@5_HLOTHAr{r7lZGJ^a0zj>a6>plAXyiY31@7Hr`0r*)3 z;7tYK-zWfY_25397dJ)t`0#Vj&kx^vfrG1GpXGlz-#&ucumKb}WTit_vL?-hVAoD${t>Djki z`tT23onmI*eAEmi0bk2ylhqk_wDBy5ALsPpKOit`|I!5Jh;C;{>X#ZP&6FR zJ3Y8x|E2=)Jq6&OoE_og(_yOz_xsy(mCK(?KPP=FKYV=w_){L-r>hTN9o6H*XMa0C zyz3eV7rmXq|8RUh{+$T!CtmsuX%uV7*F3mi|7myT&p*AuM!2|UoQY(Tmb(3 zC-c|e=TrIN)dk>}7Jyf}?wWpgRPd~?VIKTw51#bkeLQ#|c3C}T9z0ma4StLVZ};js z)`Ktd;2-kf!SuCy$~}0em;b{ae1iw?>%jx^v3lYjTz$uMC-*Tw=)B;8iO3}k znf2`}TPI8`8ZxVOru&`}KDAE1y0vHsuCx=1hD@9_t7wP-Lyp9g_V7#e_Feq|<&FTm 
zs4aP;9%frPzOwTAJ-m{kf9fBh)x&HtpI`6c;p$3;p0tNoChT9f6pp784zH+Oafcj1 zcJoj?ysAS##UV4Ni-3o;%xWEaM$r%)MdfN*G-TRz9K<LuhwESSos?I|A z=fZtnL$RW-6(~PiiTu4^`Ja#{U;E7}%-{Q!*Rr0ke8|V&uY56u^Oc{g#`$}{@*@hA zU+9XzZ~3bWlwWd~MrO zUHhXy;tHremCbGc-wL$9WP&R{&=t>q=C=xEmE$MoD{u@;TXx`c8EC=fcDCA-o*MC0K6milF?WoCz3?Us%2m<@3G&0g9W! z-|(P8jlbmPuzdLb^R=yVw<|OXf8zonQ`=7}lUrUgVlM9mR8%z0m7nhmsep6)FY?S+ Pe)P?*{P8)7x#j;aU3oCZ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_forward.o b/third_party/libxsmm/obj/intel64/libxsmm_dnn_rnncell_forward.o new file mode 100644 index 0000000000000000000000000000000000000000..6710f0aadbb9e922cb1f2b3f26c4041e098776ec GIT binary patch literal 177992 zcmdSC4V+b1eeXRRCUsPL_o&!%TWqHB&3n+Ads8nKY|*BQ_6+Zk7X~sALSC381{g@f1Sa7n$;|Wp z{ny@S&I}L}wfDIXnmK!~z24XVef|Gy-P{shKEd;9(tlpfAJ>e&rKvSF!MX{Rmu7ov zu9;f%n>Ba;%dK9`rpl-P{ZzXSA8hB<>~2{)zob3%=DA~_Y5Ft zx?%wB|q(UjE3rg9#u=KIILUcG+`a*M1{6uXSg_-HOXicW!?JP{`+v;%;b!Ifou=KSR zyyi+D4ohDM!f18oE?(!wQ@_g(nZMG_Pr+aNbS_$-8RR)Ey|0dnbIPi3xoA(ml&9J2 z>q39+lcY9GRKb6rqS~XynYu8Z`L|@`r886JH(-YDu;0a*h9IU}DiGb2sn16PK{Swy zPK9)~mn=6V{|n>JOf#L&MGGYUKMV&CE8_a4vc<$12XJx2v+JhO<#Q7o84E8!`)7OSQ1-)XO=4 zEve;}sHY{`M6UOC?TOpGsK;-)KNs!u*Ph8WZOi%dc6qzp!@+#%->;{5j*-?;aW1+h zv#8P&&0fV-bP{=-I;0RJ*3m?EYnSyx`3Wf3HejMEtgqHDv+|$#uSuRTlQe$>P$Q7 z0V}^<-?c=!rUjWr{_=mLI{#vQrPu~r=UnO9Oe4z_ly0|fGTF(rGvWpwnfp!ykF39T zGym-i?g;#~+k@iKTy$0)pB2Qj9dH5&)AQb7E;n#}YoH%&`EdL)cC zUFToi$%}^lSQs4+qNA6Vvm<*8-b=ZOvok7Gmup&Y&E5P+j%Cx*Eu>Asde;kn=S!6B z+OzC40o7{Sf)d?tx%6Q{!F08MH;F=@=`GQt!Niq@ISlV>h3MXV^k7x{N5kk)7>!(7 z9(ka}dp?|a6AL!++Dv`YSgz^l%?Cn|VLCl%44LKT1jimYhoTA!K4I_67v3#2V~K$>8RQVYF+h6G2{yfEZ92xO1X6*j*&y0+Wv|PGJU_LzrE@s(yD}$f7!3NzD?PZbfG{Gb4%&t zp@3?700#9lQTftewcF2f;a9%&Pu=`L^aWGd(w_(U;>T-6T*@Q|aqW-!sR2u066GD_ zIV@dOmy$ag$c3f3bwC~1!1_S6g*YsYsA!?|14lk@H#mUE39UsmLrToCXRnM26&w`|QtkG1$~d$UbDTl{%DTfB`t zWqC?TCkxPKJpfNMiCrW9@<&etoyFQrK01&u-j~qLscz`!&`o$(?E=zDMVg!2?87MC?B>0iZffpAXq3^-dO^<=q=t4< 
z^^YNl8w4e|fykz)t;l9C-(Y_YO*duQ{N>*zbrile@uC?C-8?S3wSxR9mCQx^ZRWQa z`8@Sjb|HV2BXxo;5{hdTgcGu6ExFkXG%CB)q?S^ z9m|%9Tp7IiYj1*V?h!abHXU#XbIbHC2icmRjrJPt6y4Oc{^qIOsY3}u+WEy6n^yPMbBvCsr~lMe3FCGWp(j2@1)8* zGj~~s|9;R)LHT)&j`(Z0O16>$WV1jkqQ-^NH-!Bf{@r}CYKy&Ixd6)v?kGe@MU}vk zFBjs)Z>zdf@kd4&GzRiGZm zlfF+oXbn!#hv^OK+g+p;qTST9Q}m^o^iAr=^;90ls|{43P7Gwd5fw;NLVDL5-p-3B z$Er79ghnio29?=&6yeJ-ImWa?!@D-_nzV>I%2A(H3Yt z>)1aMZKMW&?M7Gb zh}WA`;IG{w*`%F92$?xeQM4U=igvkjs69urP4~f$U(R}4+|zxx4A;~dyIPbl{ZOml z?62IKz2%IhI+@;I+4I}?H{Z5C5kelbdDG-*JT37zM-TzCx11f}&Hu`uWf|k$R9SQE zb;yO;jw8%~bb`}edp`dhv?h5xr13S=UiOw3)z;0wddmh|b-gV6-RzI+RYh&KNclC@iAuii>78Ekz$)-Px+psXl}>Q{VeH+H78sA zH_iTm-fQpDbM5y|zR$0p+R=&oL9uHdbK;WHF zfjNJr#K~58^4CvZPKj=%Pg+6YIZ5F;-&SG2eo9qY8kqW5q-E|>TbZw`-PzXeY!#iE z6rK5PWKyTGxjH1(TjP|fwZ>;!<1?-Cj5VHFp-xS;Dsr^*?f3(pQf39kJ!x?b+=?dy zw+dPv^KwmdGJaPLLQg8uKt0=8 zL=#=Ru0~#wqTf$S4%id4=K$RmdT&C6^5bEk?>zX@PL2>fLqT0*NFjZ^ZB?Q8fJ7#v z_fpiG^F0IL$}}aDO_j-`s!b2xbXhC_H}~A z+8+SG^)uHBjA#Dy$t!@-|EcH9_mn>M?*N83O-<2ZmILX%2|A1csc%%_($^D^Dhrnk z9Y%2}Ej$LKzEOosDlA-5;W;40J=F*?1&Fp9#zB>}o?w(J5JstjnE((wHdE8JCpC&R-0`)>36+J zXaV4P*fSP2Z7`_!yFQ!XL_13^LlDvgnxC*>eWp)HKW}MtTFNQI{)~?0or9hY5Ereo_FyJ7GY+X>NGhJp)ojhP*}B2dE#q- z48Iwe5N(&#c)b7@d8(gJvMJ``Nw3fMJcTZNEm~CcQUGTHYhZh!tdSVSXe-g(etUXEkEO7Ng4IRX-TG8F!{Qi-}<5i z&{@=xjVG%E7f~}Us+FT&7){OuJ;}| z<)Rn))b<=BTb(&=9XJVh&2~J=c%HmSRY>w;cZIj5LMSILTsj>YS1LH1F&ycJwyg9yF*DJvO=0?6(v=xr5!Wc8UTJ&}|=fp~4ru zof=L92?g(-(A%L&uGeSARl~0~EC%VlEFCK>ePyQ2ephF{Xx|ZX_@!ea;QGfX2Km9V zDY4U!4^1I05tkT-5(~w91E_YqCNqU~5p?xZqT{%y=4))yoQ;Xr%hvkh%#>06mRdw( zsZ^qrMyB-;>D541(E;@Z5ojUFBK{3waZgY@B4q`;J+ez1qOvU3^C94jw4HpRx8LbK zJxVd1h<*uKWQzUAtAK{CpZd!+2|*DP3MLjcnhHtTIF~`;1~e!^s*pII^uvzR6Z~6e ze;8{`&R^o>LA1CN{+p4pC=Lvx++XM4_}qj-bn-gC6+@1af%iY89=^9eeqD>-cBY0R zGc!tOeO!*VjeNebuR+SrI`WbuvAeLMNrHR0F6*~$%Xe&Q$aidP;AMC#6%D%va(?5q zBh<6Mj&@of)t{&M+Bj`H8E<@`f#;#8RY8xQ9@3x3^}3rfz4rCq>-?b}{@YF2KC9q} zik{TR;bS}wAJCUa_|rH|b@y%8pM%QuXq_GpwfHkW(r@1%RqAsp+V`0L?9}_v1pPfJ 
zGSIh6f2@{ADK&SHKgE9jd~~qT^dj^XAw*h{2tZ5I<9=s9Gb1~pCWJK;R)H_-6&A}mZ+{Nlyn`xG zzS-H>8I&M#-u+gD%7S?IKhKTUTXA7PFC8#D+3`!u(nXQCp#YY++F_YC4YDJ9G&|j- zw{-3BJOAL10dJ(Ay$02t4xfr!gh~=`;t!1k(KZuSn(c2uf66p{3oX>M*@S4~-}<); znBsWqdjV$l=fVhMJNqrGZBF18#$+y!Ctu9UEpamiTiU~0+ri4xP}Z#xQ1Grx&cZg{ zb;*N}g9L#6*0<5+8o*%jgDTilb0Q~jZ>E!mOqzcI$*Bp+T2o;K3vSBH=UeMxQPKI7 zpoZ38Recm?15g$2bGU>F411$8Zwflf4T12*AKpa5ut2|Y+F%7$4$;%rXRos~8tyY( z7y@Cp@?q$({v4;=&^Dg>4ynl(c)f8S6%GyQ>nXhsHR$uxeC`{7huk16Xg$j7@KH6| z%b&g%NE7DxH@+b3O_1jBb$;KVRWX65=TyT}{*2Elsd3u#B%PwZMg1zJ4A1*B+6F_c z16{vmDB_q>OSB(0xh>oDu;00x2}zeNEV|XmVIqa8V^_H>p8Q4x*Z0Mfy{vcIz!I7Y zKFkliL4fjJ;fZ!67Y6a<6?3DN_ZnubbeOTxYkFYWjbU+T&O4CvwmFe*55lNrq@?O4 z)m0eRQUNu-9n`8|M$@Kcf8Y=!#R@fR=qa@md1_%rtDOCn6g33NH7xlkY9v^}z-EbZ ziUAqEWJewlYOG)uM8?GggNaTANx{%i=G2u5C-@t3gu<1TW;8?Mcb27WeA9SNFgKp` z<&F_hQf+6G1h6 z;(qcaEJ0QSV+nfD2$BT%P=WJ}`Hmj=!Y06Scq6a_$O+C*SKwTb+TSvuUo)OFpd)-c+wmf-;blr!xIrNr zC`2c~g2f3-08||tFd3?y%EY|N{2F5b^zYNtsQNITIhce3P#gbn#$n->9E;XlJ+$o} zt}}RtFKmP_^o-*R=aYX-+)v@%Z`~qzmgYUm6-@kjK7n@W6MQt_zaUe9lBXa(mMI_) zJ*$dv(Q(MLcO@zR11{`haAhF(UnTI6x|r7(*DVBB(yNLAL27Jd1uF z6~QVfPoT?R`sJwzg5Fo;2FK?(Zt#d88O_O5Zl@xwd)3?^7o5utjQY3u$462fL2N-| z1Ui;DUna2wr3g}2o3O8?M-h~^kYsJ!X7c)zDI5bg#u;+_jKClg%Ab(>>qOScviPjw zn(9dL$asSiDr&+gJ8+DEaw={3!FF7ge(>7)!IM-rB<*-;6Hh1z=ivt@4bTn7&(RQC z{9y@Y!)I0c|08~2lLbG}?=OKLs2cHu17G?%H3h2xe(+1v6bw0j7C#sS3#v5*^}_hU z$518BX7u$5;+v)^bd&j2X$m`~hx$v_WZJ1tih}98vnWm~210?xDU3$v(G*PLfg|h{ zKbd4kqw(B;uiL38Q5EW_m8^;95&R%Ya1RdNk5=gkiE1iY2SroD2f#b34{uT5M)|=J zzE+7p#|5y)5Jak-KD=33s-&OSq9s7uw;G;^Aq<+5@Ho_e3;%6{AzbDdf?l^e4S`%z zhr_BUlB$Z2h1Nz`Y_z`0gyn5!L|K z6^;Nuy<4+|U1*%iNIuCp3FPD1cg3F>>3A7g=cDgS*n!ADFdRll;jG2W@EJ9l?19z{*E_8mmsM;(vRxWy4sB#SX z{Q7>LObIu7Z?>UVVHN^4PBJXnh1wh%m|ZAE17{Zk4L&;G~caC1fYY;>fYrch+FkcOb{z=tFH-1M{D34PvIQ6=e|+@Lq&vy77_B{ zHbsO4#Vd#ik!#ym+NoJ?&bT%Qxa}oDAKE3Et{lXKsBPqA+2z8xwu?7;Ic%0D($BNv zLlpHvjEJ_2#5@&Qk}I_~i=Jq{ToedCflKAmyeEkONu8gw>;mH0tok&ML9{s2kUB$; zjs2iH*&DS>C7n$h>Fj4FVG$(C8x^Ve%1wqm5c9*Ft}(Ns 
ziwPO$+uXqXO5pBRfn>i`z7`hXRJPJG9BWyswk$`(LeAsdT!{(6z4?Doz5F7;GPnJ5 zZ6zMWy2ug;K%8ImFiV1O9U(a5{F;-fkn{L86A&W4LA=wUythj(*cbF`mTQUtzNM?Y z>?Z9L3EqHG1%%+)ym+iF`Zle5}@iNZ6dyO@E z6MoIKAeVsi_%%O8z0R-M?Vrkw?fjaFm5|Sgz&RgJjq=e;X742e#f73+97>CK%3?q) z5QTwhrn5~C-ZajynM{bA288GFYu1b;L`$-1;_hd;<#=3A8bqs|j#vzc;3k!z@M4r- zbK)=$FFEmNHbGjb?xI7I5r^KUB%UZ8 zFL5u#-L!Z^3Vf*_JKnyV9vXO~)xAFPWBzmen6D1ArFr-W(_iBCRzWHhE%j0s02fOC)J(S( zP-2_kJQqsKP5dl<94F0m=MEM%-7SFgzK%6_|(R)b+Opve4D`p`?SWk{=;aC z#YK>_o052CusAN!ux9RMT}*s z!iFBhLr~rv5>pG&UM%_#sE=l$D{i8_lZ7H-Th0c->&8lyN6#UsBe(_M5EKOJ5+#M~ zB5{jSb|KH5)PGnWk`2Vn7%IROX4BD>?>Oq^7)|O#Tw3gWh#Qh{#BAx0)16wibr)>` z{nK;r!2b>B(AzVMS&EiuyBvt`BSRaV0^#I#oH`K0l1R}9H6_osRngW{BiZN#p^1U# z=e7$&Dd4XiXnG#@v)3DFdd{Es9QCg>uMdUm0QJ;RDIGmm@%t@jn+Eaw-JbKxO@sbC zw&K@5%rs45x?%1_J?_U(I2RqP6xiO<^biHk`D?Z>mUAz%85AWm@FY_lb= zK<5Sg|2Qw;7tIU!*57Vsh;GSmdB&@)sk!B4_o0ptuo)R<=_<~@=6*cZ`FCdG{OkOO zpYdEs3_icagZ9In<^FqEf{1EKY8d1G`xDY}|HWaM2WG{7_KQ8}Ne;-80QxNkM{n}U zZi1Hk@jvjLY|j06y@9bB>Zd*<*(QGick28qIXF%iRc)YDfEN}c*5b?gSER3*WW){0 zNbMxgOB||zfL>k!ktte!J-6Vuvix;qCz6ubFzPWP!sH6^a7|Feq;u(2O%bMGe1qlB zBEZIrm1o5n0^#+})feZ7LVWFeBqIEA%cakV#RxrDbUouQ|2hw)+DvpbUwl6CV#EvG z?a0>Ucct>< zLQ8(!a{VgDLFa*+xICs#xU(()9Lqn)oCRyG0lXw@zuk17-?__;T7G|ZpnvK)JZYso z94D`kYbBn;e3ip)#mUedb{pJ=c3LE3MP^zN{Cz1hJ1sIh^-F9VTV#&)uGV^2tI^1> z*EwN2*WCJ!o^h_ZldZ^9D>79@GD(q)hSPN^OfB27#cOGL*6;d=b32?Ox5Fv;;Wnas zz+ZtU0bLH*=x6lMpynoIZa?lG1zCemMQfpNCDvaQ712Ntb_(zcj5y$QuJZ+AYXDop zsz6x5x)24$eLGp*9?G;9MztBh_gg&H2*!Y2gg6+q0qz9Kw&3DcM5eyeu|aw%+HGF! 
ziE0H|+6tJ?$;gXbPFK^B_AH!@UoHdyrrM+I5T1C@5}l2&uV^1;|OxOi)sXOxZIMu^l$BD_V@I1(|2od zT%7~F?$n~omeUGR4bCPA{i?(Gc7Xn zYuToKe&{FBM zQ^`JocL2OPdq&x3TK1XBK0C=io9yxI}YPw>P`T-Jf8Fq zY96c7yDoXelaG0~dIPh$jb14DNJLyYaQeVYc2cI}&`YW(lL?WJQjK^!IWCWD8zjrL zQ!pDnjQQwYm-Kr4v^|YSEv$q#B`fBl0TLG!_}0!nPsJg#fS67`@{Io#3dDOh>$)8Wwbc@ck|5B5U+rr?DIWyJS1B=+8%QopE* z8DEcdbLWwo_slu9A!#y@HK23(?>ckrt(wCk^j!o~tUkVMWR#sVOu>Z_$>w}KX@hyD zW@pNnx`Y`IOAyiM2pi7=sP&n4GGp(wO=q*a3KSB%YwP1e8@X<_I?E4FZDoT>CRq7^ zvti|S-l>b|40V7^?g17fpibW0Ltdx=j#1ykiI=M0Ptpp$ zs_Y?Dd8x{2w8~3$D`nhQJo8AGrz$p8^HLSy51W4}uBtq0p8_vc?4_zXaaA4XrTTyO zt36zil;Kh3HD9F1lCG^|SU}uBwaVrMi#}huwbHTcCA-c8#s$90EU> z&8Gc+=O*S3uoUv7TC!$(q5R&E3*~}}Gt%J}JW)|!p2(XgYTmndUUcVX!+s7(XaMat z#9#F0CC;gewGpMud1sqmTz0M7dB%cze*aW*Iseu?wWR*77+YtC#nb2UNi~1em((=o z{q;EAy(Em0xw4Tl;EAfJ%Hxu98zo0ztzVgV+%3oic-%1w-SiDl_|C)sNi+eG^-{6-)g!4MJjfMJjgFw5|f>ah;j;T6>q7GnQW8{;px4W^(h9zW-r0OSd1dOT8qdcuW1rANE~=o%D|xAoD@&i2z2g;JQxVHggHD|( zI?2%A%-9@q$Xy#`kPr85^h=C6^+|0MhDfHRK)WVm;_Vh5pn!z!8AtWyYZMk2B>vuH z%0c!GJxq%mL-uh){f0x1=HD@ZUa}$6uDO8r2hs6SG7kq>nm8UAytDrd%xT%(&Y5HB zDSHrbTtqcS_7e|OrMayU4M>1Y?a+PxI+Uy6irkL$9eXbo;&!yDA?Vm<^#6a0KPnxy zAccno*>YbU9sn+j{YK)Ay`;`{u!q;;kBY?eX~!w75lJG>j-DV*Pqu~gU|9>6zxgJaYCEsd=jSZw6AdGg()X|n5*_|)}LAkPt8GHwh zV$9$%18)~IZD8IKFEeIvH#r?MI0&>mgP=W>nI6Xs4!{gnXU*^WlW?WDHi=LWM}Qkh z>WU}7&e8lrbcFSXN(&bZ<|{k_WB|Qno|@vEFscZS>T>R>srQkH zcgJH#*eZ-M30r7DQ7UlGrFOQA@p#|GmJ$OMP;-Vg#(pA(P}jmZ^yPXsPZp-%z}1X!B4H~_n% zd#ZyJ9&|wpRyqk%u)-Fi;0n_t7>2mO|K+#9VR|P0R`&Z+$3Njh$sDI zu40R5TZF$EUjo+D~hI)~r+hYctvs2c1>bIy*^ z>y_U1rIXq!5g4PDpeaw6$|dzJ$L@1ICLs^KkeYKIw1pFNqy_n%zx~@xE|~}JKjA{C zWUtKLG9W`MM>jZ2>laX+QLvgRK~psYD|x?eg<SX&f7}A%gsEbAWq$J< zW8trriUTX_)rZt9WDjNPM`#OI`7!48*FJ|k3<*Y?1|XbO=dfq;EGt% z<(v!-gO@9fHrAssBGj8SkV3v7tq4&=FH>ene2F)Z$hnViRwB+F{D=Id zECg8~uI-}ysRp$oC*8i6s044R)U#GH8&%*m<@ddaZ1yF|qK(s3yvnD_5^At#FX=gWB}37rIpcdRqA|Z$~-kr zBQoX2hisU*0rS2?RSeU7pT5Y8NE1YweRxqb;m`PDFDb2xaTz|#KyPqWog(kSI;fR> zRW}-^$(yb9F+QYYu=R5tA_9gtIo;FoWZ|CX;XXBIGa#}!yp7V<_hBul^Vm_fjnho6 
z#K)O60y6l7A5Xu=r?r(Etfoj6^(i>5PYUu26X@^tR*@X6TetISp_Q$VSa6u&EflR? zfORdz=5It6FILE0)xoa#--TR|6-4YijgW@L1IQr1&fX&=wJ-0TofoZqL?jhw ztzTgWh!6AP$zQkBw=4C7Sf2MS`*?wJ-r0_g6JBje5n-&j8_`4|oP>la2C^?eFXiY2 z#C212Q^_V3rBmWrMK{L@(z_){6O(MDvun#)k;t30wSFyI>nm+@CNnBrfws19i4X?H#ilzZg_HQj*5D;hU4HTs=wkR^7n+Y1!!I3sC8x=(d9b zr{-QwpEy=Z^+kiwK^MSi4`sT-0~RqyNO`WJq6mBcGD=O`4>DcY$9 zj32v^OgLIfGFO`L1ssWLqNXpR8qD@0T`Sh+!c6*S~JMB|q z+_B`c$Y|@cj)11xvcGs^suuvhZKqReifFV0`ke+p-?&!_gVVqLPO~fl(gi4J03|gl z3N3i2)`>=jg~Ff{ICa$UCzalXCVSNji&`6EjI9U26^l@7m6DOzh$OU#V&If53NZv; z52@UcaBN7nqzWZ5bOT#OeHy3XaO$XtlY#mk;QfTC>M)=Co~#Ur0==YiG(umWqqj*I zHZNC+r2d%>68Kbo`0mNj)J;sXqp4}|2SXS^0EDcTo1vDc>6-}X@IF0=nKVvQFCi9s ze3o9f?&nkMcKxyV3yfhTimf<;^$cI!@Z^#^w8agZwENX_V@fW_V!u^crl3WNyO31{ zhAYsU#>#Mo8)O}@=v4@kt%CX5Qc)WGO?G&W-?xd}A1PBBOMpM4Sp-#Hx%84~a*Hc8 zhST~8?e#V=C(UYkwk3m?mY^&A?lb`w zIpsoVjSu$SLuN(hV%cLoh@|yZB!NXbh($xg<+O_#bBzo?N!xu7(>tVM=AQ@I_Xsd~ ze>HKH^d;)b=CH`C_BJ}t9G8p*qNQ>wmI-k3?S0AQGxQ_nR&_&ktTg^JUN1VT5AVJ0pS(GAsAi($8To1x#S|5Ne>~@}vuw2aw|?>0M7E|P^eTwFiflOr4?k0cuO-g( z$99R)F44^SHowU^2u8G;Nzrmau|%8*6?BwMKdI9o+t*NzLUh2gB!sFEJ!EvuKFuHo zYw&BoqRVDVGJWvNEg{LW!VrCN97ViDs-XtPBI+TW1TtQZ#`X+4vmC4RQ*4Kj)BF{F^Ht1@J+CS zn_Q5Nk$W5+gMv{Q!exX?RE<|{szA{a5=P|CC|NV|HA>cu^t$PQz!@DgiU#XY#qJz{ zO3I zVX>Fp1f)W9K*@3bkars8IMK3e{}ai$idhQ6Bs-^dv>pGxl|(W;;3Pw9m#pWP4F_t- z?&~}U(W_6Po+@@#n{wKpkXDMNRCFsPN(J1~OsulM!B`q43Wgv$0CoJ1?Jw+mQ%b5% zjMYf5F@Jb=;W(6oMsSsPA{K;)XaXi3^|KyLTY&|r(qw_;KvCEWG&qw|I=?VufX`Hg zQfg&*WPRPy7_I%;j>j8BxBRxFMuYBw28HAhsY7AKg0zd=ZM`PJxmL$2rL@%7;CG&I zG_3W6G}C;1H8K^40MaE&51Y=Wvs=b#r>WL4Mmq(9Uw|i{W_&|pjt=?}GIZJnX@mMB zj*oU~2$3qLsA0pV5lJ{RwJ3>fmY^We9;cgWR>Vc8c=2MIS7venXlRY8RP*8mF&jWgSF(s>)S8cFuR)|s`vG9B%yo&{ndi_~ zcd3QGYab~29z8pn!_-lISLa^oyJVR0-yvTOw32AZIXZ)nj=WRkt&uhwOmyBV<2&bW zT9Dg^RZU4b)}D7Hu9MM#8!Ngu)hE>j6P-^)Wkh9nIPd(VQ=R7fBA*$z6*z9FIkrnux!!MrMazXia9ZU6d%D3NnW6CrV`8G9UIN zM>iNlE z_NtE+escX-eiGQ%S2lSXhc|P>eU(KNFFF01`H4Y=4tFr7__=<1CUAj&$Tnlv3_AoM zIqByZuwlo^*^yT_3`&X>z{})G+@mvjBF~_557&>;wMt$V$FydG;e@ 
zugJ6S`H{?rPM!^mk5=p7cLA6XXOmx>?XT(INU!%`DEz7j;Ucowmx>PV^zXO4?t=75 zY^0)p>*ajp$u?XZ%YnA6G3k~ib9?NsNfV#<|q#6(n9!kY98^=17G; z35eO5fwue-_hZHgy^cVw6`7L$BV4$Eyn8NvvQ3x@d16x7pRjGEpL0)!VO~hwb>s;J z;2d#R(l5lEvWd9+i<2jH!ZwiDkx6-s$kWUGm+Z&@Dv6BFu#>$Rx-eec{dvgKLmJC- z64Wn9o~}NhxO?@#{-We*GBu2qcVDf#ec%Tw`Krm&|028NhuZFTyCm*p)-c+U2yKP9 zRvQ&gMjN<%uLxQ*qVb001XmYfM*fP=2vVM^N@YmOCITB(eI*g~96cL&=0HLpP2iXy zksBdUly01CiNtv&<^?D6{myq(s$C>P0_+OPrGl#*n&%I!97@L}?E9uW?;2W=Q=S#D;g4)#k2Af5<=M{Rp2c{5Le(9b|%L5A@P98$mvgOh; zj@eTvQ!}veF?%P=2lt!LfVSWD@6tU3>o!=5hOU34EL&$OWqY@jzXY1_G|=?wA9-3> zC{oJyj(ZSQ-~4Z5DpJxl|C08jO&JG*e;-qivRwRK`dlV{bl@+J=!v@D^knkz+s`Bq zcRZIo96XggeEy};l?1>Qau7z(aB_7ax_@lk>q_%y7xYPAVM=DHD^f(X`n)&lJ@57M z*sRAumn#tnt67FGuHcuEYSQg1U|1(SVL=2X93L>qk01Z&SLj0V9wM&7m}na;Wg)hs z`XKI1oY<#kfN15dpcA}Q=%k>oK=(3DQN7WT1Z;&k3;9e`&z;sAMIJX-d#e*(lZ`g& z5M`%TVHVrSzJVNG7(la{;^*7e%RYu?+e?at1CzDlq97BhhNnt*GaI+Dhl<>HT; zx5#!)hqgZN&_g`w-<E0-|6sklANfo;!O)$$U%*tW+7~_TG1vnmAjTROhfhVilT;qK*o@SpsukmBV`2 zbjt5~bG5@d6KS9$m&L725%l4DmeQj2?N{Np#j>2$v);ZCpG3PwA7JuJ)taC!`ti;Gn*-@{> z)M@<(snfa~g%*X;6V3w* zPd%Yz1)jxwOZ@o}+_4(n**M7tlzgAcloS4N0UJJ)-^_LlvkkTcW3_zOlXGy8c$a1x zfxX+^Z5iNRJH>#N=O*Cv^j=nMiR%IT_V$Hybx^PI*cC1)B`i&~lm7H9#7Sz9rV_QH--X|AyWlK@xZ9GfAf7d0aJergvCJmJdw=G2SYa7_H~}WF0SJ+h5*wm`wmht zIek2@qm_RXG_iPDNdGJ*!O*1QxxU5k{1;|T)=NxJ>l3${-o$17@_0^ivhuzvm&3%D zT{Fs2{&=i^JKe)IkM+IF@K~3>VztM5b@__R;+V48rY)(*x_1WbD4sz%+{%9Om0TXm zp0PP5x^tb)-A10fV8jzfeT5*S^4)EWNYv5!tk2HKs7VBK6W%G#iwJ*1d<^RufZz4a zGfwzm+Z(crQLh=yah{2bi5ZWtl53*ao)>0jam(m*zw>W+6KXaO40EHSlP|?t`$?yg z`3c$*k+jm>5);`cg-2OT#g&lvR+#vd;;Q1X77g4$hpt;8N2k;Ys1Wss(c=j}%tgaV zq4n;x$Y?l~wUTyH#_0{pwJ{<*E0-LWp$@IE&Qm7LEE-#&;Bo3tukMHtidZ9JiF%<2 zw3}3%l*>ERTE%y2+!3SY+ZYFGSKU&1+93zBaWsVGn1 zC6*?mVu-wO3P?A#ata9UtGp|4UI{v~NYJfc>%VuCScDUyavWqNf!puex{cF7Uf%_J zF`TNl0(ZZd8_`nGFZ;tfYCUH1QkKv?i}OD_8t1P^;{09g-@tC9Lqw!r+E5VtU7Z@g za}j3E*nm3zjD-Qhs1!hd7gcN4*YTV0d3u88NURTg`~A9cNatZ75p9d-yk4i`NWkR^ z5k*JGlYN6c8p={Inv;pA#(bulpI!t(otlZJX5@-KK(L`}s3m$i&mD$PI=oMx!e?lS 
zt_TA1Rma`53SO*no+a!{a?h8FAMjfIEA2YIJIJQVF6%c&ghxE2beknJ1b*>tz&8oA z2$xpt<}``uSwd3zE+d`qzsR3~qp-}*{c6--dq06B6Y@m&v*6pQM=r_&nR)Rw z(==dzMlRDRLvMxRW=)%;YwofW9rm$M_uSy}pD%$M|$2F6ram`&uJRAlcH6>GM@Uz7DYlVdZ!_{zx?|=K&n}q71Y}j zFK|Z*3nV3LuhVaO1icB+svH6O%UQdm;U6wFCcb1bOdfH8h~a3&BnmB@NfH3sHJL>U zgs6za2oQ*!?qI&r5TBP#D85&vTzqFYRfyj^sL?PiMN)f|$nzg1(~vTQz+C>J^Nq3Ao|7{5s;?tTT^l5pW`F1lyX z-LC+RnB@tw`u)y-V#KVP5CN)Pk7TS15EPpGaGhS~D$u>b@J8v)4ND-y)B|0NCJ5%bhKh0%da z%VC7r&fz!OznqSLt!QVNk14xK;&e?ZAa6cOk?&*7=K*J&HO?TH$FF@Hj&0zHFl2B+6@jwBg~-UKAK3$%Or;3Q~0 zIgryFSi00hJqYBJ1Q$f_&u}@6UQnpD$0?3!04MS)X355B?7~ZF^jmR8Y{iWf$4XLg zd_)_RJ1UuQNtI;k953Z+YCfKQsE$LA`ZGSU1izUEMcI@|{6TYPa`y7_N!90FS!2Yg@@LN8G1kc2C1pDTx82_cgK8hfU z-a$DGrI_<-N@tt;bWEfKWO5(O)Vd3LYZ$jE_#YFWFJzeoQ-N`M*^ZYc`pbVy zk@rC50!NQ0p`BGg#t9&rO$V$gJeqQ4)hF;K&^%z&_;EqKgFRMk2-xOoUnGK7Bv1az z9nHK7U|V!c3^@T$+{#g3Ha2=nwVt zt#KM3g%)HpYlvK8ZaiV`xVGgB@YBK>eLqW~|0WxT3@*G6W77a9$nQ3GN6f|%Ha+Nf zy;YElJjM7SG7^3`#;WuCdR=L9ia{+W&?xvaUp$d3K|TGo&x=|r(jD6QzIgVV9npS) z%hyCh?aSNoEo%dChy3O5)#?|AbKYY{w_g}nLH2?bh-}ZKas1>`k`4Xov}Ut$?VCB} z#@imoQ#+8pEb2amQb)1@QK<03QD6pcM@e=n*EmuR@alL0pw~eyr&p(d?01g*7r+@S zd^hvsK=+#I0+=Ou&N0DD6rX)wNrLr!D|3k)hTY0Z5)0(mg5X9~6xD4t0PvuoudhMf zUqN;Qay_W)mSE-zJqbmUENkpx)UR|*+_1pNnl>xhm6wF1Yzy5cn#0}$%iNodC%vWMaV+1; zA?F9an=AjILf^EDgP{XM(hGE>w6`bQbnmhvqNsPqKy?S8dS9p$N5}-Mt15DFu8-~s zj3<@S(t=dt-+Mh^^?-xbyjPx=xO1|D#I4{sub0djoz@iZzA%N-3QS78kXY@gsuKc` zDF|@n)GGd<3Jx+_ziT=BF{IA-5F(__P;kCN8SA|{86B{(~U&WBVTlxp8$9f}9%)YT!p~-F3PgCoxAz{v)nRcGD(bHKj;-piW6dipu z{a4kw#88i5wvO~fU6Rt}l)0Ovyd7_eo|2d*JuL5Sk{DK2W_J{!8VWovYIo-JLNpX0 z)OuUxlx2b)A8P2iG`e2prPJH(*J*sx*|eIu$}N$9g5sugdel?S%8!mQg|@{KC+1e^ zUZ&7vT}ow8ox3ytx)tKdWOw?P^9`Wqnhowvzo%OD3W`V3Ulp40yq$~5<=CCK(z13Y z{%D$7-oZCTRuzvVD1_xpGjSO^@$AqHA1@su=6*8FIHii5*NLquEkr8EQ)is%u>rWI(uBd8jV*cArQrpZL4j?fPhgUz_ElVed30CaK_Rils({4z3)})0DB|(gprEPA=OxURPE&yUPt6pfCkhxj z67HSsSX>Sl(r0(Uu?}3I)fKb~&^duwUme2-N}gl8aDDncDWk247rjIVNDk(qjH^Zh zg*GhOWPcROa4n>RNrwdhgu-;H!G4ng{{co=X$kPBKkjvOf1(F8aB5kZiZ67&zmrNG 
zA|N0L!U-khY5dJcNWCWn#26;{+NVEenW(~q;|K|V+E1U6E>OkHK4B8Agxv%a5R%7Y zLa*N47M`xe?9*veA8?qkSMU3H=X4!1At>{{m-i12=5@Z#WWxl90-Stq7eImqYgr#7 zdcaU)SiG+#u6;+IK#jQ8vl}1x6pG~nkr$)4Hj0x^fdYOxh0R>t!u{`?mVLTV+}`5t zkN=FY-!c}ijhwih!m_K+)ipPs{N)sbnvN}-#m%}o?@{F1;-eOib^wshdnX;jNzR=J z{Ku1Sh502XO~Dl$>_>;d1sNTT@0d^yOiG$CoB#(B5F88mW@q?WfX_i9xB0j3xEM5_ z`CdUYba@hxe63RO*!Cqjihok zWk)Z#__^*q)xd_1&^KZ|AoQZyW9)K#HTH2X^pmjy_^))q1zUd=E+AeZ&_`0?i7MC<&#d!Kj{{x1#D|Fk3+BbSu9{J> zf1iQ(S<#aWyaT1?XfHwil z0sQR=z+a=|G2sY;`EJH1<6JR4y_Ml76#s5dqS-e1O%9PqfB7Q@(A$yF`)@(-pPVX? zo&?~VFmikkhO_i^9L|~a!{{ly{GFv(f)Zjf$|#+8_9NZqtM7;}Vnj$lXoSLn)5tjE zLD(ocaKFh6Z4^PwGdYT$8*Wfj+X$4LicB^WsAv7QvtuNis(2p*>n#gvs}NC zN3=~z`q7l6KR-8GnDkS|s6u9%=)_L*L?~4^XDlX|H0dhpEl%V69CCyEGLpTxP zMRd~Wc|^n=4m|+5cAK3!7^kQ*8U0RwvYm=ET}K^H=S&=_8Jrx8$`(C}0(cj%9;%N0 z&Ky-3Zy-Y*9rhnKbH?mf-~+<~?vpwr(ZpyqMHCtU@&?K*c{VMc~-CBN2*CGSyd2M zn|1nVr*xT&2ISiDZ%O>LxEf*AATpi-$c;__qX{S0rTbgq4BMFO(diJ*e3QzG9f#2& z7H<^h9x?9eJ_}ziQtu?bss|KIq&_-O9ct3`FjgKZKPmr0f|4R~p0j7{P@Q3+O&W&* zE)tngadd_v^q|q0XjUE9r^sCw*C%I?^TzchA#!|`3L;MyShH;)$&zoUwv_q6z7rqI zv}q?D5ZCAT$t*v{&)JF-t1I0b~ zzp)vIw$Z z`&yzE(Fr=V)cEmI&|c5={C!!+?YD`3_-juXdnSbKd;YDHIWhry{$|8dU<-u|=`zJI zoWs}&0L+)9xaJ`ot@lAA975MzLRC5rf!<5*G!({!ag>oIs+~3)Mv)r)O1;1;`yFlfLdl;|m!v-B zV~&O|#8))PzGiYQ(4>8a;r;yi+0n_;s!R$U8X-S-5$d)=Mu$q~_A7>Df+Za!u(Pu; z8MEDy_)T)A2P%_rowm+*Q$?TX0qU!rjuu#{JkJJ#ng-rw| z>+CE}V>j}3!VE}%Be=(aBr1XGydPURJC!;Be>=wVxzX*9Bo_2P@<4uB&%b7gZ-@B? zUei6xKE^&|&f6^_cH-QE?~5n>(YRueuo*J<%W8B&R--*MR_1_k@Y0$3IfdSyX+uUq zsSKh?G69X%k~bIuOSEV#oDSc!BlOg%3}eX@r>_{9+Gun$Q5)>}f@JE;{;k(?T!N7) zY@pnm)t{m=mZAwFSKZ8@BUMhN28<U2VF0USyE!_s&fr`e%Nk7x(AaoQs)uY;Kk^0nzN zqQq#o5a#E9pF5P%Q5ws#W=nF=`oD8BPOAM)9(4F&S#ckYK zS8;TKYvXGG&mY}n?t2*g{jVUE4H^vigriqHVPqSnSJE0Cy-H6srNb{&Xcj|1q0#nI z(`h;Byyx@J~Y$AyLoDywnraCL=tP*!$NQ=WR24_Rn$R#Y>181Hd;^k zys)e%?ZeGyDDJdU^Q^ExGJr#fka}5L>!do!QKw~E->!zt_kuv>;rAFs@Yx2T16qc0 zb7CoJ(j7S_!-q`c))8t|AMAXV#%VgJ495+=TlAz36YG;0C$dE*nGR*^aOX4XjD^;H ztYR~An2BK5YYTB9o*Mhzc`m$yRFL6WQuzE(2%laAYf? 
zX-2lX`8bAbDPAO%uH2C?_;dVBvb8alt{_|6wJjA-eY0$cZD>x+`SCr~yD*UE-(c27 zt8(#d6R03og@|=R7PBwUdAbs?KAbTeM!QL|y&pDjkR7pWK)ED=ks+t;9}}b68Hh4! zJIEjp6=^^zZ7)Kc5FS`v5fd(PCjSC^cQTYHmEU<%Bkx8%9MvFnF3_!m`Z+vZql*lC9p3{HASWG*@Pz9@BsGKiVVf%1n*`GAU$HL zxrQD)T%*?~96j39tP$=83!$FF+JYJul2mS!;AxW^8smeenlF_G$$a&@;UT{5fPzeN{`%GPoOOrDiPJ4Kc|9rwwK5e zi6sOsyZ+Fr{mj9KBt8=Yc z;3$DI`KRBSwSGX7TyN;D2oVD`!lV%^(%+&jF!H7u_7dY?u8|(;{KL+_5|I@g4EK8aC84yNov-smZZiP94|>#k)(~OBtn?7UHSjk>%z#>`Jeb$Z)l0GDT$gi4;Ce86u8WLiNp%5!s&!Cp?dB6x;SP1F zPpU?jw(*D+$grkKu1_-KV)BNh4$D38!m%VCSjI6qa9Gu5M;rJ%0I%ZjhWJ^w8R>L^ zo6$oO9M_Q~rF=WT=};MytWw=1&12&~>6J@RUX`J-GC#WC=xguaqlg8BZ(M0{q?Og) zY2up5c5+E4PWPzfSE$V?#75Z(D-($`ZNb~5t9~vch*BlxVqrBNmr!rcx!fu=H=gau zQ)|A4TB+Ze_~gZ#NZIOwnxrXhq0%)zaYswiA%?2u!xoZs48I)RpZdT+?vYA-wRW5% zyowkR$i61iU_e``#GkRjh4uI|?zH2@$>c)8d2mrA{tO&SY#51eC0$_xoL?fjFx=YL z)k$B^>s|?2akW5XT3x+eMk-xne7@7F(P!5Yl3dr6Q+KIxXfY{*l~Vk;NSLX zq&A7R`oaLzqHgkORTh#jfruQ?P(w!(7}TuwlB)o45DjGTp_)}vP^g8Ikyu4^F8oC2 zX{2K1B#VltJAwl*06*8Q5VS9%P1U6ShzgS%(dA3PD-gAB0Q7!()mc*qiH)(q?>F>+m%7DBTY!LLRBcF_ckTSZTl`g zBsaf~@h$w?oew5KN$-4!k&BN@gvrMX5MkimiP1+4Jc$$_i#t9@$M}ptJ_vcj89x;o zhzm<w#U#!(XrrAgY8OL12mXk#{`B00B14fU2DF4t8&%s+HO3_K-V3(>F0(_l#*jI$`*b~cq)LHS z&L|2oa(nmWR2}Mf!bQ`9a-6AHC)o0E>VV6i)Lkh@{S0GPingc)1Km2bfc_1<2|{LJ z{}(-B56=u-bTCQT^;71X)3{j~^PWxjSS#a^(f58Cc^%b;$J=lqTw>zq4$Hc!Nl%6a zFK~TS2enO`G60+xDq|y(A|0E}XJt>UGOBbZ%-ngpnQRU46FREk1qL{sjU{`yUdI_| z$lW^WK%cr_(7L<3!38ct&Jcz-maK`%mHDD8GzAUNGeVc0PhA2P37XS2KvD~qGPaeD zJqaF1ckG5`3g@_A<4wspHAriO$?H~QRME~aP@7UXgGA`1qkfKXRcKq_d-X?GeE2q1|3-qny*@=XV|L2ig4*!@sH69XKO#KA6 zY*w2e{aE&z?8mYn&wgT4=?l$a@rEED4bM1Mx`KPaFTXx|6hoNixAeIXSM8wmL2fr+ zRF^B=Jc(<6nRZzZggGdM8Rc-i%9ASIa)P0SohR)4R85-cdUmc9sC*yU| z$_TEAs8*b+jw-lNeU{8qPjRlh{9u4{==6visULG!iia@>N*);;*dtZL(Q{6w2Mcev z$xKAsj?E47l4^*~UhT^A;)-OWqd)m(CjX5b4!z8yL=fOZE6{?ID#%qw>)?AEKe{_w z)n-d?45*RN^@yX|*KW>5%sciWam#k3dRb2B<{|-GM=+mbKm|QfTHJFE^wV{*Ro2uvSkxGX!#&ui*O&-C9gUL zrCtoKjfhwoU>Za(`z_a+Ogp^|s7ZIea@Tgy^P~nvK16S_=(1OL<*JiV!}!ij`_Y;j 
zu7cIt_ftCG^W;S$#_Pxq$>n*aiPU0z%}Ps_N1?&JYK`yR{8GG|&zj*fzhJ>>QuR-$ zR6`QSYrI8hxQ?`Wapu4H7b{-XEro{%ES}&D{Gcdr?T10%7ue>F&R&URE%mr9^d1Xz!LA4GTuONTQh7T$6_g`6 z{NT)+l z+*aud3Q6EQ<+c?1`M@jA(cJLEe{KW7L`{XQiaZReti1cQc&fxD2||)j?%veYj{&(7 z6L{-pcNrees0gT!>(4X#BjMB;dWMLC9d_*{8+llckaTt30$YCwxH{xpMkNw$>#eM^ zSf*_w4O>oft_+FTcF1pMXfO)9O@Ui7(t2cOGRL~l^{WeZGP#U;cB#U1s;|%N+3XSb zrMCIRa;bCAv6GXd$Di@OLF{?&$Fjy=2YuMCHWfmK$(uf4F+*@RYnFRA1L8L1=gWwK zIp}vif#eCGbZaf~cu!o#e?mf?=d{$#t#@*C^;Xv6lqJ6-hX*{WT!z z7Le{G5dsUbeRx1_lhDfW~gp(aIU1&{(m9pF~ebO01V>&=(D$S0*^* zcqqe5WGMMdh((=XSl=^xmF;+zSVJ8r^K~|K06({JhmvwafQwOBeG#4}fO}Lkxz4>i zpkk%p$~zfsuN-J*Hd2sv1ZEVP^(5s+_c#iE1tyi-9C?v((X=l1epu_h25)j7;aV2J z(@sVrun;Zp3FWjuvI93x+onv?#yCq9P_eyxOp`pygo&M&Co|7Jf#HA?k4TesGTBwq zj`)yncO25T6gFnQV1>4d`13erJ}oOHu0!O(I120=I0L^1*!SvnJ9D7hqWX4?0F!TH)Y;NW`Qo3J|1?UTV~DSEcJ@s`X5ZK!fyIWlQPi2hOTLZ5;1E? z%*$$YfK}bcLhmDvs}RqA6I6Cclvoj5^#ra0X&Z*bP%Px|#Ig^C#TSC;U}%TJh@@c` zdINW66W?m(0*cr(MY2sB9S|R&0pNaT-rLU-Z&v*PJok<8Oum5L;eiJ9hM8FcW+xn{ z03@`UPJhLx1kBPqp~q5G7eSO$!VgQSCs%aV7IDNFAmiJ2*vuDvjOl9r;v&x&>bzP(Al?7dDhC(yqjp<1LuX+SR*zttZ5_AMiQgEnX8(uQYtK`AmSPbWYwp4n z;5h!!J<9k~A&Wdok_`YIxC?LKi*0?{p$fV*)%m(=Zy6VGidc`R6BHrIScqY{+n9D3 zEVqxoKIJxc!cI-!c8~^-SD_!3{&~N0h^N?xd0^8azw1mDHX)lSaCadxpa4?XWC+A6 zgj}2rS{SP2qZfl{CkwS7r06B+JMa$7i&h(~<4|UB-qLi!U+z;;p@;%7oR98-5s-aQ z#fb3Ddk+&nZbhspr~wN3%OBxvKrSjnTJ+bR21@pGmQv_F4BN51;DuE`u%3J<;dwxv zpBndeg}^Ssc%7b?NLB;K`K;4ugb*{1MUUpesq3=(J1`QogPQk`kiJ6{xXYHBula zDnrG+5bAC)MZhDxe9C8GmF4O5CZ9d*$ znj30bwT{G17U!#1#X=j)f9N7q$nQwm3t~WSr}8Y;`ADYuwGax+I<%3snVuo zDug18*Emgznyh9{{2^jiNJ=|82TI7*tu`XZ6RXv}cJcs+Cg_r_pKMa5*I9Oc`K;l| z>;rQ3^QwOW-FmVRK|D?Nc`<2?Txm@!Z881$o6RI*n`@w~se9j{E;=htj&!0GSI;Dq zO6k^Y>H-RQ(qgH1&0@XHRG6}PUgujolQO2>&9WCqZBEJT-JFfK*tuLUheg709z=@9 zcg;5Dz?S>CYEmb&p`6TYaNIbdiNPI8P)M#@Ru*6E2)krZFUhYW#g-zjwdu1 z__UD-mfwVUWxlkK^Ou8o(%p6}QV6WUp>Z@P7cA%G1Tk5+BobfR;5(d(Z~U6Yrb?dQ z$&WH8{92&VONvUmeH=ZF20o|XTrSDS;@>zswceNS=z)K8voo#8J;2;KTRAvGeo*LM 
zaO{MAi+|chF=Udo$~hb8PcUxXK1iBQ_>iFL95i$nlpSH%D)%w-Y<2En7J4$ItF6o- z7MCYJ55aIhAU12>1VilB(-{2Ldh}D?bS*jpd8(kv*UBBlf&QtB4r$hTC((V}B|PtrtQQ7m@YB4NK2aZYRU23Q1BKjU}()G>9zIJYUM+8qjxRN8_hG1_*{)g*h)4Chb`3wiiQ~Pb+!++7%H7c` z{WSuiNP0mY4q>dL1xAFaeFd2EC3lpL~saUC_^@;e`3B^txt( zX?Irix^-H$rpymr(57oYpf?z0U_XY#2-oN5rt_t*v(F;uyJeo*C{AyW2-Hu8$g=!5v6rY!KB)i~sI&@$<{raMho zU>B2yV0Yqr`ZqA~u=dLht#n|D9GIodS(t50e2tFGgNjq7#x zy*Qmv?WRn9if7^vcGR8ptJY)U5|B=`Hps9g-TVKt_deipR@a@s28+~5Nk*GOOVXs7 zrgDoCk|_C+R0h(_z$na^8C&5`;wF$Nh!R35Qi@0n0cVJ$G=msN-O?^}$yUFXZu?}( zgW9FLgpfQVgTXiuWDsEf1O^kpHekjeurdDg`+U#2@BEP^Y*Tjkx6e}&kKXsb_nv$2 zx%ZrV?z!ijn;E*@*zSF{Oj!6k3m|Pl7PzHW(-_XK*RDkv;07%rNg^XyJaElQbv^8W z=?-IlP&^rC&ZDgl_v%T^d*%UDBo|b8a$1*#F|i)_S*I*IlgEyg!j1bnIRT*@M4cyv z%kbk`E{dWZ4B`PmK8JKXr7z6ViD&mlN6-IdNAYbL5;M!ZHMP_o90T321?$H znV3kb_!vw3{rr1bO1aqWZ0ep9g|K6LP6Al>;=AIjVz#;4oFUFZQBiXx4X9X-5bO%_ za7!pcQHqG7SN*~^cjy$E9l9ZKZ8o+pFU25xM!@HSj%4$H_3hje-`wqZ&GvXX+uS)R zXcLzo{T6$;6WPLjjUC*{t=ZAfv61?TreLbLwtMtsPTl&JXGc$FxAv{HZPMta=i29CU<`H?+cX9j*gD=N1K|2ifQ?|rhD|%q@BJ|R5IKEiTTn0tZ#y% zotva?Rw>)S@pnp}P(XA_$=*x+oM_|bLS56w?NL9O(_H3&D94tW13A61Z@XRlHf=_2 z-*$YoZ)00@Y5M@0l4#%dlA2Yph9A?8Kr3m}PU1L};8ApFVRe0b$ZJ_W?9os(Z`)c& z00PVKJugmF;ovXYqA^V2u;c*==wrf!c4-{S9seU0HCb6zhV5PIbF}^0*}JyW+Rrr} zHrK8#2~68+&ADsya^ewZF=!~wrY2#Vrz>J=~ynlbSiAyx{H;`S2>5f(q#9^&1~F|{b{T=Zuo-! 
zx*fyW&Mnn(XS#Iw%fbhf!FO&ic6Us5zZGlE|4cwXxZ!)^BY(kDIF3%lzdz(}8ZX{1 z(%Wk%#=n2QW#4sp{QGC|6rI;a7puz*N^`GUy6rAP5RW`k`qqMUabp_xsEXs6$Gb~s zw=YO3z7mpn;F==*Ds7*9?T`^{<=D$;%#d>mv^gEmZsf?Z_!v9WE@-1TRV2{QYvTGE zaQ3XJ$z2?~t>aMh8O7yI#ct2C1Jog;Y#y{KX)CG5sXM6-Ho-fzN7RB?1_XzmucTv7 zYBo)gSzAe>wNSLe$0cExs*PI<)ob(EnU1}p1G*}?yNi2LMIzAMu%oz!OGAZWN@cch zAZRcr@2noGf#W)fz)avFJDk*DNmwj>1CsJ3Qzw~;?$YP>*q)aIM*?r`k#a?QxbD)o zIefgfy-h7ZovE6O*LxJ^=UX-Q+QKRKfc=FS_)Cs){-}l)YR~rFX1$n9Y1@DY;l_7P@d%IGBYg_O&D^nGJLZP zNcD%7kvcv}TMcc!dRs5UDLve(pZD4~3B@h6N(+;`JZ2QF$QC-?k6=|!a~88v&Dn0A zE2O@d$d%gvQb6tw(~d+WVU{8Ep3@u6rh(qmNSR3YzcPfA5mJaooSYn(k#n1@8QZvs zLOHg;iYjDhS2W|IHBEVem1NRRPJty0N3Bp~`nX!IWl~35{cBUjS7{w@3FxM1B>^=qk)^(SDD`<R%&U)CtGAm!Yu43p;F@enp?l{#jElY>CbOPjhp+LZkhN>QL3F-W9BO z=A0uvKxj26(I%Lx5peFZu1$DGt4VtZ4z$1^au2#8%85MW@-6#8%IL7zMv=0#DWGMO zkS5n`yfp<_ZPb3PVV04yEkPmnlYBLO!89)4DBLM(2-7%ESz~-Qn1r|8+Bl`YBgUES zBc)IgL`bmcTHL;9-@=6TG52w?c2+_=ZOi8f_7>7uNWcF{%hiTXMYa4wO_C_-G87USaUt?XZGCKo>1wCWXMUIW*WoD7?=X1LndzmRq5;*Tu`6sMuZI9 z<$x>yDo)L+01g*R$_ zn5-7LeDY;xGjAxN17O3wPxXTXmuvzzFh4jG%^R8+8dsr@#;L@{${^61RS+DT5|QNs zop(Kp+Ai9{qbQWnpc$6cSsf2K=d*mEilHQfTom2g#M)rk8w;(q^ZvI}x9W6I5h_&L zCzH(#VEo(~*b};~s6H%F0Z@HfIUuh+Ov75stjJVDkub4ou;YgZZ;jP#_4P*e&Dlb$ z!JBCgLXH8erxGx^)xb!+5&}*8H47!`Cr8VMb^5tZ`Meo(4OI`23}iKQB=AWr+V#ni zW*L&9PSdm5IPHsEn{y%-GtKwqr4e831fqE+1n6eG*Kccc;m)uQ)hd2KA$5nOID-qnozLNxsJ19>;JqG5LJOPrAK zdyN{S9vV#=>_)V4EFoxQGPH(Rk^#TQ1E>;RO)Ht0&{`T5vz%rdnOS?L)HkzRk_(+I zCRo1A{HS3y0qQJL6!@tLqFU)!EgMUnF$6-t~q!unAgP|2ZkI+<`1 znZ~zIU}>}5`jLbT#0-pG8Kkua&e3MYTK1N*%od+O!bR@41UvDbz7q>T65R*^nD-^J zF&)FR69ehJ-Z0a>Y$=_(w}=R+Bt!)>h{3cIG5tw;N5nuXT)Td2YUmmH5X{_zHCroU zrw-M|sj$0vA2*)vSz0>!N9tK>z4=5M)oj6Du$8)vhs->?q?CSNTX#`1vP6OYpOcp( zEJePipCC;&aVV-*{M?FyCDYH?3LQzQ0tW=cI&9zWWxu|?SLLJ8d{x6D(FD&>V3+_y zk~4CbE;aNvNt_n7_FdmT5E{`m(q2mKPysMt<~cI+>3^0|h+d%-CpY?REPtn3@C*G& z3)Qg9kc6g*q)U+x0f{b(7AiY&>`6A2lTnim)RfokZaLyUl7v4JFIfX2O% z5$B;M&JZPLHRHj$Rjzb2e5i&{%`9lVic! 
zJXHfRAS9WL|^8jq8@>KLpUM`LdoM;a#=#|&HJ-nJ$j6%GtX z6K^Jt%FQ^6M6WqYItLmau8o`UTyYlop9yG@=1-c4Y=pAE9ESL+M$F6{s<`lNm=OViUFlGm74V@(F!a zB>Pazh*UdF?WNh}QPePG{U)_x|0R!DlYMAIP0IAfNL!)klkxsDbZVtTAp`@ILH#=Q zno&dobbxA>fn*h8BRwM>p^5Bg+a;`6(t3MomiF)!Wv*|2fk6=d07E(>nqhVYijZt3 zy=J>kd;)eXiZDPUY{Si0)4Hs)Rt!kGidtd4vlw?W_eCp7%gjs>9Jzh_v@{ggwHH*1 zcG-}q-8JcPdYTNJnY+;Df{7c+OOnA%L?7b)oad^Bvn`577($IzL41=BWb~(+Qp;)q ztzf+zGGULcCk-VnU_1>+;E^6Hdw*H!x_%v(-Q`d{ zxBj&SgiKPQ)FWtHHD!{XY!`QMc%V1GK8d!_HT5K6i9u-+@4w1<0SF~s=TDeiq>5~K ze8h&H(jaIy%-oS$&pFZYm``41+jCRlE|KOI80YE8;)&J_ihB*-$)&-!C2UKep<3jS z=@c4YJXIG)c5DG<(}WO3g)2G)ZETSA;~q{NSxb_hwrNrkcp*kcXFX(>Ws8=_BJMl9 z$}_2xDr7=8!Cp}}UljPP|7diEPwZZh*?!1)y!yR^c1!_}6qd$CTg#f1Hn`-5MS5 zaI$f!oh7Dm0K(PPFRsRfm>!UChdN#L=d#rL1He&QL9kiA}t8GZzRl zhv*4R-yEE{X56-mDgw?~021~oZE9?eV6-}*9;X!SgBMOUKWx*`bhu3H9l4k$IG;`Z zD)R4?CMcW}fPa4z7hz7Bs!i6^ym6r9IwQ8wyn<2BD(VNz+S5~kk&hUT7h-(<16hJ9 zL(QA6WJZr6o)VK(Sw-E`6)gmvG@8Lgz~mtp+nS3(Q;68SiDQo2U(UZg8wX>wK?K;t zU&^t?^at3XKd^!F2hd1MG*1%zB`+i357)I9)JDT}pj}3pRa?i)y)=!k_Z%8wb{g}N zkjpo;*6fhNBveL<&{m#QOOkJlTG~MQam-ciQz0AXQvMX7tBraM0j6HJA$Iuz%vwXj zTZK334{zc%)E(ZYc%y#aG{tRQ0Ka0i(`8xG=)lZIT!W-;OXiUs&6MFmH8DUjVwy~^ zl}AC@lWo|6XLLs)ZN&h|#6CkV1|X*Fq(~mV8T;n13~$OzSUgKiMxX&zrz`hMNf*z? 
zpb1hRBa=wKhvv!BR)_z(wsO^lV2}f1(LD}G4`1vUzQrbsuHR8XaP~*Sdzd+ZSx$DN z7CrEJrKTF+dzd@cZZRpp$Lm!-FZQp!bAP?KZRoXF{JX!TNuEzLQa}^_s)?!BPJf-4 z2TI6Yp67Gt&}%qkuc%cLruoXGL)BzgB|ZMy#w$-usiuxx9C9b-`P?(~+JgA-Z`3LY zq_eWup#}n4IpD7Y^FRr?3-f#)7<#Rh+bZE>4kl3J&k78cr<5s4XMU!^^OUK%Vg0yX zrZP_s!5Dh&(D?8tYZZ#v1%A8+K9K#-CTc02n%Dax@W=Hs$$4@J#?WhTi4WgZ0~VOc z&&stFaTPUTs+KY~54d`n@p*Cx#?WiOHuxn+RVNf7(3Ki!aT%plYbiaXc<)4-y?XO` z!m4bpkO+dT*zt}Z3rf7>auwAo)I!$o;*=I5k)TI_%k_y$gH=Hm``HC9@kjJ$Cg<@R zxE>1tj6#lRGz9==OGh8i$_PW{1uB0|r^VUW?q#K8e%M_)7u!IRKEQ%aI1egIFUH$; zh^+a(=M|$^rBYZ{k^N{6$vLwz*w?og_^`Bi{fxlX+90q;|Im^FnZ~K|xO^Xrt;5mZ zF^c75w_)daGvM5aM?qr*Zb|W$U#Tfei^IPX*bF?!#E$sz_i!*U+mfHbxk#4m0W134 zMbX?HfQuWG9{`~r;tgZ0Vd}GH~>Ic#Yhd{02$*Q|Dq9zcl=NTHV4$3^w|Ly<_3I|N&*|M&{2au zVX*&hoKNwNznO3tvs-+VK0EC4Jg`*~*f4;*Yk7m-^5$|aMJptzxp}IVQk@50y-dr# zOM~XdO@cvC&Id z&<{H#02oPAQ9icAs2V`r_ScI!j+;ScE?+HXl{wT{79A+-O*Ug-AanWc_VMyb{n5?a zy}8(F3KrlZM<(XFOUP)J#rPUp1pic3qYxO0wm?&MHNzI zja=XzUMmx`_{hs#9U-GN<>eao_{d>j_ZB7jtY%l$GHuna;9b@?Kh$?eeP)4goMaUx zsG04LOZpNlGv-%?lh0thNh7RF>|B*il9}@GcZ_%KZYuiDw#m05rwT14nz}`~(zxrC zVDIY$_F5~NVI!k{$%|>l+xSb1RIQxHGuqyj`G~920?D7MTLw`=a=LhadvCUQH(fSQ zS&*D5K9nvJ=m`msjJQ@1&gSZTskN{P7!*8>{nhBYOEReF7-=XOte+=(JR`BRT&4b1+~QTAS?Pe=6fn!;27GAfx7A3+*-p>o2pl zLVt3l3#o&Rn(~ll^XND?7mE+Ku4m@bV?o&bKdS^rxY1mk4K%Inp&GfbE9sBi@s#&<3_ z-l>b4t`#*|d~5P)oaNZOkiM*R_98yA4Yo|zno*OT=u3rPe_`6zR;Q^4qzk|3!hos5^h12NKLy+|97P|r&Y;SYJ|uMv?Nzr|at zIgn;(g`py`;ZcATs7ZS-{4Sg!@mW|W6lEINY`rNHCZ%RauQh|S+iN1)+P*hcynU3Q zNU806LWC4uV6j2oa|sysD_rNUSW@iopug{Bh>`-Fq~KVzl$SC^97vtSAF`=3g7|j1 zy3}_|(4v4t@<*{!#?@x#RzWbNFiG;uFjASKrTk<_(Nd0Gs7%hnBKHS`D`yORCf35~9FRZ`>{;Rd#Pg5pTiw28h2-O&4nRa^k2tTu1D%#ejkG5@*? 
zafD$1_=+l1$YRdi&0WsC>rkuVXJ>8_Zp*h?dCezYC+%j&TJycL8?5a(a2|g;4#A{a zbak_OH~pYH)e(;yE#9_8;X?ju-5bmYqKu}5qGz_ zNP2Umlpa=di~Dn)P35{f-IPyTbo!K`Kd0g68^WY?moD1t5mLT`jBV4kOm3g9@pqoB zm`}dEGEsCGL(m5IK;n7U%+=(1FEzkOAVkgDoCj<;&EK4rIGvNXYti#6!NCUh(L>y_ z&J(Vdt>mIhqjcKFun56VtVy>H1oHlo5){9BL14_n0FWR~3JZgI!lY|k)b{DV@9@0R~3 z{vHHIS;y{_BH~`ZNnjLYDRaJ~_!ag{ky>;gdD0R2;l+GCFbaJ2KeGB*LXF-ND=Nv zfb^=xtmaK8c02$vXbI8>(T~t3`88>O(P@&NMqH7s5Fcr!qRl^ATZ`*y^z`lqKRR*t zKu_+EBgeV~s7y;N13X}OyUKksh$MjH?6T6~|6C_Dt(g$ob+&Z#;0KJeLZg_whFTo3 z$czfE))GLNX-<4jWC<;3MsUzw1AW;^ncZ6aOQR;Hz6^|TV`e(9%M4A5B+RicGV^&Ck!|r z;bdxVM&qd7Ql&HSQ;KIFmG2SmzMO#L(Nj(p*NX^^Dv0)nx5w{LP+K>nyW(~-ZO}#U z-2_K9FQknNa1N?U23NPbJJv-4yKaOoUT1Vnjt5f3N026a$diuUn(ezUU%ack^l3~B zIp|{{6imW@wSt~rUh{nqOX8hS_M*jB6?DPt)!+CH8wo`L_9PNBr1uF-n=o12S~_>n z(56=E633{yHzAe-T7!cZQ$Ds44<@;>NZYU7fgo#ZEJPrM9{=Z(qzl3wo=Uc31sG!g zRfyxAJc`~cs=Ihsbi9aoJV48}p%7&4q&3)6DP$mto2bF^PObmoB~p z>s$?4tYr{~FQGDpn?#17+(Im=kWM@c3kH=5s@O54q7cnGnRS3_$pE(6Q` zL^yg!R8hU|(lajX(4MM_o2dodnaubR(KSO|WRk$z5{ZQ51^b5SV0zI|StK>mKxT_h zKg*7y@ImqCXuUp6=>x#z$HERJphRA1sMV~I0wF}OAOr^^-N3?@hOq)yOgspKOi~DQ zn+oVft>~7qGRf(G3H)4NI~v^-!NjU)#;L+#Q`q3XUA+=modZ$)lh#Y}tXKVJvE*(> zEl{r7#XJDfFZd`3BXnddjjQ<|F}Y3<=k!I=2@U0HQ)NzHB*}Z~)Is_njjNcZ zni?iNF~q?K$zHyhP z&IpXPB^*~Zw#8aa{o1ZXc2Ny|s6z7mGK}k$I_p-K`{KkVi5jO_FkoGpg`n|(AhQxXSRbTtARCZVZ`y=_ zMmPzO!C}KxB`>_xW}?P^ew25(C%XXI>Ot;I{*SB~;-}B~_LaijSZpRD)A)Kv8{iHM%HGd5$;eqBIuz zBp)11Ns!{`N23GW922!}59otq)NgFFHbo6uo1oD(Xc(AI5P^7%Es_Z}yF@WQ&*`F6 zQQzLEixNQV)|N7SvV+5#;Ajp^N5IiUzb+p{E=oz00B}X}2Q%t4e1$%MVBAUT$qx~w)zFQ@XP8$7*#;typrLDFo!@`3F`9QQV&>>T$z@AF=WV&zaf&MAd+&OEJBp$Hu;AxxF;-0sxvw!){Nr z3dxTK&+AXrsczAkx@%H%+YFR89W@8?u+uleg|jx=W<%WLxNYE{;9CuE#=~4z>zzJ3 zY;QBmH>&|F#+VyCXr_WVbSXk5SdkQH-E8QJVMRLt$H*=11x;f|3W@3hMVJt1WFTvq zKq*N|j0PwNiV%ZQ33Q51Nkj>>`hvpdRB8K1$W#E*fcM$c1i-Uhiprt+wx9>~uXV7g zH-?i&m!k34#Yvlw4NmDzP!1F)Lhg-b!eZEI3D6pG(s^JwsRr7e6DM@jS8HpeBN&YN zY|$Wmh)&u(cbB4I2x&_wW_lC)Sj!b~7R{N8KKh)AzS!KaJ2F|P?tO0|B;{E#{F)QjuuXJfG}fb00UNirQfMAfI6D#ShetC<#Y2 
zn(7G*wP6dTEkM#2R_7fEBznh>q1Gyt6YDpdJ40|PN@Q?Y@Wvn^%yKO1tX0BtG|RE5 zS3vDVi}l!e`P`00i2&n#cSSll$0B4EOopt(17S%uISDO3?ie&+lKD3sT#U@ugoet` z$gXUjk!B4;%g}=B+6&J3I0>U@g`Y;h9{M0L%d~WaUnhslh(65#sWUthJ!(lMi*ov>@!KH_ccZyC>XvJ~o0l;lMXa|cPJFNL-KwfM1H-d}(xtgjG zt>}Ryw>{yhrpby#*bPe54=xAIeBV21J*_L*Xrn;{fwTR9=%2})IwKDfe(-WKK48e9 z_2_-f;|Ik+ug^DJerN+~6Ph4$plgT*&TjEh0@{lS>+V<>Md?h0Mm0hjwYmI-YE9-+ zT}GK>R-B1!=MVr}PgGN{65MGC#UabHac{fXJkosBJ(&nW<6PF-z{R+?n6m-UUFE=Z zmYy!!+S7KE`|)7|!!$R@bM)60(KJm{$z0RIlmSN;8?r*5Ii)t$l`a;(&SMxMs7njX zXxSIJNCmf~@gPks(imCvH6fEz!z%AOtO$u{k9pw@Aepkszv} zCeImODBx%Smcy7(qXZzzoX{s_E=Lu&)iSsEz}7RjM48+8VvvKJL&KA^GgqR_JyGUr z4RViNNq3iym zKZ-g$K!VN>Tc8G;_j-tTd^jNBoHnf*`#2aCC}laCzg~>-|1OOG|It5d7E>*){6>PG z3p(d2k|2?c33brMys5s6G5+6$JJIPih@B4coFovx*fUp=q(e;30}+eW_~r7uMq5} z3Km%pL@M#&#{ie>yH#9LhyPkd#5{6+_lVi3Su8e(vNUbuxPGA=gvLqZ{qCj38+VEK zspu3%F!T9!{{1yR1iMU~T>4y0?ntU<3wRv;Y>5m}fI|Cj zQ#<4lRH0fr`^!#n$kNtk9*T>Y-h7*)eXxkjj#XE3?TL8DyM6vtgrJU72+z%uLI-Q? z!jWE^#tO64?it$MLwgAzwP|Q@I5MS`+!U@eOaGyWCp{i+JUY?O0YykI1F(F6{`qODDic zpxTw+Fk>jPbeB#!gbTKCZOWHgKZQuWC$r%AcBA|y5^FD|=GZ-1xfNL=P|}M}1Hv)+ zzSa5I9q=zM{9&&MtlWaucBYlE;_f=}Qbtk^BWjOK-?XE^lz-&jj|(KCBkwYm8)rmoPDk2hZBaUP zY1#p2Lj4Ld6wXJ&aY9AeY-l*loY_5)h@W5yShVpsT~xF@l)ki0+cC8ruswXDsY(9C zSABwCJ9PwLrgWvbA8Au0IC4u69GRt#Q&u<@EV+NIP=5HX`j*2>ED`)8g2NFA3jRjd znv+o7-)LwPB>6k7F%yL!|e^-4cvy3b)9VY*vrKMK+ z58)rAxx_^o1ZyxGUv{3tkGjftVOK2_76f4l{ze}yFDV`KM@*!}s#qq!%eXqd=5TZ` zoz%8*9)}}{Dd7%Bku%m1mn3Fz{V|M>9?#4O*t;C^Y5`qot>sh`|Nghl*{F8uMvum& z;nEGQzZFBom1d}C8wTMy<^!EAosP2+I~{;o8uKHEX|Q5b$A^D7geMbQ3qja?Kh8!h zo#Q-XT=|Ks>`nZDx1;o?=_VYxfiV9d#Ut zQ+;Z6fxok&zhhl5;PXd+SFiRP{aqqZ6#ZS@X^#G0rP_w*?`>qRM}L@@b3!m4k>6XGZy*%47ne0GN=SF-vhmi&Wy`wMtb=7dg%87p>tN~ zcU3E2E%YPfd>-=f?*$&EWEr!+&SJ<>=2bOl)@ENX{JSvGWLZhDcht$UbI}&OA_GdK zg7=b*-K321&|DG)K8oh9Se$N#`AN${SwgqFfgYFRxaP7Eh+Csq-Ox~;-Nqd2n295>xR8%eygpEgiX2*Woaw;T|4r5vMwy|zt 
z{YIQ_nj~ikd>CZLCjC&G@lg*VFOBunQruKQnV7I>NyV(#1Zd_0_%gMc^^AD@lY0tFW@xQ6!od*0+YBhoTv@l2LFU6|%!oZ)tbi|*|I0NiRWt5K0t;c|g>Iu6+J#aiz=~#_yd5lZyv<#ivC&>EEf`;!6b+i$ z=&&Gn+3QjvYgV@IoGl=zdW6EGSSymCo-h8#ftkpo%Kjv4!OwEec zu#s^7^Jc|@FEoJ*yMZ@qhdD)eXylDCK)+UAlUK3bPqv0hQjfNt1`!*lKT5IalbSkP(452>1HPBHF zmKl6ipn1cY6OHNtlWxhE46DPx-41gANH$c}(2>9z(a`IY8Wt$5C)8Ot5%H|vbZ@my zVYO20TMMfO{b}OdhuJBJaDZCHp=nhCRka$*1GK|zHc2%UBT#RQA@+<0nd!$jM^Ga7X#flBe1ur3PZa!UYq_1a(yB;8d|n_ zjyGs%IUWr^wISb!R>*3!PI9v?g6n6C;TTo}WPGs^ZR`RCEkf&jhL)W^bLKX*OsH>f z)X;KPEcm1xo;M)Ya zW?{L06CvSg($6*dUnB~=!o!2GnY)ND$m!;jQONfhZu_?JnFz=-GlP4I$VVg~658H* zorHA`ElDCK{L&~MJh9Z*=Lh84q*WY)*v!NkBhvo6T|!E4?t0{n~6gD!}$HGy~j7bE*bH8eD8O`I%)5 z+dNYHvtX4RY1pXDfen-Q_K%I~{;@$bXB%v`_({l&Ezs0SaWwE|;;7J!qeyt06GuJ1 zqU@K`ykQ(^U$yJu7Rf(XiD9PA!M9U5b09xJmKfS7NChA(krajnknxGa*Udfg2~8dA`GLmlv-n?>iIW?gV#aifmm&Z-A4Z#vf8wc z4QN2EW2RK0+D2PV(qyJ-KFXNCAs_v33kPqD4OKg8gB>Yo1?$FVX|u;LLwenyHey2r z+Ig5_oH1Dn>NFe%#?)vR1w{{Z)BPy!atiSUfSNH`rp+QX+U|AEpdpUt%W6hpQks165{2;L$E+_IGeMU* z)U;+Hv87?DHl`Hs;j@}2%oH&+v{yHER1+rHAIjuX-O+zEV^;ik@m(gzb!G>pqaj&c zS}-|^=v+j)Sy_)%C>}2@{e1w2bGPV9ci@-#sQ-unU$?k)harqLJcK36|4E%J76cd& z+C`C;ZqFzNe3EKV#!vE(<1~-{?nZm$;bPzl(JHVGq(9`iUDh$Zz)@O(db6=RfUJ`r z&fqG1m=*th?$GZ{dg82^QWd*Gsg{?Nt~y%B&r3?fN9(|uL2*+!vSM5l){ARR6RX{P zrxljrPd`WxilFOj7ff=rt2DQO1c0f@LJ)(x-6i}5-s4K2=}lrUjH{~&{FU>%@H$0S zYL%G5On_piD-0eM0NyfhY*{H|A*G@&XvaNHf}Lyd7i-C*Wtqb_4Eum)N_d*Oq;%!c z!2^{eVG@AOtnvr!mHmN}Fn_pmKT+X@qJn|vP@2E8wJbAS>Lb=LtmPIYo+*;HND$hr zL2>Ha4jELEjoLj06pe)^IumkLl?6K_UJVbXK_>~L(2yhGv(4oOC`dIY!kaKpmEldu zrMmFOS%j(#Zya%P!0jkww1s*Z>GM^`!&4d_8RA;aD&n+CRkpq}0c5XGCR?U>hJ|}y ztFVJ%y>aVM+n8xUBpZN|EA8~px}}}G7`V2%vMQZ0+C19>ziT~<0>9J9Uhu2{IpZYH z5&!+Sj)>5&xvP_Cm8X_}kN-ZcK=%J0|6K&q#!Lkn4OX3rkfPb*yL`D$xDw#($SR(H!yL+f?=- zuM^ULP=@8Tnh|31avdE>vA`8Y@X zcO}mo|9yLi|K39bEdKiq?^27{UciF1Z>jPjSGvx^tYk26baPiX7|mfgDjzXf^AxBh zv;7fmttPBfVJ|iLq+Q!(^9PBpqHN^R?2$mHjv;&5Mugo9CF7+YexuvLWT)tPu>i5? 
z+E$-S#gZgdS-OZ9pUIm*C1ziz{OpoZ$ajf)!dqmV3A&p@zgwI`)qKMpnX7Z9*6)N^ zre=(b?WC+R5z7Sv#0ZoOF*{|Vi8G(uPhyq17%NPR{VvN)tT7kMdeTfW@6FI20;ejv zAjV+6`%|szy$n9v$a6GehgFLn#Q`1ikN)|jsov=u>~_DB|P(QvQ{mUoE4yo(~E(C@Ov+c)(4TQ4LG zCS4VT&9`8U>GrW{1=rUMT$Q=cY%-^48qN*5HldqMFo0|R4FfQD`8NprEx=(ZWEa4L z2_SH#F6T%SVNbV%q~OV*$>bjstf8bgcrqwg!ao+GrtPdeeFHNsi;eH6qYv9_v{OkH5v@`%{ za6V{jOdbQ{=5er_2XBiwJRQGzyp|#^uN2K>&l7C`)!v)x8MX6^ntYRTn{UD2)rwUE zpmHrmyE=iMs-=`0uy|8F<3wY!au4nPwZW4^YXaCx4M-uRRO>0xEV14F0B>r#O2j+D z9VKk~nK#|V)n?5fAHL4+C4ilQb$kZ$eOqMe-{>-sD{f~A;)0;VJ|Y8ubTAMLFd|`+ zrNUeX%!S}QsvN}_w3}Gxt1`frdqA-&UjYi<;eH2Lj!VE55Q&xr+P74G21{(+aesrw z_s3T)q&yAEj2_>Pv%t&w*bDnq9=QkPV%uql`44Pff6v~%CUY(!nk^R63Gf8{O&=Bw z?0}EQ$Bfv7X0Q~*PM!kA6&l7Ox?pHyZEB7=h{g+sC>qh+cw>80I0+dP}N=T{JQraRc8IdF$p^ZcWkFJ8X^y0_k!@p)1)1YP|A=mdz zt`BXMG>}@XP5lE{=H|2<^w?@8X5HVTDbDAO+1PVSi`UIaub3`jIX->6-7~I zMF&*A?^k%_8G!f=+woE7p*e4@fBLbR{C&hE@ccVX*t+$9IZaILLlfnFXhKT~ zR3~jVDvvQ=SZ2{g52ZB!hF|(CO81&ylNAiR~I^}djc+rhCZ=<87Q`@o~4`quRvO}BOxV@(}J2V+f4L$K|@!@|3 zPgz}DH~f8nI%M#hye}C162Gm3C-Hme;79m<%iwQGL?1v6)qi0dAQRjO(U~18>tv|^ z19~8;M;nRfR+aMphLm0zyYzo7O6iVw3|LC76e*MIJ^)VryW9okY*wVxl???y*FGtc@)eL@O>fGR2%73OUqo-5xmO^Ie z`GV5&-c7@sfjep!W|yxM%q7(YwKNuS`Q%*zQ=bf-dMt%i&K3nL@F z6tlgJ8W*qCxU~AXu(fQJa*_>nYcDuvr#T>^c_h9{)Ctk_I???0+sUNpC)yg10mi&u z)`=)qx8*ygm|IZHCMc#1#Z(xUn~hF(gki~Y=h5J${4N+go8Q*K9KQ?;zi%0Qk3ojm zj7Clnfrx5S#1Gw~XDCW#@rJX*Z-+Y-`&lJg=Cz3h6 zqQq{|%vqwD&yVwS$^<`4%Dh1|r&M@3!Kg;HT=X~0^IoSKVSHbzIWJHRD28e#7>wxp zneX3lE4RY@Jr{4FWEcjy3>a(G39L~pVV{55m#nd({~amG*VA`c8rR|7V@mLtaOUj! 
z9Uo{lgXOE6MrB41<6d#tBVTOwxoSi-XKQ_+4 zEx6pRq+$LHsGSrQp2|3%=EuGt)i>3{M842Qx}zcGYnJl64Jmz=@~(!IFIdWdXh`9* zBu3_~4JjvAGsj(NU32Cbqcs;ph0hi;P$Dm1ALr+*;*4L`DF3nO>O_6WR`^L(Y4o?W zY+X3c&(}nMeWJfFO!0HF=F2Xc;hWL5`?%I{%e0aWN>AM z=n;eG@Y_1Ll;1-KKf&)?27ku@V-MdZ7PBG&$WHpZQd9X-hJ}S1+}zH;avNRytZ}u@ zD21Hgu#}@3QZBF*AbBbBoMS0RHl%#fQfiRN^Vw?dxMAy^bH{~hPm&y4;f|XMYRHOl z{SFv+Jh!apUX9IC`?$QKgiMu$Piu5mYIMG#(YZjQbB;#miyED!wTg^8Y5`(&g!=f% zXT%~wvY52pwN&(hDDVz#7(JlUOGwR`JXq?O*3D-GAvK1Xl-PWA6C{!1`-+7U?6mNO1s}_BLj(S5!<>I zsqP_2n~~Yf$V|p+0(Yh+aPx$~%`XBszpaB=er)UVZ=Ts`)3eS4h#JQ7gaXL|4>s#Vl7ivttD6aK2ajia$$Y(VoOSLq8dV;1gB;S~V zJQ$LC1sDCT#5o@l;krqfyRKCV1P(@8dqJ&YT(r?9so14biC=lm_gI*Ia0Tm7`PsE> zo8qtOu|XQVDoBG@nKZchFZ^z)yIHat!w(FfzR43rcD6)aLNy4BB2vgOZA;W3A_Idx z#;5uYtB*BmTZW5!W{cyZ)+mzN8y#TD z4xXovh^jP+^8^ifcIY{yx54&trJmnz5##TcwaDhCcmwN-TWRp_#uAT>guqn zQ=MmFD)n^I2jhZF=y!@~Nn|wM2RVE!N+1)wkCYEIq`><~d4EF+ypNQ3H>7;YQfh#K z^ErlqSx3hP&s6TW>E{HheF3`LBI3GaoS#1v)ty)7=d3A8Ez^h3SNK^Xte-Rn3k2)m z1nd1^{VQPoOJIFj4fsWWFPd|#8ykGrv)o*``8mh}f$j(BB6vG#W)>~6;|0?MiZNOe z{Y*sU5)=}mU&!U-P;+LKsY@1dcB7Uh(nrioEjQJuZZ~+ZPbxy{?O{V%&Os+=TlE=?*); zuyoyMOBWk}@>0kBt)OI0D9-LSq>Wan&ndw=9qw2b>Z8 zQ0Wp|w@wfRT_OtlGY!X=L_yr43qgs3KBu8rrgiHi#^T$I#Wxv?e#YV}jD=TV>sGIi zg)n{5$)bo(s@4VU70?vFa_8eLPp>{<Dv534vm{u zFsP2fF{<0r((>%z0GR(WH&IcG{-r*b`fTd-)M=-mnz{VBt|1FGTj0arv7OE=KB@Df zvqqO4UA&L8eJlvL6&hiS))qO5C)3ECgJ&=`^ay)===W`+=#4X@i%~0vZ_xAyh>jQD z2-o&UU#Qii)-WqoD0BI3YQPuHOr4QBGxf#Pmu}4#cjZRUTRwVTF3Z{dx#`hMGTG77 z7v{y_DVG`j^uk=3v98QLF75JjE|kL(j-Y3Y~|&xa_XOLqh(Ra}!UZV-!93$Mxd-M^iK{RcQv zspd+wfkc@!Y@%SQct@_NOFAdB04`#-j|R-z_)w;FxPi7XK_nHoq&|CIi~a}Rmn(wT z3PC#_XgMr_&lNa@2A1F@97~+a4BeJUeYUvM%W0doLKYy)Wu|d1)uoHCSTR}YcTp}? 
zd`Q+Jy-X*8kj(;Rdea}wwI6kjhUg|5(p_B3HRXK%9xv(Q6a%BDYqp2Fsgs@-tHo{C zY(H+~-qMO#yqh4^+?=_%nu;%7n~Ig0T^ejpr~-FD9rOWCUA}WKC(SE={ge6=|NbZc z&?cs;;Quf>woSB;TCe6kJ9_iEym{w+rl4J4^-1nERU~A@J6TT@iG;INJ=0h!^4C!* zZsuHCYW)Ww5=&{zA=&MZX1I7#h{0vxS-!Q8%K)X5utyfWh=PR2mYs`u5izR|rIZYI z)(Qe+-Pxzc4YnUCf;8llCtci|1zINd@X#~BVZ!0Yf&amE?PZF|e5mvHm|d}rG7m}u zSd(w3&xp(bVs^< zx#x5W<-J$${9U2Ha+w$)Uz!!Ez59bC`Pj@Eh`~MdHjZZ$h4xH`)y7Akn6iM-*+#R^l-Lx%)7Fs!zV_{@!^lW zL()GJ;wad9z7OPFX*@uNGa4iASKZ5LC-K#I(UB+SO4p=N1joPohZM~r#h-gl{DMnG za8H;eeS#K|a)!d!UAT=IFxZ~q!)fhDa_KIs4Gjblq@3NJ$#*=Ffe&WLd4D2b+`u_a zf>m;0dy9T=ZP8uF=j*fIEg2s+wxsge67x{~I#;XedubfBTuva7?0HOLiEe}JT67<4 z@`VpA)XFSxdv2p5HHj;Nq68^}f;jb~;y<(~g$q45X{Yu%h_zYD4S$(n79>&S0CJq|ZspNbcKnJCN+IfUZSm zy85VTckF`Z2fZtFL-!|lEwVsmkMn4kSoYf!p~=);aQC$GB+DvuCqH|1%lZsMr+<4s zZXLjxRF|NuCYt{^{oJ71aKQ#9(G>W_)sXLHr78diH%EYKi`j(%I|x|AkGF(i8+?Bh zARfXOV^465B$6qEpw&DqpCU)UYV3cI*NfG*u0^`VeK+d(DOKkIpYBlmALpklSs@SY zx9jA2stu$rlb|QmsEb|I0!o9csNvR!X#wh5WY@zJ_Do)4^h1%4Xe^Dtm9*`LsEOM8 zU8tM9b#Xr_f@yykEF1e21kk+*Ts-Ao0S%R(lY~Qv_Y&+M){{+TuHH+wEIBZ5t&JID|1MA83P3HUV*1?oWJXQMk zUQCuS>l9c$w2^93Kfl@E8gY{wRP31t}Ffvs4!Y)nv!cYfVaG-5rbMhOp*(3Osq86aN$49!*?=46;=?w z#1s~&SD3frP<|&nlb(;ak@JK$e$H%(cU;t#BM9vPN>=b6e_w(AwvdUPK7Cx1h5Hx> zQr#60p{Q;g2ZzVLBlKz3=gD95_dwaAzaou@Z1>Bs!k*VCJAf++^lm7I*=&;^iS z;nt>wznNd^N4!{({~$>*25}i0Zg<9rFgJ*I^puqeh7x^VO)qa%ktfMQI$w234A-fE z453}@p9j((T?jv(Qb|Qa>`K}_z;?^XWAJ1+qekjB<+MS38E%~v6LMeWz+l3#J)m^B zfmR%vP7p}zXlT9h;+zm1|VN* z{R?}hU~2IlD?i*_de4H))!Dw+U^Wj)s>onPC3bTnRNnVeCiYmqbof8RLT*!C+!qgL zkfK2{D#|JuLQ8-WyV$^A7^I|eu^YHj>>-4kqA{n`-l-r>i9*|S+%`sjYJEYnPV5m~ z&7LV8_U>G%^>@IBiC=^5T_!CLA!5bA)Iv0b1xm$7K95lssVibMVnv!T$i3+pn+4QY z-xGO^+>0{lb;NHwb_Wolki`!%xDT3D{z^opH%iaM=xUB1#M@y*JZCoJRzZpiHh*W^ z-o5enC6Tr_T(f;|@m7Q&M8Vfm#rspm^5&Px65<9ou-HRju`CYRzWXKEIg35X1-lAW znuI47%Y*Od)P2@hjZ{ z94|oL%XVxMp*>*>2vjt%8=hMcVGPX36q^MCF5)5v(|~jvy7? 
z+84wF_84cB4A!+sYc&iII#7f)E;BNQO#}pq7{(uK(EG&CM0v_!W9Huuy zonI4)o>+xZ{V8nrPmuDA&0eoz4(H8a;zR1DI4no2Nt(75!0=ZP@R;>SjdyX>L|fvr z(PZ5yCERM#$$*oFL*EhnMP89D9ZRPcACc4o!nz;!ps4|VL zkfST9mCQN54Fk)Gas3&DEMMVFG*!E`!ud)zP0&`@wl!HO(DEYe@Jl1*CTuKU^W+ib zvLG2m5z#V88*f!c6V$oYe)>tZwxaZh8L7UiBy?J*K(5PCh{0LUn8jKD3u6^H>y^!% zH893WXN(P8@jv3R5;4Z~;+>a$F;zPLH^^n|^p5xquS%6-CZ|=hO^ZFkOK-Mk7$Xe* zvJXbQ^uEAL*Jol6H1JXw>A$=wjCAwA>~y{(wzYzJ%2vM12pKg&F;iThEGxTL^q%T_s!1bBZF$N@Ed{Kzef^I6aG9Opx)ev9R zRIzoaxL!-81}b%`)MC9gm07Z`Ekmm!T}-cyCkw`Gm6;aT<;9(!sjJP-RZ=fgB8k?_ ziZ!UpOg$Fq%Zi&AXX(KdYWlDI7 zz77b|jqL^cdHHTrmt{;{wlc*)75K>iTi&|qpJEps*InGyU3@hpGyjMpz3F{~p?+U? zY*Tma)ztbOsr9dR7uR$b?|Jqe@%Kr&6akQ}NQ15|Gy2m;U7O~XR4cb{maYw=Fohe6 zwxy-_`~&r{uBF?%OdUe7@|uDzDB;ka)kHKVS{uMnmI=*A%9~C*Kp@rVeE0Wc#**czx28mDnf6o4BJU**GPo;66%JmkiNg zZZv^0F8#^t>Di8K)U*9MFU3uDE!(}RBAcTdPHRu3i%-Zg54O~#YYX}{Y1&Gy?^;qi z{JT=I{Z6D}dq3h+zVA`-&kAjnB&JdAE-h_VvB3*avEjq=-QRCeu}Q)zZk9$2u1Yam z)?B@&2@q-2ii)Fj0ot}4Ppq|@@*SI8$A*x#TS68Rn+aWrUmIQMYGYyhBrP(S_ak)0 zgjxa3mM|4GEn85*VW27LhE1;RYNP-P3o_BSS;eJOVONV7vX)m#T*wX2^3#>nhRH5l z*H$Q!%e0Z4*`)l^q=iCky`)c~cv%auHc8y499HNT+56O5i(tW`{fY#LX)c8eVh9nH z_IQ6JrrBao8Ndm$b zokzf?qco33Nxeu$i!rIrx*|%^ow^oLcmEoFy`K*%Au)_zJ})4@;HN7oIm~(;#7@3S zL?RFtKib?9eb6hi_^s>`wTGt~lx!~6bS>IT-f7)pZ=xEvrTTWUYmsTWi101d6njQ` zn>G1Md68o2IJz?_t)eknv>KI_urI{`yLs=9D>!X{x{`N+yfPnUOF_d;9YLd&m(49d zoAjfvG)Xf6rJ-;nUk7?)3RG*m)DB&XkTS{VcMnMTBFo(r*hc-X8a3UZL@>7*Mr$#> zP92eA?T^u!W)a3F<0H#l7)vs7(~GNh4M1 zoDK0Cxnrj}2?;EQ;YZ0<)3LoQQUnkhH6%A%G9hnNZl+&@5owaKDe3=(bZn@kR7Jn^ zPs}==YzOxgPcyz}GNqa6pMr`_s@|Cg^m98uu5o%see(T>F*B*en^ZOjtZ4%xdLA3QZmGk)uC*pl>v2Hf}a#w8A@N17{sFZKC#+Hj?U#6o#nW=umy&D(Zj_ z+#2OP$bsttvyqOJk>jm-h?pj zeO#A)!=2FFebT!l5R}Q8iM@!{%`|L?GPAU8>BW05leX>HS=zQ+Usv0v9U|Jc!?D2N zb~5T*Po1wdY1>{jZQFa$wsG3f2YcXvzph|I3dE>P!VZF}RBs4aCU6DS8~QesZ}X_! 
zaKxZn;vowGkf8c!@_o~}*nLvFu|;}!P`Poc@!Pg%z&)TqiTKF!nx^f1*R(B{rma`s zMI0lKuzQs7)g*lXq$YfuW(`1fO`C5i)UR;>AbDwMGQeDziOJ}jEkl_U3|+>of+f7`;%(ifW!xTpOu=P9Sqf@5-N=jM@4O{q#Wnq#vtq6D`Jx?{u|?cqJkxb= z`3|PmrEqH)@XMxlgZ)b3W*i`2l#~_jzlaaq)e;}xMQfFI1b-1$VZM?Cxj~FW7l7<} z0Lwt*wHtXu^=9X<0JF2S+4?tetFA@j#i0&JSDK-&q)tC{#g3_{>>igri&P^TAER|s zQXy^P9;s`R{i%0Aj&O%(L%zpW@+Gbe1u|)AJf3mT^t2zmkz@F*YSGZoEkW_ zL2{cjQX7T`X^mDznnY!cv9-`zu7>GTFbx(LpeXGPJKq&hTd#(n=8glRl-PSBF);IN;M5}2AeQ3R+MU~Q|HjC zAyY?MwTHFt^&Y}B{orZsN6n#CTaQ-F<5r?o)AyiNyV4$-tB1NlXtV3n4s-Qd{QFzdrv1_S%>S7fHPx6_)F#!~l{R(c z)&1Wd&^jK>(2h(OpHItxi3uY)zismQLx!H%CF?WwdAY!#?jk_aKTrX4zXkgFQNxNmk&wRD0-R4BrQ^mi98JmhG=D46=}f~c52+;d-+o{M58l8 zKb{~3Qsb)>K`Gz&i(KhiGd{x(Eoj9urARA=S=x!-S`n^#l@L89MIM+zxE)$J*9ZK3 z=4Zc4O_%R+>$4-8v|`2k%=GMx_M^Ie^nGfMOxZpb;=xYS{vzU@MB^i$Cq^TWL^<40QYmcdZOvtDC#@R{`)Z7GSNt; zUAEu0zN=1}PJ<=7S@EuefFBr3B6C%5%5NzN=no=`j7EYb9Yc1*1f0Ot{! zQnA+B)?8BsRRJF)M0(>*z_9-g8JaOOI}=xvbj0H6nt1WF*=A=!Gj{2r&6=?;c#dht zjO|F;sAz8}396Y`a2#Tb)^xo4avqRxkG?wRrjwF{hRM&E5?VsqE*0L&Ci zTY#EkjN%QZSZ>SIs2F4JqKux$1dWe#E5`VsJ2>Z1jNJ~B>-%x=JeM6>P>`9fo=#{p z7TAXMXE(*nu0gv|K6!ubY~m$-qQUT5+OV&FjIX9Ve9?AVk>#0n5(1Rv$9^o&_R)_~ zOh`vB1|K!jm#G=*O!~jVHu18Ki!ACiWY9m#GjBKTc?zmWac1o%(TvD!ZLkp(F9SM^ z&Mi4?1dV${8ZrtVO0wQJNlA7> z>G0qB-=HMByRIZdRy zvR*ewi)14V2(B^_1LQz zqZM!6{42iK>9tWmc1xrmyT4ICCLU~Tw~7DmM?dz5uvX(!}y0jH;I%`$-@ zBbvzwEvU!%%%_@u3`Xo`X+b|GeizhZeyE57ZQK6zV^Hmb&H6C{7jQJ;%n15;!8xp{ zgP=m413E`)v`ee0#W+cXgvnR4M-z`~(uN734B+63-b|Vh4nf^Q%{Y|lie#dNI7M49 zT9aysCp&R7P0Of)7Q!0%vjPH_uk*s;zLZsv5rmOZG+q<=G~=oK2p{@uLxx}I#wEIrvLOizZ!BI3NRC&LpV z=*jNezn<(x*OUDRw_2;~$vD_}T|L>&R!*J=4Tm93rfb`p^ki4ep(iW8D2|Dm>^t+S z$r=pL{?gTCEv_cJ!qsHEK?!;#ZNvG0OZu%-{}hXl+@z3n#Xa4{&EXsB7R-XS2D6|& z-LcK7^;|@XS~OUIY=u9lw*zq&}3gAU;M4JZyV-`jp!OXR;A6nuy;Q>n%4P!~R&immpib%0$?U-4+v zY}H*ysrXfs_%nvbH6TN?srz^DwgyVY(xYwNauixBk}mypFT5&s`QE*}#sB=)PC(oQ zoR(+bUW?C_8rl>~7b`S=$H=`GZG-Z{S)`8C&A`N&p!+JzA+4)&(gt1brAHdD3v*0w3QxCp z4?iCt`Ef5BQ9LTipdfw!I7y&OS@ogYOc&WByzq!LgPILGztA+3hb+@|it3~q)5J*? 
z*K&$_Xu5RxTM7Mf2$3$)4cu@?cd7kt*!%(Lw08WxkNyEgT37}y&kzW*VG4B5prz((f+3S~-H1yj1~ErkD1sh@_NL4bp9vhM;(SRazeP;bV6OFBCd zbt=B>1vPSn&44TKzrM4A;;J!V2k5RVS?FT%8#V$ln7h zJ(ogSH8d|OR?oH8t!q-n9p<+VjrP*5_zkDAeDvMR5S7JQJr{9W*vJUj}y4 zWst+v8CWk6(+P}*$8Dt9e1+0Qr1(sSZWf(&Q+JeS51J}tM*P=a@tjMdN{@Q(V9@6Yi*{Q~>n zb1OWV9poOl9_-noueyP|Ymo`8NO~0drA}1=Y7(JkW z|B5!+65M&eoA+hayo^u|-wQv1m>9NbXl-T@meK>QG8;uX@Ygm?8p2VOV|S!t53qh^ z`}VNi`lb9mV?Q_l(-Sq@SlqZQkrIv-U2g#{V?nY{#fRS3fX61)-ebog}%>r?rt0}uC6Pv`tb;hiD`_^S* zzrYmYzb`4?IO7J(K2f+>qxf{D__(&sE5vv=G13YcSKzO9gB)%+QE9(DklM>d8cz9k zF07oI8w%@SC1X4kTdF~%xPoJZT#uC>7Fk>$^9MhVi3o)WQWG2f86G|+tVoL zhk$KKc2W0QruKF1<0Of+3;eB3$$@7~EqRNj3Mcw%BoQ9aor-7bE%NWwT-V+erz0a2 zZ9Vp2mD#mOp7S!y#f&%Jf1gs&Ed`5L6n`^hlI#4mYzO53jp=T@e*=v{n}*6<5jshb zULBLI;617Qt9fC|<6E}rJGAd44;x1|bTYBn!xl>f8JPvs4%x-(E>+vOy4wy06Vs2Q z5VQD@Utr6#Gd}VWh(R&bL=8IE5xV;w8{Lby5@q@%EhUaL22XU6>+@I56bOWgnJd1S zjv>#i%k=HkX=TFaxNbnOd5UROE{3bc;Wxvdg~_GGt8BU&l?>exOUJf}P%g>!K?#r7 zh(rL)iol5E9&)Lv0zeiZa^(u^;^8+hk+A^;)4llRm7l;Rr91YRH6!2mn5Z=udkp$; zK#+6kuyyP;l)aIKz5(xP=$(eynJiY#Z?Y#}0RaN!> zKNqk_kvl3D73PGHkcyyKgJ12(=#1Xc8Aa2?(s&p!NMt4)6)g?pK_|RMsbyu!_xrWJ zR+LtjR+fVgfSNWSDq<>XDdQt5DLw@M_h;>W?pbrW2k7DZfBpae*Z;g`?mhQ&_S4yC z?X}ikd+l`&pzM?&MeA#&A+$YN#k1YW-UgG2h!Abx{<8$d95Qgu-pP>j$^;3kreu4d zM)HzUS~n>Su`zr|SZiaOwZ<0q^@d$+t#U3jYo%uC@KX%JFmdzV5$DBPp&33WTG3i9 zKT+17NXHT#ptjjzr{FhMVdxx6UVS?>h{uW~hQ^JNFIi>E)v6$A47VrduF{*=^k%sGDeEk!CECL8FT?F$&Ch=?1a9n|=TpP;A*h@*suHIk%MYVu z+vhtPADD-rnFkwl_n;Wq?p!1BhT`bm;lTr$w5ZG3|1ZA#Yxqgn?%T$8UtW0kqk?F2 zbA9e{WA?8z5dH>>wq!_(x>!nqQhF93UcWz0sK>!EviTJ#64rpaVxQenHF7JOi-Q&* zvaCii@!289KpVWiM5+nLhWAHWP z2?(sVJ*p{>LYE194uTbbX7RhArl=8fAUc32)xL3@?8$GDecS!|Yz@|2o@v_#lA)(i zV&Dm!pq?a+)>=`(OY;{BLq8#sX4FqaJxS@cL)Qn2f%*tEQaeV*Q%+H!BICWMDaU4;0hbpGOz0D)3mw3xHLY8^w2~J zxih{#Hv=6V6@Z{anss24;(cq)9dAn`*Axq@y=E*qy`F1>G7-FpuO{*^cyBTlM&EPj zQGILHqwjHi<3_ulv{Y~Ja0~Ms9I-S~F6Dw=W36;Lnifvc8`47HLvU#|I`ENdd}B>R z$7`DF-J+Ulf#xju-ZU{3F>TC}h9#s_HT%59JRbfA-Yf6

    I)LKo{7&bo54Q~`L-DoQ(S{!#FTBH`BAdoH0 z2DO)gbsuzEv)IcaWbT zJWv3f2vI!Hgw7k)gvORGDT(8oa&Hk|W0Aov1Ogd{X7JEOQ&5dzmvCC_3oWb4^0IG% z@>i8*r45NrP@h`~r)4Jr_R}W99OafmE-`jUzf0|$x(Y2!I!*9OhkaI@)dnh4ypc`0 z&4~`6X4V#pqkkoOU{i&RuWW~8C(@Ne<^ZAD7FvQKMV@K4i}Mx@_>N7~&rW3fEHDmM zaQ64P1p7b@KkRq;1yyw#5CcS4c%GTj72_giXi8>{SYG7zHTYqPXUb`eEfi(0a+_&K|X zD7jUh{KW2ne6@VH`T2~IfRZ5|SVjgs)M#Wf<-sH9Y|X zEC zcFZhuRef%S#?RH{WAtq*6#L9YPqx`4TeJ~4rudlUKyMdqQVa&7;zT=!RCoy`+Z}Gh zf?)!IE;6!T+0=qg(p!UGvopy{6K-9|j+1Z-)u)lYW(Y?1Y)82Qvi}5kH>j%E&j=j3 zij5+2`TAm?f%JQcq+ec@&}fO;14Ty)mkD`OPECw-n2ZI@7fbYQWSpOwa_vpgNpqT> z>;{wzGwEQ9T0U%?5~=&owX2q9Rp$B^$-IWAOJbtwA`-v zM+vE8C0-lovoZI!Hvo=DMGm3oQc|A`splSXl#aQs0;T($qV)Zs^xRrU=m>^QjoT!2 zStgj{8KIj3<_SHH0wZ)$Roxc$cU)tHF2jl~Mi_fv3p>|hoId$@i~r}1%o};fdw#)w z|6yEw_n_|Bs@hYf9Ra5enSB&H)48~`am>==P zBR_+ji-SHPbM`|=6$kZgY`ItLd_1fa!t731Z~<766bCJ2(rDZqiWcV#o;5+~;s%^(Xk}11QAFV)($0;U@-~Um?+D0e%>=9nfoy_$VqXl zQB2wm!sp-5R8FsWs%B!S3|tQ@Ve$#s@Uu;D^ha{>B2QUi-`&DaXLlaPq~&}^)p#jY^PZ04Ow z-M3$-iSA8SRb$J!1ok8Hf_IjZT~bqig7?n8JW+m3_ z*5+1W;Vrb>-jI7;$!?@{Y^+~zB6OSXjh~HNkK3Gd4=W5>(cJP1wD)R5?(XA?gM#YI zU379$#dPuAOk5HwZ|Jx~bFl(-nrJArya6M;pIu9uCyYe)%bsWRTmr2qe)32)uNY!? 
zH^rr6HFscBG3XQ8PBO8%<<4W`#paGz@QhGVVMF`VvM6h0Sc%6Aa2btqVXo!)qT1&0(=p+=o?U<^EdDQ3YSFm0B zmHIKu*f8}_UG9~JF>DDxZfS5d(9EEUTJqQ8>##ZaI$UeZ;UlT1p}0Uk4fXGNr0(KP z=GL&_b7Rr7z9*DW&s3XVa54xMi)u=#<|gK`XXd|V(eB&y>-qQO<#$;(_hoZjsC#3J z{2n?s;dP+1XX_ag%85e)4Z>mJoLyr(c&iAd$V)-jgO|cxmQn5si6;{`FP`DIZi%U= zYwd+GFD*vFSYpFXe0HEb7i2n;twV?D7qZG|y!5cUgoPHH&>vUq^F1Z5>*!YaA~WVP zEbeQ3<55ByeV6wMu`O#I{tVBXxrUBwWS5rgNN6uL8mJnE^~IA8prKzHGE6e}G5P~t zUfa14vaD6fimRbAknF{NKUIBAMKpvXMUcVfMHDEJ*dm%*-?51hQ1}alb>q3~-oZNl zQN3|k0g>jAxuHWgfI|l_%i$%4F3`6Qi`NKz<;!F8PYuBrluILZ zX_3rVB2>v3Bje%&a$5+MGgyQ*w=8dLA!t;v7bbD3=00Z{{&+IO7;33tF{hH-uuqgx zDN184af=K#alf@@?F{)~@477X7qwka)^9V+M>><&_(ID)l#CzJa@ZBYKbwXWkiL3; zq-G@{_4#l@UAq>~@&+7MH+eYnNBoHto1(lRu zKxu0G6Xm2!2zLd&j|;8A+!E1bmnJL?9i{cUm`4xthZ8Gg&uaHvSCSonF$T1utD^>4NauT(BKb zHlrt6I(-(_4W4B3V#{8uA0gwUb$px}R=Kop0LfL;6Db+l?t$BQJWz>-i!`EXaZn^->^M?^S$TTpL)iwFQJN zh*!d-l{_0Ld^Z*d61Rpwq;3t%HJM+-Qdd3ZGtK00FKAXTX9j5W&tf}qaqZ7q)|H+$ zj`mCcd#S18Wy!O#;;6GFD#KJBx>Y3YpxbE;_9ZqV*=r2DjDj&BlWSl@v3|&wWR@>| zOA?J>JdBbnO`_Y$I_{5MnkWM+3FrUKVC-GIzb5(r2inD2)~GptH*84MRvfT5&kPf< ze()S$CjJ$ZZJF5uC1d*fJw?LNLIzSFI`|&5PX#hHU<*tk{w!H|2T~%WeQPLMQ>SFC zWDdy|k$#!xL^i+AZSs zDYwk51tu>AnRny#UiRFrj9OIunv&V<-aPT0O{?VVGhLG)^Y7oRQGu@ylX(pD-F8nqnenp`pbB@ zqmbKhOvgO4h!y*c(c*6x0p=GZLr8K<(j$xqtpY>;(j;!ARbr=!eQPXZ0?{BRg{yUo zy~FNnvG~LH(Ex$PU#$Vc$PssMYC&meC{EC>;-G~p(D99`Dzf@xi~SY_b|3Z`e-24u zV`5sN<7M%Btq@D);86zqZhIwGALsU3JKmGTB3c_V;`>l^SS;Sju0*m!SX9vt%AaAu z{KA+>hD<66O7fDTjqfMx#YW&=VR1Nm7vRnY%?5WiGekk)!CfIN7C$K}Pxh8zzCu7L zm@f1khCxMNHrpKHeA&85jIpXT@n^B3MBrLKVNIFeP&lXBxof^`yklD#wmS3^(O>h2>xv5Mpw@IH8B&HH4%_-e?q;QyN;{@K>&!} zl}v)*2YsU;c-RC1^esV9S3FIE01d9Us2??kn`bzaYRu=I=O=M66ZsR^u{gS>l|}?6*3AYciLiy$ zK34gla8hf@Qw4=Dt>8g?Pn&iGVcVKu(2|f91i81Bx3IB^{5Yl3a_SDn1%^*H>|m^6 z5ZF4g>2^{@1%5AZYxhnRAitmoWp3@?Ng5LY!J=zCzZ(vXe`mV3XFQ+WJLuo3{rp?| zcLui4z={5y{5}Rd7}0K`f2SCKstb9>e?NVrM`g?o5+8U(v5(k3+bZM*Ifu)b$M$G$ zVeLMM2KfJLjk(v&SqvU7dI|jh)dEX>iTjx*Tyt+{kz@5mGjT4K3Ad*JFc-9v03dH4 
z@)EYXg`rA&(4r&<4l+5A`;*LN(y3VRdjokO2CSx~!=xW76{{xX7VDQhU?d+R3 zG$oCLAQB+-eNf)FtdtTsQd5@2q|eWL<3naHt5qQXTF{X~=YQRdWr-q=)oi3}TAGa2 z=cWu9$YxGO_n%geL8}dC3C*_h^A6jWZY62=(-^dv&d#AI{w#`cety5-Y0JT zktaqHevU}On;LS@StQ}N8akFYbUfJo=ij2yUS|6cF7D1TIluVLFy$ZeCo5euZ|)!pSQVs9h8XsHqZS za-Dt^5BnwTav(O0*Mf`cp+nT!bj;Dk(H|^m!GFEHBgJd5e%nO-=wiQvHE|lfWeZ6dEw=ivN%S2#4SiN;Ly39`3*(&)fSARX zGuW@QLKD>yE4VzX5^WT*y2>;HRZRIM77?{auHVhG(^~}K4=Wn=5`p5$;ylrZ7NziR zI0n0+xNDkauGvBN3QAh-P`#6cNQn&ar*(!at3oF7lU)qaT~L!F#d~G4qz&C0^|Fu< zj8QDaoL%iTRC4EKw993{Ug;6SqcS_9i^3d8!C1pfqeA>sxr;NqiqTNtLI1lDv*GXy zHGM4G=!qCTi(MJRPbZo!3L!3l^hsc%xFzT{0=7+)d{Oa0Kjrr26^1iBAF5j$H)%nP zw#a0Vz$N6cKpA?))k?c_bxHI*dr>!Nvq$aGRHWODVtmXcX}E4wyD=>?M+k}Xik_}H z+w)QZ`r-wW5avS*wzPd(7uncaX%Ozi_CRP}>#3C@R zm*_yTneWj2+o{B2kk}4R+7?5OprO-LoaAUTwP>3u&(7{8zgZ8FMkV#p=AjZD4t>esV18u*R_xyz^Zg_V5Lb<2H#;uzcKe_ zGTr6Eal8-((lnE>n;xH{|7% zCwJ)J&Ej8&4qhQ&rgeC@iB@JnA#Kmm45>G$Dfd?Cv&&ec>P&Qg({spYbA@PJ&1AGK$vj=@`e=_Np+$B_uP1F9VH5qzO7be)OzJ zgNroHBe_*khVa`DCK@Kl;AWyxv9&`jc3}urV-EUn8v`PdUTnnxSKtiWMK1DZG~Xdm zBWv{*xKu+m1g;mHT$r#V#kNiKI@l?czqZ7fRG(sSY<`Ll0%y|^npW8S1um7&O9Iah zOY0U`Fud-Bn}ET0ao--W9}_@k8y>-pgM_@4@dgCp=fvY#S`I3t1mjexrI#@ zYmFPN+M%3MMfb&nA%hc{eIc3KB{qdbhwPy~I*Wp4S(p@vvbLBT$j|?SMbS8o7vlks zM@`fkWVBNkIv!zfTe9?|^^5|12~z_(rCTqu?<>QWL$Z$LH&*1U69w!zyo_Q|p`2K} z>D?!L$n218Lml3q z*oGcqX23GrfSb*do&TVWe(x!TNL-JHCq_K!)QE#!#D1#bdN{km4x_J`vCY0%CY1_W zVuVPo(<~d`XPB9+3ek_PY522RUx+Eq8$Of{<;fdP7Q=Ay(7{X1U>JVSeHI=$Ll%v; z^=SRk?XAQ}AJ!zY)SQyzM9^#U^M4CFHJ4G7JEH1@(=0;;>KtpMMmszs3G zr$O0r!&HnXm_aAB(;dPP*&MipYRp+wns zs2+^wB4Z^bMl-D^qZzvcj7F^qjOH#XRfwSQep4};rQsI1|JY_aMur-D8DEHooylm< zRS&MVdn+*-07)v8zsP82!sZB3M=wkL8K3Z(HraqSuQBJmQ{Xf8kS5-1kQ@W4O&%V> zP4zoamISv9&J%G{#e5m;j$IfiF_M^{C@!W4V_wGr1I}U~oFzJ^o`!R3>YK`xvpi#2 zHaVsa!>dCE{71}+(}{tS(G`3+i0fSJcQIy2^X||r8(pl){5mu>y)rCQfXHL1Sy2fB zdpRbW$u%NmCTU{5vX*$0x@GSB${};O(Yh@UH%0xZUfxS8PjefM_*?jHX7xUQ11<~p zS>Q7+RpGLy{fZjyte?JZeogR~OY#dE-Y2Dq`5gUM%_40l?-5}-rE$mJii1r{vOySU 
zV^VGUqJrTjazt^EP5O9tvgpUrstF#`V+Bym&k*$l;cq7R$R$c+mWTcXgEfrcI-)CT zRXPlL6eQ}tF5wOj)Tj^a5`)V$qaLF?Oj?-GDLD1BG+qN6O$mJ}c}Ub!&WL4fU&V2wMi-p zvovD>s?PS~Xhx!e&#o?XrNf4BS+j)rYDx3wvzdaHB33^IuHcJk`>0D(^)3>rN?#gM zouBs$o}{hsQLm;Bf5p%X`&ugYg(}F022zZn19Q^vl#!`c#TyF>hqMKI?I<_IjcSR` z=6%GS-kN=Bs!O_UvaeV2!_V6>_Vp%P)q1k8={?z3T{ioYsm-38H7DU;@dkKia5D40 zUmorv*`Lq{k;sF7>v&16TNt zox>q*WhHvk2Zj~65+gCJvGi~(P8^9JPNr~}1mRF|sRD(k6Pj)8xC6BW4rQM~A?l^H^1OJyH`~mT^#U1*nu3AJD$YuB*(NWlFExlOyOvPaUT)i zl192Efm2|9UKM#6)PSw0spIXau%Lt_oCX~wVMtBG5lgLlRe8`f5|$$dx1K2+hGtFS zO4^QQcn_2umNkqHp)pmQ%#f{v@Ug1*_EaIs&mV4~Y2mfa9dCvAcpPq76sysUFbrXv z(lLZZ`&b(|lU83#B&wjr2ku2}YgDDJ+ABz)Y%Wl_F&bl1wlh_%1d+~%4AMccO9`{J z#oD1#qqH5HPhMTgG{7cxOc>&ffES>1_5vT9-{hlq=qlwk(@k)qe-x^pF2jr_GL&MG zt$5#(0Pz!x#eQn1&a~lgTo>JlHp5TZ!PpT)9<4l~*ieVfORb?##$bjlMv-DwjT*0E ztEm}*82La?NrIT6rqD`x7{pd!f)8y0ws)#X$&2q7hW~Oc7~ABXx_X0t2Me3F+~8sg zXmcp3DybjSAPB{7}#204LV6Wuq77Vz~02bZfaRp9KAE+nz;*` z1Xo})G1UZ|xW$(+Ty1A-Jp;Uff&E#WXX=uXji+5buYM>S+48?5Sp0E{rNCfKM|K_W z(5O7JuT4ibyWNASJkxVp`aSyxN}khO2>2CjGk#zv4%Gk9z-BwySMm#PVJO%ng3r0VT_0h!`DLOP^%OP@L)rDuqUs4b7z4B61BZr1feW9~!m%Vo1D7!{6E~gQpjn8qfY1lru35#+Laf&g#W;o~79}&11%%*U|E!|b)0mnRYg(yS>8E7A zHu-R?PgoGt*iZIqiV%19&l<*^G{N?Vw65Xj31YZ`F%ZB)k6ImSh7xfo-tEEVRW+nU zO9>B*rEWA1?bD~(Vov#u(!O|t32)+u*4S6C3(txY;7u|3H&Y>FzBE&eD8zVNt-H4H zw_(y}E>rfhZ}u+nB*lGHAnea(nXM1;sx~pDQ`yRdo%n=kl__UR;T5#VWC`(!th1qL z5R>Kc;^^Iiy75GlTm5c%9ChQ`;&@!25_RM8i$2@baz}mc0j*dUB=2`&}EDod0`)nSqp!rjEDZNpj@E0e%(p zpT$ro?&R{s%g^7^g{GCiVM&}71$%9wIAeZQfjIhd_ON{zaJFGG`OG0xw5m{-%+8I3 zE_>W8EV?e$B3|!!JWXNOn>#iURSmd@(WjJkQH88b<%Ur~Orv?-!X;)h)8rLT|-&St-cUf~>r1gajDxGX2>DH!>yQM_gWVCniv`h+-VJy_zTGB_6?DPFl`4rv>p?Qz$l?}1b+sp!VU-!PyeaZg?W0$Q6 zkZhtg4_A1eh>T&?*6f}SpWZyK*yqwxQZD?4tXrSx@%4FdN(A+UwcF(%a$J>>#+7Y8= z=qBBWh(qP-hMICKyH9`Km!|=VMY&hx`@f2AZ4 zmTWl54`08~XzSVt@3Vq+6W0-nm#Gip9hEm{(*~@u@9O@>Z@AAzfaf@HyWtZ+x=3Mxw9Gyu9wV~s(zg!{)&1*RMC(ERY91v|d5sfQBWs9=w9v6M z)~ISU9a85(%Qwf`F-#4aLqe-qt+iaxwkS`I4N5N$)DpeLa8@KyzJ%zyhcE=vc+DzF 
z&f}s8yKQQZH8vIdd_Acswk=j3d=GT9Helq8V0W*(w%bqM&jSa!k!D#Kmam zya&AMR$72m#;nsW1*}1Od}7g&^ekWe($uMZ@e5@r%z?C~%NG8`2*PSGmRuAPQztB^ z44%QBgPs-aIkxzvi{5;sM6B4OQbmJ~5}4^Hm~_M9mx2Mwp1b-j6fJBxe4dq-cKZH^Jtsd7N|=(qdkJ@jDWZws4l0$zU9C37=W7OTWs_tcX#FMVh zc`p8k8&ABIJ12pgziyXuyN@#y&_e>wMQCBm@qZM z6n!bb*ndzJlAb|JXv!^Dz^Xaw6#A}BDso61w_qv^);weQXbED;V~u0uvc-37N38p` zeD%fR}j!%X(au&ZdPrx@XI~-FRkVEp~M@3 z;uq0W?4h@#8QdE-F&P>k})+U~0YqQA+GWRaCLSZ^o`DoOXR-!32aM)~d{ z?OAZmj(cYQEN1=;wgp*S?xJj#H29FL<7Mk{1C!*Xy+L!M-bKKin~NAOFkx_m2sK2k zinIMoI?)0S;>;%FVSmOmXr&XFk;}-U>#}no;K^g+(ROW^MKo9zpUDr${6i6sBkv|^ z`;v@^FX!jKzvuR)fkrtB$GVKC0@cr^gS6dF{VZCkwW<4Iz3Yf~%o`LHuV7+h3t?4p zl@UbRDP!F<0%ZmP1b1$a#3jcnHdcqhWNc3(EAWt2 zY|s$%o>pKHwJm@7Qc-oZKuZ0dLx0q`l0t z)fe~wKg9vRr#D1=c&W1>(9S+yM(s3wzz)=J(Y+QY2a=g~PUBIVJA)`axU6deZDYf( zVu_N8k(E$#GBmgbFyl?=!nSvaD=@68DF<^BhQ{TLQU;5bUTdt21hV@hUEriDnh@Ru zn3ta%*UXRI!mn!IQZ(U_vu7AU{nipx&;5K1P`%C~tOrQbwJjZ8rV)lnfKml*Jdjlj z`{o832ZU5+1_U&$lH-m(3lzwR097PF7VsHB=!RiPfXa>;#tlP)0xa}T7!wLy%tAlc zALFb=gW@x(AG?;!YLFu}^uH~Gm;Bh5=jZ)WgM)Y-qJ#noh{?w@?cLb&;8Df><r_Pr2x?=s1Ty9BV8#zq*_5U$t$jJP>g`#F8 zWcPb0pkyzRSlz%CMkc%EH7MDo5A!K7=%VZngBULYNuc9#-!G{I(=2jQH%{s${gp?P zxI?i&pTeT_i^erQ3Q4XGV%4&BUlj+nC=rsm^>XvtA@j=SS^My&Wvn|ARprg$8w+8e#PWrDS1WplAlZBiF~Lx_Ud zL@TFz8b&83BB*|?e@6$@Gw50zBIcMcuAHb>E8g=+a6D!Y;5 zwL4#Le^soDDs*oGQ>w4qNY*_3zB;bJt@Qxn6r+)MC*a(|@0*NDSy{eHO;f1K@eK@K z;ts&HTJM!Hxi4!;?=+Of+HNN zCF(ZOJb1-URn56|%m}XM=ds0;_QI1#e)7DFR5J(>47W9hFHPJw@GVP^2ul+iV}GcE z1u`qd!Un!|aJtw|NS+@a6kM~U)sT_<7m?{bySvDJkjJEaVo05QqeHhTZ+*v`dV*_! 
zMAIw6rD?j<-0_B}xyvqHV>ejHEhA&KuFAA+eV=2-y0or?5Uuh3Eqv>l7ji5Eu-B9D zH}E_FZN<-FK!o9|Nbb8@$++<<`^~N$`@JMaLTb0J^c zFi_;TRk%&oG77~o9V}4kI*j5CFHGG?Yw_Fi5d`tXNeZs}a3)E|RNa`xx z3NP{rQz)m_<1T3Wt>H*@|Zmsf-JHy~eM0IuXx;h_Y&t7>HUUoz32ed-MUu zr=o(7%13pJlub(gjY?Jglxh0a0mPh)R)0&_z0-F@AUvWyGY{mpS;N%y5N}gj%h(Nkl)@3bnV-KK95m6_8;G?w?+N-x z1f&ju_%)2>#iyS!D#4Y;$Y5-^39OJh*cUH>3Q-9T+YOa_U`ZvvCGqNk3{@Jzu5o+=ma&tMpJgtuGp@W8p&IhB332Ar;MiWsDTObqMOHhQDUOqxA4~0uxY#aNj#^d z+cUkl&jJcoCskjJ5>SnXZ*LT7-;b_Td;+QR7K(NqO8nmS#eSyYC!No8n;FPegS(Ta zLH$f|!L0XBTbYn&+rZP4?X_v(Elx7VQK$MA!UiB06P;PC$|;b)^^d9{YTg`Z`~ zIgAk(1q2`Uzxjw zckrwyFI3JHm2jO!>6N3NWh*xmtI_~FRjaNW8dkvBhufU{% zghh;3!0#^*uK>ZII5k>v8!K|>^$bxkgoQltL@=mh$^^qUBGvSXG65Y~COfP^#lyT{ zP-%M0$e1@S{O1IN74UBt44pJ6b!e}fjcG`3PYn{Fg~A32A3&dj%aSYPf zwkJ6e4HS33Xg{b+?2Jt`r4>56o2=#UshVu|q8P`3>&IXeMKlc9j%ZK^N;K$OQ_CKr zA%-+aEkN594Jzhe5)D>BXRm*sXz0`pZcjAW1j&g8wZcTh$sdlQ;c^>2ycjR6f1pBz z1+9kSXUjwbGlg#j`~UT1MmAsxZSHt7Mlie8te~K$As%?Z+NBWe`4U}S)XM{Id$5XY zT2yakqRSu!Yb}=&B*V>8Qi72X9K1Mbf^{*FR1m+yiS2KoyflDW4Aj&`xfKF;fY}nX z8CT^iY;Q}(?hm=k5O3yw$el(dOX7NGGFO8-Bbz0j&Je1?)lQe@!=4UE{6#WvHs{I5 z!O9{vksQyz@<>gJR#uAC6m0%by3HW)+~cAGHSr26i^CiirSO%@&wHE>V9KX{a1ud3 zdq@Q0EMV$fYQf4Nb0U(vslf__KuJkAq0mzzsI-4+s$d2D+a-dCcbgJnQ$9$92ey_7 zojWWM_CTz-slrnh#Dd8^mGh`)GO6%02SHcbMTrC@GWI=Av~8;a=sf8ANdOzPBke0t zx`dh`7>R`-NvxK{J=evqVY-lvB25yMOu|5h9qMB}o6i|T##2uk!CVuCu{Pd1x1K0u zjgJi)yXp9_wE$^!G0u}o11j-h%DLzQnR#JA>Zjq@&~+FTJ%xeoi>e%^oOLpGII@#i zF4789F28y?x0=WH>c44Gz@)&3wj%|sqmTmn{x?Vg74t7i0W09&DFtjyY)=ZvJmx0| zYK2LG|6v2KO%SdoMVuhq2u*Rsfw}L#dae-r$-sZM_c_2~1t2!a$5C=hf1~C{>RiX#wS6j`t25jac z$%DE;@a)xJy$k_EWPjTUgG)CTk3=5alPM3>6;2-L=f6)LsI-4c9#{eYc6lH&-ljab zrFW1AH+Xpf+gzc}f{~XQj>2;AdX|B&FP^z&?%$aOzyJMC7(mV@3{JDXAd&<`Bry9C zCV{;nv6;BIYHdsr_AiTeqGUxF-&d@y8Yjw*%~S?n#t$Gl{lp+hjKKWi`=s64gv9*X z#s)U@av9*pf$@Efh15Xwe;eNqv)q4z??dYBrP>DLFW~xM@hfFREw>EaDMsHx7dPP_ zH``dg>^0bd%rwMS?O+{RugS1t`#)dE{q2~JC1C=v5O+GgOw5~wc$|Ud&)5N*kn%c< z5FUdp7o+o!G=u3j%|W^Kh3y!FjNX~blpBMvmtHeuj&7U~WnIuVasCG|V#5jqq5MP4 
zzBJnZMEujZsw@`hUyDlEk?c1oHkBksoF@nZ{E z@!dUbrzl)QY8|65zT|*0JML)K^Gq(0@rdI7^VJBAX_PB{v1-);buhAnOI35nO7fG> zc(m4l#x)P`=JIJn#%aw)c{oK_96I>z9D|tPK2O#(ckoVI?ObP&--`;^L<+4+uE_C4 z*dO2|%#TVanFwgNq%D#vKkrqYGV^#f`2C)QYY0zRYyfn#9BXZVw|79UynkIY8M$|a zSU>CZYnfz}iMfsZB)|+GonYnr*I5U$cav^o|GIN`wkTI7K_%~OYRK+MM60QQTNv>? zQ5eS5#N9b)w#g>;Id=2h@zc7&tcX8LX(o3=GHp;*ShncMtUFQs8}5di^Km%-EfIfa zb?QzCb+-M8KNobVjk1Sqeg}niS|=?_BmNZZs+?iA74e4$qB$%Pf08f+3c`WW-$ndM z199j92FG-BxORa!LR2Q({wGy?9c9Wtp2@1Vv$d|^L#tVdl~TLdY&pzS@y~Cd`92bd zXs$_n%M|g4kt(*_(Z%@deprYX36JlK6`vPk+|Fy| zSTj|eF?hp+X`HdGqWi3;k>2zkxNQZ~d*qbxM&q^fuf&~0x5Lhy%ffD+WPjOl#gmFv z3K4ATwU`XaY+spHz33(v3`qOR!bU5xSE=>!I-9LngNBEu8^gkUW?R=gE98fLl{S~j z&1@Wel*WYB@}yGjf?JekE>)dwF0-P^88?@0eJAQjSetDs(*pFR@b}o(LhrC>T`IOy z@71FzmPa#Mt<>v9O}9qxm>##vKC{};^3Wret9yXllorVn{3sg(3Q=A_W0Di~#bfcc zVQCeDqytSpXq^zBsVE(A*E(#WZeQZ0On%IL6CLn$tO|q|*ji|WdB5Za!zRw^kBUwz zi6qe#M29mD1o6Re3UWc4auX9m0apf*okY@thfsQpOGh7ft~)}FHcrv$dlJz(mGj?Cwjj#LIY7kh?E@^Ci%;MP6<5) zig_q+SES@GKhJWeV#Vr3xRe4|_7E)H1(N5rUN8Rifu}P0)7Aexf4XVsz@JPV#b+#7 zEW>Jc%A7hG;T4z@@o&i&v~w{jB};e{93?Dii*ei(=Cf!81mo>^K&;BrqAR=l)k77wwzI zqG+eyxVw2-iC3XmDc-CCGv?1GB%@B{U5)JI?c;I?7Gg_X2A~+DN}1M#ZWewi{O(~x&~t-FNnH0sb+Q}hVspCFP3n9I z!JEwILtKc>OR?pSEsw;j{e+c*@aneyq|UQ2B~R&0-4SCfNYPVRxBSNY$j_~ZV7B(c z7UB~CJ)>Jy;L5O2M=*jAg%|%4vc9G}U#+|fxhrohbVMze5Cz!`SM-2>y3kgu|A^xF zX(bF%t?ozyUH&lyVNgg09+Kh-x9Y_(%$5|4mv*CIO7%9y}$6BS>9unXM$rlv74uBJ+g@cSSCR~gsCkNvZZ>wA`ShZ)yQl571y zeCtAX=VjoqLD8G4n}-;{OfR|VFU=vBI0nwIkbdg@iWDrTkc?dw%a1>e#a zk37|@1oY6WEJY8!%8-hB6{PCt>QS$PSH-!yxK#PhWZ8|&cP4MJi5^5QD>dq2GQpK| zqs`PsriA6?V!AV#voKTNPFP;QGkM!#c~NKKVWq9f$y|L$^(xuDlX*J453kJ9JUvX? 
z|NnWqkIehOF;CCZtFC(DUt&*d^2JY5!at!`t<4wTZ#j3EJv|2}HN8p`SabW16W4;Q zblq37-xk*~^e#=rruBjcXeJ7?(oCbU3YPPk7AVMB0{%Na#D_-gPVAsmo!Q{%hUJE!}fT$R&3uzfxm7(F-hl z{PWWe;X3+;uPR^+c7YH4~pMOxd=Hs*m-LQTW9U4iQhxw z_ptcQd(`Ym+QW<5>_Ex4W0&|nGJcPa-?j1ku=t(ahj|@?5lYPtj2U)}joYoO2hPSe2Vyo%>+FclTWcaj2h?x{_xs zY-FgV%411ido1ZUk0pKJu{jmWyRrgyeFf~+3RqVeHjaDwOT-4;#7R^C>aX=WqW{3l zdmquirr76*{=+WYsjmOX1@EfsUpx2RNB3X1AlGZ|&i!k_>iQ2O4QY?)UwuUXzID6w zdba=i1-T>oZ_vjkeayKe_i1G*=)$qO&``S2tDq~rZ|hSb)7iRoM1>1qvOHgWSA|RG zmh#m1Y(cLxEz^jh6*6U3bfM*WVW$dtW@T11Nmq=A*NOpgeELp4w%4bfSM}-Dp3$eh zug&#eaaEtD{#{q`FxKc}-Bo>N_9st47mnz8p;x`G^lsj@!nHJ0gEG~ZN}FwY4z@f; z^(-$l&zF^_s_G=@N9JrD&&WT2d-u9wL9bpHXWi?CIj95bn{x>tm+E8DCB0_yG&$o! zea{QMKCdgi7woQUNB6vDnF(yeJuD49bRLInro$1CsDqOQhSo5si^`&@kHvC!Wv7A{BdH>s^u- z<#-|m%ke}irg)H3i4;xoA$B^Tq<syYV14xU{fE*fU2?foqil_rvplID!Eb{ z@o@moog~h*y{4mqdJI8D5O|*fRWzh)W*({v@y;^*j)-@a;de)TT^atTh;Jyv zAB=c8%X&28nhBI2y{i&!GrjU8r@Qw``Bsyl#eC&mkdw&Sc3o5@RX04E=M2S z4_=*iOX=({8MwT7Dy82Q8DFXIKc0b~BD_>vdNLy;{ns+m-;|Mlbq4-U2EGrZP^tZ& z%)rkF-@R9Q2d2ky8{aoP<^QJh==NqzpFI2Qv(B0@W#-HYvu4h0nL1;}gfm)aoi}OL zlo2geBbtsJsgF_mI7lC(^>MI14$;TQ^zm_h)aql5K1PKsmS>b@8f6(qS>92WZQx{#^D zll~*OXl0i&>{zQy$+v%_;Wqz~3@v9?O*p#g_#^6?CLDj%QJ-u$X~Ic$M>I7|C^Zc- zpECXIwn^q+tFP1;8BXHWm!S*_Ej_1R)!aMn39+IJ+~8D~w}RE+dtf?zCbeqmYC{Uyq#90h_C;=b!kKcC5_6KZCr`q4W*>C+Q!C59<(q z6#ohSng9-cNmoqp4@Z$Kyx{P^Mgb?h-r;*hkz9^n<#6Ag>L`+x-nVB^6v@IHUHQj2 z+_&eYD0-Ft7?*ypD0<8BlcV8HxNpz-aY`WEmw!eyc&S^Z)g=A1+~LZa;O9g`S9$uo z9PZP%)sUH^JpH9L1{?0v|7xEU_wD(k!+rYmK2kaT4;=2(-!Qy#`kNi@(|>3G%ISaU zaG!p~ftAy*bhuA{)3w_tvo6Jb`8PV;r+=Y7q%WtRvy_NKxbM$y4RVh1^#7sC zBJR`gqMMJnr?&wP_vsHfu5$WA9q!W)KCyE8k2>6^Uw?8)FM9C&;p3l5am{U$=hYG> zu3O#jq6~a=Gz6FT&v6d-?P<@zUp_Uor@a3MotEOhKbtb}H!|?u!o+*4=X+QN-u2nE zUf=$kC#JY>&utF(>3=vSq%ZIPWf}N08Tigq({lXys&lx1j%PZ&nXJk4z1HEroCh85 z%h_^9sJHyN4E$V*`+oTLnJMo3;pYzb>5ps;>D51;pL{g~Uy*_D+g7=p^Bt}smGr~S z4)^sw<#0bjCj>UFrU_l5tgoc=@qRT=Nfz@K!uFTdY^ zr{#Ejcy)!H)y3=iRzk8Telv z?)kv`f0CB($Mum8_v5FljbY>Bvv^YqsGE~VaD 
z&%Cp5iu?5UIo#9F;9Wv`)q6DmN&8Q5xF7F@ewE8#1SCbQb_rYm7 z9-o|n-|FxWx^nh)^W{MfKQIHY&A`VxTwR#7voQld(c$XaB>gmp`*IdI+^4_O;d&HF zIgdHqryn+iL>!`r8vc{?r#M_)p5PZ`;CDD&U7Muu%D|s>xG#VAp=9Pzdwe+!4%e+D z<$vDcDl)%|7t|jF+XW+9k@c9n+?fh;A{u_r6 zbM-#&@Zk=Bj~d6J_WSfT8Tbf?`*Nmd;PV~+QR+zg;hGHm2O0R!9KN3`rz-=0F#{i@ zo8-{_?$3YHoB{**2L7`Q{I3ol<;r#o5Q^v{7we`JBNEYb$Ng<)Nf)%N%~VV@XtHk^Z83N@SizctRg9Y zT?U@hpyE(_Gz63M12XU_4i^b0>A#qPU+!>`SdxB82ENMSp0D=Npyg0IJwDXoBAKN8 z+6??shl>P~^w(zKzi_x7X_CGx1ApD&>hdK0Ffj!Vwcq274%e+G=}*hRZ*jP)O49!+ z1AoflI!)3WbK$qzp@u$=Fczx#V z4E&23_&kSuKK3hzd;Mo+2L5^m{+`dI?eyjEoq-?jaL>oV|PINX=> zVg~-U!@V9g@U-;4JbrivKH1^>Ir=#>18>j37dza`soyx<>sR+U+^7Fb2A)4Xy)R$R zm<)WX!@Zt%ZU%m727Z&ny?ngS;a;9T?r@)eLk7P4XVd%g?eytCmVq}q+?R8L!@WLmmcxDe`3{$AmdLMfINX{a8F;6|2f6gmW#GF^PRrlRrTZ=*}PM=HL6{GwHxBoFGk3Z<}VI)cT)ng9#boi#jhr0B; zoSByI%NgKsU(Sym?#uau!#zDb?r{HHde2Df_4Kxj!~M88+~K~QlO68Uf7ao?{4Y4% zxAPWFN$;fn&^i?8=NyziG& z{QVB!&*1|d{s*HqarjDy`*Hn}!$0EE?>vtm9IDsj2WH@hJA7aAB<(ph1HUK( z|GL9{JO3vGzsupCzrEpbuMg}wUw0Mm%clp#6NP22DB0U7uxhkN~Jyu-bGyujf; z{k0kRFC6a6`K`mfe0<2^KK)A$_xjS?4)^5@>`42;<8==A(YNF1E1;ep)UPp8Tc<8KFp>6O9sBv*V6V3cj*Ua;L{xL z`SzDH@NYPLKUe;Z8TdmE-`}P0yC`j^$A>%o0GIx-4E!>Ok8tV#I|KiG{bW8FGN8L`}BWvxPOixyeh5N zmov`co^GdS;2jS4?f)-_`+6U8xTnu2GVlYxnYPnE_qq)HB!`bCYr@~I%)sCMgHV1S zzK(L~|KRXOhwuHvG=0J0bs8ufD(4vflX?|9P)9lbiQXyRz-f|x^G*R*wmls_=v|fZ zWf}N-hridA)9>ABIUc_x13!P~G`(+6(c!+F8T~_g-S5%-C-?Pxhx>keE?>F)U8^hO zg$#VU!+m=;X5f={OUw7^uiGQw<@C_5o98Hhjz7u3f2=}ddjEVUX!6fd{(LXaz;~4r zQz89<4)^UpCj(!Xf&WEHOoj4aakwx42`MEU>W5?bPoD3-gM|j%Ki{eQ1ibuy&(FZ` z&cL5{xNlGUkhFY{Kb3*M{*jPg?LUhDr2Th`X>thn<2NTIgQFb3%i;d{UMQx?QGUN! 
z`=(cCq<`Jv{=Vi(X^7=}e2$cZ3ivxCQ{0c=_l!z$UrwJx0p~+>7icX{`n66Y-N1TgcSGBckSeW zm-qhxEh+BX)8~v7_wC=+;Xb`zkMQXqn3k5~)4zCTiu?3^W~8`JpX?(pZ~qBxX?kDI zfGE)8{Tqai6~9D=F^Vf2+fN`ZwpM={-JvL5llwF37<1msCza#^JvFDH-@xm!{?T z^v8WQ;N|^1GXw9)z<=s+uMhmi;r@B`Uzpb0%xUr*4|ljP=M0DYa(?Xaqg**pXW&O) zme%XjpBU{P<^AxZ%hUA7x$*~pEyaB~*Jj||i$ePH@p#?WE8{ldnqK>Dzy^!+klAIo$KrAAYNHJ709TZ)ewcDyM(P;XeI2-%H!++w&8LdwO`> z;hsP2^8K_NiMWKW_H($WpR*nA>F`m9`+lguHZ9-dC;xZA%b)Mf4)^za!4E5^pX+d+ ze)5khr$5KxKK+$TDyP53;XeJjH&jmlU5EShKl?@H^v^lmr=Rz$%IUx3aG(D7w^UBQ z%;7%$ZUjy$*@jW?B#>I8ZE93WM;8(7woc?c!Q zls{I++Z^uOzi(GaU(WZ&XW&0~xaWs2JKXc1nozvy;XB21eaWVzL;d_NI}0D8JsrZ| zZD--*w0JJOufy9M{vL6e7(c-4nIr{=g?QR zorRBKy~DTg-5g#u;_UO!YM(Tj@Ag^Yds=eVK6Q3`)rcvR+9y?wn0)rxRU-t9s1&%f z6hO}hw%ozwX{beoxBN#l{lgh70^|ImLh zalqN_lS+-4I&H!kvnHK2)l>gJeJE`RhrgU%0EsQIkuO75os#rpQy|n+snU{G?ntVS?=xU ze?24roE^+RoXRWJulv%mz4m`PBmZK{z5V=O&&Yq{4(7izBY)=(=Kp&}{;nO&KL$ZW z5nJ`sx|lzS#~pw6kY&O~lV7^TSp9IvqR(=HV^|cHGSllck>vw*kz;B`{Z&!VP2yLF z)IDXXu!$M<-!dcAzvzIFQ}tK#Uta%J8TC&*U%e9F4Ps~ELuL45QAYiR?}YlhqL5ep zyYpXO{~u!gy5^5@$At8HHVBN%J6Vk;hF2wgE4L%e{lhQSXP#o2TFcr^Xuw|=Md}BMg5oSasp>+H)j$=20{F8k% z0#q%PlT5cJC=Y!oQeWEjfo_vZr!fU-19xe$R9=dvO-M{D51Z5|Y-M8`U4dlZckVg9 z$Jf3AZAZC2=lst1|9s~=_vW=owB6}&5Z)Z*d1B5uN=U;K<~T@)L9&iC5~aISzt^){ z&EE8=xoTX<4f@nT~L3UY__1jBm>$VP61#qrM>WG{!~vc*m%}4mNxAv z)a7=#&*&NT&Z54fOAkeJt}}Q3qldb4Cstn+ueM5rp7s0ELA${&2x`OytmR1Y0Otuz8N=-4|DnE5|1E-gVk_*~ULZ@Er2%zp|i zpulMA#&Dy zc$3m=zJ^O4Ak;HPbm&CYU8?>GlOA%blYf;b&I1wM=%2&jwt{=CrsNTtK!uQ}3ec21 zdi9#TDgqnsfq1!D(kKVO!63=rZyN5G))DK=DN8r3h@6LXu&Qt54RI+Q*y8P8uSwlm+C;AVGoYV z?8$5AArRWRojLG40!DStx_}NJFb}(^9)ZUl*1OPN@p*s`~w?{tx|6baOj%TYWH7^+G30@yYyH?d0{bZ$%tY#3xmMIZ%zMU-n{S&WK0Y;>Mh8n&HW<8T#bZDO0pMtq_X8iO|^~>`}%ks@Sx8{$T z&ns`so{qg-uaVa&`O9J?P^fW$#h!G8Lf`i`cl@Adleg8s%^&b?32bQzY-`!-ZGIuy z=T+mHfp2aJKr)im$PCSK`~~6|_c+$pJ>s5$YPBD*M!G`C%UHeIB+n^lSZbJZDUvr^ z6_x^rYa}lKisWgN+~Hd1ggB4Xz9hA(Qt+_Usz`x@u%vL(wptL^0M9E-z90K`$&+>N 
zm)z}hlyklv-!P0(I0zAA*hGO%uRaKx;BOiiH{Wq=4;ZSi67?GVB`+>xTX$!~TF67@i48wd_&9J8#wp@oItH|HY0xl4{D!$oa&N2l8Ee>7uI1$`)M2 zU4|jxA~iNRW@LGD5n8#+7F=Xy8HRw1NH#b|zr4A~s&Wtz7pb?w`Dra(c9F+R0XB0H z;U!w2aM9Bz05Dujr*?99#+p1{7}mqi^G0g~Ot!;Q$=eDKY%1Yf0k0QUc)fU$;%0u%7~;vR6gTse+xZU4 zhsWAJvBBr595bJbwPtXjcRVvR6zdxvj%7wt+E9Faa3quQCx|~93J`HQkxs_7q)0_# z9noEFN;I~sy?uA2JJzkVMI*5y)9}FXfJPGW!NHM45$uhp(*wzL%#c=uDXFOK<)ju% zfx@wLGBy++jrD?%BN>uPCzC}cG`cUD(b6NwO3dPNxGqLh!Y4S(Y>8>f;mk-n))&{} zgYjb{V;UJMwcQe=7}}Ugq~ltmKb9FdUTT=AKqMU1(zb^54h;8kiMB-Pk}_JX_h?^i zI5BLdmUVl8CiNg1+!;)Gc98gu@=ciIJJUV*Si-i0tz!ko8p83P3HTZTzbfEZ>pA^B z0mt_}jvs_Nf(_-bgNNgfAqWle_3&`qxIfD2pG9J5Ry&UyFvXi-%;mgcgMYwqEHPL$ zI6c;7Y$#upvq;f$e2u z5pawx$F~XiW&!sy9L=Kt?O^kh0xrHqy)EF6urYR9SsYM~c?-h#k$^uc(C-&;@vUoA zz|C8bkO=`dZ$U!-DB$KTNXRV#=Vt`fSF!kGCkys82zbdo=+7J*(&D|u4^`Xf`}s&q zj%&n^)tC5t;Y9kOTEzW``o+Rj;Tp1`taLTlkcy>+DJU;|<+g}yVvVPl!NZf|m9RW@ z$;GfdDX+2odUyzr#hYSz;pikQ-pdnmD>!lGy%ggo6#eBz7fZH3J%ZWdPh_-~?+`!S zYeUIlNTJ~oE$N3(^3547o;bqtDA^xNrQ<_MAxZy_A=Y0!65?4#aY6j4(Ae=Qt~9|2 zF;@{W20_c5Pp$k|yRa_&uly~*i!$x(=ie7VLR-lh1_Wy%*5V*MVhgZWSpNQscOug9 z8isc&HmuK;^q*sn`YDz9W7WS0mMZDbGyMU^iu%zi-R3mGpa2D6}S{EB;#jzX>aq_`4WC|IXn8to&<$xf1^<<8SAT zthe$LTPI96_mZxiwsq1vsoGSj|D^k+&`w)1KMGWXwkaKxI;6p%Yc^U6p@c$(_xYaB ziQ^;%bx-o$d!P6De&6@K_j%uU-WAar9fH6LMPT1zGpz#3n0L`kJ!Dlw%*#02^ZVTb z+dtD++Ez2G1M1KbK3IP%V_If)O^g~}w;E0Lv7x*VHjYgOZ#nm*T` z*;SR!34Cy$$k?72Fh)A(;6p#X1ytjNlG(6E$$Tlv&+nC&Ol4BdliYhd#V{%Jd-f?t zPRS%xH4}Bod~}l6ysa3mQcW!JUS_Kn%YdectbTRr4?q~2rNP>NL7k)or0K7u&oX^6p5=V#1(!8cGWATMMI2V_Xxd`u&c=5jgO zGfcTFn*GoLakNU^`AsvrrBKA|mU7sZ4yikuJ}97eA7;Tc^sR|bos4GBI{4y2f*2<4 zer{?&su}u_4^Kke8k`2y#K+d0BemliR#1)9zNhig%g~T6)vq>}k724bL-<{3Dz_toBMq75V~R1ZF3GCt2~o{XIMj4j zNS|`zNJ1-W`cpyQfTu%Wg45%zV~Bo-R}Nl%Oq}GcpG@-gmt^xn%xV1;rd?OAruT}I z5$+Y)q{6)`y&#sM)_M?+^#{`X1az*?qOnMv=2lwV4b$9riF@4UzC+wfi>n0>LyVZk z#6c&H*J1@H8q71~@+{G$au}(mYDJ(k5g39kgzyQGFA;f)$a=YGh6S3G;=)`g)+^Se z4q`F~%rLeQf$~L)MXpmotfnK+$Ar$NLebgRg3V^|0>3b81fb6ECgZY 
zvS{KGB`eBZRg|B6itz!jnTQx4DMqu@f^2mT*-DcF$X1j@LRRWRnB7v>bly^e=ktk6 zO&rYE&y$ez=?e1nBA5MW#9y<@ zT>hG^{AF|qp!?8*x*vhx|4;Wy@Ps9ii`eoDpRfB9dEM`$4bR>{_cnK&xNpzZeKeT0 zb$`@Rr2A7h(!K5WBDuY=pza~*L)N7Bk>K;X-#?@K4omk?ah`M!}l!B{BdM&s>2A+2$8;#}hC<3QI zsx&-W8u?hqFCEDyqhlk>_aHhIqHYQEG3^(QWY}=GFXGr4E%s4BSuFLgRv5$y7CyyhcvRi=svV|tah)OKG45mqueTQlsC!o{W$c4h@19e z_Wn12HhcfQeCQM;Vc)JT`Td8l+Py{I_WXuEGOZ5`Ez@d!U^E!wpXjS$LYYE(#YSKW zkrlDF+Rtec?(YzDGp6d8sJ;$?AuG}##z(fqrmGW6F&jks__tY8L&F2Unx<9jmiX%Z z_xl6BrGcfjf%|Lgd^HbtcKFmbJ&)g48(3n^j z_0$8CJvHEf8}@j^9@mQ7`aSgxb6|P@>wi6Z?9E14rYm-rKUHRm!_E;L<gkDh z^!CP6+md=uTYvYqR4U%yme{(zt*^tMV1BJ3uplp`$J@7e#CsFHiFooGOX+u0-1cPc zvN%0%hRxzl+Ul^Z#aA~ru8lOuo8_<;iBDU$xApaH>FkSd>D{8w0k^jSWRKVjMM|Sx zL^`p`-6U8P+7bGzTEPEdOOo(AitsBoUg~F=y>+Gd5gRY{Kd}HmU`wfV+|#y{O7Va# z)l&S~1@Pw=z(y63B?5muTpL(`1e zl~Vh=(M{6(qW1T~mV7Q0e<@y?i#9D_zXmv!8`=L1ocYGzOvWgMyqRy!&4tgPgU(Q> zK(OC0d-4^Zy=Xb+>KrFTw3_ zEF0=NfnNbDramzD%Cfpkx#w(qI{(EbWZUm822n4}zX1OmmHF21EuLo-@&&CQxh&Wh z){o*T9ow>WTTgb_4w$xse3aT#dMwp0Ssu3j6EjBf6_?WaQ|_P7zRDKrMT}UoI*LkItFO*Ya$wP$bC0 zmZ#TdJ@H|wI`Ac=c{wZBFWl(EqTxU zWLq?39Wbo5$_Cg-F4eW!!|bh_2kN0&3Z~nLoC8JzNA^E(yu>iq?Fa+*z32ag4|Y^ZQQA&MAodg4XaaW z*Wvyus8BZuq3VxW>WZrcAsl;=Z4T6jt&9%$wxB3{_P3eCa!5R-T7f}5o)*>kp_yv+ zFdK?s5JS|}zk!$<{}f&L6FVoF%kb?pqe%Vctw?8>*C5SfiJx922yAmfy`D6c_RJyK zDp*5CHMZGbZzQ{wb~STQ4p~}4Aee2g0onq|_hi+oi94b}fCh=a3(??eTQUF;?Siwe zr!X2oMO^YQ8Xn-1)tb`JhStMVBdICvMryHWq?#qRIaptZ`hM7e-1m%>?lV&De$)Uk za0kFZjGO?VTESI%YT1l>tL8~)=b<@$sI}1wqSKpas>s8m{3&B-X1x`7xX}u1m^4wa ziPTAl`?Kyx6B83UTDmy-8y-fgbwbUYko4qV)rK=i{d|;CcR@r`;J|SmAvV`Y4)Wko zIno!F-2f4 zn^h~IisHx44k%_QF~Noo=fP@h9|8*aT$&N>I@>s?tb&?uwJ7tecH+AQHZh{5V@$pV z1qgP8nvSY;BUPRtghs5E6v~=Xoj$QCRr1YwsXzr9CylY95~Ztx%^D;@;f|F5+QbKJ z=qic4J*>~!s>hG}^~`Zek8c&@hi9_EYY}=nW(php5DLgiY;dcc^RdC7**U-2gOV9$ z1JXRRbWCM(z=yObAW>fNVU+x1tW8ni01{y3yu6&32i@`@l_NoU4KJ^e#zIW?+vOT- z^DAQ-D?$piz=yP`fJ+1>B?4RmDI`j`LN9*L7@AUo4^wgmVB2exjZ1t3;T+G1VeyPw{W?p$^4Y`$ zEgfYPoO8R_N9j=2Q3DgCIg)DmmPkdlP}Y?EJ5nY8*tI^AGOrORz%q!Uu(dBT=X4kW 
z7Dj-D5nz~2h#^|j!iX?)8B)o-nf&^bG1h`WMFrvbFd}E+Ha;x@>^{vF?xH@r`@$G& zjwMsAzOqj>200V z+YeOk1Pb+@aZ=w)>gBu|fJLno3(XWh3tuD)keyWigtXu>6lxUGJxLnN$DP`mO}n-o zq=&rQ*qvzX2$mEK9W}X$nw+tRk{X;~2n030UDSOoiX5tbk0nY!r)3mdDUJI4stJQ_ z?53rO79uQ3&4y2-ICmw=Mg$;{Jg~0^f9bbsjicus&qJW!XJh^YWyhHsDI}XbOCD=EoO?ea20XdDK zG~``KHTfXYPWdGHk?6;0YSxHaJ!*F7vDb~*K`m7)Vsm*pEWQ#EPmI;0Tb>wW3*JYr zsQd}7`o#z5Mq&r!X=zFzNCIOd5PSmV#rmVVIAVx2&*YjlC_7J)RI?_&Lo~$QYV`z5 zlp|*1*z-gU5mKyGgFdmu*>PG_K>xJlQ+z`XbDH%b56uulU~uS_HfzsFMryejKKnnac%aMj_foiM)njaEIf5#bILZc( zKv<1a==`Uh!}hk;&iTy0M2X+jks^GMf>d3J(4oZ-QE%4-?$Nzj>Z-TOd$Cku3B-&b zw>H2?Bir2hYpJWjQkGL0kq+oR$IC!-x}TQ18m;mbc9}+HkMlCvjBX;Efj-+N%5+Ds zo^3WzR%aDBdT6Iq(!ejJ zNUGHrPBr;qAxauxuzT1Ghsa+lrQ>AiR_>NJW#U@LB`*L z?;4{kWCgm(l)?45H!I_`Hn_|yoP5w8$Ex(`;AJ#w+mp@nleJOR+O5txtzuO55E3x{ zH|SuX;}5i^9sG)#%QkK^VsGiO{o&YW{J;Q;#pyqe0TdOFsn)Oos#T_fVg-nWb1Dk$ zG)hC(S!_BW{%qq(t@;yETEuuod0^HKw)A_(rRNGfnuIP;!F z`$s@wvsRHNc;szVb3W&h=ceP4SBjiHVxrNMvw7ry**S24zqfOKb1CPMw{spjn>;HK zY5xL;LtYU%kId$9K+CxVI2`i9Nr_1g2egJufWv{piJ(#iip@ts;Q;wXTBktVr~;HK zZYd}tw~kU+C*vp$x}~UqTc$ORO5rGxJ;yRwACm*zGB_}N@K1UM_KLpqgFTM3!8g!w zDu!QZCde%EEux=Z&_+vb($E6>2&mI8=@y9_aK^^@BT7s_cnyfe7&(8$PJ$2YA|i1` z&g_ab&_}XDX}1_ayFvEQfNqdSvO(#9SWM6Ws3W;TX;dsh+YS*N1#d(>nrQPzvdh}u zCP%j+aaxtQp0kBv@W!)r-c1|;Z{$l4=Z%~&f;e^&N1~O#R!)qf*+!q%ISIj6$$QP9UxY<(Fz9wT&y(Sun~=R-#GG;2Tsmy`Xv>sb;ZAs5&$f-hM!;$Eb%vB%#`EtI`Xqw~%TUmxQWACZRe( zs^g@}$r zQi1iHMG}Erg%PGL^^z(xFoYy;wV5rvQ= zKopLItIx2(Bq}G`iXzzz$VdkQrAS)>l}Mw3T7;q&m=DGGkuw7ANIL_|@QvYIa2|jA z@wW%=P*vpeiBNTY!?##IaKLTUsKBJB)-N1kB`x=L|) zWGCm5X)V?6@W_#fI6l@0Ki420792;msC0UOCrxlG1LhPVpA=3 zho&^6LxseCHuNc7z#!0Qi>9sGrj~}%uhGSrp7{{F^I=>R;RpmZ3Ig2w9KsgRIybfnCY0Dd0!$1f{nXHDMq}vI z>alZVRu{*0A2uKa%5boNKT?>0c9$orG>sas1m&h756+E}rzVLzKaF(K%6a z=hEWne%S7E*LPTdArku?=6s23{UzHv4idEbczYR6%@ft6e@Ksg3JrT{Lmk{K{2nWf{PG9&Q-4Kbd;h=|P4yQA1wVnq zmXZ3gsNaSlVTnC3>M*I$jKbq~#@JDZ8^E!M%n5w*mc`Pf&b_0@d4``-7}7WMmTtJ<~=FlM{&hg_xoVG_8%P89o^fYE!~*y%)GnTeA#7yp`Y}=YK^vrPCkBGI2Q5|E^@Vik#w}Zh#;w6_ 
z+2{{WPE63c!IzBDk@KWY>jo_w4r*&(9Zo(#+IvX*PDgwBhqUWXb3HT4U{?@U!Bt9D zeaC*&r}H${*WX%N*4T8{oYJ`!H&;}a-dK5KRprf9!P2rj7j>5E9j1-nP*q7kSq2<6 z@qs%9v0oOyUNAF%J?1HG#U)M|_Hh0XR_WwjtP|gwg9S<9zCwAO6e=tp^3@fVru=n< zWou`sg_Uda)WW&(nT>@TB;Up>zfw3Cm|9o{nYzN_xXO9!wx$Rx+fk&Mwm@EIMWE2nK z+5NfC!E^iVc_+AY+dJ-o-&DlSU(!E6^1%Pq1Ao&4zwGkq?LY3|x&8L6vw`IH!wPCV zp4|4X_P~=4PIidkx^cDJL;ipVe#pbl)fB{da{W;}@Cpad^{3xM{vi+iZ#?Y0;2}Ta zfxqfu$4`@Onsya>;9qs{+;-jWAs_X?TRrUjDoef)e!1iLR+c<9#l>Hs!N!w2E+ZcJ zZU@ix{|fwT&UAKWdEknJ=h|7|As_R=mw4Fum52Nm5Bzx#JEuJ4&wJnk{tt?t+;#;W zJhz{}?Sa>O*jeErzuE&&de|AwlFyElPqXB+<7D~Qr`nZ`KjeWw>)_O0{K!2SZ+krC z4|&L+^}vf9a!mc4?a!Zk;H8kGhy0mhMc_XiwQ*-l?lP|;`h z+V90~&Ro=Q3Kg9lW`|I5Z(pBKK|qBU!I_S#e0hy2!cx(@sJr7!D+(38OUc8ESf2?( zHT41NQxRLbWXYl)w4rCIxu^nXm>c@cj@S<&MpZ0Y+}73Gv1F0cGylIA4J%Qf>^>)p zwUkWrI5cUpm=vEK?`M~MrdNw)w6LFC>3bK#GXnXdo!GFw7-1gZb2FI|1N|H+@+x=H2XYbX_j tW~Y$WUZR@0`gHQljeS7E!C90F!sqsDGRM>0Di?7=2D!2yRj8IF&e zy1{`lIIZKDNFv@H#z#p>qO>Gc`;pYAhZZG=fKE$O$p#}FE;3gEfd~l1vVjO3u(38}(-_i4E_FlJlz3cX__qNvlLN-`d9g9^}{$f@CrD`fviLI*Ydu8gq z-CohGNnrs*{&D7&KIhys&e7a z{qW@vLk-$i$OV;2+CU%Y40y~ zy7NJ=a|ZolmO;9Y)hTO!x7E1}4gB{eDO0!wO|OiEf( zgOXNRQiDrc=3N+{e~90@1*d!SGbdLq^XAu0CQ*u{1|?C7q^VNWMN*TJsEZ_5SH5+Z z*ZCkQD-~4vZBrgl6^5Dl39_~sw!wk~*kn$&KyiK}O$ibf*s`F}TGyx|7tC)aWlO>W z8x}NMMSYJNK@8xj^+v!LDD)~+fu^H-9xCSieK!Afh|N~NrtzZL|7gay_tSZgg? 
ztCS1oZzN?+!U7u>Y_x_cYC~dUuH(QqwXEwsM~)oHhR>!u9(^U-dPi|pHvD0>{JGe& zQtBH^N)6vSJ-pzXRZGH#o^0!Dmwa$ZDfz#bmEu20cMMi94Ojh3)v_?&Pq9mWb6Ke+ zTvBS;ae7!RlfPyA!4ubY-!k`mm&d9WJ?(egRO^TP&--|KP5R^M^V6T$z0Nx&!z8#R zp87614nE?-L3qoehir0vghi4m&Cirh`e!YdZ1|C?OjzrtTPrcVRTsDNTT?9X8!!6& z&MCf+U#427)bNc=DfO*vxaOObOZEIS(;0Ye-zX)C-15z;FxlgG?2cthE#D}$5mh9r zEs9D~wvM<}Dq9yt&I0HP9|dpho7A)JcfC_Oc~(^|E7g5@S*hV`LAg*JlrOG1Jq*4| zWZT~?3mdxW%kKo`^Wvw6TfSYjq}2A^p!{cZmW0XPWu@fxOT#VSSMhK7 z9*jl6SZD}n5*_{n;LZb+S{sUtCA9R@_#c2gV^ga8n@rfyPley1=(icHw%(}4LHX=h zrPV>}Zm;Vsroq^%*Nz-9#G{{E)NP_@#ozb4UMM#AYb;7svxAZcS@|b{@=Ggf%bSk`rMZg+ za^cH<>5|%X$Ky3#=XXg=m#WgOG4HFpH4j=Bq^*Q^{cbP2E7$QdWakPp@m5`IIK9-+ zeR^xdmq}vIetAi2!`IUPB;1KOMw&s`6+3w5(o#du($;Ftc;?Es}etBtmTAKP=$z^9+j>5t^XS8WqXuVP#IK|P6VK;aMBgAxXWec={1Ae#TxUHPt`u+4a1 z9q8Ouoa1*rz3JVz)O-i1e)zLRPrIr+ZemlJT$~J4RS^En4vGUfE*X&d42*Z9}LRpB-VM0 z+xHwwdl&w;b|8grR_I-o&`k=xr4rhu(Az7a-3pyp2@NXrp-SkMTZCW)$h4|>(bOUp!ZMN@27|Le$l9Y|8(3^2km#4{qDBko9y>y``u%|s%KLV zJu{vTpIzr&{0Ix`(O0ZhMD8c%;kdPGows!$?Oi*Nf#$ypnx8IzEVjhEHvZSGhc14L z$h+Q2ykw4I|ND8iej|E5usgfx)`NSG8o$ztMAxleC$p=noMyl-tux9$cP)-QB z5u4&E+M076`xCjr{qbDKKrG!c8jV@5V}dc`moej)G2@ppV@vc8T zaTGl5OmAx6erdwcYCj4erN29(w|bxB2wpgi)>{1)49UaZ6KA@flM4@9>!uj|o3{C- z50=hs=y>rRm;Qd*dMNpb!;23urG8^?%`W=Mq8Dz-q=sDhp+y7PMKAb7Ot^@r^)4oy zLQr(Q@EnewBhhm-dhU##y?T~brH1^{7izoi@^)NGdvo5kd-A6q^204DKL}q1OrNsl z!=L8Edk8fuO%ICQOtxvtmre)?y0iMVDAl zVkWg(vvMWi>wUm4_G4R8?Nl-Bb-mYRYov;JN(eLx0nPe8ApzR^UO94P(ZFn4Vw8B2 z+;n~Uk$>*sl2+6_f1eWpGr|7QlODqpC(CLen=edJO0Td(UoN<2T& z@iN5a<+npzUWT~*J&{_6jViF{X|{exOl)0d7eQSX-KBI}E0CUu*7Peo(X+{ktwG{# zsq*ZL@)?=2bu9W~%Xa|-q^|7u2S=;Z<@1qKpBJ@QpEH*4>1w9@K6TAT|M1JLnPQC!nJCa=bc?LWkm_Px~WsEZEJWYtI>3VLCAK z<3O4aBw@UeO=rjdHd@q@GE%fs6L2Aw_K~m0@=;~GU)YkG2x!02p7oRq~o`w+=8Qm0^RQ3pg@1; z-Vntub8qqDCye*{1LM8k0Tlw(-F^&`Gk(ZR#4q#9OHuF4kN;&pe3`mNaV$ICNWF;}F z#Q4m_hDu_C66efJOjQz7N}M}0v8j^Sq(q~3Gc(X+&~5okwYMu_G;a`^_gJ=d%|bgQ=&n&Wl#xEvGEtu3i~Q6z?(9aPk49KC^(<0(>$sm4+= z<;R%sB<=UQPI4uo^i0jQBAKhHq}C{&$Y#5vntP#V*>FE=8{(TF^P1FpEy9FP9zVl; 
zM@TUe`bndFV=>XTzlCEO`=2iyIbu^D)OKc({|??LN@ToQP+C`;E3~9?g;l9|zQEGW z^H!G&Ur19!A`2?XTzD_j@wp(}mth6(&v@4!^b5OLu{-_2C$%{ZWmupiXO`=_m?un> zpOdYR?j6k+o)%`~6d!}EjQZuKvdi*#Zty3X77w0TPJUleL3lx`G5rI$0A4qwy3;p8 zP3~CDm5WW z_17;RIRdCVL__z}okpQ)WPR?`+x)ODr3Tez%m4LAKnCQ@pD7}dJA6-8l2ji_cq zL`=nvu!~fRbd9Lpqkd^aYJ?%~UIa1TkQx@XT|haFWshdkF#C)ctOKCz7k&tO%tycQ zs%GgZ9IjtO&BiWXPYz?VzRAlO<#=5lN%PbJN7?S9!H%+BPwem)GcEw<-vaNS}UCYHt-5m34*mzqv zqJn6WW$Fguk>hXYJVRX`wo;TC@o{bF3gtJLT-Gh&FVV8aC@$B zPX_2av!#s*G*l;up!DX$hqc;<{BTGsF9VG3pq!F4H7HyLf^t%U-CE7qS6IkDV0lzU z3;9MZbBFKN zvpF9g%7#zcDjc)be?e-YR{wa0{+y^#kX*-2Mgm6F!u%Dxpv1n6 zkgyY8Qb-Jo5Clpn+zG*14F1p{iZ^Sd_bdA_Olhw0b4gos$!KiPemW^v&inyO<)z%I zyV4zx!PGtn{<)5qV0CVCZxDpPc5gMst)#@IZhC1>v5nuk#dqYw5hgVJ%6Kj|bh>v# z^7M6?FuBALh5zVDKBm3SN7cHcT0x98Bq*;GlLO6&8W=x)9VVR*6JJf9<0RD_KPi6v zq&de=ntS}Dc_h)@wZ}q54eUB4;%I#MmYyK?<4o)AUe`-3bj^;TG?PvgU^vstBJ{d$ zC4ucGpx0p;ySv=ju@|Sey!+3uJ9YV!cu6?(%Nk4kL$9f zi(wS*C7b2#%@uAJed+N!pM;D!3i1PSAZa5beuxI;6SWz3@p96*9Eiyy-nFm#<)rZ? zl9~qm5Y8@q5N6>Ou$2}A+Ib`>*L@X0qEtv85JQPG6B5=cDJI6Pc$W5uyD4}}Ztwsm zZlogu`ywJlKl_6Rt1Dze^rlf9BN!V*^eJK_8f3(1@3Dx{PtC9O;df4PF~4(*Z(}7) zBSxj1nK$8(WGZ8VHV+;N|;4*^X+!ch-lp()B36UO0Tv$Cj(;;L6o@VTzk(P*+*%(ik+7n`DjFq`h zOpU0?PEjGa7ih)_EM>ql@8N2UyZQIwWKN)65SlyuUkfK=GL1;ZFUiSFqaGOu$N%MI zCOLe$u#^KQwOUH)n7Khfenr zZC^T+k*?JVR}aGK4Ci9c?xba>wA3RIWkg zoPPPJaw(N_`sJC3Qm#qmoPOB|lXS=caQUX;-17AbheQ4rbjXsiRB_vv$cy%Sew%SS zQ=*N!A#RCMFe_0yZ7IywzCIP6KbZDyg5;DE}k< zfO@M>;$-a5*tZ{-lIl#P#LprvzAan&BXrGpwG{RQ;lugDCmm-8JGVbqcsL#I1^6&- zU}%$`lL~7jTS_OzFpfG+G8wVPZ9wx|ct#EFbre}mLa|j$$xQuoYY~aH|AAb1PX(WMgHOe;V#S=u zui`5985tDJ>L*iYWcqB>V8{@9Y}2PPmlPXOHlCx=bAR*{!BD(0lRQmBKGXF1j>q73 z(aFUIWcSSN8>6MNp!7*RUT=0#XZFm8Ph#<;QK;gueq#E}!tk@D2=4(^q}@(dP_&O{ z1tHrFR#4P@0<;#%;7sz>yL@OorN|gHRos`^a}XoU&`FkrvY0hgXt1SKJC#1(80zBo zyL9L(bs2#JeH>v2|hmT-ifsm^+twH#DTtx;2>k@IOeB z89}8ZJQ^Qj3|p*<88A&zAPl_KuY0mMJ<{m$39!Zgo z@Mu(I^*kjK+Jo4EZ0oaL*U#O~Gt26Uy#RGIc87ki>+f9-XZ389JUubp>M27)#p+pG znql=^pv@|L7OKzQCUa)R_Ng5Vng5ucHp*s{o2Pe!wQN*6D?CyN=gCp5(C 
zTJLtX6CbgS6g2@9V@bJg-oab6#`~DgEX>b9%QrEvKZ3;maU}L^En85&unB|bJ=xO7 zlTgULSRJr#YP0gLeIf`CX(#hzFF-ulh@?#jZ^eBH6UCa;$cl2E^m;@2!cSP<{i5Tx zSq0?=jQ=tXgtz;#TN(XlY>TRxCRl+`9F$@dzWI!D@ZPGicTL|;Z7bETs_m5hv*ah} zD)!RKwyF(P`>1WGxp1FQ#?Qtt+?q#y0pV5y*jr^2Whd2+hc3h#Aw)q4W3DW-ZaXFX zpdq-Te8GFGR+O7gpk;44qg+QTHqZ*&G^N9%U25}K@g=(Xqlu1}FcHQ+# zJ61nw{EjgSLGR^*Y(^pI{Y>AN zjHtS3y=djR5F(~%^54|qI|L$~x51&7a;IE%X@MpQX{AXA4 z_1s(@g^FjxaootFeZk1RZ3$CuF4!UPAa?Zr?IV(6*al#moO8>bY4m=EbIS%fw`Anp z@~kvdc>*~BGVAcC-Q|CL|vh`7)J0<$%l{U zV?QCq>?Dop8(C+{c(cgn3MeM;<7bqcs$?zic3Z$$+jMe2LwbVNSE&3<;b|7VH!43# z^P%+;p&~jjnm>cigUSfB-ib?+p58%mCpJxze1~q`TTDDMp-IRax!(b~pXMm}vAdxD z=ht(p$m_hiPA=G2oEZ5*{zRJro{?SK9ui6Z44w#IK(F?3 zC#%z~w|V8;$#r_Ur7>5yPlMWw4^i^1+3m@>*wf2mGMAKB&0!D6y)`J@wN$C1k|O|^ zn{M6fm9HRAHnvMS+H-}wtUMV;4F>5ONYA)*zXCyp!Adv^LYZ7IAgF$%oHZ9jjt@5qrnrwwiW+)FFH>#Nh-WA5AwOUdgT zAd+QMT|AfHn&RpF#*28;v79)2rR9|IuD>Xz1L{@T?VpViYN^g{Z?U5Zze?cXS(GYP z$Chm`)(Tp&x>~QSL0)V0T4S$qy~gczj$Y^3>s-CgwO7+{DtLsT!%64n{$^|lLvBzy z-$;9|@H^?!dGT!NY=i@3hoEpU7ve^M1|AxZ;BkRm5b?ggaQMiPaDO^H5FV%ycrC1s z$0i03)MU#WkEF|=j`@R+#na`BYI23W+4AyuP{0F>vB~Xw3JN%sE-$Ih$DZbO)bD{b z_K7f0^1q_a^`e*qlB^odA3}t)SvZVj%b%L7%>%mr={fTGvQsbky7p<2_vZ^}>0VT6 zHY*Ku_!^olRX%}lnjgvsOx_dnZ1^BUl`9MefH zQ}A5x@+G`#<#*>gp5Zv(blU42E}^{~=VP**&u>lf41SM6dw;t^drzGrwwf+RdmXWL z!HC#a0!C~r0VB4RfKgcsh$MCw{|I`Tg7T(tR)^d|c{jvFd5faF#h57X7E#_Uv1UhZ z+eL0;MtcLJy+O=q?<%9ct71lb*HmvL+nShBV5?};*R%33e}cIW?VYcEv5|e*yP@Uu zTtU7t+NWc?kQ!Rf$+X_=mH%0*q}*~U^jEpgQHh0xv;ntMajs0}VzkQ=pw8J7S%wl1t&rdc$w zsU+#UckEkoFcCTjf25=DU9UoNH6GX&E< zJd74O%}L5uaNHN5H+@@Ys#mb;*+%G6zlYPpiKJOKlYgFRePrwJkyfv{-s{pZL}FcQ z_+Gm8xzE3qv}%>x>C#eGF7@YlOQ&PwW^zlnKIL_NoEgEgdH&WZStLLXK^)1HHq|29 zy#skKFGs6lYQ$-|#EWTE|CKVieFsn|Wx@xt;g7_54`;&%2glg0&S086HlZ0Xj?nnH z4%TaP(iWC0;1rC`oV5Mv@`r1*hxJglpN%J8+4(9WjW#g$EW{K^64Ul9$u>wUpPDOE zQeS1yDr^71N3vggR$s1gFMF2urRjT?=H^7cz(qQ7&L9+{J!^4g&-yG|R;DVEE3cY2 z9<Yy6>Z1bq8)1sJC;cW&6z+p{jP zJ?nz$d)8NN&w7%^A8*eZu51OiXWdB7U$|%O&$frZw(PBIVqsb?KfXM|7{r|J}Ea zvv2*?)V`GqA8|Xoq%-Dh;#Aw%d7IB&8<~=d0Lf 
zu4+(mIkv|Xlm=Qm;}5%IQgo+5r$PMSkw6-pakV5M^Jsk;GaRETzD6H zo^{^OZpHhlCWl0mE4OgP$pJY2187@zp>26=p0q13#iU(%sk$gGUV%nuzTLj@MxVLXXN%1)^Vl|?xo-XR>ZeaXxOC0W4eMuAKjY-zgcHL) zzvEH1g1yD{$UZ?iZQA-3LF=wd&QbWRYS_Al(}UK#E;)VC3tX30L%8LfpmpyhA9Mxc zm^>S-zpt+hUt9wjI*r%zu* zVf7kTrCf;Net(juaPK_Wj$u`R(2O_wwJz{{a8P{EzaFDrscX zh=#u7G4@p^#z9(?D_;~_x+k8f@>`$tE`PvcmRB#`lS#yg+Us4u$D;nUMp4y7z2sf4 zb8C5IUq#ha(YfB`y%u%u97V;6deFPvRCVQZ=PGIr1s}X*E0-M1Q*bW9U6)))-L(qN zBlz4UYjfr0Cn#7;aPK8&<;s70qJk$7eCd)URQM(ZPgI4M{2q5R)hYNURdz`#S3dX6 z3f7VN!Anl2qLUPSGr?V#7z!s<*@CdqL_RrB<5Hy&aP@8a$u zyNl?|a{O(x?jkCl$4x^FBfktIzYHV43?sh`BfoAKe|%@I#ZAvt~`oy37ce3I^-_za*s%E z_^V!eOZ+bsX#@8XRJ@XF>}dj(`(M@n}3> zZs2;IG3NIe^Lvo_{pdWYkLVbm-)fP2I_7uzjZ29{$0YeRdU1*mz3}GKuY+&o5wt#_ ze$g<{FCXjXpMLpRAOG~r$A?dvV_RwC*y717AG?(@EQ8|II(*Z7%DPY=j0!rEt`nKvgw30V7(Y39vO zSnpi2_&EI&rw3-OfBJ{@Pyev~NoD=hKdgWHhxJeYu>PT^zvY)dLC3t?&70SKY^UN_pdA&c| z?f!;b>2oG8>&fA3qzgGOduo^|ICW)DYuYIi?8=_Y9-?9?$adLH0yX{S9+#fj^b@;r!*1?*z}>I$MGBObvALcs4srRZVgWhYWy|10j6FH~JuxR!a<&;FFKVBi-mDl>I*T$(= zd0gp_yaN$XKn*LZbe$Eq+k%ifa~>onyi1~OKf|2~pGPWX@8epIEvb$9@JBkAQgZm8 zS#En+AkHYaorj5$<0agWB6kOh{4h9mAP75Cz0|y-9RDjnyfW3tdxj&c9S69NqcgSB zg5#1O$5lzb3w`GmsbLVraqSO%&&NwRw-+N^$0&E2J`v{>Btq?Czc3zzb*W6*uf`@I z-^QWx`kMTyKjuh(T$2TjDSmU;rA7p)@I3HWmsiFm>TagcobVd6dzIItkHwf}FxI5j zMuoP~AQieYHNYEgwi$35;gzDb6dhOW`V#^&$S7!~uQGdK1Yn3*2Y^i$04FNzsLLu? 
z&BOxhO@4Tl-7Z2gz^+%`^>sqKHbVPK3Q6bxNs>~!RyZ~)pbot%)o$^RgIAxu z-AuC9C8L)QAHhvU`8N{5soel927s9lU(A%QPH6&~u4^xxu6zkMqR9$MH#)4Szq?XD zSg!?y|2s@ICRjSWCx_=T2P^&X#?*F$2;sdkwT)N5=0+P%yFTW|R1Z;rYOjDI*M5Nl zl~JL|1Z^bEwB>ibiG+g79B_Rv2USbV00H%`brTr}exG%qI)U0!fm*uK)lD~SCN3|J z&FfMdRg&hfwk7S?Z?5`6qeSF z*k12b<@NSTO*f_%@~RqrdtRAZZO`jco9)?|+Rig5Fxd^GD{NM>_S9&QzS4EFxi{j~8nVs*kh~bXQ-#}ua9=KVSXB%K zEbG_=BltolHI#4{M(d)?YPfhbsgH4q<%g2-|T|5C%I>p7sN(=Ov9ZG^qli6 zM&Wh>>}I11oLFV~>fRr7&x?>5iLQG)vZ9{f<6cc3zl~(5=ZQ#!dR7}wwGOAc<`dLL zONh{_i_ofbXw^k%)kSF4Re2zB);PEc{97Cn$q0#LghVnzBI%GwMo1(hB$5t^L~W(= zWYjmw2#I8bMA9LVjF3o1NF*owrt1KlnGU8d_BuWJM53+L`E|I50E)a&W8BO6rg;iA z+L3%yS+j}1=3QU+ex(oS){g`eBE7xt6v0~U?bwYJr6LACH7Zx@8k&RtO{YhSOWbo-h-aeuL5#LD@*?^`U4-nF(cxn5|1yvQLAw zonQc4v`x?0i^{9D*$cs!3cW`^-AlM2X)%W0?eYIKb>Fv^cYg;YAqLsUwfry#_%LJQ zf&WGBl+UKsx?QL%h!sjZ{!!G(cqB&o37`iu)jasK1ViDyevtj{J z$qO<3Ej@wI71Y0Q?zamuL`Wa8$i&2yDecX!^W%#MT$-rOm8^OU7>2=TQD4To#jD(T3- zik83_Zf;_FMx1XI!^!Ltf6eTI#o~o%g@YI(Q1mngnK`kUaO*Y3$a`I1|G8Svafsd7 z@aOQzz-cGlX6a59WVTLd+Av4@2y4Ri@{}l`j6M@}oAUA8Zs9AO3=q4{_~uM&r-?3J z*S{VT+V^I{$2iQD*9ivPR1Z?01Sjfq0=+!&xlQG2i`8+FjTvsc+Mj8?+QezEYn7w38IxaPJGg=5I_{n->Y}?gZoK2XU0SD2g2H_n_`l&a8)KaqPAxb3{iR&^P?{y0 z3<`S@JZv>xtFv|wAfhc)W5`>pr+4kk0ZzflaD>Qk2zjF3y>gxHjn?x);Z>GY!iwRh zlD4bv$cq8DB7|IFU8n*+*~T6Pcez=mOMlE&cfXM@oQ)^LQm}b7SGc$?T{)>f2#XsG%0C|A z%qy$>I-4_+C6E|7P{;%wn`|GjYzEMFL;qdsdT#3T3+Lke%PH~Y_|AS<$3WL+N=p*b z6zG<%5x8ghQmLJMV}>P z1=!>n*J*pymcZn_!tt5CH_jk3JF23NXMZ-&tLg$c<+JtXGBXEGdi zF0ZFEaWc3n)k~x=A35%7$`xKB+je50GmJsE-qz32_={_yAU7UfXQEJ4T5O1wzR|QQ z4uE(ov@QS;7gAmil2@9j1_16}y3XkgT=nZ|a0Eb`ssm8V8(J6Iz6}5>vDvac2grod z)|={O#3H#0#qFd(NBQD*JO`)S;6%!oKIJy$i`HAZd^&zU2$cpyfO~jjDX*nheV?i| zw^6}#JdFk>B2473QY;erYI5sux^Pa(dZ}Xet6#?jSO=5cu4;tn5P_Y!-Fi5^}-2) zu>lzbkj<7rZz&_qZnWnHeCBmK@SQA&wbH8mDVM4HcuKC<0@IUfzM=D~y_$1*Y>AhciDz!%95FmQ}nros~qW+{B_3m&$->7aIxAUkiC?&ZnFJSCV zh-Vw9&BYFg)2sJufSQl(#UCHv*LLsg@F!eKz88y6+WX3`OznN#{p;j;A9-FzV+6ZX z)&0Gy4~(^PaVO1}ldXPiKen8N@bB|F|CR)d@)vsIrACCn@VX4*D>C#l3B`!7_C$PD 
zv}9-tBfi@6#5wNyp}8*o{qyWuYXYq0K!CL;0<1wmfVD6JtUYVr?4FBHa_P;UU%nIt z!>E5+Q5b$eoh`g{CSgRlF7P|{G+JhaY{8I_ZO;fx3E38ouuMi+Cc)C;-{P=L)ViRP z^(`gBQew7+BP^2q#?qjA;P2~!lZt( zzZ# zQ0-_^YbI){VJYRTU%(V=hsBfpKEN#pFv7UDNzsl+It+0T8(OkGStW>~?%s}_CWK4G zU{tzoGHIUeB$2$5J9?#p;yC~;X4}vtq~QvrZi93_kR%e=PC%6qFS()_SX6t0#LXfA zW5>HsS049KfnYA>JcJM$HZX6Ez>E_T<`H6V2!i$-Z^ykXMDO+9V-7BwBNaF^rpN1m=rIt7 z86dT2{8s_dYap(RKur89Aa+t{>n34f9KbI-zxq`C11c`&!=`6tNOEN~ZLBp2csHYP zhqeo?5oEf$=4IRKkXLZ%?vSi6sl1=jLg2sCuR-kK`fpCQUy8^tmcfx&t>XcjTG6sL6dggadv4!8c#kX?7Er*Nh-UQ2l<$n;N`nl|O9S|JXnWHyR z!=R$+fy{Oh8b#O5jIM(d#>@2j_;euY?eVw#T44KJyw&e@eoeHOo_3umcDN5@sI|=@ z6C3ASA3CeS_kgb7^0t_fkjY4#)QQwZ=2AXHIq_8|QlHtF7KzI`k^0WYv`9SHiIlKD zEpo0!>Ps8bBIj8oGW%fd^aN`|*BSCDfq?|`$|f@Q138N@u$!4d>zPYiB-(Hy){kLe z(+uFb(4IFsyC&B%toFK=a&t3$A{Ied#UR|34SxoJja>YEb!v$0nr|qfuW+_r_|gF) z#ma6BVpm6&7so#O%s<~Ma15xfV`%icKCk)6uy!zE8tf8NNuGVJftZF2ei$= zUT2yr-5Sv&d{-+H|In3Q*AGxCAfxa4F}j7qxcGM(ha8A3Cw+pzaGAUYcC%I(DC2J} zK5rab30u-(9j$9DeK;wZQYIX3L%~X87Ez&qjcc=5X90q>AC+La+D?t~vT|&g8&{lF|NSfu&F52*_nT&gN)EBO$T7;u=;QQ9?4 z8XbXD(9x`MkZ$(N4R4YDAHMtxwURKjIzi&Si z?^z{N*LEV$EC*Q|Dp~Kl0Z^gG3*R zlGy7rEBx>drH1ho4g9$Vqn8Z%a&WI&e3M^J{)QsOfvA-`m1w$(dvb>#;h6NrS(s(4 zYH~-|00&-fdP~mpaZ$((b&OIAL!KUdrUr>IrVbOB!brZbD>wKgqBI6NdA*P0oSMNQ zRj0rfXZVmrP|#^0b+P#I3^%rS49q1Y=E3hVF_bqBC#ylcJuedtafQ0NWml~dq6$w z3or?)Kv=2*V;U4@+V%S}h&gH>7;9DBYXF>gipi3H3?HluYHC%-gkMUgM6nV98#ibz zG+EM*9pYfO&l%IN?}kwj^T7SwU)fcAx7T?wIAa)YKf$|vy}Aofuok#%P|@;o@+724 zKlZXpfp#CZ;qgTACsPQNQW3)_@-yS(gr5mQKlASK54VghMlOMB}cZ zyHA`IkcxbJgpD3(n^L@6GhlFbS^29RE&7$yk3A6=u3fUUDU>>X5S{zU{ z_Cs3Ko2iO|a8pTB5vGl<)JC&EXsZ?d6|LPZ2MSJ@UVD+vY?F)dwx zhtvV*<#Xr8x^4+{hz^boWXU+G!asGdeGKB~fbb_j2EiLLgFXhq8~6V3%e_DRa_epCTBv0ttahoyI(VA>Xa;YQ-k)WuH;VhVWmRr6u(A#H2GLijV5;vBkFiketZ-}B zfVblY?gP~o#Q~qC9DJ(UzKnO3;}u}{2KP&DeAezT)^}y#OKgpTY`^e1 zvEYjmE2{)iZFY2Qg`ak;sCeFCW#Bo&(>~_<6a-!!Vc*HXK*RjGL&{NX&G4L##iWLI z@>9cA#?w|<3g!`x-dEWUrTxHwd)Gp|m6?PgdqSppswMpvUL|+IH8O^;O7&Ws-eWa# 
zS&-VK2`I^|hubXZvW?m=H&Yci&>(CG0u3~Xpi`yHRBe!TLjhl;bsM54RIk;C#Ku!{ zYq!ap(z|N^hu>u-}?HmzTL52vF;FA4fU$Uwp30@VP2Ojo7JC?s$^#EnX>HbVB}IeyfYWBhK;MnUFf6NTg^{!L7a>@8!m{G(VkNo<=3c;=9svk zF3MOJWvq)bYS-8qebkkH=@I%nsb)T<+bNweu*s-&GAf;1Ttl9Z4V%;A)_Tj+5anrz z@-$eUPma1gjg|-M<(^TVl;!y&W2?Qb3oWfF%F`6(Xu@i3Bj1H)ny;p z)76=UA4)yct5?wPv-!TAP>!#~$vnPMoj0PNp;F7`#3q?VH%E%l;jq{x*XV8vpxf!* z`UL^gL@{6$7@OY+jGhXN(*#4*MDsYIsTq4rtuV95y_#u#)8SnvS6rAtaQSK#rvk+R z)&_9VxKctx129jV22bsE{b6MBXQ13Z0Z-O$4522zZsKYeLpUJx;i|Ms;u*Yv;OV{w z8P*DE1|3S_@58dGCm_8XSC&T0mgk5{$J`RnP%m89PmLiwVC3l)N;ey?Cb}qo(Doeq zaIIQOi?x|)QRC0m=`1ur9XF`g8L1N6sTG?!4V8%3^;-rV$|oWO(}b}Yw^r;Z#AjJ^LhcV1gG6g)OlklE1b)FP+elkWHl=WN_D zv(&v;}zIKjT~<^#n15@FMf>QIYmygy~1aCf^aNdYWXsN_L_DAhws>6SS&-F zp2?O{O5kJ)pFh;CD0|>8zE8?nlrpUdDP?HwQv$}W_n6A$WB24@H!TT^-&UDlUo|>u z+4;T7WSIt9?&=PEj;h6|Fk~&AF2821sj73vC)**?wOt9{(P~Az2YhQwg1DlA=8NcTPgOh zktl!cT>C=cM&E)O4qTKC$H{9yK=uWg|56^|Yrx zAMkTZQ@02}zgdb??^@YRrWB`|-kL*FlHyWEh#|She11@$^8RyeSsS&%b}K6yR2aO9 z0(Co4wwvAhz%jI^S^-CCPd5^Il=gHRk-r}8Degi4{_0a)tY+#?6aPNCQ+*9}hU%32 z*p2sds#6$bRHqfaX%_9Wz5*ZF5#St+6XLm&-j(7_6!Vu+o$9)s*{ajkn)Sb`>U6VI zr~l2k(+a=uHWH^gT`SnrRi~?UM}MR`)w~f`C326G&GA&HSyZQ2rnUh{+S6-{O=ZK@ zj-}}c6S8K-> zUQwKmM_x1fD!6G601o?#zYMuPPFo70&tgSi_w zooThxnZ_fXX*|-I#v`3+l_9wdknu=o8jp0Qap_DW47&;yWq`i-hz;vkR+z&74NNIa zuZ|p4-k`$t=A#v+P8@zch3Uhy6oaOAouM$j`VA>e-w2!m2GiTI%ZBuL3ez_NM~vGq zQkcFG7!!i=3lyeYHUDo^m?FraF#Qww1e?I3Hrh(zRbN+OdfFAKZQ$p1^+!5WZL<_y zYfSEblEiha71LTzik{+d#oUS%iizFK=lPt@G+WV`a;LG=nXU%Rl+IN9^>m%-dcx=e zR==Lklv7aXOh5EX>P$gxBf4OxGffUoorn1K>P>OpKAzt6pfr2O)|;x7B^|9dJx~F7 zwBGdBTvwzw{i`Xx=@eQs^`nz|Q%5?a7&d+Cq~3H^{G{G=R{W&ibXNSN-gH*{q~3H^ z{0zP6jI`N$(;u)Zq&H<>*=#CPw53^`4+5Q3xJ??K7 ze3Q+p;-+rCvEvj@xx!PrVs*JPSFFq%&HQ> zuO+8iQ)RW8|Aux!&Bk$_run25=F6}@(t9pe6&il3XeSfQ*pzBhrI}=0mT{%iY%uo> zl1}IMD>H4=ZY2R^)WS}`Yi56WpA1Qa?lIkNdsJh)P<3+%t20+$A!=?GTe*>{_DY!} zh}umBQQ8>wmo~5w!nxROe(Vl4sadg1(q3XE=_84l-r5~j3$}6%@<9e6sRhYkPh~($*4dwDv)eOFR9FL 
zn~}NRGB-q-8=}k&mN{sB)a#VaBH)`|^hY{I!*4cw6Rix(kFu;v^b(aa)MrL1+GS31RbXB$9p>wX+MHD{QceduKwwe_YYQ4}=>(C|!HRlBI@mj5FpZE! zd}XtFMXn};*6>Puc24SG`vAbuHOnv3ND)9q-+Z`M5wQvv%Mk!HuqPSpr}8>Ib67rY z;bWk_9T+gLH%1y*=fTIZ7scyqS-n!j?uRo)1N+BL1KW2!Jm&9K?hn)F=opR7s#n(e zeib}*AD4o8orV+_gL$3zD^{0gb?WFXZ&*oeGU$@93kc6CrupU2Z4mC)a5YHhy36TY zhZ$7{ZB3WRu@X}=m8(D3m5&#gdR3`SGxfO7Qgnkfxdzviyffiq9Ejky9Ruzb2bb%A z)-DqMGMCTAXy9Jc%08>rx~g2Iq=w71d{Eq0 z%Xr}yG7VQ!%JjIDWC5e$wNj{?mDE%zg@j!xwZWBgT31TVnJR^1Tcz46rRGkR+CnM% zFkY$4bhlASYpzo3tYB1VvYB1~sX|Z2{#r6xq`YM{uZGDpjc#AI^aWVbdzpI?S8AGE zoa4~s!eaEm2TarB>fozCb~~(SrgRBcXuZaqdgX2Kle(U+vynF)>eAMxzeP{;HOISr z_SFFk4rCW`x)!!6O4qwKUj6A^OQrIzHR#?^-0FpNEw|`ZD=>OpJN3GL>Qyt{GGJNP z(=UCZmPI8H+~&C>aTEPP+u(HFdiVE@X@eE&;x&bqw%8s$`L-ETW4AWqZ3?r*`9`H2 znW3aY8V6Qo)+sUWfW}`|yYd7H*^o~m-;(Fv@O`kg?(Q%?UMy~6zsiMVekNCrr(x^8oBsg1 zX2N2vnOgvF1TX&|KjUxT%0n?M5Z}_BY-BB00Dm^5B+Ic3YHo= zz!PkPkk4aWW$xA1Y+}V^_Rs?Lob2kI($(+3m-rp(0Ntml;V7XCspor zHIj5?bTtJmDoL8w+H&)SXLSJ6n#X}L%`bctq{k)01Q|4|y1TWx_Z$Db+4Qv0K~#Wt zTOJuD!AH|T=q(l!a1TI%_YHI|jt?eHa3WJ~&`3X5rh1jNaHS#0!ny@6SU7GYS|Vvb z&zz#1RHP-)&R3Phq|s@bl+wcQSMEqM*sk~3h=8bJz|+XUWOJCFr51Q%hoq0N9=C8c&ohfG&-7`@!`HlHOZ@1tSA-0-1%Ilf%3jj(sO zV&N9$Zyizoad+s{#mG~*U1v^5#LTy)@R^W*iIjB+Rv83D_AO0I4($!$Y;#)2vg1WlZeCc%^Vtgl3Zif{!ufeLj=a^=aEGLCu_{b^lSLJ2u6qTj+L7UmA#n z`em(&HEN7cpK)i0zNapK(jGoKkvv;NSHEs)m|Pj%!{nC5DrO>ic5>;jATgtMG57AD zS-OsoO5T;{PLSAxK&Uea+{1Lf<}`)Y8wA~}N?;>4WkG=P&`jvo4=&1eXUqn1jS*UJ zN4-ID^Lz@By9D5k!x~^hdpmB^w5}{cHOr8Smew1_OC;Ibu`PmSD|i|#t;08t*C@HS zE;hTJ${8GEy(DnZIZ#UPh86^sPU9!E?pe{ST2zWH?dCn0HYZcWs~~oZJbL@RuKyP4 zw)^1BMbBXWqcaQ8yM%S9(>7@BHxg#aE1z6D94UxMSquwp5JMf%>RgAtu9PXhuXFZg zptcJd#WY=V<&WZE!XaJW11{RpO#oi!@9``8B?>(DS$Kc%lngNtdLu^6iQ3$$To=Hd z@Q=ZmogzTyihBTzc!Sv=bZ;>GJ?<@DyvS}4*UjQ{i<}H_E^nG(EQKMuZ27FN;<{dR zQhb>X=NMMT9ee;Ld9l<0LG0Ne_Q1MqnDhgD9Ss8R>U`VyBA0w9h&@Da$5-pZx+wWR zl+JZg`f~x(KFzexbEO94sTcZsA4=|xDn7X?8sKwCmYT_uxqNi~RNW=q#Gp)|^-s}2v?s&?DxQIUMxQA?b7+{+n zFs3)tw8Be^opMJ^9rLRTew0xTQ$tn>Tcd?JC}B?+K%TnbXEjgkD0q$J?M9#Ngu(Vo 
z9el%eP_GHYNeG*Q3~Irul?QB2k5`TZJ!;lBBoqrP9)Azi$!2!N|p=f zkCIcgfb}Q-sTQ!6cP*XDyS9G!KE;9#M{yJrh0`CeqxNF2TCVydw%fglgB87FHJ>9n zMIgjLuD^jvDiw9ac^FrxY;X?Ayr!r2H)fpWDz0ziZL=(9lBgkmHtaNybm}=|s!y^^eoANK^%JIwc;j@S> z;-9vWKQE2AZWP+JqMRyab<&Mf#mmbL--Mtdc{zsE54lQdLNiqd4IhkTmLAGasx`gb z-CTlmNb2x0)ZxDF6h<39f$BS7cvR==)D`BFjL_gD5X3J>t+NOZI_GB|ZWr+RML6B? z{!A!YLf+4CU-jbU)bld6_+r~uG8CgkXmk>wqCW%3yVi&X&)k}wc_iI&5ORnZV#%Dt zC)uQlm+k&lN5e%7do+#KP6FIds45Dfz|J8f>TERKuX(T0S#MA0$HJ5t1+b<>o8T6l zvvMwckejbGIRqHmb)8$nxWO~b0z@qn6x5&Qj#dZvnL!Ei^7@MeUb?H@5}aOLozn-E zP)p@leUThuxF%JA!h3a4G1}}Znsd###X^AE>Uru9+I>F=IsX$4c(c{uBw7pu7%jM{ zOHBXH%ATs;)q2Q;ci0C}#kH!6Z7NxKM0=C^gz;Ig8bk^VD5x$Zo+-3DsBulw(AJ8( z9#W5PBe&$h%^Kg{==%}!lvjVg+iG!M%)(^AGQ%@0I$}Pdx{hZ8JgC}+^D!=5!UZlL!(Gn2VXjMUvnn>|`q*~s;V(im z+3-fSt&OXq{YWf5Q~^=p^h!{UcZh=-(&xv5aufH>Je+Igl3_$!@9TrP*v%I4ybcEL z^Ew{`n&b9$tXMGX?UWR7nikFcf&z@dH$VnWU#z%>U-?FR7k)q}aHH_nW^vUES{j`T zam!k<<_lU9sO>+R;L2ID;$I-%vtyV1o)=s8wkUg>%idOtdTauX+J*KBE|(X#YjaIm z5GV68c~g^_1AzK4|hk`JTj zo=`3`AuLEnxsp+?q|0^gXq3xr2MZdaTn$mK2AAto@P`J=3T>)20iMr zr&Cyqr`Sn2IvvNE0d4Txlp|)0?*{B(tykFU=H5#701+dR3N7kdJ>hn35tq+4&es<9 zX^yNiqm_e&tP*3l!#LY_AXtTJCQL6UX}D7I37Fo4)S2se6%F^lxHp*IzjSXk#Xln@ zUgX1Lug)p{0l#yL|4AK-0dy!`N=m~ztv8}OC9|cb%NR$yQWEwSiJ6|>j>B}6Fs$n% zr-lGho-Wm02KC#o1Xe!_A5m_n&ZUt6sA(nO3%eo5zhoV{wh;z4Z`W8b=ca*nhzP81 zoutAqPtA}h7H8XI#Nujvj97ehj}eQj@iAg?H9kfx?CJnszDK246S|gEmd)5U=9#XZ zmqYH$Ax`gHZH+F9k%P6=#)IUAW{&X}*QW>{KGilIp0;~&H>9t0t}-1Z-Ro+_1t%Zw zhe-vpF07X+dpGc^$Yyi7n-|UHA;Dv(=^?C{xm|FWnabz`ccndVbX#*QGWft3uU9_i z^@!DX4i~F17LgN@;~qd8so225;Q=ziy97bL5DdYmtXMs-YY?6qhV_Z7;Evef22gc@ z$+}u?soQy{6kgX{M#5+RN9)jh{g}0$W(@x54dmta) zBK7Ae!3-Nx|C!}@7w_m;Sx`cpOuOL(_UpCW^i4|}d=zG|;z~=uD2>7+eqj$&dx8z` zr+9K`iB&GKpvGE|B!!DUbZz3HGs_L%RtD_NWj#&1%~K0m9X^)8j-YS8XG$NbEoJLV zAEycbm7V_MFnIYoSO#@XCLGqjUdVDM!yze4!$;6pRayk@&swW+-M&c}tJPF~Q>5?} zAF1x6Va6ikgJC$)_^8FS6d-%@;b5g>blhbip!&&H^C3@Kt&{)zcs_^%?yJD0A*kzB zA2*M>_F6kDE-q@XxJyUBuX)%g~<7$S${mtIpCVp6bXuPDc>qMoNo?szq5-W0Kq+>epccPBJ|)gnzk<4W8( 
z!lOTjkJ1N#h#v5|NCj~#jo~WQ9|PxZYz}YX`&oaewrrztbblrxpU~KMo~(>uUL(j? zGHnPod>Te-RLvR4t4J$j$e~Wa!G7P73->ab5uQ|a$l8NC(~YFt6*Q8SX3Ju*TKMZ0 zlO7GMx=v$F;<3uFX-p#&_eicXhx16t9`#-eD5;LR@ z$T1cmCkCu27vJ%kzwaOefU&D+mMdr&1Yd>F-$xcp28;_DWwO-4RD7A}KGu*fmhpVZ z!K9swTVL*Twk=g%4-{-%h>dFztPnD>X_Z&kIFnaVcCACItWm9lwiIylk~rziTu{44 zS*})7`nV=8sq+_8P_{6H3&+`K!q?Q8^(+!mkVy-dTb)!mO6+V4*Fip|a=!_tM;W<< zV;@D>7DTQ!QMQA!{?rqTd5?@-S6c>C^VsbzQX69O(vGS5G|k9`;=n|MRZI}zjYROA zk!yVgdL)nz6V~vL+H=y#wUVHw+D~gipfGi0Ou%r#1MFJf^v5DZZdNptNzgzMbC!18 zAMD1r44AO$}mE`Jk@(^)3R={YZ7J|Rq;rN&#HO;9N~Sn%SkO0BmsG+18i)AbqssE zfZCi_jc6xo+$9NA&NtzUl0{+M)RPlrkO8P0K9>EwU&gyXpJCm)BDKx(z^;pf(6%N; z-C}CcRUbeZm#n$ik3Eo!Jt*-Gd3L>EZ?~)wH%+P(-3XxV>LE=dKZf89Pr_>u8Nn*r zC0~-ns8+y6ig6*U({>;sh)ELCxfsqRZr4*|(4Nrk1{C`+5{{ag&|2%2azS(r>z2C@ z-t1eQ2-`B~%{Fi?Ew^xW@vm;+`YCXo$onvFB4EwOHE^9LG4Xwq7A{>wbE-Qh(X_|} zypx;sK{}a}uyIYlrKZ{4fz!0n-GS3I)5>*&dQ`R<6tAY>bMAED(6F7 z9WCzO(UYg{uW`xEb7ZWtoKZF0Un59c&DJG!I2@w%wg^&N1WBhul&ss~9fYtT&|safWv+LC@UBAe?sykE7fD=UF}e>kE1Wb~ht`AU+>HmCvl`?l zF}XlO7K3}~F5t=TLl%Sk2|Say%3?8|;o%Jm?2<`@XA#UQP%(Ez@ChPxv-iH zjTU_Y@|B}ps;ea%{qY{7|1=)1AeStCDM7h7X&^9HtN1%h!Ca5tX&rcO z^Y4JR-dE5Tj@FURAx0paf0)KLctBh@sfPWioy=hFd7QIPG>h@?0!7d*ZvA)xKD#+2 zzDG@9DHQBswUCKDZCsC%#Bw6^%?`7d7C{HEm3_hn_TgBcQe&vD2(m>ZQNT__W)f}} zybD#qYUiGiGlM8L(*=fPcA#C;2z(l>G;*-nKB8Sr@lq*C^}W!|uMS@qQ^aP`Eohfs zSH;qmWt!de#4Q*DumDD4D_}r~<*>xT24T=H^^sxob_S=zqU>75kAb?db_KFvlFfE@(eKmbY2 zVzXpN5ZdU=0(I?|Uh9+6t|~%ZKb9yMcLSn8>;5E-dm+=Xr$=irpVGhM40jBs8R$6hA*?Mz@qhr%enFXqWd;mRMIO$2$VB_F*j*?@7e2?I8+Bv^P$> ztEFuPpbJPx0qB|_(1nSfU>e~yzr_7S(*U~C-hDzaK-VFJhCM1ru&!-CtPC`wkS_Qw zN4j|Vwjf(OQT*sMT<09~^Js|a+>3K*bkIvl|bd0B;Xfm_b^ znN_2?YZNzjvtU6{ID+#hikxzm&#YR_S*ti}g)>cz>p>=Ibcz`q3r9{T!iDhVbe~yu z+E<@%xTb2KWBMsf@S)gp(0d$Z!2 zBd$2&#I&#*1(;T!AoWOC^3opS4IJO3b8OFC%Z6H<&77h|6VyV}@E#z#BoGshGm2p0 zI+>zapFX2y6T@iTc(V-36qM=ofju|ao5IjyY;0&rXhv8SKO?Y=LUq1qz}9`V|-J~JrPf*^H=zEp7Ex3 z6W`Pk6nh2!(A15eI`LC){9xOLWC(uh#ZRO0gIycOPowyW8$Z~zVf@6!PmA$`KI|T> 
z{xkks##nd(@mH)5ox|a?&1W~&E|}PBf{A^ATaPDD{$nSkyUOS49HH zVN_0hQil<|qLScxDyr1*a2TLicJx0X=ds5IxLA7h69h)xjLxw>?FPCbNY4m#2Lpq9 z1W&|GzAS(*YH5zwmdaCgu6Z-yD!2=n)`h^sfWOH>(EAZd1TPhY3=5{Kb`xSFk>{}P zz}_bGY}Eqm@bH5Q4}0nR3Nq10JgyYTWQ>W$j|Z>=Q2+{71jTS)G;)^1?nYdVf&uvT%@P!Bh;eM*&vMtlHnr$#9cdF8e}|C^DT{Zz&L_YLzWc)hbX(g~M9mtrQMJRpRb2 z<-82Nf(VG|V#qtJFJki9rVtu>hKuhA|77Q%l*WQGHyQSAf8n z6+pJkO%{h=A>TmC)cA-swh6DN9snXX{?JyhXKMaN*nR@rhwZgkWf6xnx$Uyq3E4~d zFm90#Z+!s6PPhS>{Tt@aVLM^%UbgKlj$=)3yB62`T?>Jg3gY0$MX8^eE4q!YyVIG{FV7R(Du96<|fOPDiK%YAc z$%AeZIItAA0}CF}b?KJfC@E|s^L~m&M39C%2^uw!cL3Kkcc?uiF}!B5^JEI29O+^7 z`1e~W{0FQH@Ly$p4gb|v4OV?s zk4G5p3?J+@)zG*;Ec1xKQ40r_Hz`LQ9CgM~4@bRmG{Vtn9C0|};^4xT9acZWX#uUp z$7%0mEW&M_K5h%|&&HzM#p#RE(HVM;U-cLJnAM)lOADjnN7>B~!Egc4r{g>$D-qeVN=n=$034+JNcjTT!3NA(~ zLHtaIR{|gtlqljk|JY2E!bB-@jF52rH$r*?HGK@GX<_>Lv6+qt6IU9#hJ-Jw`Mr(n zUEf>y+Q6{%7z`0ITN}trSk6Ssck+q%kjz*F%!5Col+g_gE=CoB^Hf$;r#Qv* zAj&+8EBD5@(svim5$2XS%yR&Gh7w5Od=7+tq$Br-^k_h4TA*Tpa*C!}2)|IP*}Ub6 zU_cRBl<&N7y2JbXZ1Z~+KoH}! zVhCci7zkpt7zkpt7zkpt7z*MOwIDu;ve#>uy>Jv|FC0bL3rA7*!l7g@g)ST|1wp*q zx``4q;Dx{ph$zkmDT=d76n6$GiaV7kUKXS%UZzBGPmrRxM~ULTAVqPX62(@KqS#WR zcten)ctcSX?_PUXs`+5}_772WncQ9oVD_*f9HUqRi_JFitLNuyXFw=oXO6W5@_1i3 z{|tCVU?+HuX71M+@BVgh-mF*>&98>HKcZP;V8JFz2TM}UG|q33b{NMr#JMxO1t*g*()2?{ImV&4pXjY_94O zm&m3f1&r+o+q-hKri~i@z}jt`3K^ezfKvxd&O5K8FXRmmpC?CZwc`qc#Q};coYI4J z7=PS?m8dz?#nr*o+zFh6j-45pjm%$m6bnLtCP0epM2G252x%7&kiv5;mH&=6g_(wq z+{av9)RB85lY5)f;x~(ionD9ou-IeD8j1a3^YWd!4q|!wx)Fcp71E6gzGnvU1@_9E7+R^Lvp9K3^|daU>a`EJ%9s zAIU~yuxGvak90;T2|B&_k1UHUhJBeA|B;?ZH|#xL{73pC1F-jbRgbLs$lCjp&CgkF zAG9N;C-89u(>pllgfl^s&A-`rF6k`~fj(Q&G0T{2etF|eOgIL91abuA1ky6hGreWO ze8M}8d+unLk%*i=JX!jy7FTTMt2jP)+Tk+S;MimZ&kK zwy5fkWO7(~gkw?RKS+X7TZ`cLbEvHwz^>KSb+Edqtvi%p`7CN{hE`jsw*ES3oiDDb zY$}TFl^^_Wj3A)6YN)tsL~(VB;_6h2YndpnWlC}Nh~nx|imOi)SD#W`mMAVuDXtBo zxHdp>VL%cb;CQ4kg4n(GZc$sjw)C^AtzUsx)K*)*^Ayx#nNr#t&L2l_Q5}W1Z`J{o z=&cGmsZ4K`s3cK>74%k#0!%gk;Rp_!xN)`6)grEU1}LCw{t^E(Y96DCr@L0%wRHRGt`m11-2uAm 
z#a&N#72S>EZlt@K?zp()bc?bltwm@pkKt2~1UBQ17q%Hd>+xZXg@@N-7Jcet%yqL# zeJR5+oc-rnFn|6f(S`+=EjcGTuj#_3SoDI}1#@E;&W%SKuI%lJrk7iae*WCpIq+Mn zWB1;2rN_H2;yt70gy0wgOoLvh-@O=sr7aY>%hwi~z1`myisl1tp@uuF5~0}E>O^SX zmJ^bpR{vN1CxzxEL$O4t0W4sJm$p!F!O64XeU%RDawgWdBox`=^L-o&diYL#Ssu?t zgc`Oo4O^-ggj#PtDHKbF8WOgc zlPBL+Mq@vRw}8=LB2Y{e|D0@~`8l79nFjygc|*a}-VE&?5B)W2o@exvG)|e)NZRov zty9ym9$yS;t2AwnjkcizAIqNRH1&}3d05lVBTdTZZ&0jzJ?r3{(C_YOg<}!xXrX%@ zjeC7b)DeozQAdfB{fR)u7kl?V`!CNu-yRzGZaHCV^_^Auz;^##D33-&*McX_>*xu3 z5-g`oDC+;A_oPrH5elYH_V3fywv#1>;8ym5krR0xCCt50;NqFn$k? zda7Xj9;}_JVEmp_Do{O0W!Qy2!-YQ6g?_3F{VOhX(|Su^`#rNt3EJiNm@lRDwcm3_ zDWSlBrVIURF7!Dr^s`;)Uw5H@!-d{ZMpq@^7S3Om@iV70ZS6;7E@<=dqx^81?|@K= z-t9ua9`u>Eb|`Wfw7k%Z?a;`ve{ux)jkiEPuk?p#?z5S|GXIE#U zEt~4{(=}iXmb|z5|19ry!P<+1 zA?Pf24dqsrgwELA-jx<(iML$;!1^0ZrI)<9rFXDj7f=bEK2YQ46`Rkc{DYz9q4dMF zznE{62t#@lKSS?@oyJKwUnuF6iiTgwF~x7Ef0cp2IQfl4CEcE$D}=k8zu!jZS7YM6 z*G50fMmL(2;n&;f<_jgAQrd*m20INtKhY+J9v5mk-DpgPV^5EfFr?elgWZ>6NVk{I zf0F>icqxTUyrj{P&M<$ZRlQ~}&~MeMFGyw>!ynNZ1(Yqu&>z>~7@2D5PiVU7=NtMS zogwzWX$*ZyNV!el-_Y;Z0^Mjprk7gJXb{eLic5dyWvFDfcRYEO}bqxP6Am3^3R}Tq?MQ`%;vV(5Q zOOu74-_BobRup@FN&nf9qTBOJdYFnD#-3l&zplfv=a=+zG~J$G(%)Q)KNx#{N&oAt zkPBlw=9hHSF^;O&W$CYX(EmQ62y6p1CY;`jl)F(q3UrhHn1gQ8A9v7)bO7@l^te`` zEe`sfY&h6ftaSPtHW8>nXq(qt;Jm9;B-ugW!~a;CCy9KA5tpvw4E zx^qL;+>1O-Xt(UgdYabsTfI$~uRDLpTAsU6e-yo|x>gP@Uu}A8g|F*#TiA`@CvTT| z45XER6^s@Sg*S6xBTcK%0=>0hE44WNv%f(3`9=NDg7V_9 z@?uXv^_MgKdqAs{e$x+oOa)W;cb4CicvC62a3dKe+fG z(EjI`jO(Y<|AX*fsr;iA@_%ZD{6FUs|IW{t{%IHg6WV`C-v{wK(?1LSBm_|=zb3W+ z^K`z=!|8t}{8!5VJe~iT_Q>|f@rkqk|H>u)i0*XW(W(Q`4`=-IT;g9^V<2UnjI+(KhyWJ)J);}sA z_VVX^fiwOe>-g~~G51}ioY4|$dHx@9iGM)HKcW3I|D11e#?MWNmD=Bgj=wMusl##B z|FbUf$39dZ?D?NlA^wka{CJd@vBS#QT0$+)|NAcSckB2!F>^4?Kj&YZ`9B>U53<iC`cHE#24(Ed$6>6d0w`)^Xt!oN4zQHcjC@n9MxkXp5QX;rDJl%f{3Qi}poA6lWhzi)PC zcW!PMTa=fo{iNNQ`OWuz^V|8p`F79#!GynWAeUoOa_oMVj3mmKb6wJ}@NR|O&UUg3 z0`?!M9yxU2fH^Tad2n)%Szlb7pPQ)%a|t4gV;Xg}j-pUy7 zcfa;$-bUKqsdoo2%pPwvo3-hBFmr;YB5@yazvUiwrw$*2rz3v2*DCnkSFDSe^}DC7 
zHEKiO8uq(qtoJeS!l$i4Z{-{4LgBo%%G)=sm-t*@d4Bh8>nsi6kuHxNNz}!v3H-=nD*Ae}PsGZFhj^ziLhb~-e@D%xZo*Y5G2w#`DC&oUcb|YTPxq+X5 z2CrRWBmPrgdqHJM_&Oi+n#kU-#erwjXpx__FTfAQdT@Yafw)6rRZO4^kx5T*W(O{H;{bH~$^@BhR zc6A26Ed$SI;KZt540dfl2*hAR8Mr*nz0(H!axWl82K!0|KAeHykb%oe(K~G@`>f6w z%v6VQ-(Wl9zU=Jc9hsF9Wvs-z{qee&-C>zrykUeV*WW|9(Em@vMEjmCvfT zgBT1ABv~eoE1TtblApYu0(%MtKBVL=ucg4K-VCuMKPCQCjwgA@ zYaFmOjwksh@%K2Mp$x*RoI6D`-h7{0}(nV~cH2 zImVRVWSC2San!_&!1(B7q=81UQjOwgg=@I)%*}La&E{BXq}Fa7uXTcY&=}o>&d4To zN}JGeD3r3I(i?zI)2Hi^$P-8SZ3=z|`XqPZ+KEf*j}izWeabG0N9Ud1NesWg6TFH( ziSEA~!5qI4ed(twaPsq21y}VcdpFXTZyZ|pW<~!=^htF8sy_9l|15ptMne9n3Q7Gb z0wKD8Ro}$?M*28LT*wc_Lh7q{QNd00Nr;a;(=^^qHwfWdc`v&8qEEQ^69-3LQt+f| zG4@9VSF5@s2$3JPUOuhhNi}2aT?Lo9N3OS#0|+%qm1E3Q@awsTxY?cWHG}#zvpYdM zYEQ^sfEvT>*?LfC_H?JiYyxb(*-FS5+l7R^`J() zN;XPZVysRBk5*QTO^S^=9os~XQnaIqST8b8(TgTxJu*9)#Ta!?<(jHT{MN7mh_o-?Nbe+)ZC&u zsAyw36lpAn{7Z^3{Y$zMYxH(>dphIvb@h4ecft0v`0MMbXO)=k7xp7GP{b4caRsBU z{1TLYsZOS?zbfqH_m&v;)C$7paZ#I+t&rA$Nw@ZL*#c`H0rH&guR96~TpcO+`!#It9 zC51nQMCX6ASYOYd+sP?J=S5seWLp;Yx}GEKH{w4E`!xQGDg5bus`H-^>+Afl7as#U zFUecB-x2n@o+<3l^QTR2`u;x(`!xP%Q~1;SUFY8v>+Ae~u}S{VWbnVVN&crY_+L!n zPc=v9|8mCqt71b1jd%m~{d*x}efcK|S?8#J(08}K{!cR2KfB5G&u6SZEPkr1>(lqQ zzWxQ+6DE!;dXc;;))&V>!aP^v0XlyY{2diUU)tLm4s|ZF{YcnrYV0?9LTh$X-$0!u zU!vSfnEh7RJKBVNeyR8`7^mgGBlhpIP^9?nz$NYFzNW2BCWGRK+xubzEnv5$*pr&R g_kh?%{x38til8|Wr{gOec zRW~{l&D+Pc*;72ruGSN~?Af%X3!YNSTGl3j8^Br-+X~_;YV8`At^xmmh~)SA-uLdz zotK%4ZO`sGdwLJ=-FH9td%ySlz4v>+_uquS4jKz`JRXS$k94DyN^w+@`m0jk zOLg;;8kA{aIjXsv#pW)nuNTbzUOjQT>_vH9vsaSj%?CX%Y6+!4D|=H;EYFc7D}1lK z?Z-XB$_>E^u(q*_{Fn3|Gjh1;w(Ehv*` zn1%wug1IOtX=bfbs>$0PiG!nNwXfAdT&S5xiVkYleNn?&Q(~Aa6u%C-5)_Ll%;%8Z7N2WZe zTk}t-ORT8UwoI+kEC{LA)GX00Ln+tg&3kmOp{!kEAX#M~X?WKu{is7-f=o?Wse9wf z2BfQqqRX%}Wvyy>pBZB>R4vA)q`yFqvkl2UA!N43x~OsQZZe`+XlT(#!^ zL^VHFM?Qv9hXNmIMTbbIM~RD|61t|!)}!x0(8_;OUi>^XtaB84IAqO#PpwWWWI&-M z85F8z3hg(%>urUq|4@ZUld@OjzUng*6Hu__w%gSe>h0>+)cRe`@@x%#X?=k-RxNL` z4`)o%BgdfbHq|^yeHb)%s)19%qMcgV0XgvA2Qn3!OC 
z#)dtC17icHCnk(gMO{re>|d@{-yFKR@_Ik%FZsW6qkmOHG}6-4=&uZh!&h~-clqyb z=t5qx3#WX+tc1&zq$!8#;2YbG&i)i zH8p}f8OT3`%wIDXQKLU_bo;q9%9+(+#O23IJs5p0I>sCueSR8!h4Mq!V{l)|9qro^IK= z9=S?4+b}X>JCrJ$F0S-~j7Xyw8djXa;YhAw+GB?(>o{e0i2!x3|Y7bj-(`H;bil}c`j<1?Y-fIU_ zR83S)#7^eO4?UJoT;9#m6OSodP@q|UcUUt|=_7AU(GxQ+)MM}D=_Bt<(PKM3u{Wkg zZ`EU`JW(C*oahaBd!xm8Plzh_n)tC7>&m{qQRL_+x@f`FC((aW|vr}Y=I%D=3zatgS8FZ(ni+o23m0? zI#7M!6cy1Eab*i8BHhI7l31_!AwW>R`Q^~aTRHE^_vb5y*{IYR=%FK^4m~h>8Uz}; zcnC=~(KFGk%hzA>5@c5ff%M|R=N3d1@X0zApE`PT4~uJ+0y6O~-Q1yU2Vx-Jk2q|e zkc-wU;{-*Z01?CNsgR0EC-55$%{R!2u`G95V0{!*iJCvr-*%-AV&RpusQ*$gBqtkG+G}fB_551>>Mq`yDptYgU^QRhL#R zUskQcaiOhAZTPcV9p+ufbc1It51G&CWTv-a0uP$wkcS4VVZMtJ<@h2`szdrhVs4HE z1IL0zcn@PR(H!N`%)P4o2#nWI^b?2rhA|EPf-wgR7HgbC#VC76e)ua8g|sB|(yV!phx;bvcBP(iS*8J-|wOq1?6qGOXkklz8p37=4)#2$2ZHtodHZ`_bMe) z6ic5SuaFr8J|v`+AzNP|Lm($rf*i@*7d@Xkl@hAmsXLSc%8}!LhqrLH8vi-j0tM=FtX^!3))QNmS&+}bU@o8-b1IE89aD=QHqX$2pTY8t zi#$eRqmoYv42vowu}R5e34_K+zCuHG2t^HZBO8t61%{Cxn+`LVXP2Ym6-s$1F~2xu zp4MqN8P5GW?&GP<(<$buQ9q21Al;5RBI&vduAewAWeh zVp#TF3KLhc{J3ild0u87k;ho;V&Lz^bI2%T_ANiSpK6aiofVpNDCN60@#?*A5Jior?viZ7;|V3}f`qCKKuV3#`b-zKoYVyBBQXnNEc~ zFU#?LGtj*9wpp{2Fmab)aoAhRtO+se?Sz{VD-XO-CvWw9}?TQ{=gQx`x?!&!Vm zbfr$3zO zne@Qqi|WY<(VAo7uJx%UBHx%2MXCxZ(nuBub z7hk~pOf~#<(F`Rh7gdkH7nD0^DMS;B?Y3#&B$`#@$X>;=uSn1SfU>(OyPIXtPi3p9 zVYFG5&s2=7a#6+jd#c=-pUOt=KFVE4xeHmYP&IPjq})}Mi<(iEYTgaOcIGuB_XEo9 zrrhrFFgA1X=gb0#@{rdWJF}=srMd4-mdW8DzMv?;zG|#ZU!K&>$ZE1a{Rr5tmjqD^KaL^pp zieB^VVC!?zgT{@4?amR3JsX)_((Yyp(qor>z}FEpZ(^py=8pWid#3vVW;)Dvv%a5w zrh6081%}DF9v(aIqon_YtKIb>tHfR}z#ZrF1p{*jhV9L)SxVqQKaO{3YesFaS~n)N z8zxSW57>aTCT~8Xn$KuuZ(t?*OKh9uZytjiMccCt*rYjmkeEAKMU| z^fY6ev5_sN;p8_d8;BomJ?w3%2Sl3`iK^_!(w}EyK?BIsEVsQ00(p=SI;D;m*vhuc#LeCi%NJ}^`gx|k^Yvi7bz;R#MT@WzL9Sf1RViWd4y7D1JWX8b z!iSDEN-xFm=>>tktPjYr=_5$%;eIUm#^$5XWIB)GeKmGDL!`iA+Vz-NuMiKunvwg; zPT)WXtwhTh%y@98V*x+|=>NyX{%7@#jl9e4E^r{&_Z#_}_t2=5x2W8~= zymtgr8JPE}hxj_a!kYiMYMxR@PT}Nhf8eB6w7+bpocJL~ND*3QoH$rvt$&O*k!;%r 
zQRQve$B=%DM;Bs|&h{)gL9K?_chxLWm)=3Aa`tkStu3%bCG$~hsMcYy#vbFC?%m1` zAC}SNp(7i+jdd(H?Ab?$M(dOzLLtQ4uoM*0QI@-r$ z>+N=_=Iw$d-gvTIkccx{I_jhbLZg1BHqk6NGN_32k))b4pyr|fgK9#@VA(I_z9KAr z_%NfR5bFD}R%czwtOdi3T;N#ZceJ3jdjO-QP0<6xi0xAptZl#lp$7LQ$DaT(86T~0 zs8^{I_T$@#;2gPZxmpW*_uL^_SHqB#HqVYaZJVc=Z5Z?z3FC%i^XfA6zumcp)4A$d z_AdEiX{uoP+ikPILZOOCNCRh$I<>(bBtqus7_Itb@;?h`iyVr4yeKl|@lb3O%eNh; z`QTNnxh54>{-dx+do02TaT33d*(r?Iky+k6OmbJjDAlSz+DvBi56H>ermVRFjPT9?xWiP*1WhcNV=5c$)SVRb%q)CP&9Ntvp)$Z>ODpowtI5l zzcZ7f_C>pneE}XFe)uc!$A~UI%A!iR;6@w|-v#l6pe_%Y&(o3c8Fgvcd`7Jen?J|h z(FeG9dJ(2n;4J2`F_Rz4LpM67zE%#7KcnZ-q*52V(G#t6W>99nWP)My^|1LiHXLh1 z<}SnhNH<@mgl_(~VV(#K(>H(A3DmT^1W!TQ*+egiNlTQjSea+d<3=P1gPv}fqiEd` z!`v5o;G`r?k?xKUd7@Z#Zu^UnIYzr4^L1grzYDvpT{w1|aR(;(sFJ^o?O;%Km%7{$ zcDFxd?qo+p*I`!DBR7O2^ORsGqmT^c6HT+%#ej=!~iRYO}tN2JML zIA@K&v)MnV)qi(q_nl4MJ^nS(*0$Drni`AP^+y&ybi2ZCRwA8^P5!QiXp2AE-MV^p zQ@3+%!pk}LEcS&8-NT?@XQUz8#BOV@?1^;K<&D3+slBuN>%xVLx;#wdD{THdw%z7) za2Gqs87`%}mxK6#Bh1CISRP$|7BFKnpP^M*TMleL;hg1!&0`o$YAtr~TiHSpQ7PCU zT5CtNnnY@)9nq3jT~0%%1;)FQ1}V4oD&5?ro1=D;&dYjGQ7~|r?7wd@%JuzDj?>); z4Pp!e!+cdYU%LaFpE#?ToGz^|Y1csOO**!-<-YY6Iuyp-!>)zY*!9?iKLkcrO^dv_ zrVUS*is|pc@1j)DdKBSuWhEC)ja_GpDpEc_Hho$K?xZm0YIOaXGJ%?xc|clpTPu^A+b5d+ZEE!ucjS z1AkzzX1+lhlw-BMPSHs(`!I#TT!3-5xYlq1p|hoaoGI;L-_LsFB10*Wi!{Y=wO4pv-3d1_@pL&gsnhQ8h^)E)mU(Tajddd(M*TApG3WUaPQDFuEY;n0knJ<4 znCn9Cuf((Ct-VUc&jiLEW(QKZ?)WwJP_XE?)I-1KJv4R;WTy61S^Lug zG&=#Nm3E&BMGgB({;BIwL;7LSxvY)GW`T&>-?`~7g_WEOh^>sr_(ObJ`7CYcNR5tD^r!4OzKh$cA@whh?p(ASc5inCAA z?EvFYZ6xg9O&F+~eidO4rU!OYF14W;NM5?EuM6W0zcY+gfVmevqFPtudU-}JNDXtC z_0p+jfe$gyw~!Z|wS4*Vz{DLm3Wa?AAUgX9ooma3qb^#a?4m`KbH@%wY5j{IlQ4Uu zyG7WZz0JeYNjK+XX||E6OLswoBqkPxLoBv;d;EMdxvIrVeGkjG&oAYN8pu7_uF#rB za|9_u+xuaPaimL!$F%Gx$Dd1Q46M9qpL^0SlXjJcb4W@zDT6F+%f)x4SWI0EJq0sn z;~;telPQbN4N`s+mxS!s1MrP>;aNyio1gA}@GBwwQsr$_3K3~rih)<2loJ3V-*<;_cr{H4X$6qoqtmdp*5Toah*FRaFIp7e&Ojb9lkDMBIqu7(BAJ=KzDZJy`i 
z8PjqH-~bh%vz)vVidA1;EXVK_-0Jf$@D(7X`f>r2`+!?w;k#Z$6~4T0;Jnm@)bAbGLhpCHzLOVL`g-$KUq7A!JR9%~=2!TJ@NCJyg{X>hG76|o_a5?~ zd=<)9qI@OFSEBqvl-E&SL;0n??cg&Ed=I|&BR-g4?K_em^t}yyJbyXyzm2#U;8FoD z8n{$|3)&p9EyW^msRWk@Q9m;k)GDZhj?6?|0csW0%bYrwfw~jaIqN}O;Rr|ya^eL7 zaWyAi{aJ`RLA=-zxK6D`hI}m*zOIn3it6kAAFOXH>Kk|JTa_HEF8wM=KgX^Ia+-V_a&+Gy2J{eG zehZ%Mc!u%p!LuLFLF66DS>_waX#?#d&_)o$Kythx55B5^RQ{gQl}_o(jE$z@cSh2^ zc!s9ng^-|EYCO7p43k^)mG@+17M!m6^hLfZyBkzri|XsD@U4YW!}IjR6AgGmnfwRU zU&t1mlJxCAP5p*%<=NM-qs}3x{wJt@&F43Kg&|)_rLVNoS6=DU7y7Ce`syluD=U31 zmALDfn^3@u__?;K_M5WAB8gXC5~Tr|0}AV}V6v8+mwkJM}wiNu=XJ z2kXZ-wHI_mdnI+mDE(|Y7EU8Z|9d(PIrVcLN$(vF=1f(rE%v)GCUk}um6;+PbM5MF zLa)R*rWf=|)WMU3S8|++E6a--+=12`IRehs77me@E;C$#@^b{7tuY)LuS6@5$-yg~ zI~fGxm2mv;3e`yG2soSXf!%?}BYinZ%{BQ3!>>rNmE=hn z0bbUVle4@Mej1%2cqN*>(!;lq^hop*MLTdg{}D-#bR~y{o}hZ!sZcVHe|V+8e2*fL0 zmIbGk@#Nr@E}skn@d|b#5_qMQouuQb_D{!C{gsZ>Pr@b#uN0UJ0`W?7C*d^rOBfg8 zcq8Or>6Kv|Jb-bt+>Nk@@IKGhv^;0iP07iGaO%YxsK1+UJ6w`Rfb&Vqj{3;w+=J+9R!o27R^QxFTq3o0M4P%h8rocc0hgo* zWTTLpy1P5O>pO9YTs&{?yfXY*M@x58PfKT8V|{zWT42{)H&?<}ee$xI=&qgTDtPTR zQn7KJCj7nbqISVQ?2f9XzfVC?YJ|?Rdqcs$vc|bl5j_B zM{6{hVl{MhHFU4;?p)I)^0a%SzN@>bxe08NC8A&NYU;s#I}@q3qaI~(MJ`mpST;7{ zro6Q)iWaB(<|d1?++-*AKyM~xb6dmeB+c6P_Quv8T+*Z3o@gUFPHMipyEWRBq~N-( zYHLWzd{^>=qSbk(z;+0>^qr;H>*;?*;|OG1r;ZVnsvgOU^*{CQ_mdp^Byn?hQ>t+|Wjp)HLz0Fm!2}TXZcho!F3NbG zeczBQ%#z6z6FT`E6y1b}^SM^QH32{1bQr0v2>7cGPVxr@{9S>bcITY` z2?wWY76|x5xkw-oUWtd(|E+@)e_gIHfO<#@f#0)0rpJF?K<=iu&sxz9(^*z%Mk=KSLhemO94+Z#-9-I#o)#3ZFer_-1PQuwInrIqgI3T!M%5xlzFBbc*AZ4o-Z;ak^BXr_&%#-|pZf=Pd%>E6|Jl*9r8u3iLk` z=+6;wIuD>g{KfH-M+zYjej6Sx&!rCTZU_1^VhTjR0uQJEo`8$v;#mi$NNl$^1^llB zKJPiWyWOTZ^A}~$73gO;ILRaW)r%e6Eq_4ZBi36k(2M=GM4+cJ$o0L&!QJ&bZid;9 zI8N_DSqenI3=ii+K9&OEU&F)kYkkIVDN=+z?d5%p>oa8a+-0`A8*1=5SgH4o&YC=gEbBMP zWC;|XB^gd&KDFv(g=u?wQr@m6sbN~ayAH`~3tL=dyHiy%^o zm83Nh(?d$P;q;Au<~-W&(-A3#~+nN5CLvxz~RGYF?WE=>jHc*KxfZl`#8S{o1@$x4CG z4YVfDRDTzvk|go>0|dG2r#(!j`b*1+T9Ss@Tau`L($QUiFEFZ~)|6tH>CA89TuAx6 
z{dxS)4(2XT_Y4&3?I`)?_4D#CB8R@+?LXj@=f7{@4BYzv0vVa~KUr><@H>1+|MT$h z`g!{w14i|`)D1-c@yIcO-o%-GKmphA_?M&L# zi%K81D{mI_$ z`X9|w|3Td@Z8pibyZ)_N>i5$j7lK$nt#jS=AIMVw zptE>$XBvMS9QlWw!qk4WFL29$ zOP2bjzp@KD+Y^V&zdK9)c~1TP$(rpCxBlH(>aWXC|0Nmf zf5fTZ-G3XL`aiAz{w<6ATb%Nr)_>cx$lvA2FZSP@4D!E{rT+bec6)tV|Gku@{sQM< zRqQ{$)*{Xv{~$~KN1Xb(o!in(wx0{Ia3h+joWP04&$9fNQ9c)C5T%9po$}7~pUiWo z)hy*tI^~JC7~JVwKc&38K6kncz@<*I=2cF60=IJdcPi8fnA5kUH)8P5Mfd%y}^cl?&@&2{&e1$ Qsr(-2VDx9MOn3SJ4<@U=EdT%j literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_ext_xcopy.o b/third_party/libxsmm/obj/intel64/libxsmm_ext_xcopy.o new file mode 100644 index 0000000000000000000000000000000000000000..e3e0f57fd36da190964588f9be5e47e0f84e0af7 GIT binary patch literal 22352 zcmc(H4RjpEmG1P+=*RYWYD5GGQ1B2sAUm>EZvfj5z9xuyV`|a`C4oQgQ9}ET@$0QpAo5cLa!odhkz{KF$ z?^bt@S|cf8H|L$#=S+3ouWsF{uDVrq>sCpdBkB^jAh_&Da4m9;Q8wywZCg1O5A$f) zHOqCiOZt-ZMQN?HX6_n_oPHZlg6ZaM>iJKL@Tg*O9`IJOw zAhEh2u{>W+EEm;8=2}&Mqe`lh6N`f+Y?YGe7RzM)O_Mu*NV)yX(t7F3(j8LuLD&EH z@(L2MFe;93+MJCl<2%H}>Y%KvVy>E4C?`70=u^=OcjNQxie+6E14#7gYpm~A6RY#6 z4cM;3m5D2r#7)BC(B0*1#ogs-z3jDB(rUX&<%BA(krT35g@~G1BP!H1>+E)E8rLpr zv|`=r%F1A5)vA@N76i4XhG1*0*3sNGw=UMcDcIcBzEKO-w$j=g+YD6`623M?A z)oZ3xx1{B_SdhWW8f%;&vvp&q7TnMftZ!~?Z0Klc(@0X9oaVNBYFnD?gW9I{hPj>1 z_ca6?Q2}SoI@=rSnj4!N>L;xSeMd_}ZGCgwohYZZp{=ty)>hCtJ2**wnmcc(?Pv&U zu~@LPwYH^&3rwreSi1xpx-``JFUx9cxU*JkzNf*g8?}@4rKp~S%In8S>!bR6Ygg;f zD-ZnfBnFL=$RHN#^TK$Uz#gpjxLn5aVIGx2|IW5P z46HKr2iBLMNPm*`bs!Syf1eFDBa+a>LW;NY)Yn+wLWouqpQiFkPAG|)6lr8{KOISU zBtyl4A&JRP0`7lnBO`%7gD_Y8qMNNa8|fd?=E(Zn zY}W<0>ti`|v!#it{oKizv3sb--RI^B^IVY{A` z9yp(5^p%;=0;R5xe_CZfs(NLn%N4ErW&DDd_5JQsV0rLgQE>c1HtTyKK#>6J{~j{Q ziS=TkGW?3SQh!+)K84r$NXdzF5{q3EF?y2HPibEI339lk_#d()mJ^iX7X>z00z5)3 z_HR5XDX_lnX3VYKC&iNBs?1e#k_M% zSgd76K#8B}UBYr&JlhD+X6yu^d3F-fJ!3bBZqM#>6)Yyt7^+}7vS)}0b;fZJs^>Tn zYi67RvBqf=9*(|PT^0fE0%h{}6 zvxr10w6Ek!?lEMJpl^3-Zfpu3|ENG-1KcV@mUh}xPssb60R6rb~ND=`4l+v zm1HuxP1Y?^20pU&qhwM|%;y=)CuH=S`9jF2B)G7W6Kdj8;>OS!k`r0h(80)2*qp=A 
zhFje9L2&$noAv)G%c>5l8#TJcfTaH^YTO&3Sw9eAE6*x=sVM7bWW7Z!km9eO zkLYJq#LFT^t1x^9gLAoCGVb=s`ePz3w94?SUaSwr5o578Vl2;4K(t4U)tSiCq#)>` z>13U%_ldozXQX6^)|x6+-y#lh`e_+UOO1*^!Dv}-6i!=il)Iva?}lYY>Gg`IP*g%^ z!)*Iu#e*6up$`#163+gZ^;KfFDoJ7tWwAjCamBcMR2hEDtHcir@z=9hpN;~Q_{V}4 zL+I9SL+H^W2))`ogc;f_gqfP}9E)A|a~hW{=NiNd7qOhekP<%<08t8})F#S7l$(T% zA+$)w@JW$AB7G?`m&jaBDxnKn^Eu?mCoDfi(fUOJ#Op%=B3c#&L9~Q|L}-hOL1>|3 zBDxoaL3D@0sEWL(3d1B+g{Zoy27@G2gXo$??HD4Vb|wCHFACcDZ8@}FEMR>v(Z?sy zamx%f!?n!lo{1@9U#Z$C>7|=VbE)^(;2y9oH(D|fs3u1XajcXVYGN@P{3e&uEw(Q+ z3bU3OK1MReIGTFyr8T{4^3-)bw|9Dgw zmP2dALe_ua<79H=YCe9T3mAA4$B${ijeK~42HB@M(-MT^N9PQ4ko}4?@nz}p$_Y%< zOr~96;>&`ZL(4)AT9hvfu{>*8sJR3~6Wz$=m}|nvB`0V#z(iT;`Z3K5QKL1$7X-c- z><5DRL)(mLUeKBly0tGN^k_0dC@#Vb4J!v00Y6j<Cwq5K^H^SA1B2{*!L~m*U9A zvrt|sbgZW|KAddkwvWmm|#7=o>Cqj2N}YTFeE! zVLo|84ZW_FSXp^9tfgjHOU6q!J`PzU=S8! zrFo;LDq8m+B;qbFp%I`Gco!&jzcoc8Z3 ziBv=wR)ycoA(ckXH?Ve&ymmgBWcwCLMq3hdXQLkC3!-VDY{RJPe@pA8sdNL)q_@yS z8m4*l2AW2%rdf0*+t=p)Iq71f-t1=k3VRy$!cCY(g})^(g)Xwe0LoXeiiFudZx4D? 
zXw0j}kfC%tPzzPSEB&mGwa~Dd{Jrp8#>`zpK;_RO-X3?iE$G>$o!eI9^AiY}ZiV zOEO#elBz!}wu6&obm1)ztjF@NzaYh59mUe`m5kekD0||0X=nuTfpI8OJ zjw@CiaDfd&UN6$A&Msh4qp%!?lULR?u^_5nR1#+KYQihRu2K?ayDN!@d8?|z1tqaX zB*C--Q(=m}UTmjq)Y6qkzE>4qQHAqS;VG)$06o} z!mi$*^6L26gWW)2pIpgu@}%Amg?rY4*brbjH35)rEei6kAT}*QkRELT$c|!`(_ReH zt9=gSePNc<6$Y81<&!N`QH41+Tm>Lgn?b;`8URWS+Y9|HLm#p}TFN7!XX2~Yi(z>n zKQFvM6BRc2eWb{#lyy9%OcSe^@ij9~zMrx**$Fkw_zDtKv_~(#kJ8#9Ie!Ki7KV|t ziy72v#6WFB4Mz>*;MpVY1i*~hV53In?FBZ{YE&hknPu(aI?N2W#WsXwyBo%_J;kMw zT}Oo;#~U4Nagg3uu)XD1Rt%J*8TN49W`lnZnBBTI_!)X&EJvc8@ELhD6T|B(E4Twm0|6fn1)4 z^uFgZ%0pUjeuNJn;md2>Ms^)pbOmhi1|%v86-Iv|UyK^%*{VJq6;4(f>wST!;k{W6 zoy2JKfxbDwwjWeOFG_gH*)OsFdteOW+paY3NOBts_JTZ+7w)|%^n9=HC9S&`mcxc# z7!Fmvup7d?FdJ^@h1KxsUKkBC+1~pEZdHW`x&(ae^nv_rgPvtc1n?TE$251IRg@?_tFPIaJB4ACDL-2>d zkjBb4^6SxLk_@(=ff;kly?krP_lvYYqzMIcU;uVoq~s;+79;v;DSqnWFR^Wu#RA$B zM)V;V?Y~BDnD)Ns%(wOh5arjHo4U8nO&#?wHq_gP!`aWVKAMn9o`%`sWrO92|F`#a zjVy5ej{_){qD?bp9$tRaS#N zoxcXrHH`Lje(dQ+u%~+gd%A_#jcd8fjnZrkm%zIc%6$)EPCoX(2Te!F0owSX|N6mOycs=V~GzA%SJ8enV@os z$|Z^q93c6GXT-1Q} zO2cR15xHNGj8->!KLVtl{@#uIz zht>lxhM+YdbklYSJt-l?jD`?X8bYj>EO5Cs94;fB#W(wDG@!K+7p?f%C_yWy1_@d? 
zHAc|fjv=CA5HtX?h6!S8_z+m*hhhiT^AWJ#k8yyZK?9)Pj}Rk+^uNWA5CfwMam|kq z{!8rtp44UHTXaAm-?74YprnB)7o?ab&|NdAtny2}l#i%PP`O0q62*rRkbFY&|7C0# z-6hESJNk<_8@r1(`^i1x4yf*0X|0T9zYM0Kw3__y>&Xhi#{5w6$c8t~C1*eQ$e)?C zd<2S7$cG9x!5DwsPjGPftQ)(Lk1RidhYenuO=|`D2|i5s6Tt0<)m}m9))pf4XhjIU z+BFC>v|NOlS{BQ>Sus`ySPmR^7!`m0rG{SC3OL_LFU#4q4ZofE?Z$5izvK9w!VeyS zmp7i2^%oMrdRATg8}9+4L8&i96i)5*g|3Xhq(VY~{fLtjYS3_^DCZsG;53 zN!bVs;i0#^Mw1(xnrE>Gd5Wz6Vi}xes*!s&7`$NoE$z${bbx<_f%yWHTaBS)sCb7d z53x64gETA;)Qjs@7^0t&+GRt@wEewNVH8`zE|l^#K?A5IWW#jX!mVYqpIh5}n}3*U%Dg?D6Zp&kWFB@w)z3sI&)#t@>m?58L$j8ZXV8>5)%_U;d&sYGj2 zh;QA(ZDB)!HE-?-ZP{c4qev=-j9`2Uwu*h+?4?iXGkxA@t13k}2a(pkkbfMR!YFlS ze8^9w<$<)nk4#}y0{O|nLCX$N9mobO8pHr@qNyL1#6|De%3WYJ25t^k7wzH*@n&8RzR>tVpn5IfL)vv z?r9_Pv%~|aCIAc`bHe6YSW^&~M_WP|S2ysr7XyP+k}&R_0M-=-2B#3*8Qc*|dp02O zvjc(CgiJC8GyUH0^Tw3R>d9SA9uA4^ovtS8#bRs)f(zD&Wo+>OnD7jO!xoI)+_wO; zy&SEw&~~lA5fSYPNS8XLy)vCmSrX zcy=O7)g@CtT`7*3t=2`(d;kPTgKhx!8Ys79RGC*2;;a=HDP)HQKOp z{uvYfq9_i~qBuaq(O;HU8zswf1wywLM(EMzA@ph@!VK*)grgH|*^76@HVSr9u$zJ*3XW57ih@z_Z#*8Y!%BKwn*mo;^oaNO^l{|RjtYmy z{MjO{jJdBQlO->m`Ax}*kq6=0#_KDmlSzE)_e~k@prj*bJtncSofmHKy2^ojH`7XAX@D4jGQ|^99p!k0S=OYNG`B zKLEG9$Y$rKP8vk>q@e{T4W>I!KcE<{414|H`|nYi z4Rad<8}W_L^WOdiu2?9zLe%#pMW}Q0y!M?8_RtmRln_c}TdxN?rYu*0LJJ0PHzNb8 zvFRb};>6=)2fqnseJ}rjkqwwVaF0Z>d&TMr>LYZ=$gNnZl)A$zE(o>b1NiB$4~4J? 
z=`@C){$|gxPJh3GE{ozK5M2xk(CIJsFRFe3uI-y7qeV=b-Zj-I&BSDDZa+Q`6|=4c zk(1_e^aiG2*7t25aclR}BwR;xFrR{P{FbR*iXCihk*RS};Yr$*7%PKxCnrT9go+ZqIeOH^Wrvtowi6y z{K-_T8KS4N7<L3{Y=y;r-LpWDyo=k}kH*}jb~ zv%?hOaHFi>izCcR><|t&R_MMMt-ov^j;T4F%V}i!G6$CE`8IIB1(aJkOooavhK&|O zDQBQQ>2#JeR>qnbjFwaZ>tirh_Sw?AV>mChMX!(HNF9}@sx8sA*z_@ZYRu;p=*L%N zqe!}{MRyXO#0K|BBy^N*{So!c1$ddQYk!X~Nb46RVcoCzcQ7q{%$)c83a4S*n|GrU zi)jAek5uzU0ZkVeYt$2%DR%<5CjH0dG|^F8^jA3(-?Kcvd;$|Q=YlVS=`JCnpVD8v z%LNw(ncZ|TC98ZKLeZ-t3$cu3;vTy~$z;|QJyGx#id&C;=T=`&>(Rhcy z#2W1LAc+S1k06~7_6K>yt;J}tucN^(k-asC2K%R=_JM`wA`}%4Lds?@hE%{q1n$&% z_eJAR3WUtC5MK)Ul!fG`kgF^tKZRUtAz}*ooP`u}gqE$rUR02ZxY0@~-AB|yi|XFz zrAqP|(Z#-f*+lZ{(AB!YcX{>PHVeBXEc)STS+J4aZ1S2O*0E`_bYqI7Vn| zPunb;5$$bjEFTddTs+j=1TPTMe-x2kmH#)!E?O!a1iASCh@v zZgG*X*uXh=pE$_5aE5Ijl|$CUVlOa@k$gnPY9AQsLYJ|4CXTZee9|^hrcmqL*DU?t z053CgG2MQMbLf30(@XL|x40B83mjF@uG4cC4Q{>`{|A59>SX* z^8Ubmg!2+y6a&87gmKp0A}*$r^J3yq$d6Pd>-DsvqGvFcsK4k;39dov1KME@;jIeK zjfAnV^K<2HaWVWBw}Hg^IgjPwocMxp&mAD{gTJmT0Mf0kgd;On5yaXO#<4P7eTxWK zRt!L)3|?(60jt6Qs&KRn2OvLMzO4$tx+(zH(ej2G05u5qwulkAtVNVr-`Alv+&!Vf zwkg9wujpGkP(RZpm&xK1W^`D|3ME&;m*Mj*9>ASCd4Vi0WrM4cyaeEaB?G=Pd4cH* zGY`1{i}RKY!2brAREZ6KmJ5U4;6&9o2R?}cN+YFWgVT5#d~uX-I`U;x8YvVTe2bb5 z@^y{NM@q*Azr)kEh`o4^Wkr#sbUi(4!c?Zosbr01fddeBIwT8+y^3f421a*>LSD~oir=Mu_AT8CV@XqDW8)Om4`4X!}D zD7fyRVEx7wh!I790n3ykJg*u!lUrtZnHoB!Kd+7P<6+Yx$jDVwe<%twf0GlV!dgVJme ztPKEM5cmw=9CUgCE$gYzC4xQ>DX6STd=j=u*=aVI1sgRb79!O-LdAeuKmzOMxR(sn z7q}5s=XjO?^>79+F?cyc1~FuChD>6}khn2D`GN=OK5~lw-8?nxx_+pQ^=JTlAl68VbZ}^&$HlC zeu}xkqPmlGa>^@%6--VBK=F!zQj>Y0cqKrsNd`gj3V>RlEC9s|2UVXe2E_{o)sig3 zH(?$8X!C=Xb(M{UP!2uAwtj~e!vFCunjTgxOcO~zE$c7qzdMZvb&d2Pfg#Jc-rSY7M~+8yoRwG5qgcn%m~Kx75}-TpX!CC~^KhNko56S|g?Yq+~}i z?3tg6r}4sJ5V-_z#_X-&k`oXt&cCK;8AaJHY@*LbLQjR+H?dpzXmgl-`$#zZ;zn=i z8SRP(K6W7k+jDgl zK%7L9VrXKR{&EFyp(x|R#c_^u6Z1O&v4f<`S>4hHNj4kaY`S5O%kgVO4IL`h(}GyT zX5&hdtp6EyNiW)ke;^%M7MpF*u#YMJF?KknV60^A`OD*wFYq*wL;M%p>OOi?|p?W&WM`J?fSGyV1P+5g+odKt5j9qgGklnO#;|U+}VI>Wip< 
z)a}WbyewsgXS>&b-o4b{>sjg_KwoadZ-)nEd#aGnMW$9Rra5{Zr+33cLEVyR9o}0zoZYO;JVy~T>&fh zBaUB|!ksPXam`HO&WOk5cal7Q9HEe!Rj7}~9#5*YF^pm!7yi5=?Rgor;KE-kTF-bn zW4N78Z;zwg@%UIfjK{~mcRW7UKI8E#oMbBQab1~;+mUy9WV$QQ*^t0KuT84^O4xGdMoe6-+Ca{jOJ@MEc(R1Qt$lgW3rG;iqYY;CQ^pPg5C(ck13)uk|;5E4=y zYi%#0pt`ZGs5q4x)95gDTw+NovDr-A00~XwSmwDJIyzz<)v>m^hN9AW#qd0Ovl z=!jL{-O$n2&{EymQCE$>(cjomOz6nvh8#XOwns$QzDtdP{|l}nbaj;yI(BUPx8Y8SQ3oef&G*3^N22vFU& zv30DfR{LqSop)C^w`usY+LlzR)qayFTJ1L>lcP&gg4&wfnzabZm%qHfx()omxG5#npIkN3%ATK3**EoOH1qw8TOFcx|xcXR-6f@eTCUlsf_80mrCer>}VMKwl4Zl46!j) zcV@%za?Pax53R3r@Pp%nKerQzhw#tgXXBrQztF*7i8wv!^0%8X$4RCwf0l()U2bsT z2UE!3>yUr7gP#ua=t-A<)FJ04q}%m*-hqGKfxm85gP*J~yWB5W zrZAOT=D@c(@Q?%lnT1pR$v0rj8Flc}`JRn`-7;lJ{^uO{Ny{`OITW?|XIYzQ8}}iN zo*UC}d#}AT4X3e0&+Tb=xe2@4(s0$nd(-gs79J-h!;@a0SJH5Jllfzb;r;F$kNu_? zj+Y~Df3I_#lIzIE%U~DLlin}%*K0QJ?3aBOPW7a&w&k30@H_iu1~KAEAMXn-oL*$r z+j6!$_?_eZe>wO+M%Fi$jicKD^5z$JsvjIryFJlW_2V)=}v{+ObPG@Kms=gUbN&6E1Fvw% z`M(a_spmoP&_l`Gx5_D(tJ``LZcWu3IqATuo$V*=_+ERCq%`{5aC%72*zN#pw^bi{ z**;9-a}{-NYSn5t;H`C-?4mwEaO*M~vz+Y1yc9NfAu*i2@xpMDlnEj!I1}V2o)({t1qDpBc|RXVfT~%;eUBYRa6&i#lJ2?-?zoIh9daJ z=XPqfb$46M+t5_q*iqZsz}4q(4X%O!Qdw)dN{I`nV&fy0dZ)l^5F^Yo=Tdk$6&oK( z&rf4N8^8ZCK7C_)`F}S>`9Z6pe`wXiex>^wXx*8t{8N*ZPi3c<|Ifgv zC|V1g&r??Uwbt8Cv-8{V^GKk0djAz$3Ss-Z?ELBVKZbK}-hIks7HsQ<`Bw#&Es z?=4_detP|jMSytjv@2?osr)vaiwMQj>%WtX4Lr8r$)qOg9~~J^R{x!fnPb1TZczQn zd!1hYxmNk<^)I7E4^M^pN`-cQsxv*eS^3lJ-)rT!{aPmX#QKw;bF%ucTW;oPvI?U5 z(|Tx^Z@2#@q!X4t{>#hE{3P4?q|oj;u>zboZ0PUSQsRaW^$X}Db<`~7Jve;S*s$I73M+xcvKo|Qkn{=HWIooNYn z|J(WJB7HLb8L~Ep+bl-XAKEwD`SUEC>XTLkPYEtG85B#g$rnQB$LaaWrc2j7%Go4Q c=yF|}mYDwiq|<3EJYSXNNGeaOTzdZh1PNuWvH$=8 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_fsspmdm.o b/third_party/libxsmm/obj/intel64/libxsmm_fsspmdm.o new file mode 100644 index 0000000000000000000000000000000000000000..9fc1de47917773c3048951ed3f8fd19f7d8024d4 GIT binary patch literal 20296 zcmcgz4R{pQxt`fwlBnsHG 
zV(ji(reTmrrR830%kA|(^;+-MTdnPFQ4{_G_|u4je<)E9S^k2chzjJs-S6E7ca0{DcW6o%b@w79<@ns+`)1Aa4XYz!OEDdy}!rDwNGiElp|>3Mflvve-Cj{3;FpO=yw ze3AINK(n;!IW)ZLWhC34$Nh@5ZsVfnu;eQsp@o)^Dy>>gQlAOUs2Q}RlfsfGB4n&3_;@dsE%RlZ6|GaA|7Nx8WzD82P@N*8vM z0dxCE)0<4+c9!m8=^=sj;Q*DwDrpsiw{5k$f);n6#jT5|?Y(5>bCmlK*nWl`CF}+C zV=ww~jG*J6kiyCRGz3tB9>8?R=)aQD$?psf4nlHXk7~RI;r7>xNXvSBzE7H^_HCYK z>G)rprG*<6@zrqfIVnM(ENoT_HL;z|OdRnD!n0*C6|-8Ei;Rzqs2tdZcdQ~+9;YR{ zrNk?c!3^60wTq$0wyH>IVLUY%c2i@GJyP>8IVbn60SJA_2oT0F!jtH_5zmA5~{Y z)x@^q{XrNcM2#78L&R98869#ZD@4hEktA6@#_B_CrL~}?$2#POXF_&ik=FCR+tQaj z9PO%;i>Qi*iRq^0(jku#q{M$BUz6N~$vCj#pep(5WS?~ZDjJacdx|zLx?lbYv`NXI zP?auSMn_b+9OLS%>Q>=D$d7cj$N|-8l#9Yu8z!4AcY8GB$hrC{2i>^F9F0D(qgUHfN%MggNihLV>Gr4%X>nv=MktE4o;B# z#ru_~Au9Zt5-t9TC?z&oIhS5WcK@IVU8r~+Do#HbmZra<#rmKkY?R-NSAXf2h&fdx zIe81oz2nHetHstsPBBV;Mso6tT6~qb=XPc{>?(PY410-5!>~=ZVMg&IWK_w1&Ad*8 zm}d)#^*Caf@XUUrgo$}b%muMhCT7_PZEp+}D$H@A1LGXx9w65?cIqa;@5huM#!tF{r$BjF!#}wRG_#m(3+Uovc{z$~3A`sV?U0%>Exsl|7KSjxsH?hMDPR5rPy#v4)N1I_3q z`*m22$!|@1r&p*zRcRu(D$=rUCAd$+_Vq9t4N1RRrv}!A7CkB7g;y!@BkYg1BCBKQ z4%4Wit3Py%ypEP8ZE(uxZtZvXi+a^hg$4<+11ARbGtk!3-D!r;ajV3Sa ziQR=-tNJWySwfDJ%}=dlPwEowtyuG+)W0E9)t&mqr}(zU$MiLeEE~G8g44MSHZF~6 zRt_zXgANQaxw^a7P)Km&_mvsS_mv+g^{a=56RFFO3=Xcd%n|UUp%kDYL{ANwTFOvo z9H5mlu*~jk=9_6l479$ZW-ex~6w}Dq`D_u@G@q0o!c)0{X5}KveK!iJKT+;Fd>fi; z1dveFOt5#WW0*_i8OeFiEkOKYhZ;0ps}5If!1Sw=XGJmVwQ$uYL>yt`dAyXXuq&Xv zv`2~WIDSwXy*tvg-6NeFvf_?kVQ>`YUk)Jx(PCSov7V~#h}p3jQOJ}7TF-9JWYe>n z+(i7{LMickSf#~3g+J=DaxT3c?{57@q#k_&(oy=wNDK5Y(~d9v$p<;?u+!IKju*mj z_^!mZGyRz4zp@ZPjgJryA!{YSCykU6@)jIf8j%Tk4=yW>R1@-%MLJyfBUOy5k&Ta>&< z$v#TZaJ0}zxnpRrq>ysd#TkVGq>Y7@NLva+NcF;6q#cC~NY%ntExxOh3`^b&Z>Adk z;o$T7*{bO&5Z7zVBe5g+j90EcC{6zqTZ6yjn4p-~i>mmED!!-$ck4xpu_-kkey=Mk zPgAi#LzuL*XOUlKQ7=CE7{#1%Dgb{AT%+VW9mPWDN^5QdMI|Y5Cw5zuokL}`Io49F zFhiauC9X#a&HOQ}2$=w7;-869lJ+!Iipn8c$#7QqdSjn!8%=MG@-z(D*zHoBmNG2{ zYdumK*bp7#c`6ClH-UOZ@nz#(=RKvEL#T;tccBK(kbK8S+B> z-QzggpaoBIpS0vvLXKhSzl)tFYF>LF5_=)ilk(J<^7=@;N1SAqUZ}+1Db#00;8FBC 
zq%J*-)U98R)T3XBbd){@X@PzUW_Pt5iLddUTqYXau8*^;?(IZImzT111xp`iX*Wyz zSlZ9h1E@EDz0rrIbebye(w0*gRMJfmydr)S4!$ZS{rDmZMr1(6hEIczAyHl$pvi%7 zaE4qSfB!h9U)h&vgd!N5stq(cz0T-7jr9{T>krh#Hfue5Jd@1oO%PMtgtD(~;abMjJ3R#5Ud zCEb+tQPNKdMq}svKI2Gmz!?poC9703B8J7aYVh@co~s%gQs-bqXwtCbG(|Ib@gqKW zlfOv*E1THPa;NoSZ^>8#Y$Hy!VUNX1Wd~w-8F-4+1E>TD6Nx+u4k@A3TtM(rJ79UF=e>d&@tAb3fK^(qkr#{_ zpV86!VNCF-F%Z27vEx8U>lr9O3@OIn$8Pot)M@cSOzGcQIhV7q4Z8JtDDmi1kdD%? zKw6-mZL8gh$ci?}`jBSq*33B}WpE4SjBQb4XH_@7#aH9ou4U!_7<3zJ!r0APq^8hn zB>0w;po3hNG3Fg1&76wx(l*ZgODBY^n&vXflgZ{^x;d2c7AmmuLsiXOMtKVjaVX_2 zw39R4?LeYhUUp|9*>R|**xAM$-h|cgaWX3OYuO=b2YN}vn=&B!47NmO8FHw zE+rnc$-MavnRgh>$39FBC&JDm^Hk%7uubO0-y<#$;k1fvf}Bk?P3ZVBw{-VGY`q}z zut_>Zo+oO`_aWv5kylz0?}z_+fouVhCnbJk<=}rht(;rG2_+tVBGOTMInn~%k65{w z?U@dl1)#(R7~2PK>G(!~*3K%e0;L9u4B(3Zue2qJTibS&m@S039tMJO0~rW9Z4Dy2 z|4ZK=~or z4$8>aA!BRCaRpluSUx0&1ib4qb|p}Mj`FRn1898*wZiDWy=2UX^vXC$^a9i#-iY^O zFfkZJ*&W}A`vWMA$_Jd$qH!QolT$HwiQ+SM5y@9XLjk?>dQi9hH4kY314yLb7A2U{ zV_{PUbF~$GU-<)IDZuxw6K%e4BIEnsrJ{d#id1tN=)Tt%UK|!zM`ACd=su??-t8tr z&(8=w6vUl^uz54b=e?hL418WVd00yHBbs1T9;SCbDsLrZh{{v`jJcCV;z<<`$(=yc zWS$k%5Se#~zD|)t+bfD z*JX4bqw&TMMhq+>hsQ&FOlO{&*azV8sx~2(dm_!_Vfef@jd#`|qVX&aPU%&bVSoey zn7{ZwAf;j8dZeNX!K> znZ$@fWMV_wh8Wrgk4H9^lZ`~batIn;WX#<*t7&{t z>#*(k2kv};vOJU>?fUsCyIMptSl31CV26ODK@bl+5e=>LAyEx%Vqs1zWui_mZFX+B zb*97DJ?kTR^HGZ{ok~m`1r25!{ zx!7M3W>AD)ja3$1hQOmvX5?L(xO)I5gO(HZ+mX8TCZuk?7O6-77Sd7r1xO2YujIRm zQFhYw8?MSOmGQb0Ucr7X$LoHO`0N$hib>mx z&{oWlNN^2~dN2(|B?Z{53w+4UCF_7u2HO7UVh#8YpRHTBg4ZbSLip zxDP{XiHHmgkBUA#0n8Ag%HJqLmA^%VD!(p5mA^x5pu}e5)=0@a5E9@wK>Wz3H!Apn z&jf`S+oHujphe?(0_pTsv_jzA?XCaYidpOvKaF?0wAdyRgxr0HNDNYc5NMx23nGGe zL|bkZm&Q^228w5(*c=a9Zy+qL4+lRbT5nbA>x`oTI|rXP4fA>yae9o0p(6|ZoKdNm z9cK}-r*p&}+y|XG7_0XTA^}ulL|~fLYav$e9+afRa>@iS>H?j8lanx)=y-1(>=C;W zDLQtmMeCt5z3r9ak3pVRZ-&+*E9jHZam)H8ghW5S)ejShqXRiOD8)CUUQKd?Ma+={ zBvy}B5c3eE06qpa;BuP5>UA?#54M2Sdu=GIw<5bTnG?uzG^syYXO}B5n&kDy(QiSF z)uR<%W6TQF5UY13zBi-e;I>OhPpzSW(_0YO^9VS-P0Y7ggkGK8;CPp`XG#4YjZl0t 
z4$6yajPC>#V@gSCER;rL)}I$RWS$k)NRJ&!$2CL9JSXf)lX+I`Q%Vm75>a1gp`q0IF*2`!l?vg z6;35Z2vq#w|9mI)(vI3+k+`6GRl^izU?dz}rO65OSa!(pa?2CgRd{0R-5c?~$hEo$)iKhHDPr4C^1 zXz5y%AUptG7$t~xFC`nWIx?<;a@myNXrcsBpOL6k7e8mDW}q4z!TA_t@@S2SDaHco z?2u=H5u|vYt#xz$P(rY0KYoITyJUPISX1Hy%e0=I1(VI{tD2W$ftL;E13&)9@`0O& zP;i@b@_~O>GzTBJITCz32Oqdu8_EabJqI61Xg9ut!xB16rIlA%1{aJ#a3Kw5chcN?~; z0boIYYV(JT{o91Khj{*8A^~ZbrR1-%>XE$z*$c7NrPhJ8Tgv#qmBjPWI_1f~-&$aU zFn^C#E3piSL;Jl4WvK)J%q;$*zvaUre5z)AG(|H$qw|o~b=#P(i5@L^SXc0COtSk< z?;g;$S#qT&4h&8SsP0=z)Zi_iJFzm>6j`T<)%3e}vU?%O zy?yDwzl+2^N|V0u8l<9H6Zi8fwsI4d$v$cB(fm2tC(RxD{wMoHRYCaSkq|vP z6BPwNe)^qsWuyCjRz}4BlA6ihCyC+VV@XbKk#K}`H3XH3`tfDe-Y2~F=m6ugj z2gdIZ@*p;7x=`piwldEq7kA8O6@Z0FI|LHvYa&rlwE{d zwXuy~MrOR%caLklcgzy^cyGY;jQ5r;9;J9I?=Dchk}|K`XS6_ZBIqxo(5>^|lI~~!Yg&Gvj_4c`3cLKZ~;3OXWeLh=sNMCD9N1!4G6&mcAK_3g( zuPK4$CXLHtx9Sbt?NPmD@lg}Km9Fupddn1VKymsOK5fBJ05Kh^)8B2-HBw#lO+U1C zm6Jln>x;W4dIL9leX6&Jee)7>F1T`)TP4F&Qt1&s7#vpAJtI*B7SQdy^H}i10 zaPnG1FZ#@)!1x@iz_y)U+{{Z^u#=vPjF_JFc;QsKnOAZoGfy}3Hko%P=Vo5SdCNTA z%sXV>ot&GkAm=UfbPJ<1FI>*e+c^a1`a1+agL}(l-NKof7cS=(zQVm_vTot5%!{47 zm_5%J)&t8Q4A*~|EuXFEn+!N}n4YU``J$`}mTx&z2xQ?A_I_sCAMwR*f&H~64+)d; zHrkc>*p5$Y0`7Mm91PjgPWv59o_0Cwa^&yBYcBamfREWxtuN-c(Wl?#k>8L9e&7FiL4iXVl=l-9sELuXGVhgK*)y}o>`|3F0{}^xE-CwEo&_u9*_9tFKeWN5l1L>6tCF zZ=T!KSU+cGOUt}i8jiKjpWQh0t^U@Qrn%p^Gz~W1+=O9iY-*3S&2H7_wOL(phRbTa zzA@I`R^QfibGlY@TT@f|rE$)j#@X$yGxgZ4G^Wp<)6`b4&yL-a1vSoZo2k#9H#ZHZ zN5$@_({%e6F|0k?Fnr@-wxox$X+cwLKE^WLg!4hN3(_?(E_&O%9}lnL6xj2XVtu+W z{n+|E3Z{uJiht=M{HwSfoR=q`<>0)0fWw`gg!E9}k#}N0M?csvcF{i&&_y`yHjey7 zcEjXhzs}*$KcB-dtI4{4Q!zsRY^zX}qf1C#w?Tv`q<@KD#;s3FJoxfF_zn)I7{O`xG&-c=B6}!0aPY6?!LQ1LKVaiD4t#%ig2Q?J z4|4brd-CX)=zxGL+n!Pm=k+`1VX`xT+wpt!U*>cXPSLT`UpbGz`g0!q^*s2_JoqRw z92Y%g+)le?oWJ7z-dGO5fRpdw@Ua~JB!^dWcn#;5c==8a=jGRPI6n_J<-y;`gYU?L z@8@vd51elMANnpdg`M#!SN!-?<-xy|2Y-vh`S~W=ek^+)^8W2?ZXDvAf8+80w*3r!FW$esVB@5pxBpcR z=fBtfJbGMq=$@_r92+P7<(!?RdHl&m9M1cPt2vz4V{rZ_!0CT1kNipw=k;&ma9;m; 
zw!fl&@%krnevFr|<8WU8t*B2I>EZn_j|WiZtRwn%bY*{+G7Dxn|6NX0K49x1dx)2I zu4+3&xV`ilqH*{scAg%uaQLYl-hlHTU8KieI_&*X4libil^o%4d&;wdF{A(&J?yE? z-i3twm)MpA6H?_A56>Uu|Gb;$K$aB!kC6PYHu)`fE z;T$&7XA1kAlS3q}Bfm&mi*sc8JT;#qlh)%LWIj(%MrIxjQ^kj(Reo4DN6Q@M0{ia#<&22O15Z&{4xoDo#RI#_u zw9ZgEH#B2Ut2B5Xa)jC2ei{#@b3-%P^)Ey*Z9yknpY|-$na_UPy+XpeZ1?HBG9OAl zA(Td}Or=HerFl6zN0OYqU8b42DZp~IUu&shd%2xukL>n|&&$<*Lyq>TkJ;_d2bQb- z2H`$tovKxX&NOsU`{ZA<+usH(SNnZmu>Chs zKUe$9bF@$XI=g*08qd{!D{W@DXfNfj?D=;bg}K^)+-|?pu1x)>y4metn5X>}U$FhJ zp?IX|PIZ%&h@BY+%h1^znmU`Me zX6!7c%vg@u(HZe}o3Ya*ZRKM+6tgVSOE?iJF zzh!Zx=(hTzrp5(#*4Gu)++NeT0FaPgrBA%!vI!x5TG2&|FBl}vO#ZB8=N;C%q!-X5 zn%yDY+K4tHnbG}uX6$QnYn>?RG~-vueXM-1x~lGruFvbPyS;w?{5iMPL~0wDr9S#h zeTF_=ubw^)lxBr2!(R}J{lvc&xuICQzmr!h>2+mCP3wJ0ap7fOkh5NL@4%{f2{yR{xTL;OyVO$)@*-~UEgZ|NHZ2< zhC&Tm>&%$pFETAf!n7WdiiChvXj<)dRZ(v=-dZ6wHe)Q)vMT^Yl;zZlRkZB-#?Y1! z=&})Jbp5PbzOB#JzpdY@&(V9|1q`1Ae-`)GbIkB~{{Q~_3jHUX*H3?eS$-mXkDNX81y zj<+HsOSYHvnDG$!(%NZmGW>Z&lwO1ZIU9M_PD!Cd3U2A1r;RY!P1V5f*dEzs#s--k zTOvDpx9xk?&7@9B3JgyJ%edMtM>2C%{z*H}Tv z_LjkDq|=|jv;7b8FgmuhyssP%aWbC15?x^X0%k1#S)vLLKo~jxb{*$b2EySvGpFiR z9tTn_psJ~XNrCE#W)dvW2|7Si1IF};fqpWk1#iv(Pp;CZR!*HVy=qz#U6BuGq$g*j zZ^%eb&LnU{3LrJB)~EFcs+_6^Q?vzha%jp#pyH!^E9<1wP0{@$mFy`O%Ct79BqVcS zTbW1eI1}TtmP~6uVaYgUAguw{Z;}iT?taG~-FmU4GZeo_3XOhny4L;+79ah>tF^bO zyPU{8qNF#64q&Csu02RsoNl5dRoZ*OuHCpL!4FVn)De|J zB~T`ch<4`bn<&g4eN%uJk7|=@hh0zSqH36|RL7mYy}h!$Q(L|g*FeYnTE|h87gCsd?0)E2 zt>b-wL=IJXQdrR1z)6tei*MlJwQZnnd;6Am>B*CTX%E+69G0H^nD-CW1Bl|1KaGg--7s&yGmsgkIMU7hu8=^(aL>xiPHk^^A?2I~s)eL_0J9q$t| z-VdCg1Jd!>rh z`Cmu%9Z_)C(W#WX0w#(}b=Y>ko9{Nac5Q&s&*RRB~|Vp+5RE6yum^Ry7IM zpSG4SUml7ZnCK`w`8$dnO?n}_rR&s)k|U#zbiF%7*H*t8&wcG~0QIBqc6~I!D1X<` zCcay;Jz(t)Sua6&azmqD(tnr7=ezd!q_W`2jul_@_J(YmZF8g&1H-EKBo-i7$w7*R z)Lk(MsXLFSqQbfJR3v_dM_|{V<~)|J#wICBx7H_K`nYS*EPLpzP79zjk9b! 
zqqRqYo%q&Cr~Hy0dD5Xrae5unWs^_5jC=71`rzd*U|xCd2N&lVw1-MM>cRkDZLu1hQj zm83t_Pkm$P!6aCDYUII=9_``#k%gu|bRVt>{-pX-HO%Tx+)naIe<-8j)`rAQ{pWwE zIwmYltiaHruS~e#%Bu^(i_e>4oA;@sgJngbm zcb~Iw9QeRVYRj6X*@rrjc@2B;6T{Y1cMqBu&<=GTF|?aE0jLxdur_MT#>+F8DU}6$ z;SPnrh4`C^FTli5skmY}X|%NG57g}FvEZ@95Yk$i>{!K@>`ZbR`Ex0*E5B?M)A)Ry za2$9f*U#37Mo9ebSTFq3!$y2ouNgh)HLWA&wvWt#ubbA#W>+G|9Qe|}Z9(fuu)VuYoO9l?zabzKpjk$#|=&s05kQ7vOFBfoV~?YG7hso#s5S}wbI{>7J#DU!Pd$-6soH>e1ezwF|grsjqk z$!JC*iP2QYd*HY~7Xjrq7Ygr*LOQqAN0MY}T{yTOB^9*m2j6zR`bCy^4tG_{3E(n;0?ti{;Fr&P#2LCdoCBU zj*5_)(P#XdLBpf9-%pA$MEH<|KFJBiS4e=bpJ!i|(4i1`x(*MAFoPW#W`inFQM2}w zaBbKV(4Jjip*{N|i&)DdLbXl)LhT2gX6=0l6V=wCuaLD>WzF{cwI6i1?}!8me4N0@ zwfeM@&fX`b%k`-#Gv)Lo%WNI9)2~)Td&|r}1HZdSB#< z#8W_4c2@fBmoP05@6Jzj?dGrEcQts*LD_KnD7ABL^N_P}9&7yq@t3f=rHgZm-GYY=o%7Dz@4Bg=ry3h6`pA?cb>Z8gF* zwfED$)OIsSQEbK;tVK7GOwlLWcM-qSW!lMr5dk3v^p4I8YFA&_>6Oc7uQtl?m+3O3 z+NhAGBgq9?`#LmA*PeYr8i0utn-GdWCXGt6 zr?QFZEn<=G7MW;K5l!JyvlgK#X_Bg!37UndK7NyJG$~|&;dh>i_H6aaWe3_*?Jv_t z%~mbZtEDNs?52d!lp9r_^lgyvuGSubaR|kqkp_gUZFakoP3e^f)0@I4)0+YT2-nUB zkmFJtVRMp$bCh*wd+Pj!sxGRTAw=5<#;>$xk809EG^x&CR??%5YO)(OQyO(l8ijzM zF+JIs#2Pqy;*SNacM(x7(4+6})Z0!>){K*6f<&E@ZzWiZ=>XehP`4EfPwG032=MeNMaPW4~{_B({$3L7Ku_Rfl5cbjl}md(&ZWHzej&V zzLWUVkp?j`axawKpZn@JhOMrkwb`_`Sx20HnAOea%M1wSFy&a`cZh#^$Uodzjuv+G zMDC8Bf+~IoC2?NR%8?qbv$WO_ZVU7qL-9xaZOGQALKb)vivL8e`ZOD0;=zilP`us0 z9N9YVR;W8X8}iqYZi7>!Dh5rV(ZLc=J_Q&|fs zZklNqjB>M;W{@w@Bxo_t8Q(gh-HhlYeuw)pC8V-pvvE7yKW2t%yWtT9AE9(?N}?QU`JFKusUd-x}A;NX}pze{lInm?ZbGJwLc%$#00Fyy^4bj$Aoqz6@=TmZH?23FrAg~hNvFeK_QaS=Zpv>+ z)pg|N#1w>K*c{g<*NQIcTmeZ}T6~79B@mxb&lS47Q@L2^C7pnv=SjF;){~JNg zcn8Kiq#4hLtQQfMfFA;btV26JjdCyv(X}blJ?KQgI4LJlRvqrR6Qg|;jAO=vX{M9<0AqS;d@7<1Wgi&`di>dLo`))hk)63a$7wS(t-N_yvivBc~SNGeW9Irqu ztok$1u_e7jFGj8&J6->G)!XPUacNb5eummhAcvU_nG#BapsYeODEkW@03_!(|j+V~q$ z3x)rTs*e#n4^7t|-h!lJp~SPeP0nMJ>nyVv;4ix2lIEI4HBC+RO^u7^7ey8|-hO-i zBDKQ8(Mf$=`vaEl&qEJGM;H*;Hr(pk57q@RVflHd-o8$2zW_0JnUEFo2J-FjRmj>9 zv|hlzhvlFf!XAeCjs&!uJ}|692tEz1^P5t< 
z)jba6wC~lxam4a-G5ptxKs?ZkLhh4Bz&Wi#G$?w=3s4h5m|a(6fM+4yO0u{{>}#`q zoo!z^wPk*TeVxl|D!!*%Z?N7yj@;o|?Kh8vwTnN-?G=I?&_ zpm*emcI#pA$pNK^Q1@b32)G-cI4#z0I>^>W;DG78H?=XJb@b?(@oC9+X+O$=#||3r z;|^g$JgK@4{a|f4qTTc%zVGq5Pt%@bOtFj+hCpmugnW<(rR3A2CwyA_Md(Wdomr)O z-(jR855qbHE>B*`_M@NRnLONAd-vk52>eLsXS@f~`;jxtyBVqB2!b%@W8QOIiwtdb zlRqy|(gVuF5i|eH0Jn_+dDPxs-$n=E8N)gyP1DOaAu_1_(D0Ae;S)t#+wSv^_O#pu ztT8SuYeS%;r)8jCzFv!RT?Du}TKjR{^=$OE3=EVX(w2RMB%Uj9Jy*VA@lZSXu1Cvq zvcA@&Kb{;PeidR2VQuvrNcNcC9zFU&LD2dFF&O|4n$d1=NL#({hPd%{ zM88(AGrM->nB^O_2kRihC@ZNCIoU6x5}8Y3qXull6-sNfsRf=CXu!IDZXk9?gjft@ zcw(38)*Xh{J|>S(C}2$dsaed{+P%o2C&gqSR%BT38rHt(d%oyN@axrB9Sxy*W$3rL z(vxUkQy_MCtM!I%RgMo^Cv#%8F~ z9AFg(tO>=27O3;r>Crv80qfqTfVG3Y)~p5g>DB~{h#oht>^<63vRzx{V(7mG9#X!waHna3tE+oDn=$j`xf?e6=)rM z(O1GBrWHWgXz#vD*!CvF+Dn!?3TeF$+!8s6E@hSZV3{JX9hr;8+1+ z0nC%QMyx;l0#MA5w72bfLiDW_ zV({RNY8?}hweMLK$D^RO&jcV)zF&KAGH#@OkYoMmr_e)N?fskOv;C#o54+IR;tK7D z8!PfYZW)BS`^zy%EmMtg)&AMqgSSKA5rm!V(H_C9FL9Mrp&jkT65Dobr=zWKe+c(h zx74(}TdUk*y`A_iqW-D=LwVp&@C0-DE&uUKO2dAuyg|dRdy$()^A`{xGKE89QJ@7_s!z$;(8X-uq$s`8_`{* z+9VmrlI@@m6C@o#tLE51u~B(eH)=c=_d*30>EeAbfgga3{k`>57{;U8_XeQDWl>7; z2}+<a;7y*e2D3!XnF_w9)G(IOGN_+}Lz)ptsq0|Ny zn+uF4)RJ5X)2J=$#2`S^M@rrbseQ*#JNx-)D_Hyk?q?7Zc4%0)Hk5FgcA7yw1IQZpr=j+$ zInows2Vy7xjAG$iiJO+#1rA(^z8DVNqK>zKb076B*d0EKv{@to?m{bA9RmVgYyd0C z^ogGd54WLyN);xSd@55Jl&|I#)EB~gw6h(TLNpQQdhin>Is zKt;ehOsR#HnC)+}rjOTqHhFQ|jQSNDR!d0N0x#*&4|5DFY#N%e3sr`JFs$hfhV|_x zD^Lo%a1QLk5v!#bT2@=#Wd&#}&I1gq468Rnax9EYPlXjW_D7&Wgp5^KD- zzLPkBq3YOTJukB+tYK+^H?6mWqYj0vl!>LxobW*G&BTU-gSo+hABU~y zu-2Iu9{60?dL`JkuOK||&9L=auR8C?G`H;y z4n#yA_Wh%r(7<)3^@`cGFV`ISrfI!qb{)tw@u+nu*mVd?rC%^=(UmAN2PRDK%fa$f zTE`J67WjC2^mAVbeOZQS)ccg^wE<I$4T+;*VHD_m zU>~^z_FJT5m95yEg6_z^t`%8r{$X<7lCe?6DsKhjeXbK@{A}^8!%u8Sf#rZZK9zz-5|cs zjCO+fE)ef*IYX#NcR?o}^JyIokSEYGVpHl&?_oT!1>fhBO^BCzK?ou=vnf{D43F|n zo90H>{!n?ImD;zK)sX%DLF<%#L}KZFLSqj!1Y)-~bN#GYk4z-Z|&e9z72WPR`12A809Z6Im@s%)9Ha;Slt=mcDS*5PzOub zvly3$6DWUvNkVV~u}Rf|*o@gnj|O7jE!JZTN&~S4STRuEFU;?3&S{1Pyw7{zf?RL4 
zHsLYw6@^-lEtw_4zYd&JU>!U9v9|hkgcW|mYL>sXJ+Y|>mkHUlHR11_P^)mFcT z8d77H!Rf?w>J@GEF&%!hVI@$(qXsJQIH_A7MR$9-2+e6A6j{6f*8rnoD|v}v(FvY5 z<}H6T1|Eujh((ZgF@ZFZZ&({KlyL4>zvPTIxJT=F2cBlnI@p;X;YPRKq$V%3Htzen zEM5aftoCypIwf1IjbTd6KDMt69I^xRjd#NFTdoU6--8%^kgIhZghhcbmghsjUTfzH zUD*3JTK0Ls+JYw%A>SB($eRdRWBfty^BkB**}C}n;pCTMJw62@kN7Eb9zo8y71Fs{ zbO7RMQ0E0S=!Ib{C!)<&A@t5FJ|CMiJ`h`I=+@SN^*)5zXP|>g8GCTT)<+@$?c6)s zvkqNz8*VU%Vb7sXe+v0#gA|*C8*nBZG)5m#gXN^4tyU)?z=Ucn!rp@+-;@3}W}+6D zwy41j#Qm4j296KJoBX}TIB=U5y)_ryW4#ivKIH;ogDt0@M)j^(VxKpKiqO?B%3kn6_*R#)Tb;`Hza`a{iAaA8=z?+*}FICZ0h9?fOc$ zO@S@eV^hZK&}Vme0wr6h+(XvuReDv`9at&jSpq2OPEpI4=Rp2ug4egETKOAL0QZJs z!~9xDfsmWo3V_ye0}=tud(mk)zpx8t3wEhHWc8pJ2?qzG-MJy_k0qTGdx!00vnIrc z_>A~n=;sKU6unJRLwRopt&hUi9_x8_8ct@_`HQUAqkDY8_?SUc;{I<&PeB=uhB8F; zTBTw4O5y>C>Xh*)4^2`{)T?IdXc>aAFeTEuWYZEwc&20vs%-7lt-S&33y9b;jL@NA zbOYq!gU+D$#e@eoQG|%X2nj1ea8qO~u|_5-2!K?!dRwCGg7(AUa44-D5L4DFeHuqhNWq9Fo2BS1 zPSRl)akLUM0mA$PLKcCb`}EjCZj*+mJ(vRrrbM^|;{P0229DQw(rE~J;Rty5Ar6Bf zf+&!G=)RGGA`k{#E9WLYMo%Q;OAyE19PTP~g2}4JrWn|^4Ar%jOL|zvEiJ&)b*wyA z)v&37Ob_w6&)+JIAZcMe>owL>_#Z4aAwK&wPkdr89Bi=vwHP~w*G$TKUgC(11;v@_ z00IZWOMDMNln$O?X{gHd8?k}nMi$q~;+jF)u!~$6Vvz}-u+e|$k*(-fHkGB=%|)Bg zHYHDA)->TjJvIjh{V`FM2sY3pcvT=f5YHX1$6I^HDbuG#H^BR5@37NpKB!$2-I)D? 
zX`%n938)Yaka-+e7@y;uRLu%|-vfr|#%9rtVe1og)7v56PyDNZ$ZH#}j=DsbRwNRj z3#~|kioA6k)(5J|P0m)eN&|`FTZzX|@ZA-}OneV1P%DFwC4L4U(AsHB4*)7C*2ku` za~mZ!TUnwSPe9}nA-Q%X{shl)WyzMrAf&UcTg=0n#cegUcNWzytgA0-u8A~cEM8>n z+nZO@*n};6k%dKga3Kz|))#U8(Ht`(>f+6sL~g;tJ zOZU@W!SLYxC^R;g8Sn7QmXJco4Pr~jkc~^3N{v*PB6Sg6>fSS9ZFFS1id(krjlc;1 zYwg8&gdwLp%}wjTvAxe$fN%<3M}W89*1->=j;nrn~;COe%DY9kB6#0X){kd;8&SW3ux*E(q5|A#NA z$E;+-6L+C?N+0%(N3}$q+00d_Q2H98J)^l5kK6XoUP6jqoue(Q1UTc|kJfQHfTq1D z$yiHr6OHTKPvUDpV)IiUo~L^bOUQD_nA;m<$=*5k7ea@N69eb@2|R?kOC(=gb2=9N z#!trTn6_q}iA^s@hfh?T45QA8ZEJPH33b2GRfTKm6A?qFwC zbPIAvhiZ44waGzSc;GQ;TR-aJ;Y+ zb%fK*3f9)+i$EvD5OY@S${dN!tI4IplxqEe>wb^p2?pic_@uV_jBR;>oEBLpqVIa6CouYm%&5T!X9UjO z4i-2clbjPexPzM=Mv%$tQPkOPRzN@?GMB1p73h-<3R7$VqtOo=aC32Yk^^L~hCahi z>(diHGy#R21#+H!KT!2Y`swjEW3$qIAMOlP( zxFf9l(aHS@EA}))Sl3AImk6uBkFdUn7JP}Yo}Codb3|Al13Cg_N?2+25sb#khml(Q zU!rUZD}lstgcbQA|Cr<`okL;8_$px?;|MD`o5E@+VXY8hRltxxB*My6hOjoIgw=JV znfvsd!(WN&l)U;WuSJyC;cj^?vgLIZ=znE-9iEieB9T{d;Owa)LcSDMDs*umZAtwF zB=uQUVRvP1`Eko@@Jr-X%OWnvc`BGk)hP-;v1Qe1cuSjRet7BZnvZ~|uT;_wg z+8ah<&Z`qrVq4(YSnlemkX;=;ku$V4S6UmQFXlLVegYd{W?>MEu&E7p7|}Fiv+PBA zNsr|ju^brFLK@8BvZ3u#!~yi!RcarB?63M3V+DRacBM$rr<@_i28p(my<0g+Te}Wf zftb(PDAWmCJB}Olhs$)1nSHMN3DH7p2Z!e$LrJ`rks=@z;Yvt@Ov81>z*1A`DQ@-!^H-+6Jjgw zH8|L?8hcAue-!X;unmwk-xEdv09Y+(a+%K4w%!+kg9WD5#{*t0G5ZWvr^OuIsHC?VO8dn2caPy^%X z+lA;Ks?ADxx(K#MmLIsHX5uUccltYi7~HsU?b#5sqe)e6?SvxV2EkbD?4*GEHKVUW zXGMoQGR*y~CuOA?o>xF(nj-lK7Qqo;m+&z9V68_`#oKZg>N(uj8@+I(K1_6DQG=TT z3YA;_Vd7r_=;#p&bhOr~bvs+~dnx%>VMbjbI}Jtfd44kx%k^V-VIY>zpp2i*gp&!Q zOr=!_O#!U{QT{(rv)q5yhVC%pcl2W8Ngu7ywL52WJkpTEa41wC>Wi9b56#Hb0xP)@ z;Gs)I0dxnjB$8AB6VnyI*OLn1BNVF?K&QHc0{Ef2%Z)UP0=QljfS(Eg@~`9_v2@D6 z?%e==@HIqWMf@*!q(+JVXOs?ndmZuL{J%cGyem`u*$A)&^q4LFsu4l&XSUS!ZQ%cO zUV24PdQ$wqysl_$KjOc(ulQrzMj_7JBO~#7+JYZli0x@E0A8Pk_&%4E8yEtFA7Hy@ zm_N}o$>O%d`d_Q&*dJoYZ!ag=*1EQr+Ta3&mHDS>r3(;-*v7Cg>qS^|xBx@2BUEd@ z0=5_;Ct(Ii#*`M1^%)nUbaCL}qF*ECe{jnD`zVCsIWQ{_JKdhBbDzA%so@Z__C%=m 
z9TYJf3aA%T?UO;r^j|{>;}OHNaYL{B+%PO}`?+@NY%Z1$5n0?L?m1Q%LbVAazNpuH zl=GM6=m==B-{1m!Iv(NA15@I4i*i&@U5EW7TXgGa*m_BO zYRJTp#n%*UDZvX3Zh9U|?~qz~l?7&ak-2)bHz!gY(4H#6Lp;V;@C|9*4e?wbQWtN; zL+^&;rRBJGeWJ};58zNA@UP{j75I3?IQ1L!o+9oc;2gm>0lb57B}az5aF7JE7n@27 zJSkMBZ(0Z3<9Kk^F(8)#x%^|T{r706JjC57ARO(=<+HLodyg`7Jo50}kxc$ z;7){Ylr1WVML}>zI)Wqiay*>&`?;}a7#mkeo5}rNLns_SL{D>LQgZ++nJ|?mOrtZ^ zz*|zL#Alm1VjG_jDI%h^I~7l%)m(<0aBYI}6tcFKY@eh(ReERG+CXl9Nw-dc2_R8> zAW=VOf51(qM4ifsT&8PJ@o4J-U=~~`J5k{B7DEpA;+c1Y@!UbUQc(5-W9Y@;(-z|Q zVCc)D>mX5|u?!?{I)U%UK*VbNP`JfV(?o>~2-D`-$l&dbWK?16E^=!Z8PkIY%Kw;@ zD)r!YP>-$?u_`1X*-wsFA%G&tye0wDJ$A$#1AZG_-V3Myga>}E}+@Q+#UE2yuO zU4XJdqhKTneZ;H}SfwHxTt@H-f{zl+rlIj8@$^S%?FO{guDmT(osg(Y>UoYY=yrNmy&hlpj~)i)Vhl0O<3xW8)w5kd<9Hb^yB4-dFn?`VTs0w z!y_zzkJHKu92Jq90;BvrE&c(5l$@=Ngpn%(#jy2481Z53j#~O5i{z?<*0B><UCgiJ05hh} zpG}Mzr+G$W$1ZkJ!MJ$0a@7Fy@JIaA9^4Bu3o;zJz^pw9^bLA!*tpRM?Qtv1qo%Lj z#=@p`IEav|0$8q!avXWqwV!nwqdIXm5Y9Nyll@>F=y1Y*!3mrZqq~b0rd=VS?|Un67HYl={n}J<>zZ_paCmmp=&(GXzXc#|FtDp`-X{iGNN!Q&8G8B zKwrnEYiN*(u`%OEV~Z=Aa?q?jVEPmWU6aFn0*f_&F)5Z?X0RVY4zrbT?P2agkZg8n z*!s8x;p4rg^@gnB;*43yhdLVADO*4lYaPUiHLjtSY4ORh7@YQ3a63Se_V7zShS2cr zL69K@U|KtQL>+PG-q#ZUjC`|pU!VlK5hF_nA8Y$KnAG*5&lm+SvuS+-@s@A)7Q1zH)^w4yESNG_^T0M&{l5Gj*Vh@S9T|3?|mLNjBo_wJR3lpfxQH;agCR6R+YZ8 zJ6|8QgS!&2p$OcsNAUuY_7_nF+~q{>agez>UgnIu>J6MvdqxP+OA-dg81XtKHl*~1 z*pREkvE0#S>>NgfM+Re8o3ShP-q(!y4Om38`NtkL*{~YMt`K{$r^LrxolzhMvhW7- z5vd1b!~9|Ir;}sGS>PckLf9aBup`6XFJLz@Ao@+s(7;k=~jG18S*f{A;^dk*xF$EI4Tt2!! 
z-gE`l6{W`Lys}W-e?CS44*if@C&zLZse*_3**Hi;_Hv~hSGiITdjQ9w08CeUU*ld9 z?6QF5v1<@@K(|bek0}pB=NjxEt>YBT0eS(ODpXH#cWlAYS8zm!ol_XZCVIF&rx2%m z$Mh=hfgF&K8{=0LM+~|rBtA|lZY5ck8$UKR=sPl&nWM_`#JkmB}APsYd}bR2qR6{mvp5W zAGU$F+!$0r5jMl#moSQFi@newZFKuE|5<&As-2>Sw#qw53aIGL#p%*9+L|q+VRE<) zt#+H~Q)tkD@YT#XUhE?D!B1iASp_f5Rfk}p-5|b7BXX#ULe__2>(!uj0`YuvlQKK3 zo5{UHT67?qMXg%KD5lbniIPs*I^c)E;?^EUL(kA-!v`9pu-g#o0k1>B`|)HI+q6nI zg`fzby@)b64dozFPysBcmu^DXR!}asQSeHLw?cR&HzJEB^lCwc_M=T^`I{9uNf7cL zuV_EovQ+UdXjG8Nz$p@}W~a^~gU#jElaX8+EDrj{nYKxV=JDbX5W+p&+f}6ZZWHo2 zGEQ~?Qwk-J&>lQMkA(Y@9e%LaxTRcbFqA9>1%2Qp_yeG<_x>@lfv$?gTQ_0PMaoWU z?N3Qs8B0zWm;V9)7*Bc;(cJ~8c^+PogApc|2o*{#HjkApx`)5OBPQZPejV?J+RbA5H9Pp_+{zBX(Ed zf-X4+-U8FjFc!CqjXE5z-38niOs{fF0_~UpeR1y&*$J3DNBCKHUK+YImS=xzk6X*4j^f}4H5j*~18ajSf=d^*mI zy@PnayniKn9LD7ziFX`P><+o{MH-{Wb0gvC@zWwD(c=RmX9c})bIK-<{sSL{$%U^= zU?^T;XZ$5Humjuy&$7Xb1-!!nn>Kj1fHynfDjOUS@Kyyjl3DREFQ)*;B5pi zFq3?UJD5|Y4*>@9P=RDkd4>-lvV6QTN7eoV1q;S2a^j6X1>2T{`QnYa3ikUXEH~bG znu7gP5_Vd=ae#vTAPE}~Z#-SWmIB6_;)R#V+CDrLUJYMmO_`@%!af?Mv>2D&1t8s- z!Vm-rI~}oqRA~-!6>?O90#ZP9LY>DI26fU#x_LsLFYsX*h$fw>_r9KJr&L@*Qsn!? 
zm=hRKS@bZuva+17CQI-|dp+$UIj|Og`59-~YLn_8o*xJ=pPL}C@ei9s9sV@Ynl{jo z8i=CAHZ7$x)a!n!Yj**G^==4Yqn&=i#vtAV8uq>(XveYT2}2y0_H2k~m@ch2-%apq zVF6n4C>YA#=URJ`c4`*=^Jdwr8p1{|YfsSnoG+DUG5c{)Y!_6xD;goFGU*<{V~eo$6JtZJ z9S!S?cmd*F+-|0Z02abAwK(*xtVgw)d|ark3g#Zy$lL>tb*Gw6uo?Z$K;Dmx(AGf3 zVtzAzw9HmEWz0XE=)OHzYE0~ZEnuJSi3fWxz?H9%-2>nBZi)FG$1#n2Aw}|L z5zMy`_Dj@)vs3X2Y4z_s1>l9PDY240YqqSViuyi~@xH=8|8r@wpDB6VEnaLg&cayp@=Vjhh&Qhbm5BNhkfhs>;}K zOatJh7^B`ZMtz`sWbAGVjCv1@mB54yTS;O3lQX>36b_TAFnkN!=f@li6D)gbruaQM zHKTaIWrqu_#g}jlZaKnr7d6{}=|)xOCVq67mKD6pNs!Ew^se`Ofl+@7*S-(ax8Ip4 zVW#HTy0Eo7Z0#jJrh%Me!lq#|#DR30&3%0oDV)O1u!zpI7L(j|AV8k2gV;b;(s}fC z!~2ePRN}g;05`$jG*#a2>*Hev2CV^fQpHw=y>Qje36$^C9{M?qqZu1&j-!3NfDQzB zyq-LMsTms^^1dkc5;P=wX)VbM#>QjHgXbT}OFqXpqeJNeJBc`KV=z{#5KA|S`{fj( zcCZ_KVw@RoX8rJB&>D{?hWYjQ{a%pQv3J`Vw#kGeU4Bw~=(rRhwQPAnl|-7U(t7x; zRBISk0TtqmP;9iSQrH?J#EtmE9EG&Wj8DpO5HINu(U(H>ISp+O(wu>5P1>0Xd2eCT zUqsx8PGcdZr#S2zYx|Axl&8eUU+b7QSUE@(PUpa5v=U@6tj&QBhz(W)ku(`Af|hdp zMJQBqg!qjqk4PCw+mD0`NxzgefE+thw8XuifNf#KS`%Bb(Ux;?Q`dgq89h8=)H=NB z33^_CWp{upJ2%4~Dk&u|WX4-$RwhC#5tmmmQ4VHyIXx`VZeVC_53L|cF5y^!}kiegGIDW*xmH2aB}h?A|w@1|g` zJIDic?~8l+L$pC&lZZ0|tZ^Ps#Esg2eSFvl=srl;V`7*@ZBm$Mr9{=aoQ$`LsW9To zj=|GNd>`ADH~N4KTbk&^-fgnMT$;>Tgq0=6p&HzC#&P1A1=!@oe&RWSDtjLGho@ji z>1nJ4Vi5m02pXEV_FJaZkv`Bah<^vJCOmct2NrYQxeND3{5E(M>w{hI!>d?depGw- zbBvY|zttPG4sezhvMWtCVJpa~aARaj^J~RLa5YTic=pX_+Zm3)z7snXp>~ELY;tWd zc4acA0sS9B;Id;X%0k_WF(<}}7QU2(O=d4*8eR?yX7AyE_Aj~qmP3g5xBn6)N~B%I0j+I;lI@rozw; z!$a7b0b`AVc`j@nT^n8dmtp>v!*c3nB?iGZ)=B&Ea!C*U80imP$WaiJ(c=AHDqJNFj4G`UUWoy}Iq#6a>6DMZ%tG`Cfuk1Q~c;`7m&9 z-$BbOR2Xy?c=9VgiioWHv$ zdXnup$#$Il>g_m^r5#7u4mH49fFnEr&4|ioW7P~bv>&D-)k--d_JPU0iVh)6gekth zaeybW=KxiP9d?C!P--9;yH?z;V5}HVNK^?SrB{SW690xrl#+fGyb`QEO4PYf$?(ST z&Rr-X>Re13@a_w;2Q@#{e{*v3z;H%GGAg`pA*DZK{($qK6jM23)hukDB#s!J26&$jRyu~JXQ@HU*3RC zQbrBt14l6>v<`pnMXarxAiWDsQqzFLyh9;n;{FF~aX11;jJX2BfuMheD#@6r2%D`_ zpeK8c3wGdDEYK>5edAR}oD&oxFoC+od=#5biVY_ON4Vdn{WqW|x`BMgbAlyrAyj>O 
zWIBS=-iU$UoCvnB9rH!T;5Ro?gx}L5!|<#0ik#l2Ug;LSlKs~xe^cp|?hN@shFhI-Qg6L(Tu>zc zdhHUtCak5l-Z?U3KYw*m^TJ5|f=FXcle{0hXi-gV{bK$zmnDFvJ$WgnkEu_rvheYO zNQwQ@gWp|;9Q}7Phge#65VM4!HhGKL@ls13ba}^v$hH69j~YZQ6hZhqBWEZhiFjWg z7mU6IKGfFC{JOO*x?9uQAID>D4D#@K=f56A%e_nfP3a5F_PS(c9Xhe)=gu2ct_E`Y zsguEUZLuE9^&}zxdPV2K+?Lsx%WiMq-ZD#Hn}>WH-yQ}Iosw&9T>4TQAlUu4If!^_ z$8(yo(W)%OHs=aY+ldj{gUuMSwimn`D~Ed=c;`pCk^Q1SR5_six-S1D2rriMM*cn! zzxkAUZ!L}!dzbu`Die9Rk!4A|)3kQZP#nAy1GM&eNMphX#M2qgOUyu;^|1dN2YN$W z_IF?k*@1>v3tYpdVyUrX1%USGJY5P{dY9x$mk`(fxZW3s|HlWJbLK8kG9j*`ZUv(C zoE(DX63q@IyH1QSt?~Y3tl)L-Z!`GvOLB3^OH>;LqxeeiGQM1@{ZUaf%3op_c&Hv5 zHyW?t26Cu&)AjLEgn`uSeIx=Lhz&AEuaImGLwv|K&f3-j8yg8CX9QxG8KZw~XTOcj zG&|wQ;-w3Cr-b1z)bW0%hlip-IH;jG?6Vnf!hS^S7=;P}_=w*KV}l_q_ehz&F+_>A zzSo`k7z5+f;w%a$&WDdMM*l?03fFc|i8pNkKR)JPB?vu{GXt?<=NqHdJBh-z&*3oa z3$*#HE7us$hz%>Wb3?UnVtwUps65+RU}HXZUb7Eomhctdyt6MnF~cO@cZ-+1@`z0v zQdG~gsHj_MaTEmct-Z*CVf9%RgNEnV3Ahq(gz zR^&`zvqTI`eE0@Z9JLd16As5&Xb-bED)C%dzJO|n4rLGF-L8+>51unAK1N)0W7J1j zNymb~L*d%vq1r#1=yNkzi?@{C$4QdvUNtHhA4AUVkat$05Gws5y7kh77@Fc9jIB3v z7tb-2^pwCk+G}@?E{xqf)=oOm-m8*#EgtrsUQGwLjl`9abNVax*_K)l#DrS_L7BQTJdI*{C}2q8rdrNVr5## zRef0ieUkD2B>ykT|FSQa|L=df{Qta<{GapRCjT9OPX1SXsr>)6U-=gup3cWf6`;8M zzeWr2gnfn=g*3mdWUPi3rE zts`B#@y?@{1J=fY#}U>DY~ysE2k7Ytor99{Bz~D@gYc5p{W!OPgYieOM$9Ag)cjbG zP97UYKNDnMc$l6^eJU&#Z=3_Dd>^z6kMEv$C8>y+H1O0DxGJL=UEhp(2I}RiQmQ`i zefY=>)GM8sow9uryvst>gq8lUu1z0r-&1tWwaPokDS3G_@ZB|wi{|05eB*-Kk%5!; zI6Ke(?QA>z4h8%FLp%Ryrs3`IOZP8l z|G~u?&W$#O(t~hz51Z-@xAgre_TZDy;f%Ute-w^mo43nt+Q z4A*00nX8%s?qDSu5yeFETtQXK<9h52qU}H~h7!aqL|n!{FpIn1b6s za3RSV2O>YPt=x(+Hw&*R;GZzP7f_P+p*cV(^9 zHDg=iU0|oZ+77QveAxbvluv&vquwSW)T)q1h$ky5AFxXS-^u|c`X~TYpuDDE$VY_g zQ^_)NM+M&(@hbqGf}}3?t2Mf%D)N&yA4dEP|N9DX#M}A8a!+iej(1;S{>yOXK4a7? 
z+?)`^FjKWW4g^kMD&jZreSqB5g}~Y)GK0noEAhWqkXUOY6`2_6*_euQW{of*rbn-H zYT1c=E{&Q&yn7Hcrjh&dWPD6upfbDoIP_OB#VlO=662J`_`jrhf6$mmP463~_s!{e z8xj5(IIQ4*DVm`c&LPZHP$BP2m{vly!|dtPS#3fQn$8V-pT~1ELx{jB(577dM(n71 zMC!zOYa0qjBa_hr6wXOCD&59t zWMRo`SOLr%h2M?2y7wR>ctu)AAs7QZ-{4|a#~xLy_jr&d3&ePgSL0#r>pD3CZ>7;o zKGH|M$>CryN%q9*@u^s+?w0KqoL0s|=U|EDGY$ndg~6UM8Ze$CR*+hcXH`2@RrAzI zTwiR=`O~rE9DnRiC)^wK?%?a5RJamNKQHJ#4nal**$1X~hjbn9gWfmwsmcA>-V?Z2 zPvKn{7zQtf?MB8%iHkQ{aG78xtMMr5#K)g6x@M+0njx&Uv;oYLf8t+-I~GX9Kx_!M zW9Js)rPaA4gg|G)>Fzwe>-Y$x_cslWIyUtTNapKiQBP%CQA5~O#=@pk27hPU_ zc}ejVC8b3pr^1&pYa$AMNlEc2JSm5yarNFi)#Gg~@SZp1^gOP96aY3HUw+P07l9Sq z_ih-}nta@_S^dD8`hm4&z*wJ0Y}gIF*5wz>%g^(exTTNF&tIT^o(cH{59dtCFMP;1 zA-^b|J0XAMiqrJ`;$;K${IcljM*d1~PH%o0AbS2tU;qQpEc;YnW7fiv(7U5R@ zYZ|V6H(M*^ffOg3>ad1c9;E zR|jqoxWw@d@B-@r7sUy{ssz>qU{%^!IS&G<-bE@CX$mX75J)+C7Oa3x%Ht_z-%`2Z zBTCE2FEH{8F|bDdNF%@4$S*VUD~x>8$geW;XHCeTTP?D>0{D;F&;5Wr53EIkRhd8Q z`uw?-`3=|SH&^DjPReidUZ1}l-<95JpoH-@2Hzle8RS2E`oT-wnv$oHI3lD@1HZPG$ZseD{L24kLXG*EZj+z{;uZx6pts z+gt52`C3kXI2n82hCOA=ux&Ro`b6wTW&UAr&h7n{pbz%7`0|f=Z^&;$g_fg2D{*}k z-&Hv`^07{2V#tuO96*%`!N{4$%)9JNKIvrg$!cWYfxLiC)tx6)f0c#mOj4bKOv5hH zsSk_11({!#Hv`Y1Ls66Tc95vY^91^>kbQ+u4t1rBT_tTo2G0r~L0@>o`8&||UHI+= zcO1s|7`|;7!R7d_#8-9H4B##S?hU}5kE@O}RC7OEWDLFq`BlIReF;r)W=f;GuuCC^ZKg1w!X;5<1@4JxIVZB&xRl$*F!!qklx9_&AGMTra+p$ z7-Y*a(J4+)<+wh9>sl{)_eoTF#X$Vv^Ofq~jJQrpz^Jh zU+POLm&y6fIkWn$dZkodG!ce}Nw(JBi5k$%+L7Jdby?0^=Z|b)`=HQ(F10s_#wx&zjPqc{`5HkZrDCT#7$SpQ* z;LqT~K|qC498Uw1jppS3G=Ycx@%RvQ6h8Usb}c2_?y+9!P4ACTKm8H>v;8qa^+)x8 zpg*Pp*KH{KYv+9Q$BqA~{`eF6VeQwXJ3J4OXM_j8gg+{TqbK#53mJpqaLvfCf-6<& zI7nuGE8QL#F|%^*YTNxU{K#VUYpP1I|!e{cR(W!gJqS8Bv#H$AkkbAX%hJOdLWrVGBeCo1ju#3 zg);*D6v69(dwm8jO&iS+#vR`$@LdZRpfe}L0>1|$@X(V)9Qz252Cz2RuH`wCU>~cA z+`l2L988__ILCgXiwcF?emP9Bfqx?l+@A&h z%`9*R`TKr+9!Bu`ete$MeSv`ZJeOpFUzP=4k_A3C3!Jn3^bdQ+>sgQl7TI@W1imB* zPv?^>lW_NqPepayAi4K>%KN|w@OijbvG2#{8P^BMeSKa}OA?-5AKCSn%yVvhDw5$O zlUJWdPu{z4e4dH!Wb)qUF_QQ08=q&AJ1O^*@#_`uS7oKw)9iy!fX@@`1D^wk*RwbY 
zPp5N367Ih7iTCSwd(uvd8YIMHUr}IEK0)D%e^rPci;Ft)7;7Az0Wf}dGEgQ zd1ktk$$OvYrsTc*#^<@kolM^QJhPMc&dpNXcs+MH$uv#v^GL)pndjX2Jae7fw5!i^ zTUy#l_&jyaZQ9l6sZUEg37==4bDO;W74Ezeg;5~KLi|GD3zK;1{ojy;yKj70j_L2? zQ_<2sSw5G%yEA>BRJrMJaz)>d&(+WFT%U`7(%~t7PKR^0*!Sae$-6t(=Sk_Mba=|H zrNdj@$>hDyb5HW#edF_d$DK^x`#kq0@12{cW693ZZYcX;}-r(FW?YacfXysy1` zUf@OY>;Bm)iN5yhHGzNAjVkwV%5Pu0uv6fD_4x-j94{Cg~0DglTU(tUjAO$hqL(d4Zw4yYl$jACIS1)(Bl-Pr1PR^4qlnFL$HLz5JVO*hj8$xIfnm{?(ox zZXoWfY&yOS_zW9l#QR65wqc$lJtFn_qDS>68Pjk{mG+0o__4(KM1_9-T5aQUXf9u)dKHp z7k(@7zIH+Oi@;tKXV78N=ef5Jv>4~L@YMXs zY47eVbi82v3q2Pk>+jIf1a1eXkn7O-CgA0l z0*B6z0YAevFLOM(UjyFXIQ}jRyfX`&|D*gu*SySLZt}b(aHrO(b#QV0XSn7&pX2Jg{;Agnzr?Phd&-_ak{HL?PUl2N}d8@3sgU(KYJ1LJx*3<$2 zRN%T@JA{<&yF9GvoZ*_EDnAB$g8=8-HZwn02>zrbSk|GDG6kNRhsv5T;5DR^#7^Bb zXMr!x0{>AK_;0how*Y>I$I*@+kF#EUP~f3tMP$vGDacuW?d3XYfAA8(&v4CuWgQh{ zt|NYudg^9I7I>}DNqIlAHi*aW%7Wh^_^J7_ga6lA@P8@zsd==l;eq~|Ecnk0{>)^> zW$g{AH?!a$68xKz__Ah()PRBg_ruo+?}x8OWWm2A3;uP0pW&LnJ8NsTS?JtD{2a$R zc|5Z2hI&3M@VUwQ%lZ>i|0witOTuLx2C1h7J}(KE^$Vn)1H8X+-i6_#qsD5$7!lt@8 z^J`i!Egd^{>=o%qV=uqTb7^Q|F@H+-)LR6S#1eeGIC4yWc zIF}2~#nz{1mSa}q!CqzA^ijwC?)Nv%|$Z-?ToXXIY33_PG zlu47O1*+#%>k~qOIbZt7oOvw^Y9oyc7kK8(SyZ1+uAe6ty9#qSDOnR&W*@Tdyet*U z0`5afHMlRf6p;nohlr|AUu+Uzk_Fs{h#+U_rGAL8%q4w@C}^_C+nQ?afULS!Z z^1p-5X>D9EZ(%Z{b>5=-)Wbv*vUsP#yn6ia&?Mf1#s!U$WJ+*qS|SVQIEZuV>gUz8 zG^J27@NK%S4*r*rO3!j0xV)vYE8_C>?yK15V?UNX=rjpbzNwMKz5JlHgXw zBFVJFOVU*5SqHe7T!*qE3!C~pP*=aWR^?@knK1h0p2Z7nj!zP8sGHNzcnMaLx^!io zO~QH71x!Z-tD$3Du*jS{G2r!084@6}*-A1lT<}Um&4Rk7dc`tMVRcJtnp#pk#W#pZ zH#qstix%R&A(1(1xE!deBD!QzV{-(CnOt~zvar_q^XnQHz3Z?;oP1{b7h2+`9?!gc-BT~t*PV{tzi zcQ>Z>_T7#B;M|r@=C=OmIGm`4)q{U?T-%&dd9{mb$6h|C1xCG@-bhW|oO>GUn^OE} z=U-WxtgCdW(|L0iqu)iK=QY*bp2UejY>`p3sJ6kSkI)Z-I)}`Z#6_L^fqof~24P`K z#50c)HxDYw152K~Pva+{oGeP2&vcmEW+=xJ;T!odj=KV197U7QnF{P7Jnn+K@gH`< z-S{hAa5p~JtoozR7>WaT^O`xgsY!3V}8(wH%9r&YmdUyt$eZ)_Z z{HE$zVW<5W@F(rG(?1UVwKjZA2L3a4x;O(q*G`vaz}MSphp!y^n{D{`4Ez`DbVUYy ztDQD7;IG(Fu9l}X3VCr<(&?~9d>V@F8m 
zw=O!}_BasEXq-d;c^jTdf0qk?jSK&X4bL2x;UtU?;ZNb~&>w5VdAZ#MFSFs98fw-Vyg8#h>-spn=lMT>pg>86deYoz*59!ax*THwr zZf4>yb>VYYql3RJ3;mzkaPrCTUHFf?=x|q*L+3YH=)B{i^Q;Shj|=}VUGRM__(LxE zVHf=SF8F6I_#-a(2^aiFE_kmC{x>f8nd~rp*q_v~PP+Kk1Enly?;BNWie@o%V;ZH{%_qyQZPY2HR zUVfOp4qvB#&dUNXvEigoz31Rh%YuKK3;xe8{O`En?sESs3;Z8!IQ!YH-+twScf066 zVZ&KZj-%7w{Vuq>KYLtocYiut+)0P>?$8-D0112uclYOL8=l#p6SBbX%mR;Pf&ZT@ z@NO4;J?i9)%ZFLuXJcKFAJ(5X%faXW3gL(FMff`KQWxB<_a?dEZaPz3a5tSg7u>BU zzhlEQ`|WSD!2i`n$32eExZv(_+~|V4$8oC*?jA?3fAYinxcjr-hO>X%54;x)@wVzInqZ|LmEckEOaMF*s=s11| z`(c#}K7fe$@X~^>Q=hNd@JxKeh3~G9^Nt4MyX)Vc1^@dl`fmI#7yc4gecsN3-{Zo+ z+l7BJ3x2c9zHD;g|I&u@G5X|AKdg4)yX(pQO8gN29(*1A7qj4VKM_B?xXWE+!%62} zd>uOXy71lQ{88 zIKB>@X$0bvsh8CLT*c3~BHzKE;_9EDy5NtpLHO`82VV!@?Vtb31%Jzh&pJ5x`)oMt zu(D(m8rCkk>mQ@x0($Q81Yf%yWqlHKlbj2TR)5K3P z#k5!^9dRUBaG;GKR{s2fi4RdjONWq(Mq@T%Y{!@ewJ~(0*-(*+a#cY5NeK)gO;?=^ z3dhd#-gCZvzVqJueI#-8o1gC~4*LrTZ=v{i5ndvEJ>g3ghaFtk%>Ltq^L2kc;ke$Jygkpt z4z6p4zec>z5Q^{tXh2DRvhxr5&j|J&l7&L-sgDUyA=0+-lsUi zhjWziK1}j_+&xM13rT*H;^5`uZYRmJ{nk1Wl;r)Ko4|V?7Ubb4_mguKhn@A3&HqbD zp8M?>!nt2vO*r?Tsf4qg>lOF@FCzI8`STOP+5gRivp*js?BlyCB*cOJY*O6&vmt@M zNp`p$>-5E(w_|fpc)Y0cL&P7xkNsA{&YXn&(uDjz!XFS0Yu_V#%Z4Q1Z&w^1#~)N2 zesX`_pg8Kq{kGSqf*s&|e4S4?p9e-K@S|Zs5{j*o&Hr}^e}V84ogY|!BjGH6C!Gga zzCgz#%i|wQB6)vW31@kGj}7@hOE&-K!H6W_nB z$uA<_eFs7MzB-M-Mxn9isztoHE@O>cTr=TNCLa|-4)oTF7ljToK z;1ej$JIKydx=&y`*C+5L6lZ>qcel!;fBuf_<9mN3;HxB?{{!oQ=5c$!;pNw=Jeq-d zSJL_Xb=hb3pGx3wEAH)FqUV3a`EO+B3c4;Z?=^(;?=>&e`Je6VP~7wGA)M_zOXvSx zB#eIDrBAn}cPv0J1 zw)1nvy`2ui+0H_J5QYC7w`Ki$>@$8%|2Y26F+Di9<3APm{_H25{dw`&9y`q2E5KR1 z?2&bg+c}EE{v(86O8Bn{pRG8K7rtM*o#dNI{zoLw_bvC6{4FGJ*FkUpr*s}*`*yv9 zJllViq332X_Z20d1N`m|^BpV*aJqP?b!WRS|Gbs(r%2v@CxyG;Bm6#+XFE2o3GwMB`G-iJ@!u-$IXA^$Bc0Sl?BD{-myVhg(677_bKbPc3 zX{QAKCBi!iKacPxy1yMk_+r9~gx^c}g@muB``eL(KYn=FfIk-zey`q_0>6at)q0Qs z{|e#z3BQc+ALzjY`O66}G|swl`s|65q?BV#Y+7iXG;3C&apvS{ z6B~gx+IQpSF7_=2J znH}q~7RV0g*bW_u6|tq`uOfSF{F08k^4@EY)v=WogQ%+1sft$2@5=F35hp8$UPT0~ 
z7TJI;UkBaU)8i_SnTp-f<3%N6u4gq31GOSaBU7dBJ=^D~i%mB-;=9SP4Cc+@CL=Cl_t z_axaf!*^vJa`vpLuM@KwI%E7TB8yau{|b?b@qZpK1tl>*>gf3w3Krup364~Yf4<1X z_{T>0!R`513KrvU7aXY;{|b?b@h^_><59@-KP6ath>M zvvi{9$NcaAo3MY0%HjCwBiZ=v_`!Fm@WuOo4?s!!cp5fjsc^Z({4l`0|7%px=HW8~ z-v4ikT+IIt^`DQwqr`~!|N9C4EdphBNcO-szZ{_`eGr?bvph&6}zg-FbhP6ST^W%TXiHThQ z>#O{}UqKeh$`e)J%QdL}t{`4s`t~0v`my$JQvbRC4G=5d|8o-lAF6@#ewp7^j!XDo zullw8zbfH>YsCM75&zo~{@ZwqTK+Fh_-`kK)#b>C6(9dQ6aFuc_&+$}|NRO7SF7W- z{NI@Hzf%=!`Ts=1|BVsj*3$E1{9uf~|6@VH+8O+KFYftgD!+uWBE7DHw^X2k zi=pWW{$a*1`;owpdv(u`?|)E#fBkCJ_vAkQrVkyY^{VgtPrK?*3u5J^kKek4|D)Cg zl_jbP|M44@@mpMXB>eBK@9jTfj;bBow6abb59{9ldVODS^QEeKuKyn_`my%krv4W+ z4)7nb^!_&}zmI>r{#Z51h_y7otsI%qx9>~(nxN_%zro*DeP90u{ZY*36E!z{`%Mvj z`70DnK+A0#!ixRwg!-Q=+4!yf?nv<4jr;j(&kE0fYl6S&4?)9tz>nW=JU@QtLM(j! zi~8VT^PL83`Sah(1b?&gk5WbOqb{C*V}gIX{%Cli5o_t||3rd+d8GaqMEG|ozwdv8 zP6&kBeal{0zp=8H+^59iH%#?y{#P^L{XazX#f4n|S)=}Us3PKbkz^mglN0 zC#z<1$M4_-|JXkU4SNm)|HYC$|Aor$P~@ytih<$8T!F z|Bg*TqrY}+_zw~_2C@B&AA1&GVE&hF^ zePZ#e9})EJyOpq&KmYU={n+u7eXK1?+f)(!VVTW z|HuS?+n0mB%`Y6b^7S8);9s*fXjq(4|0^Z?`hO$E|J9&hlKq@~`;SfVZ&dypR2lqN zN%s8s{x{bC-3XQ>oBud$61{|NS^o|210w_Je{Z>OV!Y@mv3UB*EXgAn4r|it^I)KbYWeel4gN5BOUpd;ZM{ z{^nB9w{bLKE8qUlB=}n+{8J1c z(f|Gi(e60VS{hUqyR*sQ>@~ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_generator.o b/third_party/libxsmm/obj/intel64/libxsmm_generator.o new file mode 100644 index 0000000000000000000000000000000000000000..72b9047dae909d5f078366b7ca915581fe94fefd GIT binary patch literal 15128 zcmcIr3v?XSd7jm-EP<7o^@B!$iI=b>axg+`Y|s)OJ7UJWf)y6>1C;=hEUj!2>wz>v zg3@5TlC0^n#%)NGoF+ZAJ*Oupr>SYnITT6~$-~ zV~YPg>Z(uEQ%l_u^tUUOdzHT#Qx$#h(3mpxS2TKj!lMk2qJ_cd+LiamK%gnwL4D|O z|6*WMydN3^Vzd#Wr8Wf1_eU?1w9(s^-M^sUrzW5Go45G@m&VK!=DO0*0nKbGH?P*r zXqjfN!{E}`Wb_MU254U{=`{56< zr{puRXMVg2IO5e0v6Za@{}@_l6|WvJ4~`%4oih`D8K<#A(Pt=rA!%TdB_D*t#-vz>jly<{$>H>nGZk<(ia@W8(b9lJEKBUU!^us zXsA-xGf2HvH^)_T$~^4V%D3`+6Zn#T2YbpgGhVO_&DfvAplPj8~Nqpzd8 
zE6~vw=;}5Cef>Q>-MvPl?c#HVK6RsNKBt=#nt4FouyH-qm&Y-?#yB)K$5PvZgR#{2 zg4;o!JXJCfGG}7xHN+cl4UW`$eyh&CWz8e`NwG3?g%@O>lqf^{(XjAX>T^V%3J=fq zFM+)Xb6>kty+gf|&tH9-ZmSth8E4m!K76?U)p+V6;(59XkDt(REPWjTwW)<{+DD&k zuT+)~5hT?OA5ey(P5yOxSRou7hz-97$iFOTe2!`(SV|QAQf&A&qnMf(#z^auxw$!- zAD>;GeLIT~SXIRzX7+VZ57uJ&-Ytb?Q%c1ojHCTDNG4(F;dXVSmTC)bRX5}e{1D2t zVBkY#(F6JRA?O8z#^*8a$Hr{emG=)r!R?30x{YM0GSmq3Xy7#v-TvPh4PCbXlAm~# zp%OSb`ALy7e9Ni1Ip)##xW(r)i(lkDKUu6)Jf~D(zrbLaNd_}Z2h|Ojz*ZPfo+W?6 zPTXyH-r-^`biss7v#>+`F)cK78$Mn6=?e83t2@TJ0n!UiBzsW36(?h`0S2fWNHx4h zF2e}5d&Ig`VciUwQk7$>@|AtZltsGb37R9|0|fsJ@WRZsK=RgW3zXA(tzfhNF7H)Y zv;VnLu`j!a*!;<$tvIP{OR;sNxf^Wm+g#_~0F9Y8Ja#j0lm9X6b9D0srcps(RetJM z&0nyYTXUpmJ)8HfdHUx~TNy4U-^zIGrae(i2PL!XcbNBXY!!BvGWVdXESG3YU(UvV zev-zie<|xf&ic&0Cjr>)`!@Q2kbMc7e;3Uk_E_rw5(PBo-#D zU6Gk)uyy{y&*D+0$>O)u(dNpNcLM-rIgv zMSM&m$$OFH`B_rRgl1*_TrAoJGQ-;2UR^<33Lg&M1CFhEik?m-x z<_SG?_$e=%BA#gGSm^NV2Pzz=>^Y!^r~0qZ(>0#D)+yB-JL=nwCnM3#=W5LOu9LGL z)F#J0DlTMY&#QWP!uXYH9w1gV{P93(a=b_jkKg-4HF?5w@13|TjmFy6=Tr#POy8~v zyj;Kp_0Tl4ZMY5Fow?v>8JuOE7u7tdnR``p@1!T4c@<3)p}uHL3@Qq5CH6E>*kZ#S|iy*_Tf8GGp6(EzSw^KJE^ladb;-g!UbOx*m9 zX3jFlYU!Ij$xq>#%SsSE%;#e1XM)=yv0dqA0ZAQyr>NGPs~vyCcT0NN=WCPi_-n`C z@zo~xder0_B}Q{C?k1xS&m!XnJiSH;PoHrKp2fy_crGwXlociM^n-pCb!~F;tn0e) zuZ=2;>HY1fqr2(x7(E`R#~3{h(&Gp{Wa)bJ|wH>z{al^- zeqDOmL-F)Z3G{GDkyt;v%j>bh>e6@5>8Ts6XqkK)IoDRKW^M9!*pn+E6|Ev==u?PC z%c~4cS}mV4^i8W(Y_wx!fpI06PD;=u)9fXBcU+186sE&sZDZ_LQo>g{Mg4%sXMCr9rUU*tcpN+Egw3Nj*%8y`hVeJcS zlyA6GVi`-|{l{tgLx=4fU|>nUV0ZVA><159==43FoVt;p;?5?y`Q92ioP%wy8F@Pg z+x#$cA_vtVPD*wtTm@He?fUr z_I<-X-RvgH1R=z_k7$!-4zZkKJ0{I$o%wdIPg^t7Z102qOSwLIDf2E=)|w}1f8Ch! 
zJw`XuL(fvUwO%gNyVNFJa!fvx_bt#z(+>pa%(=Mvzlg~!0ITEgpKI<@!;{JmI^%S| z&3=UY<~kOxN9gR>!ZmIvb6Ouh@Qp{6J-%T06=mmxAgD{+p1P95Gg|mVWycH_Tr+3%)K@utLJNPY?0A*I zC-l^19G=#~N0l9W89c38LXvg1)2m_i`C*+<=B zHrQ8cPAPjnAVz~RV!W@6x|GM7h>d&t{M6l)?JlKmG}|qwZgsZ13>F3mx|BhgK1@_c zaLxSfCuoGm#8sL3*2i;L(chSpY(eRwxC$Jpr9n0POJ&E~nHuC<=uGx9Hdh(QKU}5+a z4$s>CI3p6JEx*`^oX`A}`A{Ez%eW-`KYh!?`|iCkZCpNQ{+i+FmYJ{6`2v@Q>wH34 zJHfi}NVd$}AW7Ttmn%3}>=GBX6ut4rYXTL~TQ;r;R9CL4tO~5ES{15V6IvapXh^gL zVl9S+UmmJj0p6utl;{2i$usEpTv%38x*NhMFJH#T{)5+qRq^(}*)W*PO45ck^0u9B zn+vv`V|Lz0U9cZ;>?K|YQ9r}mtEkrGa$OEY<%+0WrOVYda%8<+DoIlqdW+MYhtxu< zk^SkSo8|I{y{a4-@u_k}vRIQNMR%PmSKKHERJj}!HF9ZmVNtUjYmn=8x$!2sxkZk2 zg91ERirR%B*dW(O<;JMo+zc5#a%?^8>vD6G9JybP-Ng9Ot&z7z<(~B*+9bzrmg{el z8-Z??BZG1*%7|3CS(mr!a?d6Z^|_(s`^RaaQNr}VoNMGjja;G2RZ+Pb)6fJTEUKmX zSAgs-&T|dtvHYaTRhnEKl_RsB?gV69x8EV)S@Q@!2A=gQTpT!l4dZs98#TL|{m_1{YgH69(_FV zNMFw4c29Yw@+@v|c%=*M-E7;NjdWRvHM|o0B$E%K=atyM5xU^Z=V1hRr9c7viv{pY z=Hc|OkwsDtXJ>*_d8JF|(G%dc`NvM+Wl_P7SZ%LFH^ThlmA*0`1mcxe7QopjwY;(G zSWiTY+)xc`&et1d`22lh#ic^_T$U5eTeqTKY&%EdF6(A^?i`7GD$VfRITH8MKQTOa zjs*Sl49}e-foB+=J4Zqey)`Yiog;yNnc=x}^b+`~tYvua9I-eGybUMNRuXJznI0}rDlDNaB^*(R&lUo4flujQgZMqYdjKwGZ@m7>Vq-Q{(CF$yhUsL~;IXU?oD!V;$6Es?SwuBw3>g-$-TGiCswWX=gXfYBwM;4~Z8B>@hl$}FinkqV)A`aqs zvfCA3GFY+WL5%$|@JnoR#EnOKuCU>hr&1w%D8385tpuXb&37gK1fC)gh3F|;5jg!5 z85N=@nF9X-fha^zQBdHI6Np0gtihkaFF^dJLO8`@fwO<-;yBGk;Pe%k3ei(`An?OB zoNN+!8{!-lqNm$i;0J6tWfKB_o8u%dXv2S7052xNC`5k={)C+K3g8_ar;2|0CFqB2 z^g$c`Yytda0lbXfK~bFcTd0Lo7BI#EE{4$)PWL_)k)etf)zlCf#{en*5A~PcTui9|-D?9WFr<_sH zZ$g_2>7`s);7@XYI_11=!|ifLOGwIm`Sy8H79-@_&!Zg=kPsBIhpS?JJAE#@#CI?@ zYSD8O7;=XV7i&t8SRay~%Q_`#$VPuIZ&URh8=lKXC5iG@DkMkj4eF241BGz07Sx}n z2MXb|uS9u*?>oX53lR0R4PRozml7cgRbr1(zru#oJDe!j+Hm{ZUcC+fydXy1X2Zo^ zrQwHd`1v;a$82~mOT{mLY`EA1jIFZozD}cMvsBq<^jh^+(J~T)$fep^j25YKb6=lS z*_+tbQb}l~xDeg5DHlTb)F@wAf%M8nT7gWJ*BD5QZ-0hmdoed2X=50RLTrZ!5 zGCmh8Df!DMw55PHdL&h@?-%puou4n#ajBg5wjvg`wa=i1vsM`$yrJYep{u0I*6z;E 
zL>JB;eoUCC#5eBCku?J1{r>48)lDPJ$*YPW>B!y6Jz#ML*5ass9_m-1LueM<d7{K>tCW>`{(uFHZe;7tmkr zqMyF*IsN}luHUJrlpko3R|^{iKiF8=SHS)V=NI1v1;5jNIwx-atLOHg;EHJfP<|)$ z3;+Fw>*wi}P$M<+{+a^%$GHA$gkoMh_0tc5-1G-r^iw|T)KA|(-R3V| z0^(=TZx-OM=Pwa=pMjr#-QlKxjPvtpWTiOgf2aWe4Ck-Ta#{^1|8xQVD*m$LJXcQs zH^J}b|7sWi2VjEJ|DP1l-^2C4cLw{*5og`}*UbM3&i?N|B+6<367aj(U(f9yUJ4V+(8PW@{N=%3>Ht2iU+r;y^*zX|+=p~@?5TtCgtUYz}H-2TOO zBDA-1exXkci@vDuE5P4#2K~a0kpleNIlt3iqA%)4Ilu6euyZ%(Xa6fvfT(G&P!aX_ zIlqPGo-|I6_w_(fc$ zVW)25Ji=ezck{qb{vnZ0e#)sFCIZ2Cm5W&=)j0;8^QSx6LBbecp^uj+&o~C1{QnQe CL|RDz literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_generator_gemm_driver.o b/third_party/libxsmm/obj/intel64/libxsmm_generator_gemm_driver.o new file mode 100644 index 0000000000000000000000000000000000000000..938c8d1bae01f9cf708897e97ea51a792f7aede6 GIT binary patch literal 11544 zcmeI2e{dVc701_>O@s-ONeH+NluIkXNgX4}HW)(^oh>DDqa-GFAT0q$w&hrjENP^3 zVtxl~0m9_RwBbh!4Gd7G9q5#HXq(cO8j_Gu=rl}cXgXzJ3e12f{75OJlqRI_?d?j> zEA0%le|4ribGv)r{k*qt_jXTrXWtZxEUm1lU@}#(#jJQHRmRrV7wZ9D4Y1j47OUBu zNhLQJBiZPNn6YrN5gUzL^J;3C0ckLvh$W-LF~dq3(R4bN9N=;$HDbk+F$0^pu!83+ zdA^+Ik)UbR#|Mo|4WF?PyqZQf9nEB8^F?d>wAOZ|wS!a9M0zONX88C?U(ABK*GOfA z(6u_$(b?U(YGs?z-PK?OyI2id!v|465KCrb3wXYW=kwdR4_LoIKx{P;V9q6 z?Em%u$$1Em?R8G!^|LpjC~#|&D&N`s_QQ%dp=2sO$cn37tQ2N16j8|a!eA}oQ8BM#QzN116 zdRnFEC8Ae~twgMsVnB$U`TbG~bAQ-j&{HSHF8kb&=WBgOKX9KG^f;xkUO3`vtyG5! 
z?)RDb=eQcmz58?kMiI%s6?BJ=n!nisC%MV+))pvP&>ei6W4TFpZoUJi<@hXjt_msw z05je`cKoz}`>uVt$&r%-ME5Ml}Zarr87$;cWxy-0);0Y+fptb_xSAM1(VU6`Db&J*4brT zIlz_q6j-ni+_gO(=b*cB4xBK{C2Q{7Yj*28Pc3L?7g~AEoqGm^kZ&?PalNN5lsgfG z-Lx*0eRd8>pJt#1!1?=uE-s=+Qqo!3ZAe$wYbXX zfNyX#z(;j1#yxT-MCjgrJ5161AT*f-_fz_akpdk;Wyu@fiC$UiJE-&bB92l?==5Qo9s+part;>6geFd-SJD6WgAW_}-^2J@G&jbLwE zH6h>gum88Uy=L=VVH$5UX%d0IBc3v;qa#Bd|KjP%-8rqebdiMn{Xa} zjvd+?MNU4-AA;fW`-_PICce8Zi4sOa;a&*1Y0;d3VXTnj#N0&#mzEr$~*F&+7>D3{nJ zTE7#07S{qv$y@ucG_jy}<%|`udcz_N^En8pc^UeqSbYWO=?P^vrT3a!9 z<_zZ!nA|$h&k}71ak7K1TF3qh7Y-3uSF31y4XPZ;08Ljt7`D6Wz)`!)X`bSU zQ~^hL>MInhFrz|CMRlIyQA!zBV!JR^IMu6&{8frpoenF!K%y2-~}gZdYT8P zl|<)rEf5F8*)B~Ec58D!z`2 zDq{hmUqD5uhlCzb^sfnhsiLnEdPLECguYtQzb^E&qF*iaZHhi1^t%=Pn?iqB(QUug zj6YJ4woSZV&F&={R<9TOAw|DM=ykY&Ny0scEBZ;IVbzYinhh!ZPR>=a-zofeh5n+V zKPdFK6#Y@5A5-+lg0-& zVb+XU3uYH$b`fT+m|cvS&o<#UK5pdWRz7a#<96_Y07-7?*uz9yFYI0=eB-s z>*uz9ZtLf^es1gMwoTl&iStdIZ{mCt=bJd+#Q7%9H*>z3_tnh%YPLtmZJW7mGq-K# zw$0qOncKE-+ZJxy!fji)Z40+;;kGSyznp3Ev)*NqRqbY^ch%CR-Jzb|9SgeO zG0TRd@g&Py87q~rivcY9htuR*;T4Ky`!n%0Ua^6AGH#Jpc_p)Hyq0Ae_$7`f6Yx15 zKS=A=Ks*!cw-8`V?kiYE%moKLsq-Xv5I&)F5`8w*F(L1UL-KVfLQ?s2Y?<@8|0M4b zy2^i*@TU>ptKlyqJl+eW|4TJ|2jOunl9#^|YCQ6H0>^tg;d_K1mA_Kx$m4sx^uJ!{ zD!)PK$e#g+3>|qPZ0hKgx{gz?;-pb34gzam(O!J-noST zfrkIF(2+lj@K0*^UlAU^m&ox>YWV$x{}SO})bM{KJg#l&|B!}%jqnEH-_-Dj2|thU zM>PCD2>)fmAJg#o`3V!w=U3p6@jHdCu1`1N>3%$2!_Or=-H(QbKcDb)KQ?IiCc@MG z*s9@|5T5SGpoU*gcn_JME)9PL;m;-fIt|}Xc)B0udxbjxDZ-ye{979T*AbrX$1NKE zHo~7z{NJhJ@wo^SuAh211GN|MI;>_5TaPFChG{ zHU9Sy9)D}3{}(lU5Ex9@?{YXKZ@=f3>hgU+<;TzoBvrrLmP_=EgF6& z;qjS4&d+x=JY8@2&MA4i-e^3}690a}@7MVMGvS*E|Eh*RLilFFf284O;KqW4MGG7< z{+U8o=if(oe3zE|Vhz87@c67H`867Tfbgw^PiXkfguj^ZH);4g2;WBdZ)y1N5&jaw ze_z8tL3n(Il;hp4;r9@J5#e9Z@Gld7G2vg;@E;O>3E_`v_){DJLBi$$95VjdLRZ(r zJi?oV$3IwNQuz?!+o4_hUq$o|qW2R$Nc3T%Q~nj9W4|j2|4+ix`%(phkkmNw&(D}} z9;p9|iH`3+GX5)wj^9Y7-Yax0=zSnbc>HE6`Ar%=M|gUl+^XU4Cp^7RKBD2DCp?WG z|3r%k3mX3+!qfQQ(C~Arxf|{W--+aS=L;R@nfB`=Jnc6~c-n77!}k)N&f5kJzlrd4 
z-fq?ZnUL^o_Vd>!I?lL_xFm~h@|cui!D*=vTaXdi3D zF9p0Dk~J$fYO%(Fs1;?6ec3E)%%t$7vG^$mjeL#pEke@*-!?L_M6?k-5wil1pD2nx z3S`!bW~`BPBmaUSefMWAaMzz29*!j;tYpfHHNv0676RPAQS=iV>K)ABzxDms3Evg* zCs0gt!C#Ij>Cq6UK6c8til@8^p zESCaT&VGvkmf%7n31h`1?eP*|U7f#4F`$DIL?cMGc2UUJLk({Y0XS$nf*&Cu8Depx z9rhvn$9Eh|_}v@3P{+SR?7s&PB1wiL%k9vHbv1sk5ajn+=|YVkzkipDzZPHOAq7Oh zXop{Y)%YI*4g1G;TAFH4wa?@@Wt+5@C4R?1d#?mUSugDIUPY4}w=8!oU nAqe)L<;ev63zPuxo}Q|l9N)k!-t?WigC9*Xk+LTIJva}GTs8n`Jg_M-Y zk~Nj3A}RlK#=T}5Gw=KRZ_np*?>*=HZ09`ZIp;oeIc#WV#K^#aL5TsQjUo3?6oV;| zBe!)(O&yE^Mjk_)9K&ghvQ(m6(8p5(#2Hdw8^Mj(<|Jn|PZIb)_`y`;x(Isc!`sE%@+h_ z7+RJ}cic(?3}!aycw)<^Zc-o7X&H3;Gc$uGFLz7Fp01aXQuS^kx~e1P8o zcefUIwJ@sYFF!n1FoL2@D$sGnZdVWhTVY0ce0&YmyMaq*O-g{KX%e z;|H?Q;*ZbqgSe^rTb0W2#90A+LXa@A77?&RJp3CU<}04qq-zBd3nuI~e1aSw$l9%V znQ|CAFS7(r7Bi5y=9*2oNCifaQVDj9CYo;p6?AcTdGT>-aBhaUyLJppQ{`XTKwv8L zRMy4lD*XaIfcKaX+W?Slu2~->1XQp{JWV1i8I(XYp++4LAmXDx9w3N^DW3wyf`=&|2JVamPjJT) zEWw?Lunydr36kK>Lf|$dwjd_Kszg?>{r~l$bz&_THTlA}1+fr%BUfm(~K&>{KTmxNCUo-~u`z?Xl)nkVcWXERSQWir!z9*83c6i5>5 z%@g#&K~xJF8_4_*x++ZP?d)}TR?aRuGh~t%9e4Ms z%DSWEB{wHlf)!2sl<%H27iM!}^I-Y!zxEb1f=?jX!Y7dY;S)%E4WB^DDm)S4t%s6k zh$N9DX*MfqhUgJz^@xbxtir6i*(}d&R&Q?OL6d-}``~>*`MQGdv;Wuk7ZRVjVNmx6 zXNURM9UHl4Y{Flw>)ZHd-j!$1*9t8jRU7C}m;B`V^i_k++hfn#ZiQ$_ckj`3lYT7qIOK--I^JD=wY#{z4b-~N6}ydIQ02nKM=`-e_$!4sVr?2dKszUmZ{cF38zJ~8>n zg^Oo`M|?|{AJBb#iTk#xk4_r?%PPyvV_h+6?8gXJQ|~;s?=L(PWSCL!U-LvWN-ae; z_U>4#_e%{PZW5_cg?^1?6Q-0Xi(^sT`;pQ>?k;?6sDrssEfvSV`K zA*P+#VX^8~#|R$A#0kzjdG#wl<>LtgJ06HHu+`W!d2461T;Yc=9fop`o*UhKyYRBh8SgIA@iOSWt+~BV(K4%oQ}C^C#xrhph}MrI2TD@?6HTAH#Mfn?@VMb^=(?J^cvD-fx4_7*N#-9r6!A(Z6?NQ#wF<_fB zvy8oH`kb>->y-fhHYvfO)vq@Gl-n-CQLFOlLsdbzyrO=iwo2K9XW4g5GqTnQZ{AYv zuWxCztni*_liU6D`)4gIhoa1zBKCTHJ}JH1_vgDysaKz*uBg!38_M=I_@~SAhy^NK zM~F7lPXd?RdO8}Wqs^$Wb$x_kdP{F&@N4&vu_9-RPK)w=aA%QyrontQ=jrHs2+VbrT%(k(5ofr$OX~xmq$gg_d=g~1m!>Z z$iH*_Qv4?E(_#;U{d~vw4l^?xS-FiT;-Yrv?#XYeUI#AMq#kW}eTn0W)P42h{r8IA z$C(?)pR=gy-<_2v|0r9`$jq{l&tc2fLwWEZ*mo|2_lu&TR7!5Y>U*bS7T14D+beHr 
z@G^Yj6IxQaIJLpv4x6>9_S83!LYAQEnSX>J#e8YG;I_>n^#m=xy=iqXC=aW(LR3MeT7##2esf#K}oXNto3)pM9?md*t2x-&fV!?HEicci{GC$;eA zh7B2O&bB7&;zFZKnX%6Gsp$chYDR_kVq z_pqMr7rLzW&DPrHxWbF6)MEGLSIho9zQ6ukjr~S%SBvJV$j~c0MZCJsnDK(`kHKZJ ztA?zZgvoMkZh5{| z%EC{^IhJwjl9g#X@x{i3U#ojPVqe)l9x?lM!|S8axfKT=#o66y9+i_1=)M!3+)}Yz zwR*!+b5VTaSEVK?Hsba$R zl-4qhrL-$XGaMHCC4i5Lj~F_;L}p>8bK%VbsgR1Nn>!ufKEcO+e$*)ONB`YU0ad*} z3-X2pwBH3jkiYz(W&3~_!5|>+ls^lvl1u17!h?nWt8-7c)?Mt;*`1h8Wc}W>D60>D zPw(Xhk!@4)?k|2sV#OWPB}oF-+mS0sCEz?MW*>xb(*y^3T}B zz2NnmTLNZ>x5;buPiWVCKK`~aSnjEGd;K2=L$A+*@l0oZ7H3C&E7dKC6z|D|`#j)6%LH~f4Ue?IB$!7pD-*PEus@uv3(STb$WAIB>--TJJz z=pyU7$o7nF{qilVg1v)m?&zgRN@3S6{$GfmY?-rn49b@2V!(BuqeEtPtwdgD9e$GU4&uCSG)_vT%S={wQ6I7ju=D`LlI zqq+l8q8hlVg8kQs{@ZmXy7_wTN{X*a$o}B89%}8A4oT7174lNwa9sMNnFULT@_Xa< zx8ZKJIXvRduU;-VdQHbvzEb>w?ZdlM(=YbE;w$)g=C;ry)fs2G;2n#G?F#(D1edS~ z4mpJ!yj&*kYz1W0>uur(K+;vo{}RO0Jqe zTK+B}Y2wPtCQCL${}p@RC_P|ax^l~b=w07JmgPG37!pSb&>$mryB)VYhCBYj-v+s?dj zdHiXosG$BnskrsM?g|zX_$tL>b|>5BpG%~4OA4$al6T1RzL|+J*?W7kO`R`pnZWdS zqK)mv&$;s5hX;?IcwFoJRX(Qn$%5!##pkMaRoM&`-G6$0QFFeXTi!asvW{t0*)-qa zL{8OLm4Vr3+!@>-#2@}PCBv7rz}}P5baGjR8;{#&@#5|AF1uw`h&Jr#;FJ(IGY>bH ze)vu0{`3}|(3cOi2TrXxc}v-HSU70na;N?Bu0WxZqkO~9enjPYMH@s${%Wa^Y#%Kb z%ybEV+?biaEnn&DNbR%lFb z_kZEWzb-0dx-L1A-rCn;noYR;rGeY;eBlv=4HbHr*QJfR{>M34@i+#q z+W2&CtIE*Yl;o7D;)DI3Ki--uZ$G1!#;0iHsPRr?Z2#dc$wZFTOm+D?tNIK?t#i7V z0KfT1#Jt1#WZ1d>=24%T6gRkogT?eC=iKv#I#9A|H7{$E^H#)gj7Q1Ov zrilCmyX#2XOW~C+A#N2rMvc?D?HCV9_b|F9nHb-@TISZaXtyv+++vA0Dvul&EKt|; zyp_s&Cq#nvMp>d>z)u!OUfmlKFMReniM(zXpB}bixn%W6U;mPk{tn4ume`eU;=MTy zP8xT!bhWu~Og-u`k`ak=E;|iRyBPk~mONKg%rtdyxvtda8s@qIZ=EGgz0Ui*Hc4H} zoRGS`%--2o^py_h`7q|DeuwPq4&QtBx*sfG+|d&CWU+5ltB%Dajs>bUP+pdrbMrk?kBDX==CAdk6Ll^p2L_PLrEl zd&EK~srJ;Css1yAn=|l@Z^LChgQ_Po>Z`v9I)$r<4PhJO{0IT{ba`KrLq}oZie@NN@lMnDrAOR6}3l4 zbM7m%QTx-qHo+>V-*`CZ&x}#GSWQEjZkSv2MaxI!K|_n9@|#n$pY2iaS(KjG!zLh+ z{aW}&RPi~M^4&!5E^nopD}`*EVwwwjE4f1LE0kx`Sd}Awv#+;bI(EUp{V;C}2J4=<~eFkr3=Fan-D!wf>vA9$&ANGq-($)N@xR=9};H 
zbprYLboS(zGT%L`CiQCdWoLG*fn?uK4*#=31&*gf6&zluyzrBGA%i? z^@y0lCu#Tbm+1y4c5^V8rQK%O#xG!SR3yoLC}pjf`kgM1)`g~`%&HX_k9beR`$mn1 znSVktVYTC;LWgZV%6(OgYg?9<@h`kbL+;Ma>R!MQn zXqY|7^?2c1k*x=uL^f8|yPloWGy1MKfR)A=>+7!-mNvH7CL^q&pr)WGtgNW4q^PE( zDlBd3<|>SLBFy6QN{TYztXK>W3_uK_mKcT*ZiZ!?tZYf(a~U_lgh(6={9Z{Sh$$j% zcH{{$@VS&kJjfGbHb`5X0`o@N8WdPK(l(>OP9yDVFb+u2UM{JH!BFa!A{amNgmR5E zu#Z$2l%GT}LHN|=;*Mmr;ezvoyv2xLY99QPh9bfX0AD&WCS2U{jCd~o7%ZMkIFbp= z%ea7x9~jf)V$Z%Jn-u%k=D$$5+7+ekP$Z%OAVkas2F z9DNf&zp%bU$Ai}85dZJWQ`*r3`jDvO5IhUd!>=MZ^h-IfX*xJ3a==l`3(>*l z=-}(<;AV92opkU3I(QfzJdO^2jt-th2fsxJe@q8|MhBmwz$x=n5H27f>UC77gS#L& zTo+0^Xypyr@U4U9+XB#r1aT2?p*Z@chqx#Oj=uLHE=GZ)ZzhP#P~eiF4GH4n;6m-o zLl6+eCBTK^st^POaV2n}xE2HfK^)pdaXknEf;g<+P}~H9fFQ06E)?GkK|l~!0T+sI zhaez`!!e-vE(ijGIQ%C96nBFlAc(I47m9mB5D>)G!G+>z{NcJ#&X*FWoEMS~q+dpq z^AZO7kf42X^*K9c6i3e@*vG>gASjQXIk3MI-T*F+hQnt5OhG3Y_An42B;G9JG(-Fcej# zz|oot;;`<71m)3p0qloR;N&V6gNdiW`Jf;|X%slQy2W6MC~$P|KowYzLV|wDRWAnf zo&rZ}W+*p8fs?CD@Y@_300bZ8Did@Rg7*7>LI*qvfwNCfv_)_WAz&0-od`}C@TY|! zi~$zQ(Pqz8e6-nfbsuf^T=hqjO{xJE0^PiwXtF_#h4}|AC{F%? 
z3k#KhV8f#2AK0*f`3E*EX6UjhIcUX0su@XT=9_^C&|HZBRX__2JnM&*%TC zKCB6$O?u;}Jo|wd>6;{cqO=h-Xz8QxQOb7&)S*0!Vc!#6wDcVz6Ahl` z9}a~9!J2}SsP(e}la_t~qNG8o4BFKC)pYbn5IwVh(f>$CUk45f5c>UQwm08D7Rb>W zKR1#P|2FhVM)Lv1Mdj8%}$(h{;*!5_J4?u z{{qB6WsLwWQs*zMV`#;HGY$W+{-O4Nk&gey|Iq(gI{w3H_=j~Cwf_fn{Es01(eHnQz`+rTxzdv%I2F$9@-KqUI((zvge*^#u2Yg7-Kdb|( z{ePz8e+2Q5s6!JZqV~@Y<_Be{&wupC0O}7>pBFz7I{MtmkBNUf|26683nTjfcK%z^ z(bqxr#nFKyPwM&m)6w@w^#5)Cdw@Qz{i7o_>kvKv;Q9%J3$=&h$LaVF{}25W>G-#y z;a`-7{~L7tCn5eRe?tQGdF!7;$G<-f|8Om+=U;~S2amrf4e?LyQ1~w?4CWo8uSkWb zAwRMdsBm;YpwAvepGpk#g7QZi1ryanp-DR9Z$$Lb^8=Mf=a2Tsfj+JLIf}$j8t@^( z{E-0{s*mOm91kJ_T4d@(e>5T+8`gOu;G))t`x7}T$;zPBiF~lBr7})EfA~(KrC$IC L3y6`5Nv;213H9sZ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_main.o b/third_party/libxsmm/obj/intel64/libxsmm_main.o new file mode 100644 index 0000000000000000000000000000000000000000..c238590b5ec3f17560fa5ae3ac1a579e5e1bf51c GIT binary patch literal 187464 zcmd443w%^XmOtK2I?#mR4H^{`rPXGJw*(Or1oB{zGrQ~TIJ^GlKRfHJ&fzN^4J z$*smd_vUmRqm<-_8r$Zz7?4YzE6)k)-^fKl{cUx}mM;98Gh5Zqqg+<>UYGW0&fe(q zYuql?oa)x9qs#l?QGZP}Ww%xwU7kj7TeLYyPXoxz9;*3Bnl>4)9%P(U&1D`f7jNl! 
z>r~CkbZr>kdgASP$dr3(1M$|&71Fna%wR9=s*wI#Fm}!tF*ghXb<(D>*Mv;X@0K@& zr?^}@6uc!JS`k4wvFY>2vSNy$rqf|h7H)NoUs(-8k&`XU|^;cPwL&gHXH=H=2 zT_sEF`lyK$+OXWPOACh+pK4xthsV!=W81Pq`mqNdlo!Yk%0JpF4T@8;E50kTKd8S- zuzx04;yu`R32eR#Fly{~UFEsQURX?WcU#e%mP}{ohRfl$lOwX*)P(WVj zvGh4J^xJgzuHe2S?Z2#F-`&2!4b&+57Ya_AKRX=u z1!m2fF^e*Ml?}d{TFqDDi_}y;5-Imlv6_}d8teiZLdRQ)+Gh5NOs?DG0W)WkS z`@O1uM%6XHPc`o(KouO#QuUC(KjWF2XqDo>Lu1jd!~rROCmut_a({mn%*+azW&Y8x zxPWSHu(^xAsY7)uq#q4RllN~TCUG?Sar8(?|4f}gP(gjWDox%Ua-U+kvfdURdPegk zTD6x#`j-l7AHgyVJS19WeOqLJ8Uw8^t)FV3R|7epMsr6~4vF@ah_ z9LQ-!GgYu4D}**X{X_7oaW6y0__Iq&e0#C0%X$DqHc1aP%PzxUNieB1M*&RqV7G>M zGAOWL)wkM|2J=3a5*G-$ic&rHPK%T zPtM*%Yb2euQ`L84%=E{g0zb|O&BIZuq6^V4p00xxYyWTKpPh3W+KdsT>K{FT?&wy% z>(cs}?vtMX2*scQA-ue3?5tZWQq97Pyt&j!^~@I5X#E4UyMs4*7n*zn^_^FRe4_YU zs{TRFX;FO}DGpWN`9O;NI5p^P?XMFj6J6RhDMU$K2bmm!cTwUP>GKUFmmN~uwUm(9 ztKG=(cf>~0J2d__?^XyO*Iz8F{S(*VO5fnp!41&&VSPKP{5MGDe>+G21})x4ZKdIt z39_a5S)c*woNDQnQycC@0veSG?y0fU_vW;!&FWgSAiOO74vrOlIxcL99%&~ovarVYoZN4puH=~_R0_LLI;KsD}bmt$@2@B0$@ zO5RTG77pjd*LVL1K;&cfX8~!GlHR43eiPQe(%*-2QccFPoL1g6)*J8H!x|jMc-`L1 zgzpVu{fVdVeR_25Yizm$qh7?;d^Q6haP+<0VlU(7@bOt7BZE~}tjI-y34<#$Fj+7?&-A-~ zntJqtYBD84${w1&zsFX>JUOUu%qv^-w*1?kHNB3%YE=^eq_hMjk7j zIItp9wAbnb3@nj-w13UUPusCURzDytUR+UIuho~-6gOzKb@-^StXT}4FA3enCT_H} z7pW?YJGn^K-{#s#Xluch9Xn8s=m)?{ihfxC0zE=f*mZENU^gn`6_wW3mlN&wzbn-z zsv>pD*>d^nvPheX`bZPIE>~aiBazB-jn{ak`uhklJHvOguF>PnBxk_QrD6 zRDn#wDBP*dbMgwK%cUM@gxo`$ghyII$Z6VLc*N?#ls(!AJf_oQx^@d5v2LJ6!*zJ< zMUTC-KES^8U&*e>Y2Au{)Z{KL9h{0>hsJ9?cd%8Fs(0A)XZP9PNoAE!tM%1KYJ9adER);Wb7r%(lDE0*6vo`j+~%&6 z_^?THf+bj3FB0}q0cVu{*o(=hzEyv}eSO8+=B}|`mn+haxBsAWiLMnNn?JplU^_ia-$wG2Y&`Ko zujXxC@lRsBCM`VMY}P+;R8JeXn;AWfMWbB}b@i^2>Lyod9sa7CTn!J?UzMw*vFQ%_ zxHD&r>*1Pe{8hQCD@$EV8kSP>7*|8hB6@HwZK!tDEU9)iG?b4mMl!xe@h|>KDR;P< za>vnM@wl~*3{P6gZX>X$5T36JYFHz^}=kvZ?@jGF( zDTXTmJ@rd&HFPLa8`GtM#IA_RJLOAzCJyn5FQgyt-Vb*(IA++dWzw^mYH9x`HZR$e zg_zFr-jm`#19c9Dl=SEo(wAN#{jn>g&%HwW#4Du#;0oziT_HXD3h8~Wkba7(J~p77 
zej+}cyF&V*E2JO2Li$@*NdNT}($`)gz3B?+3$Ku_ULk$-71BFNC@&)qJFzagT>9^> zkp8nPq(`rizU1=hiC;qty$o7io9V+kir2L-cR_SF(}%U6vdJ>pOdr-xdi|DO>BCx2 zuNUZ*KCIu+E2-4Y^kMxoy^?0#Odr-7dTpmy`mi3O*DvUmKCBvg{hVIu!z!lN6ZA?S zRv}((BYSRO)}$G;1GX_k%KQR zhe9(7E_p5P4pSu0ni5DBnSI~%WPvGxaJaZ|R$y`fbbzN&SCeKd!C4b-uZt|Isx0-Z zGs1z}%NAGNt}UsoFDtGqsn?d>URAlMsiC?$1F&f5>Wh>$D~HV|q>T?q_?(sEyE zQ?9S1uCmnEP+sS&*6NY7m~u*drPZ~K0Mn@09pikZRTWi{h9$o0>Ly=B6EI-LqrWk} zC7Q2cSwktYq^61T%bPR?Qe0by(q(n!Ig1zh>dR1dov*4I#z{?sR#KxC6D{@Sb;XNo zYs=shtV4Oqg4sfyrV~cb6*BAgbE&?xV>q=|jg)DFIH8qqpmisXC`zmtA zfub^4|7E_ak|hWp|!*+YDlS8Urb8?=cQ(mZ&6)p zZdDaJZfQxS)&$Nh@-10fTn8>{)CY^oe3grk)>w^xBb%aOal@h#(6FewGKU&A#@A4d z?#1gEG_Rq&9K0-RLSjjs-LKSD4NFRj!B55ydS?%%zNW?H+M>oB{@Q>7kCxZh`YP*L z2QZlOdiZ~RCG`-N8kDPARNqLX(p`Belcqo| zOqPr2v%4l^I_&>kO=V0DDNIpL4#8LI-8M6!Jf&fjKvx_hf5QAsmN8m;3tbnY5o$P70Is+_3>MTIxdk$+$ zi0aRn8=M`S6buE6?nA~r*b@cQ=z#@pikZ+%7&))JzP`3TW8N&e5EvnX6Ba|@$fblv zrp9?_-3cv&l_#mZN@zt)f2>yD5Qf8d+u+KfSc+b;928k?De1S$8YqC8k}4l0tfIWW zfv;bAYzW?oT$QhfgrC_+)3@la#$I!GNH1IyIh8me#eWX+nCqF=A+o+Zr;9mw!*Joc zwTbxbq1}PcG;IhzJ=#t9OxLc)XHTuyskO`|Jhe8q!+ph%yiV{fOhWylOjq39)Ut+*8@jODQ~-$si%3g(yt1 zsib?fH;XdC_2AmU{8R)$j?g~L5h^9#g}*J;^~qx_8ZR~c6=J^3IygrGi2v|gtx`AEd+C^jfQ)8%_`Q)wVs{_U)dPTAfgfF>VK46WZ zS^<&MMZQ!Y2i&opTy6yHC%5pRoV`xn`u6rL;+>B$r~(jS#36y46R8|JhNkE2%GsN9 zS~$$Qy-3Tddf(Rjd05yS}R&Wd9p)BSy*yM7x z@;q|(K7(Zu#s9M4VtW$LHazz(P)rH%SxNXh0q+HT8J?YroQG6d;67W$7!0~xY0T-M zr2T~#rT3x8bphi+mu&ta3+tlT#moUGVn;5!&C{^{@)92f;pa@AFVa&gly2b|!9! 
zhDfU(M|}+Jq?E-f%sGv8fb5iFgxZ zc^Plfdlf_sjKd+$?v?+zns(Ps4s_kLaJy)&%8 z7i>G`3HRQr^llAdw?^BS-eB+Zz0ZdA4})zdvckRFmEOmKZS6j#_j~T0v9npyj4gt| zWu1;+19UTO3WQ);HIVo2W&VU4u@0WDLxFOKa75Ti;p zGB%90E3m#{P7k}Z&}!BLR1_MkKjRvxVi;xpRqG;%6|A(`g@_F+z&f8-eKSHOIeRhe zeY_o36^dQLW%O+eIXr}D@gYeq=WG$od59pk*fWao3K_W~EM#2;@>!L(TuQ4WUz)qP>{~5ajMIEl|`7P7=@8BRBZLkz8kfa z>ewE$;LvudfDNQY!=4A+a|600+^uIcb=3M{0ZNO0C5lyY#sx|=hCYG zVs&w)FJ`v(8=d%UNX&J~J{5DDjJXXFuNkxlX*L>-31}4S{YS?~?&gykLQbRQ*kdkt z<4_TgV*1TdcUL7hZaMXlcv;Nh%bC3)59D>%lD!$z+6^^@CQhZbkg1pj2d!)I5Ype1 zW1qM|+W~dYv0(45f<^NWSZ`xohxNm9>{H6##iow8MX$}@Wy#1!we(Mnw^**bF>*0V`Px|zuuA!Msw4sY9onfUw3C7X%0>|XBq&?& ziG78UnubVS9U}KwbdrzNr!A{XDoMb<vS?oC0>2Azt`LQmK zHckMfYa@7WS5NKMSXVEt-=!h)Y?dI}W5?l$#bO)6h997&%ZVH-M@&yP7iQ{d1hg>_ zZbn^82dE}9w{+|R@~l_gJ1K@A1vq zDIUb$5Z3Yys(ZKCN2Z!=zDHZ)jDr8y@A|peAt`pQr#3luu9tR8>|Bq?qP8zRcDRaD z2eDEmWjly;Auq%e1*NgB^q{1)+H!(?q-fWnNQn2ffCEq)R{p%T_BU(t_kNK4NYRNd zErWz89gEaRW~zMJn&6Ugll0S6z{X%+8f<=nbGo~i<~!q!Z7|>_O8qjjHsENjkJMrK ziEX@gJ&s+{y~NUz2Ds9fz{AUn3?7{`Ix-6W-g|@(%?|5=HKtp3f*|G`Lm`F>( zhBRU9u=4CfSS%fK%;hFeL~{NUvs6b&dV%Sm$TzfH?jJH2_(zkKog31tuuV5vAZ-4g zTm>Qhon%@_KWSUkVf~AcPRn8aoseGU&uV|f9>1|e%cc1J;AxDOL$rQ%Xx7GI|1dUJ zONj@Nukemn(%6V_z5o$$pOu@>x?DZrW|#{|_;Hr0YV6g8!Ps_hFb|%;&FHeQ@dI-3 z9F^9EUHZ=0!Ik6PQv5hF;;m8~iw4*DG$~PoULda$wxL?}cqV#ig-;9ioV~#fNTPq6 zX<^0|`Qck|>Ok?cCBD6SB8qfFU@z~*fHMV6C7e8!)%-=+-Z(G;5g~)9fa#ps){9L%CXom^9 z0KPRg`?J#4^Qcc)(5m$O8N~?n&Vdp3T3C91Pf&WkgT6y~?=}up_*#QGa6?$jOV7Vf ze`?;FOY_l3H-}31I|Q^^9&ekBy}Z;4C`6H=v{#1i7-@^^`Ji8R@&(b9v&yZ^7;YUVQ zWCX&owN(T&G{aR`U)hLdDyL*l9j#~aT;W6ALuz;&oI&Ch8?LHZeJ<8^*7V5WQs3Y^ zFZsjfeTa_xD#}YVtfCuyODHb6s2nkNEUl|+F&UAZ@#J4YW}>*_Hfy&Zm~=h)0S%7Ba{L&l8p>(W=|MElPdSvp+Aiq z(TTlHeZ}fiN^FFlw#y5zSI`*m=lGhpQOzmyMaue7s8z#D%g|q`MxPC&*izL1!_jVt z^+n1MB+tq9-*xsRqKp=y$2kQwq(NRztGxbVAPQ55DOq9NiqY52+0W;?*uB^Z+7IY# zDn}Ai72bZ;7!I@p?JQQG80#vgC$I2?qBNEbia*;1WQtWxGfbag(i{tb{)&E#V}{dD zXcmM9a+Tg43MWU*K{$Ri9^O5iILhyRjE)^BO7DXTQq|r&6(AbjyQBA6puk)2vv3pk z0hRd6#$SIa{%iP=xi+J1u<=ZqD0zwG5&AN%OW0(pHk 
zb-#Y>%%Qg9SIhc4Fs3-o7O0^MF(-cg{030+PrtlOD_5AtleFsz@&skNCjXR86-vN@3t zKNR(m!X;DiLi7^YAe&(~gSFc~2(rtYFTccEwTZ~U4*D|(L9hM}*gSRuC-_CMD&1+6 zynz5&tI(rk0hcS_iflN6ku_i7EAr>c&EIl-Uo!@5b1(|d9~7K=n;3v*l6*KVFOp|s zmYy%`)?8Z@W|63s$+P6Sj2WzTy!B{rPOHs=*tYysk+Asyy2lH;{h)`n@3M4{2c#k( z$!VSBM6!Fq2nArO0Gn-t$xL8gA%@AUdjUwN_3{Pa@!odL8(AN`%ppIe9)Jc&VT3#z z9bPzZ4w$gS=DeMgDo;H z!aR@kj?Fh(Q5TjbXIs`?ixXN1U`C6paTyfYOrBy;U4KtSM# zkUW!30b+_^Xa#3zGJw=$W@9zmGHpxoXd!ghI)iS5qon0ChxwQr?P#fWUQ+u>Eyctg zM_p|#<;)~Uz)3Na$PA+_CJ5+K!kRjlG&5K>m&k0|5z4C@qQ&Z=1eLu&R zak+?%@;Wge@Rl(~eXq(3a@CLCwpIFihX-8w3l+swe|M!m!-ixtn%r&B=K`axM2~`9 zY|!&Dzp)xJV5>Y*p|;9fX~S`ROyk)~B&qPI@v~__!_W}(pgdD9Lc#)j@XK=*QigPd zn*cS&3{rZm-%06Pi6F zge(_>=t}G$`7IQ1yP;DUTv8frbcI6wiO$a1JC7Q?FgX%n@y?Zt6r9+uLQIWGMiaP% zd2x-OJeUqgnBr%_{UGE|p{)c;pE8;T%4ixm0@y1|0}O@yvX3B=^T1YHCb$gCLVCz> z0vl4Ng8Lju##kmjjzvIP%`5oOVT*-8DIf((@1?Au&c_r6F#!{1;RSvp!5EyDgFsaI z74U?K&;o}KG&%U>_27Z4`zm3B%eC9JiCW2F$<0y&(jhJ^0Sluss_`TRI?^3|Df~^biD}ZRr%u`j_4u@)+F?869`35=V$N!_mmv30)L@wK zpfu;fn;6aQmtCIp+>&064P7G>7c=n&!n#TI5`ZNbv=sEdH}4Y z6;=1Dti4}Ru7Vj9GwKlV%4H>}4(Gi8dZmz^D6Mx@V7jZbu0p(?{EqA4KftuB`Rgzr zi%fREg7yBX)bg9D17IdgKg7Ca)i+;vb;-?Jz6NtQ?F%o;+Watb6`h&MJ!O5b^)>() zJ?nY7^Ss=XRL+c1Ir(KOxSk5iXFh2=j$->P4f&ie@=LtnMPBeuR^fIkxS7hyFYy{X zNIjwGyVfvM@)<(uP z$c?D_Ti){P7`wM!(odLR?XG&;I>q^hYIAwDcEgFG-&bj_^|c5G+t|% z()IV&GjcaLJR_Y;?Hy~4V0A`O2JUp{*Z zGB?KC{d11<>IP&f`c5tx$?mb`YP@w0+;Zrr)v)L#QUC9e4duo)mdv_Hj3d%pXaPi; zVPNu62DxUM!0F|wth{J{QWr24gwkWJp~?D%XbQv7NRC4xnlMSZ%6!*a!gPSk z)|Wzc{g4%Wl@;Xe779}g3DSINfN|!-@3S#Tx4`xmE*ITHs=kA-$C-7(tuyYMAmOXP zWJLOOpxuvxs9i`G%D8(!CiUq+>sdCF(X}1@xfnHzUMxX|zEh0vq!fuE4uiwGmlcg; zX!3EMExJ*N@o9ut9DSPFi$b4fTWPi?efawQa$i!sg>*VIJ5>Fv4JgIE;kdLb&W+1d z7dT(Ca4v%_R32s=_=RiW)hqj5|_n#cUd;YuQR7 zK*t&Sd`ZzYYr`;`z*LJ7ZW8PG7dXF4TE1IkpOWqOp z{|v9RT?UHe*LXn6Gwhx0InCdMA-){%xPYk(bC+x^1qIhqLa5a9`=EQb+;uQ$+|~T$ zm)KFH7}Gt9F-|dRus`}W^44d|((~`?d+_;&dl%l9H+LNb;L_WpD{gWx9U9P21*GRc zlw;?!8v7~wE`p1l%W6!Ib!IK?15$`0x&=Xn!l7m`eJge<1&l26km-v-SJ1duTDNi_ 
z%LX}czz>nut+az24IyZB8e>5Q^y;V`m=K)GB%VoAEx-744;p*YGxli>04Q(TyFk^c5I~2rZ8WgvWR^o5HlXhVG;rwB3I@!BaF^-$CgkVn=%@#{HR8G+!}hWd$g# zybeAxYzYEi%8dG4>}zQq&=ouAM%OGcrjCsr?Zoj%>bNS!m^ykuE8tY=kFzG2OSRa+ zPHEk8{}g0~OjRSGLZDu#fZ0WWn~wzBare2GId~BPgkK8YO~t=ZF2vkkH4j#1(_B~t z)R{92jky0wJRthadjSQWWUN4$xW5HiGTg}<{Riqt3+B8vh79g#g_FXeLs) zD{eQQ*COp%d}F7>{WQALBDVx%xK{SYps_TI?y*YzEu6?KijEtIcEIrrvYNNi5Kh@{ z5jJ{-js8I+-z!J|<|c^9)c|xbN8U7^V~r!L?s*`{*{gpculM3t68#=E#H}0fD4UPsr|s zmP~9jZ*}Lz8IqNUaqFXq%-A(Rg5d=Z8P78Gbc?Dtq~B&`!QsjH0`~&)8FuSD%$w#f z>9`i@i7q-RDy=$->~3RP?13?|Xhc7SB4J|?sG8P4dak#$`Ytp-);1dBq#4aY+h1fA zw{WR}6hC}ZK>B0B4FO|zrfmF;H(*TjVB47-y^txz{|xp5Rlz_=h!BQCiqdpI$IhAu z_q+ZTYaRqkob^CVVziZYIl9kancI!-kO({)J#kNxq#u9{6*jI1MfYbydw}%+MYYGa zrqIs&Pif(O6ELRs<-T%Uu4~gr@+%vUdgsYTJ&r4VJU3uuFdDfuB4TDe!fOf}89|Kg zpfS}OhHjFh%OFv!{}Y)3f3>#y>*%ddjGO{tZf57+=wsp2r6|yls z6YddXMYe1pKnsnO9WZb_^G6<>n1gRN*5Q_8pY`Cbe;;&^j4g|S0jHtF&@r*kIt%64 zb`8q)A@_j2^2~X$!<};#bE+n~a3+*W5si5aWei}e2W9~_vM{O`59rPv*>KB7p%-_0 zVrG#&^x@TaxVxYw(bUm47etfUD#o}9Ycc&A6ov$bjcd@pWSeDud)OF;q{r+g&q0&J zXxRObxj7y*othl>Vy;D#vHX^eNG94#XEotnVQoIrgEjU78he5o8#~e|$6lqD4q!V; z=gmdDNb@g1n_(f$lIK{oTH$_ikQy}Z2pY4!L1S80*eK{v^Q*M_ zS2)mzNjcM)$hH*xJI0=3G-8e(u>nKmd!*nF#u72n%7&MC(gj~&(70YTdPwV<{>Q~Y zo&(-M1U~BoC%7QUGY9Pb3u5ryfxOU~0=gj?LBkboBBuNV?~2i%1j7~pMSA`N{LS~! zhc}?VA8_xY366wB2*(6cJjPwLll3&*wn0M*8U6l$EF34HzEn)PUOsoQ=^|$@JDkm0 zWR+4YFndYyA~3|t!zSKohfrU@=rh?E(ZW!Y_R){4#zfVa?hP5!`-hClS?H@;9JSzvAzrZgu8OwP zjAKPa5Kb}rF(YvjoXHU05LZ~X^!8!{dPwSk4%2|9^Vpb3BBaY?{dN5{*$B`W&W0`K zi5>0{FsflH$T;mx1{oWy`D3NcuZ#2GkL(wgG#gIZy}Ha2DzVdnZ2k@-Q-P zsW{jPqX%(nGPH^idB$ED>^d{!Y`K6+Y41(dTt)@$Lu#6d&L-m@3I$siJ|+Wc4Kr`# znE@DhwvoiG5vb(3v&bfDr4oDvz-e#Z_6(A&LvSwj-IL(tmh)BmZGs$KTMLKHzEB|55rzY>Prl_r#O5)o? 
zOAWdX6o~p0I>=t?4(NE+O8hbaE+Yugn+pa_7FnS7IU{!e zJZ;#0oqj;zkT{1&XN2AmA#gf5)Mj{_F%aFZrGHd6!?dG+o7LOKzZS|n8hJKkdV;~W zgXtmn_R~F=N1jymh^w8hzE1f^M>nw;J%uj9WJBS9#FVin#c^9MJRqTz9ErGLe6N-# zts4{jY$bNfzX{!&l^?yB78yxQV2d&%-vgzTT_M#XPwLkPZSv)`K7c0W6N9p-b1#Ky z|A5br#G1EI5(j&^$=mN5D*nJ$LF*;^x~I8gW{x%=Z_IUZGq`{o+VvUMyDboV7D;AGglV&nOSTkxK`WC-y@G4V+^; zp$*+|$0Zap^7wM&YJfZOmdo#6=YSR_NMbK7+_dRQ_By=1j`n3kFi&fl(z=PUqbu{H zI1*Oncpqqwyp;GykYckq%Ra}N%ICwE@_El+3845~q2kRmIRetYDbtfgI9T;^K8FejDD{f*8&$6T9()`e#tY ze322{F~v(&1UI5S`nMS=S+u`LEoEfmm1zLg7!cIw`RKa-1nyVH`V*@fTuDr&!$xL* ztbBWiStJG1g&N!HZEq5BJ8@63YCeq9TMGtx36i0C8AXL~g^dNnSO$JvMM@k-hOG*X z<)gAt0>>x2BGoC-qC!BE6GJs0#GUfz=~5S5MQ!g08h-;yMp06VFGbTN{oer^j7{n9 zhKC?F1!supRJq;Xd?fH6ZNWjlwUYkYrQpi}?#Q>KPbJ2tqN88NNXh@;((-e<=hI=k zZsmKW=7(-MnzJ`VKZSBywSQU$H}l)`5*sQM!5f1@s*&kehof&*^Tob&c1^{cj?ih) zy)?_}2LOz3{CI_pFLcW%4oVvztPls~RVWcMJU|KO!gO;@8b1zqh){;zGqS7`7{4L? zz+a2+BD06HIX=}~a}6hv;R_PJ4+-BbpV;_7%XjE=i2Dh_ejjml06=^H59nL*m*{&C z0E__Zr?3W>4er)!bRo*J2WX(6KdD0@SRVo`e9IxbK_UEnhv>tQ{>G(!82Mw!Am*U| zrkhV+%bEQNXLfN``#-AshYq4!j@XDCEZ8z4m=k$iHOAV4u<=gL<)d5>R%Tfx=vGiD z{w2>t!<4t!yAu{YTKEe&nl zPmKgYo0O4Zi?K%IG4ZICIH=PC!W!GheVX_Sh7&idd1Kn}KC4CgQzSL%fv31`ky2Gl6 z1PgmiG)jukMB4W>EWzTY&VI!(CS&}zK&v~b_at4(?x%3nCB=`SEZUFQ;)V0zIeW2A zv!|;e6c2!Y!uee$t@iI~Y-SR-06)P&^7aDWAOgeH;cSvrU;2;V@P8zCd?IQ)kDmf! 
z-G)=1xCi9OO7mfNGsYl(wcy9UK>76-@u-%bpySc~hrS;&$DbggRQDTd+mW=O{zZ7` z#bDbPY4C60H1J7?)Kp{Oy~OU2ekg_0!^t>?0k~JY#TXMZGaG|#htfm*J~UWZdEm30 zdqIp5PEin74hG>$lj8pjw0P>F&A2>r$SE3x(*z`^-Wg^l?0E+}T$uPdo5L#ux$j7E z9?oYCCiIs_H-fH>0lEX)t+J5@V)Ikbf!N*mvGXF#K5D3GHAI zKSmVR!>;ysDLv;v`m#jmudcjYxX|=Vu=eT|`euNG8NnL{HyCs(3W{B#NrpoF_HPtJPRUa3KtGP3OZ}c^ zA&=-ejOTxWL{ko~Opy9L>;DPri=OKt#aD@RYEi$q|CcNsYNbY`SNTUv{hssF@1R7_ zdHA&mgh%t|OZ{H-x3P5mZq-Bqk7H|6zm5JqNJsO)r6bU2v8`@6JEBO7agLBSoR3#m z%L4y2_`~3QQ&N3>SpThGh-~uEMez-l+JZ0ordOo@j=mU|5Ns_-Eel1WJCRC+-YIMS~uc- zt-d>Uup@W36yHWDrA-}*nU9mX>&B1GZ`nAu8N?~MTcsy{h1{^TX{Tb&?=eMMmz_De z<;hH*dq8@kh3B@Lg=xqfKVovr#u1d6yH9$eisL1ICGXgep1VmtZ07jDd~w{aKsnEq(0P|@hxivUhwE~UXpl}3A~r^i1sR>R`94V zGQ0CglBV_k<-w*Sri8(YZJ9P!pK=C>?|gyW(R zob84WV34%>NjG%OOE;ra*l&BvL*EgS)8|mnh zV~Sbf*&ZJuN_>l)PM}3cerrxow+lqiA^kAYQ3BmGx##wHwkUBHIXiiY_Y`w_uVn31 z=0B0cyY@$3RBso8Mu`7C&uUl96}^#VQ_$tpmZSC^W=+QSxF-rASAO(@Pg{(pA2C<- z0jOW30(2$_ddr-fX|n^|&?W$Ki}|Czfb=lP=y{(u1c|T5wqJ!r&N#RRecIJ1etLU+ zECUncJ!LZ{+I&GthNQ6h8;Q#82VW0ulVp~gcQ8$712bA_+g)Hqq-wee?kmIgF z-vK7Jg+UA>)nvb@-4$+%yXp@~fKwbSnWJBA`#5W^BEhSg~ax z-RT67F$D4zfo#X+LcFC-DIikq4aAH0i5CYPe)j1GF^nLr3TgkV6F7qBS%>gCq}sbF z`y3&_Zg|o0<=r595JVdgbl}n`*6W?DI~a!iyTN0OOB~OpT{;C;I#{-rXup49$h?0c z4$_H&s2qY<2F7!BA z6aVHzjb>#UQF(~kk>uuJ0rH0^1Mv{ZR|N8=Bf!H1h;fjCq!Y+4w6#qz>v56)_xZ3z z%*vjWzlTtCI^0?f5VNvZ(ER&n0FzJ4qBwlp!~;fm8`0fr_g~^5m`HiENF`|5e#(8# zrYErlx%n;NsdY2%NU;5L$G`=b^*ZgnX7j~{g?M*1)54q(zcXs}Wt;9{f3rOW0plw2 z7017!7#RCO-Bfc@N_>cP;W11#Ab}5;_D)00#{aXYt$H9K0PG7VTlM&?}q_!vZz(kS%`&LwK&SlK&AdsXYC?F7Y z`aiKwpUXP^J?f56oFP;wKz_qIeJ< zh(6PBF*ym`TaH{`4HnTsheS1obnq2WNdNLfw^h*`D-qLz~RN$ZH{r@jK*R(UNTgrvRXqjgFo7Xty9~ zjlY#c1@mL)dq%E8Qry3rC&`5Y{V1mBjzHc~DPDr%r|9&X@WW`I@@N_ka9Ad1c5=3i zru9>njO&UfOY5G_jsg+(!Hve3i_iE3HEj>Z&Yp}l^$a(_V8is7Dxj-Px z?9Y%bOd{*`J;?Hs$h=oZc8x&R9TB6*oYEVq?*;PyEIsi#T7{QmQ%!9o6c85r@%bHvi7mSzl-j<&DIbPAwwt!jN$KmUAvnZYT@eMH-{8;fQ zUOogeL`ChsS$-2i>_Ptk&x-p5j=opMF^=O{c~v(YDEAg{fSoI@#q&|{hwICb`$$Xn 
z5IR4Tlqa=xgr8(;J{SOX9@3vYM$J)L*l4rdh!d@4A}2x<(z>%0g;QauTEE8}L4GjB zc-G&HLUQa|Xl%NX-&o^MAORbAq*ax8hwS+O1qmVP`9s|4qhNoojuiYRfPTL9xCb{2 z9`SyFTLO0g=HmdOB=`#F{kicTzH6VyV1o#vVRJkEh4cQ@I7k=YYO^0#+YWnF_nwZJ+imTG$#2^unCE_!RUv5u{WQ^S6%!q3 z!G6beomM8)FSAn=gS(OVmh^kjdL*!R(iZjM#Eo@vyPXHbO*YsP&DyyI;UcsX6n^C0 zLKoNLzC?emLV(9Ns%=NpLuTlnEU1)wLV0gA^i|D3RxqzsdORId9U2Vn6a|9< zQa@T>iA5gX3du66ecJyF&z$e0SDcpMuM0=h>vK%80eDG1y}rnl8eZ!Ph5wIy~Tk-T(&H?%i=k09F<1)m4qf&(@ z3MVb-FPOKx@n$gt$AS1A_LsL?`sr50#d~0E;WA++T~kd@48a>n*Uk=H{3_-) zZcK11$GRSyql7&|R^$E5?u5j_qHGt7*~D*QvsmOmb|u~bU@9Ra@rU?%Sb z_xrURZY&!c&PpRer>8CQU%);fxWDb-aImDYJhvN^091P&<^NqFWEf)%D+DYM?1r>Z zej$F!8t39*Nm>(e;*HK@GX0dZE39{d<@?!R-ahH4ZECEohjkLVhfJJ0#zGJV7M>Q1 zxx@r0tQJ@UgstN^9~T(k`82*gRQFlCtxp58{cSdG%r*XltSNDP(yTcLM#-y)|FnAL zmcp|bdJmopw9G$;?@qa8K2C5%&dV)z=dpwNTe+p_TL6SF1X|P!xcc;>+>(D07`jlh zOU<()FIXKQnCVH|ciR#o2T7(skoSi41pQ*JqQ4o4V`-CtA!%lgYiS`epnULa?jsOk zJd$pWfcOY|+ggL&aD)%-YNB?b%b~ZDUH%MCv07)C?G`0OfLlUvkRo_HY~If&s6BvM zi|~fBz7>FUE9}Q~#%cR%D(9B<=jl7=got?K)MA0l`XyVH3Arm}od799Cv4*jj>K2s ztd-6Vc}A%`$tMmDLO9V`utzW2i_OGQf;@oEcjWD#nH)1+EbTvK_v&GS`-1}~)y^0C zI%V)w;ApPpruLA${fCfmv>Ci{7E6hj2~#D*#N_s6A>ZI3mg8bUpLmI&3hep4yh{%)i)3~TmMP5^#sRtinAO}+(~e6B;mF* z>2W*aXZ?`VyNC)ykkx_JOuP2lx|F4z-JXsgq$ zEkjnQLR>2*qX^nj*q18nOO=WE*1i?Pk@W#$Gr*R@yWFiS$e-Ve#B^YK&eIH% z*7ES>Lpeig3YLDG-CkU4pC6xJ^b&oBXIa6>=P z_e?jR981U%sW}bn3d`LwaOP_x@lc-yZ$35_;K>&~m!=J%r_qR+Y7#x=lT$yPo(lEd zAR^?y7vdzl+bH7q2=Bc218}4wPQezf@J~||hh0Mt76!}`3h<1LqIP6axaJ;|LZ|&% zHfN^Ug;-GNR%8%&Wpf!KJiG{&|0v=`4t1q$2Gf&K5{lP+0kp{3Bb#zhI7XB=kY`Vh3JS+yI;_h znL?8mKqApYfu1ClSUjx7YsefsOOb|p{6YO~e8Wc+B)?S3Ix}dl0Rl*1?kCc! z|HvX0M#T|8(hn*_562E8BGibZA7aWAKTB5cKl*yBWCLka9S&7Y1T5U)nyC

    sKspKJW5+A1 zf=nBPkiZps97Lx$_BXKD1g}S`brZxEg6N=Mq+r2?-7NTEL-Zqvy_mR8+W9Tr@^dKv zBw=4icba4I3ol4XYjm&`?y3RmGhLl07=|3iG9Xn zLAOKx-X73-S@4J^`8g-?pP4`NT=M$@vsV{APik zQ^ZW%5iJ5|B@{41EQlP7m`zxu=K!_zb;mpTVs{W0%Q+8bb=i@ehVB3v#I4r>V23w` zbRTW}#F3_jiUYsrJ{c=mmhV=u|tAjC;5R z=SFgns1FqbghdJPn?lI|)&dT&Hp&4!)a-o#*lS}b8! z3KF1}p488;6RnON7uNuTtX`_Qmcj%9+J*KS9;n>a1iz2kZi{w;wgOWY%rzKB*lt6) z&QfzTa+MY^*Xj)v%pOabozlKLJITgaKiqPBa$4O2S}wPDKw6cKoSYNZeawLn1zQ-7S?JbxQ$r3DskU($f13g*u*D+_%}FfML&2aF-hEE_ zu<3u^iptDMtPgQyCK-Be7XYtN7n+k;4|Wp&lkvX}0b)`onv+-`zD6L)pwu@4{8JtlP_8F9~3qz5eMO$~pITK2xJ z0TYMNN!Ba)NL47z5a=HE$BKL673UKTs!Ce-rL4biZ(?FQniTzbK<~I@{}MJ4v$g** z>NogYRla{I@(|4mh&m!g&IXwE29jgS@{aeW=Q#Bmcu++^DD3#}M{#08GBh!!KKJxi?fK#vs9 zMe*)&t+!ZQ3t2ZIE7`>D5VOn1wV3(g#k2=KISMO3zNAm;$2X9Lw%}e6_~L>ufhPKg zpcXhU<8U84X7ZiUQsBX zCHhB9DpSK+H=vV}VXZ0-=Xm=s$MSm-UdL$o7=^Cy2VIJ;Anj-a+3z66>B2glDuOTtGQ2#J?q$3}j+TuMy zcPgbYWiKe`39NNCj*kM#4AXztq?V07Jlf$EM5fI*F@j)F?=>SVu9Y1{_H=XLn8_`f z{qu7_l1z$60oiuNET^UAMh`(uGc!jkE#tj%?w=*o;3&RT%m+OPjJ4?GFPU^+2vuM+-yDi%IrL7P)H2>D6O}`F6`+#B zSZf%SVRPVh4wbz)3Zl}RN@Y)?lC0z>j(}`5l&I`&Q;F~#>c~cT4n4mO07PXfiB}Pc zW*~#M={<{lPZ_2+V}RiKbuHNvdV9q{taLsE&Zh@bsi6BKgu(8z z^;s%?v~PPp?Y)^8E?0Z^p^$w0s#eYKas56(1NBt3n zQsYtoA0bJJM~Pp)p}T8Xyx@bdzK;i_K4Sr?R=A%($T>})6ps?Wo62^W1oaaLBe1`4 z-sg>jM92rg`dffM$NuuZr5~?SW7{blh2cIJ41nNguc8o6(ET=kD$e6~TUo4a#2wOe zE6Bw~;d@N)_JRZfDg2Dqx;Cp1^d`+SICMVrQZFTtoW;`F=p<#;oaP=8eI;R~!0<5sC`uwMmbU0X|X`Py3^& z4FRM9^u{mO!dnUP0l}tccxI)KUU3l*e|=;$y>4X62*Atop_-ny!|V4U5_KowFmGW$ z^HhD{M9iKDL`?>X!r>OsPeY8qrb{`b4Tlkk8iqVP9!DVRdh4cSzbJy?^rME-s$asj zi@;O@xG}>>tH{+$kya7sJj)wSu~8;gdxH7_I7-0AZ@sw5>!~e}oCAnb&{sI`W5hWp z5Se^q6GB!LpQ67oI>U)`e%dY0xdU;|ZNbRrC81E;wo+zA91|DeG!;(?2Q(zrma%u^^X#bWMMv52c|O#g>`vGtY#CZ;h=5b zO}6lFX&QhG6K2S;v+hoOO(4JJ{#U>ygrA=8DNSxkN_e;ipGf6PC3TK>8d| z)&emesNVk)es#p}sa|J+lQ=?*#<-s?mldLjCm$)5aN1?ld`q${P&)8qyQS{CZoXqKA?S*o}n8p%B(r5(&x||hA z;-BF@w*HB15~n`0#$^K2h$}G2ouz5w6hYJWbsGQBfg!y;q`#(~L1_3z|3R=Edp&Vv 
z2@fiSR9lYE9@-LorfIYA>CwjHGhO=uK6`3e7{Xy#KM%Tt=1jNTLOUXcV~q}9!&bM_ z@=zDval+qzpGEICCg1O;cU&M~=TC+Br`fM9lPDLj*mVU61@9`EgEbT^Sa%PdD@M9E zzva=hxS#4=%hYoKf4CF1cH&|7d8K96c|1JutY|HE(dXUz=T0mPoU{U4gw~+R5kx^y+iivhU z>0M|VzEn+FsXjVgw&2@)D8i3)TwnS#Wi}OB@Vt8F-&=rox!q-u#L0F1@Jv1 zhAFM~A?D7?H1QZgW{P^z-Y6TVNfA%(C~YLj!}xIY5AkQMJzhxxfJGlTYt)Ab{`aZy zh__W>gJD2F2S31nBJu{ofFbP!$PUY~&sl)%2n&$4Bl`ek;sJ%0e~^-PCKYR;2&iU+ zv_K{`yxY>4MV#)TYP;>H$$^2+j1}QPrBe?Rt*y`%gzFK8OZ2ki8%t;umih-wG!DO6 zd`##t#}$|;pnll_G2?JThPZTD2eEyQ>fjMA>Qm5b+Yb04EQ&-KRCX=65~Ie6$RBYO zeU%LTttZfh08Pg0CXt+wHj?)5v}11d1eOOdAVPSI>w0PbH}+(ZL-_+JA5^g^{5wvd z+2D)yb9ib91Tktbx{4E?oG^Y#%-M^U^QqDDPzbHtzvTI698-G_S>j-E>>fs(UHsw`B41)@rn7}sFYI09zJMt>Bqc)(^zX(;+Y8amdWYTiognA#D-f#`l?ev1a@ zEJFOM`Q#83#+K`kq}TwU*WUvokAG-K>>zadEV$3<=XFm2WIUX;JLns(+Mm<6M{67p zH|_YI+7L7;1AV~`6tvla+V~l`G$H2X8YUksb~F>mGl=2rK!xM-i7ysD)Yx_gasz?9 zN21hj$Ef3jg@06l3?mSBlH!6L*pKJHX-zc=kh=-wEX}iyFLwj_#fBhvFx7(1S9}XW zoZO{CaVt2?H<15t_Pzu@s_OdxB@-|OoC!-6E7hQ5MMX^5qo^5}zyv19s)%R^ zSs)rhNG2GpOEd{)8lu!{Ypre3+LpHdtt(R3u!y)JVy(C#?gS8VV-b-5_uS>ZGk4}i z{k24GxgW^9_kG`Y?!D)pyPbF6yU*J0@Y(FW1r(NWs_&qZ*r?mS`dwO0!l&L13YLqt zKe7F~Eldc#qyfez6voTj?9vwvK8?BX{zSR(p9*_&*0?WJ*Zbdw;XvgU^=Ebcubaem;XU1qeA=%1rHb%SHf*Q4 zTr88{O{%i{#rlZ>pTf(;0tZwth=NCR&80An=ZVm$=)9gt&t zavPq;#n!2?V01X?m;%$$ zzfg<^gze7l2*wVQVNZYRE#$X(E=>0(GF@si>z~7b1Nu1~WMSvdSV6quu^hUUc4qsF zcfw!1T=OW=6r z5iisoM22|Xv3FQ5^Y_h-u27!lVAc|f$5(#^j3$R(TA4%x9Jt)l@zCNbRHt3JSTu~4 z6-h_E#>LgW^Cpxr2NfDU>y7?e2&r2c2t;2GEZ;_sWk{jo?V1I@aT>fV$&qvrZ_1^_xb$jHb1^MAn$)Y}94vXp6!%$L@K20;!N56Q1exqX zS@f+K-F*3(bPvQ092(5R5}xZFAt|P9QQwSaY% zK}eY^op`CZ1>R?>73X^a=0Di%PffN_g@<6K>(p z^0Ic>>-U9UI4?@>)OF&2*2}(Y|Ay*{Z0!v-ivcH`zmG|GsqoRY){*BB{tIev=7~v= znjz|e9^8d3xW7gDuh1vX6<@{QLr!0v`fbAFd!tA>Jif0ek1zb`^P}tXq8nN^rvToV zx{mzFNZLxw8|f#UwQ}B>$_Xx)aD<~T!7sdp{)gnN3BrR){OB&+@+JKbXYGZ9>xtaD zC&;h8$MytIdeq{s;B%3xa1%_=_5evd2(79>*}wW0Yy<8 z1It81LL>Nv{#ulTud8@Z6K9L4TyQE}!16H_PRC`?U2;nYoKDH6(>wevl003%jX+<4 z3XP5iPo)78oWOUX2h53nX}^uocULZUcYy2n19*Yy1IIZ02wvbTtkyc5>E{)YGFP~K 
z?<4Dpz9MN~5r+weSbckG_%@eM!EC@O0`cRG?7e7GXbI^0HdVeuR>h z{d|h68^wP#T-Uxv{f%$w8s+01-Q4TC(DHZiw!w=#>O6ScKvWRftEsmQ-h#t26h)K1 z87e?mfumO1ql?BO7gj(!!V!s)ihe!5VIUTmi@7MRoWG2`QlRDuYgE3EkM*RYVKXjo z#WE)KJeru&-wU}ke}`Q4dUnGBqW)QqBZ&8?a3q1J92Cx)+udNcUWTqxtl$2vawJWY zV`uwy@PddQji+0kJW1mh}sxtVe|($@Y02&@qZ=!W64O3}+PGeWY54aPLSl~fO zhad13Dw)9QYv?DbwHa*y)&EMn{zHj+4%ToFimvq{E!S7y4e?Mpt%n=uapX>Z(#{?m zHP-u^A+@P9wkE8g4{r4JMK2;ZSpoRuuX;p09?|7TUSN9PA%yoG8bk2MR`1DftUxn3 z`HRMJc-i5?y^T}I8Jh=ZY>1q(7s44kWj_vbz6|1R2OLa>JCXYfslTGA5@ zOj>8iS0QNP)cO?OGC)~~72(R~=&Vi`ZqYs>E^#;6A4(0zN5zH_o#RBjkP2=MVi)Z& zcr!#LA(7zO%f}if6K}ER!Jcjq_N)!z3;h(Kl6W`&XH=y#ftGiGB>xo#zGM->d9jDO~p)*g1{(+bQxG{xzx|kWUX21$)6_Loeb# z7Q;t5(Y#-N=tI9>@KR2+74I1Ydq0QEZ_rEr={Wctg@x~bp_Cp$3q%I(tp18o3?q4a=X4IgSA=|U=qCOs6egLmFyT-r#(#XxiSisM0nzugE<=?<@v{K`YV>7%0Fd@m z!mEk&Rj_9#XMQ*_x))!YtO@DQnPZVtauYt`_K4UqhTasH-@X^$aSUO*6yX<-%nvo) zC=Lm_o1*!|m)0p#9m3;lI>i~*Qoc$be)QnOkM!NOdH9&Th0lsbF3f47i2?c!#EjjM z_5`D!jEio>4kWUlt6P;Giax22YnaP|>NSrLZX3(0ZbD>YNVc^Z<(9qkiKLt;UiS*F z+VPtJ*AXOH0XTdyX*3I|&spT_6|X9rs!h(hTDNB<#xJZjZi*8kB&u8cDgp_0qA zi6l_TPqm56_&=M7-TvBkV(Vprt(Qi_PHu{|lh*mDj|uJLkUm8FksEyupXdKt%1H4` zcOh>{Qt+}m+7+iY>8wp9*mRs-ZUON&Zv@w5lM-w?DyBtgiG=Z~PHI^ulGS8r>CZm9R5p%IqI z`s!EWAlTFgD}!s-roo5Xg*p@RJ(CIa6zh(~2Qfss(5vNXWDWRgr|fd9NGB@sHMpVa zBm!G-WI4tl*DD5_jxmhkQeh15M)GyjkhjrgG8yN!0qL0>nr_xuCK-ItgnpkxxO$|YFGQ3 z{yo>%rhi{wlLOJu1C#QitFXPybMZqPU;TZkyuqdn3`%R)CP7~*Uo{x^)wkdlYP4u< zjKSy+>^=8I0u^|S<^-*yU?r+QbnNW+hMg_I zxve*vGRt$RH|kN5cpVO=z5OrgPwtmYll!kRKBM;S`#e4)du-3~*&%(Y{kXrTeRchw z+t<(A*}i&z&-Ha%JL@a+d#68mcE5?f6qpo zGzSpAa_pibE=hkmE+yO3gFti(_iw>X{!|2h4D8(2LVgVH(`pSy-yVnE&#~RHNCMs! z#XcwLfrla6y86JE6a~I#>l^V}V5Lu-x2rz4 z2_uyDY3OsgxFqfKEA@Gmvd@|SdHcK?wfk`~yO0n2YmqK2@^I|*dc#g5!ZhoN=FBtB zlJ;urw|o28>mmKp!$q;*kbC?30SZdAuXvpL+&5`oqfahF>m_`{H;N{tgJA zZR;#)szj6Dmpi1-e7#Tex1djYXX0!2|KCgdq}h#ZZ`;Q{d;gc~^R{-@XXbyoJ~3`? 
zSNkmZFW2W=|E500`k`<1JRlkK&F|CmlVqA9eT&ae{tx=XIQQFoe)0|Ug>;HVN&nI3 zCtp)vVt%il&l&n^-{&XSpay=Io}V1jpBQVa@wUA_@@ucxApL!xpP!IDw&(McL;Ct= z&riC2&+Y4mcDAqMzUTV7y`A;d|9h^ld)rxGGrs5gdi&qf*EjX|Cx89<)c4}~3G(x; zK0ol0(%@8a{5Z=_Fq{_vlBe)5g<`91LOUxV8HU3z|UNWa4E{|%l`eKUXmH_-3* z(m(CkbNl$G59#wi`#h@K*YE$om-b1soA!Kua@>EpK5uVlefIw^*C)-6+S5K~{Fm$V z?f;BE>HU*Hbl>D4Hi(U`Vaw3$SLk-bA&F%Y*h9K@ZA!EUzPqB|pbM(kvFT$V;sr`_ zZDaKr?PsH3o5M;i@kF3SUU~;VY++hQdtq8m^zBIJuyTvzC2f1rgs=YhxF_5sKgE*M z)EOV%j9e9L!k#hqJC<95(f7yE%X|13b-3`IaN&noF(KBSg`-9O^iO?P;sY79Laij% zclvQpB>B2#AKUO+&HRQBYpPvd#FX#T(=L@Ma-!7%!zcTFUbSul>(f@X(z>6(PFk-< z@3X8G-;TD|tMw9tE?R>p*H?WkzPN$+##hy3wy;Kk1p$b4bokVtOZ_=iKJC}OeS77z6HB0t z&c}I$dlOosF!q*+?}zr(K9@qD0-=tG8hN-ud(cbVJw=NzU(Ut{vU*S#vH{m2W|F_B zT#`rUcwJlWiH|YM)|NE-{9{-E> zkIisdNAj)OzyE(x|95;|V(H+X?-H<(Qokcv^Rp8c2T!)i%5eIE{QnZrM@ zy)_^Gp8h^FeJ3sumHvTKfqy-J^r77 z{9u!W%l%z2K21Zip`A^1X`!FKXwf+37mKS+z;eOPy$R-Zt6EQt8BdD!58`=~2e>`ITPgi1_Wb-;^89O! zHxoHmRKBS2McIZ`;a4Gx&bN|MVybSco+`*mg0KPQF003FG_t2tZ3D_(c0MkV4bZ7= z11&hQl)p?r3lvLLgmGZUF>%waVx~Y#Rh*f^7Tic{Vrb$JY)Y~%f?Q%c;kbmg8~B z&Ufsty*;sOFR79b6tq7{Z5W5WKe37)+X)n-7}>6Qnw{>}4@EB$RJm`Ub!WIy3A|eDhhBc#539?V=3WF-$cSxkGjq9)-7=0K&Cs4N+r^|6l1$Gg3y4*^eg667C7{z9p06z*x zKTh4?yJ-cE*XCrl&`&7qIp4LnAg##l)1l^5DS|Q0JsRbvmoM$8%fW7bGa=d8Zw-=V z#iq|^V!c_8*gW!RoKR_Bg#FHT%Pj|Ce!0;XbE8{Yw$fgmds1JdjUusy&lD1j%^~}q5t8m59S>!caa1=H@XXVd@25Cp`!n4b#5I#s`~9_07S5`&arTJV%M)im#rB+{ zqK2{6O3$%Te?MpoyK{boGR#3PrwvYZJ{k{b`$fj0OOK$XYC}_L*pY)RJ4*lHA%`7@UQPpWI0O zMr;vB_5r&&0J~|xDW}^p+HhE@+s1$`WcC=5KuxJ4(VvcLW+`H#Med;b)SCEpuAe;x^jaCiNdJ05e;{ z(U(6B{VlM%mMBS?A7;Lm6J1Ql)VR0yM2TU9W_i@AuQyy0;GV)&D;n zglGfZ`Z{@(94W)OCo&m-QzB>KZ->Z<_}ej(h6Lr{yNSg~dCg;zavD9@I<~Jrw{Ce- zZr!S+Kw~Kkb!iU6T%*Wfg=jhtT;4abU08gutRR?z=NPGjH;U}gTjND{WWMJ?SIXdJ%}N7MZK z1C8hJ2XB)9OT_*qj;8q!P_&4p`i|BNMB`h!1e(BHt*<}pjVq4JuKzG{DRTZquzAv{ zXe@eV2OC8PL)}N~BUBQUV*H``rj?WQWuWe(q(I$vd}VikPVeoAb~`RUNAhrYuDGoM zw@G`Wy8a|AXgB@O%i3>moOmCS!V0u?0&WRBj%sVQ{szWDafuK(H<#XCKxMEW@J06q 
zqtD~hmeX~&))L(SA4$X$Xu`f4uh7<6DYWBSDEh(0U81kxbC(xe(d`#o;{+cLgLL-% zYq}nUtW@!xmrLKv$))`Ua%exn0Ja>6?njIh5#=n*eiwr%Bt^F%)*l@#N@kL>GvDA+qONdZ+(p-0XW$ZjG+u>k8TQ@n-CKm>LtcPsTNQBI2DO zk|3g8LkW&M~Pxbo~6POe;~Toj(jor6geeA#fUC+g_DjJwfADI3eu3k$rI_Z zXiWCkK?)K<$x)xMZZj>Q$?ucO@xIsxjC%8s>;dbpu+YG6#efT+O9)}mRUd@CHvRN) zArwRKUKlDjWB|jsADg(3NXuN0q5ZgPXn@g&gom4pFapE)e|%%mUl44X4QJXU{|sC% zqRYgaB@WoIO-=*4(cRGxU~r8@z;7-6+S7p5A`Jm}T7rIG&KY;n#p>>X(-+Wz@0yO7 zsL1{^9DU!O3hWd&h2`)P8&=}+xR?;+Mc<1)5#1VyZYx=o+ca!zxVguNB!@O_?t_=6 z^n8GJmUfvBP=8zZ4(0=U#e4v|`UQFqTM@aBV}L>7?C|j<^XgTk=Wuj4s|qT;O`@~O zY1oPRKuZ9o|73J^-SbI10=|2)sy|VQhljLb0>oh;-yH+!Yf%QO&~oZq0uLc-s4V9l z%m_YNokO#QA5(s|b}m_j4dadZK=h5=x?x+b>i5K@Tu8N(-MZZ#hltMCnGXngay}4! zgKlgfeYA>qi)K_B1C2=(u~*#NZU<1x;;$ znmniqxqBQk_`qIfCKna6S+vo?GnP>9FgL((l;#F2@C%EX=J%nsK7c~(g#in8!hJ7h zet=<5*gp=uupg~X#9t6DoaXNpLOS|TcY}MoE`%VYrcwR|8t)h9qoWHq5T4yA;%z-v zDyg0@Gy@nn^Qs+W@&O9z7so0VI00XsdWVIScDPxcroCf zjt&WVolJv;1rtR_8T6N9q>Ih!#Rw21_4A`E@}i%$Y@q@08>uTW0Dc0)r}r}t(BE*@ zJM&JLRMC&c4089*J64UrK!z0G-`COqaMp|H5uXgz!Rd`oG}J8Y2qUZO<>*)Jq2Brg z%>~qc<}@xEAE?`hL>@r;@g!n$-4{6dgtCwEK{-0ww`D4V7|x#`U7j2Ll*W9%zi&!i zjubBkVPj6_diu%BdVOBH+M)d+XcjemFS7O~{m;#M-FNMY9u&84>06;@QPLQe-!2oj zZbu!a5?2VF*WyN$gh+b0IXD~D3+0%*d{YOgtmo+Fpa&!LE}q=1Rld3(BbMA|JhaHo z`q)?J!&zSRoiP1-I(OG=wD~ugFq#gG-EPBCF3KiyIsPU`X5z0WG7*1MB4^?+#%=i9 zG13)A-Nm2VJk6h!+dLj)svp3Hj}O(YND5)99&DUJBN&f!KAg_sP}kQ-LvQxw#^E>{ zpf6@ORucJW5=258LiVybe! 
zoVCzYrGREFclv7)JyAqr-izt8AsuOrc7VdC;QY)Q>p za+~{Z4>wJwS(}zki&YwR7I-YRV|;Oc;$X z=DXs5*7kYFs6pg`+~^9(T}S`(vi`-Y%nql@JSZ*mRXG_E)+ifYc(#Jk_gec?9u`6G zn*)uaTg0S;_8y1Pqls%R#bsQSEofr|?OoQ!E6x!czW)s*7}>^A&BF$ho32+F&1bNY zx-9Ja{$7XH2e28oY9**hL7n6!(=(HKtW z;Gs?LmvNCgcsx?O-0wpRT@<7-g@|AhB2bf|P;?gt6+R0yqMYboCGZcG!*^2HU*S9HMt?0t z_)d~TVBbl1_!r|`ss%k&1?H>^0gwYJTW>3fa~ZyK`oYNWyQH6q(NAG3)MO>F72ks2 z(gjST2Gw2#`xs6mPB`gM)WS4`J-}M}kagg>V%VPn6Bvr&RcsbX`~~>JKE_$(g2(@J z((vcJbo|NB5asSW31c%V@aevjP^MJcwYC_P@;D5n%Y=_mY#RgPk>eRkkz52IzE#kA z4r~bb+2LU1kNRF|e#*qZ#r)A!950&CMronwOA|w)IF}%S35s*xWi%H@t$!^tg{uBU zs`_ICwO62M=Y&y}ze&{>6<$<*s`aU+N+=1k*2{7i`qD*5c7Ph5C;m=$D*lC{;unjG zA8Q9x>q*N(4B6gv*u&+r%F`Z^xFmzzFDRGxUy?fc96F&5q-m)H(@71{sX080)NwSa z;|S6YHt%>h(xs_8TzEHO@Oq%SUkHUwx4}_L~;1O_c zbae=`^v43t=Nzcplk8h?7lv;jWVS5-C^^_X>OgK?OA^LqDS6GI9l6U}Q-lvn_{qj2 zvRvUO`=LBa_SL_I6a(`}FB*{=%<_dxrw; za;^YL{F?#{5qOaRGw^qp0H*-F{35#Z193hJai}{fEz^YPkHn=Kx-?eYKMtA0-dkNG zV{vEKF7pJ4gmx{OCx9yjP!X!zTnn)5$~204Ec7> z)~?9NKy*h*4lJj6WYVXjd{evBe*$io)q$)h=bcO!*5gS49^FmtQU3|_+5;DM&Fe}R zUL&;E{e9OkF0T0_%_T7sMJ6qnU7LT;l|l82GR z{-jV;4x@v_ zw+eq36mW+=kY61DA{SHnPO29Rt%PKqdW{(?{n{I`xv`%JSl}-kl)Bxi4qJgmWgG zo?cQ=T2@>{H&qsw6&F+$r^m3NBXZ}^{&3)W3(C#63-&8lI|#7jacO zHJK}&^3v_5c=n|-Hu}lndlV7u=FLp$1helH!drWwD5QK5_YU83;R0ybS$(UTRy}l* zoiMz|!Tfctm-6lb4GhN{-YE8o7K!wv&PHVU~?YiXQ-k!~#9B=xV8u z_E|`B7jz7Ght?f7+Pfp&Gcc)>cWBU?5%Bf_$Bto4XWUNdr&>ZGM1RJ5)60>5!at*P zgemmrB8k)VqYLE+;Yj)u@TS!zkHsC9m5%>?)U{?DTjS|ko;N*tVkd7}(3^@F>He_d zoJX7@Z`zDb-qg`ZOg2tuscUD^uh3bJtutt7L5C4qXP?KbXJlt<8g9E&$+$UIKhS$~ zN;pk#x#PkynJuyIHM1Gfe`TiKvbQ}>qA?*ytRqo~Ybs|)E~RZk!wf+7dN zs|o3bY<01cF$BkCQxlX8FH`s#4EN)Box*=YxX=g6u+9_orl05Si8>5*2)Vlx$fa^h zBb^|eNPj|P=i^aQqdJrR`>Ja+MWjGjw!(z}1m~3INecTVj;-V}Z(6B0mAGXJ{|(^? 
zgw-nS7*_T-DomF>+5QrRt)pM)vW>PO=v|bQ{9!`VU%C3(=hi+S0HN-%1sxZ5XimA# zbA9ryh&z=gO$gh5gg+?7Hsp`;-rXVjvV?VOOlQxr$=+ohF7UQ=2zl3ZnCM;CVJy)% z01N{h4iFWtL#}rNjyHA4@ovWNjt=9A^gKu*ka9s94pIoDT#!a7(sUv{0#YtWIZ9Fv zNFgOD7m_9tX(c0(Bu)w`l4nU0SW_V?pi(O2F9(>Tk{(YApDCS?}+k2`*|Jp zIdOf%DDRHCRL_0M-raR*Y+EJex_U{DtlO4X6CU_~_3%sGxK5YH`1aK1liz(IY=9`mn8^fUO_r{xO}CSJ^2Ow~wUg zbLsXGUmg8Kk%P7WP6~25d-8<>C)gSg%5gL>igJ)g8VFJj0;GX)6!Cmp?_((AIh1l? z7^5iVxpdnIx-Exp8%ejJu8|R}qoWOEBAe*KW>VD-`m>ub7{+ii474r{sTYVGUL|rE zwo~X7u_&V)wDB1*=lUOzWn0z?`UBfm{Sn&ix|C7gjO4MMyy@s$bhjk>-eu~Z+vpx# zsY@OM<|K?qMo@=$sch-xSnXZKolb46=o3UAiIG$CD7%vkbWXm-J_&YC2ep@+B3Q?% z`|2MX`p*@`og+GVF+CK|H^z2iI+X~-5oCK=v2gxeP?S?5vz+;$s0 z60$OqAuE3PH$uqTpk(!;eon|jKbUwdi4NGaF|n*1A>`E>8B;bWW%+j4&|X#Qzp0ZvstgoU|G8#F!Z^V6ta& z2X}eY&!%<3AA}B6=0cv+&gReiOMQzVJ>FXli_`kM^>5BF|B zIc#d9973Hbp55&f&vY+kM9rU!{D?W7Q+F_`NlLy2c7(|b(!U`I!w%$SGydiZ@Y&%x^7v&m_QaIoYrR`L%<|0~8nN6GXb4Ea>w zrz`mfyFXmhPVr=re-zhX`aVuQ2szmO$Fg>cCtW=JJ^1}$V>`vOqn+ZJa;&T;2h$EBh@wTs^YaAb z6=ys@%Jz5NrN;AP)p&lA>v(=t=j3VLp?EB9JU_N9ig#$b%wK!Qd-t~#Z=c@npI>TE z7OQw83Ch>;pNaAn{<8;BuH-+%g9^+QHet@NISGp;k|tpef%OOpeD49_KTADM=I>y~ zQ_)WGY-p!=vX5^cy)9{nc&J{d`DK3YNnmeIdm`qicn+SF{Fs`bPEhU1`2c4pw6kEF z?MdUe6t7-j(%$*4QHvYeGv3Qlv^!M1w6@5(w82=NDqTfCMvuO9vt|91{)qJ=xpEC*ZPHwbz&Zk?MT)?(Cy2Tm z>01V@C21<63G20%Bx|~ob*(LHq?UC#WYJnZQ`RUY>mn|T%Jxnr>y1>Lv|Y9|@l((V`Ya_Gi+NlKKSG5swWe%Bj9kB{1DY-Fz<>v6t*|>Mg zzbPXfGHz8e{)uB*mboakWITSM1vn!+6Q|`XI77jCS8)zg`DrG5TS#j{7o=eAaUGWO ztxHO-GZwLq6l-77`#}K0xheQ#=-jbY(OzQBTXJ$BVG|V+vdXk9O2-&`os(F1ORL2n zWy_q#V5w%vn_ZW(z_T#9kS~A=i1ozFlq})i#=6!qD7&@-MVVrKd>|3WW6{<4;KAFZ ziGs&ci)W3C9>kC+cr3MekzJ(6*2xSPYw6W>k9ezuk!_qBwfNw%I>muhD|y%Oh9f8z`kyIlGN3 z9!oim<8bBli^KJ*W#__p941*qT+*qWEe>5Ad8~8;!P;*iN!DN&Icjoh+~u*c$6;b$UDq#A6L}!OwERvt97vE_l!dAL)XRio<^mB+1Hj$!~5Pp8$_F z#s$xF!N;IYn+2Z4C3sV+FR2JyjT zT^J7n@mLqR;L~0387_E{3tr-aQ)?3&{)*Eii#}2%gSLM<^H`+{b$%YkVUl&43;z;_ z$c{YLY}5=&`6(i4D)?6L#dWE@NlTp~t%1Ibla0>q>U!9D1x;=Y$=3tZUerbLgQ@qbCR+ 
zYe52(%CXS}|CtNE&;`HF1z+TX-{6AZ=z{;!1;5z^zr_W=)dl~Z3%=L|zug7D!v+77 z3%-{gY7 z?SjASg1_&AZ+5}AxZvAd@a-=64j23r7yL69e5VV(+Xer^1>fg_f9Zmw3ri3%KXt)V zT=0%Ac&ZCdU!{l-m_NGUUKd=vEfFWkV|8=k3(rde{t+&G+Cd~fU_R)A_j1AYTd1+J zFu!x*i#MZU$vE>^ei!~vTyU|PDuEo@B_KBJ!D*6QZ6_|Iu*)8cKK>XVJdDq;df2!q z7I(&PvA8o1jK!UKK`idf?_zOh-V}>F^Q>6hng7P(&ipzScjoW0xZ{6_#hr0rEPlF+ zKF@H$&ve1XTmM|NC2Lk{3`QsPi9^eZ_w&dakqX<24Mz!nD)S`C>(Y{sh-)~G7yKh) zh2TtDV7y;Ntm}loMJ*e*?~^5aRGgJK^}|{={YKH#w^y&Al_jH zf2`sc82BWGpJm`v6+Ya+rz_l?p_T}|m%+xn>jIDW|5OV6xHx`|z~lWYR|>pu9G~`s z!YbN0{AY@vY+PBS@Qw!lONDnbaC*m`zVhHT@Y@yM&A^u^+_aRt6#gTFf4{;_4T;Z$ zBHo`E{1(MO*}zvTyq|$TtML8?zFy&j4Sa*b&oJ(<8D&;imoU zRQLo#&OU{ko`Ymjuzjt3gP*GKA_Mm-+?;v$P`Ejh{*l5XhMaVT*BJOu6n=$)_f>d< zf%g}9hJj*zn8MSHD`zSE7y}O~e4v4k7WnB&W=lCv@kbl{a}_?_z^5tP^eD_wc)h`w z-z&+meqrF{ioe*vBLa{2PhKYQ_TcGewhs8#)P~fI^Eb9g(-_*1Ge#+@d zJ!8dKzZ3jHRPAX{wna1CT!g;@I3jQ5O#H`!c9;1BMR@#xOBKe;b!{RD7=Tk zU#oC4U;k9N*>baU}Oonqu8<;P)xM z9G5v49}@WaR=NX;qveXf%)r+u{4)c8Ug74z^j`}9k&&-A6#k+i=Us(wHt?+qH$54j zDEu*lzenLCjd-P_c#5^g;2$O?T+tu`G*SMY~Z~VKFf&rc!kUHg(K%= zg{K?*OoiWV;Nl~<37#*Ut@s&6dc^lT@YHg#6QwxHQ+(MUIv3?Po-T}gl6-;Smsk}J zRUFMw_)r5cRk#6L6#}1YWg8bRRk&%-S1R0~TMYt_pRX+xxSaLydw!vCGo8f2b2>xI zmURpG-IB#kLW~~Q+y^Ml&@mBc-?c>eyDs<_E_gR|L}s5!^82{pr@7#RUGQNp_;43I z*99Nvf}iJt=eyt~z`G@Pp)86JQopq>{ADiq^}vryc4k3Sl$_raKS6ojBlwP{)$5ej z<1YLc1mBrWH7ogV5?>wjpZ5jd8ReX(E`H*|-{XRR3A|gfPMu|GkHf2&ASQ~py9?e6 zc(-I{b|=S?mUXfVKNEP@l|aJ@0%0c(>#eY+OIld`sXzv2o4V zML1jUp&PCg%qzOFbsZbjoyP!Al+J-J_(&Igk_$c)csI+L)r)U0BJQhP_}3FZ*%>{H z?=OP?Ti{(S$KKF9if|9%3G8sCfouNDF8GHAuI<@_gGBoIG2sc?jbSeM1uk;RfhWq> z6@;^V>Ga=d;I&E*e>QN%funMlqtF8FCK_yiYxHgK*7 zBtr!Hc3Pr%e@Qq`pgR7G3;v)B{+NsW4ZwMNw4V0@@5a^GB5~b$|!V}of zsV;aBc%pbGy2vka;a_3kTAw$#$oaLw&sO>z;?f`8>mp~FAxGg|B16Kmv%ebMb2cB!)EnEl*Bm#Ki!tE{gKxS-0^Pf z7JRY5&$9Vi&QgJA+xUs<=vjdew{bLIB5W0S(8hI7)(JzXZq_Ip|D`%QTHv`h{u_04 zGU3T%ZCpNAz<$C4cf6ds-w6ph=h}Sny;hu^4?L0mm$~3|geNG+TU_wFUGOJ>Co1=i z2CnUJmkZwg@K`yTf0_#(2A(Lt#e^rwSCtF@3Ku-;g5U0fKj4Bt3!Ljm=j$yO{$>|` 
z3Wn6(k}tB$U3{k%XWf7&(nAK}3G^`91*h-GC6ZI`g8#(@|ECMS+XX)!-(gD>?8w@<$NA%+>rQg zY<->hwkBA;1wO~-OaCD1wZJQFTzvBqXL-Q!FD^`T!7p*aD+y0f?hP*d8-RCX>+!^Q zGm(lr1U}DBkNEy2&S-BD{EG|g1;56|#rG|7_Pz^0scT|6#{%!ho^EP616=sy1^;q8 z-ZV83FLB{F2>z9E{9n899~Asro4;QjJ?p~XD)`sL@slxSO{BNpz`L>agxX^=RNxD2 zIT}Al;Ei#3iNJprhhGUiQM|tu{Dn4OK8MEh0)bx_hkqdOMK-Qi3>=OJ*WIie;_&_g zztP5Z`lkq7_lo+txMkG}{N_0R5`o_mhp!X(t#SBnf&VTJ?+q6~H*2wti|@DKEKlIK z$Kmq@eus^V@1@}EPXhl_9KK%QOKd!(?C^lV{~Cw)`yr+0&N%!+f!`H}FBJIQarnam zzc&uwD)9SlT$gX!5ftyzIDC-6m)W>3-wOo(P#j(>@JDQ1m+zkh{#YEoR^Tmh_)dZA z*4nX$W8iA(X03|jj}-XoIDC%4*Vwo&-`@!Q={WpJfj?{Gx_mzq_}Vx;y(gt-osH}A z%@g?gIQ(*f|1%E%o524ShrcTD4RQDnj-q&9jl;79{<@9-T-n=20^b;i*9m-69R3%9 zza59K75KY09#Qf?6Zrdac&{H(IycARL4j|H!^;G|Ee^j?;M?QyM+Lsa#_v(-X%+Y< zarh69ru2Lkhg1L7&Dt4RETLj;E`ZZ9=SxS6$%zs`Kd}ke_PTA)I*RwSB4l8av@>oj0 z^Ylp9{49a%QKV&Q`~rbH>lnqmqmWZ2@SoUv5buQoZxXn(Uh!PTxLx4Rx%om6l_>tnz}g{EC9gs^a|e z8S^8>RsC=h={LaU7FEx#kl0|WvbeCivZ{1male5BGcvNQIpu|Pm^rAweK>fCeK;h` zDz2=YQ<<-51BMQ?%1g^jBer8A5-C(w|ImB~zeT z0uZ-n3T~zlkSQc&N{Ql5B%5v({QiR9UoZ#BW5Mq)`2B^P{vs0O4R`ey68Z}X{e^@9 zGQmQ^0CDdCaqj?e?*MV{0CDdCaWAqBIRnJK1I4`q#k~W?y#vL)1I4`qWwyn=1I4`q z#k~W?y@LdEkYElH%t3;QGKc&@faJEG0-pNEj@n4Hj}>Fpx7? zNEjmT6>^3MKFkDHQM~v&MBFt*NEjmS9U|@>BJUMGw ze`badkeMOw&CE~~DMQ*vrZkYu3?U>lQwo+gf<4ZJFlB9-iXx>*Bg@qGC}U7Ys|-`x zSf(;iX=|C%Y%-oj2g3R1j2t;JXHx#8!0>QRzEu^eESyzIKan|QR>_=- z;&Q8`vbdngLTrVzE3A^jvN=`7AQYj2vq~x|k^T~^xJ?<1XmT?AQD^*0)!&Jbgadf%&DAR z5XqlcP*#mBl&i$+MCMa_Z_O-@6qnC)ie*N1X<1QzRips<6Kc7{DgmdER4}JH0u@w? zlS*8)BLX46qO{1WDl0Couqq<6NQL?HON+~jto)fsMx?T!975+*;15(d(<&+x9R_}@ z=Ff)WE9KR~IhDn+XZdrhOOaJO(YU`1g=C-e4$V$aRcZMp&IuA1DX6407NE3|Os${9 z=)B^}87RKUe4AHTQO)iWXZhs?vmHgFFnAR9jg(o^lbu@X6YZ<8&r9-Wmd%+_Q06e? z-jYyxCB-|OY6tZV?4#lg)Y1NON!?^ z#Uk#Ai3YWyusAm68c}g!E0H-)M#Y^lz5Y5iPE_&9$R~4RT8fEL494s{L`r8DSLUN! 
zF3GP%Wwxt>14pVDTvmnr*tHtR`Gu(8XdmnnMaG3O7tX0hTNMXapbDV+Q6;jkp^DBd zD~`-5&o2>Gsid$x!b{%1JgcA_jfHcgtbV2CC3EbAOQ>e(U>lJ+EUHG=Qi>?zb1CbH zof^ASLAGjO5cUNr2zFTM)LUvv5;q;j6`h=te3uIXk8f|NYQ>`@Bfi*m9Qkxp8RXY9rRUOKe}xvD=5OAx~vzJR&6u&t+L+5!6K<~DAh$Yj`Hf+`PEg$tPPI6hIFvrFZSB(;7uT*1alR2lNC`(x^;6-7k*q^78}sv>s4 zKYMl@Mk)}+s%_D}zo;#CO*~ylDJn+YE3F`_kLecSA&HB+s?9rQB)VnBSKN~zmKklv za%pLTTP`ha^PZ}z8Hw(i(e_={iSMeaPAIH8F${L~j07p3(RMCbQNjj`stb!}kYVJ< zSQ*o|JBgP{8!RV@?j){ycUG*j-FdORYhp>-AZy!Nr^KTaVx?Y zl55$;vRn!+hU8jWF|0OP8 z6pYDjvwTeQR}_;;|BBKw>0eQ3CjBeQ&ZK`;5t@9rlEgU4nV-8AVGPN&>|$9ig%(3{ zEv*<<8^sjEYomN(cx@C+46lt6iQ%P zM0(O7({2xtVqulj{6hF>^Gl0rY???2yeEn0B!-YVn9|VD7!IMrSTP2%y1W2hV?F0e z7>6@=HSV2(XDaMwywHU&Qd`45ziin*S!l3mEQKI4#e4 z68|-S0OQYK{L2_#$ncvPUc~Tw7+%cqO$t95zahuH|ehI_H>+!fo;${P)t82+HOoj(JP=IixXBjp+cHy7T_?IDGt)HtI{ydI# zybE09{EFeceOt!l(5hH1=W)j8^=>ueQ+hO?bVY&k^#cBD{AB_WNN-%uCIS(NPpfV< zU#~nR{!f|w&N!w({1@?G^SdjY_`JUNW%%Vx&R`~o*6?XL`a2$GI!7x$wO3a#In;Mh zApVsMr#6!U;XGdAP%!xm7|z?DW`&#jxsma?e(q;DFW=Qn&P#}2=XWjRUxj0hzt8Ym zh6}~xeo7Cmde;0d1Rzkpc>U_5aQflpbt>cY@*2-@9`6)|oBc(;!s&->OsD4(CWqVG zJceJ*@>R?5YZ%_(g8u+tHL!t6vq^Z-@y121uAi} z4bA8AQr*lj|7a7yR(4F9zY{wIbnV*I;Z@COxc+UIhGQ~F=S zf1OUfZkW=;?Nj%sW;(T=&HSFBl9A% zxgPj9mgB!yd{Ym9Q8>we1OK)Br3xqcTo2S%P$2$B{MUTaAqC>!z{+K-Kqdc89Bclk z3OD&4+)IJvZ^D1g*ZqshKbrC1V*KM3Zu0dPTS@-gjDLm;e+-lV4&zUB;nSRg0?B_D z|8@FlY(~LM5A_igi2ok`Ykr*z{}+t^KI8w!h5t9k|A6uDbK$RKeCq3UdY*CNzsC5j zjQ@@c|5L`_!uWe#_?<~H2=qg?q|?(~;if+IJc2IMTuk#%ap9k#aQgX>@%27Kru>PF zzn$?faFMV3JG%T4<5#)x>zMo$~AwH&f}3ZyqauhH!- z@%Q4t<{!@L9q0E}IPrgpP#vuW{Xen)o=&iF};|C9^=Eym~b zEI!VsK0>G8Lm5J#AI|qFoYK$vCo?`Drxh{&ElmDghX01)w=n!xhTq2ICo}v{jL-FR z7sG$f_zy5S9)>@}_*~9p-@tWweQ4miygoH>U0z=pxGt}wN%07j{y($)_Gb7J;JO@J7~X;L zpJX`q^Q>j~Ul{*Ih11Vp8NQL>G*;C5X;nDYFFqeh?f?LR(og-7=66@P$v;`)B<@be zr|&gVF!3Oh&)chsjQ{_boC_JB_Y0S}@T(ZVBa?Hj3!lETNP+Up`F~_M?|1&es)a9ZXyNJ1Ly0F5KO!yDFuPjL;b1N|Bn=I@_Q@XESD@7{-2qA>i4z$`xwsqos~=u z?{^L`oc9-?p9Q^MpQsc_2IVGQ>xobtt=cbv{}E`L13xjtvP;FmF++f|doMY$tx 
zt%u(;K9_$N!)Yv``S&w9e+RDX^HP1I);;pY5l0? zR-}iwOD9kw5h$IH;=kr=e;4WV9;TlF&M8nj`8YgJpc1Dsq?S{_@Ov3v!f={vYyKq+ z=lm*$bH09FX{NJL@hM;ATh{UyG5ir6YkTAVG#b-t{$j-^`FuX|Cxw&zrQmBh`gthf z4>CCv7X{MWGKQ}dsMP-t7|!Q4|6n-zU@4HCOBqgcX9^TA=O=+ff%r5g*ZDe<;SV$X z#|kGok1)I+<3Gyq0SxEmo~3Y8&IpE6I`w=lPvN9z{+wc}!YTbVIM(T>xy$6CIgADQWw`+>x#KSY8N=%+K2GnL^yzXc3m#rWEvLzg*z8RL^2 zU61th3$h13zLNV|#HVu!lf%nb&yPqB?@w1VK7D;e>xcVocz-JQg^+#;E@wZJb2yXJ zK{dFfXD-Lf#=ShhM=_l1N5R4$!rNgVjQZnTe=JOIZ3j zPU9bKhgv>e=i{HANsoap|Q?x#TV39qLU1T{RdaXrrHA}`zcKy{vG;I-NBK&5>=XOQ-oeJ-* zen}71rfL0q7|zGxnosqVk2Bv=e9AW;Z|`uC!^`FVBo;4@DUh5O7*1tJfzrwSA%4bR z&-kY)+-&#GV*D2wUyt`F-hVPY#P}~U{2YaoeBR$(q;Q>1Jx(iQ%s@T>F{nGRe{7;omDh>FqVf*Uy^> z=l&x7{D^L&eChP)jU-Ke7iEYh-cR8qhtjXpa~8uX{Td(5aMGK`&r>+*jrTjV7{3=w zXNAHkJ-pw!+(k|^lS6YBo!^^W1Jw7BpFE8D$lisE>`P~1&=Tqk?`NZe* z*#d_D19G(8>VAOoyOGJiRPoLH)+*fO|AFy&`f2SG1xinECV!PcrQW#xzo>BHAIJC` z7=J#)w=ukc;q*+C0;QkN7m~rDK@S_w?@gC3Qkj+paocaiDhbJ=p1cvux_)i!< zK;aZGx3|+3ZkBs4lSB5e%W0f}52M5(oM+(K0=29n1J~omOAI`y_`DyXc(whJ%~K#f z5FQq1mQ`oqlN3%mr=a;Y@>KNo93Q9fTMYgbg;QHWLCewO8R{!JK3$$#7 zddk3ODV*v!1lb*teGJ3N=CyqNT+-A7 zf4)XzD$VE5(Rey}Kf>$Fkt7&_vJsOWhVy>p1cvi;%KhbJJHX591IFj+q5aAzP`o_- zKK8tZr}KvjCq7S4Z-(>!E0f_oomosiPv<_y=jr)@dd@=e^7QlPK|GzK7@wzS3d1v4 zdCgEbse`9yKjZUwJF({oJe@yc&mDMr`YYUQ4|5p*ah9I23xArzNe_>Kufrt_r*^Eb zx=?B0+CJwSxRz6E;L1F#1qR+l-LTNW(-i(o1J6+SZ3eFAX-f<|Rq>Y^_&|j}V&FX$ zPWwkuApMXY^t|o`10O0_7^@q2y29TyaKFMoG4MVLzeOdO;>~2`cpJlcyLy+xO?z9x zaNe#y$K>#KbpykByZR=>dAquq;oP1-VmNO<_15;3FWwG!Pz%8+{oKyGD4h7b9Xp!g zydCJnNNX2$2~{DR>; zoxaX8J!U$4E1c5F)6<9Hyd67@;XIwRe;ozWuC_2fPtP8P^Yr)b;>h9UJ67S8PM)58 zhVyo{h{@r4`<(Hgg)H4JbyEumNj^`{NeVahb~fYpXZ3C*!)c7FEsXyb<3G>%0~!8L#^?HfmGR$Z{8ojV>D~YjQ=R(7c%}} zhF{G1TyM9l2{5IH^Y2u+X;=5Ej*0l(&Q~&=+xZ%WoAvG*_29&m^9zP^IX5fZl=B+n z4`KOzkMVhaw=q8VBkf~2uP>e>kx>LnC$BGuE1c59>&wZEPyMXwKCBD_50OxW!3Iw9 zC=4_3poFdA2Cn<}kb&#@)K~*o#bHe{@JUMkR0Hp$@aYCVOW`F3?pJu3ffp&f(!lk2 zqsG8%6#q&CAE@wp1CJ>D=L~;^*~85YC;y(VU%zMgyA1ym!@2+S|1tb!#{U%9`6ChAIA8!KMV!Z 
z59jw&IO(6Ka}dLMI?rZucskEx{4-g+w9gC$Gu~$y|18E|@50~8_-8ZzCoX)?k7T@* zuWZKeq;NAm$1{F_@lST)UpzRX~JZa)=_Kb*<0W_(^QO^iQ+@o#Y9|CQm~{uLbO zR~#zUBg$nFlfOjaq_s@dvae9HLT9{$Sazj=)RcZHkw!{@nU8Gja=-<`wo3O2vv@!sIVzr}@*?1&)h z7mk*yb4nitjlZq{Rb6cSX4QVFY}xpo%FnL-g<4LjvTr?p*Z4$L|JAU?mNQO8sQrAJ zKSs4v=^~X*I<^0Okise1{JH#PjDHi;=K_ZRh2c70lJhac)o?@93y$k?y=iZHeUXWm zuz3H0W8KfrRXFkabNOo+e;iBC&lFDS#R&d^)CJ^7;BeuD2nQ9@k%7;DVPcoZ?-E|5|VJ8K3J->xcMUZ#O8una-^WH|1-6 zntJQP^uYDj!$l9;{z=X~OmF>JykwvHdG~k&*ZHkv_%E59pEI1>`NJ;w1{eHe7rdj& z7wLz~@9To=J@!b?ygn9^00a}i&jsh}WVoD(s=YDgT+Q&inSTDsa4zS^Bp^Y0bUR?? zcZ{URrGKizO}<_?Xr@!!A@Oh0D zuP{DO=MIK*IY+wWcL>WDmqTL!3Y1PRhuS&{rrs7XK9}=bhI2Xhx!}*Z$fvmm1ygVO z{RK0fM-qTwrgMnGO*vy3&eJ)=Mb6cX&+~f|!?~P0UF1B=_*~A9W%5No#oO&s4Cmv_ ze1(&^2k~FG+m|pt@28@SKM}gs{Oc4>dgJ}OUdKZH$$9v%`Hif+_Aoj0%#H%}!^E#w zg?y731^t&={M7PEV~$4?T0?xK1b4S&r*; zl1z^4_fAK68Giu7x%@nabABnqIsZO} z^LAqddv5g}i+7g7Mfs#erMpi-fdI38SlpmH|;Q0 zz2IcZ|AoR$IX5fZl=GmAoJU>oPF+#p2&R0W!cF-Z3OCa^*ad&bMgC?N`3Fons-Ac3 zCex|o)$K!1g;P5BvvQxVaIyz#t984yh~c*}TtD9=K7Wq6j`8XFotBf^U8cvBU&C-N z|5k=`Igc}(+xaLe5CoFX>qW5(zJlRAzdIPt)6<0%iD2fpAHyk~Is_Tc(^JQulXAUH zJ5t`M%S*T0buM^{a#)!BE(Wga*U?8w`IJ5Ce|5Q}E8Mi(GZn7o%e&EE7`V2ppD9Pb z=Iit;IBwkX3yKiIOwY{-=GDd%CfZp!bca4lck;h6@m(?8n4b@@&(a6P`dU*Qxl@2?+MxY-VPT>9&R{vw=kUZUt>7upW@PAKcsNV7w@m%WPC1vm%`0**~jE?yGmC5Dd~sz*J(z-tnJ|l z1J~uK*U6ai>T!i>4_6s-wEV+e`s;2k{q<=s{qi6eyh=HO&2-i%+)U>pg=@Rj>Fn;( zFZXommrr%+mj@cSw(}5^!~5l*E1b%U_fstl=l#^{4Cnoe9w(3--k)B|`fDzK3B$Sk z=NQiAe8h0xFVAKDHMjpJ7ko3rd3t(M0U}WPd3pvioac86!+F14&TyWdg7FPHDuUtjCeUmq}VU5`2%{k3inFLUW-web{{aKn>3Q0~)v0w76$pYEFW>Kg$9sn2>v-ke z@ClQ`5a@^ZBX295ZsT@xEPHOw=Y2OQ+)U4}6mFKw{R-FV(fNJEz_or>8aSny!Wt%@ z_t!5f+)U2~#^?HZ+mNsA^D_h2>Fi*Pb96cnH*now7b?F2>67aSJyr12}_&i=8dw$64T@Uu$oa>XXOXvOU$%?Pboyw3xrh#ky zU*r;Rp-a4th8!s!@9Mh9+2|r?mx~;|PTov^Up4MB?Jbjy)3|;{D%_OwmW!PCUF7UD zfqr zg#r;6=m2W0RVq_7AmdkNP$QIwP_amnjsr|kz&fG?EDSQJv)A2g-Td#md!2KbBHuTD z-CujpIsdcP{;mBy=bp!nI`5MH|3jtA{4*ZTm*}KGzR%vvy0p94zi8+ke7+&}(ogP# 
z^AL>l1!XV!ALYtJiTc%c5YIOYs@Lh>gv5C#=zoRlBu}zV^+UhaAoldz@07HjguBDr zBqAfMAAe~f26=$Mb(XQ%#&@%(?xXHTdft6U9G4_H6LpyPPNdc*pW=O`SqWIW>Y zDmYG)aB(~);o`VJF6v3RI36wTetZp+%8U8q^A)Hk;o^9Fr?^S_EsjU4pCrlW?{yr# zQ{~#4@%X4Q9)Axyj>kIapE&Mj8@km$k&ELo2^af6a#2sh#qnryt^Da;DkVKmlW=i7 z{;j!ke5d0Mj>p#v(fjivY+oyW%m@4JwtV~_mcaVK@%VP@XBgUL+wrlye{MTI`u>K; zaU74^jkH@9PxVyp7lW=i7 zCgI|^LoVt`xHul)O)iedcZz$mZ?3ZO_$KH$9*;EIH%UG?9=GM=$0OD=jz_D0PJ?#A z@%Rep*iSAr+G`z*JM8}(4c)q4HxBYg?scH!b%y7S{I?i-lJeqsd^gLB+;^(HbRU<} z!R|l4Z*oyVEr0G}A?q~X;(gw$SeN^2a=+|)rOSP{o0ZO+sg=A!`QvjAcPe|iZz%7t zY3>t{4?gelOV;Un8a`LBH}?ms2YlY67y1c4Coljy=5ssfX#WuCXkSHvxTNFx4Cr|O zw~2MCAH1*qS=RM^`6XbF?R5p@f1`^{?mynZy3}VOIQMb0(xv`?%(~{v@3`uIvKPM> zDf#nl-_!w0--E%mO5c<9HIiWO5PilIQL=uFDZL@pYY2{m-j_iD1Cp#=6^ltIF811 zxhZ}ekMef_wSETJV?4c}V?2+7j^%n4bj)WWJnuyDOPUHg#yN*|s#bjNqJ?$oSJI9b zC|%n9YNgA(Ex)58`N(+RSm7fRUF=^o^j*~RMH0_ZtW!N>y~%S>I)CZcRNnQ_f9Cjb z3g?q&xy)o9`!wihfIb;?%%=%-%%>A{%;#*-=Ry80prd^o=xCn>9qmy^`%4UanTPao z{0Oi=7vfyTx}LYM275eStYlrc%k4^+_SMI0HY@x_1g%p$;_>|WRLUMU7+K*qdr4P+iwOP?Pro9+oLYWW!jDN(ihmC z%7w@G+gaD`^RLG?PF+l&4o*7dx7BiQ44m&~iW9r1fTb?SF3|Mwd4nFg@81vkAXn<2 zxH!*j@A9NX`K*Eb_W&L1|0AsH`0H4wcEtD_zOWeh}DC;_*nksjoY%`eC5U?@Cer z(?EX<><4^djbz;vOfpx^PqnQ^o^jO3wj+7e#+-O z&~E_!e9+~+W3o?!-UjcLG=aVz^k&fOcoLy}T0p-Q^j6Tv^W;MIZJ@V-o&kLw=yEKh z-IMn4MwHKdzgqZQ3i<-jUjn@Y^fUQ_B;|7f==XwtA?TBM5+wUh(3gYW1^O1yyFovb zz6iQR`ScXl{Kws(XF;F9lPuX^1bQCy9OzGj-V6F{z7R_JvY}D$p+mef;~!>^}>78|ZQ^g?6t8{Zg>6rJ!#C{R^O<$p=Ns=L*p81^tVlPvQ&3 zWH0BUw0k+|Ujq9rpkE34xqQ&2e69k09q7wJKf2m)kp0!5-wpaTpikw4G}&JZ`YO-| zK;QKMzd`n220ah@b)Y{D`t_jC{;=Pmd~N`J4e0*}`h-bi_BVpQ9Q2z&AAjJO{R+@8 z0sWsq-vav0py&CdgyOsf^ev#@3VMFB-yr+1fW8IvuY#VRGG_lZ(6@j-2zuv7#_Yci z`tzW#1pVBD$LzlW`bN-Kfj;SwG5c?V-Vge>K;H=ZZJ<|89m{7m=xv~X8}uR2ZwEbf z=vY4g40;vlYd}92^gBQw1pPapuLJ$Npl=3!2=qxceuLWgPSCTUe-HF~LH`%fQ#HMd zyYkH~Zkf;dALw>MRj%XBu^kXqBxozLXDn>*7^$~5<8(!H&@ z=6vgdpiQ?)(=$Re=VxECKB*jZAZlKsUjrA9(ImhUUc z5O=%$imf6#)ow{8Col?_w5L@@0h@x!!c4BEr?(@&IAlPTA8A04`58ACHTUIv(%D?5 
zJ)>qcDHmlGv@C7z{Nik;x8q_SD8ZT&lTg5Ih3SF!s0pOFubkN3+!Y#{5<3yeB%H6> zTbS#`lcs3rRZ48{a7TfXGDhvENxiI>NEXGsi~Usg=<#G2l+82lSJA4waFU^_u;1^_>BFE)=@3ztVy!!Hnkg)Mv!|F2Mi;T{&AwqdM05(vC89T2E)m_pa*1ev%O$GeEuW|s zkLV~-4Ikka+5S;}5#1odEwVR6_{DUJ2+x?l5#brrMIt<7dP;<6T!)Epwfc=aD{Xr_ zhZf7;>?x*$(M2qKvu{`q5uL(viRcZMOGG!YTq4@va*1kq%O|SEBRYyTQxGP?EwcTi z{35zRgj-~9i13T)6cL^=eIvp%ri(;)#`Kg3&$tc~;R;_)O3qyi9iZ@qx4o}76Pz>> zx<G6UzMr})+3t&dXkIL};0j8&_H_29JKFjac=WjIWAv5IT~cdJ>eJWV zoC{u>Q_-}zYf6a>3-=)6ESue>?*4S7f zOk27)+h4c@*xDQXGHV$J11L52`8Kzvb6xRac;l8B<2DziYvlP6`Sp&OlwNU?J3RjS zP{ak9Qo|6+U9u_MEh^kjs>$SX?sh^?cWb7mZu+!o)5_Jl=`(9X>a^)0^)Pq+&)udf z$+xbqHk9w-GwZzVe0Sl$u5Q{)HtH^11Si2=^rO6dbajW#II?tm`i#=;BRGRxrnN8E z+p*9UsjjxxIpryr;2|l^ zx_k1Onv+jG>(Ji3>wH|ZnFZ@dRhIm{|UU540QQ_=yrt4-_PZLgF*FfxwEqs zzSRG(oEG8_P^K<5_(kzk^Ph2hLj3H{JQ(YLS7)pF^!GAD{1N8QE0X<+=5ObmL;OMJ zOZ-&+-2(gr41WC%p2v@26{FDn{SE%+K>5j6m;We(-^Bc@i)6o|`O^)4)dsi2CF|hvv^U_yYkxJ(8#S^bEQCS2q4m<^i$O7`m`Vwv=RfE&J>7 z)5yXxV(8scI{qu&{!ss^c-%WSu^-hR^#vV2z2{Drdj7cPJ>C(Y$s)z)I<}-~k~AHE zHG@{fjDLEcJ{11~j=!465yelB>go7Ln6KlPzemrHGg9bE&*^{1t{g)oq)A1j~d>#MfT?+WZnD=Y`Q{t9ogAso{`^Uvk^K&Tv)Gx~8 zuXZ8m_~~6#vefb4#C(2?)d%lBC2m@WufE$s==3+ zD{3TAjzjbBG5l-a>m3ecKUrx1yN&oqIet0L)w?(ja+&D(pEUSQ%va%;%^#cDU$@@^ z_CG=b=cVu^--+(Tjn?(}8D#%W%3qg%59c51e-%%8g7{PU>5)Oij`Uj@ZMB@>Wjjc^3OH+mHa&J41~LN%|FH9&wSbQo7kWDGu^u8=M8=% z^N(VGS!jNb!C%4r0pQOH@NY8s1I#~2z`WG_>kR&sQD6RA;2#~}|C_-dPFVlnH~90I z-vs<)0{o{9zPz+X->)(0@;_zp*D}9>ovHon-MVi7KO6j7nyg*o(@4MJJ~f~Ij;H%q z$h&AnE3f$amwrzCtFnR>D)U!M*IWpeOp-xZuf)aB1I|3LPag)V=Ua}15YQRYiOCI0CFel7EL`>kO9O#89g>?aH1i~kes?-r$rzc{?a@7KvrUH-?MW2pRV z=|^x}s-EaQsr=`-RpCqde`mx$I@kM6Q0HIbFY#Bn0~D?6_8)(S_vh1izu`W`PnHL> zzb=0@`~QN!2T2CH{0BP!Q296Q?ng9@4@y-2^W9S7m-3&$d_Df(Xd*B7eym>c%tVwup{wX+`8spZtzE$|8)WL zQu8l0_?3HmzSJ}E+X8(0yO(ZPS^rrB zk^8CnOY@rz{>XlwU(fMV`}YL+S1@1Ce*2;V1%UO zzs-oh`h!03wG5>Ab1om@OaJ|e!5_ZLLywQ)uQT|YCUg6_{!NMadAHR37Y%;pZ3VZ~ z>3jL5eCqN)XYdC<>iN~cUl`!;uv2*aWbgIF9i*W+0L@Q1|Iqjup6>bm>__FNbzS~R 
z27k#9lJP%i@Mq2_=U*J)pJ4C@=*FT;l`-W%#^4VH_?Ni+bouG;`VvCVUxPm)dOV+g zCo@$4Q}_o5vHm~j^3(jw4gSzD(cNzk`bF_mxBsOEe~|eN>`Cpv#I0-oN{jzv?-;!0RKgUKXQLE{&NPuKfu2tz^~Xj-2W;c zNXFmZ`G@-7aDcxoz(3gF*FTtyf1tsym{nf?R|fcV41V?@&p$M#{TmE^BlB_mEf4T7 zHTZ)MC*$`U{D%YlYXbZ`4E~0nc>WPF^>yPsFkL4c-@SipKm3-n5$NzVOzc#?9 zb7?*Q?6ym||2MAp<&PWxJ30SQ|F1o|y#L)0;2&%7m;5vtf0n`T5Abgc@N))#=&@w{ zE`vWD;I9bqziIGCewK_sXz(jOR$l)%2l$%|e&yrI_)i%8#sL4;0DqTV!~L)R=gIgx zI{#4rTM^)YHNZdC;Afxk{5dh>f0n@?VLp!k!2o}u!5`k3jDL~ApLb08{IfE^zr)~H z{=)Omiz)wVgTEobUlriLVDPg~CF746{Dx!8%m1wae?ND9iHfMtKR0ah{DzqF@9q3U z{co80c>c3Gz(3RASN}2@f3Cr=;4iMozdgXe#^7iFBN_io2EQ@DUlZUzZ19JlPR76A z;I9bqzZ2kZw_CXVE1ylq|Fgj#3Gjyk{G$zi(|;!8&v5>s{x{{g^7;R+0DqCe9~$xe zxbwf9!Ot=u=fArH{M!uv$aBf~D-HhU0RQ^|{!<3O^4FdpcmDNrgWq&~`TVmsz^CVA zLi11K^U3(zJO5DoZwm1LHNc-~@CRP-{J8VK!wr5TKe&wZ4_#8J@Yxoh|L!pO!!IV| zw;B8u0X|*Q(0uxR%~1W1{w5j!CWAi`;137*KQs8%zfHz}#NbbvT|WNr3-Gsd7c@fU zZ~U)h{68D~Y=FNmz(2y^FZo?Eey#Hl_5Za2K7FOu?cZVWhhFmhxbr{yz10vubz=GW ze>lJ&H2A4MB;(&=@cRS&M*{q(41VLwo*#Gq_j7|k8sM)F@OO6)n1%Y^&}cIL&dxtn z{|inkum8sa{F4lR<*S|_cm8*r!5?9MgU_aL{d+@z|2c!-^v7iU#Rh-M$>rmpzEbJ_ zzt-RnZAr%ep25!s`1F-V^ItXisn?V7e{b;D2Ke-qM)Rlc8Sa0L|DB9K+4+b1Uy2?q zcM02{zEWs@yTKoPqu`#3>wnD#zahY$j5z*}4Swnq<>lWT;Qz_skB(1Pe)?VXQ2RFo_;gCA%RhBOxc_H&Ovaz={6qYK z0H02&G(T%3w`MM?;3+&{mJtAhfZlU z{}F>Cz8JYHM(mu{#ypWa!z^qw*>e{8vLQ%lJV=Df2jX0 z3-IZbLf3!R;Ai>9H~q~oOuGFqH2A4g%f~+*(lvjz!LQ!agX8$$F!&7t{+j{*uMGZ% z_a);$WAFz8{I>%9{U(O{-;%wO@%MKAq4wVt;L{;h*Z76_1yCIr-7#V z0|tMjDjEMOgTElar+1|_|51ZKu#e}z;uo&weA4~@L4!ZUd_4c7Lz?FAxOcey2ln&e zJ$y9^zHI&-=lnzMKc%sJ{-Hyf<~JDpA%5^YuKXt${J{XfBEVm2@vD=S|1yJLby|7( zcMk9$GWZ+#!No=H`3g+B{qHmQ1I%B|2bOxrrbCLZ|KA$?#;Lyi7a-iNYd-y*lu-N6 zoL4^n=#ZlM`&Nbf-!Stpi{Vdn{vrO70G|#inm^az*Ykz<&KUkF2LEB^*Rns=KOIsu zf04l-V*Y1i_&J0BMu0ydz#laDwfy4n7i0Lh82l-p_U(`5pBUgjV(^EV|BO@^$D#Y* z0|tK{^XdH_N>u+erAv0st&m!7&3CmAYy5{fi{@xFf z^!eWi^Qp}+X?~US5B2}bwC6XmGnJpFbj?4-;HQrC;5hz?20t6%)0D3Hiwu7CEYIJF zg?H<^{5gZaA;7N=@K+i96#X!WOL6?K8T{I&^7^MKU6=oHgFnK2{u)+@g+6ro>F-5_ 
z+J7LxpA_J4G5DiL`|_VnnM2b2mks_X^9R_M`rknT{+|2z{yz;EwAyf-=eHd)^@ zr9Q^~di)Hr|Jh>5`RMp(I{#4pEot`cw}kyEe)^l7!k76YZSX5k@Q&{ASdujV9D|=~ z_551)CjKD-{zB(ZF{#JwQ&X6)ny*k+3~2dQ`mfwd|Fv7`|JYXgKfjg!Z*HakPVV{s zt!ck~w$lILt@NL@mHww}r9Zu&QK&!F(WUFbf6`^aP2 f8t?y^YT6o0ULD(Zx37Z!!#zH#`(61kY5)HPUPa+= literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_malloc.o b/third_party/libxsmm/obj/intel64/libxsmm_malloc.o new file mode 100644 index 0000000000000000000000000000000000000000..b6e92b973e92106ef7da19ac4a297ba9660bbaf1 GIT binary patch literal 66040 zcmeIb3wTu3)jxbD6Chm7iHH^}>YzbG6->BjB3LspfipTmDxi1`F(g6-k|q-azTA)@FKtOF3vEFZ>RvD>^U_rE+|8K3m_n9?kX3+k9-|u_A z=lM3w%sIck_F8N2z4qFdv(L^AHIZorKA&ZR&-%KRzB8z0t$8hdUCo!(R>E3grRaK8 z+bLrgExc@5+v3G_-&`0SJC_4f1U_rr;xwyP)FKiE-ewVxD8yuXv6U}95O z&s8C3^CNu-PUl8q=p=StbV=~y;3dIJH#DuR=!qnHo$kYl9LXLc3Ql`QpKV!JZrt^| zCi~bB$1&X1({4M7S3`;Y`ZjvFW^bTfx7)`e@2PNda%oM)qcw@$u}6LDC=W=-iNc9T zoY+Tw(M^ezB3S1~Y;4EFAgJgAHB&j7>S(9)6GRsv5(*xIEThpP$b+Sh*iLzWd#RYQ@GzSLf0B5O8~{%eX9ZS;eO z_$u)n*W^kkvEPaPqv#=`W~t1|iES)$5}TPsl@}8@Iu&S5m{qaQNqoX79aLBe^hd&r zI3>w-60fNnoxthZX2%BnZ6y}TQmoK8fHv$DYfC}ze%23xSFxIBA_OWqfj7t8| zs6UeM!6-ZEzNVt5qGuPI3*B1mDH5N-Y+RcfB5lfRPya4GwV%Xi6@5nHziYpc_laZw z=2bHLw2GsZib`epA7m0O;Yla)V#PieOl-Mm1KOKTmFV5|Jo0Pw;LPW^|9A8EKdoOR zJZ0{z+S)+Px#ym9?&*P1ZGp1-XtZ&0OSEx*Ale*gU$VHqC2(1LQ&VGWU{uRVqZYM~ z9ytj9^$VqIXrA9FQ=!?T_-P|9a^8)yqQDTptT^dL(DlK!87ncU8qlhHu zmlg#R@7kjyr6rNkZ#%wEg0a5`?5-c87$eCMrL|ozMTa@bXzB1s;%&G|JAN+tmAy!%YtgU9<}3RK}3a+sv?5KyTQ)CQ&#MF4ep)fwWTFa_w+t=MxWZt zmxEtT#k;8^IBFsbSwr7Ymc0Jx*PN=oVSD;VPJ%TM$~sBzKbi0;XY>nB)zg2Xl#V_9 zaVNp*2$%)t!hTdd`YsBufvJwokYs%W-2GKKBAPa=m{rJXM)r=@Zu{rm2|R5 zVn;gJuFZ=0?N7OrT}QH=%{`FVx7)FMrxiKL>xVm|w?_MMKk9U!vlaesk6ogQ!LI3z zB&!1$tSpN(+3)x43EN-mX|l(1?9~O)^Y_>n^n#i*+GL-yzIE0f`(h}2X-UL?%$|sS z^474u?&C9St6pq9%Sl`x&~{jx=nclU`-o-usZ{O2EzEe2J-27qUDU5imTmVjAI%e- z+8%H=Pb))4Fxq}hnftg8)1BIiu4JL{bA46CjVW4VU9#f^*W7cIPe ziF;qaXi;;6xogO{Yiez5v=-JO0g+Jhv2D#`#{trG3+tE!_AuM&X7idei_4h?3H@kL 
z*Q9QB^=;lv1+Jh3j}KkyOPm-pfR#6+a!k!-q}X7n+Xp&bFWFrf#8}kuHmqC+Uu2Q9 zR@JO{?~0+Z>;6bHkNzMh;|x{oE;eFUod&4DK*=Ms(4G9JpCA~hOO z)UtHgpnpC69XE{nXme|ztufl(61Z|SAXW-PB)nkhyB#5K-11QTFX@ z=4vOd5oN0p)t&A7PgW>(AI$S%%0IdvHngt3_SH_3)mc{!Ts8$KS*!A~tIIpyS-4~= zoay{P8ysqVw7s=4(B7s7OqVxC1JMPIf%%P1_3evb5sizRTdxYZ29q`gNQ3``8El0U z9k_)@MMciQ-FAEuG=!PJNMIvQ;&C>QXF=^Gxi^e`9p*6vL``fT!9-7Db1=5Wx4Xvv zZH?9bK3UVxK+YDT#y!Er(-n^fV{Z(I)&^sr_@ZCOcR_SKzWvcMd>2NK#&^HyP<;1~ z4%oYrr)PUtV$PKJtLm}nS18hB9aWFDr?hb#nKq7V$T();OX(x*;}!TYn#i#asD+0j zd%-BG;Th9U541J3)<+u_q$|+V;`6j~!$*&+U3v>ryu?b5TZw-h#vCn+bs@P`*(Nkid`J@1lfVb}qc7DW;;_w<@kMb1lH>T#Wmk8-KiQpP`6gy0_ zj)vT;)5#d>snchR{6YAkvHE?k3?;8tYD#a-{V-$^ERlyNpEq2KMgiAq;@80|kc zovQ9pwYO0&#!0Zb>49@<39_cWQjq<)10cs>zpI=HIc=8f2`2|Bdmc=Q*<;g8fvvZO zMnE*YjwDAxNo!pdzNATqlS4{XY8sh{_Lh284T3?R&pz||Gt&S)B*y+m3g_Y@OJ$5-WN&ttq~1-_}E2K~Z=!h7J^mHr#(REzM#` z@h0JFeHEjM5_bie+zHl?uY;&-NmO6-cTVTq*6zMOUetZ49I3hO&~W-#$aSzdP?TDX zhQ#a2`!@NaN9|kR&+b|V2@F4CL6Q^CI>}{4oo`j6ABufeZFhZx$f_#sIJp65 z(z4-p{Ob&13O|u9sIxv97rFy9{azJ!hY@?K`lUH&)l;?hl*ekTdhOU95Y_p50hi1B z5EW75q+o0kMh33}2qjxkqR~=}ryj07q(tQ@ShdOS>OriNoK2a>K=QwF$ED~bzlr7g z#axO5LdkEIg%T%Kpnx&G_!FnNuqd1uQJPVhW}FwR9AbAp4XRWFv%~cNLb_{0wy01< z2%b+Cq1S+lBFWQgY2k+Ea~SYM5`V2lpM6nelFBqjK8fcr3JTh{uGg4-Yfo_C`kMH* z_KUF!XUFeBN+id$?*-`@P1fjM447Rryz15+}%p*giPL*a8RYqt7z&Z$*x$Iu1banFMG%NxhE z)L-5hxUxRl*t)pB^@>1iV|~K{40G$FSSVBjSx-Nrdj}BG%0WHNeF9DO3m2)eackqE z#`?BKJtk%bp7aZsG&HxiHa0{9jmx5~^-CI?+uH*3>!bC7wwA_*g-r_;&-5VLlRhm= zD}d$4HlCdYnikbxe#WSU>7;Cx!Sln3=jfrCS4JDi2bS$ELWCq0`s9Y6QL z&gOV&2PX#su2-0AxGyZlP}@peTw1cP*B3oe581W*jXfIBV*@9Fg-doq5u_?nV!C>+ zFQS>$1M~nT1_uSV_X~Ypg6sdnb>i`k$9wkm7TR6g(fo}a(qpwUQe*!H_Lg187g^T6 z4TX06EX+N_-LuD@5=qiRYU2F}c{B$%!y~?ro$lY4b|4vP9Z>d`*bwf%2Cl8|Q~RdX z18637Akuwv=}My7|3V}YDXnzyRbAU1I;q@Ayc15gTW5END*Ga%*E`3DN~;~;*4plC z{pEP@0{;<&pN7|U5^-ihZMkOP)=&ZoX1Q3T$e}jDHvDUuL zzV@lX#LL0ho4$P;u=YD?ASr@pmNi{(h7$+u_|Rg^5+izbHtaiK9MDY*5rIap*O!vN z;%x2P5U{&!l(|}vMiG}diFT|^*P#5`1B6zxBgO+vWMc*NiX<*6N?d}Bx2p$)iJFt) 
za1+;AsagHqae8g?67*&m{T>E~ih9{fg%ofS532e=|Ft-)|3c-cB0(LOQEdm%qbg^@ zcLc*_N5?x!RVUQhKHK>g8BXVCY(DY%lojf1?f3<_Oy^gaer(5UY>cTJFs#(`ip*XN z4%|HMh0&7fzyQ%a$b92Lq)LrohY0`3^G&%L-aditLHBIys4C?km+C#7RC713bGDO} zsN(o46d#5Mw+!bZn^h9-hTL$~-$LDU`l{{k!;wE8w7=gQDtgk6$I;VZWu(|%el@NV z?^X2lzE|k+xZw7SAWY2Mxlpk9_nj@}q)i1HS#y8Tc-YR^hu} zv=ra{qxRmQ-CS6@2F}9@+K2k-c1R(vhWMRWPYK-)quXKb?FhOZ;og?gZK->EGTol+ z-cF#~34H5RZHhJ%zsZD6_V2B+n?r>qHTIz)e+k`03In)__ycq^yRaNLv;F0Cb8%rc zZZ7s$)6M+CnYfwnpGh~13g_Wwk$)cDL!9e#jwMwKDBSThv6r?TM7UV88UUnkik{|v~THVcvJUG|8!6nPG2$1 zU-gWA%TmBj!^hoDVGXD!*Q^*asp_xxEp-fk5bB=U4|w5(niYlPLqtCf5#feyp>C&t z(0*X@u;7ZxhlMD`7>+=S>H&xtG9kEP2vR&^-!cplwWMZIhMJ##LfSvs?F0p548MsW(B*A(YudWY!<7ddKd*6A?W zJkyTfI1~;qGbxfRs8GXO>{&4BLZDh;63)Z$^sCBu1=f+_k|&%QLVI>)^97d@v&Qoh~aMNftpY+*X594xW$aJJCMGeNFU1e~+ z89J|lQ|McVt_PbeejI!BIOZ9~nl$ohca36C%_GF*p_Kl95b9oaGjDxKuL9z^jUC@} zDD>=pu$1U3dU(%DQ^;;+4%7oX*qRcuN9!&Q4sI0)Y0C=r?W=K3fp)6kVK=k-y>h{){Z}g3-mwX%^N6_E75PXpKQAN zCx<{$bo-c7#NXyz(ii~}(0`!MrZF!1=C;0!F-Pn(%p}@JP*sc}MMKF(3w@Z~HL?&T zuLqC3^n(pGG^9z7O|hRQOw}8(?ByHa{DX;?f}L+zxC)_{!JOq;jE&*I%ZaZUp+raM zAx$V*9PB*s2_{_jEr+18cw8G&6A@aYh?hZu5EL&DCDwO5(SemkU(l}E3VST)k@{{V zMYo^c!vjY>b}SVr9%n_mhXgU`cJ{v-9{B+cma3`y2w;TYWg(rimj_EbpcyQLCUS*N z@&=4+bdymwM6*F-J^MK$g(%zip%~jz{8LhaddS@oAI}3~j~6-JGco%47^9y$bRd!L zTe-HW->Qr~j+%ggifxg^Yur7dNuU~i_U-Fq>uuNei&C%cqb{);CZq<~_#F;itkiq^ zDZ*P>@kM~Sdy~&a&j+}!5~;Lx%l?n0hm%D@W*NwXi5I!Ts!^MrudAwr;x<2+K}@BE zn0*_H;3<$_{e%dGSsC%|aVGI%lpX6s`E)*NS%p;jB;P{Om!|?$9Er~&dM_1fC-swm z>iG;=T>O1xng(;2NN%H{6SE8Zwna4Kv7$*GCiNZ8`|sn^zFFR3Kqx($i$BQ`Ef~5*OUv{iJ$})LlooeQwdH{usxbnmxe{n_Ej_=AbaM+qi*>r$cRmyV z%K1;cpw~`mdmiQ#%*#nm;>gsM;Ah6m75k+5e1;f3f9_W97Y5ftjTXCWa2X}J_+5q{ z`n`5Kd=pJRRIvyDL_QRc=8vbjO+d?Ci-Le(d2;W)_U%0*XwV<;50)YIv?D=*3-zV75_l3NP@*h7XQEc%~y%X`Xy&GeXB`8i=zP9b$UB8hEIx1JfvQs%P7Q*CxTy1XetITxvU+y%+ySPN6-9Zpv1NcirE$}M%o4%tiW zjs4;7S$(PFxh|?L_3ewwen%$k$ZAQ9(#OA>}JGIFl zr1vC`RoOm6_LwwgS$1(62_Zu>v1RiZ_OhYGbDSZKf_N^r6rztb5xYiu_Jh8b?n`(- 
z=+f`$-uSm@6uN7oZpgmnRk%1@p>Oo^_7p{>Wy;ym$GxaVEFPQi^7<;Tb7&}l9vZp;4qW&-Sw~vU7+A)L}K_RN( z)LKNj^DV>nR%TnBcbw!IB*nL&@vC7FwI;6mGUQ^VgSk-D>7HHMLS1|y+`aZjGShCf zu?V8B)wZO|9Tb?}V9gND35vE*O+-oMYjLrxcl+>Q@9T$+0)rcL1Ov~zh zy*M;*E1;eMR%l?)z^!LY^Ud_zH!TC7PL!%FmvHD=^zR0EgBK7*_?+3-Rrb;(73a!srBi_rrtY*n$308dVR&EMnLs8f#ka zVBNuUEAX2zIk;lNB)Xx|H|PrahQ{59IoaF{!Oal9*>&zpL`Ek2f-54Eh(l;{L2yNA zQUTJ>p6st#F?*69myyYZH6Xww?14p-`vq4ln$!=M(aHUTE25M7gCI0{Kn;=&faoH` z6hZ(*SAE=mS1S5G`Pz&Tp3~jZmpYPqN4uFl3dK7-Y(-nw<7d&%xcR9;iX5=g`sKim z(=ljW4KJE*W7Ou*A+2~91)oOI=)TI04zgx)?Nf83j_VQ(SY0snN3O1F_=Pi2!pi$| z|27pR-cebO_1Td9K+n|fVb!Xiw|~7`byBH1*q?H@YKT^Y4PJUy!!sH($a(4M32t~! zgGn>$ek6R6OkB~2V50(;z)Uln*|vXM?DY~ec$G{n282gy&pr!N*I1;WZGsm!-RX^yhD?NP>lMAk<)2VMmNvEe1 zF>url>|FG{>S?6kzU7a5;HVDZs|oE{bNFlYB3reiLCh1J zRrq!H=PQRU%_!JW1rs^}UL;eui_*H?2i?-nwE)v-^ew5?7-T>{7bVuM zI(k}CR0`AJ6YkbY{f5Su=?59uI{&&q^#E3hq_Kr*;Sgr|5@ks}fK8nb%VO=|Kq`8; z574g&aP4tdVBbWKI#NHxm$ajr^we4#A?DM@p2*!vBm5pR>PehyVjaq`LfYQRR`@Vm zp`K2A4DebQ;K!5&)%J*fChj0ytrdE$kcv~ysu!Y6b)oS(;SY;cO*zI2G1Dav?X7D} zc<)iuSYe03j!CHO#ppZIm970nq6v((F+PY(WsC*(cg9iYr@T@vHG>OvYNmP z^umC;qF#dg#vy1-BvH9pZ9MS5s^pX>3n!jpSDP-jRY>k#Ct$s482o5HJKO{pn)*#H zKblz2ru{TEHf`F!#K=tNwg_E?v_5S6as4NXCf8a?{BQ4>Ow%xcX13`fNPPu0m)W$i zvmH%qFWQKazX|d=OJoLIpeS$}qG1&Po>h;~6h=LU9iRFxrb}wzmO7JCcPkIYZ|=x4 zVOPa0bpu@wWmCVz7tfF3$6T2q3}fZEP;Cj+s1kcOd_YL={!2Bm+;k6RoW>KPWZ!|F zvZeYjP(PsWbdsyYXp%RtkinHJYx%4=jvX52k zWRyWF@v6bx$0{&2j6A1VpF-)en^dnn@hHIQ)(wXS;d=K8Tjs%6uxRF4nYs=WNHFK6 zB2vxd+Ds|Doz<_hoCH0h9i+MipEuV$ve<3haaOJ~M;7~w zNZt4kRimnFJ9yququG5O3Q=V4ml=JYuBsJf5R$t}Gc)%I4OomnPwm;orOpS1viLr=+uDQ`Ua!N?2;@Vr{OOJqEN919P(MbbG-Cx`Y8t#T?y@?bP;p5m?ox}&UPQWF_GGu=4L18LaO?qTCTGv%o zlB;siDnO=P)gn^KXrH&odyNGA4L+I1tI{jeE^|Bn2hraIVG7JJOO4Ft^&Dytaut}G zZ&Dq!<6XQHYJ3eH?-w33d_N-)Dm$4C{lk1Pd3=cS&b%H%lGnjRs|fuoAt%qYz_!1wW-)l0z;ISM7R6OTg*T5g&a||& zpOs~Ynv2RZPYff8&6zp2X+9W%!%t5N@xV*v_f{TbYUc2iY3oRh)I+72tPRa(50>)= zG)+^}&KIg_jICm1yu4zGvgQ0z_MPmShQ6$YscAeJ{3&&3x~6r$pQDLld1mHO-B^(d 
zxzk}(4swoRhG!x&Gqsi$ug#kwa+>!94OIWHD|Itm%D-Q!wAgBLw^rG)r&f_M*dkYJ zXy9YYlu&rOUezo$XT5$~&kanyA`R2!w&+jaNS6wmj3^mSmU;^mtq5m8sXz=@^s9o%z4PHqf47o#Z~qMI59oe*|LqBSSGW+kafa9zYjn?q-B{V;;u( zt1pyzik4ng-I;a)4;Y6Tb1PzPj1!rg=|q(OF{$Q8uloYFHj|2UX4=-&ls>y9%@swu zWjp0q&7M#~PIitz#xzzGaSn=S;90Ww^IhH8+B~Lh;nmoQm$`+Tw&=}ojAGAS<9yHN zIBHXv;S5gUg0=vKaIc;Z&}+!C%7QCP9JS#VZ|z`z}sO@gNL4z*zI&*xs?QAzc6;nVJ|BU7X0MBX#?2DNJG1EPQc5q zTv@z#S8s-5)*$bqw`8bdi;~mI66-r(r`=7RAJaFUW5=$?X5z!sFR^VLe$gfR{mgwE zu?_E!5P{c2i=3*jM_)+vcJAP(kO!*U|Kd!#et2|~Q}xMGOqu(l51OZpYRxTDr8<6l zs-9df!NbNac-Yt*#@3lw5^R4?dv~L8y>Jrb2aRobB9mOA9y02^yPOZ!ANxf3B{uzB zl#5^r5_q&nEp&jrwHiB=kEiI^^|V8II4)Ba_+o~vh=}rQ}=E8!2IUMwg4qtOz#j;F2wFqV2ei zn@Dmo7Sd~XhZDPMNAJQ0&l%klj_l-Z9Co}A1Wt0!cHZn=hxM+rx<`CfOmgV}7pLku zJKly1-r${V2k}dh?pa&0XJI>};GucwY4hYe(f+(L&6jBGP{igGN#LRG`-z%=@!hTw zyzfEziBJ-cl+g^I0ULhagMH5>iRaB06r7i!)_bvUed#psi+L0Ej+V~1Xdi=~z0k`o z<^i)E!ft!)l~&I`D|%FGfrN^egp9Obf}%`^iGp>Eqdt!Q20+*Y-1= z)}VE#A}h=#RgT8v@ppFD5!9lT8(4P^NHdcwpO>N+c9lQn0lq3gW}4VQPOcJ6%z}*K zRg*k2S;f0*9s^S{99quiLj~Y%dCp01Ezvvq=CvStb}6N&UkkbtRK|~}hclt%5wwXQ z@d5ATBtH*Z<4r>m>}MBqHrJM>az3wnKJ^fiq@Snr_6ypn&I?5GqnV1mpf-iXAc7LR zcKw*^CDvD9FAqZ>7KW@aEt$*&&I;eIJKbcZ;w7z&+4++CCB!4v1a4E)VdwfenKBJ0 z_EwM6`+9!|n_Q}%#aNhI1r9brS^t2p@s`Z!51d%4+UZU8kL~c=@t=SroF{f*cWohx z*rsX+=hsk*%lV$3NM6rGcp_QJq|uR)hE{cA?W=#e-Z0O?nts1+joT~M9{5Jq1No*%Z9#r<0LH5y3(iu*#FAQ@?=bv%f`{v2W*{XLW}*}*?#03%^251}U287kUdv8S90C02?^9+tRxFR34k0N?eN zut(oxRHU)_gg?==QfpzyPa^YzChP3wysydE!+eJU-gR#}{RYUFL2abNpyDFnTs&qwThRLY?B z^MW;|Bj(*AV&smOcp5NMztT(n-@}M)TAB2l(-G|74sPE|46B&LHgD8Ro`GW?h?Xf$QBD-4^b?ek<=Iq)E%h ztQSYfyeznA_3kmfBC?7%#|D$tWr@w5ZwJ7uvyZ;9UlF;$Bq@3r%`}$c2Aa`K;|g5X zV#{VN?a{mxH^Ibuj9O{tg!e3}?XI(kfcI@y(^b_I_VUx{g7ztl#f8V!RjJi|dn*6d zCg%p6s>d9A%2Sc57wy=M&}9@&rLfBz60|qu^ATRheGT4%q$T0Ec=*cM<98r8F)gjC zQn|0;yX%XnESbg>=uNyMA@MeYv@sj!+Ki!Lde>bjD0&sozsV2ZuyLk4pk-1%_aZfr z+42Ae?}0Df98SEY^QQJwI(omLcI0ADrP(4$FKJK2= z!5tZq8@#U_>zD4`ZqU46hYoYN>h7G)>%kYhI@i#TW7n)FV&hNryiH9g>3}l#J-^Hs 
z)Xj|1q;uzsnPixuEv{ckGtLE=>WLK)lTOR?q-uiH%jz4h&@)9N40_W0gxkW2KK)8z zF!9&mYvH_^V}^mxXJZKyWYFj-?*4q@;Z>B(`peypKK{hR-PMN- zE|Pc_9*5`idXnwLUWW;uj3MM~^+IcjYWP^tRO_|WEtJs4=OTJ9dZ!(qOKmmrUdRkk zQ<21m$mmVn4?D54% z;gt$sZPgog{3tA(guCZrkaZ2ZnD|`KImxMIIQ9$c;??$Tn|ymBRogLWq6jiDP3Ncj zyr>LE#?cE%xApkeGrLb|hZ2#Y>f(2jmsa#^P~J?xkLVR&PH-aW+dUP0(ZxB54R)Nj zWF%Y4u&o|h=)*g%8$!MpBUNwN@fVrcRrX#mb&_qlD7P}DnlC90ew?3}-p_Hm@8}^4 z(wok+Q+jvt$pTe>b?guhlfnkB_%O<#)VFIURS%PFQDgQG;1wLVNV_g{Y1VW=5p?$p0jBN&t)AcD7{m6#Ro9f;U5}$+@w}IGodjLgatpeq6=QIx_v>F# z#IJpx(>g?x>TT70}}>a+deb3=f(rYdjwCJ$)Z(~abhm-C zy00D77sh+eBcF_nel6nLA5NT^nuq6Zs(sTjja{_9QMC+tq}gIBqkVJy0E(wFIELa6 zP(hgYA2|MPARJGV(9{buUOU=YAs#03{%|@15f1`t6WiHisD>XYEpnd&(_;cY0FlOY z$JF9&lHC|G;PvU*rMuB0cTzn$I9JgLCr;tBDf76SI1ySA-;)mBHm$71 zxw=?-j=hQdR5S9W>sc9-vbQeU%$B;7eBer>|E;t~;>1~2C+?0@IsJXX(Cx(1e5nL1e6G*~X z_CXPq!^YTQRbnNc9%f^7I+1EWS&n%;jRIG}(rQQdhOr0~#v%}&k6~^+8?7!}^?Y<7 zTvQniMsSL7Pc;&eRIHjH-2$Ww^Zs;l6S%_kc@J`fE4aEpt1G2t$KkJ zpm}Rr0mUR)&Rn|>N}9p-vV$s+p({Pz$)!NJ`&)3ySV(~b14(T%sCw-IxN)jprBmOo zCtV{x9LYJYI?_Eu#lKAjOL;*&X4xu!c?Zx)_bv3pZlf0$M0JQ0duTPTUBVvRfRKKe zkXEw{AvGSC5QAwVq#qDQ+YAv6e_Wz!MMOVDG{X=rkVGAdh<=D@v>{rZCW1n^agq=4 z5BaJmVExYKO=wbmc%yXbAx>2<&THM`jNXFvJ#$bs}zs3#&V0hx<9%m zbci^wbiFy<6z7%p+BZJS=*7Ks$Y>7^EB)w940TFqKDOc!M9~qWbU5h?NckdCykN&} zpcIaM@`gy2A8*w8oy}8AP*T`C(oh!m^(3yr0S>$2f#82oolatWxN5I`(_=7acN9~B z(GPqXx98#TWz?M2AcI;hx76V@c)AUV@QVBOC)WD0{xG{V&=kg*l5hCzLmP(ICQqsD zjvn6U?0+#l^4Z$NY3L8j@P6n3+LLo1o|9uADNTZLV!Yk;AV3_Ym%0LuiJkCexL_yz z0g=zf9Y41a{b<+yh}rd+8t*1&;Sk~;r)n$ap6i{&)Do(b|DrmIT~Xv?$&th>*ciF* z5jcR~AcBsU<%xONw@bf%XvfN-mVP%YRf=m?<++cvDrb@^kK)elFZOS;z$$up^}bD* zXP*Eq(aG&Z$9ohmgLp=oocTOnx>3{XkE-o1Kao_;+-k=UfR8h2=1#nqNmo@{?d5OL z1+9_1LKkX@TtB*WGAG(N#b|e=YJJ$Af<@?$ZM-vussCGQroD<1!-bZ+Z?oYo$s|DX zt0keN^DlZtg#F(<>n4wlw;3-k9Zs8pl&8k~k~sT^PJ=*YV=N8|@+RdtRU4gu>P+x5 z{R}?^rx5|I`J6zK$=@AK(vs5ysjuLI%kcXsx?oi`I^Auw+boV-s!t4n7hAs6#b`X} z;py$Iwm_}#)!M{|wWI%nV^KJm%;F7btI(d%^Pt8*8xL#8ZGf&+K=8Ca7GHw!S 
zHyZ?5Ph`#{4wL8zljVOwS{jzy@vi~oqnbuiED8vv!L4-NIM8JFgK9B`$J(hq*e<2U zAvk3C1#W^4HP{rrJ(|&~W3vcH3l8-B^bn~DP;L)x#sh~tJ|N{!;^ z=dt;aR-c`xc7hJaQ4&E9okyHnM;$QbZ#C>SrrwXFYp31M0(T6>}EI zd&1Krcd05UPIng00Q);1f2tqsqrx%kLYmDrOin8$XRCJ$u)1;G&g=2qfR0dk9YdFj zN14P)*6idp_29*qY)~=SwLu5X=ZNPNyM9d~I8^N8rv~QMnnTI&BZ>@#rRSyRt(+5+9 zP;IzSaB#uBS^r+D~s(j!(_&@!laILZcZ~xon>F@F0y$qblrQ9Bc_9ob)^B=oxqmQNl}*ja1?jRy zr%mKYtIMR^Uy(a)A%mY>fw- zi?n~ym`~^97P5Y?{Hr{t&Y0g+q9=W*K+<~$@xF!HeRtuM;G*pm3XicyK{IB2bNNI@14i5tui9ATw8O=&zy_)J8ke}+|n8*1$rmII;*&3Ir+DkA1W81WspS&i;K{f{$Q?y8#Xy8 zR{!z>i|SyePCtk00=nutVE+hxmQy*!plfx(M=0#6nRPH_FnPRkfYBN?g&=*MO?f%HI8e`8IvW5{x;Hh=lR8}p*hd=s-9h!7UfkjH-%*fl7D1y^ zHCBQs3{gKUZ??vY1hzn9qcnCcey`QoQ8sS4Ug1Bo>Q&p+N}qou6v91ld;}GLH}69qgvnG zP7%x_;_~v#ia{Q51^iZfv7 zc|sPamzJNPYLKFEkT3wJ(?ypIZ*d}tLl zJNVsEZ#wQGT>7kQTp*JV%|f$--x{9}(K---=}6 zG)DJXi(TB5CjowIhKtJ!zco|dy4QZ|T=!aD_^sLU*1h&ybKGlr;kV9{x9+u{M`rG= zd*Qb(bg$)w-?~`dy4QZ|QukV3_^mp5%hy}Dd~R^@%<`#sae3jl8sx2e?YA1;YkA?f zE|<6NwclFkUds!=b%nfjul=GPT@V+3(H=ALw0&mcX?xDZ)9o=6Pq&v$Jl$S0@pQY( z#MAyE6Hm9_Og!xeGV!z@$i&lrCKFHl?Myt~FJ$8BekT)8_iLGWx?j%3({`1KcgS1! 
z+HZBb*Yd(|#pJDf?YH9YwYhi!_KX9H%&q(TD&yJXI(ndHSplBcyTrKIsd0(X^}CFB zdSPpi#*P0q1p67!9*5BT7}(O5H6r6VF>tRZqd&Du^riG8!Vrynd-6jyUM*bV(R|Ez_s1(9GEB#G#?Wn;ZI<89&0U#9Dpz4)8R0H`@Au`44viV<&|e zm<<*4Zi7d;eA3>;$fYC9juZ24Be#V36pVEFE5Bn|#{xgJU{7DQjxqh%X?gH-^WfhE zo-1G1=Aow@nYrZtcOE_)^5B2TgFl@Ir(@-Y798)E!)yAAHlvp0Xm_+#b!b6)-GHAZ zAk9(0bLE%*9A++@j@ikDmjjpOP^R^nl7~L*p@-TWF3m&VmcZfGv1!k-yWub=R<6hyt8htE5Ca9SVERbOAlP%jrgArBrQTGc_d-@tf!ea7HFW<0$<d}zU$ZaMICAl!`yF6El`;t(H~I(M;`k9z-76esax=214+-B zu0BD1brR#}y14r+2e{+|wj3IXz979G!q0gSb3OB!tEtk@ZT^Gt^m>TlbAKNGPcVIY zon(~exRdaLI+s5`+o8Kbx$C_o5B?S4L#=w3kD0NLW!%(W`dQ2wj5oUUx9h7~#xHmA z3-r}|#uvJ{83|p<_!TZbU0;10xGZO5=k&+Dm$fZktpC>c*amCi68yQZCH0Hymgz$g z@Y`ZFwzf96)-^9_XdF9Xe0jxq4{&Og2dF&F15{1S1S-Z&bY*KvV_WK5qhmd+$CV@D zl4$EyEzQsj!pDuTu%OZSadcgATDHE8OP1C#-KkbftNJUvOj-qbix)=g=r5Z^7hT5K z#O&nmpF68t+Sqzob6X=b8+U3pr7LLsLE>^qaMMjVNV*Ess(h>3J{Dt(mmQRko;r)ZeZIOa1>Y02N?nAMRvcHQjYlt@jTRaZxUuulC+`^8t_H8A`= zy|yJStuVMIt9{9$g-fomnxd`vW8U~vc-CTS4~y&PFJ0J%oAxDiYg!ThDD>q^7Q2O zsD)fSP!&s2^c+`L2Vb&W!Wd#Zj-O&PEaG8`HK|y`HR}FTI?lkV)4=DMOm5oCfpU$Hshsd{4^|rBS9TPdPBMn z&Aq8Ej@8wrcU}$;nk_87j6;{~hT%W=4OS-i=PE-H)j6$_Wlr?a9KH!n@sH>};T ziq^MM_Uc<37A#B~9!`hCp-6R0>ld{*!pGJxZnWCa7MC=+>GY@|eV?l>sKc0Cn)&T; z-9~T*y(!(6=H^9hS$QC|PMzE2CMTtJsdJ$lwz94`!gVy7+K58WKz{|b&E>dE{{d7} z)ow&STpA4zUa}N7vQ%s?6+P)p0;QX)0c9$R>Onm=v|Q!F++Q`$*X7{Cp2XC9BBO3} z9vPGyPs(%|XXOlk1-8AlZ6R!mf>|G+aqC_9VADDWGj69T7Cn78xEUuC{oMjzfolpx zPfr02{cQvy5dBZ^G5D_tL?HYgd<_040uc!RDLw{&fWlDS3*5|KynLko|4;J4G>`-7b3Z;NU;k72EmrKgJY~5ZBXF5tv$8|_ z{6Xjw67;e>wV5#gy99kc{C}$a|F`q?U7cTQztjdzy_j}N_=5s}6xS5I_{$1r{Li>H zd{P>xa+dAuLydn8zYpPK=;?4S3dDZ_J_f%);13IYX&!uqzi<=Z6Q5puOukMK_c97Ch?a* zLxJ>MhmWD}M<4?6pUgg{JsLNBOu0WI=w&%S{m&8nrJf4} zpKAo4sGyg8t`)fCbDQ8pdYXE<(}SCG`-KNL`RdKX|F44nTA}AQLH~rnjsK+bm+dPh z=w-f)Up4hb^+4gDf?npA&aT#gP4Ik6)=xiVg1M%64kD*ulZxsF%aJ-Jm zVT_>PCg`gK{4YhtrvVqf4q3jVUfV4{&Xxc1`MiXWDW97K{pA9` zUE`ErISyGZ=&uv>zZLvn7I+1&DR}GafP%B?Ylz@4>uZF-WqzgKmiawf=p*xczQ(=v 
zXx`bRa%;lJ)YrEJy)1|8gg)1U-jrvTkSog}pWW`rQx0bSYwXkL^RcEk^=t5mkN4WU(JHQY+oIMemi8C z{C->D(oViBaA_yz{aRCQMjzR4%5u9`@FBT|{yu^45cpb+d+o~fnm36BGVOXk_!#^7BJH|U$j#TTO+QTK{HCBk6xS4}+-PiU%K3}Xzw&R=zxr>|zb@$i zfNKh*C-tF5&oROd-x2r(f$tKy94G%n;8O*?#J{O=DmU4Wn|(N>&%1(;8D|pSEATR0 zQy@Nf;$!rmOdtZ`cj0637=Z|c{|X<2-%20?;nMH@o>5gk5+AGeCw9_yf9{>X3a((-sdGN`4 zA(iM?<74u5uE6gV_$3-A{p;{CpCSP?LH~Jg=35|R0>g0jMMcK2=*J&Cj`aSpgP@NIe7(j=uCzDb zU;qeSJ#B$shIqsOG>sFVUkLnCLEj_r1p?nC@b*0TD*}H<(7!2g$!C|qrGMC)2mk62 zFh%$pKC*wAqH)TX>|g$*aj(7g=D`OaLfln(%K92B@Hpa4c~%O%OW;#9PWtQ>_}LmK z{bjwMFZe7M^mT&IMu9ICxU9#u0{>9ZKPmX%B=C0yF5AnvVh|ya{<2-n(74y$<_KKc z+hqcu2VSP!77AR_w`!cqN4BF|1-&f))dH90|KA$-mj7D8UzY!7!Dofg^GU%+mj6qF zUgqmC8;KF9d@jYuP{<55ZE$E5Ancux4@Jj^$j|DFIR~!x^1aCc_t#PBjsmJeYocxa*7p)fbNyHiY z`viYkUp<0e+SN+}m+f)8;3M1Pkw=jBIs9-Q{CfhI^|)H#vL1h@aZ?UvJh?^S7ecw~mkV6-zcUa2X9T_E^NPR+2>!bT-T>U>_hW%e zdmcVq<%{&cS?~!6TdQ2wc|t0*#Y>%6g9p`fee2g`k)A_CrB0+ub%nFa5yZ z1-;DgUV+Q<@gE5#5U3n(!N=G`k;X}Xsn1b@UY5gHL4S|nGezLie_kMPX}4Dj{<1tf z1-&fKy9F-gt`WGD`;@??+*dSCw;dE!#@SRoKN|`3I+(oN7~O30+;o8uEwdH z@5aZJ{{n%_d|fSYnXm5%T;}UX8u#k+OM!n!$lWM#>E8|rT;^-YQAmtH{G~lO8u#Yw zQi02SwFq41t4rWAUv~&x^7*yEC7;a#mwcYlxVLR*Ep5G^bdy|P1|ZVyxYV;%{wMkq|ZHq&nSUQ z`#fFXvfk%toa$Z5y-?6gxov{}79n?;pqKeg3VK=2|0(FD{=X8q)W28YQva6)f2sct zK`-?m7=Y3UlwYaOp#ee_F2^CI0fw2r0heYRGFISq0-qspX>WA`{|@Mly)_A3+QT0- zPW3C>``dZ&ijg2hAbQ!}rwd$`Tcf~#jxGG9;R!H14Y>o4tzOBr+o-nF@B&*WsCjZT!Du)PV?>!0-vLC(toAEFVT1*eqR=N ze~lA;xxgKb6MahHR|@>KVxzY|_$ishU9{vXe{cVE3IWL0pCHapaMG(CCHP2hAzU2DV1WoU) z$MZE#Ke8N}1idT=bv~CWhyOqtQ;!{*-m9nCf9RF_MdKc^$!X9I}m5|k7(Sh|6tHiApW1=W9aqrFysFMJ_#QR&j@?ik9$L}&c##p z^@zYnP$C49yAvNnua{w%f1kkh@H0yv$|nWVQ~IR}Miu@Jt_}am0+;$v6}a?chYDPl z!wY%v?E;tlPZV}8`79CTCfo0o0`CP+qrcf#O!<}hQs)L{mxG@O5$H$eYj7UCLgQ4f z(m&T~oa#&ZRk>eR`e*a^5h$6=FOHe#Kzu&K$K>m`0)I~6>ordLNc(?E5epQ3S+yHVhD zJ{kqW|A3Dvw;6&D<=4!o=-f04qz~1*!RHH{%FW=-0w=pN_|*cZ_GobPcOFO|S&y2W z`^`5IXY?$>H3g!lmpKh?=t)oi7r=`tJp$3ckB{MV9DxXgOM9EdsKRA?Je3F$NS}T9 z7(UH;@T)cMwL|0QDVoa1tn(YYC0xpt`+R;O`IIeb^( 
zl76usAX0w&3I5*`_yB?bRN&HYA3uPkD!B{sYsziB#=Y$#B zgYVHe$(88E53S8=A#(Q45;{?6rKSSV>50w`MQ*I_-x><9%ZAQGob+u%b z|9Iw-%_pdFDmN)NAn3JgWMqQCjSd7Y6Zk-VO+PdTr$G7-;w$xazrH41yAVb;3S2i$ zM&1;-ZaR#RucJVGvR>moTOJGUwV6A2W8Pm<58~)G6WSy*#2ni~t>e zB_he##KtE;-PxsXV|0)4GTr2RS42a*A0@sf+I99$t6%h1hJVwcm z7r317oFMQH(VvzH{8oXF7Wis`QyVbhL4luSK>ThM_{jo4nG6$w^c=%i>T9*Y%LV?C zz>ROA=wYINuNL&P1wK{a_Y1s6;2Q-#P2f8PK3(Ad68KpH5A|0!l&`Y|K3CuofiDtx zt-w13evZJ+pU@%x-w=4YZnT7-D{wj9m?iM2pf|peqE`vrtho{Xdx4)vfC*!93RJZI zGHWc3ol?IG%oQJ-GNCa!nJdJcET+x`6BOQ)$3#>*F&_YvH>o-vMkLHU znoLBv9Qn8=KKV?<>SVe!-O@Fx%d*kOXl0iD{xvjN&F`?lZn}WaQupkmXFhb05rgFurtVk2w^KE0zx%%i=4~ zE^^5Z#y4kz-1?fD_(FW;c};EtS@ovGNP%1#bM4F!<|~0L!hBiEBK*QS5}`H~bre`m z6EhkZ`| z43WxUmbc7n1$iiQsqHz2EQdl$O0Kl#aIzeUObz8vOofp@Hf;#p;g98mBF!PWbSf(i z13V~>vt`uHy%_|!r^3QmDOr93%{M;*PCGp~htJgkXQv*V1CChbu}s#{yt&imJye&; zJ;(Ga&TLkm!+KSWJpGr`XOue~ES!y&#a|9)UI~34Q)|6 z95;Q3nrvuZ%*VlwU4jD}$Kuagk7HvVdivtx)$Ud1g*W|)h^95(|J(ExNJ~8P>940Bpin{1c?qGp7EPI|@sZ*A zmFmvSNtff6=*)Vw!6ySF`DR@MmmF&J?^pD%`7!x7zfFk9rT;vgV6Gw5_g?+$f#=eH zx7MH5f+&#wG&S++e-p4=@_Tf;F2u{=m4725bIJcmFASM=5t2{qB3}9T0LvwRCQZ-~ zxi_lVkZ{}Ld{r*#&u{NDo0CBH+ugRf5t?zjCe-~IT`CGrB{GEB^Ti-_nf}?)eekku=`D9hOo!NY&ob$~_VYv@{kLoV%{ngG z??`-%d}CL%&jbIn>^Cq_NqkNHxZhs+JM+j7+^&)=(*@%VV<)A?y6-&=ltT7I)m zZ+?iDg88L=4|J`Ii{t34MKAW7zw2Zoy!zAncdq)sL+d|WGba5{z{jiq*?Hv8)Q;Ea znNR-0Jo%ri(+5pL{cGeKzCX$%f0a)EM?;0d^XNa11||q{{56K|8BIUypue+6x_Irc z`cOr9qoy}MCcV+0_F>VrHWNc;ZPn?`--|L4Uj1o*NG|(Zuu7%aiu_Ta@-N57$T#Kx ztvvdd4Wm?6*54g9>5cw$jz}*3J9PTbEB`z5=wJC0C0~~RctA$J(Vza#UM~41`XSZf zijDj2Ex$+e$gjRfC808-K;<_PAFurVdE}RWLHT>~$ZyFZ|Fj(P1E~C5`Jed({;3{7@eGTXV>tltX@N9{DS^e6yd_z|pEnsHkpurqf$~pd z2_xUw&#QUlTVYCtXU_%d*L-{BzmP}1vq`0xrxTKV+9TtYzb}vcdHSJxP^ULPUily7 zk-u8YH~A#_Gjqs4vVZRKzhBFL@^kV(4C!-~e~*^GQY%jK&&?sfQp@++-wk?0kw?7i zw~YPF%aeYEPVX&0Q~oA>w@#0rjIdRwH-CT7M0(4A1rp}U|IE!QMY*uQ^8gw7re1!V zNB&AGEQBHI$Nl!o|5YCO9gnFbcBmaIa-{h0z zUzkJwfjsi>*YZah!TQ&m{}1!X-;yK$7w3?FBoM}~D&g52C>f0;7A&-3L 
zNkzX*Cnovz_;}@iJCFQ;?l=*h5xnxd^2qPe@=ZQTKC$%5zbB9UaxGs^!(AP`^6$(e z|M?vGZ_FW|{(ddlg17#+YWWwqlGTM*KAlULtNv$hRrHQdPx-$bA8-D5@WX!$b#SLBdC914+6evh0L zT0WJn4Bq>(Uow5@bJCM;GI;MBzhrv)d&0ysU;e%Fy!Su*lIgd6$@Hl&nSKz~`KbNn z%fGk2z4s%Lp320Ve&u{sYZNDg_kQN*r1#46-p|jY{}+`{e-}Ad{_oKGd*ylWSLV_G zM_(}gydK@uIQ^Y#!o1`B-8%jA=pSS- zdK=6^`a;}P8$kb-Jf$w=_$dy^q&MS=T(T+UmPIQ40=-{ToNlNUbv|Y2!W&qL?!?GuFK5MWX9Uo zK2Mis@3r3bt#4hwZ++`qd$_+aIB9^(W$55CZZhI2jT%PF+<4q4qCVqFV~kxAjXi^TB!~tZfex z>qRg#3raR4t^=5x4Nss&R%4IZ@C!t`O!uU$N^{)gth#FvzcsgcB@1_%nUk}cf4D(L zr{-3g?w_th!fs-&{Uc_WnLi_hQeOari|aBFs~m!o7vt~Sh`1NKk+H;GXChuV66g{) z9`_*TUNHe#_fM$%a#sK?@bP)8h0OaWAeuRuK;~47@GMVdz#MnGr|u$Rw|j`qsU}ud zEwQp{V!!w?8bkQM7^zDqwTLov@m%ICnJWt{nM>^HRjR;m8Ic74Q5Hw=+aLPv5Bz&i_``=>p)dKvCtP_g{_sv$ zxYuR=sKwsu&;MZQX#4bLqDL+O^*=iR<#b-n(LiK!Z()9G&4GaZ+H`w!_W3|$nhTXQ z2>tqz{bphQe!p4JWuG}QSk#X-*nakm-PYJ%`={^+uF%DW*=>cc9bmNdMf=SY zrY;{Dc5VI#_oUmWXK$9}x=;L3Fjf!$a~{O}&*D*^*AlR|r?=X>{B|2+mia`Fm47y1 z765PCdn}ONVs8T2ZsqSoTUuJ90~E2Qw09}u)?;b?s80<{-)f)n+ouEeE)>{q~B4FQ6T+IyDMI4ck8ji>8I^h#lS4s>Tit>@rREO zF&l4&cx}#We}H0kbKU12=tQ4<&+%dJ*?at-5OL>kTRJ3r+XFioxA*Qn{=jy`2IlVu zx?7+F?N@;A0_w^?3-qi&U4C-#KhOqVMgUCG1Comm^A9$D@^K?)H**9hH*(sUgSrMH zXCHI+$&}yjV9ridf}FRQ^A>Yl2CJ!m(T5raWpAtB4B zaGNXtyx*L>NvNsm_J_wX8ixH=Ui)l&t6uhmM1b&I4CU9>WtL{(y$RyeiuZn`U&H$d z-iIJMSKx0j8Frg1+A7V?7L;r4GQZifdEtV)menj;R5rhMQMC~)Dkzy=TwE5IS$vyu zdCl0%tH&;_sLWYZHD9GdH6fAGl%B$A)22?l-c!4{rnb6z$j0Da?QEzFZdIehG>a>wzt7?Cd-JrT#npbNpqSE7$1!!o1!Szcw#1g5~Q)dyC7D z1ntMX2T`=xeB$k(z23`p6tEY1-OOk~!XN(F6}l;CpA{>t%a1Lo1ifbOv0Oslk>PWL z&8CNjV=y8VU&G#NmEZ6`mTPzKyIQ}Y(H^v67~6=qhi{Cg8}U-cPpp&rx*7Fi-EFtF zHF!@0XJGuCjEcB642Tb5VFbqAj`&DNTvP^{#*^miXRkr$nuDS-RV;{d^?W1v!%-JH z6!b1btE!0vFsp0u2Kum>&5=_ae%d)@Esl*J?HpuG0V?;Z+WKKa2*EFKvUq^a# zTx(rk+%Lvx^SndLdNT%~U|6)ovwh4q6c2Yo zc8JY!mwHFyiAJGh4+iQ@v&nWFMp2~BRh<96*$})03A@L_vVqaq2_vG*ij2=HFrUBS zTKj`8bIe;%17^c-P=?F13N-DLBLnk_%lAw+pC5fOq;57NZlZGaW_F8W}@}P>>;DxLNDRl}cg`R0H69&fr5+QBeZmwRKi*n@~z4a{5sf2(A 
zBM%E%4chN(S>+}$7XoTUMxWCnDIts-_b43^GNd?3tN2loG{-%<3b2q@ z5hKB(th%qEC)F^ov)&8XjU1XvbA*tbMOk))@@i)OYMp=~uC&9m-tRK!oW_jS=8CmY ziv@c}59WV{nEzMmuEy>kf}EP}3Llo#M=%a9uz}k!qW4kqO1Dr(_b~Pf9+1blj3PN- z2x+Ml(uELGU_vbZQKS?@TnaIj$_d@1oKjt(SL2SDO-~G?q^^*Xdi~{;)C-07>8?qp z+uJQf;-9duamiOrjEuP2D(~VZIQlGx^i_^icTGB8w_|IOvv`PuzNLm@D#a2N7nZQH zEmB)nlyEL9)S;?^vU$LK$c;+O#-Vt!r4&w-xL~e4hvp&Lc~UgxMam=&SFf8*zhx8? z!!=ZTBKp5VJk@yw>iq90pY7)Ayc{8vRqV856YVRfLL_6d$>l687pGkQEix$-MfB@R z81uG61`8wK?X}7`cl`ppsh0`{X)B!94uJh_9;hKRil zi9Co@g_Ot&O5{<_IzG=MiL=9OZOxK`gUE?46sr(rCoLj+(L6 zTFq+b6cR`;K<09YB{xkWmH$wCC%Q{5A=i3bDtqLzZ+do!WB;7-&tm`VhDeTnio(!S zXrJjWG`}U~^UidxtN`{+WVP_o3rWolmbZ6Lh7kV%;&bk__t#Y>`Ug#3?SISrNI7DN+~3{ z>4#EwvrciPbw>Y92rL(y0iFJL6IbqBg=0|1@}y>OxHG}(wKt~NPKDw8*~u=NH9VU} zuQb7fP+8be4_J|g#0}NLu?QRLn^PjAAJoDeeTo7c+fOmCdoc;69h`|qjw$8{(k6s8 zBGQ<_NL+wLl(RaMCh!j^vDyUYzA%C|ZsttnP<|18bRFJ8fbZfW>OwX)RjvwU0%Kdn zp3=kmweGIF4k@v<0$e1-_-zoCyh9ESF@Awd@H7_%wqhlm!g4_f4^PCYW3v}WBw^-$9+QAs$lBc4)5q*xL|gZSFmLW;^iFB^-YFz?yZO)t zNTk{LYdq0LY4@HtSN;@fOhsqAXvHQ z&+o6bGFwH-Vuc0*b;Bac6L0iW3 z2)Q^7Vyt^1oxPJH!>%hXZ=G!ZVZ>yL=Xs7)XJuhy`@O;lmb)rCv$e9(c``+NVD&;J zzw?PZYUxFk(^+_;evF~8fnP9J27o|o$Ob)2aTK<3(_@UHwjQpnv(cX+DdaTREUa;V za|U+5hBx2q?e#y0ZWyNnW~J&vn2t?mQVChkmVbq__7l4Ic)vp6KUT4b$!TI~iR7Ht;VpZ#%*0 zRJMB!?4<)V(5j26Oy^FEeVVajr|fBl8YG3Ul_GKinMN>^k8Wek~L{3O9f( zCyu-JZ$z;#*N(z!hWGJzoHVHX)AyM1@%bGbpn4`9V`(+xbS1H5>NJ*jTgSw_i4laA#gHg8fS5rU%W5FVJb58)$w+ z_$kCHv+Pb$=>!~5{AQpduLqtAIvbv&ds>S(_VRFxD|~K%*>nhut#WAXVEGJhX7vnRaxILOX2^v8@yE@4cd>0 zx=58J+KvX`);1#gDf%o2Qj8f}BAWOe+=Q_r*=p6G+u4wfvLUyV*KHOIy1A<47GM2+ z##qsi31-9Bzk-?O{UAI|jdU17mAUK0-RK!}<=`(#^&@l{6pMS(1U%<##WF2DW`#Ej zuQ-cjF1A9~nyc1gP)+3jK-^`rkk@ds@ zR*C!FIN(FYT=}lr(f;O@jQN|NrrGMZ#m%$dewrXObqQUGUGMhxlEvZ@KI>=*b+*H- zC;^Q@=+-JeS5%`#v*xnFlWt>_o)LLvjqJ={*GuY5apy~9^%{jd&;Au{n^cXL7>r5F z*t?%TF5s2x2qQ!npdo9*AG*xOZ{WsHi8?5UA0hc1A0AiWuhPwjhmYK7CMd=HKwZ!| z;sXq2d^%3p;(5cUgaM}8_)Jr)%N}ot;LaGz9 zxeoNv7w{1E%3V{y;Xty`Chl#_}=^2!1S`^`7MjWgTQ#MiidnL5_8p9xuScxqBqDDjkPl8 
zHm(>=3}FNnxC$gGEBGReau>)pre9d=Ssktm##d55lypEA6(Q2ZcB+NfH0);g%7Pex?7rX*cUKthd__3 z9H@2Y@U@t>pn|{mMzp|)KE*O?FedH|VhW4x<4nj4JjUcMi&l=i3g=TS@2h~8J>qRc zX~x!yRT`G-+NG9~h4g+93;;!Dj5br!=HoKeY|NwSi{t96I8?I>2h5dMz)ObKx^p0L zClJY8JS6Wx^nDm?Li_WfthPl$!TLBYpM_pqFS^b?E>?PtQjZa#I7qc-mTl;x)ufRT z(`xdfRcbXeQ<6HAn#y)4wWp$gsZv`3qXP4Jl1goTOsPSKYo&&R;G7pYY%5YKwQP zK{ai`s=RjUIO-|tGddfBHqO<4`(^4gq2Hjjs|CK^TMrgO)kzh{-mL?KTWo4JpP+)9 z&%&n?V6SysgV1f%aco)i1RmmCKo1<`@_o6mc`<5^J5MBPK6n#&LBH)4`b}7pG5zKb zZxITP%*5VJ1-CGE$xJn;9pyq-2}NYZ^xHa7ET-SsQcNzldmI5o0ShK*y`Tc*3)(#j zl`ERLYOB=RFb(Fgm9}2Yu-LD`40_jWu)zg}z570Km^#Vz6k%EVu{$a7T&haF!PAY~ z;m-E%iS%|uwY}SIe(Pnk>5odi;Rdj@E4l|Mb$<(Dc(k&jc=teu&{8aAqTLLW3VKD} zFh@6tGU7CzOutdlaZOP4lc+c9xjksJ)^0p0a4Bi^HqfEosOLx<$cGtVHhu}}Ejn7; zV`!StTvC0VSc5^*dh5-<)*hp3=Hld5FpFvnl0S=T%V~{iV~mQb6nWAZgK0LbtFI}h zEwO-0E_JLcX^FXTZKw>ft#5o9o#l5JVm>Gf&J}f*Ja=#%+^TgJbPzmYbG8A@6!yHb z#KZ=)3WF&&2y^Au45^Un+*C+BkZ5ZR1?z;`f;sl@I2@C`R9gw=*kRm4iVFaIe4>pp z8d982hM=;>Vk(TE(@=HoN2UPV5Yhhn&t88Oq|xHQ36(Y$8x@R9uMS53?~f6+%JG?z zw$8L&#*=eZ&~-Rqzhgx<3Ux&rZJD;kitINE^NzB!4=bBY8q2iEprGU|$BnW-ic1PA zDvmo97==|EWgMM#NJmfM!E*@Y_qY@~_|IO$gp|Hc>_dFtVXg{rRYENH%A2Y_Dy%B0 z3R^CnkvQNBeI|4m&MrdvVAb*y7j2EP8GURt+%`CDG_A&@K_<#VjmdAnNGN*OUyWI? z)8Z%lTKH zayY5CDm(F3DqQJk(stt7xSgoyMVg760E)A=5+$Lw5;+ed*I^{ya^x>nxc7f9nZkuv zL>h@uxcjuio#n6+F}hMTIJ$5lUCGgfDuo#gg?oylAGZ^6`~?;4#GykZm9`TX#_U9V zs(3ft^7IMf{2PLE3@VBimP6xG*E}wkTDK+4MY;Z{de#9JI+qJmM7WgyE_P!V8r7`N zzf+)b!<+>1VU*lQsc)sk1@Tp%IsgYFj|h{Gm%TI;#pxItmwR~@(p18^0>{ZWfDdq? 
zZ&!h@*0=Ityv%}Htq06AF%=8@&g!E_@lbXmej!i_CsNj7>JA+0vOa=)Y!xd>VIw9x z1J4j9BG14y7va#vLD->PKlKS%o@qIw(rEnT6Ry z0Cc$H73brl(Hn)8R*xa4e*K&EgjUhC%jPJ%Oq{p+oF}1l1x{K`TXqdw59ZysZ#(fG%NBH1vanOX`TrU0Q1bDF4iWf3u%5qW6(Sk z>IIsF8pHmOPBj>4p}5P;ekq6x80rry3@VDaT^8^z6Q@dLPryqizsrOW9T94p;_wb) zSs$4XeobnPex5TN1claE?wyOFq}vQDOdc!MXOZ*exiV2}sBb;+v!&RfQrcpD&XL-0 zCKoXlh&WotZ8D+yV3TnqMXq%WF(bL+2-R>Q{f1>2@p@^6ZDB8^6$X9r26Tmc^jj1_ zd}erzQV%OteP(zBN!<6rKgLZ=E`KpZR0(33!tb&Kj*#MJRcH-vfXNLnxe#2w}) zk*enqa)iW12Rn-+(tlwAurFgpCS-P&xEk6AjGFXqaZ|8S{1kyB(*pJ%m`x+$tBQUF z4ag9QKgveAXjt9gZFMZ`5O&uzKX zqV|Nty}kAKx$?}_Z^LdB-#)yK2n6d%kPE?f3&EoMZkW`!@?!x@-GK7PK?lj&-kK+! z;Fxh-)<)NY4{e_o$p}}!0sCFS3r@6VoYcf!r{Bh9@I&20IRD=vQ-r87qGr3J-=V4-eud2h^1ROjt!%3EEWd&p>3o^oNy z`&`kV31;Ih7^~>>cu2iz;b$uB8%4~rPleyc@;YUupW^buH*F<7(cfWtahMd?{EbDUyJwKr(*$L}#o7EIqyvU4EojaL1MWNRfeh|VqEnvn9=j)6m~ZN*_Q2K1JylbZNOA!D-D#M1O=1eTvfe>C&;5 zbEk@q|1^Cz8+gu7((?a?jscfXA1cl3FKx-_<}V*bAxLRs@4dsVXB3Ni`JG4hw#cyI zMfL|)xsK4T|i2_z}>k$TvSm9Yqn#RcvnpVOua#TI90# z7Ij36^LL_>jwAl?CRFkksE0>x0riJPk!n{_$BCfbf&!<5kr`~zYxkrVbsQG$*%ycm zGa|Q~FS5@UNBkf?9<)?jGv|-Z#?iQ6k|QSv%s$t8>?mJAF4evuuy#f4u6k3zkX z@xDMLaILEl!1ldm zPcKT+(~J7|G%aaQ(^OA~4ac;iRl1A*{B!2yRy`+CIW4g#K6zR;+h;z#8=pK~QkU;* z=^TXAKBQcs>EZKfwR5^n{yqi&V#47;;xjpFqU^mL?{q_pZtXVjXvGiLeTJA?W_287 zMhn1tmO^Q<**emK+leRT{-L>_SVwB{ZB@P5uo5x*T+6w^_NKh^*_+JgchsN1p;32lkZgb8a{KQ$b$9~1va$yi$kXwEA7hItdfB0NlZGo;0%WRO` z3&|^x-Rkc+)LmqbX=Q)Fvv$3sy~*3ZF`@nIMe}3-?to0`=V^Kd5l(AmR4 zt$87dkIl=?Mu`!Uq2czo&>IkOtS`BG6_s z229OxhX)LpJv3v;B)H00X1>TZj6Ld2Y?+D&6`zqvKddm(CO_E(Gll>o{o^W67;FmL zs4!fCh`_Y2qvJXx3NqZ^8&Hrja&=l^h9@$xAS3JHLH>-~l^6LlCWHqUWjr}(z^{g8 zOaSE1$STP2fD}avGKNh2T-wcF$arQ@Fr#HqK}H+?_TXp#j`lmV(~L`<$yhwixGWZTM$(M0I+L+@nsIq7?u?`vS2&X* zE^F*xucYi>@0F@l;!NfWJQ0P=3f%0sg05G%csU+W>Rybp-PB85BGC7~LH>-|O~@wivv z{YQ@S;?-oaH#RzBxI;haz3B%6kLw4$KXJQF#(R^XkLwMwcL8e`cwBGnL7L$x{)RjB zhTgY&1g^Dpd_PThC9n4q;KLoOLGNdI0*~txtr7Ss-f)M05c?YHoSK6FycBp<3VcZl zye-qSPDF7UYC(|hMpfyecp-amT; 
z9@l$%9~_9bBx~nT;KLnKs`uAX0@s!kKIq3>F??O%aeb)y8~WePSu`(Hl~Zn1EM2-} zY1xv+Uo`OD46vz`?@&t9B zq!GpQu5;Ete?etsS^2_>c}x4DODpErmRA^MlY>(W{K2xRlO|0soKZHzUl1%TixJmU z)mBz6tcax-)ZD!^R91WK_(bIDtNJ2WmDM_+>ZMCiS*UE`f<+5LF~aZ8tD3jCtfqER z44F^%u@pv+?T@AFDwf{0q-H_rUZW;7A7g4%-m`Q;sKWSmMW|wNU5sq;g2kw$CbTrP zWT8>BxO(Y=#i2M8{BuQRvY4JEL9P@0r->HNTNLkcO`j$$D8DPQ{}OR@wx(iP^;HHR z@md8|Ry!fqNC_1>T$j|L+v|_7wOvYAZ@?XHg3LkTPKt+xbxn{8FVD6X|^^@Uj&6 zQH3YA^TQPQ<7!Jwtaoz?{8$QnurjF=`CO3#|B1pA+xh=e;NQR(xyj~zRtkJY3jEO& z_{J3Y^~%IgY-cb9zFFa8(AXRBr`Piv4*D-T@RJUDevhu{FF5EgbKrxOfyQyZ$$@7% zaHss1q`;d};LoJM-%WwHeIAWK;P}wm*X{36INM2^S>vxMoa5tj;2vcPkdN1a->Go& z@jLLJIOutr(EQ(X&=)xHo0LgGJ`)}I0}5xoJX2^s|K^}~^4aOYoqR4*rV06c&B3Qi z;jEWu3eErC6!=3aaF;sTkx#(EXPg6f&U;Y`{Dd-n*iNUu^?n%u0{J`n_)_2{De&*6 zz*nZgFIOg4VtWR2vqVU2&!s8wo$L%kBA0fC-P}icp{&pDe#X{;KR9C5fb@~O@Z%Jcw&3T(OH> zFH+#GDe%cOA`ue%b?H@c+^MhHu1=1>>%hGr(R%Dl<0U=&Rp`L4R5y9r2!VVkvKkK&L?BL)(s%%_Q#ST6m2fdm?fsA+H7pXX3mIEK`z^fd%o@+wO z9eBEm^R>o-4|U+{9k?>p1=8Zc)sze5We2WIW`XE==6EWTR^X4TfgrAyl)yJS@DT!$ zk39}tnQS71{uu`HQKptipL5{KL=yNg5+d-W*Qx+=YVKVWns=9x!_&QZSLs-&Vp+(@ z;Xf~IJs<4lJ_ZNh@EnXi>#tET+XvLKQi;kXh(3cb3OpzRPMtxvK0cBJB`&d?OA#ZM7DHsjmyp)U#PZz3sek_3 z@-!33GgbR%BF`E0KBY10T<(EB4XF3XsWPHvKMP6HX+eoczq>#dq<(RR#QIOAsNYj6OD$7{SwGJviS?UUxM0z5{`aW*|J?ZV zzAst-f2``)a>DxQa!RcKYbZ}FasS+}>Q9uH#Pp3Q&-OcQZX}I^s=S8@N6_P^-&0bQ z7l&As({!4JZoht)q$uy6C`;)6>+-t&`dy*Q=V}7=&Qj%hW^)Gil|jE3tMU?!KWC}( zaYV*+IgN)^dA$0B65alU4Bc-{yaM^`i%$dU{iK@zdR1W-@GJ1A%jRf{!ID*4{+Gc^Z)<= literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_matrixeqn.o b/third_party/libxsmm/obj/intel64/libxsmm_matrixeqn.o new file mode 100644 index 0000000000000000000000000000000000000000..ef72d412a8e4deffca4f42ebaa1f4255ed01a141 GIT binary patch literal 42136 zcmeHw3wTu3x%SRXfGMGwfN0~T?UWNWRHO-@GJrapOvnyQAVRoQiZO%$QAlF4!(Bip zL3cMnYH#*teX8y8v^~e3UM;o?Mh#V4FQ?U>wzkyMzcNw_t*t^q`QPtbYwcOHXLh80 z&i_2;|DS(7%$?#TZA(Y{*{8(47K8dy?Xy+8fiQPI93Fj&qDR{GK6q&xf6gQTFANS7d% 
zK2a3y`!bO9iBPn!pCe(rJ$<6sUXVU9!k(HwG17jI$3FK|dVf)E|Bs@5=YrXTuNCkw zqkSPAUMS$_qkTuRHn=Qn=J4 zD{A|p<2GPPcWyD{q?});24+WKsBI_Rg^>X>lyaT`!x&K7Eub1RV1@-%8Bxtoy4RKs zn8i%BaC~k5FYTG28b4r;NI5S+Ryk2sf~tJL94V+SAgZ%bt+4@f6sS-zl&XIjnq+7s zRq`M=&mdwdSgmN75tOpr;^Tt`w?*QOp-}HR4~BpuTHITbekJl?F;dawkj@Mw{dSRk zN%j8zH+n}o$J2+3?2&`f_a_Y_%X~EbN}0#JH!J@|MH1pXnNdP{G4g^~Fjbdx?8xiY`*TX{F0HWKDE#K&8bv8?D}vN@W3BK_(GX+_HOAuk=E4uph)LCxVyoA+tm$EycKiCa9=m0I7pFRV z?Phy@m$H?TIrd7ui#uAnH|J>9H1Bawoxhq*_0Flq10AOpS?NDS*gva(S=;|wbkBFt z2b0d>^dXFzN8pm_;}{&DL3^Z+V{oJaD1AJPA(kV>_I2swBkY;!<0I|z^jk*sj;SLJ zp{NXmRvW{%0p zOb?Etvi8wvLQQcUa=B3N#t_bSqqN+X$lb$W45W;RlI}T&C^e8uxaWO7kV>}aA|FU4 z+B4Y)Qc3n)?gOa=d#+L-jAjG4mF57AzCU4@Zc{}p{kKT8@26-F8ZG<&9hRp52Dkk= zIWoq_{`qi{`t+;{BieTtsOr<(Dk6yQmhm#gcOq`OsfuOh_aorPl=oiq`(UGGf<12F zrnk-SqmF1AirzMW7||w4d1du}GyOQG<791a3_HeUoQ+3yW+MXbr@N|*=p5#ECS92KLZY$;rHP*YPv`! z@wR9;t=@K=JbfCIejl~Ip_O+HaJ_!xoQ{Er{US5bmw)nax%G7fX-&<;l+1EJfTZwQ ztX!jgt5EsMaYsb^7IP9613w-a?VE#SZS{}RCq|jkPaMcx!MWt`>C@mnhBB!d1MhD+ z$-Jk*`#h4wds^@gaxU@0Gr_#oQ;amYZ}c=H4ff?iH4=h<9@1Jd4xTuT1tRwxEB#nx z`3mzR=86oYi;iobL#QOk$)v8_`!acYmFa$@V!EgctnxQ)!y;96;*->g7pYE+QR23b zljZ2dleiO0$)W>8Q}zBlC6XlC{}Bv5SsMon8wTCy%DRN^(t#SK`*_jDrCGh^o6qX1 zkECT&o7~vXANPoA4+z>>J!0 zE2Y6(BL`l4mx`CA|Cg8@EqAKrE{a(0yt4FmDo{U6b+;$tTN2TREs3OCpD>-{SYY*~ zxI`zy=0h0ajoN7HDKmcJ$QI2$&As*VhGJ>qZHeU4FSiJt_5>y6SeUJ(Y~(>6An^^X0Am?UcEQJQbc{Gw=OG+_ID8t&ok*A zi4bpiq78KkrzVj(A6c}&xEaOo--PNFbVD@>l-S6eLN}-RM#x!*e6qytO_=dN+2DEnh z8&l5GlxsCQ3mV<7rleE1%-PsP^zJCy?W+7HXF-#FmfO{sbQer=HZ~@m1ffhV1fDW^_YeaG#O>U)Fp9lPrkTC<0epCp7J$oGWX_i~pxhMWQG?T5uO19R&?1Bk!9?aoIevf=he$N-)cw!lLl z;f3{aOlmYBU>h0jzZ>~Xp7}8Owy-ChV-E&f*aQ9hNLc@V`+V;IH1Uj04m7^KkF;TZ zQM9pp^?)pM0Y=q9#RcMM_%{v(`yDIDZa zrh7YiQJ6id!5I`zWx6|W2by%Ai?eq{8~7o7jMizrXSv&(%;*D~8skIJh9M_S-+0^Are7G574}C@uCV=5DZ@9=K0}+@~J^%JmYyL^%aB&Hs}5 zDUK2|#WY9$m~iB1-=Bc9zZqfpTi4qr&qKT;djg0Ve|FngQQfpjn+XW>lJEo5ZBIEj z!Ut|@a(bG0DSQ&rYF)Dt&X9C!8rd6?P{8vYZA7_8G($?ZM+gtJTIX9sNpUWn!t7t) 
zx&`SUatUbyNVz54N!V~R&W>nC8~90w&!17{u;8%CfyCQ9&+!uIwpu{(CwGW)2a@Tj^ zZ`@Dsgk0&$;1RxNulHV=;`UY+ z1S*kxTjn#gAJ5V+gorGC5-HI4enHTR5}=>)DNAN7a>Y1zAB6Ba5ku%DG!|dZgXYD6 zt6viX@8j{GMR!jEka5_`ohX!HKF9;+Y3!ldZj;a7Ge=dL&u}vT0xqGa&(oM9?>cxo z^B+L;`53E(N;M~^+)?I%%)96)J>@PfGt<8F0+4LrwV zz!XelG<}*6A%o}7=xrT@#ecD*np=(L)D0J@7Pc=?t!s~CPgoCPrla-~J;vRh!qy|? zY{eRC4%SdLjc!Mib7NEQIZ1a3rsG8^cYdQY4=bzdusT{!Gcpgr30W0)tE%`JY~rf; zJdN<~Y#Pg~)@v}{iUkQu%iR?{u9o`)tp;W$u^RcID9wqY4T6H^<_VHygk8?rFyAiq z+>8B3)Z*4i=2ogT>}#k&u0{#jiZ`fb+=UiOVAZ#F!Hn;K45LNXj5(eev#8@sGtlub zWkbn~fAY*Y*E1vPnIXm@VaH6pDP8p`uaFAs>@79i6|>K`iPw>E2ELX?~wdh1ZJ7=z={A z#D7FPzog?*D)TjE7mvSZ6)b1MVy$YiK^yZ_>)h-N0JGpGb3BXCKHA*7;Zo-2_K_qb z>~qDTFWe-`3ql;4$#Iz`ehmCSB(r51YU7_JTWg~Y{G=A-0?mKPj6a3K{3kj;7v|R( zjkp{=sDo^K(UipQZCc_KNR>nwu8y)$m*c3r3e%`6>3ssUMcjwE=J*9>|7q0ZE=sc< z@q=i_14$>%2CRSq`h4~}3DDO4i^_QRoSk(H5|XJv>wEL>--@}GFp6X%syyi&IbM0{ z;c#k}Uj&D{Hal=-X0j>h_cUK!f!gHxD$WtiR~s>odp!gGe#hqtP^;)Wc$D9XLW+6$ ztQ`GlA}R-JkB2U_{RBC371`xj_3)U*%g3KVfSPZ4kpw@^Oo#B*lT~@Sy}kys^m-Ee zqnt%XO<8@b<&`({Uu+@z6Z3B^z+aF4sG8dw(_riScVrs=41na}yQL)23#| zXzd}kc$qpn4SopS!^q4+RE^ouz6z|A)VLk(qn9_L__X1PAm!ERNeERvA=*ctqIv_4 zASs19Mk9gEL`wCF%rqqA*(J5)3W!t%z#4KsCq2%qk<9L}ctxx6R->)Vyo}DP710OTJ6;v0zSyNd7PqELkK!`&d6SIxwHW+D> zWKFf4qlclEyL>jjyv*%kv-__>?aE_U&41mQ?ncVDe+MH8&X3I| z)A_0H5xh|P7nn&VHSt(Ah?Fpk-c$_I*^qW1REGnh1=@j7()mw$B7|clrx}YL97TyE zAvND$4OhULusadon6R8*M|aa{m~7~qBPl17bY2E75WJ%B5cq)hvOg)A@1y$>;#RnD%rjd%(9MYk@rPj1JKLK6@N_v!5^6 z>b)s>=w#u9F;`nAO8yf6Np$xHM7%s4aO%dCc^T@+G`hPR^ok~%&q$^Vd3-+TJmYaH zTCDXOvh+A4zq>qlneVo%HqMs7?dT>HV>0*_pM0xZR z-lZQ(<7CK7OMi^f88v$!-6JFKJBK<%%W|Au?IdmLiL+H{{#NmB0PAnuC6Qw_k4N<5WVeIRD$U)YDA%G zUFgML!*sVcaw#xgxHXmc$LtSSIFq@iaj2#d^l8!o062` z;INwxjmR?emwH~Pl9ZtudZzTK)jxY&-hZEEkaC&h3tYHfSRnwZ@g>+Hfsr}iyv9~cecf&92{ zMR%z-x7o4PxN@`(467!^dOGQ<6U6$2Ku**8u2_%V+1;|HEr!>7ovp%-mhSGB&9T)j z?d#fBO`bei=p?=7QgaFJ3^gsb1}>%aBCGW`;tbz@$xsLQIJA%UJ&9Uymz2>x#5)kR z+^g74QZ5J^+^gzacO-D@$U5*!i1B*oaf`O0gP^VL|6O#?C`8PGZ523D4Mlf7N3e6K 
z-odN8qx;PRheP!T4iz^HG#+ah7-Q5Qc(u4b{b-SyetDFAeLY=?Xux;Kz8c?QyBgov zUg3L$eKx*F+M}Xlaa(d(nHe1$u1^o1@v|ZRlwBp5j_ky1Ny&#|)^-1ea_ubTde9drQ zf~e$6c2~X4g#1h$JYUh{GM1vBsne@hjzmu@PeuP;qThG1V)EpdaC2u?V`Qw}pE;t< z^~AuwC11eNiy`Gw(swSYZ@n3_;9H*Cu+LnKV%9s4(S^06kTFn;qb2yvOu+OS?6>4H z2mTnc4*b5@8rXhp?!fqqtOG}jExfNvzcR{RVbLp-dH4?5SK%8DgYROy0^cKW)i-^5 zq&*tVSzJ*+kg6z)jxDm%2hQYS#h_Oq(MYz|6MfblU&eyrB(ah-I(LTBl(EohzEHiy7Vqb3I5=Rv-UQJIrtuf1a>Gf!5 zsv<_NfOE5HPR}fKeRNT@y8L>UMT%UM<) z9lNO_X!E{5zzK3sKMEm)jv{O@-sTTu>lPuRNw{Vf~GRrwA*m{3txry>4579n?7;v!5B)u=;77x-V zSU_(EU1D&MZ#-Yv9@GB8Pv!k&0akiX2Ep{Af^VCCh4N{6!H!H9Y>|7b%55){n^d`%6v`c`a{pJj zfaJ9RS?_{ExgS)yWBs{&;q*0_nfVHeAm+Vnv837Fz2I-+s3Kb^x-EM+y8Bgd)4kw+ zUuK9hUzVB6e3@a&RQG}}@nsfMrn(pWHtf=Mj-bpZrC3^0>6s%b^HG_3UruJuya4_B+~yhV0no&#Zhr5I3eS7CwLm-*?YAq=gsQ<*=xRZ>6hJBzZ+Wk zj_)^c)+ANc<%kqlhYMdhZJ?{V2jFadXel(H*NrkuF(T^sNk8aIzl9?$J)d3&>gjuZ z=?%VgJ*Q((%H_+P@5_G`#nJEc{({q4`{)&+p8t7D%knp)`qh2iMTlpsceuLHt5{sW z{W8id=C`cgbD$?%Us?O##*v_2{%1Lz&EN0Kztfk0E7Dr^FQ#OcdrofhJxIz`gj$R9 zttKwNJ$|{&LN4E3|sQB$KX@s+h(XTzwLWO%nFrsqzL3slZ$o6I0iv zm{$xjDRETJHylTcdqJK*Sn~YAeh&}$^ml7?{;*l!r_;U8Ms^jqN8NtM{aG%X*B!$BV7jql>HoUe?z; zf3VzXM|l7JCFG{JmlfG5%iS(3z?p*+y;(=-WG^*yls%dkz1&}L|0za)(ttm4#035+ z;yqE^`W<#$P1dCT$e!%^ay_k|?i#8qL#M_zCIuaSao{3(a zkKoKi(rbU8$cND5k6ZZLi+Xo%8CHX^5Wfc2J4;Tzc7Y{`;V-l8s3&f$40PdFDKTaM2sIb zoU)u`MdZ)XaZMEw=WulFStw;0JD5~NTQ6)=>jyqjVMNq=DLN6GY3z?Y(yyj;xC&VY2|9Ex+Okkg%1#Tcvg8Oy!kpkr4=tbY9Y z?bkqr!Jx?QT^;Q}KE7clsAJAsNiZa6cvx*lXG}O)v>M-!7GWpUwK}@)Ky)IuEKXBJ z(!wq>k)%K5p)&vrqRaR?UJ?Z^7j>3?Ean`=B?Qq37^l;ZO~=gzZ$HT$q~LB6gxqHG zo`dqA?Cw+fgE*Iw{y;_NfZnhEfR?JD5@m;nqTKX%3|#IN&NLik`~h3wx2UKnPz4); zvRAH-b|x!g7VZ6LTPBLvR8fY!;DS|B!Ls84?-saxUe_&U1*&E$bIZrgRlwh+|56hl z`HUzWzW=fEVKrs^zg#|SXa0wmZ}@RJ=X{#yvqi0I+t%akxi!|g8Ye?-I05hKZtLlx zLvfq_YPGVlyY14>?p1Bw6Fui(|HwIq#`M;Nb5RV@9b$;S`Vrc43s)39NyO>IwE7M8 zHh~6YCX{devh$ba4)5Pbz9`0VIgjP}b#@+uU75AsxNJ4zY;0X?@46NozQ-yTHqM_I z>*{Xrz#J4*{>~m-RREPq1m8LQ-&KCS^lIHA-g^EYEq`52&5T%O-TcK9W78*JHn}P` 
zrD{rb)n(PwPz`t&V&Q0z@k^_#CK}^qIg7R|G>SHr6`en3RAev42%3s6ka_e+xfBSi z;uw~S@<~`y#wTdlG8tc}Ve4dE?5u>mof7+yM#pEcNb_WhC(++8^T$UJnO#~o5UMRL z-yKet#aODoeOlBHGQ&yO5kT4|QX%+hj@%r1@8mdpcUws zwb$cQCF_E(W7L0zHrGv=XDgRi-dYJeV#fu`4x<_U9C?5D%jQPr6f0;c|LrZ4PfQ8qIZ* zED^rbCF$nVG^Np{B;A~n@NpTf0}IQ(Px71^mCJLpB&-EtC9>#ffx@QzdK>bpYF-KZ z@Nr4IlNhpgP_22S$&%3g(()wRe>=z~;ed)l4e`z`;DdFgRm*c7&>Vc?=^uv@rB&$> zyNY**2SVL&tcF2&-xKiBr{SbOE{&v0QU%btC84eY@SGBEmj|USW7s0mrcW2Z4oU1? z8r@%fn1#GZu^@PY>JEOL4lRYZ4q6lo%^%H9y$KIEF#1L$S#mumR^oe3$r5~G5kd|<8(k~X(dQN zD*A3+Neb-uNLhE1UEJ?Muse)SzN;7v=@E6MiEt7UD$#Uazn@dG3Wz=unDCKCh&U$Y zy;s_{oBGx+l9w(HwTh<4z)@4O+KbnfgznZ7bzU4z!EKS`C6{4zLq3I__$-syAL$9Z z_k`+8%Ll@>rLo;KUhSgsDm^ls8X@fTN?luG0l|o+AvfmHH2M&;1a+Y^@tFK8X~U)2 zHswaUcCa+Eof>w*XvFGEw$Nif6ysr)+(3^xB^a^LoW^u~Dn{Z9;S_zLEW2sU+eKq< zx)?q|qaLbkPD!ZUOQfjp()@9~uKNdhE1~ax$@@)Go-H%xoh@Dg7odQXkp0s@ zT52tHX;ci+4j0H}ij62vciTMeRXQGlg}t}L8QXoswp=s%%eyy zMyaqdE+1rEt=e+4!^SzefJ}uAK0nCI3mfO=VFZMY^8#>s3!5Lp-byhyFKqBiBo{;~ zY{c>~0>Z|G0K6gqzbF8w9+Mx!M$Xp8BM2KG^x}FXY+Rxzy?B^=nV0k;VS`Sk@*tj&$EPm`Tev*(%+sH!7-b^ zr{(pSEsW>))18u@AKKS0j>nDbHH@OWIi26{?v%J_H%{=EXs5WbHw$s>ZvLL`Lk;6T zMrP&BoL`l=pWz!4_s;;|mAJpAf57qxeAv^858qBmyp*SKU5(7RilwF5RkolJ} zE{T9C%jPkjU5E5Z`jw1l*C8L0_%_C~>kxG|LO;VfE|gtw@c9UM=p4ID^6HPD1mN@+ z;f3g53cz0mJ}#8y;PVhD8WkdZoTrk{Fp!$e_;?RjD+Brkv~i*AxlQTw(H|_13uV1l%|H<>J;#Nz>y|5|#A%FQ;K}E63#3*uuB^kBMY;g}K-joY%%kVC z2&4`$KEcEJ>;b7`jAz#^J0-gA$Az-%AKu?X z#Vp3N>lxms1E>A;xR7eS>|Wbue1>Nt@3)cq7~{$daFz&vrSMJ06C9_nQ!@4w#%nx$ zpN##PaHH14eWI>SapqE0Nm8cy=A~qNMm& z#Rp|2|Xnnquj(-#xRImXdy&aYtU zEht%dwU8<3S=JQhn!<9XF#Qzfog#8s+7#xU!n{+NcM8j#%3M>Y_=?($vNo@2v+)#v z6i^_)#~^~0tt~zHwNv~WYjZFC%BLS($v-;kM{V(wpa!nyHg|4l>)zPiZc72p?Hygc zwl7CWky*K|!lv5Y8=6Zlmf?v3TS? 
zg&fI~Asmj2*0!mwwYh5@Dyp@!V|DwQ-fmSF0&1T0sW?^_t3P?-9MB zxwESkwE}Ni)!W+Ed?VbxZCzfj-QCjBv%0f;eGq|Uw>#mIE$iUJkT2^cS3#DdKqqNg zbwe-pRs1q>5B?@6>)4vN;qr-AjP?B2IL2zc`!srNy!PxiR&!VNnmOAiy4%#n6>?jT zK$DFN_<^lX9e6FS7!~59JkhZW6pAvtyE=ji)^`R=Mm6%ire;)r^9}8GA?>1V3*_`* zI4a1}gPz*kQBYV{Z_nE1l`X9|QimxZvpGv3dP!%&-j}8FykK}?s#g>iNAI?SD==HT zm?k;U@@FMJxYNjCt$_JN@&*mRJplho0RC72{&E1Wj#z13q3eC0T*vtFs{-(k1mK$k z@J|Kc-wME=4ZvRuz(>pVrQd$_PSKCg37}sdfUEb5emQgjoPrzY>6dHvoS;0RL$K{y91DQF(Ry|5@TxKeM&^IjG?^8a_cDeGq+8!)rCXR>NB~ ze2#{HLBs1b{JWZ+>ooj70`Tclj=vq%J2=91IqC$C@b#K}2XP8iF1@^uGAeMrT(3&p zFXxQ_oc{L`3V!;f5+}VDKFUAU`GTLG{u34oe*8sEj$Y4u1Ub{|_3)zr{5nm(PXFrw zT%B`~J#@CJ>?a?i;Kx5N;5__00r)ck_$vW;ShG{Nr!oMa8h~FDfUgL^&knFhx-I(^ zbxl=2UzGIJ?mGS zPpW5Keq{hYBLII~tG5kWeIC{DUJa+8l%ha-H{zq}e;R=Zgzvyd;p!yPkM|KVf*&6c za321t0K8bUN7p+>(o?Zl;#0^TeZE_)$-hOT4{G0!1?c^phHuv7+^gZYYWPzU{~)lm zhW|?9WIvsfD*M%)MyfZeLxrCVkfZK4lAO63{dfWp{Br2eJ}LO^nI&<0P#;%%lL2zn zn+lSz*RwvJ+@i@brTs*&`!oHxKLx_|_Eqm!{C3jcTT<}bN$-&;_{+Oo!*4~tDpwCv z=huV2E=p8oiFo>5J=sfkxknI0Y)#4fv>fzJovnKmM-*&cmMyz%v0j{n`r! zzx)dW@HqkassLPHf9mDBJAnS70Q~0x_+JC?an!H{%BA|Lzr6DU=vxEuO#wLlUqprU z(yvAr!oL@QtM!%N&N*^@MYz5XU8v!72BrLmelVK?(d#(<+$RN6Pj815{TDQPT|WKK zT@?KGJQ0Au5P<(a0Hc#l!n)8_^&lw-_Nd-IN7;QqrWWxe^|ry_Ig#rbvf!yHOZ&B zPuV#WaSBvkddsTt4-<$$xL)3FMg>l5Ek*xP4X@Sk2PE#-`(OZmAOL?#;$+Vfe3ah* z(r~@K^z&!EyzkP+VI5cVFUeo3$^R!wPq^M*R3{XuTzdWM`y;*nX$?uiPrp~djO+CO zq2aoGeSXyG&)3SO(@)fJoqn{GPvu&PkE+j!n%>(qJZK$7YdH!eXBj?9&Taw`sJwmn zD14AWgb(7A#z*05e@Al2_Y|(~EBNJn2Q(B2--(Z+m)mHzM=#e@&`|K}O$wOluSHzR zNlDzVw^ibPIkqN8w_o4?_G$7z9w6sFP0n(S{_7I=+oQMRfJU$GzEFMAnpTxd-3%gJ zuOHR!e)`Fvqd*czEQGx5_ zqCP^wZ;x7U`^)utLC*4Zd%h)cznsS;PIB~eJslwD|CdWR?*A?AMSezs+IJ;B%0DkA z5P|ZkEDBfso#;QJ;cEOO`gIyU8*vJR>+8dyaZ>FcN&X6al-{6m(gh6#zuzi9Cpo&` z-pAC!o<78td|De&AUpN)QknwES&5ILKa={Qxn9u+)lUL66iAQ0-k8Uz(A$i-lJh^g z-t=~lY4*z{CnM7|T#Xe3SsEUZae9!CQK0&N7std`my8oWTEnSLDG*-5G4Z9&lL;TA z;a`_=!b>%LnzpV}b0uZprQu^WdUa2Ll%_80|vlygjc*&2SXhTpH@=V|z}8h*ZppVIIPG`uP-5|sU34gZ0Lt2G8? 
zKda&I*XVzz;V}(AuHhfh@G-KZlHLg#PWOu`5MIGC@ulx`FVyhAX!MmDZb>I5IchCL z*&osHNgDkj4Oe3(Wmn1tD9NeP=vQg@6btm*~a8mEq04Bd1X(JF`0nL z!(GPhZd=zPs0v&g%emVss0!X@Etn-PT}cysZ0csKU<|(BS}0#$<`NfRCwF&JX(tyj zKIgV8)m8OmTY2W?dVeC-%fQ1S81B78lBY9=`%&b*{gF7G%gM7_+7@b7J*NxA>J|FFw zjKbn3^E>GG?%ZZ=7;ZKrGdz~z>Qh~YKC}8%)Wb>iU!guT$$qXvuBF`^mv^v-?+V#V z(R8=^OliGq)@RD#b=Naip@u+Qz8<#O)g|mRt3vJ4_ApXCugD4wocevV)G>#1I*;6# z+}gQ*eH(7PPsZ<8woS&bYF*l6tfR7&h`ah1$Li|;DT{s0Dnw2=!f2c{|^EDjgtS;ci?|Fj3@i`y&19Wl>F-N z0hAb3|HMOKw&eHwPnX;$s=sR%)cN&yCHM=qf0b?dBOU)leBfc~9Qk-8^Dx+(be z|2RPZ+SP(Sf_w_3e;hu3{l)N8BJsE1F?k52pJ{+BPyguv{atH_I=BDQd5~ZK)dBj; z$XEn_dHrd+7g)%DZZDvp&X@f9KOCSxCiTBl`Kg~5s=tKf&pAsdP@aPU{5vJTx_?M@ zLBU`CBJ|Tj`u9tI{p_O<|MUQULmuj>zprQI1@un^f1&!nN7jFpWF-Hi^E7|`-xQ!f zArA@F-|g1PuRn&yE2O`yLNM+W5AWOWf9He0 zQ2S4B7c6@vC+R1Ve*M>gxsd+7M2t{_{-FoI{uKD}&q(M*h#!>t)wwq5zX%_{{;dJ} z4-qlK81eAF{rbNcz<=xv`JV{jH!c<`-{ol*5x@S|0{F|$kpDc)uZ8MACi&IxO(-+` z`p*VGOvtPM>*+EE!d~&<`G?k;{`#kTN~C{^q*IT>l_K^H1<0r3Cmsr$1MI&?PWjU&xRSEuX@$kO=`frr{{`&ip3n=`c?fM z39!FIE*vf*W-TbY)iXA#u>JQ){!101l;N-c^TA)p{^`Afp-hhdWIwHi{Px#L{r>iQ zcDmrFJ9Bza{K~A20rpqPmqVv?X2kvWcL&&SZ4`_@m;7XZ0w1Mc)&FM#?60E6LU^bC z_gsMednNxqA;$am+y8Wc{jp7g<(T9o`)ly=+fM-1rQd%WX9~t&iHG;?*MBlVf7xci zvR`tN{#tze`Y#O7Uo}fGzA5?D!>>OE{zCrutkkc!UtIzH*97U83-=e5V)^vzPX*|| ze~Vzy_172Be`|pLva1CDJL$hUK>yCI`TFVZ2FN)m|NVA={$ppT|A7Gg%eDzUdifgw z`Srgn_51t(!K(%T3i0s16~F5Ll)rJiP^r(~ zO`83x{0~a~Ri1?+^0L&g-!Y&ZWw&}Bko-P&qinVx5oyJza$k`A{(AEGRf9_rQA zchxPf(pKAg+gk6{_Udil%58foZLK$uq@bXJsGz7I{#-y&P(e^~zu#wO&N+M7wSM0B z&(~vfcIKJyJkK-F%slhVoO9^+np(yd7ZnA}RTTJoz|9$IAh7K!H{B@7#=utt!vc}? 
zQ<3!M$g*AUfM_LuT^tA;u)50dH*mIPubpje?Y^+TA4RFxgMmOKb8&yu$nD+PP~zn> zEBS3miu|<_HJid)tVAa&M^-a?V2sr@q?GsaGO&3LYhs9R% zO;lL=VbDtcw`Rw^SZWWABzh_%)~XHAsjdRI;TBlFHalu zathZ2$f#1>MpN`bh<;+(hYEGNMm1ThdJc@WxxR{2r4JYYxQ9YRI5Q=JHm`wZb*UZHE0w$jV31=(I~eA z`aoQxfY4GXg*ycLZMaGSCmCR_4bj}ST5LuEryC%YqXmVdSkUM65QEXe-lXg`Nx zH2l1E^JEgORa>Igs;Ys}aMd6yHCo|{n$qiG>+Ma$wnY*?5BXw3O7)%{kgjFpxi!oa*8FrY}mX z)|#UDw~&ce7mp1`Yl`Dd0JT&H$AnvIg7Gji(dy6`l!W4EBGX!3G6r=@;?=^SlnhGa zCy+r+8B8%iRI-1t8m0*JjO^~g&u9MOGyh=b-)H{eGyhQCpX(o*)0>+UFTZ0Ye}TmV z>qWlEf$LdwrkO4~wnm2z-=w>h&|yNu1}kwTYcpJ^me8RFi_QkF@N1lP_G>`I5)2#$ zqAF5|9NvI~LJFb7A5}`n4xk};z#Y*8`OT`X1yj)O>K0iV^-3YJ+JJ5LG~6wO!MkU{vq}DPUB9e9-<5?jbvhdx`y3+)M3BS#FD1 zc}zG|6Hm)ptn=kHRw~}#-w(De)k33mD>XyWqJ|a%ZGxgQ6S5;}LMx@YEQ);FDpa{|GGKL#vR_fnDfLa3_Ygx&;5S;x!=(DYwXds}X2xzCT z&-{hnYJBD|KJym{&p-1QpX@JqIFa5IS@!ZyI1np259bZe0;R%(UINV*^qB{=#hU zzkRoB$Uxpl59+(Lwl)Ot9DYMsD2tTqJP>OsAk0PA=>vIJL0De{Mwe z)x7n67v}@G)mgnW&kDCypytd|!krd5oMFFD4$npshrxl7OQO3LTh(p7r$S^k1&3;S z?<)x-2q@~k@5V5qfTF$^?Z?rSN2MvdAYu0G5mrqS300j-2NOh*i++Gs3e_t2c6%y_eM< zN45BDRB1F-Mv0(0R5?DpL?l|^9&V*&@0$>A zgJt{PvXAuLxOIovH8~6;K$hsfL)Z7ec`PgiCSX*KO?^2w^&`Y~*we4IR&_^) zt&JphVN3rI8Wjn5TQ`3J#Xt`!izT{?qUX1j!rVt#xb!d*tU_!;*C2NZ``qX z<6df?2tiWy{1(<+13Ur%MQJ64f~bw=FQ}g>nu0~o7aYT$Ts`8~n`-18p9^)byBf?( z!&PPRslc|B7DdA?WkvCc$h4LgN5idU#qkI-ZKc6zxUDQ0A4OCMpiq1mF(rUt8!weh zSsB}SRxYD@14I@3C(bs0WOw!fEXdFdv5my*JFVoAkj$h(eTTcMdtaHSW>SZqNiU-} zdlQGy?hZ^iH+IKlg3*5iTSxv{3vLp-Uk(U+d$^2_(HMr4)iC5dk^!aX(VB>`JP~-r z=IE9y!3Zft@?TLY%+xH{f`&-`L6s`8yO1xn&*N}yt*+(IteV>1(=peo>oIIG*J|qN zM{E7_?E&l=2w#F)P%w&Gm};%8g{ju2Yhg5QKzrASa&cAxc9yDQv&yt^r&O7MS)~N^ zP4>>Lh!)eLlxw4MoKsYebIK^kc@?>FYo9X1ak`eRB(<^@rxlf(2g7MeDSI2H$`<87 zw!%!(P-uwxi~DexJFdM4%w8kj<0zW5PMY&;R>+L6HM;^jM!2dF{Z$qG)rDc~0pPD- zOYv9d+OKfxtG=JBz?~JfKrgYN-xtua)DK34etj8xEp+Qv?YV&H!(D9^jkqJ+rx%)4 zJqhSII%pET+WSCjS9rWaFR7Rg-YwR2ak|wDU@=&(2EUOD?dPdVkF?_@j0FJiuUM)Q zf3%oj-~YCH1yZPYeE8uSm0a@$dn+WtMQ{Y)#~O#bk`?!=#Qhc%8~{JBUWpVGoEE;V 
zMv{H6*;&+lhSrYHL#;nnbOQp(Ns9?Ou!pNRAq9aKqLxZVn(WQgm5WdOyyj&+JBd9N z-c@`o`iSG8x)POOLG)2?=h_)=O%w5_xPSC0*1q?bH|%3RSuvOChf zClpD%g7vQv(@Gw5MZ=q{#En2i{#*%Ja&%?!US6TCy}_IcT|(5-BbbWCsErgBy*;Rd z6c)YJDuqRlQ66X|_GH{k?O}=TN-XT#$h5A8_uH5C#0p#Z zSnn}CV=e(|x&w4B)d8;a)9!AhM8no&!|;a>zrRL(##9Wz_0psL%-xVg0q zhh6(&4G3*x?A%sXY^7dSgcionEoC_D+600VLnFSItd66t)bAC%wG^>*YZ(r@{!J0u zOgzo)L~;?tW^Z9l_z!cg75O#~f4W#rHkx4_=N?it_Y=2Yg11rht z-;{nAzEL*W?{oAZu(vP9z1TjDEhw*pQjcTyJ*VAJ@>_KXU5Y-E1?xZ)PPFSN^nD8b zU>U3n{>ew6HZT~&AO4p_VSM2%t#A-{UuB-{t^;ULZs>jYW!aG~KOu z~ZGGFcJvr4NiZL=TfJPreqpha?T`0|T1@)*( z{tgBM>>R+}1uvIP%0^qWU%}0)S&A+W$b2X{8L{N@Cd`TVV;5j1yvymDem6So(Sw_1 zhsyoF9`>;Y066_QFc)!Lw&RJBJsw+3`)hnobUHVe7SI^w0%lm~$=;cpP32~<%gtVu zo1KDeW-=#F`k6?>W-HnHCAa`;^z~feKdEfy>S8IaHQAtKPcYe#WQ$FWaG=SPb2T1A91KoQ&zkJqZmd%6K-V-ED781K^J{&X2H)?DB#vSsc?BJiIEonOZ<2so+!G{@hU^82qa}88 z;%KRTnlds+;pyb{3VAspLvn~}hu1UhCc;F4R`T#irlZJ12I2nPLAWgkaRH?q<~V** zM^4E(l5d!m!J?+gnj#Qq~Sv=c@n&|9_yTaj*LrPE>JT_ zCDu|29j05!vlSrIRm^sXkABQEA+6*I1`s6RByxC)4K>fntmHn#cpL*=A!u+qG~ES| z$Cyu=@)DFEhw{f*?mS&hK4i+d*}#afceDI0XExtq%FC#MbiH`9BYL#R5}ch2l%A&? zXOm?;n^q}v_5c>YCRxLFVX!p~+piArR))A{@QhBPVJu}<@_#T?K*ON5hVbzJJ`P!3Z?cSZRADLKPa%qV{CLo=cW4-o`Mlrq9bbd%oCSjjP%n(l~{CceO4 zIA9i~m(c$A!LWjO=!psuZ3o`vP7KoV@~^ zGmB-NA6}O}Bn`|N*k`J>J~J+-iO~NA6R(_Y5XEk^A}xH@EM{SY2-4KLg^+^B7U~6q zPG_0k4w7UdQcxU^s~56A%9$)o@Jzu>sKUG5+o4d~yw6Qm$2?-=ov`h~=)? 
zk2mG=)~5CuP{E1ek|zO?nNy}`WO6=SF>NAlyqCHdH=cS6X3K8GjqwwW+mRE`ix+(S z^Ynu2vE=R*FZd4LI>4;`B_=dZD-p}%&d3=O+}1i%X&+V?D#B=9xeu(`Zqyu5je-AQ zeQC_&wC8h9)O^oLIqqes1oKPbGL)Bj@O{V2K&hkRN{Z|jco^kH#6w6YKJHA;3o!LC z;+$|E7>opQj4wKJFH>WeUIKQJ86q9^rqk0lPRKkn_+KD9`z5FY)_4LX)ynIl=}qXu zjqF0VO+Q4dIpi~AJX6>=j#fDsnZ%fY(D5Vo-VQU(w*vwmI;O|_Oz)wiF%_XKu+r1p z%t;D2xh&)CLE&RG@}tHgsb*c;ssejC1K4i@`{5|W-1-K%t9TtfHB-2VV>LrRUc)ZD zMuPS|)bnu@v6M(n4uecw1>XqWpoDCooOy9qnHu*w1I0uxt4*OBZ^$@t2mP}^Wac~J z`3{6))JGP@7pac3?n`|BHD7$b6|=Hw`83Ao(T0<)Z0W)zA+7odP*$udt*mbx&ak^AFvfqS%lHHq|eK9xtL~izfa z0mIhf7(<>n=C-BWG0lCz1IOabydqc*_Cud2+tFs_G~kUc3J|aciY9jXOjU>|7!4pZ z#VLY4o+B!3AZ@vjYl05s3kDL+h0F;#kp27m`?_G{dyWb>zMgZE&!!q2!K=gue#9fXOw29xP#>e0G8Wz z1E7TxzAy8gpqIlG#X+XEMxZ#7x!fHp=NgDRCIxauiKEAf2I7uMflPIW%RYF=$8DQxnMmLdT2V8(4_Xrt}NMBX!}WqPK*>&zs;URYeICAwl9okKSOh)nPFa z+Z9)HL^#_V2IrtCd%Q5<0fRo@^Tx5MvsfFfJ;!{9%+~w_R)1oTEB{hy-&0)2^|J{p zRiVVAb=(f1ZKi4OBhb&TKy76oEKF8v8M4{KS*}--zn1yGAX<~>jo2>eErk=LrQXN# zp1l|_DyhcB_MAB4JpzUSV9pPKyGHcP34c;=6IYarQ{&(W`5iQI)*HkcS(ff?YN^K@?hKm#Xd=vTXC^&N1 z$ElZV|HW@=+sN<|+n9rgRGTiFDWmIEw(}#?xs52xzKgCfc5}vud)k_!F}$M#1JBta zr5>Zk4+#gJA45MCkII3x^XUitfA;$TG3;HU!rsH21J3zN>OpFW=st?Dmy-{6Uhl-j zx9LI~=hkt1UnZclLr7gs77}#p83;3&&2egiE^Jljid zk3O_34_|V6W&rs>S@PlZaQBm`w-Av+n;(G-Ccl~=YW{;FVZmZ1md^_`T`;^3hg4V= zBAJPJR_+|aV}+2NVN(MEFJpr782(xw!%yci{3SewKcC0&XYd%lk}C=xTnd`cE4}&q ze0d%Yddm|zdHZ^|%)SO&NM!LMX~VNta_#HT8Y1jg4=>3T_rBwZ)z z5=jr1bg87vC0)jJ{4Q8*1_X$qrce^k?=C@tz0R#~7d@4|4VK<+HlHti4jU%19G;}% zsQnZKM&Rjes?~YWlk8*tlm;));l&0%ZNuR&8qOzf>NQuS5QqIt(L(kjwOcub-BV->EHwlpLe7oe;AnBCLGmpe*SZL(W!X;>;hrRK$UzJtVh0g> zoUQ(YIS43X*;`=@vlJ!MIpuQZun*`ffrJQA z2Mql=4l?;)22$c2f~=!uoERf{he5~@ALQtFINbB$c?MCI-GSI#2HAmE^dQ@fL^HPC z%@+*8Wcx1m@ElW3BcD=(>KAH$ME=64TVZ&_gOPzTv{a>v?RgwO_GP$->~G*+!Vj=9 zc20l+B8jy{s$w7DIRt)yl|#7CPxzsquq>B=cXKlfCB#nek2GM;oXqB+DfUzkV|yYp z(adGVQW8;_pkzj=Oh__cQJE6Sd`V?WC3C#Wlra;(Et-BN`}F-WlyKN^1Ar{Yn7Po$cUGD~We`O&83od@W>~lA`yB9^- zdh&MSPfvVizL6e>=no8Zc;=mOso^Nf?t7U{h>HFQUsUkx1TWu*o6c?o#H;@Ss_4EQ 
z6eZ>XVuX*C`Z?0se(c00pMEjB90U$-y?jcH;yle>jU@}7?>kaJ>mQPwu{`bb1;@85@U;S7s>ct)R`n>=c&yZ~UV|xH0D}FYpCcFg zInFmV8XW*>X9%xj;vhGaD&3*Fp+znHax8CNu1IBHQia|)=$zI`sWG2zn)i!R*CJwN(GrOYI2p{N`_yG90+$=X1@A**P)c0ApQGAwhpAg%M17eI zmVJapZk#xeMVKg<446f|xt%l#h1t`DwVschQZnBp+}vkYk<@!IJ*SikHa$DN?I_6Z zKvSJ@=!^&Wbmz4GH|P^IW0e3gz97fM^XQh{j+_nfLfI8S>G>(4679uM$q8aFP~7IY zIP|jr$o^F?`ulO$4(2ZDKVW~G+_^V$t;KY*ct>(Ds%u&5=|&~4)7aCOGyT{ zvTmq<=j;{E_o4|rtHONG`;zW7lugKdlOLEjui1^abopt}R}toTzAzul`gqN=T;8I^ z+pNbt_8bI6vm9$Ophil+ihvXXYCf<)DrC3gUSikdju!>_c?`MYrGEb4qbR&^PVc}^ zp&f7fH>kolIq@|Ne_Aw3Uc7~~arfePYGA7Zwz)vxx8v7nAU@b4$4(cR^Dm}E0i#01 zjFQ)QN6TxxIWLo@s)3r2xwLXZddmcSOq@8G%+Z?AAl|FYKLq5QrSV-mmb46#Mz;-Q zgE!F2JKKQNV;VDSP%WXI1L>#v7Opu_;MYAYu?N-jXUV{NztwIbf>s;UC{`Po_X~0W zS_e%cplzW3f*f;L#ZB>Xq|X{Ooj;ue_4nhjxN#)$FT*+uSX$6J$z4ay7vzjD%va?6 zw`lO8L3s_n`(O-x&FRl48KoL$rliwFTtS=(X?~7U*p0bj6KyAfae?=ZLjLlc% z2Ve~0ihpA!%CE?29r+Xk|ISQHO$Y~IAc)Q-{LUO-k#{SC`xW^^ir{`leh&yxYXJD@ zkK5H(F@OZD5|ks2|+k9laob(5OY zy;@}R=DiSh-NcskE<73<*{^ZCFV2iQy=B<0^twp8Ptg*+1ET5Ou{Aqur%sD~ZWCYV zM5EPugSR{f#M0Z;Yo7R?Lu%~d;}&&61uMA@=@^e0_<>GjGTz6fj!@8ARg>IqC2t0~ z#d>5!Q}QkQ##s7eyrLYl9@#FuT9$+Vo8y*OJ-DX=>sInndlAZ>i=_|6u*tSo#c&!^ zb$9%Y3F)1&^twIO`aG!_=LpT#BlV5R5A3ftEg$kTq{d&2EDZJiLqyxhIz^>jSYaL~G^JJlWURv6K*W#+o(#G6GEQoUfX1`O8h zc`-2VqKnS29Xf8}Qyq}rA0OEbPsMy9$KfSjmKiei~N={ zfvXZnKJts(ohliar+TfQM zRD=^{6DuYKFE9RlMfjqM`bfo4K*m%IMwKxY1I8Xl-MISl8l(A-LizzsYhpojMdfY9 z(Tc%02QR9qO@t>?bg1{(2KkKlyY7T#LP4|;g{p`FEdp*AFdpQ=6X zxJeZQdj7ug@jzfSs@a;y{p8`c1@aQ1;%}lc5z%7&ahEyiag_f_7n>OhyzBG2sz#8{ ziSt~+fL6p;77XYCdJJ86h=-2%$HB#=n|VPd zS$Cc*7^uiYC4Z{`owMfHD;PN8SRyFF0DJA&D;U6QX?d<-0565*xq<<_PL}5i28^f4 zEerE#2=C#l3nAf%Nh7mIZbLZO~3+K=1Y@aoE_U!gW*ZT>JI$R1W&78L&2V_WI zm~fhL-Qu&4sVT%<724;{pKH57sXIG9f2KXRYrad?Vw?gWkJ)q4g1O)AaGEDt>hJ2yYdhd(cuT`qWjpnY7+C1WBj?U#%ldudZ^duwD&OH;c8Qu;gDIWrc` zaWeD80;WYX7tZu(l2gQ5?ThS%bLU?bz)xq#e1`|A_zSeR&$Mq?(9v#pb#`6fv9P_f zYo>#Ng|3?4G0Vx@GZtReVYkm%ICIWimpXGneD19F>%bgpIG&hOfvX&LRdV4Ke7Q(x 
zOkn6{gpy8gXXrzb=0!R~GebXzL|mkE6*BbmNyJ4uR~ti*kci9AKc>jX@W0eU=W1{G zT;ZW}bv5)E9=ccmIUc%Ke-okj_3^fgUdQlXsV6HVl@T8LwgUA20`xEICDqSoLIJv6 zfc~=r^bG~*dkfG{(k|Vv&v^ytmlvQfDM0^40s12a=p#;29p=|(LIL{K1?aaFpx|D(ox{V?@k zt?8XU{KJ~Qz(>DTZ`Eucy;tKG`{?U6eTk30QPY?D=$kY>>7zfb>9_jm+cf<)AN>VQ zU*V(g)bu-j^j9_gr#||dn*MVieXpk9>!ZJ;>G%8S?`ir2KKda||E-Vyk*2To(btzN z!XJI~kX~0-`{)BS{b3)yT+_RK^h!mX6H)!C=cDlX@0t12IZvYYaTpx;N_QV_LF`-mngXK@ycE1p?meb&_i!P z+URfWK)IA<=;lO$az}gUwMg@#A7eaeF6zLGbnj_UOsIK{CABTm%KL1_3eDclWXyPQpz+a#L zw@CTwH|oW8vZ<(l{q^~RQNH^3>iXCIgZeiYs9#IRgv*@wtK6~UcNyyE8$Sy!S2fHy z;rL-(WBSw3?|k?=B$!RxSTc9XC)u<0QX)zH)&7NxWfX@OM*_%Dv|$ zBCXw$L;XVm`3rO)xmwR(%4gi=m;bbu@Av`qs`O|ex%IA>s%P&R$fW`9SEPK#pMLpg6p$}32LQiC*Ec_Y`LzY)51y%Ncp4aRsVh?SpWLv zUr<1Py_R3E>#_Z(;?FOi|4tz7Z*UqONKQ~Go`riaVtJY0c3odl-TUu#eV3wArk*{>}==J+NZ5x)oMtX0x|`eRX_&W%mfHC`Mp1DpK}wm)A#rN_v=f}J!kKI_Fil4 zwb!)QK6$(>bxpVY{Ji+T{Jf9mbw0(Vd3kGY?JON>#Ut~I@-E6tA3fgHe~G%(r1z;f z+g^T0k3XmK7j~?114{Gq(g&&cA5;A~R^M8F$K~Hn<-gUjv9o?E^~p&evigO|J5J3c z^V>S!?yR3>?_2+g?BY+{(c}Bk03$>H>3{S7*SgfVChzF+R5E{OSNJ~CS)a(Ze!R0m z)PG55eYQ1q$JtMn=kHE*taAgR`irTL@mpAN$EoRx{FMnI5Y?aZZ|av^-ATWN(5E() zzqBiTp8MzZFKAkthc^D3|96+Vf_@csjbo21bCU~@)chag@m%_JP?cLP@>Xl_fTU5S$+#vnTjbMCy-21GxtM%hT!#27onqQg~ z%%yIsDC5T0-<+41a8nETHx!^(@vo)L4Oz**`8D+BGI!B}(hlJIkRWYyRdac66lMo0 zydWw()aJ$TO&qw+ou>f^~+g?Ry(gKYjG2P zuR@?))s}KYR){DoUFLVZB{Hj`6`Cx2sIQQCpOC0=h5K@pPPnGs-D=#Zjmf%JH{nlE z*JgtOnuNuA=BG)#vr?~LY&3u@Zwu?`!n#>-H!;F2y4~bz@6`NIv`wgOc9{v4>alo( zOMmMD%}LSghMjvQ-x%ic5n`BE`jk;1SPfv&d&q&ZY%|0gVqCh%y=3CTVTJMy4FWd6e>ns6$5CvuWNav2L;>s*9cHrzEGZ~ffrdZO~PTXE=?dZ zQ;29*DrZ3Q>6*WtuC}hqz^&Y;(f7K{HzhK}r7k_>eyU)(8Kt&JN*m6wN%_TfS|tT# z4Sj{dDqUJM6)SZy>A*sMT#dvtJsvS^I)HMOQRtyQU|>@L(986;h>#OQ5f{ODi@K@9 zbwX<{Z;oo!WX;jzI*A2d8nc$C>b2Zeu2bLdsKt6$`=d_*ICyRV&vh=dSwe;C*IoMl z`>dv@HCjZ0&914SfzZ&ULyNtSF2i`#?9vZ^o+?+gz*tRW?91@2LUzuBJw=x}Jg*-r`FZQw-LBt-&1xj9#~x2_mo&PD z>qpX?aOb6IuJ()K2VM*aSy{%)TtO&e;Kqk~$C@eOie{T)Qon<9{cdv&KfT8K?NGlv zTy6DdLcf7`tt?|@UHYBj(x)(WuKq->iS%sVHO_;a!_@V0HSUrQ*RTv_FVdS3IpS)o 
zBvf3bQCxIE@mUcNg)&C#aOof3s;-~nnoPHJB#mU!+C~j&ZHQ5ZK}G9;DU-V!x;47m zudyf?Au5ilnVd$Ku_}5A9L)qRf9Qnrm!!A(?{Rc@g)3T~$__wg8|g(3>&Mj-xE znh7qN4Zc#Cg34$qs37Ho2sd5}q>G?JfsEBuy$dRru~IrP{i)G*x^#MsQ2#2|a!^l2 zg6Wm8o=+TptdJF(v!Z1llx3#@*O1v_y!kG(ZkSD}Nv1|*tO=%=Rjk_@iXjiR~D-R3igQ5c#%C8YdOxj|N z_qxk`$TTskFufuy<-Il8D`0_Cq~C8(1ztBX+b*fpUzefka<|KjjH;#fikn^iHxb#o zMN94IFi9J)ib6H%Dr#e_khC+qsn558wQ4sUGio7C3peX=op9R0-IxtrsyFgj`Hluy zqb7%4?O8&H!Xx6e4wo5*+*kcFmwvUzX0HfK!RE+K35xK!E0};_x@ww?n{1laFhrQ9 zH(VyGR^u@mFa0z<%GRLU9j)+|tFt1WR*i3Ub#H!=dT84wltukLz!O=I@Do;I(X3K8 zWg*qFk_!*2+L%_XAuO8?b)#aGc+f1m!r9!7*@Styotw(dnySsN_LrchXEWlkUGyxh zky=xOSyQwtOkOm1Z?V43oHRvg^luTN8+9>7IooCarbaT3yJI&94^3Tb8KzrZH1R5_%Xj;y=dtF9ON@Hm5D59krPyyMaj2D-dflheP)sMha%n1*nOaht2%D)M$! zt)V;(Z zGs&~if6`2F%(9fts1(jn)37LAeV8m$QW)I3@^+oBM#TJ zY_PEcUX1l-7)ndTh-StPro%32uo!Mv`;y4U5Xa)W8^7rGSg zb+uPTldv=L8~2SX6a_XOGrQ)GV|G2F1p$VekU2$5!ffMxYt4>d#F~{mUgoBngldVF z*pNA}9Ri9@TUega%5DTcV+bm%)N~vD2D3-t&jik@X##sD7j!v;&Jd|*&`%*w_XDCe zi|P=rIX2q33%Txl=!B#^bN#0nf||S9o9?0|SU0mOiyD(b!6Q%r`Kd%CQWf-&2iK~V zyIhyW^tTxEhZdq~E30`iKt$YcbjJf;l#>LAPdcfP14~?Hq2x6)v(lxjM=pw+8mz?g4cw{y;ro7KA^x?{YRF<;aF|de znU0A3uu(4PBwxyy-UN|AUdW}YRiGLdE#eL?f>y>w*bLWVp9%i0Wg#H7jTjFwOIp1U zfveW8cE@K!!`EqD)Kog5OH}Giz)j@*?I;8S{~VMVVM-O@yI1hSExb-F1O7M>Vgv>f zA581Ictw&iGPbJ~62RgHqDG(=ZJ+r>AlqAILuGOj=sRCO3iQ2X0H12DU2TN`c9oi^ zqUEI%kwdxV^VXk>cWh1OzP=SXo0Fl(0iH7v`1?|lVN_{!A^DufG*}tscU*W z*5Je!t_ENl1jA#qus94z*sQKE(d18=aCcw;a znWfhVXk6{|S}{BrzFs;(QMcWw$w)0ec(z7uN15H)4$KNlRmS4+)^=QNS@E<#BZ}}E zjB35*LB-lCn06EtDB55vr}!wYJ3Mog(tab<3$WUS2|U6serH!R;HTK&bn_lF;E&Nj?^c^pdE|OB z&_0(w3Ta*aey-{H9VRtk#jb|TS;b)j*Nd_6D1Ia96RTtujXR4sdKxTuQ2MkqgI1($vUXs(cT2W3cArgubhLqE+}D4PB$zJ?&PS7dn5I2i{)%jt^%XkP-M7B~!nL7EPEyEJQ&HxD2R zO%Kqhkw?`rJ_;O$Ns3%Qo66Wd4`NzNz?An^$ZQ;=uH^z%&5|@RXLdGs)$_!a3nboN zW57aSue&yI+%y3&i!f-g_Z4#3c_US5!?!~0BXvgzHjEoAVb-TC5U)!-m%fLzsT+jZ zjWmk!Xd%pLafZOv!XpBc@S+7kWs8{jlBNj4?;LRp7!YNkNTLp05PFSVg3z1B`ks!u z!!S9w%-U8S5(8r1pC*IH6)$k<*;@TcS2QiyASoAxa(`ba!|7H1Gais3Vm93JM3y+- 
z5c&GWhg>F2J5VwWv#O@95kxuzMY2$0lb@;(%BAsopt6|lA?6|Sbf66eW7Te9BbR)P z9P#RRxDfb+W^Zc}U&mwybWlVGnJx5vcD1-8xbUcKM(q80A@&p7CadH`Fs5xEwMw59 z>LR41`CpEz#c{;w>sQ}q&5Lc@qay@M0()tAbtKO;61g!mVV$&glf5%v&`UdxY7@8{ z^*B#cBRs~u#$&ajyKoqzy{0B`SfeI~JrYqA2HTEm5d*?tOK+wI2Ai3Rq6-Qpv=HjV z;AnIGwEk@wH&O&nGD!RNW`V+oIDR)?7%*R@I+-8+q`_i~esE`q!X`N4Q>I}*5a#%_ z8_fdNv5^2u`^1hSL%gB^hvEe}BxQtE66$lsimMf_aJ9c>o`m^w?XFR--8JxCXI=+d zH(16<_(#Q%G}Lk|qG=q+>u1LCr*StC5+*DF!j`wlVMf?Abv)B$N@zq_W%9fVU}qK# zennM62~@X;-6oY2$5wQXpa^PJwodhuvhl5t9T(~#ghKvf7V}uV$fbWBgh6C3f%5B2 zZqXb>E{o0f2@1XF5^G5dg*R=o*vnozCGgp93J-<*0q-yyGmrHNHAC94q-ANdx0pUYqQd)-K!RBzg`!zct83Xc zg_}fzAfFJk1euz=+3s(GM3n@2p>{mRY!S81_tQ)ivvu};j3#ObN7-{m&$@lnrM+Y= zG?*Uj#?mPM)c{z27~79p`aN&$mX1jb_C6h)}CS2E;U^)u6%2JD?2e(2M3ERV^;E5yxgT* z=~DcCfiXYydP8%AQAYTc7uvM|Bu6^cS{n_Tme~rhfU^YwWoV4qtFaWgj4}}jonJ}s zJnzwHJ6%T4x@D{KbV&g;^{(KTM)?M3f++zG$U%T3(sG3J3AM!JV5?~FKzd=xkN*s+ zhFN%1khYi<2u1N4&@?J~Is?F}nrW$ttKN?w7%M?Qm6Y_D!x97sTsk4mm0Rmjkock^ z(Nh(Pw!KX4`sO0I5E4r4GfJ=ksw7oeDo8_BR7-GGE>hQvTVRiT=E4b%fOydcNac;uRb=0uhaf73qLl`$_cQXA+t!SMv4*8n|=5{`NUidh(X zmk<(z6t}zDub`yh`hDRsqes}DK=eAv6$Ky+y*LSCJu>thNvomIpz)Ss#kESd`l3LhI3SO z!Fmvn@|@ALXtd6y%QYkx-WwtEiPp8eAiNVK^A;Qy!Y9n=w&2tZmP-;!PglY0H~|p z>f8*KLAZQ)G%{8q`{laYWS1Os;D5FEkJ)fUi%pWOEs@kFj2Ip<&!L-a4<_+CaNZ9#>63tyDie9+`Z%UJ}w8$NJ zT%i%P?3|E^Rq9?1F3pz(CP^Lq>r7HvPfH>+T-FGicW`QGhEySBp?z2pF!H~sJ{oN4 z;Nk$}OjX+kPcR~5ivgUbTULZEy>|d?lv!)CO5qtWI{*R4tmd&=+=BrnMiJiVl1B)B zn`VJycDc-bVt-JZ>C$h@0L{h8;zW;HmQs}HA&CV@BA!Y7A&c~8tIJfYjgWdmnLIQJ z-l>@K^?(!&r~v}gGullJ+5y>T_|j$5979SDPZgHhsI_!nsBJuqClIs5G{rBg^~g31 zI7!MD&T_RkM553f1VT4=YQ8LVGgLTXc0@)|?DE_zNf4paC;?a?k&rKRnq20FNKRM^ zMXRenI@(|_4k7)n+rtbmcNNa-lmY=U@Y!h&&vs#eEF85R&YGtO@mp}HrGl!(nw4^< z2t-Nby4vqb#j(I`%NSjKwJa5Axx=NelRm+NGeZZB8f^aWU384bZv-{GAG{qVss%!@ z!#Iv6Qtqv0JVg78T?%Bj_SjDWC_S+ZE7(?Um?<}b;8zs2jwe;UFEkm18=+9b(hjPWLXoQ zGkO+6WKOsY8KDy z)Iu^^nHGdUT4nbg6f=m}sL){dwNrw1!+h^?wYw0K@oL?JQK~SRnSQ4dDuv*SCMY5q z@5NRy$*~H2;{$4nL4q(|8*9K{sSu=tGc4jxcbU_9ONBB=q!Ew6V>}l(3GX*tee}Ge 
z(Y9j0d$ZLRidpHMl_@VJHu586vLf-Z9eBU`+S#UMN{2+tWkLl`Ws`=tU|@w0*(lnbDpKo|yf zL%jc$oEf;DE{1FY2dn|0-QY4Ci3pG!=q8`6{5$YB3uv3U##RLA)4{xOkLHziGD8$F zBW5;t)$_+U<{Uy9_hV$r%0(%ai0ZmrFz-o?4Q{a+f^Vw&9ODq6lBw79h8cpkbF++o z$e^J|ZBSG%L z)fR7YnF3^!D%OcOBstuGU#fXiR#epn6^eI);W$$jX7uia@k2Ppq?P3~&Q~{cUHZpp zvQEB$)#hp$DbfmQ#1OH71tx-yO^3CTmWd~!o(W()2l!I{7M=o~E_Ssa5LsBF(Q`)6 zVjVn(v%r8pgSvLb^IW?Alawf}B-%EPRL-(fj!`*5;iGfR$@ChCQ1qW^Bpov&xF&^C zXt)!(A}C-R!+TZU$Z9etGg?jPg!!?P*(f$_0_HJ_mXU>))~>|62!m3XhW=6NjHUGA zK;=XX(LLB^NlxOfjwJ4CqBxe;EAvRk^=BiNQ4cb%^~~fRa_Jwj1E p{aI`OW&fH zODAz6W%QSmbIK^UmXN|;j1vo*Cut+aQ_Td6F>J|om@cxScrBT|loK>~7Pi$}y(`v4 zVG|j+#>kM=UWzz!Ee#WRfgZZ!&}uU@YTV zNvV~}Z6l|-wm0fi&z6=-4xOGM&0?2sKv>eJS#m+j+<5~fNSS3x+nb4V|G(W&Tod`? z%z4B%Om4(Cpa9=|foA57uL)ZTNLv~ce}!UoQ5ix?4%vMbG=*3er?B9Z0$Vxc36ci3 zm>SsP80?WkGNGH^?8KhL-3a@LFJk1si~8t_yq*K{XT1mcdDqbvu88t#Vxx2Md7aw% zs{`~p)|U7p*3Q?eP*my4C_myY6`G4qqSUvfAR&_sw!N-CdX#(G4sC+-s)(h9pKS*{ zQYxK`fsJ&<(Q99pBdJXhCTb@R>d_IRB$i_?9?|rgR?H&HDEJIQL4cTDB@eJlmsq?Y zP9ig__Kv}HYvKhkF}w6E7DBvC5w}}j;8V<-Xw(z}_c=eR9U3S(9zUudNSGyuO-fi1 zEv=nhC5tT*q2zCTN&?ncBdq!21zi%$!NoGmyKG0~IKza2x|Fmf;u`Nqh zq$`63H6H?ndL~SVTAJpGKng2WErUQZ;V~>3Xc3pnrRsVqdA(RSdbZ6SVq@ej9Nh>4$zHzRTc^~ZbiOWAq}vZPRyc%M;9+uI0#2!ZM&BdqE* zQJcilG0J?amd{Z+U)atk90XQ+o*kYudWJXI?>h8Bt0|gW6SmD0oQypx*#yYd+gT{a3}&Lsdma#SeE-%_qj?&*TR#eCs!&1b$CCv3}- z8Kq~@VwX#I69vR^k<25{!s2^>lqeAYi3xKtmVRJ?Cdhn4wbWRe54+4ai4cs%rgb4_ z(^h=c9?cbJlNe02d8Ys$jwB>_osg-KAZSpLK-pG!L+KSJRwHRa`(-kJ^tQR-GoS!m z49Kw?K2s}UjYPf(oXd!oX@|p_CR4{+BczS29mheGG(f}Y|W6Y+x=E-_9B>p~#3iXEV zV^PT^w5YNpQOk4uNs>TpgtByqb%Wg-njntdJxB+_=BRLG^QJuTlHz0cNAnRKpoo+G zoS|RajcSR%U0MRPSk-3&X_SJ z8-(o}uC|B3BkCG56W_GV#4S23g?0^@i3FJU%2EJ_wVIJAb)3|a5KJzFwox$o*jR;j z$8s)2H=6;;44vr~Z9=dj2vI8opy`12Pooty{bw%u3qd({%2B!GGbKmg>y!VV1S>8C zYH-d4EA*@hK&MQQU3H8b@jL8)S&6e^D=|{J?PrQW$0j+LW;C5^VQJLWLba08ov8pS z^x1CHYrz~Vf;<-RJ1DGJh_(ytWCGf{auR6aFU~oT0Le1=O>{S29;d@@1e9s!pIZAwD>lbt8lmTZ91~9 z-I-uoh?dpO3dfn)RljB#+YttmPQA>494oNurwcS~a{NG$d_%7sOhS0Cs?RE)4n(V?*9-gq~ 
zlnfWFdvj*P38aebv_yK1?QyOzU*s$LPE&U4T71sR^}hd-j*g1#%0zmdHCR=?Xmds1 zl`?1)W~yA}pI+Y4;p+D9?C9XRpg!NwlCpYT82Yg^z&<7W=AfFzlxlCObn6n-awqp$JkmmL-OX4j#`3VotIdPDClT=p9TWGmmT`%?5ls`SuzZ5uI2hvEnK> z!bfaq%oIn#{A^MEuXL8&fsm;0>S|h;29W8m3sO}lXFf?;m<3*1t^jJ#L?4~AnPJp_)TqWxs$u%Mv+HWxQ2T(mpUDAWs_OnK_TaRJ21Ndi-YZq z$QJotS(}VFoz3>(+7`t}qFvo_i@HogKUxuAYF`+G4E=38eH-1+)g9p}sb z6MnJyvt#>mIt!N=EKt?9a+Gy5O^lt45+zSL)jRmxXMikYSBaoZsZY@ z`6B17B*nXv8Tc3ToJ@!PPsr>ETB-uKYTOcb_TK$!hK6)_LEv3F@tCugg>y44{0 z?zx&P#O2vpI^tp=utR^!8 z!N~?1VXQwzp~i{hnQPT<$6n3(Q)gcKGr&ycl9KV%iR3@Gyd#!(nniZ{z zSILCNz;ai{>z)kcv6RuB#J(8;J66%pQL#veI1VNJUF0_%l54vt?8aI4q`0%NnYKDU zOpYg@+JWP>dX()@&3>#9Q|GPX;3Yy41yxA!AP%rw5J*;r8zn7zt65{V$Fy3bDxad_||TDIDxA`5U12eq3Km-NpXY`)@mYwpv|R!BV{ccD(vUEy1p}kTLPRH&^iEz z1m6?Et(nnhJucF6=k~py^)O&!t;6c(t3z=}QmGz*Nw|_M57^k!c;(P?w2nDdVotF7 zlpkEkR>}=w>D%T4%<@g~ymk=Np9>QZf8fW}z4CnEzW!1@=K>f3=igCgWQF@I4%vqv%W0MNuK+1*vGG>n0V=d}Oj0Y9J|e&^A}f=<8Z&fy5O^KxPbT{q z^Jdn(oBcRQ?ncSCzm)(Y@{YddA@AA{8k#QoL)Dhfw^Nkn8#8a4ROu9VuJ6gEEgka3hTlBC^Qmw7B~KIvOVg{+wyVIbXVBi1B|PJNPi z$~Cv~#sDx3Fciq{V zfh=@sCB&XCgQ2xnTBB{Ry65=_h-R+8gvT%lLE`aGDYKaiw$6@LinoZ`tF(HTx%x)U znU;+%qJHt4VFBe-ugzwceT~@g#A;~kCLWkA^lfiLo4lhe^rf5uI}V2d4mETM@jO`% zg)OdjoFt>G)Ts;d7c&RhSKwvE_+HcWV2#cXCr4b}`e|MCFo%+KMtC7*Vg>%mS`b>h zgw_h!Vv(!A3OWfPvm`Xcs$7X#Xi<**Ugy&O6 znW!4_p%}aa;v5ivEMrdax}Qf6&=e!;Z?YbsngYH|5!9VUTY{CiHGqnV9n2!7ZdLQe zn>%d%1+z%;*)bMpl0T_nS#A!Sc66h|Q5(SY)=gBJI6oNMMQs%066srW%+hY*Muy@< zq29$VA{k#rng$%B+hyJ&)}wEP)y=}5(Bc}X3kU0vet(sT5iK9H2RpETwYK)?a=_L9 z7OLjx(#T!$uCTz&k`)<1+zxZ)_D(TM8JG37ubX(<#HO95RuFQOg)z;zB7zytQ_EUI z=!25UBXrd=;l-57PbkK+{JW6imdIDNylPBy+%0VzmS<yrt-eC1J4RSp&MBGC(#K%?$38N7f z2RMpEIO}Vm!FungO?9CGucxS|e`x_4{A^8^*SB@PP9%rX@1-`Gog)evfj1}n&JlqT zP0;72X1yDczr)_$siE0rs_gIssZC{DV)C~bYL;fN#OBHtKM2DynOzuBSF}k&T|iU7 z5!!5*$?U3PkE*EA@0PWN1UoT0xYg#S$@x{C+Iyyd*&*jk*KcW!3?R>cFq7SA7@kp3E@izfc z;gTzcHWCGC9^WVs^O7z(GlV@P5R|;mT2SO50{Z=uW4TUls|$qpxU+?rO^jsF z*6?vmeP&eQJnPm%LpACBkG~?2+_f{UbmaJU>)s{XhBlIe+iaYubS5L9czL_NmhsOi 
zSZ);j7}s*nZil#(?K54ZP9T%klib#G1gK$dyu%Bs=Q9LdQX&Ec(gy-b+_kfZHcoKO z!Bxp|W3<&2^+-PdakfAL_84D$fTZguaWLQ(2LsMF#cnpU0llVhpiql>;~bGNKn1q* zIH+m8K*P?=0qfzklLB?@5Za-k`ZQBW#u#T^ghjzD5Mq21+rZUlxpGoI1i(Yw%|0D6 z^@NB*EUX-bjI*CGM(QL{_-w2`@B=J!!x_=K7F*$uCtQ*vvYdpEf_n~?qseReo7R%} zWR4)T&aB&9h+R3ud>|@4iap|jUtPh8=!cgN|O^_vA`5c== zB;zV8N##TvveIN*sY@*(O~DSkJLtqCs87CS#W(PyPVA8R99RFr)uHp@{Nfhu^{=ee4SC(b-0te@ z2#PYuB!1LebmFdlGUheK|HBWB!>QEz|3Y6g@`F;_02CcltRFMW+J2TFN!S+uKd*%b z@?^-muRmvjHTVEObY`~j9#`)O@eOVDr-kTWjv7~B#WBby`4K=-MrQrzmkEznKb~Ke zE6T(gsQ(ni1kgJpnug7W`ftAzu>L;3Y8{h$8ZlNV{RTh6Aj%T1U&&zVe9yZEJwVK# z;!~D}<~kR^N#**Q=Ft1S{L&2=@A?gQu_dEc(_DT1R<&B?pX8Uiq6NO+X6_==li&DV z%~-qtk!FluX84DH8+y^^9tpbj-Ix^haGWNG7O(kFY*G(Ji~Tvj3r%z4z`OdFVOcew z&My*)x&H7}YgIJY51zvvlhf)S{WW)Vw$P7-b48qJXR5viM$`LWj2JFTjjG?UQsu3F z8^3BF?cdgqd^?n1%P*3(Xtm!JVjERahH?FmBf9=AqN}nDNx7-N7)}t!UoMW|%~`(x ztK31PX8$8iu+0rbz_On~|T2Y=ZBP?`pi$)xRI|(f9&U zQQO|2hW|DFPn5R!x%?n(q5S9n&K*>1L*%r%`ialkTi@nKP&fLs|G*uXzv)McS#kg7 z`3nRjwR`*m)w-%+McDvVVc2oex^g|@gB2NH-gV}(m&MwpAtct`-;pKHB z{5N@wd>HApl{A#Tk6KQa*0G0p#ecDUQ- z{}$2mkYD*CWk9yve^S%-E`VJ9M-d2SwfPF}M4A}V$U&(;m5DXG-G3lVan%xkeIO`f zw*4%;wbFm*r(vYGeQzzuPLjbx6JP`{5vBpc)WgFOEz3kpX~ie}Eute$UiaJ8+j|-# zM#zV)PXY+jgH_^hjbK{jf9-A5#eP&INsjvLDp9sAxZi=WZL(}uLbz4_5^4c+y8oiY z03UVs`&7c9NBkP~fn@i!#G$*))dS)hcE^1k44usGQrAoxuDwt8ov%q-s!410*;X1t z_pSb~e+iwwL&8%>DEYTx6{st+QFo(py7zwmB{&hhv zb}A9;@L6>~N#YQ&Hu&maaA$meSgM`7eSX}}t=R$pbFl)=TKza+E|2LB!)`9`+a`w9 zzEp=lE3By1{`Xp-u4=9S*!1w`Hvd0NdwI51E;n|Gy=m!)gtUlOlu#RoGq8Xmy5b>p z%(*lC_5YSuK~n7V_scYi9auPR6X(#inSwN!;rH$e&6@od(Hp(8%-;<~K?lY6bCHgA z`+UC2a!v!nkdJVkt}P6t;B0@-uc(cOu`qK;@zL(W54pTr<05PYPCI(OkYe zjGU7a*ahE)0~qwz{0Q>BKyF*A_EkG;$gET zKgSO`uy8dUg_9G?eu~1F@PA13@F!!2s97-I#jMJgje>+U3#YUEZD>S6)TZ7n1A!oC zXBt&V%Jnr+yi2l#m^xZ=hv*ryErzSqO-K$F^nCNavzw6E>d#veT7|<^f=L^nxgE92 zkeTK`0|8LKh04H|#m!%(=!)R$;_^9u6v88ipshxIufSM==9S?~CVr@7bu8B7uv7g+ zCQF?1Gff&Jtuc0E^{&-O@Fkqdq_JYL6?QUED*INXfrtTaFYmisJV9C<(&oE615(i` 
zQ&2Z>jnhS(kagtiBI$aLI+qP&H?IDL?*~xk@hg1ovdxYU{VdGQj`hlS`=+)&p|E9h z8H4GxN3=8;CLEpejl2WCfp=K7M_m0ckQ?Col5|R!n!`cS`~N~9spEgaxJ&)n2vR|3 zx$mnCx-U0}B7J@Le}YyJ9hB4W>TcKm8sM;-8l+WoCp54R77Fb1 zBCk|r7n!=EA4^19B=%WhBIQ-Q*YfP{NR>*NiMu+L*{9^)GNQSu5DKathdFFO2$%7q zJh<#siN*EpGc6Yw4}(kiqKzM{cMu12go_AAE>65BOtK^f>neW&?uF1%mY6N5&hOD` zHH2hB|3?h^KoF6^sIus_01)&PSzfSh$3kTuk9a5xd;J#R2*RnFhp;p%_( zW37ZnKb7BH+!`%N$lT)AjlACI8)fLAD;D?$c@}7#rT&MqHq8gxFGyHiY1LeB6f&QYQi&5Zl4y4*4xRR0|DZo zK$&q%0Nv-UQM5j7X2(xL@Utc@&B$_Y)TxPNi z@|`8CD;C+B4|d51ix^lPP5uEh=8D>VUjza#w)p&c+(4sg{s4CZ@-;VZ2o&$38WzO?G1dbAF=<cI#M+{}LEVmy>01|e;+TMleb4P^eBRg+{`H*> zzIH+P2VMGE-WE-Q27(XmPYU?ogigs%sC|Hm2Sh$wSG3!IRkVRzGyPS#R-q5*CUHD3WQxD#Rd`DeSgUhNUCINmRG%5D%`dxWIp+Z{Q<0? znq-+U5mht&ow8^2f{h}1GJeDqiXyHPi);hEaI%Q+xA_Cx{N*#-^e<$Q?-a9Hf`jRdx(>dFFq@|$857}Q`%?g=j&#{nY88<>+l-WnN$ znX$%dda^3A-Ru`L^IGcM=p}marG3iCFC4ihv0Uh0}mfzNi!@ z#^IYZFv!bbC{!S4u!tgsqE;g941e;wk!8Yq^$p0*@s|gMr6YA@ozy=YEs1u0ZEW%{ zc6X%{pl@KPs&)Rip%ov00S-Qk*j=_6WY#MlfA;vHWUnsaJ1+p~s$LTV0d2 zS!S)Uw)Cmm+{uBZe+wftV5|C?rC~hez$?9Y z4d&g>1tc$0TEh>0{%34ar2kzcMpA@EUGyI(R4 z4zo42%%??VWuMnRtChsG5-1|f`@Qed6EnZuuUj16RgMrRS#Jr9V4rJ?{_vYAR*+$f zHYmHfS|BIe0?7u>vrTT|A3@o%)prry#2nlSF-etTGBB}v6X2K9k}{IEHUeRoMWmv? zb|t31C4GJbR#mypR=e*dlz4;O?VVZ`(kN)JS$?vt3>nN%Eea^{ahN}f94r7G%0%j~ zTyEsxr0L3t%V+S$LVxRId$rv^q3ATdP4f*?tSOW`H&R^Ymo)mOC#ir9-l0Z|h~AYD zR`k06$~SD>;GfIUVY&xO2r}KBUxqPf(hM~nvq9ijZ_HW8N`mC$5G0`KODVDoj=W?5 z<(zn&GRB(b_fsAwc7=$*#5PMQH2OhWjXXh! 
z60TJboK;n1t^#V&;434iXwpYS6dLqv^L_D4O@HvH)h zKk|oGxnEPZz&@twN5DLczJ>kG0EbS1M=m`X!94;XCQJQAtXari<>?YY{{D4)b%m=x z9(Jb_3+P!KW}C~b?jBw>q2W6u1T8<96|*?Y10b7i)8Ze8bp)jB1R{|qN447xUA}d8 zaD+b_YN<&$dhMTR3e(G>l>7FFS#9^H%MuW8WZpa&%36JOaBKN)J&S%eeH|F#lUgp) zZkZ{XAvn$d?0LEev7mbnP72^f+AAd*0_{7dK!lSf)9L8NF-;Fm#WBk&h*WPOFnDMMr(!@vT=)t4Nxk(HNV#+UyAl0e-> zmUwsZu`#YzPUz}Q^^T?q)*8@Ss_Q=g5he(!rE|2V>D0|Ke_AA@&N15#5-Rlqjd}b| zTBGI`(A?;*VdnXhfm1F~<|Sw%iUy_0OptF1L7AK;QE0jUJ`MCv6tw91tU<-+CV{lY ze?+waa9Ctn23cUx8>2O9;p8TeVTs@ULU=8Tj%-231C_oM-nZdco|rG5&}R2&`)t>I zzlo(4C=Z#V?mjA}=Y3NC??e~CQ-LD@TPgatE8AhTj;O#9AESZZk758LM0xOuf(2LO zZ!_mn9N;po5$G1h82(4>uD_(p8lUTHC)nI}S=oRfX?pHq&4>xpARJOMyBIQPwkPWC zy)ME;ja~h7hwQaZrsydpfWX9@oCD*6RhWCQlE%870On{;u8~^rf7iLF-ob97>BTnj zZuFY~%LIb|X1L@{Kbz`+uPMk|BZ(5}ojt)ix5aBD6!tUypSO0|lq6l2-Co<23|m~k zqEpP@R;BdpQ{tH2g5&jsKj&+hc8JOV(Z$dZ!+ zJva?C2gfHz4>y{T!O@K-PFlFaKTD;l+st})LTMJvUC5>n*WE5g9v%Law{NC_Jk`u~ z;`^BGX{mJC+jW{|v{D@kclwJX&JM=fR=8A4*ehs`eYGjz|t9QtcC zz#zdo{sAkPrtyqvhv0Udp&L!^QsF%RDAh7tCO;pRMp&AlpmwX(PDf4m`=_Y3cDpn5 zms`WgLIwP8*B7Qnn~A&JFQNrdo3fhczx8Wl$WFa3KIXHu=c7LuBeL4zQU3#EyXGu1 zY$Ad!_*%P}Ft7w>OV@9S=Z>eVb4oJg1duYYpb;s{)j*t1{ z;8pGZ!S*ol$}Y0C|7cqnHVU*{CcQ)d^K;B;t=7{jnOz#~7_vJQ zaWtBS9mD0+8WH?k;G}*dYqbbzVKn{{H0v^FOEeeyZiTI~CBBksCYNsoJo2kB88M=x z^{Il}_18^Di)@8Q{AgNmrz6VyLa#w+yU~i|2X^J-a)pcia;oKr-SW$SH^GX6kLKWkW$zy8>qD_aux~4q#wH1%SAqgS+*&$7B z^;GH8>xHHf6dA5w?CL$cZnWVvFbj{`PIW^2TQ~dtVOOr5zU*5VtC?Rc75-2#J(*6@ z26Hukt+(GydcuEUha`ln@d_h3O-~E`0P6LUeL5>TvjWT!pRE15NC!C|=kopM(okQJOX|yRLzaz!rwkwp6NnU| z$}|xu zM80sQD;moBSUOj#hf~W+{<737)5%qSLchD zMlbg6eDNIU(?>6Nk~niyhlJ38QjQV;B{ZIX<)>~-R3&apR3~n4b^}J{S=Rr<8&eJ4 ztVOCM|4S!~H0yp!Cqr9Olg17zv@hr#M;xL&`(|mw zL~D}F&fW4kH{c0cM-%%2-VYP|GOsp_8gzfd*g;=TWe=pX`%=913@uX);|D#>Z$eG- zHt{bnk)5V*L5-pS?t)Ys#&u6+o0IA0{M0bMw|Tv~t=l%1?+ms)yfK;OJkR>{#_kV) z{a=qzG3m}h`8Gf*d*`468|C32CmT|O`XpiR35*;VIlIqVCDWT=&SwS?tNe!GNc>F@ z0*$M`7sC24a8bx~{9#T<4#ga-8dQ*Ck!8OysDCPZq&(Zfdg}vt71=eZ{JkB29rPRC 
zFHg_Pk8Z(SB{xKm(T~Wf;>3;7&5bEQtW0Hn68vsRHat2gx#?_lDbqPJHLS#C1|99_ zsB0YiO@9_Q>6lkFm-_+uUp)`yB0Fi@#g)w5Gt^mAJ-CrK`1TVM&*`9n45ulqTSy<8 zRsTzFzEPSDJnR=6=t0;S3aKog0LDlyMQXCTtZPk3pYmbTM!N6L80n_W@IPZZW`(gu zEkB7~(vJ?h|9E1r+?EQx)FiXZufLFSLwtKW6k{(-14-goc+5U=c$E^1tR%Hla8<>C5?U%Ekib_wR^!6TQ z!w2x0Qa&bFI{Yk^o<-@kJ2%mh8Mfq%@mFtev#(wqU5A&dnX0^ z)9@z0fN%(wWiq-F#gm$Xea{9@2E^iWe)a@C-59K_0AH3+Wgmdf$u)B@^GakY+x~x0(vgX7KF9|3jQQ~O`_y9SDT<|JdZUcP`oC4iW3!L{Fa#CVgmRUQ6Z5Xk(!k3=BAG1<4h%R+z383MQLv+d?Vi_D1nd8S5aBX zsexRl4x(xdv?zg@I_#5#C1;1KvsIPoMHL*iDpd6dRXqmrG3tBzFa})nC4IsvQBtI; zM7Bs(Hy%d;W8D<0Y^+4q#(LlcI=sTwz8E^ZJal;Zpz=wHlaj)&7uacz6Pn{8Fd%nV z4RV?D?UwoVXT)6Di|*mY=fsxTxv9a2lIblylGC;z7@G6b>)_1W^2LRR;LL&H4$j;L zr6_|lb)uU)II}+7^&DHtbFZ;Oa-7*vTv~Doe|<$lHR820q1vMiOySEOIg!?IWm(BE zSNlC{8OfBA;YhUXaG0k@PNFsBm6k}LeVT?jky0XomX$!G!yx(^`X@?mqut;xLnkyh zJrdco#pLf9dIw*H&55&rHS`o3{wG5}ZXE{pK6YrEsj35A`7jt?{-;o~EPHxtQe}^Z zkv)tF`}UUw*1N3XV?B(Qj^nwkBwwV-Pc(eKk8#l(%w?iv@VM@Y?BGPhO(z)tylRF0 zRM@Yq;h_TKoS#uxRx(tt4^1?D>ePf-x6WFgp_XTqHGIB*nK<)v){;KUvV+Sfee7(5 z`d0)IOZmI<#-DIfSxG;asnAWfpD&A9OW!4gwRT98W?!oqe7GFTt9;tl?y?*b>Gkl| z7&t0;E$4(gcxwyTQwDG8EDm?@R$aL3IrbKwdyOrO#YlO>6-ljx*IAIU?OHCp%Cke^ ztUj7bNyk*fl@%q!UF}S37i42aNy)hG71N1GLRKvhj$p>6*t@Zfwpl1Y5$%X3iLbdCZ zLVYSqoU8q~^%$7HqU1P%c-%h(@q7OaL`i@iM5{Vs4ea6iNP!j04ynj~mEzI;Q_={#IEGSDa

    3u4`&5CcM_}ao0W7Yo9ip#9H4D#N0X|m)ZSF6(}KT(D@G2#%uW0=IL9_vV)(;(kM&4Qco>eBxfU!HwEnZ8KBwp5 z1&?)4OekwG2OI4oH*qwjWy1>vO(CV9 z!A3jZO&n&W164ZEO?dcS0OE%Wt!dx~IxV(N#Sz$jG0#rzL*wJ1 zD6A+Po&hgPjyGtJ4{c-anW~!JZsPOCwn5;~Z&}`BJ%N_hZs3NV`CC3U>i1D=``6HK z$w>nDBn?J67%Wd@vp%Ht{DaDMN}ecv-dr zUuo)Sh$hPaK2+Tts%}=*?Y*di^Xfy@ZK3KmRbAVMs_t%LS*S8>%D{%I$B&~5b~`sz zeKowrd`-RYgc`)!#5`3bkZS;u2!idTlra0;F8K>n2l75Vd@;-itjM;>AFx{qx3go8 z9889B#iX;PLpgd|F?fG@dJ{Ty6FRgJu3Q6G-fFyvJ3apFa0gdzHV)hmS56IgIUTy+ zbm)F`sG^6`jpYr+<=AHv^);4Q;3P{1Kh`}djejkwBFq{(G=&bG_%&-6W-V3n(Z^6^ zA7$21WI^Vn8VW0n#Xdp9SSeSi!;0)+HH@WavY~$xyJKQsYZo+LdC8fNp}fxw?T%H- z6LpS6P(K6I6(zmYVXrW9;KF1>pK?^_#2?cxR^G`HL|8JY(AX?YHe6bcu67eYXB`IP zti0s%L64zriz*r(>|aqb)J^PfEiY5c%gP%*KCrx` zubX&?Y~LiyjF!h&H2gQ@9OfpzMIowKElbK9J_!c>+{9_tl7*hko>D&P>MITGK7gHU zxUyVteNv1Bn@*Nrn#`8?(wMUrlEW0sy1S5dMaa6Xot9aUb&4ksHLf}gR~-`SW|?Un z2FvVQ;SPV)EHnJkpdFBP4hF}fXcyj*=Ap!yKk6jw;wU4FK^|s-tNpvR%}Eao3rUZg zBk2)lg4UPP&rnP%5ijl;yVm^CE115V?;8A343&>t$FZn6P5$U3LLKCtnm(@&dr4;y9}~&P}cE z1uPi)Jyc$DreHXe=C?@9gU$NT&&k&OWH*&RFK7;Av&!WQ)ntd${2Fo2w03NuS8N%wI*sqfbvPQ(oi5`?w|dTD@tOx(WL70DY{eA>7k{0S#<4b6k7qq%)S z5QWOOvg@CRP_e*cfOEqDG6Zp5!?NIwOZiu#w5a+oTl?<}dIs=m|H07ygQ0x{5a|?o z z0a#c7dTXFAgLw&DrZZ5?ioA3l1M$vzHjtFq&o_6q1cM#n6^9i(mx1EQ9wOxNaBmO$ zTf)B~tJLUC-7QvkmO-a9Oprf4Ab&cwHTJReiwum*LhXg2_CjjMouGaL0gn&0SA^Ot z1pYOK`tgBUYhh9DdQ%Tny;Xn9sUpo;filMd6L%V$`qNckW8ciZh*F5KhNN5WG&=Pc z!sBO~2LF|}Tn(m@zDj7pWcGlqxx0cLJYtz@e}&)6oyGo!(NRTkg;Q~qr4Lb-?zt$- zuZ^u0Ww|}v!Rqf1cd+_82pgg-zYcdj$KG#ImJh|&8r$9lSLiaF&2qtW1&O_#Vi&;d zt6u%CkV=mQFZRF1PD zBH`_dKWsyMIJ^*S8210c1nf@|_MkF{nb#_P26Cy0pj$8M`}HvG9ht=kaC4s=&{j)gx423R>6N zccdeFpFzd3bf(4V0?kPOK-{Hl=Bb6&tY+e7^6(4pwomUD26CpOKJJQI+@%&@8mK<5 z4AdJPwRkvc@v!p93$85`m@2^uaW76R)szyU zabN*=Go8<%QKJGmZ5|;Mvr>Xs3Sov|@z7ahoHPuSX)^IOTEHOSvK}KPQG(MQqVzqI!=O9TO5$goS8%E-%j+)96SK8U)SSK8oReISJEi81->r;>n_i)ieS9dKO5Z*^k3r+ zleM?sP8LWU&=ekq!LRh1= zubVxftOn~@oAXL6NKLxC8z~HRANdc4_$h7v4~9rJgxm!X?XDqKz7=5+=Ps~QT_CQU 
z0Hy|1DLArqMlwU)hZxK61*fuXv40v)Ok@Z6ukg!?^wsu!x=+||u^*&ge9X))G*Kd( z?>)aFdAoim^!7Kb)9(QPLHoVXZ-QHgcPG1=-wEA)Ge2b$PWMOb*ZI}_;*k0}*>ldkGEL;4D9&A6o zXt&uOgZ!U#C-ll`R?eou2x1}?*Nk;UTAm&3=klv~_2j2@1DxOB zw*vHk#BX9kiPzWXVjn%WiIQ6VD4AJknq!x6E%Q@0;e0=9i+(TizvVYE;Vj=^znA%Y z?e`Raqkf08gfFv#Jbw|tsq87f9~K~vxNl=RQrYAEVt&iD*fW3DomTsAxv7D&J+|{( zO53;b^0?{lUkNwI`#W`$Fd6YtyVZM_huX9Jqv7ThU#S~On1RH^YD&7f%u&cwlkMhp zcvH)^yC2Ak(_+2Mw-ck)-T8igba#sX6?aIy5&kOmz}}KSM>k@$L+VOm>{ycvYM#tMbIEekUt;~V;ge9DgNu$`7g{m_CI6iCD-{Y_7G8Fg*`;{>2F;`HJ95% zfB|M--nENInb;idhVhVtZy%EytA76i4>@u@Lj;J=xBPU4$|8Dx#88VyKiqY$Q`S#Z zbAt5~!8Mntc|1F16MFdymQ6gOmd`0B=0fAr2_yWQbMWaEZ_7 zIcI_KzK*@+=I<}ah}@hP@sLC1eya)H`r-BvA$bErFz(Qw$wLfn1#5JQ^%0@=m_0;H zG#+k`)upd6){lBW#cHB=R(53~eK0iU0skF)2oq=`JEj^DN`JRO80gEAyE)H4Y#Jyk zQGCk}m%fS2Jyzo|f5b@L^`*ZXvAez`DR#_DWnEt?lU3ByUJCHD*vFfF5U?%-w2#|! zz_kn!)sN?xel%YxMH{VkHGjsQJDcClbKJZXD6vpFN4hQQG6iy{r9})|yM=e8I%uqzI4gZ_A&%*1vbWE#K-||K9%neI4wq z4S0f%8UMZg`|G>b%a%TNd|%wP{=MUS*R}p1jKAaPzaD=<*ZO}j{&Su6WQO)nW=D~E zig##Mnvy&F@a5iQ!{vk6cH}eMM*9>S6nu<|P(0zl>HVR-;omGG{9pX1{P1N%TgecA zMSv<#{77O{MRt0WDn3`K;uy2{C$nQQ9w&_*G%!G^#N4xZg%CWJ;RW2K8ji0R=3ISG zl}>teP)Eh2K^Q7`k|Eh}!k}cy37ov}j3Jr&gnr#~OUP56lT3HuZlBLQCL3<*lL#M& zEg4M?`Af9o!-M6650>w1^?Muv1sN`ma+`ZnUw?Pt#`faUH7%uUV9Dh zDgx?;LM=gUk8la;l|8^<^#p3dv(fADi$kvj-k18wq7_aha*}q{~4yFJ~wnWQH zprN~#{aZ9XArpFEfnI3%2Bq8t+i7%b-j*?Cd64}86nb9>#}{-CwPt)7F3Z3R`p<*H zaIRb5naI@TBcNB^!6w@S?g`VBjixJtROz9XSYy#QUTaXRu0t`bCHpih68k(H+Y^Z$ zF)OKgFKABDvpj?hxH1wu98UYpK*!B2!%i-|Rtv6qYdCiQE4T~gLThFOmaSZ{AOkC{ ze^D9R-rftgqZ?I=qzV5yez~$+UjqghD;Lq!zFxr!E*?;GzeqsLUAmY@%cv1$hb9>whJB2`bZv@ zMc`@CN>%CgEbHJL>=T0P7OW?H*Sc*0)P@*L1YxA(0*`^TSRetr%=?UP^`(>)QV6-;?9!e%&wv#gaT!B0z;@-c-zE;u2*M0hJol~0`) zh*cMBinNvDmJ*|~|I|Ga2l%InrBJGbUKTWWQl0g(d zuA1Pa83$5pNC8&zJskXfx$^fI{oM?I3He)}Q}Q}BusqFLUZ&*Jm3%Yhqmai;FcO?o z!Dp-KuAbOi?S@rtD$*>G}BO#mtJLW&>87i)V~w6&9ISeg9o zsF3aaQu)l=N#*4z@fLW|@4Mi>Q{jEYYk^I_88s1ofqZq6SGH(1>&TZRIq$>Pz&6f| z_qefAO{FQttrVP)2+t{D5cA)7I%DEjVlye$FR4$#F@&emZ(_U@VehYMR(g9joI{vO 
zYeBd?+Sg&OE7UrW=b@OM*lAw_hGwPx4$P*};s5fe+<*ri7#}I6VKxcRe))iP*XCt! z`HI`H%isl-f5r0&M<5d0p>u5TfBH#Mx@fQbNxPAtsd)+ElshO=v=2#m5k^ z#0YKB64qU9eSA%A=&}-6xi$0b(G7E_*VK=hHFWm0dGl&&7x;!P zvTJa2GJjrhnmuj8oSSMK$?C;*iF3LQpMzVP$e5v(5npi5TqRjGXWq2>#dP!ypH{n| z##de6P~#gqe{NlU&4L9rGnGqHsA{oYGp2rKO?`Ail@rPdY%~-v@LfC^5mg7qrijI6 zcFTxM(`Nfhj#BZyUYXAxRe2TPJj82<6GFFqIuUuzx5e(n`&&=ttzyor*Jhdy*qKqh zzjeNTHSLzI6qI8oRlk+v7W9u8hK3R;9?7fFJ!1?5OpKz{OO5%e&kM)4hR{_#%)r(# zFx%q5lcym)IY*bf9hVjAG^A(VIrI*-UWOqSy5~SBo_maOW!KnS>E^JYf_-2qn*~NC z(prlyZHEsJKr#2nhjYGY*#SGFa=RaQT1)cAI`>17*oPZ^^bk9)RKZ58$&Y%n@lyCn zP`uDgX@N4xIU)Lv!l4a`FmV{uWYj~g}Px!aeWl}5KFrsK=cT} z5dc_ap8;^qcL3i3(4^+MsL8s&P|pN6ryb~J`iWWJZ1ch9INF&=>;R%WAuSTWAuSTy zRb@TiZ(QZFBV8Nbrci~|#O?KH9M;TkNyp!hHJs7j0Hn$(kCz6>Q_Z0%B*9F3^gvXSgL=rqBF#4O}wpk@(KEFIG!Nb9v)QZJWP^0ZSNM ztc~{`WZTJRgYoe}GFZR~2^dk5n<#mifvgt2!;MR}5f&sMtOndcn45qI2v7xdoRt&- z**CC~5pSXFeqf#-!udPqd_T^2ne)9k-)heH;QR@5z65I7z)jryNRg50-}7$&qM&DE7Xt{_$G>@krSLtLYCl5xn|z( znsy8`zZ`Pe#(P?jOABPdwDn9++fa;ta)Ep1t-zP(57)D`4(N?)^50>+6+zGkN zhA!OlrRhzUr^X|_H}IA)(UqOxO^>Urjd)T2$f@*(A<1s#&FRdF;BwLitYS^20RuCP z%P=g%D*7Wmt){!sxZo^%1G}dbA)4Wc67Oa;RcP@Z8?b&bmAETjmWdbdI}l2inL$(| zN+fV1IV!BXwpA>9*H_VWP=$mhs!)g2O%6h(7cnMNUb>=x(xF%zux2uqaZX&YO7}<( z7v~*0bpzI&rxH!Xqp7Wd;-}bH$ay4p16KHZh$n`1&Fb+<s1^1=ue!hJPikvQw z*vP%P8CTZhrybr=VKukm9352qS~A{`KAv`XEBNnl-F}yX%V3-nucSG8tj{s;n{-Bx zk{^kErfLL7AX@=pwyq)A3?PIJ3sEU3hxJBd(GhCd_VZLse8q>={et5KDpq!ZOQKu`j1;JH*)Y^Hkq-$L0N?z z4^-4udN@kdqc?OHVMMDGrGe@1BtYZ0=({^JM9e&Kq*#e(8sFfNni*eojDO2NaYLvU zR6sg(w|cI-9UgSPvm@$_>7)P?qZn`Y&de){#=Z*2-qY?I>EHye1`2PZ!h|uPRV!tG zi$)}jZauaoTCiWQ<>w@;+6JU)wLVL?e~N3rJssZ}b_m~@b|Jny*}X9S2_b>6G zbXve_>aJ?z0S;|WJ{-J~NX;uW9>DDC#xo#xE!juVjF&vgP6ktQyA6ZaZbsWPNi^he za4CJRP5Si1=WsGE2H>En=?29OBTE{BIo3?L_Q0EqzY3n4HL1IS57bO>kQ^g9TJIeug^ zG|x-+69oqj^hAA~_00~B8^5N7n2NMEqN>CvqPo0{abx+F&bV<0$Bp|S1a(9Cu2uRl zG)8B%zYAo>jd-)l+KASo&1%{Ks%)xg_;@?hU`zcNl!pM9f)rr`istp045(aoGyFFF 
zQawk-c(DvGhA+oO$C9@4HqIxYgmr4Kfq-s@FjXHeE4cBY4&qDuDA8!V_RREfT+xSG)jVxgO43!?<=yjWc6g)Z4%rP#i%6!nvs4q4;c6R#;HZbu8WZa5}ah>ryeywX;~tj$*X6 z4wm2yt5{q0S%!Tbt~2c_e0Q?XitvF1hr3wk5yCht)L8xrv^~uS(a@%D|2VXcqqOsV zkMft``a9*iGt$}*mSD6Ri7!&CE=Dz$WAmEkWpL)Q^430ZhH6%ma11Zal)nYn>B{q- zkn$X%=L)DE=7Z-=a2~2WW4N6QyHv<^aK^I4@^9dJzVdu1#Jq_e3&qk@#U6#Y(}q!v zOOE*gIF;yzeVvoWHrA5f^~>^|6&St!R1{FwkbRX^UipQ2dM~IW!7v^*rRfu z^V=9mbL`ysFd9YGmWB)7Rn2KQjytD#HeS$-puxTEcDl0*8pAnS#2^|WR?ZPTo z3@h*35-EEYKW)P>_0+T-canIFgc;h@xe)9sn~_S<9douNkOnfCcO?__g@{`}(bj_p`tg3x;o`f{i> zzr1*;)w6t1xbd?dyv*QbM&dG?m)VKSTwdlTF8lDZPvSCIVUPOxb;vzC=FE6T#cj2OX&@Ns~E^foc$N3`#$07yXl_K}TU`y^0WR-R6FCn4& z8%WH_2p8-M6}&OrT3?P=LCYbrAOA+8BsN0_Ldgsld=e`791X#?}rUB`auGrT%;uM_vj3zbF=`vPQ3w%+yTlA1d=UO{3I(}uwMs~3o~+| z;t+P(U^$3sGsbm^6nqgW_y$oahZ*HMke!flqB5>)v|twkp`p-Gq9gl&lGvf5@{?}i zf;SLZ8ahN9icp#oIuP0HXu%=fqc7wZ)tu!Wr6PP66QzZSit_ zmSNApb*5c`?@soa9O~t>#VoDd$QzC?t@%-(RgjGhDSCF*_uU+9epzMB?=zrESDNiT ztO9JNppA!5I`%7uW550=({StuM?6>HM4fUhFG5`^FV>FbXrMXFWwcq*vIoI2yGT1` z!;$y;uu3_qF!8^#+Eusha7Eovb+kVN9pD5SE&B`&i8hNi4~O#N;cNzYT3y@&=hcIH z@M3at02h-71$Z&DxCj?B2Nm(6wzvWpwSy{nVHc0Zg*|92FP0Qf#l@0AQyE{oxDH|4 zgX+-!pca3|TKqM$C0zEk)f7={$JB5Yv5dx}X+u%Y3aeUj^TTETv>N-teT21sNnu(v zJ}iBB%aVN5d~6V$tNG-@^l*Hxe|XE}d_3U!+G_kB4IkvI3-RvKH5tQOs`K$&<2`Hn zK5%G`6lR3uH)mG0MDnrZ8!sTdpm}v+Cc^Gi1^LWK8TN%m{gJ{h2)8T4#X~ZLTdjF@VONCPjp24x z;g)MXOA5Oo+-!#1O@*7M`Q*ZEgxj6rW~*@df5Ixh-4SjM!|krZ{pxcSU%N1;s>ROF zY2Sxy_RDIft9=?q|50v3_H3ov@~%Nm9jPjY;-L2pv(~4zbPhLizq>0r8dv+>sfMBP z2fuwCuGPF8_Pgtc^G>$K2BHfx&eCgLOxZ;Ngnb#-2vldjF0QJPDwk-Ks=ND!x8xeG z3IXZ?8!N=5q>C0u^-$d})n`#Xu}e92FLU%C7ze@0-P4A)OzxJ3Q@eZm&=$K}I<(r| z{UdPd2aR;k7}^r)mVwh?_spR!!ETv2t?u4wXiIgsPB>lCz4OqPCEbt!o3gZ_BVcH> z?8SyP?RTrSn6NY)&n;{G812c)Xiv}z4-Ut6aZ)&3&^D~493xLI82JWm1Lsv@gVWC;MdhJUtY9O-&~kYzwyFFGGP&3ddg8%C-+{$<0wRZ)lkvEqyFV zbj6YwEwN)*OEyV*n0qGDfGxw2-q!lQ_@h=0HdnUv>v~`=ghn0@73_$_Z~6hZkH^A1 z9shkKUVjvxf?7@058;B%;ezdv_{byBV_EC3hu|h?fXb^Od^O%j`~>~emdchsU9I&u zVLyTL9H_o29r5mhld3d^kCXn~#W8*pf`sytZlNJ)1+C^AIhsBjyV{#Z6@{ezR~Str 
zsmxJ%RW$Ze<(1*s50zI$W7{gLu}#Vqv9Bk@UMb!kdta|c`bZ7O)QkfLV0IoJWRym* z@+T|yRy6iqG`1h>24bIK>pmqH+YA}Z5pV4s+a(^`nDIOcS02_{IJe4J&vf8b#}@vc z@Q8?AYzX(=O_!izsCn>;n+)&7#|?Lq9VjTnB&g>G2WEQAeQ(tD7zb< zA`t~!`(lnDVg-+4u~xKspPdoK(4Wt1xe|m&sI^-d3!-k#XMph=j~5?Tjl!Sc^b8QI zK9)|^;56Erfq8}uERg!1{#4^|t3FFtTVP@h6DZa&;Tvn1@QpQ0A?zR+ZOw*HF0I-S-VQw0*HO6fm$+|cJ~6#~*l-7@Md}n|gw^Vl>lNr4 zinXidY9_z*^pv`P)w@TAZrRGR6pZZ^C9-&TD2_Q8#XpZ=lg2FNU9DwdId9?uw$@Ws zt~8yF@}z12iaCaHsx{J+9c+8vffBSai95~X7~SZUL>n|)wLwAE1_f9!vXK&M)4VFG`}Ds{t|XW_0#ovYh>1DVj73AaO8#fUd|MjOlW$bzJ13Q$bd)Q7bixr{}5cq?%|7CrKLG#02~ zq99uKoPGYVrhRq*T3%-vW=n*nU?_uuNNjha7zSA6eJt`;^Dj{VgF|$qV>R=MHTI^|n2lbA&S_tjKi2r^S8Yc>SM_SL*rMn>mnOF9Lb=w{& zjK+r_9~sZy24z@nxs?yp^FwjmB;aH!_N^E-qy+2s{tQVqJz@Pf;sw>uVOx+iMsl{* z^f2|cUS$?}T$B5?N7uIPnL;WbYJ4TjeY*~0M zWOjz*KgF)Ux;b7LVY-vbZr_oHZG{$`yjp=eBFsuAsxD7`)KY%I3bYCeX{NR-^B z&shx2^2kVx3kflIhK0mW<*Du|u~HFL!I*?K7kUW?TV_;=Z__xqGkrlr4#HJIwx0^4 z@chl`FFyX)okW=vmlyFcR9DhC6$5tS#@S`Gq1qM*O9;?>6 z=tc&sUy;EW8a2;V4?{#rNEQM}3~SzpWw1)b*$^7}-d>K`X4UA-z@%}afHeOY#cJ7- zyey2&#^N+F`HClE=;qO$igqqiwwH^Xhgs`CN@l#3C>&NZSJ*iPM49mg9zHdj_qtk2<3cHDAts>3{LC<#t}H{-g;iNb&y`g5iR@5y6qx5VQK+ObZrFDiUpphSht-DimGqt%~mU<4xEBk zVF!kG*lPTHq&J+tz*B`Z48!tD7yg}@i8=4LkOIYRcw%zk%I3C)$z#z6g<|iad>psY z;-g-teW{jAvIuJ^@hBG-}S%y{Pd8YGcY8!p2G_d7S?q5CGreJU4SsdU78=m2hf zBj`obf|zk1lCvO-srVa|J(R=tJuD4QCY!}b$i;`C*lB<;mBsbr@q2ZCBCpVKFJZSl zne)^GtmVIil3~Ti4|L^*-I22LJbOAyR>RfKJ8`^;h9AA+S~p=PG4`i5JkMlOGfjbD z>@%90g$9S+IKB18LTeU4rdf|j8Xg+J>H~L0S~;PVc3@eg^-`FMUR0OU_RH1K1O8|) zV`1_U4=8DLQKalVYx$dqJYq$Ax5M6&JbNlKc@oWC11}60(%M{QDCP}S*$LwYx8XTv zsjBo7jlpbI9fONFt5$b>7lmMOFt6yqJ!(*~HSNHy3#a=6=QsEQ#f3$_uIDfCb?vZj zF>ombx|(&46Y8eTm^1IXfQqA9SE$?_XZJG56CCB5L^urR(#Ec4&7)H2XyFn``dr`u zXPqRVS}BQzD)Qj_Fvod+{5LLKjN;}v65D3}GGR9FDplc$VSON~eg%E<&TtKWc?;nW zw&wFT0~IuAx+&Zv+ErZCWtWHW-aj*Uf-XVgy zA%Z9n0VZDgK((6aJMvnG& z#}|F36dK08LsaU}@*eHyvN4DqN#RRHhYb%#WY!&RNY!L|C~+mw{wKDMOk)#^;$Gg+F97}Fz}pG)s$Ggqn$EdA=;wD_6=&Tz~8F;i1* zU#TC{ayPA8G|RNdtKGEDODIq3wN_TB<_{X5W9~0UkM&mqZDQ2 
zzZTs3zZGQPqmu031zrEY6%11li|u+aON*?gSCWxu9XlcNOnOxabKyzVU8JNClF7++ zCMnaI-DXOMXS^I9@N(e#fRw6iI3R_xR8qAcV-(REwjpY2_D-L23b47il6)#}7u~P} zTPF;hR;w!~W1S`t1^$GhztVzN46@e1qi&T$s!>uxY+ggqcYp&ZRSQ_zR2^{iC0nR{ z7#~KP57St*0eF671;4}}pu`TF@T&8mnXk3IiUHTIh<~*nDaLa3E1g(egyz_4`4MdA z5Qqj@O}%l&{xn3lrdM$b9mJA>%4J9Kpv{6ChdJyXVw>AO;pS!Y@l*_bv$`E=1}7Se zXI(IxUsHKIQb#y7U=_?HoS9e0+mRm=))iR=sLfV^9((eBWGH#4!YV+*Sp|Bi&RdcT z$tRAr3eeD61@|g*mmTTca5DMqsa65XqE)cg7}xmWq0-G()3<4i%?bsvr00wHP1u>Y zUvWiCZOPb*(r>MmZ4lty7Ndj}Eic9E)A<0S>+qIfHY5&NEALY(v|gzATQEKnyZJ8Z zj=P;D*<*vHJFS&JRZ4WaV9CtOnDfEYr;;9c7?V4e0#_>?=H}Ac*dC0J&&0h{_Qk=L zNI&Sxjc%3eYVOnosJuj-746}c9k1JMw)FnqVv zi7u809fGY{iTEm0SzbBU!}uyw>$^Zl#;J&Ju8uEnH@O7>C3JkpcZIDHJf>)gFdP_r z`-xgM$47R9u_c`m;S5B0>0I7h8wH4LRW|O8atnej)r@T0($S6uv@w2tcf@m0ON2SH ze`$Y_8!t9)&PnD*>8_={I9@b*cjGzuDaeD|vAS^HfXNQN=R{`~34d%e7ANFQL@`x` zlgqN4w87Kd#V9Q3gz?}qZ#K*50^>7wLY5kvyC!XB$+(X8 z#_O?bp%X{c!KLGLIUJdtG!=z;DQ2NIZbIa`C+!TD?pb;sBZiX0C?j5W_C%DN9H`h0 zt+^+m6y{DGk9{^V*AR-=;s!V#$(vd^E*ksb3QRcQPHt=^=Nb0Ea@=wR^Dw|xx3)|2 zrkXZ-Yej5Z^TCE|**S1t0?%LAEb=rOE5+Q!2U>k3J{k9g@g;c^D<@Q5g_~9=$ES{W ztZP*Ol~??$5UkZ(0+aJB+>u&7`!Y_T26*4I3*o2-4Xj2xae*ga+A>Uy(a|1=CtGT$ zqj3AxT&cz0L|rx?w3AP<*IUCrYair@6d%+=9QJQ(4*+iuBKY=^n4rRgSFBXJ zyxoU^O!4mak?clyx8I8yF9h20*tyxkhN3t@%a8MFsGbYdogiG~SoCs7Klv*`2dirTKh4!21}t(d_MT z)Z$a5&V0Uq^u95KO#50EF!g3Uc{O+ZOaJS5=z(-Ji(SBv<1jz%g}EA8ka3mV@n9O zjD#%{a_X2K{b&P@nE9QIb570hc;GaAO<{T66sN?(_>YaIjK@(#Fp(Zy!f}5 zkP2NIJ1_6FA~T+`(tk&~F9elqjD9Xq5a~Va#5fM6{*gIjZ$b=*c8FmJubAEq@1Gi7 z&v$op{R>-{rTZi4DHE)+o4-rvoR;*_IjfNQcO$}U(nsVxgwG>5egW(1yg8S3r}$TJ{w=ja;4&_(oyE+nzZz$lrcqK{-G-}Z9Fn9nDazh zRZbgzccxY5?85mT6s)~z6H&;nfy@<8ppphAAWRy9cUrfpj-#=3}2Y#PPXqRul z&qT5VM~9x`@FRp&hu??MX^O+|W6e16;9DB$f~$IKMEpM2T_>VmJ1s!;aA|3S)R{UIFoy$js;V$eOX83HE z!zjZ8E;#^U3wQG&xY>(w}y}CHpmoFjC=Nn$+;}jmpu3^jhrK4d9*TRVU1a+fN{DmL3{t*P{U*QmsncxV)k&nzJ~4c*iytyP=;DVBALruV8(!w( z>8gp(<{y5XaB!xcb_PzMJ|Exo_W3%tSGk7&mrI`0b(DY5#RG=>U7Y{GOZhWgT#s2! 
z^Z|LjEl)1@%Z zryCx2an5D?QncIi6z|w>+eY4H$A`T!CzH+x+nCjaW$^mJ>-vga1Lco68*9>Chnzpvh`Qv zV(8~W!c=w+@Zf_zc!dWa;lZ!);FCT03=e+22mgr&U*y4;f}bSDA-ezY`S@aBs`PnS z$@fX*mm0g{id(RylHa7{0||LGp2pQH9`bK{a9{15>5CT3ojYaj?3sl#eAooBrheYE zT3$|BSU<Nf($imsI6!IZuYD;!{gto&N?U*Bx4-h+ zUx%)I^;f2M@%$Ps##+rI=P~)32A~L3`GBRb%@Zl4%BV~0ZcGjHY zo27G1Wa$(e&t>OLn_H7mNU(r=+ssmV>6V={Z`S;TEDHgprp(4yO}%f{d?@qHt(m)^ z#`ev+uEt*ATVU7Em|gFiGGzu8PnkZYcK!^)S_saXQ9BV2&a7F93&$wmtOYeSH=KZV^k>!8qZlVW%%25GJq~1# ztq)WOb$5lQ66)yZ;?8x%#V4EVS{GkouAN^;|5Jvqb;*Bfu3vC*{?8W=$KBBrF+AJ+ zI(&(_*8M;4xpchN8t(XWINvzqQR~u^O#vKg!8Cq(c_uzi`s@+%yjybk2ZFQLb@(B{ z1A_Cf{%qZ)bpQ$ zOFfrnQcFiYqXd_FZV+7R$uhV6ZhLYCmwE;WF7>=FxQy3+!KI#m3NH0LXa+iNd(?Jc zCcaY7CLu5NEXq>JqtaROR>1=hangU6;8OnYf}bbk`(i^89<)=+UnY1S+#LN4g3EHg zLU5^Pwcs4JIeH!v{6~VnD!8l{e-~Wp@75KbanOEg&$)t2`M!cn{bhp7`2Jq-3lN55 z|DOfV7yKo`Ww;A4$H;?rN#yd$_w=L3Sv^f}uM5^1N@f05u) z|3JZIc^)dbjMoyuW%;~QaH;=3!KM9w5nS5eF1Xa6Zl)I9>EBK8a}j5!Jf9`FY?rPS zT&7Qr;L`qif=m5N1ef~X7F?#=LBXZ|xS3F3yrew;-js(szLjRm)Ga?jaH-$QP3n>P z+gorM?s&nao_7QKP@tY)={mm-_D&TZQ^*qEx#6dl>o&2ldQvNTzz(IM* zcMBeXn^S+EU?ITamVZs~0Nfq5lTR z3ogssCRT2fp3~X+89lPSx=?W0?hO!J z=HsV=OFhR0mwGz0(3o(go=t*FJ+BBZ^}Hjv)bk4p8U0ev{enw9YXz5jenZbjkK|7Z z9)P=3PX5Hi#o;bjI|L8F-I4!8KT3C$e^_vtkFN_JfImmia~DxdM?LL=%kg$aVNzb| z-!Hg~um9qtywo#EaH%J)C@C-X^buU@sS{l4IlVZwo-)Cuo)v;iJ(vmA^`F(=R`t^n2vVP@VlC)Fyzn2Ow^$!c3iWsXud2Qoq!5p5OtI4tb@i<%5FD z@yr18fXZEdMhhV* zJvUyO)Fb6v1efwZy(}p&`CWp`@_bNmS^n?2Jhh%D1eg8V(u&mb_X#f3v#Bz*{2IZf z{$Et3mjAQhQvQyispbD9xRlQtmXw#}zn9>${GThhOy}K#OFjDpmwG-GTeT!K!DV_56kMj~GlEMypA}r%XA6a9Y0qy2m-fFRxRifS zaM@n#=2yPpGQS!Gm-XUjg3EgGg5XmA9l@o1-lU{G(*BDDm-XdF!KM5%!DW0u z6kN)GD!9~Ndv($t$$u`mEDwJbT;|J%f=hc|oSf7z>*HnDq~?>aP0g2i@c;7Q=TAwk zr@@2&*@HhWxGX=f2`=OHuHdr#cb}Tto?e1W`TGQy`X3Nn*2iZAm-0IWm*q2Snok|+ z_?Gek4_+X+Ooyo8QqPrw%W&_Uo;uw71efu;eMVAVwi~-V_(2bT*38s;ss)$!-{8S* z!OwRpQBq!x>na47 z@%@VjAF?>9N6O#m!LPb0wfxP3UnK1Oqu^5iGlEO`u0Kt!|0cm@I{c^LQvY@j{-y{2 z+JpC8lC($K`Ew8cvIn1db5f7YuR8<}K*Y)42L+em?)KohOOyI#I)pv=G+oMo$kR`dhqog 
z{1d^Y{oi}={9BXuNcky^NiNgpLBVBt`={Wt9$nIu)Dys|Qx0vxrJg4Rm-3yPlX|2* z!vvS%YH9u4EfW_N24t9Dra$>ibDiblE#^Ag#aE>pm{- ztm6&1c!ddbPG@&!uN+KGr3FnTcq%i(ksZf@7loNrb!R5)CH1 zcqMN}aZFX;$#mOyss>JZaqMX`e6~yePbS@JT|C<)fEkV@?0KA>HV#|=B>0~UU*zI< z8*aLVg#7J>FLTMyJ5z%u7r(~v78ift=vm?76O8;S7w>Dz&uSN6Wzy$v7k|K{&%G{Q zZqjFsi`N=^?sxHP4S&eR?;51RS{EO0%%1nrFM@bB%lt7vFQTme=oVCiV0+ythmKrjxY1)80GwZ11Ugz$IU9c%F;D zX3{O+#eZwA`?>f@CVh%r{D#gt+!7c6Z70o3UHlDpo;b=~yocdVf91sMk_;`c-v>>m zPm5pkkW0@e^oS$k;(YIqhkj2qspnx!^J_J;+?X!{+TXb z)=l%-F8*za=Cv+vch$Vk#TS%oZoByNLo{FH;!`fwe2I(SZQAE$F8=sHE#KtgJxo8? z;^KFi>lH3uWzuJri(gfs^{;mE<@uW5?c$GIp!vNn{;_GF*SPp|r)c^6UHmM=A9C^f zURr*wi|@$Q{1F%Tn|5ZMi+^MI6E5z=^C=e(ne=IM@mnv};cj*DxrLhVbn#;*e_wF% zD|pj@W0#A+GeGlKUHrLX&G)$Y$Ne>b$HgBj(tNLrzua5%eJ(!V@cl0Sb94Qvi}y9@ zbI8Se57qj=a&fCt^KV_eSC!`9yZE`6Ypxo2oI35#kn=U?|B&!-_|!aoo#o;`=1mC> z=lwQEe(brLtG!Th>hK4%G|zSE>15KUw~MbdcK31dJd-{F7k_fF*3bW0;lUp^wNI7t z3Tu5`d_$?`{apON&A6z@#s89{pVJ z_ikPx_%%YlKLv5{NAd-NOZ|5XF6CblT*~vW%z5xf%3p2zF*pB};fx-e1}9#pnesw; zssA#;rTztiOZndmF6CbrT*_zVAwV2%J2lnigkAgBXJ|q>9<-l4$Bf5J@lEhmrd>7J znc#hl0@JM~_+_SDH9rY{rn!$Z)gZywnemJnmL>RH(~k#W8xP0MuBIHC=qC81rav{^ zOoE?dBGgZ-N$_1J|75&)?&PEC#*{ppW`~<*QE@iq4xc3SQ^w&_1!oiN@Wq0&33m80 z!P%5M{2swMq;U8e!P&Jte1qWZ+8y2|IGX^6zbQDI42SO(yiD-#1ZR`v$ooV)JVfx* z1m_UPkq-!dso-UTUnY2k;B4X@J(C2N={Z&K3L(E(@Jhj#30@^Q_dMmnVm}2RCtNZf zET0?#2Jqn_>lb;oqQ1mFMUMO;!(}<7yfYq>^^1Isk(c$0e5>KIev$7tT-Go0Y!jb= zad+$~GF;X#%2yjM>lgVV!)5&5f=3Ofywrc4 z2fxLG|Hgwq=D}a`;O!nf$CO{Uoqauc(1Ty=!54Y(UwH7f9(=n8|5WffBK@;X`@?)0 zCiqanhYQ{;_z1xt6FemNKEcC+pJ5s-hC5R5(SnZ>{1(9@g0B@kD)`%ij~2X(X*d|} z7{T)dA1ipZ;mr3M!6yqoNAO35o^gWj5%SjyzF+X!f@gHa1rCOL13peYJwxzX!AE)U z1%@-eGG2EIF4N&jp=Ykp|E=KC9yMry9PO0yeb|ZOAfJYhWB(|@rTzB^K1;|y#zr3p z^+@@B9(;u9*eEa4^G8&IgIxCGEE_z?W&itAMKzy~bI1Oh4QG1F_Vx}TFWcM8DTssm zr{d%2;XYnG$YuHAzF9onc4mrpX(rqqJyn9sc-?L4ojcqug8xM5>22zZTmBNkrJXYc zpDOf>7v)*%f6#D7k6Gu~ztcnB+26x$&sCxvN_$!i?+e{BA3KS1CDZ3w!`=1-vY`Nn zJKWbjc)*lHcepD?JpToVhufaR9`eN$Od-#I@a5r-FaPqI 
zhg<(V5BX*fdHy{n54V2qOUHvhSK#BM=Qa=dj|7+H_8Xz+N}(syj|&{`a4$BT-mb#O z3AfxseuanpY7hAh9`YMKRZbjlND(b+;mxrIP)mf7Ila!?Xx?k{<1((l9oVf)$ub^Ta{5e&fnXkeHi|5+Yruz!HQKI^t?VQ;)i)>%v z%xU&CU*Yrx3w(w3HMP?UNei8w61h2IawEmWc9B}Lem?#F?_`zM6uW_*pedz$$3UDt zC>f>VLhPWK+HOb8xTmnzCDGAFy*;T?Z@iyiqHykzYSgVZzbs4zlfo)2Y7sc3|bjJL-b8F_I z0M46l*A(K9M=n}mPn&Ur(Oxrq%B*_sGy4DOI0JfErA%{_&{~l=JMqL+zXUu7XJn=u zPVkDv*@-7^{};fSL;U|&|Hbf2onH1k`#M8J9kPuf=!Xy6<_dh|v8c1Q^pOiSEx-@u z%baD*T5kO!)do?J>r%-QoWS7)4ys^gZzxKpaK-C-Lo${~EZY3O~O}2OJRL z^VZ89ej6A?_$)&n`%U-qz`Oo}g zR=Vx)0x?Ru({G)b0RDIBmjTbI!fzR-!w={mm48->@GpcrMcnZ}WWtyKCjzk!<>yE6 zoGSc>%mlMDZ_N1j#K#@~kO|)%|F)+Y+P^P9Oy*SK2SzYfKA(&~#|!T8uQTCyh`$Nn z*@s;z!`|@GBTG99jBDrGGD8E9y(MAGezD-Q{=jGg|NwBk%kebt=!u{%ria z?JsK6KAipH^;Jjvx$ZWV{h_c9Dc=nGn16gW_r9q^Y|=l8Z+H5=;1Pb^ zXdQmY_+~pwGY}R59iwXZT0Y9 zWc<71>s&j(_Za_0G7y|s82?<;AP=1?+Gg@U82?g+=kttzH+TFy{Bh$SKPMb@#(%&y zFmlq(5&jE2rpmvHt8_?vj3D#R8EX+a`9~>t*mPfI7A`pR@y@)X6TaiW3uIj;(z~yP f9TN^`U8=gkmplIK6H@u_JyA>Y{}<%p_WyqX6^Hti literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_noblas.o b/third_party/libxsmm/obj/intel64/libxsmm_noblas.o new file mode 100644 index 0000000000000000000000000000000000000000..ee54b138a9286dba044e8f882453735355730add GIT binary patch literal 7600 zcmeHMU2GiH6~60%#D%a~tU@h*CoqAf3e#CTMoX%qopqcHWzrzWDX9o7YkQMec-Llk zhzz2trKA<3jj11~iYlb$ArBx$RV(p8B_2$J1QMvtOA%64DMf%lYOAyo53Q=Y-??+| z%-y>)*rL2t*iLpq z!2bQ!BZm$gFefG_4^Hke>x+x?b2If|ZlPs1pX>zYbiP1A=v7+$^7hxSq~QrvA zz4v2&jjX*>Zwy|TJ>F>*0U3k3lfVlByvD29S>7x&^LF5LvS=*Yv5xJTVb+(+G-yZ(MBZM)c)`<1m{ zyu5n%2gUFr&wFpxyk;%ahGXtox`u&OrUHpd>6|E~DU<-fv#{{HuZU9OwSr=ZN)lC5 z;8jBjrCQ@PAqu8IiRIoXB&tq7s-n{oArwWC3PvO!1>;b`a6(-WqJmk5mq#^%+VJ{o z1GDV^1pR<a%(qZUrl^GscZ`c0g| zr~rT*2>(IZi;@8|q!hVAj!X}^DS9DRqy9iEL>)x5cKMKxbM zbETVieze8Smrrfon_tZhtmn%>+qo6glJsB z{CG{im-*Y^vADPiVaXhIXiT?%owe0Wf6GSX4H(VdcW3t+Q|6tYiYX|r3J2Wv>n{uoEM2&TVSxmEIi)~Oj z#+2V=m`i?f)WnRy_~>M0fs5j$I>q;j&jd?nF}ewxkxkf? 
zHeusXDrHBdHvm0N|5}ekK5>NKrr>u%C;2*#oj9cZD1i{Nr|Ob;^t{tMiQ)Hmf>)uF z=<&-D%<&tc%Q#(uQ=HosT(zg_-N;_PacJF}75gWklj!kPd(vh6EPLWcLh-2!N&6`R zA$ok(-o*Sy_IQlAP#nsIv{&(>f}7Auh>v`xX}p_G5W=@|Ejsx^CtUoAgCj2}c+#~P z`#{0fuI>mz6i4ls&nS4(%@}(_!Da1H=xr1LLYJh=F=i_G_1r?7?9Nlopgzs)PSB37 zC!`jj$1r=g9@LpV-RUr!09$W4yk*$!1+wd#jM@u4gji=0#JqKVj!+)idOMxRr;rrJ?FZWnKqa<6cY zVdZvsZu&O9_eC(X&}=qZNM>sxXxR9XbvL@_%;O?r<3#OPyWVW5W$}4+>OoYRTeJWb zZ7hePjOCDjNfD-hNmpWvUPbq(XPmySKCk~S_Jkf`{F`EZo&P(V|1Jwd%HIwg(qHat+S=qYD1NxTB{t9kc3X-+ndy5E fh+X9Wl4R`rw16Voke6ASze&voa*cNXy8nLwZ!c}K literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_perf.o b/third_party/libxsmm/obj/intel64/libxsmm_perf.o new file mode 100644 index 0000000000000000000000000000000000000000..35c387a89ff528e2236590a45ea10c02378ccaea GIT binary patch literal 3472 zcmbtW+iN3b6hBGS?aCU{uHt2djo_}kXlIkUU1af+X_{stNfVlM7xZB^Z6`LG8_ATd z3nEJq45iQq{|LbsMespHR(8QBq2TMnE{LGe2kVQVLOs99caq6;21O5ibI$pl+c)3k z%zTi^FN8xO5(?34G|~hm+M5{3DMw7vNt&VycC&M#V|8oOXEwBEwL{lm7&)?3Hg4`y zjOb^eM*qiepcp%kL+yWS^zS8aDbqb;XBMbZAB1e&yaWHYZbNa%*nSoK7d*~1Hsj9d zJq*<$V|x(z;Vt{3v9&*u>+c!;1Ec?k%5L<(lZt~(#@3w)CtK~&)AK~PDVt7Tj-Sge zE}xITtj%hP_)KCZnV3ypjGtSw*5XFh9^li-#Cf8}RNc^xB?@iELZ?m~o7m+Ae1>rh zP!| zog|7-SX{<$`OhVWf8ERSP3sV z2aWMbkKt)Y51Nnc$SCMEi&Npz`#O8dSdzz}_Q)3!?0)#w0Q_12USoVR{M%uQIY>^! 
zb(W{Xqji=U?=bFH_hZL@w9d}?V(ilZ|1SdYuNZgFM^^m%0Q{!_{PzI-I3H+tetF-s z9$e~P3c&LLI5p}ko4sb!M9hnXN zg}k0EoBC4PFe|x2rd-hrrJ?akW~rPjE)MmAp3SAr@>RW5%8XJtDDNjoZWQ)sipO7v z143^Dq4VIZ3jQ2GCitJ^A-D#R3I6B;g1-ci3I6E4f?o#61b^Il!KI^laXDAW^TfaK zmtKxM=suGFEXyF|d5VYN(*T*gbuSHI#~(dN{NoCT|3gX!ODP%n=(ZD8T)3>c18KeY zns#-Cw4U7^@O6=FYttret!h_ETj}*k>$ah2?knLOdc-C*z7k!lQPmK3)F;q5Jl($O zhsXP$Y78AoTWvR+R*R?JYTK5^U*_qaU0r=w&A?hWYu#$oQVa4w*(r|a)-kpRNSVUJ zhv4g`XITLz=T2-X5j@-*|2*4qHvenfcxO6Dkn( zai6^Pw;4ly@A`es7|2^+)=l`A?Bh58Jtvxf0~HsBdAR9wR{Z84Qy(i^kIpY^Mf+p+6ehX)i>0|!v7$0c;nGay?2ix`W2I6degHG9@Y8J^o!q9^Z#>@ JX882R{|zV#v^)R+ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_python.o b/third_party/libxsmm/obj/intel64/libxsmm_python.o new file mode 100644 index 0000000000000000000000000000000000000000..13e5bffd11ad168768231455759a944eaf2801db GIT binary patch literal 944 zcmbV~&r8EF6vtn;>GY^4!Am^3i3{nlARZ(z=5`Pdf@ev)!bJN6o5ObU-|^q`FPJZF z!a~v>G?0AX`{aF-Hp$og@u3$4U|In8u+IzyC@siq%;6Z002Z^^9T_d2UdH5>MwF2$ zn}#e3Z^-C5U6G}b4WEQ;ykE|X)ba!P*?AKP`mJB2c(1J-dB1}nbnt=CKi=X^m1VEx zR<4VjCIGbBW>Unc^0U#kiDbHwKvzNvpm9|JO^Pf_b7XlTQ;MfEsia8W{cZkA&e5RG z>d-NACpW2eZ8f^UQ8D*~cs=V6uKyBC@YetBzecZGlVP_T*9o;D#%6z`k;UxN+)k|6 w_zY5c+p8&c!MD~Ss_N=rRXb}0-@O0t^}OEbRM#1C_dn_IU+KcRF=}=FznxYvG5`Po literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_rng.o b/third_party/libxsmm/obj/intel64/libxsmm_rng.o new file mode 100644 index 0000000000000000000000000000000000000000..c6fe7cea28970168df4aabbd299a73469c85323e GIT binary patch literal 10128 zcmbVQeQ;C9wZF0i80_d;;P>h_G$^ey!NKwK1;f}4ZFH@BZ3*Po36RiYVmn|rwEiHr zNJv9d6Jeq|LVkIFyw`RnZPV$;WIFAHHt+FBo1qpEhh|9Xgpkq{QW^r7V2DHb2nFo^ z&VI<(*V0Uke|z?vv%BZ)?$vk9Xl=EU|3p2=szK!z<+F-@ ze>gRb+R@ZaROt6bQ(t72Rr<%F$t6mv$h20wyB*f>9^}Pl0tc> zC>Qf`<6Z)pzBWVcVusq(u>;f)%EpdR7>ZEOKSS@8^)lxQ6EoHu`rOYcsEoDxjWNJ_{S5Pu7DA{0?+^%#L zj_^66!Zd{~Balz_$YbWoC#Xo}OcqL*sIVbSx2f|CtZ*_K$d6jlK#_J?8&A&BlciC< z_NF$VCubPRTeUx!z%TE0#b)W+u=XcIJEf&{>kVBS(T4Q&-)0)xd)mwCfdW%I2(i4$ 
zVu!idi+NaSJ$=$9JfhAEDyI6AS9>PueVY%6r(Jsblui8NR8i7xrXF^Y8pry_U#-SACD$uSahs>k#$#? z^|MXZEj1E<#iLtGBJn{_1X@i?zaF(99H~C0_BCK=644*Ft_@S%hU5F^TYp`%R&UhT z>JR8mX9yIe-@hk_s(zo&^2xgPdrD8_QwbYtH3kY<65&^71dso)#+yLTtK=p+wwKBQP$DoJO zc5F!Xl~75B#*VAL<{-&(SAbB?LS!RlKqm3WMpfT)y;ues^NH%)Tx6)eU;};g(N!&WQC2%NsqpbpTE650p0Iv_i%t7nn(( znf!4t`t;;I!QkbWkv66_lu4UIruG&^)`^+AwilVh^5z8OkT*i^T}rRRrgjK&dnWSc z9SmU7XVrP|{U?`tCJlIt4~P#UZ{A1VWW-NQl@-d7G^TZwi<;J1Gu7$h?qI~ZsoK%c z@@IZ@vB6y4#ZCxDzOnaG%Z=;jpe8;=#R z1mfr7M33lCvLiKK<$9!$iK+8?sfRKOH&P+XDkNti8L%fJAFlLWR{ImkK>dDk7)1H? z_pw^Ul7#L*mJ{+mWNDjvU|hR=d7n$&b`W~>q~|w!vTUcB4E$P87QdidgVq^c+wU6I z-ax>lhi9hWLsY$PYA2zy_il==bb)D=y?{I7IxOMWv?_n|vD%0LEB>`^74FraO3 z14H{L{lQFAJFTbp7eMXSt*?>;5O~sB$xJtr=|C?`{~6ZiJ|D5J=<(BPDCzlaC|SHa zlq`I)E*bb8P3f?fb`2TYNz*!sDNz{d+Id}jjm;_jasiCjS!KTybAtMiRkk~f*oE>p z>a5D&)>(lUVWjx~!`1JqeRm*&Y&$OY&AGq3u?)T1o2#rFSFWtUrsk{9zhdpY`}%X*K?RHytKp;nZ3W(-pb$ww11Tolw_zi_?Ro!fZx1WfV=fd!maMQ2sP;sX?!rz;|eAo&r5{d zFqh9{D5rF5AQ8IG4eIG(5>2eQVI)FhiO{%BD$S%0LZ8kZN-T%ea*|@+d{?dqlosjM zUf52AM)mXuMdSjr|B;bcaUl`rN`iJaND^jP8g~BalQAdSKa=1thDtZE;{rKeCSbq?qKh@EA}-YxV}~2 z@*=eJV$0B88StyV6@Ikcu`i(A7{CRxrUGqG><+Zo2ZE|^T@dYQvCp909%xW~-)um8 zdTbWju|Tux>uI*m$4`w0t1hYAzKRJ%)jg+-)P1g6byr!SHc|F@z3KxsbvJtI)jjVR zsYmi^)Lq4MYZApb8yM4n8}AuR)wcbx!Ty!L1xl-8U0P<`&l`=FSOL9aj5$(1BTA zNY&0lho`bO;aS8y{u=K%kb0tkrUHix3xZWQ)P9|JoW{BoX4b_ZUAJ>1v@SPIUYDDA z&tQDeiw+Og#e;Rhf?d3UC{g#E$MnjAiOL0MP7q*SJ97x7cKOWX7>M|G9_afgXe%}gMHv4a2)(3 zcq_OU{2lNU;BSMsfFB2c3;Y;(GngEB1iT3x1D^(;0-prG4?Y2Y5BxXqaqwTk?}Cqk zkAnXKeh2(#@DcEG#QidG1RMs}fos7g*Z_yXHDDba1m6w*68MYYrC>V#mAX*q3;y|a zUtU$}uPR?$Ug2L@u~4g6tX2Bw-`l$0A8v_p{7$W+l!kz4Er?v-x>s>+E^^&mI3xdQ z=nDg*btL*Kj!vb5;(JF`n!^EsEu^y1 z>%Rw<$?kD#gZ_)wXc^66HQ8;*V%HsUO|cl>H+b;1*Z-&}0Sqr=>kKudFzhYbo@aVX z;%>w1@Aj62yhS=}2GO-r$h{`!+MkEjNV!d~e}~8L&fh*Q?5&7Tuk!}omTML>soNGz zFQ7SoPv~H;AhdH4iFuTdC}pOQJL^9LisDx2kvo;#=*dQISa6aGz{Rpj^ay|b43|;G zb4y4-`*c~&lp2#~pB@|&dZ^)6@;T=Cbt|(R2yeQT!W_6a2Tm(JmE7!P$c9hM%iim@ zuY;|_%}yK}AgXS5B4xwrw3uILUM1 
zS2^_jA8`AAWiqFI!-Jc9Iy#!VJHKAOPKmX4Y;5X@wZvM>E49VdiBUx#O29KIK77?{yEf%2&cD_#LEc; z5l*=z@g)R;B)*h4*&S=gw+OsJlv4gB(O&JqrQfX%JSy6q4t$4b(|e3a>OaVv3Y}X- z5}(VP%72Se;*FyH3kUv;XusgVkBatw8*X3c!#3Q$p7bUpBD?neI>%6s+xO+74JZ9_ zeXiMX(l2p0@}5ZQmw150Y`X-TjR@rd+UXeJBB_er0SmJl0P9)by;)^+) z!KK}Z1K%Rr_c-vEM0>3Rm-E`-z@wr~-#kRppB=oZJS9qr9~AAU9r#?){;>mZ6zyF3 zDE*cCj|zEvKHB4L*rC5nw8tEHyJ*+j_MP;M2;3f5_INl%j)17J$Af&~(ms9;EcZ)n zyK)DxfDOMDbt00#4J_sH=Nlu!>DyJ}@_csUQl9*=pG)$^O1S-;k^4Y6-4Rkh`9MTC z#iYdL`VnqF$H+G#@@GC+%4-CIoVbidavU*8gIQOY0e=~2qNTiaX8 z2`!)Wf|1WCdN0~!?5`o`GcJeNl&4z`IcYJMIr*l_F`V9PIlAcCmZOVaaOK@yv>xSC zp4GBr$^%?>tm}zk_`0r+j@C}BYG)Uo==dk+PCRSZeN&iheWwwiCX64a{RUv!>>sDY8+1S{7}=*e zI_+-(md*Zs?+`nVA<>s3BhIP zx07@J^j~l```;7xE5u;3Pv^ALKD}eI+4s-n1Lgk+yfrocuA(EG{R6`O5iyYLlkZOZ zpN8>l_D97fWWVc3m2>{Z7@W=iv-DU7+2fD?1##N{4`H9~w@D-}JEuKooBm@b|DEIg z^0K5(8~JT@s?z^IBr@?24Hxi>OO@tkC)t)|rx;&h7eYNC#+N&A>5eSvyGGXSqW6mN zc8uyy*&SlMbN;Kv_~#r0Enj(Em6FIGV~o0A%jxBLDyZ literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_spmdm.o b/third_party/libxsmm/obj/intel64/libxsmm_spmdm.o new file mode 100644 index 0000000000000000000000000000000000000000..34acd1e73424f0ec108c588cc1fa62d696b6e38b GIT binary patch literal 163176 zcmeFadwi5tmhYXa6d16&pQr2+gOh24UDJ~|89HdDr8LJ*VU<1wPXS_D6OA?+nnoL2 zz)}$dqDg40;{$kUrF)!i&K%FY%)CxZ+s;TkOizMby#^4Cc*A?5q7hJ0BYD5S{Zu7E zo%7D;d_M2{*Wp8++Sj#jYp=cbT5IopQ_4Lj7zh*<{soH8DjN0`LW_##EesBM>Wr(d zsJiHTMTrZOxz5(NUP)Y#_+g^{hvz3dW|TU)-M>n9oSy9XQ8e+%ryW<9Bs;DS*DXI% zZ?*sR=%S+BwnT1M!g}CH%6ecAzn>=q&n2w~wkHBRTQ>y=*pq2~b%s|vXQ~(eG?9DG z?U>+pq)SqjAGjUo*~#38$;wZY@dHU~Mqg{E(m$HYeUQw}j}0Vp`*WL;m3tGFd*Z!L z^R|>V<0aRtYE88dS}PwLRaBJlel(bFK9ZW$IjJ+{T{Gz9nqs|9*W1BKFU++**x}}~ zv3?w#bI;C&lBULOchhd`q@q;&3)ae?;C4Gm<2HXjY1cH*{r93o>s!ZIZ6^_N(6S@e z#O@`*Y}9(VJ2h#L%FRvXYR@~Eup-er691uYeMFyh>yd!f_AX9bL91<}A69I&t@d4^ z>^x#dW$XBk&Yle9#*j6!2hhhQt%v&s_}pCW!h>g9k+KKV-h}~D#BR4H?z4VzTZrnm zui)#@nS@sn-a9hq*}166S@mE(MGYzX^C=DsHYsip?3zdNDQZb^`0HTQ`Y@7~9lAYf 
zP3-&%lyj{}$Ux~EplS5u!E-D-`g_ui*fRZ3H6{!wtMlqRG!HA?f<4xp?Wr5U9(wMtXyf}`NnD$Qs* zI!Y(0Q5vG*6m(-$DUK18u%K+ur!m8527{8q54zr>fD^Be^;_*dM+OJe-poMYUH8mn ztk-J$73h`Dbq`}@`&C~I4uX=)dQ!QUr+M}5MU!^;n0N|Ul`!~qfDXq6em&~Dz^}V~ zS15ZGF{83czN52$LFG@hCO&_3-NOb2${zWnFxCltMxv9giLY9}n5qfUp5)uw74RcY zu_pF?UGAz!hLW6WP5kiahKCIn);u_r_CzI;ckp-Wd ztGVGIsbcw525(Ox)luD5NL7_jWiaNp0XwQPGztL+p{h=FbNQN5(BN1^=B!B(X2l{$Y;N9`FS?e+&9B(xvFSr z{i*cVpHH;@2=&qelw{!;$S*z5CFA>8dmOLqY}TJww8ZqM+CR2d zo=DjVZ`q((Yj#cQbiL#cf==@n{>sBdcLIZ~ObPF&gL5Yq`j0uE9985mMcbK*W*xc{ zhtmPAaGMW0xxVRM&7w)WT3=;?*C~nh@L7qjw^=D$Vu$BiKTF0A`;+Et)}_$Mb;)V{ zly&I~dKFhN`xgo8;pxGySM8i@;4}%&^VToM7mEOVi?5a*N~^tKWN~wpOtS#$zFpKk zk&C9a2F8A&<)rWLfIr+m%7{+ydd<$&n=-P>d=%QBo%1OXXz!i()zWC%|($;967JelU!%UvN zUzbP3KqTML&u^;5U#=X3Pv+OYF0-M50$IrP+1F*#B2!$TbAR!b!Mt{0aIkM8BMQts%7B)1WB8H2Wdsk=t09iY z=lja;=zET=!}xCK86;yMPtcBtpe=xzC-O-&@N%+cC{fc6_I! zE!Hp3HBsiuf{ydclN~<|xw+e7A`8imX<>)?J}-Ay?0)>e>bR-&lN-yDME7excibP_ zM`kzI7CVeT39uggxs9=Pq$bsigytTKtrx7^eX)nk@BJ}DPjZhZb1!F_Ul5a?OU4#k z59}1+<)m>tZmdoQo^qQXiX8@?(|qw{uQuW|?{KXLo_9#pK%%tuz{{E3OVhpR-=@9L zN$K3nE=}mk2$2pYtm%i69ru}%6x``h$8vH}oBLFY%BK>!5AhtLhF$NUL}{&W9ZAFo z$T+~9uq#<1259;~``|LW_3c11_d3(3b@w2Icfe_GGxcj)SFD%% zvhbyf9Qte?C8^-q`n4YDQq_CaQMb0mPEo@G-L6;GPDoKEu-BT%UP8TjSIj1Z^@}sU zqp4WKHyOXr^rh97JfFe3-&DJOzqRrOpmeygfn>*RCQRFmL~CEbYaE>BWz)qw-%aIS zO;ql10v|hp7aUr)E|q&M;bmiHm$EjQdz_x=dMDWWx|+Qv-St`s?k?$tj-BS!oSwAK z>2kbtO6dAYWJt!=JCN-SDX$@r@a6?sEC$+lSt~PYAJtgvc%eTzUNkPxddCaj?&KbG zDmOW;@02)Q?-Ywtx4y>C=04i#w0_3Ap!FPAFnbl>;_L-{L)jnj9hI%%J31TLrcNPE zXUVoYvHL~;{{%Sk?b*dcMb&Mq+ic3N+wT87;s12|KhO9-J^qidzGz$BKL6(p|3~9a z;Cud$rVl=0A3vHr{Fv5J?zOij<8xvImiIovtu3)N!Q7|Tir?cA2T+`hZJ?i5O!cC+ z18T2Md$aoK-B7-FvrY6{AnWiAX3ykXoSn!wl%2qLR5r?YboLl4qAs=n>pu>ghSk|q zOora&R-|q9+|PKNOOT zzqvH*g(`;voYrlloikfv>#X+QLbt$b(Vg}k*2>er&D3bV*H9L>*^Mn0^)VBxW22!y z)4Z~ar+J~Yh{#$pxnx$SJ6*3AyUm)lUj0=?j#stK$=w?(H+1D*Lz>*&Q{d_z(t^cp zhVnVN`(tZ_)z$+CRggcE?~U1Ho9dN$CM{q}Nt3kx71EjRdbQZ`YS)PfE?3U3w}K9o 
zuo`!w^))!QhhnAVP9PxAG~D^Vw=9sn^{&`o5tw8q*}0E%hnc4koi$y(!Bk+AveYTd zC*4lhzEMdC!)lj;zAh2-K~(ay=9ZaoY|hptO)?dkOyTWZ(s@2OCJ^T@&!m4yt1tlp`CP z=^(f0r0GFY&sMT(>YMW1*x*zzxsrwCV@j=8L8?GoY%~sU4iGKFW=SAt)=-v6C!zQS{cx>8B7KcVe6dB0T6=^SYf&(EgSD}JrJlC$y9p3pt>%JpQqQ9%4duH;QlrM-;SArGOs2{DN>=(zdn5 z?&8Nl2Az5w0y^^nt-S#aQnIoeQXC?pt&lQs*y&(_cj#@0o^ritorZC?db9aQB{RHa zcd6F*A~G-;-ld*ZisN^t`Mp2ZpnSRgncPRI+*=OFv4*rshCtO}a<1p6)h{yObSx}M z<+`;ruO&DU_{{AX?{p;SfQMottsM^;_~I5sO%w40LXov{TGJ1xgFA6+DhPTacPLf4 zdHF^&iR!Jk@0>|o{Gt3>xxW@%Y=6UAxh+IeuPHX@BG0Q!O?rV1kweX@0l{coY_QlN zZk`JrD5C(Ek)HfVvK6H!z2SNng2`^I)bTEaAL>tP#T8dk0XL;;zn9m_u3{$sfsm5)1UMat(|XPhv*B&eUd(jXyqw%jKN^u4^KSU*#2h& zb#8p8)&65RY}zPNh|Ft^)wa^K`hG(Xr+M>7!DhV#cUJYl!6{ax>UWaCxDSMo9!A{X zg_37(G_-Qr#pUA-cFd3USa-4?%mntp|Nf4~n?9NLm90b971H^uU5X=f-fdt%=pL`R=#k-7n9GAbI+uT z*(*pAQea4hWGii)*;Dx1O_k-in@-_tw^UZ+ZaJlzqLY<%z)zl1NBI28nS{?jWhUWn zQ`yEOb|cPHZ?F8#q2uWCX9#NC`%H%Aeaf5{Dbq6NbimgT34LF~Na^3)l=Z=W7iz2J5sVY1(w@KW zuOMMs&K8K0?<1v4=6+&D=iJ;JFpyk_36NYN30W3M(N?jp*2+KprBL^&MC(@gfwtIM zMMA=r$O-Joc%c#~RFIh&a>=V`WHQyNsi^$9$Y;JuK=_+Z?f^9RLWE5hW>^>aU%68g zp_5dV{i3?nZu5@ZR%eaWDCLfKBkN=Iju~F)7OkqiNf?ki(cZ}}-U>GtjtwsWuTn{| zRs*T_UCXW(Jq=hZwlN=)RwX>0x&)?{1Z%+R`(w)q043IDlWuco(sM>95tp}i9+PSQ zFx|Y%SuwuztQUi1R zTeSb!bl|ge;8O>e6ZllEou`#}J;}5s+@R$2@U7+-rojBx#}-f8HFves(G;U|lXMUe zK^HgHiAHtJ}|?JUI#C7F&N`PQ3P6tD%dIBqmC4gPgH-eu^P8pjn7yO zY=OyTHEgpg^Uer}OHfk%>}w93=I!2s(TMWwvRhHt*rA-${4sop;h<0$+4JoXG7;O2 zghov7XEem? ztvereVbp=q!xBH{am2pkoVXk7VS(S|20l*557k+BZcU6ikZjv+wSD$yrt?dK)`|~s zs*IA9Rq;&Hs(3y%X;;eHx(gBP))y%5Qg(aqLc$*nw7!mL@Bb1aw0<7Q&fy!(PJ|X! 
z(~Y&U#3Q2DDXZ~7%4+xoMOaM?Wg`9j9r|py)$k0`({E`2Z92IoO5i4|VJp)4P?XTx z2%%-)BJ^{s;d4T3zm;y?Bw}yUS49E5iH_SM31^hxaD+js`4+*CQ{GdOVwoK7nf0*} zt4-T0Qq`)ks4{;>=Fdn~tI}dBEu_+3w;J|QT1+*|+DhGWvQ78_+CX^rWV0AVYzj-B zsE8L5RznXlO(&{yu3b)+`6riC!ef;1Bqg{fmlKnXl#{3BM8$w-4WAH`JyFX^yGZ5RClu=&-w#Bd~8NRqMbR@=X6B$#5h z=Gr{bJxyLsh&my5w1jWtGeXR9LbP!TEE|Kf@p)C~+k7J5*36i>^cu}*M>CpzE}Jjl zfa!K^sqlRO6rK$BPF7Ia_=C!CHSil5uiUej{*5Zopwgh8P-oE$l~Aq{$^{}kNxL}F z5PS_|_(aA?e&8@zy;j?eprCVRG-kE`Jxh?*6`xww#L~HbQ6YKFOc!^qR#G5jc%!Ll zlDe;D*%}ZXD$)8OwF2}Bs&o!iX^Om)r+K04#3sdVfxGFFF1-~vc>+{^G8ORZQ=o6f zA*jwh;Qq&=HsK0U8d~rabJnLWmHuxkzB`+Anx*=sOVO=D-0E)+G5hI<0rmPuL#EYy z4MHUEg-ZEz--`GSacC>@lb&5{e!{a|ZD*Q8qMyx3zIf+WrBeawAHv7Ok~VLF}QfvOK}uAxpTBrAAq5mE{mw3^~bXi7HEoED-!|viPOD zac0C$Rn|6HqajAGsa$h3SzZnlyl#25CT6*KxBBv#VB!Jt*HDPv6x2M=md~cZ7a6-3 zV86)C1P*F4HFg~hmt6%0`2;JLJP zldNV1GG{qjR^+GfAx+_j{FOgHqa%4Q5gUirZ{L-Ozi4^4npqr9t6w%Sl{2iKu4eJ4 z>A?0(9M$igT~4tiIq=);sF=lFR@(!mL$mlkoEh&N&En0eNjuVAy-edyCMgQq)_v@s z-lYf9t%nh0|GV$PFydFf%S_|LW*Q$Jm7Pqg(b*IH1^BeYAN^_EZ8e}Qj(m%DM#r!^ zl^sjxbZZu~I7N?5w{~e3#Ky1!l^ts}?z0+R*V49IDnx}96;{-C zqIM7!E`Y02N$lSn*)BG`54@^Erdnl+Diazle{nWZnfQlR+isdgWonc=dMpX&j5mw$ zcqVf{VeWXXqVvbs;cgjkCNr>Wj-{SWW3-NE$1}GYdw~2R)oB`U7Vj-7-qBjl~ncP zXOpUa4Ew%5&E`VS){pO3liL4Uv)N_&cG(ABDx^oCVTZ(wHg47|4QiGK<4w%u9|(z< z$;=)z*-fH1lxTtyO(4swTS2^3CE1E@6sOhDAzDwad_=B2NQY|ZTmKB*^3NHd@-3B-< z(5o}e2NK?_(bK)qzYv5_?*N==@pLbI3|Nd%!mga^g^Jz4Hd>Wuj8hm5C0aMJ@7V<* zt7QGOC$ZNv^Uvp;SuiHywI?fVV}TooA+hWe1m`Y)rL`g_z^1}O>2GtjgA{Ndb-)2v zY`zPiO!pdjtJQwbhbThnv^q$e+2h`8vaauE72BlvPfhPAxY>RTiY0dmN~Ib>!hTh`dj5B`{{maX042mXT~>g zPcXy9_N*l^iO_KkznQ=eH}E)fGAJUf80+C;u~*x1PJyq%Lg-<(&9}um2|%=4m#lm_ zXx)uv)Qy^Go|I*P!GK{Y#Rh~4-)!cvPs-k zXq6l2vFswV%V_g)y3+quGf%VZ*|ydAn$>WhX{}Z1|Ekqc&3O=+J=#yI@!Cx4S98I1 zK}ubaXZ@g1cY}s4cWc>T9)HAo{Lm2N_XP-@_YJKXaDry>NW~9W-p#W$ zH{=<=z3ISCH_mj^l3^G>=5l_;xCClW4~XxZg)`&LfF@OJ5#QGrf^*f1(Z!T{l@;R+ zdMj;AVjlKgtQbDy_i1tV24X_lM!uu6XYd`J9nUNSuk46n{8;z4L+#kuM8*(UhB1O2 z!+fQ&hfvt0NLj$={KAT<#fqqLI|SlssK1FIuqFukR+fII?g6sY)v%eB 
zBp+NwaE*d{Su9v93UI=N*9wl-m#2Z_m*B<^TWwF$GKz|-5@i7>cZrQFTcWL3Hw;&a zc(al^!rYRXTJ6Xl#Tvt&6I8C3iLFk*>Tx<~)lsF=STti&n5As|etP`jh9w$6}QcH!u(NZF_ zR^-{fs?kGh#Y91wSm613tlMs3u`sjjkMwDNmVG?IpUIQ03SX)s?f-JW&wr+N?qNZY zh&XaVe*#pyu7COdXm!$5FME=)7aU7pjF6y=rO{_G?eaVs>k?|+QW5>he7pDyp-Vy9 zE~S#ye*I^@l(SBxyIk{nqp#(K8cN?P;x5gp6Yy0_kYy=_0~>ucZvbH-UN>-`(dWeP zHOeikebJlrT<%aN_dfa|SzBNz{_tM_rg@_=L>W@+S`U2S6#Kel!`1#CXcT>j4mCBx zoATyhFoC8|3r?@;a{$vlj=Ah$c*M+8gwf{_h=(KNQVodcM zGGx5y>FM0TbZ$qca(BA*-B7yg-5?7j)Zulg!~Nt*Bh0bdI(%0!dky~L?8SUT*|YhM z%KjDK(b;c-Dt09t{&-{*A@DGA@dMc#lr>Iaq&32YCHoXINyP(bk@zD!kgv>?UinNM02aL` z-}rpoMg8>FlphXf^(hcfH1d?qC@FH*tR{#=q}R;;u(W%tM??&i&F8JCgvaK1T> z^^2G3cFdpo4kN-zs0JO(iyk6W;^uk$iciqkf`uA;Y>l3Ei|;Zv2H^cYl62T4lc;pg zE7(+d^J9Z-hfa5MuSnTnP>W~0)W0~4=DohL6bCJSM^eeA1{i3K z@HW^deh_kZi{0+U=_0~f&o6rK)7k3nLT!BhVCx3~7n#=v!lh`~`3{rrCJniF5fm}{ zq!h+tni_IF(hAS!AWAa#8s-!n8iCEK_AM$#<#yt6GJ##N4VxTKTX2tyk)?2a; zxRnYeJd7};XAtY^c$b8dm6*=8`PLQ$uQiosgD`0qIo8Q1@=Kw)+SnvC<%sUmxewK# znTbb2#JjosW1ak_b9=<5qf~*b+&q~8Z)O>?uJUwzXU1zlKT|h=^g9#ZDkgIs^6qWP zF`sAR&l`<5VynBiCdV9@u$R?r@+j+`N89#VZO@%5(s z=EpXg?uOE0k(ZLW38EaL%r9}~x3pMNU9`s9im_j=RU}=ri+vRzGbujOkTr4}$WUw5{TQ~Py5+{Sb9N;mI{$Sp5 zINv0*)775A(Vn5&YWwdrQTtu=XUJ($ZA^9pW}24$hDq1h!zWU9v*zEN-)M<%)clUj zxfzXjZEWdWBR@h9cUC6wz7nD2CQYyETJ?JcWbZb)`W2G>J-J$9-AlhG#YkPo3!RY( zyhDs#yF%FxD-q(_DRjpQXjtQR^{ab3xr`ND4w~#Xql?Kj?{P5XTB}o8>oeZs5Na8l z-Rti&Ud>5p(EgRB<8JHDw%BqSi++5gakz10jCXELjlu5T*H>l& zTQh-Y$?$tO{*e*vp4n!m$57s}51Fi~wok0Kt0uyS!Jq&WcK=8CB&uHxmAnf4>~y{J z`>n0}i_#e6F*Gdeud%Fa0Z1ErM;XH53TA)C7kfv(q3l_FM`gducXZaW>{AnYDep&C zM1qgnpw;+=P>r0OMgs$5-MEEb3l&1d?>9b4NVpIp=DTqZA<;sJb_oWdd2?sLX zIYo7Nn}Rbb8gEmPWHebxTtC8_Et-wDCAh$EU`r8_nqD;Ysz>``G2X}_!Pru(t(95{ zS=Wm8RM6$?h0hz3xzCb;0}$TkT}hS=pn;24BZhu!paE4kr{epTPD|yE^k>Mq(N5u9>dxCD~hm9D~O9VVJE+R<F`cI<5KUZ*=RT2SZe3P}Z z;w^0-KDOF6163!uvP*S(D_bXat7^GOzRJUg7S(E~y4iG^pw<|u9|~%Xfm-u(F~6%x z4fLRr)RNy<1*v*Q#bRc3gfHNu0Bc0W+z|aaP@|?&3Xd&UUlfWpBD9z?M9kL?DO(rW zLdq60*+79hYvsjo^rmMD3MoOWS?-|lb-V>Z*IOX+-gt<-HLc|3LoKNW-Lq2-p(1n- 
zUxLeSr5R$Ik;fBN)}biAfUM0?gHM4{y9y|L2rFAJ&@U2L_6Gv_03#$?M_tJudRY1S zLe#_S1@;Nhry=|Y5jVbCPks;BK2Wrw>S0m1s)sd1t4;^aVWaY@xAyioh81po-|ZES z)PXdPSkQtR!yGoLVRQxGx<->^65HYnKX2cBXdE7V< zH%Ma)@$u)B=uflA&)K3r-%$xQD+T)_0j@D-Pc^Fw6_E5xlPtPY-LXfF!7_@*L`}hp z3Yn-aiVB&i&`?y&M7^n~n2CxFMOB$7*~v6inW(CvD5^tQA1R9JP*#l}rP*m}`lM z?$Pj?uN}k2^gM$(Zz?Kgkji|aC*&HXTt+4kGP(FtQ)MzT<0ru(=BcJB+3i|oK^EI( zw8LKrVy($$zUV>xu^_l6KMggyt4;=M`kwhf=#kVC(76(LQ?6VCzN$0y)vDoEKdw~kmwlURR)wHPy$so1j_Fn zqZumnj>)2aHKq7O$yCsq5g2x#|o8Y!<-6)Swk}D z%-B;c$R%|On>r~9!b#M7I2)==RFxkEz#0Sakouv1)S(3w;I7^+ORU&UU2)m7Jq9&dAqg+^G9msS9 z(;ZVv(j7}|mo4O4g0+?0ZhklV?9A~!UxhCAU<6>;CJ8vWX59StRJM;!*> z{+?7~76Dd4o&5H#f`#E2+n+OOb(-yY^V3aR@O|TDGn3Dxzb-DF)Tu5wpB;qVTqft) zB{@SLK3zlnd_#G7To#BVe4#&6=EGx1@;Gx4vOc;h$mADDRCkB3iL6jiC8 zm{hi$TNNc)U*qLbz70<+@CoP4P|Ru@i_T3=?Nr06wUM7E(RaDFaFezDt8hEYVtohc z1MXjNDAmZWq1P@O(KT+o2bDjlLThU3 zOfzrMHwM{{UZ=nWtuo^hR|xHW4txK=(ak*T=w|xO%3}Z(@S8o{Out!q3{d=+n&~$y zj{%DRQZs3g>1EooaJZQ?ei#FYnmeMIz7bgaUm{+UW^kSn6u2GXn2WU4D*^8@QFbc3 zT3=j4nGW-axv}^pGYIIjzR&rh}eR_9xU2?WNGvloiE@B>I^ZebI`dP>b%cq8#ZM zci587XWULJa;6oz#D^n&=YZiW|n!lY6+7wNND)5BgM?6BvBm3pQ~o!Oqy>_P+k zXvuHje`k{oEv3%KA|tghBH3{LRbyRjtWb3@VwXggte>QH;5No8j6O}>Al0l?Av58)C&6=qTCb?tV?abj z$LG2{Q7ud_vql*bLyszcnDbV57#VEvrpB>Uk<<2VT*=4Y{}Pk*rG3W*yT~U zj_WAMtgvVKz2~Bi(kq*sezr z0_ACAeo$mj7dltX!}gUOjs^w#rPCu{k0xJ3ugrKA(C8?7Wnz3Zp`t#)^4Ij~<~Yew zq*_$HZ#p%E@~nCp{Yc+=@V}B#89|R1en6*AdNMg9udVq5hP1)IE0B?=TA?L+=iKZA(ibqdTPtw@oD`r1pj%YB2s_T>^$L@s0DvZr>dp?vJi zwLDzL$DY!IE5XNJ#11X8kdJ-QK3wQZ0K{%Ca>E;fgWelBiC#jle2>s8v>w`1wN^yZ zX2qDLs((!b+J7LF*s;y*b;j>CKi9D3K@Zj!W52}?XcqXAKXZU~XVS-Fz#1Ghh=HDT zfV1GP&h{o_i(R%;8vDL8yOrS~b86Qqe#qEP7_*1J-=T?rS8O}@lkk!#4^fkvGrp*K zkxT37woEZ}!kd-?X<-0m%$Xbt~6&{1x~!6bkH`}^&5?uATlQ~K-d97G_nVgN~f zg$M^%uEt-SZQvWqCisrZa!SzHIY5fbv6~QXhSX8MRl`_$(}Bn6Q4AJf3OK4*2=G&& zOfLkW1}DX-!vW~E2{>aoV2KGhaX8?j0VlpUyI8AKkYb3?G?Av8jHmboU-^cj$I3(0 z)L!|H%ll@%C*O&A9|jKiOg2)Y=*D+yJ`P1UzBBSZrbK-2wYQ?`!Kmc*j9ATofD@O5 
zkA5v2{JZTKaC^i-zrJnHiuQ8Yc`xXaZa$=;EY(0TiVnk+&^ZH!<$^)6aFXX%6U!wk zyE5&&PylY*5G9_0JxPNtBoLGD)%*+0#bDY>?{+GmcLR^5yy^FNBNUk{JEWcW~>#o;{f6}c$VM%22;qDe&ZYH1OdO2!bl8b z1mMLbes_h%&F-Z%Q_y0+{Fx#2c#$9NqflZ|caD1w|MmS=8`g%CB2^py=yS4n2)MF_hzQ~Nw;}xvT~1(XRd*x{DG#Ij3%U$ z?%}4;jCW(XA}dn4XA@ofg9MkizBxcKp+w+)(PPJ(Hri=^o;2$d-gVp&(iAH(mlF_z zpyg#_p^~iJEenjvDQ`NqC+Im{?_61kbUNOQu@)0RtQF-rHP~xdbzJY_eHg5b${Vb~Pl)P6ra3^)H<yKMtZ9S-}zD#pJQ<1M&$A3bOCY!Y6 zMx*}Da=*RpHgnrpaWkA4(>HNRi02+Jq8}0AK;uA0w6M76`*$47Ux5|I3(J-nu^+rA ziM8U|F>{tZCQIx2_n7%T|LzdhqU)EQujK)w>2L*RT12B5p&n@20P6ZcvGO2kr!GwZ z_g$~)Lmya-6>x2ghad0OcH)+ZdgX2BSO^fJ$kAxK(=ATNRb+hw7?Hsj3ui+w|fB$w$J9a zgr{+;TFftF%Wg-g$T$jL?>k|0-*ChmeBMwkJNL#`IIw6dq zr?KCt4baG1<0CGlxKJUkMsYQTxEjR``O4%gGjYa7!d#b7tH9czkRbqGM(TsY&ZdA6 zuE`vhn-c)-B6EU#?fC(r69?lccuoMMB3!67KOl6O9~6Ss2S7033>#yZ>k`VDG^N?O z!kg{qdi8(9If+f&mv9=ytRo!L_=e5sg;ylvPb@ntncJK4>c3GGc&r1#32B{XbCX7X zh)tsLiNNCtCE^}9MN>F1=&9cb;kuKUSTlOYx?bc!JO!)Hrk}ppCq{<_@|dQgbH^ zV?fdPM|#WTEgOohmABUSvVuoV=ID^0)By97Q#;PWzzS-qkKN&VQ!sm&it_RzV+?}U zh*nkIqNY@}s^r)&NQ2BQ66-Mb2frryD9y&Izm-3F3t2~-+3DMnD7n8yrOa@Lz=eKo z=A%g;zC~%n>LqKHjC1-fq%uaHW5m9qEHe=@g*1N}I4VDi)u-&A5bE5S&UI%hd$8?c zuiDFAwQzhc!ff-8&q>gTi@mCUEaEV0i$B{qLiZD5Fi_w-I{R(4{>A}Tpb~Y>4Tfz$ zTN_Gx!;v`1zQ$}D3n|#i(ihh##lpjKUOXJlX3fOjs93hSWIlg57VvB;3A;GIsg%3< zsCc%zg#FaSUtyq+h&MZ73M86cpNT!+RNQP?3&5wD%<02MhjJX)D~M++-O!FTu~GG& zrY&o_Q_(kSvl_W!vx3it%0?2~6?~1^u`YbVG+0~Kh25sX+Ou-znZ!5t;Jl$niO-ka zOXR#4<-T~I+-zKNH*3=xxytm!K)bwvsL6v;1z5RB76u=ht(p! 
zxX;smBElDUV^jo28zzM{Mio}c6fTwbO0%(BXdq&@P83QR!k3sXI4=xzw(taBW*}Z} zhB`eAv9Lp(U;wg#H62RbtjHA8p$k>PMFtw`9Af}awHw(%L=b2II57;2a;LCG{0#Qx zk6Ot+A`LS*NV{p(c7-Mk80u)Sa1!B*3=*Q|qFn|F8`av2 z3=$?xWv0BSoK%Lo8yrj*>VhRXrHVU=U`CF!(Xq4;aB#{z0d$!WJUpv7vW3H zD4dtC4zeP;tbGp@t8c>wEf~I)e@a^4Up|02zd4ij-;QUiu#=0Lqo;GeaEJ5n1(fBY zOqWLkxHl!uMRGH^w+(7mtpmZF6WB6L(PXv{oeE@I_y!H-D$dTrjoCh5Znfmg;f-lt z?dev;VkcQ`MP_hs;taN6|6h6C_$yZ1z0^!a+`Mf$SZ+mDtrqo;tXk{;toMI9{hx0C zXP^Jm@Bb7rr^z5LlAm(_r`rFi^M7XgKePRx1?FeQa$2{VzqS0W=dY8$ZvOW1#~G_v ztoAu1WJ@k%q+CDO^2NDKq3CyD_Zn?EwZCFCkWz%X#*YiBBhj(jl4EUBW5YjIa*LwA zo&T&sz0bffa*%frMvFpd7no^&??04EvqihXw3pKTsJe{1N~;X~XZGWPOm2HRw->3# z7N?!Nw#xolq7TPPZ_5OZB>qFiAXUS**t(|+;`F?Lt7=f{C>b)A^9mfM?A-6!RI<}* z`xEB^T<@wAxNtGu`o%9el31v)!hRNR>nzJYryS8s^$ag`8vg4vbs}eK^MAsPAzq2lFSoLp z->~dbP(>QwHBtZav}=p}+t#iP6*KixCCD;PNQDFD2WxftFs2TCy9{pW;!l_)yUIxR ztF$rVoFbza{iVj0j-7LY5q-=rDZw-U1S9#VFA?>uKOwBFoR3sJ-4ks2k|k03k|#v_ zzH77rHL}UKe+aThZ%3dCU)IPym!Gg^)&)`liE0f08oebE-)ybCXm?%-^EjoWVqwX= z&v(IG36%&d&R)qkl%39ZRQ6Q9qqE~w$R$Pebt6V+RG^F_zI8?m^J5cgHii*jWu>%N znNV!=NLytTSqi<@gkqRSXw6XQQWJ_z9-%Pat;Vb`BavDo=74aOg)oXDjH2LLO_*uZ zid{xiL+yx^kzE7&n#IO$&-G?%WhuLby(VQ7Pj^xDuklmr*;ZpMR}wcEzPI)f0>vvA2IQob}s2Dlv zHbTnGFs#aePkm4AL^I1-7IDNgMarbxLe49DO-bXCAe}iKS-R`hZg6v*ZslR_9pS3v zUhXvP444}xE@9F;#z1p|?_w19la(;oBAp7w6|&meeb*>mq09nPK~LB}fs5Q!l5X7w z;=$Z8Aul)V?vG$dIpcvJ?t*kzk4z`tL;`BUuU9&8B@sl__-`7SP9gb|dgqFTZS2%E zMSH_)JYzT^5!)3p)hx{ynv~rN@Q?NV=%|$92_hGnls_>k_bC8=3Ya4*2~05qc`-V* zB)|{}Uy`nBZll@g+K5nQ)sydpB54l;bt_1p3L9VD>!O;6zWJLU=1`P}Z|j!~oS8P6!d?~rE{aAA)E{0>p; z?H`FPr%ZK(;mv;{-rTU}k;~N`_SJ7F!0!OV9$nw70KWqiY~S>r0{n&>{&EHkd%od@ z$@~di`+tTI>X`PoS@HW?(2u&!ZTZu(TuC4Vg#4L9lKyIq{-DR<-on{hn%HN%InM0j zmKx{>|1db{U5eBdU6O8W0K>Txpyk)>4{tMvXC)`yj#QBuAne7auqCz*8hbiYoH3GT zu4eyemS^-3V

    M!;R@WBey;;rsXgYH|5@p8BtZlEc!vCq~er_Op?|s*vWmM*`~FV z8=}uZ&puICDz3@oo<({+9*%BOy7F14646m;b-MDoG-oj@pG{XH1m4Tl<*lzVx7JWO zdK6M1MgD4a?sg~efo40l{*dhG28HRsNW?hpQu#Hs5xMNDTjkdTbrxdZ}(1wsJQ#5UgfMz+x)Jhl2no-s!eZ5wp*akDo z_~pWmMo7Ynjhn0XR6nSds#B}#9L9%(!a}Axohl3`wO>zkFVQ%N4x74cSF~|*{!|2$ z`uQF3pc0jdE>Z%CjMJVcQ*6HraMA_80BAQranM81Yoh!4h;##>R>4#X0LE#@9#nuP zLjrJH@bh*eg0MfX{;E9Aq>#X)&5(qJf8-yU+rB(0l zyZe8LN-r)Ir52*D>$?SA=&-`!dt-DWXT*Nt$7Im0q2gEBdvX`>Msv*4oEjCj8O;Mn zM%R)c(fUrnJC7M~zd7>)63#+2Rk8COcsy3HvX@{3i@YTPscC1*(6S+s+s*mW+>y%1 zy50{`pYqnX2XcGJRMPqycXPir3TJ8HF)#E36bzaUYrI8V**`PLEg<-~0WFknM)AY- zDE={0U`$pCC?P0hyR!xWb;#XfgOw~rJ{K!J1cyj3<$_}jJ~>TlZnIua-DH`o#fqt| z<;%?(eSv`$v@~1xkMrsXuq}To)G(s6%~i#-eIB*cXJ8w+saOo`Y;GzxzeA@&|CzRf zb@PwcH+#H)>~p?dqVq61z56K2gi&-#QPT@`r zJ9$cV{_JRl$=;Gb@0qkG%WNL$f3?;o1ED3j&xg@6;w_beRz96IlLzTsxMa6#ui z%k$4HbZ}i92tUB(1f0g!)Uv~W(tP3s8SeT&nZ0$r4ta9GQj+{^Yy+lK-%baPkc=}i zs!ZAM<(|U^D%Mr|(xZNVY+=?|qnO*u6T04d8Q)-b9$%y}d_&n< zzMQ$`J34Di93J|UW^U*aUxd4HAh z^B`c}Ut|0=#!n5cm9Mc<32^P7GTynIG&vW8+6#*_-ql9!#a>17cQcDZ#>zarf^>Q} zko8V+QBIDXYi!KW8sIPc9r-yK0=G{$l>L*!*ht}ynf!&Fn!>pwiSU{`RFPHJYpYUE zZA4Br=^Ym?Pb5FTRauTR6L^`~{VN1|W#^WOFH{T*XE_Ny#eHS~G zJA79tJKsD*a*lb1q+HzU()<=+ONp3XpHoj6CYsFzG%_!x4Rgo?JE`!J`jX?EUvuj$Uuf24$GA%JXiOfkEPj6|uCsYams zLkUwOv-Gtuc}DJw^tFH7BlqR{+E+=O8oAO4Rj)PMzD4@lOW#m9mT8ogGW&-qX1kD) zG&M5S$W(u5wh^=RwJ*^|>}wiiZo%xEgm6FO&EvWUaZNB+eJ2ruN!q#ZAqab-hMCWkqrsi}Pvp1-X5;^gw;29oGm2fVl20t{wYST+AJNJt} z(-wtY$CIi{ZolI-U4=R2CS8Bl^*Xj}wWqbdZfBQ?(oKd{ooNd?LaGTfv9q!4IxNw) zR%<&iIKFj_ZwOQumNgGarYhYtcX`ORBD!00+|$;$ z*SWe6dFWhjj6Pzu>6IXoRcO=}8F8C3%%c!^k_n4UEikR8esvO~83QB+>^LlXOm(P- z25@Q#wUK1Zm_+FpBO(lftjKliF*V{P+i^S`5IINYMi;CVxKfok*1%}M-Ut|%n_}kq zc18uGt9jIb)EMfMVu^3bw7}RKy{MfLG`wAZw?Jqq>7 zGuMgS<0xZ~BoU#u1=a{{d{$alHL0toAsv+nSDQJwXQ_xDg-F|&#B#fi*+XhWAyZvD z`3;GMcq2xhrf-zsX(~3Nw>jUC=xt81u{oOm4Mr+5zu1@@Wxo-{lP!++wcax%kd`jM z8B$#puqWe91?zWt(@`&%rJ;<^ySXiH1$&HMmXkXxMxSj%Hh2{%DHfb zRE$tt;ay+=qPD`l*Z@Rr1#F|eK@Ol}B8HTKOa>NoPaCw-yt3*dc7FD3b>`*o;hLwO 
zMCW2oTOb$`1fBAbzJ(He5L9Q1E(+oyio03l2$%8ZY#foP2G7{4;+}P%q8d!4E?Ot| zY}3li@0WX?X(h@j+zSe=L^*|bGM~tj4HV6h6JTz{HB%LnS+C8}6UyI=i;kk7w&U`< zi8efer-INcyK&GrG~pRs^b8Pt1f1-uzvM8sW9l$fyH0y%p$@a&khfH+=p3B%4sN;! zH~oW~4#I8Dqo|8;o9U`f0%*RQWtFwbYJW+xmg9ao@;$#1rIM3GZNa~Ak?Wif;^^evyTLdod8 zpF&r=O~6kL@tekQ+20ZAn`X!kuYNZU$lEM(GwvO{?^)wsMH>7Dqp=LDs>siV$!ie$ zLH=qHyM`h1x*$(pF{&iejMvfR6|vpbt!b6)hZ61;A&Bt50G%Vl-~ANeSguGiPd{ z={9+wyXJSXDVwrhWJTc+n(nS1XZnS5FQxGps27&`ME3|d_ccWKwmXDuzFy;KqU&qn z@fka1v-^In)&1>*k(xDqm`Svm|>% zAr*$5+|Ra)?u5V^5tsDZcAX|H0&6-%T6ja$jD*Xmltmkf-mYk4FCY?0K(_*n(SY5M zrv+4*u*N7aGFffYsVQDVfUA{3#K9*tMl=V-iOiA!*gxUsOS={snzPuHcv$I|7{W8# zlqk9*sqYZYFujUWT*r$29>tcxr(>o&A@hcy`hAJfIf$&ThU|#5R?`h&({$Kdn(h-L z)A}nRNSunBto^{vtLb+C?zACA=OYDexHf27kiBWpn??*wYtk7nbTE_KmZ^L?-TDCr z93NfWYvfOn~xQpRQ{GB>O; zvC1L*XJ$HIHlVC?3&6NF(FZnBpaDjt4~#<_WSVY(B@B=3SB_IDq?v9eHg-v*StejI z;csig#6Sg30_toliP1{%FJR?YcozmJ* z&Zyg6e$$hD5-3|S6FEf2+jH)6Rm;nGY zfpn>99Rq;d41hWI@Prn|?_dVNd)!}rzc$%%ahNx3VWaw4nB3-{PDUcxzzeoR&&DE^ zEhmacYhA9mtH7DcJr7pCpUgd(;W1fWnMJ0#9z6aP@89yUEg5vPXcB?vK9H15&+y}N zbf%x*q33NM%D-s%DR+yqX?a@jUPw2>3${qH7W1HO51i`vGLR#(%BCpk#UhZRmE1fp z;I}C#@RBqdoym@GbzB@uAjhIEI(ET3ckhr!JT4;Rhm^GPr9|#sJa15ouKg$FXuzAv zz{|W?n~cA1O@9}mZ*S$EMD9JdVTaoH^I+|pJXpIAj1eh*rwJjIpBumENU=LU9K z6TdFP?Xw`Q0t8PyW(ELXmy@o9-2g5gn+zt7?cTaU{N zlirNM*8PFXPcg`GyyR8Ibk`a#;dTO@vW;JBUbg*Ig2Q!sqn3AVG18HDef~jP^O)_Y zdd&9B+^0@6Tb*;Z=^5Ml5ii)fbQl&u8Pj5fi!u~l@A%Kyo&g{Ae_pWNR;Q8Sd0I*D z_|e{!AI%7Uqyphbw|ww3pr4@^Y!CgP7i`63iNFZQ{*5Ox%*(ZU##VGj9F{-THW~s0 z(MZGM@cf`}rKAT@*D2l5!?n!WdIfM$g|$zL8J)%xvm@#Ty6({c_%!K$b3L-k(6lzw zmo(;`zy?@z9{gxCG=VucMlV_(Qj*yu#hzruYqgDodZbon@w$4gSIU~5@A42V@6+-w zEy|vQQb2LUg?o$(&vHVl>c|k-@8o)N2c70GylaErRori@XJt8#rusC0A?F&w&Apwh zgow8Jrww&kew%rIF?Se42()N=&(VLf8UVmvjz9O2QVAtiF}e8sqG zj7!DOG_FzH=7gsWkIv|J0^wGM+JBgm#1$a;JQrn^Yn$;9wbS-IMe)?!W|(wt=mHR zl-v^I0?*pNGagpNM6z#3fn zKpS&Nm%^>od6i;qK~x&!EjI(cni3N*Zk%!@)mG= zuJo>{{O!Cc!i^t2Z2U?8jkUR};S;Z}{hkWRe$RHirP=|%@P&%{h|krp%(brO%;%zm 
z?3EXN!p<^MHjiEN2~G2WYJPoV!=`zZCd0v&t4ef9Po(84y_hLK348~S-?naHvlXdb ztgJ1IeGn~+eGn~+{X2!TON&fBvrF?8&&r@tHP0^ftKNRD1f7rpI+q6k)?}}jmc_w0 z!>jsBsPIX=AyRuq?u%T@gQ2cB?OfG^NlyEU_$_snbn*XNQqjew6O?q#|0-z$GYRhv z@PqN&K01|q$y)Ik>9tabp31^U#3|GKS-Sb4UW`o#2K6|I-i`HYA7rR+nDp_GZ1bO# zMJl$R7`bM(2IYHyvI%8eC!e-uu64~z5)e;5?Mb?0QNN_Zi#|k16FG*z8Djje#@ISR zg4g(7vsb5AC!e;LkvnA49z;qWyPmX7*Gp7mGQ|80&nUx-I&VAqv?s`LK;!)(>7&;p zd+?JxXFg<_#r;&&Nwo`|u5Z)nAdcA!T|NKwa@C^#bfM(r)B34$4`TJnrya!gf?s)| z@Z{4DEA(@}Nq&um%?8Pan@9jn{6t?`L&a;ZS3tko+;V-s{eB~fyZWexwp^`Rm6I}b zH6Pb_t7&!}ouH5=LJBQvp9n_DX4R~Bx?`8=)!04gTdM!bp2GW=f@8oLA}=-)FWmE}Ca7hQ=7pGt^VoK$vPmEFq3} zsd#_aWO=WAA8MLkUs)o#%H|=n#_C{n5i?2a2+h_%tq(osH~$7?KY+gUpe_r^jirE7 z9titBeyo+=Uy`nQck83(L0zo`KvD{Bj7LdeqjBKmANTZAImOcdmAXX*CA_5@y*T5Ac!SmsuSOKPQ}&Nz z8Tfoho$y^Pk$N|`;&}w;{g(ZZ>78l3Z0UGzhG&vqC_;w*aS?*m+6tPmN?nR9H9C>9 z*C0#$9Et9;g%@?Xw&;jenScC+_lx|lW>0Lj-KDk+J*iuJE40q8I_gPX)c;9)=t*58 z8!WuxrTd8~OVUB}psxS47q?mIRldUWU|4n(Ue0AKNt%CW7gzq3U4?zkgSwNkIAKrt z9uo$U?Zpm%u%(y4Z`EJIXkvfHlTjXwD|hzz59-2+l@=b<#liF{j#pY)-XLwCoAU z)nfT4bs34*xZ~*!1X{5e`V6AJPuOn4^qSu|jiGrxH;Ob~&)j~_RsH6fTl4yD%{UVC zsO}LS2dEv#l$L={m0r&c=@H&{0Mn~?1I(kkdN?mO4mghk=Lz71O%9jq+3I*UfqGJ! 
zc|F%Ws{0zLUL#dZnQQbuU#&8;yqPERYQ{+!8WGSI^Q!J%E*7J#rjw14k$G-+{;8bS zZ)CJTA|iXL4WOiXI+tg30fiQ~p^FldCu5Oh&$0E2!JJde$@04Tggp3&sgG0>ll33f zRnHK^tGYmI@vYgYL({4KK>R%4LoKJ8XLakW_DXR|yn6w@^yY0y{JE}43?0+(=)~`2 z+NEc8!Pai!oQ^O=j(S#?z&}&qpCvhQK67bS6v8~KyFg@A^PMTY*jVAfCQ3}6e_5B> z)kFySkPsbWo0NzjT=qAj3k{@h3}GkZ(+}nsZ$B~Y=Bxn*#lRIkQTWmC>K?Y*z9~Fp z=t`{iUzt^Bm~NH0@c(KUr+`G0MVHp9ezIL<=c2l52fjwcs7^97L#McD!m$6ms9tjp zeAoROf=k=f9B8Ej1+ud_nz)*Gv{kC`1mipr#r9kp6)12W5uL0gY`bP-qRKJ z^xxAp`w!~wr+;`V7yUw~@51`%Uf)%mz0o|V%Nt5Os9S3u)crG5A!k4u%lsF0 zK|@{?YxtZ!Ky8eu{F}N2KMTI^Cb;mXZjD~Mtqp8O78Twvg zstRxCnn!iRyadnn*XdWGgp~n%JdeQ3`H4wRhJl34m72K%Q;5At1B@NbysHd zu5Pwm^LKu^rs2y>F}J*0lXJwwx_kd0^4Q=B; z(OQj2mH4MdOEpqjp6~BE_e_F+cK6xm+0XO7&zsLDbD#U1>s;sWea?0MT-Of|f|lbh$r4dm|u)`iDk0*!yx9m66dQSKO)L%UtyJ+x5)>n3t*paO#VQK%9Itcw`| zs9u8OUZ`4&>WK-?8h%!>_;rlO*2Ujy7XJ*k?kW1}9k%ZNKKprhTtEW@yyeO6W6Z@?|drky9vZe6fbD+eyXoMPo`X z1iqC+)*TJ3dH_wo7^~)9f+gGOJJdQzADzCpM%`2-H_75y+#%~Sd2z6aDCIuq^c@q3 zfoM3&_j*9LxXW#TlHk`Yyvdq?v(ugcf)ZYR2ol)Cg>?}hZCnxscAYEwv?s1T6YrCK z>3Gr?sxIArC(+gfJQz?86TZO*Fb1pZFQrENsUH4Aj7)rQ4j%ICF<4!?1eR(YtZu8p z>e>-v@`qyTZfBbQ08HIej?I3_ML5C8D%Phw z>VfJ~l*4?z8M6THTNbLWv^G(g|k}LQaMlQLuTSCn932HX0-~T-Bu{*9qR3Ef+{SCYBC_)vZ=5-%$LB-Rh^0C z7yNC$B5}NL6QMs7$sl)&73ncP3?k)cA{l^hfg-)E??dym1=Ur(;H>nKWTD~ePSu<_ z!{+B1`k0xVg9~Pq^P8AaqnUO_J^lL`m={i^2o6|x-VA+a&y%0$U_Eb!&E*bQcSajb z-eI63Msh|GQKFOeHSZuFgAhk&XhsHE7eDT08Bj1OY4Dg#N=+|oR3a|v;s>@aeyUx( z8fG*T)dSZhdZvCGW&p1HLZGKapR)tk-Jb#1WlX4^q%T>gevLAP9h&K8=FjP?;8SJ+ zV&A(_jfiT7sAfc2tFHq<~u}_GIqa?z2kU`hIg}9oG*Pv%1 zblpVS@AfcT-r~y~kFGnJ&wGN!^XgMB_RZ-$66jCe1pT*rQ=Sgia6jW z1KE!;#1_;`iO(l)kwID0F@ZKau^WR0EfWRs#*q$ca6>x)~HU{ zuo_amu5JzU;kN9GXpDu8Em&D`i74iKoP$G}3^YZz<;l_tstv_`go;}0R0c@@vHa|* zOBY9LOL<^JzlD8g>=I`;L!TfOTMx!KWc0?&Gfpz0t|JSg`+_I$38&w@Ec*;YbW!TYzrXFt47fxu8+iFltkSdK^e~quM9a(m9TkC1;5?R@s6D zZMahmk<#*`nzTmg7I8^v#lq1AO`Sd}snILL&~YT=M77sE7~R)%h}K~kx|P6~fUaph zH&E)H+`GP^ie6J6C267eAHvd=NbG^>78PWtieqpJOc(TZhoyTY zlrY<5@5JP1`9;CmA?Z%`z_oVrZT})*IJ(t9T>)~d 
zafty((6ZGsSRTYvA$}jC2h`kI&4A?!6}K91Kylwd&@*Z%J;YlW?Sj#-U{sI^Dkg?7 zZVlI#Wm>4%OTwUf^Wo)XXdI5NG`QiK%^s@ZD3gnXx*cy$Jl zuF`Bng}F0ZVaYs)qI<@8;SS3!9*S<*Rw+}F&juI)5zRt5wosetNp{FbjC*CrLZHhe zjir+8N;7kTlkAEOK38+Wr-%=VniJ~)&rLHI*rz2i!uzz2pRO?mi&T8T9f%-qzlu@# ziV=GEKEuXk!ntCEwZsitA%p@Yt32e~hXbXIKICAsFtm;*He2O%a>Vd>kXm?`&ke)v zM87Gh(-a05eN=LS(7lI@os%wh6NLbD|GF-nsMbw1Y+Zv=@_@|``l?RXtiYU*WX1I}yY97CW&_#L2Fmy*LTA=4vE3`vf6tzt6fX)&Cjh z@2Mcw+cn)Bl0Z})1r3G|ae^5i?ERYu2O}9Kv(1BB z&>4?8;MhKwyg{7uNQVGT_@OzvSrx2Q%Dmldx*DkA*xT&=N~PT8>UvYxDDi@>)w(BW zZ-LOYrUDaobl9$fA!j(o0{c@{&n>={0|g2PdhVNnp6_$Z>Vwd8k5je%W$<&= z86xS~=zw%=^g96ETRs>-_lr<+IDBS|n?0%Deh`4}-;sSFa_I!!&Xt>7$UB&e{Q8q$ z;1_E={5lZbk73&U(+DM82X!F2R}q`tb{~)kvN$YF(lHN%(Oo#s!*FvKkMl6#+@<3@ zFfdG4tIhh*yk^HK`8E01Ar}1+{?-ZHee()bQbB|*p`p~hK=McyCM1-)+X(pLeUxYG^h5hYCdQ&}sV?w4jyE8V96{_USLi=}Si;#qPmx zgk;HqJ4Xe-sF0vD-!!BXRj+@LhIHWD+ai;Hje>M->J8S|gYhbs+TRAp^pCaE|IFN0 z0+jByNH-d=Pk~8_QkG%q#;cx}eim4|gS4+8b!+qf80g*tNmfakSzAidi#;W2jI;)} zKL_0H?}590R`V`aT6Og89iGp32j3i>GvuI8tut=H?Ap^i^9E>FJJjp5tJ{uav`-y9 zCdi!0b;9kVLc85b0*4SF9yLtcgaCEf4&pldS+Gn5;pH42pl(Ml1S~v3+?7ssM=k`; z5uU&h*I=sD^G;@YTe27Kzfuw}ViC3J3I6vPuT~FGmu+kBz1$_z8mu%x-G*_1x(yjX z-GQ?i5M#1_A^lB{(wn$6%fD~%?FvFXgHD7lIAD550OmLKA__+8a zm-}n0-dNV6mDMis9OXF4bgrqg>Tg5Na+!(&(2NDcJQS~WyPJ)-c|RiJj%6n+-SSLo z$gBcD9ii0#DMQ*b`u)5vkYabMPGDw8A^=gBd!cVQ^=n-f>6IN-)+HsVUUM3_{6D3B zz2;<-v^v55~zivX|JLGh* zshGE@mvMU7_#9MC!MRVnAceVLK~qys$r_IeJl?{?#lZtXzUdV_$}&*|wtEl{$HVlp zoh3>d!_R7`{TSf1!{@MTTyAh42NYvz;_)S|v)F-ZrqfgsVJ9;^JN}QJU>YWXrtd)vOA`>T?*AX1$Ej zif zFZ(B$02xHLl_GZ$!(M_(UBme}MQ-+OJVoyMK+mTm)AB$?RuIi?Dl1jn&8mDDHWowjK{~s|kepCDe(^5c zM21Y`MY<-9X23ya9N@^9;z8YRhD-HkorG1a$8Ur#)L3z+h8ceGExQw6hhF=A`IdcO zdgT5!N>Qd_8UbIStMaQ2(%a?z3wh_SlXw0)!Ba}H1R2MmZbt(>Zvl0isn_9e?U> zf@C%I3)epwzbZ0#rsS#C;7ltraHb#RjH|F@6(d z1%R#NOZlCctl@W3@&r=ADw#%DYXs`x$i`>rK?mQ;NY!PbPSwEFiDELeHkPT|jKPvx z*YbLqwSz-sCI%tuepHKMWW8nYOq=i7{wMwomw#vX<#%JSY>fci3$YK zY$6+ssU(9j7|#s;%uaLA8#b7OL;mIF0NYkG%`BO~`=t0AO!I#61#A?i zWrkv^(SW^HnPc%_pZnVpV`k3Q#J* 
zr;|v@4(34QCMAJ$1NoH2F;yQR8$g&6C^XX`gS|d1Jk63d_N)!Y^sgB|hHGuqSY`8v z^@j`|Ho_!RhIOV4tgWFzN2?wQtbXZr*B{=tMrU~Vg*oUCe`^ltJ(@mamjh zeWd7f3u&5G(#Yiyt*t){@Y*;I(c14|u-(ob10$aSKZv>pfoZrX z&@?~7)4P%9!-*WrjdTB#KSSz865;%{nX7&^LERc#j~AL03GQL{DU>6MEj)*>eEHvH zEp7BMRv z^*)-!ezd36)GTepMgkgNApitNzFI@sde7sq z>2Wsa$&JwiWkq*>n^^8{_9uC7dP%Q>t898wKfly#o5PYMC%ag}Y$xg0R9Tn@;1}h7 z0!*(~wgC*`@|V<^t`aob&Y1@&Nx^pF$9Tj~pF*F6tSv|~jYz?6bbB6$Lz9%y@TS7H zo8IhS(J2Ao;Pj;cnrzc}tpX$oSG$gVHBfq=o!p?EEzHRqSww_rT`(ty!_0Y<{Y?I^ z56`4=ZEz>n>!rdkq-$6FDcaT&FT_fPk^RH*O`NJ^5v)PX}w)y&EBGt z1kW$g?;O+EoLy$Ur3*|Q+bRDB2Z8_~BS7{1Sv{7O1s9AAGcd|R!vjDr4{Q!=zgWjR zHB^(sKQ3Yo`b%Ahl#jzdCY^GUcObuSC?hg*!7J@yh?frF(cT@tvB{~13H-d+EK$?B z=%zo*dnW$N7OoEJ_z6uo2q0u8+{L%PBlBe*{_)F3Wi9$kM@koR(8y2GoZDk(vaAYp z*ZN)I`?R#-7;m1gaU^tkL$_lQ%!_p7mpA+dmGGrCX1o7rqR>b4>y?6sf9&=2e-Hjx zD6?j9eO%2f(|6$BcPJS3Xe$}r%}Qpvw3(Vw_#Ux~LvXt{r%9sT|rhHx89=<_-5t`oG^n{WyrL?seF$sBV|6K zj2Y}>vUFH=K$$bG>vB83^%ee}GEy3=Ln1lsV~a(}$CuHS-fMi0W&=H^vI`2VFAzxu z`vV0psHM4KQSvbVhXN&UX}^Wzw~|3V^hZD_07d$zK^M>d^sXIKf zuT~$UZZPDbA{j`3LtU)2KbCr|>tzHI2^8vVcICKF=U+u&p}Nn@nMc-ab9TDW&^`ru zKF0>P>+^74uYw0>TX8k`vi73j8k_jRr4@4%U%XYk25B5G;n(<_s$!RZfc&+d|0l}_ zpbh%=!`wSkrs(Zto9uiM(x9JpXa#esgLUSz^;;QO=Z`2GH;>O3lykkjk%h6*l?^I} zQ@r|c%q|cUWlGS*u777(c{vSIPRp+q)-vr%OwN_3{1syM3;k$Ct&5xdV%>-_DHNkhm_S zuH#M>Rplif;P7ZnfWv?$jl&wYtLX?g1~bgJ_bW=P#Vo%w=n}wH(C`A~wV%e{D*(F+ zR3g)(027V~)(|Q43IyAP`VOw9n+nv(Y7#?-e$3#8?_l6JxZxgpicJt26zMLy^u`>0 zt~gb%B&NX)lS3g!oi5+&BwCOiOF2N6@UppiyM=1D9}E49Le--U#1?-tw4{JB+RF<- z!=r)pL~^4{^w~hoS9y>t-l{sTrs!iX>34Dmc*E)?DUDXKw8I-T&B zA5a0~0>_tM-hrlBuJrVHL0YZ)l=ax1ff5X&l~9lZE}a&L&K<)Zx|fhf6SzD}ms)<2cS4Uh#s=E>>Sro1V{jxcRfFYMvh1{kkkI zqWF8G|L1%aIZ{@OZi zul}k(D!T?>&F_CHn7T3V=M0XTY07Q{R4=t)vp+&Q(Nx)HjxM=@`+vwE?1$C=ypv5V z4thq_b8eJf-p%O6U6^Ysdz+QDi!96QGFe8$<3euuexU!P5Ao0Y5dSG3;-CK^{+w!W z4}MTsgreLwlMZCJk%spJ8p_(^pj(O5KTkV?G5c3foA2kb15WRFp8sDD*Z$F&2r8oO 
zbyrJ=|E#$hc*I*UIEPt2A?Zju)XCQU4=V{2(jeCj+nAm{Y;SCE`p`rbu2$*PfG3oXZ}kLr~D2*qN|HMe>t%sMe--#7qD1{zHGQeUuvEh^(69`0f7wpRYgQ*KSmfC!R=Mm|Dk)C%MB!FW% zlK*U!MAb{l>y(u5F<5b2Kq9`!U`(PoQWp*>F-Jg>`^cL|lR9fZG|Ga=UOR#e$z$^g zkP@WKpLIZ?sS8JyX;NIgn$wWnI^O{^s`8J>HZ+^v0`%0|R!!%-9jC%J{Nz(=W4rOp zRX2Kq#?waGVD#;&^RZgyR<=$*QKDK^+(~T@f8jB>c3Ury0`XS95qD9LZSy{Ljjdy* zgHTS!r>^u5iPW1kBesHGm93y>$JB4$bPdSxvCg@$6{MYX#nbSnSx|$(b=;Nj=WuiL zlDq_ZevJgzrutKFMN<)K=!GB#1gMWi2ULxz`2N&+BdN$}>ZU_PKb`+8EjkXmZ8rKJ zYEwhe)FP|lXzG^{i+jZZMx|(~2@KH4NNN!^8$U#{To2kjE5qdCEECW}QrlBEy+-1- zsb?b^&?2hV5n+O(TQsnFL+SGG8)GWb-L}WL{b>PG+0uD(e;HbyYM9!W3s#rt2LsI&Y{8pgsjml>x49M&(ZIz@O$ zjS_HVJ_t5bZsfko7xQg?NMDdEywxHl&TU!XTy-b3MubwWtmnDKt{C)cWYmQOeN_8n zsZk*#aB1Le3b1R&>sC;F)@vl(#qg+avyx+=LXLy{d^+1R)-Qn!nu4C;l%fPCH1BxG zZ2f)vq3Gn_KrBD$(8a6X;^M%z+R4ut(yMQh9oUYd$p1EL|9JZHA#N+g0D0wJh_CBK zHb!-OE+#O<8~6p|6@jKihVvpnEh2$AO3ZhFY2Bftg~|LxUCg?wcLUwOCzG&PV6n@@h=lPdoJyjv3v~6j z@De=jF|RVk{f2}KmHL9o$nwFYPw98$mGj#=Rp+{`Q_G3?j1&2~L0%it z7ffyDciz-C68fWJKB-F9L>xHgk0iy7eBvc1b96r3snYg*)aG=$vEA0D>tHjZqLkay zlKg{(o7n~q6$)%2nB45FvVRBbQ_n2BS!4Vj6oyOdd@PxQ-iZt-4Z&7mTS=w$R_+e~ zs=2gY3!6>cTRx@E_mDUh8`ok%?jQ__w&6QQyJcyyr11T$?oP%qBH^Ha5{yA>D#gQh zy(M!kL08Rm9w^a5$JFZ`$ZMaB<}ir-a5gyM=V8y}S8!g!%R6 zw>RQdC|*S-tW05L_Lg9=v&P_F%S2K}AE#H)i$kb@&|4bRGwBWd;$Mz?-ei7*Np(E> zhq@x&5BiSD{L+WjZ(1j-r?pOPgQnH>AxjAR@Z8VkVtiimN~tmReKB27f&mxw|4-p2 zmF}s~eg)Q)1-C}34lFw}ntHl6T~O|OlfYitdTHV6L8i5DjaI$1G#uf257!HTdX2a2 zq!)Nluea8vFXJlopsD94P}(X9bT2>vsY^XKoU6M8>B8^VrNF7+hE`oVg5CGWIl~I( zv%swQ&N%q1{MEX_aKCmjJ-hET8z;V80l@A%7fVY@yTIebv|FH z7!`KavAB-2Oy<59cHh}|hcPD}gjKARyJEGLjMXclo5+fe6CPfRze|)!ycYjAQ5FL{ z+lbY75Lk%8g?+MIuu@szdORV{Qss*+;+33+*^5)?3F^tKU0)*GkyaI;n&JVyx%U(1Gk z=qox*G>})1JLUzA`T55DLW?a7*s;W2ILw);;7;_L$HK8%Xv|y0{7V+Q1w-x+Y4MP! 
z7+aPremx&r=!RYLzSa~jzVd{;uib<9S9EwAyyRJ&m&rC`@O)hq2wgZN_a*z}zHFb| z*YB76hW#SHS=afN=`MY+g)N1Uf$IfZ4v6A?1#u{@mvOgTrjhp1lm?es^5YC#X2mgx zLs_`Y$N#J0GFhStF7qOc*l|F> zItQ1z&cS7_b8wmKf_!O#&^iZ~xvt#3Ik?Pq_3q8VWv*+pw~IO{-5UOT`QOBUAOE}f zw^A!_8MsU}NER;hjvukmUN1pXGODR>W#KXlr@Rj?vla4j^7Nl}6FXxOvSsk+aAqpL z&xBG?00GM^sL|9c-idVaKP$X0-S8yPN3UW^dZ4anA7ez12bj4X)D{Pr$%Xzz)h9)F9l-LHe? zr2{GKOFz%dkOH5x_{4-6obKNX9&7%`xhICL={1n(R+%!Ik*bkUCi8z&Fp*wV&VJU( zyhMuBErVY$%N6&<8CBvdwRf`H#$Je zubNHgcqN`UR^pzi^kRC7I%g!sdhEr& zM9>@VzLOmSUF%ZAFuw>!cLJM5&udksr4O@LutL9~3dt40jvOE~E4~L8?Z9_@))|t{ z;jkmr`dQ6&!PXoddhlizq#JUA1T}ot4EaZM*eMM~KU*;TNsYN5ZN!|Iv z6>N3BFUZ%!;ly`=^BtCNS(Y~Ge0BQc;cyy~=4ZHoDOQx9yIkBQ`uBeoZ0^Fx8T7Lmc65AU&+?n~K`;&uA~!vs;Ex zqT?7548$h!6D$5N5M@I7_bX=&-x^mT-&Ce=n?six?)dc>Kn-INyHA36 zW9NT9&@^+>{!N%s`+n?$p=mDDXkgdCKIvREqHc-chm4}p3>QFuDd(D)Y%+KnX`79Q zr*RjRJkU!ZVrVzbzb`1gh z6>yCSGXU7&F(MBKdN~foh5(5v9+b|d2qt#V%LDsdjPT+mE(SCjIwF*Fq39$dgsw3W zNDMt#n-4E{T=+JB$K*kJ2Grx>+mHoU0b3PtL7y_~D!@daOC|bv1vmNUs-5z11vepR zp@bl>z>kW@;$8A^1s12^QprPJK}{&S6rso~s7XePk&L_oK5GRW$`)`)@4(??3T9#w zkT@`O4XTr7+dza5QBG-(hLqZ`Q!(LuDwN?9{Rol%^B2 zlZ1&b&Y{53WaY>j%6!pt%i?Hi8)}n5aB81ym##SUCerh|#=talY^$l`{a~6J!^y;d z4oq_`xue#k3B|{|#_HSy)AT!h$t|whnF{jd33;EWn3olnrxAIW zGgzrqN0n+NX%s3?bUz2D>0O|P)Gp#>1Ssma|9T--os{=WMY{kG}h@TKnO6{R-YE`Hm9k_~rhJBeJkiFuL zgVKD3uFJyw!=N;u;H*&N?-uR!b4!3uG<@`b7bsM_2MR?O98LkTJGPBGWwN$*#;qlu z{W~o#oSrC4PfVw5E$*a;WT7;rYLHPI8waH+Igc(VD9xe|gwl+uL)S-AM?z_ayWS6_ zi3-?-hT^1^CT#;+F?LwUWInp!Z8TVA}ND4 zBRuk$AI_y0fet0Xv&ade%^rm2d=EksO)*ayOgcYw#-u@Lc6bn)Ne@D^!a-0g_gP{p9Am16f7ksGmel9U81SGLwF}Eze z(7p|A#bWUx3#7>|4Oz6y-KL+#%L3_g{dyTNnp`(Snn8P4ql@n35{f1l?vAh94pB!| zk87X4jrZ$uky44alMslG;^OP=T+91Mbhv49)6s7@F1pVb!Zg*>@W5~Oxn(GiwksXaD$DB zU{;XD$}F&DMqw5!v(T_Ii;uv{EHX* zD4A@Rf|6<5rAVveP$&`6&o+^rB?xX{dxKZcS?`uMTrfc6@P zl3AinQ#_TCF_g^fYyq*7}~S7`j-$!K<`5 z2wkb);KDs}U!&h3%LClk+0tOC76hRsT7Lv9B`?8?^P;9$QEDNOv{0BxA{BT!7A zW8iC-YrDyS>t1uGty2`@ zpfbNQM3RDTBSjKg#zwb?lzAN6AQ@S?mP-ymIn)Ycas7AIcUC#AT(8;Yq{|qY+=T^j 
z-*v&p&fE%8oafkDP%EO-v^Hp+CYu+nr`ZnMylGT@Xx?dg3Ry5sN+k} zy?uprOw$Hi$7uz_pCEBowA${|x^(OaK%0H)1BvvUa4hvSwjS7RyMe90-F2yloY@w; z`okvRZ^y3w{24y(7S{&n%1t(AI3d@t?$s4vG)1VIT)9lcBXqXJEekUQ?i;a@`Ebq-uw6J`+MLRUY^OF6!acHy9AD1!D}R5PR%*&{b~qx_q0 z4Yi{@YuT9v$P1x1#A}g4uCOYmesEJkcvC~TsUqA4fu~BcI)cBdIRlmHDyZjqZKm7{ zGOTA$ZQKBQ2fptcKz~+1Ct?BifupDc8`dc5&@wePGKlsaF+M)z4Wf_e?80FUKMdB@ zkvPL{09}$9Ko{8nn(4LNpdTAUWwH|t6RfqQ`j3Pg4s_o^UNfUEMK4l*G%Pf`k)>Hf z8_nSXLpCwcvJJ0Ms+nPihhaiGxZfr>;DBbxAT`6rpsVwP+&6!Mon|*A8BQiJl}%4G z*8Gv@n;ac-lMtv#er|9$#G_Ssaf)^N*=EguvLjy5{zQJI*!+OlUwW$Mebs1Ncmr!8A^tzH?ufYdpM^|(%BV@R%y#k?Y8Kr;Y!DALvn9N+PE}<>a z_&AhkcXV0{?2xv@sAK>#D;`Uv{*Wj|9(xsx=vOBooBbwx$?jb`iZ0o?lV`hVH%H@w=>a&mS$KLd91FooHc7O4_zhYjv%}O zAz0z8p*p>M#nhMTEMGBgS%&JwsjiCS-A-;3D6vXdHY;eU@vPY(cdhYk+eqximc9A+ z$$Oy{cK#;37h3KbAHaEOf%w#KQE?%~A6cZ9%4g?R#}%TA0w8LG4F#5D1R>a4cf)g41ijZclbVW`&lwC#6%7F&Mi zTU{-*a?Mv=Ewt=V?U`Dyw_3lz_%vE&U7D$^o?mGKJm-EOF^~AWAeQ=F1|#$QSn5x) zu6;wX<*#k@Be$=07;@~Zs*i$?xw000MrWrL{LJXek7+*=!$a5SM_0D^*-)((944rl zkmPzPF=rZsp#< zBMf3OOyog4N(dJX?u_}Kj#RxBgR_qDfXu{=J%2)p6OER>9Z9`S_Jmc~|N9f*1y*LH ziaXoq9KyQ>y3(i8mmd{L{Us28K9YJS(tmJ*EOfB5mg3lDB z6Bcj>NB@S6R+XnB6(d)DxC6Hq&f>S{Sbn2Guc-0^-90z+#(bmVXs>4VCy0n}Z|~`g z$laTRBNzQ*R=U9QzJbXqLvvqzD8l*h+f96_iaAwWl}w%@A#7K z{N`G3nvgsPw?A3M??kWw)UyKPT7kY)^k4THI}?eGoF?1HrD1ha?kc0+nykpY+Fa%7 zn9R90$)zT(q3x-P@s&AZ?WHOgL8pFt_0y-Hp-j4qSMYgnr0Ov8ts%i#OiBBzV(TM2 z{LA5=Q(N`ivNndoX=7dyye>V$r?%>)Wpf~!K1Nhq_4{R&c$JQMMXKIjc2e3uW6X!t zHONG&wlB+PoX94M+%PFY-DtE5^n8zPM6Rw@OIPBN3PzWf1;sTn&fa?|v~HDStc;;~ z4uHZrjL`|n?i!eD(_cL-`JGLMI696|tjD*W<>6E&gjvb(kNyV6(*QNC^S#dSTSJ1e z)RW9m3~q_yAr_L~Q!PZ9Ed&1qTczlGS@TK>7EUK=1l&nDTQusZF)pq}-eH<~@p%%? 
zWlEhF@g33%a0sH(M{bge44Cds`XG`L5A;VTvj!7y*L`_e|>_^i+16q!Q1yp zy#IF-#b5>GcK)dh?@7Dd!S(jLXH&HOI$Lox|wy+n_- z*!0k>5WHeMA*{4l5LWdhAUNam5_!Y`rTJn&azIiz7r|8HSOehYtMPs+6gf37~-4xXmyc8lkXSjSqr zoqi>fNu}}GU=0=9fi8q&RMDd+?~RV(0}jiy<*j)CJ`^s!%yRuZ^(*B@h2%;VWyV74 z1~V*8nRQc>`s&|12}aFgM2=F--^&49Q~Rhl)cKxG_@0*bKvWf76wxvi!~g`yy;unH zN4j41MN+%;;RMm!sx94xNO2du;=VuB`GzFgt>KMLkSzSFz9F$_t4$rKP5mX7-ul@- zNbLJiio#pp>cIENgh{&?>DS?$lu+v9Es?y`I~dMx_W7Qa$;3o_>~g*bl-2Y-1iHWa zC4C*-Yb4CXon@2MD=uJ1HpvIa&VoWDKX2qoURCZ>yitK_Sc9+bI*^4GICc2i$Gw80 zbJO$Y^2M*R-pvU;*tDOpO9NYrd&Q7F05LR^vbh$KFVkvhPYhULge(@KVeaH+ge)b5 z{fSuVj#$?l2w86+WbN>Ey^I*;&ZG|el8a;9c}#wGa5Ol3h)E-SGjlF7Q!IUc4bwxr z&C2b&m_dRk4zyevZVCDZ+e6Jm11(FZH8Zia2QS8bQurD#TGzH3!squXd_K|IgG*pB zF?_aIFx%L?Wbx+WJ~cddT)fdxy(L#~@y^D5M)*35*JknBpp|emiB^E|EE4`)_&Pjo z#)~cAmSX{-O*wp;%D^U1SP6c|d|tdN{P&(uXrsPpWoBbJ4i)bf&nNsd&nL8{#-rI% z@6r6ge5i&WIG^@VZ?kAftGCT)+JoP5J|~6$-t!5~@3h)jW3|y~wSil+u+_#U`On2o zZQ%Y7)3^WKszok0>?Yb?|@+Q3}Zw`jfb-QLg9pT(`#pfCLem*DN z@e{pN#;Oi7GHM^I;-3O{bepgq_@}K-Xs;SUn`(PhSitvOZ1R@6>ZHKezr~w&L^KG@ zVE3bKDBl13iDN)rN7DvCaIw)L^g3Kf?>}@dx}7uF{uiWoLMA4^!0)8wCn$fewPn!v zFGbMOgUVH9Z9z3m;miSAZoy2wgvu&aXA3HW^wJ$gg!&0x8t!nR)fU<@7Fs-0owik> z?V;jIF>TF+wu>`e*EqL}Goe||?ZIj|PYfp<=X~Rw7>ip#=&9ib7g}JU4PI#M%&k^E zf$kGsPtAw+jBwJ?Rv2v(T8bT=S)_s|XO=62$x7aqPD@tuHX58av&I#?UIkxJ$(!=0 zH{vzBVz()*qf&3vIx6)R+%U6~I#jtg%+wew&KoM7-_mItDm}m8T{C-)?xgU2Gc}Be z?!HRrw{+Tlm7ZU4>&!mUQ5UOb?lQX3(5g!3w{+U7O6S)eTsw2f=uQoY(xHK+XiZnw>5Sc)fQ_GDbcms1 z$j87gLSS__bR;;FJj60|6{L$OEHc5i~t9-K+kLMYLC*JWabsm16iXr zklw~rC8=@0tPB)A+{h*QTE!I~S7HhLdDM&>uFChIf z>PuXzu=Btpu=@9LLR)rp?EvD(O{2`JqKJVt&N^SVVVFcvIqP z#;#@O7{UCEe+mBPvm)aYbCmvN#pb^;<3B$`xijOl0v`a}1?zru2=f^jQuyA5s^+y$ zEL3Hw2vq47o)QG+>pks1j(RvM?%OkXs06%u_(^zo3`Pib^GuTnWM~S~WND=<Wyl`6+DIGJd2>rvWX?tMr((uFj0SA{o-lba(sVfIo|Ly0DS3464%<7G{X5h-jTb?jl z^Js4tw|z#@W)yfV-twqXu*eIbWObSL15dQN3d%LRV!Hz9%^|R-@<9!O^i# zX@^s*Qnc28q)Y#D(>X(RRiGo>u}#5RDaJq`-w7iOHuTp{M%#ruIf~_b(-E?p^?*nC zfD_NY-CbNygyC^{Aj19!J$31Yy9C>0pE2Qh3aeiJ5!JGe~bm@Y>s 
z3k-B2l*vovla2f%Mn2gi|A-?W8W8zMj2thSj`*)nA%PJWRVC+k>L&lCB@%aRP^bnv zs)}5VmAe|N;%Uu56|@qjDwCgLOm;D*xfqkZ7}Hz~tF%_a6hnTNMvY6O#-&m3rNMJ7 zjT)B*e#*|Y8f&;|yjqej=42PcOT)#;kW6iMX*9bu+PpM)j-}D;(!kG{x){@3jLBY% zX>GLhaHnE<=?uBDjQFx`)6*&;@&Vg8<5Xq8wbypqOCQ1>-ZU_;u=iCq2@c%7BObYR;R_fD3$I9Z>6pO~y(P9$mcY~fAJn(f_9kqD#JzIUdC z!kL2ygT<#0?7iaN>9CwG7H!7j^Ip2`mJhyXk5vqR%SWE_F^Nc%EFV*pk15K>6ftQw zCOSUO3IrM^JZ42=GM$X%4P1Hky;XxtLRDf?JVhD#6g)pA9x>ygO(vD0(K6u8sPNK^ zY8eR*Y@GS%B45j?&C06v9`nb7$e{q#kAmn6`F;9`|aiC~tKt?6J z0aqvv&~)QK>7+1zgtoeYDQi3R56WEZ?lVn}|} zWS{-Ywm58L^d`)t3c@5j?NnGtDGnFbn@!LOT#%bk0{25?KIW)Qez63dKn0aaIOgR9 zDannhgBg9yW>zxNTWdmGJhdA?rA&jnr8@7*rFDQJV)4FF@5G@PZSgwF^sqH{3&Us3KGn6-4vlNsp)S?10tq@%&68h&nK z^?OzLyn5e2OKmvPMt@qm<*k8jOSkNIlFFr`Z4ah89s8g5@}#(`GoqXk{*eh6_^8G# z!MjZqz(;jvX}`ovJ-I(p=D<^x=F>-}J^#S!T;v54>?rN1N=tvwBSd?wI-44?8bA}6 zrJbaLnD5DW)jKupwaCYWf-@baAAp%Pd0e1)Z1(u+y`Q%@n=YKuvhBi^F<;k9oSA-` zT*PeGHo^w^?l6qp`!+7FP-*9=6xpeTuuD>N2yg>b0MA0zQ$2J_a4pg+|T@clqUaHyxeIS zn3>{Vc$9cY6$9oSijm>&H}=s?jQ>m0NL2L&x}T&4{;f*8yFECMEuVX+5w?6>9Zqi# zCfMpZk$SDHw~28cyF2){F&KOS^D+B^i?ESo$LtLdm*a87W-% z*p5|hpQtStj)iqk%Y}tlE-ZusuFV}1wdKOtdMBus7%4<6hg=DfIB3}sC1}KKceE8D(S8Qn&xp3!_?R2U&Es zy;UWuT5kqmA?N>ys{`9 zoACJ3m;NRLzqCdrT7#cTv4Uq{b9EaHn6E9hui13$P0izH}vsyOYz zyL)+(c~w18P7U9=1`n;&)h#T+dpCJLs#oLpC$BziiXP-eb)h(_f^oC*Au5Jh`D-fh zC_!eGM2+yFv+`5%sy{I+?-)NT?G=>83F@$}bhR(^w7S%+-d zS@|$aZ8s~+xQLsT4`*lP!`WFGBO}bp7#TU0>Ns7q=EuBQ`RMQxs(kjs@Ejgjp0R#Z zv+|jmm05N}Tx_#)!34F+0-Kdj4Reba+LPAr7j4-D5ih{YX61H(^2A>;jt+ygXv`B9 z!{c6UF*?R#{9lnqqG~idD}QK#?rsk@gqbcEh7;j(uMVfnuV&yx>NQ+%Gg3p?X63cv zPCDh0bxeySH7l!X#XwbR+}bSW+HkL;FtcN$I>6M5cqC&WrXimmnrye$n6ug<-)NT%Y}tlE-b`yF=W)r zg|W@d%BoGPyCGLX&B}l}e9){sk?~KT`X0Vg&B|4pm35;tl%tufPv7%uB{ObTE?_KK zbZ=-NvLqA|QJWDxXhhlRxG*H5kBjKzB5F3G%zUhIUdhf^vMk1{GhOg#h zH&bOM66nfo-aZYw(?nM(I-8F*C5Wy@{0;^SLYlXO%(Fq!W#;3|{LV3pOP4)>8F%Ja zK{rL~gzTi(f14fMl){9mVbo2lZc@Q(z2|kP)XiUbb!NTX^o5tkYR8M! 
zErTvzMOiOaw|Lo{BB=4f(cns*?C?Bs;Kjw^JM2tddy3aGs=d~Dmph?V**% zhu7KSvvk@@$7eJ&yU0yuCo7Nb!Sg%$xt34!)bJ&1SREb{p2HVd+{mRIm!V5~2M;u| z@FsDsLt8fa2A^$of(F^Wp2cZh_~tex+0r=}C7Y@RrP#7Yhqvb=i!YsaK0HT5*Y?VN zEgafIv-!ZuTpQcA+HSv#iX|g+`rz5s$%|I5*YAQd*fKPj-{>|U^mSc3qVk0ASObx7 za4prbQyDFzLg%Z3MuT_sD*lTL!{6OxH0`0A`v&(>q7`1wZieq@XwHywGiOA(`EIA8 zkfHB7wxgkYdgTMbJ;o%q{Gm=NES`Dn&|7Rdi)z|G;|^xr9AfSwdZ6d-7iBjL>&?1Q zqcTQM+3JobxzmGU;a%<+4~lsrE7JWP^}EczMwC;}Xkhg%`A9ff0S6&WDYCm>!UMs( zjI6ZuBa}+~Y{!q2Ag3$NpB^GA`= zcj7HOB+jW|W9N%sMx3#({U!FN21hUIL)zq&K{YQ&42Qbavg_>Cx{mx;IDh}Li_L#y zhJL5>pR{b2`R~v8=g~s&$y;`^`Io3cjlc7svWy)m*jhsyfBfx^Qqg1;h^zVJIjxC7oQx|*Ta zg}9A59c!+49&RJhlL-~YZ3E(=UYHRz^|V2Y=L*k%jrUydJ$t>U#A(rMd_O{-<}5q^ zW~P;3{!i&naZiHY%jT9l#7iqjI=N#-Iyb0Kb_r*6OSe2@9t=OY_qsvn+n+MhB{ddQ zZ?w${f6F}DjP?~bBwXxh?VL@a-*lfQCvUxEP@84j%-gcN#^OCgyeWK~Uc4Io zslGl7R=V|aDcxolqE092J&y`E2Cwj<*0N2xZ?<{Cno%tHErQi}ilfHUQ;nym8c$C( zo}Qb$$ZEWCD}yN$=|#H#%fK)S>5(gf{I9F;=cx~RCweHVV0PW@*LjC{oDgj!W@f+c z1WL=wpc5}v1|i=5_wLsrn$VTp%77vKyr7-GX@S5I$u8{=eNSv=tq#3B$wS%(3 z=SWT7&(N|x2XO;SfXsf~|371l?brQ#l{TIIvtZG{iJW5w%Q-D+4~^9!HQwMWa+OV( zRkG37IQn{t&7lUO-$SJn-6HUoXFPTI+KpW08BakvHM8G}k(g3d!+WC4BuHrF8&Y{6IOTvH^^XZP3IjCPyRx(h8P zWQZ17ofAP;6UgAUm}oZI%sN4HnrP=cS~3AGvE1q;hOgEMnf)>cos&)f^Q;p-f8;tr z6Q5+48ZT`Y03WbU;Pl8#x7^Xui*C9zUngyowR54Xc{>RStUAf<+u3wy+iuyjH4Ub2 zeULp{>m`bl{GHr>#FD{D0(fb3&a4mSGpypZBJ1Tu1H9@zuMQ-cjgG1)OXY+EsDfE4 zCnn&Pm-TXj0$wJ_Snji@d76M+?w!zJVw)P7&h-)gujLC~9KIyTtaCy5dOlK##wFxq z@KMCi>Aa(K~Qa^S&A1fQF|b%U*K+Jm!enAt8ti2s}x47zCA!gpA~NlM@I2N@r4K+~cT8?G8sh8oEQABxWE$voNaRjpuipU5xO9^*AW`RpEQu+{9d zmS}Q}hE^(;%COR-UOMd`9QA1EAH<2ZO&2Ss&@!^hbie6_o*A=tuWN|4d9k}O$@Vt|Qd`$)K4(1mO#C&fJ+;eI; zS&G+!?a!Yx=s)J&frl36zxwE4{xMutJSqRFM+f=~w!eH%%=hMCYr#u{`;K{=k3Rpk zM@f78qvs6F{#^d1u)Q>Y$D{0X=D*1ARr$Ysl&^J6 z&v#*X2gk`5OnPO;!0cnf&yv-Jlb)I($;aOd$MCy^9y95Q83PY}ZqhGiNG6E(T1A8k zH)-#1-X;~3{nstdw|a|c7Ode79%XAZ^g#X;6TSxsMM^~W>Jp4)EVp2XAdP%J||e@QE>C@V^7^@lc0{Ia)_qQ z5p}$bmbgyO+Id$$Q~w=zxDz;`hMVVq4y^th-)+2w9#b$u7Z23k>(YL3+6%sEs`>pf 
z?pb#Y^EC}~S-aoY^#Ugr-&eotdY2Q6PUp%^Uc>u@WCOqc)Yke=q1-tb3w5g6NH8}9tVs-Bh&$RzxO-tLUX?5V{n%=dc&$V={ z%Z-mk8WY?=*=>VeQNrh40Ef~B4W2_{gtXenGtkqG(yPXvX5OT5uu0EUUoG}ER{xY{ zu1$aGaGXo=aMmcTUZDXZ@WXEFO|_*1>OeM|FrceL#eGk3oAq{J&QRWLxZ7WwzWi`% zM_szkPw7M`{o1p=kzWxKEj!|=cNqIS>wJ4qGg}@{JsK~4BHr~HBjani@vZ^(DfiKX z)P3WaRS0x%{jaEEc~SzXNUMLh4M`&X<-?#b{h6w_NqZyDj3j}=vHEJCyclmZp49pFvEh*` zWkiik>fI<`&M`g2^yNi#98E4O@Wz}(ZR&Tj|Di5QQzJ=iux?t*h5?$&k#w@yC+xfl z(o)Xot}-9)QEYmodS)%*pzZ-W!NjYND3U%0!{^cBi8D+cS>3&s;7N(}IW^JJex~V- zczLs=6jRponwhoXG2dSAv(c?~25Q}|TqfO2R2W~2N|i|MPdF7xrdkajGJ_#h$TkA& zQoG}&{oFFGiT4jr(%l6lB@SqRPLAUYhRGZG%}p{{wg+z>Vz94?qa8W$P;K~<5efc{ zeG(oUcWE;qc#5X37AB2RX-VSWESlhTLz+lujcA*thCWxrq*BwT8K!2Jw!qMKWS*WY zcc>oPrPJz(m(i}g@`!lN7O&ajU4#1=(~@itp(RN?v?Q?soYj(CLMA3!!bfOHj)ppO zyu*>sAP(j6R_C)g{7ug%w5G_TSyS%O{LJ~(hQH(agx1%1H0$d{ZOPYZU`Dn-aW()3SDyO1MrLdTR3dbNN+!Ts8>`a=>$vK%I>rQ=sCro1 zlAY=}zJvI#!y14Sg%edfx!&Jjo5!vED=)BZKy^^I0ULPJ#GqN&>@1&)+T<{|@~y8i zY<^s`wN1QxU9aP+fW~nA=!vVH%b%Rj`^4ldekUah=^nPFjF1p<0_)wLK|{;RZi@c%W%!E z8rm!rgU||{mt(KN`*J&}d(}hcLfhTkWy^7%+-1vIL$D%dj?Jl&vCC|s?wnplr(oSV zn~ai@tjg^(&ojbnb9Z^3p|!T26y@5SjA!t!+##bpiPT3jlpAuKr&2f1`0~CSDk)oK zJ^!j3YHt07R60Z#40OLjbTl1LWWul!S-O;E-@peu9ZYN(v}RITJpfhO|1nJ&8jBSa ztoVj$TtZaw^;!{0NF*Xu-f@zss>KV%OvtJs`^&Np-p`Z~360=RyHO6A^GL9 zCf+|Hfu=V@k}s0UMv@Q7_GO>peFC}BxNl4fM1fu>ObV3eWd-EKLaM!d9wY-OOiFPw z!E*&MF*%Ilanu#xV^*njRct|`@uhAmAoo6kkMKabcYSS(7 zbL&(_GL=kiQ#)gyx`Sfb~XH# z9sDd;B`v%y%?+yZKO@ zl=JaQ)M=HyMwNXNZVH0itLSVcq#L&EU4LBLg&U2xH0eFLkW2>5B@{44y?wcw7^?B)|lMx!BM@ z|ApCntMe4H9f-1%Epnq$0g(!sQGB5Mc3su4O`DIYN*|Ay_3?~OaVgh!KNaZSIn%@d zeN~77KjTeefMg0Z8p2DZqF0x`2w@T!MUZ8uoeFd_B_5}u3$pbAHXvfD>;_I(&dW!BJRD0A1ZH~&HqKU9|3UB@Mnhr}WKy&CGwm|a6 z6zVDM7 z#SoDkL?kz!$WL;g3pW_i0+NG>cef>X&!yoGBZ@Y?Bcci;N_xpj7fiJp=)Qz1 zBRp70;%sSI4l9~oRt}3Thd@-D(2$%`G;re4;RRF5DQH2{Q=(|d!w=aY4?jvdZwjIU z79yt9L(8j(cv@FSUqDaZ?46`ayWohC_#thMHu4AGj0A0%6+bkbxJG9H++ z1{Z{hrsqX?SDp)5I_<7J7eb<|ru2#&(P_h!O-3Fmq{t-wK%NU(I_-fx7XtZxQ~Hej 
znDE*uyBv8L+#$a!&xMG)3n4T2O&K!s3&Pz~MvPqjM44HeXCc)5+B^%1FSkJ{TeUrZ z>*R_(bY|p=ojd~FCs5t9(*=dh%F`q^Y{56tau7=i(-8{(WuQN|sV;}fTGKhQgBU{^ z0@)YhjSve>c)cmD-8RsFann)}S37CsA&8-k`u2)*Z;r_2jCv3=?acGAN^0!$un~yutGl<X6hWCjS+ zp_l5_^#z?3>e;8~@Vtj08@unMjh{6HkZ}GAX%pt(7RQkPJCA$#$*TU_B%|FraOD;E zR<9&i)ho5<7pZy_BLq8zg^2y*2rK|jX6t5(b202ZOReI^u=9`3#dYOQ=bD(j&|sF! zNVeFLExt{gTP`P|;@j}|-zr5!|4MS;Ul~uwbH`=zLvuBbA2jdhqn!W0jgQsj`8(Ls z8Eb=BdJ#tP&%^Lme8LA7Rt4Rp_A1`2g7Ope-)|N6W8VEgHZ{35u<;N4fJR-%dx+oZ zwB$DjK1oa7fjh+S-AiZzlA0*G;@d=LzpDQqsQ%@nRtR$Dl8$3)0Cq}~ADG>5-QvgU z7M5nQ(mcv|t)sH@jGs~u`oC%Xlw!zl>C)kpH%pUW+e*uu()>5esEWI72LsCy zTAn>0-~NAGuahN&zA0UJ-N?+sE?h0|Ii~ZOQQRoS;llKxW=^(CoY$tT*Z-Ver#l-O zz7S|NOiV=W(v+E3;QKdY3Lh{&n8^hE8%gv1<3-uO96p>GJa`^)&x-dT1Cb&Xc~40* zde@Mrr^ZM8J@sXpa1;f!#!^Yx-}V?hU53l!PwMkjtR*xb3rzFj_gUDA*IGbx*Rq( zxcgAhMHesV(y^M7gQdnFzlHe~=C@?VWh0yP`*@I@|CP8_l|u`7w9|G z*g{kQ7y2$|NZqASSunQM)PkD+J?*vJvVBB*6&OA3 zyF~4)wpoHj6=khZeuWvoV)HAOUvb8-%>2saSLXbvE=wcRUou4*+(mmW3Fcdb{4PLm zX|MKNBIXl2SaG}6#T~p?@Dr@&_GkSBYljE%tA0>*e!FG<4&qsRiLO9&)?Njo(@(JQ zb|@xU0mQFRbj6}ON`A!;T6<9l(Uw`dWuhy)W30W7Bextf$-1FWwj#~Uq1NPlCY~KA zf;iC5uaS_(5~ljG_{!tzR$h*E#Bu9%xh`$x90V#}#pnOC>69_FaD!pt#uSE_Z;Zp@$liB3U+|Dg3K&rh@do>PQl zb}z2;oB6qALXwK`kx2+`{yXa7(#fukYmN<{Fkd;p;a+#$Ek3`QOt? 
zK6qZs8vdm4`Mb1YUD=DDlxOCw;mG)No&tf^_1D(paeUZaao0`%*(2~;?A@_3%S|6Y zPL`XrGph??9L}h^G+t^8MuVU>Z8TnL6G(%gBqEKYT7YW&)G6Xdp09pvOnyrUKjC(m zagX>c>|=dVYs2bJ{T44*@8+&VoP*rCQp@`v1i)YY}`&4eLFjLyUc&tTy+)M+pA z(B5EUOL}5Y1`<1iGZ+;W6g4tnv4u9`pb-IsNi-u4;>&4ctLFsuJ_lOwHCEb2>p4we zf@pm~YZa|x1Yd}+@s$)U-0yGg|4f3mo<8Tf=lML(=j8Lr{P)Y+Yp=cc+Iz3P_u6aA zR1ml3(JiK2Q&o-&twvB`8`J$K`M+!ai^{T_CwBkd04h`$OUts;$8`4@gauqwBZ!v` zqTV2C1o4PLSO9nUfOyj&8V!PgG2I^+gazE)tjYxFd4+9Uhfi=nw_uQ2{y)tH-`y$b z={e{wK?`9YN$#__YBEkV3Mh0N8f$1& zSa+wq^AGQSPGe59BW?5H-8(cUwHtj|yzg_DzI%^;9?$D&ww6JX_`C9QQCP{0aVGrz zKmkGi(-<}`u^}TP6)*)3Ke&5z1X~CtH@<{Dfngn-55Yy-0Mi6et7pYU^$UyKPR-xC7(s|>8E$> z2YjOymoYuA)UFQL|`jY}Fyx1{?SdT+C)!N(Z)Y>0$w$%RjjE6^`A zG-5_$%LZIOQ_g&ER8|86Pn&SmihKA{1ETRZ+Cw6;mJa%A(DDsD$DzSN>vd25%?ln^ z%MA10C)nx0IKAYxt2OMK(Ad?3JjLqVG)JDoX#Fhh1s<=hK*>{n z6IuLMBO>^({`2zG)e`?K8{j<=NAShDuu8*Gsj$jim?x(xtlEX`3nF77R)&QyoC`x1 zBW!}gCRmsi<>lYddG|@!Ba-x)MrO8p$-mV=221Z&c&XN<{7?vFqDGUa{6A7bek6Iy zVEn%#oyk*zD;Au|Q^kVQPY}CnnR=6_Dgi0cPq12WCQntftVRB%5W*QYoZcFISf2U@ z)DF)(?!-g0xtrxZ#XgoTj2N2OS-!}&WrTY$nzeoVnN?1Wr3d4Z8kt!lE&cR3G3&AX zYkaq{K0nM)>^`8GQB&da8h$jsy8&N->IO`RnrZhEX=6QKZ*b3#`K0isPoRG<=&iA; zeSu7tO{N4EC(RUZPE=80B%)xMu=OAVYAC^}ql+l3>MliCg7xqaA2AxE@TJF4t-EB*!s@B5-*_?RQoEGVu`AKiBlqJrBNe7JPKBJKMI6pE_U%U-MrFh z@@UCXz%-z2mvn4R@Ob34-u%96VDpErCY@~RWK!BaGAv~Nwe5}&B&qQ)w>!~@K@+=Q z`0Lvpi3IM}QxuYo1FcK;s_%VTloKxqVxXMjXwtrJGJ9?VXR}1f^+OER^E&WF?7Y+C z(>wt1ugK_+u~;Rbr1~1SWHocZw{CDNRXqjLf+_UNWC?vi3oF!D{r+iN087RY4+i;%E_3`g4tXt2NeL*|#rPc`pDO4J+>xA^z3CjuA5dl^O|G zldMoVLw0!q0jboKqC*Urnf=SEJv*qHSOm`D=*SL49iq>E=822@M`i z7UHz8crI3*aSlOhuo^}A%${&NQy{}&Q~=H@3Xh)0eEW1r*Fc;en&tdDq!cUS$q@4bRIjj+7g%(_xrEmvxh)d!tB7*MQ7vwAT{dOsZix?59X^{`I zua}G*$B*9cC0Kgr`2Ag-Y%z}>b3!O}chHv$T#t$kfv1NS9~=l@bB?y+;6Rf17MK1h z+q<~*LC)$z3%qCo#`P-e_qRShi-&HNRru~ce`z)wDIWI$6i&E4d;7%DJm3+Ki(h*f zUGb065D%rUCnUL~_|=C|5P!-P#8C}@$W+4z9`0-PRKxc@oSkuc=y{IgR&3omiy^c4 zcPNXWfND7R7>j#6HE{+WSO%R=ukTdgeI zH5I`wOoC;F&rFphxIoEzQCeDU*CSWs17&54P^Mahl0!_sB%5M^i?4JbQ?3-qf>!4X 
zw<;O7z>SnPS_QF#qcgR1;JV{kd+A_HK5>h_(f}8mim%G_^_p4=mFg)HNW@BFsUUu7 zf@>bl-P90EH`Z3U)3U(Yc#cbwAET-E4^hs!hs5V;G0Ymaf_}|x#bV*E%^D-ipXQwi zf3$;_A_`D|v)f=?I51l_H!GP&$yN4X#3Fu2r%vV9pQ`3}OzO-0GE_3+5P9uC;TCF!uO##?&*Z11KbZ0Ilbe1TK+LAU= z+o^3_o4K~KwgnEgU}=L30ZI$+Y50^o?t8D+=UPAE`ylSy_A25F7in1JQN6(0?d#DFyTx>fBk6af`x{Mejg-HuI|;JD zpkj!nkOx-tpxUGNs)tN<+)LjF22K);agw>=DrIDxfhc9S(D=OAlKh~~BJMCR5WV&mxnb@`@fW&%t7 z-G%6-d>)mmMEL+Z%r$}5-wQ?n@$@rNPyUDOjJnWw+~~B>hNXpgh`1C9KVkb^j95$X ztQ^0j!eV}as`Kfglk06jOyF5LK4G5V6%A5?KC{tcPRFxy{ORU#yI(?hwK&NJ&$eb< zLfhto$n&mF1%dpZyKwlVg|iJu-%vRQMd4Z@yw##K7T+r9MAh$j9q*qd?s4=+fM}P^ zNfj!(y0;moWKdq;OqFVoUhm$&i!=qlLzyOq&Z|KBF1cB6;b|}6W_t>cK{`k&S+-nZ zn*#UR^LIqw>z>9Vi<2qcGHvpK{N8>K`MvI0JW~ItS)^?FO6W&cIDoDn#fG&gxIT!( zD)LvI0-{o`voghe?CX`J%LGSW=?a4b8O1pv^aGn*Hr9ApoSv2lrJ=SQZsiy%xw*lM zzS(x^R2Sc_vC5zvLnY@oi-7ajss4lCX|VEBf~JzWoyu`;mqpxVv{U%sINF;^F6&YB zW!sJF>x^m&|C>D3LnTsQ1424&j|bYCsE>imZar{dYR|w~uQ93RuPAgJ=EkBoD7lRv zW4Z1qjZeGPlzuOQh{jN&=*M}n;!e|>lnm5Eh|~}@^wpMcL5-5PCDJ>iQlR950gKe{ zlrbgt5Z(LMXp|(WKN( zL5-Ak<-M8HcuqAf!<6%U~nsn$I*JL4;%vn1xt4%e^%){4AjUbx=`kMN*1Gt;XAcQbeUV9S`$HP%_c>D^gaIsO{g&8UhStj|y#Kdx7gg5n?N`k1@4p!#z652%hC ztmlBqC4;~lESX4~-)z;gPQ>4r>uUE|0^yk6X_eAtmC~#(Gjp9)%KRRymF-IR*VV=+ zp16Dko{e*ar(Tk~r;yM+g~#1J1*d~}QpV3ch0WeQ1!*NlGybdHJ%!RAav4Dyi5tXZ zBkn24k0_JBr*Ip|THHzn1;v|`G2Ks{=KtJ1g&`Vww>5J9p2B3Y!lIyQ+36kv4=oko zTaWS|nsO-rje;XuxTi3QMGcz&s1YiD5^_DNanjEpj^`f2?T3HHJqIlP-H!60hrinJ zuNaZu5B?Ke{@^En@Q=)2=~=RP>B%4Za`{UPQ)L}=PvLg&o&u%eo&wNzPhoviu(qH~ ztEI&S6;{;6?w-P8-BX~PD++2{Mbukt+}%@%kHSmrZ0;#oH8ff^=$?YrgL%1q(9Ecy zdbr6m@ki!2jdFpyQ($Aev=Q$p=rqW4^5*@3ynmDn#9rUqgE)(#9ee_gw1FKl)@cQ8l6q$u2N8T6M*gNIze>;Mc7zr*A3;s{8!D$_>Wc! 
zcE9VY_O4tBNI;tW8mbM`6oGufK&All&dRgU0z6c+)}9c9sC_%7PFF=U`Hgc}6&EHhG!={kjO$AL!=mv6AVc=(lFA=}8{?pmc>}*CrRwUbUf=}`?M zy+7|SHh*#6PkQ2Adb?}llk5WG2{*(u(>Vy3&Zh0m(ecbozYtIOk&4A~+NV|*T;-L( zjZs{s`Tf_Brci!1R#yXyf94^OQ-rSs4NTIbUPSt8gD<{Pu*Fx4Skz-86z)0^a^0m6 zGH@|6jQGh(D(uSqFzvSg2aSuCAL!e2<`c2*pZ(PSAa%kOlD89?+j{^&x}KGpa!fq^ zR=j*?boE;bW9d^EV{67Of0(x)Qy?l#W_-OM(Du(|CaaccC_E)-dkUwNxle`r)VNQ* z`!u)@9i9Z&H-k+QZx3I?u-!%XyN}Jd|5>QxRRe+6W$ikOE%|?#ho(3y&$>UEf(Iq* z%)7eeB9H5Pa2V9~d|ZFtHJEpeq7hx~1oN)4ysIMbs>!?R^R9-xt5GiH+Kj`|mXGVq zySnnOp1f;&-qoLX4dz{=B#u~9HG4$n0U;-c2Ms0sy$h8i43gOf_cDxD5Q0*P}u zi9}Id4oQ#ue3B#7Y;7^<8MyZ7219GOSyYn?=f|R&jJ)+DH-n62L@8T89vE|qz$lz5 z%vebk=2!{HDORS%u1t4}p`>B8+VK+<)w*_KdtD@7h>T0Eth2^g;Yzc@l_u7y$`Jd3 zb+&7#*=nbz;M^W#iR~8UHp)~2cDJkqa7Mpz#-MSAM?2LpOuMn|cf__2_plY>Su*hK z;%Jkt9ZWK@6Ky+?IrEVCnnYP3_B2E9bJ5ZB!i%P7DxU{H)-P^OKhBC!vpA-3blA{C zcqILhw~j;Ebc|MY{mg-7W)uhH()9}5wN=oX+#|iOji#T}uvEcN&gZpqOQfGq$jS~( z1!n9cVE?A~#nI6Y-Go=LBnvJ020?|+(UtLZKi86UjxXIgEd%u@ojA?$nN@nJ96LJ` zjBG6OXuT=vD3UqxfF1fzA_EN2I67>Z&&A#_L1u9T= zn156Ls$T_uYM_Y^i_isH%{Jmrq+iyq^CFxS#1B3HLuBfZqk%Alk4NO!G08Rqe^i2& zD%V!IU&NtJ8daJOfK&-OMiQM`Vsy%%DLz{3a+QPkY7f&nP`l!NJK}7Tl1jA4eftxc z;%_C&Ux=5#z!GP*A_PW@emzlc_Khr6`OZW+Pa(T&SjuYyK(OG^{7(`Wvdon3i^iw* zO~J}t`knHke*MSl7t=P8($Yuy8TXd%{Kj7d5`lfiv3!q7gjVd5e@SRTIp0a7Dp9Hl zN9qPsszh-ML-URd`Z(}-v9GrDeKUj;soeV9tiJ5Hmvpcp^EQ~p!TybQ@>M4sq-@>H zNygziy@-@&Cu*Wqpt zA%V8<(v=mjHq$&KrWc3Q;}>T{6`r}YA7?ktoZwx{8etp$z*==a-XG1)SM;T#=xE2+ zumpM~rPp)95l{bF6E{DGXE9*oOI^WlLFz(&N2gBX*Pl9p-!ZBG#;-F@@@1IqZxJZ{ zeYSs`w{Eio|48dL^4os1cQPTr?N@vI%R96+to`dUA=9&(7^{)rvyLPG(vj9}b^upK zpjo$J&shMcde&{^HtRN6w2^!H8+~^Ap-iWHCmC{^bsJ7a1T%dI7{imdZsVCFv8v=b z47EY-Gy^l>R5RVOiH{maqXEYdYK*)Yn}qKpwVkbZILuIb8rq%C$9-P+#*|x%|fs`!AON z=wbiaVgxL-yY$P2Z}~93<=~4~y>4cYjw1u$#26Rri0fEoxV+?hVvHl5`?Ni{k9O{} zbujTw=e}Hw!I_>2oS}mWOS`Y+dqcoj`0|n&%4iwuwFmsQ^d>byyz1#d+s`!fwy#j1 zb;G;P`PJUBOvHz_1=njIqdtCUx(exRN@tPzx^2iR&xNsSCG2Ryv1-hPq3{qna_;F0;~_4kWf2O#FY4ysJ~Sb` zFh6166x==!&PwrBN)R3LlRMThxS!Av!*R;FJD8&k{LlIJmYE_hlt?n!; 
zbixf%)hND~q+{IWL{(JS6gVo=aoq*5n}`aXa*be1h2ao)?GOTs6ayA;Pr27!HdKh; zR60t%W>AQ${Oe)}cLPh=N(g*G?>TFktAX_nzG}~dMW{rwKf2}|Nm1Apd@??Ic6bp7 zOE;eAJuuI6%}mtl!zM;KQ+tvNdca1E28F=dH;NcU93N?fs4Z>;3!PWy zgo@sNgq;2ArkL7e0++r;BojHH18b9-5VNGUk(uxTXplKRl*prD15s&o@BS}f22sMp z9)*zJ&Op;Wy>CsFM8o-ZuMkTZ>e0yHi7vviffNzhpvn|ygc_C=SO;JzyL6QA&1rbV z(!V3Gg7ki6Ol({)Ry$)Fb(u2`DAz8-ec}N$Q?{V3V$zI)5c1l|Mt?XZ_~48VTe6x*$V1MQ)HUF=hJPZ|_J*Sna?ImFnO$AmW~9Wr<;0 zb)_Q}jp>pzRsLobkgz7cIi!K?=T$^(?*@|F?3ik?eA8vDp`?$Q9Yvk;sgl? z?UufFb(!ESo|)yNVMV4ykz*FZwz9B2 zJrtAhcTQ*?0wMz_^E|=F)P+(KAd^dUv?6)2_-Orx$Ln%;?a>HMt)=fsMnpcHa)bIKNW71;H^9dQ?Ji@w1XJiyz@- z=W#O6IAaWE7da~_(m6(y+_x+Y^#eF2^f;)Ek0Rfz~_G_7;B0(9I`UCtc$N#|MR#C(;p;@>giMTT??hMGpw8S29>1b~p%V|Tl zUCIWR*i-D9$SkAZ-C=IJmj;tKU222AUUyfIYKN00x-}6lK}%guoNwKT3^Y_pNo1Si ztM0IsNMP+ogKBl_yW}foNYH|uNEr9^$87jv@8{9D@6|+R#i)xjCR~e+j+kvbUUJlYInjk^# zO4xnWM}0kWcL{JoxF5i<3OU(ICO|Mdl)a;l`fXq1+$WUzY)5lj965opPcvMVo09^q zH&6~SOBoFUkImtevsh5)d(tZtWS9Z$Y0@XGbs$V`MYfmaGV>VbzfbLe3s!PohoN+kCSe!{)k-uVb=+Q*;L9O2-B z_t7P%a!K7^i_*9zn?PIGx{K3z@|Gq=u)bspdV*loQ#9AGUa<~D*^2N+S>$x+ODc7g zG6k%|5%;+Xge)Yr1lqpnnUJ#C4qE$_D#_Fe@+xORGq85{MJTPN-yu&)S#?vNs7((t zX?eRS&bVg`%?2r6sT9asn5}tQ(v|CPtZ~@v*()ME@w^ z0C_-DvHEf%z3b#mcmapNmJODvB;=?t>{R)&+xW3$F%zlMZmy4I>ilNBQ4^;+V&E>; zMi2NIRQBLQ!gw;by+r1!u1IEHxB$g4vfN6eA-5I3r^ zZTl9Z#t>&Y{`@)4PiRe+g1*#w{1&99NCx-U^zF0deK33bWs|Clg4tu{P704A?e@OV zQE0-K*+ft-Nxr&#Qn;7R0qE!}Xy$8~iDy%G&swR63(D;=bH2U^& zju@%hxoQ&ApSY6~+s{Xn4>EEA#Svofa?kk8)pUPR;q$1`8Lb;-9E12V%&YZNpzZUF zG#UvPfo@>olH7fH>#cO?IEG)E`D*TprT1g%{9h8xWaS@!+HCHV;B#hkf9hP^?@Bw@ z=+t5}oqMvG&OOdd=Z>R!83w-z#q97f7iuTsANpXyY~(DK%3dI~RdRv$6GGXDR<|R9 zzpvnFZVj6YW>}r2;9$1*Sk}EGg3nNJQMUIus(nQ8B?_*{_Ksy$DumE+g_A*F(q${L?R%x%RC>vSo zs8i4xnwUAj*0kd#oTUwp3OBWQL6>L)EaVtLLFD*-BeLymt&8twI2WTe(~cSrY#>TM z6lHp8w;*q1$W;G|Wva%5@<%_lfAlELrC}k^s-agyL!hi zhRy3V8Srw)g4jw^bf5%Se2B0Qlk*mv*Qr4e%LNT$omK^2ZbRUmV8BM+49Fu~H}RZK zL_Bk<(}@5sY_@;czVbm0cQAvMt4$?>C{!XQG3s)|5tRs{tDU(Xc9Krbt3)uytzWL) 
zwADywS3`Lzdsq;2wSud!S8(-@Oe12s>bLt&cmGGi0=qU6Crh*dI)&wxK4tDx;XXC) zQ|~?v?$c-=r>dYd*RKPY3+v3gy5t&oZ)g5!k853PLyMI8ohGSd&a>l!4ZyJjaJC5( za&xmUk;(MnoRtaR$_EyBO}hf|^yYZ^Yc0DtmbOkQ>4VtZ4gm^teM9i)?z#{fXv}Tp zT2)>sr!G1s6QP<4R&gZ}+$+aNa1R9+A-kU*s;^i0DYz*=Zti(E=!mwVQQ_y~M%@t1HudmJA|-+<`UZtrlFOsyEPD@Ey2g zh;FlcS*RWbxYe6Ay{KhwXl9pIReZxb9HL+jYvqM@1D)vG?7W)j+w8oWC29?YI3X0J zy&_E|*Vt+G474ciw3_Jp2JJNH7AL9iv$JYa-)CplM0YfcbhZULtuVrIo1IsK<2F05 zRvn0LRp*9W`2_dVGMG!t|DW1{HOMOTq75yfteov=vihkVQWM)mImW0(6n&+gQ4{T% zjpnv}ja3HaAo|p1Rl%w2to)RK^1Bku?^MM6E-lWR9qlKC+Vix}>=8kzv=Y6}sHX7U zd8)yT8Ep_OAN1)~rBirbwosG*ot_TGn}M|p2+%TpkYd+m>hd}i2;?<6rse3OFq*li z2M{(h-ELNzQ;E{bc>0NG`Lo8fuMnF{kYn0d`W>p2s!D_!g1j&~cc^cuq)L2(ngTJ4 z%EX}23Z583AcPt5!;JXN7y(SWsNUCCYEs`w25xLHhHqp?iMXMXQ>ZqW8$S%%3?X7r zS5uxn)AcS?y7fd0uDerkWZ{ezXq7!w?#}f*YSudl(zr@xByK%^0nDt{Pg?6^ZBWcC z62&G?`DTX343D8**PKI%!srwLvf7v$kkyW7sY3CL)8^j-h-2=d9CI(s6`KkbuJt(v_iV6or`eT?tIQs&?8sA}V{*?9t6GkZ zFef-ne2v!)Ry#(A_%b7&^5uv(W*6~|z^bAhvp3XR`-^z2uR=L08AQ&ZB^QRGjm7~Y z`20?B_4!@mYIpsbns&FYb97>>77*FG9noDo@}==AI>J5~1EH87ZT~3kG0-&szyTWF z{hc07zZ^L;az^Cz$n4Xi>5rlmydqgW!FG6f9tWCoJ~^5>3`s)cQWG%S`#4_SpLg zilm=pyB>WBZt;_tr;~nTcT_pS#GBrm@cp)S&0$(zheRZ)8VHY6y%SAdjI#ghk-)4! 
zLP$ex`QC_~4QS`B_w6s#`u4WGy1%ySt-!3E;GRsP+Vb6z^z)gsb%pi$^fSGG98Eo> zBI(B>Eq^*>)e-4!SO`F^zvb}*gvHk^3nh{-=_Oe#2fPqZ-ee_+b1XAGQ~D<;hnPFU zJt{)rS4TkK2C@*H9f}Iv^;F-!V|#1cYs2RyIQAR*Dve=E7B{Lb266Mm#F1LWoX>A< z)ty$of!04=L@OcoQVQKR3Xx38vGV8g)uoNT!80Fxe2>gqb1#DjBVwcL()I>aGgN5o zQf)q?PE$4GMslokzCb>Z|q-SSak;KY5=^f?-N|-;DYE`<%z1{`!l zVoAG=%F4FWfoSA)u8JIp4YdO4auGJ-8WxSPF3cNZOQJoN-*yX)_PY<8^VEiSNrOvD zFil|X>+py6lkgPgOi~A~I!#=Y?A4KzaV+Xe9FCiHwQe9|nN$1WvN3rslkov~hh+%A zEJOHZ8Nx5i5Pn&P@T+CWFlX6$#($budOD&s)ifeYtCI>>8TD$JXqg2yy2@KeA)L#m z4#n5d_Eg2l${e&NRh0{%SwEfn>CsPrF5hMCj2;>kQ1|KL6KK5>?DlX#7}IGzNJQI-xqGLU9%vZ(x79LS~4D9d}5~ZWsx1v zgwNJquh@sN&5qpDsqk!0FJFvgPCmeF^7z0>JVsnwj@FAC$l2LOy};k6^pW&)I_hc0 z8B0GHN$-?}nRvQ3yeHQCqCdhGCHuSR1wN(uwWBVR;8DLpt&(KXT(fXpnsKk6SN(3X z`KYdZM=iUrniyr!6b{t4xGuIMmRa#FE@{_>g9?piO7CUkOFTf_-lr8o>~!db3t+xD$C`=v2j^7=2Vj?{+)$JlNpR7<}erjgs{=ez5=DI(ufjRm>ZMX?P zl~Nm?txeAcaO#v#og4)ajp_R1Vyr~;PMndJH@K&BF0Zro#?l|CW6V+Mw-F`by9~6@ zaEl$|VW4C3GbVcxOV872$G*C-`kEO|X2;izkFGhT2-K6QpZJwD>=smS(QeD?Bt_V zy4vs@rSIT9FuS13XiBLy*a~&uz}xn4sjG=p_$9Jbd|mn(I_cC0wK1iKO=214-upU7 zLR)+-yO9RpCzoi;N1jF-u7XruEIHGU68dlZ_WfYOO!gwCrDg_w_XdINd-MEP`rf>7 zKlbsay^VIoIVV0Q10?x7kx4|A3hgLG%VNo;Lh@eJ_o(nJH9Sj$zQ2HCis|GoZV!SL zMcj775^pbK8VSad%Z2JA^ksxRE>O ztRS2+x9tBW?%~qz|Eo%i?uk=Qw$gUH(som6xstx{SCzEG&~$i}*5S}}Slv?ylyrM* zTpR5CtEl=5=u~NKzeS*}l_CUe6lYD zvE(~Zbf|qh6{e||!^xSkOzIb-ral;LdA$s7M5gq(QM07u#keUwVUr@_j3J;bM zN|CER@L^W*Yp-Cj?rV?q^(}p$4f=&k-wDEs-%^cWCJpDgCh46^rR@7bWq|#sX)6NM z)RvW1yst{~SFv&SwP*T%Px8J;mcFOdQ&3Z)U1nMO)GR}gnx#chZy8vs!BRJEv9LhI z>xSiJp?a_H7sArF^ur*`UyM>bJnXQna9CD&Saim#S_rfr%P%;m1lazX76+%;P9(iZ z>Z(W4?fgMJP}cXdN=FtLpe&hS zUZAxO?qcW{!=MCN5y=!!jVAX-lh0rtcWLj7!ALrOE8_P4K<4v6&d5|&%*adtNqzUBY>*K^A*B{|N9fvRm~c>vkY(t3C?wK9+L-44hpC zc_5I9{d+iXvfQClZ=8GvPP4cK!Xd$D{vwg{!V9d_j~m!uq;N&J(u%Be;LpTXVc1&! 
zJ#16JwyNPXF-->3suMo5gwF}f(RElHsLK>r#gc=ugLx#Dxtyo0dLSmwljxd2*(^DN zp)38-z%Mc4nA?B~Tyys!#I^m5U&Mg(W(Qh-npD%}_MZ-{gd?5)nrx$WyvuQ2=1iVk ztF}EaNfNY%q+#1(di*sh%a}B%eVaW<>3>M1Uy!m(6w#srLLzL%OO%kenag8(QwF_Jme zM7(+-%PjS2Lo14#PmMmv6?~+aFsQ0Qi0l^RR31xnGHhM?#kzF=)E-+#USVVYCMO`H zcyf@|$)OaRgJVPt&x}$*>f6!e$(qu+0u-EZ4TDf=s6xmY(29jFY$_1x8+abQr^ialxO#I0cNDPH-4g z4r9v0sOeho3r2rbGxr%N37n4nL+gqV&d7lMxHmf9k$;$M1!ZL+DeVxHted_tg3G_@y&+!I6Yi6QqyT6VK#dpN8SGco*W zx*IOGZ0~X|g!HSOi!EDX#^@>Pj|x+trw);J#)M7``f`t`+E!LXJt*@hHM)x>Ut)9l=C;uStzV=vC>&I%;Tu?ETlh1{5>DT9l1@ABH z6dXOGwXnUSm{R4o>TkenCP^?C zGMvSPs}q@dVXnGw>4BK|-``=gW}x+w1=^;1Zr~J@qh$QTrt7}fj(r@NHncg=nk0UB z-u^g+4-z#yo8IS641u|sd-$ z`~)5;e=t(ck&;sttW7^Patdd^X?PZYNc#_JQ0+EFGRv98&*FX5Q*lo()JUaq1o1%) zS(lm56++G6uEChPZ3b_(8C*L^Y9D59Xl;@!y}LI33ePucQ$UN558eC}Or0A`p6QR+ zLZ4y!Nje1zLrDw}T_5Dahn?H=Lg~u!m!rTjJU^#am+5SO-s%cp-vT?qPuh%{WO_z~ z?;^`U>t@OlW7h+`AIDk7Gd|s`i}$tng1VZ6^GO)?T4NK&8eu77O$xw0jOD{34Ijnbj)v{X9yGeYfU|%^hO&9ZXBjPnpshx zsW1#Eg%qeO0p=b z`gql|xoIai<#4Qvj`YnK$>P1wYi$be2VoBOeVc>EKf6(fME z0pd+?0rf5*yY-gc?JQ_>K$E!kK2}Dd?8(l^l)29d(*(mbZv+r;jdIJ$XzDGU`-U(b z{P?HXS{=}PvJ<>|c@#rlmigkjFPZ3NuYBg7>|8Mx0c#BJfL1_WEjfd9UXcl1^NI+g zlyRQyJcb@Iww}7Ep8KSTn_Wa0(^nB;WHWDUgTzsLvNKW%X@D#)B8)wYqDI4V=h$YN zOxBR|Pj(iAi|lSHvKVUQrlNJg82BAyI}IG0THceLy?$yK@cWA_2Jrifx^m#RjO`Kl z?F&QKj@@p7YIu>+T4XU$(puzVp!cS+{Q}4M*DYfQ9k|B{*A}^$mE*7d)RUc7wYeud z7g5igr!`u2zXuM^@{i)&>u=)8XPHGN&72#)S+iHT*(N!0E|)#^`1Ut6gK2<_B_C(% znK>6_NC2k*$n>QX2!Os2@x8|yryj5bD*im@2^@D|1sAFMePC@faOCF0$gG>_^7&R; zzLlVRWYsyK3PgPGC`%2#F3X4JqBN;FOj)j~8qRLo$n13GR`iqIDokyW724tu{LtLW zyA@N}{O4!6b!-!BmirRQ%9_6<(vtnP5X?mmor|#daU#Ren(DC3ju){tzK6L=f7(h8 z%JO$eIu%Y+cwV?n+vP%IheA1<7ZKe_Nn>i5NmB;Z@JXRNG(}P&?Eq}*uF@Q8u{zta_&p_iMo+&<2 zq1r;%`d8HzPd=^DGm4Xe@rO}fLyk+7XTx22Lb&Q(&=vM|r)0yM>Q$A@d~F}DIwd0Y z=Cx;6!vy@hbl>@V}Q6jQj0)0?*Ub%>(>DW`P^X zThr*5Nr7ta+F#eIzBd`Ws?g$It)WBqz19wF81*rN-POEu>oIRN&=C zaZ!xDSh8&P5=Hqczo?J_)@b1nYJTp<*q2Q=zEcmZQ`xLFD3h9^mbP4_23>BgsKnw1 zVou^Tn1+fW@`e8mHV#g?kJBjc4X{0WqY7EvDsY< 
zdB6hyQuA|gKYvO0egmiwz|1m1Y&8gmP$J)Qjk_5RCtfNZAgednn(X|A-A@^;1)xp0 zD~GMoAb7fGVfTjy!B7get>T7NO<-zo2BTtfIw69oN3wCL?QGu35j^WMrwlS_=JFH$Ssn_C|IH6O_p_4oI-{U?m z6mj2}dtQXHMomjesK?wqDZ_mLxse=75>*c)Fl<(Imk`jWoB2ipKkThL^daFj48C_9 zAPOPgz7Do=1sM1thoXFI$2!Y?b*-dDOT(YFr-mULJgh^Qduo5GHcQ*I2`i z$7@--l!v+$ULGz*4&;aimq&xkqtVNQ?{FRsE)T*CsY@~5r8v|}F}{(O<|uz)6lM7R>artdvLrncjNfJeI?`NEj=>3 z`IL<%n)ypK1k4eVwO4vb+=Q|zybZsQZ4fg|S{^J8X6Mh{cm!8Zju4W{BeT0s**F!P zQ-woYRe`nNG8}Fa5=WhUAa*S7jW+ zr&8EN0(0yGbbSnMVMf#v+%hh$I z>&5yEFGSyr4hhq)rack;MF=R%-ZuBri7Z6eHeP+|033{r-#lD+L$@F0v2D={jh$vXkuJ-1|S#K2(H24>Kz;m z&TRLje%`Yw@2h}pY^IU;-rdPp&`xJwBkj#@#sd5~rP-}o-Jm*Hdk7O1lYWg_qtnXo zH1bUI3Wco>g-LqD-6OYed#oz!uDiNjk|-CI+o={V*w++o4YdB@n340!Blu)r3%%Oj zw>UN4>RxB~G48QH4Fy`Cq7|s+zgPw9 zn}UnWB*szGw^Qw96*dGfE~~MM&sQZl78JnTs&hbw;p;v421f3Z6sT@zdI5_isBsB^ z8BWk>3GOUw7F$~yrVyn;DpV%)3}pgXF3l}vCf#XjxU;M?pN!c-$&ej~uuN^}NV&1ds97kt&H^@Uuw@L{Wr7_A?5gi8UNeoT)XW218uF|PE!nUlm@sUa6#d-DdcYzuc;%q1_SP^+4-m3TwJ1A7^%?Y0-GwXtz?a^L)jwo z)yVH=0ypyKr;tpuCKPXO6&0dVC$Daz&P{lm+`Sa()a;Ns2-UczGgYbB^XYSmbs_@Ws<6KB6) zTJGL#bd{>QTX7?2r7fSFm70aio0Z0VVdSi|_iwe}N`*f;E&Wi;VdvhbI2Y5>IOk$o zdIn0}v~;U;jWO%mg{iO5Bfn_Nn*6LZKR9x2WA;&w(JYgsVr_Jdu(TPip1(j-(hQb1 zqxmAL8f##3B7cD3t&MMj9 zYaD#Njk60#&LF#hIb0iRhEtU0+R!>& zUa|(4tnuK9=#Jb(luz85PwYAg1N$8gBXr*3Fp{(P$^z8hQQ-BaJc^z?itV^8|2smR z3N#c3ZYX*bW+)tBuFThl`W0hE7{ukF=ux1dFmOZBuP{SlfVnA2v_ScSP`@iSt1nvh z&rV5WY;ZnkN;-cdnu5#SDDI{tWupPWrXzG;-;iu8&G~KVtUzp{zgPc z76<|=vtcGT7ukr*a1$cbn>)xhH+PI8RC5u_E?k;BG|Fq`RFa*4zFWZw=kQ$Q<_tIb zgHLnY`5W_dTW&7$_Cdj`Wesqw0580Gf|h`o>~zbP2676tO=B+7fUjAF%_cV+{hWbvHG)Lf_0B?O(jm2{7A z=u}7>GrL!gzsJy!WRT;c!<0WJ+!K}w4+2xA3dr!MW;!FUWW%-sftq+Q&U@oZFjZ~#zM=-lRYJUx=X#7R-%N8 zio>bSuhaR*9ztq4fp7-}t>Ymu$@+5(b`Em~1rCV&mr)V9{G5ct*65@n=A zFHxhDFHa#K5>M}}Eq@kWoVgqg+|6YONkuB;@c#beqQr!VP#+yWZyJ!x!`s6#yJVDD?@tJrId zg-Ly1r$R{dj0GE)%KIYeXaNn8g6a{J(rw3~obKya2!MaI;6Vk;%Ugi^i&o7S#3+hl zknvkqoFWBL*W32^Y6){*pAD{zQ4f04TXUl3IjA04saMnKM5Tb zgSb2t25u+}+)x;xp>TkP!f3V_gSHqaMyhSw!cK>-iJr<4Si3r`Tb|MJ8}UhYJ1)C9 
zwvh)1k@QN#8YAWiZ0%c6qcPpaaEAIt0!M@!o1GuqSj6(bNU$TfGn`hn#+1npb@g%C zU9pW-h-X!TEi=4-NrW`mzbHbwsV1Ml9RdjFCq+nibWE{Ol_BqG zr2o1>DMWHT4o7mMTz#A7e1M#(M>y^gtP$*BM6iQ_V5RpoEV0_u1JRN&HNTk~sd5w$ zfi~ZVCRI8S(bg4CG%R+ zRkRf@OaBRzD(=2ToBvBBK*DM#L+`S>Q@@nScOIliN%D}oK)q&nISc*pGWMzum2u<*H!bq?8#4o_a%Gw1E?2trgD>=MI2exl|QQoQ6=(nCel4dSOzy7o_k~&3> z)OdRI&CEC?nmKK>{2HRPTi8#o@yz$^f!Qv?W^&ibv_~x7ol>d{QS)OUlCPxFZoX5W zh$jQE>Y?!0*TtY0(x=yRiJ@1&#ap@3`rB+WA%%2lvZc1_TNhf~>%b<+>~=8x_3SYV zLo22d);qg&d(4N?I~4OK*-MvSCzN+=Uh&?sSs9u)iI7RC0Dc+haXqLwWN-gUDD~x_ zkC$pt+*zy_YW%VvzJ%9lviANAst}0oTVI}~XKVCe4FXMZmM3bew?2e1@e_*Q0q}Sp zr|E0mx|26+PM2Nr-Ah8RGPsN>{+&}DY}Z5?Zx3?CB!CT}5D(JKm{{D$%QXv&AC&>} z!ndY!^W+gdq&h&tC53yjFuqdH)NDDS@WH9BTqR+{v%uk@ems>7iZKb*!**%z%^MM^ zT2@LTxe!aFF2q8CrBriyXo-Zpk&zpQQ&w8a)QFT~5-(+`rJU^|DZpcdW74CDMkp!H z7dpaY)z<}>Af@0eR*(i*4PQRZe6&V14x>kjEbTZq@NMZW(lSyVRNLHh6a`}$S(CM}kUo??&sNNvP+0LtC+=6J?`V_Z)SZ~b$YE7!x# zZj1Fw=PF1o#y>iBhPGB^+XkzZ$MVa29NEb7;7at8W}iTkUfHa?%zN;p7wwsS%yQ=E z!IPZ-BE^4k*nhr^QRe;2OU!RdZc4wrB;q}gl1o3AcS(BfWp>N}?*or^Z6f_3;n5Q% zI7PNYdGthXY}N9bigU;G016MCDEva12{re=k_%;MlE+jOevwQ}8lHV6rzoRkKSCQ1 z9fdxBUDcz3w)M20+Q6ELk<7J$Yc83IR$G*D@wSMidOPmGb=R0>5I4|iL{w-U=7-nb zY0ZfpY`$U$jUkETwm1(dHN~T z{1&E;q%MxY0@k-UcCkk|V;`%z|6X3+`@v1)YMq=9>CJ_FEeAu$MFw-sh>-IQPd)@Z zE@X5LhL8xg|5)lWq5nLY5mP3CQO5O`2S+P=e36$#(C0j2PC?z!L5nmLdGQ_Q#l_fo zSRVX4&v)Vb!eTzKTsWk-uX%A7DdS~VM|cQEk0Pn9+ODA4`-U!!W)|)^J2EeFPUPIk zdFN~ie4&*pbS=?DOY|YzwoBWiv^|=BG`jbxTHpSbR}K-OrS0%o@im-eZ7f~`pB1lx zaea}h$2dsYdEfvR30kMqrgpAzm^nDscpltzJ9lSEQd#S23mKC-Q+@9%7uk)m2I;Z}=^~H|d1{b*`e=}Q3|Tt_6&kX3L)LD{t}$e544GwGsZ0+s;?~iC z_KI!gIwSA~Bk)EeP#KqY8P{2+rCs8*pBty$BTlnSDcNV1QuTM8tZNN?|2fQ7U+~o& zZwJ0mV~aB5ZxCOp#aMAml$j&P-^!$gZ4KD@0Ss%r=h%6Ru`0TVx0(ZXZZ^m0)OR$A z%+efkyx4dG1>jv?)=t$eR8n<|)_{&{dyH!>u2gZQmW|5ik&(~(Q#c?;74siWYehMS z%f=&+AbfI%zY&@s#Ouz~h3n2DfR?5Z_W(Cq1)<6HurOhzYe?r;I)G^$AjVidx&~5O zgR2%?q0-WSgQbSARQRyj+f$_UB(Py5pRWkL?o9Ui2kv?&XKy@Lf#BeN56S=&nxpB> z(Y;U9`u1VUPOM2tuE93+(_l{oS|3vLNO|;jbI@RI<`|v&o+gGbn&Sh 
z>57!SQjn#BEL0#JkwCg2nF#U_1x}&+_F&q2dC3$(Pq5s1a*==uUDkdltNqHL`hvh0 zYBis@4xqMPt@EqT>TkEM?LV5<+U=(= z<=PG6=IJ?8rx zSMXE;f>VDB+bC-I)Zgk4?r$UW9qw-*?>=w~@N zVe#JBnv&R>SW#ll`9UrJYBV>q+|PdL;i3Y^2)#r$icW49u#?kN z4}kze=lmtzPwVGHAiA5u&bO15)Nd}4=8c*Volq_X6hej8>6ewoeQ$}1i-J}Qu}_eDk;si1xtBR| zFZu+z_j=?yO;E^9=gGa-s??FYYhuUKpgJNuePsxqB}bI@Az)RRifDFeCU&!x7rdlK zb+d}hHczC+nu4>)9cX1cv0HaH|4lh6)moPm!K$}kb}E>(|K?>o(Q2vV)p*sb=--;) z%aQW#{8=-cq3uHBh>{u5!44w1@X2w-%JGgX*qLJwvZ>_a27Z2`?D^iV{02t72_H5E z?H&~0i)*I#K#LtXe-FY>U}KFjk%m4zbU9^8o*tT^JQjv7?o#n{2gN@TGTk$E$Gu=; zSCn&dD+)f2(c$stZ1OAGL*=Z|og#yw#m>gz_wQBs^5^x&>Fboi#Ukgle)0O*+aY6R zsKw6Ewda={^yzkzl*e^D>3`7)rEgpWN2#Aqvu|#%$33c}EwCEGwW6=EuR9HX{cy=w z@hU7E)PaU`#}hf3{kC|K=~EoJTJtoFsB1rT7?{xxA_q8&gu+QW0G~_Aq};SC(fwQb z&ock{rWp2-0Z_!D;CwVf6Wv=3!UE2(QEzzNAnFZ*fJFE248j7=Z%`+6I2sM2K@jg5 zgaxc`Rs(>>@ZM~kMP9eL;6(Qy1p4D#@cK><+UU$_JSVz$ljP?X2Or+%_>hd>h*xb7 z*sXkBxK;h;TPoj_xjhnrvWcgrcsg4CK+CI)KEEY5ErFueB8-T9xm)FrWOq@sSE^>Y zFII!K%)*3kKxckDWS0oVf;tlkQi1$pvT4gXp$jWeU|txysNT*(|Co(f@7{uPU89Xc zZ>pa53w~p>iipE?Y_`E%rKpaw!k*3*Z^JE4qhZ(a8*Xy<;67$o*W2E8Au>(S0MUWg z3t={n$X`@(RUF*I2Ma2fW*&hO;gIMMd%6|$D!oZ6(QF31HIuC$c3HVImMK0p;oBpY z61kIArYOVOfVS=$sRBy3h+;$ce*L^DsKy%oE^1IgqEv=Jse#QpVg_cbkhp7IfVq(v zG#jk_m8Lyc`0nSko0oMw-iL;L_O_Lw;sS^^;lxsDxFjtT~5Gl1;PNT?}nH0w;Sa|p2lc}4&tCC`-#;T_%63`w3GN54g8$V@KoXk!KRXA zKOP$i2WnUW=lY7INXHT7o;Ff$_|s_QMkz!ON~Xme?4yB+MO__S64A}};?=yFX*OjM z#9ChQc`6W8?6LfxMKhfqIyDHL3qwm)=-~Vs_4e=9Gj$)|{g8gt3a2*k^K;iiOG0Ng z(u~J<|6ViBZLZ8i!Jl@j(}3;iqHA=%)wiJ#sv zONE%IYbg`Ew-9=PD*d0V2sFv}DZ=!L=$z10zjT$}6kKa10Nwv(B~XqM6+Tdoa_p3d zGk;0POX{Gd*MU#kAD39O%R-%UH3}}UX8*n@Si%g%V9*QTx$tSwE_!QZpuHEGFrxe% z1w+CAtXA!x~6!x-yp4ZO@`uY7p6cEPZ9izmJ7zLqOb^Uv=?TK2>%i_uU zqFT<(IAP#U4lsH%x#^@#^=zGz$)?C25(L)1GzR&&<#m*}QsM3>M|eG~95kHkFUslN zwn*>mqZ8>j`g-4njcUW&Q80f#NTlwbcSSh`a2?oP8r9wNPAx2~E92P^<&es{I(VDG zTSQXX9IhaHSxVDfqO-C^@{|AUc=}Ge`jtrc+2B}21QqF&@(F1TZ_`w?f-z--bKK!> zsjVmnn+v(cE{$nVTUlb5DRb$N-dI}ux0b6rnNtT4W$MEsS2LTYc%2WbOe6N7ay#A8 
zWUMI0MRwk2h+)+jQ;t*4jQSqR?jAR-_)H%`C&x3@$0=m}@*iNM*m?Tu&i^BaKGVWi zup5|GH|E4wre-HC|3UQ?oMcU_Jktlp zxBB`nd^Fozcm9*)eq!&`zHlkK+;!*gMY_EUo^RxP%9QOp~mx|xxNG-f||E#{saoe&pzEu1I z$6|@l>%P7T<6g|pSXlh#!+p=`#I2XZtX4=Y{_tVW^f|DTg3;nub<`V?@iUg+{>K3jVs)V$TKGNqG^HnnR=EKzNwJHcXaMHzP1^B z4>b|{2Ym^8{ltNI09lLU!9V;Gv_(F1GSV9;1{R#76knTc(3d~|ELjG$d&$%URj!mSG*&YW{wcTZy;LfeF;evLK>e1Y-srZhDRD8r?Ovm}3)g z3E%rVb8Dt)ixR#MBo)XV_q|u^bL^Awy%*=rRvTyd+Bn14#u>ggc0z(izd+kbA4wpb z?Cua_l){a9G6JH}QYWqAFd%Yc;AA%j>cvN~7fEts;pE&{h%i1G2GjBF9un(mWlb4a zO8GVO2j-_S_!vt*DbwElbOEuhSlL)R<3F>uoVK~^L2h)$Gqb;4kbcabF}YMM>vf1k z@FIiMnMv(eWqDS9309sjR{qZnidMaw=i4&925Wq4S&MHK zgj$Ly5#NsRif>1F#kV88;@c5k@$Cq&_;!SsClKFte9Nm31rgsL6Iz%y!m_lLw7YYr zVXl4o-UA0lScDmP?{SHLP60%6!Xo%&>W7-^CW}*P@{n=tX3cRyoD#%`#BQi@Gm(ok z6TZz@SQnN-i|CE}%mkJu;<#_K_WIOIG>zL($Tx~#`pjZjq1glI!2O${x7GNstq1EW zkjevjQu&$lznl9O@FuHl?W9Rd!9Y@{bOM&Kg*Z~CVvAKlw zH}#zJ|IhP(Pn+-CYp=cc+G}6d+PAeccTvF*;{t>@+RyD<411|3vDfD;7OQx%Tmy0m zi^YjB6Fw)qyb8T~xcHl}vkbS+-p}(=tU=voHJ-DP3s@obk(>~@!0xR$(ss+l1yO%2Arf$hj z5Yf4Ww_+t90q4U_lXjG`4g7}jIyo(+TAihiCZ(y|*}&V5yBM7vh>T+2hq6e7uNePL zG%5>yA5(-C);nOhL)e#1NPdXJ-f7SgXqy%OK9uyGoNW{4W`%iKY?I&ynRHldg{H6p zd!u0`6{7~)n7!Ug)d7(~8N+mDLORT)0%;|Y?jw@F2BYYK*6NxJV*TI=PRodgT44#L z!j2j2%VOiN8e_YNaf!9CON>~WLyWZis>C>rF~W>yZ!y|LjI>zGIT;p|x%M_PgA})0 z5<&4CtEWr+CK10!jlW$`e8?*(#*!Xw%ywlhh&0wCxU~>OMvMW2P}m$4(H5$N!o(T~C2O^W0?#H` zL?gu!uGT$IZ7^aHuurSAFn~|Pvo17)btny(#OGjWGd^AV9b8k{C_Wn%tFv@rC(8yH zgxaE5ofWu2(@L5EEMOG+?KcID|EgG>B_v{XwgwKE;%9?`)!7QXILX2`^ncCjY`Z0i zRp@5b>TD&qP{kSz0+DQ zeba?+x$28b0c)_R6!b+K!`K0}A?k~C*~FwT(uJ=kU4}*JOW&?GXS`4kuGUElzc(oobQa~3h(#{YqByK+ars?5A+tAZx zc{*+>x=dvI*PmO;z17@=vc&Sr8{v_|4JR#_4srfpc- zjZH{IbVtp>Wiq*oAP$5~5z(0#ok2urddQK}vV>M=`y)k)8DVPH0^_#O2}Mja4V8YY|BZ3$zKQhq~C~gPuGeQQ%&^1JqJJZJC74@f5Xe?3_B- zEZ(x2;|SZs7{YmicD<{u3G3>X8y&E!?s#r43G(0b(NhIR-r=DDUF zI~!}-4cJfBj;(Js?XmtVYTD!cR_x2Mz%KbFaB?|Q6I~2cKf*DV1w4H@R6$rzs-kR3 ztx7OP-0+7x>YpXtSrPwS;m(zA#E0;RPlgv815cv|WWkgc8r@iN1_Pqc!ejf)>g~9X=s7Qygw`~v6pi=lt@f%~VyEPe#W;?EyMsvQ! 
zBKj*{ElpR3eC5ekfqc!7uleG&b}3?m>EcGFCBigG?`HXGWnc0CDr#5GNpo%6S}w5v zPt(w$c9Z?NMmgSJ_J@%#IHH>)f)NIp%f)|ERnC2$X7<>D=+d@ z5c!%B`Qm6tq`DLzlD3H7#>m&^$X7$;t10r;9QkUEeCbfpWZ^7Th*SY7=x8#qeS5VLw?CY2fa26z%32()xAcr*y`M%r{%geQ1hi;&nu!X`iPZX zTBmjd@9NCl%MOTLvzWW&KnaHc-gf(@P+_xrFoExil@+hlx=~PuVa@^23uCIFqhvi) zFJC2*pCy@G)m|#71R6Fn?}WMk1S8BQW8UpGi{9%ipq@K>Ddr8W^VjGx+}Vqz0iob# zt)+tNUrVgn#H!pVZpGeAtnaW}*-E9-_#Bl=$8*HOb{H&pG;k$+N)xe8p$Vo?qr-kA z#5L_@Vu3g;l{AMA4YL-s66+jhy+$&w#Mc;CA?SkjWOJPn^tXKX0lNo`L-Y^`I4lC4 z3{%#AXI+CG-3)HEeajI{{1E)oA%k5Hjo<>&*;? zzK3J0;C&oaQ#A46*ozY-ub_ZuG+fFUD8W z6qY-fjG>pjL(UbPgcBY%6r!9T^ahO>tjS@VH<*QyvlMpTpb6%|F&IZmus6+-UN%0d zds5EvLJ>z=6Hl`UA*;s)2&w^@7X*2*tffqbdxLW@6^@>C53&`pbMCv=9cTl$+c7D5 zC8~#UE$WN49L0-iXt+%<7!Di8aAT7DJ{*1-#p&ryIKc55k9)i&PH}`r%(0fEd^Fz4 zuklm}&2?*lFwFhbox(7;2ABm*C|XfW+7eBzq&@McuC`7CT$~_5?SS!JyMu>B5(i@k z3{H)Js*v4}32079Q7nhHS8(dt?ikz|Ju7mV=>vdxex&D#kUkw(;J%rsZ2`_c>7S<;oyX{m@cqtRX1;3|5OkgE)1CPxGp~MH5laRXrxpBq@Rv2dmsZoZUL- zGRRX-DCEakH(1|X;>u3!M|x?SH<#_&TqlCIAuz@70N>uk61+9=cF4$K2BM;LRDBu ze{0k|l}9#|Nk4|4Nq>yKRs0F`2OZK~m5`5#-*e)~21aMCfTtkRd>+|=Azel=iAOf9 z@_Q?>ktQc$C)*8oj~7dZV-MxOvr7r797V*TA8V z@#xREd;`#{{nZn|)s$bxc+w8wDr;D?{SIepw#lokpAof$dwrM7uNeP4anbM;anZ0{ zTr`XWJ~#vN8|sZIZm9XVRz(~F@v(5T>B7`2bkKzRMeR`mjS3(`16U;Lj|Omus6Z+J zaUfFEAQgZ(Zo1&jIpkk}q`>O8;E}?l#x(Twb1>d5HBCN%0*&pd&Z=>O6PL>+0cvNB~(K2uT0h+B+qx)_0o zF$G5;7*E(qix`2B9Fl63BM^{h6vGXZb_5~^C32^+z*f#&Xht9lKnd)h$U8P#HY^wO zu%byhY<{n=(NkZMQnxoap7{QWIy|Zt5WHn4GwSmw$3&n7yX?LgNt}S$$e`K3@H9&Iel=J?xajwg&F9)f2?i z@;J#9BMzg23Y-VSAyuiJ(BA@ouA!<62$jN(9!0#dBN<_+@2n7Sso(%T1>Ttl*~@Lk zm*|!f@WH-_#8P-M&zNDR$}A$sn>lj_aAz_K;JpdKT)ju%;SDC=K=fwt$`ee?M4g0S z*_p!gUTpR3u-Uy~!4U3kDJ~zj0KAJKkF=V*a zTHO_hhzxgH9Z?`m2f~Q8CHKX!DeHvkZ;16 zVabrY1lC>6^uAl*-VF#%srX{8`Oul(#7#Es3B!py6RB(hvtH+6W~LcVF4`}~BY;3f zBNbqs3#n9CXSZUq6Iw$e?zBN_5z-L%eMotrmGm7eYDMI9FBx*5Aj{$g5m#G_;Q2Lf zw*ggUf~qpN94*Ua!9Hs2&$Mlg`!D< z8IxM|T;XM#Fs~ki0XTbZ_9JWv>H_gS_o2>mJa3RYt9A-C?}!XtJU z>y$<}8{VA4W4yEhMX6(6^*MaGkjU17WaugOHvL2NI%sa$fbcdQuMf9L$Q~36x|v?z 
zZU)PMFSN=A^jmcd;nnZKDD7=@TW^F1j5;$hm^*;SgV)_0EPUgFT|_Dr&gK}5?_6fr zS(kpNN8jej`8dz~_)fswX4f99)p+zL-8t`=A8*Wy-(#*_5GQ8N^N`p8^Ku_tp1@AT zo#(-|Jo;9yIL?P3m&rnn@jK2ynkN3IBu8A&utMZPBEe9&#ogG#D~T|7^d;tw*ulDM z&g!{#eKy9Q9zJJDEWi&`)kaHbL|Lk7t0nYQaccTfs0ldaS&(!#tSj2VDr5kftpM^L zy$x`mDoV`-?ua3&$pSaLF=jMgHfO_R}rE~88djnJnWi3idpo;@$AIczP$HGnuK3ltOML#7s_rZZX4 zt;Cm+^kMi2@Y)lKq6F9R6cUH?Z?W28dfJL?`0odRLIf3v)sVe8UiM(3(fra5+}fgV z=Tz{3qO9dWq*xHD$)9r?AFw-vxj4xzm$iZH^?j@&r!OZ~`lF7VT@MUJB|M3lW|M23 zzQg6i);vV^H|i5L4qW6-&6w&u_3WjH-dy(`cY>ldun|!`*8$2#6pyz@FArK%ttU3d zB-`U1dfCG0=>iZfXfs~s)w!5crfwmGo1(G1d~%+ECV0_su}uB|?PYxD3>>pVEq4ZB zZbNP{Qs+VKHD6z_^N2OBD02oqHoZOQutDI;g|>BXev4exmbS27A!HkMSI^!Dmc2Gj$gKSP4hxSK=WFM>W=2M!h-!1 zY+j+pl%n_8wGRNUY+s@g`_RUK`xM%kz!hx_Ex8CsViFc`q$!Y*8afBxEvdOgM*Ir$ zlq75%%_};LmR1~XLABhJgmjYGK2i11$kan!CjK_t-231c-W5<=z5p?Kgz%#CM=KS=)?*gs3*X=sQxYOg?BT*Ww zih!~VC@auxnYdmd!hwt1*a4`cbfelT>&`2K?L1(&)KqmjvJ895Qg(VUj-vvL8tM(~ zhuw?S*oWaw`-0p0xT%jj`s8lDe@EW2R}TZ?8u^8FA-E0xTk&i3!vF%NPA0AoOrRSg0Fj47=w z!f04ViXg5F?wq&GwMVfD8mJh*Ho-H6$j)wdK8T>;7#z@m zxjL2rE1@fkG8D)BAK+k&^qJj`2(pz3Dw$}->Xr(m%8r8_5bw7sz}pk}h_!=#2^w&{ zCvesaX`Tk7CoPz2XhB!ksIU1F`y`(QKV1;f=Gt2M72}@+x3DF0*66J@P zc8;PH+FWo;YG?}<)JQ=1T*3o18xg^Ph?pe+koF=70Cx!hqEi8LX?p@JiNtZUzb?@QHQzqA!uui9alLu!7~Ms-3*9`BJWK~gybliK4j*-$a zx0R7((@~Z10Jf7WXjKXKLjR~jO%XohnJ8FmLaMB-ve%O`i#icIOU&lHhLaf~`-Gz0 z8cYnY5C`tRhd~KMo%;}`9&z9vc0{tUCdY#j9OH5w?SAgc!5Mple=-l+-@t?Rf8as; z--a3)6*7a+C_J`c#1cbrGQpU~;F>0!COS>hiS&&)?w(+bI5A?!^g+aluvKbUAR}y* z5?0Xn*dydDZj{4;-b#$LT$r((ZvYOq5d)n520qhY4=uwdj=xtLg=(;pURi$yb}YQJ zULjG5Zz!jA69u-7*y5lrLtrZjLg36jcq6cL-(W^x2MG@@Yq5X^vDuEgc;GArLIQ^} zw_uRLSBiXP$XA|x70A~N`I;|Ya)FUy)~*4+C2V8lYcs!EuE)t>@R#}nH`;w&!9rG6 zGNyE9?xOn5=?AtT3UBVWn~&>Fzl8Y3u9jjfV2b^A9BCly80K-$4~8(bsJl{K`&ubh zn7Fcosg9Gpj>qvzO2~jw@GBYmaJm?@Y>?|r*o-IEnY=z1k#~u^dsd1|e@a#saD;x+9!; zd)6OaV!{~1yy|fPRAyq51Kec$k=&GHA#!tJFWd~bTWnjzUYIRlW_!Z(DkGm4jAS5Y z8fgA4DPr5n7Vxn>;dQ}75caylLqtyIq=MmFAmOjKdbWUvl?f9B|0a-s`HZR&qtn8N 
zXK=}F+QVR9Wx~{rQZK6ll8-R!D4=5ht9ZO=MZcx;c?KWKH-U+1YG%xFCI!}x0JtiL1CQGYLtQOz~k zm!2&&*ZED*8FttM??a%s!HnCx=9jSd`aP^<=|9F~v&#otx-Wg`!Feo4pc4WZ^L70_ z_4asAz++qIsdrlf?>o|3()KuV&cPs^!#wc>Iv`ksa|Ft5YXaNS4%pK^cI32rd}r+D ziC@4#oIkKXFwM5PX8Wb64DS6khnnGhRj9*PgBH%4h4>N zH{E{RwbNnS$ZX77}3UKdW)ETmON1OFQauwkH$S ztZJSoGsUaN*^jHU3o|qH`DUvr6+fO-3ns_H-xQw>e^YjHR-n)p!gND6&S%(5ix1$Z z?5psb+qRhi9M46r%d&sahacN^&J_OBwo@j!$6v_A8NJ0Nwour(?QK)|LIKPif9`5L z6borI&omUmW?u3oryb_4haGy(R%90rWoLI;w&N0uJ~)*-xtnK!d7;4$L>)6 z6-f=Dua+!!*h0dC!(=Q+kCV&M!KqesdpVv;#L@_>W|c@1+xRgf7wF)eCzH(`%Vdq$ zVcsgca4ZnVU7tPeI*UHD^R#(u2L`;Cbu=@X<7AN@>u#Gu2CTbn0;XX}PRR_s0kzEt zR~H=i1P*gwEcC*8S;P68I|4mMwr4nz5&Z~@9vmbO%55PvpOWtUilj5lh~E}sE<^S! zI>O8h7CDg_E{BS&Rl=3y5c#PtIICs@88D4yW|)nuv!{t{EXgj>WFwS+SyE^CUpn3+ zwYee>)yzW;*uy+51AAuhp3~KPA>UDB8d!-oyL^SVSH<2k#ObcAFE>g3E&)>$Sivxu@bySmbj>sQ)t_4 zUbPtVhQ=3bpF}OU*Idy1XW%!+KN3txzs=tG4K~SxSK+gBtMS`D`X}?s?=XdJuKOCl zj0SkQz~c!OVl1AET8VT&M7kG~kxY18AQ2b2LHuuGh^}NZ^2*W2dm%GjL*`ykC-{LV zvyj|jB*zRco{Nfy@L?r6kxwL7$>fY80)((31b6$h;xC@-5h31{DehMoGlh&Xt#<{h zn4_!*v`LVkm7_oFnnbQhBKb8>aAph+T_(?pa-Mvs9Q~6fOU{5za6Xq58AK^xg>IJ} z!$xoLK1@_p+g6xsbNZp*M7}n*tU8Z6fRg8`e{n44&9RLHMehjSo!|)0wA5dNDT43u zf7q`7$?N;kuD<{q0J#es!G*tZ`#uEmHFNcJ;Q+Kk#UA4t;4)NGOokk z|I^So_wXO|_P`Gy0idtkzAr#lZc%V*GSk;%hUQy~{=Ca~5Mhh#{l8$MTM@^lNYsHZ znJ9w=tA0lp@lX#$mO7c8#g6{xy}k=5keHBf@$f_F%eN$#fjtKJW-UgT??d+f<-755 z7y;S$eWG{tKMf|a8G}(EL+~m%H5L(pE)kIIa`yibC1TW{N8mHy4%0)zq#@UB|h~*x3G{LHVqQhWmb?Ct@FtQD73r_7X6SOT* zQvDkIaQ6Q}_{E2|_aD9+l2?)xPAhL=jS&nN+_v}soUIhawD2OqF&5JqP={&<*Z{f& zqF@=Jk}1Kefg}WM2pw8H{4_d@{gNc+kagT7;*mM^`pz)a0%!j=wRp_P3=2vY53-_$ z8-z~tj3`u28iGM0SZJJnIMzGt?($Hmv9$O!1?fQ?bIg7VqR%`29Y|7d_E8~ZVZ@=d z>`U;_V|y!{xNWZ~?5DiNnNLXCKS9#+;Abc^Tutd(U2uW|1-${z;<_8ZiN#iyf|Jfn z?$ex?J4y(tKX|awvGH(h$Pk-dGKx~ln?3Ch7BscH<4-%yTQ4{`1zlZm4pOia$FY-0 zHG~u7(t076)*}ogQuP$P0nMGl$W3E158`sp?2<8n!ku;1EYXWYpni_!rmu!2H3Szz zfL9mnM|4L*H3PH#R{<4r5tYsx=%7ViN+jbZUaaY$oNDxToS(%$SCRJg&YeN=RV_qt 
z)x5;9BJJx!hAXnGDFI0VBp@WNJMaZ_bC{|I%FjyeB0o0icz9Sxt9eT+6apdHpA(WD zThA!j{j>4bBcrp0exN8P&dlSm>YD9`UV=P<9L9SB%jO|yDb>(Ym<=Vf@4N%IBE=36KOUp24Zgt-cyXxfY}7P9sS^D3X1y@1x{@!jBoO-y~K z`K6CwD$`}aS!qu&tB6upu~bBvSuAAcAFxt#4O#%nLKT#f?U;EbuZcC+@eV2`;Dvoh z>|-@=`6pChMP^Eov|CDyCMlA#3WP{n3uFWV{_mm^hJvhe^w)|+IwEB(7zLqcdrk(( z@k6}VZoq6cgv@5xXZU?o2D0~}Fb{Q4un|IYZG=Egg3M9n=pTAZFzY!42;QdenuIEt zl_a`|S>t+ya0KVlU{X@!R~9O=q8leoY8pauxG|7EgOhw&n`(Bzd}l{ao4GEtr{JvH zi5ov*YSRN{35xnO+d8NmG{w}#@@C*8?5<)DfR151-ak6@J6e{XM%kOUl-OQnOCTd} z4A=xC)1PbcVl)YGtK@Iue3Fv;a!YI*j0ZFvUCV(%GSYU58}K(zQV}VL8pE@UQNzm;>FOju$d9Ljxkvcl|<(Gwp~AW2tA|^%LTqfm6=3ZH}B%&cI=(dE$4-sxp*XCx=oWIdYo4zC*Ml z0On2n918#&>kc}Ci!OBp@8}01g?`qJ%SsP718R03l(q_{+1?c6t9)i5cqa9i0x!OK7DKDQVrb>Cy~+a2H`ktlM&#tc3R`PAu!`E4 zjE<-F-yBRA<0&jd{s*Jt60zxdul@s?zAi^k!2#58LaT?Sa2z!p>t#;sw4oVdHQ7Hj zPXI0A*NT3<-6RI;Lua6iLAi0rFv4r~48@4uwyoL3;k$q6d@73j!&%#!IKDyzE5unR;7d?1wXO$xK)Oix36%^}0! z?CLc3Q`H5>xUy}0k}KPeCowLp>o}>zz@_-CA}qY;_=~UwS4lLrYM$-N(1>B#4kQ}N zzdq?G<4sNr8PGK%PV}AZEsweb2gR5Vv8O*HVnqt&&p_&+8)eMJBIaT-6ivJoa}n7k zmm)Y6Pc{;Yp+<0~#MZ!`*)#+mRt|g0hag#!{9@)oZo823q~WyPy!9Z&4zq#14N6FK z(}(~Z4A}>FoM*7yGN9yCmmyhjQq9yR$jU2)tV|0%<;?alYY>u!n*5M#F?_?A4c$~i z){~sy@PlP|=QzM20F6b?oR7?_!x&uA!u^|c1$U!Ld41E-3VwjbWN-VxssF$mIDnlZ zO+0RiUS97FrVqtgk*TZ^rssJDM)~vDAEH0VN;Oo-AgE2qhLAfAs6G2dRXB=cvZ5-W zRFDA_L>4j-n_bNeP~b*0@SQVw8x|aogtj@c&SUp>hH4x;B!5f#2WrF}uL4EVYV0tnsHiBZzHGyKzUo@b$9x>s>MY&ghh_xD( z?6{f_RCLxZKO&Iz3At(tf%;^@{@gUyokK_+t7WK9q8ywe8?4IPVv}}-2X=hK`eXWM zLHR#R$_qJP3d$32K{hQ4<&z-{*{MC@_1%YQk<*wKVKcr2(;|yVw+(a;vpPUG4yd!z zk;>uG<_JQexQ27Ly(O4^l2K5N0=enevbLBqMXMueBL-(ryB0JU&>Atl;s*b)Nyx$h zd3!k6(MIBOUu5;2-8Eh>ld5N!tMqFVn;aNrfQ5=tr z?@||z*s>Int_$&Bo*oHKhpE`GdI- zS<%r!@{$F4(-iX9L`Yt4ndIi`EtFHxEuP>Id8<;rKQylrdqE%$Uxk2}4*L~*-M)iZ zA=uyv>~pp41Dh;ak2>^6YYz2;;N?s~WW+u`kXS4h<)1-A#{QH-+g9@`H>w*hK&ZV0 zg;#SPt22jC?P|`$=qn9d)SQP#R4+b^{+TuBWBoZb=i~fCkyQ&!YGO*jyfPV=mznF5 z@XKiF10$N1qrWk)Jl>x^-{M#GG15mJ*9&bx5S0oQMG(c%FzH!2nj?7BQmuf(5OVlH3tUEaLLPlPX3sG1TFGmz0ttDdiG}L;4;m<%*OP7X2QqAzwjQ 
zyCAGfqF5e@B$~wtd-5ElXFcN$-i-+%?AiDWVqv1uSkjBaSfO|l7;;x9F}lT)u9jSI zssSCeCGpQlNu7@oOv(==3nnGL=nhUb630fIZo0S(s&2&(|DhD2M$1v8PGWQp<%|j^#{;>Jgl7cqI8TA;{@FA(;F~D1YE1rpHi)nlWs6B==E_p4c;@ra({&3D1U%{?jY_ z^ss|%dy|my=xeglDB=!K#7TiK<{Z(#3n7kLrvyVX+yYS1@uTQ+o)KLR#4h`HKcp?8 zkiF<2Y*#viJ}4=}L_Z5UC~GAN@UXDAJq?Caw%1&i?UAA++)l}NNJ>6-*56U4FgZ33 z%EcneLuw2{^N{SrQrWzyjHhhJAeT}do8Yqjkm%9W!3;q-#5<_4icCvi{eH388e_zF zBK;`HRRlb`DpMKzRS!EZaQ2AG5MmK}630fmuS&NEed&D~i6SF!wD4^z$iqTC{6!;%9P-n{BE%4rCdH@%|y9YV7sd7paq z+cBotfa>x{?jwRaMbx2{Ns;^s^e8um`>f8%m$I@Nhp_nehMu}C+YfaTtByir1n2f+ zPe*8*H)#)bL8(G?QGNtFBCqeaJn{$8q9UG@JkQ)GqT}`Or1qg`fIh<*I%SVqrn;qtWSH3D>)JuXdVEXd&aeiazu&Cf%>@>iU;O5n8`5+pw1jb zNko9G7gbguyD>eY)lT6P=xu z*Ql6X3L^K#mKt==5&5pvTSXbOJ=tQ!t!2?bQj1G`v1Q+noNQ?+#;gz8MD)4U6BG-~ znPyUXm!5B@HkoTzcK2K(J$I)z`4@Bv^_cW{4<4|T zq={~w^k{;|4P88*kRE&C@%t_w8>Pnw@E8cH8ZbZd^ep^*6#i*mAr`BDnm5gca{@~q z#Ac`^4_cRXkW;vJVF#`tYUG}=6igE#llXPm-hy6L*NL)6Fs^j}G_PC+PG#VcM|XJw zg0G%{;3z7=uI-3UH<_#c^$(8BJadFj*UoVx#k>Cgj3_9BpgRP9{f$_}xiLk26Dr-w z5qF;X?`JwMqmO*LyUl-h*y7E9-%B|18n4A=h`#(i?))i}z1|e(q)8JejZK+c;B|OY z#!a4-vdFiv*zcR2GN*LTyoHNXip$IA`HKBz^X8_^DlRK8ElI_w1W&Kwe??gAkO=I- ztY6>^_DKYOvKQX!34|wOpoJ|Gm_!o07p#JHtukHEMQkaU_e!L>Q6&RIRlOBcRCZ7GN!inKx1Hgz-e^^_OG|M zUBJ9aeDIYHc-3rs(Cj{iHov>EqaV~JOQ0eJQTk*fKdw-Vs z#NJbG^Qe#TaS;EzV0Odz{wedQU2uHu#6fMh?>y!zN71tZ|HrY#m+o&df4Lo-V)mRe z7rjG3i$mWELOM^G@7j&<4R|-YV1R1H?_@G(PPxpZPM&g@ErP|f%p>0h%6s^4K@{Ox zcJq}D=H-p#XmEa~Gwn-z&2jAJY}QxI#RlxFNhx}$6UXg=*7a>29$!A?|t`)ZDcTUqxq`e7kx1OwcA%Ly+SP z_chbw$CPq>Oa=iR;$hcLes41VmJHR81d|9s^)hvn$Y!$&+osz3Uqn_P4C~ zD~gA4b&{BN!2*!6hh&@tR?*>_judk(CrTxez`e5BynHvdvGmS^2nR?=cP4}`1${^mvSpTo*cGpICNAJ zF%8#mVz6Z7K+HhH{Q(YIUMyN7$g49VjLTPHb@{w0Tspwz>*#H`smm2iyce6=_lYfJ zW5FJe&ujAdDiVV?p|w`qY@T2l&c%6b@zBgC1|9uzw9aEoL<;Y5B$496O87|V&K8*P zbzy(TYBY-PBlU$hNe&VtlndCW`Jl&8ZHtFt`3K!W(>_mNulpa0fs_FggPwP+xYm;P zP-?!nx?#ZpNwg`(P@qX3LpEs)#a#Oj`2|h#&+^OQzaKtE{~h>^^xkBytCEn30%^$1Pm~X`+j84?h0NFhz@-}dO-C3?Dt&3>k0EH7_x)6zexmFK%lz< 
z5<_~mbP793LhL*0FQV1uIu88mmzW&&6^Wu%puQ)HTI~tkInw2G*iz8&(4N=~6VZMw zuHcdk*G{L<0HW;>N8?fJKX?1uFqr+9PBUuivafZm&^%$tPHBUFhq4^zsPD(NR@VYT zcREG$_0(U%tk{<=(H%oWeQ>d>Cgi%~PW3vy?a}ohN1!zOXp`3MG@+Yl`2ez2+(#C8 zujWfVI`=cK_rG`bKPe@JdCHlVcaeMS@5B56?9~H-U6i~3M?zwqVh`m6!Wuzrx75Ww z98!wEejUjCJD8?&AIohbNXy^Qztuaz$vKJ8W5C36TS`khfanSwC?zHQwS9#@1yuO; z{w%#gg%e5pPdLdz7o%ghi2h5*%E4NI`8M5#^?Xj=pu2J0xUnfCZ@l@|w3O`8SC7s} z8Iv(4Gvn&atdxl3fI-=yu^U--L?`0A zpu~6aqF)!^jg$@ zw0xv?F7ax6fpM*aPm+9z_+ID+n+?AP1$HMB7In+XI^kp;bw=u_gLQQLprv}l#D2Y& z;Y}CCl=bSty7zMrS!L9n?6pZ7M2qbi+8|nSZ*aEko{vGa_O5UiX^aLg+NgqJ5IsXz ze7nFGYP9eWZ83t+2Hy3lfHOw)E)p)pNRrL3KwlgK{4nq?urc8GJzk3 zi#7xJzX7h-*`oL<^79P=AJGF&fFaTgN4+Int^M8MzwHW8Tn5ofdwLmkLwf=NF+_U! zuHFXS@45mMmqC}(2RyY8xF)|E41<=Wzi~(MB!4Pbu)c z9zHBbO7gBx8}JiiPIcxfV0MpueuMt9H#nE(F4ojT(Z>Vd33#+sh$zI*9eu#LN^^0f z*T(5=>xu`4|r`K za9w%X%*sV`=1gBSe@@Aq(LP<-Tz~1pxy9uYFx|JXwAf!dd4BQ2MWvI=%Y3ENXU)HA z%yj>3z)Pkty6>0qXU;00SM1Nc=4Zmpn=`+{|BEpzSpU8Fitnu)(}UJNV(b;S2OqlA z@#{iqxz}5Wvbkmc>5FDp%$ifY=$`4o(20VHfEG=^YRu@Y%xf~P>fw=jjqu3QjrNSo z;7=xh#_;DV{s@n&`E^ZqPN)eo_P+fxujyJ_rk9j1@-LjXSU3Gf&%}JYXZpnP<0m@{ zrx)7uJK9sNd#IsM@-H~-1n9)oH=h%nSZfvk-wyL;X>W4`xch@OJ$GvtdMim z-E=}) z9Y0q_BjOo@Zd}-v%Q+lA#2<}+6;9I{e8?Xs z{#E!Qr2vQ@tAST(;GAbt@mFc!*yR_2|1S-kbwkB}LIc<4_c;w*o8QeExHi9AH1Ir4 z{M!{c%W=F0o<*YZbkFY%3cNdhkp`}<4__*9@@Fvq)qI`T;NPf$n>76o7w}a4LpR0C%{SD(!qxZ3B?uUP|U`%)iOH&*$q!ZQfK!`Be;C4W2`xOS8~Lj%7=gYVbC zlQnSKf8xWJm^l|No$d(@{C5hBugx0xr5boh15ee!b)y$8p5rf`iMM~De9u-cp6{>P~Z-T0RY>z>Xp z1^abc{LiG@BZ3|Zecyq5>h>DkuKJg%S<6!C>i3Jf^UnrS)jz30_esFd>&O3vVBJbc zTiLp1t2ZUM?{w5`bW?UmFPz7AG`eflWIWql=IHFc(-&GU+u!K!Bd+McsyE-rcuY;( zx5$s7mO=;|7&d8K7+`5!pa=P$Q#uzNz}$KM($QGBxdJM-?;eHF(%I8zEi9g+Xvf+P z<-hmMzK~t7A}%A5C&zBt^_e zW_I@i)?5Ay9)brR?YukP7XE0Sck$g@`jtxhE(MqAGwtr__e0{nr9b~i8BjZqPe}Ll zV-UEv^c#A~|A1c7AKORzmJOszr_-(r(A_=#tNTd5wU_j{+R#1!3;IZZ;}x>}wCfDL zrO$PO-twRF$DZZS)r;=wztTtgO=D$v?Yc#8>HkL`=@%&JyBG)$%b&dKp8knG($7;A 
zF?Ah;aq#KR|L^)pf1Q&43=vAE&#_t#wM|;b^?q+!4VVluD-TD7G(0a@N$k{TyIupJ^ew+K{rCDvKd+bcsn2&$e-H#mZ~0GA(pTr}RUUNDe}9DUE&o$`NuT3` z?&)9GNBVicGW~1%NPm7W>2ti%J^eo_>5ow}sJy2r=~s4xtKrErKCAkmzex$Nu3t#( zp6xf#2mLD)`c=Ws`jd)(HGQ@I{5!%EhF6y-?{-=LbSYiH-JQDd|1kU^geNc9w`d#361{#O$i@`$S*aV3cpqfuave5)@@b7j|6|Tk4m=+ z8-wup)8(lE<-Ot_8RI%7LDs)N;9m`|)<1f8Pn%IRPnFR9iXKQlu#I&O&pDEABm$z- cZ7r7J>q(sE>7IYif%FzWWsVGQ|Ap}X2d_^R(f|Me literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_sync.o b/third_party/libxsmm/obj/intel64/libxsmm_sync.o new file mode 100644 index 0000000000000000000000000000000000000000..1ea4199fe0d0bd257746eeffc1582bd4f6cdf1aa GIT binary patch literal 12088 zcmcgy4{Q|InV;D)_Qno78*ne1#0QHCJ<}u`Z-C&v=E@G>YNN9=o zzVFR@Gqd9v&|OdW%+9>|eShEgzW2R1Bl?r6$9z1`vBq;>=d34&a@?UsRzFU2Y!40E%<+;2)Hd<24~11F#SpA<&CFm=_1`2CCX0;>81Vp(l>iE`HP}4qz0d zZ)iEG=?79{lXi_*UN38Z{;a?JND$;x=^S;ST-JO2a{7&W?bA992R0*XMQ<+sLaRKC zcEkZ3i#)U|oKqj-#lctMNnaDv)}X(X>&UPpCnu_G$=A zcq>!sE9>;cHLgx?0OC8cR(_&h*chtUDkf!hjE}9U=>KQ*EBiHlTS(i6KKU?C(q`$K z;}FO_U><_9e(}H=ad2HepEvr-rRml9V9*odNP4lf=cZ42-Rx#BTs%mEcBC3Kuo*0; zGk7eqFUw8Dd7cnpz~!=*^y8o_CpcbGKj)Q&(rd=p<1n5ke?5P)4jh&ye0(Cdro8{6 zoZeMoXd63U%=A-4O9sshvG}xP7|_|gA(L17H_Uf%rBc>~wuH91>T&IX-1MrMd>-Qf z`JxnKuJq4=(8M9)wJ`D5Q+iRZPLKP=f6djUgR;=z7e`75CMEItvBbX8qz>bn6bDy< zX0^YZ-@QiG8~oBCV{m&=@7kdX9|W#}m&B1}QuUSWf;Mqsg5$0ZBP*diK6EFt+>@++ zN13WqM*C635Z@qiz+C!>F$^OH&+U8^^ahS%uj_pTi( z{|~geuT&iAu9CGY7w3U7W4&A*rkJd!a*!^k!6GP?qf+&W><0m;5-f3vpt1-(wU2 zWwEU9@*7#Df6ch7ulr@~y-k~^QJ(mtZwN*&51!fmEFGt;m&y9_X<6TGib8IDQm6L? 
ziHek7R_l2mb`LcfJ1zF!wNuQTNB>ziRj~VMEwk_EIkFZV$ zFN+7BhQUNH8+n4h0^=t(#!&x->6ml8IMgFW;Ju+8>T>aM5)2u}HqLQmWCSoSdWkls z9aSg!ocL2Qdo?;bct(6-6c}m4@ZmZjxttTnguzj9;JZL1RjX}SI~~2QzF(r$syBEg zrrz)=52-fICn3iCt0dMxy`fu>CJ~B|E+M zPkE8!`tRb;=)%ix=&hi*u9ZxiJ91j+bDU68^8_BYi_^k`DX8 zfYg|be$;drhlxQ;Xy+iL_h;`gOKn15mK?mUd`u$u{~NctX52CmHRduHoS(e zN6)|l`g^D*qq6qyk&}pwk}(m^r$~68ym&aF<#GualQL-p{1sq?_#Oh4c}R^C^>d$6 zGEVbVy%aP8FKaVQJ6ZT-_|%Azr0VM-bK1!K_(8Ejg8VI!?Fl?9e>W5jt>(4UZIH&N zAmHnYvX}Igu`6-$h2wE?K! zVg6#*#DO2f16}vVPSrM?(B6VDspm_Sb!whhR;hWPvRusz%2G97q%2VLbCmgNzF2`A zHtIjzp_JeHnv9Jp_Yf9G>H{HrUfO$uB>&a^!ZZkf;fmZ4_Ei5 z&EsvuBy7@aWI^Db(Uwz7cv}z-aj^ZtWYrdl2k&l+9Ti^~6Gz4@hFSE1w=injF;+*# zgJN53OdNQT#D+5l*%o2-IiXEN^XkVXij4UcQ)jRs%wR#7!GbV@1!0C1gd?;d9LIv- zevx9IDPy)x*)M718D4$|o|dVYx*y@CTz+OHmKB za^uGFJD=eAz94_!ohANZi0pE}D=6(Hysr)f_xloo&~t(u2&+ZDZ_{|${ z64G;X_7@Ea2T&Z((ZoL7$8pA?Rl*PQjIz&nth4ETxOn-nK=1AVj=q&DuRv=H1EQ|zaRfi2bNdlgg={y3$TQhtnQBgLHd zIw)?pw})cPo(%HOQ|t@W3BftwgCGrI-1iqHFn*6={OU^`@hkJa5U5TC;u{0izyPA? z`#Gu$^>R-KLTdxz&FDM&d6Uu>A`Rm56pc$c1|FK-&q|6dGDu_&9;Mi2#H=|;1j4@G z8`%MqP*X|8gdW? z4py;C6TpI76F3&VLaM;Idn|Bw6f)8$frEwN(gbd<3(A3)aCUNO0*Ur)g22t24FV!? 
zEc%6Hfg@F|5G4F8GYUxp$?QT5p9JpS*&xALB`YKgMnxzD34eGNhJe7a>@Oq>+?NXh z`Xq3RJ@6$S_}_Tol^*z4J#bt}g$>t5F&8mGdxi-%++=``z*X5F&Az}z&Az=Oa1YrZ zNgroqo-LnWq8z9JeR8%90J!0`9{4u_pJ!VaWX}Ptm*CdAV0*_tf?Ml?>>q%=2tPNw zFMHts0{A@JIwG|muww+b))A{g?-JZvM=Y~HBe-Rm>@k4hK4hT)(6#{ZdGN;?+jmwH z+*)7EpA^BZ^~L8rg%OdjN1Zf4)cft#v0;xmO5otvj;E0PQHj znRnJ6^ESc5RGKqaT8`k0Oq}dB@KgdNb)IeAvC6oB;MTfh<61%RC8i#-zra(9;MTfh z`$(GLl_o!nTQA^laoCCcK5IR)nb46Re$MtH@^jWYWpVon@;jOFxubncUvFn;bMKC> zbR^9w?X8jJ53g7eH3?fEgc=((41SU>%BXA(-1k+u0eGg?RnO1@Dz2LrLfa3g_&U%K?-|V1SJ?8(>+Ux8w^Sc;Ab*Q zb9Z|y2Y0-+cPW{ku9l8w#3AiWt#k&`+p}F(vM{FRW~HN-h_tk}_P}c}&1u|sa24Lf z7Zzww+VHQ~@a-OW+ymbRbpjjex9b`3z<=$5zwd#6=7GMw`^s!-_lkj6W zI}zYcZ|$z|Ar79gIrWn-4uCuP!^jK``PrU_&l~Xq4RN;D;xnF)up!QT#^-;>3pB+2 zq-R{FXdiJlb^_4=A7!-li74wCoS zaO*S&-xh56JvKeB+i>eN$8l$E_|jCZow*w}N zsit5ACYkNv`@(UUD%iH~qQW?dczbEk-CZ{T!8Jb)P;K>^X?7g`b=KJ=DOWrTT6gU4X=_bVDfd1pT!0A7J7jg6d96hAi&pg07%YQuQ zy4erW&5q3z+Q)N-)Bf)ObF-!70?f^Rm5Y5mw>a%z1{UCiQrpws$!fF3g5BV^azg_-);^BX_i~p4_{)=$V zaLfNHY9DK^y*d3a1$nplHMrRSDkyU1&;1_u8>oHRZXEhf`(N>}zm?kWqw-k)@T};x zFH!pf4#L*HPwhKkGFBTK_XL%%vV+k7A(f9faEvuJcHczh4b-}xr1BPG^q3sOyQw^+ zK|wo39% hk7dH)A|M>sM?bP0aVU3=|1J;tYI^Y4=}_*J|8F%xX^8*; literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_timer.o b/third_party/libxsmm/obj/intel64/libxsmm_timer.o new file mode 100644 index 0000000000000000000000000000000000000000..530d82eebaa3888228b02565e750deccdc67a6b8 GIT binary patch literal 4312 zcmbtXPfT1z82^@6g)O)Xv?8S1J_Au`>~pua4b|9e`8$WzMy%1Qaaoqdu4Q-Y?kicf zMpHBad%+k@)0o(!M-wl-3B^e2LE9cUc+*3rVuIpZdi2CS zB(WIXlK?imK7BJpJa+vz7G4$-FU|W>6VW{b0IX-DvO}6!@|kmN_V)CzLVtyYmq;Jk ziarm<|FG%b+rOa&u-Tg{gtFYWF8Kc9Q_LFZYGBjfwlDg!G>FF8!Y?)pUlZ_UuKVGp zyL&y_%$I>dU7d(NzA&|Osh*hed0ZGR?tRi{=ef{yZ)mD%sh$vzPjRr>vt1252()p7 zoyQp(eTan6;7?OKq(L5*mj`KMb(O>bs-lPl5%c_#pCASsm(7=7W9@Ss529GR!6UgU zjAD9%R7n5#(*VVIKXr(9yn}Y!mD@!-eq}M($9E)y_VEg4iuM<^I#DY+bCFxCu9XkC zKdOUQRDG|M(752wZus@I#=>{hdcqj6UjFv$1$EWJh*$!u;4yr77Q(j1`}muL5ngAh8DC26EX56tL?OL 
ze$==Ei|_PSQ@e25S{LL;*2*D{2l?Ukg4i?IsJ3*Q9ZiNh97 zYWz$o?dFPwRLRX~EH{@q#o^R&@>t)|Wa`+^(DQ@GQ^$|?B?nWGnJgmz^{h)SOsZ}< zu!v$_D5+dwqNquiGwI2|LLpbkxxU1f311A_$(-`}9wb+RZ>~p_@(8wn@Kz#vx(5Do z4gB32xXd(F7~Mb8KKUXdzZ8SSpCAxvFE+Y^BtAkQ64~j%CUL4eBHfOxXdQpl6R)y= zSm1h`&ndX-_k_T8`?dTwDe`n`P4TPwKc~p6xUJy)DHHPp3LczVfTV)+pA%v}s^G!t z1UReU>TUPAg3B|^A*cLS-c65zQ+7*Ue^QRzY{dm3=+yYsmb+^j8iJonw(78?FA=O%;&QOTxX%^W*vOD_m$l=o;S}< z_GD_Jl*Z%!e~IpCdMnkvC7TKQ7>Hz5dKf2!=`UnL6;j5!KAjt?lmDt8M_sa6D}Gt; zD8fm<(;q{W&Q^kRgU?r@s7D1vcMQqNnx;F1=tXQM|JOJdpi}g+)BV%^WAfiD97#Xq zpVp}RcM&uB=OFU7oGF~^{y(ds-z+vtzMm+@N!R}vQB(ZgBG6G!jo;C9?)CUDBW8;K zlJGCzS7D|qetN%6{%3@vicqBd(XT1p|1x4G|8||H=)dFMw95Z=#7+L?PmoceNB-%} z(*6I182Q(KiN(Z$(DNz5g)P{0eR5-3 a>Sb}E+(y1i`ufSHN&gQKf48QfP2pcW0Gv7i literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_trace.o b/third_party/libxsmm/obj/intel64/libxsmm_trace.o new file mode 100644 index 0000000000000000000000000000000000000000..b8a118abce233b38974f8b75c6f642711a518d97 GIT binary patch literal 7608 zcmbtZe~?qf9p7B|5rMvUo)oWw<*Y&P5Mx3CiK6Bm3HL%@fXX2jj-KHzSI*uYGc`1=yY`IbjtWgr|nFuqa%~^z*Ad?;ieRSFxcYHjDUg#QH1pS-RvfN zo21UP-?{g8_p{$$``z8|?#n$Gu~yZ&TttwI+)2tUff90IPPrap)exCWt|87;Y-DnS zx9lxIWe@#2wjK5!K1%3PY|=aMotYS%G0xc0Kp;1UppdtD7h=2%t0Z&YXxD!}8vMrw z{g=7*=}~*CnJElj5jaIZ+`Pr;7|U*QJx8Vx5~W|dj_UBBp4356a_^!VW_tfp;UBf}TDDP%Lv^Pd7rR;mo1!-g&XiDLp-x zwFe1kYPPbrcU@p&KROPA7X6GVHNfqh{^;*Pg^40tpxgNgp}V~GvhA>AdF#W@9$+I8 z5XcIy1k)T=3PBK zjy%%rTlD?saN(QTXW>nlWX!q23cjPK_k%H$9yjSYgH4>ErvZ!6e?{q=(Uv!2Ehm66 zrnerB1;0qmM2FG87(My|m?5!jj|YZTH+W5tPsA%iPdk0y#{2*<>hnXe?vLm1V)bG? 
zEB$X^qw4U6Sg58b{)2xN{zsj8-Y6Z529N9M&%h6hp10`R#O2yIoHR6@7o)}KPkq8Z4loWJsWE|1Mvk05ntvT#1`ULZ)LkY!v|eP5cXCO zOhug)-t3kF?SRz)Zf5=r3(umThO>jNNbsB~ zCsDac#~@V|qXSEw4kMU<5NwRdQefNp19n;2eON%Qp__oo?7C!TS>g@kV)Ufxw12tI zq8~GT`5-Xp^I=XuHRx%2E`Kd-4QLlrSN>NQv4#5$qBJj(Ikj5@D}ht_Iwlr^jFA5( zZalFbK~H|$r9#2L#ZJVQ1FnM*K|qX+F?xun)F`IZDE&KR)pL+lFPg(ACuP!N9)q)F zp*eiO6QiHU=qDDQaG9UT8OW7;#>j$&sRgOfg25TZ+6-hZ6Xf+# zlU|C_m%)QiO?ns>GWgMmvoy6*J|mvgl;NKu#7#H``XOysfFZZOw+3rsck+ zK7Z3<|KfmuX<$iH^WBN=CNpjq@tXtw7GR&vhvs_VZsHoMcg>wO-7^AHQ4jb$-nW96 zi1~T_YF;)&3DeDJGrq%G{Udd&w8n?rVXbL+%9z1>HLsC*&qP$Xm7g z$LdGQ{;rmsZ{zVP$1}Rx`ZP&$S#^BiGJxY4{Amm%FqpOAz@U zR2!uE6??J|@GI-i7DAj=#QCQ+@F!~E&(y#V0p1|3AF=P>XLx!2u)Po11$Zz|sB{{D zYPbdgsD{s}fiI|mFRFnrse#{K178VvgR~A=UIE|lVR+e7mJa~$uAztY^==(X_4jw$ z$#_@7*EPA_**nm)-M1s2w0(<7S0cGJksM6QjjjzQO zdU|b;uoKCFcwf;L=mT|Z<4C7o^7+E&BrT)S%3`bc|cyU}VzI!S+`zkmCW6Qr}#?n`z4AYpgL zyStO5yN?wqJ2}v`V<&+U{zz&^5=PWhQqB~!h)sq_4`;-+CNd?K*T4hU4MrjpAU;>Z z+07o>D*ttWAt8?EpU`t7f}WfiTm_x#%v2R{F`VjL?UA=EEcO{Fd{pcAw_B zYWFW3N4sq7uwRhgEEc6W2>s%x1eG86C6a2_7IIjBhGl~Ct&?UzE6vtJ6_DTG*KhI0J?9aXEAP^R^ zKm8n6{YgsvvOjw{uKM$c#4r2v3zeU*r>8is+TADVk?sDmhMs3toQ;x@S2>Q;i8e$$ zk4gG5cM1FpiC@lR=mV1KC*JRoP)zny`%wK_JBO7AA0^AN4Gh@cW^Tg#7Zj(hRNC??^b_X@vgAIj-t&lK7|cIu=VLe42!t z5YH>cVK;@^y`@vWihTLN_a(QZr3%fZ`wco6@WBE{<9C?bZ{YHx5OEXb)lk8@>i;Jk5Z{MFUiH5Lgsb`Aj0Z4Kj=hWVLs`}Tn?N2h zH;*%CCnNl@nS#zqq_B|{UnMWeVpWy@F36(^zrrW%i~E7-7Ve006PFkLqTDYZmI8!Q z$O&jWm&fx`rW}`FrwEAn310kfLACgQ$m2i48FBsM{X@u${DbOM+bB}c4|NX@*f#*f iN0mo5HTMEU8u{Vhr}(cl$(1$ahwu^wv{MmKn&dyUL(rZ8 literal 0 HcmV?d00001 diff --git a/third_party/libxsmm/obj/intel64/libxsmm_xcopy.o b/third_party/libxsmm/obj/intel64/libxsmm_xcopy.o new file mode 100644 index 0000000000000000000000000000000000000000..f4e1f0161e5aad3a5e497f233898fed82b5bd0eb GIT binary patch literal 111176 zcmce<4SZcyneU&IoTQ-y_6Y_WG)kN~IxR#@nJ76W;yGm>*avnXLJAQBMpL;_<1{#@ zS~U&PB;4#NJ5;Goonf4gf9@S-E)Ko-){2Tu(h4m_AuR$`MvVB@R0^>bS_-uH`&(?EW3iILzgWrnC9_Z=wWP%R|7KIuCOKWw 
zQgU92pIhE>MQiK(TQ9h*?Y-}BeHnb|EPQqL^2_Gt8+bOq{blc4cs9So)xRZ zf8AS5ezv5fUQqN>g%UG6;>nVPp5*e+3vC5KRdU}vc)JBh9{1KYC3C@e{)EBvN}Q2v zsHSyf+!@|LtxLxmf8`j^FRwBNgR^5W{mEm;k2_}!tN%;9;3r;?PgX`B?FI2@VAi!H zy>(=G!SG~g3fSDN{WMW*z^yhs8$`wFI3MuBXOFQWgy3H=Z)oI)l0TvRmnqHCy_02owLi8 zqQ-qeWoFeH-#PPsT34&qkuZM9b*ulr1$%9G8=f*ZQ z6SUm~@84!|r_UqNIYa#!H8q+P^d2)z(?Z?nG1Th5e}3usaX0EeH&{_}DB~pF>YQyB zk{Y>ze0uD_p`3H(e%e@Ld+_J<;1oUd3-GJo>MVYVE;MV9E_~5>`(fwC_wgbK&(ejz zG)U7q6?JxDyo=*(*Nqz{OQ!z;j6%MAI(-|+#m}=VYvcrL;`ccOPTgu@LLGsXP?^1bXqr@ ztYq!_>$W5p^9*iEb}@MCHY5{3>uyb!@Nk3SWDlulZAjJwo^@-oR;X^W-34ERt*2-~ zupzl#k6V+Ac|a(f;dh)(IiOOOO3nsHyl_i$Jyp9TsaT)YNMW@euiZs^eV*eywPF*g zVN;@W(}_@8H|S5cdGxKj?ru}Vx;t$3sk_bdh91A$PeR}CHU&kwt1YwSrs_KdGDplx1ock*^U*Yzh8nd6U^-C@2tb%&{T z?LBmmx+2z|tX)Tqsheyo!|vn9j|X=oEBP#UT|15Kwt!DR>zjB_JxXuu;RoYBouH4- zh5mt(Trj<4zc(=zA9_0G(WO%q6Flb8{WHXtJlz`nBsdamNj3v{L-)m6gC~Zjj$Itq z{g@I;yoq1NX@0$48cyEMWB2~pz8tCEMvu2f`JPPha4yV$jqhXJfhq22ZsKS0p{WDg65BWN>(bDRdEG~b#!FSgq_Q6!dh&?JD_{+)gbm4!Bzqg(*sh7E%lz(L zKA8JRx-@M1PN7{MouVx(7xsS44Dcv`-~B^XI!H9UnhzdMmZ-o07 zvYArvQ`aRZMK$=Wv5vy)UE%5zc)c%Nv2c{$Vwl3#6Vs!;$$Ag89{X;8a*cg)I5`T2 zeV4T>?A2AH8k%i42vI8J~{3)YGPhY_^FsWH>oVyfw@w z(xZ1IkJ$7BG-M}H)>C_WbW3utl0lu~NnN6fDrLqz??{&DQ^`*AbVvJ>wQkU!tfUzm zL2!fJZnQ*Q)@b_>3q20~u4Wg#aZx3eu;&jQt8|9n%+@z_Ea8lluq(N&QoYWiZ_lKT zIN_%#++(rcMPjad(j#B9*n_f+nr^-GAhcsca-4iGcApp9rbXyYJQ??6zw~4KmmXI$ zD%HgE?82sR09DuOKnTMX!> zp2@%ekcv}jL{^@g@P?ilbgEY_S`XNJ?k1=T14*?jpAIRqf}_IMwZoW+;C3KV^o-V_umh7GBu~L*(4^*ejfo?d)=l zQ9CP11CnCa(QQunR?=ODr&61TM-q?4JTj@ro$%tghH#77XT0uSZBH*~YAj~hdO-4m z&Sb(5e$yKK=8~}Kom#ogN_Ou2KD>Bt;-~Rku#H7~ujt9`|;^ z)O5Qztogdqa*VTLxfpHc+Q1dDJP$ujvQ}J5*+0cu# zN}kZP-UOV1!l^t>v7B?y{h2TgEs5x>bI)Utn5WZxPcw++A`qkIrH%@ZO|2sndJ9Au z(iC~v6;WWpJ|O%qQl8gt+C%No2|u}ps!ve0qkIOXEgbDk_NghX!9jf)wQCKoC$8eO zEpAqe16Hr4qpi+8Gnt|NG?1;4wcrLXxsaXu^!R}|&qiJwR~t2anXuzTec0_L?p3Kn zw4`XxNlb#1-BCzC#Z%_EOA|JcXB%In8qcd|D}@uA5}n)&N^5ux!)d7oZ;7Vpo2GcG z@uC(g%0E|Zbh3WcR4gJwffw`#61&?XPs&tnbDL%*04mYStx^#k=9B25} 
zI?<75;Yf;fmnw~85`myuREe>>+F1S*HtCmvn=5N%oaxKK<*SxYFV)8SH$q>!Y z4ZRS{f0VE^-$7WG&l1M-=Ma|X&mgSGSFs>Z|CVTpQ+)>UWHqE>kvFt00nh-@Pym_$ znhHP*KuZCr0;nngae%l1ywt(`$0((Sl$u4NOYTDG$NoVGjJs^T68wdm0Q_Z}1hg;Q z4A8!8vw+Tp;{ct@B&w`hxo{F-<+4cuy$h!RdY4TJ$S<4$$S;GK99(^iB_?*=dS%QT z(?`(|jC|M74Ka>(P~BM6i_uhaQZ&=1;9Xif_(VsIh zHjJm1YoOU&+?3^(ow3`flL~rxG$kb}M6#INFB z44zLr#(tN5MSNi{_G>X^*)V&&HTJX2l6^TSWe&bDs}1EgQ>uI3Z3fo~PowbGu#Y+L zG;`oS=D;tD8_F@Wp&TgQZETx7PW@)e&i7iD^iLmm=g|C`K6(M zD(rAyqBVB^C1LwXC0^>$wDYC={IZ+OCexbQ>D)4&uDlOL1B7PiZnIZaI5(~W@~D0d zMfuFFah_-ihW00NVbh1biT!a_aBS#F=D?R|gEw@PIY7u9AY={@G6x8m1B99bS!Y$r zT@h1JA*iZ^rBzko4UH!R)(EUAf^`DxieOS;vIsT^Y$$?F0-Frzoi}Wn4QZgz7G<FaLb@I2>6Z-~Y3V#TaA)LF5khCHo!@$Df=~aq=Z+uO zj>szc1e!2c+`9VJ?sQ*1$|~u$yECdaarf#Vt(IEe_Sk^cMVEy6Yf1vP^dGebKa4g( z@dWk4jqM2@WYM&ZDDjTz2{*?vzc1qeIvzrq0;Z@@$xc&Dvn4LQ{4LZOC=g z1{i#z+6nXxGjyb+A&FgDo^2EmOfsXiX**+ik}nAxPS1s2!iyb17=Ifp|G~6#%Y7P# zL6CB(eaWzFh2^fv^XlbkGC5&kxobMTI-aK2 zTC?1b{7xKAw6$irYo5J&Iht*2&2rcLdleiQ2p@*FnOX`vjF$U)-~yjIauqAy*=m=p zL|rY_>HAplNK5^#!R}nx{;`tf&Nt#$<__=9#Gb!yc@V!m7cPh6`00Yw4%E^w96xT< zs?xe=hMte*dk9PO?Sy6dG+{j7KvGO7m?)FO}ur zJ@isMfA-Kz<@q-by#%k$biMMrpy55dSXzi{qyJOls{LmSI%Ih-Bc_q{!jI5X@%Iq_ zE1n+)rn+LhY)(8rCtf}$UNI+5?`*3qz^TV>xRq(8s-}CXmd#1LlVP#$bHb{bxh&Sx z;&J+zo)Tpf;@ibSjoR%f_>K51%Te3CD`#1)yT#~L@|F|Y#p#}8m@Jg4rA*)Qh|AjH zG56D;SwOU!xNVHd+Rc}R_XkLs} zzM_q2zxb;DiY}tQ*eZXPtx_$k|Bm6SCxUKflWP5_88AbkP$7l!Gi6ta-%VKUju`#d zAejTMc#<^+O?+3&*+Q$?Ahh=H7&K{>2i_<&QND}AIA}Fz!!ah}!-UoD7fXHz^CG%! 
znU`lR^P=S+aWKpmw6COB!}WD#yKa6=JpGH_E7 zZZU9+g-vmV(rAn`b`n+v>`uuWy>LwolVS36Kn?d0YxovPr9e%Bngo>rwFqhv6bGsj zR3)e!C@v^2r~;@)P>mo%$kcccG&8?N)i(swpNBdjznFyziv!6ByPul7P@kEVh3IZ6 zT!gF1pzTWQRJyuo@g^9d;Q2WMQd|PkcO`z?Rr>`HBGG6~0x7u+UQx2qVPvBng`KM) z8?kD?3jXY>-2kMCGs;uU*A!35Mo%LfwM@_Bk*sVa`50cv$VQTn4UdFuB>C9#NLoyl z1=+|EWEZEpDPmaw~^OYwHC;r0bZ(MWZz-XPKF zc4A`TwV3m#qVQY>i97S4*4vAO)$X&uYrToY+Vv*8Fv5Z+0E4XwP?Q+nv0v-1L1SbugEW!;2ZYaV{25u_CEe39}uqm!k8f|jMUgGNxs#Nh2 z2U&mj9i&Ga9wMx6cu=f_;3fk%2`&@dV&E3Ralusvt`b}>IBwv$;0nPt2CfmT8BwWT z0*9`NKd5Hh{3*!5^z~Yg*e76Gt<3cn=$hAUb(S8(S<9fH1r;ACVSxb|Hgc{P+btz~ z8`&BojX&$~L79bWi9G5dv}g)QplRf(sp09FszT9t6PvF}G8ifrny{BDwD^VX$L^bc z6YKptM}7>0*lAC8ZhSWh(2`Y48Ju&{gD=HCaizddX+!#Sf`rn17A4_(nblX<1D56A z3HSjf_6HjQ0nQU~%R_Z&Ptu--sw5F7#W@qd@5mpMUWhAAYvGZ+G&FL48*1Vo1 z&lx+5Bp=bPLj_iYdJ5@X&e-ds^q#rtea_emrc2tBYi83?K$GZ#LXRtD)e0^1z!!RYk-vQg3QPR07anrDoh=iOy?DI>XnJ=)neKiiafRf*Bu)%a1)s zPgaiX$?v@kB#`EtWMA9X!$U6*vCYJdY^AUb>d92un zIXGy5I71shB?9a?{{cB}^TWD3~kUb#+FIb5~S_h>Ns1<2+l# z_GeLb6cji)m~u#mYRd$>pp%*1`ynYJ5^V85kbcC+2_@KtCI<`5PT>tbc}yB`O#ZJ)oUV2n-;umFjQAFyLFTI*N z4-ln;aQAGe7E4LrxmZ7mIy7l#=td#%LLaMxK40faMHj@!R3M*4BQ~NLU5CT zn*_rjY^zjSi%DylRzq(7r1*op_yaNO1*25?FQk*T{DG)qfqZYFr1|8~nCdM1{cm@Xaq>5EDr(+_qPcn4zgkZFHTJ|4fc{9ZbnPY@TuAg1@tb)dZ5_PGTgw~@~hO$s+HL3u!1OZ@{9stae z1Atj-U|0MsO@K{gy_@Auw7nG0slB?Mb?ch7?p8Mv_svrGtJB2W%}Q^tZX@1l7J6rO z7x9&5ov)m`&i{_dA?nRWVAeSTy``1(>lvs_9f{x#TO@+_GEWfMG>mHz=n(h`_{_6%!L%bG&{wBL6g*BkH7_^oRLhJvIr5?AwLOr=r%_-P$Uw|EuT2d2N zR710N`Y)a|RO42*7N&1#q&L{B-O*E*L_}eaUhmf00quNUDFc z^|4`~zJyqu{5*w7xNZ>C@OjYrf=EXkI#Ye4LuH~0jLTUaoC0txjf>%#M zS^j(_T;51RTO$ea{M(dpwYJ8NGzsPTIwgFtjfAc?5-Rd1D&ZqtB=mGSTl09hH0P6T z&TtnEf=g>T<-*aH#4MddV417L29BKVB zgHrnuLiPzE5rmL^LWl$*+yw*n>DWEYKqQJa*e$lUoWt-n$_V!-1L@}9IAh0Ik;RgT z^q+(H+TW`M9UN{Gf|u^`y1mXTCQlhiB>3DVWP-T{f$t8`zop82IGNO z?B9`1RES(55@6(tHabQGx5GROS^SaJSd%N5la)k@8M zXnRVh+n%;0ug2&*Yd(kSwRHO1D2QgwJ_c_eJPYf8Jq4e)#W<)tH!Y+#Kw_yN8GKg+7G=_w(@59*STs%O0b29z`$)}UIG 
zxbTX}xr?eGT!+F|OvrGiAW&0yjU4nQus}4*PZxe?9K3LCX=eP_6gX_@m+|IBuva%Y z_u#v1b!vF9J>+2#?lVQ-Ghe*>xSZF2u1`4!RGgNA2QP*=m#)HpQPPSG_@s=k;gh+q)Q6PlKy^ zq~+N}JY0b{-rYt)5U!Ocuc-4TUc%LH)OzfxIoa^?}`)`I`yh+;cz;)2c2w-l^*wfmL~%dEOLx-#w4o*i5rvd$7|u>$L{^v%!yBgP+MS zFVPx&FBcq^N=jHRJ~H*xB|H0Gs%NT*sv_5VJe-@)e{ICN9Dlg-&o=B>dqY$UmUrwm_G{dJUbF>m$2bBod!QqEr}GM1t2p0JFBVytmU7aI6%CBRs&FD0ru1Q zY3!#q4~oaJN2zZR>IDM4C`(k7q zN@$v7#)Ijlv5sxABM?mysmwL&obXp{DePZnCf;Gsh1juWVQH*T)vh;)Eg-fAkF*B= zA;-bQ^b_Bw8tF^%s7c^aGkgNG!xki8F*7?p4VNfwjG2{89Zd>%^bT|8fV~iR-Yw=t z0!}h$U_@{Bf`K6Yar_E{=c8$(^qr)miMax4?Lod_hOwX6UxrhS6f(>zCaiIHg2#hc93@_NeDk4Glx7l6#?XTKSqa18noVZoZsN(*3>>Yf_vS{r}35n8|hHrhLg<=Kt^>bf}t+UEzQwE^RF4>tU^PETXbea&l^BrVBrzbtJ{kwh`pkC{YFYJL z<)qlKSayT6b3^wg1U3n5DuOKnTZ&+nz^Wn`7Z@*sH3Dl4=%3e%tIhSsgLAQr24^ek zmnVq(XB$_}_HsnU>h`nimD5>{tXSQ7cB6om8G&j*VXhu21N7OHjm1YMN>TEgN;9J*l3BGj=mou+5 zS)6cpgr@oOs%V-%PWZ18nr6_eqG>`o;ZTHT5LYik zDZq^`3Dd^K^w+ilwFGlW(n&GeRo`@Tv zaYhuevCfEUa^yy6<`m0GjVlmrKBNOs+=W(3f!O7ON1%7NM)Xc(3Iq2N(K}dOJYA}m zMFdYe**0aTjz}+-Mg$uE2`~gttwbHRn@5r0=`%gJ1gPELpbr0Icm?!V1ts{Kh#QsH zY2pUG$>p_@qHQuuAuim4fK+k*?QtVGfC~<*ySfrg1}gwFIJO6Wk5SMK<38G%Q^{+HlS?PJ20HIkjkO)9L8GUQY^&u5%-#H-q$NB%$a^+N7ow z3aw=%q3EiPuucGLAxRz1*fF}oTi1)aYirXI$~%+f9%t-VQF7m$wlrOHid0PlE0r-dKf`FZE>VVCtnH#;YYZurWOh z3-LMHj6X#z|KA8p^D7APbR&%Cml7fb5@OzTwtm>^yV9c{iD~fD48GJ|e@Z;vJY)Ua z3WYdZdzZ%0R-8kzy-Q0KD^;vau`!Cfl-)|0=1PFVE=f}rIx z9#onbhKSP;hp{)RWercsYEwLeV`td~&&chE-_Qv#D8i(&o)vGX*zN-!o4#7XiPbRv z@X$|>dBJ{4Py=;A0JgSFOxCn|aA-;g4C_PztgLXLR9z!9$rDJt^AD@suX6Pxy@S)StWfQ)I~r(%7AHLY`y z@xXl3ddRn6QF~3M^~I3};_Whr8Ro$pH56pHhazOyhazP7hazM+2Mb}QiYK%o8L(hm zC$&ncRf}W}=coel_C?0H-f4{Mor{cdeWfwDtz2Y`>%FCE(0Ugc<9gnh-13WzalPNz z-1-+8AH2@eJ|wP1bC`_vI`XISb5RFrVDmLFawi}GEIG(`C0`({P9B6QkbdGW^%l&* zI8VtHn|ZOw6f9?YYwl=*i8+{2sgf)95)qf6@t!{G8Oawd4hAirjVHt0NCS56OdyAl!M91TT#^LQoe zDUaoSOXQjN#U0l2u0xd=9tKiB@b9!`5 zhRf|JL%)vYxXK~BBYve;CQL$ z5jy1mb)O&G=W+AGE#n!Td~C8OA5|8rnYRM@^zfRc<4oaC_~Gi8op}{C$%&f$H6c@# 
z5Jw$COgDt)+%t|kI`689qWzT+k^ab$H+Apozo#oT4^nJA&hR6|)JL_Kgz<`2ZaLtbQ;U1u z1wsF646MuyXYBt(DD}SsWij_&fkN&A3OM&=1vCz+goEhy|O$v7Q}v7aDyG$2#+)U=ig-m?qvzWa53+k zu~SGWtT%4`kspjEBHNyp9!m`~;TKs#elpE9I+{N+1z>Dp)w#)fy=qo1JiOM?ViwsE z?xd0L;+DdaW>v9t@ERo9)i<--3)z zxf${PCZhg27QGft)3^Qb8Wzpm%K~&(i|YUimjzr!9QKbeo_{+b=Z^?6gFA^6qBV0e zB$_2-&#pj*wR|eHd_=o7+ii|0w;+3rSoB&@5z{Um^PZR5mw!J9cKKkbO!S(S(p#Y- z^JbysD_Aax{>nxznuHoe9&>8c67T`ePJ&c`AfEm>EQ|K{~yL7B~kMQIQH+rTrH2~4nLfYJs>ZY znlI`8hD>T=;NQ&cVgJC;WHdAbgrz!6$Ix6(7|*v5mgnC}SdkYwNyI-#SRKDn_EW)C z2Cll1aWO|yY7AUcgzF4kSA;oJ!uuRCiSQc?+)#v@4BTX3TIh@w+*evY=d&T{Wlpik z_2%!4!_8MnEft(Ja8huY;06OX2#yPGGH{dNa=|SIZV{}r;fX2(R|ytjtyEgvq{Y9Y zhFG^Zxv$L1Pf`qC-WtG(K=yKnAI-*|koxf|b+@iZFkb!tYECP1MVUQqz_x)7Roe!k z(V^~CUH+H*u|8J1`6x-F0!}l zq*1LZa_)gxvuTj{M7X*Pt9UwGf%^kyBGwP3R2W;5MSW0LIIUX7KbY^x^vpl;yl~CN-!=qZi@RvF`PBNcDZC8i2+*hxknE{S$9aaUPm*wG zcr0qvP!($2sq7UetMo~|L9(>qcbIGRB;q#m zJ~!>*vmIyKWYa>jH7*Ad64ha8^R(in*vSsy=^?mAR9%(0eEl;~n4pDDbtR_&K5-TC z1HH25}fd_A~f~CBzkpS0d1MkG<%)! zq6p1&pgO*QV~Wx&cEUf8a7^cFT;}0ub~~Z2#3(q}t2%Xj0nJpm$yBGt3T+=xq>UO5 z-kemz$C=tR^PTYP5z3KQLs@T7BvY!CyIzR`d%yM*${T-W-WD^{*IuP;>0@Kr?=A zv!4X6ntz;A4^=4Ia5eK&lW5;qQ0?zZNY&=2_P~j4b1NUQmuEOzm$~7U$G=HA zI_ZF7syRJ)46R#sc#ktIwktEp-7Fs_EX`j#z8CAO>bgy#~)rB+X*NICM3vStSfphqCfpZvifpa)?ff08Ox04WP<=P5?;@`WK)}-Q_ zDirNE96ZO)X+mcuM44mfL|01E<=8pVUNP_-J13eK2hXu{qWxmwId)Fe7Y|=Jb`B5! 
z4{oX@nHL9+WLtN>6lC$=9ux=u<9Qso5f9leBp$M}Ne~?A68|0FYWQ#IaccI!f4@}V zzuhuBV(MJ!cWNGm(a~%g_-Dh2{}bH}tNnW!acQOE+l=;t~bd7`Ubg z*BN-u*;ND2IlF4$IcHZ5Y|pMnt&C2nQh3WY*kq#;YcXe3>wcuqse6J{b4E33;G|%4 zMzz7f4T8-X)g}Ws2{va`TMXPH*r>#+3|uAH@Q>=sxJipYQE1G~e=WV(N9k~_Eb~wI zujhhex`I6uT|T@Inr5`a^s>w9Ou#?XD}L6ny@?pM1{9!1QsF}khr)9wYeV+5yLj@r zTMujb*FD7>0bAu5@zpaxWAzJ-b*Zv4!C^my>m%xgZ&Sm37(P7rQ0~hKKpUax|6ZQX z%|9-uVE97G>Z&`7*A{v#RnXt+bsH)EHj@LLurKGje=8=(RAB4?u_s_k5Bss_#A5AK z6OKRPmfdDP&`sTD3R` z!rbx9l4&>K6z_gW*#UjilIhl9UnY1~eY8nPyQk6W9&>}8xaMD?(e&_bW-`T z13n!m2N_gnt=;!#gtwQL{J*6v+WUT!8T`C^l5Jd9qGJK2bO;Ys>#-OWd)uM`;NHl8%8 zi-!vD;HjCol4mxUQYAf#BI)QL;~Im8waH}v6vTdF-bP)Ot50qOa)YU%Dcq7_V`xa& z6GP^Mk=M1p^*mn^JKh-flo3bDK*;jLhLkdVPD3f70}G+K3pDIA0z{X&Fti;xiZ_(e zL*%fEY4bwvbYdCwF{)Vfxbv5bAv9eLQxBFtKwMkc#{Pb{;qiI-s8BYM?8Uejb&KJ* zys#-XYNe{M4M8HhT`WQ#R~daLfFRaQ4H1SoHX5s_SW7Q#sg~$6MLg_VXhulM5w=*$ za19p~qfDn~^bta-S3HK#G-@s{_JBLOQVV*6VKY2Fi~VR7F2N=6Q1(PqxW?Q;+B~O> zH1V+QByJ-Wl4ohHG+#Zo@glfq#h-c-DjTG z`Gzy(qweQS-i;iAW5O z;uWXSrn^m%Fe-W@b=V2ZdHUUOFDz_DbxCohB3O+yspVgwid$ooq6(LVofRU$!o^)U z;f&ouYc37rZ^FagU$RZN3(mP&F#iGW4UFY4B`nRS3CrXz4Ck68EYF`tSdp*9F{U;L zckqWkTT^@U2vT}^pYDe2Sg&A{g3SuX6-+9aQovP@dsk0#Hri(sujQMt)2bVsk+ax` z_$UDki#k0btxotQb|>u)3G&2cZqy57)AV~bc#>7CDOB$UzZ6&LslU2yp&8cYb^NM= z=!+-PXq&d1xSbq4rF?^5O zuj_%2um&Qsuv?7~S!j*1NVqJYs2@XbWd%1s#znzaSwbrfE!b>IU4!DpQUIz&j&?bf zz?39UUoslmIBCB$DIyN4|2k(Co3T2Etx;+E6ddAAlp_yzZkv|a`@;DWbK$V)1z%zm z9<{c|%!R{R;+hZ9H#s9K`K4r_?(N@9KAV&c*Fc_zwNTkRK((f%_fsSRJ%Op-plO<8DZ2D%qQ2BP)9#V}IN*mHcL8|*&IPIJ)$mCV5>QGwfu=IF=6kBI;@6{7v#V#UD=mJ6G?;`9yxUduc7_gv35NBk!`S zzR-F4WV|`_LAo1kqdx3rEq~jiKYgou^ly`e9_@I0p+{FQ<_&niyUd%nMt%BrL9kyU z3c2nV$k*uT!e9iQRd1ui#G<_9bszGB=}frlxO#NSkMY^^V$X1-T_=(7_&{lJK3)}c zHiKqL!1oKX`)%I8EKFAW!5)9flP@Q`J@jcT-%iNIDTLgYLdZ2Kgxroo$fYQlGa8b) zq3>cA`eP!ksUPH}1)CL&E0|O;rGS~t{zZE?(OzcgI1gud+;?_}78gDNDTToZ58#viy%LX|YJo5uPG0T2ao(`I5-5Ibs$fli{mS zco`IwsjJh?{2BX8T&=RfZk>M83f6SEUwg4uwWJ@z$LNh{o1@9>Uu1@DF=Fae4xK1b(yC8qX9i6wj)yDf>8 
z#G=H6NsN;ywj)ZcH;LwFNTA?0opv-Uwo&D)leBkqQTi;Y{SuIAp>yN?8oRItUXood z(ujxenVHLfUB^=Yz5ExG%kjdC5ebY>0Pm|cwB=$6kJxyXy zli1TF_GSt%t$sl4lzQD6-WE+lb+cy7jG3dN!s>1({A!f19-p7Do_E5}MfvLd`S}`z zwK!x$*I$vRu~=(nv4&#)dm4|mW)^D{=I3dI)}Ep=H8Qh#8T2-~nST?hn$6=pb!OHQ zxMiBfA(w+g#|<5`d}8XD6Mh!2VZZx6kA1h6cQki@F3t5uDuNkApeEIijWZ{2Bw0AA zc5MP@5R1rJsdJ;Y1nwa~96SPpw`}e-cui!C;rWYb8q`PA;59u9WDI081G+Q=x-Ihe6foUf<%n<`uf;5elkz)w;Z?IE(clo zO47-e`2Vu(*Xg=4jb&?W`vp7VU(su3-#2ei9#^fFy^lG=6F_RUCgPvC$1INan8nfH zyrySia6mSL)1$%Z(ctuGaC$U2JsO-I4NgySkC~*0SbrL#d==8j`Iqd7Kf$-aMb4+y zT355Q`k=b>e;;I)TZI}++|uV;xE)xgl(f(hCZNDi$^x9G0xg=`dBPEQ36v&`5uPHR z3yxw>kz_2|AO?7p=d3^mDR;(ZVCOV6LZMz2i6Q2+=`T=MbYFjT55yg$Fp{el@vRsF zW1skFQQ{}1!`KEpl=K#TDoI_&K$umMVcFD2#zuIKlHL(v^%*na8A^Irl(fcJ3Q?=C zI#nr{4!badaQTDV?8`C!XEYLPQx?^xpKRn(vC3ub@`ZoErk5!>9VKHt+(I(%TlBUl z9S`_hN%zqO_6Wa=0=Z(%6xHX94VpYe2I^P#nqsyU&YUsbzNVhuXfXI`GM;mgHUTFLK>fsG-;5YF_~8I(g6oaXmhI53UtPHMM#p!gfufBsf3E0 zvHuYvi4+P+1aeME^#*A$LTZeV7SBg&1{`?5KEEwO@a7|Q2;qDo^hF3g^AXkvf$M7M zVuKJ(PG@UncC@O|Im3kzOvG>(LJuQz4^H`h?6K^~&+}Nb-FIRfjK8xrb;J+LX=zEu z`O<`+f?>X8JX<;8jOd37v8$fzh8LSl?TQyr9E}a!Ztk?xksNLw;XYwPt{WqiJBn^A zIhU|JpCqivS2J-;4bdGG=@?yAb*@Vr<&+N)2Z$GdKR74de2{w%;opnJ5^>Mv4hpwX zrTgb{2Zi5QYErQgx{YNfl{+Z>#<)qv3BzqHH>p?&{l*HDieAAj+(B`WTkWF09N7Yq z8M3>txqf0Zeaj^1QWtaE#Co16=k2z;8+)M6P*+Y@LzN18ez6{wF1>&!zSnJuMuA5Z ztw)xB$cxFJ{z11W8N)m7;wHM?`=l9%<5G>Wqg+4gxQ|J7f;(dLwX@M4Q2DdKP#u!?5v&*%8v(5z?C!Xh%qoE6|RR zo>DM}VWU&^#rZ0j&U14+;*auO><9e*P{j3!cb^19wh`O?^1z<{Tkh+ACCC0*Qs;2& zh0VY+>*$G8&Kzd_pgbn0ab`RgvBa~?Iu6WDTVKJf<4Va8?V}hNZr(Yfm|?7E6f0~c zMIFY;q>8bIliI2QLmlh|Y(1~eSij#e)*j)1s?#{x3ok^c!7xj;@p~V|0eYK3mSV$O z&)+8gHuE>m-z0xi{LK!`!zjjT${=s$L%U5wdpGyAXygp{G|SQ6$xSV6<6A7}8u7L4_~L5CT5}0( zYwVygdC|Y7+GvYs8?U7rtK@jpZ0vSLqho&ewD?0qS0g8%TeY9F>!z5g%@DiTC$5{b z<0eSqHw8#}qrR059#;uS`@suJQ!q}t?a%z*&J*sCgrx0#PD$F|ov~k6f5!MKSGjoMa+)hV^ zvo5j%{h4?s-;_=9uO%}1TF8n*9Zu|(QaY~St8D!nts_TAZR zTzpb;@oCl-Qk^Nb@6KjjA=RE@`)+TRlTT-g?YpzN!lbf&w>M+_-+`CUQMT`>D@VR= zw{MXQb1|^i1Iu2pj{kJ8ptH&9@P8&w~|!Z1ktc@M`^ 
z<|py!Rjqe5Zq(k_#M@X<&_Ear3W&_DwhS$B$}n~nE$gU^aYZRZ*jmV5>}kf(+ySN4 zPMKI&?JWsWW2Q{<1sG1O*i1oCOT%I+Neew{l-5247#pgVHf8fMjl^gO2A1t7>jkZC z)0i#R5KD8FthKFjT)u9UIAi&^TB0RQK|;~uMWwj#GseUUv(O9-lbPVbZo83(bJMB> z#j?8z7Yd_1+5Fh~tj2*FOGx9fo`+bDDV{1_uu8Ya2dnq7za#iQY&On~!{poF5xfI? z4KWP3OEKz7ZK9SG)Fw`7Z8sBV;c82|_F|Qd?M6(Qs)J*~c4L2T7L!8gHzinY<0=H@ zlwPv-8!wDw6#W0se&fAH`we);exq}ZTBmO631=iT*LGvO6VjoNkxJN%PPhR&!>B|9 zXB=m&QxM~!&xiHdtFI6BW9+TLFSEgmx!@<$cXOE~mkJIubxqHsa{dNm!q}@M#(pP{ zwV!i?**>=%iaW}Dqk+Lr2@+Fax#4^Dv-OWp?BgfIj_RCFrSYl3#=~3?BZRL_*wT+4 z8n}ZuVw3W2ftTe>9qukHN;`LQ-47SsO~^4fgGi$kB^w=#`K4>CVUCjP)m!*$sgU$6 zwiN%-kv-+K*&G@mwZ7$$I}{_OQK+n!EbBWgGYAx^qVJ@y#6jwt&_DXRbnIoW z0I$&sc!pGv*z_&O{CgjMYDQK-@Wd>4z$|xc;tyuIW1K&j<&G)-&}+@C-iuhGPIiXx zi<+yYl3n^7caUQ=X)?dWx>*R}|1b!wV!o%S;dRX~wUXk5BT;@WkSSnB1sDYN`dXa4 z(T%_2Wp({pUwP{Fm%XS?Ui)@ss{7~Hs-Y2&UND`|WLu*QXCVuW0mCCn6e?4X2X% zekZSSvi*VLSv1)YbQBBhD8wowPM)d9KKF;v!D3YqF1 zJKpLQtab=&h9cSVwx%+T`wXSbj`#Nq5!FVJ;P^6Gl8I{BUeg0 z$@CjSF`}9dU@6V;%-|WtEBuUvH59MAXe4U2Rz9&a*~m{;NLoYj`hRTtLM5l8WE8J| zZ<9|@a$A&)-qq~SDEgG#6(u94eM~(dRjbmueuTS9C?CFq_Bmtrpnz4g7NoRUoQ(!Y zzayfXV{Qj32x+r8V&R;z1rg48gd=8d7Dr4Qzt%>ma`r|z;_PN|rUZ@sJE3sIu&J(n zN-efoyc^#i;Vh<3;nee-#c2c@_>9W%qWtu{{5IvkPx(Dje%HMGKINaM{J|)H&Aj|k z<@0NW)~(6($85fHMq1{b^YYTKn!e=+qhdeN;S2PuojQCGugax<9XVr4K7*IwH$yU& z4>%(qG~!t+JE4`&Mk~$v3yEj{YQ(e8^W_L&c+L}+=08BlwS9!~e2TC<{}w_ko*DIm zIeX!bb{LuIBm}iO=mh6Pkc$3vsCgF`b6OcKD$SIOkk(mgtB{ap&S7+(gpk%*X@sgc9Z#)QL#corayv_hp|#}#3wo7l`{NvM;uZx0Xwv(z#g4tc~16Le&Py>75C%a4i29&&^GvcYza0}i<1XO8m|yU5u`vP+(EWBeldFXIR%6&dHk zd-0xzTiw^%jdab?H#`#k0NLCPyUM)oM_Rj|lDxavC$G^Bd+@u)(-5kQKOw%>Mr=k! z%dvJb1adwmo`8vwzmxs7B~|3yC|^@gzrrks2fE7(ubk)Yz%MGk7DuylEsKf4LrU4- z`|xxxZvM>3PX~P%u6{JjZ*UEdn#t$tM{xG>*{CIQokPffRYx^Cb_Af4Zt+G$;P>9zaB^_t$8+$OB&JxNhNyo659qGB_?JQ zhwOyl&foi^xuG-X9Yx;+{dC{-z>hRbZj1bCZj0B<&ohW5o^ z@*d3aYZdZcMdtzdL?-;W4nsF)#-HJ+h0%j&!>c*sm#5K?ZsT@{N~oS|!^+Xo%^(w? 
znBkVGTg|>sQIBi%l`tA)W;cqY^WG zkjk(+H5DKRhivp=m^Nk$5T|6Sz@SdHQGSUtipm$W6t!SIQ83VhZLubF+r~`=3}5=Z zi;eQDgr-CVblI<03(~fMC#isrSqxG5sER3D1!^*D7m3QKj4+hkW?#fs%C#Z-4O9_Q zSfA6Vpiy|>c)~%%+Gt?hsClBDO?EI_%sRt^(iyu7HHS3iroUN%gH-MYKb6#}?vn}A z!)${?hCHeJR3_NA>tqS*e5=;M&P+=kfYT30y*c@jPB1Ie{z5jhtYZGWdV}Tn87F8#&=EGzwFV z`Na4`+em<2>xBQ4Wc6Qyr+P&LK&go=2cf~}b3#cH!nmE53BwFc0mH5XCwzZ| zG#(+T?~BFQrQn3iBBap>Y0Z2jtp+DN)gWd{5zb&#ft6C6z=aMM%r+5*grk&#eMh zI34qGn)&foiq`L*4;w9raB5$H(@2Vb_k1{#RIJy>oCX!l6T|wwM_5rm_E2Q$X+I51 zPdRmP8TfuJEF=kjelYUX*yfBhTThJ@$P({?4SRCIXu|&bAU_p?rv_)Ztt#h;K zZ_1UwMnYAzpNcHeo-qCzr^;U=A%BhbQ;{Xw6UJZTRQYQp?XHL#-78w*~6?eJ+6V)N#kcqvnM#s zu|`hk!G-)>ZhPA6GD~pv{S7^FshQ(;)!mD63W?IAKAH!(8)f)csi4>3ddmp*fcg1Lc zCxM|04zp_$e{h&xicHmi0jvE~La^~>!nP7(-OcFbu33g|KZ5M=Yvq3(* zbWfJEO`%(U{P45<9#kgvBiFg;;funS*JV;aq1>Oi&Ye56sfny}(M}g`DK$?1xHs}# zzEe6X4-bNDuusQu(|C^PTz(fK1^1w!n-T9Zdvah}AIrVBH1s@MH$_iI6>{Ykhv_fe zn_YT8Wv{+XWjDLQek1t&QpL8p!EQ>XLVDFB?4Nhv<5g%%|0A7Z#b+RxXk% zV`U}FvP}p`hH5;A;`O-UBRa**k10jOH_C-|-0lV3QN!1$q+GXtC+8urtR=}>gx~)m zkMEhYPW{$Q zoU6IY1#q@rx#()^a!2n-eurn&xzaM|!?;&S->MCOJA)?Rhxr)`FN{B{LG*%WWhTbK zRf}J4qu)2l+{h;G_AXx1%gsD9D87SfZsHDwkzF$Bhc)jU`YANR?vZ@#Z^QSN!qhf~ z8SFh)OCpQ%dfw(j2*YTmZ8N?XUT^s8X862g;q`_szMU*UeikTVu4LF{{*m?sHq3jd zHOz3I2GcXVo~w%B#yInai3o2IgX8i;K$!(X=nJpM*{t3CAdo?j9a)78nnF0IW%t3{ z;Ypc8*HCjdfN$rDW7~*n`@2PcvNp?aCea|nR8SlD(jV8UTJgT&^)ScQ@*EYv$rNvH zG1^R2%@eW|jp6lG)W6dpoHR56M~gWLNN);993~^C2|pUyGM#(O@kzsom=CpfS7wmf z-vj2QVSI%57{=&VhAiKy2G5WT=WEz1u*bp1vk-p)CSR=p7C~i>yE#x)@WaDDK*=Rs zOemUu(jZCeR0SpvwaOX(S{31qr|Y*@I+K+{FQ9h(*va6&fN|D4og!IoDnrXpFJ^T{ zX)6(}5KW<6_dyObnj6_Q9o;2+JoGvqQ86w0AU~{HD(t(6vzY7*_JfD~*e-MWPG?0q z5D&9`erxET#^oo6G=Gtz&XCW%mH|?|Xp0m61!W*(Hs%7j@T0jUKgwYw$xZC%=YO_w zx!IHaG|3mi&q-(UDZsebAqdo+O2G{bKPwMd5^hT9YUsT8Sd;|1B`MJAo_-()9_92BsqY1X)-=)549 zBE$5cp)$CZ<^nE&uyPs0{E`>!%T4?oGSvAt=CXA zteJm1WrtUnGIzelcv8UKW-hq__sZX9eE_%&!3{sixnpbdW??=|y~Yf^Q*SSaVD;-} zw0mB3M{hG^6nt}lSW{68S*|PL^77GJ4J88eKcrJwL)MfUvBoq4vUHmjE5Z*hV7-e> 
zu)??VmWY7f62aMM%lwh2l}UQIyz(;6kc*Ug;nfgIYpB(&Vj@V;Akn5#qHb^ixpi@R z`~d9eZH0rsv;bX&O4d53-YE)Iy;|Mear;3Cl?zo|ej*LoN1ko|ZPxS=eSsvD@M?|# z-)3&i;;oa0ww+92T2;|Ulu&%(-GrKh!b}^wKEzNkyX$9$o{EibD0s8%$t?Mvx~N_+ z$G&GgTp0+F0AAA^(_GNp6M>QrHmyeVk$_7|t1cnphXI&znc#pIyu=LCJ4{z`3zkwf zB`HB%0w)Vj5tWj&2-|1n&{3$=!320mp;CX(?EuoW%T8=|wjb%@(ZY@@0%aS=1(qAkTx*T}D0i4&Pj30k9OOw`KL_b_gjEOXbjkD- zWwr)5$~Cp%`(DMpsQsrU*3~4@aGA9;Cc_ukC7s8K54i5!l3lN)R{x}dgIYPmpS7{l z{6~Px@>dXY(83YuStMmFC=^b%JqEyzlqLpx5Qq43QCmg!-jG-+ETAyuiRz}d}anaxDGg@|ZwoTzQ(dnSpZH6-=DqFD7NYN}cNWjyzxSeD1|Q};y_tH%5Wgkd6a zhf&8e&or!1uQnde&HrAYUai4ld|tSzn@c0g7i-EaeyhZvmzXbKXf!b}+ZFRiWQka5{*#1dy8jugUY4*t z-#}QAf1L)Z`qHo=mK*wR0)PAVK%tZNF~4s1XiR%OE(K36>kW$xOj4w%71>j`7ZNL*rQ#CZZ~hj9!+D1E6Ul zJ~G<%=o!{QbnG6$c4ftn9Hb^}xt3c9u4K!L!xW%<%wJUca`s1>t|I$$#Iir7x!wC` zxgT9FD0&icKi!6wSnek^hn6_bH(-8njvLxudR=hEF>Y{$Fl>i`*vTEQR~(xI{)xF_ zeLi4{=T?}Z?cAOgTrs2KsVTUEJHc#})ZZ3UpPQ3LNLHYUFTLasd~0|uL-0KA7Vh`Z9 z)pm@V%ZqTv!exd0KL(%Q(8_<B91R5^wJHo_Cmb8~Qx z%)vQ22j|5(ILD}H_204;FrHkvM0a^|j@0j$OwZ`3F}xvNB)LSR=JHzv;al|+MI!TwYv(nI(G*D^RlaxG=&hE|e;+Jb5dQX+_>n-esYCBU`;z-+zx&l@UH}jp(tI z(Oi1D)CeD}zCHhsxp#q&x;pd!nPh+wqu-%TF=ASSJJm#Kn+sbq$hI>$!5N$Zw5VvY zMuP~7fH>G_F=CV0%*ILGF5SAjwxzrM?b`j>)!+VIwXN+h0l8GGV!a{Oiq;De5xj!R z<^TRX=R1=DZ~S(CahLN{4saCML?m@eFdN395k2J|JO>xby2qFL_?;aBfrJ`dAZoCoXj z6!T%lFZxU~A4Yc3=ZrmEk(0ebNp*NfNDc1M=WdY?tMLX=7d|hF72l{!wu)l8H)`oC zqFC>Zx|&T0Wq{!IZ{lXfmwVc7+#2J!?Ul>&4AG!IMLVsSM!eSjeSOkPQAz&5S@jG8 z3bSga4XFCitePmDRUMCs{kPGIeY-N3y(f)e4CeD~e-Bgnc!*hbEnkGa>g%WFvroY2 zIQzsJnQpa}PN?QqZnV=`Y~5yX6+l~GE~ky$(EJC|Gx+?y8m_YJ0-o&Ecm}h7!E<<4 z=1BOsT>8PetWB^X?)UiWZammntx@|xw~6Ls7wAhx^RWx`6{7jr1^OGiWj&F%I}g6m zDdxk+F8Z9dTXXF)MO4cZ62m?EEB*4(#}@TfJ?%q_YTGDEA6nFy<`?K5mbFT$s(9^>O<+&}AhI;fdJI?X<@G*qPFeIs5!7 zeJ@aEnD=uB`;5?MGy`bL)zoVr)VuA;!YH1RuMTpgLNsh30a(jOf|7{L7gN%#G;oYD zekB8J?hQr|A-Ea%Di~Vr-rE((?+Qhx>nM@V{%+)#QC^)6yde!^UB^(ZOndnHnZO&^ zdPT+p7f;fDbqFe`_lU(^0bLN3Bw>SAL8k+MNU3OU+EPXv@E!aGyNn?{?i%ofPvXOl z7)(YjmyZRmoCC3xef|>83AuVE19^1~utAOpxuWVzjKamfeP=v^+0V(tH}H(d@NBg_ 
zd<)Nb9BkqlpdjGETCeK}-fIBCBod%z&OfRT2j|rkn{J#zC@bf*! zD|>y9snR5ARoLNj&3EZQVR(Ia5jvHCNmJ43iA`D_L8I#*Ed?!31Ne#jt5W>;X$$N1 zEhn16;-J*Zn(hxMmbmVMewb@cz>n33D@{K^e5AOa;YN$;@Q1MBAh4?asG^4AZLxTQ zaI73FJKTK0S_L3$zS%aZ>fl_u^BEMVLp>Eu z*eAlG|H;9K*EQW5Bq1?5N`Yxv+I%<_7+|+5Hk=#S9t8s-o$tzubd<@ITjCFAZ!F71bIJE9y12sE~Ckl-` z?G85Zu%E$=vZ8nAw1CFVu%2Xq>*U6AJ08sI8es*i$I|zVpcZ2RsLKCP;>4Pt@%l!= z2^1oe*_^S$AXbq+Jr>z_6<4rj$8rT*w#opAV+BC03MzCsp~DqALZKrpEtQ*7#fjk> zRK%XS_LgF6cK;hwB4>tGv;iCdk||APnlQB%$ybWrHVXyZ8L<>uZPM(XtW(6-Q7&e3 zwAqxi5E{bk`#pZL5anjm8~hEd6R{?GD|Bdazw8HT>J#2hv#H&huI__7!)9*R!z_*s zQbo}Q79qy(Ls%KUmrwbd5S*+X9vrxdwNrVF^rToPCD+)P>f9Pip%KEy5Gm{yq_F?O zXG(SMX06OQS6TLQ=<@72JcHRd&*9lRo^GxD5^E)Ljn!H!uP&^0t65{TB#oX@SnG;D z)67R3n?js3%!lt@^f}jjSagd%7n%>tZqerw*35UamuaPA-P=Tnt(i-0&1CJ{q}N}~ zqNlZQ6NQ_y(XD-QPvfDbR_-&l;AWAP`&_M+HO_sZ*2-GvwwEpYYxcIRmG{_wGp)Vt z54BcSayQxg#aR2$MJ{|_Elx>w69fvg`Gv3G@aK_}9BWbdx=1Iqnr$RPD)0yb3HCIT zB+}By#nj~w5UfwQnSJg`M6(s&<>L%1T^;plfkqyk(Ir4{r0U*%{~lK;2zIc1+Fs`> z)0IEX5+OMK2YpJPgV z&NmjL#oFVZL+S&;^qpx}SC0n(_^@qH68k(tZQcCuXcB~iBy=G2qtl?Qtm^Kh$ei_&tBz2km zRifou{A7zI#0!jRDF~7N3Gl3NJxur7T_HQJ)k|yi9%d7zO9e1UGcb6fW|U)Nf-h-5 zat0)Oj#1v|5TI~(uj(1>M`p4gLCVxZ*tzUSq<@aM>5GsT)^nT>S_!$+36Ph>oaQ+K zs4B>%Gug|`Bos4!kXN_*I_JJ*zIiT?qplTJjp`HGX@EgOg<&xs>?0y4PVI@vE9Ujh zv7%EC?v|)NZc0ye4VW68Bzk>uKPWjAl-8|Zb-w@XQvX(W|7yB-Ze)I`{}Qk79deU@ z`;h+)krnm|84OAzA@AY0H1tLb?e{}(Iw*7%wAc3!eyCRT(zI?v)8&Wu9~8O)y5kbP zbK^lFTJJk%DrCz+A=;>Pd{H4gT}UMjRjGy=MpJZf3A0;>XhCj_xW~E_EjhBFf8h6? 
zt~S5n=$mvQyEx#9OyJp6;I|2yCnPk{MkF00wFmf8GP;?)#7Ll`Ut$jsjDR3zMT|t) z2*g9b^Sb}hZ3K=>lsG0Gttw01KVg!|&G^A1f@sVQ9?{NPvBUYcjmaJS7HUyIb zXn-s8#3XP;iRLo-mE)%kL1Gdcf<$vre#7yT(s*JL7@|ZoH$%*sG!j2wW&E9Me_|t0 zDjt3l8yjpdWN)Au0{w9y0@@qIXncmVJkuCNwjs;bKtq<)TJpW_BhYjkJpOr^9Wmo` zHh^yUb391dN|Xe6<(`E!NbVWU9a=b?-SW3Mm0+gXOYU}7Q>Bo6&>goS+1BiGH-4Z~ z#LH~L$)tp_@U5&H+m*`9bfe^M2`j&x%x6Ex++|z?hN=TP0Mas)%neE}ImVQ=0VCy{ zI&IY<*cO5~XnP2HpU&{KK7qM6mAYGSb0qa!S&P-k#veGrazFy@1{O#X$PG)g%t9|_ zYO{LQ!-P(oUIuL z{q6q~<~42ixV4H+$!tPNtjH(9W%Gjw<_Y?rqIUw%rieZGqVh-Kk_zX0-^edySOL6h z6~8)t7Ofj8Rwyl-WV@C#q*`lP^SX|PefO?U5-Jm^o%=Z2xeRsk&V8KhJdBW+*Qy|b z`S9cA@8&5qjq=r?s^VANi{7Q8(5KODb_J9W_eT5@2!DvGBC(xH1vYY-P83B09rWld za9Tb)9DKz_pf#V=v5p`ow8K{7r}OEvvO?EJ)Ox&*Ovb(nqAQah|;qFWtK z2V~nwvN#Uk9L-KuB=R8(X0s|%8)n63zbdk-NS-TFTV)q|A?JRtvvcCukJt#pRv?dD z0a9!A3;M7ydhF#qXm(NXjvQ3IOEkC(rXC510l55YH7w}pq&ho`TrGt*~RSj5i| zF_qhqqT~0{@ipOA4fdf3V&^+y&UaR_Q98Ku=!odx&Qs_tLK=qDdkVd#u@X?8!XssF)2F$2-ETaEK|uHSL=}H+?_rxH(V4iw6mpi-z;(Kpr_!!yTq-do`V0L5;uoEuj_EY z9roNpb6H>crFPj54Rr<0RQA<>f#1R3&n%}MIQm%yl$AgFTk1dtyv!%sm=gDd3PAB= zX1jya^Fd>pG~hRpDw&`B3%+b`VZb-xckqA*F@Tk?!hrv~!hpZl20S9<;($kl%z#IP zEE9Kd7bE@guJZw;`;xjM%4gt6#4cI`+NaMDt*`3S=Ze;l_UWPX1($y4e8JI6?PTEV zB+|$xT@5d}+RgVX<@ly{I&ix_gwRSq)i-?)xraZzi{DhLnOc1`4~T=&gl-p?>WgfUr|z(Y zzrGQ-Sfa2#tUfTz)IP0dnix`nmx;Gd3sDWEaH(7LU0AAO(AxYo^Xm^?H$2xeLum9X zK-?4(vE>-UAx^vbILpDzAu5{BR5das4as?H;j7NUhl(#gk;)_KMR9z#$faRu$$F81 zohpy3Yw(b?7h#yVt0{0-F}N$FV&bl*!CfU|kdC`L1MX@j+!b;%aaZTUUCowcb~z|u z1ee-@cwOJ5ToTH{Oc{R#Gv)JFlaHI)GkJ)QI>+im<%%L)ST8ik9e>3>tkfTAuj8*k zR+svtJ3iXaXGy6)l3>SQxh>gH5YjLevrqEXH0^vNzons?{Lo7ALsuOXs-2nF_d`GQ z_Jcwli*=nJ+J8`}_FNqo>zx}93NaQdqmZo!h3r(CCWQ>6maub`lVGu$VX?HE>o{60 zRxK_*g5$B&&uI9OMg8t2cu4Y&rLI}rQm$LsV6L7t=8BzHWk|+)aDv-U5C*}!wmS<1p z$ra%|xj2od-Sv8>&tLhv-v{tclVPwXH~ZXHHAK;9*P%o+ChHwzvgluw*CoOMm&K;_ z05;2frTTpUqh-EQ{XT%zGG7T_hnTG&*#IrHclV8jw{~{fV29IU5bFR(UqZ|D78)oG zi=`2s47h_GoeVmo#iJb{rs}O(u7Rlxf`J+$yaNtm(WwRmL8u*`s$V*pGNlTmL8u* 
z^6HizpGNK~A!sqZ1fj2UQt4I!{A0WXnX6lRa3gW0Oj>qS9AvIgmKPLKS12tz3PbJ+ zrDaDHl2<4#JED-iLe=_|y>#fL@@|^Y7Mh=Eq4^F70&rq%17Bj2=Tn3T1_SG)>@Pkg z+3d|eJnVkHnvRUM2pM3Df!<1VgyzR+-@qXyVpEJf>%LzsoCn@R5PMoaIEs%sBfs$c zvH~IuXy8-_2YUt)=_fLQcQS$dkiVZrjDxXY<9GqeKPP2!ucUInPSw2vYKpBPKlidV ze7?9f++$nAJ+?L6V_U;Lh9cYpa3s~ahiwJhDL`Nqfn?{}GWKR?3Mpa)GQfGc76D0) zR+z{{?mPvLF3`j#I9ebRxobo>4AIL(j~k*d+Lw-Q&t8n@X!65fTf;Mj8)1K?*S${A z(f+I6)T_+<%(wB5I`3EZc>1*b!hf|ZPYy)+*&!Pfd`hOe9!S+a%9n-9 z!{ z@I5F1CQydng92azW#Bz1047j|-GhRurCbKxQ^~=YKpJdNBrp?5gA9tacQ5Or8*ogp zD;XX1x^GZ7I9@vB?wExAp5zFF>wzX{%GNlS2`o3z_da3$MWY!B4(cOFyI{O__HPY> z*OnX&ONUptZJ3>UN37EVP06l3x`p2g6~rS)w~oC1@MKDSUi6zS}~}i zG5?L>jw)%zXyZg=E0i`~xK$|av?QXcQY$(w1ntgO;5{Q>IRZA>)Z)Vgt3)O;qan8t zE+rV`dI^yGr0Bs%`}Z3y#GSTFxaEc&FgK%C~1I2sezoTkRq zs5sy?Gx6xCSCB@8Zdv!i9maher7yXK)QmeiZ}qW0xWl0l{4vm#B4v`k?vNesT&Jab zd-h4r{*UBHCpZx%gYJJM^h5{i*j$8l9P8MP4}x|4?Vkp$!}~9Xb@&b0eHW_?OobK` zPT$?^t|>X@8N$$68Ev_@&mx+_O(P@P%PPDMoTrNMilBC-{Y@?aXPjKfi$fmv9 z2J7BM9h8kpIqM4w(|ePF$Nb%9CO>xvg6`tJWF1$f8XMv7PYOHEIgbE z=PTKmtT!&>{M~H+!YJ9T4`l*&uk*#`gPvW{XR-O9YFG66y7{1OSM<4dw=Tm*zUF*5 z)Hjb&Lmo4Br=u@ruTV60*{cZ7jG1Cq9Yb>64B_0uWu1C z$r0CXRR!26Tj5XLDQc8Y*((Kl9RMu&W-|Ju*ZsHJiv0aUAEdu-3hM`GvCoRcm>H73 z0{d@$1@^SBz~15|9_fC{DX<$N3Ptlx=%jzr^uk}yp@xqBy5JeoU*EOY)YmsieI0%7 z+`vKF>-Xgo<@nkY9Fy1<(w~k7@y$Y51gajE=H#KbE_P-QuSUNRj_E$2Q znwB%pm~qCWn(X|GYA(AVyL3VO=h_x8xe9veMHkHd+=96^iJg+8u z!4;QYQL}*mi`o`1U3yVlb{xk-hmhZbMN5`tYc5zc_Y35=cv)5y*-Sq(opw1KR31MH z33!zv>*WWP@v>!CWNR+GsAlehdGjt>deI^)%R%`pSajtD3#mo+swEeF?urFhUsN-X z0+iPDiX|7dEtt39qPZVf4!v_BEwNzH#k}XTixyq6VDX~bE9yR|JPWS)!Uan&s>v>1 zTyw=`7c5+886H%gp*qxD)SjixAD!2ti!Qz(yWq-;3a#wgT2I*z*rm>lCbF(P|IL|S z$vu+nd<97h=X30o7qSUrV>P?8IiUTOcl~Cj8ceZWw|^hO-nTvp8IKNPyuh3Oa6I~` z_w4{andn}xOPfo(;rt04pbxyh2hGv%^*wElx4oW6am1rqPI-aanzk$%obK)K9$c4euQ9jifjJoo-B_B<{@(bnkbg%+Lmu2UGb^ zWh1!_3>L)9bloeD%DvO@+eB^?NV{FHnZZP?N_eMW>ESotH*d14?DoF-Z5(HDoiO*n zpLNNwv(vd9r(=fbOl^r^p6;#R_W7cHW-h}`9JXc|^1rv+52qr_Kk4j_7Md^I{o7^0 
zrS7dN(NYp_BW~!*oe6K?)5?{CE_;$ZydJfKHOOil)_OlGw5K+Cg=W83i7529|F;$T z74O^sfP+SEAKud4e-&pOpCiVwpC@_ziV%)ItR+*u;s{HNW0oq4^?|pjNWf`uh?|H6 zt;T%z3Ryj2*}-dY7&GI%&ll`dC;j(KJ?K42-m~p*=G~a={c}Qw{e0R`5t6+}T8Rd; zIjo|Hx9`8iHdKgn>ZatIoZZp7%8AD#RSBFu( zpVF)2xi!4(DRsH0lBzYqeR5cN7}FF`|EM!pxD|%d!=26{&|Ls53H{@ zIG1?I-uLzV{`d9#)%5(4D#h->fHrnH?43M2I1MM?_LcA@flG!_*V>td4qLlM^{{`w zy%3G}w^D7d`SwCK66P~wwBpJ{E*JqF=yknA19;bum6VNB86tJ=<_Lcw3cj7YIl}K6 zNkDa^`zf#QG(JtbZB}HZ^eodkVE!*BbGtBblF2=yQ}~wrDfvU|1=88M#crXF{3yyhMnTJG-hV z5x!9#>Ch@Tw{+N!(`SWlhnN+iN>Blsu}qS<1+#XZOzo}Om%Jr z`mmLx1r+c<(wZp-L_r;jLx+XX<=HcNB8BEzKmmV}_BoF!C2<5P)`jEV=&uNiY7(I6In_KuehfUOn@4!a<1B9OeKosC>2%zj>U?zt(Z04q z9yKnLc{hpx!K)k>+2@gl48)bUE2P;G9sLwWZVKA55jW=ta!=iYJLKHc^|oc-krXr) z*qdfO>;C`rVXnJZgQa9PN3+Q5z8V2VqIW_>Q+VGVROiV(vo>WrQ*DEq#&+W$Ot&GmL01;P6c#pYQpO*HgbO1Z z@kID@J87|-urP8J|2#Ff4w$|0g=D%kA>FWO`SQDd}<6M9}Qhhp>D*P%3sRp-OlJ; z8|Zw2(c4X|WanPatax%}#gj8Do}5|n_t4W9K#cfc02*s@N_3AnAT=X2g4z_xM9yf= zPF-*!cQt8+*>JH9m{ZJ3Ij5t~WzSdkKCULW6y0e}M)Pu)z)(9U*$Pyn*@xm>{kD&t z$(_3!UnNx`-}RljrRYxkIVk&OKY|^GhrQPh!yaKi8oU(gL~lhzYsF}G?6L=ZM6l)LM9ztPEEwzGq1%r( zaL60s#)jE;t_N){*Z{qCBj0$^g{{uv_`r zqcnw8ku&B_If2j51eytIlAMLhl=M98%9V~xbMKuMIZcHlpS0d7xp{s5^8I>K#xxCi z)pQjgrgzOSJ643Xl1+aG33fWLHMu?!Sw({kORn#VtdN6$@@$ub-J*+E!Pd zCGzSpbbA;Nhv9cRg^;Bv6Ub7O31lhC1hN!mBB}P>=n~%UbxCFdOSyNB^r2Bq=AQDVr<#eouv6xcdYJ*;xh6$eNiB%wmiI zJ6(ZCGQgOPPGB**!qy*mO2V~9A+PIOa4fjOUf185#rpSPauSDY+RSRmvUxiOm1jwV zHHoI$ey`iL)ZgRA8zRB&=Mp{*btcmR$W35%kTkrbqojV42+e+XZg1yHJM&);B=R#b zE3F>k{pzLs*=33R*OB}F>T%5%{rOX|NAs)Kpf@vJtz**J#C)_NfA$F3meOQZmH#@^ zVm7jgIInAJ)h;ZNKb503ci@*lKSYw#NcuSQma^;D>Dn~n-!Ew&ljGMpC1irjpM8W% zcDv@jw7rfh)aOsdQXZN+l(V99{gHat484olFr)IQoYSq4xjU>uX2@8!oea8lf=ITP z#KKj4KUvRKZCh0aX2NV_38ib-nxeHu6YWM8_)pXStB=)ux(Z+YlUrk6|4OD=m6cg_ z7Rv&;ZtV?5x|yQ>jVV!5EC{Dx1t|<* zLB9&JJlhQE^s69)**Zw4Uj;clI~LOES3&xk-cwqc=wq!Vv-1^nIz~zLam>3lSZeO~ z@^;dV*B4a_Zl$X}8GmfrB#EjyXN&x|>J+cZ58UMCrBAKEoeBhtrUy)4aZi)u{L$IQ4pcKlA+$oPNE&ANl^xrPJ+n 
zuTMZX<#W(9yi9&GOXJS&xIlh1)zDNwUpEynFPHkY>!t!X>I<{DGYZ`?PCZ4P(Yk;; zYMW4xYVyei`HY>pMS~H~KX03*4#a#S6Np^R7eJ+YH3w{8=D=;%pMyz~BK4Q?`16A& z^feO9NRj#~Uy=GH;BIa{k98sD?4%T|&1g^YM0<*7c{aflK?hF+9Xv;5kMQnX>h!2% zJxfq>o{S>(WHzDiQ=}f!us)G+kseZ0Z2djX$8e$8Q_0! z5euEl0m@j;(%iONB1Fv`eA<wT=R zz9-DB39gtC5L_`4Ah=@uBe-JNBe-I;SIqOm0bg)chI>d>1u-P6f*6ujK@7>NAckaB z5Kc@bk(LVka>xAs6%67kY*(+$ehR)90Q`7QM{bkX^Blc=JMFa^!_dzr z%Na%~4D|$_OhhYLn7d0wTt7i=Ispq~T;h*ywQb2S*pEF*4lmaRKyzR^(y@sH&vFH7 zc7JGkKDb?9*EZQ-=G&nrU7O5plkL8`H((XPS8fAe>GO)}TyTR(w!YTh)+1;EWOm>EYpD(($QUB_buWoKZO>wzub z@{xa6yCdCYv@zxwX5W`bJck;_ANDZ#Y=p(Jvji}ztR;UM9t$J4XWG^<;x=U3UQ4#^ zmV8oMC=EbKp!zkG*)H7T574_CbmPm)?(JS*pN-2nV{)~I<_C>S-C*j*?M_k^wAbb*R&qadT0Od#8KqM57kyIIgeVQ&DR@~sX6esc1Vd(81 zIHIGI8PMs{0lkUK*Au;wNV4r#8_i;BkXDxgP{<>gL=`eQ@B#y7O@Lwh!{n}~7;n~# z?2Paf8_t!DoJ(dH(RELxI(Hz_-T}mNJyZKjh#7xEk4ts#XFT^fS6OxmwArg*{hWf^ zkPNJ!l`1@#LtX!Z=tjCZ=-%Lm#e&sAOrYU`@(mssEGT$@!o=gGArCN^cr*=pI0Pm$ zj7!4EMB|5cKfJ_eny;gS_a}J~AvN&yfS?ue7`+gqRDBBQ`v?iL|~Zq#OqUb`&E)CsQg)szI+Kh}_*GN{uf#-d4TwYfuk=r9uz zk&4-BFYRf7Dvgy{(B;_}&tP^uPecklM|j=35P^i%*O93Do2)xnSqeu)MGDk-QR55L z1W^+lMduA(r)md{-X>vwxbi3Rsr;8Dw|0rHHo97Lx#)<|5z)fPhQ}K{Ui5I$6O5i9 zdW7i7Mo$)vUR{5P^5Nx;p8PKgt+)>V0Ity$-%Os2oXhL}I{To(hy<53&sJ-mQA~Qk zp6BJttbwRHjq{c!BQ;K%XL%M_3ys6FLF0_!N#ndm3}nCYdAadZ)HH|c7PGXfkZ9X1 z@2&|8=H(__LP^U?+~py~TdP z5*V;*vvibEu|w(?&(WbsSUEtV%GIhoRJEw;0u>PzDNy4@jW19WL`^Vi+Y(l~aK)h3 zyNZ|92BIsCt`uD^y4vV!(LvD>qa&h+iym+Ec+n$7PcV9dXlobwO*X&DFUuX|&|`i8 z*XZDCwa+bH_cHa1DC#2ic?N&ZD56fO(A{(n>NS9(Q`s zyy$jk1P6b^PL?G4cMy(a-USO5UIeUUaScF^#cda4FRHn4nP4b}FIlkYb4wOp&;}M{ z=>?0fShDzv_X9Wz_h}ex3t(?nqaO`#c z6A~U%KEEBK)Ep~&T`z>x77HWeNiKU4-kp)CG9Jp9s#knk>L8^^JhGe>=P?p`4;L`k=NPy-<7Lvf^-?J^j z4Ur#HXRh|5XC^?TyF@enM2qrgRwxmNE9V{2x%S93lA{&GS5n1C%3v>PxA-d~|BJ9` z`B@{m58~FyKSRX4JGbX_YICdc%9-2)GJbvwAqYm6G_ddMGD%$S=>*vCf5P7v=I9Gi z8w4m0*sbuLZIftWmunEZI0X&4Qr@5K!)hl`u%wRk&rF=!iD0T+j0P!>lcB9wx(wLU_#i<$wH{J}#a`dr zbV(&>KZFD;ZL;$ER$H1IA}jbkq-5&%4i~Favl|aJndYO8*-9co-yyw8lU~PXC6>}8 
z2X~YPcO1arPQ^$%ZJ@ENr!3wWi1!5Qxkz+PAn-={c&tRviBx#q6LkD~Yh)Ga1k(wy zQsmDI=<=W?Tv0yW>xy6o8F9TL4DNWBy+ocXb0M@wa-$D?^XaxgvR>M%^|kW{U&BVZ zxd*?4%PrYDns^clPM3A+A-VEj+m~wVPv#eg;y({rj+g|`VJ{?NarV;@Ga)uyp{+VFS!;g44LTexOa1ZBDB`)y8JM4=6ixM?|W-w zT}=O8*Vp8g`yII*#TFutN+h?A#NWO*oIQ)#rQ=v;oKM6}JT@=oO**dH3%EJ!6AJx; z`HeTfm*>U3$IW959+|*1q&$weuXv+R%tW_%UH^xwv;HXE#5bmZFe$Ut6D;+iasGc$rsgvSA7x5*fJc<(1=7q7gEZRWb)TWV$J3oJ?2B&0iVPBEbggc~ zeADD-GQAu{KY*}vN2Vzl$>c`5ADj&%xkM%zSeHVR7z`Nq-M*PVSul*xiR1eTM_IY2 zg{Jr+E|Y)#4G!*|OXklnqxlVkn!7WMJObCL&$ySfdbOdWZ<-ZA~K5HBPYzqlkca zp$wx6u3{M~R0d-CWvD2)ie;c$4IT1!Wl$~6rANa&H9GjYx6~rtuhJ=7@>i&*BADEe zYFm?tzU6h5aT42-dt>lAI45n%CHH$3gW|R$e*p&_Hl05|*txyTIxHJN&SE&1lIAkX zF7)0!T@Iln_z>>XOXkK$n7St>a}Q{$1LX+(R8yUBx-I!cBtX~prIx?vI<;=NGmLYs zO#%1tZhLrozS?}qDU;is_L8epf&NT%4F>#ws@G_GBaDd<{T;61-0a=FIv(2SbzKZ| z!E`qw?tQO6(Z!3su332LUNa8=neLM16cE^7nI_-(+iQr>O@{NlMWC4%mHG1?(@{k| z(;l-@Bj$m<#YfNc77j4A0TvjIdlB2yMJFn}(nTBJNbH3#6Jy&vuR|eY@B&RLS1pGO z(DmiYX&l=1nj^`mF2fCkU7w-n3cd?&bkC-eGK1+69beKtLw%H*eslJfHS z1P@nM{Z>+n7H{Fhn$~n`&|$%2@K#BPF(#}NiAiHBDGS+F@hk3aW_BlzNm!nCyyOLA<>fZlN%9k2?e z?Qga8mbWyu@PYjwoDFb+=LS%N=$Mmf+Xk9TDmE3?-?zJD)wg|k(v#;#Dmwlw-S!*_ zu|k^)aARWix9Fn(+*=lJD$nT=&y_bYdR|Kg9*_i;89#Y)H8cL7F?t5?;~w-$8<%7@F*5Ll;}~hcT?=$ z6q|tLI|&Y059Apw7lxO@1k$4!d;q~Hjdzv352A;}sD zsZ!l1F^3JFk70~d+bcQ{MG(-`tY?p?Lg-iFgg_8}J zhO9De;bLuwUX^ZW3%mGLrSWTsU*+Q0l*X?mevOM?TN=Nf__Z#6eQEq=;@7+Q%`QF> zn`x?;T9cY|e^DQK6xGSJy&_m)jYfB1J3!i75YnLfnW?)Uy-!X{SYwzRoEf_9_C0m_ zIpAD;y+kUqmvfh&P-Wa`mm?$FFbid0{faLz#q2GpQt# zuw_y?lt~qtRJu&6N;9b@6FihjwabLm)ieqplr`uDuRE$QJ%+jr-R~=HoilxX9U*>l z6%k$Et9`fW>Re-&gWgO?r>@-l9txFq<=*#^u=U<-@x%&DBSN~ZB7$~~sp-5f{Sg2Z zx*N|lO8)42It@PDp%E7C!*TwR0A~YbtRoV^4IvyLh9D9ULlB9Ao{qywNEUU#8LK>s zhpZxsr>r81$E+fX=d2=%2dyGXPFhV*XLTScJ)Qe>k&)@?qyffBrC)Mz{iXrC)$+Ta z)mPrv-K$SkBGfi*kx@v?1=QBJ2w zMP{P{;`BK%0djS@Y~rm@c4&@hdnrLct*=fpzo0hKjE6}Gis~XYE{oP7b&<3p%+y}G z-i0l3>LM}!u)e*ZE;0^Dne;?H;%L5_E{VEG8g-G;INyhv(e790OaIU6B152;|8$=! 
zsEfFcMRs#SrtKwW{=Z6HWHy&%s_F-+i;SNtZILRkZ=C{r7c#%QNxoEDM3GiTRuIGM z`+*;?R`F01I5^%cu*=sdBTP^5JwGVI{Ej(_(5wjPB)mU|B^@U#LQD~29~7Zo@Mxq? ziojZNU=DG1f$6|tvK$*NzLv-h*6qFSyR>1w2i^-wod$yN@;`;lJ(6w1}}!^QR6c&SpH`jQ0<+dAl{Tg5r@0xLd?} zN|@-GYcSd^iJq>AcFm*$JL)ipFe2W+)1G0H*}(=`j-6|^s;C_Y8<0KAy7Ea zyD|$!i-RCu>F!~QGvy4~=d3!?yZ&r^G*LHl;870%{y@D6Je=Od(i#l!)Id7DiL0^B z*{S}!y8UZ5LW5#gA1|F*~&QxQY%^KyrdN)K8DP#MQ@_U>-t+d7#s4U-bB6E zcRgO5rb%YU-kXn9T?Ixq=Jj1hfRM8@@fLI*5#6;AV#h3H+ky4xA0r~UiHfk zlqX!bD}ASWX~%TsxDi+B`_vCRCP*N!o=9$$Wti(6#=F#wHR0TBJwj=k)Opm2XuEpu zLy|rQ7{e|tyh1o&KAI^ctqz>qmkvCG_QeR@fPlFi-GDG+$Xw2lHpUN_n@k%czTZxk z#XLLk?hnEHJFyMcHw#xE%*GAalV{=TgHqd^zoL&)xY=SMgN?KqN@MGQ@>VxeU^!I_ z{ENiLM~XGShPoGf#A23ozDDjnFa~x`BTVX?dU2+SM%dCseYt{z_=)$N(*_? zBdlhkeuO-^`Jf(tave7zy(ajs-;UjPBM!k_&K4?RIvBfgNcTeXt;VnBTMqLTu8R*e zjFSO3hLfmegfYCS>D}wzB|AFz!@ZE$Cj5S{?;glx8&=h%CHwjTkX}%MKB|{((bMEG zEcXof!p@iWf$6nl4u6|dm0Qjd(N@4Knk}l2hVot$w0%h!EbmgB@7tGjst($hT*~vK zvAjj#AW+;OTiBp&VLuYFJG7rA+R%O$!fj=>2mT+z@`j@0BiYJQDDMP=@=kcwNAjiv zFBfpUEB((ZEbri($Z$(O>kE*D&sv0xzV0BSLr_tnMESN(6l-YFQ_zHdk8fMWqS2gb z+aS@c_MXAB2k* zSsE}zd~~O|T-pJ=62vxGB;q*Z0^pi(JL3eUAK!S#+y2#1kE;2H>i+l|3 zA{&Fd$i?6;VPXUq<>(T$Xq&ccApt=F6*?d&l%Yi@Ur$gyIM6TKmd*!@PB27i*awSF zP@HDsn2vDmfmqSWh83OsuMR3&>@vlPV7hwD>euNa>>Fnq?5vJ&@jkp=C7w0_KNN2l zv1I=-yj|$O7jHMy;q5|$gBlxDys8v$7cJoJ1S&*Q^YJ*$PqC_iM*ApCU4<}p6$OOd zha&U-H1Kv+KHl!HStZmwHO1xuP3PcH;oD-Hn+`ChHCr?ZTUS+bcItkn$VHp0|&t)CyNTtmBRN)w(4(u?l z5K8R4lJORE?s_Wq>3zNpPP=l z%)b8X!86&)S#~{y&_339`7djOmwPYWwh|=^>;$_Z-q;i`RGA(9GUBj6bw^A@6#=)0byoc_~E|6&dPH(t5QbSlgpVlo;n?a^6Llkkd*_MsdpMj8SqcS^@+aIsv8jDTp zS>kM3Y}+egTWkJW4#9UPQAr(mt~|}nH}LG*OQ-S+9}9Wk?XpfI<4Wp-wdxFL-$;#u zK@@{j6fuMhjP;;2c4h)k#q+5^Jb!u_;)ZAASVfNKFApMa;C^FgBwu4Xo{yK~hvkNw z?P@5AK6EcuQg85n^mXsr9A^1)6yPT1Fhu!nz-TgygZ$@4nk~awvYCAPs029+k<5=^M7-cI=>WwF$!97#A_+mwcol*X@UzNE(A*_d z1@XvSYzV82PB9#1;A(PP;c`0y(GwyQ$8~nTEN1AL1RnvzUp!F8C{uxZh^#mI!WgNA za`NYdNi-X3Nlx^jii$~9)2QbVWtWj|qAAQ2l2qJ+zobpaBpFp(AS766)tBkq51R9f 
zA#)+2?>*97oW?w18t8K()dElk&Hl;vN7_D>4Pn=JaMGoN@Uk3L^r!RR0APEo-DZ%6 z^_@x9ylIu*WN9cBldIsB1 zVG`1Q$15RHps9w?oi6~vjnZ zCwM24DNM(I(duiE-1!E{wfY*p#OQUZZ>wTWZ!OjrG1fgc>SRe;vkNAj*heyxotVxC zYgs`vfvw)16XT8bu^v6+J@v8riO5|v33z?~N@VW80Mr|-|A@$%Ca_aPWILrKMHwmR zMIVxVI#TQP=?YUtum9NSH`tux_0{+(nhzm`<``f_bQHZK_OU6z=X!k-FDv@YkBxqV z%~M`q-cK?AV^gFfOT0c|()EJ&kB#oj1pCHYiY}YD4k6KvR&ATASnuzKI)p_1mS{Ck z+bkVgBF&D}-%q4D@(|vnf_2Q%TXr8>f?-s@<0NemL%2Bp2BvHC4P2&oCn()vg6a;7 z_YA8q^NwXlqyvZq>c_!EIa0vH$;@jxG4fSn=Sm>=J1JR-7Zs5lnKMp{h)WJU*?c2^KQz#n_5l{P;lx8v}UTHH~hxl@$T#>OVkt8*v`EJ9cAvf zoxA1koLU~MC$KRFvSum>8~w)KBSDY^=f}s0t=K{Jrrw%895H|p8rUJ0hZ)`6K zMv!1Q5xHu#V}$!{$L80aQ%91(Z}O?&YYI(2)#Rb7Q1=(KE4+=3pw`s+`n>GPH|fLY z&nZi@rD)YHg|pF7#iHW9Y=#O(Pv%MhE!$N7Vyi8Q(n)Gvt9;7c=24rU*|J5_n!+4z z*D2s0ZKqA7lzcrMc#+!0jRrTqjxcbP znALa`kXtkrWwpEc-bCm{uj@U-j_)1;c6?o|%N??#N`r4Acb^)x6-X!YLs`O12VP4B z)+Ni`fPQ;*39gc*_2~FLVO}p_V@0i?fs01)0A2Zc0UHNcRW}uIm4qFyc$F3p=%Gb< zjwpX1NyqU@HwMpCV4pyRH3lV&)Ryy>D030^O(4B=btkTpOQ7pSR5Ed&=dw@h)?RswAS)7^;#oA89cs`-QF`<`Axu<^a0P zyYcp$!$($9h$-IU#5m4^)gl{!+NZ~1#`E49HB9$5W(JfD;+-$d(5)XRyc;}W^!?Ri{0e^wyTaC}R2XUB!GbLqDG;^4Y7 zfrk}esWbVAa>xW;N%S7s(j=6O{j}t8HXIp`ZtSSbw5?Oxz$V=rnXmj7HaWkQ3B1ay z;=Lo9%Hla5pe-9lz!(ZhrBN47pX?5|xLZs@7`$c|CIOvC*`$EFu_gp@nBl=7(denw zI{JKoCC4ACt8q#_MYDj0cy-)V8|lxQ6K2zKGGu}MuuNb>CikSnb7t~OU|2SsmY+N? 
zle<^PC3OSbPK!yzK_M*-mM^h_yGBeKT3oC;KX)!xoxgIfV0NKhCzjwjB6}<-ae775 zI1YFlK+T(*-*JGq!SU&mr^?8@rZ8}M_iGR?FyWOEzNkR^6C(2xP3(wsz9W$XXRwpx z@bJVSg9 zP#Jk!B}5i#LFlXqlSzYls*Oa_Sk_3EH}!6ir)we@xPvnn0AJpI=A#bGTmMtzsH7g%GTEGbbje>by6)R;z3ubPxutO!wQmc19}-V62CzVEi7@tSIXVV@3H5$}${P{5li&%{98wLg!w)EGh0MX zW6G?az``L#O7W>3k;puGM$Gd$ro_jc&lr42#|1Hux^(=FK42Aii22n5rLlUSseQQ&+(?9FNKYT!~O)XM>iMuB@b$B*lL- zY${QdC*@LrN=ksKB{?rEDp@TIadXb23VeZ7bA^K%uu^wVXLo^Vr7ioSYqFo`qO>Z$ z1ox9fHZ#j+dV`TP8}a<))i8MJrjeN(9!=;skIXa#5#+rg!yWWG)ZMnGPrkv%i@*?J`(UT#(??*(?RPL5TzaxQ zRo9>D+zP;XD*)$$QS0YP35=Z`(%sI*oT$wUeg<&n#2$b%V0XZ;fZdNy_tvDF#-(%a zIgYJGx}k=BH#p|1WYf_}u&sI?O+c~%lNikRy~EN?sLr`3XeUx=7f--{Lsn7bFS@Jb z{l!c7#zGX)$`%XRj}@-BHJomc2erhhA5}BiI~#y=YuKtcd#ILV1HhRu)iFkm`0?`r z;4Ba2Q(Kx(J^9qSeCms>MLx~sQ}6O=_Va1Lv$H>@_fr!GaMm^9nYJwj4NlwD(l2M^ zgFi}2?_p;qw=PxpAojmOI=%|fc^yD!*^d8>eOKv6N+6RmMgN?0m1i%34rWi|X}9lT zL;jP(ITs+#TN}7HaUj^+&{Ut(AB))52JT86Kwda5(NvSjxn~XjiH315;IKp!b`IQ= zWsLyoM3AC^i8zsSPvw+wSsmI7ZJy|4!~X$q?o9=GyS)~d-fr*hy*!j?kOyzYsUN+z z_wv&6=PnQXwM_J0P}y)fU}oxg0cd4@tdhN{szif4cnQvu zcUAK)JoGN$%X$}Ysix*ir_y&%5x3OOc#UrL^6f9uJ#S(%Pq(eM6NWC6UZmOv@@H_q zu@80C6n7%!?nT=W=qkeRE_Y295^eGO0s){V%0%FV;F5GpaM0r2_7L_Nl7W6{++vpj zLGD|!&2S7jxAzJ{?nM5~p!P{5PAW5OWXAe+;lMzdrU&Ow>ho zBgCoIkOVpKlUmHgt>J!M;6qygd5f~u;-+jRH-}oGYAUg{-Xx|^we9DZ!f)V572asSoRQe? z4NeFSx%e2H&ozf5hVR&FBrX!MAk0puKZ#0Bn-m zmnU*h<~HdDwZ7j5X{mgBIGs;{iN@N&_xsaP6c;N-#k}u3LHT~7_?9f*#EHnYOakK3 zdQ@pofw0$z8VE^#O%fi3ctRmu7><e8e`IPbn`qVR<`Se=Kki4lQ9f zTW4p3V~Npmqjy?3>G`^ngDK|eF+FF;{CYdGRiw$hE%Yjps{84P%9*&A>c#PyN{Ic6 zLd9pwH#t)U8Q?`R3%}gLt>Q?`neOd0VrHx2p3_>aw9&0bTcwR&V6;`*B<4FQa*TTu zkKq6tNg%T9I`~yIpElB?un^E3Z;Zx!+%p=lZ;mzJVH1j%|D}u09j_JWM15HFNq6}2 z^*$FpO7!NBjP4}t=M!CK7-}ieVWbItL>lbbVBgzMAafWWlYlB{;_~wE+5pB%@FAr! 
zIeUJcpGGz=KJGo7`FZ*8`Uz%cNRZ*e;}5S11=E?Jm%qSI(|SlRnO(rsrX%w|KEWi; z7`;BBBnsP*Lr8OnO{`v@=1e8ilzK?XH0OGKV$hYW|4@_ZJsn?D63xzs^d`;t9VbcP zz6Fbln0lHSBk*y_GJEBN>O#x#LckDe9l#`$+FDC&g;PyQv2$G;EFDHRO z!Lhw07*2wsyqpBXNwALuBS=t`my=+G66`0zND>s~nj=)az=(lS|(iG<0Fir(jmJ8gkSJ&cZK&O1 zm<}M@{yj!J%!Uu}^1l7Is5}pj*{r!m@!2IJaK(^+$68s+$^KtSW+fL*=g%#3a@GcT zHVPqAN4~nfc)QUU6~yp|&#Ndv!@aIw zI2Ze(Z#!2odkG=K&Ga*;dfr`#0Gn9S#y24PloK)GVe=Y7%uNi`yMRZ?v_>N5J6dwi z=1HPKj4{?9Ye{q2uz5NEH=etUQ=7NuHDGsA!O9Zj9+ow~@tm3V_;1QzJiAF(KfEH| zUGovBHZ=RSfUxTL)$s002In=Hrz!zqwef2~uI(e{HQ?*F5mc!O6!J!%+JLWXBB+Bh zA4TSc7D~#v$xst1^9nNeZJQ)=PzPm>JhcH|KXX}XxlAq0qhbD$G`|d+nApNeSEp#@ z!8F6BRNK38S%CR6(;s(L4zA)ijWr1I4sPDyMlnvvwq{&%Xvf*e2qPvq79g~uF6SV$;zyN9_r?)g z5vHodkx}tub?H4Fp_M$y7-vcT2(9Fy{E<-&<&ReW0;E{VzuC{f0axchOz)v4PH5F^ zkOGjhW|F)G3REV{>B+c~`KyDO+zV#B2F6i~n&b*;PJYEIi@jdOs_8^1Q7fHlkB=o_w$Ivv1o!KafJowDhA6p z`e4laA;NJ*7+^g}Bbd3lKNec=b>&!(%Z*DNf1%f~jD?Zuqw&b)c z2#XJ34GK+ha*4k^z*(#ItsHaK@E84+EqqfXX08d$)9>VQCa)gJQRg15ZR=UvAUGA} znDH=+GM=bYwb+c)vbgX8tza8)bsQ$|F?go~1JIMkaKvanH9B&g$o3UqTP?kG^g(w& zsJ{*nfT0{7M_bmF{wu0oX(AJO$HL9FR8k&DVp@qBEPA0t6DSc8F+t_+c#}9C7CZ*; zLW$~Ki6&5@1nTW76Gh_!b|sNv$_W#xtnB-^=BV(SB}mT2(`Q9@c74WKgQ&FDh8D@4 zKYJY(HYVcUO=EEUOr$XpYmDU+Iq&FaLFNQ{?nET_>yX?(=ykuzVwxN5e0EQCFuOdr z0qn5WV~oR1{+Hf~bdxH4{<3M=u-rsFmJQ1<2sBJgL?6Cp2705(w!O?~fd7cd8O-FL z!(;ws@5FdhLr#x)uAyNfJb_HsJgn$CPkooA{yWM1_-|q_qA8FIU?2yR5e*nV+l46$ zrpPtiP5FYDhV~=}s@m0g-N!RS4u;vnrP^K>7-1E@plDs!19F(oSKgJ*J(jM+=FN6) zzGPiS2xA%o4-wTZ$`bmzMfrN?Lg)(}%ucrjS+_7?-J&-TZwkaI7iGhcdAvchHgaEA zA8OWMz76IZjWys8}P%=bT?Z$`#x9^-qx{{<>^ z9D5w`V6oV7EA!@Y;Y_H7t6dLY8}tH`rzD9Fbik#4}ufqC;&lCp|TQTZgUmb4dL16{O@(zkTAcM?iTB<4_2lVmnKKSMg; zC~kr@qNJvZE*Wq;ycjZIcSa^Lu*$uPlq&Nr{HnmW^D9P7nfpTD{kl&P;cdU7cCINS z)drjr1#grWi_#YRbz>BHw#j-WCY7u=siqU0TBxM?Y`TM~-lNRe@*@)3?gQWaQ@DRS zgK0io`*Kc}ed#TS%Wosl1=+*``3f$I7DT!mLT-B_?<8wDJ||e$*Pn?8#nS#?jG)#| z#6uEN&sHhCJ8!k(q^Zr_7U#$IWeW$*w8IL#8@B7*m9?{)RB8o4^eZ^uU$7S~SB$qUKrx-X-l15tw?n5XUQiNl*DBQ2G 
zIG0Fy*Pl!A<$tk>ZV}}YtfN;)FERQ>61RpG>sq5pm5y?0(-PINnm=n~!5P@<2KkRpo&9(|IZ;1EQ&h?7lp z+qaAUp_d$j=$15^lf1q)ewx@J6%3H{cl-o0PyBH;@kO^j?wMKb5Yhnk^ZGvLzhucF zC1~dkW-22I(@p#yLK+8b^!nEN$uw~uQZmh~TvOvG>p#?Fnrb_~`u{Gvtvyh5E4eKk zC6ZgDwnL&@<}=gZKR|9f*2!%(!IsKx-8|@vZZ$PJ(d`>fbgS5d8%%WDg_Ct*NPdgO z*i_x_IHnQ2uK%E2K`+rLK?rLWt|S6n{^~vMxRsC=!6GY@0*fNXge(+T;Gb5?WE6x^ zjy=#&v9T+tVmrHn-kmc7l$|;PcE(w-E9l*MDyV09$etkXf?Yuplqc%tA$x+j3w8xb z0qj{GvL}eUU{{b7!%2aM>Fhi}3UBQQZ zyMn?!YsU_A%ROw9%8g-U?XJhgeAmd;OqdXR%2{Vcc&lw(nw{7RIt<{P!D;|Kr(0nzhDYaw}ayM4&rZ5#tOx)dUO^rO*0zMNH8v8|?6D62 zB{o&u*{S?ipJ1Hr2`pnntmFxflO)B&A}6WH6HX}QBtc_MR32mS&IHzzX404gIms`J zn`3lc$h-kfH$m^@dnVxnY2bd4k1Mp%pViIlP5M zNb?Eu9D}Dyf$IRCaR*=!W{RE%ZBQ|*v}d5gF|YoNwz81!=oLO9crQ@mRM3HD%TRcKr%!1PSg814* z?qL|)c=SGZn_`Z`_R3ni%UaBff2XDQsAPUZ72V_CTAD_;MR{HiaBJ_3(CN z^Z=z^V74AfS4x>6Tg(71_L8?b7n{SJa|N?=2^p^2wAg$N39=`Dupqk@>FF1cW4qs$ z+!wGNkNmVmOzk--wTB4RQ(#dB{y&h^B-gJ}QN+L>^m+u-5?f2?PO)+8*Oi+T^ATF( z=PC>#hpRE^0Q)xJ=MqXZU79dNLJ6_eQb?u`NQf0bil>jQld_ zeQ!Tft1`p*-x=KY`yu3_P?q+l+sx8?G*i5f+Kx0d~2mWk%*-jO9q!m{0k#tps zHJ>r~C`&S1Sf;tyKthQ!@Zhg|-7+|k$UW8h2af5U(lK52;e3Zpe1y%Z-ib-BPR>_u zPBn1}-$Z*hrWz({=Y^CH$tS|MP^RtCbaV%Hr+vwf*7w0cYlW9qhv6%e` z_&!wHlXdHoov(5==c`AHvqYsiP?p*sitF7Irlu4fK*yOy%5jq6IX%fdBdw8O+0X zt)YoNpm@DYLn@-mhD$$IxZZQasRnsa8JzmD+Vn21N;Sws@i9O=6u*Y}cql&hhKJ(U z5+4u6#{ltA{CeW!q4?Mv9*W;gd^{8%1H_6?L~zloV>Fs-^8dGXE#OgA`TZt?7zJk{ z_`pS%QE?X#VkY4cTz1Wb8Av3M04h))$q)tz=H<-94a)b4^3X9NY1eJl+O78Mx3#SJpL0{P-XzW&cT35>Lt7U$J>5Yz$6C5DITr4mK`^8{+E)wGP#dXj@J74cXL_w6S0E!d`O;xV{lY%)dwu~>n(3!7*Rw&AgS0(hN8n||Fe zVgcEB-Edo_5`Km!9IqR0;eK5R*ZXxD<-rwr-7o+dMQZ$x85r~@Me+1%U9UiJGf1J%+`2id=grq*O=Z^gcc*)DQ?gS}{H43bZO3yHw|$;FJ2Cg^ zTgD~kzHrNF!&GJZM~#CgDb1DmB3x+C+M$1detsnm!&kmiv>z`+pGEKE-@;$xj&IZ7 zI2cx&O_U%4aF(6D{kgvWTr}0Q1}4+jyEd9zFFwQ^8u_EAW3L`Q zgh6#YzbBG>_E1w~+8%Vq4nrK0GOj z_q@#Ao1KWqw_>m6@xix8UW*_8U7WL)+%uhs%-@&5=PPdGcHuZ40lRHAJxlpbMl3cQ zTbdu<;H+py+AWK>;xmn^{x)uTrh5Oh2>1Qv_a{#IW&i2hUKltHQQLH;Uws=7`U&FI zN}OL;6$|xGRl04T_rJ>odJZtwBRig4HppuLg*W|_u_*HH^3y;c;C*j$4 
zVBjRA`^J2;uF?y@tMuu5-86^}ue3;2aI;UldGc7SqoB#fCUg~BmV~i5u!9lM@PFX8 zwsxp_a0aA0k6&6Md$F_sp|wNiqmzwRI6gYL|28%6##Mh#S0dM6Rd345e;j)i-LPq| zK>b*}p63~wfS&8B7Y{W@roRwB{AvCp=v8u6`6Y_bZ&74%&7Y>DD}W5eXkh!Lf2=?BwC~%2L(xKE9}g zATr#D@I?OC@%(Rwui=0LS%@;K-u3hm-X`@v3aMAfWSdQusO6O#$~N%Pf+?-05Kq}- zw^yd!3%u{FPUSk*do5t0-SfV$ir1n|sfI!}1zefSA7ZL}0$)7;dNRKkH>k+Y_tfP} z$^6SK?>qllUGBquae%D(FWG0J53f?5fG`X{A9)J1)C_&<$fVd)Q}J4wNA~VWoO=Ae z!%r$waBN0s43%PxCOgNxm7 z$L`-{>erUJSRUhABOa-(XzDnKw-e#XQoIm$$EL{>ypPSQmv?O2ooh1PfNHdl^Hd!| z8>EWdw&VR5KTeI!gQ`y;8;pdKu0I#6WDV&~4w8!n5DSxIAx`~B#&Xp*_^{Sk_hoY^&a69}8{A-6)-PcOMt@0xFc}Z?0P$;N?9|v?* z!6G!(I!B6>46y#d(u(-4U#OhFdo7Utx4Z2)9uQVKm35fI9jn@vVSW;v1@QmJZNAJ| zIDh`7(J2d;TsbvbS2e4uI$Bd*6RVyTn;D(53|}~IOyyjBMyz@&l%Am#m)*3?DI2IP zn>hC5iXr58Bk*%|*e^%H(-5xQS>6zyaCfL79K{aJ@RYkwiifMWkB*1yww#;@w}yUL zc51i|P&_;ZBp?M#L%5=0OeNUYYgrJjTr>VRY5#Xu!lUFT3-XhcpD4&*7~VWObabq* zf<`QBJCyeeEpI9Nn4Z1Kg1u7SgQG(S#`wx>pu8(^un$5vpJz{=7k^J)`KzavM7(iK zd9nngojENx#fss@TiL1N?WOC&@Q0&9!~XG}PdnMtKcnT1DcIRqNJo`V!p_~Lb~aPa z0FJTjv%tNK_3?0DB0P`?Z!T*Jx8{5kH94kyvv0tX zF{+&1p>>~40}?DdQJ9YOYYJB|%^ud_8_z4Xt+BW)E)3sM9-0yI)fQ)5yKo%Q@;>C9 z)h}Dgq=Z%H3 z>FEP=DD-3h%B1Yy1ra~h^4i(?^5psD)xz+u5bXbH=-Tk^& zP}Xt$$|7HIFjI%W%QAH0vV;2DUjA74DG)2v3<)Q^1X!xWX?k z;D(++$YE<%>Ozj{VU+}goNx(7AXKorI5^}m^@`m(WzVz%D04LAj4$BQ6LQ$dmb#F0 zRw)Q1w&3tY9^ptxJvGo?UVY!BXY?Bhbt*L+awH9el<;E=F!wUrPgd8v3igC}* zV%*F3V%#gA#kg1gi*c_W72{q#D#pEfT8w-3v>5m5S26C@uVUP5ABypr^0MF$IkO9X z=?SS;MIsnYB460D+Kx}=5;yhSg#TRQQ3L4nK8@QkenaE+7X8~AZ?*6bG~RCE|EBQL z_zr8_jsf4`$<}VYI(x!?ukj%dQa(Pt?6r;}P~Ac*9nRrNH(KMi{caz%(wSiC9jE9^ z+mi_jFRj;KSNJ5S&yq7)j! 
z^yA9C`=%PBf#Pc5l|_{9#SOqucYb%gUWb|d+#Z1cX#l<}0Dm?Be>(vGEpR_OKM%ma z43Kjsx*Frky?%m8*69lO`Uz@`iMS^f?)4MYSP}Sj#Krwojro9YQuN;aRE^ny-=pZg z`>7gx0e@8C-hESzrGW1R?ib(J1Mpu3;0FTm-v{7j$Y?*kY9Urr%|EaU~_DB*YYVOtIIf&>EL=F1c!4N+q{O$*GxHh>t2pZJj-x zIj3!5b4x?Kxvgcvg5}98+OCK9GH59JhqS}`2$Z##901t50wAX7m2G(ZMLWdhS&D6F5<-(Y!<&`9sixM#{7tZ=C%sWlSa@|imgd*)Z-;C@}Y2|AqkRRT977UC-eei{69#8=`layY$2M|>3y zgL8hAj`$zoFnER_9B$VR*X|5{GeI~s&(76241OCyIMQ?4*5G#%gtO`QxUj;nf#1-- zrE$ucC-C<*Zp-<1LH}(*e;|PV>$+n>IZ|(p#;Nx^Le3mPFXQ_iLC-Wd_Adya=eU>7 z);lQV)Fa%;`JSMcdLPy}^XpnczenIQ-(M8C%=b41F7y2iCIFmm=VFcX6vtug+$``0 zf&ZPxZ8^UY^it0G==abuSjxFt1!mn}3on2pdJ+?g) zbc1eiWB;8|kA5b+CSRUB-@`eoH248MO?X|`|^uXYP> zcGWpK3peq)zs@DF?Fr2-#!bA2wLDhWHXJ5imzB8z#M=e_o(rq?WVOIe{FvNnf#0X; ziOc@OUkLmvLC+mEbkv)|VeGkDj;9E4#_;S|I(7z+-*9n~a!RSaoABUlzL;#L@ zJ8>9%zQ(Cn%DF=0JS7BuM*w}Vpic_=EdliR2zvIBj6Hh=y-c@XYn=9P5cHGu{e|{R z`b7el@w!Ii6t_^w$p+AG74)3DGIoAX(93?{0|J-*fsn9&y^v#OKp3wz0%zZYj`5P^ zZHGeLeC&qb*z@-S?-KYPf%gd9^y?^JmbXKip5}5^+Q>hqanj55jEM5qBnjEuK>JB4*+cYFLcdO z`76_3H)E>)%JfeNdYS&M8mIcpahUwlw{azhZI{8>ccr5}GGESDXlZ%-Gl9!~?2iR5 z`>{_)HwCZ@-NG$Y;W%u^s>GEk4=}KeD2?7?I&KL z{XFf{e&VNSKTl@ehWL1an{hgE({?fVd67QetUBi8NeIW2nY-roT>?LcfOdZ)@UIyV z=c58QeLMztiSn<92#V`TfkzeM{`D~-;VA9`<#Ye!IB~MTC!FM7kp3G2uM_wbfjd>% zjorD_YMe1Asm|+-FNa4BoT@dcT*|3hoy|H`nO3oCB6YHCpMA{Q5J>`XoZH5Qdd zD41(mBNtqQHLVc}=7QVE1;?=PHbTK%3mduMYOINkR4^72jo^-K4ogoRz1@83&Z+9@ z&84fbS$jq{m+H74RBYGM>)Y05Qr*0be`Plj5_k)e_d*J-=bw+$x2rFV`GPJ-|o{pP$}V|M&^Yd*7YLSL4wm9sC)TVuT; z(C77f0=`1uH#zR%tb8ap+4iT->5|8C81_B90Y|7`$&{dF#f z+5Zs4|1bjl;@`K^wfs$;v5Y_aNp}2S0p=I~2k%1woaq<4SEYLuet`f#{f&B|qgpdk z{}>#${?CB<=|6bC%V_p@!|eMA;^%n4&;9{zzbPlQpJNT%{%>jh zcKPpn%hmWOjT6rB8+eW8x65DkJ1&#ipJGH*8jn7=f#1*mA#MLkb^zgM|LHi4eiN^) z0s5T}$W`3;<+ZK$lT?=z5obp5`|v z!|#S-PBOs$eVRXI81>n>wh9Z|MPlc!suc8 zUErht4Xxi!zlw~jzky1HGxnSGJEr;V^4qHU3;V;g4BP(y1ix?oYx{LFs`l?9$T0d% zyw1V>k(k~8ncnYONZHca*Oz=ne(rbtD*CyO{#E#I2;i^Q@i+No(!frCu1Amt*-+#R zzfEsnzZjsueuVm2zx?zYep{b?&Ha>q>DM|!{Y${_r{D0~`t0jX0s7lVsQ<45^c#L# 
zpMCvCfd0M!{dRd^r<&NfqTI)vtLA6gN(Xhyne%CyzgiN(KcM-mEZpcf=d(1wMdtia z^V_)LGx)`t-!8uoYyN`>m(GsQHxSOa)Jr1x2Oe-W$@XUsaKmrfA5z)6DeB&vUH&7d s^2*(-#j%pj&pth>K#3dL?(*|%d(ye6hp +#include + +extern "C" int batch_reduce_kernel_update(const float *weight, const float *input, float *output, int blocks, int ofmblock, int ifmblock, int ofw, int stride_w, int r, int s, int ifh, int ifw){ + int ld_b = stride_w*ifmblock; + libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr(ofmblock,ofw, ifmblock,NULL,&ld_b,NULL,NULL,NULL, NULL, NULL); + const unsigned long long cblocks = blocks; + const float * A[cblocks]; + const float * B[cblocks]; + int weight_stride = ofmblock*ifmblock*r*s; + int input_stride = ifw*ifh*ifmblock; + if(r == 1 && s == 1){ + for (int icb = 0; icb < cblocks; icb ++) { + A[icb] = &weight[icb*weight_stride]; + B[icb] = &input[icb*input_stride]; + } + }else{/*Eg.if( r == 3 && s == 3){*/ + for( int k = 0 ; k < blocks/(r*s); k++){ + for(int i=0; i < r; i++){ + for(int j =0; j < s; j++){ + A[k*r*s + i*s + j] = &weight[k*r*s*ofmblock*ifmblock + (i*s + j)*ofmblock*ifmblock]; + B[k*r*s + i*s + j] = &input[k*ifw*ifh*ifmblock + i*ifw*ifmblock + j*ifmblock]; + } + } + } + } + + /* Reduce batch gemm call */ + batchreduce_kernela(A, B, output, &cblocks); + + return 0; +} + +extern "C" int batch_reduce_kernel_init_update(const float *weight, const float *input, float *output, int blocks, int ofmblock, int ifmblock,int r, int s, int ifh, int ifw,int ofw, int stride_w ){ + float beta = 0.0; + int lda = ofmblock; + int ldx = ofmblock; + int ld_b = stride_w*ifmblock; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ); + libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr(ofmblock,ofw, ifmblock,&lda,&ld_b,&ldx,NULL,&beta, &l_flags, NULL); + + const unsigned long long cblocks = blocks; + const float * A[cblocks]; + const float * B[cblocks]; + int weight_stride = ofmblock*ifmblock*r*s; + int 
input_stride = ifw*ifh*ifmblock; + if(r == 1 && s == 1){ + for (int icb = 0; icb < cblocks; icb ++) { + A[icb] = &weight[icb*weight_stride]; + B[icb] = &input[icb*input_stride]; + } + }else{ /*if( r == 3 && s == 3){*/ + for( int k = 0 ; k < blocks/(r*s); k++) + for(int i=0; i < r; i++) + for(int j =0; j < s; j++){ + A[k*r*s + i*s + j] = &weight[k*r*s*ofmblock*ifmblock + (i*s + j)*ofmblock*ifmblock]; + B[k*r*s + i*s + j] = &input[k*ifw*ifh*ifmblock + i*ifw*ifmblock + j*ifmblock]; + } + + } + /* Reduce batch gemm call */ + batchreduce_kernela(A, B, output, &cblocks); + + + return 0; +} + +extern "C" int batch_reduce_kernel_init(float *output, int ofmblock, int ofw){ + int num_elements = ofw*ofmblock; + + LIBXSMM_PRAGMA_SIMD + for(int i=0; i < num_elements; i++) + output[i] = 0.0; + + return 0; +} + + diff --git a/third_party/libxsmm/samples/nek/.make b/third_party/libxsmm/samples/nek/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/samples/smm/.make b/third_party/libxsmm/samples/smm/.make new file mode 100644 index 00000000..e69de29b diff --git a/third_party/libxsmm/samples/utilities/mhd/mhd_in.mhd b/third_party/libxsmm/samples/utilities/mhd/mhd_in.mhd new file mode 100644 index 0000000000000000000000000000000000000000..9eaad14031b1c51fe700d0e5446f824d60538a48 GIT binary patch literal 27225 zcmdU130xLM8($8&lrxPAHNx;fK=4e7JP=O~%`mSoMR^qy(LBC>l$NHZsaal_m5HdS z2>6u=QCb>#ROAIJ;X$Hc-X{tu?Dy`@&av0~?(VxB?eC{E`^-Gg^Z)P6&O9@-FHwCb zPo3!<>fO;PJi##4kT@%9_SEr)8POBN;$ssN4GBzHFf8knG|Lb@BhD}*a#$auykBg> zOoNlrz_7{h7#JHt9le7)b%wQvO^=;0IdKwWATWse2Mb@41aBGOxS?luR?j> z*jcgtCMPgu5z%42Bb=!ExEwuw0=o7Z5H)NxykepU^y=#WLQ}i?fTP-{o$E7g!eZwp zXYT*w^o6op<&SCddQyI??0o)_A5%Y!AK1>biM49;b)(=GJaqbsU4LFDp_|HI`E}Er z5uF;>4LF5fns|4gxaPpw>s6+JOP1B#_-pq&J-r(#bX2R=9JP{tS4U#%XyoA9tVQb%Jq8;-*pyp*?V*f`%Nt)Jl=KEq*z-%={Xi1K zmBZVXPK^j^M$|Qtwe#(tuqg_0>~*7LV%K(oOX|=j7Eb zG3~U(i1A=osFgm5C9OCQ-+Dzz*|rX13G3eb zkV__WX`m!DRYLm_zaX2H`=o zjh4JV;+Epqcl3ADXv1Q)kW&PeB3~``>l3 
z(5V762{XCmsY!}nY39J$IyrKegX-N8bGFTuzlm@@ zgbR^j*~#-#Q@o+(5LAQCZ32Zd>FAR4;402*BU7c;#1j$Dn;n3Y(5z*L_RpD`4)<#u z@zSC^)C>;%Ty6!Lsj7Rih>6X{u$7tNciXH%@vi}o#2=XPKN&rj>vbMziI{k7cUhS` z@2{svp^?h77~%WVOz&?{OkC5NewtUjoSK}3p8p2K_bnS&O`6>%)Xgvvl@eqJAn;-av>=($2;yP z{qys;*O4E}h?yi~XDwIb@sl%BIs-1D*lLQHbi=2$gRt>Ati>F(PBxaPWJ1%&i3T;r zB!X>z{&S$|3^L+S#AG-=AjBP__Z`OYtuC0F1}5|=AEYnO#Clq}kOa8@wI$M~6fv2L z?A$?9pvUj5a49xr%&Be@<8wv3d-6=C3y;?ko}b{rLMdWm%g*Dh_8HW%7qR!n{9|Yh zOsZdl&q9{TC|M#jy$(GHC2it~zXr&LZ$+0IER(k&43KJILTABqM>!?|cn&~qfijg` z6t1jILg4utO|RjQ+w5j-6>@7u$VUs4docbNGx?0MYqtx*HfBzzt^Xp2oyIunF*utk zX_E!WRP{s-a|}4lO@I`*)HIVFT%hdh+cO!DhV(d288bQH2Wmdelne!~7z~eh^=@kH zO-?T^CBEf|=hPK2aXgBc+~?*y*F&tR2)#9&HPcMUiEax1eqjZtjG62qzMin#hrHj5 zM>`d-vjZnva**3jRKR2qJ_bI?y5wNXVJ zp+5~R{%T$UV)IO_HzS`kH|9Fzv7-B;1*CE&)DC1d+a!~ZQ9V3f8xe+Qi{BtXIuHNykl)2bQN;0QZGCyD0e#nxg3w zU>v`e( zZf=ez-xEq969Xcg!tVj@XdHSR#lkYj#3~J$q96GICZ7P&7T6Lv)xS+vAos%IJ^DH6CGTIt%O+Tdf49awNW*)18Uy5fN^&AeDWCOf#8?7bbL7FVG~e zCxIXVPB2K);7=JvX%E)F>>J+#A6^^H+~@$&`f@+?*Z>7&syh>EBSV6{=97*j@^VZ9 zd4on34yb#U1H&Ik7}J03qQF)Z<8ljHoJe_=UHuL`5AbsTCpSXc!#UBLNJ3=vZ)O;YcDY*(NsA(DH`GokJYfP3M@5mM~TeZPF62DbU`7lxOKSd44P2f?#m}LF|uF zOF1SHQucIelb`T5sV#n8_X|hMXWNh%@JCN*eKL}Z5IM;7UvTq-)r0X%BxwcaEn>0; zI;0DUUDCIav_|l1`sAJohK`MCEUL&cv0F(*C`{o6{@#B&r=UL*yoK|497Y(3*GE-z z;ntY3*ecTaE%LdSaK&gL6L;RAMbm{uWk!uT>18g-UOSd8V*W#4;1GNDW>#8p=;fww zCWZR@wfAS2a>OoZxjWg2^~!g2?ni zx*C`Sz7_iEvm}$8=V3*d!ZtzC z3oKyLHt1+o0~2fhGc-OFu<%%d9*miRkAe#g zOsEOCdjTiOtYLQ%LkO8o@5Amt8j8hCuE%qMJVI*BZ~-K}ixi^#f1oujOk8rIHkSe> zRKPiboOhcI*RBpQP!uuwxibPtWL9%#@+E@@2e?|*!i0M18V@7QF`>E}hbg)1OJ@WQ zempN=axJL|szPSwawe~MlTwuT<0d0bOsJt(d4M4ks_j0W9N*mL{Y)_gIOj2`+7X1n z@iTFpnF_GW#4j}7bprcP+L%xwymdQbLN$K36vM~4Z@cpS`+UKi0o(zBJ2MK`WXu0v zZZM2wbl49fJ z-Py2cbqb@B%KxR?QzO?Lyw0&GFW4~)df=;5lvfwv8e;J} z*o_a%cCUsATLRM?kV!>r-R|zfPONvq2ly;Q3uVN+&HCJ+>8Sqz8<_~`D#I_XNh{jN%?~1qzWcf81ETWZ>+b! 
zui!amw_rzxeZq>{grbsglQ6l0dg);x;D{6+qQnAwv@mr zP>6xMkA;;Lr9DbSW{%H(Mv)um=KpX-&5&9w z`)Uwaj!i(DR0ftVE{YVOS^D0gFK6`iv=-?o5m-MLI(E_5N61hW*_F#wWDHUJe(<1- zxF~0tkw3bAeCvB7dis0euLXG8f*bp0t%7^S%wC&abfa8pf8yL1t?FoLk8TZ5QRNxc zTs@V$d*k96V8up?)^tED0{61z8)Qj6CLgf z9+HwqGn6w(-q1W0q00AMm5%!BQU<^o7hZE%oYEJXY5I zGt9(wIx%3(R2Quo-h$^|*Bjd|pIy)qt*oiiy$ib6`Vykmq+hr%C1r6~R&ej+r^EfU z&9-V=&8EwTM=Tms<(b|+^m$y{I^F&7=>ERCI71#hEO}6pyKG2%ZNZ`1wh+`0n79ES zPe44{(rf#Zr-mu~>-7Q@I^8zyFfrwiOG=hTG7KxP6zpAP=++#3Dxec8t!8Z73`$u0 z8=Ec*8e@t7QG`6>o zn`5oac`^W1i``oIb?rTP^pv+3e!PDD=FANC`fB~Ur3>GlGA1fKB)GMgbKMNEM(F libxsmm_target_archid " + "/* JIT code gen. is not available */" + ) + print( + " /* conditions allows to avoid JIT " + "(if static code is good enough) */" + ) + print( + " || (LIBXSMM_STATIC_TARGET_ARCH == libxsmm_target_archid)" + ) + print( + " || (LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid &&" + ) + print( + " libxsmm_cpuid_vlen32(LIBXSMM_STATIC_TARGET_ARCH) ==" + ) + print( + " libxsmm_cpuid_vlen32(libxsmm_target_archid)))" + ) + print("#endif") + print("{") + print(" libxsmm_xmmfunction func;") + for mnk in mnklist: + mstr, nstr, kstr, mnkstr = ( + str(mnk[0]), + str(mnk[1]), + str(mnk[2]), + "_".join(map(str, mnk)), + ) + mnksig = mstr + ", " + nstr + ", " + kstr + # prefer registering double-precision kernels + # when approaching an exhausted registry + if 1 != precision: # only double-precision + print( + " func.dmm = (libxsmm_dmmfunction)libxsmm_dmm_" + + mnkstr + + ";" + ) + print( + " internal_register_static_code(" + + "LIBXSMM_GEMM_PRECISION_F64, " + + mnksig + + ", func, new_registry);" + ) + for mnk in mnklist: + mstr, nstr, kstr, mnkstr = ( + str(mnk[0]), + str(mnk[1]), + str(mnk[2]), + "_".join(map(str, mnk)), + ) + mnksig = mstr + ", " + nstr + ", " + kstr + # prefer registering double-precision kernels + # when approaching an exhausted registry + if 2 != precision: # only single-precision + print( + " func.smm = (libxsmm_smmfunction)libxsmm_smm_" + + mnkstr + + ";" + 
) + print( + " internal_register_static_code(" + + "LIBXSMM_GEMM_PRECISION_F32, " + + mnksig + + ", func, new_registry);" + ) + print("}") + else: + sys.tracebacklimit = 0 + raise ValueError(sys.argv[0] + ": wrong number of arguments!") diff --git a/third_party/libxsmm/scripts/libxsmm_interface.py b/third_party/libxsmm/scripts/libxsmm_interface.py new file mode 100755 index 00000000..9c013d8c --- /dev/null +++ b/third_party/libxsmm/scripts/libxsmm_interface.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) Intel Corporation - All rights reserved. # +# This file is part of the LIBXSMM library. # +# # +# For information on the license, see the LICENSE file. # +# Further information: https://github.com/hfp/libxsmm/ # +# SPDX-License-Identifier: BSD-3-Clause # +############################################################################### +# Hans Pabst (Intel Corp.) +############################################################################### +from string import Template +import libxsmm_utilities +import fnmatch +import sys + + +if __name__ == "__main__": + argc = len(sys.argv) + if 1 < argc: + # required argument(s) + filename = sys.argv[1] + + # default configuration if no arguments are given + precision = 0 # all + ifversion = 1 # interface + prefetch = -1 # auto + mnklist = list() + + # optional argument(s) + if 2 < argc: + ivalue = int(sys.argv[2]) + ifversion = (ivalue >> 2) + precision = (ivalue & 3) + if 3 < argc: + prefetch = int(sys.argv[3]) + if 4 < argc: + mnklist = sorted(libxsmm_utilities.load_mnklist(sys.argv[4:], 0)) + + template = Template(open(filename, "r").read()) + if fnmatch.fnmatch(filename, "*.h*"): + optional = [", ...", ""][0 <= prefetch] + substitute = {"MNK_INTERFACE_LIST": ""} + for mnk in mnklist: + mnkstr = "_".join(map(str, mnk)) + if 2 != precision: + pfsig = [ + optional + ");", + ",\n " + "const float* pa, " + "const float* pb, " + "const 
float* pc);" + ][0 < prefetch] + substitute["MNK_INTERFACE_LIST"] += ( + "\nLIBXSMM_API void libxsmm_smm_" + + mnkstr + + "(const float* a, const float* b, float* c" + + pfsig + ) + if 1 != precision: + pfsig = [ + optional + ");", + ",\n " + "const double* pa, " + "const double* pb, " + "const double* pc);" + ][0 < prefetch] + substitute["MNK_INTERFACE_LIST"] += ( + "\nLIBXSMM_API void libxsmm_dmm_" + + mnkstr + + "(const double* a, const double* b, double* c" + + pfsig + ) + if 0 == precision: + substitute["MNK_INTERFACE_LIST"] += "\n" + if mnklist and 0 != precision: + substitute["MNK_INTERFACE_LIST"] += "\n" + print(template.substitute(substitute)) + else: # Fortran interface + if 1 > ifversion and 0 != ifversion: + raise ValueError("Fortran interface level is inconsistent!") + # Fortran's OPTIONAL allows to always generate an interface + # with prefetch signature (more flexible usage) + if 0 == prefetch: + prefetch = -1 + version, branch, realversion = libxsmm_utilities.version_branch(16) + major, minor, update, patch = libxsmm_utilities.version_numbers( + version + ) + substitute = { + "VERSION": realversion, + "BRANCH": branch, + "MAJOR": major, + "MINOR": minor, + "UPDATE": update, + "PATCH": patch, + "MNK_INTERFACE_LIST": "", + "CONTIGUOUS": ["", ", CONTIGUOUS"][1 < ifversion] + } + if mnklist: + substitute["MNK_INTERFACE_LIST"] += "\n" + for mnk in mnklist: + mnkstr = "_".join(map(str, mnk)) + if 0 == precision: + substitute["MNK_INTERFACE_LIST"] += ( + "\n " + "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smm_" + + mnkstr + + ", libxsmm_dmm_" + + mnkstr + ) + elif 2 != precision: + substitute["MNK_INTERFACE_LIST"] += ( + "\n " + "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_smm_" + + mnkstr + ) + elif 1 != precision: + substitute["MNK_INTERFACE_LIST"] += ( + "\n " + "!DIR$ ATTRIBUTES OFFLOAD:MIC :: libxsmm_dmm_" + + mnkstr + ) + substitute["MNK_INTERFACE_LIST"] += "\n INTERFACE" + optional = [", OPTIONAL", ""][0 < prefetch] + bindc = ["", "BIND(C)"][0 < prefetch] 
+ for mnk in mnklist: + mnkstr = "_".join(map(str, mnk)) + if 2 != precision: + pfsiga = [ + ") BIND(C)\n", + "," + + "&".rjust(26 - len(mnkstr)) + + "\n & pa, pb, pc) " + + bindc + + "\n" + ][0 != prefetch] + pfsigb = [ + "", + " REAL(C_FLOAT), " + "INTENT(IN)" + optional + " :: " + "pa(*), " + "pb(*), " + "pc(*)\n" + ][0 != prefetch] + substitute["MNK_INTERFACE_LIST"] += ( + "\n " + "PURE SUBROUTINE libxsmm_smm_" + + mnkstr + + "(a, b, c" + + pfsiga + + " IMPORT :: C_FLOAT\n" + " REAL(C_FLOAT), " + "INTENT(IN) :: a(*), b(*)\n" + " REAL(C_FLOAT), " + "INTENT(INOUT) :: c(*)\n" + + pfsigb + + " END SUBROUTINE" + ) + if 1 != precision: + pfsiga = [ + ") BIND(C)\n", + "," + + "&".rjust(26 - len(mnkstr)) + + "\n & pa, pb, pc) " + + bindc + + "\n" + ][0 != prefetch] + pfsigb = [ + "", + " REAL(C_DOUBLE), " + "INTENT(IN)" + optional + " :: " + "pa(*), " + "pb(*), " + "pc(*)\n" + ][0 != prefetch] + substitute["MNK_INTERFACE_LIST"] += ( + "\n " + "PURE SUBROUTINE libxsmm_dmm_" + + mnkstr + + "(a, b, c" + + pfsiga + + " IMPORT :: C_DOUBLE\n" + " REAL(C_DOUBLE), " + "INTENT(IN) :: a(*), b(*)\n" + " REAL(C_DOUBLE), " + "INTENT(INOUT) :: c(*)\n" + + pfsigb + + " END SUBROUTINE" + ) + substitute["MNK_INTERFACE_LIST"] += "\n END INTERFACE" + print(template.safe_substitute(substitute)) + else: + sys.tracebacklimit = 0 + raise ValueError(sys.argv[0] + ": wrong number of arguments!") diff --git a/third_party/libxsmm/scripts/libxsmm_source.sh b/third_party/libxsmm/scripts/libxsmm_source.sh new file mode 100755 index 00000000..863206cf --- /dev/null +++ b/third_party/libxsmm/scripts/libxsmm_source.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env sh + +SRCDIR=../src +GREP=$(command -v grep) + +if [ "" = "${GREP}" ]; then + >&2 echo "Error: missing prerequisites!" + exit 1 +fi +cat << EOM +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_SOURCE_H +#define LIBXSMM_SOURCE_H + +#if defined(LIBXSMM_MACROS_H) +# error Please do not include any LIBXSMM header other than libxsmm_source.h! +#endif +#if defined(LIBXSMM_BUILD) +# error LIBXSMM_BUILD cannot be defined for the header-only LIBXSMM! +#endif + +/** + * This header is intentionally called "libxsmm_source.h" since the followings block + * includes *internal* files, and thereby exposes LIBXSMM's implementation. + * The so-called "header-only" usage model gives up the clearly defined binary interface + * (including support for hot-fixes after deployment), and requires to rebuild client + * code for every (internal) change of LIBXSMM. Please make sure to only rely on the + * public interface as the internal implementation may change without notice. 
+ */ +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +EOM + +HERE=$(cd "$(dirname "$0")" && pwd -P) + +if [ "" = "$1" ]; then + DSTDIR=${SRCDIR} +else + DSTDIR=$1 +fi + +# determine order of filenames in directory list +export LC_ALL=C + +# good-enough pattern to match a main function, and to exclude this translation unit +for FILE in $(cd "${HERE}/${SRCDIR}" && ${GREP} -L "main[[:space:]]*(.*)" ./*.c); do + BASENAME=$(basename "${FILE}") + echo "#include \"${DSTDIR}/${BASENAME}\"" +done + +cat << EOM +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#endif /*LIBXSMM_SOURCE_H*/ +EOM + diff --git a/third_party/libxsmm/scripts/libxsmm_specialized.py b/third_party/libxsmm/scripts/libxsmm_specialized.py new file mode 100755 index 00000000..fd9b9dd8 --- /dev/null +++ b/third_party/libxsmm/scripts/libxsmm_specialized.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) Intel Corporation - All rights reserved. # +# This file is part of the LIBXSMM library. # +# # +# For information on the license, see the LICENSE file. # +# Further information: https://github.com/hfp/libxsmm/ # +# SPDX-License-Identifier: BSD-3-Clause # +############################################################################### +# Hans Pabst (Intel Corp.) 
+############################################################################### +import sys + + +if __name__ == "__main__": + argc = len(sys.argv) + if 6 == argc: + precision = int(sys.argv[1]) + m, n, k = int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) + prefetch = int(sys.argv[5]) + + mnkstr = str(m) + "_" + str(n) + "_" + str(k) + optional = ["", ", ..."][0 > prefetch] + signature = ["a, b, c", "a, b, c, pa, pb, pc"][0 < prefetch] + if 2 != precision: + pfsig = [ + optional + ")", + "\n" + ", const float* pa" + ", const float* pb" + ", const float* pc)", + ][0 < prefetch] + print + print + print( + "LIBXSMM_API void libxsmm_smm_" + + mnkstr + + "(const float* a, const float* b, float* c" + + pfsig + ) + print("{") + print( + "#if defined(__AVX512F__) && " + "defined(LIBXSMM_GENTARGET_skx_sp) && \\" + ) + print(" !(defined(__AVX512PF__) && defined(__AVX512ER__))") + print(" libxsmm_smm_" + mnkstr + "_skx(" + signature + ");") + print( + "#elif defined(__AVX512F__) && " + "defined(LIBXSMM_GENTARGET_knl_sp)" + ) + print(" libxsmm_smm_" + mnkstr + "_knl(" + signature + ");") + print( + "#elif defined(__AVX2__) && " + "defined(LIBXSMM_GENTARGET_hsw_sp)" + ) + print(" libxsmm_smm_" + mnkstr + "_hsw(" + signature + ");") + print( + "#elif defined(__AVX__) && " + "defined(LIBXSMM_GENTARGET_snb_sp)" + ) + print(" libxsmm_smm_" + mnkstr + "_snb(" + signature + ");") + print( + "#elif defined(__SSE3__) && " + "defined(LIBXSMM_GENTARGET_wsm_sp)" + ) + print(" libxsmm_smm_" + mnkstr + "_wsm(" + signature + ");") + print("#else") + print( + " const char transa = (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & " + "LIBXSMM_FLAGS) ? 'N' : 'T');" + ) + print( + " const char transb = (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & " + "LIBXSMM_FLAGS) ? 
'N' : 'T');" + ) + print(" const float alpha = LIBXSMM_ALPHA, beta = LIBXSMM_BETA;") + print( + " const libxsmm_blasint " + "m = " + str(m) + ", " + "n = " + str(n) + ", " + "k = " + str(k) + ";" + ) + if 0 < prefetch: + print( + " LIBXSMM_UNUSED(pa);" + " LIBXSMM_UNUSED(pb);" + " LIBXSMM_UNUSED(pc);" + ) + print( + " LIBXSMM_INLINE_XGEMM(float, float, &transa, &transb," + " &m, &n, &k, &alpha, a, &m, b, &k, &beta, c, &m);" + ) + print("#endif") + print("}") + print + print + print( + "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_smm_" + + mnkstr + + ")(const float* a, const float* b, float* c" + + pfsig + + ";" + ) + print( + "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_smm_" + + mnkstr + + ")(const float* a, const float* b, float* c" + + pfsig + ) + print("{") + print(" libxsmm_smm_" + mnkstr + "(" + signature + ");") + print("}") + if 1 != precision: + pfsig = [ + optional + ")", + "\n" + ", const double* pa" + ", const double* pb" + ", const double* pc)", + ][0 < prefetch] + print + print + print( + "LIBXSMM_API void libxsmm_dmm_" + + mnkstr + + "(const double* a, const double* b, double* c" + + pfsig + ) + print("{") + print( + "#if defined(__AVX512F__) && " + "defined(LIBXSMM_GENTARGET_skx_dp) && \\" + ) + print(" !(defined(__AVX512PF__) && defined(__AVX512ER__))") + print(" libxsmm_dmm_" + mnkstr + "_skx(" + signature + ");") + print( + "#elif defined(__AVX512F__) && " + "defined(LIBXSMM_GENTARGET_knl_dp)" + ) + print(" libxsmm_dmm_" + mnkstr + "_knl(" + signature + ");") + print( + "#elif defined(__AVX2__) && " + "defined(LIBXSMM_GENTARGET_hsw_dp)" + ) + print(" libxsmm_dmm_" + mnkstr + "_hsw(" + signature + ");") + print( + "#elif defined(__AVX__) && " + "defined(LIBXSMM_GENTARGET_snb_dp)" + ) + print(" libxsmm_dmm_" + mnkstr + "_snb(" + signature + ");") + print( + "#elif defined(__SSE3__) && " + "defined(LIBXSMM_GENTARGET_wsm_dp)" + ) + print(" libxsmm_dmm_" + mnkstr + "_wsm(" + signature + ");") + print("#else") + print( + " const char transa = (0 == 
(LIBXSMM_GEMM_FLAG_TRANS_A & " + "LIBXSMM_FLAGS) ? 'N' : 'T');" + ) + print( + " const char transb = (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & " + "LIBXSMM_FLAGS) ? 'N' : 'T');" + ) + print(" const double alpha = LIBXSMM_ALPHA, beta = LIBXSMM_BETA;") + print( + " const libxsmm_blasint " + "m = " + str(m) + ", " + "n = " + str(n) + ", " + "k = " + str(k) + ";" + ) + if 0 < prefetch: + print( + " LIBXSMM_UNUSED(pa);" + " LIBXSMM_UNUSED(pb);" + " LIBXSMM_UNUSED(pc);" + ) + print( + " LIBXSMM_INLINE_XGEMM(double, double, &transa, &transb," + " &m, &n, &k, &alpha, a, &m, b, &k, &beta, c, &m);" + ) + print("#endif") + print("}") + print + print + print( + "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dmm_" + + mnkstr + + ")(const double* a, const double* b, double* c" + + pfsig + + ";" + ) + print( + "LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dmm_" + + mnkstr + + ")(const double* a, const double* b, double* c" + + pfsig + ) + print("{") + print(" libxsmm_dmm_" + mnkstr + "(" + signature + ");") + print("}") + else: + sys.tracebacklimit = 0 + raise ValueError(sys.argv[0] + ": wrong number of arguments!") diff --git a/third_party/libxsmm/scripts/libxsmm_utilities.py b/third_party/libxsmm/scripts/libxsmm_utilities.py new file mode 100755 index 00000000..b63372be --- /dev/null +++ b/third_party/libxsmm/scripts/libxsmm_utilities.py @@ -0,0 +1,320 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) Intel Corporation - All rights reserved. # +# This file is part of the LIBXSMM library. # +# # +# For information on the license, see the LICENSE file. # +# Further information: https://github.com/hfp/libxsmm/ # +# SPDX-License-Identifier: BSD-3-Clause # +############################################################################### +# Hans Pabst (Intel Corp.) 
+############################################################################### +import itertools +import operator +import inspect +import sys +import os + +try: + from functools import reduce +except ImportError: + pass + + +def upper_list(lists, level): + nlist = len(lists) + upper = [level, level + nlist][1 > level] - 1 + above = lists[upper] + if above: + return above + elif -nlist <= level: + return upper_list(lists, level - 1) + else: + return [] + + +# https://docs.python.org/3/library/itertools.html#itertools.product +def itertools_product(*args): + # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy + # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 + pools = [tuple(pool) for pool in args] + result = [[]] + for pool in pools: + result = [x + [y] for x in result for y in pool] + for prod in result: + yield tuple(prod) + + +def load_mnklist(argv, threshold, inputformat=0, resultset=None): + if resultset is None: + resultset = set() + if 0 == inputformat: # indexes format + resultset = set(map(lambda mnk: tuple(map(int, mnk.split("_"))), argv)) + elif -1 == inputformat: # new input format + groups = map( + lambda group: [int(i) for i in group.split()], + " ".join(argv[0:]).split(","), + ) + resultset = set( + itertools.chain( + *[list(itertools_product(*(i, i, i))) for i in groups] + ) + ) + elif -2 == inputformat: # legacy format + mlist = list( + map( + int, + map( + lambda s: str(s).replace(",", " ").strip(), + argv[2:2 + int(argv[0])], + ), + ) + ) + nlist = list( + map( + int, + map( + lambda s: str(s).replace(",", " ").strip(), + argv[2 + int(argv[0]):2 + int(argv[0]) + int(argv[1])], + ), + ) + ) + klist = list( + map( + int, + map( + lambda s: str(s).replace(",", " ").strip(), + argv[2 + int(argv[0]) + int(argv[1]):], + ), + ) + ) + mnk = [mlist, nlist, klist] + top = [ + [mlist, upper_list(mnk, 0)][0 == len(mlist)], + [nlist, upper_list(mnk, 1)][0 == len(nlist)], + [klist, upper_list(mnk, 2)][0 == len(klist)], + ] + for m in top[0]: 
+ for n in top[1]: + if not nlist: + n = m + for k in top[2]: + if not klist: + k = n + if not mlist: + m = k + resultset.add((m, n, k)) + else: + sys.tracebacklimit = 0 + raise ValueError("load_mnklist: unexpected input format!") + if 0 != threshold: # threshold requested + return set( + filter( + lambda mnk: (0 < mnk[0]) + and (0 < mnk[1]) + and (0 < mnk[2]) + and (threshold >= (mnk[0] * mnk[1] * mnk[2])), + resultset, + ) + ) + else: + return set( + filter( + lambda mnk: (0 < mnk[0]) and (0 < mnk[1]) and (0 < mnk[2]), + resultset, + ) + ) + + +def max_mnk(mnklist, init=0, index=None): + if index is not None and 0 <= index and index < 3: + mapped = map(lambda mnk: mnk[index], mnklist) + else: + mapped = map(lambda mnk: mnk[0] * mnk[1] * mnk[2], mnklist) + return reduce(max, mapped, init) + + +def median(list_of_numbers, fallback=None, average=True): + size = len(list_of_numbers) + if 0 < size: + # TODO: use nth element + list_of_numbers.sort() + size2 = int(size / 2) + if average and 0 == (size - size2 * 2): + medval = int( + 0.5 * (list_of_numbers[size2 - 1] + list_of_numbers[size2]) + + 0.5 + ) + else: + medval = list_of_numbers[size2] + if fallback is not None: + result = min(medval, fallback) + else: + result = medval + elif fallback is not None: + result = fallback + else: + sys.tracebacklimit = 0 + raise ValueError("median: empty list!") + return result + + +def is_pot(num): + return 0 <= num or 0 == (num & (num - 1)) + + +def sanitize_alignment(alignment): + if 0 >= alignment: + alignment = [1, 64][0 != alignment] + elif not is_pot(alignment): + sys.tracebacklimit = 0 + raise ValueError( + "sanitize_alignment: alignment must be a Power of Two (POT)!" 
+ ) + return alignment + + +def align_value(n, typesize, alignment): + if 0 < typesize and 0 < alignment: + return ( + ((n * typesize + alignment - 1) / alignment) * alignment + ) / typesize + else: + sys.tracebacklimit = 0 + raise ValueError("align_value: invalid input!") + + +def version_branch_from_file(version_filepath): + version_file = open(version_filepath, "r") + version, branch, sep = "1.0", "", "-" + try: + version_list, n = version_file.read().replace("\n", "").split(sep), 0 + for word in version_list: + if not reduce( + operator.and_, + (subword.isdigit() for subword in word.split(".")), + True, + ): + branch += [sep + word, word][0 == n] + n += 1 + else: + break + version = sep.join(version_list[n:]) + finally: + version_file.close() + return (version, branch) + + +def version_numbers(version, branch=None): + version_list = version.split("-") + if not version_list[0][0].isdigit(): + vbranch = version_list[0] + else: + vbranch = "master" + if branch is None or vbranch == branch: + minor = update = patch = 0 + major = 1 + n = len(version_list) + if 1 < n: + patch_list = version_list[n - 1] + if 1 == len(patch_list.split(".")): + version_list = version_list[n - 2].split(".") + if version_list != [vbranch]: + patch = int(patch_list) + else: + major = int(patch_list) + else: + version_list = patch_list.split(".") + else: + version_list = version.split(".") + n = len(version_list) + try: + if 0 < n: + major = int(version_list[0]) + if 1 < n: + minor = int(version_list[1]) + if 2 < n: + update = int(version_list[2]) + except ValueError: + # if 1 == n: major = 0 + pass + else: + major = minor = update = patch = -1 + return major, minor, update, patch + + +def version_branch(max_strlen=-1): + version_filename = "version.txt" + filepath_default = os.path.realpath( + os.path.join( + os.path.dirname(inspect.getfile(inspect.currentframe())), + "..", + version_filename, + ) + ) + filepath_local = os.path.realpath(version_filename) # local version file + realversion, 
branch = version_branch_from_file(filepath_default) + version = realversion + out_of_tree = filepath_default != filepath_local + if out_of_tree and os.path.isfile(filepath_local): + local, ignored = version_branch_from_file(filepath_local) + if version_numbers(realversion) < version_numbers(local): + version = local + if 0 < max_strlen: + start = int(max_strlen / 3) + cut = max( + branch.rfind("-", start, max_strlen), + branch.rfind("_", start, max_strlen), + branch.rfind(".", start, max_strlen), + ) + if start < cut: + branch = branch[0:cut] + else: + branch = branch[0:max_strlen] + return (version, branch, realversion) + + +if __name__ == "__main__": + argc = len(sys.argv) + if 1 < argc: + arg1 = int(sys.argv[1]) + else: + arg1 = 0 + if -1 == arg1: + if 5 < argc: + # threshold = int(sys.argv[2]) + mnk_size = int(sys.argv[3]) + dims = load_mnklist(sys.argv[4:4 + mnk_size], 0, -1) + dims = load_mnklist(sys.argv[4 + mnk_size:], 0, -2, dims) + mnklist = map(lambda mnk: "_".join(map(str, mnk)), sorted(dims)) + print(" ".join(mnklist)) + elif 3 == argc: + major, minor, update, patch = ( + version_numbers(sys.argv[2], "release") + ) + print(["0", "1"][0 == patch]) + elif 0 <= arg1: + if 0 == arg1 and 3 == argc: + major, minor, update, patch = version_numbers(sys.argv[2]) + print(major) # soname version + else: + version, branch, realversion = version_branch() + major, minor, update, patch = version_numbers(version) + if 1 == arg1: + print(major) + elif 2 == arg1: + print(minor) + elif 3 == arg1: + print(update) + elif 4 == arg1: + print(patch) + elif "" != branch: + print("{}-{}".format(branch, realversion)) + else: + print(realversion) + else: + sys.tracebacklimit = 0 + raise ValueError( + "{}: wrong ({}) number of arguments ('{}') given!".format( + sys.argv[0], argc - 1, " ".join(sys.argv[1:])) + ) diff --git a/third_party/libxsmm/scripts/libxsmm_version.sh b/third_party/libxsmm/scripts/libxsmm_version.sh new file mode 100755 index 00000000..670b15d0 --- /dev/null +++ 
b/third_party/libxsmm/scripts/libxsmm_version.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env sh +############################################################################### +# Copyright (c) Intel Corporation - All rights reserved. # +# This file is part of the LIBXSMM library. # +# # +# For information on the license, see the LICENSE file. # +# Further information: https://github.com/hfp/libxsmm/ # +# SPDX-License-Identifier: BSD-3-Clause # +############################################################################### +# Hans Pabst (Intel Corp.) +############################################################################### +GIT=$(command -v git) + +SHIFT=0 +if [ "$1" ]; then + SHIFT=$1 +fi + +NAME=$(${GIT} rev-parse --abbrev-ref HEAD 2>/dev/null) +MAIN=$(${GIT} describe --tags --match "[0-9]*" --abbrev=0 2>/dev/null) + +if [ "${MAIN}" ]; then + VERSION="${NAME}-${MAIN}" + REVC=$(${GIT} rev-list --count --no-merges "${MAIN}"..HEAD 2>/dev/null) +else + VERSION=${NAME} + REVC=$(${GIT} rev-list --count --no-merges HEAD 2>/dev/null) +fi + +echo "${VERSION}-$((REVC+SHIFT))" diff --git a/third_party/libxsmm/src/libxsmm_cpuid_arm.c b/third_party/libxsmm/src/libxsmm_cpuid_arm.c new file mode 100644 index 00000000..d3c49592 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_cpuid_arm.c @@ -0,0 +1,96 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include +#include +#include +#include + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if defined(_MSC_VER) +# define LIBXSMM_CPUID_ARM_ENC16(OP0, OP1, CRN, CRM, OP2) ( \ + (((OP0) & 1) << 14) | \ + (((OP1) & 7) << 11) | \ + (((CRN) & 15) << 7) | \ + (((CRM) & 15) << 3) | \ + (((OP2) & 7) << 0)) +# define ID_AA64ISAR1_EL1 LIBXSMM_CPUID_ARM_ENC16(0b11, 0b000, 0b0000, 0b0110, 0b001) +# define ID_AA64PFR0_EL1 LIBXSMM_CPUID_ARM_ENC16(0b11, 0b000, 0b0000, 0b0100, 0b000) +# define LIBXSMM_CPUID_ARM_MRS(RESULT, ID) RESULT = _ReadStatusReg(ID) +#else +# define LIBXSMM_CPUID_ARM_MRS(RESULT, ID) __asm__ __volatile__( \ + "mrs %0," LIBXSMM_STRINGIFY(ID) : "=r"(RESULT)) +#endif + + +#if defined(LIBXSMM_PLATFORM_AARCH64) +LIBXSMM_APIVAR_DEFINE(jmp_buf internal_cpuid_arm_jmp_buf); + +LIBXSMM_API_INTERN void internal_cpuid_arm_sigill(int /*signum*/); +LIBXSMM_API_INTERN void internal_cpuid_arm_sigill(int signum) +{ + void (*const handler)(int) = signal(signum, internal_cpuid_arm_sigill); + LIBXSMM_ASSERT(SIGILL == signum); + if (SIG_ERR != handler) longjmp(internal_cpuid_arm_jmp_buf, 1); +} +#endif + + +LIBXSMM_API int libxsmm_cpuid_arm(libxsmm_cpuid_info* info) +{ + static int result = LIBXSMM_TARGET_ARCH_UNKNOWN; +#if defined(LIBXSMM_PLATFORM_AARCH64) +#if defined(__APPLE__) && defined(__arm64__) + result = LIBXSMM_AARCH64_V81; +#else + if (LIBXSMM_TARGET_ARCH_UNKNOWN == result) { /* avoid redetecting features */ + void (*const handler)(int) = signal(SIGILL, internal_cpuid_arm_sigill); + result = LIBXSMM_AARCH64_V81; + if (SIG_ERR != handler) { + uint64_t capability; /* 64-bit value */ + if (0 == setjmp(internal_cpuid_arm_jmp_buf)) { + LIBXSMM_CPUID_ARM_MRS(capability, ID_AA64ISAR1_EL1); + if (0xF & capability) { /* DPB */ + result = 
LIBXSMM_AARCH64_V82; + if (0 == setjmp(internal_cpuid_arm_jmp_buf)) { + LIBXSMM_CPUID_ARM_MRS(capability, ID_AA64PFR0_EL1); + if (0xF & (capability >> 32)) { /* SVE */ + result = LIBXSMM_AARCH64_A64FX; + } + } + } + } + /* restore original state */ + signal(SIGILL, handler); + } + if (NULL != info) LIBXSMM_MEMZERO127(info); + } +#endif +#else +# if !defined(NDEBUG) + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: libxsmm_cpuid_arm called on non-ARM platform!\n"); + } +# endif + if (NULL != info) LIBXSMM_MEMZERO127(info); +#endif + return result; +} diff --git a/third_party/libxsmm/src/libxsmm_cpuid_x86.c b/third_party/libxsmm/src/libxsmm_cpuid_x86.c new file mode 100644 index 00000000..6e90c0cb --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_cpuid_x86.c @@ -0,0 +1,336 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include +#include +#include +#if !defined(_WIN32) +# include +#endif + +#if defined(LIBXSMM_PLATFORM_X86) +/* XGETBV: receive results (EAX, EDX) for eXtended Control Register (XCR). */ +/* CPUID, receive results (EAX, EBX, ECX, EDX) for requested FUNCTION/SUBFN. 
*/ +#if defined(_MSC_VER) /*defined(_WIN32) && !defined(__GNUC__)*/ +# define LIBXSMM_XGETBV(XCR, EAX, EDX) { \ + unsigned long long libxsmm_xgetbv_ = _xgetbv(XCR); \ + EAX = (int)libxsmm_xgetbv_; \ + EDX = (int)(libxsmm_xgetbv_ >> 32); \ + } +# define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) { \ + int libxsmm_cpuid_x86_[/*4*/] = { 0, 0, 0, 0 }; \ + __cpuidex(libxsmm_cpuid_x86_, FUNCTION, SUBFN); \ + EAX = (unsigned int)libxsmm_cpuid_x86_[0]; \ + EBX = (unsigned int)libxsmm_cpuid_x86_[1]; \ + ECX = (unsigned int)libxsmm_cpuid_x86_[2]; \ + EDX = (unsigned int)libxsmm_cpuid_x86_[3]; \ + } +# elif defined(__GNUC__) || !defined(_CRAYC) +# if (64 > (LIBXSMM_BITS)) + LIBXSMM_EXTERN LIBXSMM_RETARGETABLE int __get_cpuid( /* prototype */ + unsigned int, unsigned int*, unsigned int*, unsigned int*, unsigned int*); +# define LIBXSMM_XGETBV(XCR, EAX, EDX) EAX = (EDX) = 0xFFFFFFFF +# define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) \ + EAX = (EBX) = (EDX) = 0; ECX = (SUBFN); \ + __get_cpuid(FUNCTION, &(EAX), &(EBX), &(ECX), &(EDX)) +# else /* 64-bit */ +# define LIBXSMM_XGETBV(XCR, EAX, EDX) __asm__ __volatile__( \ + ".byte 0x0f, 0x01, 0xd0" /*xgetbv*/ : "=a"(EAX), "=d"(EDX) : "c"(XCR) \ + ) +# define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) \ + __asm__ __volatile__ (".byte 0x0f, 0xa2" /*cpuid*/ \ + : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX) \ + : "a"(FUNCTION), "b"(0), "c"(SUBFN), "d"(0) \ + ) +# endif +# else /* legacy Cray Compiler */ +# define LIBXSMM_XGETBV(XCR, EAX, EDX) EAX = (EDX) = 0 +# define LIBXSMM_CPUID_X86(FUNCTION, SUBFN, EAX, EBX, ECX, EDX) EAX = (EBX) = (ECX) = (EDX) = 0 +# endif +#endif + +#define LIBXSMM_CPUID_CHECK(VALUE, CHECK) ((CHECK) == ((CHECK) & (VALUE))) + + +LIBXSMM_API int libxsmm_cpuid_x86(libxsmm_cpuid_info* info) +{ + static int result = LIBXSMM_TARGET_ARCH_UNKNOWN; +#if defined(LIBXSMM_PLATFORM_X86) + unsigned int eax, ebx, ecx, edx; + LIBXSMM_CPUID_X86(0, 0/*ecx*/, eax, ebx, ecx, edx); + if (1 <= eax) 
{ /* CPUID max. leaf */ + /* avoid redetecting features but redetect on request (info given) */ + if (LIBXSMM_TARGET_ARCH_UNKNOWN == result || NULL != info) { + int feature_cpu = LIBXSMM_X86_GENERIC, feature_os = LIBXSMM_X86_GENERIC, has_context = 0; + unsigned int maxleaf = eax; +# if defined(__linux__) + if (0 == libxsmm_se && LIBXSMM_TARGET_ARCH_UNKNOWN == result) { + FILE *const selinux = fopen("/sys/fs/selinux/enforce", "rb"); + if (NULL != selinux) { + if (1 == fread(&libxsmm_se, 1/*sizeof(char)*/, 1/*count*/, selinux)) { + libxsmm_se = ('0' != libxsmm_se ? 1 : 0); + } + else { /* conservative assumption in case of read-error */ + libxsmm_se = 1; + } + fclose(selinux); + } + } +# elif defined(MAP_JIT) + libxsmm_se = 1; +# endif + LIBXSMM_CPUID_X86(1, 0/*ecx*/, eax, ebx, ecx, edx); + if (LIBXSMM_CPUID_CHECK(ecx, 0x00000001)) { /* SSE3(0x00000001) */ + if (LIBXSMM_CPUID_CHECK(ecx, 0x00100000)) { /* SSE42(0x00100000) */ + if (LIBXSMM_CPUID_CHECK(ecx, 0x10000000)) { /* AVX(0x10000000) */ + if (LIBXSMM_CPUID_CHECK(ecx, 0x00001000)) { /* FMA(0x00001000) */ + unsigned int ecx2; + LIBXSMM_CPUID_X86(7, 0/*ecx*/, eax, ebx, ecx2, edx); + /* AVX512F(0x00010000), AVX512CD(0x10000000) */ + if (LIBXSMM_CPUID_CHECK(ebx, 0x10010000)) { /* Common */ + /* AVX512DQ(0x00020000), AVX512BW(0x40000000), AVX512VL(0x80000000) */ + if (LIBXSMM_CPUID_CHECK(ebx, 0xC0020000)) { /* AVX512-Core */ + if (LIBXSMM_CPUID_CHECK(ecx2, 0x00000800)) { /* VNNI */ + unsigned int edx2; /* we need to save edx for AMX check */ +# if 0 /* no check required yet */ + unsigned int ecx3; + LIBXSMM_CPUID_X86(7, 1/*ecx*/, eax, ebx, ecx3, edx); +# else + LIBXSMM_CPUID_X86(7, 1/*ecx*/, eax, ebx, ecx2, edx2); +# endif + if (LIBXSMM_CPUID_CHECK(eax, 0x00000020)) { /* BF16 */ + feature_cpu = LIBXSMM_X86_AVX512_CPX; + if (LIBXSMM_CPUID_CHECK(edx, 0x03400000)) { /* AMX-TILE, AMX-INT8, AMX-BF16 */ + feature_cpu = LIBXSMM_X86_AVX512_SPR; + } + } + else feature_cpu = LIBXSMM_X86_AVX512_CLX; /* CLX */ + } + else 
feature_cpu = LIBXSMM_X86_AVX512_CORE; /* SKX */ + } + /* AVX512PF(0x04000000), AVX512ER(0x08000000) */ + else if (LIBXSMM_CPUID_CHECK(ebx, 0x0C000000)) { /* AVX512-MIC */ + if (LIBXSMM_CPUID_CHECK(edx, 0x0000000C)) { /* KNM */ + feature_cpu = LIBXSMM_X86_AVX512_KNM; + } + else feature_cpu = LIBXSMM_X86_AVX512_MIC; /* KNL */ + } + else feature_cpu = LIBXSMM_X86_AVX512; /* AVX512-Common */ + } + else feature_cpu = LIBXSMM_X86_AVX2; + } + else feature_cpu = LIBXSMM_X86_AVX; + } + else feature_cpu = LIBXSMM_X86_SSE42; + } + else feature_cpu = LIBXSMM_X86_SSE3; + } +# if !defined(LIBXSMM_INTRINSICS_DEBUG) + LIBXSMM_ASSERT_MSG(LIBXSMM_STATIC_TARGET_ARCH <= LIBXSMM_MAX(LIBXSMM_X86_GENERIC, feature_cpu), "missed detecting ISA extensions"); + /* coverity[dead_error_line] */ + if (LIBXSMM_STATIC_TARGET_ARCH > feature_cpu) feature_cpu = LIBXSMM_STATIC_TARGET_ARCH; +# endif + /* XSAVE/XGETBV(0x04000000), OSXSAVE(0x08000000) */ + if (LIBXSMM_CPUID_CHECK(ecx, 0x0C000000)) { /* OS SSE support */ + feature_os = LIBXSMM_MIN(LIBXSMM_X86_SSE42, feature_cpu); + if (LIBXSMM_X86_AVX <= feature_cpu) { + LIBXSMM_XGETBV(0, eax, edx); + if (LIBXSMM_CPUID_CHECK(eax, 0x00000006)) { /* OS XSAVE 256-bit */ + feature_os = LIBXSMM_MIN(LIBXSMM_X86_AVX2, feature_cpu); + if (LIBXSMM_CPUID_CHECK(eax, 0x000000E0)) { /* OS XSAVE 512-bit */ + feature_os = LIBXSMM_MIN(LIBXSMM_X86_AVX512_CPX, feature_cpu); + if (LIBXSMM_X86_AVX512_SPR <= feature_cpu && 7 <= maxleaf + && LIBXSMM_CPUID_CHECK(eax, 0x00060000)) /* OS XSAVE 512-bit */ + { + feature_os = feature_cpu; /* unlimited AMX */ + } + } + } + } + } + else if (LIBXSMM_X86_GENERIC <= feature_cpu) { + /* assume FXSAVE, which should be fine + * 16 years after the first x86_64 OS + */ + feature_os = LIBXSMM_X86_SSE42; + } + else feature_os = LIBXSMM_TARGET_ARCH_GENERIC; + has_context = (LIBXSMM_STATIC_TARGET_ARCH >= feature_cpu || feature_os >= feature_cpu) ? 
1 : 0; + if (LIBXSMM_TARGET_ARCH_UNKNOWN == result && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ +# if !defined(LIBXSMM_TARGET_ARCH) + const int target_vlen32 = libxsmm_cpuid_vlen32(feature_cpu); + const char *const compiler_support = (libxsmm_cpuid_vlen32(LIBXSMM_MAX_STATIC_TARGET_ARCH) < target_vlen32 + ? "" : (((2 <= libxsmm_verbosity || 0 > libxsmm_verbosity) && LIBXSMM_MAX_STATIC_TARGET_ARCH < feature_cpu) + ? "highly " : NULL)); + if (NULL != compiler_support) { + const char *const name = libxsmm_cpuid_name( /* exclude MIC when running on Core processors */ + (((LIBXSMM_X86_AVX512_MIC == LIBXSMM_MAX_STATIC_TARGET_ARCH) || + (LIBXSMM_X86_AVX512_KNM == LIBXSMM_MAX_STATIC_TARGET_ARCH)) && (LIBXSMM_X86_AVX512_CORE <= feature_cpu)) + ? LIBXSMM_X86_AVX2 : LIBXSMM_MAX_STATIC_TARGET_ARCH); + fprintf(stderr, "LIBXSMM WARNING: %soptimized non-JIT code paths are limited to \"%s\"!\n", compiler_support, name); + } +# endif +# if !defined(NDEBUG) && defined(__OPTIMIZE__) + fprintf(stderr, "LIBXSMM WARNING: library is optimized without -DNDEBUG and contains debug code!\n"); +# endif +# if !defined(__APPLE__) || !defined(__MACH__) /* permitted features */ + if (0 == has_context) { + fprintf(stderr, "LIBXSMM WARNING: detected CPU features are not permitted by the OS!\n"); + if (0 == libxsmm_se) { + fprintf(stderr, "LIBXSMM WARNING: downgraded code generation to supported features!\n"); + } + } +# endif + } + /* macOS is faulting AVX-512 (on-demand larger state) */ + result = feature_cpu; +# if !defined(__APPLE__) || !defined(__MACH__) +# if 0 /* opportunistic */ + if (0 == libxsmm_se) +# endif + { /* only permitted features */ + result = LIBXSMM_MIN(feature_cpu, feature_os); + } +# endif + if (NULL != info) { + LIBXSMM_CPUID_X86(0x80000007, 0/*ecx*/, eax, ebx, ecx, edx); + info->constant_tsc = LIBXSMM_CPUID_CHECK(edx, 0x00000100); + info->has_context = has_context; + } + } + } + else { + if (NULL != info) LIBXSMM_MEMZERO127(info); + result = 
LIBXSMM_X86_GENERIC; + } +#else +# if !defined(NDEBUG) + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: libxsmm_cpuid_x86 called on non-x86 platform!\n"); + } +# endif + if (NULL != info) LIBXSMM_MEMZERO127(info); +#endif + return result; +} + + +LIBXSMM_API int libxsmm_cpuid(void) +{ +#if defined(LIBXSMM_PLATFORM_X86) + return libxsmm_cpuid_x86(NULL/*info*/); +#else + return libxsmm_cpuid_arm(NULL/*info*/); +#endif +} + + +/** + * This implementation also accounts for non-x86 platforms, + * which not only allows to resolve any given ID but to + * fallback gracefully ("unknown"). + */ +LIBXSMM_API const char* libxsmm_cpuid_name(int id) +{ + const char* target_arch = NULL; + switch (id) { + case LIBXSMM_X86_AVX512_SPR: { + target_arch = "spr"; + } break; + case LIBXSMM_X86_AVX512_CPX: { + target_arch = "cpx"; + } break; + case LIBXSMM_X86_AVX512_CLX: { + target_arch = "clx"; + } break; + case LIBXSMM_X86_AVX512_CORE: { + target_arch = "skx"; + } break; + case LIBXSMM_X86_AVX512_KNM: { + target_arch = "knm"; + } break; + case LIBXSMM_X86_AVX512_MIC: { + target_arch = "knl"; + } break; + case LIBXSMM_X86_AVX512: { + /* TODO: rework BE to use target ID instead of set of strings (target_arch = "avx3") */ + target_arch = "hsw"; + } break; + case LIBXSMM_X86_AVX2: { + target_arch = "hsw"; + } break; + case LIBXSMM_X86_AVX: { + target_arch = "snb"; + } break; + case LIBXSMM_X86_SSE42: { + target_arch = "wsm"; + } break; + case LIBXSMM_X86_SSE3: { + target_arch = "sse3"; + } break; + case LIBXSMM_AARCH64_V81: { + target_arch = "aarch64"; + } break; + case LIBXSMM_AARCH64_A64FX: { + target_arch = "a64fx"; + } break; + case LIBXSMM_TARGET_ARCH_GENERIC: { + target_arch = "generic"; + } break; + default: if (LIBXSMM_X86_GENERIC <= id) { + target_arch = "x86_64"; + } + else { + target_arch = "unknown"; + } + } + 
LIBXSMM_ASSERT(NULL != target_arch); + return target_arch; +} + + +/** + * This implementation also accounts for non-x86 platforms, + * which not only allows to resolve any given ID but to + * fallback gracefully (scalar). + */ +LIBXSMM_API int libxsmm_cpuid_vlen32(int id) +{ + int result; +#if defined(LIBXSMM_PLATFORM_X86) + if (LIBXSMM_X86_AVX512 <= id) { + result = 16; + } + else if (LIBXSMM_X86_AVX <= id) { + result = 8; + } + else if (LIBXSMM_X86_SSE42 <= id) { + result = 4; + } + else +#elif defined(LIBXSMM_PLATFORM_AARCH64) + if (LIBXSMM_AARCH64_V81 == id) { + result = 4; + } + else if (LIBXSMM_AARCH64_A64FX == id) { + result = 16; + } + else +#else + LIBXSMM_UNUSED(id); +#endif + { /* scalar */ + result = 1; + } + return result; +} diff --git a/third_party/libxsmm/src/libxsmm_diff.h b/third_party/libxsmm/src/libxsmm_diff.h new file mode 100644 index 00000000..fed7b82e --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_diff.h @@ -0,0 +1,144 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DIFF_H +#define LIBXSMM_DIFF_H + +#include + +#if !defined(LIBXSMM_DIFF_AVX512_ENABLED) && 0 +# define LIBXSMM_DIFF_AVX512_ENABLED +#endif + +#define LIBXSMM_DIFF_4_DECL(A) const uint32_t */*const*/ A = NULL +#define LIBXSMM_DIFF_4_ASSIGN(A, B) (A) = (B) +#define LIBXSMM_DIFF_4_LOAD(A, SRC) A = (const uint32_t*)(SRC) +#define LIBXSMM_DIFF_4(A, B, ...) 
((unsigned char)(0 != (*(A) ^ (*(const uint32_t*)(B))))) + +#define LIBXSMM_DIFF_8_DECL(A) const uint64_t */*const*/ A = NULL +#define LIBXSMM_DIFF_8_ASSIGN(A, B) (A) = (B) +#define LIBXSMM_DIFF_8_LOAD(A, SRC) A = (const uint64_t*)(SRC) +#define LIBXSMM_DIFF_8(A, B, ...) ((unsigned char)(0 != (*(A) ^ (*(const uint64_t*)(B))))) + +#define LIBXSMM_DIFF_SSE_DECL(A) __m128i A = LIBXSMM_INTRINSICS_MM_UNDEFINED_SI128() +#define LIBXSMM_DIFF_SSE_ASSIGN(A, B) (A) = (B) +#define LIBXSMM_DIFF_SSE_LOAD(A, SRC) A = LIBXSMM_INTRINSICS_LOADU_SI128((const __m128i*)(SRC)) +#define LIBXSMM_DIFF_SSE(A, B, ...) ((unsigned char)(0xFFFF != _mm_movemask_epi8(_mm_cmpeq_epi8( \ + A, LIBXSMM_INTRINSICS_LOADU_SI128((const __m128i*)(B)))))) + +#if (LIBXSMM_X86_GENERIC <= LIBXSMM_STATIC_TARGET_ARCH) /*|| defined(LIBXSMM_INTRINSICS_TARGET)*/ +# define LIBXSMM_DIFF_16_DECL LIBXSMM_DIFF_SSE_DECL +# define LIBXSMM_DIFF_16_ASSIGN LIBXSMM_DIFF_SSE_ASSIGN +# define LIBXSMM_DIFF_16_LOAD LIBXSMM_DIFF_SSE_LOAD +# define LIBXSMM_DIFF_16 LIBXSMM_DIFF_SSE +#else +# define LIBXSMM_DIFF_16_DECL(A) const uint64_t */*const*/ A = NULL +# define LIBXSMM_DIFF_16_ASSIGN(A, B) (A) = (B) +# define LIBXSMM_DIFF_16_LOAD(A, SRC) A = (const uint64_t*)(SRC) +# define LIBXSMM_DIFF_16(A, B, ...) ((unsigned char)(0 != (((A)[0] ^ (*(const uint64_t*)(B))) | \ + ((A)[1] ^ ((const uint64_t*)(B))[1])))) +#endif + +#define LIBXSMM_DIFF_AVX2_DECL(A) __m256i A = LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256() +#define LIBXSMM_DIFF_AVX2_ASSIGN(A, B) (A) = (B) +#define LIBXSMM_DIFF_AVX2_LOAD(A, SRC) A = _mm256_loadu_si256((const __m256i*)(SRC)) +#define LIBXSMM_DIFF_AVX2(A, B, ...) 
((unsigned char)(-1 != _mm256_movemask_epi8(_mm256_cmpeq_epi8( \ + A, _mm256_loadu_si256((const __m256i*)(B)))))) + +#if (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_DIFF_32_DECL LIBXSMM_DIFF_AVX2_DECL +# define LIBXSMM_DIFF_32_ASSIGN LIBXSMM_DIFF_AVX2_ASSIGN +# define LIBXSMM_DIFF_32_LOAD LIBXSMM_DIFF_AVX2_LOAD +# define LIBXSMM_DIFF_32 LIBXSMM_DIFF_AVX2 +#else +# define LIBXSMM_DIFF_32_DECL(A) LIBXSMM_DIFF_16_DECL(A); LIBXSMM_DIFF_16_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _)) +# define LIBXSMM_DIFF_32_ASSIGN(A, B) LIBXSMM_DIFF_16_ASSIGN(A, B); LIBXSMM_DIFF_16_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_32_, B, _)) +# define LIBXSMM_DIFF_32_LOAD(A, SRC) LIBXSMM_DIFF_16_LOAD(A, SRC); LIBXSMM_DIFF_16_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), (const uint64_t*)(SRC) + 2) +# define LIBXSMM_DIFF_32(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_16(A, B, __VA_ARGS__) ? 1 : LIBXSMM_DIFF_16(LIBXSMM_CONCATENATE3(libxsmm_diff_32_, A, _), (const uint64_t*)(B) + 2, __VA_ARGS__))) +#endif + +#define LIBXSMM_DIFF_48_DECL(A) LIBXSMM_DIFF_16_DECL(A); LIBXSMM_DIFF_32_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _)) +#define LIBXSMM_DIFF_48_ASSIGN(A, B) LIBXSMM_DIFF_16_ASSIGN(A, B); LIBXSMM_DIFF_32_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_48_, B, _)) +#define LIBXSMM_DIFF_48_LOAD(A, SRC) LIBXSMM_DIFF_16_LOAD(A, SRC); LIBXSMM_DIFF_32_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), (const uint64_t*)(SRC) + 2) +#define LIBXSMM_DIFF_48(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_16(A, B, __VA_ARGS__) ? 
1 : LIBXSMM_DIFF_32(LIBXSMM_CONCATENATE3(libxsmm_diff_48_, A, _), (const uint64_t*)(B) + 2, __VA_ARGS__))) + +#define LIBXSMM_DIFF_64SW_DECL(A) LIBXSMM_DIFF_32_DECL(A); LIBXSMM_DIFF_32_DECL(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _)) +#define LIBXSMM_DIFF_64SW_ASSIGN(A, B) LIBXSMM_DIFF_32_ASSIGN(A, B); LIBXSMM_DIFF_32_ASSIGN(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), LIBXSMM_CONCATENATE3(libxsmm_diff_64_, B, _)) +#define LIBXSMM_DIFF_64SW_LOAD(A, SRC) LIBXSMM_DIFF_32_LOAD(A, SRC); LIBXSMM_DIFF_32_LOAD(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), (const uint64_t*)(SRC) + 4) +#define LIBXSMM_DIFF_64SW(A, B, ...) ((unsigned char)(0 != LIBXSMM_DIFF_32(A, B, __VA_ARGS__) ? 1 : LIBXSMM_DIFF_32(LIBXSMM_CONCATENATE3(libxsmm_diff_64_, A, _), (const uint64_t*)(B) + 4, __VA_ARGS__))) + +#if defined(LIBXSMM_DIFF_AVX512_ENABLED) +# define LIBXSMM_DIFF_AVX512_DECL(A) __m512i A = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32() +# define LIBXSMM_DIFF_AVX512_ASSIGN(A, B) (A) = (B) +# define LIBXSMM_DIFF_AVX512_LOAD(A, SRC) A = _mm512_loadu_si512((const __m512i*)(SRC)) +# define LIBXSMM_DIFF_AVX512(A, B, ...) 
((unsigned char)(0xFFFF != (unsigned int)/*_cvtmask16_u32*/(_mm512_cmpeq_epi32_mask( \ + A, _mm512_loadu_si512((const __m512i*)(B)))))) +#else +# define LIBXSMM_DIFF_AVX512_DECL LIBXSMM_DIFF_64SW_DECL +# define LIBXSMM_DIFF_AVX512_ASSIGN LIBXSMM_DIFF_64SW_ASSIGN +# define LIBXSMM_DIFF_AVX512_LOAD LIBXSMM_DIFF_64SW_LOAD +# define LIBXSMM_DIFF_AVX512 LIBXSMM_DIFF_64SW +#endif + +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) +# define LIBXSMM_DIFF_64_DECL LIBXSMM_DIFF_AVX512_DECL +# define LIBXSMM_DIFF_64_ASSIGN LIBXSMM_DIFF_AVX512_ASSIGN +# define LIBXSMM_DIFF_64_LOAD LIBXSMM_DIFF_AVX512_LOAD +# define LIBXSMM_DIFF_64 LIBXSMM_DIFF_AVX512 +#else +# define LIBXSMM_DIFF_64_DECL LIBXSMM_DIFF_64SW_DECL +# define LIBXSMM_DIFF_64_ASSIGN LIBXSMM_DIFF_64SW_ASSIGN +# define LIBXSMM_DIFF_64_LOAD LIBXSMM_DIFF_64SW_LOAD +# define LIBXSMM_DIFF_64 LIBXSMM_DIFF_64SW +#endif + +#define LIBXSMM_DIFF_DECL(N, A) LIBXSMM_CONCATENATE3(LIBXSMM_DIFF_, N, _DECL)(A) +#define LIBXSMM_DIFF_LOAD(N, A, SRC) LIBXSMM_CONCATENATE3(LIBXSMM_DIFF_, N, _LOAD)(A, SRC) +#define LIBXSMM_DIFF(N) LIBXSMM_CONCATENATE(LIBXSMM_DIFF_, N) + +#define LIBXSMM_DIFF_N(TYPE, RESULT, DIFF, A, BN, ELEMSIZE, STRIDE, HINT, N) { \ + const char* libxsmm_diff_b_ = (const char*)(BN) + (size_t)(HINT) * (STRIDE); \ + for (RESULT = (HINT); (RESULT) < (N); ++(RESULT)) { \ + if (0 == DIFF(A, libxsmm_diff_b_, ELEMSIZE)) break; \ + libxsmm_diff_b_ += (STRIDE); \ + } \ + if ((N) == (RESULT)) { /* wrong hint */ \ + TYPE libxsmm_diff_r_ = 0; \ + libxsmm_diff_b_ = (const char*)(BN); /* reset */ \ + for (; libxsmm_diff_r_ < (HINT); ++libxsmm_diff_r_) { \ + if (0 == DIFF(A, libxsmm_diff_b_, ELEMSIZE)) { \ + RESULT = libxsmm_diff_r_; \ + break; \ + } \ + libxsmm_diff_b_ += (STRIDE); \ + } \ + } \ +} + + +/** Function type representing the diff-functionality. */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE unsigned int (*libxsmm_diff_function)( + const void* /*a*/, const void* /*b*/, ... 
/*size*/); + +/** Compare two data blocks of 4 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_4(const void* a, const void* b, ...); +/** Compare two data blocks of 8 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_8(const void* a, const void* b, ...); +/** Compare two data blocks of 16 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_16(const void* a, const void* b, ...); +/** Compare two data blocks of 32 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_32(const void* a, const void* b, ...); +/** Compare two data blocks of 48 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_48(const void* a, const void* b, ...); +/** Compare two data blocks of 64 Byte each. */ +LIBXSMM_API unsigned char libxsmm_diff_64(const void* a, const void* b, ...); + +#endif /*LIBXSMM_DIFF_H*/ + diff --git a/third_party/libxsmm/src/libxsmm_dnn.c b/third_party/libxsmm/src/libxsmm_dnn.c new file mode 100644 index 00000000..4627a34c --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn.c @@ -0,0 +1,759 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(_OPENMP) +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + + +LIBXSMM_API_INTERN void libxsmm_dnn_init(int target_arch) +{ + LIBXSMM_UNUSED(target_arch); +} + + +LIBXSMM_API_INTERN void libxsmm_dnn_finalize(void) +{ +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_get_feature_map_blocks( int C, int K, int* C_block, int* K_block, int* fm_lp_block, libxsmm_dnn_datatype datatype_in, libxsmm_dnn_datatype datatype_out ) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + int ifmblock = 0; + int ofmblock = 0; + int lp_block = 0; + int tmp_max_c_block = 64; + int tmp_max_k_block = 64; + int tmp_block = 0; + + /* init libxsmm */ + LIBXSMM_INIT + + /* C */ + if ( ((libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR) && (datatype_in == LIBXSMM_DNN_DATATYPE_BF16)) || + (libxsmm_target_archid < LIBXSMM_X86_AVX512 ) ) { + tmp_max_c_block = 32; + } else if ( libxsmm_target_archid == LIBXSMM_AARCH64_V81 ) { + tmp_max_c_block = 16; + } + if ( C < tmp_max_c_block ) { + ifmblock = C; + } else { + for ( tmp_block = 1; tmp_block <= tmp_max_c_block; tmp_block *= 2 ) { + if ( C % tmp_block == 0 ) ifmblock = tmp_block; + } + } + + /* K */ + if ( ((libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR) && (datatype_in == LIBXSMM_DNN_DATATYPE_BF16)) || + (libxsmm_target_archid < LIBXSMM_X86_AVX512 ) ) { + tmp_max_k_block = 32; + } else if ( libxsmm_target_archid == LIBXSMM_AARCH64_V81 ) { + tmp_max_k_block = 16; + } + if ( K < tmp_max_k_block ) { + ofmblock = K; + } else { + for ( tmp_block = 1; tmp_block <= tmp_max_k_block; tmp_block *= 2 ) { + if ( K % tmp_block == 0 ) ofmblock = tmp_block; + } + } + + /* when do we need VNNI format? 
*/ + if ( (datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + lp_block = 1; + } else if ( (datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + lp_block = 2; + } else if ( (datatype_in == LIBXSMM_DNN_DATATYPE_I16) && ((datatype_out == LIBXSMM_DNN_DATATYPE_I32) || (datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { + lp_block = 2; + } else if (datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + lp_block = 4; + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + + *C_block = ifmblock; + *K_block = ofmblock; + *fm_lp_block = lp_block; + + return status; +} + + +LIBXSMM_API const char* libxsmm_dnn_get_error(libxsmm_dnn_err_t code) +{ + switch (code) { + case LIBXSMM_DNN_SUCCESS: + return "LIBXSMM DNN Success!"; + case LIBXSMM_DNN_WARN_FALLBACK: + return "LIBXSMM DNN Warning: Falling back to naive code as target is currently not supported by LIBXSMM!"; + case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING: + return "LIBXSMM DNN Warning: RNN cell suboptimal minibatch blocking!"; + case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING: + return "LIBXSMM DNN Warning: RNN cell suboptimal input feature blocking!"; + case LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING: + return "LIBXSMM DNN Warning: RNN cell suboptimal output feature blocking!"; + case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING: + return "LIBXSMM DNN Warning: FC layer suboptimal minibatch blocking!"; + case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING: + return "LIBXSMM DNN Warning: FC layer suboptimal input feature blocking!"; + case LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING: + return "LIBXSMM DNN Warning: FC layer suboptimal output feature blocking!"; + case LIBXSMM_DNN_ERR_GENERAL: + return "LIBXSMM DNN Error: General error occurred!"; + case LIBXSMM_DNN_ERR_CREATE_HANDLE: + return "LIBXSMM DNN Error: Handle creation failed!"; + case LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE: + return "LIBXSMM DNN Error: Requested datatype is not 
available!"; + case LIBXSMM_DNN_ERR_INVALID_BLOCKING: + return "LIBXSMM DNN Error: Requested Input/Output buffer size cannot be blocked!"; + case LIBXSMM_DNN_ERR_INVALID_HANDLE: + return "LIBXSMM DNN Error: An invalid handle was provided!"; + case LIBXSMM_DNN_ERR_DATA_NOT_BOUND: + return "LIBXSMM DNN Error: Not all required sources and destinations have been bound to convolution!"; + case LIBXSMM_DNN_ERR_CREATE_TENSOR: + return "LIBXSMM DNN Error: Tensor creation failed!"; + case LIBXSMM_DNN_ERR_INVALID_TENSOR: + return "LIBXSMM DNN Error: Invalid tensor was specified!"; + case LIBXSMM_DNN_ERR_MISMATCH_TENSOR: + return "LIBXSMM DNN Error: Tensor doesn't match handle it should be bind to!"; + case LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR: + return "LIBXSMM DNN Error: Invalid handle or tensor!"; + case LIBXSMM_DNN_ERR_INVALID_KIND: + return "LIBXSMM DNN Error: Invalid convolution kind!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW: + return "LIBXSMM DNN Error: NCHW format is currently not natively supported by LIBXSMM!"; + case LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT: + return "LIBXSMM DNN Error: Unsupported destination format when copying data!"; + case LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT: + return "LIBXSMM DNN Error: Unsupported source format when copying data!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE: + return "LIBXSMM DNN Error: Unsupported format when requesting a convolution!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS: + return "LIBXSMM DNN Error: KCRS format is currently not natively supported by LIBXSMM!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL: + return "LIBXSMM DNN Error: Invalid format was specified!"; + case LIBXSMM_DNN_ERR_CREATE_LAYOUT: + return "LIBXSMM DNN Error: Layout creation failed!"; + case LIBXSMM_DNN_ERR_INVALID_LAYOUT: + return "LIBXSMM DNN Error: Invalid layout was specified!"; + case LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH: + return "LIBXSMM DNN Error: Unsupported architecture!"; + case LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED: + return 
"LIBXSMM DNN Error: scratch binding failed as scratch was not allocated!"; + case LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE: + return "LIBXSMM DNN Error: an unknown tensor type was provided!"; + case LIBXSMM_DNN_ERR_INVALID_ALGO: + return "LIBXSMM DNN Error: Invalid algorithm was specified!"; + case LIBXSMM_DNN_ERR_INVALID_PADDING: + return "LIBXSMM DNN Error: Invalid padding was specified!"; + case LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL: + return "LIBXSMM DNN Error: time steps should be >= 2 for RNN/LSTM!"; + case LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS: + return "LIBXSMM DNN Error: failed to create internal layout arrays!"; + case LIBXSMM_DNN_ERR_NOT_IMPLEMENTED: + return "LIBXSMM DNN Error: the requested functionality is right now not implemented!"; + case LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER: + return "LIBXSMM DNN Error: the requested order of fusion in batch norm is right now not implemented!"; + case LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION: + return "LIBXSMM DNN Error: the requested fusion in batch norm is right now not implemented!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN: + return "LIBXSMM DNN Error: Unsupported format when requesting a fused batch norm!"; + case LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING: + return "LIBXSMM DNN Error: Unsupported pooling operations was requested!"; + case LIBXSMM_DNN_ERR_INVALID_FORMAT_FC: + return "LIBXSMM DNN Error: Unsupported format when requesting a fullyconnected layer!"; + case LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN: + return "LIBXSMM DNN Error: max sequence length is shorter than sequence length we attempt to set!"; + case LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER: + return "LIBXSMM DNN Error: the requested order of fusion in group norm is right now not implemented!"; + case LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION: + return "LIBXSMM DNN Error: the requested fusion in group norm is right now not implemented!"; + case LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION: + return "LIBXSMM DNN Error: the requested fusion in 
fullyconnected is right now not implemented!"; + default: + return "LIBXSMM DNN Error: Unknown error or warning occurred!"; + } +} + + +LIBXSMM_API size_t libxsmm_dnn_typesize(libxsmm_dnn_datatype datatype) +{ + switch (datatype) { + case LIBXSMM_DNN_DATATYPE_F32: return 4; + case LIBXSMM_DNN_DATATYPE_I32: return 4; + case LIBXSMM_DNN_DATATYPE_BF16: return 2; + case LIBXSMM_DNN_DATATYPE_I16: return 2; + case LIBXSMM_DNN_DATATYPE_I8: return 1; + /* no error expected as enumeration really arrives at an enum; compiler-checked */ + default: return 1; + } +} + + +LIBXSMM_API size_t libxsmm_dnn_get_simd_width(libxsmm_dnn_datatype datatype) +{ + size_t l_cl_width_bytes; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( libxsmm_target_archid == LIBXSMM_X86_GENERIC || + libxsmm_target_archid == LIBXSMM_X86_SSE3 || + libxsmm_target_archid == LIBXSMM_X86_SSE42 ) { + l_cl_width_bytes = 16; + } else if ( libxsmm_target_archid == LIBXSMM_X86_AVX2 || + libxsmm_target_archid == LIBXSMM_X86_AVX ) { + l_cl_width_bytes = 32; + } else { + l_cl_width_bytes = 64; + } + + return l_cl_width_bytes/libxsmm_dnn_typesize(datatype); +} + +LIBXSMM_API_INLINE float libxsmm_internal_get_max( float* in_buffer, int length ) { + float absmax_value = LIBXSMM_ABS(in_buffer[0]); + int i = 0; +#ifdef _OPENMP + LIBXSMM_OMP_VAR(i); +# pragma omp parallel private(i) + { + float my_absmax_value = absmax_value; +# pragma omp for + for (i = 0; i < length; ++i ) { + if (LIBXSMM_ABS(in_buffer[i]) > my_absmax_value) { + my_absmax_value = LIBXSMM_ABS(in_buffer[i]); + } + } +# pragma omp critical + { + if (my_absmax_value > absmax_value) { + absmax_value = my_absmax_value; + } + } + } +#else + for (i = 1; i < length; ++i ) { + if (LIBXSMM_ABS(in_buffer[i]) > absmax_value) { + absmax_value = LIBXSMM_ABS(in_buffer[i]); + } + } +#endif + + return absmax_value; +} + + +LIBXSMM_API_INLINE unsigned char libxsmm_internal_get_max_exp( float* in_buffer, int length ) { + libxsmm_intfloat val_exp; + unsigned char max_exp = 0; + 
+ /* bit-wise conversion to int */ + val_exp.f = libxsmm_internal_get_max( in_buffer, length ); + /* shift by mantissa to the right and convert to char */ + max_exp = (unsigned char)((val_exp.ui & LIBXSMM_DNN_MASK_ABS_F32) >> LIBXSMM_DNN_MANT_SZ_F32); + + return max_exp; +} + + +LIBXSMM_API_INLINE short libxsmm_internal_quantize_scalar_no_scf( float input, unsigned char max_exp, unsigned char add_shift, int round_mode ) { + libxsmm_intfloat value; + unsigned int qvalue = 0; + unsigned int mant = 0; + unsigned int sign = 0; + unsigned char rhs = 0; + unsigned char exp_off = 0; + + /* init libxsmm */ + LIBXSMM_INIT + + /* in case of zero we don't need to do anything */ + if (LIBXSMM_FEQ(input, 0)) { + qvalue = 0; + } else { + /* let's get a float copy to work on */ + /* vinp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( in_buffer[i] ); */ + value.f = input; + /* let's compute the offset of the current exp at pos i from max offset, we need to mask the sign bit though */ + /*__m512i vexp = _mm512_cvtps_epi32(_mm512_getexp_ps (vinp)); + __m512i vexp_off = _mm512_sub_epi32(maxexpf, vexp);*/ + exp_off = (unsigned char)(max_exp - ((value.ui & LIBXSMM_DNN_MASK_ABS_F32) >> LIBXSMM_DNN_MANT_SZ_F32)); + /* cut out mantissa and set leading bit */ + /*__m512i mmask = _mm512_set1_epi32(LIBXSMM_DNN_MASK_MANT_F32); + __m512i vmant = _mm512_or_epi32(_mm512_set1_epi32(0x1 << LIBXSMM_DNN_MANT_SZ_F32), _mm512_and_epi32( _mm512_castps_si512( vinp ), mmask));*/ + mant = ((0x1 << LIBXSMM_DNN_MANT_SZ_F32) | (value.ui & LIBXSMM_DNN_MASK_MANT_F32)); + /* extract sign */ + /* __mmask16 smask = _mm512_cmplt_ps_mask (inp, _mm512_set1_ps(0)); */ + sign = ((value.ui & LIBXSNN_DNN_MASK_SIGN_F32) >> (LIBXSMM_DNN_SZ_F32-1)); + /* calculate rhs, be aware of the now explicit leading bit, @TODO add DFP8/4 */ + rhs = (unsigned char)((LIBXSMM_DNN_MANT_SZ_F32+1) - LIBXSMM_DNN_MANT_DFP16 + exp_off + add_shift); + /* some safety, to generate 0 when we fall off quant region, @TODO issue a LIBXSMM WARNING: that we 
shifted out the entire mantissa */ + if (rhs > (LIBXSMM_DNN_MANT_SZ_F32+1)) { + rhs = (LIBXSMM_DNN_MANT_SZ_F32+1); + } + /* finally shift the value into the region we need, this is now a 15-add_rhs bit number for the max value in in_buffer */ + qvalue = (mant >> rhs); + /* handle sign, 2 complement */ + if ( (sign > 0) && (qvalue > 0) ) { + qvalue = (~qvalue + 1); + } + + if (round_mode == LIBXSMM_DNN_QUANT_BIAS_ROUND) { + /* biased rounding towards next bigger number */ + /* first let's determine in the original number if we need a bias rounding, @TODO need fix for F64 */ + int bias_needed = (mant & (0x3 << (rhs-2))); + /* apply bias */ + if (bias_needed > 0) { + qvalue++; + } + } else if (round_mode == LIBXSMM_DNN_QUANT_NEAREST_ROUND) { + int nearest_needed = (mant & (0x1 << (rhs-1))); + /* apply rounding */ + if ((nearest_needed > 0) && (rhs > 1)) { + qvalue++; + } + } else if (round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND) { + /* stochastic rounding, as implemented in the IBM paper from 2015, @TODO, fix F64 and DFP8 */ + const float eps = LIXSMMM_DNN_RES_DFP16; + /* coverity[dont_call] */ + const float r = (float)rand(); + libxsmm_intfloat fvalue; + float p, q; + /* masking all bits which will be shifted out */ + fvalue.ui = value.ui & ((LIBXSMM_DNN_MASK_FULL_F32) << rhs); + /* drawing a random number */ + p = r/((float)RAND_MAX); + q = (input - fvalue.f)/eps; + /* apply rounding if needed */ + if ((p + q) > 0.5f) { + ++qvalue; + } + } else { + /* do nothing about rounding, just chop */ + } + } + + return (short)qvalue; +} + + +/* @TODO make this routine aware of any int type */ +LIBXSMM_API void libxsmm_dnn_quantize( float* in_buffer, short* out_buffer, int length, unsigned char add_shift, unsigned char* scf, int round_mode ) { + int i = 0; + + /* init libxsmm */ + LIBXSMM_INIT + + /* in case we are using FP-Mul based quantization we use a different path for now + @TODO let's unify the paths by using the similar vectorization for both */ + if ( round_mode == 
LIBXSMM_DNN_QUANT_FPHW_ROUND ) { + const float max_value = libxsmm_internal_get_max( in_buffer, length ); + int maxexp = 0; + /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ + float scfq = LIBXSMM_FREXPF(max_value, &maxexp); + maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); + scfq = libxsmm_sexp2_i8i(-maxexp); + +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + if ( length % 16 == 0 ) { + __m512 vscfq = _mm512_set1_ps(scfq); +#ifdef _OPENMP +# pragma omp parallel for private(i) +#endif + for (i = 0; i < length; i+=16 ) { + _mm256_stream_si256( (__m256i *)&(out_buffer[i]), LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &(in_buffer[i]), vscfq ) ); + } + } else { +#endif +#ifdef _OPENMP +# pragma omp parallel for private(i) +#endif + for (i = 0; i < length; ++i ) { + out_buffer[i] = (short)LIBXSMM_ROUNDF(in_buffer[i] * scfq); + } +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + } +#endif + /* @TODO, we need to potentially fix this unsigned char problem */ +#if !defined(NDEBUG) /* library code is expected to be mute */ + if (maxexp > 0) { + fprintf(stderr, "error quant fil\n"); + } +#endif + *scf = (unsigned char)(-maxexp); + } else { + /* get max exponent */ + unsigned char max_exp = libxsmm_internal_get_max_exp( in_buffer, length ); + + /* if we go for stochastic rounding, let's initialize random seed */ + if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { + srand(libxsmm_timer_tick() % ((unsigned int)-1)); + } + +#ifdef _OPENMP +# pragma omp parallel for private(i) +#endif + for (i = 0; i < length; ++i ) { + out_buffer[i] = libxsmm_internal_quantize_scalar_no_scf( in_buffer[i], max_exp, add_shift, round_mode ); + } + + *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); + } +} + + +LIBXSMM_API void libxsmm_dnn_quantize_act( float* in_buffer, short* out_buffer, unsigned int N, unsigned int C, unsigned int H, unsigned int W, unsigned int cblk_f32, unsigned int cblk_i16, unsigned int lp_blk, unsigned char 
add_shift, unsigned char* scf, int round_mode ) { + LIBXSMM_VLA_DECL(5, const float, in, in_buffer, C/cblk_f32, H, W, cblk_f32); + LIBXSMM_VLA_DECL(6, short, out, out_buffer, C/(cblk_i16*lp_blk), H, W, cblk_i16, lp_blk); + const unsigned int cblk = C/(cblk_i16*lp_blk); + int i1 = 0, i2 = 0, i3 = 0, i4 = 0, i5, i6; + + /* init libxsmm */ + LIBXSMM_INIT + + /* some quick and dirty checks */ + assert((C % cblk_f32) == 0); + assert((C % cblk_i16) == 0); + + /* in case we are using FP-Mul based quantization we use a different path for now + @TODO let's unify the paths by using the similar vectorization for both */ + if ( round_mode == LIBXSMM_DNN_QUANT_FPHW_ROUND ) { + const float max_value = libxsmm_internal_get_max( in_buffer, N*C*H*W ); + int maxexp = 0; + /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ + float scfq = LIBXSMM_FREXPF(max_value, &maxexp); + maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); + scfq = libxsmm_sexp2_i8i(-maxexp); + +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + if ( (cblk_f32 == 16) && (cblk_i16*lp_blk == 16) ) { + __m512 vscfq = _mm512_set1_ps(scfq); +#ifdef _OPENMP + LIBXSMM_OMP_VAR(i1); +# pragma omp parallel for private(i1) +#endif + for (i1 = 0; i1 < (int)(N*C*H*W); i1 += 16 ) { + _mm256_stream_si256( (__m256i *)&(out_buffer[i1]), LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( &(in_buffer[i1]), vscfq ) ); + } + } else { +#endif +#ifdef _OPENMP + LIBXSMM_OMP_VAR(i1); LIBXSMM_OMP_VAR(i2); LIBXSMM_OMP_VAR(i3); LIBXSMM_OMP_VAR(i4); LIBXSMM_OMP_VAR(i5); LIBXSMM_OMP_VAR(i6); +# pragma omp parallel for private(i1, i2, i3, i4, i5, i6) LIBXSMM_OPENMP_COLLAPSE(4) +#endif + for (i1 = 0; i1 < (int)N; ++i1 ) { + for (i2 = 0; i2 < (int)cblk; ++i2 ) { + for (i3 = 0; i3 < (int)H; ++i3 ) { + for (i4 = 0; i4 < (int)W; ++i4 ) { + for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { + for (i6 = 0; i6 < (int)lp_blk; ++i6 ) { + const int fi1 = i1; + const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)/cblk_f32; + const int 
fi3 = i3; + const int fi4 = i4; + const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)%cblk_f32; + LIBXSMM_VLA_ACCESS(6, out, i1, i2, i3, i4, i5, i6, cblk, H, W, cblk_i16, lp_blk) = (short)LIBXSMM_ROUNDF( + LIBXSMM_VLA_ACCESS(5, in, fi1, fi2, fi3, fi4, fi5, C / cblk_f32, H, W, cblk_f32) * scfq); + } + } + } + } + } + } +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + } +#endif + /* @TODO, we need to potentially fix this unsigned char problem */ +#if !defined(NDEBUG) /* library code is expected to be mute */ + if (maxexp > 0) { + fprintf(stderr, "error quant act\n"); + } +#endif + *scf = (unsigned char)(-maxexp); + } else { + /* get max exponent */ + unsigned char max_exp = libxsmm_internal_get_max_exp( in_buffer, N*C*H*W ); + + /* if we go for stochastic rounding, let's initialize random seed */ + if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { + srand(libxsmm_timer_tick() % ((unsigned int)-1)); + } + +#ifdef _OPENMP +# pragma omp parallel for private(i1, i2, i3, i4, i5, i6) LIBXSMM_OPENMP_COLLAPSE(4) +#endif + for (i1 = 0; i1 < (int)N; ++i1 ) { + for (i2 = 0; i2 < (int)cblk; ++i2 ) { + for (i3 = 0; i3 < (int)H; ++i3 ) { + for (i4 = 0; i4 < (int)W; ++i4 ) { + for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { + for (i6 = 0; i6 < (int)lp_blk; ++i6 ) { + const int fi1 = i1; + const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)/cblk_f32; + const int fi3 = i3; + const int fi4 = i4; + const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i6)%cblk_f32; + LIBXSMM_VLA_ACCESS(6, out, i1, i2, i3, i4, i5, i6, cblk, H, W, cblk_i16, lp_blk) = libxsmm_internal_quantize_scalar_no_scf( + LIBXSMM_VLA_ACCESS(5, in, fi1, fi2, fi3, fi4, fi5, C / cblk_f32, H, W, cblk_f32), max_exp, add_shift, round_mode); + } + } + } + } + } + } + + *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); + } +} + + +LIBXSMM_API void libxsmm_dnn_quantize_fil( float* in_buffer, short* out_buffer, unsigned int K, unsigned int C, unsigned int R, unsigned int S, unsigned int cblk_f32, unsigned int 
cblk_i16, unsigned int kblk_f32, unsigned int kblk_i16, unsigned int lp_blk, unsigned char add_shift, unsigned char* scf, int round_mode ) { + LIBXSMM_VLA_DECL(6, const float, in, in_buffer, C/cblk_f32, R, S, cblk_f32, kblk_f32); + LIBXSMM_VLA_DECL(7, short, out, out_buffer, C/(cblk_i16*lp_blk), R, S, cblk_i16, kblk_i16, lp_blk); + unsigned int cblk = C/(cblk_i16*lp_blk); + unsigned int kblk = K/kblk_i16; + int i1 = 0, i2 = 0, i3 = 0, i4 = 0, i5, i6, i7; + + /* some quick and dirty checks */ + assert((C % cblk_f32) == 0); + assert((C % (cblk_i16*lp_blk)) == 0); + assert((K % kblk_f32) == 0); + assert((K % kblk_i16) == 0); + assert((lp_blk % 2) == 0); + + /* init libxsmm */ + LIBXSMM_INIT + + /* in case we are using FP-Mul based quantization we use a different path for now + @TODO let's unify the paths by using the similar vectorization for both */ + if ( round_mode == LIBXSMM_DNN_QUANT_FPHW_ROUND ) { + const float max_value = libxsmm_internal_get_max( in_buffer, K*C*R*S ); + int maxexp = 0; + /* take return value of LIBXSMM_FREXPF to mute static analysis issue */ + float scfq = LIBXSMM_FREXPF(max_value, &maxexp); + maxexp -= (15/*LIBXSMM_DNN_MANT_DFP16?*/ - add_shift); + scfq = libxsmm_sexp2_i8i(-maxexp); + +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + if ( (kblk_f32 == 16) && (cblk_f32 == 16) && (kblk_i16 == 16) && (cblk_i16*lp_blk == 16) ) { + const __m512 vscfq = _mm512_set1_ps(scfq); + const __m512i permute_compact_idx = _mm512_set_epi32(15,14,13,12,7,6,5,4,11,10,9,8,3,2,1,0); +#ifdef _OPENMP +# pragma omp parallel for private(i1, i2, i3, i4, i5) LIBXSMM_OPENMP_COLLAPSE(4) +#endif + for (i1 = 0; i1 < (int)kblk; ++i1 ) { + for (i2 = 0; i2 < (int)cblk; ++i2 ) { + for (i3 = 0; i3 < (int)R; ++i3 ) { + for (i4 = 0; i4 < (int)S; ++i4 ) { + for (i5 = 0; i5 < 16; i5+=2 ) { + __m256i even_ch = LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( + &LIBXSMM_VLA_ACCESS(6, in, i1, i2, i3, i4, i5 + 0, 0, C / cblk_f32, R, S, cblk_f32, kblk_f32), vscfq); + __m256i 
odd_ch = LIBXSMM_INTRINSICS_MM512_QUANTIZE_NEAR_PS_EPI16( + &LIBXSMM_VLA_ACCESS(6, in, i1, i2, i3, i4, i5 + 1, 0, C / cblk_f32, R, S, cblk_f32, kblk_f32), vscfq); + __m256i compressed_lo = _mm256_unpacklo_epi16(even_ch, odd_ch); + __m256i compressed_hi = _mm256_unpackhi_epi16(even_ch, odd_ch); + __m512i compact = _mm512_inserti64x4( _mm512_setzero_si512(), compressed_lo, 0); + compact = _mm512_inserti64x4(compact, compressed_hi, 1); + compact = _mm512_permutexvar_epi32(permute_compact_idx, compact); + LIBXSMM_INTRINSICS_MM512_STREAM_SI512( + (void*)&LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5 / 2, 0, 0, cblk, R, S, cblk_i16, kblk_i16, lp_blk), + compact); + } + } + } + } + } + } else { +#endif +#ifdef _OPENMP + LIBXSMM_OMP_VAR(i1); LIBXSMM_OMP_VAR(i2); LIBXSMM_OMP_VAR(i3); LIBXSMM_OMP_VAR(i4); LIBXSMM_OMP_VAR(i5); LIBXSMM_OMP_VAR(i6); LIBXSMM_OMP_VAR(i7); +# pragma omp parallel for private(i1, i2, i3, i4, i5, i6, i7) LIBXSMM_OPENMP_COLLAPSE(4) +#endif + for (i1 = 0; i1 < (int)kblk; ++i1 ) { + for (i2 = 0; i2 < (int)cblk; ++i2 ) { + for (i3 = 0; i3 < (int)R; ++i3 ) { + for (i4 = 0; i4 < (int)S; ++i4 ) { + for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { + for (i6 = 0; i6 < (int)kblk_i16; ++i6 ) { + for (i7 = 0; i7 < (int)lp_blk; ++i7 ) { + const int fi1 = ((i1*kblk_i16)+i6)/kblk_f32; + const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)/cblk_f32; + const int fi3 = i3; + const int fi4 = i4; + const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)%cblk_f32; + const int fi6 = ((i1*kblk_i16)+i6)%kblk_f32; + LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5, i6, i7, cblk, R, S, cblk_i16, kblk_i16, lp_blk) = (short)LIBXSMM_ROUNDF( + LIBXSMM_VLA_ACCESS(6, in, fi1, fi2, fi3, fi4, fi5, fi6, C / cblk_f32, R, S, cblk_f32, kblk_f32) * scfq); + } + } + } + } + } + } + } +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + } +#endif + /* @TODO, we need to potentially fix this unsigned char problem */ +#if !defined(NDEBUG) /* library code is expected to be mute */ + if (maxexp > 0) { 
+ fprintf(stderr, "error quant fil\n"); + } +#endif + *scf = (unsigned char)(-maxexp); + } else { + /* get max exponent */ + unsigned char max_exp = libxsmm_internal_get_max_exp( in_buffer, K*C*R*S ); + + /* if we go for stochastic rounding, let's initialize random seed */ + if ( round_mode == LIBXSMM_DNN_QUANT_STOCH_ROUND ) { + srand(libxsmm_timer_tick() % ((unsigned int)-1)); + } + +#ifdef _OPENMP +# pragma omp parallel for private(i1, i2, i3, i4, i5, i6, i7) LIBXSMM_OPENMP_COLLAPSE(4) +#endif + for (i1 = 0; i1 < (int)kblk; ++i1 ) { + for (i2 = 0; i2 < (int)cblk; ++i2 ) { + for (i3 = 0; i3 < (int)R; ++i3 ) { + for (i4 = 0; i4 < (int)S; ++i4 ) { + for (i5 = 0; i5 < (int)cblk_i16; ++i5 ) { + for (i6 = 0; i6 < (int)kblk_i16; ++i6 ) { + for (i7 = 0; i7 < (int)lp_blk; ++i7 ) { + const int fi1 = ((i1*kblk_i16)+i6)/kblk_f32; + const int fi2 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)/cblk_f32; + const int fi3 = i3; + const int fi4 = i4; + const int fi5 = ((i2*cblk_i16*lp_blk)+(i5*lp_blk)+i7)%cblk_f32; + const int fi6 = ((i1*kblk_i16)+i6)%kblk_f32; + LIBXSMM_VLA_ACCESS(7, out, i1, i2, i3, i4, i5, i6, i7, cblk, R, S, cblk_i16, kblk_i16, lp_blk) = libxsmm_internal_quantize_scalar_no_scf( + LIBXSMM_VLA_ACCESS(6, in, fi1, fi2, fi3, fi4, fi5, fi6, C / cblk_f32, R, S, cblk_f32, kblk_f32), max_exp, add_shift, round_mode); + } + } + } + } + } + } + } + + *scf = (unsigned char)(14 - add_shift - (max_exp - 127)); + } +} + + +LIBXSMM_API void libxsmm_dnn_dequantize( short* in_buffer, float* out_buffer, int length, unsigned char scf ) { + const float val_exp = libxsmm_sexp2_i8i(-scf); + int i = 0; + +#ifdef _OPENMP +# pragma omp parallel for private(i) +#endif + for ( i = 0; i < length; ++i ) { + out_buffer[i] = ((float)in_buffer[i])*val_exp; + } +} + + +LIBXSMM_API void libxsmm_truncate_convert_f32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int length) { + unsigned int i = 0; + + /* truncate buffer to bf16 */ + for ( i = 0; i < length; ++i ) { + libxsmm_bfloat16_hp t; + + 
t.f = in[i]; + out[i] = t.i[1]; + } +} + + +LIBXSMM_API void libxsmm_rnaz_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len) { + unsigned int i = 0; + + /* truncate buffer to bf16 */ + for ( i = 0; i < len; ++i ) { + unsigned int int_round = 0; + unsigned int do_round = 1; + + int_round = *((unsigned int*)&(in[i])); + + /* we don't round NaN and inf */ + if ( (int_round & 0x7f800000) == 0x7f800000 ) { + do_round = 0; + } + + /* perform round nearest tie away from zero */ + if ( do_round != 0 ) { + int_round = int_round + 0x00008000; + } + + /* create the bf16 value by shifting out the lower 16bits */ + int_round = int_round >> 16; + + out[i] = (libxsmm_bfloat16)int_round; + } +} + + +LIBXSMM_API void libxsmm_rne_convert_fp32_bf16(const float* in, libxsmm_bfloat16* out, unsigned int len) { + unsigned int i = 0; + + /* truncate buffer to bf16 */ + for ( i = 0; i < len; ++i ) { + unsigned int int_round = 0; + unsigned int do_round = 1; + + int_round = *((unsigned int*)&(in[i])); + + /* we don't round NaN and inf */ + if ( (int_round & 0x7f800000) == 0x7f800000 ) { + do_round = 0; + } + + /* perform round nearest tie even */ + if ( do_round != 0 ) { + unsigned int fixup = (int_round >> 16) & 1; + int_round = int_round + 0x00007fff + fixup; + } + + /* create the bf16 value by shifting out the lower 16bits */ + int_round = int_round >> 16; + + out[i] = (unsigned short)int_round; + } +} + + +LIBXSMM_API void libxsmm_convert_bf16_f32(const libxsmm_bfloat16* in, float* out, unsigned int length) { + unsigned int i = 0; + + /* up-convert is super simple */ + for ( i = 0; i < length; ++i ) { + libxsmm_bfloat16_hp t; + + t.i[1] = in[i]; + t.i[0] = 0; + out[i] = t.f; + } +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution.c b/third_party/libxsmm/src/libxsmm_dnn_convolution.c new file mode 100644 index 00000000..2ba07679 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution.c @@ -0,0 +1,2747 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke, Evangelos Georganas, Rajkishore Barik (Intel Corp.) +******************************************************************************/ +#include +#include "libxsmm_main.h" +#include "libxsmm_dnn_convolution_forward.h" +#include "libxsmm_dnn_convolution_backward.h" +#include "libxsmm_dnn_convolution_weight_update.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(_OPENMP) +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#define MIXED 0 +#define KHWC 1 +#define HWKC 2 +#define CHWK 3 +#define HWCK 4 + +/**********************************************************/ +/* Helper functions for convolutions' general param setup */ +/**********************************************************/ +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_ifmblock( libxsmm_dnn_layer* handle ) { + int result = 1; + int ofm, lp; + + libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &result, &ofm, &lp, handle->desc.datatype_in, handle->desc.datatype_out ); + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_ofmblock( libxsmm_dnn_layer* handle ) { + int result = 1; + int ifm, lp; + + libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &ifm, &result, &lp, handle->desc.datatype_in, handle->desc.datatype_out ); + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fm_lp_block( libxsmm_dnn_layer* handle ) { + int 
result = 1; + int ifm, ofm; + + libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, &ifm, &ofm, &result, handle->desc.datatype_in, handle->desc.datatype_out ); + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fallback_loops_fwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* FIXME: For now fallback only if MB is not divisible by number of threads */ + if (handle->desc.N % handle->desc.threads != 0) { + result = 1; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksifm( libxsmm_dnn_layer* handle ) { + int result = handle->desc.C / handle->ifmblock; + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksofm( libxsmm_dnn_layer* handle ) { + int result = handle->desc.K / handle->ofmblock; + return result; +} + +/**********************************************************/ +/* Helper functions for FWD convolutions' parameter setup */ +/**********************************************************/ +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_ofw_rb( libxsmm_dnn_layer* handle ) { + int result = 0; + result = handle->ofw; + if (handle->ofw == 56) { + result = 28; + } + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + if (handle->ofw % 2 == 0) { + result = handle->ofw/2; + } + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_fwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Pack only for small images and when having large K to amortize, and we can only pack for 1x1 convolutions */ + if ((handle->ofw <= 14) && (handle->desc.K > 512) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.u == 2) && (handle->desc.v == 2)) { + result = 1; + } + + /* For SPR we allow packing more aggressively to generate more efficient BRGEMMs */ + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || 
(handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + if ((handle->ofw <= 14) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.u == 2) && (handle->desc.v == 2)) { + result = 1; + } + } + + /* Make sure we don't pack when minibatch is not divisible by number of threads since H is used potentially for parallelism */ + if (handle->desc.N != handle->desc.threads) { + result = 0; + } + /* we don't pack for int8 */ + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + result = 0; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_ofh_rb( libxsmm_dnn_layer* handle ) { + int result = 1; + /* Multiple rows for "small" images and 1x1 convolutions */ + if ((handle->ofh <= 14) && (handle->desc.R == 1) && (handle->desc.S == 1)) { + result = handle->ofh; + } + + /* In this case we will be using fallback generic loops, thus ofh_rb should be 1 */ + if ((handle->desc.N % handle->desc.threads != 0) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) { + result = 1; + } + + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + if (handle->ofw == 7 && handle->ofh == 7 && handle->desc.R == 3 && handle->desc.S == 3) { + result = 7; + } + if (handle->ofw == 14 && handle->ofh == 14 /*&& handle->desc.R == 3 && handle->desc.S == 3*/) { + result = 2; + } + } + + /* Make sure we don't use multiple rows when we don't pack input and convolutions are strided*/ + if ((handle->pack_input == 0) && ((handle->desc.u !=1 ) || (handle->desc.v != 1))) { + result = 1; + } + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_pixels_gemm( libxsmm_dnn_layer* handle ) { + int result = handle->fwd_ofw_rb * handle->fwd_ofh_rb; + /* In the case below we calculate redundantly pixels in order to efficiently use AMX */ + if ((handle->target_archid == 
LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + if (handle->desc.R != 1 || handle->desc.R != 1) { + if (handle->ofw < 24) { + result = (handle->fwd_ofw_rb+2*handle->desc.pad_w) * (handle->fwd_ofh_rb-2) + 2 * (handle->fwd_ofw_rb+handle->desc.pad_w); + } + } + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_block_H( libxsmm_dnn_layer* handle ) { + int result = 14; + + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + /* Spatial dimension block tuning for SPR */ + if ((handle->ofh == 7 && handle->desc.u == 2) || (handle->ofh == 14 && handle->desc.R != 3 ) || handle->ofh == 27 || (handle->ofh == 28 && handle->desc.R == 1) || handle->ofh == 48 || handle->ofh == 54 || handle->ofh == 56 || handle->ofh == 112 ) { + result = 4; + } + } else { + /* Block H only for large images */ + if (handle->ofh >= 28) { + result = 4; + } + if (handle->ofh == 28 && handle->desc.R == 3 ) { + result = 14; + } + } + /* Make sure it is divisible bu the ofh_rb factor in the kernel */ + while ( result % handle->fwd_ofh_rb != 0 ) { + result--; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksifm_blocking( libxsmm_dnn_layer* handle ) { + int result = 1; + /* For 1x1 Convolutions bring in kernel all IFMs unless filters are huge*/ + if ((handle->desc.R == 1) && (handle->desc.S == 1) ) { + result = handle->blocksifm; + if ((handle->desc.C >= 2048) && (handle->desc.K >= 512)) { + result = 1; + } + if ( (handle->target_archid < LIBXSMM_X86_AVX512) && (handle->desc.C >= 512) ) { + result = 2; + } + if ( (handle->target_archid < LIBXSMM_X86_AVX512) && (handle->desc.C >= 1024) ) { + result = 4; + } + } else { + result = 1; + /* If 
small image can bring in more IFMS even if NOT 1x1 convolution */ + if (handle->ofw <= 7) { + result = 2; + } + } + if (handle->blocksifm % result != 0) { + result = 1; + } + + /* In case of SPR bring always in all accumulation */ + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8))) { + result = handle->blocksifm; + } + + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + result = handle->blocksifm; + } + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_fwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Switch to loop order 1 only if 1x1 convolution with "large" input image and "small" K */ + if ((handle->desc.H >= 28) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.C >=512) && (handle->desc.K <=512)) { + result = 1; + } + if (handle->ofw == 56 && handle->desc.R == 1 && handle->desc.C == 256 && handle->desc.K == 64 ) { + result = 1; + } + if (handle->ofw == 28 && handle->desc.R == 1) { + result = 1; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_fwd_IFM( libxsmm_dnn_layer* handle ) { + int result = 8; + if (handle->ofw == 7 && handle->desc.C == 2048 && handle->desc.K == 512) { + result = 4; + } + /* Make sure it is divisible by ifms in the kernel */ + while (result % handle->blocksifm_blocking != 0) { + result++; + } + result = LIBXSMM_MIN(handle->blocksifm, result); + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_fwd_OFM( libxsmm_dnn_layer* handle ) { + int result = 8; + if (handle->ofw == 14 && handle->desc.K == 1024) { + result = 16; + } + if (handle->ofw == 7) { + result = 16; + } + result = LIBXSMM_MIN(handle->blocksofm, result); + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_ofm_parallelization( libxsmm_dnn_layer* handle ) { + 
int result = 0; +#if 0 + /* Use "hybrid" minibatch/ofm parallelization if we have huge filters */ + if ((handle->desc.R >= 3) && (handle->desc.S >= 3) && (handle->desc.C >= 512) && (handle->desc.K >= 512)) { + result = 1; + } +#endif + if ((handle->ofw <= 7) && (handle->desc.C == 1024) && (handle->desc.K == 512)) { + result = 1; + } + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8))) { + if (handle->ofw == 7) { + result = 1; + } + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Avoid rim FMA if the convolution is 3x3 (non-strided) and the image is "small" */ + if ((handle->desc.R == 3) && (handle->desc.S == 3) && + (handle->desc.u == 1) && (handle->desc.v == 1) && + (handle->desc.pad_h_in == 1) && (handle->desc.pad_w_in == 1) && + (handle->desc.H == handle->desc.W) ) { + if (handle->ofw <= 28) { + result = 1; + } + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + result = 0; + } + } + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8))) { + result = 0; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_shuffle_filter_accesses( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Shuffle filter accesses only if "pure minibatch" parallelization and large filters are involved */ + if ((handle->use_ofm_parallelization == 0) && (handle->desc.C > 512) && (handle->desc.K > 512)) { + result = 1; + } + if (handle->ofw == 7 && handle->desc.R == 3 && handle->desc.C == 512) { + result = 1; + } + if (handle->ofw == 7 && handle->desc.R == 1 && handle->desc.C == 512 && handle->desc.K == 2048) { + result = 1; + } + if 
(handle->ofw == 7 && handle->desc.R == 1 && handle->desc.C == 2048 && handle->desc.K == 512) { + result = 1; + } + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + result = 0; + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_acc_load( libxsmm_dnn_layer* handle ) { + int result = 0; + if ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) { + if ((handle->desc.R == 1) && (handle->desc.S == 1)) { + if (handle->blocksifm_blocking == handle->blocksifm) { + result = 1; + } + } else { + if ((handle->blocksifm_blocking == handle->blocksifm) && (handle->avoid_fmas_in_rim == 0)) { + result = 1; + } + } + } + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_fwd_gemm_flags( libxsmm_dnn_layer* handle ) { + int result = 0; + +#if defined(LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS) + /* If large image and NOT already loaded in accumulators, tnen use streaming stores */ + if ((handle->ofw >= 56) && (handle->desc.K >= 256) && (handle->avoid_acc_load == 1) && (handle->desc.R == 1) && (handle->desc.S == 1)) { + result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; + } + if (handle->ofw == 56 && handle->desc.C == 64 && handle->desc.K == 64 && handle->desc.R == 1) { + result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; + } + if (handle->ofw == 56 && handle->desc.C == 256 && handle->desc.K == 64 && handle->desc.R == 1) { + result = LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; + } + /* Disable since the GEMM output is going to f32 scratch */ + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 || handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) { + result = 0; + } +#else + LIBXSMM_UNUSED(handle); +#endif + + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || 
(handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8))) { + result = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + } + + return result; +} + +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fwd_padding_copy( libxsmm_dnn_layer* handle ) { + int result = 0; + if ( (handle->desc.pad_h != handle->desc.pad_h_in) && (handle->desc.pad_w != handle->desc.pad_w_in) ) { + result = 1; + } + return result; +} + +LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_fwd_scratch( libxsmm_dnn_layer* handle ) { + handle->fwd_packing_padding_scratch_size = 0; + /* packing of input */ + if ( handle->pack_input != 0 ) { + handle->fwd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * + handle->desc.H/handle->desc.u * + handle->desc.W/handle->desc.v * + libxsmm_dnn_typesize(handle->datatype_in); + } + /* logical padding with copying in the fly */ + if ( handle->fwd_padding_copy != 0 ) { + handle->fwd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * + (handle->desc.H + 2*handle->desc.pad_h) * + (handle->desc.W + 2*handle->desc.pad_w) * + libxsmm_dnn_typesize(handle->datatype_in); + } + /* output buffer in high precision when we use BF16 */ + if ( ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) || + ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8 ) ) { + handle->fwd_lp_output_full_scratch_size = (size_t) LIBXSMM_MAX(handle->desc.threads * handle->fwd_gemm_pixels * handle->ofmblock * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32), handle->desc.N * handle->desc.K * handle->ofwp * handle->ofhp * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32)); + handle->fwd_lp_output_block_scratch_size = (size_t)handle->desc.threads * handle->fwd_ofw_rb * + handle->fwd_ofh_rb * handle->ofmblock * + libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); + } else { + handle->fwd_lp_output_full_scratch_size = 0; + handle->fwd_lp_output_block_scratch_size = 0; + } + /* align sizes to full cacheline */ + 
handle->fwd_packing_padding_scratch_size += ( handle->fwd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->fwd_packing_padding_scratch_size % LIBXSMM_CACHELINE); + handle->fwd_lp_output_full_scratch_size += ( handle->fwd_lp_output_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->fwd_lp_output_full_scratch_size % LIBXSMM_CACHELINE); + handle->fwd_lp_output_block_scratch_size += ( handle->fwd_lp_output_block_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->fwd_lp_output_block_scratch_size % LIBXSMM_CACHELINE); + + /* set offsets */ + handle->fwd_packing_padding_scratch_offset = 0; + handle->fwd_lp_output_full_scratch_offset = handle->fwd_packing_padding_scratch_size; + handle->fwd_lp_output_block_scratch_offset = handle->fwd_lp_output_full_scratch_offset + + handle->fwd_lp_output_full_scratch_size; + + /* set overall scratch size for forward */ + handle->fwd_scratch_size = handle->fwd_packing_padding_scratch_size + + handle->fwd_lp_output_full_scratch_size + + handle->fwd_lp_output_block_scratch_size; +} + +/**********************************************************/ +/* Helper functions for BWD convolutions' parameter setup */ +/**********************************************************/ +LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_fallback_loops_bwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* FIXME: Fallback if MB is not divisible by number of threads */ + if (handle->desc.N % handle->desc.threads != 0) { + result = 1; + } + if (handle->desc.R == 1 && handle->desc.S == 1 && (handle->desc.pad_h != 0 || handle->desc.pad_w != 0)) { + result = 1; + } + if ((handle->desc.R > 1 && handle->desc.pad_h == 0) || (handle->desc.S > 1 && handle->desc.pad_w == 0)) { + result = 1; + } + if ((handle->desc.R > 1 && (handle->desc.pad_h_out == 0 || handle->desc.pad_h_in == 0)) || + (handle->desc.S > 1 && (handle->desc.pad_w_out == 0 || handle->desc.pad_w_in == 
0)) ) { + result = 1; + } + /* filter extent > 1 combined with a stride > 1 in the same dimension */ if ((handle->desc.R > 1 && handle->desc.u > 1) || (handle->desc.S > 1 && handle->desc.v > 1)) { + result = 1; + } + return result; +} + +/* BWD reuses the FWD choice of the output-width register block. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_ofw_rb( libxsmm_dnn_layer* handle ) { + int result = libxsmm_dnn_convolution_setup_fwd_ofw_rb(handle); + return result; +} + +/* BWD reuses the FWD choice of the output-height register block. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_ofh_rb( libxsmm_dnn_layer* handle ) { + int result = libxsmm_dnn_convolution_setup_fwd_ofh_rb(handle); + return result; +} + +/* Returns the number of pixels per BWD GEMM call: bwd_ofw_rb * bwd_ofh_rb by default, or a padded (redundant) count for non-1x1 filters with ofw < 24 when the input type is BF16/I8 and the SPR target check passes. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_pixels_gemm( libxsmm_dnn_layer* handle ) { + int result = handle->bwd_ofw_rb * handle->bwd_ofh_rb; + /* In the case below we calculate redundantly pixels in order to efficiently use AMX */ + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + /* any non-1x1 filter: test both filter extents (the second operand used to repeat R, so 1xS filters were missed) */ if (handle->desc.R != 1 || handle->desc.S != 1) { + if (handle->ofw < 24) { + result = (handle->bwd_ofw_rb+2*handle->desc.pad_w) * (handle->bwd_ofh_rb-2) + 2 * (handle->bwd_ofw_rb+handle->desc.pad_w); + } + } + } + return result; +} + +/* BWD reuses the FWD blocking of the H dimension. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_bwd_block_H( libxsmm_dnn_layer* handle ) { + int result = 0; + result = libxsmm_dnn_convolution_setup_fwd_block_H(handle); + return result; +} + +/* BWD reuses the FWD loop order. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_bwd( libxsmm_dnn_layer* handle ) { + int result = 0; + result = libxsmm_dnn_convolution_setup_loop_order_fwd(handle); + return result; +} + +/* Returns the IFM block count for BWD: blocksifm capped at 16. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_bwd_IFM( libxsmm_dnn_layer* handle ) { + int result = 0; + result = LIBXSMM_MIN(handle->blocksifm, 16); + return result; +} + +/* Returns the smallest multiple of blocksofm_blocking that is >= 8. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_bwd_OFM( libxsmm_dnn_layer* handle ) { + int result = 8; + while (result % handle->blocksofm_blocking != 0) { + 
result++; + } + return result; +} + +/* Returns 1 when the stride u is not 1 and bwd_ofh_rb is not 1, otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_bwd( libxsmm_dnn_layer* handle ) { + int result = 0; + if ((handle->desc.u != 1) && (handle->bwd_ofh_rb != 1)) { + result = 1; + } + return result; +} + +/* Returns 1 for output widths of at most 7 pixels, otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_ifm_parallelization( libxsmm_dnn_layer* handle ) { + int result = 0; + if (handle->ofw <= 7) { + result = 1; + } + return result; +} + +/* BWD reuses the FWD decision on avoiding FMAs in the rim. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_bwd( libxsmm_dnn_layer* handle ) { + int result = libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd(handle); + return result; +} + +/* Returns the OFM blocking factor: blocksofm for 1x1 filters, 2 for 3x3 filters on 7x7 outputs, 1 for other filters; BF16/I8 input with the SPR target check passing forces blocksofm. The result is reset to 1 if it does not divide blocksofm. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_blocksofm_blocking( libxsmm_dnn_layer* handle ) { + int result = 0; + if (handle->desc.R == 1 && handle->desc.S == 1) { + result = handle->blocksofm; + } else { + result = 1; + if (handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 7 && handle->ofw == 7) { + result = 2; + } + } + + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + result = handle->blocksofm; + } + + /* keep the blocking an exact divisor of the number of OFM blocks */ if (handle->blocksofm % result != 0) { + result = 1; + } + return result; +} + +/* Returns the GEMM flags for BWD: both tile-config suppression flags for BF16/I8 input when the SPR target check passes, otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_bwd_gemm_flags( libxsmm_dnn_layer* handle ) { + int result = 0; + /* handle is read right below, so this marker is redundant here */ LIBXSMM_UNUSED( handle ); + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) && ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8)) ) { + result = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + } + return result; +} + +/* Returns 1 when either stride differs from 1 and bwd_ofh_rb is exactly 1, otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_spread_input_bwd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* handle is read right below, so this marker is redundant here */ LIBXSMM_UNUSED(handle); + if (((handle->desc.u != 1) || 
(handle->desc.v != 1)) && (handle->bwd_ofh_rb == 1)) { + result = 1; + } + return result; +} + +/* Returns 1 when the OVERWRITE option is set and all OFM blocks are handled in one go (blocksofm_blocking == blocksofm); for non-1x1 filters avoid_fmas_in_rim must additionally be 0. Otherwise returns 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_acc_load_bwd( libxsmm_dnn_layer* handle ) { + int result = 0; + if ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) { + if ((handle->desc.R == 1) && (handle->desc.S == 1)) { + if (handle->blocksofm_blocking == handle->blocksofm) { + result = 1; + } + } else { + if ((handle->blocksofm_blocking == handle->blocksofm) && (handle->avoid_fmas_in_rim == 0)) { + result = 1; + } + } + } + return result; +} + +/* Sizes the backward scratch memory: transposed filter, packing/padding buffer and, for BF16 input, an F32 input buffer. Each size is rounded up to LIBXSMM_CACHELINE, then offsets and bwd_scratch_size are derived. */ LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_bwd_scratch( libxsmm_dnn_layer* handle ) { + /* transpose of weights */ + handle->bwd_filter_trans_scratch_size = (size_t)handle->desc.C * handle->desc.K * + handle->desc.R * handle->desc.S * + libxsmm_dnn_typesize(handle->datatype_in); + + handle->bwd_packing_padding_scratch_size = 0; + /* packing of input */ + if ( handle->pack_input_bwd != 0 ) { + handle->bwd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * + handle->ofhp * handle->ofwp * + libxsmm_dnn_typesize(handle->datatype_in); + } + /* logical padding with copying on the fly; sized per thread and overwrites the packing size computed above */ + if ( handle->use_fallback_bwd_loops != 0 ) { + handle->bwd_packing_padding_scratch_size = (size_t)handle->desc.threads * handle->ifmblock * + (handle->desc.H + 2*handle->desc.pad_h) * + (handle->desc.W + 2*handle->desc.pad_w) * + libxsmm_dnn_typesize(handle->datatype_in); + } + /* input buffer in high precision (F32) when we use BF16 */ + if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) { + /* NOTE(review): the leading factors inside LIBXSMM_MAX are presumably multiplied as int before libxsmm_dnn_typesize() widens the product - confirm this cannot overflow for large shapes */ handle->bwd_lp_input_full_scratch_size = (size_t) LIBXSMM_MAX(handle->desc.threads * handle->bwd_gemm_pixels * handle->ifmblock * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32), handle->desc.N * handle->desc.C * handle->ifwp * handle->ifhp * libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32)); + /* logical padding with copying on the fly; for BF16 the padded copy is sized with F32 elements */ + if ( handle->use_fallback_bwd_loops != 0 ) { + 
handle->bwd_packing_padding_scratch_size = (size_t)handle->desc.threads * handle->ifmblock * + (handle->desc.H + 2*handle->desc.pad_h) * + (handle->desc.W + 2*handle->desc.pad_w) * + libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); + } + } else { + handle->bwd_lp_input_full_scratch_size = 0; + } + /* align sizes to full cacheline */ + handle->bwd_filter_trans_scratch_size += ( handle->bwd_filter_trans_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->bwd_filter_trans_scratch_size % LIBXSMM_CACHELINE); + handle->bwd_packing_padding_scratch_size += ( handle->bwd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->bwd_packing_padding_scratch_size % LIBXSMM_CACHELINE); + handle->bwd_lp_input_full_scratch_size += ( handle->bwd_lp_input_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->bwd_lp_input_full_scratch_size % LIBXSMM_CACHELINE); + + /* set offsets: the sub-buffers follow each other in the order filter transpose, packing/padding, full input */ + handle->bwd_filter_trans_scratch_offset = 0; + handle->bwd_packing_padding_scratch_offset = handle->bwd_filter_trans_scratch_size; + handle->bwd_lp_input_full_scratch_offset = handle->bwd_packing_padding_scratch_offset + + handle->bwd_packing_padding_scratch_size; + + /* set overall scratch size for backward (sum of the three aligned sizes) */ + handle->bwd_scratch_size = handle->bwd_filter_trans_scratch_size + + handle->bwd_packing_padding_scratch_size + + handle->bwd_lp_input_full_scratch_size; +} + +/**********************************************************/ +/* Helper functions for UPD convolutions' parameter setup */ +/**********************************************************/ +/* Returns the UPD loop order: 1 by default, 0 for the specific shape combinations listed below (hard-coded tuning cases). */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_loop_order_upd( libxsmm_dnn_layer* handle ) { + int result = 1; + if (handle->ofh == 28 && handle->desc.R == 1 && handle->desc.u == 1 && handle->desc.C == 128 && handle->desc.K == 512) { + result = 0; + } + if (handle->ofh == 28 && handle->desc.R == 3 && handle->desc.u == 1 && handle->desc.C == 128 && handle->desc.K 
== 128) { + result = 0; + } + if (handle->ofw == 28 && handle->desc.R == 1 && handle->desc.C == 256 && handle->desc.K == 512) { + result = 0; + } + /* every ofw == 14 shape except the 1x1, C == 1024, K == 256 case */ if (handle->ofw == 14 && !(handle->desc.R == 1 && handle->desc.C == 1024 && handle->desc.K == 256)) { + result = 0; + } + if (handle->ofw == 7) { + result = 0; + } + return result; +} + +/* Returns 1 when the UPD input should be packed: ofh <= 7, 1x1 filter, both strides != 1 and K >= 2048; otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_pack_input_upd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Pack input only for very small images, 1x1 convs, with large K to amortize the relevant overhead */ + if ((handle->ofh <= 7) && (handle->desc.R == 1) && (handle->desc.S == 1) && (handle->desc.u != 1) && (handle->desc.v != 1) && (handle->desc.K >= 2048)) { + result = 1; + } + return result; +} + +/* Returns 1 for 3x3 filters with padding 1 on images with ofh <= 7, but only when N equals the thread count; otherwise 0. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_avoid_rim_fmas_upd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Avoid rim FMAs only for small images */ + if ( (handle->ofh <= 7) && (handle->desc.R == 3) && (handle->desc.S == 3) && (handle->desc.pad_w == 1) && (handle->desc.pad_h == 1)) { + result = 1; + } + if (handle->desc.N != handle->desc.threads) { + result = 0; + } + return result; +} + +/* Returns the UPD output-width register block, which is always the full ofw. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_ofw_rb( libxsmm_dnn_layer* handle ) { + int result = 1; + result = handle->ofw; + return result; +} + +/* Returns the UPD output-height register block. The rules are applied in order and a later rule overrides an earlier one, so their sequence matters. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_ofh_rb( libxsmm_dnn_layer* handle ) { + int result = 1; + /* Restrict the reduction chain which is ofw_rb*ofh_rb*/ + if (handle->ofh <= 28 ) { + result = handle->ofh; + } + /* In the following scenario with strided convolutions and non batch reduce kernel make sure we have ofh_rb = 1 */ + if ((handle->desc.u != 1) && (handle->desc.v != 1) && (handle->upd_use_batchreduce == 0) && (handle->upd_pack_input == 0)) { + result = 1; + } + /* If using linearized taskview and have strided convs, make sure ofh_rb is 1.. 
*/ + if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 0 && handle->upd_pack_input == 0 && handle->desc.u != 1) { + result = 1; + } + /* the next two checks together cover both values of upd_linearized_tasklist: non-1x1 filters without batch-reduce use ofh_rb = 1 */ if (handle->upd_linearized_tasklist == 1 && handle->upd_use_batchreduce == 0 && (handle->desc.R != 1 || handle->desc.S != 1)) { + result = 1; + } + if (handle->upd_linearized_tasklist == 0 && handle->upd_use_batchreduce == 0 && (handle->desc.R != 1 || handle->desc.S != 1)) { + result = 1; + } + if (handle->ofw == 56 && handle->desc.R == 1) { + result = 2; + } + if (handle->upd_linearized_tasklist == 1 && handle->upd_use_batchreduce == 1 && handle->upd_avoid_rim_fmas == 1) { + result = handle->ofh; + } + + /* N != threads with a non-1x1 filter and a stride > 1: force a single row */ if ((handle->desc.N != handle->desc.threads) && (handle->desc.R > 1 || handle->desc.S > 1 ) && (handle->desc.u > 1 || handle->desc.v > 1 )) { + result = 1; + } + + return result; +} + +/* Returns the IFM blocking for UPD: 4 for unit-stride 1x1 filters with ofh == 56, otherwise 1. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_upd_IFM( libxsmm_dnn_layer* handle ) { + int result = 1; + if (handle->ofh == 56 && handle->desc.R == 1 && handle->desc.S == 1 && handle->desc.u == 1 && handle->desc.v == 1) { + result = 4; + } + return result; +} + +/* Returns the OFM blocking for UPD, which is always 1 (handle is not inspected). */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_block_upd_OFM( libxsmm_dnn_layer* handle ) { + int result = 1; + LIBXSMM_UNUSED(handle); + return result; +} + +/* Returns the image batch-reduce block, which is always 1 (handle is not inspected). */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_img_batchreduce_block( libxsmm_dnn_layer* handle ) { + int result = 1; + LIBXSMM_UNUSED(handle); + return result; +} + +/* Returns 1 when UPD should use the batch-reduce GEMM, 0 otherwise. The rules are applied in order and a later rule overrides an earlier one. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_use_batchreduce_upd( libxsmm_dnn_layer* handle ) { + int result = 1; + /* If W is large, no need for batchreduce kernel */ + if (handle->ofw >= 56) { + result = 0; + } + /* If we have packed the input, then disable batch-reduce GEMM */ + if (handle->upd_pack_input == 1) { + result = 0; + } + /* with the linearized task list the decision follows upd_avoid_rim_fmas */ if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 0) { + result = 0; + } + if (handle->upd_linearized_tasklist == 1 && handle->upd_avoid_rim_fmas == 1) { + 
result = 1; + } + + return result; +} + +/* Returns the number of weight copies used by UPD. Starts from the thread count (9 for ofw <= 14, 23 for two N == threads == 92 cases), is lowered until it divides the thread count, and is then overridden by the hard-coded cases below; a linearized task list always yields 1. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_weight_copies_upd( libxsmm_dnn_layer* handle ) { + int result = handle->desc.threads; + if (handle->ofw <= 14) { + result = 9; + } + if (handle->ofw == 14 && handle->desc.N == 92 && handle->desc.threads == 92) { + result = 23; + } + if (handle->ofw == 7 && handle->desc.N == 92 && handle->desc.threads == 92 && handle->desc.R == 3 && handle->desc.S == 3 && handle->desc.u == 1 && handle->desc.v == 1) { + result = 23; + } + /* lower the count until it divides the number of threads */ while (handle->desc.threads % result != 0) { + result--; + } + /* FIXME: Hardcoded logic for N=27, N=26 */ + if (handle->desc.N == 27 && handle->desc.threads == 27 && handle->desc.R == 1 && handle->ofw == 14 && handle->desc.u == 1) { + result = 7; + } + if (((handle->ofh == 14) || (handle->ofw == 7 && handle->desc.u == 2)) && handle->desc.N == 26 && handle->desc.threads == 26) { + result = 13; + } + /* N != threads: use N copies unless neither the linearized task list nor batch-reduce is used */ if ((handle->desc.N != handle->desc.threads) && !(handle->upd_linearized_tasklist == 0 && handle->upd_use_batchreduce == 0)) { + result = handle->desc.N; + } + /* Make sure a single copy when we use linearized-task view */ + if (handle->upd_linearized_tasklist == 1) { + result = 1; + } + return result; +} + +/* Returns 1 when UPD should use the linearized task list, 0 otherwise. The rules are applied in order and a later rule overrides an earlier one. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_linearized_tasklist_upd( libxsmm_dnn_layer* handle ) { + int result = 0; + /* Use linearized task-list (i.e. 
no reduction) only if small images and large filters */ + if (handle->ofh <= 10 && handle->ofw <= 10) { + result = 1; + } + if (handle->ofw == 7 && handle->desc.N == 92 && handle->desc.threads == 92 && handle->desc.R == 3 && handle->desc.S == 3 && handle->desc.u == 1 && handle->desc.v == 1) { + result = 0; + } + if (handle->ofh == 14 && handle->ofw == 14 && handle->desc.N == 23 && handle->desc.threads == 23) { + result = 1; + } +#if 0 + if ((handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S > (handle->desc.threads * 4)) && (handle->ofh <= 56)) { + result = 1; + } +#endif + if (handle->desc.u == 2 && handle->desc.v == 2 && handle->desc.K == 512) { + result = 0; + } + return result; +} + +/* Returns the GEMM flags for UPD, which are always 0 (handle is not inspected). */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_init_upd_gemm_flags( libxsmm_dnn_layer* handle ) { + int result = 0; + LIBXSMM_UNUSED(handle); + return result; +} + +/* Configures the BF16 weight-update strategy on the handle: whether pixels are linearized, how the input is packed, the pixel counts padded to a multiple of 2, whether an intermediate F32 weight tensor is needed and how many weight copies are used. */ LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_bf16_upd( libxsmm_dnn_layer* handle ) { + int remainder_pixels, max_init_offset, max_compute_offset_input, input_compute_pad, accum_length_pixels, compute_pixels; + /* pixel counts are padded up to a multiple of this value */ const int multiple_target = 2; + /* physical tensor extents, enlarged by the logical padding when upd_padding_copy is set */ int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2 * handle->desc.pad_h : handle->ifhp; + int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2 * handle->desc.pad_w : handle->ifwp; + int OFHP = (handle->upd_padding_copy == 1) ? handle->ofhp + 2 * handle->desc.pad_h : handle->ofhp; + int OFWP = (handle->upd_padding_copy == 1) ? 
handle->ofwp + 2 * handle->desc.pad_w : handle->ofwp; + + /* linearized pixels are the default; upd_trans_w_only is written only in the two branches below */ handle->upd_linearized_pixels = 1; + if (handle->desc.S != 1 && handle->desc.v != 1) { + handle->upd_linearized_pixels = 0; + handle->upd_trans_w_only = 0; + } + /* For large images facilitate the "large" transposes by blocking the pixel/reduction domains */ + if (handle->ofw >= 56 && handle->ofh >=56 && handle->desc.R == 1 && handle->desc.S == 1 && handle->desc.u == 1 && handle->desc.v == 1) { + handle->upd_linearized_pixels = 0; + handle->upd_trans_w_only = 1; + } + + handle->on_the_fly_input_packing = 0; + handle->upd_pack_input_upfront = 0; + handle->use_hybrid_imgofm_parallelization = 0; + handle->upd_linearized_tasklist = 0; + + if (handle->upd_linearized_pixels == 1) { + /* Logistics to pad accumulation chainlength */ + compute_pixels = handle->ofw * handle->ofh + 2 * handle->desc.pad_w * (handle->ofh-1); + /* pixels to add so that the chain length becomes a multiple of multiple_target */ remainder_pixels = (compute_pixels % multiple_target == 0) ? 0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels; + accum_length_pixels = compute_pixels + remainder_pixels; + + /* In this case compact input upfront */ + if (handle->desc.R == 1 && handle->desc.S == 1 && (handle->desc.u != 1 || handle->desc.v != 1)) { + handle->upd_pack_input_upfront = 1; + } + + /* Logistics for input transpose and additional pixel padding */ + max_init_offset = 2 * handle->desc.pad_h * IFWP + 2 * handle->desc.pad_w; + max_compute_offset_input = max_init_offset + accum_length_pixels; + /* extra input pixels needed when the largest offset plus the chain length exceeds the image */ input_compute_pad = (max_compute_offset_input > IFWP*IFHP) ? 
max_compute_offset_input - IFWP*IFHP : 0; + handle->input_pixels = IFWP * IFHP + input_compute_pad; + /* input packed upfront: only the padded chain length is stored */ if (handle->upd_pack_input_upfront) { + handle->input_pixels = accum_length_pixels; + } + handle->output_pixels = accum_length_pixels; + handle->pixel_blocking = accum_length_pixels; + handle->n_used_pixels = accum_length_pixels; + handle->compute_pixels = compute_pixels; + + /* both fields were just set to accum_length_pixels, so this evaluates to 0 here */ handle->use_intermediate_f32_wt_tensor = (handle->pixel_blocking == handle->n_used_pixels) ? 0 : 1; + + if (handle->ofw <= 14) { + handle->use_hybrid_imgofm_parallelization = 1; + handle->weight_copies = libxsmm_dnn_convolution_setup_weight_copies_upd(handle); + if (handle->ofw == 14 && handle->desc.K >= 1024) { + handle->use_hybrid_imgofm_parallelization = 0; + handle->weight_copies = handle->desc.threads; + } + } else { + handle->weight_copies = handle->desc.threads; + } + } + + if (handle->upd_linearized_pixels == 0) { + handle->weight_copies = handle->desc.threads; + if (handle->desc.v !=1) { + handle->on_the_fly_input_packing = 1; + } + /* pad the row width up to a multiple of multiple_target */ remainder_pixels = (handle->ofw % multiple_target == 0) ? 0 : (handle->ofw/multiple_target+1)*multiple_target - handle->ofw; + handle->ofwp_extended = OFWP + remainder_pixels; + handle->ifwp_extended = IFWP + remainder_pixels; + handle->output_pixels = OFHP * handle->ofwp_extended; + /* coverity[identical_branches] */ + handle->batchreduce_h_pixels = (handle->upd_trans_w_only) ? 1 : 1; /* TODO: identical_branches */ + handle->use_intermediate_f32_wt_tensor = (handle->batchreduce_h_pixels == handle->ofh) ? 0 : 1; + } + + /* N != threads overrides the decisions above: always use the F32 intermediate, no hybrid parallelization */ if (handle->desc.N != handle->desc.threads) { + handle->use_intermediate_f32_wt_tensor = 1; + handle->use_hybrid_imgofm_parallelization = 0; + handle->weight_copies = LIBXSMM_MIN(handle->desc.N, handle->desc.threads); + } + +} + +/* AMX variant of the BF16 weight-update setup: pads pixel counts to a multiple of 32, decides on transpose fusion and CNHW packing, and dispatches (JITs) the tile-config and compute kernels stored on the handle. */ LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_bf16_upd_amx( libxsmm_dnn_layer* handle ) { + /* JIT related variables... 
*/ + libxsmm_blasint LDA = handle->ofmblock; + libxsmm_blasint LDB = handle->input_pixels; + libxsmm_blasint LDC = handle->ofmblock; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + /* l_flags is used for the compute kernels, l_tc_flags for the kernel stored in upd_config_kernel */ int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + int l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + size_t stride_a, stride_b; + int unroll_hint; + float beta; + + int remainder_pixels, max_init_offset, max_compute_offset_input, input_compute_pad, accum_length_pixels, compute_pixels; + /* pixel counts are padded up to a multiple of this value */ const int multiple_target = 32; + int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2 * handle->desc.pad_h : handle->ifhp; + int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2 * handle->desc.pad_w : handle->ifwp; + int OFWP = (handle->upd_padding_copy == 1) ? handle->ofwp + 2 * handle->desc.pad_w : handle->ofwp; + + handle->upd_linearized_pixels = 1; + if (handle->desc.S != 1 && handle->desc.v != 1) { + handle->upd_linearized_pixels = 0; + } + handle->fuse_upd_transposes = 1; + handle->pack_to_cnhw = 0; + handle->on_the_fly_input_packing = 0; + handle->upd_pack_input_upfront = 0; + handle->use_hybrid_imgofm_parallelization = 0; + handle->upd_linearized_tasklist = 0; + /* CNHW packing only for 1x1 filters with ofw == 7 when the SPR target check passes */ if (((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) && (handle->ofw == 7) && (handle->desc.R == 1) && (handle->desc.S == 1) ) { + handle->pack_to_cnhw= 1; + } + + if (handle->upd_linearized_pixels == 1) { + if (handle->pack_to_cnhw == 0) { + handle->fuse_upd_transposes = 1; + /* Logistics to pad accumulation chainlength */ + compute_pixels = handle->ofw * handle->ofh + 2 * handle->desc.pad_w * (handle->ofh-1); + remainder_pixels = (compute_pixels % multiple_target == 0) ? 
0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels; + /* chain length padded to a multiple of multiple_target */ accum_length_pixels = compute_pixels + remainder_pixels; + + /* In this case compact input upfront */ + if (handle->desc.R == 1 && handle->desc.S == 1 && (handle->desc.u != 1 || handle->desc.v != 1)) { + handle->upd_pack_input_upfront = 1; + } + + /* Logistics for input transpose and additional pixel padding */ + max_init_offset = 2 * handle->desc.pad_h * IFWP + 2 * handle->desc.pad_w; + max_compute_offset_input = max_init_offset + accum_length_pixels; + /* extra input pixels needed when the largest offset plus the chain length exceeds the image */ input_compute_pad = (max_compute_offset_input > IFWP*IFHP) ? max_compute_offset_input - IFWP*IFHP : 0; + handle->input_pixels = IFWP*IFHP+ input_compute_pad; + /* input packed upfront: only the padded chain length is stored */ if (handle->upd_pack_input_upfront) { + handle->input_pixels = accum_length_pixels; + } + handle->output_pixels = accum_length_pixels; + handle->pixel_blocking = accum_length_pixels; + handle->n_used_pixels = accum_length_pixels; + handle->compute_pixels = compute_pixels; + + /* both fields were just set to accum_length_pixels, so this evaluates to 0 here */ handle->use_intermediate_f32_wt_tensor = (handle->pixel_blocking == handle->n_used_pixels) ? 
0 : 1; +#if 0 + handle->scratch2_size = (size_t) (handle->desc.N * handle->output_pixels * handle->desc.K * sizeof(float)/2); + if (handle->use_intermediate_f32_wt_tensor) { + handle->scratch2_size += (size_t) handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * handle->desc.threads * sizeof(float); + } + handle->scratch3_size = (size_t) (handle->desc.N * handle->input_pixels * handle->desc.C * sizeof(float)/2); +#endif + + /* NOTE(review): weight_copies is not assigned on the ofw <= 14 path here (only later when N != threads) - confirm it is initialized elsewhere */ if (handle->ofw <= 14) { + handle->use_hybrid_imgofm_parallelization = 1; + handle->fuse_upd_transposes = 0; + } else { + handle->weight_copies = handle->desc.threads; + } + + /* fused transposes are only kept when both feature-map block sizes are multiples of 32 */ if ((handle->ofmblock % 32 != 0) || (handle->ifmblock % 32 != 0)) { + handle->fuse_upd_transposes = 0; + } + } else { + /* Logistics to pad accumulation chainlength */ + handle->use_hybrid_imgofm_parallelization = 1; + /* largest value <= 7 that divides the thread count */ handle->weight_copies = 7; + while (handle->desc.threads % handle->weight_copies != 0) { + handle->weight_copies--; + } + /* pixels of all images that share one weight copy (integer division of N by weight_copies) */ compute_pixels = handle->ofw * handle->ofh * (handle->desc.N/handle->weight_copies); + remainder_pixels = (compute_pixels % multiple_target == 0) ? 
0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels; + handle->remainder_pixels = remainder_pixels; + accum_length_pixels = compute_pixels + remainder_pixels; + /* in the CNHW layout the pixel counts cover all weight copies */ handle->output_pixels = accum_length_pixels * handle->weight_copies; + handle->input_pixels = accum_length_pixels * handle->weight_copies; + handle->pixel_blocking = accum_length_pixels; + handle->n_used_pixels = accum_length_pixels; + + handle->use_intermediate_f32_wt_tensor = 0; +#if 0 + handle->scratch2_size = (size_t) (handle->weight_copies * handle->output_pixels * handle->desc.K * sizeof(float)/2); + handle->scratch3_size = (size_t) (handle->weight_copies * handle->input_pixels * handle->desc.C * sizeof(float)/2); +#endif + } + } + + if (handle->upd_linearized_pixels == 0) { + handle->weight_copies = handle->desc.threads; + if (handle->desc.v !=1) { + handle->on_the_fly_input_packing = 1; + } + /* pad the row width up to a multiple of multiple_target */ remainder_pixels = (handle->ofw % multiple_target == 0) ? 0 : (handle->ofw/multiple_target+1)*multiple_target - handle->ofw; + handle->remainder_pixels = remainder_pixels; + handle->ofwp_extended = OFWP + remainder_pixels; + handle->ifwp_extended = IFWP + remainder_pixels; + handle->batchreduce_h_pixels = handle->ofh; + /* batchreduce_h_pixels was just set to ofh, so this evaluates to 0 here */ handle->use_intermediate_f32_wt_tensor = (handle->batchreduce_h_pixels == handle->ofh) ? 0 : 1; +#if 0 + handle->scratch2_size = (size_t) (handle->desc.N * handle->ofhp*handle->ofwp_extended * handle->desc.K * sizeof(float)/2); + if (handle->use_intermediate_f32_wt_tensor) { + handle->scratch2_size += (size_t) handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * handle->desc.threads * sizeof(float); + } + handle->scratch3_size = (size_t) (handle->desc.N * handle->ifhp * handle->ifwp_extended * handle->desc.C * sizeof(float)/2); +#endif + } + + /* Now that all decisions have been made, JIT the proper kernel... */ + /* beta is 1 when results are accumulated into the F32 intermediate weight tensor, 0 otherwise */ beta = (handle->use_intermediate_f32_wt_tensor) ? 
(float)1.0 : (float)0.0; + if (handle->upd_linearized_pixels == 0) { + LDA = handle->ofmblock; + LDB = IFHP*handle->ifwp_extended; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + unroll_hint = handle->batchreduce_h_pixels; + /* byte strides between consecutive batch-reduce operands: one (extended) output row and u (extended) input rows */ stride_a = handle->ofwp_extended * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + stride_b = handle->desc.u * handle->ifwp_extended * libxsmm_dnn_typesize(handle->datatype_in); + handle->upd_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->ofw+handle->remainder_pixels, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); + handle->upd_compute_kernel_brgemm_no_linearized_pixels = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->ofmblock, handle->ifmblock, handle->ofw+handle->remainder_pixels, + (libxsmm_blasint)stride_a, (libxsmm_blasint)stride_b, unroll_hint, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } else { + LDA = handle->ofmblock; + LDB = handle->input_pixels; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + /* each path below stores a config kernel (l_tc_flags) and one compute kernel (l_flags) in its own handle field */ if (handle->use_hybrid_imgofm_parallelization == 0) { + handle->upd_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); + handle->upd_compute_kernel_gemm_linearized_pixels_no_hybrid_par = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } else { + if (handle->pack_to_cnhw == 1) { + handle->upd_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); + handle->upd_compute_kernel_gemm_linearized_pixels_hybrid_par_cnhw = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } else { + /* TODO: Hoist here 
hybrid parallelization logic and then we should be able to also provide unroll hint in the BRGEMM call */ + stride_a = handle->blocksofm * handle->output_pixels * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + stride_b = handle->blocksifm * handle->ifmblock * handle->input_pixels * libxsmm_dnn_typesize(handle->datatype_in); + handle->upd_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); + handle->upd_compute_kernel_brgemm_linearized_pixels_hybrid_par_no_cnhw = libxsmm_bsmmdispatch_reducebatch_strd(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, + (libxsmm_blasint)stride_a, (libxsmm_blasint)stride_b, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } + } + } + + /* NOTE(review): this override runs after the kernels above were dispatched with a beta derived from the earlier use_intermediate_f32_wt_tensor value - confirm that is intended */ if (handle->desc.N != handle->desc.threads) { + handle->use_intermediate_f32_wt_tensor = 1; + handle->use_hybrid_imgofm_parallelization = 0; + handle->weight_copies = LIBXSMM_MIN(handle->desc.N, handle->desc.threads); + } + +} + +/* Returns 1 only when BOTH pad_h != pad_h_in and pad_w != pad_w_in, otherwise 0. NOTE(review): a mismatch in just one dimension yields 0 - confirm this is intended. */ LIBXSMM_API_INLINE int libxsmm_dnn_convolution_setup_upd_padding_copy( libxsmm_dnn_layer* handle ) { + int result = 0; + if ( (handle->desc.pad_h != handle->desc.pad_h_in) && (handle->desc.pad_w != handle->desc.pad_w_in) ) { + result = 1; + } + return result; +} + +/* Sizes the weight-update scratch memory: packing/padding buffer, BF16 transposed output/input buffers, F32 filter scratch and (BF16 only) a full F32 filter buffer. Each size is rounded up to LIBXSMM_CACHELINE, then offsets and upd_scratch_size are derived. */ LIBXSMM_API_INLINE void libxsmm_dnn_convolution_setup_upd_scratch( libxsmm_dnn_layer* handle ) { + handle->upd_packing_padding_scratch_size = 0; + /* packing of input */ + if ( handle->upd_pack_input != 0 ) { + handle->upd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * + handle->desc.H/handle->desc.u * + handle->desc.W/handle->desc.v * + libxsmm_dnn_typesize(handle->datatype_in); + } + /* logical padding with copying on the fly; when set, this overwrites the packing size computed above */ + if ( handle->upd_padding_copy != 0 ) { + handle->upd_packing_padding_scratch_size = (size_t)handle->desc.N * handle->desc.C * + (handle->desc.H + 2*handle->desc.pad_h) * + (handle->desc.W + 2*handle->desc.pad_w) * + 
libxsmm_dnn_typesize(handle->datatype_in); + } + /* output/input buffer to transpose when we use bf16 */ + /* All products below are widened to size_t at the first factor so that no intermediate int multiplication can overflow. NOTE(review): sizeof(handle->datatype_in) is the storage size of the datatype field itself, not the element width returned by libxsmm_dnn_typesize() - left unchanged, confirm the resulting buffer size is what is wanted. */ if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16 ) { + if (handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + /* this path uses the pixel counts already stored on the handle (output_pixels, input_pixels, ofwp_extended, ifwp_extended) */ int OFHP = (handle->upd_padding_copy == 1) ? handle->ofhp + 2 * handle->desc.pad_h : handle->ofhp; + int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2 * handle->desc.pad_h : handle->ifhp; + + if (handle->upd_linearized_pixels == 1) { + handle->upd_lp_output_full_scratch_size = (size_t)handle->desc.N * handle->output_pixels * handle->desc.K * sizeof(handle->datatype_in); + handle->upd_lp_input_full_scratch_size = (size_t)handle->desc.N * handle->input_pixels * handle->desc.C * sizeof(handle->datatype_in); + } + + if (handle->upd_linearized_pixels == 0) { + handle->upd_lp_output_full_scratch_size = (size_t)handle->desc.N * OFHP * handle->ofwp_extended * handle->desc.K * sizeof(handle->datatype_in); + handle->upd_lp_input_full_scratch_size = (size_t)handle->desc.N * IFHP * handle->ifwp_extended * handle->desc.C * sizeof(handle->datatype_in); + } + } else { + /* this path recomputes the padded pixel counts locally, with a padding multiple of 2 */ const int multiple_target = 2; + int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2 * handle->desc.pad_h : handle->ifhp; + int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2 * handle->desc.pad_w : handle->ifwp; + int OFHP = (handle->upd_padding_copy == 1) ? handle->ofhp + 2 * handle->desc.pad_h : handle->ofhp; + int OFWP = (handle->upd_padding_copy == 1) ? handle->ofwp + 2 * handle->desc.pad_w : handle->ofwp; + + if (handle->upd_linearized_pixels == 1) { + int compute_pixels = handle->ofw * handle->ofh + 2 * handle->desc.pad_w * (handle->ofh-1); + int remainder_pixels = (compute_pixels % multiple_target == 0) ? 
0 : (compute_pixels/multiple_target+1)*multiple_target - compute_pixels; + int accum_length_pixels = compute_pixels + remainder_pixels; + + int max_init_offset = 2 * handle->desc.pad_h * IFWP + 2 * handle->desc.pad_w; + int max_compute_offset_input = max_init_offset + accum_length_pixels; + /* extra input pixels needed when the largest offset plus the chain length exceeds the image */ int input_compute_pad = (max_compute_offset_input > IFWP*IFHP) ? max_compute_offset_input - IFWP*IFHP : 0; + int input_pixels = IFWP * IFHP + input_compute_pad; + + if (handle->upd_pack_input_upfront == 1) { + input_pixels = accum_length_pixels; + } + + /* size_t cast moved to the first factor: the product is no longer evaluated in int */ handle->upd_lp_output_full_scratch_size = (size_t)handle->desc.N * accum_length_pixels * handle->desc.K * sizeof(handle->datatype_in); + handle->upd_lp_input_full_scratch_size = (size_t)handle->desc.N * input_pixels * handle->desc.C * sizeof(handle->datatype_in); + } + + if (handle->upd_linearized_pixels == 0) { + int remainder_pixels = (handle->ofw % multiple_target == 0) ? 0 : (handle->ofw/multiple_target+1)*multiple_target - handle->ofw; + int ofwp_extended = OFWP + remainder_pixels; + int ifwp_extended = IFWP + remainder_pixels; + + handle->upd_lp_output_full_scratch_size = (size_t)handle->desc.N * OFHP * ofwp_extended * handle->desc.K * sizeof(handle->datatype_in); + handle->upd_lp_input_full_scratch_size = (size_t)handle->desc.N * IFHP * ifwp_extended * handle->desc.C * sizeof(handle->datatype_in); + } + } + /* one full F32 filter per thread */ handle->upd_lp_filter_full_scratch_size = (size_t)handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * handle->desc.threads * + libxsmm_dnn_typesize(LIBXSMM_DNN_DATATYPE_F32); + } else { + handle->upd_lp_output_full_scratch_size = 0; + handle->upd_lp_input_full_scratch_size = 0; + handle->upd_lp_filter_full_scratch_size = 0; + } + /* filter scratch: one F32 filter per thread or per image, whichever count is larger */ + handle->upd_filter_scratch_size = (size_t) handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K * LIBXSMM_MAX(handle->desc.threads, handle->desc.N) * sizeof(float); + + /* align sizes to full cacheline */ + 
handle->upd_packing_padding_scratch_size += ( handle->upd_packing_padding_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->upd_packing_padding_scratch_size % LIBXSMM_CACHELINE); + handle->upd_lp_output_full_scratch_size += ( handle->upd_lp_output_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->upd_lp_output_full_scratch_size % LIBXSMM_CACHELINE); + handle->upd_lp_input_full_scratch_size += ( handle->upd_lp_input_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->upd_lp_input_full_scratch_size % LIBXSMM_CACHELINE); + handle->upd_filter_scratch_size += ( handle->upd_filter_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->upd_filter_scratch_size % LIBXSMM_CACHELINE); + handle->upd_lp_filter_full_scratch_size += ( handle->upd_lp_filter_full_scratch_size % LIBXSMM_CACHELINE == 0 ) ? 0 : + LIBXSMM_CACHELINE - (handle->upd_lp_filter_full_scratch_size % LIBXSMM_CACHELINE); + + /* calculate offsets: the sub-buffers follow each other in the order packing/padding, full output, full input, filter, full F32 filter */ + handle->upd_packing_padding_scratch_offset = 0; + handle->upd_lp_output_full_scratch_offset = handle->upd_packing_padding_scratch_size; + handle->upd_lp_input_full_scratch_offset = handle->upd_lp_output_full_scratch_offset + handle->upd_lp_output_full_scratch_size; + handle->upd_filter_scratch_offset = handle->upd_lp_input_full_scratch_offset + handle->upd_lp_input_full_scratch_size; + handle->upd_lp_filter_full_scratch_offset = handle->upd_filter_scratch_offset + handle->upd_filter_scratch_size; + + /* set overall scratch size for update (sum of the five aligned sizes) */ + handle->upd_scratch_size = handle->upd_packing_padding_scratch_size + + handle->upd_lp_output_full_scratch_size + + handle->upd_lp_input_full_scratch_size + + handle->upd_filter_scratch_size + + handle->upd_lp_filter_full_scratch_size; +} + +LIBXSMM_API_INLINE libxsmm_dnn_err_t libxsmm_dnn_convolution_setup( libxsmm_dnn_layer* handle ) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + libxsmm_blasint _ldi = 64, 
_ldo = 64; + libxsmm_blasint ldx; + libxsmm_blasint ldA; + libxsmm_blasint ldC; + int beta_int; + float beta; + int l_flags; + int l_tc_flags; + + /* init libxsmm */ + LIBXSMM_INIT + + /* Generic parameter setup */ + handle->target_archid = libxsmm_target_archid; + if ( ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) && (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && ((handle->desc.C % 16 != 0) || (handle->desc.K % 16 != 0)) ) { + handle->target_archid = LIBXSMM_X86_AVX512_CPX; + } + handle->ifmblock = libxsmm_dnn_convolution_setup_ifmblock(handle); + handle->ofmblock = libxsmm_dnn_convolution_setup_ofmblock(handle); + handle->fm_lp_block = libxsmm_dnn_convolution_setup_fm_lp_block(handle); + handle->blocksifm = libxsmm_dnn_convolution_setup_blocksifm(handle); + handle->blocksofm = libxsmm_dnn_convolution_setup_blocksofm(handle); + + /* If in SPR, generate tilerelease kernel */ + if (handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + int l_tr_flags = LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + handle->tilerelease_kernel = libxsmm_bsmmdispatch(handle->ifmblock, handle->ifmblock, handle->ifmblock, NULL, NULL, NULL, NULL, NULL, &l_tr_flags, NULL); + } + + /* FWD parameter setup */ + handle->fwd_ofw_rb = libxsmm_dnn_convolution_setup_fwd_ofw_rb(handle); + handle->pack_input = libxsmm_dnn_convolution_setup_pack_input_fwd(handle); + handle->fwd_ofh_rb = libxsmm_dnn_convolution_setup_fwd_ofh_rb(handle); + handle->fwd_gemm_pixels = libxsmm_dnn_convolution_setup_fwd_pixels_gemm(handle); + handle->block_fwd_oj = libxsmm_dnn_convolution_setup_fwd_block_H(handle); + handle->loop_order = libxsmm_dnn_convolution_setup_loop_order_fwd(handle); + handle->blocksifm_blocking = libxsmm_dnn_convolution_setup_blocksifm_blocking(handle); + handle->block_fwd_ofm = libxsmm_dnn_convolution_setup_block_fwd_OFM(handle); + handle->block_fwd_ifm = 
libxsmm_dnn_convolution_setup_block_fwd_IFM(handle); + handle->avoid_fmas_in_rim = libxsmm_dnn_convolution_setup_avoid_rim_fmas_fwd(handle); + handle->use_ofm_parallelization = libxsmm_dnn_convolution_setup_use_ofm_parallelization(handle); + handle->shuffle_filter_accesses = libxsmm_dnn_convolution_setup_shuffle_filter_accesses(handle); + handle->avoid_acc_load = libxsmm_dnn_convolution_setup_avoid_acc_load(handle); + handle->fwd_flags = libxsmm_dnn_convolution_setup_init_fwd_gemm_flags(handle); + handle->use_fallback_fwd_loops = libxsmm_dnn_convolution_setup_fallback_loops_fwd(handle); + handle->fwd_padding_copy = libxsmm_dnn_convolution_setup_fwd_padding_copy(handle); + +#if 0 + if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 ) { + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + handle->block_fwd_ofm = 1; + handle->block_fwd_oj = handle->fwd_ofh_rb; + ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + ldA = handle->ofmblock; + ldC = handle->ofmblock; + beta = (handle->avoid_acc_load) ? (float)0.0 : (float)1.0; + l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + handle->fwd_compute_kernel_offs_f32 = NULL; + handle->fwd_compute_kernel_strd_f32 = NULL; + handle->fwd_compute_kernel_addr_a_f32 = NULL; + handle->fwd_compute_kernel_addr_b_f32 = NULL; + if (handle->desc.R == 1 && handle->desc.S == 1) { + const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; + const int IFH = (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp; + int stride_a = handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + int stride_b = IFW * IFH * handle->ifmblock * libxsmm_dnn_typesize(handle->datatype_in); + handle->fwd_compute_kernel_strd_f32 = libxsmm_smmdispatch_reducebatch_strd_unroll(handle->ofmblock, handle->fwd_gemm_pixels, handle->ifmblock, stride_a, stride_b, handle->blocksifm_blocking, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } else { + const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); + const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? handle->ofhp : handle->ifhp ); + int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; + int i = 0, ifm, ki, kj; + handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + + ki * handle->ifmblock * handle->ofmblock) * libxsmm_dnn_typesize(handle->datatype_in); + handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + + kj * IFW * handle->ifmblock + + ki * handle->ifmblock) * libxsmm_dnn_typesize(handle->datatype_in); + i++; + } + } + } + handle->fwd_compute_kernel_offs_f32 = libxsmm_smmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_gemm_pixels, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } + handle->fwd_compute_kernel_addr_a_f32 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, 
handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + handle->fwd_compute_kernel_addr_b_f32 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + } +#endif + + if ( ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) && (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) ) { + handle->block_fwd_ofm = 1; + handle->block_fwd_oj = handle->fwd_ofh_rb; + ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + ldA = handle->ofmblock; + ldC = handle->ofmblock; + beta = (handle->avoid_acc_load) ? (float)0.0 : (float)1.0; + l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + handle->fwd_compute_kernel_addr = NULL; + handle->fwd_compute_kernel_offs_a = NULL; + handle->fwd_compute_kernel_offs_b = NULL; + handle->fwd_compute_kernel_strd = NULL; + if (handle->desc.R == 1 && handle->desc.S == 1) { + const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; + const int IFH = (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp; + size_t stride_a = handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + size_t stride_b = IFW * IFH * handle->ifmblock * libxsmm_dnn_typesize(handle->datatype_in); + handle->fwd_compute_kernel_strd = libxsmm_bmmdispatch_reducebatch_strd_unroll(handle->ofmblock, handle->fwd_gemm_pixels, handle->ifmblock, + (libxsmm_blasint)stride_a, (libxsmm_blasint)stride_b, handle->blocksifm_blocking, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } else { + const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); + const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? handle->ofhp : handle->ifhp ); + int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; + int i = 0, ifm, ki, kj; + handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + + ki * handle->ifmblock * handle->ofmblock) * libxsmm_dnn_typesize(handle->datatype_in); + handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + + kj * IFW * handle->ifmblock + + ki * handle->ifmblock) * libxsmm_dnn_typesize(handle->datatype_in); + i++; + } + } + } + handle->fwd_compute_kernel_offs_a = libxsmm_bmmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_gemm_pixels, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + handle->fwd_compute_kernel_offs_b = libxsmm_bsmmdispatch_reducebatch_offs(handle->ofmblock, 
handle->fwd_gemm_pixels, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } + handle->fwd_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->fwd_gemm_pixels, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_tc_flags, NULL); + } + + handle->code_fwd[0].ptr = 0; + handle->code_fwd[1].ptr = 0; + handle->code_fwd[2].ptr = 0; + + /* JIT cvt eltwise functions for fwd convolutions */ + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { + _ldi = handle->ofmblock * handle->ofwp; + _ldo = handle->ofmblock * handle->ofwp; + handle->fwd_cvtfp32bf16_kernel = libxsmm_dispatch_meltw_unary(handle->ofmblock * handle->fwd_ofw_rb, handle->fwd_ofh_rb, &_ldi, &_ldo, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); + } + + /* Create strided BRGEMMs for i8i32 convolutions */ + if ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I32)) { + ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + ldA = handle->ofmblock; + ldC = handle->ofmblock; + beta_int = (handle->avoid_acc_load) ? 0 : 1; + l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; + if (handle->desc.R == 1 && handle->desc.S == 1) { + const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; + const int IFH = (handle->pack_input == 1) ? handle->ofhp : handle->ifhp; + libxsmm_blasint stride_A = handle->ifmblock * handle->ofmblock * sizeof(char); + libxsmm_blasint stride_B = handle->ifmblock * IFW * IFH * sizeof(char) ; + handle->gemm_fwd.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + } else { + const int IFW = (handle->pack_input == 1) ? 
handle->ofwp : handle->ifwp; + const int IFH = (handle->pack_input == 1) ? handle->ofhp : handle->ifhp; + if (handle->avoid_fmas_in_rim == 0) { + int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; + int i = 0, ifm, ki, kj; + handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + + ki * handle->ifmblock * handle->ofmblock) * sizeof(char); + handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + + kj * IFW * handle->ifmblock + + ki * handle->ifmblock) * sizeof(char); + i++; + } + } + } + handle->gemm_fwd.xgemm.subimro = libxsmm_subimmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + } else { + libxsmm_blasint stride_A = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(char); + libxsmm_blasint stride_B = handle->ifmblock * IFW * IFH * sizeof(char) ; + handle->gemm_fwd.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + handle->gemm_fwd2.xgemm.subimrs = libxsmm_subimmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + } + } + } else if ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I8)) { + ldx = (libxsmm_blasint)handle->desc.v*handle->ifmblock; + 
ldA = handle->ofmblock; + ldC = handle->ofmblock; + beta_int = 0; + l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; + if (handle->desc.R == 1 && handle->desc.S == 1) { + const int IFW = handle->ifwp; + const int IFH = handle->ifhp; + libxsmm_blasint stride_A = handle->ifmblock * handle->ofmblock * sizeof(char); + libxsmm_blasint stride_B = handle->ifmblock * IFW * IFH * sizeof(char) ; + handle->gemm_fwd.xgemm.sububmrs = libxsmm_sububmmdispatch_reducebatch_strd(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, stride_A, stride_B, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + } else { + const int IFW = handle->ifwp; + const int IFH = handle->ifhp; + int n_blocks = handle->desc.R * handle->desc.S * handle->blocksifm_blocking; + int i = 0, ifm, ki, kj; + handle->A_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + handle->B_offsets = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + for (ifm = 0; ifm < handle->blocksifm_blocking; ifm++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + handle->A_offsets[i] = (ifm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + + ki * handle->ifmblock * handle->ofmblock) * sizeof(char); + handle->B_offsets[i] = (ifm * IFH * IFW * handle->ifmblock + + kj * IFW * handle->ifmblock + + ki * handle->ifmblock) * sizeof(char); + i++; + } + } + } + handle->gemm_fwd.xgemm.sububmro = libxsmm_sububmmdispatch_reducebatch_offs(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta_int, &l_flags, NULL); + } + } + +#if 0 + /* Spit out FWD parameters that are selected... 
*/ + printf("FWD params...\n"); + printf("Fwd_ofw_rb = %d\n", handle->fwd_ofw_rb); + printf("Fwd_ofh_rb = %d\n", handle->fwd_ofh_rb); + printf("Pack input = %d\n", handle->pack_input); + printf("Block oj = %d\n", handle->block_fwd_oj); + printf("Loop order = %d\n", handle->loop_order); + printf("Blocksifm_blocking = %d\n", handle->blocksifm_blocking); + printf("Block fwd ofm = %d\n", handle->block_fwd_ofm); + printf("Block fwd ifm = %d\n", handle->block_fwd_ifm); + printf("Avoid rim fmas = %d\n", handle->avoid_fmas_in_rim); + printf("Ofm parallelization = %d\n", handle->use_ofm_parallelization); + printf("Shuffle filter accesses = %d\n", handle->shuffle_filter_accesses); + printf("Avoid acc load = %d\n", handle->avoid_acc_load); + printf("Fwd GEMM flags = %d\n", handle->fwd_flags); +#endif + + /* BWD parameter setup */ + handle->bwd_ofw_rb = libxsmm_dnn_convolution_setup_bwd_ofw_rb(handle); + handle->bwd_ofh_rb = libxsmm_dnn_convolution_setup_bwd_ofh_rb(handle); + handle->bwd_gemm_pixels = libxsmm_dnn_convolution_setup_bwd_pixels_gemm(handle); + handle->pack_input_bwd = libxsmm_dnn_convolution_setup_pack_input_bwd(handle); + handle->spread_input_bwd = libxsmm_dnn_convolution_setup_spread_input_bwd(handle); + handle->blocksofm_blocking = libxsmm_dnn_convolution_setup_blocksofm_blocking(handle); + handle->avoid_acc_load_bwd = libxsmm_dnn_convolution_setup_avoid_acc_load_bwd(handle); + handle->use_ifm_parallelization = libxsmm_dnn_convolution_setup_use_ifm_parallelization(handle); + handle->block_bwd_ofm = libxsmm_dnn_convolution_setup_block_bwd_OFM(handle); + handle->block_bwd_ifm = libxsmm_dnn_convolution_setup_block_bwd_IFM(handle); + handle->block_bwd_oj = libxsmm_dnn_convolution_setup_bwd_block_H(handle); + handle->use_fallback_bwd_loops = libxsmm_dnn_convolution_setup_fallback_loops_bwd(handle); + handle->bwd_flags = libxsmm_dnn_convolution_setup_init_bwd_gemm_flags(handle); + + if ( ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= 
LIBXSMM_X86_ALLFEAT)) && (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) ) { + handle->block_bwd_ifm = 1; + handle->block_bwd_oj = handle->bwd_ofh_rb ; + ldx = ((libxsmm_blasint)handle->ofmblock); + ldA = handle->ifmblock; + ldC = (handle->spread_input_bwd == 1) ? handle->ifmblock * handle->desc.v : handle->ifmblock; + beta = (handle->avoid_acc_load_bwd) ? (float)0.0 : (float)1.0; + l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + handle->bwd_compute_kernel_addr = NULL; + handle->bwd_compute_kernel_offs = NULL; + handle->bwd_compute_kernel_strd = NULL; + if (handle->desc.R == 1 && handle->desc.S == 1) { + size_t stride_a = handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + size_t stride_b = handle->ofwp * handle->ofhp * handle->ofmblock * libxsmm_dnn_typesize(handle->datatype_in); + handle->bwd_compute_kernel_strd = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->ifmblock, handle->bwd_gemm_pixels, handle->ofmblock, + (libxsmm_blasint)stride_a, (libxsmm_blasint)stride_b, handle->blocksofm_blocking, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } else { + int n_blocks = handle->desc.R * handle->desc.S * handle->blocksofm_blocking; + int i = 0, ofm, ki, kj; + handle->A_offsets_bwd = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + handle->B_offsets_bwd = (unsigned long long*) malloc(n_blocks * sizeof(unsigned long long)); + for (ofm = 0; ofm < handle->blocksofm_blocking; ofm++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + handle->A_offsets_bwd[i] = (ofm * handle->desc.R * handle->desc.S * handle->ifmblock * handle->ofmblock + + kj * handle->desc.S * handle->ifmblock * handle->ofmblock + + ki * handle->ifmblock * handle->ofmblock) * 
libxsmm_dnn_typesize(handle->datatype_in); + handle->B_offsets_bwd[i] = (ofm * handle->ofhp * handle->ofwp * handle->ofmblock + + kj * handle->ofwp * handle->ofmblock + + ki * handle->ofmblock) * libxsmm_dnn_typesize(handle->datatype_in); + i++; + } + } + } + handle->bwd_compute_kernel_offs = libxsmm_bsmmdispatch_reducebatch_offs(handle->ifmblock, handle->bwd_gemm_pixels, handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + } + handle->bwd_config_kernel = libxsmm_bsmmdispatch(handle->ifmblock, handle->bwd_gemm_pixels, handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_tc_flags, NULL); + } + +#if 0 + /* Spit out BWD parameters that are selected... */ + printf("BWD params...\n"); + printf("Bwd_ofw_rb = %d\n", handle->bwd_ofw_rb); + printf("Bwd_ofh_rb = %d\n", handle->bwd_ofh_rb); + printf("Pack input = %d\n", handle->pack_input_bwd); + printf("Spread input = %d\n", handle->spread_input_bwd); + printf("Blocksofm_blocking = %d\n", handle->blocksofm_blocking); + printf("Avoid acc load = %d\n", handle->avoid_acc_load_bwd); + printf("Ifm parallelization = %d\n", handle->use_ifm_parallelization); + printf("Block bwd ofm = %d\n", handle->block_bwd_ofm); + printf("Block bwd ifm = %d\n", handle->block_bwd_ifm); + printf("Block oj = %d\n", handle->block_bwd_oj); +#endif + + handle->code_bwd[0].ptr = 0; + handle->code_bwd[1].ptr = 0; + handle->code_bwd[2].ptr = 0; + + /* Transpose kernel used for filter transpose in bwd pass */ + handle->tr_kernel = libxsmm_dispatch_meltw_unary(64, 16, &(_ldi), &(_ldo), LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + + /* UPD parameter setup */ + handle->upd_linearized_tasklist = libxsmm_dnn_convolution_setup_linearized_tasklist_upd(handle); + handle->upd_avoid_rim_fmas = libxsmm_dnn_convolution_setup_avoid_rim_fmas_upd(handle); + handle->upd_pack_input = libxsmm_dnn_convolution_setup_pack_input_upd(handle); + 
handle->upd_use_batchreduce = libxsmm_dnn_convolution_setup_use_batchreduce_upd(handle); + handle->upd_ofw_rb = libxsmm_dnn_convolution_setup_upd_ofw_rb(handle); + handle->upd_ofh_rb = libxsmm_dnn_convolution_setup_upd_ofh_rb(handle); + handle->upd_loop_order = libxsmm_dnn_convolution_setup_loop_order_upd(handle); + handle->weight_copies = libxsmm_dnn_convolution_setup_weight_copies_upd(handle); + handle->block_upd_ofm = libxsmm_dnn_convolution_setup_block_upd_OFM(handle); + handle->block_upd_ifm = libxsmm_dnn_convolution_setup_block_upd_IFM(handle); + handle->upd_loop_order = libxsmm_dnn_convolution_setup_loop_order_upd(handle); + handle->upd_padding_copy = libxsmm_dnn_convolution_setup_upd_padding_copy(handle); + + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + libxsmm_dnn_convolution_setup_bf16_upd_amx(handle); + } else { + libxsmm_dnn_convolution_setup_bf16_upd(handle); + } + } + +#if 0 + /* Spit out UPD parameters that are selected... 
*/ + printf("UPD params...\n"); + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) { + printf("BF16 path...\n"); + printf("UPD use_hybrid_imgofm_parallelization = %d\n", handle->use_hybrid_imgofm_parallelization); + printf("UPD linearized_pixels = %d\n", handle->upd_linearized_pixels); + printf("UPD upd_trans_w_only = %d\n", handle->upd_trans_w_only); + printf("UPD on_the_fly_input_packing = %d\n", handle->on_the_fly_input_packing); + printf("UPD use_intermediate_f32_wt_tensor = %d\n", handle->use_intermediate_f32_wt_tensor); + printf("UPD pack to CNHW format = %d\n", handle->pack_to_cnhw); + printf("UPD batchreduce H pixels = %d\n", handle->batchreduce_h_pixels); + } + printf("UPD linearized tasks = %d\n", handle->upd_linearized_tasklist); + printf("UPD avoid rim fmas = %d\n", handle->upd_avoid_rim_fmas); + printf("UPD Pack input = %d\n", handle->upd_pack_input); + printf("UPD use batch-reduce GEMM = %d\n", handle->upd_use_batchreduce); + printf("Upd_ofw_rb = %d\n", handle->upd_ofw_rb); + printf("Upd_ofh_rb = %d\n", handle->upd_ofh_rb); + printf("UPD loop order = %d\n", handle->upd_loop_order); + printf("UPD weight_copies = %d\n", handle->weight_copies); + printf("Block upd ofm = %d\n", handle->block_upd_ofm); + printf("Block upd ifm = %d\n", handle->block_upd_ifm); +#endif + + handle->code_upd[0].ptr = 0; + handle->code_upd[1].ptr = 0; + + /* prepare barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + + /* setup up scratch */ + libxsmm_dnn_convolution_setup_fwd_scratch( handle ); + libxsmm_dnn_convolution_setup_bwd_scratch( handle ); + libxsmm_dnn_convolution_setup_upd_scratch( handle ); + handle->scratch = 0; + handle->scratch_size = LIBXSMM_MAX( handle->fwd_scratch_size, LIBXSMM_MAX( handle->bwd_scratch_size, handle->upd_scratch_size ) ); + + return status; +} + +#undef MIXED +#undef KHWC +#undef HWKC +#undef CHWK +#undef HWCK + + +LIBXSMM_API libxsmm_dnn_layer* libxsmm_dnn_create_conv_layer( + libxsmm_dnn_conv_desc conv_desc, 
+ libxsmm_dnn_err_t* status) +{ + libxsmm_dnn_layer* handle = 0; + *status = LIBXSMM_DNN_SUCCESS; + + /* currently we don't support NCHW */ + if ( (conv_desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCHW) > 0 ) { + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_NCHW; + return 0; + } + /* currently we don't support KCRS */ + if ( (conv_desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_KCRS) > 0 ) { + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_KCRS; + return 0; + } + /* we only support physical paddind in these days */ + /* @TODO: add logical padding support for other datatypes than FP32 */ + if ( ( ( conv_desc.pad_h != conv_desc.pad_h_in ) || + ( conv_desc.pad_w != conv_desc.pad_w_in ) || + ( conv_desc.pad_h != conv_desc.pad_h_out ) || + ( conv_desc.pad_w != conv_desc.pad_w_out ) ) && + ( conv_desc.datatype_in != LIBXSMM_DNN_DATATYPE_F32 ) && (conv_desc.datatype_in != LIBXSMM_DNN_DATATYPE_BF16) ) { + *status = LIBXSMM_DNN_ERR_INVALID_PADDING; + return 0; + } + + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_layer*)calloc(1, sizeof(libxsmm_dnn_layer)); + + if (0 != handle) { + /* initialize known handle components */ + handle->desc = conv_desc; + handle->datatype_in = conv_desc.datatype_in; + handle->datatype_out = conv_desc.datatype_out; + /* select the intermediate format, only applicable for integer types */ + if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { + /* error */ + } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_BF16) ) { + /* error */ + } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { + /* error */ + } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_I32) ) { + /* error */ + } else if ( (conv_desc.datatype_in == 
LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_I8) ) { + /* error */ + } else if ( (conv_desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8) && (conv_desc.datatype_out != LIBXSMM_DNN_DATATYPE_F32) ) { + /* error */ + } else { + /* fine, no error */ + } + handle->buffer_format = conv_desc.buffer_format; + handle->filter_format = conv_desc.filter_format; + handle->fuse_ops = conv_desc.fuse_ops; + handle->options = conv_desc.options; + + /* derive additional values */ + handle->ifhp = conv_desc.H + 2*conv_desc.pad_h_in; + handle->ifwp = conv_desc.W + 2*conv_desc.pad_w_in; + handle->ofh = (conv_desc.H + 2*conv_desc.pad_h - conv_desc.R) / conv_desc.u + 1; + handle->ofw = (conv_desc.W + 2*conv_desc.pad_w - conv_desc.S) / conv_desc.v + 1; + handle->ofhp = handle->ofh + 2*conv_desc.pad_h_out; + handle->ofwp = handle->ofw + 2*conv_desc.pad_w_out; + handle->ifmblock = 1; + handle->ofmblock = 1; + handle->blocksifm = conv_desc.C; + handle->blocksofm = conv_desc.K; + handle->fwd_ofw_rb = 1; + handle->fwd_ofh_rb = 1; + handle->bwd_ofw_rb = 1; + handle->bwd_ofh_rb = 1; + handle->upd_ofw_rb = 1; + handle->upd_ofh_rb = 1; + handle->fm_lp_block = 1; + handle->blocksifm_blocking = 1; + handle->blocksofm_blocking = 1; + /* Set algorithm to use */ + if (conv_desc.algo == LIBXSMM_DNN_CONV_ALGO_AUTO) { + handle->algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; + } else { + handle->algo = conv_desc.algo; + } + if ( handle->algo != LIBXSMM_DNN_CONV_ALGO_DIRECT ) { + *status = LIBXSMM_DNN_ERR_INVALID_ALGO; + free(handle); + handle = 0; + return 0; + } + + *status = libxsmm_dnn_convolution_setup(handle); + } + else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + /* account for eventually deallocated handle */ + if ( LIBXSMM_DNN_SUCCESS != *status ) { + handle = 0; + } + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_conv_layer(const libxsmm_dnn_layer* handle) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate 
barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + + /* deallocate handle structure itself */ + free(/*remove constness*/(libxsmm_dnn_layer*)handle); + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_create_tensor_datalayout(const libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_ACTIVATION; + + if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = 
handle->ifmblock; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = handle->ifhp; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = handle->ofhp; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + /* @TODO this need to change */ + } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_I32) ) { + if ( ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) ) ) { + layout->datatype = handle->datatype_in; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->datatype = handle->datatype_out; + } + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = 
handle->ifhp; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = handle->ofhp; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = handle->ifhp; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = handle->ofhp; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { /* 
coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) ) { + if ( ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) ) ) { + layout->datatype = handle->datatype_in; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) ) { + layout->datatype = handle->datatype_out; + } + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = handle->ifhp; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = 
handle->ofhp; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = handle->ofhp; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = handle->ifhp; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != 
layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = handle->ifmblock * handle->blocksifm; + layout->dim_size[1] = handle->ifwp; + layout->dim_size[2] = handle->ifhp; + layout->dim_size[3] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock * handle->blocksofm; + layout->dim_size[1] = handle->ofwp; + layout->dim_size[2] = handle->ofhp; + layout->dim_size[3] = handle->desc.N; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { + layout->format = handle->filter_format; + layout->tensor_type = LIBXSMM_DNN_FILTER; + + if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) 
malloc(6*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 6; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->ifmblock; + layout->dim_size[2] = handle->desc.S; + layout->dim_size[3] = handle->desc.R; + layout->dim_size[4] = handle->blocksifm; + layout->dim_size[5] = handle->blocksofm; + } + } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 7; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->ofmblock; + layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; + layout->dim_size[3] = handle->desc.S; + layout->dim_size[4] = handle->desc.R; + layout->dim_size[5] = handle->blocksifm; + layout->dim_size[6] = handle->blocksofm; + } + } else if ( ((handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || 
(handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8 ) ) { + if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { + layout->datatype = handle->datatype_in; + } else if (type == LIBXSMM_DNN_GRADIENT_FILTER) { + layout->datatype = handle->datatype_out; + } + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + if ((type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_FILTER)) { + layout->num_dims = 7; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->ofmblock; + layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; + layout->dim_size[3] = handle->desc.S; + layout->dim_size[4] = handle->desc.R; + layout->dim_size[5] = handle->blocksifm; + layout->dim_size[6] = handle->blocksofm; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { + if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_size[0] = handle->ofmblock * handle->blocksofm; + layout->dim_size[1] = handle->ifmblock * handle->blocksifm; + layout->dim_size[2] = handle->desc.S; + layout->dim_size[3] = handle->desc.R; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { + layout->format = handle->filter_format; + layout->tensor_type = LIBXSMM_DNN_REGULAR_FILTER_TRANS; + + if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 6; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->ofmblock; + layout->dim_size[2] = handle->desc.S; + layout->dim_size[3] = handle->desc.R; + layout->dim_size[4] = handle->blocksofm; + layout->dim_size[5] = handle->blocksifm; + } + } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && 
(handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 7; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->ifmblock; + layout->dim_size[2] = handle->ofmblock/handle->fm_lp_block; + layout->dim_size[3] = handle->desc.S; + layout->dim_size[4] = handle->desc.R; + layout->dim_size[5] = handle->blocksofm; + layout->dim_size[6] = handle->blocksifm; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } +#if 0 + } else if ((handle->filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { + if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_size[0] = handle->ofmblock * 
handle->blocksofm; + layout->dim_size[1] = handle->ifmblock * handle->blocksifm; + layout->dim_size[2] = handle->desc.S; + layout->dim_size[3] = handle->desc.K; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } +#endif + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) || (type == LIBXSMM_DNN_CHANNEL_BIAS) ) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->datatype_out; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->blocksofm; + } +#if 0 + } else if ( (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I16) || (handle->datatype_in == LIBXSMM_DNN_DATATYPE_I8) ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(3*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 3; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->ofmblock; + 
layout->dim_size[2] = handle->blocksofm; + } +#endif + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + layout->datatype = handle->datatype_out; + if ( handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 1; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->ofmblock*handle->blocksofm; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_BATCH_STATS) ) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_BATCH_STATS; + + if ((handle->buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( (handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32) || (handle->datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = handle->desc.N; + layout->dim_size[2] = 
handle->blocksofm; + layout->dim_size[3] = 2; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if (type == LIBXSMM_DNN_MAX_STATS_FWD) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_MAX_STATS_FWD; + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.N; + } + } else if (type == LIBXSMM_DNN_MAX_STATS_BWD) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_MAX_STATS_BWD; + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.N; + } + } else if (type == LIBXSMM_DNN_MAX_STATS_UPD) { + layout->format = handle->buffer_format; + layout->tensor_type = LIBXSMM_DNN_MAX_STATS_UPD; + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + if (0 != 
layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.N; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + +/* Copies the bound bfloat16 filter (handle->reg_filter) into handle->reg_filter_tr with the input and output feature-map indices exchanged and the R and S kernel coordinates mirrored. Returns LIBXSMM_DNN_ERR_INVALID_HANDLE for a null handle, LIBXSMM_DNN_ERR_INVALID_TENSOR when either of the two filter tensors is not bound, and LIBXSMM_DNN_SUCCESS otherwise. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_bf16_filter(const libxsmm_dnn_layer* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (handle != 0) { + if ( (handle->reg_filter != 0) && (handle->reg_filter_tr != 0) ) { + /* TODO handle more datatypes */ + int ifm1, ifm2, kj, ki, ofm1, ofm2; + /* NOTE(review): fm_lp_block is used as a divisor below without a zero check; presumably it is at least 1 for any valid handle, confirm against the handle setup code */ int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; + int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; + int lpb = handle->fm_lp_block; + /* both tensors are addressed through 7 indices; the innermost index has extent lpb (NOTE(review): data is reinterpreted as libxsmm_bfloat16 without inspecting the handle datatype, see the TODO above) */ LIBXSMM_VLA_DECL(7, libxsmm_bfloat16, wt, (libxsmm_bfloat16*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); + LIBXSMM_VLA_DECL(7, libxsmm_bfloat16, tr_wt, (libxsmm_bfloat16*)handle->reg_filter_tr->data, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + + /* TODO we might want to do this in parallel.... 
*/ + for ( ifm1 = 0; ifm1 < handle->blocksifm; ++ifm1 ) { + for ( ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1 ) { + for (kj=0; kj < handle->desc.R; ++kj) { + for (ki=0; ki < handle->desc.S; ++ki) { + for ( ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2 ) { + for ( ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2 ) { + /* destination uses block order (ifm1, ofm1) and the mirrored kernel position (R-1-kj, S-1-ki); ofm2 is split into ofm2/lpb and ofm2%lpb there, while the source splits ifm2 the same way */ LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = + LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); + } + } + } + } + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_TENSOR; + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + +/* Transposing copy for float filters: reads handle->reg_filter and writes handle->reg_filter_tr with the input and output feature-map indices exchanged and the R and S kernel coordinates mirrored. Returns LIBXSMM_DNN_ERR_INVALID_HANDLE for a null handle, LIBXSMM_DNN_ERR_INVALID_TENSOR when either filter tensor is not bound, and LIBXSMM_DNN_SUCCESS otherwise. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_trans_reg_filter(const libxsmm_dnn_layer* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (handle != 0) { + if ( (handle->reg_filter != 0) && (handle->reg_filter_tr != 0) ) { + /* TODO handle more datatypes */ + int ifm1, ifm2, kj, ki, ofm1, ofm2; + /* NOTE(review): the tensor data is reinterpreted as float without inspecting the handle datatype, see the TODO above */ LIBXSMM_VLA_DECL(6, float, wt, (float*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + LIBXSMM_VLA_DECL(6, float, tr_wt, (float*)handle->reg_filter_tr->data, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + + /* TODO we might want to do this in parallel.... 
*/ + for ( ifm1 = 0; ifm1 < handle->blocksifm; ++ifm1 ) { + for ( ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1 ) { + for (kj=0; kj < handle->desc.R; ++kj) { + for (ki=0; ki < handle->desc.S; ++ki) { + for ( ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2 ) { + for ( ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2 ) { + /* tr_wt at (ifm1, ofm1, R-1-kj, S-1-ki, ofm2, ifm2) receives wt at (ofm1, ifm1, kj, ki, ifm2, ofm2) */ LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } + } + } + } + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_TENSOR; + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +/* Stores `tensor` in the handle slot selected by `type`, but only if its layout compares equal to the layout the handle generates for that type. Returns LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE when `type` is not one of the types listed below, LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR when handle or tensor is null, and LIBXSMM_DNN_ERR_MISMATCH_TENSOR when the layout comparison returns non-zero. The handle keeps the pointer only (no copy is made) and the const qualifier of `tensor` is cast away when it is stored. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + /* reference layout generated from the handle; NOTE(review): it is passed to the comparison without a null check here and the comparison receives the same status variable, so presumably the compare routine tolerates a null layout, confirm */ libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 
(libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + handle->reg_bias = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + handle->grad_bias = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { + handle->reg_filter_tr = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { + handle->batch_stats = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { + handle->maxstats_fwd = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { + handle->maxstats_bwd = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { + handle->maxstats_upd = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + /* the temporary reference layout is released on the match path and on the mismatch path alike */ libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +/* Returns the tensor pointer currently stored in the handle slot selected by `type`; the result is 0 when `type` is not one of the types listed below or when handle is null. `*status` is written before any check, so `status` must be a valid pointer. On failure `*status` is LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE or LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR respectively. */ LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_get_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) +{ + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != 
LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + /* plain lookup: each accepted type maps to exactly one handle member; a slot that was never bound is returned as stored */ if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + return_tensor = handle->reg_input; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + return_tensor = handle->grad_input; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + return_tensor = handle->reg_output; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + return_tensor = handle->grad_output; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + return_tensor = handle->reg_filter; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + return_tensor = handle->grad_filter; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + return_tensor = handle->reg_bias; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + return_tensor = handle->grad_bias; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { + return_tensor = handle->reg_filter_tr; + } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { + return_tensor = handle->batch_stats; + } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { + return_tensor = handle->maxstats_fwd; + } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { + return_tensor = handle->maxstats_bwd; + } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { + return_tensor = handle->maxstats_upd; + } else { + /* cannot happen */ + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return return_tensor; +} + + +/* Resets the handle slot selected by `type` to 0. Only the pointer held by the handle is cleared; nothing is freed in this function. Returns LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE when `type` is not one of the types listed below, LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR for a null handle, and LIBXSMM_DNN_SUCCESS otherwise. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_tensor(libxsmm_dnn_layer* handle, const libxsmm_dnn_tensor_type type) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for 
tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_REGULAR_FILTER_TRANS) && (type != LIBXSMM_DNN_BATCH_STATS) && (type != LIBXSMM_DNN_MAX_STATS_FWD) && (type != LIBXSMM_DNN_MAX_STATS_BWD) && (type != LIBXSMM_DNN_MAX_STATS_UPD) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + handle->reg_bias = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + handle->grad_bias = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER_TRANS ) { + handle->reg_filter_tr = 0; + } else if ( type == LIBXSMM_DNN_BATCH_STATS ) { + handle->batch_stats = 0; + } else if ( type == LIBXSMM_DNN_MAX_STATS_FWD ) { + handle->maxstats_fwd = 0; + } else if ( type == LIBXSMM_DNN_MAX_STATS_BWD ) { + handle->maxstats_bwd = 0; + } else if ( type == LIBXSMM_DNN_MAX_STATS_UPD ) { + handle->maxstats_upd = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +/* Reports the scratch buffer size for the handle: handle->scratch_size plus 64 bytes, or 0 for a null handle (with LIBXSMM_DNN_ERR_INVALID_HANDLE). `kind` only affects `*status`: any value other than FWD, BWD, UPD or ALL sets LIBXSMM_DNN_ERR_INVALID_KIND. `*status` is written unconditionally, so `status` must be a valid pointer. */ LIBXSMM_API size_t libxsmm_dnn_get_scratch_size(const libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status) +{ + size_t l_scratch_size 
= 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; + case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + /* NOTE(review): the size is added even when kind was rejected above, so an invalid kind yields a non-zero size together with LIBXSMM_DNN_ERR_INVALID_KIND; the extra 64 bytes are presumably headroom for 64-byte alignment of the bound pointer, confirm */ l_scratch_size += handle->scratch_size + 64; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +/* Points handle->scratch at `scratch`, rounded up to the next multiple of 64 bytes when it is not already a multiple of 64. Returns LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED for a null scratch pointer, LIBXSMM_DNN_ERR_INVALID_HANDLE for a null handle, and LIBXSMM_DNN_ERR_INVALID_KIND for a kind other than FWD, BWD, UPD or ALL; in that last case handle->scratch has already been set. The const qualifier of `scratch` is dropped through the integer round trip. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_bind_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind, const void* scratch) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if (0 != handle) { + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + /* distance from address up to the next multiple of 64 */ offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + /* NOTE(review): address is not read again after this update, so the statement has no effect on the result */ address += handle->scratch_size + 64; + + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; + case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +/* Clears handle->scratch; the buffer itself is not freed here. The pointer is cleared before `kind` is validated, so a kind other than FWD, BWD, UPD or ALL still clears it and returns LIBXSMM_DNN_ERR_INVALID_KIND. A null handle returns LIBXSMM_DNN_ERR_INVALID_HANDLE. */ LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_release_scratch(libxsmm_dnn_layer* handle, const libxsmm_dnn_compute_kind kind) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: break; + case LIBXSMM_DNN_COMPUTE_KIND_UPD: break; + case LIBXSMM_DNN_COMPUTE_KIND_ALL: break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API_INLINE libxsmm_dnn_err_t 
internal_execute_st(libxsmm_dnn_layer* handle, + libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (handle->algo) { + case LIBXSMM_DNN_CONV_ALGO_DIRECT: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + switch (handle->buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_fwd_custom_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { + status = libxsmm_dnn_convolve_st_fwd_nhwc_rsck(handle, start_thread, tid); + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_fwd_nhwc_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + switch (handle->buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_bwd_custom_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { + status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck(handle, start_thread, tid); + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_bwd_nhwc_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + default: { + status = 
LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_UPD: { + switch (handle->buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_upd_custom_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { + status = libxsmm_dnn_convolve_st_upd_nhwc_rsck(handle, start_thread, tid); + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_upd_nhwc_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: { + switch (handle->buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_upd_custom_custom(handle, start_thread, tid); + status = libxsmm_dnn_convolve_st_bwd_custom_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_NHWC: { + switch (handle->filter_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_RSCK: { + status = libxsmm_dnn_convolve_st_upd_nhwc_rsck(handle, start_thread, tid); + status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck(handle, start_thread, tid); + } break; + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_convolve_st_upd_nhwc_custom(handle, start_thread, tid); + status = libxsmm_dnn_convolve_st_bwd_nhwc_custom(handle, start_thread, tid); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + default: { + status = 
LIBXSMM_DNN_ERR_INVALID_FORMAT_CONVOLVE; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_ALGO; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_execute_st(libxsmm_dnn_layer* handle, + libxsmm_dnn_compute_kind kind, /*unsigned*/int start_thread, /*unsigned*/int tid) +{ + return internal_execute_st(handle, kind, start_thread, tid); +} + + +LIBXSMM_API void libxsmm_dnn_execute(libxsmm_dnn_layer* handle, libxsmm_dnn_compute_kind kind) +{ +#if defined(_OPENMP) +# pragma omp parallel num_threads(handle->desc.threads) + { + const int tid = omp_get_thread_num(); + internal_execute_st(handle, kind, 0, tid); + } +#else + internal_execute_st(handle, kind, 0/*start_thread*/, 0/*tid*/); +#endif +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.c b/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.c new file mode 100644 index 00000000..32da0d18 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.c @@ -0,0 +1,719 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_convolution_backward.h" +#include "libxsmm_main.h" + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_vnni_transpose_16x16_kernel(void* source_void, void* dest_void, int source_stride, int dest_stride) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + libxsmm_bfloat16 *source = (libxsmm_bfloat16*)source_void; + libxsmm_bfloat16 *dest = (libxsmm_bfloat16*)dest_void; + __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; + __m512i tmp0, tmp1, tmp2, tmp3; + const __m512i abcdefgh_to_abefcdgh = _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100); + + zmm0 = _mm512_load_epi32(source); + zmm1 = _mm512_load_epi32(source + source_stride); + zmm2 = _mm512_load_epi32(source + source_stride*2); + zmm3 = _mm512_load_epi32(source + source_stride*3); + zmm4 = _mm512_load_epi32(source + source_stride*4); + zmm5 = 
_mm512_load_epi32(source + source_stride*5); + zmm6 = _mm512_load_epi32(source + source_stride*6); + zmm7 = _mm512_load_epi32(source + source_stride*7); + + zmm0 = _mm512_shuffle_epi8(zmm0, abcdefgh_to_abefcdgh); + zmm1 = _mm512_shuffle_epi8(zmm1, abcdefgh_to_abefcdgh); + zmm2 = _mm512_shuffle_epi8(zmm2, abcdefgh_to_abefcdgh); + zmm3 = _mm512_shuffle_epi8(zmm3, abcdefgh_to_abefcdgh); + zmm4 = _mm512_shuffle_epi8(zmm4, abcdefgh_to_abefcdgh); + zmm5 = _mm512_shuffle_epi8(zmm5, abcdefgh_to_abefcdgh); + zmm6 = _mm512_shuffle_epi8(zmm6, abcdefgh_to_abefcdgh); + zmm7 = _mm512_shuffle_epi8(zmm7, abcdefgh_to_abefcdgh); + + tmp0 = _mm512_unpacklo_epi64(zmm0, zmm1); + tmp1 = _mm512_unpackhi_epi64(zmm0, zmm1); + tmp2 = _mm512_unpacklo_epi64(zmm2, zmm3); + tmp3 = _mm512_unpackhi_epi64(zmm2, zmm3); + zmm0 = _mm512_unpacklo_epi64(zmm4, zmm5); + zmm1 = _mm512_unpackhi_epi64(zmm4, zmm5); + zmm2 = _mm512_unpacklo_epi64(zmm6, zmm7); + zmm3 = _mm512_unpackhi_epi64(zmm6, zmm7); + + zmm4 = _mm512_shuffle_i32x4(tmp0, tmp2, 0x88); + zmm6 = _mm512_shuffle_i32x4(tmp0, tmp2, 0xdd); + zmm5 = _mm512_shuffle_i32x4(tmp1, tmp3, 0x88); + zmm7 = _mm512_shuffle_i32x4(tmp1, tmp3, 0xdd); + tmp0 = _mm512_shuffle_i32x4(zmm0, zmm2, 0x88); + tmp1 = _mm512_shuffle_i32x4(zmm0, zmm2, 0xdd); + tmp2 = _mm512_shuffle_i32x4(zmm1, zmm3, 0x88); + tmp3 = _mm512_shuffle_i32x4(zmm1, zmm3, 0xdd); + + zmm0 = _mm512_shuffle_i32x4(zmm4, tmp0, 0x88); + zmm1 = _mm512_shuffle_i32x4(zmm5, tmp2, 0x88); + zmm2 = _mm512_shuffle_i32x4(zmm6, tmp1, 0x88); + zmm3 = _mm512_shuffle_i32x4(zmm7, tmp3, 0x88); + zmm4 = _mm512_shuffle_i32x4(zmm4, tmp0, 0xdd); + zmm5 = _mm512_shuffle_i32x4(zmm5, tmp2, 0xdd); + zmm6 = _mm512_shuffle_i32x4(zmm6, tmp1, 0xdd); + zmm7 = _mm512_shuffle_i32x4(zmm7, tmp3, 0xdd); + + _mm512_store_epi32(dest, zmm0); + _mm512_store_epi32(dest + dest_stride, zmm1); + _mm512_store_epi32(dest + dest_stride * 2, zmm2); + _mm512_store_epi32(dest + dest_stride * 3, zmm3); + _mm512_store_epi32(dest + dest_stride * 4, 
zmm4); + _mm512_store_epi32(dest + dest_stride * 5, zmm5); + _mm512_store_epi32(dest + dest_stride * 6, zmm6); + _mm512_store_epi32(dest + dest_stride * 7, zmm7); +#else + LIBXSMM_UNUSED(source_void); LIBXSMM_UNUSED(dest_void); LIBXSMM_UNUSED(source_stride); LIBXSMM_UNUSED(dest_stride); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_vnni_transpose_kernel(libxsmm_bfloat16* src, libxsmm_bfloat16* dst, int M, int N, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + const int _M = M/16, _N = N/16; + int i = 0, j = 0; + for (i = 0; i < _N; i++) { + for (j = 0; j < _M; j++) { + bf16_vnni_transpose_16x16_kernel((libxsmm_bfloat16*) src+i*16*ld_in+j*32, (libxsmm_bfloat16*) dst+j*16*ld_out+i*32, ld_in*2, ld_out*2); + } + } +#else + LIBXSMM_UNUSED(src); LIBXSMM_UNUSED(dst); LIBXSMM_UNUSED(M); LIBXSMM_UNUSED(N); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); +#endif +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; + const float beta = (handle->avoid_acc_load_bwd ? 
0.f : 1.f); + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c" + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, NULL, NULL, &ldC, NULL, NULL, NULL, NULL); +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c" + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int 
start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + { typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; + typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; + const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; + const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); +# include 
"template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } + } else { + const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction_reducebatch_strd brgemm_function; + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + int stride_a = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(libxsmm_bfloat16); + int stride_b = handle->ofmblock * handle->ofwp * handle->ofhp * sizeof(libxsmm_bfloat16); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + brgemm_function bf16fp32_brgemm_kernel = libxsmm_bsmmdispatch_reducebatch_strd(handle->ifmblock, handle->ofw, handle->ofmblock, stride_a, stride_b, NULL, NULL, &ldC, NULL, NULL, &l_flags, NULL); + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# 
include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + { + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction gemm_function; + typedef libxsmm_bsmmfunction_reducebatch_offs gemm_br_function_offs; + typedef libxsmm_bsmmfunction_reducebatch_strd gemm_br_function_strd; + gemm_br_function_offs br_gemm_kernel_offs = handle->bwd_compute_kernel_offs; + gemm_br_function_strd br_gemm_kernel_strd = handle->bwd_compute_kernel_strd; + gemm_function tile_config_kernel = handle->bwd_config_kernel; +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } + } else { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction_reducebatch_strd brgemm_function; + const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + int stride_a = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(libxsmm_bfloat16); + int stride_b = handle->ofmblock * handle->ofwp * handle->ofhp * sizeof(libxsmm_bfloat16); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + brgemm_function bf16fp32_brgemm_kernel = libxsmm_bsmmdispatch_reducebatch_strd(handle->ifmblock, handle->ofw, handle->ofmblock, stride_a, stride_b, NULL, NULL, &ldC, NULL, NULL, &l_flags, NULL); + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if 
defined(LIBXSMM_INTRINSICS_AVX512_CPX) + LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +# define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + { + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; + typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; + const libxsmm_blasint ldB = (libxsmm_blasint)handle->ofmblock; + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : (libxsmm_blasint)handle->ifmblock; + const float beta = (handle->avoid_acc_load_bwd ? 
0.f : 1.f); + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, NULL); +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } +# undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + } else { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction_reducebatch_strd brgemm_function; + const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + int stride_a = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(libxsmm_bfloat16); + int stride_b = handle->ofmblock * handle->ofwp * handle->ofhp * sizeof(libxsmm_bfloat16); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + brgemm_function bf16fp32_brgemm_kernel = libxsmm_bsmmdispatch_reducebatch_strd(handle->ifmblock, handle->ofw, 
handle->ofmblock, stride_a, stride_b, NULL, NULL, &ldC, NULL, NULL, &l_flags, NULL); +# define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +# undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else + LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) + LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +# define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + { + typedef libxsmm_bsmmfunction gemm_function; + typedef libxsmm_bsmmfunction_reducebatch_offs gemm_br_function_offs; + typedef libxsmm_bsmmfunction_reducebatch_strd gemm_br_function_strd; + gemm_br_function_offs br_gemm_kernel_offs = handle->bwd_compute_kernel_offs; + gemm_br_function_strd br_gemm_kernel_strd = 
handle->bwd_compute_kernel_strd; + gemm_function tile_config_kernel = handle->bwd_config_kernel; +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } +# undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + } else { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction_reducebatch_strd brgemm_function; + const libxsmm_blasint ldC = (libxsmm_blasint)(handle->desc.v*handle->ifmblock); + int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + int stride_a = handle->ifmblock * handle->desc.R * handle->desc.S * handle->ofmblock * sizeof(libxsmm_bfloat16); + int stride_b = handle->ofmblock * handle->ofwp * handle->ofhp * sizeof(libxsmm_bfloat16); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + brgemm_function bf16fp32_brgemm_kernel = libxsmm_bsmmdispatch_reducebatch_strd(handle->ifmblock, handle->ofw, handle->ofmblock, stride_a, stride_b, NULL, NULL, &ldC, NULL, NULL, &l_flags, NULL); + +# define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +# undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else + LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu_amx( handle, 
start_thread, tid ); +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); + const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM +# include 
"template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? + (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : + (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = 
(libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); + const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
+ (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : + (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_bwd_custom_custom_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = 
libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CPX && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_amx( handle, start_thread, tid); + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_bwd_custom_custom_bf16_bf16_emu_amx( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldx = ((libxsmm_blasint)handle->ofmblock); + const libxsmm_blasint ldA = handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) 
? handle->ifmblock * handle->desc.v : handle->ifmblock; + const float beta = (handle->avoid_acc_load_bwd) ? 0.f : 1.f; + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c" + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldx = ((libxsmm_blasint)handle->desc.v*handle->ifmblock); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, NULL, NULL, &ldx, NULL, NULL, NULL, NULL); +# include "template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c" + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + 
libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_bwd_nhwc_rsck_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); + const float beta = (handle->avoid_acc_load_bwd ? 
0.f : 1.f); + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
+ (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : + (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->grad_input == 0 || handle->grad_output == 0 || handle->reg_filter == 0 || handle->scratch == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_bwd_nhwc_custom_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + if (handle->use_fallback_bwd_loops == 0) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + const libxsmm_blasint ldB = 
(libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = (handle->spread_input_bwd == 1) ? (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v) : (libxsmm_blasint)(handle->blocksifm * handle->ifmblock); + const float beta = (handle->avoid_acc_load_bwd ? 0.f : 1.f); + int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*handle->bwd_ofw_rb, handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ifmblock, handle->bwd_ofh_rb*(handle->bwd_ofw_rb-1), handle->ofmblock, &ldA, &ldB, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM + } + } else { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + const libxsmm_blasint ldB = (libxsmm_blasint)(handle->blocksofm * handle->ofmblock); + const libxsmm_blasint ldA = (libxsmm_blasint)handle->ifmblock; + const libxsmm_blasint ldC = ( (handle->desc.pad_h != handle->desc.pad_h_in) || (handle->desc.pad_w != handle->desc.pad_w_in) ) ? 
+ (libxsmm_blasint)(handle->ifmblock * handle->desc.v) : + (libxsmm_blasint)(handle->blocksifm * handle->ifmblock * handle->desc.v); + /* let's do a ifmblock x ofw_rb x ofmblock GEMM :-) or in other words M=nbIfm, N=ofw, K=nbOfm (col-major) */ + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ifmblock, handle->ofw, handle->ofmblock, &ldA, &ldB, &ldC, NULL, NULL, NULL, NULL); +# define LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.h b/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.h new file mode 100644 index 00000000..ed1928d0 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_backward.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Rajkishore Barik, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_CONVOLUTION_BACKWARD_H +#define LIBXSMM_DNN_CONVOLUTION_BACKWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_bwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_CONVOLUTION_BACKWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.c b/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.c new file mode 100644 index 00000000..b56b60b6 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.c @@ -0,0 +1,544 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_convolution_forward.h" +#include "libxsmm_main.h" + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8(libxsmm_dnn_layer* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; +#if 1 + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function_addr; + const libxsmm_blasint ldx = (handle->pack_input == 1) ? 
(libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function_addr br_gemm_kernel_a_addr = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function_addr br_gemm_kernel_b_addr = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +#else + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function_addr; + typedef libxsmm_smmfunction_reducebatch_offs gemm_br_function_offs; + typedef libxsmm_smmfunction_reducebatch_strd gemm_br_function_strd; + + { + gemm_br_function_addr br_gemm_kernel_a_addr = handle->fwd_compute_kernel_addr_a_f32; + gemm_br_function_addr br_gemm_kernel_b_addr = handle->fwd_compute_kernel_addr_b_f32; + gemm_br_function_offs br_gemm_kernel_offs = handle->fwd_compute_kernel_offs_f32; + gemm_br_function_strd br_gemm_kernel_strd = handle->fwd_compute_kernel_strd_f32; +#endif +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c" + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + 
status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + { + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; + typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; + int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') )| handle->fwd_flags; + + /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction 
gemm_function; + typedef libxsmm_bmmfunction_reducebatch_offs gemm_br_function_offs_a; + typedef libxsmm_bsmmfunction_reducebatch_offs gemm_br_function_offs_b; + typedef libxsmm_bmmfunction_reducebatch_strd gemm_br_function_strd; + gemm_br_function_offs_a br_gemm_kernel_offs_a = handle->fwd_compute_kernel_offs_a; + gemm_br_function_offs_b br_gemm_kernel_offs_b = handle->fwd_compute_kernel_offs_b; + gemm_br_function_strd br_gemm_kernel_strd = handle->fwd_compute_kernel_strd; + gemm_function tile_config_kernel = handle->fwd_config_kernel; +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c" +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; + typedef libxsmm_bmmfunction_reducebatch_addr gemm_br_function_bf16bf16; + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; + int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | handle->fwd_flags; + gemm_br_function br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function br_gemm_kernel2 = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); + gemm_br_function_bf16bf16 br_gemm_kernel2_bf16bf16 = libxsmm_bmmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, NULL); +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + 
typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction gemm_function; + typedef libxsmm_bmmfunction_reducebatch_offs gemm_br_function_offs_a; + typedef libxsmm_bsmmfunction_reducebatch_offs gemm_br_function_offs_b; + typedef libxsmm_bmmfunction_reducebatch_strd gemm_br_function_strd; + gemm_br_function_offs_a br_gemm_kernel_offs_a = handle->fwd_compute_kernel_offs_a; + gemm_br_function_offs_b br_gemm_kernel_offs_b = handle->fwd_compute_kernel_offs_b; + gemm_br_function_strd br_gemm_kernel_strd = handle->fwd_compute_kernel_strd; + gemm_function tile_config_kernel = handle->fwd_config_kernel; +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu_amx( handle, start_thread, tid ); +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef unsigned char element_input_type; + typedef int element_output_type; + typedef char element_filter_type; + /* Basically we need only 
offset based and strided BRGEMMs */ + libxsmm_subimmfunction_reducebatch_strd br_gemm_kernel_strided = handle->gemm_fwd.xgemm.subimrs; + libxsmm_subimmfunction_reducebatch_strd br_gemm_kernel_strided2 = handle->gemm_fwd2.xgemm.subimrs; + libxsmm_subimmfunction_reducebatch_offs br_gemm_kernel_offset = handle->gemm_fwd.xgemm.subimro; +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c" +#else + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef unsigned char element_input_type; + typedef unsigned char element_output_type; + typedef char element_filter_type; + /* Basically we need only offset based and strided BRGEMMs */ + libxsmm_sububmmfunction_reducebatch_strd br_gemm_kernel_strided = handle->gemm_fwd.xgemm.sububmrs; + libxsmm_sububmmfunction_reducebatch_offs br_gemm_kernel_offset = handle->gemm_fwd.xgemm.sububmro; +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c" +#else + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + const libxsmm_blasint ldx = (handle->pack_input == 1) ? 
(libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) { + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +/* F32 forward kernel called by libxsmm_dnn_convolve_st_fwd_nhwc_rsck. When built with LIBXSMM_INTRINSICS_AVX512 it dispatches two batch-reduce SGEMM kernels (N = fwd_ofh_rb*fwd_ofw_rb and N = fwd_ofh_rb*(fwd_ofw_rb-1)) and expands the shared nhwc_custom-rsck template with LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK defined; the returned status starts as LIBXSMM_DNN_SUCCESS. Without LIBXSMM_INTRINSICS_AVX512 it only returns LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH. handle, start_thread and tid are presumably consumed by the included template (not visible here). */ +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) 
+{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock;/* leading dimension of the input operand: the desc.v factor is dropped when pack_input == 1 */ + const libxsmm_blasint ldA = handle->blocksofm*handle->ofmblock;/* RSCK variant: blocksofm*ofmblock, whereas the nhwc_custom variant passes ofmblock alone */ + const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 0.f : 1.f;/* presumably beta=0 overwrites the output and beta=1 accumulates into it -- confirm against the dispatch API */ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) {/* BRGEMM_PF_OOB is not set: brgemm_pf_oob stays 0 */ + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob);/* NOTE(review): atoi reports no errors; the environment is re-read on every call */ + } + if (brgemm_pf_oob > 0) {/* any positive value switches the prefetch request to LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB */ + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode);/* same GEMM with one fewer output column per row; which kernel runs is decided inside the template (not visible here) */ +# define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} 
+ +/* Forward-convolution entry point for the custom/custom layout pair. Returns LIBXSMM_DNN_ERR_DATA_NOT_BOUND unless reg_input, reg_output and reg_filter are all non-zero. When built with LIBXSMM_INTRINSICS_AVX512 and target_archid lies in [LIBXSMM_X86_AVX512, LIBXSMM_X86_ALLFEAT], the call is forwarded to a datatype-specific kernel (F32, I8 to I32, I8 to I8, and BF16 variants selected by target_archid); otherwise only F32 is handled, by expanding the generic template inline. Any other datatype combination returns LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_f32_f32( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_I32 ) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i32( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_I8 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_I8 ) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_i8_i8( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_CPX) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid);/* BF16 with target_archid in [AVX512_CORE, AVX512_CPX): the _emu variant */ + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CPX && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16( handle, start_thread, tid);/* BF16 with target_archid in [AVX512_CPX, AVX512_SPR) */ + } else if ( 
handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_amx( handle, start_thread, tid);/* BF16 with target_archid at or above AVX512_SPR: the _amx variant */ + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_fwd_custom_custom_bf16_bf16_emu_amx( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + {/* fallback: taken when AVX512 support is compiled out or target_archid is outside the range tested above; only F32 is handled. NOTE(review): this branch tests handle->datatype_in/out while the AVX512 branch tests handle->desc.datatype_in/out -- presumably mirrored fields, confirm */ + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; +#if 1 + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function_addr; + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->ifmblock : (libxsmm_blasint)handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) {/* BRGEMM_PF_OOB is not set: brgemm_pf_oob stays 0 */ + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob);/* NOTE(review): atoi reports no errors; the environment is re-read on every call */ + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function_addr br_gemm_kernel_a_addr = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function_addr br_gemm_kernel_b_addr = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +#else + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function_addr; + typedef libxsmm_smmfunction_reducebatch_offs gemm_br_function_offs; + typedef libxsmm_smmfunction_reducebatch_strd gemm_br_function_strd; +/* disabled by the "#if 1" above: this alternative takes pre-built kernels from the handle instead of dispatching them here */ + { + gemm_br_function_addr br_gemm_kernel_a_addr = handle->fwd_compute_kernel_addr_a_f32; + gemm_br_function_addr br_gemm_kernel_b_addr = handle->fwd_compute_kernel_addr_b_f32; + gemm_br_function_offs br_gemm_kernel_offs = handle->fwd_compute_kernel_offs_f32; + gemm_br_function_strd br_gemm_kernel_strd = handle->fwd_compute_kernel_strd_f32; +#endif +# include "template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c" + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + +/* Forward-convolution entry point for the nhwc/custom layout pair; F32 input and output only. Returns LIBXSMM_DNN_ERR_DATA_NOT_BOUND unless reg_input, reg_output and reg_filter are all non-zero. With LIBXSMM_INTRINSICS_AVX512 and target_archid in [LIBXSMM_X86_AVX512, LIBXSMM_X86_ALLFEAT] it forwards to libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32; otherwise it dispatches the two batch-reduce kernels itself and expands the shared nhwc_custom-rsck template with LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM defined. Other datatypes return LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and 
filter */ + if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_fwd_nhwc_custom_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + {/* fallback: same setup as the AVX512 F32 kernel, built without the LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) function attribute */ + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->ofmblock; + const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 
0.f : 1.f; + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) {/* BRGEMM_PF_OOB is not set: brgemm_pf_oob stays 0 */ + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + +/* Forward-convolution entry point for the nhwc/rsck layout pair; F32 input and output only. Returns LIBXSMM_DNN_ERR_DATA_NOT_BOUND unless reg_input, reg_output and reg_filter are all non-zero. With LIBXSMM_INTRINSICS_AVX512 and target_archid in [LIBXSMM_X86_AVX512, LIBXSMM_X86_ALLFEAT] it forwards to libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32; otherwise it dispatches the two batch-reduce kernels itself (ldA = blocksofm*ofmblock) and expands the shared nhwc_custom-rsck template with LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK defined. Other datatypes return LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->reg_input == 0 || handle->reg_output == 0 || handle->reg_filter == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && 
(handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_fwd_nhwc_rsck_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + const libxsmm_blasint ldx = (handle->pack_input == 1) ? (libxsmm_blasint)handle->blocksifm*handle->ifmblock : (libxsmm_blasint)handle->blocksifm*handle->desc.v*handle->ifmblock; + const libxsmm_blasint ldA = handle->blocksofm*handle->ofmblock; + const libxsmm_blasint ldC = handle->blocksofm*handle->ofmblock; + const float beta = (handle->avoid_acc_load) ? 0.f : 1.f; + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; + int l_flags = ( LIBXSMM_GEMM_FLAGS('N', 'N') ) | handle->fwd_flags; + int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + int brgemm_pf_oob = 0; + const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); + if ( 0 == env_brgemm_pf_oob ) {/* BRGEMM_PF_OOB is not set: brgemm_pf_oob stays 0 */ + } else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); + } + if (brgemm_pf_oob > 0) { + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); + } + { /* let's do a ofmblock x ofw_rb x ifmblock GEMM :-) or in other words M=nbOfm, N=ofw, K=nbIfm (col-major) */ + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*handle->fwd_ofw_rb, handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->fwd_ofh_rb*(handle->fwd_ofw_rb-1), handle->ifmblock, &ldA, &ldx, &ldC, NULL, &beta, &l_flags, &prefetch_mode); +# define 
LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c" +# undef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.h b/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.h new file mode 100644 index 00000000..de2c4fdb --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_forward.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_CONVOLUTION_FORWARD_H +#define LIBXSMM_DNN_CONVOLUTION_FORWARD_H + +#include +/* NOTE(review): the #include directive above has no operand in this copy -- the header name was presumably lost when the patch text was extracted; restore it from upstream before building. */ /* Forward convolution, custom/custom layout pair. Returns a libxsmm_dnn_err_t status; start_thread and tid are passed through to the kernels (presumably the first thread id of the team and the calling thread id -- confirm). */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); +/* Forward convolution, nhwc/custom layout pair; same parameters and return convention. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); +/* Forward convolution, nhwc/rsck layout pair; same parameters and return convention. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_fwd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_CONVOLUTION_FORWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.c b/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.c new file mode 100644 index 00000000..c20d74c7 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.c @@ -0,0 +1,914 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Rajkishore Barik, Alexander Heinecke, Ankush Mandal, Jason Sewall (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_convolution_weight_update.h" +#include "libxsmm_main.h" + + +/* function prototypes for below implementations */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid); + +/* Transposes one tile: reads 16 rows of 512 bits each from in (row k starts at in + k*ld_in, stride counted in elements) and writes 32 rows of 256 bits each to out (row j starts at out + j*ld_out). The body is compiled only when LIBXSMM_INTRINSICS_AVX512_CORE is defined; otherwise the function does nothing. NOTE(review): assumes libxsmm_bfloat16 is 16 bits wide, so a 512-bit row holds 32 elements; the alignment expected by LIBXSMM_INTRINSICS_MM256_STORE_EPI32 is not visible here -- confirm. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void transpose_32x16(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; + const int in_width=ld_in, out_width=ld_out; + const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); +/* load the 16 input rows with unaligned 512-bit loads */ + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 
4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + re = _mm512_loadu_si512(in + 14*in_width); + rf = _mm512_loadu_si512(in + 15*in_width); +/* step 1: interleave the 16-bit elements of adjacent row pairs (within each 128-bit lane) */ + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ta = _mm512_unpacklo_epi16(ra,rb); + tb = _mm512_unpackhi_epi16(ra,rb); + tc = _mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); +/* step 2: interleave the resulting 32-bit groups */ + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 = _mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + rf = _mm512_unpackhi_epi32(td,tf); +/* step 3: interleave the resulting 64-bit groups; t0..t7 now derive from input rows 0-7 and t8..tf from rows 8-15 */ + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = 
_mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); +/* step 4: regroup 128-bit lanes; immediate 0x88 takes lanes 0 and 2 of each source, 0xdd takes lanes 1 and 3 */ + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = _mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = _mm512_shuffle_i32x4(te, tf, 0xdd); +/* step 5: merge the vectors built from rows 0-7 (r0..r7) with those built from rows 8-15 (r8..rf) using two-source 64-bit permutes driven by idx_lo and idx_hi */ + t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); + t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); + t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); + t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); + t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); + t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); + t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); + t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); + t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); + t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); + ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); + tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); + tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); + td = _mm512_permutex2var_epi64(rd, idx_hi, r5); + te = _mm512_permutex2var_epi64(re, idx_hi, r6); + tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); +/* store: each 512-bit result is split into two 256-bit halves that become consecutive output rows (32 stores in total) */ + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 0*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 1*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 
1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 2*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 3*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 4*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 5*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 6*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 7*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 8*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 9*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 10*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 11*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 12*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 13*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 14*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 15*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 16*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 17*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 18*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 19*out_width, 
LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 20*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 21*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 22*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 23*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 24*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 25*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 26*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 27*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 28*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 29*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 30*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 31*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); +#else + LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void transpose_32xcols(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int col, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; + const int in_width=ld_in, out_width=ld_out; + const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + const 
__m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); + __mmask16 store_mask = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(((unsigned int)1 << col) - 1); + + rf = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + if (col == 15) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + re = _mm512_loadu_si512(in + 14*in_width); + } else if (col == 14) { + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + } else if (col == 13) { + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = 
_mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + } else if (col == 12) { + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + } else if (col == 11) { + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + } else if (col == 10) { + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + } else if (col == 9) { + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + } else if (col == 8) { + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + } else if (col == 7) { + r7 = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + } else if (col == 6) { + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + } else if (col == 5) { + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = 
_mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + } else if (col == 4) { + r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + } else if (col == 3) { + r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + } else if (col == 2) { + r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + } else if (col == 1) { + r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r0 = _mm512_loadu_si512(in + 0*in_width); + } else { + r0 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + 
rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + } + + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ta = _mm512_unpacklo_epi16(ra,rb); + tb = _mm512_unpackhi_epi16(ra,rb); + tc = _mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); + + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 = _mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + rf = _mm512_unpackhi_epi32(td,tf); + + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = _mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); + + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = 
_mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = _mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = _mm512_shuffle_i32x4(te, tf, 0xdd); + + t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); + t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); + t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); + t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); + t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); + t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); + t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); + t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); + t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); + t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); + ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); + tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); + tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); + td = _mm512_permutex2var_epi64(rd, idx_hi, r5); + te = _mm512_permutex2var_epi64(re, idx_hi, r6); + tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); + + _mm256_mask_storeu_epi16(out + 0*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); + _mm256_mask_storeu_epi16(out + 1*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); + _mm256_mask_storeu_epi16(out + 2*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); + _mm256_mask_storeu_epi16(out + 3*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); + _mm256_mask_storeu_epi16(out + 4*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); + _mm256_mask_storeu_epi16(out + 5*out_width, 
store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); + _mm256_mask_storeu_epi16(out + 6*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); + _mm256_mask_storeu_epi16(out + 7*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); + _mm256_mask_storeu_epi16(out + 8*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); + _mm256_mask_storeu_epi16(out + 9*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); + _mm256_mask_storeu_epi16(out + 10*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); + _mm256_mask_storeu_epi16(out + 11*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); + _mm256_mask_storeu_epi16(out + 12*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); + _mm256_mask_storeu_epi16(out + 13*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); + _mm256_mask_storeu_epi16(out + 14*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); + _mm256_mask_storeu_epi16(out + 15*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); + _mm256_mask_storeu_epi16(out + 16*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); + _mm256_mask_storeu_epi16(out + 17*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); + _mm256_mask_storeu_epi16(out + 18*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); + _mm256_mask_storeu_epi16(out + 19*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); + _mm256_mask_storeu_epi16(out + 20*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); + _mm256_mask_storeu_epi16(out + 21*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); + _mm256_mask_storeu_epi16(out + 22*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); + _mm256_mask_storeu_epi16(out + 
23*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); + _mm256_mask_storeu_epi16(out + 24*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); + _mm256_mask_storeu_epi16(out + 25*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); + _mm256_mask_storeu_epi16(out + 26*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); + _mm256_mask_storeu_epi16(out + 27*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); + _mm256_mask_storeu_epi16(out + 28*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); + _mm256_mask_storeu_epi16(out + 29*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); + _mm256_mask_storeu_epi16(out + 30*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); + _mm256_mask_storeu_epi16(out + 31*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); +#else + LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(col); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void transpose_input_pixels_bf16(const libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int M, int N, int ld_in, int ld_out){ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + int i, j; + int full16_chunks = N/16; + int remainder_cols = N%16; + int _N = N - remainder_cols; + + if (full16_chunks) { + for (i=0; i FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; +# include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction gemm_function; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction_reducebatch_strd gemm_br_function; + gemm_function tile_config_kernel = handle->upd_config_kernel; + gemm_function gemm_kernel = NULL; + gemm_br_function br_gemm_kernel = NULL; +# include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction gemm_function; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction_reducebatch_addr gemm_br_function; +# include 
"template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid ); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_bsmmfunction gemm_function; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + typedef libxsmm_bsmmfunction_reducebatch_strd gemm_br_function; + gemm_function tile_config_kernel = handle->upd_config_kernel; + gemm_function gemm_kernel = NULL; + gemm_br_function br_gemm_kernel = NULL; +# include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c" + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else 
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_amx(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + return libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu_amx( handle, start_thread, tid ); +} +#endif + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; +#define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" +#undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; +#define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" +#undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); 
+ status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ((handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CPX && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_amx( handle, start_thread, tid); + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) 
/*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_convolve_st_upd_custom_custom_bf16_bf16_emu_amx( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; +# include "template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c" + } + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_upd_nhwc_custom_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; +#define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM +# include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" +#undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM + } + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ + if (handle->reg_input == 0 || handle->grad_output == 0 || handle->grad_filter == 0 || handle->scratch == 0) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_convolve_st_upd_nhwc_rsck_f32_f32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + 
typedef libxsmm_smmfunction gemm_function; + typedef libxsmm_smmfunction_reducebatch_addr gemm_br_function; +#define LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK +# include "template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c" +#undef LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK + } + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.h b/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.h new file mode 100644 index 00000000..2966a80e --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_convolution_weight_update.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Rajkishore Barik, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H +#define LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_custom_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_rsck(libxsmm_dnn_layer* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_convolve_st_upd_nhwc_custom(libxsmm_dnn_layer* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_CONVOLUTION_WEIGHT_UPDATE_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_elementwise.c b/third_party/libxsmm/src/libxsmm_dnn_elementwise.c new file mode 100644 index 00000000..06d5782a --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_elementwise.c @@ -0,0 +1,618 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_elementwise.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + + +LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, int start_thread, int tid, int nthreads) +{ + const int ltid = tid - start_thread; + /* compute chunk size */ + const libxsmm_blasint chunksize = (size % nthreads == 0) ? 
(size / nthreads) : (size / nthreads) + 1; + /* compute thr_begin and thr_end */ + const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; + const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); + libxsmm_blasint i; + + for (i = thr_begin; i < thr_end; i++) { + src[i] = (LIBXSMM_DNN_ELTWISE_FTYPE)0; + } +} + + +LIBXSMM_API_INTERN void libxsmm_internal_matrix_add(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads) +{ + const int ltid = tid - start_thread; + /* compute chunk size */ + const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; + /* compute thr_begin and thr_end */ + const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? (ltid * chunksize) : size; + const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); + libxsmm_blasint i; + + for (i = thr_begin; i < thr_end; i++) { + c[i] = a[i] + b[i]; + } +} + + +LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads) +{ + const int ltid = tid - start_thread; + /* compute chunk size */ + const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; + /* compute thr_begin and thr_end */ + const libxsmm_blasint thr_begin = (ltid * chunksize < size) ? 
(ltid * chunksize) : size; + const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); + libxsmm_blasint i; + + for (i = thr_begin; i < thr_end; i++) { + c[i] = a[i] * b[i]; + } +}
+
+
+/* Logistic sigmoid per element: dst[k] = 1 / (1 + exp(-src[k])).
+ * Only the slice of the size elements that belongs to the calling thread
+ * (local id tid - start_thread, one of nthreads slices) is processed. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  /* id of this thread relative to start_thread */
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    const LIBXSMM_DNN_ELTWISE_FTYPE e = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[k]);
+    dst[k] = 1 / (1 + e);
+  }
+}
+
+
+/* Hyperbolic tangent per element: dst[k] = tanh(src[k]), evaluated in double
+ * precision and cast back. Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    dst[k] = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double)src[k]);
+  }
+}
+
+
+/* Rectified linear unit per element: dst[k] = src[k] if src[k] > 0, else 0.
+ * Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    if (src[k] > 0.0f) {
+      dst[k] = src[k];
+    }
+    else {
+      dst[k] = 0.0f;
+    }
+  }
+}
+
+
+/* Writes s * (1 - s) with s = 1 / (1 + exp(-src[k])) to dst[k], i.e. the
+ * derivative of the logistic sigmoid evaluated at src[k].
+ * Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    const LIBXSMM_DNN_ELTWISE_FTYPE e = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[k]);
+    const LIBXSMM_DNN_ELTWISE_FTYPE s = 1 / (1 + e);
+    dst[k] = (1 - s)*s;
+  }
+}
+
+
+/* Writes 1 - tanh(src[k])^2 to dst[k], i.e. the derivative of tanh evaluated
+ * at src[k]. Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    const LIBXSMM_DNN_ELTWISE_FTYPE t = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double)src[k]);
+    dst[k] = 1 - (t * t);
+  }
+}
+
+
+/* Writes 1 to dst[k] where src[k] > 0 and 0 elsewhere (a zero input yields 0).
+ * Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    dst[k] = (LIBXSMM_DNN_ELTWISE_FTYPE)(src[k] > 0.0f ? 1.0f : 0.0f);
+  }
+}
+
+
+/* Out-of-place transpose: src is addressed as a rows x cols array (row length
+ * cols), dst as a cols x rows array (row length rows), and dst(c, r) = src(r, c).
+ * The rows*cols element copies are the work items shared among the threads. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_transpose(libxsmm_blasint rows, libxsmm_blasint cols, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* one work item per matrix element */
+  const libxsmm_blasint work = rows * cols;
+  /* slice length: work / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (work / nthreads) + ((work % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to work */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, work);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, work);
+  LIBXSMM_VLA_DECL(2, LIBXSMM_DNN_ELTWISE_FTYPE, src_mat, src, cols);
+  LIBXSMM_VLA_DECL(2, LIBXSMM_DNN_ELTWISE_FTYPE, dst_mat, dst, rows);
+  libxsmm_blasint item;
+
+  for (item = first; item < last; ++item) {
+    /* split the flat work index into the (r, c) position inside src */
+    const libxsmm_blasint r = item / cols;
+    const libxsmm_blasint c = item % cols;
+    LIBXSMM_VLA_ACCESS(2, dst_mat, c, r, rows) = LIBXSMM_VLA_ACCESS(2, src_mat, r, c, cols);
+  }
+}
+
+
+/* Plain element copy: dst[k] = src[k]. Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    dst[k] = src[k];
+  }
+}
+
+
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads) +{ + const int ltid = tid - start_thread; + /* compute chunk size */ + const libxsmm_blasint chunksize = (size % nthreads == 0) ? (size / nthreads) : (size / nthreads) + 1; + /* compute thr_begin and thr_end */ + const libxsmm_blasint thr_begin = (ltid * chunksize < size) ?
(ltid * chunksize) : size; + const libxsmm_blasint thr_end = LIBXSMM_MIN(ltid * chunksize + chunksize, size); + libxsmm_blasint i; + + for (i = thr_begin; i < thr_end; i++) { + dst[i] = 1 - src[i]; + } +}
+
+
+/* Writes 1 - src[k]^2 to dst[k]. Only the slice of the size elements that
+ * belongs to the calling thread (local id tid - start_thread, one of nthreads
+ * slices) is processed. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  /* id of this thread relative to start_thread */
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    dst[k] = 1 - (src[k] * src[k]);
+  }
+}
+
+
+/* Despite its name this kernel negates: dst[k] = -src[k].
+ * Processes only the calling thread's slice. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: size / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (size / nthreads) + ((size % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to size */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, size);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, size);
+  libxsmm_blasint k;
+
+  for (k = first; k < last; ++k) {
+    dst[k] = -src[k];
+  }
+}
+
+
+/* Replicates the vector src (entries 0..m-1) into a blocked destination that is
+ * addressed as a 4D array with trailing extents [m/bm][bn][bm]: for every
+ * j in [0, n), entry (j/bn, i/bm, j%bn, i%bm) receives src[i].
+ * The m vector entries are the work items shared among the threads. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_1D_2D(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint bm, libxsmm_blasint bn, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads)
+{
+  const int my_tid = tid - start_thread;
+  /* slice length: m / nthreads, plus one when the division leaves a remainder */
+  const libxsmm_blasint share = (m / nthreads) + ((m % nthreads == 0) ? 0 : 1);
+  /* half-open slice [first, last), both ends clamped to m */
+  const libxsmm_blasint offset = my_tid * share;
+  const libxsmm_blasint first = LIBXSMM_MIN(offset, m);
+  const libxsmm_blasint last = LIBXSMM_MIN(offset + share, m);
+  libxsmm_blasint elem, rep;
+  LIBXSMM_VLA_DECL(4, LIBXSMM_DNN_ELTWISE_FTYPE, blocked_dst, (LIBXSMM_DNN_ELTWISE_FTYPE*)dst, m/bm, bn, bm);
+
+  for (elem = first; elem < last; ++elem) {
+    /* block index and in-block offset along the m dimension */
+    const libxsmm_blasint elem_blk = elem / bm;
+    const libxsmm_blasint elem_off = elem % bm;
+    for (rep = 0; rep < n; ++rep) {
+      /* block index and in-block offset along the n dimension */
+      const libxsmm_blasint rep_blk = rep / bn;
+      const libxsmm_blasint rep_off = rep % bn;
+      LIBXSMM_VLA_ACCESS(4, blocked_dst, rep_blk, elem_blk, rep_off, elem_off, m/bm, bn, bm) = src[elem];
+    }
+  }
+}
+
+
+/* #define LSTM_TIMING */
+#if defined(LSTM_TIMING)
+extern double Gbl_t_input_total, Gbl_t_recur_total, Gbl_t_eltwise_total, Gbl_t_nonlin_total;
+extern unsigned long long Gbl_t_input, Gbl_t_recur, Gbl_t_eltwise, Gbl_t_nonlin;
+extern double Gbl_duration_input, Gbl_duration_recur, Gbl_duration_eltwise, Gbl_duration_nonlin;
+#endif
+
+/* Zeroes an m x n block with leading dimension ld: element (i, j) is stored at
+ * index j*ld + i, for i in [0, m) and j in [0, n). Not thread-partitioned. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    /* index of the first element of the current ld-strided run */
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      srcdst[base+inner] = (LIBXSMM_DNN_ELTWISE_FTYPE)0;
+    }
+  }
+}
+
+/* Copies an m x n block (leading dimension ld for both operands):
+ * dst[j*ld + i] = src[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] = src[base+inner];
+    }
+  }
+}
+
+/* Element-wise sum over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = src0[j*ld + i] + src1[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] = src0[base+inner] + src1[base+inner];
+    }
+  }
+}
+
+/* Element-wise difference over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = src0[j*ld + i] - src1[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sub_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] = src0[base+inner] - src1[base+inner];
+    }
+  }
+}
+
+/* Element-wise product over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = src0[j*ld + i] * src1[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] = src0[base+inner] * src1[base+inner];
+    }
+  }
+}
+
+/* In-place element-wise product over an m x n block (leading dimension ld):
+ * srcdst[j*ld + i] *= src0[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      srcdst[base+inner] *= src0[base+inner];
+    }
+  }
+}
+
+/* Element-wise multiply-accumulate over an m x n block (leading dimension ld):
+ * dst[j*ld + i] += src0[j*ld + i] * src1[j*ld + i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_fma_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] += src0[base+inner] * src1[base+inner];
+    }
+  }
+}
+
+/* Adds the length-m vector colv to each of the n ld-strided runs of the block:
+ * srcdst[j*ld + i] += colv[i]. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      srcdst[base+inner] += colv[inner];
+    }
+  }
+}
+
+LIBXSMM_API_INTERN void
libxsmm_internal_matrix_bcst_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv) {
+  /* Broadcasts the length-m vector colv into each of the n ld-strided runs of
+   * the block: srcdst[j*ld + i] = colv[i] for i in [0, m) and j in [0, n). */
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    /* index of the first element of the current ld-strided run */
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      srcdst[base+inner] = colv[inner];
+    }
+  }
+}
+
+/* Same broadcast as the fp32 variant, but colv holds bf16 values: each one is
+ * widened by storing its 16 bits in element 1 of a libxsmm_bfloat16_hp union
+ * whose element 0 is zero, and reading the union back as a float. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv) {
+  libxsmm_bfloat16_hp widen;
+  libxsmm_blasint inner, outer;
+
+  /* element 0 stays zero for every conversion */
+  widen.i[0] = 0;
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    for ( inner = 0; inner < m; ++inner ) {
+      widen.i[1] = colv[inner];
+      srcdst[base+inner] = widen.f;
+    }
+  }
+}
+
+/* Broadcast with a constant offset: srcdst[j*ld + i] = colv[i] + const_bias. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      srcdst[base+inner] = colv[inner] + const_bias;
+    }
+  }
+}
+
+/* bf16 broadcast with a constant offset: every colv[i] is widened to float via
+ * a libxsmm_bfloat16_hp union (16 bits into element 1, element 0 zero), then
+ * srcdst[j*ld + i] = widened value + const_bias. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias) {
+  libxsmm_bfloat16_hp widen;
+  libxsmm_blasint inner, outer;
+
+  /* element 0 stays zero for every conversion */
+  widen.i[0] = 0;
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    for ( inner = 0; inner < m; ++inner ) {
+      widen.i[1] = colv[inner];
+      srcdst[base+inner] = widen.f + const_bias;
+    }
+  }
+}
+
+/* Logistic sigmoid over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = 1 / (1 + exp(-src[j*ld + i])). */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      const LIBXSMM_DNN_ELTWISE_FTYPE e = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[base+inner]);
+      dst[base+inner] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + e);
+    }
+  }
+}
+
+/* Hyperbolic tangent over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = tanh(src[j*ld + i]), evaluated in double precision. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      dst[base+inner] = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) src[base+inner]);
+    }
+  }
+}
+
+/* Rectified linear unit over an m x n block (leading dimension ld): negative
+ * inputs become 0, every other input is passed through unchanged. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      const LIBXSMM_DNN_ELTWISE_FTYPE v = src[base+inner];
+      dst[base+inner] = (v < 0) ? (LIBXSMM_DNN_ELTWISE_FTYPE)0 : v;
+    }
+  }
+}
+
+/* Sigmoid derivative over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = (1 - s) * s with s = 1 / (1 + exp(-src[j*ld + i])). */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      LIBXSMM_DNN_ELTWISE_FTYPE e = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[base+inner]);
+      LIBXSMM_DNN_ELTWISE_FTYPE s = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + e);
+      dst[base+inner] = ((LIBXSMM_DNN_ELTWISE_FTYPE)1 - s) * s;
+    }
+  }
+}
+
+/* tanh derivative over an m x n block (leading dimension ld):
+ * dst[j*ld + i] = 1 - tanh(src[j*ld + i])^2. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint inner = 0, outer;
+
+  for ( outer = 0; outer < n; ++outer ) {
+    const libxsmm_blasint base = outer * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( inner = 0; inner < m; ++inner ) {
+      LIBXSMM_DNN_ELTWISE_FTYPE t = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) src[base+inner]);
+      dst[base+inner] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (t * t);
+    }
+  }
+}
+
+LIBXSMM_API_INTERN void
libxsmm_internal_matrix_relu_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  /* Over an m x n block with leading dimension ld (element (i, j) at index
+   * j*ld + i): dst = 0 where src < 0 and 1 elsewhere.
+   * NOTE(review): a zero input yields 1 here, whereas the thread-partitioned
+   * libxsmm_internal_matrix_relu_inverse yields 0 for it - confirm which is intended. */
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      dst[base+i] = (src[base+i] < 0) ? (LIBXSMM_DNN_ELTWISE_FTYPE)0 : (LIBXSMM_DNN_ELTWISE_FTYPE)1;
+    }
+  }
+}
+
+/* Scales dst in place by the sigmoid derivative of src:
+ * dst[j*ld + i] *= (1 - s) * s with s = 1 / (1 + exp(-src[j*ld + i])). */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      LIBXSMM_DNN_ELTWISE_FTYPE exp_value = (LIBXSMM_DNN_ELTWISE_FTYPE)exp((double) -src[base+i]);
+      LIBXSMM_DNN_ELTWISE_FTYPE mid_value = (LIBXSMM_DNN_ELTWISE_FTYPE)1 / ((LIBXSMM_DNN_ELTWISE_FTYPE)1 + exp_value);
+      dst[base+i] *= ((LIBXSMM_DNN_ELTWISE_FTYPE)1 - mid_value) * mid_value;
+    }
+  }
+}
+
+/* Scales dst in place by the tanh derivative of src:
+ * dst[j*ld + i] *= 1 - tanh(src[j*ld + i])^2. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      LIBXSMM_DNN_ELTWISE_FTYPE tanh_value = (LIBXSMM_DNN_ELTWISE_FTYPE)tanh((double) src[base+i]);
+      dst[base+i] *= (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (tanh_value * tanh_value);
+    }
+  }
+}
+
+/* Scales dst in place by a 0/1 mask derived from src: dst[j*ld + i] is
+ * multiplied by 0 where src[j*ld + i] < 0 and by 1 elsewhere. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      dst[base+i] *= (src[base+i] < 0) ? (LIBXSMM_DNN_ELTWISE_FTYPE)0 : (LIBXSMM_DNN_ELTWISE_FTYPE)1;
+    }
+  }
+}
+
+/* dst[j*ld + i] = 1 - src[j*ld + i] over an m x n block (leading dimension ld). */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      dst[base+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - src[base+i];
+    }
+  }
+}
+
+/* dst[j*ld + i] = 1 - src[j*ld + i]^2 over an m x n block (leading dimension ld). */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i = 0, j;
+
+  for ( j = 0; j < n; ++j ) {
+    const libxsmm_blasint base = j * ld;
+    LIBXSMM_PRAGMA_SIMD
+    for ( i = 0; i < m; ++i ) {
+      dst[base+i] = (LIBXSMM_DNN_ELTWISE_FTYPE)1 - (src[base+i] * src[base+i]);
+    }
+  }
+}
+
+/* Rounds every fp32 value of an m x n block (leading dimension ld) to bf16
+ * precision using round-to-nearest-even, but keeps it stored as fp32: the low
+ * 16 bits of each result are zero. Values whose exponent bits are all ones
+ * (NaN, inf) skip the rounding step. src and dst may be the same buffer.
+ * The float bits are accessed through a union instead of pointer casts, which
+ * would violate the strict-aliasing rule. Like the original code, this relies
+ * on unsigned int being 32 bits wide. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_mask_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, float* dst) {
+  libxsmm_blasint i,j;
+
+  /* round buffer to bfp16 precision (round nearest, ties to even) */
+  for ( j = 0; j < n; ++j ) {
+    for ( i = 0; i < m; ++i ) {
+      union { float f; unsigned int u; } bits;
+
+      bits.f = src[(j*ld)+i];
+
+      /* we don't round NaN and inf */
+      if ( (bits.u & 0x7f800000) != 0x7f800000 ) {
+        /* perform round nearest tie even: add 0x7fff plus the lowest bit that is kept */
+        const unsigned int fixup = (bits.u >> 16) & 1;
+        bits.u = bits.u + 0x00007fff + fixup;
+      }
+
+      /* chop bits to create BFP16 in FP32 */
+      /* NOTE(review): a NaN whose mantissa bits all lie in the low 16 bits
+       * becomes +/-inf after this mask; kept as in the original code */
+      bits.u = bits.u & 0xffff0000;
+
+      dst[(j*ld)+i] = bits.f;
+    }
+  }
+}
+
+/* Converts every fp32 value of an m x n block (leading dimension ld for both
+ * buffers) to a 16-bit bf16 value using round-to-nearest-even. Values whose
+ * exponent bits are all ones (NaN, inf) are truncated without rounding.
+ * The float bits are accessed through a union instead of a pointer cast, which
+ * would violate the strict-aliasing rule. Like the original code, this relies
+ * on unsigned int being 32 bits wide. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_cvt_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, libxsmm_bfloat16* dst) {
+  libxsmm_blasint i,j;
+
+  /* convert buffer to bfp16 (round nearest, ties to even) */
+  for ( j = 0; j < n; ++j ) {
+    for ( i = 0; i < m; ++i ) {
+      union { float f; unsigned int u; } bits;
+
+      bits.f = src[(j*ld)+i];
+      /* we don't round NaN and inf */
+      if ( (bits.u & 0x7f800000) != 0x7f800000 ) {
+        /* perform round nearest tie even: add 0x7fff plus the lowest bit that is kept */
+        const unsigned int fixup = (bits.u >> 16) & 1;
+        bits.u = bits.u + 0x00007fff + fixup;
+      }
+      /* create the bfp16 value by shifting out the lower 16bits */
+      /* NOTE(review): a NaN whose mantissa bits all lie in the low 16 bits
+       * becomes +/-inf after this shift; kept as in the original code */
+      bits.u = bits.u >> 16;
+      dst[(j*ld)+i] = (unsigned short)bits.u;
+    }
+  }
+}
+
+/* Widens every bf16 value of an m x n block (leading dimension ld for both
+ * buffers) to fp32: the 16 source bits go into element 1 of a
+ * libxsmm_bfloat16_hp union whose element 0 is zero, and the union is read
+ * back as a float. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_cvt_bf16_fp32_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, libxsmm_bfloat16 *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst) {
+  libxsmm_blasint i, j;
+  libxsmm_bfloat16_hp t;
+
+  /* element 0 stays zero for every conversion */
+  t.i[0] = 0;
+  for ( j = 0; j < n; ++j ) {
+    for ( i = 0; i < m; ++i ) {
+      t.i[1] = src[(j*ld)+i];
+      dst[(j*ld)+i] = t.f;
+    }
+  }
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_elementwise.h b/third_party/libxsmm/src/libxsmm_dnn_elementwise.h new file mode 100644 index 00000000..aea0b129 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_elementwise.h @@ -0,0 +1,65 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee, Evangelos Georganas (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_DNN_ELEMENTWISE_H
+#define LIBXSMM_DNN_ELEMENTWISE_H
+
+#include <libxsmm.h>
+
+#if !defined(LIBXSMM_DNN_ELTWISE_FTYPE)
+# define LIBXSMM_DNN_ELTWISE_FTYPE float
+#endif
+
+/* Thread-partitioned kernels: each call handles the slice of the work that belongs to thread (tid - start_thread) out of nthreads. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_add(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *a, LIBXSMM_DNN_ELTWISE_FTYPE *b, LIBXSMM_DNN_ELTWISE_FTYPE *c, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_transpose(libxsmm_blasint rows, libxsmm_blasint cols, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_inverse(libxsmm_blasint size, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_1D_2D(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint bm, libxsmm_blasint bn, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst, int start_thread, int tid, int nthreads);
+/* Kernels over an m x n block with leading dimension ld: element (i, j) is stored at index j*ld + i; no thread partitioning. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_zero_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sub_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_copy_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_eltwise_fma_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src0, LIBXSMM_DNN_ELTWISE_FTYPE *src1, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_add_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, LIBXSMM_DNN_ELTWISE_FTYPE *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_bcst_cvt_bf16_fp32_colvector_const_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *srcdst, libxsmm_bfloat16 *colv, LIBXSMM_DNN_ELTWISE_FTYPE const_bias);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+/* "inverse" kernels evaluate the activation's derivative at src; the "inplace_eltwise_mult" variants multiply dst by it; followed by fp32/bf16 conversions. */
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_complement_square_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, LIBXSMM_DNN_ELTWISE_FTYPE *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_mask_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, float* dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_rne_cvt_fp32_bfp16_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, float* src, libxsmm_bfloat16* dst);
+LIBXSMM_API_INTERN void libxsmm_internal_matrix_cvt_bf16_fp32_ld(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ld, libxsmm_bfloat16 *src, LIBXSMM_DNN_ELTWISE_FTYPE *dst);
+#endif /*LIBXSMM_DNN_ELEMENTWISE_H*/
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_fullyconnected.c b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected.c new file mode 100644 index 00000000..9fde7fce --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected.c @@ -0,0 +1,1514 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ +#include "libxsmm_dnn_fullyconnected_backward_weight_update.h" +#include "libxsmm_dnn_fullyconnected_forward.h" +#include "libxsmm_main.h" + +LIBXSMM_API libxsmm_dnn_fullyconnected* libxsmm_dnn_create_fullyconnected(libxsmm_dnn_fullyconnected_desc fullyconnected_desc, libxsmm_dnn_err_t* status) { + libxsmm_dnn_fullyconnected* handle = 0; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || + ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((fullyconnected_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fullyconnected_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_fullyconnected*)calloc(1, sizeof(libxsmm_dnn_fullyconnected)); + + if (0 != handle) { + *status = LIBXSMM_DNN_SUCCESS; + /* let's make the description persistent */ + handle->desc = fullyconnected_desc; + handle->target_archid = libxsmm_target_archid; + if ( ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) && ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && ((handle->desc.C % 16 != 0) || (handle->desc.K % 16 != 0)) ) { + handle->target_archid = LIBXSMM_X86_AVX512_CPX; + } + + /* @TODO perhaps we need a better switch here */ + if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + handle->bk = handle->desc.bk; + handle->bn = handle->desc.bn; + handle->bc = handle->desc.bc; + + if ( handle->desc.N % handle->bn != 0 ) { + handle->bn = handle->desc.N; + *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_N_BLOCKING; + } + if ( handle->desc.C % handle->bc 
!= 0 ) { + handle->bc = handle->desc.C; + *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_C_BLOCKING; + } + if ( handle->desc.K % handle->bk != 0 ) { + handle->bk = handle->desc.K; + *status = LIBXSMM_DNN_WARN_FC_SUBOPTIMAL_K_BLOCKING; + } + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { +#if 0 + handle->fwd_bf = atoi(getenv("FWD_BF")); + handle->bwd_bf = atoi(getenv("BWD_BF")); + handle->upd_bf = atoi(getenv("UPD_BF")); + handle->fwd_2d_blocking = atoi(getenv("FWD_2D_BLOCKING")); + handle->bwd_2d_blocking = atoi(getenv("BWD_2D_BLOCKING")); + handle->upd_2d_blocking = atoi(getenv("UPD_2D_BLOCKING")); + handle->fwd_row_teams = atoi(getenv("FWD_ROW_TEAMS")); + handle->fwd_column_teams = atoi(getenv("FWD_COLUMN_TEAMS")); + handle->bwd_row_teams = atoi(getenv("BWD_ROW_TEAMS")); + handle->bwd_column_teams = atoi(getenv("BWD_COLUMN_TEAMS")); + handle->upd_row_teams = atoi(getenv("UPD_ROW_TEAMS")); + handle->upd_column_teams = atoi(getenv("UPD_COLUMN_TEAMS")); + handle->ifm_subtasks = atoi(getenv("IFM_SUBTASKS")); + handle->ofm_subtasks = atoi(getenv("OFM_SUBTASKS")); +#else + /* Initialize with default values */ + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 0; + handle->bwd_2d_blocking = 0; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 14; + handle->fwd_column_teams = 2; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 
1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 7; + handle->fwd_column_teams = 4; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 8 == 0) ? 8 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 7; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 7; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 14; + handle->bwd_column_teams = 2; + handle->upd_bf = ((handle->desc.N/handle->bn) % 2 == 0) ? 2 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 
1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 9 == 0) ? 9 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = ((handle->bk % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 6; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 6; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 6; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 12; + handle->bwd_column_teams = 2; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; + } + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 
1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 6; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 6; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; + } +#endif + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { +#if 0 + handle->fwd_bf = atoi(getenv("FWD_BF")); + handle->bwd_bf = atoi(getenv("BWD_BF")); + handle->upd_bf = atoi(getenv("UPD_BF")); + handle->fwd_2d_blocking = atoi(getenv("FWD_2D_BLOCKING")); + handle->bwd_2d_blocking = atoi(getenv("BWD_2D_BLOCKING")); + handle->upd_2d_blocking = atoi(getenv("UPD_2D_BLOCKING")); + handle->fwd_row_teams = atoi(getenv("FWD_ROW_TEAMS")); + handle->fwd_column_teams = atoi(getenv("FWD_COLUMN_TEAMS")); + handle->bwd_row_teams = atoi(getenv("BWD_ROW_TEAMS")); + handle->bwd_column_teams = atoi(getenv("BWD_COLUMN_TEAMS")); + handle->upd_row_teams = atoi(getenv("UPD_ROW_TEAMS")); + handle->upd_column_teams = atoi(getenv("UPD_COLUMN_TEAMS")); + handle->ifm_subtasks = atoi(getenv("IFM_SUBTASKS")); + handle->ofm_subtasks = atoi(getenv("OFM_SUBTASKS")); +#else + if (handle->desc.compressed_A > 0) { + handle->compressed_A = 1; + handle->sparsity_factor_A = handle->desc.sparsity_factor_A; + } + + /* Initialize with default values */ + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 0; + handle->bwd_2d_blocking = 0; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + + if (handle->desc.threads == 14) { + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 1; + handle->bwd_2d_blocking = 1; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 2; + handle->fwd_column_teams = 7; + handle->bwd_row_teams = 2; + handle->bwd_column_teams = 7; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + } + + if (handle->desc.threads == 2) { 
+ handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 1; + handle->bwd_2d_blocking = 1; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 2; + handle->fwd_column_teams = 1; + handle->bwd_row_teams = 2; + handle->bwd_column_teams = 1; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + } + + if (handle->desc.threads == 4) { + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 1; + handle->bwd_2d_blocking = 1; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 2; + handle->fwd_column_teams = 2; + handle->bwd_row_teams = 2; + handle->bwd_column_teams = 2; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + } + + if (handle->desc.threads == 8) { + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 1; + handle->bwd_2d_blocking = 1; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 2; + handle->fwd_column_teams = 4; + handle->bwd_row_teams = 2; + handle->bwd_column_teams = 4; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + } + + if (handle->desc.threads == 16) { + handle->fwd_bf = 1; + handle->bwd_bf = 1; + handle->upd_bf = 1; + handle->fwd_2d_blocking = 1; + handle->bwd_2d_blocking = 1; + handle->upd_2d_blocking = 0; + handle->fwd_row_teams = 2; + handle->fwd_column_teams = 8; + handle->bwd_row_teams = 2; + handle->bwd_column_teams = 8; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1; + handle->ofm_subtasks = 1; + } + + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 
1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 14; + handle->fwd_column_teams = 2; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 7; + handle->fwd_column_teams = 4; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 8 == 0) ? 8 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 7; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 7; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 14 == 0) ? 
14 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 28) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 1; + handle->fwd_column_teams = 1; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 14; + handle->bwd_column_teams = 2; + handle->upd_bf = ((handle->desc.N/handle->bn) % 2 == 0) ? 2 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 
1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 9 == 0) ? 9 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = ((handle->bk % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + } + + if (handle->desc.C == 1024 && handle->desc.K == 1024 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 6; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 6; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 6; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 100 && handle->desc.K == 1024 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 12; + handle->bwd_column_teams = 2; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 
15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = ((handle->desc.K/handle->bk) % 4 == 0) ? 4 : 1; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 2 == 0) && (handle->upd_2d_blocking == 0)) ? 2 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 512 && handle->desc.K == 512 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 1; + handle->bwd_column_teams = 1; + handle->upd_bf = ((handle->desc.N/handle->bn) % 15 == 0) ? 15 : 1; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 1; + handle->upd_column_teams = 1; + handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 24) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 
1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 5; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 0; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 5; + handle->upd_column_teams = 4; + handle->ifm_subtasks = ((handle->bc % 4 == 0) && (handle->upd_2d_blocking == 0)) ? 4 : 1; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + } + if (handle->desc.C == 1024 && handle->desc.K == 1 && handle->desc.threads == 20) { + handle->fwd_bf = 1/*((handle->desc.C/handle->bc) % 1 == 0) ? 1 : 1*/; + handle->fwd_2d_blocking = 0; + handle->fwd_row_teams = 6; + handle->fwd_column_teams = 4; + handle->bwd_bf = 1/*((handle->desc.K/handle->bk) % 1 == 0) ? 1 : 1*/; + handle->bwd_2d_blocking = 1; + handle->bwd_row_teams = 5; + handle->bwd_column_teams = 4; + handle->upd_bf = 1/*((handle->desc.N/handle->bn) % 1 == 0) ? 1 : 1*/; + handle->upd_2d_blocking = 0; + handle->upd_row_teams = 6; + handle->upd_column_teams = 4; + handle->ifm_subtasks = 1/*((handle->bc % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 1 : 1*/; + handle->ofm_subtasks = 1/*((handle->bk % 1 == 0) && (handle->upd_2d_blocking == 0)) ? 
1 : 1*/; + } +#endif + + /* In this case force 2D decomposition */ + if (handle->compressed_A == 1) { + handle->fwd_2d_blocking = 1; + handle->fwd_row_teams = 2; + while (handle->desc.threads % handle->fwd_row_teams != 0) { + handle->fwd_row_teams--; + } + handle->fwd_column_teams = handle->desc.threads/handle->fwd_row_teams; + } + + } + } else { + /* check that we cannot fuse */ + if ( handle->desc.fuse_ops != LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + free( handle ); + *status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + return 0; + } + + /* we need to compute the memory layout given the */ + if ( (handle->desc.C % 16 == 0) && (handle->desc.K % 16 == 0) ) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, + &(handle->ifmblock), &(handle->ofmblock), &(handle->fm_lp_block), + LIBXSMM_DNN_DATATYPE_F32, LIBXSMM_DNN_DATATYPE_F32 ); + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, + &(handle->ifmblock), &(handle->ofmblock), &(handle->fm_lp_block), + handle->desc.datatype_in, handle->desc.datatype_out ); + } else { + /* should not happen, not implemented */ + } + } else if ( (handle->desc.C % 64 == 0) && (handle->desc.K == 1000) ) { + /* @TODO this a hack for the last FC layer */ + handle->ifmblock = 64; + handle->fm_lp_block = 1; + handle->ofmblock = 10; + } else if ( (handle->desc.C % 16 == 0) && (handle->desc.K == 1000) ) { + /* @TODO this a hack for the last FC layer */ + handle->ifmblock = 16; + handle->fm_lp_block = 1; + handle->ofmblock = 10; + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + free( handle ); + return 0; + } + /* compute the outer blocks */ + handle->blocksifm = handle->desc.C / handle->ifmblock; + handle->blocksofm = 
handle->desc.K / handle->ofmblock; + } + /* create barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + + /* If in SPR, generate tilerelease kernel */ + if ((handle->target_archid >= LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + int l_tr_flags = LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + handle->tilerelease_kernel = libxsmm_bsmmdispatch(handle->bk, handle->bk, handle->bk, NULL, NULL, NULL, NULL, NULL, &l_tr_flags, NULL); + } + /* calculate scratch size */ + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + handle->scratch_size = sizeof(float) * ( ( (size_t)handle->desc.C * (size_t)handle->desc.N ) + ( (size_t)handle->desc.C * (size_t)handle->desc.K ) ); + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + /* Let's allocate maximum required scratch */ + size_t size_fwd = sizeof(float) * LIBXSMM_MAX(handle->desc.K * handle->desc.N, handle->desc.threads * LIBXSMM_MAX(handle->bk * handle->bn, handle->desc.K)); + /* In case of K = 1 we pad A and B to "bk=2" */ + size_t size_bwd = (handle->desc.K != 1) ? 
( sizeof(float) * LIBXSMM_MAX(handle->desc.C * handle->desc.N, handle->desc.threads * handle->bc * handle->bn) + sizeof(libxsmm_bfloat16) * handle->desc.C * handle->desc.K ) : ( sizeof(float) * handle->desc.C * handle->desc.N + sizeof(libxsmm_bfloat16) * handle->desc.C * 2 + sizeof(libxsmm_bfloat16) * 2 * handle->desc.N ); + size_t size_upd = sizeof(float) * LIBXSMM_MAX(handle->desc.C * handle->desc.K, handle->desc.threads * handle->bc * handle->bk) + sizeof(libxsmm_bfloat16) * handle->desc.threads * handle->bk * handle->bc + sizeof(libxsmm_bfloat16) * (handle->desc.N * (handle->desc.C + handle->desc.K)); + if (handle->compressed_A == 1) { + size_fwd += handle->desc.threads * handle->desc.C * handle->bk *sizeof(libxsmm_bfloat16); + } + handle->scratch_size = LIBXSMM_MAX(LIBXSMM_MAX(size_fwd, size_bwd), size_upd); + handle->doutput_scratch_mark = handle->scratch_size; + handle->scratch_size += 2 * sizeof(libxsmm_bfloat16) * handle->desc.N * handle->desc.K; + } else { + handle->scratch_size = sizeof(float) * ( (((size_t)handle->desc.C + (size_t)handle->desc.K) * (size_t)handle->desc.N) + ((size_t)handle->desc.C * (size_t)handle->desc.K) ); + } + /* create code pointers in some special cases */ + if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) && ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) ) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + float alpha = 1.0f; + /* beta is set to 1 for ncnc kcck format because ifm is split into 2 blocks */ + float beta = 1.0f; + float zerobeta = 0.0f; + int updflags = LIBXSMM_GEMM_FLAGS( 'N', 'T' ); + /* For UPD kernels we consider subtasking... 
*/ + libxsmm_blasint M = handle->bk/handle->ofm_subtasks; + libxsmm_blasint N = handle->bc/handle->ifm_subtasks; + + libxsmm_blasint lda = (libxsmm_blasint)handle->bk; + libxsmm_blasint ldb = (libxsmm_blasint)handle->bc; + libxsmm_blasint ldc = (libxsmm_blasint)handle->bk; + + handle->gemm_fwd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(float), handle->bc*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); + handle->gemm_fwd2.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(float), handle->bc*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); + handle->gemm_bwd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(float), handle->bk*handle->bn*sizeof(float), &ldb, &lda, &ldb, &alpha, &beta, NULL, NULL); + handle->gemm_bwd2.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(float), handle->bk*handle->bn*sizeof(float), &ldb, &lda, &ldb, &alpha, &zerobeta, NULL, NULL); + + /* Transpose kernel used for weight transpose in bwd pass */ + handle->tr_kernel = libxsmm_dispatch_meltw_unary((libxsmm_blasint)(handle->bk), (libxsmm_blasint)(handle->bc), (const libxsmm_blasint*)&(handle->bk), (const libxsmm_blasint*)&(handle->bc), LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + + /* update has different LDs */ + lda = (libxsmm_blasint)handle->bk; + ldb = (libxsmm_blasint)handle->bc; + ldc = (libxsmm_blasint)handle->bk; + handle->gemm_upd.xgemm.smrs = libxsmm_smmdispatch_reducebatch_strd(M, N, handle->bn, handle->desc.K*handle->bn*sizeof(float), handle->desc.C*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &beta, &updflags, NULL); + handle->gemm_upd2.xgemm.smrs = 
libxsmm_smmdispatch_reducebatch_strd(M, N, handle->bn, handle->desc.K*handle->bn*sizeof(float), handle->desc.C*handle->bn*sizeof(float), &lda, &ldb, &ldc, &alpha, &zerobeta, &updflags, NULL); + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + float alpha = 1.0f; + float beta = 1.0f; + float zerobeta = 0.0f; + /* For UPD kernels we consider subtasking... */ + libxsmm_blasint M = handle->bk/handle->ofm_subtasks; + libxsmm_blasint N = handle->bc/handle->ifm_subtasks; + + libxsmm_blasint lda = (libxsmm_blasint)handle->bk; + libxsmm_blasint ldb = (libxsmm_blasint)handle->bc; + libxsmm_blasint ldc = (libxsmm_blasint)handle->bk; + + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + libxsmm_meltw_flags fusion_flags; + int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + int l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + libxsmm_blasint unroll_hint = (handle->desc.C/handle->bc)/handle->fwd_bf; + + handle->gemm_fwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &beta, &l_flags, NULL); + handle->gemm_fwd2.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL); + handle->fwd_config_kernel = libxsmm_bsmmdispatch(handle->bk, handle->bn, handle->bc, &lda, &ldb, &ldc, NULL, &beta, &l_tc_flags, NULL); + handle->gemm_fwd3.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd_unroll(handle->bk, handle->bn, handle->bc, 
handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL); + fusion_flags = LIBXSMM_MELTW_FLAG_COLBIAS_OVERWRITE_C; + handle->gemm_fwd4.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT, LIBXSMM_DATATYPE_F32, fusion_flags, 0, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_ACT_RELU_OVERWRITE_C; + handle->gemm_fwd5.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT, LIBXSMM_DATATYPE_F32, fusion_flags, 0, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_ACT_SIGM_OVERWRITE_C; + handle->gemm_fwd6.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT, LIBXSMM_DATATYPE_F32, fusion_flags, 0, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_COLBIAS_ACT_RELU_OVERWRITE_C; + handle->gemm_fwd7.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT, LIBXSMM_DATATYPE_F32, fusion_flags, 0, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_COLBIAS_ACT_SIGM_OVERWRITE_C; 
+ handle->gemm_fwd8.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT, LIBXSMM_DATATYPE_F32, fusion_flags, 0, 0, 0, 0); + + if (handle->compressed_A == 1) { + fusion_flags = LIBXSMM_MELTW_FLAG_FUSE_NONE; + handle->gemm_fwd9.xgemm.bsmrs_meltwfused = libxsmm_bsmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &beta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + handle->gemm_fwd10.xgemm.bsmrs_meltwfused = libxsmm_bsmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + handle->fwd_config_kernel = libxsmm_bsmmdispatch(handle->bk, handle->bn, handle->bc, &lda, &ldb, &ldc, NULL, &beta, &l_tc_flags, NULL); + handle->gemm_fwd11.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_COLBIAS_OVERWRITE_C; + 
handle->gemm_fwd12.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_ACT_RELU_OVERWRITE_C; + handle->gemm_fwd13.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_ACT_SIGM_OVERWRITE_C; + handle->gemm_fwd14.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + fusion_flags = LIBXSMM_MELTW_FLAG_COLBIAS_ACT_RELU_OVERWRITE_C; + handle->gemm_fwd15.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + fusion_flags = 
LIBXSMM_MELTW_FLAG_COLBIAS_ACT_SIGM_OVERWRITE_C; + handle->gemm_fwd16.xgemm.bmrs_meltwfused = libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll(handle->bk, handle->bn, handle->bc, (handle->bk*handle->bc*sizeof(libxsmm_bfloat16))/handle->sparsity_factor_A, handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL, LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A, LIBXSMM_DATATYPE_F32, fusion_flags, handle->sparsity_factor_A, 0, 0, 0); + } + + /* Also JIT eltwise functions... */ + handle->fwd_cvtfp32bf16_kernel = libxsmm_dispatch_meltw_unary(handle->bk, handle->bn, &ldc, &ldc, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); + handle->fwd_cvtfp32bf16_relu_kernel = libxsmm_dispatch_meltw_unary(handle->bk, handle->bn, &ldc, &ldc, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_BITMASK, LIBXSMM_MELTW_TYPE_UNARY_RELU); + handle->fwd_sigmoid_cvtfp32bf16_kernel = libxsmm_dispatch_meltw_unary(handle->bk, handle->bn, &ldc, &ldc, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_SIGMOID); + } else { + handle->gemm_fwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); + handle->gemm_fwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); + handle->gemm_fwd3.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bk, handle->bn, handle->bc, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, 
&beta, NULL, NULL); + } + + /* Special bwd kernels for K == 1 */ + if (handle->desc.K == 1) { + libxsmm_blasint _bk = 2; + handle->gemm_bwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bc, handle->bn, _bk, _bk*handle->bc*sizeof(libxsmm_bfloat16), _bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &_bk, &ldb, &alpha, &beta, NULL, NULL); + handle->gemm_bwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bc, handle->bn, _bk, _bk*handle->bc*sizeof(libxsmm_bfloat16), _bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &_bk, &ldb, &alpha, &zerobeta, NULL, NULL); + } else { + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + int l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + libxsmm_blasint unroll_hint = (handle->desc.K/handle->bk)/handle->bwd_bf; + handle->gemm_bwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &ldb, &lda, &ldb, &alpha, &beta, &l_flags, NULL); + handle->gemm_bwd2.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &ldb, &lda, &ldb, &alpha, &zerobeta, &l_flags, NULL); + handle->bwd_config_kernel = libxsmm_bsmmdispatch(handle->bc, handle->bn, handle->bk, &ldb, &lda, &ldb, NULL, &beta, &l_tc_flags, NULL); + handle->gemm_bwd3.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd_unroll(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &ldb, &lda, &ldb, &alpha, &zerobeta, &l_flags, NULL); + /* 
Also JIT eltwise functions... */ + handle->bwd_cvtfp32bf16_kernel = libxsmm_dispatch_meltw_unary(handle->bc, handle->bn, &ldb, &ldb, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); + handle->bwd_relu_kernel = libxsmm_dispatch_meltw_unary(handle->bc, handle->bn, &ldb, &ldb, LIBXSMM_DATATYPE_BF16, LIBXSMM_DATATYPE_BF16, LIBXSMM_DATATYPE_BF16, LIBXSMM_MELTW_FLAG_UNARY_BITMASK, LIBXSMM_MELTW_TYPE_UNARY_RELU_INV); + } else { + handle->gemm_bwd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &lda, &ldb, &alpha, &beta, NULL, NULL); + handle->gemm_bwd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(handle->bc, handle->bn, handle->bk, handle->bk*handle->bc*sizeof(libxsmm_bfloat16), handle->bk*handle->bn*sizeof(libxsmm_bfloat16), &ldb, &lda, &ldb, &alpha, &zerobeta, NULL, NULL); + } + } + lda = (libxsmm_blasint)handle->bk; + ldb = (libxsmm_blasint)handle->bn; + ldc = (libxsmm_blasint)handle->bk; + if ((handle->target_archid == LIBXSMM_X86_AVX512_SPR) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT)) { + int l_flags = ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ) | LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG; + int l_tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') ); + libxsmm_blasint unroll_hint = (handle->desc.N/handle->bn)/handle->upd_bf; + handle->gemm_upd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &beta, &l_flags, NULL); + handle->gemm_upd2.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd_unroll(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), 
handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL); + handle->upd_config_kernel = libxsmm_bsmmdispatch(M, N, handle->bn, &lda, &ldb, &ldc, NULL, &beta, &l_tc_flags, NULL); + l_flags = l_flags | LIBXSMM_GEMM_FLAG_VNNI_C; + handle->gemm_upd3.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd_unroll(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), unroll_hint, &lda, &ldb, &ldc, &alpha, &zerobeta, &l_flags, NULL); + } else { + handle->gemm_upd.xgemm.bsmrs = libxsmm_bsmmdispatch_reducebatch_strd(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); + handle->gemm_upd2.xgemm.bmrs = libxsmm_bmmdispatch_reducebatch_strd(M, N, handle->bn, handle->bk*handle->bn*sizeof(libxsmm_bfloat16), handle->bc*handle->bn*sizeof(libxsmm_bfloat16), &lda, &ldb, &ldc, &alpha, &zerobeta, NULL, NULL); + + } + } else { + + } + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + } else { + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fullyconnected(const libxsmm_dnn_fullyconnected* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + /* deallocate handle structure */ + free(/*remove constness*/(libxsmm_dnn_fullyconnected*)handle); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fullyconnected_create_tensor_datalayout(const libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 
0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->format = handle->desc.buffer_format; + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make 
sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->datatype = handle->desc.datatype_out; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) { + layout->dim_size[0] = handle->desc.C; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) 
|| (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->desc.K; + layout->dim_size[1] = 1; + layout->dim_size[2] = 1; + layout->dim_size[3] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = (unsigned int)handle->bc; + layout->dim_size[1] = (unsigned int)handle->bn; + layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + 
layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bn; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_FILTER) ) { + layout->format = handle->desc.filter_format; + layout->tensor_type = LIBXSMM_DNN_FILTER; + + if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 6; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = 
handle->ifmblock; + layout->dim_size[2] = 1; + layout->dim_size[3] = 1; + layout->dim_size[4] = handle->blocksifm; + layout->dim_size[5] = handle->blocksofm; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) || + ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(7*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(7*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 7; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[6] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->ofmblock; + layout->dim_size[2] = handle->ifmblock/handle->fm_lp_block; + layout->dim_size[3] = 1; + layout->dim_size[4] = 1; + layout->dim_size[5] = handle->blocksifm; + layout->dim_size[6] = handle->blocksofm; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_RSCK) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_S; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_R; + layout->dim_size[0] = handle->ofmblock * handle->blocksofm; + layout->dim_size[1] = handle->ifmblock * handle->blocksifm; + layout->dim_size[2] = 1; + layout->dim_size[3] = 1; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + + if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bc; + layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)2; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)handle->bc/2; + layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + 
*status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) || (type == LIBXSMM_DNN_CHANNEL_BIAS) ) { + layout->format = handle->desc.buffer_format; + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { + if ( (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) || (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = handle->desc.datatype_out; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { + layout->format = handle->desc.buffer_format; + layout->tensor_type = LIBXSMM_DNN_RELU_MASK; + + if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I8; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) 
malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 1; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = handle->desc.N * handle->desc.K; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + +LIBXSMM_API size_t libxsmm_dnn_fullyconnected_get_scratch_size(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status) { + size_t l_scratch_size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +LIBXSMM_API void* libxsmm_dnn_fullyconnected_get_scratch_ptr(const libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_err_t* status) +{ + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->scratch; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return 0; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_scratch(libxsmm_dnn_fullyconnected* handle, const void* scratch) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if (0 != handle) { + /* align the 
internal scratch buffer if needed */ + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_scratch(libxsmm_dnn_fullyconnected* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_bind_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == 
LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + handle->reg_bias = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + handle->grad_bias = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fullyconnected_get_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_RELU_MASK) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + return_tensor = handle->reg_input; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + return_tensor = handle->grad_input; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + return_tensor = handle->reg_output; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + return_tensor = handle->grad_output; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + return_tensor = handle->reg_filter; + } else if ( type == 
LIBXSMM_DNN_GRADIENT_FILTER ) { + return_tensor = handle->grad_filter; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + return_tensor = handle->reg_bias; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + return_tensor = handle->grad_bias; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + return_tensor = handle->relumask; + } else { + /* cannot happen */ + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return return_tensor; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_release_tensor(libxsmm_dnn_fullyconnected* handle, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BIAS) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS) && + (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BIAS ) { + handle->reg_bias = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS ) { + handle->grad_bias = 0; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return 
status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_execute_st(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) ) { + status = libxsmm_dnn_fullyconnected_st_fwd_custom( handle, start_thread, tid ); + } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FC; + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: { + if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_custom( handle, kind, start_thread, tid ); + } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck( handle, kind, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FC; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.c b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.c new file mode 100644 index 00000000..d985dc37 --- /dev/null +++ 
b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.c @@ -0,0 +1,1281 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_fullyconnected_backward_weight_update.h" +#include "libxsmm_main.h" + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t 
libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +#if 0 +#define USE_CLDEMOTE +#endif + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_vnni_transpose_16x16(void* source_void, void* dest_void, int source_stride, int dest_stride) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + libxsmm_bfloat16 *source = (libxsmm_bfloat16*)source_void; + libxsmm_bfloat16 *dest = (libxsmm_bfloat16*)dest_void; + __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7; + __m512i tmp0, tmp1, tmp2, tmp3; + const __m512i abcdefgh_to_abefcdgh = _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100); + + zmm0 = _mm512_loadu_si512(source); + zmm1 = _mm512_loadu_si512(source + source_stride); + zmm2 = _mm512_loadu_si512(source + source_stride*2); + zmm3 = _mm512_loadu_si512(source + source_stride*3); + zmm4 = _mm512_loadu_si512(source + source_stride*4); + zmm5 = _mm512_loadu_si512(source + source_stride*5); + zmm6 = _mm512_loadu_si512(source + source_stride*6); + zmm7 = _mm512_loadu_si512(source + source_stride*7); + + zmm0 = _mm512_shuffle_epi8(zmm0, abcdefgh_to_abefcdgh); + zmm1 = _mm512_shuffle_epi8(zmm1, abcdefgh_to_abefcdgh); + zmm2 = _mm512_shuffle_epi8(zmm2, abcdefgh_to_abefcdgh); + zmm3 = _mm512_shuffle_epi8(zmm3, abcdefgh_to_abefcdgh); + zmm4 = _mm512_shuffle_epi8(zmm4, abcdefgh_to_abefcdgh); + zmm5 = _mm512_shuffle_epi8(zmm5, abcdefgh_to_abefcdgh); + zmm6 = _mm512_shuffle_epi8(zmm6, abcdefgh_to_abefcdgh); + zmm7 = _mm512_shuffle_epi8(zmm7, abcdefgh_to_abefcdgh); + + tmp0 = _mm512_unpacklo_epi64(zmm0, zmm1); + tmp1 = _mm512_unpackhi_epi64(zmm0, zmm1); + tmp2 = _mm512_unpacklo_epi64(zmm2, zmm3); + tmp3 = _mm512_unpackhi_epi64(zmm2, zmm3); + zmm0 = _mm512_unpacklo_epi64(zmm4, zmm5); + zmm1 = _mm512_unpackhi_epi64(zmm4, zmm5); + zmm2 = _mm512_unpacklo_epi64(zmm6, zmm7); + zmm3 = _mm512_unpackhi_epi64(zmm6, zmm7); + + zmm4 = 
_mm512_shuffle_i32x4(tmp0, tmp2, 0x88); + zmm6 = _mm512_shuffle_i32x4(tmp0, tmp2, 0xdd); + zmm5 = _mm512_shuffle_i32x4(tmp1, tmp3, 0x88); + zmm7 = _mm512_shuffle_i32x4(tmp1, tmp3, 0xdd); + tmp0 = _mm512_shuffle_i32x4(zmm0, zmm2, 0x88); + tmp1 = _mm512_shuffle_i32x4(zmm0, zmm2, 0xdd); + tmp2 = _mm512_shuffle_i32x4(zmm1, zmm3, 0x88); + tmp3 = _mm512_shuffle_i32x4(zmm1, zmm3, 0xdd); + + zmm0 = _mm512_shuffle_i32x4(zmm4, tmp0, 0x88); + zmm1 = _mm512_shuffle_i32x4(zmm5, tmp2, 0x88); + zmm2 = _mm512_shuffle_i32x4(zmm6, tmp1, 0x88); + zmm3 = _mm512_shuffle_i32x4(zmm7, tmp3, 0x88); + zmm4 = _mm512_shuffle_i32x4(zmm4, tmp0, 0xdd); + zmm5 = _mm512_shuffle_i32x4(zmm5, tmp2, 0xdd); + zmm6 = _mm512_shuffle_i32x4(zmm6, tmp1, 0xdd); + zmm7 = _mm512_shuffle_i32x4(zmm7, tmp3, 0xdd); + + _mm512_storeu_si512(dest, zmm0); + _mm512_storeu_si512(dest + dest_stride, zmm1); + _mm512_storeu_si512(dest + dest_stride * 2, zmm2); + _mm512_storeu_si512(dest + dest_stride * 3, zmm3); + _mm512_storeu_si512(dest + dest_stride * 4, zmm4); + _mm512_storeu_si512(dest + dest_stride * 5, zmm5); + _mm512_storeu_si512(dest + dest_stride * 6, zmm6); + _mm512_storeu_si512(dest + dest_stride * 7, zmm7); +#ifdef USE_CLDEMOTE + _mm_cldemote(dest); + _mm_cldemote(dest + dest_stride); + _mm_cldemote(dest + dest_stride * 2); + _mm_cldemote(dest + dest_stride * 3); + _mm_cldemote(dest + dest_stride * 4); + _mm_cldemote(dest + dest_stride * 5); + _mm_cldemote(dest + dest_stride * 6); + _mm_cldemote(dest + dest_stride * 7); +#endif +#else + LIBXSMM_UNUSED(source_void); LIBXSMM_UNUSED(dest_void); LIBXSMM_UNUSED(source_stride); LIBXSMM_UNUSED(dest_stride); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_vnni_transpose(libxsmm_bfloat16* src, libxsmm_bfloat16* dst, int M, int N, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + const int _M = M/16, _N = N/16; + int i = 0, j = 0; + for (i = 0; i < _N; i++) { + for (j = 0; j < _M; j++) { + 
bf16_vnni_transpose_16x16((libxsmm_bfloat16*) src+i*16*ld_in+j*32, (libxsmm_bfloat16*) dst+j*16*ld_out+i*32, ld_in*2, ld_out*2); + } + } +#else + LIBXSMM_UNUSED(src); LIBXSMM_UNUSED(dst); LIBXSMM_UNUSED(M); LIBXSMM_UNUSED(N); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_transpose_32x16(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; + const int in_width=ld_in, out_width=ld_out; + const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); + + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + re = _mm512_loadu_si512(in + 14*in_width); + rf = _mm512_loadu_si512(in + 15*in_width); + + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ta = _mm512_unpacklo_epi16(ra,rb); + tb = _mm512_unpackhi_epi16(ra,rb); + tc = 
_mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); + + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 = _mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + rf = _mm512_unpackhi_epi32(td,tf); + + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = _mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); + + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = _mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = 
_mm512_shuffle_i32x4(te, tf, 0xdd); + + t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); + t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); + t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); + t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); + t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); + t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); + t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); + t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); + t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); + t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); + ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); + tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); + tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); + td = _mm512_permutex2var_epi64(rd, idx_hi, r5); + te = _mm512_permutex2var_epi64(re, idx_hi, r6); + tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); + + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 0*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 1*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 2*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 3*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 4*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 5*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 6*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 7*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 8*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 9*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 
10*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 11*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 12*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 13*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 14*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 15*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 16*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 17*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 18*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 19*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 20*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 21*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 22*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 23*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 24*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 25*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 26*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 27*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); + 
LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 28*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 29*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 30*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); + LIBXSMM_INTRINSICS_MM256_STORE_EPI32(out + 31*out_width, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); +#ifdef USE_CLDEMOTE + _mm_cldemote(out + 0*out_width); + _mm_cldemote(out + 1*out_width); + _mm_cldemote(out + 2*out_width); + _mm_cldemote(out + 3*out_width); + _mm_cldemote(out + 4*out_width); + _mm_cldemote(out + 5*out_width); + _mm_cldemote(out + 6*out_width); + _mm_cldemote(out + 7*out_width); + _mm_cldemote(out + 8*out_width); + _mm_cldemote(out + 9*out_width); + _mm_cldemote(out + 10*out_width); + _mm_cldemote(out + 11*out_width); + _mm_cldemote(out + 12*out_width); + _mm_cldemote(out + 13*out_width); + _mm_cldemote(out + 14*out_width); + _mm_cldemote(out + 15*out_width); + _mm_cldemote(out + 16*out_width); + _mm_cldemote(out + 17*out_width); + _mm_cldemote(out + 18*out_width); + _mm_cldemote(out + 19*out_width); + _mm_cldemote(out + 20*out_width); + _mm_cldemote(out + 21*out_width); + _mm_cldemote(out + 22*out_width); + _mm_cldemote(out + 23*out_width); + _mm_cldemote(out + 24*out_width); + _mm_cldemote(out + 25*out_width); + _mm_cldemote(out + 26*out_width); + _mm_cldemote(out + 27*out_width); + _mm_cldemote(out + 28*out_width); + _mm_cldemote(out + 29*out_width); + _mm_cldemote(out + 30*out_width); + _mm_cldemote(out + 31*out_width); +#endif +#else + LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_transpose_32xcols(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int col, int ld_in, int ld_out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + __m512i r0 = 
LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r1 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r2 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r3 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r4 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r5 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r6 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r7 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r8 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), r9 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), ra = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rb = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rc = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rd = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), re = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), rf = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; + const int in_width=ld_in, out_width=ld_out; + const __m512i idx_lo = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + const __m512i idx_hi = _mm512_set_epi64(7, 6, 15, 14, 3, 2, 11, 10); + __mmask16 store_mask = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK16(((unsigned int)1 << col) - 1); + + if (col == 15) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + re = _mm512_loadu_si512(in + 14*in_width); + } else if (col == 14) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = 
_mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + } else if (col == 13) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + rc = _mm512_loadu_si512(in + 12*in_width); + } else if (col == 12) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + } else if (col == 11) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = 
_mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + ra = _mm512_loadu_si512(in + 10*in_width); + } else if (col == 10) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + } else if (col == 9) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + r8 = _mm512_loadu_si512(in + 8*in_width); + } else if (col == 8) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + } else if (col == 7) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + r6 = _mm512_loadu_si512(in + 6*in_width); + } else if (col == 6) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 
4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + } else if (col == 5) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + r4 = _mm512_loadu_si512(in + 4*in_width); + } else if (col == 4) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + } else if (col == 3) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + r2 = _mm512_loadu_si512(in + 2*in_width); + } else if (col == 2) { + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + } else if (col == 1) { + r0 = _mm512_loadu_si512(in + 0*in_width); + } + + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ta = _mm512_unpacklo_epi16(ra,rb); + tb = _mm512_unpackhi_epi16(ra,rb); + tc = _mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); + + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 = _mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + 
rf = _mm512_unpackhi_epi32(td,tf); + + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = _mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); + + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = _mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = _mm512_shuffle_i32x4(te, tf, 0xdd); + + t0 = _mm512_permutex2var_epi64(r0, idx_lo, r8); + t1 = _mm512_permutex2var_epi64(r1, idx_lo, r9); + t2 = _mm512_permutex2var_epi64(r2, idx_lo, ra); + t3 = _mm512_permutex2var_epi64(r3, idx_lo, rb); + t4 = _mm512_permutex2var_epi64(r4, idx_lo, rc); + t5 = _mm512_permutex2var_epi64(r5, idx_lo, rd); + t6 = _mm512_permutex2var_epi64(r6, idx_lo, re); + t7 = _mm512_permutex2var_epi64(r7, idx_lo, rf); + t8 = _mm512_permutex2var_epi64(r8, idx_hi, r0); + t9 = _mm512_permutex2var_epi64(r9, idx_hi, r1); + ta = _mm512_permutex2var_epi64(ra, idx_hi, r2); + tb = _mm512_permutex2var_epi64(rb, idx_hi, r3); + tc = _mm512_permutex2var_epi64(rc, idx_hi, r4); + td = 
_mm512_permutex2var_epi64(rd, idx_hi, r5); + te = _mm512_permutex2var_epi64(re, idx_hi, r6); + tf = _mm512_permutex2var_epi64(rf, idx_hi, r7); + + _mm256_mask_storeu_epi16(out + 0*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 0)); + _mm256_mask_storeu_epi16(out + 1*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t0, 1)); + _mm256_mask_storeu_epi16(out + 2*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 0)); + _mm256_mask_storeu_epi16(out + 3*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t1, 1)); + _mm256_mask_storeu_epi16(out + 4*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 0)); + _mm256_mask_storeu_epi16(out + 5*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t2, 1)); + _mm256_mask_storeu_epi16(out + 6*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 0)); + _mm256_mask_storeu_epi16(out + 7*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t3, 1)); + _mm256_mask_storeu_epi16(out + 8*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 0)); + _mm256_mask_storeu_epi16(out + 9*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t4, 1)); + _mm256_mask_storeu_epi16(out + 10*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 0)); + _mm256_mask_storeu_epi16(out + 11*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t5, 1)); + _mm256_mask_storeu_epi16(out + 12*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 0)); + _mm256_mask_storeu_epi16(out + 13*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t6, 1)); + _mm256_mask_storeu_epi16(out + 14*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 0)); + _mm256_mask_storeu_epi16(out + 15*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t7, 1)); + _mm256_mask_storeu_epi16(out + 16*out_width, store_mask, 
LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 0)); + _mm256_mask_storeu_epi16(out + 17*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t8, 1)); + _mm256_mask_storeu_epi16(out + 18*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 0)); + _mm256_mask_storeu_epi16(out + 19*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(t9, 1)); + _mm256_mask_storeu_epi16(out + 20*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 0)); + _mm256_mask_storeu_epi16(out + 21*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(ta, 1)); + _mm256_mask_storeu_epi16(out + 22*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 0)); + _mm256_mask_storeu_epi16(out + 23*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tb, 1)); + _mm256_mask_storeu_epi16(out + 24*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 0)); + _mm256_mask_storeu_epi16(out + 25*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tc, 1)); + _mm256_mask_storeu_epi16(out + 26*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 0)); + _mm256_mask_storeu_epi16(out + 27*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(td, 1)); + _mm256_mask_storeu_epi16(out + 28*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 0)); + _mm256_mask_storeu_epi16(out + 29*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(te, 1)); + _mm256_mask_storeu_epi16(out + 30*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 0)); + _mm256_mask_storeu_epi16(out + 31*out_width, store_mask, LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(tf, 1)); +#ifdef USE_CLDEMOTE + _mm_cldemote(out + 0*out_width); + _mm_cldemote(out + 1*out_width); + _mm_cldemote(out + 2*out_width); + _mm_cldemote(out + 3*out_width); + _mm_cldemote(out + 4*out_width); + _mm_cldemote(out + 5*out_width); + _mm_cldemote(out + 6*out_width); 
+ _mm_cldemote(out + 7*out_width); + _mm_cldemote(out + 8*out_width); + _mm_cldemote(out + 9*out_width); + _mm_cldemote(out + 10*out_width); + _mm_cldemote(out + 11*out_width); + _mm_cldemote(out + 12*out_width); + _mm_cldemote(out + 13*out_width); + _mm_cldemote(out + 14*out_width); + _mm_cldemote(out + 15*out_width); + _mm_cldemote(out + 16*out_width); + _mm_cldemote(out + 17*out_width); + _mm_cldemote(out + 18*out_width); + _mm_cldemote(out + 19*out_width); + _mm_cldemote(out + 20*out_width); + _mm_cldemote(out + 21*out_width); + _mm_cldemote(out + 22*out_width); + _mm_cldemote(out + 23*out_width); + _mm_cldemote(out + 24*out_width); + _mm_cldemote(out + 25*out_width); + _mm_cldemote(out + 26*out_width); + _mm_cldemote(out + 27*out_width); + _mm_cldemote(out + 28*out_width); + _mm_cldemote(out + 29*out_width); + _mm_cldemote(out + 30*out_width); + _mm_cldemote(out + 31*out_width); +#endif +#else + LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); LIBXSMM_UNUSED(ld_in); LIBXSMM_UNUSED(ld_out); LIBXSMM_UNUSED(col); +#endif +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void bf16_transpose(libxsmm_bfloat16 *in, libxsmm_bfloat16 *out, int M, int N, int ld_in, int ld_out){ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + int i, j; + int full16_chunks = N/16; + int remainder_cols = N%16; + int _N = N - remainder_cols; + + if (full16_chunks) { + for (i=0; iifmblock; + libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; + libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; + element_input_type alpha = (element_input_type)1; + element_input_type beta = (element_input_type)0; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + typedef libxsmm_smmfunction gemm_function; + gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, 
handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); + gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef float element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_smmfunction gemm_function; + libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; + libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; + libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; + float alpha = (element_input_type)1; + float beta = (element_input_type)0; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); + gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); +# define 
LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 +# define LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 +# undef LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.smrs; + +#define LIBXSMM_DNN_FC_BWD_USE_AVX512 + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include 
"template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#undef LIBXSMM_DNN_FC_BWD_USE_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = 
handle->gemm_bwd2.xgemm.bmrs; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.bmrs; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include 
"template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) + LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.bmrs; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta = handle->gemm_upd2.xgemm.bmrs; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef 
LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + return libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd bf16_batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd3.xgemm.bmrs; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd bf16_batchreduce_kernel_upd_zerobeta = handle->gemm_upd3.xgemm.bmrs; + libxsmm_bsmmfunction bwd_tile_config_kernel = handle->bwd_config_kernel; + /*libxsmm_bsmmfunction upd_tile_config_kernel = handle->upd_config_kernel;*/ + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define 
LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + return libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx_emu(handle, kind, start_thread, tid); +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx_emu(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd 
bf16_batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd3.xgemm.bmrs; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd bf16_batchreduce_kernel_upd_zerobeta = handle->gemm_upd3.xgemm.bmrs; + libxsmm_bsmmfunction bwd_tile_config_kernel = handle->bwd_config_kernel; + /*libxsmm_bsmmfunction upd_tile_config_kernel = handle->upd_config_kernel;*/ + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include 
"template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if ( kind == LIBXSMM_DNN_COMPUTE_KIND_BWD ) { + if (handle->grad_input == 0 || handle->grad_output == 0 || + handle->reg_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } else if ( kind == LIBXSMM_DNN_COMPUTE_KIND_UPD ) { + if (handle->reg_input == 0 || handle->grad_output == 0 || + handle->grad_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } else { + if (handle->grad_input == 0 || handle->grad_output == 0 || + handle->reg_input == 0 || handle->grad_filter == 0 || + handle->reg_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_custom_f32_f32( handle, kind, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__*/ + else if 
(handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_custom_bf16_f32( handle, kind, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; + libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; + libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; + element_input_type alpha = (element_input_type)1; + element_input_type beta = (element_input_type)0; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); + gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef float element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_smmfunction gemm_function; + libxsmm_blasint lda_bwd = (libxsmm_blasint)handle->ifmblock; + 
libxsmm_blasint ldb_bwd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldc_bwd = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint lda_upd = (libxsmm_blasint)handle->desc.K; + libxsmm_blasint ldb_upd = (libxsmm_blasint)handle->desc.N; + libxsmm_blasint ldc_upd = (libxsmm_blasint)handle->ofmblock; + float alpha = (element_input_type)1; + float beta = (element_input_type)0; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel_bwd = libxsmm_smmdispatch(handle->ifmblock, handle->desc.N, handle->desc.K, &lda_bwd, &ldb_bwd, &ldc_bwd, &alpha, &beta, NULL, NULL); + gemm_function gemm_kernel_upd = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->desc.N, &lda_upd, &ldb_upd, &ldc_upd, &alpha, &beta, NULL, NULL); +# define LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 +# define LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32 +# undef LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32 + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + int l_emu_amx = 0; + const char *const l_env_emu_amx = getenv("EMULATE_AMX"); + if ( 0 == l_env_emu_amx ) { + } else { + l_emu_amx = atoi(l_env_emu_amx); + } + + /* check if all required tensors are bound */ + if ( kind == LIBXSMM_DNN_COMPUTE_KIND_BWD ) { + if (handle->grad_input == 0 || handle->grad_output == 0 || + handle->reg_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } else if ( kind == LIBXSMM_DNN_COMPUTE_KIND_UPD ) { + if (handle->reg_input == 0 || 
handle->grad_output == 0 || + handle->grad_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } else { + if (handle->grad_input == 0 || handle->grad_output == 0 || + handle->reg_input == 0 || handle->grad_filter == 0 || + handle->reg_filter == 0 || handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) != 0) && ( handle->grad_bias == 0 ) ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) != 0) && ( handle->relumask == 0 ) ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_f32_f32( handle, kind, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_CPX) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CPX && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16( handle, kind, start_thread, tid); + } else if ( handle->desc.datatype_in == 
LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + if ( l_emu_amx == 0 ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid); + } else { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx_emu( handle, kind, start_thread, tid); + } + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR ) { + if ( l_emu_amx == 0 ) { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid); + } else { + status = libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_bf16_bf16_amx_emu( handle, kind, start_thread, tid); + } + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + LIBXSMM_UNUSED( l_emu_amx ); + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd = handle->gemm_bwd.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_bwd_zerobeta = handle->gemm_bwd2.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd = handle->gemm_upd.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_upd_zerobeta 
= handle->gemm_upd2.xgemm.smrs; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_BWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_BWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_nhwc(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + LIBXSMM_UNUSED( handle 
); + LIBXSMM_UNUSED( kind ); + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.h b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.h new file mode 100644 index 00000000..ab59cd44 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_backward_weight_update.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H +#define LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_custom(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_bwdupd_nhwc(libxsmm_dnn_fullyconnected* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_FULLYCONNECTED_BACKWARD_WEIGHT_UPDATE_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.c b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.c new file mode 100644 index 00000000..52904ac7 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.c @@ -0,0 
+1,649 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_fullyconnected_forward.h" +#include "libxsmm_main.h" + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + element_input_type alpha = (element_input_type)1; + element_input_type beta = (element_input_type)0; + libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; + libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); +# include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef float element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + typedef libxsmm_smmfunction gemm_function; + libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; + libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; + float alpha = (element_input_type)1; + float beta = (element_input_type)0; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, 
NULL, NULL); +# define LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32 +# include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32 + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd.xgemm.smrs; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.smrs; + +#define LIBXSMM_DNN_FC_FWD_USE_AVX512 + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef 
LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } +#undef LIBXSMM_DNN_FC_FWD_USE_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.bmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd3.xgemm.bmrs; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include 
"template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t 
libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.bmrs; + libxsmm_bmmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd3.xgemm.bmrs; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include 
"template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + return libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid ); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd bf16_batchreduce_kernel_zerobeta = handle->gemm_fwd3.xgemm.bmrs; + libxsmm_bsmmfunction tile_config_kernel = handle->fwd_config_kernel; +#define 
LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if (handle->compressed_A == 1) { + libxsmm_bsmmfunction_reducebatch_strd_meltwfused batchreduce_kernel_decompress = handle->gemm_fwd9.xgemm.bsmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_decompress = handle->gemm_fwd11.xgemm.bmrs_meltwfused; + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd4.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd12.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd5.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd13.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = 
handle->gemm_fwd6.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd14.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd7.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd15.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd8.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd16.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == 
LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd4.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd5.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd6.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd7.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd8.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef 
LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) { + return libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx_emu( handle, start_thread, tid ); +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx_emu(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernel = handle->gemm_fwd.xgemm.bsmrs; + libxsmm_bmmfunction_reducebatch_strd bf16_batchreduce_kernel_zerobeta = handle->gemm_fwd3.xgemm.bmrs; + libxsmm_bsmmfunction tile_config_kernel = handle->fwd_config_kernel; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if (handle->compressed_A == 1) { + libxsmm_bsmmfunction_reducebatch_strd_meltwfused batchreduce_kernel_decompress = handle->gemm_fwd9.xgemm.bsmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_decompress = handle->gemm_fwd11.xgemm.bmrs_meltwfused; + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define 
LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd4.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd12.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd5.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd13.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd6.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd14.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = 
handle->gemm_fwd7.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd15.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd8.xgemm.bmrs_meltwfused; + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress = handle->gemm_fwd16.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd4.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd5.xgemm.bmrs_meltwfused; 
+#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd6.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd7.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { + libxsmm_bmmfunction_reducebatch_strd_meltwfused bf16_batchreduce_kernel_zerobeta_fused_eltwise = handle->gemm_fwd8.xgemm.bmrs_meltwfused; +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if (handle->reg_input == 0 || handle->reg_output == 0 || + handle->reg_filter == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fullyconnected_st_fwd_custom_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE ) { + status = libxsmm_dnn_fullyconnected_st_fwd_custom_bf16_f32( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + typedef libxsmm_smmfunction gemm_function; + libxsmm_blasint lda = (libxsmm_blasint)handle->ofmblock; + libxsmm_blasint ldb = (libxsmm_blasint)handle->desc.C; + libxsmm_blasint ldc = (libxsmm_blasint)handle->desc.K; + element_input_type beta = (element_input_type)0; + element_input_type alpha = (element_input_type)1; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->desc.N, handle->desc.C, &lda, &ldb, &ldc, &alpha, &beta, NULL, NULL); +# include "template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c" + } else { + status = 
LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + int l_emu_amx = 0; + const char *const l_env_emu_amx = getenv("EMULATE_AMX"); + if ( 0 == l_env_emu_amx ) { + } else { + l_emu_amx = atoi(l_env_emu_amx); + } + + /* check if all required tensors are bound */ + if (handle->reg_input == 0 || handle->reg_output == 0 || + handle->reg_filter == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) != 0) && ( handle->reg_bias == 0 ) ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) != 0) && ( handle->relumask == 0 ) ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (handle->target_archid >= LIBXSMM_X86_AVX512) && (handle->target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_CPX) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && 
handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CPX && handle->target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR) { + if ( l_emu_amx == 0 ) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } else { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx_emu( handle, start_thread, tid); + } + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_CORE && handle->target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_emu( handle, start_thread, tid); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && handle->target_archid >= LIBXSMM_X86_AVX512_SPR ) { + if ( l_emu_amx == 0 ) { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } else { + status = libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_bf16_bf16_amx_emu( handle, start_thread, tid); + } + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + LIBXSMM_UNUSED( l_emu_amx ); + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_beta = handle->gemm_fwd.xgemm.smrs; + 
libxsmm_smmfunction_reducebatch_strd batchreduce_kernel_zerobeta = handle->gemm_fwd2.xgemm.smrs; + + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_NONE +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_NONE + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_RELU ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_RELU +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_RELU +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else if ( handle->desc.fuse_ops == LIBXSMM_DNN_FULLYCONNECTED_FUSE_BIAS_SIGMOID ) { +#define LIBXSMM_DNN_FC_FWD_FUSE_BIAS +#define LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +# include "template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c" +#undef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID +#undef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + } else { + status = LIBXSMM_DNN_ERR_FC_UNSUPPORTED_FUSION; + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid) +{ 
+ libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + LIBXSMM_UNUSED( handle ); + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.h b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.h new file mode 100644 index 00000000..949bc955 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fullyconnected_forward.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H +#define LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_custom(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fullyconnected_st_fwd_nhwc(libxsmm_dnn_fullyconnected* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_FULLYCONNECTED_FORWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm.c b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm.c new file mode 100644 index 00000000..6d91c8d4 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm.c @@ -0,0 +1,638 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_fusedbatchnorm_backward.h" +#include "libxsmm_dnn_fusedbatchnorm_forward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API libxsmm_dnn_fusedbatchnorm* libxsmm_dnn_create_fusedbatchnorm(libxsmm_dnn_fusedbatchnorm_desc fusedbatchnorm_desc, libxsmm_dnn_err_t* status) { + libxsmm_dnn_fusedbatchnorm* handle = 0; + int lpb; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( fusedbatchnorm_desc.partN > fusedbatchnorm_desc.fullN ) { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + return handle; + } else if ( (fusedbatchnorm_desc.partN != fusedbatchnorm_desc.fullN) && ((fusedbatchnorm_desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) == 0 ) && ((fusedbatchnorm_desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) == 0 ) ) { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + return handle; + } else { + } + + if ( ((fusedbatchnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fusedbatchnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || + ((fusedbatchnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fusedbatchnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_fusedbatchnorm*)calloc(1, sizeof(libxsmm_dnn_fusedbatchnorm)); + + if (0 != handle) { + *status = LIBXSMM_DNN_SUCCESS; + /* let's make the description persistent */ + handle->desc = fusedbatchnorm_desc; + /* we need to compute the memory layout given the */ + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, + &(handle->ifmblock), 
&(handle->ofmblock), &lpb, + handle->desc.datatype_in, handle->desc.datatype_out ); + /* compute the outer blocks */ + handle->blocksifm = handle->desc.C / handle->ifmblock; + handle->blocksofm = handle->desc.C / handle->ofmblock; + /* create barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + /* calculate scratch size for batchstats */ + handle->scratch_size = (sizeof(float) * 2 * handle->desc.C * handle->desc.partN); + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + } else { + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedbatchnorm(const libxsmm_dnn_fusedbatchnorm* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + /* deallocate handle structure */ + free(/*remove constness*/(libxsmm_dnn_fusedbatchnorm*)handle); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(const libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + layout->format = handle->desc.buffer_format; + + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + if 
((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.partN; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.partN; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = 
LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.partN; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.partN; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + 
free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->desc.C; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->desc.partN; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->desc.C; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->desc.partN; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = 
LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) || (type == LIBXSMM_DNN_CHANNEL_BETA) || + (type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_CHANNEL_GAMMA) || + (type == LIBXSMM_DNN_CHANNEL_EXPECTVAL) || (type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV) || (type == LIBXSMM_DNN_CHANNEL_VARIANCE) ) { + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype_stats; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->blocksifm; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype_stats; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned 
int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 1; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->desc.C; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { + layout->tensor_type = LIBXSMM_DNN_RELU_MASK; + + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I8; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.partN; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I8; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) 
malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 4; /* must match the 4 entries allocated and filled for NHWC (C, W, H, N) */ + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock*handle->blocksofm; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->desc.partN; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + +LIBXSMM_API size_t libxsmm_dnn_fusedbatchnorm_get_scratch_size(const libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_err_t* status) { + size_t l_scratch_size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_scratch(libxsmm_dnn_fusedbatchnorm* handle, const void* scratch) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if 
(0 != handle) { + /* align the internal scratch buffer if needed */ + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_scratch(libxsmm_dnn_fusedbatchnorm* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_bind_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_fusedbatchnorm_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 
(libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + handle->reg_add = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + handle->grad_add = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + handle->reg_beta = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + handle->grad_beta = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + handle->reg_gamma = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + handle->grad_gamma = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + handle->expvalue = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + handle->rcpstddev = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + handle->variance = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_fusedbatchnorm_get_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != 
LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + return_tensor = handle->reg_input; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + return_tensor = handle->grad_input; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + return_tensor = handle->reg_output; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + return_tensor = handle->grad_output; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + return_tensor = handle->reg_add; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + return_tensor = handle->grad_add; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + return_tensor = handle->reg_beta; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + return_tensor = handle->grad_beta; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + return_tensor = handle->reg_gamma; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + return_tensor = handle->grad_gamma; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + return_tensor = handle->expvalue; + } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + return_tensor = handle->rcpstddev; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + return_tensor = handle->variance; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + return_tensor = handle->relumask; + } else { + /* cannot happen */ + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return return_tensor; +} + + 
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_release_tensor(libxsmm_dnn_fusedbatchnorm* handle, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + handle->reg_add = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + handle->grad_add = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + handle->reg_beta = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + handle->grad_beta = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + handle->reg_gamma = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + handle->grad_gamma = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + handle->expvalue = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + handle->rcpstddev = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + handle->variance = 0; + } else if 
( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_execute_st(libxsmm_dnn_fusedbatchnorm* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + switch (handle->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom( handle, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + switch (handle->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom( handle, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handles && num_handles > 0) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + switch (handles[0]->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom( handles, num_handles, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + switch (handles[0]->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + 
status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom( handles, num_handles, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.c b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.c new file mode 100644 index 00000000..2d632f42 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.c @@ -0,0 +1,604 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_fusedbatchnorm_backward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include 
"template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); 
LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define 
LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include 
"template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include 
"template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); 
LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if ( handle->reg_input == 0 || handle->reg_gamma == 0 || + handle->grad_input == 0 || handle->grad_output == 0 || + handle->grad_beta == 0 || handle->grad_gamma == 0 || + handle->expvalue == 0 || handle->rcpstddev == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0 ) { + if ( handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) > 0 ) { + if ( handle->grad_add == 0 ) { + status = 
LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) > 0 ) { + if ( handle->reg_output == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) > 0 ) { + if ( handle->relumask == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 16) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c16( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 32) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c32( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 64) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = 
libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_f32_c64( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef 
LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) { +# define LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_BWD_BF16 + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + LIBXSMM_UNUSED( handle ); + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t 
libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + int l_count; + + /* check if all required tensors are bound */ + for ( l_count = 0; l_count < num_handles; ++l_count ) { + if ( handles[l_count]->grad_beta == 0 || handles[l_count]->grad_gamma == 0 || handles[l_count]->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + +#if 0 + /* check if we are on an AVX512 platform */ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { + status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom_avx512( handles, num_handles, start_thread, tid ); + } else +#endif + { + const int nImg = handles[0]->desc.partN; + const int nBlocksFm = handles[0]->blocksifm; + const int nFmBlock = handles[0]->ifmblock; + /* computing first logical thread */ + const int ltid = tid - start_thread; + /* number of tasks that could be run in parallel */ + const int work2 = nBlocksFm; + /* compute chunk size */ + const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; + const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; + int v = 0, fm; + + LIBXSMM_VLA_DECL(2, float, dgamma0, (float*)handles[0]->grad_gamma->data, nFmBlock); + LIBXSMM_VLA_DECL(2, float, dbeta0, (float*)handles[0]->grad_beta->data, nFmBlock); + LIBXSMM_VLA_DECL(3, float, dgamma_img0, (float*)handles[0]->scratch, nImg, nFmBlock); + LIBXSMM_VLA_DECL(3, float, dbeta_img0, ((float*)handles[0]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); + + /* lazy barrier init */ + libxsmm_barrier_init(handles[0]->barrier, ltid); + + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); + float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); + float* dgamma_img0_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img0, fm, 0, 0, nImg, nFmBlock); + float* dbeta_img0_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img0, fm, 0, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + dgamma0_ptr[v] = dgamma_img0_ptr[v]; + dbeta0_ptr[v] = dbeta_img0_ptr[v]; + } + } + + /* now we need to reduce the dgamma and dbeta */ + for ( l_count = 1; l_count < num_handles; ++l_count ) { + LIBXSMM_VLA_DECL(3, float, dgamma_imgr, (float*)handles[l_count]->scratch, nImg, nFmBlock); + LIBXSMM_VLA_DECL(3, float, dbeta_imgr, ((float*)handles[l_count]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); + + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); + float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); + float* dgamma_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_imgr, fm, 0, 0, nImg, nFmBlock); + float* dbeta_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_imgr, fm, 0, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + dgamma0_ptr[v] += dgamma_imgr_ptr[v]; + dbeta0_ptr[v] += dbeta_imgr_ptr[v]; + } + } + } + + for ( l_count = 1; l_count < num_handles; ++l_count ) { + 
LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); + LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); + + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); + float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); + float* dgammar_ptr = &LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); + float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + dgammar_ptr[v] = dgamma0_ptr[v]; + dbetar_ptr[v] = dbeta0_ptr[v]; + } + } + } + + libxsmm_barrier_wait(handles[0]->barrier, ltid); + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.h b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.h new file mode 100644 index 00000000..a09c3421 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_backward.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H +#define LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_bwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_FUSEDBATCHNORM_BACKWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.c b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.c new file mode 100644 index 00000000..fd3bf92e --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.c @@ -0,0 +1,618 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_fusedbatchnorm_forward.h" +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + 
(handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = 
LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include 
"template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); 
LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == 
LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 +#else /* 
should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( 
(handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDBN_FWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if ( handle->reg_input == 0 || handle->reg_output == 0 || + handle->reg_beta == 0 || handle->reg_gamma == 0 || + handle->expvalue == 0 || handle->rcpstddev == 0 || handle->variance == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0 ) { + if ( handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) > 0 ) { + if ( handle->reg_add == 0 ) { + status = 
LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) > 0 ) { + if ( handle->relumask == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 16) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c16( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 32) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c32( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 64) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_f32_c64( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedbatchnorm_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef 
LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION; + } + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDBN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDBN_ORDER_BN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BN) || + (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) || (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) ) { +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK) == 
LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE_RELU_WITH_MASK ) {
+# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE
+# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK
+# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK
+# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE
+ } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDBN_OPS_ELTWISE ) {
+# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE
+# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE
+ } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU) == LIBXSMM_DNN_FUSEDBN_OPS_RELU ) {
+# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU
+# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU
+ } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDBN_OPS_RELU_WITH_MASK ) {
+# define LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK
+# include "template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK
+ } else {
+ status = LIBXSMM_DNN_ERR_FUSEDBN_UNSUPPORTED_FUSION;
+ }
+ }
+# undef LIBXSMM_DNN_FUSEDBN_FWD_BF16
+ } else {
+ status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+ return status;
+ }
+ }
+
+ return status;
+}
+
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid)
+{
+ libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
+ LIBXSMM_UNUSED( handle );
+ LIBXSMM_UNUSED( start_thread );
+ LIBXSMM_UNUSED( tid );
+ return status;
+}
+
+
+/**
+ * Reduces the partial sums of several batch-norm handles and publishes the
+ * resulting statistics to all of them.
+ *
+ * The scratch buffer of every handle is read as two float arrays: the sums at
+ * its start and the sums of squares nImg * nBlocksFm * nFmBlock floats later.
+ * Slot [fm][0][*] of handles[1..] is added into handles[0]; mean, reciprocal
+ * standard deviation and variance are derived from that and written to
+ * expvalue/rcpstddev/variance of handles[0], then copied to the other handles.
+ * The feature-map blocks are split over handles[0]->desc.threads threads and
+ * handles[0]->barrier is initialized on entry and waited on before returning.
+ *
+ * @param handles      array of num_handles handles; handles[0] supplies the
+ *                     geometry (partN, fullN, H, W, blocking, threads).
+ * @param num_handles  number of entries in handles, must be at least 1.
+ * @param start_thread id of the first thread of the team.
+ * @param tid          id of the calling thread.
+ * @return LIBXSMM_DNN_SUCCESS; LIBXSMM_DNN_ERR_INVALID_HANDLE when the list is
+ *         empty or holds a NULL entry; LIBXSMM_DNN_ERR_DATA_NOT_BOUND when a
+ *         handle lacks expvalue, rcpstddev, variance or scratch.
+ */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+  int l_count;
+
+  /* handles[0] is dereferenced unconditionally below, so an empty list cannot be reduced */
+  if ( handles == 0 || num_handles < 1 ) {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+    return status;
+  }
+
+  /* check if all handles are valid and all required tensors are bound */
+  for ( l_count = 0; l_count < num_handles; ++l_count ) {
+    if ( handles[l_count] == 0 ) {
+      status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+      return status;
+    }
+    if ( handles[l_count]->expvalue == 0 || handles[l_count]->rcpstddev == 0 || handles[l_count]->variance == 0 || handles[l_count]->scratch == 0 ) {
+      status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND;
+      return status;
+    }
+  }
+
+#if 0
+  /* check if we are on an AVX512 platform */
+  if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) {
+    status = libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom_avx512( handles, num_handles, start_thread, tid );
+  } else
+#endif
+  {
+    const int nImg = handles[0]->desc.partN;
+    const int nBlocksFm = handles[0]->blocksifm;
+    const int nFmBlock = handles[0]->ifmblock;
+    /* computing first logical thread */
+    const int ltid = tid - start_thread;
+    /* number of tasks that could be run in parallel */
+    const int work2 = nBlocksFm;
+    /* compute chunk size (ceiling division of the work by the thread count) */
+    const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1);
+    /* compute thr_begin and thr_end, both clamped to the amount of work */
+    const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2;
+    const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2;
+    int v = 0, fm;
+    /* added to the variance before taking the square root */
+    const float sqrt_eps = 1e-7f;
+    const float nhw = (float)(handles[0]->desc.fullN * handles[0]->desc.H * handles[0]->desc.W);
+    const float recp_nhw = 1.0f/nhw;
+
+    LIBXSMM_VLA_DECL(2, float, bmean0, (float*)handles[0]->expvalue->data, nFmBlock);
+    LIBXSMM_VLA_DECL(2, float, brstd0, (float*)handles[0]->rcpstddev->data, nFmBlock);
+    LIBXSMM_VLA_DECL(2, float, variance0, (float*)handles[0]->variance->data, nFmBlock);
+    LIBXSMM_VLA_DECL(3, float, sum_img0, (float*)handles[0]->scratch, nImg, nFmBlock);
+    LIBXSMM_VLA_DECL(3, float, sumsq_img0, ((float*)handles[0]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock);
+
+    /* lazy barrier init */
+    libxsmm_barrier_init(handles[0]->barrier, ltid);
+
+    /* now we need to reduce the sum and sum^2 of all other handles into handles[0] */
+    for ( l_count = 1; l_count < num_handles; ++l_count ) {
+      LIBXSMM_VLA_DECL(3, float, sum_imgr, (float*)handles[l_count]->scratch, nImg, nFmBlock);
+      LIBXSMM_VLA_DECL(3, float, sumsq_imgr, ((float*)handles[l_count]->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock);
+
+      for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
+        float* sum_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img0, fm, 0, 0, nImg, nFmBlock);
+        float* sumsq_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img0, fm, 0, 0, nImg, nFmBlock);
+        float* sum_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, sum_imgr, fm, 0, 0, nImg, nFmBlock);
+        float* sumsq_imgr_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_imgr, fm, 0, 0, nImg, nFmBlock);
+
+        LIBXSMM_PRAGMA_SIMD
+        for ( v=0; v < nFmBlock; v++ ) {
+          sum_img0_ptr[v] += sum_imgr_ptr[v];
+          sumsq_img0_ptr[v] += sumsq_imgr_ptr[v];
+        }
+      }
+    }
+
+    /* derive mean, variance and reciprocal standard deviation from the reduced sums */
+    for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
+      float* bmean0_ptr = &LIBXSMM_VLA_ACCESS(2, bmean0, fm, 0, nFmBlock);
+      float* brstd0_ptr = &LIBXSMM_VLA_ACCESS(2, brstd0, fm, 0, nFmBlock);
+      float* tvar0_ptr = &LIBXSMM_VLA_ACCESS(2, variance0, fm, 0, nFmBlock);
+      float* sum_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img0, fm, 0, 0, nImg, nFmBlock);
+      float* sumsq_img0_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img0, fm, 0, 0, nImg, nFmBlock);
+
+      LIBXSMM_PRAGMA_SIMD
+      for ( v=0; v < nFmBlock; v++ ) {
+        const float tbmean = (recp_nhw * sum_img0_ptr[v]);
+        const float tbmeansq = tbmean * tbmean;
+        const float tsqbmean = recp_nhw * sumsq_img0_ptr[v];
+        /* variance as E[x^2] - E[x]^2 */
+        /* NOTE(review): rounding can make this slightly negative, which would turn tbrstd into NaN - confirm inputs */
+        const float tvar = tsqbmean - tbmeansq;
+        const float tbrstd = (float)(1.0/sqrt((double)tvar + sqrt_eps));
+        bmean0_ptr[v] = tbmean;
+        brstd0_ptr[v] = tbrstd;
+        tvar0_ptr[v] = tvar;
+      }
+    }
+
+    /* copy the statistics of handles[0] to all other handles */
+    for ( l_count = 1; l_count < num_handles; ++l_count ) {
+      LIBXSMM_VLA_DECL(2, float, bmeanr, (float*)handles[l_count]->expvalue->data, nFmBlock);
+      LIBXSMM_VLA_DECL(2, float, brstdr, (float*)handles[l_count]->rcpstddev->data, nFmBlock);
+      LIBXSMM_VLA_DECL(2, float, variancer, (float*)handles[l_count]->variance->data, nFmBlock);
+
+      for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
+        float* bmean0_ptr = &LIBXSMM_VLA_ACCESS(2, bmean0, fm, 0, nFmBlock);
+        float* brstd0_ptr = &LIBXSMM_VLA_ACCESS(2, brstd0, fm, 0, nFmBlock);
+        float* tvar0_ptr = &LIBXSMM_VLA_ACCESS(2, variance0, fm, 0, nFmBlock);
+        float* bmeanr_ptr = &LIBXSMM_VLA_ACCESS(2, bmeanr, fm, 0, nFmBlock);
+        float* brstdr_ptr = &LIBXSMM_VLA_ACCESS(2, brstdr, fm, 0, nFmBlock);
+        float* tvarr_ptr = &LIBXSMM_VLA_ACCESS(2, variancer, fm, 0, nFmBlock);
+
+        LIBXSMM_PRAGMA_SIMD
+        for ( v=0; v < nFmBlock; v++ ) {
+          bmeanr_ptr[v] = bmean0_ptr[v];
+          brstdr_ptr[v] = brstd0_ptr[v];
+          tvarr_ptr[v] = tvar0_ptr[v];
+        }
+      }
+    }
+
+    libxsmm_barrier_wait(handles[0]->barrier, ltid);
+  }
+
+  return status;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.h b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.h
new file mode 100644
index 00000000..dfd76f66
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_fusedbatchnorm_forward.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+* Copyright (c) Intel
Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H
+#define LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H
+
+#include <libxsmm_dnn_fusedbatchnorm.h> /* NOTE(review): include target was lost in extraction; restored from the types used below - confirm against upstream */
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_custom(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid);
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_st_fwd_nhwc(libxsmm_dnn_fusedbatchnorm* handle, int start_thread, int tid);
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedbatchnorm_reduce_stats_st_fwd_custom(libxsmm_dnn_fusedbatchnorm** handles, int num_handles, int start_thread, int tid);
+
+#endif /* LIBXSMM_DNN_FUSEDBATCHNORM_FORWARD_H */
diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm.c b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm.c
new file mode 100644
index 00000000..97796014
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm.c
@@ -0,0 +1,648 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ +#include "libxsmm_dnn_fusedgroupnorm_backward.h" +#include "libxsmm_dnn_fusedgroupnorm_forward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API libxsmm_dnn_fusedgroupnorm* libxsmm_dnn_create_fusedgroupnorm(libxsmm_dnn_fusedgroupnorm_desc fusedgroupnorm_desc, libxsmm_dnn_err_t* status) { + libxsmm_dnn_fusedgroupnorm* handle = 0; + int lpb; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( ((fusedgroupnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (fusedgroupnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) || + ((fusedgroupnorm_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (fusedgroupnorm_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_fusedgroupnorm*)calloc(1, sizeof(libxsmm_dnn_fusedgroupnorm)); + + if (0 != handle) { + *status = LIBXSMM_DNN_SUCCESS; + /* let's make the description persistent */ + handle->desc = fusedgroupnorm_desc; + /* we need to compute the memory layout given the */ + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, + &(handle->ifmblock), &(handle->ofmblock), &lpb, + handle->desc.datatype_in, handle->desc.datatype_out ); + /* compute the outer blocks */ + handle->blocksifm = handle->desc.C / handle->ifmblock; + handle->blocksofm = handle->desc.C / handle->ofmblock; + /* create barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + /* calculate scratch size for batchstats */ + handle->scratch_size = (sizeof(float) * 2 * ((handle->desc.C * handle->desc.N) + (handle->desc.G * handle->desc.N))); + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + } else { + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_fusedgroupnorm(const libxsmm_dnn_fusedgroupnorm* handle) { + libxsmm_dnn_err_t status = 
LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + /* deallocate handle structure */ + free(/*remove constness*/(libxsmm_dnn_fusedgroupnorm*)handle); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(const libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + layout->format = handle->desc.buffer_format; + + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_F32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_BF16; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == 
LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->blocksifm; + layout->dim_size[4] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || + ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + layout->dim_type[0] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_INPUT_ADD) || (type == LIBXSMM_DNN_GRADIENT_INPUT_ADD) ) { + layout->dim_size[0] = handle->desc.C; + layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in); + layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in); + layout->dim_size[3] = handle->desc.N; + } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + layout->dim_size[0] = handle->desc.C; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->desc.N; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) || (type == LIBXSMM_DNN_CHANNEL_BETA) || + (type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) || (type == LIBXSMM_DNN_CHANNEL_GAMMA) ) { + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype_stats; + layout->dim_type = 
(libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->ifmblock; + layout->dim_size[1] = handle->blocksifm; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + if ( handle->desc.datatype_stats == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype_stats; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 1; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = handle->desc.C; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_CHANNEL_EXPECTVAL) || (type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV) || (type == LIBXSMM_DNN_CHANNEL_VARIANCE) ) { + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) || ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) ) { + if ( handle->desc.datatype_stats 
== LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype_stats; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 2; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_G; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->desc.G; + layout->dim_size[1] = handle->desc.N; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_RELU_MASK) ) { + layout->tensor_type = LIBXSMM_DNN_RELU_MASK; + + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I8; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->blocksofm; + layout->dim_size[4] = handle->desc.N; + } else { + free(layout); + layout = 0; /* make 
sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I8; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 6; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->ofmblock*handle->blocksofm; + layout->dim_size[1] = (handle->desc.W/handle->desc.v) + (2*handle->desc.pad_w_out); + layout->dim_size[2] = (handle->desc.H/handle->desc.u) + (2*handle->desc.pad_h_out); + layout->dim_size[3] = handle->desc.N; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + +LIBXSMM_API size_t libxsmm_dnn_fusedgroupnorm_get_scratch_size(const libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_err_t* status) { + size_t l_scratch_size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +LIBXSMM_API libxsmm_dnn_err_t 
libxsmm_dnn_fusedgroupnorm_bind_scratch(libxsmm_dnn_fusedgroupnorm* handle, const void* scratch) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if (0 != handle) { + /* align the internal scratch buffer if needed */ + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_scratch(libxsmm_dnn_fusedgroupnorm* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_bind_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = 
libxsmm_dnn_fusedgroupnorm_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + handle->reg_add = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + handle->grad_add = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + handle->reg_beta = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + handle->grad_beta = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + handle->reg_gamma = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + handle->grad_gamma = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + handle->expvalue = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + handle->rcpstddev = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + handle->variance = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor* 
libxsmm_dnn_fusedgroupnorm_get_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + return_tensor = handle->reg_input; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + return_tensor = handle->grad_input; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + return_tensor = handle->reg_output; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + return_tensor = handle->grad_output; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + return_tensor = handle->reg_add; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + return_tensor = handle->grad_add; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + return_tensor = handle->reg_beta; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + return_tensor = handle->grad_beta; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + return_tensor = handle->reg_gamma; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + return_tensor = handle->grad_gamma; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + return_tensor = handle->expvalue; + } else if ( 
type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + return_tensor = handle->rcpstddev; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + return_tensor = handle->variance; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + return_tensor = handle->relumask; + } else { + /* cannot happen */ + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return return_tensor; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_release_tensor(libxsmm_dnn_fusedgroupnorm* handle, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) && + (type != LIBXSMM_DNN_REGULAR_INPUT_ADD) && (type != LIBXSMM_DNN_GRADIENT_INPUT_ADD) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_BETA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_BETA) && + (type != LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA) && (type != LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA) && + (type != LIBXSMM_DNN_CHANNEL_EXPECTVAL) && (type != LIBXSMM_DNN_CHANNEL_RCPSTDDEV) && + (type != LIBXSMM_DNN_CHANNEL_VARIANCE) && (type != LIBXSMM_DNN_RELU_MASK) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) { + handle->grad_output = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_INPUT_ADD ) { + handle->reg_add = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT_ADD ) { + handle->grad_add = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_CHANNEL_BETA ) { + handle->reg_beta = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_BETA ) { + handle->grad_beta = 0; + } else if ( type == 
LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA ) { + handle->reg_gamma = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA ) { + handle->grad_gamma = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_EXPECTVAL ) { + handle->expvalue = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_RCPSTDDEV ) { + handle->rcpstddev = 0; + } else if ( type == LIBXSMM_DNN_CHANNEL_VARIANCE ) { + handle->variance = 0; + } else if ( type == LIBXSMM_DNN_RELU_MASK ) { + handle->relumask = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_execute_st(libxsmm_dnn_fusedgroupnorm* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + switch (handle->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom( handle, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + switch (handle->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom( handle, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handles && num_handles > 0) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + switch 
(handles[0]->desc.buffer_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: { + status = libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom( handles, num_handles, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.c b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.c new file mode 100644 index 00000000..1cdc7142 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.c @@ -0,0 +1,581 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_fusedgroupnorm_backward.h" +#include "libxsmm_main.h" + +#if 0 +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU 
+# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float 
element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } 
+#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# 
define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define 
LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = 
LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + 
status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# 
include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if ( handle->reg_input == 0 || handle->reg_gamma == 0 || + handle->grad_input == 0 || handle->grad_output == 0 || + handle->grad_beta == 0 || handle->grad_gamma == 0 || + handle->expvalue == 0 || handle->rcpstddev == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_GN) > 0 ) { + if ( handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { + if ( handle->grad_add == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { + if ( handle->reg_output == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { + if ( handle->relumask == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + /* check if we are on an AVX512 platform */ 
+#if 0 +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 16) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c16( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 32) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c32( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 64) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_f32_c64( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && 
handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { +# define 
LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_BWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_BWD_BF16 + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + +/* NHWC backward pass: placeholder that performs no work, marks all three arguments as unused and always returns LIBXSMM_DNN_ERR_NOT_IMPLEMENTED. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + LIBXSMM_UNUSED( handle ); + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + return status; +} +/* Reduces the gamma/beta gradients over num_handles handles: sums grad_gamma and grad_beta of handles[1..num_handles-1] into handles[0], then copies the sums back into each of those handles. Feature-map blocks are split between threads using handles[0]->desc.threads and the relative thread id (tid - start_thread). Returns LIBXSMM_DNN_ERR_DATA_NOT_BOUND if any handle has no grad_beta or grad_gamma, otherwise LIBXSMM_DNN_SUCCESS. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + int l_count; + + /* every handle must have grad_beta and grad_gamma bound; bail out before any data is touched otherwise */ + for ( l_count = 0; l_count < num_handles; ++l_count ) { + if ( handles[l_count]->grad_beta == 0 || handles[l_count]->grad_gamma == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + /* NOTE(review): handles[0] is dereferenced unconditionally below, so callers presumably guarantee num_handles >= 1 - confirm */ +#if 0 + /* compiled out by the enclosing #if 0: would dispatch to an AVX512 variant when libxsmm_target_archid >= LIBXSMM_X86_AVX512; as written, the generic block below always runs */ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { + status = libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom_avx512( handles, num_handles, start_thread, tid ); + } else +#endif + { + const int nBlocksFm = handles[0]->blocksifm; + const int nFmBlock = handles[0]->ifmblock; + /* thread id relative to start_thread */ + const int ltid = tid - start_thread; 
+ /* one unit of work per feature-map block (blocksifm of handles[0]) */ + const int work2 = nBlocksFm; + /* blocks per thread: work2 / threads, rounded up when it does not divide evenly */ + const int chunksize2 = (work2 % handles[0]->desc.threads == 0) ? (work2 / handles[0]->desc.threads) : ((work2 / handles[0]->desc.threads) + 1); + /* half-open block range [thr_begin2, thr_end2) owned by this thread, both ends clamped to work2 */ + const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; + const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + int v = 0, fm; + /* grad_gamma/grad_beta data of handles[0], cast to float and addressed below as [fm][0..nFmBlock-1] through LIBXSMM_VLA_ACCESS */ + LIBXSMM_VLA_DECL(2, float, dgamma0, (float*)handles[0]->grad_gamma->data, nFmBlock); + LIBXSMM_VLA_DECL(2, float, dbeta0, (float*)handles[0]->grad_beta->data, nFmBlock); + + /* lazy barrier init */ + libxsmm_barrier_init(handles[0]->barrier, ltid); + + /* step 1: accumulate dgamma/dbeta of handles[1..] into handles[0], restricted to this thread's block range */ + for ( l_count = 1; l_count < num_handles; ++l_count ) { + LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); + LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); + + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); + float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); + float* dgammar_ptr = &LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); + float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + dgamma0_ptr[v] += dgammar_ptr[v]; + dbeta0_ptr[v] += dbetar_ptr[v]; + } + } + } + /* step 2: copy the sums now held in handles[0] back into every other handle; it uses the same block range as step 1, so this thread only reads values it accumulated itself */ + for ( l_count = 1; l_count < num_handles; ++l_count ) { + LIBXSMM_VLA_DECL(2, float, dgammar, (float*)handles[l_count]->grad_gamma->data, nFmBlock); + LIBXSMM_VLA_DECL(2, float, dbetar, (float*)handles[l_count]->grad_beta->data, nFmBlock); + + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + float* dgamma0_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma0, fm, 0, nFmBlock); + float* dbeta0_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta0, fm, 0, nFmBlock); + float* dgammar_ptr = 
&LIBXSMM_VLA_ACCESS(2, dgammar, fm, 0, nFmBlock); + float* dbetar_ptr = &LIBXSMM_VLA_ACCESS(2, dbetar, fm, 0, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + dgammar_ptr[v] = dgamma0_ptr[v]; + dbetar_ptr[v] = dbeta0_ptr[v]; + } + } + } + /* wait on the barrier of handles[0] before returning */ + libxsmm_barrier_wait(handles[0]->barrier, ltid); + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.h b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.h new file mode 100644 index 00000000..4ef94633 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_backward.h @@ -0,0 +1,22 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H +#define LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_bwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_reduce_stats_st_bwd_custom(libxsmm_dnn_fusedgroupnorm** handles, int num_handles, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_FUSEDGROUPNORM_BACKWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.c b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.c new file mode 100644 index 00000000..df7d3a9b --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.c @@ -0,0 +1,500 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_fusedgroupnorm_forward.h" +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if 0 +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include 
"template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t 
libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { 
+# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 
0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include 
"template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef 
libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef 
LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( (handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN) ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { +# define 
LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if all required tensors are bound */ + if ( handle->reg_input == 0 || handle->reg_output == 0 || + handle->reg_beta == 0 || handle->reg_gamma == 0 || + handle->expvalue == 0 || handle->rcpstddev == 0 || handle->variance == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_GN) > 0 ) { + if ( handle->scratch == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) > 0 ) { + if ( handle->reg_add == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) > 0 ) { + if ( handle->relumask == 0 ) { + status = 
LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + } + + /* check if we are on an AVX512 platform */ +#if 0 +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 16) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c16( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 32) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c32( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 64) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_f32_c64( handle, start_thread, tid ); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_fusedgroupnorm_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return 
status; + } + } else +#endif +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_stats_type; + + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef float element_stats_type; + +# define LIBXSMM_DNN_FUSEDGN_FWD_BF16 + if ( handle->desc.fuse_order != LIBXSMM_DNN_FUSEDGN_ORDER_GN_ELTWISE_RELU ) { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_ORDER; + } else { + if ( handle->desc.fuse_ops == LIBXSMM_DNN_FUSEDGN_OPS_GN ) { +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE) == LIBXSMM_DNN_FUSEDGN_OPS_ELTWISE ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef 
LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU) == LIBXSMM_DNN_FUSEDGN_OPS_RELU ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU + } else if ( (handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK) == LIBXSMM_DNN_FUSEDGN_OPS_RELU_WITH_MASK ) { +# define LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK +# include "template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK + } else { + status = LIBXSMM_DNN_ERR_FUSEDGN_UNSUPPORTED_FUSION; + } + } +# undef LIBXSMM_DNN_FUSEDGN_FWD_BF16 + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + LIBXSMM_UNUSED( handle ); + LIBXSMM_UNUSED( start_thread ); + LIBXSMM_UNUSED( tid ); + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.h b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.h new file mode 100644 index 00000000..41d11bfa --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_fusedgroupnorm_forward.h @@ -0,0 +1,20 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#ifndef LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H +#define LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_custom(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_fusedgroupnorm_st_fwd_nhwc(libxsmm_dnn_fusedgroupnorm* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_FUSEDGROUPNORM_FORWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_optimizer.c b/third_party/libxsmm/src/libxsmm_dnn_optimizer.c new file mode 100644 index 00000000..8d322879 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_optimizer.c @@ -0,0 +1,345 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_optimizer_sgd.h" +#include "libxsmm_main.h" + + +LIBXSMM_API libxsmm_dnn_optimizer* libxsmm_dnn_create_optimizer(libxsmm_dnn_optimizer_desc optimizer_desc, libxsmm_dnn_err_t* status) { + libxsmm_dnn_optimizer* handle = 0; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( (optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_F32) || (optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_optimizer*)calloc(1, sizeof(libxsmm_dnn_optimizer)); + + if (0 != handle) { + *status = LIBXSMM_DNN_SUCCESS; + /* let's make the description persistent */ + handle->desc = optimizer_desc; + + if ( (handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + /* we need to compute the memory layout given the */ + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.K, + &(handle->bc), &(handle->bk), &(handle->fm_lp_block), + handle->desc.datatype, handle->desc.datatype ); + /* compute the outer blocks */ + handle->Bc = handle->desc.C / handle->bc; + handle->Bk = handle->desc.K / handle->bk; + } else if ( (handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0 ) { + if ( optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + handle->fm_lp_block = 1; + } else if ( optimizer_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + handle->fm_lp_block = 2; + } else { + } + handle->bc = handle->desc.bc; + handle->bk = handle->desc.bk; + handle->Bc = handle->desc.C / handle->bc; + handle->Bk = handle->desc.K / handle->bk; + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + free( handle ); + handle = 0; + return handle; + } + /* create barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + /* calculate scratch size for local optimizer copies of one feature map block per thread */ + handle->scratch_size = 1; + 
} else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + } else { + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_optimizer(const libxsmm_dnn_optimizer* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + /* deallocate handle structure */ + free(/*remove constness*/(libxsmm_dnn_optimizer*)handle); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_optimizer_create_tensor_datalayout(const libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + layout->format = handle->desc.filter_format; + + if ( (type == LIBXSMM_DNN_REGULAR_FILTER) || (type == LIBXSMM_DNN_GRADIENT_FILTER) || (type == LIBXSMM_DNN_MASTER_FILTER) ) { + if ( ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) || ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) ) { + if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + layout->datatype = handle->desc.datatype; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + 
layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->bk; + layout->dim_size[1] = handle->bc; + layout->dim_size[2] = handle->Bc; + layout->dim_size[3] = handle->Bk; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + layout->datatype = handle->desc.datatype; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 5; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = handle->fm_lp_block; + layout->dim_size[1] = handle->bk; + layout->dim_size[2] = handle->bc/handle->fm_lp_block; + layout->dim_size[3] = handle->Bc; + layout->dim_size[4] = handle->Bk; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + + +LIBXSMM_API size_t libxsmm_dnn_optimizer_get_scratch_size(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status) { + size_t l_scratch_size = 0; + *status = 
LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +LIBXSMM_API void* libxsmm_dnn_optimizer_get_scratch_ptr(const libxsmm_dnn_optimizer* handle, libxsmm_dnn_err_t* status) +{ + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->scratch; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return 0; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_scratch(libxsmm_dnn_optimizer* handle, const void* scratch) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if (0 != handle) { + /* align the internal scratch buffer if needed */ + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_scratch(libxsmm_dnn_optimizer* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_bind_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = 
libxsmm_dnn_optimizer_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { + handle->master_filter = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_optimizer_get_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + return_tensor = handle->reg_filter; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + return_tensor = handle->grad_filter; + } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { + return_tensor = handle->master_filter; + } else { + /* cannot happen */ + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return return_tensor; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_release_tensor(libxsmm_dnn_optimizer* handle, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_FILTER) && (type != LIBXSMM_DNN_GRADIENT_FILTER) && (type != LIBXSMM_DNN_MASTER_FILTER) ) { + status = 
LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_FILTER ) { + handle->reg_filter = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_FILTER ) { + handle->grad_filter = 0; + } else if ( type == LIBXSMM_DNN_MASTER_FILTER ) { + handle->master_filter = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_optimizer_execute_st(libxsmm_dnn_optimizer* handle, /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + if (handle->desc.opt_type == LIBXSMM_DNN_OPTIMIZER_SGD) { + libxsmm_dnn_optimizer_sgd_st( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.c b/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.c new file mode 100644 index 00000000..b1532c24 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.c @@ -0,0 +1,103 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/
+#include "libxsmm_dnn_optimizer_sgd.h"
+#include "libxsmm_main.h"
+
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_f32_f32(libxsmm_dnn_optimizer* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_bf16_bf16(libxsmm_dnn_optimizer* handle, int start_thread, int tid);
+
+
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_f32_f32(libxsmm_dnn_optimizer* handle, int start_thread, int tid)
+{ /* F32 filters: expands the shared template with LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512 defined */
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef float element_filter_type; /* not used in this function itself; presumably consumed by the included template */
+
+# define LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512
+# include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c"
+# undef LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; /* built without AVX-512 intrinsics: report instead of computing */
+#endif
+  return status;
+}
+
+
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st_bf16_bf16(libxsmm_dnn_optimizer* handle, int start_thread, int tid)
+{ /* BF16 filters: expands the shared template with LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512 defined */
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef libxsmm_bfloat16 element_filter_type;
+  typedef float element_master_type; /* presumably the type of the master filter copy used by the template - confirm there */
+
+# define LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512
+# include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c"
+# undef LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; /* built without AVX-512 intrinsics: report instead of computing */
+#endif
+  return status;
+}
+
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st(libxsmm_dnn_optimizer* handle, int start_thread, int tid)
+{ /* entry point: validates the bound tensors, then picks a kernel by target architecture and datatype */
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* check if we have filter, grad_filter */
+  if ( handle->reg_filter == 0 || handle->grad_filter == 0 ) {
+    status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND;
+    return status;
+  }
+  if ( (handle->master_filter == 0) && (handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { /* the master filter is only mandatory for BF16 */
+    status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND;
+    return status;
+  }
+
+  /* check if we are on an AVX512 platform */
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { /* runtime check on top of the compile-time guard */
+    if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) {
+      status = libxsmm_dnn_optimizer_sgd_st_f32_f32( handle, start_thread, tid);
+    } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      status = libxsmm_dnn_optimizer_sgd_st_bf16_bf16( handle, start_thread, tid);
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+      return status;
+    }
+  } else
+#endif
+  { /* fallback: the same template expanded without any *_AVX512 macro */
+    if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) {
+      typedef float element_filter_type;
+
+# include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c"
+    } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      typedef libxsmm_bfloat16 element_filter_type;
+      typedef float element_master_type;
+
+# define LIBXSMM_DNN_OPTIMIZER_SGD_BF16
+# include "template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c"
+# undef LIBXSMM_DNN_OPTIMIZER_SGD_BF16
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+      return status;
+    }
+  }
+
+  return status;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.h b/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.h
new file mode 100644
index 00000000..7bc64fc8
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_optimizer_sgd.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file.
*
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_DNN_OPTIMIZER_SGD_H
+#define LIBXSMM_DNN_OPTIMIZER_SGD_H
+
+#include /* NOTE(review): the header name is missing (angle-bracket text appears to have been stripped); restore it from upstream */
+
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_optimizer_sgd_st(libxsmm_dnn_optimizer* handle, int start_thread, int tid);
+
+#endif /* LIBXSMM_DNN_OPTIMIZER_SGD_H */
diff --git a/third_party/libxsmm/src/libxsmm_dnn_pooling.c b/third_party/libxsmm/src/libxsmm_dnn_pooling.c
new file mode 100644
index 00000000..764663d4
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_pooling.c
@@ -0,0 +1,485 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/
+#include "libxsmm_dnn_pooling_backward.h"
+#include "libxsmm_dnn_pooling_forward.h"
+#include "libxsmm_main.h"
+
+
+/* Creates a pooling handle from a descriptor; only F32->F32 and BF16->BF16 are accepted.
+ * On success *status is LIBXSMM_DNN_SUCCESS and the caller owns the handle
+ * (release it with libxsmm_dnn_destroy_pooling). On failure NULL is returned and
+ * *status carries the reason. status itself is dereferenced unconditionally and must not be NULL. */
+LIBXSMM_API libxsmm_dnn_pooling* libxsmm_dnn_create_pooling(libxsmm_dnn_pooling_desc pooling_desc, libxsmm_dnn_err_t* status) {
+  libxsmm_dnn_pooling* handle = 0;
+  int lpb;
+
+  /* init libxsmm */
+  LIBXSMM_INIT
+
+  if ( ((pooling_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (pooling_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ||
+       ((pooling_desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (pooling_desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ) {
+    /* zero entire content; not only safer but also sets data and code pointers to NULL */
+    handle = (libxsmm_dnn_pooling*)calloc(1, sizeof(libxsmm_dnn_pooling));
+
+    if (0 != handle) {
+      *status = LIBXSMM_DNN_SUCCESS;
+      /* let's make the description persistent */
+      handle->desc = pooling_desc;
+      /* we need to compute the memory layout given the feature map count and datatypes */
+      *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C,
+                                                    &(handle->ifmblock), &(handle->ofmblock), &lpb,
+                                                    handle->desc.datatype_in, handle->desc.datatype_out );
+      if ( (LIBXSMM_DNN_SUCCESS != *status) || (0 >= handle->ifmblock) || (0 >= handle->ofmblock) ||
+           (0 >= handle->desc.u) || (0 >= handle->desc.v) ) {
+        /* block sizes and strides are divisors below: reject the descriptor instead of dividing by zero */
+        if (LIBXSMM_DNN_SUCCESS == *status) { *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; }
+        free(handle);
+        return 0;
+      }
+      /* compute the outer blocks */
+      handle->blocksifm = handle->desc.C / handle->ifmblock;
+      handle->blocksofm = handle->desc.C / handle->ofmblock;
+      /* setting ofh and ofw */
+      handle->ofh = (handle->desc.H + 2 * handle->desc.pad_h - handle->desc.R) / handle->desc.u + 1;
+      handle->ofw = (handle->desc.W + 2 * handle->desc.pad_w - handle->desc.S) / handle->desc.v + 1;
+      /* create barrier */
+      handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1);
+      /* calculate scratch size for local pooling copies of one feature map block per thread */
+      handle->scratch_size = (sizeof(float) * ( (size_t)handle->desc.H + (size_t)LIBXSMM_MAX(handle->desc.pad_h_in, handle->desc.pad_h_out)*2 )
+                                            * ( (size_t)handle->desc.W + (size_t)LIBXSMM_MAX(handle->desc.pad_w_in, handle->desc.pad_w_out)*2 )
+                                            * LIBXSMM_MAX( handle->ofmblock, handle->ifmblock )
+                                            * handle->desc.threads );
+    } else {
+      *status = LIBXSMM_DNN_ERR_CREATE_HANDLE;
+    }
+  } else {
+    *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+  }
+
+  return handle;
+}
+
+
+/* Releases the handle's barrier (if any) and the handle itself; a NULL handle yields LIBXSMM_DNN_ERR_INVALID_HANDLE. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_pooling(const libxsmm_dnn_pooling* handle) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != handle) {
+    /* Deallocate barrier */
+    if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); }
+    /* deallocate handle structure */
+    free(/*remove constness*/(libxsmm_dnn_pooling*)handle);
+  } else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return status;
+}
+
+
+/* Builds the data layout the handle expects for the given tensor type.
+ * The caller owns the returned layout. NULL is returned (and *status set) for a NULL handle,
+ * an unknown tensor type, an unsupported format or datatype, or a failed allocation.
+ * Note: the NHWC format has no entry for LIBXSMM_DNN_POOLING_MASK and reports it as unknown tensor type. */
+LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_pooling_create_tensor_datalayout(const libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) {
+  libxsmm_dnn_tensor_datalayout* layout;
+
+  *status = LIBXSMM_DNN_SUCCESS;
+  layout = 0;
+
+  if (handle != 0) {
+    /* zero entire content; not only safer but also sets data and code pointers to NULL */
+    layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout));
+
+    if (layout != 0) {
+      layout->format = handle->desc.buffer_format;
+
+      if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ||
+           (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ||
+           (type == LIBXSMM_DNN_POOLING_MASK) ) {
+        if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) {
+          if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) ) {
+            if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+              layout->datatype = handle->desc.datatype_mask;
+            } else {
+              layout->datatype = LIBXSMM_DNN_DATATYPE_F32;
+            }
+            layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype));
+            layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int));
+
+            if (0 != layout->dim_type && 0 != layout->dim_size) {
+              layout->num_dims = 5;
+              layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C;
+              layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W;
+              layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H;
+              layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C;
+              layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N;
+              if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) {
+                layout->dim_size[0] = handle->ifmblock;
+                layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in);
+                layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in);
+                layout->dim_size[3] = handle->blocksifm;
+                layout->dim_size[4] = handle->desc.N;
+              } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) {
+                layout->dim_size[0] = handle->ofmblock;
+                layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out);
+                layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out);
+                layout->dim_size[3] = handle->blocksofm;
+                layout->dim_size[4] = handle->desc.N;
+              } else if ( (type == LIBXSMM_DNN_POOLING_MASK) ) {
+                layout->dim_size[0] = handle->ofmblock;
+                layout->dim_size[1] = handle->ofw;
+                layout->dim_size[2] = handle->ofh;
+                layout->dim_size[3] = handle->blocksofm;
+                layout->dim_size[4] = handle->desc.N;
+              } else { /* coverity[dead_error_begin] */
+                free(layout->dim_type);
+                free(layout->dim_size);
+                free(layout);
+                layout = 0; /* make sure a NULL is returned */
+                *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+              }
+            } else {
+              free(layout->dim_type); /* free(NULL) is a no-op: releases whichever array was obtained */
+              free(layout->dim_size);
+              free(layout);
+              layout = 0; /* make sure a NULL is returned */
+              *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS;
+            }
+          } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) {
+            if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+              layout->datatype = handle->desc.datatype_mask;
+            } else {
+              layout->datatype = LIBXSMM_DNN_DATATYPE_BF16;
+            }
+
+            layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype));
+            layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int));
+            if (0 != layout->dim_type && 0 != layout->dim_size) {
+              layout->num_dims = 5;
+              layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C;
+              layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W;
+              layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H;
+              layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C;
+              layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_N;
+              if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) {
+                layout->dim_size[0] = handle->ifmblock;
+                layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in);
+                layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in);
+                layout->dim_size[3] = handle->blocksifm;
+                layout->dim_size[4] = handle->desc.N;
+              } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) {
+                layout->dim_size[0] = handle->ofmblock;
+                layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out);
+                layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out);
+                layout->dim_size[3] = handle->blocksofm;
+                layout->dim_size[4] = handle->desc.N;
+              } else if ( (type == LIBXSMM_DNN_POOLING_MASK) ) {
+                layout->dim_size[0] = handle->ofmblock;
+                layout->dim_size[1] = handle->ofw;
+                layout->dim_size[2] = handle->ofh;
+                layout->dim_size[3] = handle->blocksofm;
+                layout->dim_size[4] = handle->desc.N;
+              } else {
+                free(layout->dim_type);
+                free(layout->dim_size);
+                free(layout);
+                layout = 0; /* make sure a NULL is returned */
+                *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+              }
+            } else {
+              free(layout->dim_type); /* free(NULL) is a no-op: releases whichever array was obtained */
+              free(layout->dim_size);
+              free(layout);
+              layout = 0; /* make sure a NULL is returned */
+              *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS;
+            }
+          } else {
+            free(layout);
+            layout = 0; /* make sure a NULL is returned */
+            *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+          }
+        } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NHWC) > 0) {
+          if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) ||
+               ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) {
+            if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+              layout->datatype = handle->desc.datatype_mask;
+            } else {
+              layout->datatype = handle->desc.datatype_in;
+            }
+            layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype));
+            layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int));
+            if (0 != layout->dim_type && 0 != layout->dim_size) {
+              layout->num_dims = 4;
+              layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C;
+              layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_W;
+              layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_H;
+              layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N;
+              if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) ) {
+                layout->dim_size[0] = handle->desc.C;
+                layout->dim_size[1] = handle->desc.W + (2*handle->desc.pad_w_in);
+                layout->dim_size[2] = handle->desc.H + (2*handle->desc.pad_h_in);
+                layout->dim_size[3] = handle->desc.N;
+              } else if ( (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_GRADIENT_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) {
+                layout->dim_size[0] = handle->desc.C;
+                layout->dim_size[1] = (handle->ofw) + (2*handle->desc.pad_w_out);
+                layout->dim_size[2] = (handle->ofh) + (2*handle->desc.pad_h_out);
+                layout->dim_size[3] = handle->desc.N;
+              } else {
+                free(layout->dim_type);
+                free(layout->dim_size);
+                free(layout);
+                layout = 0; /* make sure a NULL is returned */
+                *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+              }
+            } else {
+              free(layout->dim_type); /* same failure handling as the blocked formats above */
+              free(layout->dim_size);
+              free(layout);
+              layout = 0; /* make sure a NULL is returned */
+              *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS;
+            }
+          } else {
+            free(layout);
+            layout = 0; /* make sure a NULL is returned */
+            *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+          }
+        } else {
+          free(layout);
+          layout = 0; /* make sure a NULL is returned */
+          *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL;
+        }
+      } else {
+        free(layout);
+        layout = 0; /* make sure a NULL is returned */
+        *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+      }
+    } else {
+      *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT;
+    }
+  }
+  else {
+    *status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return layout;
+}
+
+/* Returns the scratch size recorded at creation plus 64 bytes of alignment slack; 0 with *status set for a NULL handle. */
+LIBXSMM_API size_t libxsmm_dnn_pooling_get_scratch_size(const libxsmm_dnn_pooling* handle, libxsmm_dnn_err_t* status) {
+  size_t l_scratch_size = 0;
+  *status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != handle) {
+    l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */
+  } else {
+    *status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return l_scratch_size;
+}
+
+
+/* Stores the caller's scratch pointer in the handle, rounded up to the next multiple of 64 bytes; the memory stays owned by the caller. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_scratch(libxsmm_dnn_pooling* handle, const void* scratch) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+  uintptr_t address = (uintptr_t)scratch;
+  size_t offset = 0;
+
+  if (scratch == 0) {
+    status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED;
+    return status;
+  }
+
+  if (0 != handle) {
+    /* align the internal scratch buffer if needed */
+    if (address % 64 == 0) {
+      handle->scratch = (void*)address;
+    } else {
+      offset = (64 - address % 64);
+      handle->scratch = (void*)(address+offset);
+    }
+  } else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return status;
+}
+
+
+/* Clears the handle's scratch pointer; nothing is freed here. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_scratch(libxsmm_dnn_pooling* handle) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != handle) {
+    handle->scratch = 0;
+  } else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return status;
+}
+
+
+/* Binds a tensor to the handle slot selected by type. The tensor's layout is compared against the layout
+ * the handle expects for that type; anything but a comparison result of 0 yields LIBXSMM_DNN_ERR_MISMATCH_TENSOR. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_bind_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* check for tensor type */
+  if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) &&
+       (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) &&
+       (type != LIBXSMM_DNN_POOLING_MASK) ) {
+    status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+    return status;
+  }
+
+  if (handle != 0 && tensor != 0) {
+    libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_pooling_create_tensor_datalayout(handle, type, &status);
+
+    if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) {
+      if ( type == LIBXSMM_DNN_REGULAR_INPUT ) {
+        handle->reg_input = (libxsmm_dnn_tensor*)tensor;
+      } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) {
+        handle->grad_input = (libxsmm_dnn_tensor*)tensor;
+      } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) {
+        handle->reg_output = (libxsmm_dnn_tensor*)tensor;
+      } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) {
+        handle->grad_output = (libxsmm_dnn_tensor*)tensor;
+      } else if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+        handle->mask = (libxsmm_dnn_tensor*)tensor;
+      } else {
+        /* cannot happen */
+      }
+    } else {
+      status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR;
+    }
+
+    libxsmm_dnn_destroy_tensor_datalayout( handle_layout );
+  }
+  else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR;
+  }
+
+  return status;
+}
+
+
+/* Returns the tensor currently bound to the slot selected by type (NULL when the slot is empty or on error, see *status). */
+LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_pooling_get_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) {
+  libxsmm_dnn_tensor* return_tensor = 0;
+
+  *status = LIBXSMM_DNN_SUCCESS;
+
+  /* check for tensor type */
+  if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) &&
+       (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) &&
+       (type != LIBXSMM_DNN_POOLING_MASK) ) {
+    *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+    return return_tensor;
+  }
+
+  if (handle != 0) {
+    if ( type == LIBXSMM_DNN_REGULAR_INPUT ) {
+      return_tensor = handle->reg_input;
+    } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) {
+      return_tensor = handle->grad_input;
+    } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) {
+      return_tensor = handle->reg_output;
+    } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) {
+      return_tensor = handle->grad_output;
+    } else if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+      return_tensor = handle->mask;
+    } else {
+      /* cannot happen */
+    }
+  } else {
+    *status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return return_tensor;
+}
+
+
+/* Clears the slot selected by type; the tensor itself is not destroyed here. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_release_tensor(libxsmm_dnn_pooling* handle, const libxsmm_dnn_tensor_type type) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* check for tensor type */
+  if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) &&
+       (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_GRADIENT_OUTPUT) &&
+       (type != LIBXSMM_DNN_POOLING_MASK) ) {
+    status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE;
+    return status;
+  }
+
+  if (handle != 0) {
+    if ( type == LIBXSMM_DNN_REGULAR_INPUT ) {
+      handle->reg_input = 0;
+    } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) {
+      handle->grad_input = 0;
+    } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) {
+      handle->reg_output = 0;
+    } else if ( type == LIBXSMM_DNN_GRADIENT_OUTPUT ) {
+      handle->grad_output = 0;
+    } else if ( type == LIBXSMM_DNN_POOLING_MASK ) {
+      handle->mask = 0;
+    } else {
+      /* cannot happen */
+    }
+  } else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return status;
+}
+
+
+/* Runs the forward or backward pass selected by kind, forwarding start_thread and tid to the kernel.
+ * Only LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM is dispatched; any other buffer format is reported as an invalid format. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_pooling_execute_st(libxsmm_dnn_pooling* handle, libxsmm_dnn_compute_kind kind,
+  /*unsigned*/int start_thread, /*unsigned*/int tid) {
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != handle) {
+    switch (kind) {
+      case LIBXSMM_DNN_COMPUTE_KIND_FWD: {
+        switch (handle->desc.buffer_format) {
+          case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: {
+            status = libxsmm_dnn_pooling_st_fwd_custom( handle, start_thread, tid );
+          } break;
+          default: {
+            status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN;
+          }
+        }
+      } break;
+      case LIBXSMM_DNN_COMPUTE_KIND_BWD: {
+        switch (handle->desc.buffer_format) {
+          case LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM: {
+            status = libxsmm_dnn_pooling_st_bwd_custom( handle, start_thread, tid );
+          } break;
+          default: {
+            status = LIBXSMM_DNN_ERR_INVALID_FORMAT_FUSEDBN;
+          }
+        }
+      } break;
+      default: {
+        status = LIBXSMM_DNN_ERR_INVALID_KIND;
+      }
+    }
+  }
+  else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+
+  return status;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.c b/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.c
new file mode 100644
index 00000000..6cffe9c8
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.c
@@ -0,0 +1,301 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/ +#include "libxsmm_dnn_pooling_backward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); 
LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include 
"template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + +# define LIBXSMM_DNN_POOLING_BWD_BF16 + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +# undef LIBXSMM_DNN_POOLING_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + +# define LIBXSMM_DNN_POOLING_BWD_BF16 + if ( 
handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +# undef LIBXSMM_DNN_POOLING_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + +# define LIBXSMM_DNN_POOLING_BWD_BF16 + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } +# undef LIBXSMM_DNN_POOLING_BWD_BF16 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t 
libxsmm_dnn_pooling_st_bwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and mask */ + if ( handle->grad_input == 0 || handle->grad_output == 0 || + ( (handle->mask == 0) && (handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX) ) ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 16) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c16( handle, start_thread, tid); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c16( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 32) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c32( handle, start_thread, tid); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c32( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) && + (handle->ofmblock == 64) ) { + if (handle->desc.datatype_in == 
LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_f32_f32_c64( handle, start_thread, tid); + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + LIBXSMM_ASSERT(NULL != handle->mask); + status = libxsmm_dnn_pooling_st_bwd_custom_bf16_bf16_c64( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_AVG + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING; + } + } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + +# define LIBXSMM_DNN_POOLING_BWD_BF16 + if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) { +# define LIBXSMM_DNN_POOLING_BWD_MAX + typedef int element_mask_type; +# include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" +# undef LIBXSMM_DNN_POOLING_BWD_MAX + } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) { +# define LIBXSMM_DNN_POOLING_BWD_AVG +# include "template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c" +# undef 
LIBXSMM_DNN_POOLING_BWD_AVG
+      } else {
+        status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+      }
+# undef LIBXSMM_DNN_POOLING_BWD_BF16
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+      return status;
+    }
+  }
+
+  return status;
+}
+
+/* Backward pooling for NHWC tensors is not provided: this stub ignores all
+ * of its arguments and always reports LIBXSMM_DNN_ERR_NOT_IMPLEMENTED. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  /* silence unused-parameter diagnostics */
+  LIBXSMM_UNUSED( handle );
+  LIBXSMM_UNUSED( start_thread );
+  LIBXSMM_UNUSED( tid );
+  return LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.h b/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.h
new file mode 100644
index 00000000..ce08683d
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_pooling_backward.h
@@ -0,0 +1,20 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_DNN_POOLING_BACKWARD_H
+#define LIBXSMM_DNN_POOLING_BACKWARD_H
+
+/* NOTE(review): the header name was missing after #include in this copy of
+ * the patch; restored as the public pooling header - confirm against upstream. */
+#include <libxsmm_dnn_pooling.h>
+
+/* Backward pooling for the custom tensor format. Returns
+ * LIBXSMM_DNN_ERR_DATA_NOT_BOUND when grad_input or grad_output is not bound,
+ * or when max pooling is requested without a bound mask. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+
+/* Backward pooling for NHWC tensors; always returns LIBXSMM_DNN_ERR_NOT_IMPLEMENTED. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_bwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+
+#endif /* LIBXSMM_DNN_POOLING_BACKWARD_H */
diff --git a/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.c b/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.c
new file mode 100644
index 00000000..dc2a16d9
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.c
@@ -0,0 +1,301 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#include "libxsmm_dnn_pooling_forward.h"
+#include "libxsmm_main.h"
+
+
+/* AVX-512 forward kernels: one per data type (f32 or bf16 for both input and
+ * output) and per channel-block size selected by the dispatcher (16/32/64). */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+
+
+/* Each kernel below textually includes a shared template. The typedefs and
+ * the LIBXSMM_DNN_POOLING_FWD_{MAX,AVG,BF16} macros are the template's
+ * parameters, so their names and ordering must not be changed.
+ * The mask element type is only defined for max pooling.
+ * Without LIBXSMM_INTRINSICS_AVX512 every kernel returns
+ * LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; an unknown pooling type yields
+ * LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING. */
+
+/* f32 input/output, c16 template */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef float element_input_type;
+  typedef float element_output_type;
+
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* f32 input/output, c32 template */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef float element_input_type;
+  typedef float element_output_type;
+
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* f32 input/output, c64 template */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef float element_input_type;
+  typedef float element_output_type;
+
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* bf16 input/output, c16 template (LIBXSMM_DNN_POOLING_FWD_BF16 is defined
+ * around both template inclusions) */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef libxsmm_bfloat16 element_input_type;
+  typedef libxsmm_bfloat16 element_output_type;
+
+# define LIBXSMM_DNN_POOLING_FWD_BF16
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+# undef LIBXSMM_DNN_POOLING_FWD_BF16
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* bf16 input/output, c32 template */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef libxsmm_bfloat16 element_input_type;
+  typedef libxsmm_bfloat16 element_output_type;
+
+# define LIBXSMM_DNN_POOLING_FWD_BF16
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+# undef LIBXSMM_DNN_POOLING_FWD_BF16
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* bf16 input/output, c64 template */
+LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512)
+libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  typedef libxsmm_bfloat16 element_input_type;
+  typedef libxsmm_bfloat16 element_output_type;
+
+# define LIBXSMM_DNN_POOLING_FWD_BF16
+  if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+    typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+  } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+  } else {
+    status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+  }
+# undef LIBXSMM_DNN_POOLING_FWD_BF16
+#else /* should not happen */
+  LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid);
+  status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH;
+#endif
+  return status;
+}
+
+
+/* Forward pooling entry point for the custom tensor format.
+ *
+ * Dispatches to an AVX-512 kernel when the code was built with
+ * LIBXSMM_INTRINSICS_AVX512, the target architecture is at least
+ * LIBXSMM_X86_AVX512 and handle->ofmblock is 16, 32 or 64; otherwise the
+ * generic template is instantiated in place.
+ *
+ * Returns LIBXSMM_DNN_ERR_DATA_NOT_BOUND when reg_input or reg_output is not
+ * bound, or when max pooling is requested without a bound mask;
+ * LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE unless input and output are both F32
+ * or both BF16; LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING for pooling types other
+ * than MAX and AVG. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* check if we have input, output and mask */
+  if ( handle->reg_input == 0 || handle->reg_output == 0 ||
+       ( (handle->mask == 0) && (handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX) ) ) {
+    status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND;
+    return status;
+  }
+  /* The mask is only mandatory for max pooling (see the check above); the
+   * former unconditional "mask != NULL" assertions in the AVX-512 branches
+   * contradicted that and would fire for average pooling without a mask. */
+  LIBXSMM_ASSERT((NULL != handle->mask) || (LIBXSMM_DNN_POOLING_MAX != handle->desc.pooling_type));
+
+  /* check if we are on an AVX512 platform */
+#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/
+  if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) &&
+       (handle->ofmblock == 16) ) {
+    if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c16( handle, start_thread, tid);
+    } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c16( handle, start_thread, tid);
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+    }
+  } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) &&
+       (handle->ofmblock == 32) ) {
+    if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c32( handle, start_thread, tid);
+    } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c32( handle, start_thread, tid);
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+    }
+  } else if ( ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) &&
+       (handle->ofmblock == 64) ) {
+    if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_f32_f32_c64( handle, start_thread, tid);
+    } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      status = libxsmm_dnn_pooling_st_fwd_custom_bf16_bf16_c64( handle, start_thread, tid);
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+    }
+  } else
+#endif
+  {
+    /* generic fallback: the template is instantiated inline, parameterized
+     * by the typedefs and LIBXSMM_DNN_POOLING_FWD_* macros below */
+    if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) {
+      typedef float element_input_type;
+      typedef float element_output_type;
+
+      if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+        typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+      } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+      } else {
+        status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+      }
+    } else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) {
+      typedef libxsmm_bfloat16 element_input_type;
+      typedef libxsmm_bfloat16 element_output_type;
+
+# define LIBXSMM_DNN_POOLING_FWD_BF16
+      if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_MAX ) {
+# define LIBXSMM_DNN_POOLING_FWD_MAX
+        typedef int element_mask_type;
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_MAX
+      } else if ( handle->desc.pooling_type == LIBXSMM_DNN_POOLING_AVG ) {
+# define LIBXSMM_DNN_POOLING_FWD_AVG
+# include "template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c"
+# undef LIBXSMM_DNN_POOLING_FWD_AVG
+      } else {
+        status = LIBXSMM_DNN_ERR_UNSUPPORTED_POOLING;
+      }
+# undef LIBXSMM_DNN_POOLING_FWD_BF16
+    } else {
+      status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+    }
+  }
+
+  return status;
+}
+
+
+/* Forward pooling for NHWC tensors is not provided: all arguments are
+ * ignored and LIBXSMM_DNN_ERR_NOT_IMPLEMENTED is returned. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED;
+  LIBXSMM_UNUSED( handle );
+  LIBXSMM_UNUSED( start_thread );
+  LIBXSMM_UNUSED( tid );
+  return status;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.h b/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.h
new file mode 100644
index 00000000..e7eb4322
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_pooling_forward.h
@@ -0,0 +1,20 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_DNN_POOLING_FORWARD_H
+#define LIBXSMM_DNN_POOLING_FORWARD_H
+
+/* NOTE(review): the header name was missing after #include in this copy of
+ * the patch; restored as the public pooling header - confirm against upstream. */
+#include <libxsmm_dnn_pooling.h>
+
+/* Forward pooling for the custom tensor format. Returns
+ * LIBXSMM_DNN_ERR_DATA_NOT_BOUND when reg_input or reg_output is not bound,
+ * or when max pooling is requested without a bound mask. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_custom(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+
+/* Forward pooling for NHWC tensors; always returns LIBXSMM_DNN_ERR_NOT_IMPLEMENTED. */
+LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_pooling_st_fwd_nhwc(libxsmm_dnn_pooling* handle, int start_thread, int tid);
+
+#endif /* LIBXSMM_DNN_POOLING_FORWARD_H */
diff --git a/third_party/libxsmm/src/libxsmm_dnn_rnncell.c b/third_party/libxsmm/src/libxsmm_dnn_rnncell.c
new file mode 100644
index 00000000..ad3fa5b6
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_dnn_rnncell.c
@@ -0,0 +1,2357 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke, Evangelos Georganas, Kunal Banerjee (Intel Corp.)
+******************************************************************************/
+#include "libxsmm_dnn_rnncell_forward.h"
+#include "libxsmm_dnn_rnncell_backward_weight_update.h"
+#include "libxsmm_dnn_elementwise.h"
+#include "libxsmm_main.h"
+
+#if defined(LIBXSMM_OFFLOAD_TARGET)
+# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
+#endif
+#include /* NOTE(review): header name is missing in this copy of the patch - restore from upstream */
+#if defined(LIBXSMM_OFFLOAD_TARGET)
+# pragma offload_attribute(pop)
+#endif
+
+/* Creates an RNN cell handle from the given descriptor.
+ *
+ * rnncell_desc: cell description; bk/bn/bc of 0 select a default block of 64.
+ * status:       must be non-NULL; receives the result code.
+ *
+ * Returns the new handle (release it with libxsmm_dnn_destroy_rnncell) or
+ * NULL on failure. *status is set to
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE if input/output types differ, are
+ *    neither F32 nor BF16, or BF16 is requested on a target below
+ *    LIBXSMM_X86_AVX512_CORE or with C or K not a multiple of 16;
+ *  - LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL if max_T < 1;
+ *  - LIBXSMM_DNN_ERR_CREATE_HANDLE if BF16 is requested with a non-positive
+ *    fwd_block or bwdupd_block, or if allocating the handle or its barrier
+ *    fails;
+ *  - one of the LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_{N,C,K}_BLOCKING warnings if
+ *    a block size does not divide its dimension (the block then falls back
+ *    to the full dimension; only the last warning raised is reported);
+ *  - LIBXSMM_DNN_SUCCESS otherwise. */
+LIBXSMM_API libxsmm_dnn_rnncell* libxsmm_dnn_create_rnncell(libxsmm_dnn_rnncell_desc rnncell_desc, libxsmm_dnn_err_t* status)
+{
+  libxsmm_dnn_rnncell* handle = 0;
+
+  /* init libxsmm */
+  LIBXSMM_INIT
+
+  /* some check we can do before allocating the handle */
+  if ( (rnncell_desc.datatype_in != rnncell_desc.datatype_out) ||
+       ( (rnncell_desc.datatype_in != LIBXSMM_DNN_DATATYPE_BF16) && (rnncell_desc.datatype_in != LIBXSMM_DNN_DATATYPE_F32) ) ) {
+    *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+    return NULL;
+  }
+  /* let's do some simple checks for BF16 as this limits the cell and architecture */
+  if ( (rnncell_desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) || (rnncell_desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) {
+    if ( (LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid) || (rnncell_desc.C % 16 != 0) || (rnncell_desc.K % 16 != 0) ) {
+      *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+      return NULL;
+    }
+    /* the BF16 kernel setup below divides the block counts by these factors;
+     * reject non-positive values instead of dividing by zero */
+    if ( (rnncell_desc.fwd_block < 1) || (rnncell_desc.bwdupd_block < 1) ) {
+      *status = LIBXSMM_DNN_ERR_CREATE_HANDLE;
+      return NULL;
+    }
+  }
+  /* we need at least one timestep */
+  if (rnncell_desc.max_T < 1) {
+    *status = LIBXSMM_DNN_ERR_TIME_STEPS_TOO_SMALL;
+    return NULL;
+  }
+
+  /* zero entire content; not only safer but also sets data and code pointers to NULL */
+  handle = (libxsmm_dnn_rnncell*)calloc(1, sizeof(libxsmm_dnn_rnncell));
+  if (NULL != handle) {
+    *status = LIBXSMM_DNN_SUCCESS;
+    /* initialize known handle components */
+    handle->desc = rnncell_desc;
+    /* set current seq length to max length */
+    handle->T = rnncell_desc.max_T;
+    /* set blocking factors */
+    handle->bk = (handle->desc.bk == 0) ? 64 : handle->desc.bk;
+    handle->bn = (handle->desc.bn == 0) ? 64 : handle->desc.bn;
+    handle->bc = (handle->desc.bc == 0) ? 64 : handle->desc.bc;
+    handle->use_fwd_fused_impl = handle->desc.use_fwd_fused_impl;
+    handle->fwd_block = handle->desc.fwd_block;
+    handle->bwdupd_block = handle->desc.bwdupd_block;
+    if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) {
+      handle->lpb = 2;
+    } else {
+      handle->lpb = 1;
+    }
+    /* validate blocking factors */
+    if ( handle->desc.N % handle->bn != 0 ) {
+      handle->bn = handle->desc.N;
+      *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_N_BLOCKING;
+    }
+    if ( handle->desc.C % handle->bc != 0 ) {
+      handle->bc = handle->desc.C;
+      *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_C_BLOCKING;
+    }
+    if ( handle->desc.K % handle->bk != 0 ) {
+      handle->bk = handle->desc.K;
+      *status = LIBXSMM_DNN_WARN_RNN_SUBOPTIMAL_K_BLOCKING;
+    }
+
+    /* If in SPR, generate tilerelease kernel */
+    if ((libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+      int l_tr_flags = LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') );
+      handle->tilerelease_kernel = libxsmm_bsmmdispatch(handle->bk, handle->bk, handle->bk, NULL, NULL, NULL, NULL, NULL, &l_tr_flags, NULL);
+    }
+
+    /* In case of BF16 for now hoist the BRGEMM and make them to use STRIDED variant by default */
+    if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) {
+      libxsmm_blasint BF, CB_BLOCKS, KB_BLOCKS;
+      const libxsmm_blasint K = handle->desc.K;
+      const libxsmm_blasint N = handle->desc.N;
+      const libxsmm_blasint C = handle->desc.C;
+      const libxsmm_blasint bk = handle->bk;
+      const libxsmm_blasint bn = handle->bn;
+      const libxsmm_blasint bc = handle->bc;
+      const libxsmm_blasint cBlocks = C/bc;
+      const libxsmm_blasint kBlocks = K/bk;
+      const libxsmm_blasint nBlocks = N/bn;
+      int tc_flags = 0;
+      int kernel_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N');
+      int stride_a, stride_b;
+
+      /* NOTE(review): this and the tileconfig checks below test "== SPR" while
+       * the tilerelease kernel above is generated for ">= SPR" - confirm
+       * which one is intended */
+      if ((libxsmm_target_archid == LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+        kernel_flags = ((handle->bk % 32 == 0) && (handle->bc % 32 == 0) && (handle->bn % 32 == 0)) ? LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG : 0;
+        kernel_flags = kernel_flags | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') );
+        tc_flags = LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG | ( LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N') );
+      }
+
+      /* The reduction-domain blocking is taken from the descriptor; the
+       * size-based heuristic that used to precede this assignment was always
+       * overwritten and has been removed. fwd_block >= 1 is guaranteed by
+       * the check at the top of this function. */
+      BF = handle->fwd_block;
+
+      if (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) {
+        CB_BLOCKS = cBlocks/BF;
+        KB_BLOCKS = kBlocks/BF;
+
+        /* define batch-reduce gemm kernels */
+        stride_a = bc * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bc * bn * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->fwd_kernela = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bc, stride_a, stride_b, CB_BLOCKS, &bk, &bc, &bk, NULL, NULL, &kernel_flags, NULL );
+        stride_a = bk * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bk * bn * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->fwd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &bk, &bk, NULL, NULL, &kernel_flags, NULL );
+        if ((libxsmm_target_archid == LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+          handle->fwd_tileconfig = libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL );
+        }
+
+        /* bwdupd_block >= 1 is guaranteed by the check at the top */
+        BF = handle->bwdupd_block;
+        KB_BLOCKS = kBlocks/BF;
+
+        stride_a = bc * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bk * bn * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernela = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bc, bn, bk, stride_a, stride_b, KB_BLOCKS, &bc, &bk, &bc, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bk, bn, stride_a, stride_b, nBlocks, &bk, &bn, &bk, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bn * bc * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernelc = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bc, bn, stride_a, stride_b, nBlocks, &bk, &bn, &bk, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bk * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kerneld = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &bk, &bk, NULL, NULL, &kernel_flags, NULL);
+        if ((libxsmm_target_archid == LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+          handle->bwdupd_tileconfig = libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL);
+        }
+      } else {
+        CB_BLOCKS = cBlocks/BF;
+        KB_BLOCKS = kBlocks/BF;
+
+        /* define batch-reduce gemm kernels */
+        stride_a = bc * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bc * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->fwd_kernela = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bc, stride_a, stride_b, CB_BLOCKS, &bk, &C, &K, NULL, NULL, &kernel_flags, NULL );
+        stride_a = bk * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->fwd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL );
+        if ((libxsmm_target_archid == LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+          handle->fwd_tileconfig = libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL );
+        }
+
+        /* bwdupd_block >= 1 is guaranteed by the check at the top */
+        BF = handle->bwdupd_block;
+        KB_BLOCKS = kBlocks/BF;
+
+        stride_a = bc * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernela = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bc, bn, bk, stride_a, stride_b, KB_BLOCKS, &bc, &K, &C, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bn * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernelb = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bk, bn, stride_a, stride_b, nBlocks, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bn * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bn * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kernelc = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bc, bn, stride_a, stride_b, nBlocks, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);
+        stride_a = bk * bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        stride_b = bk * libxsmm_dnn_typesize(handle->desc.datatype_in);
+        handle->bwdupd_kerneld = libxsmm_bsmmdispatch_reducebatch_strd_unroll( bk, bn, bk, stride_a, stride_b, KB_BLOCKS, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL);
+        if ((libxsmm_target_archid == LIBXSMM_X86_AVX512_SPR) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT)) {
+          handle->bwdupd_tileconfig = libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL);
+        }
+      }
+    }
+
+    /* Need to allocate space for scratch libxsmm_dnn_tensor's, let's set all pointers to zero */
+    handle->internal_z = 0;
+    handle->scratch_wT = 0;
+    handle->scratch_rT = 0;
+    handle->scratch_xT = 0;
+    handle->scratch_hT = 0;
+    handle->scratch_deltat = 0;
+    handle->scratch_di = 0;
+    handle->scratch_df = 0;
+    handle->scratch_do = 0;
+    handle->scratch_dci = 0;
+    handle->scratch_diB = 0;
+    handle->scratch_dfB = 0;
+    handle->scratch_dpB = 0;
+    handle->scratch_dciB = 0;
+    /* initialize a high-performant barrier */
+    handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1);
+    if (NULL == handle->barrier)
+    {
+      *status = LIBXSMM_DNN_ERR_CREATE_HANDLE;
+      free(handle);
+      return NULL;
+    }
+  } else {
+    *status = LIBXSMM_DNN_ERR_CREATE_HANDLE;
+  }
+  return handle;
+}
+
+
+/* Releases a handle created by libxsmm_dnn_create_rnncell: the barrier (if
+ * any) is released, then the handle itself is freed.
+ * Returns LIBXSMM_DNN_ERR_INVALID_HANDLE for a NULL handle and
+ * LIBXSMM_DNN_SUCCESS otherwise. */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_rnncell(const libxsmm_dnn_rnncell* handle)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+  if (0 != handle) {
+    /* Deallocate barrier */
+    if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); }
+    /* deallocate handle structure */
+    free(/*remove constness*/(libxsmm_dnn_rnncell*)handle);
+  } else {
+    status = LIBXSMM_DNN_ERR_INVALID_HANDLE;
+  }
+  return status;
+}
+
+
+LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_rnncell_create_tensor_datalayout(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status)
+{
+  libxsmm_dnn_tensor_datalayout* layout;
+  *status = LIBXSMM_DNN_SUCCESS;
+  layout = 0;
+  if (handle != 0) {
+    /* zero entire content; not only safer but also sets data and code pointers to NULL */
+    layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout));
+    if (layout != 0) {
+      if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) ||
+           (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) ||
+           (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == 
LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || + (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || + (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || + (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type == LIBXSMM_DNN_RNN_INTERNAL_F) || + (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || + (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + layout->format = handle->desc.buffer_format; + layout->tensor_type = LIBXSMM_DNN_ACTIVATION; + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; + layout->dim_size[0] = (unsigned int)handle->bc; + layout->dim_size[1] = (unsigned int)handle->bn; + layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); + layout->dim_size[4] = (unsigned int)handle->desc.max_T; + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) || + (type == 
LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || + (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || + (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || + (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type == LIBXSMM_DNN_RNN_INTERNAL_F) || + (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || + (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bn; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.N / handle->bn); + layout->dim_size[4] = (unsigned int)handle->desc.max_T; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NC) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = 
(unsigned int*) malloc(3*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 3; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_RNN_GRADIENT_INPUT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; + layout->dim_size[0] = (unsigned int)handle->desc.C; + layout->dim_size[1] = (unsigned int)handle->desc.N; + layout->dim_size[2] = (unsigned int)handle->desc.max_T; + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) || + (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) || + (type == LIBXSMM_DNN_RNN_REGULAR_CS) || (type == LIBXSMM_DNN_RNN_GRADIENT_CS) || + (type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) || (type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) || + (type == LIBXSMM_DNN_RNN_INTERNAL_I) || (type == LIBXSMM_DNN_RNN_INTERNAL_F) || + (type == LIBXSMM_DNN_RNN_INTERNAL_O) || (type == LIBXSMM_DNN_RNN_INTERNAL_CI) || + (type == LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_T; + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)handle->desc.N; + layout->dim_size[2] = (unsigned int)handle->desc.max_T; + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + 
layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) || + (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->format = handle->desc.filter_format; + layout->tensor_type = LIBXSMM_DNN_FILTER; + if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = handle->desc.datatype_in; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bc; + layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[4] = 4; + } else { + layout->dim_size[4] = 3; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[4] = 4; + } else { + layout->dim_size[4] = 3; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bc; + layout->dim_size[2] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + 
layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = handle->desc.datatype_in; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 6; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->bc / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[5] = 4; + } 
else { + layout->dim_size[5] = 3; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[5] = 4; + } else { + layout->dim_size[5] = 3; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + 
layout->dim_size[2] = (unsigned int)(handle->bc / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CK) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + + if ( 
(type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); + layout->dim_size[1] = (unsigned int)handle->desc.C; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); + layout->dim_size[1] = (unsigned int)handle->desc.C; + } else { + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)handle->desc.C; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) || (type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); + layout->dim_size[1] = (unsigned int)handle->desc.K; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); + layout->dim_size[1] = (unsigned int)handle->desc.K; + } else { + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)handle->desc.K; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == 
LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) || (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->format = handle->desc.filter_format; + layout->tensor_type = LIBXSMM_DNN_FILTER; + if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) > 0) { + if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32) ) { + layout->datatype = handle->desc.datatype_in; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->bc; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[4] = 4; + } else { + layout->dim_size[4] = 3; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = 
(unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[4] = 4; + } else { + layout->dim_size[4] = 3; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 4; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = (unsigned int)handle->bc; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.C / handle->bc); + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->bk; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is 
returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else if ( (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ) { + layout->datatype = handle->desc.datatype_in; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM || handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(6*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(6*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 6; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bc; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.C / handle->bc); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[5] = 4; + } else { + layout->dim_size[5] = 3; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[5] = LIBXSMM_DNN_TENSOR_DIMTYPE_X; + 
layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[5] = 4; + } else { + layout->dim_size[5] = 3; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(5*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(5*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 5; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bc; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.C / handle->bc); + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[4] = 
LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_size[0] = (unsigned int)handle->lpb; + layout->dim_size[1] = (unsigned int)handle->bk; + layout->dim_size[2] = (unsigned int)(handle->bk / handle->lpb); + layout->dim_size[3] = (unsigned int)(handle->desc.K / handle->bk); + layout->dim_size[4] = (unsigned int)(handle->desc.K / handle->bk); + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else if ((handle->desc.filter_format & LIBXSMM_DNN_TENSOR_FORMAT_CK) > 0) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(2*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(2*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 2; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[0] = (unsigned int)handle->desc.C; + layout->dim_size[1] = (unsigned int)(handle->desc.K * 4); + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_size[0] = (unsigned int)handle->desc.C; + layout->dim_size[1] = (unsigned int)(handle->desc.K * 3); + } else { + layout->dim_size[0] = 
(unsigned int)handle->desc.C; + layout->dim_size[1] = (unsigned int)handle->desc.K; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)(handle->desc.K * 4); + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)(handle->desc.K * 3); + } else { + layout->dim_size[0] = (unsigned int)handle->desc.K; + layout->dim_size[1] = (unsigned int)handle->desc.K; + } + } else { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( (type == LIBXSMM_DNN_RNN_REGULAR_BIAS) || (type == LIBXSMM_DNN_RNN_GRADIENT_BIAS) ) { + layout->format = handle->desc.buffer_format; + layout->tensor_type = LIBXSMM_DNN_CHANNEL_SCALAR; + + + if ( ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NC) > 0) || ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) ) { + if ( ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32)) || ((handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16) && (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16)) ) { + layout->datatype = handle->desc.datatype_in; + layout->dim_type = 
(libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { /* TODO: handle the error */ + layout->num_dims = 1; + + if ( (type == LIBXSMM_DNN_RNN_REGULAR_BIAS) || (type == LIBXSMM_DNN_RNN_GRADIENT_BIAS) ) { + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_K; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 4); + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + layout->dim_size[0] = (unsigned int)(handle->desc.K * 3); + } else { + layout->dim_size[0] = (unsigned int)handle->desc.K; + } + } else { /* coverity[dead_error_begin] */ + free(layout->dim_type); + free(layout->dim_size); + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + return layout; +} + +/* libxsmm_dnn_rnncell_get_scratch_size: returns the number of bytes of scratch memory required by `handle` for compute kind `kind`, summed per sub-buffer according to handle->desc.cell_type. The status output is first set to LIBXSMM_DNN_SUCCESS (it is dereferenced unconditionally, so it must be a valid pointer) and then to LIBXSMM_DNN_ERR_INVALID_HANDLE for a NULL handle, LIBXSMM_DNN_ERR_INVALID_RNN_TYPE for an unhandled cell type, or LIBXSMM_DNN_ERR_INVALID_KIND for an unhandled compute kind; in those error cases nothing is accumulated and 0 is returned. Every sub-buffer term carries 64 extra bytes. NOTE(review): presumably alignment slack, since libxsmm_dnn_rnncell_bind_scratch rounds each sub-buffer pointer up to a multiple of 64 - confirm. */ +LIBXSMM_API size_t libxsmm_dnn_rnncell_get_scratch_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status) +{ + size_t size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + const size_t typesize_in = libxsmm_dnn_typesize(handle->desc.datatype_in); + const size_t dwdr_typesize = (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ?
sizeof(float) : typesize_in;/* element size used for the w, r and deltat terms of the LSTM/GRU backward/update cases: sizeof(float) when the output datatype is BF16, otherwise the input element size */ + + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + size += 0;/* forward-only on the plain RNN cells contributes no scratch */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in + 64; /* wT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in + 64; /* rT */ + size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) * (size_t)handle->desc.max_T + 64; /* deltat */ + + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: {/* LSTM: the C*K and K*K sized terms (and their 64-byte padding) are multiplied by 4 */ + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* w */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* r */ + /* The scratches below are needed only for BF16 code for the intermediate results */ + if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { + size += (size_t)7 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64); /* intermediate scratches */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; /* intermediate scratches */ + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + size += (size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize * 4 + 4 * 64; /* w */
+ size += (size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize * 4 + 4 * 64; /* r */ + size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* wT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 4 + 4 * 64; /* rT */ + size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* deltat */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* di */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dci */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dpB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dciB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t1 */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ + /* The scratches below are needed only for BF16 code for the intermediate results */ + if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { + size += (size_t)4
*((size_t)handle->desc.K * sizeof(float) + 64); /* intermediate db scratch */ + size += (size_t)handle->desc.C * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; /* intermediate dx scratches */ + size += (size_t)7 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64); /* intermediate scratches */ + size += (size_t)2 *((size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64); /* intermediate scratches */ + } + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: {/* GRU: same structure as the LSTM case, but the C*K and K*K sized terms are multiplied by 3 and there is no BF16-only extra scratch */ + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* w */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* r */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + size += (size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize * 3 + 3 * 64; /* w */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize * 3 + 3 * 64; /* r */ + size += (size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* wT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in * 3 + 3 * 64; /* rT */ + size += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; /* xT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* hT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; /* deltat */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* di */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dc */ + size += (size_t)handle->desc.K *
(size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* df */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* do */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* diB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dcB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* dfB */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* oT */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t1 */ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; /* t2 */ + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return size; +} + +/* libxsmm_dnn_rnncell_get_scratch_ptr: returns handle->scratch_base for a non-NULL handle. For a NULL handle it returns NULL and sets the status output to LIBXSMM_DNN_ERR_INVALID_HANDLE. The status output is set to LIBXSMM_DNN_SUCCESS before the handle check, so it is dereferenced unconditionally and must be a valid pointer. */ +LIBXSMM_API void* libxsmm_dnn_rnncell_get_scratch_ptr(const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status) +{ + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->scratch_base; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return NULL; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* scratch) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (NULL != handle) { + const size_t typesize_in = libxsmm_dnn_typesize(handle->desc.datatype_in); + const size_t dwdr_typesize = (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) ?
sizeof(float) : typesize_in; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + /* forward only has no scratch need */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + handle->scratch_base = (void*)address; + /* wT */ + if (address % 64 == 0) { + handle->scratch_wT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_wT = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) + 64; + /* rT */ + if (address % 64 == 0) { + handle->scratch_rT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_rT = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) + 64; + /* xT */ + if (address % 64 == 0) { + handle->scratch_xT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_xT = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in) + 64; + /* hT */ + if (address % 64 == 0) { + handle->scratch_hT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_hT = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out)) + 64; + /* deltat */ + if (address % 64 == 0) { + handle->scratch_deltat = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_deltat = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) * 
(size_t)handle->desc.max_T) + 64; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + handle->scratch_base = (void*)address; + /* w scratch */ + if (address % 64 == 0) { + handle->scratch_w = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_w = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 4 + 64; + /* r scratch */ + if (address % 64 == 0) { + handle->scratch_r = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_r = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 4 + 64; + /* The scratches below are needed only for BF16 code for the intermediate results */ + if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { + /* cst scratch */ + if (address % 64 == 0) { + handle->cst_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cst_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ht scratch */ + if (address % 64 == 0) { + handle->ht_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->ht_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* it scratch */ + if (address % 64 == 0) { + handle->it_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->it_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ft scratch */ + if (address % 64 == 0) { + handle->ft_scratch = (void*)address; + } else { + 
offset = (64 - address % 64); + handle->ft_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ot scratch */ + if (address % 64 == 0) { + handle->ot_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->ot_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* cit scratch */ + if (address % 64 == 0) { + handle->cit_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cit_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* cot scratch */ + if (address % 64 == 0) { + handle->cot_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cot_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* csp scratch */ + if (address % 64 == 0) { + handle->csp_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->csp_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + handle->scratch_base = (void*)address; + /* w scratch */ + if (address % 64 == 0) { + handle->scratch_w = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_w = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize) * 4 + 64; + /* r scratch */ + if (address % 64 == 0) { + handle->scratch_r = (void*)address; + } else 
{ + offset = (64 - address % 64); + handle->scratch_r = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize) * 4 + 64; + /* wT */ + if (address % 64 == 0) { + handle->scratch_wT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_wT = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 4 + 64; + /* rT */ + if (address % 64 == 0) { + handle->scratch_rT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_rT = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 4 + 64; + /* xT */ + if (address % 64 == 0) { + handle->scratch_xT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_xT = (void*)(address+offset); + } + address += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; + /* hT */ + if (address % 64 == 0) { + handle->scratch_hT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_hT = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* deltat */ + if (address % 64 == 0) { + handle->scratch_deltat = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_deltat = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; + /* di */ + if (address % 64 == 0) { + handle->scratch_di = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_di = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* df */ + if (address % 64 == 0) { + handle->scratch_df = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_df = (void*)(address+offset); + } + address += (size_t)handle->desc.K * 
(size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* do */ + if (address % 64 == 0) { + handle->scratch_do = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_do = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dci */ + if (address % 64 == 0) { + handle->scratch_dci = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dci = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* diB */ + if (address % 64 == 0) { + handle->scratch_diB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_diB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dfB */ + if (address % 64 == 0) { + handle->scratch_dfB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dfB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dpB */ + if (address % 64 == 0) { + handle->scratch_dpB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dpB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dciB */ + if (address % 64 == 0) { + handle->scratch_dciB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dciB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* t1 */ + if (address % 64 == 0) { + handle->scratch_t1 = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_t1 = (void*)(address+offset); 
+ } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* t2 */ + if (address % 64 == 0) { + handle->scratch_t2 = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_t2 = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* The scratches below are needed only for BF16 code for the intermediate results */ + if (handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { + /* dx scratch */ + if (address % 64 == 0) { + handle->scratch_dx = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dx = (void*)(address+offset); + } + address += (size_t)handle->desc.C * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* dhp scratch */ + if (address % 64 == 0) { + handle->scratch_dhp = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dhp = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; + /* db scratch */ + if (address % 64 == 0) { + handle->scratch_db = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_db = (void*)(address+offset); + } + address += (size_t)handle->desc.K * 4 * sizeof(float) + 64; + /* cst scratch */ + if (address % 64 == 0) { + handle->cst_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cst_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ht scratch */ + if (address % 64 == 0) { + handle->ht_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->ht_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* it scratch */ + if (address % 64 == 0) { + 
handle->it_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->it_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ft scratch */ + if (address % 64 == 0) { + handle->ft_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->ft_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* ot scratch */ + if (address % 64 == 0) { + handle->ot_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->ot_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* cit scratch */ + if (address % 64 == 0) { + handle->cit_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cit_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* cot scratch */ + if (address % 64 == 0) { + handle->cot_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->cot_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) * (size_t)handle->desc.max_T + 64; + /* csp scratch */ + if (address % 64 == 0) { + handle->csp_scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->csp_scratch = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof(float) + 64; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + handle->scratch_base = (void*)address; + /* 
w scratch */ + if (address % 64 == 0) { + handle->scratch_w = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_w = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 3 + 64; + /* r scratch */ + if (address % 64 == 0) { + handle->scratch_r = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_r = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 3 + 64; + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + handle->scratch_base = (void*)address; + /* w scratch */ + if (address % 64 == 0) { + handle->scratch_w = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_w = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * dwdr_typesize) * 3 + 64; + /* r scratch */ + if (address % 64 == 0) { + handle->scratch_r = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_r = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * dwdr_typesize) * 3 + 64; + /* wT */ + if (address % 64 == 0) { + handle->scratch_wT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_wT = (void*)(address+offset); + } + address += ((size_t)handle->desc.C * (size_t)handle->desc.K * typesize_in) * 3 + 64; + /* rT */ + if (address % 64 == 0) { + handle->scratch_rT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_rT = (void*)(address+offset); + } + address += ((size_t)handle->desc.K * (size_t)handle->desc.K * typesize_in) * 3 + 64; + /* xT */ + if (address % 64 == 0) { + handle->scratch_xT = (void*)address; + } else { + offset = (64 - address % 64); + 
handle->scratch_xT = (void*)(address+offset); + } + address += (size_t)handle->desc.C * (size_t)handle->desc.N * typesize_in + 64; + /* hT */ + if (address % 64 == 0) { + handle->scratch_hT = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_hT = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* deltat */ + if (address % 64 == 0) { + handle->scratch_deltat = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_deltat = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * dwdr_typesize + 64; + /* di */ + if (address % 64 == 0) { + handle->scratch_di = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_di = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dc */ + if (address % 64 == 0) { + handle->scratch_dci = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dci = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* df */ + if (address % 64 == 0) { + handle->scratch_df = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_df = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* do */ + if (address % 64 == 0) { + handle->scratch_do = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_do = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* diB */ + if (address % 64 == 0) { + handle->scratch_diB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_diB = 
(void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dcB */ + if (address % 64 == 0) { + handle->scratch_dciB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dciB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* dfB */ + if (address % 64 == 0) { + handle->scratch_dfB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dfB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* doB (repurposed for oT) */ + if (address % 64 == 0) { + handle->scratch_dpB = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_dpB = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* t1 */ + if (address % 64 == 0) { + handle->scratch_t1 = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_t1 = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + /* t2 */ + if (address % 64 == 0) { + handle->scratch_t2 = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch_t2 = (void*)(address+offset); + } + address += (size_t)handle->desc.K * (size_t)handle->desc.N * libxsmm_dnn_typesize(handle->desc.datatype_out) + 64; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_scratch(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind) +{ + 
libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + /* forward only has no scratch need */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + handle->scratch_wT = 0; + handle->scratch_rT = 0; + handle->scratch_xT = 0; + handle->scratch_hT = 0; + handle->scratch_deltat = 0; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + handle->scratch_w = 0; + handle->scratch_r = 0; + handle->csp_scratch = 0; + handle->cst_scratch = 0; + handle->ht_scratch = 0; + handle->it_scratch = 0; + handle->ft_scratch = 0; + handle->ot_scratch = 0; + handle->cit_scratch = 0; + handle->cot_scratch = 0; + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + handle->scratch_w = 0; + handle->scratch_r = 0; + handle->scratch_wT = 0; + handle->scratch_rT = 0; + handle->scratch_xT = 0; + handle->scratch_hT = 0; + handle->scratch_deltat = 0; + handle->scratch_di = 0; + handle->scratch_df = 0; + handle->scratch_do = 0; + handle->scratch_dci = 0; + handle->scratch_diB = 0; + handle->scratch_dfB = 0; + handle->scratch_dpB = 0; + handle->scratch_dciB = 0; + handle->scratch_t1 = 0; + handle->scratch_t2 = 0; + handle->csp_scratch = 0; + handle->cst_scratch = 0; + handle->ht_scratch = 0; + handle->it_scratch = 0; + handle->ft_scratch = 0; + handle->ot_scratch = 0; + handle->cit_scratch = 0; + handle->cot_scratch = 0; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: 
{ + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + handle->scratch_w = 0; + handle->scratch_r = 0; + handle->ht_scratch = 0; + handle->it_scratch = 0; + handle->cit_scratch = 0; + handle->ft_scratch = 0; + handle->ot_scratch = 0; + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + handle->scratch_w = 0; + handle->scratch_r = 0; + handle->scratch_wT = 0; + handle->scratch_rT = 0; + handle->scratch_xT = 0; + handle->scratch_hT = 0; + handle->scratch_deltat = 0; + handle->scratch_di = 0; + handle->scratch_dci = 0; + handle->scratch_df = 0; + handle->scratch_do = 0; + handle->scratch_diB = 0; + handle->scratch_dciB = 0; + handle->scratch_dfB = 0; + handle->scratch_dpB = 0; + handle->scratch_t1 = 0; + handle->scratch_t2 = 0; + handle->ht_scratch = 0; + handle->it_scratch = 0; + handle->ft_scratch = 0; + handle->ot_scratch = 0; + handle->cit_scratch = 0; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API size_t libxsmm_dnn_rnncell_get_internalstate_size(const libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, libxsmm_dnn_err_t* status) +{ + size_t size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + const size_t sizeof_datatype = sizeof(float); + + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof_datatype * (size_t)handle->desc.max_T + 64; /* zt */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: 
{ + size += (size_t)handle->desc.K * (size_t)handle->desc.N * sizeof_datatype * (size_t)handle->desc.max_T + 64; /* zt */ + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + /* with i, f, o, ci, co, cs exposed as i/o, there is currently no need for internal state */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + /* with i, f, o, ci, co, cs exposed as i/o, there is currently no need for internal state */ + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + /* with i, f, c, o exposed as i/o, there is currently no need for internal state */ + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + /* with i, f, c, o exposed as i/o, there is currently no need for internal state */ + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + *status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return size; +} + + +LIBXSMM_API void* libxsmm_dnn_rnncell_get_internalstate_ptr(const libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status) +{ + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->internal_z; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return NULL; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind, const void* internalstate) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)internalstate; + size_t offset = 0; + + if (0 != handle) 
{ + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + if (internalstate == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + if (address % 64 == 0) { + handle->internal_z = (void*)address; + } else { + offset = (64 - address % 64); + handle->internal_z = (void*)(address+offset); + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + if (address % 64 == 0) { + handle->internal_z = (void*)address; + } else { + offset = (64 - address % 64); + handle->internal_z = (void*)(address+offset); + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_internalstate(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_compute_kind kind) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (handle->desc.cell_type) { + case LIBXSMM_DNN_RNNCELL_RNN_RELU: + 
case LIBXSMM_DNN_RNNCELL_RNN_SIGMOID: + case LIBXSMM_DNN_RNNCELL_RNN_TANH: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + handle->internal_z = 0; + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + handle->internal_z = 0; + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_LSTM: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + case LIBXSMM_DNN_RNNCELL_GRU: { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: + case LIBXSMM_DNN_COMPUTE_KIND_ALL: { + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_RNN_TYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_allocate_forget_bias(libxsmm_dnn_rnncell* handle, const float forget_bias) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (handle != 0) { + handle->forget_bias = forget_bias; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_bind_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != 
LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && + (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && + (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && + (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && + (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_rnncell_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { + handle->xt = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { + handle->dxt = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { + handle->csp = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { + handle->dcsp = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { + handle->hp = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { + handle->dhp = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { + handle->w = (libxsmm_dnn_tensor*)tensor; + } else if ( 
type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { + handle->wt = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { + handle->dw = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { + handle->r = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { + handle->rt = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { + handle->dr = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { + handle->b = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { + handle->db = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { + handle->cst = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { + handle->dcs = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { + handle->ht = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { + handle->dht = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { + handle->it = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { + handle->ft = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { + handle->ot = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { + handle->cit = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { + handle->cot = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen: type was validated at the top of the function */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + /* the temporary layout is released on both the bound and the mismatch path */ + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + /* Returns the tensor pointer currently stored in the handle slot selected by type. Returns 0 when type is not one of the accepted RNN tensor types or when handle is NULL. NOTE(review): status is marked LIBXSMM_UNUSED (TODO in the code) and is never written, so callers must not rely on it. */ +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_rnncell_get_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type,
libxsmm_dnn_err_t* status) +{ + libxsmm_dnn_tensor* tensor = 0; + LIBXSMM_UNUSED(status/*TODO*/); + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && + (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && + (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && + (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && + (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + return tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { + tensor = handle->xt; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { + tensor = handle->dxt; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { + tensor = handle->csp; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { + tensor = handle->dcsp; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { + tensor = handle->hp; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { + tensor = handle->dhp; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { + tensor = handle->w; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { + tensor = handle->wt; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { + tensor = 
handle->dw; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { + tensor = handle->r; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { + tensor = handle->rt; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { + tensor = handle->dr; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { + tensor = handle->b; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { + tensor = handle->db; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { + tensor = handle->cst; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { + tensor = handle->dcs; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { + tensor = handle->ht; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { + tensor = handle->dht; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { + tensor = handle->it; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { + tensor = handle->ft; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { + tensor = handle->ot; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { + tensor = handle->cit; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { + tensor = handle->cot; + } else { + /* cannot happen */ + } + } + /* tensor is still 0 here when handle is NULL */ + return tensor; +} + + /* Clears the handle slot selected by type by setting its pointer to 0; the tensor object is not destroyed here. The type is validated before the handle: unknown types return LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE, a NULL handle returns LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR. */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_release_tensor(libxsmm_dnn_rnncell* handle, const libxsmm_dnn_tensor_type type) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_RNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_RNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT) && (type != LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT) && + (type != LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS) && (type !=
LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS) && + (type != LIBXSMM_DNN_RNN_REGULAR_BIAS) && (type != LIBXSMM_DNN_RNN_GRADIENT_BIAS) && + (type != LIBXSMM_DNN_RNN_REGULAR_CS) && (type != LIBXSMM_DNN_RNN_GRADIENT_CS) && + (type != LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE) && (type != LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE) && + (type != LIBXSMM_DNN_RNN_INTERNAL_I) && (type != LIBXSMM_DNN_RNN_INTERNAL_F) && + (type != LIBXSMM_DNN_RNN_INTERNAL_O) && (type != LIBXSMM_DNN_RNN_INTERNAL_CI) && + (type != LIBXSMM_DNN_RNN_INTERNAL_CO) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_RNN_REGULAR_INPUT ) { + handle->xt = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_INPUT ) { + handle->dxt = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS_PREV ) { + handle->csp = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS_PREV ) { + handle->dcsp = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE_PREV ) { + handle->hp = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE_PREV ) { + handle->dhp = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT ) { + handle->w = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_WEIGHT_TRANS ) { + handle->wt = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_WEIGHT ) { + handle->dw = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT ) { + handle->r = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_RECUR_WEIGHT_TRANS ) { + handle->rt = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_RECUR_WEIGHT ) { + handle->dr = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_BIAS ) { + handle->b = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_BIAS ) { + handle->db = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_CS ) { + handle->cst = 0; + } else if ( type == LIBXSMM_DNN_RNN_GRADIENT_CS ) { + handle->dcs = 0; + } else if ( type == LIBXSMM_DNN_RNN_REGULAR_HIDDEN_STATE ) { + handle->ht = 0; + } else if ( type == 
LIBXSMM_DNN_RNN_GRADIENT_HIDDEN_STATE ) { + handle->dht = 0; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_I ) { + handle->it = 0; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_F ) { + handle->ft = 0; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_O ) { + handle->ot = 0; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CI ) { + handle->cit = 0; + } else if ( type == LIBXSMM_DNN_RNN_INTERNAL_CO ) { + handle->cot = 0; + } else { + /* cannot happen */ + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + /* Sets the active sequence length handle->T. Fails with LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN when T exceeds handle->desc.max_T and with LIBXSMM_DNN_ERR_INVALID_HANDLE when handle is NULL. NOTE(review): only the upper bound is checked, so a zero or negative T is stored as is; confirm that is intended. */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_set_sequence_length( libxsmm_dnn_rnncell* handle, const libxsmm_blasint T ) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + if ( handle->desc.max_T < T ) { + status = LIBXSMM_DNN_ERR_RNN_INVALID_SEQ_LEN; + } else { + handle->T = T; + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + /* Returns handle->T and sets *status to LIBXSMM_DNN_SUCCESS; for a NULL handle returns 0 and sets *status to LIBXSMM_DNN_ERR_INVALID_HANDLE. NOTE(review): status is dereferenced unconditionally, so it must be non-NULL. */ +LIBXSMM_API libxsmm_blasint libxsmm_dnn_rnncell_get_sequence_length( libxsmm_dnn_rnncell* handle, libxsmm_dnn_err_t* status ) { + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->T; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return 0; +} + + /* Dispatches to the kernel matching kind and the handle's desc.buffer_format / desc.filter_format, forwarding start_thread and tid unchanged. FWD calls the st_fwd_* variants; BWD, UPD and BWDUPD call the st_bwdupd_* variants with kind passed through. Accepted format pairs are NC/CK, NC/CKPACKED and NCPACKED/CKPACKED; any other pair yields LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL, any other kind LIBXSMM_DNN_ERR_INVALID_KIND, and a NULL handle LIBXSMM_DNN_ERR_INVALID_HANDLE. */ +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_rnncell_execute_st(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CK) ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck( handle, start_thread, tid ); + } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck( handle, start_thread, tid ); + } else if (
(handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck( handle, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: + case LIBXSMM_DNN_COMPUTE_KIND_UPD: + case LIBXSMM_DNN_COMPUTE_KIND_BWDUPD: { /* all three kinds share the bwdupd kernels; kind is forwarded so the kernel can tell them apart */ + if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CK) ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck( handle, kind, start_thread, tid ); + } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NC) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck( handle, kind, start_thread, tid ); + } else if ( (handle->desc.buffer_format == LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) && (handle->desc.filter_format == LIBXSMM_DNN_TENSOR_FORMAT_CKPACKED) ) { + status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck( handle, kind, start_thread, tid ); + } else { + status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } break; + default: { /* every kind not listed above is rejected */ + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.c b/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.c new file mode 100644 index 00000000..54cef8b6 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.c @@ -0,0 +1,1016 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file.
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Kunal Banerjee, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_rnncell_backward_weight_update.h" +#include "libxsmm_dnn_elementwise.h" +#include "libxsmm_main.h" + /* Rearranges a 32x32 tile of 16-bit values from in to out using AVX-512 unpack, shuffle and permute steps (row stride is 32 elements on both sides). The work is done in two passes of 16 input rows each; every pass writes one 16-element half of all 32 output rows. This looks like a 32x32 transpose (TODO confirm against the callers). When LIBXSMM_INTRINSICS_AVX512_CORE is not defined the function does nothing. */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +void trans_act(short int *in, short int *out) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) + __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; + __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; + __m512i v0, v1, v2, v3, v4, v5, v6, v7; + const __m512i idx_v = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + const __mmask8 mask0 = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(204); + const __mmask8 mask1 = LIBXSMM_INTRINSICS_MM512_CVTU32_MASK8(51); + const int in_width = 32, out_width = 32; + /* pass 1: load input rows 0..15 and interleave adjacent row pairs at 16-bit granularity */ + r0 = _mm512_loadu_si512(in + 0*in_width); + r1 = _mm512_loadu_si512(in + 1*in_width); + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + r2 = _mm512_loadu_si512(in + 2*in_width); + r3 = _mm512_loadu_si512(in + 3*in_width); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + r4 = _mm512_loadu_si512(in + 4*in_width); + r5 = _mm512_loadu_si512(in + 5*in_width); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + r6 = _mm512_loadu_si512(in + 6*in_width); + r7 = _mm512_loadu_si512(in + 7*in_width); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + r8 = _mm512_loadu_si512(in + 8*in_width); + r9 = _mm512_loadu_si512(in + 9*in_width); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ra = _mm512_loadu_si512(in + 10*in_width); + rb = _mm512_loadu_si512(in + 11*in_width); + ta = _mm512_unpacklo_epi16(ra,rb);
+ tb = _mm512_unpackhi_epi16(ra,rb); + rc = _mm512_loadu_si512(in + 12*in_width); + rd = _mm512_loadu_si512(in + 13*in_width); + tc = _mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + re = _mm512_loadu_si512(in + 14*in_width); + rf = _mm512_loadu_si512(in + 15*in_width); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); + + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 = _mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + rf = _mm512_unpackhi_epi32(td,tf); + + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = _mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); + + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = 
_mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = _mm512_shuffle_i32x4(te, tf, 0xdd); + + v0 = _mm512_permutex2var_epi64(r0, idx_v, r8); + t0 = _mm512_mask_blend_epi64( mask0, r0, v0); + _mm256_storeu_si256((__m256i*)(out + 0*out_width), _mm512_extracti64x4_epi64(t0, 0)); + _mm256_storeu_si256((__m256i*)(out + 1*out_width), _mm512_extracti64x4_epi64(t0, 1)); + t8 = _mm512_mask_blend_epi64( mask1, r8, v0); + _mm256_storeu_si256((__m256i*)(out + 16*out_width), _mm512_extracti64x4_epi64(t8, 0)); + _mm256_storeu_si256((__m256i*)(out + 17*out_width), _mm512_extracti64x4_epi64(t8, 1)); + v1 = _mm512_permutex2var_epi64(r1, idx_v, r9); + t1 = _mm512_mask_blend_epi64( mask0, r1, v1); + _mm256_storeu_si256((__m256i*)(out + 2*out_width), _mm512_extracti64x4_epi64(t1, 0)); + _mm256_storeu_si256((__m256i*)(out + 3*out_width), _mm512_extracti64x4_epi64(t1, 1)); + t9 = _mm512_mask_blend_epi64( mask1, r9, v1); + _mm256_storeu_si256((__m256i*)(out + 18*out_width), _mm512_extracti64x4_epi64(t9, 0)); + _mm256_storeu_si256((__m256i*)(out + 19*out_width), _mm512_extracti64x4_epi64(t9, 1)); + v2 = _mm512_permutex2var_epi64(r2, idx_v, ra); + t2 = _mm512_mask_blend_epi64( mask0, r2, v2); + _mm256_storeu_si256((__m256i*)(out + 4*out_width), _mm512_extracti64x4_epi64(t2, 0)); + _mm256_storeu_si256((__m256i*)(out + 5*out_width), _mm512_extracti64x4_epi64(t2, 1)); + ta = _mm512_mask_blend_epi64( mask1, ra, v2); + _mm256_storeu_si256((__m256i*)(out + 20*out_width), _mm512_extracti64x4_epi64(ta, 0)); + _mm256_storeu_si256((__m256i*)(out + 21*out_width), _mm512_extracti64x4_epi64(ta, 1)); + v3 = _mm512_permutex2var_epi64(r3, idx_v, rb); + t3 = _mm512_mask_blend_epi64( mask0, r3, v3); + _mm256_storeu_si256((__m256i*)(out + 6*out_width), _mm512_extracti64x4_epi64(t3, 0)); + _mm256_storeu_si256((__m256i*)(out + 7*out_width), 
_mm512_extracti64x4_epi64(t3, 1)); + tb = _mm512_mask_blend_epi64( mask1, rb, v3); + _mm256_storeu_si256((__m256i*)(out + 22*out_width), _mm512_extracti64x4_epi64(tb, 0)); + _mm256_storeu_si256((__m256i*)(out + 23*out_width), _mm512_extracti64x4_epi64(tb, 1)); + v4 = _mm512_permutex2var_epi64(r4, idx_v, rc); + t4 = _mm512_mask_blend_epi64( mask0, r4, v4); + _mm256_storeu_si256((__m256i*)(out + 8*out_width), _mm512_extracti64x4_epi64(t4, 0)); + _mm256_storeu_si256((__m256i*)(out + 9*out_width), _mm512_extracti64x4_epi64(t4, 1)); + tc = _mm512_mask_blend_epi64( mask1, rc, v4); + _mm256_storeu_si256((__m256i*)(out + 24*out_width), _mm512_extracti64x4_epi64(tc, 0)); + _mm256_storeu_si256((__m256i*)(out + 25*out_width), _mm512_extracti64x4_epi64(tc, 1)); + v5 = _mm512_permutex2var_epi64(r5, idx_v, rd); + t5 = _mm512_mask_blend_epi64( mask0, r5, v5); + _mm256_storeu_si256((__m256i*)(out + 10*out_width), _mm512_extracti64x4_epi64(t5, 0)); + _mm256_storeu_si256((__m256i*)(out + 11*out_width), _mm512_extracti64x4_epi64(t5, 1)); + td = _mm512_mask_blend_epi64( mask1, rd, v5); + _mm256_storeu_si256((__m256i*)(out + 26*out_width), _mm512_extracti64x4_epi64(td, 0)); + _mm256_storeu_si256((__m256i*)(out + 27*out_width), _mm512_extracti64x4_epi64(td, 1)); + v6 = _mm512_permutex2var_epi64(r6, idx_v, re); + t6 = _mm512_mask_blend_epi64( mask0, r6, v6); + _mm256_storeu_si256((__m256i*)(out + 12*out_width), _mm512_extracti64x4_epi64(t6, 0)); + _mm256_storeu_si256((__m256i*)(out + 13*out_width), _mm512_extracti64x4_epi64(t6, 1)); + te = _mm512_mask_blend_epi64( mask1, re, v6); + _mm256_storeu_si256((__m256i*)(out + 28*out_width), _mm512_extracti64x4_epi64(te, 0)); + _mm256_storeu_si256((__m256i*)(out + 29*out_width), _mm512_extracti64x4_epi64(te, 1)); + v7 = _mm512_permutex2var_epi64(r7, idx_v, rf); + t7 = _mm512_mask_blend_epi64( mask0, r7, v7); + _mm256_storeu_si256((__m256i*)(out + 14*out_width), _mm512_extracti64x4_epi64(t7, 0)); + _mm256_storeu_si256((__m256i*)(out + 
15*out_width), _mm512_extracti64x4_epi64(t7, 1)); + tf = _mm512_mask_blend_epi64( mask1, rf, v7); + _mm256_storeu_si256((__m256i*)(out + 30*out_width), _mm512_extracti64x4_epi64(tf, 0)); + _mm256_storeu_si256((__m256i*)(out + 31*out_width), _mm512_extracti64x4_epi64(tf, 1)); + /* pass 2: the same sequence for input rows 16..31 (16*32 elements into in); its results are stored 16 elements into each output row */ + r0 = _mm512_loadu_si512(in + 16*32 + 0*in_width); + r1 = _mm512_loadu_si512(in + 16*32 + 1*in_width); + t0 = _mm512_unpacklo_epi16(r0,r1); + t1 = _mm512_unpackhi_epi16(r0,r1); + r2 = _mm512_loadu_si512(in + 16*32 + 2*in_width); + r3 = _mm512_loadu_si512(in + 16*32 + 3*in_width); + t2 = _mm512_unpacklo_epi16(r2,r3); + t3 = _mm512_unpackhi_epi16(r2,r3); + r4 = _mm512_loadu_si512(in + 16*32 + 4*in_width); + r5 = _mm512_loadu_si512(in + 16*32 + 5*in_width); + t4 = _mm512_unpacklo_epi16(r4,r5); + t5 = _mm512_unpackhi_epi16(r4,r5); + r6 = _mm512_loadu_si512(in + 16*32 + 6*in_width); + r7 = _mm512_loadu_si512(in + 16*32 + 7*in_width); + t6 = _mm512_unpacklo_epi16(r6,r7); + t7 = _mm512_unpackhi_epi16(r6,r7); + r8 = _mm512_loadu_si512(in + 16*32 + 8*in_width); + r9 = _mm512_loadu_si512(in + 16*32 + 9*in_width); + t8 = _mm512_unpacklo_epi16(r8,r9); + t9 = _mm512_unpackhi_epi16(r8,r9); + ra = _mm512_loadu_si512(in + 16*32 + 10*in_width); + rb = _mm512_loadu_si512(in + 16*32 + 11*in_width); + ta = _mm512_unpacklo_epi16(ra,rb); + tb = _mm512_unpackhi_epi16(ra,rb); + rc = _mm512_loadu_si512(in + 16*32 + 12*in_width); + rd = _mm512_loadu_si512(in + 16*32 + 13*in_width); + tc = _mm512_unpacklo_epi16(rc,rd); + td = _mm512_unpackhi_epi16(rc,rd); + re = _mm512_loadu_si512(in + 16*32 + 14*in_width); + rf = _mm512_loadu_si512(in + 16*32 + 15*in_width); + te = _mm512_unpacklo_epi16(re,rf); + tf = _mm512_unpackhi_epi16(re,rf); + + r0 = _mm512_unpacklo_epi32(t0,t2); + r1 = _mm512_unpackhi_epi32(t0,t2); + r2 = _mm512_unpacklo_epi32(t1,t3); + r3 = _mm512_unpackhi_epi32(t1,t3); + r4 = _mm512_unpacklo_epi32(t4,t6); + r5 = _mm512_unpackhi_epi32(t4,t6); + r6 = _mm512_unpacklo_epi32(t5,t7); + r7 =
_mm512_unpackhi_epi32(t5,t7); + r8 = _mm512_unpacklo_epi32(t8,ta); + r9 = _mm512_unpackhi_epi32(t8,ta); + ra = _mm512_unpacklo_epi32(t9,tb); + rb = _mm512_unpackhi_epi32(t9,tb); + rc = _mm512_unpacklo_epi32(tc,te); + rd = _mm512_unpackhi_epi32(tc,te); + re = _mm512_unpacklo_epi32(td,tf); + rf = _mm512_unpackhi_epi32(td,tf); + + t0 = _mm512_unpacklo_epi64(r0,r4); + t1 = _mm512_unpackhi_epi64(r0,r4); + t2 = _mm512_unpacklo_epi64(r1,r5); + t3 = _mm512_unpackhi_epi64(r1,r5); + t4 = _mm512_unpacklo_epi64(r2,r6); + t5 = _mm512_unpackhi_epi64(r2,r6); + t6 = _mm512_unpacklo_epi64(r3,r7); + t7 = _mm512_unpackhi_epi64(r3,r7); + t8 = _mm512_unpacklo_epi64(r8,rc); + t9 = _mm512_unpackhi_epi64(r8,rc); + ta = _mm512_unpacklo_epi64(r9,rd); + tb = _mm512_unpackhi_epi64(r9,rd); + tc = _mm512_unpacklo_epi64(ra,re); + td = _mm512_unpackhi_epi64(ra,re); + te = _mm512_unpacklo_epi64(rb,rf); + tf = _mm512_unpackhi_epi64(rb,rf); + + r0 = _mm512_shuffle_i32x4(t0, t1, 0x88); + r1 = _mm512_shuffle_i32x4(t2, t3, 0x88); + r2 = _mm512_shuffle_i32x4(t4, t5, 0x88); + r3 = _mm512_shuffle_i32x4(t6, t7, 0x88); + r4 = _mm512_shuffle_i32x4(t0, t1, 0xdd); + r5 = _mm512_shuffle_i32x4(t2, t3, 0xdd); + r6 = _mm512_shuffle_i32x4(t4, t5, 0xdd); + r7 = _mm512_shuffle_i32x4(t6, t7, 0xdd); + r8 = _mm512_shuffle_i32x4(t8, t9, 0x88); + r9 = _mm512_shuffle_i32x4(ta, tb, 0x88); + ra = _mm512_shuffle_i32x4(tc, td, 0x88); + rb = _mm512_shuffle_i32x4(te, tf, 0x88); + rc = _mm512_shuffle_i32x4(t8, t9, 0xdd); + rd = _mm512_shuffle_i32x4(ta, tb, 0xdd); + re = _mm512_shuffle_i32x4(tc, td, 0xdd); + rf = _mm512_shuffle_i32x4(te, tf, 0xdd); + + v0 = _mm512_permutex2var_epi64(r0, idx_v, r8); + t0 = _mm512_mask_blend_epi64( mask0, r0, v0); + _mm256_storeu_si256((__m256i*)(out + 16 + 0*out_width), _mm512_extracti64x4_epi64(t0, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 1*out_width), _mm512_extracti64x4_epi64(t0, 1)); + t8 = _mm512_mask_blend_epi64( mask1, r8, v0); + _mm256_storeu_si256((__m256i*)(out + 16 + 
16*out_width), _mm512_extracti64x4_epi64(t8, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 17*out_width), _mm512_extracti64x4_epi64(t8, 1)); + v1 = _mm512_permutex2var_epi64(r1, idx_v, r9); + t1 = _mm512_mask_blend_epi64( mask0, r1, v1); + _mm256_storeu_si256((__m256i*)(out + 16 + 2*out_width), _mm512_extracti64x4_epi64(t1, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 3*out_width), _mm512_extracti64x4_epi64(t1, 1)); + t9 = _mm512_mask_blend_epi64( mask1, r9, v1); + _mm256_storeu_si256((__m256i*)(out + 16 + 18*out_width), _mm512_extracti64x4_epi64(t9, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 19*out_width), _mm512_extracti64x4_epi64(t9, 1)); + v2 = _mm512_permutex2var_epi64(r2, idx_v, ra); + t2 = _mm512_mask_blend_epi64( mask0, r2, v2); + _mm256_storeu_si256((__m256i*)(out + 16 + 4*out_width), _mm512_extracti64x4_epi64(t2, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 5*out_width), _mm512_extracti64x4_epi64(t2, 1)); + ta = _mm512_mask_blend_epi64( mask1, ra, v2); + _mm256_storeu_si256((__m256i*)(out + 16 + 20*out_width), _mm512_extracti64x4_epi64(ta, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 21*out_width), _mm512_extracti64x4_epi64(ta, 1)); + v3 = _mm512_permutex2var_epi64(r3, idx_v, rb); + t3 = _mm512_mask_blend_epi64( mask0, r3, v3); + _mm256_storeu_si256((__m256i*)(out + 16 + 6*out_width), _mm512_extracti64x4_epi64(t3, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 7*out_width), _mm512_extracti64x4_epi64(t3, 1)); + tb = _mm512_mask_blend_epi64( mask1, rb, v3); + _mm256_storeu_si256((__m256i*)(out + 16 + 22*out_width), _mm512_extracti64x4_epi64(tb, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 23*out_width), _mm512_extracti64x4_epi64(tb, 1)); + v4 = _mm512_permutex2var_epi64(r4, idx_v, rc); + t4 = _mm512_mask_blend_epi64( mask0, r4, v4); + _mm256_storeu_si256((__m256i*)(out + 16 + 8*out_width), _mm512_extracti64x4_epi64(t4, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 9*out_width), _mm512_extracti64x4_epi64(t4, 1)); + tc = 
_mm512_mask_blend_epi64( mask1, rc, v4); + _mm256_storeu_si256((__m256i*)(out + 16 + 24*out_width), _mm512_extracti64x4_epi64(tc, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 25*out_width), _mm512_extracti64x4_epi64(tc, 1)); + v5 = _mm512_permutex2var_epi64(r5, idx_v, rd); + t5 = _mm512_mask_blend_epi64( mask0, r5, v5); + _mm256_storeu_si256((__m256i*)(out + 16 + 10*out_width), _mm512_extracti64x4_epi64(t5, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 11*out_width), _mm512_extracti64x4_epi64(t5, 1)); + td = _mm512_mask_blend_epi64( mask1, rd, v5); + _mm256_storeu_si256((__m256i*)(out + 16 + 26*out_width), _mm512_extracti64x4_epi64(td, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 27*out_width), _mm512_extracti64x4_epi64(td, 1)); + v6 = _mm512_permutex2var_epi64(r6, idx_v, re); + t6 = _mm512_mask_blend_epi64( mask0, r6, v6); + _mm256_storeu_si256((__m256i*)(out + 16 + 12*out_width), _mm512_extracti64x4_epi64(t6, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 13*out_width), _mm512_extracti64x4_epi64(t6, 1)); + te = _mm512_mask_blend_epi64( mask1, re, v6); + _mm256_storeu_si256((__m256i*)(out + 16 + 28*out_width), _mm512_extracti64x4_epi64(te, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 29*out_width), _mm512_extracti64x4_epi64(te, 1)); + v7 = _mm512_permutex2var_epi64(r7, idx_v, rf); + t7 = _mm512_mask_blend_epi64( mask0, r7, v7); + _mm256_storeu_si256((__m256i*)(out + 16 + 14*out_width), _mm512_extracti64x4_epi64(t7, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 15*out_width), _mm512_extracti64x4_epi64(t7, 1)); + tf = _mm512_mask_blend_epi64( mask1, rf, v7); + _mm256_storeu_si256((__m256i*)(out + 16 + 30*out_width), _mm512_extracti64x4_epi64(tf, 0)); + _mm256_storeu_si256((__m256i*)(out + 16 + 31*out_width), _mm512_extracti64x4_epi64(tf, 1)); +#else /* no AVX-512 core intrinsics: the arguments are only marked unused and out is left untouched */ + LIBXSMM_UNUSED(in); LIBXSMM_UNUSED(out); +#endif +} + /* forward declarations of the bwd/upd kernel variants; the name suffixes appear to encode tensor format and data type (confirm against the definitions) */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind,
int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef float element_input_type; + 
typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +# define LIBXSMM_DNN_RNN_RELU_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_RELU_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +# define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +# define LIBXSMM_DNN_RNN_TANH_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_TANH_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c" + } else { + /* should not happen */ + } +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = 
LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + return libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu(handle, kind, start_thread, tid); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ +#define LIBXSMM_RNN_CELL_AVX512 +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( 
handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ +#define LIBXSMM_RNN_CELL_AVX512 + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = 
LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if 
defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" + +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t 
status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__ */ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define 
LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + return libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu(handle, kind, start_thread, tid); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ +#define LIBXSMM_RNN_CELL_AVX512 +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 
<-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + 
status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ +#define LIBXSMM_RNN_CELL_AVX512 + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +# define LIBXSMM_DNN_RNN_RELU_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_RELU_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +# define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +# define LIBXSMM_DNN_RNN_TANH_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_TANH_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include 
"template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c" + } else { + /* should not happen */ + } +#undef LIBXSMM_RNN_CELL_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; +#if 0 + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_ncnc_kcck_generic.tpl.c" +#endif + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); LIBXSMM_UNUSED(kind); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? 
== 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_f32_f32( handle, kind, start_thread, tid ); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16) { + if ( handle->desc.N % 2 != 0 ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu( handle, kind, start_thread, tid ); + } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16( handle, kind, start_thread, tid ); + } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_amx( handle, kind, start_thread, tid ); + } +#else + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_ck_bf16_bf16_emu( handle, kind, start_thread, tid ); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float 
element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +#define LIBXSMM_DNN_RNN_RELU_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_RELU_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +#define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +#define LIBXSMM_DNN_RNN_TANH_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_TANH_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c" + } else { + /* should not happen */ + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? 
== 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_f32_f32( handle, kind, start_thread, tid ); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 ) { + if ( handle->desc.N % 2 != 0 ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); + } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16( handle, kind, start_thread, tid ); + } else if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid ); + } +#else + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_emu( handle, kind, start_thread, tid ); + } else if (libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_rnncell_st_bwdupd_nc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid ); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + } +#endif + else { + status = 
LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +#define LIBXSMM_DNN_RNN_RELU_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_RELU_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +#define LIBXSMM_DNN_RNN_SIGMOID_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_SIGMOID_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +#define LIBXSMM_DNN_RNN_TANH_BWDUPD +# include "template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_TANH_BWDUPD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c" + } else { + /* should not happen */ + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? 
== 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32( handle, kind, start_thread, tid ); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#elif defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_f32_f32( handle, kind, start_thread, tid ); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck_bf16_bf16_amx( handle, kind, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + LIBXSMM_UNUSED(kind); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + 
+ return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.h b/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.h new file mode 100644 index 00000000..47d53988 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_rnncell_backward_weight_update.h @@ -0,0 +1,21 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H +#define LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H + +#include +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_ck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_nc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_bwdupd_ncnc_kcck(libxsmm_dnn_rnncell* handle, libxsmm_dnn_compute_kind kind, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_RNNCELL_BACKWARD_WEIGHT_UPDATE_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.c b/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.c new file mode 100644 index 00000000..c61e41df --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.c @@ -0,0 +1,740 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_rnncell_forward.h" +#include "libxsmm_dnn_elementwise.h" +#include "libxsmm_main.h" + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +# define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +# define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +# define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +# undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c" + } else { + /* should not happen */ + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__, __AVX512BW__, __AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 
element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__, __AVX512BW__, __AVX512DQ__, __AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + return libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu(handle, start_thread, tid); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__, __AVX512BW__, __AVX512DQ__, __AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__, __AVX512BW__, __AVX512DQ__ */ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c" +#undef 
LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +# define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +# define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +# define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t 
libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +# define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +# define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +# define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +# undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c" + } else { + /* should not happen */ + } +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable 
macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + return libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu(handle, start_thread, tid); +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == 
LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } 
else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CPX) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + +#define LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#undef LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#else +LIBXSMM_API_INTERN 
LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_amx(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__ */ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef libxsmm_bfloat16 element_filter_type; + + /* some portable macrros fof BF16 <-> FP32 */ +# include "template/libxsmm_dnn_bf16_macros_define.tpl.c" + + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +#define LIBXSMM_RNN_CELL_AVX512 +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c" +#undef LIBXSMM_RNN_CELL_AVX512 + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + +# include "template/libxsmm_dnn_bf16_macros_undefine.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} +#endif + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? 
== 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_amx( handle, start_thread, tid); + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == 
LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_ck_bf16_bf16_amx( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +#define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +#define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +#define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c" +#undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c" + } else { + /* should not happen */ + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck(libxsmm_dnn_rnncell* 
handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#elif defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_f32_f32( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_ncnc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + 
typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +#define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +#define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +#define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { + status = LIBXSMM_DNN_ERR_NOT_IMPLEMENTED; + } else { + /* should not happen */ + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and filter */ +#if 0 + if (handle->? 
== 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } +#endif + + /* check if we are on AVX512 */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( (libxsmm_target_archid >= LIBXSMM_X86_AVX512) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) ) { + if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_f32_f32( handle, start_thread, tid); + } +#if defined(LIBXSMM_INTRINSICS_AVX512_CPX) /*__AVX512F__,__AVX512BW__,__AVX512DQ__,__AVX512BF16__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_CPX ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CPX && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } +#elif defined(LIBXSMM_INTRINSICS_AVX512_CORE) /*__AVX512F__,__AVX512BW__,__AVX512DQ__*/ + else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_CORE && libxsmm_target_archid < LIBXSMM_X86_AVX512_SPR) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_emu( handle, start_thread, tid); + } else if ( handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_BF16 && 
handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_BF16 && libxsmm_target_archid >= LIBXSMM_X86_AVX512_SPR ) { + status = libxsmm_dnn_rnncell_st_fwd_nc_kcck_bf16_bf16_amx( handle, start_thread, tid); + } +#endif + else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if (handle->desc.datatype_in == LIBXSMM_DNN_DATATYPE_F32 && handle->desc.datatype_out == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef float element_filter_type; + if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_RELU ) { +#define LIBXSMM_DNN_RNN_RELU_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_RELU_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_SIGMOID ) { +#define LIBXSMM_DNN_RNN_SIGMOID_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_SIGMOID_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_RNN_TANH ) { +#define LIBXSMM_DNN_RNN_TANH_FWD +# include "template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c" +#undef LIBXSMM_DNN_RNN_TANH_FWD + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_LSTM ) { +# include "template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c" + } else if ( handle->desc.cell_type == LIBXSMM_DNN_RNNCELL_GRU ) { +# include "template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c" + } else { + /* should not happen */ + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} diff --git a/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.h b/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.h new file mode 100644 index 00000000..7cb2efec --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_rnncell_forward.h @@ -0,0 +1,21 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_RNNCELL_FORWARD_H +#define LIBXSMM_DNN_RNNCELL_FORWARD_H + +#include +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_ck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_ncnc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_rnncell_st_fwd_nc_kcck(libxsmm_dnn_rnncell* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_RNNCELL_FORWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_softmaxloss.c b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss.c new file mode 100644 index 00000000..806f09fb --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss.c @@ -0,0 +1,382 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_dnn_softmaxloss_backward.h" +#include "libxsmm_dnn_softmaxloss_forward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API libxsmm_dnn_softmaxloss* libxsmm_dnn_create_softmaxloss(libxsmm_dnn_softmaxloss_desc softmaxloss_desc, libxsmm_dnn_err_t* status) { + libxsmm_dnn_softmaxloss* handle = 0; + int lpb; + + /* init libxsmm */ + LIBXSMM_INIT + + if ( (softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_F32) || (softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16) ) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + handle = (libxsmm_dnn_softmaxloss*)calloc(1, sizeof(libxsmm_dnn_softmaxloss)); + + if (0 != handle) { + *status = LIBXSMM_DNN_SUCCESS; + /* let's make the description persistent */ + handle->desc = softmaxloss_desc; + + /* cnn */ + if ( (handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + int bk; + /* we need to compute the memory layout given the */ + *status = libxsmm_dnn_get_feature_map_blocks( handle->desc.C, handle->desc.C, + &(handle->bc), &bk, &lpb, + handle->desc.datatype, handle->desc.datatype ); + /* compute the outer blocks */ + handle->Bc = handle->desc.C / handle->bc; + handle->bn = 1; + handle->Bn = handle->desc.N; + } else if ( (handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0 ) { + handle->bc = handle->desc.bc; + handle->bn = handle->desc.bn; + handle->Bc = handle->desc.C / handle->bc; + handle->Bn = handle->desc.N / handle->bn; + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + free( handle ); + handle = 0; + return handle; + } + /* create barrier */ + handle->barrier = libxsmm_barrier_create(handle->desc.threads, 1); + /* calculate scratch size for local softmaxloss copies of one feature map block per thread */ + if ( softmaxloss_desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + handle->scratch_size = (sizeof(float)*handle->desc.C*handle->desc.N*2); + } 
else { + handle->scratch_size = 1; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_HANDLE; + } + } else { + *status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + + return handle; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_softmaxloss(const libxsmm_dnn_softmaxloss* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + /* Deallocate barrier */ + if (handle->barrier != 0 ) { libxsmm_barrier_release((const libxsmm_barrier*)handle->barrier); } + /* deallocate handle structure */ + free(/*remove constness*/(libxsmm_dnn_softmaxloss*)handle); + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_softmaxloss_create_tensor_datalayout(const libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* layout; + + *status = LIBXSMM_DNN_SUCCESS; + layout = 0; + + if (handle != 0) { + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, sizeof(libxsmm_dnn_tensor_datalayout)); + + if (layout != 0) { + layout->format = handle->desc.buffer_format; + + if ( (type == LIBXSMM_DNN_REGULAR_INPUT) || (type == LIBXSMM_DNN_GRADIENT_INPUT) || (type == LIBXSMM_DNN_INPUT) || + (type == LIBXSMM_DNN_REGULAR_OUTPUT) || (type == LIBXSMM_DNN_OUTPUT) ) { + if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0) { + layout->datatype = handle->desc.datatype; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(3*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(3*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 3; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->bc; + 
layout->dim_size[1] = handle->Bc; + layout->dim_size[2] = handle->desc.N; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else if ((handle->desc.buffer_format & LIBXSMM_DNN_TENSOR_FORMAT_NCPACKED) > 0) { + layout->datatype = handle->desc.datatype; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(4*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(4*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 4; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[1] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_type[2] = LIBXSMM_DNN_TENSOR_DIMTYPE_C; + layout->dim_type[3] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->bc; + layout->dim_size[1] = handle->bn; + layout->dim_size[2] = handle->Bc; + layout->dim_size[3] = handle->Bn; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_INVALID_FORMAT_GENERAL; + } + } else if ( type == LIBXSMM_DNN_LABEL ) { + layout->datatype = LIBXSMM_DNN_DATATYPE_I32; + layout->dim_type = (libxsmm_dnn_tensor_dimtype*) malloc(1*sizeof(libxsmm_dnn_tensor_dimtype)); + layout->dim_size = (unsigned int*) malloc(1*sizeof(unsigned int)); + + if (0 != layout->dim_type && 0 != layout->dim_size) { + layout->num_dims = 1; + layout->dim_type[0] = LIBXSMM_DNN_TENSOR_DIMTYPE_N; + layout->dim_size[0] = handle->desc.N; + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT_ARRAYS; + } + } else { + free(layout); + layout = 0; /* make sure a NULL is returned */ + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } + else { + *status = 
LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return layout; +} + + +LIBXSMM_API size_t libxsmm_dnn_softmaxloss_get_scratch_size(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) { + size_t l_scratch_size = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_scratch_size = handle->scratch_size + 64; /* 64 byte extra in case the user code does not care about alignment */ + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_scratch_size; +} + + +LIBXSMM_API void* libxsmm_dnn_softmaxloss_get_scratch_ptr(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) +{ + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + return handle->scratch; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return 0; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_scratch(libxsmm_dnn_softmaxloss* handle, const void* scratch) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + uintptr_t address = (uintptr_t)scratch; + size_t offset = 0; + + if (scratch == 0) { + status = LIBXSMM_DNN_ERR_SCRATCH_NOT_ALLOCED; + return status; + } + + if (0 != handle) { + /* align the internal scratch buffer if needed */ + if (address % 64 == 0) { + handle->scratch = (void*)address; + } else { + offset = (64 - address % 64); + handle->scratch = (void*)(address+offset); + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_scratch(libxsmm_dnn_softmaxloss* handle) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + handle->scratch = 0; + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_bind_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor* tensor, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != 
LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0 && tensor != 0) { + libxsmm_dnn_tensor_datalayout* handle_layout = libxsmm_dnn_softmaxloss_create_tensor_datalayout(handle, type, &status); + + if ( libxsmm_dnn_compare_tensor_datalayout(handle_layout, tensor->layout, &status) == 0 ) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = (libxsmm_dnn_tensor*)tensor; + } else if ( type == LIBXSMM_DNN_LABEL ) { + handle->label = (libxsmm_dnn_tensor*)tensor; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_MISMATCH_TENSOR; + } + + libxsmm_dnn_destroy_tensor_datalayout( handle_layout ); + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE_TENSOR; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_softmaxloss_get_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor* return_tensor = 0; + + *status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { + *status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return return_tensor; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + return_tensor = handle->reg_input; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + return_tensor = handle->grad_input; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + return_tensor = handle->reg_output; + } else if ( type == LIBXSMM_DNN_LABEL ) { + return_tensor = handle->label; + } else 
{ + /* cannot happen */ + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return return_tensor; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_release_tensor(libxsmm_dnn_softmaxloss* handle, const libxsmm_dnn_tensor_type type) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check for tensor type */ + if ( (type != LIBXSMM_DNN_REGULAR_INPUT) && (type != LIBXSMM_DNN_GRADIENT_INPUT) && + (type != LIBXSMM_DNN_REGULAR_OUTPUT) && (type != LIBXSMM_DNN_LABEL) ) { + status = LIBXSMM_DNN_ERR_UNKNOWN_TENSOR_TYPE; + return status; + } + + if (handle != 0) { + if ( type == LIBXSMM_DNN_REGULAR_INPUT ) { + handle->reg_input = 0; + } else if ( type == LIBXSMM_DNN_GRADIENT_INPUT ) { + handle->grad_input = 0; + } else if ( type == LIBXSMM_DNN_REGULAR_OUTPUT ) { + handle->reg_output = 0; + } else if ( type == LIBXSMM_DNN_LABEL ) { + handle->label = 0; + } else { + /* cannot happen */ + } + } else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_execute_st(libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_compute_kind kind, + /*unsigned*/int start_thread, /*unsigned*/int tid) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + switch (kind) { + case LIBXSMM_DNN_COMPUTE_KIND_FWD: { + status = libxsmm_dnn_softmaxloss_st_fwd_ncnc( handle, start_thread, tid ); + } break; + case LIBXSMM_DNN_COMPUTE_KIND_BWD: { + status = libxsmm_dnn_softmaxloss_st_bwd_ncnc( handle, start_thread, tid ); + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_KIND; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return status; +} + +LIBXSMM_API float libxsmm_dnn_softmaxloss_get_loss(const libxsmm_dnn_softmaxloss* handle, libxsmm_dnn_err_t* status) { + float l_loss = 0.0f; + *status = LIBXSMM_DNN_SUCCESS; + + if (0 != handle) { + l_loss = handle->loss; + } else { + *status = LIBXSMM_DNN_ERR_INVALID_HANDLE; + } + + return l_loss; +} + diff 
--git a/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.c b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.c new file mode 100644 index 00000000..b9dd837c --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.c @@ -0,0 +1,103 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_softmaxloss_backward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef int element_label_type; + +# include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* 
handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef int element_label_type; + +# define LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512 +# include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" +# undef LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and mask */ + if ( handle->grad_input == 0 || handle->reg_output == 0 || handle->label == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { + if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_softmaxloss_st_bwd_ncnc_f32_f32( handle, start_thread, tid); + } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_softmaxloss_st_bwd_ncnc_bf16_bf16( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef int element_label_type; + +# include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" + } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef int 
element_label_type; + +# define LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16 +# include "template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c" +# undef LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16 + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.h b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.h new file mode 100644 index 00000000..6fbe1b91 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_backward.h @@ -0,0 +1,18 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H +#define LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_bwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_SOFTMAXLOSS_BACKWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.c b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.c new file mode 100644 index 00000000..ee351b2a --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.c @@ -0,0 +1,103 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#include "libxsmm_dnn_softmaxloss_forward.h" +#include "libxsmm_main.h" + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef float element_input_type; + typedef float element_output_type; + typedef int element_label_type; + +# include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" +#else /* should not happen */ + LIBXSMM_UNUSED(handle); LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef int element_label_type; + +# define LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512 +# include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" +# undef LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512 +#else /* should not happen */ + LIBXSMM_UNUSED(handle); 
LIBXSMM_UNUSED(start_thread); LIBXSMM_UNUSED(tid); + status = LIBXSMM_DNN_ERR_UNSUPPORTED_ARCH; +#endif + return status; +} + + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid) +{ + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + /* check if we have input, output and mask */ + if ( handle->reg_input == 0 || handle->reg_output == 0 || handle->label == 0 ) { + status = LIBXSMM_DNN_ERR_DATA_NOT_BOUND; + return status; + } + + /* check if we are on an AVX512 platform */ +#if defined(LIBXSMM_INTRINSICS_AVX512) /*__AVX512F__*/ + if ( libxsmm_target_archid >= LIBXSMM_X86_AVX512 ) { + if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + status = libxsmm_dnn_softmaxloss_st_fwd_ncnc_f32_f32( handle, start_thread, tid); + } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + status = libxsmm_dnn_softmaxloss_st_fwd_ncnc_bf16_bf16( handle, start_thread, tid); + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } else +#endif + { + if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_F32 ) { + typedef float element_input_type; + typedef float element_output_type; + typedef int element_label_type; + +# include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" + } else if ( handle->desc.datatype == LIBXSMM_DNN_DATATYPE_BF16 ) { + typedef libxsmm_bfloat16 element_input_type; + typedef libxsmm_bfloat16 element_output_type; + typedef int element_label_type; + +# define LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16 +# include "template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c" +# undef LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16 + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + return status; + } + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.h b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.h new file mode 100644 index 00000000..e40464b8 --- /dev/null +++ 
b/third_party/libxsmm/src/libxsmm_dnn_softmaxloss_forward.h @@ -0,0 +1,18 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H +#define LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H + +#include + +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_softmaxloss_st_fwd_ncnc(libxsmm_dnn_softmaxloss* handle, int start_thread, int tid); + +#endif /* LIBXSMM_DNN_SOFTMAXLOSS_FORWARD_H */ diff --git a/third_party/libxsmm/src/libxsmm_dnn_tensor.c b/third_party/libxsmm/src/libxsmm_dnn_tensor.c new file mode 100644 index 00000000..e9501097 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_dnn_tensor.c @@ -0,0 +1,642 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include +#include "libxsmm_main.h" +#include "libxsmm_dnn_tensor.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(_OPENMP) +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_tensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, libxsmm_dnn_err_t* status) +{ + return libxsmm_dnn_link_qtensor(layout, data, 0, status); +} + + +LIBXSMM_API libxsmm_dnn_tensor* libxsmm_dnn_link_qtensor(const libxsmm_dnn_tensor_datalayout* layout, const void* data, const unsigned char scf, libxsmm_dnn_err_t* status) +{ + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + libxsmm_dnn_tensor* tensor = (libxsmm_dnn_tensor*)calloc(1, sizeof(libxsmm_dnn_tensor)); + *status = LIBXSMM_DNN_SUCCESS; + + if (layout != 0 && tensor != 0 && data != 0) { + tensor->layout = libxsmm_dnn_duplicate_tensor_datalayout(layout, status); + tensor->data = (void*)data; + tensor->scf = scf; + /* when layout copy failed, free layout */ + if (*status != LIBXSMM_DNN_SUCCESS) { + libxsmm_dnn_destroy_tensor_datalayout(tensor->layout); + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_TENSOR; + } + + if (*status != LIBXSMM_DNN_SUCCESS) { + free((libxsmm_dnn_tensor*)tensor); + tensor = 0; + } + + return tensor; +} + + +LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_duplicate_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) { + libxsmm_dnn_tensor_datalayout* dst_layout; + + *status = LIBXSMM_DNN_SUCCESS; + dst_layout = 0; + + if (layout != 0 && layout->num_dims != 0) { + unsigned int dim = 0; + + /* zero entire content; not only safer but also sets data and code pointers to NULL */ + dst_layout = (libxsmm_dnn_tensor_datalayout*)calloc(1, 
sizeof(libxsmm_dnn_tensor_datalayout)); + if (0 != dst_layout) { + dst_layout->dim_type = (libxsmm_dnn_tensor_dimtype*)malloc(layout->num_dims * sizeof(libxsmm_dnn_tensor_dimtype)); + dst_layout->dim_size = (unsigned int*)malloc(layout->num_dims * sizeof(unsigned int)); + dst_layout->num_dims = layout->num_dims; + dst_layout->format = layout->format; + dst_layout->datatype = layout->datatype; + dst_layout->tensor_type = layout->tensor_type; + if (0 != dst_layout->dim_type && 0 != dst_layout->dim_size) { + for (dim = 0; dim < layout->num_dims; ++dim) { + dst_layout->dim_type[dim] = layout->dim_type[dim]; + dst_layout->dim_size[dim] = layout->dim_size[dim]; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } else { + *status = LIBXSMM_DNN_ERR_CREATE_LAYOUT; + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; + } + + return dst_layout; +} + + +LIBXSMM_API unsigned int libxsmm_dnn_compare_tensor_datalayout(const libxsmm_dnn_tensor_datalayout* layout_a, const libxsmm_dnn_tensor_datalayout* layout_b, libxsmm_dnn_err_t* status) { + unsigned int result = 0; + *status = LIBXSMM_DNN_SUCCESS; + + if (layout_a != 0 && layout_b != 0) { + unsigned int dim = 0; + + if (layout_a->num_dims != layout_b->num_dims) { result = 1; } + if (layout_a->format != layout_b->format) { result = 1; } + if (layout_a->datatype != layout_b->datatype) { result = 1; } + + if (result == 0) { + for ( dim = 0; dim < layout_a->num_dims; ++dim ) { + if ( layout_a->dim_type[dim] != layout_b->dim_type[dim] ) { result = 1; } + if ( layout_a->dim_size[dim] != layout_b->dim_size[dim] ) { result = 1; } + } + } + } else { + *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT; + result = 100; + } + + return result; +} + + +LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor_datalayout(libxsmm_dnn_tensor_datalayout* layout) { + libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS; + + if (0 != layout) { + free(layout->dim_type); + free(layout->dim_size); + free(layout); + } + else { + status = 
LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+
+  return status;
+}
+
+
+/*
+ * Size of a tensor with the given layout: the size of one element
+ * (libxsmm_dnn_typesize of the datatype) multiplied by every dim_size entry.
+ * Returns 0 and sets *status to LIBXSMM_DNN_ERR_INVALID_LAYOUT for a NULL
+ * layout. status must not be NULL.
+ */
+LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_size(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) {
+  unsigned int nbytes = 0;
+
+  if (0 == layout) {
+    *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else {
+    unsigned int i;
+    *status = LIBXSMM_DNN_SUCCESS;
+    nbytes = (unsigned int)libxsmm_dnn_typesize(layout->datatype);
+    for (i = 0; i < layout->num_dims; ++i) {
+      nbytes *= layout->dim_size[i];
+    }
+  }
+
+  return nbytes;
+}
+
+
+/*
+ * Number of elements described by the layout, i.e. the product of all
+ * dim_size entries (1 if num_dims is zero).
+ * Returns 0 and sets *status to LIBXSMM_DNN_ERR_INVALID_LAYOUT for a NULL
+ * layout. status must not be NULL.
+ */
+LIBXSMM_API unsigned int libxsmm_dnn_get_tensor_elements(const libxsmm_dnn_tensor_datalayout* layout, libxsmm_dnn_err_t* status) {
+  unsigned int count = 0;
+
+  if (0 == layout) {
+    *status = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else {
+    unsigned int i;
+    *status = LIBXSMM_DNN_SUCCESS;
+    count = 1;
+    for (i = 0; i < layout->num_dims; ++i) {
+      count *= layout->dim_size[i];
+    }
+  }
+
+  return count;
+}
+
+
+/*
+ * Points the tensor at a different data buffer (the buffer is not copied).
+ * Returns
+ *  - LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor or data is NULL,
+ *  - LIBXSMM_DNN_ERR_INVALID_LAYOUT if the tensor has no layout or the
+ *    layout has no dimensions,
+ *  - LIBXSMM_DNN_SUCCESS otherwise.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_tensor_data_ptr(libxsmm_dnn_tensor* tensor, const void* data) {
+  libxsmm_dnn_err_t result;
+
+  if ((0 == tensor) || (0 == data)) {
+    result = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+  else if ((0 == tensor->layout) || !(0 < tensor->layout->num_dims)) {
+    result = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else {
+    tensor->data = (void*)data;
+    result = LIBXSMM_DNN_SUCCESS;
+  }
+
+  return result;
+}
+
+
+/*
+ * Returns the data pointer stored in the tensor, or NULL with *status set
+ * to LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL.
+ * status must not be NULL.
+ */
+LIBXSMM_API void* libxsmm_dnn_get_tensor_data_ptr(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status)
+{
+  void* data_ptr = 0;
+
+  if (0 == tensor) {
+    *status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+  else {
+    *status = LIBXSMM_DNN_SUCCESS;
+    data_ptr = tensor->data;
+  }
+
+  return data_ptr;
+}
+
+
+/*
+ * Returns a copy of the tensor's layout, produced by
+ * libxsmm_dnn_duplicate_tensor_datalayout (which also sets *status).
+ * For a NULL tensor, NULL is returned and *status is
+ * LIBXSMM_DNN_ERR_INVALID_TENSOR. status must not be NULL.
+ */
+LIBXSMM_API libxsmm_dnn_tensor_datalayout* libxsmm_dnn_get_tensor_datalayout(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status) {
+  libxsmm_dnn_tensor_datalayout* dst_layout = NULL;
+  *status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != tensor) {
+
dst_layout = libxsmm_dnn_duplicate_tensor_datalayout( tensor->layout, status );
+  }
+  else {
+    *status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+
+  return dst_layout;
+}
+
+
+/*
+ * Returns the scf value stored in the tensor, or 0 with *status set to
+ * LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL.
+ * status must not be NULL.
+ */
+LIBXSMM_API unsigned char libxsmm_dnn_get_qtensor_scf(const libxsmm_dnn_tensor* tensor, libxsmm_dnn_err_t* status)
+{
+  *status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != tensor) {
+    return tensor->scf;
+  }
+  else {
+    *status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+
+  return 0;
+}
+
+
+/*
+ * Stores scf in the tensor.
+ * Returns LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_set_qtensor_scf(libxsmm_dnn_tensor* tensor, const unsigned char scf)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != tensor) {
+    tensor->scf = scf;
+  }
+  else {
+    status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+
+  return status;
+}
+
+
+/*
+ * Releases the handle and the layout stored in it. The data buffer the
+ * tensor points to is not freed. Always returns LIBXSMM_DNN_SUCCESS;
+ * a NULL tensor is accepted.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_destroy_tensor(const libxsmm_dnn_tensor* tensor)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != tensor) { /* it is not an error attempting to destroy a NULL-handle */
+    /* free layout information stored in tensor */
+    if (0 != tensor->layout) {
+      libxsmm_dnn_destroy_tensor_datalayout( (libxsmm_dnn_tensor_datalayout*)tensor->layout );
+    }
+    /* deallocate handle structure */
+    free(/*remove constness*/(libxsmm_dnn_tensor*)tensor);
+  }
+#if 0 /* releasing a NULL-buffer should not be an error (similar to freeing a NULL pointer) */
+  else {
+    status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+#endif
+  return status;
+}
+
+
+/*
+ * Copies data given in in_format into the tensor. The copy loops come from
+ * included template sources (not shown here); this function only selects
+ * one by tensor type, in_format, and datatype.
+ *
+ * Returns
+ *  - LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL or its tensor_type is
+ *    not handled,
+ *  - LIBXSMM_DNN_ERR_INVALID_LAYOUT if the tensor has no layout,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT if in_format is not supported
+ *    for the tensor type,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT if the tensor's own format lacks
+ *    LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE if the datatype is not handled.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyin_tensor(const libxsmm_dnn_tensor* tensor, const void* data, const libxsmm_dnn_tensor_format in_format)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* @TODO check for valid combination */
+
+  if (0 != tensor && 0 == tensor->layout) { /* the dispatch below dereferences the layout */
+    status = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else if (0 != tensor) {
+    switch (tensor->layout->tensor_type) {
+      case LIBXSMM_DNN_REGULAR_INPUT:
+      case LIBXSMM_DNN_GRADIENT_INPUT:
+      case LIBXSMM_DNN_REGULAR_OUTPUT:
+      case LIBXSMM_DNN_GRADIENT_OUTPUT:
+      case LIBXSMM_DNN_INPUT:
+      case LIBXSMM_DNN_OUTPUT:
+      case LIBXSMM_DNN_ACTIVATION: {
+        switch (in_format) {
+          case
LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I32: { + typedef int element_type; +#include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I8: { + typedef unsigned char element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; + } + } + } break; + case LIBXSMM_DNN_REGULAR_FILTER: + case LIBXSMM_DNN_GRADIENT_FILTER: + case LIBXSMM_DNN_FILTER: { + switch (in_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_KCRS: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#include 
"template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I8: { + typedef char element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c" + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; + } + } + } break; + case LIBXSMM_DNN_REGULAR_CHANNEL_BIAS: + case LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS: + case LIBXSMM_DNN_CHANNEL_BIAS: + case LIBXSMM_DNN_REGULAR_CHANNEL_BETA: + case LIBXSMM_DNN_GRADIENT_CHANNEL_BETA: + case LIBXSMM_DNN_CHANNEL_BETA: + case LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA: + case LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA: + case LIBXSMM_DNN_CHANNEL_GAMMA: + case LIBXSMM_DNN_CHANNEL_EXPECTVAL: + case LIBXSMM_DNN_CHANNEL_RCPSTDDEV: + case LIBXSMM_DNN_CHANNEL_VARIANCE: + case LIBXSMM_DNN_CHANNEL_SCALAR: { + switch (in_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I8: { + typedef char element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } 
else {
+                status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT;
+              }
+            } break;
+            default: {
+              status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT;
+            }
+          }
+        } break;
+        default: {
+          status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+        }
+      }
+    }
+    else {
+      status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+    }
+
+  return status;
+}
+
+
+/*
+ * Sets every element of the tensor's data buffer to zero. The element count
+ * is taken from libxsmm_dnn_get_tensor_elements and the element type from
+ * the layout's datatype.
+ *
+ * Returns
+ *  - LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL,
+ *  - LIBXSMM_DNN_ERR_INVALID_LAYOUT if the tensor has no layout,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE if the datatype is not handled,
+ *  - LIBXSMM_DNN_SUCCESS otherwise.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_zero_tensor(const libxsmm_dnn_tensor* tensor)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  if (0 != tensor && 0 == tensor->layout) {
+    /* Without this guard the error reported by libxsmm_dnn_get_tensor_elements
+     * was ignored and the NULL layout was dereferenced by the switch below. */
+    status = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else if (0 != tensor) {
+    const size_t size = libxsmm_dnn_get_tensor_elements( tensor->layout, &status );
+    size_t i;
+    /* use for-loops to potentially leverage NUMA in the future */
+    switch (tensor->layout->datatype) {
+      case LIBXSMM_DNN_DATATYPE_F32: {
+        float* fp32_data = (float*)tensor->data;
+        for (i = 0; i < size; ++i) fp32_data[i] = 0.0f;
+      } break;
+      case LIBXSMM_DNN_DATATYPE_BF16: {
+        libxsmm_bfloat16* bfp16_data = (libxsmm_bfloat16*)tensor->data;
+        for (i = 0; i < size; ++i) bfp16_data[i] = 0;
+      } break;
+      case LIBXSMM_DNN_DATATYPE_I32: {
+        int* int32_data = (int*)tensor->data;
+        for (i = 0; i < size; ++i) int32_data[i] = 0;
+      } break;
+      case LIBXSMM_DNN_DATATYPE_I16: {
+        short* int16_data = (short*)tensor->data;
+        for (i = 0; i < size; ++i) int16_data[i] = 0;
+      } break;
+      case LIBXSMM_DNN_DATATYPE_I8: {
+        char* int8_data = (char*)tensor->data;
+        for (i = 0; i < size; ++i) int8_data[i] = 0;
+      } break;
+      default: {
+        status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE;
+      }
+    }
+  }
+  else {
+    status = LIBXSMM_DNN_ERR_INVALID_TENSOR;
+  }
+
+  return status;
+}
+
+
+/*
+ * Copies the tensor's content into data using out_format. The copy loops
+ * come from included template sources (not shown here); this function only
+ * selects one by tensor type, out_format, and datatype.
+ *
+ * Returns
+ *  - LIBXSMM_DNN_ERR_INVALID_TENSOR if tensor is NULL or its tensor_type is
+ *    not handled,
+ *  - LIBXSMM_DNN_ERR_INVALID_LAYOUT if the tensor has no layout,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT if out_format is not supported
+ *    for the tensor type,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT if the tensor's own format lacks
+ *    LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM,
+ *  - LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE if the datatype is not handled.
+ */
+LIBXSMM_API libxsmm_dnn_err_t libxsmm_dnn_copyout_tensor(const libxsmm_dnn_tensor* tensor, void* data, const libxsmm_dnn_tensor_format out_format)
+{
+  libxsmm_dnn_err_t status = LIBXSMM_DNN_SUCCESS;
+
+  /* @TODO check for valid combination */
+
+  if (0 != tensor && 0 == tensor->layout) { /* the dispatch below dereferences the layout */
+    status = LIBXSMM_DNN_ERR_INVALID_LAYOUT;
+  }
+  else if (0 != tensor) {
+    switch (tensor->layout->tensor_type) {
+      case LIBXSMM_DNN_REGULAR_INPUT:
+      case LIBXSMM_DNN_GRADIENT_INPUT:
+      case LIBXSMM_DNN_REGULAR_OUTPUT:
+      case LIBXSMM_DNN_GRADIENT_OUTPUT:
+      case
LIBXSMM_DNN_INPUT: + case LIBXSMM_DNN_OUTPUT: + case LIBXSMM_DNN_ACTIVATION: { + switch (out_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I32: { + typedef int element_type; +#include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I8: { + typedef unsigned char element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; + } + } + } break; + case LIBXSMM_DNN_REGULAR_FILTER: + case LIBXSMM_DNN_GRADIENT_FILTER: + case LIBXSMM_DNN_FILTER: { + switch (out_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_KCRS: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" + } break; + + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#include 
"template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I32: { + typedef int element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_I8: { + typedef char element_type; +#include "template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c" + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; + } + } + } break; + case LIBXSMM_DNN_REGULAR_CHANNEL_BIAS: + case LIBXSMM_DNN_GRADIENT_CHANNEL_BIAS: + case LIBXSMM_DNN_CHANNEL_BIAS: + case LIBXSMM_DNN_REGULAR_CHANNEL_BETA: + case LIBXSMM_DNN_GRADIENT_CHANNEL_BETA: + case LIBXSMM_DNN_CHANNEL_BETA: + case LIBXSMM_DNN_REGULAR_CHANNEL_GAMMA: + case LIBXSMM_DNN_GRADIENT_CHANNEL_GAMMA: + case LIBXSMM_DNN_CHANNEL_GAMMA: + case LIBXSMM_DNN_CHANNEL_EXPECTVAL: + case LIBXSMM_DNN_CHANNEL_RCPSTDDEV: + case LIBXSMM_DNN_CHANNEL_VARIANCE: + case LIBXSMM_DNN_CHANNEL_SCALAR: { + switch (out_format) { + case LIBXSMM_DNN_TENSOR_FORMAT_NCHW: { + if ( (tensor->layout->format & LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM) > 0 ) { + switch (tensor->layout->datatype) { + case LIBXSMM_DNN_DATATYPE_F32: { + typedef float element_type; +#include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" + } break; + case LIBXSMM_DNN_DATATYPE_BF16: { + typedef libxsmm_bfloat16 element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + case LIBXSMM_DNN_DATATYPE_I16: { + typedef short element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + 
case LIBXSMM_DNN_DATATYPE_I8: { + typedef char element_type; +#define LIBXSMM_DNN_COPY_LOW_PRECISION +#include "template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c" +#undef LIBXSMM_DNN_COPY_LOW_PRECISION + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DATATYPE; + } + } + } else { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_SRC_FORMAT; + } + } break; + default: { + status = LIBXSMM_DNN_ERR_UNSUPPORTED_DST_FORMAT; + } + } + } break; + default: { + status = LIBXSMM_DNN_ERR_INVALID_TENSOR; + } + } + } + else { + status = LIBXSMM_DNN_ERR_INVALID_TENSOR; + } + + return status; +} + diff --git a/third_party/libxsmm/src/libxsmm_ext.c b/third_party/libxsmm/src/libxsmm_ext.c new file mode 100644 index 00000000..42bc227a --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_ext.c @@ -0,0 +1,267 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_ext.h" +#include "libxsmm_gemm.h" +#include + + +#if defined(LIBXSMM_BUILD) +#if defined(LIBXSMM_BUILD_EXT) && !defined(__STATIC) + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(dgemm_batch)(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], + const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + if (LIBXSMM_FSYMBOL(__real_dgemm_batch) != libxsmm_original_dgemm_batch_function) { + LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else { + libxsmm_blas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(sgemm_batch)(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], + const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + if (LIBXSMM_FSYMBOL(__real_sgemm_batch) != 
libxsmm_original_sgemm_batch_function) { + LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else { + libxsmm_blas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(dgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) +{ + if (LIBXSMM_FSYMBOL(__real_dgemm) != libxsmm_original_dgemm_function) { + LIBXSMM_FSYMBOL(__wrap_dgemm)(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + else { + libxsmm_blas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(sgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) +{ + if (LIBXSMM_FSYMBOL(__real_sgemm) != libxsmm_original_sgemm_function) { + LIBXSMM_FSYMBOL(__wrap_sgemm)(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + else { + libxsmm_blas_error("sgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(dgemv)(const char* trans, const libxsmm_blasint* m, 
const libxsmm_blasint* n, + const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, + const double* beta, double* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) +{ + if (LIBXSMM_FSYMBOL(__real_dgemv) != libxsmm_original_dgemv_function) { + LIBXSMM_FSYMBOL(__wrap_dgemv)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + else { + libxsmm_blas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void LIBXSMM_FSYMBOL(sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, + const float* beta, float* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) +{ + if (LIBXSMM_FSYMBOL(__real_sgemv) != libxsmm_original_sgemv_function) { + LIBXSMM_FSYMBOL(__wrap_sgemv)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + else { + libxsmm_blas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void dgemm_batch(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], + const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_WEAK +void sgemm_batch(const char 
transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], + const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + +#elif (0 != LIBXSMM_NO_BLAS) /* no-BLAS library */ + +LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state0[16]); +LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state1[16]); +LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state2[16]); +LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_ATTRIBUTE_COMMON unsigned int libxsmm_intrinsics_mm512_rng_state3[16]); + +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_noblas_sink(LIBXSMM_VARIADIC); +LIBXSMM_API_INTERN void internal_noblas_sink(LIBXSMM_VARIADIC) +{ + /* does nothing else but sinking given arguments */ +} + +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE libxsmm_sink_function internal_noblas_error(const char* /*symbol*/); +LIBXSMM_API_INTERN libxsmm_sink_function internal_noblas_error(const char* symbol) +{ + static int internal_noblas_nerror = 0; + LIBXSMM_BLAS_ERROR(symbol, &internal_noblas_nerror); + return internal_noblas_sink; +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(dgemm_batch)(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint 
k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], + const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + internal_noblas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(sgemm_batch)(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], + const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + internal_noblas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(dgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) +{ + internal_noblas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY 
LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(sgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) LIBXSMM_BLAS_NOEXCEPT(gemm) +{ + internal_noblas_error("sgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, + const double* beta, double* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) +{ + internal_noblas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE /*LIBXSMM_ATTRIBUTE_WEAK*/ +void LIBXSMM_FSYMBOL(sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, + const float* beta, float* y, const libxsmm_blasint* incy) LIBXSMM_BLAS_NOEXCEPT(gemv) +{ + internal_noblas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE +void dgemm_batch(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], + const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const 
libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_BLAS_SYMBOL_VISIBILITY LIBXSMM_ATTRIBUTE_NO_TRACE +void sgemm_batch(const char transa_array[], const char transb_array[], + const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], + const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], + const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) LIBXSMM_BLAS_NOEXCEPT(gemm_batch) +{ + LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + +#endif +#endif /*defined(LIBXSMM_BUILD)*/ + diff --git a/third_party/libxsmm/src/libxsmm_ext.h b/third_party/libxsmm/src/libxsmm_ext.h new file mode 100644 index 00000000..1f682889 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_ext.h @@ -0,0 +1,46 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/
+#ifndef LIBXSMM_EXT_H
+#define LIBXSMM_EXT_H
+
+#include "libxsmm_main.h"
+
+#if defined(LIBXSMM_OFFLOAD_TARGET)
+# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
+#endif
+#if defined(_OPENMP)
+# if !defined(__INTEL_COMPILER) /* silence -Wpedantic only while the OpenMP header is included */
+#   if defined(__clang__)
+#     pragma clang diagnostic push
+#   elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)
+#     pragma GCC diagnostic push
+#   endif
+#   if defined(__clang__)
+#     pragma clang diagnostic ignored "-Wpedantic"
+#   elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)
+#     pragma GCC diagnostic ignored "-Wpedantic"
+#   endif
+# endif
+# include <omp.h>
+# if !defined(__INTEL_COMPILER) /* same condition as the push above, so that every push is popped */
+#   if defined(__clang__)
+#     pragma clang diagnostic pop
+#   elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__)
+#     pragma GCC diagnostic pop
+#   endif
+# endif
+#endif
+#if defined(LIBXSMM_OFFLOAD_TARGET)
+# pragma offload_attribute(pop)
+#endif
+
+#endif /*LIBXSMM_EXT_H*/
+
diff --git a/third_party/libxsmm/src/libxsmm_ext_gemm.c b/third_party/libxsmm/src/libxsmm_ext_gemm.c
new file mode 100644
index 00000000..9a17e35c
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_ext_gemm.c
@@ -0,0 +1,1268 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Hans Pabst (Intel Corp.)
+******************************************************************************/ +#include +#include "libxsmm_gemm.h" +#include "libxsmm_ext.h" + +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) +# include "libxsmm_trace.h" +#endif + +#if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) && 0 +# define LIBXSMM_EXT_GEMM_PARGROUPS_INFO +#endif + +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) +# if !defined(LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH) +# define LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO) +# endif +# if !defined(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) +# define LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH 8/*POT*/ +# endif +LIBXSMM_APIVAR_DEFINE(libxsmm_gemm_descriptor internal_ext_gemm_batchdesc[LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH]); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_ext_gemm_batchdepth); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_ext_gemm_batchsize); +#endif + + +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) +LIBXSMM_API_INLINE int internal_mmbatch_sortrev(const void* stat_a, const void* stat_b) +{ + const libxsmm_mmbatch_item *const a = (const libxsmm_mmbatch_item*)stat_a; + const libxsmm_mmbatch_item *const b = (const libxsmm_mmbatch_item*)stat_b; + LIBXSMM_ASSERT(NULL != stat_a && NULL != stat_b); + return a->stat.count < b->stat.count ? 1 : (b->stat.count < a->stat.count ? 
-1 : 0); +} +#endif /*defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT)*/ + + +LIBXSMM_API_INLINE int internal_mmbatch_flush(const libxsmm_gemm_descriptor* batchdesc, + libxsmm_blasint batchsize, libxsmm_mmbatch_item* batcharray) +{ + int result = EXIT_SUCCESS; +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + if (0 != batchsize) { /* recorded/lazy multiplications */ + const libxsmm_blasint itemsize = sizeof(libxsmm_mmbatch_item); + LIBXSMM_ASSERT(NULL != batchdesc && 0 < batchsize); + if (0 == (LIBXSMM_MMBATCH_FLAG_STATISTIC & batchdesc->flags)) { /* process batch */ + const libxsmm_xmmfunction kernel = libxsmm_xmmdispatch(batchdesc); + if (NULL != kernel.xmm) { + const unsigned char itypesize = libxsmm_typesize((libxsmm_datatype)LIBXSMM_GETENUM_INP(batchdesc->datatype)); + const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)LIBXSMM_GETENUM_OUT(batchdesc->datatype)); +#if defined(_OPENMP) + if (0 == (LIBXSMM_MMBATCH_FLAG_SEQUENTIAL & batchdesc->flags)) { /* parallelized */ + const int nchunks = (int)LIBXSMM_UPDIV(batchsize, libxsmm_gemm_taskgrain); +# if defined(LIBXSMM_EXT_TASKS) + if (0 == omp_get_active_level()) { + const int max_nthreads = omp_get_max_threads(); + const int nthreads = LIBXSMM_MIN(max_nthreads, nchunks); + if (0 == libxsmm_gemm_tasks) +# else + if (0 == omp_in_parallel()) { + const int max_nthreads = omp_get_max_threads(); + const int nthreads = LIBXSMM_MIN(max_nthreads, nchunks); +# endif + { /* classic internal parallelization */ +# pragma omp parallel num_threads(nthreads) + /*check*/libxsmm_mmbatch_kernel( + kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, + &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, + 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? 
batchsize : -batchsize, + omp_get_thread_num(), nthreads, itypesize, otypesize, batchdesc->flags); + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* internal parallelization with tasks */ +# pragma omp parallel num_threads(nthreads) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + { int tid; for (tid = 0; tid < nchunks/*ntasks*/; ++tid) { +# pragma omp task untied + /*check*/libxsmm_mmbatch_kernel( + kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, + &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, + 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? batchsize : -batchsize, + tid, nchunks/*ntasks*/, itypesize, otypesize, batchdesc->flags); + } + } + } /* implicit synchronization (barrier) */ + } +# endif + } + else { /* assume external parallelization */ + int tid; for (tid = 0; tid < nchunks/*ntasks*/; ++tid) { +# if defined(LIBXSMM_EXT_TASKS) +# pragma omp task untied +#endif + /*check*/libxsmm_mmbatch_kernel( + kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, + &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, + 0 == (LIBXSMM_MMBATCH_FLAG_SYNCHRONIZED & batchdesc->flags) ? batchsize : -batchsize, + tid, nchunks/*ntasks*/, itypesize, otypesize, batchdesc->flags); + } +# if defined(LIBXSMM_EXT_TASKS) + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# endif + } + } + else +#endif + { /* sequential */ + result = libxsmm_mmbatch_kernel( + kernel, 0/*index_base*/, 0/*index_stride*/, &itemsize, &itemsize, &itemsize, + &batcharray->value.a, &batcharray->value.b, &batcharray->value.c, batchsize, + 0/*tid*/, 1/*nthreads*/, itypesize, otypesize, batchdesc->flags); + } + } + else { /* no fallback */ + /* several reasons to arrive here: try-lock, unsuitable SMM, etc. 
*/ + result = EXIT_FAILURE; + } + memset(batcharray, 0, (size_t)batchsize * (size_t)itemsize); /* clear */ + } + else { /* print statistic */ + const libxsmm_blasint limit = (LIBXSMM_GEMM_MMBATCH_VERBOSITY < libxsmm_verbosity ? batchsize/*unlimited*/ : 7/*limited*/); + unsigned int threshold, batchcount; + libxsmm_blasint count = 0, i; + LIBXSMM_ASSERT(NULL != batcharray); + qsort(batcharray, (size_t)batchsize, (size_t)itemsize, internal_mmbatch_sortrev); + batchcount = batcharray[0].stat.count; + threshold = ((LIBXSMM_GEMM_MMBATCH_VERBOSITY < libxsmm_verbosity || 3 >= batchsize) ? 0 : (batchcount / 2)); + for (i = 1; i < batchsize; ++i) batchcount += batcharray[i].stat.count; + LIBXSMM_STDIO_ACQUIRE(); + for (i = 0; i < batchsize; ++i) { + const libxsmm_gemm_descriptor descriptor = batcharray[i].stat.desc; + const libxsmm_blasint lda = descriptor.lda, ldb = descriptor.ldb, ldc = descriptor.ldc; + const libxsmm_blasint m = descriptor.m, n = descriptor.n, k = descriptor.k; + const char *const symbol = batcharray[i].stat.symbol; + const unsigned int ci = batcharray[i].stat.count; + LIBXSMM_MEMZERO127(batcharray + i); /* clear */ + if (threshold < ci && count < limit /* limit printed statistic */ + && 0 < m && 0 < n && 0 < k) + { + const unsigned int ciperc = (unsigned int)(100.0 * ci / batchcount + 0.5); + if (0 != ciperc) { + LIBXSMM_ASSERT(0 != ci); + if (0 == count) { + fprintf(stderr, "\nLIBXSMM STATISTIC: %u multiplication%c\n", batchcount, 1 < batchcount ? 's' : ' '); + } + LIBXSMM_GEMM_PRINT2(stderr, + LIBXSMM_GETENUM_INP(descriptor.datatype), LIBXSMM_GETENUM_OUT(descriptor.datatype), descriptor.flags, m, n, k, + /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & descriptor.flags) ? 0 : */1, NULL/*a*/, lda, NULL/*b*/, ldb, + 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & descriptor.flags) ? 
0 : 1, NULL/*c*/, ldc); + if (NULL != symbol && 0 != *symbol) { + fprintf(stderr, ": %u%% [%s]\n", ciperc, symbol); + } + else { + fprintf(stderr, ": %u%%\n", ciperc); + } + ++count; + } + else break; + } + } + LIBXSMM_STDIO_RELEASE(); + } + } +#else + LIBXSMM_UNUSED(batchdesc); LIBXSMM_UNUSED(batchsize); LIBXSMM_UNUSED(batcharray); +#endif + return result; +} + + +#if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) + +#if defined(LIBXSMM_BLAS_WRAP_DYNAMIC) +LIBXSMM_API libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, double, gemm_batch, libxsmm_original_dgemm_batch_function, libxsmm_original_dgemm_batch/*self*/); + /*LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_batch_function);*/ +# else + LIBXSMM_BLAS_WRAPPER(0, double, gemm_batch, libxsmm_original_dgemm_batch_function, libxsmm_original_dgemm_batch/*self*/); +# endif + return libxsmm_original_dgemm_batch_function; +} + +LIBXSMM_API libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, float, gemm_batch, libxsmm_original_sgemm_batch_function, libxsmm_original_sgemm_batch/*self*/); + /*LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_batch_function);*/ +# else + LIBXSMM_BLAS_WRAPPER(0, float, gemm_batch, libxsmm_original_sgemm_batch_function, libxsmm_original_sgemm_batch/*self*/); +# endif + return libxsmm_original_sgemm_batch_function; +} + +LIBXSMM_API libxsmm_dgemm_function libxsmm_original_dgemm(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, double, gemm, libxsmm_original_dgemm_function, libxsmm_original_dgemm/*self*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_function); +# else + LIBXSMM_BLAS_WRAPPER(0, double, gemm, libxsmm_original_dgemm_function, libxsmm_original_dgemm/*self*/); +# endif + return libxsmm_original_dgemm_function; +} + +LIBXSMM_API libxsmm_sgemm_function libxsmm_original_sgemm(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, 
float, gemm, libxsmm_original_sgemm_function, libxsmm_original_sgemm/*self*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_function); +# else + LIBXSMM_BLAS_WRAPPER(0, float, gemm, libxsmm_original_sgemm_function, libxsmm_original_sgemm/*self*/); +# endif + return libxsmm_original_sgemm_function; +} + +LIBXSMM_API libxsmm_dgemv_function libxsmm_original_dgemv(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, double, gemv, libxsmm_original_dgemv_function, libxsmm_original_dgemv/*self*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_dgemv_function); +# else + LIBXSMM_BLAS_WRAPPER(0, double, gemv, libxsmm_original_dgemv_function, libxsmm_original_dgemv/*self*/); +# endif + return libxsmm_original_dgemv_function; +} + +LIBXSMM_API libxsmm_sgemv_function libxsmm_original_sgemv(void) +{ +# if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, float, gemv, libxsmm_original_sgemv_function, libxsmm_original_sgemv/*self*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_sgemv_function); +# else + LIBXSMM_BLAS_WRAPPER(0, float, gemv, libxsmm_original_sgemv_function, libxsmm_original_sgemv/*self*/); +# endif + return libxsmm_original_sgemv_function; +} +#endif /*defined(LIBXSMM_BLAS_WRAP_DYNAMIC)*/ + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemm_batch)( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_ASSERT(NULL != lda_array && NULL != ldb_array && NULL != ldc_array && NULL != m_array && NULL != n_array && NULL != k_array); + LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != alpha_array && NULL != 
beta_array); + LIBXSMM_ASSERT(NULL != group_count && NULL != group_size); + LIBXSMM_INIT + if (0 != libxsmm_gemm_wrap) { + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + libxsmm_dgemm_batch(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else { /* parallelized */ + libxsmm_dgemm_batch_omp(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + } + else { + LIBXSMM_GEMM_BATCH_SYMBOL(double)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemm_batch)( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_ASSERT(NULL != lda_array && NULL != ldb_array && NULL != ldc_array && NULL != m_array && NULL != n_array && NULL != k_array); + LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != alpha_array && NULL != beta_array); + LIBXSMM_ASSERT(NULL != group_count && NULL != group_size); + LIBXSMM_INIT + if (0 != libxsmm_gemm_wrap) { + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + libxsmm_sgemm_batch(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else { /* parallelized 
*/ + libxsmm_sgemm_batch_omp(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + } + else { + LIBXSMM_GEMM_BATCH_SYMBOL(float)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemm)( + const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_ASSERT(NULL != lda && NULL != ldb && NULL != ldc && NULL != m && NULL != n && NULL != k); + LIBXSMM_ASSERT(NULL != transa && NULL != transb && NULL != alpha && NULL != beta); + { +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + unsigned int i = 0; /* no flush */ + int flags = -1; +# if !defined(NDEBUG) + static int error_once = 0; + int result = EXIT_SUCCESS; +# endif + LIBXSMM_INIT + if (0 != libxsmm_gemm_wrap && (NULL == libxsmm_mmbatch_array + || LIBXSMM_GEMM_PRECISION_F64 != libxsmm_mmbatch_desc.datatype + || ((unsigned int)*lda) != libxsmm_mmbatch_desc.lda + || ((unsigned int)*ldb) != libxsmm_mmbatch_desc.ldb + || ((unsigned int)*ldc) != libxsmm_mmbatch_desc.ldc + || ((unsigned int)*m) != libxsmm_mmbatch_desc.m + || ((unsigned int)*n) != libxsmm_mmbatch_desc.n + || ((unsigned int)*k) != libxsmm_mmbatch_desc.k + || (flags = LIBXSMM_GEMM_FLAGS(*transa, *transb)) != (int)(LIBXSMM_GEMM_FLAG_TRANS_AB & libxsmm_mmbatch_desc.flags) + || LIBXSMM_NEQ(/*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & libxsmm_mmbatch_desc.flags) ? 0 : */1, *alpha) + || LIBXSMM_NEQ(0 != (LIBXSMM_GEMM_FLAG_BETA_0 & libxsmm_mmbatch_desc.flags) ? 
0 : 1, *beta))) +#endif + { +#if defined(_DEBUG) + const char *const env_check = getenv("LIBXSMM_GEMM_CHECK"); + const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); + void* d = NULL; + if (LIBXSMM_NEQ(0, check)) { + const size_t size = (size_t)(*ldc) * (size_t)(*n) * sizeof(double); + d = libxsmm_scratch_malloc(size, 0/*auto*/, LIBXSMM_MALLOC_INTERNAL_CALLER); + if (NULL != d && LIBXSMM_NEQ(0, *beta)) memcpy(d, c, size); /* copy destination */ + } +#endif + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + else { /* parallelized */ + libxsmm_dgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +#if defined(_DEBUG) + if (NULL != d) { + libxsmm_matdiff_info diff; + libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, d, ldc); + if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F64, *m, *n, d, c, ldc, ldc) + && check < 100.0 * diff.normf_rel) + { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "LIBXSMM: "); + libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION_F64, transa, transb, + m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + fprintf(stderr, " => %f%% ERROR\n", 100.0 * diff.normf_rel); + LIBXSMM_STDIO_RELEASE(); + } + libxsmm_free(d); + } +#endif +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + if (0 != (LIBXSMM_MMBATCH_FLAG_STATISTIC & libxsmm_mmbatch_desc.flags)) { + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const descriptor = libxsmm_dgemm_descriptor_init(&blob, + *m, *n, *k, *lda, *ldb, *ldc, *alpha, *beta, LIBXSMM_GEMM_FLAGS(*transa, *transb), + LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH); + + LIBXSMM_ASSERT(0 != libxsmm_mmbatch_size); + if (NULL != descriptor) { + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + const unsigned int batchsize = LIBXSMM_ATOMIC_LOAD(&internal_ext_gemm_batchsize, LIBXSMM_ATOMIC_RELAXED); + 
const unsigned int max_size = (0 != batchsize ? (((batchsize - 1) % max_batchsize) + 1) : 0); + libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; + libxsmm_mmbatch_item* batcharray_cur = batcharray; + unsigned int size = max_size; + if (libxsmm_mmbatch_size < max_size) { + size = max_size - libxsmm_mmbatch_size; + batcharray_cur += libxsmm_mmbatch_size; + } + i = libxsmm_diff_n(descriptor, batcharray_cur, sizeof(libxsmm_gemm_descriptor), + sizeof(libxsmm_mmbatch_item)/*stride*/, 0/*hint*/, size); + + if (i < size) { /* update existing entry */ + LIBXSMM_ATOMIC_ADD_FETCH(&batcharray_cur[i].stat.count, 1, LIBXSMM_ATOMIC_RELAXED); + } + else { /* new entry needed */ + const int all = -1, shift = 0; + void* extra = 0; + i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; + batcharray[i-1].stat.desc = *descriptor; + batcharray[i-1].stat.count = 1; + batcharray[i-1].stat.symbol = libxsmm_trace_info(NULL/*depth*/, NULL/*tid*/, &all, LIBXSMM_FUNCNAME, &shift, &all); + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(libxsmm_mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra)) { + *(libxsmm_mmbatch_flush_function*)extra = libxsmm_mmbatch_end; + } +# if !defined(NDEBUG) + else { + result = EXIT_FAILURE; + } +# endif + } + } + } +#endif + } +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + else { + libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; + batcharray[i-1].value.a = a; + batcharray[i-1].value.b = b; + batcharray[i-1].value.c = c; + LIBXSMM_ASSERT(0 <= flags); + } + if (libxsmm_mmbatch_size == (i - 1)) { /* condition ensure to flush once (first discovery) */ +# if !defined(NDEBUG) + result = +# endif + 
internal_mmbatch_flush(&libxsmm_mmbatch_desc, libxsmm_mmbatch_size, (libxsmm_mmbatch_item*)libxsmm_mmbatch_array); + } +# if !defined(NDEBUG) /* library code is expected to be mute */ + if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity && + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: DGEMM batch recording failed!\n"); + } +# endif +#endif + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemm)( + const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_ASSERT(NULL != lda && NULL != ldb && NULL != ldc && NULL != m && NULL != n && NULL != k); + LIBXSMM_ASSERT(NULL != transa && NULL != transb && NULL != alpha && NULL != beta); + { +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + unsigned int i = 0; /* no flush */ + int flags = -1; +# if !defined(NDEBUG) + static int error_once = 0; + int result = EXIT_SUCCESS; +# endif + LIBXSMM_INIT + if (0 != libxsmm_gemm_wrap && (NULL == libxsmm_mmbatch_array + || LIBXSMM_GEMM_PRECISION_F32 != libxsmm_mmbatch_desc.datatype + || ((unsigned int)*lda) != libxsmm_mmbatch_desc.lda + || ((unsigned int)*ldb) != libxsmm_mmbatch_desc.ldb + || ((unsigned int)*ldc) != libxsmm_mmbatch_desc.ldc + || ((unsigned int)*m) != libxsmm_mmbatch_desc.m + || ((unsigned int)*n) != libxsmm_mmbatch_desc.n + || ((unsigned int)*k) != libxsmm_mmbatch_desc.k + || (flags = LIBXSMM_GEMM_FLAGS(*transa, *transb)) != (int)(LIBXSMM_GEMM_FLAG_TRANS_AB & libxsmm_mmbatch_desc.flags) + || LIBXSMM_NEQ(/*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & libxsmm_mmbatch_desc.flags) ? 0 : */1, *alpha) + || LIBXSMM_NEQ(0 != (LIBXSMM_GEMM_FLAG_BETA_0 & libxsmm_mmbatch_desc.flags) ? 
0 : 1, *beta))) +#endif + { +#if defined(_DEBUG) + const char *const env_check = getenv("LIBXSMM_GEMM_CHECK"); + const double check = LIBXSMM_ABS(NULL == env_check ? 0 : atof(env_check)); + void* d = NULL; + if (LIBXSMM_NEQ(0, check)) { + const size_t size = (size_t)(*ldc) * (size_t)(*n) * sizeof(float); + d = libxsmm_scratch_malloc(size, 0/*auto*/, LIBXSMM_MALLOC_INTERNAL_CALLER); + if (NULL != d && LIBXSMM_NEQ(0, *beta)) memcpy(d, c, size); /* copy destination */ + } +#endif + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } + else { /* parallelized */ + libxsmm_sgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +#if defined(_DEBUG) + if (NULL != d) { + libxsmm_matdiff_info diff; + libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, d, ldc); + if (EXIT_SUCCESS == libxsmm_matdiff(&diff, LIBXSMM_DATATYPE_F32, *m, *n, d, c, ldc, ldc) + && check < 100.0 * diff.normf_rel) + { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "LIBXSMM: "); + libxsmm_gemm_print(stderr, LIBXSMM_GEMM_PRECISION_F32, transa, transb, + m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + fprintf(stderr, " => %f%% ERROR\n", 100.0 * diff.normf_rel); + LIBXSMM_STDIO_RELEASE(); + } + libxsmm_free(d); + } +#endif +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + if (0 != (LIBXSMM_MMBATCH_FLAG_STATISTIC & libxsmm_mmbatch_desc.flags)) { + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const descriptor = libxsmm_sgemm_descriptor_init(&blob, + *m, *n, *k, *lda, *ldb, *ldc, *alpha, *beta, LIBXSMM_GEMM_FLAGS(*transa, *transb), + LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH); + + LIBXSMM_ASSERT(0 != libxsmm_mmbatch_size); + if (NULL != descriptor) { + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + const unsigned int batchsize = LIBXSMM_ATOMIC_LOAD(&internal_ext_gemm_batchsize, LIBXSMM_ATOMIC_RELAXED); + const 
unsigned int max_size = (0 != batchsize ? (((batchsize - 1) % max_batchsize) + 1) : 0); + libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; + libxsmm_mmbatch_item* batcharray_cur = batcharray; + unsigned int size = max_size; + if (libxsmm_mmbatch_size < max_size) { + size = max_size - libxsmm_mmbatch_size; + batcharray_cur += libxsmm_mmbatch_size; + } + i = libxsmm_diff_n(descriptor, batcharray_cur, sizeof(libxsmm_gemm_descriptor), + sizeof(libxsmm_mmbatch_item)/*stride*/, 0/*hint*/, size); + + if (i < size) { /* update existing entry */ + LIBXSMM_ATOMIC_ADD_FETCH(&batcharray_cur[i].stat.count, 1, LIBXSMM_ATOMIC_RELAXED); + } + else { /* new entry needed */ + const int all = -1, shift = 0; + void* extra = 0; + i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; + batcharray[i-1].stat.desc = *descriptor; + batcharray[i-1].stat.count = 1; + batcharray[i-1].stat.symbol = libxsmm_trace_info(NULL/*depth*/, NULL/*tid*/, &all, LIBXSMM_FUNCNAME, &shift, &all); + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(libxsmm_mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra)) { + *(libxsmm_mmbatch_flush_function*)extra = libxsmm_mmbatch_end; + } +# if !defined(NDEBUG) + else { + result = EXIT_FAILURE; + } +# endif + } + } + } +#endif + } +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) + else { + libxsmm_mmbatch_item *const batcharray = (libxsmm_mmbatch_item*)libxsmm_mmbatch_array; + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + i = ((LIBXSMM_ATOMIC_ADD_FETCH(&internal_ext_gemm_batchsize, 1, LIBXSMM_ATOMIC_RELAXED) - 1) % max_batchsize) + 1; + batcharray[i-1].value.a = a; + batcharray[i-1].value.b = b; + batcharray[i-1].value.c = c; + LIBXSMM_ASSERT(0 <= flags); + } + if (libxsmm_mmbatch_size == (i - 1)) { /* condition ensure to flush once (first discovery) */ +# if !defined(NDEBUG) + result = +# endif + 
internal_mmbatch_flush(&libxsmm_mmbatch_desc, libxsmm_mmbatch_size, (libxsmm_mmbatch_item*)libxsmm_mmbatch_array); + } +# if !defined(NDEBUG) /* library code is expected to be mute */ + if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity && + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: SGEMM batch recording failed!\n"); + } +# endif +#endif + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, + const double* beta, double* y, const libxsmm_blasint* incy) +{ + LIBXSMM_ASSERT(NULL != trans && NULL != m && NULL != n && NULL != lda && NULL != incx && NULL != incy && NULL != alpha && NULL != beta); + LIBXSMM_INIT + if ((2 < libxsmm_gemm_wrap || 2 > libxsmm_gemm_wrap) && 1 == *incx && 1 == *incy && LIBXSMM_SMM(*m, 1, *n, 2/*RFO*/, sizeof(double))) { + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + const int flags = LIBXSMM_GEMM_FLAGS(*trans, 'N'); + const libxsmm_dmmfunction xgemv = libxsmm_dmmdispatch(*m, 1, *n, lda, n/*ldb*/, m/*ldc*/, alpha, beta, &flags, NULL); + if (NULL != xgemv) { + LIBXSMM_MMCALL_LDX(xgemv, a, x, y, *m, 1, *n, *lda, *n/*ldb*/, *m/*ldc*/); + } + else { + LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + } + else { /* TODO: parallelized */ + LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + } + else { + LIBXSMM_GEMV_SYMBOL(double)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void LIBXSMM_FSYMBOL(__wrap_sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, + const float* beta, float* y, const 
libxsmm_blasint* incy) +{ + LIBXSMM_ASSERT(NULL != trans && NULL != m && NULL != n && NULL != lda && NULL != incx && NULL != incy && NULL != alpha && NULL != beta); + LIBXSMM_INIT + if ((2 < libxsmm_gemm_wrap || 2 > libxsmm_gemm_wrap) && 1 == *incx && 1 == *incy && LIBXSMM_SMM(*m, 1, *n, 2/*RFO*/, sizeof(float))) { + if (0 != (libxsmm_gemm_wrap & 1)) { /* sequential */ + const int flags = LIBXSMM_GEMM_FLAGS(*trans, 'N'); + const libxsmm_smmfunction xgemv = libxsmm_smmdispatch(*m, 1, *n, lda, n/*ldb*/, m/*ldc*/, alpha, beta, &flags, NULL); + if (NULL != xgemv) { + LIBXSMM_MMCALL_LDX(xgemv, a, x, y, *m, 1, *n, *lda, *n/*ldb*/, *m/*ldc*/); + } + else { + LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + } + else { /* TODO: parallelized */ + LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } + } + else { + LIBXSMM_GEMV_SYMBOL(float)(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); + } +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void __wrap_dgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_APIEXT LIBXSMM_ATTRIBUTE_USED void __wrap_sgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const 
libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + +#endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT)*/ + + +LIBXSMM_APIEXT void libxsmm_xgemm_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc) +{ + libxsmm_gemm_blob blob; +#if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + const int outerpar = omp_get_active_level(), nthreads = (0 == outerpar ? omp_get_max_threads() : omp_get_num_threads()); +#elif defined(_OPENMP) + const int outerpar = omp_in_parallel(), nthreads = (0 == outerpar ? 
omp_get_max_threads() : 1); +#else + const int nthreads = 1; +#endif + const libxsmm_gemm_handle *const handle = libxsmm_gemm_handle_init(&blob, iprec, oprec, transa, transb, + m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_GEMM_HANDLE_FLAG_AUTO, nthreads); + const size_t scratch_size = libxsmm_gemm_handle_get_scratch_size(handle); + void* scratch = NULL; + if (NULL != handle && (0 == scratch_size || + NULL != (scratch = libxsmm_scratch_malloc(scratch_size, LIBXSMM_CACHELINE, LIBXSMM_MALLOC_INTERNAL_CALLER)))) + { +#if defined(_OPENMP) + if (0 == outerpar) { /* enable internal parallelization */ +# if defined(LIBXSMM_EXT_TASKS) + if (0 == libxsmm_gemm_tasks) +# endif + { +# pragma omp parallel num_threads(nthreads) + libxsmm_gemm_task(handle, scratch, a, b, c, omp_get_thread_num(), nthreads); + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* tasks requested */ + const int ntasks = nthreads; /* TODO: apply grain-size */ +# pragma omp parallel num_threads(nthreads) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + { int tid; for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_gemm_task(handle, scratch, a, b, c, tid, ntasks); + } + } + } /* implicit synchronization (barrier) */ + } +# endif + } + else { /* assume external parallelization */ +# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + const int ntasks = nthreads; /* TODO: apply grain-size */ + int tid; for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_gemm_task(handle, scratch, a, b, c, tid, ntasks); + } + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# else + libxsmm_gemm_task(handle, scratch, a, b, c, 0/*tid*/, 1/*nthreads*/); +# endif + } + if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */ + const unsigned int ntasks = handle->mt * handle->nt * handle->kt; + const double imbalance = 100.0 * 
LIBXSMM_DELTA((unsigned int)nthreads, ntasks) / nthreads; + static double max_imbalance = 50.0; + if (max_imbalance < imbalance) { + fprintf(stderr, "LIBXSMM WARNING: XGEMM %.0f%% imbalance (%u of %i workers utilized)!\n", + imbalance, ntasks, nthreads); + max_imbalance = imbalance; + } + } +#else + libxsmm_gemm_task(handle, scratch, a, b, c, 0/*tid*/, 1/*nthreads*/); +#endif /*defined(_OPENMP)*/ + libxsmm_free(scratch); + } + else { /* fallback or error */ + static int error_once = 0; + if (NULL == handle) { /* fallback */ + if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: XGEMM fallback code path triggered!\n"); + } + } + else if (0 != libxsmm_verbosity && /* library code is expected to be mute */ + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: failed to allocate GEMM-scratch memory!\n"); + } + libxsmm_blas_xgemm(iprec, oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +} + + +LIBXSMM_API_INLINE void internal_gemm_batch_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char transa[], const char transb[], const libxsmm_blasint m[], const libxsmm_blasint n[], const libxsmm_blasint k[], + const void* alpha, const void* a[], const libxsmm_blasint lda[], const void* b[], const libxsmm_blasint ldb[], + const void* beta, void* c[], const libxsmm_blasint ldc[], libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + const libxsmm_blasint batchsize[], libxsmm_blasint group_count) +{ + static int error_once = 0; + LIBXSMM_INIT + if ( /* check for sensible arguments */ +#if defined(LIBXSMM_BATCH_CHECK) + NULL != a && NULL != b && NULL != c && (1 == group_count || -1 == group_count 
|| + (0 == index_stride && (NULL == stride_a || 0 != *stride_a) && (NULL == stride_b || 0 != *stride_b) && (NULL == stride_c || 0 != *stride_c))) && +#endif + 0 != group_count) + { + int result = EXIT_SUCCESS; + const int max_npargroups = (int)(0 < libxsmm_gemm_npargroups + ? LIBXSMM_MIN(libxsmm_gemm_npargroups, LIBXSMM_GEMM_NPARGROUPS) : LIBXSMM_GEMM_NPARGROUPS); + const libxsmm_gemm_prefetch_type prefetch = libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + const size_t sa = (NULL != stride_a ? (size_t)(*stride_a) : sizeof(void*)); + const size_t sb = (NULL != stride_b ? (size_t)(*stride_b) : sizeof(void*)); + const size_t sc = (NULL != stride_c ? (size_t)(*stride_c) : sizeof(void*)); + const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)oprec); + const int ngroups = (int)LIBXSMM_ABS(group_count); + int group = 0, group_next = LIBXSMM_GEMM_NPARGROUPS; + libxsmm_code_pointer kernel[LIBXSMM_GEMM_NPARGROUPS]; + libxsmm_blasint base[LIBXSMM_GEMM_NPARGROUPS], i; +#if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + int kflags[LIBXSMM_GEMM_NPARGROUPS]; +#endif + int max_nthreads = 1; +#if defined(_OPENMP) +# if defined(LIBXSMM_EXT_TASKS) + const int outerpar = omp_get_active_level(); +# else + const int outerpar = omp_in_parallel(); +# endif + if (0 == outerpar) max_nthreads = omp_get_max_threads(); +#endif + for (i = 0; i < max_npargroups; ++i) { +#if !defined(NDEBUG) + kernel[i].ptr = NULL; +# if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + kflags[i] = 0; +# endif +#endif + base[i] = 0; + } + for (group = 0; group < ngroups; group = group_next, group_next += max_npargroups) { + const int npargroups = LIBXSMM_MIN(group_next, ngroups); + libxsmm_blasint size = 0; + int suitable = 0; + if (0 < group) { /* base is maintained even if par-group is not suitable */ + for (i = 0; i < npargroups; ++i) { + const libxsmm_blasint isize = batchsize[group+i-1], asize = LIBXSMM_ABS(isize); + base[i] += asize; + } + } + for (i = 0; i < npargroups; ++i) { + const 
libxsmm_blasint g = group + i, im = m[g], in = n[g], ik = k[g]; + suitable = LIBXSMM_SMM_AI(im, in, ik, 2/*RFO*/, otypesize); + if (0 != suitable) { + const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize); + const char *const ta = (NULL != transa ? (transa + g) : NULL); + const char *const tb = (NULL != transb ? (transb + g) : NULL); + const int flags = LIBXSMM_GEMM_PFLAGS(ta, tb, LIBXSMM_FLAGS); + const void **const galpha = &alpha, **const gbeta = &beta; + libxsmm_descriptor_blob blob; + /* coverity[ptr_arith] */ + libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, im, in, ik, + NULL != lda ? lda[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) ? im : ik), + NULL != ldb ? ldb[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) ? ik : in), + NULL != ldc ? ldc[g] : im, NULL != alpha ? galpha[g] : NULL, NULL != beta ? gbeta[g] : NULL, + flags, prefetch); + if (NULL != desc) { + libxsmm_gemm_internal_set_batchflag(desc, c, index_stride, 0 < group_count ? 
isize : -asize, 1 != max_nthreads); + kernel[i].xgemm = libxsmm_xmmdispatch(desc); + } + else kernel[i].ptr = NULL; + if (NULL != kernel[i].ptr_const) { + if (size < asize) size = asize; +#if !defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + LIBXSMM_ASSERT(NULL != desc); /* coverity[var_deref_op] */ + kflags[i] = desc->flags; +#endif + } + else { + suitable = 0; + break; + } + } + else break; + } + if (0 != suitable) { /* check if an SMM is suitable */ + const unsigned char itypesize = libxsmm_typesize((libxsmm_datatype)iprec); +#if defined(_OPENMP) + const int nchunks = (int)LIBXSMM_UPDIV(size, libxsmm_gemm_taskgrain); + const int ntasks = nchunks * npargroups, nthreads = LIBXSMM_MIN(max_nthreads, ntasks); + if (1 < nthreads) { + if (0 == outerpar) { /* enable internal parallelization */ +# if defined(LIBXSMM_EXT_TASKS) + if (0 == libxsmm_gemm_tasks) +# endif + { +# pragma omp parallel for num_threads(nthreads) private(i) + for (i = 0; i < ntasks; ++i) { + const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u; + const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize); + if (v < asize) { +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + libxsmm_mmkernel_info kernel_info; +#endif + /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c, + (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u], + 0 < group_count ? isize : -asize, (int)i, nchunks, itypesize, otypesize, +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ? 
kernel_info.flags : 0); +#else + kflags[g]); +#endif + } + } + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* tasks requested */ +# pragma omp parallel num_threads(nthreads) private(i) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + for (i = 0; i < ntasks; ++i) { + const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u; + const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize); + if (v < asize) { +# pragma omp task + { +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + libxsmm_mmkernel_info kernel_info; +#endif + /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c, + (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u], + 0 < group_count ? isize : -asize, (int)i, nchunks, itypesize, otypesize, +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ? kernel_info.flags : 0); +#else + kflags[g]); +#endif + } + } + } + } /* implicit synchronization (barrier) */ + } +# endif + } + else { /* assume external parallelization */ + for (i = 0; i < (libxsmm_blasint)ntasks; ++i) { + const libxsmm_blasint j = i * libxsmm_gemm_taskgrain, u = j / size, v = j - u * size, g = group + u; + const libxsmm_blasint isize = batchsize[g], asize = LIBXSMM_ABS(isize); + if (v < asize) { +# if defined(LIBXSMM_EXT_TASKS) /* OpenMP-tasks */ +# pragma omp task +#endif + { +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + libxsmm_mmkernel_info kernel_info; +#endif + /*check*/libxsmm_mmbatch_kernel(kernel[g].xgemm, index_base, index_stride, stride_a, stride_b, stride_c, + (const char*)a + sa * base[u], (const char*)b + sb * base[u], (char*)c + sc * base[u], + 0 < group_count ? 
isize : -asize, (int)i, nchunks, itypesize, otypesize, +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[g].xgemm, &kernel_info) ? kernel_info.flags : 0); +#else + kflags[g]); +#endif + } + } + } +# if defined(LIBXSMM_EXT_TASKS) /* OpenMP-tasks */ + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# endif + } + } + else +#endif /*defined(_OPENMP)*/ + { /* sequential */ + for (i = 0; i < npargroups; ++i) { + const libxsmm_blasint g = group + i; +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + libxsmm_mmkernel_info kernel_info; +#endif + libxsmm_mmbatch_kernel(kernel[i].xgemm, index_base, index_stride, stride_a, stride_b, stride_c, + (const char*)a + sa * base[i], (const char*)b + sb * base[i], (char*)c + sc * base[i], batchsize[g], + 0/*tid*/, 1/*nthreads*/, itypesize, otypesize, +#if defined(LIBXSMM_EXT_GEMM_PARGROUPS_INFO) + EXIT_SUCCESS == libxsmm_get_mmkernel_info(kernel[i].xgemm, &kernel_info) ? kernel_info.flags : 0); +#else + kflags[i]); +#endif + } + } + } + else { /* trigger fallback */ + result = EXIT_FAILURE; + } + if (EXIT_SUCCESS != result) { + for (i = 0; i < npargroups; ++i) { + const libxsmm_blasint g = group + i; + const char *const ta = (NULL != transa ? (transa + g) : NULL); + const char *const tb = (NULL != transb ? (transb + g) : NULL); + const int flags = LIBXSMM_GEMM_PFLAGS(ta, tb, LIBXSMM_FLAGS); + const libxsmm_blasint im = m[g], in = n[g], ik = k[g]; + const libxsmm_blasint ilda = (NULL != lda ? lda[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & flags) ? im : ik)); + const libxsmm_blasint ildb = (NULL != ldb ? ldb[g] : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & flags) ? ik : in)); + const libxsmm_blasint ildc = (NULL != ldc ? ldc[g] : im); + const void **const galpha = &alpha, **const gbeta = &beta; + /* coverity[overrun-local] */ + const void *const ialpha = (NULL != alpha ? galpha[g] : NULL); + /* coverity[overrun-local] */ + const void *const ibeta = (NULL != beta ? 
gbeta[g] : NULL); + if (EXIT_SUCCESS == libxsmm_mmbatch_blas(iprec, oprec, ta, tb, im, in, ik, ialpha, + (const char*)a + sa * base[i], &ilda, (const char*)b + sb * base[i], &ildb, ibeta, (char*)c + sc * base[i], &ildc, + index_base, index_stride, stride_a, stride_b, stride_c, batchsize[g])) + { + if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { + const size_t threshold = LIBXSMM_MNK_SIZE(im, in, im); + static size_t threshold_max = 0; + if (threshold_max < threshold) { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "LIBXSMM WARNING: "); + libxsmm_gemm_print2(stderr, iprec, oprec, ta, tb, &im, &in, &ik, + ialpha, NULL/*a*/, &ilda, NULL/*b*/, &ildb, ibeta, NULL/*c*/, &ildc); + fprintf(stderr, " => batched GEMM/omp was falling back to BLAS!\n"); + LIBXSMM_STDIO_RELEASE(); + threshold_max = threshold; + } + } + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: libxsmm_gemm_batch_omp failed!\n"); + } + return; /* exit routine */ + } + } + } + } + } +#if defined(LIBXSMM_BATCH_CHECK) + else if (0 != group_count && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: incorrect arguments (libxsmm_gemm_batch_omp)!\n"); + } +#endif +} + + +LIBXSMM_APIEXT void libxsmm_gemm_batch_omp(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint 
batchsize) +{ + internal_gemm_batch_omp(iprec, oprec, transa, transb, &m, &n, &k, + alpha, (const void**)a, lda, (const void**)b, ldb, beta, (void**)c, ldc, index_base, index_stride, + stride_a, stride_b, stride_c, &batchsize, 1); +} + + +LIBXSMM_APIEXT void libxsmm_dgemm_batch_omp( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + if (NULL != group_count) { + const libxsmm_blasint ptrsize = sizeof(void*); + internal_gemm_batch_omp(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, transa_array, transb_array, m_array, n_array, k_array, + alpha_array, (const void**)a_array, lda_array, (const void**)b_array, ldb_array, beta_array, (void**)c_array, ldc_array, + 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, group_size, *group_count); + } +} + + +LIBXSMM_APIEXT void libxsmm_sgemm_batch_omp( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + if (NULL != group_count) { + const libxsmm_blasint ptrsize = sizeof(void*); + internal_gemm_batch_omp(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, transa_array, transb_array, m_array, n_array, k_array, + alpha_array, (const void**)a_array, lda_array, (const void**)b_array, ldb_array, 
beta_array, (void**)c_array, ldc_array, + 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, group_size, *group_count); + } +} + + +LIBXSMM_APIEXT void libxsmm_mmbatch_begin(libxsmm_gemm_precision precision, + const int* flags, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const void* alpha, const void* beta) +{ +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 26115) /* try-lock is treated incorrectly by static analysis */ +# endif + LIBXSMM_INIT + if (NULL != libxsmm_mmbatch_array /* batch-recording available, but not yet running */ + /* currently, batch recording is only enabled if all values are present (no complex filtering) */ + && NULL != flags && NULL != alpha && NULL != beta + && NULL != lda && NULL != ldb && NULL != ldc + && NULL != m && NULL != n && NULL != k + && LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_DEFAULT) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock)) + { + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const descriptor = libxsmm_gemm_descriptor_init(&blob, precision, + *m, *n, *k, *lda, *ldb, *ldc, alpha, beta, *flags, libxsmm_get_gemm_prefetch(LIBXSMM_EXT_GEMM_MMBATCH_PREFETCH)); + static int error_once = 0; + int result = EXIT_SUCCESS; + + if (NULL != descriptor) { + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + unsigned int i; +#if !defined(NDEBUG) + const unsigned int mmbatch_maxdepth = LIBXSMM_UP2POT(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH); + LIBXSMM_ASSERT((LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) == mmbatch_maxdepth/*is pot*/); +#endif + /* eventually overwrite the oldest entry */ + i = LIBXSMM_MOD2(internal_ext_gemm_batchdepth, LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH); + internal_ext_gemm_batchdesc[i] = libxsmm_mmbatch_desc; /* backup */ + 
++internal_ext_gemm_batchdepth; + + /* ensure descriptor does not match any GEMM such that... */ + LIBXSMM_MEMZERO127(&libxsmm_mmbatch_desc); + /* ...the batch stops and completely flushes */ + if (0 != internal_ext_gemm_batchsize) { + result = internal_mmbatch_flush(internal_ext_gemm_batchdesc + i, + (((libxsmm_blasint)internal_ext_gemm_batchsize - 1) % max_batchsize) + 1, + (libxsmm_mmbatch_item*)libxsmm_mmbatch_array); + } + + if (EXIT_SUCCESS == result) { /* enable descriptor */ + internal_ext_gemm_batchsize = 0; /* reset */ + if (0 == (LIBXSMM_MMBATCH_FLAG_STATISTIC & *flags)) { + libxsmm_mmbatch_desc = *descriptor; + } + else { + libxsmm_mmbatch_desc.flags = LIBXSMM_MMBATCH_FLAG_STATISTIC; + } + } + } + else { + result = EXIT_FAILURE; + } + if (EXIT_SUCCESS != result && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: GEMM batch enabling failed!\n"); + } + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock); + } +# if defined(_MSC_VER) +# pragma warning(pop) +# endif +#else + LIBXSMM_UNUSED(precision); LIBXSMM_UNUSED(flags); + LIBXSMM_UNUSED(m); LIBXSMM_UNUSED(n); LIBXSMM_UNUSED(k); + LIBXSMM_UNUSED(lda); LIBXSMM_UNUSED(ldb); LIBXSMM_UNUSED(ldc); + LIBXSMM_UNUSED(alpha); LIBXSMM_UNUSED(beta); +#endif +} + + +LIBXSMM_APIEXT void libxsmm_mmbatch_end(void) +{ +#if defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD_EXT) +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 26115) /* try-lock is treated incorrectly by static analysis */ +# endif + /*const*/ int trystate = LIBXSMM_LOCK_TRYLOCK(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock); + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_DEFAULT) == trystate) { + const unsigned int max_batchsize = (unsigned int)((LIBXSMM_GEMM_MMBATCH_SCALE) * libxsmm_mmbatch_size); + const libxsmm_gemm_descriptor flushdesc = libxsmm_mmbatch_desc; + static int error_once = 0; +#if 
!defined(NDEBUG) + const unsigned int mmbatch_maxdepth = LIBXSMM_UP2POT(LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH); +#endif + /* ensure descriptor does not match any GEMM such that... */ + LIBXSMM_MEMZERO127(&libxsmm_mmbatch_desc); + /* ...the batch stops and completely flushes */ + if (EXIT_SUCCESS == internal_mmbatch_flush(&flushdesc, + 0 != internal_ext_gemm_batchsize ? (((internal_ext_gemm_batchsize - 1) % max_batchsize) + 1) : 0, + (libxsmm_mmbatch_item*)libxsmm_mmbatch_array)) + { + internal_ext_gemm_batchsize = 0; /* reset */ + --internal_ext_gemm_batchdepth; /* restore the previous descriptor */ + assert((LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH) == mmbatch_maxdepth/*is pot*/); /* no LIBXSMM_ASSERT! */ + libxsmm_mmbatch_desc = internal_ext_gemm_batchdesc[LIBXSMM_MOD2(internal_ext_gemm_batchdepth, LIBXSMM_EXT_GEMM_MMBATCH_MAXDEPTH)]; + } + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: GEMM batch processing failed!\n"); + } + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK_DEFAULT, &libxsmm_mmbatch_lock); + } +# if defined(_MSC_VER) +# pragma warning(pop) +# endif +#endif +} + + +#if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_xgemm_omp)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, + const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const double*, const double*, const libxsmm_blasint*, const double*, const libxsmm_blasint*, + const double*, double*, const libxsmm_blasint*); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_xgemm_omp)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const 
libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_ASSERT(NULL != iprec && NULL != oprec); + libxsmm_xgemm_omp(*iprec, *oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_dgemm_omp)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const double*, const double*, const libxsmm_blasint*, + const double*, const libxsmm_blasint*, + const double*, double*, const libxsmm_blasint*); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_dgemm_omp)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + libxsmm_dgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_sgemm_omp)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const float*, const float*, const libxsmm_blasint*, + const float*, const libxsmm_blasint*, + const float*, float*, const libxsmm_blasint*); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_sgemm_omp)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + libxsmm_sgemm_omp(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for 
Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_gemm_batch_omp)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, + const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const void*, const void*, const libxsmm_blasint*, const void*, const libxsmm_blasint*, + const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], + const libxsmm_blasint*); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_gemm_batch_omp)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + const libxsmm_blasint* batchsize) +{ + LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); + LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); + libxsmm_gemm_batch_omp(*iprec, *oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, + *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_begin)(const libxsmm_gemm_precision*, + const int*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const void*, const void*); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_begin)(const libxsmm_gemm_precision* precision, + const int* flags, const 
libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const void* alpha, const void* beta) +{ + LIBXSMM_ASSERT(NULL != precision); + libxsmm_mmbatch_begin(*precision, flags, m, n, k, lda, ldb, ldc, alpha, beta); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_end)(void); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_mmbatch_end)(void) +{ + libxsmm_mmbatch_end(); +} + +#endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_ext_xcopy.c b/third_party/libxsmm/src/libxsmm_ext_xcopy.c new file mode 100644 index 00000000..b6f2c35a --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_ext_xcopy.c @@ -0,0 +1,472 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_xcopy.h" +#include "libxsmm_ext.h" + +#define LIBXSMM_MCOPY_MT(MT, NT, M, N) ((MT) <= (M) && (NT) <= (N) && (64U * 64U) <= (((unsigned int)(M)) * (N))) + + +LIBXSMM_APIEXT void libxsmm_matcopy_omp(void* out, const void* in, unsigned int typesize, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) +{ + LIBXSMM_INIT + if (0 < typesize && 256 > typesize && m <= ldi && m <= ldo && out != in && + ((NULL != out && 0 < m && 0 < n) || (0 == m && 0 == n))) + { + if (0 < m && 0 < n) { +#if defined(_OPENMP) + unsigned int tm, tn, ts; + if (NULL != in) { /* mcopy */ + tm = LIBXSMM_UPDIV(libxsmm_mcopy_mbytes, typesize); + tn = (unsigned int)(libxsmm_mcopy_nscale * tm); + ts = libxsmm_mcopy_mbytes; + } + else { /* mzero */ + tm = LIBXSMM_UPDIV(libxsmm_mzero_mbytes, typesize); + tn = (unsigned int)(libxsmm_mzero_nscale * tm); + ts = libxsmm_mzero_mbytes; + } + if (0 == tm) tm = m; + if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); + if (0 != ts && ts < (tm * tn * typesize)) { + tm = LIBXSMM_MAX(ts / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); + } + if (LIBXSMM_MCOPY_MT(tm, tn, (unsigned int)m, (unsigned int)n)) { /* consider problem-size */ + libxsmm_xcopykernel kernel; + kernel.ptr = NULL; +# if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 2)) + if (0 != (2 & libxsmm_xcopy_jit)) { /* JIT'ted matrix-copy permitted? */ + switch (typesize) { + case 8: kernel.function = libxsmm_dispatch_meltw_unary(tm, tn, &ldi, &ldo, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_MELTW_FLAG_UNARY_NONE, + NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/); + break; + case 4: kernel.function = libxsmm_dispatch_meltw_unary(tm, tn, &ldi, &ldo, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_MELTW_FLAG_UNARY_NONE, + NULL != in ? 
LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/); + break; + case 2: kernel.function = libxsmm_dispatch_meltw_unary(tm, tn, &ldi, &ldo, + LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_MELTW_FLAG_UNARY_NONE, + NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/); + break; + case 1: kernel.function = libxsmm_dispatch_meltw_unary(tm, tn, &ldi, &ldo, + LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_MELTW_FLAG_UNARY_NONE, + NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/); + break; + } + } +# endif +# if defined(LIBXSMM_EXT_TASKS) && 0/* implies _OPENMP */ + if (0 == omp_get_active_level()) +# else + if (0 == omp_in_parallel()) +# endif + { /* enable internal parallelization */ + const int nthreads = omp_get_max_threads(); +# if defined(LIBXSMM_EXT_TASKS) + if (0 >= libxsmm_xcopy_taskscale) +# endif + { +# pragma omp parallel num_threads(nthreads) + libxsmm_matcopy_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, omp_get_thread_num(), nthreads); + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* tasks requested */ + const int ntasks = nthreads * libxsmm_xcopy_taskscale; +# pragma omp parallel num_threads(nthreads) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + { int tid; + for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_matcopy_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, tid, ntasks); + } + } + } + } +# endif + } + else { /* assume external parallelization */ +# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + const int nthreads = omp_get_num_threads(); + const int ntasks = (0 == libxsmm_xcopy_taskscale + ? 
(LIBXSMM_XCOPY_TASKSCALE) + : libxsmm_xcopy_taskscale) * nthreads; + int tid; + for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_matcopy_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, tid, ntasks); + } + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# else + libxsmm_matcopy_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, 0/*tid*/, 1/*nthreads*/); +# endif + } + } + else +#endif /*defined(_OPENMP)*/ + if (NULL != in) { /* no MT, or small problem-size */ + LIBXSMM_XCOPY_NONJIT(LIBXSMM_MCOPY_KERNEL, + typesize, out, in, ldi, ldo, 0, m, 0, n); + } + else { /* no MT, or small problem-size */ + /* coverity[ptr_arith] */ + LIBXSMM_XCOPY_NONJIT(LIBXSMM_MZERO_KERNEL, + typesize, out, in, ldi, ldo, 0, m, 0, n); + } + } + } + else { + static int error_once = 0; + if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + if (NULL == out) { + fprintf(stderr, "LIBXSMM ERROR: the matrix-copy input and/or output is NULL!\n"); + } + else if (out == in) { + fprintf(stderr, "LIBXSMM ERROR: output and input of the matrix-copy must be different!\n"); + } + else if (0 == typesize || 256 <= typesize) { + fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-copy specified!\n"); + } + else if (ldi < m || ldo < m) { + fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the matrix-copy is/are too small!\n"); + } + else if (0 > m || 0 > n) { + fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the matrix-copy is/are negative!\n"); + } + } + } +} + + +LIBXSMM_APIEXT void libxsmm_otrans_omp(void* out, const void* in, unsigned int typesize, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo) +{ + static int error_once = 0; + 
LIBXSMM_INIT + if (0 < typesize && 256 > typesize && m <= ldi && n <= ldo && + ((NULL != out && NULL != in && 0 < m && 0 < n) || (0 == m && 0 == n))) + { + if (0 < m && 0 < n) { + if (out != in) { +#if defined(_OPENMP) + unsigned int tm = LIBXSMM_UPDIV(libxsmm_tcopy_mbytes, typesize); + unsigned int tn = (unsigned int)(libxsmm_tcopy_nscale * tm); + if (0 == tm) tm = m; + if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n); + if (0 != libxsmm_tcopy_mbytes && libxsmm_tcopy_mbytes < (tm * tn * typesize)) { + tm = LIBXSMM_MAX(libxsmm_tcopy_mbytes / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN); + } + if (tm <= (unsigned int)m && tn <= (unsigned int)n) { /* consider problem-size */ + libxsmm_xcopykernel kernel; + kernel.ptr = NULL; +# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + if (0 == omp_get_active_level()) +# else + if (0 == omp_in_parallel()) +# endif + { /* enable internal parallelization */ + const int nthreads = omp_get_max_threads(); +# if defined(LIBXSMM_EXT_TASKS) + if (0 >= libxsmm_xcopy_taskscale) +# endif + { +# pragma omp parallel num_threads(nthreads) + { /* coverity[divide_by_zero] */ + libxsmm_otrans_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, omp_get_thread_num(), nthreads); + } + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* tasks requested */ + const int ntasks = nthreads * libxsmm_xcopy_taskscale; +# pragma omp parallel num_threads(nthreads) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + { int tid; + for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_otrans_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, tid, ntasks); + } + } + } + } +# endif + } + else { /* assume external parallelization */ +# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + const int nthreads = omp_get_num_threads(); + 
const int ntasks = (0 == libxsmm_xcopy_taskscale + ? (LIBXSMM_XCOPY_TASKSCALE) + : libxsmm_xcopy_taskscale) * nthreads; + int tid; + for (tid = 0; tid < ntasks; ++tid) { +# pragma omp task untied + libxsmm_otrans_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, tid, ntasks); + } + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# else /* coverity[divide_by_zero] */ + libxsmm_otrans_task_internal(out, in, typesize, + (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo, + tm, tn, kernel, 0/*tid*/, 1/*nthreads*/); +# endif + } + } + else +#endif /*defined(_OPENMP)*/ + { /* no MT, or small problem-size */ +#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1)) + libxsmm_xcopykernel kernel; + kernel.ptr = NULL; + if (0 != (1 & libxsmm_xcopy_jit)) { /* JIT'ted transpose permitted? */ + switch (typesize) { + case 8: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 4: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 2: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 1: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + } + if (NULL != kernel.ptr) { /* JIT-kernel available */ + LIBXSMM_TCOPY_CALL(kernel, typesize, 
in, ldi, out, ldo); + } + } + else +#endif + { + LIBXSMM_XCOPY_NONJIT(LIBXSMM_TCOPY_KERNEL, + typesize, out, in, ldi, ldo, 0, m, 0, n); + } + } + } + else if (ldi == ldo) { + libxsmm_itrans/*TODO: omp*/(out, typesize, m, n, ldi, ldo); + } + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n"); + } + } + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + if (NULL == out || NULL == in) { + fprintf(stderr, "LIBXSMM ERROR: the transpose input and/or output is NULL!\n"); + } + else if (out == in) { + fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n"); + } + else if (0 == typesize || 256 <= typesize) { + fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-transpose specified!\n"); + } + else if (ldi < m || ldo < n) { + fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the transpose is/are too small!\n"); + } + else if (0 > m || 0 > n) { + fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the transpose is/are negative!\n"); + } + } + } +} + + +LIBXSMM_APIEXT void libxsmm_itrans_batch_omp(void* inout, unsigned int typesize, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, + libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride[], libxsmm_blasint batchsize) +{ +#if defined(_OPENMP) + if (1 < batchsize) { /* consider problem-size */ + const libxsmm_blasint scratchsize = m * n * typesize; + const libxsmm_blasint size = LIBXSMM_ABS(batchsize); + char buffer[LIBXSMM_ITRANS_BUFFER_MAXSIZE]; + char *const mat0 = (char*)inout; + void* scratch = NULL; + libxsmm_xcopykernel kernel = { NULL }; + if (m != n || ldi != ldo || 127 < typesize) { + if 
(scratchsize <= LIBXSMM_ITRANS_BUFFER_MAXSIZE) { + scratch = buffer; + } + else { + static int error_once = 0; + LIBXSMM_INIT + if (EXIT_SUCCESS != libxsmm_xmalloc(&scratch, scratchsize, 0/*auto-align*/, + LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE, + 0/*extra*/, 0/*extra_size*/) + && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: failed to allocate buffer for in-place transpose!\n"); + } + } +#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1)) + if (0 != (1 & libxsmm_xcopy_jit) /* JIT'ted transpose permitted? */ + /* avoid outgrown transpose kernel upfront */ + && (m <= LIBXSMM_CONFIG_MAX_DIM || n <= LIBXSMM_CONFIG_MAX_DIM)) + { + switch (typesize) { + case 8: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 4: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 2: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + case 1: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo, + LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + break; + } + } +#endif + } +# if defined(LIBXSMM_EXT_TASKS) && 0/* implies _OPENMP */ + if (0 == omp_get_active_level()) +# else + if (0 == omp_in_parallel()) +# endif + { /* enable internal parallelization */ + const int nthreads = 
omp_get_max_threads(); +# if defined(LIBXSMM_EXT_TASKS) + if (0 >= libxsmm_xcopy_taskscale) +# endif + { + const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, nthreads); +# pragma omp parallel num_threads(nthreads) + { + const libxsmm_blasint begin = omp_get_thread_num() * tasksize; + const libxsmm_blasint span = begin + tasksize; + libxsmm_itrans_internal(mat0, scratch, typesize, m, n, ldi, ldo, index_base, + index_stride, stride, kernel, begin, LIBXSMM_MIN(span, size)); + } + } +# if defined(LIBXSMM_EXT_TASKS) + else { /* tasks requested */ + const int ntasks = nthreads * libxsmm_xcopy_taskscale; + const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, ntasks); +# pragma omp parallel num_threads(nthreads) + { /* first thread discovering work will launch all tasks */ +# pragma omp single nowait /* anyone is good */ + { int tid; + for (tid = 0; tid < ntasks; ++tid) { + const libxsmm_blasint begin = tid * tasksize; + const libxsmm_blasint span = begin + tasksize; +# pragma omp task untied + libxsmm_itrans_internal(mat0, scratch, typesize, m, n, ldi, ldo, index_base, + index_stride, stride, kernel, begin, LIBXSMM_MIN(span, size)); + } + } + } + } +# endif + } + else { /* assume external parallelization */ +# if defined(LIBXSMM_EXT_TASKS) /* implies _OPENMP */ + const int nthreads = omp_get_num_threads(); + const int ntasks = (0 == libxsmm_xcopy_taskscale + ? 
(LIBXSMM_XCOPY_TASKSCALE) + : libxsmm_xcopy_taskscale) * nthreads; + const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, ntasks); + int tid; + for (tid = 0; tid < ntasks; ++tid) { + const libxsmm_blasint begin = tid * tasksize; + const libxsmm_blasint span = begin + tasksize; +# pragma omp task untied + libxsmm_itrans_internal(mat0, scratch, typesize, m, n, ldi, ldo, index_base, + index_stride, stride, kernel, begin, LIBXSMM_MIN(span, size)); + } + if (0 == libxsmm_nosync) { /* allow to omit synchronization */ +# pragma omp taskwait + } +# else + libxsmm_itrans_internal(mat0, scratch, typesize, m, n, ldi, ldo, index_base, + index_stride, stride, kernel, 0, batchsize); +# endif + } + if (NULL != scratch && LIBXSMM_ITRANS_BUFFER_MAXSIZE < scratchsize) { + libxsmm_xfree(scratch, 0/*no check*/); + } + } + else +#endif /*defined(_OPENMP)*/ + libxsmm_itrans_batch(inout, typesize, m, n, ldi, ldo, + index_base, index_stride, stride, batchsize, + 0/*tid*/, 1/*ntasks*/); +} + + +#if defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_matcopy_omp)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_matcopy_omp)(void* out, const void* in, const int* typesize, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) +{ + libxsmm_blasint ldx; + LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); + ldx = *(NULL != ldi ? ldi : m); + libxsmm_matcopy_omp(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? 
*ldo : ldx); +} + + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_otrans_omp)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); +LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(libxsmm_otrans_omp)(void* out, const void* in, const int* typesize, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) +{ + libxsmm_blasint ldx; + LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); + ldx = *(NULL != ldi ? ldi : m); + libxsmm_otrans_omp(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); +} + +#endif /*defined(LIBXSMM_BUILD) && defined(LIBXSMM_BUILD_EXT) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_fsspmdm.c b/third_party/libxsmm/src/libxsmm_fsspmdm.c new file mode 100644 index 00000000..5bcc447c --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_fsspmdm.c @@ -0,0 +1,602 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#include "generator_spgemm_csr_asparse_reg.h" +#include +#include "libxsmm_main.h" + + +/* Double precision AVX-512 lane broadcasts */ +LIBXSMM_APIVAR_DEFINE(const double* internal_fsspmdm_dperm); +/* Single precision AVX-512 lane broadcasts */ +LIBXSMM_APIVAR_DEFINE(const float* internal_fsspmdm_sperm); + + +LIBXSMM_API_INTERN void internal_dfsspmdm_init(void); +LIBXSMM_API_INTERN void internal_dfsspmdm_init(void) +{ + LIBXSMM_ALIGNED(static const unsigned int dperm[], LIBXSMM_ALIGNMENT) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, + 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, + 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, + 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 + }; + LIBXSMM_ASSERT(NULL == internal_fsspmdm_dperm); + internal_fsspmdm_dperm = (const double*)((const void*)dperm); +} + + +LIBXSMM_API_INTERN void internal_sfsspmdm_init(void); +LIBXSMM_API_INTERN void internal_sfsspmdm_init(void) +{ + LIBXSMM_ALIGNED(static const unsigned int sperm[], LIBXSMM_ALIGNMENT) = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 + }; + LIBXSMM_ASSERT(NULL == internal_fsspmdm_sperm); + internal_fsspmdm_sperm = (const float*)((const void*)sperm); +} + + +LIBXSMM_API libxsmm_dfsspmdm* libxsmm_dfsspmdm_create( + libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + const double alpha, const double beta, libxsmm_blasint c_is_nt, + const double* a_dense) +{ + double one = 1.0; + double* a_csr_values = NULL; + unsigned int* a_csr_rowptr = NULL; + unsigned int* a_csr_colidx = NULL; + double* aa_dense = NULL; + int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; + const libxsmm_gemm_descriptor* xgemm_desc; + libxsmm_descriptor_blob xgemm_blob; + libxsmm_dfsspmdm* new_handle = NULL; + libxsmm_dmmfunction k_sparse1 = NULL; + libxsmm_dmmfunction k_sparse2 = NULL; + libxsmm_dmmfunction k_dense = NULL; + int i, j, n, a_nnz, N_sparse1, N_sparse2, N_dense, nkerns; + + /* internal lazy initialization */ + if (NULL == internal_fsspmdm_dperm) internal_dfsspmdm_init(); + + /* some checks... 
*/ + assert(N % 8 == 0); + assert(N >= 8); + assert(LIBXSMM_FEQ(beta, 1.0) || LIBXSMM_FEQ(beta, 0.0)); + assert(K <= lda); + assert(N <= ldc); + assert(N <= ldb); + + /* Get the number of non-zeros */ + a_nnz = 0; + for (i = 0; i < M; i++) { + for (j = 0; j < K; j++) { + if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0)) { + a_nnz++; + } + } + } + + /* Null matrix */ + if ( 0 == a_nnz ) return NULL; + + /* Allocate handle */ + new_handle = (libxsmm_dfsspmdm*)malloc(sizeof(libxsmm_dfsspmdm)); + if ( NULL == new_handle ) return NULL; + + /* Initialize the handle */ + LIBXSMM_MEMZERO127(new_handle); + /* TODO: in case of ILP64, check value ranges */ + new_handle->N = (int)N; + new_handle->M = (int)M; + new_handle->K = (int)K; + new_handle->ldb = (int)ldb; + new_handle->ldc = (int)ldc; + + /* update flags */ + if ( beta == 0.0 && c_is_nt != 0 ) { + flags |= LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; + } + + /* Allocate CSR structure */ + a_csr_values = (double*)malloc((size_t)a_nnz * sizeof(double)); + a_csr_rowptr = (unsigned int*)malloc(((size_t)M + 1) * sizeof(unsigned int)); + a_csr_colidx = (unsigned int*)malloc((size_t)a_nnz * sizeof(unsigned int)); + + /* Allocate dense storage */ + aa_dense = (double*)libxsmm_aligned_malloc((size_t)M * (size_t)K * sizeof(double), 64); + + if ( NULL == a_csr_values || NULL == a_csr_rowptr || NULL == a_csr_colidx || NULL == aa_dense ) { + free( a_csr_values ); free( a_csr_rowptr ); free( a_csr_colidx ); + free( new_handle ); + libxsmm_free( aa_dense ); + return NULL; + } + + /* Populate CSR structure */ + for (i = 0, n = 0; i < M; i++) { + a_csr_rowptr[i] = n; + for (j = 0; j < K; j++) { + if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0)) { + a_csr_values[n] = alpha*a_dense[(i*lda) + j]; + a_csr_colidx[n] = j; + n++; + } + } + } + a_csr_rowptr[M] = a_nnz; + + /* Attempt to JIT a sparse kernel */ + N_sparse1 = libxsmm_cpuid_vlen32(libxsmm_cpuid()) / 2; + xgemm_desc = libxsmm_dgemm_descriptor_init(&xgemm_blob, M, N_sparse1, K, + 0, ldb, ldc, one, 
beta, flags, prefetch); + if ( NULL != xgemm_desc ) { + k_sparse1 = libxsmm_create_dcsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); + } + + /* If that worked try to JIT a second (wider) sparse kernel */ + N_sparse2 = N_sparse1*2; + if ( NULL != k_sparse1 && N_sparse2 <= N ) { + xgemm_desc = libxsmm_dgemm_descriptor_init(&xgemm_blob, M, N_sparse2, K, + 0, ldb, ldc, one, beta, flags, prefetch); + + if ( NULL != xgemm_desc ) { + k_sparse2 = libxsmm_create_dcsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); + } + } + + /* Free CSR */ + free( a_csr_values ); + free( a_csr_rowptr ); + free( a_csr_colidx ); + + /* Also generate a dense kernel */ + N_dense = 8; + k_dense = libxsmm_dmmdispatch(N_dense, M, K, &ldb, &K, &ldc, &one, &beta, &flags, (const int*)LIBXSMM_GEMM_PREFETCH_NONE); + + if ( NULL != k_dense ) { + /* copy A over */ + for ( i = 0; i < M; ++i ) { + for ( j = 0; j < K; ++j ) { + aa_dense[i*K + j] = alpha*a_dense[i*lda + j]; + } + } + } + + /* Tally up how many kernels we got */ + nkerns = !!k_dense + !!k_sparse1 + !!k_sparse2; + + /* We have at least one kernel */ + if ( nkerns ) { + libxsmm_timer_tickint t; + double *B = NULL, *C = NULL; + double dt_dense = ( NULL != k_dense ) ? 1e5 : 1e6; + double dt_sparse1 = ( NULL != k_sparse1 ) ? 1e5 : 1e6; + double dt_sparse2 = ( NULL != k_sparse2 ) ? 
1e5 : 1e6; + void* fp; + + /* If we have two or more kernels then try to benchmark them */ + if ( nkerns >= 2 ) { + B = (double*)libxsmm_aligned_malloc((size_t)K * (size_t)ldb * sizeof(double), 64); + C = (double*)libxsmm_aligned_malloc((size_t)M * (size_t)ldc * sizeof(double), 64); + + if ( NULL != B && NULL != C ) { + for ( i = 0; i < K; i++ ) { + for ( j = 0; j < N; j++ ) { + B[i*ldb + j] = 1; + } + } + for ( i = 0; i < M; i++ ) { + for ( j = 0; j < N; j++ ) { + C[i*ldc + j] = 1; + } + } + } + } + + /* Benchmark dense */ + if ( NULL != k_dense && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_dense ) { + k_dense( B + j, aa_dense, C + j ); + } + } + dt_dense = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Benchmark sparse (regular) */ + if ( NULL != k_sparse1 && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_sparse1 ) { + k_sparse1( internal_fsspmdm_dperm, B + j, C + j ); + } + } + dt_sparse1 = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Benchmark sparse (wide) */ + if ( NULL != k_sparse2 && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_sparse2 ) { + k_sparse2( internal_fsspmdm_dperm, B + j, C + j ); + } + } + dt_sparse2 = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Dense fastest */ + if ( dt_dense <= dt_sparse1 && dt_dense <= dt_sparse2 ) { + new_handle->N_chunksize = N_dense; + new_handle->kernel = k_dense; + new_handle->a_dense = aa_dense; + } else { + libxsmm_free( aa_dense ); + } + + /* Sparse (regular) fastest */ + if ( dt_sparse1 < dt_dense && dt_sparse1 <= dt_sparse2 ) { + new_handle->N_chunksize = N_sparse1; + new_handle->kernel = k_sparse1; + } else if ( NULL != k_sparse1 ) { + LIBXSMM_ASSIGN127( &fp, &k_sparse1 ); + libxsmm_free( fp ); + } + + /* Sparse (wide) fastest */ + if ( dt_sparse2 < 
dt_dense && dt_sparse2 < dt_sparse1 ) { + new_handle->N_chunksize = N_sparse2; + new_handle->kernel = k_sparse2; + } else if ( NULL != k_sparse2 ) { + LIBXSMM_ASSIGN127( &fp, &k_sparse2 ); + libxsmm_free( fp ); + } + + libxsmm_free( B ); + libxsmm_free( C ); + } + else { + libxsmm_free( aa_dense ); + free( new_handle ); + new_handle = NULL; + } + + return new_handle; +} + + +LIBXSMM_API libxsmm_sfsspmdm* libxsmm_sfsspmdm_create( + libxsmm_blasint M, libxsmm_blasint N, libxsmm_blasint K, + libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, + const float alpha, const float beta, libxsmm_blasint c_is_nt, + const float* a_dense) +{ + float one = 1.0f; + float* a_csr_values = NULL; + unsigned int* a_csr_rowptr = NULL; + unsigned int* a_csr_colidx = NULL; + float* aa_dense = NULL; + int flags = LIBXSMM_GEMM_FLAGS('N', 'N'); + const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE; + const libxsmm_gemm_descriptor* xgemm_desc; + libxsmm_descriptor_blob xgemm_blob; + libxsmm_sfsspmdm* new_handle = NULL; + libxsmm_smmfunction k_sparse1 = NULL; + libxsmm_smmfunction k_sparse2 = NULL; + libxsmm_smmfunction k_dense = NULL; + int i, j, n, a_nnz, N_sparse1, N_sparse2, N_dense, nkerns; + + /* internal lazy initialization */ + if (NULL == internal_fsspmdm_sperm) internal_sfsspmdm_init(); + + /* some checks... 
*/ + assert(N % 16 == 0); + assert(N >= 16); + assert(LIBXSMM_FEQ(beta, 1.0f) || LIBXSMM_FEQ(beta, 0.0f)); + assert(K <= lda); + assert(N <= ldc); + assert(N <= ldb); + + /* Get the number of non-zeros */ + a_nnz = 0; + for (i = 0; i < M; i++) { + for (j = 0; j < K; j++) { + if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0)) { + a_nnz++; + } + } + } + + /* Null matrix */ + if ( 0 == a_nnz ) return 0; + + /* Allocate handle */ + new_handle = (libxsmm_sfsspmdm*)malloc(sizeof(libxsmm_sfsspmdm)); + if ( NULL == new_handle ) return NULL; + + /* Initialize the handle */ + LIBXSMM_MEMZERO127(new_handle); + /* TODO: in case of ILP64, check value ranges */ + new_handle->N = (int)N; + new_handle->M = (int)M; + new_handle->K = (int)K; + new_handle->ldb = (int)ldb; + new_handle->ldc = (int)ldc; + + /* update flags */ + if ( beta == 0.0 && c_is_nt != 0 ) { + flags |= LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT; + } + + /* Allocate CSR structure */ + a_csr_values = (float*)malloc((size_t)a_nnz * sizeof(float)); + a_csr_rowptr = (unsigned int*)malloc(((size_t)M + 1) * sizeof(unsigned int)); + a_csr_colidx = (unsigned int*)malloc((size_t)a_nnz * sizeof(unsigned int)); + + /* Allocate dense storage */ + aa_dense = (float*)libxsmm_aligned_malloc((size_t)M * (size_t)K * sizeof(float), 64); + + if ( NULL == a_csr_values || NULL == a_csr_rowptr || NULL == a_csr_colidx || NULL == aa_dense ) { + free( a_csr_values ); free( a_csr_rowptr ); free( a_csr_colidx ); + free( new_handle ); + libxsmm_free( aa_dense ); + return NULL; + } + + /* Populate CSR structure */ + for (i = 0, n = 0; i < M; i++) { + a_csr_rowptr[i] = n; + for (j = 0; j < K; j++) { + if (LIBXSMM_NEQ(a_dense[(i*lda) + j], 0.0f)) { + a_csr_values[n] = alpha*a_dense[(i*lda) + j]; + a_csr_colidx[n] = j; + n++; + } + } + } + a_csr_rowptr[M] = a_nnz; + + /* Attempt to JIT a sparse kernel */ + N_sparse1 = libxsmm_cpuid_vlen32(libxsmm_cpuid()); + xgemm_desc = libxsmm_sgemm_descriptor_init(&xgemm_blob, M, N_sparse1, K, + 0, ldb, ldc, one, beta, 
flags, prefetch); + if ( NULL != xgemm_desc ) { + k_sparse1 = libxsmm_create_scsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); + } + + /* If that worked try to JIT a second (wider) sparse kernel */ + N_sparse2 = N_sparse1*2; + if ( NULL != k_sparse1 && N_sparse2 <= N ) { + xgemm_desc = libxsmm_sgemm_descriptor_init(&xgemm_blob, M, N_sparse2, K, + 0, ldb, ldc, one, beta, flags, prefetch); + + if ( NULL != xgemm_desc ) { + k_sparse2 = libxsmm_create_scsr_reg(xgemm_desc, a_csr_rowptr, a_csr_colidx, a_csr_values); + } + } + + /* Free CSR */ + free( a_csr_values ); + free( a_csr_rowptr ); + free( a_csr_colidx ); + + /* Also generate a dense kernel */ + N_dense = 16; + k_dense = libxsmm_smmdispatch(N_dense, M, K, &ldb, &K, &ldc, &one, &beta, &flags, (const int*)LIBXSMM_GEMM_PREFETCH_NONE); + + if ( NULL != k_dense ) { + /* copy A over */ + for ( i = 0; i < M; ++i ) { + for ( j = 0; j < K; ++j ) { + aa_dense[i*K + j] = alpha*a_dense[i*lda + j]; + } + } + } + + /* Tally up how many kernels we got */ + nkerns = !!k_dense + !!k_sparse1 + !!k_sparse2; + + /* We have at least one kernel */ + if ( nkerns ) { + libxsmm_timer_tickint t; + float *B = NULL, *C = NULL; + double dt_dense = ( NULL != k_dense ) ? 1e5 : 1e6; + double dt_sparse1 = ( NULL != k_sparse1 ) ? 1e5 : 1e6; + double dt_sparse2 = ( NULL != k_sparse2 ) ? 
1e5 : 1e6; + void* fp; + + /* If we have two or more kernels then try to benchmark them */ + if ( nkerns >= 2 ) { + B = (float*)libxsmm_aligned_malloc((size_t)K * (size_t)ldb * sizeof(float), 64); + C = (float*)libxsmm_aligned_malloc((size_t)M * (size_t)ldc * sizeof(float), 64); + + if ( NULL != B && NULL != C ) { + for ( i = 0; i < K; i++ ) { + for ( j = 0; j < N; j++ ) { + B[i*ldb + j] = 1; + } + } + for ( i = 0; i < M; i++ ) { + for ( j = 0; j < N; j++ ) { + C[i*ldc + j] = 1; + } + } + } + } + + /* Benchmark dense */ + if ( NULL != k_dense && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_dense ) { + k_dense( B + j, aa_dense, C + j ); + } + } + dt_dense = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Benchmark sparse (regular) */ + if ( NULL != k_sparse1 && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_sparse1 ) { + k_sparse1( internal_fsspmdm_sperm, B + j, C + j ); + } + } + dt_sparse1 = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Benchmark sparse (wide) */ + if ( NULL != k_sparse2 && NULL != B && NULL != C ) { + t = libxsmm_timer_tick(); + for ( i = 0; i < 250; i++ ) { + for ( j = 0; j < N; j += N_sparse2 ) { + k_sparse2( internal_fsspmdm_sperm, B + j, C + j ); + } + } + dt_sparse2 = libxsmm_timer_duration( t, libxsmm_timer_tick() ); + } + + /* Dense fastest */ + if ( dt_dense <= dt_sparse1 && dt_dense <= dt_sparse2 ) { + new_handle->N_chunksize = N_dense; + new_handle->kernel = k_dense; + new_handle->a_dense = aa_dense; + } else { + libxsmm_free( aa_dense ); + } + + /* Sparse (regular) fastest */ + if ( dt_sparse1 < dt_dense && dt_sparse1 <= dt_sparse2 ) { + new_handle->N_chunksize = N_sparse1; + new_handle->kernel = k_sparse1; + } else if ( NULL != k_sparse1 ) { + LIBXSMM_ASSIGN127( &fp, &k_sparse1 ); + libxsmm_free( fp ); + } + + /* Sparse (wide) fastest */ + if ( dt_sparse2 < dt_dense 
&& dt_sparse2 < dt_sparse1 ) { + new_handle->N_chunksize = N_sparse2; + new_handle->kernel = k_sparse2; + } else if ( NULL != k_sparse2 ) { + LIBXSMM_ASSIGN127( &fp, &k_sparse2 ); + libxsmm_free( fp ); + } + + libxsmm_free( B ); + libxsmm_free( C ); + } + else { + libxsmm_free( aa_dense ); + free( new_handle ); + new_handle = NULL; + } + + return new_handle; +} + + +LIBXSMM_API void libxsmm_dfsspmdm_execute( const libxsmm_dfsspmdm* handle, const double* B, double* C ) +{ + int i; + assert( handle != NULL ); + + if ( handle->a_dense == NULL ) { + for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { + handle->kernel( internal_fsspmdm_dperm, B+i, C+i ); + } + } else { + for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { + handle->kernel( B+i, handle->a_dense, C+i ); + } + } +} + + +LIBXSMM_API void libxsmm_sfsspmdm_execute( const libxsmm_sfsspmdm* handle, const float* B, float* C ) +{ + int i; + assert( handle != NULL ); + + if ( handle->a_dense == NULL ) { + for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { + handle->kernel( internal_fsspmdm_sperm, B+i, C+i ); + } + } else { + for ( i = 0; i < handle->N; i+=handle->N_chunksize ) { + handle->kernel( B+i, handle->a_dense, C+i ); + } + } +} + + +LIBXSMM_API void libxsmm_dfsspmdm_destroy( libxsmm_dfsspmdm* handle ) +{ + assert( handle != NULL ); + + if ( handle->a_dense != NULL ) { + libxsmm_free(handle->a_dense); + } + else { + /* deallocate code known to be not registered; no index attached + do not use libxsmm_release_kernel here! We also need to work + around pointer-to-function to pointer-to-object conversion */ + void* fp; + LIBXSMM_ASSIGN127(&fp, &handle->kernel); + libxsmm_free(fp); + } + + free(handle); +} + + +LIBXSMM_API void libxsmm_sfsspmdm_destroy( libxsmm_sfsspmdm* handle ) +{ + assert( handle != NULL ); + + if ( handle->a_dense != NULL ) { + libxsmm_free(handle->a_dense); + } + else { + /* deallocate code known to be not registered; no index attached + do not use libxsmm_release_kernel here! 
We also need to work + around pointer-to-function to pointer-to-object conversion */ + void* fp; + LIBXSMM_ASSIGN127(&fp, &handle->kernel); + libxsmm_free(fp); + } + + free(handle); +} + diff --git a/third_party/libxsmm/src/libxsmm_gemm.c b/third_party/libxsmm/src/libxsmm_gemm.c new file mode 100644 index 00000000..1c997234 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_gemm.c @@ -0,0 +1,2156 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include "libxsmm_gemm.h" +#include "libxsmm_xcopy.h" +#include "libxsmm_hash.h" +#include + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if !defined(LIBXSMM_NO_LIBM) +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(LIBXSMM_GEMM_XCOPY_JIT) && defined(LIBXSMM_XCOPY_JIT) && (0 != LIBXSMM_XCOPY_JIT) +# define LIBXSMM_GEMM_XCOPY_JIT +#endif +#if !defined(LIBXSMM_GEMM_KPARALLEL) && 0 +# define LIBXSMM_GEMM_KPARALLEL +#endif +#if !defined(LIBXSMM_GEMM_BATCHSIZE) +# define LIBXSMM_GEMM_BATCHSIZE 1024 +#endif +#if !defined(LIBXSMM_GEMM_TASKGRAIN) +# define LIBXSMM_GEMM_TASKGRAIN 128 +#endif +#if !defined(LIBXSMM_GEMM_BATCHREDUCE) && !defined(_WIN32) && !defined(__CYGWIN__) /* not supported */ +# define LIBXSMM_GEMM_BATCHREDUCE +#endif +#if !defined(LIBXSMM_GEMM_BATCHSCALE) && (defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP)) +#define LIBXSMM_GEMM_BATCHSCALE ((unsigned 
int)LIBXSMM_ROUND(sizeof(libxsmm_mmbatch_item) * (LIBXSMM_GEMM_MMBATCH_SCALE))) +#endif +#if defined(LIBXSMM_BUILD) +# define LIBXSMM_GEMM_WEAK LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK +#else +# define LIBXSMM_GEMM_WEAK LIBXSMM_API +#endif + +#if (0 != LIBXSMM_SYNC) /** Locks for the batch interface (duplicated C indexes). */ +# define LIBXSMM_GEMM_LOCKIDX(IDX, NPOT) LIBXSMM_MOD2(LIBXSMM_CRC32U(LIBXSMM_BLASINT_NBITS)(2507/*seed*/, &(IDX)), NPOT) +# define LIBXSMM_GEMM_LOCKPTR(PTR, NPOT) LIBXSMM_MOD2(LIBXSMM_CRC32U(LIBXSMM_BITS)(1975/*seed*/, &(PTR)), NPOT) +# if !defined(LIBXSMM_GEMM_MAXNLOCKS) +# define LIBXSMM_GEMM_MAXNLOCKS 1024 +# endif +# if !defined(LIBXSMM_GEMM_LOCKFWD) +# define LIBXSMM_GEMM_LOCKFWD +# endif +# if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_GEMM_LOCK) +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_gemm_locktype { + char pad[LIBXSMM_CACHELINE]; + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) state; +} internal_gemm_locktype; +# else +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_gemm_locktype { + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) state; +} internal_gemm_locktype; +# endif +LIBXSMM_APIVAR_DEFINE(internal_gemm_locktype internal_gemm_lock[LIBXSMM_GEMM_MAXNLOCKS]); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_nlocks); /* populated number of locks */ +#endif + +/* definition of corresponding variables */ +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch_function); +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch_function); +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemm_function libxsmm_original_dgemm_function); +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemm_function libxsmm_original_sgemm_function); +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_dgemv_function libxsmm_original_dgemv_function); +LIBXSMM_APIVAR_PUBLIC_DEF(/*volatile*/libxsmm_sgemv_function libxsmm_original_sgemv_function); +/* definition of corresponding variables 
*/ +LIBXSMM_APIVAR_PUBLIC_DEF(libxsmm_gemm_descriptor libxsmm_mmbatch_desc); +LIBXSMM_APIVAR_PUBLIC_DEF(void* libxsmm_mmbatch_array); +LIBXSMM_APIVAR_PUBLIC_DEF(LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) libxsmm_mmbatch_lock); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mmbatch_size); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_gemm_npargroups); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_gemm_taskgrain); +LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_gemm_tasks); +LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_gemm_wrap); + +LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch_default); +/** Determines the prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */ +LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch); + +/** Prefetch strategy for tiled GEMM. */ +LIBXSMM_APIVAR_DEFINE(libxsmm_gemm_prefetch_type internal_gemm_tiled_prefetch); +/** Vector width (in Bytes) used for GEMM. */ +LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_vwidth); +/** Limit the M-extent of the tile. */ +LIBXSMM_APIVAR_DEFINE(unsigned int internal_gemm_mlimit); +/** N-extent of the tile relative to its M-extent (tile shape). */ +LIBXSMM_APIVAR_DEFINE(float internal_gemm_nstretch); +/** K-extent of the tile relative to its M-extent (tile shape). 
*/ +LIBXSMM_APIVAR_DEFINE(float internal_gemm_kstretch); +/** Determines if batch-reduce is enabled */ +LIBXSMM_APIVAR_DEFINE(int internal_gemm_batchreduce); + +/** Weak fallback for the Fortran-mangled double-precision gemm_batch symbol: every group i is handed to libxsmm_dmmbatch_blas. */ +#if defined(LIBXSMM_BUILD) +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemm_batch)( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ +#if (0 != LIBXSMM_BLAS) +# if defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) + if (0 > libxsmm_gemm_wrap) { /* negative wrap-mode: forward to the dgemm_batch symbol as-is */ + LIBXSMM_FSYMBOL(dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else +# endif + { + const libxsmm_blasint ptrsize = sizeof(void*); + libxsmm_blasint i, j = 0; /* i: group index, j: index of the group's first matrix */ + LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != group_count && NULL != group_size); + LIBXSMM_ASSERT(NULL != m_array && NULL != n_array && NULL != k_array && NULL != lda_array && NULL != ldb_array && NULL != ldc_array); + LIBXSMM_ASSERT(NULL != a_array && NULL != b_array && NULL != c_array && NULL != alpha_array && NULL != beta_array); + for (i = 0; i < *group_count; ++i) { + const libxsmm_blasint size = group_size[i]; + libxsmm_dmmbatch_blas(transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, + a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, + c_array + j, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); + j += size; /* per-group arguments advance by one, matrix pointers by the group's size */ + } + } +#else + libxsmm_blas_error("dgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + 
alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemm_batch)( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ +#if (0 != LIBXSMM_BLAS) +# if defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) + if (0 > libxsmm_gemm_wrap) { + LIBXSMM_FSYMBOL(sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); + } + else +# endif + { + const libxsmm_blasint ptrsize = sizeof(void*); + libxsmm_blasint i, j = 0; + LIBXSMM_ASSERT(NULL != transa_array && NULL != transb_array && NULL != group_count && NULL != group_size); + LIBXSMM_ASSERT(NULL != m_array && NULL != n_array && NULL != k_array && NULL != lda_array && NULL != ldb_array && NULL != ldc_array); + LIBXSMM_ASSERT(NULL != a_array && NULL != b_array && NULL != c_array && NULL != alpha_array && NULL != beta_array); + for (i = 0; i < *group_count; ++i) { + const libxsmm_blasint size = group_size[i]; + libxsmm_smmbatch_blas(transa_array + i, transb_array + i, m_array[i], n_array[i], k_array[i], alpha_array + i, + a_array + i, lda_array + i, b_array + i, ldb_array + i, beta_array + i, + c_array + i, ldc_array + i, 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); + j += size; + } + } +#else + libxsmm_blas_error("sgemm_batch")(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, 
c_array, ldc_array, + group_count, group_size); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_FSYMBOL(dgemm)((LIBXSMM_BLAS_CONST char*)transa, (LIBXSMM_BLAS_CONST char*)transb, + (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST libxsmm_blasint*)k, + (LIBXSMM_BLAS_CONST double*)alpha, (LIBXSMM_BLAS_CONST double*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, + (LIBXSMM_BLAS_CONST double*)b, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldb, + (LIBXSMM_BLAS_CONST double*) beta, c, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldc); +#else + libxsmm_blas_error("dgemm")(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_FSYMBOL(sgemm)((LIBXSMM_BLAS_CONST char*)transa, (LIBXSMM_BLAS_CONST char*)transb, + (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, (LIBXSMM_BLAS_CONST libxsmm_blasint*)k, + (LIBXSMM_BLAS_CONST float*)alpha, (LIBXSMM_BLAS_CONST float*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, + (LIBXSMM_BLAS_CONST float*)b, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldb, + (LIBXSMM_BLAS_CONST float*) beta, c, (LIBXSMM_BLAS_CONST libxsmm_blasint*)ldc); +#else + libxsmm_blas_error("sgemm")(transa, transb, m, n, k, alpha, a, 
lda, b, ldb, beta, c, ldc); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_dgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const double* alpha, const double* a, const libxsmm_blasint* lda, const double* x, const libxsmm_blasint* incx, + const double* beta, double* y, const libxsmm_blasint* incy) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_FSYMBOL(dgemv)((LIBXSMM_BLAS_CONST char*)trans, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, + (LIBXSMM_BLAS_CONST double*)alpha, (LIBXSMM_BLAS_CONST double*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, + (LIBXSMM_BLAS_CONST double*)x, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incx, + (LIBXSMM_BLAS_CONST double*) beta, y, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incy); +#else + libxsmm_blas_error("dgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void LIBXSMM_FSYMBOL(__real_sgemv)(const char* trans, const libxsmm_blasint* m, const libxsmm_blasint* n, + const float* alpha, const float* a, const libxsmm_blasint* lda, const float* x, const libxsmm_blasint* incx, + const float* beta, float* y, const libxsmm_blasint* incy) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_FSYMBOL(sgemv)((LIBXSMM_BLAS_CONST char*)trans, (LIBXSMM_BLAS_CONST libxsmm_blasint*)m, (LIBXSMM_BLAS_CONST libxsmm_blasint*)n, + (LIBXSMM_BLAS_CONST float*)alpha, (LIBXSMM_BLAS_CONST float*)a, (LIBXSMM_BLAS_CONST libxsmm_blasint*)lda, + (LIBXSMM_BLAS_CONST float*)x, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incx, + (LIBXSMM_BLAS_CONST float*) beta, y, (LIBXSMM_BLAS_CONST libxsmm_blasint*)incy); +#else + libxsmm_blas_error("sgemv")(trans, m, n, alpha, a, lda, x, incx, beta, y, incy); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void __real_dgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], 
const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_FSYMBOL(__real_dgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void __real_sgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + LIBXSMM_FSYMBOL(__real_sgemm_batch)(transa_array, transb_array, m_array, n_array, k_array, + alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, + group_count, group_size); +} +#endif /*defined(LIBXSMM_BUILD)*/ + + +LIBXSMM_GEMM_WEAK libxsmm_dgemm_batch_function libxsmm_original_dgemm_batch(void) +{ +#if (0 != LIBXSMM_BLAS) && defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) + LIBXSMM_BLAS_WRAPPER(1, double, gemm_batch, libxsmm_original_dgemm_batch_function, NULL/*unknown*/); + /*LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_batch_function);*/ +#else + LIBXSMM_BLAS_WRAPPER(0, double, gemm_batch, libxsmm_original_dgemm_batch_function, NULL/*unknown*/); +#endif + return libxsmm_original_dgemm_batch_function; +} + + +LIBXSMM_GEMM_WEAK libxsmm_sgemm_batch_function libxsmm_original_sgemm_batch(void) +{ +#if (0 != LIBXSMM_BLAS) && defined(LIBXSMM_WRAP) && (0 > LIBXSMM_WRAP) + LIBXSMM_BLAS_WRAPPER(1, float, gemm_batch, 
libxsmm_original_sgemm_batch_function, NULL/*unknown*/); + /*LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_batch_function);*/ +#else + LIBXSMM_BLAS_WRAPPER(0, float, gemm_batch, libxsmm_original_sgemm_batch_function, NULL/*unknown*/); +#endif + return libxsmm_original_sgemm_batch_function; +} + + +LIBXSMM_GEMM_WEAK libxsmm_dgemm_function libxsmm_original_dgemm(void) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, double, gemm, libxsmm_original_dgemm_function, NULL/*unknown*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_dgemm_function); +#else + LIBXSMM_BLAS_WRAPPER(0, double, gemm, libxsmm_original_dgemm_function, NULL/*unknown*/); +#endif + return libxsmm_original_dgemm_function; +} + + +LIBXSMM_GEMM_WEAK libxsmm_sgemm_function libxsmm_original_sgemm(void) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, float, gemm, libxsmm_original_sgemm_function, NULL/*unknown*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_sgemm_function); +#else + LIBXSMM_BLAS_WRAPPER(0, float, gemm, libxsmm_original_sgemm_function, NULL/*unknown*/); +#endif + return libxsmm_original_sgemm_function; +} + + +LIBXSMM_GEMM_WEAK libxsmm_dgemv_function libxsmm_original_dgemv(void) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, double, gemv, libxsmm_original_dgemv_function, NULL/*unknown*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_dgemv_function); +#else + LIBXSMM_BLAS_WRAPPER(0, double, gemv, libxsmm_original_dgemv_function, NULL/*unknown*/); +#endif + return libxsmm_original_dgemv_function; +} + + +LIBXSMM_GEMM_WEAK libxsmm_sgemv_function libxsmm_original_sgemv(void) +{ +#if (0 != LIBXSMM_BLAS) + LIBXSMM_BLAS_WRAPPER(1, float, gemv, libxsmm_original_sgemv_function, NULL/*unknown*/); + LIBXSMM_ASSERT(NULL != libxsmm_original_sgemv_function); +#else + LIBXSMM_BLAS_WRAPPER(0, float, gemv, libxsmm_original_sgemv_function, NULL/*unknown*/); +#endif + return libxsmm_original_sgemv_function; +} + + +LIBXSMM_API libxsmm_sink_function libxsmm_blas_error(const char* symbol) +{ + static int 
error_once = 0; + LIBXSMM_BLAS_ERROR(symbol, &error_once); + return libxsmm_sink; +} + + +LIBXSMM_API_INTERN void libxsmm_gemm_init(int archid) +{ + const char* env_w = getenv("LIBXSMM_GEMM_WRAP"); + LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_GEMM_LOCK) attr; + LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_GEMM_LOCK, &attr); +#if defined(LIBXSMM_WRAP) /* determines if wrap is considered */ + { /* intercepted GEMMs (1: sequential and non-tiled, 2: parallelized and tiled) */ +# if defined(__STATIC) /* with static library the user controls interceptor already */ + libxsmm_gemm_wrap = ((NULL == env_w || 0 == *env_w) /* LIBXSMM_WRAP=0: no promotion */ + ? (0 < (LIBXSMM_WRAP) ? (LIBXSMM_WRAP + 2) : (LIBXSMM_WRAP - 2)) : atoi(env_w)); +# else + libxsmm_gemm_wrap = ((NULL == env_w || 0 == *env_w) ? (LIBXSMM_WRAP) : atoi(env_w)); +# endif + } +#endif + { /* setup prefetch strategy for tiled GEMMs */ + const char *const env_p = getenv("LIBXSMM_TGEMM_PREFETCH"); + const libxsmm_gemm_prefetch_type tiled_prefetch_default = LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; + const int uid = ((NULL == env_p || 0 == *env_p) ? LIBXSMM_PREFETCH_AUTO/*default*/ : atoi(env_p)); + internal_gemm_tiled_prefetch = (0 <= uid ? libxsmm_gemm_uid2prefetch(uid) : tiled_prefetch_default); + } +#if (0 != LIBXSMM_SYNC) + { /* initialize locks for the batch interface */ + const char *const env_locks = getenv("LIBXSMM_GEMM_NLOCKS"); + const int nlocks = ((NULL == env_locks || 0 == *env_locks) ? -1/*default*/ : atoi(env_locks)); + unsigned int i; + internal_gemm_nlocks = LIBXSMM_UP2POT(0 > nlocks ? 
(LIBXSMM_GEMM_MAXNLOCKS) : LIBXSMM_MIN(nlocks, LIBXSMM_GEMM_MAXNLOCKS)); + for (i = 0; i < internal_gemm_nlocks; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_GEMM_LOCK, &internal_gemm_lock[i].state, &attr); + } +#endif +#if defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP) + { /* determines if batch-reduce kernel or batch-wrap is considered */ + const char *const env_r = getenv("LIBXSMM_GEMM_BATCHREDUCE"); + internal_gemm_batchreduce = (NULL == env_r || 0 == *env_r) ? 0 : atoi(env_r); + if ((NULL == env_w || 0 == *env_w) && ((LIBXSMM_GEMM_MMBATCH_VERBOSITY <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity) || 0 > libxsmm_verbosity)) { + libxsmm_mmbatch_desc.flags = LIBXSMM_MMBATCH_FLAG_STATISTIC; /* enable auto-batch statistic */ + internal_gemm_batchreduce = 0; + } + if (0 != internal_gemm_batchreduce || 0 != libxsmm_gemm_wrap) { + const char *const env_b = getenv("LIBXSMM_GEMM_BATCHSIZE"); + const int env_bi = (NULL == env_b || 0 == *env_b) ? -1/*auto*/ : atoi(env_b); + const unsigned int env_bu = (unsigned int)(0 >= env_bi ? 
(LIBXSMM_GEMM_BATCHSIZE) : env_bi); + const unsigned int batchscale = LIBXSMM_ABS(internal_gemm_batchreduce) * 2048/*arbitrary*/ * 2/*A and B-matrices*/ * sizeof(void*); + const unsigned int minsize = LIBXSMM_UPDIV(batchscale * env_bu, LIBXSMM_GEMM_BATCHSCALE); + const unsigned int batchsize = LIBXSMM_MAX(env_bu, minsize); + const void *const extra = NULL; + LIBXSMM_ASSERT(1 < (LIBXSMM_GEMM_MMBATCH_SCALE) && NULL == libxsmm_mmbatch_array); + if (EXIT_SUCCESS == libxsmm_xmalloc(&libxsmm_mmbatch_array, (size_t)batchsize * (LIBXSMM_GEMM_BATCHSCALE), 0/*auto-alignment*/, + LIBXSMM_MALLOC_FLAG_PRIVATE /*| LIBXSMM_MALLOC_FLAG_SCRATCH*/, &extra, sizeof(extra))) + { + LIBXSMM_LOCK_INIT(LIBXSMM_GEMM_LOCK, &libxsmm_mmbatch_lock, &attr); + LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array); + libxsmm_mmbatch_size = batchsize; + } + } + } +#else + LIBXSMM_UNUSED(env_w); +#endif + { /* determines grain-size of tasks (when available) */ + const char *const env_s = getenv("LIBXSMM_GEMM_NPARGROUPS"); + libxsmm_gemm_npargroups = ((NULL == env_s || 0 == *env_s || 0 >= atoi(env_s)) + ? 
(LIBXSMM_GEMM_NPARGROUPS) : atoi(env_s)); + } + if (LIBXSMM_X86_AVX512_CORE <= archid) { + internal_gemm_vwidth = 64; + internal_gemm_mlimit = 48; + internal_gemm_nstretch = 3.0f; + internal_gemm_kstretch = 2.0f; + } + else if (LIBXSMM_X86_AVX512_MIC <= archid) { + internal_gemm_vwidth = 64; + internal_gemm_mlimit = 64; + internal_gemm_nstretch = 1.0f; + internal_gemm_kstretch = 1.0f; + } + else if (LIBXSMM_X86_AVX2 <= archid) { + internal_gemm_vwidth = 32; + internal_gemm_mlimit = 48; + internal_gemm_nstretch = 3.0f; + internal_gemm_kstretch = 2.0f; + } + else if (LIBXSMM_X86_AVX <= archid) { + internal_gemm_vwidth = 32; + internal_gemm_mlimit = 48; + internal_gemm_nstretch = 5.0f; + internal_gemm_kstretch = 1.0f; + } + else { + internal_gemm_vwidth = 16; + internal_gemm_mlimit = 48; + internal_gemm_nstretch = 7.0f; + internal_gemm_kstretch = 5.0f; + } + { /* setup tile sizes according to environment (LIBXSMM_TGEMM_M, LIBXSMM_TGEMM_N, LIBXSMM_TGEMM_K) */ + const char *const env_m = getenv("LIBXSMM_TGEMM_M"), *const env_n = getenv("LIBXSMM_TGEMM_N"), *const env_k = getenv("LIBXSMM_TGEMM_K"); + const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); + const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); + const int k = ((NULL == env_k || 0 == *env_k) ? 0 : atoi(env_k)); + if (0 < m) { + if (0 < n) internal_gemm_nstretch = ((float)n) / m; + if (0 < k) internal_gemm_kstretch = ((float)k) / m; + } + } + { /* setup tile sizes according to environment (LIBXSMM_TGEMM_NS, LIBXSMM_TGEMM_KS) */ + const char *const env_ns = getenv("LIBXSMM_TGEMM_NS"), *const env_ks = getenv("LIBXSMM_TGEMM_KS"); + const double ns = ((NULL == env_ns || 0 == *env_ns) ? 0 : atof(env_ns)); + const double ks = ((NULL == env_ks || 0 == *env_ks) ? 
0 : atof(env_ks)); + if (0 < ns) internal_gemm_nstretch = (float)LIBXSMM_MIN(24, ns); + if (0 < ks) internal_gemm_kstretch = (float)LIBXSMM_MIN(24, ks); + } + { /* determines if OpenMP tasks are used (when available) */ + const char *const env_t = getenv("LIBXSMM_GEMM_TASKS"); + const int gemm_tasks = ((NULL == env_t || 0 == *env_t) ? 0/*disabled*/ : atoi(env_t)); + libxsmm_gemm_tasks = (0 <= gemm_tasks ? LIBXSMM_ABS(gemm_tasks) : 1/*enabled*/); + } + { /* determines grain-size of tasks (when available) */ + const char *const env_g = getenv("LIBXSMM_GEMM_TASKGRAIN"); + const int gemm_taskgrain = ((NULL == env_g || 0 == *env_g || 0 >= atoi(env_g)) + ? (LIBXSMM_GEMM_TASKGRAIN) : atoi(env_g)); + /* adjust grain-size or scale beyond the number of threads */ + libxsmm_gemm_taskgrain = LIBXSMM_MAX(0 < libxsmm_gemm_tasks ? (gemm_taskgrain / libxsmm_gemm_tasks) : gemm_taskgrain, 1); + } + LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_GEMM_LOCK, &attr); + /* determine BLAS function-pointers */ + libxsmm_original_dgemm_batch(); + libxsmm_original_sgemm_batch(); + libxsmm_original_dgemm(); + libxsmm_original_sgemm(); + libxsmm_original_dgemv(); + libxsmm_original_sgemv(); +} + + +LIBXSMM_API_INTERN void libxsmm_gemm_finalize(void) +{ +#if (0 != LIBXSMM_SYNC) + unsigned int i; for (i = 0; i < internal_gemm_nlocks; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_GEMM_LOCK, &internal_gemm_lock[i].state); +#endif +#if defined(LIBXSMM_GEMM_BATCHREDUCE) || defined(LIBXSMM_WRAP) + if (NULL != libxsmm_mmbatch_array) { + void *extra = NULL, *const mmbatch_array = libxsmm_mmbatch_array; + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(mmbatch_array, NULL/*size*/, NULL/*flags*/, &extra) && NULL != extra) { + const libxsmm_mmbatch_flush_function flush = *(libxsmm_mmbatch_flush_function*)extra; + if (NULL != flush) flush(); + } +#if !defined(NDEBUG) + libxsmm_mmbatch_array = NULL; +#endif + libxsmm_xfree(mmbatch_array, 0/*no check*/); + LIBXSMM_LOCK_DESTROY(LIBXSMM_GEMM_LOCK, &libxsmm_mmbatch_lock); + } +#endif +} + 
+ +LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_xprefetch(const int* prefetch) +{ + LIBXSMM_INIT /* load configuration */ + return libxsmm_get_gemm_prefetch(NULL == prefetch ? ((int)libxsmm_gemm_auto_prefetch) : *prefetch); +} + + +LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_prefetch(int prefetch) +{ + libxsmm_gemm_prefetch_type result; +#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(__MINGW32__) + if (0 > prefetch) { + LIBXSMM_INIT /* load configuration */ + result = libxsmm_gemm_auto_prefetch_default; + } + else { + result = (libxsmm_gemm_prefetch_type)prefetch; + } +#else /* TODO: full support for Windows calling convention */ + result = LIBXSMM_GEMM_PREFETCH_NONE; + LIBXSMM_UNUSED(prefetch); +#endif + return result; +} + + +LIBXSMM_API_INTERN int libxsmm_gemm_prefetch2uid(libxsmm_gemm_prefetch_type prefetch) +{ + switch (prefetch) { + case LIBXSMM_GEMM_PREFETCH_SIGONLY: return 2; + case LIBXSMM_GEMM_PREFETCH_BL2_VIA_C: return 3; + case LIBXSMM_GEMM_PREFETCH_AL2_AHEAD: return 4; + case LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD: return 5; + case LIBXSMM_GEMM_PREFETCH_AL2: return 6; + case LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C: return 7; + case LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB: return 8; + default: { + LIBXSMM_ASSERT(LIBXSMM_GEMM_PREFETCH_NONE == prefetch); + return 0; + } + } +} + + +LIBXSMM_API_INTERN libxsmm_gemm_prefetch_type libxsmm_gemm_uid2prefetch(int uid) +{ + switch (uid) { + case 1: return LIBXSMM_GEMM_PREFETCH_NONE; /* nopf */ + case 2: return LIBXSMM_GEMM_PREFETCH_SIGONLY; /* pfsigonly */ + case 3: return LIBXSMM_GEMM_PREFETCH_BL2_VIA_C; /* BL2viaC */ + case 4: return LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; /* curAL2 */ + case 5: return LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD; /* curAL2_BL2viaC */ + case 6: return LIBXSMM_GEMM_PREFETCH_AL2; /* AL2 */ + case 7: return LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; /* AL2_BL2viaC */ + case 8: return LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB; + default: { + if (0 != libxsmm_verbosity) { /* library 
code is expected to be mute */ + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM WARNING: invalid prefetch strategy requested!\n"); + } + } + return LIBXSMM_GEMM_PREFETCH_NONE; + } + } +} + + +LIBXSMM_API void libxsmm_gemm_print(void* ostream, + libxsmm_gemm_precision precision, const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc) +{ + libxsmm_gemm_print2(ostream, precision, precision, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_API void libxsmm_gemm_print2(void* ostream, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, + const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc) +{ + const libxsmm_blasint nn = *(n ? n : m), kk = *(k ? k : m); + const char ctransa = (char)(NULL != transa ? (*transa) : (0 == (LIBXSMM_FLAGS & LIBXSMM_GEMM_FLAG_TRANS_A) ? 'n' : 't')); + const char ctransb = (char)(NULL != transb ? (*transb) : (0 == (LIBXSMM_FLAGS & LIBXSMM_GEMM_FLAG_TRANS_B) ? 'n' : 't')); + const libxsmm_blasint ilda = (NULL != lda ? *lda : (('n' == ctransa || 'N' == ctransa) ? *m : kk)); + const libxsmm_blasint ildb = (NULL != ldb ? *ldb : (('n' == ctransb || 'N' == ctransb) ? kk : nn)); + const libxsmm_blasint ildc = *(NULL != ldc ? 
ldc : m); + libxsmm_mhd_elemtype mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; + char string_a[128], string_b[128], typeprefix = 0; + + switch (iprec | oprec) { + case LIBXSMM_GEMM_PRECISION_F64: { + LIBXSMM_ASSERT(iprec == oprec); + LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "%g", NULL != alpha ? *((const double*)alpha) : LIBXSMM_ALPHA); + LIBXSMM_SNPRINTF(string_b, sizeof(string_b), "%g", NULL != beta ? *((const double*)beta) : LIBXSMM_BETA); + mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_F64; + typeprefix = 'd'; + } break; + case LIBXSMM_GEMM_PRECISION_F32: { + LIBXSMM_ASSERT(iprec == oprec); + LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "%g", NULL != alpha ? *((const float*)alpha) : LIBXSMM_ALPHA); + LIBXSMM_SNPRINTF(string_b, sizeof(string_b), "%g", NULL != beta ? *((const float*)beta) : LIBXSMM_BETA); + mhd_elemtype = LIBXSMM_MHD_ELEMTYPE_F32; + typeprefix = 's'; + } break; + default: if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { /* TODO: support I16, etc. 
*/ + fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n"); + } + } + } + + if (0 != typeprefix) { + if (NULL != ostream) { /* print information about GEMM call */ + if (NULL != a && NULL != b && NULL != c) { + fprintf((FILE*)ostream, "%cgemm('%c', '%c', %" PRIuPTR "/*m*/, %" PRIuPTR "/*n*/, %" PRIuPTR "/*k*/,\n" + " %s/*alpha*/, %p/*a*/, %" PRIuPTR "/*lda*/,\n" + " %p/*b*/, %" PRIuPTR "/*ldb*/,\n" + " %s/*beta*/, %p/*c*/, %" PRIuPTR "/*ldc*/)", + typeprefix, ctransa, ctransb, (uintptr_t)*m, (uintptr_t)nn, (uintptr_t)kk, + string_a, a, (uintptr_t)ilda, b, (uintptr_t)ildb, string_b, c, (uintptr_t)ildc); + } + else { + fprintf((FILE*)ostream, "%cgemm(trans=%c%c mnk=%" PRIuPTR ",%" PRIuPTR ",%" PRIuPTR + " ldx=%" PRIuPTR ",%" PRIuPTR ",%" PRIuPTR " a,b=%s,%s)", + typeprefix, ctransa, ctransb, (uintptr_t)*m, (uintptr_t)nn, (uintptr_t)kk, + (uintptr_t)ilda, (uintptr_t)ildb, (uintptr_t)ildc, string_a, string_b); + } + } + else { /* dump A, B, and C matrices into MHD files */ + char extension_header[256]; + size_t data_size[2], size[2]; + + if (NULL != a) { + LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "TRANS = %c\nALPHA = %s", ctransa, string_a); + LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_a_%p.mhd", a); + data_size[0] = (size_t)ilda; data_size[1] = (size_t)kk; size[0] = (size_t)(*m); size[1] = (size_t)kk; + libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype, + NULL/*conversion*/, a, NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/); + } + if (NULL != b) { + LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "\nTRANS = %c", ctransb); + LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_b_%p.mhd", b); + data_size[0] = (size_t)ildb; data_size[1] = (size_t)nn; size[0] = (size_t)kk; size[1] = (size_t)nn; + libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype, + NULL/*conversion*/, b, 
NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/); + } + if (NULL != c) { + LIBXSMM_SNPRINTF(extension_header, sizeof(extension_header), "BETA = %s", string_b); + LIBXSMM_SNPRINTF(string_a, sizeof(string_a), "libxsmm_c_%p.mhd", c); + data_size[0] = (size_t)ildc; data_size[1] = (size_t)nn; size[0] = (size_t)(*m); size[1] = (size_t)nn; + libxsmm_mhd_write(string_a, NULL/*offset*/, size, data_size, 2/*ndims*/, 1/*ncomponents*/, mhd_elemtype, + NULL/*conversion*/, c, NULL/*header_size*/, extension_header, NULL/*extension*/, 0/*extension_size*/); + } + } + } +} + + +LIBXSMM_API void libxsmm_gemm_dprint( + void* ostream, libxsmm_gemm_precision precision, char transa, char transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda, + const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc) +{ + libxsmm_gemm_dprint2(ostream, precision, precision, transa, transb, m, n, k, dalpha, a, lda, b, ldb, dbeta, c, ldc); +} + + +LIBXSMM_API void libxsmm_gemm_dprint2( + void* ostream, libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, char transa, char transb, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, double dalpha, const void* a, libxsmm_blasint lda, + const void* b, libxsmm_blasint ldb, double dbeta, void* c, libxsmm_blasint ldc) +{ + switch (iprec) { + case LIBXSMM_GEMM_PRECISION_F64: { + libxsmm_gemm_print2(ostream, LIBXSMM_GEMM_PRECISION_F64, oprec, &transa, &transb, + &m, &n, &k, &dalpha, a, &lda, b, &ldb, &dbeta, c, &ldc); + } break; + case LIBXSMM_GEMM_PRECISION_F32: { + const float alpha = (float)dalpha, beta = (float)dbeta; + libxsmm_gemm_print2(ostream, LIBXSMM_GEMM_PRECISION_F32, oprec, &transa, &transb, + &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); + } break; + default: { + libxsmm_gemm_print2(ostream, iprec, oprec, &transa, &transb, + &m, &n, &k, &dalpha, a, &lda, b, &ldb, &dbeta, c, &ldc); + } + } +} + + +LIBXSMM_API void 
libxsmm_gemm_xprint(void* ostream, + libxsmm_xmmfunction kernel, const void* a, const void* b, void* c) +{ + const libxsmm_descriptor* desc; + libxsmm_code_pointer code; + size_t code_size; + code.xgemm = kernel; /* view the kernel as generic code pointer in order to query its descriptor */ + if (NULL != libxsmm_get_kernel_xinfo(code, &desc, &code_size) && + NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) + { /* only matrix-multiplication kernels are printed; everything is taken from the kernel's own descriptor */ + libxsmm_gemm_dprint2(ostream, + (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype), + (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype), + (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_A & desc->gemm.desc.flags) ? 'N' : 'T'), + (char)(0 == (LIBXSMM_GEMM_FLAG_TRANS_B & desc->gemm.desc.flags) ? 'N' : 'T'), + (libxsmm_blasint)desc->gemm.desc.m, (libxsmm_blasint)desc->gemm.desc.n, (libxsmm_blasint)desc->gemm.desc.k, + /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & desc->gemm.desc.flags) ? 0 : */1, a, + (libxsmm_blasint)desc->gemm.desc.lda, b, (libxsmm_blasint)desc->gemm.desc.ldb, + 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & desc->gemm.desc.flags) ? 
0 : 1, c, (libxsmm_blasint)desc->gemm.desc.ldc); /* beta as recorded for this kernel (not the global batch descriptor) */ + if (NULL != ostream) fprintf((FILE*)ostream, " = %p+%u", code.ptr_const, (unsigned int)code_size); /* NULL stream: matrices are dumped into files, nothing to append to */ + } +} + +/** Forwards to LIBXSMM_BLAS_XGEMM for F64 or F32 input precision; any other precision is reported once on stderr (if verbosity is non-zero). */ +LIBXSMM_API void libxsmm_blas_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_INIT + switch (iprec) { + case LIBXSMM_GEMM_PRECISION_F64: { + LIBXSMM_ASSERT(iprec == oprec); + LIBXSMM_BLAS_XGEMM(double, double, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } break; + case LIBXSMM_GEMM_PRECISION_F32: { + LIBXSMM_ASSERT(iprec == oprec); + LIBXSMM_BLAS_XGEMM(float, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } break; + default: if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + static int error_once = 0; + LIBXSMM_UNUSED(oprec); + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { /* TODO: support I16, etc. 
*/ + fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n"); + } + } + } +} + + +LIBXSMM_API_INLINE int libxsmm_gemm_plan_internal(unsigned int ntasks, + unsigned int m, unsigned int n, unsigned int k, /* whole problem size */ + unsigned int tm, unsigned int tn, unsigned int tk, /* tile size (kernel) */ + unsigned int* nmt, unsigned int* nnt, unsigned int* nkt, /* number of tiles */ + unsigned int* mt, unsigned int* nt, unsigned int* kt) /* number of tasks */ +{ + unsigned int result = EXIT_SUCCESS, replan = 0; + LIBXSMM_ASSERT(NULL != nmt && NULL != nnt && NULL != nkt); + LIBXSMM_ASSERT(NULL != mt && NULL != nt && NULL != kt); + LIBXSMM_ASSERT(0 < ntasks); + *nmt = (m + tm - 1) / LIBXSMM_MAX(tm, 1); + *nnt = (n + tn - 1) / LIBXSMM_MAX(tn, 1); + *nkt = (k + tk - 1) / LIBXSMM_MAX(tk, 1); +#if !defined(NDEBUG) + *mt = *nt = *kt = 0; +#endif + do { + if (1 >= replan) *mt = libxsmm_product_limit(*nmt, ntasks, 0); + if (1 == replan || ntasks <= *mt) { /* M-parallelism */ + *nt = 1; + *kt = 1; + replan = 0; + } + else { + const unsigned int mntasks = libxsmm_product_limit((*nmt) * (*nnt), ntasks, 0); + if (0 == replan && *mt >= mntasks) replan = 1; + if (2 == replan || (0 == replan && ntasks <= mntasks)) { /* MN-parallelism */ + *nt = mntasks / *mt; + *kt = 1; + replan = 0; + } + else { /* MNK-parallelism */ + const unsigned int mnktasks = libxsmm_product_limit((*nmt) * (*nnt) * (*nkt), ntasks, 0); + if (mntasks < mnktasks) { +#if defined(LIBXSMM_GEMM_KPARALLEL) + *nt = mntasks / *mt; + *kt = mnktasks / mntasks; + replan = 0; +#else + static int error_once = 0; + if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: XGEMM K-parallelism triggered!\n"); + } +#endif + } +#if defined(LIBXSMM_GEMM_KPARALLEL) + else +#endif + if (0 == replan) replan = 2; + } + } + } while (0 != replan); 
+ if (0 == *mt || 0 == *nt || 0 == *kt) { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API libxsmm_gemm_handle* libxsmm_gemm_handle_init(libxsmm_gemm_blob* blob, + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const void* alpha, const void* beta, int flags, /*unsigned*/int ntasks) +{ + unsigned int ulda, uldb, um, un, uk, tm = 0, tn = 0, tk = 0, max_ntasks = 0; + libxsmm_descriptor_blob desc_blob; + union { + libxsmm_gemm_handle* ptr; + libxsmm_gemm_blob* blob; + } result; + LIBXSMM_ASSERT(sizeof(libxsmm_gemm_handle) <= sizeof(libxsmm_gemm_blob)); + if (NULL != blob && NULL != m && 0 < ntasks) { + unsigned int ntm = 0, ntn = 0, ntk = 0, mt = 1, nt = 1, kt = 1; + const char *const env_tm = getenv("LIBXSMM_TGEMM_M"); + libxsmm_blasint klda, kldb, kldc, km, kn; + libxsmm_gemm_descriptor* desc; + double dbeta; + LIBXSMM_INIT + result.blob = blob; +#if defined(NDEBUG) + result.ptr->copy_a.ptr = result.ptr->copy_b.ptr = result.ptr->copy_i.ptr = result.ptr->copy_o.ptr = NULL; +#else + memset(blob, 0, sizeof(libxsmm_gemm_blob)); +#endif + if (EXIT_SUCCESS != libxsmm_dvalue((libxsmm_datatype)oprec, beta, &dbeta)) dbeta = LIBXSMM_BETA; /* fuse beta into flags */ + result.ptr->gemm_flags = LIBXSMM_GEMM_PFLAGS(transa, transb, LIBXSMM_FLAGS) | (LIBXSMM_NEQ(0, dbeta) ? 0 : LIBXSMM_GEMM_FLAG_BETA_0); + /* TODO: check that arguments fit into handle (unsigned int vs. libxsmm_blasint) */ + um = (unsigned int)(*m); uk = (NULL != k ? ((unsigned int)(*k)) : um); un = (NULL != n ? 
((unsigned int)(*n)) : uk); + result.ptr->otypesize = libxsmm_typesize((libxsmm_datatype)oprec); + if (NULL == env_tm || 0 >= atoi(env_tm)) { + const unsigned int vwidth = LIBXSMM_MAX(internal_gemm_vwidth / result.ptr->otypesize, 1); + const double s2 = (double)internal_gemm_nstretch * internal_gemm_kstretch; /* LIBXSMM_INIT! */ + unsigned int tmi = libxsmm_product_limit(um, internal_gemm_mlimit, 0); /* LIBXSMM_INIT! */ + for (; vwidth <= tmi; tmi = libxsmm_product_limit(um, tmi - 1, 0)) { + const double si = (double)(LIBXSMM_CONFIG_MAX_MNK) / ((double)tmi * tmi * tmi), s = (s2 <= si ? 1 : (s2 / si)); + unsigned int tni = libxsmm_product_limit(un, LIBXSMM_MAX((unsigned int)(tmi * (s * internal_gemm_nstretch)), 1), 0); + unsigned int tki = libxsmm_product_limit(uk, LIBXSMM_MAX((unsigned int)(tmi * (s * internal_gemm_kstretch)), 1), 0); + unsigned int ntmi, ntni, ntki, mti = 1, nti = 1, kti = 1; + LIBXSMM_ASSERT(tmi <= um && tni <= un && tki <= uk); + if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { + const unsigned int ttm = (unsigned int)libxsmm_product_limit(tmi, (unsigned int)ntasks, 0); + const unsigned int ttn = (unsigned int)libxsmm_product_limit(tni, (unsigned int)ntasks, 0); + tmi = tni = LIBXSMM_MIN(ttm, ttn); /* prefer threads over larger tile */ + } + if (EXIT_SUCCESS == libxsmm_gemm_plan_internal((unsigned int)ntasks, um, un, uk, tmi, tni, tki, + &ntmi, &ntni, &ntki, &mti, &nti, &kti)) + { + const int exit_plan = ((tmi < um && tni < un && tki < uk && (tm != tmi || tn != tni || tk != tki)) ? 
0 : 1); + const unsigned itasks = mti * nti * kti; + LIBXSMM_ASSERT(1 <= itasks); + if (max_ntasks < itasks) { + ntm = ntmi; ntn = ntni; ntk = ntki; + mt = mti; nt = nti; kt = kti; + tm = tmi; tn = tni; tk = tki; + max_ntasks = itasks; + } + if (itasks == (unsigned int)ntasks || 0 != exit_plan) break; + } + } + } + else { + const unsigned int tmi = atoi(env_tm); + const double s2 = (double)internal_gemm_nstretch * internal_gemm_kstretch; /* LIBXSMM_INIT! */ + double si, s; + tm = libxsmm_product_limit(um, LIBXSMM_MIN(tmi, internal_gemm_mlimit), 0); /* LIBXSMM_INIT! */ + si = (double)(LIBXSMM_CONFIG_MAX_MNK) / ((double)tm * tm * tm); s = (s2 <= si ? 1 : (s2 / si)); + tn = libxsmm_product_limit(un, LIBXSMM_MAX((unsigned int)(tm * (s * internal_gemm_nstretch)), 1), 0); + tk = libxsmm_product_limit(uk, LIBXSMM_MAX((unsigned int)(tm * (s * internal_gemm_kstretch)), 1), 0); + if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { + const unsigned int ttm = (unsigned int)libxsmm_product_limit(tm, (unsigned int)ntasks, 0); + const unsigned int ttn = (unsigned int)libxsmm_product_limit(tn, (unsigned int)ntasks, 0); + tm = tn = LIBXSMM_MIN(ttm, ttn); /* prefer threads over larger tile */ + } + if (EXIT_SUCCESS == libxsmm_gemm_plan_internal((unsigned int)ntasks, um, un, uk, tm, tn, tk, + &ntm, &ntn, &ntk, &mt, &nt, &kt)) + { +#if defined(NDEBUG) + max_ntasks = 2; /* only need something unequal to zero to pass below condition */ +#else + max_ntasks = mt * nt * kt; +#endif + } + } + LIBXSMM_ASSERT(LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags) || tm == tn); + /* check for non-conforming GEMM parameters (error), and conforming GEMM parameters (fast-path, fallback) */ + if (0 == max_ntasks || 0 == tm || 0 == tn || 0 == tk || 0 != (um % tm) || 0 != (un % tn) || 0 != (uk % tk)) { + return NULL; + } + result.ptr->flags = flags; + if (LIBXSMM_GEMM_HANDLE_FLAG_AUTO == flags && 0 == LIBXSMM_SMM_AI(um, un, uk, + 0 
== (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0) ? 1 : 2/*RFO*/, result.ptr->otypesize)) + { + if (um == LIBXSMM_UP2POT(um) || un == LIBXSMM_UP2POT(un)) { /* power-of-two (POT) extent(s) */ + result.ptr->flags |= LIBXSMM_GEMM_HANDLE_FLAG_COPY_C; + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { + result.ptr->flags |= LIBXSMM_GEMM_HANDLE_FLAG_COPY_A; + } + } + } + result.ptr->itypesize = libxsmm_typesize((libxsmm_datatype)iprec); + result.ptr->ldc = (unsigned int)(NULL != ldc ? *ldc : *m); + ulda = (NULL != lda ? ((unsigned int)(*lda)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & result.ptr->gemm_flags) ? ((unsigned int)(*m)) : uk)); + uldb = (NULL != ldb ? ((unsigned int)(*ldb)) : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & result.ptr->gemm_flags) ? uk : un)); + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & result.ptr->gemm_flags)) { /* NN, NT, or TN */ + const libxsmm_blasint itm = (libxsmm_blasint)tm, itk = (libxsmm_blasint)tk; +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + const libxsmm_blasint itn = (libxsmm_blasint)tn; +#endif + kldc = (libxsmm_blasint)result.ptr->ldc; + klda = (libxsmm_blasint)ulda; + kldb = (libxsmm_blasint)uldb; + if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & result.ptr->gemm_flags)) { /* TN */ +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + result.ptr->copy_a.function = libxsmm_dispatch_meltw_unary(itk, itm, &klda, &itm, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); +#endif + klda = itm; + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & result.ptr->flags)) { +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + result.ptr->copy_a.function = libxsmm_dispatch_meltw_unary(itm, itk, &klda, &itm, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); +#endif + klda = (libxsmm_blasint)tm; + } + if (0 != 
(LIBXSMM_GEMM_FLAG_TRANS_B & result.ptr->gemm_flags)) { /* NT */ +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + result.ptr->copy_b.function = libxsmm_dispatch_meltw_unary(itn, itk, &kldb, &itk, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); +#endif + kldb = itk; + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & result.ptr->flags)) { +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + result.ptr->copy_b.function = libxsmm_dispatch_meltw_unary(itk, itn, &kldb, &itk, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); +#endif + kldb = (libxsmm_blasint)tk; + } + if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & result.ptr->flags)) { +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + result.ptr->copy_o.function = libxsmm_dispatch_meltw_unary(itm, itn, &itm, &kldc, + (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); + if (0 == (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ + result.ptr->copy_i.function = libxsmm_dispatch_meltw_unary(itm, itn, &kldc, &itm, + (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); + } +#endif + kldc = (libxsmm_blasint)tm; + } + result.ptr->lda = ulda; result.ptr->ldb = uldb; + result.ptr->km = tm; result.ptr->kn = tn; + result.ptr->mt = mt; result.ptr->nt = nt; + result.ptr->m = um; result.ptr->n = un; + result.ptr->dm = LIBXSMM_UPDIV(ntm, mt) * tm; + result.ptr->dn = LIBXSMM_UPDIV(ntn, nt) * tn; + km = tm; kn = tn; + } + else { /* TT */ + const unsigned int tt = tm; + const libxsmm_blasint itt = (libxsmm_blasint)tt; +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + const libxsmm_blasint ildc = (libxsmm_blasint)result.ptr->ldc; + result.ptr->copy_o.function = 
libxsmm_dispatch_meltw_unary(itt, itt, &itt, &ildc, + (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + if (0 == (result.ptr->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ + result.ptr->copy_i.function = libxsmm_dispatch_meltw_unary(itt, itt, &ildc, &itt, + (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, (libxsmm_datatype)oprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT); + } +#endif + klda = (libxsmm_blasint)uldb; + kldb = (libxsmm_blasint)ulda; + kldc = itt; + LIBXSMM_ASSERT(tt == tn); + if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & result.ptr->flags)) { +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + const libxsmm_blasint itk = (libxsmm_blasint)tk; + result.ptr->copy_a.function = libxsmm_dispatch_meltw_unary(itt, itk, &kldb, &itk, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); +#endif + klda = itt; + } + if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & result.ptr->flags)) { +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + const libxsmm_blasint itn = (libxsmm_blasint)tn, itk = (libxsmm_blasint)tk; + result.ptr->copy_b.function = libxsmm_dispatch_meltw_unary(itk, itn, &klda, &itk, + (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, (libxsmm_datatype)iprec, + LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_IDENTITY); +#endif + kldb = (libxsmm_blasint)tk; + } + result.ptr->lda = uldb; result.ptr->ldb = ulda; + result.ptr->km = tn; result.ptr->kn = tm; + result.ptr->mt = nt; result.ptr->nt = mt; + result.ptr->m = un; result.ptr->n = um; + result.ptr->dm = LIBXSMM_UPDIV(ntn, nt) * tn; + result.ptr->dn = LIBXSMM_UPDIV(ntm, mt) * tm; + km = kn = tt; + } + result.ptr->dk = ntk / kt * tk; + result.ptr->kk = tk; + result.ptr->kt = kt; + result.ptr->k = uk; + desc = libxsmm_gemm_descriptor_init2( /* remove transpose 
flags from kernel request */ + &desc_blob, iprec, oprec, km, kn, result.ptr->kk, klda, kldb, kldc, + alpha, beta, result.ptr->gemm_flags & ~LIBXSMM_GEMM_FLAG_TRANS_AB, internal_gemm_tiled_prefetch); + result.ptr->kernel[0] = libxsmm_xmmdispatch(desc); + if (NULL != result.ptr->kernel[0].xmm) { + if (0 == (desc->flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* beta!=0 */ + result.ptr->kernel[1] = result.ptr->kernel[0]; + } + else { /* generate kernel with beta=1 */ + desc->flags &= ~LIBXSMM_GEMM_FLAG_BETA_0; + result.ptr->kernel[1] = libxsmm_xmmdispatch(desc); + if (NULL == result.ptr->kernel[1].xmm) result.ptr = NULL; + } + } + else result.ptr = NULL; + } + else { + result.ptr = NULL; + } + return result.ptr; +} + + +LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_a(const libxsmm_gemm_handle* handle) +{ + size_t result; + if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_A) + && (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) || + (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags) == 0))) + { + result = 0; + } + else { + const size_t size = (size_t)handle->km * handle->kk * handle->itypesize; + result = LIBXSMM_UP2(size, LIBXSMM_CACHELINE); + } + return result; +} + + +LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_b(const libxsmm_gemm_handle* handle) +{ + size_t result; + if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_B) + && (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) || + (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags) == 0))) + { + result = 0; + } + else { + const size_t size = (size_t)handle->kk * handle->kn * handle->itypesize; + result = LIBXSMM_UP2(size, LIBXSMM_CACHELINE); + } + return result; +} + + +LIBXSMM_API_INLINE size_t libxsmm_gemm_handle_get_scratch_size_c(const libxsmm_gemm_handle* handle) +{ + size_t result; + if (NULL == handle || (0 == (handle->flags & LIBXSMM_GEMM_HANDLE_FLAG_COPY_C) + && 
LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags))) + { + result = 0; + } + else { + const size_t size = (size_t)handle->km * handle->kn * handle->otypesize; + result = LIBXSMM_UP2(size, LIBXSMM_CACHELINE); + } + return result; +} + + +LIBXSMM_API size_t libxsmm_gemm_handle_get_scratch_size(const libxsmm_gemm_handle* handle) +{ + size_t result; + if (NULL != handle) { /* thread-local scratch buffer for GEMM */ + const size_t size_a = libxsmm_gemm_handle_get_scratch_size_a(handle); + const size_t size_b = libxsmm_gemm_handle_get_scratch_size_b(handle); + const size_t size_c = libxsmm_gemm_handle_get_scratch_size_c(handle); + result = (size_a + size_b + size_c) * handle->mt * handle->nt * handle->kt; + } + else { + result = 0; + } + return result; +} + + +LIBXSMM_API void libxsmm_gemm_task(const libxsmm_gemm_handle* handle, void* scratch, + const void* a, const void* b, void* c, /*unsigned*/int tid, /*unsigned*/int ntasks) +{ +#if !defined(NDEBUG) + if (NULL != handle && 0 <= tid && tid < ntasks) +#endif + { + const unsigned int utasks = (unsigned int)ntasks; + const unsigned int wksize = handle->mt * handle->nt * handle->kt; + const unsigned int spread = (wksize <= utasks ? (utasks / wksize) : 1); + const unsigned int utid = (unsigned int)tid, vtid = utid / spread; + if (utid < (spread * wksize) && 0 == (utid - vtid * spread)) { + const int excess = (utasks << 1) <= (vtid + wksize); + const unsigned int rtid = vtid / handle->mt, mtid = vtid - rtid * handle->mt, ntid = rtid % handle->nt, ktid = vtid / (handle->mt * handle->nt); + const unsigned int m0 = mtid * handle->dm, m1 = (0 == excess ? LIBXSMM_MIN(m0 + handle->dm, handle->m) : handle->m); + const unsigned int n0 = ntid * handle->dn, n1 = (0 == excess ? LIBXSMM_MIN(n0 + handle->dn, handle->n) : handle->n); + const unsigned int k0 = ktid * handle->dk, k1 = (0 == excess ? 
LIBXSMM_MIN(k0 + handle->dk, handle->k) : handle->k); + const unsigned int ldo = (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) ? handle->km : handle->kk); + /* calculate increments to simplify address calculations */ + const unsigned int dom = handle->km * handle->otypesize; + const unsigned int don = handle->kn * handle->otypesize; + const unsigned int dik = handle->kk * handle->itypesize; + const unsigned int on = handle->otypesize * n0; + /* calculate base address of thread-local storage */ + const size_t size_a = libxsmm_gemm_handle_get_scratch_size_a(handle); + const size_t size_b = libxsmm_gemm_handle_get_scratch_size_b(handle); + const size_t size_c = libxsmm_gemm_handle_get_scratch_size_c(handle); + char *const at = (char*)scratch + (size_a + size_b + size_c) * vtid; + char *const bt = at + size_a, *const ct = bt + size_b; + const libxsmm_xcopykernel kernel = { NULL }; + /* loop induction variables and other variables */ + unsigned int om = handle->otypesize * m0, im = m0, in = n0, ik = k0, im1, in1, ik1; + LIBXSMM_ASSERT_MSG(mtid < handle->mt && ntid < handle->nt && ktid < handle->kt, "Invalid task-ID"); + LIBXSMM_ASSERT_MSG(m1 <= handle->m && n1 <= handle->n && k1 <= handle->k, "Invalid task size"); + for (im1 = im + handle->km; (im1 - 1) < m1; im = im1, im1 += handle->km, om += dom) { + unsigned int dn = don, dka = dik, dkb = dik; + char *c0 = (char*)c, *ci; + const char *aa; + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { + if (0 != (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags)) { /* TN */ + aa = (const char*)a + ((size_t)im * handle->lda + k0) * handle->itypesize; + } + else if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags)) { /* NT */ + aa = (const char*)a + ((size_t)k0 * handle->lda + im) * handle->itypesize; + dka *= handle->lda; dkb *= handle->ldb; + } + else { /* NN */ + aa = (const char*)a + ((size_t)k0 * handle->lda + im) * handle->itypesize; + dka *= 
handle->lda; + } + c0 += (size_t)on * handle->ldc + om; + dn *= handle->ldc; + } + else { /* TT */ + aa = (const char*)b + ((size_t)k0 * handle->lda + im) * handle->itypesize; + c0 += (size_t)on + handle->ldc * (size_t)om; + dka *= handle->lda; + } + for (in = n0, in1 = in + handle->kn; (in1 - 1) < n1; in = in1, in1 += handle->kn, c0 += dn) { + const char *a0 = aa, *b0 = (const char*)b; + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { + if (0 != (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags)) { /* NT */ + b0 += ((size_t)k0 * handle->ldb + in) * handle->itypesize; + } + else { /* NN or TN */ + b0 += ((size_t)in * handle->ldb + k0) * handle->itypesize; + } + } + else { /* TT */ + b0 = (const char*)a + ((size_t)in * handle->ldb + k0) * handle->itypesize; + } +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + if (NULL == handle->copy_i.ptr) +#endif + { + ci = (NULL == handle->copy_o.ptr ? c0 : ct); + if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { + const unsigned int km = handle->kn, kn = handle->km; + libxsmm_otrans_internal(ct/*out*/, c0/*in*/, handle->otypesize, handle->ldc/*ldi*/, kn/*ldo*/, + 0, km, 0, kn, km/*tile*/, kn/*tile*/, kernel); + ci = ct; + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & handle->flags)) { + if (0 == (handle->gemm_flags & LIBXSMM_GEMM_FLAG_BETA_0)) { /* copy-in only if beta!=0 */ + libxsmm_matcopy_internal(ct/*out*/, c0/*in*/, handle->otypesize, handle->ldc/*ldi*/, handle->km/*ldo*/, + 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); + } + ci = ct; + } + } +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + else { /* MCOPY/TCOPY kernel */ + LIBXSMM_MCOPY_CALL(handle->copy_i, handle->otypesize, c0, &handle->ldc, ct, &handle->km); + ci = ct; + } +#endif + for (ik = k0, ik1 = ik + handle->kk; (ik1 - 1) < k1; ik = ik1, ik1 += handle->kk) { + const char *const a1 = a0 + dka, *const b1 = b0 + dkb, *ai = a0, *bi = b0; +#if defined(LIBXSMM_GEMM_XCOPY_JIT) 
+ if (NULL == handle->copy_a.ptr) +#endif + { + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) && + (LIBXSMM_GEMM_FLAG_TRANS_A & handle->gemm_flags) != 0) /* pure A-transpose */ + { + LIBXSMM_ASSERT(ldo == handle->km); + libxsmm_otrans_internal(at/*out*/, a0/*in*/, handle->itypesize, handle->lda/*ldi*/, ldo, + 0, handle->kk, 0, handle->km, handle->kk/*tile*/, handle->km/*tile*/, kernel); + ai = at; + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_A & handle->flags)) { + libxsmm_matcopy_internal(at/*out*/, a0/*in*/, handle->itypesize, handle->lda/*ldi*/, ldo, + 0, handle->km, 0, handle->kk, handle->km/*tile*/, handle->kk/*tile*/, kernel); + ai = at; + } + } +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + else { /* MCOPY/TCOPY kernel */ + LIBXSMM_MCOPY_CALL(handle->copy_a, handle->itypesize, a0, &handle->lda, at, &ldo); + ai = at; + } +#endif +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + if (NULL == handle->copy_b.ptr) +#endif + { + if (LIBXSMM_GEMM_FLAG_TRANS_AB != (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags) && + (LIBXSMM_GEMM_FLAG_TRANS_B & handle->gemm_flags) != 0) /* pure B-transpose */ + { + libxsmm_otrans_internal(bt/*out*/, b0/*in*/, handle->itypesize, handle->ldb/*ldi*/, handle->kk/*ldo*/, + 0, handle->kn, 0, handle->kk, handle->kn/*tile*/, handle->kk/*tile*/, kernel); + bi = bt; + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_B & handle->flags)) { + libxsmm_matcopy_internal(bt/*out*/, b0/*in*/, handle->itypesize, handle->ldb/*ldi*/, handle->kk/*ldo*/, + 0, handle->kk, 0, handle->kn, handle->kk/*tile*/, handle->kn/*tile*/, kernel); + bi = bt; + } + } +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + else { /* MCOPY/TCOPY kernel */ + LIBXSMM_MCOPY_CALL(handle->copy_b, handle->itypesize, b0, &handle->ldb, bt, &handle->kk); + bi = bt; + } +#endif + /* beta0-kernel on first-touch, beta1-kernel otherwise (beta0/beta1 are identical if beta=1) */ + LIBXSMM_MMCALL_PRF(handle->kernel[k0!=ik?1:0].xmm, ai, bi, ci, a1, b1, c0); + a0 = a1; + b0 = b1; 
+ } + /* TODO: synchronize */ +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + if (NULL == handle->copy_o.ptr) +#endif + { + if (LIBXSMM_GEMM_FLAG_TRANS_AB == (LIBXSMM_GEMM_FLAG_TRANS_AB & handle->gemm_flags)) { + libxsmm_otrans_internal(c0/*out*/, ct/*in*/, handle->otypesize, handle->km/*ldi*/, handle->ldc/*ldo*/, + 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); + } + else if (0 != (LIBXSMM_GEMM_HANDLE_FLAG_COPY_C & handle->flags)) { + libxsmm_matcopy_internal(c0/*out*/, ct/*in*/, handle->otypesize, handle->km/*ldi*/, handle->ldc/*ldo*/, + 0, handle->km, 0, handle->kn, handle->km/*tile*/, handle->kn/*tile*/, kernel); + } + } +#if defined(LIBXSMM_GEMM_XCOPY_JIT) + else { /* MCOPY/TCOPY kernel */ + LIBXSMM_MCOPY_CALL(handle->copy_o, handle->otypesize, ct, &handle->km, c0, &handle->ldc); + } +#endif + } + } + } + } +#if !defined(NDEBUG) + else if (/*implies LIBXSMM_INIT*/0 != libxsmm_get_verbosity()) { /* library code is expected to be mute */ + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM ERROR: libxsmm_gemm_task - invalid handle!\n"); + } + } +#endif +} + + +LIBXSMM_API void libxsmm_xgemm(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc) +{ + libxsmm_gemm_blob blob; + const libxsmm_gemm_handle *const handle = libxsmm_gemm_handle_init(&blob, iprec, oprec, transa, transb, + m, n, k, lda, ldb, ldc, alpha, beta, LIBXSMM_GEMM_HANDLE_FLAG_AUTO, 1/*ntasks*/); + const size_t scratch_size = libxsmm_gemm_handle_get_scratch_size(handle); + void* scratch = NULL; + if (NULL != handle && (0 == scratch_size || + NULL != (scratch = libxsmm_scratch_malloc(scratch_size, 
LIBXSMM_CACHELINE, LIBXSMM_MALLOC_INTERNAL_CALLER)))) + { + libxsmm_gemm_task(handle, scratch, a, b, c, 0/*tid*/, 1/*ntasks*/); + libxsmm_free(scratch); + } + else { /* fallback or error */ + static int error_once = 0; + if (NULL == handle) { /* fallback */ + if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: XGEMM fallback code path triggered!\n"); + } + } + else if (0 != libxsmm_verbosity && /* library code is expected to be mute */ + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: failed to allocate GEMM-scratch memory!\n"); + } + libxsmm_blas_xgemm(iprec, oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } +} + + +LIBXSMM_API void libxsmm_dgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const double alpha_array[], const double* a_array[], const libxsmm_blasint lda_array[], const double* b_array[], const libxsmm_blasint ldb_array[], + const double beta_array[], double* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + const libxsmm_blasint ngroups = LIBXSMM_ABS(*group_count), ptrsize = sizeof(void*); + libxsmm_blasint i, j = 0; + for (i = 0; i < ngroups; ++i) { + const libxsmm_blasint size = group_size[i]; + libxsmm_gemm_batch(LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64, transa_array + i, transb_array + i, + m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, c_array + j, ldc_array + i, + 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); + j += LIBXSMM_ABS(size); + } +} + + +LIBXSMM_API void 
libxsmm_sgemm_batch( + const char transa_array[], const char transb_array[], const libxsmm_blasint m_array[], const libxsmm_blasint n_array[], const libxsmm_blasint k_array[], + const float alpha_array[], const float* a_array[], const libxsmm_blasint lda_array[], const float* b_array[], const libxsmm_blasint ldb_array[], + const float beta_array[], float* c_array[], const libxsmm_blasint ldc_array[], const libxsmm_blasint* group_count, const libxsmm_blasint group_size[]) +{ + const libxsmm_blasint ngroups = LIBXSMM_ABS(*group_count), ptrsize = sizeof(void*); + libxsmm_blasint i, j = 0; + for (i = 0; i < ngroups; ++i) { + const libxsmm_blasint size = group_size[i]; + libxsmm_gemm_batch(LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32, transa_array + i, transb_array + i, + m_array[i], n_array[i], k_array[i], alpha_array + i, a_array + j, lda_array + i, b_array + j, ldb_array + i, beta_array + i, c_array + j, ldc_array + i, + 0/*index_base*/, 0/*index_stride*/, &ptrsize, &ptrsize, &ptrsize, size); + j += LIBXSMM_ABS(size); + } +} + + +LIBXSMM_API void libxsmm_dgemm(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_XGEMM(double, double, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_API void libxsmm_sgemm(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_XGEMM(float, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_API void libxsmm_wigemm(const char* transa, const char* transb, + const 
libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const int* alpha, const short* a, const libxsmm_blasint* lda, + const short* b, const libxsmm_blasint* ldb, + const int* beta, int* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_XGEMM(short, int, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_API void libxsmm_bsgemm(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, + const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_XGEMM(libxsmm_bfloat16, float, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +LIBXSMM_API int libxsmm_mmbatch_kernel(libxsmm_xmmfunction kernel, libxsmm_blasint index_base, + libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + const void* a, const void* b, void* c, libxsmm_blasint batchsize, /*unsigned*/int tid, /*unsigned*/int ntasks, + unsigned char itypesize, unsigned char otypesize, int flags) +{ + int result = EXIT_SUCCESS; + const libxsmm_blasint size = LIBXSMM_ABS(batchsize); + const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, ntasks); + const libxsmm_blasint begin = tid * tasksize, span = begin + tasksize; + const libxsmm_blasint end = LIBXSMM_MIN(span, size); + + LIBXSMM_ASSERT(NULL != a && NULL != b && NULL != c && NULL != kernel.xmm); + if (begin < end) { + const char *const a0 = (const char*)a, *const b0 = (const char*)b; + char *const c0 = (char*)c; + + LIBXSMM_ASSERT(0 < itypesize && 0 < otypesize); + if (0 == (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & flags)) { + if (0 != index_stride) { /* stride arrays contain indexes */ + libxsmm_blasint i = begin * index_stride, ic = (NULL != stride_c ? 
(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0); + const char* ai = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; + const char* bi = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; + char* ci = &c0[ic * otypesize]; + const libxsmm_blasint end1 = (end != size ? end : (end - 1)) * index_stride; +#if (0 != LIBXSMM_SYNC) + if (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize || 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) +#endif + { /* no locking */ + if (NULL != stride_a && NULL != stride_b && NULL != stride_c) { + const unsigned char ibits = (unsigned char)LIBXSMM_INTRINSICS_BITSCANBWD32(itypesize); + const unsigned char obits = (unsigned char)LIBXSMM_INTRINSICS_BITSCANBWD32(otypesize); + + if (itypesize == (1 << ibits) && otypesize == (1 << obits)) { + for (i += index_stride; i <= end1; i += index_stride) { + const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) << ibits]; + const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) << ibits]; + char *const cn = &c0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) << obits]; + kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ + ai = an; bi = bn; ci = cn; /* next */ + } + } + else { /* non-pot type sizes */ + for (i += index_stride; i <= end1; i += index_stride) { + const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize]; + const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize]; + char *const cn = &c0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize]; + kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ + ai = an; bi = bn; ci = cn; /* next */ + } + } + } + else { /* mixed specification of strides */ + for (i += index_stride; i <= end1; i += 
index_stride) { + const char *const an = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; + const char *const bn = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; + char *const cn = &c0[NULL != stride_c ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize) : 0]; + kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ + ai = an; bi = bn; ci = cn; /* next */ + } + } + if (end == size) { /* remainder multiplication */ + kernel.xmm(ai, bi, ci, ai, bi, ci); /* pseudo-prefetch */ + } + } +#if (0 != LIBXSMM_SYNC) + else { /* synchronize among C-indexes */ + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; +# if defined(LIBXSMM_GEMM_LOCKFWD) + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock0 = NULL; +# endif + LIBXSMM_ASSERT(NULL != lock); + if (NULL != stride_a && NULL != stride_b && NULL != stride_c) { + for (i += index_stride; i <= end1; i += index_stride) { + ic = LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base; + { + const char *const an = &a0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize]; + const char *const bn = &b0[(LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize]; + char *const cn = &c0[ic * otypesize]; + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock0) { lock0 = lock; LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } +# else + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); +# endif + kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } +# else + LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; 
+# endif + ai = an; bi = bn; ci = cn; /* next */ + } + } + } + else { + for (i += index_stride; i <= end1; i += index_stride) { + ic = (NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0); + { + const char *const an = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; + const char *const bn = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; + char *const cn = &c0[ic * otypesize]; + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKIDX(ic, internal_gemm_nlocks)].state; +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock0) { lock0 = lock; LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } +# else + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); +# endif + kernel.xmm(ai, bi, ci, an, bn, cn); /* with prefetch */ +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } +# else + LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; +# endif + ai = an; bi = bn; ci = cn; /* next */ + } + } + } + if (end == size) { /* remainder multiplication */ + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); + kernel.xmm(ai, bi, ci, ai, bi, ci); /* pseudo-prefetch */ + LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); + } + } +#endif /*(0 != LIBXSMM_SYNC)*/ + } + else { /* array of pointers to matrices (singular strides are measured in Bytes) */ + const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); + const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base * sizeof(void*)) : 0); + const libxsmm_blasint end1 = (end != size ? 
end : (end - 1)); + const char *ai = a0 + (size_t)da * begin, *bi = b0 + (size_t)db * begin; + char* ci = c0 + (size_t)dc * begin; + libxsmm_blasint i; +#if (0 != LIBXSMM_SYNC) + if (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize || 0 != (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) +#endif + { /* no locking */ + for (i = begin; i < end1; ++i) { + const char *const an = ai + da, *const bn = bi + db; + char *const cn = ci + dc; +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != *((const void**)ci)) +#endif + { + kernel.xmm( /* with prefetch */ + *((const void**)ai), *((const void**)bi), *((void**)ci), + *((const void**)an), *((const void**)bn), *((const void**)cn)); + } + ai = an; bi = bn; ci = cn; /* next */ + } + if ( /* remainder multiplication */ +#if defined(LIBXSMM_BATCH_CHECK) + NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != *((const void**)ci) && +#endif + end == size) + { + kernel.xmm( /* pseudo-prefetch */ + *((const void**)ai), *((const void**)bi), *((void**)ci), + *((const void**)ai), *((const void**)bi), *((const void**)ci)); + } + } +#if (0 != LIBXSMM_SYNC) + else { /* synchronize among C-indexes */ + void* cc = *((void**)ci); + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock = &internal_gemm_lock[LIBXSMM_GEMM_LOCKPTR(cc, internal_gemm_nlocks)].state; +# if defined(LIBXSMM_GEMM_LOCKFWD) + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK)* lock0 = NULL; +# endif + LIBXSMM_ASSERT(NULL != lock); + for (i = begin + 1; i <= end1; ++i) { + const char *const an = ai + da, *const bn = bi + db; + char *const cn = ci + dc; + void *const nc = *((void**)cn); +# if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != cc) +# endif + { + LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) *const lock1 = &internal_gemm_lock[LIBXSMM_GEMM_LOCKPTR(nc, internal_gemm_nlocks)].state; +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock0) { lock0 = lock; 
LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); } +# else + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); +# endif + kernel.xmm( /* with prefetch */ + *((const void**)ai), *((const void**)bi), cc, + *((const void**)an), *((const void**)bn), *((const void**)cn)); +# if defined(LIBXSMM_GEMM_LOCKFWD) + if (lock != lock1 || i == end1) { LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; } +# else + LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); lock = lock1; +# endif + } + ai = an; bi = bn; ci = cn; cc = nc; /* next */ + } + if ( /* remainder multiplication */ +# if defined(LIBXSMM_BATCH_CHECK) + NULL != *((const void**)ai) && NULL != *((const void**)bi) && NULL != cc && +# endif + end == size) + { + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_GEMM_LOCK, lock); + kernel.xmm( /* pseudo-prefetch */ + *((const void**)ai), *((const void**)bi), cc, + *((const void**)ai), *((const void**)bi), cc); + LIBXSMM_LOCK_RELEASE(LIBXSMM_GEMM_LOCK, lock); + } + } +#endif /*(0 != LIBXSMM_SYNC)*/ + } + } +#if defined(LIBXSMM_GEMM_BATCHREDUCE) + else /* LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS */ +# if defined(LIBXSMM_BATCH_CHECK) + if ( +# if (0 != LIBXSMM_SYNC) + (1 == ntasks || 0 == internal_gemm_nlocks || 0 > batchsize) && +# endif + (0 == (LIBXSMM_GEMM_FLAG_BETA_0 & flags)) && + (0 != internal_gemm_batchreduce)) +# endif + { + const unsigned int n = libxsmm_mmbatch_size * (LIBXSMM_GEMM_BATCHSCALE) / ((unsigned int)sizeof(void*)); + LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array && 0 != libxsmm_mmbatch_size); + if ((2U/*A and B matrices*/ * tasksize) <= n) { + const void **ai = (const void**)libxsmm_mmbatch_array + begin, **bi = ai + size; + unsigned long long count; + if (0 != index_stride) { /* stride arrays contain indexes */ + const size_t end_stride = (size_t)end * index_stride; + size_t i = (size_t)begin * index_stride; + char *ci = &c0[NULL != stride_c ? 
((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) * otypesize) : 0], *cn = ci; + do { + for (count = 0; i < end_stride && ci == cn; ++count) { + const size_t j = i + index_stride; + *ai++ = &a0[NULL != stride_a ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) * itypesize) : 0]; + *bi++ = &b0[NULL != stride_b ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) * itypesize) : 0]; + cn = &c0[NULL != stride_c ? ((LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, j) - index_base) * otypesize) : 0]; + i = j; + } + ai = (const void**)libxsmm_mmbatch_array + begin; bi = ai + size; + kernel.xbm(ai, bi, ci, &count); + ci = cn; + } while (i < end_stride); + } + else { /* array of pointers to matrices (singular strides are measured in Bytes) */ + const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); + const libxsmm_blasint dc = (NULL != stride_c ? 
(*stride_c - index_base * sizeof(void*)) : 0); + const char *ia = a0 + (size_t)da * begin, *ib = b0 + (size_t)db * begin; + char* ic = c0 + (size_t)dc * begin; + if ( +# if defined(LIBXSMM_BATCH_CHECK) + NULL != *((const void**)ia) && NULL != *((const void**)ib) && NULL != *((const void**)ic) && +# endif + sizeof(void*) == da && sizeof(void*) == db) /* fast path */ + { + if (0 != dc) { + libxsmm_blasint i = begin; + char* jc = ic; + do { + for (count = 0; i < end && *((const void**)ic) == *((const void**)jc); ++i) { +# if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const void**)jc)) +# endif + ++count; + jc += dc; /* next */ + } + memcpy((void*)ai, ia, count * sizeof(void*)); + memcpy((void*)bi, ib, count * sizeof(void*)); + kernel.xbm(ai, bi, *((void**)ic), &count); + ic = jc; + } while (i < end); + } + else { /* fastest path */ + count = (unsigned long long)end - begin; + memcpy((void*)ai, ia, count * sizeof(void*)); + memcpy((void*)bi, ib, count * sizeof(void*)); + kernel.xbm(ai, bi, *((void**)ic), &count); + } + } + else { /* custom-copy required */ + libxsmm_blasint i = begin; + char* jc = ic; + do { + for (count = 0; i < end && *((const void**)ic) == *((const void**)jc); ++i) { +# if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const void**)ia) && NULL != *((const void**)ib) && NULL != *((const void**)jc)) +# endif + { + *ai++ = *((const void**)ia); *bi++ = *((const void**)ib); + ++count; + } + ia += da; ib += db; jc += dc; /* next */ + } + ai = (const void**)libxsmm_mmbatch_array + begin; bi = ai + size; + kernel.xbm(ai, bi, *((void**)ic), &count); + ic = jc; + } while (i < end); + } + } + } + else { /* fallback */ + result = EXIT_FAILURE; + } + } +#endif /*defined(LIBXSMM_GEMM_BATCHREDUCE)*/ + } + /* coverity[missing_unlock] */ + return result; +} + + +LIBXSMM_API void libxsmm_gemm_internal_set_batchflag(libxsmm_gemm_descriptor* descriptor, void* c, libxsmm_blasint index_stride, + libxsmm_blasint batchsize, int multithreaded) +{ + LIBXSMM_ASSERT(NULL != 
descriptor); /* adjusts descriptor->flags (and prefetch) for batched execution: either the aligned-C/NTS hint or the batch-reduce flag is decided below */ + if (0 != (LIBXSMM_GEMM_FLAG_BETA_0 & descriptor->flags)) { /* BETA_0 flag is set: only the aligned-C/NTS hint is considered */ + const uintptr_t vw = (LIBXSMM_X86_AVX512 <= libxsmm_target_archid ? 64 : 32); /* 64 for AVX-512 targets, otherwise 32 */ + /* assume that all C-matrices are aligned eventually */ + if (0 == LIBXSMM_MOD2((uintptr_t)c, vw) +#if 0 /* should fallback in BE */ + && LIBXSMM_X86_AVX <= libxsmm_target_archid +#endif + && 0 != index_stride) /* only the base address c is inspected; requires a non-zero index_stride */ + { + const int oprec = LIBXSMM_GETENUM_OUT(descriptor->datatype); + const libxsmm_blasint typesize = LIBXSMM_TYPESIZE(oprec); + const libxsmm_blasint csize = (libxsmm_blasint)descriptor->ldc * descriptor->n * typesize; /* ldc * n * typesize of the output precision */ + /* finalize assumption if matrix-size is a multiple of the vector-width */ + descriptor->flags |= (unsigned short)(0 == LIBXSMM_MOD2(csize, vw) ? LIBXSMM_GEMM_FLAG_ALIGN_C_NTS_HINT : 0); + } + } +#if defined(LIBXSMM_GEMM_BATCHREDUCE) + else if (0 != internal_gemm_batchreduce) { /* check if reduce-batch kernel can be used */ + static int error_once = 0; /* shared by both diagnostics of this branch: a message is printed only when the increment yields 1 */ + LIBXSMM_ASSERT(NULL != libxsmm_mmbatch_array); +# if (0 != LIBXSMM_SYNC) + if (0 == multithreaded || 0 == internal_gemm_nlocks || 0 > batchsize) +# endif + { + int result = EXIT_FAILURE; /* becomes EXIT_SUCCESS only if input and output precision are both F64 or both F32 */ + switch (LIBXSMM_GETENUM_INP(descriptor->datatype)) { + case LIBXSMM_GEMM_PRECISION_F64: { + if (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(descriptor->datatype)) { + result = EXIT_SUCCESS; + } + } break; + case LIBXSMM_GEMM_PRECISION_F32: { + if (LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(descriptor->datatype)) { + result = EXIT_SUCCESS; + } + } break; + } + if (EXIT_SUCCESS == result) { /* supported precision: request the batch-reduce kernel */ + descriptor->flags |= LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS; + descriptor->prefetch = 0; /* omit decision */ + } + else { /* unsupported precision: flags are left unchanged, warn if verbosity permits */ + if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) && /* library code is expected to be mute */ + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: data type not supported in batch-reduce!\n"); + } + } + } +# if (0 != LIBXSMM_SYNC) + else if
((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) && /* library code is expected to be mute */ + 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM: potential data races prevent batch-reduce.\n"); + } +# endif + } +#endif /*defined(LIBXSMM_GEMM_BATCHREDUCE)*/ +#if !defined(LIBXSMM_GEMM_BATCHREDUCE) || (0 == LIBXSMM_SYNC) + LIBXSMM_UNUSED(batchsize); LIBXSMM_UNUSED(multithreaded); +#endif +} + + +/** BLAS-based batched DGEMM: issues one libxsmm_blas_dgemm call per batch entry; the number of entries is the absolute value of batchsize. If index_stride is non-zero, stride_a/b/c are arrays of indexes (relative to index_base) into a/b/c which are walked in steps of index_stride, and a NULL stride array selects offset zero for every entry. If index_stride is zero, a/b/c are arrays of pointers to matrices and the singular strides are measured in Bytes. */ +LIBXSMM_API_INTERN void libxsmm_dmmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const double* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const double* beta, void* c, const libxsmm_blasint* ldc, + libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize) +{ +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != a && NULL != b && NULL != c) +#endif + { + const libxsmm_blasint end = LIBXSMM_ABS(batchsize); + libxsmm_blasint i; + if (0 != index_stride) { /* stride arrays contain indexes */ + const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base) : 0); + const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base) : 0); + const libxsmm_blasint end1 = end * index_stride; + const double *const a0 = (const double*)a, *const b0 = (const double*)b, *ai = a0 + da, *bi = b0 + db; + double *const c0 = (double*)c, *ci = c0 + dc; + for (i = index_stride; i <= end1; i += index_stride) { + libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, ai, lda, bi, ldb, beta, ci, ldc); + if (i < end1) { /* fetch the next entry only if one exists: the first entry was consumed via da/db/dc, hence offset end1 would be one entry past the batch */ + ai = &a0[NULL != stride_a ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) : 0]; + bi = &b0[NULL != stride_b ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) : 0];
 + ci = &c0[NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0]; + } + } + } + else { /* array of pointers to matrices (singular strides are measured in Bytes) */ + const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); + const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base * sizeof(void*)) : 0); + const char *const a0 = (const char*)a, *const b0 = (const char*)b, *ai = a0, *bi = b0; + char *const c0 = (char*)c, *ci = c0; + for (i = 0; i < end; ++i) { + const char *const an = ai + da, *const bn = bi + db; + char *const cn = ci + dc; +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const double**)ai) && NULL != *((const double**)bi) && NULL != *((const double**)ci)) +#endif + { + libxsmm_blas_dgemm(transa, transb, &m, &n, &k, alpha, *((const double**)ai), lda, *((const double**)bi), ldb, beta, *((double**)ci), ldc); + } + ai = an; bi = bn; ci = cn; /* next */ + } + } + } +} + + +LIBXSMM_API_INTERN void libxsmm_smmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const float* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const float* beta, void* c, const libxsmm_blasint* ldc, + libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize) +{ +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != a && NULL != b && NULL != c) +#endif + { + const libxsmm_blasint end = LIBXSMM_ABS(batchsize); + libxsmm_blasint i; + if (0 != index_stride) { /* stride arrays contain indexes */ + const libxsmm_blasint da = (NULL != stride_a ?
(*stride_a - index_base) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base) : 0); + const libxsmm_blasint dc = (NULL != stride_c ? (*stride_c - index_base) : 0); + const libxsmm_blasint end1 = end * index_stride; + const float *a0 = (const float*)a, *b0 = (const float*)b, *ai = a0 + da, *bi = b0 + db; + float *c0 = (float*)c, *ci = c0 + dc; + /* one SGEMM per batch entry; ai/bi/ci start at the first entry (da/db/dc) and are advanced by looking up the next index */ + for (i = index_stride; i <= end1; i += index_stride) { + libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, ai, lda, bi, ldb, beta, ci, ldc); + if (i < end1) { /* fetch the next entry only if one exists: the first entry was consumed via da/db/dc, hence offset end1 would be one entry past the batch */ + ai = &a0[NULL != stride_a ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_a, i) - index_base) : 0]; + bi = &b0[NULL != stride_b ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_b, i) - index_base) : 0]; + ci = &c0[NULL != stride_c ? (LIBXSMM_ACCESS(const libxsmm_blasint, stride_c, i) - index_base) : 0]; + } + } + } + else { /* array of pointers to matrices (singular strides are measured in Bytes) */ + const libxsmm_blasint da = (NULL != stride_a ? (*stride_a - index_base * sizeof(void*)) : 0); + const libxsmm_blasint db = (NULL != stride_b ? (*stride_b - index_base * sizeof(void*)) : 0); + const libxsmm_blasint dc = (NULL != stride_c ?
(*stride_c - index_base * sizeof(void*)) : 0); + const char *a0 = (const char*)a, *b0 = (const char*)b, *ai = a0, *bi = b0; + char *c0 = (char*)c, *ci = c0; + for (i = 0; i < end; ++i) { + const char *const an = ai + da; + const char *const bn = bi + db; + char *const cn = ci + dc; +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != *((const float**)ai) && NULL != *((const float**)bi) && NULL != *((const float**)ci)) +#endif + { + libxsmm_blas_sgemm(transa, transb, &m, &n, &k, alpha, *((const float**)ai), lda, *((const float**)bi), ldb, beta, *((float**)ci), ldc); + } + ai = an; bi = bn; ci = cn; /* next */ + } + } + } +} + + +LIBXSMM_API int libxsmm_mmbatch_blas( + libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc, + libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize) +{ + int result; + if (NULL != a && NULL != b && NULL != c) { + switch (LIBXSMM_GETENUM(iprec, oprec)) { + case LIBXSMM_GEMM_PRECISION_F64: { + libxsmm_dmmbatch_blas(transa, transb, m, n, k, + (const double*)alpha, a, lda, b, ldb, (const double*)beta, c, ldc, + index_base, index_stride, stride_a, stride_b, stride_c, batchsize); + result = EXIT_SUCCESS; + } break; + case LIBXSMM_GEMM_PRECISION_F32: { + libxsmm_smmbatch_blas(transa, transb, m, n, k, + (const float*)alpha, a, lda, b, ldb, (const float*)beta, c, ldc, + index_base, index_stride, stride_a, stride_b, stride_c, batchsize); + result = EXIT_SUCCESS; + } break; + default: result = EXIT_FAILURE; + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API void libxsmm_mmbatch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision 
oprec, + const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize, /*unsigned*/int tid, /*unsigned*/int ntasks) +{ + static int error_once = 0; +#if defined(LIBXSMM_BATCH_CHECK) + if (NULL != a && NULL != b && NULL != c && 0 <= tid && tid < ntasks) +#endif + { + const unsigned char otypesize = libxsmm_typesize((libxsmm_datatype)oprec); + int result = EXIT_FAILURE; + LIBXSMM_INIT + if (LIBXSMM_SMM_AI(m, n, k, 2/*RFO*/, otypesize)) { /* check if an SMM is suitable */ + const int gemm_flags = LIBXSMM_GEMM_PFLAGS(transa, transb, LIBXSMM_FLAGS); + libxsmm_descriptor_blob blob; + libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_init2(&blob, iprec, oprec, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? 
*ldc : m, alpha, beta, gemm_flags, libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO)); + if (NULL != desc) { + libxsmm_xmmfunction kernel; + libxsmm_gemm_internal_set_batchflag(desc, c, index_stride, batchsize, 0/*multi-threaded*/); + kernel = libxsmm_xmmdispatch(desc); + if (NULL != kernel.xmm) { + result = libxsmm_mmbatch_kernel(kernel, index_base, index_stride, + stride_a, stride_b, stride_c, a, b, c, batchsize, tid, ntasks, + libxsmm_typesize((libxsmm_datatype)iprec), otypesize, desc->flags); + } + } + } + if (EXIT_SUCCESS != result) { /* quiet fallback */ + if (EXIT_SUCCESS == libxsmm_mmbatch_blas(iprec, oprec, + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + index_base, index_stride, stride_a, stride_b, stride_c, batchsize)) + { + if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { + const size_t threshold = LIBXSMM_MNK_SIZE(m, n, m); + static size_t threshold_max = 0; + if (threshold_max < threshold) { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "LIBXSMM WARNING: "); + libxsmm_gemm_print2(stderr, iprec, oprec, transa, transb, &m, &n, &k, + alpha, NULL/*a*/, lda, NULL/*b*/, ldb, beta, NULL/*c*/, ldc); + fprintf(stderr, " => batched GEMM was falling back to BLAS!\n"); + LIBXSMM_STDIO_RELEASE(); + threshold_max = threshold; + } + } + } + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: libxsmm_mmbatch failed!\n"); + } + } + } +#if defined(LIBXSMM_BATCH_CHECK) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: incorrect arguments (libxsmm_mmbatch)!\n"); + } +#endif +} + + +LIBXSMM_API void libxsmm_gemm_batch(libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, + const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, 
libxsmm_blasint k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, libxsmm_blasint index_base, libxsmm_blasint index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + libxsmm_blasint batchsize) +{ + libxsmm_mmbatch(iprec, oprec, transa, transb, m, n, k, + alpha,a, lda, b, ldb, beta, c, ldc, index_base, index_stride, + stride_a, stride_b, stride_c, batchsize, 0/*tid*/, 1/*ntasks*/); +} + + +#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dgemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const double*, const double*, const libxsmm_blasint*, + const double*, const libxsmm_blasint*, + const double*, double*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_dgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + libxsmm_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_sgemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const float*, const float*, const libxsmm_blasint*, + const float*, const libxsmm_blasint*, + const float*, float*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_sgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, 
const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + libxsmm_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_wigemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const int*, const short*, const libxsmm_blasint*, + const short*, const libxsmm_blasint*, + const int*, int*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_wigemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const int* alpha, const short* a, const libxsmm_blasint* lda, + const short* b, const libxsmm_blasint* ldb, + const int* beta, int* c, const libxsmm_blasint* ldc) +{ + libxsmm_wigemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_bsgemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const float*, const libxsmm_bfloat16*, const libxsmm_blasint*, + const libxsmm_bfloat16*, const libxsmm_blasint*, + const float*, float*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_bsgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const libxsmm_bfloat16* a, const libxsmm_blasint* lda, + const libxsmm_bfloat16* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + libxsmm_bsgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_xgemm)(const libxsmm_gemm_precision*, const 
libxsmm_gemm_precision*, + const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const float*, const float*, const libxsmm_blasint*, + const float*, const libxsmm_blasint*, + const float*, float*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_xgemm)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + LIBXSMM_ASSERT(NULL != iprec && NULL != oprec); + libxsmm_blas_xgemm(*iprec, *oprec, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_dgemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const double*, const double*, const libxsmm_blasint*, + const double*, const libxsmm_blasint*, + const double*, double*, const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_dgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const double* alpha, const double* a, const libxsmm_blasint* lda, + const double* b, const libxsmm_blasint* ldb, + const double* beta, double* c, const libxsmm_blasint* ldc) +{ + libxsmm_blas_dgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_sgemm)(const char*, const char*, + const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const float*, const float*, const libxsmm_blasint*, + const float*, const libxsmm_blasint*, + const float*, float*, const 
libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_blas_sgemm)(const char* transa, const char* transb, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const float* alpha, const float* a, const libxsmm_blasint* lda, + const float* b, const libxsmm_blasint* ldb, + const float* beta, float* c, const libxsmm_blasint* ldc) +{ + libxsmm_blas_sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_mmbatch)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, + const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const void*, const void*, const libxsmm_blasint*, const void*, const libxsmm_blasint*, + const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], + const libxsmm_blasint*, const /*unsigned*/int*, const /*unsigned*/int*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_mmbatch)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + const libxsmm_blasint* batchsize, const /*unsigned*/int* tid, const /*unsigned*/int* ntasks) +{ + LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); + LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); + LIBXSMM_ASSERT(NULL != tid && NULL != ntasks); + libxsmm_mmbatch(*iprec, 
*oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, + *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize, *tid, *ntasks); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_gemm_batch)(const libxsmm_gemm_precision*, const libxsmm_gemm_precision*, + const char*, const char*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const void*, const void*, const libxsmm_blasint*, const void*, const libxsmm_blasint*, + const void*, void*, const libxsmm_blasint*, const libxsmm_blasint*, const libxsmm_blasint*, + const libxsmm_blasint[], const libxsmm_blasint[], const libxsmm_blasint[], + const libxsmm_blasint*); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_gemm_batch)(const libxsmm_gemm_precision* iprec, const libxsmm_gemm_precision* oprec, + const char* transa, const char* transb, const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, + const void* beta, void* c, const libxsmm_blasint* ldc, const libxsmm_blasint* index_base, const libxsmm_blasint* index_stride, + const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[], + const libxsmm_blasint* batchsize) +{ + LIBXSMM_ASSERT(NULL != iprec && NULL != oprec && NULL != m && NULL != n && NULL != k); + LIBXSMM_ASSERT(NULL != index_base && NULL != index_stride && NULL != batchsize); + libxsmm_gemm_batch(*iprec, *oprec, transa, transb, *m, *n, *k, alpha, a, lda, b, ldb, beta, c, ldc, + *index_base, *index_stride, stride_a, stride_b, stride_c, *batchsize); +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_gemm.h b/third_party/libxsmm/src/libxsmm_gemm.h new file mode 100644 index 00000000..8d9db076 --- /dev/null +++ 
b/third_party/libxsmm/src/libxsmm_gemm.h
@@ -0,0 +1,219 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Hans Pabst (Intel Corp.)
+******************************************************************************/
+#ifndef LIBXSMM_GEMM_H
+#define LIBXSMM_GEMM_H
+
+#include "libxsmm_main.h"
+
+#if !defined(LIBXSMM_BLAS_WRAP_DYNAMIC) && defined(LIBXSMM_INTERCEPT_DYNAMIC) && (!defined(__BLAS) || (0 != __BLAS))
+# define LIBXSMM_BLAS_WRAP_DYNAMIC /* enables the dlsym-based LIBXSMM_BLAS_WRAPPER_DYNAMIC defined further below */
+#endif
+
+#if !defined(LIBXSMM_GEMM_LOCK) /* each default below can be overridden by defining the macro beforehand */
+# define LIBXSMM_GEMM_LOCK LIBXSMM_LOCK_DEFAULT
+#endif
+#if !defined(LIBXSMM_GEMM_MMBATCH_SCALE)
+# define LIBXSMM_GEMM_MMBATCH_SCALE 1.5
+#endif
+#if !defined(LIBXSMM_GEMM_MMBATCH_VERBOSITY)
+# define LIBXSMM_GEMM_MMBATCH_VERBOSITY ((LIBXSMM_VERBOSITY_HIGH) + 1)
+#endif
+#if !defined(LIBXSMM_GEMM_NPARGROUPS)
+# define LIBXSMM_GEMM_NPARGROUPS 128
+#endif
+
+#if !defined(LIBXSMM_WRAP) && defined(LIBXSMM_BUILD) && \
+  (defined(LIBXSMM_CONFIG_WRAP) && 0 != (LIBXSMM_CONFIG_WRAP)) && \
+  (defined(LIBXSMM_BLAS_WRAP_DYNAMIC) || !defined(NDEBUG) || defined(_WIN32)) /* debug */
+# define LIBXSMM_WRAP LIBXSMM_CONFIG_WRAP
+#endif
+
+/** Undefine (disarm) MKL's DIRECT_CALL macros. */
+#if (defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL))
+# if defined(sgemm_)
+#   undef sgemm_
+# endif
+# if defined(dgemm_)
+#   undef dgemm_
+# endif
+#endif
+
+#if !defined(LIBXSMM_BLAS_ERROR) /* prints the "must be linked against LAPACK/BLAS" message only for the call that raises *PCOUNTER to exactly 1 */
+#define LIBXSMM_BLAS_ERROR(SYMBOL, PCOUNTER) do { \
+    if (1 == LIBXSMM_ATOMIC_ADD_FETCH(PCOUNTER, 1, LIBXSMM_ATOMIC_RELAXED)) { \
+      fprintf(stderr, "LIBXSMM ERROR: application must be linked against LAPACK/BLAS %s!\n", SYMBOL); \
+    } \
+  } while(0)
+#endif
+
+#if defined(LIBXSMM_BUILD) /* library build: an unresolved ORIGINAL is bound to the __real_<symbol> function, for CONDITION 0 as well as 1 */
+# define LIBXSMM_BLAS_WRAPPER_STATIC1(TYPE, KIND, ORIGINAL) if (NULL == (ORIGINAL)) { \
+    ORIGINAL = LIBXSMM_FSYMBOL(LIBXSMM_CONCATENATE(__real_, LIBXSMM_TPREFIX(TYPE, KIND))); \
+  }
+# define LIBXSMM_BLAS_WRAPPER_STATIC0 LIBXSMM_BLAS_WRAPPER_STATIC1
+#else /* otherwise: CONDITION 1 binds ORIGINAL to the BLAS symbol itself, CONDITION 0 expands to nothing */
+# define LIBXSMM_BLAS_WRAPPER_STATIC1(TYPE, KIND, ORIGINAL) if (NULL == (ORIGINAL)) { \
+    ORIGINAL = (LIBXSMM_BLAS_FNTYPE(TYPE, KIND))LIBXSMM_BLAS_SYMBOL(TYPE, KIND); \
+  }
+# define LIBXSMM_BLAS_WRAPPER_STATIC0(TYPE, KIND, ORIGINAL)
+#endif
+#define LIBXSMM_BLAS_WRAPPER_STATIC(CONDITION, TYPE, KIND, ORIGINAL) \
+  LIBXSMM_CONCATENATE(LIBXSMM_BLAS_WRAPPER_STATIC, CONDITION)(TYPE, KIND, ORIGINAL)
+
+#if defined(LIBXSMM_BLAS_WRAP_DYNAMIC) /* dynamic lookup: consults NEXT and "libxsmm_original_<symbol>" first, then resolves the plain BLAS symbol via dlsym(LIBXSMM_RTLD_NEXT, ...); ORIGINAL is only assigned inside the final if-branch and becomes NULL when dlerror() reports a failure */
+# define LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT) { \
+    union { const void* pfin; \
+      LIBXSMM_BLAS_FNTYPE(TYPE, KIND) (*chain)(void); /* chain */ \
+      LIBXSMM_BLAS_FNTYPE(TYPE, KIND) pfout; \
+    } libxsmm_blas_wrapper_dynamic_ /*= { 0 }*/; \
+    dlerror(); /* clear an eventual error status */ \
+    libxsmm_blas_wrapper_dynamic_.chain = NEXT; \
+    libxsmm_blas_wrapper_dynamic_.pfin = ((NULL == libxsmm_blas_wrapper_dynamic_.pfin) ? \
+      dlsym(LIBXSMM_RTLD_NEXT, "libxsmm_original_" LIBXSMM_STRINGIFY(LIBXSMM_TPREFIX(TYPE, KIND))) : NULL); \
+    if (NULL == libxsmm_blas_wrapper_dynamic_.pfout || NULL != dlerror() || NULL == libxsmm_blas_wrapper_dynamic_.chain()) { \
+      libxsmm_blas_wrapper_dynamic_.pfin = dlsym(LIBXSMM_RTLD_NEXT, LIBXSMM_STRINGIFY(LIBXSMM_BLAS_SYMBOL(TYPE, KIND))); \
+      /*LIBXSMM_ATOMIC_STORE(&(ORIGINAL), libxsmm_blas_wrapper_dynamic_.pfout, LIBXSMM_ATOMIC_RELAXED);*/ \
+      ORIGINAL = (NULL == dlerror() ? libxsmm_blas_wrapper_dynamic_.pfout : NULL); \
+    } \
+  }
+#else /* no dynamic wrapping: expands to nothing */
+# define LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT)
+#endif
+
+#define LIBXSMM_BLAS_WRAPPER(CONDITION, TYPE, KIND, ORIGINAL, NEXT) if (NULL == (ORIGINAL)) { \
+  LIBXSMM_BLAS_WRAPPER_DYNAMIC(TYPE, KIND, ORIGINAL, NEXT); \
+  LIBXSMM_BLAS_WRAPPER_STATIC(CONDITION, TYPE, KIND, ORIGINAL); \
+}
+
+/* LIBXSMM_BLAS_WRAPPER (above) acts only while ORIGINAL is NULL: dynamic lookup first, then the static fallback (which re-checks ORIGINAL). */
+/** Provides GEMM functions available via BLAS; NOT thread-safe. */
+LIBXSMM_API_INTERN void libxsmm_gemm_init(int archid);
+
+/** Finalizes the GEMM facility; NOT thread-safe.
*/
+LIBXSMM_API_INTERN void libxsmm_gemm_finalize(void);
+
+LIBXSMM_API_INTERN int libxsmm_gemm_prefetch2uid(libxsmm_gemm_prefetch_type prefetch); /* NOTE(review): by name, the inverse of libxsmm_gemm_uid2prefetch; definitions are not in this header */
+LIBXSMM_API_INTERN libxsmm_gemm_prefetch_type libxsmm_gemm_uid2prefetch(int uid);
+
+#if defined(LIBXSMM_BUILD) /* prototypes of the __wrap_/__real_ BLAS symbol pairs; NOTE(review): presumably for linker-level symbol wrapping, confirm against the build system */
+#if defined(LIBXSMM_BUILD_EXT)
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm));
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm));
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_dgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv));
+LIBXSMM_APIEXT void LIBXSMM_FSYMBOL(__wrap_sgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv));
+LIBXSMM_APIEXT void __wrap_dgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
+LIBXSMM_APIEXT void __wrap_sgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
+#endif
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemm_batch)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm));
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemm)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm));
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_dgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemv));
+LIBXSMM_API void LIBXSMM_FSYMBOL(__real_sgemv)(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemv));
+LIBXSMM_API void __real_dgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, double, gemm_batch));
+LIBXSMM_API void __real_sgemm_batch(LIBXSMM_BLAS_SYMBOL_SIGNATURE(const*, *, float, gemm_batch));
+#endif
+
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemm_batch);
+LIBXSMM_BLAS_SYMBOL_CDECL(LIBXSMM_BLAS_CONST*, *, double, gemm_batch);
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemm_batch);
+LIBXSMM_BLAS_SYMBOL_CDECL(LIBXSMM_BLAS_CONST*, *, float, gemm_batch);
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemm);
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemm);
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, double, gemv);
+LIBXSMM_BLAS_SYMBOL_FDECL(LIBXSMM_BLAS_CONST*, *, float, gemv);
+
+LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_gemm_handle { /* bundles copy kernels, two GEMM kernels, and the problem/tile/task geometry of one GEMM */
+  libxsmm_xcopykernel copy_a, copy_b, copy_i, copy_o;
+  libxsmm_xmmfunction kernel[2];
+  unsigned int m, n, k, lda, ldb, ldc;
+  /* kernel size (tile) */
+  unsigned int km, kn, kk;
+  /* tile size per task */
+  unsigned int dm, dn, dk;
+  unsigned int itypesize, otypesize;
+  /* number of tasks per direction */
+  unsigned int mt, nt, kt;
+  int gemm_flags, flags;
+};
+
+LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_mmbatch_item { /* one batch slot: overlays operand pointers (value) with a descriptor/count/symbol record (stat) */
+  struct {
+    const void *a, *b;
+    void *c;
+  } value;
+  struct {
+    libxsmm_gemm_descriptor desc;
+    unsigned int count;
+    const char* symbol;
+  } stat;
+  /* TODO: consider padding */
+} libxsmm_mmbatch_item;
+
+LIBXSMM_API void libxsmm_gemm_internal_set_batchflag(libxsmm_gemm_descriptor* descriptor, void* c, libxsmm_blasint index_stride,
+  libxsmm_blasint batchsize, int multithreaded);
+
+LIBXSMM_API int libxsmm_mmbatch_kernel(libxsmm_xmmfunction kernel, libxsmm_blasint index_base,
+  libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
+  const void* a, const void* b, void* c, libxsmm_blasint batchsize, /*unsigned*/int tid, /*unsigned*/int ntasks,
+  unsigned char itypesize, unsigned char otypesize, int flags);
+
+LIBXSMM_API int libxsmm_mmbatch_blas(
+  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  const void* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const void* beta, void* c, const libxsmm_blasint* ldc,
+  libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
+  libxsmm_blasint batchsize);
+
+LIBXSMM_API_INTERN void libxsmm_dmmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  const double* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const double* beta, void* c, const libxsmm_blasint* ldc,
+  libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
+  libxsmm_blasint batchsize);
+
+LIBXSMM_API_INTERN void libxsmm_smmbatch_blas(const char* transa, const char* transb, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  const float* alpha, const void* a, const libxsmm_blasint* lda, const void* b, const libxsmm_blasint* ldb, const float* beta, void* c, const libxsmm_blasint* ldc,
+  libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride_a[], const libxsmm_blasint stride_b[], const libxsmm_blasint stride_c[],
+  libxsmm_blasint batchsize);
+
+LIBXSMM_EXTERN_C typedef void (*libxsmm_mmbatch_flush_function)(void); /* callback type: no arguments, no result */
+
+/** auto-batch descriptor (filter). */
+LIBXSMM_APIVAR_PUBLIC(libxsmm_gemm_descriptor libxsmm_mmbatch_desc);
+/** Records a batch of SMMs or is used for batch-reduce. */
+LIBXSMM_APIVAR_PUBLIC(void* libxsmm_mmbatch_array);
+/** Lock: libxsmm_mmbatch_begin, libxsmm_mmbatch_end, internal_mmbatch_flush. */
+LIBXSMM_APIVAR_PUBLIC(LIBXSMM_LOCK_TYPE(LIBXSMM_GEMM_LOCK) libxsmm_mmbatch_lock);
+/** Maximum size of the recorded batch. */
+LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mmbatch_size);
+/** Maximum number of parallelized batch-groups. */
+LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_gemm_npargroups);
+/** Minimum batchsize per thread/task. */
+LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_gemm_taskgrain);
+/** Determines if OpenMP tasks are used. */
+LIBXSMM_APIVAR_PUBLIC(int libxsmm_gemm_tasks);
+/**
+ * Intercepted GEMM
+ * - [>=1 and odd]: sequential and non-tiled (small problem sizes only)
+ * - [>=2 and even]: parallelized and tiled (all problem sizes)
+ * - [>=3 and odd]: GEMV is intercepted; small problem sizes
+ * - [>=4 and even]: GEMV is intercepted; all problem sizes
+ * - [0]: disabled
+ */
+LIBXSMM_APIVAR_PUBLIC(int libxsmm_gemm_wrap);
+
+/** Determines the default prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */
+LIBXSMM_APIVAR_PRIVATE(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch_default);
+/** Determines the prefetch strategy, which is used in case of LIBXSMM_PREFETCH_AUTO. */
+LIBXSMM_APIVAR_PRIVATE(libxsmm_gemm_prefetch_type libxsmm_gemm_auto_prefetch);
+
+#endif /*LIBXSMM_GEMM_H*/
+
diff --git a/third_party/libxsmm/src/libxsmm_generator.c b/third_party/libxsmm/src/libxsmm_generator.c
new file mode 100644
index 00000000..4d76d8ee
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_generator.c
@@ -0,0 +1,530 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Hans Pabst (Intel Corp.)
+******************************************************************************/
+#include "libxsmm_main.h"
+
+#if !defined(LIBXSMM_PRODUCT_LIMIT) /* bounds the dynamic-programming table used by internal_product_limit in this file */
+# define LIBXSMM_PRODUCT_LIMIT 1024
+#endif
+
+
+LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state0[16]);
+LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state1[16]);
+LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state2[16]);
+LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_intrinsics_mm512_rng_state3[16]);
+
+/* definition of corresponding variables */
+LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_ninit);
+LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_target_archid);
+LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_verbosity);
+LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_se);
+
+/** Builds a double-precision GEMM descriptor in the caller-provided blob and returns it; returns NULL without any message if LIBXSMM_GEMM_NO_BYPASS or LIBXSMM_GEMM_NO_BYPASS_DIMS rejects the arguments. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_dgemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  double alpha, double beta, int flags, int prefetch)
+{
+  union { /* views the blob's storage as a descriptor */
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR(*result.ptr, LIBXSMM_GEMM_PRECISION(double),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* quiet error (unsupported) */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Single-precision counterpart of libxsmm_dgemm_descriptor_init (same checks, NULL if unsupported). */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_sgemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  float alpha, float beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR(*result.ptr, LIBXSMM_GEMM_PRECISION(float),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Descriptor with input precision "short" and output precision "int"; NULL if unsupported. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_wigemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  int alpha, int beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(short), LIBXSMM_GEMM_PRECISION(int),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Descriptor with input precision libxsmm_bfloat16 and output precision float; NULL if unsupported. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bsgemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  float alpha, float beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16), LIBXSMM_GEMM_PRECISION(float),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Descriptor with libxsmm_bfloat16 as both input and output precision; NULL if unsupported. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bgemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  float alpha, float beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16), LIBXSMM_GEMM_PRECISION(libxsmm_bfloat16),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Descriptor with input precision "char" and output precision "int"; NULL if unsupported. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bigemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  int alpha, int beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(char), LIBXSMM_GEMM_PRECISION(int),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Descriptor with "char" as both input and output precision; NULL if unsupported. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_bbgemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc,
+  int alpha, int beta, int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, LIBXSMM_GEMM_PRECISION(char), LIBXSMM_GEMM_PRECISION(char),
+      flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* unsupported */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Same as libxsmm_gemm_descriptor_dinit2 with the one precision used for both input and output. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit(libxsmm_descriptor_blob* blob,
+  libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta,
+  int flags, int prefetch)
+{
+  return libxsmm_gemm_descriptor_dinit2(blob, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch);
+}
+
+/** Builds a descriptor for explicit input/output precisions with alpha/beta given as double; the precision pair itself is not validated here; NULL if the bypass checks reject the arguments. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_dinit2(libxsmm_descriptor_blob* blob,
+  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, double alpha, double beta,
+  int flags, int prefetch)
+{
+  union {
+    libxsmm_gemm_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  if (LIBXSMM_GEMM_NO_BYPASS(flags, alpha, beta)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc)
+    && LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k))
+  {
+    result.blob = blob;
+    /* Note: iprec/oprec combination is not checked to omit type-switch (invalid combination may result in BE-error) */
+    LIBXSMM_GEMM_DESCRIPTOR2(*result.ptr, iprec, oprec, flags, m, n, k, lda, ldb, ldc, alpha, beta, prefetch);
+  }
+  else { /* quiet error (unsupported) */
+    result.ptr = NULL;
+  }
+  return result.ptr;
+}
+
+/** Same as libxsmm_gemm_descriptor_init2 with the one precision used for both input and output; alpha/beta are passed as untyped pointers. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
+  int flags, int prefetch)
+{
+  return libxsmm_gemm_descriptor_init2(blob, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch);
+}
+
+
+LIBXSMM_API
libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init2(libxsmm_descriptor_blob* blob,
+  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
+  int flags, int prefetch)
+{ /* forwards to libxsmm_gemm_descriptor_init3 without requesting the converted alpha/beta values */
+  return libxsmm_gemm_descriptor_init3(blob, iprec, oprec, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch,
+    NULL/*dalpha*/, NULL/*dbeta*/);
+}
+
+/** Dispatches on iprec (and oprec where several outputs exist) to the typed initializers; alpha/beta are read through a pointer of the type chosen per case (double, float, or short) and default to LIBXSMM_ALPHA/LIBXSMM_BETA when NULL; if dalpha/dbeta are non-NULL they receive the values as double; returns NULL for unsupported combinations and then prints one error message per process when libxsmm_verbosity is non-zero. */
+LIBXSMM_API libxsmm_gemm_descriptor* libxsmm_gemm_descriptor_init3(libxsmm_descriptor_blob* blob,
+  libxsmm_gemm_precision iprec, libxsmm_gemm_precision oprec, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k,
+  libxsmm_blasint lda, libxsmm_blasint ldb, libxsmm_blasint ldc, const void* alpha, const void* beta,
+  int flags, int prefetch, double* dalpha, double* dbeta)
+{
+  /* avoid warning about potentially uninitialized variable (initialize outside of control flow) */
+  libxsmm_gemm_descriptor* result = NULL;
+  switch (iprec) {
+    case LIBXSMM_GEMM_PRECISION_F64: {
+      const double aa = (NULL != alpha ? *((const double*)alpha) : (LIBXSMM_ALPHA));
+      const double bb = (NULL != beta ? *((const double*)beta) : (LIBXSMM_BETA));
+      LIBXSMM_ASSERT(LIBXSMM_GEMM_PRECISION_F64 == oprec);
+      result = libxsmm_dgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+      if (NULL != dalpha) *dalpha = aa;
+      if (NULL != dbeta) *dbeta = bb;
+    } break;
+    case LIBXSMM_GEMM_PRECISION_F32: {
+      const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA));
+      const float bb = (NULL != beta ? *((const float*)beta) : (LIBXSMM_BETA));
+      LIBXSMM_ASSERT(LIBXSMM_GEMM_PRECISION_F32 == oprec);
+      result = libxsmm_sgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+      if (NULL != dalpha) *dalpha = (double)aa;
+      if (NULL != dbeta) *dbeta = (double)bb;
+    } break;
+    case LIBXSMM_GEMM_PRECISION_I16: {
+      /**
+       * Take alpha and beta as short data although wgemm works on integers.
+       * However, alpha and beta are only JIT-supported for certain values,
+       * and the call-side may not distinguish different input and output types
+       * (integer/short), hence it is safer to only read short data.
+       */
+      const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA));
+      const short bb = (short)(NULL != beta ? *((const short*)beta) : (LIBXSMM_BETA));
+      LIBXSMM_ASSERT(LIBXSMM_GEMM_PRECISION_I32 == oprec);
+      result = libxsmm_wigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+      if (NULL != dalpha) *dalpha = (double)aa;
+      if (NULL != dbeta) *dbeta = (double)bb;
+    } break;
+    case LIBXSMM_GEMM_PRECISION_I8: {
+      /**
+       * Take alpha and beta as short data although the integer GEMM works on integers.
+       * However, alpha and beta are only JIT-supported for certain values,
+       * and the call-side may not distinguish different input and output types
+       * (integer/short), hence it is safer to only read short data.
+       */
+      if (LIBXSMM_GEMM_PRECISION_I32 == oprec) {
+        const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA));
+        const short bb = (short)(NULL != beta ? *((const short*)beta) : (LIBXSMM_BETA));
+        result = libxsmm_bigemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+        if (NULL != dalpha) *dalpha = (double)aa;
+        if (NULL != dbeta) *dbeta = (double)bb;
+      }
+      else if (LIBXSMM_GEMM_PRECISION_I8 == oprec) {
+        const short aa = (short)(NULL != alpha ? *((const short*)alpha) : (LIBXSMM_ALPHA));
+        const short bb = (short)(NULL != beta ? *((const short*)beta) : (LIBXSMM_BETA));
+        result = libxsmm_bbgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+        if (NULL != dalpha) *dalpha = (double)aa;
+        if (NULL != dbeta) *dbeta = (double)bb;
+      }
+    } break; /* any other oprec leaves result NULL */
+    case LIBXSMM_GEMM_PRECISION_BF16: {
+      if (LIBXSMM_GEMM_PRECISION_F32 == oprec) {
+        const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA));
+        const float bb = (NULL != beta ? *((const float*)beta) : (LIBXSMM_BETA));
+        result = libxsmm_bsgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+        if (NULL != dalpha) *dalpha = (double)aa;
+        if (NULL != dbeta) *dbeta = (double)bb;
+      }
+      else if (LIBXSMM_GEMM_PRECISION_BF16 == oprec) {
+        const float aa = (NULL != alpha ? *((const float*)alpha) : (LIBXSMM_ALPHA));
+        const float bb = (NULL != beta ? *((const float*)beta) : (LIBXSMM_BETA));
+        result = libxsmm_bgemm_descriptor_init(blob, m, n, k, lda, ldb, ldc, aa, bb, flags, prefetch);
+        if (NULL != dalpha) *dalpha = (double)aa;
+        if (NULL != dbeta) *dbeta = (double)bb;
+      }
+    } break; /* any other oprec leaves result NULL */
+    default: /* result remains NULL */;
+  }
+  if (NULL == result) {
+    static int error_once = 0; /* the message is printed only by the call that raises the counter to 1 */
+    if (0 != libxsmm_verbosity /* library code is expected to be mute */
+      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+    {
+      fprintf(stderr, "LIBXSMM ERROR: GEMM precision is not supported!\n");
+    }
+  }
+  return result;
+}
+
+/** Clears the blob and fills it as an element-wise (meltw) descriptor with a single datatype pair; datatype2, ldi2, and ldi3 are set to zero. Always returns the blob viewed as a descriptor. */
+LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_datatype in_type, libxsmm_datatype out_type,
+  libxsmm_blasint m, libxsmm_blasint n,
+  libxsmm_blasint ldi, libxsmm_blasint ldo,
+  unsigned short flags, unsigned char param, unsigned char operation)
+{
+  union {
+    libxsmm_meltw_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  LIBXSMM_DESCRIPTOR_CLEAR(blob);
+  result.blob = blob;
+  result.ptr->datatype = (unsigned char)LIBXSMM_GETENUM(in_type, out_type);
+  result.ptr->datatype2 = 0;
+  result.ptr->flags = (unsigned short)flags;
+  result.ptr->operation = (unsigned char)operation;
+  result.ptr->param = (unsigned char)param;
+  result.ptr->ldi = ldi;
+  result.ptr->ldo = ldo;
+  result.ptr->ldi2 = 0;
+  result.ptr->ldi3 = 0;
+  result.ptr->m = m;
+  result.ptr->n = n;
+  return result.ptr;
+}
+
+/** Variant of libxsmm_meltw_descriptor_init that additionally stores a second datatype pair (in2_type/out2_type) and the leading dimensions ldi2/ldi3. */
+LIBXSMM_API libxsmm_meltw_descriptor* libxsmm_meltw_descriptor_init2(libxsmm_descriptor_blob* blob,
+  libxsmm_datatype in_type, libxsmm_datatype in2_type, libxsmm_datatype out_type, libxsmm_datatype out2_type,
+  libxsmm_blasint m, libxsmm_blasint n,
+  libxsmm_blasint ldi, libxsmm_blasint ldo, libxsmm_blasint ldi2, libxsmm_blasint ldi3,
+  unsigned short flags, unsigned char param, unsigned char operation)
+{
+  union {
+    libxsmm_meltw_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  LIBXSMM_DESCRIPTOR_CLEAR(blob);
+  result.blob = blob;
+  result.ptr->datatype = (unsigned char)LIBXSMM_GETENUM(in_type, out_type);
+  result.ptr->datatype2 = (unsigned char)LIBXSMM_GETENUM(in2_type, out2_type);
+  result.ptr->flags = (unsigned short)flags;
+  result.ptr->operation = (unsigned char)operation;
+  result.ptr->param = (unsigned char)param;
+  result.ptr->ldi = ldi;
+  result.ptr->ldo = ldo;
+  result.ptr->ldi2 = ldi2;
+  result.ptr->ldi3 = ldi3;
+  result.ptr->m = m;
+  result.ptr->n = n;
+  return result.ptr;
+}
+
+/** Clears the blob and fills it as a matrix-equation (meqn) descriptor; the input half of the datatype pair is recorded as LIBXSMM_DATATYPE_UNSUPPORTED. */
+LIBXSMM_API libxsmm_meqn_descriptor* libxsmm_meqn_descriptor_init(libxsmm_descriptor_blob* blob,
+  libxsmm_datatype out_type, libxsmm_blasint m, libxsmm_blasint n,
+  libxsmm_blasint ldo, unsigned int eqn_idx)
+{
+  union {
+    libxsmm_meqn_descriptor* ptr;
+    libxsmm_descriptor_blob* blob;
+  } result;
+  LIBXSMM_DESCRIPTOR_CLEAR(blob);
+  result.blob = blob;
+  result.ptr->datatype = (unsigned char)LIBXSMM_GETENUM( LIBXSMM_DATATYPE_UNSUPPORTED, out_type);
+  result.ptr->eqn_idx = eqn_idx;
+  result.ptr->ldo = ldo;
+  result.ptr->m = m;
+  result.ptr->n = n;
+  return result.ptr;
+}
+
+/** Greatest common divisor by the Euclidean algorithm; yields 1 (not 0) when both arguments are zero. */
+LIBXSMM_API size_t libxsmm_gcd(size_t a, size_t b)
+{
+  while (0 != b) {
+    const
size_t r = a % b;
+    a = b; b = r;
+  }
+  return 0 != a ? a : 1;
+}
+
+/** Least common multiple based on libxsmm_gcd; divides by the GCD before multiplying to keep the intermediate value small. */
+LIBXSMM_API size_t libxsmm_lcm(size_t a, size_t b)
+{
+  const size_t gcd = libxsmm_gcd(a, b);
+  return 0 != gcd ? ((a / gcd) * b) : 0;
+}
+
+/** Writes the prime factors of num (ascending, with multiplicity) into num_factors_n32 and returns their count; returns 0 and writes nothing if no factor could be split off (num is 0, 1, or prime). The caller provides the storage (callers in this file pass 32 entries). */
+LIBXSMM_API int libxsmm_primes_u32(unsigned int num, unsigned int num_factors_n32[])
+{
+  unsigned int c = num, i;
+  int n = 0;
+  if (0 < c && 0 == (c & 1)) { /* non-zero even */
+    unsigned int j = c / 2;
+    while (c == (2 * j)) {
+      num_factors_n32[n++] = 2;
+      c = j; j /= 2;
+    }
+  }
+  for (i = 3; i <= c; i += 2) {
+    unsigned int j = c / i;
+    while (c == (i * j)) {
+      num_factors_n32[n++] = i;
+      c = j; j /= i;
+    }
+    if ((i * i) > num) {
+      break;
+    }
+  }
+  if (1 < c && 0 != n) { /* remaining cofactor; skipped when nothing was split off (prime input) */
+    num_factors_n32[n++] = c;
+  }
+  return n;
+}
+
+/** Selects prime factors of "product" whose product is as large as possible without exceeding "limit": dynamic programming over the factors if the table fits LIBXSMM_PRODUCT_LIMIT, otherwise a greedy approximation. */
+LIBXSMM_API_INLINE unsigned int internal_product_limit(unsigned int product, unsigned int limit)
+{
+  unsigned int fact[32], maxp = limit, result = 1;
+  int i, n;
+  /* attempt to lower the memory requirement for DP; can miss best solution */
+  if (LIBXSMM_PRODUCT_LIMIT < limit) {
+    const unsigned int minfct = (limit + limit - 1) / LIBXSMM_PRODUCT_LIMIT;
+    const unsigned int maxfct = (unsigned int)libxsmm_gcd(product, limit);
+    result = maxfct;
+    if (minfct < maxfct) {
+      n = libxsmm_primes_u32(result, fact);
+      for (i = 0; i < n; ++i) {
+        if (minfct < fact[i]) {
+          result = fact[i];
+          break;
+        }
+      }
+    }
+    maxp /= result;
+  }
+  if (LIBXSMM_PRODUCT_LIMIT >= maxp) {
+    unsigned int k[2][LIBXSMM_PRODUCT_LIMIT + 1], *k0 = k[0], *k1 = k[1], *kt, p; /* +1: rows are indexed 0..maxp inclusive and maxp may equal LIBXSMM_PRODUCT_LIMIT (previously out-of-bounds) */
+    n = libxsmm_primes_u32(product / result, fact);
+    /* initialize table with trivial factor */
+    for (p = 0; p <= maxp; ++p) k[0][p] = 1;
+    k[0][0] = k[1][0] = 1;
+    for (i = 1; i <= n; ++i) { /* row k1 = best product per bound p using the first i factors; row k0 = previous row */
+      for (p = 1; p <= maxp; ++p) {
+        const unsigned int f = fact[i - 1], h = k0[p];
+        if (p < f) {
+          k1[p] = h;
+        }
+        else {
+          const unsigned int g = f * k0[p / f];
+          k1[p] = LIBXSMM_MAX(g, h);
+        }
+      }
+      kt = k0; k0 = k1; k1 = kt;
+    }
+    result *= k0[maxp];
+  }
+  else { /* trivial approximation */
+    n = libxsmm_primes_u32(product, fact);
+    for (i = 0; i < n; ++i) {
+      const unsigned int f = result * fact[i];
+      if (f <= limit) {
+        result = f;
+      }
+      else break;
+    }
+  }
+  return result;
+}
+
+/** Returns a product of prime factors of "product" bounded by "limit" (is_lower == 0); with is_lower != 0 and limit < product the result is at least "limit" instead (retry with 2*limit-1, then fall back to "product"). The result never exceeds "product". */
+LIBXSMM_API unsigned int libxsmm_product_limit(unsigned int product, unsigned int limit, int is_lower)
+{
+  unsigned int result;
+  if (1 < limit) { /* check for fast-path */
+    result = internal_product_limit(product, limit);
+  }
+  else {
+    result = limit;
+  }
+  if (0 != is_lower && limit < product) {
+    if (result < limit) {
+      result = internal_product_limit(product, 2 * limit - 1);
+    }
+    if (result < limit) {
+      result = product;
+    }
+    LIBXSMM_ASSERT(limit <= result);
+  }
+  if (product < result) {
+    result = product;
+  }
+  LIBXSMM_ASSERT(result <= product);
+  return result;
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_generator_gemm_driver.c b/third_party/libxsmm/src/libxsmm_generator_gemm_driver.c
new file mode 100644
index 00000000..d83c6df8
--- /dev/null
+++ b/third_party/libxsmm/src/libxsmm_generator_gemm_driver.c
@@ -0,0 +1,280 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke (Intel Corp.)
+******************************************************************************/
+#include /* NOTE(review): the header name is missing here (apparently lost in extraction); restore it from the upstream file before applying this patch */
+
+/* Prints the usage text for both modes (sparse*dense / dense*sparse, and dense*dense) to stdout; main() calls it before exiting with EXIT_FAILURE. */
+LIBXSMM_INLINE void print_help(void) {
+  printf("\nwrong usage -> exit!\n\n\n");
+  printf("Usage (sparse*dense=dense, dense*sparse=dense):\n");
+  printf(" sparse, sparse_csr, sparse_csr_reg\n");
+  printf(" filename to append\n");
+  printf(" routine name\n");
+  printf(" M\n");
+  printf(" N\n");
+  printf(" K\n");
+  printf(" LDA (if < 1 --> A sparse)\n");
+  printf(" LDB (if < 1 --> B sparse)\n");
+  printf(" LDC\n");
+  printf(" alpha: 1\n");
+  printf(" beta: 0 or 1\n");
+  printf(" 0: unaligned A, otherwise aligned (ignored for sparse)\n");
+  printf(" 0: unaligned C, otherwise aligned (ignored for sparse)\n");
+  printf(" ARCH: noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx\n");
+  printf(" PREFETCH: nopf (none), pfsigonly, other options fallback to pfsigonly\n");
+  printf(" PRECISION: SP, DP\n");
+  printf(" matrix input (CSC mtx file)\n");
+  printf("\n\n");
+  printf("Usage (dense*dense=dense):\n");
+  printf(" dense, dense_asm\n");
+  printf(" filename to append\n");
+  printf(" routine name\n");
+  printf(" M\n");
+  printf(" N\n");
+  printf(" K\n");
+  printf(" LDA\n");
+  printf(" LDB\n");
+  printf(" LDC\n");
+  printf(" alpha: -1 or 1\n");
+  printf(" beta: 0 or 1\n");
+  printf(" 0: unaligned A, otherwise aligned\n");
+  printf(" 0: unaligned C, otherwise aligned\n");
+  printf(" ARCH: noarch, wsm, snb, hsw, knl, knm, skx, clx, cpx\n");
+  printf(" PREFETCH: nopf (none), pfsigonly, BL2viaC, AL2, curAL2,\n"
+         " AL2_BL2viaC, curAL2_BL2viaC,\n");
+  printf(" PRECISION: I16, SP, DP\n");
+  printf("\n\n\n\n");
+}
+
+int main(int argc, char* argv []) {
+  const libxsmm_gemm_descriptor* l_xgemm_desc = 0;
+  int l_flags = LIBXSMM_GEMM_FLAGS('N', 'N');
+  libxsmm_gemm_prefetch_type l_prefetch;
+  libxsmm_descriptor_blob l_xgemm_blob;
+  char* l_type;
+  char* l_file_out;
+  char* l_matrix_file_in;
+  char* l_routine_name;
+  char* l_arch;
+  char* l_precision;
+  int l_m = 0;
+  int l_n = 0;
+  int l_k = 0;
+ int l_lda = 0; + int l_ldb = 0; + int l_ldc = 0; + int l_aligned_a = 0; + int l_aligned_c = 0; + double l_alpha = 0; + double l_beta = 0; + int l_single_precision = 0; + int l_is_csr = 0; + + /* check argument count for a valid range */ + if (argc != 17 && argc != 18) { + print_help(); + return EXIT_FAILURE; + } + + /* names of files and routines */ + l_type = argv[1]; + l_file_out = argv[2]; + l_routine_name = argv[3]; + + /* xgemm sizes */ + l_m = atoi(argv[4]); + l_n = atoi(argv[5]); + l_k = atoi(argv[6]); + l_lda = atoi(argv[7]); + l_ldb = atoi(argv[8]); + l_ldc = atoi(argv[9]); + + /* condense < 1 to 0 for lda and ldb */ + if ( l_lda < 1 ) + l_lda = 0; + if ( l_ldb < 1 ) + l_ldb = 0; + + /* some sugar */ + l_alpha = atof(argv[10]); + l_beta = atof(argv[11]); + l_aligned_a = atoi(argv[12]); + l_aligned_c = atoi(argv[13]); + + l_flags |= (0 != l_aligned_a ? LIBXSMM_GEMM_FLAG_ALIGN_A : 0); + l_flags |= (0 != l_aligned_c ? LIBXSMM_GEMM_FLAG_ALIGN_C : 0); + + /* arch specific stuff */ + l_arch = argv[14]; + l_precision = argv[16]; + + /* some initial parameters checks */ + /* check for sparse / dense only */ + if ( (strcmp(l_type, "sparse") != 0) && + (strcmp(l_type, "sparse_csr") != 0) && + (strcmp(l_type, "sparse_csr_reg") != 0) && + (strcmp(l_type, "dense") != 0) && + (strcmp(l_type, "dense_asm") != 0) ) { + print_help(); + return EXIT_FAILURE; + } + + /* check for the right number of arguments depending on type */ + if ( ( (strcmp(l_type, "sparse") == 0) && (argc != 18) ) || + ( (strcmp(l_type, "sparse_csr") == 0) && (argc != 18) ) || + ( (strcmp(l_type, "sparse_csr_reg") == 0) && (argc != 18) ) || + ( (strcmp(l_type, "dense") == 0) && (argc != 17) ) || + ( (strcmp(l_type, "dense_asm") == 0) && (argc != 17) ) ) { + print_help(); + return EXIT_FAILURE; + } + + /* set value of prefetch flag */ + if (strcmp("nopf", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_NONE; + } + else if (strcmp("pfsigonly", argv[15]) == 0) { + l_prefetch = 
LIBXSMM_GEMM_PREFETCH_SIGONLY; + } + else if (strcmp("BL2viaC", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_BL2_VIA_C; + } + else if (strcmp("curAL2", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2_AHEAD; + } + else if (strcmp("curAL2_BL2viaC", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C_AHEAD; + } + else if (strcmp("AL2", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2; + } + else if (strcmp("AL2_BL2viaC", argv[15]) == 0) { + l_prefetch = LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C; + } + else { + print_help(); + return EXIT_FAILURE; + } + + /* check value of arch flag */ + if ( (strcmp(l_arch, "wsm") != 0) && + (strcmp(l_arch, "snb") != 0) && + (strcmp(l_arch, "hsw") != 0) && + (strcmp(l_arch, "knl") != 0) && + (strcmp(l_arch, "knm") != 0) && + (strcmp(l_arch, "skx") != 0) && + (strcmp(l_arch, "clx") != 0) && + (strcmp(l_arch, "cpx") != 0) && + (strcmp(l_arch, "noarch") != 0) ) { + print_help(); + return EXIT_FAILURE; + } + + /* check and evaluate precision flag */ + if ( strcmp(l_precision, "SP") == 0 ) { + l_single_precision = 1; + } else if ( strcmp(l_precision, "DP") == 0 ) { + l_single_precision = 0; + } else if ( strcmp(l_precision, "I16") == 0 ) { + l_single_precision = 2; + } else { + print_help(); + return EXIT_FAILURE; + } + + /* check alpha */ + if ((l_alpha < -1 || -1 < l_alpha) && (l_alpha < 1 || 1 < l_alpha)) { + print_help(); + return EXIT_FAILURE; + } + + /* check beta */ + if ((l_beta < 0 || 0 < l_beta) && (l_beta < 1 || 1 < l_beta)) { + print_help(); + return EXIT_FAILURE; + } + + switch (l_single_precision) { + case 0: { + l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_F64, + l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); + } break; + case 1: { + l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_F32, + l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); + } break; + case 2: { + 
l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION_I16, + l_m, l_n, l_k, l_lda, l_ldb, l_ldc, l_alpha, l_beta, l_flags, l_prefetch); + } break; + default: { + print_help(); + return EXIT_FAILURE; + } + } + + if (NULL == l_xgemm_desc) { + print_help(); + return EXIT_FAILURE; + } + + if ( strcmp(l_type, "sparse") == 0 || strcmp(l_type, "sparse_csr") == 0 || + strcmp(l_type, "sparse_csr_reg") == 0 ) { + /* read additional parameter for CSC/CSR description */ + l_matrix_file_in = argv[17]; + + /* some more restrictive checks are needed in case of sparse */ + if ( (l_alpha < 1) || (1 < l_alpha) ) { + print_help(); + return EXIT_FAILURE; + } + + if (l_lda < 1 && l_ldb < 1) { + print_help(); + return EXIT_FAILURE; + } + + if (l_ldc < 1) { + print_help(); + return EXIT_FAILURE; + } + + if ( l_single_precision > 1 ) { + print_help(); + return EXIT_FAILURE; + } + + if ( strcmp(l_type, "sparse_csr") == 0 ) { + l_is_csr = 1; + } + if ( strcmp(l_type, "sparse_csr_reg") == 0 ) { + l_is_csr = 3; + } + + libxsmm_generator_spgemm( l_file_out, l_routine_name, l_xgemm_desc, l_arch, l_matrix_file_in, l_is_csr ); + } + + if ( (strcmp(l_type, "dense") == 0) || + (strcmp(l_type, "dense_asm") == 0) ) { + if (l_lda < 1 || l_ldb < 1 || l_ldc < 1) { + print_help(); + return EXIT_FAILURE; + } + + if ( strcmp(l_type, "dense") == 0 ) { + libxsmm_generator_gemm_inlineasm( l_file_out, l_routine_name, l_xgemm_desc, l_arch ); + } else { + libxsmm_generator_gemm_directasm( l_file_out, l_routine_name, l_xgemm_desc, l_arch ); + } + } + + return EXIT_SUCCESS; +} + diff --git a/third_party/libxsmm/src/libxsmm_hash.c b/third_party/libxsmm/src/libxsmm_hash.c new file mode 100644 index 00000000..8f3289c7 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_hash.c @@ -0,0 +1,595 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include "libxsmm_hash.h" +#include "libxsmm_main.h" +/* Fallback value when the build does not define LIBXSMM_HASH_ALIGNMENT; the value selects which LIBXSMM_HASH variant is defined further down. */ +#if !defined(LIBXSMM_HASH_ALIGNMENT) +# define LIBXSMM_HASH_ALIGNMENT 8 +#endif +/* Byte consumers: each folds 8/4/2/1 bytes at BEGIN into SEED via FN and advances BEGIN; U64 loops while at least 8 bytes remain before END, the others run at most once. */ +#define LIBXSMM_HASH_U64(FN, SEED, BEGIN, END) { \ + const uint8_t *const end = (NULL != (END) ? ((END) - 7) : NULL); \ + for (; (BEGIN) < end; (BEGIN) += 8) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ + SEED = (uint32_t)FN(SEED, BEGIN); \ + } \ +} +#define LIBXSMM_HASH_U32(FN, SEED, BEGIN, END) { \ + const uint8_t *const next = (BEGIN) + 4; \ + if (next <= (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ + SEED = FN(SEED, BEGIN); BEGIN = next; \ + } \ +} +#define LIBXSMM_HASH_U16(FN, SEED, BEGIN, END) { \ + const uint8_t *const next = (BEGIN) + 2; \ + if (next <= (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ + SEED = FN(SEED, BEGIN); BEGIN = next; \ + } \ +} +#define LIBXSMM_HASH_U8(FN, SEED, BEGIN, END) { \ + if ((BEGIN) < (END)) { LIBXSMM_ASSERT(NULL != (BEGIN) || NULL == (END)); \ + SEED = FN(SEED, BEGIN); ++(BEGIN); \ + } \ +} +/* Wrappers around the _mm_crc32_* intrinsics; PVALUE is dereferenced with the operand width of the respective wrapper. */ +#define LIBXSMM_HASH_CRC32_U8(SEED, PVALUE) _mm_crc32_u8(SEED, *(const uint8_t*)(PVALUE)) +#define LIBXSMM_HASH_CRC32_U16(SEED, PVALUE) _mm_crc32_u16(SEED, *(const uint16_t*)(PVALUE)) +#define LIBXSMM_HASH_CRC32_U32(SEED, PVALUE) _mm_crc32_u32(SEED, *(const uint32_t*)(PVALUE)) +/* When LIBXSMM_BITS is below 64 (or with PGI), the 64-bit step is composed of two consecutive 32-bit steps. */ +#if (64 > (LIBXSMM_BITS)) || defined(__PGI) +# define LIBXSMM_HASH_CRC32_U64(SEED, PVALUE) \ + LIBXSMM_HASH_CRC32_U32(LIBXSMM_HASH_CRC32_U32((uint32_t)(SEED), PVALUE), (const uint32_t*)(PVALUE) + 1) +#else +# define LIBXSMM_HASH_CRC32_U64(SEED, PVALUE) _mm_crc32_u64(SEED, *(const uint64_t*)(PVALUE)) +#endif +/* Expands to a complete function body (it contains the return statement): hashes SIZE bytes at DATA in 8-, 4- and 2-byte steps plus at most one final byte, and returns the updated SEED. */ +#define
LIBXSMM_HASH_UNALIGNED(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ + const uint8_t *begin = (const uint8_t*)(DATA); \ + const uint8_t *const endb = begin + (SIZE); \ + LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ + LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ + LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ + return begin == endb ? (SEED) : FN8(SEED, begin); \ +} +/* LIBXSMM_HASH: same contract as LIBXSMM_HASH_UNALIGNED, but the bytes in front of enda are consumed first; the size test holds exactly when begin < enda <= endb (LIBXSMM_ALIGN is defined elsewhere, presumably rounding up; confirm). */ +#if defined(LIBXSMM_HASH_ALIGNMENT) && 8 < (LIBXSMM_HASH_ALIGNMENT) +# define LIBXSMM_HASH(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ + const uint8_t *begin = (const uint8_t*)(DATA); \ + const uint8_t *const endb = begin + (SIZE); \ + const uint8_t *const enda = LIBXSMM_ALIGN(begin, LIBXSMM_HASH_ALIGNMENT); \ + if ((SIZE) > (size_t)(endb - enda)) { \ + LIBXSMM_HASH_U64(FN64, SEED, begin, enda); \ + LIBXSMM_HASH_U32(FN32, SEED, begin, enda); \ + LIBXSMM_HASH_U16(FN16, SEED, begin, enda); \ + LIBXSMM_HASH_U8(FN8, SEED, begin, enda); \ + } \ + LIBXSMM_ASSUME_ALIGNED(begin, LIBXSMM_HASH_ALIGNMENT); \ + LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ + LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ + LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ + return begin == endb ? (SEED) : FN8(SEED, begin); \ + } +#elif defined(LIBXSMM_HASH_ALIGNMENT) && 1 < (LIBXSMM_HASH_ALIGNMENT) +# define LIBXSMM_HASH(FN64, FN32, FN16, FN8, SEED, DATA, SIZE) { \ + const uint8_t *begin = (const uint8_t*)(DATA); \ + const uint8_t *const endb = begin + (SIZE); \ + const uint8_t *const enda = LIBXSMM_ALIGN(begin, LIBXSMM_HASH_ALIGNMENT); \ + if ((SIZE) > (size_t)(endb - enda)) { \ + LIBXSMM_HASH_U32(FN32, SEED, begin, enda); \ + LIBXSMM_HASH_U16(FN16, SEED, begin, enda); \ + LIBXSMM_HASH_U8(FN8, SEED, begin, enda); \ + } \ + LIBXSMM_ASSUME_ALIGNED(begin, LIBXSMM_HASH_ALIGNMENT); \ + LIBXSMM_HASH_U64(FN64, SEED, begin, endb); \ + LIBXSMM_HASH_U32(FN32, SEED, begin, endb); \ + LIBXSMM_HASH_U16(FN16, SEED, begin, endb); \ + return begin == endb ?
(SEED) : FN8(SEED, begin); \ + } +#else +# define LIBXSMM_HASH LIBXSMM_HASH_UNALIGNED +#endif +/* Module state: a pointer to the 256-entry lookup tables and one function pointer per supported size; libxsmm_hash_init assigns all of them. */ +typedef uint32_t internal_crc32_entry_type[256]; +LIBXSMM_APIVAR_DEFINE(const internal_crc32_entry_type* internal_crc32_table); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u32_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u64_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u128_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u256_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u384_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_u512_function); +LIBXSMM_APIVAR_DEFINE(libxsmm_hash_function internal_hash_function); + +/* Table-driven update of seed with the single byte at value (uses table 0 only). */ +LIBXSMM_API_INLINE unsigned int internal_crc32_u8(unsigned int seed, const void* value) +{ + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8 && NULL != internal_crc32_table); + return internal_crc32_table[0][(seed^(*pu8)) & 0xFF] ^ (seed >> 8); +} + +/* Two successive byte-wise updates: value[0] first, then value[1]. */ +LIBXSMM_API_INLINE unsigned int internal_crc32_u16(unsigned int seed, const void* value) +{ + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u8(seed, pu8 + 0); + seed = internal_crc32_u8(seed, pu8 + 1); + return seed; +} + +/* Table-driven update of seed with the 32-bit word at value; the variadic tail is never read by the body. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u32(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u32(unsigned int seed, const void* value, ...)
+{ + const uint32_t *const pu32 = (const uint32_t*)value; + uint32_t c0, c1, c2, c3, s; + LIBXSMM_ASSERT(NULL != pu32 && NULL != internal_crc32_table); + s = seed ^ (*pu32); /* below: one lookup per byte of s, with a separate table for each byte position */ + c0 = internal_crc32_table[0][(s >> 24) & 0xFF]; + c1 = internal_crc32_table[1][(s >> 16) & 0xFF]; + c2 = internal_crc32_table[2][(s >> 8) & 0xFF]; + c3 = internal_crc32_table[3][(s & 0xFF)]; + return (c0 ^ c1) ^ (c2 ^ c3); +} + +/* 32-bit update via the CRC32 intrinsic; uses internal_crc32_u32 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u32_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u32_sse4(unsigned int seed, const void* value, ...) +{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + return LIBXSMM_HASH_CRC32_U32(seed, value); +#else + return internal_crc32_u32(seed, value); +#endif +} + +/* 64-bit update as two consecutive table-driven 32-bit updates (lower address first). */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u64(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u64(unsigned int seed, const void* value, ...) +{ + const uint32_t *const pu32 = (const uint32_t*)value; + LIBXSMM_ASSERT(NULL != pu32); + seed = internal_crc32_u32(seed, pu32 + 0); + seed = internal_crc32_u32(seed, pu32 + 1); + return seed; +} + +/* 64-bit update via LIBXSMM_HASH_CRC32_U64; uses internal_crc32_u64 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u64_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u64_sse4(unsigned int seed, const void* value, ...) +{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + return (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, value); +#else + return internal_crc32_u64(seed, value); +#endif +} + +/* 128-bit update as two consecutive table-driven 64-bit updates. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u128(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u128(unsigned int seed, const void* value, ...)
+{ + const uint64_t *const pu64 = (const uint64_t*)value; + LIBXSMM_ASSERT(NULL != pu64); + seed = internal_crc32_u64(seed, pu64 + 0); + seed = internal_crc32_u64(seed, pu64 + 1); + return seed; +} + +/* 128-bit update as two LIBXSMM_HASH_CRC32_U64 steps; uses internal_crc32_u128 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u128_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u128_sse4(unsigned int seed, const void* value, ...) +{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + const uint64_t *const pu64 = (const uint64_t*)value; + LIBXSMM_ASSERT(NULL != pu64); + seed = (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, pu64 + 0); + seed = (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, pu64 + 1); +#else + seed = internal_crc32_u128(seed, value); +#endif + return seed; +} + +/* 256-bit update as two table-driven 128-bit updates at byte offsets 0x00 and 0x10. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u256(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u256(unsigned int seed, const void* value, ...) +{ + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u128(seed, pu8 + 0x00); + seed = internal_crc32_u128(seed, pu8 + 0x10); + return seed; +} + +/* 256-bit update as two internal_crc32_u128_sse4 calls; uses internal_crc32_u256 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u256_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u256_sse4(unsigned int seed, const void* value, ...) +{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u128_sse4(seed, pu8 + 0x00); + seed = internal_crc32_u128_sse4(seed, pu8 + 0x10); + return seed; +#else + return internal_crc32_u256(seed, value); +#endif +} + +/* 384-bit update: a table-driven 256-bit update at offset 0x00 followed by a 128-bit update at offset 0x20. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u384(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u384(unsigned int seed, const void* value, ...)
+{ + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u256(seed, pu8 + 0x00); + seed = internal_crc32_u128(seed, pu8 + 0x20); + return seed; +} + +/* 384-bit update from the _sse4 256-bit and 128-bit routines; uses internal_crc32_u384 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u384_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u384_sse4(unsigned int seed, const void* value, ...) +{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u256_sse4(seed, pu8 + 0x00); + seed = internal_crc32_u128_sse4(seed, pu8 + 0x20); + return seed; +#else + return internal_crc32_u384(seed, value); +#endif +} + +/* 512-bit update as two table-driven 256-bit updates at byte offsets 0x00 and 0x20. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u512(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int internal_crc32_u512(unsigned int seed, const void* value, ...) +{ + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u256(seed, pu8 + 0x00); + seed = internal_crc32_u256(seed, pu8 + 0x20); + return seed; +} + +/* 512-bit update as two internal_crc32_u256_sse4 calls; uses internal_crc32_u512 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_u512_sse4(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_u512_sse4(unsigned int seed, const void* value, ...)
+{ +#if defined(LIBXSMM_INTRINSICS_SSE42) + const uint8_t *const pu8 = (const uint8_t*)value; + LIBXSMM_ASSERT(NULL != pu8); + seed = internal_crc32_u256_sse4(seed, pu8 + 0x00); + seed = internal_crc32_u256_sse4(seed, pu8 + 0x20); + return seed; +#else + return internal_crc32_u512(seed, value); +#endif +} + +/* Table-driven CRC32 over size bytes at data; the LIBXSMM_HASH expansion forms the rest of the body and contains the return statement. */ +LIBXSMM_API_INTERN unsigned int internal_crc32(unsigned int seed, const void* data, size_t size); +LIBXSMM_API_INTERN unsigned int internal_crc32(unsigned int seed, const void* data, size_t size) +{ + LIBXSMM_ASSERT(NULL != data || 0 == size); + LIBXSMM_HASH(internal_crc32_u64, internal_crc32_u32, internal_crc32_u16, internal_crc32_u8, seed, data, size); +} + +/* Intrinsic-based CRC32 over size bytes at data; uses internal_crc32 when LIBXSMM_INTRINSICS_SSE42 is not defined. */ +LIBXSMM_API_INTERN unsigned int internal_crc32_sse4(unsigned int seed, const void* data, size_t size); +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_SSE42) +unsigned int internal_crc32_sse4(unsigned int seed, const void* data, size_t size) +{ + LIBXSMM_ASSERT(NULL != data || 0 == size); +#if defined(LIBXSMM_INTRINSICS_SSE42) + LIBXSMM_HASH(LIBXSMM_HASH_CRC32_U64, LIBXSMM_HASH_CRC32_U32, LIBXSMM_HASH_CRC32_U16, LIBXSMM_HASH_CRC32_U8, seed, data, size); +#else + return internal_crc32(seed, data, size); +#endif +} + +/* Publishes the lookup tables and assigns the internal_hash_* pointers: the _sse4 routines when SSE4.2 is covered statically or reported by target_arch, the table-driven routines otherwise. */ +LIBXSMM_API_INTERN void libxsmm_hash_init(int target_arch) +{ + /* table-based implementation taken from http://dpdk.org/.
*/ + static const LIBXSMM_RETARGETABLE internal_crc32_entry_type crc32_table[] = { + { /*table0*/ + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 
0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 + }, + { /*table1*/ + 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945, + 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD, + 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4, + 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C, + 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47, + 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF, + 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6, + 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 
0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E, + 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41, + 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9, + 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0, + 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78, + 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43, + 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB, + 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2, + 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A, + 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC, + 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004, + 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D, + 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185, + 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE, + 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306, + 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F, + 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287, + 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8, + 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600, + 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439, + 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 
0xDC68C781, + 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA, + 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502, + 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B, + 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 + }, + { /*table2*/ + 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469, + 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC, + 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3, + 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726, + 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D, + 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8, + 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7, + 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32, + 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0, + 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75, + 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A, + 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF, + 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4, + 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161, + 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E, + 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB, + 
0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A, + 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF, + 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0, + 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065, + 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E, + 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB, + 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4, + 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71, + 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3, + 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36, + 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79, + 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC, + 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7, + 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622, + 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D, + 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 + }, + { /*table3*/ + 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA, + 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C, + 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7, + 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11, + 0xD725148B, 
0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41, + 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7, + 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C, + 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A, + 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D, + 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB, + 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610, + 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6, + 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6, + 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040, + 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B, + 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D, + 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5, + 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213, + 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8, + 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E, + 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E, + 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698, + 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443, + 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5, + 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 
0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12, + 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4, + 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F, + 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9, + 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99, + 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F, + 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4, + 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842 + } + }; + internal_crc32_table = crc32_table; /* the table-driven routines read the tables through this pointer */ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + LIBXSMM_UNUSED(target_arch); +#else + if (LIBXSMM_X86_SSE42 <= target_arch) +#endif + { /* SSE4.2 is covered by the static target or reported by target_arch: select the _sse4 routines */ + internal_hash_u32_function = internal_crc32_u32_sse4; + internal_hash_u64_function = internal_crc32_u64_sse4; + internal_hash_u128_function = internal_crc32_u128_sse4; + internal_hash_u256_function = internal_crc32_u256_sse4; + internal_hash_u384_function = internal_crc32_u384_sse4; + internal_hash_u512_function = internal_crc32_u512_sse4; + internal_hash_function = (libxsmm_hash_function)internal_crc32_sse4; + } +#if (LIBXSMM_X86_SSE42 > LIBXSMM_STATIC_TARGET_ARCH) + else { /* otherwise select the table-driven routines */ +# if defined(LIBXSMM_PLATFORM_X86) && !defined(LIBXSMM_INTRINSICS_SSE42) + static int error_once = 0; + if (0 == error_once && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM WARNING: unable to access CRC32 instructions due to the compiler used!\n"); + error_once = 1; /* no need for atomics */ + } +# endif + internal_hash_u32_function = internal_crc32_u32; + internal_hash_u64_function = internal_crc32_u64; + internal_hash_u128_function = internal_crc32_u128; + internal_hash_u256_function = internal_crc32_u256; +
internal_hash_u384_function = internal_crc32_u384; + internal_hash_u512_function = internal_crc32_u512; + internal_hash_function = (libxsmm_hash_function)internal_crc32; + } +#endif + LIBXSMM_ASSERT(NULL != internal_hash_u32_function); + LIBXSMM_ASSERT(NULL != internal_hash_u64_function); + LIBXSMM_ASSERT(NULL != internal_hash_u128_function); + LIBXSMM_ASSERT(NULL != internal_hash_u256_function); + LIBXSMM_ASSERT(NULL != internal_hash_u384_function); + LIBXSMM_ASSERT(NULL != internal_hash_u512_function); + LIBXSMM_ASSERT(NULL != internal_hash_function); +} + +/* Resets the table pointer and all function pointers to NULL unless NDEBUG is defined; does nothing otherwise. */ +LIBXSMM_API_INTERN void libxsmm_hash_finalize(void) +{ +#if !defined(NDEBUG) + internal_crc32_table = NULL; + internal_hash_u32_function = NULL; + internal_hash_u64_function = NULL; + internal_hash_u128_function = NULL; + internal_hash_u256_function = NULL; + internal_hash_u384_function = NULL; + internal_hash_u512_function = NULL; + internal_hash_function = NULL; +#endif +} + +/* Entry points below choose at compile time: SSE4.2 code when the static target covers it, table-driven code when LIBXSMM_MAX_STATIC_TARGET_ARCH is below SSE4.2, else a call through the pointer set by libxsmm_hash_init. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u32(unsigned int seed, const void* value, ...) +{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return LIBXSMM_HASH_CRC32_U32(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u32(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u32_function); + return internal_hash_u32_function(seed, value); +#endif +} + +/* CRC32 update of seed with the 64-bit value at value. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u64(unsigned int seed, const void* value, ...) +{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return (unsigned int)LIBXSMM_HASH_CRC32_U64(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u64(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u64_function); + return internal_hash_u64_function(seed, value); +#endif +} + +/* CRC32 update of seed with the 128-bit value at value. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u128(unsigned int seed, const void* value, ...)
+{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_crc32_u128_sse4(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u128(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u128_function); + return internal_hash_u128_function(seed, value); +#endif +} + +/* CRC32 update of seed with the 256-bit value at value. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u256(unsigned int seed, const void* value, ...) +{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_crc32_u256_sse4(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u256(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u256_function); + return internal_hash_u256_function(seed, value); +#endif +} + +/* CRC32 update of seed with the 384-bit value at value. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u384(unsigned int seed, const void* value, ...) +{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_crc32_u384_sse4(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u384(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u384_function); + return internal_hash_u384_function(seed, value); +#endif +} + +/* CRC32 update of seed with the 512-bit value at value. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u512(unsigned int seed, const void* value, ...)
+{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_crc32_u512_sse4(seed, value); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32_u512(seed, value); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_u512_function); + return internal_hash_u512_function(seed, value); +#endif +} + +/* CRC32 over size bytes at data, selected the same way: SSE4.2 code, table-driven code, or the pointer set by libxsmm_hash_init. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32(unsigned int seed, const void* data, size_t size) +{ +#if (LIBXSMM_X86_SSE42 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_crc32_sse4(seed, data, size); +#elif (LIBXSMM_X86_SSE42 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_crc32(seed, data, size); +#else /* pointer based function call */ + LIBXSMM_ASSERT(NULL != internal_hash_function); + return internal_hash_function(seed, data, size); +#endif +} + +diff --git a/third_party/libxsmm/src/libxsmm_hash.h b/third_party/libxsmm/src/libxsmm_hash.h new file mode 100644 index 00000000..c5df564e --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_hash.h @@ -0,0 +1,47 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_HASH_H +#define LIBXSMM_HASH_H + +#include +/* NOTE(review): the header name after #include is missing in this text (the angle-bracket part appears to have been stripped); restore it from the upstream file. */ +/* Map number of Bits to corresponding routine. */ +#define LIBXSMM_CRC32U(N) LIBXSMM_CONCATENATE(libxsmm_crc32_u, N) +/* Map number of Bytes to number of bits.
*/ +#define LIBXSMM_CRC32(N) LIBXSMM_CONCATENATE(libxsmm_crc32_b, N) +#define libxsmm_crc32_b4 libxsmm_crc32_u32 +#define libxsmm_crc32_b8 libxsmm_crc32_u64 +#define libxsmm_crc32_b16 libxsmm_crc32_u128 +#define libxsmm_crc32_b32 libxsmm_crc32_u256 +#define libxsmm_crc32_b48 libxsmm_crc32_u384 +#define libxsmm_crc32_b64 libxsmm_crc32_u512 + + +/** Function type representing the CRC32 functionality. */ +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE unsigned int (*libxsmm_hash_function)( + unsigned int /*seed*/, const void* /*data*/, ... /*size*/); + +/** Initialize hash function module; not thread-safe. */ +LIBXSMM_API_INTERN void libxsmm_hash_init(int target_arch); +LIBXSMM_API_INTERN void libxsmm_hash_finalize(void); + +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u32(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u64(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u128(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u256(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u384(unsigned int seed, const void* value, ...); +LIBXSMM_API_INTERN unsigned int libxsmm_crc32_u512(unsigned int seed, const void* value, ...); + +/** Calculate the CRC32 for a given quantity (size) of raw data according to the seed. */ +LIBXSMM_API_INTERN unsigned int libxsmm_crc32(unsigned int seed, const void* data, size_t size); + +#endif /*LIBXSMM_HASH_H*/ + diff --git a/third_party/libxsmm/src/libxsmm_main.c b/third_party/libxsmm/src/libxsmm_main.c new file mode 100644 index 00000000..4326fffd --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_main.c @@ -0,0 +1,4981 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#include "libxsmm_trace.h" +#include "libxsmm_xcopy.h" +#include "libxsmm_gemm.h" +#include "libxsmm_hash.h" +#include "libxsmm_diff.h" +#include "libxsmm_main.h" +#if defined(LIBXSMM_PERF) +# include "libxsmm_perf.h" +#endif +#include "generator_common.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if !defined(NDEBUG) +# include +#endif +#if defined(_WIN32) +# include +#else +# include +# include +# include +# include +# include +#endif +#if defined(__APPLE__) +# include +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(LIBXSMM_CODE_MAXSIZE) +# define LIBXSMM_CODE_MAXSIZE 131072 +#endif +#if !defined(LIBXSMM_DIFF_SIZE) +# define LIBXSMM_DIFF_SIZE LIBXSMM_DESCRIPTOR_SIGSIZE +#endif +#if !defined(LIBXSMM_HASH_SIZE) +/* can be smaller than MAXSIZE/SIGSIZE at the expense of collisions */ +# define LIBXSMM_HASH_SIZE 32 +#endif +#if !defined(LIBXSMM_HASH_SEED) +# define LIBXSMM_HASH_SEED 25071975 +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_ALIGN) && 1 +# define LIBXSMM_MALLOC_HOOK_ALIGN +#endif +#if !defined(LIBXSMM_ENABLE_DEREG) && 0 +# define LIBXSMM_ENABLE_DEREG +#endif +#if !defined(LIBXSMM_REGUSER_HASH) && 1 +# define LIBXSMM_REGUSER_HASH +#endif +#if !defined(LIBXSMM_REGLOCK_TRY) && 0 +# define LIBXSMM_REGLOCK_TRY +#endif +#if !defined(LIBXSMM_UNIFY_LOCKS) && 1 +# define LIBXSMM_UNIFY_LOCKS +#endif +#if !defined(LIBXSMM_REGKEY_PAD) && 0 +# define LIBXSMM_REGKEY_PAD +#endif +#if !defined(LIBXSMM_CACHE_PAD) && 1 +# define LIBXSMM_CACHE_PAD +#endif +#if !defined(LIBXSMM_AUTOPIN) && 0 +# define LIBXSMM_AUTOPIN +#endif +#if 
!defined(INTERNAL_DELIMS) +# define INTERNAL_DELIMS ";,:" +#endif + +#if !defined(_WIN32) && !defined(__CYGWIN__) +LIBXSMM_EXTERN int posix_memalign(void**, size_t, size_t) LIBXSMM_THROW; +#endif +#if defined(LIBXSMM_AUTOPIN) && !defined(_WIN32) +LIBXSMM_EXTERN int putenv(char*) LIBXSMM_THROW; +#endif + +/* flag fused into the memory address of a code version in case of non-JIT */ +#define LIBXSMM_CODE_STATIC (1ULL << (8 * sizeof(void*) - 1)) +/* flag fused into the memory address of a code version in case of collision */ +#if 1 /* beneficial when registry approaches capacity (collisions) */ +# define LIBXSMM_HASH_COLLISION (1ULL << (8 * sizeof(void*) - 2)) +#endif + +/** Helper macro determining the default prefetch strategy which is used for statically generated kernels. */ +#if (0 > LIBXSMM_PREFETCH) /* auto-prefetch (frontend) */ || (defined(_WIN32) || defined(__CYGWIN__)) +# define INTERNAL_PREFETCH LIBXSMM_GEMM_PREFETCH_NONE +#else +# define INTERNAL_PREFETCH ((libxsmm_gemm_prefetch_type)LIBXSMM_PREFETCH) +#endif + +#if (0 != LIBXSMM_SYNC) +# if !defined(INTERNAL_REGLOCK_MAXN) +# if defined(_MSC_VER) +# define INTERNAL_REGLOCK_MAXN 0 +# else +# define INTERNAL_REGLOCK_MAXN 0 +# endif +# endif +# if (1 < INTERNAL_REGLOCK_MAXN) +# if !defined(LIBXSMM_CACHE_MAXSIZE) && (8 > INTERNAL_REGLOCK_MAXN) +# define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE +# endif +# if !defined(LIBXSMM_REGLOCK) +# define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT +# endif +# if !defined(LIBXSMM_CLEANUP_NTRY) +# define LIBXSMM_CLEANUP_NTRY 7 +# endif +# if LIBXSMM_LOCK_TYPE_ISPOD(LIBXSMM_REGLOCK) +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype { + char pad[LIBXSMM_CACHELINE]; + LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state; +} internal_reglocktype; +# else +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_reglocktype { + LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) state; +} internal_reglocktype; +# endif +LIBXSMM_APIVAR_DEFINE(internal_reglocktype 
internal_reglock[INTERNAL_REGLOCK_MAXN]); +# else /* RW-lock */ +# if !defined(LIBXSMM_CACHE_MAXSIZE) +# define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE +# endif +# if !defined(LIBXSMM_REGLOCK) +# if defined(LIBXSMM_UNIFY_LOCKS) +# define LIBXSMM_REGLOCK LIBXSMM_LOCK +# elif defined(_MSC_VER) +# define LIBXSMM_REGLOCK LIBXSMM_LOCK_MUTEX +# elif 0 +# define LIBXSMM_REGLOCK LIBXSMM_LOCK_RWLOCK +# else +# define LIBXSMM_REGLOCK LIBXSMM_LOCK_DEFAULT +# endif +# endif +LIBXSMM_APIVAR_DEFINE(LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK)* internal_reglock_ptr); +# endif +#elif !defined(LIBXSMM_CACHE_MAXSIZE) +# define LIBXSMM_CACHE_MAXSIZE LIBXSMM_CAPACITY_CACHE +#endif +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ +# define LIBXSMM_CACHE_STRIDE LIBXSMM_MAX(sizeof(libxsmm_descriptor), LIBXSMM_DESCRIPTOR_MAXSIZE) +#else +# define LIBXSMM_CACHE_STRIDE LIBXSMM_DESCRIPTOR_MAXSIZE +#endif + +#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) +# define INTERNAL_FIND_CODE_CACHE_GROW(RESULT_INDEX, CACHE_SIZE) \ + RESULT_INDEX = CACHE_SIZE; CACHE_SIZE = (unsigned char)(0 != (CACHE_SIZE) ? 
((CACHE_SIZE) << 1) : 1) +# define INTERNAL_FIND_CODE_CACHE_EVICT(RESULT_INDEX, CACHE_SIZE, CACHE_HIT) \ + RESULT_INDEX = (unsigned char)LIBXSMM_MOD2((CACHE_HIT) + ((CACHE_SIZE) - 1), CACHE_SIZE) +#endif + +#if (0 == LIBXSMM_SYNC) +# define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { +# define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) } +#else +# if defined(LIBXSMM_REGLOCK_TRY) +# define INTERNAL_REGLOCK_TRY(DIFF, CODE) \ + if (1 != internal_reglock_count) { /* (re-)try and get (meanwhile) generated code */ \ + LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \ + continue; \ + } \ + else { /* exit dispatch and let client fall back */ \ + DIFF = 0; CODE = 0; break; \ + } +# else +# define INTERNAL_REGLOCK_TRY(DIFF, CODE) \ + LIBXSMM_ASSERT(NULL != internal_registry); /* engine is not shut down */ \ + continue +# endif +# if (1 < INTERNAL_REGLOCK_MAXN) +# define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \ + const unsigned int LOCKINDEX = (0 != internal_reglock_count ? 
LIBXSMM_MOD2(INDEX, internal_reglock_count) : 0); \ + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state)) { \ + INTERNAL_REGLOCK_TRY(DIFF, CODE); \ + } +# define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[LOCKINDEX].state); } +# else /* RW-lock */ +# define INTERNAL_FIND_CODE_LOCK(LOCKINDEX, INDEX, DIFF, CODE) { \ + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) != LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, internal_reglock_ptr)) { \ + INTERNAL_REGLOCK_TRY(DIFF, CODE); \ + } +# define INTERNAL_FIND_CODE_UNLOCK(LOCKINDEX) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); } +# endif +#endif + + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_statistic_type { + unsigned int ntry, ncol, njit, nsta; +} internal_statistic_type; + +#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_cache_entry_type { + libxsmm_descriptor keys[LIBXSMM_CACHE_MAXSIZE]; + libxsmm_code_pointer code[LIBXSMM_CACHE_MAXSIZE]; + unsigned int id; /* to invalidate */ + unsigned char size, hit; +} internal_cache_entry_type; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_cache_type { +# if defined(LIBXSMM_CACHE_PAD) + char pad[LIBXSMM_UP2(sizeof(internal_cache_entry_type),LIBXSMM_CACHELINE)]; +# endif + internal_cache_entry_type entry; +} internal_cache_type; + +# if defined(LIBXSMM_NTHREADS_USE) +LIBXSMM_APIVAR_DEFINE(internal_cache_type* internal_cache_buffer); +# endif +LIBXSMM_APIVAR_DEFINE(int internal_cache_size); +#endif /*defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))*/ + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_regkey_type { +#if defined(LIBXSMM_REGKEY_PAD) + char pad[LIBXSMM_UP2(sizeof(libxsmm_descriptor), LIBXSMM_CACHELINE)]; +#endif + libxsmm_descriptor entry; +} internal_regkey_type; + +/** Determines the try-lock property 
(1m && 1 < desc->n) { /* only record matrix-matrix multiplication */ + const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k); + const int idx = (LIBXSMM_GEMM_PRECISION_F64 == LIBXSMM_GETENUM_OUT(desc->datatype) ? 0 : 1); + int bucket; + if (LIBXSMM_MNK_SIZE(internal_statistic_sml, internal_statistic_sml, internal_statistic_sml) >= kernel_size) { + bucket = 0; + } + else if (LIBXSMM_MNK_SIZE(internal_statistic_med, internal_statistic_med, internal_statistic_med) >= kernel_size) { + bucket = 1; + } + else if (LIBXSMM_MNK_SIZE(internal_statistic_mnk, internal_statistic_mnk, internal_statistic_mnk) >= kernel_size) { + bucket = 2; + } + else { /*huge*/ + bucket = 3; + } + if (0 != ncol) ncol/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ncol, ncol, LIBXSMM_ATOMIC_RELAXED); + if (0 != ntry) ntry/*dummy assignment*/ = LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[idx][bucket].ntry, ntry, LIBXSMM_ATOMIC_RELAXED); + /* the following counters are not manipulated concurrently (no need for atomic increment) */ + if (0 != njit) internal_statistic[idx][bucket].njit += njit; + if (0 != nsta) internal_statistic[idx][bucket].nsta += nsta; + } +} + + +LIBXSMM_API_INLINE unsigned int internal_print_number(unsigned int n, char default_unit, char* unit) +{ + unsigned int number = n; + LIBXSMM_ASSERT(NULL != unit); + *unit = default_unit; + if ((1000000) <= n) { + number = (n + 500000) / 1000000; + *unit = 'm'; + } + else if (9999 < n) { + number = (n + 500) / 1000; + *unit = 'k'; + } + return number; +} + + +LIBXSMM_API_INLINE unsigned int internal_print_statistic(FILE* ostream, + const char* target_arch, int precision, unsigned int linebreaks, unsigned int indent) +{ + const internal_statistic_type statistic_sml = internal_statistic[precision][0/*SML*/]; + const internal_statistic_type statistic_med = internal_statistic[precision][1/*MED*/]; + const internal_statistic_type statistic_big = 
internal_statistic[precision][2/*BIG*/]; + const internal_statistic_type statistic_xxx = internal_statistic[precision][3/*XXX*/]; + int printed = 0; + LIBXSMM_ASSERT(NULL != ostream && (0 <= precision && precision < 2)); + if (/* omit to print anything if it is superfluous */ + 0 != statistic_sml.ntry || 0 != statistic_sml.njit || 0 != statistic_sml.nsta || 0 != statistic_sml.ncol || + 0 != statistic_med.ntry || 0 != statistic_med.njit || 0 != statistic_med.nsta || 0 != statistic_med.ncol || + 0 != statistic_big.ntry || 0 != statistic_big.njit || 0 != statistic_big.nsta || 0 != statistic_big.ncol || + 0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) + { + char title[256], range[256], unit[4]; + unsigned int counter[4]; + { + unsigned int n; + if (NULL != target_arch && '\0' != *target_arch) { + assert(strlen(target_arch) < sizeof(title)); /* !LIBXSMM_ASSERT */ + for (n = 0; 0 != target_arch[n] /*avoid code-gen. issue with some clang versions: && n < sizeof(title)*/; ++n) { + const char c = target_arch[n]; + title[n] = (char)(('a' <= c && c <= 'z') ? (c - 32) : c); /* toupper */ + } + LIBXSMM_SNPRINTF(title + n, sizeof(title) - n, "/%s", 0 == precision ? "DP" : "SP"); + } + else { + LIBXSMM_SNPRINTF(title, sizeof(title), "%s", 0 == precision ? 
"DP" : "SP"); + } + for (n = 0; n < linebreaks; ++n) fprintf(ostream, "\n"); + } + fprintf(ostream, "%*s%-8s %6s %6s %6s %6s\n", (int)indent, "", title, "TRY", "JIT", "STA", "COL"); + LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", 0u, internal_statistic_sml); + counter[0] = internal_print_number(statistic_sml.ntry, ' ', unit + 0); + counter[1] = internal_print_number(statistic_sml.njit, ' ', unit + 1); + counter[2] = internal_print_number(statistic_sml.nsta, ' ', unit + 2); + counter[3] = internal_print_number(statistic_sml.ncol, ' ', unit + 3); + fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, + counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); + LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_sml + 1u, internal_statistic_med); + counter[0] = internal_print_number(statistic_med.ntry, ' ', unit + 0); + counter[1] = internal_print_number(statistic_med.njit, ' ', unit + 1); + counter[2] = internal_print_number(statistic_med.nsta, ' ', unit + 2); + counter[3] = internal_print_number(statistic_med.ncol, ' ', unit + 3); + fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, + counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); + LIBXSMM_SNPRINTF(range, sizeof(range), "%u..%u", internal_statistic_med + 1u, internal_statistic_mnk); + counter[0] = internal_print_number(statistic_big.ntry, ' ', unit + 0); + counter[1] = internal_print_number(statistic_big.njit, ' ', unit + 1); + counter[2] = internal_print_number(statistic_big.nsta, ' ', unit + 2); + counter[3] = internal_print_number(statistic_big.ncol, ' ', unit + 3); + fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, + counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); + if (0 != statistic_xxx.ntry || 0 != statistic_xxx.njit || 0 != statistic_xxx.nsta || 0 != statistic_xxx.ncol) { + LIBXSMM_SNPRINTF(range, 
sizeof(range), "> %u", internal_statistic_mnk); + counter[0] = internal_print_number(statistic_xxx.ntry, ' ', unit + 0); + counter[1] = internal_print_number(statistic_xxx.njit, ' ', unit + 1); + counter[2] = internal_print_number(statistic_xxx.nsta, ' ', unit + 2); + counter[3] = internal_print_number(statistic_xxx.ncol, ' ', unit + 3); + fprintf(ostream, "%*s%8s %6u%c %5u%c %5u%c %5u%c\n", (int)indent, "", range, + counter[0], unit[0], counter[1], unit[1], counter[2], unit[2], counter[3], unit[3]); + } + printed = 1; + } + return printed; +} + + +#if !(defined(_WIN32) || defined(__CYGWIN__)) +LIBXSMM_API_INLINE unsigned int internal_statistic_ntry(int precision) +{ + return internal_statistic[precision][0/*SML*/].ntry + internal_statistic[precision][1/*MED*/].ntry + + internal_statistic[precision][2/*BIG*/].ntry + internal_statistic[precision][3/*XXX*/].ntry; +} +#endif + + +#if !defined(_WIN32) +LIBXSMM_API_INLINE void internal_register_static_code( + libxsmm_gemm_precision precision, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + libxsmm_xmmfunction xgemm, libxsmm_code_pointer* registry) +{ + const libxsmm_blasint lda = m, ldb = k, ldc = m; + /*const*/ int precondition = LIBXSMM_GEMM_NO_BYPASS_DIMS(m, n, k) && LIBXSMM_GEMM_NO_BYPASS_DIMS(lda, ldb, ldc); + if (precondition) { + const size_t size = (LIBXSMM_HASH_SIZE) - sizeof(libxsmm_descriptor_kind); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob, precision, + m, n, k, lda, ldb, ldc, LIBXSMM_ALPHA, LIBXSMM_BETA, LIBXSMM_FLAGS, INTERNAL_PREFETCH); + unsigned int i = LIBXSMM_MOD2( + libxsmm_crc32(LIBXSMM_HASH_SEED, desc, LIBXSMM_MIN(sizeof(libxsmm_gemm_descriptor), size)), + LIBXSMM_CAPACITY_REGISTRY); + libxsmm_code_pointer* dst_entry = registry + i; +#if !defined(NDEBUG) + libxsmm_code_pointer code; code.xgemm = xgemm; + LIBXSMM_ASSERT(NULL != code.ptr_const && NULL != registry); + LIBXSMM_ASSERT(0 == (LIBXSMM_CODE_STATIC & 
code.uval)); +#endif + if (NULL != dst_entry->ptr_const) { /* collision */ + const unsigned int i0 = i; + do { /* continue to linearly search for an available slot */ + i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY); + if (NULL == registry[i].ptr_const) break; + } while (i != i0); +#if defined(LIBXSMM_HASH_COLLISION) /* mark entry as a collision */ + dst_entry->uval |= LIBXSMM_HASH_COLLISION; +#endif + dst_entry = registry + i; /* update destination */ + internal_update_mmstatistic(desc, 0, 1/*collision*/, 0, 0); + /* out of capacity (no registry slot available) */ + LIBXSMM_ASSERT(NULL == dst_entry->ptr_const || i == i0); + } + if (NULL == dst_entry->ptr_const) { /* registry not exhausted */ + internal_registry_keys[i].entry.kind = LIBXSMM_KERNEL_KIND_MATMUL; + LIBXSMM_ASSIGN127(&internal_registry_keys[i].entry.gemm.desc, desc); + dst_entry->xgemm = xgemm; + /* mark current entry as static code (non-JIT) */ + dst_entry->uval |= LIBXSMM_CODE_STATIC; + } + internal_update_mmstatistic(desc, 1/*try*/, 0, 0, 0); + } +} +#endif + + +LIBXSMM_API_INTERN void internal_release_scratch(void); +LIBXSMM_API_INTERN void internal_release_scratch(void) +{ + libxsmm_xrelease_scratch(NULL/*lock*/); + /* release global services */ + libxsmm_memory_finalize(); + libxsmm_hash_finalize(); + libxsmm_malloc_finalize(); +} + + +/* Caution: cannot be used multiple times in a single expression! */ +LIBXSMM_API_INTERN size_t libxsmm_format_value(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base) +{ + const int len = (NULL != scale ? ((int)strlen(scale)) : 0); + const int m = LIBXSMM_INTRINSICS_BITSCANBWD64(nbytes) / base, n = LIBXSMM_MIN(m, len); + int i; + buffer[0] = 0; /* clear */ + LIBXSMM_ASSERT(NULL != unit && 0 <= base); + for (i = 0; i < n; ++i) nbytes >>= base; + LIBXSMM_SNPRINTF(buffer, buffer_size, "%i %c%s", + (int)nbytes, 0 < n ? scale[n-1] : *unit, 0 < n ? 
unit : ""); + return nbytes; +} + + +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_dump(FILE* ostream, int urgent); +LIBXSMM_API_INTERN void internal_dump(FILE* ostream, int urgent) +{ + char *const env_dump_build = getenv("LIBXSMM_DUMP_BUILD"); + char *const env_dump_files = (NULL != getenv("LIBXSMM_DUMP_FILES") + ? getenv("LIBXSMM_DUMP_FILES") + : getenv("LIBXSMM_DUMP_FILE")); + LIBXSMM_ASSERT_MSG(INTERNAL_SINGLETON(internal_singleton_handle), "Invalid handle"); + /* determine whether this instance is unique or not */ + if (NULL != env_dump_files && '\0' != *env_dump_files && 0 == urgent) { /* dump per-node info */ + const char* filename = strtok(env_dump_files, INTERNAL_DELIMS); + char buffer[1024]; + for (; NULL != filename; filename = strtok(NULL, INTERNAL_DELIMS)) { + FILE* file = fopen(filename, "r"); + if (NULL != file) buffer[0] = '\0'; + else { /* parse keywords */ + const int seconds = atoi(filename); + if (0 == seconds) { + const char *const pid = strstr(filename, "PID"); + if (NULL != pid) { /* PID-keyword is present */ + int n = (int)(pid - filename); + n = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%.*s%u%s", n, filename, libxsmm_get_pid(), filename + n + 3); + if (0 < n && (int)sizeof(buffer) > n) { + file = fopen(buffer, "r"); + filename = buffer; + } + } + } + else { + fprintf(stderr, "LIBXSMM INFO: PID=%u\n", libxsmm_get_pid()); + if (0 < seconds) { +#if defined(_WIN32) + Sleep((DWORD)(1000 * seconds)); +#else + LIBXSMM_EXPECT(EXIT_SUCCESS, sleep(seconds)); +#endif + } + else for(;;) LIBXSMM_SYNC_YIELD; + } + } + if (NULL != file) { + int c = fgetc(file); + fprintf(ostream, "\n\nLIBXSMM_DUMP_FILE: %s\n", filename); + /* coverity[tainted_data] */ + while (EOF != c) { + fputc(c, stdout); + c = fgetc(file); + } + fputc('\n', stdout); + fclose(file); + } + } + } + if (NULL != internal_build_state /* dump build state */ + && NULL != env_dump_build && '\0' != *env_dump_build) + { + const int dump_build = atoi(env_dump_build); + if (0 == 
urgent ? (0 < dump_build) : (0 > dump_build)) { + fprintf(ostream, "\n\nBUILD_DATE=%i\n", LIBXSMM_CONFIG_BUILD_DATE); + fprintf(ostream, "%s\n", internal_build_state); + } + } +} + + +LIBXSMM_API_INTERN void internal_finalize(void); +LIBXSMM_API_INTERN void internal_finalize(void) +{ + libxsmm_finalize(); + LIBXSMM_STDIO_ACQUIRE(); /* synchronize I/O */ + if (0 != libxsmm_verbosity) { /* print statistic on termination */ + const char *const env_target_hidden = getenv("LIBXSMM_TARGET_HIDDEN"); + const char *const target_arch = (NULL == env_target_hidden || 0 == atoi(env_target_hidden)) + ? libxsmm_cpuid_name(libxsmm_target_archid) : NULL/*hidden*/; + fprintf(stderr, "\nLIBXSMM_VERSION: %s%s%s (%i)", LIBXSMM_BRANCH, + 0 != *(LIBXSMM_BRANCH) ? "-" : "", 0 != *(LIBXSMM_VERSION) ? (LIBXSMM_VERSION) : "unconfigured", + LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH)); + if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { + unsigned int linebreak = (0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0)) ? 
1 : 0; + const int high_verbosity = (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity); + char number_format_buffer[32]; + libxsmm_scratch_info scratch_info; +#if defined(LIBXSMM_PLATFORM_X86) + libxsmm_cpuid_info info; + libxsmm_cpuid_x86(&info); + if ((LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) && + 0 == internal_cpuid_info.has_context && 0 != info.has_context) + { + fprintf(stderr, "\nLIBXSMM: CPU features have been promoted."); + } +#endif + if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak && NULL != target_arch) { + fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch); + } + if (0 != libxsmm_format_value(number_format_buffer, sizeof(number_format_buffer), +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + sizeof(internal_cache_type) * (LIBXSMM_NTHREADS_MAX) + +#endif + (sizeof(internal_regkey_type) + sizeof(libxsmm_code_pointer)) * (LIBXSMM_CAPACITY_REGISTRY), + "KM", "B", 10)) + { + fprintf(stderr, "Registry and code: %s", number_format_buffer); + if (0 != libxsmm_format_value(number_format_buffer, sizeof(number_format_buffer), internal_registry_nbytes, "KM", "B", 10)) { + fprintf(stderr, " + %s", number_format_buffer); + } + if (0 != high_verbosity) { + unsigned int ngemms = 0; + int i; for (i = 0; i < 4; ++i) { + ngemms += internal_statistic[0/*DP*/][i].nsta + internal_statistic[1/*SP*/][i].nsta; + ngemms += internal_statistic[0/*DP*/][i].njit + internal_statistic[1/*SP*/][i].njit; + } + if (0 != ngemms || 0 != internal_statistic_num_gemv + || 0 != internal_statistic_num_meltw + || 0 != libxsmm_statistic_num_spmdm + || 0 != internal_statistic_num_user + || 0 != internal_registry_nleaks) + { + const char sep[] = " ", *s = ""; + fprintf(stderr, " ("); + if (0 != ngemms) { fprintf(stderr, "gemm=%u", ngemms); s = sep; } + if (0 != internal_statistic_num_gemv) { fprintf(stderr, "%sgemv=%u", s, 
internal_statistic_num_gemv); s = sep; } + if (0 != internal_statistic_num_meltw) { fprintf(stderr, "%smeltw=%u", s, internal_statistic_num_meltw); s = sep; } + if (0 != libxsmm_statistic_num_spmdm) { fprintf(stderr, "%sspmdm=%u", s, libxsmm_statistic_num_spmdm); s = sep; } + if (0 != internal_statistic_num_user) { fprintf(stderr, "%suser=%u", s, internal_statistic_num_user); s = sep; } + if (0 != internal_registry_nleaks) { fprintf(stderr, "%snleaks=%u", s, internal_registry_nleaks); s = sep; } + fprintf(stderr, ")"); + } + } + fprintf(stderr, "\n"); + } + if (EXIT_SUCCESS == libxsmm_get_scratch_info(&scratch_info)) { + if (0 != scratch_info.size && + 0 != libxsmm_format_value(number_format_buffer, sizeof(number_format_buffer), scratch_info.size, "KM", "B", 10)) + { + fprintf(stderr, "Scratch: %s", number_format_buffer); + if (0 != high_verbosity) { + fprintf(stderr, " (mallocs=%lu, pools=%u)\n", (unsigned long int)scratch_info.nmallocs, scratch_info.npools); + } + else { + fprintf(stderr, "\n"); + } + } + if (0 != scratch_info.internal && 0 != high_verbosity && + libxsmm_format_value(number_format_buffer, sizeof(number_format_buffer), scratch_info.internal, "KM", "B", 10)) + { + fprintf(stderr, "Private: %s\n", number_format_buffer); + } + } + if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { + fprintf(stderr, "Uptime: %f s", libxsmm_timer_duration(internal_timer_start, libxsmm_timer_tick())); + if (1 < libxsmm_thread_count && INT_MAX == libxsmm_verbosity) { + fprintf(stderr, " (nthreads=%u)", libxsmm_thread_count); + } + fprintf(stderr, "\n"); + } + } + else { + fprintf(stderr, "\nLIBXSMM_TARGET: %s\n", target_arch); + } + } + /* release scratch memory pool */ + if (EXIT_SUCCESS != atexit(internal_release_scratch) && 0 != libxsmm_verbosity) { + fprintf(stderr, "LIBXSMM ERROR: failed to perform final cleanup!\n"); + } + /* determine whether this instance is unique or not */ + if (INTERNAL_SINGLETON(internal_singleton_handle)) { + 
internal_dump(stdout, 0/*urgent*/); + /* cleanup singleton */ +#if defined(_WIN32) + ReleaseMutex(internal_singleton_handle); + CloseHandle(internal_singleton_handle); +#else + unlink(internal_singleton_fname); + close(internal_singleton_handle); +#endif + } + LIBXSMM_STDIO_RELEASE(); /* synchronize I/O */ +#if (0 != LIBXSMM_SYNC) + { /* release locks */ +# if (1 < INTERNAL_REGLOCK_MAXN) + int i; for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, &internal_reglock[i].state); +# elif !defined(LIBXSMM_UNIFY_LOCKS) + LIBXSMM_LOCK_DESTROY(LIBXSMM_REGLOCK, internal_reglock_ptr); +# endif + LIBXSMM_LOCK_DESTROY(LIBXSMM_LOCK, &libxsmm_lock_global); + } +#endif +} + + +#if defined(LIBXSMM_INTERCEPT_DYNAMIC) +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* /*message*/, int /*len*/, int /*quiet*/); +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void _gfortran_stop_string(const char* message, int len, int quiet) +{ /* STOP termination handler for GNU Fortran runtime */ + static int once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { + union { const void* dlsym; void (*ptr)(const char*, int, int); } stop; + dlerror(); /* clear an eventual error status */ + stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "_gfortran_stop_string"); + if (NULL != stop.dlsym) { + stop.ptr(message, len, quiet); + } + else exit(EXIT_SUCCESS); /* statically linked runtime */ + } +} + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* /*message*/, int /*len*/); +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core(const char* message, int len) +{ /* STOP termination handler for Intel Fortran runtime */ + static int once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { + union { const void* dlsym; void (*ptr)(const char*, int); } stop; + dlerror(); /* clear an eventual error status */ + stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core"); + if (NULL != stop.dlsym) { + stop.ptr(message, 
len); + } + else exit(EXIT_SUCCESS); /* statically linked runtime */ + } +} + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void); +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void for_stop_core_quiet(void) +{ /* STOP termination handler for Intel Fortran runtime */ + static int once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&once, 1, LIBXSMM_ATOMIC_RELAXED)) { + union { const void* dlsym; void (*ptr)(void); } stop; + dlerror(); /* clear an eventual error status */ + stop.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "for_stop_core_quiet"); + if (NULL != stop.dlsym) { + stop.ptr(); + } + else exit(EXIT_SUCCESS); /* statically linked runtime */ + } +} +#endif + + +LIBXSMM_API_INTERN size_t internal_strlen(const char* /*cstr*/, size_t /*maxlen*/); +LIBXSMM_API_INTERN size_t internal_strlen(const char* cstr, size_t maxlen) +{ + size_t result = 0; + if (NULL != cstr) { + while (0 != cstr[result] && result < maxlen) ++result; + } + return result; +} + + +LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* /*nbytes*/, size_t /*ndefault*/, int* /*valid*/); +LIBXSMM_API_INTERN size_t internal_parse_nbytes(const char* nbytes, size_t ndefault, int* valid) +{ + size_t result = ndefault; + if (NULL != nbytes && '\0' != *nbytes) { + size_t u = internal_strlen(nbytes, 32) - 1; + const char units[] = "kmgKMG", *const unit = strchr(units, nbytes[u]); + char* end = NULL; + /* take parsed value with increased type-width */ + const long long int ibytes = strtol(nbytes, &end, 10); + if (NULL != end && ( /* no obvious error */ + /* must match allowed set of units */ + (NULL != unit && *unit == *end) || + /* value is given without unit */ + (NULL == unit && '\0' == *end))) + { + result = (size_t)ibytes; + if ((size_t)LIBXSMM_UNLIMITED != result) { + u = (NULL != unit ? 
((unit - units) % 3) : 3); + if (u < 3) { + result <<= (u + 1) * 10; + } + } + if (NULL != valid) *valid = 1; + } + else if (NULL != valid) *valid = 0; + } + else if (NULL != valid) { + *valid = 0; + } + return result; +} + + +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_NO_TRACE void internal_init(void); +LIBXSMM_API_INTERN void internal_init(void) +{ + int i; +#if (0 != LIBXSMM_SYNC) /* setup the locks in a thread-safe fashion */ + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global); +# if (1 < INTERNAL_REGLOCK_MAXN) + for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, &internal_reglock[i].state); +# elif !defined(LIBXSMM_UNIFY_LOCKS) + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr); +# endif +#endif + if (NULL == internal_registry) { /* double-check after acquiring the lock(s) */ +#if defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_AUTOPIN) + /* clear error status (dummy condition: it does not matter if MPI_Init or MPI_Abort) */ + const char *const dlsymname = (NULL == dlerror() ? "MPI_Init" : "MPI_Abort"); + const void *const dlsymbol = dlsym(LIBXSMM_RTLD_NEXT, dlsymname); + const void *const dlmpi = (NULL == dlerror() ? 
dlsymbol : NULL); +#endif + const char *const env_verbose = getenv("LIBXSMM_VERBOSE"); + void* new_registry = NULL, * new_keys = NULL; +#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) +# if defined(LIBXSMM_NTHREADS_USE) + void* new_cache = NULL; +# endif + const char *const env_cache = getenv("LIBXSMM_CACHE"); + if (NULL != env_cache && '\0' != *env_cache) { + const int cache_size = atoi(env_cache), cache_size2 = LIBXSMM_UP2POT(cache_size); + internal_cache_size = LIBXSMM_MIN(cache_size2, LIBXSMM_CACHE_MAXSIZE); + } + else { + internal_cache_size = LIBXSMM_CACHE_MAXSIZE; + } +#endif + /* setup verbosity as early as possible since below code may rely on verbose output */ + if (NULL != env_verbose && '\0' != *env_verbose) { + libxsmm_verbosity = atoi(env_verbose); + } +#if !defined(NDEBUG) + else { + libxsmm_verbosity = INT_MAX; /* quiet -> verbose */ + } +#endif +#if (0 == LIBXSMM_JIT) + if (2 > libxsmm_ninit && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) { + fprintf(stderr, "LIBXSMM: JIT-code generation was disabled at compile-time.\n"); + } +#endif +#if defined(LIBXSMM_AUTOPIN) +# if defined(LIBXSMM_INTERCEPT_DYNAMIC) + /* MPI: unwanted affinity can slow-down unrelated jobs (over-subscription), e.g., CP2K regtests */ + if (NULL == dlmpi) +# endif + { /* setup some viable affinity if nothing else is present */ + const char *const gomp_cpu_affinity = getenv("GOMP_CPU_AFFINITY"); + const char *const kmp_affinity = getenv("KMP_AFFINITY"); + const char *const omp_proc_bind = getenv("OMP_PROC_BIND"); + if ((NULL == gomp_cpu_affinity || 0 == *gomp_cpu_affinity) + && (NULL == kmp_affinity || 0 == *kmp_affinity) + && (NULL == omp_proc_bind || 0 == *omp_proc_bind)) + { + static char affinity[] = "OMP_PROC_BIND=TRUE"; + LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(affinity)); + if (LIBXSMM_VERBOSITY_HIGH < libxsmm_verbosity || 0 > libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM: prepared 
to pin threads.\n"); + } + } + } +# if defined(LIBXSMM_INTERCEPT_DYNAMIC) && 1 + else if (NULL == getenv("I_MPI_SHM_HEAP")) { + static char shmheap[] = "I_MPI_SHM_HEAP=1"; + LIBXSMM_EXPECT(EXIT_SUCCESS, LIBXSMM_PUTENV(shmheap)); + } +# endif +#endif +#if !defined(_WIN32) && 0 + umask(S_IRUSR | S_IWUSR); /* setup default/secure file mask */ +#endif +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + { const char *const env = getenv("LIBXSMM_SCRATCH_POOLS"); + if (NULL == env || 0 == *env) { + libxsmm_scratch_pools = LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS; + } + else { + libxsmm_scratch_pools = LIBXSMM_CLMP(atoi(env), 0, LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + /*libxsmm_scratch_pools_locked = 1;*/ + } + LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + } + { const char *const env = getenv("LIBXSMM_SCRATCH_SCALE"); + if (NULL == env || 0 == *env) { + libxsmm_scratch_scale = LIBXSMM_MALLOC_SCRATCH_SCALE; + } + else { + libxsmm_scratch_scale = LIBXSMM_CLMP(atof(env), 1.0, 10.0); + /*libxsmm_scratch_scale_locked = 1;*/ + } + assert(1 <= libxsmm_scratch_scale); /* !LIBXSMM_ASSERT */ + } + libxsmm_set_scratch_limit(internal_parse_nbytes(getenv("LIBXSMM_SCRATCH_LIMIT"), LIBXSMM_SCRATCH_DEFAULT, NULL/*valid*/)); +#endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ + { /* setup malloc-interception after internal allocations */ + const libxsmm_malloc_function null_malloc_fn = { 0 }; + const libxsmm_free_function null_free_fn = { 0 }; + char *const env_k = getenv("LIBXSMM_MALLOC"), *const env_t = getenv("LIBXSMM_MALLOC_LIMIT"), *end = NULL; + const char* env_i = (NULL != env_t ? strtok(env_t, INTERNAL_DELIMS) : NULL); + size_t malloc_lo = internal_parse_nbytes(env_i, LIBXSMM_MALLOC_LIMIT, NULL/*valid*/); + size_t malloc_hi = (NULL != env_i ? 
internal_parse_nbytes( + strtok(NULL, INTERNAL_DELIMS), LIBXSMM_SCRATCH_UNLIMITED, NULL/*valid*/) : LIBXSMM_SCRATCH_UNLIMITED); + const int malloc_kind = ((NULL == env_k || 0 == *env_k) ? 0/*disabled*/ : ((int)strtol(env_k, &end, 10))); + libxsmm_xset_default_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn); + libxsmm_xset_scratch_allocator(NULL/*lock*/, NULL/*context*/, null_malloc_fn, null_free_fn); + /* libxsmm_set_malloc implies libxsmm_malloc_init */ + if (NULL == end) { + libxsmm_set_malloc(0, &malloc_lo, &malloc_hi); + } + else if ('\0' == *end) { + libxsmm_set_malloc(malloc_kind, &malloc_lo, &malloc_hi); + } + else { + int valid = 1; + env_i = strtok(env_k, INTERNAL_DELIMS); + malloc_lo = internal_parse_nbytes(env_i, LIBXSMM_MALLOC_LIMIT, &valid); + env_i = (0 != valid ? strtok(NULL, INTERNAL_DELIMS) : NULL); + malloc_hi = (NULL != env_i + ? internal_parse_nbytes(env_i, LIBXSMM_SCRATCH_UNLIMITED, &valid) + : LIBXSMM_SCRATCH_UNLIMITED); + libxsmm_set_malloc(0 != valid ? 1 : 0, &malloc_lo, &malloc_hi); + } + } +#if defined(LIBXSMM_MAXTARGET) + libxsmm_set_target_arch(LIBXSMM_STRINGIFY(LIBXSMM_MAXTARGET)); +#else /* attempt to set libxsmm_target_archid per environment variable */ + libxsmm_set_target_arch(getenv("LIBXSMM_TARGET")); +#endif + { const char *const env = getenv("LIBXSMM_SYNC"); + libxsmm_nosync = (NULL == env || 0 == *env) ? 
0/*default*/ : atoi(env); + } + /* clear internal counters/statistic */ + for (i = 0; i < 4/*sml/med/big/xxx*/; ++i) { + LIBXSMM_MEMZERO127(&internal_statistic[0/*DP*/][i]); + LIBXSMM_MEMZERO127(&internal_statistic[1/*SP*/][i]); + } + internal_statistic_mnk = LIBXSMM_MAX_DIM; + internal_statistic_sml = 13; + internal_statistic_med = 23; + LIBXSMM_ASSERT(LIBXSMM_ISPOT(LIBXSMM_CAPACITY_REGISTRY)); + libxsmm_hash_init(libxsmm_target_archid); /* used by debug memory allocation (checksum) */ + libxsmm_memory_init(libxsmm_target_archid); + if ( +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + (EXIT_SUCCESS == libxsmm_xmalloc(&new_cache, /* if internal_cache_size is zero, allocation must still happen (later control-flow too expensive) */ + sizeof(internal_cache_type) * (LIBXSMM_NTHREADS_MAX), LIBXSMM_CACHELINE/*alignment*/, + LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_cache) && +#endif + (EXIT_SUCCESS == libxsmm_xmalloc(&new_keys, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(internal_regkey_type), 0/*auto-align*/, + LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_keys) && + (EXIT_SUCCESS == libxsmm_xmalloc(&new_registry, (LIBXSMM_CAPACITY_REGISTRY) * sizeof(libxsmm_code_pointer), 0/*auto-align*/, + LIBXSMM_MALLOC_FLAG_PRIVATE, NULL/*extra*/, 0/*extra-size*/) && NULL != new_registry)) + { +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + LIBXSMM_ASSERT(NULL != new_cache); /* SA: suppress false positive */ + memset(new_cache, 0, (LIBXSMM_NTHREADS_MAX) * sizeof(internal_cache_type)); +#endif + libxsmm_xcopy_init(libxsmm_target_archid); + libxsmm_dnn_init(libxsmm_target_archid); + { const char *const env = getenv("LIBXSMM_GEMM_PREFETCH"); +#if (defined(_WIN32) || defined(__CYGWIN__)) + libxsmm_gemm_auto_prefetch_default = INTERNAL_PREFETCH; +#else + libxsmm_gemm_auto_prefetch_default = (0 == 
internal_statistic_ntry(0/*DP*/) && 0 == internal_statistic_ntry(1/*SP*/)) + /* avoid special prefetch if static code is present, since such code uses INTERNAL_PREFETCH */ + ? (((LIBXSMM_X86_AVX512 >= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid)) + ? LIBXSMM_GEMM_PREFETCH_AL2BL2_VIA_C : LIBXSMM_GEMM_PREFETCH_BL2_VIA_C) + : INTERNAL_PREFETCH; +#endif + libxsmm_gemm_auto_prefetch = INTERNAL_PREFETCH; + if (NULL != env && '\0' != *env) { /* user input beyond auto-prefetch is always considered */ + const int uid = atoi(env); + if (0 <= uid) { + libxsmm_gemm_auto_prefetch_default = libxsmm_gemm_uid2prefetch(uid); + libxsmm_gemm_auto_prefetch = libxsmm_gemm_auto_prefetch_default; + internal_gemm_auto_prefetch_locked = 1; + } + } + } + for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) ((libxsmm_code_pointer*)new_registry)[i].ptr = NULL; + LIBXSMM_ASSERT(NULL == internal_registry && NULL == internal_registry_keys); +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + LIBXSMM_ASSERT(NULL == internal_cache_buffer); + internal_cache_buffer = (internal_cache_type*)new_cache; +#endif + internal_registry_keys = (internal_regkey_type*)new_keys; /* prior to registering static kernels */ +#if defined(LIBXSMM_BUILD) && !defined(LIBXSMM_DEFAULT_CONFIG) +# include +#endif + libxsmm_gemm_init(libxsmm_target_archid); +#if defined(LIBXSMM_TRACE) + { int filter_threadid = 0/*only main-thread*/, filter_mindepth = 0, filter_maxnsyms = 0; + const int init_code = libxsmm_trace_init(filter_threadid, filter_mindepth, filter_maxnsyms); + if (EXIT_SUCCESS != init_code && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: failed to initialize TRACE (error #%i)!\n", init_code); + } + } +#endif + { /* commit the registry buffer and enable global visibility */ + void *const pv_registry = &internal_registry; + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, 
LIBXSMM_BITS)((void**)pv_registry, (void*)new_registry, LIBXSMM_ATOMIC_SEQ_CST); + } + } + else { + if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: failed to allocate internal buffers!\n"); + } + libxsmm_xfree(new_registry, 0/*no check*/); + libxsmm_xfree(new_keys, 0/*no check*/); +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + libxsmm_xfree(new_cache, 0/*no check*/); +#endif + } + } +#if (0 != LIBXSMM_SYNC) /* release locks */ +# if (1 < INTERNAL_REGLOCK_MAXN) + for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state); +# elif !defined(LIBXSMM_UNIFY_LOCKS) + LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); +# endif + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global); +#endif +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_CTOR void libxsmm_init(void) +{ + if (0 == LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED)) { + static unsigned int ninit = 0, gid = 0; + const unsigned int tid = LIBXSMM_ATOMIC_ADD_FETCH(&ninit, 1, LIBXSMM_ATOMIC_SEQ_CST); + LIBXSMM_ASSERT(0 < tid); + /* libxsmm_ninit (1: initialization started, 2: library initialized, higher: to invalidate code-TLS) */ + if (1 == tid) { + libxsmm_timer_tickint s0 = libxsmm_timer_tick_rtc(); /* warm-up */ + libxsmm_timer_tickint t0 = libxsmm_timer_tick_tsc(); /* warm-up */ + s0 = libxsmm_timer_tick_rtc(); t0 = libxsmm_timer_tick_tsc(); /* start timing */ + assert(0 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */ + /* coverity[check_return] */ + LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST); + gid = tid; /* protect initialization */ +#if (0 != LIBXSMM_SYNC) + /* coverity[check_return] */ + LIBXSMM_TLS_CREATE(&libxsmm_tlskey); + { /* construct and initialize locks */ +# if defined(LIBXSMM_REGLOCK_TRY) + const char *const env_trylock = getenv("LIBXSMM_TRYLOCK"); +# 
endif + LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_LOCK) attr_global; +# if (1 < INTERNAL_REGLOCK_MAXN) + int i; + LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr; + LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr); +# elif defined(LIBXSMM_UNIFY_LOCKS) + internal_reglock_ptr = &libxsmm_lock_global; +# else + static LIBXSMM_LOCK_TYPE(LIBXSMM_REGLOCK) internal_reglock; + internal_reglock_ptr = &internal_reglock; + LIBXSMM_LOCK_ATTR_TYPE(LIBXSMM_REGLOCK) attr; + LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_REGLOCK, &attr); + LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, internal_reglock_ptr, &attr); + LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr); +# endif + LIBXSMM_LOCK_ATTR_INIT(LIBXSMM_LOCK, &attr_global); + LIBXSMM_LOCK_INIT(LIBXSMM_LOCK, &libxsmm_lock_global, &attr_global); + LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_LOCK, &attr_global); + /* control number of locks needed; LIBXSMM_TRYLOCK implies only 1 lock */ +# if defined(LIBXSMM_REGLOCK_TRY) + if (NULL == env_trylock || 0 == *env_trylock) +# endif + { /* no LIBXSMM_TRYLOCK */ +# if defined(LIBXSMM_VTUNE) + internal_reglock_count = 1; /* avoid duplicated kernels */ +# elif (1 < INTERNAL_REGLOCK_MAXN) + const char *const env_nlocks = getenv("LIBXSMM_NLOCKS"); + const int reglock_count = (NULL == env_nlocks || 0 == *env_nlocks || 1 > atoi(env_nlocks)) + ? (INTERNAL_REGLOCK_MAXN) : LIBXSMM_MIN(atoi(env_nlocks), INTERNAL_REGLOCK_MAXN); + internal_reglock_count = LIBXSMM_LO2POT(reglock_count); +# else + internal_reglock_count = 0; +# endif + } +# if defined(LIBXSMM_REGLOCK_TRY) + else { /* LIBXSMM_TRYLOCK environment variable specified */ + internal_reglock_count = (0 != atoi(env_trylock) ? 
1 +# if (1 < INTERNAL_REGLOCK_MAXN) + : INTERNAL_REGLOCK_MAXN); +# else + : 0); +# endif + } +# endif +# if (1 < INTERNAL_REGLOCK_MAXN) + LIBXSMM_ASSERT(1 <= internal_reglock_count); + for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_INIT(LIBXSMM_REGLOCK, &internal_reglock[i].state, &attr); + LIBXSMM_LOCK_ATTR_DESTROY(LIBXSMM_REGLOCK, &attr); +# endif + } +#endif + { /* determine whether this instance is unique or not */ +#if defined(_WIN32) + internal_singleton_handle = CreateMutex(NULL, TRUE, "GlobalLIBXSMM"); +#else + const int result = LIBXSMM_SNPRINTF(internal_singleton_fname, sizeof(internal_singleton_fname), "/tmp/.libxsmm.%u", + /*rely on user id to avoid permission issues in case of left-over files*/(unsigned int)getuid()); + struct flock singleton_flock; + int singleton_handle; + singleton_flock.l_start = 0; + singleton_flock.l_len = 0; /* entire file */ + singleton_flock.l_type = F_WRLCK; /* exclusive across PIDs */ + singleton_flock.l_whence = SEEK_SET; + singleton_handle = ((0 < result && (int)sizeof(internal_singleton_fname) > result) ? open( + internal_singleton_fname, O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR) : -1); + internal_singleton_handle = fcntl(singleton_handle, F_SETLK, &singleton_flock); + if (0 > internal_singleton_handle && 0 <= singleton_handle) close(singleton_handle); +#endif /* coverity[leaked_handle] */ + } + { /* calibrate timer */ + int register_termination_proc; + libxsmm_timer_tickint s1, t1; + internal_init(); /* must be first to initialize verbosity, etc. 
*/ + if (INTERNAL_SINGLETON(internal_singleton_handle)) { /* after internal_init */ + internal_dump(stdout, 1/*urgent*/); + } + s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* mid-timing */ +#if defined(LIBXSMM_PLATFORM_X86) + libxsmm_cpuid_x86(&internal_cpuid_info); + if (0 != internal_cpuid_info.constant_tsc && t0 < t1) { + libxsmm_timer_scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0); + } +#endif + register_termination_proc = atexit(internal_finalize); + s1 = libxsmm_timer_tick_rtc(); t1 = libxsmm_timer_tick_tsc(); /* final timing */ + /* set timer-scale and determine start of the "uptime" (shown at termination) */ + if (t0 < t1 && 0.0 < libxsmm_timer_scale) { + const double scale = libxsmm_timer_duration_rtc(s0, s1) / (t1 - t0); + const double diff = LIBXSMM_DELTA(libxsmm_timer_scale, scale) / scale; + if (5E-4 > diff) { + libxsmm_timer_scale = scale; + internal_timer_start = t0; + } + else { + libxsmm_timer_scale = 0; + internal_timer_start = s0; +#if defined(_DEBUG) + libxsmm_se = 1; +#endif + } + } + else { + internal_timer_start = s0; + libxsmm_timer_scale = 0; + } + if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + if (EXIT_SUCCESS != register_termination_proc) { + fprintf(stderr, "LIBXSMM ERROR: failed to register termination procedure!\n"); + } + if (0 == libxsmm_timer_scale +#if defined(LIBXSMM_PLATFORM_X86) + && 0 == internal_cpuid_info.constant_tsc +#endif + && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) + { + /* ARM: TSC is currently not implemented, hence warning shows up (if verbose) */ + fprintf(stderr, "LIBXSMM WARNING: timer is maybe not cycle-accurate!\n"); + } + } + } + assert(1 == LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_SEQ_CST)); /* !LIBXSMM_ASSERT */ + /* coverity[check_return] */ + LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_SEQ_CST); + } + else /*if (gid != tid)*/ { /* avoid recursion */ + LIBXSMM_ASSERT(gid != tid); + 
LIBXSMM_UNUSED(gid); + while (2 > LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_RELAXED)) LIBXSMM_SYNC_YIELD; + internal_init(); + } +#if defined(LIBXSMM_PERF) + libxsmm_perf_init(); +#endif + } + LIBXSMM_ASSERT(1 < libxsmm_ninit); +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void libxsmm_finalize(void); +LIBXSMM_API LIBXSMM_ATTRIBUTE_DTOR void libxsmm_finalize(void) +{ + void *const regaddr = &internal_registry; + uintptr_t regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED); + libxsmm_code_pointer* registry = (libxsmm_code_pointer*)regptr; + if (NULL != registry) { + int i; +#if (0 != LIBXSMM_SYNC) + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, &libxsmm_lock_global); +# if (1 < INTERNAL_REGLOCK_MAXN) + { /* acquire locks and thereby shortcut lazy initialization later on */ + int ntry = 0, n; + do { + for (i = 0, n = 0; i < internal_reglock_count; ++i) { + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_REGLOCK) == LIBXSMM_LOCK_TRYLOCK(LIBXSMM_REGLOCK, &internal_reglock[i].state)) ++n; + } + ntry += (0 == n ? 
1 : 0); + } while (n < internal_reglock_count && ntry < LIBXSMM_CLEANUP_NTRY); + } +# elif !defined(LIBXSMM_UNIFY_LOCKS) + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_REGLOCK, internal_reglock_ptr); +# endif +#endif + regptr = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_RELAXED); + registry = (libxsmm_code_pointer*)regptr; + if (NULL != registry) { + internal_regkey_type *const registry_keys = internal_registry_keys; +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + internal_cache_type *const cache_buffer = internal_cache_buffer; +#endif + unsigned int rest = 0, errors = 0; +#if defined(LIBXSMM_TRACE) + i = libxsmm_trace_finalize(); + if (EXIT_SUCCESS != i && 0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: failed to finalize trace (error #%i)!\n", i); + } +#endif +#if defined(LIBXSMM_PERF) + libxsmm_perf_finalize(); +#endif + libxsmm_xcopy_finalize(); + libxsmm_gemm_finalize(); + libxsmm_dnn_finalize(); + /* coverity[check_return] */ + LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */ +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + internal_cache_buffer = NULL; +#endif + internal_registry_keys = NULL; /* make registry keys unavailable */ + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE_ZERO, LIBXSMM_BITS)((uintptr_t*)regaddr, LIBXSMM_ATOMIC_SEQ_CST); + internal_registry_nbytes = 0; internal_registry_nleaks = 0; + for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) { + /*const*/ libxsmm_code_pointer code = registry[i]; + if (NULL != code.ptr_const) { + /* check if the registered entity is a GEMM kernel */ + switch (LIBXSMM_DESCRIPTOR_KIND(registry_keys[i].entry.kind)) { + case LIBXSMM_KERNEL_KIND_MATMUL: { + const libxsmm_gemm_descriptor *const desc = ®istry_keys[i].entry.gemm.desc; + if (1 < desc->m && 1 < desc->n) { + const unsigned int 
njit = (0 == (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0); + const unsigned int nsta = (0 != (LIBXSMM_CODE_STATIC & code.uval) ? 1 : 0); + /* count whether kernel is static or JIT-code */ + internal_update_mmstatistic(desc, 0, 0, njit, nsta); + } + else { + ++internal_statistic_num_gemv; + } + ++rest; + } break; + case LIBXSMM_KERNEL_KIND_MELTW: { + ++internal_statistic_num_meltw; + } break; + case LIBXSMM_KERNEL_KIND_USER: { + ++internal_statistic_num_user; + } break; + default: if (LIBXSMM_KERNEL_UNREGISTERED <= LIBXSMM_DESCRIPTOR_KIND(registry_keys[i].entry.kind)) { + ++errors; + } + else { + ++rest; + } + } + if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + if (0 != errors) { + fprintf(stderr, "LIBXSMM ERROR: code registry is corrupted!\n"); + } + if (LIBXSMM_CAPACITY_REGISTRY == (rest + errors + internal_statistic_num_gemv + + internal_statistic_num_user + internal_statistic_num_meltw)) + { + fprintf(stderr, "LIBXSMM WARNING: code registry was exhausted!\n"); + } + } + if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */ +# if defined(__APPLE__) && defined(__arm64__) +# else + void* buffer = NULL; + size_t size = 0; +# endif +#if defined(LIBXSMM_HASH_COLLISION) + code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */ +#endif +# if defined(__APPLE__) && defined(__arm64__) + ++internal_registry_nleaks; +#else + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(code.ptr_const, &size, NULL/*flags*/, &buffer)) { + if (LIBXSMM_KERNEL_KIND_USER == LIBXSMM_DESCRIPTOR_KIND(registry_keys[i].entry.kind) + /* dump user-data just like JIT'ted code */ + && 0 > libxsmm_verbosity) + { + char name[16]; + int nchar; +#if defined(LIBXSMM_REGUSER_HASH) + const size_t descsize = LIBXSMM_DESCRIPTOR_ISBIG(registry_keys[i].entry.kind) + ? 
LIBXSMM_DESCRIPTOR_MAXSIZE : LIBXSMM_DESCRIPTOR_SIGSIZE; + const unsigned int id = libxsmm_crc32(LIBXSMM_HASH_SEED, registry_keys[i].entry.user.desc, + descsize - sizeof(libxsmm_descriptor_kind)); + LIBXSMM_ASSERT(descsize > sizeof(libxsmm_descriptor_kind)); +#else + const unsigned int id = internal_statistic_num_user; +#endif + nchar = LIBXSMM_SNPRINTF(name, sizeof(name), "%010u.user", id); + if (0 < nchar && (int)sizeof(name) > nchar) { + LIBXSMM_EXPECT(EXIT_SUCCESS, libxsmm_dump("LIBXSMM-USER-DUMP", name, code.ptr_const, size, 0/*unique*/)); + } + } +#if !defined(NDEBUG) + registry[i].ptr = NULL; +#endif + libxsmm_xfree(code.ptr_const, 0/*no check*/); + /* round-up size (it is fine to assume 4 KB pages since it is likely more accurate than not rounding up) */ + internal_registry_nbytes += LIBXSMM_UP2(size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE); + } + else ++internal_registry_nleaks; +#endif + } + } + } + /* release buffers (registry, keys, cache) */ +#if defined(LIBXSMM_NTHREADS_USE) && defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + libxsmm_xfree(cache_buffer, 0/*no check*/); +#endif + libxsmm_xfree(registry_keys, 0/*no check*/); + libxsmm_xfree(registry, 0/*no check*/); + } +#if (0 != LIBXSMM_SYNC) /* LIBXSMM_LOCK_RELEASE, but no LIBXSMM_LOCK_DESTROY */ +# if (1 < INTERNAL_REGLOCK_MAXN) + for (i = 0; i < internal_reglock_count; ++i) LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, &internal_reglock[i].state); +# elif !defined(LIBXSMM_UNIFY_LOCKS) + LIBXSMM_LOCK_RELEASE(LIBXSMM_REGLOCK, internal_reglock_ptr); +# endif + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, &libxsmm_lock_global); + /* coverity[check_return] */ + LIBXSMM_TLS_DESTROY(libxsmm_tlskey); +#endif + } +} + + +LIBXSMM_API void libxsmm_sink(LIBXSMM_VARIADIC) +{ + /* does nothing else but sinking given arguments */ +} + + +LIBXSMM_API int libxsmm_get_target_archid(void) +{ + LIBXSMM_INIT +#if !defined(__MIC__) + return libxsmm_target_archid; +#else /* no JIT support */ + 
return LIBXSMM_MIN(libxsmm_target_archid, LIBXSMM_X86_GENERIC); +#endif +} + + +LIBXSMM_API void libxsmm_set_target_archid(int id) +{ + int target_archid = LIBXSMM_TARGET_ARCH_UNKNOWN; + switch (id) { + case LIBXSMM_X86_AVX512_SPR: + case LIBXSMM_X86_AVX512_CPX: + case LIBXSMM_X86_AVX512_CLX: + case LIBXSMM_X86_AVX512_CORE: + case LIBXSMM_X86_AVX512_KNM: + case LIBXSMM_X86_AVX512_MIC: + case LIBXSMM_X86_AVX512: + case LIBXSMM_X86_AVX2: + case LIBXSMM_X86_AVX: + case LIBXSMM_X86_SSE42: + case LIBXSMM_X86_SSE3: + case LIBXSMM_AARCH64_V81: + case LIBXSMM_AARCH64_V82: + case LIBXSMM_AARCH64_A64FX: { + target_archid = id; + } break; + case LIBXSMM_TARGET_ARCH_GENERIC: +#if defined(LIBXSMM_PLATFORM_X86) + target_archid = LIBXSMM_X86_GENERIC; + break; +#elif defined(LIBXSMM_PLATFORM_AARCH64) + target_archid = LIBXSMM_AARCH64_V81; + break; +#endif + default: target_archid = libxsmm_cpuid(); + } + LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED); + if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + const int cpuid = libxsmm_cpuid(); + if (cpuid < target_archid) { + const char *const target_arch = libxsmm_cpuid_name(target_archid); + fprintf(stderr, "LIBXSMM WARNING: \"%s\" code may fail to run on \"%s\"!\n", + target_arch, libxsmm_cpuid_name(cpuid)); + } + } +} + + +LIBXSMM_API const char* libxsmm_get_target_arch(void) +{ + LIBXSMM_INIT + return libxsmm_cpuid_name(libxsmm_target_archid); +} + + +/* function serves as a helper for implementing the Fortran interface */ +LIBXSMM_API const char* libxsmmf_get_target_arch(int* length); +LIBXSMM_API const char* libxsmmf_get_target_arch(int* length) +{ + const char *const arch = libxsmm_get_target_arch(); + /* valid here since function is not in the public interface */ + LIBXSMM_ASSERT(NULL != arch && 0 != length); + *length = (int)strlen(arch); + return arch; +} + + +LIBXSMM_API void libxsmm_set_target_arch(const char* arch) +{ + const int cpuid = libxsmm_cpuid(); + int 
target_archid; + if (NULL != arch && '\0' != *arch) { +#if defined(LIBXSMM_PLATFORM_X86) + const int jit = atoi(arch); +#endif + if (0 == strcmp("0", arch)) { +#if defined(LIBXSMM_PLATFORM_X86) + target_archid = LIBXSMM_X86_GENERIC; +#elif defined(LIBXSMM_PLATFORM_AARCH64) + target_archid = LIBXSMM_AARCH64_V81; +#else + target_archid = LIBXSMM_TARGET_ARCH_GENERIC; +#endif + } +#if defined(LIBXSMM_PLATFORM_X86) + else if (0 < jit) { + target_archid = LIBXSMM_X86_GENERIC + jit; + } + else if (arch == libxsmm_stristr(arch, "spr") || arch == libxsmm_stristr(arch, "amx")) { + target_archid = LIBXSMM_X86_AVX512_SPR; + } + else if (arch == libxsmm_stristr(arch, "cpx")) { + target_archid = LIBXSMM_X86_AVX512_CPX; + } + else if (arch == libxsmm_stristr(arch, "clx")) { + target_archid = LIBXSMM_X86_AVX512_CLX; + } + else if (arch == libxsmm_stristr(arch, "skx") || arch == libxsmm_stristr(arch, "skl") + /* "avx3"/"avx512" previously enabled LIBXSMM_X86_AVX512 */ + || arch == libxsmm_stristr(arch, "avx3") || arch == libxsmm_stristr(arch, "avx512")) + { + target_archid = LIBXSMM_X86_AVX512_CORE; + } + else if (arch == libxsmm_stristr(arch, "knm")) { + target_archid = LIBXSMM_X86_AVX512_KNM; + } + else if (arch == libxsmm_stristr(arch, "knl") || arch == libxsmm_stristr(arch, "mic")) { + target_archid = LIBXSMM_X86_AVX512_MIC; + } + else if (arch == libxsmm_stristr(arch, "hsw") || arch == libxsmm_stristr(arch, "avx2")) { + target_archid = LIBXSMM_X86_AVX2; + } + else if (arch == libxsmm_stristr(arch, "snb") || arch == libxsmm_stristr(arch, "avx")) { + target_archid = LIBXSMM_X86_AVX; + } + else if (arch == libxsmm_stristr(arch, "wsm") || arch == libxsmm_stristr(arch, "nhm") + || arch == libxsmm_stristr(arch, "sse4_2") || arch == libxsmm_stristr(arch, "sse4.2") + || arch == libxsmm_stristr(arch, "sse42") || arch == libxsmm_stristr(arch, "sse4")) + { + target_archid = LIBXSMM_X86_SSE42; + } + else if (arch == libxsmm_stristr(arch, "sse3")) { + target_archid = LIBXSMM_X86_SSE3; + } 
+ else if (arch == libxsmm_stristr(arch, "x86") || arch == libxsmm_stristr(arch, "x86_64") + || arch == libxsmm_stristr(arch, "x64") || arch == libxsmm_stristr(arch, "sse2")) + { + target_archid = LIBXSMM_X86_GENERIC; + } +#elif defined(LIBXSMM_PLATFORM_AARCH64) + else if (arch == libxsmm_stristr(arch, "arm") || arch == libxsmm_stristr(arch, "arm64") + || arch == libxsmm_stristr(arch, "arm_v81") + || arch == libxsmm_stristr(arch, "aarch64")) + { + target_archid = LIBXSMM_AARCH64_V81; + } + else if (arch == libxsmm_stristr(arch, "arm_v82")) { + target_archid = LIBXSMM_AARCH64_V82; + } + else if (arch == libxsmm_stristr(arch, "a64fx")) + { + target_archid = LIBXSMM_AARCH64_A64FX; + } +#endif + else if (arch == libxsmm_stristr(arch, "generic")) { +#if defined(LIBXSMM_PLATFORM_X86) + target_archid = LIBXSMM_X86_GENERIC; +#elif defined(LIBXSMM_PLATFORM_AARCH64) + target_archid = LIBXSMM_AARCH64_V81; +#else + target_archid = LIBXSMM_TARGET_ARCH_GENERIC; +#endif + } + else if (arch == libxsmm_stristr(arch, "none")) { + target_archid = LIBXSMM_TARGET_ARCH_GENERIC; + } + else { + target_archid = cpuid; + } + } + else { + target_archid = cpuid; + } + if (cpuid < target_archid) { /* warn about code path if beyond CPUID */ + static int error_once = 0; + if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + const char *const target_arch = libxsmm_cpuid_name(target_archid); + fprintf(stderr, "LIBXSMM WARNING: \"%s\" code will fail to run on \"%s\"!\n", + target_arch, libxsmm_cpuid_name(cpuid)); + } +#if 0 /* limit code path to confirmed features */ + target_archid = cpuid; +#endif + } + LIBXSMM_ATOMIC_STORE(&libxsmm_target_archid, target_archid, LIBXSMM_ATOMIC_RELAXED); +} + + +LIBXSMM_API int libxsmm_get_verbosity(void) +{ + LIBXSMM_INIT + return libxsmm_verbosity; +} + + +LIBXSMM_API void libxsmm_set_verbosity(int level) +{ + LIBXSMM_INIT + LIBXSMM_ATOMIC_STORE(&libxsmm_verbosity, 
level, LIBXSMM_ATOMIC_RELAXED); +} + + +LIBXSMM_API libxsmm_gemm_prefetch_type libxsmm_get_gemm_auto_prefetch(void) +{ + return (libxsmm_gemm_prefetch_type)libxsmm_gemm_auto_prefetch; +} + + +LIBXSMM_API void libxsmm_set_gemm_auto_prefetch(libxsmm_gemm_prefetch_type strategy) +{ + if (0 == internal_gemm_auto_prefetch_locked) { /* LIBXSMM_GEMM_PREFETCH environment takes precedence */ + LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch_default, strategy, LIBXSMM_ATOMIC_RELAXED); + LIBXSMM_ATOMIC_STORE(&libxsmm_gemm_auto_prefetch, strategy, LIBXSMM_ATOMIC_RELAXED); + } +} + + +LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype) +{ + const unsigned char result = (unsigned char)LIBXSMM_TYPESIZE(datatype); + if (0 != result) { + return result; + } + else { + static int error_once = 0; + LIBXSMM_ASSERT_MSG(0, "unsupported data type"); + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM ERROR: unsupported data type!\n"); + } + return 1; /* avoid to return 0 to avoid div-by-zero in static analysis of depending code */ + } +} + + +LIBXSMM_API int libxsmm_dvalue(libxsmm_datatype datatype, const void* value, double* dvalue) +{ + int result = EXIT_SUCCESS; + if (NULL != value && NULL != dvalue) { + switch (datatype) { + case LIBXSMM_DATATYPE_F64: *dvalue = (*(const double *)value); break; + case LIBXSMM_DATATYPE_F32: *dvalue = (double)(*(const float *)value); break; + case LIBXSMM_DATATYPE_I64: *dvalue = (double)(*(const long long*)value); break; + case LIBXSMM_DATATYPE_I32: *dvalue = (double)(*(const int *)value); break; + case LIBXSMM_DATATYPE_I16: *dvalue = (double)(*(const short *)value); break; + case LIBXSMM_DATATYPE_I8: *dvalue = (double)(*(const char *)value); break; + default: result = EXIT_FAILURE; + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype) +{ + switch (datatype) { + case LIBXSMM_DATATYPE_F64: 
return "f64"; + case LIBXSMM_DATATYPE_F32: return "f32"; + case LIBXSMM_DATATYPE_BF16: return "bf16"; + case LIBXSMM_DATATYPE_F16: return "f16"; + case LIBXSMM_DATATYPE_I64: return "i64"; + case LIBXSMM_DATATYPE_I32: return "i32"; + case LIBXSMM_DATATYPE_I16: return "i16"; + case LIBXSMM_DATATYPE_I8: return "i8"; + default: { + if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) && + LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype)) + { + return "i16i32"; + } + else if (LIBXSMM_GEMM_PRECISION_I16 == LIBXSMM_GETENUM_INP(datatype) && + LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype)) + { + return "i16f32"; + } + else if (LIBXSMM_GEMM_PRECISION_I8 == LIBXSMM_GETENUM_INP(datatype) && + LIBXSMM_GEMM_PRECISION_I32 == LIBXSMM_GETENUM_OUT(datatype)) + { + return "i8i32"; + } + else if (LIBXSMM_GEMM_PRECISION_BF16 == LIBXSMM_GETENUM_INP(datatype) && + LIBXSMM_GEMM_PRECISION_F32 == LIBXSMM_GETENUM_OUT(datatype)) + { + return "bf16f32"; + } + else { + return "void"; + } + } + } +} + + +LIBXSMM_API_INLINE void internal_get_typesize_string(char buffer[4], int buffer_size, size_t typesize) +{ + LIBXSMM_ASSERT(256 > typesize && 4 <= buffer_size); + if (10 > typesize) { + buffer[0] = (char)('0' + typesize); + buffer[1] = 0; + } + else { + LIBXSMM_SNPRINTF(buffer, buffer_size, "%i", (int)typesize); + } +} + + +LIBXSMM_API_INTERN int libxsmm_dump(const char* title, const char* name, const void* data, size_t size, int unique) +{ + int result; + if (NULL != name && '\0' != *name && NULL != data && 0 != size) { + FILE* data_file = fopen(name, "rb"); + int diff = 0, result_close; + if (NULL == data_file) { /* file does not exist */ + data_file = fopen(name, "wb"); + if (NULL != data_file) { /* dump data into a file */ + result = ((size == fwrite(data, 1, size, data_file)) ? 
EXIT_SUCCESS : EXIT_FAILURE); + result_close = fclose(data_file); + if (EXIT_SUCCESS == result) result = result_close; + } + else result = EXIT_FAILURE; + } + else if (0 != unique) { /* check existing file */ + const char* check_a = (const char*)data; + char check_b[4096]; + size_t rest = size; + do { + const size_t n = fread(check_b, 1, LIBXSMM_MIN(sizeof(check_b), rest), data_file); + diff += memcmp(check_a, check_b, LIBXSMM_MIN(sizeof(check_b), n)); + check_a += n; + rest -= n; + } while (0 < rest && 0 == diff); + result = fclose(data_file); + } + else { + result = fclose(data_file); + } + if (EXIT_SUCCESS == result && NULL != title && '\0' != *title) { + fprintf(stderr, "%s(ptr:file) %p : %s\n", title, data, name); + } + if (0 != diff) { /* override existing dump and warn about erroneous condition */ + fprintf(stderr, "LIBXSMM ERROR: %s is not a unique filename!\n", name); + data_file = fopen(name, "wb"); + if (NULL != data_file) { /* dump data into a file */ + if (size != fwrite(data, 1, size, data_file)) result = EXIT_FAILURE; + result_close = fclose(data_file); + if (EXIT_SUCCESS == result) result = result_close; + } + if (EXIT_SUCCESS == result) result = EXIT_FAILURE; + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code) +{ + int result = EXIT_SUCCESS; +#if !defined(__MIC__) + const char * /*const*/ target_arch = libxsmm_cpuid_name(libxsmm_target_archid); + /* large enough temporary buffer for generated code */ + char jit_buffer[LIBXSMM_CODE_MAXSIZE], jit_name[256] = { 0 }; + libxsmm_generated_code generated_code; + libxsmm_kernel_xinfo extra; + + LIBXSMM_MEMZERO127(&generated_code); + generated_code.generated_code = jit_buffer; + generated_code.buffer_size = sizeof(jit_buffer); + /* setup code generation */ + generated_code.arch = libxsmm_target_archid; + generated_code.code_type = 2; + +# if !defined(NDEBUG) /* 
should not be needed (all members will be initialized below) */
+  LIBXSMM_MEMZERO127(&extra);
+# endif
+  extra.registered = regindex;
+  extra.nflops = 0;
+
+  LIBXSMM_ASSERT(NULL != generated_code.generated_code || 0 == generated_code.buffer_size);
+  LIBXSMM_ASSERT(NULL != request && 0 != libxsmm_target_archid);
+  LIBXSMM_ASSERT(NULL != code && NULL == code->ptr_const);
+  LIBXSMM_ASSERT(0 == LIBXSMM_DESCRIPTOR_ISBIG(request->kind));
+
+  /* each case calls the matching generator, and (if verbose or VTune is enabled) composes jit_name */
+  switch (request->kind) { /* generate kernel */
+    case LIBXSMM_BUILD_KIND_GEMM: { /* small MxM kernel */
+      LIBXSMM_ASSERT(NULL != request->descriptor.gemm);
+# if 0 /* dummy kernel for an empty shape is desired */
+      if (0 < request->descriptor.gemm->m && 0 < request->descriptor.gemm->n && 0 < request->descriptor.gemm->k &&
+          0 < request->descriptor.gemm->lda && 0 < request->descriptor.gemm->ldb && 0 < request->descriptor.gemm->ldc)
+# endif
+      {
+        const unsigned int m = request->descriptor.gemm->m, n = request->descriptor.gemm->n, k = request->descriptor.gemm->k;
+        extra.nflops = 2 * m * n * k;
+# if !defined(LIBXSMM_DENY_RETARGET) /* disable: ECFLAGS=-DLIBXSMM_DENY_RETARGET */
+        if ((LIBXSMM_X86_AVX2 < libxsmm_target_archid) && (libxsmm_target_archid <= LIBXSMM_X86_ALLFEAT) &&
+           (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype) ||
+            LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.gemm->datatype)) &&
+           (16 >= (m * k) || 16 >= (k * n) || 16 >= (m * n)))
+        {
+          /* TODO: shall we update variable "target_arch" (name)? */
+          generated_code.arch = LIBXSMM_X86_AVX2;
+        }
+# endif
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_gemm_kernel, &generated_code, request->descriptor.gemm);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.gemm->datatype);
+          const char *const meltw_tname = libxsmm_typename((libxsmm_datatype)request->descriptor.gemm->meltw_datatype_aux);
+          int typesigns = 0, br = 0;
+          char tc_option[16] = { 0 };
+          int decompress_A = 0;
+          int sparsity_factor_A = 1;
+          if ((request->descriptor.gemm->meltw_operation == LIBXSMM_MELTW_OPERATION_DECOMPRESS_A) ||
+              (request->descriptor.gemm->meltw_operation == LIBXSMM_MELTW_OPERATION_COLBIAS_ACT_DECOMPRESS_A))
+          {
+            decompress_A = 1;
+            sparsity_factor_A = (int)request->descriptor.gemm->meltw_param;
+          }
+
+          /* query batch reduce variant */
+          if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS & request->descriptor.gemm->flags) > 1 ) {
+            br = 1;
+          } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET & request->descriptor.gemm->flags) > 1 ) {
+            br = 2;
+          } else if ( (LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE & request->descriptor.gemm->flags) > 1 ) {
+            br = 3;
+          } else {
+            br = 0;
+          }
+          /* query A/B sign combinations */
+          if ( (LIBXSMM_GEMM_FLAG_A_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
+            typesigns = 1;
+          } else if ( (LIBXSMM_GEMM_FLAG_B_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
+            typesigns = 2;
+          } else if ( (LIBXSMM_GEMM_FLAG_AB_UNSIGNED & request->descriptor.gemm->flags) > 1 ) {
+            typesigns = 3;
+          } else {
+            typesigns = 0;
+          }
+          /* query tileconfig options */
+          if (((LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG & request->descriptor.gemm->flags) != 0) &&
+              ((LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG & request->descriptor.gemm->flags) == 0) ) {
+            LIBXSMM_SNPRINTF(tc_option, sizeof(tc_option), "conf");
+          } else if (((LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG & request->descriptor.gemm->flags) == 0) &&
+                     ((LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG & request->descriptor.gemm->flags) != 0) ) {
+            LIBXSMM_SNPRINTF(tc_option, sizeof(tc_option), "rele");
+          } else if (((LIBXSMM_GEMM_FLAG_NO_RESET_TILECONFIG & request->descriptor.gemm->flags) != 0) &&
+                     ((LIBXSMM_GEMM_FLAG_NO_SETUP_TILECONFIG & request->descriptor.gemm->flags) != 0)) {
+            LIBXSMM_SNPRINTF(tc_option, sizeof(tc_option), "none");
+          } else {
+            LIBXSMM_SNPRINTF(tc_option, sizeof(tc_option), "abid");
+          }
+
+          if ( request->descriptor.gemm->meltw_operation != 0 ) {
+            /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+            LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i_br%i_uh%u_si%i_tc-%s_avnni%i_bvnni%i_cvnni%i_meop%u-%s_mefl%u_meld%u-%u-%u_decompress_A%i_spfactor%i.mxm", target_arch, tname,
+              0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.gemm->flags) ? 'n' : 't',
+              0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.gemm->flags) ? 'n' : 't', m, n, k,
+              request->descriptor.gemm->lda, request->descriptor.gemm->ldb, request->descriptor.gemm->ldc,
+              /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.gemm->flags) ? 0 : */1,
+              0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.gemm->flags) ? 0 : 1, uid,
+              br, (unsigned int)request->descriptor.gemm->c3, typesigns, tc_option,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_A & request->descriptor.gemm->flags) ? 1 : 0,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_B & request->descriptor.gemm->flags) ? 1 : 0,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_C & request->descriptor.gemm->flags) ? 1 : 0,
+              (unsigned int)request->descriptor.gemm->meltw_operation, meltw_tname, (unsigned int)request->descriptor.gemm->meltw_flags,
+              request->descriptor.gemm->meltw_ldx, request->descriptor.gemm->meltw_ldy, request->descriptor.gemm->meltw_ldz, decompress_A, sparsity_factor_A );
+          } else {
+            /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+            LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i_br%i_uh%u_si%i_tc-%s_avnni%i_bvnni%i_cvnni%i_decompress_A%i_spfactor%i.mxm", target_arch, tname,
+              0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.gemm->flags) ? 'n' : 't',
+              0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.gemm->flags) ? 'n' : 't', m, n, k,
+              request->descriptor.gemm->lda, request->descriptor.gemm->ldb, request->descriptor.gemm->ldc,
+              /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.gemm->flags) ? 0 : */1,
+              0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.gemm->flags) ? 0 : 1, uid,
+              br, (unsigned int)request->descriptor.gemm->c3, typesigns, tc_option,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_A & request->descriptor.gemm->flags) ? 1 : 0,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_B & request->descriptor.gemm->flags) ? 1 : 0,
+              0 != (LIBXSMM_GEMM_FLAG_VNNI_C & request->descriptor.gemm->flags) ? 1 : 0, decompress_A, sparsity_factor_A );
+          }
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_PSPGEMM_CSR: { /* packed sparse gemm kernel, CSR format */
+      LIBXSMM_ASSERT(NULL != request->descriptor.pspgemm_csr && 0 != request->descriptor.pspgemm_csr->gemm);
+      LIBXSMM_ASSERT(NULL != request->descriptor.pspgemm_csr->row_ptr && 0 != request->descriptor.pspgemm_csr->column_idx && 0 != request->descriptor.pspgemm_csr->values);
+      /* only floating point */
+      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pspgemm_csr->gemm->datatype) ||
+          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pspgemm_csr->gemm->datatype))
+      {
+        const unsigned int nnz = (request->descriptor.pspgemm_csr->gemm->lda == 0) ?
+          request->descriptor.pspgemm_csr->row_ptr[request->descriptor.pspgemm_csr->gemm->m] : request->descriptor.pspgemm_csr->row_ptr[request->descriptor.pspgemm_csr->gemm->k];
+        const unsigned int gemm_factor = (request->descriptor.pspgemm_csr->gemm->lda == 0) ? request->descriptor.pspgemm_csr->gemm->n : request->descriptor.pspgemm_csr->gemm->m;
+        extra.nflops = 2 * nnz * gemm_factor * request->descriptor.pspgemm_csr->packed_width;
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_spgemm_csr_kernel, &generated_code, request->descriptor.pspgemm_csr->gemm,
+          request->descriptor.pspgemm_csr->row_ptr, request->descriptor.pspgemm_csr->column_idx, request->descriptor.pspgemm_csr->values, request->descriptor.pspgemm_csr->packed_width);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pspgemm_csr->gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pspgemm_csr->gemm->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.pspgemm_csr", target_arch, tname,
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pspgemm_csr->gemm->flags) ? 'n' : 't',
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pspgemm_csr->gemm->flags) ? 'n' : 't',
+            request->descriptor.pspgemm_csr->gemm->m, request->descriptor.pspgemm_csr->gemm->n, request->descriptor.pspgemm_csr->gemm->k,
+            request->descriptor.pspgemm_csr->gemm->lda, request->descriptor.pspgemm_csr->gemm->ldb, request->descriptor.pspgemm_csr->gemm->ldc,
+            request->descriptor.pspgemm_csr->packed_width,
+            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pspgemm_csr->gemm->flags) ? 0 : */1,
+            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pspgemm_csr->gemm->flags) ? 0 : 1,
+            uid, nnz);
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_PSPGEMM_CSC: { /* packed sparse gemm kernel, CSC format */
+      LIBXSMM_ASSERT(NULL != request->descriptor.pspgemm_csc && 0 != request->descriptor.pspgemm_csc->gemm);
+      LIBXSMM_ASSERT(NULL != request->descriptor.pspgemm_csc->row_idx && 0 != request->descriptor.pspgemm_csc->column_ptr && 0 != request->descriptor.pspgemm_csc->values);
+      /* only floating point */
+      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pspgemm_csc->gemm->datatype) ||
+          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pspgemm_csc->gemm->datatype))
+      {
+        const unsigned int nnz = (request->descriptor.pspgemm_csc->gemm->lda == 0) ?
+          request->descriptor.pspgemm_csc->column_ptr[request->descriptor.pspgemm_csc->gemm->k] : request->descriptor.pspgemm_csc->column_ptr[request->descriptor.pspgemm_csc->gemm->n];
+        const unsigned int gemm_factor = (request->descriptor.pspgemm_csc->gemm->lda == 0) ? request->descriptor.pspgemm_csc->gemm->n : request->descriptor.pspgemm_csc->gemm->m;
+        extra.nflops = 2 * nnz * gemm_factor * request->descriptor.pspgemm_csc->packed_width;
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_spgemm_csc_kernel, &generated_code, request->descriptor.pspgemm_csc->gemm,
+          request->descriptor.pspgemm_csc->row_idx, request->descriptor.pspgemm_csc->column_ptr, request->descriptor.pspgemm_csc->values, request->descriptor.pspgemm_csc->packed_width);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pspgemm_csc->gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pspgemm_csc->gemm->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i_nnz%u.pspgemm_csc", target_arch, tname,
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pspgemm_csc->gemm->flags) ? 'n' : 't',
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pspgemm_csc->gemm->flags) ? 'n' : 't',
+            request->descriptor.pspgemm_csc->gemm->m, request->descriptor.pspgemm_csc->gemm->n, request->descriptor.pspgemm_csc->gemm->k,
+            request->descriptor.pspgemm_csc->gemm->lda, request->descriptor.pspgemm_csc->gemm->ldb, request->descriptor.pspgemm_csc->gemm->ldc,
+            request->descriptor.pspgemm_csc->packed_width,
+            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pspgemm_csc->gemm->flags) ? 0 : */1,
+            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pspgemm_csc->gemm->flags) ? 0 : 1,
+            uid, nnz);
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_PGEMMRMAC: { /* packed GEMM, B regular matrix, row-major */
+      LIBXSMM_ASSERT(NULL != request->descriptor.pgemmacrm && 0 != request->descriptor.pgemmacrm->gemm);
+      /* only floating point */
+      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype) ||
+          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmacrm->gemm->datatype))
+      {
+        extra.nflops = 2 * request->descriptor.pgemmacrm->packed_width * request->descriptor.pgemmacrm->gemm->m * request->descriptor.pgemmacrm->gemm->n * request->descriptor.pgemmacrm->gemm->k;
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_ac_rm, &generated_code, request->descriptor.pgemmacrm->gemm, request->descriptor.pgemmacrm->packed_width);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmacrm->gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmacrm->gemm->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmacrm", target_arch, tname,
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmacrm->gemm->flags) ? 'n' : 't',
+            request->descriptor.pgemmacrm->gemm->m, request->descriptor.pgemmacrm->gemm->n, request->descriptor.pgemmacrm->gemm->k,
+            request->descriptor.pgemmacrm->gemm->lda, request->descriptor.pgemmacrm->gemm->ldb, request->descriptor.pgemmacrm->gemm->ldc,
+            request->descriptor.pgemmacrm->packed_width,
+            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : */1,
+            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmacrm->gemm->flags) ? 0 : 1,
+            uid);
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_PGEMMRMBC: { /* packed GEMM, A regular matrix, row-major */
+      LIBXSMM_ASSERT(NULL != request->descriptor.pgemmbcrm && 0 != request->descriptor.pgemmbcrm->gemm);
+      /* only floating point */
+      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype) ||
+          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.pgemmbcrm->gemm->datatype))
+      {
+        extra.nflops = 2 * request->descriptor.pgemmbcrm->packed_width * request->descriptor.pgemmbcrm->gemm->m * request->descriptor.pgemmbcrm->gemm->n * request->descriptor.pgemmbcrm->gemm->k;
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_packed_gemm_bc_rm, &generated_code, request->descriptor.pgemmbcrm->gemm, request->descriptor.pgemmbcrm->packed_width);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.pgemmbcrm->gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.pgemmbcrm->gemm->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_w%u_a%i_b%i_p%i.pgemmbcrm", target_arch, tname,
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.pgemmbcrm->gemm->flags) ? 'n' : 't',
+            request->descriptor.pgemmbcrm->gemm->m, request->descriptor.pgemmbcrm->gemm->n, request->descriptor.pgemmbcrm->gemm->k,
+            request->descriptor.pgemmbcrm->gemm->lda, request->descriptor.pgemmbcrm->gemm->ldb, request->descriptor.pgemmbcrm->gemm->ldc,
+            request->descriptor.pgemmbcrm->packed_width,
+            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : */1,
+            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.pgemmbcrm->gemm->flags) ? 0 : 1,
+            uid);
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_SREG: { /* sparse register kernel */
+      LIBXSMM_ASSERT(NULL != request->descriptor.sreg && 0 != request->descriptor.sreg->gemm);
+      LIBXSMM_ASSERT(NULL != request->descriptor.sreg->row_ptr && 0 != request->descriptor.sreg->column_idx && 0 != request->descriptor.sreg->values);
+      /* only floating point */
+      if (LIBXSMM_GEMM_PRECISION_F64 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype) ||
+          LIBXSMM_GEMM_PRECISION_F32 == /*LIBXSMM_GETENUM_OUT*/(request->descriptor.sreg->gemm->datatype))
+      {
+        const unsigned int nnz = request->descriptor.sreg->row_ptr[request->descriptor.sreg->gemm->m];
+        extra.nflops = 2 * libxsmm_cpuid_vlen32(libxsmm_target_archid)/2 * request->descriptor.sreg->gemm->n * nnz;
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_spgemm_csr_reg_kernel, &generated_code, request->descriptor.sreg->gemm, target_arch,
+          request->descriptor.sreg->row_ptr, request->descriptor.sreg->column_idx,
+          (const double*)request->descriptor.sreg->values);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          const int uid = libxsmm_gemm_prefetch2uid((libxsmm_gemm_prefetch_type)request->descriptor.sreg->gemm->prefetch);
+          const char *const tname = libxsmm_typename((libxsmm_datatype)request->descriptor.sreg->gemm->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_%s_%c%c_%ux%ux%u_%u_%u_%u_a%i_b%i_p%i.sreg", target_arch, tname,
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_A & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
+            0 == (LIBXSMM_GEMM_FLAG_TRANS_B & request->descriptor.sreg->gemm->flags) ? 'n' : 't',
+            request->descriptor.sreg->gemm->m, request->descriptor.sreg->gemm->n, request->descriptor.sreg->gemm->k,
+            request->descriptor.sreg->gemm->lda, request->descriptor.sreg->gemm->ldb, request->descriptor.sreg->gemm->ldc,
+            /*0 != (LIBXSMM_GEMM_FLAG_ALPHA_0 & request->descriptor.sreg->gemm->flags) ? 0 : */1,
+            0 != (LIBXSMM_GEMM_FLAG_BETA_0 & request->descriptor.sreg->gemm->flags) ? 0 : 1,
+            uid);
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_MELTW: { /* matrix-eltwise (meltw) kernel */
+      LIBXSMM_ASSERT(NULL != request->descriptor.meltw);
+      {
+        /* dispatch eltwise code with AVX512_BF16 by demoting seamlessly to the current CPU arch */
+        if ( ( generated_code.arch >= LIBXSMM_X86_AVX512_SPR ) &&
+             ( generated_code.arch <= LIBXSMM_X86_ALLFEAT ) ) {
+          const char *const env_emu_amx = getenv("EMULATE_AMX");
+          const int emu_amx = (NULL != env_emu_amx ? atoi(env_emu_amx) : 0);
+          if (emu_amx > 0) {
+            generated_code.arch = libxsmm_cpuid();
+          }
+        }
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_mateltwise_kernel, &generated_code, request->descriptor.meltw);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          char tsizename[4];
+          internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.meltw->datatype);
+          /* adopt scheme which allows kernel names of LIBXSMM to appear in order (Intel VTune, etc.) */
+          if ( request->descriptor.meltw->operation == LIBXSMM_MELTW_OPERATION_REDUCE_COLS_IDX ) {
+            LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_idxtsize%u_%u_%ux%u_opcode%u_flags%u.meltw", target_arch, tsizename,
+              request->descriptor.meltw->n, request->descriptor.meltw->m, request->descriptor.meltw->ldi, request->descriptor.meltw->ldo,
+              (unsigned int)request->descriptor.meltw->operation, (unsigned int)request->descriptor.meltw->flags);
+          } else {
+            LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%ux%u_opcode%u_flags%u_params%u.meltw", target_arch, tsizename,
+              request->descriptor.meltw->m, request->descriptor.meltw->n, request->descriptor.meltw->ldi, request->descriptor.meltw->ldo,
+              (unsigned int)request->descriptor.meltw->operation, (unsigned int)request->descriptor.meltw->flags, (unsigned int)request->descriptor.meltw->param);
+          }
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_MEQN: { /* matequation kernel */
+      /* check the member which is actually used by this case (meqn rather than meltw) */
+      LIBXSMM_ASSERT(NULL != request->descriptor.meqn);
+      {
+        /* dispatch eltwise code with AVX512_BF16 by demoting seamlessly to the current CPU arch */
+        if ( ( generated_code.arch >= LIBXSMM_X86_AVX512_SPR ) &&
+             ( generated_code.arch <= LIBXSMM_X86_ALLFEAT ) ) {
+          const char *const env_emu_amx = getenv("EMULATE_AMX");
+          const int emu_amx = (NULL != env_emu_amx ? atoi(env_emu_amx) : 0);
+          if (emu_amx > 0) {
+            generated_code.arch = libxsmm_cpuid();
+          }
+        }
+        LIBXSMM_NO_OFFLOAD(void, libxsmm_generator_matequation_kernel, &generated_code, request->descriptor.meqn);
+# if !defined(LIBXSMM_VTUNE)
+        if (0 > libxsmm_verbosity)
+# endif
+        {
+          char tsizename[4];
+          internal_get_typesize_string(tsizename, sizeof(tsizename), request->descriptor.meqn->datatype);
+          LIBXSMM_SNPRINTF(jit_name, sizeof(jit_name), "libxsmm_%s_tsize%s_%ux%u_%u_eqn-idx%u.meltw", target_arch, tsizename,
+            request->descriptor.meqn->m, request->descriptor.meqn->n, request->descriptor.meqn->ldo,
+            (unsigned int)request->descriptor.meqn->eqn_idx );
+        }
+      }
+    } break;
+    case LIBXSMM_BUILD_KIND_USER: break;
+# if !defined(NDEBUG) /* library code is expected to be mute */
+    default: { /* unknown kind */
+      static int error_once = 0;
+      if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) {
+        fprintf(stderr, "LIBXSMM ERROR: invalid build request discovered!\n");
+      }
+      /*result = EXIT_FAILURE;*/
+    }
+# endif
+  }
+
+  if (0 == generated_code.last_error /* no error raised */
+    && 0 != generated_code.code_size /*check (tcopy issue?)*/)
+  {
+    char* code_buffer = NULL;
+# if defined(__APPLE__) && defined(__arm64__)
+# else
+    void* code_buffer_result = &code_buffer;
+# endif
+    LIBXSMM_ASSERT(generated_code.code_size <= LIBXSMM_CODE_MAXSIZE);
+    LIBXSMM_ASSERT(NULL != generated_code.generated_code);
+    /* attempt to create executable buffer */
+# if defined(__APPLE__) && defined(__arm64__)
+    code_buffer = mmap( 0, generated_code.code_size, PROT_WRITE | PROT_EXEC | PROT_READ,
+                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_JIT, -1, 0 );
+    /* mmap signals failure by returning MAP_FAILED (not by a "negative" address) */
+    if ( MAP_FAILED != (void*)code_buffer ) {
+      result = EXIT_SUCCESS;
+    } else {
+      result = EXIT_FAILURE;
+    }
+# else
+    result = libxsmm_xmalloc((void**)code_buffer_result, generated_code.code_size, 0/*auto*/,
+      /* flag must be a superset of what's populated by libxsmm_malloc_attrib */
+      LIBXSMM_MALLOC_FLAG_RWX, &extra, sizeof(extra));
+# endif
+    if (EXIT_SUCCESS == result) { /* check for success */
+      LIBXSMM_ASSERT(NULL != code_buffer);
+# if defined(__APPLE__) && defined(__arm64__)
+      pthread_jit_write_protect_np(0/*false*/);
+# endif
+      /* copy temporary buffer into the prepared executable buffer */
+# if defined(NDEBUG)
+      { int i; /* precondition: jit_buffer == generated_code.generated_code */
+        for (i = 0; i < (int)generated_code.code_size; ++i) code_buffer[i] = jit_buffer[i];
+      }
+# else
+      memcpy(code_buffer, generated_code.generated_code, generated_code.code_size);
+# endif
+# if defined(__APPLE__) && defined(__arm64__)
+      code->ptr = code_buffer; /* commit buffer */
+      LIBXSMM_ASSERT(NULL != code->ptr && 0 == (LIBXSMM_CODE_STATIC & code->uval));
+      sys_icache_invalidate(code_buffer, generated_code.code_size);
+      pthread_jit_write_protect_np(1/*true*/);
+# else
+      /* attribute/protect buffer and revoke unnecessary flags */
+      result = libxsmm_malloc_attrib((void**)code_buffer_result, LIBXSMM_MALLOC_FLAG_X, jit_name);
+      if (EXIT_SUCCESS == result) { /* check for success */
+        code->ptr = code_buffer; /* commit buffer */
+        LIBXSMM_ASSERT(NULL != code->ptr && 0 == (LIBXSMM_CODE_STATIC & code->uval));
+# if defined(__aarch64__)
+# if defined(__clang__)
+        __clear_cache(code_buffer, code_buffer + generated_code.code_size);
+# else
+        __builtin___clear_cache(code_buffer, code_buffer + generated_code.code_size);
+# endif
+# endif
+      }
+      else { /* release buffer */
+        libxsmm_xfree(code_buffer, 0/*no check*/);
+      }
+# endif
+    }
+  }
+  else if (request->kind == LIBXSMM_BUILD_KIND_USER && NULL != request->descriptor.ptr) { /* user-data */
+    if (0 != request->user_size) {
+      void* user_data = &code->ptr;
+      result = libxsmm_xmalloc((void**)user_data, request->user_size, 0/*auto*/,
+        LIBXSMM_MALLOC_FLAG_PRIVATE, &extra, sizeof(extra));
+    }
+    else {
+      result = EXIT_SUCCESS;
+      code->ptr = NULL;
+    }
+  }
+  else {
+    result = (0 != generated_code.last_error ? generated_code.last_error : EXIT_FAILURE);
+  }
+#else /* unsupported platform */
+  LIBXSMM_UNUSED(request); LIBXSMM_UNUSED(regindex); LIBXSMM_UNUSED(code);
+  /* libxsmm_get_target_arch also serves as a runtime check whether JIT is available or not */
+  if (LIBXSMM_X86_GENERIC <= libxsmm_target_archid) result = EXIT_FAILURE;
+#endif
+  return result;
+}
+
+
+/* Zero-fills desc->data starting at index "size" up to (excluding) LIBXSMM_DIFF_SIZE. */
+LIBXSMM_API_INLINE void internal_pad_descriptor(libxsmm_descriptor* desc, signed char size)
+{
+  LIBXSMM_ASSERT(LIBXSMM_DESCRIPTOR_MAXSIZE < 128 && NULL != desc);
+  LIBXSMM_ASSERT(LIBXSMM_DIFF_SIZE <= LIBXSMM_DESCRIPTOR_MAXSIZE);
+  LIBXSMM_ASSERT(LIBXSMM_HASH_SIZE <= LIBXSMM_DIFF_SIZE);
+  for (; size < LIBXSMM_DIFF_SIZE; ++size) desc->data[size] = 0;
+}
+
+
+LIBXSMM_API_INLINE libxsmm_code_pointer internal_find_code(libxsmm_descriptor* desc, size_t desc_size, size_t user_size, unsigned int* hash)
+{
+  libxsmm_code_pointer flux_entry = { 0 };
+  const int is_big_desc = LIBXSMM_DESCRIPTOR_ISBIG(desc->kind);
+  const signed char size = (signed char)(sizeof(libxsmm_descriptor_kind) + desc_size);
+  LIBXSMM_DIFF_DECL(LIBXSMM_DIFF_SIZE, xdesc);
+#if !defined(NDEBUG) && (0 != LIBXSMM_JIT)
+  int build = EXIT_SUCCESS;
+#endif
+#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE))
+# if defined(LIBXSMM_NTHREADS_USE)
+  const unsigned int tid = libxsmm_get_tid();
+  internal_cache_type *const cache = internal_cache_buffer + tid;
+# else
+  static LIBXSMM_TLS internal_cache_type internal_cache_buffer;
+  internal_cache_type *const cache = &internal_cache_buffer;
+# endif
+  unsigned char cache_index;
+  const unsigned int ninit = LIBXSMM_ATOMIC_LOAD(&libxsmm_ninit, LIBXSMM_ATOMIC_RELAXED);
+  internal_pad_descriptor(desc, size);
+  LIBXSMM_ASSERT(NULL != hash);
+  if (0 == is_big_desc) {
+    LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc);
+    LIBXSMM_DIFF_N(unsigned char, cache_index, LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE), xdesc, cache->entry.keys,
+      LIBXSMM_DIFF_SIZE, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size); +
} + else { + cache_index = (unsigned char)libxsmm_diff_n(desc, cache->entry.keys, + size, LIBXSMM_CACHE_STRIDE, cache->entry.hit, cache->entry.size); + } + if (ninit == cache->entry.id && cache_index < cache->entry.size) { /* valid hit */ + flux_entry = cache->entry.code[cache_index]; + cache->entry.hit = cache_index; + } + else +#else + internal_pad_descriptor(desc, size); + LIBXSMM_ASSERT(NULL != hash); +#endif + { + unsigned int i, i0, mode = 0, diff = 1; + *hash = LIBXSMM_CRC32(LIBXSMM_HASH_SIZE)(LIBXSMM_HASH_SEED, desc); + i0 = i = LIBXSMM_MOD2(*hash, LIBXSMM_CAPACITY_REGISTRY); + LIBXSMM_ASSERT(&desc->kind == &desc->gemm.pad && desc->kind == desc->gemm.pad); + LIBXSMM_ASSERT(NULL != internal_registry); + do { /* use calculated location and check if the requested code is already JITted */ +#if (1 < INTERNAL_REGLOCK_MAXN) || !LIBXSMM_LOCK_TYPE_ISRW(LIBXSMM_REGLOCK) /* read registered code */ +# if 1 /* omitting an atomic load is safe but avoids race-detectors to highlight this location */ + uintptr_t *const fluxaddr = &internal_registry[i].uval; + flux_entry.uval = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)(fluxaddr, LIBXSMM_ATOMIC_RELAXED); +# else + flux_entry = internal_registry[i]; +# endif +#else + LIBXSMM_LOCK_ACQREAD(LIBXSMM_REGLOCK, internal_reglock_ptr); + flux_entry = internal_registry[i]; /* read registered code */ + LIBXSMM_LOCK_RELREAD(LIBXSMM_REGLOCK, internal_reglock_ptr); +#endif + if ((NULL != flux_entry.ptr_const || 1 == mode) && 2 > mode) { /* confirm entry */ + if (NULL != flux_entry.ptr_const) { + if (0 == is_big_desc) { +#if !defined(LIBXSMM_CACHE_MAXSIZE) || (0 == (LIBXSMM_CACHE_MAXSIZE)) + LIBXSMM_DIFF_LOAD(LIBXSMM_DIFF_SIZE, xdesc, desc); +#endif + diff = LIBXSMM_DIFF(LIBXSMM_DIFF_SIZE)(xdesc, internal_registry_keys + i, 0/*dummy*/); + } + else { + diff = libxsmm_diff(desc, internal_registry_keys + i, size); + } + } +#if !defined(NDEBUG) + else LIBXSMM_ASSERT(0 != diff); +#endif + if (0 != diff) { /* search for code version */ + 
if (0 == mode) { /* transition to higher mode */ + i0 = i; /* keep current position on record */ +#if defined(LIBXSMM_HASH_COLLISION) + /* enter code generation, and collision fix-up */ + if (0 == (LIBXSMM_HASH_COLLISION & flux_entry.uval)) { + LIBXSMM_ASSERT(NULL != flux_entry.ptr_const); /* collision */ + mode = 3; + } + else +#endif /* search for an existing code version */ + mode = 1; /* else */ + } + i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY); + if (i == i0) { /* search finished, no code version exists */ +#if defined(LIBXSMM_HASH_COLLISION) + mode = 3; /* enter code generation, and collision fix-up */ +#else + mode = 2; /* enter code generation */ +#endif + if (LIBXSMM_KERNEL_KIND_MATMUL == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) { + internal_update_mmstatistic(&desc->gemm.desc, 0, 1/*collision*/, 0, 0); + } + } + LIBXSMM_ASSERT(0 != diff); /* continue */ + } + } + else { /* enter code generation (there is no code version yet) */ + LIBXSMM_ASSERT(0 == mode || 1 < mode); +#if (0 == LIBXSMM_JIT) + LIBXSMM_UNUSED(user_size); +#else + if (LIBXSMM_X86_GENERIC <= libxsmm_target_archid || /* check if JIT is supported (CPUID) */ + (LIBXSMM_KERNEL_KIND_USER == LIBXSMM_DESCRIPTOR_KIND(desc->kind))) + { + LIBXSMM_ASSERT(0 != mode || NULL == flux_entry.ptr_const/*code version does not exist*/); + INTERNAL_FIND_CODE_LOCK(lock, i, diff, flux_entry.ptr); /* lock the registry entry */ + if (NULL == internal_registry[i].ptr_const) { /* double-check registry after acquiring the lock */ + libxsmm_build_request request; /* setup the code build request */ + LIBXSMM_ASSERT(LIBXSMM_KERNEL_UNREGISTERED > LIBXSMM_DESCRIPTOR_KIND(desc->kind)); + request.kind = (libxsmm_build_kind)LIBXSMM_DESCRIPTOR_KIND(desc->kind); + request.descriptor.ptr = &desc->gemm.desc; + request.user_size = user_size; +# if defined(NDEBUG) + if (EXIT_SUCCESS == libxsmm_build(&request, i, &flux_entry) && NULL != flux_entry.ptr_const) +# else + build = libxsmm_build(&request, i, &flux_entry); + if 
(EXIT_SUCCESS == build && NULL != flux_entry.ptr_const) +# endif + { + LIBXSMM_ASSIGN127(internal_registry_keys + i, desc); +# if (1 < INTERNAL_REGLOCK_MAXN) + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_registry[i].ptr, flux_entry.ptr, LIBXSMM_ATOMIC_SEQ_CST); +# else + internal_registry[i] = flux_entry; +# endif +# if defined(LIBXSMM_HASH_COLLISION) + if (2 < mode) { /* arrived from collision state; now mark as collision */ + libxsmm_code_pointer fix_entry; +# if (1 < INTERNAL_REGLOCK_MAXN) + fix_entry.ptr = LIBXSMM_ATOMIC_LOAD(&internal_registry[i0].ptr, LIBXSMM_ATOMIC_RELAXED); +# else + fix_entry = internal_registry[i0]; +# endif + LIBXSMM_ASSERT(NULL != fix_entry.ptr_const); + if (0 == (LIBXSMM_HASH_COLLISION & fix_entry.uval)) { + fix_entry.uval |= LIBXSMM_HASH_COLLISION; /* mark current entry as collision */ +# if (1 < INTERNAL_REGLOCK_MAXN) + LIBXSMM_ATOMIC_STORE(&internal_registry[i0].ptr, fix_entry.ptr, LIBXSMM_ATOMIC_RELAXED); +# else + internal_registry[i0] = fix_entry; +# endif + } + } +# endif + } + if (LIBXSMM_KERNEL_KIND_MATMUL == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) { + internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0); + } + /* leave here even in case of a build-error; do not use break (inside of locked region) */ + diff = 0; + } + INTERNAL_FIND_CODE_UNLOCK(lock); + if (0 != diff) { /* acquire registry slot */ + if (0 == mode) { /* initial condition */ + mode = 2; /* continue to linearly search for an empty slot */ + i0 = i; /* keep current position on record */ + } + do { /* continue to linearly search for an available slot */ + i = LIBXSMM_MOD2(i + 1, LIBXSMM_CAPACITY_REGISTRY); + if (NULL == internal_registry[i].ptr_const) break; + } while (i != i0); + if (i == i0) { /* out of capacity (no registry slot available) */ + diff = 0; /* do not use break if inside of locked region */ + } + flux_entry.ptr = NULL; /* no result */ + } + } + else /* JIT-code generation not available */ +#endif + { /* leave the dispatch 
loop */ + if (LIBXSMM_KERNEL_KIND_MATMUL == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) { + internal_update_mmstatistic(&desc->gemm.desc, 1/*try*/, 0, 0, 0); + } +#if !defined(NDEBUG) && (0 != LIBXSMM_JIT) + build = EXIT_FAILURE; +#endif + flux_entry.ptr = NULL; + diff = 0; + } + } + } while (0 != diff); +#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + if (NULL != flux_entry.ptr_const) { /* keep code version on record (cache) */ + LIBXSMM_ASSERT(0 == diff); + if (ninit == cache->entry.id) { /* maintain cache */ + if (cache->entry.size < internal_cache_size) { /* grow */ + INTERNAL_FIND_CODE_CACHE_GROW(cache_index, cache->entry.size); + LIBXSMM_ASSERT(cache->entry.size <= internal_cache_size); + } + else { /* evict */ + LIBXSMM_ASSERT(cache->entry.hit < cache->entry.size); + INTERNAL_FIND_CODE_CACHE_EVICT(cache_index, cache->entry.size, cache->entry.hit); + } + } + else if (0 != internal_cache_size) { /* reset cache */ + /* INTERNAL_FIND_CODE_CACHE_GROW doubles size (and would expose invalid entries) */ + memset(cache->entry.keys, 0, LIBXSMM_CACHE_MAXSIZE * sizeof(*cache->entry.keys)); + cache->entry.id = ninit; + cache->entry.size = 1; + cache_index = 0; + } + LIBXSMM_MEMCPY127(cache->entry.keys + cache_index, desc, 0 == is_big_desc ? 
LIBXSMM_DIFF_SIZE : size); + cache->entry.code[cache_index] = flux_entry; + cache->entry.hit = cache_index; + } +# if !defined(NDEBUG) + else { + memset(cache, 0, sizeof(*cache)); + } +# endif +#endif + } +#if defined(LIBXSMM_HASH_COLLISION) + flux_entry.uval &= ~(LIBXSMM_CODE_STATIC | LIBXSMM_HASH_COLLISION); /* clear non-JIT and collision flag */ +#else + flux_entry.uval &= ~LIBXSMM_CODE_STATIC; /* clear non-JIT flag */ +#endif +#if (0 != LIBXSMM_JIT) + assert( /*!LIBXSMM_ASSERT*/ + LIBXSMM_KERNEL_KIND_MATMUL != LIBXSMM_DESCRIPTOR_KIND(desc->kind) + || NULL != flux_entry.ptr_const + || 1 == internal_reglock_count + || EXIT_SUCCESS != build); +#endif + return flux_entry; +} + + +LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, + const libxsmm_descriptor** desc, size_t* code_size) +{ + libxsmm_kernel_xinfo* result = NULL; + void *const result_address = &result; + int flags = LIBXSMM_MALLOC_FLAG_X; + if (NULL != code.ptr_const && EXIT_SUCCESS == libxsmm_get_malloc_xinfo( + code.ptr_const, code_size, &flags, (void**)result_address) && NULL != result) + { + if (NULL != desc) { + if (NULL != internal_registry && NULL != internal_registry_keys && result->registered < (LIBXSMM_CAPACITY_REGISTRY) +#if defined(LIBXSMM_HASH_COLLISION) + && code.uval == (~LIBXSMM_HASH_COLLISION & internal_registry[result->registered].uval) +#else + && code.ptr_const == internal_registry[result->registered].ptr_const +#endif + && LIBXSMM_KERNEL_UNREGISTERED > LIBXSMM_DESCRIPTOR_KIND(internal_registry_keys[result->registered].entry.kind)) + { + *desc = &internal_registry_keys[result->registered].entry; + } + else *desc = NULL; + } + } + else { + LIBXSMM_ASSERT(NULL == result); + if (NULL != code_size) *code_size = 0; + if (NULL != desc) *desc = NULL; + } + return result; +} + + +LIBXSMM_API int libxsmm_get_kernel_info(const void* kernel, libxsmm_kernel_info* info) +{ + int result; + const libxsmm_kernel_xinfo* xinfo; + libxsmm_kernel_info 
result_info; + const libxsmm_descriptor* desc; + libxsmm_code_pointer code; + code.ptr_const = kernel; + LIBXSMM_MEMZERO127(&result_info); + xinfo = libxsmm_get_kernel_xinfo(code, &desc, &result_info.code_size); + if (NULL != xinfo) { + if (NULL != desc) { + const libxsmm_kernel_kind kind = (libxsmm_kernel_kind)LIBXSMM_DESCRIPTOR_KIND(desc->kind); + result_info.kind = kind; + if (LIBXSMM_KERNEL_KIND_USER == kind) { + result_info.code_size = 0; /* invalid */ + } + } + else { + result_info.kind = LIBXSMM_KERNEL_UNREGISTERED; + } + result_info.nflops = xinfo->nflops; + LIBXSMM_ASSIGN127(info, &result_info); + result = EXIT_SUCCESS; + } + else { + LIBXSMM_ASSERT(NULL == desc); + if (NULL != info) { + LIBXSMM_ASSIGN127(info, &result_info); + result = EXIT_FAILURE; + } + else { + result = EXIT_SUCCESS; + } + } + return result; +} + + +LIBXSMM_API int libxsmm_get_mmkernel_info(libxsmm_xmmfunction kernel, libxsmm_mmkernel_info* info) +{ + libxsmm_code_pointer code; + static int error_once = 0; + int result; + code.xgemm = kernel; + if (NULL != info) { + const libxsmm_descriptor* desc; + if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) && + NULL != desc && LIBXSMM_KERNEL_KIND_MATMUL == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) + { + info->iprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_INP(desc->gemm.desc.datatype); + info->oprecision = (libxsmm_gemm_precision)LIBXSMM_GETENUM_OUT(desc->gemm.desc.datatype); + info->prefetch = (libxsmm_gemm_prefetch_type)desc->gemm.desc.prefetch; + info->flags = desc->gemm.desc.flags; + info->lda = desc->gemm.desc.lda; + info->ldb = desc->gemm.desc.ldb; + info->ldc = desc->gemm.desc.ldc; + info->m = desc->gemm.desc.m; + info->n = desc->gemm.desc.n; + info->k = desc->gemm.desc.k; + result = EXIT_SUCCESS; + } + else { +#if defined(__APPLE__) && defined(__arm64__) + info->iprecision = 1; + info->oprecision = 1; + info->prefetch = 1; + info->flags = 1; + info->lda = 1; + info->ldb = 1; + info->ldc = 1; + info->m = 1; + info->n 
= 1; + info->k = 1; + result = EXIT_SUCCESS; +# else + if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + if (NULL == code.ptr_const) { + fprintf(stderr, "LIBXSMM ERROR: NULL-kernel cannot be inspected!\n"); + } + else { + fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n"); + } + } + result = EXIT_FAILURE; +# endif + } + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n"); + } + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API int libxsmm_get_meltwkernel_info(libxsmm_xmeltwfunction kernel, libxsmm_meltwkernel_info* info) +{ + libxsmm_code_pointer code; + static int error_once = 0; + int result; + code.xmateltw = kernel; + if (NULL != info) { + const libxsmm_descriptor* desc; + if (NULL != libxsmm_get_kernel_xinfo(code, &desc, NULL/*code_size*/) && + NULL != desc && LIBXSMM_KERNEL_KIND_MELTW == LIBXSMM_DESCRIPTOR_KIND(desc->kind)) + { + info->datatype = desc->meltw.desc.datatype; + info->operation = desc->meltw.desc.operation; + info->flags = desc->meltw.desc.flags; + info->ldi = desc->meltw.desc.ldi; + info->ldo = desc->meltw.desc.ldo; + info->m = desc->meltw.desc.m; + info->n = desc->meltw.desc.n; + result = EXIT_SUCCESS; + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid kernel cannot be inspected!\n"); + } + result = EXIT_FAILURE; + } + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid argument!\n"); + } + result = EXIT_FAILURE; + } + return result; +} + + 
+LIBXSMM_API int libxsmm_get_registry_info(libxsmm_registry_info* info) +{ + int result = EXIT_SUCCESS; + LIBXSMM_INIT /* verbosity */ + if (0 != info && 0 != internal_registry) { + size_t i; + LIBXSMM_MEMZERO127(info); /* info->nstatic = 0; info->size = 0; */ + info->nbytes = (LIBXSMM_CAPACITY_REGISTRY) * (sizeof(libxsmm_code_pointer) + sizeof(libxsmm_descriptor)); + info->capacity = LIBXSMM_CAPACITY_REGISTRY; +#if defined(LIBXSMM_CACHE_MAXSIZE) && (0 < (LIBXSMM_CACHE_MAXSIZE)) + info->ncache = internal_cache_size; +#else + info->ncache = 0; +#endif + for (i = 0; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) { + libxsmm_code_pointer code = internal_registry[i]; + if (0 != code.ptr_const && EXIT_SUCCESS == result) { + if (0 == (LIBXSMM_CODE_STATIC & code.uval)) { /* check for allocated/generated JIT-code */ + size_t buffer_size = 0; + void* buffer = 0; +#if defined(LIBXSMM_HASH_COLLISION) + code.uval &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */ +#endif + result = libxsmm_get_malloc_xinfo(code.ptr_const, &buffer_size, NULL/*flags*/, &buffer); + if (EXIT_SUCCESS == result) { + info->nbytes += LIBXSMM_UP2(buffer_size + (((char*)code.ptr_const) - (char*)buffer), LIBXSMM_PAGE_MINSIZE); + } + } + else { + ++info->nstatic; + } + ++info->size; + } + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API_INLINE void* internal_get_registry_entry(int i, libxsmm_kernel_kind kind, const void** key) +{ + void* result = NULL; + LIBXSMM_ASSERT(kind < LIBXSMM_KERNEL_UNREGISTERED && NULL != internal_registry); + for (; i < (LIBXSMM_CAPACITY_REGISTRY); ++i) { + const libxsmm_code_pointer regentry = internal_registry[i]; + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(regentry.ptr_const, + NULL/*code_size*/, NULL/*flags*/, &result) && NULL != result) + { + const libxsmm_kernel_xinfo info = *(const libxsmm_kernel_xinfo*)result; + const libxsmm_descriptor *const desc = &internal_registry_keys[info.registered].entry; + if (LIBXSMM_DESCRIPTOR_KIND(desc->kind) == 
(int)kind) { + if (NULL != key) *key = desc->user.desc; + result = regentry.ptr; + break; + } + } + } + return result; +} + + +LIBXSMM_API void* libxsmm_get_registry_begin(libxsmm_kernel_kind kind, const void** key) +{ + void* result = NULL; + if (kind < LIBXSMM_KERNEL_UNREGISTERED && NULL != internal_registry) { + result = internal_get_registry_entry(0, kind, key); + } + return result; +} + + +LIBXSMM_API void* libxsmm_get_registry_next(const void* regentry, const void** key) +{ + void* result = NULL; + const libxsmm_descriptor* desc; + libxsmm_code_pointer entry; + entry.ptr_const = regentry; + if (NULL != libxsmm_get_kernel_xinfo(entry, &desc, NULL/*code_size*/) + /* given regentry is indeed a registered kernel */ + && NULL != desc) + { + result = internal_get_registry_entry( + (int)(desc - &internal_registry_keys->entry + 1), + (libxsmm_kernel_kind)LIBXSMM_DESCRIPTOR_KIND(desc->kind), key); + } + return result; +} + + +LIBXSMM_API void* libxsmm_xregister(const void* key, size_t key_size, + size_t value_size, const void* value_init, unsigned int* key_hash) +{ + static int error_once = 0; + void* result; + LIBXSMM_INIT /* verbosity */ + if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) { + libxsmm_descriptor wrap; + unsigned int hash = 0; + void* dst; +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_MEMSET127(&wrap, 0, key_size); +#endif + LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size); + wrap.kind = (libxsmm_descriptor_kind)(LIBXSMM_DESCRIPTOR_SIGSIZE >= key_size + ? 
((libxsmm_descriptor_kind)LIBXSMM_KERNEL_KIND_USER) + : LIBXSMM_DESCRIPTOR_BIG(LIBXSMM_KERNEL_KIND_USER)); + dst = internal_find_code(&wrap, key_size, value_size, &hash).ptr; + if (NULL != key_hash) *key_hash = hash; + if (NULL != dst) { + size_t size; + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo(dst, &size, NULL/*flags*/, NULL/*extra*/) + && value_size <= size) + { + if (NULL != value_init) memcpy(dst, value_init, value_size); + result = dst; + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: value too large for previously registered key!\n"); + } + result = NULL; + } + } + else result = NULL; + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + if (LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n"); + } + else { + fprintf(stderr, "LIBXSMM ERROR: libxsmm_xregister has maximum key-size of %i Byte!\n", + LIBXSMM_DESCRIPTOR_MAXSIZE); + } + } + result = NULL; + } + return result; +} + + +LIBXSMM_API void* libxsmm_xdispatch(const void* key, size_t key_size, unsigned int* key_hash) +{ + void* result; + LIBXSMM_INIT /* verbosity */ +#if !defined(NDEBUG) + if (NULL != key && 0 < key_size && LIBXSMM_DESCRIPTOR_MAXSIZE >= key_size) +#endif + { + unsigned int hash = 0; + libxsmm_descriptor wrap; +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_MEMSET127(&wrap, 0, key_size); +#endif + LIBXSMM_MEMCPY127(wrap.user.desc, key, key_size); + wrap.kind = (libxsmm_descriptor_kind)(LIBXSMM_DESCRIPTOR_SIGSIZE >= key_size + ? 
((libxsmm_descriptor_kind)LIBXSMM_KERNEL_KIND_USER) + : LIBXSMM_DESCRIPTOR_BIG(LIBXSMM_KERNEL_KIND_USER)); + result = internal_find_code(&wrap, key_size, 0/*user_size*/, &hash).ptr; + if (NULL != key_hash) *key_hash = hash; + } +#if !defined(NDEBUG) + else { + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n"); + } + result = NULL; + } +#endif + return result; +} + + +LIBXSMM_API void libxsmm_xrelease(const void* key, size_t key_size) +{ + libxsmm_release_kernel(libxsmm_xdispatch(key, key_size, NULL/*key_hash*/)); +} + + +LIBXSMM_API libxsmm_xmmfunction libxsmm_xmmdispatch(const libxsmm_gemm_descriptor* descriptor) +{ + libxsmm_xmmfunction result; + LIBXSMM_INIT /* verbosity */ +#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE)); +#endif + if (NULL != descriptor) { + unsigned int hash; + const int batch_reduce = + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS | + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET | + LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE; + libxsmm_descriptor wrap; +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor)); +#endif + LIBXSMM_ASSIGN127(&wrap.gemm.desc, descriptor); + wrap.kind = (libxsmm_descriptor_kind)(0 == (batch_reduce & descriptor->flags) + ? 
((libxsmm_descriptor_kind)LIBXSMM_KERNEL_KIND_MATMUL) + : LIBXSMM_DESCRIPTOR_BIG(LIBXSMM_KERNEL_KIND_MATMUL)); + if (0 != (0x80 & descriptor->prefetch)) { /* "sign"-bit of byte-value is set */ + wrap.gemm.desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + } + result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/, &hash).xgemm; +#if defined(_DEBUG) + if (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity && INT_MAX != libxsmm_verbosity && NULL != result.xmm) { + LIBXSMM_STDIO_ACQUIRE(); + fprintf(stderr, "\nLIBXSMM: "); + libxsmm_gemm_xprint(stderr, result, NULL/*a*/, NULL/*b*/, NULL/*c*/); + LIBXSMM_STDIO_RELEASE(); + } +#endif + } + else { /* quietly accept NULL-descriptor */ + result.xmm = NULL; + } + return result; +} + + +LIBXSMM_API libxsmm_dmmfunction libxsmm_dmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.dmm; +} + + +LIBXSMM_API libxsmm_smmfunction libxsmm_smmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.smm; +} + + +LIBXSMM_API libxsmm_bsmmfunction libxsmm_bsmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bsmm; +} + + +LIBXSMM_API libxsmm_bmmfunction libxsmm_bmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bmm; +} + + +LIBXSMM_API libxsmm_wimmfunction libxsmm_wimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.wimm; +} + + +LIBXSMM_API libxsmm_ssbimmfunction libxsmm_ssbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? 
*ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.ssbimm; +} + + +LIBXSMM_API libxsmm_usbimmfunction libxsmm_usbimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.usbimm; +} + + +LIBXSMM_API libxsmm_subimmfunction libxsmm_subimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.subimm; +} + + +LIBXSMM_API libxsmm_uubimmfunction libxsmm_uubimmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.uubimm; +} + + +LIBXSMM_API libxsmm_sububmmfunction libxsmm_sububmmdispatch(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.sububmm; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.dmra; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.smra; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bsmra; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bmra; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.wimra; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.ssbimra; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.usbimra; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? 
*alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.subimra; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.uubimra; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? 
*ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.sububmra; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_addr libxsmm_dmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.dmra; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_addr libxsmm_smmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.smra; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_addr libxsmm_bsmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.bsmra; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_addr libxsmm_bmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.bmra; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_addr libxsmm_wimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? 
*alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.wimra; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_addr libxsmm_ssbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.ssbimra; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_addr libxsmm_usbimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.usbimra; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_addr libxsmm_subimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.subimra; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_addr libxsmm_uubimmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.uubimra; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_addr libxsmm_sububmmdispatch_reducebatch_addr_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? 
k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_ADDRESS, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.sububmra; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.dmro; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.smro; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bsmro; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.bmro; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.wimro; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? 
*lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.ssbimro; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.usbimro; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.subimro; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.uubimro; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + const libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result = libxsmm_xmmdispatch(desc); + return result.sububmro; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_offs libxsmm_dmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? 
*ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.dmro; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_offs libxsmm_smmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.smro; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_offs libxsmm_bsmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.bsmro; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_offs libxsmm_bmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.bmro; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_offs libxsmm_wimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.wimro; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_offs libxsmm_ssbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? 
*alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.ssbimro; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_offs libxsmm_usbimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.usbimro; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_offs libxsmm_subimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.subimro; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_offs libxsmm_uubimmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.uubimro; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_offs libxsmm_sububmmdispatch_reducebatch_offs_unroll(libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_OFFSET, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + result = libxsmm_xmmdispatch(desc); + return result.sububmro; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? 
*ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.dmrs; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.smrs; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.bsmrs; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.bmrs; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.wimrs; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.ssbimrs; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.usbimrs; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.subimrs; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.uubimrs; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, + libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.sububmrs; +} + + +LIBXSMM_API libxsmm_dmmfunction_reducebatch_strd libxsmm_dmmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const double* alpha, const double* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_dgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.dmrs; +} + + +LIBXSMM_API libxsmm_smmfunction_reducebatch_strd libxsmm_smmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? LIBXSMM_FLAGS : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_sgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.smrs; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd libxsmm_bsmmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.bsmrs; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd libxsmm_bmmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const float* alpha, const float* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.bmrs; +} + + +LIBXSMM_API libxsmm_wimmfunction_reducebatch_strd libxsmm_wimmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_wigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.wimrs; +} + + +LIBXSMM_API libxsmm_ssbimmfunction_reducebatch_strd libxsmm_ssbimmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.ssbimrs; +} + + +LIBXSMM_API libxsmm_usbimmfunction_reducebatch_strd libxsmm_usbimmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? 
(LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_A_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.usbimrs; +} + + +LIBXSMM_API libxsmm_subimmfunction_reducebatch_strd libxsmm_subimmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.subimrs; +} + + +LIBXSMM_API libxsmm_uubimmfunction_reducebatch_strd libxsmm_uubimmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bigemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_AB_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.uubimrs; +} + + +LIBXSMM_API libxsmm_sububmmfunction_reducebatch_strd libxsmm_sububmmdispatch_reducebatch_strd_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const int* alpha, const int* beta, const int* flags, const int* prefetch) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bbgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? *beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_B_UNSIGNED | LIBXSMM_GEMM_FLAG_C_UNSIGNED | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, + libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? 
unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + result = libxsmm_xmmdispatch(desc); + return result.sububmrs; +} + + +/* GEMMs fused with eltwise kernels */ +LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd_meltwfused libxsmm_bmmdispatch_reducebatch_strd_meltwfused( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch, + libxsmm_meltw_operation meltw_op, libxsmm_datatype meltw_dt, libxsmm_meltw_flags meltw_flags, unsigned char meltw_param, unsigned int meltw_ldx, unsigned int meltw_ldy, unsigned int meltw_ldz) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + desc->meltw_datatype_aux = (unsigned char)meltw_dt; + desc->meltw_flags = (unsigned short)meltw_flags; + desc->meltw_operation = (unsigned char)meltw_op; + desc->meltw_param = (unsigned char)meltw_param; + desc->meltw_ldx = (unsigned int) meltw_ldx; + desc->meltw_ldy = (unsigned int) meltw_ldy; + desc->meltw_ldz = (unsigned int) meltw_ldz; + result = libxsmm_xmmdispatch(desc); + return result.bmrs_meltwfused; +} + + +LIBXSMM_API libxsmm_bmmfunction_reducebatch_strd_meltwfused libxsmm_bmmdispatch_reducebatch_strd_meltwfused_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch, + libxsmm_meltw_operation meltw_op, libxsmm_datatype meltw_dt, libxsmm_meltw_flags meltw_flags, unsigned char meltw_param, unsigned int meltw_ldx, unsigned int meltw_ldy, unsigned int meltw_ldz) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + desc->meltw_datatype_aux = (unsigned char)meltw_dt; + desc->meltw_flags = (unsigned short)meltw_flags; + desc->meltw_operation = (unsigned char)meltw_op; + desc->meltw_param = (unsigned char)meltw_param; + desc->meltw_ldx = (unsigned int) meltw_ldx; + desc->meltw_ldy = (unsigned int) meltw_ldy; + desc->meltw_ldz = (unsigned int) meltw_ldz; + result = libxsmm_xmmdispatch(desc); + return result.bmrs_meltwfused; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd_meltwfused libxsmm_bsmmdispatch_reducebatch_strd_meltwfused( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch, + libxsmm_meltw_operation meltw_op, libxsmm_datatype meltw_dt, libxsmm_meltw_flags meltw_flags, unsigned char meltw_param, unsigned int meltw_ldx, unsigned int meltw_ldy, unsigned int meltw_ldz) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + desc->meltw_datatype_aux = (unsigned char)meltw_dt; + desc->meltw_flags = (unsigned short)meltw_flags; + desc->meltw_operation = (unsigned char)meltw_op; + desc->meltw_param = (unsigned char)meltw_param; + desc->meltw_ldx = (unsigned int) meltw_ldx; + desc->meltw_ldy = (unsigned int) meltw_ldy; + desc->meltw_ldz = (unsigned int) meltw_ldz; + result = libxsmm_xmmdispatch(desc); + return result.bsmrs_meltwfused; +} + + +LIBXSMM_API libxsmm_bsmmfunction_reducebatch_strd_meltwfused libxsmm_bsmmdispatch_reducebatch_strd_meltwfused_unroll( + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint k, libxsmm_blasint stride_a, libxsmm_blasint stride_b, libxsmm_blasint unroll_hint, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, const float* alpha, const float* beta, const int* flags, const int* prefetch, + libxsmm_meltw_operation meltw_op, libxsmm_datatype meltw_dt, libxsmm_meltw_flags meltw_flags, unsigned char meltw_param, unsigned int meltw_ldx, unsigned int meltw_ldy, unsigned int meltw_ldz) +{ + const int gemm_flags = (NULL == flags ? (LIBXSMM_FLAGS | LIBXSMM_GEMM_FLAG_VNNI_A) : *flags); + libxsmm_descriptor_blob blob; + /*const*/ libxsmm_gemm_descriptor *const desc = libxsmm_bsgemm_descriptor_init(&blob, m, n, k, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? m : k), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? k : n), + NULL != ldc ? *ldc : m, NULL != alpha ? *alpha : LIBXSMM_ALPHA, NULL != beta ? 
*beta : LIBXSMM_BETA, + gemm_flags | LIBXSMM_GEMM_FLAG_BATCH_REDUCE_STRIDE, libxsmm_get_gemm_xprefetch(prefetch)); + /*const*/ libxsmm_xmmfunction result; + desc->c1 = (unsigned long long)stride_a; + desc->c2 = (unsigned long long)stride_b; + desc->c3 = (unsigned char)(((unroll_hint < 255) && (unroll_hint > 0)) ? unroll_hint : 0); + if ( (stride_a < 0) || (stride_b < 0) ) { + return NULL; + } + desc->meltw_datatype_aux = (unsigned char)meltw_dt; + desc->meltw_flags = (unsigned short)meltw_flags; + desc->meltw_operation = (unsigned char)meltw_op; + desc->meltw_param = (unsigned char)meltw_param; + desc->meltw_ldx = (unsigned int) meltw_ldx; + desc->meltw_ldy = (unsigned int) meltw_ldy; + desc->meltw_ldz = (unsigned int) meltw_ldz; + result = libxsmm_xmmdispatch(desc); + return result.bsmrs_meltwfused; +} + + +LIBXSMM_API libxsmm_xmeltwfunction libxsmm_dispatch_meltw(const libxsmm_meltw_descriptor* descriptor) +{ + libxsmm_xmeltwfunction result; + LIBXSMM_INIT /* verbosity */ +#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE)); +#endif + if (NULL != descriptor) { + unsigned int hash; + libxsmm_descriptor wrap; +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor)); +#endif + LIBXSMM_ASSIGN127(&wrap.meltw.desc, descriptor); + wrap.kind = LIBXSMM_DESCRIPTOR_BIG(LIBXSMM_KERNEL_KIND_MELTW); + result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/, &hash).xmateltw; + } + else { + result.xmeltw = NULL; + } + return result; +} + + +LIBXSMM_API libxsmm_meltwfunction_reduce_cols_idx libxsmm_dispatch_meltw_reduce_cols_idx( + libxsmm_blasint m, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, + libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_datatype idx_type) +{ + libxsmm_descriptor_blob blob; + libxsmm_blasint idx_dtype_size = libxsmm_typesize(idx_type); + const libxsmm_meltw_descriptor *const 
desc = libxsmm_meltw_descriptor_init(&blob, + in_type, out_type, m, idx_dtype_size, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, + 0, 0, LIBXSMM_MELTW_OPERATION_REDUCE_COLS_IDX); + + libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc); + + return result.meltw_reduce_cols_idx; +} + + +LIBXSMM_API libxsmm_meltwfunction_opreduce_vecs_idx libxsmm_dispatch_meltw_opreduce_vecs_idx( + libxsmm_blasint m, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, + libxsmm_datatype in_type, libxsmm_datatype out_type, libxsmm_datatype idx_type, libxsmm_meltw_opreduce_vecs_flags flags) +{ + libxsmm_descriptor_blob blob; + libxsmm_blasint idx_dtype_size = libxsmm_typesize(idx_type); + unsigned char argidx_params = (unsigned char) (((flags & LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0) | (flags & LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1)) >> 16); + const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init(&blob, + in_type, out_type, m, idx_dtype_size, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, + (unsigned short)flags, argidx_params, LIBXSMM_MELTW_OPERATION_OPREDUCE_VECS_IDX); + + libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc); + + return result.meltw_opreduce_vecs_idx; +} + + +LIBXSMM_API libxsmm_meltwfunction_unary libxsmm_dispatch_meltw_unary( + libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo, + libxsmm_datatype in_type, libxsmm_datatype compute_type, libxsmm_datatype out_type, libxsmm_meltw_unary_flags flags, libxsmm_meltw_unary_type type) +{ + libxsmm_descriptor_blob blob; + const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init2(&blob, + in_type, compute_type, out_type, LIBXSMM_DATATYPE_UNSUPPORTED, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? 
m : *ldo, 0, 0, + (unsigned short)flags, (unsigned char)type, LIBXSMM_MELTW_OPERATION_UNARY); + + libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc); + + return result.meltw_unary; +} + + +LIBXSMM_API libxsmm_meltwfunction_binary libxsmm_dispatch_meltw_binary( + libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldi2, const libxsmm_blasint* ldo, + libxsmm_datatype in_type, libxsmm_datatype compute_type, libxsmm_datatype out_type, libxsmm_meltw_binary_flags flags, libxsmm_meltw_binary_type type) +{ + libxsmm_descriptor_blob blob; + const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init2(&blob, + in_type, compute_type, out_type, LIBXSMM_DATATYPE_UNSUPPORTED, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, (ldi2 == NULL) ? m : *ldi2, 0, + (unsigned short)flags, (unsigned char)type, LIBXSMM_MELTW_OPERATION_BINARY); + + libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc); + + return result.meltw_binary; +} + + +LIBXSMM_API libxsmm_meltwfunction_ternary libxsmm_dispatch_meltw_ternary( + libxsmm_blasint m, libxsmm_blasint n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldi2, const libxsmm_blasint* ldi3, const libxsmm_blasint* ldo, + libxsmm_datatype in_type, libxsmm_datatype compute_type, libxsmm_datatype out_type, libxsmm_meltw_ternary_flags flags, libxsmm_meltw_ternary_type type) +{ + libxsmm_descriptor_blob blob; + const libxsmm_meltw_descriptor *const desc = libxsmm_meltw_descriptor_init2(&blob, + in_type, compute_type, out_type, LIBXSMM_DATATYPE_UNSUPPORTED, m, n, (ldi == NULL) ? m : *ldi, (ldo == NULL) ? m : *ldo, (ldi2 == NULL) ? m : *ldi2, (ldi3 == NULL) ? 
m : *ldi3, + (unsigned short)flags, (unsigned char)type, LIBXSMM_MELTW_OPERATION_TERNARY); + + libxsmm_xmeltwfunction result = libxsmm_dispatch_meltw(desc); + + return result.meltw_ternary; +} + + +LIBXSMM_API libxsmm_matrix_eqn_function libxsmm_dispatch_matrix_eqn_desc( const libxsmm_meqn_descriptor* descriptor ) { + libxsmm_matrix_eqn_function result; + LIBXSMM_INIT /* verbosity */ +#if !defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_ASSERT((sizeof(*descriptor) + sizeof(libxsmm_descriptor_kind)) <= (LIBXSMM_DESCRIPTOR_MAXSIZE)); +#endif + if (NULL != descriptor) { + unsigned int hash; + libxsmm_descriptor wrap; + + /* check if equation is ready for JIT */ + if ( libxsmm_matrix_eqn_is_ready_for_jit( descriptor->eqn_idx) == 0 ) { +#if defined(LIBXSMM_UNPACKED) /* CCE/Classic */ + LIBXSMM_MEMSET127(&wrap, 0, sizeof(*descriptor)); +#endif + LIBXSMM_ASSIGN127(&wrap.meqn.desc, descriptor); + wrap.kind = LIBXSMM_DESCRIPTOR_BIG(LIBXSMM_KERNEL_KIND_MEQN); + result = internal_find_code(&wrap, sizeof(*descriptor), 0/*user_size*/, &hash).xmateqn; + } else { + result = NULL; + } + } + else { + result = NULL; + } + return result; +} + + +LIBXSMM_API libxsmm_matrix_eqn_function libxsmm_dispatch_matrix_eqn( + const libxsmm_blasint m, const libxsmm_blasint n, const libxsmm_blasint* ldo, + const libxsmm_datatype out_type, const unsigned int eqn_idx ) +{ + libxsmm_descriptor_blob blob; + const libxsmm_meqn_descriptor *const desc = libxsmm_meqn_descriptor_init(&blob, + out_type, m, n, (ldo == NULL) ? 
m : *ldo, eqn_idx ); + + return libxsmm_dispatch_matrix_eqn_desc( desc ); +} + + +LIBXSMM_API libxsmm_xmmfunction libxsmm_create_packed_spxgemm_csr(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width, + const unsigned int* row_ptr, const unsigned int* column_idx, const void* values) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) { + libxsmm_pspgemm_csr_descriptor pspgemm_csr; + libxsmm_build_request request; + libxsmm_gemm_descriptor desc; + if (0 == (0x80 & descriptor->prefetch)) { + pspgemm_csr.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + pspgemm_csr.gemm = &desc; + } + pspgemm_csr.row_ptr = row_ptr; + pspgemm_csr.column_idx = column_idx; + pspgemm_csr.values = values; + pspgemm_csr.packed_width = packed_width; + request.descriptor.pspgemm_csr = &pspgemm_csr; + request.kind = LIBXSMM_BUILD_KIND_PSPGEMM_CSR; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + } + return result.xgemm; +} + + +LIBXSMM_API libxsmm_xmmfunction libxsmm_create_packed_spxgemm_csc(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width, + const unsigned int* column_ptr, const unsigned int* row_idx, const void* values) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor && NULL != column_ptr && NULL != row_idx && NULL != values) { + libxsmm_pspgemm_csc_descriptor pspgemm_csc; + libxsmm_build_request request; + libxsmm_gemm_descriptor desc; + if (0 == (0x80 & descriptor->prefetch)) { + pspgemm_csc.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + pspgemm_csc.gemm = &desc; + } + pspgemm_csc.column_ptr = column_ptr; + 
pspgemm_csc.row_idx = row_idx; + pspgemm_csc.values = values; + pspgemm_csc.packed_width = packed_width; + request.descriptor.pspgemm_csc = &pspgemm_csc; + request.kind = LIBXSMM_BUILD_KIND_PSPGEMM_CSC; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + } + return result.xgemm; +} + + +LIBXSMM_API libxsmm_xmmfunction libxsmm_create_packed_xgemm_ac_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor) { + libxsmm_pgemm_ac_rm_descriptor pgemmacrm; + libxsmm_build_request request; + libxsmm_gemm_descriptor desc; + if (0 == (0x80 & descriptor->prefetch)) { + pgemmacrm.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + pgemmacrm.gemm = &desc; + } + pgemmacrm.packed_width = packed_width; + request.descriptor.pgemmacrm = &pgemmacrm; + request.kind = LIBXSMM_BUILD_KIND_PGEMMRMAC; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + } + return result.xgemm; +} + + +LIBXSMM_API libxsmm_xmmfunction libxsmm_create_packed_xgemm_bc_rm(const libxsmm_gemm_descriptor* descriptor, unsigned int packed_width) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor) { + libxsmm_pgemm_bc_rm_descriptor pgemmbcrm; + libxsmm_build_request request; + libxsmm_gemm_descriptor desc; + if (0 == (0x80 & descriptor->prefetch)) { + pgemmbcrm.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + pgemmbcrm.gemm = &desc; + } + pgemmbcrm.packed_width = packed_width; + request.descriptor.pgemmbcrm = &pgemmbcrm; + request.kind = LIBXSMM_BUILD_KIND_PGEMMRMBC; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + } + 
return result.xgemm; +} + + +LIBXSMM_API libxsmm_dmmfunction libxsmm_create_dcsr_reg(const libxsmm_gemm_descriptor* descriptor, + const unsigned int* row_ptr, const unsigned int* column_idx, const double* values) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) { + libxsmm_csr_reg_descriptor sreg; + libxsmm_build_request request; + libxsmm_gemm_descriptor desc; + if (0 == (0x80 & descriptor->prefetch)) { + sreg.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + sreg.gemm = &desc; + } + sreg.row_ptr = row_ptr; + sreg.column_idx = column_idx; + sreg.values = values; + request.descriptor.sreg = &sreg; + request.kind = LIBXSMM_BUILD_KIND_SREG; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + } + return result.xgemm.dmm; +} + + +LIBXSMM_API libxsmm_smmfunction libxsmm_create_scsr_reg(const libxsmm_gemm_descriptor* descriptor, + const unsigned int* row_ptr, const unsigned int* column_idx, const float* values) +{ + libxsmm_code_pointer result = { 0 }; + LIBXSMM_INIT + if (NULL != descriptor && NULL != row_ptr && NULL != column_idx && NULL != values) { + libxsmm_csr_reg_descriptor sreg; + libxsmm_build_request request; + const unsigned int n = row_ptr[descriptor->m]; + double *const d_values = (double*)(0 != n ? 
malloc(n * sizeof(double)) : NULL); + if (NULL != d_values) { + libxsmm_gemm_descriptor desc; + unsigned int i; + /* we need to copy the values into a double precision buffer */ + for (i = 0; i < n; ++i) d_values[i] = (double)values[i]; + if (0 == (0x80 & descriptor->prefetch)) { + sreg.gemm = descriptor; + } + else { /* "sign"-bit of byte-value is set */ + LIBXSMM_ASSIGN127(&desc, descriptor); + desc.prefetch = (unsigned char)libxsmm_get_gemm_prefetch(LIBXSMM_PREFETCH_AUTO); + sreg.gemm = &desc; + } + sreg.row_ptr = row_ptr; + sreg.column_idx = column_idx; + sreg.values = d_values; + request.descriptor.sreg = &sreg; + request.kind = LIBXSMM_BUILD_KIND_SREG; + libxsmm_build(&request, LIBXSMM_CAPACITY_REGISTRY/*not managed*/, &result); + free(d_values); + } + } + return result.xgemm.smm; +} + + +LIBXSMM_API void libxsmm_release_kernel(const void* kernel) +{ + if (NULL != kernel) { + static int error_once = 0; + libxsmm_kernel_xinfo* extra = NULL; + void *const extra_address = &extra; + LIBXSMM_INIT + if (EXIT_SUCCESS == libxsmm_get_malloc_xinfo( + kernel, NULL/*size*/, NULL/*flags*/, (void**)extra_address) && NULL != extra) + { + const unsigned int regindex = extra->registered; + if ((LIBXSMM_CAPACITY_REGISTRY) <= regindex) { + libxsmm_xfree(kernel, 0/*no check*/); + } + else { /* attempt to unregister kernel */ + libxsmm_kernel_info info; +#if !defined(LIBXSMM_ENABLE_DEREG) + if (EXIT_SUCCESS == libxsmm_get_kernel_info(kernel, &info) + && LIBXSMM_KERNEL_KIND_USER == info.kind) +#endif + { + LIBXSMM_ASSERT(LIBXSMM_KERNEL_UNREGISTERED > info.kind); + /* coverity[check_return] */ + LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_ninit, 1, LIBXSMM_ATOMIC_RELAXED); /* invalidate code cache (TLS) */ + internal_registry[regindex].ptr = NULL; +#if !defined(NDEBUG) + memset(internal_registry_keys + regindex, 0, sizeof(*internal_registry_keys)); +#endif + libxsmm_xfree(kernel, 0/*no check*/); + } +#if !defined(LIBXSMM_ENABLE_DEREG) + else if (0 != libxsmm_verbosity /* library code is 
expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: attempt to unregister JIT-kernel!\n"); + } +#endif + } + } + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: failed to release kernel!\n"); + } + } +} + + +#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_init)(void) +{ + libxsmm_init(); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_finalize)(void) +{ + libxsmm_finalize(); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** /*kernel*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_release_kernel)(const void** kernel) +{ +#if !defined(NDEBUG) + if (NULL != kernel) +#endif + { + libxsmm_release_kernel(*kernel); + } +#if !defined(NDEBUG) + else { + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_release_kernel!\n"); + } + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* /*fn*/, const int* /*iprec*/, const int* /*oprec*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/, + const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/, + const void* /*alpha*/, const void* /*beta*/, 
const int* /*flags*/, const int* /*prefetch*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(intptr_t* fn, const int* iprec, const int* oprec, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const void* alpha, const void* beta, const int* flags, const int* prefetch) +{ +#if !defined(NDEBUG) + if (NULL != fn && NULL != m + && (NULL == iprec || (0 <= *iprec && *iprec < LIBXSMM_DATATYPE_UNSUPPORTED)) + && (NULL == oprec || (0 <= *oprec && *oprec < LIBXSMM_DATATYPE_UNSUPPORTED))) +#endif + { + const int gemm_flags = (NULL != flags ? *flags : LIBXSMM_FLAGS); + const libxsmm_gemm_descriptor* descriptor; + libxsmm_gemm_prefetch_type gemm_prefetch; + libxsmm_descriptor_blob blob; + libxsmm_code_pointer result; +#if !defined(NDEBUG) + const libxsmm_gemm_precision itype = (NULL != iprec ? ((libxsmm_gemm_precision)*iprec) : LIBXSMM_GEMM_PRECISION_F64); + const libxsmm_gemm_precision otype = (NULL != oprec ? ((libxsmm_gemm_precision)*oprec) : itype); + const libxsmm_blasint kk = *(NULL != k ? k : m), nn = (NULL != n ? *n : kk); +#else + const libxsmm_gemm_precision itype = (libxsmm_gemm_precision)*iprec, otype = (libxsmm_gemm_precision)*oprec; + const libxsmm_blasint kk = *k, nn = *n; +#endif + LIBXSMM_PRAGMA_FORCEINLINE + gemm_prefetch = libxsmm_get_gemm_xprefetch(prefetch); + LIBXSMM_PRAGMA_FORCEINLINE + descriptor = libxsmm_gemm_descriptor_init2(&blob, itype, otype, *m, nn, kk, + NULL != lda ? *lda : (0 == (LIBXSMM_GEMM_FLAG_TRANS_A & gemm_flags) ? *m : kk), + NULL != ldb ? *ldb : (0 == (LIBXSMM_GEMM_FLAG_TRANS_B & gemm_flags) ? kk : nn), + *(NULL != ldc ? 
ldc : m), alpha, beta, gemm_flags, gemm_prefetch); +#if !defined(NDEBUG) + if (NULL != descriptor) +#endif + { + LIBXSMM_PRAGMA_FORCEINLINE + result.xgemm = libxsmm_xmmdispatch(descriptor); + *fn = result.ival; + } +#if !defined(NDEBUG) + else { /* quiet */ + *fn = 0; + } +#endif + } +#if !defined(NDEBUG) + else { + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid argument passed into libxsmm_xmmdispatch!\n"); + } + if (NULL != fn) *fn = 0; + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* /*fn*/, const int* /*precision*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*k*/, + const libxsmm_blasint* /*lda*/, const libxsmm_blasint* /*ldb*/, const libxsmm_blasint* /*ldc*/, + const void* /*alpha*/, const void* /*beta*/, const int* /*flags*/, const int* /*prefetch*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmdispatch)(intptr_t* fn, const int* precision, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k, + const libxsmm_blasint* lda, const libxsmm_blasint* ldb, const libxsmm_blasint* ldc, + const void* alpha, const void* beta, const int* flags, const int* prefetch) +{ + LIBXSMM_FSYMBOL(libxsmm_xmmdispatch2)(fn, precision, precision, m, n, k, lda, ldb, ldc, alpha, beta, flags, prefetch); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)( + const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_abc)( + const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != fn && NULL != a && NULL != b && NULL != c) +#endif + 
{ +#if !defined(NDEBUG) + if (NULL != fn->xmm) +#endif + { + fn->xmm(a, b, c); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_abc!\n"); + } +#endif + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_abc specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)( + const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/, + const void* /*pa*/, const void* /*pb*/, const void* /*pc*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)( + const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c, + const void* pa, const void* pb, const void* pc) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != fn && NULL != a && NULL != b && NULL != c) +#endif + { +#if !defined(NDEBUG) + if (NULL != fn->xmm) +#endif + { + fn->xmm(a, b, c, pa, pb, pc); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: NULL-function passed into libxsmm_xmmcall_prf!\n"); + } +#endif + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xmmcall_prf specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)( + 
const libxsmm_xmmfunction* /*fn*/, const void* /*a*/, const void* /*b*/, void* /*c*/, + const void* /*pa*/, const void* /*pb*/, const void* /*pc*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xmmcall)( + const libxsmm_xmmfunction* fn, const void* a, const void* b, void* c, + const void* pa, const void* pb, const void* pc) +{ + LIBXSMM_FSYMBOL(libxsmm_xmmcall_prf)(fn, a, b, c, pa, pb, pc); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/, + const int* /*valsize*/, const void* /*valinit*/, int* /*keyhash*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xregister)(void** regval, const void* key, const int* keysize, + const int* valsize, const void* valinit, int* keyhash) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != regval && NULL != key && NULL != keysize && NULL != valsize) +#endif + { + unsigned int hash = 0; + *regval = libxsmm_xregister(key, *keysize, *valsize, valinit, &hash); + if (NULL != keyhash) { + *keyhash = (hash & 0x7FFFFFFF/*sign-bit*/); + } + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xregister specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** /*regval*/, const void* /*key*/, const int* /*keysize*/, int* /*keyhash*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdispatch)(void** regval, const void* key, const int* keysize, int* keyhash) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != regval && NULL != key && NULL != keysize) +#endif + { + unsigned int hash = 0; + *regval = libxsmm_xdispatch(key, *keysize, &hash); + if (NULL != keyhash) { + *keyhash = (hash & 0x7FFFFFFF/*sign-bit*/); + 
} + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdispatch specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* /*key*/, const int* /*keysize*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xrelease)(const void* key, const int* keysize) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != key && NULL != keysize) +#endif + { + libxsmm_xrelease(key, *keysize); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xrelease specified!\n"); + } +#endif +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_main.h b/third_party/libxsmm/src/libxsmm_main.h new file mode 100644 index 00000000..d33cc5db --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_main.h @@ -0,0 +1,1069 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MAIN_H +#define LIBXSMM_MAIN_H + +#include +/** + * TF includes src/libxsmm_main.h and uses LIBXSMM's sync primitives + * without including libxsmm_sync. 
However, libxsmm_sync.h shall be + * an explicit include separate from including libxsmm.h. + */ +#include "libxsmm_sync.h" + +/** Allow external definition to enable testing corner cases (exhausted registry space). */ +#if !defined(LIBXSMM_CAPACITY_REGISTRY) /* must be POT */ +# define LIBXSMM_CAPACITY_REGISTRY 131072 +#endif +#if !defined(LIBXSMM_CAPACITY_CACHE) /* must be POT */ +# define LIBXSMM_CAPACITY_CACHE 16 +#endif + +#if !defined(LIBXSMM_PAGE_MINSIZE) +# define LIBXSMM_PAGE_MINSIZE 4096 /* 4 KB */ +#endif + +#if !defined(LIBXSMM_BATCH_CHECK) && !defined(NDEBUG) +# define LIBXSMM_BATCH_CHECK +#endif + +#if !defined(LIBXSMM_NTHREADS_MAX) +# if (0 != LIBXSMM_SYNC) +# define LIBXSMM_NTHREADS_MAX 1024 +# else +# define LIBXSMM_NTHREADS_MAX 1 +# endif +#endif +/* relies on LIBXSMM_NTHREADS_MAX */ +#if !defined(LIBXSMM_NTHREADS_USE) && 0 +# define LIBXSMM_NTHREADS_USE +#endif +#if !defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) +# define LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS LIBXSMM_NTHREADS_MAX +#endif +#if !defined(LIBXSMM_MALLOC_SCRATCH_SCALE) +# define LIBXSMM_MALLOC_SCRATCH_SCALE 1.0 +#endif +#if !defined(LIBXSMM_MALLOC_LIMIT) +# define LIBXSMM_MALLOC_LIMIT (2U << 20) /* 2 MB */ +#endif +/* map memory also for non-executable buffers */ +#if !defined(LIBXSMM_MALLOC_MMAP) && 0 +# define LIBXSMM_MALLOC_MMAP +#endif +/* map memory for hooked allocation */ +#if !defined(LIBXSMM_MALLOC_MMAP_HOOK) && 1 +# define LIBXSMM_MALLOC_MMAP_HOOK +#endif +/* map memory for scratch buffers */ +#if !defined(LIBXSMM_MALLOC_MMAP_SCRATCH) && 1 +# define LIBXSMM_MALLOC_MMAP_SCRATCH +#endif +/* align even if interceptor is disabled at runtime */ +#if !defined(LIBXSMM_MALLOC_ALIGN_ALL) && 1 +# define LIBXSMM_MALLOC_ALIGN_ALL +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_INTRINSIC) && 1 +# if defined(LIBXSMM_PLATFORM_X86) && defined(LIBXSMM_INTRINSICS_INCLUDE) && \ + !defined(LIBXSMM_INTRINSICS_DEBUG) && !defined(LIBXSMM_MALLOC_MMAP) +# define LIBXSMM_MALLOC_HOOK_INTRINSIC +# endif +#endif 
+#if !defined(LIBXSMM_MALLOC_HOOK_REALLOC) && 1 +# if !defined(LIBXSMM_MALLOC_HOOK_INTRINSIC) +# define LIBXSMM_MALLOC_HOOK_REALLOC +# endif +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_CALLOC) && 1 +# define LIBXSMM_MALLOC_HOOK_CALLOC +#endif +#if !defined(LIBXSMM_MALLOC_INTERNAL_CALLER_ID) +# define LIBXSMM_MALLOC_INTERNAL_CALLER_ID ((uintptr_t)LIBXSMM_UNLIMITED) +#endif +#if !defined(LIBXSMM_MALLOC_INTERNAL_CALLER) +# define LIBXSMM_MALLOC_INTERNAL_CALLER ((const void*)(LIBXSMM_MALLOC_INTERNAL_CALLER_ID)) +#endif + +#if !defined(LIBXSMM_INTERCEPT_DYNAMIC) && defined(LIBXSMM_BUILD) && \ + (defined(__GNUC__) || defined(_CRAYC)) && !defined(_WIN32) && !defined(__CYGWIN__) && \ + !(defined(__APPLE__) && defined(__MACH__) && LIBXSMM_VERSION2(6, 1) >= \ + LIBXSMM_VERSION2(__clang_major__, __clang_minor__)) +# define LIBXSMM_INTERCEPT_DYNAMIC +#endif + +#if !defined(LIBXSMM_MALLOC_HOOK_STATIC) && \ + (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) /* GLIBC */ && \ + (!defined(_WIN32)) /* TODO */ +# define LIBXSMM_MALLOC_HOOK_STATIC +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) && defined(LIBXSMM_INTERCEPT_DYNAMIC) && \ + defined(LIBXSMM_MALLOC_HOOK_STATIC) && !defined(_CRAYC) && !defined(__TRACE) +# define LIBXSMM_MALLOC_HOOK_DYNAMIC +#endif +#if (defined(LIBXSMM_MALLOC_HOOK_STATIC) || defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)) +# define LIBXSMM_MALLOC_HOOK +#endif +#if !defined(LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS) && defined(LIBXSMM_MALLOC_HOOK) && \ + (defined(LIBXSMM_MALLOC_ALIGN_ALL) || (defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC))) +# define LIBXSMM_DNN_CONVOLUTION_SETUP_USE_NTS +#endif + +#if defined(LIBXSMM_INTERCEPT_DYNAMIC) +# if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +# endif +# include +# if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +# endif +# if !defined(RTLD_NEXT) +# define LIBXSMM_RTLD_NEXT ((void*)-1l) +# else +# define LIBXSMM_RTLD_NEXT RTLD_NEXT +# endif +#endif + 
+#if !defined(LIBXSMM_VERBOSITY_HIGH) +# define LIBXSMM_VERBOSITY_HIGH 3 /* secondary warning or info-verbosity */ +#endif +#if !defined(LIBXSMM_VERBOSITY_WARN) +# define LIBXSMM_VERBOSITY_WARN ((LIBXSMM_VERBOSITY_HIGH) - LIBXSMM_MIN(1, LIBXSMM_VERBOSITY_HIGH)) +#endif + +#if !defined(LIBXSMM_LOCK) +# define LIBXSMM_LOCK LIBXSMM_LOCK_DEFAULT +#endif + +#if !defined(LIBXSMM_EXT_MIN_NTASKS) +# define LIBXSMM_MIN_NTASKS(NT) 1 +#endif +#if !defined(LIBXSMM_OVERHEAD) +# define LIBXSMM_OVERHEAD(NT) 0 +#endif +#if !defined(LIBXSMM_NOOP_ARGS) +# define LIBXSMM_NOOP_ARGS(...) +#endif +#if !defined(LIBXSMM_NOOP) +# define LIBXSMM_NOOP +#endif + +/** Check if M, N, K, or LDx fits into the descriptor. */ +#if (0 != LIBXSMM_ILP64) +# define LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K) (0xFFFFFFFF >= (M) && 0xFFFFFFFF >= (N) && 0xFFFFFFFF >= (K)) +#else /* always fits */ +# define LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K) 1 +#endif + +#if defined(LIBXSMM_ASSERT) /* assert available */ +# define LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K) LIBXSMM_ASSERT(LIBXSMM_GEMM_NO_BYPASS_DIMS(M, N, K)) +#else +# define LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K) +#endif + +#if defined(LIBXSMM_UNPACKED) +# define LIBXSMM_DESCRIPTOR_CLEAR_AUX(DST, SIZE) LIBXSMM_MEMSET127(DST, 0, SIZE) +#else +# define LIBXSMM_DESCRIPTOR_CLEAR_AUX(DST, SIZE) +#endif +#define LIBXSMM_DESCRIPTOR_CLEAR(BLOB) \ + LIBXSMM_ASSERT((LIBXSMM_DESCRIPTOR_MAXSIZE) == sizeof(*(BLOB))); \ + LIBXSMM_DESCRIPTOR_CLEAR_AUX(BLOB, LIBXSMM_DESCRIPTOR_MAXSIZE) + +/** Low-level/internal GEMM descriptor initialization. 
*/ +#define LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, DATA_TYPE, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ + LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(LDA, LDB, LDC); \ + LIBXSMM_GEMM_DESCRIPTOR_DIM_CHECK(M, N, K); \ + LIBXSMM_DESCRIPTOR_CLEAR_AUX(&(DESCRIPTOR), sizeof(DESCRIPTOR)); \ + (DESCRIPTOR).datatype = (unsigned char)(DATA_TYPE); (DESCRIPTOR).prefetch = (unsigned char)(PREFETCH); \ + (DESCRIPTOR).flags = (unsigned int)((FLAGS) \ + /*| (LIBXSMM_NEQ(0, ALPHA) ? 0 : LIBXSMM_GEMM_FLAG_ALPHA_0)*/ \ + | (LIBXSMM_NEQ(0, BETA) ? 0 : LIBXSMM_GEMM_FLAG_BETA_0)); \ + (DESCRIPTOR).m = (unsigned int)(M); (DESCRIPTOR).n = (unsigned int)(N); (DESCRIPTOR).k = (unsigned int)(K); \ + (DESCRIPTOR).lda = (unsigned int)(LDA); (DESCRIPTOR).ldb = (unsigned int)(LDB); (DESCRIPTOR).ldc = (unsigned int)(LDC); \ + (DESCRIPTOR).meltw_datatype_aux = 0; (DESCRIPTOR).c1 = 0; (DESCRIPTOR).c2 = 0; (DESCRIPTOR).c3 = 0; \ + (DESCRIPTOR).meltw_ldx = 0; (DESCRIPTOR).meltw_ldy = 0; (DESCRIPTOR).meltw_ldz = 0; \ + (DESCRIPTOR).meltw_param = 0; (DESCRIPTOR).meltw_flags = 0; \ + (DESCRIPTOR).meltw_operation = 0 + +/** Similar to LIBXSMM_GEMM_DESCRIPTOR, but separately taking the input-/output-precision. */ +#define LIBXSMM_GEMM_DESCRIPTOR2(DESCRIPTOR, IPREC, OPREC, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ + LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, LIBXSMM_GETENUM(IPREC, OPREC), FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) + +/** Declare and construct a GEMM descriptor. */ +#define LIBXSMM_GEMM_DESCRIPTOR_TYPE(DESCRIPTOR, DATA_TYPE, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ + libxsmm_gemm_descriptor DESCRIPTOR; LIBXSMM_GEMM_DESCRIPTOR(DESCRIPTOR, DATA_TYPE, \ + FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) + +/** Similar to LIBXSMM_GEMM_DESCRIPTOR_TYPE, but separately taking the input-/output-precision. 
*/ +#define LIBXSMM_GEMM_DESCRIPTOR2_TYPE(DESCRIPTOR, IPREC, OPREC, FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) \ + LIBXSMM_GEMM_DESCRIPTOR_TYPE(DESCRIPTOR, LIBXSMM_GETENUM(IPREC, OPREC), FLAGS, M, N, K, LDA, LDB, LDC, ALPHA, BETA, PREFETCH) + +#define LIBXSMM_REGDESC_DEFAULT +#define LIBXSMM_REGDESC(START, MODIFIER) \ + START libxsmm_gemm_descriptor MODIFIER gemm; \ + START libxsmm_meltw_descriptor MODIFIER meltw; \ + START libxsmm_meqn_descriptor MODIFIER meqn + +/** +* Packed structure, which stores the argument description of GEMM routines. +* The size of the structure is padded to LIBXSMM_DESCRIPTOR_MAXSIZE. +*/ +LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_gemm_descriptor { + /** Extents of the matrix. */ + unsigned int m, n, k; + /** Leading dimensions. */ + unsigned int lda, ldb, ldc; + /** Set of flags. */ + unsigned int flags; + /** Prefetch strategy. */ + unsigned char prefetch; + /** Denotes the data-type. */ + unsigned char datatype; + /** + * Do not reorder elements between above and below blocks! + */ + /** Denotes of optional eltwise data-type */ + unsigned char meltw_datatype_aux; + /** multipurpose 64-bit field, currently used for: a) stride_a in brgemm */ + unsigned long long c1; + /** multipurpose 64-bit field, currently used for: a) stride_b in brgemm */ + unsigned long long c2; + /** multipurpose 8-bit field, currently used for: a) unroll hint in brgemm */ + unsigned char c3; + /** LDx, LDy, LDz, additional meltw LDs */ + unsigned int meltw_ldx, meltw_ldy, meltw_ldz; + /** optional param field */ + unsigned char meltw_param; + /** Set of flags */ + unsigned short meltw_flags; + /** operation specifier */ + unsigned char meltw_operation; +}; + +/** Packed structure storing the mateltw argument description. */ +LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_meltw_descriptor { + /** LDx, M, and N. */ + unsigned int m, n, ldi, ldo, ldi2, ldi3; + /** Size of data element. 
*/ + unsigned char datatype; + unsigned char datatype2; + /** Set of flags */ + unsigned short flags; + /** optional param field */ + unsigned char param; + /** operation specifier */ + unsigned char operation; +}; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pspgemm_csr_descriptor { + const libxsmm_gemm_descriptor* gemm; + const unsigned int* row_ptr; + const unsigned int* column_idx; + const void* values; + unsigned int packed_width; +} libxsmm_pspgemm_csr_descriptor; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pspgemm_csc_descriptor { + const libxsmm_gemm_descriptor* gemm; + const unsigned int* column_ptr; + const unsigned int* row_idx; + const void* values; + unsigned int packed_width; +} libxsmm_pspgemm_csc_descriptor; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pgemm_ac_rm_descriptor { + const libxsmm_gemm_descriptor* gemm; + unsigned int packed_width; +} libxsmm_pgemm_ac_rm_descriptor; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_pgemm_bc_rm_descriptor { + const libxsmm_gemm_descriptor* gemm; + unsigned int packed_width; +} libxsmm_pgemm_bc_rm_descriptor; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_csr_reg_descriptor { + const libxsmm_gemm_descriptor* gemm; + const unsigned int* row_ptr; + const unsigned int* column_idx; + const void* values; +} libxsmm_csr_reg_descriptor; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_xcopykernel { + libxsmm_meltwfunction_unary function; + const void* ptr; +} libxsmm_xcopykernel; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_code_pointer { + void (*ptr_fn)(LIBXSMM_VARIADIC); + const void* ptr_const; + void* ptr; + uintptr_t uval; + intptr_t ival; + libxsmm_xmmfunction xgemm; /* GEMM: smm, dmm, wimm, or void-function */ + libxsmm_xmeltwfunction xmateltw; + libxsmm_matrix_eqn_function xmateqn; +} 
libxsmm_code_pointer; + +/** Structure which describes all tensors in LIBXSMM's DNN module */ +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_tensor { + libxsmm_dnn_tensor_datalayout* layout; /* data-layout descriptor */ + void* data; /* pointer to data */ + unsigned char scf; /* fix point scaling factor for this tensor */ +}; + +/* Structure to record segment in stream of code */ +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE segment_t { + int segment_type; + int n_convs; + int aux_index; + int img; + int ofm; + int ifm; +} segment_t; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_layer { + libxsmm_dnn_datatype datatype_in; + libxsmm_dnn_datatype datatype_out; + libxsmm_dnn_conv_desc desc; + libxsmm_dnn_conv_algo algo; + libxsmm_dnn_tensor_format buffer_format; + libxsmm_dnn_tensor_format filter_format; + libxsmm_dnn_conv_fuse_op fuse_ops; + libxsmm_dnn_conv_option options; + int target_archid; + + /* additional size for internal data types */ + int ifhp; + int ifwp; + int ofh; + int ofw; + int ofhp; + int ofwp; + int ifmblock; + int ofmblock; + int blocksifm; + int blocksofm; + int fwd_ofw_rb; + int fwd_ofh_rb; + int bwd_ofw_rb; + int bwd_ofh_rb; + int upd_ofw_rb; + int upd_ofh_rb; + int fm_lp_block; /* additional blocking for low precision datatypes of feature maps */ + int blocksifm_blocking; + int blocksofm_blocking; + int avoid_acc_load; + int avoid_acc_load_bwd; + int pack_input; + int pack_input_bwd; + int spread_input_bwd; + int weight_copies; + int loop_order; + int use_ofm_parallelization; + int use_ifm_parallelization; + int avoid_fmas_in_rim; + int upd_use_batchreduce; + int upd_pack_input; + int upd_loop_order; + int upd_linearized_tasklist; + int upd_avoid_rim_fmas; + int fwd_flags; + int bwd_flags; + int shuffle_filter_accesses; + int use_fallback_fwd_loops; + int use_fallback_bwd_loops; + int fwd_gemm_pixels; + int bwd_gemm_pixels; + int input_pixels; + int output_pixels; + int n_used_pixels; + int pixel_blocking; + 
int use_intermediate_f32_wt_tensor; + int upd_linearized_pixels; + int ifwp_extended; + int ofwp_extended; + int batchreduce_h_pixels; + int on_the_fly_input_packing; + int upd_pack_input_upfront; + int use_hybrid_imgofm_parallelization; + int remainder_pixels; + int pack_to_cnhw; + int fuse_upd_transposes; + int compute_pixels; + int upd_trans_w_only; + int fwd_padding_copy; + int upd_padding_copy; + int block_fwd_oj; + int block_fwd_ifm; + int block_fwd_ofm; + int block_bwd_oj; + int block_bwd_ifm; + int block_bwd_ofm; + int block_upd_ifm; + int block_upd_ofm; + + libxsmm_meltwfunction_unary tr_kernel; + libxsmm_meltwfunction_unary fwd_cvtfp32bf16_kernel; + + /* Hoisting the compute kernels for FWD */ + libxsmm_bsmmfunction fwd_config_kernel; + libxsmm_bsmmfunction_reducebatch_addr fwd_compute_kernel_addr; + libxsmm_bsmmfunction_reducebatch_offs fwd_compute_kernel_offs_b; + libxsmm_bmmfunction_reducebatch_offs fwd_compute_kernel_offs_a; + libxsmm_bmmfunction_reducebatch_strd fwd_compute_kernel_strd; + libxsmm_smmfunction_reducebatch_addr fwd_compute_kernel_addr_a_f32; + libxsmm_smmfunction_reducebatch_addr fwd_compute_kernel_addr_b_f32; + libxsmm_smmfunction_reducebatch_offs fwd_compute_kernel_offs_f32; + libxsmm_smmfunction_reducebatch_strd fwd_compute_kernel_strd_f32; + + /* Hoisting the compute kernels for BWD */ + libxsmm_bsmmfunction bwd_config_kernel; + libxsmm_bsmmfunction_reducebatch_addr bwd_compute_kernel_addr; + libxsmm_bsmmfunction_reducebatch_offs bwd_compute_kernel_offs; + libxsmm_bsmmfunction_reducebatch_strd bwd_compute_kernel_strd; + + /* Hoisting the compute kernels for UPD */ + libxsmm_bsmmfunction upd_config_kernel; + libxsmm_bsmmfunction_reducebatch_strd upd_compute_kernel_brgemm_no_linearized_pixels; + libxsmm_bsmmfunction_reducebatch_strd upd_compute_kernel_brgemm_linearized_pixels_hybrid_par_no_cnhw; + libxsmm_bsmmfunction upd_compute_kernel_gemm_linearized_pixels_hybrid_par_cnhw; + libxsmm_bsmmfunction 
upd_compute_kernel_gemm_linearized_pixels_no_hybrid_par; + + libxsmm_bsmmfunction tilerelease_kernel; + + unsigned long long *A_offsets; + unsigned long long *B_offsets; + unsigned long long *A_offsets_bwd; + unsigned long long *B_offsets_bwd; + + /* AMX specific fields */ + int x_rows; + int n_pixel_tiles; + int n_ofm_tiles; + int wrb_1; + int wrb_2; + int wrb_3; + int wrb_4; + int hrb_1; + int hrb_2; + int n_compute_pixels; + int pixels; + int linearize_pixels; + int split_pixel; + int reconfig; + int zero_rim; + char tc[64]; + char tc2[64]; + char tc_upd[64]; + int input_padded_pixels; + int output_padded_pixels; + int blocks_pixels; + /* End of AMX specific fields */ + + /* internal data representation */ + libxsmm_dnn_tensor* reg_input; + libxsmm_dnn_tensor* reg_output; + libxsmm_dnn_tensor* reg_filter; + libxsmm_dnn_tensor* grad_input; + libxsmm_dnn_tensor* grad_output; + libxsmm_dnn_tensor* grad_filter; + libxsmm_dnn_tensor* reg_bias; + libxsmm_dnn_tensor* grad_bias; + /* internal data representations for copies of tensors */ + libxsmm_dnn_tensor* reg_input_tr; + libxsmm_dnn_tensor* reg_filter_tr; + /* batchnorm stats */ + libxsmm_dnn_tensor* batch_stats; + /* maxstats used in low-precision kernels */ + libxsmm_dnn_tensor* maxstats_fwd; + libxsmm_dnn_tensor* maxstats_bwd; + libxsmm_dnn_tensor* maxstats_upd; + + /* barrier */ + libxsmm_barrier* barrier; + + /* scratch */ + size_t fwd_packing_padding_scratch_size; + size_t fwd_lp_output_full_scratch_size; + size_t fwd_lp_output_block_scratch_size; + size_t fwd_packing_padding_scratch_offset; + size_t fwd_lp_output_full_scratch_offset; + size_t fwd_lp_output_block_scratch_offset; + size_t fwd_scratch_size; + + size_t bwd_filter_trans_scratch_size; + size_t bwd_packing_padding_scratch_size; + size_t bwd_lp_input_full_scratch_size; + size_t bwd_filter_trans_scratch_offset; + size_t bwd_packing_padding_scratch_offset; + size_t bwd_lp_input_full_scratch_offset; + size_t bwd_scratch_size; + + size_t 
upd_packing_padding_scratch_size; + size_t upd_lp_output_full_scratch_size; + size_t upd_lp_input_full_scratch_size; + size_t upd_filter_scratch_size; + size_t upd_lp_filter_full_scratch_size; + size_t upd_packing_padding_scratch_offset; + size_t upd_lp_output_full_scratch_offset; + size_t upd_lp_input_full_scratch_offset; + size_t upd_lp_filter_full_scratch_offset; + size_t upd_filter_scratch_offset; + size_t upd_scratch_size; + + void* scratch; + size_t scratch_size; + + libxsmm_code_pointer gemm_fwd; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd2; /* ability to hoist forward GEMMs */ + + /* JIT-generated convolution code */ + libxsmm_code_pointer code_fwd[3]; + libxsmm_code_pointer code_bwd[3]; + libxsmm_code_pointer code_upd[5]; + + libxsmm_code_pointer matcopy_fwd[4]; + libxsmm_code_pointer matcopy_bwd[4]; + libxsmm_code_pointer matcopy_upd[3]; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedbatchnorm { + libxsmm_dnn_fusedbatchnorm_desc desc; + libxsmm_dnn_tensor* reg_input; /* input tensor */ + libxsmm_dnn_tensor* reg_output; /* output tensor */ + libxsmm_dnn_tensor* grad_input; /* grad input tensor */ + libxsmm_dnn_tensor* grad_output; /* grad output tensor */ + libxsmm_dnn_tensor* reg_add; /* elementwise tensor */ + libxsmm_dnn_tensor* grad_add; /* grad elementwise tensor */ + libxsmm_dnn_tensor* reg_beta; /* beta tensor */ + libxsmm_dnn_tensor* reg_gamma; /* gamma tensor */ + libxsmm_dnn_tensor* grad_beta; /* grad beta tensor */ + libxsmm_dnn_tensor* grad_gamma; /* grad gamma tensor */ + libxsmm_dnn_tensor* expvalue; /* expected value */ + libxsmm_dnn_tensor* rcpstddev; /* reciprocal of standard deviation */ + libxsmm_dnn_tensor* variance; /* variance */ + libxsmm_dnn_tensor* relumask; /* relumask */ + libxsmm_barrier* barrier; /* barrier */ + int ifmblock; + int ofmblock; + int blocksifm; + int blocksofm; + size_t scratch_size; + void* scratch; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE
libxsmm_dnn_softmaxloss { + libxsmm_dnn_softmaxloss_desc desc; + libxsmm_dnn_tensor* reg_input; /* input tensor */ + libxsmm_dnn_tensor* reg_output; /* output tensor */ + libxsmm_dnn_tensor* grad_input; /* grad input tensor */ + libxsmm_dnn_tensor* label; /* labels tensor */ + libxsmm_barrier* barrier; /* barrier */ + int bc; + int Bc; + int bn; + int Bn; + float loss; + size_t scratch_size; + void* scratch; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_optimizer { + libxsmm_dnn_optimizer_desc desc; + libxsmm_dnn_tensor* reg_filter; /* filter tensor */ + libxsmm_dnn_tensor* grad_filter; /* grad filter tensor */ + libxsmm_dnn_tensor* master_filter; /* master filter tensor */ + libxsmm_barrier* barrier; /* barrier */ + int bc; + int Bc; + int bk; + int Bk; + int fm_lp_block; + size_t scratch_size; + void* scratch; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fusedgroupnorm { + libxsmm_dnn_fusedgroupnorm_desc desc; + libxsmm_dnn_tensor* reg_input; /* input tensor */ + libxsmm_dnn_tensor* reg_output; /* output tensor */ + libxsmm_dnn_tensor* grad_input; /* grad input tensor */ + libxsmm_dnn_tensor* grad_output; /* grad output tensor */ + libxsmm_dnn_tensor* reg_add; /* elementwise tensor */ + libxsmm_dnn_tensor* grad_add; /* grad elementwise tensor */ + libxsmm_dnn_tensor* reg_beta; /* beta tensor */ + libxsmm_dnn_tensor* reg_gamma; /* gamma tensor */ + libxsmm_dnn_tensor* grad_beta; /* grad beta tensor */ + libxsmm_dnn_tensor* grad_gamma; /* grad gamma tensor */ + libxsmm_dnn_tensor* expvalue; /* expected value */ + libxsmm_dnn_tensor* rcpstddev; /* reciprocal of standard deviation */ + libxsmm_dnn_tensor* variance; /* variance */ + libxsmm_dnn_tensor* relumask; /* relumask */ + libxsmm_barrier* barrier; /* barrier */ + int ifmblock; + int ofmblock; + int blocksifm; + int blocksofm; + size_t scratch_size; + void* scratch; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_fullyconnected { +
libxsmm_dnn_fullyconnected_desc desc; + libxsmm_dnn_tensor* reg_input; /* input tensor */ + libxsmm_dnn_tensor* reg_output; /* output tensor */ + libxsmm_dnn_tensor* grad_input; /* grad input tensor */ + libxsmm_dnn_tensor* grad_output; /* grad output tensor */ + libxsmm_dnn_tensor* reg_filter; /* filter tensor */ + libxsmm_dnn_tensor* grad_filter; /* grad filter tensor */ + libxsmm_dnn_tensor* reg_bias; /* bias tensor */ + libxsmm_dnn_tensor* grad_bias; /* grad bias tensor */ + libxsmm_dnn_tensor* relumask; /* relumask */ + libxsmm_barrier* barrier; /* barrier */ + int target_archid; + + int ifmblock; + int ofmblock; + int blocksifm; + int blocksofm; + /* Parameters to tune/specialize FC algorithms */ + int fwd_2d_blocking; + int bwd_2d_blocking; + int upd_2d_blocking; + int fwd_bf; + int bwd_bf; + int upd_bf; + int fwd_row_teams; + int fwd_column_teams; + int bwd_row_teams; + int bwd_column_teams; + int upd_row_teams; + int upd_column_teams; + int ifm_subtasks; + int ofm_subtasks; + int compressed_A; + int sparsity_factor_A; + + int fm_lp_block; + int bn; + int bk; + int bc; + size_t scratch_size; + size_t doutput_scratch_mark; + void* scratch; + + libxsmm_bsmmfunction fwd_config_kernel; + libxsmm_bsmmfunction bwd_config_kernel; + libxsmm_bsmmfunction upd_config_kernel; + libxsmm_bsmmfunction tilerelease_kernel; + + libxsmm_meltwfunction_unary tr_kernel; + libxsmm_code_pointer gemm_fwd; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd2; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd3; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd4; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd5; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd6; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd7; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd8; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd9; /* ability to hoist
forward GEMMs */ + libxsmm_code_pointer gemm_fwd10; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd11; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd12; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd13; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd14; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd15; /* ability to hoist forward GEMMs */ + libxsmm_code_pointer gemm_fwd16; /* ability to hoist forward GEMMs */ + + libxsmm_code_pointer gemm_bwd; /* ability to hoist backward GEMMs */ + libxsmm_code_pointer gemm_bwd2; /* ability to hoist backward GEMMs */ + libxsmm_code_pointer gemm_bwd3; /* ability to hoist backward GEMMs */ + libxsmm_code_pointer gemm_upd; /* ability to hoist update GEMMs */ + libxsmm_code_pointer gemm_upd2; /* ability to hoist update GEMMs */ + libxsmm_code_pointer gemm_upd3; /* ability to hoist update GEMMs */ + + /* JITed eltwise kernels... */ + libxsmm_meltwfunction_unary fwd_cvtfp32bf16_kernel; + libxsmm_meltwfunction_unary bwd_cvtfp32bf16_kernel; + libxsmm_meltwfunction_unary bwd_relu_kernel; + libxsmm_meltwfunction_unary fwd_cvtfp32bf16_relu_kernel; + libxsmm_meltwfunction_unary fwd_sigmoid_cvtfp32bf16_kernel; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_pooling { + libxsmm_dnn_pooling_desc desc; + libxsmm_dnn_tensor* reg_input; /* input tensor */ + libxsmm_dnn_tensor* reg_output; /* output tensor */ + libxsmm_dnn_tensor* grad_input; /* grad input tensor */ + libxsmm_dnn_tensor* grad_output; /* grad output tensor */ + libxsmm_dnn_tensor* mask; /* elementwise tensor */ + libxsmm_barrier* barrier; /* barrier */ + int ifmblock; + int ofmblock; + int blocksifm; + int blocksofm; + int ofh; + int ofw; + size_t scratch_size; + void* scratch; +}; + +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_dnn_rnncell { + libxsmm_dnn_rnncell_desc desc; + libxsmm_blasint T; /* sequence length, must be smaller than max sequence 
length in desc */ + libxsmm_blasint bk; + libxsmm_blasint bn; + libxsmm_blasint bc; + libxsmm_blasint lpb; + + /* external tensors */ + libxsmm_dnn_tensor* xt; + libxsmm_dnn_tensor* csp; + libxsmm_dnn_tensor* hp; + libxsmm_dnn_tensor* w; + libxsmm_dnn_tensor* wt; + libxsmm_dnn_tensor* r; + libxsmm_dnn_tensor* rt; + libxsmm_dnn_tensor* b; + libxsmm_dnn_tensor* cst; + libxsmm_dnn_tensor* ht; + libxsmm_dnn_tensor* dxt; + libxsmm_dnn_tensor* dcsp; + libxsmm_dnn_tensor* dhp; + libxsmm_dnn_tensor* dw; + libxsmm_dnn_tensor* dr; + libxsmm_dnn_tensor* db; + libxsmm_dnn_tensor* dcs; + libxsmm_dnn_tensor* dht; + libxsmm_dnn_tensor* it; + libxsmm_dnn_tensor* ft; + libxsmm_dnn_tensor* ot; + libxsmm_dnn_tensor* cit; + libxsmm_dnn_tensor* cot; + float forget_bias; + /* internal state */ + void* internal_z; + /* scratch pointers */ + void* scratch_base; + void* scratch_wT; + void* scratch_rT; + void* scratch_w; + void* scratch_r; + void* scratch_xT; + void* scratch_hT; + void* scratch_deltat; + void* scratch_di; + void* scratch_df; + void* scratch_do; + void* scratch_dci; + void* scratch_diB; + void* scratch_dfB; + void* scratch_dpB; + void* scratch_dciB; + void* scratch_dx; + void* scratch_dhp; + void* scratch_db; + void* scratch_t1; + void* scratch_t2; + void* csp_scratch; + void* cst_scratch; + void* ht_scratch; + void* it_scratch; + void* ft_scratch; + void* ot_scratch; + void* cit_scratch; + void* cot_scratch; + /* options */ + int use_fwd_fused_impl; + int fwd_block; + int bwdupd_block; + int fwd_generic; + int bwdupd_generic; + /* Ability to hoist GEMMs */ + libxsmm_bsmmfunction_reducebatch_strd fwd_kernela; + libxsmm_bsmmfunction_reducebatch_strd fwd_kernelb; + libxsmm_bsmmfunction_reducebatch_addr fwd_tileconfig; + libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernela; + libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernelb; + libxsmm_bsmmfunction_reducebatch_strd bwdupd_kernelc; + libxsmm_bsmmfunction_reducebatch_strd bwdupd_kerneld; + libxsmm_bsmmfunction_reducebatch_addr 
bwdupd_tileconfig; + libxsmm_bsmmfunction tilerelease_kernel; + libxsmm_barrier* barrier; /* barrier */ +}; + +struct LIBXSMM_RETARGETABLE libxsmm_dfsspmdm { + int M; + int N; + int K; + int ldb; + int ldc; + int N_chunksize; + double* a_dense; + libxsmm_dmmfunction kernel; +}; + +struct LIBXSMM_RETARGETABLE libxsmm_sfsspmdm { + int M; + int N; + int K; + int ldb; + int ldc; + int N_chunksize; + float* a_dense; + libxsmm_smmfunction kernel; +}; + +/** Packed structure storing the meqn (matrix equation) argument description. */ +LIBXSMM_EXTERN_C LIBXSMM_PACKED(struct LIBXSMM_RETARGETABLE) libxsmm_meqn_descriptor { + /** LDx, M, and N. */ + unsigned int m, n, ldo; + /** Size of data element. */ + unsigned char datatype; + /** Equation index; NOTE(review): confirm semantics against users of eqn_idx */ + unsigned int eqn_idx; +}; + +typedef enum libxsmm_build_kind { + LIBXSMM_BUILD_KIND_GEMM = LIBXSMM_KERNEL_KIND_MATMUL, + LIBXSMM_BUILD_KIND_MELTW = LIBXSMM_KERNEL_KIND_MELTW, + LIBXSMM_BUILD_KIND_MEQN = LIBXSMM_KERNEL_KIND_MEQN, + LIBXSMM_BUILD_KIND_USER = LIBXSMM_KERNEL_KIND_USER, + LIBXSMM_BUILD_KIND_PGEMMRMAC = LIBXSMM_KERNEL_UNREGISTERED, + LIBXSMM_BUILD_KIND_PGEMMRMBC, + LIBXSMM_BUILD_KIND_PSPGEMM_CSR, + LIBXSMM_BUILD_KIND_PSPGEMM_CSC, + LIBXSMM_BUILD_KIND_SREG +} libxsmm_build_kind; + +/** Integral type (libxsmm_kernel_kind, libxsmm_build_kind). */ +#if defined(LIBXSMM_UNPACKED) +# define LIBXSMM_DESCRIPTOR_BIG(KIND) ((libxsmm_descriptor_kind)((KIND) | 0x8000000000000000)) +# define LIBXSMM_DESCRIPTOR_ISBIG(KIND) ((int)(((libxsmm_descriptor_kind)(KIND)) >> 63)) +# define LIBXSMM_DESCRIPTOR_KIND(KIND) ((int)(((libxsmm_descriptor_kind)(KIND)) & 0x7FFFFFFFFFFFFFFF)) +typedef uint64_t libxsmm_descriptor_kind; +#else +# define LIBXSMM_DESCRIPTOR_BIG(KIND) ((libxsmm_descriptor_kind)((KIND) | 0x80)) +# define LIBXSMM_DESCRIPTOR_ISBIG(KIND) ((int)((KIND) >> 7)) +# define LIBXSMM_DESCRIPTOR_KIND(KIND) ((int)((KIND) & 0x7F)) +typedef unsigned char libxsmm_descriptor_kind; +#endif + +/** All descriptor types, which are valid for code-registration.
*/ +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_descriptor { + char data[LIBXSMM_DESCRIPTOR_MAXSIZE]; + libxsmm_descriptor_kind kind; /* kind: must be the first member */ + LIBXSMM_REGDESC(LIBXSMM_PACKED(struct) { libxsmm_descriptor_kind /*repeated kind*/ pad; , desc; }); + LIBXSMM_PACKED(struct) { libxsmm_descriptor_kind /*repeated kind*/ pad; char desc[1]; } user; +} libxsmm_descriptor; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_build_request { + union { + const void* ptr; /* raw content */ + LIBXSMM_REGDESC(LIBXSMM_REGDESC_DEFAULT, const*); + const libxsmm_pspgemm_csr_descriptor* pspgemm_csr; + const libxsmm_pspgemm_csc_descriptor* pspgemm_csc; + const libxsmm_pgemm_ac_rm_descriptor* pgemmacrm; + const libxsmm_pgemm_bc_rm_descriptor* pgemmbcrm; + const libxsmm_csr_reg_descriptor* sreg; + } descriptor; + libxsmm_build_kind kind; + /* used by user-kind */ + size_t user_size; +} libxsmm_build_request; + +typedef enum libxsmm_malloc_flags { + LIBXSMM_MALLOC_FLAG_DEFAULT = 0, + LIBXSMM_MALLOC_FLAG_SCRATCH = 1, + LIBXSMM_MALLOC_FLAG_PRIVATE = 2, + LIBXSMM_MALLOC_FLAG_REALLOC = 4, + LIBXSMM_MALLOC_FLAG_PHUGE = 8, + LIBXSMM_MALLOC_FLAG_PLOCK = 16, + LIBXSMM_MALLOC_FLAG_MMAP = 32, + LIBXSMM_MALLOC_FLAG_R = 64, + LIBXSMM_MALLOC_FLAG_W = 128, + LIBXSMM_MALLOC_FLAG_X = 256, + LIBXSMM_MALLOC_FLAG_RW = LIBXSMM_MALLOC_FLAG_R | LIBXSMM_MALLOC_FLAG_W, + LIBXSMM_MALLOC_FLAG_WX = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_W, + LIBXSMM_MALLOC_FLAG_RWX = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_RW, + LIBXSMM_MALLOC_FLAG_VALID = LIBXSMM_MALLOC_FLAG_SCRATCH | + LIBXSMM_MALLOC_FLAG_PRIVATE | LIBXSMM_MALLOC_FLAG_REALLOC | + LIBXSMM_MALLOC_FLAG_PHUGE | LIBXSMM_MALLOC_FLAG_PLOCK | + LIBXSMM_MALLOC_FLAG_MMAP | LIBXSMM_MALLOC_FLAG_RWX +} libxsmm_malloc_flags; + +LIBXSMM_EXTERN_C typedef LIBXSMM_RETARGETABLE void* (*libxsmm_realloc_fun)(void* /*ptr*/, size_t /*size*/); + +#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) +LIBXSMM_EXTERN_C typedef struct 
LIBXSMM_RETARGETABLE libxsmm_malloc_fntype { + union { const void* dlsym; void* (*ptr)(size_t, size_t); } alignmem; + union { const void* dlsym; void* (*ptr)(size_t, size_t); } memalign; + union { const void* dlsym; libxsmm_malloc_fun ptr; } malloc; +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + union { const void* dlsym; void* (*ptr)(size_t, size_t); } calloc; +# endif +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + union { const void* dlsym; libxsmm_realloc_fun ptr; } realloc; +# endif + union { const void* dlsym; libxsmm_free_fun ptr; } free; +} libxsmm_malloc_fntype; +LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_fntype libxsmm_malloc_fn); +#endif + +#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) +/* prototypes for GLIBC internal implementation */ +LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_memalign(size_t alignment, size_t size); +LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_malloc(size_t size); +#if defined(LIBXSMM_MALLOC_HOOK_CALLOC) +LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_calloc(size_t num, size_t size); +#endif +#if defined(LIBXSMM_MALLOC_HOOK_REALLOC) +LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void* __libc_realloc(void* ptr, size_t size); +#endif +LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void __libc_free(void* ptr); +#endif /*(defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD)))*/ + +LIBXSMM_API_INTERN void* libxsmm_memalign_internal(size_t alignment, size_t size); + +/* See https://sourceware.org/binutils/docs-2.34/ld/Options.html#index-_002d_002dwrap_003dsymbol */ +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_memalign(size_t alignment, size_t size); +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_malloc(size_t size); +#if defined(LIBXSMM_MALLOC_HOOK_CALLOC) +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_calloc(size_t num, size_t size); +#endif +#if defined(LIBXSMM_MALLOC_HOOK_REALLOC) +LIBXSMM_API_INTERN LIBXSMM_ATTRIBUTE_WEAK void* __real_realloc(void* ptr, size_t size); +#endif +LIBXSMM_API_INTERN 
LIBXSMM_ATTRIBUTE_WEAK void __real_free(void* ptr); + +/** Retrieve internal information about a buffer (default memory domain). */ +LIBXSMM_API int libxsmm_get_malloc_xinfo(const void* memory, size_t* size, int* flags, void** extra); + +/** Initializes malloc hooks and other internals. */ +LIBXSMM_API_INTERN void libxsmm_malloc_init(void); +LIBXSMM_API_INTERN void libxsmm_malloc_finalize(void); + +/** Calculates an alignment depending on supposedly allocated size; alignment can be zero ("auto"). */ +LIBXSMM_API_INTERN size_t libxsmm_alignment(size_t size, size_t alignment); + +/** Same as libxsmm_set_default_allocator, but takes a lock (can be NULL). */ +LIBXSMM_API_INTERN int libxsmm_xset_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); +/** Same as libxsmm_get_default_allocator, but takes a lock (can be NULL). */ +LIBXSMM_API_INTERN int libxsmm_xget_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); + +/** Same as libxsmm_set_scratch_allocator, but takes a lock (can be NULL). */ +LIBXSMM_API_INTERN int libxsmm_xset_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn); +/** Same as libxsmm_get_scratch_allocator, but takes a lock (can be NULL). */ +LIBXSMM_API_INTERN int libxsmm_xget_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn); + +/** + * Attribute memory allocation and protect with only the necessary flags. + * This procedure is expected to run only one time per buffer, and may + * relocate the given memory. + */ +LIBXSMM_API_INTERN int libxsmm_malloc_attrib(void** memory, int flags, + /** If a name is given, an executable buffer will be dumped into a file. 
*/ + const char* name); + +/** Like libxsmm_release_scratch, but takes a lock (can be NULL). */ +LIBXSMM_API_INTERN void libxsmm_xrelease_scratch(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock); + +/** Allocate memory of the requested size, which is aligned according to the given alignment. */ +LIBXSMM_API int libxsmm_xmalloc(void** memory, size_t size, size_t alignment, int flags, + /* The extra information is stored along with the allocated chunk; can be NULL/zero. */ + const void* extra, size_t extra_size); +/** Release memory, which was allocated using libxsmm_[*]malloc. */ +LIBXSMM_API void libxsmm_xfree(const void* memory, int check); + +/** + * Format for instance an amount of Bytes like libxsmm_format_value(result, sizeof(result), nbytes, "KMGT", "B", 10). + * The value returned is in requested/determined unit so that the user can decide about printing the buffer. + */ +LIBXSMM_API_INTERN size_t libxsmm_format_value(char buffer[32], int buffer_size, size_t nbytes, const char scale[], const char* unit, int base); + +/** Returns the type-name of data-type (can be also libxsmm_gemm_precision). */ +LIBXSMM_API_INTERN const char* libxsmm_typename(libxsmm_datatype datatype); + +/** Dump data and (optionally) checks attempt to dump different data into an existing file (unique). */ +LIBXSMM_API_INTERN int libxsmm_dump(const char* title, const char* name, const void* data, size_t size, int unique); + +/** Services a build request, and (optionally) registers the code (use regindex=LIBXSMM_CAPACITY_REGISTRY for unmanaged code). */ +LIBXSMM_API_INTERN int libxsmm_build(const libxsmm_build_request* request, unsigned int regindex, libxsmm_code_pointer* code); + +/** Returns the type-size of data-type (can be also libxsmm_gemm_precision). */ +LIBXSMM_API unsigned char libxsmm_typesize(libxsmm_datatype datatype); + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE libxsmm_kernel_xinfo { + /** Non-zero if kernel is registered. 
*/ + unsigned int registered; + /** Number of FLoating Point OPerationS (FLOPS). */ + unsigned int nflops; +} libxsmm_kernel_xinfo; + +/** Receives information about JIT-generated code. */ +LIBXSMM_API_INTERN const libxsmm_kernel_xinfo* libxsmm_get_kernel_xinfo(libxsmm_code_pointer code, const libxsmm_descriptor** desc, size_t* code_size); + +/** Calculates duration in seconds from given RTC ticks. */ +LIBXSMM_API_INTERN double libxsmm_timer_duration_rtc(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1); +/** Returns the current tick of platform-specific real-time clock. */ +LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_rtc(void); +/** Returns the current tick of a (monotonic) platform-specific counter. */ +LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_tsc(void); + +LIBXSMM_API_INTERN void libxsmm_memory_init(int target_arch); +LIBXSMM_API_INTERN void libxsmm_memory_finalize(void); + +LIBXSMM_API_INTERN void libxsmm_dnn_init(int target_arch); +LIBXSMM_API_INTERN void libxsmm_dnn_finalize(void); + +/** Internal function to calculate blockings; it is private API, hence it is declared in this header. */ +LIBXSMM_API_INTERN libxsmm_dnn_err_t libxsmm_dnn_get_feature_map_blocks( + int C, int K, int* C_block, int* K_block, int* fm_lp_block, + libxsmm_dnn_datatype datatype_in, libxsmm_dnn_datatype datatype_out); + +/** Global lock; create a separate lock for an independent domain. */ +LIBXSMM_APIVAR_PUBLIC(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK) libxsmm_lock_global); +/** Determines whether a threaded implementation is synchronized or not. */ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_nosync); + +/** Function used to allocate default memory. */ +LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_function libxsmm_default_malloc_fn); +/** Function used to allocate scratch memory. */ +LIBXSMM_APIVAR_PRIVATE(libxsmm_malloc_function libxsmm_scratch_malloc_fn); +/** Function used to release default memory. 
*/ +LIBXSMM_APIVAR_PRIVATE(libxsmm_free_function libxsmm_default_free_fn); +/** Function used to release scratch memory. */ +LIBXSMM_APIVAR_PRIVATE(libxsmm_free_function libxsmm_scratch_free_fn); +/** If non-NULL, this context is used by the context-form of memory allocation. */ +LIBXSMM_APIVAR_PRIVATE(const void* libxsmm_default_allocator_context); +/** If non-NULL, this context is used by the context-form of memory allocation. */ +LIBXSMM_APIVAR_PRIVATE(const void* libxsmm_scratch_allocator_context); +/** Number of scratch memory pools used; clamped against internal maximum. */ +LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_scratch_pools); +/** Growth factor used to scale the scratch memory in case of reallocation. */ +LIBXSMM_APIVAR_PRIVATE(double libxsmm_scratch_scale); +/** Number of seconds per RDTSC-cycle (zero or negative if RDTSC invalid). */ +LIBXSMM_APIVAR_PRIVATE(double libxsmm_timer_scale); +/** Counts the number of attempts to create an SPMDM-handle. */ +LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_statistic_num_spmdm); +/** Counts the maximum number of threads that have been active. */ +LIBXSMM_APIVAR_PRIVATE(unsigned int libxsmm_thread_count); + +#if (0 != LIBXSMM_SYNC) +LIBXSMM_APIVAR_PRIVATE(LIBXSMM_TLS_TYPE libxsmm_tlskey); +#endif + +#endif /*LIBXSMM_MAIN_H*/ + diff --git a/third_party/libxsmm/src/libxsmm_malloc.c b/third_party/libxsmm/src/libxsmm_malloc.c new file mode 100644 index 00000000..6555d5cc --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_malloc.c @@ -0,0 +1,2617 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include "libxsmm_trace.h" +#include "libxsmm_main.h" +#include "libxsmm_hash.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) +# include +# include +#endif +#if !defined(LIBXSMM_MALLOC_GLIBC) +# if defined(__GLIBC__) +# define LIBXSMM_MALLOC_GLIBC __GLIBC__ +# else +# define LIBXSMM_MALLOC_GLIBC 6 +# endif +#endif +#if defined(_WIN32) +# include +# include +# include +#else +# include +# if defined(__linux__) +# include +# include +# endif +# if defined(MAP_POPULATE) +# include +# endif +# include +# include +# include +# include +# if defined(__MAP_ANONYMOUS) +# define LIBXSMM_MAP_ANONYMOUS __MAP_ANONYMOUS +# elif defined(MAP_ANONYMOUS) +# define LIBXSMM_MAP_ANONYMOUS MAP_ANONYMOUS +# elif defined(MAP_ANON) +# define LIBXSMM_MAP_ANONYMOUS MAP_ANON +# else +# define LIBXSMM_MAP_ANONYMOUS 0x20 +# endif +# if defined(MAP_SHARED) +# define LIBXSMM_MAP_SHARED MAP_SHARED +# else +# define LIBXSMM_MAP_SHARED 0 +# endif +LIBXSMM_EXTERN int ftruncate(int, off_t) LIBXSMM_THROW; +LIBXSMM_EXTERN int mkstemp(char*) LIBXSMM_NOTHROW; +#endif +#if !defined(LIBXSMM_MALLOC_FINAL) +# define LIBXSMM_MALLOC_FINAL 3 +#endif +#if defined(LIBXSMM_VTUNE) +# if (2 <= LIBXSMM_VTUNE) /* no header file required */ +# if !defined(LIBXSMM_VTUNE_JITVERSION) +# define LIBXSMM_VTUNE_JITVERSION LIBXSMM_VTUNE +# endif +# define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load_V2 +# define LIBXSMM_VTUNE_JIT_LOAD 21 +# define LIBXSMM_VTUNE_JIT_UNLOAD 14 +# define iJIT_SAMPLING_ON 0x0001 +LIBXSMM_EXTERN unsigned int iJIT_GetNewMethodID(void); +LIBXSMM_EXTERN /*iJIT_IsProfilingActiveFlags*/int iJIT_IsProfilingActive(void); +LIBXSMM_EXTERN int iJIT_NotifyEvent(/*iJIT_JVM_EVENT*/int event_type, void *EventSpecificData); +LIBXSMM_EXTERN_C typedef struct LineNumberInfo { + unsigned int Offset; + unsigned int 
LineNumber; +} LineNumberInfo; +LIBXSMM_EXTERN_C typedef struct iJIT_Method_Load_V2 { + unsigned int method_id; + char* method_name; + void* method_load_address; + unsigned int method_size; + unsigned int line_number_size; + LineNumberInfo* line_number_table; + char* class_file_name; + char* source_file_name; + char* module_name; +} iJIT_Method_Load_V2; +# else /* more safe due to header dependency */ +# include +# if !defined(LIBXSMM_VTUNE_JITVERSION) +# define LIBXSMM_VTUNE_JITVERSION 2 +# endif +# if (2 <= LIBXSMM_VTUNE_JITVERSION) +# define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load_V2 +# define LIBXSMM_VTUNE_JIT_LOAD iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED_V2 +# else +# define LIBXSMM_VTUNE_JIT_DESC_TYPE iJIT_Method_Load +# define LIBXSMM_VTUNE_JIT_LOAD iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED +# endif +# define LIBXSMM_VTUNE_JIT_UNLOAD iJVM_EVENT_TYPE_METHOD_UNLOAD_START +# endif +# if !defined(LIBXSMM_MALLOC_FALLBACK) +# define LIBXSMM_MALLOC_FALLBACK LIBXSMM_MALLOC_FINAL +# endif +#else /* VTune JIT-API not enabled */ +# if !defined(LIBXSMM_MALLOC_FALLBACK) +# define LIBXSMM_MALLOC_FALLBACK 0 +# endif +#endif /*defined(LIBXSMM_VTUNE)*/ +#if !defined(LIBXSMM_MALLOC_XMAP_TEMPLATE) +# define LIBXSMM_MALLOC_XMAP_TEMPLATE ".libxsmm_jit." 
LIBXSMM_MKTEMP_PATTERN +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif +#if defined(LIBXSMM_PERF) +# include "libxsmm_perf.h" +#endif + +#if !defined(LIBXSMM_MALLOC_ALIGNMAX) +# define LIBXSMM_MALLOC_ALIGNMAX (2 << 20) /* 2 MB */ +#endif +#if !defined(LIBXSMM_MALLOC_ALIGNFCT) +# define LIBXSMM_MALLOC_ALIGNFCT 16 +#endif +#if !defined(LIBXSMM_MALLOC_SEED) +# define LIBXSMM_MALLOC_SEED 1051981 +#endif + +#if !defined(LIBXSMM_MALLOC_HOOK_KMP) && 0 +# define LIBXSMM_MALLOC_HOOK_KMP +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) && 0 +# define LIBXSMM_MALLOC_HOOK_QKMALLOC +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_IMALLOC) && 1 +# define LIBXSMM_MALLOC_HOOK_IMALLOC +#endif +#if !defined(LIBXSMM_MALLOC_HOOK_CHECK) && 0 +# define LIBXSMM_MALLOC_HOOK_CHECK 1 +#endif + +#if !defined(LIBXSMM_MALLOC_CRC_LIGHT) && !defined(_DEBUG) && 1 +# define LIBXSMM_MALLOC_CRC_LIGHT +#endif +#if !defined(LIBXSMM_MALLOC_CRC_OFF) +# if defined(NDEBUG) && !defined(LIBXSMM_MALLOC_HOOK) +# define LIBXSMM_MALLOC_CRC_OFF +# elif !defined(LIBXSMM_BUILD) +# define LIBXSMM_MALLOC_CRC_OFF +# endif +#endif + +#if !defined(LIBXSMM_MALLOC_SCRATCH_LIMIT) +# define LIBXSMM_MALLOC_SCRATCH_LIMIT 0xFFFFFFFF /* ~4 GB */ +#endif +#if !defined(LIBXSMM_MALLOC_SCRATCH_PADDING) +# define LIBXSMM_MALLOC_SCRATCH_PADDING LIBXSMM_CACHELINE +#endif +/* pointers are checked first if they belong to scratch */ +#if !defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) && 1 +# define LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST +#endif +/* can clobber memory if allocations are not exactly scoped */ +#if !defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD) && 0 +# define LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD +#endif +#if !defined(LIBXSMM_MALLOC_SCRATCH_JOIN) && 1 +# define LIBXSMM_MALLOC_SCRATCH_JOIN +#endif +#if !defined(LIBXSMM_MALLOC_HUGE_PAGES) && 1 +# define LIBXSMM_MALLOC_HUGE_PAGES +#endif +#if !defined(LIBXSMM_MALLOC_LOCK_PAGES) && 1 +/* 0: on-map, 1: mlock, 2: mlock2/on-fault */ +# define 
LIBXSMM_MALLOC_LOCK_PAGES 1 +#endif +#if !defined(LIBXSMM_MALLOC_LOCK_ALL) && \ + defined(LIBXSMM_MALLOC_ALIGN_ALL) && 0 +# define LIBXSMM_MALLOC_LOCK_ALL +#endif +/* record real allocation size */ +#if !defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) && 0 +# define LIBXSMM_MALLOC_INFO_ALLOCSIZE +#endif +/* protected against double-delete (if possible) */ +#if !defined(LIBXSMM_MALLOC_DELETE_SAFE) && 0 +# define LIBXSMM_MALLOC_DELETE_SAFE +#elif !defined(NDEBUG) +# define LIBXSMM_MALLOC_DELETE_SAFE +#endif + +#define INTERNAL_MEMALIGN_REAL(RESULT, ALIGNMENT, SIZE) do { \ + const size_t internal_memalign_real_alignment_ = INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT); \ + (RESULT) = (0 != internal_memalign_real_alignment_ \ + ? __real_memalign(internal_memalign_real_alignment_, SIZE) \ + : __real_malloc(SIZE)); \ +} while(0) +#define INTERNAL_REALLOC_REAL(RESULT, PTR, SIZE) (RESULT) = __real_realloc(PTR, SIZE) +#define INTERNAL_FREE_REAL(PTR) __real_free(PTR) + +#if defined(LIBXSMM_MALLOC_LOCK_ALL) && defined(LIBXSMM_MALLOC_LOCK_PAGES) && 0 != (LIBXSMM_MALLOC_LOCK_PAGES) +# if 1 == (LIBXSMM_MALLOC_LOCK_PAGES) || !defined(MLOCK_ONFAULT) || !defined(SYS_mlock2) +# define INTERNAL_MALLOC_LOCK_PAGES(BUFFER, SIZE) if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_MALLOC_ALIGNMAX) <= (SIZE)) \ + mlock(BUFFER, SIZE) +# else +# define INTERNAL_MALLOC_LOCK_PAGES(BUFFER, SIZE) if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_MALLOC_ALIGNMAX) <= (SIZE)) \ + syscall(SYS_mlock2, BUFFER, SIZE, MLOCK_ONFAULT) +# endif +#else +# define INTERNAL_MALLOC_LOCK_PAGES(BUFFER, SIZE) +#endif + +#if defined(LIBXSMM_MALLOC_ALIGN_ALL) +# define INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT) libxsmm_alignment(SIZE, ALIGNMENT) +#else +# define INTERNAL_MALLOC_AUTOALIGN(SIZE, ALIGNMENT) (ALIGNMENT) +#endif + +#if defined(LIBXSMM_MALLOC_HOOK) && defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) +# define INTERNAL_MEMALIGN_HOOK(RESULT, FLAGS, ALIGNMENT, SIZE, CALLER) { \ + const int internal_memalign_hook_recursive_ = 
LIBXSMM_ATOMIC_ADD_FETCH( \ + &internal_malloc_recursive, 1, LIBXSMM_ATOMIC_RELAXED); \ + if ( 1 < internal_memalign_hook_recursive_ /* protect against recursion */ \ + || 0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ + || (internal_malloc_limit[0] > (SIZE)) \ + || (internal_malloc_limit[1] < (SIZE) && 0 != internal_malloc_limit[1])) \ + { \ + INTERNAL_MEMALIGN_REAL(RESULT, ALIGNMENT, SIZE); \ + } \ + else { /* redirect */ \ + LIBXSMM_INIT \ + if (NULL == (CALLER)) { /* libxsmm_trace_caller_id may allocate memory */ \ + internal_scratch_malloc(&(RESULT), SIZE, ALIGNMENT, FLAGS, \ + libxsmm_trace_caller_id(0/*level*/)); \ + } \ + else { \ + internal_scratch_malloc(&(RESULT), SIZE, ALIGNMENT, FLAGS, CALLER); \ + } \ + } \ + LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_recursive, 1, LIBXSMM_ATOMIC_RELAXED); \ + } +# define INTERNAL_REALLOC_HOOK(RESULT, FLAGS, PTR, SIZE, CALLER) \ + if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ + /*|| (0 != LIBXSMM_ATOMIC_LOAD(&internal_malloc_recursive, LIBXSMM_ATOMIC_RELAXED))*/ \ + || (internal_malloc_limit[0] > (SIZE)) \ + || (internal_malloc_limit[1] < (SIZE) && 0 != internal_malloc_limit[1])) \ + { \ + INTERNAL_REALLOC_REAL(RESULT, PTR, SIZE); \ + } \ + else { \ + const int nzeros = LIBXSMM_INTRINSICS_BITSCANFWD64((uintptr_t)(PTR)), alignment = 1 << nzeros; \ + LIBXSMM_ASSERT(0 == ((uintptr_t)(PTR) & ~(0xFFFFFFFFFFFFFFFF << nzeros))); \ + if (NULL == (CALLER)) { /* libxsmm_trace_caller_id may allocate memory */ \ + internal_scratch_malloc(&(PTR), SIZE, (size_t)alignment, FLAGS, \ + libxsmm_trace_caller_id(0/*level*/)); \ + } \ + else { \ + internal_scratch_malloc(&(PTR), SIZE, (size_t)alignment, FLAGS, CALLER); \ + } \ + (RESULT) = (PTR); \ + } +# define INTERNAL_FREE_HOOK(PTR, CALLER) { \ + LIBXSMM_UNUSED(CALLER); \ + if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind \ + /*|| (0 != LIBXSMM_ATOMIC_LOAD(&internal_malloc_recursive, LIBXSMM_ATOMIC_RELAXED))*/ \ + ){ \ + 
INTERNAL_FREE_REAL(PTR); \ + } \ + else { /* recognize pointers not issued by LIBXSMM */ \ + libxsmm_free(PTR); \ + } \ + } +#elif defined(LIBXSMM_MALLOC_ALIGN_ALL) +# define INTERNAL_MEMALIGN_HOOK(RESULT, FLAGS, ALIGNMENT, SIZE, CALLER) do { \ + LIBXSMM_UNUSED(FLAGS); LIBXSMM_UNUSED(CALLER); \ + INTERNAL_MEMALIGN_REAL(RESULT, ALIGNMENT, SIZE); \ + INTERNAL_MALLOC_LOCK_PAGES(RESULT, SIZE); \ + } while(0) +# define INTERNAL_REALLOC_HOOK(RESULT, FLAGS, PTR, SIZE, CALLER) do { \ + LIBXSMM_UNUSED(FLAGS); LIBXSMM_UNUSED(CALLER); \ + INTERNAL_REALLOC_REAL(RESULT, PTR, SIZE); \ + INTERNAL_MALLOC_LOCK_PAGES(RESULT, SIZE); \ + } while(0) +# define INTERNAL_FREE_HOOK(PTR, CALLER) do { \ + LIBXSMM_UNUSED(CALLER); \ + INTERNAL_FREE_REAL(PTR); \ + } while(0) +#endif + +#if !defined(WIN32) +# if defined(MAP_32BIT) +# define INTERNAL_XMALLOC_MAP32(ENV, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) \ + if (MAP_FAILED == (BUFFER) && 0 != (MAP_32BIT & (MFLAGS))) { \ + (BUFFER) = internal_xmalloc_xmap(ENV, SIZE, (MFLAGS) & ~MAP_32BIT, REPTR); \ + if (MAP_FAILED != (BUFFER)) (MAPSTATE) = 0; \ + } +# else +# define INTERNAL_XMALLOC_MAP32(ENV, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) +# endif + +# define INTERNAL_XMALLOC(I, ENTRYPOINT, ENVVAR, ENVDEF, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR) \ + if ((ENTRYPOINT) <= (I) && (MAP_FAILED == (BUFFER) || NULL == (BUFFER))) { \ + static const char* internal_xmalloc_env_ = NULL; \ + LIBXSMM_ASSERT(NULL != (ENVVAR) && '\0' != *(ENVVAR)); \ + if (NULL == internal_xmalloc_env_) { \ + internal_xmalloc_env_ = getenv(ENVVAR); \ + if (NULL == internal_xmalloc_env_) internal_xmalloc_env_ = ENVDEF; \ + } \ + (BUFFER) = internal_xmalloc_xmap(internal_xmalloc_env_, SIZE, MFLAGS, REPTR); \ + INTERNAL_XMALLOC_MAP32(internal_xmalloc_env_, MAPSTATE, MFLAGS, SIZE, BUFFER, REPTR); \ + if (MAP_FAILED != (BUFFER)) (ENTRYPOINT) = (I); \ + } + +# define INTERNAL_XMALLOC_WATERMARK(NAME, WATERMARK, LIMIT, SIZE) { \ + const size_t internal_xmalloc_watermark_ = (WATERMARK) + 
(SIZE) / 2; /* accept data-race */ \ + if (internal_xmalloc_watermark_ < (LIMIT)) { \ + static size_t internal_xmalloc_watermark_verbose_ = 0; \ + (LIMIT) = internal_xmalloc_watermark_; /* accept data-race */ \ + if (internal_xmalloc_watermark_verbose_ < internal_xmalloc_watermark_ && \ + (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity)) \ + { /* muted */ \ + char internal_xmalloc_watermark_buffer_[32]; \ + /* coverity[check_return] */ \ + libxsmm_format_value(internal_xmalloc_watermark_buffer_, sizeof(internal_xmalloc_watermark_buffer_), \ + internal_xmalloc_watermark_, "KM", "B", 10); \ + fprintf(stderr, "LIBXSMM WARNING: " NAME " watermark reached at %s!\n", internal_xmalloc_watermark_buffer_); \ + internal_xmalloc_watermark_verbose_ = internal_xmalloc_watermark_; \ + } \ + } \ +} + +# define INTERNAL_XMALLOC_KIND(KIND, NAME, FLAG, FLAGS, MFLAGS, WATERMARK, LIMIT, INFO, SIZE, BUFFER) \ + if (0 != ((KIND) & (MFLAGS))) { \ + if (MAP_FAILED != (BUFFER)) { \ + LIBXSMM_ASSERT(NULL != (BUFFER)); \ + LIBXSMM_ATOMIC_ADD_FETCH(&(WATERMARK), SIZE, LIBXSMM_ATOMIC_RELAXED); \ + (FLAGS) |= (FLAG); \ + } \ + else { /* retry */ \ + (BUFFER) = mmap(NULL == (INFO) ? 
NULL : (INFO)->pointer, SIZE, PROT_READ | PROT_WRITE, \ + MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | ((MFLAGS) & ~(KIND)), -1, 0/*offset*/); \ + if (MAP_FAILED != (BUFFER)) { /* successful retry */ \ + LIBXSMM_ASSERT(NULL != (BUFFER)); \ + INTERNAL_XMALLOC_WATERMARK(NAME, WATERMARK, LIMIT, SIZE); \ + } \ + (FLAGS) &= ~(FLAG); \ + } \ + } \ + else (FLAGS) &= ~(FLAG) +#endif + + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_malloc_info_type { + libxsmm_free_function free; + void *pointer, *reloc; + const void* context; +#if defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) + /* real/allocated size */ + size_t size_alloc; +#endif + /* user-requested size */ + size_t size; + int flags; +#if defined(LIBXSMM_VTUNE) + unsigned int code_id; +#endif +#if !defined(LIBXSMM_MALLOC_CRC_OFF) /* hash *must* be the last entry */ + unsigned int hash; +#endif +} internal_malloc_info_type; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_malloc_pool_type { + char pad[LIBXSMM_MALLOC_SCRATCH_PADDING]; + struct { + size_t minsize, counter, incsize; + char *buffer, *head; +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const void* site; +# if (0 != LIBXSMM_SYNC) + unsigned int tid; +# endif +#endif + } instance; +} internal_malloc_pool_type; + +/* Scratch pool, which supports up to MAX_NSCRATCH allocation sites. */ +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) +/* LIBXSMM_ALIGNED appears to contradict LIBXSMM_APIVAR, and causes multiple defined symbols (if below is seen in multiple translation units) */ +LIBXSMM_APIVAR_DEFINE(char internal_malloc_pool_buffer[(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)*sizeof(internal_malloc_pool_type)+(LIBXSMM_MALLOC_SCRATCH_PADDING)-1]); +#endif +/* Maximum total size of the scratch memory domain. 
*/ +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_scratch_limit); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_scratch_nmallocs); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_private_max); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_private_cur); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_public_max); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_public_cur); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_local_max); +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_local_cur); +LIBXSMM_APIVAR_DEFINE(int internal_malloc_recursive); +/** 0: regular, 1/odd: intercept/scratch, otherwise: all/scratch */ +LIBXSMM_APIVAR_DEFINE(int internal_malloc_kind); +#if defined(LIBXSMM_MALLOC_HOOK) && defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) +/* Interval of bytes that permit interception (internal_malloc_kind) */ +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_limit[2]); +#endif +#if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) +LIBXSMM_APIVAR_DEFINE(int internal_malloc_join); +#endif +#if !defined(_WIN32) +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_hugetlb); +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) +LIBXSMM_APIVAR_DEFINE(size_t internal_malloc_plocked); +# endif +#endif + + +LIBXSMM_API_INTERN size_t libxsmm_alignment(size_t size, size_t alignment) +{ + size_t result; + if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_MALLOC_ALIGNMAX) <= size) { + result = libxsmm_lcm(0 == alignment ? (LIBXSMM_ALIGNMENT) : libxsmm_lcm(alignment, LIBXSMM_ALIGNMENT), LIBXSMM_MALLOC_ALIGNMAX); + } + else { /* small-size request */ + if ((LIBXSMM_MALLOC_ALIGNFCT * LIBXSMM_ALIGNMENT) <= size) { + result = (0 == alignment ? 
(LIBXSMM_ALIGNMENT) : libxsmm_lcm(alignment, LIBXSMM_ALIGNMENT)); + } + else if (0 != alignment) { /* custom alignment */ + result = libxsmm_lcm(alignment, sizeof(void*)); + } + else { /* tiny-size request */ + result = sizeof(void*); + } + } + return result; +} + + +LIBXSMM_API size_t libxsmm_offset(const size_t offset[], const size_t shape[], size_t ndims, size_t* size) +{ + size_t result = 0, size1 = 0; + if (0 != ndims && NULL != shape) { + size_t i; + result = (NULL != offset ? offset[0] : 0); + size1 = shape[0]; + for (i = 1; i < ndims; ++i) { + result += (NULL != offset ? offset[i] : 0) * size1; + size1 *= shape[i]; + } + } + if (NULL != size) *size = size1; + return result; +} + + +LIBXSMM_API_INLINE +LIBXSMM_ATTRIBUTE_NO_SANITIZE(address) +internal_malloc_info_type* internal_malloc_info(const void* memory, int check) +{ + const char *const buffer = (const char*)memory; + internal_malloc_info_type* result = (internal_malloc_info_type*)(NULL != memory + ? (buffer - sizeof(internal_malloc_info_type)) : NULL); +#if defined(LIBXSMM_MALLOC_HOOK_CHECK) + if ((LIBXSMM_MALLOC_HOOK_CHECK) < check) check = (LIBXSMM_MALLOC_HOOK_CHECK); +#endif + if (0 != check && NULL != result) { /* check ownership */ +#if !defined(_WIN32) /* mprotect: pass address rounded down to page/4k alignment */ + if (1 == check || 0 == mprotect((void*)(((uintptr_t)result) & 0xFFFFFFFFFFFFF000), + sizeof(internal_malloc_info_type), PROT_READ | PROT_WRITE) || ENOMEM != errno) +#endif + { + const int flags_rs = LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_SCRATCH; + const int flags_px = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_PRIVATE; + const int flags_mx = LIBXSMM_MALLOC_FLAG_X | LIBXSMM_MALLOC_FLAG_MMAP; + const char *const pointer = (const char*)result->pointer; + union { libxsmm_free_fun fun; const void* ptr; } convert; + convert.fun = result->free.function; + if (((flags_mx != (flags_mx & result->flags)) && NULL != result->reloc) + || (0 == (LIBXSMM_MALLOC_FLAG_X & result->flags) ? 
0 : (0 != (flags_rs & result->flags))) + || (0 != (LIBXSMM_MALLOC_FLAG_X & result->flags) && NULL != result->context) +#if defined(LIBXSMM_VTUNE) + || (0 == (LIBXSMM_MALLOC_FLAG_X & result->flags) && 0 != result->code_id) +#endif + || (0 != (~LIBXSMM_MALLOC_FLAG_VALID & result->flags)) + || (0 == (LIBXSMM_MALLOC_FLAG_R & result->flags)) + || (pointer == convert.ptr || pointer == result->context || pointer >= buffer || NULL == pointer) +#if defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) + || (result->size_alloc < result->size) +#endif + || (LIBXSMM_MAX(LIBXSMM_MAX(internal_malloc_public_max, internal_malloc_local_max), internal_malloc_private_max) < result->size + && 0 == (flags_px & result->flags)) || (0 == result->size) + || (2 > libxsmm_ninit) /* before checksum calculation */ +#if !defined(LIBXSMM_MALLOC_CRC_OFF) /* last check: checksum over info */ +# if defined(LIBXSMM_MALLOC_CRC_LIGHT) + || result->hash != LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &result) +# else + || result->hash != libxsmm_crc32(LIBXSMM_MALLOC_SEED, result, + (const char*)&result->hash - (const char*)result) +# endif +#endif + ) { /* mismatch */ + result = NULL; + } + } +#if !defined(_WIN32) + else { /* mismatch */ + result = NULL; + } +#endif + } + return result; +} + + +LIBXSMM_API_INLINE size_t internal_get_scratch_size(const internal_malloc_pool_type* exclude) +{ + size_t result = 0; +#if !defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) || (1 >= (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + LIBXSMM_UNUSED(exclude); +#else + const internal_malloc_pool_type* pool = (const internal_malloc_pool_type*)LIBXSMM_UP2( + (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); +# if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const internal_malloc_pool_type *const end = pool + libxsmm_scratch_pools; + LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + for (; pool != end; ++pool) +# endif /*(1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ + { + if (0 != 
pool->instance.minsize) { +# if 1 /* memory info is not used */ + if (pool != exclude && (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { + result += pool->instance.minsize; + } +# else + const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); + if (NULL != info && pool != exclude && (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { + result += info->size; + } +# endif + } + else break; /* early exit */ + } +#endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ + return result; +} + + +LIBXSMM_API_INLINE internal_malloc_pool_type* internal_scratch_malloc_pool(const void* memory) +{ + internal_malloc_pool_type* result = NULL; + internal_malloc_pool_type* pool = (internal_malloc_pool_type*)LIBXSMM_UP2( + (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); + const char *const buffer = (const char*)memory; +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const unsigned int npools = libxsmm_scratch_pools; +#else + const unsigned int npools = 1; +#endif + internal_malloc_pool_type *const end = pool + npools; + LIBXSMM_ASSERT(npools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + LIBXSMM_ASSERT(NULL != memory); + for (; pool != end; ++pool) { + if (0 != pool->instance.minsize) { + if (0 != /*LIBXSMM_ATOMIC_LOAD(&*/pool->instance.counter/*, LIBXSMM_ATOMIC_SEQ_CST)*/ +#if 1 /* should be implied by non-zero counter */ + && NULL != pool->instance.buffer +#endif + ){/* check if memory belongs to scratch domain or local domain */ +#if 1 + const size_t size = pool->instance.minsize; +#else + const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); + const size_t size = info->size; +#endif + if (pool->instance.buffer == buffer /* fast path */ || + (pool->instance.buffer < buffer && buffer < (pool->instance.buffer + size))) + { + result = pool; + break; + } + } 
+ } + else break; /* early exit */ + } + return result; +} + + +LIBXSMM_API_INTERN int internal_xfree(const void* /*memory*/, internal_malloc_info_type* /*info*/); + + +LIBXSMM_API_INTERN void internal_scratch_free(const void* /*memory*/, internal_malloc_pool_type* /*pool*/); +LIBXSMM_API_INTERN void internal_scratch_free(const void* memory, internal_malloc_pool_type* pool) +{ +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const size_t counter = LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); + char *const pool_buffer = pool->instance.buffer; +# if (!defined(NDEBUG) || defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD)) + char *const buffer = (char*)memory; /* non-const */ + LIBXSMM_ASSERT(pool_buffer <= buffer && buffer < pool_buffer + pool->instance.minsize); +# endif + LIBXSMM_ASSERT(pool_buffer <= pool->instance.head); + if (0 == counter) { /* reuse or reallocate scratch domain */ + internal_malloc_info_type *const info = internal_malloc_info(pool_buffer, 0/*no check*/); + const size_t scale_size = (size_t)(1 != libxsmm_scratch_scale ? 
(libxsmm_scratch_scale * info->size) : info->size); /* hysteresis */ + const size_t size = pool->instance.minsize + pool->instance.incsize; + LIBXSMM_ASSERT(0 == (LIBXSMM_MALLOC_FLAG_X & info->flags)); /* scratch memory is not executable */ + if (size <= scale_size) { /* reuse scratch domain */ + pool->instance.head = pool_buffer; /* reuse scratch domain */ + } + else { /* release buffer */ +# if !defined(NDEBUG) + static int error_once = 0; +# endif + pool->instance.buffer = pool->instance.head = NULL; +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + pool->instance.site = NULL; /* clear affinity */ +# endif +# if !defined(NDEBUG) + if (EXIT_SUCCESS != internal_xfree(pool_buffer, info) /* invalidates info */ + && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n"); + } +# else + internal_xfree(pool_buffer, info); /* !libxsmm_free, invalidates info */ +# endif + } + } +# if defined(LIBXSMM_MALLOC_SCRATCH_TRIM_HEAD) /* TODO: document linear/scoped allocator policy */ + else if (buffer < pool->instance.head) { /* reuse scratch domain */ + pool->instance.head = buffer; + } +# else + LIBXSMM_UNUSED(memory); +# endif +#else + LIBXSMM_UNUSED(memory); LIBXSMM_UNUSED(pool); +#endif +} + + +LIBXSMM_API_INTERN void internal_scratch_malloc(void** /*memory*/, size_t /*size*/, size_t /*alignment*/, int /*flags*/, const void* /*caller*/); +LIBXSMM_API_INTERN void internal_scratch_malloc(void** memory, size_t size, size_t alignment, int flags, const void* caller) +{ + LIBXSMM_ASSERT(NULL != memory && 0 == (LIBXSMM_MALLOC_FLAG_X & flags)); + if (0 == (LIBXSMM_MALLOC_FLAG_REALLOC & flags) || NULL == *memory) { + static int error_once = 0; + size_t local_size = 0; +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + if (0 < 
libxsmm_scratch_pools) { + internal_malloc_pool_type *const pools = (internal_malloc_pool_type*)LIBXSMM_UP2( + (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); + internal_malloc_pool_type *const end = pools + libxsmm_scratch_pools, *pool = pools; + const size_t align_size = libxsmm_alignment(size, alignment), alloc_size = size + align_size - 1; +# if (0 != LIBXSMM_SYNC) + const unsigned int tid = libxsmm_get_tid(); +# endif + unsigned int npools = 1; +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const void *const site = caller; /* no further attempt in case of NULL */ + internal_malloc_pool_type *pool0 = end; + for (; pool != end; ++pool) { /* counter: memory info is not employed as pools are still manipulated */ + if (NULL != pool->instance.buffer) { + if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) ++npools; /* count number of occupied pools */ + if ( /* find matching pool and enter fast path (draw from pool-buffer) */ +# if (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_MALLOC_SCRATCH_JOIN) + (site == pool->instance.site && tid == pool->instance.tid)) +# elif (0 != LIBXSMM_SYNC) + (site == pool->instance.site && (0 != internal_malloc_join || tid == pool->instance.tid))) +# else + (site == pool->instance.site)) +# endif + { + break; + } + } + else { + if (end == pool0) pool0 = pool; /* first available pool*/ + if (0 == pool->instance.minsize) { /* early exit */ + pool = pool0; break; + } + } + } +# endif + LIBXSMM_ASSERT(NULL != pool); + if (end != pool && 0 <= internal_malloc_kind) { + const size_t counter = LIBXSMM_ATOMIC_ADD_FETCH(&pool->instance.counter, (size_t)1, LIBXSMM_ATOMIC_SEQ_CST); + if (NULL != pool->instance.buffer || 1 != counter) { /* attempt to (re-)use existing pool */ + const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 1/*check*/); + const size_t pool_size = ((NULL != info && 0 != counter) ? 
info->size : 0); + const size_t used_size = pool->instance.head - pool->instance.buffer; + const size_t req_size = alloc_size + used_size; + if (req_size <= pool_size) { /* fast path: draw from pool-buffer */ +# if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) + void *const headaddr = &pool->instance.head; + char *const head = (0 == internal_malloc_join + ? (pool->instance.head += alloc_size) + : ((char*)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + (uintptr_t*)headaddr, alloc_size, LIBXSMM_ATOMIC_SEQ_CST))); +# else + char *const head = (char*)(pool->instance.head += alloc_size); +# endif + *memory = LIBXSMM_ALIGN(head - alloc_size, align_size); + } + else { /* fallback to local memory allocation */ + const size_t incsize = req_size - LIBXSMM_MIN(pool_size, req_size); + pool->instance.incsize = LIBXSMM_MAX(pool->instance.incsize, incsize); +# if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) + if (0 == internal_malloc_join) { + --pool->instance.counter; + } + else { + LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); + } +# else + --pool->instance.counter; +# endif + if ( +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + (LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site && +# endif + 0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & flags)) + { + const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + &internal_malloc_local_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); + if (internal_malloc_local_max < watermark) internal_malloc_local_max = watermark; /* accept data-race */ + } + else { + const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + &internal_malloc_private_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); + if (internal_malloc_private_max < watermark) internal_malloc_private_max = watermark; /* accept data-race */ + } + local_size = size; + } + } + else { /* fresh pool */ + const size_t scratch_limit = 
libxsmm_get_scratch_limit(); + const size_t scratch_size = internal_get_scratch_size(pool); /* exclude current pool */ + const size_t limit_size = (1 < npools ? (scratch_limit - LIBXSMM_MIN(scratch_size, scratch_limit)) : LIBXSMM_SCRATCH_UNLIMITED); + const size_t scale_size = (size_t)(1 != libxsmm_scratch_scale ? (libxsmm_scratch_scale * alloc_size) : alloc_size); /* hysteresis */ + const size_t incsize = (size_t)(libxsmm_scratch_scale * pool->instance.incsize); + const size_t maxsize = LIBXSMM_MAX(scale_size, pool->instance.minsize) + incsize; + const size_t limsize = LIBXSMM_MIN(maxsize, limit_size); + const size_t minsize = limsize; + assert(1 <= libxsmm_scratch_scale); /* !LIBXSMM_ASSERT */ + LIBXSMM_ASSERT(1 == counter); + pool->instance.incsize = 0; /* reset */ + pool->instance.minsize = minsize; +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + pool->instance.site = site; +# if (0 != LIBXSMM_SYNC) + pool->instance.tid = tid; +# endif +# endif + if (alloc_size <= minsize && /* allocate scratch pool */ + EXIT_SUCCESS == libxsmm_xmalloc(memory, minsize, 0/*auto-align*/, + (flags | LIBXSMM_MALLOC_FLAG_SCRATCH) & ~LIBXSMM_MALLOC_FLAG_REALLOC, + NULL/*extra*/, 0/*extra_size*/)) + { + pool->instance.buffer = (char*)*memory; + pool->instance.head = pool->instance.buffer + alloc_size; + *memory = LIBXSMM_ALIGN((char*)*memory, align_size); +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) +# endif + { + LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_scratch_nmallocs, 1, LIBXSMM_ATOMIC_RELAXED); + } + } + else { /* fallback to local allocation */ + LIBXSMM_ATOMIC_SUB_FETCH(&pool->instance.counter, 1, LIBXSMM_ATOMIC_SEQ_CST); + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + if (alloc_size <= minsize) { + fprintf(stderr, 
"LIBXSMM ERROR: failed to allocate scratch memory!\n"); + } + else if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != caller + && (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) + { + fprintf(stderr, "LIBXSMM WARNING: scratch memory domain exhausted!\n"); + } + } + local_size = size; + } + } + } + else { /* fallback to local memory allocation */ + local_size = size; + } + } + else { /* fallback to local memory allocation */ + local_size = size; + } + if (0 != local_size) +#else + local_size = size; +#endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ + { /* local memory allocation */ + if (EXIT_SUCCESS != libxsmm_xmalloc(memory, local_size, alignment, + flags & ~(LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_REALLOC), NULL/*extra*/, 0/*extra_size*/) + && /* library code is expected to be mute */0 != libxsmm_verbosity + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: scratch memory fallback failed!\n"); + LIBXSMM_ASSERT(NULL == *memory); + } + if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != caller) { + LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_scratch_nmallocs, 1, LIBXSMM_ATOMIC_RELAXED); + } + } + } + else { /* reallocate memory */ + const void *const preserve = *memory; +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(preserve); + if (NULL != pool) { + const internal_malloc_info_type *const info = internal_malloc_info(pool->instance.buffer, 0/*no check*/); + void* buffer; + LIBXSMM_ASSERT(pool->instance.buffer <= pool->instance.head && NULL != info); + internal_scratch_malloc(&buffer, size, alignment, + ~LIBXSMM_MALLOC_FLAG_REALLOC & (LIBXSMM_MALLOC_FLAG_SCRATCH | flags), caller); + if (NULL != buffer) { + memcpy(buffer, preserve, LIBXSMM_MIN(size, info->size)); /* TODO: memmove? 
*/ + *memory = buffer; + } + internal_scratch_free(memory, pool); + } + else +#endif + { /* non-pooled (potentially foreign pointer) */ +#if !defined(NDEBUG) + const int status = +#endif + libxsmm_xmalloc(memory, size, alignment/* no need here to determine alignment of given buffer */, + ~LIBXSMM_MALLOC_FLAG_SCRATCH & flags, NULL/*extra*/, 0/*extra_size*/); + assert(EXIT_SUCCESS == status || NULL == *memory); /* !LIBXSMM_ASSERT */ + } + } +} + + +#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) +LIBXSMM_APIVAR_PRIVATE_DEF(libxsmm_malloc_fntype libxsmm_malloc_fn); + +#if defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) +LIBXSMM_API_INTERN void* internal_memalign_malloc(size_t /*alignment*/, size_t /*size*/); +LIBXSMM_API_INTERN void* internal_memalign_malloc(size_t alignment, size_t size) +{ + LIBXSMM_UNUSED(alignment); + LIBXSMM_ASSERT(NULL != libxsmm_malloc_fn.malloc.dlsym); + return libxsmm_malloc_fn.malloc.ptr(size); +} +#elif defined(LIBXSMM_MALLOC_HOOK_KMP) +LIBXSMM_API_INTERN void* internal_memalign_twiddle(size_t /*alignment*/, size_t /*size*/); +LIBXSMM_API_INTERN void* internal_memalign_twiddle(size_t alignment, size_t size) +{ + LIBXSMM_ASSERT(NULL != libxsmm_malloc_fn.alignmem.dlsym); + return libxsmm_malloc_fn.alignmem.ptr(size, alignment); +} +#endif +#endif /*defined(LIBXSMM_MALLOC_HOOK_DYNAMIC)*/ + + +#if (defined(LIBXSMM_MALLOC_HOOK) && defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC)) || defined(LIBXSMM_MALLOC_ALIGN_ALL) +LIBXSMM_API_INTERN void* internal_memalign_hook(size_t /*alignment*/, size_t /*size*/, const void* /*caller*/); +LIBXSMM_API_INTERN void* internal_memalign_hook(size_t alignment, size_t size, const void* caller) +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, caller); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, caller); +# endif + return result; +} + +LIBXSMM_API void* __wrap_memalign(size_t /*alignment*/, size_t 
/*size*/); +LIBXSMM_API void* __wrap_memalign(size_t alignment, size_t size) +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, NULL/*caller*/); +# endif + return result; +} + +LIBXSMM_API_INTERN void* internal_malloc_hook(size_t /*size*/, const void* /*caller*/); +LIBXSMM_API_INTERN void* internal_malloc_hook(size_t size, const void* caller) +{ + return internal_memalign_hook(0/*auto-alignment*/, size, caller); +} + +LIBXSMM_API void* __wrap_malloc(size_t /*size*/); +LIBXSMM_API void* __wrap_malloc(size_t size) +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, size, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, size, NULL/*caller*/); +# endif + return result; +} + +#if defined(LIBXSMM_MALLOC_HOOK_CALLOC) +LIBXSMM_API void* __wrap_calloc(size_t /*num*/, size_t /*size*/); +LIBXSMM_API void* __wrap_calloc(size_t num, size_t size) +{ + void* result; + const size_t nbytes = num * size; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, nbytes, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, nbytes, NULL/*caller*/); +# endif + /* TODO: signal anonymous/zeroed pages */ + if (NULL != result) memset(result, 0, nbytes); + return result; +} +#endif + +#if defined(LIBXSMM_MALLOC_HOOK_REALLOC) +LIBXSMM_API_INTERN void* internal_realloc_hook(void* /*ptr*/, size_t /*size*/, const void* /*caller*/); +LIBXSMM_API_INTERN void* internal_realloc_hook(void* ptr, size_t size, const void* caller) +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | 
LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, caller); +# else + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, caller); +# endif + return result; +} + +LIBXSMM_API void* __wrap_realloc(void* /*ptr*/, size_t /*size*/); +LIBXSMM_API void* __wrap_realloc(void* ptr, size_t size) +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, NULL/*caller*/); +# else + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, NULL/*caller*/); +# endif + return result; +} +#endif + +LIBXSMM_API_INTERN void internal_free_hook(void* /*ptr*/, const void* /*caller*/); +LIBXSMM_API_INTERN void internal_free_hook(void* ptr, const void* caller) +{ + INTERNAL_FREE_HOOK(ptr, caller); +} + +LIBXSMM_API void __wrap_free(void* /*ptr*/); +LIBXSMM_API void __wrap_free(void* ptr) +{ + INTERNAL_FREE_HOOK(ptr, NULL/*caller*/); +} +#endif + +#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) && ((defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC)) || defined(LIBXSMM_MALLOC_ALIGN_ALL)) +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* memalign(size_t /*alignment*/, size_t /*size*/) LIBXSMM_THROW; +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* memalign(size_t alignment, size_t size) LIBXSMM_THROW +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, alignment, size, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, alignment, size, NULL/*caller*/); +# endif + return result; +} + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* malloc(size_t /*size*/) LIBXSMM_THROW; +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* malloc(size_t size) LIBXSMM_THROW +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, 
LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, size, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, size, NULL/*caller*/); +# endif + return result; +} + +#if defined(LIBXSMM_MALLOC_HOOK_CALLOC) +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* calloc(size_t /*num*/, size_t /*size*/) LIBXSMM_THROW; +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK LIBXSMM_ATTRIBUTE_MALLOC void* calloc(size_t num, size_t size) LIBXSMM_THROW +{ + void* result; + const size_t nbytes = num * size; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_MMAP, 0/*auto-alignment*/, nbytes, NULL/*caller*/); +# else + INTERNAL_MEMALIGN_HOOK(result, LIBXSMM_MALLOC_FLAG_DEFAULT, 0/*auto-alignment*/, nbytes, NULL/*caller*/); +# endif + /* TODO: signal anonymous/zeroed pages */ + if (NULL != result) memset(result, 0, nbytes); + return result; +} +#endif + +#if defined(LIBXSMM_MALLOC_HOOK_REALLOC) +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void* realloc(void* /*ptr*/, size_t /*size*/) LIBXSMM_THROW; +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void* realloc(void* ptr, size_t size) LIBXSMM_THROW +{ + void* result; +# if defined(LIBXSMM_MALLOC_MMAP_HOOK) + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_MMAP, ptr, size, NULL/*caller*/); +# else + INTERNAL_REALLOC_HOOK(result, LIBXSMM_MALLOC_FLAG_REALLOC | LIBXSMM_MALLOC_FLAG_DEFAULT, ptr, size, NULL/*caller*/); +# endif + return result; +} +#endif + +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void free(void* /*ptr*/) LIBXSMM_THROW; +LIBXSMM_API LIBXSMM_ATTRIBUTE_WEAK void free(void* ptr) LIBXSMM_THROW +{ + INTERNAL_FREE_HOOK(ptr, NULL/*caller*/); +} +#endif + + +LIBXSMM_API_INTERN int internal_xfree(const void* memory, internal_malloc_info_type* info) +{ +#if !defined(LIBXSMM_BUILD) || !defined(_WIN32) + static int error_once = 0; +#endif + int result = EXIT_SUCCESS; + internal_malloc_info_type local; + LIBXSMM_ASSIGN127(&local, info); +#if 
!defined(LIBXSMM_BUILD) /* sanity check */ + if (NULL != local.pointer || 0 == local.size) +#endif + { +#if !defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) || !defined(NDEBUG) + const size_t size = local.size + (size_t)(((const char*)memory) - ((const char*)local.pointer)); +#endif +#if defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) + const size_t size_alloc = local.size_alloc; + assert(0 == local.size || (NULL != local.pointer && size <= size_alloc)); /* !LIBXSMM_ASSERT */ +#else + const size_t size_alloc = /*LIBXSMM_UP2(*/size/*, LIBXSMM_PAGE_MINSIZE)*/; +#endif + assert(NULL != memory && NULL != info && sizeof(internal_malloc_info_type) < size_alloc); /* !LIBXSMM_ASSERT */ +#if defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) && defined(NDEBUG) + LIBXSMM_UNUSED(memory); +#endif + if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & local.flags)) { + if (NULL != local.free.function) { +#if defined(LIBXSMM_MALLOC_DELETE_SAFE) + LIBXSMM_MEMZERO127(info); +#endif + if (NULL == local.context) { +#if defined(LIBXSMM_MALLOC_HOOK) + if (free == local.free.function) { + __real_free(local.pointer); + } + else +#endif + if (NULL != local.free.function) { + local.free.function(local.pointer); + } + } + else { + LIBXSMM_ASSERT(NULL != local.free.ctx_form); + local.free.ctx_form(local.pointer, local.context); + } + } + } + else { +#if defined(LIBXSMM_VTUNE) + if (0 != (LIBXSMM_MALLOC_FLAG_X & local.flags) && 0 != local.code_id && iJIT_SAMPLING_ON == iJIT_IsProfilingActive()) { + iJIT_NotifyEvent(LIBXSMM_VTUNE_JIT_UNLOAD, &local.code_id); + } +#endif +#if defined(_WIN32) + result = (NULL == local.pointer || FALSE != VirtualFree(local.pointer, 0, MEM_RELEASE)) ? 
EXIT_SUCCESS : EXIT_FAILURE; +#else /* !_WIN32 */ + { + if (0 != munmap(local.pointer, size_alloc)) { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: %s (attempted to unmap buffer %p+%" PRIuPTR ")!\n", + strerror(errno), local.pointer, (uintptr_t)size_alloc); + } + result = EXIT_FAILURE; + } + if (0 != (LIBXSMM_MALLOC_FLAG_X & local.flags) && EXIT_SUCCESS == result + && NULL != local.reloc && MAP_FAILED != local.reloc && local.pointer != local.reloc + && 0 != munmap(local.reloc, size_alloc)) + { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: %s (attempted to unmap code %p+%" PRIuPTR ")!\n", + strerror(errno), local.reloc, (uintptr_t)size_alloc); + } + result = EXIT_FAILURE; + } + } +#endif + } + if (0 == (LIBXSMM_MALLOC_FLAG_X & local.flags)) { /* update statistics */ +#if !defined(_WIN32) +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) + if (0 != (LIBXSMM_MALLOC_FLAG_PHUGE & local.flags)) { /* huge pages */ + LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & local.flags)); + LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_hugetlb, size_alloc, LIBXSMM_ATOMIC_RELAXED); + } +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) + if (0 != (LIBXSMM_MALLOC_FLAG_PLOCK & local.flags)) { /* page-locked */ + LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & local.flags)); + LIBXSMM_ATOMIC_SUB_FETCH(&internal_malloc_plocked, size_alloc, LIBXSMM_ATOMIC_RELAXED); + } +# endif +#endif + if (0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & local.flags)) { /* public */ + if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & local.flags)) { /* scratch */ + const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( + &internal_malloc_public_cur, LIBXSMM_ATOMIC_RELAXED); + 
LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_public_cur, + size_alloc <= current ? (current - size_alloc) : 0, LIBXSMM_ATOMIC_RELAXED); + } + else { /* local */ + const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( + &internal_malloc_local_cur, LIBXSMM_ATOMIC_RELAXED); + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_local_cur, + size_alloc <= current ? (current - size_alloc) : 0, LIBXSMM_ATOMIC_RELAXED); + } + } + else { /* private */ + const size_t current = (size_t)LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_LOAD, LIBXSMM_BITS)( + &internal_malloc_private_cur, LIBXSMM_ATOMIC_RELAXED); + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_STORE, LIBXSMM_BITS)(&internal_malloc_private_cur, + size_alloc <= current ? (current - size_alloc) : 0, LIBXSMM_ATOMIC_RELAXED); + } + } + } +#if !defined(LIBXSMM_BUILD) + else if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: attempt to release memory from non-matching implementation!\n"); + } +#endif + return result; +} + + +LIBXSMM_API_INTERN void libxsmm_malloc_init(void) +{ +#if (0 != LIBXSMM_SYNC) && defined(LIBXSMM_MALLOC_SCRATCH_JOIN) + const char *const env = getenv("LIBXSMM_MALLOC_JOIN"); + if (NULL != env && '\0' != *env) internal_malloc_join = atoi(env); +#endif +#if defined(LIBXSMM_MALLOC_HOOK_DYNAMIC) +# if defined(LIBXSMM_MALLOC_HOOK_QKMALLOC) + void* handle_qkmalloc = NULL; + dlerror(); /* clear an eventual error status */ + handle_qkmalloc = dlopen("libqkmalloc.so", RTLD_LAZY); + if (NULL != handle_qkmalloc) { + libxsmm_malloc_fn.memalign.ptr = internal_memalign_malloc; + libxsmm_malloc_fn.malloc.dlsym = dlsym(handle_qkmalloc, "malloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.dlsym = 
dlsym(handle_qkmalloc, "calloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) +# endif + { +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.dlsym = dlsym(handle_qkmalloc, "realloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) +# endif + { + libxsmm_malloc_fn.free.dlsym = dlsym(handle_qkmalloc, "free"); + } + } + } + dlclose(handle_qkmalloc); + } + if (NULL == libxsmm_malloc_fn.free.ptr) +# elif defined(LIBXSMM_MALLOC_HOOK_KMP) + dlerror(); /* clear an eventual error status */ + libxsmm_malloc_fn.alignmem.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_aligned_malloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.alignmem.dlsym) { + libxsmm_malloc_fn.memalign.ptr = internal_memalign_twiddle; + libxsmm_malloc_fn.malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_malloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_calloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) +# endif + { +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_realloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) +# endif + { + libxsmm_malloc_fn.free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "kmp_free"); + } + } + } + } + if (NULL == libxsmm_malloc_fn.free.ptr) +# endif /*defined(LIBXSMM_MALLOC_HOOK_QKMALLOC)*/ + { + dlerror(); /* clear an eventual error status */ +# if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) + libxsmm_malloc_fn.memalign.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_memalign"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.memalign.dlsym) { + libxsmm_malloc_fn.malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_malloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.dlsym = 
dlsym(LIBXSMM_RTLD_NEXT, "__libc_calloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) +# endif + { +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_realloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) +# endif + { + libxsmm_malloc_fn.free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__libc_free"); + } + } + } + } + if (NULL == libxsmm_malloc_fn.free.ptr) { + void* handle_libc = NULL; + dlerror(); /* clear an eventual error status */ + handle_libc = dlopen("libc.so." LIBXSMM_STRINGIFY(LIBXSMM_MALLOC_GLIBC), RTLD_LAZY); + if (NULL != handle_libc) { + libxsmm_malloc_fn.memalign.dlsym = dlsym(handle_libc, "__libc_memalign"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.memalign.dlsym) { + libxsmm_malloc_fn.malloc.dlsym = dlsym(handle_libc, "__libc_malloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.malloc.dlsym) { +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.dlsym = dlsym(handle_libc, "__libc_calloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.calloc.dlsym) +# endif + { +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.dlsym = dlsym(handle_libc, "__libc_realloc"); + if (NULL == dlerror() && NULL != libxsmm_malloc_fn.realloc.dlsym) +# endif + { + libxsmm_malloc_fn.free.dlsym = dlsym(handle_libc, "__libc_free"); + } + } + } + } + dlclose(handle_libc); + } + } +# if 0 + { /* attempt to setup deprecated GLIBC hooks */ + union { const void* dlsym; void* (**ptr)(size_t, size_t, const void*); } hook_memalign; + dlerror(); /* clear an eventual error status */ + hook_memalign.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__memalign_hook"); + if (NULL == dlerror() && NULL != hook_memalign.dlsym) { + union { const void* dlsym; void* (**ptr)(size_t, const void*); } hook_malloc; + hook_malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__malloc_hook"); + if (NULL == dlerror() && NULL != hook_malloc.dlsym) { +# if 
defined(LIBXSMM_MALLOC_HOOK_REALLOC) + union { const void* dlsym; void* (**ptr)(void*, size_t, const void*); } hook_realloc; + hook_realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__realloc_hook"); + if (NULL == dlerror() && NULL != hook_realloc.dlsym) +# endif + { + union { const void* dlsym; void (**ptr)(void*, const void*); } hook_free; + hook_free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "__free_hook"); + if (NULL == dlerror() && NULL != hook_free.dlsym) { + *hook_memalign.ptr = internal_memalign_hook; + *hook_malloc.ptr = internal_malloc_hook; +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + *hook_realloc.ptr = internal_realloc_hook; +# endif + *hook_free.ptr = internal_free_hook; + } + } + } + } + } +# endif +# else /* TODO */ +# endif /*(defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD)))*/ + } + if (NULL != libxsmm_malloc_fn.free.ptr) { +# if defined(LIBXSMM_MALLOC_HOOK_IMALLOC) + union { const void* dlsym; libxsmm_malloc_fun* ptr; } i_malloc; + i_malloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_malloc"); + if (NULL == dlerror() && NULL != i_malloc.dlsym) { +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + union { const void* dlsym; void* (**ptr)(size_t, size_t); } i_calloc; + i_calloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_calloc"); + if (NULL == dlerror() && NULL != i_calloc.dlsym) +# endif + { +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + union { const void* dlsym; libxsmm_realloc_fun* ptr; } i_realloc; + i_realloc.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_realloc"); + if (NULL == dlerror() && NULL != i_realloc.dlsym) +# endif + { + union { const void* dlsym; libxsmm_free_fun* ptr; } i_free; + i_free.dlsym = dlsym(LIBXSMM_RTLD_NEXT, "i_free"); + if (NULL == dlerror() && NULL != i_free.dlsym) { + *i_malloc.ptr = libxsmm_malloc_fn.malloc.ptr; +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + *i_calloc.ptr = libxsmm_malloc_fn.calloc.ptr; +# endif +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + *i_realloc.ptr = libxsmm_malloc_fn.realloc.ptr; +# endif + *i_free.ptr = libxsmm_malloc_fn.free.ptr; + } + } + } + } 
+# endif /*defined(LIBXSMM_MALLOC_HOOK_IMALLOC)*/ + } + else { /* fallback: potentially recursive */ +# if (defined(LIBXSMM_BUILD) && (1 < (LIBXSMM_BUILD))) + libxsmm_malloc_fn.memalign.ptr = __libc_memalign; + libxsmm_malloc_fn.malloc.ptr = __libc_malloc; +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.ptr = __libc_calloc; +# endif +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.ptr = __libc_realloc; +# endif + libxsmm_malloc_fn.free.ptr = __libc_free; +# else + libxsmm_malloc_fn.memalign.ptr = libxsmm_memalign_internal; + libxsmm_malloc_fn.malloc.ptr = malloc; +# if defined(LIBXSMM_MALLOC_HOOK_CALLOC) + libxsmm_malloc_fn.calloc.ptr = calloc; +# endif +# if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + libxsmm_malloc_fn.realloc.ptr = realloc; +# endif + libxsmm_malloc_fn.free.ptr = free; +# endif + } +#endif +} + + +LIBXSMM_API_INTERN void libxsmm_malloc_finalize(void) +{ +} + + +LIBXSMM_API_INTERN int libxsmm_xset_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) +{ + int result = EXIT_SUCCESS; + if (NULL != lock) { + LIBXSMM_INIT + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); + } + if (NULL != malloc_fn.function && NULL != free_fn.function) { + libxsmm_default_allocator_context = context; + libxsmm_default_malloc_fn = malloc_fn; + libxsmm_default_free_fn = free_fn; + } + else { + libxsmm_malloc_function internal_malloc_fn; + libxsmm_free_function internal_free_fn; + const void* internal_allocator = NULL; + internal_malloc_fn.function = __real_malloc; + internal_free_fn.function = __real_free; + /*internal_allocator = NULL;*/ + if (NULL == malloc_fn.function && NULL == free_fn.function) { + libxsmm_default_allocator_context = internal_allocator; + libxsmm_default_malloc_fn = internal_malloc_fn; + libxsmm_default_free_fn = internal_free_fn; + } + else { /* invalid allocator */ + static int error_once = 0; + if (0 != libxsmm_verbosity /* 
library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: allocator setup without malloc or free function!\n"); + } + /* keep any valid (previously instantiated) default allocator */ + if (NULL == libxsmm_default_malloc_fn.function || NULL == libxsmm_default_free_fn.function) { + libxsmm_default_allocator_context = internal_allocator; + libxsmm_default_malloc_fn = internal_malloc_fn; + libxsmm_default_free_fn = internal_free_fn; + } + result = EXIT_FAILURE; + } + } + if (NULL != lock) { + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); + } + LIBXSMM_ASSERT(EXIT_SUCCESS == result); + return result; +} + + +LIBXSMM_API_INTERN int libxsmm_xget_default_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) +{ + int result = EXIT_SUCCESS; + if (NULL != context || NULL != malloc_fn || NULL != free_fn) { + if (NULL != lock) { + LIBXSMM_INIT + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); + } + if (context) *context = libxsmm_default_allocator_context; + if (NULL != malloc_fn) *malloc_fn = libxsmm_default_malloc_fn; + if (NULL != free_fn) *free_fn = libxsmm_default_free_fn; + if (NULL != lock) { + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); + } + } + else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM ERROR: invalid signature used to get the default memory allocator!\n"); + } + result = EXIT_FAILURE; + } + LIBXSMM_ASSERT(EXIT_SUCCESS == result); + return result; +} + + +LIBXSMM_API_INTERN int libxsmm_xset_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) +{ + int result = EXIT_SUCCESS; + static int error_once = 0; + if (NULL != lock) { + LIBXSMM_INIT + 
LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); + } + /* make sure the default allocator is setup before adopting it eventually */ + if (NULL == libxsmm_default_malloc_fn.function || NULL == libxsmm_default_free_fn.function) { + const libxsmm_malloc_function null_malloc_fn = { NULL }; + const libxsmm_free_function null_free_fn = { NULL }; + libxsmm_xset_default_allocator(NULL/*already locked*/, NULL/*context*/, null_malloc_fn, null_free_fn); + } + if (NULL == malloc_fn.function && NULL == free_fn.function) { /* adopt default allocator */ + libxsmm_scratch_allocator_context = libxsmm_default_allocator_context; + libxsmm_scratch_malloc_fn = libxsmm_default_malloc_fn; + libxsmm_scratch_free_fn = libxsmm_default_free_fn; + } + else if (NULL != malloc_fn.function) { + if (NULL == free_fn.function + && /*warning*/(LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: scratch allocator setup without free function!\n"); + } + libxsmm_scratch_allocator_context = context; + libxsmm_scratch_malloc_fn = malloc_fn; + libxsmm_scratch_free_fn = free_fn; /* NULL allowed */ + } + else { /* invalid scratch allocator */ + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid scratch allocator (default used)!\n"); + } + /* keep any valid (previously instantiated) scratch allocator */ + if (NULL == libxsmm_scratch_malloc_fn.function) { + libxsmm_scratch_allocator_context = libxsmm_default_allocator_context; + libxsmm_scratch_malloc_fn = libxsmm_default_malloc_fn; + libxsmm_scratch_free_fn = libxsmm_default_free_fn; + } + result = EXIT_FAILURE; + } + if (NULL != lock) { + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); + } + LIBXSMM_ASSERT(EXIT_SUCCESS == result); + return result; +} + + +LIBXSMM_API_INTERN int 
libxsmm_xget_scratch_allocator(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock, + const void** context, libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) +{ + int result = EXIT_SUCCESS; + if (NULL != context || NULL != malloc_fn || NULL != free_fn) { + if (NULL != lock) { + LIBXSMM_INIT + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); + } + if (context) *context = libxsmm_scratch_allocator_context; + if (NULL != malloc_fn) *malloc_fn = libxsmm_scratch_malloc_fn; + if (NULL != free_fn) *free_fn = libxsmm_scratch_free_fn; + if (NULL != lock) { + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); + } + } + else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM ERROR: invalid signature used to get the scratch memory allocator!\n"); + } + result = EXIT_FAILURE; + } + LIBXSMM_ASSERT(EXIT_SUCCESS == result); + return result; +} + + +LIBXSMM_API int libxsmm_set_default_allocator(const void* context, + libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) +{ + return libxsmm_xset_default_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn); +} + + +LIBXSMM_API int libxsmm_get_default_allocator(const void** context, + libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) +{ + return libxsmm_xget_default_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn); +} + + +LIBXSMM_API int libxsmm_set_scratch_allocator(const void* context, + libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) +{ + return libxsmm_xset_scratch_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn); +} + + +LIBXSMM_API int libxsmm_get_scratch_allocator(const void** context, + libxsmm_malloc_function* malloc_fn, libxsmm_free_function* free_fn) +{ + return libxsmm_xget_scratch_allocator(&libxsmm_lock_global, context, malloc_fn, free_fn); +} + + +LIBXSMM_API int libxsmm_get_malloc_xinfo(const void* 
memory, size_t* size, int* flags, void** extra) +{ + int result; +#if !defined(NDEBUG) + if (NULL != size || NULL != extra) +#endif + { + const int check = ((NULL == flags || 0 == (LIBXSMM_MALLOC_FLAG_X & *flags)) ? 2 : 1); + const internal_malloc_info_type *const info = internal_malloc_info(memory, check); + if (NULL != info) { + if (NULL != size) *size = info->size; + if (NULL != flags) *flags = info->flags; + if (NULL != extra) *extra = info->pointer; + result = EXIT_SUCCESS; + } + else { /* potentially foreign buffer */ + result = (NULL != memory ? EXIT_FAILURE : EXIT_SUCCESS); + if (NULL != size) *size = 0; + if (NULL != flags) *flags = 0; + if (NULL != extra) *extra = NULL; + } + } +#if !defined(NDEBUG) + else { + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: attachment error for memory buffer %p!\n", memory); + } + LIBXSMM_ASSERT_MSG(0/*false*/, "LIBXSMM ERROR: attachment error"); + result = EXIT_FAILURE; + } +#endif + return result; +} + + +#if !defined(_WIN32) + +LIBXSMM_API_INLINE void internal_xmalloc_mhint(void* buffer, size_t size) +{ + LIBXSMM_ASSERT((MAP_FAILED != buffer && NULL != buffer) || 0 == size); +#if (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE)) + /* proceed after failed madvise (even in case of an error; take what we got) */ + /* issue no warning as a failure seems to be related to the kernel version */ + madvise(buffer, size, MADV_NORMAL/*MADV_RANDOM*/ +# if defined(MADV_NOHUGEPAGE) /* if not available, we then take what we got (THP) */ + | ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) > size ? MADV_NOHUGEPAGE : 0) +# endif +# if defined(MADV_DONTDUMP) + | ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) > size ? 
0 : MADV_DONTDUMP) +# endif + ); +#else + LIBXSMM_UNUSED(buffer); LIBXSMM_UNUSED(size); +#endif +} + + +LIBXSMM_API_INLINE void* internal_xmalloc_xmap(const char* dir, size_t size, int flags, void** rx) +{ + void* result = MAP_FAILED; + char filename[4096] = LIBXSMM_MALLOC_XMAP_TEMPLATE; + int i = 0; + LIBXSMM_ASSERT(NULL != rx && MAP_FAILED != *rx); + if (NULL != dir && '\0' != *dir) { + i = LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s/" LIBXSMM_MALLOC_XMAP_TEMPLATE, dir); + } + if (0 <= i && i < (int)sizeof(filename)) { + /* coverity[secure_temp] */ + i = mkstemp(filename); + if (0 <= i) { + if (0 == unlink(filename) && 0 == ftruncate(i, size) /*&& 0 == chmod(filename, S_IRWXU)*/) { + const int mflags = (flags | LIBXSMM_MAP_SHARED); + void *const xmap = mmap(*rx, size, PROT_READ | PROT_EXEC, mflags, i, 0/*offset*/); + if (MAP_FAILED != xmap) { + LIBXSMM_ASSERT(NULL != xmap); +#if defined(MAP_32BIT) + result = mmap(NULL, size, PROT_READ | PROT_WRITE, mflags & ~MAP_32BIT, i, 0/*offset*/); +#else + result = mmap(NULL, size, PROT_READ | PROT_WRITE, mflags, i, 0/*offset*/); +#endif + if (MAP_FAILED != result) { + LIBXSMM_ASSERT(NULL != result); + internal_xmalloc_mhint(xmap, size); + *rx = xmap; + } + else { + munmap(xmap, size); + *rx = NULL; + } + } + } + close(i); + } + } + return result; +} + +#endif /*!defined(_WIN32)*/ + + +LIBXSMM_API_INLINE void* internal_xrealloc(void** ptr, internal_malloc_info_type** info, size_t size, + libxsmm_realloc_fun realloc_fn, libxsmm_free_fun free_fn) +{ + char *const base = (char*)(NULL != *info ? (*info)->pointer : *ptr), *result; + LIBXSMM_ASSERT(NULL != *ptr && NULL != free_fn); + /* reallocation may implicitly invalidate info */ + result = (char*)(NULL != realloc_fn ? 
realloc_fn(base, size) : __real_malloc(size)); + if (result == base) { /* signal no-copy */ + LIBXSMM_ASSERT(NULL != result); + *info = NULL; /* no delete */ + *ptr = NULL; /* no copy */ + } + else if (NULL != result) { /* copy */ + if (NULL != realloc_fn) { + const size_t offset_src = (const char*)*ptr - base; + *ptr = result + offset_src; /* copy */ + *info = NULL; /* no delete */ + } + } +#if !defined(NDEBUG) && 0 + else { /* failed */ + if (NULL != *info) { + internal_xfree(*ptr, *info); /* invalidates info */ + } + else { /* foreign pointer */ + free_fn(*ptr); + } + *info = NULL; /* no delete */ + *ptr = NULL; /* no copy */ + } +#else + LIBXSMM_UNUSED(free_fn); +#endif + return result; +} + + +LIBXSMM_API_INTERN void* internal_xmalloc(void** /*ptr*/, internal_malloc_info_type** /*info*/, size_t /*size*/, + const void* /*context*/, libxsmm_malloc_function /*malloc_fn*/, libxsmm_free_function /*free_fn*/); +LIBXSMM_API_INTERN void* internal_xmalloc(void** ptr, internal_malloc_info_type** info, size_t size, + const void* context, libxsmm_malloc_function malloc_fn, libxsmm_free_function free_fn) +{ + void* result; + LIBXSMM_ASSERT(NULL != ptr && NULL != info && NULL != malloc_fn.function); + if (NULL == *ptr) { + result = (NULL == context + ? malloc_fn.function(size) + : malloc_fn.ctx_form(size, context)); + } + else { /* reallocate */ + if (NULL != free_fn.function /* prefer free_fn since it is part of pointer-info */ + ? (__real_free == free_fn.function || free == free_fn.function) + : (__real_malloc == malloc_fn.function || malloc == malloc_fn.function)) + { +#if defined(LIBXSMM_MALLOC_HOOK_REALLOC) + result = internal_xrealloc(ptr, info, size, __real_realloc, __real_free); +#else + result = internal_xrealloc(ptr, info, size, NULL, __real_free); +#endif + } + else { /* fallback with regular allocation */ + result = (NULL == context + ? 
malloc_fn.function(size) + : malloc_fn.ctx_form(size, context)); + if (NULL == result) { /* failed */ + if (NULL != *info) { + internal_xfree(*ptr, *info); /* invalidates info */ + } + else { /* foreign pointer */ + (NULL != free_fn.function ? free_fn.function : __real_free)(*ptr); + } + *ptr = NULL; /* safe delete */ + } + } + } + return result; +} + + +LIBXSMM_API int libxsmm_xmalloc(void** memory, size_t size, size_t alignment, + int flags, const void* extra, size_t extra_size) +{ + int result = EXIT_SUCCESS; +#if !defined(NDEBUG) + if (NULL != memory) +#endif + { + static int error_once = 0; + if (0 != size) { + size_t alloc_alignment = 0, alloc_size = 0, max_preserve = 0; + internal_malloc_info_type* info = NULL; + void *buffer = NULL, *reloc = NULL; + /* ATOMIC BEGIN: this region should be atomic/locked */ + const void* context = libxsmm_default_allocator_context; + libxsmm_malloc_function malloc_fn = libxsmm_default_malloc_fn; + libxsmm_free_function free_fn = libxsmm_default_free_fn; + if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) { + context = libxsmm_scratch_allocator_context; + malloc_fn = libxsmm_scratch_malloc_fn; + free_fn = libxsmm_scratch_free_fn; +#if defined(LIBXSMM_MALLOC_MMAP_SCRATCH) + flags |= LIBXSMM_MALLOC_FLAG_MMAP; +#endif + } + if ((0 != (internal_malloc_kind & 1) && 0 < internal_malloc_kind) + || NULL == malloc_fn.function || NULL == free_fn.function) + { + malloc_fn.function = __real_malloc; + free_fn.function = __real_free; + context = NULL; + } + /* ATOMIC END: this region should be atomic */ + flags |= LIBXSMM_MALLOC_FLAG_RW; /* normalize given flags since flags=0 is accepted as well */ + if (0 != (LIBXSMM_MALLOC_FLAG_REALLOC & flags) && NULL != *memory) { + info = internal_malloc_info(*memory, 2/*check*/); + if (NULL != info) { + max_preserve = info->size; + } + else { /* reallocation of unknown allocation */ + flags &= ~LIBXSMM_MALLOC_FLAG_MMAP; + } + } + else *memory = NULL; +#if !defined(LIBXSMM_MALLOC_MMAP) + if (0 == 
(LIBXSMM_MALLOC_FLAG_X & flags) && 0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { + alloc_alignment = (0 == (LIBXSMM_MALLOC_FLAG_REALLOC & flags) ? libxsmm_alignment(size, alignment) : alignment); + alloc_size = size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1; + buffer = internal_xmalloc(memory, &info, alloc_size, context, malloc_fn, free_fn); + } + else +#endif + if (NULL == info || size != info->size) { +#if defined(_WIN32) || defined(__CYGWIN__) + const int mflags = (0 != (LIBXSMM_MALLOC_FLAG_X & flags) ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE); + static SIZE_T alloc_alignmax = 0, alloc_pagesize = 0; + if (0 == alloc_alignmax) { /* first/one time */ + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + alloc_pagesize = system_info.dwPageSize; + alloc_alignmax = GetLargePageMinimum(); + } + if ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) <= size) { /* attempt to use large pages */ + HANDLE process_token; + alloc_alignment = (NULL == info + ? (0 == alignment ? 
alloc_alignmax : libxsmm_lcm(alignment, alloc_alignmax)) + : libxsmm_lcm(alignment, alloc_alignmax)); + alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_alignmax); + if (TRUE == OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &process_token)) { + TOKEN_PRIVILEGES tp; + if (TRUE == LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid)) { + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; tp.PrivilegeCount = 1; /* enable privilege */ + if (TRUE == AdjustTokenPrivileges(process_token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0) + && ERROR_SUCCESS == GetLastError()/*may has failed (regardless of TRUE)*/) + { + /* VirtualAlloc cannot be used to reallocate memory */ + buffer = VirtualAlloc(NULL, alloc_size, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, mflags); + } + tp.Privileges[0].Attributes = 0; /* disable privilege */ + AdjustTokenPrivileges(process_token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + } + CloseHandle(process_token); + } + } + else { /* small allocation using regular page-size */ + alloc_alignment = (NULL == info ? 
libxsmm_alignment(size, alignment) : alignment); + alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_pagesize); + } + if (NULL == buffer) { /* small allocation or retry with regular page size */ + /* VirtualAlloc cannot be used to reallocate memory */ + buffer = VirtualAlloc(NULL, alloc_size, MEM_RESERVE | MEM_COMMIT, mflags); + } + if (NULL != buffer) { + flags |= LIBXSMM_MALLOC_FLAG_MMAP; /* select the corresponding deallocation */ + } + else if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { /* fallback allocation */ + buffer = internal_xmalloc(memory, &info, alloc_size, context, malloc_fn, free_fn); + } +#else /* !defined(_WIN32) */ +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) + static size_t limit_hugetlb = LIBXSMM_SCRATCH_UNLIMITED; +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) + static size_t limit_plocked = LIBXSMM_SCRATCH_UNLIMITED; +# endif +# if defined(MAP_32BIT) + static int map32 = 1; +# endif + int mflags = 0 +# if defined(MAP_UNINITIALIZED) && 0/*fails with WSL*/ + | MAP_UNINITIALIZED /* unlikely available */ +# endif +# if defined(MAP_NORESERVE) + | (LIBXSMM_MALLOC_ALIGNMAX < size ? 0 : MAP_NORESERVE) +# endif +# if defined(MAP_32BIT) + | ((0 != (LIBXSMM_MALLOC_FLAG_X & flags) && 0 != map32 + && (LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid) + && (LIBXSMM_X86_AVX512 < libxsmm_target_archid || + LIBXSMM_X86_AVX > libxsmm_target_archid)) ? MAP_32BIT : 0) +# endif +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) + | ((0 == (LIBXSMM_MALLOC_FLAG_X & flags) + && ((LIBXSMM_MALLOC_ALIGNMAX * LIBXSMM_MALLOC_ALIGNFCT) <= size || + 0 != (LIBXSMM_MALLOC_FLAG_PHUGE & flags)) + && (internal_malloc_hugetlb + size) < limit_hugetlb) ? 
MAP_HUGETLB : 0) +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) && 0 == (LIBXSMM_MALLOC_LOCK_PAGES) + | (((0 != (LIBXSMM_MALLOC_FLAG_PLOCK & flags) || 0 == (LIBXSMM_MALLOC_FLAG_X & flags)) + && (internal_malloc_plocked + size) < limit_plocked) ? MAP_LOCKED : 0) +# endif + ; /* mflags */ +# if defined(MAP_POPULATE) + { static int prefault = 0; + if (0 == prefault) { /* prefault only on Linux 3.10.0-327 (and later) to avoid data race in page-fault handler */ + struct utsname osinfo; unsigned int version_major = 3, version_minor = 10, version_update = 0, version_patch = 327; + if (0 <= uname(&osinfo) && 0 == strcmp("Linux", osinfo.sysname) + && 4 == sscanf(osinfo.release, "%u.%u.%u-%u", &version_major, &version_minor, &version_update, &version_patch) + && LIBXSMM_VERSION4(3, 10, 0, 327) > LIBXSMM_VERSION4(version_major, version_minor, version_update, version_patch)) + { + mflags |= MAP_POPULATE; prefault = 1; + } + else prefault = -1; + } + else if (1 == prefault) mflags |= MAP_POPULATE; + } +# endif + /* make allocated size at least a multiple of the smallest page-size to avoid split-pages (unmap!) */ + alloc_alignment = libxsmm_lcm(0 == alignment ? libxsmm_alignment(size, alignment) : alignment, LIBXSMM_PAGE_MINSIZE); + alloc_size = LIBXSMM_UP2(size + extra_size + sizeof(internal_malloc_info_type) + alloc_alignment - 1, alloc_alignment); + if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* anonymous and non-executable */ +# if defined(MAP_32BIT) + LIBXSMM_ASSERT(0 == (MAP_32BIT & mflags)); +# endif +# if 0 + LIBXSMM_ASSERT(NULL != info || NULL == *memory); /* no memory mapping of foreign pointer */ +# endif + buffer = mmap(NULL == info ? 
NULL : info->pointer, alloc_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | mflags, -1, 0/*offset*/); +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) + INTERNAL_XMALLOC_KIND(MAP_HUGETLB, "huge-page", LIBXSMM_MALLOC_FLAG_PHUGE, flags, mflags, + internal_malloc_hugetlb, limit_hugetlb, info, alloc_size, buffer); +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) +# if 0 == (LIBXSMM_MALLOC_LOCK_PAGES) + INTERNAL_XMALLOC_KIND(MAP_LOCKED, "locked-page", LIBXSMM_MALLOC_FLAG_PLOCK, flags, mflags, + internal_malloc_plocked, limit_plocked, info, alloc_size, buffer); +# else + if (0 != (MAP_LOCKED & mflags) && MAP_FAILED != buffer) { + LIBXSMM_ASSERT(NULL != buffer); +# if 1 == (LIBXSMM_MALLOC_LOCK_PAGES) || !defined(MLOCK_ONFAULT) || !defined(SYS_mlock2) + if (0 == mlock(buffer, alloc_size)) +# elif 0 /* mlock2 is potentially not exposed */ + if (0 == mlock2(buffer, alloc_size, MLOCK_ONFAULT)) +# else + if (0 == syscall(SYS_mlock2, buffer, alloc_size, MLOCK_ONFAULT)) +# endif + { + LIBXSMM_ATOMIC_ADD_FETCH(&internal_malloc_plocked, alloc_size, LIBXSMM_ATOMIC_RELAXED); + flags |= LIBXSMM_MALLOC_FLAG_PLOCK; + } + else { /* update watermark */ + INTERNAL_XMALLOC_WATERMARK("locked-page", internal_malloc_plocked, limit_plocked, alloc_size); + flags &= ~LIBXSMM_MALLOC_FLAG_PLOCK; + } + } +# endif +# endif + } + else { /* executable buffer requested */ + static /*LIBXSMM_TLS*/ int entrypoint = -1; /* fallback allocation method */ +# if defined(MAP_HUGETLB) && defined(LIBXSMM_MALLOC_HUGE_PAGES) + LIBXSMM_ASSERT(0 == (MAP_HUGETLB & mflags)); +# endif +# if defined(MAP_LOCKED) && defined(LIBXSMM_MALLOC_LOCK_PAGES) + LIBXSMM_ASSERT(0 == (MAP_LOCKED & mflags)); +# endif + if (0 > (int)LIBXSMM_ATOMIC_LOAD(&entrypoint, LIBXSMM_ATOMIC_RELAXED)) { + const char *const env = getenv("LIBXSMM_SE"); + LIBXSMM_ATOMIC_STORE(&entrypoint, NULL == env + /* libxsmm_se decides */ + ? (0 == libxsmm_se ? 
LIBXSMM_MALLOC_FINAL : LIBXSMM_MALLOC_FALLBACK) + /* user's choice takes precedence */ + : ('0' != *env ? LIBXSMM_MALLOC_FALLBACK : LIBXSMM_MALLOC_FINAL), + LIBXSMM_ATOMIC_SEQ_CST); + LIBXSMM_ASSERT(0 <= entrypoint); + } + INTERNAL_XMALLOC(0, entrypoint, "JITDUMPDIR", "", map32, mflags, alloc_size, buffer, &reloc); /* 1st try */ + INTERNAL_XMALLOC(1, entrypoint, "TMPDIR", "/tmp", map32, mflags, alloc_size, buffer, &reloc); /* 2nd try */ + /* coverity[string_size] */ + INTERNAL_XMALLOC(2, entrypoint, "HOME", "", map32, mflags, alloc_size, buffer, &reloc); /* 3rd try */ + if (3 >= entrypoint && (MAP_FAILED == buffer || NULL == buffer)) { /* 4th try */ + buffer = mmap(reloc, alloc_size, PROT_READ | PROT_WRITE | PROT_EXEC, +# if defined(MAP_32BIT) + MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | (0 == map32 ? (mflags & ~MAP_32BIT) : mflags), +# else + MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | mflags, +# endif + -1, 0/*offset*/); + if (MAP_FAILED != buffer) entrypoint = 3; +# if defined(MAP_32BIT) + else if (0 != (MAP_32BIT & mflags) && 0 != map32) { + buffer = mmap(reloc, alloc_size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | LIBXSMM_MAP_ANONYMOUS | (mflags & ~MAP_32BIT), + - 1, 0/*offset*/); + if (MAP_FAILED != buffer) { + entrypoint = 3; + map32 = 0; + } + } +# endif + } + /* upgrade to SE-mode and retry lower entry-points */ + if (MAP_FAILED == buffer && 0 == libxsmm_se) { + libxsmm_se = 1; entrypoint = 0; + INTERNAL_XMALLOC(0, entrypoint, "JITDUMPDIR", "", map32, mflags, alloc_size, buffer, &reloc); /* 1st try */ + INTERNAL_XMALLOC(1, entrypoint, "TMPDIR", "/tmp", map32, mflags, alloc_size, buffer, &reloc); /* 2nd try */ + INTERNAL_XMALLOC(2, entrypoint, "HOME", "", map32, mflags, alloc_size, buffer, &reloc); /* 3rd try */ + } + } + if (MAP_FAILED != buffer && NULL != buffer) { + flags |= LIBXSMM_MALLOC_FLAG_MMAP; /* select deallocation */ + } + else { /* allocation failed */ + if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { /* ultimate fallback */ + buffer = (NULL != 
malloc_fn.function + ? (NULL == context ? malloc_fn.function(alloc_size) : malloc_fn.ctx_form(alloc_size, context)) + : (NULL)); + } + reloc = NULL; + } + if (MAP_FAILED != buffer && NULL != buffer) { + internal_xmalloc_mhint(buffer, alloc_size); + } +#endif /* !defined(_WIN32) */ + } + else { /* reallocation of the same pointer and size */ + alloc_size = size + extra_size + sizeof(internal_malloc_info_type) + alignment - 1; + if (NULL != info) { + buffer = info->pointer; + flags |= info->flags; + } + else { + flags |= LIBXSMM_MALLOC_FLAG_MMAP; + buffer = *memory; + } + alloc_alignment = alignment; + *memory = NULL; /* signal no-copy */ + } + if ( +#if !defined(_WIN32) && !defined(__clang_analyzer__) + MAP_FAILED != buffer && +#endif + NULL != buffer) + { + char *const cbuffer = (char*)buffer, *const aligned = LIBXSMM_ALIGN( + cbuffer + extra_size + sizeof(internal_malloc_info_type), alloc_alignment); + internal_malloc_info_type *const buffer_info = (internal_malloc_info_type*)( + aligned - sizeof(internal_malloc_info_type)); + LIBXSMM_ASSERT((aligned + size) <= (cbuffer + alloc_size)); + LIBXSMM_ASSERT(0 < alloc_alignment); + /* former content must be preserved prior to setup of buffer_info */ + if (NULL != *memory) { /* preserve/copy previous content */ +#if 0 + LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_REALLOC & flags)); +#endif + /* content behind foreign pointers is not explicitly preserved; buffers may overlap */ + memmove(aligned, *memory, LIBXSMM_MIN(max_preserve, size)); + if (NULL != info /* known allocation (non-foreign pointer) */ + && EXIT_SUCCESS != internal_xfree(*memory, info) /* !libxsmm_free, invalidates info */ + && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { /* display some extra context of the failure (reallocation) */ + fprintf(stderr, "LIBXSMM ERROR: memory reallocation failed to release memory!\n"); + } + } + if (NULL != extra || 0 == 
extra_size) { + const char *const src = (const char*)extra; + int i; for (i = 0; i < (int)extra_size; ++i) cbuffer[i] = src[i]; + } + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: incorrect extraneous data specification!\n"); + /* no EXIT_FAILURE because valid buffer is returned */ + } + if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* update statistics */ + if (0 == (LIBXSMM_MALLOC_FLAG_PRIVATE & flags)) { /* public */ + if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) { /* scratch */ + const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + &internal_malloc_public_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); + if (internal_malloc_public_max < watermark) internal_malloc_public_max = watermark; /* accept data-race */ + } + else { /* local */ + const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + &internal_malloc_local_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); + if (internal_malloc_local_max < watermark) internal_malloc_local_max = watermark; /* accept data-race */ + } + } + else if (0 != (LIBXSMM_MALLOC_FLAG_SCRATCH & flags)) { /* private scratch */ + const size_t watermark = LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_ADD_FETCH, LIBXSMM_BITS)( + &internal_malloc_private_cur, alloc_size, LIBXSMM_ATOMIC_RELAXED); + if (internal_malloc_private_max < watermark) internal_malloc_private_max = watermark; /* accept data-race */ + } + } + /* keep allocation function on record */ + if (0 == (LIBXSMM_MALLOC_FLAG_MMAP & flags)) { + buffer_info->context = context; + buffer_info->free = free_fn; + } + else { + buffer_info->free.function = NULL; + buffer_info->context = NULL; + } +#if defined(LIBXSMM_MALLOC_INFO_ALLOCSIZE) + buffer_info->size_alloc = alloc_size; +#endif + buffer_info->size = size; + buffer_info->pointer = buffer; + buffer_info->reloc = reloc; + buffer_info->flags = flags; +#if 
defined(LIBXSMM_VTUNE) + buffer_info->code_id = 0; +#endif /* info must be initialized to calculate correct checksum */ +#if !defined(LIBXSMM_MALLOC_CRC_OFF) +# if defined(LIBXSMM_MALLOC_CRC_LIGHT) + buffer_info->hash = LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &buffer_info); +# else + buffer_info->hash = libxsmm_crc32(LIBXSMM_MALLOC_SEED, buffer_info, + (unsigned int)(((char*)&buffer_info->hash) - ((char*)buffer_info))); +# endif +#endif /* finally commit/return allocated buffer */ + *memory = aligned; + } + else { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + char alloc_size_buffer[32]; + libxsmm_format_value(alloc_size_buffer, sizeof(alloc_size_buffer), alloc_size, "KM", "B", 10); + fprintf(stderr, "LIBXSMM ERROR: failed to allocate %s with flag=%i!\n", alloc_size_buffer, flags); + } + result = EXIT_FAILURE; + *memory = NULL; + } + } + else { + if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: zero-sized memory allocation detected!\n"); + } + *memory = NULL; /* no EXIT_FAILURE */ + } + } +#if !defined(NDEBUG) + else if (0 != size) { + result = EXIT_FAILURE; + } +#endif + return result; +} + + +LIBXSMM_API void libxsmm_xfree(const void* memory, int check) +{ +#if (!defined(LIBXSMM_MALLOC_HOOK) || defined(_DEBUG)) + static int error_once = 0; +#endif + /*const*/ internal_malloc_info_type *const info = internal_malloc_info(memory, check); + if (NULL != info) { /* !libxsmm_free */ +#if (!defined(LIBXSMM_MALLOC_HOOK) || defined(_DEBUG)) + if (EXIT_SUCCESS != internal_xfree(memory, info)) { /* invalidates info */ + if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + 
fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n"); + } + } +#else + internal_xfree(memory, info); /* invalidates info */ +#endif + } + else if (NULL != memory) { +#if 1 + union { const void* const_ptr; void* ptr; } cast; + cast.const_ptr = memory; /* C-cast still warns */ + __real_free(cast.ptr); +#endif +#if (!defined(LIBXSMM_MALLOC_HOOK) || defined(_DEBUG)) + if ( 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: deallocation does not match allocation!\n"); + } +#endif + } +} + + +#if defined(LIBXSMM_VTUNE) +LIBXSMM_API_INLINE void internal_get_vtune_jitdesc(const void* code, + unsigned int code_id, size_t code_size, const char* code_name, + LIBXSMM_VTUNE_JIT_DESC_TYPE* desc) +{ + LIBXSMM_ASSERT(NULL != code && 0 != code_id && 0 != code_size && NULL != desc); + desc->method_id = code_id; + /* incorrect constness (method_name) */ + desc->method_name = (char*)code_name; + /* incorrect constness (method_load_address) */ + desc->method_load_address = (void*)code; + desc->method_size = code_size; + desc->line_number_size = 0; + desc->line_number_table = NULL; + desc->class_file_name = NULL; + desc->source_file_name = NULL; +# if (2 <= LIBXSMM_VTUNE_JITVERSION) + desc->module_name = "libxsmm.jit"; +# endif +} +#endif + + +LIBXSMM_API_INTERN int libxsmm_malloc_attrib(void** memory, int flags, const char* name) +{ + internal_malloc_info_type *const info = (NULL != memory ? 
internal_malloc_info(*memory, 0/*no check*/) : NULL); + int result = EXIT_SUCCESS; + static int error_once = 0; + if (NULL != info) { + void *const buffer = info->pointer; + const size_t size = info->size; +#if defined(_WIN32) + LIBXSMM_ASSERT(NULL != buffer || 0 == size); +#else + LIBXSMM_ASSERT((NULL != buffer && MAP_FAILED != buffer) || 0 == size); +#endif + flags |= (info->flags & ~LIBXSMM_MALLOC_FLAG_RWX); /* merge with current flags */ + /* quietly keep the read permission, but eventually revoke write permissions */ + if (0 == (LIBXSMM_MALLOC_FLAG_W & flags) || 0 != (LIBXSMM_MALLOC_FLAG_X & flags)) { + const size_t alignment = (size_t)(((const char*)(*memory)) - ((const char*)buffer)); + const size_t alloc_size = size + alignment; + if (0 == (LIBXSMM_MALLOC_FLAG_X & flags)) { /* data-buffer; non-executable */ +#if defined(_WIN32) + /* TODO: implement memory protection under Microsoft Windows */ + LIBXSMM_UNUSED(alloc_size); +#else + if (EXIT_SUCCESS != mprotect(buffer, alloc_size/*entire memory region*/, PROT_READ) + && (LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: read-only request for buffer failed!\n"); + } +#endif + } + else { /* executable buffer requested */ + void *const code_ptr = (NULL != info->reloc ? 
((void*)(((char*)info->reloc) + alignment)) : *memory); + LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_X & flags)); + if (name && *name) { /* profiler support requested */ + if (0 > libxsmm_verbosity) { /* avoid dump if just the profiler is enabled */ + LIBXSMM_EXPECT(EXIT_SUCCESS, libxsmm_dump("LIBXSMM-JIT-DUMP", name, code_ptr, size, 1/*unique*/)); + } +#if defined(LIBXSMM_VTUNE) + if (iJIT_SAMPLING_ON == iJIT_IsProfilingActive()) { + LIBXSMM_VTUNE_JIT_DESC_TYPE vtune_jit_desc; + const unsigned int code_id = iJIT_GetNewMethodID(); + internal_get_vtune_jitdesc(code_ptr, code_id, size, name, &vtune_jit_desc); + iJIT_NotifyEvent(LIBXSMM_VTUNE_JIT_LOAD, &vtune_jit_desc); + info->code_id = code_id; + } + else { + info->code_id = 0; + } +#endif +#if defined(LIBXSMM_PERF) + /* If JIT is enabled and a valid name is given, emit information for profiler + * In jitdump case this needs to be done after mprotect as it gets overwritten + * otherwise. */ + libxsmm_perf_dump_code(code_ptr, size, name); +#endif + } + if (NULL != info->reloc && info->pointer != info->reloc) { +#if defined(_WIN32) + /* TODO: implement memory protection under Microsoft Windows */ +#else + /* memory is already protected at this point; relocate code */ + LIBXSMM_ASSERT(0 != (LIBXSMM_MALLOC_FLAG_MMAP & flags)); + *memory = code_ptr; /* relocate */ + info->pointer = info->reloc; + info->reloc = NULL; +# if !defined(LIBXSMM_MALLOC_CRC_OFF) /* update checksum */ +# if defined(LIBXSMM_MALLOC_CRC_LIGHT) + { const internal_malloc_info_type *const code_info = internal_malloc_info(code_ptr, 0/*no check*/); + info->hash = LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &code_info); + } +# else + info->hash = libxsmm_crc32(LIBXSMM_MALLOC_SEED, info, + /* info size minus actual hash value */ + (unsigned int)(((char*)&info->hash) - ((char*)info))); +# endif +# endif /* treat memory protection errors as soft error; ignore return value */ + munmap(buffer, alloc_size); +#endif + } +#if !defined(_WIN32) + else { /* 
malloc-based fallback */ + int mprotect_result; +# if !defined(LIBXSMM_MALLOC_CRC_OFF) && defined(LIBXSMM_VTUNE) /* check checksum */ +# if defined(LIBXSMM_MALLOC_CRC_LIGHT) + assert(info->hash == LIBXSMM_CRC32U(LIBXSMM_BITS)(LIBXSMM_MALLOC_SEED, &info)); /* !LIBXSMM_ASSERT */ +# else + assert(info->hash == libxsmm_crc32(LIBXSMM_MALLOC_SEED, info, /* !LIBXSMM_ASSERT */ + /* info size minus actual hash value */ + (unsigned int)(((char*)&info->hash) - ((char*)info)))); +# endif +# endif /* treat memory protection errors as soft error; ignore return value */ + mprotect_result = mprotect(buffer, alloc_size/*entire memory region*/, PROT_READ | PROT_EXEC); + if (EXIT_SUCCESS != mprotect_result) { + if (0 != libxsmm_se) { /* hard-error in case of SELinux */ + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: failed to allocate an executable buffer!\n"); + } + result = mprotect_result; + } + else if ((LIBXSMM_VERBOSITY_HIGH <= libxsmm_verbosity || 0 > libxsmm_verbosity) /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: read-only request for JIT-buffer failed!\n"); + } + } + } +#endif + } + } + } + else if (NULL == memory || NULL == *memory) { + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: libxsmm_malloc_attrib failed because NULL cannot be attributed!\n"); + } + result = EXIT_FAILURE; + } + else if ((LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM WARNING: %s buffer %p does not match!\n", + 0 != (LIBXSMM_MALLOC_FLAG_X & flags) ? 
"executable" : "memory", *memory); + } + return result; +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_aligned_malloc(size_t size, size_t alignment) +{ + void* result = NULL; + LIBXSMM_INIT + if (2 > internal_malloc_kind) { +#if !defined(NDEBUG) + int status = +#endif + libxsmm_xmalloc(&result, size, alignment, LIBXSMM_MALLOC_FLAG_DEFAULT, NULL/*extra*/, 0/*extra_size*/); + assert(EXIT_SUCCESS == status || NULL == result); /* !LIBXSMM_ASSERT */ + } + else { /* scratch */ + const void *const caller = libxsmm_trace_caller_id(0/*level*/); + internal_scratch_malloc(&result, size, alignment, LIBXSMM_MALLOC_FLAG_DEFAULT, caller); + } + return result; +} + + +LIBXSMM_API void* libxsmm_realloc(size_t size, void* ptr) +{ + const int nzeros = LIBXSMM_INTRINSICS_BITSCANFWD64((uintptr_t)ptr), alignment = 1 << nzeros; + LIBXSMM_ASSERT(0 == ((uintptr_t)ptr & ~(0xFFFFFFFFFFFFFFFF << nzeros))); + LIBXSMM_INIT + if (2 > internal_malloc_kind) { +#if !defined(NDEBUG) + int status = +#endif + libxsmm_xmalloc(&ptr, size, alignment, LIBXSMM_MALLOC_FLAG_REALLOC, NULL/*extra*/, 0/*extra_size*/); + assert(EXIT_SUCCESS == status || NULL == ptr); /* !LIBXSMM_ASSERT */ + } + else { /* scratch */ + const void *const caller = libxsmm_trace_caller_id(0/*level*/); + internal_scratch_malloc(&ptr, size, alignment, LIBXSMM_MALLOC_FLAG_REALLOC, caller); + } + return ptr; +} + + +LIBXSMM_API void* libxsmm_scratch_malloc(size_t size, size_t alignment, const void* caller) +{ + void* result; + LIBXSMM_INIT + internal_scratch_malloc(&result, size, alignment, + LIBXSMM_MALLOC_INTERNAL_CALLER != caller ? 
LIBXSMM_MALLOC_FLAG_DEFAULT : LIBXSMM_MALLOC_FLAG_PRIVATE, + caller); + return result; +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_MALLOC void* libxsmm_malloc(size_t size) +{ + return libxsmm_aligned_malloc(size, 0/*auto*/); +} + + +LIBXSMM_API void libxsmm_free(const void* memory) +{ + if (NULL != memory) { +#if defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) || /* prefer safe method if possible */ \ + !defined(LIBXSMM_MALLOC_HOOK) +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(memory); + if (NULL != pool) { /* memory belongs to scratch domain */ + internal_scratch_free(memory, pool); + } + else +# endif + { /* local */ + libxsmm_xfree(memory, 2/*check*/); + } +#else /* lookup matching pool */ + internal_malloc_info_type *const info = internal_malloc_info(memory, 2/*check*/); + static int error_once = 0; + if (NULL != info && 0 == (LIBXSMM_MALLOC_FLAG_SCRATCH & info->flags)) { /* !libxsmm_free */ +# if !defined(NDEBUG) + if (EXIT_SUCCESS != internal_xfree(memory, info) /* invalidates info */ + && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: memory deallocation failed!\n"); + } +# else + internal_xfree(memory, info); /* !libxsmm_free, invalidates info */ +# endif + } + else { +# if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + internal_malloc_pool_type *const pool = internal_scratch_malloc_pool(memory); + if (NULL != pool) { /* memory belongs to scratch domain */ + internal_scratch_free(memory, pool); + } + else +# endif + { +# if defined(NDEBUG) && defined(LIBXSMM_MALLOC_HOOK) + __real_free((void*)memory); +# else +# if defined(LIBXSMM_MALLOC_HOOK) + __real_free((void*)memory); +# endif + if (0 != libxsmm_verbosity && /* library code is expected to be mute */ + 1 == 
LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: deallocation does not match allocation!\n"); + } +# endif + } + } +#endif + } +} + + +LIBXSMM_API_INTERN void libxsmm_xrelease_scratch(LIBXSMM_LOCK_TYPE(LIBXSMM_LOCK)* lock) +{ +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + internal_malloc_pool_type* pools = NULL; + libxsmm_scratch_info scratch_info; + LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + if (NULL != lock) { + LIBXSMM_LOCK_ACQUIRE(LIBXSMM_LOCK, lock); + } +# if defined(LIBXSMM_MALLOC_DELETE_SAFE) + if (0 == (internal_malloc_kind & 1) || 0 >= internal_malloc_kind) +# endif + { + unsigned int i; + pools = (internal_malloc_pool_type*)LIBXSMM_UP2( + (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); + for (i = 0; i < libxsmm_scratch_pools; ++i) { + if (0 != pools[i].instance.minsize) { + if ( +# if !defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) + 1 < /*LIBXSMM_ATOMIC_LOAD(&*/pools[i].instance.counter/*, LIBXSMM_ATOMIC_SEQ_CST)*/ && +# endif + NULL != pools[i].instance.buffer) + { + internal_malloc_info_type *const info = internal_malloc_info(pools[i].instance.buffer, 2/*check*/); + if (NULL != info) internal_xfree(info->pointer, info); /* invalidates info */ + } + } + else break; /* early exit */ + } + } + LIBXSMM_EXPECT(EXIT_SUCCESS, libxsmm_get_scratch_info(&scratch_info)); + if (0 != scratch_info.npending && /* library code is expected to be mute */ + (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity)) + { + char pending_size_buffer[32]; + libxsmm_format_value(pending_size_buffer, sizeof(pending_size_buffer), + internal_malloc_public_cur + internal_malloc_local_cur, "KM", "B", 10); + fprintf(stderr, "LIBXSMM WARNING: %s pending scratch-memory by %" PRIuPTR " allocation%s!\n", + pending_size_buffer, (uintptr_t)scratch_info.npending, 1 < scratch_info.npending ? 
"s" : ""); + } + if (NULL != pools) { + memset(pools, 0, (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) * sizeof(internal_malloc_pool_type)); + /* no reset: keep private watermark (internal_malloc_private_max, internal_malloc_private_cur) */ + internal_malloc_public_max = internal_malloc_public_cur = 0; + internal_malloc_local_max = internal_malloc_local_cur = 0; + internal_malloc_scratch_nmallocs = 0; + } + if (NULL != lock) { + LIBXSMM_LOCK_RELEASE(LIBXSMM_LOCK, lock); + } +#endif +} + + +LIBXSMM_API void libxsmm_release_scratch(void) +{ + libxsmm_xrelease_scratch(&libxsmm_lock_global); +} + + +LIBXSMM_API int libxsmm_get_malloc_info(const void* memory, libxsmm_malloc_info* info) +{ + int result = EXIT_SUCCESS; + if (NULL != info) { + size_t size; + result = libxsmm_get_malloc_xinfo(memory, &size, NULL/*flags*/, NULL/*extra*/); + LIBXSMM_MEMZERO127(info); + if (EXIT_SUCCESS == result) { + info->size = size; + } +#if !defined(NDEBUG) /* library code is expected to be mute */ + else if (LIBXSMM_VERBOSITY_WARN <= libxsmm_verbosity || 0 > libxsmm_verbosity) { + static int error_once = 0; + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) { + fprintf(stderr, "LIBXSMM WARNING: foreign memory buffer %p discovered!\n", memory); + } + } +#endif + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API int libxsmm_get_scratch_info(libxsmm_scratch_info* info) +{ + int result = EXIT_SUCCESS; + if (NULL != info) { +#if defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + LIBXSMM_MEMZERO127(info); + info->nmallocs = internal_malloc_scratch_nmallocs; + info->internal = internal_malloc_private_max; + info->local = internal_malloc_local_max; + info->size = internal_malloc_public_max; + { const internal_malloc_pool_type* pool = (const internal_malloc_pool_type*)LIBXSMM_UP2( + (uintptr_t)internal_malloc_pool_buffer, LIBXSMM_MALLOC_SCRATCH_PADDING); +# if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + const 
internal_malloc_pool_type *const end = pool + libxsmm_scratch_pools; + LIBXSMM_ASSERT(libxsmm_scratch_pools <= LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS); + for (; pool != end; ++pool) if ((LIBXSMM_MALLOC_INTERNAL_CALLER) != pool->instance.site) { +# endif + if (0 != pool->instance.minsize) { + const size_t npending = /*LIBXSMM_ATOMIC_LOAD(&*/pool->instance.counter/*, LIBXSMM_ATOMIC_RELAXED)*/; +# if defined(LIBXSMM_MALLOC_SCRATCH_DELETE_FIRST) + info->npending += npending; +# else + info->npending += 1 < npending ? (npending - 1) : 0; +# endif + ++info->npools; + } +# if (1 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS)) + else break; /* early exit */ + } +# endif + } +#else + LIBXSMM_MEMZERO127(info); +#endif /*defined(LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS) && (0 < (LIBXSMM_MALLOC_SCRATCH_MAX_NPOOLS))*/ + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API void libxsmm_set_scratch_limit(size_t nbytes) +{ + /* !LIBXSMM_INIT */ + internal_malloc_scratch_limit = nbytes; +} + + +LIBXSMM_API size_t libxsmm_get_scratch_limit(void) +{ + size_t result; + /* !LIBXSMM_INIT */ + if (LIBXSMM_SCRATCH_DEFAULT != internal_malloc_scratch_limit) { + result = internal_malloc_scratch_limit; + } + else if (0 == internal_malloc_kind) { + result = LIBXSMM_MALLOC_SCRATCH_LIMIT; + } + else { + result = LIBXSMM_SCRATCH_UNLIMITED; + } + return result; +} + + +LIBXSMM_API void libxsmm_set_malloc(int enabled, const size_t* lo, const size_t* hi) +{ + /* !LIBXSMM_INIT */ +#if defined(LIBXSMM_MALLOC_HOOK) && defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) +# if (0 < LIBXSMM_MALLOC) + LIBXSMM_UNUSED(enabled); + internal_malloc_kind = LIBXSMM_MALLOC; +# else + internal_malloc_kind = enabled; +# endif + /* setup lo/hi after internal_malloc_kind! 
*/ + if (NULL != lo) internal_malloc_limit[0] = *lo; + if (NULL != hi) { + const size_t scratch_limit = libxsmm_get_scratch_limit(); + const size_t malloc_upper = LIBXSMM_MIN(*hi, scratch_limit); + internal_malloc_limit[1] = LIBXSMM_MAX(malloc_upper, internal_malloc_limit[0]); + } +#else + LIBXSMM_UNUSED(lo); LIBXSMM_UNUSED(hi); + internal_malloc_kind = enabled; +#endif + libxsmm_malloc_init(); +} + + +LIBXSMM_API int libxsmm_get_malloc(size_t* lo, size_t* hi) +{ + LIBXSMM_INIT +#if defined(LIBXSMM_MALLOC_HOOK) && defined(LIBXSMM_MALLOC) && (0 != LIBXSMM_MALLOC) + if (NULL != lo) *lo = internal_malloc_limit[0]; + if (NULL != hi) *hi = internal_malloc_limit[1]; +#else + if (NULL != lo) *lo = 0; + if (NULL != hi) *hi = 0; +#endif + return internal_malloc_kind; +} + diff --git a/third_party/libxsmm/src/libxsmm_math.c b/third_party/libxsmm/src/libxsmm_math.c new file mode 100644 index 00000000..7c8f8e36 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_math.c @@ -0,0 +1,569 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if !defined(LIBXSMM_NO_LIBM) +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#define LIBXSMM_MATDIFF_DIV(NOMINATOR, DENREF, DENTST) \ + (0 < (DENREF) ? ((NOMINATOR) / (DENREF)) : \ + (0 < (DENTST) ? 
((NOMINATOR) / (DENTST)) : 0)) + + +LIBXSMM_API int libxsmm_matdiff(libxsmm_matdiff_info* info, + libxsmm_datatype datatype, libxsmm_blasint m, libxsmm_blasint n, const void* ref, const void* tst, + const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst) +{ + int result = EXIT_SUCCESS, result_swap = 0, result_nan = 0; + libxsmm_blasint ldr = (NULL == ldref ? m : *ldref), ldt = (NULL == ldtst ? m : *ldtst); + if (NULL == ref && NULL != tst) { ref = tst; tst = NULL; result_swap = 1; } + if (NULL != ref && NULL != info && m <= ldr && m <= ldt) { + const size_t ntotal = (size_t)m * n; + libxsmm_blasint mm = m, nn = n; + double inf; + if (1 == n) { mm = ldr = ldt = 1; nn = m; } /* ensure row-vector shape to standardize results */ + libxsmm_matdiff_clear(info); + inf = info->min_ref; + switch (datatype) { + case LIBXSMM_DATATYPE_F64: { +# define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE double +# include "template/libxsmm_matdiff.tpl.c" +# undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE + } break; + case LIBXSMM_DATATYPE_F32: { +# define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE float +# include "template/libxsmm_matdiff.tpl.c" +# undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE + } break; + case LIBXSMM_DATATYPE_I32: { +# define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE int +# include "template/libxsmm_matdiff.tpl.c" +# undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE + } break; + case LIBXSMM_DATATYPE_I16: { +# define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE short +# include "template/libxsmm_matdiff.tpl.c" +# undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE + } break; + case LIBXSMM_DATATYPE_I8: { +# define LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE signed char +# include "template/libxsmm_matdiff.tpl.c" +# undef LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE + } break; + default: { + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: unsupported data-type requested!\n"); + } + 
result = EXIT_FAILURE; + } + } + LIBXSMM_ASSERT((0 <= info->m && 0 <= info->n) || (0 > info->m && 0 > info->n)); + LIBXSMM_ASSERT(info->m < mm && info->n < nn); + if (EXIT_SUCCESS == result) { + const char *const env = getenv("LIBXSMM_DUMP"); + LIBXSMM_INIT + if (NULL != env && 0 != *env && '0' != *env) { + if ('-' != *env || (0 <= info->m && 0 <= info->n)) { + const char *const defaultname = (('0' < *env && '9' >= *env) || '-' == *env) ? "libxsmm_dump" : env; + const libxsmm_mhd_elemtype type_src = (libxsmm_mhd_elemtype)datatype; + const libxsmm_mhd_elemtype type_dst = LIBXSMM_MIN(LIBXSMM_MHD_ELEMTYPE_F32, type_src); + const int envi = atoi(env), reshape = (1 < envi || -1 > envi); + size_t shape[2], size[2]; + char filename[256]; + if (0 == reshape) { + shape[0] = (size_t)mm; shape[1] = (size_t)nn; + size[0] = (size_t)ldr; size[1] = (size_t)nn; + } + else { /* reshape */ + const size_t y = (size_t)libxsmm_isqrt2_u32((unsigned int)ntotal); + shape[0] = ntotal / y; shape[1] = y; + size[0] = shape[0]; + size[1] = shape[1]; + } + LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s-%p-ref.mhd", defaultname, ref); + libxsmm_mhd_write(filename, NULL/*offset*/, shape, size, 2/*ndims*/, 1/*ncomponents*/, + type_src, &type_dst, ref, NULL/*header_size*/, NULL/*extension_header*/, + NULL/*extension*/, 0/*extension_size*/); + if (NULL != tst) { + if (0 == reshape) { + size[0] = (size_t)ldt; + size[1] = (size_t)nn; + } + LIBXSMM_SNPRINTF(filename, sizeof(filename), "%s-%p-tst.mhd", defaultname, ref/*adopt ref-ptr*/); + libxsmm_mhd_write(filename, NULL/*offset*/, shape, size, 2/*ndims*/, 1/*ncomponents*/, + type_src, &type_dst, tst, NULL/*header_size*/, NULL/*extension_header*/, + NULL/*extension*/, 0/*extension_size*/); + if ('-' == *env && '1' < env[1]) { + printf("LIBXSMM MATDIFF (%s): m=%" PRIuPTR " n=%" PRIuPTR " ldi=%" PRIuPTR " ldo=%" PRIuPTR " failed.\n", + libxsmm_typename(datatype), (uintptr_t)m, (uintptr_t)n, (uintptr_t)ldr, (uintptr_t)ldt); + } + } + } + else if ('-' == 
*env && '1' < env[1] && NULL != tst) { + printf("LIBXSMM MATDIFF (%s): m=%" PRIuPTR " n=%" PRIuPTR " ldi=%" PRIuPTR " ldo=%" PRIuPTR " passed.\n", + libxsmm_typename(datatype), (uintptr_t)m, (uintptr_t)n, (uintptr_t)ldr, (uintptr_t)ldt); + } + } + if (0 == result_nan) { + info->rsq = 1.0 - LIBXSMM_MATDIFF_DIV(info->l2_abs, info->var_ref, info->var_tst); + if (0 != ntotal) { /* final variance */ + info->var_ref /= ntotal; + info->var_tst /= ntotal; + } + info->normf_rel = libxsmm_dsqrt(info->normf_rel); + info->l2_abs = libxsmm_dsqrt(info->l2_abs); + info->l2_rel = libxsmm_dsqrt(info->l2_rel); + } + else if (1 == result_nan) { + /* in case of NaN in test-set, statistics is not set to inf (ref/test) */ + info->norm1_abs = info->norm1_rel = info->normi_abs = info->normi_rel = info->normf_rel + = info->linf_abs = info->linf_rel = info->l2_abs = info->l2_rel + = inf; + } + if (1 == n) LIBXSMM_ISWAP(info->m, info->n); + if (0 != result_swap) { + info->min_tst = info->min_ref; + info->min_ref = 0; + info->max_tst = info->max_ref; + info->max_ref = 0; + info->avg_tst = info->avg_ref; + info->avg_ref = 0; + info->var_tst = info->var_ref; + info->var_ref = 0; + info->l1_tst = info->l1_ref; + info->l1_ref = 0; + info->v_tst = info->v_ref; + info->v_ref = 0; + } + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API void libxsmm_matdiff_reduce(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input) +{ + if (NULL != output && NULL != input) { + if (output->linf_abs < input->linf_abs) { + output->linf_abs = input->linf_abs; + output->linf_rel = input->linf_rel; + output->v_ref = input->v_ref; + output->v_tst = input->v_tst; + LIBXSMM_ASSERT(0 <= input->m); + output->m = input->m; + LIBXSMM_ASSERT(0 <= input->n); + output->n = input->n; + } + if (output->norm1_abs < input->norm1_abs) { + output->norm1_abs = input->norm1_abs; + output->norm1_rel = input->norm1_rel; + } + if (output->normi_abs < input->normi_abs) { + output->normi_abs = 
input->normi_abs; + output->normi_rel = input->normi_rel; + } + if (output->l2_abs < input->l2_abs) { + output->l2_abs = input->l2_abs; + output->l2_rel = input->l2_rel; + output->rsq = input->rsq; + } + if (output->normf_rel < input->normf_rel) { + output->normf_rel = input->normf_rel; + } + if (output->var_ref < input->var_ref) { + output->var_ref = input->var_ref; + } + if (output->var_tst < input->var_tst) { + output->var_tst = input->var_tst; + } + if (output->max_ref < input->max_ref) { + output->max_ref = input->max_ref; + } + if (output->max_tst < input->max_tst) { + output->max_tst = input->max_tst; + } + if (output->min_ref > input->min_ref) { + output->min_ref = input->min_ref; + } + if (output->min_tst > input->min_tst) { + output->min_tst = input->min_tst; + } + output->avg_ref = 0.5 * (output->avg_ref + input->avg_ref); + output->avg_tst = 0.5 * (output->avg_tst + input->avg_tst); + output->l1_ref += input->l1_ref; + output->l1_tst += input->l1_tst; + } + else { + libxsmm_matdiff_clear(output); + } +} + + +LIBXSMM_API void libxsmm_matdiff_clear(libxsmm_matdiff_info* info) +{ + if (NULL != info) { + union { int raw; float value; } inf; +#if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) + inf.value = (float)(INFINITY); +#else + inf.raw = 0x7F800000; +#endif + memset(info, 0, sizeof(*info)); /* nullify */ + /* no location discovered yet with a difference */ + info->m = info->n = -1; + /* initial minimum/maximum of reference/test */ + info->min_ref = info->min_tst = +inf.value; + info->max_ref = info->max_tst = -inf.value; + } +} + + +LIBXSMM_API size_t libxsmm_shuffle(unsigned int n) +{ + const unsigned int s = (0 != (n & 1) ? ((n / 2 - 1) | 1) : ((n / 2) & ~1)); + const unsigned int d = (0 != (n & 1) ? 1 : 2); + unsigned int result = (1 < n ? 1 : 0), i; + for (i = (d < n ? (n - 1) : 0); d < i; i -= d) { + unsigned int c = (s <= i ? 
(i - s) : (s - i)); + unsigned int a = n, b = c; + do { + const unsigned int r = a % b; + a = b; + b = r; + } while (0 != b); + if (1 == a) { + result = c; + if (2 * c <= n) { + i = d; /* break */ + } + } + } + assert((0 == result && 1 >= n) || (result < n && 1 == libxsmm_gcd(result, n))); + return result; +} + + +LIBXSMM_API unsigned int libxsmm_isqrt_u64(unsigned long long x) +{ + unsigned long long b; unsigned int y = 0, s; + for (s = 0x80000000/*2^31*/; 0 < s; s >>= 1) { + b = y | s; y |= (b * b <= x ? s : 0); + } + return y; +} + + +LIBXSMM_API unsigned int libxsmm_isqrt_u32(unsigned int x) +{ + unsigned int b; unsigned int y = 0; int s; + for (s = 0x40000000/*2^30*/; 0 < s; s >>= 2) { + b = y | s; y >>= 1; + if (b <= x) { x -= b; y |= s; } + } + return y; +} + + +LIBXSMM_API unsigned int libxsmm_isqrt2_u32(unsigned int x) +{ + return libxsmm_product_limit(x, libxsmm_isqrt_u32(x), 0/*is_lower*/); +} + + +LIBXSMM_API double libxsmm_kahan_sum(double value, double* accumulator, double* compensation) +{ + double r, c; + LIBXSMM_ASSERT(NULL != accumulator && NULL != compensation); + c = value - *compensation; r = *accumulator + c; + *compensation = (r - *accumulator) - c; + *accumulator = r; + return r; +} + + +LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) double libxsmm_dsqrt(double x) +{ +#if defined(LIBXSMM_INTRINSICS_X86) && !defined(__PGI) + const __m128d a = LIBXSMM_INTRINSICS_MM_UNDEFINED_PD(); + const double result = _mm_cvtsd_f64(_mm_sqrt_sd(a, _mm_set_sd(x))); +#elif !defined(LIBXSMM_NO_LIBM) + const double result = sqrt(x); +#else /* fallback */ + double result, y = x; + if (LIBXSMM_NEQ(0, x)) { + do { + result = y; + y = 0.5 * (y + x / y); + } while (LIBXSMM_NEQ(result, y)); + } + result = y; +#endif + return result; +} + + +LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) float libxsmm_ssqrt(float x) +{ +#if defined(LIBXSMM_INTRINSICS_X86) + const float result = _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(x))); +#elif !defined(LIBXSMM_NO_LIBM) + 
const float result = LIBXSMM_SQRTF(x); +#else /* fallback */ + float result, y = x; + if (LIBXSMM_NEQ(0, x)) { + do { + result = y; + y = 0.5f * (y + x / y); + } while (LIBXSMM_NEQ(result, y)); + } + result = y; +#endif + return result; +} + + +LIBXSMM_API unsigned int libxsmm_icbrt_u64(unsigned long long x) +{ + unsigned long long b; unsigned int y = 0; int s; + for (s = 63; 0 <= s; s -= 3) { + y += y; b = ((unsigned long long)y + 1) * 3 * y + 1ULL; + if (b <= (x >> s)) { x -= b << s; ++y; } + } + return y; +} + + +LIBXSMM_API unsigned int libxsmm_icbrt_u32(unsigned int x) +{ + unsigned int b; unsigned int y = 0; int s; + for (s = 30; 0 <= s; s -= 3) { + y += y; b = 3 * y * (y + 1) + 1; + if (b <= (x >> s)) { x -= b << s; ++y; } + } + return y; +} + +#if defined(LIBXSMM_NO_LIBM) +/* Implementation based on Claude Baumann's product (http://www.convict.lu/Jeunes/ultimate_stuff/exp_ln_2.htm). + * Exponential function, which exposes the number of iterations taken in the main case (1...22). + */ +LIBXSMM_API_INLINE float internal_math_sexp2(float x, int maxiter) +{ + static const float lut[] = { /* tabulated powf(2.f, powf(2.f, -index)) */ + 2.00000000f, 1.41421354f, 1.18920708f, 1.09050775f, 1.04427373f, 1.02189720f, 1.01088929f, 1.00542986f, + 1.00271130f, 1.00135469f, 1.00067711f, 1.00033855f, 1.00016928f, 1.00008464f, 1.00004232f, 1.00002110f, + 1.00001061f, 1.00000525f, 1.00000262f, 1.00000131f, 1.00000072f, 1.00000036f, 1.00000012f + }; + const int lut_size = sizeof(lut) / sizeof(*lut), lut_size1 = lut_size - 1; + int sign, temp, unbiased, exponent, mantissa; + union { int i; float s; } result; + + result.s = x; + sign = (0 == (result.i & 0x80000000) ? 
0 : 1); + temp = result.i & 0x7FFFFFFF; /* clear sign */ + unbiased = (temp >> 23) - 127; /* exponent */ + exponent = -unbiased; + mantissa = (temp << 8) | 0x80000000; + + if (lut_size1 >= exponent) { + if (lut_size1 != exponent) { /* multiple lookups needed */ + if (7 >= unbiased) { /* not a degenerated case */ + const int n = (0 >= maxiter || lut_size1 <= maxiter) ? lut_size1 : maxiter; + int i = 1; + if (0 > unbiased) { /* regular/main case */ + LIBXSMM_ASSERT(0 <= exponent && exponent < lut_size); + result.s = lut[exponent]; /* initial value */ + i = exponent + 1; /* next LUT offset */ + } + else { + result.s = 2.f; /* lut[0] */ + i = 1; /* next LUT offset */ + } + for (; i <= n && 0 != mantissa; ++i) { + mantissa <<= 1; + if (0 != (mantissa & 0x80000000)) { /* check MSB */ + LIBXSMM_ASSERT(0 <= i && i < lut_size); + result.s *= lut[i]; /* TODO: normalized multiply */ + } + } + for (i = 0; i < unbiased; ++i) { /* compute squares */ + result.s *= result.s; + } + if (0 != sign) { /* negative value, so reciprocal */ + result.s = 1.f / result.s; + } + } + else { /* out of range */ +#if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) + result.s = (0 == sign ? ((float)(INFINITY)) : 0.f); +#else + result.i = (0 == sign ? 
0x7F800000 : 0); +#endif + } + } + else if (0 == sign) { + result.s = lut[lut_size1]; + } + else { /* reciprocal */ + result.s = 1.f / lut[lut_size1]; + } + } + else { + result.s = 1.f; /* case 2^0 */ + } + return result.s; +} +#endif + + +LIBXSMM_API float libxsmm_sexp2(float x) +{ +#if !defined(LIBXSMM_NO_LIBM) + return LIBXSMM_EXP2F(x); +#else /* fallback */ + return internal_math_sexp2(x, 20/*compromise*/); +#endif +} + + +LIBXSMM_API float libxsmm_sexp2_u8(unsigned char x) +{ + union { int i; float s; } result; + if (128 > x) { + if (31 < x) { + const float r32 = 2.f * ((float)(1U << 31)); /* 2^32 */ + const int n = x >> 5; + int i; + result.s = r32; + for (i = 1; i < n; ++i) result.s *= r32; + result.s *= (1U << (x - (n << 5))); + } + else { + result.s = (float)(1U << x); + } + } + else { +#if defined(INFINITY) && /*overflow warning*/!defined(_CRAYC) + result.s = (float)(INFINITY); +#else + result.i = 0x7F800000; +#endif + } + return result.s; +} + + +LIBXSMM_API float libxsmm_sexp2_i8(signed char x) +{ + union { int i; float s; } result; + if (-128 != x) { + const signed char ux = (signed char)LIBXSMM_ABS(x); + if (31 < ux) { + const float r32 = 2.f * ((float)(1U << 31)); /* 2^32 */ + const int n = ux >> 5; + int i; + result.s = r32; + for (i = 1; i < n; ++i) result.s *= r32; + result.s *= (1U << (ux - (n << 5))); + } + else { + result.s = (float)(1U << ux); + } + if (ux != x) { /* signed */ + result.s = 1.f / result.s; + } + } + else { + result.i = 0x200000; + } + return result.s; +} + + +LIBXSMM_API float libxsmm_sexp2_i8i(int x) +{ + LIBXSMM_ASSERT(-128 <= x && x <= 127); + return libxsmm_sexp2_i8((signed char)x); +} + + +#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff)(libxsmm_matdiff_info* /*info*/, + const int* /*datatype*/, const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const void* 
/*ref*/, const void* /*tst*/, + const libxsmm_blasint* /*ldref*/, const libxsmm_blasint* /*ldtst*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff)(libxsmm_matdiff_info* info, + const int* datatype, const libxsmm_blasint* m, const libxsmm_blasint* n, const void* ref, const void* tst, + const libxsmm_blasint* ldref, const libxsmm_blasint* ldtst) +{ + static int error_once = 0; + if ((NULL == datatype || LIBXSMM_DATATYPE_UNSUPPORTED <= *datatype || 0 > *datatype || NULL == m + || EXIT_SUCCESS != libxsmm_matdiff(info, (libxsmm_datatype)*datatype, *m, *(NULL != n ? n : m), ref, tst, ldref, ldtst)) + && 0 != libxsmm_verbosity && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_matdiff specified!\n"); + } +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_reduce)(libxsmm_matdiff_info* /*output*/, const libxsmm_matdiff_info* /*input*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_reduce)(libxsmm_matdiff_info* output, const libxsmm_matdiff_info* input) +{ + libxsmm_matdiff_reduce(output, input); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_clear)(libxsmm_matdiff_info* /*info*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matdiff_clear)(libxsmm_matdiff_info* info) +{ + libxsmm_matdiff_clear(info); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_shuffle)(long long* /*coprime*/, const int* /*n*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_shuffle)(long long* coprime, const int* n) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != coprime && NULL != n && 0 <= *n) +#endif + { + *coprime = (long long)(libxsmm_shuffle((unsigned int)(*n)) & 0x7FFFFFFF); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == 
LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_shuffle specified!\n"); + } +#endif +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_matrixeqn.c b/third_party/libxsmm/src/libxsmm_matrixeqn.c new file mode 100644 index 00000000..54d8f5ef --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_matrixeqn.c @@ -0,0 +1,1265 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#include "libxsmm_matrixeqn.h" + +/* aux struct for matrix equations */ +LIBXSMM_APIVAR_DEFINE(libxsmm_matrix_eqn* libxsmm_matrix_eqns[256]); +LIBXSMM_APIVAR_DEFINE(libxsmm_blasint libxsmm_matrix_eqns_init); +LIBXSMM_APIVAR_DEFINE(libxsmm_blasint libxsmm_matrix_eqns_count); + +LIBXSMM_API_INTERN libxsmm_matrix_eqn* libxsmm_matrix_eqn_get_equation( libxsmm_blasint eqn_idx ) { + return libxsmm_matrix_eqns[eqn_idx]; +} + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_unary(libxsmm_meltw_unary_flags flags) { + libxsmm_matrix_eqn_bcast_type result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE; + if ((flags & LIBXSMM_MELTW_FLAG_UNARY_BCAST_ROW) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_UNARY_BCAST_COL) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_UNARY_BCAST_SCALAR) > 0) { + result = 
LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + return result; +} + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_binary(libxsmm_meltw_binary_flags flags, unsigned int side) { + libxsmm_matrix_eqn_bcast_type result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE; + if (side == RIGHT) { + if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_ROW_IN_1) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_COL_IN_1) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_SCALAR_IN_1) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + } + if (side == LEFT) { + if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_ROW_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_COL_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_BINARY_BCAST_SCALAR_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + } + return result; +} + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_ternary(libxsmm_meltw_ternary_flags flags, unsigned int side) { + libxsmm_matrix_eqn_bcast_type result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE; + if (side == RIGHT2) { + if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_2) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_2) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_2) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + } + if (side == RIGHT) { + if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_1) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_1) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_1) > 0) { + result = 
LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + } + if (side == LEFT) { + if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_ROW_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_COL_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL; + } else if ((flags & LIBXSMM_MELTW_FLAG_TERNARY_BCAST_SCALAR_IN_0) > 0) { + result = LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR; + } + } + return result; +} + +LIBXSMM_API_INTERN libxsmm_blasint can_overwrite_unary_input(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN libxsmm_blasint can_overwrite_unary_input(libxsmm_matrix_eqn_elem* cur_node) { + libxsmm_blasint result = 1; + if (cur_node->info.u_op.type == LIBXSMM_MELTW_TYPE_UNARY_IDENTITY) { + result = 0; + } + if ((cur_node->le->tmp.dtype == LIBXSMM_DATATYPE_BF16) && (cur_node->tmp.dtype == LIBXSMM_DATATYPE_F32)) { + result = 0; + } + if (is_unary_opcode_transform_kernel(cur_node->info.u_op.type) > 0) { + result = 0; + } + return result; +} + +LIBXSMM_API_INTERN libxsmm_blasint can_overwrite_binary_input(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN libxsmm_blasint can_overwrite_binary_input(libxsmm_matrix_eqn_elem* cur_node) { + libxsmm_blasint result = 1; + if (cur_node->info.b_op.type == LIBXSMM_MELTW_TYPE_BINARY_MATMUL) { + result = 0; + } + if (((cur_node->le->tmp.dtype == LIBXSMM_DATATYPE_BF16) || (cur_node->ri->tmp.dtype == LIBXSMM_DATATYPE_BF16)) && (cur_node->tmp.dtype == LIBXSMM_DATATYPE_F32)) { + result = 0; + } + return result; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_dbg_print( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint indent ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_dbg_print( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint indent ) { + libxsmm_blasint i; + libxsmm_blasint tree_print_indent = 4; + + for ( i = 0; i < indent; ++i ) { + if ( i < indent - tree_print_indent ) { + printf(" "); + } else { + if ( i % tree_print_indent == 0 ) { + printf("|"); + } else { 
+ printf("-"); + } + } + } + + /* check if we are at an argument leaf, then we move up */ + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + libxsmm_blasint argid = cur_node->info.arg.in_pos; + if ( (cur_node->le == NULL) && (cur_node->ri == NULL) ) { + if (argid >= 0) { + printf("ARG: M=%i, N=%i, LD=%i, arg_id=%i, dtype=%i\n", cur_node->info.arg.m, cur_node->info.arg.n, cur_node->info.arg.ld, cur_node->info.arg.in_pos, LIBXSMM_TYPESIZE(cur_node->info.arg.dtype) ); + } else { + printf("ARG: M=%i, N=%i, LD=%i, arg_id is scratch=%i, dtype=%i\n", cur_node->info.arg.m, cur_node->info.arg.n, cur_node->info.arg.ld, -1-argid, LIBXSMM_TYPESIZE(cur_node->info.arg.dtype) ); + } + } else { + printf("ERROR: Arg cannot have left or right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* we have to push more in this branch */ + if ( cur_node->le != NULL ) { + printf("UNARY: type=%i, flags=%i, timestamp=%i, out_tmp_id=%i, out_dtype=%i\n", (int)cur_node->info.u_op.type, (int)cur_node->info.u_op.flags, cur_node->visit_timestamp, cur_node->tmp.id, LIBXSMM_TYPESIZE(cur_node->tmp.dtype)); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->le, indent+tree_print_indent ); + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( (cur_node->ri != NULL) ) { + printf("ERROR: Unary cannot have right childs!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) ) { + printf("BINARY: type=%i, flags=%i, timestamp=%i, out_tmp_id=%i, out_dtype=%i\n", (int)cur_node->info.b_op.type, (int)cur_node->info.b_op.flags, cur_node->visit_timestamp, cur_node->tmp.id, LIBXSMM_TYPESIZE(cur_node->tmp.dtype)); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->le, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->ri, indent+tree_print_indent ); + } else { + printf("ERROR: Binary needs left and right 
child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) && (cur_node->r2 != NULL)) { + printf("TERNARY: type=%i, flags=%i, timestamp=%i, out_tmp_id=%i, out_dtype=%i\n", (int)cur_node->info.t_op.type, (int)cur_node->info.t_op.flags, cur_node->visit_timestamp, cur_node->tmp.id, LIBXSMM_TYPESIZE(cur_node->tmp.dtype)); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->le, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->ri, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_dbg_print( cur_node->r2, indent+tree_print_indent ); + } else { + printf("ERROR: Ternary needs three children!\n"); + } + } else { + /* shouldn't happen */ + } +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_assign_reg_scores( libxsmm_matrix_eqn_elem* cur_node ) { + /* check if we are at an argument leaf, then we assign register score 0 */ + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + if ( (cur_node->le == NULL) && (cur_node->ri == NULL) ) { + cur_node->reg_score = 0; + } + else { + printf("ERROR: Arg cannot have left or right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* If the node is unary type we have the following cases: + * 1) If the left child is an arg, we just set the score to 1 (we do not overwrite the input) + * 2) if the left child is NOT an arg AND we can overwrite the tmp, we just propagate the register score from it (no additional tmp storage is needed) + * 3) if the left child is NOT an arg AND we CAN NOT overwrite the tmp, we should make the register score at least 2 + * */ + if ( cur_node->le != NULL ) { + libxsmm_matrix_eqn_assign_reg_scores( cur_node->le ); + if ( cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + cur_node->reg_score = 1; + } else { + if (can_overwrite_unary_input(cur_node) > 0) { + cur_node->reg_score = cur_node->le->reg_score; + } else { + 
cur_node->reg_score = LIBXSMM_MAX(2, cur_node->le->reg_score); + } + } + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( (cur_node->ri != NULL) ) { + printf("ERROR: Unary cannot have right childs!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) ) { + libxsmm_matrix_eqn_assign_reg_scores( cur_node->le ); + libxsmm_matrix_eqn_assign_reg_scores( cur_node->ri ); + + /* If left and right are args, we just need 1 tmp */ + if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->reg_score = 1; + } else { + if (can_overwrite_binary_input(cur_node) > 0) { + /* If the node is binary type we have two cases: + * 1) If the left/right subtrees have the same register score, we have to increase it by one (i.e. we have to first compute one of the subtrees and keep the result in a tmp storage and then compute the other subtree, so we would need an extra tmp storage) + * 2) If the left/right subtrees DO NOT have the same register score, then we assign the maximum of the register scores (i.e. 
we would compute first the subtree with the maximum score and then the tree with the smallest score, thus no extra tmp storage is required) */ + if (cur_node->le->reg_score == cur_node->ri->reg_score) { + cur_node->reg_score = cur_node->le->reg_score + 1; + } else { + cur_node->reg_score = LIBXSMM_MAX(cur_node->le->reg_score, cur_node->ri->reg_score); + } + } else { + if (cur_node->le->reg_score == cur_node->ri->reg_score) { + cur_node->reg_score = LIBXSMM_MAX(3, cur_node->le->reg_score + 1); + } else { + cur_node->reg_score = LIBXSMM_MAX(3, LIBXSMM_MAX(cur_node->le->reg_score, cur_node->ri->reg_score)); + } + } + } + } else { + printf("ERROR: Binary needs left and right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) && (cur_node->r2 != NULL) ) { + int use_r2_as_output = ((cur_node->info.t_op.flags & LIBXSMM_MELTW_FLAG_TERNARY_REUSE_IN_2_AS_OUT) > 0) ? 1 : 0; + libxsmm_matrix_eqn_assign_reg_scores( cur_node->le ); + libxsmm_matrix_eqn_assign_reg_scores( cur_node->ri ); + libxsmm_matrix_eqn_assign_reg_scores( cur_node->r2 ); + /* If all children re args, we just need 1 tmp */ + if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->reg_score = 1; + } else { + if (use_r2_as_output > 0) { + cur_node->reg_score = LIBXSMM_MAX(3, LIBXSMM_MAX(LIBXSMM_MAX(cur_node->le->reg_score, cur_node->ri->reg_score), cur_node->r2->reg_score)); + } else { + cur_node->reg_score = LIBXSMM_MAX(4, LIBXSMM_MAX(LIBXSMM_MAX(cur_node->le->reg_score, cur_node->ri->reg_score), cur_node->r2->reg_score)); + } + } + } else { + printf("ERROR: Ternary needs all three children!\n"); + } + } else { + /* shouldn't happen */ + } +} + +LIBXSMM_API_INTERN +void libxsmm_generator_assign_new_timestamp(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *current_timestamp ) { + if ( 
cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + /* Do not increase the timestamp, this node is just an arg so it's not part of the execution */ + cur_node->visit_timestamp = -1; + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + cur_node->visit_timestamp = *current_timestamp; + *current_timestamp = *current_timestamp + 1; + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + if (cur_node->le->reg_score >= cur_node->ri->reg_score) { + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + } else { + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + } + cur_node->visit_timestamp = *current_timestamp; + *current_timestamp = *current_timestamp + 1; + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + if ((cur_node->le->reg_score >= cur_node->ri->reg_score) && (cur_node->le->reg_score >= cur_node->r2->reg_score) ) { + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + if ( cur_node->ri->reg_score >= cur_node->r2->reg_score ) { + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->r2, current_timestamp ); + } else { + libxsmm_generator_assign_new_timestamp( cur_node->r2, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + } + } else if ((cur_node->ri->reg_score >= cur_node->le->reg_score) && (cur_node->ri->reg_score >= cur_node->r2->reg_score) ) { + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + if ( cur_node->le->reg_score >= cur_node->r2->reg_score ) { + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + 
libxsmm_generator_assign_new_timestamp( cur_node->r2, current_timestamp ); + } else { + libxsmm_generator_assign_new_timestamp( cur_node->r2, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + } + } else { + libxsmm_generator_assign_new_timestamp( cur_node->r2, current_timestamp ); + if ( cur_node->le->reg_score >= cur_node->ri->reg_score ) { + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + } else { + libxsmm_generator_assign_new_timestamp( cur_node->ri, current_timestamp ); + libxsmm_generator_assign_new_timestamp( cur_node->le, current_timestamp ); + } + } + } else { + /* shouldn't happen */ + } +} + +LIBXSMM_API_INTERN +void libxsmm_generator_matequation_assign_timestamps(libxsmm_matrix_eqn *eqn) { + libxsmm_blasint timestamp = 0; + libxsmm_generator_assign_new_timestamp(eqn->eqn_root, ×tamp ); +} + +LIBXSMM_API_INTERN libxsmm_blasint reserve_tmp_storage(libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool) { + libxsmm_blasint i; + if ( tmp_storage_pool != NULL ) { + for (i = 0; i < n_max_tmp; i++) { + if (tmp_storage_pool[i] == 0) { + tmp_storage_pool[i] = 1; + return i; + } + } + } + return -1; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_unary_tmp(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_unary_tmp(libxsmm_matrix_eqn_elem* cur_node) { + cur_node->tmp.m = cur_node->le->tmp.m; + cur_node->tmp.n = cur_node->le->tmp.n; + cur_node->tmp.ld = cur_node->le->tmp.ld; + cur_node->tmp.dtype = cur_node->info.u_op.dtype; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_binary_tmp(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_binary_tmp(libxsmm_matrix_eqn_elem* cur_node) { + cur_node->tmp.m = cur_node->le->tmp.m; + cur_node->tmp.ld = cur_node->le->tmp.ld; + if 
(cur_node->info.b_op.type == LIBXSMM_MELTW_TYPE_BINARY_MATMUL) { + cur_node->tmp.n = cur_node->ri->tmp.n; + } else { + cur_node->tmp.n = cur_node->le->tmp.n; + } + cur_node->tmp.dtype = cur_node->info.b_op.dtype; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_ternary_tmp(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_configure_ternary_tmp(libxsmm_matrix_eqn_elem* cur_node) { + cur_node->tmp.m = cur_node->r2->tmp.m; + cur_node->tmp.n = cur_node->r2->tmp.n; + cur_node->tmp.ld = cur_node->r2->tmp.ld; + if (cur_node->info.t_op.type == LIBXSMM_MELTW_TYPE_TERNARY_MATMUL) { + cur_node->tmp.m = cur_node->r2->tmp.m; + cur_node->tmp.n = cur_node->r2->tmp.n; + cur_node->tmp.ld = cur_node->r2->tmp.ld; + } + cur_node->tmp.dtype = cur_node->info.t_op.dtype; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_arg_node(libxsmm_matrix_eqn_elem* cur_node); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_arg_node(libxsmm_matrix_eqn_elem* cur_node) { + /* Do not increase the timestamp, this node is just an arg so it's not part of the execution */ + cur_node->visit_timestamp = -1; + cur_node->n_args = 1; + cur_node->max_tmp_size = cur_node->info.arg.ld * cur_node->info.arg.n; + cur_node->tmp.m = cur_node->info.arg.m; + cur_node->tmp.n = cur_node->info.arg.n; + cur_node->tmp.ld = cur_node->info.arg.ld; + cur_node->tmp.dtype = cur_node->info.arg.dtype; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_unary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_unary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool) { + /* Assign timestamp and propagate info for n_args/max_tmp_size */ + cur_node->visit_timestamp = *global_timestamp; + *global_timestamp = 
*global_timestamp + 1; + cur_node->n_args = cur_node->le->n_args; + cur_node->max_tmp_size = cur_node->le->max_tmp_size; + /* When assigning the tmp output storage, we have two cases in the unary: + * 1) The child is an arg, so we have to reserve a tmp storage + * 2) The child is NOT an arg, so we just reuse the tmp storage of the child IF we are allowed to overwrite */ + if ( cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + cur_node->tree_max_comp_tsize = LIBXSMM_TYPESIZE( cur_node->info.u_op.dtype ); + } else { + if (can_overwrite_unary_input(cur_node) > 0) { + cur_node->tmp.id = cur_node->le->tmp.id; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->le->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE(cur_node->info.u_op.dtype), cur_node->le->tree_max_comp_tsize ); + } + libxsmm_matrix_eqn_exec_plan_configure_unary_tmp( cur_node ); +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_binary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_binary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool) { + /* Assign timestamp and propagate info for n_args/max_tmp_size */ + cur_node->visit_timestamp = *global_timestamp; + *global_timestamp = *global_timestamp + 1; + cur_node->n_args = cur_node->le->n_args + cur_node->ri->n_args; + cur_node->max_tmp_size = LIBXSMM_MAX(cur_node->le->max_tmp_size, cur_node->ri->max_tmp_size); + /* Max tmp size has to be adjusted if it is a MATMUL op */ + if (cur_node->info.b_op.type == LIBXSMM_MELTW_TYPE_BINARY_MATMUL) { + libxsmm_blasint matmul_out_size = cur_node->le->tmp.ld * cur_node->ri->tmp.n; + cur_node->max_tmp_size = 
LIBXSMM_MAX(matmul_out_size, cur_node->max_tmp_size); + } + /* When assigning the tmp output storage, we have three cases in the binary: + * 1) Both children are arg, so we have to reserve a tmp storage + * 2) Both child are NOT arg, so we reuse the tmp storage of either one for our output and we make the other tmp storage available IF we are allowed to overwrite + * 3) One child IS arg and the other child is NOT an arg, so we just reuse the tmp storage of the non-arg child IF we are allowed to overwrite */ + if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + cur_node->tree_max_comp_tsize = LIBXSMM_TYPESIZE( cur_node->info.b_op.dtype ); + } else if ( (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + if (can_overwrite_binary_input(cur_node) > 0) { + cur_node->tmp.id = cur_node->le->tmp.id; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->le->tmp.id] = 0; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.b_op.dtype ), LIBXSMM_MAX( cur_node->ri->tree_max_comp_tsize, cur_node->le->tree_max_comp_tsize )); + } else { + if (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) { + if (can_overwrite_binary_input(cur_node) > 0) { + cur_node->tmp.id = cur_node->le->tmp.id; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->le->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE(cur_node->info.b_op.dtype), cur_node->le->tree_max_comp_tsize ); + } else { + if (can_overwrite_binary_input(cur_node) > 0) { + cur_node->tmp.id = cur_node->ri->tmp.id; + } else { + cur_node->tmp.id = reserve_tmp_storage( 
n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE(cur_node->info.b_op.dtype), cur_node->ri->tree_max_comp_tsize ); + } + } + libxsmm_matrix_eqn_exec_plan_configure_binary_tmp( cur_node ); +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_ternary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_exec_plan_visit_ternary_node(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool) { + /* Assign timestamp and propagate info for n_args/max_tmp_size */ + int use_r2_as_output = ((cur_node->info.t_op.flags & LIBXSMM_MELTW_FLAG_TERNARY_REUSE_IN_2_AS_OUT) > 0) ? 1 : 0; + cur_node->visit_timestamp = *global_timestamp; + *global_timestamp = *global_timestamp + 1; + cur_node->n_args = cur_node->le->n_args + cur_node->ri->n_args + cur_node->r2->n_args; + cur_node->max_tmp_size = LIBXSMM_MAX( LIBXSMM_MAX(cur_node->le->max_tmp_size, cur_node->ri->max_tmp_size), cur_node->r2->max_tmp_size); + if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + cur_node->tree_max_comp_tsize = LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ); + } else if ( (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + if (use_r2_as_output > 0 ) { + cur_node->tmp.id = cur_node->r2->tmp.id; + tmp_storage_pool[cur_node->le->tmp.id] = 0; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + 
tmp_storage_pool[cur_node->le->tmp.id] = 0; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + tmp_storage_pool[cur_node->r2->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( cur_node->r2->tree_max_comp_tsize, LIBXSMM_MAX( cur_node->ri->tree_max_comp_tsize, cur_node->le->tree_max_comp_tsize ))); + } else if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + if (use_r2_as_output > 0 ) { + cur_node->tmp.id = cur_node->r2->tmp.id; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + tmp_storage_pool[cur_node->r2->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( cur_node->r2->tree_max_comp_tsize, LIBXSMM_MAX( cur_node->ri->tree_max_comp_tsize, 1 ))); + } else if ( (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + if (use_r2_as_output > 0 ) { + cur_node->tmp.id = cur_node->r2->tmp.id; + tmp_storage_pool[cur_node->le->tmp.id] = 0; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->le->tmp.id] = 0; + tmp_storage_pool[cur_node->r2->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( cur_node->r2->tree_max_comp_tsize, LIBXSMM_MAX( 1, cur_node->le->tree_max_comp_tsize ))); + } else if ( (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + 
tmp_storage_pool[cur_node->le->tmp.id] = 0; + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( 1, LIBXSMM_MAX( cur_node->ri->tree_max_comp_tsize, cur_node->le->tree_max_comp_tsize ))); + } else if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + if (use_r2_as_output > 0 ) { + cur_node->tmp.id = cur_node->r2->tmp.id; + } else { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->r2->tmp.id] = 0; + } + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( cur_node->r2->tree_max_comp_tsize, LIBXSMM_MAX( 1, 1 ))); + } else if ( (cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->le->tmp.id] = 0; + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( 1, LIBXSMM_MAX( 1, cur_node->le->tree_max_comp_tsize ))); + } else if ( (cur_node->le->type == LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (cur_node->r2->type == LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->tmp.id = reserve_tmp_storage( n_max_tmp, tmp_storage_pool ); + tmp_storage_pool[cur_node->ri->tmp.id] = 0; + cur_node->tree_max_comp_tsize = LIBXSMM_MAX( LIBXSMM_TYPESIZE( cur_node->info.t_op.dtype ), LIBXSMM_MAX( 1, LIBXSMM_MAX( cur_node->ri->tree_max_comp_tsize, 1))); + } + libxsmm_matrix_eqn_exec_plan_configure_ternary_tmp( cur_node ); +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_reassign_children_bcast_tmp(libxsmm_matrix_eqn *eqn, libxsmm_matrix_eqn_elem* cur_node) { + if ( cur_node->type 
== LIBXSMM_MATRIX_EQN_NODE_ARG ) { + /* Do nothing */ + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + if ((cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_unary(cur_node->info.u_op.flags) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + cur_node->le->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->le); + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + if ((cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_binary(cur_node->info.b_op.flags, LEFT) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + cur_node->le->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + if ((cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_binary(cur_node->info.b_op.flags, RIGHT) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + cur_node->ri->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->le); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->ri); + } else if( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + if ((cur_node->le->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_ternary(cur_node->info.t_op.flags, LEFT) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + cur_node->le->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + if ((cur_node->ri->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_ternary(cur_node->info.t_op.flags, RIGHT) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + cur_node->ri->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + if ((cur_node->r2->type != LIBXSMM_MATRIX_EQN_NODE_ARG) && (get_bcast_type_ternary(cur_node->info.t_op.flags, RIGHT2) != LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE)) { + 
cur_node->r2->tmp.id = eqn->eqn_root->reg_score; + eqn->eqn_root->reg_score = eqn->eqn_root->reg_score + 1; + } + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->le); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->ri); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, cur_node->r2); + } else { + /* This should not happen */ + } +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_reassign_bcast_tmp(libxsmm_matrix_eqn *eqn) { + libxsmm_matrix_eqn_elem* root = eqn->eqn_root; + if ( root->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->le); + } + if ( root->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->le); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->ri); + } + if ( root->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->le); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->ri); + libxsmm_matrix_eqn_reassign_children_bcast_tmp(eqn, root->r2); + } +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_create_exec_plan( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool ) { + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + libxsmm_matrix_eqn_exec_plan_visit_arg_node(cur_node); + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* First visit left child tree */ + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_exec_plan_visit_unary_node(cur_node, global_timestamp, n_max_tmp, tmp_storage_pool); + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + /* First we visit the child tree with the maximum register score */ + if (cur_node->le->reg_score >= cur_node->ri->reg_score) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, 
tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + } else { + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + } + libxsmm_matrix_eqn_exec_plan_visit_binary_node(cur_node, global_timestamp, n_max_tmp, tmp_storage_pool); + } else if( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + if ((cur_node->le->reg_score >= cur_node->ri->reg_score) && (cur_node->le->reg_score >= cur_node->r2->reg_score) ) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + if ( cur_node->ri->reg_score >= cur_node->r2->reg_score ) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->r2, global_timestamp, n_max_tmp, tmp_storage_pool ); + } else { + libxsmm_matrix_eqn_create_exec_plan( cur_node->r2, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + } + } else if ((cur_node->ri->reg_score >= cur_node->le->reg_score) && (cur_node->ri->reg_score >= cur_node->r2->reg_score) ) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + if ( cur_node->le->reg_score >= cur_node->r2->reg_score ) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->r2, global_timestamp, n_max_tmp, tmp_storage_pool ); + } else { + libxsmm_matrix_eqn_create_exec_plan( cur_node->r2, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + } + } else { + libxsmm_matrix_eqn_create_exec_plan( cur_node->r2, 
global_timestamp, n_max_tmp, tmp_storage_pool ); + if ( cur_node->le->reg_score >= cur_node->ri->reg_score ) { + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + } else { + libxsmm_matrix_eqn_create_exec_plan( cur_node->ri, global_timestamp, n_max_tmp, tmp_storage_pool ); + libxsmm_matrix_eqn_create_exec_plan( cur_node->le, global_timestamp, n_max_tmp, tmp_storage_pool ); + } + } + libxsmm_matrix_eqn_exec_plan_visit_ternary_node(cur_node, global_timestamp, n_max_tmp, tmp_storage_pool); + } else { + /* This should not happen */ + } +} + +LIBXSMM_API_INTERN +int is_unary_opcode_reduce_kernel (unsigned int opcode) { + int result = 0; + if ((opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_ADD) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_MAX) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_MUL) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X2_OP_ADD) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_OP_ADD_NCNC_FORMAT) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_X_X2_OP_ADD)) { + result = 1; + } + return result; +} + +LIBXSMM_API_INTERN +int is_unary_opcode_transform_kernel (unsigned int opcode) { + int result = 0; + if ((opcode == LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNI) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_VNNI_TO_VNNIT) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNIT) || + (opcode == LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_VNNI_PAD)) { + result = 1; + } + return result; +} + +LIBXSMM_API_INTERN +int is_unary_opcode_reduce_to_scalar (unsigned int opcode) { + int result = 0; + if (opcode == LIBXSMM_MELTW_TYPE_UNARY_REDUCE_TO_SCALAR_OP_ADD) { + result = 1; + } + return result; +} + +LIBXSMM_API_INTERN +int is_binary_opcode_reduce_to_scalar (unsigned int opcode) { + int result = 0; + if (opcode 
== LIBXSMM_MELTW_TYPE_BINARY_MUL_AND_REDUCE_TO_SCALAR_OP_ADD) { + result = 1; + } + return result; +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_adjust_tmp_sizes( libxsmm_matrix_eqn_elem* cur_node ) { + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + /* Do nothing */ + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->le ); + /* If it is reduce kernel, have to resize out tmp */ + if ( is_unary_opcode_reduce_kernel(cur_node->info.u_op.type) > 0 ) { + if ((cur_node->info.u_op.flags & LIBXSMM_MELTW_FLAG_UNARY_REDUCE_ROWS) > 0) { + cur_node->tmp.m = cur_node->le->tmp.n; + cur_node->tmp.n = 1; + cur_node->tmp.ld = cur_node->le->tmp.n; + } else if ((cur_node->info.u_op.flags & LIBXSMM_MELTW_FLAG_UNARY_REDUCE_COLS) > 0) { + cur_node->tmp.m = cur_node->le->tmp.m; + cur_node->tmp.n = 1; + cur_node->tmp.ld = cur_node->le->tmp.ld; + } + } else if ( is_unary_opcode_reduce_to_scalar(cur_node->info.u_op.type) > 0 ) { + cur_node->tmp.m = 1; + cur_node->tmp.n = 1; + cur_node->tmp.ld = 1; + } else if ( is_unary_opcode_transform_kernel(cur_node->info.u_op.type) > 0 ) { + cur_node->tmp.m = cur_node->le->tmp.n; + cur_node->tmp.n = cur_node->le->tmp.m; + cur_node->tmp.ld = cur_node->le->tmp.n; + } else { + cur_node->tmp.m = cur_node->le->tmp.m; + cur_node->tmp.n = cur_node->le->tmp.n; + cur_node->tmp.ld = cur_node->le->tmp.ld; + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->le); + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->ri); + if ( is_binary_opcode_reduce_to_scalar(cur_node->info.b_op.type) > 0 ) { + cur_node->tmp.m = 1; + cur_node->tmp.n = 1; + cur_node->tmp.ld = 1; + } else { + cur_node->tmp.m = LIBXSMM_MAX(cur_node->le->tmp.m, cur_node->ri->tmp.m); + cur_node->tmp.n = LIBXSMM_MAX(cur_node->le->tmp.n, cur_node->ri->tmp.n); + cur_node->tmp.ld = LIBXSMM_MAX(cur_node->le->tmp.ld, cur_node->ri->tmp.ld); + } + } else if ( cur_node->type 
== LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->le ); + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->ri); + libxsmm_matrix_eqn_adjust_tmp_sizes( cur_node->r2); + cur_node->tmp.m = LIBXSMM_MAX(cur_node->r2->tmp.m, LIBXSMM_MAX(cur_node->le->tmp.m, cur_node->ri->tmp.m)); + cur_node->tmp.n = LIBXSMM_MAX(cur_node->r2->tmp.n, LIBXSMM_MAX(cur_node->le->tmp.n, cur_node->ri->tmp.n)); + cur_node->tmp.ld = LIBXSMM_MAX( cur_node->r2->tmp.ld, LIBXSMM_MAX(cur_node->le->tmp.ld, cur_node->ri->tmp.ld)); + } +} + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_opt_exec_plan( libxsmm_blasint idx ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_opt_exec_plan( libxsmm_blasint idx ) { + libxsmm_blasint global_timestamp = 0; + libxsmm_blasint max_reg_score = 0; + libxsmm_blasint *tmp_storage_pool = NULL; + libxsmm_blasint i; + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist, nothing to optimize!\n" ); + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 0 ) { + fprintf( stderr, "the requested equation is not yet finalized, so can't optimize!\n" ); + } +#if 0 + printf("\n"); + printf("Assigning register scores to find optimal traversal plan (i.e. that minimizes tmp storage)... 
\n"); +#endif + libxsmm_matrix_eqn_assign_reg_scores( libxsmm_matrix_eqns[idx]->eqn_root ); + max_reg_score = libxsmm_matrix_eqns[idx]->eqn_root->reg_score; + tmp_storage_pool = (libxsmm_blasint*) malloc(max_reg_score * sizeof(libxsmm_blasint)); + if (tmp_storage_pool == NULL) { + fprintf( stderr, "Tmp storage allocation array failed...\n" ); + return; + } else { + for (i = 0; i < max_reg_score; i++) { + tmp_storage_pool[i] = 0; + } + } +#if 0 + printf("Optimal number of intermediate tmp storage is %d\n", max_reg_score); +#endif + libxsmm_matrix_eqn_create_exec_plan( libxsmm_matrix_eqns[idx]->eqn_root, &global_timestamp, max_reg_score, tmp_storage_pool ); + libxsmm_matrix_eqn_adjust_tmp_sizes( libxsmm_matrix_eqns[idx]->eqn_root ); + libxsmm_matrix_eqn_reassign_bcast_tmp( libxsmm_matrix_eqns[idx] ); +#if 0 + printf("Created optimal exexution plan...\n"); +#endif + if (tmp_storage_pool != NULL) { + free(tmp_storage_pool); + } +#if 0 + printf("\n\n"); +#endif + libxsmm_matrix_eqns[idx]->is_optimized = 1; +} + +LIBXSMM_API_INTERN +void libxsmm_generator_reoptimize_eqn(libxsmm_matrix_eqn *eqn) { + libxsmm_blasint max_reg_score = 0, global_timestamp = 0, i = 0; + libxsmm_blasint *tmp_storage_pool = NULL; + libxsmm_matrix_eqn_assign_reg_scores( eqn->eqn_root ); + max_reg_score = eqn->eqn_root->reg_score; + tmp_storage_pool = (libxsmm_blasint*) malloc(max_reg_score * sizeof(libxsmm_blasint)); + if (tmp_storage_pool == NULL) { + fprintf( stderr, "Tmp storage allocation array failed...\n" ); + return; + } else { + for (i = 0; i < max_reg_score; i++) { + tmp_storage_pool[i] = 0; + } + } + libxsmm_matrix_eqn_create_exec_plan( eqn->eqn_root, &global_timestamp, max_reg_score, tmp_storage_pool ); + libxsmm_matrix_eqn_adjust_tmp_sizes( eqn->eqn_root ); + if (tmp_storage_pool != NULL) { + free(tmp_storage_pool); + } +} + +LIBXSMM_API_INTERN libxsmm_matrix_eqn_elem* libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqn_elem* cur_node, libxsmm_matrix_eqn_node_type type, 
libxsmm_matrix_eqn_info info ); +LIBXSMM_API_INTERN libxsmm_matrix_eqn_elem* libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqn_elem* cur_node, libxsmm_matrix_eqn_node_type type, libxsmm_matrix_eqn_info info ) { + if ( type == LIBXSMM_MATRIX_EQN_NODE_NONE ) { + /* shouldn't happen */ + fprintf( stderr, "wrong op node type to add!\n"); + } + + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + libxsmm_matrix_eqn_elem *node = (libxsmm_matrix_eqn_elem*) malloc( sizeof(libxsmm_matrix_eqn_elem) ); + + node->le = NULL; + node->ri = NULL; + node->r2 = NULL; + node->up = cur_node; + node->type = type; + node->info = info; + + if ( cur_node->le == NULL ) { + cur_node->le = node; + } else { + /* shouldn't happen */ + fprintf( stderr, "this is not a leaf node, so we cannot add a node!\n"); + free( node ); + node = NULL; + } + + return node; + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + libxsmm_matrix_eqn_elem *node = (libxsmm_matrix_eqn_elem*) malloc( sizeof(libxsmm_matrix_eqn_elem) ); + + node->le = NULL; + node->ri = NULL; + node->r2 = NULL; + node->up = cur_node; + node->type = type; + node->info = info; + + if ( cur_node->le == NULL ) { + cur_node->le = node; + } else if ( cur_node->ri == NULL ) { + cur_node->ri = node; + } else { + /* shouldn't happen */ + fprintf( stderr, "this is not a leaf node, so we cannot add a node!\n"); + free( node ); + node = NULL; + } + + return node; + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + libxsmm_matrix_eqn_elem *node = (libxsmm_matrix_eqn_elem*) malloc( sizeof(libxsmm_matrix_eqn_elem) ); + + node->le = NULL; + node->ri = NULL; + node->r2 = NULL; + node->up = cur_node; + node->type = type; + node->info = info; + + if ( cur_node->le == NULL ) { + cur_node->le = node; + } else if ( cur_node->ri == NULL ) { + cur_node->ri = node; + } else if ( cur_node->r2 == NULL ) { + cur_node->r2 = node; + } else { + /* shouldn't happen */ + fprintf( stderr, "this is not a leaf node, so we cannot add 
a node!\n"); + free( node ); + node = NULL; + } + + return node; + /* we converting the root */ + } else if ( (cur_node->type == LIBXSMM_MATRIX_EQN_NODE_NONE) && (type != LIBXSMM_MATRIX_EQN_NODE_ARG) ) { + cur_node->le = NULL; + cur_node->ri = NULL; + cur_node->r2 = NULL; + cur_node->up = NULL; + cur_node->type = type; + cur_node->info = info; + + return cur_node; + } else { + /* shouldn't happen */ + fprintf( stderr, "at this position we cannot add an op!\n"); + } + + return NULL; +} + + +LIBXSMM_API_INTERN libxsmm_matrix_eqn_elem* libxsmm_matrix_eqn_trv_head( libxsmm_matrix_eqn_elem* cur_node ); +LIBXSMM_API_INTERN libxsmm_matrix_eqn_elem* libxsmm_matrix_eqn_trv_head( libxsmm_matrix_eqn_elem* cur_node ) { + /* check if we are at an argument leaf, then we move up */ + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + return libxsmm_matrix_eqn_trv_head( cur_node->up ); + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* we have to push more in this branch */ + if ( cur_node->le == NULL ) { + return cur_node; + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( cur_node->up == NULL ) { + return cur_node; + /* we have to find another node */ + } else { + return libxsmm_matrix_eqn_trv_head( cur_node->up ); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + /* we have to push more in this branch */ + if ( cur_node->le == NULL ) { + return cur_node; + } else if ( cur_node->ri == NULL ) { + return cur_node; + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( cur_node->up == NULL ) { + return cur_node; + /* we have to find another node */ + } else { + return libxsmm_matrix_eqn_trv_head( cur_node->up ); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + /* we have to push more in this branch */ + if ( cur_node->le == NULL ) { + return cur_node; + } else if ( cur_node->ri == NULL ) { + return cur_node; + } else if ( cur_node->r2 
== NULL ) { + return cur_node; + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( cur_node->up == NULL ) { + return cur_node; + /* we have to find another node */ + } else { + return libxsmm_matrix_eqn_trv_head( cur_node->up ); + } + } else { + /* should not happen */ + } + + return NULL; +} + + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_print( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint indent ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_print( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint indent ) { + libxsmm_blasint i; + libxsmm_blasint tree_print_indent = 4; + + for ( i = 0; i < indent; ++i ) { + if ( i < indent - tree_print_indent ) { + printf(" "); + } else { + if ( i % tree_print_indent == 0 ) { + printf("|"); + } else { + printf("-"); + } + } + } + + /* check if we are at an argument leaf, then we move up */ + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + if ( (cur_node->le == NULL) && (cur_node->ri == NULL) ) { + printf("ARG: %i %i %i %i %i\n", cur_node->info.arg.m, cur_node->info.arg.n, cur_node->info.arg.ld, cur_node->info.arg.in_pos, cur_node->info.arg.offs_in_pos ); + } else { + printf("ERROR: Arg cannot have left or right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* we have to push more in this branch */ + if ( cur_node->le != NULL ) { + printf("UNARY: %i %i (timestamp = %i, tmp = %i)\n", (int)cur_node->info.u_op.type, (int)cur_node->info.u_op.flags, cur_node->visit_timestamp, cur_node->tmp.id ); + libxsmm_matrix_eqn_trv_print( cur_node->le, indent+tree_print_indent ); + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( (cur_node->ri != NULL) ) { + printf("ERROR: Unary cannot have right childs!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) ) { + printf("BINARY: %i %i 
(timestamp = %i, tmp = %i)\n", (int)cur_node->info.b_op.type, (int)cur_node->info.b_op.flags, cur_node->visit_timestamp, cur_node->tmp.id ); + libxsmm_matrix_eqn_trv_print( cur_node->le, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_print( cur_node->ri, indent+tree_print_indent ); + } else { + printf("ERROR: Binary needs left and right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) && (cur_node->r2 != NULL) ) { + printf("TERNARY: %i %i (timestamp = %i, tmp = %i)\n", (int)cur_node->info.t_op.type, (int)cur_node->info.t_op.flags, cur_node->visit_timestamp, cur_node->tmp.id ); + libxsmm_matrix_eqn_trv_print( cur_node->le, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_print( cur_node->ri, indent+tree_print_indent ); + libxsmm_matrix_eqn_trv_print( cur_node->r2, indent+tree_print_indent ); + } else { + printf("ERROR: Ternary needs left, right and right2 child!\n"); + } + } else { + /* shouldn't happen */ + } +} + + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_rpn_print( libxsmm_matrix_eqn_elem* cur_node ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_trv_rpn_print( libxsmm_matrix_eqn_elem* cur_node ) { + /* check if we are at an argument leaf, then we move up */ + if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_ARG ) { + if ( (cur_node->le == NULL) && (cur_node->ri == NULL) ) { + printf("ARG "); + } else { + printf("ERROR: Arg cannot have left or right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_UNARY ) { + /* we have to push more in this branch */ + if ( cur_node->le != NULL ) { + libxsmm_matrix_eqn_trv_rpn_print( cur_node->le ); + printf("UNARY-%i ", (int)cur_node->info.u_op.type ); + /* we have reached the root, as we are unary, there is no right branch */ + } else if ( (cur_node->ri != NULL) ) { + printf("ERROR: Unary cannot have right childs!\n"); + } + } else if ( cur_node->type == 
LIBXSMM_MATRIX_EQN_NODE_BINARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) ) { + libxsmm_matrix_eqn_trv_rpn_print( cur_node->le ); + libxsmm_matrix_eqn_trv_rpn_print( cur_node->ri ); + printf("BINARY-%i ", (int)cur_node->info.b_op.type ); + } else { + printf("ERROR: Binary needs left and right child!\n"); + } + } else if ( cur_node->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY ) { + /* we have to push more in this branch */ + if ( (cur_node->le != NULL) && (cur_node->ri != NULL) && (cur_node->r2 != NULL) ) { + libxsmm_matrix_eqn_trv_rpn_print( cur_node->le ); + libxsmm_matrix_eqn_trv_rpn_print( cur_node->ri ); + libxsmm_matrix_eqn_trv_rpn_print( cur_node->r2 ); + printf("TERNARY-%i ", (int)cur_node->info.t_op.type ); + } else { + printf("ERROR: Ternary needs left, right and right2 child!\n"); + } + } else { + /* shouldn't happen */ + } +} + + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_mov_head( libxsmm_blasint idx ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_mov_head( libxsmm_blasint idx ) { + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 1 ) { + fprintf( stderr, "the requested equation is already finalized!\n" ); + } + + libxsmm_matrix_eqns[idx]->eqn_cur = libxsmm_matrix_eqn_trv_head( libxsmm_matrix_eqns[idx]->eqn_cur ); + +#if 0 + printf("cur node address: %lld\n", libxsmm_matrix_eqns[idx]->eqn_cur ); +#endif + + /* let's see if we need seal the equation */ + if ( (libxsmm_matrix_eqns[idx]->eqn_cur == libxsmm_matrix_eqns[idx]->eqn_root) && + ( ((libxsmm_matrix_eqns[idx]->eqn_cur->type == LIBXSMM_MATRIX_EQN_NODE_UNARY) && (libxsmm_matrix_eqns[idx]->eqn_cur->le != NULL)) || + ((libxsmm_matrix_eqns[idx]->eqn_cur->type == LIBXSMM_MATRIX_EQN_NODE_BINARY) && (libxsmm_matrix_eqns[idx]->eqn_cur->ri != NULL)) || + ((libxsmm_matrix_eqns[idx]->eqn_cur->type == LIBXSMM_MATRIX_EQN_NODE_TERNARY) && 
(libxsmm_matrix_eqns[idx]->eqn_cur->r2 != NULL)) ) ) { + libxsmm_matrix_eqns[idx]->is_constructed = 1; + libxsmm_matrix_eqn_opt_exec_plan( idx ); + } +} + + +LIBXSMM_API_INTERN int libxsmm_matrix_eqn_is_ready_for_jit( libxsmm_blasint eqn_idx ) { + if ( libxsmm_matrix_eqns[eqn_idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + return 1; + } + if ( libxsmm_matrix_eqns[eqn_idx]->is_constructed == 0 ) { + fprintf( stderr, "the requested equation is not finalized, yet!\n" ); + return 2; + } + if ( libxsmm_matrix_eqns[eqn_idx]->is_optimized == 0 ) { + fprintf( stderr, "the requested equation is not optimized, yet!\n" ); + return 2; + } + + return 0; +} + + +LIBXSMM_API libxsmm_blasint libxsmm_matrix_eqn_create(void) { + libxsmm_blasint ret = libxsmm_matrix_eqns_count; + libxsmm_matrix_eqn_elem* node; + + /* lazy init of helper array */ + if ( libxsmm_matrix_eqns_init == 0 ) { + libxsmm_blasint i; + for ( i = 0; i < 256; ++i ) { + libxsmm_matrix_eqns[i] = NULL; + } + libxsmm_matrix_eqns_count = 0; + libxsmm_matrix_eqns_init = 1; + } + + libxsmm_matrix_eqns_count++; + + libxsmm_matrix_eqns[ret] = (libxsmm_matrix_eqn*) malloc( sizeof(libxsmm_matrix_eqn) ); + + node = (libxsmm_matrix_eqn_elem*) malloc( sizeof(libxsmm_matrix_eqn_elem) ); + + node->le = NULL; + node->ri = NULL; + node->up = NULL; + node->type = LIBXSMM_MATRIX_EQN_NODE_NONE; + + libxsmm_matrix_eqns[ret]->eqn_root = node; + libxsmm_matrix_eqns[ret]->eqn_cur = node; + libxsmm_matrix_eqns[ret]->is_constructed = 0; + libxsmm_matrix_eqns[ret]->is_optimized = 0; + libxsmm_matrix_eqns[ret]->unary_only = 0; + libxsmm_matrix_eqns[ret]->unary_only = 0; +#if 0 + printf("created equation no: %i\n", ret); + printf("root node address: %lld\n", libxsmm_matrix_eqns[ret]->eqn_cur ); +#endif + + return ret; +} + + +LIBXSMM_API int libxsmm_matrix_eqn_push_back_arg( const libxsmm_blasint idx, const libxsmm_blasint m, const libxsmm_blasint n, const libxsmm_blasint ld, const libxsmm_blasint in_pos, 
const libxsmm_blasint offs_in_pos, const libxsmm_datatype dtype ) { + union libxsmm_matrix_eqn_info info; + + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + return 1; + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 1 ) { + fprintf( stderr, "the requested equation is already finalized!\n" ); + return 2; + } + + info.arg.m = m; + info.arg.n = n; + info.arg.ld = ld; + info.arg.in_pos = in_pos; + info.arg.offs_in_pos = offs_in_pos; + info.arg.dtype = dtype; + libxsmm_matrix_eqns[idx]->eqn_cur = libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqns[idx]->eqn_cur, LIBXSMM_MATRIX_EQN_NODE_ARG, info ); +#if 0 + printf("added arg node: %lld %i %i %i %i %i %i\n", libxsmm_matrix_eqns[idx]->eqn_cur, M, N, ld, in_pos, offs_in_pos, dtype ); +#endif + + /* move to the next head position in the tree */ + libxsmm_matrix_eqn_mov_head( idx ); + + return 0; +} + + +LIBXSMM_API int libxsmm_matrix_eqn_push_back_unary_op( const libxsmm_blasint idx, const libxsmm_meltw_unary_type type, const libxsmm_meltw_unary_flags flags, const libxsmm_datatype dtype ) { + union libxsmm_matrix_eqn_info info; + + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + return 1; + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 1 ) { + fprintf( stderr, "the requested equation is already finalized!\n" ); + return 2; + } + + info.u_op.type = type; + info.u_op.flags = flags; + info.u_op.dtype = dtype; + libxsmm_matrix_eqns[idx]->eqn_cur = libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqns[idx]->eqn_cur, LIBXSMM_MATRIX_EQN_NODE_UNARY, info ); +#if 0 + printf("added unary node: %lld %i %i %i\n", libxsmm_matrix_eqns[idx]->eqn_cur, type, flags, dtype ); +#endif + + /* move to the next head position in the tree */ + libxsmm_matrix_eqn_mov_head( idx ); + + return 0; +} + + +LIBXSMM_API int libxsmm_matrix_eqn_push_back_binary_op( const libxsmm_blasint idx, const libxsmm_meltw_binary_type type, const 
libxsmm_meltw_binary_flags flags, const libxsmm_datatype dtype ) { + union libxsmm_matrix_eqn_info info; + + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + return 1; + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 1 ) { + fprintf( stderr, "the requested equation is already finalized!\n" ); + return 2; + } + + info.b_op.type = type; + info.b_op.flags = flags; + info.b_op.dtype = dtype; + libxsmm_matrix_eqns[idx]->eqn_cur = libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqns[idx]->eqn_cur, LIBXSMM_MATRIX_EQN_NODE_BINARY, info ); +#if 0 + printf("added binary node: %lld %i %i %i\n", libxsmm_matrix_eqns[idx]->eqn_cur, type, flags, dtype ); +#endif + + /* move to the next head position in the tree */ + libxsmm_matrix_eqn_mov_head( idx ); + + return 0; +} + + +LIBXSMM_API int libxsmm_matrix_eqn_push_back_ternary_op( const libxsmm_blasint idx, const libxsmm_meltw_ternary_type type, const libxsmm_meltw_ternary_flags flags, const libxsmm_datatype dtype ) { + union libxsmm_matrix_eqn_info info; + + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + return 1; + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 1 ) { + fprintf( stderr, "the requested equation is already finalized!\n" ); + return 2; + } + + info.t_op.type = type; + info.t_op.flags = flags; + info.t_op.dtype = dtype; + libxsmm_matrix_eqns[idx]->eqn_cur = libxsmm_matrix_eqn_add_node( libxsmm_matrix_eqns[idx]->eqn_cur, LIBXSMM_MATRIX_EQN_NODE_TERNARY, info ); +#if 0 + printf("added ternary node: %lld %i %i %i\n", libxsmm_matrix_eqns[idx]->eqn_cur, type, flags, dtype ); +#endif + + /* move to the next head position in the tree */ + libxsmm_matrix_eqn_mov_head( idx ); + + return 0; +} + +LIBXSMM_API void libxsmm_matrix_eqn_tree_print( const libxsmm_blasint idx ) { + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + } + if ( 
libxsmm_matrix_eqns[idx]->is_constructed == 0 ) { + fprintf( stderr, "the requested equation is not yet finalized!\n" ); + } + + printf("\n"); + printf("Schematic of the expression tree (Pre-order)\n"); + libxsmm_matrix_eqn_trv_print( libxsmm_matrix_eqns[idx]->eqn_root, 0 ); + printf("\n"); +} + + +LIBXSMM_API void libxsmm_matrix_eqn_rpn_print( const libxsmm_blasint idx ) { + if ( libxsmm_matrix_eqns[idx] == NULL ) { + fprintf( stderr, "the requested equation doesn't exist!\n" ); + } + if ( libxsmm_matrix_eqns[idx]->is_constructed == 0 ) { + fprintf( stderr, "the requested equation is not yet finalized!\n" ); + } + + printf("\n"); + printf("HP calculator (RPN) print of the expression tree (Post-order)\n"); + libxsmm_matrix_eqn_trv_rpn_print( libxsmm_matrix_eqns[idx]->eqn_root ); + printf("\n\n"); +} + + diff --git a/third_party/libxsmm/src/libxsmm_matrixeqn.h b/third_party/libxsmm/src/libxsmm_matrixeqn.h new file mode 100644 index 00000000..47d2243b --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_matrixeqn.h @@ -0,0 +1,148 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_MATRIXEQN_H +#define LIBXSMM_MATRIXEQN_H + +#define LEFT 0 +#define RIGHT 1 +#define RIGHT2 2 + +#include +/** + * TF includes src/libxsmm_main.h and uses LIBXSMM's sync primitives + * without including libxsmm_sync. However, libxsmm_sync.h shall be + * an explicit include separate from including libxsmm.h. 
+ */ +#include "libxsmm_sync.h" + +LIBXSMM_EXTERN_C typedef enum libxsmm_matrix_eqn_node_type { + LIBXSMM_MATRIX_EQN_NODE_NONE = 0, + LIBXSMM_MATRIX_EQN_NODE_UNARY = 1, + LIBXSMM_MATRIX_EQN_NODE_BINARY = 2, + LIBXSMM_MATRIX_EQN_NODE_TERNARY = 4, + LIBXSMM_MATRIX_EQN_NODE_ARG = 8 +} libxsmm_matrix_eqn_node_type; + +LIBXSMM_EXTERN_C typedef enum libxsmm_matrix_eqn_bcast_type { + LIBXSMM_MATRIX_EQN_BCAST_TYPE_NONE = 0, + LIBXSMM_MATRIX_EQN_BCAST_TYPE_ROW = 1, + LIBXSMM_MATRIX_EQN_BCAST_TYPE_COL = 2, + LIBXSMM_MATRIX_EQN_BCAST_TYPE_SCALAR = 4 +} libxsmm_matrix_eqn_bcast_type; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_unary_op { + libxsmm_meltw_unary_type type; + libxsmm_meltw_unary_flags flags; + libxsmm_datatype dtype; +} libxsmm_matrix_eqn_unary_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_binary_op { + libxsmm_meltw_binary_type type; + libxsmm_meltw_binary_flags flags; + libxsmm_datatype dtype; +} libxsmm_matrix_eqn_binary_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_ternary_op { + libxsmm_meltw_ternary_type type; + libxsmm_meltw_ternary_flags flags; + libxsmm_datatype dtype; +} libxsmm_matrix_eqn_ternary_op; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_arg { + libxsmm_blasint m; + libxsmm_blasint n; + libxsmm_blasint ld; + libxsmm_blasint in_pos; + libxsmm_blasint offs_in_pos; + libxsmm_datatype dtype; + libxsmm_matrix_eqn_bcast_type bcast_type; +} libxsmm_matrix_eqn_arg; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_tmp_info { + libxsmm_blasint id; + libxsmm_blasint m; + libxsmm_blasint n; + libxsmm_blasint ld; + libxsmm_datatype dtype; + libxsmm_matrix_eqn_bcast_type bcast_type; + libxsmm_blasint m_s; + libxsmm_blasint n_s; + libxsmm_blasint ld_s; + libxsmm_datatype dtype_s; + libxsmm_matrix_eqn_bcast_type 
bcast_type_s; + libxsmm_blasint m_t; + libxsmm_blasint n_t; + libxsmm_blasint ld_t; + libxsmm_datatype dtype_t; + libxsmm_matrix_eqn_bcast_type bcast_type_t; +} libxsmm_matrix_eqn_tmp_info; + +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE libxsmm_matrix_eqn_info { + libxsmm_matrix_eqn_unary_op u_op; + libxsmm_matrix_eqn_binary_op b_op; + libxsmm_matrix_eqn_ternary_op t_op; + libxsmm_matrix_eqn_arg arg; +} libxsmm_matrix_eqn_info; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn_elem { + struct libxsmm_matrix_eqn_elem* le; + struct libxsmm_matrix_eqn_elem* ri; + struct libxsmm_matrix_eqn_elem* r2; + struct libxsmm_matrix_eqn_elem* up; + libxsmm_matrix_eqn_node_type type; + libxsmm_matrix_eqn_info info; + libxsmm_blasint reg_score; + libxsmm_blasint visit_timestamp; + libxsmm_matrix_eqn_tmp_info tmp; + libxsmm_blasint max_tmp_size; + libxsmm_blasint n_args; + libxsmm_blasint tree_max_comp_tsize; +} libxsmm_matrix_eqn_elem; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE LIBXSMM_MAY_ALIAS libxsmm_matrix_eqn { + libxsmm_matrix_eqn_elem* eqn_root; + libxsmm_matrix_eqn_elem* eqn_cur; + libxsmm_blasint is_constructed; + libxsmm_blasint is_optimized; + libxsmm_blasint unary_only; + libxsmm_blasint binary_only; +} libxsmm_matrix_eqn; + +/* Helper functions for matrix equation handling */ +LIBXSMM_API_INTERN libxsmm_matrix_eqn* libxsmm_matrix_eqn_get_equation( libxsmm_blasint eqn_idx ); +LIBXSMM_API_INTERN int libxsmm_matrix_eqn_is_ready_for_jit( libxsmm_blasint eqn_idx ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_assign_reg_scores( libxsmm_matrix_eqn_elem* cur_node ); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_create_exec_plan( libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *global_timestamp, libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool ); +LIBXSMM_API_INTERN libxsmm_blasint reserve_tmp_storage(libxsmm_blasint n_max_tmp, libxsmm_blasint *tmp_storage_pool); +LIBXSMM_API_INTERN void 
libxsmm_generator_assign_new_timestamp(libxsmm_matrix_eqn_elem* cur_node, libxsmm_blasint *current_timestamp ); +LIBXSMM_API_INTERN void libxsmm_generator_matequation_assign_timestamps(libxsmm_matrix_eqn *eqn); +LIBXSMM_API_INTERN void libxsmm_generator_reoptimize_eqn(libxsmm_matrix_eqn *eqn); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_adjust_tmp_sizes( libxsmm_matrix_eqn_elem* cur_node ); +LIBXSMM_API_INTERN int is_unary_opcode_reduce_kernel (unsigned int opcode); +LIBXSMM_API_INTERN int is_unary_opcode_transform_kernel (unsigned int opcode); +LIBXSMM_API_INTERN int is_unary_opcode_reduce_to_scalar (unsigned int opcode); +LIBXSMM_API_INTERN int is_binary_opcode_reduce_to_scalar (unsigned int opcode); + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_unary(libxsmm_meltw_unary_flags flags); + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_binary(libxsmm_meltw_binary_flags flags, unsigned int side); + +LIBXSMM_API_INTERN +libxsmm_matrix_eqn_bcast_type get_bcast_type_ternary(libxsmm_meltw_ternary_flags flags, unsigned int side); + +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_reassign_bcast_tmp(libxsmm_matrix_eqn *eqn); +LIBXSMM_API_INTERN void libxsmm_matrix_eqn_reassign_children_bcast_tmp(libxsmm_matrix_eqn *eqn, libxsmm_matrix_eqn_elem* cur_node); + + +#endif /*LIBXSMM_MATRIXEQN_H*/ + diff --git a/third_party/libxsmm/src/libxsmm_memory.c b/third_party/libxsmm/src/libxsmm_memory.c new file mode 100644 index 00000000..2226bbef --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_memory.c @@ -0,0 +1,593 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include +#include "libxsmm_hash.h" +#include "libxsmm_diff.h" +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(LIBXSMM_MEMORY_STDLIB) && 0 +# define LIBXSMM_MEMORY_STDLIB +#endif +#if !defined(LIBXSMM_MEMORY_SW) && 0 +# define LIBXSMM_MEMORY_SW +#endif + + +#if !defined(LIBXSMM_MEMORY_SW) +LIBXSMM_APIVAR_DEFINE(unsigned char (*internal_diff_function)(const void*, const void*, unsigned char)); +LIBXSMM_APIVAR_DEFINE(int (*internal_memcmp_function)(const void*, const void*, size_t)); +#endif + + +LIBXSMM_API_INLINE +unsigned char internal_diff_sw(const void* a, const void* b, unsigned char size) +{ +#if defined(LIBXSMM_MEMORY_STDLIB) && defined(LIBXSMM_MEMORY_SW) + return (unsigned char)memcmp(a, b, size); +#else + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + unsigned char i; + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xF0); i += 16) { + LIBXSMM_DIFF_16_DECL(aa); + LIBXSMM_DIFF_16_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_16(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) +unsigned char internal_diff_sse(const void* a, const void* b, unsigned char size) +{ +#if defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + unsigned char i; + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xF0); i += 16) { + 
LIBXSMM_DIFF_SSE_DECL(aa); + LIBXSMM_DIFF_SSE_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_SSE(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return internal_diff_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +unsigned char internal_diff_avx2(const void* a, const void* b, unsigned char size) +{ +#if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + unsigned char i; + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xE0); i += 32) { + LIBXSMM_DIFF_AVX2_DECL(aa); + LIBXSMM_DIFF_AVX2_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_AVX2(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return internal_diff_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +unsigned char internal_diff_avx512(const void* a, const void* b, unsigned char size) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + unsigned char i; + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xC0); i += 64) { + LIBXSMM_DIFF_AVX512_DECL(aa); + LIBXSMM_DIFF_AVX512_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_AVX512(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return internal_diff_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INLINE +int internal_memcmp_sw(const void* a, const void* b, size_t size) +{ +#if defined(LIBXSMM_MEMORY_STDLIB) + return memcmp(a, b, size); +#else + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + size_t i; + LIBXSMM_DIFF_16_DECL(aa); + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xFFFFFFFFFFFFFFF0); i += 16) { + LIBXSMM_DIFF_16_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_16(aa, b8 + i, 
0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) +int internal_memcmp_sse(const void* a, const void* b, size_t size) +{ +#if defined(LIBXSMM_INTRINSICS_X86) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + size_t i; + LIBXSMM_DIFF_SSE_DECL(aa); + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xFFFFFFFFFFFFFFF0); i += 16) { + LIBXSMM_DIFF_SSE_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_SSE(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return internal_memcmp_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +int internal_memcmp_avx2(const void* a, const void* b, size_t size) +{ +#if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + size_t i; + LIBXSMM_DIFF_AVX2_DECL(aa); + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xFFFFFFFFFFFFFFE0); i += 32) { + LIBXSMM_DIFF_AVX2_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_AVX2(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return internal_memcmp_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +int internal_memcmp_avx512(const void* a, const void* b, size_t size) +{ +#if defined(LIBXSMM_INTRINSICS_AVX512) && !defined(LIBXSMM_MEMORY_SW) + const uint8_t *const a8 = (const uint8_t*)a, *const b8 = (const uint8_t*)b; + size_t i; + LIBXSMM_DIFF_AVX512_DECL(aa); + LIBXSMM_PRAGMA_UNROLL/*_N(2)*/ + for (i = 0; i < (size & 0xFFFFFFFFFFFFFFC0); i += 64) { + LIBXSMM_DIFF_AVX512_LOAD(aa, a8 + i); + if (LIBXSMM_DIFF_AVX512(aa, b8 + i, 0/*dummy*/)) return 1; + } + for (; i < size; ++i) if (a8[i] ^ b8[i]) return 1; + return 0; +#else + return 
internal_memcmp_sw(a, b, size); +#endif +} + + +LIBXSMM_API_INTERN void libxsmm_memory_init(int target_arch) +{ +#if defined(LIBXSMM_MEMORY_SW) + LIBXSMM_UNUSED(target_arch); +#else + if (LIBXSMM_X86_AVX512 <= target_arch) { +# if defined(LIBXSMM_DIFF_AVX512_ENABLED) + internal_diff_function = internal_diff_avx512; +# else + internal_diff_function = internal_diff_avx2; +# endif +# if defined(LIBXSMM_DIFF_AVX512_ENABLED) + internal_memcmp_function = internal_memcmp_avx512; +# else + internal_memcmp_function = internal_memcmp_avx2; +# endif + } + else if (LIBXSMM_X86_AVX2 <= target_arch) { + internal_diff_function = internal_diff_avx2; + internal_memcmp_function = internal_memcmp_avx2; + } + else if (LIBXSMM_X86_GENERIC <= target_arch) { + internal_diff_function = internal_diff_sse; + internal_memcmp_function = internal_memcmp_sse; + } + else { + internal_diff_function = internal_diff_sw; + internal_memcmp_function = internal_memcmp_sw; + } + LIBXSMM_ASSERT(NULL != internal_diff_function); + LIBXSMM_ASSERT(NULL != internal_memcmp_function); +#endif +} + + +LIBXSMM_API_INTERN void libxsmm_memory_finalize(void) +{ +#if !defined(NDEBUG) && !defined(LIBXSMM_MEMORY_SW) + internal_diff_function = NULL; + internal_memcmp_function = NULL; +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_4(const void* a, const void* b, ...) +{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 4); +#else + LIBXSMM_DIFF_4_DECL(a4); + LIBXSMM_DIFF_4_LOAD(a4, a); + return LIBXSMM_DIFF_4(a4, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_8(const void* a, const void* b, ...) +{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 8); +#else + LIBXSMM_DIFF_8_DECL(a8); + LIBXSMM_DIFF_8_LOAD(a8, a); + return LIBXSMM_DIFF_8(a8, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_16(const void* a, const void* b, ...) 
+{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 16); +#else + LIBXSMM_DIFF_16_DECL(a16); + LIBXSMM_DIFF_16_LOAD(a16, a); + return LIBXSMM_DIFF_16(a16, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_32(const void* a, const void* b, ...) +{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 32); +#else + LIBXSMM_DIFF_32_DECL(a32); + LIBXSMM_DIFF_32_LOAD(a32, a); + return LIBXSMM_DIFF_32(a32, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_48(const void* a, const void* b, ...) +{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 48); +#else + LIBXSMM_DIFF_48_DECL(a48); + LIBXSMM_DIFF_48_LOAD(a48, a); + return LIBXSMM_DIFF_48(a48, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff_64(const void* a, const void* b, ...) +{ +#if defined(LIBXSMM_MEMORY_SW) + return internal_diff_sw(a, b, 64); +#else + LIBXSMM_DIFF_64_DECL(a64); + LIBXSMM_DIFF_64_LOAD(a64, a); + return LIBXSMM_DIFF_64(a64, b, 0/*dummy*/); +#endif +} + + +LIBXSMM_API unsigned char libxsmm_diff(const void* a, const void* b, unsigned char size) +{ +#if defined(LIBXSMM_MEMORY_SW) && !defined(LIBXSMM_MEMORY_STDLIB) + return internal_diff_sw(a, b, size); +#else +# if defined(LIBXSMM_MEMORY_STDLIB) + return 0 != memcmp(a, b, size); +# elif (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_DIFF_AVX512_ENABLED) + return internal_diff_avx512(a, b, size); +# elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_diff_avx2(a, b, size); +# elif (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH) +# if (LIBXSMM_X86_AVX2 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_diff_sse(a, b, size); +# else /* pointer based function call */ +# if defined(LIBXSMM_INIT_COMPLETED) + LIBXSMM_ASSERT(NULL != internal_diff_function); + return internal_diff_function(a, b, size); +# else + return (unsigned char)(NULL != internal_diff_function + ? 
internal_diff_function(a, b, size) + : internal_diff_sse(a, b, size)); +# endif +# endif +# else + return internal_diff_sw(a, b, size); +# endif +#endif +} + + +LIBXSMM_API unsigned int libxsmm_diff_n(const void* a, const void* bn, unsigned char size, + unsigned char stride, unsigned int hint, unsigned int n) +{ + unsigned int result; + LIBXSMM_ASSERT(size <= stride); +#if defined(LIBXSMM_MEMORY_STDLIB) && !defined(LIBXSMM_MEMORY_SW) + LIBXSMM_DIFF_N(unsigned int, result, memcmp, a, bn, size, stride, hint, n); +#else +# if !defined(LIBXSMM_MEMORY_SW) + switch (size) { + case 64: { + LIBXSMM_DIFF_64_DECL(a64); + LIBXSMM_DIFF_64_LOAD(a64, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_64, a64, bn, size, stride, hint, n); + } break; + case 48: { + LIBXSMM_DIFF_48_DECL(a48); + LIBXSMM_DIFF_48_LOAD(a48, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_48, a48, bn, size, stride, hint, n); + } break; + case 32: { + LIBXSMM_DIFF_32_DECL(a32); + LIBXSMM_DIFF_32_LOAD(a32, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_32, a32, bn, size, stride, hint, n); + } break; + case 16: { + LIBXSMM_DIFF_16_DECL(a16); + LIBXSMM_DIFF_16_LOAD(a16, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_16, a16, bn, size, stride, hint, n); + } break; + case 8: { + LIBXSMM_DIFF_8_DECL(a8); + LIBXSMM_DIFF_8_LOAD(a8, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_8, a8, bn, size, stride, hint, n); + } break; + case 4: { + LIBXSMM_DIFF_4_DECL(a4); + LIBXSMM_DIFF_4_LOAD(a4, a); + LIBXSMM_DIFF_N(unsigned int, result, LIBXSMM_DIFF_4, a4, bn, size, stride, hint, n); + } break; + default: +# endif + { + LIBXSMM_DIFF_N(unsigned int, result, libxsmm_diff, a, bn, size, stride, hint, n); + } +# if !defined(LIBXSMM_MEMORY_SW) + } +# endif +#endif + return result; +} + + +LIBXSMM_API int libxsmm_memcmp(const void* a, const void* b, size_t size) +{ +#if defined(LIBXSMM_MEMORY_SW) && !defined(LIBXSMM_MEMORY_STDLIB) + return internal_memcmp_sw(a, b, size); +#else +# if 
defined(LIBXSMM_MEMORY_STDLIB) + return memcmp(a, b, size); +# elif (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_DIFF_AVX512_ENABLED) + return internal_memcmp_avx512(a, b, size); +# elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) + return internal_memcmp_avx2(a, b, size); +# elif (LIBXSMM_X86_SSE3 <= LIBXSMM_STATIC_TARGET_ARCH) +# if (LIBXSMM_X86_AVX2 > LIBXSMM_MAX_STATIC_TARGET_ARCH) + return internal_memcmp_sse(a, b, size); +# else /* pointer based function call */ +# if defined(LIBXSMM_INIT_COMPLETED) + LIBXSMM_ASSERT(NULL != internal_memcmp_function); + return internal_memcmp_function(a, b, size); +# else + return NULL != internal_memcmp_function + ? internal_memcmp_function(a, b, size) + : internal_memcmp_sse(a, b, size); +# endif +# endif +# else + return internal_memcmp_sw(a, b, size); +# endif +#endif +} + + +LIBXSMM_API unsigned int libxsmm_hash(const void* data, unsigned int size, unsigned int seed) +{ + LIBXSMM_INIT + return libxsmm_crc32(seed, data, size); +} + + +LIBXSMM_API unsigned long long libxsmm_hash_string(const char* string) +{ + unsigned long long result; + const size_t length = (NULL != string ? 
strlen(string) : 0); + if (sizeof(result) < length) { + const size_t length2 = length / 2; + unsigned int seed32 = 0; /* seed=0: match else-optimization */ + LIBXSMM_INIT + seed32 = libxsmm_crc32(seed32, string, length2); + result = libxsmm_crc32(seed32, string + length2, length - length2); + result = (result << 32) | seed32; + } + else { /* reinterpret directly as hash value */ +#if 1 + result = (unsigned long long)string; +#else + char *const s = (char*)&result; signed char i; + for (i = 0; i < (signed char)length; ++i) s[i] = string[i]; + for (; i < (signed char)sizeof(result); ++i) s[i] = 0; +#endif + } + return result; +} + + +LIBXSMM_API const char* libxsmm_stristr(const char* a, const char* b) +{ + const char* result = NULL; + if (NULL != a && NULL != b && '\0' != *a && '\0' != *b) { + do { + if (tolower(*a) != tolower(*b)) { + ++a; + } + else { + const char* c = b; + result = a; + while ('\0' != *++a && '\0' != *++c) { + if (tolower(*a) != tolower(*c)) { + result = NULL; + break; + } + } + if ('\0' != c[0] && '\0' != c[1]) { + result = NULL; + } + else break; + } + } while ('\0' != *a); + } + return result; +} + + +LIBXSMM_API int libxsmm_aligned(const void* ptr, const size_t* inc, int* alignment) +{ + const int minalign = 4 * libxsmm_cpuid_vlen32(libxsmm_target_archid); + const uintptr_t address = (uintptr_t)ptr; + int ptr_is_aligned; + LIBXSMM_ASSERT(LIBXSMM_ISPOT(minalign)); + if (NULL == alignment) { + ptr_is_aligned = !LIBXSMM_MOD2(address, (uintptr_t)minalign); + } + else { + const unsigned int nbits = LIBXSMM_INTRINSICS_BITSCANFWD64(address); + *alignment = (32 > nbits ? 
(1 << nbits) : INT_MAX); + ptr_is_aligned = (minalign <= *alignment); + } + return ptr_is_aligned && (NULL == inc || !LIBXSMM_MOD2(*inc, (size_t)minalign)); +} + + +#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xhash)(int* /*hash_seed*/, const void* /*data*/, const int* /*size*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xhash)(int* hash_seed, const void* data, const int* size) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != hash_seed && NULL != data && NULL != size && 0 <= *size) +#endif + { + *hash_seed = (int)(libxsmm_hash(data, (unsigned int)*size, (unsigned int)*hash_seed) & 0x7FFFFFFF/*sign-bit*/); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xhash specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdiff)(int* /*result*/, const void* /*a*/, const void* /*b*/, const long long* /*size*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xdiff)(int* result, const void* a, const void* b, const long long* size) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != result && NULL != a && NULL != b && NULL != size && 0 <= *size) +#endif + { + *result = libxsmm_memcmp(a, b, (size_t)*size); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xdiff specified!\n"); + } +#endif +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xclear)(void* /*dst*/, const int* 
/*size*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_xclear)(void* dst, const int* size) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != dst && NULL != size && 0 <= *size && 128 > *size) +#endif + { + LIBXSMM_MEMSET127(dst, 0, *size); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_xclear specified!\n"); + } +#endif +} + + +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_aligned)(int* /*result*/, const void* /*ptr*/, const int* /*inc*/, int* /*alignment*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_aligned)(int* result, const void* ptr, const int* inc, int* alignment) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != result) +#endif + { + const size_t next = (NULL != inc ? *inc : 0); + *result = libxsmm_aligned(ptr, &next, alignment); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_aligned specified!\n"); + } +#endif +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_mhd.c b/third_party/libxsmm/src/libxsmm_mhd.c new file mode 100644 index 00000000..864e01e2 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_mhd.c @@ -0,0 +1,925 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include +#include "libxsmm_main.h" /* libxsmm_typesize */ + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(LIBXSMM_MHD_MAX_LINELENGTH) +# define LIBXSMM_MHD_MAX_LINELENGTH 1024 +#endif + +#if !defined(LIBXSMM_MHD_MAX_ELEMSIZE) +# define LIBXSMM_MHD_MAX_ELEMSIZE 8 +#endif + +#define LIBXSMM_MHD_MINMAX(TYPE, DATA, NELEMENTS, PMIN_INOUT, PMAX_INOUT) { \ + LIBXSMM_ASSERT(NULL != (PMIN_INOUT) && NULL != (PMAX_INOUT)); \ + if (0 < (NELEMENTS)) { \ + size_t libxsmm_mhd_minmax_index_ = 0; \ + do { \ + TYPE libxsmm_mhd_minmax_value_; \ + LIBXSMM_ASSERT(NULL != (DATA)); \ + libxsmm_mhd_minmax_value_ = ((const TYPE*)DATA)[libxsmm_mhd_minmax_index_]; \ + if (libxsmm_mhd_minmax_value_ < *((const TYPE*)PMIN_INOUT)) { \ + *((TYPE*)PMIN_INOUT) = libxsmm_mhd_minmax_value_; \ + } \ + else if (libxsmm_mhd_minmax_value_ > *((const TYPE*)PMAX_INOUT)) { \ + *((TYPE*)PMAX_INOUT) = libxsmm_mhd_minmax_value_; \ + } \ + ++libxsmm_mhd_minmax_index_; \ + } while (libxsmm_mhd_minmax_index_ < (NELEMENTS)); \ + } \ + else *((TYPE*)PMIN_INOUT) = *((TYPE*)PMAX_INOUT) = 0; \ +} + +#define LIBXSMM_MHD_TYPE_PROMOTE(DST_TYPE, SRC_TYPE) \ + (LIBXSMM_MHD_ELEMTYPE_I64 > (DST_TYPE) || (LIBXSMM_MHD_ELEMTYPE_U64 > (DST_TYPE) \ + ? /*dst is signed*/(LIBXSMM_MHD_ELEMTYPE_U64 > (SRC_TYPE) ? ((SRC_TYPE) > (DST_TYPE)) : 0) \ + : /*dst is unsigned*/(LIBXSMM_MHD_ELEMTYPE_U64 > (SRC_TYPE) ? 
0 : ((SRC_TYPE) > (DST_TYPE))))) + +#define LIBXSMM_MHD_ELEMENT_CONVERSION_F(SRC_TYPE, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ + const double h = (0.5 - (DST_TYPE)0.5); \ + SRC_TYPE s = *((const SRC_TYPE*)PSRC); \ + double s0 = 0, s1 = 0; \ + if (NULL != (PSRC_MIN) && LIBXSMM_NOTNAN(s)) { \ + LIBXSMM_ASSERT_MSG(NULL != (PSRC_MAX) && *((const SRC_TYPE*)PSRC_MIN) <= s && s <= *((const SRC_TYPE*)PSRC_MAX), "Invalid value range"); \ + s0 = (double)*((const SRC_TYPE*)PSRC_MIN); s1 = (double)*((const SRC_TYPE*)PSRC_MAX); \ + } \ + if (LIBXSMM_MHD_ELEMTYPE_I64 <= (DST_ENUM) && s0 < s1) { /* scale */ \ + if (LIBXSMM_MHD_ELEMTYPE_U64 <= (DST_ENUM)) { \ + const double s0pos = LIBXSMM_MAX(0, s0), s1pos = LIBXSMM_MAX(0, s1), scale = (s0pos < s1pos ? ((s1 - s0) / (s1pos - s0pos)) : 1); \ + s = (SRC_TYPE)(scale * (double)LIBXSMM_MAX(0, s)); \ + s0 = s0pos; s1 = s1pos; \ + } \ + else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM) && 0 > s0 && 0 < s1) { \ + s1 = LIBXSMM_MAX(-s0, s1); s0 = -s1; \ + } \ + { const double d0 = (0 <= s0 ? 0 : (DST_MIN)), d1 = (0 <= s1 ? (DST_MAX) : 0), d = ((double)s - s0) * (d1 - d0) / (s1 - s0) + d0; \ + *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(0 <= d ? (d + h) : (d - h), d0, d1); \ + } \ + } \ + else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM)) { /* clamp */ \ + *((DST_TYPE*)PDST) = (DST_TYPE)(0 <= s ? LIBXSMM_CLMP(s + h, DST_MIN, DST_MAX) : LIBXSMM_CLMP(s - h, DST_MIN, DST_MAX)); \ + } \ + else { /* promote */ \ + *((DST_TYPE*)PDST) = (DST_TYPE)(0 <= s ? 
(s + h) : (s - h)); \ + } \ + RESULT = EXIT_SUCCESS; \ +} + +#define LIBXSMM_MHD_ELEMENT_CONVERSION_I(SRC_TYPE, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ + const double h = (0.5 - (DST_TYPE)0.5); \ + SRC_TYPE s = *((const SRC_TYPE*)PSRC); \ + double s0 = 0, s1 = 0; \ + if (NULL != (PSRC_MIN)) { \ + LIBXSMM_ASSERT_MSG(NULL != (PSRC_MAX) && *((const SRC_TYPE*)PSRC_MIN) <= s && s <= *((const SRC_TYPE*)PSRC_MAX), "Invalid value range"); \ + s0 = (double)*((const SRC_TYPE*)PSRC_MIN); s1 = (double)*((const SRC_TYPE*)PSRC_MAX); \ + } \ + if (LIBXSMM_MHD_ELEMTYPE_I64 <= (DST_ENUM) && s0 < s1) { /* scale */ \ + if (LIBXSMM_MHD_ELEMTYPE_U64 <= (DST_ENUM)) { \ + const double s0pos = LIBXSMM_MAX(0, s0), s1pos = LIBXSMM_MAX(0, s1), scale = (s0pos < s1pos ? ((s1 - s0) / (s1pos - s0pos)) : 1); \ + const double ss = scale * (double)LIBXSMM_MAX(0, s); \ + s = (SRC_TYPE)(0 <= ss ? (ss + h) : (ss - h)); \ + s0 = s0pos; s1 = s1pos; \ + } \ + else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM) && 0 > s0 && 0 < s1) { \ + s1 = LIBXSMM_MAX(-s0, s1); s0 = -s1; \ + } \ + { const double d0 = (0 <= s0 ? 0 : (DST_MIN)), d1 = (0 <= s1 ? (DST_MAX) : 0), d = ((double)s - s0) * (d1 - d0) / (s1 - s0) + d0; \ + *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(0 <= d ? 
(d + h) : (d - h), d0, d1); \ + } \ + } \ + else if (0 == LIBXSMM_MHD_TYPE_PROMOTE(DST_ENUM, SRC_ENUM)) { /* clamp */ \ + *((DST_TYPE*)PDST) = (DST_TYPE)LIBXSMM_CLMP(s, DST_MIN, DST_MAX); \ + } \ + else { /* promote */ \ + *((DST_TYPE*)PDST) = (DST_TYPE)s; \ + } \ + RESULT = EXIT_SUCCESS; \ +} + +#define LIBXSMM_MHD_ELEMENT_CONVERSION_U LIBXSMM_MHD_ELEMENT_CONVERSION_I + +#define LIBXSMM_MHD_ELEMENT_CONVERSION(DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT) { \ + LIBXSMM_ASSERT_MSG(NULL != (PDST) && NULL != (PSRC), "Invalid input or output"); \ + switch(SRC_ENUM) { \ + case LIBXSMM_MHD_ELEMTYPE_F64: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_F(double, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_F32: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_F(float, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_BF16: { \ + LIBXSMM_ASSERT_MSG(0, "Not implemented yet"); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_I64: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_I(long long, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_I32: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_I(int, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_I16: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_I(short, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_I8: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_I(signed char, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_U64: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned long long, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, 
RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_U32: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned int, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_U16: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned short, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + case LIBXSMM_MHD_ELEMTYPE_U8: { \ + LIBXSMM_MHD_ELEMENT_CONVERSION_U(unsigned char, DST_TYPE, DST_ENUM, DST_MIN, DST_MAX, PDST, SRC_ENUM, PSRC, PSRC_MIN, PSRC_MAX, RESULT); \ + } break; \ + default: RESULT = EXIT_FAILURE; \ + } \ +} + + +LIBXSMM_API const char* libxsmm_mhd_typename(libxsmm_mhd_elemtype type, size_t* typesize, const char** ctypename) +{ + const char *mhd_typename = NULL, *c_typename = NULL; + size_t size = 0; + switch (type) { + case LIBXSMM_MHD_ELEMTYPE_F64: { size = 8; mhd_typename = "MET_DOUBLE"; c_typename = "double"; } break; + case LIBXSMM_MHD_ELEMTYPE_F32: { size = 4; mhd_typename = "MET_FLOAT"; c_typename = "float"; } break; + case LIBXSMM_MHD_ELEMTYPE_BF16: { size = 2; mhd_typename = "MET_BFLOAT"; c_typename = "unsigned short"; } break; + case LIBXSMM_MHD_ELEMTYPE_I64: { size = 8; mhd_typename = "MET_LONG"; c_typename = "signed long long"; } break; + case LIBXSMM_MHD_ELEMTYPE_I32: { size = 4; mhd_typename = "MET_INT"; c_typename = "signed int"; } break; + case LIBXSMM_MHD_ELEMTYPE_I16: { size = 2; mhd_typename = "MET_SHORT"; c_typename = "signed short"; } break; + case LIBXSMM_MHD_ELEMTYPE_I8: { size = 1; mhd_typename = "MET_CHAR"; c_typename = "signed char"; } break; + case LIBXSMM_MHD_ELEMTYPE_U64: { size = 8; mhd_typename = "MET_ULONG"; c_typename = "unsigned long long"; } break; + case LIBXSMM_MHD_ELEMTYPE_U32: { size = 4; mhd_typename = "MET_UINT"; c_typename = "unsigned int"; } break; + case LIBXSMM_MHD_ELEMTYPE_U16: { size = 2; mhd_typename = "MET_USHORT"; c_typename = "unsigned short"; } break; + case LIBXSMM_MHD_ELEMTYPE_U8: { size 
= 1; mhd_typename = "MET_UCHAR"; c_typename = "unsigned char"; } break; + default: size = libxsmm_typesize((libxsmm_datatype)type); /* fallback */ + } + LIBXSMM_ASSERT(size <= LIBXSMM_MHD_MAX_ELEMSIZE); + if (NULL != ctypename) *ctypename = c_typename; + if (NULL != typesize) *typesize = size; + return mhd_typename; +} + + +LIBXSMM_API libxsmm_mhd_elemtype libxsmm_mhd_typeinfo(const char elemname[]) +{ + libxsmm_mhd_elemtype result = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; + if (0 == strcmp("MET_DOUBLE", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_F64; + } + else if (0 == strcmp("MET_FLOAT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_F32; + } + else if (0 == strcmp("MET_BFLOAT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_BF16; + } + else if (0 == strcmp("MET_LONG", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_I64; + } + else if (0 == strcmp("MET_INT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_I32; + } + else if (0 == strcmp("MET_SHORT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_I16; + } + else if (0 == strcmp("MET_CHAR", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_I8; + } + else if (0 == strcmp("MET_ULONG", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_U64; + } + else if (0 == strcmp("MET_UINT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_U32; + } + else if (0 == strcmp("MET_USHORT", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_U16; + } + else if (0 == strcmp("MET_UCHAR", elemname)) { + result = LIBXSMM_MHD_ELEMTYPE_U8; + } + return result; +} + +LIBXSMM_API_INLINE int internal_mhd_readline(char buffer[], char split, size_t* key_end, size_t* value_begin) +{ + int result = EXIT_SUCCESS; + char *const isplit = strchr(buffer, split); + + if (0 != isplit) { + char* i = isplit; + LIBXSMM_ASSERT(0 != key_end && 0 != value_begin); + while (buffer != i) { --i; if (0 == isspace((int)(*i))) break; } + *key_end = i - buffer + 1; + i = isplit; + while ('\n' != *++i) if (0 == isspace((int)(*i))) break; + *value_begin = i - buffer; + while (0 != *i && 0 != isprint((int)(*i))) 
++i; + if (0 == isprint((int)(*i))) *i = 0; /* fix-up */ + if (i <= (buffer + *value_begin)) { + result = EXIT_FAILURE; + } + } + else { + result = EXIT_FAILURE; + } + + return result; +} + + +LIBXSMM_API int libxsmm_mhd_read_header(const char header_filename[], size_t filename_max_length, + char filename[], size_t* ndims, size_t size[], size_t* ncomponents, libxsmm_mhd_elemtype* type, + size_t* header_size, size_t* extension_size) +{ + int result = EXIT_SUCCESS; + char buffer[LIBXSMM_MHD_MAX_LINELENGTH]; + FILE *const file = (0 < filename_max_length && 0 != filename && 0 != ndims && 0 < *ndims && 0 != size && 0 != type && 0 != ncomponents) + ? fopen(header_filename, "rb") : 0; + + if (0 != file) { + size_t key_end, value_begin; + if (0 != extension_size) *extension_size = 0; + if (0 != header_size) *header_size = 0; + memset(size, 0, *ndims * sizeof(*size)); + *type = LIBXSMM_MHD_ELEMTYPE_UNKNOWN; + *ncomponents = 1; + if (header_filename != filename) { + *filename = 0; + } + + while (0 != fgets(buffer, sizeof(buffer), file) && EXIT_SUCCESS == result && + EXIT_SUCCESS == internal_mhd_readline(buffer, '=', &key_end, &value_begin)) + { + if (0 == strncmp("NDims", buffer, key_end) + && key_end == strlen("NDims")) + { + const int value = atoi(buffer + value_begin); + if (0 < value && value <= ((int)*ndims)) { + *ndims = value; + } + } + else if (0 == strncmp("ElementNumberOfChannels", buffer, key_end) + && key_end == strlen("ElementNumberOfChannels")) + { + const int value = atoi(buffer + value_begin); + if (0 < value) { + *ncomponents = value; + } + else { + result = EXIT_FAILURE; + } + } + else if (0 != extension_size + && 0 == strncmp("ExtensionDataSize", buffer, key_end) + && key_end == strlen("ExtensionDataSize")) + { + const int value = atoi(buffer + value_begin); + if (0 <= value) { + *extension_size = value; + } + else { + result = EXIT_FAILURE; + } + } + else if (0 == strncmp("ElementType", buffer, key_end) + && key_end == strlen("ElementType")) + { + const 
libxsmm_mhd_elemtype value = libxsmm_mhd_typeinfo(buffer + value_begin); + if (LIBXSMM_MHD_ELEMTYPE_UNKNOWN != value) { + *type = value; + } + } + else if (0 == strncmp("ElementDataFile", buffer, key_end) + && key_end == strlen("ElementDataFile")) + { + const char *const value = buffer + value_begin; + if (0 == strcmp("LOCAL", value) || 0 == strcmp(header_filename, value)) { + if (header_size) { + const long file_position = ftell(file); /* determine the header size */ + const size_t len = strlen(header_filename); + if (0 <= file_position && len < filename_max_length) { + memcpy(filename, header_filename, len + 1); + LIBXSMM_ASSERT(0 == filename[len]); + *header_size = ftell(file); + } + else { + result = EXIT_FAILURE; + } + break; /* ElementDataFile is just before the raw data */ + } + } + else { + const size_t len = strlen(value); + if (len < filename_max_length) { + memcpy(filename, value, len + 1); + LIBXSMM_ASSERT(0 == filename[len]); + } + else { + result = EXIT_FAILURE; + } + } + } + else if (0 == strncmp("DimSize", buffer, key_end) + && key_end == strlen("DimSize")) + { + char* value = buffer + value_begin; + size_t *isize = size, n = 0; + while (EXIT_SUCCESS == internal_mhd_readline(value, ' ', &key_end, &value_begin) && n < *ndims) { + const int ivalue = atoi(value); + if (0 < ivalue) { + *isize = ivalue; + } + else { + result = EXIT_FAILURE; + } + value += key_end + 1; + ++isize; + ++n; + } + if (EXIT_SUCCESS == result) { + if (0 != *value && n < *ndims) { + const int ivalue = atoi(value); + if (0 < ivalue) { + *isize = ivalue; + } + else { + result = EXIT_FAILURE; + } + ++n; + } +#if 0 + else { + result = EXIT_FAILURE; + } +#endif + } + } + else if (0 == strncmp("BinaryData", buffer, key_end) + && key_end == strlen("BinaryData")) + { + const char *const value = buffer + value_begin; + if (0 == strcmp("False", value) || 0 != strcmp("True", value)) { + result = EXIT_FAILURE; + } + } + else if (0 == strncmp("CompressedData", buffer, key_end) + && key_end == 
strlen("CompressedData")) + { + const char *const value = buffer + value_begin; + if (0 == strcmp("True", value) || 0 != strcmp("False", value)) { + result = EXIT_FAILURE; + } + } + else if ((0 == strncmp("BinaryDataByteOrderMSB", buffer, key_end) && key_end == strlen("BinaryDataByteOrderMSB")) + || (0 == strncmp("ElementByteOrderMSB", buffer, key_end) && key_end == strlen("ElementByteOrderMSB"))) + { + const char *const value = buffer + value_begin; + if (0 == strcmp("True", value) || 0 != strcmp("False", value)) { + result = EXIT_FAILURE; + } + } + } + + if (EXIT_SUCCESS == result && (0 == *filename || LIBXSMM_MHD_ELEMTYPE_UNKNOWN == *type)) { + result = EXIT_FAILURE; + } + /* check size, and eventually trim dimensionality */ + if (EXIT_SUCCESS == result) { + size_t i, d = 1; + for (i = *ndims; 0 < i; --i) { + if (0 != d && 1 == size[i-1]) { + --*ndims; + } + else if (0 == size[i-1]) { + result = EXIT_FAILURE; + break; + } + else { + d = 0; + } + } + } + /* prefix the path of the header file to make sure that the data file can be found */ + if (EXIT_SUCCESS == result && (0 == header_size || 0 == *header_size)) { + const char* split = header_filename + strlen(header_filename) - 1; + while (header_filename != split && 0 == strchr("/\\", *split)) --split; + if (header_filename < split) { + const size_t len = strlen(filename), n = split - header_filename + 1; + if ((len+ n) <= filename_max_length) { + size_t i; + for (i = 1; i <= len; ++i) { + filename[len + n - i] = filename[len - i]; + } + for (i = 0; i < n; ++i) { + filename[i] = header_filename[i]; + } + } + } + } + /* release file handle */ + if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; + } + else { + result = EXIT_FAILURE; + } + + return result; +} + + +LIBXSMM_API int libxsmm_mhd_element_conversion( + void* dst, libxsmm_mhd_elemtype dst_type, libxsmm_mhd_elemtype src_type, + const void* src, const void* src_min, const void* src_max) +{ + int result = EXIT_SUCCESS; + switch (dst_type) 
{ + case LIBXSMM_MHD_ELEMTYPE_F64: { + LIBXSMM_MHD_ELEMENT_CONVERSION(double, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_F32: { + LIBXSMM_MHD_ELEMENT_CONVERSION(float, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_BF16: { + LIBXSMM_MHD_ELEMENT_CONVERSION(libxsmm_bfloat16, dst_type, -1.0, 1.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_I64: { + LIBXSMM_MHD_ELEMENT_CONVERSION(long long, dst_type, -9223372036854775808.0, 9223372036854775807.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_I32: { + LIBXSMM_MHD_ELEMENT_CONVERSION(int, dst_type, -2147483648.0, 2147483647.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_I16: { + LIBXSMM_MHD_ELEMENT_CONVERSION(short, dst_type, -32768.0, 32767.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_I8: { + LIBXSMM_MHD_ELEMENT_CONVERSION(signed char, dst_type, -128.0, 127.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_U64: { + LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned long long, dst_type, 0.0, 18446744073709551615.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_U32: { + LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned int, dst_type, 0.0, 4294967295.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_U16: { + LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned short, dst_type, 0.0, 65535.0, dst, src_type, src, src_min, src_max, result); + } break; + case LIBXSMM_MHD_ELEMTYPE_U8: { + LIBXSMM_MHD_ELEMENT_CONVERSION(unsigned char, dst_type, 0.0, 255.0, dst, src_type, src, src_min, src_max, result); + } break; + default: result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API int libxsmm_mhd_element_comparison( + void* dst, libxsmm_mhd_elemtype 
dst_type, libxsmm_mhd_elemtype src_type, + const void* src, const void* src_min, const void* src_max) +{ + size_t typesize; + int result; + + if (0 != libxsmm_mhd_typename(src_type, &typesize, NULL/*ctypename*/)) { + if (dst_type == src_type) { /* direct comparison */ + result = libxsmm_diff(src, dst, (unsigned char)typesize); + } + else { /* conversion into source type */ + char element[LIBXSMM_MHD_MAX_ELEMSIZE]; + result = libxsmm_mhd_element_conversion(element, dst_type, src_type, src, src_min, src_max); + if (EXIT_SUCCESS == result) { + result = libxsmm_diff(src, element, (unsigned char)typesize); + } + } + } + else { + result = EXIT_FAILURE; + } + + return result; +} + + +/* coverity[var_deref_op] */ +LIBXSMM_API_INLINE int internal_mhd_minmax(const void* data, size_t nelements, + libxsmm_mhd_elemtype type, const void* minval, const void* maxval) +{ + int result; + if ((NULL != data || 0 == nelements) && NULL != minval && NULL != maxval) { + result = EXIT_SUCCESS; + switch (type) { + case LIBXSMM_MHD_ELEMTYPE_F64: { + LIBXSMM_MHD_MINMAX(double, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_F32: { + LIBXSMM_MHD_MINMAX(float, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_BF16: { + LIBXSMM_MHD_MINMAX(libxsmm_bfloat16, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_I64: { + LIBXSMM_MHD_MINMAX(long long, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_I32: { + LIBXSMM_MHD_MINMAX(int, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_I16: { + LIBXSMM_MHD_MINMAX(short, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_I8: { + LIBXSMM_MHD_MINMAX(signed char, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_U64: { + LIBXSMM_MHD_MINMAX(unsigned long long, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_U32: { + LIBXSMM_MHD_MINMAX(unsigned int, data, nelements, minval, maxval); } break; + 
case LIBXSMM_MHD_ELEMTYPE_U16: { + LIBXSMM_MHD_MINMAX(unsigned short, data, nelements, minval, maxval); } break; + case LIBXSMM_MHD_ELEMTYPE_U8: { + LIBXSMM_MHD_MINMAX(unsigned char, data, nelements, minval, maxval); } break; + default: result = EXIT_FAILURE; + } + } + else { + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API_INLINE int internal_mhd_read(FILE* file, void* data, const size_t size[], const size_t pitch[], + size_t ndims, size_t ncomponents, libxsmm_mhd_elemtype type_stored, libxsmm_mhd_elemtype type_data, + size_t typesize, libxsmm_mhd_element_handler handle_element, int minmax, void* minval, void* maxval) +{ + int result = EXIT_SUCCESS; + size_t typesize_stored; + + LIBXSMM_ASSERT(0 != pitch && 0 != typesize); + if (0 != libxsmm_mhd_typename(type_stored, &typesize_stored, NULL/*ctypename*/)) { + if (1 < ndims) { + if (size[0] <= pitch[0]) { + const size_t d = ndims - 1; + + if (EXIT_SUCCESS == result) { + if (size[d] <= pitch[d]) { + size_t sub_size = ncomponents * typesize * pitch[0], i; + + for (i = 1; i < d; ++i) { + if (size[i] <= pitch[i]) { + sub_size *= pitch[i]; + } + else { + result = EXIT_FAILURE; + break; + } + } + for (i = 0; i < size[d] && EXIT_SUCCESS == result; ++i) { + result = internal_mhd_read(file, data, size, pitch, d, ncomponents, + type_stored, type_data, typesize, handle_element, minmax, minval, maxval); + data = ((char*)data) + sub_size; + } + } + else { + result = EXIT_FAILURE; + } + } + } + else { + result = EXIT_FAILURE; + } + } + else if (1 == ndims) { + if (size[0] <= pitch[0]) { + if (type_stored == type_data && 0 == handle_element) { + if (size[0] != fread(data, ncomponents * typesize_stored, size[0], file)) { + result = EXIT_FAILURE; + } + } + else { /* data-conversion or custom data-handler */ + const libxsmm_mhd_element_handler handler = (0 == minmax ? (0 != handle_element ? 
handle_element : libxsmm_mhd_element_conversion) : NULL); + char element[LIBXSMM_MHD_MAX_ELEMSIZE]; + size_t i, j; + + for (i = 0; i < size[0]; ++i) { + for (j = 0; j < ncomponents; ++j) { + if (EXIT_SUCCESS == result) { + if (1 == fread(element, typesize_stored, 1, file)) { + if (NULL == handler) { /* determine value-range for scaled data-conversion */ + LIBXSMM_ASSERT(0 != minmax); + result = internal_mhd_minmax(element, 1/*n*/, type_stored, minval, maxval); + } + else { /* re-read data incl. conversion */ + LIBXSMM_ASSERT(0 == minmax); + result = handler(data, type_data, type_stored, element, minval, maxval); + data = ((char*)data) + typesize; + } + } + else { + result = EXIT_FAILURE; + } + } + else { + i = size[0]; /* break outer */ + break; + } + } + } + } + } + else { + result = EXIT_FAILURE; + } + } + } + else { + result = EXIT_FAILURE; + } + + return result; +} + + +LIBXSMM_API int libxsmm_mhd_read(const char filename[], + const size_t offset[], const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, + size_t header_size, libxsmm_mhd_elemtype type_stored, const libxsmm_mhd_elemtype* type_data, + void* data, libxsmm_mhd_element_handler handle_element, char extension[], size_t extension_size) +{ + int result = EXIT_SUCCESS; + FILE *const file = (0 != filename && 0 != *filename && + 0 != size && 0 != ndims && 0 != ncomponents && + LIBXSMM_MHD_ELEMTYPE_UNKNOWN != type_stored && + (0 == type_data || LIBXSMM_MHD_ELEMTYPE_UNKNOWN != *type_data) && + 0 != data) + ? fopen(filename, "rb") + : NULL; + + if (0 != file) { + const libxsmm_mhd_elemtype datatype = (type_data ? *type_data : type_stored); + const size_t *const shape = (0 != pitch ? pitch : size); + size_t offset1 = (0 != offset ? 
offset[0] : 0), typesize = 0, i; + + /* check that size is less-equal than pitch */ + if (EXIT_SUCCESS == result) { + for (i = 0; i < ndims; ++i) { + if (size[i] > shape[i]) { + result = EXIT_FAILURE; + break; + } + } + } + /* zeroing buffer if pitch is larger than size */ + if (EXIT_SUCCESS == result) { + if (0 != libxsmm_mhd_typename(datatype, &typesize, NULL/*ctypename*/)) { + size_t size1 = size[0], pitch1 = shape[0]; + for (i = 1; i < ndims; ++i) { + offset1 += (0 != offset ? offset[i] : 0) * pitch1; + pitch1 *= shape[i]; + size1 *= size[i]; + } + LIBXSMM_ASSERT(size1 <= pitch1); + if (size1 != pitch1 && 0 == handle_element) { + memset(data, 0, pitch1 * ncomponents * typesize); + } + } + else { + result = EXIT_FAILURE; + } + } + if (EXIT_SUCCESS == result) { + char *const output = ((char*)data) + offset1 * ncomponents * typesize; + char minmax[2*(LIBXSMM_MHD_MAX_ELEMSIZE)]; + + if (0 != header_size) result = fseek(file, (long)header_size, SEEK_SET); /* set file position to data section */ + if (EXIT_SUCCESS == result && datatype != type_stored) { /* conversion needed */ + if (1 == fread(minmax, typesize, 1, file)) { + LIBXSMM_ASSERT(typesize <= (LIBXSMM_MHD_MAX_ELEMSIZE)); + LIBXSMM_MEMCPY127(minmax + (LIBXSMM_MHD_MAX_ELEMSIZE), minmax, typesize); + result = fseek(file, (long)header_size, SEEK_SET); /* reset file position */ + if (EXIT_SUCCESS == result) { + result = internal_mhd_read(file, NULL/*output*/, size, shape, + ndims, ncomponents, type_stored, datatype, typesize, handle_element, + 1/*search min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); + } + if (EXIT_SUCCESS == result) { + result = fseek(file, (long)header_size, SEEK_SET); /* reset file position */ + } + } + else { + result = EXIT_FAILURE; + } + } + if (EXIT_SUCCESS == result) { + result = internal_mhd_read(file, output, size, shape, + ndims, ncomponents, type_stored, datatype, typesize, handle_element, + 0/*use min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); + } + } + if (0 != 
extension && 0 < extension_size) { + if (extension_size != fread(extension, 1, extension_size, file)) { + result = EXIT_FAILURE; + } + } + /* release file handle */ + if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; + } + else { + result = EXIT_FAILURE; + } + + return result; +} + + +LIBXSMM_API_INLINE int internal_mhd_write(FILE* file, const void* data, const size_t size[], const size_t pitch[], + size_t ndims, size_t ncomponents, libxsmm_mhd_elemtype type_data, libxsmm_mhd_elemtype type, + size_t typesize_data, size_t typesize, int minmax, void* minval, void* maxval) +{ + int result = EXIT_SUCCESS; + + LIBXSMM_ASSERT(0 != pitch); + if (1 < ndims) { + if (size[0] <= pitch[0]) { + const size_t d = ndims - 1; + + if (EXIT_SUCCESS == result) { + if (size[d] <= pitch[d]) { + size_t sub_size = ncomponents * typesize_data * pitch[0], i; + + for (i = 1; i < d; ++i) { + if (size[i] <= pitch[i]) { + sub_size *= pitch[i]; + } + else { + result = EXIT_FAILURE; + break; + } + } + for (i = 0; i < size[d] && EXIT_SUCCESS == result; ++i) { + result = internal_mhd_write(file, data, size, pitch, d, ncomponents, + type_data, type, typesize_data, typesize, minmax, minval, maxval); + data = ((const char*)data) + sub_size; + } + } + else { + result = EXIT_FAILURE; + } + } + } + else { + result = EXIT_FAILURE; + } + } + else if (1 == ndims) { + if (size[0] <= pitch[0]) { + if (type == type_data) { + if (size[0] != fwrite(data, ncomponents * typesize_data, size[0], file)) { + result = EXIT_FAILURE; + } + } + else { /* data-conversion */ + char element[LIBXSMM_MHD_MAX_ELEMSIZE]; + size_t i, j; + + if (0 != minmax) { + /* determine value-range for scaled data-conversion */ + result = internal_mhd_minmax(data, size[0] * ncomponents, type_data, minval, maxval); + } + else { + for (i = 0; i < size[0]; ++i) { + for (j = 0; j < ncomponents; ++j) { + if (EXIT_SUCCESS == result) { + result = libxsmm_mhd_element_conversion(element, type, type_data, data, minval, maxval); + 
if (EXIT_SUCCESS == result) { + if (1 == fwrite(element, typesize, 1, file)) { + data = ((char*)data) + typesize_data; + } + else { + result = EXIT_FAILURE; + } + } + } + else { + i = size[0]; /* break outer */ + break; + } + } + } + } + } + } + else { + result = EXIT_FAILURE; + } + } + + return result; +} + + +LIBXSMM_API int libxsmm_mhd_write(const char filename[], + const size_t offset[], const size_t size[], const size_t pitch[], size_t ndims, size_t ncomponents, + libxsmm_mhd_elemtype type_data, const libxsmm_mhd_elemtype* type, const void* data, size_t* header_size, + const char extension_header[], const void* extension, size_t extension_size) +{ + size_t typesize = 0; + const libxsmm_mhd_elemtype elemtype = (NULL == type ? type_data : *type); + const char *const elemname = libxsmm_mhd_typename(elemtype, &typesize, NULL/*ctypename*/); + FILE *const file = (0 != filename && 0 != *filename && + NULL != size && 0 != ndims && 0 != ncomponents && NULL != data && NULL != elemname && 0 < typesize) + ? 
fopen(filename, "wb") + : NULL; + int result = EXIT_SUCCESS; + + if (0 != file) { + size_t typesize_data = 0, i; + if (0 < fprintf(file, "NDims = %u\nElementNumberOfChannels = %u\nElementByteOrderMSB = False\nDimSize =", + (unsigned int)ndims, (unsigned int)ncomponents)) + { + for (i = 0; i != ndims; ++i) { + if (0 >= fprintf(file, " %u", (unsigned int)size[i])) { + result = EXIT_FAILURE; + break; + } + } + } + else { + result = EXIT_FAILURE; + } + if (EXIT_SUCCESS == result) { + if (0 < fprintf(file, "\nElementSpacing =")) { + for (i = 0; i != ndims; ++i) { + if (0 >= fprintf(file, " 1.0")) { + result = EXIT_FAILURE; + break; + } + } + } + else { + result = EXIT_FAILURE; + } + } + if (EXIT_SUCCESS == result && 0 != extension_header && 0 != *extension_header) { + if (0 >= fprintf(file, "\n%s", extension_header)) { + result = EXIT_FAILURE; + } + } + /* size of the data, which is silently appended after the regular data section */ + if (EXIT_SUCCESS == result && 0 < extension_size) { + if (0 >= fprintf(file, "\nExtensionDataSize = %u", (unsigned int)extension_size)) { + result = EXIT_FAILURE; + } + } + /* source data type is not required to have MHD element name (type-size is needed) */ + if (EXIT_SUCCESS == result) { + libxsmm_mhd_typename(type_data, &typesize_data, NULL/*ctypename*/); + if (0 == typesize_data) result = EXIT_FAILURE; + } + /* ElementDataFile must be the last entry before writing the data */ + if (EXIT_SUCCESS == result && 0 < fprintf(file, "\nElementType = %s\nElementDataFile = LOCAL\n", elemname)) { + const size_t *const shape = (0 != pitch ? pitch : size); + const char *const input = ((const char*)data) + libxsmm_offset(offset, shape, ndims, NULL/*size*/) * ncomponents * typesize_data; + const long file_position = ftell(file); /* determine the header size */ + char minmax[2*(LIBXSMM_MHD_MAX_ELEMSIZE)]; + + result = (0 <= file_position ? 
EXIT_SUCCESS : EXIT_FAILURE); + if (EXIT_SUCCESS == result && type_data != elemtype) { /* conversion needed */ + LIBXSMM_MEMCPY127(minmax, data, typesize_data); + LIBXSMM_MEMCPY127(minmax + (LIBXSMM_MHD_MAX_ELEMSIZE), data, typesize_data); /* initial condition */ + result = internal_mhd_write(file, input, size, shape, ndims, ncomponents, type_data, elemtype, typesize_data, typesize, + 1/*search min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); + } + if (EXIT_SUCCESS == result) { + if (NULL != header_size) *header_size = file_position; + assert(file_position == ftell(file)); /* !LIBXSMM_ASSERT */ + result = internal_mhd_write(file, input, size, shape, ndims, ncomponents, type_data, elemtype, typesize_data, typesize, + 0/*use min-max*/, minmax, minmax + (LIBXSMM_MHD_MAX_ELEMSIZE)); + } + } + /* append the extension data after the regular data section */ + if (EXIT_SUCCESS == result && 0 < extension_size) { + if (extension_size != fwrite(extension, 1, extension_size, file)) { + result = EXIT_FAILURE; + } + } + /* release file handle */ + if (0 != fclose(file) && EXIT_SUCCESS == result) result = EXIT_FAILURE; + } + else { + result = EXIT_FAILURE; + } + + return result; +} + diff --git a/third_party/libxsmm/src/libxsmm_perf.c b/third_party/libxsmm/src/libxsmm_perf.c new file mode 100644 index 00000000..4d7b3d39 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_perf.c @@ -0,0 +1,287 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Maciej Debski (Google Inc.) 
+******************************************************************************/ +#include "libxsmm_perf.h" +#include +#include +#include + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include "perf_jitdump.h" +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) +# include +# include +# include +# include +# include +# include +# include +#endif +#if defined(__linux__) +# include +#endif +#if defined(_WIN32) +# include +# define LIBXSMM_MAX_PATH MAX_PATH +#else +# if defined(__linux__) +# include +# define LIBXSMM_MAX_PATH PATH_MAX +# elif defined(PATH_MAX) +# define LIBXSMM_MAX_PATH PATH_MAX +# else /* fallback */ +# define LIBXSMM_MAX_PATH 1024 +# endif +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(NDEBUG) +# define LIBXSMM_PERF_ERROR(msg) fprintf(stderr, msg) +#else +# define LIBXSMM_PERF_ERROR(msg) +#endif + +#if !defined(PERF_JITDUMP_NOLIBXSMM) +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_MAGIC); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_MAGIC_SWAPPED); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_VERSION); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint64_t JITDUMP_FLAGS_ARCH_TIMESTAMP); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_LOAD); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_MOVE); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_DEBUG_INFO); +LIBXSMM_APIVAR_PRIVATE_DEF(/*const*/ uint32_t JITDUMP_CODE_CLOSE); +#endif + +LIBXSMM_APIVAR_DEFINE(FILE* internal_perf_fp); +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) +LIBXSMM_APIVAR_DEFINE(void* internal_perf_marker); +LIBXSMM_APIVAR_DEFINE(int internal_perf_codeidx); +#endif + + +LIBXSMM_API_INTERN void libxsmm_perf_init(void) +{ + const uint32_t pid = (uint32_t)libxsmm_get_pid(); + char file_name[LIBXSMM_MAX_PATH]; +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) + char 
file_path[LIBXSMM_MAX_PATH]; + int fd, page_size, res; + struct jitdump_file_header header; + char * path_base; + char date[64]; + time_t t = time(NULL); + struct tm tm = *localtime(&t); + + /* initialize global variables */ + JITDUMP_MAGIC = ('J' << 24 | 'i' << 16 | 'T' << 8 | 'D'); + JITDUMP_MAGIC_SWAPPED = ('J' | 'i' << 8 | 'T' << 16 | 'D' << 24); + JITDUMP_VERSION = 1; + JITDUMP_FLAGS_ARCH_TIMESTAMP = 1ULL /*<< 0*/; + JITDUMP_CODE_LOAD = 0; + JITDUMP_CODE_MOVE = 1; + JITDUMP_CODE_DEBUG_INFO = 2; + JITDUMP_CODE_CLOSE = 3; + + path_base = getenv("JITDUMPDIR"); + if (path_base == NULL) { + path_base = getenv("HOME"); + } + if (path_base == NULL) { + path_base = "."; + } + + LIBXSMM_SNPRINTF(file_path, sizeof(file_path), "%s/.debug/", path_base); + res = mkdir(file_path, S_IRWXU); + if (res < 0 && errno != EEXIST) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create .debug dir\n"); + goto error; + } + + LIBXSMM_SNPRINTF(file_path, sizeof(file_path), "%s/.debug/jit", path_base); + res = mkdir(file_path, S_IRWXU); + if (res < 0 && errno != EEXIST) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create .debug/jit dir\n"); + goto error; + } + + strftime(date, sizeof(date), "%Y%m%d", &tm); + + LIBXSMM_SNPRINTF(file_path, sizeof(file_path), + "%s/.debug/jit/libxsmm-jit-%s.XXXXXX", path_base, date); + path_base = mkdtemp(file_path); + if (path_base == NULL) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to create temporary dir\n"); + goto error; + } + + LIBXSMM_SNPRINTF(file_name, sizeof(file_name), "%s/jit-%u.dump", path_base, pid); + + fd = open(file_name, O_CREAT|O_TRUNC|O_RDWR, 0600); + if (fd < 0) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to open file\n"); + goto error; + } + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to get page size\n"); + goto error; + } + internal_perf_marker = mmap(NULL, page_size, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0); + if (internal_perf_marker == MAP_FAILED) { + 
LIBXSMM_PERF_ERROR("LIBXSMM ERROR: mmap failed.\n"); + goto error; + } + + /* initialize code index */ + internal_perf_codeidx = 0; + + internal_perf_fp = fdopen(fd, "wb+"); + if (internal_perf_fp == NULL) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: fdopen failed.\n"); + goto error; + } + + LIBXSMM_MEMZERO127(&header); + header.magic = JITDUMP_MAGIC; + header.version = JITDUMP_VERSION; + header.elf_mach = 62; /* EM_X86_64 */ + header.total_size = sizeof(header); + header.pid = pid; + header.timestamp = libxsmm_timer_tick(); + header.flags = JITDUMP_FLAGS_ARCH_TIMESTAMP; + + res = fwrite(&header, sizeof(header), 1, internal_perf_fp); + if (res != 1) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to write header.\n"); + goto error; + } +#else + LIBXSMM_SNPRINTF(file_name, sizeof(file_name), "/tmp/perf-%u.map", pid); + internal_perf_fp = fopen(file_name, "w+"); + if (internal_perf_fp == NULL) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to open map file\n"); + goto error; + } +#endif + return; +error: + if (internal_perf_fp != NULL) { + fclose(internal_perf_fp); + internal_perf_fp = NULL; + } + assert(0); +} + + +LIBXSMM_API_INTERN void libxsmm_perf_finalize(void) +{ +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) + int res, page_size; + struct jitdump_record_header hdr; + + if (internal_perf_fp == NULL) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: jit dump file not opened\n"); + goto error; + } + + LIBXSMM_MEMZERO127(&hdr); + hdr.id = JITDUMP_CODE_CLOSE; + hdr.total_size = sizeof(hdr); + hdr.timestamp = libxsmm_timer_tick(); + res = fwrite(&hdr, sizeof(hdr), 1, internal_perf_fp); + + if (res != 1) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to write JIT_CODE_CLOSE record\n"); + goto error; + } + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) { + LIBXSMM_PERF_ERROR("LIBXSMM ERROR: failed to get page_size\n"); + goto error; + } + munmap(internal_perf_marker, page_size); + fclose(internal_perf_fp); + return; +error: + assert(0); +#else + 
fclose(internal_perf_fp); +#endif +} + + +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) +/** Utility function to receive the OS-specific thread ID. */ +LIBXSMM_API_INLINE unsigned int internal_perf_get_tid(void) +{ +#if defined(__linux__) + return (unsigned int)syscall(__NR_gettid); +#else /* fallback */ + return libxsmm_get_tid(); +#endif +} +#endif + + +LIBXSMM_API_INTERN void libxsmm_perf_dump_code(const void* memory, size_t size, const char* name) +{ + assert(internal_perf_fp != NULL); + assert(name && *name); + assert(memory != NULL && size != 0); + if (internal_perf_fp != NULL) { +#if defined(LIBXSMM_PERF_JITDUMP) && !defined(_WIN32) + int res; + struct jitdump_record_header hdr; + struct jitdump_record_code_load rec; + size_t name_len = strlen(name) + 1; + + LIBXSMM_MEMZERO127(&hdr); + LIBXSMM_MEMZERO127(&rec); + + hdr.id = JITDUMP_CODE_LOAD; + hdr.total_size = sizeof(hdr) + sizeof(rec) + name_len + size; + hdr.timestamp = libxsmm_timer_tick(); + + rec.code_size = size; + rec.vma = (uintptr_t) memory; + rec.code_addr = (uintptr_t) memory; + rec.pid = (uint32_t) libxsmm_get_pid(); + rec.tid = (uint32_t) internal_perf_get_tid(); + + LIBXSMM_FLOCK(internal_perf_fp); + + /* This will be unique as we hold the file lock. */ + rec.code_index = internal_perf_codeidx++; + + /* Count number of written items to check for errors. 
*/ + res = 0; + res += fwrite_unlocked(&hdr, sizeof(hdr), 1, internal_perf_fp); + res += fwrite_unlocked(&rec, sizeof(rec), 1, internal_perf_fp); + res += fwrite_unlocked(name, name_len, 1, internal_perf_fp); + res += fwrite_unlocked((const void*) memory, size, 1, internal_perf_fp); + + LIBXSMM_FUNLOCK(internal_perf_fp); + fflush(internal_perf_fp); + + assert(res == 4); /* Expected 4 items written above */ +#else + fprintf(internal_perf_fp, "%" PRIxPTR " %lx %s\n", (uintptr_t)memory, (unsigned long)size, name); + fflush(internal_perf_fp); +#endif + } +} + diff --git a/third_party/libxsmm/src/libxsmm_perf.h b/third_party/libxsmm/src/libxsmm_perf.h new file mode 100644 index 00000000..66029c64 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_perf.h @@ -0,0 +1,23 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Maciej Debski (Google Inc.) +******************************************************************************/ +#ifndef LIBXSMM_PERF_H +#define LIBXSMM_PERF_H + +#include + + +LIBXSMM_API_INTERN void libxsmm_perf_init(void); +LIBXSMM_API_INTERN void libxsmm_perf_finalize(void); +LIBXSMM_API_INTERN void libxsmm_perf_dump_code( + const void* memory, size_t size, + const char* name); + +#endif /* LIBXSMM_PERF_H */ diff --git a/third_party/libxsmm/src/libxsmm_python.c b/third_party/libxsmm/src/libxsmm_python.c new file mode 100644 index 00000000..e7da7cbc --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_python.c @@ -0,0 +1,142 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#if defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC) +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include /* must be included first */ +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif +#endif +#include + + +#if defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC) + +LIBXSMM_API PyObject* libxsmmpy_get_target_arch(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* libxsmmpy_get_target_arch(PyObject* self, PyObject* args) +{ + LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); + return PyString_InternFromString(libxsmm_get_target_arch()); +} + +LIBXSMM_API PyObject* libxsmmpy_set_target_arch(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* libxsmmpy_set_target_arch(PyObject* self, PyObject* args) +{ + int ivalue = LIBXSMM_TARGET_ARCH_UNKNOWN; + char* svalue = NULL; + LIBXSMM_UNUSED(self); + if (0 != PyArg_ParseTuple(args, "s", &svalue)) { + libxsmm_set_target_arch(svalue); + } + else if (0 != PyArg_ParseTuple(args, "i", &ivalue)) { + libxsmm_set_target_archid(ivalue); + } + else { /* error */ + return NULL; + } + Py_RETURN_NONE; +} + + +LIBXSMM_API PyObject* libxsmmpy_get_target_archid(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* libxsmmpy_get_target_archid(PyObject* self, PyObject* args) +{ + LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); + return Py_BuildValue("i", libxsmm_get_target_archid()); +} + +LIBXSMM_API PyObject* libxsmmpy_set_target_archid(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* 
libxsmmpy_set_target_archid(PyObject* self, PyObject* args) +{ + int value = LIBXSMM_TARGET_ARCH_UNKNOWN; + LIBXSMM_UNUSED(self); + if (0 != PyArg_ParseTuple(args, "i", &value)) { + libxsmm_set_target_archid(value); + } + else { /* error */ + return NULL; + } + Py_RETURN_NONE; +} + + +LIBXSMM_API PyObject* libxsmmpy_get_verbosity(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* libxsmmpy_get_verbosity(PyObject* self, PyObject* args) +{ + LIBXSMM_UNUSED(self); LIBXSMM_UNUSED(args); + return Py_BuildValue("i", libxsmm_get_verbosity()); +} + +LIBXSMM_API PyObject* libxsmmpy_set_verbosity(PyObject* self, PyObject* args); +LIBXSMM_API PyObject* libxsmmpy_set_verbosity(PyObject* self, PyObject* args) +{ + int value = 0; + LIBXSMM_UNUSED(self); + if (0 != PyArg_ParseTuple(args, "i", &value)) { + libxsmm_set_verbosity(value); + } + else { /* error */ + return NULL; + } + Py_RETURN_NONE; +} + + +LIBXSMM_API PyMODINIT_FUNC initlibxsmm(void); +LIBXSMM_API PyMODINIT_FUNC initlibxsmm(void) +{ + static PyMethodDef pymethod_def[] = { + { "GetTargetArch", libxsmmpy_get_target_arch, METH_NOARGS, + PyDoc_STR("Get the name of the code path.") }, + { "SetTargetArch", libxsmmpy_set_target_arch, METH_VARARGS, + PyDoc_STR("Set the name of the code path.") }, + { "GetTargetArchId", libxsmmpy_get_target_archid, METH_NOARGS, + PyDoc_STR("Get the id of the code path.") }, + { "SetTargetArchId", libxsmmpy_set_target_archid, METH_VARARGS, + PyDoc_STR("Set the id of the code path.") }, + { "GetVerbosity", libxsmmpy_get_verbosity, METH_NOARGS, + PyDoc_STR("Get the verbosity level.") }, + { "SetVerbosity", libxsmmpy_set_verbosity, METH_VARARGS, + PyDoc_STR("Set the verbosity level.") }, + { NULL, NULL, 0, NULL } /* end of table */ + }; + PyObject *const pymod = Py_InitModule3("libxsmm", pymethod_def, PyDoc_STR( + "Library targeting Intel Architecture for small, dense or " + "sparse matrix multiplications, and small convolutions.")); + PyModule_AddIntConstant(pymod, "VERSION_API", 
LIBXSMM_VERSION2(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR)); + PyModule_AddIntConstant(pymod, "VERSION_ALL", LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, + LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH)); + PyModule_AddIntConstant(pymod, "VERSION_MAJOR", LIBXSMM_VERSION_MAJOR); + PyModule_AddIntConstant(pymod, "VERSION_MINOR", LIBXSMM_VERSION_MINOR); + PyModule_AddIntConstant(pymod, "VERSION_UPDATE", LIBXSMM_VERSION_UPDATE); + PyModule_AddIntConstant(pymod, "VERSION_PATCH", LIBXSMM_VERSION_PATCH); + PyModule_AddStringConstant(pymod, "VERSION", LIBXSMM_VERSION); + PyModule_AddStringConstant(pymod, "BRANCH", LIBXSMM_BRANCH); + PyModule_AddIntConstant(pymod, "TARGET_ARCH_UNKNOWN", LIBXSMM_TARGET_ARCH_UNKNOWN); + PyModule_AddIntConstant(pymod, "TARGET_ARCH_GENERIC", LIBXSMM_TARGET_ARCH_GENERIC); + PyModule_AddIntConstant(pymod, "X86_GENERIC", LIBXSMM_X86_GENERIC); + PyModule_AddIntConstant(pymod, "X86_SSE3", LIBXSMM_X86_SSE3); + PyModule_AddIntConstant(pymod, "X86_SSE42", LIBXSMM_X86_SSE42); + PyModule_AddIntConstant(pymod, "X86_AVX", LIBXSMM_X86_AVX); + PyModule_AddIntConstant(pymod, "X86_AVX2", LIBXSMM_X86_AVX2); + PyModule_AddIntConstant(pymod, "X86_AVX512", LIBXSMM_X86_AVX512); + PyModule_AddIntConstant(pymod, "X86_AVX512_MIC", LIBXSMM_X86_AVX512_MIC); + PyModule_AddIntConstant(pymod, "X86_AVX512_KNM", LIBXSMM_X86_AVX512_KNM); + PyModule_AddIntConstant(pymod, "X86_AVX512_CORE", LIBXSMM_X86_AVX512_CORE); + PyModule_AddIntConstant(pymod, "X86_AVX512_CLX", LIBXSMM_X86_AVX512_CLX); + PyModule_AddIntConstant(pymod, "X86_AVX512_CPX", LIBXSMM_X86_AVX512_CPX); + libxsmm_init(); /* initialize LIBXSMM */ +} + +#endif /*defined(__PYTHON) && defined(LIBXSMM_BUILD) && !defined(__STATIC)*/ + diff --git a/third_party/libxsmm/src/libxsmm_rng.c b/third_party/libxsmm/src/libxsmm_rng.c new file mode 100644 index 00000000..0a8f868b --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_rng.c @@ -0,0 +1,314 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +#include "libxsmm_rng.h" +#include "libxsmm_main.h" + +#if !defined(LIBXSMM_RNG_DRAND48) && (!defined(_WIN32) && !defined(__CYGWIN__) && (defined(_SVID_SOURCE) || defined(_XOPEN_SOURCE))) +# define LIBXSMM_RNG_DRAND48 +#endif + +#if !defined(LIBXSMM_RNG_SIMD_MIN) +# define LIBXSMM_RNG_SIMD_MIN 8 +#endif + +/* dispatched RNG functions (separate typedef for legacy Cray C++ needed) */ +typedef void (*internal_rng_f32_seq_fn)(float*, libxsmm_blasint); +LIBXSMM_APIVAR_DEFINE(internal_rng_f32_seq_fn internal_rng_f32_seq); +/* 2048-bit state for RNG */ +LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state0[16]); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state1[16]); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state2[16]); +LIBXSMM_APIVAR_DEFINE(unsigned int internal_rng_state3[16]); + + +LIBXSMM_API_INLINE void internal_rng_float_jump(uint32_t* state0, uint32_t* state1, uint32_t* state2, uint32_t* state3) +{ + static const uint32_t jump_table[] = { 0x8764000b, 0xf542d2d3, 0x6fa035c3, 0x77f2db5b }; + uint32_t s0 = 0, s1 = 0, s2 = 0, s3 = 0; + int i, b; + + LIBXSMM_ASSERT(4 == sizeof(jump_table) / sizeof(*jump_table)); + for (i = 0; i < 4; ++i) { + for (b = 0; b < 32; ++b) { + if (jump_table[i] & (1U << b)) { + s0 ^= *state0; + s1 ^= *state1; + s2 ^= *state2; + s3 ^= *state3; + } + { /* draw one more integer */ + const uint32_t t = *state1 << 9; + *state2 ^= *state0; + *state3 ^= *state1; + *state1 ^= 
*state2; + *state0 ^= *state3; + *state2 ^= t; + *state3 = ((*state3 << 11) | (*state3 >> (32 - 11))); + } + } + } + *state0 = s0; + *state1 = s1; + *state2 = s2; + *state3 = s3; +} + + +LIBXSMM_API_INLINE float internal_rng_scalar_float_next(int i) +{ + const uint32_t rng_mantissa = (internal_rng_state0[i] + internal_rng_state3[i]) >> 9; + const uint32_t t = internal_rng_state1[i] << 9; + union { uint32_t i; float f; } rng; + + internal_rng_state2[i] ^= internal_rng_state0[i]; + internal_rng_state3[i] ^= internal_rng_state1[i]; + internal_rng_state1[i] ^= internal_rng_state2[i]; + internal_rng_state0[i] ^= internal_rng_state3[i]; + internal_rng_state2[i] ^= t; + internal_rng_state3[i] = ((internal_rng_state3[i] << 11) | (internal_rng_state3[i] >> (32 - 11))); + + rng.i = 0x3f800000 | rng_mantissa; + return rng.f - 1.0f; +} + + +LIBXSMM_API_INTERN void internal_rng_set_seed_sw(uint32_t seed); +LIBXSMM_API_INTERN void internal_rng_set_seed_sw(uint32_t seed) +{ + static const uint32_t temp_state[] = { + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, + 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, + 331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317, 316 + }; + libxsmm_blasint i; + + /* finish initializing the state */ + LIBXSMM_ASSERT((16 * 4) == sizeof(temp_state) / sizeof(*temp_state)); + for (i = 0; i < 16; ++i) { + internal_rng_state0[i] = seed + temp_state[i]; + internal_rng_state1[i] = seed + temp_state[i+16]; + internal_rng_state2[i] = seed + temp_state[i+32]; + internal_rng_state3[i] = seed + temp_state[i+48]; + } + for (i = 0; i < 16; ++i) { + internal_rng_float_jump( /* progress each sequence by 2^64 */ + internal_rng_state0 + i, internal_rng_state1 + i, + internal_rng_state2 + i, internal_rng_state3 + i); + } + /* for consistency, other RNGs are seeded as well */ +#if !defined(_WIN32) && 
!defined(__CYGWIN__) && (defined(_SVID_SOURCE) || defined(_XOPEN_SOURCE)) + srand48(seed); +#endif + srand(seed); +} + + +LIBXSMM_API_INLINE void internal_rng_f32_seq_sw(float* rngs, libxsmm_blasint count) +{ + libxsmm_blasint i = 0; + for (; i < count; ++i) { + rngs[i] = internal_rng_scalar_float_next(LIBXSMM_MOD2(i, 16)); + } +} + + +#if defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +void internal_rng_set_seed_avx512(uint32_t seed) +{ + internal_rng_set_seed_sw(seed); + /* bring scalar state to AVX-512 */ + LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_loadu_si512(internal_rng_state0); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_loadu_si512(internal_rng_state1); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_loadu_si512(internal_rng_state2); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_loadu_si512(internal_rng_state3); +} + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512) +void internal_rng_f32_seq_avx512(float* rngs, libxsmm_blasint count) +{ + if ((LIBXSMM_RNG_SIMD_MIN << 4) <= count) { /* SIMD code path */ + const libxsmm_blasint n = (count >> 4) << 4; /* multiple of vector-length */ + libxsmm_blasint i = 0; + for (; i < n; i += 16) { + _mm512_storeu_ps(rngs + i, LIBXSMM_INTRINSICS_MM512_RNG_PS()); + } + if (i < count) { /* remainder */ +#if 0 /* assert(0 < n) */ + if (0 < n) +#endif + { /* bring AVX-512 state to scalar */ + _mm512_storeu_si512(internal_rng_state0, LIBXSMM_INTRINSICS_MM512_RNG_STATE(0)); + _mm512_storeu_si512(internal_rng_state1, LIBXSMM_INTRINSICS_MM512_RNG_STATE(1)); + _mm512_storeu_si512(internal_rng_state2, LIBXSMM_INTRINSICS_MM512_RNG_STATE(2)); + _mm512_storeu_si512(internal_rng_state3, LIBXSMM_INTRINSICS_MM512_RNG_STATE(3)); + } + LIBXSMM_ASSERT(count < i + 16); + do { /* scalar remainder */ + rngs[i] = internal_rng_scalar_float_next(LIBXSMM_MOD2(i, 16)); + ++i; + } while (i < count); + /* bring scalar state to AVX-512 */ + 
LIBXSMM_INTRINSICS_MM512_RNG_STATE(0) = _mm512_loadu_si512(internal_rng_state0); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(1) = _mm512_loadu_si512(internal_rng_state1); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(2) = _mm512_loadu_si512(internal_rng_state2); + LIBXSMM_INTRINSICS_MM512_RNG_STATE(3) = _mm512_loadu_si512(internal_rng_state3); + } + } + else { /* scalar code path */ + internal_rng_f32_seq_sw(rngs, count); + } +} +#endif /*defined(LIBXSMM_INTRINSICS_AVX512)*/ + + +LIBXSMM_API unsigned int* libxsmm_rng_create_extstate(unsigned int/*uint32_t*/ seed) +{ + unsigned int* state = (unsigned int*) libxsmm_aligned_malloc( 64*sizeof(unsigned int), 64 ); + static const uint32_t temp_state[] = { + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 131, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, + 231, 230, 229, 228, 227, 226, 225, 224, 223, 222, 221, 220, 219, 218, 217, 216, + 331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317, 316 + }; + libxsmm_blasint i; + + /* finish initializing the state */ + LIBXSMM_ASSERT((16 * 4) == sizeof(temp_state) / sizeof(*temp_state)); + for (i = 0; i < 16; ++i) { + state[i ] = seed + temp_state[i]; + state[i+16] = seed + temp_state[i+16]; + state[i+32] = seed + temp_state[i+32]; + state[i+48] = seed + temp_state[i+48]; + } + for (i = 0; i < 16; ++i) { + internal_rng_float_jump( /* progress each sequence by 2^64 */ + state + i, state + 16 + i, + state + 32 + i, state + 48 + i); + } + + return state; +} + + +LIBXSMM_API void libxsmm_rng_destroy_extstate(unsigned int* stateptr) +{ + if ( stateptr != NULL ) { + libxsmm_free( stateptr ); + } +} + + +LIBXSMM_API void libxsmm_rng_set_seed(unsigned int/*uint32_t*/ seed) +{ + LIBXSMM_INIT +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) +# if !defined(NDEBUG) /* used to track if seed is initialized */ + internal_rng_f32_seq = internal_rng_f32_seq_avx512; +# endif + internal_rng_set_seed_avx512(seed); +#elif 
defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ + if (LIBXSMM_X86_AVX512 <= libxsmm_target_archid) { + internal_rng_f32_seq = internal_rng_f32_seq_avx512; + internal_rng_set_seed_avx512(seed); + } + else { + internal_rng_f32_seq = internal_rng_f32_seq_sw; + internal_rng_set_seed_sw(seed); + } +#else +# if !defined(NDEBUG) /* used to track if seed is initialized */ + internal_rng_f32_seq = internal_rng_f32_seq_sw; +# endif + internal_rng_set_seed_sw(seed); +#endif +} + + +LIBXSMM_API void libxsmm_rng_f32_seq(float* rngs, libxsmm_blasint count) +{ + LIBXSMM_ASSERT_MSG(NULL != internal_rng_f32_seq, "RNG must be initialized"); +#if (LIBXSMM_X86_AVX512 <= LIBXSMM_STATIC_TARGET_ARCH) + internal_rng_f32_seq_avx512(rngs, count); +#else +# if defined(LIBXSMM_INTRINSICS_AVX512) /* __AVX512F__ */ + if ((LIBXSMM_RNG_SIMD_MIN << 4) <= count) { /* SIMD code path */ + internal_rng_f32_seq(rngs, count); /* pointer based function call */ + } + else /* scalar code path */ +# endif + internal_rng_f32_seq_sw(rngs, count); +#endif +} + + +LIBXSMM_API unsigned int libxsmm_rng_u32(unsigned int n) +{ +#if defined(LIBXSMM_RNG_DRAND48) + const unsigned int q = ((1U << 31) / n) * n; + unsigned int r = (unsigned int)lrand48(); + if (q != (1U << 31)) +#else + const unsigned int rand_max1 = (unsigned int)(RAND_MAX)+1U; + const unsigned int q = (rand_max1 / n) * n; + unsigned int r = (unsigned int)rand(); + if (q != rand_max1) +#endif + { +#if defined(LIBXSMM_RNG_DRAND48) + /* coverity[dont_call] */ + while (q <= r) r = (unsigned int)lrand48(); +#else + while (q <= r) r = (unsigned int)rand(); +#endif + } + return r % n; +} + + +LIBXSMM_API void libxsmm_rng_seq(void* data, libxsmm_blasint nbytes) +{ + unsigned char* dst = (unsigned char*)data; + unsigned char* end = dst + (nbytes & 0xFFFFFFFFFFFFFFFC); + unsigned int r; + for (; dst < end; dst += 4) { +#if defined(LIBXSMM_RNG_DRAND48) + /* coverity[dont_call] */ + r = (unsigned int)lrand48(); +#else + r = (unsigned int)rand(); +#endif + 
LIBXSMM_MEMCPY127(dst, &r, 4); + } + end = (unsigned char*)data + nbytes; + if (dst < end) { +#if defined(LIBXSMM_RNG_DRAND48) + r = (unsigned int)lrand48(); +#else + r = (unsigned int)rand(); +#endif + LIBXSMM_MEMCPY127(dst, &r, end - dst); + } +} + + +LIBXSMM_API double libxsmm_rng_f64(void) +{ +#if defined(LIBXSMM_RNG_DRAND48) + /* coverity[dont_call] */ + return drand48(); +#else + static const double scale = 1.0 / (RAND_MAX); + return scale * (double)rand(); +#endif +} + diff --git a/third_party/libxsmm/src/libxsmm_spmdm.c b/third_party/libxsmm/src/libxsmm_spmdm.c new file mode 100644 index 00000000..4d677226 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_spmdm.c @@ -0,0 +1,612 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish, Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include +#include "libxsmm_main.h" + +/* Enable/disable specific code paths */ +#if defined(LIBXSMM_INTRINSICS_AVX) && !defined(LIBXSMM_SPMDM_AVX) +# define LIBXSMM_SPMDM_AVX +#endif +#if defined(LIBXSMM_INTRINSICS_AVX2) && !defined(LIBXSMM_SPMDM_AVX2) && \ + !(defined(__PGI) && defined(__cplusplus)) +# define LIBXSMM_SPMDM_AVX2 +#endif +#if defined(LIBXSMM_INTRINSICS_AVX512_CORE) && !defined(LIBXSMM_SPMDM_AVX512_CORE) && \ + !(defined(__PGI) && defined(__cplusplus)) +# define LIBXSMM_SPMDM_AVX512_CORE +#endif + + +/* function pointer for the CPUID-dispatched implementation (separate typedef for legacy Cray C++ needed) */ +typedef void (*internal_spmdm_createSparseSlice_fp32_thread_fn)(const libxsmm_spmdm_handle*, char, const float*, libxsmm_CSR_sparseslice*, int, int, int); +LIBXSMM_APIVAR_DEFINE(internal_spmdm_createSparseSlice_fp32_thread_fn internal_spmdm_createSparseSlice_fp32_thread); +typedef void (*internal_spmdm_createSparseSlice_bfloat16_thread_fn)(const libxsmm_spmdm_handle*, char, const libxsmm_bfloat16*, libxsmm_CSR_sparseslice*, int, int, int); +LIBXSMM_APIVAR_DEFINE(internal_spmdm_createSparseSlice_bfloat16_thread_fn internal_spmdm_createSparseSlice_bfloat16_thread); +typedef void (*internal_spmdm_compute_fp32_thread_fn)(const libxsmm_spmdm_handle*, char, char, const float*, libxsmm_CSR_sparseslice*, const float*, char, const float*, float*, int, int, int); +LIBXSMM_APIVAR_DEFINE(internal_spmdm_compute_fp32_thread_fn internal_spmdm_compute_fp32_thread); +typedef void (*internal_spmdm_compute_bfloat16_thread_fn)(const libxsmm_spmdm_handle*, char, char, const libxsmm_bfloat16*, libxsmm_CSR_sparseslice*, const libxsmm_bfloat16*, char, const libxsmm_bfloat16*, float*, int, int, int); +LIBXSMM_APIVAR_DEFINE(internal_spmdm_compute_bfloat16_thread_fn internal_spmdm_compute_bfloat16_thread); + +#if defined(LIBXSMM_SPMDM_AVX) +LIBXSMM_APIVAR_DEFINE(__m256i* 
internal_spmdm_shufmasks_32); +LIBXSMM_APIVAR_DEFINE(__m256i* internal_spmdm_shufmasks_16); +#endif + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_init_shufmask_avx(void) +{ +#if defined(LIBXSMM_SPMDM_AVX) + static __m256i spmdm_shufmasks_32[256], spmdm_shufmasks_16[256]; + LIBXSMM_ALIGNED(int temp_shufmasks[8], 64); + LIBXSMM_ALIGNED(uint16_t temp_shufmasks2[16], 64); + unsigned int i, j, c, last_bit; + int cnt; + for (i = 0; i < 256; i++) { + cnt = 0; + j = i; + for (c = 0; c < 8; c++) temp_shufmasks[c] = 0; + for (c = 0; c < 16; c++) temp_shufmasks2[c] = 0; + while (j) { + last_bit = LIBXSMM_INTRINSICS_BITSCANFWD32(j); + temp_shufmasks[cnt] = last_bit; + temp_shufmasks2[cnt] = (uint16_t)last_bit; + j &= (~(1<mb; + int k_blocks = handle->kb; + + const size_t sz_block = (((size_t)handle->bm + 1) * sizeof(uint16_t) + + (size_t)handle->bm * handle->bk * sizeof(uint16_t) + + (size_t)handle->bm * handle->bk * sizeof(float) + + sizeof(libxsmm_CSR_sparseslice)); + size_t sz_all_blocks = sz_block * handle->mb * handle->kb; + char* memory_block = 0; + void *const pv = &memory_block; + + /* use low-level scratch memory allocation since life-time of this buffer is unknown */ + if (EXIT_SUCCESS == libxsmm_xmalloc((void**)pv, sz_all_blocks, 2097152, + LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE, 0/*extra*/, 0/*extra_size*/)) + { + char* memory_head = memory_block; + libxsmm_CSR_sparseslice* libxsmm_output_csr_a = (libxsmm_CSR_sparseslice*)(memory_head); + memory_head += (size_t)handle->mb * handle->kb * sizeof(libxsmm_CSR_sparseslice); + LIBXSMM_ASSERT(0 != libxsmm_output_csr_a/*sanity check*/); + + for (kb = 0; kb < k_blocks; kb++) { + for (mb = 0; mb < m_blocks; mb++) { + int i = kb*m_blocks + mb; + libxsmm_output_csr_a[i].rowidx = (uint16_t*)(memory_head); + memory_head += ((size_t)handle->bm + 1) * sizeof(uint16_t); + libxsmm_output_csr_a[i].colidx = (uint16_t*)(memory_head); + memory_head += 
(size_t)handle->bm * handle->bk * sizeof(uint16_t); + libxsmm_output_csr_a[i].values = (float*)(memory_head); + memory_head += (size_t)handle->bm * handle->bk * sizeof(float); + } + } + LIBXSMM_ASSERT(memory_head == (memory_block + sz_all_blocks)); + *libxsmm_output_csr = libxsmm_output_csr_a; + } + else if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: SPMDM CSR scratch memory allocation failed!\n"); + } + + handle->base_ptr_scratch_A = memory_block; +} + + +LIBXSMM_API_INLINE void internal_spmdm_allocate_scratch(libxsmm_spmdm_handle* handle, int max_threads) +{ + void *const pv = &handle->base_ptr_scratch_B_scratch_C; + size_t sz_total_memory, sz_memory_for_scratch_per_thread = + (size_t)handle->bm * handle->bn * sizeof(float) + + (size_t)handle->bk * handle->bn * sizeof(float); + sz_memory_for_scratch_per_thread = LIBXSMM_UP2(sz_memory_for_scratch_per_thread, 4096); + sz_total_memory = sz_memory_for_scratch_per_thread * max_threads; + handle->base_ptr_scratch_B_scratch_C = 0; + + /* use low-level scratch memory allocation since life-time of this buffer is unknown */ + if (EXIT_SUCCESS == libxsmm_xmalloc((void**)pv, sz_total_memory, 2097152, + LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE, 0/*extra*/, 0/*extra_size*/)) + { + handle->memory_for_scratch_per_thread = (int)sz_memory_for_scratch_per_thread; + } + else { + if (0 != libxsmm_verbosity) { /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: SPMDM scratch memory allocation failed!\n"); + } + handle->memory_for_scratch_per_thread = 0; + } +} + + +LIBXSMM_API_INLINE void internal_spmdm_deallocate_csr_a(libxsmm_spmdm_handle* handle) +{ + libxsmm_xfree(handle->base_ptr_scratch_A, 0/*no check*/); + handle->base_ptr_scratch_A = NULL; + libxsmm_xfree(handle->base_ptr_scratch_B_scratch_C, 0/*no check*/); + handle->base_ptr_scratch_B_scratch_C = NULL; +} + + +LIBXSMM_API void libxsmm_spmdm_destroy(libxsmm_spmdm_handle* 
handle) +{ + internal_spmdm_deallocate_csr_a(handle); +} + + +LIBXSMM_API int libxsmm_spmdm_get_num_createSparseSlice_blocks(const libxsmm_spmdm_handle* handle) +{ + return handle->mb * handle->kb; +} + + +LIBXSMM_API int libxsmm_spmdm_get_num_compute_blocks(const libxsmm_spmdm_handle* handle) +{ + return handle->mb * handle->nb; +} + + +LIBXSMM_API_INLINE +void internal_spmdm_createSparseSlice_fp32_thread_sw( + const libxsmm_spmdm_handle* handle, + char transa, + const float* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ +# include "libxsmm_spmdm_begin.h" +# include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_fp32_thread_avx2( + const libxsmm_spmdm_handle* handle, + char transa, + const float* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX2) +# include "libxsmm_spmdm_begin_avx2.h" +# include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_createSparseSlice_fp32_thread_sw(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); +#endif +} + + +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_fp32_thread_avx512_core( + const libxsmm_spmdm_handle* handle, + char transa, + const float* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +# include "libxsmm_spmdm_begin_avx512.h" +# include "template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_createSparseSlice_fp32_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, 
tid, nthreads); +#endif +} +#endif + + +LIBXSMM_API +void libxsmm_spmdm_createSparseSlice_fp32_thread( + const libxsmm_spmdm_handle* handle, + char transa, + const float* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ + /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ +#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) + internal_spmdm_createSparseSlice_fp32_thread_avx512_core(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); +#elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ + (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) + internal_spmdm_createSparseSlice_fp32_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); +#else /* pointer based function call */ + LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_fp32_thread); + internal_spmdm_createSparseSlice_fp32_thread(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads); +#endif +} + + +LIBXSMM_API_INLINE +void internal_spmdm_createSparseSlice_bfloat16_thread_sw( + const libxsmm_spmdm_handle* handle, + char transa, + const libxsmm_bfloat16* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ +# include "libxsmm_spmdm_begin.h" +# include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_bfloat16_thread_avx2( + const libxsmm_spmdm_handle* handle, + char transa, + const libxsmm_bfloat16* a, + libxsmm_CSR_sparseslice* libxsmm_output_csr_a, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX2) +# include "libxsmm_spmdm_begin_avx2.h" +# include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c" +# 
include "libxsmm_spmdm_end.h"
+#else
+  internal_spmdm_createSparseSlice_bfloat16_thread_sw(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads);
+#endif
+}
+
+/* AVX-512 (CORE) variant; only compiled when LIBXSMM_SPMDM_AVX512_CORE is defined. */
+#if defined(LIBXSMM_SPMDM_AVX512_CORE)
+LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE)
+LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core(
+  const libxsmm_spmdm_handle* handle,
+  char transa,
+  const libxsmm_bfloat16* a,
+  libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
+  int block_id,
+  int tid, int nthreads)
+{
+#if defined(LIBXSMM_SPMDM_AVX512_CORE)
+# include "libxsmm_spmdm_begin_avx512.h"
+# include "template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c"
+# include "libxsmm_spmdm_end.h"
+#else
+  internal_spmdm_createSparseSlice_bfloat16_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads);
+#endif
+}
+#endif
+
+/* bfloat16 createSparseSlice entry point: calls the AVX-512 or AVX2 variant directly when the static target allows it, else dispatches through the function pointer. */
+LIBXSMM_API
+void libxsmm_spmdm_createSparseSlice_bfloat16_thread(
+  const libxsmm_spmdm_handle* handle,
+  char transa,
+  const libxsmm_bfloat16* a,
+  libxsmm_CSR_sparseslice* libxsmm_output_csr_a,
+  int block_id,
+  int tid, int nthreads)
+{
+  /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */
+#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE)
+  internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads);
+#elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \
+      (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH)
+  internal_spmdm_createSparseSlice_bfloat16_thread_avx2(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads);
+#else /* pointer based function call (the pointer is assigned by libxsmm_spmdm_init) */
+  LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_bfloat16_thread); /* check the pointer that is actually called below */
+  internal_spmdm_createSparseSlice_bfloat16_thread(handle, transa, a, libxsmm_output_csr_a, block_id, tid, nthreads);
+#endif
+}
+
+
+LIBXSMM_API_INLINE +void internal_spmdm_compute_fp32_thread_sw( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const float* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const float* b, + char transc, + const float* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +# include "libxsmm_spmdm_begin.h" +# include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_fp32_thread_avx2( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const float* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const float* b, + char transc, + const float* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX2) +# include "libxsmm_spmdm_begin_avx2.h" +# include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_compute_fp32_thread_sw(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} + + +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_fp32_thread_avx512_core( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const float* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const float* b, + char transc, + const float* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +# include "libxsmm_spmdm_begin_avx512.h" +# include "template/libxsmm_spmdm_compute_fp32_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_compute_fp32_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} +#endif + + +LIBXSMM_API +void libxsmm_spmdm_compute_fp32_thread( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, 
+ const float* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const float* b, + char transc, + const float* beta, + float* c, + int block_id, + int tid, int nthreads) +{ + /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ +#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) + internal_spmdm_compute_fp32_thread_avx512_core(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ + (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) + internal_spmdm_compute_fp32_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#else /* pointer based function call */ + LIBXSMM_ASSERT(0 != internal_spmdm_compute_fp32_thread); + internal_spmdm_compute_fp32_thread(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} + + +LIBXSMM_API_INLINE +void internal_spmdm_compute_bfloat16_thread_sw( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const libxsmm_bfloat16* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const libxsmm_bfloat16* b, + char transc, + const libxsmm_bfloat16* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +# include "libxsmm_spmdm_begin.h" +# include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +} + + +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX2) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_bfloat16_thread_avx2( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const libxsmm_bfloat16* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const libxsmm_bfloat16* b, + char transc, + const libxsmm_bfloat16* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX2) +# include "libxsmm_spmdm_begin_avx2.h" +# 
include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_compute_bfloat16_thread_sw(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} + + +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +LIBXSMM_API_INLINE LIBXSMM_INTRINSICS(LIBXSMM_X86_AVX512_CORE) +LIBXSMM_ATTRIBUTE_UNUSED void internal_spmdm_compute_bfloat16_thread_avx512_core( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const libxsmm_bfloat16* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const libxsmm_bfloat16* b, + char transc, + const libxsmm_bfloat16* beta, + float* c, + int block_id, + int tid, int nthreads) +{ +#if defined(LIBXSMM_SPMDM_AVX512_CORE) +# include "libxsmm_spmdm_begin_avx512.h" +# include "template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c" +# include "libxsmm_spmdm_end.h" +#else + internal_spmdm_compute_bfloat16_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} +#endif + + +LIBXSMM_API +void libxsmm_spmdm_compute_bfloat16_thread( + const libxsmm_spmdm_handle* handle, + char transa, + char transb, + const libxsmm_bfloat16* alpha, + libxsmm_CSR_sparseslice* a_sparse, + const libxsmm_bfloat16* b, + char transc, + const libxsmm_bfloat16* beta, + float* c, + int block_id, + int tid, int nthreads) +{ + /* if highest implemented code path is statically present, no need for an indirect call (function pointer) */ +#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) && defined(LIBXSMM_SPMDM_AVX512_CORE) + internal_spmdm_compute_bfloat16_thread_avx512_core(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#elif (LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) && /* no need for an indirect call */ \ + (LIBXSMM_X86_AVX512_CORE > LIBXSMM_MAX_STATIC_TARGET_ARCH) + internal_spmdm_compute_bfloat16_thread_avx2(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, 
block_id, tid, nthreads); +#else /* pointer based function call */ + LIBXSMM_ASSERT(0 != internal_spmdm_compute_bfloat16_thread); + internal_spmdm_compute_bfloat16_thread(handle, transa, transb, alpha, a_sparse, b, transc, beta, c, block_id, tid, nthreads); +#endif +} + + +LIBXSMM_API void libxsmm_spmdm_init(int M, int N, int K, int max_threads, + libxsmm_spmdm_handle* handle, libxsmm_CSR_sparseslice** libxsmm_output_csr) +{ + double load_imbalance_tolerate = 1.1; + int max_work_per_block; + double avg_work_per_block; + int max_blocks_per_thread; + double avg_blocks_per_thread; + double load_imbalance_1, load_imbalance_2, load_imbalance; + + libxsmm_init(); /* !LIBXSMM_INIT */ + { unsigned int dummy = + LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_statistic_num_spmdm, 1, + LIBXSMM_ATOMIC_RELAXED); /* count number of invocations */ + LIBXSMM_UNUSED(dummy); + } + + handle->m = M; + handle->n = N; + handle->k = K; + handle->bm = (M >= 4096 || M <= 1024) ? 512 : 256; + +#if defined(LIBXSMM_SPMDM_AVX512_CORE) + if (LIBXSMM_X86_AVX512_CORE <= libxsmm_target_archid || LIBXSMM_X86_AVX512_CORE <= LIBXSMM_STATIC_TARGET_ARCH) { + internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_avx512_core; + internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_avx512_core; + internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_avx512_core; + internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_avx512_core; + handle->bn = 96; + } + else +#endif +#if defined(LIBXSMM_SPMDM_AVX2) + if (LIBXSMM_X86_AVX2 <= libxsmm_target_archid || LIBXSMM_X86_AVX2 <= LIBXSMM_STATIC_TARGET_ARCH) { + internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_avx2; + internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_avx2; + internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_avx2; + 
internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_avx2; + handle->bn = 48; + } + else +#endif + { + internal_spmdm_createSparseSlice_fp32_thread = internal_spmdm_createSparseSlice_fp32_thread_sw; + internal_spmdm_createSparseSlice_bfloat16_thread = internal_spmdm_createSparseSlice_bfloat16_thread_sw; + internal_spmdm_compute_fp32_thread = internal_spmdm_compute_fp32_thread_sw; + internal_spmdm_compute_bfloat16_thread = internal_spmdm_compute_bfloat16_thread_sw; + handle->bn = 6; + } + handle->bk = 128; + handle->mb = LIBXSMM_UPDIV(handle->m, handle->bm); + handle->nb = LIBXSMM_UPDIV(handle->n, handle->bn); + handle->kb = LIBXSMM_UPDIV(handle->k, handle->bk); + + max_work_per_block = handle->bm * handle->bn; + avg_work_per_block = (double)((size_t)handle->m * handle->n) / ((size_t)handle->mb * handle->nb); + load_imbalance_1 = max_work_per_block / avg_work_per_block; + max_blocks_per_thread = LIBXSMM_UPDIV(handle->mb * handle->nb, max_threads); + avg_blocks_per_thread = (double)handle->mb * handle->nb / max_threads; + load_imbalance_2 = max_blocks_per_thread / avg_blocks_per_thread; + load_imbalance = load_imbalance_1 * load_imbalance_2; + + while (32 < handle->bm && load_imbalance > load_imbalance_tolerate) { + handle->bm--; + handle->mb = LIBXSMM_UPDIV(handle->m, handle->bm); + + max_blocks_per_thread = LIBXSMM_UPDIV(handle->mb * handle->nb, max_threads); + avg_blocks_per_thread = (double)handle->mb * handle->nb / max_threads; + load_imbalance_2 = max_blocks_per_thread / avg_blocks_per_thread; + max_work_per_block = handle->bm * handle->bn; + avg_work_per_block = (double)((size_t)handle->m * handle->n) / ((size_t)handle->mb * handle->nb); + load_imbalance_1 = max_work_per_block / avg_work_per_block; + load_imbalance = load_imbalance_1 * load_imbalance_2; + } + + /* This is temporary space needed; allocate for each different size of a */ + internal_spmdm_allocate_csr_a(handle, libxsmm_output_csr); + 
internal_spmdm_allocate_scratch(handle, max_threads); + + /* Initialize shuffle masks for the computation */ +#if defined(LIBXSMM_SPMDM_AVX) + if (LIBXSMM_X86_AVX <= libxsmm_target_archid || LIBXSMM_X86_AVX <= LIBXSMM_STATIC_TARGET_ARCH) { + internal_spmdm_init_shufmask_avx(); + LIBXSMM_ASSERT(0 != internal_spmdm_shufmasks_32); + LIBXSMM_ASSERT(0 != internal_spmdm_shufmasks_16); + } +#endif + /* post-conditions */ + LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_fp32_thread); + LIBXSMM_ASSERT(0 != internal_spmdm_createSparseSlice_bfloat16_thread); + LIBXSMM_ASSERT(0 != internal_spmdm_compute_fp32_thread); + LIBXSMM_ASSERT(0 != internal_spmdm_compute_bfloat16_thread); +} + diff --git a/third_party/libxsmm/src/libxsmm_spmdm_begin.h b/third_party/libxsmm/src/libxsmm_spmdm_begin.h new file mode 100644 index 00000000..af703326 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_spmdm_begin.h @@ -0,0 +1,64 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish, Hans Pabst (Intel Corp.) 
+******************************************************************************/
+
+#define SIMD_WIDTH_FP32 (1)
+#define SIMDTYPE_FP32 float
+#define SIMDTYPE_INT32 int
+#define SIMDMASKTYPE_FP32 int
+#define _MM_SETZERO_FP32() (0)
+#define _MM_SETZERO_INT32() (0)
+#define _MM_SET1_FP32(x) (x)
+#define _MM_SET1_INT32(x) (x)
+#define _MM_SET1_INT16(x) (x) /* function-like, same shape as the other _MM_SET1_* macros */
+#define _MM_LOAD_FP32(x) (*(x))
+#define _MM_LOADU_FP32(x) (*(x))
+#define _MM_LOAD_INT32(x) (*(x))
+#define _MM_STORE_INT32(x,y) ((*(x)) = (y))
+#define _MM_LOADU_INT32(x) (*(x))
+#define _MM_GATHER_FP32(Addr, idx, scale) (*(Addr + (idx)))
+#define _MM_CMPNEQ_FP32(v1,v2) (LIBXSMM_FEQ(v1, v2) ? 0 : 1)
+#define _MM_STORE_FP32(x,y) ((*(x)) = (y))
+#define _MM_STOREU_FP32(x,y) ((*(x)) = (y))
+#define _MM_ADD_FP32(x,y) ((x) + (y))
+#define _MM_FMADD_FP32(x,y,z) (((x)*(y))+(z))
+#define _MM_MUL_FP32(x,y) ((x)*(y))
+#define _MM_PREFETCH(x, y)
+#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) ((*(ptr_B)) = (*(ptr_A)))
+#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \
+  uint16_t restmp = (*(ptr_A)); \
+  union { unsigned int i; float f; } res; /* unsigned: the shift below must not overflow a signed int */ \
+  res.i = restmp; \
+  res.i <<= 16; \
+  (*(ptr_B)) = res.f; \
+}
+
+#define COMPRESS_FP32(v, k, m, cnt) if (m) { \
+  values_ptr[cnt] = v; \
+  colidx_ptr[cnt] = (uint16_t)(k); \
+  cnt++; \
+}
+/* Unpack two bfloat16 values from one 32-bit word: low half -> vlo_final, high half -> vhi_final. */
+#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \
+  union { unsigned int i; float f; } vlo_tmp, vhi_tmp; \
+  vlo_tmp.i = (v) & 0xFFFF; vlo_tmp.i <<= 16; \
+  vlo_final = vlo_tmp.f; \
+  vhi_tmp.i = (v) & 0xFFFF0000; /* high lane already sits in the upper 16 bits (see COMPRESS_BFLOAT16) */ \
+  vhi_final = vhi_tmp.f; \
+}
+/* Pack the upper 16 bits of two floats into one 32-bit word: vlo in the low half, vhi in the high half. */
+#define COMPRESS_BFLOAT16(vlo, vhi, v) { \
+  union { unsigned int i; float f; } vlo_tmp, vhi_tmp; /* unsigned: >> 16 must not sign-extend into the high lane */ \
+  vlo_tmp.f = vlo; \
+  v = (vlo_tmp.i >> 16); \
+  vhi_tmp.f = vhi; \
+  v = v | (vhi_tmp.i & 0xFFFF0000); \
+}
+
diff --git a/third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h b/third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h
new file mode 100644
index 00000000..0912a489
--- /dev/null
+++ 
b/third_party/libxsmm/src/libxsmm_spmdm_begin_avx2.h @@ -0,0 +1,166 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish, Hans Pabst (Intel Corp.) +******************************************************************************/ +#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# error "libxsmm_intrinsics_x86.h not included!" +#endif + +#if (LIBXSMM_X86_AVX2 <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +#define SIMD_WIDTH_FP32 (8) +#define SIMDTYPE_FP32 __m256 +#define SIMDTYPE_INT32 __m256i +#define SIMDMASKTYPE_FP32 __m256 +#define _MM_SETZERO_FP32 _mm256_setzero_ps +#define _MM_SETZERO_INT32 _mm256_setzero_si256 +#define _MM_SET1_FP32 _mm256_set1_ps +#define _MM_SET1_INT32 _mm256_set1_epi32 +#define _MM_SET1_INT16 _mm256_set1_epi16 +#define _MM_SET_INT32 _mm256_set_epi32 +#define _MM_LOAD_FP32 _mm256_loadu_ps +#define _MM_LOADU_FP32 _mm256_loadu_ps +#define _MM_LOAD_INT32 _mm256_loadu_si256 +#define _MM_STORE_INT32 _mm256_storeu_si256 +#define _MM_LOADU_INT32(x) _mm256_loadu_si256( (__m256i const *)(x)) +#define _MM_GATHER_INT32(Addr, idx, scale) _mm256_i32gather_epi32((Addr), (idx), (scale)) +#define _MM_GATHER_FP32(Addr, idx, scale) _mm256_i32gather_ps(((float const *)(Addr)), (idx), (scale)) +#define _MM_CMPNEQ_FP32(v1,v2) _mm256_cmp_ps(v1,v2,12) +#define _MM_STORE_FP32 _mm256_storeu_ps +#define _MM_STOREU_FP32 _mm256_storeu_ps +#define _MM_ADD_FP32 _mm256_add_ps +#define _MM_FMADD_FP32 _mm256_fmadd_ps +#define _MM_MUL_FP32 _mm256_mul_ps +#define _MM_PREFETCH(x, y) _mm_prefetch(x, y) +#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \ + __m256 ymm9 = 
_mm256_loadu_ps(ptr_A); \ + __m256 ymm10 = _mm256_loadu_ps(ptr_A + (size_t)ldA); \ + __m256 ymm11 = _mm256_loadu_ps(ptr_A + (size_t)ldA*2); \ + __m256 ymm12 = _mm256_loadu_ps(ptr_A + (size_t)ldA*3); \ + __m256 ymm13 = _mm256_loadu_ps(ptr_A + (size_t)ldA*4); \ + __m256 ymm14 = _mm256_loadu_ps(ptr_A + (size_t)ldA*5); \ + __m256 ymm15 = _mm256_loadu_ps(ptr_A + (size_t)ldA*6); \ + __m256 ymm2 = _mm256_loadu_ps(ptr_A + (size_t)ldA*7); \ + __m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \ + __m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \ + __m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \ + __m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \ + ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \ + __m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \ + ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \ + ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \ + __m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \ + ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \ + ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \ + ymm3 = _mm256_permute2f128_ps(ymm10, ymm11, 0x20); \ + _mm256_storeu_ps(ptr_B, ymm3);{ \ + __m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \ + ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \ + __m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \ + ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \ + ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \ + ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \ + ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \ + ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \ + ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \ + ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \ + ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \ + ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \ + ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \ + ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \ + ymm5 = 
_mm256_permute2f128_ps(ymm8, ymm5, 0x31); \ + ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}} \ +} + +#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \ + __m256 ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm2; \ + __m256i vload_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A))); \ + vload_1 = _mm256_inserti128_si256(vload_1, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA)), 1); \ + EXPAND_BFLOAT16(vload_1, ymm9, ymm10);{ \ + __m256i vload_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*2))); \ + vload_2 = _mm256_inserti128_si256(vload_2, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*3)), 1); \ + EXPAND_BFLOAT16(vload_2, ymm11, ymm12);{ \ + __m256i vload_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*4))); \ + vload_3 = _mm256_inserti128_si256(vload_3, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*5)), 1); \ + EXPAND_BFLOAT16(vload_3, ymm13, ymm14);{ \ + __m256i vload_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*6))); \ + vload_4 = _mm256_inserti128_si256(vload_4, _mm_loadu_si128((const __m128i*)(ptr_A + (size_t)ldA*7)), 1); \ + EXPAND_BFLOAT16(vload_4, ymm15, ymm2);{ \ + __m256 ymm6 = _mm256_unpacklo_ps(ymm9, ymm10); \ + __m256 ymm1 = _mm256_unpacklo_ps(ymm11, ymm12); \ + __m256 ymm8 = _mm256_unpackhi_ps(ymm9, ymm10); \ + __m256 ymm0 = _mm256_unpacklo_ps(ymm13, ymm14); \ + ymm9 = _mm256_unpacklo_ps(ymm15, ymm2);{ \ + __m256 ymm3 = _mm256_shuffle_ps(ymm6, ymm1, 0x4E); \ + ymm10 = _mm256_blend_ps(ymm6, ymm3, 0xCC); \ + ymm6 = _mm256_shuffle_ps(ymm0, ymm9, 0x4E);{ \ + __m256 ymm7 = _mm256_unpackhi_ps(ymm11, ymm12); \ + ymm11 = _mm256_blend_ps(ymm0, ymm6, 0xCC); \ + ymm12 = _mm256_blend_ps(ymm3, ymm1, 0xCC); \ + ymm3 = _mm256_permute2f128_ps(ymm10, 
ymm11, 0x20); \ + _mm256_storeu_ps(ptr_B, ymm3);{ \ + __m256 ymm5 = _mm256_unpackhi_ps(ymm13, ymm14); \ + ymm13 = _mm256_blend_ps(ymm6, ymm9, 0xCC);{ \ + __m256 ymm4 = _mm256_unpackhi_ps(ymm15, ymm2); \ + ymm2 = _mm256_permute2f128_ps(ymm12, ymm13, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB, ymm2); \ + ymm14 = _mm256_shuffle_ps(ymm8, ymm7, 0x4E); \ + ymm15 = _mm256_blend_ps(ymm14, ymm7, 0xCC); \ + ymm7 = _mm256_shuffle_ps(ymm5, ymm4, 0x4E); \ + ymm8 = _mm256_blend_ps(ymm8, ymm14, 0xCC); \ + ymm5 = _mm256_blend_ps(ymm5, ymm7, 0xCC); \ + ymm6 = _mm256_permute2f128_ps(ymm8, ymm5, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*2, ymm6); \ + ymm4 = _mm256_blend_ps(ymm7, ymm4, 0xCC); \ + ymm7 = _mm256_permute2f128_ps(ymm15, ymm4, 0x20); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*3, ymm7); \ + ymm1 = _mm256_permute2f128_ps(ymm10, ymm11, 0x31); \ + ymm0 = _mm256_permute2f128_ps(ymm12, ymm13, 0x31); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*4, ymm1); \ + ymm5 = _mm256_permute2f128_ps(ymm8, ymm5, 0x31); \ + ymm4 = _mm256_permute2f128_ps(ymm15, ymm4, 0x31); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*5, ymm0); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*6, ymm5); \ + _mm256_storeu_ps(ptr_B + (size_t)ldB*7, ymm4);}}}}}}}} \ +} + +#define COMPRESS_FP32(v, k, m, cnt) { \ + const unsigned int mask = _mm256_movemask_ps(m); \ + const SIMDTYPE_INT32 vk = _MM_SET1_INT16((short)(k)); \ + const __m256i perm_ctrl = _mm256_loadu_si256(&shufmasks[mask]); \ + const __m256 v_packed = _mm256_permutevar8x32_ps(v, perm_ctrl); \ + const __m256i v_shuff = _mm256_loadu_si256(&shufmasks2[mask]); \ + const __m256i v_idx = _mm256_add_epi32(vk, v_shuff); \ + _mm256_storeu_ps(values_ptr + (cnt), v_packed); \ + _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \ + cnt = (unsigned short)((cnt) + _mm_popcnt_u32(mask)); \ +} + +#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \ + const __m256i vlo = _mm256_unpacklo_epi16(vzero, v); \ + const __m256i vhi = _mm256_unpackhi_epi16(vzero, v); \ + 
vlo_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x20)); \ + vhi_final = _mm256_castsi256_ps(_mm256_permute2f128_si256(vlo, vhi, 0x31)); \ +} + +#define COMPRESS_BFLOAT16(vlo, vhi, v) { \ + const __m256i vtmp1 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x20)); \ + const __m256i vtmp2 = _mm256_castps_si256(_mm256_permute2f128_ps(vlo, vhi, 0x31)); \ + const __m256i a = _mm256_srli_epi32(vtmp1, 16), b = _mm256_srli_epi32(vtmp2, 16); \ + v = _mm256_packus_epi32(a, b); \ +} + +#endif + diff --git a/third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h b/third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h new file mode 100644 index 00000000..0174e287 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_spmdm_begin_avx512.h @@ -0,0 +1,310 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish, Hans Pabst (Intel Corp.) +******************************************************************************/ +#if !defined(LIBXSMM_MAX_STATIC_TARGET_ARCH) +# error "libxsmm_intrinsics_x86.h not included!" 
+#endif + +#if (LIBXSMM_X86_AVX512_CORE <= LIBXSMM_MAX_STATIC_TARGET_ARCH) +#define SIMD_WIDTH_FP32 (16) +#define SIMDTYPE_FP32 __m512 +#define SIMDTYPE_INT32 __m512i +#define SIMDMASKTYPE_FP32 __mmask16 +#define _MM_SETZERO_FP32 _mm512_setzero_ps +#define _MM_SETZERO_INT32 _mm512_setzero_epi32 +#define _MM_SET1_FP32 _mm512_set1_ps +#define _MM_SET1_INT32 _mm512_set1_epi32 +#define _MM_SET1_INT16 _mm512_set1_epi16 +#define _MM_SET_INT32 _mm512_set_epi32 +#define _MM_LOAD_FP32 LIBXSMM_INTRINSICS_MM512_LOAD_PS +#define _MM_LOADU_FP32 _mm512_loadu_ps +#define _MM_LOAD_INT32 _mm512_loadu_si512 +#define _MM_STORE_INT32 _mm512_storeu_si512 +#define _MM_LOADU_INT32(x) _mm512_loadu_si512( (void const *)(x)) +#define _MM_GATHER_INT32(Addr, idx, scale) _mm512_i32gather_epi32((idx), (Addr), (scale)) +#define _MM_GATHER_FP32(Addr, idx, scale) _mm512_i32gather_ps((idx), (Addr), (scale)) +#define _MM_CMPNEQ_FP32(v1,v2) _mm512_cmp_ps_mask(v1,v2,12) +#define _MM_STORE_FP32 _mm512_storeu_ps +#define _MM_STOREU_FP32 _mm512_storeu_ps +#define _MM_ADD_FP32 _mm512_add_ps +#define _MM_FMADD_FP32 _mm512_fmadd_ps +#define _MM_MUL_FP32 _mm512_mul_ps +#define _MM_PREFETCH(x, y) _mm_prefetch(x, y) +#define TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_A, ldA, ptr_B, ldB) { \ + __m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \ + r0 = _mm512_loadu_ps(ptr_A); \ + r1 = _mm512_loadu_ps(ptr_A + ldA); \ + r2 = _mm512_loadu_ps(ptr_A + 2*ldA); \ + r3 = _mm512_loadu_ps(ptr_A + 3*ldA); \ + r4 = _mm512_loadu_ps(ptr_A + 4*ldA); \ + r5 = _mm512_loadu_ps(ptr_A + 5*ldA); \ + r6 = _mm512_loadu_ps(ptr_A + 6*ldA); \ + r7 = _mm512_loadu_ps(ptr_A + 7*ldA); \ + r8 = _mm512_loadu_ps(ptr_A + 8*ldA); \ + r9 = _mm512_loadu_ps(ptr_A + 9*ldA); \ + ra = _mm512_loadu_ps(ptr_A + 10*ldA); \ + rb = _mm512_loadu_ps(ptr_A + 11*ldA); \ + rc = _mm512_loadu_ps(ptr_A + 12*ldA); \ + rd = _mm512_loadu_ps(ptr_A + 13*ldA); \ + re = 
_mm512_loadu_ps(ptr_A + 14*ldA); \ + rf = _mm512_loadu_ps(ptr_A + 15*ldA); \ + \ + t0 = _mm512_unpacklo_ps(r0,r1); \ + t1 = _mm512_unpackhi_ps(r0,r1); \ + t2 = _mm512_unpacklo_ps(r2,r3); \ + t3 = _mm512_unpackhi_ps(r2,r3); \ + t4 = _mm512_unpacklo_ps(r4,r5); \ + t5 = _mm512_unpackhi_ps(r4,r5); \ + t6 = _mm512_unpacklo_ps(r6,r7); \ + t7 = _mm512_unpackhi_ps(r6,r7); \ + t8 = _mm512_unpacklo_ps(r8,r9); \ + t9 = _mm512_unpackhi_ps(r8,r9); \ + ta = _mm512_unpacklo_ps(ra,rb); \ + tb = _mm512_unpackhi_ps(ra,rb); \ + tc = _mm512_unpacklo_ps(rc,rd); \ + td = _mm512_unpackhi_ps(rc,rd); \ + te = _mm512_unpacklo_ps(re,rf); \ + tf = _mm512_unpackhi_ps(re,rf); \ + \ + { const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \ + r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \ + r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \ + r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \ + r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \ + r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \ + ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + { const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \ + rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + 
{ const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \ + re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2));} \ + \ + t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \ + t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \ + t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \ + t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \ + t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \ + t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \ + t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \ + t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \ + t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \ + t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \ + ta = _mm512_shuffle_f32x4(ra, re, 0x88); \ + tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \ + tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \ + td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \ + te = _mm512_shuffle_f32x4(ra, re, 0xdd); \ + tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \ + \ + r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \ + r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \ + r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \ + r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \ + r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \ + r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \ + r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \ + r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \ + r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \ + r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \ + ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \ + rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \ + rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \ + rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \ + re = _mm512_shuffle_f32x4(t6, te, 0xdd); \ + rf = _mm512_shuffle_f32x4(t7, tf, 0xdd); \ + \ + _mm512_storeu_ps(ptr_B + 0*ldB, r0); \ + _mm512_storeu_ps(ptr_B + 1*ldB, r1); \ + _mm512_storeu_ps(ptr_B + 2*ldB, r2); \ + _mm512_storeu_ps(ptr_B + 3*ldB, r3); \ + _mm512_storeu_ps(ptr_B + 4*ldB, r4); \ + _mm512_storeu_ps(ptr_B + 5*ldB, r5); \ + _mm512_storeu_ps(ptr_B + 6*ldB, r6); \ + _mm512_storeu_ps(ptr_B + 7*ldB, r7); \ + _mm512_storeu_ps(ptr_B + 
8*ldB, r8); \ + _mm512_storeu_ps(ptr_B + 9*ldB, r9); \ + _mm512_storeu_ps(ptr_B + 10*ldB, ra); \ + _mm512_storeu_ps(ptr_B + 11*ldB, rb); \ + _mm512_storeu_ps(ptr_B + 12*ldB, rc); \ + _mm512_storeu_ps(ptr_B + 13*ldB, rd); \ + _mm512_storeu_ps(ptr_B + 14*ldB, re); \ + _mm512_storeu_ps(ptr_B + 15*ldB, rf); \ +} + +#define TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_A, ldA, ptr_B, ldB) { \ + __m512 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; \ + __m512i vload_1 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A))); \ + vload_1 = _mm512_inserti32x8(vload_1, _mm256_loadu_si256((const __m256i*)(ptr_A + ldA)), 1); \ + EXPAND_BFLOAT16(vload_1, r0, r1);{ \ + __m512i vload_2 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 2*ldA))); \ + vload_2 = _mm512_inserti32x8(vload_2, _mm256_loadu_si256((const __m256i*)(ptr_A + 3*ldA)), 1); \ + EXPAND_BFLOAT16(vload_2, r2, r3);{ \ + __m512i vload_3 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 4*ldA))); \ + vload_3 = _mm512_inserti32x8(vload_3, _mm256_loadu_si256((const __m256i*)(ptr_A + 5*ldA)), 1); \ + EXPAND_BFLOAT16(vload_3, r4, r5);{ \ + __m512i vload_4 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 6*ldA))); \ + vload_4 = _mm512_inserti32x8(vload_4, _mm256_loadu_si256((const __m256i*)(ptr_A + 7*ldA)), 1); \ + EXPAND_BFLOAT16(vload_4, r6, r7);{ \ + __m512i vload_5 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 8*ldA))); \ + vload_5 = _mm512_inserti32x8(vload_5, _mm256_loadu_si256((const __m256i*)(ptr_A + 9*ldA)), 1); \ + EXPAND_BFLOAT16(vload_5, r8, r9);{ \ + __m512i vload_6 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 10*ldA))); \ + vload_6 = _mm512_inserti32x8(vload_6, _mm256_loadu_si256((const __m256i*)(ptr_A + 11*ldA)), 1); \ + EXPAND_BFLOAT16(vload_6, ra, rb);{ \ + __m512i vload_7 = 
_mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 12*ldA))); \ + vload_7 = _mm512_inserti32x8(vload_7, _mm256_loadu_si256((const __m256i*)(ptr_A + 13*ldA)), 1); \ + EXPAND_BFLOAT16(vload_7, rc, rd);{ \ + __m512i vload_8 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)(ptr_A + 14*ldA))); \ + vload_8 = _mm512_inserti32x8(vload_8, _mm256_loadu_si256((const __m256i*)(ptr_A + 15*ldA)), 1); \ + EXPAND_BFLOAT16(vload_8, re, rf); \ + \ + t0 = _mm512_unpacklo_ps(r0,r1); \ + t1 = _mm512_unpackhi_ps(r0,r1); \ + t2 = _mm512_unpacklo_ps(r2,r3); \ + t3 = _mm512_unpackhi_ps(r2,r3); \ + t4 = _mm512_unpacklo_ps(r4,r5); \ + t5 = _mm512_unpackhi_ps(r4,r5); \ + t6 = _mm512_unpacklo_ps(r6,r7); \ + t7 = _mm512_unpackhi_ps(r6,r7); \ + t8 = _mm512_unpacklo_ps(r8,r9); \ + t9 = _mm512_unpackhi_ps(r8,r9); \ + ta = _mm512_unpacklo_ps(ra,rb); \ + tb = _mm512_unpackhi_ps(ra,rb); \ + tc = _mm512_unpacklo_ps(rc,rd); \ + td = _mm512_unpackhi_ps(rc,rd); \ + te = _mm512_unpacklo_ps(re,rf); \ + tf = _mm512_unpackhi_ps(re,rf); \ + \ + { const __m512d td1 = _mm512_castps_pd(t0), td2 = _mm512_castps_pd(t2); \ + r0 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r1 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(t1), td2 = _mm512_castps_pd(t3); \ + r2 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r3 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(t4), td2 = _mm512_castps_pd(t6); \ + r4 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r5 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(t5), td2 = _mm512_castps_pd(t7); \ + r6 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r7 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(t8), td2 = _mm512_castps_pd(ta); \ + r8 = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + r9 = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } 
\ + { const __m512d td1 = _mm512_castps_pd(t9), td2 = _mm512_castps_pd(tb); \ + ra = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rb = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(tc), td2 = _mm512_castps_pd(te); \ + rc = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rd = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + { const __m512d td1 = _mm512_castps_pd(td), td2 = _mm512_castps_pd(tf); \ + re = _mm512_castpd_ps(_mm512_unpacklo_pd(td1, td2)); \ + rf = _mm512_castpd_ps(_mm512_unpackhi_pd(td1, td2)); } \ + \ + t0 = _mm512_shuffle_f32x4(r0, r4, 0x88); \ + t1 = _mm512_shuffle_f32x4(r1, r5, 0x88); \ + t2 = _mm512_shuffle_f32x4(r2, r6, 0x88); \ + t3 = _mm512_shuffle_f32x4(r3, r7, 0x88); \ + t4 = _mm512_shuffle_f32x4(r0, r4, 0xdd); \ + t5 = _mm512_shuffle_f32x4(r1, r5, 0xdd); \ + t6 = _mm512_shuffle_f32x4(r2, r6, 0xdd); \ + t7 = _mm512_shuffle_f32x4(r3, r7, 0xdd); \ + t8 = _mm512_shuffle_f32x4(r8, rc, 0x88); \ + t9 = _mm512_shuffle_f32x4(r9, rd, 0x88); \ + ta = _mm512_shuffle_f32x4(ra, re, 0x88); \ + tb = _mm512_shuffle_f32x4(rb, rf, 0x88); \ + tc = _mm512_shuffle_f32x4(r8, rc, 0xdd); \ + td = _mm512_shuffle_f32x4(r9, rd, 0xdd); \ + te = _mm512_shuffle_f32x4(ra, re, 0xdd); \ + tf = _mm512_shuffle_f32x4(rb, rf, 0xdd); \ + \ + r0 = _mm512_shuffle_f32x4(t0, t8, 0x88); \ + r1 = _mm512_shuffle_f32x4(t1, t9, 0x88); \ + r2 = _mm512_shuffle_f32x4(t2, ta, 0x88); \ + r3 = _mm512_shuffle_f32x4(t3, tb, 0x88); \ + r4 = _mm512_shuffle_f32x4(t4, tc, 0x88); \ + r5 = _mm512_shuffle_f32x4(t5, td, 0x88); \ + r6 = _mm512_shuffle_f32x4(t6, te, 0x88); \ + r7 = _mm512_shuffle_f32x4(t7, tf, 0x88); \ + r8 = _mm512_shuffle_f32x4(t0, t8, 0xdd); \ + r9 = _mm512_shuffle_f32x4(t1, t9, 0xdd); \ + ra = _mm512_shuffle_f32x4(t2, ta, 0xdd); \ + rb = _mm512_shuffle_f32x4(t3, tb, 0xdd); \ + rc = _mm512_shuffle_f32x4(t4, tc, 0xdd); \ + rd = _mm512_shuffle_f32x4(t5, td, 0xdd); \ + re = _mm512_shuffle_f32x4(t6, te, 0xdd); \ + rf = 
_mm512_shuffle_f32x4(t7, tf, 0xdd); \ + \ + _mm512_storeu_ps(ptr_B + 0*ldB, r0); \ + _mm512_storeu_ps(ptr_B + 1*ldB, r1); \ + _mm512_storeu_ps(ptr_B + 2*ldB, r2); \ + _mm512_storeu_ps(ptr_B + 3*ldB, r3); \ + _mm512_storeu_ps(ptr_B + 4*ldB, r4); \ + _mm512_storeu_ps(ptr_B + 5*ldB, r5); \ + _mm512_storeu_ps(ptr_B + 6*ldB, r6); \ + _mm512_storeu_ps(ptr_B + 7*ldB, r7); \ + _mm512_storeu_ps(ptr_B + 8*ldB, r8); \ + _mm512_storeu_ps(ptr_B + 9*ldB, r9); \ + _mm512_storeu_ps(ptr_B + 10*ldB, ra); \ + _mm512_storeu_ps(ptr_B + 11*ldB, rb); \ + _mm512_storeu_ps(ptr_B + 12*ldB, rc); \ + _mm512_storeu_ps(ptr_B + 13*ldB, rd); \ + _mm512_storeu_ps(ptr_B + 14*ldB, re); \ + _mm512_storeu_ps(ptr_B + 15*ldB, rf);}}}}}}} \ +} + +#define COMPRESS_FP32(v, k, m, cnt) { \ + _mm512_mask_compressstoreu_ps(values_ptr + (cnt), m, v); \ + { \ + __m256i vk1 = _mm256_set1_epi16((short)(k)); \ + __m256i vk2 = _mm256_set1_epi16((short)((k) + 8)); \ + __m256i v_idx = _mm256_add_epi32(vk1, _mm256_loadu_si256(&shufmasks2[(m)&0xFF])); \ + __m256i v_idx_2 = _mm256_add_epi32(vk2, _mm256_loadu_si256(&shufmasks2[((m)>>8)&0xFF])); \ + _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx); \ + cnt = (unsigned short)((cnt) + _mm_popcnt_u32((m)&0xFF)); \ + _mm256_storeu_si256((__m256i *)(colidx_ptr + (cnt)), v_idx_2); \ + cnt = (unsigned short)((cnt) + _mm_popcnt_u32(((m)>>8)&0xFF)); \ + } \ +} + +#define EXPAND_BFLOAT16(v, vlo_final, vhi_final) { \ + const __m512i vlo = _mm512_unpacklo_epi16(vzero, v); \ + const __m512i vhi = _mm512_unpackhi_epi16(vzero, v); \ + const __m512i permmask1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0); \ + const __m512i permmask2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); \ + vlo_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask1, vhi)); \ + vhi_final = _mm512_castsi512_ps(_mm512_permutex2var_epi64(vlo, permmask2, vhi)); \ +} + +#define COMPRESS_BFLOAT16(vlo, vhi, v) { \ + const __m512i permmask1 = _mm512_set_epi64(13, 12, 9, 8, 5, 4, 1, 0); \ + const 
__m512i permmask2 = _mm512_set_epi64(15, 14, 11, 10, 7, 6, 3, 2); \ + const __m512i va = _mm512_castps_si512(vlo), vb = _mm512_castps_si512(vhi); \ + const __m512i vtmp1 = _mm512_permutex2var_epi64(va, permmask1, vb); \ + const __m512i vtmp2 = _mm512_permutex2var_epi64(va, permmask2, vb); \ + const __m512i a = _mm512_srli_epi32(vtmp1, 16), b = _mm512_srli_epi32(vtmp2, 16); \ + v = _mm512_packus_epi32(a, b); \ +} + +#endif + diff --git a/third_party/libxsmm/src/libxsmm_spmdm_end.h b/third_party/libxsmm/src/libxsmm_spmdm_end.h new file mode 100644 index 00000000..12bd27f7 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_spmdm_end.h @@ -0,0 +1,42 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ + +#undef SIMD_WIDTH_FP32 +#undef SIMDTYPE_FP32 +#undef SIMDTYPE_INT32 +#undef SIMDMASKTYPE_FP32 +#undef _MM_SETZERO_FP32 +#undef _MM_SETZERO_INT32 +#undef _MM_SET1_FP32 +#undef _MM_SET1_INT32 +#undef _MM_SET1_INT16 +#undef _MM_SET_INT32 +#undef _MM_LOAD_FP32 +#undef _MM_LOADU_FP32 +#undef _MM_LOAD_INT32 +#undef _MM_STORE_INT32 +#undef _MM_LOADU_INT32 +#undef _MM_GATHER_INT32 +#undef _MM_GATHER_FP32 +#undef _MM_CMPNEQ_FP32 +#undef _MM_STORE_FP32 +#undef _MM_STOREU_FP32 +#undef _MM_ADD_FP32 +#undef _MM_FMADD_FP32 +#undef _MM_MUL_FP32 +#undef _MM_PREFETCH +#undef TRANSPOSE_SIMD_WIDTH_KERNEL +#undef TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16 +#undef COMPRESS_FP32 +#undef EXPAND_BFLOAT16 +#undef COMPRESS_BFLOAT16 +#undef num_regs + diff --git a/third_party/libxsmm/src/libxsmm_sync.c b/third_party/libxsmm/src/libxsmm_sync.c new file mode 100644 index 00000000..40dace51 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_sync.c @@ -0,0 +1,673 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst, Alexander Heinecke (Intel Corp.) +******************************************************************************/ +/* Lock primitives inspired by Karl Malbrain, Concurrency Kit, and TF/sync. 
+******************************************************************************/ +#include "libxsmm_main.h" + +#if !defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) +# define LIBXSMM_SYNC_FUTEX +#endif + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#include +#if defined(_WIN32) +# include +#else +# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) +# include +# endif +# include +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if !defined(LIBXSMM_SYNC_RWLOCK_BITS) +# if defined(__MINGW32__) +# define LIBXSMM_SYNC_RWLOCK_BITS 32 +# else +# define LIBXSMM_SYNC_RWLOCK_BITS 16 +# endif +#endif + +#if !defined(LIBXSMM_SYNC_GENERIC_PID) && 1 +# define LIBXSMM_SYNC_GENERIC_PID +#endif + + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_core_tag { /* per-core */ + uint8_t id; + volatile uint8_t core_sense; + volatile uint8_t* thread_senses; + volatile uint8_t* my_flags[2]; + uint8_t** partner_flags[2]; + uint8_t parity; + uint8_t sense; +} internal_sync_core_tag; + +LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_thread_tag { /* per-thread */ + int core_tid; + internal_sync_core_tag *core; +} internal_sync_thread_tag; + +struct LIBXSMM_RETARGETABLE libxsmm_barrier { + internal_sync_core_tag** cores; + internal_sync_thread_tag** threads; + int ncores, nthreads_per_core; + int nthreads, ncores_nbits; /* nbits(ncores) != log2(ncores) */ + /* internal counter type which is guaranteed to be atomic when using certain methods */ + volatile int threads_waiting; + /* thread-safety during initialization */ + volatile uint8_t init_done; +}; + + +LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core) +{ + libxsmm_barrier *const barrier = (libxsmm_barrier*)malloc(sizeof(libxsmm_barrier)); +#if (0 == LIBXSMM_SYNC) + LIBXSMM_UNUSED(ncores); 
LIBXSMM_UNUSED(nthreads_per_core); +#else + if (NULL != barrier && 1 < ncores && 1 <= nthreads_per_core) { + barrier->ncores = ncores; + barrier->ncores_nbits = (int)LIBXSMM_NBITS(ncores); + barrier->nthreads_per_core = nthreads_per_core; + barrier->nthreads = ncores * nthreads_per_core; + barrier->threads = (internal_sync_thread_tag**)libxsmm_aligned_malloc( + barrier->nthreads * sizeof(internal_sync_thread_tag*), LIBXSMM_CACHELINE); + barrier->cores = (internal_sync_core_tag**)libxsmm_aligned_malloc( + barrier->ncores * sizeof(internal_sync_core_tag*), LIBXSMM_CACHELINE); + barrier->threads_waiting = barrier->nthreads; /* atomic */ + barrier->init_done = 0; /* false */ + } + else +#endif + if (NULL != barrier) { + barrier->nthreads = 1; + } + return barrier; +} + + +LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid) +{ +#if (0 == LIBXSMM_SYNC) + LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid); +#else + if (NULL != barrier && 1 < barrier->nthreads) { + const int cid = tid / barrier->nthreads_per_core; /* this thread's core ID */ + internal_sync_core_tag* core = 0; + int i; + internal_sync_thread_tag* thread; + + /* we only initialize the barrier once */ + if (barrier->init_done == 2) { + return; + } + + /* allocate per-thread structure */ + thread = (internal_sync_thread_tag*)libxsmm_aligned_malloc( + sizeof(internal_sync_thread_tag), LIBXSMM_CACHELINE); + barrier->threads[tid] = thread; + thread->core_tid = tid - (barrier->nthreads_per_core * cid); /* mod */ + + /* each core's thread 0 does all the allocations */ + if (0 == thread->core_tid) { + core = (internal_sync_core_tag*)libxsmm_aligned_malloc( + sizeof(internal_sync_core_tag), LIBXSMM_CACHELINE); + core->id = (uint8_t)cid; + core->core_sense = 1; + + core->thread_senses = (uint8_t*)libxsmm_aligned_malloc( + barrier->nthreads_per_core * sizeof(uint8_t), LIBXSMM_CACHELINE); + for (i = 0; i < barrier->nthreads_per_core; ++i) core->thread_senses[i] = 1; + + for (i = 0; i < 2; ++i) { + 
core->my_flags[i] = (uint8_t*)libxsmm_aligned_malloc( + barrier->ncores_nbits * sizeof(uint8_t) * LIBXSMM_CACHELINE, + LIBXSMM_CACHELINE); + core->partner_flags[i] = (uint8_t**)libxsmm_aligned_malloc( + barrier->ncores_nbits * sizeof(uint8_t*), + LIBXSMM_CACHELINE); + } + + core->parity = 0; + core->sense = 1; + barrier->cores[cid] = core; + } + + /* barrier to let all the allocations complete */ + if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) { + barrier->threads_waiting = barrier->nthreads; /* atomic */ + barrier->init_done = 1; /* true */ + } + else { + while (0/*false*/ == barrier->init_done); + } + + /* set required per-thread information */ + thread->core = barrier->cores[cid]; + + /* each core's thread 0 completes setup */ + if (0 == thread->core_tid) { + int di; + for (i = di = 0; i < barrier->ncores_nbits; ++i, di += LIBXSMM_CACHELINE) { + /* find dissemination partner and link to it */ + const int dissem_cid = (cid + (1 << i)) % barrier->ncores; + assert(0 != core); /* initialized under the same condition; see above */ + core->my_flags[0][di] = core->my_flags[1][di] = 0; + core->partner_flags[0][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[0][di]; + core->partner_flags[1][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[1][di]; + } + } + + /* barrier to let initialization complete */ + if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) { + barrier->threads_waiting = barrier->nthreads; /* atomic */ + barrier->init_done = 2; + } + else { + while (2 != barrier->init_done); + } + } +#endif +} + + +LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) +void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid) +{ +#if (0 == LIBXSMM_SYNC) + LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid); +#else + if (NULL != barrier && 1 < barrier->nthreads) { + internal_sync_thread_tag *const thread = barrier->threads[tid]; + internal_sync_core_tag *const core = thread->core; + + /* 
first let's execute a memory fence */ + LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); + + /* first signal this thread's arrival */ + core->thread_senses[thread->core_tid] = (uint8_t)(0 == core->thread_senses[thread->core_tid] ? 1 : 0); + + /* each core's thread 0 syncs across cores */ + if (0 == thread->core_tid) { + int i; + /* wait for the core's remaining threads */ + for (i = 1; i < barrier->nthreads_per_core; ++i) { + uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[i]; + while (core_sense == thread_sense) { /* avoid evaluation in unspecified order */ + LIBXSMM_SYNC_PAUSE; + core_sense = core->core_sense; + thread_sense = core->thread_senses[i]; + } + } + + if (1 < barrier->ncores) { + int di; +# if defined(__MIC__) + /* cannot use LIBXSMM_ALIGNED since attribute may not apply to local non-static arrays */ + uint8_t sendbuffer[LIBXSMM_CACHELINE+LIBXSMM_CACHELINE-1]; + uint8_t *const sendbuf = LIBXSMM_ALIGN(sendbuffer, LIBXSMM_CACHELINE); + __m512d m512d; + _mm_prefetch((const char*)core->partner_flags[core->parity][0], _MM_HINT_ET1); + sendbuf[0] = core->sense; + m512d = LIBXSMM_INTRINSICS_MM512_LOAD_PD(sendbuf); +# endif + + for (i = di = 0; i < barrier->ncores_nbits - 1; ++i, di += LIBXSMM_CACHELINE) { +# if defined(__MIC__) + _mm_prefetch((const char*)core->partner_flags[core->parity][i+1], _MM_HINT_ET1); + _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d); +# else + *core->partner_flags[core->parity][i] = core->sense; +# endif + while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE; + } + +# if defined(__MIC__) + _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d); +# else + *core->partner_flags[core->parity][i] = core->sense; +# endif + while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE; + if (1 == core->parity) { + core->sense = (uint8_t)(0 == core->sense ? 
1 : 0); + } + core->parity = (uint8_t)(1 - core->parity); + } + + /* wake up the core's remaining threads */ + core->core_sense = core->thread_senses[0]; + } + else { /* other threads wait for cross-core sync to complete */ + uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[thread->core_tid]; + while (core_sense != thread_sense) { /* avoid evaluation in unspecified order */ + LIBXSMM_SYNC_PAUSE; + core_sense = core->core_sense; + thread_sense = core->thread_senses[thread->core_tid]; + } + } + } +#endif +} + + +LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier) +{ +#if (0 != LIBXSMM_SYNC) + if (NULL != barrier && 1 < barrier->nthreads) { + if (2 == barrier->init_done) { + int i; + for (i = 0; i < barrier->ncores; ++i) { + int j; + libxsmm_free((const void*)barrier->cores[i]->thread_senses); + for (j = 0; j < 2; ++j) { + libxsmm_free((const void*)barrier->cores[i]->my_flags[j]); + libxsmm_free(barrier->cores[i]->partner_flags[j]); + } + libxsmm_free(barrier->cores[i]); + } + for (i = 0; i < barrier->nthreads; ++i) { + libxsmm_free(barrier->threads[i]); + } + } + libxsmm_free(barrier->threads); + libxsmm_free(barrier->cores); + } +#endif + free((libxsmm_barrier*)barrier); +} + + +#if (0 != LIBXSMM_SYNC) +enum { + INTERNAL_SYNC_LOCK_FREE = 0, + INTERNAL_SYNC_LOCK_LOCKED = 1, + INTERNAL_SYNC_LOCK_CONTESTED = 2, + INTERNAL_SYNC_RWLOCK_READINC = 0x10000/*(USHRT_MAX+1)*/, + INTERNAL_SYNC_FUTEX = 202 +}; +#endif + + +typedef unsigned int libxsmm_spinlock_state; +struct LIBXSMM_RETARGETABLE libxsmm_spinlock { + volatile libxsmm_spinlock_state state; +}; + + +LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void) +{ + libxsmm_spinlock *const result = (libxsmm_spinlock*)malloc(sizeof(libxsmm_spinlock)); +#if (0 != LIBXSMM_SYNC) + if (0 != result) { + result->state = INTERNAL_SYNC_LOCK_FREE; + } +#endif + return result; +} + + +LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock) +{ + 
free((libxsmm_spinlock*)spinlock); +} + + +LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock) +{ +#if (0 != LIBXSMM_SYNC) +# if 0 + /*const*/ libxsmm_spinlock_state lock_free = INTERNAL_SYNC_LOCK_FREE; + assert(0 != spinlock); + return 0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&spinlock->state, lock_free, INTERNAL_SYNC_LOCK_LOCKED, LIBXSMM_ATOMIC_RELAXED) + ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + 1) /* not acquired */ + : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK)); +# else + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + !LIBXSMM_ATOMIC_TRYLOCK(&spinlock->state, LIBXSMM_ATOMIC_RELAXED); +# endif +#else + LIBXSMM_UNUSED(spinlock); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK); +#endif +} + + +LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != spinlock); + for (;;) { + if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&spinlock->state, 1, LIBXSMM_ATOMIC_RELAXED)) break; + LIBXSMM_SYNC_CYCLE(&spinlock->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE); + } + LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); +#else + LIBXSMM_UNUSED(spinlock); +#endif +} + + +LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != spinlock); + LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); + spinlock->state = INTERNAL_SYNC_LOCK_FREE; +#else + LIBXSMM_UNUSED(spinlock); +#endif +} + + +#if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) +typedef int libxsmm_mutex_state; +#else +typedef char libxsmm_mutex_state; +#endif +struct LIBXSMM_RETARGETABLE libxsmm_mutex { + volatile libxsmm_mutex_state state; +}; + + +LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void) +{ + libxsmm_mutex *const result = (libxsmm_mutex*)malloc(sizeof(libxsmm_mutex)); +#if (0 != LIBXSMM_SYNC) + if (0 != result) { + result->state = INTERNAL_SYNC_LOCK_FREE; + } +#endif + return result; +} + + +LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* 
mutex) +{ + free((libxsmm_mutex*)mutex); +} + + +LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != mutex); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) + !LIBXSMM_ATOMIC_TRYLOCK(&mutex->state, LIBXSMM_ATOMIC_RELAXED); +#else + LIBXSMM_UNUSED(mutex); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX); +#endif +} + +/* Acquires the mutex and returns only once it is held: on Windows it retries libxsmm_mutex_trylock; otherwise it compare-and-swaps FREE to LOCKED (or CONTESTED) and, when the futex path is compiled in, marks the lock CONTESTED and sleeps in FUTEX_WAIT. No-op if LIBXSMM_SYNC is 0. */ +LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex) +{ +#if (0 != LIBXSMM_SYNC) +# if defined(_WIN32) + assert(0 != mutex); + while (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) != libxsmm_mutex_trylock(mutex)) { + LIBXSMM_SYNC_CYCLE(&mutex->state, 0/*free*/, LIBXSMM_SYNC_NPAUSE); + } +# else + libxsmm_mutex_state lock_free = INTERNAL_SYNC_LOCK_FREE, lock_state = INTERNAL_SYNC_LOCK_LOCKED; + assert(0 != mutex); + while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&mutex->state, lock_free, lock_state, LIBXSMM_ATOMIC_RELAXED)) { + libxsmm_mutex_state state; + /* coverity[unreachable] may be reachable more than once due to volatile state */ + for (state = mutex->state; INTERNAL_SYNC_LOCK_FREE != state; state = mutex->state) { +# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) /* same guard as the libxsmm_mutex_state typedef and libxsmm_mutex_release: FUTEX_WAIT needs the int-sized state and the matching FUTEX_WAKE */ + LIBXSMM_SYNC_CYCLE_ELSE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE, { + /*const*/ libxsmm_mutex_state state_locked = INTERNAL_SYNC_LOCK_LOCKED; + if (INTERNAL_SYNC_LOCK_LOCKED != state || LIBXSMM_ATOMIC_CMPSWP(&mutex->state, + state_locked, INTERNAL_SYNC_LOCK_CONTESTED, LIBXSMM_ATOMIC_RELAXED)) + { + syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAIT, INTERNAL_SYNC_LOCK_CONTESTED, NULL, NULL, 0); + lock_state = INTERNAL_SYNC_LOCK_CONTESTED; + }} + ); + break; +# else + LIBXSMM_SYNC_CYCLE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE); +# endif + } + lock_free = INTERNAL_SYNC_LOCK_FREE; + } +# endif +#else + LIBXSMM_UNUSED(mutex); +#endif +} + + +LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != mutex); + 
LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); +# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU) + if (INTERNAL_SYNC_LOCK_CONTESTED == LIBXSMM_ATOMIC_FETCH_SUB(&mutex->state, 1, LIBXSMM_ATOMIC_RELAXED)) { + mutex->state = INTERNAL_SYNC_LOCK_FREE; + syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAKE, 1, NULL, NULL, 0); + } +# else + mutex->state = INTERNAL_SYNC_LOCK_FREE; +# endif +#else + LIBXSMM_UNUSED(mutex); +#endif +} + + +#if (0 != LIBXSMM_SYNC) +typedef LIBXSMM_CONCATENATE3(uint,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_uint_t; +typedef LIBXSMM_CONCATENATE3(int,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_int_t; +LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_sync_counter { + struct { internal_sync_uint_t writer, reader; } kind; + uint32_t bits; +} internal_sync_counter; +#endif +LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_rwlock { +#if (0 != LIBXSMM_SYNC) + volatile internal_sync_counter completions; + volatile internal_sync_counter requests; +#else + int dummy; +#endif +}; + + +LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void) +{ + libxsmm_rwlock *const result = (libxsmm_rwlock*)malloc(sizeof(libxsmm_rwlock)); + if (0 != result) { +#if (0 != LIBXSMM_SYNC) + LIBXSMM_MEMZERO127(&result->completions); + LIBXSMM_MEMZERO127(&result->requests); +#else + LIBXSMM_MEMZERO127(result); +#endif + } + return result; +} + + +LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock) +{ + free((libxsmm_rwlock*)rwlock); +} + + +#if (0 != LIBXSMM_SYNC) +LIBXSMM_API_INLINE int internal_rwlock_trylock(libxsmm_rwlock* rwlock, internal_sync_counter* prev) +{ + internal_sync_counter next; + assert(0 != rwlock && 0 != prev); + do { + prev->bits = rwlock->requests.bits; + next.bits = prev->bits; + ++next.kind.writer; + } + while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&rwlock->requests.bits, prev->bits, next.bits, LIBXSMM_ATOMIC_RELAXED)); + return rwlock->completions.bits != prev->bits + ? 
(LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */ + : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK)); +} +#endif + + +LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + internal_sync_counter prev; + return internal_rwlock_trylock(rwlock, &prev); +#else + LIBXSMM_UNUSED(rwlock); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); +#endif +} + + +LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + internal_sync_counter prev; + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_trylock(rwlock, &prev)) { + while (rwlock->completions.bits != prev.bits) { + LIBXSMM_SYNC_CYCLE(&rwlock->completions.bits, prev.bits, LIBXSMM_SYNC_NPAUSE); + } + } +#else + LIBXSMM_UNUSED(rwlock); +#endif +} + + +LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != rwlock); + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.writer, 1, LIBXSMM_ATOMIC_SEQ_CST); +#else + LIBXSMM_UNUSED(rwlock); +#endif +} + + +#if (0 != LIBXSMM_SYNC) +LIBXSMM_API_INLINE int internal_rwlock_tryread(libxsmm_rwlock* rwlock, internal_sync_counter* prev) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != rwlock && 0 != prev); + prev->bits = LIBXSMM_ATOMIC_FETCH_ADD(&rwlock->requests.bits, INTERNAL_SYNC_RWLOCK_READINC, LIBXSMM_ATOMIC_SEQ_CST); + return rwlock->completions.kind.writer != prev->kind.writer + ? 
(LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */ + : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK)); +#else + LIBXSMM_UNUSED(rwlock); LIBXSMM_UNUSED(prev); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); +#endif +} +#endif + + +LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + internal_sync_counter prev; + return internal_rwlock_tryread(rwlock, &prev); +#else + LIBXSMM_UNUSED(rwlock); + return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK); +#endif +} + + +LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + internal_sync_counter prev; + if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_tryread(rwlock, &prev)) { + while (rwlock->completions.kind.writer != prev.kind.writer) { + LIBXSMM_SYNC_CYCLE(&rwlock->completions.kind.writer, prev.kind.writer, LIBXSMM_SYNC_NPAUSE); + } + } +#else + LIBXSMM_UNUSED(rwlock); +#endif +} + + +LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock) +{ +#if (0 != LIBXSMM_SYNC) + assert(0 != rwlock); + LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.reader, 1, LIBXSMM_ATOMIC_SEQ_CST); +#else + LIBXSMM_UNUSED(rwlock); +#endif +} + + +LIBXSMM_API unsigned int libxsmm_get_pid(void) +{ +#if defined(_WIN32) + return (unsigned int)_getpid(); +#else + return (unsigned int)getpid(); +#endif +} + + +LIBXSMM_API_INTERN unsigned int internal_get_tid(void); +LIBXSMM_API_INTERN unsigned int internal_get_tid(void) +{ + const unsigned int nthreads = LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_thread_count, 1, LIBXSMM_ATOMIC_RELAXED); +#if !defined(NDEBUG) + static int error_once = 0; + if (LIBXSMM_NTHREADS_MAX < nthreads + && 0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: maximum number of threads is exhausted!\n"); + } +#endif + 
LIBXSMM_ASSERT(LIBXSMM_ISPOT(LIBXSMM_NTHREADS_MAX)); + return LIBXSMM_MOD2(nthreads - 1, LIBXSMM_NTHREADS_MAX); +} + + +LIBXSMM_API unsigned int libxsmm_get_tid(void) +{ +#if (0 != LIBXSMM_SYNC) +# if defined(LIBXSMM_SYNC_GENERIC_PID) + static LIBXSMM_TLS unsigned int tid = 0xFFFFFFFF; + if (0xFFFFFFFF == tid) tid = internal_get_tid(); + return tid; +# else + void* tls = LIBXSMM_TLS_GETVALUE(libxsmm_tlskey); + if (NULL == tls) { + static unsigned int tid[LIBXSMM_NTHREADS_MAX]; + const int i = internal_get_tid(); + tid[i] = i; tls = tid + i; + /* coverity[check_return] */ + LIBXSMM_TLS_SETVALUE(libxsmm_tlskey, tls); + } + return *(unsigned int*)tls; +# endif +#else + return 0; +#endif +} + diff --git a/third_party/libxsmm/src/libxsmm_timer.c b/third_party/libxsmm/src/libxsmm_timer.c new file mode 100644 index 00000000..5a5c5705 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_timer.c @@ -0,0 +1,221 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) 
+******************************************************************************/ +#include +#include "libxsmm_main.h" + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if defined(_WIN32) +# include +#elif defined(__GNUC__) || defined(__PGI) || defined(_CRAYC) +# include +# include +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +#if defined(__powerpc64__) +# include +#endif + +#if !defined(LIBXSMM_TIMER_TSC) +# define LIBXSMM_TIMER_TSC +#endif +#if !defined(LIBXSMM_TIMER_WPC) +# define LIBXSMM_TIMER_WPC +#endif + +#if defined(LIBXSMM_TIMER_TSC) +# if defined(__powerpc64__) +# define LIBXSMM_TIMER_RDTSC(CYCLE) { \ + CYCLE = __ppc_get_timebase(); \ + } +# elif ((defined(LIBXSMM_PLATFORM_X86) && (64 <= (LIBXSMM_BITS))) && \ + (defined(__GNUC__) || defined(LIBXSMM_INTEL_COMPILER) || defined(__PGI))) +# define LIBXSMM_TIMER_RDTSC(CYCLE) { libxsmm_timer_tickint libxsmm_timer_rdtsc_hi_; \ + __asm__ __volatile__ ("rdtsc" : "=a"(CYCLE), "=d"(libxsmm_timer_rdtsc_hi_)); \ + CYCLE |= libxsmm_timer_rdtsc_hi_ << 32; \ + } +# elif (defined(_rdtsc) || defined(_WIN32)) +# define LIBXSMM_TIMER_RDTSC(CYCLE) (CYCLE = __rdtsc()) +# endif +#endif + + +LIBXSMM_API_INTERN double libxsmm_timer_duration_rtc(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) +{ + double result = (double)LIBXSMM_DELTA(tick0, tick1); +#if defined(_WIN32) +# if defined(LIBXSMM_TIMER_WPC) + LARGE_INTEGER frequency; + QueryPerformanceFrequency(&frequency); + result /= (double)frequency.QuadPart; +# else /* low resolution */ + result *= 1E-3; +# endif +#elif defined(CLOCK_MONOTONIC) + result *= 1E-9; +#else + result *= 1E-6; +#endif + return result; +} + + +LIBXSMM_API_INTERN libxsmm_timer_tickint libxsmm_timer_tick_rtc(void) +{ + libxsmm_timer_tickint result; +#if defined(_WIN32) +# if defined(LIBXSMM_TIMER_WPC) + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + result = 
(libxsmm_timer_tickint)t.QuadPart; +# else /* low resolution */ + result = (libxsmm_timer_tickint)GetTickCount64(); +# endif +#elif defined(CLOCK_MONOTONIC) + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + result = 1000000000ULL * t.tv_sec + t.tv_nsec; +#else + struct timeval t; + gettimeofday(&t, 0); + result = 1000000ULL * t.tv_sec + t.tv_usec; +#endif + return result; +} + + +LIBXSMM_API_INTERN LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC) +libxsmm_timer_tickint libxsmm_timer_tick_tsc(void) +{ + libxsmm_timer_tickint result; +#if defined(LIBXSMM_TIMER_RDTSC) + LIBXSMM_TIMER_RDTSC(result); +#else + result = libxsmm_timer_tick_rtc(); +#endif + return result; +} + + +LIBXSMM_API int libxsmm_get_timer_info(libxsmm_timer_info* info) +{ + int result; + if (NULL != info) { +#if defined(LIBXSMM_TIMER_RDTSC) + if (0 < libxsmm_timer_scale) { + info->tsc = 1; + } +# if !defined(LIBXSMM_INIT_COMPLETED) + else if (2 > libxsmm_ninit) { + libxsmm_init(); + if (0 < libxsmm_timer_scale) { + info->tsc = 1; + } + else { + info->tsc = 0; + } + } +# endif + else { + info->tsc = 0; + } +#else + info->tsc = 0; +#endif + result = EXIT_SUCCESS; + } + else { +#if !defined(NDEBUG) + static int error_once = 0; + if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid argument for libxsmm_get_timer_info specified!\n"); + } +#endif + result = EXIT_FAILURE; + } + return result; +} + + +LIBXSMM_API libxsmm_timer_tickint libxsmm_timer_tick(void) +{ + libxsmm_timer_tickint result; +#if defined(LIBXSMM_TIMER_RDTSC) + if (0 < libxsmm_timer_scale) { + LIBXSMM_TIMER_RDTSC(result); + } +# if !defined(LIBXSMM_INIT_COMPLETED) + else if (2 > libxsmm_ninit) { + libxsmm_init(); + if (0 < libxsmm_timer_scale) { + LIBXSMM_TIMER_RDTSC(result); + } + else { + result = libxsmm_timer_tick_rtc(); + } + } +# endif + else { + result = libxsmm_timer_tick_rtc(); + } +#else + 
result = libxsmm_timer_tick_rtc(); +#endif + return result; +} + + +LIBXSMM_API double libxsmm_timer_duration(libxsmm_timer_tickint tick0, libxsmm_timer_tickint tick1) +{ + double result; +#if defined(LIBXSMM_TIMER_RDTSC) + if (0 < libxsmm_timer_scale) { + result = (double)LIBXSMM_DELTA(tick0, tick1) * libxsmm_timer_scale; + } + else +#endif + { + result = libxsmm_timer_duration_rtc(tick0, tick1); + } + return result; +} + + +#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__)) + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_timer_ncycles)(libxsmm_timer_tickint* /*ncycles*/, const libxsmm_timer_tickint* /*tick0*/, const libxsmm_timer_tickint* /*tick1*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_timer_ncycles)(libxsmm_timer_tickint* ncycles, const libxsmm_timer_tickint* tick0, const libxsmm_timer_tickint* tick1) +{ +#if !defined(NDEBUG) + static int error_once = 0; + if (NULL != ncycles && NULL != tick0 && NULL != tick1) +#endif + { + *ncycles = libxsmm_timer_ncycles(*tick0, *tick1); + } +#if !defined(NDEBUG) + else if (0 != libxsmm_verbosity /* library code is expected to be mute */ + && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED)) + { + fprintf(stderr, "LIBXSMM ERROR: invalid arguments for libxsmm_timer_ncycles specified!\n"); + } +#endif +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_trace.c b/third_party/libxsmm/src/libxsmm_trace.c new file mode 100644 index 00000000..a0f41dcf --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_trace.c @@ -0,0 +1,567 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include "libxsmm_trace.h" +#include "libxsmm_main.h" + +#if !defined(LIBXSMM_TRACE_MINDEPTH) || 0 > (LIBXSMM_TRACE_MINDEPTH) +# undef LIBXSMM_TRACE_MINDEPTH +# define LIBXSMM_TRACE_MINDEPTH 1 +#endif +#if !defined(LIBXSMM_TRACE_MAXDEPTH) || 0 >= (LIBXSMM_TRACE_MAXDEPTH) +# undef LIBXSMM_TRACE_MAXDEPTH +# define LIBXSMM_TRACE_MAXDEPTH 1024 +#endif +#if !defined(LIBXSMM_TRACE_SYMBOLSIZE) || 0 >= (LIBXSMM_TRACE_SYMBOLSIZE) +# undef LIBXSMM_TRACE_SYMBOLSIZE +# define LIBXSMM_TRACE_SYMBOLSIZE 256 +#endif +#if !defined(LIBXSMM_TRACE_DLINFO) && defined(__USE_GNU) +# define LIBXSMM_TRACE_DLINFO +#endif + +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET)) +#endif +#if !defined(NDEBUG) +# include +#endif +#if defined(_WIN32) || defined(__CYGWIN__) +# include +# if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4091) +# endif +# include +# if defined(_MSC_VER) +# pragma comment(lib, "dbghelp") +# endif +# if defined(_MSC_VER) +# pragma warning(pop) +# endif +LIBXSMM_APIVAR_DEFINE(volatile LONG internal_trace_initialized); +#else +LIBXSMM_APIVAR_DEFINE(volatile int internal_trace_initialized); +# include +# if defined(LIBXSMM_TRACE_DLINFO) +# include +# else +# include +# include +# include +# include +# include +# if (0 != LIBXSMM_SYNC) +LIBXSMM_APIVAR_DEFINE(LIBXSMM_TLS_TYPE internal_trace_key); +LIBXSMM_APIVAR_DEFINE(void* internal_trace_symbols[LIBXSMM_NTHREADS_MAX]); +# endif +LIBXSMM_API_INLINE void internal_delete(void* value) +{ + int fd; +# if !(defined(__APPLE__) && defined(__MACH__)) + LIBXSMM_ASSERT(NULL != value); +# endif + fd = *((int*)value); +# if defined(NDEBUG) + munmap(value, 
LIBXSMM_TRACE_SYMBOLSIZE); +# else /* library code is expected to be mute */ + if (0 != munmap(value, LIBXSMM_TRACE_SYMBOLSIZE)) { + const int error = errno; + fprintf(stderr, "LIBXSMM ERROR: %s (munmap error #%i at %p)\n", + strerror(error), error, value); + } +# endif + if (0 <= fd) { + close(fd); + } +# if !defined(NDEBUG) /* library code is expected to be mute */ + else { + fprintf(stderr, "LIBXSMM ERROR: invalid file descriptor (%i)\n", fd); + } +# endif +} +# if defined(__APPLE__) && defined(__MACH__) +/* taken from "libtransmission" fdlimit.c */ +LIBXSMM_API_INLINE int posix_fallocate(int fd, off_t offset, off_t length) +{ + fstore_t fst; + fst.fst_flags = F_ALLOCATECONTIG; + fst.fst_posmode = F_PEOFPOSMODE; + fst.fst_offset = offset; + fst.fst_length = length; + fst.fst_bytesalloc = 0; + return fcntl(fd, F_PREALLOCATE, &fst); +} +# elif (!defined(_XOPEN_SOURCE) || 600 > _XOPEN_SOURCE) && \ + (!defined(_POSIX_C_SOURCE) || 200112L > _POSIX_C_SOURCE) +/* C89: avoid warning about posix_fallocate declared implicitly */ +LIBXSMM_EXTERN int posix_fallocate(int, off_t, off_t); +# endif +# endif +LIBXSMM_EXTERN int mkstemp(char*) LIBXSMM_NOTHROW; +#endif +#if defined(LIBXSMM_OFFLOAD_TARGET) +# pragma offload_attribute(pop) +#endif + +LIBXSMM_APIVAR_DEFINE(int internal_trace_mindepth); +LIBXSMM_APIVAR_DEFINE(int internal_trace_threadid); +LIBXSMM_APIVAR_DEFINE(int internal_trace_maxnsyms); + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE int libxsmm_trace_init(int /*filter_threadid*/, int /*filter_mindepth*/, int /*filter_maxnsyms*/); +LIBXSMM_API int libxsmm_trace_init(int filter_threadid, int filter_mindepth, int filter_maxnsyms) +{ + int result = EXIT_SUCCESS; + if (0 == internal_trace_initialized) { + if (0 <= filter_threadid) ++filter_threadid; +#if defined(__TRACE) + { const char *const env = getenv("LIBXSMM_TRACE"); + if (NULL != env && 0 != *env) { + char buffer[32] = { 0 }; + if (1 == sscanf(env, "%32[^,],", buffer)) { + result = (0 <= sscanf(buffer, "%i", 
&filter_threadid) ? EXIT_SUCCESS : EXIT_FAILURE); + } + if (1 == sscanf(env, "%*[^,],%32[^,],", buffer)) { + result = (0 <= sscanf(buffer, "%i", &filter_mindepth) ? EXIT_SUCCESS : EXIT_FAILURE); + } + if (1 == sscanf(env, "%*[^,],%*[^,],%32s", buffer)) { + result = (0 <= sscanf(buffer, "%i", &filter_maxnsyms) ? EXIT_SUCCESS : EXIT_FAILURE); + } + else { + filter_maxnsyms = -1; /* all */ + } + if (EXIT_SUCCESS == result) { + internal_trace_initialized = -1; /* auto */ + } + } + } + if (EXIT_SUCCESS == result) +#endif + { +#if defined(LIBXSMM_TRACE) +# if defined(_WIN32) || defined(__CYGWIN__) + SymSetOptions(SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME); + result = (FALSE != SymInitialize(GetCurrentProcess(), NULL, TRUE) ? EXIT_SUCCESS : GetLastError()); +# elif (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_TRACE_DLINFO) + result = LIBXSMM_TLS_CREATE(&internal_trace_key); +# endif + if (EXIT_SUCCESS == result) { + internal_trace_threadid = filter_threadid; + internal_trace_maxnsyms = filter_maxnsyms; + internal_trace_mindepth = filter_mindepth; + if (0 == internal_trace_initialized) { + internal_trace_initialized = 1; + } + } +#else + LIBXSMM_UNUSED(filter_threadid); + LIBXSMM_UNUSED(filter_mindepth); + LIBXSMM_UNUSED(filter_maxnsyms); +#endif + } + } + return result; +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE int libxsmm_trace_finalize(void); +LIBXSMM_API int libxsmm_trace_finalize(void) +{ + int result; +#if defined(LIBXSMM_TRACE) + result = EXIT_SUCCESS; + if (0 != internal_trace_initialized) { + internal_trace_initialized = 0; /* disable */ +# if defined(_WIN32) || defined(__CYGWIN__) + result = (FALSE != SymCleanup(GetCurrentProcess()) ? 
EXIT_SUCCESS : GetLastError()); +# elif (0 != LIBXSMM_SYNC) && !defined(LIBXSMM_TRACE_DLINFO) + result = LIBXSMM_TLS_DESTROY(internal_trace_key); + { int i = 0; + for (; i < LIBXSMM_NTHREADS_MAX; ++i) { + void *const buffer = internal_trace_symbols[i]; + if (NULL != buffer) internal_delete(buffer); + } + } +# endif + } +#else + result = EXIT_FAILURE; +#endif + return result; +} + +/* Stores return addresses of the current call stack into buffer (capacity: size entries), dropping the innermost skip + LIBXSMM_TRACE_MINDEPTH frames. Returns the number of addresses stored (never more than size); returns 0 if buffer is NULL, size is 0, or skip >= size. */ +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE unsigned int libxsmm_backtrace(const void* /*buffer*/[], unsigned int /*size*/, unsigned int /*skip*/); +LIBXSMM_API +#if defined(_WIN32) +/*TODO: no inline*/ +#elif defined(__GNUC__) +/*LIBXSMM_ATTRIBUTE(noinline)*/ +#endif +unsigned int libxsmm_backtrace(const void* buffer[], unsigned int size, unsigned int skip) +{ + unsigned int result; + if (NULL != buffer && 0 != size && skip < size) { + skip += LIBXSMM_TRACE_MINDEPTH; +#if defined(_WIN32) || defined(__CYGWIN__) + result = CaptureStackBackTrace(skip, LIBXSMM_MIN(size, LIBXSMM_TRACE_MAXDEPTH), (PVOID*)buffer, NULL/*hash*/); +#else + { const int n = backtrace((void**)buffer, (int)LIBXSMM_MIN(size, LIBXSMM_TRACE_MAXDEPTH)); /* request at most 'size' frames: the skipped frames land in the same buffer, so asking for size + skip wrote past its end */ + if ((int)skip < n) { + result = n - skip; + if (0 != skip) { + memmove(buffer, buffer + skip, result * sizeof(void*)); + } + } + else { + result = 0; + } + } +#endif + } + else { + result = 0; + } + return result; +} + + +#if !defined(_WIN32) && !defined(__CYGWIN__) +LIBXSMM_API_INLINE const char* internal_trace_get_symbolname(const void* address, char* map, int fd, off_t fdoff) +{ + const char* result = NULL; +#if defined(LIBXSMM_TRACE_DLINFO) + Dl_info info; + LIBXSMM_UNUSED(fd); LIBXSMM_UNUSED(fdoff); + LIBXSMM_ASSERT(NULL != address && NULL != map); + if (0 != dladdr(address, &info) && NULL != info.dli_sname) { + strncpy(map, info.dli_sname, LIBXSMM_TRACE_SYMBOLSIZE - 1); + result = map; + } +#else + LIBXSMM_ASSERT(NULL != address && NULL != map); + backtrace_symbols_fd((void**)&address, 1, fd); + if (fdoff == lseek(fd, fdoff, SEEK_SET) /* reset map */ + && 1 
== sscanf(map, "%*[^(](%s0x", map)) + { + char* c = map; + for (; '+' != *c && 0 != *c; ++c); + if ('+' == *c && c != map) { + result = map; + map = c; + } + } + *map = 0; /* terminate */ +#endif + return result; +} +#endif + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE +const char* libxsmm_trace_info(unsigned int* /*depth*/, unsigned int* /*threadid*/, const int* /*filter_threadid*/, + const void* /*filter_symbol*/, const int* /*filter_mindepth*/, const int* /*filter_maxnsyms*/); + +LIBXSMM_API +#if defined(_WIN32) +/*TODO: no inline*/ +#elif defined(__GNUC__) +/*LIBXSMM_ATTRIBUTE(noinline)*/ +#endif +const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, + const void* filter_symbol, const int* filter_mindepth, const int* filter_maxnsyms) +{ + const char *fname = NULL; +#if defined(LIBXSMM_TRACE) + static LIBXSMM_TLS int cerberus = 0; + /* check against entering a recursion (recursion should not happen due to + * attribute "no_instrument_function" but better prevent this in any case) + */ + if (0 == cerberus) { + int init; + ++cerberus; +# if defined(__GNUC__) && !defined(_CRAYC) + __asm__(""); +# endif + init = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED); + if (0 != init) { /* do nothing if not yet initialized */ + const int mindepth = (NULL != filter_mindepth ? *filter_mindepth : internal_trace_mindepth); + const int maxnsyms = (NULL != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms); + const void *stacktrace[LIBXSMM_TRACE_MAXDEPTH]; + const int n = libxsmm_backtrace(stacktrace, LIBXSMM_TRACE_MAXDEPTH, 0); + int symbol = 0; + if (0 < n) { + const int filter = (NULL != filter_threadid ? 
*filter_threadid : internal_trace_threadid); + int abs_tid = 0; +# if defined(_WIN32) || defined(__CYGWIN__) || defined(LIBXSMM_TRACE_DLINFO) + static LIBXSMM_TLS struct { +# if defined(_WIN32) || defined(__CYGWIN__) + char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE]; +# else + char buffer[LIBXSMM_TRACE_SYMBOLSIZE]; +# endif + int tid; + } info; + if (0 != info.tid) { + abs_tid = LIBXSMM_ABS(info.tid); + } + else { + const int tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 0 < init ? 1 : -1, LIBXSMM_ATOMIC_RELAXED); + abs_tid = LIBXSMM_ABS(tid) - 1; + /* use sign bit to flag enabled fallback for symbol resolution */ + info.tid = -abs_tid; + } + LIBXSMM_ASSERT(0 < abs_tid); + if (0 > filter || filter == abs_tid) { + int next = symbol + 1; +# if defined(_WIN32) || defined(__CYGWIN__) + const HANDLE process = GetCurrentProcess(); + PSYMBOL_INFO value = (PSYMBOL_INFO)info.buffer; + value->SizeOfStruct = sizeof(SYMBOL_INFO); + value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1; + value->NameLen = 0; +# endif + if (NULL != filter_symbol) { + struct { size_t d; int s; } approx = { (size_t)LIBXSMM_UNLIMITED, 0 }; + while (next < n && (filter_symbol == stacktrace[symbol] || +# if defined(_WIN32) || defined(__CYGWIN__) + (FALSE != SymFromAddr(process, (DWORD64)stacktrace[symbol], NULL, value) && 0 < value->NameLen))) + { + if (filter_symbol == stacktrace[symbol] || NULL != strstr(value->Name, (const char*)filter_symbol)) { +# else + (NULL != internal_trace_get_symbolname(stacktrace[symbol], info.buffer, 0, 0)))) + { + if (filter_symbol == stacktrace[symbol] || NULL != strstr(info.buffer, (const char*)filter_symbol)) { +# endif + symbol = next++; /* determine the symbol after the match which is checked below */ + break; + } + { const size_t d = LIBXSMM_DELTA((const char*)filter_symbol, (const char*)stacktrace[symbol]); + if (d < approx.d) { + approx.s = symbol + 1; + approx.d = d; + } + } + symbol = next++; + } + symbol = LIBXSMM_MAX((next != n ? 
symbol : approx.s/*not found*/) + mindepth/*shift*/, 0); + } + /* apply filters based on absolute symbol position */ + if ((NULL != filter_symbol || LIBXSMM_MAX(mindepth, 0) <= symbol) && (0 >= maxnsyms || symbol < maxnsyms)) { + if (symbol != next && symbol < n && filter_symbol != stacktrace[symbol] && +# if defined(_WIN32) || defined(__CYGWIN__) + FALSE != SymFromAddr(process, (DWORD64)stacktrace[symbol], NULL, value) && 0 < value->NameLen) +# else + NULL != internal_trace_get_symbolname(stacktrace[symbol], info.buffer, 0, 0)) +# endif + { + /* disable fallback allowing unresolved symbol names */ + info.tid = abs_tid; /* make unsigned */ +# if defined(_WIN32) || defined(__CYGWIN__) + fname = value->Name; +# else + fname = info.buffer; +# endif + } + if (NULL == fname && 0 > info.tid) { /* fallback allowing unresolved symbol names */ +# if defined(__MINGW32__) + sprintf(info.buffer, "%p", stacktrace[symbol]); +# else + sprintf(info.buffer, "0x%" PRIxPTR, (uintptr_t)stacktrace[symbol]); +# endif + fname = info.buffer; + } + } + } +# else +# if (0 == LIBXSMM_SYNC) + static char raw_c; + char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */ +# else + char *const raw_value = (char*)LIBXSMM_TLS_GETVALUE(internal_trace_key); +# endif + const off_t fdoff = sizeof(int) * 2; + int* ivalue = NULL, fd = -1; + char* value = NULL; + if (NULL != raw_value) { + ivalue = (int*)raw_value; + abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]); + if (0 > filter || filter == abs_tid) { + fd = ivalue[0]; + if (0 <= fd && fdoff == lseek(fd, fdoff, SEEK_SET)) { + value = raw_value + fdoff; + } +# if !defined(NDEBUG) /* library code is expected to be mute */ + else { + fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n"); + } +# endif + } + } + else { + char filename[] = "/tmp/.libxsmm_map." 
LIBXSMM_MKTEMP_PATTERN; + /* coverity[secure_temp] */ + fd = mkstemp(filename); + if (0 <= fd) { + if (0 == unlink(filename) && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) { + char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (MAP_FAILED != buffer) { + int check = -1; + ivalue = (int*)buffer; + ivalue[0] = fd; /* valid file descriptor for internal_delete */ + if ( +# if (0 != LIBXSMM_SYNC) + 0 == LIBXSMM_TLS_SETVALUE(internal_trace_key, buffer) && +# endif + (sizeof(int) * 1) == read(fd, &check, sizeof(int)) && + fdoff == lseek(fd, sizeof(int), SEEK_CUR) && + check == fd) + { + const int tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 0 < init ? 1 : -1, LIBXSMM_ATOMIC_RELAXED); + abs_tid = LIBXSMM_ABS(tid) - 1; + LIBXSMM_ASSERT(0 < abs_tid); +# if (0 != LIBXSMM_SYNC) + LIBXSMM_ASSERT(abs_tid < LIBXSMM_NTHREADS_MAX); + internal_trace_symbols[abs_tid] = buffer; +# endif + /* use sign bit to flag enabled fallback for symbol resolution */ + ivalue[1] = -abs_tid; + if (0 > filter || (abs_tid - 1) == filter) { + value = buffer + fdoff; + } + } + else { +# if !defined(NDEBUG) /* library code is expected to be mute */ + fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n"); +# endif + internal_delete(buffer); + } + } +# if !defined(NDEBUG) + else { + const int error = errno; + fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n", + strerror(error), error); + } +# endif + } +# if !defined(NDEBUG) /* library code is expected to be mute */ + else { + fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd); + } +# endif + } + } + if (NULL != value) { + int next = symbol + 1; + if (NULL != filter_symbol) { + struct { size_t d; int s; } approx = { (size_t)LIBXSMM_UNLIMITED, 0 }; + while (next < n && (filter_symbol == stacktrace[symbol] || + NULL != internal_trace_get_symbolname(stacktrace[symbol], value, fd, fdoff))) + { + if (filter_symbol == 
stacktrace[symbol] || NULL != strstr(value, (const char*)filter_symbol)) { + symbol = next++; /* determine the symbol after the match which is checked below */ + break; + } + { const size_t d = LIBXSMM_DELTA((const char*)filter_symbol, (const char*)stacktrace[symbol]); + if (d < approx.d) { + approx.s = symbol + 1; + approx.d = d; + } + } + symbol = next++; + } + symbol = LIBXSMM_MAX((next != n ? symbol : approx.s/*not found*/) + mindepth/*shift*/, 0); + } + /* apply filters based on absolute symbol position */ + if ((NULL != filter_symbol || LIBXSMM_MAX(mindepth, 0) <= symbol) && (0 >= maxnsyms || symbol < maxnsyms)) { + if (symbol != next && symbol < n && filter_symbol != stacktrace[symbol] && + NULL != internal_trace_get_symbolname(stacktrace[symbol], value, fd, fdoff)) + { + /* disable fallback allowing unresolved symbol names */ + ivalue[1] = abs_tid; /* make unsigned */ + fname = value; + } + if (NULL == fname && 0 > ivalue[1]) { /* fallback to symbol address */ + sprintf(value, "0x%llx", (unsigned long long)stacktrace[symbol]); + fname = value; + } + } + } +# endif + if (threadid) *threadid = abs_tid - 1; + if (depth) *depth = symbol; + } + } + --cerberus; + } +#else + LIBXSMM_UNUSED(depth); + LIBXSMM_UNUSED(threadid); + LIBXSMM_UNUSED(filter_threadid); + LIBXSMM_UNUSED(filter_symbol); + LIBXSMM_UNUSED(filter_mindepth); + LIBXSMM_UNUSED(filter_maxnsyms); +#endif + return fname; +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE +void libxsmm_trace(FILE* stream, const int* /*filter_threadid*/, const void* /*filter_symbol*/, const int* /*filter_mindepth*/, const int* /*filter_maxnsyms*/); + +LIBXSMM_API void libxsmm_trace(FILE* stream, const int* filter_threadid, const void* filter_symbol, const int* filter_mindepth, const int* filter_maxnsyms) +{ +#if defined(LIBXSMM_TRACE) + unsigned int depth, threadid; + const char *const name = libxsmm_trace_info(&depth, &threadid, filter_threadid, filter_symbol, filter_mindepth, filter_maxnsyms); + if (NULL != name && 0 != 
*name) { /* implies actual other results to be valid */ + LIBXSMM_ASSERT(NULL != stream/*otherwise fprintf handles the error*/); + if ((NULL == filter_threadid && 0 > internal_trace_threadid) || (NULL != filter_threadid && 0 > *filter_threadid)) { + fprintf(stream, "%*s%s@%u\n", (int)depth, "", name, threadid); + } + else { + fprintf(stream, "%*s%s\n", (int)depth, "", name); + } + } +#else /* suppress warning */ + LIBXSMM_UNUSED(stream); + LIBXSMM_UNUSED(filter_threadid); + LIBXSMM_UNUSED(filter_symbol); + LIBXSMM_UNUSED(filter_mindepth); + LIBXSMM_UNUSED(filter_maxnsyms); +#endif +} + + +#if defined(__TRACE) && defined(__GNUC__) && defined(LIBXSMM_BUILD) + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void __cyg_profile_func_enter(void* /*this_fn*/, void* /*call_site*/); +LIBXSMM_API void __cyg_profile_func_enter(void* this_fn, void* call_site) +{ +#if defined(LIBXSMM_TRACE) + if (0 > internal_trace_initialized) { + /* NULL: inherit global settings from libxsmm_trace_init */ + libxsmm_trace(stderr, NULL/*filter_threadid*/, "__cyg_profile_func_enter"/*LIBXSMM_FUNCNAME*/, NULL, NULL); + } +#endif + LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); +} + + +LIBXSMM_API LIBXSMM_ATTRIBUTE_NO_TRACE void __cyg_profile_func_exit(void* /*this_fn*/, void* /*call_site*/); +LIBXSMM_API void __cyg_profile_func_exit(void* this_fn, void* call_site) +{ + LIBXSMM_UNUSED(this_fn); LIBXSMM_UNUSED(call_site); /* suppress warning */ +} + +#endif /*defined(__TRACE) && defined(__GNUC__) && defined(LIBXSMM_BUILD)*/ + diff --git a/third_party/libxsmm/src/libxsmm_trace.h b/third_party/libxsmm/src/libxsmm_trace.h new file mode 100644 index 00000000..3a6772b2 --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_trace.h @@ -0,0 +1,124 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_TRACE_H +#define LIBXSMM_TRACE_H + +#include + +#if (defined(__TRACE) || defined(LIBXSMM_BUILD) || !defined(_WIN32)) +# define LIBXSMM_TRACE +#endif +#if !defined(LIBXSMM_TRACE_CALLERID_MAXDEPTH) +# define LIBXSMM_TRACE_CALLERID_MAXDEPTH 8 +#endif +#if !defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && \ + ((!defined(_WIN32) || defined(__MINGW32__) || (defined(_MSC_VER) && defined(__clang__))) && \ + (!defined(__PGI) || LIBXSMM_VERSION2(19, 0) <= LIBXSMM_VERSION2(__PGIC__, __PGIC_MINOR__)) && \ + (defined(__GNUC__) || defined(__clang__))) +# define LIBXSMM_TRACE_CALLERID_GCCBUILTIN +#endif + + +/** Initializes the trace facility; NOT thread-safe. */ +LIBXSMM_API int libxsmm_trace_init( + /* Filter for thread id (-1: all). */ + int filter_threadid, + /* Specify min. depth of stack trace (0: all). */ + int filter_mindepth, + /* Specify max. depth of stack trace (-1: all). */ + int filter_maxnsyms); + +/** Finalizes the trace facility; NOT thread-safe. */ +LIBXSMM_API int libxsmm_trace_finalize(void); + +/** Receives the backtrace of up to 'size' addresses. Returns the actual number of addresses (n <= size). 
*/ +LIBXSMM_API unsigned int libxsmm_backtrace(const void* buffer[], unsigned int size, unsigned int skip); + +#if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && !defined(__INTEL_COMPILER) +# if defined(__clang__) +# pragma clang diagnostic push +# elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) +# pragma GCC diagnostic push +# endif +# if defined(__clang__) +# pragma clang diagnostic ignored "-Wunknown-warning-option" +# if LIBXSMM_VERSION2(9, 0) <= LIBXSMM_VERSION2(__clang_major__, __clang_minor__) +# pragma clang diagnostic ignored "-Wframe-address" +# endif +# elif defined(__GNUC__) /* no version-check */ +# pragma GCC diagnostic ignored "-Wpragmas" +# pragma GCC diagnostic ignored "-Wframe-address" +# endif +#endif +LIBXSMM_API_INLINE const void* libxsmm_trace_caller_id(unsigned int level) { /* must be inline */ +#if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) + switch (level) { +# if 0 + case 0: return __builtin_extract_return_addr(__builtin_return_address(0)); + case 1: return __builtin_extract_return_addr(__builtin_return_address(1)); + case 2: return __builtin_extract_return_addr(__builtin_return_address(2)); + case 3: return __builtin_extract_return_addr(__builtin_return_address(3)); +# else + case 0: return __builtin_frame_address(1); + case 1: return __builtin_frame_address(2); + case 2: return __builtin_frame_address(3); + case 3: return __builtin_frame_address(4); +# endif + default: +#else + { +# if defined(_WIN32) + if (0 == level) return _AddressOfReturnAddress(); + else +# endif +#endif + { const void* stacktrace[LIBXSMM_TRACE_CALLERID_MAXDEPTH]; + const unsigned int n = libxsmm_backtrace(stacktrace, LIBXSMM_TRACE_CALLERID_MAXDEPTH, 0/*skip*/); + return (level < n ? 
stacktrace[level] : NULL); + } + } +} +#if defined(LIBXSMM_TRACE_CALLERID_GCCBUILTIN) && !defined(__INTEL_COMPILER) +# if defined(__clang__) +# pragma clang diagnostic pop +# elif defined(__GNUC__) && LIBXSMM_VERSION2(4, 6) <= LIBXSMM_VERSION2(__GNUC__, __GNUC_MINOR__) +# pragma GCC diagnostic pop +# endif +#endif + +/** Returns the name of the function where libxsmm_trace is called from; thread-safe. */ +LIBXSMM_API const char* libxsmm_trace_info( + /* Query and output the abs. location in stacktrace (no input). */ + unsigned int* depth, + /* Query and output the thread id (no input). */ + unsigned int* threadid, + /* Filter for thread id (-1: all, NULL: libxsmm_trace_init). */ + const int* filter_threadid, + /* Lookup symbol (depth argument becomes relative to symbol position). */ + const void* filter_symbol, + /* Specify min. abs. position in stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ + const int* filter_mindepth, + /* Specify max. depth of stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ + const int* filter_maxnsyms); + +/** Prints an entry of the function where libxsmm_trace is called from (indented/hierarchical). */ +LIBXSMM_API void libxsmm_trace(FILE* stream, + /* Filter for thread id (-1: all, NULL: libxsmm_trace_init). */ + const int* filter_threadid, + /* Lookup symbol (depth argument becomes relative to symbol position). */ + const void* filter_symbol, + /* Specify min. absolute pos. in stack trace (-1 or 0: all, NULL: libxsmm_trace_init). */ + const int* filter_mindepth, + /* Specify max. depth of stack trace (-1 or 0: all, NULL: libxsmm_trace_init). 
*/ + const int* filter_maxnsyms); + +#endif /*LIBXSMM_TRACE_H*/ + diff --git a/third_party/libxsmm/src/libxsmm_xcopy.c b/third_party/libxsmm/src/libxsmm_xcopy.c new file mode 100644 index 00000000..4dfc679c --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_xcopy.c @@ -0,0 +1,735 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#include "libxsmm_xcopy.h" + +#if !defined(LIBXSMM_MCOPY_JIT_TINY) && 0 +# define LIBXSMM_MCOPY_JIT_TINY +#endif + + +/* definition of corresponding variables */ +#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) +LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_xcopy_jit); +#endif +LIBXSMM_APIVAR_PUBLIC_DEF(int libxsmm_xcopy_taskscale); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mcopy_mbytes); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_mzero_mbytes); +LIBXSMM_APIVAR_PUBLIC_DEF(unsigned int libxsmm_tcopy_mbytes); +LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_mcopy_nscale); +LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_mzero_nscale); +LIBXSMM_APIVAR_PUBLIC_DEF(float libxsmm_tcopy_nscale); + + +LIBXSMM_API_INTERN void libxsmm_xcopy_init(int archid) +{ + { /* setup tile sizes according to CPUID or environment */ + if (LIBXSMM_X86_AVX512_CORE <= archid) { /* avx-512/core */ + libxsmm_mcopy_mbytes = 0; + libxsmm_mcopy_nscale = 0.f; + libxsmm_mzero_mbytes = 0; + libxsmm_mzero_nscale = 0.f; + libxsmm_tcopy_mbytes = 32768; + libxsmm_tcopy_nscale = 0.f; + } + else if (LIBXSMM_X86_AVX512_MIC <= archid && LIBXSMM_X86_AVX512_CORE > archid) { + 
libxsmm_mcopy_mbytes = 0; + libxsmm_mcopy_nscale = 0.f; + libxsmm_mzero_mbytes = 0; + libxsmm_mzero_nscale = 0.f; + libxsmm_tcopy_mbytes = 32768; + libxsmm_tcopy_nscale = 0.f; + } + else { /* avx2 */ + libxsmm_mcopy_mbytes = 0; + libxsmm_mcopy_nscale = 0.f; + libxsmm_mzero_mbytes = 8192; + libxsmm_mzero_nscale = 0.f; + libxsmm_tcopy_mbytes = 4096; + libxsmm_tcopy_nscale = 0.f; + } + } + { /* mcopy: load/adjust tile sizes (measured as if DP) */ + const char *const env_m = getenv("LIBXSMM_MCOPY_M"), *const env_n = getenv("LIBXSMM_MCOPY_N"); + const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); + const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); + if (0 < m) libxsmm_mcopy_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; + if (0 != libxsmm_mcopy_mbytes && 0 != libxsmm_mcopy_nscale) { + if (0 < n) libxsmm_mcopy_nscale = ((float)(n * 8/*DP*/)) / libxsmm_mcopy_mbytes; + if (1 > (libxsmm_mcopy_nscale * libxsmm_mcopy_mbytes)) { + const float stretch = 1.f / libxsmm_mcopy_mbytes; + libxsmm_mcopy_nscale = LIBXSMM_MAX(stretch, libxsmm_mcopy_nscale); + } + } + } + { /* mzero: load/adjust tile sizes (measured as if DP) */ + const char *const env_m = getenv("LIBXSMM_MZERO_M"), *const env_n = getenv("LIBXSMM_MZERO_N"); + const int m = ((NULL == env_m || 0 == *env_m) ? 0 : atoi(env_m)); + const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); + if (0 < m) libxsmm_mzero_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; + if (0 != libxsmm_mzero_mbytes && 0 != libxsmm_mzero_nscale) { + if (0 < n) libxsmm_mzero_nscale = ((float)(n * 8/*DP*/)) / libxsmm_mzero_mbytes; + if (1 > (libxsmm_mzero_nscale * libxsmm_mzero_mbytes)) { + const float stretch = 1.f / libxsmm_mzero_mbytes; + libxsmm_mzero_nscale = LIBXSMM_MAX(stretch, libxsmm_mzero_nscale); + } + } + } + { /* tcopy: load/adjust tile sizes (measured as if DP) */ + const char *const env_m = getenv("LIBXSMM_TCOPY_M"), *const env_n = getenv("LIBXSMM_TCOPY_N"); + const int m = ((NULL == env_m || 0 == *env_m) ? 
0 : atoi(env_m)); + const int n = ((NULL == env_n || 0 == *env_n) ? 0 : atoi(env_n)); + if (0 < m) libxsmm_tcopy_mbytes = LIBXSMM_MAX(m, 1) * 8/*DP*/; + if (0 != libxsmm_tcopy_mbytes && 0 != libxsmm_tcopy_nscale) { + if (0 < n) libxsmm_tcopy_nscale = ((float)(n * 8/*DP*/)) / libxsmm_tcopy_mbytes; + if (1 > (libxsmm_tcopy_nscale * libxsmm_tcopy_mbytes)) { + const float stretch = 1.f / libxsmm_tcopy_mbytes; + libxsmm_tcopy_nscale = LIBXSMM_MAX(stretch, libxsmm_tcopy_nscale); + } + } + } +#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) && defined(LIBXSMM_PLATFORM_X86) + /* check if JIT-code generation is permitted */ + if (LIBXSMM_X86_AVX2 <= libxsmm_target_archid && LIBXSMM_X86_ALLFEAT >= libxsmm_target_archid) { + const char *const env_jit = getenv("LIBXSMM_XCOPY_JIT"); + libxsmm_xcopy_jit = ((NULL == env_jit || 0 == *env_jit) ? (LIBXSMM_XCOPY_JIT) : atoi(env_jit)); + } +#endif + { /* determines if OpenMP tasks are used (when available) */ + const char *const env_t = getenv("LIBXSMM_XCOPY_TASKS"); + libxsmm_xcopy_taskscale = ((NULL == env_t || 0 == *env_t) + ? 0/*disabled*/ : (LIBXSMM_XCOPY_TASKSCALE * atoi(env_t))); + } +} + + +LIBXSMM_API_INTERN void libxsmm_xcopy_finalize(void) +{ +} + + +LIBXSMM_API void libxsmm_matcopy_task_internal(void* out, const void* in, unsigned int typesize, + unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, + unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, + int tid, int ntasks) +{ + const unsigned int tm = (0 == km ? m : km); + const unsigned int tn = (0 == kn ? 
LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n) : kn); + const int mtasks = LIBXSMM_UPDIV(m, tm); + unsigned int m0, m1, n0, n1; + + LIBXSMM_ASSERT_MSG(tid < ntasks && 0 < ntasks, "Invalid task setup"); + LIBXSMM_ASSERT_MSG(tm <= m && tn <= n, "Invalid problem size"); + LIBXSMM_ASSERT_MSG(0 < tm && 0 < tn, "Invalid tile size"); + LIBXSMM_ASSERT_MSG(typesize <= 255, "Invalid type-size"); + LIBXSMM_ASSERT(0 < mtasks); + + if (ntasks <= mtasks) { /* parallelized over M */ + const unsigned int mt = LIBXSMM_UPDIV(m, ntasks); + m0 = LIBXSMM_MIN(tid * mt, m); + m1 = LIBXSMM_MIN(m0 + mt, m); + n0 = 0; n1 = n; + } + else { /* parallelized over M and N */ + const int mntasks = ntasks / mtasks; + const int mtid = tid / mntasks, ntid = tid - mtid * mntasks; + const unsigned int nt = LIBXSMM_UP(LIBXSMM_UPDIV(n, mntasks), tn) ; + m0 = LIBXSMM_MIN(mtid * tm, m); m1 = LIBXSMM_MIN(m0 + tm, m); + n0 = LIBXSMM_MIN(ntid * nt, n); n1 = LIBXSMM_MIN(n0 + nt, n); + } + + LIBXSMM_ASSERT_MSG(m0 <= m1 && m1 <= m, "Invalid task size"); + LIBXSMM_ASSERT_MSG(n0 <= n1 && n1 <= n, "Invalid task size"); + + if (NULL != in) { /* copy-kernel */ + libxsmm_matcopy_internal(out, in, typesize, ldi, ldo, + m0, m1, n0, n1, tm, tn, kernel); + } + else { + libxsmm_matzero_internal(out, typesize, ldo, + m0, m1, n0, n1, tm, tn, kernel); + } +} + + +LIBXSMM_API void libxsmm_otrans_task_internal(void* out, const void* in, unsigned int typesize, + unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, + unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, + int tid, int ntasks) +{ + const unsigned int tm = (0 == km ? m : km); + const unsigned int tn = (0 == kn ? 
LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n) : kn); + const int mtasks = LIBXSMM_UPDIV(m, tm); + unsigned int m0, m1, n0, n1; + + LIBXSMM_ASSERT_MSG(tid < ntasks && 0 < ntasks, "Invalid task setup"); + LIBXSMM_ASSERT_MSG(tm <= m && tn <= n, "Invalid problem size"); + LIBXSMM_ASSERT_MSG(0 < tm && 0 < tn, "Invalid tile size"); + LIBXSMM_ASSERT_MSG(typesize <= 255, "Invalid type-size"); + LIBXSMM_ASSERT(0 < mtasks); + + if (ntasks <= mtasks) { /* parallelized over M */ + const unsigned int mt = LIBXSMM_UPDIV(m, ntasks); + m0 = LIBXSMM_MIN(tid * mt, m); + m1 = LIBXSMM_MIN(m0 + mt, m); + n0 = 0; n1 = n; + } + else { /* parallelized over M and N */ + const int mntasks = ntasks / mtasks; + const int mtid = tid / mntasks, ntid = tid - mtid * mntasks; + const unsigned int nt = LIBXSMM_UP(LIBXSMM_UPDIV(n, mntasks), tn); + m0 = LIBXSMM_MIN(mtid * tm, m); m1 = LIBXSMM_MIN(m0 + tm, m); + n0 = LIBXSMM_MIN(ntid * nt, n); n1 = LIBXSMM_MIN(n0 + nt, n); + } + + LIBXSMM_ASSERT_MSG(m0 <= m1 && m1 <= m, "Invalid task size"); + LIBXSMM_ASSERT_MSG(n0 <= n1 && n1 <= n, "Invalid task size"); + + libxsmm_otrans_internal(out, in, typesize, ldi, ldo, m0, m1, n0, n1, tm, tn, kernel); +} + + +LIBXSMM_API_INTERN void libxsmm_matcopy_internal(void* out, const void* in, + unsigned int typesize, unsigned int ldi, unsigned int ldo, + unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, + unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel) +{ + LIBXSMM_ASSERT(NULL != in); + LIBXSMM_XCOPY(LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, kernel, + out, in, typesize, ldi, ldo, tm, tn, m0, m1, n0, n1); +} + + +LIBXSMM_API_INTERN void libxsmm_matzero_internal(void* out, unsigned int typesize, unsigned int ldo, + unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, + unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel) +{ + /* coverity[ptr_arith] */ + LIBXSMM_XCOPY(LIBXSMM_MZERO_KERNEL, LIBXSMM_MZERO_CALL, kernel, + out, NULL, typesize, 0, ldo, tm, tn, m0, m1, n0, n1); 
+}
+
+
+/**
+ * Transposes the range given by m0/m1 and n0/n1 by forwarding all arguments to
+ * LIBXSMM_XCOPY together with the transpose kernel/call macros. The input must not be NULL.
+ */
+LIBXSMM_API_INTERN void libxsmm_otrans_internal(void* out, const void* in,
+  unsigned int typesize, unsigned int ldi, unsigned int ldo,
+  unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1,
+  unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel)
+{
+  LIBXSMM_ASSERT(NULL != in);
+  LIBXSMM_XCOPY(LIBXSMM_TCOPY_KERNEL, LIBXSMM_TCOPY_CALL, kernel,
+    out, in, typesize, ldi, ldo, tm, tn, m0, m1, n0, n1);
+}
+
+
+/**
+ * Matrix-copy (in != NULL) or matrix-zero (in == NULL) of an m x n matrix; the work of
+ * task "tid" out of "ntasks" is carried out by libxsmm_matcopy_task_internal.
+ * Arguments are validated first: 0 < typesize < 256, m <= ldi, m <= ldo, out != in,
+ * 0 <= tid < ntasks, and either (out != NULL, 0 < m, 0 < n) or (m == 0 and n == 0).
+ * An empty matrix is a no-op. Invalid arguments are reported once on stderr if
+ * libxsmm_verbosity is non-zero; otherwise the call silently does nothing.
+ */
+LIBXSMM_API void libxsmm_matcopy_task(void* out, const void* in, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
+  int tid, int ntasks)
+{
+  LIBXSMM_INIT
+  if (0 < typesize && 256 > typesize && m <= ldi && m <= ldo && out != in &&
+    ((NULL != out && 0 < m && 0 < n) || (0 == m && 0 == n)) &&
+    /* use (signed) integer types, but check sanity of input */
+    0 <= tid && tid < ntasks)
+  {
+    if (0 < m && 0 < n) {
+      unsigned int tm, tn, ts;
+      libxsmm_xcopykernel kernel;
+      kernel.ptr = NULL;
+      if (NULL != in) { /* mcopy */
+        tm = LIBXSMM_UPDIV(libxsmm_mcopy_mbytes, typesize);
+        tn = (unsigned int)(libxsmm_mcopy_nscale * tm);
+        ts = libxsmm_mcopy_mbytes;
+      }
+      else { /* mzero */
+        tm = LIBXSMM_UPDIV(libxsmm_mzero_mbytes, typesize);
+        tn = (unsigned int)(libxsmm_mzero_nscale * tm);
+        ts = libxsmm_mzero_mbytes;
+      }
+      /* a tile extent of zero selects the default extent */
+      if (0 == tm) tm = m;
+      if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n);
+      if (0 != ts && ts < (tm * tn * typesize)) {
+        tm = LIBXSMM_MAX(ts / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN);
+      }
+      if ((unsigned int)m < tm || (unsigned int)n < tn) {
+        if (1 == ntasks) {
+          tm = (unsigned int)m; tn = (unsigned int)n;
+        }
+        else {
+          /* The scale is 0 unless adjusted, and the float-to-unsigned conversion truncates:
+           * never divide by zero (undefined behavior for integer division).
+           */
+          const unsigned int nparts = (unsigned int)(ntasks * libxsmm_mcopy_nscale);
+          const unsigned int tasksize = (((unsigned int)m) * (unsigned int)n) / (0 != nparts ? nparts : 1);
+          const unsigned int nn = libxsmm_isqrt_u32(tasksize);
+          const unsigned int mm = (unsigned int)(libxsmm_mcopy_nscale * nn);
+          tn = LIBXSMM_CLMP((unsigned int)n, 1, nn);
+          tm = LIBXSMM_CLMP((unsigned int)m, 1, mm);
+          /* clamping against an upper bound of zero yields zero: apply the same defaults as above */
+          if (0 == tm) tm = (unsigned int)m;
+          if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n);
+        }
+      }
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 2))
+# if !defined(LIBXSMM_MCOPY_JIT_TINY)
+      else
+# endif
+      if (0 != (2 & libxsmm_xcopy_jit)) { /* JIT'ted matrix-copy permitted? */
+        switch (typesize) {
+          case 8: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+            LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_MELTW_FLAG_UNARY_NONE,
+            NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/);
+            break;
+          case 4: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+            LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_MELTW_FLAG_UNARY_NONE,
+            NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/);
+            break;
+          case 2: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+            LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_MELTW_FLAG_UNARY_NONE,
+            NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/);
+            break;
+          case 1: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+            LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_MELTW_FLAG_UNARY_NONE,
+            NULL != in ? LIBXSMM_MELTW_TYPE_UNARY_IDENTITY/*mcopy*/ : LIBXSMM_MELTW_TYPE_UNARY_XOR/*mzero*/);
+            break;
+        }
+      }
+#endif
+      libxsmm_matcopy_task_internal(out, in, typesize,
+        (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo,
+        tm, tn, kernel, tid, ntasks);
+    }
+  }
+  else {
+    static int error_once = 0;
+    if (0 != libxsmm_verbosity /* library code is expected to be mute */
+      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+    {
+      if (0 > tid || tid >= ntasks) {
+        fprintf(stderr, "LIBXSMM ERROR: the matrix-copy thread-id or number of tasks is incorrect!\n");
+      }
+      else if (NULL == out) {
+        fprintf(stderr, "LIBXSMM ERROR: the matrix-copy input and/or output is NULL!\n");
+      }
+      else if (out == in) {
+        fprintf(stderr, "LIBXSMM ERROR: output and input of the matrix-copy must be different!\n");
+      }
+      else if (0 == typesize || 256 <= typesize) {
+        fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-copy specified!\n");
+      }
+      else if (ldi < m || ldo < m) {
+        fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the matrix-copy is/are too small!\n");
+      }
+      else if (0 > m || 0 > n) {
+        fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the matrix-copy is/are negative!\n");
+      }
+    }
+  }
+}
+
+
+/** Matrix-copy/zero performed as a single task (tid=0, ntasks=1); see libxsmm_matcopy_task. */
+LIBXSMM_API void libxsmm_matcopy(void* out, const void* in, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo)
+{
+  libxsmm_matcopy_task(out, in, typesize, m, n, ldi, ldo, 0/*tid*/, 1/*ntasks*/);
+}
+
+
+/**
+ * Transpose of an m x n matrix; the work of task "tid" out of "ntasks" is carried out
+ * by libxsmm_otrans_task_internal. Arguments are validated first: 0 < typesize < 256,
+ * m <= ldi, n <= ldo, 0 <= tid < ntasks, and either (out != NULL, in != NULL, 0 < m, 0 < n)
+ * or (m == 0 and n == 0). If out == in, the call is forwarded to libxsmm_itrans provided
+ * that ldi == ldo (an error is reported otherwise). Errors are reported once on stderr
+ * if libxsmm_verbosity is non-zero; otherwise the call silently does nothing.
+ */
+LIBXSMM_API void libxsmm_otrans_task(void* out, const void* in, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
+  int tid, int ntasks)
+{
+  static int error_once = 0;
+  LIBXSMM_INIT
+  if (0 < typesize && 256 > typesize && m <= ldi && n <= ldo &&
+    ((NULL != out && NULL != in && 0 < m && 0 < n) || (0 == m && 0 == n)) &&
+    /* use (signed) integer types, but check sanity of input */
+    0 <= tid && tid < ntasks)
+  {
+    if (0 < m && 0 < n) {
+      if (out != in) {
+        unsigned int tm = LIBXSMM_UPDIV(libxsmm_tcopy_mbytes, typesize);
+        unsigned int tn = (unsigned int)(libxsmm_tcopy_nscale * tm);
+        libxsmm_xcopykernel kernel;
+        kernel.ptr = NULL;
+        /* a tile extent of zero selects the default extent */
+        if (0 == tm) tm = m;
+        if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n);
+        if (0 != libxsmm_tcopy_mbytes && libxsmm_tcopy_mbytes < (tm * tn * typesize)) {
+          tm = LIBXSMM_MAX(libxsmm_tcopy_mbytes / (tn * typesize), LIBXSMM_XCOPY_TILE_MIN);
+        }
+        if ((unsigned int)m < tm || (unsigned int)n < tn) {
+          if (1 == ntasks) {
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+            if (0 != (1 & libxsmm_xcopy_jit)) { /* JIT'ted transpose permitted? */
+              switch (typesize) {
+                case 8: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 4: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 2: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 1: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+              }
+              if (NULL != kernel.ptr) { /* JIT-kernel available */
+                LIBXSMM_TCOPY_CALL(kernel, typesize, in, ldi, out, ldo);
+                return; /* fast path */
+              }
+            }
+#endif
+            tm = (unsigned int)m; tn = (unsigned int)n;
+          }
+          else {
+            /* The scale is 0 unless adjusted, and the float-to-unsigned conversion truncates:
+             * never divide by zero (undefined behavior for integer division).
+             */
+            const unsigned int nparts = (unsigned int)(ntasks * libxsmm_tcopy_nscale);
+            const unsigned int tasksize = (((unsigned int)m) * (unsigned int)n) / (0 != nparts ? nparts : 1);
+            const unsigned int nn = libxsmm_isqrt_u32(tasksize);
+            const unsigned int mm = (unsigned int)(libxsmm_tcopy_nscale * nn);
+            tn = LIBXSMM_CLMP((unsigned int)n, 1, nn);
+            tm = LIBXSMM_CLMP((unsigned int)m, 1, mm);
+            /* clamping against an upper bound of zero yields zero: apply the same defaults
+             * as above such that no kernel is requested for an empty tile
+             */
+            if (0 == tm) tm = (unsigned int)m;
+            if (0 == tn) tn = LIBXSMM_MIN(LIBXSMM_XCOPY_TILE_MIN, n);
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+            if (0 != (1 & libxsmm_xcopy_jit)) { /* JIT'ted transpose permitted? */
+              switch (typesize) {
+                case 8: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 4: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 2: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+                case 1: kernel.function = libxsmm_dispatch_meltw_unary((libxsmm_blasint)tm, (libxsmm_blasint)tn, &ldi, &ldo,
+                  LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8,
+                  LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+                  break;
+              }
+            }
+#endif
+          }
+        }
+        libxsmm_otrans_task_internal(out, in, typesize,
+          (unsigned int)m, (unsigned int)n, (unsigned int)ldi, (unsigned int)ldo,
+          tm, tn, kernel, tid, ntasks);
+      }
+      else if (ldi == ldo) {
+        libxsmm_itrans(out, typesize, m, n, ldi, ldo);
+      }
+      else if (0 != libxsmm_verbosity /* library code is expected to be mute */
+        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+      {
+        fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n");
+      }
+    }
+  }
+  else {
+    if (0 != libxsmm_verbosity /* library code is expected to be mute */
+      && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+    {
+      if (0 > tid || tid >= ntasks) {
+        fprintf(stderr, "LIBXSMM ERROR: the transpose thread-id or number of tasks is incorrect!\n");
+      }
+      else if (NULL == out || NULL == in) {
+        fprintf(stderr, "LIBXSMM ERROR: the transpose input and/or output is NULL!\n");
+      }
+      else if (out == in) {
+        fprintf(stderr, "LIBXSMM ERROR: output and input of the transpose must be different!\n");
+      }
+      else if (0 == typesize || 256 <= typesize) {
+        fprintf(stderr, "LIBXSMM ERROR: invalid type-size for matrix-transpose specified!\n");
+      }
+      else if (ldi < m || ldo < n) {
+        fprintf(stderr, "LIBXSMM ERROR: the leading dimension(s) of the transpose is/are too small!\n");
+      }
+      else if (0 > m || 0 > n) {
+        fprintf(stderr, "LIBXSMM ERROR: the matrix extent(s) of the transpose is/are negative!\n");
+      }
+    }
+  }
+}
+
+
+/** Transpose performed as a single task (tid=0, ntasks=1); see libxsmm_otrans_task. */
+LIBXSMM_API void libxsmm_otrans(void* out, const void* in, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo)
+{
+  libxsmm_otrans_task(out, in, typesize, m, n, ldi, ldo, 0/*tid*/, 1/*ntasks*/);
+}
+
+
+LIBXSMM_API_INTERN void libxsmm_itrans_scratch(void* /*inout*/, void* /*scratch*/, unsigned int /*typesize*/,
+  libxsmm_blasint /*m*/, libxsmm_blasint /*n*/, libxsmm_blasint /*ldi*/, libxsmm_blasint /*ldo*/);
+/**
+ * Transposes "inout" by means of a caller-provided scratch buffer: the matrix is first
+ * copied into "scratch" and then transposed from "scratch" back into "inout".
+ */
+LIBXSMM_API_INTERN void libxsmm_itrans_scratch(void* inout, void* scratch, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo)
+{
+  LIBXSMM_ASSERT(NULL != inout && 0 < typesize && m <= ldi && n <= ldo);
+  LIBXSMM_XCOPY_TILE(LIBXSMM_MCOPY_KERNEL, typesize, scratch, inout, ldi, m, 0, n, 0, m);
+  LIBXSMM_XCOPY_TILE(LIBXSMM_TCOPY_KERNEL, typesize, inout, scratch, m, ldo, 0, m, 0, n);
+}
+
+
+LIBXSMM_API_INTERN void libxsmm_itrans_scratch_jit(void* /*inout*/, void* /*scratch*/, unsigned int /*typesize*/,
+  libxsmm_blasint /*m*/, libxsmm_blasint /*n*/,
libxsmm_blasint /*ldi*/, libxsmm_blasint /*ldo*/, libxsmm_xcopykernel /*kernel*/);
+/**
+ * Same as libxsmm_itrans_scratch, but the transpose from "scratch" back into "inout"
+ * is carried out by the given (JIT'ted) kernel.
+ */
+LIBXSMM_API_INTERN void libxsmm_itrans_scratch_jit(void* inout, void* scratch, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, libxsmm_xcopykernel kernel)
+{
+  LIBXSMM_ASSERT(NULL != inout && 0 < typesize && m <= ldi && n <= ldo);
+  LIBXSMM_XCOPY_TILE(LIBXSMM_MCOPY_KERNEL, typesize, scratch, inout, ldi, m, 0, n, 0, m);
+  LIBXSMM_TCOPY_CALL(kernel, typesize, scratch, m, inout, ldo);
+}
+
+
+/**
+ * Transposes the matrices [begin, end) of a batch. The matrices are located by
+ * (1) stride != NULL and index_stride != 0: "stride" holds indexes (relative to index_base),
+ * (2) stride != NULL and index_stride == 0: "inout" is an array of pointers to matrices,
+ * (3) stride == NULL: consecutive matrices.
+ * A NULL scratch buffer selects the in-place transpose, which requires m == n and
+ * ldi == ldo (only asserted). Otherwise the matrix is staged in "scratch" and the
+ * JIT'ted kernel is used if one is given (kernel.ptr != NULL).
+ */
+LIBXSMM_API void libxsmm_itrans_internal(char* inout, void* scratch, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
+  libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride[],
+  libxsmm_xcopykernel kernel, libxsmm_blasint begin, libxsmm_blasint end)
+{
+#if !defined(LIBXSMM_XCOPY_JIT) || 0 == (LIBXSMM_XCOPY_JIT & 1)
+  LIBXSMM_UNUSED(kernel);
+#endif
+  if (NULL != stride) {
+    if (0 != index_stride) { /* stride array contains indexes */
+      libxsmm_blasint i;
+      if (NULL == scratch) { /* in-place transpose */
+        LIBXSMM_ASSERT(m == n && ldi == ldo);
+        for (i = begin * index_stride; i < (end * index_stride); i += index_stride) {
+          char *const mat = &inout[(LIBXSMM_ACCESS(const libxsmm_blasint, stride, i) - index_base) * typesize];
+          LIBXSMM_ITRANS(typesize, mat, ldi, m);
+        }
+      }
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+      else if (NULL != kernel.ptr) { /* out-of-place transpose using JIT'ted kernel */
+        for (i = begin * index_stride; i < (end * index_stride); i += index_stride) {
+          char *const mat = &inout[(LIBXSMM_ACCESS(const libxsmm_blasint, stride, i) - index_base) * typesize];
+          libxsmm_itrans_scratch_jit(mat, scratch, typesize, m, n, ldi, ldo, kernel);
+        }
+      }
+#endif
+      else { /* out-of-place transpose */
+        for (i = begin * index_stride; i < (end * index_stride); i += index_stride) {
+          char *const mat = &inout[(LIBXSMM_ACCESS(const libxsmm_blasint, stride, i) - index_base) * typesize];
+          libxsmm_itrans_scratch(mat, scratch, typesize, m, n, ldi, ldo);
+        }
+      }
+    }
+    else { /* array of pointers to matrices (singular stride is measured in Bytes) */
+      const libxsmm_blasint d = *stride - index_base * sizeof(void*);
+      const char *const endi = inout + (size_t)d * end;
+      char* i = inout + begin * (size_t)d;
+      if (NULL == scratch) { /* in-place transpose */
+        LIBXSMM_ASSERT(m == n && ldi == ldo);
+        for (; i < endi; i += d) {
+          void *const mat = *((void**)i);
+#if defined(LIBXSMM_BATCH_CHECK)
+          if (NULL != mat)
+#endif
+          LIBXSMM_ITRANS(typesize, mat, ldi, m);
+        }
+      }
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+      else if (NULL != kernel.ptr) { /* out-of-place transpose using JIT'ted kernel */
+        for (; i < endi; i += d) {
+          void *const mat = *((void**)i);
+# if defined(LIBXSMM_BATCH_CHECK)
+          if (NULL != mat)
+# endif
+          libxsmm_itrans_scratch_jit(mat, scratch, typesize, m, n, ldi, ldo, kernel);
+        }
+      }
+#endif
+      else { /* out-of-place transpose */
+        for (; i < endi; i += d) {
+          void *const mat = *((void**)i);
+#if defined(LIBXSMM_BATCH_CHECK)
+          if (NULL != mat)
+#endif
+          libxsmm_itrans_scratch(mat, scratch, typesize, m, n, ldi, ldo);
+        }
+      }
+    }
+  }
+  else { /* consecutive matrices */
+    /* NOTE(review): matrices are addressed in steps of "typesize" Bytes; confirm the intended distance */
+    libxsmm_blasint i;
+    if (NULL == scratch) { /* in-place transpose */
+      LIBXSMM_ASSERT(m == n && ldi == ldo);
+      for (i = begin; i < end; ++i) {
+        LIBXSMM_ITRANS(typesize, inout + (size_t)i * typesize, ldi, m);
+      }
+    }
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+    else if (NULL != kernel.ptr) { /* out-of-place transpose using JIT'ted kernel */
+      for (i = begin; i < end; ++i) {
+        libxsmm_itrans_scratch_jit(inout + (size_t)i * typesize, scratch, typesize, m, n, ldi, ldo, kernel);
+      }
+    }
+#endif
+    else { /* out-of-place transpose */
+      for (i = begin; i < end; ++i) {
+        libxsmm_itrans_scratch(inout + (size_t)i * typesize, scratch, typesize, m, n, ldi, ldo);
+      }
+    }
+  }
+}
+
+
+/**
+ * In-place transpose of a single matrix. Square matrices with ldi == ldo and a type-size
+ * of at most 127 are transposed directly; anything else is staged in a buffer of
+ * m * n * typesize Bytes, which lives on the stack up to LIBXSMM_ITRANS_BUFFER_MAXSIZE
+ * and is allocated otherwise. Invalid arguments or a failed allocation are reported once
+ * on stderr if libxsmm_verbosity is non-zero; the matrix is left untouched in that case.
+ */
+LIBXSMM_API void libxsmm_itrans(void* inout, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo)
+{
+  static int error_once = 0;
+  if (NULL != inout && 0 < typesize && m <= ldi && n <= ldo) {
+    if (m == n && ldi == ldo && typesize <= 127) { /* in-place transpose */
+      LIBXSMM_ITRANS(typesize, inout, ldi, m);
+    }
+    else { /* out-of-place transpose */
+      const libxsmm_blasint scratchsize = m * n * typesize;
+      if (scratchsize <= LIBXSMM_ITRANS_BUFFER_MAXSIZE) {
+        char buffer[LIBXSMM_ITRANS_BUFFER_MAXSIZE];
+        libxsmm_itrans_scratch(inout, buffer, typesize, m, n, ldi, ldo);
+      }
+      else {
+        void* buffer = NULL;
+        LIBXSMM_INIT
+        if (EXIT_SUCCESS == libxsmm_xmalloc(&buffer, scratchsize, 0/*auto-align*/,
+          LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE,
+          0/*extra*/, 0/*extra_size*/))
+        {
+          LIBXSMM_ASSERT(NULL != buffer);
+          libxsmm_itrans_scratch(inout, buffer, typesize, m, n, ldi, ldo);
+          libxsmm_xfree(buffer, 0/*no check*/);
+        }
+        else if (0 != libxsmm_verbosity /* library code is expected to be mute */
+          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+        {
+          fprintf(stderr, "LIBXSMM ERROR: failed to allocate buffer for in-place transpose!\n");
+        }
+      }
+    }
+  }
+  else if (0 != libxsmm_verbosity /* library code is expected to be mute */
+    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+  {
+    fprintf(stderr, "LIBXSMM ERROR: invalid argument(s) for in-place transpose!\n");
+  }
+}
+
+
+/**
+ * In-place transpose of a batch of matrices; task "tid" out of "ntasks" processes its
+ * share of abs(batchsize) matrices (see libxsmm_itrans_internal for how matrices are
+ * located). Requires inout != NULL, 0 < typesize, m <= ldi, n <= ldo, and 0 <= tid < ntasks.
+ * Non-square matrices (or ldi != ldo, or a type-size above 127) are staged in a scratch
+ * buffer of m * n * typesize Bytes; if this buffer cannot be allocated, no matrix is
+ * touched. Errors are reported once on stderr if libxsmm_verbosity is non-zero.
+ */
+LIBXSMM_API void libxsmm_itrans_batch(void* inout, unsigned int typesize,
+  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
+  libxsmm_blasint index_base, libxsmm_blasint index_stride,
+  const libxsmm_blasint stride[], libxsmm_blasint batchsize,
+  /*unsigned*/int tid, /*unsigned*/int ntasks)
+{
+  static int error_once = 0;
+  if (NULL != inout && 0 < typesize && m <= ldi && n <= ldo
+    /* use (signed) integer types, but check sanity of input (ntasks is used as divisor) */
+    && 0 <= tid && tid < ntasks)
+  {
+    const libxsmm_blasint scratchsize = m * n * typesize;
+    const libxsmm_blasint size = LIBXSMM_ABS(batchsize);
+    const libxsmm_blasint tasksize = LIBXSMM_UPDIV(size, ntasks);
+    const libxsmm_blasint begin = tid * tasksize, span = begin + tasksize;
+    const libxsmm_blasint end = LIBXSMM_MIN(span, size);
+    char buffer[LIBXSMM_ITRANS_BUFFER_MAXSIZE];
+    char *const mat0 = (char*)inout;
+    void* scratch = NULL;
+    libxsmm_xcopykernel kernel = { NULL };
+    int result = EXIT_SUCCESS;
+    if (m != n || ldi != ldo || 127 < typesize) {
+      if (scratchsize <= LIBXSMM_ITRANS_BUFFER_MAXSIZE) {
+        scratch = buffer;
+      }
+      else {
+        LIBXSMM_INIT
+        result = libxsmm_xmalloc(&scratch, scratchsize, 0/*auto-align*/,
+          LIBXSMM_MALLOC_FLAG_SCRATCH | LIBXSMM_MALLOC_FLAG_PRIVATE,
+          0/*extra*/, 0/*extra_size*/);
+        if (EXIT_SUCCESS != result
+          && 0 != libxsmm_verbosity /* library code is expected to be mute */
+          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+        {
+          fprintf(stderr, "LIBXSMM ERROR: failed to allocate buffer for in-place transpose!\n");
+        }
+      }
+#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT & 1))
+      if (EXIT_SUCCESS == result /* no kernel needed if the transpose is skipped */
+        && 0 != (1 & libxsmm_xcopy_jit) /* JIT'ted transpose permitted? */
+        /* avoid outgrown transpose kernel upfront */
+        && (m <= LIBXSMM_CONFIG_MAX_DIM || n <= LIBXSMM_CONFIG_MAX_DIM))
+      {
+        switch (typesize) {
+          case 8: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+            LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64, LIBXSMM_DATATYPE_F64,
+            LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+            break;
+          case 4: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+            LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
+            LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+            break;
+          case 2: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+            LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16, LIBXSMM_DATATYPE_I16,
+            LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+            break;
+          case 1: kernel.function = libxsmm_dispatch_meltw_unary(m, n, &ldi, &ldo,
+            LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8, LIBXSMM_DATATYPE_I8,
+            LIBXSMM_MELTW_FLAG_UNARY_NONE, LIBXSMM_MELTW_TYPE_UNARY_TRANSFORM_NORM_TO_NORMT);
+            break;
+        }
+      }
+#endif
+    }
+    /* Without the scratch buffer, a NULL scratch would be taken as a request for the
+     * in-place transpose, which is only valid for square matrices: skip the work instead.
+     */
+    if (EXIT_SUCCESS == result) {
+      libxsmm_itrans_internal(mat0, scratch, typesize, m, n, ldi, ldo, index_base,
+        index_stride, stride, kernel, begin, end);
+      if (NULL != scratch && LIBXSMM_ITRANS_BUFFER_MAXSIZE < scratchsize) {
+        libxsmm_xfree(scratch, 0/*no check*/);
+      }
+    }
+  }
+  else if (0 != libxsmm_verbosity /* library code is expected to be mute */
+    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
+  {
+    fprintf(stderr, "LIBXSMM ERROR: invalid argument(s) for in-place batch-transpose!\n");
+  }
+}
+
+
+#if defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))
+
+/* implementation provided for Fortran 77 compatibility */
+LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matcopy)(void* /*out*/, const void* /*in*/, const int* /*typesize*/,
+  const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/);
+LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_matcopy)(void* out, const void* in, const int* typesize, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) +{ + libxsmm_blasint ldx; + LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); + ldx = *(NULL != ldi ? ldi : m); + libxsmm_matcopy(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_otrans)(void* /*out*/, const void* /*in*/, const int* /*typesize*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_otrans)(void* out, const void* in, const int* typesize, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) +{ + libxsmm_blasint ldx; + LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); + ldx = *(NULL != ldi ? ldi : m); + libxsmm_otrans(out, in, (unsigned int)*typesize, *m, *(NULL != n ? n : m), ldx, NULL != ldo ? *ldo : ldx); +} + + +/* implementation provided for Fortran 77 compatibility */ +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_itrans)(void* /*inout*/, const int* /*typesize*/, + const libxsmm_blasint* /*m*/, const libxsmm_blasint* /*n*/, const libxsmm_blasint* /*ldi*/, const libxsmm_blasint* /*ldo*/); +LIBXSMM_API void LIBXSMM_FSYMBOL(libxsmm_itrans)(void* inout, const int* typesize, + const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* ldi, const libxsmm_blasint* ldo) +{ + const libxsmm_blasint nvalue = *(NULL != n ? n : m); + LIBXSMM_ASSERT(NULL != typesize && 0 < *typesize && NULL != m); + libxsmm_itrans(inout, (unsigned int)*typesize, *m, nvalue, *(NULL != ldi ? ldi : m), NULL != ldo ? 
*ldo : nvalue); +} + +#endif /*defined(LIBXSMM_BUILD) && (!defined(LIBXSMM_NOFORTRAN) || defined(__clang_analyzer__))*/ + diff --git a/third_party/libxsmm/src/libxsmm_xcopy.h b/third_party/libxsmm/src/libxsmm_xcopy.h new file mode 100644 index 00000000..e9ccce6a --- /dev/null +++ b/third_party/libxsmm/src/libxsmm_xcopy.h @@ -0,0 +1,286 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ +#ifndef LIBXSMM_XCOPY_H +#define LIBXSMM_XCOPY_H + +#include +#include "libxsmm_main.h" + +#if !defined(LIBXSMM_XCOPY_CHECK) && !defined(NDEBUG) +# define LIBXSMM_XCOPY_CHECK +#endif +#if !defined(LIBXSMM_ITRANS_BUFFER_MAXSIZE) +# if defined(NDEBUG) +# define LIBXSMM_ITRANS_BUFFER_MAXSIZE (12 << 10/*12kB*/) +# else +# define LIBXSMM_ITRANS_BUFFER_MAXSIZE 1 +# endif +#endif +#if !defined(LIBXSMM_XCOPY_TASKSCALE) +# define LIBXSMM_XCOPY_TASKSCALE 2 +#endif +#if !defined(LIBXSMM_XCOPY_TILE_MIN) +# define LIBXSMM_XCOPY_TILE_MIN 2 +#endif +/* 0: none, 1: transpose, 2: matcopy, 3: transpose+matcopy */ +#if defined(LIBXSMM_PLATFORM_X86) +# if !defined(LIBXSMM_XCOPY_JIT) +# if (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBXSMM_XCOPY_JIT 0 +# elif defined(NDEBUG) +# define LIBXSMM_XCOPY_JIT 0 +# else +# define LIBXSMM_XCOPY_JIT 3 +# endif +# endif +#else +# define LIBXSMM_XCOPY_JIT 0 +#endif + +/* kernel uses consecutive stores */ +#define LIBXSMM_MZERO_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ + static /*const*/ TYPE libxsmm_mzero_kernel_src_value_ /* zero */; \ + 
const TYPE *const SRC = &libxsmm_mzero_kernel_src_value_; \ + TYPE *const DST = (TYPE*)(((char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) +/* kernel uses consecutive stores and consecutive loads (copy) */ +#define LIBXSMM_MCOPY_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ + const TYPE *const SRC = (const TYPE*)(((const char*) (IN)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDI) + (INDEX_J))); \ + TYPE *const DST = ( TYPE*)((( char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) + +#define LIBXSMM_MZERO_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ + libxsmm_meltw_unary_param libxsmm_mzero_call_args_; \ + libxsmm_mzero_call_args_.in.primary = (void*)(SRC); \ + libxsmm_mzero_call_args_.out.primary = (DST); \ + (KERNEL).function(&libxsmm_mzero_call_args_); \ + LIBXSMM_UNUSED(LDO); \ +} +#define LIBXSMM_MCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ + libxsmm_meltw_unary_param libxsmm_mcopy_call_args_; \ + libxsmm_mcopy_call_args_.in.primary = (void*)(SRC); \ + libxsmm_mcopy_call_args_.out.primary = (DST); \ + (KERNEL).function(&libxsmm_mcopy_call_args_); \ + LIBXSMM_UNUSED(LDO); \ +} + +/* kernel uses consecutive stores and strided loads (transpose) */ +#define LIBXSMM_TCOPY_KERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, INDEX_I, INDEX_J, SRC, DST) \ + const TYPE *const SRC = (const TYPE*)(((const char*) (IN)) + (TYPESIZE) * ((size_t)(INDEX_J) * (LDI) + (INDEX_I))); \ + TYPE *const DST = ( TYPE*)((( char*)(OUT)) + (TYPESIZE) * ((size_t)(INDEX_I) * (LDO) + (INDEX_J))) + +/* call JIT-kernel (transpose) */ +#define LIBXSMM_TCOPY_CALL(KERNEL, TYPESIZE, SRC, LDI, DST, LDO) { \ + libxsmm_meltw_unary_param libxsmm_tcopy_call_args_; \ + libxsmm_tcopy_call_args_.in.primary = (void*)(SRC); \ + libxsmm_tcopy_call_args_.out.primary = (DST); \ + (KERNEL).function(&libxsmm_tcopy_call_args_); \ + LIBXSMM_UNUSED(LDO); \ +} + +#define LIBXSMM_XCOPY_LOOP(TYPE, TYPESIZE, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1) { \ + 
libxsmm_blasint libxsmm_xcopy_loop_i_, libxsmm_xcopy_loop_j_; \ + for (libxsmm_xcopy_loop_i_ = M0; libxsmm_xcopy_loop_i_ < (libxsmm_blasint)(M1); ++libxsmm_xcopy_loop_i_) { \ + LIBXSMM_PRAGMA_NONTEMPORAL(OUT) \ + for (libxsmm_xcopy_loop_j_ = N0; libxsmm_xcopy_loop_j_ < (libxsmm_blasint)(N1); ++libxsmm_xcopy_loop_j_) { \ + XKERNEL(TYPE, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_loop_i_, libxsmm_xcopy_loop_j_, \ + libxsmm_xcopy_loop_src_, libxsmm_xcopy_loop_dst_); *libxsmm_xcopy_loop_dst_ = *libxsmm_xcopy_loop_src_; \ + } \ + } \ +} + +#define LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) { \ + switch(TYPESIZE) { \ + case 2: { \ + LIBXSMM_XCOPY_LOOP(short, 2, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ + } break; \ + case 4: { \ + LIBXSMM_XCOPY_LOOP(float, 4, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ + } break; \ + case 8: { \ + LIBXSMM_XCOPY_LOOP(double, 8, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ + } break; \ + case 16: { \ + typedef struct /*libxsmm_xcopy_tile_elem_t*/ { double value[2]; } libxsmm_xcopy_tile_elem_t; \ + LIBXSMM_XCOPY_LOOP(libxsmm_xcopy_tile_elem_t, 16, XKERNEL, OUT, IN, LDI, LDO, M0, M1, N0, N1); \ + } break; \ + default: { /* generic type-size */ \ + libxsmm_blasint libxsmm_xcopy_tile_i_, libxsmm_xcopy_tile_j_; \ + for (libxsmm_xcopy_tile_i_ = M0; libxsmm_xcopy_tile_i_ < (libxsmm_blasint)(M1); ++libxsmm_xcopy_tile_i_) { \ + for (libxsmm_xcopy_tile_j_ = N0; libxsmm_xcopy_tile_j_ < (libxsmm_blasint)(N1); ++libxsmm_xcopy_tile_j_) { \ + XKERNEL(char, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_tile_i_, libxsmm_xcopy_tile_j_, \ + libxsmm_xcopy_tile_src_, libxsmm_xcopy_tile_dst_); \ + LIBXSMM_MEMCPY127_LOOP(libxsmm_xcopy_tile_dst_, libxsmm_xcopy_tile_src_, TYPESIZE, LIBXSMM_PRAGMA_NONTEMPORAL); \ + } \ + } \ + } \ + } \ +} + +#define LIBXSMM_ITRANS_LOOP(TYPE, INOUT, LD, M) { \ + libxsmm_blasint libxsmm_itrans_loop_i_, libxsmm_itrans_loop_j_; \ + LIBXSMM_ASSERT(NULL != (INOUT) && (M) <= (LD)); \ + for 
(libxsmm_itrans_loop_i_ = 0; libxsmm_itrans_loop_i_ < (M); ++libxsmm_itrans_loop_i_) { \ + for (libxsmm_itrans_loop_j_ = 0; libxsmm_itrans_loop_j_ < libxsmm_itrans_loop_i_; ++libxsmm_itrans_loop_j_) { \ + TYPE *const libxsmm_itrans_loop_a_ = ((TYPE*)(INOUT)) + (size_t)(LD) * libxsmm_itrans_loop_i_ + libxsmm_itrans_loop_j_; \ + TYPE *const libxsmm_itrans_loop_b_ = ((TYPE*)(INOUT)) + (size_t)(LD) * libxsmm_itrans_loop_j_ + libxsmm_itrans_loop_i_; \ + LIBXSMM_ISWAP(*libxsmm_itrans_loop_a_, *libxsmm_itrans_loop_b_); \ + } \ + } \ +} + +#define LIBXSMM_ITRANS(TYPESIZE, INOUT, LD, M) { \ + switch(TYPESIZE) { \ + case 2: { \ + LIBXSMM_ITRANS_LOOP(short, INOUT, LD, M); \ + } break; \ + case 4: { \ + LIBXSMM_ITRANS_LOOP(int, INOUT, LD, M); \ + } break; \ + case 8: { \ + LIBXSMM_ITRANS_LOOP(int64_t, INOUT, LD, M); \ + } break; \ + default: { /* generic type-size */ \ + const signed char libxsmm_itrans_c_ = (signed char)(TYPESIZE); \ + libxsmm_blasint libxsmm_itrans_i_, libxsmm_itrans_j_; \ + LIBXSMM_ASSERT(NULL != (INOUT) && (M) <= (LD)); \ + LIBXSMM_ASSERT(0 < (TYPESIZE) && (TYPESIZE) <= 127); \ + for (libxsmm_itrans_i_ = 0; libxsmm_itrans_i_ < (M); ++libxsmm_itrans_i_) { \ + for (libxsmm_itrans_j_ = 0; libxsmm_itrans_j_ < libxsmm_itrans_i_; ++libxsmm_itrans_j_) { \ + char *const libxsmm_itrans_a_ = &((char*)(INOUT))[((LD)*libxsmm_itrans_i_+libxsmm_itrans_j_)*(TYPESIZE)]; \ + char *const libxsmm_itrans_b_ = &((char*)(INOUT))[((LD)*libxsmm_itrans_j_+libxsmm_itrans_i_)*(TYPESIZE)]; \ + signed char libxsmm_itrans_k_ = 0; \ + for (; libxsmm_itrans_k_ < libxsmm_itrans_c_; ++libxsmm_itrans_k_) { \ + LIBXSMM_ISWAP( \ + libxsmm_itrans_a_[libxsmm_itrans_k_], \ + libxsmm_itrans_b_[libxsmm_itrans_k_]); \ + } \ + } \ + } \ + } \ + } \ +} + +#define LIBXSMM_MZERO_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, N0, N1, M0, M1) +#define LIBXSMM_MCOPY_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, 
N1) \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, N0, N1, M0, M1) +#define LIBXSMM_TCOPY_KERNEL_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) + +#define LIBXSMM_XCOPY_NONJIT(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) \ + LIBXSMM_CONCATENATE(XKERNEL,_TILE)(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, M0, M1, N0, N1) + +#if 1 +# define LIBXSMM_XCOPY_PRECOND(COND) +#else +# define LIBXSMM_XCOPY_PRECOND(COND) COND +#endif + +#define LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) { \ + libxsmm_blasint libxsmm_xcopy_i_ = M0, libxsmm_xcopy_j_ = N0; \ + LIBXSMM_ASSERT_MSG(0 < (TILE_M) && 0 < (TILE_N), "XCOPY cannot make progress"); \ + if (NULL != (KERNEL).ptr) { /* inner tiles with JIT */ \ + for (; libxsmm_xcopy_i_ < (((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ + for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N) + 1); libxsmm_xcopy_j_ += TILE_N) { \ + XKERNEL(char, TYPESIZE, OUT, IN, LDI, LDO, libxsmm_xcopy_i_, libxsmm_xcopy_j_, libxsmm_xcopy_src_, libxsmm_xcopy_dst_); \ + KERNEL_CALL(KERNEL, TYPESIZE, libxsmm_xcopy_src_, LDI, libxsmm_xcopy_dst_, LDO); \ + } \ + } \ + } \ + else { /* inner tiles without JIT */ \ + for (; libxsmm_xcopy_i_ < (((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ + for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N) + 1); libxsmm_xcopy_j_ += TILE_N) { \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ + libxsmm_xcopy_i_, libxsmm_xcopy_i_ + (TILE_M), \ + libxsmm_xcopy_j_, libxsmm_xcopy_j_ + (TILE_N)); \ + } \ + } \ + } \ + { /* remainder/border tiles */ \ + LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_j_ < ((libxsmm_blasint)N1))) { \ + for (libxsmm_xcopy_i_ = M0; libxsmm_xcopy_i_ < 
(((libxsmm_blasint)M1) - ((libxsmm_blasint)TILE_M) + 1); libxsmm_xcopy_i_ += TILE_M) { \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ + libxsmm_xcopy_i_, libxsmm_xcopy_i_ + (TILE_M), \ + libxsmm_xcopy_j_, N1); \ + } \ + } \ + LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_i_ < ((libxsmm_blasint)M1))) { \ + for (libxsmm_xcopy_j_ = N0; libxsmm_xcopy_j_ < (((libxsmm_blasint)N1) - ((libxsmm_blasint)TILE_N)); libxsmm_xcopy_j_ += TILE_N) { \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ + libxsmm_xcopy_i_, M1, \ + libxsmm_xcopy_j_, libxsmm_xcopy_j_ + (TILE_N)); \ + } \ + } \ + LIBXSMM_XCOPY_PRECOND(if (libxsmm_xcopy_i_ < ((libxsmm_blasint)M1) && libxsmm_xcopy_j_ < ((libxsmm_blasint)N1))) { \ + LIBXSMM_XCOPY_TILE(XKERNEL, TYPESIZE, OUT, IN, LDI, LDO, \ + libxsmm_xcopy_i_, M1, \ + libxsmm_xcopy_j_, N1); \ + } \ + } \ +} + +#define LIBXSMM_MZERO_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ + LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_N, TILE_M, N0, N1, M0, M1) +#define LIBXSMM_MCOPY_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ + LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_N, TILE_M, N0, N1, M0, M1) +#define LIBXSMM_TCOPY_KERNEL_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ + LIBXSMM_XCOPY_TILES(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) + +#define LIBXSMM_XCOPY(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) \ + LIBXSMM_CONCATENATE(XKERNEL,_TILES)(XKERNEL, KERNEL_CALL, KERNEL, OUT, IN, TYPESIZE, LDI, LDO, TILE_M, TILE_N, M0, M1, N0, N1) + +/** Initializes the transpose functionality; NOT thread-safe. 
*/ +LIBXSMM_API_INTERN void libxsmm_xcopy_init(int archid); +/** Finalizes the transpose functionality; NOT thread-safe. */ +LIBXSMM_API_INTERN void libxsmm_xcopy_finalize(void); + +LIBXSMM_API void libxsmm_matcopy_task_internal(void* out, const void* in, unsigned int typesize, + unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, + unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, + int tid, int ntasks); +LIBXSMM_API void libxsmm_otrans_task_internal(void* out, const void* in, unsigned int typesize, + unsigned int m, unsigned int n, unsigned int ldi, unsigned int ldo, + unsigned int km, unsigned int kn, libxsmm_xcopykernel kernel, + int tid, int ntasks); + +LIBXSMM_API_INTERN void libxsmm_matcopy_internal(void* out, const void* in, + unsigned int typesize, unsigned int ldi, unsigned int ldo, + unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, + unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); +LIBXSMM_API_INTERN void libxsmm_matzero_internal(void* out, + unsigned int typesize, unsigned int ldo, + unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, + unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); +LIBXSMM_API_INTERN void libxsmm_otrans_internal(void* out, const void* in, + unsigned int typesize, unsigned int ldi, unsigned int ldo, + unsigned int m0, unsigned int m1, unsigned int n0, unsigned int n1, + unsigned int tm, unsigned int tn, libxsmm_xcopykernel kernel); +LIBXSMM_API void libxsmm_itrans_internal(char* inout, void* scratch, unsigned int typesize, + libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, + libxsmm_blasint index_base, libxsmm_blasint index_stride, const libxsmm_blasint stride[], + libxsmm_xcopykernel kernel, libxsmm_blasint begin, libxsmm_blasint end); + +#if (defined(LIBXSMM_XCOPY_JIT) && 0 != (LIBXSMM_XCOPY_JIT)) +/** Determines whether JIT-kernels are used or not; values see LIBXSMM_XCOPY_JIT. 
*/ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_xcopy_jit); +#endif +/** Determines if OpenMP tasks are used, and scales beyond the number of threads. */ +LIBXSMM_APIVAR_PUBLIC(int libxsmm_xcopy_taskscale); +/** M-extent of type-size in Byte. */ +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mcopy_mbytes); +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_mzero_mbytes); +LIBXSMM_APIVAR_PUBLIC(unsigned int libxsmm_tcopy_mbytes); +/** M-factor shaping the N-extent. */ +LIBXSMM_APIVAR_PUBLIC(float libxsmm_mcopy_nscale); +LIBXSMM_APIVAR_PUBLIC(float libxsmm_mzero_nscale); +LIBXSMM_APIVAR_PUBLIC(float libxsmm_tcopy_nscale); + +#endif /*LIBXSMM_XCOPY_H*/ + diff --git a/third_party/libxsmm/src/template/libxsmm_config.h b/third_party/libxsmm/src/template/libxsmm_config.h new file mode 100644 index 00000000..bfb98616 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_config.h @@ -0,0 +1,44 @@ +#ifndef LIBXSMM_CONFIG_H +#define LIBXSMM_CONFIG_H + +#if !defined(LIBXSMM_DEFAULT_CONFIG) && defined(LIBXSMM_SOURCE_H) && !defined(LIBXSMM_CONFIGURED) +# define LIBXSMM_DEFAULT_CONFIG +#endif +#if !defined(LIBXSMM_DEFAULT_CONFIG) && defined(_WIN32) +# define LIBXSMM_DEFAULT_CONFIG +#endif + +#if !defined(LIBXSMM_DEFAULT_CONFIG) && (!defined(LIBXSMM_SOURCE_H) || defined(LIBXSMM_CONFIGURED)) +# include "libxsmm_version.h" +$LIBXSMM_OFFLOAD_BUILD +$MNK_PREPROCESSOR_LIST +#else +# define LIBXSMM_CONFIG_VERSION "" +# define LIBXSMM_CONFIG_BRANCH "" +# define LIBXSMM_CONFIG_VERSION_MAJOR INT_MAX +# define LIBXSMM_CONFIG_VERSION_MINOR INT_MAX +# define LIBXSMM_CONFIG_VERSION_UPDATE INT_MAX +# define LIBXSMM_CONFIG_VERSION_PATCH INT_MAX +# define LIBXSMM_CONFIG_BUILD_DATE INT_MAX +#endif + +#define LIBXSMM_CONFIG_CACHELINE $CACHELINE +#define LIBXSMM_CONFIG_ALIGNMENT $CACHELINE +#define LIBXSMM_CONFIG_MALLOC $MALLOC +#define LIBXSMM_CONFIG_ILP64 $ILP64 +#define LIBXSMM_CONFIG_SYNC $SYNC +#define LIBXSMM_CONFIG_JIT $JIT + +#define LIBXSMM_CONFIG_PREFETCH $PREFETCH +#define LIBXSMM_CONFIG_MAX_MNK 
$MAX_MNK +#define LIBXSMM_CONFIG_MAX_DIM $MAX_DIM +#define LIBXSMM_CONFIG_AVG_DIM $AVG_DIM +#define LIBXSMM_CONFIG_MAX_M $MAX_M +#define LIBXSMM_CONFIG_MAX_N $MAX_N +#define LIBXSMM_CONFIG_MAX_K $MAX_K +#define LIBXSMM_CONFIG_FLAGS $FLAGS +#define LIBXSMM_CONFIG_ALPHA $ALPHA +#define LIBXSMM_CONFIG_BETA $BETA +#define LIBXSMM_CONFIG_WRAP $WRAP + +#endif diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_define.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_define.tpl.c new file mode 100644 index 00000000..fd0c448b --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_define.tpl.c @@ -0,0 +1,95 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +#if 0 +#define USE_CLDEMOTE +#define WR_PREFETCH_OUTPUT +#endif + +#if defined(LIBXSMM_DNN_BF16_USE_CPX_AVX512_NI) +# define LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( A ) (__m256i)_mm512_cvtneps_pbh( A ) +# define LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( A, B ) (__m512i)_mm512_cvtne2ps_pbh( A, B ) +#else +# define LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( A ) LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16( A ) +# define LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( A, B ) LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16( A, B ) +#endif + +#ifdef WR_PREFETCH_OUTPUT +#define prefetchwt_chunk(ptr, nbytes) do { \ + int __i; \ + for (__i = 0; __i < nbytes; __i += 64) { \ + _mm_prefetch((char*)ptr+__i, _MM_HINT_ET0); \ + } \ +} while(0) +#endif + +#ifdef USE_CLDEMOTE +#define LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(in, out, length) do { \ + unsigned int full_chunks = length / 32; \ + unsigned int remainder = length % 32; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 32) { \ + _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ + _mm_cldemote((libxsmm_bfloat16*)out+__i); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 32; \ + _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ + _mm_cldemote((libxsmm_bfloat16*)out+__i); \ + } \ + libxsmm_rne_convert_fp32_bf16((const float*)in+32*full_chunks, (element_output_type*)out+32*full_chunks, remainder); \ + _mm_cldemote((libxsmm_bfloat16*)out+32*full_chunks); \ + } \ +} while(0) +#else +#define LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(in, out, length) do { \ + unsigned int full_chunks = 
length / 32; \ + unsigned int remainder = length % 32; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 32) { \ + _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 32; \ + _mm512_storeu_si512((libxsmm_bfloat16*)out+__i, LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i+16), LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i))); \ + } \ + libxsmm_rne_convert_fp32_bf16((const float*)in+32*full_chunks, (element_output_type*)out+32*full_chunks, remainder); \ + } \ +} while(0) +#endif + +#define LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32(in, out, length) do { \ + unsigned int full_chunks = length / 16; \ + unsigned int remainder = length % 16; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 16) { \ + _mm512_storeu_ps( (float*)out+__i, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS( _mm256_loadu_si256((__m256i*)((const libxsmm_bfloat16*)in+__i)))); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 16; \ + _mm512_storeu_ps( (float*)out+__i, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS( _mm256_loadu_si256((__m256i*)((const libxsmm_bfloat16*)in+__i)))); \ + } \ + libxsmm_convert_bf16_f32((const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ + } \ +} while(0) + +#define _mm512_loadcvt_bf16_fp32(A) LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)(A))) +#define _mm512_storecvt_fp32_bf16(A,B) _mm256_storeu_si256((__m256i*)(A),(__m256i)LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH((B))) +#define _mm512_streamstorecvt_fp32_bf16(A,B) _mm256_stream_si256((__m256i*)(A), (__m256i)LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH((B))) + diff --git 
a/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_undefine.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_undefine.tpl.c new file mode 100644 index 00000000..f5e2a127 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_bf16_macros_undefine.tpl.c @@ -0,0 +1,28 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +#undef LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16 +#undef LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32 +#undef LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH +#undef LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH +#undef _mm512_loadcvt_bf16_fp32 +#undef _mm512_storecvt_fp32_bf16 +#undef _mm512_streamstorecvt_fp32_bf16 + +#ifdef USE_CLDEMOTE +#undef USE_CLDEMOTE +#endif + +#ifdef WR_PREFETCH_OUTPUT +#undef prefetchwt_chunk +#undef WR_PREFETCH_OUTPUT +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c new file mode 100644 index 00000000..5afa0f7f --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic.tpl.c @@ -0,0 +1,177 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Rajkishore Barik, Ankush Mandal, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +int imgifm1, img, ofm1, ifm1, oj, ij, oi, ii, kj, ki, ifm2, ofm2, ifm1ofm1; +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could be run in parallel */ +const int work = handle->desc.N * handle->blocksifm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks for transpose that could be run in parallel */ +int transpose_work = handle->blocksifm * handle->blocksofm; +/* compute chunk size */ +const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; +const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; + +/* offset pointer in case of physical padding */ +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; + +/* Weight and transpose_weight tensor declaration */ +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); +/* define weight pointer which has the correct format */ +element_filter_type* weight_base = 0; + +/* padding via stack allocated buffers */ +const int padded_w = handle->desc.W + (2 * handle->desc.pad_w); +const int padded_h = handle->desc.H + (2 * handle->desc.pad_h); +const int size_tls1 = padded_h * padded_w * handle->ifmblock; +element_input_type *const del_input_scratch_padding = (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) + ltid * size_tls1; +for ( ii = 0; ii < size_tls1; ++ii ) { del_input_scratch_padding[ii] = (element_input_type)0; } + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters, if requested */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) { + weight_base = (element_filter_type*)handle->reg_filter_tr->data; +} else { + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / handle->blocksifm; + ifm1 = ifm1ofm1 % handle->blocksifm; + for (kj=0; kj < handle->desc.R; kj++) { + for (ki=0; ki < handle->desc.S; ki++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(6, tr_wt, 
ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } + } + } + } + } + weight_base = (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} + +{/* open new scope for additional variable declarations (C89) */ +LIBXSMM_VLA_DECL(5, element_input_type, del_input, (element_output_type*)handle->grad_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); +LIBXSMM_VLA_DECL(3, element_input_type, del_input_padded, del_input_scratch_padding, padded_w, handle->ifmblock); +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + +for (imgifm1 = thr_begin; imgifm1 < thr_end; ++imgifm1) { + img = imgifm1 / handle->blocksifm; + ifm1 = imgifm1 % handle->blocksifm; + + /* check if we need padding, for now we do physical padding on the fly, however we can play with N parameter of the GEMM */ + /* @TODO: add variant which deals with multiple GEMMS by varying N to deal with padding */ + if ( (handle->desc.pad_h == handle->desc.pad_h_in) && (handle->desc.pad_w == handle->desc.pad_w_in) ) { + + /* reset result buffer to zero when intent is to overwrite when first block + of input channels should be convoluted */ + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + LIBXSMM_PRAGMA_SIMD + for (ij = 
0; ij < handle->ifhp*handle->ifwp*handle->ifmblock; ij++) { + temp_ptr[ij] = (element_input_type)0; + } + } + + /* run convolution */ + for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij + kj, ii + ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) ); + } + } + } + } + + /* zero rim in case of physical padding.... this code is extremely stupid and crappy as it requires a complicated if... */ + if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { + for ( ij = 0; ij < handle->ifhp; ij++ ) { + for ( ii = 0; ii < handle->ifwp; ii++ ) { + if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || + (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } else { + /* reset result buffer to zero when intent is to overwrite when first block + of input channels should be convoluted */ + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { + LIBXSMM_PRAGMA_SIMD + for (ij = 0; ij < size_tls1; ++ij) { + del_input_scratch_padding[ij] = (element_output_type)0; + } + } else { + for (ij = 0; ij < handle->desc.H; ij++) { + for (ii = 0; ii < handle->desc.W; ii++) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; 
ifm2++) { + LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + + /* run convolution */ + for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + kj, ii + ki, 0, padded_w, handle->ifmblock) ); + } + } + } + } + + /* input padding copy back */ + for (ij = 0; ij < handle->desc.H; ij++) { + for (ii = 0; ii < handle->desc.W; ii++) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock); + } + } + } + } +} /* end of imgifm1 loop */ + +} /* end of new scope for additional variable declarations (C89) */ + +libxsmm_barrier_wait(handle->barrier, ltid); diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c new file mode 100644 index 00000000..40f9fd0a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_fallback_generic_bf16.tpl.c @@ -0,0 +1,172 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ + +int imgifm1, img, ofm1, ifm1, oj, ij, oi, ii, kj, ki, ifm2, ofm2; +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; +/* auxiliary lp variables: feature-map block sizes divided by fm_lp_block (lp presumably stands for low precision) */ +int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +int lpb = handle->fm_lp_block; +unsigned long long n_blocks = handle->blocksofm; /* passed by address to every bf16fp32_brgemm_kernel call below; equals the number of ofm blocks */ + +/* number of tasks that could be run in parallel: one per (image, ifm block) pair */ +int task; +const int work = handle->desc.N * handle->blocksifm; +/* compute chunk size (work / threads, rounded up) */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end (half-open task range of this thread, clamped to work) */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks for transpose that could be run in parallel: one per (ifm block, ofm block, kj, ki) filter tile */ +int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; +/* compute chunk size (transpose_work / threads, rounded up) */ +const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end (clamped to transpose_work) */ +const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? 
(ltid * transpose_chunksize) : transpose_work; +const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; + +/* grad_output pointer advanced past the physical padding (pad_h_out rows and pad_w_out columns) */ +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; + +/* Weight (reg_filter) and transposed-weight (scratch) tensor views; the innermost dimension is the lpb pairing */ +LIBXSMM_VLA_DECL(7, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); +LIBXSMM_VLA_DECL(7, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + +/* weight pointer in the transposed format; assigned below depending on the BWD_NO_FILTER_TRANSPOSE option */ +element_filter_type* weight_base = 0; + +/* padding via a per-thread slice (size_tls1 floats, indexed by ltid) of the handle's scratch buffer; the slice is cleared right away */ +const int padded_w = handle->desc.W + (2 * handle->desc.pad_w); +const int padded_h = handle->desc.H + (2 * handle->desc.pad_h); +const int size_tls1 = padded_h * padded_w * handle->ifmblock; +float *const del_input_scratch_padding = (float*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) + ltid * size_tls1; +for ( ii = 0; ii < size_tls1; ++ii ) { del_input_scratch_padding[ii] = (float)0.0; } + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters into scratch unless BWD_NO_FILTER_TRANSPOSE is set; each task index is decoded into (ifm1, ofm1, kj, ki) */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * 
handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + /* tr_wt[ifm1][ofm1] receives wt[ofm1][ifm1] with the filter taps mirrored (R-1-kj, S-1-ki) and the lpb pairing moved from the ifm to the ofm index */ LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = + LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); + } + } + } + weight_base = (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} else { + /* caller supplied already transposed filters */ weight_base = (element_filter_type*)handle->reg_filter_tr->data; +} + +{/* open new scope for additional variable declarations (C89) */ +/* del_input views grad_input; the cast uses element_input_type so that it matches the declared element type of the view */ LIBXSMM_VLA_DECL(5, element_input_type, del_input, (element_input_type*)handle->grad_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); +LIBXSMM_VLA_DECL(3, float, del_input_padded, del_input_scratch_padding, padded_w, handle->ifmblock); +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(7, element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); +/* Auxiliary fp32 accumulators (located in scratch, same blocked layout as del_input) */ +float *del_inp_fp32 = (float*)((char*)handle->scratch + handle->bwd_lp_input_full_scratch_offset); +LIBXSMM_VLA_DECL(5, float, del_input_fp32, del_inp_fp32, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + +/* each task is one (img, ifm1) pair */ for (imgifm1 = thr_begin; imgifm1 < thr_end; ++imgifm1) { + img = imgifm1 / handle->blocksifm; + ifm1 = imgifm1 % handle->blocksifm; + + /* check if we need padding, for now we do physical padding on the fly, however we can 
play with N parameter of the GEMM */ + /* @TODO: add variant which deals with multiple GEMMS by varying N to deal with padding */ + if ( (handle->desc.pad_h == handle->desc.pad_h_in) && (handle->desc.pad_w == handle->desc.pad_w_in) ) { + + /* reset the fp32 accumulator of this (img, ifm1) block to zero when the OVERWRITE option is set; + NOTE(review): without OVERWRITE the fp32 scratch is used as found - confirm it is seeded elsewhere */ + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input_fp32, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + LIBXSMM_PRAGMA_SIMD + for (ij = 0; ij < handle->ifhp*handle->ifwp*handle->ifmblock; ij++) { + temp_ptr[ij] = (float)0.0; + } + } + + /* run convolution: one bf16fp32_brgemm_kernel call per output row oj and filter tap (kj, ki); the weight tap is addressed mirrored (R-1-kj, S-1-ki) and the result goes to the fp32 accumulator */ + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + bf16fp32_brgemm_kernel( &LIBXSMM_VLA_ACCESS(7, weight, ifm1, 0, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb), + &LIBXSMM_VLA_ACCESS(5, output, img, 0, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij + kj, ii + ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), &n_blocks ); + } + } + } + + /* Downconvert the whole fp32 (img, ifm1) block (ifhp * ifwp * ifmblock values) to bf16 in del_input */ + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + handle->ifhp * handle->ifwp * handle->ifmblock); + + /* zero rim in case of physical padding.... this code is extremely stupid and crappy as it requires a complicated if... 
*/ + if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { + for ( ij = 0; ij < handle->ifhp; ij++ ) { + for ( ii = 0; ii < handle->ifwp; ii++ ) { + if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || + (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + + } else { + /* always reset the padded per-thread fp32 tile to zero: unlike the branch above this is not + conditional on the OVERWRITE option, and previous del_input contents are not copied in */ + LIBXSMM_PRAGMA_SIMD + for (ij = 0; ij < size_tls1; ++ij) { + del_input_scratch_padding[ij] = (float)0.0; + } + + + /* run convolution into the padded fp32 tile (same call pattern as the non-padded branch) */ + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + bf16fp32_brgemm_kernel( &LIBXSMM_VLA_ACCESS(7, weight, ifm1, 0, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb), + &LIBXSMM_VLA_ACCESS(5, output, img, 0, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + kj, ii + ki, 0, padded_w, handle->ifmblock), &n_blocks ); + } + } + } + + /* copy back: downconvert the interior (H rows of W * ifmblock values, skipping pad_h / pad_w) of the padded tile row by row into del_input */ + for (ij = 0; ij < handle->desc.H; ij++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, handle->desc.pad_w, 0, padded_w, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + handle->desc.W * handle->ifmblock); + } + } +} /* end of imgifm1 loop */ + +} /* end of new scope for additional variable declarations (C89) */ + +libxsmm_barrier_wait(handle->barrier, ltid); diff 
--git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c new file mode 100644 index 00000000..7738322a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c @@ -0,0 +1,352 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, task, ifm1ofm1; +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); /* images per thread */ +int threads_per_image = handle->desc.threads / handle->desc.N; /* only consulted below when imgpt <= 1 */ +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ifm_start = 0; +int my_ifm_end = handle->blocksifm; + +/* Batch reduce related variables: address lists with a fixed capacity of 1024 entries and the block count; + NOTE(review): B_ptrs is typed element_input_type but is filled with addresses taken from the grad_output view - presumably both types coincide, confirm */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +/* number of tasks for transpose that could be run in parallel: one per (ifm block, ofm block, kj, ki) filter tile */ +int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; +/* compute chunk size (transpose_work / threads, rounded up) */ +int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end (clamped to transpose_work) */ +int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; +int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; +/* gradient-input view: with pack_input_bwd the kernels write a packed ofh x ofw image located in scratch, otherwise they write grad_input advanced past its physical padding (pad_h_in, pad_w_in) */ +const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; +const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; +element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; +LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +/* grad_output is viewed from its first element (no padding offset applied here) */ element_output_type *const out = (element_output_type*)handle->grad_output->data; +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + +/* Weight (reg_filter) and transposed-weight (scratch) tensor views */ +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); +/* weight pointer in the transposed format: reg_filter_tr when BWD_NO_FILTER_TRANSPOSE is set, otherwise the scratch copy filled below */ +element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters into scratch unless BWD_NO_FILTER_TRANSPOSE is set */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { + /* Special case of 64x64 transpose with JITed transpose: tr_kernel is invoked four times per filter tile, stepping the ifm offset of the source and the matching offset of the destination by 16 (0, 16, 32, 48) */ + if (handle->ifmblock == 64 && handle->ofmblock == 64) { + libxsmm_meltwfunction_unary tr_kernel = handle->tr_kernel; + libxsmm_meltw_unary_param trans_param; + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + /* decode the task index into (ifm1, ofm1, kj, ki) */ ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + trans_param.in.primary = &LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + trans_param.out.primary = &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + tr_kernel( &trans_param ); + trans_param.in.primary = &LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 16, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + trans_param.out.primary = &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 16, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + tr_kernel( 
&trans_param ); + /* third quarter: ifm offset 32 */ trans_param.in.primary = &LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 32, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + trans_param.out.primary = &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 32, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + tr_kernel( &trans_param ); + /* fourth quarter: ifm offset 48 */ trans_param.in.primary = &LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 48, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + trans_param.out.primary = &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 48, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + tr_kernel( &trans_param ); + } + } else { + /* generic path: the transpose is re-partitioned at (ofm block, ifm block) granularity; each task handles all R*S filter taps */ + transpose_work = handle->blocksifm * handle->blocksofm; + /* compute chunk size (transpose_work / threads, rounded up) */ + transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end (clamped to transpose_work) */ + transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / handle->blocksifm; + ifm1 = ifm1ofm1 % handle->blocksifm; + for (kj=0; kj < handle->desc.R; kj++) { + for (ki=0; ki < handle->desc.S; ki++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + /* swap the ifm/ofm roles and mirror the filter taps (R-1-kj, S-1-ki) */ LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } + } + } + } + } + } + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( imgpt <= 1 ) { /* at most one image per thread: threads_per_image threads share an image and split its ifm blocks */ + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myIfmId = ltid % threads_per_image; + nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); + my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); +} + +if ( handle->use_ifm_parallelization == 1 ) { /* optional override: groups of spread_out consecutive threads share an image range and split the ifm blocks */ + int spread_out = 0; + /* pick spread_out from {8, 4, 3, 2} as the first candidate (in that order) dividing N, else 1 */ if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + 
my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + } +} + +if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ + if ( handle->avoid_fmas_in_rim == 1) { /* variant issuing one kernel call per filter tap so that selected taps at the image border can be skipped or shifted */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* zero the ofh x ofw x ifmblock region of del_input for this (img, ifm1) once, before the first ofm block is processed */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments: the grad_output coordinates are shifted by (1 - pad_h_out, 1 - pad_w_out) */ + ij_use = oj; + ii_use = oi; + oj_use = oj - (1-handle->desc.pad_h_out); + oi_use = oi - (1-handle->desc.pad_w_out); + + if (kj == 0 && oj == 0) { + /* Do no FLOPS: first filter row at the first row */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS: last filter row at the last row */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = 
&LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + /* first column block with the first filter column: both operands start one column further right (oi_use + ki + 1, ii_use + 1) and br_gemm_kernel2 is used */ br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { /* last column block with the last filter column: unshifted operands, br_gemm_kernel2 */ + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { /* every other tap: regular br_gemm_kernel over blocksofm_blocking ofm blocks */ + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } + } + } else { /* regular variant: all R*S taps of blocksofm_blocking ofm blocks are gathered into one kernel call */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += 
handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* zero the ofh x ofw x ifmblock region of del_input for this (img, ifm1) once, before the first ofm block is processed; + oj and oi may be used as counters here because the loops below re-initialize them */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments: with spread_input_bwd the destination coordinates are scaled by the strides (u, v) */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? 
oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + /* gather one (weight, grad_output) address pair per ofm block and filter tap */ for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } + } +} + +if (handle->loop_order == 1) { /* loop nest: img, ifmb, ojb, oj, oi, ifm1, ofmb, ofm1 */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { + /* zero the ofh x ofw x ifmblock region of del_input for this (img, ifm1) at the first (oj, oi) tile; + ij_use/ii_use serve as counters (both are reassigned before use in the ofm1 loop below) because oj and oi + are the live indices of the enclosing loops and must not be modified here */ + for (ij_use = 0; ij_use < handle->ofh; ++ij_use) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, ij_use, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock)); + for (ii_use = 0; ii_use < handle->ofw; ++ii_use) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->ifmblock; + } + } + } + for (ofm1 = ofmb; ofm1 < 
LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + /* Prepare batch-reduce kernel arguments: with spread_input_bwd the destination coordinates are scaled by the strides (u, v) */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + /* gather one (weight, grad_output) address pair per ofm block and filter tap */ for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } +} + +if (handle->pack_input_bwd == 1) { /* unpack: copy the packed result (del_input, located in scratch) to the positions of grad_input whose row/column are multiples of the strides (u, v) and zero every other position */ + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, 
handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock); + } + } + } + } + } + } +} else if (handle->spread_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c new file mode 100644 index 00000000..efd2d68e --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c @@ -0,0 +1,407 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
+******************************************************************************/ +int img, ofm1, ofm2, ifm1, ifm2, oj, ojj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, task; +int last_ki, last_kj, next_kj; +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ifm_start = 0; +int my_ifm_end = handle->blocksifm; +int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +int lpb = handle->fm_lp_block; + +/* Batch reduce related variables */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +/* number of tasks for transpose that could be run in parallel */ +int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; +/* compute chunk size */ +int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; +int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; +/* offset output pointer in case of physical padding */ +const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; +const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; +const int ifwp_scratch = (handle->spread_input_bwd == 1) ? 
handle->desc.v * handle->bwd_ofw_rb : handle->bwd_ofw_rb; + +/* Auxiliary fp32 accumulators */ +float *del_inp_ptr; +float *del_inp_fp32 = (float*)((char*)handle->scratch + handle->bwd_lp_input_full_scratch_offset) + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; +LIBXSMM_VLA_DECL(5, float, del_input_fp32, del_inp_fp32, handle->blocksifm, IFH, IFW, handle->ifmblock); + +element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; +LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +element_output_type *const out = (element_output_type*)handle->grad_output->data; +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + +/* Weight and transpose_weight tensor declaration */ +LIBXSMM_VLA_DECL(7, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); +LIBXSMM_VLA_DECL(7, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + +/* define weight pointer which has the correct format */ +element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters, if requested */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = + LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); + } + } + } + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myIfmId = ltid % threads_per_image; + nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); + my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); +} + +if ( handle->use_ifm_parallelization == 
1 ) { + int spread_out = 0; + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + } +} + +if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; 
oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + ij_use = oj; + ii_use = oi; + oj_use = oj - (1-handle->desc.pad_h_out); + oi_use = oi - (1-handle->desc.pad_w_out); + last_kj = handle->desc.R-1; + last_ki = handle->desc.S-1; + next_kj = kj+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load_bwd == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { + del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, 
handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load_bwd == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { + del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } else { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, 
handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load_bwd == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { + del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += 
handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + if (handle->avoid_acc_load_bwd == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { + del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == handle->desc.S) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } + } + } +} + +if 
(handle->loop_order == 1) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) { */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { + int zero_oj, zero_oi; /* set output feature map (fp32 accumulator) to zero; dedicated counters are used because oj/oi drive the enclosing loops and must keep their values for the GEMM below */ + for (zero_oj = 0; zero_oj < handle->ofh; ++zero_oj) { + float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS( 5, del_input_fp32, img, ifm1, zero_oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (zero_oi = 0; zero_oi < handle->ofw; ++zero_oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += handle->ifmblock; + } + } + } + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + /* Prepare batch-reduce kernel arguments */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? 
oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + if (handle->avoid_acc_load_bwd == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), &n_blocks); + } else { + del_inp_ptr = &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == handle->desc.S) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } + } +} + +if (handle->pack_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; 
oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock); + } + } + } + } + } + } +} else if (handle->spread_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..36b7d4fd --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16_amx.tpl.c @@ -0,0 +1,530 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +int img, ofm1, ofm2, ifm1, ifm2, oj, ojj, oi, kj, ki, /*oi_use, oj_use, ii_use, ij_use, ofmb,*/ ifmb, ojb, myIfmId, nIfmBlocks, /*ind,*/ task; +/*int last_ki, last_kj, next_kj;*/ +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = (handle->desc.N + handle->desc.threads - 1)/handle->desc.threads; +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN( ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN( (ltid+1) * imgpt, handle->desc.N); +int my_ifm_start = 0; +int my_ifm_end = handle->blocksifm; +int ofmblock_lp = handle->ofmblock/handle->fm_lp_block; +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +int lpb = handle->fm_lp_block; + +/* Batch reduce related variables */ +#if 0 +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +#endif +unsigned long long n_blocks; + +/* number of tasks for transpose that could be run in parallel */ +int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; +/* compute chunk size */ +int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? 
(ltid * transpose_chunksize) : transpose_work; +int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; +/* offset output pointer in case of physical padding */ +const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; +const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; + +/* Auxiliary fp32 accumulators */ +float *out_ptr; +/*float *del_inp_fp32 = (float*)handle->scratch6 + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock;*/ +float *del_inp_scratch = (float*)((char*)handle->scratch + handle->bwd_lp_input_full_scratch_offset) + ltid * handle->bwd_gemm_pixels * handle->ifmblock; +/*LIBXSMM_VLA_DECL(5, float, del_input_fp32, del_inp_fp32, handle->blocksifm, IFH, IFW, handle->ifmblock);*/ +int scratch_ifwp = (handle->bwd_gemm_pixels == (handle->bwd_ofw_rb * handle->bwd_ofh_rb)) ? handle->bwd_ofw_rb : handle->ifwp; +LIBXSMM_VLA_DECL(3, float, scratch_fp32, del_inp_scratch, scratch_ifwp, handle->ifmblock); + +element_input_type *input_ptr = (handle->pack_input_bwd == 1) ? 
(element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock; +LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +element_output_type *const out = (element_output_type*)handle->grad_output->data; +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + +/* Weight and transpose_weight tensor declaration */ +LIBXSMM_VLA_DECL(7, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); +LIBXSMM_VLA_DECL(7, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + +/* define weight pointer which has the correct format */ +element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ? 
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* Execute the tileconfig kernel */ +tile_config_kernel(NULL, NULL, NULL); + +/* transpose filters, if requested */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { + if ((handle->ifmblock % 16 == 0) && (handle->ofmblock % 16 == 0)) { + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + bf16_vnni_transpose_kernel( &LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb), + &LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb), + handle->ifmblock, handle->ofmblock, handle->ifmblock, handle->ofmblock); + } + } else { + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * 
handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(7, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2/lpb, ifm2, ofm2%lpb, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb) = + LIBXSMM_VLA_ACCESS(7, wt, ofm1, ifm1, kj, ki, ifm2/lpb, ofm2, ifm2%lpb, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, lpb); + } + } + } + } +} +/* wait for transpose to finish */ + +libxsmm_barrier_wait(handle->barrier, ltid); + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN( ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN( my_img_start + 1, handle->desc.N); + myIfmId = ltid % threads_per_image; + nIfmBlocks = (handle->blocksifm + threads_per_image - 1) / threads_per_image; + my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); +} + +if ( handle->use_ifm_parallelization == 1 ) { + int spread_out = 0; + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ifmpt = (handle->blocksifm+spread_out-1)/spread_out; + int ifm_id = ltid % spread_out; + imgpt = ((handle->desc.N + handle->desc.threads - 1)/handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN( tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN( (tile_id+1) * imgpt, handle->desc.N); + my_ifm_start = LIBXSMM_MIN( ifm_id * ifmpt, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN( (ifm_id+1) * ifmpt, handle->blocksifm); + } +} + +n_blocks = (unsigned long 
long)handle->blocksofm_blocking * handle->desc.R * handle->desc.S; +out_ptr = (float*) &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ifwp, handle->ifmblock); + +#if 1 +if (handle->desc.R == 1 && handle->desc.S == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Batch-reduce GEMM call */ + br_gemm_kernel_strd( &LIBXSMM_VLA_ACCESS(7, weight, ifm1, 0, 0, 0, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb), + &LIBXSMM_VLA_ACCESS(5, output, img, 0, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), out_ptr, &n_blocks); + /* Downconvert accumulated tiles to BF16 */ + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, scratch_ifwp, handle->ifmblock), &LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj+ojj, oi, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } + } + } + } +} +else { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Batch-reduce GEMM call */ + br_gemm_kernel_offs( &LIBXSMM_VLA_ACCESS(7, weight, ifm1, 0, 0, 0, 0, 0, 0, 
handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb), + &LIBXSMM_VLA_ACCESS(5, output, img, 0, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), out_ptr, &n_blocks, handle->A_offsets_bwd, handle->B_offsets_bwd); + /* Downconvert accumulated tiles to BF16 */ + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, scratch_ifwp, handle->ifmblock), &LIBXSMM_VLA_ACCESS( 5, del_input, img, ifm1, oj+ojj, oi, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } + } + } + } +} + +if (handle->pack_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock); + } + } + } + } + } + } +} else if (handle->spread_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + 
((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } +} +#else +if (handle->loop_order == 0) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) {*/ + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (kj 
= 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + ij_use = oj; + ii_use = oi; + oj_use = oj - (1-handle->desc.pad_h_out); + oi_use = oi - (1-handle->desc.pad_w_out); + last_kj = handle->desc.R-1; + last_ki = handle->desc.S-1; + next_kj = kj+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + del_inp_ptr = (handle->avoid_acc_load_bwd == 1) ? &LIBXSMM_VLA_ACCESS(3, scratch_fp32, 0, 0, 0, ifwp_scratch, handle->ifmblock) + : &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (handle->avoid_acc_load_bwd == 1) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, ifwp_scratch, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + (handle->bwd_ofw_rb-1) * handle->ifmblock); + } + } else if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 
0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } + } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + del_inp_ptr = (handle->avoid_acc_load_bwd == 1) ? &LIBXSMM_VLA_ACCESS(3, scratch_fp32, 0, 0, 0, ifwp_scratch, handle->ifmblock) + : &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (handle->avoid_acc_load_bwd == 1) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, ifwp_scratch, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + (handle->bwd_ofw_rb-1) * handle->ifmblock); + } + } else if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * 
handle->ifmblock); + } + } + } else { + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + n_blocks = ind; + del_inp_ptr = (handle->avoid_acc_load_bwd == 1) ? &LIBXSMM_VLA_ACCESS(3, scratch_fp32, 0, 0, 0, ifwp_scratch, handle->ifmblock) + : &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (handle->avoid_acc_load_bwd == 1) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, ifwp_scratch, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } else if (ofm2 == handle->blocksofm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + handle->bwd_ofw_rb * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += 
handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float *temp_ptr = (float*)&LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += handle->ifmblock; + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + del_inp_ptr = (handle->avoid_acc_load_bwd == 1) ? 
&LIBXSMM_VLA_ACCESS(3, scratch_fp32, 0, 0, 0, ifwp_scratch, handle->ifmblock) + : &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (handle->avoid_acc_load_bwd == 1) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, ifwp_scratch, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } else if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == handle->desc.S) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } + } +} + +if (handle->loop_order == 1) { /* (loop_order == N_Kb_Cb_Hb_k_c_h_w) { */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float *temp_ptr = 
(float*)&LIBXSMM_VLA_ACCESS( 5, del_input_fp32, img, ifm1, oj, 0, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (float)0; + } + temp_ptr += handle->ifmblock; + } + } + } + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + /* Prepare batch-reduce kernel arguments */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ifm1, ofm2, kj, ki, 0, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, ofmblock_lp, handle->ifmblock, lpb); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + del_inp_ptr = (handle->avoid_acc_load_bwd == 1) ? 
&LIBXSMM_VLA_ACCESS(3, scratch_fp32, 0, 0, 0, ifwp_scratch, handle->ifmblock) + : &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); + if (handle->avoid_acc_load_bwd == 1) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, ifwp_scratch, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } else if (ofm2 == handle->blocksofm && kj == handle->desc.R && ki == handle->desc.S) { + for (ojj = 0; ojj < handle->bwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, del_input_fp32, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use+ojj, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + ifwp_scratch * handle->ifmblock); + } + } + } + } + } + } + } + } + } + } +} + +if (handle->pack_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) 
{ + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, oj/handle->desc.u, oi/handle->desc.v, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) ; + } + } + } + } + } + } +} else if (handle->spread_input_bwd == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->ifmblock, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, ifm1, oj, oi, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } +} +#endif + +handle->tilerelease_kernel(NULL, NULL, NULL); +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c new file mode 100644 index 00000000..22a3beeb --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_fallback_generic.tpl.c @@ -0,0 +1,191 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Rajkishore Barik, Ankush Mandal, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +int imgifm1, img, ofm1, ifm1, oj, ij, oi, ii, kj, ki, ifm2, ofm2, ifm1ofm1; +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could be run in parallel */ +const int work = handle->desc.N * handle->blocksifm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks for transpose that could be run in parallel */ +int transpose_work = handle->blocksifm * handle->blocksofm; +/* compute chunk size */ +const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; +const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; + +/* offset pointer in case of physical padding */ +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; + +/* Weight and transpose_weight tensor declaration */ +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif +LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); +/* define weight pointer which has the correct format */ +element_filter_type* weight_base = 0; + +/* padding via per-thread slices (indexed by ltid) of handle->scratch */ +const int padded_w = handle->desc.W + (2 * handle->desc.pad_w); +const int padded_h = handle->desc.H + (2 * handle->desc.pad_h); +const int size_tls1 = padded_h * padded_w * handle->ifmblock; +element_input_type *const del_input_scratch_padding = (element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) + ltid * size_tls1; +for ( ii = 0; ii < size_tls1; ++ii ) { del_input_scratch_padding[ii] = (element_input_type)0; } + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters, if requested */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) { + weight_base = (element_filter_type*)handle->reg_filter_tr->data; +} else { + for (ifm1ofm1 = 
transpose_thr_begin; 
ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / handle->blocksifm; + ifm1 = ifm1ofm1 % handle->blocksifm; + for (kj=0; kj < handle->desc.R; kj++) { + for (ki=0; ki < handle->desc.S; ki++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + } + } + } + } + } + weight_base = (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} + +{/* open new scope for additional variable declarations (C89) */ +LIBXSMM_VLA_DECL(5, element_input_type, del_input, (element_input_type*)handle->grad_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); /* cast matches the element type del_input is declared with */ +LIBXSMM_VLA_DECL(3, element_input_type, del_input_padded, del_input_scratch_padding, padded_w, handle->ifmblock); +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, 
handle->ofmblock, handle->ifmblock); + +for (imgifm1 = thr_begin; 
imgifm1 < thr_end; ++imgifm1) { + img = imgifm1 / handle->blocksifm; + ifm1 = imgifm1 % handle->blocksifm; + + /* check if we need padding, for now we do physical padding on the fly, however we can play with N parameter of the GEMM */ + /* @TODO: add variant which deals with multiple GEMMS by varying N to deal with padding */ + if ( (handle->desc.pad_h == handle->desc.pad_h_in) && (handle->desc.pad_w == handle->desc.pad_w_in) ) { + + /* reset result buffer to zero when intent is to overwrite when first block + of input channels should be convoluted */ + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, 0, 0, ifm1, 0, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock)); + /*LIBXSMM_PRAGMA_SIMD*/ + for (ij = 0; ij < handle->ifhp*handle->ifwp; ij++) { + for (ii = 0; ii < handle->ifmblock; ii++) { + temp_ptr[ii] = (element_input_type)0; + } + temp_ptr += handle->blocksifm * handle->ifmblock; + } + } + + /* run convolution */ + for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, del_input, img, ij + kj, ii + ki, ifm1, 0, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) ); + } + } + } + } + + /* zero the rim in case of physical padding: every pixel is tested against the unpadded H x W interior 
*/ + if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { + for ( ij = 0; ij < handle->ifhp; ij++ ) { + for ( ii = 0; ii < handle->ifwp; ii++ ) { + if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || + (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } else { + /* reset result buffer to zero when intent is to overwrite when first block + of input channels should be convoluted */ + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) ) { + LIBXSMM_PRAGMA_SIMD + for (ij = 0; ij < size_tls1; ++ij) { + del_input_scratch_padding[ij] = (element_input_type)0; + } + } else { + for (ij = 0; ij < handle->desc.H; ij++) { + for (ii = 0; ii < handle->desc.W; ii++) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } + } + } + + /* run convolution */ + for (ofm1 = 0; ofm1 < handle->blocksofm; ++ofm1) { + for ( oj = 0; oj < handle->ofh; ++oj) { + ij = oj * handle->desc.u; + oi = 0; ii = 0; + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + gemm_kernel( &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + kj, ii + ki, 0, padded_w, handle->ifmblock) ); + } + } + } + 
} + + /* input padding copy back */ + for (ij = 0; ij < handle->desc.H; ij++) { + for (ii = 0; ii < handle->desc.W; ii++) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(3, del_input_padded, ij + handle->desc.pad_h, ii + handle->desc.pad_w, ifm2, padded_w, handle->ifmblock); + } + } + } + } +} /* end of imgifm1 loop */ + +} /* end of new scope for additional variable declarations (C89) */ + +libxsmm_barrier_wait(handle->barrier, ltid); diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c new file mode 100644 index 00000000..d2cfb8e6 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c @@ -0,0 +1,364 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
+******************************************************************************/ +int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myIfmId, nIfmBlocks, ind, /*task,*/ ifm1ofm1; +/* Backward-data template body: drives the batch-reduce kernels with the transposed filter (weight) and grad_output (output) as inputs and grad_input (del_input) as destination. ltid is this thread's id relative to start_thread. */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); /* images per thread; presumably rounded up (LIBXSMM_UPDIV is defined elsewhere - confirm) */ +int threads_per_image = handle->desc.threads / handle->desc.N; /* truncating division; only read in the imgpt <= 1 case further down */ +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ifm_start = 0; /* default: this thread covers every input-feature-map block */ +int my_ifm_end = handle->blocksifm; + +/* Batch-reduce arguments: lists of filter (A) and grad_output (B) block addresses and their count; each list has a fixed capacity of 1024 entries */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +/* number of tasks for transpose that could be run in parallel (reassigned to blocksifm * blocksofm right before the transpose loop) */ +int transpose_work = handle->blocksifm * handle->blocksofm * handle->desc.R * handle->desc.S; +/* compute chunk size: transpose_work / threads, rounded up */ +int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to transpose_work */ +int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; +int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; +/* extents and base of the buffer the kernels write to: the packing scratch (ofh x ofw) when pack_input_bwd == 1, otherwise grad_input offset past its physical padding (ifhp x ifwp) */ +const int IFW = (handle->pack_input_bwd == 1) ? handle->ofw : handle->ifwp; +const int IFH = (handle->pack_input_bwd == 1) ? handle->ofh : handle->ifhp; +element_input_type *input_ptr = (handle->pack_input_bwd == 1) ?
(element_input_type*)((char*)handle->scratch + handle->bwd_packing_padding_scratch_offset) : (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock; +LIBXSMM_VLA_DECL(5, element_input_type, del_input, input_ptr, IFH, IFW, handle->blocksifm, handle->ifmblock); +element_output_type *const out = (element_output_type*)handle->grad_output->data; +LIBXSMM_VLA_DECL(5, const element_output_type, output, out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + +/* Weight and transpose_weight tensor declaration: wt views reg_filter in the layout selected by the template macro (CUSTOM or RSCK), tr_wt views the transpose scratch */ +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) +LIBXSMM_VLA_DECL(6, element_filter_type, wt, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif +LIBXSMM_VLA_DECL(6, element_filter_type, tr_wt, (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset), handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); +/* weight pointer in the transposed format: handle->reg_filter_tr when BWD_NO_FILTER_TRANSPOSE is set, otherwise the scratch that the transpose below fills */ +element_filter_type* weight_base = ((handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) > 0 ) ?
(element_filter_type*)handle->reg_filter_tr->data : (element_filter_type*)((char*)handle->scratch + handle->bwd_filter_trans_scratch_offset); +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, weight_base, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* transpose filters into the scratch unless BWD_NO_FILTER_TRANSPOSE is set */ +if ( (handle->options & LIBXSMM_DNN_CONV_OPTION_BWD_NO_FILTER_TRANSPOSE) == 0 ) { + /* Special case of 64x64 transpose with JITed transpose (compiled out by the #if 0 below) */ +#if 0 + if (handle->ifmblock == 64 && handle->ofmblock == 64) { + libxsmm_xtransfunction tr_kernel = handle->tr_kernel; + const unsigned int ld_in = 64; + const unsigned int ld_out = 64; + for (task = transpose_thr_begin; task < transpose_thr_end; ++task) { + ifm1 = task/(handle->blocksofm * handle->desc.R * handle->desc.S); + ofm1 = (task%(handle->blocksofm * handle->desc.R * handle->desc.S))/(handle->desc.R * handle->desc.S); + kj = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))/handle->desc.S; + ki = ((task%(handle->blocksofm * handle->desc.R * handle->desc.S))%(handle->desc.R * handle->desc.S))%handle->desc.S; + tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, + &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); + tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 16, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, + &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 16, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); + tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 32, 0,
handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, + &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 32, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); + tr_kernel(&LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, 48, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &ld_in, + &LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj, handle->desc.S-1-ki, 0, 48, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock), &ld_out); + } + } else { +#endif + /* number of tasks for transpose that could be run in parallel: one per (ofm1, ifm1) block pair */ + transpose_work = handle->blocksifm * handle->blocksofm; + /* compute chunk size: transpose_work / threads, rounded up */ + transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end, both clamped to transpose_work */ + transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ?
((ltid + 1) * transpose_chunksize) : transpose_work; + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / handle->blocksifm; /* split the flat task id into its (ofm1, ifm1) pair */ + ifm1 = ifm1ofm1 % handle->blocksifm; + for (kj=0; kj < handle->desc.R; kj++) { + for (ki=0; ki < handle->desc.S; ki++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { /* element copy: feature-map roles swapped, filter taps mirrored to (R-1-kj, S-1-ki) */ +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_BWD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_ACCESS(6, tr_wt, ifm1, ofm1, handle->desc.R-1-kj , handle->desc.S-1-ki, ofm2, ifm2, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(6, wt, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + } + } + } + } + } +#if 0 + } +#endif + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( imgpt <= 1 ) { /* at most one image per thread: threads_per_image threads share an image and split its input-feature-map blocks */ + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myIfmId = ltid % threads_per_image; + nIfmBlocks = LIBXSMM_UPDIV(handle->blocksifm, threads_per_image); + my_ifm_start = LIBXSMM_MIN(myIfmId * nIfmBlocks, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((myIfmId+1) * nIfmBlocks, handle->blocksifm); +} + +if ( handle->use_ifm_parallelization == 1 ) { /* optional override of the image/ifm ranges computed above */ + int spread_out = 0; /* becomes the first of 8, 4, 3, 2 that divides N, else 1 */ + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else if
(handle->desc.N % 2 == 0) { + spread_out = 2; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { /* groups of spread_out threads share a range of images and split the input-feature-map blocks among themselves */ + int tile_id = ltid / spread_out; + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + } +} + +if (handle->loop_order == 0) { /* loop nest: img, ifm block, ofm block, oj block, ifm1, ofm1, oj, oi */ + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* zero the ifm1 slice of del_input (ofh x ofw pixels) on its first visit; oj and oi may be used as scratch indices here because the compute loops below re-initialize both */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, oj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->blocksifm * handle->ifmblock; /* advance to the next pixel of the row */ + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (kj = 0; kj <
handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments: one kernel call per filter tap (kj, ki), reducing over blocksofm_blocking ofm blocks */ + ij_use = oj; + ii_use = oi; + oj_use = oj - (1-handle->desc.pad_h_out); + oi_use = oi - (1-handle->desc.pad_w_out); + + if (kj == 0 && oj == 0) { + /* Do no FLOPS: first filter row at the first output row is skipped */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS: last filter row at the last output row is skipped */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; /* first column tile with first filter column: source and destination start one pixel further right; br_gemm_kernel2 is presumably the narrower kernel variant - confirm at the include site */ + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki + 1, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use + 1, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); + } else if (oi == handle->ofw-handle->bwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; /* last column tile with last filter column: unshifted start pixel, but also handled by br_gemm_kernel2 */ + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); + } else { + ind = 0; /* every other (tile, tap) combination goes through br_gemm_kernel */ + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2,
0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0) { + /* zero the ifm1 slice of del_input (ofh x ofw pixels) on its first visit; oj and oi are re-initialized by the compute loops below */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, oj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->blocksifm * handle->ifmblock; /* advance to the next pixel of the row */ + } + } + } + + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments: a single kernel call reduces over all (ofm2, kj, ki); with spread_input_bwd == 1 the destination pixel is scaled by the strides u and v */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ?
oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } + } +} + +if (handle->loop_order == 1) { /* loop nest: img, ifm block, oj block, oj, oi, ifm1, ofm block, ofm1 */ + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += handle->block_bwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_bwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_bwd_oj,handle->ofh); oj += handle->bwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->bwd_ofw_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_bwd_ifm, my_ifm_end); ifm1++ ) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_bwd_ofm) { + if ( (ofmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load_bwd == 0 && ojb == 0 && oj == 0 && oi == 0) { + int ojj, oii; /* set output feature map to zero on the first visit of this ifm1 slice; dedicated indices are required because oj and oi are the live counters of the enclosing loops and are read again right after this block */ + for (ojj = 0; ojj < handle->ofh; ++ojj) { + element_input_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, del_input, img, ojj, 0, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock)); + for (oii = 0; oii < handle->ofw; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + temp_ptr[ifm2] = (element_input_type)0; + } + temp_ptr += handle->blocksifm * handle->ifmblock; /* advance to the next pixel of the row */ + } + } + } + for (ofm1 = ofmb;
ofm1 < LIBXSMM_MIN(ofmb+handle->block_bwd_ofm, handle->blocksofm); ofm1 += handle->blocksofm_blocking) { + /* Prepare batch-reduce kernel arguments: a single kernel call reduces over all (ofm2, kj, ki); with spread_input_bwd == 1 the destination pixel is scaled by the strides u and v */ + ij_use = (handle->spread_input_bwd == 1) ? oj * handle->desc.u : oj; + ii_use = (handle->spread_input_bwd == 1) ? oi * handle->desc.v : oi; + oi_use = oi; + oj_use = oj; + ind = 0; + for (ofm2 = ofm1; ofm2 < ofm1 + handle->blocksofm_blocking; ofm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ifm1, ofm2, kj, ki, 0, 0, handle->blocksofm, handle->desc.R, handle->desc.S, handle->ofmblock, handle->ifmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), &n_blocks); + } + } + } + } + } + } + } + } +} + +if (handle->pack_input_bwd == 1) { /* results were written to the packed scratch: copy them into grad_input at the stride positions and zero every pixel in between */ + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2,
handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, del_input, img, oj/handle->desc.u, oi/handle->desc.v, ifm1, ifm2, IFH, IFW, handle->blocksifm,handle->ifmblock); + } + } + } + } + } + } +} else if (handle->spread_input_bwd == 1) { /* kernels already wrote to the stride positions of grad_input: only the pixels in between are zeroed */ + LIBXSMM_VLA_DECL(5, element_input_type, del_input_full, (element_input_type*)handle->grad_input->data + ((size_t)handle->desc.pad_h_in * handle->ifwp + handle->desc.pad_w_in) * handle->blocksifm * handle->ifmblock, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ifhp; oj++) { + for (oi = 0; oi < handle->ifwp; oi++) { + if (oi % handle->desc.v != 0 || oj % handle->desc.u != 0) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, del_input_full, img, oj, oi, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); /* final barrier call of this template body */ + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c new file mode 100644 index 00000000..c116cc20 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c @@ -0,0 +1,519 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.)
+******************************************************************************/ + +int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, ii, ij, spread_out = 1; +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ofm_start = 0; +int my_ofm_end = handle->blocksofm; + +/* Batch reduce related variables */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +/* offset output pointer in case of physical output padding */ +element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +element_input_type *input_ptr = ( (handle->pack_input == 1) || (handle->fwd_padding_copy == 1) ) ? (element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); +const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp ); +LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myOfmId = ltid % threads_per_image; + nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); + my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); +} + +if ( handle->use_ofm_parallelization == 1 ) { + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); + int ofm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); + } +} + +/* remove stride from input */ +if (handle->pack_input == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, 
element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +/* physical pad input */ +if (handle->fwd_padding_copy == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, 
ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +if (handle->use_fallback_fwd_loops == 1) { + /* number of tasks that could be run in parallel */ + const int work = handle->desc.N * handle->blocksofm * handle->ofh; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + int imgofm1ofh; + + if ( handle->avoid_fmas_in_rim == 1) { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); +#if 1 + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; +#else + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; +#endif + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, 
handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_b_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_b_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_a_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } else { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); +#if 1 + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % 
(handle->blocksofm*handle->ofh))%handle->ofh; +#else + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; +#endif + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; +#if 1 + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel_a_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); +#else + LIBXSMM_UNUSED( ifm2 ); + LIBXSMM_UNUSED( kj ); + LIBXSMM_UNUSED( ki ); + n_blocks = 
handle->blocksifm_blocking * handle->desc.R * handle->desc.S; + if (handle->desc.R == 1 && handle->desc.S == 1) { + br_gemm_kernel_strd( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks ); + } else { + br_gemm_kernel_offs( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets ); + } +#endif + } + } + } + } + } + +} else { + if (handle->loop_order == 0) { + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_b_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_b_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_a_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, 
handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? (ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; +#if 1 + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel_a_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); +#else + LIBXSMM_UNUSED( ifm2 ); + LIBXSMM_UNUSED( kj ); + LIBXSMM_UNUSED( ki ); + n_blocks = handle->blocksifm_blocking * handle->desc.R * handle->desc.S; + if (handle->desc.R == 1 && handle->desc.S == 1) { + br_gemm_kernel_strd( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks ); + } else { + br_gemm_kernel_offs( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets ); + } +#endif + } + } + } + } + } + } + } + } + } + } + + if (handle->loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); 
oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { + /* set output feature map to zero */ + for (ojj = 0; ojj < handle->ofh; ++ojj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, ojj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->ofw; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + } + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; +#if 1 + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel_a_addr(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); +#else + LIBXSMM_UNUSED( ifm2 ); + LIBXSMM_UNUSED( kj ); + LIBXSMM_UNUSED( ki ); + n_blocks = handle->blocksifm_blocking * handle->desc.R * 
handle->desc.S; + if (handle->desc.R == 1 && handle->desc.S == 1) { + br_gemm_kernel_strd( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks ); + } else { + br_gemm_kernel_offs( &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm1, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets ); + } +#endif + } + } + } + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c new file mode 100644 index 00000000..170e3afd --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c @@ -0,0 +1,609 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
+******************************************************************************/ + +int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, spread_out = 1, ij = 0, ii = 0; +int last_ki, last_kj, next_kj; +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ofm_start = 0; +int my_ofm_end = handle->blocksofm; +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +/* Batch reduce related variables */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; +/* JITed eltwise function */ +libxsmm_meltwfunction_unary cvt_kernel = handle->fwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param cvt_params; + +/* offset output pointer in case of physical output padding */ +element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +float* out_fp32 = (float*)((char*)handle->scratch + handle->fwd_lp_output_full_scratch_offset) + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +float* out_scratch = (float*)((char*)handle->scratch + handle->fwd_lp_output_block_scratch_offset) + ((size_t) ltid * handle->fwd_ofw_rb * handle->fwd_ofh_rb * handle->ofmblock); +float* out_ptr; +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(5, float, output_fp32, out_fp32, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(3, float, scratch_fp32, out_scratch, handle->fwd_ofw_rb, 
handle->ofmblock); +element_input_type *input_ptr = ((handle->pack_input == 1) || (handle->fwd_padding_copy == 1)) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); +const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? handle->ofhp : handle->ifhp ); +LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + +libxsmm_barrier_init(handle->barrier, ltid); + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myOfmId = ltid % threads_per_image; + nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); + my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); +} + +if ( handle->use_ofm_parallelization == 1 ) { + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); + int ofm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = 
LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); + } +} + +if (handle->pack_input == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +/* physical pad input */ +if (handle->fwd_padding_copy == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii 
< handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +if (handle->use_fallback_fwd_loops == 1) { + /* number of tasks that could be run in parallel */ + const int work = handle->desc.N * handle->blocksofm * handle->ofh; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + int imgofm1ofh; + + if ( handle->avoid_fmas_in_rim == 1) { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + last_kj = handle->desc.R-1; + last_ki = handle->desc.S-1; + next_kj = kj+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = 
&LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, 
handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, 
handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } + } + } + } + } + } + } + } else { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, 
img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, handle->fwd_ofw_rb, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && kj == handle->desc.R && ki == handle->desc.S) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } + } + } + } + } +} else { + if (handle->loop_order == 0) { + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; + last_ki = (handle->shuffle_filter_accesses == 1) ? (handle->desc.S-1+ltid)%handle->desc.S : handle->desc.S-1; + last_kj = (handle->shuffle_filter_accesses == 1) ? (handle->desc.R-1+ltid)%handle->desc.R : handle->desc.R-1; + next_kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+1+ltid)%handle->desc.R : kj1+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = 
&LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load == 1) { + br_gemm_kernel2_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + if (handle->avoid_acc_load == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); 
+ if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj1 = 0; + ki1 = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (kj1 == handle->desc.R && ki1 == handle->desc.S && ifm2 == handle->blocksifm) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } + } + } + } + } + } + } + } + } + } + + if (handle->loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { + /* set output feature map to zero */ + for (ojj = 0; ojj < handle->ofh; ++ojj) { + float* temp_ptr 
= &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, ojj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->ofw; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + + if (handle->avoid_acc_load == 1) { + br_gemm_kernel_bf16bf16(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + out_ptr = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (kj == handle->desc.R && ki == handle->desc.S && ifm2 == handle->blocksifm) { + cvt_params.in.primary = &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_params.out.primary = 
&LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + cvt_kernel(&cvt_params); + } + } + } + } + } + } + } + } + } + } + } + +#if 0 + /* In case we used intermediate fp32 buffer, now downconvert the result to the actual bf16 output */ + if (handle->avoid_acc_load == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->ofw * handle->ofmblock); + } + } + } + } +#endif + +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..3db6596c --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16_amx.tpl.c @@ -0,0 +1,732 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
+******************************************************************************/ +int img, ofm1, ifm1, ifm2, /*ofm2, ifm1, ifm2,*/ oj, oi, ij, ii, /*kj, ki, oi_use, oj_use, */ii_use, ij_use, ofmb,/* ifmb,*/ ojb, myOfmId, nOfmBlocks, /*ind, ofm11, ki1, kj1,*/ ojj, /*oii,*/ spread_out = 1; +/*int last_ki, last_kj, next_kj;*/ +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = (handle->desc.N + handle->desc.threads - 1)/handle->desc.threads; +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN( ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN( (ltid+1) * imgpt, handle->desc.N); +int my_ofm_start = 0; +int my_ofm_end = handle->blocksofm; +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +/* Batch reduce related variables */ +/*const element_filter_type *A_ptrs[1024];*/ +/*const element_input_type *B_ptrs[1024];*/ +unsigned long long n_blocks; + +/* offset output pointer in case of physical output padding */ +element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +/*float* out_fp32 = (float*)handle->scratch6 + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock;*/ +float* out_ptr; +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +/*LIBXSMM_VLA_DECL(5, float, output_fp32, out_fp32, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);*/ +int scratch_ofwp = (handle->fwd_gemm_pixels == (handle->fwd_ofw_rb * handle->fwd_ofh_rb)) ? handle->fwd_ofw_rb : ((handle->fwd_padding_copy == 1) ? 
handle->ofwp + 2 * handle->desc.pad_w : handle->ofwp); +/*float scratch_stack_fp32[8*16*16];*/ +float *out_scratch = (float*)((char*)handle->scratch + handle->fwd_lp_output_full_scratch_offset) + ltid * handle->fwd_gemm_pixels * handle->ofmblock; +LIBXSMM_VLA_DECL(3, float, scratch_fp32, out_scratch, scratch_ofwp, handle->ofmblock); +element_input_type *input_ptr = ((handle->pack_input == 1) || (handle->fwd_padding_copy == 1)) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); +const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? handle->ofhp : handle->ifhp ); +LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN( ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN( my_img_start + 1, handle->desc.N); + myOfmId = ltid % threads_per_image; + nOfmBlocks = (handle->blocksofm + threads_per_image - 1) / threads_per_image; + my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); +} + +if ( handle->use_ofm_parallelization == 1 ) { + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int 
tile_id = ltid / spread_out; + int ofmpt = (handle->blocksofm+spread_out-1)/spread_out; + int ofm_id = ltid % spread_out; + imgpt = ((handle->desc.N + handle->desc.threads - 1)/handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN( tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN( (tile_id+1) * imgpt, handle->desc.N); + my_ofm_start = LIBXSMM_MIN( ofm_id * ofmpt, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN( (ofm_id+1) * ofmpt, handle->blocksofm); + } +} + +n_blocks = (unsigned long long)handle->blocksifm_blocking * handle->desc.R * handle->desc.S; +out_ptr = (float*) &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock); + +libxsmm_barrier_init(handle->barrier, ltid); + +if (handle->pack_input == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +/* physical pad input */ +if (handle->fwd_padding_copy == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % 
spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 || handle->desc.N % handle->desc.threads != 0 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +/* Execute the tileconfig kernel */ +tile_config_kernel(NULL, NULL, NULL); + +#if 1 +if (handle->desc.R == 1 && handle->desc.S == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { 
+ ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u; + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + ii_use = (handle->pack_input == 1) ? oi : oi * handle->desc.v; + /* Batch-reduce GEMM call */ + br_gemm_kernel_strd( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, 0, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } + } + } + } + } + } +} +/* @TODO this needs a reasonable fix */ +else if ( handle->fwd_ofw_rb*handle->fwd_ofh_rb == handle->fwd_gemm_pixels ) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Batch-reduce GEMM call */ + br_gemm_kernel_offs_a( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, 0, oj*handle->desc.u, oi*handle->desc.v, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &n_blocks, handle->A_offsets, handle->B_offsets); + } + } + } + } + } + } +} else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + 
for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Batch-reduce GEMM call */ + br_gemm_kernel_offs_b( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, 0, oj, oi, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), out_ptr, &n_blocks, handle->A_offsets, handle->B_offsets); + /* Downconvert accumulated tiles to BF16 */ + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, scratch_ofwp, handle->ofmblock), &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj+ojj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } +} +#else +if (handle->pack_input == 1) { + int ifmpt = (handle->blocksifm+spread_out-1)/spread_out; + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN( ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN( (ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, 
handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +if (handle->use_fallback_fwd_loops == 1) { + /* number of tasks that could be run in parallel */ + const int work = handle->desc.N * handle->blocksofm * handle->ofh; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int imgofm1ofh; + + if ( handle->avoid_fmas_in_rim == 1) { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * 
handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + last_kj = handle->desc.R-1; + last_ki = handle->desc.S-1; + next_kj = kj+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use+1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + (handle->fwd_ofw_rb-1) * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 
5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + (handle->fwd_ofw_rb-1) * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } 
+ } + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } + } + } + } else { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); + ofm1 = (imgofm1ofh % 
(handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? 
&LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && kj == handle->desc.R && ki == handle->desc.S) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } +} else { + if (handle->loop_order == 0) { + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; + last_ki = (handle->shuffle_filter_accesses == 1) ? (handle->desc.S-1+ltid)%handle->desc.S : handle->desc.S-1; + last_kj = (handle->shuffle_filter_accesses == 1) ? (handle->desc.R-1+ltid)%handle->desc.R : handle->desc.R-1; + next_kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+1+ltid)%handle->desc.R : kj1+1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use+1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + (handle->fwd_ofw_rb-1) * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } else if (oi == 
handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + (handle->fwd_ofw_rb-1) * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, 
handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } else if (ifm2 == handle->blocksifm && + ((kj == last_kj && ki == last_ki) || + (next_kj == 0 && next_kj == last_kj && oj == 0) || + (next_kj == handle->desc.R-1 && next_kj == last_kj && oj == handle->ofh-1))) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, 
my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? (ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj1 = 0; + ki1 = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? (kj1+ltid)%handle->desc.R : kj1; + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? 
&LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } else if (kj1 == handle->desc.R && ki1 == handle->desc.S && ifm2 == handle->blocksifm) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } + } + } + } + } + } + + if (handle->loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { + /* set output feature map to zero */ + for (ojj = 0; ojj < handle->ofh; ++ojj) { + float* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, ojj, 0, 0, handle->blocksofm, 
handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->ofw; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (float)0; + } + temp_ptr += handle->ofmblock; + } + } + } + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + kj = 0; + ki = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm2, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + out_ptr = (handle->avoid_acc_load == 1) ? 
&LIBXSMM_VLA_ACCESS( 3, scratch_fp32, 0, 0, 0, scratch_ofwp, handle->ofmblock) : &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); + + if (handle->avoid_acc_load == 1) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 3, scratch_fp32, ojj, 0, 0, handle->fwd_ofw_rb, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } else if (kj == handle->desc.R && ki == handle->desc.S && ifm2 == handle->blocksifm) { + for (ojj = 0; ojj < handle->fwd_ofh_rb; ojj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS(5, output_fp32, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj_use+ojj, oi_use, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->fwd_ofw_rb * handle->ofmblock); + } + } + } + } + } + } + } + } + } + } + } + +#if 0 + /* In case we used intermediate fp32 buffer, now downconvert the result to the actual bf16 output */ + if (handle->avoid_acc_load == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16( &LIBXSMM_VLA_ACCESS( 5, output_fp32, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + handle->ofw * handle->ofmblock); + } + } + } + } +#endif + +} +#endif + +handle->tilerelease_kernel(NULL, NULL, NULL); +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git 
a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c new file mode 100644 index 00000000..7b654f25 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i32.tpl.c @@ -0,0 +1,170 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ + +int img, ofm1, ofm2, ifm1, ifm2, oj, oi, kj, ki, ii_use, ij_use, oii, spread_out = 1; +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could be run in parallel */ +const int w_tasks = handle->ofw/handle->fwd_ofw_rb; +const int work = handle->desc.N * handle->blocksofm * handle->ofh * w_tasks; +const int work_KHW = handle->blocksofm * handle->ofh * w_tasks; +const int work_HW = handle->ofh * w_tasks; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; +int imgofm1ofhofw; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +/* Batch reduce related variables */ +unsigned long long n_blocks; + +/* offset output pointer in case of physical output padding */ +element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +element_input_type *input_ptr = (handle->pack_input == 1) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +const int IFW = (handle->pack_input == 1) ? handle->ofwp : handle->ifwp; +const int IFH = (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp; +LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, handle->blocksifm, IFH, IFW, handle->ifmblock); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + +libxsmm_barrier_init(handle->barrier, ltid); + +if (handle->pack_input == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij_use, ii_use, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +if (handle->avoid_fmas_in_rim == 1) { + n_blocks = handle->blocksifm_blocking; + for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { + img = imgofm1ofhofw / work_KHW; + ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; + oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; + oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; + ij_use = (handle->pack_input == 1) ? 
oj : oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = (handle->pack_input == 1) ? oi : oi * handle->desc.v - (1-handle->desc.pad_w_in); + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + br_gemm_kernel_strided2( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki+1, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi+1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + br_gemm_kernel_strided2( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } else { + br_gemm_kernel_strided( 
&LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, kj, ki, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use+kj, ii_use+ki, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } + } + } + } + } +} else { + /* Strided based BRGEMM */ + n_blocks = (unsigned long long)handle->blocksifm_blocking * handle->desc.R * handle->desc.S; + if (handle->desc.R == 1 && handle->desc.S == 1) { + for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { + img = imgofm1ofhofw / work_KHW; + ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; + oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; + oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; + ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u; + ii_use = (handle->pack_input == 1) ? 
oi : oi * handle->desc.v; + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { + br_gemm_kernel_strided( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks); + } + } + } else { /* Offset based BRGEMM */ + for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { + img = imgofm1ofhofw / work_KHW; + ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; + oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; + oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; + ij_use = (handle->pack_input == 1) ? oj : oj * handle->desc.u; + ii_use = (handle->pack_input == 1) ? 
oi : oi * handle->desc.v; + if ( ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock)); + for (oii = 0; oii < handle->fwd_ofw_rb; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->ofmblock; + } + } + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1 += handle->blocksifm_blocking) { + br_gemm_kernel_offset( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, ifm1, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij_use, ii_use, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets); + } + } + } +} +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c new file mode 100644 index 00000000..96135547 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_custom_custom_generic_i8i8.tpl.c @@ -0,0 +1,61 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) +******************************************************************************/ +const int ifmblock_lp = handle->ifmblock/handle->fm_lp_block; +int imgofm1ofhofw, img, ofm1, oj, oi, ii, ij; +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int w_tasks = handle->ofw/handle->fwd_ofw_rb; +const int work = handle->desc.N * handle->blocksofm * handle->ofh * w_tasks; +const int work_KHW = handle->blocksofm * handle->ofh * w_tasks; +const int work_HW = handle->ofh * w_tasks; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; +/* Batch reduce related variables */ +unsigned long long n_blocks = (unsigned long long)handle->blocksifm_blocking * handle->desc.R * handle->desc.S; +/* Calculate scaling factor here for output... 
*/ +float _scf = libxsmm_sexp2_i8i(-(handle->reg_filter->scf + handle->reg_input->scf - handle->reg_output->scf)); +/* offset output pointer in case of physical output padding */ +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); +LIBXSMM_VLA_DECL(7, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block); + +libxsmm_barrier_init(handle->barrier, ltid); +if (handle->desc.R == 1 && handle->desc.S == 1) { /* Strided based BRGEMM */ + for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { + img = imgofm1ofhofw / work_KHW; + ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; + oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; + oi = (((imgofm1ofhofw % work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + br_gemm_kernel_strided( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, 0, ij, ii, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, &_scf); + } +} else { /* Offset based BRGEMM */ + for (imgofm1ofhofw = thr_begin; imgofm1ofhofw < thr_end; ++imgofm1ofhofw) { + img = imgofm1ofhofw / work_KHW; + ofm1 = (imgofm1ofhofw % work_KHW)/work_HW; + oj = ((imgofm1ofhofw % work_KHW)%work_HW)/w_tasks; + oi = (((imgofm1ofhofw % 
work_KHW)%work_HW)%w_tasks)*handle->fwd_ofw_rb; + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + br_gemm_kernel_offset( &LIBXSMM_VLA_ACCESS(7, weight, ofm1, 0, 0, 0, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, ifmblock_lp, handle->ofmblock, handle->fm_lp_block), + &LIBXSMM_VLA_ACCESS(5, input, img, 0, ij, ii, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(5 , output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), &n_blocks, handle->A_offsets, handle->B_offsets, &_scf); + } +} +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c new file mode 100644 index 00000000..04232958 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c @@ -0,0 +1,522 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Hans Pabst (Intel Corp.) 
+******************************************************************************/ + +int img, ofm1, ofm2 = 0, ifm1, ifm2 = 0, oj, oi, kj, ki, oi_use, oj_use, ii_use, ij_use, ofmb, ifmb, ojb, myOfmId, nOfmBlocks, ind, ofm11, ki1, kj1, ojj, oii, ii, ij, spread_out = 1; +/* computing first logical thread */ +const int ltid = tid - start_thread; +int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); +int threads_per_image = handle->desc.threads / handle->desc.N; +int my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); +int my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); +int my_ofm_start = 0; +int my_ofm_end = handle->blocksofm; + +/* Batch reduce related variables */ +const element_filter_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +/* offset output pointer in case of physical output padding */ +element_output_type* out = (element_output_type*)handle->reg_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; +LIBXSMM_VLA_DECL(5, element_output_type, output, out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); +element_input_type *input_ptr = ( (handle->pack_input == 1) || (handle->fwd_padding_copy == 1) ) ?(element_input_type*)((char*)handle->scratch + handle->fwd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +const int IFW = (handle->fwd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : ( (handle->pack_input == 1) ? handle->ofwp : handle->ifwp ); +const int IFH = (handle->fwd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : ( (handle->pack_input == 1) ? 
handle->ofhp : handle->ifhp ); +LIBXSMM_VLA_DECL(5, element_input_type, input, input_ptr, IFH, IFW, handle->blocksifm, handle->ifmblock); +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK +LIBXSMM_VLA_DECL(6, const element_filter_type, weight, (element_filter_type*)handle->reg_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( imgpt <= 1 ) { + my_img_start = LIBXSMM_MIN(ltid / threads_per_image, handle->desc.N); + my_img_end = LIBXSMM_MIN(my_img_start + 1, handle->desc.N); + myOfmId = ltid % threads_per_image; + nOfmBlocks = LIBXSMM_UPDIV(handle->blocksofm, threads_per_image); + my_ofm_start = LIBXSMM_MIN(myOfmId * nOfmBlocks, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((myOfmId+1) * nOfmBlocks, handle->blocksofm); +} + +if ( handle->use_ofm_parallelization == 1 ) { + if ( handle->desc.N % 8 == 0) { + spread_out = 8; + } else if ( handle->desc.N % 4 == 0) { + spread_out = 4; + } else if (handle->desc.N % 2 == 0) { + spread_out = 2; + } else if (handle->desc.N % 3 == 0) { + spread_out = 3; + } else { + spread_out = 1; + } + if ((spread_out > 1) && (handle->desc.threads % spread_out == 0)) { + int tile_id = ltid / spread_out; + int ofmpt = LIBXSMM_UPDIV(handle->blocksofm, spread_out); + int ofm_id = ltid % spread_out; + imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads) * spread_out; + my_img_start = LIBXSMM_MIN(tile_id * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * imgpt, handle->desc.N); + my_ofm_start = LIBXSMM_MIN(ofm_id * ofmpt, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((ofm_id+1) * ofmpt, handle->blocksofm); + } +} + +/* remove 
stride from input */ +if (handle->pack_input == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + /* @TODO think about packed format */ + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij_use, ii_use, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +/* physical pad input */ +if (handle->fwd_padding_copy == 1) { + int ifmpt = LIBXSMM_UPDIV(handle->blocksifm, spread_out); + int ifm_id = ltid % spread_out; + int my_ifm_start = LIBXSMM_MIN(ifm_id * ifmpt, handle->blocksifm); + int my_ifm_end = LIBXSMM_MIN((ifm_id+1) * ifmpt, handle->blocksifm); + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < 
handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + if ( handle->use_ofm_parallelization == 1 ) { + libxsmm_barrier_wait(handle->barrier, ltid); + } +} + +if (handle->use_fallback_fwd_loops == 1) { + /* number of tasks that could be run in parallel */ + const int work = handle->desc.N * handle->blocksofm * handle->ofh; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + int imgofm1ofh; + + if ( handle->avoid_fmas_in_rim == 1) { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); +#if 1 + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; +#else + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; +#endif + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->blocksofm*handle->ofmblock; + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + 
A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); 
+#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } else { + for (imgofm1ofh = thr_begin; imgofm1ofh < thr_end; ++imgofm1ofh) { + img = imgofm1ofh / (handle->blocksofm*handle->ofh); +#if 1 + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->ofh; + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->ofh; +#else + oj = (imgofm1ofh % (handle->blocksofm*handle->ofh))/handle->blocksofm; + ofm1 = (imgofm1ofh % (handle->blocksofm*handle->ofh))%handle->blocksofm; +#endif + + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0) { + /* set output feature map to zero */ + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->blocksofm*handle->ofmblock; + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = 
oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } + } + } + } + } + +} else { + if (handle->loop_order == 0) { + if ( handle->avoid_fmas_in_rim == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->blocksofm*handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u - (1-handle->desc.pad_h_in); + ii_use = oi * handle->desc.v - (1-handle->desc.pad_w_in); + } + oi_use = oi; + oj_use = oj; + + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; + + if (kj == 0 && oj == 0) { + /* Do no FLOPS */ + } else if (kj == handle->desc.R-1 && oj == handle->ofh-1 ) { + /* Do no FLOPS */ + } else if ( oi == 0 && ki == 0 ) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, 
handle->ofmblock), &n_blocks); + } else { + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_fwd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (ofm11 = ofmb; ofm11 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm11++ ) { + ofm1 = (handle->shuffle_filter_accesses == 1) ? 
(ofm11+ltid)%handle->blocksofm : ofm11; + if ( (ifmb == 0) && ((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && ojb == 0) { + /* set output feature map to zero */ + for (oj = 0; oj < handle->ofh; ++oj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, oj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); + for (oi = 0; oi < handle->ofw; ++oi) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->blocksofm * handle->ofmblock; + } + } + } + + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj1 = 0; kj1 < handle->desc.R; kj1++) { + for (ki1 = 0; ki1 < handle->desc.S; ki1++) { + ki = (handle->shuffle_filter_accesses == 1) ? (ki1+ltid)%handle->desc.S : ki1; + kj = (handle->shuffle_filter_accesses == 1) ? 
(kj1+ltid)%handle->desc.R : kj1; +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + + if (handle->loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += handle->block_fwd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->block_fwd_oj) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->block_fwd_oj,handle->ofh); oj += handle->fwd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->fwd_ofw_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_fwd_ofm, my_ofm_end); ofm1++ ) { + if (((handle->options & LIBXSMM_DNN_CONV_OPTION_OVERWRITE) > 0) && handle->avoid_acc_load == 0 && oj == 0 && oi == 0) { + /* set output feature map to zero */ + for (ojj = 0; ojj < handle->ofh; ++ojj) { + element_output_type* temp_ptr = &(LIBXSMM_VLA_ACCESS( 5, output, img, ojj, 0, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock)); + for (oii = 0; oii < handle->ofw; ++oii) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ++ofm2) { + temp_ptr[ofm2] = (element_output_type)0; + } + temp_ptr += handle->blocksofm * handle->ofmblock; + } + } + } + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += 
handle->block_fwd_ifm) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_fwd_ifm, handle->blocksifm); ifm1 += handle->blocksifm_blocking) { + /* Prepare batch-reduce kernel arguments */ + if (handle->pack_input == 1) { + ij_use = oj; + ii_use = oi; + } else { + ij_use = oj * handle->desc.u; + ii_use = oi * handle->desc.v; + } + oi_use = oi; + oj_use = oj; + ind = 0; + for (ifm2 = ifm1; ifm2 < ifm1 + handle->blocksifm_blocking; ifm2++) { + for (kj = 0; kj < handle->desc.R; kj++) { + for (ki = 0; ki < handle->desc.S; ki++) { +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_CUSTOM + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, ofm1, ifm2, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#ifdef LIBXSMM_DNN_TPL_FWD_DIRECT_GENERIC_NHWC_RSCK + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(6, weight, kj, ki, ifm2, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, handle->blocksifm, handle->ifmblock); + ind++; + } + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c new file mode 100644 index 00000000..356d4138 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c @@ -0,0 +1,577 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +int img, my_img_start, my_img_end, ofmb, ifmb, ojb, ofm1, ifm1, ifm2 = 0, ofm2 = 0, oj, oi, ii, ij, kj, ki, ind, j_br, img_br, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm; +/* computing first logical thread */ +const int ltid = tid - start_thread; +libxsmm_blasint LDA = handle->ofmblock; +libxsmm_blasint LDB = (handle->upd_pack_input == 1) ? handle->ifmblock : handle->desc.v * handle->ifmblock; +libxsmm_blasint LDC = handle->ofmblock; +int l_flags = LIBXSMM_GEMM_FLAGS('N', 'T'); +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +const int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; +const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; +element_input_type *input_ptr_to_use = (handle->upd_padding_copy == 1) ? 
(element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*) input_ptr_to_use, handle->blocksifm, IFHP, IFWP, handle->ifmblock); +LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +element_filter_type *weight_ptr = (handle->weight_copies == 1) ? (element_filter_type*)handle->grad_filter->data : (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; +LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +int prefetch_mode = (handle->desc.u == 2 || (handle->desc.R == 3 && handle->ofw == 7) ) ? 
libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE) : libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BL1); + +/* Batch reduce related variables */ +const element_output_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +int brgemm_pf_oob = 0; +const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); +if ( 0 == env_brgemm_pf_oob ) { +} else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); +} +if (brgemm_pf_oob > 0) { + prefetch_mode = prefetch_mode | libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); +} + +libxsmm_barrier_init(handle->barrier, ltid); + +/* physical pad input */ +if (handle->upd_padding_copy == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); + + my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, 
ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); +} + + +if (handle->upd_use_batchreduce == 0 && handle->upd_linearized_tasklist == 0) { + /* Parallelize over minibatch */ + const int img_work = handle->desc.N; + const int img_chunksize = (img_work % handle->desc.threads == 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; + const float beta = ((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; + my_img_end = ((ltid + 1) * img_chunksize < img_work) ? ((ltid + 1) * img_chunksize) : img_work; + + if (!((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + memset(weight_ptr, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(element_filter_type)); + } + + if (handle->upd_loop_order == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * 
handle->desc.v + kj; + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); + } + } + } + } + } + } + } + } + } + } + } + if (handle->upd_loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); + } + } + } + } + } + } + } + } + } + } + } +} else { + if (handle->upd_linearized_tasklist == 1) { + /* Amount of work when using linearized view of tasks */ + const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm; + const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : (work / handle->desc.threads) + 1; + const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int work_item; + int Cb = handle->blocksifm; +#if 0 + int Kb = handle->blocksofm; +#endif + int R = handle->desc.R; + int S = handle->desc.S; + + if (handle->upd_avoid_rim_fmas == 0) { + const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; + const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; + element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? (element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; + LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, handle->blocksifm, IFH, IFW, handle->ifmblock); + const float beta = ((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + /* If requested, pack input to avoid strided accesses */ + if (handle->upd_pack_input == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? 
handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; + const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); + const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); + + for (img = img_copy_start; img < img_copy_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij, ii, ifm2, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); + } + + /* Initialize weights to zero */ + if (!((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + for (work_item = work_begin; work_item < work_end; work_item++) { + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; + } + } + } + } + + for (img = 0; img < handle->desc.N; img++) { + for (work_item = work_begin; work_item < work_end; work_item++) { + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + oi = 0; + ii = ki; + for (oj = 0; oj < handle->ofh; oj += handle->upd_ofh_rb) { + ij = oj * handle->desc.u + kj; + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, 0, handle->blocksofm, 
handle->ofhp, handle->ofwp, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, ij, ii, 0, handle->blocksifm, IFH, IFW, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); + } + } + } + } else { + const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb-1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + for (work_item = work_begin; work_item < work_end; work_item++) { + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + oi = 0; + oj = 0; + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + img = 0; + img_block_size = handle->desc.N; + + if (kj == 0) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 1; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } else if (ki == 0) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] 
= &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi + 1, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii + 1, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } else { + if (kj == handle->desc.R-1) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb-1; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } else { + ind = 0; + for (img_br 
= 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } + } + } + } + } else { + /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ + /* FIXME: Hardcoed logic for N=27 */ + int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tiles = handle->weight_copies; + int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); + int my_in_tile_id = ltid % group_size; + int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); + int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); + int my_R_start = 0; + int my_R_end = handle->desc.R; + const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 
0.f : 1.f; + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + const float beta_flat = 0.0; + gemm_br_function br_gemm_kernel_flat = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta_flat, &l_flags, &prefetch_mode); + element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; + LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); + my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); + my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + /* FIXME: Hardcoed logic for N=27 */ + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { + my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { + int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + my_R_start = 
LIBXSMM_MIN(my_in_tile_id * r_per_tile, handle->desc.R); + my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * r_per_tile, handle->desc.R); + } + if (handle->desc.threads == 92 && handle->desc.N == 92 && handle->desc.C == 512 && handle->desc.K == 512 && handle->ofh == 7 && handle->desc.u == 1 && handle->desc.R == 3) { + my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + block_ofm = my_ofm_end-my_ofm_start+1; + block_ifm = my_ifm_end-my_ifm_start+1; + img_block_size = my_img_end - my_img_start; + + if (handle->desc.N != handle->desc.threads) { + /* Use "flat" parallelism + reduction */ + const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm * handle->desc.N; + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; + const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int work_item; + int Cb = handle->blocksifm; + int Kb = handle->blocksofm; + int R = handle->desc.R; + int S = handle->desc.S; + const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; + const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; + element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? 
(element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; + LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, handle->blocksifm, IFH, IFW, handle->ifmblock); + + /* If requested, pack input to avoid strided accesses */ + if (handle->upd_pack_input == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; + const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); + const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); + + for (img = img_copy_start; img < img_copy_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, oj, oi, ifm2, handle->blocksifm, IFH, IFW, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); + } + + /* Initialize weights to zero */ + if (handle->upd_ofw_rb != handle->ofw) { + for (work_item = work_begin; work_item < work_end; work_item++) { + img = work_item/(Cb*Kb*R*S); + ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); + ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); + kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; + ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; + { + element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; + } + } + } + } + } + + for (work_item = work_begin; work_item < work_end; work_item++) { + img = work_item/(Cb*Kb*R*S); + ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); + ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); + kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; + ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; + ii = 0 + ki; + ij = 0 + kj; + oj = 0; + oi = 0; + { + element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + ind = 0; + for (j_br = 0; j_br < handle->ofh; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img , ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFH, IFW, handle->ifmblock); + ind++; + } + n_blocks = ind; + br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } + } + } else { + /* May need to initialized private weights to zero */ + if (!((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++ ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; + } + } + } + } + } + } + } + + if (handle->upd_loop_order == 0) { + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ofm1 = ofmb; 
ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < 
handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, ofm1, oj + j_br, oi, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0, handle->blocksifm, IFHP, IFWP, handle->ifmblock); + ind++; + } + } + n_blocks = ind; + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +if (handle->weight_copies > 1) { + /* reduce work-related variables */ + const int fm_blocking = (handle->ofmblock % 16 == 0) ? 16 : handle->ofmblock; + const int reduce_work = handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S * (handle->ofmblock/fm_blocking) * handle->ifmblock; + const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; + const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; + const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? 
((ltid + 1) * reduce_chunksize) : reduce_work; + + /* Perform reduction here */ + libxsmm_barrier_wait(handle->barrier, ltid); + + for ( ij = reduce_thr_begin; ij < reduce_thr_end; ij++ ) { + element_filter_type *weight_ptr_glb = (element_filter_type*) handle->grad_filter->data; +#if 1 + float weight_sum[64]; + int wtcnt = 0; + assert( handle->ofmblock <= 64 ); + + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_sum[wtcnt] = 0.0f; + } + + for ( ii = 0; ii < handle->weight_copies; ii++ ) { + element_filter_type *weight_ptr_src = (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * fm_blocking; + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_sum[wtcnt] += weight_ptr_src[wtcnt]; + } + } + + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_ptr_glb[(ij*fm_blocking) + wtcnt] = weight_sum[wtcnt]; + } +#else + __m512 weight_sum = _mm512_setzero_ps(); + for ( ii = 0; ii < handle->weight_copies; ii++ ) { + element_filter_type *weight_ptr_src = (element_filter_type*)handle->scratch7 + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * 16; + weight_sum = _mm512_add_ps(weight_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(weight_ptr_src)); + } + _mm512_storeu_ps(&weight_ptr_glb[ij*16], weight_sum); +#endif + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c new file mode 100644 index 00000000..8ef6e8e2 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c @@ -0,0 +1,723 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - 
All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ + +#define TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1) do {\ + __m512i zero_reg = _mm512_setzero_si512();\ + src_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);\ + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2);\ + for (pixel_pair = 0; pixel_pair < n_full_pixel_pairs; pixel_pair++) {\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ + pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2));\ + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ + ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ + }\ + src_out += 2* handle->ofmblock;\ + tr_out += 2*handle->ofmblock;\ + }\ + if (half_pixel_pair == 1) {\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ + pixel_1 = _mm512_setzero_si512();\ + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ + ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ + }\ + }\ + for (oi = ((handle->compute_pixels+1)/2)*2; oi < handle->output_pixels; oi+=2) {\ 
+ for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, oi/2, ofm2, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2);\ + _mm512_storeu_si512((element_output_type*)tr_out, zero_reg);\ + _mm512_storeu_si512((element_output_type*)tr_out+32, zero_reg);\ + }\ + }\ +} while(0) + +#define TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, oj, H) do { /* for each of the H rows starting at oj: merge every pair of adjacent W pixels through idx_lo/idx_hi and store the result in tr_output_2 */\ + int h, w_pixel_pair, w_full_pixel_pairs = handle->ofwp/2;\ + for (h=0; h<H; h++) {\ + src_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj + h, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);\ + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, 0, h, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2);\ + for (w_pixel_pair = 0; w_pixel_pair < w_full_pixel_pairs; w_pixel_pair++) {\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ + pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2));\ + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ + ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ + }\ + src_out += 2* handle->ofmblock;\ + tr_out += 2*handle->ofmblock;\ + }\ + }\ +} while(0) + +int img, my_img_start, my_img_end, ofmb, ifmb, ofm1, ifm1, ifm2, ofm2, oj, oi, ii, ij, kj, ki, j_br, img_br, i, j, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm, pix; +/* computing first logical thread */ +const int ltid = tid - start_thread; + +const int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; +const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; +const int OFWP = (handle->upd_padding_copy == 1) ?
handle->ofwp + 2*handle->desc.pad_w : handle->ofwp; +const int OFHP = (handle->upd_padding_copy == 1) ? handle->ofhp + 2*handle->desc.pad_h : handle->ofhp; + +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(5, const element_input_type, input, (const element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + +element_filter_type *weight_ptr = (element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; + +element_filter_type *filter_dst_ptr = (handle->weight_copies > 1) ? (element_filter_type*)weight_ptr : (element_filter_type*)handle->grad_filter->data; +LIBXSMM_VLA_DECL(7, element_filter_type, weight_dst, (element_filter_type*)filter_dst_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); + +/* This intermediate tensor is used when pixels are NOT fully accumulated */ +float *weight_ptr_f32 = (float*) ((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; + +LIBXSMM_VLA_DECL(6, float, weight_private_f32, (float*)weight_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +/* Accumulation scratch is used when pixels are fully accumulated */ +element_filter_type *filter_scratch = (element_filter_type*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->ofmblock * handle->ifmblock * 2; + +LIBXSMM_VLA_DECL(2, float, filter_tmp, (float*)filter_scratch, handle->ofmblock); + +element_input_type *scratch_tr_input =
(element_input_type*)((char*)handle->scratch + handle->upd_lp_input_full_scratch_offset); +element_input_type *zero_ptr_in; +element_output_type *zero_ptr_out; +LIBXSMM_VLA_DECL(4, element_input_type, tr_input, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, handle->input_pixels); +LIBXSMM_VLA_DECL(5, element_input_type, tr_input_2, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended); + +element_output_type *scratch_tr_output = (element_input_type*)((char*)handle->scratch + handle->upd_lp_output_full_scratch_offset); +LIBXSMM_VLA_DECL(5, element_output_type, tr_output, (element_output_type*) scratch_tr_output, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); +LIBXSMM_VLA_DECL(6, element_output_type, tr_output_2, (element_output_type*) scratch_tr_output, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2); +#if 0 +element_output_type *out_ptr = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +element_output_type *zero_ptr_out; +#endif + +/* transpose, copy and reduce work-related variables */ +const int reduce_work = (handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S)/16; +const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; +const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; +const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? ((ltid + 1) * reduce_chunksize) : reduce_work; + +const float beta = (handle->use_intermediate_f32_wt_tensor ? 
1.f : 0.f); +float *dst_ptr; +gemm_br_function br_gemm_kernel = 0; + +/* These are used for the vnni reformatting of the f32 output */ +__m512i c01 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); +const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + +/* Related to the output transpose */ +int n_full_pixel_pairs = handle->compute_pixels/2, half_pixel_pair = handle->compute_pixels%2, pixel_pair; +element_output_type *tr_out, *src_out; +const __m512i selector = LIBXSMM_INTRINSICS_MM512_SET_EPI16(32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0); +const __m512i offsets_lo = LIBXSMM_INTRINSICS_MM512_SET_EPI16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); +const __m512i offsets_hi = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16); +const __m512i idx_lo = _mm512_or_epi32(selector, offsets_lo); +const __m512i idx_hi = _mm512_or_epi32(selector, offsets_hi); +__m512i pixel_0, pixel_1, ofms_lo, ofms_hi; + +/* Batch reduce related variables */ +const element_output_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +libxsmm_blasint LDA = handle->ofmblock; +libxsmm_blasint LDB = handle->input_pixels; +libxsmm_blasint LDC = handle->ofmblock; +int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); +int l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + +const int img_work = handle->desc.N; +const int img_chunksize = (img_work % handle->desc.threads == 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; +my_img_start = (ltid * img_chunksize < img_work) ? 
(ltid * img_chunksize) : img_work; +my_img_end = ((ltid + 1) * img_chunksize < img_work) ? ((ltid + 1) * img_chunksize) : img_work; + +libxsmm_barrier_init(handle->barrier, ltid); + +if (handle->upd_linearized_pixels == 1) { + /* First transpose input and output */ + if (handle->use_hybrid_imgofm_parallelization == 1) { + if (handle->upd_pack_input_upfront == 0) { + for (img = my_img_start; img < my_img_end; img++) { + if (handle->upd_padding_copy == 1) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->ifmblock * handle->input_pixels * sizeof(element_input_type)); + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, (ij + handle->desc.pad_h) * IFWP + (ii + handle->desc.pad_w), handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } else { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels), + handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels ); +#if 0 + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * handle->ifwp + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, 
handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } +#endif + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { +#if 0 + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->desc.C * handle->input_pixels * sizeof(element_input_type)); +#endif + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { + transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, ij * (handle->ifwp/handle->desc.v), handle->blocksifm, handle->ifmblock, handle->input_pixels), + handle->ifmblock, handle->ifwp/handle->desc.v, 2*handle->ifmblock, handle->input_pixels ); +#if 0 + for (ii = 0; ii < handle->ifwp/handle->desc.v; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * (handle->ifwp/handle->desc.v) + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, ii*handle->desc.v, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } +#endif + } + } + } + } + + if (handle->upd_padding_copy == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * handle->output_pixels * sizeof(element_output_type)); + for (oj = 0; oj < handle->ofhp; oj++) { + for (oi = 0; oi < handle->ofwp; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, 
tr_output, img, ofm1, (oj*OFWP+oi)/2, ofm2, (oj*OFWP+oi)%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); + } + } + } + } +#if 0 + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, 0, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->desc.K * handle->output_pixels * sizeof(element_output_type)); + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (oi = 0; oi < handle->n_used_pixels; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, oi/2, ofm2, oi%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + *((element_output_type*)out_ptr + img * handle->blocksofm * handle->ofwp * handle->ofhp * handle->ofmblock + ofm1 * handle->ofwp * handle->ofhp * handle->ofmblock + oi * handle->ofmblock + ofm2); + } + } + } + } +#endif +} else { + if (handle->upd_trans_w_only == 0) { + if (handle->on_the_fly_input_packing == 0) { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); + memset(zero_ptr_in, 0, handle->desc.C * handle->ifhp * handle->ifwp_extended * sizeof(element_input_type)); + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, 
handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { +#if 0 + TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, 0, handle->ofh); +#else + for (oj = 0; oj < handle->ofh; oj++) { +#if 0 + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, 0, 0, 0, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * handle->ofwp_extended * sizeof(element_output_type)); +#endif + for (oi = 0; oi < handle->ofw; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, oi/2, ofm2, oi%2, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + if (handle->ofw % 2 == 1) { + for (oj = 0; oj < handle->ofh; oj++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, handle->ofw/2, ofm2, handle->ofw%2, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2) = (element_output_type)0; + } + } + } +#endif + } + } + } +} + +/* Make sure we initialize intermediate weights to zero */ +if (handle->use_intermediate_f32_wt_tensor == 1 && handle->use_hybrid_imgofm_parallelization == 0) { + memset(weight_ptr_f32, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(float)); +} + +if (handle->upd_linearized_pixels == 0) { + if (handle->upd_trans_w_only == 1) { + LDA = handle->ofmblock; + LDB = IFHP*handle->ifwp_extended; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + n_blocks = 
handle->batchreduce_h_pixels; + br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (oj = 0; oj < handle->ofh; oj += handle->batchreduce_h_pixels){ + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + /* Transpose output block */ + TRANS_OUTPUT_W_TO_VNNI_FORMAT(img, ofm1, oj, handle->batchreduce_h_pixels); + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + /* Transpose input block */ + for (j=0; j < handle->batchreduce_h_pixels; j++) { + transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj+j, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), + handle->ifmblock, handle->ifwp_extended, handle->ifmblock, handle->ifhp*handle->ifwp_extended ); + } + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + + for (j_br = 0; j_br < handle->batchreduce_h_pixels; j_br++) { + A_ptrs[j_br] = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, 0, j_br, 0, 0, 0, handle->blocksofm, handle->ofhp, 
handle->ofwp_extended/2, handle->ofmblock, 2); + B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); + } + + br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); + + /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if ((oj + handle->batchreduce_h_pixels >= handle->ofh) && (img == my_img_end - 1)) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), + LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock)) ); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } + } + } + } + } + } + } + } else { + int fast_trans = (handle->ofw == 112 && handle->desc.v == 2 && handle->ifmblock == 4 && handle->batchreduce_h_pixels == 1) ? 
1 : 0; + const __m512i skipper = LIBXSMM_INTRINSICS_MM512_SET_EPI16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0); + __m512i p0, p1, p2, p3; + __m256i _p0, _p1, _p2, _p3; + __m256i r0 = LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256(); + __m256i r1 = LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256(); + __m256i r2 = LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256(); + __m256i r3 = LIBXSMM_INTRINSICS_MM256_UNDEFINED_SI256(); + LDA = handle->ofmblock; + LDB = IFHP*handle->ifwp_extended; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + n_blocks = handle->batchreduce_h_pixels; + /* Handle case when ofw is odd number... */ + if (handle->ofw % 2 == 1) { + br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw+1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } else { + br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + } + + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (oj = 0; oj < handle->ofh; oj += handle->batchreduce_h_pixels){ + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, 
handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + + /* Copy the input in such a way that we ignore "w-pixels" based on ki value */ + if (handle->on_the_fly_input_packing == 1) { + if ((fast_trans == 1) && (handle->upd_padding_copy == 0)) { + for (ii = 0; ii < handle->ofw*2; ii+=32) { + p0 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + p0 = _mm512_permutexvar_epi16(skipper, p0); + _p0 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p0, 0); + p1 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+8+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + p1 = _mm512_permutexvar_epi16(skipper, p1); + _p1 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p1, 0); + p2 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+16+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + p2 = _mm512_permutexvar_epi16(skipper, p2); + _p2 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p2, 0); + p3 = _mm512_loadu_si512((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, oj*handle->desc.u+kj, ii+24+ki, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock)); + p3 = _mm512_permutexvar_epi16(skipper, p3); + _p3 = LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(p3, 0); + + r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p0, 0), 0); + r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p1, 0), 1); + r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p2, 0), 2); + r0 = _mm256_insert_epi64 (r0, _mm256_extract_epi64(_p3, 0), 3); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r0); + 
+ r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p0, 1), 0); + r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p1, 1), 1); + r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p2, 1), 2); + r1 = _mm256_insert_epi64 (r1, _mm256_extract_epi64(_p3, 1), 3); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 1, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r1); + + r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p0, 2), 0); + r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p1, 2), 1); + r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p2, 2), 2); + r2 = _mm256_insert_epi64 (r2, _mm256_extract_epi64(_p3, 2), 3); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 2, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r2); + + r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p0, 3), 0); + r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p1, 3), 1); + r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p2, 3), 2); + r3 = _mm256_insert_epi64 (r3, _mm256_extract_epi64(_p3, 3), 3); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 3, 0, ii/2, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended), r3); + + } + } else { + if (handle->upd_padding_copy == 1) { + for (ij = 0; ij < handle->batchreduce_h_pixels; ij++) { + for (ii = 0; ii < handle->ofw; ii++) { + int j_pixel = (oj+ij)*handle->desc.u+kj; + int i_pixel = ii*handle->desc.v+ki; + if ( (j_pixel >= handle->desc.pad_h) && (i_pixel >= handle->desc.pad_w) && (j_pixel < handle->ifhp+handle->desc.pad_h) && (i_pixel < handle->ifwp+handle->desc.pad_w) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, (oj+ij)*handle->desc.u+kj-handle->desc.pad_h, 
ii*handle->desc.v+ki-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } else { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = (element_input_type)0; + } + } + } + } + } else { + for (ij = 0; ij < handle->batchreduce_h_pixels; ij++) { + for (ii = 0; ii < handle->ofw; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, (oj+ij)*handle->desc.u+kj, ii*handle->desc.v+ki, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } + + for (j_br = 0; j_br < handle->batchreduce_h_pixels; j_br++) { + A_ptrs[j_br] = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj+j_br, 0, 0, 0, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2); + B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended); + } + + br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); + + /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if ((oj + handle->batchreduce_h_pixels >= handle->ofh) && (img == my_img_end - 1)) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), + LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, 
ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } + } + } + } + } + } + } + } +} else { + LDA = handle->ofmblock; + LDB = handle->input_pixels; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + l_flags = LIBXSMM_GEMM_VNNI_FLAGS('N', 'N', 'V', 'N'); + + if (handle->use_hybrid_imgofm_parallelization == 1) { + /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ + /* FIXME: Hardcoed logic for N=27 */ + int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tiles = handle->weight_copies; + int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); + int my_in_tile_id = ltid % group_size; + int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); + int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); + int my_R_start = 0; + int my_R_end = handle->desc.R; + element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; + LIBXSMM_VLA_DECL(7, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); + /* This intermediate tensor is used when pixels are NOT fully accumulated */ + float *weight_tile_ptr_f32 = (float*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; + LIBXSMM_VLA_DECL(6, float, weight_private_tile_f32, (float*)weight_tile_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + + my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); + my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); + my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + /* FIXME: Hardcoed logic for N=27 */ + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { + my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { + int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + my_R_start = LIBXSMM_MIN(my_in_tile_id * r_per_tile, handle->desc.R); 
+ my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * r_per_tile, handle->desc.R); + } + block_ofm = my_ofm_end-my_ofm_start+1; + block_ifm = my_ifm_end-my_ifm_start+1; + img_block_size = my_img_end - my_img_start; + + br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + n_blocks = img_block_size; + + /* Make sure we initialize intermediate weights to zero */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + memset((float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), 0, handle->ofmblock * handle->ifmblock * handle->desc.S * sizeof(float)); + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + + for (img_br = 0; img_br 
< img_block_size; img_br++) { + A_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(5, tr_output, img + img_br, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + B_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(4, tr_input, img + img_br, ifm1, 0, pix + kj * IFWP + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels); + } + + br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); + + /* Convert fully caccumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if ((pix + handle->pixel_blocking >= handle->n_used_pixels) && (img == my_img_end - img_block_size)) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), + LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock)) ); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_private_group, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } + } + } + } + } + } + } + + } else { + gemm_function gemm_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + /* Transpose output block */ + if (pix == 0 && ifmb == 0) { + if (handle->upd_padding_copy == 1) { + 
zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * handle->output_pixels * sizeof(element_output_type)); + for (oj = 0; oj < handle->ofhp; oj++) { + for (oi = 0; oi < handle->ofwp; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, (oj*OFWP+oi)/2, ofm2, (oj*OFWP+oi)%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } else { + TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + /* Transpose input block */ + if (pix == 0 && ofmb == 0 && ofm1 == 0) { + if (handle->upd_padding_copy == 1) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->ifmblock * handle->input_pixels * sizeof(element_input_type)); + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, (ij + handle->desc.pad_h) * IFWP + (ii + handle->desc.pad_w), handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } else { + if (handle->upd_pack_input_upfront == 0) { + transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels), + 
handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels ); + } else { + for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { + transpose_input_pixels_bf16( (element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, ij * (handle->ifwp/handle->desc.v), handle->blocksifm, handle->ifmblock, handle->input_pixels), + handle->ifmblock, handle->ifwp/handle->desc.v, 2*handle->ifmblock, handle->input_pixels ); + } + } + } + } + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2), + &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, pix + kj * IFWP + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels), + dst_ptr); + + /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if ((pix + handle->pixel_blocking >= handle->n_used_pixels) && (img == my_img_end - 1)) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), + LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, 
filter_acc_buffer, ij, ii, handle->ofmblock)) ); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } + } + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +if (handle->weight_copies > 1) { + int active_copies = handle->weight_copies; + const int filter_size = handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K; + LIBXSMM_VLA_DECL(2, element_filter_type, weight_copies_buffer, (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset), filter_size); + element_filter_type *weight_global_ptr = (element_filter_type*) handle->grad_filter->data; + + /* In this case calculate how many weight copies have been indeed computed */ + if (handle->desc.N != handle->desc.threads) { + active_copies = 1; + while (active_copies * img_chunksize < handle->desc.N) { + active_copies++; + } + } + + for ( j = reduce_thr_begin; j < reduce_thr_end; j++) { + __m512 weight_sum = _mm512_setzero_ps(); + for ( i = 0; i < active_copies; i++ ) { + weight_sum = _mm512_add_ps(weight_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, weight_copies_buffer, i, j*16, filter_size)))); + } + _mm256_storeu_si256((__m256i*)(((libxsmm_bfloat16*) weight_global_ptr) + j*16), LIBXSMM_INTRINSICS_MM512_CVT_FP32_BF16(weight_sum)); + } + libxsmm_barrier_wait(handle->barrier, ltid); +} + +#undef TRANS_OUTPUT_W_TO_VNNI_FORMAT +#undef TRANS_OUTPUT_TO_VNNI_FORMAT diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..eb7f7d97 --- /dev/null +++ 
b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16_amx.tpl.c @@ -0,0 +1,783 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#define TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1) do { /* Repack output block (img, ofm1) from the output view [ofhp][ofwp][ofmblock] into the tr_output view [output_pixels/2][ofmblock][2]: pixels are consumed two at a time, 32 elements per step, permuted with idx_lo/idx_hi and stored as two 512-bit vectors. Relies on src_out, tr_out, pixel_pair, n_full_pixel_pairs, half_pixel_pair, pixel_0, pixel_1, ofms_lo, ofms_hi, idx_lo, idx_hi, ofm2 and oi from the enclosing scope. NOTE(review): idx_lo/idx_hi are defined outside this view; presumably they interleave the two pixels per channel - confirm */\ + __m512i zero_reg = _mm512_setzero_si512();\ + src_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock);\ + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2);\ + for (pixel_pair = 0; pixel_pair < n_full_pixel_pairs; pixel_pair++) { /* complete pairs: second pixel is read ofmblock elements after the first */\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ + pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2));\ + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ + ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, pixel_1);\ + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ + }\ + src_out += 2* handle->ofmblock;\ + tr_out += 2*handle->ofmblock;\ + }\ + if (half_pixel_pair == 1) { /* trailing unpaired pixel: the second permute operand is all zeros and src_out is not advanced */\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2);\ + pixel_1 = _mm512_setzero_si512();\ + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1);\ + ofms_hi = _mm512_permutex2var_epi16(pixel_0, 
idx_hi, pixel_1);\ + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi);\ + }\ + tr_out += 2*handle->ofmblock;\ + } \ + for (oi = (n_full_pixel_pairs+half_pixel_pair)*2; oi < handle->output_pixels; oi+=2) { /* zero-fill the remaining pixel pairs of tr_output up to handle->output_pixels */\ + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) {\ + _mm512_storeu_si512((element_output_type*)tr_out+ofm2*2, zero_reg);\ + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, zero_reg);\ + } \ + tr_out += 2*handle->ofmblock;\ + }\ +}while(0) + +#define TRANS_INPUT(img, ifm1) do { /* Transpose input block (img, ifm1) into tr_input with transpose_input_pixels_bf16, then zero the tail [ifhp*ifwp, input_pixels) of every ifm2 row when input_pixels exceeds ifhp*ifwp. Relies on ifm2 and zero_ptr_in from the enclosing scope */\ + transpose_input_pixels_bf16((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock),(element_input_type*)&LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels), handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels);\ + if (handle->input_pixels - handle->ifhp*handle->ifwp > 0) {\ + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) {\ + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, handle->ifhp * handle->ifwp, handle->blocksifm, handle->ifmblock, handle->input_pixels);\ + memset(zero_ptr_in, 0, (handle->input_pixels - handle->ifhp * handle->ifwp)*sizeof(element_input_type));\ + }\ + }\ +} while(0) + +int img, my_img_start, my_img_end, ofmb, ifmb, ofm1, ifm1, ifm2, ofm2, oj, oi, ii, ij, kj, ki, /*j_br, img_br,*/ i, j, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm, pix; +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; + +const int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; +const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; /* IFWP/IFHP: input extents grown by 2*pad_w / 2*pad_h when upd_padding_copy == 1 */ +const int OFWP = (handle->upd_padding_copy == 1) ? 
handle->ofwp + 2*handle->desc.pad_w : handle->ofwp; +const int OFHP = (handle->upd_padding_copy == 1) ? handle->ofhp + 2*handle->desc.pad_h : handle->ofhp; + +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); +LIBXSMM_VLA_DECL(5, const element_input_type, input, (const element_input_type*)handle->reg_input->data, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + +element_filter_type *weight_ptr = (element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; +element_filter_type *filter_dst_ptr = (handle->weight_copies > 1) ? (element_filter_type*)weight_ptr : (element_filter_type*)handle->grad_filter->data; +LIBXSMM_VLA_DECL(7, element_filter_type, weight_dst, (element_filter_type*)filter_dst_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); + +/* This intermediate tensor is used when pixels are NOT fully accumulated */ +float *weight_ptr_f32 = (float*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; +LIBXSMM_VLA_DECL(6, float, weight_private_f32, (float*)weight_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +/* Accumulation scratch is used when pixels are fully accumulated */ +element_filter_type *filter_scratch = (element_filter_type*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + ltid * handle->ofmblock * handle->ifmblock * 2; +LIBXSMM_VLA_DECL(2, float, filter_tmp, (float*)filter_scratch, handle->ofmblock); + +element_input_type *scratch_tr_input =
(element_input_type*)((char*)handle->scratch + handle->upd_lp_input_full_scratch_offset); +element_input_type *zero_ptr_in; +LIBXSMM_VLA_DECL(4, element_input_type, tr_input, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, handle->input_pixels); +LIBXSMM_VLA_DECL(5, element_input_type, tr_input_2, (element_input_type*) scratch_tr_input, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended); +LIBXSMM_VLA_DECL(3, element_input_type, tr_input_3, (element_input_type*) scratch_tr_input, handle->ifmblock, handle->input_pixels); + +element_output_type *scratch_tr_output = (element_input_type*)((char*)handle->scratch + handle->upd_lp_output_full_scratch_offset); +LIBXSMM_VLA_DECL(5, element_output_type, tr_output, (element_output_type*) scratch_tr_output, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); +LIBXSMM_VLA_DECL(6, element_output_type, tr_output_2, (element_output_type*) scratch_tr_output, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2); +LIBXSMM_VLA_DECL(4, element_output_type, tr_output_3, (element_output_type*) scratch_tr_output, handle->output_pixels/2, handle->ofmblock, 2); + +element_output_type *out_ptr = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->ofmblock; +element_output_type *zero_ptr_out; + +/* transpose, copy and reduce work-related variables */ +const int reduce_work = (handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S)/16 ; +const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; +const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; +const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? 
((ltid + 1) * reduce_chunksize) : reduce_work; + +#if 0 +const float beta = (handle->use_intermediate_f32_wt_tensor) ? 1.0 : 0.0; +#endif +float *dst_ptr; +#if 0 +gemm_br_function br_gemm_kernel = 0; +#endif + +/* These are used for the vnni reformatting of the f32 output */ +__m512i c01; +const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + +/* Related to the output transpose */ +int n_full_pixel_pairs = handle->compute_pixels/2, half_pixel_pair = handle->compute_pixels%2, pixel_pair; +element_output_type *tr_out, *src_out; +const __m512i selector = LIBXSMM_INTRINSICS_MM512_SET_EPI16(32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0); +const __m512i offsets_lo = LIBXSMM_INTRINSICS_MM512_SET_EPI16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); +const __m512i offsets_hi = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 25, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16); +const __m512i idx_lo = _mm512_or_epi32(selector, offsets_lo); +const __m512i idx_hi = _mm512_or_epi32(selector, offsets_hi); +__m512i pixel_0, pixel_1, ofms_lo, ofms_hi; + +/* Batch reduce related variables */ +#if 0 +const element_output_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +#endif +unsigned long long n_blocks; + +#if 0 +int LDA = handle->ofmblock; +int LDB = handle->input_pixels; +int LDC = handle->ofmblock; +int prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); +int l_flags = (LIBXSMM_GEMM_FLAGS('N', 'N')) | LIBXSMM_GEMM_FLAG_EXCLUDE_TILECONFIG; +int l_tc_flags = LIBXSMM_GEMM_FLAG_ONLY_TILECONFIG; +gemm_function tile_config_kernel = 0; +#endif + +const int img_work = handle->desc.N; +const int img_chunksize = (img_work % handle->desc.threads 
== 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; + +/* select kernel */ +if (handle->upd_linearized_pixels == 0) { + br_gemm_kernel = handle->upd_compute_kernel_brgemm_no_linearized_pixels; + gemm_kernel = handle->upd_compute_kernel_gemm_linearized_pixels_no_hybrid_par; /* @TODO: ci check */ +} else { + if (handle->use_hybrid_imgofm_parallelization == 0) { + gemm_kernel = handle->upd_compute_kernel_gemm_linearized_pixels_no_hybrid_par; + br_gemm_kernel = handle->upd_compute_kernel_brgemm_no_linearized_pixels; /* @TODO: ci check */ + } else { +#if 0 /* if/else branches with same outcome */ + if (handle->pack_to_cnhw == 1) +#endif + { + gemm_kernel = handle->upd_compute_kernel_gemm_linearized_pixels_hybrid_par_cnhw; + br_gemm_kernel = handle->upd_compute_kernel_brgemm_linearized_pixels_hybrid_par_no_cnhw; /* @TODO: ci check */ + } +#if 0 /* if/else branches with same outcome */ + else { + gemm_kernel = handle->upd_compute_kernel_gemm_linearized_pixels_hybrid_par_cnhw; /* @TODO: ci check */ + br_gemm_kernel = handle->upd_compute_kernel_brgemm_linearized_pixels_hybrid_par_no_cnhw; + } +#endif + } +} + +my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; +my_img_end = ((ltid + 1) * img_chunksize < img_work) ? 
((ltid + 1) * img_chunksize) : img_work; + +libxsmm_barrier_init(handle->barrier, ltid); + +if (handle->upd_linearized_pixels == 1) { + /* First transpose input and output */ + if (handle->pack_to_cnhw == 0) { + if (handle->fuse_upd_transposes == 0) { + if (handle->upd_pack_input_upfront == 0) { + if (handle->upd_padding_copy == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->ifmblock * handle->input_pixels * sizeof(element_input_type)); + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, (ij + handle->desc.pad_h) * IFWP + (ii + handle->desc.pad_w), handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } else { + if (handle->ifmblock % 32 == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + TRANS_INPUT(img, ifm1); + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->desc.C * handle->input_pixels * sizeof(element_input_type)); + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * handle->ifwp + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 
ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, 0, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->desc.C * handle->input_pixels * sizeof(element_input_type)); + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { + for (ii = 0; ii < handle->ifwp/handle->desc.v; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, ij * (handle->ifwp/handle->desc.v) + ii, handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, ii*handle->desc.v, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } + + /* Reformat output */ + if (handle->upd_padding_copy == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * handle->output_pixels * sizeof(element_output_type)); + for (oj = 0; oj < handle->ofhp; oj++) { + for (oi = 0; oi < handle->ofwp; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, (oj*OFWP+oi)/2, ofm2, (oj*OFWP+oi)%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } + } + } else { + if (handle->ofmblock % 32 == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + 
TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, 0, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->desc.K * handle->output_pixels * sizeof(element_output_type)); + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (oi = 0; oi < handle->compute_pixels; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, oi/2, ofm2, oi%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + *((element_output_type*)out_ptr + img * handle->blocksofm * handle->ofwp * handle->ofhp * handle->ofmblock + ofm1 * handle->ofwp * handle->ofhp * handle->ofmblock + oi * handle->ofmblock + ofm2); + } + } + } + } + } + } + } + } else { + int img_tile_id, img_in_tile, init_offset, /*pix_id,*/ images_in_tile = handle->desc.N/handle->weight_copies; + /* Zero out the input padding pixels */ + for (img = my_img_start; img < my_img_end; img++) { + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + if (img_in_tile == images_in_tile-1) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(3, tr_input_3, ifm1, ifm2, img_tile_id * handle->pixel_blocking + images_in_tile * (handle->ifhp/handle->desc.u) * (handle->ifwp/handle->desc.v), handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->remainder_pixels * sizeof(element_input_type)); + } + } + } + } + + if ((handle->ifmblock % 32 == 0) && (handle->desc.u == 1) && (handle->desc.v == 1)) { + for (img = my_img_start; img < my_img_end; img++) { + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + 
transpose_input_pixels_bf16((element_input_type*)&LIBXSMM_VLA_ACCESS(5, input, img, ifm1, 0, 0, 0, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock), + (element_input_type*)&LIBXSMM_VLA_ACCESS(3, tr_input_3, ifm1, 0, img_tile_id * handle->pixel_blocking + img_in_tile * handle->ifhp * handle->ifwp, handle->ifmblock, handle->input_pixels) , + handle->ifmblock, handle->ifhp*handle->ifwp, handle->ifmblock, handle->input_pixels); + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp/handle->desc.u; ij++) { + for (ii = 0; ii < handle->ifwp/handle->desc.v; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(3, tr_input_3, ifm1, ifm2, img_tile_id * handle->pixel_blocking + img_in_tile * (handle->ifhp/handle->desc.u) * (handle->ifwp/handle->desc.v) + ij * (handle->ifwp/handle->desc.v) + ii, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij*handle->desc.u, ii*handle->desc.v, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } + + /* Zero out the output padding pixels */ + for (img = my_img_start; img < my_img_end; img++) { + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + if (img_in_tile == images_in_tile-1) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + init_offset = img_tile_id * handle->pixel_blocking + images_in_tile * handle->ofw * handle->ofh; + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, init_offset/2, 0, init_offset%2, handle->output_pixels/2, handle->ofmblock, 2); + memset(tr_out, 0, handle->remainder_pixels * handle->ofmblock * sizeof(element_input_type)); +#if 0 + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + for (oi = 0; oi < handle->remainder_pixels; oi++ ) { + init_offset = img_tile_id * 
handle->pixel_blocking + images_in_tile * handle->ofw * handle->ofh; + pix_id = init_offset + oi; + LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, pix_id/2, ofm2, pix_id%2, handle->output_pixels/2, handle->ofmblock, 2) = (element_output_type)0; + } + } +#endif + } + } + } + + if (handle->ofmblock % 32 == 0) { + int _trans_pixels = handle->ofw*handle->ofh, _n_full_pixel_pairs, _half_pixel_pair, init_pixel_pos; + for (img = my_img_start; img < my_img_end; img++) { + int pix_id; + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + pix_id = img_tile_id * handle->pixel_blocking + img_in_tile * handle->ofh * handle->ofw; + /* The first-odd pixel is done with scalar code... */ + if (pix_id % 2 == 1) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, pix_id/2, ofm2, 1, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, 0, 0, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + pix_id += 1; + _trans_pixels--; + init_pixel_pos = 1; + } else { + init_pixel_pos = 0; + } + _n_full_pixel_pairs = _trans_pixels/2; + _half_pixel_pair = _trans_pixels%2; + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + src_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, 0, init_pixel_pos, 0, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + tr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, pix_id/2, 0, 0, handle->output_pixels/2, handle->ofmblock, 2); + for (pixel_pair = 0; pixel_pair < _n_full_pixel_pairs; pixel_pair++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2+=32) { + pixel_0 = _mm512_loadu_si512((element_output_type*)src_out+ofm2); + pixel_1 = _mm512_loadu_si512(((element_output_type*)src_out+handle->ofmblock+ofm2)); + ofms_lo = _mm512_permutex2var_epi16(pixel_0, idx_lo, pixel_1); + ofms_hi = _mm512_permutex2var_epi16(pixel_0, idx_hi, 
pixel_1); + _mm512_storeu_si512(tr_out+ofm2*2, ofms_lo); + _mm512_storeu_si512((element_output_type*)tr_out+32+ofm2*2, ofms_hi); + } + src_out += 2* handle->ofmblock; + tr_out += 2*handle->ofmblock; + } + } + /* The last-odd pixel is done with scalar code... */ + if (_half_pixel_pair == 1) { + pix_id = pix_id + _n_full_pixel_pairs*2; + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, pix_id/2, ofm2, pix_id%2, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, handle->ofh-1, handle->ofw-1, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + img_tile_id = img/images_in_tile; + img_in_tile = img%images_in_tile; + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + int pix_id = img_tile_id * handle->pixel_blocking + img_in_tile * handle->ofh * handle->ofw + oj * handle->ofw + oi; + LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, pix_id/2, ofm2, pix_id%2, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } + } + } + } +} else { + if (handle->on_the_fly_input_packing == 0) { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, 0, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended); + memset(zero_ptr_in, 0, handle->desc.C * handle->ifhp * handle->ifwp_extended * sizeof(element_input_type)); + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + 
LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img++) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, 0, 0, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended); + memset(zero_ptr_in, 0, handle->desc.C * IFHP * handle->ifwp_extended * sizeof(element_input_type)); + } + } + for (img = my_img_start; img < my_img_end; img++) { + for (ofm1 = 0; ofm1 < handle->blocksofm; ofm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, 0, 0, 0, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * (handle->ofw+handle->remainder_pixels) * sizeof(element_output_type)); + for (oi = 0; oi < handle->ofw; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, oi/2, ofm2, oi%2, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } + } +} + +/* Make sure we initialize intermediate weights to zero */ +if (handle->use_intermediate_f32_wt_tensor == 1 && handle->use_hybrid_imgofm_parallelization == 0) { + memset(weight_ptr_f32, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(float)); +} + +tile_config_kernel(NULL, NULL, NULL); + +if (handle->upd_linearized_pixels == 0) { +#if 0 + LDA = handle->ofmblock; + LDB = handle->ifhp*handle->ifwp_extended; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); + br_gemm_kernel = 
libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->ofw+handle->remainder_pixels, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + tile_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->ofw+handle->remainder_pixels, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); +#endif + n_blocks = handle->batchreduce_h_pixels; + + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (oj = 0; oj < handle->ofh; oj += handle->batchreduce_h_pixels){ + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + + /* Copy the input in such a way that we ignore "w-pixels" based on ki value */ + if (handle->on_the_fly_input_packing == 1) { + if (handle->upd_padding_copy == 1) { + for (ij = kj; ij < IFHP; ij+=handle->desc.u) { + for (ii = 0; ii < handle->ofw; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + if ( (ij >= handle->desc.pad_h) && (ii*handle->desc.v+ki >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii*handle->desc.v+ki < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, 
handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij-handle->desc.pad_h, ii*handle->desc.v+ki-handle->desc.pad_w, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } else { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = (element_input_type)0; + } + } + } + } + } else { + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ofw; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, ifm2, ij, ii, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii*handle->desc.v+ki, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } + } + +#if 0 + for (j_br = 0; j_br < handle->batchreduce_h_pixels; j_br++) { + A_ptrs[j_br] = (element_output_type*) &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj+j_br, 0, 0, 0, handle->blocksofm, handle->ofhp, handle->ofwp_extended/2, handle->ofmblock, 2); + B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, 0, (oj+j_br)*handle->desc.u + kj, 0, handle->blocksifm, handle->ifmblock, handle->ifhp, handle->ifwp_extended); + } + br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); +#endif + br_gemm_kernel( &LIBXSMM_VLA_ACCESS(6, tr_output_2, img, ofm1, oj, 0, 0, 0, handle->blocksofm, OFHP, handle->ofwp_extended/2, handle->ofmblock, 2), + &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, ifm1, 0, oj*handle->desc.u + kj, 0, handle->blocksifm, handle->ifmblock, IFHP, handle->ifwp_extended), dst_ptr, &n_blocks); + + /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if (oj + handle->batchreduce_h_pixels >= handle->ofh) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) {
+ c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index,(__m512i)c01)); + } + } + } + + } + } + } + } + } + } + } + } +} else { +#if 0 + LDA = handle->ofmblock; + LDB = handle->input_pixels; + LDC = handle->ofmblock; + prefetch_mode = libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE); +#endif + if (handle->use_hybrid_imgofm_parallelization == 1) { + /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ + /* FIXME: Hardcoded logic for N=27 */ + int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : ((handle->desc.threads+handle->weight_copies-1)/handle->weight_copies); + int tile_id = ltid/( (handle->desc.threads+handle->weight_copies-1)/handle->weight_copies ); + int tiles = handle->weight_copies; + int img_per_tile = (handle->desc.N+tiles-1)/tiles; + int my_in_tile_id = ltid % group_size; + int ifms_per_thread = (handle->blocksifm+group_size-1)/group_size; + int ofms_per_thread = (handle->blocksofm+group_size-1)/group_size; + int my_R_start = 0; + int my_R_end = handle->desc.R; + element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ?
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; + LIBXSMM_VLA_DECL(7, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2); + /* This intermediate tensor is used when pixels are NOT fully accumulated */ + float *weight_tile_ptr_f32 = (float*)((char*)handle->scratch + handle->upd_lp_filter_full_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; + LIBXSMM_VLA_DECL(6, float, weight_private_tile_f32, (float*)weight_tile_ptr_f32, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + + my_img_start = LIBXSMM_MIN( tile_id * img_per_tile, handle->desc.N); + my_img_end = LIBXSMM_MIN( (tile_id+1) * img_per_tile, handle->desc.N); + my_ifm_start = LIBXSMM_MIN( my_in_tile_id * ifms_per_thread, handle->blocksifm ); + my_ifm_end = LIBXSMM_MIN( (my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + /* FIXME: Hardcoded logic for N=27 */ + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { + my_ofm_start = LIBXSMM_MIN( my_in_tile_id * ofms_per_thread, handle->blocksofm ); + my_ofm_end = LIBXSMM_MIN( (my_in_tile_id+1) * ofms_per_thread, handle->blocksofm ); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { + int r_per_tile = (handle->desc.R+group_size-1)/group_size; + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + my_R_start = LIBXSMM_MIN( my_in_tile_id * r_per_tile,
handle->desc.R ); + my_R_end = LIBXSMM_MIN( (my_in_tile_id+1) * r_per_tile, handle->desc.R ); + } + if (handle->pack_to_cnhw == 1) { + my_ofm_start = LIBXSMM_MIN( my_in_tile_id * ofms_per_thread, handle->blocksofm ); + my_ofm_end = LIBXSMM_MIN( (my_in_tile_id+1) * ofms_per_thread, handle->blocksofm ); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + + block_ofm = my_ofm_end-my_ofm_start+1; + block_ifm = my_ifm_end-my_ifm_start+1; + img_block_size = my_img_end - my_img_start; + + /* Make sure we initialize intermediate weights to zero */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + memset((float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, 0, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), 0, handle->ofmblock * handle->ifmblock * handle->desc.S * sizeof(float)); + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if (handle->pack_to_cnhw == 0) { +#if 0 + br_gemm_kernel = libxsmm_bsmmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + tile_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); +#endif + n_blocks = img_block_size; + + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 
0; ki < handle->desc.S; ++ki) { + + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_tile_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + +#if 0 + for (img_br = 0; img_br < img_block_size; img_br++) { + A_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(5, tr_output, img + img_br, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + B_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(4, tr_input, img + img_br, ifm1, 0, pix + kj * handle->ifwp + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels); + } + br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); +#endif + + br_gemm_kernel( &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2), + &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, pix + kj * IFWP + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels), + dst_ptr, &n_blocks); + + /* Convert fully accumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if (pix + handle->pixel_blocking >= handle->n_used_pixels) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(7, weight_private_group, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S,
handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, (__m512i)c01)); + } + } + } + } + } + } + } + } + } + } + } + } else { +#if 0 + gemm_function gemm_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + tile_config_kernel = libxsmm_bsmmdispatch(handle->ofmblock, handle->ifmblock, handle->pixel_blocking, &LDA, &LDB, &LDC, NULL, &beta, &l_tc_flags, NULL); +#endif + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + gemm_kernel( &LIBXSMM_VLA_ACCESS(4, tr_output_3, ofm1, tile_id * handle->pixel_blocking/2, 0, 0, handle->output_pixels/2, handle->ofmblock, 2), + &LIBXSMM_VLA_ACCESS(3, tr_input_3, ifm1, 0, tile_id * handle->pixel_blocking, handle->ifmblock, handle->input_pixels), + dst_ptr); + /* Convert fully caccumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(7, weight_private_group, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), 
_mm512_permutexvar_epi16(perm_index, (__m512i)c01)); + } + } + } + } + } + } + } + } + } + } + + } else { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (pix = 0; pix < handle->n_used_pixels; pix += handle->pixel_blocking){ + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + if ((handle->fuse_upd_transposes == 1) && (pix == 0) && (ifmb == 0)) { + /* (img,ofm1) transpose of output */ + if (handle->upd_padding_copy == 1) { + zero_ptr_out = (element_output_type*) &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, 0, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2); + memset(zero_ptr_out, 0, handle->ofmblock * handle->output_pixels * sizeof(element_output_type)); + for (oj = 0; oj < handle->ofhp; oj++) { + for (oi = 0; oi < handle->ofwp; oi++) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { + LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, (oj*OFWP+oi)/2, ofm2, (oj*OFWP+oi)%2, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2) = + LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj, oi, ofm2, handle->blocksofm, handle->ofhp, handle->ofwp, handle->ofmblock); + } + } + } + } else { + TRANS_OUTPUT_TO_VNNI_FORMAT(img, ofm1); + } + } + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + if ((handle->fuse_upd_transposes == 1) && (pix == 0) && (ofm1 == 0)) { + /* (img,ifm1) transpose of input */ + if (handle->upd_padding_copy == 1) { + zero_ptr_in = (element_input_type*) &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, 0, handle->blocksifm, handle->ifmblock, handle->input_pixels); + memset(zero_ptr_in, 0, handle->ifmblock * handle->input_pixels * sizeof(element_input_type)); + for (ij = 0; ij < handle->ifhp; ij++) { + for (ii = 0; ii < handle->ifwp; ii++) { + for (ifm2 = 0; ifm2 < handle->ifmblock; 
ifm2++) { + LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, ifm2, (ij + handle->desc.pad_h) * IFWP + (ii + handle->desc.pad_w), handle->blocksifm, handle->ifmblock, handle->input_pixels) = + LIBXSMM_VLA_ACCESS(5, input, img, ifm1, ij, ii, ifm2, handle->blocksifm, handle->ifhp, handle->ifwp, handle->ifmblock); + } + } + } + } else { + TRANS_INPUT(img, ifm1); + } + } + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + /* Determine if destination is the accumulation scratch or the intermediate fp32 weight tensor */ + if (handle->use_intermediate_f32_wt_tensor == 1) { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(6, weight_private_f32, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); + } else { + dst_ptr = (float*)&LIBXSMM_VLA_ACCESS(2, filter_tmp, 0, 0, handle->ofmblock); + } + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, tr_output, img, ofm1, pix/2, 0, 0, handle->blocksofm, handle->output_pixels/2, handle->ofmblock, 2), + &LIBXSMM_VLA_ACCESS(4, tr_input, img, ifm1, 0, pix + kj * IFWP + ki, handle->blocksifm, handle->ifmblock, handle->input_pixels), + dst_ptr); + /* Convert fully caccumulated buffer to bf16 weight buffer in case of full accumulation has happened */ + if (pix + handle->pixel_blocking >= handle->n_used_pixels) { + LIBXSMM_VLA_DECL(2, float, filter_acc_buffer, (float*)dst_ptr, handle->ofmblock); + for (ij = 0; ij < handle->ifmblock; ij+=2) { + for (ii = 0; ii < handle->ofmblock; ii+=16) { + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij+1, ii, handle->ofmblock)), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&LIBXSMM_VLA_ACCESS(2, filter_acc_buffer, ij, ii, handle->ofmblock))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(7, weight_dst, ofm1, ifm1, kj, ki, ij/2, ii, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock/2, handle->ofmblock, 2), _mm512_permutexvar_epi16(perm_index, 
(__m512i)c01)); + } + } + } + + } + } + } + } + } + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +if (handle->weight_copies > 1) { + const int filter_size = handle->desc.R * handle->desc.S * handle->desc.C * handle->desc.K; + LIBXSMM_VLA_DECL(2, element_filter_type, weight_copies_buffer, (element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset), filter_size); + element_filter_type *weight_global_ptr = (element_filter_type*) handle->grad_filter->data; + for ( j = reduce_thr_begin; j < reduce_thr_end; j++) { + __m512 weight_sum = _mm512_setzero_ps(); + for ( i = 0; i < handle->weight_copies; i++ ) { + weight_sum = _mm512_add_ps(weight_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(2, weight_copies_buffer, i, j*16, filter_size))); + } + _mm512_streamstorecvt_fp32_bf16( ((libxsmm_bfloat16*) weight_global_ptr) + j*16, weight_sum); + } + libxsmm_barrier_wait(handle->barrier, ltid); +} +handle->tilerelease_kernel(NULL, NULL, NULL); + +#undef TRANS_OUTPUT_TO_VNNI_FORMAT +#undef TRANS_INPUT diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c new file mode 100644 index 00000000..fcc2f533 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c @@ -0,0 +1,675 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +int img, my_img_start, my_img_end, ofmb, ifmb, ojb, ofm1, ifm1, ifm2 = 0, ofm2 = 0, oj, oi, ii, ij, kj, ki, ind, j_br, img_br, img_block_size = 1, my_ofm_start, my_ofm_end, my_ifm_start, my_ifm_end, block_ofm, block_ifm; +/* computing first logical thread */ +const int ltid = tid - start_thread; +libxsmm_blasint LDA = handle->blocksofm * handle->ofmblock; +libxsmm_blasint LDB = (handle->upd_pack_input == 1) ? handle->blocksifm * handle->ifmblock : handle->desc.v * handle->blocksifm * handle->ifmblock; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) +libxsmm_blasint LDC = handle->ofmblock; +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) +libxsmm_blasint LDC = handle->blocksofm * handle->ofmblock; +#endif +int l_flags = LIBXSMM_GEMM_FLAGS('N', 'T'); +element_output_type *const out = (element_output_type*)handle->grad_output->data + ((size_t)handle->desc.pad_h_out * handle->ofwp + handle->desc.pad_w_out) * handle->blocksofm * handle->ofmblock; +LIBXSMM_VLA_DECL(5, const element_output_type, output, (const element_output_type*)out, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); +const int IFWP = (handle->upd_padding_copy == 1) ? handle->ifwp + 2*handle->desc.pad_w : handle->ifwp; +const int IFHP = (handle->upd_padding_copy == 1) ? handle->ifhp + 2*handle->desc.pad_h : handle->ifhp; +element_input_type *input_ptr_to_use = (handle->upd_padding_copy == 1) ? 
(element_input_type*) ((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)handle->reg_input->data; +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type*) input_ptr_to_use, IFHP, IFWP, handle->blocksifm, handle->ifmblock); +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) +LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) +LIBXSMM_VLA_DECL(6, element_filter_type, weight_global, (element_filter_type*)handle->grad_filter->data, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif +element_filter_type *weight_ptr = (handle->weight_copies == 1) ? (element_filter_type*)handle->grad_filter->data : (element_filter_type*) ((char*)handle->scratch + handle->upd_filter_scratch_offset) + ltid * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) +LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) +LIBXSMM_VLA_DECL(6, element_filter_type, weight_private, (element_filter_type*)weight_ptr, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif +int prefetch_mode = (handle->desc.u == 2 || (handle->desc.R == 3 && handle->ofw == 7) ) ? 
libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_NONE) : libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BL1); + +/* Batch reduce related variables */ +const element_output_type *A_ptrs[1024]; +const element_input_type *B_ptrs[1024]; +unsigned long long n_blocks; + +int brgemm_pf_oob = 0; +const char *const env_brgemm_pf_oob = getenv("BRGEMM_PF_OOB"); +if ( 0 == env_brgemm_pf_oob ) { +} else { + brgemm_pf_oob = atoi(env_brgemm_pf_oob); +} +if (brgemm_pf_oob > 0) { + prefetch_mode = prefetch_mode | libxsmm_get_gemm_prefetch(LIBXSMM_GEMM_PREFETCH_BRGEMM_OOB); +} + +libxsmm_barrier_init(handle->barrier, ltid); + +/* physical pad input */ +if (handle->upd_padding_copy == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + int imgpt = LIBXSMM_UPDIV(handle->desc.N, handle->desc.threads); + my_img_start = LIBXSMM_MIN(ltid * imgpt, handle->desc.N); + my_img_end = LIBXSMM_MIN((ltid+1) * imgpt, handle->desc.N); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + + for (img = my_img_start; img < my_img_end; img++) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + /* copy the inner part */ + for (ij = 0; ij < handle->ifhp+(2*handle->desc.pad_h); ij++) { + for (ii = 0; ii < handle->ifwp+(2*handle->desc.pad_w); ii++) { + if ( (ij >= handle->desc.pad_h) && (ii >= handle->desc.pad_w) && (ij < handle->ifhp+handle->desc.pad_h) && (ii < handle->ifwp+handle->desc.pad_w) ) { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, ifm2, IFHP, IFWP, handle->blocksifm, handle->ifmblock) = + LIBXSMM_VLA_ACCESS(5, input_src, img, ij-handle->desc.pad_h, ii-handle->desc.pad_w, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } else { + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 
ifm2, IFHP, IFWP, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if (handle->upd_use_batchreduce == 0 && handle->upd_linearized_tasklist == 0) { + /* Parallelize over minibatch */ + const int img_work = handle->desc.N; + const int img_chunksize = (img_work % handle->desc.threads == 0) ? (img_work / handle->desc.threads) : (img_work / handle->desc.threads) + 1; + const float beta = ((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + my_img_start = (ltid * img_chunksize < img_work) ? (ltid * img_chunksize) : img_work; + my_img_end = ((ltid + 1) * img_chunksize < img_work) ? ((ltid + 1) * img_chunksize) : img_work; + + if (!((img_chunksize == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + memset(weight_ptr, 0, handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S * sizeof(element_filter_type)); + } + + if (handle->upd_loop_order == 0) { + for (img = my_img_start; img < my_img_end; img++) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * 
handle->desc.v + kj; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); +#endif + } + } + } + } + } + } + } + } + } + } + } + if (handle->upd_loop_order == 1) { + for (img = my_img_start; img < my_img_end; img++) { + for (ifmb = 0; ifmb < handle->blocksifm; ifmb += handle->block_upd_ifm) { + for (ofmb = 0; ofmb < handle->blocksofm; ofmb += handle->block_upd_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+handle->block_upd_ifm, handle->blocksifm); ifm1++) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+handle->block_upd_ofm, handle->blocksofm); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = 0; kj < handle->desc.R; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, 
handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input, img, ij, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_private, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); +#endif + } + } + } + } + } + } + } + } + } + } + } +} else { + if (handle->upd_linearized_tasklist == 1) { + /* Amount of work when using linearized view of tasks */ + const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm; + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; + const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int work_item; + int Cb = handle->blocksifm; +#if 0 + int Kb = handle->blocksofm; +#endif + int R = handle->desc.R; + int S = handle->desc.S; + + if (handle->upd_avoid_rim_fmas == 0) { + const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; + const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; + element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? 
(element_input_type*)((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; + LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, IFH, IFW, handle->blocksifm, handle->ifmblock); + const float beta = ((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_function gemm_kernel = libxsmm_smmdispatch(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb * handle->upd_ofh_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + /* If requested, pack input to avoid strided accesses */ + if (handle->upd_pack_input == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; + const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); + const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); + + for (img = img_copy_start; img < img_copy_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input_use, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); + } + + /* Initialize weights to zero */ + if (!((handle->desc.N == 1) && (handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + for (work_item = work_begin; work_item < work_end; work_item++) 
{ + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; +#endif + } + } + } + } + + for (img = 0; img < handle->desc.N; img++) { + for (work_item = work_begin; work_item < work_end; work_item++) { + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + oi = 0; + ii = ki; + for (oj = 0; oj < handle->ofh; oj += handle->upd_ofh_rb) { + ij = oj * handle->desc.u + kj; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input_use, img, ij, ii, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) ); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + gemm_kernel( &LIBXSMM_VLA_ACCESS(5, output, img, oj, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock), + &LIBXSMM_VLA_ACCESS(5, input_use, img, ij, ii, ifm1, 0, IFH, IFW, handle->blocksifm, handle->ifmblock), + &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 
0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) ); +#endif + } + } + } + } else { + const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + gemm_br_function br_gemm_kernel2 = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb-1, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + + for (work_item = work_begin; work_item < work_end; work_item++) { + ofm1 = work_item/(Cb*R*S); + ifm1 = (work_item%(Cb*R*S))/(R*S); + kj = ((work_item%(Cb*R*S))%(R*S))/S; + ki = ((work_item%(Cb*R*S))%(R*S))%S; + oi = 0; + oj = 0; + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + img = 0; + img_block_size = handle->desc.N; + + if (kj == 0) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 1; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } else if (ki == 0) { + 
ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi + 1, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii + 1, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } else if (oi == handle->ofw-handle->fwd_ofw_rb && ki == handle->desc.S-1) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, 
handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } else { + if (kj == handle->desc.R-1) { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb-1; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } else { + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, kj, ki, ifm1, 0, ofm1, 
0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } + } + } + } + } else { + /* Here we are using batch-reduce kernel and hybrid minibatch/FM parallelization */ + /* FIXME: Hardcoed logic for N=27 */ + int group_size = (handle->desc.threads == 27 && handle->desc.N == 27 && handle->ofw == 14 && handle->desc.R == 1 && handle->desc.u == 1 && ltid >= 24) ? 3 : LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tile_id = ltid / LIBXSMM_UPDIV(handle->desc.threads, handle->weight_copies); + int tiles = handle->weight_copies; + int img_per_tile = LIBXSMM_UPDIV(handle->desc.N, tiles); + int my_in_tile_id = ltid % group_size; + int ifms_per_thread = LIBXSMM_UPDIV(handle->blocksifm, group_size); + int ofms_per_thread = LIBXSMM_UPDIV(handle->blocksofm, group_size); + int my_R_start = 0; + int my_R_end = handle->desc.R; + const float beta = ((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw)) ? 0.f : 1.f; + gemm_br_function br_gemm_kernel = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta, &l_flags, &prefetch_mode); + const float beta_flat = 0.0; + gemm_br_function br_gemm_kernel_flat = libxsmm_smmdispatch_reducebatch_addr(handle->ofmblock, handle->ifmblock, handle->upd_ofw_rb, &LDA, &LDB, &LDC, NULL, &beta_flat, &l_flags, &prefetch_mode); + element_filter_type *weight_ptr_group = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + tile_id * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_private_group, (element_filter_type*)weight_ptr_group, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + my_img_start = LIBXSMM_MIN(tile_id * img_per_tile, handle->desc.N); + my_img_end = LIBXSMM_MIN((tile_id+1) * img_per_tile, handle->desc.N); + my_ifm_start = LIBXSMM_MIN(my_in_tile_id * ifms_per_thread, handle->blocksifm ); + my_ifm_end = LIBXSMM_MIN((my_in_tile_id+1) * ifms_per_thread, handle->blocksifm ); + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + /* FIXME: Hardcoed logic for N=27 */ + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.C == 256 && handle->desc.K == 1024 && handle->ofh == 14 && handle->desc.u == 1) { + my_ofm_start = LIBXSMM_MIN(my_in_tile_id * ofms_per_thread, handle->blocksofm); + my_ofm_end = LIBXSMM_MIN((my_in_tile_id+1) * ofms_per_thread, handle->blocksofm); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + } + if (handle->desc.threads == 27 && handle->desc.N == 27 && handle->desc.R == 3 && handle->desc.S == 3 && handle->ofh == 14) { + int r_per_tile = LIBXSMM_UPDIV(handle->desc.R, group_size); + my_ifm_start = 0; + my_ifm_end = handle->blocksifm; + my_ofm_start = 0; + my_ofm_end = handle->blocksofm; + my_R_start = LIBXSMM_MIN(my_in_tile_id * r_per_tile, handle->desc.R); + my_R_end = LIBXSMM_MIN((my_in_tile_id+1) * r_per_tile, handle->desc.R); + } + block_ofm = 
my_ofm_end-my_ofm_start+1; + block_ifm = my_ifm_end-my_ifm_start+1; + img_block_size = my_img_end - my_img_start; + + if (handle->desc.N != handle->desc.threads) { + /* Use "flat" parallelism + reduction */ + const int work = handle->desc.R * handle->desc.S * handle->blocksofm * handle->blocksifm * handle->desc.N; + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : (work / handle->desc.threads) + 1; + const int work_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int work_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int work_item; + int Cb = handle->blocksifm; + int Kb = handle->blocksofm; + int R = handle->desc.R; + int S = handle->desc.S; + const int IFH = (handle->upd_pack_input == 1) ? handle->ifhp/handle->desc.u : IFHP; + const int IFW = (handle->upd_pack_input == 1) ? handle->ifwp/handle->desc.v : IFWP; + element_input_type *input_ptr_base = (handle->upd_pack_input == 1) ? (element_input_type*)((char*)handle->scratch + handle->upd_packing_padding_scratch_offset) : (element_input_type*)input_ptr_to_use; + LIBXSMM_VLA_DECL(5, element_input_type, input_use, (element_input_type*)input_ptr_base, IFH, IFW, handle->blocksifm, handle->ifmblock); + + /* If requested, pack input to avoid strided accesses */ + if (handle->upd_pack_input == 1) { + LIBXSMM_VLA_DECL(5, element_input_type, input_src, (element_input_type*)handle->reg_input->data, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + const int img_chunk = (handle->desc.N % handle->desc.threads == 0) ? 
handle->desc.N/handle->desc.threads : (handle->desc.N/handle->desc.threads) + 1; + const int img_copy_start = LIBXSMM_MIN(ltid*img_chunk, handle->desc.N); + const int img_copy_end = LIBXSMM_MIN((ltid+1)*img_chunk, handle->desc.N); + + for (img = img_copy_start; img < img_copy_end; img++) { + for (ifm1 = 0; ifm1 < handle->blocksifm; ifm1++) { + for (oj = 0; oj < handle->ofh; oj++) { + for (oi = 0; oi < handle->ofw; oi++) { + ij = oj * handle->desc.u; + ii = oi * handle->desc.v; + LIBXSMM_PRAGMA_SIMD + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_VLA_ACCESS(5, input_use, img, oj, oi, ifm1, ifm2, IFH, IFW, handle->blocksifm, handle->ifmblock) = LIBXSMM_VLA_ACCESS(5, input_src, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock); + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, ltid); + } + + /* Initialize weights to zero */ + if (handle->upd_ofw_rb != handle->ofw) { + for (work_item = work_begin; work_item < work_end; work_item++) { + img = work_item/(Cb*Kb*R*S); + ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); + ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); + kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; + ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; + { + element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset)+ img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { + LIBXSMM_PRAGMA_SIMD + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++) { +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_ACCESS(6, weight_current, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; +#endif + } + } + } + } + } + + for (work_item = work_begin; work_item < work_end; work_item++) { + img = work_item/(Cb*Kb*R*S); + ofm1 = (work_item%(Cb*Kb*R*S))/(Cb*R*S); + ifm1 = ((work_item%(Cb*Kb*R*S))%(Cb*R*S))/(R*S); + kj = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))/S; + ki = (((work_item%(Cb*Kb*R*S))%(Cb*R*S))%(R*S))%S; + ii = 0 + ki; + ij = 0 + kj; + oj = 0; + oi = 0; + { + element_filter_type *weight_ptr_current = (handle->weight_copies > 1) ? 
(element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset) + img * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S : (element_filter_type*)handle->grad_filter->data; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_DECL(6, element_filter_type, weight_current, (element_filter_type*)weight_ptr_current, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock); +#endif + ind = 0; + for (j_br = 0; j_br < handle->ofh; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img , oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel_flat(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_current, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } + } + } else { + /* May need to initialized private weights to zero */ + if (!((handle->upd_ofh_rb == handle->ofh) && (handle->upd_ofw_rb == handle->ofw))) { + for (ofm1 = my_ofm_start; ofm1 < my_ofm_end; ofm1++ ) { + for (ifm1 = my_ifm_start; ifm1 < my_ifm_end; ifm1++) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < 
handle->desc.S; ++ki) { + for (ofm2 = 0; ofm2 < handle->ofmblock; ofm2++ ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ifm2++) { +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, ifm2, ofm2, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock) = (element_filter_type)0; +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, ifm2, ofm1, ofm2, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock) = (element_filter_type)0; +#endif + } + } + } + } + } + } + } + + if (handle->upd_loop_order == 0) { + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + 
br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), &n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } + } + } + } + } + } + } + } + } + } + } else { + for (img = my_img_start; img < my_img_end; img += img_block_size) { + for (ifmb = my_ifm_start; ifmb < my_ifm_end; ifmb += block_ifm) { + for (ofmb = my_ofm_start; ofmb < my_ofm_end; ofmb += block_ofm) { + for (ojb = 0; ojb < handle->ofh; ojb += handle->upd_ofh_rb) { + for (ifm1 = ifmb; ifm1 < LIBXSMM_MIN(ifmb+block_ifm, my_ifm_end); ifm1++) { + for (ofm1 = ofmb; ofm1 < LIBXSMM_MIN(ofmb+block_ofm, my_ofm_end); ofm1++ ) { + for (oj = ojb; oj < LIBXSMM_MIN(ojb+handle->upd_ofh_rb,handle->ofh); oj+= handle->upd_ofh_rb) { + for (oi = 0; oi < handle->ofw; oi += handle->upd_ofw_rb) { + for (kj = my_R_start; kj < my_R_end; ++kj) { + for (ki = 0; ki < handle->desc.S; ++ki) { + ii = oi * handle->desc.u + ki; + ij = oj * handle->desc.v + kj; + ind = 0; + for (img_br = 0; img_br < img_block_size; img_br++) { + for (j_br = 0; j_br < handle->upd_ofh_rb; j_br++) { + A_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img + img_br, oj + j_br, oi, ofm1, 0, handle->ofhp, handle->ofwp, handle->blocksofm, handle->ofmblock); + B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP, IFWP, handle->blocksifm, handle->ifmblock); + ind++; + } + } + n_blocks = ind; +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_CUSTOM) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, ofm1, ifm1, kj, ki, 0, 0, handle->blocksifm, handle->desc.R, handle->desc.S, handle->ifmblock, handle->ofmblock), 
&n_blocks); +#endif +#if defined(LIBXSMM_DNN_TPL_UPD_DIRECT_GENERIC_NHWC_RSCK) + br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_private_group, kj, ki, ifm1, 0, ofm1, 0, handle->desc.S, handle->blocksifm, handle->ifmblock, handle->blocksofm, handle->ofmblock), &n_blocks); +#endif + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +if (handle->weight_copies > 1) { + /* reduce work-related variables */ + const int fm_blocking = (handle->ofmblock % 16 == 0) ? 16 : handle->ofmblock; + const int reduce_work = handle->blocksofm * handle->blocksifm * handle->desc.R * handle->desc.S * (handle->ofmblock/fm_blocking) * handle->ifmblock; + const int reduce_chunksize = (reduce_work % handle->desc.threads == 0) ? (reduce_work / handle->desc.threads) : (reduce_work / handle->desc.threads) + 1; + const int reduce_thr_begin = (ltid * reduce_chunksize < reduce_work) ? (ltid * reduce_chunksize) : reduce_work; + const int reduce_thr_end = ((ltid + 1) * reduce_chunksize < reduce_work) ? ((ltid + 1) * reduce_chunksize) : reduce_work; + + /* Perform reduction here */ + libxsmm_barrier_wait(handle->barrier, ltid); + + for ( ij = reduce_thr_begin; ij < reduce_thr_end; ij++ ) { + element_filter_type *weight_ptr_glb = (element_filter_type*) handle->grad_filter->data; +#if 1 + float weight_sum[64]; + int wtcnt = 0; + assert( handle->ofmblock <= 64 ); + + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_sum[wtcnt] = 0.0f; + } + + for ( ii = 0; ii < handle->weight_copies; ii++ ) { + element_filter_type *weight_ptr_src = (element_filter_type*)((char*)handle->scratch + handle->upd_filter_scratch_offset)+ ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * fm_blocking; + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_sum[wtcnt] += weight_ptr_src[wtcnt]; + } + } + + LIBXSMM_PRAGMA_SIMD + for ( wtcnt = 0; wtcnt < fm_blocking; ++wtcnt ) { + weight_ptr_glb[(ij*fm_blocking) + wtcnt] = 
weight_sum[wtcnt]; + } +#else + __m512 weight_sum = _mm512_setzero_ps(); + for ( ii = 0; ii < handle->weight_copies; ii++ ) { + element_filter_type *weight_ptr_src = (element_filter_type*)handle->scratch7 + ii * handle->desc.C * handle->desc.K * handle->desc.R * handle->desc.S + ij * 16; + weight_sum = _mm512_add_ps(weight_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(weight_ptr_src)); + } + _mm512_storeu_ps(&weight_ptr_glb[ij*16], weight_sum); +#endif + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c new file mode 100644 index 00000000..950f0a23 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_custom_generic.tpl.c @@ -0,0 +1,246 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ +/* Backward pass (kind is BWD or BWDUPD): transpose the filter blocks into filter_tr, then call gemm_kernel_bwd once per input feature-map block to produce dinput. */ +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + /* size variables, all const */ + /* here we assume that input and output blocking is similar */ + const int nBlocksIFm = handle->blocksifm; + const int nIFmBlock = handle->ifmblock; + const int nBlocksOFm = handle->blocksofm; + const int nOFmBlock = handle->ofmblock; + + /* logical thread id: tid relative to start_thread */ + const int ltid = tid - start_thread; + /* number of GEMM tasks that could be run in parallel: one per input feature-map block */ + const int work = nBlocksIFm; + /* compute chunk size: work divided by the thread count, rounded up */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end: this thread's task range [thr_begin, thr_end), clamped to work */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + + /* number of tasks for transpose that could be run in parallel: one per (ofm block, ifm block) pair */ + const int transpose_work = nBlocksIFm * nBlocksOFm; + /* compute chunk size: transpose_work divided by the thread count, rounded up */ + const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end of the transpose range, clamped to transpose_work */ + const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; + + /* loop variables */ + int ofm1 = 0; + int ofm2 = 0; + int ifm1 = 0; + int ifm2 = 0; + int ifm1ofm1 = 0; + + LIBXSMM_VLA_DECL(3, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksOFm, nOFmBlock); + LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); +#if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) /* f32 staging in handle->scratch: dinput at offset 0, filter_tr N*C floats later */ + float* dinput_f32_ptr = (float*)handle->scratch; + float* filter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); + LIBXSMM_VLA_DECL(3, float, dinput, dinput_f32_ptr, nBlocksIFm, nIFmBlock); + LIBXSMM_VLA_DECL(4, float, filter_tr, filter_f32_ptr, nBlocksOFm, nOFmBlock, nIFmBlock); + + /* number of dinput elements (N*C), split across threads for the libxsmm_rne_convert_fp32_bf16 call at the end of this block */ + const int work_input = handle->desc.N * handle->desc.C; + /* compute chunk size: work_input divided by the thread count, rounded up */ + const int chunksize_input = (work_input % handle->desc.threads == 0) ? (work_input / handle->desc.threads) : ((work_input / handle->desc.threads) + 1); + /* compute thr_begin and thr_end of the conversion range, clamped to work_input */ + const int thr_begin_input = (ltid * chunksize_input < work_input) ? (ltid * chunksize_input) : work_input; + const int thr_end_input = ((ltid + 1) * chunksize_input < work_input) ? 
((ltid + 1) * chunksize_input) : work_input; +#else + LIBXSMM_VLA_DECL(3, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, nIFmBlock); + LIBXSMM_VLA_DECL(4, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, nOFmBlock, nIFmBlock); +#endif + + /* lazy barrier init */ + libxsmm_barrier_init(handle->barrier, ltid); + /* transpose: filter[ofm1][ifm1][ifm2][ofm2] is stored to filter_tr[ifm1][ofm1][ofm2][ifm2] */ + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + + for (ofm2 = 0; ofm2 < nOFmBlock; ++ofm2) { + for (ifm2 = 0; ifm2 < nIFmBlock; ++ifm2) { +#if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) + union libxsmm_bfloat16_hp filter_f32; /* i[0] is zeroed and i[1] takes the filter value before .f is read; presumably widens bf16 to f32 - confirm against the union's definition */ + filter_f32.i[0] = 0; + filter_f32.i[1] = LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock); + LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) = filter_f32.f; +#else + LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) = + LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock); +#endif + } + } + } + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); + + for ( ifm1 = thr_begin; ifm1 < thr_end; ++ifm1 ) { /* outer GEMM m-loop */ +#if 1 + gemm_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, nOFmBlock, nIFmBlock), + &LIBXSMM_VLA_ACCESS(3, doutput, 0, 0, 0, nBlocksOFm, nOFmBlock), + &LIBXSMM_VLA_ACCESS(3, dinput, 0, ifm1, 0, nBlocksIFm, nIFmBlock) ); +#else + const int nImg = handle->desc.N; + int img2; + + /* this is a simple replacement code using regular loops (compiled out by the "#if 1" above): dinput is zeroed, then accumulates filter_tr * doutput */ + for ( img2 = 0; img2 < nImg; ++img2 ) { + LIBXSMM_PRAGMA_SIMD + for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { + LIBXSMM_VLA_ACCESS(3, dinput, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock) = (element_output_type)0; + } + } + for ( ofm1 = 0; ofm1 < nBlocksOFm; ++ofm1 ) { /* outer GEMM k-loop */ + 
for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM K-loop */ + for ( img2 = 0; img2 < nImg; ++img2 ) { /* GEMM n-loop */ + LIBXSMM_PRAGMA_SIMD + for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM m-loop */ + LIBXSMM_VLA_ACCESS(3, dinput, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock) += + LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, nOFmBlock, nIFmBlock) * LIBXSMM_VLA_ACCESS(3, doutput, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock); + } + } + } + } +#endif + } + +#if defined(LIBXSMM_DNN_FULLYCONNECTED_BWD_BF16_F32) + libxsmm_barrier_wait(handle->barrier, ltid); + /* convert this thread's slice of the f32 dinput held in scratch into grad_input */ + libxsmm_rne_convert_fp32_bf16( dinput_f32_ptr+thr_begin_input, ((element_input_type*)handle->grad_input->data)+thr_begin_input, thr_end_input-thr_begin_input ); +#endif + + libxsmm_barrier_wait(handle->barrier, ltid); +} +/* Weight-update pass (kind is UPD or BWDUPD): transpose the input into input_tr, then call gemm_kernel_upd once per (ofm block, ifm block) pair to produce dfilter. */ +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + /* size variables, all const */ + const int nImg = handle->desc.N; + /* here we assume that input and output blocking is similar */ + const int nBlocksIFm = handle->blocksifm; + const int nIFmBlock = handle->ifmblock; + const int nBlocksOFm = handle->blocksofm; + const int nOFmBlock = handle->ofmblock; + + /* logical thread id: tid relative to start_thread */ + const int ltid = tid - start_thread; + /* number of GEMM tasks that could be run in parallel: one per (ofm block, ifm block) pair */ + const int work = nBlocksIFm * nBlocksOFm; + /* compute chunk size: work divided by the thread count, rounded up */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end: this thread's task range [thr_begin, thr_end), clamped to work */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + + /* number of tasks for transpose that could be run in parallel: one per input feature-map block */ + const int transpose_work = nBlocksIFm; + /* compute chunk size: transpose_work divided by the thread count, rounded up */ + const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end of the transpose range, clamped to transpose_work */ + const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; + + /* loop variables */ + int img2 = 0; + int ifm1ofm1 = 0; + int ofm1 = 0; + int ifm1 = 0; + int ifm2 = 0; + + LIBXSMM_VLA_DECL(3, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, nIFmBlock); + LIBXSMM_VLA_DECL(3, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksOFm, nOFmBlock); +#if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) /* f32 staging in handle->scratch: input_tr at offset 0, dfilter N*C floats later */ + float* input_f32_ptr = (float*)handle->scratch; + float* dfilter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); + LIBXSMM_VLA_DECL(3, float, input_tr, input_f32_ptr, nIFmBlock, nImg); + LIBXSMM_VLA_DECL(4, float, dfilter, dfilter_f32_ptr, nBlocksIFm, nIFmBlock, nOFmBlock); + + /* number of dfilter elements (C*K), split across threads for the libxsmm_rne_convert_fp32_bf16 call at the end of this block */ + const int work_filter = handle->desc.C * handle->desc.K; + /* compute chunk size: work_filter divided by the thread count, rounded up */ + const int chunksize_filter = (work_filter % handle->desc.threads == 0) ? (work_filter / handle->desc.threads) : ((work_filter / handle->desc.threads) + 1); + /* compute thr_begin and thr_end of the conversion range, clamped to work_filter */ + const int thr_begin_filter = (ltid * chunksize_filter < work_filter) ? (ltid * chunksize_filter) : work_filter; + const int thr_end_filter = ((ltid + 1) * chunksize_filter < work_filter) ? 
((ltid + 1) * chunksize_filter) : work_filter; +#else + LIBXSMM_VLA_DECL(4, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); + LIBXSMM_VLA_DECL(3, element_input_type, input_tr, (element_input_type* )handle->scratch, nIFmBlock, nImg); +#endif + + /* lazy barrier init */ + libxsmm_barrier_init(handle->barrier, ltid); + /* transpose: input[img2][ifm1][ifm2] is stored to input_tr[ifm1][ifm2][img2] */ + for (ifm1 = transpose_thr_begin; ifm1 < transpose_thr_end; ++ifm1) { + for (ifm2 = 0; ifm2 < nIFmBlock; ++ifm2) { + for (img2 = 0; img2 < nImg; ++img2) { +#if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) + union libxsmm_bfloat16_hp input_f32; /* i[0] is zeroed and i[1] takes the input value before .f is read; presumably widens bf16 to f32 - confirm against the union's definition */ + input_f32.i[0] = 0; + input_f32.i[1] = LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock); + LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg) = input_f32.f; +#else + LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg) = + LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, ifm2, nBlocksIFm, nIFmBlock); +#endif + } + } + } + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); + + for ( ifm1ofm1 = thr_begin; ifm1ofm1 < thr_end; ++ifm1ofm1 ) { /* outer GEMM m/n-loop */ + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + +#if 1 + gemm_kernel_upd( &LIBXSMM_VLA_ACCESS(3, doutput, 0, ofm1, 0, nBlocksOFm, nOFmBlock), + &LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, 0, 0, nIFmBlock, nImg), + &LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, 0, 0, nBlocksIFm, nIFmBlock, nOFmBlock) ); +#else + { + const int nImg = handle->desc.N; + int ifm2, ofm2; + + /* this is a simple replacement code using regular loops (compiled out by the "#if 1" above): dfilter is zeroed, then accumulates doutput * input_tr */ + for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { + LIBXSMM_PRAGMA_SIMD + for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) = (element_output_type)0; + } + } + for ( img2 = 0; img2 < nImg; ++img2 ) { /* GEMM k-loop */ + for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM n-loop */ + 
LIBXSMM_PRAGMA_SIMD + for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM m-loop */ + LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) += + LIBXSMM_VLA_ACCESS(3, doutput, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) * LIBXSMM_VLA_ACCESS(3, input_tr, ifm1, ifm2, img2, nIFmBlock, nImg); + } + } + } + } +#endif + } + +#if defined(LIBXSMM_DNN_FULLYCONNECTED_UPD_BF16_F32) + libxsmm_barrier_wait(handle->barrier, ltid); + /* convert this thread's slice of the f32 dfilter held in scratch into grad_filter */ + libxsmm_rne_convert_fp32_bf16( dfilter_f32_ptr+thr_begin_filter, ((element_input_type*)handle->grad_filter->data)+thr_begin_filter, thr_end_filter-thr_begin_filter ); +#endif + + libxsmm_barrier_wait(handle->barrier, ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c new file mode 100644 index 00000000..84313611 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic.tpl.c @@ -0,0 +1,346 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ + +/* here we assume that input and output blocking is similar */ +const int bn = handle->bn; +const int bk = handle->bk; +const int bc = handle->bc; +const int nBlocksIFm = handle->desc.C / bc; +const int nBlocksOFm = handle->desc.K / bk; +const int nBlocksMB = handle->desc.N / bn; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +/* number of tasks for transpose that could be run in parallel */ +const int eltwise_work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ +const int eltwise_chunksize = (eltwise_work % handle->desc.threads == 0) ? (eltwise_work / handle->desc.threads) : ((eltwise_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int eltwise_thr_begin = (ltid * eltwise_chunksize < eltwise_work) ? (ltid * eltwise_chunksize) : eltwise_work; +const int eltwise_thr_end = ((ltid + 1) * eltwise_chunksize < eltwise_work) ? ((ltid + 1) * eltwise_chunksize) : eltwise_work; +int mb1ofm1; +#endif + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +/* number of tasks for transpose that could be run in parallel */ +const int dbias_work = nBlocksOFm; +/* compute chunk size */ +const int dbias_chunksize = (dbias_work % handle->desc.threads == 0) ? (dbias_work / handle->desc.threads) : ((dbias_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int dbias_thr_begin = (ltid * dbias_chunksize < dbias_work) ? (ltid * dbias_chunksize) : dbias_work; +const int dbias_thr_end = ((ltid + 1) * dbias_chunksize < dbias_work) ? 
((ltid + 1) * dbias_chunksize) : dbias_work; +#endif + +/* loop variables */ +int ofm1 = 0, mb1 = 0, ofm2 = 0, mb2 = 0; + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +element_output_type *grad_output_ptr = ((element_output_type*)handle->scratch)+(handle->desc.C*handle->desc.K); +LIBXSMM_VLA_DECL(4, const element_output_type, doutput_orig, (element_output_type*)handle->grad_output->data, nBlocksOFm, bn, bk); +#else +element_output_type *grad_output_ptr = (element_output_type*)handle->grad_output->data; +#endif +LIBXSMM_VLA_DECL(4, element_output_type, doutput, grad_output_ptr, nBlocksOFm, bn, bk); + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +LIBXSMM_VLA_DECL(2, float, dbias, (float*) handle->grad_bias->data, handle->bk); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU +LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*) handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + float l_cur_out = LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU + l_cur_out = (LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) != 0) ? 
l_cur_out : (element_output_type)0; +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + l_cur_out = l_cur_out*(1.0f - l_cur_out); +#endif + LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } +} + +/* wait for eltwise to finish */ +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) +for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, ofm2, handle->bk ) = 0.0f; + } + + for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, ofm2, handle->bk ) += LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); + } + } + } +} + +/* wait for eltwise to finish */ +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + const int use_2d_blocking = handle->bwd_2d_blocking; + + /* number of tasks that could be run in parallel */ + const int work = nBlocksIFm * nBlocksMB; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + + /* number of tasks for transpose that could be run in parallel */ + const int transpose_work = nBlocksIFm * nBlocksOFm; + /* compute chunk size */ + const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; + + /* loop variables */ + int ifm1 = 0, ifm2 = 0, ifm1ofm1 = 0, mb1ifm1 = 0; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc, bk); + LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(4, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, bk, bc); + + unsigned long long blocks = nBlocksOFm; + int KB_BLOCKS = nBlocksOFm, BF = 1; + libxsmm_meltw_unary_param trans_param; + + BF = handle->bwd_bf; + KB_BLOCKS = nBlocksOFm/BF; + blocks = KB_BLOCKS; + + if (use_2d_blocking == 1) { + row_teams = handle->bwd_row_teams; + column_teams = handle->bwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksIFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksIFm); + } + + /* transpose weight */ + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; 
+ trans_param.in.primary = (void*)&LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, 0, 0, nBlocksIFm, bc, bk); + trans_param.out.primary = &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, 0, 0, nBlocksOFm, bk, bc); + handle->tr_kernel( &trans_param ) ; +#if 0 + for (ofm2 = 0; ofm2 < bk; ++ofm2) { + for (ifm2 = 0; ifm2 < bc; ++ifm2) { + LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1, ofm2, ifm2, nBlocksOFm, bk, bc) = + LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, bc, bk); + } + } +#endif + } + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); + + if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + for ( mb2 = 0; mb2 < bn; ++mb2 ) { + for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { + LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, mb2, ifm2, nBlocksIFm, bn, bc) = (element_input_type)0; + } + } + } + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bk, bc ), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + } else { + for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, bk, bc), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + } else { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + for ( mb2 = 0; mb2 < bn; ++mb2 ) { + for ( 
ifm2 = 0; ifm2 < bc; ++ifm2 ) { + LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, mb2, ifm2, nBlocksIFm, bn, bc) = (element_input_type)0; + } + } + } + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bk, bc ), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } else { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter_tr, ifm1, 0, 0, 0, nBlocksOFm, bk, bc ), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + /* number of tasks that could be run in parallel */ + const int ofm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ofm_subtasks; + const int ifm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ifm_subtasks; + const int bbk = (handle->upd_2d_blocking == 1) ? bk : bk/ofm_subtasks; + const int bbc = (handle->upd_2d_blocking == 1) ? bc : bc/ifm_subtasks; + const int work = nBlocksIFm * ifm_subtasks * nBlocksOFm * ofm_subtasks; + const int Cck_work = nBlocksIFm * ifm_subtasks * ofm_subtasks; + const int Cc_work = nBlocksIFm * ifm_subtasks; + + /* 2D blocking parameters */ + int use_2d_blocking = handle->upd_2d_blocking; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + int BF = handle->upd_bf; + + /* loop variables */ + int ifm1ofm1 = 0, ifm1 = 0, ifm2 = 0, bfn = 0, ii = 0, jj = 0; + + /* Batch reduce related variables */ + unsigned long long blocks = nBlocksMB/BF; + + LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(4, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, bc, bk); + + if (use_2d_blocking == 1) { + row_teams = handle->upd_row_teams; + column_teams = handle->upd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksIFm); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksIFm); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); + } + + if (use_2d_blocking == 1) { + if (BF == 1) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { + batchreduce_kernel_upd_zerobeta(&LIBXSMM_VLA_ACCESS(4, doutput, 0, ofm1, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, input, 0, ifm1, 0, 0, nBlocksIFm, bn, bc), + &LIBXSMM_VLA_ACCESS(4, dfilter, ofm1, ifm1, 0, 0, nBlocksIFm, bc, bk), &blocks); + } + } + } else { + for (bfn = 0; bfn < BF; bfn++) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { + /* initialize current work task to zero */ + if (bfn == 0) { + for (ii = 0; 
iibarrier, ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c new file mode 100644 index 00000000..a47e49c7 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16.tpl.c @@ -0,0 +1,625 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +/* size variables, all const */ +/* here we assume that input and output blocking is similar */ +const int bn = handle->bn; +const int bk = handle->bk; +const int bc = handle->bc; +int lpb = 2; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, mb2 = 0, ofm2 = 0; +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) || defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) +int iteri = 0, iterj = 0; +#endif +int performed_doutput_transpose = 0; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +/* number of tasks for transpose that could be run in parallel */ +const int eltwise_work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ 
+const int eltwise_chunksize = (eltwise_work % handle->desc.threads == 0) ? (eltwise_work / handle->desc.threads) : ((eltwise_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int eltwise_thr_begin = (ltid * eltwise_chunksize < eltwise_work) ? (ltid * eltwise_chunksize) : eltwise_work; +const int eltwise_thr_end = ((ltid + 1) * eltwise_chunksize < eltwise_work) ? ((ltid + 1) * eltwise_chunksize) : eltwise_work; +#endif + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +/* number of tasks for transpose that could be run in parallel */ +const int dbias_work = nBlocksOFm; +/* compute chunk size */ +const int dbias_chunksize = (dbias_work % handle->desc.threads == 0) ? (dbias_work / handle->desc.threads) : ((dbias_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int dbias_thr_begin = (ltid * dbias_chunksize < dbias_work) ? (ltid * dbias_chunksize) : dbias_work; +const int dbias_thr_end = ((ltid + 1) * dbias_chunksize < dbias_work) ? ((ltid + 1) * dbias_chunksize) : dbias_work; +#endif + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dbias, (libxsmm_bfloat16*) handle->grad_bias->data, handle->bk); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU +LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, __mmask32, relubitmask, (__mmask32*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/32); +#endif + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +element_output_type *grad_output_ptr = (element_output_type*)((char*)handle->scratch + handle->doutput_scratch_mark); +element_output_type *tr_doutput_ptr = (element_output_type*)grad_output_ptr + handle->desc.N * handle->desc.K; +LIBXSMM_VLA_DECL(4, const element_output_type, doutput_orig, (element_output_type*)handle->grad_output->data, nBlocksOFm, bn, bk); +#else +element_output_type *grad_output_ptr = 
(element_output_type*)handle->grad_output->data; +element_output_type *tr_doutput_ptr = (element_output_type*)handle->scratch; +#endif +LIBXSMM_VLA_DECL(4, element_output_type, doutput, grad_output_ptr, nBlocksOFm, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, doutput_tr, tr_doutput_ptr, nBlocksMB, bn_lp, bk, lpb); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +/* Apply to doutput potential fusions */ +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +if (bk % 32 == 0) { + for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; iterj += 32 ) { + __m512i cur_out_reg = _mm512_loadu_si512(&LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk)); +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + __m512 cur_out_reg_0, cur_out_reg_1; + const __m512 ones = _mm512_set1_ps(1.0f); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU + __m512i zero_reg = _mm512_setzero_si512(); + __mmask32 relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK32 (&LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, iteri, iterj/32, nBlocksOFm, handle->bn, handle->bk/32)); + cur_out_reg = _mm512_mask_blend_epi16 (relumask, zero_reg, cur_out_reg); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + cur_out_reg_0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(cur_out_reg, 0)),16)); + cur_out_reg_1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(LIBXSMM_INTRINSICS_MM512_EXTRACTI64X4_EPI64(cur_out_reg, 1)),16)); + cur_out_reg_0 = _mm512_mul_ps(cur_out_reg_0, _mm512_sub_ps(ones, cur_out_reg_0)); + cur_out_reg_1 = _mm512_mul_ps(cur_out_reg_1, _mm512_sub_ps(ones, cur_out_reg_1)); + cur_out_reg = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(cur_out_reg_1, cur_out_reg_0); +#endif + 
_mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk), cur_out_reg); + } + } + + /* If in UPD pass, also perform transpose of doutput */ + if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); + } + } +} else { + for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + float l_cur_out_f32 = 0; + libxsmm_bfloat16_hp tmp; +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU + l_cur_out = (element_output_type)((LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk) != 0) ? 
l_cur_out : (element_output_type)0); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID + tmp.i[0] = 0; + tmp.i[1] = l_cur_out; + l_cur_out_f32 = tmp.f; + l_cur_out_f32 = l_cur_out_f32*(1.0f - l_cur_out_f32); + libxsmm_rne_convert_fp32_bf16(&l_cur_out_f32, &l_cur_out, 1); +#endif + LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + + /* If in UPD pass, also perform transpose of doutput */ + if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + for (mb2 = 0; mb2 < bn; mb2++) { + for (ofm2 = 0; ofm2 < bk; ofm2++) { + LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); + } + } + } + } +} +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + performed_doutput_transpose = 1; +} +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) +/* Accumulation of bias happens in f32 */ +{ + float *scratch_dbias = (float*) ((element_output_type*)handle->scratch + handle->desc.N * (handle->desc.K + handle->desc.C) + ltid * bk * 2); + if (handle->bk % 16 == 0) { + __m512 zero_reg = _mm512_setzero_ps(); + __m512 doutput_reg = _mm512_setzero_ps(); + __m512 dbias_reg = _mm512_setzero_ps(); + for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { + for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { + _mm512_storeu_ps(scratch_dbias+iterj, zero_reg); + } + for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { + doutput_reg = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk))); + dbias_reg = LIBXSMM_INTRINSICS_MM512_LOAD_PS(scratch_dbias+iterj); + dbias_reg = 
_mm512_add_ps(dbias_reg, doutput_reg); + _mm512_storeu_ps(scratch_dbias+iterj, dbias_reg); + } + } + } + for ( iterj = 0; iterj < handle->bk; iterj += 16 ) { + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, iterj, handle->bk ), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(scratch_dbias+iterj)) ); + } + } + } else { + for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + scratch_dbias[iterj] = 0.0; + } + for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + float doutput_f32 = 0; + libxsmm_bfloat16_hp tmp; + tmp.i[0] = 0; + tmp.i[1] = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); + doutput_f32 = tmp.f; + scratch_dbias[iterj] += doutput_f32; + } + } + } + libxsmm_rne_convert_fp32_bf16(scratch_dbias, &LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, 0, handle->bk ), handle->bk); + } + } +} + +/* wait for eltwise to finish */ +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ){ + int use_2d_blocking = handle->bwd_2d_blocking; + + /* number of tasks that could be run in parallel */ + const int work = nBlocksIFm * nBlocksMB; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + + /* number of tasks for transpose that could be run in parallel */ + const int transpose_work = nBlocksIFm * nBlocksOFm; + /* compute chunk size */ + const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? 
(transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? ((ltid + 1) * transpose_chunksize) : transpose_work; + + /* loop variables */ + int ifm1 = 0, ifm2 = 0, ifm1ofm1 = 0, mb1ifm1 = 0; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, bk, lpb); + LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(5, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, bk_lp, bc, lpb); + float* temp_output = (float*)handle->scratch + (handle->desc.C * handle->desc.K)/2; + LIBXSMM_VLA_DECL(4, float, dinput_f32, (float*) temp_output, nBlocksIFm, bn, bc); + + unsigned long long blocks = nBlocksOFm; + int KB_BLOCKS = nBlocksOFm, BF = 1; + BF = handle->bwd_bf; + KB_BLOCKS = nBlocksOFm/BF; + blocks = KB_BLOCKS; + + if (use_2d_blocking == 1) { + row_teams = handle->bwd_row_teams; + column_teams = handle->bwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksIFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksIFm); + } + + if (handle->desc.K > 1) { 
+ /* transpose weight */ + if ((bk % 16 == 0) && (bc % 16 == 0)) { + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + bf16_vnni_transpose((element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, 0, 0, 0, nBlocksIFm, bc_lp, bk, lpb), (element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), bk, bc, bk, bc); + } + } else { + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + for (ofm2 = 0; ofm2 < bk; ++ofm2) { + for (ifm2 = 0; ifm2 < bc; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, ofm2/lpb, ifm2, ofm2%lpb, nBlocksOFm, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, ifm2/lpb, ofm2, ifm2%lpb, nBlocksIFm, bc_lp, bk, lpb); + } + } + } + } + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); + + if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); + } + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ofm1 == BF-1 ) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), bn*bc); + } + } + } + } + } else { + for (ifm1 = my_in_start; 
ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + } else { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); + } + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ofm1 == BF-1 ) { + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), bn*bc); + } + } + } + } else { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + } else { + /* Special case when K = 1 */ + /* number of tasks for doutput copy that could be run in parallel */ + const int copy_work_output = nBlocksMB * nBlocksOFm; + /* compute chunk size */ + const int copy_chunksize = (copy_work_output % handle->desc.threads == 0) ? 
(copy_work_output / handle->desc.threads) : ((copy_work_output / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int copy_thr_begin = (ltid * copy_chunksize < copy_work_output) ? (ltid * copy_chunksize) : copy_work_output; + const int copy_thr_end = ((ltid + 1) * copy_chunksize < copy_work_output) ? ((ltid + 1) * copy_chunksize) : copy_work_output; + LIBXSMM_VLA_DECL(5, element_filter_type, filter_tr_padded, (element_filter_type*)handle->scratch, nBlocksOFm, 1, bc, lpb); + LIBXSMM_VLA_DECL(4, element_output_type, doutput_padded, (element_output_type*)handle->scratch + handle->desc.C * 2, nBlocksOFm, bn, lpb); + + /* Copy in weights and doutput in a padded buffer */ + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + ofm2 = 0; + for (ifm2 = 0; ifm2 < bc; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, ofm1, ofm2/lpb, ifm2, ofm2%lpb, nBlocksOFm, 1, bc, lpb) = LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, ifm2/lpb, ofm2, ifm2%lpb, nBlocksIFm, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, ofm1, ofm2/lpb, ifm2, 1, nBlocksOFm, 1, bc, lpb) = (element_filter_type)0; + } + } + + for (mb1ofm1 = copy_thr_begin; mb1ofm1 < copy_thr_end; ++mb1ofm1) { + mb1 = mb1ofm1 / nBlocksOFm; + ofm1 = mb1ofm1 % nBlocksOFm; + ofm2 = 0; + for (mb2 = 0; mb2 < bn; ++mb2) { + LIBXSMM_VLA_ACCESS(4, doutput_padded, mb1, ofm1, mb2, 0, nBlocksOFm, bn, 2) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, 0, nBlocksOFm, bn, bk); + LIBXSMM_VLA_ACCESS(4, doutput_padded, mb1, ofm1, mb2, 1, nBlocksOFm, bn, 2) = (element_output_type)0; + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr_padded, ifm1, 0, 0, 0, 0, nBlocksOFm, 1, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, 
doutput_padded, mb1, 0, 0, 0, nBlocksOFm, bn, 2), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + /* number of tasks that could be run in parallel */ + const int ofm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ofm_subtasks; + const int ifm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ifm_subtasks; + const int bbk = (handle->upd_2d_blocking == 1) ? bk : bk/ofm_subtasks; + const int bbc = (handle->upd_2d_blocking == 1) ? bc : bc/ifm_subtasks; + const int work = nBlocksIFm * ifm_subtasks * nBlocksOFm * ofm_subtasks; + const int Cck_work = nBlocksIFm * ifm_subtasks * ofm_subtasks; + const int Cc_work = nBlocksIFm * ifm_subtasks; + + /* 2D blocking parameters */ + int use_2d_blocking = handle->upd_2d_blocking; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + int BF = handle->upd_bf; + + /* loop variables */ + int ifm1ofm1 = 0, ifm1 = 0, ifm2 = 0, bfn = 0, ii = 0, jj = 0, mb1ifm1 = 0, jc = 0, jk = 0; + + /* Batch reduce related variables */ + unsigned long long blocks = nBlocksMB/BF; + + LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(5, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, bc_lp, bk, lpb); + + /* Set up tensors for transposing/scratch before vnni reformatting dfilter */ + element_input_type *tr_inp_ptr = (element_input_type*) ((element_output_type*)handle->scratch + handle->desc.N * handle->desc.K); + float *dfilter_f32_ptr = (float*) ((element_input_type*)tr_inp_ptr + handle->desc.N * handle->desc.C); + element_filter_type *dfilter_scratch = (element_filter_type*) ((float*)dfilter_f32_ptr + handle->desc.C * handle->desc.K) + ltid * bc * bk; + + LIBXSMM_VLA_DECL(4, element_input_type, input_tr, (element_input_type*)tr_inp_ptr, nBlocksMB, bc, bn); + LIBXSMM_VLA_DECL(4, float, dfilter_f32, (float*)dfilter_f32_ptr, nBlocksIFm, bc, bk); + LIBXSMM_VLA_DECL(2, element_filter_type, dfilter_block, (element_filter_type*)dfilter_scratch, bk); + + const int tr_out_work = nBlocksMB * nBlocksOFm; + const int tr_out_chunksize = (tr_out_work % handle->desc.threads == 0) ? (tr_out_work / handle->desc.threads) : ((tr_out_work / handle->desc.threads) + 1); + const int tr_out_thr_begin = (ltid * tr_out_chunksize < tr_out_work) ? (ltid * tr_out_chunksize) : tr_out_work; + const int tr_out_thr_end = ((ltid + 1) * tr_out_chunksize < tr_out_work) ? ((ltid + 1) * tr_out_chunksize) : tr_out_work; + + const int tr_inp_work = nBlocksMB * nBlocksIFm; + const int tr_inp_chunksize = (tr_inp_work % handle->desc.threads == 0) ? 
(tr_inp_work / handle->desc.threads) : ((tr_inp_work / handle->desc.threads) + 1); + const int tr_inp_thr_begin = (ltid * tr_inp_chunksize < tr_inp_work) ? (ltid * tr_inp_chunksize) : tr_inp_work; + const int tr_inp_thr_end = ((ltid + 1) * tr_inp_chunksize < tr_inp_work) ? ((ltid + 1) * tr_inp_chunksize) : tr_inp_work; + + /* These are used for the vnni reformatting of the f32 output */ + __m256i c0, c1; + __m512 a01, b01; + __m512i c01 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + + if (use_2d_blocking == 1) { + row_teams = handle->upd_row_teams; + column_teams = handle->upd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksIFm, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksIFm); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksIFm); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); + } + + /* Required upfront tranposes */ + if (bc % 32 == 0) { + for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + bf16_transpose((element_input_type*)&LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, 0, 0, nBlocksMB, bc, bn), bc, bn, bc, bn); + } + } else { + for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + for (mb2 = 0; mb2 < bn; mb2++) { + for (ifm2 = 0; ifm2 < bc; ifm2++) { + LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, ifm2, mb2, nBlocksMB, bc, bn) = LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, 
mb2, ifm2, nBlocksIFm, bn, bc); + } + } + } + } + + if (performed_doutput_transpose == 0) { + if (bk % 32 == 0) { + for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); + } + } else { + for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + for (mb2 = 0; mb2 < bn; mb2++) { + for (ofm2 = 0; ofm2 < bk; ofm2++) { + LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); + } + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if (use_2d_blocking == 1) { + if (BF == 1) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { + batchreduce_kernel_upd_zerobeta(&LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, 0, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, 0, 0, 0, nBlocksMB, bc, bn), &LIBXSMM_VLA_ACCESS(2, dfilter_block, 0, 0, bk), &blocks); + /* TODO: Make this vnni reformating in the kernel... 
*/ + /* Copy result back to vnni format */ + if ((bc % 2 == 0) && (bk % 16 == 0)) { + for (jc = 0; jc < bc; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c1 = _mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dfilter_block, jc+1,jk, bk)); + c0 = _mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dfilter_block, jc, jk, bk)); + c01 = _mm512_inserti64x4(c01, c0, 0); + c01 = _mm512_inserti64x4(c01, c1, 1); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, jc/lpb, jk, 0, nBlocksIFm, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } else { + for (ii = 0; ii < bc; ii++) { + for (jj = 0; jj < bk; jj++) { + LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, ii/lpb, jj, ii%lpb, nBlocksIFm, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, dfilter_block, ii, jj, bk); + } + } + } + } + } + } else { + for (bfn = 0; bfn < BF; bfn++) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { + /* initialize current work task to zero */ + if (bfn == 0) { + for (ii = 0; iibarrier, ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..19538fd9 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_bwdupd_ncnc_kcck_generic_bf16_amx.tpl.c @@ -0,0 +1,604 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +/* size variables, all const */ +/* here we assume that input and output blocking is similar */ +const int bn = handle->bn; +const int bk = handle->bk; +const int bc = handle->bc; +int lpb = 2; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, mb2 = 0, ofm2 = 0; +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) || defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) +int iteri = 0, iterj = 0; +#endif +int performed_doutput_transpose = 0; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +/* number of tasks for transpose that could be run in parallel */ +const int eltwise_work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ +const int eltwise_chunksize = (eltwise_work % handle->desc.threads == 0) ? (eltwise_work / handle->desc.threads) : ((eltwise_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int eltwise_thr_begin = (ltid * eltwise_chunksize < eltwise_work) ? (ltid * eltwise_chunksize) : eltwise_work; +const int eltwise_thr_end = ((ltid + 1) * eltwise_chunksize < eltwise_work) ? ((ltid + 1) * eltwise_chunksize) : eltwise_work; +#endif + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +/* number of tasks for transpose that could be run in parallel */ +const int dbias_work = nBlocksOFm; +/* compute chunk size */ +const int dbias_chunksize = (dbias_work % handle->desc.threads == 0) ? (dbias_work / handle->desc.threads) : ((dbias_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int dbias_thr_begin = (ltid * dbias_chunksize < dbias_work) ? 
(ltid * dbias_chunksize) : dbias_work; +const int dbias_thr_end = ((ltid + 1) * dbias_chunksize < dbias_work) ? ((ltid + 1) * dbias_chunksize) : dbias_work; +#endif + +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_BIAS +LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dbias, (libxsmm_bfloat16*) handle->grad_bias->data, handle->bk); +#endif +#ifdef LIBXSMM_DNN_FC_BWD_FUSE_RELU +LIBXSMM_VLA_DECL(4, __mmask32, relubitmask, (__mmask32*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/32); +#endif + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) || defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +element_output_type *grad_output_ptr = (element_output_type*)((char*)handle->scratch + handle->doutput_scratch_mark); +element_output_type *tr_doutput_ptr = (element_output_type*)grad_output_ptr + handle->desc.N * handle->desc.K; +LIBXSMM_VLA_DECL(4, const element_output_type, doutput_orig, (element_output_type*)handle->grad_output->data, nBlocksOFm, bn, bk); +#else +element_output_type *grad_output_ptr = (element_output_type*)handle->grad_output->data; +element_output_type *tr_doutput_ptr = (element_output_type*)handle->scratch; +#endif +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) +libxsmm_meltw_unary_param relu_params; +libxsmm_meltwfunction_unary relu_kernel = handle->bwd_relu_kernel; +#endif +LIBXSMM_VLA_DECL(4, element_output_type, doutput, grad_output_ptr, nBlocksOFm, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, doutput_tr, tr_doutput_ptr, nBlocksMB, bn_lp, bk, lpb); + +libxsmm_meltwfunction_unary eltwise_kernel = handle->bwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); +bwd_tile_config_kernel(NULL, NULL, NULL); + +/* Apply to doutput potential fusions */ +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_RELU) +LIBXSMM_UNUSED(iteri); +LIBXSMM_UNUSED(iterj); +for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1/nBlocksOFm; + ofm1 = mb1ofm1%nBlocksOFm; + + relu_params.in.primary = 
(void*) &LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); + relu_params.out.primary = &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); + relu_params.in.secondary = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); + relu_kernel(&relu_params); + + /* If in UPD pass, also perform transpose of doutput */ + if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); + } +} + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + performed_doutput_transpose = 1; +} +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +#if defined(LIBXSMM_DNN_FC_BWD_FUSE_SIGMOID) +if (bk % 32 == 0) { + for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; iterj += 32 ) { + __m512i cur_out_reg = _mm512_loadu_si512(&LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk)); + __m512 cur_out_reg_0, cur_out_reg_1; + const __m512 ones = _mm512_set1_ps(1.0f); + cur_out_reg_0 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(cur_out_reg, 0)),16)); + cur_out_reg_1 = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(cur_out_reg, 1)),16)); + cur_out_reg_0 = _mm512_mul_ps(cur_out_reg_0, _mm512_sub_ps(ones, cur_out_reg_0)); + cur_out_reg_1 = _mm512_mul_ps(cur_out_reg_1, _mm512_sub_ps(ones, cur_out_reg_1)); + cur_out_reg = LIBXSMM_INTRINSICS_MM512_CVT2_FP32_BF16(cur_out_reg_1, cur_out_reg_0); + 
_mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk), cur_out_reg); +#ifdef USE_CLDEMOTE + _mm_cldemote(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk)); +#endif + } + } + + /* If in UPD pass, also perform transpose of doutput */ + if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); + } + } +} else { + for ( mb1ofm1 = eltwise_thr_begin; mb1ofm1 < eltwise_thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, doutput_orig, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); + float l_cur_out_f32 = 0; + libxsmm_bfloat16_hp tmp; + tmp.i[0] = 0; + tmp.i[1] = l_cur_out; + l_cur_out_f32 = tmp.f; + l_cur_out_f32 = l_cur_out_f32*(1.0f - l_cur_out_f32); + libxsmm_rne_convert_fp32_bf16(&l_cur_out_f32, &l_cur_out, 1); + LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + + /* If in UPD pass, also perform transpose of doutput */ + if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + for (mb2 = 0; mb2 < bn; mb2++) { + for (ofm2 = 0; ofm2 < bk; ofm2++) { + LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); + } + } + } + } +} +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + performed_doutput_transpose = 1; +} +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +#if 
defined(LIBXSMM_DNN_FC_BWD_FUSE_BIAS) +/* Accumulation of bias happens in f32 */ +{ + float *scratch_dbias = (float*) ((element_output_type*)handle->scratch + handle->desc.N * (handle->desc.K + handle->desc.C) + ltid * bk * 2); + if (handle->bk % 32 == 0) { + for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { + for ( iterj = 0; iterj < handle->bk; iterj += 32 ) { + __m512 doutput_reg_0, doutput_reg_1, dbias_reg_0, dbias_reg_1; + dbias_reg_0 = _mm512_setzero_ps(); + dbias_reg_1 = _mm512_setzero_ps(); + for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + doutput_reg_0 = _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk)); + doutput_reg_1 = _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj+16, nBlocksOFm, handle->bn, handle->bk)); + dbias_reg_0 = _mm512_add_ps(dbias_reg_0, doutput_reg_0); + dbias_reg_1 = _mm512_add_ps(dbias_reg_1, doutput_reg_1); + } + } + _mm512_store_si512(&LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, iterj, handle->bk), LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(dbias_reg_1, dbias_reg_0)); + } + } + } else { + for ( ofm1 = dbias_thr_begin; ofm1 < dbias_thr_end; ++ofm1 ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + scratch_dbias[iterj] = 0.0; + } + for ( mb1 = 0; mb1 < nBlocksMB; ++mb1 ) { + for ( iteri = 0; iteri < handle->bn; ++iteri ) { + for ( iterj = 0; iterj < handle->bk; ++iterj ) { + float doutput_f32 = 0; + libxsmm_bfloat16_hp tmp; + tmp.i[0] = 0; + tmp.i[1] = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, iteri, iterj, nBlocksOFm, handle->bn, handle->bk); + doutput_f32 = tmp.f; + scratch_dbias[iterj] += doutput_f32; + } + } + } + libxsmm_rne_convert_fp32_bf16(scratch_dbias, &LIBXSMM_VLA_ACCESS( 2, dbias, ofm1, 0, handle->bk ), handle->bk); + } + } +} + +/* wait for eltwise to finish */ +libxsmm_barrier_wait(handle->barrier, ltid); +#endif + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) || (kind == 
LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ){ + int use_2d_blocking = handle->bwd_2d_blocking; + + /* number of tasks that could be run in parallel */ + const int work = nBlocksIFm * nBlocksMB; + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + + /* number of tasks for transpose that could be run in parallel */ + const int transpose_work = nBlocksIFm * nBlocksOFm; + /* compute chunk size */ + const int transpose_chunksize = (transpose_work % handle->desc.threads == 0) ? (transpose_work / handle->desc.threads) : ((transpose_work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int transpose_thr_begin = (ltid * transpose_chunksize < transpose_work) ? (ltid * transpose_chunksize) : transpose_work; + const int transpose_thr_end = ((ltid + 1) * transpose_chunksize < transpose_work) ? 
((ltid + 1) * transpose_chunksize) : transpose_work; + + /* loop variables */ + int ifm1 = 0, ifm2 = 0, ifm1ofm1 = 0, mb1ifm1 = 0; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, bk, lpb); + LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(5, element_filter_type, filter_tr, (element_filter_type*)handle->scratch, nBlocksOFm, bk_lp, bc, lpb); + float* temp_output = (float*)handle->scratch + (handle->desc.C * handle->desc.K)/2; + LIBXSMM_VLA_DECL(4, float, dinput_f32, (float*) temp_output, nBlocksIFm, bn, bc); + + unsigned long long blocks = nBlocksOFm; + int KB_BLOCKS = nBlocksOFm, BF = 1; + BF = handle->bwd_bf; + KB_BLOCKS = nBlocksOFm/BF; + blocks = KB_BLOCKS; + + if (use_2d_blocking == 1) { + row_teams = handle->bwd_row_teams; + column_teams = handle->bwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = (nBlocksMB + row_teams-1)/row_teams; + in_tasks_per_thread = (nBlocksIFm + column_teams-1)/column_teams; + my_im_start = LIBXSMM_MIN( my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN( (my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN( my_col_id * in_tasks_per_thread, nBlocksIFm); + my_in_end = LIBXSMM_MIN( (my_col_id+1) * in_tasks_per_thread, nBlocksIFm); + } + + /* transpose weight */ + if ((bk % 16 == 0) && (bc % 16 == 0)) { + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + bf16_vnni_transpose((element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, 0, 0, 0, nBlocksIFm, bc_lp, bk, lpb), 
(element_filter_type*)&LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), bk, bc, bk, bc); + } + } else { + for (ifm1ofm1 = transpose_thr_begin; ifm1ofm1 < transpose_thr_end; ++ifm1ofm1) { + ofm1 = ifm1ofm1 / nBlocksIFm; + ifm1 = ifm1ofm1 % nBlocksIFm; + for (ofm2 = 0; ofm2 < bk; ++ofm2) { + for (ifm2 = 0; ifm2 < bc; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1, ofm2/lpb, ifm2, ofm2%lpb, nBlocksOFm, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1, ifm2/lpb, ofm2, ifm2%lpb, nBlocksIFm, bc_lp, bk, lpb); + } + } + } + } + + /* wait for transpose to finish */ + libxsmm_barrier_wait(handle->barrier, ltid); + + if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); + } +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(float)); + if ( ofm1 == BF-1 ) { + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(libxsmm_bfloat16)); + } +#endif + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ofm1 == BF-1 ) { + eltwise_params.in.primary = &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc); + eltwise_params.out.primary = &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc); + eltwise_kernel(&eltwise_params); 
+ } + } + } + } + } else { + for (ifm1 = my_in_start; ifm1 < my_in_end; ++ifm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(libxsmm_bfloat16)); +#endif + bf16_batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + } else { + if (BF > 1) { + for ( ofm1 = 0; ofm1 < BF; ++ofm1 ) { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; ++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + /* Initialize intermediate f32 tensor */ + if ( ofm1 == 0 ) { + memset(&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), 0, bn*bc*sizeof(float)); + } +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(float)); + if ( ofm1 == BF-1 ) { + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(libxsmm_bfloat16)); + } +#endif + batchreduce_kernel_bwd( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, ofm1*KB_BLOCKS, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1*KB_BLOCKS, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ofm1 == BF-1 ) { + eltwise_params.in.primary = &LIBXSMM_VLA_ACCESS(4, dinput_f32, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc); + eltwise_params.out.primary = &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc); + eltwise_kernel(&eltwise_params); + } + } + } + } else { + for ( mb1ifm1 = thr_begin; mb1ifm1 < thr_end; 
++mb1ifm1 ) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), handle->bn*handle->bc*sizeof(libxsmm_bfloat16)); +#endif + bf16_batchreduce_kernel_bwd_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter_tr, ifm1, 0, 0, 0, 0, nBlocksOFm, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, doutput, mb1, 0, 0, 0, nBlocksOFm, bn, bk), + &LIBXSMM_VLA_ACCESS(4, dinput, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &blocks); + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) || (kind == LIBXSMM_DNN_COMPUTE_KIND_BWDUPD) ) { + /* number of tasks that could be run in parallel */ + const int ofm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ofm_subtasks; + const int ifm_subtasks = (handle->upd_2d_blocking == 1) ? 1 : handle->ifm_subtasks; + const int bbk = (handle->upd_2d_blocking == 1) ? bk : bk/ofm_subtasks; + const int bbc = (handle->upd_2d_blocking == 1) ? bc : bc/ifm_subtasks; + const int work = nBlocksIFm * ifm_subtasks * nBlocksOFm * ofm_subtasks; + const int Cck_work = nBlocksIFm * ifm_subtasks * ofm_subtasks; + const int Cc_work = nBlocksIFm * ifm_subtasks; + + /* 2D blocking parameters */ + int use_2d_blocking = handle->upd_2d_blocking; + int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; + + /* compute chunk size */ + const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); + /* compute thr_begin and thr_end */ + const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; + const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + int BF = handle->upd_bf; + + /* loop variables */ + int ifm1ofm1 = 0, ifm1 = 0, ifm2 = 0, bfn = 0, ii = 0, jj = 0, mb1ifm1 = 0, jc = 0, jk = 0; + + /* Batch reduce related variables */ + unsigned long long blocks = nBlocksMB/BF; + + LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, bn, bc); + LIBXSMM_VLA_DECL(5, element_filter_type, dfilter, (element_filter_type*)handle->grad_filter->data, nBlocksIFm, bc_lp, bk, lpb); + + /* Set up tensors for transposing/scratch before vnni reformatting dfilter */ + element_input_type *tr_inp_ptr = (element_input_type*) ((element_output_type*)handle->scratch + handle->desc.N * handle->desc.K); + float *dfilter_f32_ptr = (float*) ((element_input_type*)tr_inp_ptr + handle->desc.N * handle->desc.C); + element_filter_type *dfilter_scratch = (element_filter_type*) ((float*)dfilter_f32_ptr + handle->desc.C * handle->desc.K) + ltid * bc * bk; + + LIBXSMM_VLA_DECL(4, element_input_type, input_tr, (element_input_type*)tr_inp_ptr, nBlocksMB, bc, bn); + LIBXSMM_VLA_DECL(4, float, dfilter_f32, (float*)dfilter_f32_ptr, nBlocksIFm, bc, bk); + LIBXSMM_VLA_DECL(2, element_filter_type, dfilter_block, (element_filter_type*)dfilter_scratch, bk); + + const int tr_out_work = nBlocksMB * nBlocksOFm; + const int tr_out_chunksize = (tr_out_work % handle->desc.threads == 0) ? (tr_out_work / handle->desc.threads) : ((tr_out_work / handle->desc.threads) + 1); + const int tr_out_thr_begin = (ltid * tr_out_chunksize < tr_out_work) ? (ltid * tr_out_chunksize) : tr_out_work; + const int tr_out_thr_end = ((ltid + 1) * tr_out_chunksize < tr_out_work) ? ((ltid + 1) * tr_out_chunksize) : tr_out_work; + + const int tr_inp_work = nBlocksMB * nBlocksIFm; + const int tr_inp_chunksize = (tr_inp_work % handle->desc.threads == 0) ? 
(tr_inp_work / handle->desc.threads) : ((tr_inp_work / handle->desc.threads) + 1); + const int tr_inp_thr_begin = (ltid * tr_inp_chunksize < tr_inp_work) ? (ltid * tr_inp_chunksize) : tr_inp_work; + const int tr_inp_thr_end = ((ltid + 1) * tr_inp_chunksize < tr_inp_work) ? ((ltid + 1) * tr_inp_chunksize) : tr_inp_work; + + /* These are used for the vnni reformatting of the f32 output */ + __m512 a01, b01; + __m512i c01 = LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(); + const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + + if (use_2d_blocking == 1) { + row_teams = handle->upd_row_teams; + column_teams = handle->upd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = (nBlocksIFm + row_teams-1)/row_teams; + in_tasks_per_thread = (nBlocksOFm + column_teams-1)/column_teams; + my_im_start = LIBXSMM_MIN( my_row_id * im_tasks_per_thread, nBlocksIFm); + my_im_end = LIBXSMM_MIN( (my_row_id+1) * im_tasks_per_thread, nBlocksIFm); + my_in_start = LIBXSMM_MIN( my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN( (my_col_id+1) * in_tasks_per_thread, nBlocksOFm); + } + + /* Required upfront tranposes */ + if (bc % 32 == 0) { + for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + bf16_transpose((element_input_type*)&LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, 0, 0, nBlocksIFm, bn, bc), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, 0, 0, nBlocksMB, bc, bn), bc, bn, bc, bn); + } + } else { + for (mb1ifm1 = tr_inp_thr_begin; mb1ifm1 < tr_inp_thr_end; mb1ifm1++) { + mb1 = mb1ifm1%nBlocksMB; + ifm1 = mb1ifm1/nBlocksMB; + for (mb2 = 0; mb2 < bn; mb2++) { + for (ifm2 = 0; ifm2 < bc; ifm2++) { + LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, mb1, ifm2, mb2, nBlocksMB, bc, bn) = LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1, mb2, ifm2, 
nBlocksIFm, bn, bc); + } + } + } + } + + if (performed_doutput_transpose == 0) { + if (bk % 32 == 0) { + for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + bf16_vnni_reformat((element_output_type*)&LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, 0, 0, 0, nBlocksMB, bn_lp, bk, lpb), bk, bn, bk, bn); + } + } else { + for (mb1ofm1 = tr_out_thr_begin; mb1ofm1 < tr_out_thr_end; mb1ofm1++) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + for (mb2 = 0; mb2 < bn; mb2++) { + for (ofm2 = 0; ofm2 < bk; ofm2++) { + LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, mb1, mb2/lpb, ofm2, mb2%lpb, nBlocksMB, bn_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(4, doutput, mb1, ofm1, mb2, ofm2, nBlocksOFm, bn, bk); + } + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if (use_2d_blocking == 1) { + ifm2 = 0; + ofm2 = 0; + if (BF == 1) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, 0, 0, 0, nBlocksIFm, bc_lp, bk, lpb), bbc*bbk*sizeof(libxsmm_bfloat16)); +#endif + bf16_batchreduce_kernel_upd_zerobeta(&LIBXSMM_VLA_ACCESS(5, doutput_tr, ofm1, 0, 0, ofm2*bbk, 0, nBlocksMB, bn_lp, bk, lpb), &LIBXSMM_VLA_ACCESS(4, input_tr, ifm1, 0, ifm2*bbc, 0, nBlocksMB, bc, bn), &LIBXSMM_VLA_ACCESS(5, dfilter, ofm1, ifm1, 0, 0, 0, nBlocksIFm, bc_lp, bk, lpb), &blocks); + } + } + } else { + for (bfn = 0; bfn < BF; bfn++) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (ifm1 = my_im_start; ifm1 < my_im_end; ++ifm1) { + /* initialize current work task to zero */ + if (bfn == 0) { + for (ii = 0; iibarrier, ltid); +} + +handle->tilerelease_kernel(NULL, NULL, NULL); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c 
b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c new file mode 100644 index 00000000..69cedb30 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_custom_generic.tpl.c @@ -0,0 +1,102 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + +/* size variables, all const: feature-map block counts and block sizes are read from the handle */ +/* here we assume that input and output blocking is similar */ +const int nBlocksIFm = handle->blocksifm; +const int nIFmBlock = handle->ifmblock; +const int nBlocksOFm = handle->blocksofm; +const int nOFmBlock = handle->ofmblock; + +/* logical thread id: tid rebased against start_thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one task per nBlocksOFm entry */ +const int work = nBlocksOFm; +/* compute chunk size: work / threads, rounded up when it does not divide evenly */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's task range [thr_begin, thr_end), both ends clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int ofm1 = 0; + +LIBXSMM_VLA_DECL(3, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, nOFmBlock); +#if defined(LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32) +float* input_f32_ptr = (float*)handle->scratch; /* f32 buffer for the converted input starts at the beginning of scratch */ +float* filter_f32_ptr = ((float*)handle->scratch)+((size_t)handle->desc.N*(size_t)handle->desc.C); /* f32 buffer for the converted filter starts N*C floats into scratch */ +LIBXSMM_VLA_DECL(3, const float, input, input_f32_ptr, nBlocksIFm, nIFmBlock); +LIBXSMM_VLA_DECL(4, const float, filter, filter_f32_ptr, nBlocksIFm, nIFmBlock, nOFmBlock); + +/* number of input elements to convert (N * C); this count is split across threads */ +const int work_input = handle->desc.N * handle->desc.C; +/* compute chunk size: work_input / threads, rounded up */ +const int chunksize_input = (work_input % handle->desc.threads == 0) ? (work_input / handle->desc.threads) : ((work_input / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: element range of the input this thread converts, clamped to work_input */ +const int thr_begin_input = (ltid * chunksize_input < work_input) ? (ltid * chunksize_input) : work_input; +const int thr_end_input = ((ltid + 1) * chunksize_input < work_input) ? ((ltid + 1) * chunksize_input) : work_input; + +/* number of filter elements to convert (C * K); this count is split across threads */ +const int work_filter = handle->desc.C * handle->desc.K; +/* compute chunk size: work_filter / threads, rounded up */ +const int chunksize_filter = (work_filter % handle->desc.threads == 0) ? (work_filter / handle->desc.threads) : ((work_filter / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: element range of the filter this thread converts, clamped to work_filter */ +const int thr_begin_filter = (ltid * chunksize_filter < work_filter) ? (ltid * chunksize_filter) : work_filter; +const int thr_end_filter = ((ltid + 1) * chunksize_filter < work_filter) ? 
((ltid + 1) * chunksize_filter) : work_filter; +#else +LIBXSMM_VLA_DECL(3, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, nIFmBlock); +LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, nIFmBlock, nOFmBlock); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +#if defined(LIBXSMM_DNN_FULLYCONNECTED_FWD_BF16_F32) /* each thread converts only its own element range of input and filter into the f32 scratch buffers */ +libxsmm_convert_bf16_f32( ((element_input_type*)handle->reg_input->data)+thr_begin_input, input_f32_ptr+thr_begin_input, thr_end_input - thr_begin_input ); +libxsmm_convert_bf16_f32( ((element_filter_type*)handle->reg_filter->data)+thr_begin_filter, filter_f32_ptr+thr_begin_filter, thr_end_filter - thr_begin_filter ); + +libxsmm_barrier_wait(handle->barrier, ltid); /* wait for all threads' conversions; the loop below is handed the f32 input buffer from its start */ +#endif + +for ( ofm1 = thr_begin; ofm1 < thr_end; ++ofm1 ) { /* outer GEMM m-loop */ +#if 1 /* the kernel call is the compiled path; the #else loops below are compiled out */ + gemm_kernel( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, nIFmBlock, nOFmBlock), + &LIBXSMM_VLA_ACCESS(3, input, 0, 0, 0, nBlocksIFm, nIFmBlock), + &LIBXSMM_VLA_ACCESS(3, output, 0, ofm1, 0, nBlocksOFm, nOFmBlock) ); +#else + { + const int nImg = handle->desc.N; + int img2, ifm1, ifm2, ofm2; + + /* this is a simple replacement code using regular loops: zero the output block for ofm1, then accumulate filter * input into it */ + for ( img2 = 0; img2 < nImg; ++img2 ) { + LIBXSMM_PRAGMA_SIMD + for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(3, output, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) = (element_output_type)0; + } + } + for ( ifm1 = 0; ifm1 < nBlocksIFm; ++ifm1 ) { /* outer GEMM k-loop */ + for ( ifm2 = 0; ifm2 < nIFmBlock; ++ifm2 ) { /* GEMM K-loop */ + for ( img2 = 0; img2 < nImg; ++img2 ) { /* GEMM n-loop */ + LIBXSMM_PRAGMA_SIMD + for ( ofm2 = 0; ofm2 < nOFmBlock; ++ofm2 ) { /* GEMM m-loop */ + LIBXSMM_VLA_ACCESS(3, output, img2, ofm1, ofm2, nBlocksOFm, nOFmBlock) += + LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1, ifm2, ofm2, nBlocksIFm, nIFmBlock, nOFmBlock) * LIBXSMM_VLA_ACCESS(3, input, img2, ifm1, 
ifm2, nBlocksIFm, nIFmBlock); + } + } + } + } + } +#endif +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c new file mode 100644 index 00000000..e0f854b3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic.tpl.c @@ -0,0 +1,235 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ +/* size variables, all const */ +/* here we assume that input and output blocking is similar */ +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +int use_2d_blocking = handle->fwd_2d_blocking; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, ifm1 = 0; +int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; +int mb2 = 0, ofm2 = 0; + +LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); +LIBXSMM_VLA_DECL(4, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, handle->bc, handle->bk); +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS +LIBXSMM_VLA_DECL(2, const element_output_type, bias, (element_output_type*)handle->reg_bias->data, handle->bk); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU +LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*) handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); +#endif +#endif + +unsigned long long blocks = nBlocksIFm; +int CB_BLOCKS = nBlocksIFm, BF = 1; + +BF = handle->fwd_bf; +CB_BLOCKS = nBlocksIFm/BF; +blocks = CB_BLOCKS; + +if (use_2d_blocking == 1) { + row_teams = handle->fwd_row_teams; + column_teams = handle->fwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = LIBXSMM_UPDIV(nBlocksMB, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ifm1 = 0; 
ifm1 < BF; ++ifm1 ) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } +#else + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (element_output_type)0; + } + } +#endif + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if ( ifm1 == BF-1 ) { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + float l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); + l_cur_out = (l_cur_out > (element_output_type)0) ? 
l_cur_out : (element_output_type)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } +#endif + } + } + } + } else { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#else + batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); + l_cur_out = ( l_cur_out > (element_output_type)0 ) ? 
l_cur_out : (element_output_type)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } +#endif + } + } + } +} else { + if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + /* Initialize intermediate f32 tensor */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } +#else + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (element_output_type)0; + } + } +#endif + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if ( ifm1 == BF-1 ) { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + float l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 
1 : 0); + l_cur_out = (l_cur_out > (element_output_type)0) ? l_cur_out : (element_output_type)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } +#endif + } + } + } else { + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#else + batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(4, filter, ofm1, 0, 0, 0, nBlocksIFm, handle->bc, handle->bk), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + element_output_type l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > (element_output_type)0 ) ? 1 : 0); + l_cur_out = ( l_cur_out > (element_output_type)0 ) ? 
l_cur_out : (element_output_type)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } +#endif + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c new file mode 100644 index 00000000..bb3a22da --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16.tpl.c @@ -0,0 +1,379 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +/* size variables, all const */ +/* here we assume that input and output blocking is similar */ +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +int lpb = 2; +const int bc_lp = handle->bc/lpb; +/* const int bc = handle->bc;*/ +int use_2d_blocking = handle->fwd_2d_blocking; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, ifm1 = 0; +int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +int mb2 = 0, ofm2 = 0; +#endif +LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); +LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, handle->bk, lpb); +float* temp_output = (float*)handle->scratch; +LIBXSMM_VLA_DECL(4, float, output_f32, (float*) temp_output, nBlocksOFm,handle->bn,handle->bk); +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS +LIBXSMM_VLA_DECL(2, const element_input_type, bias, (element_input_type*) handle->reg_bias->data, handle->bk); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU +LIBXSMM_VLA_DECL(4, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, __mmask16, relubitmask, (__mmask16*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/16); +#endif +#endif +unsigned long long blocks = nBlocksIFm; +int CB_BLOCKS = nBlocksIFm, BF = 1; + +BF = handle->fwd_bf; +CB_BLOCKS = nBlocksIFm/BF; +blocks = CB_BLOCKS; + +if (use_2d_blocking == 1) { + row_teams = handle->fwd_row_teams; + column_teams = handle->fwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread 
= LIBXSMM_UPDIV(nBlocksMB, row_teams); + in_tasks_per_thread = LIBXSMM_UPDIV(nBlocksOFm, column_teams); + my_im_start = LIBXSMM_MIN(my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN((my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN(my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN((my_col_id+1) * in_tasks_per_thread, nBlocksOFm); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm,handle->bn,handle->bk), handle->bk ); + } +#else + memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); +#endif + } + batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ifm1 == BF-1 ) { +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if (handle->bk % 32 == 0) { + __m512 cur_out_0 = _mm512_setzero_ps(); + __m512 cur_out_1 = _mm512_setzero_ps(); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + __mmask16 relumask0; + __mmask16 relumask1; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + __m512 ones = _mm512_set1_ps(1.0); + __m512 halves = _mm512_set1_ps(0.5); +#endif + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for (
ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { + cur_out_0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk)); + cur_out_1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk)); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); + relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); + cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); + cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_0, halves)), ones), halves); + cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_1, halves)), ones), halves); +#endif + _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), cur_out_0); + _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk), cur_out_1); + } + } + } else { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + float l_cur_out = LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( 
l_cur_out > (float)0 ) ? 1 : 0); + l_cur_out = (l_cur_out > (float)0) ? l_cur_out : (float)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } +#endif + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm,handle->bn,handle->bk), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm,handle->bn,handle->bk),handle->bn*handle->bk); + } + } + } + } + } else { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#else + batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if (handle->bk % 32 == 0) { + __m512 cur_out_0 = _mm512_setzero_ps(); + __m512 cur_out_1 = _mm512_setzero_ps(); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + __mmask16 relumask0; + __mmask16 relumask1; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + __m512 ones = _mm512_set1_ps(1.0); + __m512 halves = 
_mm512_set1_ps(0.5); +#endif + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { + cur_out_0 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk))); + cur_out_1 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk))); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); + relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); + cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); + cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_0, halves)), ones), halves); + cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_1, halves)), ones), halves); +#endif + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( cur_out_1, cur_out_0 )); + } + } + } else { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + libxsmm_bfloat16_hp t; +#endif + libxsmm_bfloat16 l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef 
LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( (l_cur_out & 0x8000) > 0 ) ? 0 : 1); + l_cur_out = (libxsmm_bfloat16)(( (l_cur_out & 0x8000) > 0 ) ? 0 : l_cur_out); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we are using Pade 7/8 approximation */ + t.i[1] = l_cur_out; + t.i[0] = 0; + t.f = (libxsmm_stanh_pade78( t.f / 2.0f ) + 1.0f) / 2.0f; + l_cur_out = t.i[1]; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } +#endif + } + } + } +} else { + if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + /* Initialize intermediate f32 tensor */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm, handle->bn, handle->bk), handle->bk ); + } +#else + memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); +#endif + } + batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ifm1 == BF-1 ) { +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if (handle->bk % 32 == 0) { + __m512 cur_out_0 = _mm512_setzero_ps(); + __m512 cur_out_1 = _mm512_setzero_ps(); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + __mmask16 relumask0; + __mmask16 relumask1; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + __m512 ones =
_mm512_set1_ps(1.0); + __m512 halves = _mm512_set1_ps(0.5); +#endif + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { + cur_out_0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk)); + cur_out_1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk)); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); + relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); + cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); + cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_0, halves)), ones), halves); + cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_1, halves)), ones), halves); +#endif + _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), cur_out_0); + _mm512_storeu_ps(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk), cur_out_1); + } + } + } else { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + float l_cur_out = LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef 
LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( l_cur_out > 0.0 ) ? 1 : 0); + l_cur_out = (l_cur_out > (float)0) ? l_cur_out : (float)0; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we are using Pade 7/8 approximation */ + l_cur_out = (libxsmm_stanh_pade78( l_cur_out / 2.0f ) + 1.0f) / 2.0f; +#endif + LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } +#endif + LIBXSMM_DNN_CONVERT_BUFFER_F32_BF16(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk); + } + } + } + } else { + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = LIBXSMM_VLA_ACCESS(2, bias, ofm1, ofm2, handle->bk); + } + } + batchreduce_kernel_beta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#else + batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE + if (handle->bk % 32 == 0) { + __m512 cur_out_0 = _mm512_setzero_ps(); + __m512 cur_out_1 = _mm512_setzero_ps(); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + __mmask16 relumask0; + __mmask16
relumask1; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + __m512 ones = _mm512_set1_ps(1.0); + __m512 halves = _mm512_set1_ps(0.5); +#endif + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ofm2 += 32 ) { + cur_out_0 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk))); + cur_out_1 = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2+16, nBlocksOFm, handle->bn, handle->bk))); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + relumask0 = _mm512_cmp_ps_mask( cur_out_0, _mm512_setzero_ps(), _CMP_GT_OQ ); + relumask1 = _mm512_cmp_ps_mask( cur_out_1, _mm512_setzero_ps(), _CMP_GT_OQ ); + cur_out_0 = _mm512_mask_blend_ps( relumask0, _mm512_setzero_ps(), cur_out_0 ); + cur_out_1 = _mm512_mask_blend_ps( relumask1, _mm512_setzero_ps(), cur_out_1 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16, nBlocksOFm, handle->bn, handle->bk/16), relumask0 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, mb2, ofm2/16+1, nBlocksOFm, handle->bn, handle->bk/16), relumask1 ); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + cur_out_0 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_0, halves)), ones), halves); + cur_out_1 = _mm512_mul_ps(_mm512_add_ps(LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(_mm512_mul_ps(cur_out_1, halves)), ones), halves); +#endif + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk), LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( cur_out_1, cur_out_0 )); + } + } + } else { + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + for ( ofm2 = 0; ofm2 < handle->bk; ++ofm2 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + libxsmm_bfloat16_hp t; +#endif + 
libxsmm_bfloat16 l_cur_out = LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk); +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + LIBXSMM_VLA_ACCESS(4, relumask, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = (unsigned char)(( (l_cur_out & 0x8000) > 0 ) ? 0 : 1); + l_cur_out = (libxsmm_bfloat16)(( (l_cur_out & 0x8000) > 0 ) ? 0 : l_cur_out); +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID + /* we ar using Pade 7/8 approximation */ + t.i[1] = l_cur_out; + t.i[0] = 0; + t.f = (libxsmm_stanh_pade78( t.f / 2.0f ) + 1.0f) / 2.0f; + l_cur_out = t.i[1]; +#endif + LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, mb2, ofm2, nBlocksOFm, handle->bn, handle->bk) = l_cur_out; + } + } + } + +#endif + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..a8fb8f8a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_amx.tpl.c @@ -0,0 +1,223 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +/* size variables, all const */ +/* here we assume that input and output blocking is similar */ +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +const int bn = handle->bn; +const int bk = handle->bk; +const int lpb = 2; +const int bc_lp = handle->bc/lpb; +/* const int bc = handle->bc;*/ +int use_2d_blocking = handle->fwd_2d_blocking; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nBlocksOFm * nBlocksMB; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int mb1ofm1 = 0, mb1 = 0, ofm1 = 0, ifm1 = 0; +int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; +LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); +LIBXSMM_VLA_DECL(5, const element_filter_type, filter, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, handle->bk, lpb); +float* temp_output = (float*)handle->scratch; +LIBXSMM_VLA_DECL(4, float, output_f32, (float*) temp_output, nBlocksOFm, bn, bk); + +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +libxsmm_meltw_gemm_param gemm_eltwise_params; +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_BIAS) +int mb2 = 0; +float* fp32_bias_scratch = (float*)handle->scratch + ltid * handle->desc.K; +LIBXSMM_VLA_DECL(2, const element_input_type, bias, (element_input_type*) handle->reg_bias->data, handle->bk); +#endif +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_RELU) +LIBXSMM_VLA_DECL(4, __mmask32, relubitmask, (__mmask32*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/32); +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_relu_kernel; +libxsmm_meltw_unary_param eltwise_params; +#elif defined(LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID) +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_sigmoid_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#else +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#endif +#else +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#endif + +unsigned long long blocks = nBlocksIFm; +int CB_BLOCKS = nBlocksIFm, BF = 1; + +BF = 
handle->fwd_bf; +CB_BLOCKS = nBlocksIFm/BF; +blocks = CB_BLOCKS; + +if (use_2d_blocking == 1) { + row_teams = handle->fwd_row_teams; + column_teams = handle->fwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = (nBlocksMB + row_teams-1)/row_teams; + in_tasks_per_thread = (nBlocksOFm + column_teams-1)/column_teams; + my_im_start = LIBXSMM_MIN( my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN( (my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN( my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN( (my_col_id+1) * in_tasks_per_thread, nBlocksOFm); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +tile_config_kernel(NULL, NULL, NULL); + +if (use_2d_blocking == 1) { + if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm,handle->bn,handle->bk), handle->bk ); + } +#else + memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); +#endif + } + +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(float)); + if ( ifm1 == BF-1 ) { + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); + } +#endif + batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), +
&LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + + /* downconvert intermediate f32 tensor to bf 16 and store to final C */ + if ( ifm1 == BF-1 ) { + eltwise_params.in.primary = &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); + eltwise_params.out.primary = &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_RELU) + eltwise_params.out.secondary = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + eltwise_kernel(&eltwise_params); + } + } + } + } + } else { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, 0, 0,handle->bk), fp32_bias_scratch, handle->desc.K ); +#endif + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + gemm_eltwise_params.bias_ptr = (float*) fp32_bias_scratch + ofm1 * handle->bk; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + gemm_eltwise_params.out_ptr = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + bf16_batchreduce_kernel_zerobeta_fused_eltwise( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks, &gemm_eltwise_params); +#else + bf16_batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, 
bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks); +#endif + } + } + } +} else { + if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + /* 1D decomposition: the flat task index mb1ofm1 is split into mb1 (fast index) and ofm1 */ mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; + /* Initialize intermediate f32 tensor on the first reduction step (ifm1 == 0): with fused bias each of the handle->bn rows of the tile receives bias block ofm1 (handle->bk values, passed through LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32); otherwise the tile is zeroed */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm,handle->bn,handle->bk), handle->bk ); + } +#else + memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); +#endif + } +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(float)); + if ( ifm1 == BF-1 ) { + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); + } +#endif + batchreduce_kernel( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + + /* after the last reduction step (ifm1 == BF-1): run eltwise_kernel with the f32 tile as input and the output tile as destination (downconvert to bf16) */ + if ( ifm1 == BF-1 ) { + eltwise_params.in.primary = &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); + eltwise_params.out.primary = &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_RELU) + eltwise_params.out.secondary = 
&LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + eltwise_kernel(&eltwise_params); + } + } + } + } else { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, 0, 0,handle->bk), fp32_bias_scratch, handle->desc.K ); +#endif + for ( mb1ofm1 = thr_begin; mb1ofm1 < thr_end; ++mb1ofm1 ) { + mb1 = mb1ofm1%nBlocksMB; + ofm1 = mb1ofm1/nBlocksMB; +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + gemm_eltwise_params.bias_ptr = (float*) fp32_bias_scratch + ofm1 * handle->bk; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + gemm_eltwise_params.out_ptr = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + bf16_batchreduce_kernel_zerobeta_fused_eltwise( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks, &gemm_eltwise_params); +#else + bf16_batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(5, filter, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks); +#endif + } + } +} + +handle->tilerelease_kernel(NULL, NULL, NULL); +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c new file mode 100644 index 00000000..57f2712c --- /dev/null 
+++ b/third_party/libxsmm/src/template/libxsmm_dnn_fullyconnected_st_fwd_ncnc_kcck_generic_bf16_sparse_A_amx.tpl.c @@ -0,0 +1,177 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke (Intel Corp.) +******************************************************************************/ +/* size variables, all const: block counts are the descriptor sizes C, K, N divided by the block sizes bc, bk, bn (integer division) */ +/* here we assume that input and output blocking is similar */ +const int nBlocksIFm = handle->desc.C / handle->bc; +const int nBlocksOFm = handle->desc.K / handle->bk; +const int nBlocksMB = handle->desc.N / handle->bn; +const int bn = handle->bn; +const int bk = handle->bk; +/* lpb is the innermost dimension of the filter views; bc is split into bc_lp = bc/lpb groups */ const int lpb = 2; +const int bc_lp = handle->bc/lpb; +/* const int bc = handle->bc;*/ +int use_2d_blocking = handle->fwd_2d_blocking; + +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; + +/* loop variables; the my_*_start/end ranges keep their 0 initializers unless 2D blocking assigns them */ +int mb1 = 0, ofm1 = 0, ifm1 = 0; +int im_tasks_per_thread = 0, in_tasks_per_thread = 0, my_in_start = 0, my_in_end = 0, my_im_start = 0, my_im_end = 0, my_row_id = 0, my_col_id = 0, row_teams = 0, column_teams = 0; +LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksOFm, handle->bn, handle->bk); +LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksIFm, handle->bn, handle->bc); +/* compressed filter view: the bk dimension is divided by sparsity_factor_A; the bitmap view declared next starts (C*K)/sparsity_factor_A filter elements into the same reg_filter buffer */ +LIBXSMM_VLA_DECL(5, const element_filter_type, filter_compressed, (element_filter_type*)handle->reg_filter->data, nBlocksIFm, bc_lp, handle->bk/handle->sparsity_factor_A, lpb); +LIBXSMM_VLA_DECL(5, __mmask32, idx_filter_compressed, (__mmask32*) 
((element_filter_type*)handle->reg_filter->data + (handle->desc.C*handle->desc.K)/handle->sparsity_factor_A), nBlocksIFm, bc_lp, handle->bk/32, lpb); +/* per-thread decompression buffer: starts ltid * bk * C filter elements into scratch */ LIBXSMM_VLA_DECL(4, element_filter_type, decompressed_filter, (element_filter_type*)handle->scratch + ltid * handle->bk * handle->desc.C, bc_lp, handle->bk, lpb); +/* f32 accumulator starts (threads * C * bk)/2 floats into scratch; NOTE(review): the /2 presumably converts a count of 2-byte filter elements (the decompression buffers above) into floats - confirm element_filter_type size */ +float* temp_output = (float*)handle->scratch + (handle->desc.threads * handle->desc.C * handle->bk)/2; +LIBXSMM_VLA_DECL(4, float, output_f32, (float*) temp_output, nBlocksOFm, bn, bk); +libxsmm_meltw_gemm_param gemm_eltwise_params; + +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_BIAS) +int mb2 = 0; +/* NOTE(review): same scratch base as temp_output plus ltid * K floats; output_f32 is only touched in the BF > 1 branch and fp32_bias_scratch only in the other branch */ float* fp32_bias_scratch = (float*)handle->scratch + (handle->desc.threads * handle->desc.C * handle->bk)/2 + ltid * handle->desc.K; +LIBXSMM_VLA_DECL(2, const element_input_type, bias, (element_input_type*) handle->reg_bias->data, handle->bk); +#endif +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_RELU) +LIBXSMM_VLA_DECL(4, __mmask32, relubitmask, (__mmask32*)handle->relumask->data, nBlocksOFm, handle->bn, handle->bk/32); +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_relu_kernel; +libxsmm_meltw_unary_param eltwise_params; +#elif defined(LIBXSMM_DNN_FC_FWD_FUSE_SIGMOID) +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_sigmoid_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#else +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#endif +#else +libxsmm_meltwfunction_unary eltwise_kernel = handle->fwd_cvtfp32bf16_kernel; +libxsmm_meltw_unary_param eltwise_params; +#endif + +unsigned long long blocks = nBlocksIFm; +int CB_BLOCKS = nBlocksIFm, BF = 1; +/* the input-feature blocks are processed in BF steps of CB_BLOCKS blocks each; blocks is the batch count handed to the kernels */ +BF = handle->fwd_bf; +CB_BLOCKS = nBlocksIFm/BF; +blocks = CB_BLOCKS; +/* the im/in task ranges are only assigned inside this branch; without 2D blocking they stay [0, 0) and the compute loops below do not execute */ +if (use_2d_blocking == 1) { + row_teams = handle->fwd_row_teams; + column_teams = handle->fwd_column_teams; + my_col_id = ltid % column_teams; + my_row_id = ltid / column_teams; + im_tasks_per_thread = 
(nBlocksMB + row_teams-1)/row_teams; + in_tasks_per_thread = (nBlocksOFm + column_teams-1)/column_teams; + my_im_start = LIBXSMM_MIN( my_row_id * im_tasks_per_thread, nBlocksMB); + my_im_end = LIBXSMM_MIN( (my_row_id+1) * im_tasks_per_thread, nBlocksMB); + my_in_start = LIBXSMM_MIN( my_col_id * in_tasks_per_thread, nBlocksOFm); + my_in_end = LIBXSMM_MIN( (my_col_id+1) * in_tasks_per_thread, nBlocksOFm); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +tile_config_kernel(NULL, NULL, NULL); + +if (BF > 1) { + for ( ifm1 = 0; ifm1 < BF; ++ifm1 ) { + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { + /* Initialize intermediate f32 tensor on the first reduction step (ifm1 == 0): with fused bias each of the handle->bn rows of the tile receives bias block ofm1 (handle->bk values, passed through LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32); otherwise the tile is zeroed */ + if ( ifm1 == 0 ) { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + for ( mb2 = 0; mb2 < handle->bn; ++mb2 ) { + LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, ofm1, 0,handle->bk), &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, mb2, 0, nBlocksOFm,handle->bn,handle->bk), handle->bk ); + } +#else + memset(&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), 0, handle->bn*handle->bk*sizeof(float)); +#endif + } + +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(float)); + if ( ifm1 == BF-1 ) { + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); + } +#endif + /* first minibatch block of this (ifm1, ofm1) pair: call the decompressing kernel with the bitmap and this thread's decompress_buffer; the remaining mb1 iterations pass decompressed_filter as the filter operand instead */ + if (mb1 == my_im_start) { + gemm_eltwise_params.sparse_bitmap = &LIBXSMM_VLA_ACCESS(5, idx_filter_compressed, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk/32, lpb); + gemm_eltwise_params.decompress_buffer = &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb); + batchreduce_kernel_decompress( &LIBXSMM_VLA_ACCESS(5, filter_compressed, ofm1, ifm1*CB_BLOCKS, 0, 0, 0, nBlocksIFm, bc_lp, 
handle->bk/handle->sparsity_factor_A, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks, &gemm_eltwise_params); + } else { + /* not the first minibatch block: the filter operand is this thread's decompressed_filter buffer */ batchreduce_kernel( &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, ifm1*CB_BLOCKS, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), &blocks); + } + + /* after the last reduction step (ifm1 == BF-1): run eltwise_kernel with the f32 tile as input and the output tile as destination (downconvert to bf16) */ + if ( ifm1 == BF-1 ) { + eltwise_params.in.primary = &LIBXSMM_VLA_ACCESS(4, output_f32, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); + eltwise_params.out.primary = &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk); +#if defined(LIBXSMM_DNN_FC_FWD_FUSE_RELU) + eltwise_params.out.secondary = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + eltwise_kernel(&eltwise_params); + } + } + } + } +} else { +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + /* BF <= 1: handle->desc.K bias values are converted once into this thread's fp32_bias_scratch */ LIBXSMM_DNN_CONVERT_BUFFER_BF16_F32( &LIBXSMM_VLA_ACCESS(2, bias, 0, 0,handle->bk), fp32_bias_scratch, handle->desc.K ); +#endif + for (ofm1 = my_in_start; ofm1 < my_in_end; ++ofm1) { + for (mb1 = my_im_start; mb1 < my_im_end; ++mb1) { +#ifdef WR_PREFETCH_OUTPUT + prefetchwt_chunk((char*)&LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk), handle->bn*handle->bk*sizeof(libxsmm_bfloat16)); +#endif +#ifndef LIBXSMM_DNN_FC_FWD_FUSE_NONE +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_BIAS + /* bias slice for output-feature block ofm1 */ gemm_eltwise_params.bias_ptr = (float*) fp32_bias_scratch + ofm1 * handle->bk; +#endif +#ifdef LIBXSMM_DNN_FC_FWD_FUSE_RELU + gemm_eltwise_params.out_ptr = &LIBXSMM_VLA_ACCESS(4, relubitmask, mb1, ofm1, 0, 0, nBlocksOFm, handle->bn, handle->bk/32); +#endif + if (mb1 == my_im_start) { + 
gemm_eltwise_params.sparse_bitmap = &LIBXSMM_VLA_ACCESS(5, idx_filter_compressed, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk/32, lpb); + gemm_eltwise_params.decompress_buffer = &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb); + bf16_batchreduce_kernel_zerobeta_fused_eltwise_decompress( &LIBXSMM_VLA_ACCESS(5, filter_compressed, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk/handle->sparsity_factor_A, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks, &gemm_eltwise_params); + } else { + /* later minibatch blocks of the same ofm1 pass decompressed_filter as the filter operand; presumably it was filled by the decompress kernel on the mb1 == my_im_start iteration - confirm against the kernel generator */ bf16_batchreduce_kernel_zerobeta_fused_eltwise( &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks, &gemm_eltwise_params); + } +#else + /* LIBXSMM_DNN_FC_FWD_FUSE_NONE: same first-block / later-block split, using the kernels without fused eltwise */ if (mb1 == my_im_start) { + gemm_eltwise_params.sparse_bitmap = &LIBXSMM_VLA_ACCESS(5, idx_filter_compressed, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk/32, lpb); + gemm_eltwise_params.decompress_buffer = &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb); + bf16_batchreduce_kernel_zerobeta_decompress( &LIBXSMM_VLA_ACCESS(5, filter_compressed, ofm1, 0, 0, 0, 0, nBlocksIFm, bc_lp, handle->bk/handle->sparsity_factor_A, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks, &gemm_eltwise_params); + } else { + bf16_batchreduce_kernel_zerobeta( &LIBXSMM_VLA_ACCESS(4, decompressed_filter, 0, 0, 0, 0, bc_lp, handle->bk, lpb), + &LIBXSMM_VLA_ACCESS(4, input, mb1, 0, 0, 0, nBlocksIFm, handle->bn, handle->bc), + &LIBXSMM_VLA_ACCESS(4, output, mb1, ofm1, 0, 0, nBlocksOFm, bn, bk), &blocks); + } +#endif + } + } +} +/* counterpart of the tile_config_kernel call made before the compute loops */ handle->tilerelease_kernel(NULL, NULL, NULL); 
+libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c new file mode 100644 index 00000000..d0acc711 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c @@ -0,0 +1,251 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ +/* Activation load/store helpers. With LIBXSMM_DNN_FUSEDBN_BWD_BF16, a load reads 16 16-bit values, widens them to 32 bits and shifts left by 16 before reinterpreting as f32; a store/stream applies LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16, shifts right by 16 and narrows back to 16 bits (the disabled #else variant narrows without the rounding step). Without the define, plain unaligned f32 load/store and LIBXSMM_INTRINSICS_MM512_STREAM_PS are used. */ +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) 
_mm512_storeu_ps(A,B) +#endif + +/* size variables, all const; sh/sw are the steps applied to hi/wi in the pixel loops, ofh/ofw are ifh/ifw divided by them (integer division), and the *p sizes add the padding on both sides */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +/* nhw = fullN * ifh * ifw converted to element_stats_type; recp_nhw is its reciprocal */ +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size: work / threads, rounded up */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: half-open task range of this thread, clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size: work2 / threads, rounded up */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: half-open task range of this thread, clamped to work2 */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; +/* activation tensors are viewed as [img][fm block][h padded][w padded][16] */ +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); +/* per-channel statistics and gradients are viewed as [fm block][16] */ +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 16); +LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); +/* per-image partial sums live in scratch, viewed as [fm block][img][16]: dgamma_img first, dbeta_img nImg * nBlocksFm * 16 elements later */ LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 16); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)16), nImg, 16); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) +/* 2 mask bytes per pixel; the pixel loop reads them with LIBXSMM_INTRINSICS_MM512_LOAD_MASK16 and advances by 2 */ LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { + /* phase 1: each (img, fm) task accumulates its partial dgamma/dbeta over the non-padded pixels and stores them into its own [fm][img] slot of the scratch */ for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 16); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); + + /* hi walks the non-padded input rows in steps of sh while ho advances by one output row; the row pointers below start at the first non-padded column (ipw / opw) */ for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + /* ReLU backward from the stored forward output: lanes whose output compares equal to zero get a zero gradient, and the masked gradient is written back to doutput */ const __m512 value = _mm512_load_act( output_ptr ); + const __mmask16 lcl_relumask = _mm512_cmp_ps_mask( value, _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput = 
_mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + output_ptr += 16; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + /* ReLU backward from the stored bit mask: lanes with a cleared mask bit get a zero gradient, and the masked gradient is written back to doutput */ const __mmask16 lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + /* the (possibly masked) output gradient is also streamed to dinput_add */ _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); + del_input_add_ptr += sw*16; +#endif + /* dgamma += (x - mean) * dy * rstd; dbeta += dy */ lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + + input_ptr += sw*16; + del_output_ptr += 16; + } + } + + _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + /* now we need to reduce del_gamma and del_beta: sum the nImg per-image partials of each feature-map block and store the result to dgamma/dbeta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 16; + del_beta_img_ptr += 16; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 
16), lcl_vdbeta ); + } + } else { + /* neither BN nor BNSTATS is set on this path: the per-image partials are still summed, but the result is written back in place to the img == 0 slot of the scratch instead of dgamma/dbeta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 16; + del_beta_img_ptr += 16; + } + + /* the pointers were advanced nImg times by 16, so subtracting nImg*16 addresses the first slot again */ _mm512_storeu_ps( del_gamma_img_ptr - (nImg*16), lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr - (nImg*16), lcl_vdbeta ); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual backward batch norm: every (img, fm) task reads dgamma/dbeta and the statistics of its feature-map block and writes dinput */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, 
nBlocksFm, ifhp, ifwp, 16); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + /* dx = gamma * rstd * (1/nhw) * ( nhw * dy - ( dbeta + (x - mean) * dgamma * rstd ) ), built up step by step below */ + lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + + /* input-side pointers advance by sw pixels of 16 channels, the output-gradient pointer by one pixel */ del_input_ptr += sw*16; + input_ptr += sw*16; + del_output_ptr += 16; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c new file mode 100644 index 00000000..dfc6c368 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c @@ -0,0 +1,312 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel 
*/ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 32); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); + +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 32); +LIBXSMM_VLA_DECL(2, 
element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + __m512 lcl_vbmean2, lcl_vbrstd2; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if 
defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput, lcl_vdeloutput2; +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + const __m512 vzero = _mm512_setzero_ps(); + __mmask16 lcl_relumask, lcl_relumask2; +#endif + + lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); +#endif + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), 
lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + + lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + output_ptr += 32; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); + del_input_add_ptr += sw*32; +#endif + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); + + input_ptr += sw*32; + del_output_ptr += 32; + } + } + + _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); + _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + /* now we need to reduce the del_gamm and del_beta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); 
+ __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); + del_gamma_img_ptr += 32; + del_beta_img_ptr += 32; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32), lcl_vdbeta ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32), lcl_vdbeta2 ); + } + } else { + /* now we need to reduce the del_gamm and del_beta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); + del_gamma_img_ptr += 32; + del_beta_img_ptr += 32; + } + + _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg), lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr - (32*nImg), lcl_vdbeta ); + _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg) + 16, lcl_vdgamma2 ); + _mm512_storeu_ps( 
del_beta_img_ptr - (32*nImg) + 16, lcl_vdbeta2 ); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual backward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; + __m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32) ); + lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + __m512 lcl_vdelinput2; + + lcl_vdelinput = _mm512_sub_ps( 
_mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + + lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); + lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); + + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); + + del_input_ptr += sw*32; + input_ptr += sw*32; + del_output_ptr += 32; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 100644 index 00000000..3d09b972 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,386 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + /* Activation load, store and stream helpers. With LIBXSMM_DNN_FUSEDBN_BWD_BF16 defined, a load widens sixteen 16-bit values into the upper halves of 32-bit lanes, and a store or stream applies LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16, shifts each lane right by 16 and narrows back to 16 bits (the disabled else-variant narrows without that rounding step). Otherwise the helpers map to _mm512_loadu_ps, _mm512_storeu_ps and LIBXSMM_INTRINSICS_MM512_STREAM_PS. */ +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const: taken from handle->desc; ofh and ofw are the input extents divided by the 
strides sh and sw, and the ifhp, ifwp, ofhp and ofwp extents add twice the matching padding */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume 
that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + /* nhw is the product of fullN, H and W converted to element_stats_type (fullN here, while the task split below uses partN); recp_nhw is its reciprocal */ +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread: ltid is tid relative to start_thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size: work divided by the thread count, rounded up */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: the half-open task range of this thread, clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction: four 16-lane vectors per feature-map block */ +const int work2 = nBlocksFm * 4; +/* compute chunk size: work2 divided by the thread count, rounded up */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: the half-open reduction range of this thread, clamped to work2 */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; + +/* loop variables (wi and wo only drive the width loops; the pixel pointers are advanced explicitly) */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + /* five-dimensional views of the activation and gradient tensors; the innermost extent is fixed at 64 in this template */ +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); + /* statistics views with 64 values per feature-map block; dgamma_img and dbeta_img are per-image partial sums placed back to back in handle->scratch, dbeta_img starting nImg x nBlocksFm x 64 elements after dgamma_img */ +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, 
((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)64), nImg, 64); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + /* phase 1: partial sums of dgamma and dbeta per (image, feature-map block) task, followed by a reduction over images; entered when BN, BNSTATS or BNSTATS_NORED is set in fuse_ops */ +if ( ((handle->desc.fuse_ops & 
LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + __m512 lcl_vdgamma3 = _mm512_setzero_ps(); + __m512 lcl_vdbeta3 = _mm512_setzero_ps(); + __m512 lcl_vdgamma4 = _mm512_setzero_ps(); + __m512 lcl_vdbeta4 = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + __m512 lcl_vbmean2, lcl_vbrstd2; + __m512 lcl_vbmean3, lcl_vbrstd3; + __m512 lcl_vbmean4, lcl_vbrstd4; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + /* split the flat task index into image and feature-map block, then preload bmean and brstd for the four 16-lane groups */ + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 64); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + /* visit the non-padded input rows with stride sh; the output row ho advances by one per step */ + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + element_input_type* 
del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = 
&LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput, lcl_vdeloutput2, lcl_vdeloutput3, lcl_vdeloutput4; +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; + const __m512 vzero = _mm512_setzero_ps(); +#endif + /* lanes 0 to 15: load the output gradient; with RELU or RELU_WITH_MASK zero it where the mask bit is clear and write it back; with ELTWISE also stream it to dinput_add; then accumulate (input - bmean) times gradient times brstd into dgamma and the gradient into dbeta */ + lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); +#endif + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( 
_mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + /* lanes 16 to 31: same steps */ + lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask2 = 
_mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); +#endif + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); + /* lanes 32 to 47: same steps */ + lcl_vdeloutput3 = _mm512_load_act( del_output_ptr+32 ); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask3 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+32 ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, vzero, lcl_vdeloutput3 ); + _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask3 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, vzero, lcl_vdeloutput3 ); + _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+32, lcl_vdeloutput3 ); +#endif + lcl_vdgamma3 = _mm512_add_ps( lcl_vdgamma3, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ), lcl_vdeloutput3 ), lcl_vbrstd3 ) ); + lcl_vdbeta3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdeloutput3 ); + /* lanes 48 to 63: same steps */ + lcl_vdeloutput4 = 
_mm512_load_act( del_output_ptr+48 ); +#if 
defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + lcl_relumask4 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+48 ), vzero, _CMP_NEQ_OQ ); + lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, vzero, lcl_vdeloutput4 ); + _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); + output_ptr += 64; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask4 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, vzero, lcl_vdeloutput4 ); + _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+48, lcl_vdeloutput4 ); + del_input_add_ptr += sw*64; +#endif + lcl_vdgamma4 = _mm512_add_ps( lcl_vdgamma4, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ), lcl_vdeloutput4 ), lcl_vbrstd4 ) ); + lcl_vdbeta4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdeloutput4 ); + /* next pixel: input-side pointers skip sw pixels, output-side pointers move by one pixel */ + input_ptr += sw*64; + del_output_ptr += 64; + } + } + /* store the partial sums of this task into its scratch slot */ + _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); + _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); + _mm512_storeu_ps( del_gamma_img_ptr+32, lcl_vdgamma3 ); + _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); + _mm512_storeu_ps( del_gamma_img_ptr+48, lcl_vdgamma4 ); + _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + /* now we need to reduce the del_gamma and del_beta: each task sums one 16-lane group of a feature-map block over all nImg partials and 
writes the totals to dgamma and dbeta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, 
(fm/4), 0, ((fm%4)*16), nImg, 64); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 64; + del_beta_img_ptr += 64; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, (fm/4), ((fm%4)*16), 64), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, (fm/4), ((fm%4)*16), 64), lcl_vdbeta ); + } + } else { + /* now we need to reduce the del_gamma and del_beta: same summation, but the totals are written back to the image 0 slot of the scratch partials instead of dgamma and dbeta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 64; + del_beta_img_ptr += 64; + } + + _mm512_storeu_ps( del_gamma_img_ptr - (64*nImg), lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr - (64*nImg), lcl_vdbeta ); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual backward batch norm: dinput = gamma x brstd x recp_nhw x (nhw x doutput - (dbeta plus (input - bmean) x dgamma x brstd)), with dgamma and dbeta read from the grad_gamma and grad_beta tensors */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 
lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; + __m512 lcl_vgamma3, lcl_vbmean3, lcl_vbrstd3, lcl_vdgamma3, lcl_vdbeta3; + __m512 lcl_vgamma4, lcl_vbmean4, lcl_vbrstd4, lcl_vdgamma4, lcl_vdbeta4; + 
__m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 64) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 64) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 64) ); + lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 64) ); + + lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + lcl_vdgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 32, 64) ); + lcl_vdbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 32, 64) ); + + lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + lcl_vdgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 48, 64) ); + lcl_vdbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 48, 64) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, 
ifwp, 64); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + __m512 lcl_vdelinput2; + __m512 lcl_vdelinput3; + __m512 lcl_vdelinput4; + + lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + + lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); + lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); + + lcl_vdelinput3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vdgamma3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vbrstd3 ); + lcl_vdelinput3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+32 ) ), lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vbrstd3, lcl_vdelinput3 ); + lcl_vdelinput3 
= _mm512_mul_ps( lcl_vgamma3, lcl_vdelinput3 ); + + lcl_vdelinput4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vdgamma4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vbrstd4 ); + lcl_vdelinput4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+48 ) ), lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vbrstd4, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vgamma4, lcl_vdelinput4 ); + /* write the four 16-lane results to dinput through the stream helper */ + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); + _mm512_stream_act( del_input_ptr+32, lcl_vdelinput3 ); + _mm512_stream_act( del_input_ptr+48, lcl_vdelinput4 ); + + del_input_ptr += sw*64; + input_ptr += sw*64; + del_output_ptr += 64; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + /* NOTE(review): _mm512_roundbf16rne is defined in the BF16 case above but is not undefined here - confirm this is intended */ +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c new file mode 100644 index 00000000..e7b286c4 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c @@ -0,0 +1,274 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +/* size variables, all const */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +const int nFmBlock = handle->ifmblock; + +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int v = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); + +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nFmBlock); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, nFmBlock); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif + +#if 
defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) +union libxsmm_bfloat16_hp input_f32; +union libxsmm_bfloat16_hp del_input_f32; +union libxsmm_bfloat16_hp del_output_f32; +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) +union libxsmm_bfloat16_hp output_f32; +output_f32.i[1] = 0; +output_f32.i[0] = 0; +#endif +input_f32.i[1] = 0; +input_f32.i[0] = 0; +del_output_f32.i[1] = 0; +del_output_f32.i[0] = 0; +del_input_f32.i[1] = 0; +del_input_f32.i[0] = 0; +#endif + +assert( nFmBlock <= 64 ); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + /* @TODO check if we can bake this in into scratch */ + element_stats_type lcl_gamma_ptr[64]; + element_stats_type lcl_beta_ptr[64]; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + lcl_gamma_ptr[v] = 0.0f; + lcl_beta_ptr[v] = 0.0f; + } + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = 
&LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); + const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); + const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); + +#if !defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for ( v=0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) + del_output_f32.i[1] = del_output_ptr[v]; + del_output_f32.i[0] = 0; +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + output_f32.i[1] = output_ptr[v]; + del_output_f32.f = LIBXSMM_FEQ(output_f32.f, 0) ? 0 : del_output_f32.f; + del_output_ptr[v] = del_output_f32.i[1]; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? del_output_ptr[v] : 0); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + del_input_add_ptr[v] = del_output_ptr[v]; +#endif + input_f32.i[1] = input_ptr[v]; + lcl_gamma_ptr[v] += (input_f32.f - bmean_ptr[v]) * del_output_f32.f * brstd_ptr[v]; + lcl_beta_ptr[v] += del_output_f32.f; +#else +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) + del_output_ptr[v] = LIBXSMM_FEQ(output_ptr[v], 0) ? 0 : del_output_ptr[v]; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK) + del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? 
del_output_ptr[v] : 0); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE) + del_input_add_ptr[v] = del_output_ptr[v]; +#endif + lcl_gamma_ptr[v] += (input_ptr[v] - bmean_ptr[v]) * del_output_ptr[v] * brstd_ptr[v]; + lcl_beta_ptr[v] += del_output_ptr[v]; +#endif + } + } + } + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + del_gamma_img_ptr[v] = lcl_gamma_ptr[v]; + del_beta_img_ptr[v] = lcl_beta_ptr[v]; + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + /* now we need to reduce the del_gamm and del_beta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock); + element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + del_gamma_ptr[v] = (element_stats_type)0; + del_beta_ptr[v] = (element_stats_type)0; + } + + for ( img=0; img < nImg; img++ ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + del_gamma_ptr[v] += del_gamma_img_ptr[v]; + del_beta_ptr[v] += del_beta_img_ptr[v]; + } + } + } + } else { + /* now we need to reduce the del_gamm and del_beta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, nFmBlock); + element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, nFmBlock); + + for ( img=1; img < nImg; img++ ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 
img, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + del_gamma_ptr[v] += del_gamma_img_ptr[v]; + del_beta_ptr[v] += del_beta_img_ptr[v]; + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual backward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); + const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); + const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); + const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); + const element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock); + const element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock); + +#if !defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for ( v=0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16) + del_output_f32.i[1] = del_output_ptr[v]; + input_f32.i[1] = input_ptr[v]; + del_input_f32.f = gamma_ptr[v] * brstd_ptr[v] * recp_nhw * (nhw*del_output_f32.f - + (del_beta_ptr[v] + (input_f32.f - bmean_ptr[v]) * del_gamma_ptr[v] * brstd_ptr[v])); + del_input_ptr[v] = del_input_f32.i[1]; +#else + del_input_ptr[v] = gamma_ptr[v] * brstd_ptr[v] * recp_nhw * 
(nhw*del_output_ptr[v] - + (del_beta_ptr[v] + (input_ptr[v] - bmean_ptr[v]) * del_gamma_ptr[v] * brstd_ptr[v])); +#endif + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c new file mode 100644 index 00000000..4d09a868 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c @@ -0,0 +1,248 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/
+
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) /* BF16 activations: load widens 16 x 16-bit values and shifts them left by 16; stores shift right by 16 and narrow again */
+# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16))
+#if 1 /* active variant: values pass through LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16 before narrowing */
+# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A)
+# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
+# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
+#else /* disabled variant: narrows the raw bits without the rounding helper */
+# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
+# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
+#endif
+#else /* otherwise: unaligned 512-bit float load/store plus LIBXSMM's streaming store */
+# define _mm512_load_act(A) _mm512_loadu_ps(A)
+# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B)
+# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B)
+#endif
+
+/* Forward fused batch norm with the feature-map block size hard-coded to 16 (one __m512 of floats); expects handle, tid and start_thread from the including function. Size variables, all const */
+const int nImg = handle->desc.partN;
+const int ifh = handle->desc.H;
+const int ifw = handle->desc.W;
+const int sh = handle->desc.u; /* step of hi per output row in the apply loop */
+const int sw = handle->desc.v; /* step of the input pointer (in pixels) per output column in the apply loop */
+const int ofh = ifh/sh;
+const int ofw = ifw/sw;
+const int iph = handle->desc.pad_h_in;
+const int ipw = handle->desc.pad_w_in;
+const int oph = handle->desc.pad_h_out;
+const int opw = handle->desc.pad_w_out;
+const int ofhp = ofh + 2*oph; /* padded output height */
+const int ofwp = ofw + 2*opw; /* padded output width */
+const int ifhp = ifh + 2*iph; /* padded input height */
+const int ifwp = ifw + 2*ipw; /* padded input width */
+/* here we assume that input and output blocking is similar */
+const int nBlocksFm = handle->blocksifm;
+
+/* computing first logical thread */
+const int ltid = tid - start_thread;
+/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */
+const int work = nImg * nBlocksFm;
+/* compute chunk size: ceil(work / threads) */
+const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1);
+/* compute thr_begin and thr_end, both clamped to work */
+const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work;
+const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work;
+
+/* number of tasks that could be run in parallel for the sum / sum^2 reduction: one per feature-map block */
+const int work2 = nBlocksFm;
+/* compute chunk size: ceil(work2 / threads) */
+const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1);
+/* compute thr_begin and thr_end, both clamped to work2 */
+const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2;
+const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2;
+
+/* eps to avoid sqrt of zero; it is added to the variance before the square root */
+const element_stats_type sqrt_eps = 1e-7f;
+const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw);
+const element_stats_type recp_nhw = 1.0f/nhw;
+
+/* loop variables */
+int img = 0;
+int fm = 0;
+int imgfm = 0;
+int hi = 0;
+int wi = 0;
+int ho = 0;
+int wo = 0;
+
+LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16);
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE)
+LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 16);
+#endif
+LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16);
+LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16);
+LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 16);
+LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16);
+LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16);
+LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 16);
+LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 16); /* per-image partial sums at the start of scratch, indexed [fm][img][16] */
+LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 16), nImg, 16); /* placed directly after the sum_img partials */
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK)
+LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); /* 2 bytes per pixel: one 16-bit lane mask */
+#endif
+
+/* lazy barrier init */
+libxsmm_barrier_init(handle->barrier, ltid);
+
+if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || /* phase 1: per-image sum and sum^2, then reduction over images */
+     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ||
+     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) {
+  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
+    __m512 lcl_vsum = _mm512_setzero_ps();
+    __m512 lcl_vsumsq = _mm512_setzero_ps();
+    element_stats_type* sum_img_ptr;
+    element_stats_type* sumsq_img_ptr;
+
+    img = imgfm / nBlocksFm;
+    fm = imgfm % nBlocksFm;
+    sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 16);
+    sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 16);
+
+    for ( hi=iph; hi < (ifh + iph); hi++ ) { /* statistics visit every unpadded input pixel (step 1, no stride) */
+      const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16);
+      for ( wi=ipw; wi < (ifw + ipw); wi++ ) {
+        __m512 lcl_vinput = _mm512_load_act( input_ptr );
+        lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput );
+        lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) );
+
+        input_ptr += 16;
+      }
+    }
+
+    _mm512_storeu_ps( sum_img_ptr, lcl_vsum );
+    _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq );
+  }
+
+  libxsmm_barrier_wait(handle->barrier, ltid);
+
+  /* now we need to reduce the per-image sum and sum^2 partials over all images */
+  for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
+    __m512 lcl_vsum = _mm512_setzero_ps();
+    __m512 lcl_vsumsq = _mm512_setzero_ps();
+    element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 16);
+    element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 16);
+
+    for ( img=0; img < nImg; img++ ) {
+      lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) );
+      lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) );
+      sum_img_ptr += 16;
+      sumsq_img_ptr += 16;
+    }
+
+    if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) ||
+         ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) {
+      __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps);
+      __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw);
+      __m512 lcl_vone = _mm512_set1_ps(1.0);
+      __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar;
+      lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */
+      lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */
+      lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */
+      lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance = E(X^2) - E(X)^2 */
+#if 0 /* disabled: computes the same reciprocal standard deviation in double precision, 8 lanes at a time */
+      {
+        __m512d lcl_voned = _mm512_set1_pd(1.0);
+        __m512d lcl_vepsd = _mm512_set1_pd(1e-7);
+        __m512d lcl_vlo = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 0 ) ) );
+        __m512d lcl_vhi = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 1 ) ) );
+        lcl_vlo = _mm512_sqrt_pd( _mm512_add_pd( lcl_vlo, lcl_vepsd ) );
+        lcl_vhi = _mm512_sqrt_pd( _mm512_add_pd( lcl_vhi, lcl_vepsd ) );
+        lcl_vlo = _mm512_div_pd( lcl_voned, lcl_vlo );
+        lcl_vhi = _mm512_div_pd( lcl_voned, lcl_vhi );
+        lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_setzero_pd(), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vlo ) ), 0 ) );
+        lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_castps_pd( lcl_vbrstd ), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vhi ) ), 1 ) );
+      }
+#else
+      lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); /* 1 / sqrt(var + eps); NOTE(review): confirm rounding in the single-pass variance cannot make var + eps negative */
+#endif
+
+      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16), lcl_vbmean );
+      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16), lcl_vbrstd );
+      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 16), lcl_vvar );
+    } else {
+      sum_img_ptr -= 16*nImg; /* rewind to the image-0 slot; the reduced totals are written back into scratch */
+      sumsq_img_ptr -= 16*nImg;
+
+      _mm512_storeu_ps( sum_img_ptr, lcl_vsum );
+      _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq );
+    }
+  }
+
+  libxsmm_barrier_wait(handle->barrier, ltid);
+}
+
+if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) ||
+     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) {
+  /* now we apply the actual forward batch norm: out = gamma * (in - mean) * rstd + beta, then the optional eltwise add and ReLU */
+  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
+    __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd;
+
+    img = imgfm / nBlocksFm;
+    fm = imgfm % nBlocksFm;
+    lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) );
+    lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 16) );
+    lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) );
+    lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) );
+
+    for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) {
+      const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16);
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE)
+      const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16);
+#endif
+      element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16);
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK)
+      unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2);
+#endif
+      for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) {
+        __m512 lcl_vo;
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK)
+        __mmask16 lcl_relumask;
+#endif
+
+        /* BN + scale (gamma, beta) */
+        lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean );
+        lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo );
+        lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta );
+        /* eltwise add */
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE)
+        lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) );
+#endif
+        /* ReLU */
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU)
+        lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() );
+#endif
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK)
+        lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); /* bit set where the value is > 0 */
+        lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); /* keep lanes whose bit is set, zero the rest */
+        LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask );
+        relumask_ptr += 2;
+#endif
+        _mm512_stream_act( output_ptr, lcl_vo );
+
+        input_ptr += sw*16;
+#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE)
+        input_add_ptr += sw*16;
+#endif
+        output_ptr += 16;
+      }
+    }
+  }
+
+  libxsmm_barrier_wait(handle->barrier, ltid);
+}
+
+# undef _mm512_load_act
+# undef _mm512_stream_act
+# undef _mm512_store_act /* NOTE(review): _mm512_roundbf16rne (defined above for BF16) is not #undef'd here; confirm this is intended */
+
diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c
new file mode 100644
index 00000000..fac158a7
--- /dev/null
+++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c
@@ -0,0 +1,294 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved. *
+* This file is part of the LIBXSMM library. *
+* *
+* For information on the license, see the LICENSE file. *
+* Further information: https://github.com/hfp/libxsmm/ *
+* SPDX-License-Identifier: BSD-3-Clause *
+******************************************************************************/
+/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps to avoid sqrt of zero */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 32); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, variance, 
(element_stats_type*)handle->variance->data, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 32), nImg, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 32); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 32); + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + + lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); + + input_ptr += 32; + } + } + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + + 
_mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); + _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we need to reduce the sum and sum^2, we use the final */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 32); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 32); + + for ( img=0; img < nImg; img++ ) { + lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, _mm512_loadu_ps( sum_img_ptr+16 ) ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_loadu_ps( sumsq_img_ptr+16 ) ); + + sum_img_ptr += 32; + sumsq_img_ptr += 32; + } + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); + __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); + __m512 lcl_vone = _mm512_set1_ps(1.0); + __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; + __m512 lcl_vbmean2, lcl_vbmeansq2, lcl_vsqbmean2, lcl_vbrstd2, lcl_vvar2; + + lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ + lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ + lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ + lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ + lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); + + lcl_vbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum2 ); /* E(X) */ + lcl_vbmeansq2 = _mm512_mul_ps( lcl_vbmean2, lcl_vbmean2 ); /* E(X)^2 */ + 
lcl_vsqbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq2 ); /* E(X^2) */ + lcl_vvar2 = _mm512_sub_ps( lcl_vsqbmean2, lcl_vbmeansq2 ); /* variance */ + lcl_vbrstd2 = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar2, lcl_vsqrt_eps ) ) ); + + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32), lcl_vbmean ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32), lcl_vbrstd ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 32), lcl_vvar ); + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32), lcl_vbmean2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32), lcl_vbrstd2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 16, 32), lcl_vvar2 ); + } else { + sum_img_ptr -= 32*nImg; + sumsq_img_ptr -= 32*nImg; + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + + _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); + _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual forward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; + __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); + lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 32) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); + lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) 
); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#endif + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); +#endif + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + __m512 lcl_vo; + __m512 lcl_vo2; +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask; + __mmask16 lcl_relumask2; +#endif + + /* BN + scale (gamma, beta) */ + lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); + lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); + relumask_ptr += 2; +#endif + + /* BN + scale (gamma, beta) */ + lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); + lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); + /* 
eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); + relumask_ptr += 2; +#endif + + _mm512_stream_act( output_ptr, lcl_vo ); + _mm512_stream_act( output_ptr+16, lcl_vo2 ); + + input_ptr += sw*32; +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + input_add_ptr += sw*32; +#endif + output_ptr += 32; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 100644 index 00000000..2aacc9a7 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,348 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + /* Template body, presumably included into a function that provides handle, tid, start_thread and the element_*_type names (none are declared here): AVX-512 forward pass of the fused batch norm for feature-map blocks of 64 channels, handled as four 16-lane vectors. Activations go through _mm512_load_act and _mm512_stream_act: with LIBXSMM_DNN_FUSEDBN_FWD_BF16 they are 16-bit values widened to / narrowed from f32 by a 16-bit shift, otherwise a plain f32 load and LIBXSMM_INTRINSICS_MM512_STREAM_PS. */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 /* this branch passes the value through LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16 before the shift (round-to-nearest-even, going by the macro name); the other branch shifts the raw f32 bits, i.e. drops the low 16 bits */ +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size (work / threads, rounded up) */ +const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks for the reduction of sum and sum^2 over the images: one task per 16-float vector, i.e. four per feature-map block (decoded below as fm/4 and (fm%4)*16) */ +const int work2 = nBlocksFm * 4; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps added to the variance before the square root, to avoid sqrt of zero */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); /* note: uses desc.fullN, whereas nImg above is desc.partN */ +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 64); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, 
variance, (element_stats_type*)handle->variance->data, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 64), nImg, 64); /* sum_img and sumsq_img are indexed [feature-map block][image][channel]; sumsq_img starts nImg*nBlocksFm*64 elements into the same scratch buffer */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); /* 8 bytes per output pixel: four 16-bit masks, see the relumask_ptr += 2 steps below */ +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { /* phase 1: per (image, feature-map block) partial sum and sum of squares over rows iph..ifh+iph-1 and columns ipw..ifw+ipw-1, stored to scratch */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + __m512 lcl_vsum3 = _mm512_setzero_ps(); + __m512 lcl_vsumsq3 = _mm512_setzero_ps(); + __m512 lcl_vsum4 = _mm512_setzero_ps(); + __m512 lcl_vsumsq4 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 64); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 64); + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); + __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); + + lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, 
lcl_vinput ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); + + lcl_vsum3 = _mm512_add_ps( lcl_vsum3, lcl_vinput3 ); + lcl_vsumsq3 = _mm512_add_ps( lcl_vsumsq3, _mm512_mul_ps( lcl_vinput3, lcl_vinput3 ) ); + + lcl_vsum4 = _mm512_add_ps( lcl_vsum4, lcl_vinput4 ); + lcl_vsumsq4 = _mm512_add_ps( lcl_vsumsq4, _mm512_mul_ps( lcl_vinput4, lcl_vinput4 ) ); + + input_ptr += 64; + } + } + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + + _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); + _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); + + _mm512_storeu_ps( sum_img_ptr+32, lcl_vsum3 ); + _mm512_storeu_ps( sumsq_img_ptr+32, lcl_vsumsq3 ); + + _mm512_storeu_ps( sum_img_ptr+48, lcl_vsum4 ); + _mm512_storeu_ps( sumsq_img_ptr+48, lcl_vsumsq4 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); /* presumably makes all partial sums visible before the reduction below reads them */ + + /* now we need to reduce the sum and sum^2 over all images; each task handles one 16-float vector of one feature-map block */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + + for ( img=0; img < nImg; img++ ) { + lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); + + sum_img_ptr += 64; + sumsq_img_ptr += 64; + } + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); + __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); + __m512 lcl_vone = _mm512_set1_ps(1.0); + __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; + + lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, 
lcl_vsum ); /* E(X) */ + lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ + lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ + lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ + lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); /* 1/sqrt(variance + eps) */ + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, (fm/4), ((fm%4)*16), 64), lcl_vbmean ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, (fm/4), ((fm%4)*16), 64), lcl_vbrstd ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, (fm/4), ((fm%4)*16), 64), lcl_vvar ); + } else { /* reached only when neither BN nor BNSTATS is set, i.e. BNSTATS_NORED: step the pointers back to image 0 and store the reduced sums there */ + sum_img_ptr -= 64*nImg; + sumsq_img_ptr -= 64*nImg; + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual forward batch norm: out = (gamma * (in - mean)) * rstd + beta per channel, then optional eltwise add and ReLU */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; + __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; + __m512 lcl_vgamma3, lcl_vbeta3, lcl_vbmean3, lcl_vbrstd3; + __m512 lcl_vgamma4, lcl_vbeta4, lcl_vbmean4, lcl_vbrstd4; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); + lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 64) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); + lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + + 
lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); + lcl_vbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 32, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + + lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); + lcl_vbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 48, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { /* input rows advance by stride sh, output rows by one */ + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#endif + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); +#endif + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + __m512 lcl_vo; + __m512 lcl_vo2; + __m512 lcl_vo3; + __m512 lcl_vo4; +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask; + __mmask16 lcl_relumask2; + __mmask16 lcl_relumask3; + __mmask16 lcl_relumask4; +#endif + + /* BN + scale (gamma, beta), channels 0..15 */ + lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); + lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); +#endif + /* ReLU */ +#if 
defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); + relumask_ptr += 2; /* lanes with value > 0 keep their value and set the mask bit, all others become zero; after the mask store the byte pointer advances by 2 (16 mask bits) */ +#endif + + /* BN + scale (gamma, beta), channels 16..31 */ + lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); + lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); + relumask_ptr += 2; +#endif + + /* BN + scale (gamma, beta), channels 32..47 */ + lcl_vo3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); + lcl_vo3 = _mm512_mul_ps( lcl_vgamma3, lcl_vo3 ); + lcl_vo3 = _mm512_fmadd_ps( lcl_vo3, lcl_vbrstd3, lcl_vbeta3 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo3 = _mm512_add_ps( lcl_vo3, _mm512_load_act( input_add_ptr+32 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo3 = _mm512_max_ps( lcl_vo3, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask3 = _mm512_cmp_ps_mask( lcl_vo3, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vo3 ); 
+ LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask3 ); + relumask_ptr += 2; +#endif + + /* BN + scale (gamma, beta), channels 48..63 */ + lcl_vo4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); + lcl_vo4 = _mm512_mul_ps( lcl_vgamma4, lcl_vo4 ); + lcl_vo4 = _mm512_fmadd_ps( lcl_vo4, lcl_vbrstd4, lcl_vbeta4 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + lcl_vo4 = _mm512_add_ps( lcl_vo4, _mm512_load_act( input_add_ptr+48 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + lcl_vo4 = _mm512_max_ps( lcl_vo4, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask4 = _mm512_cmp_ps_mask( lcl_vo4, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vo4 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask4 ); + relumask_ptr += 2; +#endif + + _mm512_stream_act( output_ptr, lcl_vo ); + _mm512_stream_act( output_ptr+16, lcl_vo2 ); + _mm512_stream_act( output_ptr+32, lcl_vo3 ); + _mm512_stream_act( output_ptr+48, lcl_vo4 ); + + input_ptr += sw*64; /* input advances by the horizontal stride sw, output by one pixel */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + input_add_ptr += sw*64; +#endif + output_ptr += 64; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act /* NOTE(review): _mm512_roundbf16rne, defined at the top of this template when LIBXSMM_DNN_FUSEDBN_FWD_BF16 is set, is not undefined again here; confirm whether that is intended */ + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c new file mode 100644 index 00000000..76e512f8 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedbatchnorm_st_fwd_custom_generic.tpl.c @@ -0,0 +1,265 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + /* Template body, presumably included into a function that provides handle, tid, start_thread and the element_*_type names (none are declared here): scalar forward pass of the fused batch norm, looping over the nFmBlock channels of each feature-map block. Phase 1 accumulates per (image, feature-map block) sums and sums of squares into scratch; phase 2 reduces them over the images and, for BN or BNSTATS, derives mean, variance and 1/sqrt(variance + eps); phase 3 computes gamma*(x - mean)*rstd + beta with optional eltwise add and ReLU. */ +/* size variables, all const */ +const int nImg = handle->desc.partN; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +const int nFmBlock = handle->ifmblock; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size (work / threads, rounded up) */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks for the reduction of sum and sum^2 over the images: one per feature-map block */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? 
(ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps added to the variance before the square root, to avoid sqrt of zero */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw); /* note: uses desc.fullN, whereas nImg above is desc.partN */ +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int v = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, nFmBlock); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, nFmBlock); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock); /* sum_img and sumsq_img are indexed [feature-map block][image][channel]; sumsq_img starts nImg*nBlocksFm*nFmBlock elements into the same scratch buffer */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); /* one mask byte per channel; 0 or 1 is written below */ +#endif + +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) +union libxsmm_bfloat16_hp 
input_f32; +union libxsmm_bfloat16_hp output_f32; +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) +union libxsmm_bfloat16_hp input_add_f32; +input_add_f32.i[1] = 0; +input_add_f32.i[0] = 0; +#endif +input_f32.i[1] = 0; +input_f32.i[0] = 0; +output_f32.i[1] = 0; +output_f32.i[0] = 0; /* all unions start fully zeroed; the loops below only write i[1] of input_f32 and input_add_f32 before reading .f, so their i[0] stays 0 */ +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) { /* phase 1: per (image, feature-map block) partial sum and sum of squares over rows iph..ifh+iph-1 and columns ipw..ifw+ipw-1, stored to scratch */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + /* @TODO check if we can bake this in into scratch; NOTE(review): the two local arrays hold 64 entries but are indexed with v < nFmBlock, presumably nFmBlock <= 64 is guaranteed elsewhere, confirm */ + element_stats_type lcl_sum_ptr[64]; + element_stats_type lcl_sumsq_ptr[64]; + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, nFmBlock); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + lcl_sum_ptr[v] = (element_stats_type)0; + lcl_sumsq_ptr[v] = (element_stats_type)0; + } + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); + +#if !defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for (v=0; v < nFmBlock; v++) { +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + input_f32.i[1] = input_ptr[v]; /* the 16-bit input goes into element 1 of the union and is read back as float via .f (presumably the upper half of the f32 bit pattern, confirm against the union definition) */ + lcl_sum_ptr[v] += input_f32.f; + lcl_sumsq_ptr[v] += (input_f32.f * input_f32.f); +#else + lcl_sum_ptr[v] += input_ptr[v]; + lcl_sumsq_ptr[v] += (input_ptr[v] * input_ptr[v]); +#endif + } + } + } + + LIBXSMM_PRAGMA_SIMD + for (v=0; v < nFmBlock; v++) { + sum_img_ptr[v] = lcl_sum_ptr[v]; + sumsq_img_ptr[v] = lcl_sumsq_ptr[v]; + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* 
now we need to reduce the sum and sum^2 over all images of each feature-map block */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + /* @TODO check if we can bake this in into scratch; NOTE(review): same fixed capacity of 64 entries versus nFmBlock as in the loop above, confirm */ + element_stats_type lcl_sum_ptr[64]; + element_stats_type lcl_sumsq_ptr[64]; + element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); + element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); + element_stats_type* tvar_ptr = &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + lcl_sum_ptr[v] = (element_stats_type)0; + lcl_sumsq_ptr[v] = (element_stats_type)0; + } + + for ( img=0; img < nImg; img++ ) { + element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, nFmBlock); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + lcl_sum_ptr[v] += sum_img_ptr[v]; + lcl_sumsq_ptr[v] += sumsq_img_ptr[v]; + } + } + + if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) { + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + const element_stats_type tbmean = (recp_nhw * lcl_sum_ptr[v]); /* E(X) */ + const element_stats_type tbmeansq = tbmean * tbmean; /* E(X)^2 */ + const element_stats_type tsqbmean = recp_nhw * lcl_sumsq_ptr[v]; /* E(X^2) */ + const element_stats_type tvar = tsqbmean - tbmeansq; /* variance */ + const element_stats_type tbrstd = (element_stats_type)(1.0/sqrt((double)tvar + sqrt_eps)); /* 1/sqrt(variance + eps), evaluated in double and converted back */ + bmean_ptr[v] = tbmean; + brstd_ptr[v] = tbrstd; + tvar_ptr[v] = tvar; + } + } else { /* reached only when neither BN nor BNSTATS is set, i.e. BNSTATS_NORED: store the reduced sums in the image-0 slot of scratch */ + element_stats_type* sum_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, nFmBlock); + element_stats_type* sumsq_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + sum_ptr[v] = lcl_sum_ptr[v]; + sumsq_ptr[v] = lcl_sumsq_ptr[v]; + } + } + } + + 
libxsmm_barrier_wait(handle->barrier, ltid); +} + +if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) || + ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) { + /* now we apply the actual forward batch norm: o = gamma*(in - mean)*rstd + beta per channel, then optional eltwise add and ReLU */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { /* input rows advance by stride sh, output rows by one */ + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { /* input columns advance by stride sw, output columns by one */ + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif + const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); + const element_stats_type* beta_ptr = &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, nFmBlock); + const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, nFmBlock); + const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, nFmBlock); + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif + float o; + +#if !defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for (v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + input_f32.i[1] = input_ptr[v]; + o = gamma_ptr[v]*(input_f32.f - bmean_ptr[v])*brstd_ptr[v] + beta_ptr[v]; +#else + /* BN + scale (gamma, beta) */ + o = gamma_ptr[v]*(input_ptr[v] - bmean_ptr[v])*brstd_ptr[v] + beta_ptr[v]; +#endif + /* Eltwise */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_ELTWISE) +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + input_add_f32.i[1] = input_add_ptr[v]; + o += 
input_add_f32.f; +#else + o += input_add_ptr[v]; +#endif +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU) + o = ( o > 0.0f ) ? o : 0.0f; +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_ENABLE_RELU_WITH_MASK) + o = ( o > 0.0f ) ? o : 0.0f; + relumask_ptr[v] = (unsigned char)(o > 0.0f ? 1 : 0); /* mask byte is 1 where the clamped value is still > 0, else 0 */ +#endif +#if defined(LIBXSMM_DNN_FUSEDBN_FWD_BF16) + output_f32.f = o; + output_ptr[v] = output_f32.i[1]; /* element 1 of the union is copied as-is; no rounding step is applied here */ +#else + output_ptr[v] = o; +#endif + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c new file mode 100644 index 00000000..cb10fbd8 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c @@ -0,0 +1,222 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ 
+const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); + +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 16); +LIBXSMM_VLA_DECL(2, 
element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 16); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)16), nImg, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 16); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, 
opw, 0, nBlocksFm, ofhp, ofwp, 2); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + /* keep the gradient only in lanes where the saved forward output compares != 0, zero it elsewhere, and write the masked gradient back */ const __mmask16 lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + output_ptr += 16; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + /* same masking as above, but the lane mask is read from the relumask buffer (2 bytes consumed per pixel) */ const __mmask16 lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); + del_input_add_ptr += sw*16; +#endif + /* accumulate del_gamma += (input - mean) * del_output * rstd and del_beta += del_output */ lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + + /* the input pointer advances by the stride sw, the output-gradient pointer advances one pixel at a time */ input_ptr += sw*16; + del_output_ptr += 16; + } + } + + /* store this (feature block, image) partial sum into the scratch-backed dgamma_img/dbeta_img arrays */ _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we need to reduce the per-image del_gamma and del_beta partial sums over all images */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 16); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); + __m512 lcl_vdgamma =
_mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 16; + del_beta_img_ptr += 16; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16), lcl_vdbeta ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we apply the actual backward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 16) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 16) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + + lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( 
lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + + del_input_ptr += sw*16; + input_ptr += sw*16; + del_output_ptr += 16; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c new file mode 100644 index 00000000..11d7dd00 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c @@ -0,0 +1,280 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ 
+const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 32); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); + +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 32); +LIBXSMM_VLA_DECL(2, 
element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + __m512 lcl_vbmean2, lcl_vbrstd2; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#endif +#if 
defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput, lcl_vdeloutput2; +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + __mmask16 lcl_relumask, lcl_relumask2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask, lcl_relumask2; +#endif + + lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); +#endif + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + + lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); +#if 
defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + output_ptr += 32; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + /* mask for the second 16 channels; relumask_ptr was already advanced by 2 bytes after the first half */ lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); + del_input_add_ptr += sw*32; +#endif + /* accumulate del_gamma/del_beta for the second 16 channels (offset +16) of the 32-channel block */ lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); + + /* the input pointer advances by the stride sw, the output-gradient pointer advances one pixel at a time */ input_ptr += sw*32; + del_output_ptr += 32; + } + } + + /* store this (feature block, image) partial sum, both 16-channel halves, into the scratch-backed dgamma_img/dbeta_img arrays */ _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); + _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we need to reduce the per-image del_gamma and del_beta partial sums over all images */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta =
_mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); + del_gamma_img_ptr += 32; + del_beta_img_ptr += 32; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32), lcl_vdbeta ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32), lcl_vdbeta2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we apply the actual backward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; + __m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 32) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32) ); + lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, 
nBlocksFm, ifhp, ifwp, 32); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + __m512 lcl_vdelinput2; + + lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + + lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); + lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); + + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); + + del_input_ptr += sw*32; + input_ptr += sw*32; + del_output_ptr += 32; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c 
b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 100644 index 00000000..b3c58231 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,360 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = 
handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm * 4; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? 
((ltid + 1) * chunksize2) : work2; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); +LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) +LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); + +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)64), nImg, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { 
+ __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + __m512 lcl_vdgamma2 = _mm512_setzero_ps(); + __m512 lcl_vdbeta2 = _mm512_setzero_ps(); + __m512 lcl_vdgamma3 = _mm512_setzero_ps(); + __m512 lcl_vdbeta3 = _mm512_setzero_ps(); + __m512 lcl_vdgamma4 = _mm512_setzero_ps(); + __m512 lcl_vdbeta4 = _mm512_setzero_ps(); + __m512 lcl_vbmean, lcl_vbrstd; + __m512 lcl_vbmean2, lcl_vbrstd2; + __m512 lcl_vbmean3, lcl_vbrstd3; + __m512 lcl_vbmean4, lcl_vbrstd4; + element_stats_type* del_gamma_img_ptr; + element_stats_type* del_beta_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 64); + del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, 
ofhp, ofwp, 8); +#endif + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdeloutput, lcl_vdeloutput2, lcl_vdeloutput3, lcl_vdeloutput4; +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask, lcl_relumask2, lcl_relumask3, lcl_relumask4; +#endif + + lcl_vdeloutput = _mm512_load_act( del_output_ptr ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vdeloutput ); + _mm512_store_act( del_output_ptr, lcl_vdeloutput ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput ); +#endif + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput ); + + lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 
); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vdeloutput2 ); + _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 ); +#endif + lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) ); + lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 ); + + lcl_vdeloutput3 = _mm512_load_act( del_output_ptr+32 ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask3 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+32 ), _mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vdeloutput3 ); + _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask3 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vdeloutput3 ); + _mm512_store_act( del_output_ptr+32, lcl_vdeloutput3 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+32, lcl_vdeloutput3 ); +#endif + lcl_vdgamma3 = _mm512_add_ps( lcl_vdgamma3, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ), lcl_vdeloutput3 ), lcl_vbrstd3 ) ); + lcl_vdbeta3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdeloutput3 ); + + lcl_vdeloutput4 = _mm512_load_act( del_output_ptr+48 ); +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU) + lcl_relumask4 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+48 ), 
_mm512_setzero_ps(), _CMP_NEQ_OQ ); + lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vdeloutput4 ); + _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); + output_ptr += 64; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK) + lcl_relumask4 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr ); + lcl_vdeloutput4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vdeloutput4 ); + _mm512_store_act( del_output_ptr+48, lcl_vdeloutput4 ); + relumask_ptr += 2; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE) + _mm512_stream_act( del_input_add_ptr+48, lcl_vdeloutput4 ); + del_input_add_ptr += sw*64; +#endif + lcl_vdgamma4 = _mm512_add_ps( lcl_vdgamma4, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ), lcl_vdeloutput4 ), lcl_vbrstd4 ) ); + lcl_vdbeta4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdeloutput4 ); + + input_ptr += sw*64; + del_output_ptr += 64; + } + } + + _mm512_storeu_ps( del_gamma_img_ptr, lcl_vdgamma ); + _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); + _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 ); + _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); + _mm512_storeu_ps( del_gamma_img_ptr+32, lcl_vdgamma3 ); + _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); + _mm512_storeu_ps( del_gamma_img_ptr+48, lcl_vdgamma4 ); + _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we need to reduce the del_gamm and del_beta */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + __m512 lcl_vdgamma = _mm512_setzero_ps(); + __m512 lcl_vdbeta = _mm512_setzero_ps(); + + for ( img=0; img < nImg; img++ ) { + lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, 
_mm512_loadu_ps( del_gamma_img_ptr ) ); + lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); + del_gamma_img_ptr += 64; + del_beta_img_ptr += 64; + } + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, (fm/4), ((fm%4)*16), 64), lcl_vdgamma ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, (fm/4), ((fm%4)*16), 64), lcl_vdbeta ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we apply the actual backward batch norm */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta; + __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2; + __m512 lcl_vgamma3, lcl_vbmean3, lcl_vbrstd3, lcl_vdgamma3, lcl_vdbeta3; + __m512 lcl_vgamma4, lcl_vbmean4, lcl_vbrstd4, lcl_vdgamma4, lcl_vdbeta4; + __m512 lcl_vnhw = _mm512_set1_ps( nhw ); + __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw ); + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 64) ); + lcl_vdbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, 64) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 64) ); + lcl_vdbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 16, 64) ); + + lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + lcl_vdgamma3 = 
_mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 32, 64) ); + lcl_vdbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 32, 64) ); + + lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + lcl_vdgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 48, 64) ); + lcl_vdbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 48, 64) ); + + for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) { + element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); + for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) { + __m512 lcl_vdelinput; + __m512 lcl_vdelinput2; + __m512 lcl_vdelinput3; + __m512 lcl_vdelinput4; + + lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma ); + lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd ); + lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput ); + lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput ); + lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput ); + + lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 ); + lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_sub_ps( 
_mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 ); + lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 ); + + lcl_vdelinput3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vdgamma3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vdelinput3, lcl_vbrstd3 ); + lcl_vdelinput3 = _mm512_add_ps( lcl_vdbeta3, lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+32 ) ), lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vbrstd3, lcl_vdelinput3 ); + lcl_vdelinput3 = _mm512_mul_ps( lcl_vgamma3, lcl_vdelinput3 ); + + lcl_vdelinput4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vdgamma4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vdelinput4, lcl_vbrstd4 ); + lcl_vdelinput4 = _mm512_add_ps( lcl_vdbeta4, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+48 ) ), lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vbrstd4, lcl_vdelinput4 ); + lcl_vdelinput4 = _mm512_mul_ps( lcl_vgamma4, lcl_vdelinput4 ); + + _mm512_stream_act( del_input_ptr, lcl_vdelinput ); + _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 ); + _mm512_stream_act( del_input_ptr+32, lcl_vdelinput3 ); + _mm512_stream_act( del_input_ptr+48, lcl_vdelinput4 ); + + del_input_ptr += sw*64; + input_ptr += sw*64; + del_output_ptr += 64; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git 
a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c
new file mode 100644
index 00000000..7fee60ee
--- /dev/null
+++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c
@@ -0,0 +1,278 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved.                      *
+* This file is part of the LIBXSMM library.                                   *
+*                                                                             *
+* For information on the license, see the LICENSE file.                       *
+* Further information: https://github.com/hfp/libxsmm/                        *
+* SPDX-License-Identifier: BSD-3-Clause                                       *
+******************************************************************************/
+/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/
+/* Generic backward pass of the fused group normalization (code template).
+ * The including code is expected to provide `handle`, `tid`, `start_thread`
+ * and the element_input_type/element_output_type/element_stats_type typedefs.
+ * Step 1 (parallel over images): optionally apply the ReLU backward to
+ * grad_output, accumulate per-image dgamma/dbeta partials plus the per-group
+ * sums d1/d2 in scratch, and compute grad_input for the image.
+ * Step 2 (parallel over feature-map blocks): reduce the per-image partials
+ * into grad_gamma/grad_beta. */
+
+/* size variables, all const */
+const int nImg = handle->desc.N;
+const int nG = handle->desc.G;
+const int ifh = handle->desc.H;
+const int ifw = handle->desc.W;
+const int sh = handle->desc.u;
+const int sw = handle->desc.v;
+const int ofh = ifh/sh;
+const int ofw = ifw/sw;
+const int iph = handle->desc.pad_h_in;
+const int ipw = handle->desc.pad_w_in;
+const int oph = handle->desc.pad_h_out;
+const int opw = handle->desc.pad_w_out;
+const int ofhp = ofh + 2*oph;
+const int ofwp = ofw + 2*opw;
+const int ifhp = ifh + 2*iph;
+const int ifwp = ifw + 2*ipw;
+/* here we assume that input and output blocking is similar */
+const int nBlocksFm = handle->blocksifm;
+const int nFmBlock = handle->ifmblock;
+/* derive channels per group */
+const int nFmG = (nBlocksFm * nFmBlock) / nG;
+
+/* size of sample */
+const element_stats_type ghw = (element_stats_type)(nFmG * ifh * ifw);
+const element_stats_type recp_ghw = 1.0f/ghw;
+const element_stats_type eps = 1e-7f;
+
+/* computing first logical thread */
+const int ltid = tid - start_thread;
+/* number of tasks that could be run in parallel */
+/* @TODO let's fix parallelization to include channel groups while avoiding conflict misses */
+const int work = nImg;
+/* compute chunk size */
+const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1);
+/* compute thr_begin and thr_end */
+const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work;
+const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work;
+
+/* number of tasks that could be run in parallel, delta gamma and beta reduction */
+const int work2 = nBlocksFm;
+/* compute chunk size */
+const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1);
+/* compute thr_begin and thr_end */
+const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2;
+const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2;
+
+/* loop variables */
+int img = 0;
+int fm = 0;
+/*int imgfm = 0;*/
+int hi = 0;
+int wi = 0;
+int v = 0;
+int ho = 0;
+int wo = 0;
+int g = 0;
+
+LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock);
+LIBXSMM_VLA_DECL(5, element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock);
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE)
+LIBXSMM_VLA_DECL(5, element_input_type, dinput_add, (element_input_type* )handle->grad_add->data, nBlocksFm, ifhp, ifwp, nFmBlock);
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU)
+LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock);
+#endif
+LIBXSMM_VLA_DECL(5, element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock);
+
+LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock);
+LIBXSMM_VLA_DECL(2, element_stats_type, dgamma, (element_stats_type*)handle->grad_gamma->data, nFmBlock);
+LIBXSMM_VLA_DECL(2, element_stats_type, dbeta, (element_stats_type*)handle->grad_beta->data, nFmBlock);
+LIBXSMM_VLA_DECL(2, const element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nG);
+LIBXSMM_VLA_DECL(2, const element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nG);
+LIBXSMM_VLA_DECL(2, const element_stats_type, variance, (element_stats_type*)handle->variance->data, nG);
+/* scratch layout (in element_stats_type units): dgamma_img[nBlocksFm][nImg][nFmBlock],
+ * dbeta_img[nBlocksFm][nImg][nFmBlock], d1_val_img[nImg][nG], d2_val_img[nImg][nG] */
+LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, nFmBlock);
+LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nImg, nFmBlock);
+LIBXSMM_VLA_DECL(2, element_stats_type, d1_val_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * 2 * (size_t)nBlocksFm * (size_t)nFmBlock), nG);
+LIBXSMM_VLA_DECL(2, element_stats_type, d2_val_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * 2 * (size_t)nBlocksFm * (size_t)nFmBlock) + ((size_t)nImg*(size_t)nG), nG);
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK)
+LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock);
+#endif
+
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16)
+union libxsmm_bfloat16_hp input_f32;
+union libxsmm_bfloat16_hp del_input_f32;
+union libxsmm_bfloat16_hp del_output_f32;
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU)
+union libxsmm_bfloat16_hp output_f32;
+output_f32.i[1] = 0;
+output_f32.i[0] = 0;
+#endif
+input_f32.i[1] = 0;
+input_f32.i[0] = 0;
+del_output_f32.i[1] = 0;
+del_output_f32.i[0] = 0;
+del_input_f32.i[1] = 0;
+del_input_f32.i[0] = 0;
+#endif
+
+/* lcl_gamma_ptr/lcl_beta_ptr below are fixed 64-element stack buffers */
+assert( nFmBlock <= 64 );
+
+/* lazy barrier init */
+libxsmm_barrier_init(handle->barrier, ltid);
+
+for ( img = thr_begin; img < thr_end; ++img ) {
+  element_stats_type* d1_val_img_ptr = &LIBXSMM_VLA_ACCESS(2, d1_val_img, img, 0, nG);
+  element_stats_type* d2_val_img_ptr = &LIBXSMM_VLA_ACCESS(2, d2_val_img, img, 0, nG);
+
+  for ( g = 0; g < nG; ++g ) {
+    d1_val_img_ptr[g] = 0.0f;
+    d2_val_img_ptr[g] = 0.0f;
+  }
+
+  for ( fm = 0; fm < nBlocksFm; ++fm ) {
+    /* @TODO check if we can bake this in into scratch */
+    element_stats_type lcl_gamma_ptr[64];
+    element_stats_type lcl_beta_ptr[64];
+    element_stats_type* del_gamma_img_ptr;
+    element_stats_type* del_beta_img_ptr;
+
+    del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock);
+    del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock);
+
+    LIBXSMM_PRAGMA_SIMD
+    for ( v=0; v < nFmBlock; v++ ) {
+      lcl_gamma_ptr[v] = 0.0f;
+      lcl_beta_ptr[v] = 0.0f;
+    }
+
+    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
+      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE)
+        element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock);
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU)
+        const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock);
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK)
+        const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock);
+#endif
+        const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock);
+        element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock);
+        const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG);
+        const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG);
+        const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock);
+
+        for ( v=0; v < nFmBlock; v++ ) {
+          g = ((fm*nFmBlock)+v)/nFmG;
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16)
+          del_output_f32.i[1] = del_output_ptr[v];
+          del_output_f32.i[0] = 0;
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU)
+          output_f32.i[1] = output_ptr[v];
+          del_output_f32.f = LIBXSMM_FEQ(output_f32.f, 0) ? 0 : del_output_f32.f;
+          del_output_ptr[v] = del_output_f32.i[1];
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK)
+          del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? del_output_ptr[v] : 0);
+          /* refresh the FP32 copy so that the sums below use the masked gradient */
+          del_output_f32.i[1] = del_output_ptr[v];
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE)
+          del_input_add_ptr[v] = del_output_ptr[v];
+#endif
+          input_f32.i[1] = input_ptr[v];
+          lcl_gamma_ptr[v] += (input_f32.f - bmean_ptr[g]) * del_output_f32.f * brstd_ptr[g];
+          lcl_beta_ptr[v] += del_output_f32.f;
+          d1_val_img_ptr[g] += (input_f32.f - bmean_ptr[g]) * del_output_f32.f * gamma_ptr[v];
+          d2_val_img_ptr[g] += del_output_f32.f * gamma_ptr[v];
+#else
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU)
+          del_output_ptr[v] = LIBXSMM_FEQ(output_ptr[v], 0) ? 0 : del_output_ptr[v];
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_RELU_WITH_MASK)
+          del_output_ptr[v] = (element_output_type)(relumask_ptr[v] == 1 ? del_output_ptr[v] : 0);
+#endif
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_ENABLE_ELTWISE)
+          del_input_add_ptr[v] = del_output_ptr[v];
+#endif
+          lcl_gamma_ptr[v] += (input_ptr[v] - bmean_ptr[g]) * del_output_ptr[v] * brstd_ptr[g];
+          lcl_beta_ptr[v] += del_output_ptr[v];
+          d1_val_img_ptr[g] += (input_ptr[v] - bmean_ptr[g]) * del_output_ptr[v] * gamma_ptr[v];
+          d2_val_img_ptr[g] += del_output_ptr[v] * gamma_ptr[v];
+#endif
+        }
+      }
+    }
+
+    LIBXSMM_PRAGMA_SIMD
+    for ( v=0; v < nFmBlock; v++ ) {
+      del_gamma_img_ptr[v] = lcl_gamma_ptr[v];
+      del_beta_img_ptr[v] = lcl_beta_ptr[v];
+    }
+  }
+
+  /* d1/d2 of this image are complete at this point; now compute grad_input */
+  for ( fm = 0; fm < nBlocksFm; ++fm ) {
+    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
+      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
+        element_input_type* del_input_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock);
+        const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock);
+        const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock);
+        const element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG);
+        const element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG);
+        const element_stats_type* variance_ptr = &LIBXSMM_VLA_ACCESS(2, variance, img, 0, nG);
+        const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock);
+
+#if 0
+#if !defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16)
+        LIBXSMM_PRAGMA_SIMD
+#endif
+#endif
+        for ( v=0; v < nFmBlock; v++ ) {
+          element_stats_type t0_val;
+          g = ((fm*nFmBlock)+v)/nFmG;
+          t0_val = brstd_ptr[g] * recp_ghw;
+#if defined(LIBXSMM_DNN_FUSEDGN_BWD_BF16)
+          del_output_f32.i[1] = del_output_ptr[v];
+          input_f32.i[1] = input_ptr[v];
+          del_input_f32.f = t0_val * ((gamma_ptr[v] * ghw * del_output_f32.f) - d2_val_img_ptr[g] - ((input_f32.f - bmean_ptr[g]) * d1_val_img_ptr[g] * (1.0f/(variance_ptr[g] + eps))));
+          del_input_ptr[v] = del_input_f32.i[1];
+#else
+          del_input_ptr[v] = t0_val * ((gamma_ptr[v] * ghw * del_output_ptr[v]) - d2_val_img_ptr[g] - ((input_ptr[v] - bmean_ptr[g]) * d1_val_img_ptr[g] * (1.0f/(variance_ptr[g] + eps))));
+#endif
+        }
+      }
+    }
+  }
+}
+
+libxsmm_barrier_wait(handle->barrier, ltid);
+
+/* now we need to reduce the del_gamma and del_beta */
+for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
+  element_stats_type* del_gamma_ptr = &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, nFmBlock);
+  element_stats_type* del_beta_ptr = &LIBXSMM_VLA_ACCESS(2, dbeta, fm, 0, nFmBlock);
+
+  LIBXSMM_PRAGMA_SIMD
+  for ( v=0; v < nFmBlock; v++ ) {
+    del_gamma_ptr[v] = (element_stats_type)0;
+    del_beta_ptr[v] = (element_stats_type)0;
+  }
+
+  for ( img=0; img < nImg; img++ ) {
+    element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, nFmBlock);
+    element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock);
+
+    LIBXSMM_PRAGMA_SIMD
+    for ( v=0; v < nFmBlock; v++ ) {
+      del_gamma_ptr[v] += del_gamma_img_ptr[v];
+      del_beta_ptr[v] += del_beta_img_ptr[v];
+    }
+  }
+}
+
+libxsmm_barrier_wait(handle->barrier, ltid);
diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c
new file mode 100644
index 00000000..9d1a104e
--- /dev/null
+++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c16_avx512.tpl.c
@@ -0,0 +1,232 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved.                      *
+* This file is part of the LIBXSMM library.                                   *
+*                                                                             *
+* For information on the license, see the LICENSE file.
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first 
logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel, delta gamma and beta reduction */ +const int work2 = nBlocksFm; +/* compute chunk size */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps to avoid sqrt of zero */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; + +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 16); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 16); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 16); +LIBXSMM_VLA_DECL(2, 
element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 16); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 16); +LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, 16); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 16); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 16), nImg, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 2); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 16); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 16); + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); + + input_ptr += 16; + } + } + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* now we need to reduce the sum and sum^2, we use the final */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr 
= &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 16); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 16); + + for ( img=0; img < nImg; img++ ) { + lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); + sum_img_ptr += 16; + sumsq_img_ptr += 16; + } + + __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); + __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); + __m512 lcl_vone = _mm512_set1_ps(1.0); + __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; + lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ + lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ + lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ + lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ +#if 0 + { + __m512d lcl_voned = _mm512_set1_pd(1.0); + __m512d lcl_vepsd = _mm512_set1_pd(1e-7); + __m512d lcl_vlo = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 0 ) ) ); + __m512d lcl_vhi = _mm512_cvtps_pd( _mm256_castpd_ps( _mm512_extractf64x4_pd( _mm512_castps_pd( lcl_vvar ), 1 ) ) ); + lcl_vlo = _mm512_sqrt_pd( _mm512_add_pd( lcl_vlo, lcl_vepsd ) ); + lcl_vhi = _mm512_sqrt_pd( _mm512_add_pd( lcl_vhi, lcl_vepsd ) ); + lcl_vlo = _mm512_div_pd( lcl_voned, lcl_vlo ); + lcl_vhi = _mm512_div_pd( lcl_voned, lcl_vhi ); + lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_setzero_pd(), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vlo ) ), 0 ) ); + lcl_vbrstd = _mm512_castpd_ps( _mm512_insertf64x4( _mm512_castps_pd( lcl_vbrstd ), _mm256_castps_pd( _mm512_cvtpd_ps( lcl_vhi ) ), 1 ) ); + } +#else + lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); +#endif + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16), lcl_vbmean ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16), lcl_vbrstd ); + 
_mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 16), lcl_vvar ); + } /* end of the per-feature-map reduction loop: the barriers and the normalization below must run exactly once per thread, as in the c32 and c64 variants */ + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* normalization phase: out = gamma * (in - mean) * rstd + beta, followed by the optional eltwise-add and ReLU paths */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 16) ); + lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 16) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 16) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 16) ); + + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 16); +#endif + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 2); +#endif + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + __m512 lcl_vo; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask; +#endif + + /* BN + scale (gamma, beta) */ + lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); + lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); +#endif +#if
defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); + relumask_ptr += 2; /* each 16-lane mask occupies two bytes */ +#endif + _mm512_stream_act( output_ptr, lcl_vo ); + + input_ptr += sw*16; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + input_add_ptr += sw*16; +#endif + output_ptr += 16; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c new file mode 100644 index 00000000..6e238f89 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c32_avx512.tpl.c @@ -0,0 +1,275 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ +/* Forward-pass template for feature-map blocks of 32 channels; every pixel is handled as two 16-lane vectors (offsets 0 and 16). With LIBXSMM_DNN_FUSEDGN_FWD_BF16 the activation macros below widen 16-bit inputs by a 16-bit left shift on load and narrow results back to 16 bits on store; otherwise they are plain 32-bit float loads and stores. */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const; sh and sw are the loop strides, iph/ipw and oph/opw the input and output paddings, and names ending in p are extents including padding on both sides */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size (work divided by the thread count, rounded up) */ +const int chunksize = (work % handle->desc.threads == 0) ?
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks of the second phase (reduction of the per-image partial sums): one per feature-map block */ +const int work2 = nBlocksFm; +/* compute chunk size (work2 divided by the thread count, rounded up) */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin2 and thr_end2, both clamped to work2 */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps to avoid sqrt of zero; it is added to the variance before the square root is taken */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; +/* multi-dimensional accessors (LIBXSMM_VLA_DECL) for the tensors of the handle; sumsq_img starts nImg * nBlocksFm * 32 elements into the same scratch buffer that sum_img starts at */ +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 32); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 32); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 32); +LIBXSMM_VLA_DECL(2, element_stats_type, variance,
(element_stats_type*)handle->variance->data, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 32); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 32), nImg, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); +/* phase 1: each task is one (image, feature-map block) pair; the sum and the sum of squares over the ifh x ifw window that starts at the input padding offset are written to the scratch slot of that pair */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 32); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 32); + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + + lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); + + input_ptr += 32; + } + } + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + + _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); + _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* phase 2: for every feature-map block of this thread, add up the per-image sum and sum^2 entries and derive mean, variance and reciprocal standard deviation from the totals */
+ for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, 0, 0, nImg, 32); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, 0, 0, nImg, 32); + + for ( img=0; img < nImg; img++ ) { + lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, _mm512_loadu_ps( sum_img_ptr+16 ) ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_loadu_ps( sumsq_img_ptr+16 ) ); + + sum_img_ptr += 32; + sumsq_img_ptr += 32; + } + + __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); + __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); + __m512 lcl_vone = _mm512_set1_ps(1.0); + __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; + __m512 lcl_vbmean2, lcl_vbmeansq2, lcl_vsqbmean2, lcl_vbrstd2, lcl_vvar2; + + lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ + lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ + lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ + lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ + lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); /* 1 / sqrt(variance + eps) */ + + lcl_vbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum2 ); /* E(X) */ + lcl_vbmeansq2 = _mm512_mul_ps( lcl_vbmean2, lcl_vbmean2 ); /* E(X)^2 */ + lcl_vsqbmean2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq2 ); /* E(X^2) */ + lcl_vvar2 = _mm512_sub_ps( lcl_vsqbmean2, lcl_vbmeansq2 ); /* variance */ + lcl_vbrstd2 = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar2, lcl_vsqrt_eps ) ) ); /* 1 / sqrt(variance + eps) */ + + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32), lcl_vbmean ); + _mm512_storeu_ps(
&LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32), lcl_vbrstd ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 0, 32), lcl_vvar ); + + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32), lcl_vbmean2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32), lcl_vbrstd2 ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, fm, 16, 32), lcl_vvar2 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* phase 3: normalize every (image, feature-map block) pair of this thread with the statistics stored above: out = gamma * (in - mean) * rstd + beta, followed by the optional eltwise-add and ReLU paths */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; + __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 32) ); + lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 32) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 32) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 32) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 32) ); + lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 32) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) ); + + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#endif + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4); +#endif + for ( wi=ipw,
wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + __m512 lcl_vo; + __m512 lcl_vo2; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask; + __mmask16 lcl_relumask2; +#endif + + /* BN + scale (gamma, beta), channels 0-15 of the block */ + lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); + lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); + relumask_ptr += 2; /* each 16-lane mask occupies two bytes */ +#endif + + /* BN + scale (gamma, beta), channels 16-31 of the block */ + lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); + lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); + relumask_ptr += 2; +#endif + + _mm512_stream_act( output_ptr, lcl_vo ); + _mm512_stream_act( output_ptr+16, lcl_vo2 ); + + input_ptr += sw*32; +#if
defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + input_add_ptr += sw*32; +#endif + output_ptr += 32; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 100644 index 00000000..332a88ad --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,332 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ +/* Forward-pass template for feature-map blocks of 64 channels; every pixel is handled as four 16-lane vectors (offsets 0, 16, 32 and 48). With LIBXSMM_DNN_FUSEDGN_FWD_BF16 the activation macros below widen 16-bit inputs by a 16-bit left shift on load and narrow results back to 16 bits on store; otherwise they are plain 32-bit float loads and stores. */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const; sh and sw are the loop strides, iph/ipw and oph/opw the input and output paddings, and names ending in p are extents including padding on both sides */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size (work divided by the thread count, rounded up) */ +const int chunksize = (work % handle->desc.threads == 0) ?
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks of the second phase (reduction of the per-image partial sums): four per feature-map block, one for each group of 16 channels */ +const int work2 = nBlocksFm * 4; +/* compute chunk size (work2 divided by the thread count, rounded up) */ +const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1); +/* compute thr_begin2 and thr_end2, both clamped to work2 */ +const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2; +const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2; + +/* eps to avoid sqrt of zero; it is added to the variance before the square root is taken */ +const element_stats_type sqrt_eps = 1e-7f; +const element_stats_type nhw = (element_stats_type)(handle->desc.N * ifh * ifw); +const element_stats_type recp_nhw = 1.0f/nhw; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int hi = 0; +int wi = 0; +int ho = 0; +int wo = 0; +/* multi-dimensional accessors (LIBXSMM_VLA_DECL) for the tensors of the handle; sumsq_img starts nImg * nBlocksFm * 64 elements into the same scratch buffer that sum_img starts at */ +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, 64); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, 64); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, 64); +LIBXSMM_VLA_DECL(2, element_stats_type, variance,
(element_stats_type*)handle->variance->data, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nImg, 64); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * 64), nImg, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 8); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); +/* phase 1: each task is one (image, feature-map block) pair; the sum and the sum of squares over the ifh x ifw window that starts at the input padding offset are written to the scratch slot of that pair */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + __m512 lcl_vsum2 = _mm512_setzero_ps(); + __m512 lcl_vsumsq2 = _mm512_setzero_ps(); + __m512 lcl_vsum3 = _mm512_setzero_ps(); + __m512 lcl_vsumsq3 = _mm512_setzero_ps(); + __m512 lcl_vsum4 = _mm512_setzero_ps(); + __m512 lcl_vsumsq4 = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr; + element_stats_type* sumsq_img_ptr; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, fm, img, 0, nImg, 64); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, fm, img, 0, nImg, 64); + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); + __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); + + lcl_vsum = _mm512_add_ps( lcl_vsum, lcl_vinput ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_mul_ps( lcl_vinput, lcl_vinput ) ); + + lcl_vsum2 = _mm512_add_ps( lcl_vsum2, lcl_vinput2 ); + lcl_vsumsq2 = _mm512_add_ps( lcl_vsumsq2, _mm512_mul_ps( lcl_vinput2, lcl_vinput2 ) ); + + lcl_vsum3 = _mm512_add_ps( lcl_vsum3, lcl_vinput3 ); +
lcl_vsumsq3 = _mm512_add_ps( lcl_vsumsq3, _mm512_mul_ps( lcl_vinput3, lcl_vinput3 ) ); + + lcl_vsum4 = _mm512_add_ps( lcl_vsum4, lcl_vinput4 ); + lcl_vsumsq4 = _mm512_add_ps( lcl_vsumsq4, _mm512_mul_ps( lcl_vinput4, lcl_vinput4 ) ); + + input_ptr += 64; + } + } + + _mm512_storeu_ps( sum_img_ptr, lcl_vsum ); + _mm512_storeu_ps( sumsq_img_ptr, lcl_vsumsq ); + + _mm512_storeu_ps( sum_img_ptr+16, lcl_vsum2 ); + _mm512_storeu_ps( sumsq_img_ptr+16, lcl_vsumsq2 ); + + _mm512_storeu_ps( sum_img_ptr+32, lcl_vsum3 ); + _mm512_storeu_ps( sumsq_img_ptr+32, lcl_vsumsq3 ); + + _mm512_storeu_ps( sum_img_ptr+48, lcl_vsum4 ); + _mm512_storeu_ps( sumsq_img_ptr+48, lcl_vsumsq4 ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* phase 2: each task covers 16 channels of one feature-map block (block fm/4, channel offset (fm%4)*16); add up the per-image sum and sum^2 entries and derive mean, variance and reciprocal standard deviation from the totals */ + for ( fm = thr_begin2; fm < thr_end2; ++fm ) { + __m512 lcl_vsum = _mm512_setzero_ps(); + __m512 lcl_vsumsq = _mm512_setzero_ps(); + element_stats_type* sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + element_stats_type* sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, (fm/4), 0, ((fm%4)*16), nImg, 64); + + for ( img=0; img < nImg; img++ ) { + lcl_vsum = _mm512_add_ps( lcl_vsum, _mm512_loadu_ps( sum_img_ptr ) ); + lcl_vsumsq = _mm512_add_ps( lcl_vsumsq, _mm512_loadu_ps( sumsq_img_ptr ) ); + + sum_img_ptr += 64; + sumsq_img_ptr += 64; + } + + __m512 lcl_vsqrt_eps = _mm512_set1_ps(sqrt_eps); + __m512 lcl_vrec_nhw = _mm512_set1_ps(recp_nhw); + __m512 lcl_vone = _mm512_set1_ps(1.0); + __m512 lcl_vbmean, lcl_vbmeansq, lcl_vsqbmean, lcl_vbrstd, lcl_vvar; + + lcl_vbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsum ); /* E(X) */ + lcl_vbmeansq = _mm512_mul_ps( lcl_vbmean, lcl_vbmean ); /* E(X)^2 */ + lcl_vsqbmean = _mm512_mul_ps( lcl_vrec_nhw, lcl_vsumsq ); /* E(X^2) */ + lcl_vvar = _mm512_sub_ps( lcl_vsqbmean, lcl_vbmeansq ); /* variance */ + lcl_vbrstd = _mm512_div_ps( lcl_vone, _mm512_sqrt_ps( _mm512_add_ps( lcl_vvar, lcl_vsqrt_eps ) ) ); /* 1 / sqrt(variance + eps) */ + + _mm512_storeu_ps(
&LIBXSMM_VLA_ACCESS(2, bmean, (fm/4), ((fm%4)*16), 64), lcl_vbmean ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, (fm/4), ((fm%4)*16), 64), lcl_vbrstd ); + _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, variance, (fm/4), ((fm%4)*16), 64), lcl_vvar ); + } + + libxsmm_barrier_wait(handle->barrier, ltid); + + /* phase 3: normalize every (image, feature-map block) pair of this thread with the statistics stored above: out = gamma * (in - mean) * rstd + beta, followed by the optional eltwise-add and ReLU paths */ + for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) { + __m512 lcl_vgamma, lcl_vbeta, lcl_vbmean, lcl_vbrstd; + __m512 lcl_vgamma2, lcl_vbeta2, lcl_vbmean2, lcl_vbrstd2; + __m512 lcl_vgamma3, lcl_vbeta3, lcl_vbmean3, lcl_vbrstd3; + __m512 lcl_vgamma4, lcl_vbeta4, lcl_vbmean4, lcl_vbrstd4; + + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + lcl_vgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, 64) ); + lcl_vbeta = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, 64) ); + lcl_vbmean = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0, 64) ); + lcl_vbrstd = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0, 64) ); + + lcl_vgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 16, 64) ); + lcl_vbeta2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 16, 64) ); + lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 64) ); + lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 64) ); + + lcl_vgamma3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 32, 64) ); + lcl_vbeta3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 32, 64) ); + lcl_vbmean3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 32, 64) ); + lcl_vbrstd3 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 32, 64) ); + + lcl_vgamma4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma, fm, 48, 64) ); + lcl_vbeta4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, beta, fm, 48, 64) ); + lcl_vbmean4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 48, 64) ); + lcl_vbrstd4 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 48, 64) ); + + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ )
{ + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#endif + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 8); +#endif + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + __m512 lcl_vo; + __m512 lcl_vo2; + __m512 lcl_vo3; + __m512 lcl_vo4; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + __mmask16 lcl_relumask; + __mmask16 lcl_relumask2; + __mmask16 lcl_relumask3; + __mmask16 lcl_relumask4; +#endif + + /* BN + scale (gamma, beta), channels 0-15 of the block */ + lcl_vo = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ); + lcl_vo = _mm512_mul_ps( lcl_vgamma, lcl_vo ); + lcl_vo = _mm512_fmadd_ps( lcl_vo, lcl_vbrstd, lcl_vbeta ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo = _mm512_add_ps( lcl_vo, _mm512_load_act( input_add_ptr ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo = _mm512_max_ps( lcl_vo, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask = _mm512_cmp_ps_mask( lcl_vo, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo = _mm512_mask_blend_ps( lcl_relumask, _mm512_setzero_ps(), lcl_vo ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask ); + relumask_ptr += 2; /* each 16-lane mask occupies two bytes */ +#endif + + /* BN + scale (gamma, beta), channels 16-31 of the block */ + lcl_vo2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ); + lcl_vo2 = _mm512_mul_ps( lcl_vgamma2, lcl_vo2 ); + lcl_vo2 = _mm512_fmadd_ps( lcl_vo2, lcl_vbrstd2, lcl_vbeta2 ); + /* eltwise add */ +#if
defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo2 = _mm512_add_ps( lcl_vo2, _mm512_load_act( input_add_ptr+16 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo2 = _mm512_max_ps( lcl_vo2, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask2 = _mm512_cmp_ps_mask( lcl_vo2, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo2 = _mm512_mask_blend_ps( lcl_relumask2, _mm512_setzero_ps(), lcl_vo2 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask2 ); + relumask_ptr += 2; +#endif + + /* BN + scale (gamma, beta), channels 32-47 of the block */ + lcl_vo3 = _mm512_sub_ps( _mm512_load_act( input_ptr+32 ), lcl_vbmean3 ); + lcl_vo3 = _mm512_mul_ps( lcl_vgamma3, lcl_vo3 ); + lcl_vo3 = _mm512_fmadd_ps( lcl_vo3, lcl_vbrstd3, lcl_vbeta3 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo3 = _mm512_add_ps( lcl_vo3, _mm512_load_act( input_add_ptr+32 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo3 = _mm512_max_ps( lcl_vo3, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + lcl_relumask3 = _mm512_cmp_ps_mask( lcl_vo3, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo3 = _mm512_mask_blend_ps( lcl_relumask3, _mm512_setzero_ps(), lcl_vo3 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask3 ); + relumask_ptr += 2; +#endif + + /* BN + scale (gamma, beta), channels 48-63 of the block */ + lcl_vo4 = _mm512_sub_ps( _mm512_load_act( input_ptr+48 ), lcl_vbmean4 ); + lcl_vo4 = _mm512_mul_ps( lcl_vgamma4, lcl_vo4 ); + lcl_vo4 = _mm512_fmadd_ps( lcl_vo4, lcl_vbrstd4, lcl_vbeta4 ); + /* eltwise add */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + lcl_vo4 = _mm512_add_ps( lcl_vo4, _mm512_load_act( input_add_ptr+48 ) ); +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + lcl_vo4 = _mm512_max_ps( lcl_vo4, _mm512_setzero_ps() ); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) +
lcl_relumask4 = _mm512_cmp_ps_mask( lcl_vo4, _mm512_setzero_ps(), _CMP_GT_OQ ); + lcl_vo4 = _mm512_mask_blend_ps( lcl_relumask4, _mm512_setzero_ps(), lcl_vo4 ); + LIBXSMM_INTRINSICS_MM512_STORE_MASK16( relumask_ptr, lcl_relumask4 ); + relumask_ptr += 2; +#endif + + _mm512_stream_act( output_ptr, lcl_vo ); + _mm512_stream_act( output_ptr+16, lcl_vo2 ); + _mm512_stream_act( output_ptr+32, lcl_vo3 ); + _mm512_stream_act( output_ptr+48, lcl_vo4 ); + + input_ptr += sw*64; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + input_add_ptr += sw*64; +#endif + output_ptr += 64; + } + } + } + + libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c new file mode 100644 index 00000000..89b70194 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_fusedgroupnorm_st_fwd_custom_generic.tpl.c @@ -0,0 +1,229 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int nG = handle->desc.G; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = ifh/sh; +const int ofw = ifw/sw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +const int nFmBlock = handle->ifmblock; +/* derive channels per group */ +const int nFmG = (nBlocksFm * nFmBlock) / nG; +/* size of sample */ +const element_stats_type ghw = (element_stats_type)(nFmG * ifh * ifw); +const element_stats_type recp_ghw = 1.0f/ghw; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +/* @TODO let's fix parallelization to include channel groups while avoiding conflict misses */ +const int work = nImg; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* eps to avoid sqrt of zero */ +const element_stats_type sqrt_eps = 1e-7f; + +/* loop variables */ +int img = 0; +int fm = 0; +/*int imgfm = 0;*/ +int hi = 0; +int wi = 0; +int v = 0; +int ho = 0; +int wo = 0; +int g = 0; + +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +LIBXSMM_VLA_DECL(5, const element_input_type, input_add, (element_input_type* )handle->reg_add->data, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, gamma, (element_stats_type*)handle->reg_gamma->data, nFmBlock); +LIBXSMM_VLA_DECL(2, const element_stats_type, beta, (element_stats_type*)handle->reg_beta->data, nFmBlock); +LIBXSMM_VLA_DECL(2, element_stats_type, bmean, (element_stats_type*)handle->expvalue->data, nG); +LIBXSMM_VLA_DECL(2, element_stats_type, brstd, (element_stats_type*)handle->rcpstddev->data, nG); +LIBXSMM_VLA_DECL(2, element_stats_type, variance, (element_stats_type*)handle->variance->data, nG); +LIBXSMM_VLA_DECL(3, element_stats_type, sum_img, (element_stats_type*)handle->scratch, nBlocksFm, nFmBlock); +LIBXSMM_VLA_DECL(3, element_stats_type, sumsq_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)nFmBlock), nBlocksFm, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) +LIBXSMM_VLA_DECL(5, unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif + +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) +union libxsmm_bfloat16_hp input_f32; +union libxsmm_bfloat16_hp output_f32; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +union libxsmm_bfloat16_hp input_add_f32; +input_add_f32.i[1] = 0; +input_add_f32.i[0] = 0; +#endif 
+input_f32.i[1] = 0; +input_f32.i[0] = 0; +output_f32.i[1] = 0; +output_f32.i[0] = 0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for ( img = thr_begin; img < thr_end; ++img ) { + element_stats_type* bmean_ptr = &LIBXSMM_VLA_ACCESS(2, bmean, img, 0, nG); + element_stats_type* brstd_ptr = &LIBXSMM_VLA_ACCESS(2, brstd, img, 0, nG); + element_stats_type* tvar_ptr = &LIBXSMM_VLA_ACCESS(2, variance, img, 0, nG); + element_stats_type* sum_img_ptr = NULL; + element_stats_type* sumsq_img_ptr = NULL; + + /* create reduction over all pixels per channel */ + for ( fm = 0; fm < nBlocksFm; ++fm ) { + /* @TODO check if we can bake this in into scratch */ + element_stats_type lcl_sum_ptr[64]; + element_stats_type lcl_sumsq_ptr[64]; + + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, img, fm, 0, nBlocksFm, nFmBlock); + sumsq_img_ptr = &LIBXSMM_VLA_ACCESS(3, sumsq_img, img, fm, 0, nBlocksFm, nFmBlock); + + LIBXSMM_PRAGMA_SIMD + for ( v=0; v < nFmBlock; v++ ) { + lcl_sum_ptr[v] = (element_stats_type)0; + lcl_sumsq_ptr[v] = (element_stats_type)0; + } + + for ( hi=iph; hi < (ifh + iph); hi++ ) { + for ( wi=ipw; wi < (ifw + ipw); wi++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); + +#if !defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for (v=0; v < nFmBlock; v++) { +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + input_f32.i[1] = input_ptr[v]; + lcl_sum_ptr[v] += input_f32.f; + lcl_sumsq_ptr[v] += (input_f32.f * input_f32.f); +#else + lcl_sum_ptr[v] += input_ptr[v]; + lcl_sumsq_ptr[v] += (input_ptr[v] * input_ptr[v]); +#endif + } + } + } + + LIBXSMM_PRAGMA_SIMD + for (v=0; v < nFmBlock; v++) { + sum_img_ptr[v] = lcl_sum_ptr[v]; + sumsq_img_ptr[v] = lcl_sumsq_ptr[v]; + } + } + + /* new we compute mean, variance and rstd per channel group */ + sum_img_ptr = &LIBXSMM_VLA_ACCESS(3, sum_img, img, 0, 0, nImg, nFmBlock); + sumsq_img_ptr = 
&LIBXSMM_VLA_ACCESS(3, sumsq_img, img, 0, 0, nImg, nFmBlock); + for ( g = 0; g < nG; ++g ) { + element_stats_type lcl_fm_sum = 0.0f; + element_stats_type lcl_fm_sumsq = 0.0f; + + for ( fm = g*nFmG; fm < (g+1)*nFmG; ++fm ) { + lcl_fm_sum += sum_img_ptr[fm]; + lcl_fm_sumsq += sumsq_img_ptr[fm]; + } + + { + const element_stats_type tbmean = (recp_ghw * lcl_fm_sum); + const element_stats_type tbmeansq = tbmean * tbmean; + const element_stats_type tsqbmean = recp_ghw * lcl_fm_sumsq; + const element_stats_type tvar = tsqbmean - tbmeansq; + const element_stats_type tbrstd = (element_stats_type)(1.0/sqrt((double)tvar + sqrt_eps)); + bmean_ptr[g] = tbmean; + brstd_ptr[g] = tbrstd; + tvar_ptr[g] = tvar; + } + } + + /* let's scale the data */ + for ( fm = 0; fm < nBlocksFm; ++fm ) { + for ( hi=iph, ho=oph; hi < (ifh+iph); hi+=sh, ho++ ) { + for ( wi=ipw, wo=opw; wi < (ifw+ipw); wi+=sw, wo++ ) { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) + const element_input_type* input_add_ptr = &LIBXSMM_VLA_ACCESS(5, input_add, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#endif + const element_stats_type* gamma_ptr = &LIBXSMM_VLA_ACCESS(2, gamma, fm, 0, nFmBlock); + const element_stats_type* beta_ptr = &LIBXSMM_VLA_ACCESS(2, beta, fm, 0, nFmBlock); + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#endif + float o; + +#if 0 +#if !defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif +#endif + for (v = 0; v < nFmBlock; v++ ) { + g = ((fm*nFmBlock)+v)/nFmG; +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + input_f32.i[1] = input_ptr[v]; + o = gamma_ptr[v]*(input_f32.f - 
bmean_ptr[g])*brstd_ptr[g] + beta_ptr[v]; +#else + /* BN + scale (gamma, beta) */ + o = gamma_ptr[v]*(input_ptr[v] - bmean_ptr[g])*brstd_ptr[g] + beta_ptr[v]; +#endif + /* Eltwise */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_ELTWISE) +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + input_add_f32.i[1] = input_add_ptr[v]; + o += input_add_f32.f; +#else + o += input_add_ptr[v]; +#endif +#endif + /* ReLU */ +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU) + o = ( o > 0.0f ) ? o : 0.0f; +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_ENABLE_RELU_WITH_MASK) + o = ( o > 0.0f ) ? o : 0.0f; + relumask_ptr[v] = (unsigned char)(o > 0.0f ? 1 : 0); +#endif +#if defined(LIBXSMM_DNN_FUSEDGN_FWD_BF16) + output_f32.f = o; + output_ptr[v] = output_f32.i[1]; +#else + output_ptr[v] = o; +#endif + } + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c new file mode 100644 index 00000000..1818ab34 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_optimizer_sgd_st_generic.tpl.c @@ -0,0 +1,91 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) +# define _mm512_load_fil(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +# define _mm512_store_fil(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16((B)),16))) +#endif + +/* loop counters */ +libxsmm_blasint i; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could run in parallel for the filters */ +const int work = handle->desc.C * handle->desc.K; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +element_filter_type* filter = (element_filter_type*)handle->reg_filter->data; +element_filter_type* dfilter = (element_filter_type*)handle->grad_filter->data; +#if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16) || defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) +element_master_type* master = (element_master_type*)handle->master_filter->data; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init( handle->barrier, ltid ); + +#if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16) || defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) +#if defined(LIBXSMM_DNN_OPTIMIZER_SGD_BF16_AVX512) +{ + libxsmm_blasint iv = ( (thr_end-thr_begin)/16 ) * 16; /* compute iterations which are vectorizable */ + __m512 vlr = _mm512_set1_ps( handle->desc.learning_rate ); + for ( i = thr_begin; i desc.learning_rate*t1.f); + t2.f = master[i]; + filter[i] = t2.i[1]; + } +} +#undef _mm512_load_fil +#undef _mm512_store_fil +#else +for ( i = thr_begin; i < thr_end; ++i ) { + libxsmm_bfloat16_hp t1, t2; + 
t1.i[0] =0; + t1.i[1] = dfilter[i]; + master[i] = master[i] - (handle->desc.learning_rate*t1.f); + t2.f = master[i]; + filter[i] = t2.i[1]; +} +#endif +#else +#if defined(LIBXSMM_DNN_OPTIMIZER_SGD_F32_AVX512) +{ + libxsmm_blasint iv = ( (thr_end-thr_begin)/16 ) * 16; /* compute iterations which are vectorizable */ + __m512 vlr = _mm512_set1_ps( handle->desc.learning_rate ); + for ( i = thr_begin; i < thr_begin + iv; i+=16 ) { + _mm512_storeu_ps( filter+i, _mm512_sub_ps( _mm512_loadu_ps( filter+i ), _mm512_mul_ps( vlr, _mm512_loadu_ps( dfilter + i ) ) ) ) ; + } + for ( i = thr_begin + iv; i < thr_end; ++i ) { + filter[i] = filter[i] - (handle->desc.learning_rate*dfilter[i]); + } +} +#else +for ( i = thr_begin; i < thr_end; ++i ) { + filter[i] = filter[i] - (handle->desc.learning_rate*dfilter[i]); +} +#endif +#endif + +libxsmm_barrier_wait( handle->barrier, ltid ); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c new file mode 100644 index 00000000..72e92417 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c16_avx512.tpl.c @@ -0,0 +1,153 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +const int sh = handle->desc.u; +const int sw = handle->desc.v; +#endif +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int 
chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +int kh = 0; +int kw = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); +#else +element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration */ +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)16*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 16); +#else +element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)16*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 16); +#endif +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 16); +LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 16); +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) +LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 16); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + + for ( v = 0; v < ifh*ifw*16; v += 16 ) { + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); + } + +#if 
defined(LIBXSMM_DNN_POOLING_BWD_MAX) + for ( ho = oph; ho < (ofh+oph); ho++ ) { + for ( wo = opw; wo < (ofw+opw); wo++ ) { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 16); + const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16); + + __m512 lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); + lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); + _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); + } + } +#endif +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) + for ( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; + for ( wo = opw; wo < (ofw+opw); wo++ ) { + wi = ((wo-opw) * sw) - handle->desc.pad_w; + for ( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; + for ( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 16); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, 16); + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); + const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); + _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); + } + } + } + } + } +#endif + + /* copy the local buffer into dinput activations */ + for ( hi = iph; hi < (ifh+iph); hi++ ) { + for ( wi = ipw; wi < (ifw+ipw); wi++ ) { + element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, 16); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 16); + _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); + } + } +} + 
+libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c new file mode 100644 index 00000000..b5740474 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c32_avx512.tpl.c @@ -0,0 +1,161 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) 
LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +const int sh = handle->desc.u; +const int sw = handle->desc.v; +#endif +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +int kh = 0; +int kw = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); +#else +element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration */ +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)32*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 32); +#else +element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)32*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 32); +#endif +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 32); +LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32); +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) +LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 32); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + + for( v = 0; v < ifh*ifw*32; v += 16 ) { + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); + } + +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) + for( ho = oph; ho < (ofh+oph); ho++ ) { + for( wo = opw; wo < (ofw+opw); wo++ ) { + __m512 lcl_vdinput, lcl_vdinput2; + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 32); + const element_mask_type* 
mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32); + + lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); + lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); + _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); + + lcl_vdinput2 = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr+16 ), lcl_buffer_ptr, 4 ); + lcl_vdinput2 = _mm512_add_ps( lcl_vdinput2, _mm512_load_act( doutput_ptr+16 ) ); + _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr+16 ), lcl_vdinput2, 4 ); + } + } +#endif +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) + for( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; + for( wo = opw; wo < (ofw+opw); wo++ ) { + wi = ((wo-opw) * sw) - handle->desc.pad_w; + for( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; + for( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, 32); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, 32); + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); + const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); + const __m512 lcl_dinput_ps2 = _mm512_loadu_ps( lcl_dinput_ptr+16 ); + _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); + _mm512_storeu_ps( lcl_dinput_ptr+16, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr+16 ), recp_pool_size_ps, lcl_dinput_ps2 ) ); + } + } + } + } + } +#endif + + /* copy the local buffer into dinput activations */ + for( hi = iph; hi < (ifh+iph); hi++ ) { + for( wi = ipw; wi < (ifw+ipw); wi++ ) { + element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, 
32); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 32); + _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); + _mm512_stream_act( dinput_ptr+16, _mm512_loadu_ps( lcl_dinput_ptr+16 ) ); + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 100644 index 00000000..70b65202 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,170 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +const int sh = handle->desc.u; +const int sw = handle->desc.v; +#endif +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm * 4; +/* compute chunk size */ +const int 
chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm1 = 0; +int fm2 = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +int kh = 0; +int kw = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); +#else +element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration */ +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)64*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, 16); +#else +element_output_type* lcl_buffer_ptr = ((element_input_type*)handle->scratch)+((size_t)ifh*(size_t)ifw*(size_t)64*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, 16); +#endif +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, 64); +LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 64); +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) +LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 64); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { + img = imgfm / (nBlocksFm*4); + fm1 = imgfm % (nBlocksFm*4); + fm2 = imgfm % (nBlocksFm*4); + fm1 = fm1/4; + fm2 = (fm2%4)*16; + + for( v = 0; v < ifh*ifw*16; v += 16 ) { + 
_mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); + } + +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) + for( ho = oph; ho < (ofh+oph); ho++ ) { + for( wo = opw; wo < (ofw+opw); wo++ ) { + __m512 lcl_vdinput/*, lcl_vdinput2, lcl_vdinput3, lcl_vdinput4*/; + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm1, ho, wo, fm2, nBlocksFm, ofhp, ofwp, 64); + const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm1, ho-oph, wo-opw, fm2, nBlocksFm, ofh, ofw, 64); +#if 1 + lcl_vdinput = _mm512_i32gather_ps( _mm512_loadu_si512( mask_ptr ), lcl_buffer_ptr, 4 ); + lcl_vdinput = _mm512_add_ps( lcl_vdinput, _mm512_load_act( doutput_ptr ) ); + _mm512_i32scatter_ps( lcl_buffer_ptr, _mm512_loadu_si512( mask_ptr ), lcl_vdinput, 4 ); +#else + for ( v = 0; v < 16; ++v ) { +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + union libxsmm_bfloat16_hp del_output_f32; + del_output_f32.i[1] = doutput_ptr[v]; + del_output_f32.i[0] = 0; + lcl_buffer_ptr[mask_ptr[v]] += del_output_f32.f; +#else + lcl_buffer_ptr[mask_ptr[v]] += doutput_ptr[v]; +#endif + } +#endif + } + } +#endif +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) + for( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; + for( wo = opw; wo < (ofw+opw); wo++ ) { + wi = ((wo-opw) * sw) - handle->desc.pad_w; + for( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; + for( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm1, ho, wo, fm2, nBlocksFm, ofhp, ofwp, 64); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, 16); + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); + const __m512 lcl_dinput_ps = _mm512_loadu_ps( lcl_dinput_ptr ); + _mm512_storeu_ps( lcl_dinput_ptr, _mm512_fmadd_ps( _mm512_load_act( doutput_ptr ), recp_pool_size_ps, lcl_dinput_ps ) ); + } + } 
+ } + } + } +#endif + + /* copy the local buffer into dinput activations */ + for( hi = iph; hi < (ifh+iph); hi++ ) { + for( wi = ipw; wi < (ifw+ipw); wi++ ) { + element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm1, hi, wi, fm2, nBlocksFm, ifhp, ifwp, 64); + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, 16); + _mm512_stream_act( dinput_ptr, _mm512_loadu_ps( lcl_dinput_ptr ) ); + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c new file mode 100644 index 00000000..805db71f --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_bwd_custom_generic.tpl.c @@ -0,0 +1,184 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +const int sh = handle->desc.u; +const int sw = handle->desc.v; +#endif +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +const int nFmBlock = handle->ifmblock; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) +int kh = 0; +int kw = 0; +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); /* 1/(R*S): weight applied to each output gradient when it is spread over the window */ +#else +element_input_type recp_pool_size = 1.0f/((element_input_type)handle->desc.R*(element_input_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration; lcl_buffer_ptr is this thread's ifh*ifw*nFmBlock slice of handle->scratch (offset scales with ltid) */ +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +float *const lcl_buffer_ptr = (float*)handle->scratch + (size_t)ifh*ifw*nFmBlock*ltid; +LIBXSMM_VLA_DECL(3, float, lcl_dinput, lcl_buffer_ptr, ifw, nFmBlock); +#else +element_input_type *const lcl_buffer_ptr = (element_input_type*)handle->scratch + (size_t)ifh*ifw*nFmBlock*ltid; /* input-gradient accumulator: typed as element_input_type to agree with the cast and with the lcl_dinput view declared next */ +LIBXSMM_VLA_DECL(3, element_input_type, lcl_dinput, lcl_buffer_ptr, ifw, nFmBlock); +#endif +LIBXSMM_VLA_DECL(5, element_input_type, dinput, (element_input_type* )handle->grad_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +LIBXSMM_VLA_DECL(5, const element_output_type, doutput, (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) +LIBXSMM_VLA_DECL(5, const element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, nFmBlock); +#endif + +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) +union libxsmm_bfloat16_hp del_input_f32; /* used to move values between the 16-bit storage type and float through element i[1] */ +union libxsmm_bfloat16_hp del_output_f32; +del_input_f32.i[1] = 0; +del_input_f32.i[0] = 0; +del_output_f32.i[1] = 0; +del_output_f32.i[0] = 0; /* i[0] is cleared once here; the loops only rewrite i[1] */ +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; /* each task is one (image, feature-map block) pair */ + + LIBXSMM_PRAGMA_SIMD + for ( v = 0; v < ifh*ifw*nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + lcl_buffer_ptr[v] = (float)0; /* clear the accumulator before each task */ +#else + lcl_buffer_ptr[v] =
(element_input_type)0; +#endif + } + +#if defined(LIBXSMM_DNN_POOLING_BWD_MAX) + for ( ho = oph; ho < (ofh+oph); ho++ ) { + for ( wo = opw; wo < (ofw+opw); wo++ ) { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); + const element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, nFmBlock); + +#if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + del_output_f32.i[1] = doutput_ptr[v]; + lcl_buffer_ptr[mask_ptr[v]] += del_output_f32.f; +#else + lcl_buffer_ptr[mask_ptr[v]] += doutput_ptr[v]; +#endif + } + } + } +#endif +#if defined(LIBXSMM_DNN_POOLING_BWD_AVG) + for ( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; + for ( wo = opw; wo < (ofw+opw); wo++ ) { + wi = ((wo-opw) * sw) - handle->desc.pad_w; + for ( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; + for ( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_output_type* doutput_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, nFmBlock); +#else + element_input_type* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi+kh, wi+kw, 0, ifw, nFmBlock); +#endif + +#if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + del_output_f32.i[1] = doutput_ptr[v]; + lcl_dinput_ptr[v] += (del_output_f32.f * recp_pool_size); +#else + lcl_dinput_ptr[v] += (doutput_ptr[v] * recp_pool_size); +#endif + } + } + } + } + } + } +#endif + + /* copy the local buffer into dinput activations */ + for ( hi = iph; hi < 
(ifh+iph); hi++ ) { + for ( wi = ipw; wi < (ifw+ipw); wi++ ) { + element_input_type* dinput_ptr = &LIBXSMM_VLA_ACCESS(5, dinput, img, fm, hi, wi, 0, nBlocksFm, ifhp, ifwp, nFmBlock); +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + float* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, nFmBlock); +#else + element_input_type* lcl_dinput_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_dinput, hi-iph, wi-ipw, 0, ifw, nFmBlock); +#endif + +#if !defined(LIBXSMM_DNN_POOLING_BWD_BF16) + LIBXSMM_PRAGMA_SIMD +#endif + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_BWD_BF16) + del_input_f32.f = lcl_dinput_ptr[v]; + dinput_ptr[v] = del_input_f32.i[1]; +#else + dinput_ptr[v] = lcl_dinput_ptr[v]; +#endif + } + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c new file mode 100644 index 00000000..76137cd5 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c16_avx512.tpl.c @@ -0,0 +1,171 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread handles tasks [thr_begin, thr_end), both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int kh = 0; +int kw = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); /* 1/(R*S); taps outside the input are skipped by the loops of this template, but the divisor stays R*S */ +#else +element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration; lcl_buffer_ptr is this thread's ofh*ofw*16 slice of handle->scratch (offset scales with ltid) */ +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)16*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 16); +#else +element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)16*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 16); +#endif +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 16); +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 16); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) +LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 16); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); /* lane numbers 0..15, added to a tap's base index to form per-lane indices */ +#endif + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; /* each task is one (image, feature-map block) pair */ + + for ( v = 0; v < ofh*ofw*16; v+=16 ) { +#if
defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); /* MAX: seed the running maximum with -FLT_MAX */ +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); /* AVG: seed the running sum with zero */ +#endif + } + + for ( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; /* first window row in unpadded input coordinates; can be negative */ + for ( wo = opw; wo < (ofw+opw); wo++ ) { + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 16); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16) ); /* current mask entry; kept in every lane that no tap replaces */ +#endif + __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); + + wi = ((wo-opw) * sw) - handle->desc.pad_w; /* first window column, same convention as hi */ + for ( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; /* row outside the input: skip */ + for ( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 16); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); /* per-lane flat index of this tap: (row*ifw plus col)*16 plus lane, in unpadded coordinates */ + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); /* lanes where the running maximum is strictly less than the input */ + lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); /* take the input value in those lanes */ + lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask, lcl_vnewmask ); /* and record where it came from */ +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); /* AVG: accumulate the sum; scaling by recp_pool_size happens in the copy-out loop */ +#endif + } + } + } +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 16), lcl_vmask ); +#endif + _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); + } + } + + /* copy the local buffer into output activations */ + for ( ho = oph; ho < (ofh+oph); ho++ ) { + element_output_type* output_ptr
= &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 16); + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, 0, 0, ofw, 16); + for ( wo = opw; wo < (ofw+opw); wo++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); +#endif + output_ptr += 16; + lcl_output_ptr += 16; + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c new file mode 100644 index 00000000..7f53b509 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c32_avx512.tpl.c @@ -0,0 +1,181 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? 
(work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread handles tasks [thr_begin, thr_end), both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int kh = 0; +int kw = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); /* 1/(R*S); taps outside the input are skipped by the loops of this template, but the divisor stays R*S */ +#else +element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration; lcl_buffer_ptr is this thread's ofh*ofw*32 slice of handle->scratch (offset scales with ltid) */ +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)32*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 32); +#else +element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)32*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 32); +#endif +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 32); +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) +LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 32); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); /* lane numbers 0..15, added to a tap's base index to form per-lane indices */ +#endif + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; /* each task is one (image, feature-map block) pair */ + + for( v = 0; v < ofh*ofw*32; v+=16 ) { +#if
defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); /* MAX: seed the running maximum with -FLT_MAX */ +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); /* AVG: seed the running sum with zero */ +#endif + } + + for( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; /* first window row in unpadded input coordinates; can be negative */ + for( wo = opw; wo < (ofw+opw); wo++ ) { + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 32); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32) ); /* current mask entries for channels 0..15; kept in every lane that no tap replaces */ + __m512i lcl_vmask2 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 32) ); /* same for channels 16..31 */ +#endif + __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); /* the 32-channel block is handled as two 16-lane halves */ + __m512 lcl_voutput2 = _mm512_loadu_ps( lcl_output_ptr+16 ); + + wi = ((wo-opw) * sw) - handle->desc.pad_w; /* first window column, same convention as hi */ + for( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; /* row outside the input: skip */ + for( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 32); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*32 + (wi+kw)*32) ); /* per-lane flat index of this tap for channels 0..15: (row*ifw plus col)*32 plus lane, in unpadded coordinates */ + __m512i lcl_vnewmask2 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*32 + (wi+kw)*32 + 16) ); /* same tap, channels 16..31 */ + __m512 lcl_vinput = _mm512_load_act( input_ptr ); + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); /* lanes where the running maximum is strictly less than the input */ + __mmask16 lcl_mlt2 = _mm512_cmp_ps_mask( lcl_voutput2, lcl_vinput2, _CMP_LT_OS ); + lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); /* take the input value in those lanes */ + lcl_voutput2 = _mm512_mask_blend_ps( lcl_mlt2, lcl_voutput2, lcl_vinput2 ); + lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask,
lcl_vnewmask ); + lcl_vmask2 = _mm512_mask_blend_epi32( lcl_mlt2, lcl_vmask2, lcl_vnewmask2 ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); + lcl_voutput2 = _mm512_add_ps( lcl_voutput2, _mm512_load_act( input_ptr+16 ) ); +#endif + } + } + } +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 32), lcl_vmask ); + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 32), lcl_vmask2 ); +#endif + _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); + _mm512_storeu_ps( lcl_output_ptr+16, lcl_voutput2 ); + } + } + + /* copy the local buffer into output activations */ + for( ho = oph; ho < (ofh+oph); ho++ ) { + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32); + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, 0, 0, ofw, 32); + for( wo = opw; wo < (ofw+opw); wo++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); + _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); + _mm512_stream_act( output_ptr+16, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+16 ), recp_pool_size_ps ) ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); + _mm512_stream_act( output_ptr+16, _mm512_loadu_ps( lcl_output_ptr+16 ) ); +#endif + output_ptr += 32; + lcl_output_ptr += 32; + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c new file mode 
100644 index 00000000..b7f91174 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_f32_bf16_c64_avx512.tpl.c @@ -0,0 +1,205 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.) +******************************************************************************/ + +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +# define _mm512_load_act(A) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16)) +#if 1 +# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A) +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16))) +#else +# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +# define _mm512_store_act(A,B) _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16))) +#endif +#else +# define _mm512_load_act(A) _mm512_loadu_ps(A) +# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B) +# define _mm512_store_act(A,B) _mm512_storeu_ps(A,B) +#endif + +/* size variables, all const */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = handle->ofh; +const int ofw = 
handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; + +/* computing first logical thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel */ +const int work = nImg * nBlocksFm; +/* compute chunk size */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int kh = 0; +int kw = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); +#else +element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); +#endif +#endif + +/* multi-dim arrays declaration */ +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float* lcl_buffer_ptr = ((float*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)64*(size_t)ltid); +LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, 64); +#else +element_output_type* lcl_buffer_ptr = ((element_output_type*)handle->scratch)+((size_t)ofh*(size_t)ofw*(size_t)64*(size_t)ltid); +LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, 64); +#endif +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, 64); 
+LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 64); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) +LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, 64); +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_viadd = _mm512_set_epi32( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); +#endif + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + + for( v = 0; v < ofh*ofw*64; v+=16 ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_set1_ps(-FLT_MAX) ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + _mm512_storeu_ps( &(lcl_buffer_ptr[v]), _mm512_setzero_ps() ); +#endif + } + + for( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; + for( wo = opw; wo < (ofw+opw); wo++ ) { + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, 64); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vmask = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 64) ); + __m512i lcl_vmask2 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 64) ); + __m512i lcl_vmask3 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 32, nBlocksFm, ofh, ofw, 64) ); + __m512i lcl_vmask4 = _mm512_loadu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 48, nBlocksFm, ofh, ofw, 64) ); +#endif + __m512 lcl_voutput = _mm512_loadu_ps( lcl_output_ptr ); + __m512 lcl_voutput2 = _mm512_loadu_ps( lcl_output_ptr+16 ); + __m512 lcl_voutput3 = _mm512_loadu_ps( lcl_output_ptr+32 ); + __m512 lcl_voutput4 = _mm512_loadu_ps( lcl_output_ptr+48 ); + + wi = ((wo-opw) * sw) - handle->desc.pad_w; + for( kh = 0; kh < 
handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; /* row outside the input: skip */ + for( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, 64); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + __m512i lcl_vnewmask = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); /* per-lane index of this tap: (row*ifw plus col)*16 plus lane, in unpadded coordinates */ + __m512i lcl_vnewmask2 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); + __m512i lcl_vnewmask3 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); + __m512i lcl_vnewmask4 = _mm512_add_epi32( lcl_viadd, _mm512_set1_epi32((hi+kh)*ifw*16 + (wi+kw)*16) ); /* NOTE(review): all four 16-lane slices record the same index with stride 16 and no per-slice offset, unlike the c32 kernel (stride 32, second half offset by 16). The c64 backward kernel addresses its local buffer with an inner dimension of 16, which this encoding appears to match - confirm against that kernel before changing it */ + __m512 lcl_vinput = _mm512_load_act( input_ptr ); /* the 64-channel block is handled as four 16-lane slices */ + __m512 lcl_vinput2 = _mm512_load_act( input_ptr+16 ); + __m512 lcl_vinput3 = _mm512_load_act( input_ptr+32 ); + __m512 lcl_vinput4 = _mm512_load_act( input_ptr+48 ); + __mmask16 lcl_mlt = _mm512_cmp_ps_mask( lcl_voutput, lcl_vinput, _CMP_LT_OS ); /* lanes where the running maximum is strictly less than the input */ + __mmask16 lcl_mlt2 = _mm512_cmp_ps_mask( lcl_voutput2, lcl_vinput2, _CMP_LT_OS ); + __mmask16 lcl_mlt3 = _mm512_cmp_ps_mask( lcl_voutput3, lcl_vinput3, _CMP_LT_OS ); + __mmask16 lcl_mlt4 = _mm512_cmp_ps_mask( lcl_voutput4, lcl_vinput4, _CMP_LT_OS ); + lcl_voutput = _mm512_mask_blend_ps( lcl_mlt, lcl_voutput, lcl_vinput ); /* take the input value in those lanes */ + lcl_voutput2 = _mm512_mask_blend_ps( lcl_mlt2, lcl_voutput2, lcl_vinput2 ); + lcl_voutput3 = _mm512_mask_blend_ps( lcl_mlt3, lcl_voutput3, lcl_vinput3 ); + lcl_voutput4 = _mm512_mask_blend_ps( lcl_mlt4, lcl_voutput4, lcl_vinput4 ); + lcl_vmask = _mm512_mask_blend_epi32( lcl_mlt, lcl_vmask, lcl_vnewmask ); /* and record where it came from */ + lcl_vmask2 = _mm512_mask_blend_epi32( lcl_mlt2, lcl_vmask2, lcl_vnewmask2 ); + lcl_vmask3 = _mm512_mask_blend_epi32( lcl_mlt3, lcl_vmask3, lcl_vnewmask3 ); + lcl_vmask4 = _mm512_mask_blend_epi32( lcl_mlt4, lcl_vmask4, lcl_vnewmask4 ); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +
lcl_voutput = _mm512_add_ps( lcl_voutput, _mm512_load_act( input_ptr ) ); + lcl_voutput2 = _mm512_add_ps( lcl_voutput2, _mm512_load_act( input_ptr+16 ) ); + lcl_voutput3 = _mm512_add_ps( lcl_voutput3, _mm512_load_act( input_ptr+32 ) ); + lcl_voutput4 = _mm512_add_ps( lcl_voutput4, _mm512_load_act( input_ptr+48 ) ); +#endif + } + } + } +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, 64), lcl_vmask ); + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 16, nBlocksFm, ofh, ofw, 64), lcl_vmask2 ); + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 32, nBlocksFm, ofh, ofw, 64), lcl_vmask3 ); + _mm512_storeu_si512( &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 48, nBlocksFm, ofh, ofw, 64), lcl_vmask4 ); +#endif + _mm512_storeu_ps( lcl_output_ptr, lcl_voutput ); + _mm512_storeu_ps( lcl_output_ptr+16, lcl_voutput2 ); + _mm512_storeu_ps( lcl_output_ptr+32, lcl_voutput3 ); + _mm512_storeu_ps( lcl_output_ptr+48, lcl_voutput4 ); + } + } + + /* copy the local buffer into output activations */ + for( ho = oph; ho < (ofh+oph); ho++ ) { + element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 64); + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, 0, 0, ofw, 64); + for( wo = opw; wo < (ofw+opw); wo++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + const __m512 recp_pool_size_ps = _mm512_set1_ps( recp_pool_size ); + _mm512_stream_act( output_ptr, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr ), recp_pool_size_ps ) ); + _mm512_stream_act( output_ptr+16, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+16 ), recp_pool_size_ps ) ); + _mm512_stream_act( output_ptr+32, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+32 ), recp_pool_size_ps ) ); + _mm512_stream_act( output_ptr+48, _mm512_mul_ps( _mm512_loadu_ps( lcl_output_ptr+48 ), recp_pool_size_ps ) ); +#endif +#if 
defined(LIBXSMM_DNN_POOLING_FWD_MAX) + _mm512_stream_act( output_ptr, _mm512_loadu_ps( lcl_output_ptr ) ); + _mm512_stream_act( output_ptr+16, _mm512_loadu_ps( lcl_output_ptr+16 ) ); + _mm512_stream_act( output_ptr+32, _mm512_loadu_ps( lcl_output_ptr+32 ) ); + _mm512_stream_act( output_ptr+48, _mm512_loadu_ps( lcl_output_ptr+48 ) ); +#endif + output_ptr += 64; + lcl_output_ptr += 64; + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + +# undef _mm512_load_act +# undef _mm512_stream_act +# undef _mm512_store_act + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c new file mode 100644 index 00000000..0a902203 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_pooling_st_fwd_custom_generic.tpl.c @@ -0,0 +1,194 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
+******************************************************************************/ + +/* size variables, all const; ifhp/ifwp and ofhp/ofwp are the input/output extents including the in/out padding on both sides */ +const int nImg = handle->desc.N; +const int ifh = handle->desc.H; +const int ifw = handle->desc.W; +const int sh = handle->desc.u; +const int sw = handle->desc.v; +const int ofh = handle->ofh; +const int ofw = handle->ofw; +const int iph = handle->desc.pad_h_in; +const int ipw = handle->desc.pad_w_in; +const int oph = handle->desc.pad_h_out; +const int opw = handle->desc.pad_w_out; +const int ofhp = ofh + 2*oph; +const int ofwp = ofw + 2*opw; +const int ifhp = ifh + 2*iph; +const int ifwp = ifw + 2*ipw; +/* here we assume that input and output blocking is similar */ +const int nBlocksFm = handle->blocksifm; +const int nFmBlock = handle->ifmblock; + +/* logical thread id: tid relative to start_thread */ +const int ltid = tid - start_thread; +/* number of tasks that could be run in parallel: one per (image, feature-map block) pair */ +const int work = nImg * nBlocksFm; +/* compute chunk size: work divided by the thread count, rounded up */ +const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const int thr_end = ((ltid + 1) * chunksize < work) ?
((ltid + 1) * chunksize) : work; + +/* loop variables */ +int img = 0; +int fm = 0; +int imgfm = 0; +int ho = 0; +int wo = 0; +int hi = 0; +int wi = 0; +int kh = 0; +int kw = 0; +int v = 0; +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float recp_pool_size = 1.0f/((float)handle->desc.R*(float)handle->desc.S); /* reciprocal of the pooling window size R*S */ +#else +element_output_type recp_pool_size = 1.0f/((element_output_type)handle->desc.R*(element_output_type)handle->desc.S); /* reciprocal of the pooling window size R*S */ +#endif +#endif + +/* multi-dim arrays declaration; lcl_output views this thread's slice of handle->scratch (ofh*ofw*nFmBlock elements per logical thread) */ +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +float *const lcl_buffer_ptr = (float*)handle->scratch + (size_t)ofh*ofw*nFmBlock*ltid; +LIBXSMM_VLA_DECL(3, float, lcl_output, lcl_buffer_ptr, ofw, nFmBlock); +#else +element_output_type *const lcl_buffer_ptr = (element_output_type*)handle->scratch + (size_t)ofh*ofw*nFmBlock*ltid; +LIBXSMM_VLA_DECL(3, element_output_type, lcl_output, lcl_buffer_ptr, ofw, nFmBlock); +#endif +LIBXSMM_VLA_DECL(5, const element_input_type, input, (element_input_type* )handle->reg_input->data, nBlocksFm, ifhp, ifwp, nFmBlock); +LIBXSMM_VLA_DECL(5, element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) +LIBXSMM_VLA_DECL(5, element_mask_type, mask, (element_mask_type* )handle->mask->data, nBlocksFm, ofh, ofw, nFmBlock); +#endif + +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) +union libxsmm_bfloat16_hp input_f32; +union libxsmm_bfloat16_hp output_f32; +input_f32.i[1] = 0; +input_f32.i[0] = 0; /* i[0] is not written again; the loops below only store input elements into i[1] and read .f */ +output_f32.i[1] = 0; +output_f32.i[0] = 0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, ltid); + +for (imgfm = thr_begin; imgfm < thr_end; ++imgfm) { /* split the task index into image and feature-map block */ + img = imgfm / nBlocksFm; + fm = imgfm % nBlocksFm; + + LIBXSMM_PRAGMA_SIMD + for ( v = 0; v < ofh*ofw*nFmBlock; v++ ) { /* reset the local buffer: -FLT_MAX for max pooling, zero for average pooling */ +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + lcl_buffer_ptr[v] = -FLT_MAX; +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) +#if
defined(LIBXSMM_DNN_POOLING_FWD_BF16) + lcl_buffer_ptr[v] = (float)0.0; +#else + lcl_buffer_ptr[v] = (element_output_type)0.0; +#endif +#endif + } + + for ( ho = oph; ho < (ofh+oph); ho++ ) { + hi = ((ho-oph) * sh) - handle->desc.pad_h; /* input row where the window starts; may lie outside [0, ifh) and is bounds-checked per kh below */ + for ( wo = opw; wo < (ofw+opw); wo++ ) { + wi = ((wo-opw) * sw) - handle->desc.pad_w; /* input column where the window starts; bounds-checked per kw below */ + for ( kh = 0; kh < handle->desc.R; kh++ ) { + if (hi+kh < 0 || hi+kh >= ifh) continue; /* skip window rows outside the input */ + for ( kw = 0; kw < handle->desc.S; kw++ ) { + if (wi+kw < 0 || wi+kw >= ifw) { + continue; + } else { + const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi+kh+iph, wi+kw+ipw, 0, nBlocksFm, ifhp, ifwp, nFmBlock); /* iph/ipw shift the position into the padded input array */ +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); +#else + element_output_type* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + const int idx = (hi+kh)*ifw*nFmBlock + (wi+kw)*nFmBlock; /* offset of this input position inside an ifh x ifw x nFmBlock block; idx plus v is stored in the mask for the current maximum */ + element_mask_type* mask_ptr = &LIBXSMM_VLA_ACCESS(5, mask, img, fm, ho-oph, wo-opw, 0, nBlocksFm, ofh, ofw, nFmBlock); +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) + for ( v = 0; v < nFmBlock; v++ ) { + input_f32.i[1] = input_ptr[v]; +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + if ( input_f32.f > lcl_output_ptr[v] ) { + lcl_output_ptr[v] = input_f32.f; + mask_ptr[v] = idx + v; + } +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + lcl_output_ptr[v] += input_f32.f; +#endif + } +#else + LIBXSMM_PRAGMA_SIMD + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + if ( input_ptr[v] > lcl_output_ptr[v] ) { + lcl_output_ptr[v] = input_ptr[v]; + mask_ptr[v] = idx + v; + } +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + lcl_output_ptr[v] += input_ptr[v]; +#endif + } +#endif + } + } + } + } + } + + /* copy the local buffer into output activations (average pooling multiplies by recp_pool_size) */ + for ( ho = oph; ho < (ofh+oph); ho++ ) { + for ( wo = opw; wo < (ofw+opw); wo++ ) { +
element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, wo, 0, nBlocksFm, ofhp, ofwp, nFmBlock); +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) + float* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); +#else + element_output_type* lcl_output_ptr = &LIBXSMM_VLA_ACCESS(3, lcl_output, ho-oph, wo-opw, 0, ofw, nFmBlock); +#endif + +#if defined(LIBXSMM_DNN_POOLING_FWD_BF16) + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + output_f32.f = lcl_output_ptr[v]; +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + output_f32.f = lcl_output_ptr[v] * recp_pool_size; +#endif + output_ptr[v] = output_f32.i[1]; /* only i[1] of the float result is stored */ + } +#else + LIBXSMM_PRAGMA_SIMD + for ( v = 0; v < nFmBlock; v++ ) { +#if defined(LIBXSMM_DNN_POOLING_FWD_MAX) + output_ptr[v] = lcl_output_ptr[v]; +#endif +#if defined(LIBXSMM_DNN_POOLING_FWD_AVG) + output_ptr[v] = lcl_output_ptr[v] * recp_pool_size; +#endif + } +#endif + } + } +} + +libxsmm_barrier_wait(handle->barrier, ltid); + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..94e18fa6 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_ck_generic.tpl.c @@ -0,0 +1,637 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.)
+******************************************************************************/ + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint K3 = K * 3; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *ht = (element_output_type*)(handle->ht ? 
handle->ht->data : NULL); +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ct = (element_output_type*)handle->cit->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_input_type *dxt = (element_input_type* )handle->dxt->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dcD = (element_output_type*)handle->scratch_dci; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *doutD = (element_output_type*)handle->scratch_deltat; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +element_output_type *scratch_oT = (element_output_type*)handle->scratch_dpB; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[K]); +element_filter_type *dwfD = &(dw[2*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type 
*drcD = &(dr[K]); +element_filter_type *drfD = &(dr[2*K]); +element_filter_type *dwiD_scratch = &(w_scratch[0]); +element_filter_type *dwcD_scratch = &(w_scratch[C*K]); +element_filter_type *dwfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *driD_scratch = &(r_scratch[0]); +element_filter_type *drcD_scratch = &(r_scratch[K*K]); +element_filter_type *drfD_scratch = &(r_scratch[2*K*K]); +element_output_type *dbi = &(db[0]); +element_output_type *dbc = &(db[K]); +element_output_type *dbf = &(db[2*K]); +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_output_type *t1D = (element_output_type*)handle->scratch_t1; +element_output_type *t2D = (element_output_type*)handle->scratch_t2; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); +LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, wf, wfD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K3); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(4, 
element_filter_type, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, drfD, K3); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dc, dcD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +LIBXSMM_VLA_DECL(2, element_output_type, oT, scratch_oT, N); +element_output_type *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, 
bk, &bc, &K, &C, NULL, NULL, NULL, NULL ); +#if 0 +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); +#endif +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); + +/* Auxiliary arrays for batch-reduce gemm calls */ +const element_filter_type *A_array[1024]; +const element_output_type *B_array[1024]; + +#if 0 +LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dcB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); +#endif + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? 
(ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? 
((ltid + 1) * chunksize_kk) : work_kk; + +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; + +/* int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; */ + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*3, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*3, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*3, db, start_thread, tid, handle->desc.threads); +} + +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, K3); + LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, 
bk, bc) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, K3); + LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, K3); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, K3); + LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, K3); + LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, K3); + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + + /* compute dhp */ + if (j == t-1) { + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } + /* df = dout . (1 - c) . (1 - (f . 
f)) */ + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + /* dc = dout . (hp - f) . c . (1 - c) */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + LIBXSMM_ASSERT(NULL != ht); /* coverity[var_deref_op] */ + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + in = (icin / (C/bc))*bn; + ic = (icin % (C/bc))*bc; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = 
thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); + } + } + } + } + + /* transpose ot for current timestep */ + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, oT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, o, j, en, ek, N, K); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + /* do = {R_f}^T * df */ + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &blocks); + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + /* di = do . hp . i . 
(1 - i) */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * dicf */ + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic % (N/bn))*bn; + icb = inic / (N/bn); + ic = icb*bc; + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + 
batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + } + } + } + + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + dout_ptr = (j > 0) ? (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); + + if (0 == KB) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), dout_ptr ); + } + + /* dhp += R^T * dic */ + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + } + } + + 
if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + if ((C == K) && (bc == bk) /*&& (bcbk_multiples_of_16 == 1)*/) { +#if 0 + if (K % 2048 != 0) { +#endif + /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] 
= &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } +#if 0 + } else { + /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dc, df */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), 
&blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +#endif + } else { + /* dr = dicf * o^T/h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = dicf * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); 
+ } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } + + /* gradient bias */ + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + dbc[ik] += LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* Store result weight matrices in CK format */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; ++jc) { + for (jk = 0; jk < bk; ++jk) { + LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; ++jc) { + for (jk = 0; jk < bk; ++jk) { + LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk, K3) = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); + } + } + } + 
libxsmm_barrier_wait(handle->barrier, (int)ltid); +} diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c new file mode 100644 index 00000000..834be810 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_bwdupd_nc_kcck.tpl.c @@ -0,0 +1,626 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.) +******************************************************************************/ + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ct = (element_output_type*)handle->cit->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_input_type *dxt = (element_input_type* )handle->dxt->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dcD = (element_output_type*)handle->scratch_dci; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *doutD = (element_output_type*)handle->scratch_deltat; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +element_output_type *scratch_oT = (element_output_type*)handle->scratch_dpB; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = &(w[2*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[C*K]); +element_filter_type *dwfD = &(dw[2*C*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K*K]); +element_filter_type *drfD = &(dr[2*K*K]); +element_output_type *dbi = 
&(db[0]); +element_output_type *dbc = &(db[K]); +element_output_type *dbf = &(db[2*K]); +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_output_type *t1D = (element_output_type*)handle->scratch_t1; +element_output_type *t2D = (element_output_type*)handle->scratch_t2; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); +LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, 
element_filter_type, drf, drfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dc, dcD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +LIBXSMM_VLA_DECL(2, element_output_type, oT, scratch_oT, N); +element_output_type *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL ); +#if 0 +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL ); +#endif +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = 
libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); + +/* Auxiliary arrays for batch-reduce gemm calls */ +const element_filter_type *A_array[1024]; +const element_output_type *B_array[1024]; + +#if 0 +LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dcB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); +#endif + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? 
((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; + +libxsmm_blasint ikic, inic, inik, icin, ikin; +#if defined(LIBXSMM_RNN_CELL_AVX512) +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K >= 1024 && K%2==0) { + BF = 2; +} +if (K >= 2048 && K%4==0) { + BF = 4; +} +if (K >= 4096 && K%8==0) { + BF = 8; +} +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*3, dw, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*3, dr, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*3, db, start_thread, tid, handle->desc.threads); +} + +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, 
bk) = LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk); + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bcbk_multiples_of_16) { +#include "libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c" + } else { + /* compute dhp */ + if (j == t-1) { + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } + /* df = dout . (1 - c) . (1 - (f . f)) */ + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + /* dc = dout . (hp - f) . c . 
(1 - c) */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); + } +#else + /* compute dhp */ + if (j == t-1) { + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } + /* df = dout . (1 - c) . (1 - (f . f)) */ + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + /* dc = dout . (hp - f) . c . 
(1 - c) */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + libxsmm_internal_matrix_sub_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K) ); +#endif + } + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + in = (icin / (C/bc))*bn; + ic = (icin % (C/bc))*bc; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); + } + } + } + } + + /* transpose ot for current timestep */ + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / 
(K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, oT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, o, j, en, ek, N, K); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + /* do = {R_f}^T * df */ + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &blocks); + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + /* di = do . hp . i . 
(1 - i) */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bcbk_multiples_of_16) { +#include "libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c" + } else { + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + } +#else + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (0 == j) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + } + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), 
&LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * dicf */ + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic % (N/bn))*bn; + icb = inic / (N/bn); + ic = icb*bc; + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + } + } + } + + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + dout_ptr = (j > 0) ? 
(element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); + + if (0 == KB) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), dout_ptr ); + } + + /* dhp += R^T * dic */ + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + } + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + if ((C == K) && (bc == bk) /*&& (bcbk_multiples_of_16 == 1)*/) { +#if 0 + if (K % 2048 != 0) { +#endif + /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + 
batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } +#if 0 + } else { + /* Interleave computation of dr = dicf * o^T/h^T and dw = dicf * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dc, df */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] 
= &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dcB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +#endif + } else { + /* dr = dicf * o^T/h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = 
&LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, oT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = dicf * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } + + /* gradient bias */ + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + dbc[ik] += LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + dbf[ik] += 
LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..dfe775ad --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_ck_generic.tpl.c @@ -0,0 +1,285 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.) +******************************************************************************/ + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint K3 = K * 3; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; + +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = 
(element_filter_type*)handle->scratch_r; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ct = (element_output_type*)handle->cit->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; /* b, ht, it, ct, ft, ot: raw data pointers taken from the handle (ct is read from handle->cit) */ +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); /* wiD..rfD: per-gate views into the original w and r at element offsets 0, K and 2*K */ +element_filter_type *wiD_scratch = &(w_scratch[0]); +element_filter_type *wcD_scratch = &(w_scratch[C*K]); +element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *riD_scratch = &(r_scratch[0]); +element_filter_type *rcD_scratch = &(r_scratch[K*K]); +element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); /* *_scratch: per-gate regions of the scratch buffers, C*K elements apart for w and K*K elements apart for r */ +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); /* bi, bd, bf: per-gate bias vectors at offsets 0, K and 2*K of b */ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD_scratch, kBlocks, bk, bk); /* wi..rf: 4D blocked views over the scratch copies; they are filled by the reformat loops further down */ +LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, K3); +LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, K3); +LIBXSMM_VLA_DECL(2,
element_filter_type, rf_ck, rfD, K3); /* *_ck: 2D views over the original w and r, declared with leading dimension K3 */ +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +/* define batch-reduce gemm kernels: kernela is dispatched with (bk, bn, bc) and is used below for the W.x terms, kernelb is dispatched with (bk, bn, bk) and is used for the R.hp and R.o terms */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); +/* define gemm kernels (none are defined in this template) */ +/* Auxiliary arrays for batch-reduce gemms: per-block A and B pointers with a fixed capacity of 1024 entries; the fill loops below do not check this bound */ +const element_filter_type *A_array[1024]; +const element_input_type *B_array[1024]; + +/* parallelize over blocks: (N/bn) x (K/bk) tasks for the cell computation, plus separate partitions below for reformatting W and R */ +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by handle->desc.threads, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel for C and K blocks (this partition drives the W reformat loop) */ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size: work_ck divided by handle->desc.threads, rounded up */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work_ck */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ?
(ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks (this partition drives the R reformat loop) */ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size: work_kk divided by handle->desc.threads, rounded up */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work_kk */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; +#if 0 +const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; +#endif +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large: BF is the number of chunks the C-block and K-block reduction loops are split into; it starts at 8 or 16 depending on C and K and is decremented until it divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} + /* hard-coded override for one shape; NOTE(review): BF = 2 is not re-checked against cBlocks/kBlocks divisibility here - confirm the block sizes guarantee it */ +if (C == 2048 && K == 1024) { + BF = 2; +} + /* number of C blocks and K blocks handled per reduction chunk */ +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +/* Upfront reformatting of W and R */ +/* reformat W: task index ikic decodes to C block ic (quotient) and K block ik (remainder); each bc x bk tile of the three gate matrices is copied element by element from the 2D view (leading dimension 3*K) into the blocked scratch tensors */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 3*K); + LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 3*K); + LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 3*K); + } + } +} + +/* reformat R: same tile copy for the bk x bk blocks of r; note that here ik is the quotient and ic the remainder of ikic */ +for (ikic = thr_begin_kk; ikic < thr_end_kk;
++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 3*K); + LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 3*K); + LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 3*K); + } + } +} + /* wait until every thread has finished its share of the reformatting; the compute loops below read blocked weights over a different task partition */ +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* All data is in column-major format; j runs from 0 to t-1 (t is read from handle->T; presumably the sequence length) */ +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested: CB selects one of the BF reduction chunks */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + /* decode task inik into the N offset in and the K block ikb (element offset ik) */ in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x: collect the CB_BLOCKS block pointers of this reduction chunk */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); + /* i += R.hp: the previous state is hp for the first step (j == 0), otherwise h of step j-1 */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K);
+ } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); + /* initialize c with bd (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &bd[ik] ); + /* c += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); + /* c += R.hp: hp for j == 0, otherwise h of step j-1 */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); + /* the element-wise steps run only in the last reduction chunk, after all partial sums have been added */ + if (CB == BF-1) { + /* i = sigmoid(i), source and destination are the same block */ + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + /* o = hp .
i (hp stands for the previous state: hp for j == 0, otherwise h of step j-1) */ + if (0 == j) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + /* We need a barrier here to ensure all elements of o are computed before f can be computed (the R.o reduction below reads o across K blocks, not only this thread's block) */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + /* same task decoding as in the first pass */ in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize f with bf (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik] ); + /* f += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); + /* f += R.o (o of the current step j) */ + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, o, j, in, ic + CB*KB_BLOCKS*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); + /* the element-wise steps run only in the last reduction chunk */ + if (CB == BF-1) { + /* f = tanh(f), source and destination are the same block */ + libxsmm_internal_matrix_tanh_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + /* c = sigmoid(c), source and destination are the same block */ + libxsmm_internal_matrix_sigmoid_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, c, j,
in, ik, N, K) ); + /* h = (1 - c) . f */ + libxsmm_internal_matrix_complement_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + /* h += c . hp */ + if (0 == j) { + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } else { + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c new file mode 100644 index 00000000..92d429bd --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_gru_fwd_nc_kcck.tpl.c @@ -0,0 +1,222 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes and blocking factors read from the handle */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; + +/* define tensors: raw data pointers from the handle (ct is read from handle->cit); in this variant the per-gate weights are regions of w (C*K elements apart) and r (K*K elements apart) that are viewed directly as 4D blocked tensors, and there is no reformatting step */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ct = (element_output_type*)handle->cit->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = &(w[2*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); /* bi, bd, bf: per-gate bias vectors at offsets 0, K and 2*K of b */ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD,
kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, c, ct, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +/* define batch-reduce gemm kernels: kernela is dispatched with (bk, bn, bc) and is used below for the W.x terms, kernelb is dispatched with (bk, bn, bk) and is used for the R.hp and R.o terms */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); +/* define gemm kernels (none are defined in this template) */ +/* Auxiliary arrays for batch-reduce gemms: per-block A and B pointers with a fixed capacity of 1024 entries; the fill loops below do not check this bound */ +const element_filter_type *A_array[1024]; +const element_input_type *B_array[1024]; + +/* parallelize over the (N/bn) x (K/bk) output blocks */ +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by handle->desc.threads, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +#if 0 +const int use_fused_implementation = (C == 2048 && K == 2048) ?
1 : 0; +#endif +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + /* BF is the number of chunks the C-block and K-block reduction loops are split into: start at 8 and decrement until it divides both cBlocks and kBlocks */ BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + /* larger shapes start from 16 instead */ BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} + /* hard-coded override for one shape; NOTE(review): BF = 2 is not re-checked against cBlocks/kBlocks divisibility here - confirm the block sizes guarantee it */ +if (C == 2048 && K == 1024) { + BF = 2; +} + /* number of C blocks and K blocks handled per reduction chunk */ +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* All data is in column-major format; j runs from 0 to t-1 (t is read from handle->T; presumably the sequence length) */ +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested: CB selects one of the BF reduction chunks */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + /* decode task inik into the N offset in and the K block ikb (element offset ik) */ in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x: collect the CB_BLOCKS block pointers of this reduction chunk */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); + /* i += R.hp: the previous state is hp for the first step (j == 0), otherwise h of step j-1 */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); + /*
initialize c with bd (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &bd[ik] ); + /* c += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); + /* c += R.hp: hp for j == 0, otherwise h of step j-1 */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &blocks); + /* the element-wise steps run only in the last reduction chunk, after all partial sums have been added */ + if (CB == BF-1) { + /* i = sigmoid(i), source and destination are the same block */ + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + /* o = hp .
i (hp stands for the previous state: hp for j == 0, otherwise h of step j-1) */ + if (0 == j) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + /* We need a barrier here to ensure all elements of o are computed before f can be computed (the R.o reduction below reads o across K blocks, not only this thread's block) */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + /* same task decoding as in the first pass */ in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize f with bf (only in the first reduction chunk) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik] ); + /* f += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); + /* f += R.o (o of the current step j) */ + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, o, j, in, ic + CB*KB_BLOCKS*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); + /* the element-wise steps run only in the last reduction chunk */ + if (CB == BF-1) { + /* f = tanh(f), source and destination are the same block */ + libxsmm_internal_matrix_tanh_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + /* c = sigmoid(c), source and destination are the same block */ + libxsmm_internal_matrix_sigmoid_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, c, j,
in, ik, N, K) ); + /* h = (1 - c) . f */ + libxsmm_internal_matrix_complement_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld ( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + /* h += c . hp */ + if (0 == j) { + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } else { + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..d4574e93 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic.tpl.c @@ -0,0 +1,360 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint K4 = K * 4; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +element_output_type *doutD = (element_output_type*)handle->scratch_deltat; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); 
+element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[K]); +element_filter_type *dwfD = &(dw[2*K]); +element_filter_type *dwoD = &(dw[3*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K]); +element_filter_type *drfD = &(dr[2*K]); +element_filter_type *droD = &(dr[3*K]); +element_filter_type *dwiD_scratch = &(w_scratch[0]); +element_filter_type *dwcD_scratch = &(w_scratch[C*K]); +element_filter_type *dwfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *dwoD_scratch = &(w_scratch[3*C*K]); +element_filter_type *driD_scratch = &(r_scratch[0]); +element_filter_type *drcD_scratch = &(r_scratch[K*K]); +element_filter_type *drfD_scratch = &(r_scratch[2*K*K]); +element_filter_type *droD_scratch = &(r_scratch[3*K*K]); +element_output_type *dbi = &(db[0]); +element_output_type *dbc = &(db[K]); +element_output_type *dbf = &(db[2*K]); +element_output_type *dbo = &(db[3*K]); +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +element_output_type *t1D = (element_output_type*)handle->scratch_t1; +element_output_type *t2D = (element_output_type*)handle->scratch_t2; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); +LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, 
wf, wfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wo, woD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ro, roD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K4); +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwo_ck, dwoD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, drfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dro_ck, droD, 4*K); 
+LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, 4*K); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(4, element_filter_type, wiT, scratch_wiT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wcT, scratch_wcT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wfT, scratch_wfT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, woT, scratch_woT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, riT, scratch_riT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rcT, scratch_rcT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rfT, scratch_rfT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, roT, scratch_roT, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +element_output_type *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = 
libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); + +/* Auxiliary arrays for batch-reduce gemm calls */ +const element_filter_type *A_array[1024]; +const element_output_type *B_array[1024]; + +LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? 
(ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K;__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, 
handle->desc.threads); +} + +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, woT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(2, wo, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, roT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ro, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c" + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in CK format */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; ++jc) { + for (jk = 0; jk < bk; ++jk) { + 
LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(2, dwo_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; ++jc) { + for (jk = 0; jk < bk; ++jk) { + LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(2, dro_ck, ic+jc, ik+jk , K4) = LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk); + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c new file mode 100644 index 00000000..fb1def38 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16.tpl.c @@ -0,0 +1,361 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint K4 = K * 4; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = handle->lpb; +/*const int bc_lp = bc/lpb;*/ +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db_bf16 = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +float *dxD = (float*)handle->scratch_dx; +float *doutD = (float*)handle->scratch_deltat; +float *dhpD_f32 = (float*)handle->scratch_dhp; +float *db = (float*)handle->scratch_db; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +float *w_scratch = (float*)handle->scratch_w; +float *r_scratch = (float*)handle->scratch_r; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]); +element_filter_type *riD 
= &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[K]); +element_filter_type *dwfD = &(dw[2*K]); +element_filter_type *dwoD = &(dw[3*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K]); +element_filter_type *drfD = &(dr[2*K]); +element_filter_type *droD = &(dr[3*K]); +float *dwiD_scratch = &(w_scratch[0]); +float *dwcD_scratch = &(w_scratch[C*K]); +float *dwfD_scratch = &(w_scratch[2*C*K]); +float *dwoD_scratch = &(w_scratch[3*C*K]); +float *driD_scratch = &(r_scratch[0]); +float *drcD_scratch = &(r_scratch[K*K]); +float *drfD_scratch = &(r_scratch[2*K*K]); +float *droD_scratch = &(r_scratch[3*K*K]); +float *dbi = &(db[0]); +float *dbc = &(db[K]); +float *dbf = &(db[2*K]); +float *dbo = &(db[3*K]); +element_output_type *dbi_bf16 = &(db_bf16[0]); +element_output_type *dbc_bf16 = &(db_bf16[K]); +element_output_type *dbf_bf16 = &(db_bf16[2*K]); +element_output_type *dbo_bf16 = &(db_bf16[3*K]); +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +/*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ +/*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ +/* multidimensional arrays */ +/*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ +/*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); 
+LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wf, wfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wo, woD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ro, roD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K4); +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); +LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); +LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwo_ck, dwoD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, 
drfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dro_ck, droD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, 4*K); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, float, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(5, element_filter_type, wiT, scratch_wiT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wcT, scratch_wcT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wfT, scratch_wfT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, woT, scratch_woT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, riT, scratch_riT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rcT, scratch_rcT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rfT, scratch_rfT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, roT, scratch_roT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +float *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; +const libxsmm_bsmmfunction_reducebatch_strd 
batchreduce_kernelb = handle->bwdupd_kernelb; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; +__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wo, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; 
jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c" + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in CK format and downcovert to bf16 */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; ++jc) { + for (jk = 0; jk < bk; jk += 16) { + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwi_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dwo_ck, ic+jc, ik+jk , K4), 
LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk)))); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; ++jc) { + for (jk = 0; jk < bk; jk += 16) { + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk)))); + _mm256_storeu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dro_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk)))); + } + } + } +#else + /* TODO: Add here non AVX512 replacement code */ +#endif + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..14b465ab --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_ck_generic_bf16_amx.tpl.c @@ -0,0 +1,376 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************//* Template fragment for the LSTM backward/update pass (bf16 AMX variant, per the file name). It declares the tensor views, transposes W and R into scratch, includes the kcck core template, and finally writes the float weight-gradient scratch to the bf16 CK-format outputs. */ +#if 0 +#define PROFILE +#endif/* PROFILE is only defined if the preceding if-0 guard is changed; it gates the cycle counters and the report at the end of this fragment */ + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint K4 = K * 4;/* inner dimension passed to the 2-D w and r views and to the _ck accesses below */ +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = handle->lpb; +/*const int bc_lp = bc/lpb;*/ +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db_bf16 = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +float *dxD = (float*)handle->scratch_dx; +float *doutD = (float*)handle->scratch_deltat; +float *dhpD_f32 = (float*)handle->scratch_dhp; +float *db = (float*)handle->scratch_db;/* float view of scratch_db; db_bf16 above is the view of handle->db->data */ +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +float *w_scratch = (float*)handle->scratch_w; +float *r_scratch = (float*)handle->scratch_r; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]);/* wiD, wcD, wfD, woD point at element offsets 0, K, 2K, 3K of w; the r, dw and dr pointers below follow the same pattern */ +element_filter_type *riD 
= &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[K]); +element_filter_type *dwfD = &(dw[2*K]); +element_filter_type *dwoD = &(dw[3*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K]); +element_filter_type *drfD = &(dr[2*K]); +element_filter_type *droD = &(dr[3*K]); +float *dwiD_scratch = &(w_scratch[0]); +float *dwcD_scratch = &(w_scratch[C*K]); +float *dwfD_scratch = &(w_scratch[2*C*K]); +float *dwoD_scratch = &(w_scratch[3*C*K]); +float *driD_scratch = &(r_scratch[0]); +float *drcD_scratch = &(r_scratch[K*K]); +float *drfD_scratch = &(r_scratch[2*K*K]); +float *droD_scratch = &(r_scratch[3*K*K]);/* the float scratch pointers are spaced C*K apart in w_scratch and K*K apart in r_scratch */ +float *dbi = &(db[0]); +float *dbc = &(db[K]); +float *dbf = &(db[2*K]); +float *dbo = &(db[3*K]); +element_output_type *dbi_bf16 = &(db_bf16[0]); +element_output_type *dbc_bf16 = &(db_bf16[K]); +element_output_type *dbf_bf16 = &(db_bf16[2*K]); +element_output_type *dbo_bf16 = &(db_bf16[3*K]); +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +/*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ +/*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ +/* multidimensional arrays */ +/*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ +/*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); 
+LIBXSMM_VLA_DECL(2, element_filter_type, wi, wiD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wf, wfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wo, woD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, wc, wcD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ri, riD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rf, rfD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, ro, roD, K4); +LIBXSMM_VLA_DECL(2, element_filter_type, rc, rcD, K4); +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); +LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); +LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_filter_type, dwi_ck, dwiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwf_ck, dwfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwo_ck, dwoD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dwc_ck, dwcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dri_ck, driD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, drf_ck, 
drfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, dro_ck, droD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, drc_ck, drcD, 4*K); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, float, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(5, element_filter_type, wiT, scratch_wiT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wcT, scratch_wcT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wfT, scratch_wfT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, woT, scratch_woT, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, riT, scratch_riT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rcT, scratch_rcT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rfT, scratch_rfT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, roT, scratch_roT, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +float *dout_ptr = NULL; +/* batch-reduce gemm kernels: taken from the handle; the dispatch calls survive only as the trailing comments */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; /*libxsmm_bsmmdispatch_reducebatch_addr( bc, bn, 
bk, &bc, &K, &C, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL);*/ +libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->bwdupd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL);*/ + +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size: work_nk divided by the thread count, rounded up (same pattern for nc, ck and kk below) */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work_nk */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end in units of K elements (16 per task), both clamped to K */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? 
(ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; +__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0;/* NOTE(review): the report at the end divides the cycle counters by 2.5 * 1e9, presumably an assumed 2.5 GHz clock - confirm */ +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0;/* 1 when bc and bk are both multiples of 16; not referenced again in this fragment - NOTE(review): presumably read by the included core template, confirm */ + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* Hoist tileconfig if possible: the call is made here only when bk, bc and bn are all multiples of 32 */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} + +BF = handle->bwdupd_block;/* overwrites whatever the K-based selection above produced */ +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning: libxsmm_internal_matrix_zero on dxD (N*C*t elements) for BWD and BWDUPD */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning: libxsmm_internal_matrix_zero on w_scratch (C*K*4), r_scratch (K*K*4) and db (K*4) for UPD and BWDUPD */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W: each thread handles block indices thr_begin_ck up to thr_end_ck - 1; ic is the quotient and ik the remainder of ikic by K/bk */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, 
kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wi, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wc, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wf, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(2, wo, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* transpose R: here ik is the quotient and ic the remainder of ikic by K/bk (the reverse of the W loop above) */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ri, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rc, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid);/* barrier wait placed between the W and R transposes above and the included core template below */ + +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c" + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in CK format and downconvert to bf16: sources are the float scratch views (dwi, dri, ...), destinations the matching _ck views */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; ++jc) { + for (jk = 0; jk < bk; jk += 16) {/* NOTE(review): jk advances 16 columns per store call, which presumably requires bk to be a multiple of 16 - confirm; the same holds for the R loop below */ + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, 
dwi_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, dwc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, dwf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, dwo_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk))); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; ++jc) { + for (jk = 0; jk < bk; jk += 16) { + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, dri_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, drc_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, drf_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_storecvt_fp32_bf16(&LIBXSMM_VLA_ACCESS(2, dro_ck, ic+jc, ik+jk , K4), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk))); + } + } + } +#else + /* TODO: Add here non AVX512 replacement code */ +#endif + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c new file mode 100644 index 00000000..272a22b3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck.tpl.c @@ -0,0 +1,306 @@ +/****************************************************************************** +* 
Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wt = (element_filter_type*)handle->wt->data; +element_filter_type *rt = (element_filter_type*)handle->rt->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +element_output_type *doutD = (element_output_type*)handle->scratch_deltat; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +#if 0 +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +#endif +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +element_filter_type *witD = &(wt[0]); +element_filter_type *wctD = &(wt[C*K]); +element_filter_type *wftD = &(wt[2*C*K]); +element_filter_type *wotD = &(wt[3*C*K]); +element_filter_type *ritD = &(rt[0]); +element_filter_type *rctD = &(rt[K*K]); +element_filter_type *rftD = &(rt[2*K*K]); +element_filter_type *rotD = &(rt[3*K*K]); +element_filter_type *dwiD = 
&(dw[0]); +element_filter_type *dwcD = &(dw[C*K]); +element_filter_type *dwfD = &(dw[2*C*K]); +element_filter_type *dwoD = &(dw[3*C*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K*K]); +element_filter_type *drfD = &(dr[2*K*K]); +element_filter_type *droD = &(dr[3*K*K]); +element_output_type *dbi = &(db[0]); +element_output_type *dbc = &(db[K]); +element_output_type *dbf = &(db[2*K]); +element_output_type *dbo = &(db[3*K]); +#if 0 +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +#endif +element_output_type *t1D = (element_output_type*)handle->scratch_t1; +element_output_type *t2D = (element_output_type*)handle->scratch_t2; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K); +LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +#if 0 +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); +#endif +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); 
+LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, dwi, dwiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwf, dwfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwo, dwoD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dwc, dwcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dri, driD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drf, drfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dro, droD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, drc, drcD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(4, element_filter_type, wiT, witD, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wcT, wctD, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, wfT, wftD, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, woT, wotD, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, riT, ritD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rcT, rctD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, 
element_filter_type, rfT, rftD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, roT, rotD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +element_output_type *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb1 = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc1 = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); + +/* Auxiliary arrays for batch-reduce gemm calls */ +const element_filter_type *A_array[1024]; +const element_output_type *B_array[1024]; + +LIBXSMM_VLA_DECL(4, element_output_type, diB, (element_output_type*)handle->scratch_diB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, kBlocks, bn, bk); + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = 
(N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K;__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? ((ltid + 1) * chunksize_k) : K; +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 
1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, dw, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, dr, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +/* Here we assume that the weight tensors come in transposed from framework */ +#if 0 +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wiT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(4, wcT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(4, wfT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(4, woT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, wo, ik, ic, jc, jk, cBlocks, bc, bk); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < 
bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, riT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(4, rcT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(4, rfT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(4, roT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, ro, ik, ic, jc, jk, kBlocks, bk, bk); + } + } +} +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c" + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, 
t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c new file mode 100644 index 00000000..1f43f01a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16.tpl.c @@ -0,0 +1,447 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = handle->lpb; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wt = (element_filter_type*)handle->wt->data; +element_filter_type *rt = (element_filter_type*)handle->rt->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db_bf16 = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +float *dxD = (float*)handle->scratch_dx; +float *doutD = (float*)handle->scratch_deltat; +float *dhpD_f32 = (float*)handle->scratch_dhp; +float *db = (float*)handle->scratch_db; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +#if 0 +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +#endif +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +float *w_scratch = (float*)handle->scratch_w; +float *r_scratch = (float*)handle->scratch_r; +element_filter_type *witD = &(wt[0]); +element_filter_type *wctD = &(wt[C*K]); +element_filter_type *wftD = &(wt[2*C*K]); +element_filter_type *wotD = 
&(wt[3*C*K]); +element_filter_type *ritD = &(rt[0]); +element_filter_type *rctD = &(rt[K*K]); +element_filter_type *rftD = &(rt[2*K*K]); +element_filter_type *rotD = &(rt[3*K*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[C*K]); +element_filter_type *dwfD = &(dw[2*C*K]); +element_filter_type *dwoD = &(dw[3*C*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K*K]); +element_filter_type *drfD = &(dr[2*K*K]); +element_filter_type *droD = &(dr[3*K*K]); +float *dwiD_scratch = &(w_scratch[0]); +float *dwcD_scratch = &(w_scratch[C*K]); +float *dwfD_scratch = &(w_scratch[2*C*K]); +float *dwoD_scratch = &(w_scratch[3*C*K]); +float *driD_scratch = &(r_scratch[0]); +float *drcD_scratch = &(r_scratch[K*K]); +float *drfD_scratch = &(r_scratch[2*K*K]); +float *droD_scratch = &(r_scratch[3*K*K]); +float *dbi = &(db[0]); +float *dbc = &(db[K]); +float *dbf = &(db[2*K]); +float *dbo = &(db[3*K]); +element_output_type *dbi_bf16 = &(db_bf16[0]); +element_output_type *dbc_bf16 = &(db_bf16[K]); +element_output_type *dbf_bf16 = &(db_bf16[2*K]); +element_output_type *dbo_bf16 = &(db_bf16[3*K]); +#if 0 +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +#endif +/*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ +/*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ +/* multidimensional arrays */ +/*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ +/*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, 
element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +#if 0 +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +#endif +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); +LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); +LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(5, element_filter_type, dwi_bf16, dwiD, cBlocks, bc_lp, bk, lpb); 
+LIBXSMM_VLA_DECL(5, element_filter_type, dwc_bf16, dwcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwf_bf16, dwfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwo_bf16, dwoD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dri_bf16, driD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drc_bf16, drcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drf_bf16, drfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dro_bf16, droD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, float, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(5, element_filter_type, wiT, witD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wcT, wctD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wfT, wftD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, woT, wotD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, riT, ritD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rcT, rctD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, 
element_filter_type, rfT, rftD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, roT, rotD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +float *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? 
((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? 
((ltid + 1) * k_chunksize * 16) : K; +__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} + +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +/* Here we assume that the weight tensors come in transposed from framework */ +#if 0 +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = 
LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + } + } +} +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c" + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in KCCK bf16 format and downcovert to bf16 */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + /* Below is the commented reference code */ +#if 0 + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; jc++) { + for (jk = 0; jk < bk; jk++) { + libxsmm_bfloat16_hp tmp; + 
tmp.f = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; jc++) { + for (jk = 0; jk < bk; jk++) { + libxsmm_bfloat16_hp tmp; + tmp.f = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + } + } + } +#endif + __m512 a01, b01; + __m512i c01; + const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; 
jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, 
ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + a01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc+1, jk, cBlocks, bc, bk)); + b01 = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, cBlocks, bc, bk)); + c01 = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(a01, b01); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } +#else + /* TODO: Add here non AVX512 replacement code */ + LIBXSMM_UNUSED(thr_begin_kk); + LIBXSMM_UNUSED(thr_begin_ck); + LIBXSMM_UNUSED(ikic); + LIBXSMM_UNUSED(jk); + LIBXSMM_UNUSED(jc); + LIBXSMM_UNUSED(thr_end_ck); + LIBXSMM_UNUSED(thr_end_kk); +#endif + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + 
reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c new file mode 100644 index 00000000..1eada735 --- /dev/null +++ 
b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_bf16_amx.tpl.c @@ -0,0 +1,441 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = handle->lpb; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wt = (element_filter_type*)handle->wt->data; +element_filter_type *rt = (element_filter_type*)handle->rt->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db_bf16 = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +float *dxD = (float*)handle->scratch_dx; +float *doutD = (float*)handle->scratch_deltat; +float *dhpD_f32 = (float*)handle->scratch_dhp; +float *db = (float*)handle->scratch_db; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +#if 0 +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +#endif +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +float *w_scratch = (float*)handle->scratch_w; +float *r_scratch = (float*)handle->scratch_r; +element_filter_type *witD = &(wt[0]); +element_filter_type *wctD = &(wt[C*K]); +element_filter_type *wftD = &(wt[2*C*K]); +element_filter_type *wotD = 
&(wt[3*C*K]); +element_filter_type *ritD = &(rt[0]); +element_filter_type *rctD = &(rt[K*K]); +element_filter_type *rftD = &(rt[2*K*K]); +element_filter_type *rotD = &(rt[3*K*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[C*K]); +element_filter_type *dwfD = &(dw[2*C*K]); +element_filter_type *dwoD = &(dw[3*C*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K*K]); +element_filter_type *drfD = &(dr[2*K*K]); +element_filter_type *droD = &(dr[3*K*K]); +float *dwiD_scratch = &(w_scratch[0]); +float *dwcD_scratch = &(w_scratch[C*K]); +float *dwfD_scratch = &(w_scratch[2*C*K]); +float *dwoD_scratch = &(w_scratch[3*C*K]); +float *driD_scratch = &(r_scratch[0]); +float *drcD_scratch = &(r_scratch[K*K]); +float *drfD_scratch = &(r_scratch[2*K*K]); +float *droD_scratch = &(r_scratch[3*K*K]); +float *dbi = &(db[0]); +float *dbc = &(db[K]); +float *dbf = &(db[2*K]); +float *dbo = &(db[3*K]); +element_output_type *dbi_bf16 = &(db_bf16[0]); +element_output_type *dbc_bf16 = &(db_bf16[K]); +element_output_type *dbf_bf16 = &(db_bf16[2*K]); +element_output_type *dbo_bf16 = &(db_bf16[3*K]); +#if 0 +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +#endif +/*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ +/*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ +/* multidimensional arrays */ +/*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ +/*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, 
element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +#if 0 +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +#endif +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +LIBXSMM_VLA_DECL(3, float, dx, dxD, N, C); +LIBXSMM_VLA_DECL(3, element_input_type, dx_bf16, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, dcp, dcsp, K); +LIBXSMM_VLA_DECL(2, element_input_type, dhp, dhpD, K); +LIBXSMM_VLA_DECL(2, float, dhp_f32, dhpD_f32, K); +LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(5, element_filter_type, dwi_bf16, dwiD, cBlocks, bc_lp, bk, lpb); 
+LIBXSMM_VLA_DECL(5, element_filter_type, dwc_bf16, dwcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwf_bf16, dwfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwo_bf16, dwoD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dri_bf16, driD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drc_bf16, drcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drf_bf16, drfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dro_bf16, droD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, dcs, dcsD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(2, element_output_type, di, diD, K); +LIBXSMM_VLA_DECL(2, element_output_type, df, dfD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dp, doD, K); +LIBXSMM_VLA_DECL(2, element_output_type, dci, dciD, K); +LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, float, dout, doutD, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(5, element_filter_type, wiT, witD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wcT, wctD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wfT, wftD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, woT, wotD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, riT, ritD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rcT, rctD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, 
element_filter_type, rfT, rftD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, roT, rotD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +float *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; /*libxsmm_bsmmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL);*/ +libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->bwdupd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL);*/ + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? 
((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? 
((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end for the K dimension, split across threads in chunks of 16 elements and clamped to K */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; +__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ? 1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* Hoist tileconfig if possible (only when bk, bc and bn are all multiples of 32) */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Block the reduction domain if it is too large: start from BF = 8 (1024 < K <= 2048) or BF = 16 (K > 2048) and decrement until BF divides kBlocks */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} + +BF = handle->bwdupd_block; /* NOTE: unconditionally replaces the BF chosen by the K-based heuristic above */ +KB_BLOCKS = kBlocks/BF; + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); +} + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch,
start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +/* Here we assume that the weight tensors come in transposed from framework */ +#if 0 +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + } + } +} +#ifdef 
PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c" + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in KCCK bf16 format and down-convert to bf16 */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + /* Below is the element-by-element reference code, compiled out via #if 0 */ +#if 0 + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; jc++) { + for (jk = 0; jk < bk; jk++) { + libxsmm_bfloat16_hp tmp; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk); + LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = tmp.i[1]; + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; jc++) { + for (jk = 0; jk < bk; jk++) { + libxsmm_bfloat16_hp tmp; + tmp.f = LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk,
lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + tmp.f = LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk); + LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = tmp.i[1]; + } + } + } +#endif + __m512i c01; + const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bc; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, 
c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc+1, jk, kBlocks, bk, 
bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } +#else + /* TODO: Add here non AVX512 replacement code */ + LIBXSMM_UNUSED(thr_begin_kk); + LIBXSMM_UNUSED(thr_begin_ck); + LIBXSMM_UNUSED(ikic); + LIBXSMM_UNUSED(jk); + LIBXSMM_UNUSED(jc); + LIBXSMM_UNUSED(thr_end_ck); + LIBXSMM_UNUSED(thr_end_kk); +#endif + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, 
t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c new file mode 100644 index 00000000..f7b04c0f --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core.tpl.c @@ -0,0 +1,526 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) + /* Compute dcp, dci, di, df, dp */ + cps_ptr = (j == 0) ? 
&LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + if (bcbk_multiples_of_16) { + if (K % 2048 != 0 || LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) { +#include "libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c" + } else { + /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... */ +#include "libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c" + } + } else { + /* compute dhp */ + if (j == t-1) { + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } + /* compute dcp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (j == t-1) { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); + } + /* compute dci */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, 
t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K) ); + /* compute di */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + /* compute df */ + if (j == 0) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + } + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + /* compute dp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + 
libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); + /* update dcp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); + } +#else + /* compute dhp */ + if (j == t-1) { + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) ); + } + /* compute dcp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + if (j == t-1) { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); + } else { + libxsmm_internal_matrix_add_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); + } + /* compute dci */ + 
libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_square_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K) ); + /* compute di */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K), &LIBXSMM_VLA_ACCESS(2, di, in, ik, K) ); + /* compute df */ + if (j == 0) { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + } else { + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + } + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), 
&LIBXSMM_VLA_ACCESS(2, df, in, ik, K), &LIBXSMM_VLA_ACCESS(2, df, in, ik, K) ); + /* compute dp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K) ); + libxsmm_internal_matrix_complement_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, t1, in, ik, K), &LIBXSMM_VLA_ACCESS(2, t2, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K) ); + /* update dcp */ + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K), &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K) ); +#endif + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + eltwise_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + in = (icin / (C/bc))*bn; + ic = (icin % (C/bc))*bc; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < 
thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); + } + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + act_trans_cycles += _end - _start; + } +#endif + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* dx = W^T * difoc */ + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic % (N/bn))*bn; + icb = inic / (N/bn); + ic = icb*bc; + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wiT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wcT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wfT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); + + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik += bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, woT, icb, ikb + 
KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C) , &blocks); + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dx_cycles += _end - _start; + } +#endif + } + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + + dout_ptr = (j > 0) ? (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (element_output_type*) &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K); + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, dout_ptr); + /* dout += R^T * difoc */ + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, riT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, di, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rcT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rfT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, df, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, roT, ikb, icb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = 
&LIBXSMM_VLA_ACCESS(2, dp, in, ic + KB*KB_BLOCKS*bk, K); + } + /* Reduce batch gemm call */ + batchreduce_kerneld(A_array, B_array, dout_ptr, &blocks); + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dout_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { + if (K % 2048 != 0) { + /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, 
ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } else { + /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dci, df and dp */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + blocks = nBlocks; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; 
in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } + } else { + /* dr = difoc * h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in 
< N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + batchreduce_kernelb1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = difoc * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + B_array[inb] = 
&LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + batchreduce_kernelc1(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dwdr_cycles += _end - _start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* gradient bias */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bcbk_multiples_of_16) { + for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { + dbi_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); + dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); + dbo_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); + dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); + for (in = 0; in < N; in++) { + dbi_sum = _mm512_add_ps(dbi_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, di, in, ik, K))); + dbf_sum = _mm512_add_ps(dbf_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, df, in, ik, K))); + dbo_sum = _mm512_add_ps(dbo_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, dp, in, ik, K))); + dbc_sum = _mm512_add_ps(dbc_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(2, dci, in, ik, K))); + } + _mm512_storeu_ps(&dbi[ik], dbi_sum); + _mm512_storeu_ps(&dbf[ik], dbf_sum); + _mm512_storeu_ps(&dbo[ik], dbo_sum); + _mm512_storeu_ps(&dbc[ik], dbc_sum); + } + } else { + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + dbo[ik] += LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + dbc[ik] += LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + } + } + } +#else + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { 
+ dbi[ik] += LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + dbf[ik] += LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + dbo[ik] += LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + dbc[ik] += LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + } + } +#endif +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + gradient_cycles += _end - _start; + } +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c new file mode 100644 index 00000000..42789b49 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16.tpl.c @@ -0,0 +1,343 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) 
+******************************************************************************/ + +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const src = _src; \ + libxsmm_bfloat16 *const dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i])); \ + _mm512_storeu_si512(&dst[(__j*ld)+__i], packed_result); \ + } \ + } \ +} while (0) + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) + /* Compute dcp, dci, di, df, dp */ + cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + if (bcbk_multiples_of_16) { + /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... 
*/ +#include "libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c" + } else { + /* TODO: Add alternative path here */ + } +#else + /* TODO: Add alternative path here */ +#endif + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + eltwise_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + in = (icin / (C/bc))*bn; + ic = (icin % (C/bc))*bc; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); + } + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + act_trans_cycles += _end - _start; + } +#endif + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* dx = W^T * difoc */ + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic % (N/bn))*bn; + icb = inic / (N/bn); + ic = 
icb*bc; + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wiT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wcT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wfT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, woT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + /* If last block, make sure we downconvert dx to bf16 */ + if (KB == BF-1) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bc, bn, C, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &LIBXSMM_VLA_ACCESS(3, dx_bf16, j, in, ic, N, C)); + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dx_cycles += _end - _start; + } +#endif + } + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + dout_ptr = (j > 0) ? 
(float*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (float*) &LIBXSMM_VLA_ACCESS(2, dhp_f32, in, ik, K); + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, dout_ptr); + /* dout += R^T * difoc */ + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, riT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rcT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rfT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, roT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + /* Make sure when last and j == 0 to downconvert dhp to BF16 */ + if ((j == 0) && (KB == BF-1)) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, dout_ptr, &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K)); + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dout_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + blocks = nBlocks; + if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { + /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dci, df and db */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + 
batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } else { + /* dr = difoc * h^T */ + /* Use blocked format for di, dci, df and db */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + 
&LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = difoc * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dwdr_cycles += _end - _start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* gradient bias */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bcbk_multiples_of_16) { + for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { + dbi_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); + dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); + dbo_sum = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); + dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); + for (in = 0; in < N; in++) { + dbi_sum = _mm512_add_ps(dbi_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, di, in, ik, K)))); + dbf_sum = _mm512_add_ps(dbf_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, df, in, ik, K)))); + dbo_sum = _mm512_add_ps(dbo_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dp, in, ik, K)))); + dbc_sum = _mm512_add_ps(dbc_sum, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&LIBXSMM_VLA_ACCESS(2, dci, in, ik, K)))); + } + _mm512_storeu_ps(&dbi[ik], dbi_sum); + _mm512_storeu_ps(&dbf[ik], dbf_sum); + _mm512_storeu_ps(&dbo[ik], dbo_sum); + _mm512_storeu_ps(&dbc[ik], dbc_sum); + /* Downconvert delta bias to bf16 if done with all timesteps */ + if (j == 0) { + _mm256_storeu_si256((__m256i*)&dbi_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbi_sum)); + _mm256_storeu_si256((__m256i*)&dbf_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbf_sum)); + _mm256_storeu_si256((__m256i*)&dbo_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbo_sum)); + _mm256_storeu_si256((__m256i*)&dbc_bf16[ik], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(dbc_sum)); + } + } + } else { + /* TODO: Add alternative path here */ + } +#else + /* TODO: Add alternative path here */ +#endif +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + gradient_cycles += _end - _start; + } +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c new file mode 100644 index 00000000..49722273 --- /dev/null +++ 
b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_nc_kcck_core_bf16_amx.tpl.c @@ -0,0 +1,342 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const __src = _src; \ + libxsmm_bfloat16 *__dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i])); \ + _mm512_storeu_si512((libxsmm_bfloat16*)&__dst[(__j*ld)+__i], (__m512i) __packed_result); \ + } \ + } \ +} while (0) + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) + /* Compute dcp, dci, di, df, dp */ + cps_ptr = (j == 0) ? &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + if (bcbk_multiples_of_16) { + /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... 
*/ +#include "libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c" + } else { + /* TODO: Add alternative path here */ + } +#else + /* TODO: Add alternative path here */ +#endif + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + eltwise_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + in = (icin / (C/bc))*bn; + ic = (icin % (C/bc))*bc; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, j, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + in = (ikin / (K/bk))*bn; + ik = (ikin % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, j-1, en, ek, N, K); + } + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + act_trans_cycles += _end - _start; + } +#endif + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* dx = W^T * difoc */ + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic % (N/bn))*bn; + icb = inic / (N/bn); + ic = 
icb*bc; + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wiT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wcT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wfT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, woT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &blocks); + + /* If last block, make sure we downconvert dx to bf16 */ + if (KB == BF-1) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bc, bn, C, &LIBXSMM_VLA_ACCESS(3, dx, j, in, ic, N, C), &LIBXSMM_VLA_ACCESS(3, dx_bf16, j, in, ic, N, C)); + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dx_cycles += _end - _start; + } +#endif + } + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + dout_ptr = (j > 0) ? 
(float*) &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K) : (float*) &LIBXSMM_VLA_ACCESS(2, dhp_f32, in, ik, K); + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, K, dout_ptr); + /* dout += R^T * difoc (all four gate products below accumulate into the same dout_ptr block) */ + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, riT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, di, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rcT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, dci, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rfT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, df, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, roT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, dp, in, KB*KB_BLOCKS*bk, K), + dout_ptr, &blocks); + + /* On the last KB block of timestep j == 0 (where dout_ptr points into dhp_f32), downconvert the fp32 result into the BF16 dhp tensor */ + if ((j == 0) && (KB == BF-1)) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, dout_ptr, &LIBXSMM_VLA_ACCESS(2, dhp, in, ik, K)); + } + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dout_cycles += _end - _start; + } +#endif + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + blocks = nBlocks; + if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { + /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dci, df and dp (the diB, dciB, dfB and dpB buffers) */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + +
batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } else { + /* dr = difoc * h^T (computed in its own loop over the K x K block grid; dw follows in a separate loop) */ + /* Use blocked format for di, dci, df and dp (the diB, dciB, dfB and dpB buffers) */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), +
&LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = difoc * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, xT, ic, 0, N), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + dwdr_cycles += _end - _start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* gradient bias */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bcbk_multiples_of_16) { + for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { + dbi_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); + dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); + dbo_sum = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); + dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); + for (in = 0; in < N; in++) { + dbi_sum = _mm512_add_ps(dbi_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(2, di, in, ik, K))); + dbf_sum = _mm512_add_ps(dbf_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(2, df, in, ik, K))); + dbo_sum = _mm512_add_ps(dbo_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(2, dp, in, ik, K))); + dbc_sum = _mm512_add_ps(dbc_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(2, dci, in, ik, K))); + } + _mm512_store_ps(&dbi[ik], dbi_sum); + _mm512_store_ps(&dbf[ik], dbf_sum); + _mm512_store_ps(&dbo[ik], dbo_sum); + _mm512_store_ps(&dbc[ik], dbc_sum); + /* Downconvert delta bias to bf16 if done with all timesteps */ + if (j == 0) { + _mm512_storecvt_fp32_bf16(&dbi_bf16[ik], dbi_sum); + _mm512_storecvt_fp32_bf16(&dbf_bf16[ik], dbf_sum); + _mm512_storecvt_fp32_bf16(&dbo_bf16[ik], dbo_sum); + _mm512_storecvt_fp32_bf16(&dbc_bf16[ik], dbc_sum); + } + } + } else { + /* TODO: Add alternative path here */ + } +#else + /* TODO: Add alternative path here */ +#endif +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + gradient_cycles += _end - _start; + } +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c new file mode 100644 index 00000000..c731115b --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_bf16_amx.tpl.c @@ -0,0 +1,366 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, inb, icb, jk, jb, jc, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = handle->lpb; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +const int bn_lp = bn/lpb; +unsigned long long blocks; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wt = (element_filter_type*)handle->wt->data; +element_filter_type *rt = (element_filter_type*)handle->rt->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = handle->ht ? 
(element_output_type*)handle->ht->data : (element_output_type*)NULL; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_input_type *dcsp = (element_input_type* )handle->dcsp->data; +element_input_type *dhpD = (element_input_type* )handle->dhp->data; +element_filter_type *dw = (element_filter_type*)handle->dw->data; +element_filter_type *dr = (element_filter_type*)handle->dr->data; +element_output_type *db_bf16 = (element_output_type*)handle->db->data; +element_output_type *dcsD = (element_output_type*)handle->dcs->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *diD = (element_output_type*)handle->scratch_di; +element_output_type *dfD = (element_output_type*)handle->scratch_df; +element_output_type *doD = (element_output_type*)handle->scratch_do; +element_output_type *dciD = (element_output_type*)handle->scratch_dci; +float *dxD = (float*)handle->scratch_dx; +float *doutD = (float*)handle->scratch_deltat; +float *dhpD_f32 = (float*)handle->scratch_dhp; +float *db = (float*)handle->scratch_db; +element_input_type *scratch_xT = (element_input_type* )handle->scratch_xT; +#if 0 +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +#endif +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +float *w_scratch = (float*)handle->scratch_w; +float *r_scratch = (float*)handle->scratch_r; +element_filter_type *witD = &(wt[0]); +element_filter_type *wctD = &(wt[C*K]); +element_filter_type *wftD = &(wt[2*C*K]); +element_filter_type *wotD = 
&(wt[3*C*K]); +element_filter_type *ritD = &(rt[0]); +element_filter_type *rctD = &(rt[K*K]); +element_filter_type *rftD = &(rt[2*K*K]); +element_filter_type *rotD = &(rt[3*K*K]); +element_filter_type *dwiD = &(dw[0]); +element_filter_type *dwcD = &(dw[C*K]); +element_filter_type *dwfD = &(dw[2*C*K]); +element_filter_type *dwoD = &(dw[3*C*K]); +element_filter_type *driD = &(dr[0]); +element_filter_type *drcD = &(dr[K*K]); +element_filter_type *drfD = &(dr[2*K*K]); +element_filter_type *droD = &(dr[3*K*K]); +float *dwiD_scratch = &(w_scratch[0]); +float *dwcD_scratch = &(w_scratch[C*K]); +float *dwfD_scratch = &(w_scratch[2*C*K]); +float *dwoD_scratch = &(w_scratch[3*C*K]); +float *driD_scratch = &(r_scratch[0]); +float *drcD_scratch = &(r_scratch[K*K]); +float *drfD_scratch = &(r_scratch[2*K*K]); +float *droD_scratch = &(r_scratch[3*K*K]); +float *dbi = &(db[0]); +float *dbc = &(db[K]); +float *dbf = &(db[2*K]); +float *dbo = &(db[3*K]); +element_output_type *dbi_bf16 = &(db_bf16[0]); +element_output_type *dbc_bf16 = &(db_bf16[K]); +element_output_type *dbf_bf16 = &(db_bf16[2*K]); +element_output_type *dbo_bf16 = &(db_bf16[3*K]); +#if 0 +element_filter_type *scratch_wiT = &(scratch_wT[0]); +element_filter_type *scratch_wcT = &(scratch_wT[C*K]); +element_filter_type *scratch_wfT = &(scratch_wT[2*C*K]); +element_filter_type *scratch_woT = &(scratch_wT[3*C*K]); +element_filter_type *scratch_riT = &(scratch_rT[0]); +element_filter_type *scratch_rcT = &(scratch_rT[K*K]); +element_filter_type *scratch_rfT = &(scratch_rT[2*K*K]); +element_filter_type *scratch_roT = &(scratch_rT[3*K*K]); +#endif +/*element_output_type *t1D = (element_output_type*)handle->scratch_t1;*/ +/*element_output_type *t2D = (element_output_type*)handle->scratch_t2;*/ +/* multidimensional arrays */ +/*LIBXSMM_VLA_DECL(2, element_output_type, t1, t1D, K);*/ +/*LIBXSMM_VLA_DECL(2, element_output_type, t2, t2D, K);*/ +LIBXSMM_VLA_DECL(5, element_input_type, x, xt, nBlocks, cBlocks, bn, bc); 
+LIBXSMM_VLA_DECL(4, element_input_type, cp, csp, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_input_type, hp, hpD, kBlocks, bn, bk); +#if 0 +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +#endif +LIBXSMM_VLA_DECL(5, element_output_type, cs, cst, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, h, ht, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, i, it, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, f, ft, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, o, ot, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, ci, cit, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, co, cot, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, float, dx, dxD, nBlocks, cBlocks, bn, bc); +LIBXSMM_VLA_DECL(5, element_input_type, dx_bf16, dxt, nBlocks, cBlocks, bn, bc); +LIBXSMM_VLA_DECL(4, element_input_type, dcp, dcsp, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_input_type, dhp, dhpD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, float, dhp_f32, dhpD_f32, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, float, dwi, dwiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwf, dwfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwo, dwoD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dwc, dwcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, float, dri, driD_scratch, kBlocks, bk, bk); 
+LIBXSMM_VLA_DECL(4, float, drf, drfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, dro, droD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, float, drc, drcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(5, element_filter_type, dwi_bf16, dwiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwc_bf16, dwcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwf_bf16, dwfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dwo_bf16, dwoD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dri_bf16, driD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drc_bf16, drcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, drf_bf16, drfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, dro_bf16, droD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(4, element_output_type, dcs, dcsD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, dh, dht, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, di, diD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, df, dfD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dp, doD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_output_type, dci, dciD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, diB, (element_output_type*)handle->scratch_diB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dfB, (element_output_type*)handle->scratch_dfB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dpB, (element_output_type*)handle->scratch_dpB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_output_type, dciB, (element_output_type*)handle->scratch_dciB, nBlocks, bn_lp, bk, lpb); +LIBXSMM_VLA_DECL(4, float, dout, doutD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_input_type, xT, scratch_xT, nBlocks, bc, bn); +LIBXSMM_VLA_DECL(5, element_filter_type, wiT, witD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, 
element_filter_type, wcT, wctD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wfT, wftD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, woT, wotD, kBlocks, bk_lp, bc, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, riT, ritD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rcT, rctD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rfT, rftD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, roT, rotD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(4, element_output_type, hT, scratch_hT, nBlocks, bk, bn); +float *dout_ptr = NULL; +/* define batch-reduce gemm kernels */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->bwdupd_kernela; /*libxsmm_bsmmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->bwdupd_kernelb; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bk, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelc = handle->bwdupd_kernelc; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bc, bn, &bk, &N, &bk, NULL, NULL, &kernel_flags, NULL);*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kerneld = handle->bwdupd_kerneld; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL);*/ +libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->bwdupd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL);*/ + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute this thread's half-open task range [thr_begin, thr_end), clamped to the total work */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks */ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size: ceil(work_nc / threads) */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute this thread's half-open task range [thr_begin, thr_end), clamped to the total work */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks */ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size: ceil(work_ck / threads) */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute this thread's half-open task range [thr_begin, thr_end), clamped to the total work */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks */ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size: ceil(work_kk / threads) */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ?
(work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute this thread's half-open task range [thr_begin, thr_end), clamped to the total work */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +element_output_type *cps_ptr = NULL; +int k_tasks = K/16; /* one task per group of 16 consecutive K elements */ +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end as element offsets into K (task index * 16), clamped to K */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; +__m512 dbi_sum, dbf_sum, dbo_sum, dbc_sum; +#endif +#ifdef PROFILE +__int64_t _start, _end, eltwise_cycles = 0, dout_cycles = 0, weight_trans_cycles = 0, act_trans_cycles = 0, dx_cycles = 0, dwdr_cycles = 0, gradient_cycles = 0, reformat_cycles = 0; +float total_time = 0.0; +#endif +int bcbk_multiples_of_16 = ((bc % 16 == 0) && (bk % 16 == 0)) ?
1 : 0; + +libxsmm_blasint ikic, inic, inik, icin, ikin; +__m512i c01; +const __m512i perm_index = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + +/* Hoist tileconfig if possible: configure once up front when bk, bc and bn are all multiples of 32 */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large. NOTE(review): the K-based choice of BF computed here is dead code, because BF is unconditionally overwritten with handle->bwdupd_block further down */ +BF = 1; +if (K > 1024 && K <= 2048) { + BF = 8; + while (kBlocks % BF != 0) { + BF--; + } +} + +if (K > 2048) { + BF = 16; + while (kBlocks % BF != 0) { + BF--; + } +} + +BF = handle->bwdupd_block; +KB_BLOCKS = kBlocks/BF; /* NOTE(review): assumes bwdupd_block is non-zero and divides kBlocks - confirm where the handle is set up */ + +/* zero the fp32 dx scratch (N*C*t elements) once up front; the work is split across threads via start_thread/tid */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxD, start_thread, tid, handle->desc.threads); +} + +/* zero the fp32 dw (4*C*K), dr (4*K*K) and db (4*K) scratch buffers once up front */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K*4, w_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K*4, r_scratch, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*4, db, start_thread, tid, handle->desc.threads); +} + +/* Here we assume that the weight tensors come in transposed from framework, so the explicit transposes below are compiled out */ +#if 0 +#ifdef PROFILE +if (ltid == 0) _start = _rdtsc(); +#endif +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(5, wiT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) =
LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, wfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, woT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bc, lpb) = LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, riT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rcT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, rfT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + LIBXSMM_VLA_ACCESS(5, roT, ic, ik, jk/lpb, jc, jk%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb); + } + } +} +#ifdef PROFILE +if (ltid == 0) { + _end = _rdtsc(); + weight_trans_cycles += _end - _start; +} +#endif +#endif + +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +#include "libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c" + +handle->tilerelease_kernel(NULL, NULL, NULL); + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#ifdef PROFILE + if (ltid == 0) _start = _rdtsc(); +#endif + /* Store result weight matrices in KCCK bf16 format and downconvert to bf16. NOTE(review): the AVX512 branch below is empty in this template, so no store/downconvert code is visible here - confirm it happens elsewhere */ +#if defined(LIBXSMM_RNN_CELL_AVX512) +#else + /* TODO: Add here non AVX512 replacement code */ + LIBXSMM_UNUSED(thr_begin_kk); + LIBXSMM_UNUSED(thr_begin_ck); + LIBXSMM_UNUSED(ikic); +
LIBXSMM_UNUSED(jk); + LIBXSMM_UNUSED(jc); + LIBXSMM_UNUSED(thr_end_ck); + LIBXSMM_UNUSED(thr_end_kk); +#endif + libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE + if (ltid == 0) { + _end = _rdtsc(); + reformat_cycles += _end - _start; + } +#endif +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM BWD/UPD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gradient_cycles+dwdr_cycles+dx_cycles+act_trans_cycles+weight_trans_cycles+dout_cycles+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Transpose weights time is %f ms (%.2f%%)\n", weight_trans_cycles/(2.5 * 1e9)*1000.0f, weight_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dx GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dx_cycles/(2.5 * 1e9)*1000.0f, dx_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*C*K*4/1e9/(dx_cycles/(2.5 * 1e9))); + printf("Dh GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dout_cycles/(2.5 * 1e9)*1000.0f, dout_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*N*K*K*4/1e9/(dout_cycles/(2.5 * 1e9))); + printf("Transpose input activations time is %f ms (%.2f%%)\n", act_trans_cycles/(2.5 * 1e9)*1000.0f, act_trans_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Dwdr GEMM time is %f ms (%.2f%%) at %f GFLOPS\n", dwdr_cycles/(2.5 * 1e9)*1000.0f, dwdr_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*2.0*(N*K*K*2.0+N*C*K*2.0)*2.0/1e9/(dwdr_cycles/(2.5 * 1e9))); + printf("Gradient bias calculation time is %f ms (%.2f%%)\n", gradient_cycles/(2.5 * 1e9)*1000.0f, gradient_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat dwdr time is %f ms (%.2f%%)\n\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); +} +#undef PROFILE +#endif + diff --git 
a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c new file mode 100644 index 00000000..94b535e1 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_bwdupd_ncnc_kcck_core_bf16_amx.tpl.c @@ -0,0 +1,405 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const __src = _src; \ + libxsmm_bfloat16 *__dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i])); \ + _mm512_storeu_si512((libxsmm_bfloat16*)&__dst[(__j*ld)+__i], (__m512i) __packed_result); \ + } \ + } \ +} while (0) + +for (j = t-1; j >= 0; --j) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + in = (inik % (N/bn))*bn; + ik = (inik / (N/bn))*bk; + /* Compute dcp, dci, di, df, dp */ + cps_ptr = (j == 0) ? 
&LIBXSMM_VLA_ACCESS(4, cp, inb, ikb, 0, 0, kBlocks, bn, bk) : &LIBXSMM_VLA_ACCESS(5, cs, j-1, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + /* Also reformat di, dci, df and dp to be used in the UPD pass in blocked format ... */ +#include "libxsmm_internal_lstm_bwdupd_fused_eltwise_ncnc_reformat_bf16.tpl.c" + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + inb = icin / (C/bc); + icb = icin % (C/bc); + if (bc == 32 && bk == 32) { + trans_act((short int*)&LIBXSMM_VLA_ACCESS(5, x, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), (short int*)&LIBXSMM_VLA_ACCESS(4, xT, icb, inb, 0, 0, nBlocks, bc, bn)); + } else { + in = inb*bn; + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + LIBXSMM_VLA_ACCESS(4, xT, icb, inb, jc, jb, nBlocks, bc, bn) = LIBXSMM_VLA_ACCESS(5, x, j, inb, icb, jb, jc, nBlocks, cBlocks, bn, bc); + } + } + } + } + + /* transpose ht for current timestep */ + if (j == 0) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + inb = ikin / (K/bk); + ikb = ikin % (K/bk); + if (bc == 32 && bk == 32) { + trans_act((short int*)&LIBXSMM_VLA_ACCESS(4, hp, inb, ikb, 0, 0, kBlocks, bn, bk), (short int*)&LIBXSMM_VLA_ACCESS(4, hT, ikb, inb, 0, 0, nBlocks, bk, bn)); + } else { + in = inb*bn; + ik = ikb*bk; + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + LIBXSMM_VLA_ACCESS(4, hT, ikb, inb, jk, jb, nBlocks, bk, bn) = LIBXSMM_VLA_ACCESS(4, hp, inb, ikb, jb, jk, kBlocks, bn, bk); + } + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + inb = ikin / (K/bk); + ikb = ikin % (K/bk); + if (bc == 32 && bk == 32) { + trans_act((short int*)&LIBXSMM_VLA_ACCESS(5, h, j-1, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), (short int*)&LIBXSMM_VLA_ACCESS(4, hT, ikb, inb, 0, 0, nBlocks, bk, bn)); + } else { + ik = ikb*bk; + in = inb*bn; + for (jk = 0; jk < bk; ++jk) { + for (jb 
= 0; jb < bn; ++jb) { + LIBXSMM_VLA_ACCESS(4, hT, ikb, inb, jk, jb, nBlocks, bk, bn) = LIBXSMM_VLA_ACCESS(5, h, j-1, inb, ikb, jb, jk, nBlocks, kBlocks, bn, bk); + } + } + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * difoc */ + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + inb = inic % (N/bn); + in = inb*bn; + icb = inic / (N/bn); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wiT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, di, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, dx, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wcT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, dci, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, dx, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wfT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, df, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, dx, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), &blocks); + + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, woT, icb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bc, lpb), + &LIBXSMM_VLA_ACCESS(4, dp, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, dx, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), &blocks); + + /* If last block, make sure we downconvert dx to bf16 */ + if (KB == BF-1) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bc, bn, bc, &LIBXSMM_VLA_ACCESS(5, dx, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc), &LIBXSMM_VLA_ACCESS(5, dx_bf16, j, inb, icb, 0, 0, nBlocks, cBlocks, bn, bc)); + } + } + } + } + + blocks = KB_BLOCKS; + for (KB = 0; KB < BF; KB++) { + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) 
{ + inb = inik % (N/bn); + in = inb*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + dout_ptr = (j > 0) ? (float*) &LIBXSMM_VLA_ACCESS(4, dout, inb, ikb, 0, 0, kBlocks, bn, bk) : (float*) &LIBXSMM_VLA_ACCESS(4, dhp_f32, inb, ikb, 0, 0, kBlocks, bn, bk); + + if (KB == 0) libxsmm_internal_matrix_zero_ld( bk, bn, bk, dout_ptr); + /* dout += R^T * difoc */ + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, riT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, di, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rcT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, dci, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, rfT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, df, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + dout_ptr, &blocks); + + batchreduce_kerneld(&LIBXSMM_VLA_ACCESS(5, roT, ikb, KB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, dp, inb, KB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + dout_ptr, &blocks); + + /* Make sure when last and j == 0 to downconvert dhp to BF16 */ + if ((j == 0) && (KB == BF-1)) { + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, bk, dout_ptr, &LIBXSMM_VLA_ACCESS(4, dhp, inb, ikb, 0, 0, kBlocks, bn, bk)); + } + } + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + blocks = nBlocks; + if ((C == K) && (bc == bk) && (bcbk_multiples_of_16 == 1)) { + /* Interleave computation of dr = difoc * h^T and dw = difoc * x^T to take advantage of temporal locality */ + /* Use blocked format for di, dci, df and db */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), 
+ &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), 
_mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc+1, jk, cBlocks, bc, bk)), 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } else { + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, 
nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dri, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dri_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drc, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drc_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, drf, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, drf_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), 
_mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hT, icb, 0, 0, 0, nBlocks, bk, bn), + &LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bk; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc+1, jk, kBlocks, bk, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dro, ikb, icb, jc, jk, kBlocks, bk, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dro_bf16, ikb, icb, jc/lpb, jk, 0, kBlocks, bk_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ikb = ikic % (K/bk); + ik = ikb*bk; + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, diB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bc; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwi, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwi_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dciB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bc; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) 
LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwc, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwc_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dfB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bc; jc+=2) { + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwf, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwf_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + + batchreduce_kernelc(&LIBXSMM_VLA_ACCESS(5, dpB, ikb, 0, 0, 0, 0, nBlocks, bn_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, xT, icb, 0, 0, 0, nBlocks, bc, bn), + &LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + if (j == 0) { + for (jc = 0; jc < bc; jc+=2) { /* dwo blocks are indexed [bc][bk]: bound the row pair index by bc, as for dwi/dwc/dwf */ + for (jk = 0; jk < bk; jk+=16) { + c01 = (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH( LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc+1, jk, cBlocks, bc, bk)), LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(4, dwo, ikb, icb, jc, jk, cBlocks, bc, bk))); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(5, dwo_bf16, ikb, icb, jc/lpb, jk, 0, cBlocks, bc_lp, bk, lpb), _mm512_permutexvar_epi16(perm_index, c01)); + } + } + } + } + } + + /* gradient bias */ + if (bcbk_multiples_of_16) { + for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { + dbi_sum = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbi[ik]); + dbf_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbf[ik]); + dbo_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbo[ik]); + dbc_sum = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&dbc[ik]); + for (in = 0; in < N; in++) { + dbi_sum = _mm512_add_ps(dbi_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, di, in/bn, ik/bk, in%bn, ik%bk, kBlocks, bn, bk))); + dbf_sum = _mm512_add_ps(dbf_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, df, in/bn, ik/bk, in%bn, ik%bk, kBlocks, bn, bk))); + dbo_sum = _mm512_add_ps(dbo_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, dp, in/bn, ik/bk, in%bn, ik%bk, kBlocks, bn, bk))); + dbc_sum = _mm512_add_ps(dbc_sum, _mm512_loadcvt_bf16_fp32(&LIBXSMM_VLA_ACCESS(4, dci, in/bn, ik/bk, in%bn, ik%bk, kBlocks, bn, bk))); + } + _mm512_store_ps(&dbi[ik], dbi_sum); + _mm512_store_ps(&dbf[ik], dbf_sum); + _mm512_store_ps(&dbo[ik], dbo_sum); + _mm512_store_ps(&dbc[ik], dbc_sum); + /* Downconvert delta bias to bf16 if done with all timesteps */ + if (j == 0) { + _mm512_storecvt_fp32_bf16(&dbi_bf16[ik], dbi_sum); + _mm512_storecvt_fp32_bf16(&dbf_bf16[ik], dbf_sum); + _mm512_storecvt_fp32_bf16(&dbo_bf16[ik], dbo_sum); + _mm512_storecvt_fp32_bf16(&dbc_bf16[ik], dbc_sum); + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..50ad74c6 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic.tpl.c @@ -0,0 +1,214 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; + +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); 
+element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); +element_filter_type *wiD_scratch = &(w_scratch[0]); +element_filter_type *wcD_scratch = &(w_scratch[C*K]); +element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *woD_scratch = &(w_scratch[3*C*K]); +element_filter_type *riD_scratch = &(r_scratch[0]); +element_filter_type *rcD_scratch = &(r_scratch[K*K]); +element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); +element_filter_type *roD_scratch = &(r_scratch[3*K*K]); +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD_scratch, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD_scratch, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wo_ck, woD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, 4*K); +LIBXSMM_VLA_DECL(2, 
element_filter_type, ro_ck, roD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, 4*K); +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +/* define batch-reduce gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); +/* Auxiliary arrays for batch-reduce gemms */ +const element_filter_type *A_array[1024]; +const element_input_type *B_array[1024]; +element_output_type *cps_ptr = NULL; + +/* parallelize over C-blocks */ +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? 
(work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +const int use_fused_implementation = (C == 2048 && K == 2048) ? 
1 : 0; + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} + +if (C == 2048 && K == 1024) { + BF = 2; +} + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +/* Upfront reformatting of W and R */ +/* reformat W */ +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wi, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, wc, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, wf, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, wo, ik, ic, jc, jk, cBlocks, bc, bk) = LIBXSMM_VLA_ACCESS(2, wo_ck, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* reformat R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, ri, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, rc, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, rf, ik, ic, jc, jk, kBlocks, bk, bk) = 
LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(4, ro, ik, ic, jc, jk, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(2, ro_ck, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif + +if (use_fused_implementation) { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c" +} else { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c new file mode 100644 index 00000000..ab013d3b --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16.tpl.c @@ -0,0 +1,283 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif +/* Convert n rows of m bf16 values (row stride ld) from _src to fp32 in _dst, 16 values per step; if m is not a multiple of 16 the last step touches elements past m. Temporaries are __-prefixed, as in the AMX variant of this template, so they cannot capture names used in the arguments, and macro parameters are parenthesized. */ +#define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ +do { \ + libxsmm_bfloat16 *__src = (_src); \ + float *__dst = (_dst); \ + libxsmm_blasint __i,__j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__dst[(__j*(ld))+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__src[(__j*(ld))+__i]))); \ + } \ + } \ +} while (0) +/* Write the m-entry bf16 vector _colv, converted to fp32, into each of the n rows of _srcdst (row stride ld). */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i,__j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__srcdst[(__j*(ld))+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__colv[__i]))); \ + } \ + } \ +} while (0) +/* Same as MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD, but const_bias is added to every converted value. */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i,__j; \ + __m512 __vbias = _mm512_set1_ps(const_bias); \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_add_ps(__vbias, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__colv[__i])))); \ + } \ + } \ +} while (0) + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, /*icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; 
+const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const int lpb = handle->lpb; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +unsigned long long blocks, blocksa, blocksb; +/* cBlocks and kBlocks are the block counts along C and K; bc_lp and bk_lp are bc and bk divided by lpb (all integer divisions) */ +/* define tensors: raw data pointers taken from the handle */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; +/* These buffers are scratch for fp32 output of gemms (intermediate results) */ +float *cst = (float*)handle->cst_scratch; +float *ht = (float*)handle->ht_scratch; +float *it = (float*)handle->it_scratch; +float *ft = (float*)handle->ft_scratch; +float *ot = (float*)handle->ot_scratch; +float *cit = (float*)handle->cit_scratch; +float *cot = (float*)handle->cot_scratch; +/* fp32 scratch for csp: csp also has to be upconverted since it is used in the elementwise functions */ +float *csp_f32 = (float*)handle->csp_scratch; +/* These are the output bf16 data (the float buffers above are their fp32 scratch counterparts) */ +element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; +element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; +element_output_type *it_bf16 = (element_output_type*)handle->it->data; +element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; +element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; +element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; +element_output_type *cot_bf16 = 
(element_output_type*)handle->cot->data; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]); /* the four w slices start at element offsets 0, K, 2*K and 3*K; they are viewed below with row stride 4*K */ +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); /* r is sliced the same way as w */ +element_filter_type *wiD_scratch = &(w_scratch[0]); +element_filter_type *wcD_scratch = &(w_scratch[C*K]); +element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *woD_scratch = &(w_scratch[3*C*K]); /* scratch: one contiguous region of C*K elements per slice */ +element_filter_type *riD_scratch = &(r_scratch[0]); +element_filter_type *rcD_scratch = &(r_scratch[K*K]); +element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); +element_filter_type *roD_scratch = &(r_scratch[3*K*K]); /* scratch: one contiguous region of K*K elements per slice */ +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); /* b is split into four consecutive runs of K elements */ +LIBXSMM_VLA_DECL(2, float, cp, csp_f32, K); +LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD_scratch, kBlocks, bk_lp, bk, lpb); /* wi..rc are the blocked views over the scratch copies; they are filled by the reformat loops further down */ +LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wo_ck, 
woD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, ro_ck, roD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, 4*K); /* the _ck views address the source weights as 2-D arrays with row stride 4*K */ +LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, float, h, ht, N, K); +LIBXSMM_VLA_DECL(3, float, i, it, N, K); +LIBXSMM_VLA_DECL(3, float, f, ft, N, K); +LIBXSMM_VLA_DECL(3, float, o, ot, N, K); +LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, float, co, cot, N, K); /* fp32 scratch views with trailing dimensions N and K */ +LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); /* output views with the same trailing dimensions */ +/* batch-reduce gemm kernels, taken ready-made from the handle */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; + +float *cps_ptr = NULL; + +/* parallelize over pairs of N-block and K-block: work = (N/bn) * (K/bk) */ +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* chunk size: work divided by the thread count, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread: thr_begin inclusive, thr_end exclusive, both clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* number of pairs of C-block and K-block; this range drives the W reformat loop further down */ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* chunk size: work_ck divided by the thread count, rounded up */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread, clamped to work_ck */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of pairs of K-block and K-block; this range drives the R reformat loop further down */ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* chunk size: work_kk divided by the thread count, rounded up */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread, clamped to work_kk */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; +/* the fused code path is selected only for C == 2048 and K == 2048 */ +const int use_fused_implementation = (C == 2048 && K == 2048) ? 
1 : 0; + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking of the reduction domain if it is too large: BF starts at 8 (or at 16 once C or K exceeds 2048) and is decremented until it divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +/* special case that overrides the search above. NOTE(review): there is no divisibility check here, so cBlocks/BF and kBlocks/BF below truncate if a block count is odd - confirm this cannot occur */ +if (C == 2048 && K == 1024) { + BF = 2; +} + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +/* Upfront reformatting of W and R */ +/* reformat W: every task copies one block pair (ic, ik) of all four slices from the 2-D source view (row stride 4*K) into the blocked scratch view indexed [ik][ic][jc/lpb][jk][jc%lpb] */ +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc;++jc) { + LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wo_ck, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* reformat R: same copy as for W, with bk-sized blocks in both dimensions; note that here ik is the quotient and ic the remainder of the task index */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = 
LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro_ck, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} + +/* Upconvert the cp input to fp32 that is used for elementwise stuff: every task converts the tile that starts at row in and column ik, passing bk, bn and the leading dimension K to the conversion helper */ +for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + libxsmm_internal_matrix_cvt_bf16_fp32_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); +} +/* barrier between the reformat and upconvert loops above and the included code below */ +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif +/* both implementations are textually included; use_fused_implementation picks one at run time */ +if (use_fused_implementation) { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c" +} else { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", (int)N, (int)C, (int)K, (int)bn, (int)bc, (int)bk ); /* libxsmm_blasint is not necessarily int, so the values are cast to match the conversion specifiers */ + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; /* cycle counts are converted to milliseconds assuming a 2.5 GHz clock */ + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(2.0*N*C*K)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); /* 2.0 comes first so the product is formed in double instead of overflowing libxsmm_blasint */ + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(2.0*N*K*K)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif +/* the conversion macros are local to this template */ +#undef MATRIX_CVT_BF16_FP32_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c new file mode 100644 index 00000000..b9cc50c3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_ck_generic_bf16_amx.tpl.c @@ -0,0 +1,291 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#if 0 +#define PROFILE +#endif +/* Convert n rows of m bf16 values (row stride ld) from _src to fp32 in _dst, 16 values per step; if m is not a multiple of 16 the last step touches elements past m. Macro parameters are parenthesized. NOTE(review): _mm512_store_ps is the aligned store, so every destination address must be 64-byte aligned; the non-AMX bf16 template uses the unaligned store - confirm the alignment of the scratch buffers. */ +#define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ +do { \ + libxsmm_bfloat16 *__src = (_src); \ + float *__dst = (_dst); \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__dst[(__j*(ld))+__i], _mm512_loadcvt_bf16_fp32(&__src[(__j*(ld))+__i])); \ + } \ + } \ +} while (0) +/* Write the m-entry bf16 vector _colv, converted to fp32, into each of the n rows of _srcdst (row stride ld). */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_loadcvt_bf16_fp32(&__colv[__i])); \ + } \ + } \ +} while (0) +/* Same as MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD, but const_bias is added to every converted value. */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i, __j; \ + __m512 __vbias = _mm512_set1_ps(const_bias); \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_add_ps(__vbias, _mm512_loadcvt_bf16_fp32(&__colv[__i]))); \ + } \ + } \ +} while (0) + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, inik, BF, CB, CB_BLOCKS, KB_BLOCKS, ikic, jk, jc; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const int lpb = handle->lpb; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +unsigned long long blocks, blocksa, blocksb; +/* cBlocks and kBlocks are the block counts along C and K; bc_lp and bk_lp are bc and bk divided by lpb (all integer divisions) */ +/* define tensors */ +element_input_type 
*xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_filter_type *w_scratch = (element_filter_type*)handle->scratch_w; +element_filter_type *r_scratch = (element_filter_type*)handle->scratch_r; +element_output_type *b = (element_output_type*)handle->b->data; +/* These buffers are scratch for fp32 output of gemms (intermediate results) */ +float *cst = (float*)handle->cst_scratch; +float *ht = (float*)handle->ht_scratch; +float *it = (float*)handle->it_scratch; +float *ft = (float*)handle->ft_scratch; +float *ot = (float*)handle->ot_scratch; +float *cit = (float*)handle->cit_scratch; +float *cot = (float*)handle->cot_scratch; +/* fp32 scratch for csp: csp also has to be upconverted since it is used in the elementwise functions */ +float *csp_f32 = (float*)handle->csp_scratch; +/* These are the output bf16 data (the float buffers above are their fp32 scratch counterparts) */ +element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; +element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; +element_output_type *it_bf16 = (element_output_type*)handle->it->data; +element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; +element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; +element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; +element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; +/* the four slices of w and of r start at element offsets 0, K, 2*K and 3*K; the _ck views declared further down use row stride 4*K */ +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[K]); +element_filter_type *wfD = &(w[2*K]); +element_filter_type *woD = &(w[3*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K]); +element_filter_type *rfD = &(r[2*K]); +element_filter_type *roD = &(r[3*K]); +element_filter_type *wiD_scratch = &(w_scratch[0]); +element_filter_type *wcD_scratch = &(w_scratch[C*K]); 
+element_filter_type *wfD_scratch = &(w_scratch[2*C*K]); +element_filter_type *woD_scratch = &(w_scratch[3*C*K]); /* w scratch: one contiguous region of C*K elements per slice */ +element_filter_type *riD_scratch = &(r_scratch[0]); +element_filter_type *rcD_scratch = &(r_scratch[K*K]); +element_filter_type *rfD_scratch = &(r_scratch[2*K*K]); +element_filter_type *roD_scratch = &(r_scratch[3*K*K]); /* r scratch: one contiguous region of K*K elements per slice */ +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); /* b is split into four consecutive runs of K elements */ +LIBXSMM_VLA_DECL(2, float, cp, csp_f32, K); +LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD_scratch, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD_scratch, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD_scratch, kBlocks, bk_lp, bk, lpb); /* wi..rc are the blocked views over the scratch copies; they are filled by the reformat loops further down */ +LIBXSMM_VLA_DECL(2, element_filter_type, wi_ck, wiD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wf_ck, wfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wo_ck, woD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, wc_ck, wcD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, ri_ck, riD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rf_ck, rfD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, ro_ck, roD, 4*K); +LIBXSMM_VLA_DECL(2, element_filter_type, rc_ck, rcD, 4*K); /* the _ck views address the source weights as 2-D arrays with row stride 4*K */ +LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, float, h, ht, N, K); +LIBXSMM_VLA_DECL(3, float, i, it, N, K); 
+LIBXSMM_VLA_DECL(3, float, f, ft, N, K); +LIBXSMM_VLA_DECL(3, float, o, ot, N, K); +LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, float, co, cot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); + +/* batch-reduce gemm kernels and the tile-config kernel, taken ready-made from the handle; the trailing comments keep the dispatch calls they stand in for */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; /*= libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, &kernel_flags, NULL );*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; /* libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL );*/ +const libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->fwd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL );*/ + +float *cps_ptr = NULL; + +/* parallelize over pairs of N-block and K-block: work = (N/bn) * (K/bk) */ +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* chunk size: work divided by the thread count, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread: thr_begin inclusive, thr_end exclusive, both clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* number of pairs of C-block and K-block; this range drives the W reformat loop further down */ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* chunk size: work_ck divided by the thread count, rounded up */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread, clamped to work_ck */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of pairs of K-block and K-block; this range drives the R reformat loop further down */ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* chunk size: work_kk divided by the thread count, rounded up */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* task range of this thread, clamped to work_kk */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; +/* the fused code path is selected only for C == 2048 and K == 2048 */ +const int use_fused_implementation = (C == 2048 && K == 2048) ? 
1 : 0; + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* Hoist tileconfig if possible: when bk, bc and bn are all multiples of 32 the tile-config kernel is invoked once here, with all arguments NULL */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking of the reduction domain if it is too large: BF starts at 8 (or at 16 once C or K exceeds 2048) and is decremented until it divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +/* special case that overrides the search above. NOTE(review): there is no divisibility check here, so cBlocks/BF and kBlocks/BF below truncate if a block count is odd - confirm this cannot occur */ +if (C == 2048 && K == 1024) { + BF = 2; +} + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +/* Upfront reformatting of W and R */ +/* reformat W: every task copies one block pair (ic, ik) of all four slices from the 2-D source view (row stride 4*K) into the blocked scratch view indexed [ik][ic][jc/lpb][jk][jc%lpb] */ +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk)); + ik = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc;++jc) { + LIBXSMM_VLA_ACCESS(5, wi, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wi_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wc, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wc_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wf, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wf_ck, ic*bc+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, wo, ik, ic, jc/lpb, jk, jc%lpb, cBlocks, bc_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, wo_ck, ic*bc+jc, ik*bk+jk, 4*K); + } + } +} + +/* reformat R: same copy as for W, with bk-sized blocks in both dimensions; note that here ik is the quotient and ic the remainder of the task index */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(5, ri, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = 
LIBXSMM_VLA_ACCESS(2, ri_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rc, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rc_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, rf, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, rf_ck, ic*bk+jc, ik*bk+jk, 4*K); + LIBXSMM_VLA_ACCESS(5, ro, ik, ic, jc/lpb, jk, jc%lpb, kBlocks, bk_lp, bk, lpb) = LIBXSMM_VLA_ACCESS(2, ro_ck, ic*bk+jc, ik*bk+jk, 4*K); + } + } +} + +/* Upconvert the cp input to fp32 that is used for elementwise stuff: every task converts the tile that starts at row in and column ik (bn rows of bk values, row stride K) with the conversion macro defined at the top of this template */ +for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + MATRIX_CVT_BF16_FP32_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); +} +/* barrier between the reformat and upconvert loops above and the included code below */ +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif +/* both implementations are textually included; use_fused_implementation picks one at run time */ +if (use_fused_implementation) { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c" +} else { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", (int)N, (int)C, (int)K, (int)bn, (int)bc, (int)bk ); /* libxsmm_blasint is not necessarily int, so the values are cast to match the conversion specifiers */ + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; /* cycle counts are converted to milliseconds assuming a 2.5 GHz clock */ + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(2.0*N*C*K)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); /* 2.0 comes first so the product is formed in double instead of overflowing libxsmm_blasint */ + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(2.0*N*K*K)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif +/* the conversion macros are local to this template */ +#undef MATRIX_CVT_BF16_FP32_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c new file mode 100644 index 00000000..a581f284 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck.tpl.c @@ -0,0 +1,138 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#if 0 +#define PROFILE +#endif + +/* helper variables */ +libxsmm_blasint j, ik, ikb, in, ic, icb, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; + +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *cst = (element_output_type*)handle->cst->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *it = (element_output_type*)handle->it->data; +element_output_type *ft = (element_output_type*)handle->ft->data; +element_output_type *ot = (element_output_type*)handle->ot->data; +element_output_type *cit = (element_output_type*)handle->cit->data; +element_output_type *cot = (element_output_type*)handle->cot->data; +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = &(w[2*C*K]); +element_filter_type *woD = &(w[3*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_filter_type *roD = &(r[3*K*K]); +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); 
+LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); /* multi-dimensional views over the raw pointers; the trailing arguments are the inner extents */ +LIBXSMM_VLA_DECL(2, element_input_type, cp, csp, K); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wi, wiD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wf, wfD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wo, woD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, wc, wcD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ri, riD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rf, rfD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, ro, roD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, rc, rcD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, cs, cst, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i, it, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f, ft, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o, ot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co, cot, N, K); +/* define batch-reduce gemm kernels: kernela is dispatched with (bk, bn, bc) and leading dimensions bk, C, K; kernelb with (bk, bn, bk) and leading dimensions bk, K, K. NOTE(review): the dispatch results are not checked for NULL in this file */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, NULL ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL ); +/* Auxiliary arrays for batch-reduce gemms: per-call lists of A and B block addresses. NOTE(review): fixed capacity of 1024 entries; the included templates fill CB_BLOCKS or KB_BLOCKS entries - confirm these never exceed 1024 */ +const element_filter_type *A_array[1024]; +const element_input_type *B_array[1024]; +element_output_type *cps_ptr = NULL; + +/* parallelize over the (N/bn) x (K/bk) grid of output blocks */ +/* logical thread id, relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by the thread count, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 
0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; +/* the fused variant is selected only for C == 2048 and K == 2048 */ +const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0; +float total_time = 0.0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large: BF ends up as the largest value not above 8 (or 16 for the larger sizes) that divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +/* special case; NOTE(review): BF = 2 is set without the divisibility search used above - confirm cBlocks and kBlocks are even here */ +if (C == 2048 && K == 1024) { + BF = 2; +} + +CB_BLOCKS = cBlocks/BF; /* C-blocks reduced per kernel call */ +KB_BLOCKS = kBlocks/BF; /* K-blocks reduced per kernel call */ + +if (use_fused_implementation) { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c" +} else { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles)/(2.5 * 1e9)*1000.0f; + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c new file mode 100644 index 00000000..39526f8f --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16.tpl.c @@ -0,0 +1,223 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ +#if 0 /* change to 1 to compile in the PROFILE timing code */ +#define PROFILE +#endif +/* Convert n rows of m bf16 values (rows ld elements apart) from _src to fp32 at _dst, 16 values per step; m should be a multiple of 16 because every step loads and stores a full 16-lane vector. Arguments are parenthesized and the temporaries carry the same __ prefix as in the AMX variant of this template, so caller expressions cannot be captured. */ +#define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ +do { \ + libxsmm_bfloat16 *__src = (_src); \ + float *__dst = (_dst); \ + libxsmm_blasint __i,__j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__dst[(__j*(ld))+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__src[(__j*(ld))+__i]))); \ + } \ + } \ +} while (0) +/* Fill each of the n rows of _srcdst (rows ld elements apart) with the first m entries of the bf16 vector _colv, converted to fp32 */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i,__j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__srcdst[(__j*(ld))+__i], LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__colv[__i]))); \ + } \ + } \ +} while (0) +/* Same as MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD, with const_bias (evaluated once) added to every converted value */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i,__j; \ + __m512 __vbias = _mm512_set1_ps(const_bias); \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_storeu_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_add_ps(__vbias, LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&__colv[__i])))); \ + } \ + } \ +} while (0) + +/* helper variables: loop counters, the reduction blocking factor BF and the block counts derived from it */ +libxsmm_blasint j, ik, ikb, in, /*ic, icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes, all read from the handle provided by the including function */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; /* integer division; NOTE(review): presumably bc divides C - confirm */ +const libxsmm_blasint kBlocks = K/bk; /* integer division; NOTE(review): presumably bk divides K - confirm */ +int lpb = 2; /* innermost extent of the 5-d weight views declared below */ +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +unsigned long long 
blocks, blocksa, blocksb; + +/* define tensors: typed views of the data pointers stored in the handle */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; + +/* These buffers are scratch for fp32 output of gemms (intermediate results) */ +float *cst = (float*)handle->cst_scratch; +float *ht = (float*)handle->ht_scratch; +float *it = (float*)handle->it_scratch; +float *ft = (float*)handle->ft_scratch; +float *ot = (float*)handle->ot_scratch; +float *cit = (float*)handle->cit_scratch; +float *cot = (float*)handle->cot_scratch; +/* fp32 scratch copy of csp; it has to be upconverted as well since it is used in the elementwise functions */ +float *csp_f32 = (float*)handle->csp_scratch; +/* These are the output bf16 data */ +element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; +element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; +element_output_type *it_bf16 = (element_output_type*)handle->it->data; +element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; +element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; +element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; +element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; +/* w is split into four consecutive C*K slices and r into four consecutive K*K slices, both in the order i, c, f, o; b is split into four K-element slices */ +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = &(w[2*C*K]); +element_filter_type *woD = &(w[3*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_filter_type *roD = &(r[3*K*K]); +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); +LIBXSMM_VLA_DECL(2, float, 
cp, csp_f32, K); /* cp views the fp32 scratch copy, cp_bf16 the original bf16 input */ +LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); /* fp32 scratch views */ +LIBXSMM_VLA_DECL(3, float, h, ht, N, K); +LIBXSMM_VLA_DECL(3, float, i, it, N, K); +LIBXSMM_VLA_DECL(3, float, f, ft, N, K); +LIBXSMM_VLA_DECL(3, float, o, ot, N, K); +LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, float, co, cot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); /* views of the bf16 output tensors */ +LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); +/* define batch-reduce gemm kernels: taken ready-made from the handle instead of being dispatched here */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; + +float *cps_ptr = NULL; + +/* parallelize over the (N/bn) x (K/bk) grid of output blocks */ +/* logical thread id, relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run 
in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by the thread count, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; +/* the fused variant is selected only for C == 2048 and K == 2048 */ +const int use_fused_implementation = (C == 2048 && K == 2048) ? 1 : 0; + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large: BF ends up as the largest value not above 8 (or 16 for the larger sizes) that divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +/* special case; NOTE(review): BF = 2 is set without the divisibility search used above - confirm cBlocks and kBlocks are even here */ +if (C == 2048 && K == 1024) { + BF = 2; +} + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif + +/* Upconvert the cp input to fp32 that is used for elementwise stuff; each thread converts the bk x bn tiles of its own [thr_begin, thr_end) range */ +for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + MATRIX_CVT_BF16_FP32_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); +} +/* all threads wait here before the included template runs */ +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif + +if (use_fused_implementation) { +#include 
"libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif + +#undef MATRIX_CVT_BF16_FP32_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c new file mode 100644 index 00000000..4948cd24 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_bf16_amx.tpl.c @@ -0,0 +1,236 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 /* change to 1 to compile in the PROFILE timing code */ +#define PROFILE +#endif +/* Convert n rows of m bf16 values (rows ld elements apart) from _src to fp32 at _dst, 16 values per step; m should be a multiple of 16 because every step handles a full 16-lane vector. Macro arguments are parenthesized so that compound expressions expand safely. NOTE(review): the aligned _mm512_store_ps is used, so the destination presumably has to be 64-byte aligned - confirm for the scratch buffers. */ +#define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ +do { \ + libxsmm_bfloat16 *__src = (_src); \ + float *const __dst = (_dst); \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__dst[(__j*(ld))+__i], _mm512_loadcvt_bf16_fp32(&__src[(__j*(ld))+__i])); \ + } \ + } \ +} while (0) +/* Fill each of the n rows of _srcdst (rows ld elements apart) with the first m entries of the bf16 vector _colv, converted to fp32 */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_loadcvt_bf16_fp32(&__colv[__i])); \ + } \ + } \ +} while (0) +/* Same as MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD, with const_bias (evaluated once) added to every converted value */ +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ +do { \ + libxsmm_bfloat16 *__colv = (_colv); \ + float *__srcdst = (_srcdst); \ + libxsmm_blasint __i, __j; \ + __m512 __vbias = _mm512_set1_ps(const_bias); \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*(ld))+__i], _mm512_add_ps(__vbias, _mm512_loadcvt_bf16_fp32(&__colv[__i]))); \ + } \ + } \ +} while (0) + +/* helper variables: loop counters, the reduction blocking factor BF and the block counts derived from it */ +libxsmm_blasint j, ik, ikb, in, /*ic, icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes, all read from the handle provided by the including function */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const 
libxsmm_blasint cBlocks = C/bc; /* integer division; NOTE(review): presumably bc divides C - confirm */ +const libxsmm_blasint kBlocks = K/bk; /* integer division; NOTE(review): presumably bk divides K - confirm */ +const int lpb = 2; /* innermost extent of the 5-d weight views declared below */ +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +unsigned long long blocks, blocksa, blocksb; + +/* define tensors: typed views of the data pointers stored in the handle */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; + +/* These buffers are scratch for fp32 output of gemms (intermediate results) */ +float *cst = (float*)handle->cst_scratch; +float *ht = (float*)handle->ht_scratch; +float *it = (float*)handle->it_scratch; +float *ft = (float*)handle->ft_scratch; +float *ot = (float*)handle->ot_scratch; +float *cit = (float*)handle->cit_scratch; +float *cot = (float*)handle->cot_scratch; +/* fp32 scratch copy of csp; it has to be upconverted as well since it is used in the elementwise functions */ +float *csp_f32 = (float*)handle->csp_scratch; +/* These are the output bf16 data */ +element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; +element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; +element_output_type *it_bf16 = (element_output_type*)handle->it->data; +element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; +element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; +element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; +element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; +/* w is split into four consecutive C*K slices and r into four consecutive K*K slices, both in the order i, c, f, o */ +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = &(w[2*C*K]); +element_filter_type *woD = &(w[3*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_filter_type *roD = &(r[3*K*K]); 
+element_output_type *bi = &(b[0]); /* b is split into four consecutive K-element slices */ +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); +LIBXSMM_VLA_DECL(2, float, cp, csp_f32, K); /* cp views the fp32 scratch copy, cp_bf16 the original bf16 input */ +LIBXSMM_VLA_DECL(2, element_input_type, cp_bf16, csp, K); +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(3, float, cs, cst, N, K); /* fp32 scratch views */ +LIBXSMM_VLA_DECL(3, float, h, ht, N, K); +LIBXSMM_VLA_DECL(3, float, i, it, N, K); +LIBXSMM_VLA_DECL(3, float, f, ft, N, K); +LIBXSMM_VLA_DECL(3, float, o, ot, N, K); +LIBXSMM_VLA_DECL(3, float, ci, cit, N, K); +LIBXSMM_VLA_DECL(3, float, co, cot, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, cs_out, cst_bf16, N, K); /* views of the bf16 output tensors */ +LIBXSMM_VLA_DECL(3, element_output_type, h_out, ht_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, i_out, it_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, f_out, ft_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, o_out, ot_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, ci_out, cit_bf16, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, co_out, cot_bf16, N, K); +/* define batch-reduce gemm kernels: taken ready-made from the handle; the trailing comments keep the dispatch calls they replaced */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, &kernel_flags, NULL );*/ +const 
libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL );*/ +const libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->fwd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL );*/ + +/* Auxiliary arrays for batch-reduce gemms (compiled out in this variant) */ +#if 0 +const element_filter_type *A_array[1024]; +const element_input_type *B_array[1024]; +#endif +float *cps_ptr = NULL; + +/* parallelize over the (N/bn) x (K/bk) grid of output blocks */ +/* logical thread id, relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by the thread count, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end: this thread's half-open task range, clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; +const int use_fused_implementation = handle->use_fwd_fused_impl; /*(C == 2048 && K == 2048) ? 
1 : 0;*/ + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* Hoist tileconfig if possible: done here only when bk, bc and bn are all multiples of 32 */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large: BF ends up as the largest value not above 8 (or 16 for the larger sizes) that divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} + +if (C == 2048 && K == 1024) { + BF = 2; +} + +/* Overwrite the blocking factor based on the value passed onto the descriptor; this discards the BF computed above. NOTE(review): BF is used as a divisor right below, so fwd_block must be non-zero - confirm where it is set */ +BF = handle->fwd_block; + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif + +/* Upconvert the cp input to fp32 that is used for elementwise stuff; each thread converts the bk x bn tiles of its own [thr_begin, thr_end) range */ +for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + MATRIX_CVT_BF16_FP32_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(2, cp_bf16, in, ik, K), &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K)); +} +/* all threads wait here before the included template runs */ +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif + +if (use_fused_implementation) { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c" +} else { +#include "libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c" +} + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif + +#undef MATRIX_CVT_BF16_FP32_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c new file mode 100644 index 00000000..f2d535f4 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused.tpl.c @@ -0,0 +1,254 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) 
+******************************************************************************/ + +/* First perform the W*x part of the output */ +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) 
libxsmm_internal_matrix_bcst_colvector_const_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wo, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + } + } +} + +/* Compute the R*h part of the output */ +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic 
+ CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks 
= KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = (j == 0) ? 
&LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, 
j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c new file mode 100644 index 00000000..74103dc7 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16.tpl.c @@ -0,0 +1,331 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) 
+******************************************************************************/ +/* Converts n rows of m FP32 values (row stride ld) from _src to bf16 in _dst, 32 values per inner step (two 16-float loads packed by one CVTNE2PS_PBH). NOTE(review): the step of 32 presumably requires m to be a multiple of 32, while the only guard visible in this file is bk % 16 - confirm against the block sizes chosen by the caller. */ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const __src = (_src); \ + libxsmm_bfloat16 *const __dst = (_dst); \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*(ld))+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*(ld))+__i])); \ + _mm512_storeu_si512(&__dst[(__j*(ld))+__i], __packed_result); \ + } \ + } \ +} while (0) + +/* First perform the W*x part of the output: for every time step j the gate buffers i, ci, f and o are seeded from their bias vectors when CB == 0 and then accumulate W.x through batchreduce_kernela */ +blocks = CB_BLOCKS; +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef
PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + + +/* Compute the R*h part of the output: batchreduce_kernelb accumulates R.h into the same gate buffers, and on the last reduction block (CB == BF-1) the element-wise gate math and the bf16 down-conversion run */ +blocks = KB_BLOCKS; +/* Peel off the t=0 iteration to hoist the innermost if conditions; this step reads hp and cp where later steps read h_out and cs of step j-1 */ +j = 0; +/* let's run the cell in blocks for good locality */ +/* Block reduction loop if requested */ +for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K),
&blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K,
&LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K,
&LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); +/* Remaining time steps: the recurrent input is h_out of step j-1 and the previous cell state is cs of step j-1 */ +for (j = 1; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0)
{ + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K,
&LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K,
&LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c new file mode 100644 index 00000000..93cfb6d3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_diffused_bf16_amx.tpl.c @@ -0,0 +1,331 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved.
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { /* converts n rows of m FP32 values (row stride ld) from _src to bf16 in _dst, 32 values per inner step; NOTE(review): presumably requires m to be a multiple of 32 - confirm */ \ + float *const __src = (_src); \ + libxsmm_bfloat16 *__dst = (_dst); \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < (n); ++__j ) { \ + for ( __i = 0; __i < (m); __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*(ld))+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*(ld))+__i])); \ + _mm512_storeu_si512((libxsmm_bfloat16*)&__dst[(__j*(ld))+__i], (__m512i) __packed_result); \ + } \ + } \ +} while (0) + +/* First perform the W*x part of the output: for every time step j the gate buffers i, ci, f and o are seeded from their bias vectors when CB == 0 and then accumulate W.x through batchreduce_kernela */ +blocks = CB_BLOCKS; +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd
*/ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + + +/* Compute the R*h part of the output: batchreduce_kernelb accumulates R.h into the same gate buffers, and on the last reduction block (CB == BF-1) the element-wise gate math and the bf16 down-conversion run */ +blocks = KB_BLOCKS; +/* Peel off the t=0 iteration to hoist the innermost if conditions; this step reads hp and cp where later steps read h_out and cs of step j-1 */ +j = 0; +/* let's run the cell in blocks for good locality */ +/* Block reduction loop if requested */ +for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik <
thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include
"libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik,
N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); +/* Remaining time steps: the recurrent input is h_out of step j-1 and the previous cell state is cs of step j-1 */ +for (j = 1; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik
% (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include
"libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik,
N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c
b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c new file mode 100644 index 00000000..d4894a4e --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused.tpl.c @@ -0,0 +1,237 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ + +/* All data is in column-major format */ +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wi, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, 
icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ri, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wc, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rc, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } 
+ /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) libxsmm_internal_matrix_bcst_colvector_const_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wf, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, rf, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) 
libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + for (icb = 0, ic = 0; icb < CB_BLOCKS; ic += bc, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, wo, ikb, icb + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, x, j, in, ic + CB*CB_BLOCKS*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + if (0 == j) { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(2, hp, in, ic + CB*KB_BLOCKS*bk, K); + } + } else { + for (ic = 0, icb = 0; icb < KB_BLOCKS; ic += bk, icb++) { + A_array[icb] = &LIBXSMM_VLA_ACCESS(4, ro, ikb, icb + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array[icb] = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ic + CB*KB_BLOCKS*bk, N, K); + } + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = (j == 0) ? 
&LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) : &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, 
j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c new file mode 100644 index 00000000..49e8e63a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16.tpl.c @@ -0,0 +1,374 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) 
+******************************************************************************/ + +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const src = _src; \ + libxsmm_bfloat16 *const dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&src[(__j*ld)+__i])); \ + _mm512_storeu_si512(&dst[(__j*ld)+__i], packed_result); \ + } \ + } \ +} while (0) + +blocksa = CB_BLOCKS; +blocksb = KB_BLOCKS; + +/* All data is in column-major format */ +/* Peel off the t=0 iteration to hoist the innermost if conditions */ +j = 0; +for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci 
with bd */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) 
MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, 
ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (j = 1; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + 
&LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + 
&LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K); + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to 
bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c new file mode 100644 index 00000000..01a9d4af --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_nc_kcck_fused_bf16_amx.tpl.c @@ -0,0 +1,374 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) +******************************************************************************/ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *const __src = _src; \ + libxsmm_bfloat16 *__dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i])); \ + _mm512_storeu_si512((libxsmm_bfloat16*)&__dst[(__j*ld)+__i], (__m512i) __packed_result); \ + } \ + } \ +} while (0) + +blocksa = CB_BLOCKS; +blocksb = KB_BLOCKS; + +/* All data is in column-major format */ +/* Peel off the t=0 iteration to hoist the innermost if conditions */ +j = 0; +for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, 
CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, i, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, ci, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + 
&LIBXSMM_VLA_ACCESS(3, f, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(2, hp, in, CB*KB_BLOCKS*bk, K), + &LIBXSMM_VLA_ACCESS(3, o, 0, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(2, cp, in, ik, K) ; + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, 
ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + 
NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (j = 1; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik % (N/bn))*bn; + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += 
gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize ci with bd */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize f with (bf + forget_bias) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += 
gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* initialize o with bo */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, x, j, in, CB*CB_BLOCKS*bc, N, C), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksa); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(3, h_out, j-1, in, CB*KB_BLOCKS*bk, N, K), + &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &blocksb); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + + if (CB == BF-1) { +#ifdef PROFILE + if (ltid == 0) { + eltwise_start = _rdtsc(); + } +#endif + cps_ptr = &LIBXSMM_VLA_ACCESS(3, cs, j-1, in, ik, N, K) ; + /* Compute i, ci, f, o, cs, co and h */ +#if defined(LIBXSMM_RNN_CELL_AVX512) + if (bk % 16 == 0 && bc % 16 == 0) { +#include "libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c" + } else { + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, 
K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); + } +#else + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K) ); + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), cps_ptr, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_fma_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K) ); + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), 
&LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K) ); + libxsmm_internal_matrix_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K) ); +#endif + /* Downconvert computed results to bf16 output buffers */ + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, cs_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, i_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, f_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, o_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, ci_out, j, in, ik, N, K)); + NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(bk, bn, K, &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, co_out, j, in, ik, N, K)); + +#ifdef PROFILE + if (ltid == 0) { + eltwise_end = _rdtsc(); + eltwise_cycles += eltwise_end-eltwise_start; + } +#endif + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c new file mode 100644 index 00000000..9ac18aff --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_bf16_amx.tpl.c @@ -0,0 +1,226 @@ +/****************************************************************************** +* 
Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Kunal Banerjee (Intel Corp.) +******************************************************************************/ +#if 0 +#define PROFILE +#endif + +#define MATRIX_CVT_BF16_FP32_LD(m, n, ld, _src, _dst) \ +do { \ + libxsmm_bfloat16 *__src = _src; \ + float *__dst = _dst; \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=16 ) { \ + _mm512_store_ps((float*)&__dst[(__j*ld)+__i], _mm512_loadcvt_bf16_fp32(&__src[(__j*ld)+__i])); \ + } \ + } \ +} while (0) + +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD(m, n, ld, _srcdst, _colv) \ +do { \ + libxsmm_bfloat16 *__colv = _colv; \ + float *__srcdst = _srcdst; \ + libxsmm_blasint __i, __j; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*ld)+__i], _mm512_loadcvt_bf16_fp32(&__colv[__i])); \ + } \ + } \ +} while (0) + +#define MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD(m, n, ld, _srcdst, _colv, const_bias) \ +do { \ + libxsmm_bfloat16 *__colv = _colv; \ + float *__srcdst = _srcdst; \ + libxsmm_blasint __i, __j; \ + __m512 __vbias = _mm512_set1_ps(const_bias); \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=16 ) { \ + _mm512_store_ps((float*)&__srcdst[(__j*ld)+__i], _mm512_add_ps(__vbias, _mm512_loadcvt_bf16_fp32(&__colv[__i]))); \ + } \ + } \ +} while (0) + +/* helper variables */ +libxsmm_blasint j, ik, ikb, /*in,*/ inb, /*ic, icb,*/ inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; 
+const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +const libxsmm_blasint cBlocks = C/bc; +const libxsmm_blasint kBlocks = K/bk; +const libxsmm_blasint nBlocks = N/bn; +const int lpb = 2; +const int bc_lp = bc/lpb; +const int bk_lp = bk/lpb; +unsigned long long blocks/*, blocksa, blocksb*/; + +/* define tensors: raw data pointers of the input (xt), the state read by the t=0 step (csp, hp), the weights (w, r) and the bias (b) */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *csp = (element_input_type* )handle->csp->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *w = (element_filter_type*)handle->w->data; +element_filter_type *r = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; + +/* These buffers are scratch for fp32 output of gemms (intermediate results) */ +float *cst = (float*)handle->cst_scratch; +/*float *ht = (float*)handle->ht_scratch;*/ +float *it = (float*)handle->it_scratch; +float *ft = (float*)handle->ft_scratch; +float *ot = (float*)handle->ot_scratch; +float *cit = (float*)handle->cit_scratch; +/*float *cot = (float*)handle->cot_scratch;*/ +/* fp32 scratch copy of csp: csp also has to be upconverted since it is used in the elementwise functions */ +float *csp_f32 = (float*)handle->csp_scratch; +/* These are the bf16 output tensors that receive the downconverted results */ +element_output_type *cst_bf16 = (element_output_type*)handle->cst->data; +element_output_type *ht_bf16 = (element_output_type*)handle->ht->data; +element_output_type *it_bf16 = (element_output_type*)handle->it->data; +element_output_type *ft_bf16 = (element_output_type*)handle->ft->data; +element_output_type *ot_bf16 = (element_output_type*)handle->ot->data; +element_output_type *cit_bf16 = (element_output_type*)handle->cit->data; +element_output_type *cot_bf16 = (element_output_type*)handle->cot->data; + +element_filter_type *wiD = &(w[0]); +element_filter_type *wcD = &(w[C*K]); +element_filter_type *wfD = 
&(w[2*C*K]); +element_filter_type *woD = &(w[3*C*K]); +element_filter_type *riD = &(r[0]); +element_filter_type *rcD = &(r[K*K]); +element_filter_type *rfD = &(r[2*K*K]); +element_filter_type *roD = &(r[3*K*K]); +element_output_type *bi = &(b[0]); +element_output_type *bd = &(b[K]); +element_output_type *bf = &(b[2*K]); +element_output_type *bo = &(b[3*K]); +LIBXSMM_VLA_DECL(4, float, cp, csp_f32, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_input_type, cp_bf16, csp, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_input_type, x, xt, nBlocks, cBlocks, bn,bc); +LIBXSMM_VLA_DECL(4, element_input_type, hp, hpD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_filter_type, wi, wiD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wf, wfD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wo, woD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, wc, wcD, cBlocks, bc_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ri, riD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rf, rfD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, ro, roD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, element_filter_type, rc, rcD, kBlocks, bk_lp, bk, lpb); +LIBXSMM_VLA_DECL(5, float, cs, cst, nBlocks, kBlocks, bn, bk); +/*LIBXSMM_VLA_DECL(5, float, h, ht, nBlocks, kBlocks, bn, bk);*/ +LIBXSMM_VLA_DECL(5, float, i, it, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, float, f, ft, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, float, o, ot, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, float, ci, cit, nBlocks, kBlocks, bn, bk); +/*LIBXSMM_VLA_DECL(5, float, co, cot, nBlocks, kBlocks, bn, bk);*/ +LIBXSMM_VLA_DECL(5, element_output_type, cs_out, cst_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, h_out, ht_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, i_out, it_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, 
f_out, ft_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, o_out, ot_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, ci_out, cit_bf16, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, co_out, cot_bf16, nBlocks, kBlocks, bn, bk); +/* define batch-reduce gemm kernels: the function pointers are taken from the handle, the trailing commented-out dispatch calls are not executed */ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernela = handle->fwd_kernela; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, &kernel_flags, NULL );*/ +const libxsmm_bsmmfunction_reducebatch_strd batchreduce_kernelb = handle->fwd_kernelb; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &kernel_flags, NULL );*/ +const libxsmm_bsmmfunction_reducebatch_addr tile_config_kernel = handle->fwd_tileconfig; /*libxsmm_bsmmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, &tc_flags, NULL );*/ + +/* parallelize over the (N/bn) x (K/bk) output blocks */ +/* compute the logical thread id relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size: work divided by the thread count of the descriptor, rounded up */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end, both clamped to work */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +#ifdef PROFILE +__int64_t eltwise_start, eltwise_end, eltwise_cycles = 0, gemm_start, gemm_end, gemm_cycles = 0, gemm_cycles2 = 0, reformat_start, reformat_end, reformat_cycles = 0; +float total_time = 0.0; +#endif + +/* Hoist tileconfig if possible: done here only when bk, bc and bn are all multiples of 32 */ +if ((bk % 32 == 0) && (bc % 32 == 0) && (bn % 32 == 0)) { + tile_config_kernel(NULL, NULL, NULL, NULL); +} + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large: start from 8 (or 16) and decrement until the factor divides both cBlocks and kBlocks */ +BF = 1; +if ((C > 1024 && C <= 2048) || (K > 1024 && K <= 2048)) { + BF = 8; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} +if (C > 2048 || K > 2048) { + BF = 16; + while ( (cBlocks % BF != 0) || (kBlocks % BF != 0) ) { + BF--; + } +} + +if (C == 2048 && K == 1024) { + BF = 2; +} + +/* Overwrite the blocking factor based on the value passed onto the descriptor. NOTE(review): this assignment is unconditional, so the heuristic value computed above is never used; fwd_block is presumably nonzero and divides cBlocks and kBlocks since both are divided by BF right below -- confirm */ +BF = handle->fwd_block; + +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; + +#ifdef PROFILE +if (ltid == 0) reformat_start = _rdtsc(); +#endif + +/* Upconvert the bf16 cp input (cp_bf16) into the fp32 view cp that is used by the elementwise ops; each thread converts its own thr_begin..thr_end blocks */ +for (inik = thr_begin; inik < thr_end; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + MATRIX_CVT_BF16_FP32_LD( bk, bn, bk, &LIBXSMM_VLA_ACCESS(4, cp_bf16, inb, ikb, 0, 0, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(4, cp, inb, ikb, 0, 0, kBlocks, bn, bk)); +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); +#ifdef PROFILE +if (ltid == 0) { + reformat_end = _rdtsc(); + reformat_cycles = reformat_end - reformat_start; +} +#endif + +#include "libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c" + +handle->tilerelease_kernel(NULL, NULL, NULL); + +#ifdef PROFILE +if (ltid == 0) { + printf("----- PROFILING LSTM FWD (N = %d, C = %d, K = %d, bn = %d. 
bc = %d, bk = %d)----\n", N, C, K, bn, bc, bk ); + total_time = (gemm_cycles+gemm_cycles2+eltwise_cycles+reformat_cycles)/(2.5 * 1e9)*1000.0f; + printf("Elementwise time is %f ms (%.2f%%)\n", eltwise_cycles/(2.5 * 1e9)*1000.0f, eltwise_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("Reformat weights time is %f ms (%.2f%%)\n", reformat_cycles/(2.5 * 1e9)*1000.0f, reformat_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time ); + printf("GEMM W*x time is %f ms (%.2f%%) at %f GFLOPS\n", gemm_cycles/(2.5 * 1e9)*1000.0f, gemm_cycles/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*C*K*2.0)*4.0/1e9/(gemm_cycles/(2.5 * 1e9))); + printf("GEMM R*h time is %f ms (%.2f%%) at %f GFLOPS\n\n", gemm_cycles2/(2.5 * 1e9)*1000.0f, gemm_cycles2/(2.5 * 1e9)*1000.0f*100.0/total_time, t*(N*K*K*2.0)*4.0/1e9/(gemm_cycles2/(2.5 * 1e9))); +} +#undef PROFILE +#endif + +#undef MATRIX_CVT_BF16_FP32_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD +#undef MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c new file mode 100644 index 00000000..dc2d0fe6 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_lstm_fwd_ncnc_kcck_diffused_bf16_amx.tpl.c @@ -0,0 +1,409 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.) 
+******************************************************************************/ +#define NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD(m, n, ld, _src, _dst) \ +do { \ + float *__src = _src; \ + libxsmm_bfloat16 *__dst = _dst; \ + libxsmm_blasint __i, __j; \ + __m512i __packed_result; \ + for ( __j = 0; __j < n; ++__j ) { \ + for ( __i = 0; __i < m; __i+=32 ) { \ + __packed_result = LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i+16]), LIBXSMM_INTRINSICS_MM512_LOAD_PS((float*)&__src[(__j*ld)+__i])); \ + _mm512_storeu_si512((libxsmm_bfloat16*)&__dst[(__j*ld)+__i], (__m512i) __packed_result); \ + } \ + } \ +} while (0) + +/* First perform the W*x part of the output for all t time steps; the recurrent R*h part is added in the passes that follow */ +blocks = CB_BLOCKS; +for (j = 0; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + ik = ikb*bk; + /* initialize i with bi (only on the first reduction block, CB == 0) */ +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &bi[ik] ); + /* i += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wi, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, x, j, inb, CB*CB_BLOCKS, 0, 0, nBlocks, cBlocks, bn, bc), + &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); + + /* initialize ci with bd (only on the first reduction block, CB == 0) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &bd[ik] ); + /* ci += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wc, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, x, j, inb, CB*CB_BLOCKS, 0, 0, nBlocks, cBlocks, bn, bc), + &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), 
&blocks); + + /* initialize f with (bf + forget_bias) (only on the first reduction block, CB == 0) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_CONST_LD( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &bf[ik], handle->forget_bias ); + /* f += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wf, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, x, j, inb, CB*CB_BLOCKS, 0, 0, nBlocks, cBlocks, bn, bc), + &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); + + /* initialize o with bo (only on the first reduction block, CB == 0) */ + if (CB == 0) MATRIX_BCST_CVT_BF16_FP32_COLVECTOR_LD( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &bo[ik] ); + /* o += W.x */ + batchreduce_kernela(&LIBXSMM_VLA_ACCESS(5, wo, ikb, CB*CB_BLOCKS, 0, 0, 0, cBlocks, bc_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, x, j, inb, CB*CB_BLOCKS, 0, 0, nBlocks, cBlocks, bn, bc), + &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles += gemm_end-gemm_start; + } +#endif + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +/* Compute the R*h part of the output */ +blocks = KB_BLOCKS; +/* Peel off the t=0 iteration to hoist the innermost if conditions: at t=0 the recurrent input is the initial hidden state hp */ +j = 0; +/* let's run the cell in blocks for good locality */ +/* Block reduction loop if requested */ +for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h (h is hp at t=0) */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hp, inb, CB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* 
Eltwise ops and downconvert for the i computed block, run once the last reduction block is accumulated: the sigmoid is evaluated as 0.5*tanh(0.5*x)+0.5 with the tanh approximation macro, written back to the fp32 buffer i and stored as bf16 to i_out. Each inner step handles 32 floats (two 16-lane vectors packed into one bf16 store), so bk is presumably a multiple of 32 -- TODO confirm */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _i = &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = &LIBXSMM_VLA_ACCESS(5, i_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vi0, _vi1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vi0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k] ); + _vi0 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi0, _halves ) ), _halves, _halves); + _mm512_store_ps( &_i[(_j*bk)+_k], _vi0 ); + _vi1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k+16] ); + _vi1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi1, _halves ) ), _halves, _halves); + _mm512_store_ps( &_i[(_j*bk)+_k+16], _vi1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vi1, _vi0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h (h is hp at t=0) */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hp, inb, CB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downconvert for the ci computed block, run once the last reduction block is accumulated: the tanh approximation is applied, written back to the fp32 buffer ci and stored as bf16 to ci_out */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _ci = &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = &LIBXSMM_VLA_ACCESS(5, ci_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vci0, _vci1; + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vci0 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k] 
)); + _mm512_store_ps( &_ci[(_j*bk)+_k], _vci0 ); + _vci1 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k+16] )); + _mm512_store_ps( &_ci[(_j*bk)+_k+16], _vci1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vci1, _vci0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hp, inb, CB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the f computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _f = &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = &LIBXSMM_VLA_ACCESS(5, f_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vf0, _vf1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vf0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k] ); + _vf0 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf0, _halves ) ), _halves, _halves); + _mm512_store_ps( &_f[(_j*bk)+_k], _vf0 ); + _vf1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k+16] ); + _vf1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf1, _halves ) ), _halves, _halves); + _mm512_store_ps( &_f[(_j*bk)+_k+16], _vf1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vf1, _vf0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, 
CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(4, hp, inb, CB*KB_BLOCKS, 0, 0, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the o computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _o = &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _i = &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _f = &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _ci = &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _cps = &LIBXSMM_VLA_ACCESS(4, cp, inb, ikb, 0, 0, kBlocks, bn, bk); + float* _cs = &LIBXSMM_VLA_ACCESS(5, cs, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_o = &LIBXSMM_VLA_ACCESS(5, o_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_cs = &LIBXSMM_VLA_ACCESS(5, cs_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_h = &LIBXSMM_VLA_ACCESS(5, h_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_co = &LIBXSMM_VLA_ACCESS(5, co_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh, _vf1, _vcs1, _vi1, _vci1, _vco1, _vo1, _vh1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*bk)+_k] ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k] ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k] ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k] ); + _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*bk)+_k] ); + _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo, _halves ) 
), _halves, _halves); + _vcs = _mm512_mul_ps( _vf, _vcs ); + _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); + _mm512_store_ps( &_cs[(_j*bk)+_k], _vcs ); + _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs ); + _vh = _mm512_mul_ps( _vo, _vco ); + _vo1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*bk)+_k+16] ); + _vi1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k+16] ); + _vci1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k+16] ); + _vf1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k+16] ); + _vcs1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*bk)+_k+16] ); + _vo1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo1, _halves ) ), _halves, _halves); + _vcs1 = _mm512_mul_ps( _vf1, _vcs1 ); + _vcs1 = _mm512_fmadd_ps( _vi1, _vci1, _vcs1 ); + _mm512_store_ps( &_cs[(_j*bk)+_k+16], _vcs1 ); + _vco1 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs1 ); + _vh1 = _mm512_mul_ps( _vo1, _vco1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_o[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vo1, _vo)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_cs[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vcs1, _vcs)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_h[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vh1, _vh)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_co[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vco1, _vco)); + } + } + } + } +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (j = 1; j < t; ++j) { + /* let's run the cell in blocks for good locality */ + /* Block reduction loop if requested */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + inb = inik % (N/bn); + ikb = inik / (N/bn); + ik = ikb*bk; +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* i += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ri, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, h_out, j-1, 
inb, CB*KB_BLOCKS, 0, 0, nBlocks, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the i computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _i = &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = &LIBXSMM_VLA_ACCESS(5, i_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vi0, _vi1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vi0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k] ); + _vi0 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi0, _halves ) ), _halves, _halves); + _mm512_store_ps( &_i[(_j*bk)+_k], _vi0 ); + _vi1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k+16] ); + _vi1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi1, _halves ) ), _halves, _halves); + _mm512_store_ps( &_i[(_j*bk)+_k+16], _vi1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vi1, _vi0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* ci += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rc, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, h_out, j-1, inb, CB*KB_BLOCKS, 0, 0, nBlocks, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the ci computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _ci = &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = 
&LIBXSMM_VLA_ACCESS(5, ci_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vci0, _vci1; + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vci0 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k] )); + _mm512_store_ps( &_ci[(_j*bk)+_k], _vci0 ); + _vci1 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2(LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k+16] )); + _mm512_store_ps( &_ci[(_j*bk)+_k+16], _vci1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vci1, _vci0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* f += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, rf, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, h_out, j-1, inb, CB*KB_BLOCKS, 0, 0, nBlocks, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the f computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _f = &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst = &LIBXSMM_VLA_ACCESS(5, f_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vf0, _vf1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vf0 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k] ); + _vf0 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf0, _halves ) ), _halves, _halves); + _mm512_store_ps( &_f[(_j*bk)+_k], _vf0 ); + _vf1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k+16] ); + _vf1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf1, _halves ) ), _halves, _halves); + _mm512_store_ps( 
&_f[(_j*bk)+_k+16], _vf1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vf1, _vf0)); + } + } + } +#ifdef PROFILE + if (ltid == 0) gemm_start = _rdtsc(); +#endif + /* o += R.h */ + batchreduce_kernelb(&LIBXSMM_VLA_ACCESS(5, ro, ikb, CB*KB_BLOCKS, 0, 0, 0, kBlocks, bk_lp, bk, lpb), + &LIBXSMM_VLA_ACCESS(5, h_out, j-1, inb, CB*KB_BLOCKS, 0, 0, nBlocks, kBlocks, bn, bk), + &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); +#ifdef PROFILE + if (ltid == 0) { + gemm_end = _rdtsc(); + gemm_cycles2 += gemm_end-gemm_start; + } +#endif + /* Eltwise ops and downcovert for the o computed block */ + if (CB == BF-1) { + libxsmm_blasint _k, _j; + float* _o = &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _i = &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _f = &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _ci = &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _cps = &LIBXSMM_VLA_ACCESS(5, cs, j-1, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + float* _cs = &LIBXSMM_VLA_ACCESS(5, cs, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_o = &LIBXSMM_VLA_ACCESS(5, o_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_cs = &LIBXSMM_VLA_ACCESS(5, cs_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_h = &LIBXSMM_VLA_ACCESS(5, h_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + libxsmm_bfloat16 *dst_co = &LIBXSMM_VLA_ACCESS(5, co_out, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh, _vf1, _vcs1, _vi1, _vci1, _vco1, _vo1, _vh1; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 32 ) { + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*bk)+_k] ); + _vi = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k] ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k] ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k] ); + _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*bk)+_k] ); + _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo, _halves ) ), _halves, _halves); + _vcs = _mm512_mul_ps( _vf, _vcs ); + _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); + _mm512_store_ps( &_cs[(_j*bk)+_k], _vcs ); + _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs ); + _vh = _mm512_mul_ps( _vo, _vco ); + _vo1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*bk)+_k+16] ); + _vi1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*bk)+_k+16] ); + _vci1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*bk)+_k+16] ); + _vf1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*bk)+_k+16] ); + _vcs1 = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*bk)+_k+16] ); + _vo1 = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo1, _halves ) ), _halves, _halves); + _vcs1 = _mm512_mul_ps( _vf1, _vcs1 ); + _vcs1 = _mm512_fmadd_ps( _vi1, _vci1, _vcs1 ); + _mm512_store_ps( &_cs[(_j*bk)+_k+16], _vcs1 ); + _vco1 = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs1 ); + _vh1 = _mm512_mul_ps( _vo1, _vco1 ); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_o[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vo1, _vo)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_cs[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vcs1, _vcs)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_h[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vh1, _vh)); + _mm512_storeu_si512((libxsmm_bfloat16*)&dst_co[(_j*bk)+_k], (__m512i) LIBXSMM_INTRINSISCS_MM512_CVTNE2PS_PBH(_vco1, _vco)); + } + } + } + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + +#undef NATIVE_MATRIX_RNE_CVT_FP32_BFP16_LD + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c 
b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..3dba8bbd --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_ck_generic.tpl.c @@ -0,0 +1,357 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Kunal Banerjee (Intel Corp.) +******************************************************************************/ + +/* Backward/update pass of the RNN cell template: time step t-1 is handled first, then steps t-2 down to 0. */ /* helper variables: i is the time step; ik/in/ic index K/N/C (usually block starts); jk/jb/jc run inside a block; ek/en/ec are the combined element indices */ +libxsmm_blasint i, ik, in, ic, jk, jb/*jn shadows global variable*/, jc, ek, en, ec; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; /* number of time steps; step t-1 is indexed unconditionally below, so t >= 1 is assumed */ +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wD = (element_filter_type*)handle->w->data; +element_filter_type *rD = (element_filter_type*)handle->r->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_filter_type *dwD = (element_filter_type*)handle->dw->data; +element_filter_type *drD = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *deltat = (element_output_type*)handle->scratch_deltat;
+element_input_type *scratch_xT = (element_input_type*)handle->scratch_xT; +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, w, wD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, r, rD, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(2, element_filter_type, dw, dwD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, dr, drD, K); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, delta, deltat, N, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +LIBXSMM_VLA_DECL(2, element_filter_type, wT, scratch_wT, C); +LIBXSMM_VLA_DECL(2, element_filter_type, rT, scratch_rT, K); +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) || defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) || defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) +element_output_type *zt = (element_output_type*)handle->internal_z; +LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); +#endif +/* define gemm kernels */ +libxsmm_smmfunction gemmkernela = libxsmm_smmdispatch( bc, bn, bk, &C, &K, &C, NULL, NULL, NULL, NULL ); +libxsmm_smmfunction gemmkernelb = libxsmm_smmdispatch( bk, bk, bn, &K, &N, &K, NULL, NULL, NULL, NULL ); +libxsmm_smmfunction gemmkernelc = libxsmm_smmdispatch( bk, bc, bn, &K, &N, &K, NULL, NULL, NULL, NULL ); +libxsmm_smmfunction gemmkerneld = libxsmm_smmdispatch( bk, bn, bk, &K, &K, &K, NULL, NULL, NULL, NULL ); + +/* logical thread id, i.e. tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks
that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? (ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ?
((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ?
((ltid + 1) * chunksize_k) : K; + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* initialization is done at the beginning */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(N*C*t, dxt, start_thread, tid, handle->desc.threads); +} +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + libxsmm_internal_matrix_zero(C*K, dwD, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K*K, drD, start_thread, tid, handle->desc.threads); + libxsmm_internal_matrix_zero(K, db, start_thread, tid, handle->desc.threads); +} + +/* transpose W */ +for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ik = (ikic / (C/bc))*bk; + ic = (ikic % (C/bc))*bc; + + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + ek = ik + jk; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, wT, ek, ec, C) = LIBXSMM_VLA_ACCESS(2, w, ec, ek, K); + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk))*bk; + ic = (ikic % (K/bk))*bk; + + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + ek = ik + jk; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, rT, ek, ec, K) = LIBXSMM_VLA_ACCESS(2, r, ec, ek, K); + } + } +} + +/* transpose xt for time step t-1 */ +for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + ic = (icin / (N/bn))*bc; + in = (icin % (N/bn))*bn; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, t-1, en, ec, N, C); + } + } +} + +/* transpose the hidden state preceding time step t-1: h[t-2], or the initial state hp when t == 1 (h[t-2] would index before the start of ht) */ +for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT,
ek, en, N) = (1 < t) ? LIBXSMM_VLA_ACCESS(3, h, t-2, en, ek, N, K) : LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +/* The following code is for time step t-1 */ +for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik / (K/bk))*bn; + ik = (inik % (K/bk))*bk; + +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) + libxsmm_internal_matrix_relu_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) + libxsmm_internal_matrix_sigmoid_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) + libxsmm_internal_matrix_tanh_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif + + libxsmm_internal_matrix_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), + &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * delta */ + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic / (C/bc))*bn; + ic = (inic % (C/bc))*bc; + + for (ik = 0; ik < K; ik += bk) { + gemmkernela( &LIBXSMM_VLA_ACCESS(2, wT, ik, ic, C), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, dx, t-1, in, ic, N, C) ); + } + } +} +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* gradient bias */ + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + db[ik] += LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); + } + } + + /* dr = delta * h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ic = (ikic / (K/bk))*bk; + ik = (ikic % (K/bk))*bk; + + for (in = 0; in < N; in += bn) { +
gemmkernelb( &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dr, ic, ik, K) ); + } + } + + /* dw = delta * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk))*bc; + ik = (ikic % (K/bk))*bk; + + for (in = 0; in < N; in += bn ) { + gemmkernelc( &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dw, ic, ik, K) ); + } + } +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +for (i = t-2; i >= 0; --i) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + ic = (icin / (N/bn))*bc; + in = (icin % (N/bn))*bn; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, i, en, ec, N, C); + } + } + } + + /* transpose the previous hidden state: hp for time step 0, h[i-1] otherwise */ + if (0 == i) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, i-1, en, ek, N, K); + } + } + } + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik / (K/bk))*bn; + ik = (inik % (K/bk))*bk; + + /* delta = dh */ + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); + + /* delta
+= R^T * delta+1 */ + for (ic = 0; ic < K; ic += bk) { + gemmkerneld( &LIBXSMM_VLA_ACCESS(2, rT, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, delta, i+1, in, ic, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); + } + + /* run inverse non-linear op */ +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) + libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) + libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) + libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * delta */ + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic / (C/bc))*bn; + ic = (inic % (C/bc))*bc; + + for (ik = 0; ik < K; ik += bk) { + gemmkernela( &LIBXSMM_VLA_ACCESS(2, wT, ik, ic, C), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, dx, i, in, ic, N, C) ); + } + } + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* gradient bias */ + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + db[ik] += LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); + } + } + + /* dr = delta * h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ic = (ikic / (K/bk))*bk; + ik = (ikic % (K/bk))*bk; + + for (in = 0; in < N; in += bn) { + gemmkernelb( &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dr, ic, ik, K) ); + } + } + +
/* dw = delta * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ic = (ikic / (K/bk))*bc; + ik = (ikic % (K/bk))*bk; + + for (in = 0; in < N; in += bn ) { + gemmkernelc( &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N), &LIBXSMM_VLA_ACCESS(2, dw, ic, ik, K) ); + } + } + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c new file mode 100644 index 00000000..2b18e0d7 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_bwdupd_nc_kcck.tpl.c @@ -0,0 +1,425 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.)
+******************************************************************************/ + +/* helper variables */ +libxsmm_blasint i, ik, ikb, in, inb, ic, icb, jk, jb/*jn shadows global variable*/, jc, ek, en, ec, BF, KB_BLOCKS, KB; +/* tensor dimensions */ +libxsmm_blasint K = handle->desc.K; +libxsmm_blasint N = handle->desc.N; +libxsmm_blasint C = handle->desc.C; +libxsmm_blasint t = handle->T; +libxsmm_blasint bk = handle->bk; +libxsmm_blasint bn = handle->bn; +libxsmm_blasint bc = handle->bc; +/* tensor raw pointers */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD = (element_input_type* )handle->hp->data; +element_filter_type *wtD = (element_filter_type*)handle->wt->data; +element_filter_type *rtD = (element_filter_type*)handle->rt->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_input_type *dxt = (element_input_type*)handle->dxt->data; +element_filter_type *dwD = (element_filter_type*)handle->dw->data; +element_filter_type *drD = (element_filter_type*)handle->dr->data; +element_output_type *db = (element_output_type*)handle->db->data; +element_output_type *dht = (element_output_type*)handle->dht->data; +element_output_type *deltat = (element_output_type*)handle->scratch_deltat; +element_input_type *scratch_xT = (element_input_type*)handle->scratch_xT; +#if 0 +element_filter_type *scratch_wT = (element_filter_type*)handle->scratch_wT; +element_filter_type *scratch_rT = (element_filter_type*)handle->scratch_rT; +#endif +element_output_type *scratch_hT = (element_output_type*)handle->scratch_hT; +/* Auxiliary variables for bact-reduce calls */ +libxsmm_blasint nBlocks = N/bn; +libxsmm_blasint cBlocks = C/bc; +libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; +const float beta = 0.0; +/* multidimensional arrays */ +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, wT, wtD, kBlocks, 
bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, rT, rtD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_input_type, dx, dxt, N, C); +LIBXSMM_VLA_DECL(4, element_filter_type, dw, dwD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, dr, drD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(3, element_output_type, dh, dht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, delta, deltat, N, K); +LIBXSMM_VLA_DECL(2, element_input_type, xT, scratch_xT, N); +#if 0 +LIBXSMM_VLA_DECL(4, element_filter_type, wT, scratch_wT, kBlocks, bk, bc); +LIBXSMM_VLA_DECL(4, element_filter_type, rT, scratch_rT, kBlocks, bk, bk); +#endif +LIBXSMM_VLA_DECL(2, element_output_type, hT, scratch_hT, N); +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) || defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) || defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) +element_output_type *zt = (element_output_type*)handle->internal_z; +LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); +#endif +/* define batch-reduce gemm kernels */ +/*const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelaz = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, &beta, NULL, NULL);*/ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelbz = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, &beta, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelcz = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, &beta, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bk, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelc = libxsmm_smmdispatch_reducebatch_addr( bk, bc, bn, &K, &N, &bk, NULL, NULL, NULL, NULL); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kerneld = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, NULL); +const 
libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bc, bn, bk, &bc, &K, &C, NULL, NULL, NULL, NULL); + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; + +/* number of tasks that could be run in parallel for N and K blocks*/ +const libxsmm_blasint work_nk = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_nk = (work_nk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nk / (libxsmm_blasint)handle->desc.threads) : ((work_nk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nk = (ltid * chunksize_nk < work_nk) ? (ltid * chunksize_nk) : work_nk; +const libxsmm_blasint thr_end_nk = ((ltid + 1) * chunksize_nk < work_nk) ? ((ltid + 1) * chunksize_nk) : work_nk; + +/* number of tasks that could be run in parallel for N and C blocks*/ +const libxsmm_blasint work_nc = (N/bn) * (C/bc); +/* compute chunk size */ +const libxsmm_blasint chunksize_nc = (work_nc % (libxsmm_blasint)handle->desc.threads == 0) ? (work_nc / (libxsmm_blasint)handle->desc.threads) : ((work_nc / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_nc = (ltid * chunksize_nc < work_nc) ? (ltid * chunksize_nc) : work_nc; +const libxsmm_blasint thr_end_nc = ((ltid + 1) * chunksize_nc < work_nc) ? ((ltid + 1) * chunksize_nc) : work_nc; + +/* number of tasks that could be run in parallel for C and K blocks*/ +const libxsmm_blasint work_ck = (C/bc) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_ck = (work_ck % (libxsmm_blasint)handle->desc.threads == 0) ? (work_ck / (libxsmm_blasint)handle->desc.threads) : ((work_ck / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_ck = (ltid * chunksize_ck < work_ck) ? 
(ltid * chunksize_ck) : work_ck; +const libxsmm_blasint thr_end_ck = ((ltid + 1) * chunksize_ck < work_ck) ? ((ltid + 1) * chunksize_ck) : work_ck; + +/* number of tasks that could be run in parallel for K and K blocks*/ +const libxsmm_blasint work_kk = (K/bk) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize_kk = (work_kk % (libxsmm_blasint)handle->desc.threads == 0) ? (work_kk / (libxsmm_blasint)handle->desc.threads) : ((work_kk / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_kk = (ltid * chunksize_kk < work_kk) ? (ltid * chunksize_kk) : work_kk; +const libxsmm_blasint thr_end_kk = ((ltid + 1) * chunksize_kk < work_kk) ? ((ltid + 1) * chunksize_kk) : work_kk; + +#if defined(LIBXSMM_RNN_CELL_AVX512) +int k_tasks = K/16; +int k_chunksize = (k_tasks % (libxsmm_blasint)handle->desc.threads == 0) ? (k_tasks / (libxsmm_blasint)handle->desc.threads) : ((k_tasks / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint k_thr_begin = (ltid * k_chunksize * 16 < K) ? (ltid * k_chunksize * 16) : K; +const libxsmm_blasint k_thr_end = ((ltid + 1) * k_chunksize * 16 < K) ? ((ltid + 1) * k_chunksize * 16) : K; +__m512 db_sum; +#else +/* number of tasks that could be run in parallel for K blocks*/ +/* compute chunk size */ +const libxsmm_blasint chunksize_k = (K % (libxsmm_blasint)handle->desc.threads == 0) ? (K / (libxsmm_blasint)handle->desc.threads) : ((K / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin_k = (ltid * chunksize_k < K) ? (ltid * chunksize_k) : K; +const libxsmm_blasint thr_end_k = ((ltid + 1) * chunksize_k < K) ? 
((ltid + 1) * chunksize_k) : K; + +#endif + +libxsmm_blasint ikic, inic, inik, icin, ikin; + +/* Auxiliary arrays for batch-reduce gemm calls */ +const element_filter_type *A_array[1024]; +const element_output_type *B_array[1024]; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (C >= 512 && K >= 512 && C%2 == 0 && K%2 == 0) { + BF = 2; +} +if (C >= 2048 && K >= 2048 && C%8 == 0 && K%8 == 0) { + BF = 8; +} +KB_BLOCKS = kBlocks/BF; + +#if 0 +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose W */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + ik = (ikic / (C/bc)); + ic = (ikic % (C/bc)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bc; ++jc) { + LIBXSMM_VLA_ACCESS(4, wT, ic, ik, jk, jc, kBlocks, bk, bc) = LIBXSMM_VLA_ACCESS(4, w, ik, ic, jc, jk, cBlocks, bc, bk); + } + } + } +} + +/* transpose R */ +for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + ik = (ikic / (K/bk)); + ic = (ikic % (K/bk)); + for (jk = 0; jk < bk; ++jk) { + for (jc = 0; jc < bk; ++jc) { + LIBXSMM_VLA_ACCESS(4, rT, ic, ik, jk, jc, kBlocks, bk, bk) = LIBXSMM_VLA_ACCESS(4, r, ik, ic, jc, jk, kBlocks, bk, bk); + } + } +} +#endif + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + ic = (icin / (N/bn))*bc; + in = (icin % (N/bn))*bn; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, t-1, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + 
LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, t-2, en, ek, N, K); + } + } + } +} + +/* The following code is for time step t-1 */ +for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik / (K/bk))*bn; + ik = (inik % (K/bk))*bk; + +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) + libxsmm_internal_matrix_relu_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) + libxsmm_internal_matrix_sigmoid_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) + libxsmm_internal_matrix_tanh_inverse_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +#endif + libxsmm_internal_matrix_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, t-1, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K) ); +} + +libxsmm_barrier_wait(handle->barrier, (int)ltid); + +if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* gemm kernel bwd_d */ + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic / (C/bc))*bn; + icb = (inic % (C/bc)); + ic = icb * bc; + /* Prepare arguments for batch-reduce call */ + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik+=bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + B_array[ikb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik + KB*KB_BLOCKS*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, t-1, in, ic, N, C), &blocks); + } + } +} + +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dr = delta * h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { 
+ icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelbz(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dr, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = delta * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, t-1, in, ik, N, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelcz(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dw, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } +} + +for (i = t-2; i >= 0; --i) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin_nk; inik < thr_end_nk; ++inik ) { + in = (inik / (K/bk))*bn; + ikb = (inik % (K/bk)); + ik = ikb*bk; + /* delta = dh */ + libxsmm_internal_matrix_copy_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, dh, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); + + /* delta += R^T * delta+1 */ + for (ic = 0; ic < kBlocks; ic++) { + A_array[ic] = &LIBXSMM_VLA_ACCESS(4, rT, ikb, ic, 0, 0, kBlocks, bk, bk); + B_array[ic] = &LIBXSMM_VLA_ACCESS(3, delta, i+1, in, ic*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = kBlocks; + batchreduce_kerneld(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) , &blocks); + + /* run inverse non-linear op */ +#if defined(LIBXSMM_DNN_RNN_RELU_BWDUPD) + libxsmm_internal_matrix_relu_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_BWDUPD) + libxsmm_internal_matrix_sigmoid_inverse_inplace_eltwise_mult_ld( bk, bn, K, 
&LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_BWDUPD) + libxsmm_internal_matrix_tanh_inverse_inplace_eltwise_mult_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K) ); +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* transpose xt for current timestep */ + for (icin = thr_begin_nc; icin < thr_end_nc; ++icin ) { + ic = (icin / (N/bn))*bc; + in = (icin % (N/bn))*bn; + + for (jc = 0; jc < bc; ++jc) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ec = ic + jc; + LIBXSMM_VLA_ACCESS(2, xT, ec, en, N) = LIBXSMM_VLA_ACCESS(3, x, i, en, ec, N, C); + } + } + } + + /* transpose ht for current timestep */ + if (0 == i) { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(2, hp, en, ek, K); + } + } + } + } else { + for (ikin = thr_begin_nk; ikin < thr_end_nk; ++ikin ) { + ik = (ikin / (N/bn))*bk; + in = (ikin % (N/bn))*bn; + + for (jk = 0; jk < bk; ++jk) { + for (jb = 0; jb < bn; ++jb) { + en = in + jb; + ek = ik + jk; + LIBXSMM_VLA_ACCESS(2, hT, ek, en, N) = LIBXSMM_VLA_ACCESS(3, h, i-1, en, ek, N, K); + } + } + } + } + } + + if ( (LIBXSMM_DNN_COMPUTE_KIND_BWD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dx = W^T * delta */ + for (KB = 0; KB < BF; KB++) { + for (inic = thr_begin_nc; inic < thr_end_nc; ++inic ) { + in = (inic / (C/bc))*bn; + icb = (inic % (C/bc)); + ic = icb * bc; + /* Prepare arguments for batch-reduce call */ + for (ik = 0, ikb = 0; ikb < KB_BLOCKS; ik+=bk, ikb++) { + A_array[ikb] = &LIBXSMM_VLA_ACCESS(4, wT, icb, ikb + KB*KB_BLOCKS, 0, 0, kBlocks, bk, bc); + 
B_array[ikb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik + KB*KB_BLOCKS*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, dx, i, in, ic, N, C), &blocks); + } + } + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); + + if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { + /* dr = delta * h^T */ + for (ikic = thr_begin_kk; ikic < thr_end_kk; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bk; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, hT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelb(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dr, ikb, icb, 0, 0, kBlocks, bk, bk), &blocks); + } + + /* dw = delta * x^T */ + for (ikic = thr_begin_ck; ikic < thr_end_ck; ++ikic ) { + icb = ikic / (K/bk); + ic = icb*bc; + ikb = ikic % (K/bk); + ik = ikb*bk; + + for (in = 0, inb = 0; in < N; in += bn, inb++) { + A_array[inb] = &LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K); + B_array[inb] = &LIBXSMM_VLA_ACCESS(2, xT, ic, in, N); + } + blocks = nBlocks; + batchreduce_kernelc(A_array, B_array, &LIBXSMM_VLA_ACCESS(4, dw, ikb, icb, 0, 0, cBlocks, bc, bk), &blocks); + } + } +} + +/* gradient bias */ +if ( (LIBXSMM_DNN_COMPUTE_KIND_UPD == kind) || (LIBXSMM_DNN_COMPUTE_KIND_BWDUPD == kind) ) { +#if defined(LIBXSMM_RNN_CELL_AVX512) + for (ik = k_thr_begin; ik < k_thr_end; ik += 16) { + db_sum = _mm512_setzero_ps(); + for (i = 0; i < t; i++) { + for (in = 0; in < N; in++) { + db_sum = _mm512_add_ps(db_sum, LIBXSMM_INTRINSICS_MM512_LOAD_PS(&LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, N, K))); + } + } + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&db[ik], db_sum); + } +#else + for (i = 0; i < t; i++) { + for (ik = thr_begin_k; ik < thr_end_k; ik++) { + for (in = 0; in < N; in++) { + db[ik] += LIBXSMM_VLA_ACCESS(3, delta, i, in, ik, 
N, K); + } + } + } +#endif +} +libxsmm_barrier_wait(handle->barrier, (int)ltid); diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c new file mode 100644 index 00000000..0e77df1e --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_ck_generic.tpl.c @@ -0,0 +1,92 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Kunal Banerjee (Intel Corp.) +******************************************************************************/ + +/* helper variables */ +libxsmm_blasint i, ik, in, ic, inik; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD= (element_input_type* )handle->hp->data; +element_filter_type *wD = (element_filter_type*)handle->w->data; +element_filter_type *rD = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *zt = (element_output_type*)handle->internal_z; +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(2, 
element_filter_type, w, wD, K); +LIBXSMM_VLA_DECL(2, element_filter_type, r, rD, K); +LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); +/* define gemm kernels */ +libxsmm_smmfunction gemmkernela = libxsmm_smmdispatch( bk, bn, bc, &K, &C, &K, NULL, NULL, NULL, NULL ); +libxsmm_smmfunction gemmkernelb = libxsmm_smmdispatch( bk, bn, bk, &K, &K, &K, NULL, NULL, NULL, NULL ); +/* parallelize over the (N/bn) x (K/bk) output blocks */ +/* logical thread id: tid relative to start_thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +const libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* All data is in column-major format */ +for (i = 0; i < t; ++i) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = (inik / (K/bk))*bn; + ik = (inik % (K/bk))*bk; + + /* z = per_col(b) */ + libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &b[ik] ); + + /* z += W.x */ + for (ic = 0; ic < C; ic += bc) { + /* this is a small matmul */ + gemmkernela( &LIBXSMM_VLA_ACCESS(2, w, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, x, i, in, ic, N, C), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); + } + /* z += R.h, where h is hp for the first time step and h[i-1] afterwards */ + if (0 == i) { + for (ic = 0; ic < K; ic += bk) { + /* this is a small matmul */ + gemmkernelb( &LIBXSMM_VLA_ACCESS(2, r, ic, ik, K), &LIBXSMM_VLA_ACCESS(2, hp, in, ic, K), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); + } + } else { + for (ic = 0; ic < K; ic += bk) { + /* this is a small matmul */ + gemmkernelb( &LIBXSMM_VLA_ACCESS(2, r, ic, ik, K), &LIBXSMM_VLA_ACCESS(3, h, i-1, in, ic, N, K), &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K) ); + } + } +#if defined(LIBXSMM_DNN_RNN_RELU_FWD) + libxsmm_internal_matrix_relu_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_FWD) + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in, ik, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in, ik, N, K) ); +#endif + } + + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c new file mode 100644 index 
00000000..a945819a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_nc_kcck.tpl.c @@ -0,0 +1,136 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.) +******************************************************************************/ + +/* helper variables */ +libxsmm_blasint i, ik, in, ic, inik, BF, CB, CB_BLOCKS, KB_BLOCKS; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD= (element_input_type* )handle->hp->data; +element_filter_type *wD = (element_filter_type*)handle->w->data; +element_filter_type *rD = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *zt = (element_output_type*)handle->internal_z; +/*libxsmm_blasint nBlocks = N/bn;*/ +libxsmm_blasint cBlocks = C/bc; +libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; +LIBXSMM_VLA_DECL(3, element_input_type, x, xt, N, C); +LIBXSMM_VLA_DECL(2, element_input_type, hp, hpD, K); +LIBXSMM_VLA_DECL(4, element_filter_type, w, wD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, r, rD, kBlocks, bk, bk); 
+LIBXSMM_VLA_DECL(3, element_output_type, h, ht, N, K); +LIBXSMM_VLA_DECL(3, element_output_type, z, zt, N, K); +int prefetch_mode = LIBXSMM_GEMM_PREFETCH_NONE/*LIBXSMM_GEMM_PREFETCH_AL1_BL1*/; +/* define gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &C, &K, NULL, NULL, NULL, &prefetch_mode ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &K, &K, NULL, NULL, NULL, &prefetch_mode ); + +/* Auxiliary arrays for batch-reduce gemms */ +const element_input_type *A_array[1024]; +const element_input_type *B_array[1024]; +const element_input_type *A_array2[1024]; +const element_input_type *B_array2[1024]; + +/* computing first logical thread */ +const libxsmm_blasint ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? 
((ltid + 1) * chunksize) : work; + +/* Blocking reduction domain if it is too large */ +BF = 1; +if (C >= 2048 && K >= 2048 && C%2 == 0 && K%2 == 0) { + BF = 2; +} +CB_BLOCKS = cBlocks/BF; +KB_BLOCKS = kBlocks/BF; +assert(CB_BLOCKS <= 1024); +assert(KB_BLOCKS <= 1024); + +/* lazy barrier init */ +libxsmm_barrier_init(handle->barrier, (int)ltid); + +/* All data is in column-major format */ +for (i = 0; i < t; ++i) { + /* let's run the cell in blocks for good locality */ + for (CB = 0; CB < BF; CB++) { + for (inik = thr_begin; inik < thr_end; ++inik ) { + if (C >= 2048 && K >= 2048) { + in = inik % (N/bn); + ik = inik / (N/bn); + } else { + in = inik / (K/bk); + ik = inik % (K/bk); + } + + /* z = per_col(b), only on the first reduction chunk (CB == 0) */ + if (0 == CB) { + libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &b[ik*bk] ); + } + + /* z += W.x */ + /* Prepare arrays for the call */ + for (ic = 0; ic < CB_BLOCKS; ic++) { + /* collect block addresses of w and x for this reduction chunk */ + A_array[ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic + CB*CB_BLOCKS, 0, 0, cBlocks, bc, bk); + B_array[ic] = &LIBXSMM_VLA_ACCESS(3, x, i, in*bn, (ic + CB*CB_BLOCKS)*bc, N, C); + } + /* Reduce batch gemm call */ + blocks = CB_BLOCKS; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); + + /* z += R.h, where h is hp for the first time step and h[i-1] afterwards */ + if (0 == i) { + /* Prepare arrays for the call */ + for (ic = 0; ic < KB_BLOCKS; ic++) { + A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array2[ic] = &LIBXSMM_VLA_ACCESS(2, hp, in*bn, (ic + CB*KB_BLOCKS)*bk, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); + } else { + /* Prepare arrays for the call */ + for (ic = 0; ic < KB_BLOCKS; ic++) { + A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic + CB*KB_BLOCKS, 0, 0, kBlocks, bk, bk); + B_array2[ic] = &LIBXSMM_VLA_ACCESS(3, h, i-1, in*bn, (ic + 
CB*KB_BLOCKS)*bk, N, K); + } + /* Reduce batch gemm call */ + blocks = KB_BLOCKS; + batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &blocks); + } +#if defined(LIBXSMM_DNN_RNN_RELU_FWD) + libxsmm_internal_matrix_relu_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) + libxsmm_internal_matrix_sigmoid_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_FWD) + libxsmm_internal_matrix_tanh_ld( bk, bn, K, &LIBXSMM_VLA_ACCESS(3, z, i, in*bn, ik*bk, N, K), &LIBXSMM_VLA_ACCESS(3, h, i, in*bn, ik*bk, N, K) ); +#endif + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c new file mode 100644 index 00000000..2717adf3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_rnncell_st_rnn_fwd_ncnc_kcck.tpl.c @@ -0,0 +1,234 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas, Alexander Heinecke, Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ + +/* helper variables */ +libxsmm_blasint i, ik, in, ic, inik; +/* input sizes */ +const libxsmm_blasint K = handle->desc.K; +const libxsmm_blasint N = handle->desc.N; +const libxsmm_blasint C = handle->desc.C; +const libxsmm_blasint t = handle->T; +const libxsmm_blasint bk = handle->bk; +const libxsmm_blasint bn = handle->bn; +const libxsmm_blasint bc = handle->bc; +/* define tensors */ +element_input_type *xt = (element_input_type* )handle->xt->data; +element_input_type *hpD= (element_input_type* )handle->hp->data; +element_filter_type *wD = (element_filter_type*)handle->w->data; +element_filter_type *rD = (element_filter_type*)handle->r->data; +element_output_type *b = (element_output_type*)handle->b->data; +element_output_type *ht = (element_output_type*)handle->ht->data; +element_output_type *zt = (element_output_type*)handle->internal_z; +libxsmm_blasint nBlocks = N/bn; +libxsmm_blasint cBlocks = C/bc; +libxsmm_blasint kBlocks = K/bk; +unsigned long long blocks; +LIBXSMM_VLA_DECL(5, element_input_type, x, xt, nBlocks, cBlocks, bn, bc); +LIBXSMM_VLA_DECL(4, element_input_type, hp, hpD, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, w, wD, cBlocks, bc, bk); +LIBXSMM_VLA_DECL(4, element_filter_type, r, rD, kBlocks, bk, bk); +LIBXSMM_VLA_DECL(5, element_output_type, h, ht, nBlocks, kBlocks, bn, bk); +LIBXSMM_VLA_DECL(5, element_output_type, z, zt, nBlocks, kBlocks, bn, bk); +int prefetch_mode = LIBXSMM_GEMM_PREFETCH_NONE/*LIBXSMM_GEMM_PREFETCH_AL1_BL1*/; +/* define gemm kernels */ +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernela = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bc, &bk, &bc, &bk, NULL, NULL, NULL, &prefetch_mode ); +const libxsmm_smmfunction_reducebatch_addr batchreduce_kernelb = libxsmm_smmdispatch_reducebatch_addr( bk, bn, bk, &bk, &bk, &bk, NULL, NULL, NULL, &prefetch_mode ); + +/* computing first logical thread */ +const libxsmm_blasint 
ltid = (libxsmm_blasint)tid - (libxsmm_blasint)start_thread; +/* number of tasks that could be run in parallel */ +const libxsmm_blasint work = (N/bn) * (K/bk); +/* compute chunk size */ +const libxsmm_blasint chunksize = (work % (libxsmm_blasint)handle->desc.threads == 0) ? (work / (libxsmm_blasint)handle->desc.threads) : ((work / (libxsmm_blasint)handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +libxsmm_blasint thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work; +libxsmm_blasint thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work; + +/* The snippet below does a 2D domain decomposition of output IF the number of threads and the number of work items are compatible */ +/* TODO: For now 2D decomposition targets single socket SKX */ +int row_teams = 7; +int column_teams = 4; +libxsmm_blasint my_col_id = ltid % column_teams; +libxsmm_blasint my_row_id = ltid / column_teams; +int in_tasks = (int)(N/bn); +int ik_tasks = (int)(K/bk); +int in_tasks_per_thread = in_tasks/row_teams; +int ik_tasks_per_thread = ik_tasks/column_teams; +libxsmm_blasint my_in_start = my_row_id * in_tasks_per_thread; +libxsmm_blasint my_in_end = (my_row_id+1) * in_tasks_per_thread; +libxsmm_blasint my_ik_start = my_col_id * ik_tasks_per_thread; +libxsmm_blasint my_ik_end = (my_col_id+1) * ik_tasks_per_thread; +int perform_2d_decomp = (in_tasks % row_teams == 0 && ik_tasks % column_teams == 0 && row_teams*column_teams == handle->desc.threads && cBlocks <= 32 && kBlocks <= 32 && ik_tasks_per_thread <= 16 && in_tasks_per_thread <= 2 ) ? 
1 : 0; + +if (perform_2d_decomp) { + /* Auxiliary arrays for batch-reduce gemms and potential prefetch */ + const element_input_type *A_array[16][2][32]; + const element_input_type *B_array[16][2][32]; + const element_input_type *A_array2[16][2][32]; + const element_input_type *B_array2[16][2][32]; + const element_input_type *A_array_pf[16][2][32]; + const element_input_type *B_array_pf[16][2][32]; + const element_input_type *A_array2_pf[16][2][32]; + const element_input_type *B_array2_pf[16][2][32]; + int ii, jj; + + /* lazy barrier init */ + libxsmm_barrier_init(handle->barrier, (int)ltid); + + /* All data is in column-major format */ + for (i = 0; i < t; ++i) { + /* Prepare arrays for the batch-reduce calls */ + for (ik = my_ik_start, ii = 0; ik < my_ik_end; ++ik, ii++ ) { + for (in = my_in_start, jj = 0; in < my_in_end; ++in, jj++ ) { + /* Prepare arrays for the call */ + for (ic = 0; ic < cBlocks; ic++) { + /* this is a small matmul */ + A_array[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic, 0, 0, cBlocks, bc, bk); + B_array[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(5, x, i, in, ic, 0, 0, nBlocks, cBlocks, bn, bc); + } + /* z += U.h */ + if (0 == i) { + /* Prepare arrays for the call */ + for (ic = 0; ic < kBlocks; ic++) { + A_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); + B_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, hp, in, ic, 0, 0, kBlocks, bn, bk); + } + } else { + /* Prepare arrays for the call */ + for (ic = 0; ic < kBlocks; ic++) { + A_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); + B_array2[ii][jj][ic] = &LIBXSMM_VLA_ACCESS(5, h, i-1, in, ic, 0, 0, nBlocks, kBlocks, bn, bk); + } + } + } + } + + if (prefetch_mode != LIBXSMM_GEMM_PREFETCH_NONE) { /* coverity[dead_error_begin] */ + /* Prepare additional prefetch arrays that are shifted images of regular ones when external prefetching is requested */ + int pf_dist_A = 2; + int pf_dist_B = 4; + libxsmm_blasint total_blocks = 
in_tasks_per_thread*ik_tasks_per_thread*cBlocks; + const element_input_type **src_ptr = &A_array[0][0][0]; + const element_input_type **dst_ptr = &A_array_pf[0][0][0]; + for (ii = 0; ii < total_blocks - pf_dist_A; ii++) { + dst_ptr[ii] = src_ptr[ii+pf_dist_A]; + } + src_ptr = &B_array[0][0][0]; + dst_ptr = &B_array_pf[0][0][0]; + for (ii = 0; ii < total_blocks - pf_dist_B; ii++) { + dst_ptr[ii] = src_ptr[ii+pf_dist_B]; + } + total_blocks = in_tasks_per_thread*ik_tasks_per_thread*kBlocks; + src_ptr = &A_array2[0][0][0]; + dst_ptr = &A_array2_pf[0][0][0]; + for (ii = 0; ii < total_blocks - pf_dist_A; ii++) { + dst_ptr[ii] = src_ptr[ii+pf_dist_A]; + } + src_ptr = &B_array2[0][0][0]; + dst_ptr = &B_array2_pf[0][0][0]; + for (ii = 0; ii < total_blocks - pf_dist_B; ii++) { + dst_ptr[ii] = src_ptr[ii+pf_dist_B]; + } + } + + /* let's run the cell in blocks for good locality */ + for (ik = my_ik_start, ii = 0; ik < my_ik_end; ++ik, ii++ ) { + for (in = my_in_start, jj = 0; in < my_in_end; ++in, jj++ ) { + /* z = per_col(b) */ + libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &b[ik*bk]); + /* z += W.x */ + blocks = cBlocks; + batchreduce_kernela(&A_array[ii][jj][0], &B_array[ii][jj][0], &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks, &A_array_pf[ii][jj][0], &B_array_pf[ii][jj][0]); + /* z += U.h */ + blocks = kBlocks; + batchreduce_kernelb(&A_array2[ii][jj][0], &B_array2[ii][jj][0], &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks, &A_array2_pf[ii][jj][0], &B_array2_pf[ii][jj][0]); + +#if defined(LIBXSMM_DNN_RNN_RELU_FWD) + libxsmm_internal_matrix_relu_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) + libxsmm_internal_matrix_sigmoid_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 
0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_FWD) + libxsmm_internal_matrix_tanh_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif + } + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + } +} else { + /* Auxiliary arrays for batch-reduce gemms */ + const element_input_type *A_array[1024]; + const element_input_type *B_array[1024]; + const element_input_type *A_array2[1024]; + const element_input_type *B_array2[1024]; + assert(kBlocks <= 1024); + assert(cBlocks <= 1024); + + /* lazy barrier init */ + libxsmm_barrier_init(handle->barrier, (int)ltid); + + /* All data is in column-major format */ + for (i = 0; i < t; ++i) { + /* let's run the cell in blocks for good locality */ + for (inik = thr_begin; inik < thr_end; ++inik ) { + in = inik / (K/bk); + ik = inik % (K/bk); + + /* z = per_col(b) */ + libxsmm_internal_matrix_bcst_colvector_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &b[ik*bk]); + + /* z += W.x */ + /* Prepare arrays for the call */ + for (ic = 0; ic < cBlocks; ic++) { + /* this is a small matmul */ + A_array[ic] = &LIBXSMM_VLA_ACCESS(4, w, ik, ic, 0, 0, cBlocks, bc, bk); + B_array[ic] = &LIBXSMM_VLA_ACCESS(5, x, i, in, ic, 0, 0, nBlocks, cBlocks, bn, bc); + } + /* Reduce batch gemm call */ + blocks = cBlocks; + batchreduce_kernela(A_array, B_array, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); + + /* z += U.h */ + if (0 == i) { + /* Prepare arrays for the call */ + for (ic = 0; ic < kBlocks; ic++) { + A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); + B_array2[ic] = &LIBXSMM_VLA_ACCESS(4, hp, in, ic, 0, 0, kBlocks, bn, bk); + } + /* Reduce batch gemm call */ + blocks = kBlocks; + batchreduce_kernelb(A_array2, B_array2, 
&LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); + } else { + /* Prepare arrays for the call */ + for (ic = 0; ic < kBlocks; ic++) { + A_array2[ic] = &LIBXSMM_VLA_ACCESS(4, r, ik, ic, 0, 0, kBlocks, bk, bk); + B_array2[ic] = &LIBXSMM_VLA_ACCESS(5, h, i-1, in, ic, 0, 0, nBlocks, kBlocks, bn, bk); + } + /* Reduce batch gemm call */ + blocks = kBlocks; + batchreduce_kernelb(A_array2, B_array2, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &blocks); + } + +#if defined(LIBXSMM_DNN_RNN_RELU_FWD) + libxsmm_internal_matrix_relu_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif +#if defined(LIBXSMM_DNN_RNN_SIGMOID_FWD) + libxsmm_internal_matrix_sigmoid_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif +#if defined(LIBXSMM_DNN_RNN_TANH_FWD) + libxsmm_internal_matrix_tanh_ld( bk, bn, bk, &LIBXSMM_VLA_ACCESS(5, z, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk), &LIBXSMM_VLA_ACCESS(5, h, i, in, ik, 0, 0, nBlocks, kBlocks, bn, bk)); +#endif + } + libxsmm_barrier_wait(handle->barrier, (int)ltid); + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c new file mode 100644 index 00000000..0ea81cef --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_bwd_ncnc_generic.tpl.c @@ -0,0 +1,148 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) +#define LIBXSMM_DNN_CONVERT_F32_BF16(in, out, length) do { \ + unsigned int full_chunks = length / 16; \ + unsigned int remainder = length % 16; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 16) { \ + _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 16; \ + _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ + } \ + libxsmm_rne_convert_fp32_bf16((const float*)in+16*full_chunks, (libxsmm_bfloat16*)out+16*full_chunks, remainder); \ + } \ +} while(0) + +#define LIBXSMM_DNN_CONVERT_BF16_F32(in, out, length) do { \ + unsigned int full_chunks = length / 16; \ + unsigned int remainder = length % 16; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 16) { \ + _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 16; \ + _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ + } \ + libxsmm_convert_bf16_f32( (const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ + } \ +} 
while(0) +#endif + +libxsmm_blasint bn = handle->bn; +libxsmm_blasint Bn = handle->Bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint Bc = handle->Bc; + +/* loop counters */ +int i = 0; +libxsmm_blasint img1, img2, ifm1, ifm2; + +float rcp_N = 1.0f/handle->desc.N; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could run in parallel for the batch */ +const int n_work = Bn * bn; +/* compute chunk size */ +const int n_chunksize = (n_work % handle->desc.threads == 0) ? (n_work / handle->desc.threads) : ((n_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int n_thr_begin = (ltid * n_chunksize < n_work) ? (ltid * n_chunksize) : n_work; +const int n_thr_end = ((ltid + 1) * n_chunksize < n_work) ? ((ltid + 1) * n_chunksize) : n_work; + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) || defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) +/* number of tasks that could run in parallel for the batch */ +const int nc_work = Bn * bn; +/* compute chunk size */ +const int nc_chunksize = (nc_work % handle->desc.threads == 0) ? (nc_work / handle->desc.threads) : ((nc_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int nc_thr_begin = (ltid * nc_chunksize < nc_work) ? (ltid * nc_chunksize) : nc_work; +const int nc_thr_end = ((ltid + 1) * nc_chunksize < nc_work) ? 
((ltid + 1) * nc_chunksize) : nc_work; + +libxsmm_bfloat16* poutput_bf16 = (element_output_type*)handle->reg_output->data; +libxsmm_bfloat16* pdinput_bf16 = (element_input_type*)handle->grad_input->data; +float* poutput_fp32 = (float*)handle->scratch; +float* pdinput_fp32 = ((float*)handle->scratch)+(handle->desc.N*handle->desc.C); +LIBXSMM_VLA_DECL(4, const float, output, poutput_fp32, Bc, bn, bc); +LIBXSMM_VLA_DECL(4, float, dinput, pdinput_fp32, Bc, bn, bc); +#else +LIBXSMM_VLA_DECL(4, const element_output_type, output, (element_output_type*)handle->reg_output->data, Bc, bn, bc); +LIBXSMM_VLA_DECL(4, element_input_type, dinput, (element_input_type*)handle->grad_input->data, Bc, bn, bc); +#endif +LIBXSMM_VLA_DECL(2, const element_label_type, label, (element_label_type*)handle->label->data, bn); + +/* lazy barrier init */ +libxsmm_barrier_init( handle->barrier, ltid ); + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) +for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { + libxsmm_bfloat16_hp out; + out.i[0] = 0; + out.i[1] = poutput_bf16[i]; + poutput_fp32[i] = out.f; +} + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) +LIBXSMM_DNN_CONVERT_BF16_F32(poutput_bf16+nc_thr_begin, poutput_fp32+nc_thr_begin, nc_thr_end-nc_thr_begin); + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif + +for ( i = n_thr_begin; i < n_thr_end; ++i ) { + img1 = i/bn; + img2 = i%bn; + + /* set output to input and set compute max per image */ + for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { + for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { + if ( (ifm1*Bc)+ifm2 == (libxsmm_blasint)LIBXSMM_VLA_ACCESS( 2, label, img1, img2, bn ) ) { + LIBXSMM_VLA_ACCESS( 4, dinput, img1, ifm1, img2, ifm2, Bc, bn, bc ) = + ( LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) - 1.0f ) * rcp_N * handle->desc.loss_weight; + } else { + LIBXSMM_VLA_ACCESS( 4, dinput, img1, ifm1, img2, ifm2, Bc, bn, bc ) = + LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, 
ifm2, Bc, bn, bc ) * rcp_N * handle->desc.loss_weight; + } + } + } +} + +libxsmm_barrier_wait( handle->barrier, ltid ); + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16) +for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { + libxsmm_bfloat16_hp din; + din.f = pdinput_fp32[i]; + pdinput_bf16[i] = din.i[1]; +} + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_BWD_BF16_AVX512) +LIBXSMM_DNN_CONVERT_F32_BF16(pdinput_fp32+nc_thr_begin, pdinput_bf16+nc_thr_begin, nc_thr_end-nc_thr_begin); + +libxsmm_barrier_wait( handle->barrier, ltid ); +#undef LIBXSMM_DNN_CONVERT_F32_BF16 +#undef LIBXSMM_DNN_CONVERT_BF16_F32 +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c new file mode 100644 index 00000000..af7b022a --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_softmaxloss_st_fwd_ncnc_generic.tpl.c @@ -0,0 +1,179 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) +#define LIBXSMM_DNN_CONVERT_F32_BF16(in, out, length) do { \ + unsigned int full_chunks = length / 16; \ + unsigned int remainder = length % 16; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 16) { \ + _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 16; \ + _mm256_storeu_si256((__m256i*)(out+__i), _mm512_cvtepi32_epi16( _mm512_srai_epi32( LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16( LIBXSMM_INTRINSICS_MM512_LOAD_PS((const float*)in+__i) ),16)) ); \ + } \ + libxsmm_rne_convert_fp32_bf16((const float*)in+16*full_chunks, (libxsmm_bfloat16*)out+16*full_chunks, remainder); \ + } \ +} while(0) + +#define LIBXSMM_DNN_CONVERT_BF16_F32(in, out, length) do { \ + unsigned int full_chunks = length / 16; \ + unsigned int remainder = length % 16; \ + int __i = 0; \ + if (remainder == 0) { \ + for ( __i = 0; __i < length; __i+= 16) { \ + _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ + } \ + } else { \ + unsigned int chunk; \ + for ( chunk = 0; chunk < full_chunks; chunk++) { \ + __i = chunk * 16; \ + _mm512_storeu_ps( out+__i, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(in+__i))),16))); \ + } \ + libxsmm_convert_bf16_f32( (const libxsmm_bfloat16*)in+16*full_chunks, (float*)out+16*full_chunks, remainder); \ + } \ +} while(0) +#endif + +libxsmm_blasint bn = handle->bn; +libxsmm_blasint Bn = handle->Bn; +libxsmm_blasint bc = handle->bc; +libxsmm_blasint Bc = handle->Bc; + +/* loop counters */ +int i = 0; +libxsmm_blasint img1, img2, 
ifm1, ifm2; + +/* computing first logical thread */ +const int ltid = tid - start_thread; + +/* number of tasks that could run in parallel for the batch */ +const int n_work = Bn * bn; +/* compute chunk size */ +const int n_chunksize = (n_work % handle->desc.threads == 0) ? (n_work / handle->desc.threads) : ((n_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int n_thr_begin = (ltid * n_chunksize < n_work) ? (ltid * n_chunksize) : n_work; +const int n_thr_end = ((ltid + 1) * n_chunksize < n_work) ? ((ltid + 1) * n_chunksize) : n_work; + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) || defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) +/* number of tasks that could run in parallel for the batch */ +const int nc_work = Bn * bn; +/* compute chunk size */ +const int nc_chunksize = (nc_work % handle->desc.threads == 0) ? (nc_work / handle->desc.threads) : ((nc_work / handle->desc.threads) + 1); +/* compute thr_begin and thr_end */ +const int nc_thr_begin = (ltid * nc_chunksize < nc_work) ? (ltid * nc_chunksize) : nc_work; +const int nc_thr_end = ((ltid + 1) * nc_chunksize < nc_work) ? 
((ltid + 1) * nc_chunksize) : nc_work; + +libxsmm_bfloat16* poutput_bf16 = (element_output_type*)handle->reg_output->data; +libxsmm_bfloat16* pinput_bf16 = (element_input_type*)handle->reg_input->data; +float* poutput_fp32 = (float*)handle->scratch; +float* pinput_fp32 = ((float*)handle->scratch)+(handle->desc.N*handle->desc.C); +LIBXSMM_VLA_DECL(4, float, output, poutput_fp32, Bc, bn, bc); +LIBXSMM_VLA_DECL(4, const float, input, pinput_fp32, Bc, bn, bc); +#else +LIBXSMM_VLA_DECL(4, element_output_type, output, (element_output_type*)handle->reg_output->data, Bc, bn, bc); +LIBXSMM_VLA_DECL(4, const element_input_type, input, (element_input_type*)handle->reg_input->data, Bc, bn, bc); +#endif +LIBXSMM_VLA_DECL(2, const element_label_type, label, (element_label_type*)handle->label->data, bn); + +/* lazy barrier init */ +libxsmm_barrier_init( handle->barrier, ltid ); + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) +for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { + libxsmm_bfloat16_hp in; + in.i[0] = 0; + in.i[1] = pinput_bf16[i]; + pinput_fp32[i] = in.f; +} + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) +LIBXSMM_DNN_CONVERT_BF16_F32(pinput_bf16+nc_thr_begin, pinput_fp32+nc_thr_begin, nc_thr_end-nc_thr_begin); + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif + +for ( i = n_thr_begin; i < n_thr_end; ++i ) { + float max = FLT_MIN; + float sum_of_exp = 0.0f; + + img1 = i/bn; + img2 = i%bn; + + /* set output to input and set compute max per image */ + for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { + for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { + LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) = LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ); + if ( LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ) > max ) { + max = LIBXSMM_VLA_ACCESS( 4, input, img1, ifm1, img2, ifm2, Bc, bn, bc ); + } + } + } + + /* sum exp over outputs */ + for ( ifm1 = 0; ifm1 < Bc; ++ifm1 
) { + for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { + LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) = (float)exp( (double)(LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) - max) ); + sum_of_exp += LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ); + } + } + + /* scale output */ + sum_of_exp = 1.0f/sum_of_exp; + for ( ifm1 = 0; ifm1 < Bc; ++ifm1 ) { + for ( ifm2 = 0; ifm2 < bc; ++ifm2 ) { + LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) = LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1, img2, ifm2, Bc, bn, bc ) * sum_of_exp; + } + } +} + +libxsmm_barrier_wait( handle->barrier, ltid ); + +/* calculate loss single threaded */ +if ( ltid == 0 ) { + handle->loss = 0.0f; + for ( img1 = 0; img1 < Bn; ++img1 ) { + for ( img2 = 0; img2 FLT_MIN ) ? LIBXSMM_VLA_ACCESS( 4, output, img1, ifm1b, img2, ifm2b, Bc, bn, bc ) : FLT_MIN; + handle->loss = LIBXSMM_LOGF( val ); + } + } + handle->loss = ((-1.0f)*handle->loss)/handle->desc.N; +} + +libxsmm_barrier_wait( handle->barrier, ltid ); + +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16) +for ( i = nc_thr_begin; i < nc_thr_end; ++i ) { + libxsmm_bfloat16_hp in; + in.f = poutput_fp32[i]; + poutput_bf16[i] = in.i[1]; +} + +libxsmm_barrier_wait( handle->barrier, ltid ); +#endif +#if defined(LIBXSMM_DNN_SOFTMAXLOSS_FWD_BF16_AVX512) +LIBXSMM_DNN_CONVERT_F32_BF16(poutput_fp32+nc_thr_begin, poutput_bf16+nc_thr_begin, nc_thr_end-nc_thr_begin); + +libxsmm_barrier_wait( handle->barrier, ltid ); +#undef LIBXSMM_DNN_CONVERT_F32_BF16 +#undef LIBXSMM_DNN_CONVERT_BF16_F32 +#endif + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c new file mode 100644 index 00000000..300a7baf --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_in_nchw.tpl.c @@ -0,0 +1,34 @@ +/****************************************************************************** +* 
Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +/* use for-loops to potentially leverage NUMA in the future */ +int i1, i2, i3; +#if defined(LIBXSMM_DNN_COPY_LOW_PRECISION) +int lpb = tensor->layout->dim_size[0]; +int bfm = tensor->layout->dim_size[1]; +int fmb = tensor->layout->dim_size[2]; +#else +int lpb = 1; +int bfm = tensor->layout->dim_size[0]; +int fmb = tensor->layout->dim_size[1]; +#endif + +const element_type* user_data = (const element_type*)data; +LIBXSMM_VLA_DECL(3, element_type, handle_data, (element_type*)tensor->data, bfm, lpb); + +for (i1 = 0; i1 < fmb; ++i1) { + for (i2 = 0; i2 < bfm; ++i2) { + for (i3 = 0; i3 < lpb; ++i3) { + LIBXSMM_VLA_ACCESS(3, handle_data, i1, i2, i3, bfm, lpb) = user_data[(i1*bfm*lpb) + (i2*lpb) + i3]; + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c new file mode 100644 index 00000000..f559d886 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_bias_copy_out_nchw.tpl.c @@ -0,0 +1,34 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +/* use for-loops to potentially leverage NUMA in the future */ +int i1, i2, i3; +#if defined(LIBXSMM_DNN_COPY_LOW_PRECISION) +int lpb = tensor->layout->dim_size[0]; +int bfm = tensor->layout->dim_size[1]; +int fmb = tensor->layout->dim_size[2]; +#else +int lpb = 1; +int bfm = tensor->layout->dim_size[0]; +int fmb = tensor->layout->dim_size[1]; +#endif + +element_type* user_data = (element_type*)data; +LIBXSMM_VLA_DECL(3, const element_type, handle_data, (const element_type*)tensor->data, bfm, lpb); + +for (i1 = 0; i1 < fmb; ++i1) { + for (i2 = 0; i2 < bfm; ++i2) { + for (i3 = 0; i3 < lpb; ++i3) { + user_data[(i1*bfm*lpb) + (i2*lpb) + i3] = LIBXSMM_VLA_ACCESS(3, handle_data, i1, i2, i3, bfm, lpb); + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c new file mode 100644 index 00000000..d1c1d1a5 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_in_nchw.tpl.c @@ -0,0 +1,51 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
+******************************************************************************/ + +int i1, i2, i3, i4, i5, i6; +int lpb, bfm, W, H, fmb, N, C; +/* low precision formatting */ +if ( tensor->layout->num_dims == 6 ) { + lpb = tensor->layout->dim_size[0]; + bfm = tensor->layout->dim_size[1]; + W = tensor->layout->dim_size[2]; + H = tensor->layout->dim_size[3]; + fmb = tensor->layout->dim_size[4]; + N = tensor->layout->dim_size[5]; +} else { + lpb = 1; + bfm = tensor->layout->dim_size[0]; + W = tensor->layout->dim_size[1]; + H = tensor->layout->dim_size[2]; + fmb = tensor->layout->dim_size[3]; + N = tensor->layout->dim_size[4]; +} +C = fmb * bfm * lpb; + +/*printf(" layout act copy in N %i fmb %i H %i W %i bfm %i lpb %i \n", N, fmb, H, W, bfm, lpb);*/ +{ + LIBXSMM_VLA_DECL(6, element_type, handle_data_1, (element_type*)tensor->data, fmb, H, W, bfm, lpb); + LIBXSMM_VLA_DECL(4, const element_type, user_data, (const element_type*)data, C, H, W); + + for (i1 = 0; i1 < N; ++i1) { + for (i2 = 0; i2 < fmb; ++i2) { + for (i3 = 0; i3 < H; ++i3) { + for (i4 = 0; i4 < W; ++i4) { + for (i5 = 0; i5 < bfm; ++i5) { + for (i6 = 0; i6 < lpb; ++i6) { + LIBXSMM_VLA_ACCESS(6, handle_data_1, i1, i2, i3, i4, i5, i6, fmb, H, W, bfm, lpb) = + LIBXSMM_VLA_ACCESS(4, user_data, i1, ((size_t)i2*bfm*lpb) + ((size_t)i5*lpb) + i6, i3, i4, C, H, W); + } + } + } + } + } + } +} diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c new file mode 100644 index 00000000..356809e6 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_buffer_copy_out_nchw.tpl.c @@ -0,0 +1,51 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) +******************************************************************************/ + +int i1, i2, i3, i4, i5, i6; +int lpb, bfm, W, H, fmb, N, C; +/* low precision formatting */ +if ( tensor->layout->num_dims == 6 ) { + lpb = tensor->layout->dim_size[0]; + bfm = tensor->layout->dim_size[1]; + W = tensor->layout->dim_size[2]; + H = tensor->layout->dim_size[3]; + fmb = tensor->layout->dim_size[4]; + N = tensor->layout->dim_size[5]; +} else { + lpb = 1; + bfm = tensor->layout->dim_size[0]; + W = tensor->layout->dim_size[1]; + H = tensor->layout->dim_size[2]; + fmb = tensor->layout->dim_size[3]; + N = tensor->layout->dim_size[4]; +} +C = fmb * bfm * lpb; + +/* printf(" layout act copy out N %i fmb %i H %i W %i bfm %i lpb %i \n", N, fmb, H, W, bfm, lpb); */ +{ + LIBXSMM_VLA_DECL(6, const element_type, handle_data_1, (const element_type*)tensor->data, fmb, H, W, bfm, lpb); + LIBXSMM_VLA_DECL(4, element_type, user_data, (element_type*)data, C, H, W); + + for (i1 = 0; i1 < N; ++i1) { + for (i2 = 0; i2 < fmb; ++i2) { + for (i3 = 0; i3 < H; ++i3) { + for (i4 = 0; i4 < W; ++i4) { + for (i5 = 0; i5 < bfm; ++i5) { + for (i6 = 0; i6 < lpb; ++i6) { + LIBXSMM_VLA_ACCESS(4, user_data, i1, ((size_t)i2*bfm*lpb) + ((size_t)i5*lpb) + i6, i3, i4, C, H, W) = + LIBXSMM_VLA_ACCESS(6, handle_data_1, i1, i2, i3, i4, i5, i6, fmb, H, W, bfm, lpb); + } + } + } + } + } + } +} diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c new file mode 100644 index 00000000..456f54e2 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_in_kcrs.tpl.c @@ -0,0 +1,64 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) +******************************************************************************/ + +/* @TODO: use for-loops to potentially leverage NUMA in the future */ +int i1, i2, i3, i4, i5, i6, i7; +int lpb = 0; +int bofm = 0; +int bifm = 0; +int S = 0; +int R = 0; +int ifmb = 0; +int ofmb = 0; +/* low precision formatting */ +if ( tensor->layout->num_dims == 7 ) { + lpb = tensor->layout->dim_size[0]; + bofm = tensor->layout->dim_size[1]; + bifm = tensor->layout->dim_size[2]; + S = tensor->layout->dim_size[3]; + R = tensor->layout->dim_size[4]; + ifmb = tensor->layout->dim_size[5]; + ofmb = tensor->layout->dim_size[6]; +} else if ( tensor->layout->num_dims == 6 ) { + lpb = 1; + bofm = tensor->layout->dim_size[0]; + bifm = tensor->layout->dim_size[1]; + S = tensor->layout->dim_size[2]; + R = tensor->layout->dim_size[3]; + ifmb = tensor->layout->dim_size[4]; + ofmb = tensor->layout->dim_size[5]; +} else { + /* should not happen, @TODO throw ERR */ +} + +/*printf("Layout of filters fil ofmb %i ifmb %i R %i S %i bifm %i bofm %i lpb %i \n", ofmb, ifmb, R, S, bifm, bofm, lpb);*/ +{ + LIBXSMM_VLA_DECL(7, element_type, handle_data_1, (element_type*)tensor->data, ifmb, R, S, bifm, bofm, lpb); + LIBXSMM_VLA_DECL(4, const element_type, user_data, (const element_type*)data, ifmb * bifm * lpb, R, S); + + for (i1 = 0; i1 < ofmb; ++i1) { + for (i2 = 0; i2 < ifmb; ++i2) { + for (i3 = 0; i3 < R; ++i3) { + for (i4 = 0; i4 < S; ++i4) { + for (i5 = 0; i5 < bifm; ++i5) { + for (i6 = 0; i6 < bofm; ++i6) 
{ + for (i7 = 0; i7 < lpb; ++i7) { + LIBXSMM_VLA_ACCESS(7, handle_data_1, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb) = + LIBXSMM_VLA_ACCESS(4, user_data, i1 * bofm + i6, ((size_t)i2*bifm*lpb) + ((size_t)i5*lpb) + i7, i3, i4, ifmb * bifm * lpb, R, S); + } + } + } + } + } + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c new file mode 100644 index 00000000..63175f90 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_tensor_filter_copy_out_kcrs.tpl.c @@ -0,0 +1,63 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke, Evangelos Georganas, Hans Pabst (Intel Corp.) 
+******************************************************************************/ + +/* @TODO: use for-loops to potentially leverage NUMA in the future */ +int i1, i2, i3, i4, i5, i6, i7; +int lpb = 0; +int bofm = 0; +int bifm = 0; +int S = 0; +int R = 0; +int ifmb = 0; +int ofmb = 0; +/* low precision formatting */ +if ( tensor->layout->num_dims == 7 ) { + lpb = tensor->layout->dim_size[0]; + bofm = tensor->layout->dim_size[1]; + bifm = tensor->layout->dim_size[2]; + S = tensor->layout->dim_size[3]; + R = tensor->layout->dim_size[4]; + ifmb = tensor->layout->dim_size[5]; + ofmb = tensor->layout->dim_size[6]; +} else if ( tensor->layout->num_dims == 6 ) { + lpb = 1; + bofm = tensor->layout->dim_size[0]; + bifm = tensor->layout->dim_size[1]; + S = tensor->layout->dim_size[2]; + R = tensor->layout->dim_size[3]; + ifmb = tensor->layout->dim_size[4]; + ofmb = tensor->layout->dim_size[5]; +} else { + /* should not happen, @TODO throw ERR */ +} + +{ + LIBXSMM_VLA_DECL(4, element_type, user_data, (element_type*)data, ifmb * bifm * lpb, R, S); + LIBXSMM_VLA_DECL(7, const element_type, handle_data_1, (const element_type*)tensor->data, ifmb, R, S, bifm, bofm, lpb); + + for (i1 = 0; i1 < ofmb; ++i1) { + for (i2 = 0; i2 < ifmb; ++i2) { + for (i3 = 0; i3 < R; ++i3) { + for (i4 = 0; i4 < S; ++i4) { + for (i5 = 0; i5 < bifm; ++i5) { + for (i6 = 0; i6 < bofm; ++i6) { + for (i7 = 0; i7 < lpb; ++i7) { + LIBXSMM_VLA_ACCESS(4, user_data, i1 * bofm + i6, ((size_t)i2*bifm*lpb) + ((size_t)i5*lpb) + i7, i3, i4, ifmb * bifm * lpb, R, S) = + LIBXSMM_VLA_ACCESS(7, handle_data_1, i1, i2, i3, i4, i5, i6, i7, ifmb, R, S, bifm, bofm, lpb); + } + } + } + } + } + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_custom.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_custom.tpl.c new file mode 100644 index 00000000..1cf97883 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_custom.tpl.c @@ -0,0 +1,25 @@ 
+/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +/* this is crappy as it requires a complicated if... */ +if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { + for ( ij = 0; ij < handle->ifhp; ij++ ) { + for ( ii = 0; ii < handle->ifwp; ii++ ) { + if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || + (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1lpblock, ij, ii, ifm2, handle->blocksifm*handle->fm_lp_block, handle->ifhp, handle->ifwp, handle->ifmblock) = (element_input_type)0; + } + } + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c b/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c new file mode 100644 index 00000000..9809dfd3 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_dnn_zero_rim_st_input_nhwc.tpl.c @@ -0,0 +1,25 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +/* this is crappy as it requires a complicated if... */ +if (handle->desc.pad_h_in > 0 || handle->desc.pad_w_in > 0) { + for ( ij = 0; ij < handle->ifhp; ij++ ) { + for ( ii = 0; ii < handle->ifwp; ii++ ) { + if ( (ij < handle->desc.pad_h_in) || (ij >= (handle->desc.H+handle->desc.pad_h_in)) || + (ii < handle->desc.pad_w_in) || (ii >= (handle->desc.W+handle->desc.pad_w_in)) ) { + for (ifm2 = 0; ifm2 < handle->ifmblock; ++ifm2) { + LIBXSMM_VLA_ACCESS(5, del_input, img, ij, ii, ifm1, ifm2, handle->ifhp, handle->ifwp, handle->blocksifm, handle->ifmblock) = (element_input_type)0; + } + } + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c new file mode 100644 index 00000000..1147cde5 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_1.tpl.c @@ -0,0 +1,72 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + __m512 _vdh, _vdout, _vdf, _vdc, _vf, _vc, _vhp, _vt1, _vt2; + element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); + element_input_type* _hp; + element_input_type* _c = &LIBXSMM_VLA_ACCESS(3, c, j, in, ik, N, K); + element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); + element_input_type* _dc = &LIBXSMM_VLA_ACCESS(2, dc, in, ik, K); + element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); + const __m512 _vones = _mm512_set1_ps( (float)1.0 ); + if (0 == j) { + _hp = &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K); + } else { + _hp = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K); + } + if (j == t-1) { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dh[(_j*K)+_k]); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dout[(_j*K)+_k], _vdout); + _vc = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_c[(_j*K)+_k]); + _vt1 = _mm512_sub_ps(_vones, _vc); + _vt1 = _mm512_mul_ps(_vdout, _vt1); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_f[(_j*K)+_k]); + _vt2 = _mm512_fnmsub_ps(_vf, _vf, _vneg_ones); + _vdf = _mm512_mul_ps(_vt1, _vt2); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_df[(_j*K)+_k], _vdf); + _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); + _vt1 = _mm512_mul_ps(_vt1, _vc); + _vt2 = _mm512_sub_ps(_vhp, _vf); + _vdc = _mm512_mul_ps(_vt1, _vt2); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dc[(_j*K)+_k], _vdc); + } + } + } else { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dout[(_j*K)+_k]); + _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_dh[(_j*K)+_k]); + _vdout = _mm512_add_ps(_vdout, _vdh); + 
LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dout[(_j*K)+_k], _vdout); + _vc = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_c[(_j*K)+_k]); + _vt1 = _mm512_sub_ps(_vones, _vc); + _vt1 = _mm512_mul_ps(_vdout, _vt1); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_f[(_j*K)+_k]); + _vt2 = _mm512_fnmsub_ps(_vf, _vf, _vneg_ones); + _vdf = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_df[(_j*K)+_k], _vdf); + _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); + _vt1 = _mm512_mul_ps(_vt1, _vc); + _vt2 = _mm512_sub_ps(_vhp, _vf); + _vdc = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_dc[(_j*K)+_k], _vdc); + } + } + } +} diff --git a/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c new file mode 100644 index 00000000..aa0d3273 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_gru_bwdupd_fused_eltwise_2.tpl.c @@ -0,0 +1,38 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Kunal Banerjee (Intel Corp.) 
+******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + __m512 _vdi, _vdo, _vi, _vhp, _vt1, _vt2; + element_input_type* _hp; + element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + element_input_type* _do = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + const __m512 _vones = _mm512_set1_ps( (float)1.0 ); + if (0 == j) { + _hp = &LIBXSMM_VLA_ACCESS(2, hp, in, ik, K); + } else { + _hp = &LIBXSMM_VLA_ACCESS(3, h, j-1, in, ik, N, K); + } + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_i[(_j*K)+_k]); + _vt1 = _mm512_sub_ps(_vones, _vi); + _vt1 = _mm512_mul_ps(_vi, _vt1); + _vhp = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_hp[(_j*K)+_k]); + _vdo = LIBXSMM_INTRINSICS_MM512_LOAD_PS(&_do[(_j*K)+_k]); + _vt2 = _mm512_mul_ps(_vdo, _vhp); + _vdi = _mm512_mul_ps(_vt1, _vt2); + LIBXSMM_INTRINSICS_MM512_STREAM_PS(&_di[(_j*K)+_k], _vdi); + } + } +} diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c new file mode 100644 index 00000000..e3b4d9df --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise.tpl.c @@ -0,0 +1,113 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vi, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; + element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); + element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); + element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); + element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); + element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); + element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); + element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + element_input_type* _cps = cps_ptr; + element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); + const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); + const __m512 _vones = _mm512_set1_ps( (float)1.0 ); + if (j == t-1) { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcs[(_j*K)+_k] ); + _vdcp = _mm512_add_ps( _vdcs, _vt1 ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vi, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vt2 = 
_mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _vones, _vi ); + _vdi = _mm512_mul_ps( _vi, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); + _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vt2 = _mm512_sub_ps( _vones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _vones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); + } + } + } else { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); + _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); + _vdout = _mm512_add_ps( _vdout, _vdh ); + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcp[(_j*K)+_k] ); + _vdcp = _mm512_add_ps( _vdcp, _vt1 ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vi, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); + _vt1 = 
_mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _vones, _vi ); + _vdi = _mm512_mul_ps( _vi, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); + _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vt2 = _mm512_sub_ps( _vones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _vones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); + } + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_ncnc_reformat_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_ncnc_reformat_bf16.tpl.c new file mode 100644 index 00000000..fc1d8c68 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_ncnc_reformat_bf16.tpl.c @@ -0,0 +1,159 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +#define NATIVE_STORECVT_F32_BF16(A,B) _mm256_storeu_si256((__m256i*)(A), (__m256i)LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(B)) +{ + float* _dout = &LIBXSMM_VLA_ACCESS(4, dout, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dh = &LIBXSMM_VLA_ACCESS(5, dh, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _o = &LIBXSMM_VLA_ACCESS(5, o, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _co = &LIBXSMM_VLA_ACCESS(5, co, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(4, dcs, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _ii = &LIBXSMM_VLA_ACCESS(5, i, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _ci = &LIBXSMM_VLA_ACCESS(5, ci, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _dci = &LIBXSMM_VLA_ACCESS(4, dci, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _di = &LIBXSMM_VLA_ACCESS(4, di, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _cps = cps_ptr; + element_input_type* _f = &LIBXSMM_VLA_ACCESS(5, f, j, inb, ikb, 0, 0, nBlocks, kBlocks, bn, bk); + element_input_type* _df = &LIBXSMM_VLA_ACCESS(4, df, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dp = &LIBXSMM_VLA_ACCESS(4, dp, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(4, dcp, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dciB = &LIBXSMM_VLA_ACCESS(5, dciB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _diB = &LIBXSMM_VLA_ACCESS(5, diB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _dfB = &LIBXSMM_VLA_ACCESS(5, dfB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _dpB = &LIBXSMM_VLA_ACCESS(5, dpB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + + libxsmm_blasint _k, _j; + __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vii, _vci, _vdci, _vdi, _vcps, 
_vf, _vdf, _vdp; + const __m512 _neg_ones = _mm512_set1_ps( (float)-1.0 ); + const __m512 _ones = _mm512_set1_ps( (float)1.0 ); + const int _lpb = 2; + + if (j == t-1) { + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = _mm512_loadcvt_bf16_fp32( &_dh[(_j*bk)+_k] ); + _vo = _mm512_loadcvt_bf16_fp32( &_o[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = _mm512_loadcvt_bf16_fp32( &_co[(_j*bk)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcs = _mm512_loadcvt_bf16_fp32( &_dcs[(_j*bk)+_k] ); + _vdcp = _mm512_add_ps( _vdcs, _vt1 ); + _vii = _mm512_loadcvt_bf16_fp32( &_ii[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vii, _vdcp ); + _vci = _mm512_loadcvt_bf16_fp32( &_ci[(_j*bk)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + NATIVE_STORECVT_F32_BF16( &_dci[(_j*bk)+_k], _vdci ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _ones, _vii ); + _vdi = _mm512_mul_ps( _vii, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + NATIVE_STORECVT_F32_BF16( &_di[(_j*bk)+_k], _vdi ); + _vcps = _mm512_loadcvt_bf16_fp32( &_cps[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = _mm512_loadcvt_bf16_fp32( &_f[(_j*bk)+_k] ); + _vt2 = _mm512_sub_ps( _ones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + NATIVE_STORECVT_F32_BF16( &_df[(_j*bk)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _ones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + NATIVE_STORECVT_F32_BF16( &_dp[(_j*bk)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + NATIVE_STORECVT_F32_BF16( &_dcp[(_j*bk)+_k], _vdcp ); + } + } + } else { + for ( _j = 0; _j < bn; ++_j ) { + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*bk)+_k] ); + _vdh = _mm512_loadcvt_bf16_fp32( &_dh[(_j*bk)+_k] ); + _vdout = _mm512_add_ps( _vdout, 
_vdh ); + _vo = _mm512_loadcvt_bf16_fp32( &_o[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = _mm512_loadcvt_bf16_fp32( &_co[(_j*bk)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcp = _mm512_loadcvt_bf16_fp32( &_dcp[(_j*bk)+_k] ); + _vdcp = _mm512_add_ps( _vdcp, _vt1 ); + _vii = _mm512_loadcvt_bf16_fp32( &_ii[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vii, _vdcp ); + _vci = _mm512_loadcvt_bf16_fp32( &_ci[(_j*bk)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + NATIVE_STORECVT_F32_BF16( &_dci[(_j*bk)+_k], _vdci ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _ones, _vii ); + _vdi = _mm512_mul_ps( _vii, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + NATIVE_STORECVT_F32_BF16( &_di[(_j*bk)+_k], _vdi ); + _vcps = _mm512_loadcvt_bf16_fp32( &_cps[(_j*bk)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = _mm512_loadcvt_bf16_fp32( &_f[(_j*bk)+_k] ); + _vt2 = _mm512_sub_ps( _ones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + NATIVE_STORECVT_F32_BF16( &_df[(_j*bk)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _ones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + NATIVE_STORECVT_F32_BF16( &_dp[(_j*bk)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + NATIVE_STORECVT_F32_BF16( &_dcp[(_j*bk)+_k], _vdcp ); + } + } + } + { + /* Store di/dci/df/dp to diB/dciB/dfB/dpB which is CNNC AND vnni format */ + const __m512i perm_idx = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m256i c0, c1; + __m512i _c01; + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, di_, _di, bk); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, df_, _df, bk); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dp_, _dp, bk); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dci_, _dci, bk); + 
LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, diB_, _diB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dfB_, _dfB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dpB_, _dpB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dciB_, _dciB, bk, _lpb); + for (_j = 0; _j < bn; _j+=2) { + for (_k = 0; _k < bk; _k+=16) { + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j, _k, bk)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j+1, _k, bk)); + _c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + _c01 = _mm512_inserti64x4 (_c01, c1, 1); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(3, diB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, _c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j, _k, bk)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j+1, _k, bk)); + _c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + _c01 = _mm512_inserti64x4 (_c01, c1, 1); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(3, dfB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, _c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j, _k, bk)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j+1, _k, bk)); + _c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + _c01 = _mm512_inserti64x4 (_c01, c1, 1); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(3, dpB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, _c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j, _k, bk)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j+1, _k, bk)); + _c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + _c01 = _mm512_inserti64x4 (_c01, c1, 1); + _mm512_store_epi32(&LIBXSMM_VLA_ACCESS(3, dciB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, _c01)); + } + } + } +} + 
+#undef NATIVE_STORECVT_F32_BF16 + diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c new file mode 100644 index 00000000..623cf71d --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat.tpl.c @@ -0,0 +1,124 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vi, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; + element_input_type* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); + element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); + element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); + element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); + element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); + element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); + element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + element_input_type* _cps = cps_ptr; + element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, 
dp, in, ik, K); + element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); + element_input_type* _dciB = &LIBXSMM_VLA_ACCESS(4, dciB, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _diB = &LIBXSMM_VLA_ACCESS(4, diB, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dfB = &LIBXSMM_VLA_ACCESS(4, dfB, inb, ikb, 0, 0, kBlocks, bn, bk); + element_input_type* _dpB = &LIBXSMM_VLA_ACCESS(4, dpB, inb, ikb, 0, 0, kBlocks, bn, bk); + const __m512 _vneg_ones = _mm512_set1_ps( (float)-1.0 ); + const __m512 _vones = _mm512_set1_ps( (float)1.0 ); + if (j == t-1) { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcs[(_j*K)+_k] ); + _vdcp = _mm512_add_ps( _vdcs, _vt1 ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vi, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dciB[(_j*bk)+_k], _vdci ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _vones, _vi ); + _vdi = _mm512_mul_ps( _vi, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_diB[(_j*bk)+_k], _vdi ); + _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vt2 = _mm512_sub_ps( _vones, _vf ); + _vdf = _mm512_mul_ps( _vf, 
_vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dfB[(_j*bk)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _vones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dpB[(_j*bk)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); + } + } + } else { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); + _vdh = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dh[(_j*K)+_k] ); + _vdout = _mm512_add_ps( _vdout, _vdh ); + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_co[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _vneg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcp = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dcp[(_j*K)+_k] ); + _vdcp = _mm512_add_ps( _vdcp, _vt1 ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vi, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _vneg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dci[(_j*K)+_k], _vdci ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dciB[(_j*bk)+_k], _vdci ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _vones, _vi ); + _vdi = _mm512_mul_ps( _vi, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_di[(_j*K)+_k], _vdi ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_diB[(_j*bk)+_k], _vdi ); + _vcps = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vt2 = _mm512_sub_ps( _vones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_df[(_j*K)+_k], _vdf ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dfB[(_j*bk)+_k], _vdf ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _vones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dp[(_j*K)+_k], _vdp ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dpB[(_j*bk)+_k], _vdp ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_dcp[(_j*K)+_k], _vdcp ); + } + } + } +} diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c new file mode 100644 index 00000000..4ebd4aae --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_bwdupd_fused_eltwise_reformat_bf16.tpl.c @@ -0,0 +1,169 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ +{ + float* _dout = &LIBXSMM_VLA_ACCESS(2, dout, in, ik, K); + element_input_type* _dh = &LIBXSMM_VLA_ACCESS(3, dh, j, in, ik, N, K); + element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); + element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); + element_input_type* _dcs = &LIBXSMM_VLA_ACCESS(2, dcs, in, ik, K); + element_input_type* _ii = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); + element_input_type* _dci = &LIBXSMM_VLA_ACCESS(2, dci, in, ik, K); + element_input_type* _di = &LIBXSMM_VLA_ACCESS(2, di, in, ik, K); + element_input_type* _cps = cps_ptr; + element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + element_input_type* _df = &LIBXSMM_VLA_ACCESS(2, df, in, ik, K); + element_input_type* _dp = &LIBXSMM_VLA_ACCESS(2, dp, in, ik, K); + element_input_type* _dcp = &LIBXSMM_VLA_ACCESS(2, dcp, in, ik, K); + element_input_type* _dciB = &LIBXSMM_VLA_ACCESS(5, dciB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _diB = &LIBXSMM_VLA_ACCESS(5, diB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _dfB = &LIBXSMM_VLA_ACCESS(5, dfB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + element_input_type* _dpB = &LIBXSMM_VLA_ACCESS(5, dpB, ikb, inb, 0, 0, 0, nBlocks, bn_lp, bk, lpb); + + libxsmm_blasint _k, _j; + __m512 _vdout, _vdh, _vo, _vt1, _vt2, _vco, _vdcs, _vdcp, _vii, _vci, _vdci, _vdi, _vcps, _vf, _vdf, _vdp; + const __m512 _neg_ones = _mm512_set1_ps( (float)-1.0 ); + const __m512 _ones = _mm512_set1_ps( (float)1.0 ); + int _lpb = 2; + + if (j == t-1) { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dh[(_j*K)+_k] )); + _vo = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_o[(_j*K)+_k] 
)); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_co[(_j*K)+_k] )); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcs = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dcs[(_j*K)+_k] )); + _vdcp = _mm512_add_ps( _vdcs, _vt1 ); + _vii = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ii[(_j*K)+_k] )); + _vt1 = _mm512_mul_ps( _vii, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ci[(_j*K)+_k] )); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + _mm256_stream_si256((__m256i*)&_dci[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdci) ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _ones, _vii ); + _vdi = _mm512_mul_ps( _vii, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + _mm256_stream_si256((__m256i*)&_di[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdi) ); + _vcps = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_cps[(_j*K)+_k] )); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_f[(_j*K)+_k] )); + _vt2 = _mm512_sub_ps( _ones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + _mm256_stream_si256((__m256i*)&_df[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdf) ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _ones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + _mm256_stream_si256((__m256i*)&_dp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdp) ); + _vdcp = _mm512_mul_ps( _vdcp, _vf); + _mm256_stream_si256((__m256i*)&_dcp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdcp) ); + } + } + } else { + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vdout = 
LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_dout[(_j*K)+_k] ); + _vdh = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dh[(_j*K)+_k] )); + _vdout = _mm512_add_ps( _vdout, _vdh ); + _vo = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_o[(_j*K)+_k] )); + _vt1 = _mm512_mul_ps( _vdout, _vo ); + _vco = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_co[(_j*K)+_k] )); + _vt2 = _mm512_fnmsub_ps ( _vco, _vco, _neg_ones); + _vt1 = _mm512_mul_ps( _vt1, _vt2 ); + _vdcp = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_dcp[(_j*K)+_k] )); + _vdcp = _mm512_add_ps( _vdcp, _vt1 ); + _vii = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ii[(_j*K)+_k] )); + _vt1 = _mm512_mul_ps( _vii, _vdcp ); + _vci = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_ci[(_j*K)+_k] )); + _vt2 = _mm512_fnmsub_ps ( _vci, _vci, _neg_ones); + _vdci = _mm512_mul_ps( _vt1, _vt2 ); + _mm256_stream_si256((__m256i*)&_dci[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdci) ); + _vt1 = _mm512_mul_ps( _vci, _vdcp ); + _vt2 = _mm512_sub_ps( _ones, _vii ); + _vdi = _mm512_mul_ps( _vii, _vt2); + _vdi = _mm512_mul_ps( _vdi, _vt1); + _mm256_stream_si256((__m256i*)&_di[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdi) ); + _vcps = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_cps[(_j*K)+_k] )); + _vt1 = _mm512_mul_ps( _vcps, _vdcp ); + _vf = LIBXSMM_INTRINSICS_MM512_CVTPBH_PS(_mm256_loadu_si256((__m256i*)&_f[(_j*K)+_k] )); + _vt2 = _mm512_sub_ps( _ones, _vf ); + _vdf = _mm512_mul_ps( _vf, _vt2); + _vdf = _mm512_mul_ps( _vdf, _vt1); + _mm256_stream_si256((__m256i*)&_df[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdf) ); + _vt1 = _mm512_mul_ps( _vdout, _vco); + _vt2 = _mm512_sub_ps( _ones, _vo ); + _vt2 = _mm512_mul_ps( _vo, _vt2); + _vdp = _mm512_mul_ps( _vt1, _vt2 ); + _mm256_stream_si256((__m256i*)&_dp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdp) ); + _vdcp = 
_mm512_mul_ps( _vdcp, _vf); + _mm256_stream_si256((__m256i*)&_dcp[(_j*K)+_k], LIBXSMM_INTRINSISCS_MM512_CVTNEPS_PBH(_vdcp) ); + } + } + } + + { /* Store di/dci/df/dp to diB/dciB/dfB/dpB which is CNNC AND vnni format */ + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, di_, _di, K); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, df_, _df, K); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dp_, _dp, K); + LIBXSMM_VLA_DECL(2, libxsmm_bfloat16, dci_, _dci, K); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, diB_, _diB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dfB_, _dfB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dpB_, _dpB, bk, _lpb); + LIBXSMM_VLA_DECL(3, libxsmm_bfloat16, dciB_, _dciB, bk, _lpb); + if ( (bn % 2 == 0) && (bk % 16 == 0) ) { + const __m512i perm_idx = LIBXSMM_INTRINSICS_MM512_SET_EPI16(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8, 23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m256i c0, c1; + __m512i c01; + for (_j = 0; _j < bn; _j+=2) { + for (_k = 0; _k < bk; _k+=16) { + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j, _k, K)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, di_, _j+1, _k, K)); + c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + c01 = _mm512_inserti64x4 (c01, c1, 1); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, diB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j, _k, K)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, df_, _j+1, _k, K)); + c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + c01 = _mm512_inserti64x4 (c01, c1, 1); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dfB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j, _k, K)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dp_, _j+1, _k, 
K)); + c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + c01 = _mm512_inserti64x4 (c01, c1, 1); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dpB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); + c0 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j, _k, K)); + c1 = _mm256_loadu_si256((const __m256i*)&LIBXSMM_VLA_ACCESS(2, dci_, _j+1, _k, K)); + c01 = _mm512_inserti64x4 (LIBXSMM_INTRINSICS_MM512_UNDEFINED_EPI32(), c0, 0); + c01 = _mm512_inserti64x4 (c01, c1, 1); + _mm512_storeu_si512(&LIBXSMM_VLA_ACCESS(3, dciB_, _j/_lpb, _k, 0, bk, _lpb), _mm512_permutexvar_epi16(perm_idx, c01)); + } + } + } else { + for (_j = 0; _j < bn; _j++) { + for (_k = 0; _k < bk; _k++) { + LIBXSMM_VLA_ACCESS(3, diB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, di_, _j, _k, K); + LIBXSMM_VLA_ACCESS(3, dfB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, df_, _j, _k, K); + LIBXSMM_VLA_ACCESS(3, dpB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, dp_, _j, _k, K); + LIBXSMM_VLA_ACCESS(3, dciB_, _j / _lpb, _k, _j%_lpb, bk, _lpb) = LIBXSMM_VLA_ACCESS(2, dci_, _j, _k, K); + } + } + } + } +} + + diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c new file mode 100644 index 00000000..7a50dd1d --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise.tpl.c @@ -0,0 +1,50 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) +******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + element_input_type* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); + element_input_type* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + element_input_type* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + element_input_type* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); + element_input_type* _cps = cps_ptr; + element_input_type* _cs = &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K); + element_input_type* _h = &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K); + element_input_type* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); + __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo, _halves ) ), _halves, _halves); + _vi = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi, _halves ) ), _halves, _halves); + _vci = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vci ); + _vf = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf, _halves ) ), _halves, _halves); + _vcs = _mm512_mul_ps( _vf, _vcs ); + _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); + _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs ); 
+ _vh = _mm512_mul_ps( _vo, _vco ); + _mm512_storeu_ps( &_o[(_j*K)+_k], _vo ); + _mm512_storeu_ps( &_i[(_j*K)+_k], _vi ); + _mm512_storeu_ps( &_ci[(_j*K)+_k], _vci ); + _mm512_storeu_ps( &_f[(_j*K)+_k], _vf ); + _mm512_storeu_ps( &_cs[(_j*K)+_k], _vcs ); + _mm512_storeu_ps( &_co[(_j*K)+_k], _vco ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_h[(_j*K)+_k], _vh ); + } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c b/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c new file mode 100644 index 00000000..4d1c8603 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_internal_lstm_fwd_fused_eltwise_bf16.tpl.c @@ -0,0 +1,50 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Evangelos Georganas (Intel Corp.), Alexander Heinecke (Intel Corp.) 
+******************************************************************************/ + +{ + libxsmm_blasint _k, _j; + float* _o = &LIBXSMM_VLA_ACCESS(3, o, j, in, ik, N, K); + float* _i = &LIBXSMM_VLA_ACCESS(3, i, j, in, ik, N, K); + float* _f = &LIBXSMM_VLA_ACCESS(3, f, j, in, ik, N, K); + float* _ci = &LIBXSMM_VLA_ACCESS(3, ci, j, in, ik, N, K); + float* _cps = cps_ptr; + float* _cs = &LIBXSMM_VLA_ACCESS(3, cs, j, in, ik, N, K); + float* _h = &LIBXSMM_VLA_ACCESS(3, h, j, in, ik, N, K); + float* _co = &LIBXSMM_VLA_ACCESS(3, co, j, in, ik, N, K); + __m512 _vf, _vcs, _vi, _vci, _vco, _vo, _vh; + const __m512 _halves = _mm512_set1_ps( (LIBXSMM_DNN_ELTWISE_FTYPE)0.5 ); + for ( _j = 0; _j < bn; ++_j ) { + LIBXSMM_PRAGMA_UNROLL_N(4) + for ( _k = 0; _k < bk; _k += 16 ) { + _vo = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_o[(_j*K)+_k] ); + _vi = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_i[(_j*K)+_k] ); + _vci = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_ci[(_j*K)+_k] ); + _vf = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_f[(_j*K)+_k] ); + _vcs = LIBXSMM_INTRINSICS_MM512_LOAD_PS( &_cps[(_j*K)+_k] ); + _vo = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vo, _halves ) ), _halves, _halves); + _vi = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vi, _halves ) ), _halves, _halves); + _vci = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vci ); + _vf = _mm512_fmadd_ps( LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _mm512_mul_ps( _vf, _halves ) ), _halves, _halves); + _vcs = _mm512_mul_ps( _vf, _vcs ); + _vcs = _mm512_fmadd_ps( _vi, _vci, _vcs ); + _vco = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX2( _vcs ); + _vh = _mm512_mul_ps( _vo, _vco ); + _mm512_storeu_ps( &_o[(_j*K)+_k], _vo ); + _mm512_storeu_ps( &_i[(_j*K)+_k], _vi ); + _mm512_storeu_ps( &_ci[(_j*K)+_k], _vci ); + _mm512_storeu_ps( &_f[(_j*K)+_k], _vf ); + _mm512_storeu_ps( &_cs[(_j*K)+_k], _vcs ); + _mm512_storeu_ps( &_co[(_j*K)+_k], _vco ); + LIBXSMM_INTRINSICS_MM512_STREAM_PS( &_h[(_j*K)+_k], _vh ); 
+ } + } +} + diff --git a/third_party/libxsmm/src/template/libxsmm_matdiff.tpl.c b/third_party/libxsmm/src/template/libxsmm_matdiff.tpl.c new file mode 100644 index 00000000..c9a83c76 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_matdiff.tpl.c @@ -0,0 +1,174 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Hans Pabst (Intel Corp.) +******************************************************************************/ + +const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE *const real_ref = (const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE*)ref; +const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE *const real_tst = (const LIBXSMM_MATDIFF_TEMPLATE_ELEM_TYPE*)tst; +double compf = 0, compfr = 0, compft = 0, normfr = 0, normft = 0, normr = 0, normt = 0; +double normrc = 0, normtc = 0, compr = 0, compt = 0, compd = 0; +libxsmm_blasint i, j; + +for (i = 0; i < nn; ++i) { + double comprj = 0, comptj = 0, compij = 0; + double normrj = 0, normtj = 0, normij = 0; + double v0, v1; + + for (j = 0; j < mm; ++j) { + const double ti = (0 != real_tst ? real_tst[i*ldt+j] : 0); + const double ri = real_ref[i*ldr+j]; + const double ta = LIBXSMM_ABS(ti); + const double ra = LIBXSMM_ABS(ri); + + /* minimum/maximum of reference set */ + if (ri < info->min_ref) info->min_ref = ri; + if (ri > info->max_ref) info->max_ref = ri; + + if (LIBXSMM_NOTNAN(ti) && inf > ta) { + const double di = (0 != real_tst ? (ri < ti ? 
(ti - ri) : (ri - ti)) : 0); + + /* minimum/maximum of test set */ + if (ti < info->min_tst) info->min_tst = ti; + if (ti > info->max_tst) info->max_tst = ti; + + /* maximum absolute error and location */ + if (info->linf_abs < di) { + info->linf_abs = di; + info->v_ref = ri; + info->v_tst = ti; + info->m = j; + info->n = i; + } + + /* maximum error relative to current value */ + if (0 < ra) { + const double dri = di / ra; + if (info->linf_rel < dri) info->linf_rel = dri; + /* sum of relative differences */ + v0 = dri * dri; + if (inf > v0) { + v0 -= compd; + v1 = info->l2_rel + v0; + compd = (v1 - info->l2_rel) - v0; + info->l2_rel = v1; + } + } + + /* row-wise sum of reference values with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ra, &normrj, &comprj); + + /* row-wise sum of test values with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ta, &normtj, &comptj); + + /* row-wise sum of differences with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(di, &normij, &compij); + + /* Froebenius-norm of reference matrix with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ri * ri, &normfr, &compfr); + + /* Froebenius-norm of test matrix with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ti * ti, &normft, &compft); + + /* Froebenius-norm of differences with Kahan compensation */ + v0 = di * di; + if (inf > v0) { + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(v0, &info->l2_abs, &compf); + } + } + else { /* NaN */ + info->m = j; info->n = i; + result_nan = ((LIBXSMM_NOTNAN(ri) && inf > ra) ? 
1 : 2); + break; + } + } + + if (0 == result_nan) { + /* summarize reference values */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(normrj, &info->l1_ref, &compr); + + /* summarize test values */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(normtj, &info->l1_tst, &compt); + + /* calculate Infinity-norm of differences */ + if (info->normi_abs < normij) info->normi_abs = normij; + /* calculate Infinity-norm of reference/test values */ + if (normr < normrj) normr = normrj; + if (normt < normtj) normt = normtj; + } + else { + break; + } +} + +if (0 == result_nan) { + double compr_var = 0, compt_var = 0; + + /* initial variance */ + assert(0 == info->var_ref); /* !LIBXSMM_ASSERT */ + assert(0 == info->var_tst); /* !LIBXSMM_ASSERT */ + + if (0 != ntotal) { /* final average */ + info->avg_ref = info->l1_ref / ntotal; + info->avg_tst = info->l1_tst / ntotal; + } + + /* Infinity-norm relative to reference */ + info->normi_rel = LIBXSMM_MATDIFF_DIV(info->normi_abs, normr, normt); + /* Froebenius-norm relative to reference */ + info->normf_rel = LIBXSMM_MATDIFF_DIV(info->l2_abs, normfr, normft); + + for (j = 0; j < mm; ++j) { + double compri = 0, compti = 0, comp1 = 0; + double normri = 0, normti = 0, norm1 = 0; + + for (i = 0; i < nn; ++i) { + const double ri = real_ref[i*ldr + j], ti = (0 != real_tst ? real_tst[i*ldt + j] : 0); + const double di = (0 != real_tst ? (ri < ti ? 
(ti - ri) : (ri - ti)) : 0); + const double rd = ri - info->avg_ref, td = ti - info->avg_tst; + const double ra = LIBXSMM_ABS(ri), ta = LIBXSMM_ABS(ti); + + /* variance of reference set with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(rd * rd, &info->var_ref, &compr_var); + + /* variance of test set with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(td * td, &info->var_tst, &compt_var); + + /* column-wise sum of reference values with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ra, &normri, &compri); + + /* column-wise sum of test values with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(ta, &normti, &compti); + + /* column-wise sum of differences with Kahan compensation */ + LIBXSMM_PRAGMA_FORCEINLINE + libxsmm_kahan_sum(di, &norm1, &comp1); + } + + /* calculate One-norm of differences */ + if (info->norm1_abs < norm1) info->norm1_abs = norm1; + /* calculate One-norm of reference/test values */ + if (normrc < normri) normrc = normri; + if (normtc < normti) normtc = normti; + } + + /* One-norm relative to reference */ + info->norm1_rel = LIBXSMM_MATDIFF_DIV(info->norm1_abs, normrc, normtc); +} diff --git a/third_party/libxsmm/src/template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c b/third_party/libxsmm/src/template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c new file mode 100644 index 00000000..5f16f1ab --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_spmdm_compute_bfloat16_thread.tpl.c @@ -0,0 +1,564 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. 
* +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish (Intel Corp.) +******************************************************************************/ + +const int m_blocks = handle->mb; +/*const int n_blocks = handle->nb;*/ +const int k_blocks = handle->kb; +const int m_block_size = handle->bm; +const int n_block_size = handle->bn; +const int k_block_size = handle->bk; +int mb = block_id / handle->nb; +int nb = block_id % handle->nb; + + +#define LIBXSMM_SPMDM_COMPUTE_NREGS (6) +int m_overall_start = mb*m_block_size; +int m_overall_end = (mb + 1)*m_block_size; +int num_m; +int num_m_aligned; + +int n_overall_start = nb*n_block_size; +int n_overall_end = (nb + 1)*n_block_size; +int num_n; +int m, n, k, kb; +int last_block_n, num_full_regs, last_n_start; + +int k_overall_start, k_overall_end, num_k; + +float *const scratch_C = (float *)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread); +float *const scratch_B = (float *)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread + (size_t)m_block_size*n_block_size*sizeof(float)); +#if 0 +float *const scratch_C = (float *)(handle->spmdm_scratch_C + tid*m_block_size*n_block_size*sizeof(float)); +float *const scratch_B = (float *)(handle->spmdm_scratch_B + tid*k_block_size*n_block_size*sizeof(float)); +#endif + +SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; +float* LIBXSMM_RESTRICT ptr_result; +#if SIMD_WIDTH_FP32 > 1 +SIMDTYPE_INT32 vzero = _MM_SETZERO_INT32(); +#endif + +LIBXSMM_UNUSED(nthreads); +LIBXSMM_UNUSED(transa); +LIBXSMM_UNUSED(alpha); +LIBXSMM_UNUSED(beta); +LIBXSMM_UNUSED(tid); + +/* really is twice this */ +assert(n_block_size == LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32); + +if (m_overall_end > handle->m) m_overall_end = handle->m; +num_m = (m_overall_end - m_overall_start); 
+num_m_aligned = (num_m / 2) * 2; + +if (n_overall_end > handle->n) n_overall_end = handle->n; +num_n = (n_overall_end - n_overall_start); +last_block_n = (num_n != n_block_size); +num_full_regs = (num_n / SIMD_WIDTH_FP32); +if ((num_full_regs > 0) && (num_full_regs%2)) num_full_regs--; +last_n_start = num_full_regs*SIMD_WIDTH_FP32; + +/* Copy in c matrix to buffer */ +ptr_result = c + (size_t)m_overall_start*handle->n + n_overall_start; +if (LIBXSMM_FEQ(0.f, *beta)) { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + } + } else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + } + for (n = last_n_start; n < num_n; n++) { + scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = 0; + } + } + } +} +else if (LIBXSMM_FEQ(1.f, *beta)) { + if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int m2; + + ptr_result = 
c + (size_t)n_overall_start*handle->m + m_overall_start; + + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle->m + m, handle->m, scratch_C + (size_t)m*n_block_size + n, n_block_size); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { + for (n = num_n_simd; n < num_n; n++) { + scratch_C[m2*n_block_size + n] = ptr_result[n*handle->m + m2]; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ + for (m = num_m_simd; m < num_m; m++) { + for (n = 0; n < num_n; n++) { + scratch_C[m*n_block_size + n] = ptr_result[n*handle->m + m]; + } + } + } + else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32)); + } + } 
+ else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32)); + } + for (n = last_n_start; n < num_n; n++) { + scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32+n] = ptr_result[m*handle->n+n]; + } + } + } + } +} +else { + SIMDTYPE_FP32 beta_v = _MM_SET1_FP32(*beta); + if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int m2; + + ptr_result = c + (size_t)n_overall_start*handle->m + m_overall_start; + + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle->m + m, handle->m, scratch_C + (size_t)m*n_block_size + n, n_block_size); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4, _MM_MUL_FP32(beta_v, 
_MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7))); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { + for (n = num_n_simd; n < num_n; n++) { + scratch_C[m2*n_block_size + n] = (*beta)*ptr_result[n*handle->m + m2]; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ + for (m = num_m_simd; m < num_m; m++) { + for (n = 0; n < num_n; n++) { + scratch_C[m*n_block_size + n] = (*beta)*ptr_result[n*handle->m + m]; + } + } + + } + else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, 
_MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32))); + } + } + else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + for (n = last_n_start; n < num_n; n++) { + scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = (*beta)*ptr_result[m*handle->n + n]; + } + } + } + } +} + +for (kb = 0; kb < k_blocks; kb++) { + const uint16_t* LIBXSMM_RESTRICT ptr_dense; + float * LIBXSMM_RESTRICT scratch_C_base; + const float * LIBXSMM_RESTRICT scratch_B_base; + int block_A = kb * m_blocks + mb; + libxsmm_CSR_sparseslice slice = a_sparse[block_A]; + int m_local = 0; + + k_overall_start = kb*k_block_size; + k_overall_end = (kb+1)*k_block_size; + num_k = (k_overall_end - k_overall_start); + + /* Copy in b matrix */ + if ('T' == transb || 't' == transb) { + int num_k_simd = num_k / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int k2; + + ptr_dense = b + (size_t)n_overall_start*handle->k + k_overall_start; + + for (k = 0; k < num_k_simd; k += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + 
TRANSPOSE_SIMD_WIDTH_KERNEL_BFLOAT16(ptr_dense + (size_t)n*handle->k + k, handle->k, scratch_B + (size_t)k*n_block_size + n, n_block_size); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (k2 = k; k2 < k + SIMD_WIDTH_FP32; k2++) { + for (n = num_n_simd; n < num_n; n++) { + uint16_t restmp = ptr_dense[n*handle->k + k2]; + union { int i; float f; } res; + res.i = restmp; + res.i <<= 16; + scratch_B[k2*n_block_size + n] = res.f; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ + for (k = num_k_simd; k < num_k; k++) { + for (n = 0; n < num_n; n++) { + uint16_t restmp = ptr_dense[n*handle->k + k]; + union { int i; float f; } res; + res.i = restmp; + res.i <<= 16; + scratch_B[k*n_block_size + n] = res.f; + } + } + } else { + ptr_dense = b + (size_t)k_overall_start*handle->n + n_overall_start; + if (!last_block_n) { + for (k = 0; k < num_k; k++) { + SIMDTYPE_INT32 vload_0 = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(ptr_dense + (size_t)k*handle->n + 2*0*SIMD_WIDTH_FP32)); + SIMDTYPE_INT32 vload_1, vload_2; + SIMDTYPE_FP32 v1_0, v2_0; + SIMDTYPE_FP32 v1_1, v2_1; + SIMDTYPE_FP32 v1_2, v2_2; + EXPAND_BFLOAT16(vload_0, v1_0, v2_0); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*0*SIMD_WIDTH_FP32, v1_0); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*0+1)*SIMD_WIDTH_FP32, v2_0); + vload_1 = _MM_LOADU_INT32((const SIMDTYPE_INT32 *)(ptr_dense + (size_t)k*handle->n + 2*1*SIMD_WIDTH_FP32)); + EXPAND_BFLOAT16(vload_1, v1_1, v2_1); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*1*SIMD_WIDTH_FP32, v1_1); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*1+1)*SIMD_WIDTH_FP32, v2_1); + vload_2 = _MM_LOADU_INT32((const SIMDTYPE_INT32 *)(ptr_dense + 
(size_t)k*handle->n + 2*2*SIMD_WIDTH_FP32)); + EXPAND_BFLOAT16(vload_2, v1_2, v2_2); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*2*SIMD_WIDTH_FP32, v1_2); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + (2*2+1)*SIMD_WIDTH_FP32, v2_2); + } + } else { + for (k = 0; k < num_k; k++) { + for (n = 0; n < num_full_regs; n += 2) { + SIMDTYPE_INT32 vload_0 = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(ptr_dense + (size_t)k*handle->n + (size_t)n*SIMD_WIDTH_FP32)); + SIMDTYPE_FP32 v1_0, v2_0; + EXPAND_BFLOAT16(vload_0, v1_0, v2_0); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, v1_0); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, v2_0); + } + for (n = last_n_start; n < num_n; n++) { + uint16_t restmp = ptr_dense[k*handle->n + n]; + union { int i; float f; } res; + res.i = restmp; + res.i <<= 16; + { + scratch_B[k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = res.f; + } + } + } + } + } + + scratch_C_base = scratch_C - (size_t)m_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + scratch_B_base = scratch_B; /* - (size_t)k_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; */ + + for (m = m_overall_start; m < m_overall_start + num_m_aligned; m += 2, m_local += 2) { + int start_j, end_j, end_j_2, num_j, num_j_2; + const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; + const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base_2; + const float *LIBXSMM_RESTRICT sp_v_ptr_base; + const float *LIBXSMM_RESTRICT sp_v_ptr_base_2; + float *const LIBXSMM_RESTRICT result_m_index = scratch_C_base + ((size_t)m) *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + float *const LIBXSMM_RESTRICT result_m_index_2 = scratch_C_base + ((size_t)m+1)*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + + if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } + + 
start_j = slice.rowidx[m_local]; + end_j = slice.rowidx[m_local + 1]; + end_j_2 = slice.rowidx[m_local + 2]; + num_j = (end_j - start_j); + num_j_2 = (end_j_2 - end_j); + sp_c_ptr_base = slice.colidx + start_j; + sp_c_ptr_base_2 = slice.colidx + end_j; + sp_v_ptr_base = (float *)(slice.values) + start_j; + sp_v_ptr_base_2 = (float *)(slice.values) + end_j; + + if (!last_block_n) + { + int64_t j = 0, j2 = 0; + sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); + sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32); + sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); + sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32); + sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); + sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32); + sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); + sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32); + sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); + sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32); + sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); + sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32); + for (; j < num_j && j2 < num_j_2; j++, j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j] *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 
0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); + sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 
4*SIMD_WIDTH_FP32), sum[4]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + } + for (; j2 < num_j_2; j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); + _MM_STORE_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32, sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); + _MM_STORE_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32, sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); + _MM_STORE_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32, sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); + _MM_STORE_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32, sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); + 
_MM_STORE_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32, sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); + _MM_STORE_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32, sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + else { + int64_t j = 0, j2 = 0; + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_SETZERO_FP32(); + sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); + sum[n+1] = _MM_SETZERO_FP32(); + sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); + } + for (; j < num_j && j2 < num_j_2; j++, j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j] *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + (size_t)n*SIMD_WIDTH_FP32), sum[n]); + sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + (size_t)n*SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + { + float v_v_f = sp_v_ptr_base[j]; + float v_v_f_2 = sp_v_ptr_base_2[j2]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; + } + } + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + 
SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); + sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + } + { + float v_v_f = sp_v_ptr_base[j]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + } + } + } + for (; j2 < num_j_2; j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n) *SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + { + float v_v_f_2 = sp_v_ptr_base_2[j2]; + for (n = last_n_start; n < num_n; n++) { + result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; + } + } + } + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + (size_t)n*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index_2 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + (size_t)n*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + } + } + for (m = m_overall_start + 
num_m_aligned; m < m_overall_end; m++, m_local++) { + int start_j, end_j, num_j; + const uint16_t* LIBXSMM_RESTRICT sp_c_ptr_base; + const float* LIBXSMM_RESTRICT sp_v_ptr_base; + float* LIBXSMM_RESTRICT result_m_index; + + if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } + + start_j = slice.rowidx[m_local]; + end_j = slice.rowidx[m_local + 1]; + num_j = (end_j - start_j); + sp_c_ptr_base = slice.colidx + start_j; + sp_v_ptr_base = slice.values + start_j; + result_m_index = scratch_C_base + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + + if (!last_block_n) { + int64_t j = 0; + sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); + sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); + sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); + sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); + sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); + sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + } + _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); + _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); + _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); + 
_MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); + _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); + _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); + } + else { + int64_t j = 0; + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_SETZERO_FP32(); + sum[n+1] = _MM_SETZERO_FP32(); + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); + sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + } + { + float v_v_f = sp_v_ptr_base[j]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + } + } + } + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + + } + } +} /* kb */ + +/* Copy out c matrix */ +if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int n2; + + ptr_result = c + (size_t)n_overall_start*handle->m + m_overall_start; + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(scratch_C + (size_t)m*n_block_size + n, n_block_size, ptr_result + (size_t)n*handle->m + m, handle->m); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_m - num_m_simd) block of output space - input is of size (num_m - num_m_simd) * SIMD_WIDTH_FP32 */ + for 
(n2 = n; n2 < n + SIMD_WIDTH_FP32; n2++) { + for (m = num_m_simd; m < num_m; m++) { + ptr_result[n2*handle->m + m] = scratch_C[m*n_block_size + n2]; + } + } + } + /* Transpose a (num_n - num_n_simd) * num_m block of output space - input is of size num_m * (num_n - num_n_simd) */ + for (n = num_n_simd; n < num_n; n++) { + for (m = 0; m < num_m; m++) { + ptr_result[n*handle->m + m] = scratch_C[m*n_block_size + n]; + } + } +} +else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 0*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 1*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 2*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 3*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 4*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + 5*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32)); + } + } + else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle->n + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 
((size_t)n+1)*SIMD_WIDTH_FP32)); + } + for (n = last_n_start; n < num_n; n++) { + ptr_result[m*handle->n + n] = scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n]; + } + } + } +} + +#undef LIBXSMM_SPMDM_COMPUTE_NREGS diff --git a/third_party/libxsmm/src/template/libxsmm_spmdm_compute_fp32_thread.tpl.c b/third_party/libxsmm/src/template/libxsmm_spmdm_compute_fp32_thread.tpl.c new file mode 100644 index 00000000..18601cc8 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_spmdm_compute_fp32_thread.tpl.c @@ -0,0 +1,542 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish (Intel Corp.) 
+******************************************************************************/ + +const int m_blocks = handle->mb; +/* const int n_blocks = handle->nb; */ +const int k_blocks = handle->kb; +const int m_block_size = handle->bm; +const int n_block_size = handle->bn; +const int k_block_size = handle->bk; +const int handle_m = handle->m; +const int handle_n = handle->n; +int mb = block_id / handle->nb; +int nb = block_id % handle->nb; + +#define LIBXSMM_SPMDM_COMPUTE_NREGS (6) +int m_overall_start = mb*m_block_size; +int m_overall_end = (mb + 1)*m_block_size; +int num_m; +int num_m_aligned; + +int n_overall_start = nb*n_block_size; +int n_overall_end = (nb + 1)*n_block_size; +int num_n; +int m, n, k, kb; +int last_block_n, num_full_regs, last_n_start; + +int k_overall_start, k_overall_end, num_k; + +float *const scratch_C = (float*)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread); +float *const scratch_B = (float*)(handle->base_ptr_scratch_B_scratch_C + (size_t)tid*handle->memory_for_scratch_per_thread + (size_t)m_block_size*n_block_size*sizeof(float)); +float* LIBXSMM_RESTRICT ptr_result; + +LIBXSMM_UNUSED(nthreads); +LIBXSMM_UNUSED(transa); +LIBXSMM_UNUSED(alpha); +LIBXSMM_UNUSED(beta); +LIBXSMM_UNUSED(tid); + +/* really is twice this */ +assert(n_block_size == LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32); + +if (m_overall_end > handle_m) m_overall_end = handle_m; +num_m = (m_overall_end - m_overall_start); +num_m_aligned = (num_m / 2) * 2; + +if (n_overall_end > handle_n) n_overall_end = handle_n; +num_n = (n_overall_end - n_overall_start); +last_block_n = (num_n != n_block_size); +num_full_regs = (num_n / SIMD_WIDTH_FP32); +if ((num_full_regs > 0) && (num_full_regs%2)) num_full_regs--; +last_n_start = num_full_regs*SIMD_WIDTH_FP32; + +/* Copy in c matrix to buffer*/ +ptr_result = c + (size_t)m_overall_start*handle_n + n_overall_start; +if (LIBXSMM_FEQ(0.f, *beta)) { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + 
_MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + } + } else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_SETZERO_FP32()); + } + for (n = last_n_start; n < num_n; n++) { + scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = 0; + } + } + } +} +else if (LIBXSMM_FEQ(1.f, *beta)) { + if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int m2; + + ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; + + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle_m + m, handle_m, scratch_C + (size_t)m*n_block_size + n, n_block_size); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { + for (n = num_n_simd; n < num_n; n++) { 
+ scratch_C[m2*n_block_size + n] = ptr_result[n*handle_m + m2]; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ + for (m = num_m_simd; m < num_m; m++) { + for (n = 0; n < num_n; n++) { + scratch_C[m*n_block_size + n] = ptr_result[n*handle_m + m]; + } + } + } + else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32)); + } + } + else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n) *SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32)); + } + for (n = last_n_start; n < num_n; n++) { + 
scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = ptr_result[m*handle_n + n]; + } + } + } + } +} +else { + SIMDTYPE_FP32 beta_v = _MM_SET1_FP32(*beta); + if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int m2; + + ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; + + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_result + (size_t)n*handle_m + m, handle_m, scratch_C + (size_t)m*n_block_size + n, n_block_size); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*1))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*2))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*3))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*4))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*5))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*6))); + _MM_STORE_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7, 
_MM_MUL_FP32(beta_v, _MM_LOADU_FP32(scratch_C + (size_t)m*n_block_size + n + (size_t)n_block_size*7))); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (m2 = m; m2 < m + SIMD_WIDTH_FP32; m2++) { + for (n = num_n_simd; n < num_n; n++) { + scratch_C[m2*n_block_size + n] = (*beta)*ptr_result[n*handle_m + m2]; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - input is of size num_n * (num_m - num_m_simd) */ + for (m = num_m_simd; m < num_m; m++) { + for (n = 0; n < num_n; n++) { + scratch_C[m*n_block_size + n] = (*beta)*ptr_result[n*handle_m + m]; + } + } + + } + else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32))); + } + } + else { + for (m = 0; m < num_m; m++) { + for (n = 0; 
n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n) *SIMD_WIDTH_FP32))); + _MM_STORE_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_MUL_FP32(beta_v, _MM_LOADU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + for (n = last_n_start; n < num_n; n++) { + scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = (*beta)*ptr_result[m*handle_n + n]; + } + } + } + } +} + +for (kb = 0; kb < k_blocks; kb++) { + const float * LIBXSMM_RESTRICT ptr_dense; + float * LIBXSMM_RESTRICT scratch_C_base; + const float * LIBXSMM_RESTRICT scratch_B_base; + int block_A = kb * m_blocks + mb; + libxsmm_CSR_sparseslice slice = a_sparse[block_A]; + int m_local = 0; + + k_overall_start = kb*k_block_size; + k_overall_end = (kb+1)*k_block_size; + if (k_overall_end > handle->k) k_overall_end = handle->k; + num_k = (k_overall_end - k_overall_start); + + /* Copy in b matrix*/ + if ('T' == transb || 't' == transb) { + int num_k_simd = num_k / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int k2; + + ptr_dense = b + (size_t)n_overall_start*handle->k + k_overall_start; + + for (k = 0; k < num_k_simd; k += SIMD_WIDTH_FP32) { + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(ptr_dense + (size_t)n*handle->k + k, handle->k, scratch_B + (size_t)k*n_block_size + n, n_block_size); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_n - num_n_simd) block of output space - input is of size (num_n - num_n_simd) * SIMD_WIDTH_FP32 */ + for (k2 = k; k2 < k + SIMD_WIDTH_FP32; k2++) { + for (n = num_n_simd; n < num_n; n++) { + scratch_B[k2*n_block_size + n] = ptr_dense[n*handle->k + k2]; + } + } + } + /* Transpose a (num_m - num_m_simd) * num_n block of output space - 
input is of size num_n * (num_m - num_m_simd) */ + for (k = num_k_simd; k < num_k; k++) { + for (n = 0; n < num_n; n++) { + scratch_B[k*n_block_size + n] = ptr_dense[n*handle->k + k]; + } + } + } + else { + ptr_dense = b + (size_t)k_overall_start*handle_n + n_overall_start; + if (!last_block_n) { + for (k = 0; k < num_k; k++) { + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 0*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 1*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 2*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 3*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 4*SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + 5*SIMD_WIDTH_FP32)); + } + } else { + for (k = 0; k < num_k; k++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + ((size_t)n) *SIMD_WIDTH_FP32)); + _MM_STORE_FP32(scratch_B + (size_t)k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_LOADU_FP32(ptr_dense + (size_t)k*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32)); + } + for (n = last_n_start; n < num_n; n++) { + scratch_B[k*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n] = ptr_dense[k*handle_n + n]; + } + } + 
} + } + + scratch_C_base = scratch_C - (size_t)m_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + scratch_B_base = scratch_B; /* - (size_t)k_overall_start*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32;*/ + + for (m = m_overall_start; m < m_overall_start + num_m_aligned; m += 2, m_local += 2) { + int start_j, end_j, end_j_2, num_j, num_j_2; + const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; + const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base_2; + const float *LIBXSMM_RESTRICT sp_v_ptr_base; + const float *LIBXSMM_RESTRICT sp_v_ptr_base_2; + float *LIBXSMM_RESTRICT result_m_index; + float *LIBXSMM_RESTRICT result_m_index_2; + const uint16_t* rowidx; + + if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } + + rowidx = slice.rowidx; + start_j = rowidx[m_local]; + end_j = rowidx[m_local+1]; + end_j_2 = rowidx[m_local+2]; + num_j = (end_j - start_j); + num_j_2 = (end_j_2 - end_j); + sp_c_ptr_base = slice.colidx + start_j; + sp_c_ptr_base_2 = slice.colidx + end_j; + sp_v_ptr_base = (float *)(slice.values) + start_j; + sp_v_ptr_base_2 = (float *)(slice.values) + end_j; + result_m_index = scratch_C_base + ((size_t)m) *LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + result_m_index_2 = scratch_C_base + ((size_t)m+1)*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + + if (!last_block_n) + { + int64_t j = 0, j2 = 0; + SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; + sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); + sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32); + sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); + sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32); + sum[2] = _MM_LOAD_FP32(result_m_index + 2*SIMD_WIDTH_FP32); + sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32); + sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); + sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS] = 
_MM_LOAD_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32); + sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); + sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32); + sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); + sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_LOAD_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32); + for (; j < num_j && j2 < num_j_2; j++, j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); + sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, 
_MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + } + for (; j2 < num_j_2; j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + sum[0 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 0*SIMD_WIDTH_FP32), sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 1*SIMD_WIDTH_FP32), sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[2 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 2*SIMD_WIDTH_FP32), sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[3 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 3*SIMD_WIDTH_FP32), 
sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[4 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 4*SIMD_WIDTH_FP32), sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[5 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + 5*SIMD_WIDTH_FP32), sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); + _MM_STORE_FP32(result_m_index_2 + 0*SIMD_WIDTH_FP32, sum[0+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); + _MM_STORE_FP32(result_m_index_2 + 1*SIMD_WIDTH_FP32, sum[1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); + _MM_STORE_FP32(result_m_index_2 + 2*SIMD_WIDTH_FP32, sum[2+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); + _MM_STORE_FP32(result_m_index_2 + 3*SIMD_WIDTH_FP32, sum[3+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); + _MM_STORE_FP32(result_m_index_2 + 4*SIMD_WIDTH_FP32, sum[4+LIBXSMM_SPMDM_COMPUTE_NREGS]); + _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); + _MM_STORE_FP32(result_m_index_2 + 5*SIMD_WIDTH_FP32, sum[5+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + else { + int64_t j = 0, j2 = 0; + SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_SETZERO_FP32(); + sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); + sum[n+1] = _MM_SETZERO_FP32(); + sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_SETZERO_FP32(); + } + for (; j < num_j && j2 < num_j_2; j++, j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = 
_MM_SET1_FP32(sp_v_ptr_base[j]); + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + (size_t)n*SIMD_WIDTH_FP32), sum[n]); + sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + (size_t)n*SIMD_WIDTH_FP32), sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + { + float v_v_f = sp_v_ptr_base[j]; + float v_v_f_2 = sp_v_ptr_base_2[j2]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; + } + } + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); + sum[n+1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + } + { + float v_v_f = sp_v_ptr_base[j]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + } + } + } + for (; j2 < num_j_2; j2++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index_2 = scratch_B_base + (size_t)sp_c_ptr_base_2[j2]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v_2 = _MM_SET1_FP32(sp_v_ptr_base_2[j2]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n) *SIMD_WIDTH_FP32), 
sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS]); + sum[n+1 + LIBXSMM_SPMDM_COMPUTE_NREGS] = _MM_FMADD_FP32(v_v_2, _MM_LOAD_FP32(sp_col_dense_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS]); + } + { + float v_v_f_2 = sp_v_ptr_base_2[j2]; + for (n = last_n_start; n < num_n; n++) { + result_m_index_2[n] += sp_col_dense_index_2[n]*v_v_f_2; + } + } + } + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + (size_t)n*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index_2 + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + (size_t)n*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1+LIBXSMM_SPMDM_COMPUTE_NREGS], _MM_LOAD_FP32(result_m_index_2 + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + } + } + for (m = m_overall_start + num_m_aligned; m < m_overall_end; m++, m_local++) { + int start_j, end_j, num_j; + const uint16_t *LIBXSMM_RESTRICT sp_c_ptr_base; + const float *LIBXSMM_RESTRICT sp_v_ptr_base; + float *LIBXSMM_RESTRICT result_m_index; + const uint16_t* rowidx; + + if (m_local >= m_block_size) { block_A++; slice = a_sparse[block_A]; m_local = 0; } + + rowidx = slice.rowidx; + start_j = rowidx[m_local]; + end_j = rowidx[m_local+1]; + num_j = (end_j - start_j); + sp_c_ptr_base = slice.colidx + start_j; + sp_v_ptr_base = slice.values + start_j; + result_m_index = scratch_C_base + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + + if (!last_block_n) { + int64_t j = 0; + SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; + sum[0] = _MM_LOAD_FP32(result_m_index + 0*SIMD_WIDTH_FP32); + sum[1] = _MM_LOAD_FP32(result_m_index + 1*SIMD_WIDTH_FP32); + sum[2] = _MM_LOAD_FP32(result_m_index + 
2*SIMD_WIDTH_FP32); + sum[3] = _MM_LOAD_FP32(result_m_index + 3*SIMD_WIDTH_FP32); + sum[4] = _MM_LOAD_FP32(result_m_index + 4*SIMD_WIDTH_FP32); + sum[5] = _MM_LOAD_FP32(result_m_index + 5*SIMD_WIDTH_FP32); + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + sum[0] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 0*SIMD_WIDTH_FP32), sum[0]); + sum[1] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 1*SIMD_WIDTH_FP32), sum[1]); + sum[2] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 2*SIMD_WIDTH_FP32), sum[2]); + sum[3] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 3*SIMD_WIDTH_FP32), sum[3]); + sum[4] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 4*SIMD_WIDTH_FP32), sum[4]); + sum[5] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + 5*SIMD_WIDTH_FP32), sum[5]); + } + _MM_STORE_FP32(result_m_index + 0*SIMD_WIDTH_FP32, sum[0]); + _MM_STORE_FP32(result_m_index + 1*SIMD_WIDTH_FP32, sum[1]); + _MM_STORE_FP32(result_m_index + 2*SIMD_WIDTH_FP32, sum[2]); + _MM_STORE_FP32(result_m_index + 3*SIMD_WIDTH_FP32, sum[3]); + _MM_STORE_FP32(result_m_index + 4*SIMD_WIDTH_FP32, sum[4]); + _MM_STORE_FP32(result_m_index + 5*SIMD_WIDTH_FP32, sum[5]); + } + else { + SIMDTYPE_FP32 sum[2*LIBXSMM_SPMDM_COMPUTE_NREGS]; + int64_t j = 0; + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_SETZERO_FP32(); + sum[n+1] = _MM_SETZERO_FP32(); + } + for (; j < num_j; j++) { + const float *const LIBXSMM_RESTRICT sp_col_dense_index = scratch_B_base + (size_t)sp_c_ptr_base[j]*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32; + SIMDTYPE_FP32 v_v = _MM_SET1_FP32(sp_v_ptr_base[j]); + for (n = 0; n < num_full_regs; n += 2) { + sum[n] = _MM_FMADD_FP32(v_v, _MM_LOAD_FP32(sp_col_dense_index + ((size_t)n) *SIMD_WIDTH_FP32), sum[n]); + sum[n+1] = _MM_FMADD_FP32(v_v, 
_MM_LOAD_FP32(sp_col_dense_index + ((size_t)n+1)*SIMD_WIDTH_FP32), sum[n+1]); + } + { + float v_v_f = sp_v_ptr_base[j]; + for (n = last_n_start; n < num_n; n++) { + result_m_index[n] += sp_col_dense_index[n]*v_v_f; + } + } + } + for (n = 0; n < num_full_regs; n += 2) { + _MM_STORE_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n], _MM_LOAD_FP32(result_m_index + ((size_t)n) *SIMD_WIDTH_FP32))); + _MM_STORE_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32, _MM_ADD_FP32(sum[n+1], _MM_LOAD_FP32(result_m_index + ((size_t)n+1)*SIMD_WIDTH_FP32))); + } + } + } +} /* kb */ + +/* Copy out c matrix */ +if ('T' == transc || 't' == transc) { + int num_m_simd = num_m / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int num_n_simd = num_n / SIMD_WIDTH_FP32 * SIMD_WIDTH_FP32; + int n2; + + ptr_result = c + (size_t)n_overall_start*handle_m + m_overall_start; + for (n = 0; n < num_n_simd; n += SIMD_WIDTH_FP32) { + for (m = 0; m < num_m_simd; m += SIMD_WIDTH_FP32) { + TRANSPOSE_SIMD_WIDTH_KERNEL(scratch_C + (size_t)m*n_block_size + n, n_block_size, ptr_result + (size_t)n*handle_m + m, handle_m); + } + /* Transpose a SIMD_WIDTH_FP32 * (num_m - num_m_simd) block of output space - input is of size (num_m - num_m_simd) * SIMD_WIDTH_FP32 */ + for (n2 = n; n2 < n + SIMD_WIDTH_FP32; n2++) { + for (m = num_m_simd; m < num_m; m++) { + ptr_result[n2*handle_m + m] = scratch_C[m*n_block_size + n2]; + } + } + } + /* Transpose a (num_n - num_n_simd) * num_m block of output space - input is of size num_m * (num_n - num_n_simd) */ + for (n = num_n_simd; n < num_n; n++) { + for (m = 0; m < num_m; m++) { + ptr_result[n*handle_m + m] = scratch_C[m*n_block_size + n]; + } + } +} +else { + if (!last_block_n) { + for (m = 0; m < num_m; m++) { + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 0*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 0*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 1*SIMD_WIDTH_FP32, 
_MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 1*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 2*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 2*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 3*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 3*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 4*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 4*SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + 5*SIMD_WIDTH_FP32, _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + 5*SIMD_WIDTH_FP32)); + } + } + else { + for (m = 0; m < num_m; m++) { + for (n = 0; n < num_full_regs; n += 2) { + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n)*SIMD_WIDTH_FP32, + _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n) *SIMD_WIDTH_FP32)); + _MM_STOREU_FP32(ptr_result + (size_t)m*handle_n + ((size_t)n+1)*SIMD_WIDTH_FP32, + _MM_LOAD_FP32(scratch_C + (size_t)m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + ((size_t)n+1)*SIMD_WIDTH_FP32)); + } + for (n = last_n_start; n < num_n; n++) { + ptr_result[m*handle_n + n] = scratch_C[m*LIBXSMM_SPMDM_COMPUTE_NREGS*SIMD_WIDTH_FP32 + n]; + } + } + } +} + +#undef LIBXSMM_SPMDM_COMPUTE_NREGS diff --git a/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c b/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c new file mode 100644 index 00000000..14b5720f --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_bfloat16_thread.tpl.c @@ -0,0 +1,126 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. 
* +* This file is part of the LIBXSMM library. * +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish (Intel Corp.) +******************************************************************************/ + +int i, k; +int mb, kb; +#if SIMD_WIDTH_FP32 == 8 +const __m256i *const shufmasks = internal_spmdm_shufmasks_32; +#endif +#if SIMD_WIDTH_FP32 > 1 +const __m256i *const shufmasks2 = internal_spmdm_shufmasks_16; +#endif +int block_offset_base, block_offset; + +LIBXSMM_UNUSED(nthreads); +LIBXSMM_UNUSED(tid); + +kb = block_id / handle->mb; +mb = block_id % handle->mb; + +if ('T' == transa || 't' == transa) { + block_offset_base = mb * handle->bm; + block_offset = block_offset_base + kb * handle->m * handle->bk; +} +else { + block_offset_base = kb * handle->bk; + block_offset = block_offset_base + mb * handle->k * handle->bm; +} +{ + libxsmm_CSR_sparseslice slice = libxsmm_output_csr_a[kb*handle->mb + mb]; + int nrows = ((mb + 1)*handle->bm > handle->m)?(handle->m - (mb)*handle->bm):handle->bm; + int ncols = ((kb + 1)*handle->bk > handle->k)?(handle->k - (kb)*handle->bk):handle->bk; + /*printf("nrows: %d, ncols: %d\n", nrows, ncols);*/ + const uint16_t * input_ptr = a + block_offset; + uint16_t * rowidx_ptr = slice.rowidx; + uint16_t * colidx_ptr = slice.colidx; + float * values_ptr = (float *)(slice.values); + uint16_t cnt = 0; +#if SIMD_WIDTH_FP32 > 1 + const SIMDTYPE_INT32 vzero = _MM_SETZERO_INT32(); + const SIMDTYPE_FP32 vzerof = _MM_SETZERO_FP32(); + const int ncols_aligned = ncols / (4*SIMD_WIDTH_FP32)*(4*SIMD_WIDTH_FP32); +#else + const int ncols_aligned = 0; +#endif + for (i = 0; i < nrows; i++) { + rowidx_ptr[i] = cnt; + if ('T' == transa || 't' == transa) { +#if SIMD_WIDTH_FP32 > 1 + for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { + int vals[32]; 
+ int kk; + for (kk = 0; kk < 4*SIMD_WIDTH_FP32; kk += 2) { vals[kk/2] = (int)input_ptr[(k+kk)*handle->m + i]; vals[kk/2] |= ((int)(input_ptr[(k+kk+1)*handle->m + i]) << 16); } + { + SIMDTYPE_INT32 v1tmp = _MM_LOADU_INT32(vals); + SIMDTYPE_INT32 v2tmp = _MM_LOADU_INT32(vals + SIMD_WIDTH_FP32); + SIMDTYPE_FP32 v1, v2, v3, v4; + SIMDMASKTYPE_FP32 m1, m2, m3, m4; + EXPAND_BFLOAT16(v1tmp, v1, v2); + EXPAND_BFLOAT16(v2tmp, v3, v4); + m1 = _MM_CMPNEQ_FP32(v1, vzerof); + m2 = _MM_CMPNEQ_FP32(v2, vzerof); + m3 = _MM_CMPNEQ_FP32(v3, vzerof); + m4 = _MM_CMPNEQ_FP32(v4, vzerof); + COMPRESS_FP32(v1, k, m1, cnt); + COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); + COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); + COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); + } + } +#endif + for (k = ncols_aligned; k < ncols; k++) { + uint16_t v1tmp = input_ptr[k*handle->m + i]; + union {int i; float f; } v1tmp_int; + v1tmp_int.i = v1tmp; + v1tmp_int.i <<= 16; + { + const int m1 = LIBXSMM_FEQ(0, v1tmp_int.f) ? 
0 : 1; + if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1tmp_int.f; cnt++; } + } + } + } + else { +#if SIMD_WIDTH_FP32 > 1 + for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { + SIMDTYPE_INT32 v1tmp, v2tmp; + SIMDTYPE_FP32 v1, v2, v3, v4; + SIMDMASKTYPE_FP32 m1, m2, m3, m4; + v1tmp = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(input_ptr + (size_t)i*handle->k + k)); + _MM_PREFETCH((char *)(input_ptr + ((size_t)i+2)*handle->k + k), _MM_HINT_T0); + v2tmp = _MM_LOADU_INT32((const SIMDTYPE_INT32*)(input_ptr + (size_t)i*handle->k + k + 2*SIMD_WIDTH_FP32)); + _MM_PREFETCH((char *)(input_ptr + ((size_t)i+2)*handle->k + k + SIMD_WIDTH_FP32), _MM_HINT_T0); + EXPAND_BFLOAT16(v1tmp, v1, v2); + EXPAND_BFLOAT16(v2tmp, v3, v4); + m1 = _MM_CMPNEQ_FP32(v1, vzerof); + m2 = _MM_CMPNEQ_FP32(v2, vzerof); + m3 = _MM_CMPNEQ_FP32(v3, vzerof); + m4 = _MM_CMPNEQ_FP32(v4, vzerof); + COMPRESS_FP32(v1, k, m1, cnt); + COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); + COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); + COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); + } +#endif + for (k = ncols_aligned; k < ncols; k++) { + uint16_t v1tmp = input_ptr[i*handle->k + k]; + union {int i; float f; } v1tmp_int; + v1tmp_int.i = v1tmp; + v1tmp_int.i <<= 16; + { + int m1 = LIBXSMM_FEQ(0, v1tmp_int.f) ? 0 : 1; + if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1tmp_int.f; cnt++; } + } + } + } + } + rowidx_ptr[nrows] = cnt; +} + diff --git a/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c b/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c new file mode 100644 index 00000000..7d1bb355 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_spmdm_createSparseSlice_fp32_thread.tpl.c @@ -0,0 +1,129 @@ +/****************************************************************************** +* Copyright (c) Intel Corporation - All rights reserved. * +* This file is part of the LIBXSMM library. 
* +* * +* For information on the license, see the LICENSE file. * +* Further information: https://github.com/hfp/libxsmm/ * +* SPDX-License-Identifier: BSD-3-Clause * +******************************************************************************/ +/* Nadathur Satish (Intel Corp.) +******************************************************************************/ + +int i, k; +int mb, kb; +#if SIMD_WIDTH_FP32 == 8 +const __m256i *const shufmasks = internal_spmdm_shufmasks_32; +#endif +#if SIMD_WIDTH_FP32 > 1 +const __m256i *const shufmasks2 = internal_spmdm_shufmasks_16; +SIMDTYPE_INT32 vindex = _MM_SETZERO_INT32(); +int idx_array[16]; +#endif +int block_offset_base, block_offset; + +LIBXSMM_UNUSED(nthreads); +LIBXSMM_UNUSED(tid); + +kb = block_id / handle->mb; +mb = block_id % handle->mb; +if ('T' == transa || 't' == transa) { +#if SIMD_WIDTH_FP32 > 1 + int kk; + for (kk = 0; kk < SIMD_WIDTH_FP32; kk++) idx_array[kk] = kk * handle->m; + vindex = _MM_LOADU_INT32(idx_array); +#endif + block_offset_base = mb * handle->bm; + block_offset = block_offset_base + kb * handle->m * handle->bk; +} +else { + block_offset_base = kb * handle->bk; + block_offset = block_offset_base + mb * handle->k * handle->bm; +} +{ + libxsmm_CSR_sparseslice slice = libxsmm_output_csr_a[kb*handle->mb + mb]; + int nrows = ((mb + 1)*handle->bm > handle->m)?(handle->m - (mb)*handle->bm):handle->bm; + int ncols = ((kb + 1)*handle->bk > handle->k)?(handle->k - (kb)*handle->bk):handle->bk; + /*printf("nrows: %d, ncols: %d\n", nrows, ncols);*/ + const float * input_ptr = a + block_offset; + uint16_t * rowidx_ptr = slice.rowidx; + uint16_t * colidx_ptr = slice.colidx; + float * values_ptr = (float *)(slice.values); + uint16_t cnt = 0; +#if SIMD_WIDTH_FP32 > 1 + const SIMDTYPE_FP32 vzero = _MM_SETZERO_FP32(); + const int ncols_aligned = ncols / (4*SIMD_WIDTH_FP32)*(4*SIMD_WIDTH_FP32); + const int ncols_aligned_2 = ncols / (SIMD_WIDTH_FP32)*(SIMD_WIDTH_FP32); +#else + const int ncols_aligned_2 = 0; +#endif 
+ for (i = 0; i < nrows; i++) { + rowidx_ptr[i] = cnt; + if ('T' == transa || 't' == transa) { +#if SIMD_WIDTH_FP32 > 1 + for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { + SIMDTYPE_FP32 v1 = _MM_GATHER_FP32(input_ptr + (size_t)k * handle->m + i, vindex, 4); + SIMDTYPE_FP32 v2 = _MM_GATHER_FP32(input_ptr + ((size_t)k+1*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); + SIMDTYPE_FP32 v3 = _MM_GATHER_FP32(input_ptr + ((size_t)k+2*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); + SIMDTYPE_FP32 v4 = _MM_GATHER_FP32(input_ptr + ((size_t)k+3*SIMD_WIDTH_FP32) * handle->m + i, vindex, 4); + SIMDMASKTYPE_FP32 m1 = _MM_CMPNEQ_FP32(v1, vzero); + SIMDMASKTYPE_FP32 m2 = _MM_CMPNEQ_FP32(v2, vzero); + SIMDMASKTYPE_FP32 m3 = _MM_CMPNEQ_FP32(v3, vzero); + SIMDMASKTYPE_FP32 m4 = _MM_CMPNEQ_FP32(v4, vzero); + COMPRESS_FP32(v1, k, m1, cnt); + COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); + COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); + COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); + } + for (k = ncols_aligned; k < ncols_aligned_2; k += SIMD_WIDTH_FP32) { + SIMDTYPE_FP32 v1 = _MM_GATHER_FP32(input_ptr + (size_t)k * handle->m + i, vindex, 4); + SIMDMASKTYPE_FP32 m1 = _MM_CMPNEQ_FP32(v1, vzero); + COMPRESS_FP32(v1, k, m1, cnt); + } +#endif + for (k = ncols_aligned_2; k < ncols; k++) { + const float v1 = input_ptr[i + k*handle->m]; + const int m1 = LIBXSMM_FEQ(0, v1) ? 
0 : 1; + if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1; cnt++; } + } + } + else { +#if SIMD_WIDTH_FP32 > 1 + for (k = 0; k < ncols_aligned; k += 4*SIMD_WIDTH_FP32) { + SIMDTYPE_FP32 v1, v2, v3, v4; + SIMDMASKTYPE_FP32 m1, m2, m3, m4; + v1 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k); + _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k, _MM_HINT_T0); + v2 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)SIMD_WIDTH_FP32); + _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)SIMD_WIDTH_FP32, _MM_HINT_T0); + v3 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)2 * SIMD_WIDTH_FP32); + _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)2 * SIMD_WIDTH_FP32, _MM_HINT_T0); + v4 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k + (size_t)3 * SIMD_WIDTH_FP32); + _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k + (size_t)3 * SIMD_WIDTH_FP32, _MM_HINT_T0); + m1 = _MM_CMPNEQ_FP32(v1, vzero); + m2 = _MM_CMPNEQ_FP32(v2, vzero); + m3 = _MM_CMPNEQ_FP32(v3, vzero); + m4 = _MM_CMPNEQ_FP32(v4, vzero); + COMPRESS_FP32(v1, k, m1, cnt); + COMPRESS_FP32(v2, k + SIMD_WIDTH_FP32, m2, cnt); + COMPRESS_FP32(v3, k + 2*SIMD_WIDTH_FP32, m3, cnt); + COMPRESS_FP32(v4, k + 3*SIMD_WIDTH_FP32, m4, cnt); + } + for (k = ncols_aligned; k < ncols_aligned_2; k += SIMD_WIDTH_FP32) { + SIMDTYPE_FP32 v1; + SIMDMASKTYPE_FP32 m1; + v1 = _MM_LOADU_FP32(input_ptr + ((size_t)i) * handle->k + (size_t)k); + _MM_PREFETCH((char*)input_ptr + ((size_t)i+2) * handle->k + (size_t)k, _MM_HINT_T0); + m1 = _MM_CMPNEQ_FP32(v1, vzero); + COMPRESS_FP32(v1, k, m1, cnt); + } +#endif + for (k = ncols_aligned_2; k < ncols; k++) { + const float v1 = input_ptr[i*handle->k + k]; + const int m1 = LIBXSMM_FEQ(0, v1) ? 
0 : 1; + if (m1) { colidx_ptr[cnt] = (uint16_t)k; values_ptr[cnt] = v1; cnt++; } + } + } + } + rowidx_ptr[nrows] = cnt; +} + diff --git a/third_party/libxsmm/src/template/libxsmm_version.h b/third_party/libxsmm/src/template/libxsmm_version.h new file mode 100644 index 00000000..43bec851 --- /dev/null +++ b/third_party/libxsmm/src/template/libxsmm_version.h @@ -0,0 +1,12 @@ +#ifndef LIBXSMM_VERSION_H +#define LIBXSMM_VERSION_H + +#define LIBXSMM_CONFIG_VERSION "$VERSION" +#define LIBXSMM_CONFIG_BRANCH "$BRANCH" +#define LIBXSMM_CONFIG_VERSION_MAJOR $MAJOR +#define LIBXSMM_CONFIG_VERSION_MINOR $MINOR +#define LIBXSMM_CONFIG_VERSION_UPDATE $UPDATE +#define LIBXSMM_CONFIG_VERSION_PATCH $PATCH +#define LIBXSMM_CONFIG_BUILD_DATE $DATE + +#endif diff --git a/third_party/libxsmm/tests/mhd_image.mhd b/third_party/libxsmm/tests/mhd_image.mhd new file mode 100644 index 00000000..495486bb --- /dev/null +++ b/third_party/libxsmm/tests/mhd_image.mhd @@ -0,0 +1,13 @@ +ObjectType = Image +NDims = 3 +BinaryData = True +BinaryDataByteOrderMSB = False +CompressedData = False +TransformMatrix = 1 0 0 0 1 0 0 0 1 +Offset = 0 0 0 +CenterOfRotation = 0 0 0 +AnatomicalOrientation = RAI +ElementSpacing = 1 1 1 +DimSize = 202 134 1 +ElementType = MET_SHORT +ElementDataFile = mhd_image.raw diff --git a/third_party/nanoflann/.gitignore b/third_party/nanoflann/.gitignore deleted file mode 100644 index 9850badc..00000000 --- a/third_party/nanoflann/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*~ -build* - diff --git a/third_party/pcg/.gitignore b/third_party/pcg/.gitignore deleted file mode 100644 index 9f598fd5..00000000 --- a/third_party/pcg/.gitignore +++ /dev/null @@ -1,33 +0,0 @@ -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Debug Information -*.dSYM - -# Executables -*.exe -*.out -*.app - -# Actual 
Project Executables diff --git a/third_party/phmap/.gitignore b/third_party/phmap/.gitignore deleted file mode 100644 index b208e241..00000000 --- a/third_party/phmap/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -VagrantFile -benchmark/build -benchmark/output -benchmark/charts.html -build* -.vagrant -**/.vscode -TAGS diff --git a/third_party/tensorpipe/.gitignore b/third_party/tensorpipe/.gitignore deleted file mode 100644 index 16e39eac..00000000 --- a/third_party/tensorpipe/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*~ -.DS_Store -/build/ -/cmake-build-debug/ diff --git a/third_party/thrust/.gitignore b/third_party/thrust/.gitignore deleted file mode 100644 index 93835e48..00000000 --- a/third_party/thrust/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -discrete_voronoi.pgm -*build*/ -.idea/ diff --git a/third_party/tvm/.gitignore b/third_party/tvm/.gitignore deleted file mode 100644 index cdcf6780..00000000 --- a/third_party/tvm/.gitignore +++ /dev/null @@ -1,235 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -*.S -# C extensions -*.so -*.ll -.npm -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg -.conda/ -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ -docs/gen_modules - -# PyBuilder -/target/ - -# IPython Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# dotenv -.env - -# virtualenv -venv/ -ENV/ - -# Spyder project settings -.spyderproject - -# Rope project settings -.ropeproject -*~ -*.pyc -*~ -config.mk -config.cmake -Win32 -*.dir -perf -*.wasm -.emscripten - -## IOS -DerivedData/ - -## Java -*.class -jvm/*/target/ -jvm/*/*/target/ -*.worksheet -*.idea -*.iml -*.classpath -*.project -*.settings -*/node_modules/ - -## Various settings -*.pbxuser -!default.pbxuser -*.mode1v3 -!default.mode1v3 -*.mode2v3 -!default.mode2v3 -*.perspectivev3 -!default.perspectivev3 -xcuserdata/ -.pkl_memoize_* - -.emscripten* -.m2 - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app - -## Other -*.moved-aside -*.xccheckout -*.xcscmblueprint -.DS_Store -tags -cscope* -*.lock - -# vim temporary files -*.swp -*.swo - -# TVM generated code -perf -.bash_history -*.json -*.params -*.onnx -*.h5 -synset.txt -cat.jpg -cat.png -docs.tgz -cat.png -*.mlmodel -tvm_u.* -tvm_t.* -# Mac OS X -.DS_Store - -# Jetbrain -.idea -.ipython -.jupyter -.nv -.pylint.d -.python_history -.pytest_cache -.local -cmake-build-debug - -# Visual Studio -.vs - -# Visual Studio Code -.vscode - -# tmp file -.nfs* - -# keys -*.pem -*.p12 -*.pfx -*.cer -*.crt -*.der - -# patch sentinel -patched.txt - -# Python type 
checking -.mypy_cache/ -.pyre/ - -# pipenv files -Pipfile -Pipfile.lock - -# conda package artifacts -conda/Dockerfile.cuda* -conda/pkg -.node_repl_history -# nix files -.envrc -*.nix diff --git a/third_party/tvm/apps/android_camera/app/src/main/jni/make/config.mk b/third_party/tvm/apps/android_camera/app/src/main/jni/make/config.mk new file mode 100644 index 00000000..49e33266 --- /dev/null +++ b/third_party/tvm/apps/android_camera/app/src/main/jni/make/config.mk @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#------------------------------------------------------------------------------- +# Template configuration for compiling +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory. First copy the this +# file so that any local changes will be ignored by git +# +# cp make/config.mk . 
+# +# Next modify the according entries, and then compile by +# +# ./build.sh +# +#------------------------------------------------------------------------------- +APP_ABI = all + +APP_PLATFORM = android-24 + +# whether enable OpenCL during compile +USE_OPENCL = 0 + +# whether to enable Vulkan during compile +USE_VULKAN = 0 + +# whether to enable contrib sort functions during compile +USE_SORT = 1 + +ifeq ($(USE_VULKAN), 1) + # Statically linking vulkan requires API Level 24 or higher + APP_PLATFORM = android-24 +endif + +# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc +ADD_C_INCLUDES = + +# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so +ADD_LDLIBS = diff --git a/third_party/tvm/apps/android_deploy/app/src/main/jni/make/config.mk b/third_party/tvm/apps/android_deploy/app/src/main/jni/make/config.mk new file mode 100644 index 00000000..bcd56e37 --- /dev/null +++ b/third_party/tvm/apps/android_deploy/app/src/main/jni/make/config.mk @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +#------------------------------------------------------------------------------- +# Template configuration for compiling +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory. First copy the this +# file so that any local changes will be ignored by git +# +# cp make/config.mk . +# +# Next modify the according entries, and then compile by +# +# ./build.sh +# +#------------------------------------------------------------------------------- +APP_ABI = all + +APP_PLATFORM = android-17 + +# whether enable OpenCL during compile +USE_OPENCL = 0 + +# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc +ADD_C_INCLUDES = + +# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so +ADD_LDLIBS = diff --git a/third_party/tvm/apps/android_rpc/app/src/main/jni/make/config.mk b/third_party/tvm/apps/android_rpc/app/src/main/jni/make/config.mk new file mode 100644 index 00000000..851430cd --- /dev/null +++ b/third_party/tvm/apps/android_rpc/app/src/main/jni/make/config.mk @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +#------------------------------------------------------------------------------- +# Template configuration for compiling +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory. First copy the this +# file so that any local changes will be ignored by git +# +# cp make/config.mk . +# +# Next modify the according entries, and then compile by +# +# ./build.sh +# +#------------------------------------------------------------------------------- +APP_ABI = all + +APP_PLATFORM = android-24 + +# whether enable OpenCL during compile +USE_OPENCL = 0 + +# whether to enable Vulkan during compile +USE_VULKAN = 0 + +# whether to enable contrib sort functions during compile +USE_SORT = 1 + +# whether to eanble contrib random functions during compile +USE_RANDOM = 1 + +ifeq ($(USE_VULKAN), 1) + # Statically linking vulkan requires API Level 24 or higher + APP_PLATFORM = android-24 +endif + +# the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc +ADD_C_INCLUDES = + +# the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so +ADD_LDLIBS = diff --git a/third_party/tvm/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json b/third_party/tvm/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 00000000..1d060ed2 --- /dev/null +++ b/third_party/tvm/apps/ios_rpc/tvmrpc/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,93 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : 
"2x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "3x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "83.5x83.5", + "scale" : "2x" + } + ], + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/third_party/tvm/apps/sgx/Cargo.lock b/third_party/tvm/apps/sgx/Cargo.lock new file mode 100644 index 00000000..b02ab331 --- /dev/null +++ b/third_party/tvm/apps/sgx/Cargo.lock @@ -0,0 +1,853 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "addr2line" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "gimli 0.22.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "adler" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "aho-corasick" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "arrayvec" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hermit-abi 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "autocfg" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "backtrace" +version = "0.3.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "addr2line 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", + "miniz_oxide 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "object 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = 
"bindgen" +version = "0.51.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "cexpr 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.33.1 (registry+https://github.com/rust-lang/crates.io-index)", + "env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "peeking_take_while 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-hash 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "shlex 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "which 3.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cc" +version = "1.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cexpr" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "clang-sys" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "glob 0.3.0 
(registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", + "libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "clap" +version = "2.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "atty 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-width 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", + "vec_map 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-channel 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-deque 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-queue 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-channel" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.8.2 
(registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-queue" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "either" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "env_logger" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "atty 0.2.14 
(registry+https://github.com/rust-lang/crates.io-index)", + "humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "backtrace 0.3.50 (registry+https://github.com/rust-lang/crates.io-index)", + "failure_derive 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", + "synstructure 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "gimli" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "goblin" +version = "0.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "plain 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "scroll 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "hermit-abi" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "humantime" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "quick-error 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "itertools" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "itertools" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "itoa" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "lexical-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "arrayvec 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", + "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "static_assertions 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "libc" +version = "0.2.72" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libloading" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.58 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = 
"matrixmultiply" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rawpointer 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memoffset" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "miniz_oxide" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "adler 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ndarray" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "itertools 0.7.11 (registry+https://github.com/rust-lang/crates.io-index)", + "matrixmultiply 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)", + "num-complex 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.12 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "nom" +version = "4.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lexical-core 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + 
+[[package]] +name = "num-complex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.2.12 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-traits" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hermit-abi 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "object" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "old-tvm-macros" +version = "0.1.1" +dependencies = [ + "goblin 0.0.24 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "proc-macro2" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + 
+[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "quote" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rawpointer" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "regex" +version = "1.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.7.13 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc-demangle" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "scroll" +version = 
"0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "scroll_derive 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scroll_derive" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.15.44 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "itoa 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "sgx-demo" +version = "0.1.0" +dependencies = [ + "tvm-runtime 0.1.0", +] + +[[package]] +name = "shlex" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "syn" +version = "0.15.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "syn" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "synstructure" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "termcolor" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-util 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-width 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "1.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tvm-common" +version = "0.1.0" +dependencies = [ + "bindgen 0.51.1 (registry+https://github.com/rust-lang/crates.io-index)", + "failure 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", + "ndarray 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "tvm-runtime" +version = "0.1.0" +dependencies = [ + "crossbeam 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)", + "failure 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", + "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "ndarray 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", + "nom 5.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.13.0 (registry+https://github.com/rust-lang/crates.io-index)", + "old-tvm-macros 0.1.1", + "serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.56 (registry+https://github.com/rust-lang/crates.io-index)", + "tvm-common 0.1.0", +] + +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-xid" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "which" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum addr2line 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1b6a2d3371669ab3ca9797670853d61402b03d0b4b9ebf33d677dfa720203072" +"checksum adler 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +"checksum aho-corasick 0.7.13 (registry+https://github.com/rust-lang/crates.io-index)" = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +"checksum arrayvec 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cff77d8686867eceff3105329d4698d96c2391c176d5d03adc90c7389162b5b8" 
+"checksum atty 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)" = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +"checksum autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" +"checksum backtrace 0.3.50 (registry+https://github.com/rust-lang/crates.io-index)" = "46254cf2fdcdf1badb5934448c1bcbe046a56537b3987d96c51a7afc5d03f293" +"checksum bindgen 0.51.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ebd71393f1ec0509b553aa012b9b58e81dadbdff7130bd3b8cba576e69b32f75" +"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +"checksum cc 1.0.58 (registry+https://github.com/rust-lang/crates.io-index)" = "f9a06fb2e53271d7c279ec1efea6ab691c35a2ae67ec0d91d7acec0caf13b518" +"checksum cexpr 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "fce5b5fb86b0c57c20c834c1b412fd09c77c8a59b9473f86272709e78874cd1d" +"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +"checksum clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)" = "81de550971c976f176130da4b2978d3b524eaa0fd9ac31f3ceb5ae1231fb4853" +"checksum clap 2.33.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +"checksum crossbeam 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "69323bff1fb41c635347b8ead484a5ca6c3f11914d784170b158d8449ab07f8e" +"checksum crossbeam-channel 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cced8691919c02aac3cb0a1bc2e9b73d89e832bf9a06fc579d4e71b68a2da061" +"checksum crossbeam-deque 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" +"checksum 
crossbeam-epoch 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +"checksum crossbeam-queue 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" +"checksum crossbeam-utils 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +"checksum either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" +"checksum env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3" +"checksum failure 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +"checksum failure_derive 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +"checksum gimli 0.22.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aaf91faf136cb47367fa430cd46e37a788775e7fa104f8b4bcb3861dc389b724" +"checksum glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +"checksum goblin 0.0.24 (registry+https://github.com/rust-lang/crates.io-index)" = "e3fa261d919c1ae9d1e4533c4a2f99e10938603c4208d56c05bec7a872b661b0" +"checksum hermit-abi 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "3deed196b6e7f9e44a2ae8d94225d80302d81208b1bb673fd21fe634645c85a9" +"checksum humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +"checksum itertools 0.7.11 (registry+https://github.com/rust-lang/crates.io-index)" = "0d47946d458e94a1b7bcabbf6521ea7c037062c81f534615abcad76e84d4970d" 
+"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +"checksum itoa 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" +"checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +"checksum lexical-core 0.7.4 (registry+https://github.com/rust-lang/crates.io-index)" = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616" +"checksum libc 0.2.72 (registry+https://github.com/rust-lang/crates.io-index)" = "a9f8082297d534141b30c8d39e9b1773713ab50fdbe4ff30f750d063b3bfd701" +"checksum libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" +"checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +"checksum matrixmultiply 0.1.15 (registry+https://github.com/rust-lang/crates.io-index)" = "dcad67dcec2d58ff56f6292582377e6921afdf3bfbd533e26fb8900ae575e002" +"checksum maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" +"checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +"checksum memoffset 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "c198b026e1bbf08a937e94c6c60f9ec4a2267f5b0d2eec9c1b21b061ce2be55f" +"checksum miniz_oxide 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "be0f75932c1f6cfae3c04000e40114adf955636e19040f9c0a2c380702aa1c7f" +"checksum ndarray 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7cf380a8af901ad627594013a3bbac903ae0a6f94e176e47e46b5bbc1877b928" +"checksum 
nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2ad2a91a8e869eeb30b9cb3119ae87773a8f4ae617f41b1eb9c154b2905f7bd6" +"checksum nom 5.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +"checksum num-complex 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95" +"checksum num-traits 0.2.12 (registry+https://github.com/rust-lang/crates.io-index)" = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" +"checksum num_cpus 1.13.0 (registry+https://github.com/rust-lang/crates.io-index)" = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +"checksum object 0.20.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1ab52be62400ca80aa00285d25253d7f7c437b7375c4de678f5405d3afe82ca5" +"checksum peeking_take_while 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +"checksum plain 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +"checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" +"checksum proc-macro2 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)" = "beae6331a816b1f65d04c45b078fd8e6c93e8071771f41b8163255bbd8d7c8fa" +"checksum quick-error 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +"checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +"checksum quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +"checksum rawpointer 
0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebac11a9d2e11f2af219b8b8d833b76b1ea0e054aa0e8d8e9e4cbde353bdf019" +"checksum regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" +"checksum regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)" = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" +"checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" +"checksum rustc-hash 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +"checksum scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +"checksum scroll 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "2f84d114ef17fd144153d608fba7c446b0145d038985e7a8cc5d08bb0ce20383" +"checksum scroll_derive 0.9.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8f1aa96c45e7f5a91cb7fabe7b279f02fea7126239fc40b732316e8b6a2d0fcb" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3" +"checksum serde_derive 
1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "2a0be94b04690fbaed37cddffc5c134bf537c8e3329d53e982fe04c374978f8e" +"checksum serde_json 1.0.56 (registry+https://github.com/rust-lang/crates.io-index)" = "3433e879a558dde8b5e8feb2a04899cf34fdde1fafb894687e52105fc1162ac3" +"checksum shlex 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" +"checksum static_assertions 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +"checksum syn 0.15.44 (registry+https://github.com/rust-lang/crates.io-index)" = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" +"checksum syn 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)" = "936cae2873c940d92e697597c5eee105fb570cd5689c695806f672883653349b" +"checksum synstructure 0.12.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" +"checksum termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" +"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +"checksum thread_local 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +"checksum unicode-width 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" +"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum unicode-xid 
0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +"checksum vec_map 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" +"checksum version_check 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +"checksum which 3.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +"checksum winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +"checksum winapi-util 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/third_party/tvm/nnvm/make/config.mk b/third_party/tvm/nnvm/make/config.mk new file mode 100644 index 00000000..4a210ea4 --- /dev/null +++ b/third_party/tvm/nnvm/make/config.mk @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#------------------------------------------------------------------------------- +# Template configuration for compiling nnvm +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory of nnvm. First copy the this +# file so that any local changes will be ignored by git +# +# $ cp make/config.mk . +# +# Next modify the according entries, and then compile by +# +# $ make +# +# or build in parallel with 8 threads +# +# $ make -j8 +#------------------------------------------------------------------------------- + +#--------------------- +# choice of compiler +#-------------------- + +export NVCC = nvcc + +# choice of archiver +export AR = ar + +# the additional link flags you want to add +ADD_LDFLAGS= + +# the additional compile flags you want to add +ADD_CFLAGS= + +# path to dmlc-core module +#DMLC_CORE_PATH= + +#---------------------------- +# plugins +#---------------------------- + +# whether to use fusion integration. This requires installing cuda. 
+# ifndef CUDA_PATH +# CUDA_PATH = /usr/local/cuda +# endif +# NNVM_FUSION_PATH = plugin/nnvm-fusion +# NNVM_PLUGINS += $(NNVM_FUSION_PATH)/nnvm-fusion.mk diff --git a/third_party/tvm/tests/python/contrib/test_arm_compute_lib/test_config.json b/third_party/tvm/tests/python/contrib/test_arm_compute_lib/test_config.json new file mode 100644 index 00000000..c8168ae8 --- /dev/null +++ b/third_party/tvm/tests/python/contrib/test_arm_compute_lib/test_config.json @@ -0,0 +1,8 @@ +{ + "connection_type": "local", + "host": "localhost", + "port": 9090, + "target": "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", + "device_key": "", + "cross_compile": "" +} diff --git a/third_party/tvm/tutorials/auto_scheduler/ci_logs/conv2d.json b/third_party/tvm/tutorials/auto_scheduler/ci_logs/conv2d.json new file mode 100644 index 00000000..c748920d --- /dev/null +++ b/third_party/tvm/tutorials/auto_scheduler/ci_logs/conv2d.json @@ -0,0 +1,2 @@ +# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. 
+{"i": [["[\"conv2d_layer\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32"], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 512, [1, 64, 2, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 7, [1, 1, 7, 1], 1], ["SP", 3, 20, 512, [4, 2], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 48, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 504, [4], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000429498], 0, 1.59126, 1603259147], "v": "v0.2"} diff --git a/third_party/tvm/tutorials/auto_scheduler/ci_logs/matmul.json b/third_party/tvm/tutorials/auto_scheduler/ci_logs/matmul.json new file mode 100644 index 00000000..827cfc9a --- /dev/null +++ b/third_party/tvm/tutorials/auto_scheduler/ci_logs/matmul.json @@ -0,0 +1,2 @@ +# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. 
+{"i": [["[\"matmul_add\", 128, 128, 128, \"float32\"]", "llvm -keys=cpu"], [[], [["SP", 2, 0, 128, [4, 2, 4], 1], ["SP", 2, 4, 128, [1, 32, 2], 1], ["SP", 2, 8, 128, [2], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 1], ["FSP", 4, 2, 1, 1], ["RE", 4, [0, 2, 1, 3]], ["CA", 2, 4, 1], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$0"], ["AN", 2, 9, 2]]]], "r": [[5.80388e-05], 0, 0.299169, 1603402396], "v": "v0.2"} diff --git a/third_party/tvm/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json b/third_party/tvm/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json new file mode 100644 index 00000000..41b6c0e5 --- /dev/null +++ b/third_party/tvm/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1.json @@ -0,0 +1,26 @@ +# Provide valid schedules for resnet-18. +# This is used to run the tutorial on the documentation web server. +{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [50], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$0"], ["PR", 3, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.54041e-06], 0, 1.27943, 1605490839], "v": "v0.3"} +{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 4], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 
3, [0, 1]], ["SP", 3, 0, 4, [4], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 4, [2], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.03431e-05], 0, 2.09134, 1605490924], "v": "v0.3"} +{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [8], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[5.51259e-06], 0, 1.30207, 1605491060], "v": "v0.3"} +{"i": [["[\"944921d3fd999ba7aa9ffe5a592a9241\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [56], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$512"]]]], "r": [[2.24305e-05], 0, 1.60311, 1605493879], "v": "v0.3"} +{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [2, 1, 1, 8], 1], ["SP", 3, 10, 112, [1, 8, 1, 1], 1], ["SP", 3, 15, 64, [2, 16, 2, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], 
["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 294, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 441, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[7.63468e-05], 0, 2.59544, 1605493932], "v": "v0.3"} +{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [7, 4, 2, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [8, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 128, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.26775e-05], 0, 1.94247, 1605494103], "v": "v0.3"} +{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 
1], ["SP", 3, 5, 28, [1, 7, 1, 2], 1], ["SP", 3, 10, 28, [1, 1, 2, 1], 1], ["SP", 3, 15, 128, [1, 16, 1, 8], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 128, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.13004e-05], 0, 1.86312, 1605494224], "v": "v0.3"} +{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 2, 1], 1], ["SP", 3, 10, 14, [1, 14, 1, 1], 1], ["SP", 3, 15, 256, [1, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 
6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 48, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.29425e-05], 0, 1.70493, 1605494303], "v": "v0.3"} +{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 16, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.04683e-05], 0, 1.80217, 1605494406], "v": "v0.3"} +{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 1, 7], 1], ["SP", 3, 10, 28, [1, 4, 1, 1], 1], ["SP", 3, 15, 128, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [3, 
1], 1], ["SP", 3, 26, 64, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 72, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 348, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.93528e-05], 0, 1.74125, 1605498773], "v": "v0.3"} +{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [8], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 8, 1], 1], ["SP", 6, 15, 512, [1, 32, 2, 1], 1], ["SP", 6, 20, 512, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], 
["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [49], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000129562], 0, 3.40317, 1605500470], "v": "v0.3"} +{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 7], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 256, [4, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 288, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 1440, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 
1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[7.57476e-05], 0, 2.59558, 1605501054], "v": "v0.3"} +{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 2, 1], 1], ["SP", 6, 10, 196, [4, 1, 1, 7], 1], ["SP", 6, 15, 128, [2, 32, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [49], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, 
"auto_unroll_max_step$1024"]]]], "r": [[6.77244e-05], 0, 2.67201, 1605501438], "v": "v0.3"} +{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 128, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 7, 1], 1], ["SP", 6, 15, 128, [8, 16, 1, 1], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": 
[[6.23875e-05], 0, 1.93274, 1605501606], "v": "v0.3"} +{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 2, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 4], 1], ["SP", 6, 15, 64, [2, 16, 1, 1], 1], ["SP", 6, 20, 64, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[6.65448e-05], 0, 2.94376, 1605501803], "v": 
"v0.3"} +{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 2], 1], ["SP", 3, 10, 14, [2, 7, 1, 1], 1], ["SP", 3, 15, 256, [1, 32, 2, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 192, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 240, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.31245e-05], 0, 1.9322, 1605501903], "v": "v0.3"} +{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 2, 4, 2], 1], ["SP", 6, 15, 512, [2, 32, 1, 1], 1], ["SP", 6, 20, 512, [16, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 
13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000143154], 0, 2.20107, 1605502293], "v": "v0.3"} +{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 2, 2, 2], 1], ["SP", 6, 20, 256, [2, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 
13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000115017], 0, 3.89122, 1605502608], "v": "v0.3"} +{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [4], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 1, 2, 14], 1], ["SP", 6, 15, 128, [1, 32, 1, 2], 1], ["SP", 6, 20, 128, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], 
["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.20936e-05], 0, 3.36582, 1605502968], "v": "v0.3"} +{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [8, 1, 2, 2], 1], ["SP", 6, 20, 256, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 
12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000122349], 0, 4.2774, 1605503135], "v": "v0.3"} +{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 1, 7], 1], ["SP", 6, 15, 256, [8, 4, 1, 1], 1], ["SP", 6, 20, 256, [1, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], 
["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [64], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 256, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.9277e-05], 0, 3.07064, 1605503350], "v": "v0.3"} +{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 1], 1], ["SP", 6, 5, 6, [1, 2, 1, 1], 1], ["SP", 6, 10, 196, [7, 7, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], 
["SP", 4, 4, 64, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[7.64176e-05], 0, 5.45091, 1605503568], "v": "v0.3"} +{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [1], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [14, 7, 1, 2], 1], ["SP", 6, 15, 64, [1, 16, 1, 2], 1], ["SP", 6, 20, 64, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 
1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [64], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [4], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 4, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[7.60496e-05], 0, 3.00771, 1605503805], "v": "v0.3"} +{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [16], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 1, 4, 4], 1], ["SP", 6, 15, 512, [1, 64, 1, 1], 1], ["SP", 6, 20, 512, [1, 32], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 
4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [16], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [16], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135079], 0, 2.40957, 1605504233], "v": "v0.3"} diff --git a/third_party/tvm/web/.eslintrc.json b/third_party/tvm/web/.eslintrc.json new file mode 100644 index 00000000..0724c440 --- /dev/null +++ b/third_party/tvm/web/.eslintrc.json @@ -0,0 +1,34 @@ +{ + "env": { + "browser": true, + "es6": true + }, + "extends": ["eslint:recommended"], + "root": true, + "parser": "@typescript-eslint/parser", + "parserOptions": { + "ecmaVersion": 2018, + "sourceType": "module" + }, + "overrides": [ + { + "files": ["src/**.ts", "src/**.tsx"], + "plugins": ["@typescript-eslint"], + "extends": [ + "plugin:@typescript-eslint/eslint-recommended", + "plugin:@typescript-eslint/recommended" + ], + "rules": { + "require-jsdoc": 0, + "@typescript-eslint/no-explicit-any": 0, + "@typescript-eslint/no-empty-function": 0 + } + }, + { + "files": ["tests/node/*.js", "apps/node/*.js"], + "env": { + "node": true + } + } + ] +} diff --git a/third_party/tvm/web/package.json b/third_party/tvm/web/package.json new file mode 100644 index 00000000..dafccb0a --- /dev/null +++ b/third_party/tvm/web/package.json @@ -0,0 +1,32 @@ +{ + "name": "tvmjs", + "displayName": 
"TVM Wasm JS runtime", + "license": "Apache-2.0", + "version": "0.8.0-dev0", + "scripts": { + "prepwasm": "make && python3 tests/python/prepare_test_libs.py", + "build": "tsc -b && make rmtypedep", + "lint": "eslint -c .eslintrc.json .", + "typedoc": "typedoc .", + "test": "jest", + "bundle": "npm run build && rollup -c rollup.config.js", + "example": "npm run bundle && node apps/node/example.js", + "example:wasi": "npm run bundle && node --experimental-wasi-unstable-preview1 --experimental-wasm-bigint apps/node/wasi_example.js", + "rpc": "npm run bundle && node --experimental-wasi-unstable-preview1 --experimental-wasm-bigint apps/node/wasi_rpc_server.js" + }, + "devDependencies": { + "@rollup/plugin-commonjs": "^11.1.0", + "@rollup/plugin-node-resolve": "^7.1.3", + "@types/node": "^12.12.37", + "@typescript-eslint/eslint-plugin": "^2.29.0", + "@typescript-eslint/parser": "^2.29.0", + "@webgpu/types": "^0.0.31", + "eslint": "^6.8.0", + "jest": "^26.0.1", + "rollup": "^2.7.6", + "rollup-plugin-typescript2": "^0.27.0", + "typedoc": "^0.17.6", + "typescript": "^3.8.3", + "ws": "^7.2.5" + } +} diff --git a/third_party/tvm/web/tsconfig.json b/third_party/tvm/web/tsconfig.json new file mode 100644 index 00000000..6aec4485 --- /dev/null +++ b/third_party/tvm/web/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "module": "commonjs", + "target": "es6", + "outDir": "dist", + "rootDir": "src", + "declaration": true, + "sourceMap": true, + "strict": true + }, + "include": ["src"], + "exclude": ["node_modules"] +} diff --git a/third_party/tvm/web/typedoc.json b/third_party/tvm/web/typedoc.json new file mode 100644 index 00000000..65631ea5 --- /dev/null +++ b/third_party/tvm/web/typedoc.json @@ -0,0 +1,11 @@ +{ + "out": "dist/docs", + "readme": "none", + "mode": "file", + "excludeNotExported": true, + "excludePrivate": true, + "listInvalidSymbolLinks": true, + "module": "umd", + "includes": ["src"], + "exclude": ["node_modules"] +} diff --git 
a/third_party/xbyak/.gitignore b/third_party/xbyak/.gitignore deleted file mode 100644 index 24b0b1de..00000000 --- a/third_party/xbyak/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/build* # cmake -- GitLab

    8EmexCVo;#+FFn%d<>I%1_kNS~ zbl8r3MPsR`_YS@gn>P_#EO1bWi{eB@HNi&UXZRwh5Az*W${Id`Vu^=`B6NUoOyyU~ zQUsIoPphV02A8?LezCFp;qpy9el=YS9TlrZjeXptkt#gP7ftqhc%v)00Pn#g4(IzY z@zH#lLxnF@pN{QtCJyE^i@3R>QWOI@q-;JCFcTRRx|YWkjczU1b3Y;r7p2-D`Wba9 z(5PlWjeObpV9`N;kQG?df1Hv-xFZ#A-&k9e(Y9F1^E{TKjr4_~lVpwg&~rk%+SPF< zVk@k+xttZ>t(=Y-H%DWSQ78>i6~~ko?;zbTRC29$aOy>?xiY2wDRL+sN59-<#OrDz zoAN>zCVi5->05HQ<+J=9A{~DONuRU5@_K+#TRDx43AE6j#u*!2me%FRueq&jKk6~1 z+%$|U$9G+P?S&wV9=^gp9`$NVKOQEs;aUq0cf9j`$z@cd{pl#A#tk4nMw_3NjwZ*Y zQA*#_GuWTxhlC}@SDopj=MDCqS5uj0pbSjUgVQvzR&qo8{u%-ECIJEA&7ld46zPzN zhTb7asybn@LrI_aSU1**V;c@PRpJ44^8>k$KthZheacle`vC1bcsI?l`G;m{%yw*c z6k(%1UC!yImCtaiO`9Xa4N`oxNAS)%=6EQo_&3U}$2p*VACDfF))3wf0ln+Ot||&)8trT)2HV~ zl9f7r`78c6=(Pl@^47@g35u6pi#!DFutj6&>8DxU*2oYR+PiYMH*5<9L$M{*!t&Q) zeqWMO@>%pZU>6o7SB0NE+LM*AdeJu_1Cl#_kc;hyKc>f*NY2khJKOo7(>U?18uI92 zvwtl}^<{POEI_nv7K!v9TP7fe|1L?zO~gj`p}a`No&sau&a8>RBCbbLL#vU7kL9^n z0-9Wt@&3bDn%ArauBUL3MX(ZX2TcMa$dsweh&O*mT9b<#Re= zgPVhBolOLsuCQBwiT*@xGEVa#dmmgyyD`I%qY1}og#kV6Bnwv6E8Dhh+qP}nwr$(CZQHhOb9xrN z>u<--8hO71L3(0gFj-M$CVyo{#@s>s%BXda2eOYsJ7CqxFEvPK_Vu!lBOdeq{B0}*|1i$ba*$b;bZWV@Gq*?}7m>vCz5F;?(ja@-u zH55BjZ#DP%sHkjs_GdKvMKFd}m?!kTe2*N{)CtImlohuo-XTFGWqj~Kg+G2Xjxar< z>hD)*2ICZME6N6JaGF1#{7ht4lF)>QL(10+ruZw&fgUgif(RDX6qA?#k>|{}^V3b| zY!)?PbKtrrOt>*@B$h+l(YcfYO}+%H10WbME@w*CVUy&*!p5NPNos?}k&UG7QbxpD zxng9ux}2X*OIDf#Re$B%gj}Ezn#G`#waLa$bMgy2?f?y-J0Fm;Rc;j77 zL(|&ysPKzNQU4aAx^~$bGcaKCD>fj+n8x7lFZ7#|!cFM?d0j-MHyD2OV6s#AC0<+e z{}L%|M4mfWB}QiFBhrP=66ra+|1QmA@EJmM=O%gX;=gY-I<1i}+b7dyCv({#a z{$P(#(lQxM=a=tqc-D3tr$0Z6>aH0CWhTXrQ+s`j%CqJx-{`P4H z3p~tmjsUep{rK&NQY&763+YLdM3uxV_%mF$Tw4}Xhl^E+-|N5F-LNt=N)~0!KtLpScx<%t&y2U(3_=So|8t#$ zHf^WDp`Er{rmGzDA=G9q5!P^2!dbgW=27^5U=C8 zp>6eQH^%QI6#|N+mxa9;8>f}&z@>ED_+ZG{wiV`fca_>YA>e;kQfG8D{6l;d(wOt$ z29;_<#9lO$}FR5Byic z_Y;LM+u-S7moaxX$Mpfzg%!E17txkLkc(W1PR!4jRkO-1dS?ELiR)IR%(>!G#|X3( z$Izj=KZGbwhmPu|d!c~P*F%t@=Wf_j8l5c~w~j@yb+E>T4j=}$gGOKB0KhR%6M#iT z|AKS{ICN}kMrdC#Z?^&MJri_x1&ax#qBjH`J>pq8>$mTQ>(J|dScMruAkz+ZXSuA% 
zqNftk2;rQ)u3;pau7ctmF)5fPF^)^lh19@5JsU?RS#few zU6}zpv8Ru%ZlvjKe@WTf>dqHx(AX2<{!sUP@<6bVIz^eC83c*qyQg za!8PdLpS_WYjwqO_HM2%8PnVlZ%2Q9-7SjhE~16W${wTPTvOfLEJA@OCMe>3RnWuA zJ&>SSO+;2<&tw69xtYEtF#{Vc5x$_*Y!ncvx1sR3M1AAP8WYo*)d`M9^_E0bTH1UH zb59HDWKuEy<#EPNoI1V>^bko6<#yEKfVGZ%!2614llRs%u`#{YZ?C0WQuslDtGC0) zcl6g@sRU3jUbElq-Mt^h0wv-vw|#_aS#r>4aD}JXEGcoT;~>qb4%9pEj$Io9-l)+D zTTBZcpA_0)0#Uy(=XXQ_J|a!I3pSPUEWrAI>|}k3ajLFcJL% z7NI5|?i&SxjqN;~H3d}YQr<*Z$N&Annl$#KjUY;u*)I_=n;lUAvf(F~u7TnFrp)tC_r9A?<1qB!;0@ zE}zLK-Sv6PRLvE2(17{_K-)k-o!V`1*?%|`l5AXLJ&n_!>EAD319A_5wiuAyoPJ)- za$5_+DeYht%MfKto?*p{POi%GeBn-=w59IcgISm!ZO5OA_ghTdms+a%y&-wn`UeYR#K1WB}S1KpW%P3Hx3hpfutdt$+FIJ;)C+(_=JG&eH}2v{hxZL}b;+ z?<;>sf-BwH$=ZPVF-!kR|LdW0;WXU!>VD3`3F8eiG0{CjP4K~55AhvpMby{{x-a4u zy(W`%>t=hN^G4?@4VK(Af6Xb9qo8Rd`6h`Q6@{kHc!H!x_km0m3_xWq@>}z6VLp@7 z2>2)fEH=yHG;e2EdI)Hy^#u{wF@_hGVCQ1e6y@{JHK7rxYlMg*g;02q0lW2Gq4K)a zJ$a-f?NzK2?z_x*f(U_+nrl(6*swB3;Aa0EFn04q0CF=;Pxfz0_xUpe_S~!r*&K~v zJF+{mKP!bbc)!f?QfqU0kLZ|9*Y%xO9$_AiH zGmGJ+99Y0`Y}^8Vx(Kkllt&U_0*@yw)$qXLgy;X~1q8&YnI7i-3LhA8@z8RZs}Ag% z1T8$exfYx#LsPl!(wAbC( zhH#YwAUy6Qkp;VVwU8OorCwxW4kgZONNCK}T-Mf6>B^`2_jeH+l?ttuOvo;QS|E8C z4)%#b-qmcN9)DyTccA zhFqQT*+Uv^`_eqW`HylM7=$U^drL7Wl)Tp?%lV|+#|}wnGO&-lw;D~I-8)gI3!6!- z&Q-dB1Lt}EXl;df3^6#8@J^==g=XCTRq4p%4YbDAoGgJ5zW=9cK1Z9EsqXu+!JDv$ z(YoCpSGbi;Nup@~r!M0xQ}Mfsj)B(j$PtQYCf2HuWO z$7BLYVil%VemxKe45(DqbCH(831&&SyvYh!+5f;Y8z$lXuCT@LNry&|v2xOO zej^tUs5WkK7+;tZzb3}`)wf8_)UhSHpBT9DC)_*Vc`}_v3vlluIP{iS`3U`GdMyKb zlHcUCH()L1Zc40dZ~V>`I(>5`qEdj*OPT*dy0wKfQSymqSr(;<;Kc5^0ZZ(5(;NEG zib;F*f;{2?+DUr?p;54dG$%PCX^9Hit3TVfm&XY4_Pu=M`}BUV6mpO(2R(T@8wFnC zu#ZR$YQfXek}+jHm~Q)fhk0={Li6X3Ssw}GTPlAU&aB|_f{3O=D_+w2CtX_~J+ z`GH-4Ax9-xsgv86y1>I7%nBZ*`1~cuz%d01ZwL;@d#D~WoSAB4cF~BU{lcG7w&2_g z!@y-_Fr$K)Lx|2?7=Rn2F+ha(ZC4PJ0cGAsZ^#I2y$I+jd$J}ks=1eV_xEI$G0+cq zb|$SF9=&1%{LcvCGf=Aozy6K0EZVMF+DT9Git3xAk3k$*EW9-dw&f?N7XwuZGhE%z ze3zlvlT>Ul<;@8Hx@ObCi~f9v2a^YYVODR7=-PBu5=S$C#LayIzei3J 
zc=-+*>w?R$TEh6I$T!@X$M*;pWxxa@p|oNK5!Bqh;`3v(Vx8?kg`#hmx3ob8VxptAX_(NbR87qw>O*)RJSXG%nNX>tfA8eq+PdWq_r&V&SJo$Qswng_9y$z_ugCh z_1Sn^fpIg$gYo&6KFHB2Q89jQDF~$yaJRQ^Ck^B)1r%jdXm>aM0|98C&PYug=|Cpb zW_A?BD=Yo<-6c1md_blp%D^Y_2~iL82FhfK7G~JW&GYf#!>Zs*6Nbn{U&v9(>PU4G zJHo0|m}9AQSo6=BCgl&Yn*MKn=<&*dmt+siN+R%26OnF*VVUvwTzq!N!RII}4=D&? z)^MsHovjyAQdIBNjv^fXyc4ZhKpHAZ2(L16v2rDWNtQ{W6AA=#>6YfoFIKboE@){7 z|Bt0=gBGdgU9YT_@<%Mf44!OD1ZsRwA7HJ-A(MbFH|k~bLAz8^ct-QVSJw4o4aIpvVse|u*?TK3Imgl~Y|)@tfrx3!dl954+rlJ~56u+C z&YYYK?cY{mMlRwEOfN@0)4A{(3$q)BY-q(uxwJ=LMc5R&zLH1E&%JH1;bO-D=X_cS zf6Toc@Wg_UQ#({-8;9_fid)_@H3 zH!b+bNC}Ym{8B}|Bd!a@x_t3X#v5}A#!`RT^^*n7wx*vS-nJNLZyd6kMVpH_29sEo z8weKnxe`Bx2+Q&SoS;p$)%9N>kQ^{qBp#vFRa}T6zJKzYD z#U14x27W5~tFc$zIt8)hV>bmjaRW4)N5K*>SV<1OCtYwxhaVAQtB7`sSewKB{n4O( zWzcGdxTf#FGn^8*MhM%J%?YA8v``R226B4NJ`*vWp2s9W6eK@VglZnQI;-d}8F1U6 zblKbtX=L6ywTHKTvXv&t16Pd9*g_zX_4hT8gKkuu(%2 zfqk#D15ZL2KByt`G=bysMwwabU1YVgQD$pz^7`kWXT3}Hhj?JVH$wHIFh@)0ALn-I z%bp7DN{rE`@ z>(PUgYE7@Rau(}Qx8!Z46I_iKhB$tB47m%LYgr21^NWMrrBf4ypxbBHo9b6RxX4F5 zC^Ep%msz|s%W>C^<`l^zQ%>S@q>E*e?0z*SfsI48Fypy%_Yx= zxB2&puT<#~Var9Ea7&tHg_BY$0W%!RrnSa)??ZoS$4LX((W^Rz?t@{5*bR7JXuerV zIoL%E48^)|*_DZ~?4n)9!UX5XJWaZpm-IC07V}MofRA%t1Li0^Rzp^-*))lg;b)9A zX9?UE`bw3$iqkQ32qT6qtf6J<82Uyy8MA>p!jjrn`qE*!W-KIa9EzIq97l!9FFhblflr*{da7S!WT^?1@LOmKq;wlAPgoCO9IMU^ z`I}37k6P2SIeydZxz+`rn0315EB)SnE7^FVL}Td~*Hs^@J&el=e*`vL>L^1Nb@8y? 
zL&=rIf=+dW%oyU9HD&K?5oVIvO8z(&B%*{^gg?`d(4v1)$H{D>;~7bq5zq2p*Ojmn zwj(gR;9O4SF)J-Az+zCx`GULm1`B~6v~)FKkLUafq3qvWp`l21Eg&v_v5|aeLOZ-Y zhFeVCI4?m0CB6w|`g!?-OMLxSkqK{T*Q!dtHvJDMjnbbY2MeGuyP`TIq#I}a+E)nc zr-5#PA^;?i$`)3S9^@dRnlk-~dBWtM_vd<~i=#&m$3#PiKlEB}X}L3Zga#fNfD ztGkd*i6il2%v)Qd#*8U~6*6-ghbUod=lO8h#Uxx^CtiSN;>4FPTFJ3qWgA|mHZanG z&bDflh1-!7`~yVl!*q6AWS$<+aRm9Phe&VXMIWc?DUDqou~sudl=3aj?85JOi!}5C zat#t<|EpRg;=Fzhg86)enN-^r53|e>`K-(Xce;_K3oVbIh(|igp1YR~*nbUA5asWb zB!s+*^V3{KcUY~%->0!o5Yn`oalmUNjQxsATk#&pNn>`~?1N^d%?fYJ^>P^|E&VXo zpQ~pV<0|C~8H9H?F*!!D(UD=Y+ftx`CT1+kVP18M>T;Nrvj!!pkpM!356putmfkEe z8;cyS$YFfu{V5qK!G^DP)h#(uTD7FjLL?2wU7;=jox(D=)#Svf2g++H+T~s751WA9 zNQyHCSvrpY{O$?8uG0jPJB>LzD5Ea;52R31Zw_QdaUPQ)*nBc>)2b53|3Q_iAV;_Y zcp?iQuPUIB+@X8A`t{u%Ek{%3ZPn}x3|laqCAYo%5|nJ?l` zU!bn;Z78;{VM0FrS*Ce#c?w$_v%;UvPP)YPj@9Z`>yMXr+_n4W`Ro zQlsmcfao<3rOYjRRC&#qw3!5TfaeBxBBRgH+ zry+LlHl>*%)R!}%qhy6O+C~%lEX_fcdo? z#A73g@j&ObN;b}G0HY^h;+8SVDkns;1{&vJoM{C_oJqXaQ>aO8%ZkfgAcnbrRCPz+ zaE4SBICTFZ@OdMa{0cwoKR!~bDrF7In(NGby3|BfVZYm8rA>GTAD);3;Nbbdm$e>T zU}<}TV^fu*i2fJv@vXX;S$`cHz%;A6x*_YFKv6<6xqv^Md&?AD!y`yxUq_c0Og>p| zHc?ce$A}qfBtP>FLaxhwnf{>%B&`{G5*TEl14y>B10Gr%ffeT7nc0_4D9F{pCY74^ zYYK$QD&-I!9b#ksyw8F6Di@GxW(K&z(m^T=1p4`ZQ(-PN)^b!k6sX69zz|LF&2Uc* z{ek$U(W>8Sb-n|N_Fige*!_N@f7UjrYHKynKzyUd*%jAyMgFKh#~XK zBx|(L1V82V=tU(9l?j7yU+#}8&6x@f>3N8&nGR$l88-yEaO`+k9hGe)NslA$9TvH=G#Tv_97^qYoAn zW!#s^x#u)fTcvad2vZfCX(jZ*yY9?M2_?ofwL7w`q}}4T>i1TuTLjm~XslXd`G3+j zf8LI)E>i=@qEocgC_K1vU9M`J2vP7ZYC_c3wkSYM3impQ zE8W#Oz87bF&~rOAUJQ{~EA7L<xEQI}q{gFOSJBL+@!qzdQME_4q*lcp&$;#Y{lFay!|9mlqNdnDCf zq-Au*lO_lyK8*mldJnU@;*)+lG7{vy9b9-%f%B<>>QZ3aTt~qkW3`c`GIl(_CZG2^ zB%OvpEe;qyELG-rULjTlsq-<*+%yP{TX+!yO_og3LJ8|}XmQ^GJ5)htneP%`e8R#B zmV$g@(pE|NKi;TBf~lku%@3aDP0l-{(HXtv_&b^&a|4xL6+4YksS+`Tr;d-t3D4YQ zXj$8~&2kf-gb&fNQyCtW^Ad3Mc^jE8DM!dg^ucOW!jS4{-o78g`6@QM{&7agOTXn| zp+?Xe*TvW360(y{zg$4gDppBSc8}F}96&!I``}2^jghfbR^e{`tEFdek{wrNq`&TY 
zH0&Q){j4~UFl7=3dNT^IvFz9~#RUH6jewDtIRfhB09(3$(Hwt!UHDnBH^I#@vlteiCmTSKt?WfuIGHP}ju*;m=9+l>UVw7?siw~VMc!-wcQ9BPw^Hb4C=JX`@o=`s|4QhWv z)}6vcgVdA`BJl|X#x)3_Y7kp#xMW~8@Wth=$9sNH1}5#!cy|1>RrZK51S!)WMeHO@ zS^tlfT+edEI*V*8eBTy#o{G#<5PvtMMf|bfqpD@}B|H(pT|X#t>h;2ih}%yhq~X%= z;LgPrLFLm7@__zkj~eDP(W*`1_!xYW#i@3JElB_~rf>`i)9I;D>n|=p!&9N4PitlM8PSd+5Gd;lex(!ph2m(I2sw# zW9){paS73vD{GXAcrVP%bf@?qW~_b`JZt#kxqe4a_X^B8mzE>ipii1UqQ9z55~0AM zF0^fC@g*W2*_kLYnUU2b2NBPcMmBoismje^V+YE1$qJ+|3t|%q3E%C>%cWMBrXX?B&|wR@Z>e%;i(on)SdMp*z^LXO z%#jf;++{Rs6dqWeMO3#ws)WVyx)KCW8OxTLg^0y@5rS6o-yBb9#$wdXx-y-c2L7nJ z2HVNsSvEdr4MBancI@O@3umWHX}SD_prVu2Yz=_Aq<@Lt#rukp61;E`s2;sIaEYz<6xd4fEkpS3<8p<2{u@%{7&;mI-MYnCqW+GEyK4>3x ztu}SQsulqpnwD=jtO<|m_k*EU^g4&S9LEPeKJ9ygd4>}}d8V-C(D*9REU18D!BzP- zId(_DebVLz+l8=E|}>*%t%?U z^hkjad)^~uE^q`AH4KCV5e{yZX@#$i1>C}?mS>6do1YVj-ux?;*b`Opry3%ahkp#+ z3)Lr+o5GioINA!@s}Q{D63O>WE5s-8BynO<6RR|01X|$0;k*w;B6_Xef!P3h{H_w_ zVrk|2OffR&@U&RQSwpebU6)os3`^<&CD-RTlNVgUX~spa9X0%t?@unNdH*++_u`Sv zszf5y>q)U1*i+35K=P3wmaaCITw;yO=R=-xMwYXh#q;F5j`6(Tv+pXdi6;(FLeDu5 z#`R{FSl*U|uY={hp<0no2C5POuK*OKhRd-~VLegLv&+2mu1ljH0$%>Ru-5*k ziMhL&nliFTAshUf9iUh!bOl2bIeHIvi<8>jbRrC;(RY~z47dlJRr(?+uJ+yIFI}9F z2hos&;dI$?L)5|?7OPd(CA8teI=i7ww@!+}@%aS@htYlspa}43y zyy0*pF6z0)A57Gz6ujT-fiG{hpc?4%$ zzL%}bRr%OZwB+gKn28^Pe2=M1+^ncVl!QPumPAjwpm1O0*Y!?$O zJu%e#A4@Z+ve+G4Z4{DubwAF+6;?S2KvSesN2fHv@S7f;EnEA3vTL&ejenwL-QAPU ztxL#FoOO?==#VP93A(B~3}AT4F(k(Cfs3VUkdSiniv)EN8i$mPKNvCr z{FbIu-x!r8(h{%tO~mYjt8;-T`q(33Z<+i0LFgTC;kn?gS|&K92tKB~%qL~LG@MJu zD7PVC@_73?)9eyzLi`NksyS9pLYX%SGQ7Q9rX-jeoWQ6ZXxa*z$|FAH5aDe6(p^Ng zs&HA{Iywn>V4i8+%IzdHnoohgSBsnvGl=TAsVp{58HnV-IM`)G=rO@QUOSdmqhUJZ zi5lx`m@8~ZCA$VA^arO^@S`I~p?c?(&F?*OqpjhbO!6-Jze9It!d(4L^ICJ+bg`w9 zLnCjqH1lzV2m6{7{4y)3xD@Npf#MEwYyWhNkyYIbHb)X8ugJa3G@sI*Yb>=4Dn|@{xlTI4lWlb=TqKXd%Ho{TLu8LviWzU zl$d+OEI=QYhtDYC66E$2c}i!RZcFst-#96@OS&sW*&S!+@#+qlU#Q#yKBx_Hz@pkb z)5SeuA!z+?5O?0!F<`be=``6O6B#IrWQFiV-U%10fn2xEE4!8wr*8o1erxvoY?Lc{ 
z*sQ0G&7Hx2hNlX`y1O+NK_OnNb@MqhQyneGKHt7i^+L}OmKsu`-*W4VbEB3830a=| z@B3u>ULc>`txaFP-doanr2pw*FsbW5JBT-0EdIcQs2Z4wUS9^h404cuyPh1q4l{F_ zv3edi(|KSJK1Yr^$Ef9fyiLxIoHDj9NO07LKoq4M)GsuQeyXeYT#x!FiQDgsq9UyG zN3qehMCoC|rOpgz3{mT3@q0(sNk4tTujhf1`_ajAavMHdwqudA1^8e4TqhJmB)*|Q z5&RDWAL(D*fD9CbFuva33(1T6ral{1T~%9SxQtzPRv9D&65bN`!A!5XdwPVx1P z?`q)iql(FZ0N+AiXn;9nCszKb%V&YE=`ULVQL;3m24z{awiw^4)6I3P_W}tcIm7_&y8%oyykdLTzc(A*G z<$kF~_8{fi4_t;@7~tf{$X}~mh{Z4}R8PZ}5q{^)v3bZkg!1zz&4|=)p9C5K5kjHs zA-->;$Vz!M*dCRYzKsHld{7ELFfxrx=kEc;p3z)bt81ny9fJa-=EM_+OZ_SvO&H`s zAFt0zidB5~gMGkfL>9)N&-X8h^+-|(m#&vDs$_?tc{8{RqOF!F z3FMVMwIvW&9Q(;>Mxv~y?fEJPc|fYq4=sL(X+cVe&2mff(%;qbi=7_|d-up=)|7gy zUb%1u8|4Rt?ZL3A(XInTGfZ%L`L^z?1$+)&buVs z{YfE9{TS@`wu2j?0&9Nq8;^ zhz-)43>(9ZyG1mr8%!=TQy9ObqpyBNI&6Y2BAD04!$i{tvqQ4XA+6DdZ&EJ{h~Xd)Ikj@l922~b2#r>M)ftl}FL;gb-;2Og zhHi+{R2irwIANbuQinIa<$e%P!}WDnx}>Hdwm)YX3z-qe^*$w5P%Fez5!M;iAATRb z*tS3vc%=8%WQrtl5`_;O@Ne=1C&ZJH_z{H8Iar&kFHecJ3NBfA@{%ifg<&$f*JRV* z#C*g~5IS;Kt30m|nUXuDg0A0b&Zw+9_A>13gU_3^DfOZ)KWcVd_i#_i78M#QiT+zg zlzn{j&?^3L_M34du{cpXL+$R>1U0gx1#;((2bh^aLHVA2Un}uo#A%f z-}3b#n-mX@rD{uqK{NH>tc+20fKV~rYI-6Y7HH(4st`Z8miEYP^bUQwn?$r%`LZ~} z#Xsm1uhyqgSczCMc4&>@SP?AdLZ&URx7^?_*kqtN`=BTJlxW;E=JIV8Ycz*{jmk3hgxW+TwjRZGmx6& ziKTH5br(m~MiF<0){yE#@yZlT@Jbdn(*yV)ow$RDFa;IgBFNu1!ru}5K7qLz?-!K< z2%CeYVwiV-z+Er|uiGgoP94K)H8|kns_GDMW}rUnW+_skJ7Z&My{*E@QYWdc)na9Z z2E`v93sQyG=0x9cs$e1Db7WX_vzqJ6Tk930KwwJ0cqg1}SRNSS_IdK9p9XoCgHqDV zqU+4mUUP8iZnWnmKA9?zKGc?}WV6s9au07m<6nxXS7whX>Gcx2|Ngpp8v)-g;`);% zJkN39<782t@k|W)30k%}i0SNNYUFIotU`9fNzXfPz|%#54K*%3BGQm{a8>=tl@OC> z?J2#rw7)+EB9i<>T^vaL!}~ELWE7voJbYgKH(?>=kiH(FNcP@BF=t3D6o%65bWavxvNve>SI>sKlKNpwjS$<=`F$SA-2ntr$RmZS3zCH&as-j-^p?45Ab{Vm|=f7jUd0loCn1 zkg_YF_go zCkvg#ESXd}M10lH$Npddg!?puZjB&PYijaaF@5o9;zj}0@BWBk2jj`b5e@L7Wz66#wLDDH6K57OwR&=W;WKdICs+II^3B+E_8-snxKjUq zr_MeNui9(!!x#R!25>}o{qxF4XYr4^+%LIZU6Y)~;P(V%n|af=__dU>JuJ)+ciwZs z;>yAPJ*@CCP$!t-T4mES;h@w+5>7oaC>k0(m1Y&Np2pgKw;=Gp_BZ*u7%d|e@zb4_ 
zsNG0yF=-0wq`Wbr3V0hYF#FAEOgPJus}%5OY2S|%n6$>pi@D_8xZ^y1nnh(c z&zdiM#GV?egp;HQN~TwdHtJfIyn-={A2ejs~m_Psb`)N{H!2wn|biO(>bdo1E3w%CH!NTQX=u42fUo4*{ zu&Cvuf;q?+wb@vop) zAz0eJ=uRNzHTKiAQa4oix}qz;O9b#(7A-T5bK8b-`F$AKL0Qf3c){bh^f_u2pPt|4 zJ4B6k>5_tS5UN(G!T;+)w}TKXnBlYugX7TWIEDu6Ki@Lt$4yO+)bWvMdAVm?=5 zJh+r!1v@)N`|Jt&yF$`x|Ri-k_zVg&E^EeoGbtholGflIDFq|2{ zQdvv_8ACQlBGEF=E^`7@3|Dp;WMxLM*SA-Z7J}k$b9q{7J4W0?lnY zH^r`&?0yg^6*`))>_=A)KJyxT{mIbs#hg8^Ry-6{&l&$oJ&^l%s%m*XKmcVxn!oBL;BXIcjF*R%3-@ATt-zz0Sd*>ocI9zU zIkymIIYZNssty?oV|bJZ3(YFvW#1eAk8So4TS%nlJEsJ7$~0NG&A_Dh`Z+^qNJrFb zc$Uft7W#+{S{C2BM!Km<%}!!I2}c6+H8a% zdIyNkHk@wZc3U|V$|JrMY7%20B_~5>&XM7&%$c2x;P%I#7&kfmQjSMbJIpYsCV7pR zvdD*cV*av%w2|n@R>H~yS=3`n!@UsS2QyUO-J;L_{dSh3w&FX^sdvtPl|tl2H;EB0 z8!Y7@UD?lO#?c4+gg6s<$V+)d+F`@2FO3tE6cfF<>Zl6Ob)W5`kGgjrNZ%ioqX|fA zS-{yzelt$!mqMOCt_$6<<&g8p9z{ynJX}H_98%hE*cj8*97OGm6vHl+!?5xlTA8x0 zkPnvuMv)C5e!(IJ2#^&??UZRg3mD>sM_Vs{aF?*y9){9{YHxUD9)hXJVwBm(5H`?; z_F#I)a0?M_%&Cv_s0#=B1BB-ipy{I)nxNJawIjaYYQ@0x`=$KDyYe*_GHTN2W%~#l zRVhePV038+UFg!&Z37@P1LGP+gDrQK21+LZO|A%1Gi=#i@cv!Fj?TtE&l0dA!kK%c z9XuXKW{ms`A~=^G_HmzrbOXrG=l7#LopAp>J_NVvB16q8C<^cOr;a&lP%tkw+y%P^Zm!SSi?1#kC7hWP2i4 zsrnZ}*0CnjKPa^~A=|PHZ1hQ`WoY>P2$s3Cw)y3g9(<(HnvZ1Yyv?eE@V5D}a0?x& ztCsM!SZB%$ef^lxwXROj=d``YggbaV@D%{*s<6Yn$YFH}1EcPdUTZiJx^qtuNkPr>0~+32-*eG1n9P&f z#>Me1GDuLko~v0b+q}Mz&yd+Lw=`~NvO+P0#QV~&(_&JrDqE(VZ4J^IHjoDxHhSG} z$*FXoHXq>^3Xi(rK zF1=fo)}2c-4)qn_sl;8!scX9xrO1&A-}KmeU@v3O z{!t$lnq38mK;1y(PbO+qu^kJAcj2+XyyBZ{vQKvJAvgO9d&L-NvnTG5v$RaGn43Mf zgN@*pu;C2nrqIdstmAZyq9lpAU1+qPKlXU=n`SNq*ty;$uxEHjYK_t~RYOAiAIPEI zv<=qP^29T@)aXgz%tfU<8!EMa_)?Rm8{42+qQ{niwX&ix;3Eh;o&sbx zgXvTs&Iz2vg5?Qcsod51zNID(>a%;tnA|0b>(HAiFb(s*X!K5Od)bMl08VCd!a8L} zHZS4pETXT-s`Z}swhb9ecuc0+u5&r>M`Wctp2%MVwylKa{z)PfN_Rq_up?O<)_i9K z{bhH8lTZFH0@b1;>?qoc1r)f7qtgN>;-Aa5=u-OZoxqRJ9Hu6?sJeJ(#HA;M*uT7p z3pK5vHsdb#cpIfZbH}R)HPP^>Z(OY-lBG__-8+*FWSZ zi%Hc5GwH8ki^9l4iN24ha-e0(?(ur`%kLNZK> zuAP6?b-gHDNPY3LfsA+TWgrTF*^%b6vVu{&XzR^1RB_Twt>ug{fZ6Ms5$SZzE~cop 
z?tCVq(3K4DuD$yJ^b{=Ygw(?Z%gfS9qE{JlCwCS((aI&xSG_?gMX!>0FBPic89pWr zEnn{c7Z?ub@g#k9g&3=W5ufS@L!i7EF!`_oLALi&QuoMjX$fGQzv#(kpGXH+Zdf2g{gw$9xP zVN!BA=^>cdW?zT|q^T6sOFl0=o#CTw?Ka<_7gZ6cE~&~|TSZ3w4Wk!`5B&k#gi}Y; zE_uc9E&RWcjgFHm>y`ok!DF@=AHWnVbrxP&1_OZso(eAl=7#IJagUqN*2;DCm*lV& z1}}%Wy>J$|Kf{@*Y(r$#We0@xSwk$y)gm5m3Y+*@ltx`prXJ+1a!bWermyzujDkR< zP52V*EE6h3&%R|ZB`1}Io(%BoFCGq}Na_bD)b{W}tZb;BWr!HKcvuEnmcN~Yss4P` zA#rlP9YrJ8cIAEwVBzhaM)P%YguFt?UBjR0GiLKsAIe#4^45Mb0zLd4eN0ae>|SpQ zt;4ZGY*QheF(P2pB8fD3$H$&pWlRCv>fIaD#WcJa8=$1xw%ouQUZdX+Cn??>ULBC7 zJkK~sv>!KyHGWc)RL1;ot=cs>s6Q@;IoZ8eq}JXKa`9<2UDeP*HRE)nA5&#$K`ZR7 z%CH1n<(+{s+5ADS>S$SE{ZG4G_&;T)@hh`!7Cpl(n15`nUfv%+YC%?AHsceyxqJ|G z#5$IWu3^f2xaIvrF2HYXPf1dp^L1cRZquxp(yhrwnU0A2T^m43Fk$e16quLso9S6N zSeoO60BM}j@l-3!#M8Vecag3SOrFOJ;Y8wMIA>FTy(6{Z%Z^wB-kf zD{M{r(%j7;9rTZUue%VM<&av?JBM1>3q|~fd?qPQE8me_2!$G5KI1uGY@`sPEfYY7 zE)%Ft@DGOk$_BS`P3=4M*q8d0bXY%5q|}XGMh3aBsMxH?*{V%naAGGes&;heU^!71JIa^oj>$~Z3&K^B5jy_~Ol zfIYk9&(zrm7H=IbmAa7b7e^Oqt_|jPXyV+gOK^8gxG(MXUjlEX2J@M@q4x!ZHcx?G z&08cQU3ex9`Fzbd`KfJ?*+u-wWpPE6HpWv4wJ3klt}f<9$a%FdI7#tO)@yHq6X{q< zuVLpp1A7kuC(qTFfeI-JUX;uKTa+DF5MHd?fv%ZqCSN%>UF&>U=m{qCTjmLSIypwR z4z3#;F$(HzsGt$1s4c@BWvmcI{u(a@FT(pp_9`;**Lq?QxN{-8rWFj*PEM8BO1Q=z zyq6M%*GB)(u%?|ileEG=vXiKDqJ=jYM#zaiug9-d)QUv9oZn4)Z1?~t3`LUUt?F!nAsBbyZ3Gi!nT{bzwH?6L0mpilNPNg=wWO_`(wb(=*1+}-UGbfunFRYvi#wpTj))ej{O!d{9fkL23voBrtPq~Y zhRj$QK|b|M;(_eaH63zb)ZFjVIwArXJT>yQqpJb9wlU*0PHeR{)tXH` z|CfNtBCFvA8EKi+MmKNz&(T3(&k$oo-SDpTMzWiJP5zvHz?E%DuTD61e|ka6SY2=Y z7A%!0)rF~Y4PB%Q2-jXUqiSos=;#wqf0LuI(@a^fA{n9jtcKGzuCgl-1A0MabNU+6 zo{jnbUYBCsj1(ov$izVad!)pe%$!SHRCunODKPYTgLv_|8+8%V&{qRzhZU_ z811F*g&eV9D(LGd&FBSOccVmC*v-qAraap&o#5e#|4LOk0J^&yiTKRlJn+91K# znB2@-#{w=O0tCHavlvn>@tsi5@yO=!*dNnfm4?UovtQ^q`PY_wpaL{RQR`8I&s0r- z5k-gOmVPP3B-Y@Z+6Rcp7ckEj)U)#iv#X;Z>%lINec#hQl(01k5Q|~$GqGTMD!Pa5 z)TLuOUs}!Y6Q*0s4gN{StlQ5gFu?(H!Q#jSb>Cg|69givv<_RYD!IlglqRl%C)|s@ z(>D)&((9`AfP|E^MQ=*2@reLCo-`ae_5k)A6eNCAaocjW(Za3z%n@OF45k}?3SI;{ 
z(xxfiKP4TRzM_0>sodAl8^&bQTUnrp%c-84#gO8=C;g=*Q$YWsNJ3MqWvlG}< zt+7pz1~*PJMwzU(h>@SJeCF&PWo=^nQy}^Y`g7$sE)K(5FivA&oo0HS4eaL5gC|RA z9qY(S&)S$kCt^1lGLA7y%im3R#bBZ+0+T(;eEeEy;2HgK!pzf}dL-yt2!WZb(HhjR*G&f9PS_~^u+g@c6#>dA!dAHKs zTpNWM1U;JX$!x#4C|Sl!DtNTV)gEj?<-1xI)88UvR+ay4s{44px4hHeEQK1FC2>M( zou5I%R;L1H}G96e?p%_|uKQSC&9&Du+ zPe_*Vk(OPrsz9`PUU|w({9s6Q`tWrhjqu(fv2g@g7KM9dx+62BYk6rz3EVs5lK)=e+5lo9ohf$X3 zAd?U(f5`gwFh_R`p|fHpK=r&09?8V5?-(*c+^xWWdHOBPT#lJ{uF>&AmXwD%f)VLw z%cl74PJ0w`dkAkQWFdZ=5Oe}C{A&CLj~(vHpv>Yn5z6TOi{jSpfFto8@IhE z9sMwtwZSE%w?C^WJPbuoY1CP;sxKR2y}%_esgFLEZ!HugNKAH(Nb?YRA_~ z39_B-!Q4xd2Un$Ubvjc~DO9H|O{-tm0dVK%R^5FES7JGNe#=geQ_C~InzK2*5xv-vgBxOv%a1o5BRwLH?+pvjmNc zPa3?%T+Lfx7^Wi@_6_PA5Q2QiL`k#&-4cPK1kvw!x|UbHPlv3p73h0vx*b71eO)?j zr|q7+yxR`nknu_l%*vmancKOc*%uA)K6Cx*g$QC@mFwvpdb#78#(Z6hhtHjgHL3%; zqk&V0Gsd$SwEvL>KNxtle#UvbcXhPNW>)5(FS-BJYx4v^n(9042=Umm)N;`F-9++R zOE)yKTKl><7d!nT$XyIo&lDQ)EFRyq??3HA*Tg#W!f#V* z?~LxCIS*prpv31Vx=lhQU=!X&!^mbdrU~&+$*+mBv($>!Oo%aNkY(LAn}3_!Kpqm_ z`!^^=oX=a!oy+2@i2+R`beq3mcoE(f{>)jyCwv$5TFP{=3B!X6)ECPg(uj`AekKqC z8ZbkMOaNXkuZV74si+Q30-kx+RqcIYF!aFjY=z4zeF|l6WOHhpWkh9TZ)9Z(K0XR_baG{3Z3=kW zw7OMTT-&xaiUlXQTS0JlcXtR*a4Fm=+zGD1-95qG-QC?K!5xBg$=c`awe~*$>;3p( z(z=Y^MjvhTSr0`{qNGAEY+`2wl(4e}(K9nJ@c`uHmF;W{ZJC+qr9p<)7RCS;1|}vp zcye-4N1!3d!p>IA5Cr4_aDvPMipHS#R7U^{6B9Q)IY1I<3v_&UngER40rEhQp{lz* zkQqQ__!m&Ja{|#D89KeYfwpEAwm|Cl6j3{ScSj2|bI_kPSn28itn?>agaII9Xl!NY z>SSdBFtjxR$S}w=02J(8-;EXkDmz<%5zySw+7w`C3Qz@V0@PH*l~n+e%8F`AD%1?` z3#&NW+uJ$*4;N7tRW(UEfS9m?syG0sP6v=wQ&Ii%R~2ac&fkm(KmXbI#=-_+hyO&#rQ{we^VG6#X|c^Da8U0oT> zoSi@nc8+EY_SS##Q#H460=U{aS^?gljzDYRUyX6LHF<9*$Q<~u1^yfgK+eJ#XzK+0 zlO$pHFQ?6WE8mme?Vx`Xdv65j&zjc%3I{j=f&Wp)+|cQ-Tsb8rIe?9!g)InZYiMiy z9tbi7IXeLi{<6LQ0!=9YMGy!Ob#`?8Q$zlLU5@`Q^S{(Z?A~{#W9{i}==$F$W@zi| zMVej53(uBb-tN6S zwhSOQ&|lGi))N+!;{k9naR8XP*#J!MA{Dna5w)|id1vhe|EGOo7Vmum**Us1{vTai z+1k07UMj;0q^N#=mG?Q9G!vQp8s_GCxU0@1ejPDgWiSy zep%rET3y=K)DFP?FY`N-|8@P31gQSHsMPPb)Wpu#+8tm5G=*nWumimdg6jXDd-h+o 
zB%H0S6%1{FRR7)5|Bf@Xv9Nak&&dA@(E$D-OQm4vXk%#oUp@;b2@5x%iIN4#*!*9e z{g?e;wS;ZWtbqV}W;O;URt}bbdDQ+~E9-atzRw_wKO^jqpX0xL@6t53vIROh0oa)S zasl7l@n6K>&)^Sj0OJ=aaS17P+W(`Szrw_AjqOY>L0?M@K_@tSBQ})mpn3i7@%&g!4u_ils!4cXsWfL#Ak znC0Ez_7C_@%l#klotDR6@IPgv^k;njH5r)x*3tjGSN_5(AV)hZpoWFX`#bd?A@YVG zM+-M?ruS>j{BD2$`QQI^|Dyo;-xKK{*&-r#Zl3gP>|6kP7VdZbv2tU?im%l+V5ILF+G z;b@KKwuFM$j=wb34i0*Iz=(-7>=9n3nKj4@#u}A@w(ssl-vf^<6nu`L+Dm6T0HeV` zsA1YnfW+!8iE4Shl72L-*Opc9Pnn~zMf+WK0gWN3c4O+F^UH1Ezk_yaD$6S zfiV5cfX_!sE)~fBVYd82L6efgdfomiKfYPL_1T<06Ra_e=TEsgB;VB)G{x&y$dFA+ zX6eTlOb;P;8kSX-CEtunDB(LZiHQ*LhXxWG@{1TUI{AZ3WS}h0b5_mG#wC^h5Jaah z&9Bp{x>wrq$FuyJ;5i?N2#ujv^Tep(mxYFq!YeGonzEP7mNYtBUR+-RCmk6Ane_Pd zfVWlNWT6YjYI>`*fBOdfHHDHfHqta4Kuagl8$bYkeER~~jsI(~XQODA86A51dXu?R zwscxtEOK0DpTyNsj?V)hd^Sa@;|vqCFix}`$kWW z6LefPg>p}8wr!FDKAv0pmau-%XgH8>^#>Ul%2GL&iKgh6>H=Ept5_V$cT(?XR^YA1I=4m$E2=kY$J4 zXsSDd;bx2E2}E0~L!LS7HVqI^DQ+_qcos~~P zP&%+7g!Wy1^fD+EU_>1V?MRt|UGVxVZ5Px9I-i))x$-)pJGDW2sJ`%rS!9*Jq>;~i zpIC5{wnk#pzpb~n!GtB}F(f+4CPC%vOvAJJ^qw!%X6wZ7Ilr=>k!&JRW~LA2li<_* zG@A1$n=5zyjX*80VBe5E9W?)?;gy84+OMPn|0o$(E)_m0QAPSd z1KT^HvTISfyXxL*!(g#X-iySarEb<7ER=S)WY5a4p%^MhTu_e_?qK!_Xqp!<*3H&Aq0RVhA^mR?UIoJt#M{lHQUJ_E2Vj)yD0_ayfF9Kx&G zg0ya}nr|tsAzD2RJQD2|vax>ca4I0Y14T(W!Xq1>cFZ$)hz~9&*Urk)Yt7ZEkD7mFr6#>sZDkBiQdjN`C2jNiaoCe*0Ljjr3`e!R;gcAquS6r0XwUPly8W z@KwyVUolZTuFBucVtn~%F}L}S8r@L$;bX?ig<`s)C^JN&PCn&TdzGrNG_Y8RJy|$! 
z{!Z%wK$GV3mayO$YeZuX)@NC5t}z+4Y=s5j#CiOn#%I}muxLM)R%%GoT_|( zS_k=WuooadK=FsPxoscmOD9rk=x>*Qg?vgamHS5l$gB(P|T z?$stmBd!9!2+j0c>Zp?xc0AJE$%K?wWm zorc&L-%3El)oav?^faEA!S%Y_DoA(vZVAv~>o>Y0g0 z?Yk$(SiADFiiIR=&4Yi7i>i?1ws&UnIdw?fI6U3zmk(B6DRb0?-V-Ah;>kqOljNEu zVMcAJe`G_6{E1138D6|_v-(rvCVgwHEh$75DiZv}M@KpB-aN8Zl_$v@GFp$9h!S57 zoKqa6?!y(iJ0Sdo$(!V7Nhpr&m_MOs4-=o{7fZV}`SN2bQ%8Wc^7*RTi?p4^y2&0x zb@Np~t)KjbsdW%>i(?al+a(BgK8_x_F7n|#pns}9rGQJCH93Kpr?^<`5ULTCAun#w{ zcdbm$HAul+F!Td_#-QaQ59)dd2Me$)jdJ`!sJSa?wM829>0T;p)0VcNRp9&cY3~iA zFnC!ggj8S`&fwIhVU?{mc*nl!c|VUevZ|#IV@ZL(Cf*Wti4AfR9Od;SZR~^-iZK#+ zFU1=8#cjKN7&sWxPes;dUHuZTSH@6W3<_QRwY23;5r`do1^OUeiEH|8PYa#sRj}9A z*Ne}7R&aq;P}ehuEC8o6gvy%I+CGg@tpfg4Y#qz63EBCue4spmqO)(}u@DzoAw$pTI;LCpTlcZnRc zb49g~^*u!vE>&cWuykfZ_04by+s7a*bHV(4XlW^HFxsaS^HOZ*EH&p3xg54f-53nE zqQCl)rYbo+*?)XvwR+yuM;*Y)Cg|3A3!=2oJEV zr?GtV9Qa{ND{1%`$h5~(g649S<#jo1ou2k3Ox}Tg#rZJ|+lnU{cI*xWp%aCMyn9mD z_T8&8;~WAC>?M>y#8>P#6b_7VTlBThkt!7$dFX+dcd`NV`fMANT=vAYgci-I>_V4A zEqha3u=%<3kxB+ek} zS29?hg$-$tS}{@2*Ye%#jtL7*U`3eKoi+G(-7J<+MF`ba0G0^AK2h`-uSh_&W@Ehw%fEzXf>isoN@!JcJySyg3U6+0OcO+kTsyL)X z`SJP5d?uc#j(ba7m+P}ne@+6Vn&q(&l6eht{?W|2lM7mV@m__bEbVyxGg&dLXO!0^ zJm<%koa6C{WxGoaQ?N^}^fIW2^I~c1>J~~YG({Vp?!fk@Z#nDoUPjfrW2wjFyN4lc zIU_Ru@&O8aBPw49jzdf)pp{%~sf*JV$|}Ivf(m-@xw55s@k|rcB_*J%F#wOn`-#cK z3_R^)I&c9L!=IXQxQ$kfrj_6>!c?G;)z1XW_IZEE_ z7jci8H6gCPr{8}mRl|%8_9M@8R17#sB+ysxuRkt(>g`o2AV{VahJQ452`Vz`| z+Ehe4vvy=BaWO4kgz5M~mlPQ5ggErYu8i(s!T$2ecCK3menF=9{1^2Nfs88YHq(;w zV&D)>+-<)A3WWu`4kKL0tj94==y!Opk-DV;Ud0Le;3Qe^YozK=@TFH~=6V7Y}Q3nklPMOe)vVf>mm55xBZL<}|=0>utoermD`LDdRdcANdu>{v!lfhCmVot>k zorlc4_fHBc=fb9|4mG^+8JCw&A*{e7;*s%LTQTokRnBj-Fj^280MdnD?6f}&Ceznm z23pgYJnM3gyMlST>=3HshFSxe1iN$ZA#i!49fkEWUg)@g6O92cnGk!FvG4t=#T`}U z5k(aTexQrQ(TM6P4T0BV$DhRfx-;qjTHkg;wE+=TkzgVXSg4g5r^AfdjUlYFPFC@d zf8+feeg(ZA|MO!(3RC*>xs)sG<7^}3*u6i)c8}N#dKS1I5+QqRhZa+>`p>5Bf{P!Y zvO^sL54Jb}+8CLo>2x#KoEV}$*ITCFgv%XPO zNp%`NawD50XSx;)j=@x~4c{DGf<5v4LiznI_ULAg?yh7dLUJsSvtTLoQL(_4O1uyU 
zB~neQWG}h-agH{alAme9ua9h$DzeoaWQN4w1^>bb>b3IlZzs{gU0Zj!= z)pvIk1r!lPL9@NsBkFK-IA@dZf^umo!EqPy_$Df0>sjYT(1YDTs3&Z5UoBvW`}VE1 zZa8L`a#RPRI$Bt9^3DT(;fnj>536Q8cpJNB+v(D>NYZ^*HN#J!6V}Ylb8zs|k-76D ztQd|i<@uxFX6@-5OcU4Jv6krn$uCs_!1kR?n{CYYm)|m+?)zLH4T~+i5^Y|cVpLYh=?RP{ZX{V$(i`lDJ0?A5DFDQw zA){SR&<7|KEb4V}9mcU_Ah~n?pC#^MX!yHIcn5ukuBN3<0Ry9eBOYI51Tivv*^%) z7Cx))sSzJHqLEEde-2AwTKIt2`^mEirf>gnjf^du+E6mEmo>9fPF_BdQetA_HUQDC zx7b%-Z59y%e-RUn<+7$XI$(q&zm?5v1(AX0N8MC~QII;n-zuY#-U>h8-kv#$I4FgQ z0J7}Pj?p8G2uBK?kA8YwSzlW8&>g$i3A58Kk)eZ;4)T@GxY|C#&2&90W_jQN+NFKy zXNKv6S_0B%53Wr+%`3~=3K_GH(zP|LD6D%Ipqb2jCBF*`brjq!8C+*eOpdtwT|zko z+LqozOJb8NaaNbk>m7A{0dg$K>+>7YgUPv1=B8YsTD<4{;>U(xRu&uwI!O7~3!)SA zNQRDzMEOTV-1K*IGde#)#k^=br-q!q(SMHjK`l{CaU}!t*tQTe%~0lk@+XumsthQh zSO~;{o4=^1*ZNs$O7~J)hLbN)!)zn7BY@2uFt;2fr}qd;zRY@X8wjG8?S5XUJ-?#_ za4=QUE+wz>c`r|OJl7k_$+`N~df=}_=4gzNz{bxVxjjXE_(B+eH(5zjd_;ePTNyR8 zTxW|fh00;nP!Mt{o0+B!2#X}|m0xDQtWS08)w??%=HO_<2)K>3wDgl!geaCCyZ`XU z)P9(}YMybRZ;Vz{s)+D)1gN7d7POyv9iwY^5w|TJ6aL*?qBi|}#%gI%+xZ9j6GK)X zW0rHHl4><`B6#Z$i2c18YTX)lpvWDQeqOd|BME z9qOUKs_i&vJ+7C4e^sYUj7z3O!P))zT}wBVnC54dH;JOe+1M@Fe6)q6N&2@f8aL{w zv}%eSe!;fvoStOE%zWzpEkk>{QP#|)&TyzwG_aoaF$a?6HMH&TK`OM`^aCE|=w+H8 zJF`(xs%*JfmvcXqTIoa8*6Ivj-eMeuUAAh41zm9cR9_zU$b=v!ym17yI=n6;#$kt^ z->vE15L}VNx<|y^826o_PpU-5dBIotS$^wo+L}o(whS&nzX@zbgvstG^MeeJOS$i% zxo37DW&K`q4W`D{50%{pu~egoeV%Y2j0?MnMd1@V7UiPVhcdPC{9pqe>nY9y^KwIo z8*Te@=*yS+q6v(T8_xcog((a4PL_V5T8;Gl`5)pftrYGP)SGaXf|j8ge--35=!g^< zu-ZlMy0@$M-r29Uq?tio(@NwEOa$b!;w~caj}wYOslj!E6BdTD^jYOQM*Vi_Oh_av z9r0;%iBY$>j~&j28LM_|-&L(yOq%-O;-gBq7*5q^iOgr)j_?z50X?*MaAHvJI((&N zN4MJud;A84OWC0ixB@a_qnZ0|#43V^;sRaAQ+Y*@ee=DD?&2_b65m1$I{txYQ8+)G zrG6E}p2Qfn3(IJk1XJeLZ8@a5}WuXJJrXGxdgu|z@>g#&ly?dzY zjWN#hFaj#wASR5?cp3+y+NJ2tJtO|k?SYzWrx^f4+tAf)iD%mZ?O3NX;eS61hu~~7 z!T*1~L?nK-Y>6>S*jjqdk5F?V`I)+Tgi{{@1L`j z_&)Pi$Fr4!ljTZb3XJD)y#5S2G(s>#50};t|9p+V| zxctjAH}FM-d(AqO5M49u0n_g+WJo5*KTeG26(uzHb46o(2&w^6vX)iZ@|A`HRuF4! 
zV!QWaP=UKn1}rr<7^AO;7C)|Kr}YPUn)1!q`&&JjjeuyLM4Kvc6E5dTq{I)-zINr2 zP9Hyb!Z+!Ioxm1M{?;dEB_R+|#5@Kif4uau=T)!6gw!@XMk=|I<}O>;EZ{nlTcQg= zVYuzy;nSxl;~2c{pJ&JJk@63*m;O;ti!YyiH-oo3aYixYTe)5_l@wd_Jp0hV^RSxV zfXsptzE^1?GQj0WMnc2v;C>_RF}k4V(mO8%uG#ZYaaeMaeZtY=-t^j+Uw72&swCZ6 z3<}VE_Qzzi5RfsuA1{y*=40RX1H!+wvXb^dzZce^5iR$ohp!H#V!^%xBemgo}9ve~^pILA7==r$yASShtIh%n7Vzw14cllEV#RGwQf z9dn(m8Dho{Pp%j*S8Z}3l5BJB_*`AQy|u~kFqL{|S(2kEi)X&Eqdwoqk=FQICe7UT zM)7)|Ik-NcnJYD*g7q@lR zEKhMhuDu5V8V;-`L_@EUbSQTi2quy~;Au)%l}T&Z&?i6`Q&g=oO#Oz^-d&UE3Nzf(2A6j6m%(W- zzN_qM#}BJ-QTEnF`#iO%vTb{xQT9G&^$_SM(JIp=c#e2#+TliuSUPN$OswuM%Wlft zcjj3S{tlCu!M2YP=zL7|=+GcMp?X6DR4{BK&yiITBx%i?xPyyV(I*Sn42%i2aSX-7 zlF-?zv32=3%zW9eTvo;FO_t7c@PMJksPibFQoAZQ3_)Gaqt=a_7@n8h&P1xRnL)-wmZ;1kfJoH~7WqNEL_R|uA5(X|6bm5@0XpjUQXVwD@#8@Ch9 z^>)!qN)LAP6Prb}9^=WyKM3aWLnZhkPvUvz*10kz9N}tgeGGlWJevtOHiw%YL|Z6r znq!mMVTE>!dkifZl1>Z59;p!oYeQ39W=fi|PIhcL$}b~){)y8Am*FxX0Aeu4o|vJrtGV)2b%}kUFtTqTvqxXvxOu-#CsH1mH?mI z#2+^;Cz%@+1+Q9JPrqv8y7T_3r@?)cf}p$E%ANc?+k+>#nq_B_`a%ypEUrYZPF%6{ zVG+Z^-&6MT{t1(_`r*G4WaW2Sw#C%rzOm~ zF~&T`hTmAV2q`Z+wMu7gN#>|e5r00VUPJWIw+X>=TSvv5tS0|kvjzi|f64MTeQ*!v zghR7m@U9~*l?IBkDa}vQ-xSd{iLcD($Wh0}JBbGYRu6!o*9u;ehHZ+3jV7+4T7%=a zra6ri@Ni}bjMEGFynGlJ-rhXL?XfZnLbmCxwtAybRyOA%J~o{8kg&a#x>2+>g5HBgWoxA8j<4FSpd| z+iny4cv!f4It%L7ehF%h>5t;V=9s9NG%Z>16A$;^NT4mILsGi))yHG;m$ox6|KMP^ zc)7BSGM_yY3%?7k-9n9+>N7i*WmIZ}i<|6K3?Nq~pIjYQMRE zV@+bL9lKeZS~6QNLU)Z(Gvsrs?|^n;KL+&&9E0bb2-a^M(BiovdDWQloyu~BmEIW8 z_qm8W)A?~HySq8{C0}(VA$N{6sI;ykWx!NL7R|n6(o`Z}o?^5Qu9e-d&PLjV1xQQx3^aj0#aX;K_APbc|u%Czs1buEbj zooKjOf;AX59EBuL#t{n4d*ggymHf5}1s%UVmdo>aoAPybYh?lL=vrjMz}4P(QWZT3 zS>pF+ugRrE-?CDvuNLJjc&;&)zOubRrhy zkXo)ozTHoXjuN;vLT~sd<#x|0D?SNDILyLbWiwJii60ozprz^N;F{wKHK-H26d?`8 z$t|#)AA9&e<`LSzvACjxO=<_#UV@lb(jN3a5W8+geB<2laR$y(sB4k27`w>io zsG1@M<@GbOBPlj+GbF&0$Kf;W8C=KyBc-Cb-M45~`$U+~x!=&;swFP;Xe^uju>jW5 zh^Yz;H}o!^z)Wd3tOxSTS2aE_qT3Qo2DUhaMB7|p71+`%p)XVn_@DcSicX}FRUtx_ zbY$^sT|n6RDs7V5TBtT$?dLp89qUuuH^V=@^=FwPHJoI*w=&}heJ0Mf|= 
zWL(>X>M}rI;BUz4hdarrGL7ilzBAArU4+=V9&OB$j2?|@r`l!UuDh2Q)n71^l(KXd zJbKQCnlvIac_C{lA?ULb5ed|xe?c;^jBs{{=fqKNzvJviIVrKVvDZY2s!qi>hNO`u z6=J;Ks*SA-xZ~L8QYDIEhmh9dz5luy$ZxCANyr%TYx54LvjbcI(k%2%aNXb;z(0vo ze9v#s?z!J1DL)CQt#bS+z$C~afM|A%$39QUtjBmCiJZ^hYqt^z4AL6fiO|w%cV~{B zW1-R&zmdN|6FNie8juVf;8kSNVxf~7Y442nL0s@`5}rvLG~Bj$RQOoJD!I|rp4}w$ zNLQN+GlbKda`zFl!bCgb>Qx|D8OQcoSF1Hy+V6f!GK{_7_4_b4VfS9+>K*b_2$(qr zQl0*{jX3H4@4@G2S2Sf;vpn_7ubu;6BW$p|U-(pml?)OmIk(d)<~-4UM<4o#s)aU6 z4{aWj=7u4-=`zuAokv348v^!2eqZl#3s_=9ZND*CcO&?*f1$ARP;(~zeGWtIOuWWaB`UZq3R(*%(VF)M%x za?yFR=+tH9*11!DGCg7&)QwlV0>;@y)CEnrI42x}rMz`^xW}iA7>)UFY4wcOu1bX#ig_r{KVw&*^1bMlMBte&%Uj}k=TEhKfIt1{ljSCpb3LoRnURA<= zd@Kb*uiy-^IBNRp>!Hcy`{kOJmEycqYFLCdQcMOFiVm{m>M? zqd#*)lpGC3kK7tuiFoG8G#IU=%-pfRS)OnMhD90$4bunNRz(iGu~T~=Msy=KY0wng zeDZPm?5-YM4vk-1k9Lxq)_BM*v>SFp#O%f86tPfByqQJNii9m5%xzR@^$)aDB-Et!fQO=i}vd;(u=-Eogp+BQV?j%~fH37@NN~3n?y6^O| z>f9Px9G9>hr|MD8+5#_|fsXz%SZo@F!WQHt^gHzvA|Bj#BnQQCms9ZxS#-0#?GDAR zoKlRQ?)G{va~_RokNf)K$ET8wr$(C-F@4(ZQHhO^R{i< zwry+fyoh-*i+?f8s$5hpDk`$_WPPW^qJxFD$`-OMQQe_qK2z>nO0@t_4r)w?!H^G+ z7%cRxP0=lFyT+SCg(T#3>L^e4Bh}Eyo|_Y*L=bT|grFWkO&0ka*<;RI3<-=AD-DU&>??i%SgW9odQ?9OuITtnc1fO%DAeR1s%9DYla zZ~q$)b)r7DcghD}b?i{E1)OgRt=312dE?KrAtkfOn=Q1ppd%TNb=X?oLNxs6myh~v` zM4?SL{N1ZeRh^uIj>i$2Kf_pRmR(4O@Vk4i=GdvYB*!ie zwH3CDWt@uPtyR!*T8u4g62==%HGsV{$;_(u(La+A4NVZZo>(W^ofPa`BKf6p2gOb) zk1&!`rmArkos;9WsUEf6t&YLcr|o^J<+#Xc1BvwbD!>~MiAh)9q6O5BUi3#ISJ`Wl zAj5{r`}C~dkwa%EUwIxdiW8$D zMulNm>S~({u2r8f`EZI4@gK2LC9z2;aJ~9@ijhIz*pTcZciist8Lge84NY2O4Ni&Z$NT1CVK3AWtn00KI;_=96lyer zab4Zb_gx=X8n*U|NI&>Q?9xzhX9(x(mwI?_7r2@%%9OqL18~|bs{_&zP4+$QI z;fE2>qB>8oMb=8cvO|&T;$c+nRgHB2W1Az(9b+HVN^S^)vGsPa_J=yJiC5ou4Gr7D z{1O^=A}VX0Jf@o#`@#8n{oe1crnf^HF?@P+zm=)b*-tL_h7pad*v7kjXUB;S?$j^M_N6vB2w z@}MG=ok?~ho$`XU-SYu)d&XCrR3%MlW#&`J2Dn75y4)@~8MJy3i|k6Za<3zAdT4K> zfp9_#tLER2#njjt&e!T4>;`j4qgMSAA||8KxvCt8HAe#u+{U5DN0=>3nHgrTLHGM= zsw4WmU2lO&FiBB4W4*zET~~-&N^77N1h=|pWm7SE{KO@tcrp$BRC^`o{?=uL6xqz! 
zgf`BBqfOGO+IDQXR)ju$^lO(Qu>5Gzu}>qHS>&~L1|hSVviueYzWKAe*T!3g!r)Th z{j!0a&o=z9mbdcWKdnb9WAPHq{hAbmT~F2ry6cfZ%Yp5k{4W2TBLg#?wpAIpIwS{; zokNNw%T&^6^5b>7RmtY~$0h|F%uQYeedwMSff|+Y?Gv(-Kb4=D!id#h(lALS7-pK7 z;cIkA2C8WZ0-8GBtj0N~;+fCkRSgOe+Y8!05nw@n5@-8mZEJVRBI%71_cSz% zH)wuJ?`nYgyFJU!=?S1iiPFC&?xMD&V%>9}{uPZ5qm^MDvzo*Gb~&(;OTYZCg&mU|>?oJRz8bYHr5?s^2Sb@ zLh=E_QlP*SxO*G(k_K6(6~LFk5i@%7QNIG!gyP&9vGIdeInl0)xXBX^q96n^i3khx zNL_Vt0P$6Jso(vfcS5I>mNKLaE45c{K5uUY<#d>Ar&SRz{tkxc?Z9QPzf-oE)cU)- ztqhGx>1WjObMdEaV;CAi_FXY`&Qog1=)4Cn?X3+VV?hFE+E2d>aL#h))x;NwvL zORBO9_!2_R+x4#zWLof0G+>V$8tEUZu@0(7uU!M?hM zi$&rIE~74}cu(0l?MUNmpF<{3&c8zQ!>c)uG>cA|%>131!SG?&{&E>lJ~lew$kNfF zXguCzlvx z=|vpgZ%MU`E}0<;4MhxCB|E#?!A?uibYum^z6D^qEs@y|cX@`BWME%7>bOJ6Oo`|! zo8{&vS9{aO!e;}B?)l30_cm%&yk=IB`MrTSpBa=R-%H5Iw!TV**jcP}$G4 zc5WgcGgi`-yg%uIOxr8|I)(jo^rAdD%nJW$X|uw46!{Gr{Udz&#G#qiu=l4d&CZmE#i>=%QYt8HDDJL-ZDd`@=DDyP?=$+u1rJgsk`k(4O)dc+n+t;010 z-tMX7E89AXaNG4*xgh-3@iooMMnloF{d^X-Z`;F^^sLMN%+PK(8b_u^zbFPcWIs(_ z_yYnm{L1-CDDzVB*XLLAD~$Fywj~inLkdfbQo0giZU8fAnQb2^ln#SqqU$P4j*L+r zgH+&Q{2|GKT=vZFgC)~GxD1AEk7#uACs%UQ(~TB5!*R^{#YG?UF-2*qPg3?QdGjh5 z7aP1R{qly*vjr+@j6Jc#0|?)#;1yzOL)v16@`-@10siz4AYxGi&%=;z<+Z={yU|nL zKy>B?;MbS>`VjL{PcV5Iv_Ss`}ar?&}|v! 
z2V;<2!ke)Y?I#_6X*U4J6rK;hXh#7dtG=s^asK37r&{3{LTxHHwn-33S2Yun3ezYO zmF9O^FUJ9J)VQc2nz5*8C_!{*HpTGsU>w=M@sO~@{BrZPZ$6Uq(Hw#4o(V$i+^v~z zTp;8UwA0yQbr$k0XU_1@J-7Qat-_;)4TyC|7$+Jxt!N85_)UG!b5wrLr$ohpBMkEs}%fmO}0(so4{l5gs}tR!=|T zaKTdJ`^U}A@YLILUB~ZUr4IC{C=|jTiORCk5|NW-S+uf+RLRYMm1?sxPvu-=(YV_+SZhwO@BZrm9g8Ms<(`j>0lF!Z zzR{LuW#a$*@GZv_v{ zD2Bjquc9b;!*P7}3HX56Il}zhptE4P!`@Pl56e2bKHd}ffCe!I?#6A~*!GuWEyPLo zvJpo9;Mo+r-W^Y4*RPsLb_ZJ^i}~gGrVP^X4_4FFNSL3Ezd@LAAIdDRu>M)mCdgT}NR)cOytL z!NDh;NU=7RyS<9aVD}NsNOL9GR%82}eRvi8MI@u_36p|l|DuIpYfxf?tFeB@BMpE$ zI>%$%-oPRzD*f;f$j_FjO)X_K@V;7Nm7|PPtvMmIbvHU}TC;fI@{H_8R1;sfFFF)* zH@i5my3YRbI8Fr zR^xy;Wo(%GXn|==`!E=)t~6b192+nMwPphwdSw%4oLxd!_zce5{;((z{#Bd^jdP0S zGrlZ)^0R?^t1M6o?{1YnaS$pc8(0-o5cMyt$L=x{pjRK_-cI@?V3XE%eb36OSu%QQ z(tW-m7OHp(h^p9Y*G=k-mUXdwU7^8>)`mA*08VCK5%gKs=F%fB^VOL)A+)?$#&txQ za{4)b#swRfFNCSa-gT<(AmcW5wxB9=v!KXoxztgdoXP`o-=cDIDJA`vO@F)tzYxnH z5HVzsrugFqBe^w2Q(zKiyeTk#N(upYR3ZyONF z@Ch(c0IH*7Ow!wHojP{p*&RgQxB<9$$?|1I@WOrJrW-B9J2o)LIgZ!c>a$Sq5Urn4 zOd>ORZKRRm!mG%_8vkT|Ng8u2Ttz-o{UK}WfyP8t`RifC4t?8Pu`c}l8o)k8FtU3Ngc9{25T2Ii6RqdT>Vd)_*ylCF{Tu{hs zn|i0Lumd5`ly#K9J@(5**B0k?>rDl`4+q3fRYhkC*8#)R2e$d;Pcj=F>cO<`NR3(z z%=va?(_OGL71c2a7;!}!t;?aUV$n%Rc|66pMn6+NLY$`~zT{uLP#Umqh`DBEO%eZx zmvhUB(Dmu#bewcIV&sz_@>Fl9=K9MAA5O7SF*VicF~E5|$ESeKD_FK39YAY)UrM3G zpccw?PSwICQzW=QY3Sm8QJS~}xkl%!;9!g~?Ma!{Z}-#a`3b5q9~^jBxJ54xgH@xH zqU!cWYgtYx&rq-=EiP0kDoeS?55$Q8>&uhHV061h~u_quz!c+la0Oh=?CBm|tmXr%||j2jXp!Ee=5?KjoeTFAe6$WJim{d20^ zYBICaEOWmv>}%~`MJ0*CV{*{#(d?%>{x%;cCJ)(93d!87H^%n)^H1A@x2;QT;vZL#ZGo-{Sr_FC5?mE=WN^%`_@Hc6x7OPJbQ!?80VN< zQ`;(~+p8c-N!$8V=U@O1vwOQ(BcGiYA=vz|Ve+oxW`Ylkh{4r7!gklaux!&z8{EFJ zSdP3Hz1IKMxkASF?IUZNqeB(uk$|FMszi3fZ9cDv5Ejt|xUhc1XXw|V%+rjBzGq-S zkjvJ-90)@msY@F2#O3u^*<8{GjW#^hQ(C;HMQmk=K18=|Xcnd=MG#3#XvXID!>S*9 z%rhudBZ-8>UUO%vXdsp4bG^9&eCzR^k3Z8RpB;AjQAgCbV7Y8~0w}QwmmER+)?Y`h zZ(DeCrPKWh*| zto)A3uoRip?6fvT+;U`^T6HO!{c@0s>^hRP-R+HqGxQiIGm4olk-zb)Kg~HFe<485 zpldMpm;dSJ$2XbBJyLX?tMgvDapKh6lZZf=dDP(2m24@$2wEwMR$4Sy%CKZy)Mx*9 
z;*X#{g_2_umZCTe)ec9*gX_*!q|1aio^Q_FT520>pumdrYY8<_`;laVaNin$Q9O55 zKTD@lkP;1)nIFx2j0bl$SnpF#WMx15hggP^!3-2|QG^r-tlR!%Iwpi?FkHo$189G}s< zUVPhLLz^q2e>#CSrUD4tQ6R|{I!j~(kvdZ=!lX{~g0uEX_7W~O&B>3c^A1%~d0X{Y z#D(8rN3nncFWUxu>gt9=q}mLiZ|CK3NxbL~E6F{Kzdy6H6?$ZPW9r>$a@E7GokSb8 zMc%+&40<=GDMUq9d&whX+BemkC?wFa&|ecGVo5-{6F!b!=bOON0#I?alEHn~%|u|Z z)UO12h6>D6ul=Bk15MoiQ-L(_4`vs*E-OTmUN-FL35hzR{V;KO{3R`4ZgM?WdjgwH zM1n9s?{r;;dAwxR(98h>?xsK4;A3mnV9Q8gh7JhkTBO*XUWM5lEICu@rF-Ozshbj|tQdH!MGiAV7Hh#fUFcvU*-Tii z*}Mo9T=d8LqZYsxMjV9g_$*&dyBN$Ds$?)?@@xD;ty5BVMtFDOiIeLO)e)aX62_1i z%qid-TFvb;`N)w|kJ5K;I~-h#GMZaBi?*I$yw6-G0*8_zsOPyIxsZ$8v4(s#g!~sC zZu^v3&q2mZd$Lh10TG11(Q&yzI0i-D&GjyA%H|`)A%Cc6eCeCb?Ww>ELjcqFxvuid zoI!>=2j(*)LH~i6V)Rx14{fV4zSvuJdOUF-_RT5Nru?O|Hiq+O`{?;aX*=bc!2qMM zbItSHFTE2`^go!{=&yq%1z7a@DvU66_ zCbi6c*(CFci%9Iu{M*C@kDGk*MO!rcxH%?Rw;GS9_<^;062Li5bryl|aTM=_nY?5b z)=G$w1A9V>+U3m6(%RO5oM(Q#9btX6y8FLluBKd-T2CuEmKu zWNSf5ldF5911J$d4#eK6c%~o*l)ua5+cyh8!G3XUp@NgSCd}bJHgs}pdF41_i2pkl zNv{=+Pcm*n<0G{si!DW5#w6aR0JtTiI|>n>Lnessr5n&&4V5HV*KMM(uEsMD+wnIU z!7v{#Zl0|YNQ?hRB$0v2r7Q9dC|hMLIqrqHPo-ZYozlx50&wRd@Aiu53|{dz z{pN~cth*kPe=WJ8Z&$qpBHmQ$YbphxZWX6PLZ5nv?rqcU^zaLtv(4div2XX$k;eq* zLpave;@V-wthy@FQ+*nGqlFtVY}0r#k=-DAh+X2Ja(f-m4SLAsH2t(6iT7RU2UP5_ z!l;e2*3_X`KO9moNkiL6xBrPT!ulT=BaCbu%>P3YVIp8w_u(7f+{XfFU z{{b=5tn9|K@Q4Z#V0?W&3nh6zV1RIZTR!t% zIH)tBc|GI`G!f7iNFFvA1d3)KV)Q@=p}iUf&Oi|LA3=A61O+wrf1Hy)fB}tiX3Gc2|X17 zjBy0D4U8j`&;ei2QIQQ16=3}NmBnU-U;GBmJDBEd5|fCzxz1QIC!_y+iEZT4gSe);=SI`I&*kAQu$>?|A7ViIGkt7>zo zK|;_(@wI`v^-kVowhAbFQ39R-ZRAKmLD|Sb0o;dYBAtT1>>S+40^k=Q|9Hw~q8H)2 zg7P>iZA97l-@SeIBKfnP1HNUz6c9)JMq7R5kD9o^z_@+LFd;>R_6%E)Y2jc+j1+zQ zTVbC<+3kM7!2nbkBoEf1H@k#Kzp?Mw9ZW#4rfl{81ogLh5z4SP)q!_ULj__9x$|XE!}q~r1Pwxd6C1#>cOl?k_^r?=H&>1$oy;TB zV<>RXBLYpzzfvRZ62JHDLUaBM0QX=7zH+ugW`i@$jwfiGPKOc@PB2Ml zJ4?a@y#4k3h6>4COYjHQ0HWOO-zU$lzpoUeZPYM85aHpFP*GC;zTz^os!*YebT>Z% zxjK4nZkOPS3+K_u8g5ZF35G3d836T)eKtgzC zJ-sp(0%MQ(N+K8tfxkw9_NBY+?|S86W|y9|qWgaJzba(@wT;w%dz1D7AbBIzBapleg-DAl**h;U{x+^47V 
zGnohB{!GEwbS90b#LdkoH^9n4hVZxUMYln5XH?76UGKWGb+JOjt-zX=Dkxoyvmvv< zPA=hgDt(3Kn4P=6C`|%26ikH{gQ@m4nQ4fAxI7B-l;0+MGN){pw(VQpi0+!@iC9`B zFktmEQk9aOGz64FpDhA!1asJDEpXh5hn4+t=53AW(#)!(NnLB7r{Mcxbx#RKB&EG9 z`Q%8Q`&Y5Nn>0Yt!NyP3ur*BDYv@O2jVZro_?&}ochF=eDN%dm%G;hE`z6?ABp+oQ zTBqoKcKMW;Pk7%^$@IZEH;p71_xa;s(5B!^IBw!snjRZL_(LD)zmOLDjc)-fzbh;onXE za5DhS(#AY=qn_t?O^K;`^Ub}iIUrsXRJ3LtRjnCnNm#ViekAEYAZlfCi%jkzXK*lC zK9Y5x5Q2n|*y4!>Km4>Ub2Ze_a>B~Q0*PZ?AOnirNM2j-r7p8S0o`8bQ2=PnWY8F! zS7@=qYMP>6?hM?of}L8PCXU_0E$gH2b}Q%2e5E=Wa0Y&j17}ey`W$P74=m9?G_!Rb ze?(a_Tur_|o7Nd|GFG5h!zy)3i|%nnITy%f*-*2q_N{j=VB=Tw{S+ffQ?pv3PiuVr z70jycn43`Kj>Ssl=a%NZ9c8C>EP1alAc^T9JGjycDd!>w#o7{lr%1@1%g60|v(|pK zuGmW&#@m?|j=c$1H1(a}4#ZoDFgk`N-CuHn1Ejw6~YJu(Ie}7g=mg=j8=z-^T5JBjhPZYuDLi9BHC6 zCLvZJ}PE)*W+IGY^xuc7(4#pow%2R;^fZI*5RO+D6c zw%4eeW4cI7o`1rMaaF%*2MvSd3EE(TT~(_l?Qi<*%-!PQz-hB#H(LW#R6=S1T!0lh z^x%b5^CKFzG_Ib@!cB^<;itsQL}*hm4r3`K@!WRQb9XsqvZsGVsnQo9Uj)_FlR_-$ zeA+6`rIJjEN3;iu5LN~Jsw9}n99Ic~+v)d-=w=1GHxlH9iMtwvCX{sfxM-#Te;)H< zM6ZrO=q)84Bs56^A_cU=kVk(@Zs5Xjv3|@m$*PJ8lt#w*;A5S>UQMp%i{xm}&Lo@C zAo+O@wXLXRIF?*e)Pt|=e2)|NBL7f%kkf?unPu{I*P9bsK!)NkZ|*o+QRKNVX(0Es zA?XvU(-$HqDwnd{t$Bm0#_@&}P)o!MCDg)E(Sc`|>W!Erwr7{&j`k!zNrJU0Fnohz zX@M5f2Gb-YYFhECl#pAa8a_V-H^Yy>m-dM>clWcSlhQA0w^g-EO4A+1z=H|N9d6l+R@U`Kjvle&pvpn_yiKD6DyzWF_TJ-5)f%4{*UIt;$Hfg&l6 zVf4F9z@YBD^Hr3kp1H(F1g^QpCgn9n3lEe-#-fUQnP(?K^zIXJa^sr2ZFthgi%>FO zc0MDBP)Z=sBv`E-K0LPlc`2JPVdQ|Jbq_W&u1a21=^g7&$B;zSB*QO(lTO8C`4v2D z+Vz(?{V+k_CRQjabhujDf=e<6kX^^^xGC;1y>0XPF{_2;%?8wLsdu<>TG$Z7I(+YUC*kAr9lrRrdOi4!<0Lb$j~IfHtK*X=6x?G5*186%Obx`@`bNa3Q8#{Nr}3fn zt&Xjaq82PuDqJC?zI<&EgB1mo4^g8 z!zFyX08m1Rl0GjIkqL&6e<o1+>O&$QN8Byiw3qn}*pqz4fmOex1hM%iVUqtwSe z*IFZ)f16@~=vA>QAF_32GqIz2%eOKk5i*SlRPBgjsBZLNQRdEfC+Hw+U(zARah!l~ zajs=cc+E{{^dPVFn%nkS&w}Kh+ddQLR;#0q&gHIO(|H!B0We{>PeX6PCIR42XX-4i z9iN!1gUxRTuekjvy!P?%7k4QMxZA1nipWHP*} zby=*x`%oM2+b-Ay$>S%H5QGx}V=)K`gA&(;Fblq?$$^%{w5i|G_atBI_s++{*g!n} z07gx19HBLJ*8m4sy8FbNTx7{p98(rIswBNv(6~#PAg!^L*@5-Cofh4?szl3V-d&_` 
z2MrX7f|v)$_6WYw{wneFl}PZ53%boy^R8jB@F4NMRF74cwcECML=sbBBImV#ba;_E zXNXw2(H3eBWJks^z1s(_ZD~&A?#1TRyBg#wf9f9JNIy`|K5N{kNv*X&Rdrt~5FU^* z-%38X7Xoz?r3|oF=T+NxuKfGz)8<0Lk1lv^(`Fl!&6B*R#aj6Qym$E7>#B#UP`Nw& z(e%)XjAP&=LBieJeMz}Yuul^^(js}e)X1q7^W4QzP41rZR%qp~Bk|`%<-o;o)2cO+ z2o0?UE7;Z!1%`jfYIRean^8&5uW&Nec+!%A^B^+Z%Mnv!Q^=-o2vjo@yXGMn`CJi=FGaeHuL2+L?xu?L%3xQQ4OjwnLUAmBTIaR#+U< zD+k$w3$di?%za9_{G&U&e;6*!;O}+Yfoqir7<@Q;`_n1oK2l_CQFQkj)@zYj8(oo{ z8-@*rLLW(9HL!*^I}Y!0ThlB8s)sSDC-k4`=}QQdM;<4gZn(LzEhc>&;IJIo@z~w} z_*~VlPZoQGnSJ|5xZO+=x|!Y;`FpkU3ia@ke5|?H{oC^O5kvt%=svCbeF<2aiNix zC28?@&lBbRFnmI+I3(oX->5*awFx_ph^kh8f%FI8Q{Uv|^~*4`YZJHce6Ye!FZVTZ zU?f$8#UwB&TW<8&wUC(c5OpzCJJ^8fR9UR@*jxAwmu3mw8%~#q(`c8eV1HsKB#)B6 zs#)zz1u*f@?T1*@b?C+Luv(kO+~P@WJo}x|=CySNR^ECgyjb@LWvw&z@fTEEaW|_K zB44xc{vgWcK??bfuOu%kLEIh(K%#Zjo&QZO<)vlyV3Tcmo9=fp@$XM6Fd#cNy!%4- zAe8Ls6x~i@W{Y|4>VUwTd%xRj<~ybEwcA^FQV^QCi69%O5V+*3v|6J)jL{F+2_51v}hLIaJC-l!O>peOx>q^e7zuzrj~*E zd{*t5SoapJ1%mW0xq|zS>dS>eOX_0sf$yirN3<)^R{!Nn?lRMCIxLM%8Y~_^7d?h2qr~#^&+R&NCBZo~7Xk zF_>o{@9G1*lRGGdqU=kWoVRlp4AZ#BgfD9oj}cQ!oLENCEZb)DHIk4>gkd3uVaMf| z=jDB~I*Q57bHh&fm-yA9p@a*MZ0FCHn8~{l&NgZ;+wBop*pbc6;wy92QCRckEOch? z5>7u3csndtNpzF}byne4b8B&(jH=aT#LDIYGxYh%qS{j_<(K|+K{m5d2LmKzX15B} zqW~XW{%cH^0u?`b4fQ7Wv%Wy#=a^*f9m9#j`$^#i-#a-wDirK2I z8+f7sTWR(q=X9}(EC@DF3a0?g)0q{YOV3WE{urcA3#Qqp!=i=kR)edlx)M+m(|!Hbyn}+=ndn)hBQ{lbc{2q+pus~@e*r@8f}h*7QC=)58X0n(ytJHrw2_d zQrqIkGr6~)uKdiAGm2md!ZR$GZ64 zOP;c!QHk3d-bBjttU&U@f)}BiG3E_&HPVZWGxV#m3Fcsj{f7YEPpxcxQ1nSP@Anm9 zf3JMFHtR3oM^9Vo6m_R!SR19wfhx$;R8>cv&S0I_lfOuOlw~i~k}(SY8Bs0PoJu_l z_Emc*EM(gkMr<-Vgkw&HKdMtlEA2WRf} z%!(SM-K<8m*?GV;Ao=P0aj;ve7gy1L4y2@rl^B}G-0%u7 zqPOXdMx4dpTVIr1T~N1z)Z-Atf09UlFNlqkDW8aWRUcnOx#D&-z~!a01*&79hqV@! 
ztMhL^&IjU&K()t^934}}+J^a~xdh+D=wUo(uCd5blQQQ#dKDvu^ z4PZ#H?mNK|#OG)j$nD)Y!RvZ3RPZ+~R_Z;jlUI-2uMv8IU0J*GG0kb8aQh<3gd50% zs2Ey>HQIf)igj&vq>Mb(w7w8x+q1)n`N-DAS=z4Lq~BnRbML+-`u6!Fz@gv0DZ^xo zj7Qk;Lnhf=M!dN&H$U=JR3b(hzIIjV#cxM!G#mi=TLKBVkJ{fV4 zF7aQ5G$aXjG~!a3jX5j-zBog8_JScz6gRyAofXRyZ83pn=c677Hix(=u40^5%prN| z{NO_)p(g1{iz6vTo29qglaU^uEP(ly^%F#LI^mlz%HEeZ^{}z#amp%o3lo><%$5x} zp9U$uFIHw#Cp|M4FNMu*plVNxU+^+o`HR$s}Q7-d9lkt3dU8nVi#E9rzbn zlGHec%I0+9H68_tqpZB)m8G(JCvjdjB{>G!4GyV`c$?!yj+Vg5!#|-7=egCi*7DR{ zE%zwbpC}*{|F(BOJ4E~SA6L3P;SXg(mR}rB?r>skmTW@g%q3gPhT!+KHUgkSF5-Cr zXl|wH**i3%#q`VMs#-jDZ)`f|p-Ie6ruh!gDf;D);Z;~+TT#Z5$ltFJ8lWiDzqds8 z8kd;dWMyiz?<-5w<~9><`FZWKWp9uZT`E;AVfS}2#<|MTip^P?kEttC1>j^XF4YT4*qD>3QqE@J7NiorjB!~Ug zGOFUcw^q%oK4-JnB)E`*rHy9%bAs!5cc5m%a+um6v7|8Ch5xhgB#;;>>eVxhOe}Rv zXNE=O-q-i6&ndW+1UzxK_mq0UnrP#DxgwB*un!eq_@;ws#%mzllzt%u8XKcSKu)Q3bKW;h2DL zn5%iI?FbVaIxK za=W`$eqpJ$3DAOR=B&8FiDg2JVW)T7aq;1i*TRBU&c4aVpr?FG?MGS(x^?1oQCw|X z5IyF4i6Ki0ZV>;1Kap4m|4+&g`+rb|nAjLu|3?dAB4Fa+Wc&Zf`Ts2qF*C5RGW?(E z^CrqDH!o~5ZfOMjbEN-Z^SBEPEHI4DQbEpP5)ccp3p$EG3)|bUKo7CKkNo1Z8ndfz z4f<`ax?a1VvBQCx+A2YkQ$WUG$zYs;8Efep?|_j~Sf#PG*3~ix1YXbp+|1O}=il%*AA zRKp_=onQdieYj@_mXF-mzxG7|48UInU=GdBfSg}R00L&PVBGhO_3rNO3|j5&ff<5y zs#9_=zjupGEe@dW{@Pan@$%^TymxscZJe3{8;~tvKh%AA<);0lhiJg1?Gf z0-BHxfn)u@gkTmRZ*DT5ei^{p0e!zxMm858adm~2l|+60OCyLd&MYo0VHkn3{kF9L zNq)xhcKM*!y%0cvNVarzy-pP0TE`E0EWf36aDmFE`nQK>7x%r?rWa=?7vE}jKb_W2 z@kLcJ)N9Xrhyei7*j{^fFSWg^QzQ5I8HwqCvzwAb$&!eS?{G4b1>$cwn1Xowf7O3u z$;m3l;P*_9!RqfFfYb?vsgBJF0UhoA>s>|PxA`QoCGz?R(CJfu@hrK;K>T&K{C)ZSsDH2v>hA$)tf%_p5yPyJlm9)tIW+?T zynm5|)4#Ny{i5VwXVFT+uH>QPi@MSKfXpKX3SoeR#R6`A^Kf7I+l#lau2W=j{hfc8 zXMb6lT@0uM_TJAwK%QyU|Ykrp({l<^`O5+32otplF3#}aD zC#>A^8pC6%js+M!es%#dG&nf^#=$X#ow&vdsHp+akMdIy6a)VXqc6Pd8UxZ)IwJjz zw2c1crT$%$;LHr7vBs$ZxYFJNP(nvnqDN631E_a$0Q6oXP_hPm`Z5y&pqYv#U>E}3 zI|6$FRu9xx@U0_aNB0-KYCY2n$L=Hg0lx=OC;1`J1&BJr9|G4;_<%n!0Iiq!6|@FG z%is^k?IZdGcK}d7`XP`5i2A|@*)O~U7i^>W5k#Pq@Y)Mf657a`Uev>-B)YJ(6-Aj;XhQJ?-l5!H;axd}Fr}Ews 
z=X8F85;=ZN+P#Q#-*sa4f>Sy+Jij2p$~TsF7na{@5E2{wBy9v+eJgZ*y{Yhuz-z!y zX}~0*n|cTfysy0k#l(1iw6sH;kMt0bxBoWoJoyRFDM4bFKiRAJ{rX<`@u~QIx}yK> zwD+}Zb%h02%c|19mJ$pnfor0Fg2oA4T??^q&y zd+AlpN}?XAqg@hdmHAHjieM}r5_4?IXb$|Fd;%=ZO>qW)C?*Owtq&*dzu= z`nJJ^Z@AXmtFKq8h>_evbK38PBG<@z(7tx)K4a8vOP-r&9tPISvo|c7^%73YAd_(v zh8R6Y%;Dm8U@I(+9{E*V``I9Q^px^8Wvf|Dt(^ac19;(oh>gY6Pi=fL5ELziNGT>6Tg<%H^0yt(KodC5eG3dSclUbhG407Da%20X)dpq2N!2nLigz_f62WzY4-rG3# zLJVIAT9;4>40)mE(_adUGK&}Orse-sz4LANsf8)8e0hB^u@3bD}(r?Y5bn|yB^K`K}bxUy#FD;7p}m)so@vx+KlTg}8!wu*g?oF$Qw z>R37Q@x0Zn4p6O*Y+XSd%jD>W$yGyR`9-6$jFLcG-teQJi5eFA{lv_`0gcxs@K>wA zk}7&JF{;L@f1gbrKm|^b_PZB_k}rS82)jlABxTi{e%!-Di*5 zK46u_${K)|hjtW$7B%bZ8{p4KGzk%&VNqBY+lv1EwMNjYmQ*_jKWxVS%9rT*6iVKt zT)cd{J=)K*1m*wJq$Wa0CVfWbk|a@FN!+cXm_ zBk8j)5VGL;H-5FsV9#DDcs4DSKm-AI^n@74(hg0!@3X&@=NalQjapx zhdzt>T(yP%4(M0fX2))M8whT_mDF&`el!sc)>%+zJU+^UdpDG&Gc6Swe*kyUlKWZ; zy9G?oACO6;wcRnzlhOtfPu(>@u+ZA%*^pwjd&>E50L{fDZ49PMWv??W(`bEE67E=1 zJMUkQldgc9Ij#DspG?HgL7yc?U_V=SA-XPy>qNvibF`v$9hdbG3JN#0-3H}0cMLT! 
zdcRw4-vR-Oq+L4<@x%IPtWmZ1^M9w_t9^lfn)#I)phmzJU?EcA#*aP` zX`u;i6-7v%vx)_srF*g_E+1~Zp#U-K>m5nErV2UmHQAsw$2}UsP7aC0x!QafHhtU5 zkaq6_F1c>*yc3VQxM@@eqZh81<3kfKP_oT&g& zkF7w%zd?>X46z%FkAN39^iz)oI42^7Zom7wPI<1eTglFoajdUe8eEDCo=Y=*JS%i0 zkvIu~2NrJ4{CJx=f*TgB>vd+_5|bt1D%cdJUePTTBfNVtkx^~5u%uZ5Asdg(h_kYf zXQrY)`SmYp_0{9pMFB7L zO~bMf!ENRh)l#IR6I2Ty#p0rzCvS>%4O!!2-*w6vIgGeKoLm#o_|ASkfe3r$REnI{ z*Kn;68_T)-SV!wuJ)hv{;84XA=dR*XG8?eZnxWKO(7oQM&PYYxq*@)VB`9BFDI!yb z<8F3QS@z@U+NA^}r?lj$a>eETf?U}{a4LP>k-vevmd(N zEsg|jjo-Lc9&Lqy_>91eIeoWwhwdRVrplwU%0b-t$I5e2vG34xxSXCem1e^Wt8NcUpUW#r|tQn+y{jf#m1;1`)fkGE=vMLT!b?f$~ z`hb?CbYZu>+-y@n>hlFYjP2q_4HyL1<_g7re0gZJvVIP}{jm#|CKommPyRT@6R)X} zfa|kzQXEJ9Ji3My5JqWZ!psr!tY=S8xRs$RzPvP)PJ*J8Irq>BUkfAhF?0(~x4CM` z%_jbltVY8_A|2)0L@DV#MQcL8jH>|QQZ<=^;068kN-wb~9M>a-8$u5!DFIxGm-?0b zvcr?xx8q;^#T&}{uU;&hMY9#?;H~y&nh#^6E?;voo<4)nc5AhpqITV%d&kn*Iy`DM z)5zI(ym2n8R$S$^LX)K(h}Z)Z8cLeh``lPuRpzLhbmaRrr9%l2W{0`g1dy$%HZ|ufHbrn}`El!{dU0O{9R=1Czq7QfBON*T_o|GGb zGC^AD(>RX&Rpn3BT>5xPUuq@SHE9FdN`9>v?&9S_I=6|cV= zZ!ZHgLL$gC*EFx@u-1->kcg~SwhSqL(#gaQhp1t=86!x|onef^i#dL0K_Gcoy6zF_FJD2kVZT!Hr`Hc2MGIcdW1dLo0Mt4Q;- z@q0Jlr`Xw+29r%zZ&F=EhlkaTZx-=y!{-v2wa4DencfPc?Y=-gaFc&h(JREek-)vc z0PixJLN~lYN4&MIxGzlJUaRF`;mNk11c&i0F2Fo=?w*=3jhtr+jQj}TFp#ME`ji5+ z?HH$!Rc=*{lggh;cYBS_qocE+zh%Lv(qrU^4l@t}U588meh>S#yY-Qgt1F61%xWb! 
zU22}&X4mMrBPQ)e!(MQq2K2bz!`?hQEYHvYJ9fslwy%A^RE9fo7CZ}GvEJ5xSepg` z-B`|IIN^J)y4mlTG}vlWB*F2iaDe$IcssZ~Cz$U~!{}oauI;5oobIDEpILbJh`N&A zG`UnQ<8hS#V0~|W5kiWFua;g%|0pdK9+;RmIRqp6iHVPlW&X~hXCN8Zdrs=nnH>Ke z?unMIz8Ytf!7b^!;-^I45d?MQ;882F)j;8Qjx~EXYTjX}Hc{`9xI3-)E;$5S~?-8f__n4<=&G_mZIT%<8Vmo}z2nLn*hSk3~ zC(o0>SvEa!NMpZ=2OTrIa#tQE5_3mClOvD$m*$UT_n7q+-R}i-?yp46lG|X%BZuGe zpti;HSg{!Y)R$47D3kI|5cwF?8b!3Do0>0l6&>sQ0ndAVl-daX$ZpBTFmy>Fw$ZFT ze{_+>NPp4H$w9i!pTFPgOEUeT5Lv+J`4$JX^bPHB7*~{fjP{qugDWUz?L)nT9;r`< z@2)5l4g%b}U}^C*++JjN%LSM@s=k65`zBsf)}zWGoH;wlT9VfIVJV95uclw>b4zxm~DD&OsNt7m(ThH2nu&b_|z@iXt}y!v*T2=lsW>ysrR9CC&p??b!3 zeQvk_!_!o)2z7jS2QV zc{O+QMcZgnSo>@jDT0H63->M-FP*YXGn(-*+8|KvHSMfbWw3l|qINozC3GN*PNDr8 z`@XEz^X@fj>_Z3PNMw&Zusy}+zP3?XfIyCUfNp7bY8I)e1J!>MuJ>kZp1XQ@?mp|% zVf7b(CqG|?&%5z=L@Oy~?v8#MQY0_~7Y2mC-hN3FH`)N?1$gj>4%Lw67yEs&=~zby zJWY0OwiX?xDOl{+PWf`4?!1cIrRsHP{snrsiv!Nq*URgP^*nW^pM>qwa=JY_xpD`; zM`Ttj#vr?tY|`V;O7WvfKi&DlEg@9GYKZDNQE;%|5?@#%iA#&j?!!bhH9T?JnSYU# zXCpbGZP&>0w4AGJVkg41WAq|5lK!A1f6(RI5eWa0A-Fo)ZlxT(T9(|22{Y;O26wJW z8}?i7mA{%l$ANFlYV{R8L)0YFbd6|=*gAiD`+^vs(*tQ|oE7vE+l~9Jq}wh_Co7_Y z6EUBqvBEV0b%Lw6cg-wE(vm!X#W3&3jq*-FCCk@IEd-zxH}h_KWo@!$$3h@Yg_W;u ziUwuPHLmDUT1$Ru{w$HYcZz|o=-#le6m-Z5}q8_ReVw3g&w?o(i2K z;jLxe^|M9NFaxWPP)5;a?tYt4cVWVzXK2}YZi8}j&RVJFbD(uy|4$?rd1<(DfJSC> z4Ygs0ajvZ!x>MJ_fP?2Y;o!F%6CoNd-iD!omcqywS)bI^d$|vzuXHn=Fe0*y z){0+Pyb608!5S{~u$-x%l0J|qidbqd%17!1%UT|~m?QoQQDSe@wi7R?9t)?_28k$$i9O6Qc$54Ya|7r6KyM<44baQ7s}sONloX4yzAOtk_~w*oiVt=jVs&}}-K{@s}!}v97h1Zm=1qoYhxJ983&-6( z%a?q5C3Z{0lHg;On6k(m!gEKHijT5xwvGPw0-dFRZ;Xax^a}wulJRKo%|h&$&?xTF zXSig-osXJ?8_6x3FXNeYL_=3?xpLTzK0sd##|3n1qOu25N5Iz_Mm1kD?V2dH9{MR{ z{IgQ}ucY#n`P);?2l=J3lo`-J9>{)>)q%qlmrK8~t~fAKYCwOCZad>ir;59i{@8>& zDJ7KdNEIq5zCQuo_-%?fYL0Q&csd!p5c}*NqeY+P{}^(cUMKzLGT*)%zHJ<590N`{ z;SHGECV2m-wpE?3;`*W7og6AN-+8^B*q+3pfU>V$11Y~NU%8)r)ZYA*V4`QmXhN3( z>C+JY5Sh6*1wW=8E|M)rdX&S@`%J5?GLv#*8mu|So33DnFT#7yKg0R}K;tw!AqwfE zfh3LUU@87${u-cn%L30Bi2DFm*aBfUUNK2&4U2kWa2ZFF)UpO2G0UQPFd~M@Qz!!u 
zO#%)*dKD}8O(rZL5VpiHkB2N2fNB-Tj3I_V#zj7!1}xB=mTQl^vUoe?Y14*uQ4&EF zPjL>29eWqF?P`f^G$5}$Ul5#S)h8n#PFXf8Rpa1z|DkuOyk)A@jjL*p$*%%=s)#!h zcX^(2EzI@viBLlMb`2Fa>ftnUd9zj68&l|>4=#(drPy>Asb>CEuM)tb$aAOWaGVzU zKjN>!L1kpJ)=d}L0`~l*qt}#?`6iDFYb@_tmgc09&`FbDwOR;=AaihBxh{6kwaG|Zs za!OU;kE$0TuR$#YU(2fNIT(Bw6;*0ajf7KhzQdZB`P}Gj5qRLX@cH?5GAySxw^udr zWKvH)wKm$jhL4yUTBU_6v!U}!`evoNT{lxRl8 ztdPXg&-gw<@z}$FeO0C?(*{iGhGra=+bWed%9g@B++$WxiS)Hw!!K{+x3bXQbi-9y zNZ!5xkCVEPiPJPAXUufxb~#qupeR2mFI#49?bXdHWOWO8$dh!i(f7_brvQs4%MS_w#N* zWvorcrEsDMqCfq7KhKIJC!?Sm#m3)!4dRF4t#wa;>-0d=*_W8FmzoG`jk?gCABz7n z$}m^>2~A-_!{~GUtt#|lrIs76J%_8ntteNFTQ^Fa^T}ycSI8VqTj(Rl(@v+mzeUMt z6_xL6C;gC9hh`FA6lkc6w9tVeM4c5@^90;Q?2@=c)hvS)Llx--Stu1qYvfh z7YPFlIuXJvZcLMgW~X#;7)lZP_2WeiwZ>5C)L}&t8XL$t?Y@7OTXN2SEK-^`qG=$h z_%ikJW$AM?_8?`nYp_#awCY|Ip;GuZ^zq2;+SKZ~p#~;395)kR2YHrWU26t{-PqD> z33|nIHzCImJCd=0uV7W-LuQ8;Bactoy51ao?mVx^fF>3JnRf`}i74HJ_^GfBCG!9s zL4p3qj8%BR%1PR@?3wwQ*$^GtsZCDbw;^a9ndQmmL@pZ(Y!6 zgQc9hQSTG)vRFAWDnhwCY?GGH?_l04$2?xld7G<%)BNpc7mOBV#9e~ID7R9pIJ6$v zwR}feDj5k<6YhRI*fpvci9q)~$1KMjQASZ~7WvV7RP$6n=2B?A!yLsU&;4rB$V-S) z)CCW>FzAo;bOAjj93SP}te{gw_KAH(YG9}tE;7V1HI&ZNi^-y>BR_aIGk*XpQG{Bb za^E%_$zEHCiMBDklUav*h8`@z!i6$V25MQAv!6WX$@#F5ucbqS~i-czrRnZPG@j6OUiI0u}Zmh36;6Q6^XGC+|KWDo>OIj zdRo+6Yxl|4mb95%kX*d-5Y>iZnCyLKXY7zsr1PYYJ>E=hE;+sut%4hWiH*fT!%QNN zutJ1q8uxh1MR@>0>w7u7(Ur@hMcJJbe#QcU5(Fi^Zs*GIR*G#Wk9#QJV`GAg?Pdn=q?Y_TroV|l z4fWZ1`?*&R^@9|Bpb-YEV+s_3`S~fnjrdhXZM^Cw5(w3TUzx&{`A6%!e6xM|oW{WZ za}4-1=ol+sCI*l71h|I0CQnNYKd(mi%mCX)%E}XyPG)S~bM-tP9$`FVP83HErAp-${)o5_9CD28`0L~+mTm8RD+@f%vtCl)cGn^AQ85)jbYdLs>- zaG+z55GLAtsQIGdkMt@{w$G2f+stjxwamnNGN%oWS>Q912~sOFX79UWR`sy&DhThz zR^sAtRBf58ai*<6!p(kt_=ySr^3eA0&^?wT&@{J6n>k1nM&VG=3~&_U=Utb-RpSt% z2dUWFRa4rim_MLclpmmG+7fBn$p8R0eB zUQRQ6Zt$QUZ_*%kyXj<-pAH2EKZgBBFD z)^Pr%JZ7#lrJ?wwar4ftear9hac7D6slP&jc}cnB+puwqvT{c8d)OrXgc7W;<<=;% zEmy9b(=S%J;vT2|7+)jT#Ef^KWr1K`_Zsyx8B8b{wHIXW(s2r#;x@3v%dFJ?(T9+> zZ<(Y4hn4DWq$dWiO^AQ|npf(jJTE7YbFoHlA&?yG9I^C^GNrt#%Nvcbb)C&*H0!iJ 
z3m!;!lw?vEu2<^y|JQCAW-?&t{()U?lL2-4d&o@}n#s(npcKJ0#V z2=9P1F+ZSlHKXg$G5NtA=1W%Sz3s7?#v-4-*6ISbXNUC( zLGvP-y~C=%l1x4oyf2=vK-d~305-gu`qf8!pOMy>G(4)zOkegGzZKS@ToHcd2IxDJ zGj~ROcq|G);XZc3{VT9vM3Jw7D)J)T8~PK^2AP6+&c)o%l4X`+Bbc(GfIo^L|6{eM?C|ldeQ7h_m0u zL{iL#WVze~z@l3hAoqpCv}*++DEcw<^Jm-)rj0I@Ql&^Cekp62)#mHhT@mL`RulQ> zl_#0m33ioW6(Upz&yQwqVRwqf;%@dk+uHkGB<~r~*ssrRu(@yfa0WS@GI88kFJdli zhojPPF`fHl+pRjwGnV)iJAKLmnGl}~)hz&kU_ z`lzPdOs<(4-8@x_UI-}9Re9*8O((ua>;0lL1Z4E{*FmxujNk6a3~|B2jM_$x2Co56 z{F-&BX`6ymu&6;=cm3HQVUa^y?|%3K&UDkOUmwP5>tzZ#`KEl)=|cfM6E@KtJ~zof z?kIY2GD1o2 zRxK^RDtxJWg&5s$xJ|gA!9w8rjVyfSmU!Py^ z%fj@sc_K?p4po2oS z>g-C6)8E4!lOnjx1zOnHpF1}y8fctDP^9b79l++$)#!{k&Oc{ALy%^zY zFPlexiow!N5WwMn3l2fe|)ri;~=4r1uW7xv@%$VW;M~FSALO zWu#&_Up6NAriKxHv-7jwh6`#?y}$Zv?XCJue6r~sc*^n8!^uq3VB2$UL^_w{Q}a+M_xc|ahV>Z_Tz{lghANI68SM~ga$2rBiA&K~>} z!wS)NEwVv*A1q-gjbT99iu=wxqyBXm{1fP&pR{Q>%s4Syq6@Mze_(Ca&pDM*vKMA$ zD<{WP4a|T)3w+%5*5=!i$7e_|pG&O(o~|Dc9WR?cBF2lhVsl_)%+!fV65Z^pOIBbI z3Xdru`)~WgC$#USY@?VB9roa#U>pHq%trfLCIjqwSn^yi4mu5o?L>JN_`ShDuW!y9 ztI$qs-)*?3c*7*y_)T!dX@At4&>1ZynG0K7>RBLJ|0+FSrs|sKb&9?YFKYcPT@aET z-+e(W2JrNi#fML*QN|{a`H}iWfo4Gv9uPp3a!5mqyXZeq9yeKF8o(QRaO0c|E4^_g zQsF^Rs1Bcnj1YLSJ|y3u`f)PR>$^AQAd&f%4Tsrx_)KiH-U@UOgJeE6xoENYS#gR~ zFY^Ph{JHkqP#eX^mCmJ+uP%|q-$U(7TZeHKO2o|>M%a`hc7phUue5^w-1te=es@YLCR zy=F90gp~2e*hU+>2JadkJNH1>k;@dHvVqx+>QrU!`FQovw;OBBnAT_xPpc>g50shT zf||9|`^nXUcOKPD-j#aE&hB7QEIOC=XfebMZ|5nQ8JngtHs2h(doUt=SU2ZEKcic( zWhD(gf0C;}&%@cskn~4WvtM~+f58YXxj!UE6v%qq5 z9lhgM@w+2jKN;WoInFYimrSRjp3msp7HjomqV4GMe7@wkWghyK55`X&hEC<(%WSA# z>Zh=;WnkEZT4?jrItnekBHKYlP68n&4MXqMu6v(80eAPk#F&>QqX8^?dAvk5e%W^j z_~%pefy3M7gJ1V+P4WV3!RZ?F(IU4f(b+WLp)S-{hvFiuC_X>JZE~&~eGKyOSrtnt zrB+@{jPQbQy;b=7EkCqYA7Nhl$txZw*)oLKLzsHx2%q8?1+2`&+yqsH%>XMQykb+O zcTe`Q3JlLhB$-n)sY40VhwOLXvKGI%A2EIMakpTa^`XRKKc!vTE72Wm4wC~S4aS~d zwaJ;u+zZ01!qgL>zKvofuWWp&h%u}eL$YM%Xb-&d3OaBJPaHZOe|7fy#)maYiKDIA zh4tcIRq{YB8tn~CE?RhyViGbzBuYT|K-{vI=tlaRuaok!?``nE>@qobw{4Szns!wy 
z_%$4&80_r^eJ$3TSO3AV@~yt~;F5M5&wMtZL*e|j_m~R@O4}zfA<~}mI6U-?i50ue zje;3wKt=Pc%&G}8-Gs&#lIpIr+qXm2knLX+=6+sJabCUn;fQP*S7H-HL7Bh8cx1Yt z2kSl@@3*2yu8!OUr+oiPASQ9i#8*DUa->)~m~-{`nUHRTfDVgisEBgNpUkg$?vo|O zlJMmr4oys_Byy8jHRlLrP!v&R5}wf5=`&a6DFP)^(NUkFTCL%0!m5)5U^$gFX5~A6 z1i4n4$>;sK#b!Lq z`n!w_w0+R~%^e?AQ0BJFzKk^Sa4tf%8Y&>i3HDnZEM}Y|f1b8THHj)qCVP#~#dxz4ig+|fV4Aq)CRphHFR| z7qf>YVqDM6-mGz;BSGaQakgB#d*=V+k1%qWo9>~qVF_pCbf8K7q!(K_>wyXTfjo8{ zskp{y#}NyY{P)F}s??NiF!YQU>P2*o=Lch1Y5tgVzKnFwUIQOsnr4o?h88a>=b>jQ zKYDn)q%AQF_y$-M&ECHY7m}>ks|_FK-0gT#qfLwGX_CG1QSrDlW~StBw8fs3Pa)iH zLR!4<4enZ%?aE>YS)3hJ$~1RO(FWj8KY$oP{3KSS7S8K_3QqHRNs(u-F7@si?luv8 zrD>E;tIIGunHOJ&0M3B&Z+OFAw{Oaq-E|^<>(4k>yC-~;zZhGWtIoPT*GUGnzj>FV ze~4Hfh(G-*f{e2Cppoia1c5CK8`03?dQQbVUOn~Aj_;XGkbR2#SvG=m3O;;>(9>Bd z{nDgUm!43LU1aKq*P*I~&u(N+8U);T%{^atsgw=AC3TdkUG6_tvKaP1B^yUJ*Q`p2HaKayrZ;giAlw}B`2E+2Uqcm+`=dKtT|paPFC|q zES59m9>b7qzsdv{eMJY}rB_!f-n}%4U%jM|n|oTt8tuQ3eMD@e2-A`8Ms=p6F0i#e z@+=j43m+2E^J!D35tk0?bli29*-rY-AqASfshOmQg9bJ*ti8K2G~u#8O8M^ zbN90Fz1sEEF=j1lzE7L)PMw@ht0hHuJQIR_f*89J^1X0w(bt)gZ%^jHZ{2V{P&4rX zA_PC)HVC!&U@dcZjrWB;?h@hgUij&gsRclefMp zwEz$s#wYSbySjE#WL&K&A$*_vaD1ikbWI8~ct%yQ!Ctx3en%F+$T=fZK5W|IAa=~k zMCPmNxNpOo9?^58bE$Us_ruWoy<=2LzGk{I(k>!ReAwXZWJ|@#wy=ypo%pDBj%T~;RA9f$4wAQ) zHlM$&AwjJ@Q*~lbM+I~LT~(_VQM3UJK)Gb0)(<>TiDq&1sa%r`rt(#Qq^A#`g8AW= zW3_xC|Kau0((QR5M#saUr!TbJxOKdL0YD$MS3)Z_J%+?e)OLzTV%kOeaS0Dm?8moK z`HLJ1J?ijWCT~F`}z%N@8{iqwUg)A051Z&RI?%1{f^QRayUsb=qK!j({vmKNr+Hh* zev#ZyB^4++^$o2a9oU(W6;K(f$Iv08S;3m|NM$jypn7m$4J26y0rxnjN4b4{?0tjp}0A27jG|E*OhpW}GxKiePPVb=9lJtUOgy zXHeO&G<&1ZdGOq;Br9)U1X;X+Ckot-T#j>FlNr4wx1=epDp>0ac$lQ@8ztN5Jgth0 zO4ve1)RdmwT^qs05R5uZ^8&M9)hf2#yivsK{y?TCeVbsSWQ4hBON*;k%(m_0A(nTl za_(d|3bS*VVz%VUBgiXJo{D{%V}SfsdW!Phz{k%d97LAi5W}yG%J`En6!$3-Px`LYqqOfm+3%~N8Qg)Q(~mMVLn+>tYCf)OHwK|k6y zO}v4Ta9%b1qDeu9_(Qe2e)YR_f&rPlywyH$a=PjGcdj|RpqRqIv2hmUE)$$ux3piB zskI9>KQYnizsCy|^Eo%}U0IUW#GP%fV|3I|+<0?%4MloLtBjJ;Y)KdhdS@I~!aplz zxxEh_)c-l}+kwJXkrMLj&6`bNDdk>Um6r32VozISAT)T_F``)U)VjhI+QYnCSn#B4 
zPU@~M$015yd0TOv-U2s;uk?$d_(}U%rulAryazlrhRsRbX-qqlUW=k8|IQ5OuCW=q zibjmaMezv^d!ONsi2aXu20;uoHQ$d&eZI9{pgBSDKGzC(UbTMs7SD|MYFOhgHqJED zL17=HVpZFQps!uVys-=ex0E9+(t#w;bX-bgQNgzGs(rEeEd6`dF zVepH4W;ClA6$0NbM&Zg^f53useHcI?6gtjioPGf`1`Mn)T%T4rB)i4;z$?)zd|taC zmiKIH`E1*L1xh{|bZOm`*;wV76dnwh%+#ge$)QI;mjStl~`7D`-*(STy z$f36ggi}jdP8I;`HC>xwC4HzOX*l5a{A?raTKEY zBTWr6D9%{U$e7B&RPMFM`Cem5-?<@^3W_&cAJk#e-E=(nUaY*Ad*)R5Xm7c48ao`j zy`r`_W7y4WY}u26v1MPz>*)nQPI@=);aJvKIQbDnCkiIsP`?!D)xty5 zk#~IfStZA1QVC39KHaw62Yl}f98U8MA2T(gUl}Bn*nQZ4ZKHO0(JOp7__z?`fozz6Ttn_@Aw5I}PluR!(~uK((_ z`l|A#AQgu73#LH45pm}|7vs5$v*L&J-bR6^pC1N-8#PRZpuCS8m!kH>xMT`995^{9 zOai`D=DqdjWgz(PDT zkzTE=KyE-lrhy6}sP4n3%3adNX?N-%eo|P&erqVPVxil=P2w}*?uwJ0j>6FHht^qG zdQ3TZKICFne|cc-sZz1^Rkr3T8)~++ zoX1c#Jdlpm=B5?X`0HL;6)aBOLinZgk^S*FlEykiJx#u8v6jFpv6vop z;`E86@TLx>tLJ4qqno6Cl(wr+akeotMDx@)V{UtBxQgFlHk$0BLti*LYN|gFxlQYB zpW=Kn_s+;9v3HM3i=S3+qKwJL*@%d`Wd5RacRI`pvtAVm*^DC53e%C8(lf@OJr8=O zS5ukxA?hPTUjVZ>CswaKY4{f^nh!sV7R}UcR#>y+xX$0eG6$;fe16(W))~OMdsPmX zxv_uZj)YDabz%rFBipXD?L?E9lapogm6!9a8W)bLwu00Hg=F*CI(={dQyaF3Mq{-s zv*7y|ZJw9*GD4~Q{zR_N=2_c5h5InvEjS40u(J`-+X z1ow`xc@UmXvSnK>m^#t>G~4y`X$$%VDo}O91s|Y*_NURla=hEUAo|>3_%Z7_w`3g( z?#s1mS~ODoO=S%$&ix(^Ah}&Frs#cM+sN)2m+*1akf&I(-cJc9Ht>!`hgo97_KBmW zfXrC&7HkER>QyEu1*r6KwWmy0(cGP5u?>`D2K zP4UPWr-u|Dd8qsD#V=xo+Ui*jO_ z=z<8~#mZU2Oby>3_`>tuhy;ru(gr9~eeG2ALe;|Iz~Ier#y!>%;4!$<6EDAA54x5l zAKXOUTQ9;as1$S~=eTZmFzVzt%n5znF;_J$t^A@zV5sP&$`u?Mi1bL|Bx$SuIY~5p z;mpn#%_Z5wfWjNT6`;SK3?AdPJwF#GB-~)?M=Y`m-0O`keGqx@V@#9xUV2R|$j!C0 z6;eEZ5WbSr0u^cao)*`THW(aC%6&4_VR0u(}i7KPmeH7 z*bj!hrYhxs*OonjJ6h@OpC#GT`Z?i_^gbdyC99r3trBaJ!p-8ZuUM$9ufa3ZQy-rUkq$3cecRjq8ael(-YTkew2R*`w+=}FsGM}{RcKYHf<7|#( z`2|2&8K7J*?+Dv!Ar2=^SsJ`ZZ}F@2BYon0;Q~3b8FLUb!h?&RlvaXg5qgs0b_i5k z8*L9tRL_MR^_h<4IT|%ILw4uIxMcWx3HvqpB;jn0*U_(8HomrP2v&0KYQ8rC^3E}I zR7>&{nq;~=2(Ni>0gVn28-dn#ny;(Q;@NdamAq>d@%mVX%Zm7NhtiABHtT_+<{1>_ z7=OSg1*oF8vvoAKpyXU4Qo82K6%Su7jx$>+&a0WL&X&WwF~2fw&Rd#O77)BKX>ZN6 
zH*eCAoBPg7d=mUA=Xr5}afvaRdl6HGS$}KV=rP{~Hp}-^jNN;DO8HR~j2I}TI z(k|+!Og$XW($8kvSIEjEHiEWv{-X6N7F-jNl_2qb2I0Hyx|jX}^3z|QD4%Ro#mRLo z4UpPBN%GMNlViwwcrLCtFcLj05Ip#$bLm1!RrDVIzc%h^yAlA=qG)WpV%xTD+qO}$ zZQCcdt%_~iw%zxo?^A!q9%Jt1PmTJ|LX=NY0kFgB+Pk0X{w`(xCX{Z0?EQDz4Pz)@ z!x)duTDEBa8UvbQV?{gG_l+84?+==8$SDw5s)zSz`&uh6(QL(T(~(Y!*NF<42o$k4 zRyNH0s8B4jsa+vZ+9a8ptlpB0^9#(I53uw@?H!4bI9I|=x2(?`C=`tDctFF28mr=Q zIL~kx)XVmmVuaRhgSVDdt0;z6{W;4!>qOy1P7#3`n4D`H>h?H?r5Y>+!=S~Z1r-RMMQ7Q2*6=Yv!XFzq z!5C>CP1ssIey*?10&r>4Rf1b3)l7L&lo&oPLLK%^kSz?=4RW#}fEb^-g=05;9ttMa z5$GX@>bXJv)pfad45>q80nVfpV`eSggIXKrity(1`REP5i;neGMdE5g&&z})@zNck_ELm(7e>dka zu7=DwdWvnniRM(+0zO|YQPW=>lgR8Q)0+s1dY0uv+8Vre1WA#hWSXFI$<_SAzwOC~ zA6qKHsuIAMS@U;go12kyH$0>~dS=S2Sh>>Q1DkRNKb-8`tVfT~Ce}GHCN!1oUhmNrK0oN{U z$agKZFONf#HU*8BtKvUfdX zMOC>F8z!wU`;O=WLP#_n@sW!jV->x&Mc@&tnl(DAifrLTV_Om+2?%VD&2lJU_(|f*@7XdsES?t&`Yy)2EGo3y#jp ziZZsY^9@AW2jy-zp34@VD3J;A>BZy&VAhfkVG*Fy8I_jBs?zp-g7Z;?Llx^xj@08m z?m3hrbSZ3+mQyxlw~xIta=GiH*%+*^*uM--Z)p~}Y%ZLU~1pifWZiwH~G{o}U(ELW5Q{%UMNL3Ae9 z$HWo@DVX<`tUELqT07x;?^?wE@dL=buvBOPC0YiPfS&PQ1#8#!Iiy2OUAV?%&*7z5 z$R|!g?VSL5s8t{3gYkfqlY2NVV=o94yRQBvZD`z&L!5azEko=5-rn+Gr{4>S8NP?J z&Q}%tPrqmprR#heH^{VG-1rzG(t&YFQk`g!Wn{pftf+^YM$*;#?n-M zX6^tCNGGBkKKpv4v1by0F>F!7%i`^vYFU+WqLnkeM}EztHX9d5 z+|X<=I*@u*i^osfy^+XRGRb@%t47luUl6qYx|iAwVIglQx?QMkeHmWvf>$?0R=09W z9Z{X;;m>WYxt735c{3|xyLX6H@lU9!Sas=cGXT7&J0^~SoHYTgj*rSL^vimb&V%rd zw+5AOs493H z>$J09IDWpE^sV`Uz~Tqma?SQGHQD>KjnMUu)p(Q};zpDL#l-!4{qovXeVE`o<$2QB z&yx+-e;@a+LvHaMUhBTmH;NX^1YvEeN+wL_l)1oTL9TA$8#${7Za>l|+)dwByJ;0- zpt%rVFN?rea%*ZIR^4*!M3?A|;#j?RNXbndT-7x}N_GDe^7$3>$M`e9b7zEKwU-|9 z_@M&C=227KP2?e4eA=e1SGT^$llQSn2Ne^N@l1Er07i61#bDow8T%CJ$P^teyk{Gn zaCV5)+UDbf0R>g(mx(of#CNo9BJ|lJe9vO z`j)(+_cg|3cR*sYwq&t5&BNBan6N_}`4Jr}>r{qr(geQeBD^ueCH;sHeJT~x#Se;l zd8w6vm;7pKNWjkyzBsEe7=5~Eapc@!cfLd-!(VFSFr(Y zp?6}QFj-?e227*hF;Pe>;M*?4xfji4A+b=O!^o-bEXah~RxcAjEaJy4Tg7tHwCQg} zbjhQ6c?#v43wHPO!k9qiKxGmr`KW>M<6KcLyXEWW5rOA*aQ*w0dy 
ztT_zD%b%)#tdO3JXwb9G27DSsW*0H4#z+w}q47zMBCf8Xie{${)=G2Z81o&jz2Mn; zV@fmu9e!(^jFyId(%2l;6G<`T-DTkQ$CSJ6{yl9t-ui`*r}J77xtvzs z!lt9};K|6^y{OE9{NsX_P3SjnU7*hg&$GbGdnqgyLX6!*a4t-=F5uX13*TWcO`>p=#NJDh_W=H(5xh>Hj-A{_@ zF=o33s6i0OJkN8h!Pi;}LGYL5Zc|J`BdT6tSveD z7b_T_C0cd{(Tsq;w}M-lmpwnr_yYxl)xVq> zQ~aIEL-;U3EI;KZ3(}1*+tyNE$xMmq<@yD|59B_5i}$j#`(yXaqO-Kh+v}^}=~2W- z@8U>dt9PP2d^jmP0X~Mcn0po^ZHR)QPHLzqn4T};U7CyL(+5;Z)7!OP`+%U-f{x?0 zCx(Ez<(oN3KTa>IchEJGiE+i3L6;wEVz66<0cEdg>t80(uXe%}Ha%llwKNMMj2-JbKQOBw<$+h-yYyuOorAlNPIu3M09e@Qw%Clv?$7%#`|cq+0;2< z?bwG{LgklFc>x5g+g&U{5wQmw*OdPDZW1St6z7OA71li@dbmY8cF*yaHT0FfpU6WIDpo z`=J0+kv_xloBL|D3LkusC z_b`1qy-Al`dyWuc3I*d%fPTto_AGI|F6*D4*6zPLZd-0KhHH?zX)U(a3CkiotrHMv zh*)2{`zocrcm>p5tA))$WpErOJQ$p4gV}soi2M-4u1p?Vw1D%9;P@K>X)5LXV^786 zPA5FbtwyN2J+eg=BBEO5ie3_;nBGtCvSX-uTd>_e(6BTP26O7HieMdV#jdm`nv^g5 znUj$T7CG~6q^jo7;Kfoa`HPP&MObZ(D#}yBiHZYiCxXH}r&c-&JV84W$|5_bUfAP1QFnO&c8ftpP zzU_vXu-7~#Hb|fcei9`Lu5+J_CbXxcwD7B~ZImYP&N)v`b`?oQPjN^7;+{*IbGg&6 zPJ^oAN4x3Z&Gg06(#YfsTT)`ZW0ZW|T0ku6#{+uYhoj1XZ9nVH#^4Y?;rKK6WiS5? 
zE60+Z1rzF5AJf4is&q9VdX8w+#5@0EfC#exLdSMr;OY_7RQ2{nZm%oo`-cVPAaNL@ z_~XT7*W~sX7UGsWAaf_!pk^R@(K7Dv#sVa`J{$iEHlW#n-I4a7D3het%Zn~7(9rh> zKEwQuIGzfjF^ySLK1wDVrI@j+-6zjTX`V%D0w&pWrp!)S_v~LbsW(1ymG+R@rBgH zG0$`byL??K?LbeWA&4MmTmeh;y16iGI0zykGa4m8^7=^g~xyDbO=YD=m`_ zjbaA!p7*J?hiLbK$|z^Ax$lv{q1x`x+J}Y;SC`W*ttUDP4gj9>ZNlFghYbz6?Jq4$?$~8$Jn$K%9Q88@UA!#2)r{JV`zDTNN^LKqOX%)Npj8_;CA&4 z%SNR`)4Pibt(Hq^e+q`6!VxhKH(9_3SUBfs@Lw~Rjq`kPUoD$B1KsHxb<)Ca3!?ug z9SDXy-Q&~+z6t0AmcL*XDJAni%p*If<#LX=sL!bf`RC|^%~_N_sGO(?U0GgH$K9#P#_K6e0B-Q9SIJY-Q$u&tz`0qJtAs3X?N@sTrUk>pusF|;bWpq~=W`ot2E6p{Mh14Cd#0b@t)q^AcuuT;!)TIH zZb!4rNFmtk{!>kN&k9#VZxiUqyVdj!UV7lp*FSYGR_w(N=3fvA;sHexWmXd?j%=b^ zF#~4;(k8^*g6z$7-YB>-4UMUxyeGyp_e^Y@OUQ93l<=5cga~Pk_gaUl9I0-oKS=?u zRd!3^Sevy7YxS8REs2_7l0`uNo?)__0z?o#F-Tjnuo-^>W|-QODR5>A9{u@Ar}e+w~T+g+g##r zfXmGcwy`ZDT9*nJ)*duhkIBSjf`cKmeI%={wgh}*5o7y|GbwevA#P-~(K;*HAzZiI z_RX&T;v^szY1zD|W;fBYzv`x8dkXCm^>erPov>x^h1-Z4CKG|e@OH-GPHPdW%RR`j z^xdX5B0rZbzt!mGS(uT)$^S>9c@5&9B1GdP>>wK1ZGQW~+yo(GF*1Uv>uicb=3bgzXWx4~tr}km zXxvl059MZx&Y?P`0eKz>WEb;m&h~0xW_nIJ?t(#lw3jb2ms1Cbhr()T;d*KCfzZKe z<_+QY_+5QPiIVs=l#5-(uUt~+^GHV&;=#Qzi3_pcJW9m|RyHv}3V#cH2?~iBOLxoL zWlf!m^j<4$EEyXGLGOev2p=zDYOD878Be)J@~fB~koLT5YSdPjW|%tk%H|TlPj^dd zz0;K5qKW9zff85!*Co~IeMmL4=+*LP*$;oy`qrdO&Km-E&)Eqw>{5Nm%VU_AJO8hy z>G}uVw$r4XI{vI8eQj|Nbli$UpGClqln9F$?=d+f(!7Z9xD~*BT{H zL(&w4c$1YRldHLLnx0^Za%XG!&2&L6-96g|Finu$MU`OP$q+W@1gw){W7&UV_b=G6 zXUgE`(3}EIi}IQIBIbAHoYSq5o)?n~VTvF3jQ#BSs3-V)9%k$M`cFx)VT4xig&O>g z{bXg^@3yoD!jpv_)xRd(9vf4Gu1(-*aN*~No2T=V&AzEbq#Bge&dI1Q^Y%RHziDe0 z%xIh&e?}_q@*Z-kur6zD>cMd+c03X)4Z6Vg{?0BlNA1-LlF7$s(}Zmg8WTP{3Atgy2F=lfv zY2pj5zk?C!vk_jGwtqh8xkiD?peZ&46F58l4=LT9%dYW(v6lN+o-$p=Eu^5uAWAG^Ite}yVM@{Lg~GlJF>LgDwLU zu_407Sq|+vwY5Xm)1iGCL}fQ{U}DnZYrKknI*@~TWlP1zB(FHSo`!y}C*Sg5^$oDZj(oU)pr7t6T4*GQ&^hwi&} z+|x;#oRXTU56wKrYmrI3*_F(*_N{l73v@6bMi~D`7}Ytu_Lha7-HA4RDv5hD+#t*; zQ{t!MilGMjUr3=HTOOWk0c010GRe;eBDR~WmwDyzEIm^%5bRilCTJ4|tJf3l@92^5 z=dujx2drFS?4fFmA^q3PUz(5(f|d-o4P9A&--o2OWlfZNmA+*>Ayl?LBs-u1p>>Q1 
z3sqn8kJW`RP0N9Nv#F3KR{MzS{(8l6gK2IZu(EF>kZ3LrkjqQVJRkogdg`y+F7dor zd9|=eR#3yNQ<FCR_NI(0EQLT?rA`?}gBU=u=r-1#}z#ZubBPJt`SfMN4?ziIRo1H}`|$KHVD($4F95^EkcD5PssTF6Qi zB35sK6&DC!tUzH4Ckg#Q^V`ov8gb@Nu%=TS(lI^X==1*rh0tl6o*>S zt0Q6alZtgkmVL1I*=EgxdGk=#c_xo%*>D>so__G*^g`O5lOE8Hwd~lgUqVpu-5^15 zSN2mxO`aIsdmjvadUx?F#_YCMNy_5~2^D1(rxLpSJ_n&jCoI)Zyh?TFdLdiEw1-3; zUM?s`Gotl>;M|7!D6xh_6?)CJW=wK{}uE{ZJ}Ic3vz$`9@&k%Pwr zHK6wstSDF->K~w#ff+!zL{LS`HKL-qJ}W)OZdL>zSUx??VKhhCV?g`E5+JIx6i$Fz z+E?^ahlxAIB+W1I%#@X+5iD(CfapGwU3#>(Dw5AhnWyzEG$N4dxbK{BYYAV}%(h`G zPqxJ>T}Zxt0R1e{gtTKV#GVJQ+xW*a!MvbEp%M>G!sqnO-gX;|1);RQYR^yXvS)rj zS4G#ra6a$)R2iVY_x0_{w^NccknZ)g2}LE1)ufumM|l&_+W1fwGE{QkJVi?CsUtKL zAhz13GNOzCH{K+u%*w8zhB`ADVM)H$@GL&FCH<^2EkGF#yQD`a_+q0f07DJB6`LSy zo#OA=7Z*j=M8=Y7Z=}&KUOBlcHd3IKGTFaOOX|ztjgX_4{%I+N@(#5Y7Ae}Q;6=g= zaELMhYFeO5;09~LhD0KiUEwPuLfaE4@! zf_{>+VDatgugJ%5|Dz&jMd*Ip-tculc@wm{j?Qp-o%pR)Eb;a~SBL1Y3);%T)y}!E znaRwc(lJi1E2rZ5Q&P`>;vmj!DWPpMe}G|G`u^V~3c?HQwr?lh@hTJ#JSv$GD#a%R zhl=_Dz>_7`MZwP55F~e5s}{;$pZ7LEu%*MYeOA4lV#OF6Wui(VRvQ^$`Hj%}u?}6` zLv_Nz3p2~)*r8bfxZxni{yIg6dpqkKo0P6xigyEi81dxD5)4Ig=NczW>-UW+PZ26u z@&6`x2y&Wzk#>GZfMuMSO$If>e~8w2mwJ2c9%r0S@}4^|(l@l4h%KQqlp4zCoHfOw zg@Eov)#`Qi|AB#skGVzNoMQSIfjEhOFXTab%x?(R_a5!UTI1-3K)99+dC-pK@guM7 zXo^~WFZeuCEq&Yjer0-|P49%2w1o6=pAMI(>;bp`OEaLmvkt34>JiS!nxT9`9@C}A zl;^+t`*tN}^}OYLsrD=y=`2wRl}l*0FA}Iu{%ki>PceDpFCeEMI003_Y9siD`Bu!5 zk>UuIXJ+@>Y?%)(W{9S}>>S7uDff1qv!0VV-YX64wca$!Pc<8pSvx7hJT4`hEl5F; zty+6BLzFxl4PZH3-9X^8kOI9=Li2^zg4p#HQ*zyd_@=rTw^qJhj_XY1n-OX4wn@TQ zmuPG>-|gDF7sK^&cwrgFxKP>|OdyHQ_>u-U&EE)nceu`<_FPsr`ZHRJFH^j7Xy~tf zbls8YQK8Ih^rfR?OF9B!wNlIeuRi_WdSpn|^nIJZX%hWFd43nwoxKH%-L*?fe*3>q zUfxXnU&g_ii>PsqtylERW=nvBsL&SaQ679SQ69sMzmNOTk5Z$U>9QZBU50{(VyoDJ zZ6zh0l#NhQljHB35yKnhz7FE77bQ2(sXwj}ZX%G8wz~Bef~G51q{o|{Pe{VtvT5^% zgtrXV>g?=o1C@=kMh~NgTfnsED71RrDNyXF!^QcR)U_ZeUg0FqutwKB@E;g3sjwZ< zp8dTDMRUwq;0rT|j*d{)+)D$Pdyu449k*3Zc$^G72nWJN8}GN~H30V^oX}qt#@bL=*pqu-9qdu_pkEAC{dorGxE-OSTwj6F8K= 
zF;@?{ftB0(5p*$J0iD|sNi)Q-qoD@6$ebIA@uZ(6$z?I-<@GMms|G#10J^!6_P=sopS+0yz^9Z9I$BW6^ zWmBXpBI@i}M72cS{JGIpw|l+_$3rtA%?=iEjYv-6#GX~@-v*Zy;bOH?ffot2fVk>T zbaog)o{oDEA3qlpUtG~a?rj={X!pg9B7KMKUr3@aM{-r-$ZsW)Fs)uF);Y>6?DE1? zr86iL3mz@QS`V>NlBl@=P02ik_Oc8#CU9#qUKcQBCKssh z;vnqI;xXmxrIvC^WHQqx`p&c)7m1YzcOQBpvGosY9fO1^<@=jKykjh#WKghudy{rvl> z#3owTNL=Gh37iEK#nb4GvefPcTgs0G9PS3KFraXpR5u>6N8KeVH(!p%4`I2TuvDO= zOINn;Vk>|O$XW1<=!9HQ7x^m-hlX|!>QyZEMl)Na?<(c7`!>Jd9D}?EQC*HOTLiRh zu)BSmz)H@)*zsOzf*m&fu!JWktkf{x+`5Vq?>lHE*Pu8gfWVLgb825% zt=Ph%2IImvjfyVbR&4?;n{y_JQQJRVZOx?;SRrqWi;VFP^tU6;NB49(*TsM~Qjr@C zmI6`dja^{2R8!VpM|bp?=Y?Z?dxLcOz3L!?2)v<9(}0^UyC ze64rP^&)-ls-_m_2f{vGuSjVF=z(^sVHy`c6ZY*Wz-8Vm3~AVFNCc>Ka=oJEB!>Ch zy~SH5<+piPr5;n44}Wyx&``xMNZu)3bDk~Bvcp12-;I_bBI|tL-tdXNo=$Ef0WpTD zxV5OauJ?t20j!qTcH7t3?)nxSCo%G6@U#@0#;Sn3ua!x|Jjk1ugS`9j3Eov+BcCtD;2eA$6&)dfA~>yz%Y=b@EJqfgj!v%_x@t& z(}H)Ucs7c?+IFWbYmvtp9!@WA<>D%9c!*38lqLjv5@!Tqw~uCG+_jJQi)R0)*wQo( zA366&wdO80>oB|A4UV)mazyo!x=;)~_*w_du@A4~>=2pVsUC=<*EE39ecb9ZYF`1w za%oXGCas5-k6RwoK@2iNT7@U+*a&lgL7fheQJ@dzQ3NZSlk~d_X~pe_3I#P!)L}5l z4FqP;$ke6_2CteJK0qXCHTWOLnjFh?zW?l~!Rtz)A30Tu`(h^Oy@r7PUOiWv?YmMy z7t`U_eIzCpxc<^?UmtSq%j!Y6us7!-De=u^XJnIPgu2}FNEh|;cA%}p2q|9JJ?=h! zM9%vyR-At8>uVl=HowFh!;RKgT%GYPm0lLaDdN}3F>W(4GC=$wD<*=RuqMN&FchAx9~aV2NMiqfHP?eHTX;(W_GCQHeE;nR zw;#_6Y$y5yHqXpRq3QaA$;C-u@5O~(Y@vbo##GpW!Q|sh9E+oj!kA2rK=Mhr7Ke_js zi3^pKo|M{$X|Ik|AQTX_URfgRAI5e+$6DTZ%gdQ*U+w`M2AMPlSM&ydUkRpsfxPSf z9v38jvb6hD78UY<#k>I`A+@c9nPAL3m*V2<3VC7$#b5vC%? 
z{%a@qU!}o+*3#`Nn7+itzIgH$EV)w5s* z8mz&B(Y`zwsin6?6DeB^R;jxt7Qlq9Vi?*+*9#v6o>4w1#KB1Cs1_ZvsAHZ=#jAW& zI+BgvV>0KFaZ8d zTX1ik;258GtEwd@K;xlJtzYy0*d!Xh`XMraG8if?`oGH1rynWfHWh})#@wU`G&r6k zWVkkick&K+d1b8g8r7!*Y*EC_v1LH6fxC}<0e3jhltP4s&dX2A(e9aOEZK5`%Ai(C z2y=nFtK$a(21UVo{4u#kygqAk_W@ltO?jC(taBq#C|4Uampu4!ShtIuT~qtv2wcWB3Sc6V!(-TJp9}4 zcc?Hr`&q&orAJM%3SUZGhS(t`dtCM%0Jg(21b0f^!OoKedqb5O-8(MZQw5eBW-dFk z17!@m@u5jXDbnYbNQxd%!;;-<%gAy6^m;EtUkrxQo5ZgK-N~<0dl*|vA66FpY6emY zNt65<3%P^>>@hDy#$^LzH7K@tOw2dOHcSMhGxcYYDuQ;q}UI zq`LXR>SDaXu)Hs4sjqeM$=|<^&DyRGEm1S12gCj2h2#HlM!^be*Lzq9G7Acoy8$j0 zEORXsk@6o|9mh&d3u?2MdM}^f634QNALs&nq2BC0chs^E$@7^XV^8PFhkaSFLh&sC zeFz~9NLlbBzBox0@)hJ0`(b!#0!zsfmVWe7S9W0{c6UvDQDz7=Sz%`eOP7Ecxe}|r z2cFHmJHz$S303&&2p#7X6awD`ejK#B202n5IfOums^)zDGKB%Sgg%EFVk1Jh{*Al>3{F7ciF-1oAs z(OW+Wpp%+|!0Xjm_(oRYIB!>)bDr6muF1JS zjpUtE0$h}XQKJp<*Q(l~l(R9X#BXcs=6eVg_zgBvJ+eYkrL#ea* z;qOwG*-YW#9plKW>k^F=eDzvZ#e~ff2I$t2$0X^K9jFD0k^B7WJF;CjSc5Ec3qHBH z2%JMpMpc3(f~N;PsYg4&OBP#;R2n(sjBwoZR`IC1cAu?;8}h{UPsn)_E@UHAm1EeF zK)Ke5ytrY}8o{pjI_n<9lLitmwhPzJOx`dEi8d-4&9^w(1j>aqxxa2$ zGmo8*7`^1yIb60B-Y#(^_B50tu{;&h&(U}n-$%Q#s$P7AxhZWmAK_TvY0zX!zmZLg zoB@;}tIie=`8qX7F49Wg%>g0}Li?)QtscKR(NtQ<{Dm zQCpF#SXU9?JKUs{uz7mSFGYskp_=0_j1=FUi^AO2IhnC{nXf#y2nh|+Kp_=DsC|_hp_f}vy%Utjt^=|e-uh~jQTCcn{96c*_zyou^M7Hjd zpMLJZd^fNWa30&@)HF{t_n!CZ)R+btE5x0l!xao|M9L82G@R=l`K0EUy%?o=CQOyv z1%!P~_bgRX@o(&wcxMG`Us&-a z8S&XHHIo#us53}51ukI=07ajiUeRLSk@|(%zD zI?tLf639n%MzThh$5a+IhbRR#IB70mMrD#fhwq8!8(-Z4{!SV0ZoCG|n?KTQ)Tkj0 zxq3Z7?=z@k&R@S@>4}fmebtoc7*Wq>S|_p)am?1y6-_Z97Ip(O7Uvju%VhIVZSLC# zYQjm%;AdT23ZTMiGt8`RfA+H|G7leJHgZNV`vaJvJd^9c<4bjt`yh9WK2#EQ+2e$= zW%Yy{2)(KOE90yGq zVep*YLHDKOs4#;OD*SjE2j4o+j>PdKFjak)jIFs;rC;9>SQ~;vO@%*v3lz*56anIm zqF)?2GL2ZsnnX>_l3CEjz8fbGuD&f;Up%Pq2pF}9Nk261Rf^g5i}Y5-6jlhT72UJk zkM!Q%koX9VChZ#ry8$J_Kvbczp2km7oFq#xDnE{rc4Om3^?zWUBj}2##%|!G4cQJ! 
zkI>M4HOZQ<>Z=lLuMq5^onyr?KOc@&2#*Uzfu7A7yZV8KU`0kElg zl_HWZ*1Cn0M-t!4of?sxG0-I2&#Xs*wI;~~2SKqV)Z?eJ_*a4i^0UkUgi|f_LaWjg zXmwmUGOgD9JLyL$9Y2aWrB;xa6t|6VF_2xV676HP|y7r7WI3)^EI`B{)0&Y$uG zOM+7PrYQDnzuvNsb#O^Y5N!zPI@&dSr6dSHn(R36GjY>DJnolKMZ;lSFAn(qCwXhF zDa_qHs{Vne(#s*PG|IP}MwnuJK8n7sZ$2!@`RzfVa0w}&T{LgGqN|FN6bq5IA<5J9)8!@NGhMS6=v<*!&MeLkxC;ry zPdcWnrXM!88A@q>2>?&^6W15He$k)rZ?@RfGSf}e;WdUw<53Ts-z6$dgez44t-Z5< zj6h(`JfzTmO#U2_8oV2;K>j{c-im*u4G$uy6uYXVg((i9*|Ql~f3 zo5AQ7l@*NY6LKnM|AK-``;j{gTK5MU8u6C+->6;A|ApFRWM^WBp_ehWGk3B0kJ@GY z58M5}%m0_!WnyAxX8Zr9cAZ^Jc3UBEHgjG6NvhjynvI!-qjRIX+x(Z@0&ut5Ru9EJ zF7`Rkb-m`f|N0h-E$=*L>p1&b$Ho$qC@GYoGPSaTMQv^boMoJ0cK|arHMO#}G7*xC zaT#jU7y~i$l914VU}r{v&I6g)S(8`+fjEI;ah_*loo8ZWcOZ=ep$Om>3_;q!F}Q$^ zg9t)U+qnsl1CaGf6LjM*wFs0z;)(-!z5*5)DRSm)*q$VvVh5)Ks03lULRsH#?0LS8il>1Le@@+5qN*_a{KD-yi4D#2C&6C}29^7v|2L z4T$MPOaNDWPoKhZ)1RuAU+aDUDKPkFcaRSU_yc$8LjMFvobb-Sl#$utYfN2CP!Phv z*1!%FWaGcn`(g#b3CP6(efu}W)6Zud=!1g*6U!kW_&bmPXdmDGmc6^554IAJEHysd zu{gZXVKX;4I=TE3^t|s5u{E_gvbckB{?HI385r4te`)s|?s77>y+bCWCncj`DytPH zzZc}d=hj?{yW5-z))n~I{Enicr0zpFI5dG|WUc|n5Ims)7%>`KT7N1wkoWCBMP~Rv zz?xhgjNjyiv^Ul|*F3)b4N#qNjg$L)+1;9q6pj5xb#lMzj2DWy`F9cz~hQ$b9+O)^N`?fcfb~i9AY7CIr zncqvp{sT&c!H7Gs>ls?W{AEW363@3vO&@z#W4*>d?n5B+jE#V=@SET`u{1UTad9Au zM}F!(anzsUo%q?`T8w_e%EE#oYPrY!koPgE>Cy3(wXIRK{^?N!1_uWQ2VsLR17K!c z-2UM6;Mq}uReuW*gMzk3aNl;Is~mwmAXIJ0kUr&!5pV;B57=)KMn)zOteD^MaWDg* zpSJNQqKzNKMC^WpSAG#Spx~ijq;+7pg5UqtUE;1ElAl{e>Mz~>-}&_WzVL5-?OpyR zrtyIllnOgLIHu2h@+JsENN081QN$ZY-#vnS|NQfxdN7mU9-E)=wuXkcL=Z4&Z>?`7$(U=C5>A(1WFH%7mI>ennmxA7&{;@E~;=1FKD-to2O3|Z3 zxQmOyPwBR{Ar&V<+AZoq`iDlDfcE|6f&}C91%H!nAV1~xZ3b=eZo^WQ+^#PI-8DmO zJIGhe)-YU+5!Rk}0ZwM126!bZ>pn@Gmc8nBwL;yv=797?aCk+4bC~Tqu9XMOa=ZP=xI%}sps@GC`pdG(u z;;93w{w=FXh!Vp+|9E3I(SvEH|z zWW?;NI6Ej>xT43dpOm}Dj4bLH1!;>i2G=1$(i9rIv5EUAbgIsz%x3~cHq0CUMJY}{ zF*~+tY)-aQg(w*cbmsdl5-FFWiaFJgZA6ZtlKP*z@% zShxLq)2Qd`K!(L>x}qkZL0wtMzjRl|rD;H|_A+-jkLwV5Q-CVJKWm~{o)|hOi9}<& z+=v7{x&H;#I<}vu-vr4c{jR`?X#ym5T#{NHA~C$L^|({AJ&DW2mb|ONtpiG1#0#`I 
z?M#~5ZtXn()x?&l>y>%8_@fseP&{NI{lSvAxX&nhXjElTmRcRj!eOkD_$vM=1BM5y z_e2ukf+Rfte%VWB>Ky4#?24+BPUd3b0$=CH?-i;2^`TMAQW|iA|E?$R<~?_Y8<*vx zFG&K9RYZV(E|bbmEWHE+G|$U^#W4^mYGO$WOoA1Ygo+OOFb)xCZa@9`jCj4Z9uqiY zw&a#SChbH1d?=1kVaxcHPp0EkN%DlFvf6ve6`ojQk#uo6}jnjSJB)^L85lqoFnwaosnR2B8m5*4GS7a zNhfa8APDxl^Hg*Op^?%|^56ulB3c&Q1rHyd9>byGMrjsU?>+t6+vCiH*0#cO#yz{8eU>3FYWH2aGZ#UE?n&gYPI-eO*ok~0mcW3R zvKS||<{V-M^tClaSM}rjsKG2+iDrA(wZ2K}@@-7%5sWqxcuB_a7hYe-m%AgHcD` z@vU_D<5$0uO_i&jlB-Y2`{F#S$62bP0fTB(x1vcyT-YF(hQxmQElZ3AE~LT}j5m*G(U^5#^1^^SW&3 zkP!GtP)N#P9@hFEs)5&q(iAEmJF&xI&uSTsjZ^Dv{6`_TQv6Tdf_hfYx0DhB_YmeU zP>B>fK_vbm*E(KQ`+rAPzdMtFUukeXDO;F+x@%dr=1^nA(tjl!Yr-P~EBDtQ6eUN| zkbUfb%Tjr=sH&iMvd_6fI(H5%+XYjvk^=0g{ru@v(@1+uZi@s9j3Fp&Ny#d1A~4F7 zw|)zDI3IXbbPGC`t$&i&?MnrviD1bh*KIT5ujQU|rHO6dn~M}oD%q%aodF6QMiYTH zcexL)I=y6M;#B3Wd#ONWXJN9S>Y$RuTVYy6hdLS|2Q(KZ2y-Nfc0M8)%wf1?O^7^K zWDbfB`P>r(auU?s-V7FL({GrZP=T4m#aCj z!D@BR?cqR@V18TA^1CmgalO%jo$F1;#gOUuGLI(E@EC1&c=4F_4_|bYwY$6zU@~g z5!mo6&K^ip{dPyy zz|2FdWwg0EXY^(_o!6!|U4K;lpN!%Ebd_Cd8tuEKN#=49E}~l<7XJZsZ`wCCP*Prk zhTvrHFzLpjz%8(TU^V>3<_8RslL%pZb6WE+#ePv|n4q|*t1j&_3cKfdlwk`hw^jXd z_d14c;xmX$m!h$ete?k!?7ddK4Tki;ldSyH4 zB#LOQF&QV}-XU1dr+P1W#_lY>X->R?**jS_R&=Tnw_xnsmhyTXhV4C7RPV-Q3gV1# zLlifTAr6}ewlOI|lw>v|I3R~ff||cc_N?-o5J5r@=PZ2d;0{T^%$KQsDoqUT<-usc zAPRs`e6;gC`#)@pj>5WVd5-^YW%y9JY#$~!6PuFb8==a*sy`zo+MnNvBm{UG1+9{m zx-m?piOI)5#EYyCalxPJSVkIPYHkr)UG#C?4O$oM;Ah~eJsE`Y2xE(jd(&>WVh~M( zgEJu8S*v4eo+;M`BF-C?CJef?4L2OsFBlS!W+-jei3IB*B*e zLSou*a9~7i8&@ojfqOA=4SWW9ZxRn!w9IPyMJ6U)jjuY4EbZWSS#?N0)k0Dakh zLH>~20MVH4`PeKnPIkQ*MPY^=+vA=9l*6{}Z@jj9=Vp1GPpiR}e?P6nVTqeH63B*y z%x~?e%QU^?dXF~>+k>si%f;7sUyKX{^3Gmtkl$bzPjl1TF6)ULeS9+Y$kCeO3 z3jt(#ci}kMsMA$7^Vl8BATBK929ASanyLdEA9ys7&hZ2@xDY)n{BZn_*g9Qeq0!wN z3;3nGV5b8{knTCNS1{cmec&~9_}EH=mEw@K+%ryO@^0d8;sB^dQyxpMsg0LIy?n`N z8|AS7vXy9p2darEnUgnEo;@(Ir%%>+3%$>_*Mvx6+Iyt}OTG(>gU%0vef}lGZLU-d z<}BAT7wNRS=jEE?7seVWIpRxW#%)Xo(Pem1bqBB%=t}wnm62srMUEC!EM3=m*;|z8 z`u6dzv%|7&hjhUQFAs&qaM>-59LezqjESu 
zUr%L@;lWEmYh`>Jm#$IVwH&B`_<#)8zu&$KPi}do)e##N#vsM}!*%1^O*HUETtxyr zF8-CTg3P2se6+JbQnaW@)T9S8 z<*J7$kicxbhO-o00dJuCkldmuhWuL<68nQb#PzkQr82#kT!y5uG2lPfd8#{xnALD< z;QYlvoDF0GI1ll$=gl+#jCKJjls0o9hoO21M_hoX<5vr;Ol_0 zkHgD9vy4>k4cJfhvg8b@GT4N=W1kKEX;wAp*rP?Yi)=QgBJvGs`FZibbhpZ_cTU5- zOf^1Zg?jP?>`~)`)Dj}P;#Bg(b8o)%Wsu@S(cS+E-ET~7`&e1yUiGbL4k6z6cQl7M zPQ){Y!$KFFQiz(xKZSBEOSeT-Z7P!&$quyG;JntG6ppQ|8I`sXoT`7RXhrGx(yDPm zD~cW{dJ)16EI+$L%#BdtzM+IzoX(-LUkGT*;o7o|7!wbfE?kTBS&QWuD48uv3WZJ5 z>bEm%mrqO|0rlQ6qrQnc^TBMBIKX|%r*GNEeUGFVnbd4TK?D1ckQ}J5P^yupLiG3gQP42+`t+@7tS;NZ0Y{ww?O{Ga3-tO!D(&@ z*;rb&x;XOYX9|C2!qi^y0bd0U{H{sh{4vG10D5i6b=@`e`BXkhiivb@%oY^d$NOp> z(9mg!O1A=|Iab;=&KA%otdrYSt{@-|qq3#!q1~3v)R@y#f$!)c^6@eC@3L!OCrB)f z>&|*>`4hANe7=$B3(IMmWh>4W$Y%Bqj%O|D-Dv7N#%G?T=<)1IdkmBcH#tmy;;7!u zPs;H&saIy67il8ws&~Skb3H2fNa=el&@@r2@nx367f!fwSr^0AG*%dz5(+^Td_eR_ zk$>+^b-Qq)XGGegyQK0>AMS#ike) zr;@9Sj}R{8q7cV0Cf@BZ>oT^nByZ(8tv{Vfs~BkRU_#dJ@uwT1YmnBR3~@w!4- zjMU6Lk8A*@J^b_f-enP(JM=#QH$ce0jJbX(L*4tUEuO#RWdGYDX&ie`DrI7eZ^gxd z*K>qlh;F7g=!|XcwRJIJ{1SK1jT4bIB>=m8ClkzRL>)f=Jqm<^~R{JK6 zhN$y3`8Rl!xN%~kENElP4KHls?GS>uvd{N$lqvzBPZI2FK9{QP1X={Hj{0AVZJ?%o zLg}*b=4U-BRws@ zC4@@5-!jX^?Ww}aFq&J7;guj&biix6Wvnx5QWBqC`0DiVz+0ztQuA3&covae@ntB8n>lgkxEW9E?CH4lE-fLeNff)5|cbyd`32*(q&v^hv%ijGWF9mDiuS&W{vzYDquZ|O8@>N7n0wT(98m!(mEjjpQxxY88anlUCCdha)~nzinfr4SFTI!tALnW}LO@_$i@^+9vXAzO{MbGMdXn=%v&5 zqpY`6MwNU&_-eqy8b;0^ktau&)gxzg4C%XDNX{ATfpZ4*gU*1o7>^y$_A^+1f`dv^u%+mr3nrcl3c-#k});@*w;^8NK8g~)1G`^O)x$Ow*PCOB6eOt7tGo7Fr7*ZDy7 z?`szXIZOg3j{5JCro3SfXP(?XQJ1H|Og;3H{OEMmN}U;@KNYeNBR{Fj_x-ZHr=bt$ z@YZWc*a_d9)8TmIEUxxB6M2%!)7E*@RTrr5>*k|^o_sn!`Q2%?X$mV&8tcUoY|sc! 
za`m(j{98%bc>{cB-mDv+at6muqk~lh1PG&-W4|xbSSvw!@#6~i%1#-iZs4aS1M`!PxB4*)OqVbn>4v3xhsNreTVoo41}>_$;sr1Y zKs8wCJ&95Ftm4@8P~2QhH>oU2C`H^h*-ib8gGK{lhvEaFM(n z5n&0`fg$;bOegUA>};5dRiNb;>jHPW{Y=JM19w8Fh?d*b+;wp)tY6QP`%2YyeKM)9 zPmA^K3Yx|8+$T-2pW4#YzLz9%D;7vfwS}D3i3~ah(Zh7wn4v&;UB}*mz8|gBZ>juh z{1HGTmyNxnsNa=%1{+Niku!;l*z(QLNvT_l6h4~SI{0%97l~a8>d`WJivbsgazUSPws(KNpZ8=oxr-CdDwF#OIizw0_ zy#ifoN@5&IE$j;m(2QpVEK8~73QGDNw`VmGJHr5Xj4iq~3oYCuR4=k8U~zj5i+h5P`cn*4GTEvsNn!{jAL8biz2=(DPA; zoAgO$HcwDo#4$JkBM4T`l>LS|Z&$=nNI$h8gcRFLJ(^uS=DWleF z^U{^RF{C)^OHBcI4DwE81!*C{)V3(3YvNwCL>=4nDi4)~X`UE9hMidmzv7Ivc#CeJ z&o8QCP3o95`OxQ`-HVq<8tduCfuH80N`%fL>AE8>%z4=yIpKp}JnWw9gw!yrDH2^h z{N*6l_)5h=M+Yo*a&1hOw8yE0Ed#`jT;SX|`oIEmVZ7Q~0W&u7E>U6;nZDP4E%`M# z%abD>6H~X$Ovc@HNj`sm>0`sh8c5pcV_7`rDuNITEvS2|b8yS3#cM?fOPY-d@_qUfq+l>-N z;y;XKIX{rf3f-8;b|_MOZ7Y0GQ&HQ3gyB>+u&SvD7o3V%dlhl{sLE_$y}O~w3jIrQ zX1RJdTanJc*StWtN|O&74>Z&4z_XH@x`jMcWP6WqtqiptxEAaKp8fAojhz*aQMhY4 zVj1c~;NG3=lI9=yRPsY#h#Xep2Rw;)h~dr~A%b`xW@nGFX;u~Bt>UGK0AW!)P36(x zb6QAy4`(UaS8%}nFpolA7H^D(75>g{ZX^3lf3GG=Ll=?WLmaD@A8p)Z9JE&sbC#9~ zLi9f#>VArb9xRl^GX+qa_(jN+!+6;`Rse=7u3;`M3=FIs8`MvtCw$;%k>6_neRsKP z>8ulYjtO0O?LnDqswhZZ=UM`vMJi(8-yFZ)p6jNfW3 zwBkA!fThM!I-WO1;R9W6RlH6F@E1kmHJIt<@xxD{Op2$%>#zt?o=o9)QL>}Rl)NcV z@~CBq2PH7wxam?|Us_N&-B8p6Yg`6JN4?v^7=z2v$7yntY~}Ki?@StXzpePM*i$of zvC;|8({eJ!JBk~^=*|Oq;lC!e%DfM?Bc_8`gSPTT7J~fN7q=0z>-?~rhZ9ly6BfEy zdk9IepT6@(pL%-Bh?pY;TyoPQWA~XG@l}})etOG^?KUE*sjmBzk)LV6hD3yu-(RKjmw(WJs7$skb5vaFHC8E&i2V@ADP z#NFdN(W{X|S8oCyFZ;ot^s`(LJyWRiWtqK335v(KwW2_ZZ~Y18$-hC~qGix?Ioc1q znPzeQQ9kwTS#XEb^0l%A2a|=ykiG$2C8JJ$hUO$C3k4px;Dqhz7h@?r7|T7A+2g_G zXC|7a;4=Pd2(b_yoy$SohF#>}yj_oen5#2wg;h`+pL3dq_eHnd^$I))iK1Svsx7SF zIS)NY#<_oy%G=_hJ8GJBqQFQ*(@3VHq?6N{Ly3uUtShYxpLpmfsRkSHu_#KqeX(uI zK50oxYz(4JI>8W8`^40@eBl9XAt6`AyTy6UvMRB4#kXqWihj(dq%%HLN+gVr4Q(czJHtF6$>4ARc~+5~S25tShZjN+mka4d)#z3RrL)elUpy zTE;1BsX2b4YwNZhRphyf9K{m^Fh(?IXFe`hs4jU1<)h4pj8F9lf#o;&hPz$3DY3G^ ztzhD{gF0Gp&)2D*{djwUP|wj#kAVz+tTGwc-q|9lqH*Q94L+*M*; 
z_^Ud=ZU#tk$g`a}8!58GP$->S-$mi&BV(ECO%>}d0d5b~dY9T`i;K4?lYUw&3KO@`a2DIbVmHylzMfy-dF_C!__FLNSka86SJ>X~ zfdlDChf~^M<`%lIU6!k18iIXM7;>h?DG`AxGQl-mdKw*Fr~r7YdCj-A)Q=(6KFnX! zX(&Q+Xrh414;*qdj_>oCIaiT&J$Qo$Gr16l$3xmNhl4J@(8;}-y`X_wlj*{ENnJ^J zCYNjxWe&Bx5D~=g6w96rePZ0A#Dgp6fB*u%vhKrM>@65KdWV;C<|eapW(B#tau`q) zH_($oE;xL*pD~6MATiHbm|I#Jo-;i=lKE587xGv~k8oh7AhFa1W5*}V%%9G|vd(|F z>FpxBKQ)ztT+;WHDG$@sSMzi!A!C<_wNRw$HI+ffA>EaOZ5PC&H@z@kk)B3SsR32l zt^EdP2J(1bkIR(+07+zw?i<`g$}XAj6Y+N?Hp1un2QM*h+b&ij=Mx@ZXK&%>qXSJ$|`j+@w=BTr*M7j~RHP-#`@%;N7MI;1vdXQq| zgQ{Y`Zn7#dTI5^<3Ns3|z65)a{I12J>KtK_S%?egaHMDMfHiG>Ud>sBGS8q2>HN*= zOtb|SL&Wn=*Q8lFeVocq23IK=Lmigo)NUXQw2lrJU_m)KvFD0%5%b_(a(&vm^t}x_pQh@X-Q9 zB<4Z?z{?S;vg$Nq71EN>*H-(^PF7tPSMYZ6g&QnLe$;}kf=hEkqT;Km&_B+K!`h3O zYHe^X@O~c9C8&RbMa>DkS$z8TQc|O*IF`AZ9Tt+cH5e(}R=*1cDO3WA-RpS0Ki_pC zWK1%8Tic(Qgp-QEe?Hf^WEYgd2RK zk5%oylrlV4h^A%c0b1#u2yOi=fu-;kQcWzV`cm@XGqJF7Fp3AoeLPh1#m{@Was75Q z8ZaczSs3_spi2Syss3TeXMpe$J1r}*xULw5JJxQ%k7Y$Kx2yN6D7Ub%yHOhdXb=il z)}c5%0UF*UXw-60Up9b~L_&!;dHOO2CA?UU(|sf%uBr)qniu@eY{ZYf^I{ATFOYW8 zd!q1@xUqm%I<>~_92}RjZ!)ebS+qS^G@dyvfKpvDr9cEtzsIlwpOCkY133tH6|L8> z)kX?D9uWECc)Xg-`s{Ti*(S;BoOWk`%bcHK=DN?Cy0&^^=a(wkZZf^FRZFpM?KDM0 zl{X5;G4lkE!f)(!a3+OLbx45HW|^quSTl-e=-Pz@9u74?H?V0HsAs%PvSf!wjmhsP zc1_|aI9$tHQyI))&gS;Ya!NRqjbL+1{qqqs;f_G%^`|1%+Q5iC=5r#>E>262wNVtr zlTo2U)WE{~^qpEBELxZP^z>ciL#fvfA-v|APIPEBRjyHL4S(uVy22ZITrd9TD|@WR zJa)aj+adi+^+&R=r44#w{YFs0RNb~Ueo`k0kf*IoZkru)@;H8Io;tdqDBb>&l65vI zY;(@^H|NHP?3iKF32<6b=XNt!9;3GPS$!4Lkgq=qS@)!%BxxuSaF*EQq%|AMRkVN5coU9Q4 zE}&(G`dhwz9|>0%jI?Z9VEQ!U*#XPa7Y5vhPpWTzxp1ss?fe_BW?hNfTi{O0(2T3P zlrmvu=pd_(BimII#AKMRW&O@4a|$bbJY184BYeJ}^YRjWre$)7U|5WIy)9LDyX2{T z;+PSxj{ItCXqjvWb&EgEr0Jnp(X1D;OhGOT=IL>NigBdDUKej{yOfd3Wig-}D*R~J zFlPt(x-o-FM%%VcSE#(TJWgsB$jliGh)qiQ@7M+b2}(L1e(Xy$>hEw1ba=Tk6_h~_ zn=<*!*56doMBBB%U!xh1EnIjAapRC5HO~MYkL`E|clLT~{lV(cKnx-|4%38g`_d}m zrM{)DL|w4dL>9gdf9RdXdUvp8$0~NP*+7X# zQQ2Z4I|#V|rY2_~@dQ_|P1)HXltuUK(*Gn8OAsN>k8TwJT|0y2rlE>@NTx+HzeEUlMat6gS;0W5s9ifvx$w+ 
zTrp`1NomPX4)wvXnA%gf0%;gA$6v%b6FV`EN|;s_40cU!J-)KF^ls`>%$z&eUs>(> zncx`c=A|-juI@@bv)~<{IJv4wewnUs**}Y*@oJyP<#eq^=08;J)YN!oXFjlFCp46< zP^svHSp&Ok46~aT?>vrEbuvqB`pFjRgt*@c6zOX17ZUbF5Q1ITFn%;!K25D3!DQ*B zGgtCwBtuQNA|ys>NgyWc6uW-|AGJaQ*AQy5TIiN>WY_;C5KsmGuHy!g$C*bCd!_Nd zhc$FNK~~`uSnQQR5vOMai`sinBL>IQ<02UXc@;)xe(&)RVO7Rxaa%9ajy5He&`_6o ztC46|i+Pl3yU@+N1OhHmR0DU(~E_WC{j;5 zWw1Q>6$u>X{@I_syJ6PN+{Urx8@M)O)U(p5gKKU`VXrZzQovh>o9Va7yvlP?wzQbp zHf|Y8-!|vr!E3G4=O3fz(ER!-I|6=YEB^Qzf$s2FW#@{RPhSrjhbxm_`}klUO_@u5;79hA^RbWNeyxZl)s>yj z#~Ai%;$xJy&~|+lfLuIwJp6{-Q36&I*M=7mpl!euW_i*EGxN|xgg}my8;WjpwI*!{ z&Wmcjlz+5cG-jj9AwJkoqUfQ~eOuOk_=gQg@GMJAx!&vX>wBqq_Ttx?g8i{*l<`w= z&lVX!v4q2y|KnLA%%rX-N_^dZJ85!fI?xbKu*HbrvNdpP-n1!viiv9Tpuxd3?+GR+ zM{M3|)$2t6tqEW|v3V}ss2X#_tlPW)IK+BF;JD?fNxrScCNR*1Scwhad>VrUJ3PsZ z+>+#5SOSOJ7MC$HP_GU9a^(&q>utpjR-Vav49Pn2K1b#C#MYJ?42cW z8Ag6rWOSm2e!}Q8U`#WwxM_o%KhLnaR@GRBz5di?4rNnQGc@2E_Gi1oex=;y$bFRCSCdMEP`74#o#AE4-ZUHzsHGQ{OI&@EpQhog9uuFZ!X@>zj&>MQlV_PPd5mSmqpTOQX8BuO9na*lFcL^an+BZ=I#XXo_;$?JshX&ffm zu&Gwrws)e2p2gdHEZ=v6`c1ZPf~)YBk^#?$z7xrtF#f0mDV;Trm9&Lt>`iXQB}aWe z&{Tx7&fyh~uG^&cyt&aVtT*q!vq!6}@EgH~xYo!z?U7`&Xkjd9$GWbW*mPUaT*c#i2H5pHr{c!p(UsGqLMw)tEi!_ zRu-Ks`-hUS(uI>$!5MGFy?Eu`YWwP7)b!A2Co}2|?r}41fl)K~a5L5tO^qDKl!cRv zej?4I><;ZUJT*J9u2o%Ogl>($n_N{OXG3>n=k&xzXkbWuKcwpsn5VF^TD+;XCL!}Y z`<*QAdZG&`S-2;2Rs22}g;OJT3pW1^3|Rz*1#WS?niL7%CYSWJu*Ea^Z8z6$#=LCE za_#_5k}!~W5DgO#Er&z;F8gl$({}=jz4iJyGu42`&SA!Awmk96^|}!?v;8c6F2i~L zp%|txE&Di^r$4e6Gv$;jXNC%q5SA_u70e6P079uosNSmj2;X8=*={6iVtve(dI*^# zpV*^#MoqT_LztsT3fZNLa}q>~txelJ;(s6# z84_7+dqDd1=ISfS>raN0X+2|6E7s^_Uy;M-gA|X!$9{FP77C6b23jb(xPfw(!o1bY z{5=T*p`7&{_QH>erb1~P;DZCMwlGaMG_ixZMYE(g=)($#(NgIn(DmXhTRNu%q0TE> zb_+9j;%pIbG&s)F$36G)s$}Cveg`gF^Pym}Vr>Wx#9cueDaN86w6X{r+B^_O%u`jB zikO0w9?2n=wJYwieCDyB=FPIUJYT8^3ZIYcmmu|Aeq5t*%Dn(P>Q(ybcE5M!^>6{v zwlnM=Lwdr&Kif(mdPVCal>j!wzg6#Jbqs?#`l7)8kk&FGwSnn}eItd^tMMzBf74js zr{NLuV*jHJ^HX)%>}B}GXn;-#JT&Cj92B&;Qk5kQxn9ny6U 
zOLjaF?AmWf1=c$zm0ZgiAU{~}JC_9Zgt#NGs+eK$9a2W{}v?-QT-?1POwP5TbOm@V_4{4D}eD3Q7_p4an z^vBcHoFCAtS?5~hqE33~sHU;|NBmXuHp|U~y0mD{P%YI=#R2ClYHD;Fe+OEMUt`bt z+EL+9r;v4|TLl+D{G{xN5bHYtqV1?z5NH9JS$~YSqM~W&NuxiQzndmwG&2K$#fY6* zWhX(tW>1^+IyPRWgAZLwL)Q~%XmxZH#VII(Xi<8ZKA`@gz{dG_Y9@^0-W{VAwDYSM z>2t!3k}YCal)Nq_!pRVzn*P4Xk~Hdyig7wmTXeo)zo{X3!)}oBns+`#hvxxTcC z+J(_FC)k6UvQt!cy5@4_-n0}48{K@#Bn1boT%_1(BjhEe&p+e9=DFpytdyF6AKFr2 zAH8N5gCNpeX1$Bgieyu8FyZvZ=`$wMP6X*%*G+GU)f>MJCKc!im3 zK)#A+4isl1!O|w-8iX)Us>UE6qvc{LLUgj?(96(}#|ZdmZ#co5OJR`WH5r1Yn?aM2 z85?OxvPip+Axk0Iq)MA-QlD)8N+CgUtO767rjvTcjLfLkk7H9923%I{47&5E%da(g z(xT}E1c(w-L%-`ECOZjTY3|eha6m9EU!qNf_m2*k2OGJdtWKlGIhVDYMq(iEl~nN# zCrNzsg^nE($Z$>iWihuuv17&e zOJagI8a;CpVWEsd?8?{vV8k}YF-p*wnebuSrr?eN8~T)94Hv4|w{~eyJSDcV!#i_! zyBC?^VfD*ghyZgKrEW>ROYM9+$x3J^GK=Ny*ZEzxI=|O}x%05I`VXSm$QapcE8^=3 zm{_e7-8cq+nSCBxa$(@m^TsXY+P~j!fZZisfS*cOR>mLVR549nAnI629zMTDo!sy4W0Af&NWS&z$FfL6r#Ne@a53| zVRj258jO{fy|>UOU`uuh-}N7_AhZ1j+xEB2MEzu1Xp&b4kl zf9tWVojP~bqMR(F!bpfn`KZOydk_8FOq4?ar>(wUA3MFwB)`d7KQBle5Q#O9Pe}a2 zWA|OB2y>_KB+qZt5Qd|ArZ=sNukn|H{zCufaIKF7>1$7^E4;&P=@}SHWStA z0Wn{1L|JL?>Qtp}kE2K*;F*7oc;C8RM~6j^v23^gTSTXaHC3ohBn~VV60zSEE-W6y zA?h}wz|f~<=YxHjsFJEO>|}~r=5p(U-(;Z^zHy*%N+m?m)E>9_qP0Je?Or@w*W*X{ zTBvevZQ2Nae!fE_4s<2ZA?66JQMxuLy|-CV#(y#uHAL;0T>qe#DU0>jx#B|jcND6R zH<@@Vi1{Hn?>DsIFJ<^G;hkaL2cn-%OZSsM4sUI-Rl z7#at|twnqQgDR^3N_aAXdF3rR>mF2&-qo_V8bNUa;iu$%k_6Px4)7T?*9~0#7S(Ln znK!n=@Lg-g(9T-=cCoTp%7VavKvy`-NF0v5sjG~wb6oFj|Gca{u{OBCm@}SB=Uc7Q zRr|6ClVvQ0{!*fxbqT9$e8iHI;!3m5h8WmSCoiU^@R1td-ZuE%KRchWPYUO&BSd#TgY3a+)crTp>s<28M;YZRpI(oPNlEH8MO9z169q5 zynyR&maEaR(?9ln-Czq2PCG2eP>)0?iXIykO3s35u?o#0LtR-$DruYMkDrU3PM@z& z_3QuoUrvQKs~yd&!dZlncLjlISIVwrACV3QB5AKV$u}%lNXW}+)O54pJp^6XgW5B1 zs79gaLE!9((f&mpCyI3I!GUV^PSIR(!aZ$NqRwjAPzUQTh^ar^@Dr7{6)X*6cG4;q zX?Q0uP((OYSL83}MOKR$=?odaJ;e}_pw|*kD1>p>Cy>513u6+ZcD1bydG*>ves2Kb z7K}$atJq$6G~*PrLX;va8o=KHt02>aRtfOFNE@DP{Mj+wr4n3Knle%*Iojo0@f%sE zdE)DqvPfMUR_Wr}$AYb;A77>ge2z6imX@r=o}a5tFcr>Z*3Y^c$jdTwDQlA)*Z-u^ 
z{Y&L$P|>!_9KqDTF|F@bmG%d$r&mzYtWm|8 zjz?C-ygr(4=lh97Sx7Km*`UQEAoDkq>_R79yO~#MiXr;OXTAka7ew9o^r;L-sCld_ zG7lF^33?g5@p1GmnQ|wv?;7 z+XzbsjV7>R20_%W=96wpyl}(vLO%*W7wbR>uu2PZDbCRIf5L8!xYW->X&sNbrd>3_ zSH12gT~F&7*R_kqB{%+QBxZ`NH_4_zGxwgbE2^Px`pkhE6cKdLJ!vIm{4MS9p|i}A zzd5od8&2xEd(tf8NRy%=BEG#n>e~2@E6avuGda(`0tS zO`z*S#viPZpZo%gEkwLmdDn0VCuTrGop??I-Gk43`K1!NoV@@6)qg2Rro$ns4*ppS z@n%R(Z9#09;k6%)G3)3yJ)Iz^ekXTgs5Kx3uFs+>z= zMVC*DeqB_q8pl`0wPd=^(7GpU4l8?BFDvr?)UV1p5j37&$NWVtr6HOV5Bu=WxQ&Bt zPAnA~mJ+JEAbZAvKW5vx0EzYgctpF1a#2X}zTOu+ACM+m;@@}&zB^f`Lag3Vj%tbz z(I)xhzA@zVY{oXd`Dg<%LWFf~*1Q8KwKPSRw2{CIw<;u;(0P%PBikb%L9u*e}2|-{ncVzHf7mo?o3|eoD&M98WpH zeA4HbCY2<3*dOuK69|Y)FR-^#aX#9|3qO+vQDFQ={9NP~X{x{cwy(HFYm>macL%G< zWD*W#9?shpO%4!gw=80rJK4W3*CXsne!ws#G52HT^bU>wV;dv5Yf(%fggz=HecPY= zPLN}{VN4T>4+Gt-!d4HfUA&@a%-Yhaf(0AyJVKip&W3Hvq;lls{)H&eN$o<2^6@$c zN`_|wO4ZWBR!L@spE&6|Lt*vIP8hVEVW#5hv`|ZPN@QojsLLoI-uJrEN>YTnpE+co zfs=ynP=G9wCbfHOU+!%om{I&nE)bcqEksKdYXF&-cqY=Y}v^-vY>vAbmftEkG4uyp!ZTqyG>N*X;8fd;O5G0$k>(sbnfBA9aDy=OOdZ z!Ee@8%~^|{%l~xZ;S0`ebZ*BfPnrVEq^TN$#tEsJlnTu2A;Yj-JGs`dH+Pi9IH937Cf)w3J zZ$g9>@mJvQ9-3l(DR)c7 zGtQJW7e)EZP)L}rU%oxa@4wx`v+Ic9_oSJOw1gIg+oZ5ZWW~1c1z3D00dTKTJVqt^ zN8iQUr#B?8wboDQUisJ<)G?-CVb1+p$H)CQ`6#u({nQUe0d0*KcPGumJxY=9=+$?Kof|)s#(8P)pZdpHaQ%MM;U@mmf_#!SJzx63 zew}PcI6$K=Lb-}*of<(sfbn_KSuVpS_gz~X(SVMQsLXsk-Ceg}UD`37WFI92TJwu% zc!6J}9Pbz~g~Q@cy}G$l&Sx^LphZ7>_m9w)tuYpcE-oU#+c>vF_#=!bwhnWJhI}kN z-^&)D07sHY%^?0OkeiKCQ9f%r_LhBl8!aCR&eZZxD=d8e`F%NnCGy${g)J+{Nw_II z*!mSvX-T>*I$8j?_I6SOGg3CaGnuJoW6yy#6J|X3sB!7R@oMmyO12uhdnVJCqGvq^ z7YvPD#ekOS78%kkE>W+=;Lg21Ps)fAf=08_Kct$RRGa3jIA#B(kLuO-sHE7dlGwa# zDW{%3-y$U;2=8)f@{*~rNntREwO@rWs6l3MXVPqe%gQ@e=YOD^jsk`##eIIz>D=Gf zs2L@EYH^F;w}d!f)WIyvnKjsOvCtX78bPe9)FZc-P@u2nKJ_D*(+(TIK8qRiWLL=$ zXHm5ObAEao{%Wf8YUMP$Q@LM=iZMwI3^ktig8K8~=ejHNQ!;8`M(lY)gy21d);QAs{oEc2`ILdtP07soL`XLx7C9S&U_=pU;DZc^ zY2W=M;1^F~L^n>1dO^=TS#bU;cJv7={8rIMSLCGg^5S5pkbTWiTFiw%$W zT*acIxBh9)4Qz7O+^zeh*kW^#jA>ANPm(3X-7n-|EKZGhKC3hgY012HM}yN+gnq{e 
z&nWh-rIlgo>Mg8PmO5oH#x|R|v5zdO4GgRO^zvKCO7khEJc2}b{G3aii$FB52-8AeMaOz;y}LUk zfL@S&rPCZ9^YiJzPxPedvZg>aIVb z07~5cg?tet4n+L1(^lrPQq$|00@%x!G@@UokS0^fT53hDGZP}F>lLYbrJkQkAoTA*gQth%c#TXl|p4h^dh5&etsnED<*kLP#p zo{=yNqMG8HdpSs{Q(0bptm277r~Wo2#}$EJHEFV>Fp;4Sa|SYqB<^fAvfLJ&U^Gzt zmr6@Jq!#hUl_}6tGpk&>i&i(nDvnl>P?l#VtyWL?u>sGAw6j&a;`g-+>R#yQC9A&a zmhT!PJX&=Ye`uwE6&Kn(4>3Pba%o}dJG<~6=XTTpwq3$H%NMNP{bVkIT1eNen|@3n zm0h9uqNK=bwe=!LDC;nqoEzT7YyLm3Ag3fP5jW2IT=wR{nl$J9?q7*jd(@}eu=7u?a^zrXiv9s8=OC#A;pSSgaY3@V~^#>Yiz zh}?6AP8@x7o9(~xq7?+?`VpqsWW*ekck}O^+}4N zj^sl4xu8p3>uHSyQ+95=O7(96lM&0;Lhxk?gAE^L^T}(ZNKk*BvH?L##&;BZ?BhA=&4y+r16d$c??HBlac#IbuOiT)U6%+k_-n&FB3 zBH>Rd3p!Ojd3Db5=Y$7bQ+PjdZowjkH2DhGbuB|Qv_PRa4fB_vM z3G#TvGNR}p5{7NgkV0fAMDrwXoV9`ZW-GRKDE%1>!G;75+RLI?(4FKrCq;03jKuO$ zaD9c$O$ltJZ!LE}aZan2FA8{@6f@a0$rC>o4Va9Q3$HRQNe(Se{_OT1FTXseJJc^3 z2=;UX(O@383Hb>lU=pZDgbrRW|YqT}4r0X`Cs87|190d+G(0hr-IxSfl;5i{^a zyy>Ljz>Y;4FxX1gw#pm~25kSJl{j8dJSUGwR6kR2#d?`YX8+@vl~A&75$PLLkcctq z`zQ~MAtSD!BDdOOglY|_ExuAT2at$zo{5FrBKue#6^DqL;z*QSRC6~t`-xME9iO_2 zj$A%~9xA)NL-OYhj3;LHtXq%FInV{5H*SFOBW&#jaD~jgrDs@KF2q$uW~<~U3d{T7 zYCD=Z=)le^^lYL#RIV%y28+Y(o~m2i{6eWx5)62pa>~oB@v3>{;KF{wBk5?8S?MA-w z3U|4}N4$li7^4$yxI1a*MC(y&r}$YSh~@2~Obv0#FK%-D&-XIK1lANXj*V_+Uv*QJ zoY)^)I_ys-yW+NbB7dL@>hLUvuWW3ns!EkL6#a|b|1#-AUP{PpWU?Mj!N)Qz74q)q zTL(=}iemoxxRNtOd6>F4@>DX0^~fI0gI2imZ{1d4nr;rT_yjRG&yilhDAis$?{EiBMzBakVrw+mhu@Zw8DBpfKqh&hD-E}O$35Cw5KsYq%X}MHuMsWC z?pXIcjwS<4QIkX9Z@L>3bw0*aUn3EYn3-^Sl2HNc1?ZPc4p0Hd+j#Foi`NHEn|^Od zl%=N~GU`Hf7U%Lhq&SPFIp(8+jmz)g4+;}e$Nv}<{-cy~y?s!j_QZpHz?!~3tN1TT z>Zq@wC5LmEAat!jGHD@Og#oC`xgI;?kYLen@wWw0#C_AGOi|^B(W543%%MV>r-V&~ z#lJAU>UFyr?h_5*Qme3tANM2g4y zCNP+pqrhr9M7L8i zl{x$T`jdlj9{C!raM?@8BBS;|OzdB1$ZGx7(b8l%w`4qzvnX#6-{-%NctKW?r0hv6 z1hwRr8LQeXP0HG{=9aBZYlAU^5&Mt5W5n(Lq8BS$d?8zqPFDEXrGrd29$8EEbnw#f zEURJ#@T@ix6{OQs7k&^suy(BSEAD}HKXvO~_m2stB$%pyQy=q!zKs{C_U8J1ioNZo z2dwCx_QAW0vY;$cUO6`g`ai6e-pK^U{n+JKIHXF5cb@p*N3UByh;uPaIDHlJPX%x7 
zeAJRG^V8i@7|XeSwz+Jr8AzC-*a-LMZ21D*QYrbK#rZ7c?Z{R~?y?=UxekHm%NxMZ zCVLpNOSj^*a!g9C=B8r3@9;%aCpS5J+QFRn^Ed+L-S-&2U1)ziN>|}?^YR^-+cY$$ z|3F2>2zDD$x&q-zuufI1x3tiHnDG8kzV|?+pe)dPX{2-~tNIj$LvGr);F4k3A>rZn zlX&BMkYZb&%8-Gkd0LCrKi1H)qQjS;dzKND`Npi_?NEk`Whqm@&0sMj1M2k!-3u?s zLET`ie}1m%T>Un61}47Ts=s9Q`w`6*l*!olMcsR;<0C{ihpz`Nbq=^`WmkVV@t?3; z4;exmp369RyW88kf%`Vm(V5_+?_FGqToL6-5!Hq_YH{7pgK;6cdw~+eW@^UKr<}=U zK?rnZbGGBN3NIz*e-DD9CgfXolv49krmhP*FEU1uNGHEkN-Mv~(31#ho?1ZG}^r z^8}mMkNCVYV2roiU~$C4y97HQ+P^5q%4}AAUMl-7G6xjUa&ouZhfmUtcT7GJe zUJ~^YV~4O>Qo!=1vYl_oVfZ!>y$y^Z3?Pk{BV?WPnfosY^XTpXO?lN)!<$EzAQE5M}u>EHHChtde(sF&hOg{i^0%7|#%9pYTW5;q4>%xg+G z`P*W?WHql%GxH^Xd2zmMjgI~QGj;v3K1BXaoZYXkho_S(s71&(->*)05kV)S2w!fe zDg|}}ky+ed0IpT6n&4)6jJ`PR13j}JNT};KjdK-MC~BMVVQ{^&I>;cSG7kB}gNMT> z?U^DF1qW{3^8_%6Fp7G2G6HLt{}CwoV<4d8$6DN9n$hI5g%B+{Fm|G#mHAv~J8?yC zvD3)`;Agnk4#pqM*#`yu(qVSYwE;u)~oVD9Osvr>Lj9c0~x=s~?G374ZA zE%Tv&sjR+}!$`0M1B6RQ>)y;u0IX8dCk&J0Dr{XwLIlJk+I}J>4K}1p)AySunUqc5 zX;Hn5(SBpk2*+s&!aXrF>9z4)^>Nv>Qd7AZ1^si#8pc#slACCdx=$~0`YP1qOJXn1 zycm~-yZNye=Qs9=ho?ip5pMumjDW!8p#5kU!+defZ#u42&ri_Vf{mw^PR7-0!K)%< zD48Wp!_Q~b>Ia-mAu4-P1E%lDA!=-LV1asy6QSaGnKWdhT4WS2=Zu=NhV^!_+SW;V zy4OGZfehunw=PVXBV0yt8&4E14uNQx#mBTSc*##qYY;O`?&bt!qqiP4!3I0*#oKji z?!TRwN3Y~NE(MmW7N3p!7jpSp?&WAOyl`NJDCVpto0&x^$*AIHj@wwq%+Dcwg2}ZU zb%2X5aQ3W==2w;fapkG7^OW}sT6acaz7O=HjV!!+I-q!gKLyhMUaFfdU5_5+TRp3j zI%X$B*9`RXxK>DRB);nKg$2LY?(iA}fyv}KP~fK8Z+YY+@)sa-%ce?7p`+@%%#XL_ zWM3@EJpiilv}SR76?Q9LAhh(gC1oI<#cCk+#$2W)TonWem`raM`?e) znLNmNpw<@?)FE3ygz2l_DWYGFBjhu!vj=W(C~bfnG%1Kow^ac*CSrbaEu1LpYkn}} z<4MAm@v8}ip@as$sNGNYT-J1Q#RfCa#Rf&@&~oK1ii|mj8geUL$FHGN@D7#mM*Db@dqZ$fnK7GJsmcM+$ zV}B6}dCh%|#R^(D4zqHVm*H@@k~#GnQq`k*EIs!t#r_5FC0fn|#s~z`H7XX|{AaAd zU@%}idb=;}Z)=>$HG5Tv%$*434y@%D$|d&TSvaou!oSIjQsn|5{P*`7fT~J@1>@N= z!_~}%L-u1%Va>OW>uv6diS@pH{`a%0-}2%(-wh9)O0dN`qBhnqwEZGg+7XD> z)GzPifg}$3diS*V@JN3}j3u=4${2s=VW8{9A(7}-Z^7Eq0kv#W^1=wAmj3uSJC1Af zv=mFlPIy;Y&CTd5`RdY0Urdi`E}bI&ZMnp{ATMIwYjCsXz@7Bi(3RzcfkuQ5Z3WuM 
zU7oYKV||jJM0zhT0NUPtPK{QVF@i0^Zzb%UQ^dxNZ}k8Pc-%W(G1No6t`%SWzwX&* zX?PG5bGqwExwD#X1-8;nrIO{wsO}M7N+7$>1(KB+);JuM2XAh5dFWTdKYQHuuYrzi zAopL%%cZIA`i7t)!4LJh4v0-ZpYU49IKSzQ2yChy>wHw3apAGJ+V+(`2}?lFl* ze@$QMX-?*rLlPHk;LKCX5zQh|?^&)3HkFbSjReBG<@#!4{oF#Q z)e0qYj^XuYjva8>|G*B|P|wdP!21=|9TLXWsHfDAQ`0o%_YXta{D$_be;aV^`TNkA ztLDseSg^L|t5E+}k?Tv3K*pbW^8UOI?FiG!_NvqW$A3>?IQ{!45|?(UZVc4#}a7xs5062^9Rg0&Y426uvl z9n$xabbvtIow%9zxF=_N{qKwfo8xbE%jI}q8Vbn#i|1&};Fv(Ag0FQmHZ?au0HUO% z2)q&ao3R#_R@X)#|3iF^jfG0c(jI`YI5vW9N@sNdfCBW;sR5ueqdSms1jWYUCt?u* zw}AgA%{m28mJdK$7!?^_2AqH^VE%ya0p#pv$j;&j-VyMYw)Q6A`cebRqB#pj?<5Spx!S0ZD^y2R@DvjVso z{G`nUn8GsnqsMdDuXiD(gk^-yWmmec>0bjd0Pg_I&8hWY{1wNB)xg|uXK|-es+{Bq|Qz=Zflk4=qy((jD2 z-=zS$mZ`bX4R|GcHQN_JM@CmS4|vdLyJug>d9@D|2neZ8UZ#gD{p-y6E&RxLaYXPi z{cPLqBfI_EVMO*;H?QaSdb%^s4!q8-&DG8MtAha1(AWm@uXO)yHD7!6-%J@vIVmY! zVa-4t3^t#y0ReI!&l?>bKB+&~e;FY`9W0)h2H)S`2WSWkSDD(?0WvWMSG$RMul7w^ z+vn>H$l=la<6dKXC2;q?`%B*(+S>hhsJ=I=i>twUQ;Vw`=#=z_4#P(H+olPm3t$6K zKn!qhXg2y8eO<5lVQThg3dTOTz72Q`(7@2>3ihd`4KRowu{%d*ClC;wOuexFB|qMe zNW@J)@Na7dl;?E6IOtv7mD3vhxRS!43=KVdn5i$AWi*F7dH_~y13&`cm&M9mZd z`aQ?O&!6+u?=b0&m36g^5qQa$y!aosnUT5e)ra}MKhQ3aKV_*h80Ti@#_zcK&3}zB zAdA28jgi@lI{Y8obZ17uxZcd}5*oyAV!upQzg!Znhsj_(?v=hgEdVq3o4@bAB};Q7 zI2SjthsU22;2`gNypF?>eiUDVRrdaQPLvcjG?+o^Z|nwO4O2fNc!26F`yoif)Q^~fTj(CdOx-(n;AW~H zF+=yGJ&2k52lzwa=H9oM0blEXVfin%BcKNB|D>n?rR$%u13p{*h#B%{_#^OYW{=o` z#IpYXg_Is)xwf)7wZDx`&H<(wT$vex`E6)){u!Nr^WSfXALS5VduG@L)BT4ZoPUjf z9AKgS7_8(VO;J=taYroyeI{R^& zo?IG$H-8ZT+ckdy2Bmku0sA{mAHYG`Puh&|9nfw4atHSO9jR{pgxnna?5OM=X}s^x zzqi;KTf3`s&+EYpkMNS$`?~>t(O~N!KZ&DraC#NP_UswMgCIUTKi18i8-MekceSS= z(hdMzKdyj($Wu@0vsfm-;GiANasBHb(49cJe_l90%28M8h#-A0GJhMwZVfN-ug$^g z@7=ur_ER~4Z)*N{_5Sr61=#stL$Y^7G`Ei1hZ?N@aUxwCEcT}l!0OdTry}3S8v?qI zj^aLx19gO5-6euM|Gvx}+JHHIHn;Bo{P(^I0`2~D+#e9-{S7cso98F+(C+cG22+0o z;`*uG--|zG@86aB?Q+`m%07$Ktl)j^k8-l)E=FI)FY$!_!|gM8GqBSH1qH0_;FPYzDxN#(6^@r zAXosl2^uRKbK{<)40o`nSII2)Q7gseB{yp5&>GB9J&VkQM;X 
z5=tPd2?*SuBJgEodx?Ovo&K_RJT*z}th#86%umWsq(a%as6&f3TL3Q=4QQH+@*Muy ziSH`aVaC@&+L8#wA|6ljzTJIm%t_{Fq*J1Fp59t}I^2Oi+sJOry=mk{Q-th>Kr81e z(DmIj?uT!@MBO&RVExSq28>9C#z|$zjwAR0g3mo#_N0P+L?83oPA3CRP-Ru?7l6QOP5fD)DSF}Dzp2; zyHS-*|I90)Ah!sxO&DJE70_qEf+WjUzTb#XywapF$Z=9~84Q@neYHzwYzNVgM9cIGA0m4dj1l*?V%6d8o53dJlwZ}=DrFUO zy(bl`K`&c+2SwUq-?;u7KK^ou<1P_FIH~lA}%eTaro(xGH$6 zmgLG|!i@=%$KY~F(0%sWVOgLdA<3YZdsN7PjI{=Hd6{fmiE?LW--Gir@?On~z(Y~c zRZ@q5(@Jc{_H&N8cx<()(hKs@L}`O@GO(K|wv{wR_s-CdH3DZUO_(@0O<%33W|@Sx zAgY9)X@8Qpn5~LS3Fs+hnL&1lncC5Aa=qcf+{%I-O1w^E0r>375;MPTclOv;NHdK) zD8e_G%+m^#gVyCp)~63vBQdUluZywmVY`uWdTF|PpBTLiG*tsuFJ)Dh${WfP8DWbD zA3S(kZ1rvY{PkA-tCdFD{YU=S_=)fg!~J2PG@TJxOGhW}{mOS4v|S};I^ZPu@hF}0 z>Ob60w$9kiwoEJ>ASA97Db6kE?xn`CX12R9eW09~K5H52EHo%@^M|#x#8fCYk*u76!|8zmFot_lHQ