From 62704db5ed7e237a7e8b93486574050c3a3b4c5f Mon Sep 17 00:00:00 2001 From: Doridian Date: Sun, 27 Mar 2022 10:19:34 -0700 Subject: [PATCH] Add very basic feature flagging for NEON --- client/deps/hardnested.cmake | 21 ++++ client/deps/hardnested/Makefile | 31 ++++- client/deps/hardnested/hardnested_bf_core.c | 24 +++- client/deps/hardnested/hardnested_bf_core.h | 13 +++ .../hardnested/hardnested_bitarray_core.c | 107 +++++++++++++++--- client/src/cmdhfmf.c | 25 ++++ client/src/cmdhfmfhard.c | 5 + doc/commands.json | 10 +- 8 files changed, 210 insertions(+), 26 deletions(-) diff --git a/client/deps/hardnested.cmake b/client/deps/hardnested.cmake index dc569641c..b9651f394 100644 --- a/client/deps/hardnested.cmake +++ b/client/deps/hardnested.cmake @@ -10,10 +10,13 @@ target_include_directories(pm3rrg_rdv4_hardnested_nosimd PRIVATE ../../include ../src) +target_compile_definitions(pm3rrg_rdv4_hardnested_nosimd PRIVATE NOSIMD_BUILD) + ## CPU-specific code ## These are mostly for x86-based architectures, which is not useful for many Android devices.
## Mingw platforms: AMD64 set(X86_CPUS x86 x86_64 i686 AMD64) +set(ARM64_CPUS arm64 aarch64) message(STATUS "CMAKE_SYSTEM_PROCESSOR := ${CMAKE_SYSTEM_PROCESSOR}") @@ -104,6 +107,24 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" IN_LIST X86_CPUS) $ $ $) +elseif ("${CMAKE_SYSTEM_PROCESSOR}" IN_LIST ARM64_CPUS) + message(STATUS "Building optimised arm64 binaries") + + ## arm64 / NEON + add_library(pm3rrg_rdv4_hardnested_neon OBJECT + hardnested/hardnested_bf_core.c + hardnested/hardnested_bitarray_core.c) + + target_compile_options(pm3rrg_rdv4_hardnested_neon PRIVATE -Wall -Werror -O3) + set_property(TARGET pm3rrg_rdv4_hardnested_neon PROPERTY POSITION_INDEPENDENT_CODE ON) + + target_include_directories(pm3rrg_rdv4_hardnested_neon PRIVATE + ../../common + ../../include + ../src) + + set(SIMD_TARGETS + $<TARGET_OBJECTS:pm3rrg_rdv4_hardnested_neon>) else () message(STATUS "Not building optimised targets") set(SIMD_TARGETS) diff --git a/client/deps/hardnested/Makefile b/client/deps/hardnested/Makefile index 624eb730f..5f58606ae 100644 --- a/client/deps/hardnested/Makefile +++ b/client/deps/hardnested/Makefile @@ -11,6 +11,9 @@ endif ifneq ($(findstring amd64, $(cpu_arch)), ) MULTIARCHSRCS = hardnested_bf_core.c hardnested_bitarray_core.c endif +ifneq ($(findstring arm64, $(cpu_arch)), ) + MULTIARCHSRCS = hardnested_bf_core.c hardnested_bitarray_core.c +endif ifeq ($(MULTIARCHSRCS), ) MYSRCS += hardnested_bf_core.c hardnested_bitarray_core.c endif @@ -18,20 +21,30 @@ endif LIB_A = libhardnested.a MYOBJS = $(MYSRCS:%.c=$(OBJDIR)/%.o) -MYOBJS += $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_NOSIMD.o) \ - $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_MMX.o) \ - $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_SSE2.o) \ - $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX.o) \ - $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX2.o) +ifneq ($(findstring arm64, $(cpu_arch)), ) + MYOBJS += $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_NOSIMD.o) \ + $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_NEON.o) +else + MYOBJS += $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_NOSIMD.o) \ + $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_MMX.o) \ +
$(MULTIARCHSRCS:%.c=$(OBJDIR)/%_SSE2.o) \ + $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX.o) \ + $(MULTIARCHSRCS:%.c=$(OBJDIR)/%_AVX2.o) +endif SUPPORTS_AVX512 := $(shell echo | $(CC) -E -mavx512f - > /dev/null 2>&1 && echo "True" ) -HARD_SWITCH_NOSIMD = -mno-mmx -mno-sse2 -mno-avx -mno-avx2 +HARD_SWITCH_NOSIMD = -mno-mmx -mno-sse2 -mno-avx -mno-avx2 -DNOSIMD_BUILD +HARD_SWITCH_NEON = HARD_SWITCH_MMX = -mmmx -mno-sse2 -mno-avx -mno-avx2 HARD_SWITCH_SSE2 = -mmmx -msse2 -mno-avx -mno-avx2 HARD_SWITCH_AVX = -mmmx -msse2 -mavx -mno-avx2 HARD_SWITCH_AVX2 = -mmmx -msse2 -mavx -mavx2 HARD_SWITCH_AVX512 = -mmmx -msse2 -mavx -mavx2 -mavx512f +ifneq ($(findstring arm64, $(cpu_arch)), ) + SUPPORTS_AVX512=0 + HARD_SWITCH_NOSIMD = -DNOSIMD_BUILD +endif ifeq "$(SUPPORTS_AVX512)" "True" HARD_SWITCH_NOSIMD += -mno-avx512f HARD_SWITCH_MMX += -mno-avx512f @@ -52,6 +65,12 @@ $(OBJDIR)/%_NOSIMD.o : %.c $(OBJDIR)/%_NOSIMD.d $(Q)$(CC) $(DEPFLAGS:%.Td=%_NOSIMD.Td) $(CFLAGS) $(HARD_SWITCH_NOSIMD) -c -o $@ $< $(Q)$(MV) -f $(OBJDIR)/$*_NOSIMD.Td $(OBJDIR)/$*_NOSIMD.d && $(TOUCH) $@ +$(OBJDIR)/%_NEON.o : %.c $(OBJDIR)/%_NEON.d + $(info [-] CC(NEON) $<) + $(Q)$(MKDIR) $(dir $@) + $(Q)$(CC) $(DEPFLAGS:%.Td=%_NEON.Td) $(CFLAGS) $(HARD_SWITCH_NEON) -c -o $@ $< + $(Q)$(MV) -f $(OBJDIR)/$*_NEON.Td $(OBJDIR)/$*_NEON.d && $(TOUCH) $@ + $(OBJDIR)/%_MMX.o : %.c $(OBJDIR)/%_MMX.d $(info [-] CC(MMX) $<) $(Q)$(MKDIR) $(dir $@) diff --git a/client/deps/hardnested/hardnested_bf_core.c b/client/deps/hardnested/hardnested_bf_core.c index c9af2702b..65d77adf6 100644 --- a/client/deps/hardnested/hardnested_bf_core.c +++ b/client/deps/hardnested/hardnested_bf_core.c @@ -74,7 +74,7 @@ THE SOFTWARE. 
#define MAX_BITSLICES 128 #elif defined(__SSE2__) #define MAX_BITSLICES 128 -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && !defined(NOSIMD_BUILD) #define MAX_BITSLICES 128 #else // MMX or SSE or NOSIMD #define MAX_BITSLICES 64 @@ -120,6 +120,9 @@ typedef union { #elif defined (__MMX__) #define BITSLICE_TEST_NONCES bitslice_test_nonces_MMX #define CRACK_STATES_BITSLICED crack_states_bitsliced_MMX +#elif defined (__ARM_NEON) && !defined(NOSIMD_BUILD) +#define BITSLICE_TEST_NONCES bitslice_test_nonces_NEON +#define CRACK_STATES_BITSLICED crack_states_bitsliced_NEON #else #define BITSLICE_TEST_NONCES bitslice_test_nonces_NOSIMD #define CRACK_STATES_BITSLICED crack_states_bitsliced_NOSIMD @@ -132,6 +135,7 @@ crack_states_bitsliced_t crack_states_bitsliced_AVX2; crack_states_bitsliced_t crack_states_bitsliced_AVX; crack_states_bitsliced_t crack_states_bitsliced_SSE2; crack_states_bitsliced_t crack_states_bitsliced_MMX; +crack_states_bitsliced_t crack_states_bitsliced_NEON; crack_states_bitsliced_t crack_states_bitsliced_NOSIMD; crack_states_bitsliced_t crack_states_bitsliced_dispatch; @@ -141,6 +145,7 @@ bitslice_test_nonces_t bitslice_test_nonces_AVX2; bitslice_test_nonces_t bitslice_test_nonces_AVX; bitslice_test_nonces_t bitslice_test_nonces_SSE2; bitslice_test_nonces_t bitslice_test_nonces_MMX; +bitslice_test_nonces_t bitslice_test_nonces_NEON; bitslice_test_nonces_t bitslice_test_nonces_NOSIMD; bitslice_test_nonces_t bitslice_test_nonces_dispatch; @@ -545,7 +550,7 @@ out: -#ifndef __MMX__ +#ifdef NOSIMD_BUILD // pointers to functions: crack_states_bitsliced_t *crack_states_bitsliced_function_p = &crack_states_bitsliced_dispatch; @@ -582,6 +587,11 @@ static SIMDExecInstr GetSIMDInstr(void) { else if (__builtin_cpu_supports("mmx")) instr = SIMD_MMX; else +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) + instr = SIMD_NEON; + else #endif instr = SIMD_NONE; @@ -620,6 +630,11 @@ uint64_t crack_states_bitsliced_dispatch(uint32_t cuid, uint8_t 
*best_first_byte case SIMD_MMX: crack_states_bitsliced_function_p = &crack_states_bitsliced_MMX; break; +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + case SIMD_NEON: + crack_states_bitsliced_function_p = &crack_states_bitsliced_NEON; + break; #endif case SIMD_AUTO: case SIMD_NONE: @@ -651,6 +666,11 @@ void bitslice_test_nonces_dispatch(uint32_t nonces_to_bruteforce, const uint32_t case SIMD_MMX: bitslice_test_nonces_function_p = &bitslice_test_nonces_MMX; break; +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + case SIMD_NEON: + bitslice_test_nonces_function_p = &bitslice_test_nonces_NEON; + break; #endif case SIMD_AUTO: case SIMD_NONE: diff --git a/client/deps/hardnested/hardnested_bf_core.h b/client/deps/hardnested/hardnested_bf_core.h index 51eca7de5..4693462e3 100644 --- a/client/deps/hardnested/hardnested_bf_core.h +++ b/client/deps/hardnested/hardnested_bf_core.h @@ -61,6 +61,16 @@ THE SOFTWARE. # endif #endif +// ARM64 mandates implementation of NEON (__arm64__ is the Apple spelling, __aarch64__ the one GCC/Clang define elsewhere) +#if defined(__arm64__) || defined(__aarch64__) +#define COMPILER_HAS_SIMD_NEON +#define arm_has_neon() (true) +// ARMv7 or older, NEON is optional and autodetection is difficult +#elif defined(__ARM_NEON) +#define COMPILER_HAS_SIMD_NEON +#define arm_has_neon() (false) +#endif + typedef enum { SIMD_AUTO, #if defined(COMPILER_HAS_SIMD_AVX512) @@ -71,6 +81,9 @@ typedef enum { SIMD_AVX, SIMD_SSE2, SIMD_MMX, +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + SIMD_NEON, #endif SIMD_NONE, } SIMDExecInstr; diff --git a/client/deps/hardnested/hardnested_bitarray_core.c b/client/deps/hardnested/hardnested_bitarray_core.c index d62da774c..68a885be9 100644 --- a/client/deps/hardnested/hardnested_bitarray_core.c +++ b/client/deps/hardnested/hardnested_bitarray_core.c @@ -98,6 +98,20 @@ #define COUNT_BITARRAY_AND2 count_bitarray_AND2_MMX #define COUNT_BITARRAY_AND3 count_bitarray_AND3_MMX #define COUNT_BITARRAY_AND4 count_bitarray_AND4_MMX +#elif defined (__ARM_NEON) && !defined (NOSIMD_BUILD) +#define MALLOC_BITARRAY malloc_bitarray_NEON +#define
FREE_BITARRAY free_bitarray_NEON +#define BITCOUNT bitcount_NEON +#define COUNT_STATES count_states_NEON +#define BITARRAY_AND bitarray_AND_NEON +#define BITARRAY_LOW20_AND bitarray_low20_AND_NEON +#define COUNT_BITARRAY_AND count_bitarray_AND_NEON +#define COUNT_BITARRAY_LOW20_AND count_bitarray_low20_AND_NEON +#define BITARRAY_AND4 bitarray_AND4_NEON +#define BITARRAY_OR bitarray_OR_NEON +#define COUNT_BITARRAY_AND2 count_bitarray_AND2_NEON +#define COUNT_BITARRAY_AND3 count_bitarray_AND3_NEON +#define COUNT_BITARRAY_AND4 count_bitarray_AND4_NEON #else #define MALLOC_BITARRAY malloc_bitarray_NOSIMD #define FREE_BITARRAY free_bitarray_NOSIMD @@ -117,31 +131,31 @@ // typedefs and declaration of functions: typedef uint32_t *malloc_bitarray_t(uint32_t); -malloc_bitarray_t malloc_bitarray_AVX512, malloc_bitarray_AVX2, malloc_bitarray_AVX, malloc_bitarray_SSE2, malloc_bitarray_MMX, malloc_bitarray_NOSIMD, malloc_bitarray_dispatch; +malloc_bitarray_t malloc_bitarray_AVX512, malloc_bitarray_AVX2, malloc_bitarray_AVX, malloc_bitarray_SSE2, malloc_bitarray_MMX, malloc_bitarray_NOSIMD, malloc_bitarray_NEON, malloc_bitarray_dispatch; typedef void free_bitarray_t(uint32_t *); -free_bitarray_t free_bitarray_AVX512, free_bitarray_AVX2, free_bitarray_AVX, free_bitarray_SSE2, free_bitarray_MMX, free_bitarray_NOSIMD, free_bitarray_dispatch; +free_bitarray_t free_bitarray_AVX512, free_bitarray_AVX2, free_bitarray_AVX, free_bitarray_SSE2, free_bitarray_MMX, free_bitarray_NOSIMD, free_bitarray_NEON, free_bitarray_dispatch; typedef uint32_t bitcount_t(uint32_t); -bitcount_t bitcount_AVX512, bitcount_AVX2, bitcount_AVX, bitcount_SSE2, bitcount_MMX, bitcount_NOSIMD, bitcount_dispatch; +bitcount_t bitcount_AVX512, bitcount_AVX2, bitcount_AVX, bitcount_SSE2, bitcount_MMX, bitcount_NOSIMD, bitcount_NEON, bitcount_dispatch; typedef uint32_t count_states_t(uint32_t *); -count_states_t count_states_AVX512, count_states_AVX2, count_states_AVX, count_states_SSE2, count_states_MMX, 
count_states_NOSIMD, count_states_dispatch; +count_states_t count_states_AVX512, count_states_AVX2, count_states_AVX, count_states_SSE2, count_states_MMX, count_states_NOSIMD, count_states_NEON, count_states_dispatch; typedef void bitarray_AND_t(uint32_t[], uint32_t[]); -bitarray_AND_t bitarray_AND_AVX512, bitarray_AND_AVX2, bitarray_AND_AVX, bitarray_AND_SSE2, bitarray_AND_MMX, bitarray_AND_NOSIMD, bitarray_AND_dispatch; +bitarray_AND_t bitarray_AND_AVX512, bitarray_AND_AVX2, bitarray_AND_AVX, bitarray_AND_SSE2, bitarray_AND_MMX, bitarray_AND_NOSIMD, bitarray_AND_NEON, bitarray_AND_dispatch; typedef void bitarray_low20_AND_t(uint32_t *, uint32_t *); -bitarray_low20_AND_t bitarray_low20_AND_AVX512, bitarray_low20_AND_AVX2, bitarray_low20_AND_AVX, bitarray_low20_AND_SSE2, bitarray_low20_AND_MMX, bitarray_low20_AND_NOSIMD, bitarray_low20_AND_dispatch; +bitarray_low20_AND_t bitarray_low20_AND_AVX512, bitarray_low20_AND_AVX2, bitarray_low20_AND_AVX, bitarray_low20_AND_SSE2, bitarray_low20_AND_MMX, bitarray_low20_AND_NOSIMD, bitarray_low20_AND_NEON, bitarray_low20_AND_dispatch; typedef uint32_t count_bitarray_AND_t(uint32_t *, uint32_t *); -count_bitarray_AND_t count_bitarray_AND_AVX512, count_bitarray_AND_AVX2, count_bitarray_AND_AVX, count_bitarray_AND_SSE2, count_bitarray_AND_MMX, count_bitarray_AND_NOSIMD, count_bitarray_AND_dispatch; +count_bitarray_AND_t count_bitarray_AND_AVX512, count_bitarray_AND_AVX2, count_bitarray_AND_AVX, count_bitarray_AND_SSE2, count_bitarray_AND_MMX, count_bitarray_AND_NOSIMD, count_bitarray_AND_NEON, count_bitarray_AND_dispatch; typedef uint32_t count_bitarray_low20_AND_t(uint32_t *, uint32_t *); -count_bitarray_low20_AND_t count_bitarray_low20_AND_AVX512, count_bitarray_low20_AND_AVX2, count_bitarray_low20_AND_AVX, count_bitarray_low20_AND_SSE2, count_bitarray_low20_AND_MMX, count_bitarray_low20_AND_NOSIMD, count_bitarray_low20_AND_dispatch; +count_bitarray_low20_AND_t count_bitarray_low20_AND_AVX512, count_bitarray_low20_AND_AVX2, 
count_bitarray_low20_AND_AVX, count_bitarray_low20_AND_SSE2, count_bitarray_low20_AND_MMX, count_bitarray_low20_AND_NOSIMD, count_bitarray_low20_AND_NEON, count_bitarray_low20_AND_dispatch; typedef void bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *); -bitarray_AND4_t bitarray_AND4_AVX512, bitarray_AND4_AVX2, bitarray_AND4_AVX, bitarray_AND4_SSE2, bitarray_AND4_MMX, bitarray_AND4_NOSIMD, bitarray_AND4_dispatch; +bitarray_AND4_t bitarray_AND4_AVX512, bitarray_AND4_AVX2, bitarray_AND4_AVX, bitarray_AND4_SSE2, bitarray_AND4_MMX, bitarray_AND4_NOSIMD, bitarray_AND4_NEON, bitarray_AND4_dispatch; typedef void bitarray_OR_t(uint32_t[], uint32_t[]); -bitarray_OR_t bitarray_OR_AVX512, bitarray_OR_AVX2, bitarray_OR_AVX, bitarray_OR_SSE2, bitarray_OR_MMX, bitarray_OR_NOSIMD, bitarray_OR_dispatch; +bitarray_OR_t bitarray_OR_AVX512, bitarray_OR_AVX2, bitarray_OR_AVX, bitarray_OR_SSE2, bitarray_OR_MMX, bitarray_OR_NOSIMD, bitarray_OR_NEON, bitarray_OR_dispatch; typedef uint32_t count_bitarray_AND2_t(uint32_t *, uint32_t *); -count_bitarray_AND2_t count_bitarray_AND2_AVX512, count_bitarray_AND2_AVX2, count_bitarray_AND2_AVX, count_bitarray_AND2_SSE2, count_bitarray_AND2_MMX, count_bitarray_AND2_NOSIMD, count_bitarray_AND2_dispatch; +count_bitarray_AND2_t count_bitarray_AND2_AVX512, count_bitarray_AND2_AVX2, count_bitarray_AND2_AVX, count_bitarray_AND2_SSE2, count_bitarray_AND2_MMX, count_bitarray_AND2_NOSIMD, count_bitarray_AND2_NEON, count_bitarray_AND2_dispatch; typedef uint32_t count_bitarray_AND3_t(uint32_t *, uint32_t *, uint32_t *); -count_bitarray_AND3_t count_bitarray_AND3_AVX512, count_bitarray_AND3_AVX2, count_bitarray_AND3_AVX, count_bitarray_AND3_SSE2, count_bitarray_AND3_MMX, count_bitarray_AND3_NOSIMD, count_bitarray_AND3_dispatch; +count_bitarray_AND3_t count_bitarray_AND3_AVX512, count_bitarray_AND3_AVX2, count_bitarray_AND3_AVX, count_bitarray_AND3_SSE2, count_bitarray_AND3_MMX, count_bitarray_AND3_NOSIMD, count_bitarray_AND3_NEON, 
count_bitarray_AND3_dispatch; typedef uint32_t count_bitarray_AND4_t(uint32_t *, uint32_t *, uint32_t *, uint32_t *); -count_bitarray_AND4_t count_bitarray_AND4_AVX512, count_bitarray_AND4_AVX2, count_bitarray_AND4_AVX, count_bitarray_AND4_SSE2, count_bitarray_AND4_MMX, count_bitarray_AND4_NOSIMD, count_bitarray_AND4_dispatch; +count_bitarray_AND4_t count_bitarray_AND4_AVX512, count_bitarray_AND4_AVX2, count_bitarray_AND4_AVX, count_bitarray_AND4_SSE2, count_bitarray_AND4_MMX, count_bitarray_AND4_NOSIMD, count_bitarray_AND4_NEON, count_bitarray_AND4_dispatch; inline uint32_t *MALLOC_BITARRAY(uint32_t x) { @@ -287,7 +301,7 @@ inline uint32_t COUNT_BITARRAY_AND4(uint32_t *restrict A, uint32_t *restrict B, } -#ifndef __MMX__ +#ifdef NOSIMD_BUILD // pointers to functions: malloc_bitarray_t *malloc_bitarray_function_p = &malloc_bitarray_dispatch; @@ -306,6 +320,11 @@ count_bitarray_AND4_t *count_bitarray_AND4_function_p = &count_bitarray_AND4_dis // determine the available instruction set at runtime and call the correct function uint32_t *malloc_bitarray_dispatch(uint32_t x) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) malloc_bitarray_function_p = &malloc_bitarray_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) malloc_bitarray_function_p = &malloc_bitarray_AVX512; else @@ -324,6 +343,11 @@ uint32_t *malloc_bitarray_dispatch(uint32_t x) { } void free_bitarray_dispatch(uint32_t *x) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) free_bitarray_function_p = &free_bitarray_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) free_bitarray_function_p = &free_bitarray_AVX512; else @@ -342,6 +366,11 @@ void free_bitarray_dispatch(uint32_t *x) { } uint32_t bitcount_dispatch(uint32_t a) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) bitcount_function_p = &bitcount_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if 
(__builtin_cpu_supports("avx512f")) bitcount_function_p = &bitcount_AVX512; else @@ -360,6 +389,11 @@ uint32_t bitcount_dispatch(uint32_t a) { } uint32_t count_states_dispatch(uint32_t *bitarray) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_states_function_p = &count_states_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_states_function_p = &count_states_AVX512; else @@ -378,6 +412,11 @@ uint32_t count_states_dispatch(uint32_t *bitarray) { } void bitarray_AND_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) bitarray_AND_function_p = &bitarray_AND_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) bitarray_AND_function_p = &bitarray_AND_AVX512; else @@ -396,6 +435,11 @@ void bitarray_AND_dispatch(uint32_t *A, uint32_t *B) { } void bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) bitarray_low20_AND_function_p = &bitarray_low20_AND_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) bitarray_low20_AND_function_p = &bitarray_low20_AND_AVX512; else @@ -414,6 +458,11 @@ void bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) { } uint32_t count_bitarray_AND_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_bitarray_AND_function_p = &count_bitarray_AND_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_bitarray_AND_function_p = &count_bitarray_AND_AVX512; else @@ -432,6 +481,11 @@ uint32_t count_bitarray_AND_dispatch(uint32_t *A, uint32_t *B) { } uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_NEON; + else +#endif + #if 
defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_bitarray_low20_AND_function_p = &count_bitarray_low20_AND_AVX512; else @@ -450,6 +504,11 @@ uint32_t count_bitarray_low20_AND_dispatch(uint32_t *A, uint32_t *B) { } void bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) bitarray_AND4_function_p = &bitarray_AND4_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) bitarray_AND4_function_p = &bitarray_AND4_AVX512; else @@ -468,6 +527,11 @@ void bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) } void bitarray_OR_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) bitarray_OR_function_p = &bitarray_OR_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) bitarray_OR_function_p = &bitarray_OR_AVX512; else @@ -486,6 +550,11 @@ void bitarray_OR_dispatch(uint32_t *A, uint32_t *B) { } uint32_t count_bitarray_AND2_dispatch(uint32_t *A, uint32_t *B) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_bitarray_AND2_function_p = &count_bitarray_AND2_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_bitarray_AND2_function_p = &count_bitarray_AND2_AVX512; else @@ -504,6 +573,11 @@ uint32_t count_bitarray_AND2_dispatch(uint32_t *A, uint32_t *B) { } uint32_t count_bitarray_AND3_dispatch(uint32_t *A, uint32_t *B, uint32_t *C) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_bitarray_AND3_function_p = &count_bitarray_AND3_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_bitarray_AND3_function_p = &count_bitarray_AND3_AVX512; else @@ -522,6 +596,11 @@ uint32_t count_bitarray_AND3_dispatch(uint32_t *A, uint32_t *B, uint32_t *C) { } uint32_t 
count_bitarray_AND4_dispatch(uint32_t *A, uint32_t *B, uint32_t *C, uint32_t *D) { +#if defined(COMPILER_HAS_SIMD_NEON) + if (arm_has_neon()) count_bitarray_AND4_function_p = &count_bitarray_AND4_NEON; + else +#endif + #if defined(COMPILER_HAS_SIMD_AVX512) if (__builtin_cpu_supports("avx512f")) count_bitarray_AND4_function_p = &count_bitarray_AND4_AVX512; else diff --git a/client/src/cmdhfmf.c b/client/src/cmdhfmf.c index 81463e3ed..5d1f34dd5 100644 --- a/client/src/cmdhfmf.c +++ b/client/src/cmdhfmf.c @@ -1873,6 +1873,9 @@ static int CmdHF14AMfNestedHard(const char *Cmd) { #endif #if defined(COMPILER_HAS_SIMD_AVX512) arg_lit0(NULL, "i5", "AVX512"), +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + arg_lit0(NULL, "ie", "NEON"), #endif arg_param_end }; @@ -1930,6 +1933,9 @@ static int CmdHF14AMfNestedHard(const char *Cmd) { #endif #if defined(COMPILER_HAS_SIMD_AVX512) bool i5 = arg_get_lit(ctx, 20); +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + bool ie = arg_get_lit(ctx, 16); #endif CLIParserFree(ctx); @@ -1951,6 +1957,12 @@ static int CmdHF14AMfNestedHard(const char *Cmd) { if (im) SetSIMDInstr(SIMD_MMX); #endif + +#if defined(COMPILER_HAS_SIMD_NEON) + if (ie) + SetSIMDInstr(SIMD_NEON); +#endif + if (in) SetSIMDInstr(SIMD_NONE); @@ -2067,6 +2079,9 @@ static int CmdHF14AMfAutoPWN(const char *Cmd) { #endif #if defined(COMPILER_HAS_SIMD_AVX512) arg_lit0(NULL, "i5", "AVX512"), +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + arg_lit0(NULL, "ie", "NEON"), #endif arg_param_end }; @@ -2118,6 +2133,10 @@ static int CmdHF14AMfAutoPWN(const char *Cmd) { #if defined(COMPILER_HAS_SIMD_AVX512) bool i5 = arg_get_lit(ctx, 18); #endif +#if defined(COMPILER_HAS_SIMD_NEON) + bool ie = arg_get_lit(ctx, 14); +#endif + CLIParserFree(ctx); //validations @@ -2167,6 +2186,12 @@ static int CmdHF14AMfAutoPWN(const char *Cmd) { if (im) SetSIMDInstr(SIMD_MMX); #endif + +#if defined(COMPILER_HAS_SIMD_NEON) + if (ie) + SetSIMDInstr(SIMD_NEON); +#endif + if (in) SetSIMDInstr(SIMD_NONE); diff --git 
a/client/src/cmdhfmfhard.c b/client/src/cmdhfmfhard.c index 6b219d574..0ed615bd1 100644 --- a/client/src/cmdhfmfhard.c +++ b/client/src/cmdhfmfhard.c @@ -101,6 +101,11 @@ static void get_SIMD_instruction_set(char *instruction_set) { case SIMD_MMX: strcpy(instruction_set, "MMX"); break; +#endif +#if defined(COMPILER_HAS_SIMD_NEON) + case SIMD_NEON: + strcpy(instruction_set, "NEON"); + break; #endif case SIMD_AUTO: case SIMD_NONE: diff --git a/doc/commands.json b/doc/commands.json index 8d302b9f1..ddea4a265 100644 --- a/doc/commands.json +++ b/doc/commands.json @@ -3643,9 +3643,10 @@ "--is sse2", "--ia avx", "--i2 avx2", - "--i5 avx512" + "--i5 avx512", + "--ie neon" ], - "usage": "hf mf autopwn [-habslv] [-k ] [-s ] [-f ] [--mini] [--1k] [--2k] [--4k] [--in] [--im] [--is] [--ia] [--i2] [--i5]" + "usage": "hf mf autopwn [-habslv] [-k ] [-s ] [-f ] [--mini] [--1k] [--2k] [--4k] [--in] [--im] [--is] [--ia] [--i2] [--i5] [--ie]" }, "hf mf cgetblk": { "command": "hf mf cgetblk", @@ -4122,9 +4123,10 @@ "--is sse2", "--ia avx", "--i2 avx2", - "--i5 avx512" + "--i5 avx512", + "--ie neon" ], - "usage": "hf mf hardnested [-habrstw] [-k ] [--blk ] [--tblk ] [--ta] [--tb] [--tk ] [-u ] [-f ] [--in] [--im] [--is] [--ia] [--i2] [--i5]" + "usage": "hf mf hardnested [-habrstw] [-k ] [--blk ] [--tblk ] [--ta] [--tb] [--tk ] [-u ] [-f ] [--in] [--im] [--is] [--ia] [--i2] [--i5] [--ie]" }, "hf mf help": { "command": "hf mf help",