arm64/mc: add 8-bit neon asm for avg, w_avg and mask
checkasm --bench on a Qualcomm Kryo (Snapdragon 820):
nop: 33.0
avg_w4_8bpc_c: 450.5
avg_w4_8bpc_neon: 20.1
avg_w8_8bpc_c: 438.6
avg_w8_8bpc_neon: 45.2
avg_w16_8bpc_c: 1003.7
avg_w16_8bpc_neon: 112.8
avg_w32_8bpc_c: 3249.6
avg_w32_8bpc_neon: 429.9
avg_w64_8bpc_c: 7213.3
avg_w64_8bpc_neon: 1299.4
avg_w128_8bpc_c: 16791.3
avg_w128_8bpc_neon: 2978.4
w_avg_w4_8bpc_c: 605.7
w_avg_w4_8bpc_neon: 30.9
w_avg_w8_8bpc_c: 545.8
w_avg_w8_8bpc_neon: 72.9
w_avg_w16_8bpc_c: 1430.1
w_avg_w16_8bpc_neon: 193.5
w_avg_w32_8bpc_c: 4876.3
w_avg_w32_8bpc_neon: 715.3
w_avg_w64_8bpc_c: 11338.0
w_avg_w64_8bpc_neon: 2147.0
w_avg_w128_8bpc_c: 26822.0
w_avg_w128_8bpc_neon: 4596.3
mask_w4_8bpc_c: 604.6
mask_w4_8bpc_neon: 37.2
mask_w8_8bpc_c: 654.8
mask_w8_8bpc_neon: 96.0
mask_w16_8bpc_c: 1663.0
mask_w16_8bpc_neon: 272.4
mask_w32_8bpc_c: 5707.6
mask_w32_8bpc_neon: 1028.9
mask_w64_8bpc_c: 12735.3
mask_w64_8bpc_neon: 2533.2
mask_w128_8bpc_c: 31027.6
mask_w128_8bpc_neon: 6247.2
Edited by Janne Grunau