[gurk-rs] Update to 0.3.0

c0dev0id 2022-11-20 12:36:32 +01:00
parent 940e26830b
commit 4219dd4815
307 changed files with 706 additions and 105423 deletions

@@ -1,41 +1,36 @@
COMMENT = Signal Messenger client for terminal
V = 0.2.5
GH_ACCOUNT = boxdot
GH_PROJECT = gurk-rs
GH_TAGNAME = v${V}
GH_TAGNAME = v0.3.0
CATEGORIES = net
MAINTAINER = Stefan Hagen <sh+ports@codevoid.de>
# LICENSE
# AGPL 3.0
PERMIT_PACKAGE = Yes
# as the devel/cargo MODULE adds DISTFILES, GH_* didn't
DISTFILES += ${DISTNAME}${EXTRACT_SUFX}
DISTFILES = ${DISTNAME}{master}${EXTRACT_SUFX}
# vendor files (see $FILESDIR/config)
MASTER_SITES0 = https://codevoid.de/h/
DISTFILES += ${DISTNAME}-vendorfiles${EXTRACT_SUFX}:0
MODULES = devel/cargo
.include "crates.inc"
#MODCARGO_NO_DEFAULT_FEATURES = Yes
WANTLIB = c c++abi pthread
#LIB_DEPENDS =
#RUN_DEPENDS =
#BUILD_DEPENDS =
#TEST_DEPENDS =
CONFIGURE_STYLE = cargo
#MAKE_FLAGS =
#NO_TEST = Yes
#TEST_TARGET =
post-extract:
mv ${WRKDIR}/vendor ${WRKSRC}/vendor;
pre-configure:
cat ${FILESDIR}/config >> ${WRKSRC}/.cargo/config.toml; \
cp -rf ${FILESDIR}/vendor ${WRKSRC}/
cat ${FILESDIR}/config >> ${WRKSRC}/.cargo/config
.include "crates.inc"
.include <bsd.port.mk>
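The `-vendorfiles` distfile fetched from MASTER_SITES0 carries the vendored sources for the git dependencies that `${FILESDIR}/config` redirects to `vendored-sources`. Its generation is not part of this commit; a plausible way to recreate it (an assumption based on the standard `cargo vendor` workflow, with names following DISTNAME above) is:

```sh
# Hypothetical recreation of the vendor tarball; the exact invocation
# used by the maintainer is not shown in this commit.
cd gurk-rs-0.3.0
cargo vendor vendor
tar -czf ../gurk-rs-0.3.0-vendorfiles.tar.gz vendor
```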

@@ -1,360 +1,378 @@
# run: make modcargo-gen-crates-licenses
MODCARGO_CRATES += adler 1.0.2
MODCARGO_CRATES += aead 0.4.2
MODCARGO_CRATES += aes 0.7.4
MODCARGO_CRATES += aes-gcm 0.9.2
MODCARGO_CRATES += aes-gcm-siv 0.10.1
MODCARGO_CRATES += aho-corasick 0.7.18
MODCARGO_CRATES += aead 0.4.3
MODCARGO_CRATES += aes 0.7.5
MODCARGO_CRATES += aes-gcm 0.9.4
MODCARGO_CRATES += aes-gcm-siv 0.10.3
MODCARGO_CRATES += aho-corasick 0.7.19
MODCARGO_CRATES += android_system_properties 0.1.5
MODCARGO_CRATES += ansi_term 0.11.0
MODCARGO_CRATES += anes 0.1.6
MODCARGO_CRATES += ansi_term 0.12.1
MODCARGO_CRATES += anyhow 1.0.42
MODCARGO_CRATES += anyhow 1.0.64
MODCARGO_CRATES += arrayref 0.3.6
MODCARGO_CRATES += arrayvec 0.5.2
MODCARGO_CRATES += ascii 0.9.3
MODCARGO_CRATES += async-broadcast 0.3.4
MODCARGO_CRATES += async-channel 1.6.1
MODCARGO_CRATES += async-broadcast 0.4.1
MODCARGO_CRATES += async-channel 1.7.1
MODCARGO_CRATES += async-executor 1.4.1
MODCARGO_CRATES += async-io 1.6.0
MODCARGO_CRATES += async-io 1.9.0
MODCARGO_CRATES += async-lock 2.5.0
MODCARGO_CRATES += async-recursion 0.3.2
MODCARGO_CRATES += async-task 4.2.0
MODCARGO_CRATES += async-trait 0.1.51
MODCARGO_CRATES += async-tungstenite 0.15.0
MODCARGO_CRATES += async-task 4.3.0
MODCARGO_CRATES += async-trait 0.1.57
MODCARGO_CRATES += async-tungstenite 0.17.2
MODCARGO_CRATES += atty 0.2.14
MODCARGO_CRATES += autocfg 1.1.0
MODCARGO_CRATES += base64 0.12.3
MODCARGO_CRATES += base64 0.13.0
MODCARGO_CRATES += bincode 1.3.3
MODCARGO_CRATES += bitflags 1.2.1
MODCARGO_CRATES += bitflags 1.3.2
MODCARGO_CRATES += block 0.1.6
MODCARGO_CRATES += block-buffer 0.7.3
MODCARGO_CRATES += block-buffer 0.9.0
MODCARGO_CRATES += block-buffer 0.10.3
MODCARGO_CRATES += block-modes 0.8.1
MODCARGO_CRATES += block-padding 0.1.5
MODCARGO_CRATES += block-padding 0.2.1
MODCARGO_CRATES += bstr 0.2.16
MODCARGO_CRATES += bumpalo 3.7.0
MODCARGO_CRATES += byte-tools 0.3.1
MODCARGO_CRATES += bstr 0.2.17
MODCARGO_CRATES += bumpalo 3.11.0
MODCARGO_CRATES += byteorder 1.4.3
MODCARGO_CRATES += bytes 1.0.1
MODCARGO_CRATES += cache-padded 1.1.1
MODCARGO_CRATES += bytes 1.2.1
MODCARGO_CRATES += cache-padded 1.2.0
MODCARGO_CRATES += cassowary 0.3.0
MODCARGO_CRATES += cc 1.0.69
MODCARGO_CRATES += cesu8 1.1.0
MODCARGO_CRATES += cast 0.3.0
MODCARGO_CRATES += cc 1.0.73
MODCARGO_CRATES += cfg-if 1.0.0
MODCARGO_CRATES += checked_int_cast 1.0.0
MODCARGO_CRATES += chrono 0.4.22
MODCARGO_CRATES += ciborium 0.2.0
MODCARGO_CRATES += ciborium-io 0.2.0
MODCARGO_CRATES += ciborium-ll 0.2.0
MODCARGO_CRATES += cipher 0.3.0
MODCARGO_CRATES += clap 2.33.3
MODCARGO_CRATES += combine 3.8.1
MODCARGO_CRATES += concurrent-queue 1.2.2
MODCARGO_CRATES += core-foundation 0.9.1
MODCARGO_CRATES += clap 3.2.23
MODCARGO_CRATES += clap 4.0.18
MODCARGO_CRATES += clap_derive 4.0.18
MODCARGO_CRATES += clap_lex 0.2.4
MODCARGO_CRATES += clap_lex 0.3.0
MODCARGO_CRATES += cmake 0.1.48
MODCARGO_CRATES += concurrent-queue 1.2.4
MODCARGO_CRATES += core-foundation 0.9.3
MODCARGO_CRATES += core-foundation-sys 0.8.3
MODCARGO_CRATES += cpufeatures 0.1.5
MODCARGO_CRATES += crc32fast 1.2.1
MODCARGO_CRATES += crossbeam-channel 0.5.5
MODCARGO_CRATES += crossbeam-epoch 0.9.5
MODCARGO_CRATES += crossbeam-utils 0.8.9
MODCARGO_CRATES += cpufeatures 0.2.5
MODCARGO_CRATES += crc32fast 1.3.2
MODCARGO_CRATES += criterion 0.4.0
MODCARGO_CRATES += criterion-plot 0.5.0
MODCARGO_CRATES += crossbeam-channel 0.5.6
MODCARGO_CRATES += crossbeam-deque 0.8.2
MODCARGO_CRATES += crossbeam-epoch 0.9.10
MODCARGO_CRATES += crossbeam-utils 0.8.11
MODCARGO_CRATES += crossterm 0.19.0
MODCARGO_CRATES += crossterm 0.23.2
MODCARGO_CRATES += crossterm_winapi 0.7.0
MODCARGO_CRATES += crypto-mac 0.7.0
MODCARGO_CRATES += crossterm_winapi 0.9.0
MODCARGO_CRATES += crypto-common 0.1.6
MODCARGO_CRATES += crypto-mac 0.11.1
MODCARGO_CRATES += ct-logs 0.8.0
MODCARGO_CRATES += ctr 0.7.0
MODCARGO_CRATES += curve25519-dalek 3.1.0
MODCARGO_CRATES += ctr 0.8.0
MODCARGO_CRATES += derivative 2.2.0
MODCARGO_CRATES += digest 0.8.1
MODCARGO_CRATES += digest 0.9.0
MODCARGO_CRATES += digest 0.10.3
MODCARGO_CRATES += dirs 3.0.2
MODCARGO_CRATES += dirs 4.0.0
MODCARGO_CRATES += dirs-next 2.0.0
MODCARGO_CRATES += dirs-sys 0.3.6
MODCARGO_CRATES += dirs-sys 0.3.7
MODCARGO_CRATES += dirs-sys-next 0.1.2
MODCARGO_CRATES += displaydoc 0.2.3
MODCARGO_CRATES += easy-parallel 3.2.0
MODCARGO_CRATES += either 1.6.1
MODCARGO_CRATES += either 1.8.0
MODCARGO_CRATES += emoji 0.2.1
MODCARGO_CRATES += enumflags2 0.7.5
MODCARGO_CRATES += enumflags2_derive 0.7.4
MODCARGO_CRATES += env_logger 0.8.4
MODCARGO_CRATES += error-chain 0.12.4
MODCARGO_CRATES += event-listener 2.5.2
MODCARGO_CRATES += fake-simd 0.1.2
MODCARGO_CRATES += fastrand 1.4.1
MODCARGO_CRATES += filetime 0.2.14
MODCARGO_CRATES += fixedbitset 0.2.0
MODCARGO_CRATES += fixedbitset 0.4.1
MODCARGO_CRATES += flate2 1.0.20
MODCARGO_CRATES += event-listener 2.5.3
MODCARGO_CRATES += fastrand 1.8.0
MODCARGO_CRATES += filetime 0.2.17
MODCARGO_CRATES += fixedbitset 0.4.2
MODCARGO_CRATES += flate2 1.0.24
MODCARGO_CRATES += fnv 1.0.7
MODCARGO_CRATES += form_urlencoded 1.0.1
MODCARGO_CRATES += fs2 0.4.3
MODCARGO_CRATES += futures 0.3.15
MODCARGO_CRATES += futures-channel 0.3.15
MODCARGO_CRATES += futures-core 0.3.15
MODCARGO_CRATES += futures-executor 0.3.15
MODCARGO_CRATES += futures-io 0.3.15
MODCARGO_CRATES += futures 0.3.24
MODCARGO_CRATES += futures-channel 0.3.24
MODCARGO_CRATES += futures-core 0.3.24
MODCARGO_CRATES += futures-executor 0.3.24
MODCARGO_CRATES += futures-io 0.3.24
MODCARGO_CRATES += futures-lite 1.12.0
MODCARGO_CRATES += futures-macro 0.3.15
MODCARGO_CRATES += futures-sink 0.3.15
MODCARGO_CRATES += futures-task 0.3.15
MODCARGO_CRATES += futures-util 0.3.15
MODCARGO_CRATES += futures-macro 0.3.24
MODCARGO_CRATES += futures-sink 0.3.24
MODCARGO_CRATES += futures-task 0.3.24
MODCARGO_CRATES += futures-util 0.3.24
MODCARGO_CRATES += fuzzy-matcher 0.3.7
MODCARGO_CRATES += fxhash 0.2.1
MODCARGO_CRATES += generic-array 0.12.4
MODCARGO_CRATES += generic-array 0.14.4
MODCARGO_CRATES += generic-array 0.14.6
MODCARGO_CRATES += getopts 0.2.21
MODCARGO_CRATES += getrandom 0.1.16
MODCARGO_CRATES += getrandom 0.2.3
MODCARGO_CRATES += gh-emoji 1.0.3
MODCARGO_CRATES += ghash 0.4.2
MODCARGO_CRATES += hashbrown 0.11.2
MODCARGO_CRATES += headers 0.3.4
MODCARGO_CRATES += getrandom 0.2.7
MODCARGO_CRATES += gh-emoji 1.0.7
MODCARGO_CRATES += ghash 0.4.4
MODCARGO_CRATES += half 1.8.2
MODCARGO_CRATES += hashbrown 0.12.3
MODCARGO_CRATES += headers 0.3.8
MODCARGO_CRATES += headers-core 0.2.0
MODCARGO_CRATES += heck 0.3.3
MODCARGO_CRATES += heck 0.4.0
MODCARGO_CRATES += hermit-abi 0.1.19
MODCARGO_CRATES += hex 0.4.3
MODCARGO_CRATES += hkdf 0.11.0
MODCARGO_CRATES += hmac 0.7.1
MODCARGO_CRATES += hkdf 0.12.3
MODCARGO_CRATES += hmac 0.11.0
MODCARGO_CRATES += hmac 0.12.1
MODCARGO_CRATES += hostname 0.3.1
MODCARGO_CRATES += http 0.2.4
MODCARGO_CRATES += http-body 0.4.2
MODCARGO_CRATES += httparse 1.4.1
MODCARGO_CRATES += httpdate 1.0.1
MODCARGO_CRATES += hyper 0.14.11
MODCARGO_CRATES += hyper-rustls 0.22.1
MODCARGO_CRATES += http 0.2.8
MODCARGO_CRATES += http-body 0.4.5
MODCARGO_CRATES += httparse 1.8.0
MODCARGO_CRATES += httpdate 1.0.2
MODCARGO_CRATES += hyper 0.14.20
MODCARGO_CRATES += hyper-rustls 0.23.0
MODCARGO_CRATES += hyper-timeout 0.4.1
MODCARGO_CRATES += iana-time-zone 0.1.47
MODCARGO_CRATES += idna 0.2.3
MODCARGO_CRATES += indexmap 1.7.0
MODCARGO_CRATES += instant 0.1.10
MODCARGO_CRATES += indexmap 1.9.1
MODCARGO_CRATES += instant 0.1.12
MODCARGO_CRATES += itertools 0.9.0
MODCARGO_CRATES += itertools 0.10.1
MODCARGO_CRATES += itoa 0.4.7
MODCARGO_CRATES += itoa 1.0.2
MODCARGO_CRATES += jni 0.16.0
MODCARGO_CRATES += jni-sys 0.3.0
MODCARGO_CRATES += itertools 0.10.3
MODCARGO_CRATES += itoa 1.0.3
MODCARGO_CRATES += js-sys 0.3.59
MODCARGO_CRATES += lazy_static 1.4.0
MODCARGO_CRATES += lexical-core 0.7.6
MODCARGO_CRATES += libc 0.2.126
MODCARGO_CRATES += linked-hash-map 0.5.4
MODCARGO_CRATES += lock_api 0.4.7
MODCARGO_CRATES += log 0.4.14
MODCARGO_CRATES += log-panics 2.0.0
MODCARGO_CRATES += libc 0.2.132
MODCARGO_CRATES += linked-hash-map 0.5.6
MODCARGO_CRATES += lock_api 0.4.8
MODCARGO_CRATES += log 0.4.17
MODCARGO_CRATES += log-panics 2.1.0
MODCARGO_CRATES += lru-cache 0.1.2
MODCARGO_CRATES += mac-notification-sys 0.5.2
MODCARGO_CRATES += mac-notification-sys 0.5.6
MODCARGO_CRATES += malloc_buf 0.0.6
MODCARGO_CRATES += match_cfg 0.1.0
MODCARGO_CRATES += matches 0.1.8
MODCARGO_CRATES += memchr 2.4.0
MODCARGO_CRATES += memoffset 0.6.4
MODCARGO_CRATES += matches 0.1.9
MODCARGO_CRATES += memchr 2.5.0
MODCARGO_CRATES += memoffset 0.6.5
MODCARGO_CRATES += mime 0.3.16
MODCARGO_CRATES += mime_guess 2.0.3
MODCARGO_CRATES += miniz_oxide 0.4.4
MODCARGO_CRATES += mio 0.7.13
MODCARGO_CRATES += mime_guess 2.0.4
MODCARGO_CRATES += miniz_oxide 0.5.4
MODCARGO_CRATES += mio 0.7.14
MODCARGO_CRATES += mio 0.8.4
MODCARGO_CRATES += miow 0.3.7
MODCARGO_CRATES += mpart-async 0.5.0
MODCARGO_CRATES += mpart-async 0.6.1
MODCARGO_CRATES += multimap 0.8.3
MODCARGO_CRATES += nix 0.23.1
MODCARGO_CRATES += nom 5.1.2
MODCARGO_CRATES += notify-rust 4.5.8
MODCARGO_CRATES += ntapi 0.3.6
MODCARGO_CRATES += num-integer 0.1.44
MODCARGO_CRATES += num-traits 0.2.14
MODCARGO_CRATES += num_cpus 1.13.0
MODCARGO_CRATES += num_enum 0.5.2
MODCARGO_CRATES += num_enum_derive 0.5.2
MODCARGO_CRATES += notify-rust 4.5.10
MODCARGO_CRATES += ntapi 0.3.7
MODCARGO_CRATES += num-integer 0.1.45
MODCARGO_CRATES += num-traits 0.2.15
MODCARGO_CRATES += num_cpus 1.13.1
MODCARGO_CRATES += num_enum 0.5.7
MODCARGO_CRATES += num_enum_derive 0.5.7
MODCARGO_CRATES += num_threads 0.1.6
MODCARGO_CRATES += objc 0.2.7
MODCARGO_CRATES += objc-foundation 0.1.1
MODCARGO_CRATES += objc_id 0.1.1
MODCARGO_CRATES += once_cell 1.13.1
MODCARGO_CRATES += once_cell 1.14.0
MODCARGO_CRATES += oncemutex 0.1.1
MODCARGO_CRATES += opaque-debug 0.2.3
MODCARGO_CRATES += oorandom 11.1.3
MODCARGO_CRATES += opaque-debug 0.3.0
MODCARGO_CRATES += opener 0.5.0
MODCARGO_CRATES += openssl-probe 0.1.4
MODCARGO_CRATES += openssl-probe 0.1.5
MODCARGO_CRATES += ordered-stream 0.0.1
MODCARGO_CRATES += os_str_bytes 6.3.1
MODCARGO_CRATES += parking 2.0.0
MODCARGO_CRATES += parking_lot 0.11.1
MODCARGO_CRATES += parking_lot_core 0.8.3
MODCARGO_CRATES += parking_lot 0.11.2
MODCARGO_CRATES += parking_lot 0.12.1
MODCARGO_CRATES += parking_lot_core 0.8.5
MODCARGO_CRATES += parking_lot_core 0.9.3
MODCARGO_CRATES += percent-encoding 2.1.0
MODCARGO_CRATES += petgraph 0.5.1
MODCARGO_CRATES += petgraph 0.6.0
MODCARGO_CRATES += petgraph 0.6.2
MODCARGO_CRATES += phf 0.8.0
MODCARGO_CRATES += phf 0.11.1
MODCARGO_CRATES += phf_generator 0.8.0
MODCARGO_CRATES += phf_macros 0.8.0
MODCARGO_CRATES += phf_shared 0.8.0
MODCARGO_CRATES += phf_shared 0.11.1
MODCARGO_CRATES += phonenumber 0.3.1+8.12.9
MODCARGO_CRATES += pin-project 1.0.7
MODCARGO_CRATES += pin-project-internal 1.0.7
MODCARGO_CRATES += pin-project-lite 0.2.7
MODCARGO_CRATES += pin-project-lite 0.2.9
MODCARGO_CRATES += pin-utils 0.1.0
MODCARGO_CRATES += polling 2.1.0
MODCARGO_CRATES += polyval 0.5.1
MODCARGO_CRATES += ppv-lite86 0.2.10
MODCARGO_CRATES += proc-macro-crate 1.0.0
MODCARGO_CRATES += plotters 0.3.4
MODCARGO_CRATES += plotters-backend 0.3.4
MODCARGO_CRATES += plotters-svg 0.3.3
MODCARGO_CRATES += polling 2.3.0
MODCARGO_CRATES += polyval 0.5.3
MODCARGO_CRATES += ppv-lite86 0.2.16
MODCARGO_CRATES += proc-macro-crate 1.2.1
MODCARGO_CRATES += proc-macro-error 1.0.4
MODCARGO_CRATES += proc-macro-error-attr 1.0.4
MODCARGO_CRATES += proc-macro-hack 0.5.19
MODCARGO_CRATES += proc-macro-nested 0.1.7
MODCARGO_CRATES += proc-macro2 1.0.27
MODCARGO_CRATES += prost 0.8.0
MODCARGO_CRATES += proc-macro2 1.0.43
MODCARGO_CRATES += prost 0.9.0
MODCARGO_CRATES += prost-build 0.8.0
MODCARGO_CRATES += prost 0.10.4
MODCARGO_CRATES += prost-build 0.9.0
MODCARGO_CRATES += prost-derive 0.8.0
MODCARGO_CRATES += prost-build 0.10.4
MODCARGO_CRATES += prost-derive 0.9.0
MODCARGO_CRATES += prost-types 0.8.0
MODCARGO_CRATES += prost-derive 0.10.1
MODCARGO_CRATES += prost-types 0.9.0
MODCARGO_CRATES += prost-types 0.10.1
MODCARGO_CRATES += pulldown-cmark 0.8.0
MODCARGO_CRATES += qr2term 0.2.2
MODCARGO_CRATES += qr2term 0.3.0
MODCARGO_CRATES += qrcode 0.12.0
MODCARGO_CRATES += quick-xml 0.18.1
MODCARGO_CRATES += quick-xml 0.23.1
MODCARGO_CRATES += quickcheck 1.0.3
MODCARGO_CRATES += quickcheck_macros 1.0.0
MODCARGO_CRATES += quote 1.0.9
MODCARGO_CRATES += quote 1.0.21
MODCARGO_CRATES += rand 0.7.3
MODCARGO_CRATES += rand 0.8.4
MODCARGO_CRATES += rand 0.8.5
MODCARGO_CRATES += rand_chacha 0.2.2
MODCARGO_CRATES += rand_chacha 0.3.1
MODCARGO_CRATES += rand_core 0.5.1
MODCARGO_CRATES += rand_core 0.6.3
MODCARGO_CRATES += rand_hc 0.2.0
MODCARGO_CRATES += rand_hc 0.3.1
MODCARGO_CRATES += rand_pcg 0.2.1
MODCARGO_CRATES += redox_syscall 0.2.9
MODCARGO_CRATES += redox_users 0.4.0
MODCARGO_CRATES += regex 1.5.6
MODCARGO_CRATES += rayon 1.5.3
MODCARGO_CRATES += rayon-core 1.9.3
MODCARGO_CRATES += redox_syscall 0.2.16
MODCARGO_CRATES += redox_users 0.4.3
MODCARGO_CRATES += regex 1.6.0
MODCARGO_CRATES += regex-automata 0.1.10
MODCARGO_CRATES += regex-cache 0.2.1
MODCARGO_CRATES += regex-syntax 0.6.26
MODCARGO_CRATES += regex-syntax 0.6.27
MODCARGO_CRATES += remove_dir_all 0.5.3
MODCARGO_CRATES += ring 0.16.20
MODCARGO_CRATES += rustls 0.19.1
MODCARGO_CRATES += rustls-native-certs 0.5.0
MODCARGO_CRATES += ryu 1.0.5
MODCARGO_CRATES += rustls 0.20.6
MODCARGO_CRATES += rustls-native-certs 0.6.2
MODCARGO_CRATES += rustls-pemfile 0.3.0
MODCARGO_CRATES += rustls-pemfile 1.0.1
MODCARGO_CRATES += ryu 1.0.11
MODCARGO_CRATES += same-file 1.0.6
MODCARGO_CRATES += schannel 0.1.19
MODCARGO_CRATES += schannel 0.1.20
MODCARGO_CRATES += scopeguard 1.1.0
MODCARGO_CRATES += sct 0.6.1
MODCARGO_CRATES += security-framework 2.3.1
MODCARGO_CRATES += security-framework-sys 2.3.0
MODCARGO_CRATES += semver 1.0.3
MODCARGO_CRATES += serde 1.0.126
MODCARGO_CRATES += serde_derive 1.0.126
MODCARGO_CRATES += serde_json 1.0.64
MODCARGO_CRATES += serde_repr 0.1.7
MODCARGO_CRATES += sha-1 0.9.6
MODCARGO_CRATES += sct 0.7.0
MODCARGO_CRATES += security-framework 2.7.0
MODCARGO_CRATES += security-framework-sys 2.6.1
MODCARGO_CRATES += semver 1.0.13
MODCARGO_CRATES += serde 1.0.144
MODCARGO_CRATES += serde_derive 1.0.144
MODCARGO_CRATES += serde_json 1.0.85
MODCARGO_CRATES += serde_repr 0.1.9
MODCARGO_CRATES += sha-1 0.10.0
MODCARGO_CRATES += sha1 0.6.1
MODCARGO_CRATES += sha1 0.10.4
MODCARGO_CRATES += sha1_smol 1.0.0
MODCARGO_CRATES += sha2 0.8.2
MODCARGO_CRATES += sha2 0.9.5
MODCARGO_CRATES += sha2 0.9.9
MODCARGO_CRATES += sha2 0.10.5
MODCARGO_CRATES += sharded-slab 0.1.4
MODCARGO_CRATES += signal-hook 0.1.17
MODCARGO_CRATES += signal-hook 0.3.14
MODCARGO_CRATES += signal-hook-mio 0.2.3
MODCARGO_CRATES += signal-hook-registry 1.4.0
MODCARGO_CRATES += siphasher 0.3.5
MODCARGO_CRATES += slab 0.4.3
MODCARGO_CRATES += sled 0.34.6
MODCARGO_CRATES += smallvec 1.6.1
MODCARGO_CRATES += siphasher 0.3.10
MODCARGO_CRATES += slab 0.4.7
MODCARGO_CRATES += sled 0.34.7
MODCARGO_CRATES += smallvec 1.9.0
MODCARGO_CRATES += smawk 0.3.1
MODCARGO_CRATES += socket2 0.4.0
MODCARGO_CRATES += socket2 0.4.7
MODCARGO_CRATES += spin 0.5.2
MODCARGO_CRATES += static_assertions 1.1.0
MODCARGO_CRATES += strsim 0.8.0
MODCARGO_CRATES += structopt 0.3.22
MODCARGO_CRATES += structopt-derive 0.4.15
MODCARGO_CRATES += strsim 0.10.0
MODCARGO_CRATES += strum 0.22.0
MODCARGO_CRATES += strum_macros 0.22.0
MODCARGO_CRATES += subtle 1.0.0
MODCARGO_CRATES += subtle 2.4.1
MODCARGO_CRATES += syn 1.0.73
MODCARGO_CRATES += synstructure 0.12.5
MODCARGO_CRATES += syn 1.0.99
MODCARGO_CRATES += synstructure 0.12.6
MODCARGO_CRATES += tar 0.4.38
MODCARGO_CRATES += tempfile 3.2.0
MODCARGO_CRATES += textwrap 0.11.0
MODCARGO_CRATES += tauri-winrt-notification 0.1.0
MODCARGO_CRATES += tempfile 3.3.0
MODCARGO_CRATES += termcolor 1.1.3
MODCARGO_CRATES += textwrap 0.14.2
MODCARGO_CRATES += thiserror 1.0.30
MODCARGO_CRATES += thiserror-impl 1.0.30
MODCARGO_CRATES += textwrap 0.16.0
MODCARGO_CRATES += thiserror 1.0.34
MODCARGO_CRATES += thiserror-impl 1.0.34
MODCARGO_CRATES += thread_local 1.1.4
MODCARGO_CRATES += time 0.1.44
MODCARGO_CRATES += time 0.3.9
MODCARGO_CRATES += tinyvec 1.2.0
MODCARGO_CRATES += time 0.3.14
MODCARGO_CRATES += tinytemplate 1.2.1
MODCARGO_CRATES += tinyvec 1.6.0
MODCARGO_CRATES += tinyvec_macros 0.1.0
MODCARGO_CRATES += tokio 1.16.1
MODCARGO_CRATES += tokio-io-timeout 1.1.1
MODCARGO_CRATES += tokio 1.21.0
MODCARGO_CRATES += tokio-io-timeout 1.2.0
MODCARGO_CRATES += tokio-macros 1.8.0
MODCARGO_CRATES += tokio-rustls 0.22.0
MODCARGO_CRATES += tokio-stream 0.1.7
MODCARGO_CRATES += tokio-util 0.6.7
MODCARGO_CRATES += toml 0.5.8
MODCARGO_CRATES += tower-service 0.3.1
MODCARGO_CRATES += tracing 0.1.35
MODCARGO_CRATES += tokio-rustls 0.23.4
MODCARGO_CRATES += tokio-stream 0.1.9
MODCARGO_CRATES += tokio-util 0.6.10
MODCARGO_CRATES += toml 0.5.9
MODCARGO_CRATES += tower-service 0.3.2
MODCARGO_CRATES += tracing 0.1.36
MODCARGO_CRATES += tracing-appender 0.2.2
MODCARGO_CRATES += tracing-attributes 0.1.21
MODCARGO_CRATES += tracing-core 0.1.27
MODCARGO_CRATES += tracing-attributes 0.1.22
MODCARGO_CRATES += tracing-core 0.1.29
MODCARGO_CRATES += tracing-log 0.1.3
MODCARGO_CRATES += tracing-subscriber 0.3.11
MODCARGO_CRATES += tracing-subscriber 0.3.15
MODCARGO_CRATES += try-lock 0.2.3
MODCARGO_CRATES += tui 0.15.0
MODCARGO_CRATES += tungstenite 0.15.0
MODCARGO_CRATES += twoway 0.2.2
MODCARGO_CRATES += typenum 1.13.0
MODCARGO_CRATES += unchecked-index 0.2.2
MODCARGO_CRATES += tungstenite 0.17.3
MODCARGO_CRATES += typenum 1.15.0
MODCARGO_CRATES += uds_windows 1.0.2
MODCARGO_CRATES += unicase 2.6.0
MODCARGO_CRATES += unicode-bidi 0.3.5
MODCARGO_CRATES += unicode-linebreak 0.1.1
MODCARGO_CRATES += unicode-normalization 0.1.19
MODCARGO_CRATES += unicode-segmentation 1.8.0
MODCARGO_CRATES += unicode-width 0.1.8
MODCARGO_CRATES += unicode-xid 0.2.2
MODCARGO_CRATES += universal-hash 0.4.0
MODCARGO_CRATES += unreachable 1.0.0
MODCARGO_CRATES += unicode-bidi 0.3.8
MODCARGO_CRATES += unicode-ident 1.0.3
MODCARGO_CRATES += unicode-linebreak 0.1.2
MODCARGO_CRATES += unicode-normalization 0.1.21
MODCARGO_CRATES += unicode-segmentation 1.9.0
MODCARGO_CRATES += unicode-width 0.1.9
MODCARGO_CRATES += unicode-xid 0.2.3
MODCARGO_CRATES += universal-hash 0.4.1
MODCARGO_CRATES += untrusted 0.7.1
MODCARGO_CRATES += url 2.2.2
MODCARGO_CRATES += utf-8 0.7.6
MODCARGO_CRATES += uuid 0.8.2
MODCARGO_CRATES += uuid 1.2.1
MODCARGO_CRATES += valuable 0.1.0
MODCARGO_CRATES += vec_map 0.8.2
MODCARGO_CRATES += version_check 0.9.3
MODCARGO_CRATES += void 1.0.2
MODCARGO_CRATES += version_check 0.9.4
MODCARGO_CRATES += waker-fn 1.1.0
MODCARGO_CRATES += walkdir 2.3.2
MODCARGO_CRATES += want 0.3.0
MODCARGO_CRATES += wasi 0.9.0+wasi-snapshot-preview1
MODCARGO_CRATES += wasi 0.10.0+wasi-snapshot-preview1
MODCARGO_CRATES += wasi 0.11.0+wasi-snapshot-preview1
MODCARGO_CRATES += wasm-bindgen 0.2.82
MODCARGO_CRATES += wasm-bindgen-backend 0.2.82
MODCARGO_CRATES += wasm-bindgen-macro 0.2.82
MODCARGO_CRATES += wasm-bindgen-macro-support 0.2.82
MODCARGO_CRATES += wasm-bindgen-shared 0.2.82
MODCARGO_CRATES += web-sys 0.3.51
MODCARGO_CRATES += webpki 0.21.4
MODCARGO_CRATES += web-sys 0.3.59
MODCARGO_CRATES += webpki 0.22.0
MODCARGO_CRATES += wepoll-ffi 0.1.2
MODCARGO_CRATES += which 4.1.0
MODCARGO_CRATES += whoami 1.1.2
MODCARGO_CRATES += which 4.3.0
MODCARGO_CRATES += whoami 1.2.1
MODCARGO_CRATES += winapi 0.3.9
MODCARGO_CRATES += winapi-i686-pc-windows-gnu 0.4.0
MODCARGO_CRATES += winapi-util 0.1.5
MODCARGO_CRATES += winapi-x86_64-pc-windows-gnu 0.4.0
MODCARGO_CRATES += windows 0.24.0
MODCARGO_CRATES += windows_i686_gnu 0.24.0
MODCARGO_CRATES += windows_i686_msvc 0.24.0
MODCARGO_CRATES += windows_x86_64_gnu 0.24.0
MODCARGO_CRATES += windows_x86_64_msvc 0.24.0
MODCARGO_CRATES += winrt-notification 0.5.1
MODCARGO_CRATES += x25519-dalek 1.1.1
MODCARGO_CRATES += xattr 0.2.2
MODCARGO_CRATES += xflags 0.2.2
MODCARGO_CRATES += xflags-macros 0.2.2
MODCARGO_CRATES += xml-rs 0.8.4
MODCARGO_CRATES += xshell 0.1.14
MODCARGO_CRATES += xshell-macros 0.1.14
MODCARGO_CRATES += zbus 2.1.1
MODCARGO_CRATES += zbus_macros 2.1.1
MODCARGO_CRATES += zbus_names 2.1.0
MODCARGO_CRATES += windows 0.39.0
MODCARGO_CRATES += windows-sys 0.36.1
MODCARGO_CRATES += windows_aarch64_msvc 0.36.1
MODCARGO_CRATES += windows_aarch64_msvc 0.39.0
MODCARGO_CRATES += windows_i686_gnu 0.36.1
MODCARGO_CRATES += windows_i686_gnu 0.39.0
MODCARGO_CRATES += windows_i686_msvc 0.36.1
MODCARGO_CRATES += windows_i686_msvc 0.39.0
MODCARGO_CRATES += windows_x86_64_gnu 0.36.1
MODCARGO_CRATES += windows_x86_64_gnu 0.39.0
MODCARGO_CRATES += windows_x86_64_msvc 0.36.1
MODCARGO_CRATES += windows_x86_64_msvc 0.39.0
MODCARGO_CRATES += x25519-dalek 1.2.0
MODCARGO_CRATES += xattr 0.2.3
MODCARGO_CRATES += xflags 0.2.4
MODCARGO_CRATES += xflags-macros 0.2.4
MODCARGO_CRATES += xshell 0.1.17
MODCARGO_CRATES += xshell-macros 0.1.17
MODCARGO_CRATES += zbus 2.3.2
MODCARGO_CRATES += zbus_macros 2.3.2
MODCARGO_CRATES += zbus_names 2.2.0
MODCARGO_CRATES += zeroize 1.3.0
MODCARGO_CRATES += zeroize_derive 1.3.2
MODCARGO_CRATES += zvariant 3.3.0
MODCARGO_CRATES += zvariant_derive 3.3.0
MODCARGO_CRATES += zvariant 3.6.0
MODCARGO_CRATES += zvariant_derive 3.6.0

File diff suppressed because it is too large.

@@ -1,31 +1,21 @@
[source."https://github.com/boxdot/libsignal-service-rs"]
git = "https://github.com/boxdot/libsignal-service-rs"
rev = "8be91da2"
replace-with = "vendored-sources"
[source."https://github.com/boxdot/presage.git"]
git = "https://github.com/boxdot/presage.git"
rev = "f908e8f"
replace-with = "vendored-sources"
[source."https://github.com/signalapp/curve25519-dalek.git"]
git = "https://github.com/signalapp/curve25519-dalek.git"
[source."https://github.com/signalapp/curve25519-dalek"]
git = "https://github.com/signalapp/curve25519-dalek"
branch = "lizard2"
replace-with = "vendored-sources"
[source."https://github.com/signalapp/libsignal-client"]
git = "https://github.com/signalapp/libsignal-client"
tag = "v0.11.0"
[source."https://github.com/signalapp/libsignal"]
git = "https://github.com/signalapp/libsignal"
tag = "v0.20.0"
replace-with = "vendored-sources"
[source."https://github.com/signalapp/poksho.git"]
git = "https://github.com/signalapp/poksho.git"
tag = "v0.7.0"
[source."https://github.com/whisperfish/libsignal-service-rs"]
git = "https://github.com/whisperfish/libsignal-service-rs"
rev = "8666ba56f47e405aaf8ed243be6e2ad1b5ad68c1"
replace-with = "vendored-sources"
[source."https://github.com/signalapp/zkgroup"]
git = "https://github.com/signalapp/zkgroup"
tag = "v0.7.3"
[source."https://github.com/whisperfish/presage"]
git = "https://github.com/whisperfish/presage"
rev = "f84d958"
replace-with = "vendored-sources"
[source.vendored-sources]
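The `[source.vendored-sources]` entry above appears truncated by the diff view; a complete source-replacement file ends by pointing that source at the extracted vendor tree. A minimal sketch (the `directory` value is an assumption, inferred from the post-extract step in the Makefile):

```toml
# Final stanza of a typical cargo source-replacement config; the
# relative path is assumed to match ${WRKSRC}/vendor after extraction.
[source.vendored-sources]
directory = "vendor"
```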

File diff suppressed because one or more lines are too long

@@ -1,104 +0,0 @@
# Changelog
Entries are listed in reverse chronological order.
## 2.0.0
* Fix a data modeling error in the `serde` feature pointed out by Trevor Perrin
which caused points and scalars to be serialized with length fields rather
than as fixed-size 32-byte arrays. This is a breaking change, but it fixes
compatibility with `serde-json` and ensures that the `serde-bincode` encoding
matches the conventional encoding for X/Ed25519.
* Update `rand_core` to `0.5`, allowing use with new `rand` versions.
* Switch from `clear_on_drop` to `zeroize` (by Tony Arcieri).
* Require `subtle = ^2.2.1` and remove the note advising nightly Rust, which is
no longer required as of that version of `subtle`. See the `subtle`
changelog for more details.
* Update `README.md` for `2.x` series.
* Remove the `build.rs` hack which loaded the entire crate into its own
`build.rs` to generate constants, and keep the constants in the source code.
The only significant change is the data model change to the `serde` feature;
besides the `rand_core` version bump, there are no other user-visible changes.
## 1.2.3
* Fix an issue identified by a Quarkslab audit (and Jack Grigg), where manually
constructing unreduced `Scalar` values, as needed for X/Ed25519, and then
performing scalar/scalar arithmetic could compute incorrect results.
* Switch to upstream Rust intrinsics for the IFMA backend now that they exist in
Rust and don't need to be defined locally.
* Ensure that the NAF computation works correctly, even for parameters never
used elsewhere in the codebase.
* Minor refactoring to EdwardsPoint decompression.
* Fix broken links in documentation.
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
root (not quite correctly done in 1.2.2).
## 1.2.2
* Fix a typo in an internal doc-comment.
* Add the "crypto" tag to crate metadata.
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
root.
## 1.2.1
* Fix a bug in bucket index calculations in the Pippenger multiscalar algorithm
for very large input sizes.
* Add a more extensive randomized multiscalar multiplication consistency check
to the test suite to prevent regressions.
* Ensure that multiscalar and NAF computations work correctly on extremal
`Scalar` values constructed via `from_bits`.
## 1.2.0
* New multiscalar multiplication algorithm with better performance for
large problem sizes. The backend algorithm is selected
transparently using the size hints of the input iterators, so no
changes are required for client crates to start using it.
* Equality of Edwards points is now checked in projective coordinates.
* Serde can now be used with `no_std`.
## 1.1.4
* Fix typos in documentation comments.
* Remove unnecessary `Default` bound on `Scalar::from_hash`.
## 1.1.3
* Reverts the change in 1.1.0 to allow owned and borrowed RNGs, which caused a breakage due to a subtle interaction with ownership rules. (The `RngCore` change is retained).
## 1.1.2
* Disabled KaTeX on `docs.rs` pending proper [support upstream](https://github.com/rust-lang/docs.rs/issues/302).
## 1.1.1
* Fixed an issue related to `#[cfg(rustdoc)]` which prevented documenting multiple backends.
## 1.1.0
* Adds support for precomputation for multiscalar multiplication.
* Restructures the internal source tree into `serial` and `vector` backends (no change to external API).
* Adds a new IFMA backend which sets speed records.
* The `avx2_backend` feature is now an alias for the `simd_backend` feature, which autoselects an appropriate vector backend (currently AVX2 or IFMA).
* Replaces the `rand` dependency with `rand_core`.
* Generalizes trait bounds on `RistrettoPoint::random()` and `Scalar::random()` to allow owned and borrowed RNGs and to allow `RngCore` instead of `Rng`.
## 1.0.3
* Adds `ConstantTimeEq` implementation for compressed points.
## 1.0.2
* Fixes a typo in the naming of variables in Ristretto formulas (no change to functionality).
## 1.0.1
* Depends on the stable `2.0` version of `subtle` instead of `2.0.0-pre.0`.
## 1.0.0
Initial stable release. Yanked due to a dependency mistake (see above).

@@ -1,28 +0,0 @@
# Contributing to curve25519-dalek
If you have questions or comments, please feel free to email the
authors.
For feature requests, suggestions, and bug reports, please open an issue on
[our Github](https://github.com/dalek-cryptography/curve25519-dalek). (Or, send us
an email if you're opposed to using Github for whatever reason.)
Patches are welcomed as pull requests on
[our Github](https://github.com/dalek-cryptography/curve25519-dalek), as well as by
email (preferably sent to all of the authors listed in `Cargo.toml`).
All issues on curve25519-dalek are mentored; if you want help with a bug, just
ask @isislovecruft or @hdevalence.
Some issues are easier than others. The `easy` label can be used to find the
easy issues. If you want to work on an issue, please leave a comment so that we
can assign it to you!
# Code of Conduct
We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html),
with the following additional clauses:
* We respect the rights to privacy and anonymity for contributors and people in
the community. If someone wishes to contribute under a pseudonym different to
their primary identity, that wish is to be respected by all contributors.

@@ -1,61 +0,0 @@
[package]
name = "curve25519-dalek"
version = "2.0.0"
authors = ["Isis Lovecruft <isis@patternsinthevoid.net>",
"Henry de Valence <hdevalence@hdevalence.ca>"]
readme = "README.md"
license = "BSD-3-Clause"
repository = "https://github.com/dalek-cryptography/curve25519-dalek"
homepage = "https://dalek.rs/curve25519-dalek"
documentation = "https://docs.rs/curve25519-dalek"
categories = ["cryptography", "no-std"]
keywords = ["cryptography", "crypto", "ristretto", "curve25519", "ristretto255"]
description = "A pure-Rust implementation of group operations on ristretto255 and Curve25519"
exclude = [
"**/.gitignore",
".gitignore",
".travis.yml",
]
[package.metadata.docs.rs]
# Disabled for now since this is borked; tracking https://github.com/rust-lang/docs.rs/issues/302
# rustdoc-args = ["--html-in-header", ".cargo/registry/src/github.com-1ecc6299db9ec823/curve25519-dalek-0.13.2/rustdoc-include-katex-header.html"]
features = ["nightly", "simd_backend"]
[badges]
travis-ci = { repository = "dalek-cryptography/curve25519-dalek", branch = "master"}
[dev-dependencies]
sha2 = { version = "0.8", default-features = false }
bincode = "1"
criterion = "0.3"
rand = "0.7"
[[bench]]
name = "dalek_benchmarks"
harness = false
[dependencies]
rand_core = { version = "0.5", default-features = false }
byteorder = { version = "^1.2.3", default-features = false, features = ["i128"] }
digest = { version = "0.8", default-features = false }
subtle = { version = "^2.2.1", default-features = false }
serde = { version = "1.0", default-features = false, optional = true, features = ["derive"] }
packed_simd = { version = "0.3", features = ["into_bits"], optional = true }
zeroize = { version = "1", default-features = false }
[features]
nightly = ["subtle/nightly"]
default = ["std", "u64_backend"]
std = ["alloc", "subtle/std", "rand_core/std"]
alloc = ["zeroize/alloc"]
# The u32 backend uses u32s with u64 products.
u32_backend = []
# The u64 backend uses u64s with u128 products.
u64_backend = []
# The SIMD backend uses parallel formulas, using either AVX2 or AVX512-IFMA.
simd_backend = ["nightly", "u64_backend", "packed_simd"]
# DEPRECATED: this is now an alias for `simd_backend` and may be removed
# in some future release.
avx2_backend = ["simd_backend"]

@@ -1,64 +0,0 @@
Copyright (c) 2016-2019 Isis Agora Lovecruft, Henry de Valence. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
========================================================================
Portions of curve25519-dalek were originally derived from Adam Langley's
Go ed25519 implementation, found at <https://github.com/agl/ed25519/>,
under the following licence:
========================================================================
Copyright (c) 2012 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -1,8 +0,0 @@
FEATURES := nightly yolocrypto avx2_backend
doc:
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html
doc-internal:
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html --document-private-items

@@ -1,207 +0,0 @@
# curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek)
<img
width="33%"
align="right"
src="https://doc.dalek.rs/assets/dalek-logo-clear.png"/>
**A pure-Rust implementation of group operations on Ristretto and Curve25519.**
`curve25519-dalek` is a library providing group operations on the Edwards and
Montgomery forms of Curve25519, and on the prime-order Ristretto group.
`curve25519-dalek` is not intended to provide implementations of any particular
crypto protocol. Rather, implementations of those protocols (such as
[`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use
`curve25519-dalek` as a library.
`curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use
implementing a wide range of ECC-based crypto protocols, such as key agreement,
signatures, anonymous credentials, rangeproofs, and zero-knowledge proof
systems.
In particular, `curve25519-dalek` implements Ristretto, which constructs a
prime-order group from a non-prime-order Edwards curve. This provides the
speed and safety benefits of Edwards curve arithmetic, without the pitfalls of
cofactor-related abstraction mismatches.
# Documentation
The semver-stable, public-facing `curve25519-dalek` API is documented
[here][docs-external]. In addition, the unstable internal implementation
details are documented [here][docs-internal].
The `curve25519-dalek` documentation requires a custom HTML header to include
KaTeX for math support. Unfortunately `cargo doc` does not currently support
this, but docs can be built using
```sh
make doc
make doc-internal
```
# Use
To import `curve25519-dalek`, add the following to the dependencies section of
your project's `Cargo.toml`:
```toml
curve25519-dalek = "2"
```
The `2.x` series has API almost entirely unchanged from the `1.x` series,
except that:
* an error in the data modeling for the (optional) `serde` feature was
corrected, so that when the `2.x`-series `serde` implementation is used
with `serde-bincode`, the derived serialization matches the usual X/Ed25519
formats;
* the `rand` version was updated.
See `CHANGELOG.md` for more details.
# Backends and Features
The `nightly` feature enables features available only when using a Rust nightly
compiler. In particular, it is required for rendering documentation and for
the SIMD backends.
Curve arithmetic is implemented using one of the following backends:
* a `u32` backend using serial formulas and `u64` products;
* a `u64` backend using serial formulas and `u128` products;
* an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records);
* an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records);
By default the `u64` backend is selected. To select a specific backend, use:
```sh
cargo build --no-default-features --features "std u32_backend"
cargo build --no-default-features --features "std u64_backend"
# Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2
cargo build --no-default-features --features "std simd_backend"
# Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma
cargo build --no-default-features --features "std simd_backend"
```
Crates using `curve25519-dalek` can either select a backend on behalf of their
users, or expose feature flags that control the `curve25519-dalek` backend.
The `std` feature is enabled by default, but it can be disabled for no-`std`
builds using `--no-default-features`. Note that this requires explicitly
selecting an arithmetic backend using one of the `_backend` features.
If no backend is selected, compilation will fail.
# Safety
The `curve25519-dalek` types are designed to make illegal states
unrepresentable. For example, any instance of an `EdwardsPoint` is
guaranteed to hold a point on the Edwards curve, and any instance of a
`RistrettoPoint` is guaranteed to hold a valid point in the Ristretto
group.
All operations are implemented using constant-time logic (no
secret-dependent branches, no secret-dependent memory accesses),
unless specifically marked as being variable-time code.
We believe that our constant-time logic is lowered to constant-time
assembly, at least on `x86_64` targets.
As an additional guard against possible future compiler optimizations,
the `subtle` crate places an optimization barrier before every
conditional move or assignment. More details can be found in [the
documentation for the `subtle` crate][subtle_doc].
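As a concrete illustration of the barrier-protected conditional assignment described above, here is a minimal sketch against the `subtle` 2.x API (example code, not taken from this crate):

```rust
// Branch-free selection: Choice hides the 0/1 condition behind an
// optimization barrier, so the compiler emits a conditional move
// rather than a secret-dependent branch.
use subtle::{Choice, ConditionallySelectable, ConstantTimeEq};

fn ct_pick(a: u64, b: u64, pick_b: bool) -> u64 {
    let choice = Choice::from(pick_b as u8);
    u64::conditional_select(&a, &b, choice)
}

fn main() {
    assert_eq!(ct_pick(1, 2, false), 1);
    assert_eq!(ct_pick(1, 2, true), 2);
    // ct_eq inspects every byte instead of returning at the first mismatch.
    assert!(bool::from(42u64.ct_eq(&42u64)));
}
```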
Some functionality (e.g., multiscalar multiplication or batch
inversion) requires heap allocation for temporary buffers. All
heap-allocated buffers of potentially secret data are explicitly
zeroed before release.
However, we do not attempt to zero stack data, for two reasons.
First, it's not possible to do so correctly: we don't have control
over stack allocations, so there's no way to know how much data to
wipe. Second, because `curve25519-dalek` provides a mid-level API,
the correct place to start zeroing stack data is likely not at the
entrypoints of `curve25519-dalek` functions, but at the entrypoints of
functions in other crates.
The implementation is memory-safe, and contains no significant
`unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD
intrinsics. These are marked `unsafe` only because invoking them on an
inappropriate CPU would cause `SIGILL`, but the entire backend is only
compiled with appropriate `target_feature`s, so this cannot occur.
# Performance
Benchmarks are run using [`criterion.rs`][criterion]:
```sh
cargo bench --no-default-features --features "std u32_backend"
cargo bench --no-default-features --features "std u64_backend"
# Uses avx2 or ifma only if compiled for an appropriate target.
export RUSTFLAGS="-C target_cpu=native"
cargo bench --no-default-features --features "std simd_backend"
```
Performance is a secondary goal behind correctness, safety, and
clarity, but we aim to be competitive with other implementations.
# FFI
Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The
reason is that we use Rust features to provide an API that maintains safety
invariants, which are not possible to maintain across an FFI boundary. For
instance, as described in the _Safety_ section above, invalid points are
impossible to construct, and this would not be the case if we exposed point
operations over FFI.
However, `curve25519-dalek` is designed as a *mid-level* API, aimed at
implementing other, higher-level primitives. Instead of providing FFI at the
mid-level, our suggestion is to implement the higher-level primitive (a
signature, PAKE, ZKP, etc) in Rust, using `curve25519-dalek` as a dependency,
and have that crate provide a minimal, byte-buffer-oriented FFI specific to
that primitive.
# Contributing
Please see [CONTRIBUTING.md][contributing].
Patches and pull requests should be made against the `develop`
branch, **not** `master`.
# About
**SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in
his second full episode, "Into the Dalek". A beleaguered ship of the "Combined
Galactic Resistance" has discovered a broken Dalek that has turned "good",
desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers
are miniaturized and enter the Dalek, which the Doctor names Rusty. They
repair the damage, but accidentally restore it to its original nature, causing
it to go on the rampage and alert the Dalek fleet to the whereabouts of the
rebel ship. However, the Doctor manages to return Rusty to its previous state
by linking his mind with the Dalek's: Rusty shares the Doctor's view of the
universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the
other Daleks and departs the ship, determined to track down and bring an end
to the Dalek race.*
`curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence.
Portions of this library were originally a port of [Adam Langley's
Golang ed25519 library](https://github.com/agl/ed25519), which was in
turn a port of the reference `ref10` implementation. Most of this code,
including the 32-bit field arithmetic, has since been rewritten.
The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and
the addition chain for scalar inversion was provided by Brian Smith. The
optimised batch inversion was contributed by Sean Bowe and Daira Hopwood.
The `no_std` and `zeroize` support was contributed by Tony Arcieri.
Thanks also to Ashley Hauck, Lucas Salibian, and Manish Goregaokar for their
contributions.
[ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek
[x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek
[contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md
[docs-external]: https://doc.dalek.rs/curve25519_dalek/
[docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/
[criterion]: https://github.com/japaric/criterion.rs
[parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html
[subtle_doc]: https://doc.dalek.rs/subtle/

@@ -1,339 +0,0 @@
#![allow(non_snake_case)]
extern crate rand;
use rand::rngs::OsRng;
use rand::thread_rng;
#[macro_use]
extern crate criterion;
use criterion::BatchSize;
use criterion::Criterion;
extern crate curve25519_dalek;
use curve25519_dalek::constants;
use curve25519_dalek::scalar::Scalar;
use curve25519_dalek::field::FieldElement;
static BATCH_SIZES: [usize; 5] = [1, 2, 4, 8, 16];
static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768, 1024];
mod edwards_benches {
use super::*;
use curve25519_dalek::edwards::EdwardsPoint;
fn compress(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_POINT;
c.bench_function("EdwardsPoint compression", move |b| b.iter(|| B.compress()));
}
fn decompress(c: &mut Criterion) {
let B_comp = &constants::ED25519_BASEPOINT_COMPRESSED;
c.bench_function("EdwardsPoint decompression", move |b| {
b.iter(|| B_comp.decompress().unwrap())
});
}
fn consttime_fixed_base_scalar_mul(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_TABLE;
let s = Scalar::from(897987897u64).invert();
c.bench_function("Constant-time fixed-base scalar mul", move |b| {
b.iter(|| B * &s)
});
}
fn consttime_variable_base_scalar_mul(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_POINT;
let s = Scalar::from(897987897u64).invert();
c.bench_function("Constant-time variable-base scalar mul", move |b| {
b.iter(|| B * s)
});
}
fn vartime_double_base_scalar_mul(c: &mut Criterion) {
c.bench_function("Variable-time aA+bB, A variable, B fixed", |bench| {
let mut rng = thread_rng();
let A = &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE;
bench.iter_batched(
|| (Scalar::random(&mut rng), Scalar::random(&mut rng)),
|(a, b)| EdwardsPoint::vartime_double_scalar_mul_basepoint(&a, &A, &b),
BatchSize::SmallInput,
);
});
}
criterion_group! {
name = edwards_benches;
config = Criterion::default();
targets =
compress,
decompress,
consttime_fixed_base_scalar_mul,
consttime_variable_base_scalar_mul,
vartime_double_base_scalar_mul,
}
}
mod multiscalar_benches {
use super::*;
use curve25519_dalek::edwards::EdwardsPoint;
use curve25519_dalek::edwards::VartimeEdwardsPrecomputation;
use curve25519_dalek::traits::MultiscalarMul;
use curve25519_dalek::traits::VartimeMultiscalarMul;
use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul;
fn construct_scalars(n: usize) -> Vec<Scalar> {
let mut rng = thread_rng();
(0..n).map(|_| Scalar::random(&mut rng)).collect()
}
fn construct_points(n: usize) -> Vec<EdwardsPoint> {
let mut rng = thread_rng();
(0..n)
.map(|_| &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE)
.collect()
}
fn construct(n: usize) -> (Vec<Scalar>, Vec<EdwardsPoint>) {
(construct_scalars(n), construct_points(n))
}
fn consttime_multiscalar_mul(c: &mut Criterion) {
c.bench_function_over_inputs(
"Constant-time variable-base multiscalar multiplication",
|b, &&size| {
let points = construct_points(size);
// This is supposed to be constant-time, but we might as well
// rerandomize the scalars for every call just in case.
b.iter_batched(
|| construct_scalars(size),
|scalars| EdwardsPoint::multiscalar_mul(&scalars, &points),
BatchSize::SmallInput,
);
},
&MULTISCALAR_SIZES,
);
}
fn vartime_multiscalar_mul(c: &mut Criterion) {
c.bench_function_over_inputs(
"Variable-time variable-base multiscalar multiplication",
|b, &&size| {
let points = construct_points(size);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels).
b.iter_batched(
|| construct_scalars(size),
|scalars| EdwardsPoint::vartime_multiscalar_mul(&scalars, &points),
BatchSize::SmallInput,
);
},
&MULTISCALAR_SIZES,
);
}
fn vartime_precomputed_pure_static(c: &mut Criterion) {
c.bench_function_over_inputs(
"Variable-time fixed-base multiscalar multiplication",
move |b, &&total_size| {
let static_size = total_size;
let static_points = construct_points(static_size);
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels).
b.iter_batched(
|| construct_scalars(static_size),
|scalars| precomp.vartime_multiscalar_mul(&scalars),
BatchSize::SmallInput,
);
},
&MULTISCALAR_SIZES,
);
}
fn vartime_precomputed_helper(c: &mut Criterion, dynamic_fraction: f64) {
let label = format!(
"Variable-time mixed-base multiscalar multiplication ({:.0}pct dyn)",
100.0 * dynamic_fraction,
);
c.bench_function_over_inputs(
&label,
move |b, &&total_size| {
let dynamic_size = ((total_size as f64) * dynamic_fraction) as usize;
let static_size = total_size - dynamic_size;
let static_points = construct_points(static_size);
let dynamic_points = construct_points(dynamic_size);
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels). Timings
// should be independent of points so we don't
// randomize them.
b.iter_batched(
|| {
(
construct_scalars(static_size),
construct_scalars(dynamic_size),
)
},
|(static_scalars, dynamic_scalars)| {
precomp.vartime_mixed_multiscalar_mul(
&static_scalars,
&dynamic_scalars,
&dynamic_points,
)
},
BatchSize::SmallInput,
);
},
&MULTISCALAR_SIZES,
);
}
fn vartime_precomputed_00_pct_dynamic(c: &mut Criterion) {
vartime_precomputed_helper(c, 0.0);
}
fn vartime_precomputed_20_pct_dynamic(c: &mut Criterion) {
vartime_precomputed_helper(c, 0.2);
}
fn vartime_precomputed_50_pct_dynamic(c: &mut Criterion) {
vartime_precomputed_helper(c, 0.5);
}
criterion_group! {
name = multiscalar_benches;
// Lower the sample size to run the benchmarks faster
config = Criterion::default().sample_size(15);
targets =
consttime_multiscalar_mul,
vartime_multiscalar_mul,
vartime_precomputed_pure_static,
vartime_precomputed_00_pct_dynamic,
vartime_precomputed_20_pct_dynamic,
vartime_precomputed_50_pct_dynamic,
}
}
mod ristretto_benches {
use super::*;
use curve25519_dalek::ristretto::RistrettoPoint;
fn compress(c: &mut Criterion) {
c.bench_function("RistrettoPoint compression", |b| {
let B = &constants::RISTRETTO_BASEPOINT_POINT;
b.iter(|| B.compress())
});
}
fn decompress(c: &mut Criterion) {
c.bench_function("RistrettoPoint decompression", |b| {
let B_comp = &constants::RISTRETTO_BASEPOINT_COMPRESSED;
b.iter(|| B_comp.decompress().unwrap())
});
}
fn elligator(c: &mut Criterion) {
let fe_bytes = [0u8; 32];
let fe = FieldElement::from_bytes(&fe_bytes);
c.bench_function("RistrettoPoint Elligator", |b| {
b.iter(|| RistrettoPoint::elligator_ristretto_flavor(&fe));
});
}
fn double_and_compress_batch(c: &mut Criterion) {
c.bench_function_over_inputs(
"Batch Ristretto double-and-encode",
|b, &&size| {
let mut rng = OsRng;
let points: Vec<RistrettoPoint> = (0..size)
.map(|_| RistrettoPoint::random(&mut rng))
.collect();
b.iter(|| RistrettoPoint::double_and_compress_batch(&points));
},
&BATCH_SIZES,
);
}
criterion_group! {
name = ristretto_benches;
config = Criterion::default();
targets =
compress,
decompress,
elligator,
double_and_compress_batch,
}
}
mod montgomery_benches {
use super::*;
fn montgomery_ladder(c: &mut Criterion) {
c.bench_function("Montgomery pseudomultiplication", |b| {
let B = constants::X25519_BASEPOINT;
let s = Scalar::from(897987897u64).invert();
b.iter(|| B * s);
});
}
criterion_group! {
name = montgomery_benches;
config = Criterion::default();
targets = montgomery_ladder,
}
}
mod scalar_benches {
use super::*;
fn scalar_inversion(c: &mut Criterion) {
c.bench_function("Scalar inversion", |b| {
let s = Scalar::from(897987897u64).invert();
b.iter(|| s.invert());
});
}
fn batch_scalar_inversion(c: &mut Criterion) {
c.bench_function_over_inputs(
"Batch scalar inversion",
|b, &&size| {
let mut rng = OsRng;
let scalars: Vec<Scalar> = (0..size).map(|_| Scalar::random(&mut rng)).collect();
b.iter(|| {
let mut s = scalars.clone();
Scalar::batch_invert(&mut s);
});
},
&BATCH_SIZES,
);
}
criterion_group! {
name = scalar_benches;
config = Criterion::default();
targets =
scalar_inversion,
batch_scalar_inversion,
}
}
criterion_main!(
scalar_benches::scalar_benches,
montgomery_benches::montgomery_benches,
ristretto_benches::ristretto_benches,
edwards_benches::edwards_benches,
multiscalar_benches::multiscalar_benches,
);

Binary file not shown (image, 110 KiB).

Binary file not shown (image, 107 KiB).

File diff suppressed because one or more lines are too long (image, 59 KiB).

@@ -1,10 +0,0 @@
<link rel="stylesheet" href="https://doc.dalek.rs/assets/katex/katex.min.css">
<script src="https://doc.dalek.rs/assets/katex/katex.min.js"></script>
<script src="https://doc.dalek.rs/assets/katex/contrib/auto-render.min.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function() { renderMathInElement(document.body); });
</script>
<style>
.katex { font-size: 1em !important; }
pre.rust, .docblock code, .docblock-short code { font-size: 0.85em !important; }
</style>

@@ -1,140 +0,0 @@
An AVX2 implementation of the vectorized point operation strategy.
# Field element representation
Our strategy is to implement 4-wide multiplication and squaring by
wordslicing, using one 64-bit AVX2 lane for each field element. Field
elements are represented in the usual way as 10 `u32` limbs in radix
\\(25.5\\) (i.e., alternating between \\(2\^{26}\\) for even limbs and
\\(2\^{25}\\) for odd limbs). This has the effect that passing between
the parallel 32-bit AVX2 representation and the serial 64-bit
representation (which uses radix \\(2^{51}\\)) amounts to regrouping
digits.
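Concretely, regrouping is a shift and an add per output limb, since
\\(2\^{26} \cdot 2\^{25} = 2\^{51}\\). A hedged sketch (not the crate's
actual conversion code):

```rust
// Pack ten radix-25.5 limbs (alternating 26- and 25-bit) into five
// radix-2^51 limbs: each output limb is an even limb plus the
// following odd limb shifted up by 26 bits.
fn regroup(limbs: [u32; 10]) -> [u64; 5] {
    let mut out = [0u64; 5];
    for i in 0..5 {
        out[i] = limbs[2 * i] as u64 + ((limbs[2 * i + 1] as u64) << 26);
    }
    out
}
```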
The field element representation is oriented around the AVX2
`vpmuluqdq` instruction, which multiplies the low 32 bits of each
64-bit lane of each operand to produce a 64-bit result.
```text,no_run
(a1 ?? b1 ?? c1 ?? d1 ??)
(a2 ?? b2 ?? c2 ?? d2 ??)
(a1*a2 b1*b2 c1*c2 d1*d2)
```
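In intrinsic form this is `_mm256_mul_epu32`; the sketch below (a standalone example, not the crate's backend code) checks the lanewise low-32-bit multiply on a small input:

```rust
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mul_low32() -> [u64; 4] {
    use core::arch::x86_64::*;
    // _mm256_set_epi64x takes lanes high-to-low, so these vectors are
    // (1, 2, 3, 4) and (10, 20, 30, 40) in low-to-high lane order; the
    // high 32 bits of each lane (the "??" above) are ignored.
    let x = _mm256_set_epi64x(4, 3, 2, 1);
    let y = _mm256_set_epi64x(40, 30, 20, 10);
    let p = _mm256_mul_epu32(x, y); // vpmuluqdq: a1*a2, b1*b2, c1*c2, d1*d2
    let mut out = [0u64; 4];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, p);
    out
}

fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            assert_eq!(unsafe { mul_low32() }, [10, 40, 90, 160]);
        }
    }
}
```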
To unpack 32-bit values into 64-bit lanes for use in multiplication
it would be convenient to use the `vpunpck[lh]dq` instructions,
which unpack and interleave the low and high 32-bit lanes of two
source vectors.
However, the AVX2 versions of these instructions are designed to
operate only within 128-bit lanes of the 256-bit vectors, so that
interleaving the low lanes of `(a0 b0 c0 d0 a1 b1 c1 d1)` with zero
gives `(a0 00 b0 00 a1 00 b1 00)`. Instead, we pre-shuffle the data
layout as `(a0 b0 a1 b1 c0 d0 c1 d1)` so that we can unpack the
"low" and "high" parts as
```text,no_run
(a0 00 b0 00 c0 00 d0 00)
(a1 00 b1 00 c1 00 d1 00)
```
The data layout for a vector of four field elements \\( (a,b,c,d)
\\) with limbs \\( a_0, a_1, \ldots, a_9 \\) is as `[u32x8; 5]` in
the form
```text,no_run
(a0 b0 a1 b1 c0 d0 c1 d1)
(a2 b2 a3 b3 c2 d2 c3 d3)
(a4 b4 a5 b5 c4 d4 c5 d5)
(a6 b6 a7 b7 c6 d6 c7 d7)
(a8 b8 a9 b9 c8 d8 c9 d9)
```
Since this breaks cleanly into two 128-bit lanes, it may be possible
to adapt it to 128-bit vector instructions such as NEON without too
much difficulty.
# Avoiding Overflow in Doubling
To analyze the size of the field element coefficients during the
computations, we can parameterize the bounds on the limbs of each
field element by \\( b \in \mathbb R \\) representing the excess bits
above that limb's radix, so that each limb is bounded by either
\\(2\^{25+b} \\) or \\( 2\^{26+b} \\), as appropriate.
The multiplication routine requires that its inputs are bounded with
\\( b < 1.75 \\), in order to fit a multiplication by \\( 19 \\)
into 32 bits. Since \\( \lg 19 < 4.25 \\), \\( 19x < 2\^{32} \\)
when \\( x < 2\^{27.75} = 2\^{26 + 1.75} \\). However, this is only
required for one of the inputs; the other can grow up to \\( b < 2.5
\\).
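A quick standalone check of this bound (a throwaway snippet, not
crate code):
```rust
fn main() {
    // 19x < 2^32 exactly when x < 2^(32 - lg 19), and since
    // lg 19 < 4.25, any x < 2^27.75 = 2^(26 + 1.75) is safe.
    let x_max = 2f64.powf(26.0 + 1.75);
    assert!(19.0 * x_max < 2f64.powf(32.0));
}
```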
In addition, the multiplication and squaring routines do not
canonically reduce their outputs, but can leave some small uncarried
excesses, so that their reduced outputs are bounded with
\\( b < 0.007 \\).
The non-parallel portion of the doubling formulas is
$$
\begin{aligned}
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
&\gets
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
\end{aligned}
$$
Computing \\( (S\_5, S\_6, S\_8, S\_9 ) \\) as
$$
\begin{matrix}
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
+& S\_2 & & & S\_2 \\\\
+& & & S\_3 & \\\\
+& & & S\_3 & \\\\
+& & 2p & 2p & 2p \\\\
-& & S\_2 & S\_2 & \\\\
-& & & & S\_4 \\\\
=& S\_5 & S\_6 & S\_8 & S\_9
\end{matrix}
$$
results in bit-excesses \\( < (1.01, 1.60, 2.33, 2.01)\\) for
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
are then
$$
\begin{aligned}
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 2.01) \\\\
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 2.01)
\end{aligned}
$$
which are too large: it's not possible to arrange the multiplicands so
that one vector has \\(b < 2.5\\) and the other has \\( b < 1.75 \\).
However, if we flip the sign of \\( S\_4 = S\_0\^2 \\) during
squaring, so that we output \\(S\_4' = -S\_4 \pmod p\\), then we can
compute
$$
\begin{matrix}
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
+& S\_2 & & & S\_2 \\\\
+& & & S\_3 & \\\\
+& & & S\_3 & \\\\
+& & & & S\_4' \\\\
+& & 2p & 2p & \\\\
-& & S\_2 & S\_2 & \\\\
=& S\_5 & S\_6 & S\_8 & S\_9
\end{matrix}
$$
resulting in bit-excesses \\( < (1.01, 1.60, 2.33, 1.60)\\) for
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
are then
$$
\begin{aligned}
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 1.60) \\\\
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 1.60)
\end{aligned}
$$
whose right-hand sides are all bounded with \\( b < 1.75 \\) and
whose left-hand sides are all bounded with \\( b < 2.5 \\),
so that we can avoid any intermediate reductions.


@ -1,580 +0,0 @@
An AVX512-IFMA implementation of the vectorized point operation
strategy.
# IFMA instructions
AVX512-IFMA is an extension to AVX-512 consisting of two instructions:
* `vpmadd52luq`: packed multiply of unsigned 52-bit integers and add
the low 52 product bits to 64-bit accumulators;
* `vpmadd52huq`: packed multiply of unsigned 52-bit integers and add
the high 52 product bits to 64-bit accumulators.
These operate on 64-bit lanes of their source vectors, taking the low
52 bits of each lane of each source vector, computing the 104-bit
products of each pair, and then adding either the high or low 52 bits
of the 104-bit products to the 64-bit lanes of the destination vector.
The multiplication is performed internally by reusing circuitry for
floating-point arithmetic. Although these instructions are part of
AVX512, the AVX512VL (vector length) extension (present whenever IFMA
is) allows using them with 512, 256, or 128-bit operands.
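A scalar model of one lane of each instruction (helper names are
invented here) makes the semantics concrete:
```rust
const LOW_52: u64 = (1 << 52) - 1;

/// One lane of vpmadd52luq: accumulate the low 52 bits of the
/// 104-bit product of the low 52 bits of `a` and `b` onto `z`.
fn madd52lo(z: u64, a: u64, b: u64) -> u64 {
    let m = (a & LOW_52) as u128 * (b & LOW_52) as u128;
    z.wrapping_add((m as u64) & LOW_52)
}

/// One lane of vpmadd52huq: accumulate the high 52 bits of that
/// same 104-bit product onto `z`.
fn madd52hi(z: u64, a: u64, b: u64) -> u64 {
    let m = (a & LOW_52) as u128 * (b & LOW_52) as u128;
    z.wrapping_add((m >> 52) as u64)
}
```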
This provides a major advantage to vectorized integer operations:
previously, vector operations could only use a \\(32 \times 32
\rightarrow 64\\)-bit multiplier, while serial code could use a
\\(64\times 64 \rightarrow 128\\)-bit multiplier.
## IFMA for big-integer multiplications
A detailed example of the intended use of the IFMA instructions can be
found in a 2016 paper by Gueron and Krasnov, [_Accelerating Big
Integer Arithmetic Using Intel IFMA Extensions_][2016_gueron_krasnov].
The basic idea is that multiplication of large integers (such as 1024,
2048, or more bits) can be performed as follows.
First, convert a “packed” 64-bit representation
\\[
\begin{aligned}
x &= x'_0 + x'_1 2^{64} + x'_2 2^{128} + \cdots \\\\
y &= y'_0 + y'_1 2^{64} + y'_2 2^{128} + \cdots
\end{aligned}
\\]
into a “redundant” 52-bit representation
\\[
\begin{aligned}
x &= x_0 + x_1 2^{52} + x_2 2^{104} + \cdots \\\\
y &= y_0 + y_1 2^{52} + y_2 2^{104} + \cdots
\end{aligned}
\\]
with each \\(x_i, y_j\\) in a 64-bit lane.
Writing the product as \\(z = z_0 + z_1 2^{52} + z_2 2^{104} + \cdots\\),
the “schoolbook” multiplication strategy gives
\\[
\begin{aligned}
&z_0 &&=& x_0 & y_0 & & & & & & & & \\\\
&z_1 &&=& x_1 & y_0 &+ x_0 & y_1 & & & & & & \\\\
&z_2 &&=& x_2 & y_0 &+ x_1 & y_1 &+ x_0 & y_2 & & & & \\\\
&z_3 &&=& x_3 & y_0 &+ x_2 & y_1 &+ x_1 & y_2 &+ x_0 & y_3 & & \\\\
&z_4 &&=& \vdots\\;&\\;\vdots &+ x_3 & y_1 &+ x_2 & y_2 &+ x_1 & y_3 &+ \cdots& \\\\
&z_5 &&=& & & \vdots\\;&\\;\vdots &+ x_3 & y_2 &+ x_2 & y_3 &+ \cdots& \\\\
&z_6 &&=& & & & & \vdots\\;&\\;\vdots &+ x_3 & y_3 &+ \cdots& \\\\
&z_7 &&=& & & & & & & \vdots\\;&\\;\vdots &+ \cdots& \\\\
&\vdots&&=& & & & & & & & & \ddots& \\\\
\end{aligned}
\\]
Notice that the product coefficient \\(z_k\\), representing the value
\\(z_k 2^{52k}\\), is the sum of all product terms
\\(
(x_i 2^{52 i}) (y_j 2^{52 j})
\\)
with \\(k = i + j\\).
Write the IFMA operators \\(\mathrm{lo}(a,b)\\), denoting the low
\\(52\\) bits of \\(ab\\), and
\\(\mathrm{hi}(a,b)\\), denoting the high \\(52\\) bits of
\\(ab\\).
Now we can rewrite the product terms as
\\[
\begin{aligned}
(x_i 2^{52 i}) (y_j 2^{52 j})
&=
2^{52 (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}.
\end{aligned}
\\]
This means that the low half of \\(x_i y_j\\) can be accumulated onto
the product limb \\(z_{i+j}\\) and the high half can be directly
accumulated onto the next-higher product limb \\(z_{i+j+1}\\) with no
additional operations. This allows rewriting the schoolbook
multiplication into the form
\\[
\begin{aligned}
&z_0 &&=& \mathrm{lo}(x_0,&y_0) & & & & & & & & & & \\\\
&z_1 &&=& \mathrm{lo}(x_1,&y_0) &+\mathrm{hi}(x_0,&y_0) &+\mathrm{lo}(x_0,&y_1) & & & & & & \\\\
&z_2 &&=& \mathrm{lo}(x_2,&y_0) &+\mathrm{hi}(x_1,&y_0) &+\mathrm{lo}(x_1,&y_1) &+\mathrm{hi}(x_0,&y_1) &+\mathrm{lo}(x_0,&y_2) & & \\\\
&z_3 &&=& \mathrm{lo}(x_3,&y_0) &+\mathrm{hi}(x_2,&y_0) &+\mathrm{lo}(x_2,&y_1) &+\mathrm{hi}(x_1,&y_1) &+\mathrm{lo}(x_1,&y_2) &+ \cdots& \\\\
&z_4 &&=& \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_0) &+\mathrm{lo}(x_3,&y_1) &+\mathrm{hi}(x_2,&y_1) &+\mathrm{lo}(x_2,&y_2) &+ \cdots& \\\\
&z_5 &&=& & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_1) &+\mathrm{lo}(x_3,&y_2) &+ \cdots& \\\\
&z_6 &&=& & & & & & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+ \cdots& \\\\
&\vdots&&=& & & & & & & & & & & \ddots& \\\\
\end{aligned}
\\]
Gueron and Krasnov implement multiplication by constructing vectors
out of the columns of this diagram, so that the source operands for
the IFMA instructions are of the form \\((x_0, x_1, x_2, \ldots)\\)
and \\((y_i, y_i, y_i, \ldots)\\).
After performing the multiplication,
the product terms \\(z_i\\) are then repacked into a 64-bit representation.
## An alternative strategy
The strategy described above is aimed at big-integer multiplications,
such as 1024, 2048, or 4096 bits, which would be used for applications
like RSA. However, elliptic curve cryptography uses much smaller field
sizes, such as 256 or 384 bits, so a different strategy is needed.
The parallel Edwards formulas provide parallelism at the level of the
formulas for curve operations. This means that instead of scanning
through the terms of the source operands and parallelizing *within* a
field element (as described above), we can arrange the computation in
product-scanning form and parallelize *across* field elements (as
described below).
The parallel Edwards
formulas provide 4-way parallelism, so they can be implemented using
256-bit vectors using a single 64-bit lane for each element, or using
512-bit vectors using two 64-bit lanes.
The only available CPU supporting IFMA (the
i3-8121U) executes 512-bit IFMA instructions at half rate compared to
256-bit instructions, so for now there's no throughput advantage to
using 512-bit IFMA instructions, and this implementation uses 256-bit
vectors.
To extend this to 512-bit vectors, it's only necessary to achieve
2-way parallelism, and it's possible (with a small amount of overhead)
to create a hybrid strategy that operates entirely within 128-bit
lanes. This means that cross-lane operations can use the faster
`vpshufd` (1c latency) instead of a general shuffle instruction (3c
latency).
# Choice of radix
The inputs to IFMA instructions are 52 bits wide, so the radix \\(r\\)
used to represent a multiprecision integer must be \\( r \leq 52 \\).
The obvious choice is the "native" radix \\(r = 52\\).
As described above, this choice
has the advantage that for \\(x_i, y_j \in [0,2^{52})\\), the product term
\\[
\begin{aligned}
(x_i 2^{52 i}) (y_j 2^{52 j})
&=
2^{52 (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)},
\end{aligned}
\\]
so that the low and high halves of the product can be directly accumulated
onto the product limbs.
In contrast, when using a smaller radix \\(r = 52 - k\\),
the product term has the form
\\[
\begin{aligned}
(x_i 2^{r i}) (y_j 2^{r j})
&=
2^{r (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{r (i+j)} +
(
\mathrm{hi}(x_i, y_j) 2^k
)
2^{r (i+j+1)}.
\end{aligned}
\\]
What's happening is that the product \\(x_i y_j\\) of size \\(2r\\)
bits is split not at \\(r\\) but at \\(52\\), so \\(k\\) product bits
are placed into the low half instead of the high half. This means
that the high half of the product cannot be directly accumulated onto
\\(z_{i+j+1}\\), but must first be multiplied by \\(2^k\\) (i.e., left
shifted by \\(k\\)). In addition, the low half of the product is
\\(52\\) bits large instead of \\(r\\) bits.
## Handling offset product terms
[Drucker and Gueron][2018_drucker_gueron] analyze the choice of radix
in the context of big-integer squaring, outlining three ways to handle
the offset product terms, before concluding that all of them are
suboptimal:
1. Shift the results after accumulation;
2. Shift the input operands before multiplication;
3. Split the MAC operation, accumulating into a zeroed register,
shifting the result, and then adding.
The first option is rejected because it could double-shift some
previously accumulated terms, the second doesn't work because the
inputs could become larger than \\(52\\) bits, and the third requires
additional instructions to handle the shifting and adding.
Based on an analysis of total number of instructions, they suggest an
addition to the instruction set, which they call `FMSA` (fused
multiply-shift-add). This would shift the result according to an 8-bit
immediate value before accumulating it into the destination register.
However, this change to the instruction set doesn't seem to be
necessary. Instead, the product terms can be grouped according to
their coefficients, accumulated together, then shifted once before
adding them to the final sum. This uses an extra register, shift, and
add, but only once per product term (accumulation target), not once
per source term (as in the Drucker-Gueron paper).
Moreover, because IFMA instructions execute only on two ports
(presumably 0 and 1), while adds and shifts can execute on three ports
(0, 1, and 5), the adds and shifts can execute independently of the
IFMA operations, as long as there is not too much pressure on port 5.
This means that, although the total number of instructions increases,
the shifts and adds do not necessarily increase the execution time, as
long as throughput is limited by IFMA operations.
Finally, because IFMA instructions have 4 cycle latency and 0.5/1
cycle throughput (for 256/512 bit vectors), maximizing IFMA throughput
requires either 8 (for 256) or 4 (for 512) independent operations. So
accumulating groups of terms independently before adding them at the
end may be necessary anyways, in order to prevent long chains of
dependent instructions.
## Advantages of a smaller radix
Using a smaller radix has other advantages. Although radix \\(52\\)
is an unsaturated representation from the point of view of the
\\(64\\)-bit accumulators (because up to 4096 product terms can be
accumulated without carries), it's a saturated representation from the
point of view of the multiplier (since \\(52\\)-bit values are the
maximum input size).
Because the inputs to a multiplication must have all of their limbs
bounded by \\(2^{52}\\), limbs in excess of \\(2^{52}\\) must be
reduced before they can be used as an input. The
[Gueron-Krasnov][2016_gueron_krasnov] paper suggests normalizing
values using a standard, sequential carry chain: for each limb, add
the carry-in from reducing the previous limb, compute the carry-out and
reduce the current limb, then move to the next limb.
However, when using a smaller radix, such as \\(51\\), each limb can
store a carry bit and still be used as the input to a multiplication.
This means that the inputs do not need to be normalized, and instead
of using a sequential carry chain, we can compute all carry-outs in
parallel, reduce all limbs in parallel, and then add the carry-ins in
parallel (possibly growing the limb values by one bit).
Because the output of this partial reduction is an acceptable
multiplication input, we can "close the loop" using partial reductions
and never have to normalize to a canonical representation through the
entire computation, in contrast to the Gueron-Krasnov approach, which
converts back to a packed representation after every operation. (This
idea seems to trace back to at least as early as [this 1999
paper][1999_walter]).
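A scalar sketch of this parallel partial reduction for radix-\\(51\\)
limbs (illustrative only; the real code performs the same steps
lane-wise on vectors):
```rust
const MASK_51: u64 = (1 << 51) - 1;

/// Compute every carry-out up front, mask every limb, then add the
/// carry-ins; the top carry wraps around multiplied by 19, since
/// 2^255 = 19 (mod p).
fn partial_reduce(mut z: [u64; 5]) -> [u64; 5] {
    let c = [z[0] >> 51, z[1] >> 51, z[2] >> 51, z[3] >> 51, z[4] >> 51];
    for limb in z.iter_mut() {
        *limb &= MASK_51;
    }
    z[0] += 19 * c[4];
    z[1] += c[0];
    z[2] += c[1];
    z[3] += c[2];
    z[4] += c[3];
    z // limbs may have grown by a bit, but remain valid multiply inputs
}
```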
Using \\(r = 51\\) is enough to keep a carry bit in each limb and
avoid normalizations. What about an even smaller radix? One reason
to choose a smaller radix would be to align the limb boundaries with
an inline reduction (for instance, choosing \\(r = 43\\) for the
Mersenne field \\(p = 2^{127} - 1\\)), but for \\(p = 2^{255} - 19\\),
\\(r = 51 = 255/5\\) is the natural choice.
# Multiplication
The inputs to a multiplication are two field elements
\\[
\begin{aligned}
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} \\\\
y &= y_0 + y_1 2^{51} + y_2 2^{102} + y_3 2^{153} + y_4 2^{204},
\end{aligned}
\\]
with limbs in range \\([0,2^{52})\\).
Writing the product terms as
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
\end{aligned}
\\]
a schoolbook multiplication in product scanning form takes the form
\\[
\begin{aligned}
z_0 &= x_0 y_0 \\\\
z_1 &= x_1 y_0 + x_0 y_1 \\\\
z_2 &= x_2 y_0 + x_1 y_1 + x_0 y_2 \\\\
z_3 &= x_3 y_0 + x_2 y_1 + x_1 y_2 + x_0 y_3 \\\\
z_4 &= x_4 y_0 + x_3 y_1 + x_2 y_2 + x_1 y_3 + x_0 y_4 \\\\
z_5 &= x_4 y_1 + x_3 y_2 + x_2 y_3 + x_1 y_4 \\\\
z_6 &= x_4 y_2 + x_3 y_3 + x_2 y_4 \\\\
z_7 &= x_4 y_3 + x_3 y_4 \\\\
z_8 &= x_4 y_4 \\\\
z_9 &= 0 \\\\
\end{aligned}
\\]
Each term \\(x_i y_j\\) can be written in terms of IFMA operations as
\\[
x_i y_j = \mathrm{lo}(x_i,y_j) + 2\mathrm{hi}(x_i,y_j)2^{51}.
\\]
Substituting this equation into the schoolbook multiplication, then
moving terms to eliminate the \\(2^{51}\\) factors gives
\\[
\begin{aligned}
z_0 &= \mathrm{lo}(x_0, y_0) \\\\
&+ \qquad 0 \\\\
z_1 &= \mathrm{lo}(x_1, y_0) + \mathrm{lo}(x_0, y_1) \\\\
&+ \qquad 2( \mathrm{hi}(x_0, y_0) )\\\\
z_2 &= \mathrm{lo}(x_2, y_0) + \mathrm{lo}(x_1, y_1) + \mathrm{lo}(x_0, y_2) \\\\
&+ \qquad 2( \mathrm{hi}(x_1, y_0) + \mathrm{hi}(x_0, y_1) )\\\\
z_3 &= \mathrm{lo}(x_3, y_0) + \mathrm{lo}(x_2, y_1) + \mathrm{lo}(x_1, y_2) + \mathrm{lo}(x_0, y_3) \\\\
&+ \qquad 2( \mathrm{hi}(x_2, y_0) + \mathrm{hi}(x_1, y_1) + \mathrm{hi}(x_0, y_2) )\\\\
z_4 &= \mathrm{lo}(x_4, y_0) + \mathrm{lo}(x_3, y_1) + \mathrm{lo}(x_2, y_2) + \mathrm{lo}(x_1, y_3) + \mathrm{lo}(x_0, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_3, y_0) + \mathrm{hi}(x_2, y_1) + \mathrm{hi}(x_1, y_2) + \mathrm{hi}(x_0, y_3) )\\\\
z_5 &= \mathrm{lo}(x_4, y_1) + \mathrm{lo}(x_3, y_2) + \mathrm{lo}(x_2, y_3) + \mathrm{lo}(x_1, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_0) + \mathrm{hi}(x_3, y_1) + \mathrm{hi}(x_2, y_2) + \mathrm{hi}(x_1, y_3) + \mathrm{hi}(x_0, y_4) )\\\\
z_6 &= \mathrm{lo}(x_4, y_2) + \mathrm{lo}(x_3, y_3) + \mathrm{lo}(x_2, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_1) + \mathrm{hi}(x_3, y_2) + \mathrm{hi}(x_2, y_3) + \mathrm{hi}(x_1, y_4) )\\\\
z_7 &= \mathrm{lo}(x_4, y_3) + \mathrm{lo}(x_3, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_2) + \mathrm{hi}(x_3, y_3) + \mathrm{hi}(x_2, y_4) )\\\\
z_8 &= \mathrm{lo}(x_4, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_3) + \mathrm{hi}(x_3, y_4) )\\\\
z_9 &= 0 \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_4) )\\\\
\end{aligned}
\\]
As noted above, our strategy will be to multiply and accumulate the
terms with coefficient \\(2\\) separately from those with coefficient
\\(1\\), before combining them at the end. This can alternately be
thought of as accumulating product terms into a *doubly-redundant*
representation, with two limbs for each digit, before collapsing
the doubly-redundant representation by shifts and adds.
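In scalar form (a sequential stand-in for the vectorized chains,
with invented helpers for the two product halves), the strategy
looks like:
```rust
fn lo52(a: u64, b: u64) -> u64 {
    ((a as u128 * b as u128) & ((1 << 52) - 1)) as u64
}
fn hi52(a: u64, b: u64) -> u64 {
    ((a as u128 * b as u128) >> 52) as u64
}

/// Accumulate the coefficient-1 (lo) and coefficient-2 (hi) sums
/// separately, then collapse the doubly-redundant representation
/// with one shift-and-add per product limb.
fn mul_terms(x: [u64; 5], y: [u64; 5]) -> [u64; 10] {
    let mut ones = [0u64; 10];
    let mut twos = [0u64; 10];
    for i in 0..5 {
        for j in 0..5 {
            ones[i + j] += lo52(x[i], y[j]); // z_{i+j} += lo(x_i, y_j)
            twos[i + j + 1] += hi52(x[i], y[j]); // 2 hi(x_i, y_j) term
        }
    }
    let mut z = [0u64; 10];
    for k in 0..10 {
        z[k] = ones[k] + (twos[k] << 1);
    }
    z
}
```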
This computation requires 25 `vpmadd52luq` and 25 `vpmadd52huq`
operations. For 256-bit vectors, IFMA operations execute on an
i3-8121U with latency 4 cycles, throughput 0.5 cycles, so executing 50
instructions requires 25 cycles' worth of throughput. Accumulating
terms with coefficient \\(1\\) and \\(2\\) separately means that the
longest dependency chain has length 5, so the critical path has length
20 cycles and the bottleneck is throughput.
# Reduction modulo \\(p\\)
The next question is how to handle the reduction modulo \\(p\\).
Because \\(p = 2^{255} - 19\\), \\(2^{255} = 19 \pmod p\\), so we can
alternately write
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}
\end{aligned}
\\]
as
\\[
\begin{aligned}
z &= (z_0 + 19z_5) + (z_1 + 19z_6) 2^{51} + (z_2 + 19z_7) 2^{102} + (z_3 + 19z_8) 2^{153} + (z_4 + 19z_9) 2^{204}.
\end{aligned}
\\]
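In scalar form this folding is just the following sketch (wide
arithmetic used to sidestep overflow; the real code performs it with
IFMA operations, as described next):
```rust
/// Fold the high product limbs back onto the low ones with a factor
/// of 19, per the identity above.
fn fold19(z: [u64; 10]) -> [u128; 5] {
    let mut r = [0u128; 5];
    for i in 0..5 {
        r[i] = z[i] as u128 + 19 * z[5 + i] as u128;
    }
    r
}
```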
When using a \\(64 \times 64 \rightarrow 128\\)-bit multiplier, this
can be handled (as in [Ed25519][ed25519_paper]) by premultiplying
source terms by \\(19\\). Since \\(\lg(19) < 4.25\\), this increases
their size by less than \\(4.25\\) bits, and the rest of the
multiplication can be shown to work out.
Here, we have at most \\(1\\) bit of headroom. In order to allow
premultiplication, we would need to use radix \\(2^{47}\\), which
would require six limbs instead of five. Instead, we compute the high
terms \\(z_5, \ldots, z_9\\), each using two chains of IFMA
operations, then multiply by \\(19\\) and combine with the lower terms
\\(z_0, \ldots, z_4\\). There are two ways to perform the
multiplication by \\(19\\): using more IFMA operations, or using the
`vpmullq` instruction, which computes the low \\(64\\) bits of a \\(64
\times 64\\)-bit product. However, `vpmullq` has 15c/1.5c
latency/throughput, in contrast to the 4c/0.5c latency/throughput of
IFMA operations, so it seems like a worse choice.
The high terms \\(z_5, \ldots, z_9\\) are sums of \\(52\\)-bit terms,
so they are larger than \\(52\\) bits. Write these terms in radix \\(52\\) as
\\[
z_{5+i} = z_{5+i}' + z_{5+i}'' 2^{52}, \qquad z_{5+i}' < 2^{52}.
\\]
Then the contribution of \\(z_{5+i}\\), taken modulo \\(p\\), is
\\[
\begin{aligned}
z_{5+i} 2^{255} 2^{51 i}
&=
19 (z_{5+i}' + z_{5+i}'' 2^{52}) 2^{51 i}
\\\\
&=
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
\\\\
\end{aligned}
\\]
The products \\(19 z_{5+i}', 19 z_{5+i}''\\) can be written in terms of IFMA operations as
\\[
\begin{aligned}
19 z_{5+i}' &= \mathrm{lo}(19, z_{5+i}') + 2 \mathrm{hi}(19, z_{5+i}') 2^{51}, \\\\
19 z_{5+i}'' &= \mathrm{lo}(19, z_{5+i}'') + 2 \mathrm{hi}(19, z_{5+i}'') 2^{51}. \\\\
\end{aligned}
\\]
Because \\(z_{5+i} < 2^{64}\\), \\(z_{5+i}'' < 2^{12} \\), so \\(19
z_{5+i}'' < 2^{17} < 2^{52} \\) and \\(\mathrm{hi}(19, z_{5+i}'') = 0\\).
Because IFMA operations ignore the high bits of their source
operands, we do not need to compute \\(z\_{5+i}'\\) explicitly:
the high bits will be ignored.
Combining these observations, we can write
\\[
\begin{aligned}
z_{5+i} 2^{255} 2^{51 i}
&=
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
\\\\
&=
\mathrm{lo}(19, z_{5+i}) 2^{51 i}
\+ 2 \mathrm{hi}(19, z_{5+i}) 2^{51 (i+1)}
\+ 2 \mathrm{lo}(19, z_{5+i}/2^{52}) 2^{51 (i+1)}.
\end{aligned}
\\]
For \\(i = 0,1,2,3\\), this allows reducing \\(z_{5+i}\\) onto
\\(z_{i}, z_{i+1}\\), and if the low terms are computed using a
doubly-redundant representation, no additional shifts are needed to
handle the \\(2\\) coefficients. For \\(i = 4\\), there's a
complication: the contribution becomes
\\[
\begin{aligned}
z_{9} 2^{255} 2^{204}
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2 \mathrm{hi}(19, z_{9}) 2^{255}
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 2^{255}
\\\\
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2 \mathrm{hi}(19, z_{9}) 19
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 19
\\\\
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2
\mathrm{lo}(19, \mathrm{hi}(19, z_{9}) + \mathrm{lo}(19, z_{9}/2^{52})).
\\\\
\end{aligned}
\\]
It would be possible to cut the number of multiplications from 3 to 2
by carrying the high part of each \\(z_i\\) onto \\(z_{i+1}\\). This
would eliminate 5 multiplications, clearing 2.5 cycles of port
pressure, at the cost of 5 additions, adding 1.66 cycles of port
pressure. But doing this would create a dependency between terms
(e.g., \\(z_{5}\\) must be computed before the reduction of
\\(z_{6}\\) can begin), whereas with the approach above, all
contributions to all terms are computed independently, to maximize ILP
and flexibility for the processor to schedule instructions.
This strategy performs 16 IFMA operations, adding two IFMA operations
to each of the \\(2\\)-coefficient terms and one to each of the
\\(1\\)-coefficient terms. Considering the multiplication and
reduction together, we use 66 IFMA operations, requiring 33 cycles'
throughput, while the longest chain of IFMA operations is in the
reduction of \\(z_5\\) onto \\(z_1\\), of length 7 (so 28 cycles, plus
2 cycles to combine the two parts of \\(z_5\\)), and the bottleneck is
again throughput.
Once this is done, we have computed the product terms
\\[
z = z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204},
\\]
without reducing the \\(z_i\\) to fit in \\(52\\) bits. Because the
overall flow of operations alternates multiplications and additions or
subtractions, we would have to perform a reduction after an addition
but before the next multiplication anyways, so there's no benefit to
fully reducing the limbs at the end of a multiplication. Instead, we
leave them unreduced, and track the reduction state using the type
system to ensure that unreduced limbs are not accidentally used as an
input to a multiplication.
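A sketch of that type-level tracking (struct and method names are
invented for illustration; the crate's actual types differ):
```rust
/// Limbs known to be small enough to feed a multiplication.
struct Reduced([u64; 5]);
/// Limbs that may exceed 52 bits and must be reduced first.
struct Unreduced([u64; 5]);

impl Unreduced {
    /// The only way back to a multiplication input.
    fn reduce(self) -> Reduced {
        // (partial carry propagation elided in this sketch)
        Reduced(self.0)
    }
}

impl Reduced {
    /// Multiplication consumes only `Reduced` values and yields an
    /// `Unreduced` result, so a forgotten reduction is a type error.
    fn mul(&self, other: &Reduced) -> Unreduced {
        // placeholder limb-wise product, not real field arithmetic
        let mut z = [0u64; 5];
        for i in 0..5 {
            z[i] = self.0[i].wrapping_mul(other.0[i]);
        }
        Unreduced(z)
    }
}
```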
# Squaring
Squaring operates similarly to multiplication, but with the
possibility to combine identical terms.
As before, we write the input as
\\[
\begin{aligned}
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204}
\end{aligned}
\\]
with limbs in range \\([0,2^{52})\\).
Writing the product terms as
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
\end{aligned}
\\]
a schoolbook squaring in product scanning form takes the form
\\[
\begin{aligned}
z_0 &= x_0 x_0 \\\\
z_1 &= 2 x_1 x_0 \\\\
z_2 &= 2 x_2 x_0 + x_1 x_1 \\\\
z_3 &= 2 x_3 x_0 + 2 x_2 x_1 \\\\
z_4 &= 2 x_4 x_0 + 2 x_3 x_1 + x_2 x_2 \\\\
z_5 &= 2 x_4 x_1 + 2 x_3 x_2 \\\\
z_6 &= 2 x_4 x_2 + x_3 x_3 \\\\
z_7 &= 2 x_4 x_3 \\\\
z_8 &= x_4 x_4 \\\\
z_9 &= 0 \\\\
\end{aligned}
\\]
As before, we write \\(x_i x_j\\) as
\\[
x_i x_j = \mathrm{lo}(x_i,x_j) + 2\mathrm{hi}(x_i,x_j)2^{51},
\\]
and substitute to obtain
\\[
\begin{aligned}
z_0 &= \mathrm{lo}(x_0, x_0) + 0 \\\\
z_1 &= 2 \mathrm{lo}(x_1, x_0) + 2 \mathrm{hi}(x_0, x_0) \\\\
z_2 &= 2 \mathrm{lo}(x_2, x_0) + \mathrm{lo}(x_1, x_1) + 4 \mathrm{hi}(x_1, x_0) \\\\
z_3 &= 2 \mathrm{lo}(x_3, x_0) + 2 \mathrm{lo}(x_2, x_1) + 4 \mathrm{hi}(x_2, x_0) + 2 \mathrm{hi}(x_1, x_1) \\\\
z_4 &= 2 \mathrm{lo}(x_4, x_0) + 2 \mathrm{lo}(x_3, x_1) + \mathrm{lo}(x_2, x_2) + 4 \mathrm{hi}(x_3, x_0) + 4 \mathrm{hi}(x_2, x_1) \\\\
z_5 &= 2 \mathrm{lo}(x_4, x_1) + 2 \mathrm{lo}(x_3, x_2) + 4 \mathrm{hi}(x_4, x_0) + 4 \mathrm{hi}(x_3, x_1) + 2 \mathrm{hi}(x_2, x_2) \\\\
z_6 &= 2 \mathrm{lo}(x_4, x_2) + \mathrm{lo}(x_3, x_3) + 4 \mathrm{hi}(x_4, x_1) + 4 \mathrm{hi}(x_3, x_2) \\\\
z_7 &= 2 \mathrm{lo}(x_4, x_3) + 4 \mathrm{hi}(x_4, x_2) + 2 \mathrm{hi}(x_3, x_3) \\\\
z_8 &= \mathrm{lo}(x_4, x_4) + 4 \mathrm{hi}(x_4, x_3) \\\\
z_9 &= 0 + 2 \mathrm{hi}(x_4, x_4) \\\\
\end{aligned}
\\]
To implement these, we group terms by their coefficient, computing
those with coefficient \\(2\\) on one set of IFMA chains, and on another
set of chains, we begin with coefficient-\\(4\\) terms, then shift
left before continuing with the coefficient-\\(1\\) terms.
The reduction strategy is the same as for multiplication.
# Future improvements
LLVM won't yet select blend operations on [256-bit
vectors][llvm_blend], so there are a number of places where cheaper
blend instructions could be used but currently aren't.
Although the multiplications and squarings are much faster, there's no
speedup to the additions and subtractions, so there are diminishing
returns. In fact, the complications in the doubling formulas mean
that doubling is actually slower than readdition. This also suggests
that moving to 512-bit vectors won't be much help for a strategy aimed
at parallelism within a group operation, so to extract performance
gains from 512-bit vectors it will probably be necessary to create a
parallel-friendly multiscalar multiplication algorithm. This could
also help with reducing shuffle pressure.
The squaring implementation could probably be optimized, but without
`perf` support on Cannonlake it's difficult to make actual
measurements.
Another improvement would be to implement vectorized square root
computations, which would allow creating an iterator adaptor for point
decompression that bunched decompression operations and executed them
in parallel. This would accelerate batch verification.
[2016_gueron_krasnov]: https://ieeexplore.ieee.org/document/7563269
[2018_drucker_gueron]: https://eprint.iacr.org/2018/335
[1999_walter]: https://pdfs.semanticscholar.org/0e6a/3e8f30b63b556679f5dff2cbfdfe9523f4fa.pdf
[ed25519_paper]: https://ed25519.cr.yp.to/ed25519-20110926.pdf
[llvm_blend]: https://bugs.llvm.org/show_bug.cgi?id=38343


@ -1,333 +0,0 @@
Vectorized implementations of field and point operations, using a
modification of the 4-way parallel formulas of Hisil, Wong, Carter,
and Dawson.
These notes explain the parallel formulas and our strategy for using
them with SIMD operations. There are two backend implementations: one
using AVX2, and the other using AVX512-IFMA.
# Overview
The 2008 paper [_Twisted Edwards Curves Revisited_][hwcd08] by Hisil,
Wong, Carter, and Dawson (HWCD) introduced the “extended coordinates”
and mixed-model representations which are used by most Edwards curve
implementations.
However, they also describe 4-way parallel formulas for point addition
and doubling: a unified addition algorithm taking an effective
\\(2\mathbf M + 1\mathbf D\\), a doubling algorithm taking an
effective \\(1\mathbf M + 1\mathbf S\\), and a dedicated (i.e., for
distinct points) addition algorithm taking an effective \\(2 \mathbf M
\\). They compare these formulas with a 2-way parallel variant of the
Montgomery ladder.
Unlike their serial formulas, which are used widely, their parallel
formulas do not seem to have been implemented in software before. The
2-way parallel Montgomery ladder was used in 2015 by Tung Chou's
`sandy2x` implementation. Curiously, however, although the [`sandy2x`
paper][sandy2x] also implements Edwards arithmetic, and cites HWCD08,
it doesn't mention their parallel Edwards formulas.
A 2015 paper by Hernández and López describes an AVX2 implementation
of X25519. Neither the paper nor the code are publicly available, but
it apparently gives only a [slight speedup][avx2trac], suggesting that
it uses a 4-way parallel Montgomery ladder rather than parallel
Edwards formulas.
The reason may be that HWCD08 describe their formulas as operating on
four independent processors, which would make a software
implementation impractical: all of the operations are too low-latency
to effectively synchronize. But a closer inspection reveals that the
(more expensive) multiplication and squaring steps are uniform, while
the instruction divergence occurs in the (much cheaper) addition and
subtraction steps. This means that a SIMD implementation can perform
the expensive steps uniformly, and handle divergence in the
inexpensive steps using masking.
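For instance, a divergent negation can be applied with a mask
instead of a branch (AVX2 assumed; toy integer lanes rather than
field elements):
```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Negate only the 64-bit lanes selected by `mask` (all-ones lanes),
/// leaving the rest untouched: no per-lane branching required.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn negate_masked(v: __m256i, mask: __m256i) -> __m256i {
    let negated = _mm256_sub_epi64(_mm256_setzero_si256(), v);
    _mm256_blendv_epi8(v, negated, mask)
}
```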
These notes describe modifications to the original parallel formulas
to allow a SIMD implementation, and this module contains
implementations of the modified formulas targeting either AVX2 or
AVX512-IFMA.
# Parallel formulas in HWCD'08
The doubling formula is presented in the HWCD paper as follows:
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
| | idle | idle | idle | \\( R\_1 \gets X\_1 + Y\_1 \\) |
| \\(1\mathbf S\\) | \\( R\_2 \gets X\_1\^2 \\) | \\( R\_3 \gets Y\_1\^2 \\) | \\( R\_4 \gets Z\_1\^2 \\) | \\( R\_5 \gets R\_1\^2 \\) |
| | \\( R\_6 \gets R\_2 + R\_3 \\) | \\( R\_7 \gets R\_2 - R\_3 \\) | \\( R\_4 \gets 2 R\_4 \\) | idle |
| | idle | \\( R\_1 \gets R\_4 + R\_7 \\) | idle | \\( R\_2 \gets R\_6 - R\_5 \\) |
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_6 R\_7 \\) | \\( T\_3 \gets R\_2 R\_6 \\) | \\( Z\_3 \gets R\_1 R\_7 \\) |
and the unified addition algorithm is presented as follows:
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
| | \\( R\_1 \gets Y\_1 - X\_1 \\) | \\( R\_2 \gets Y\_2 - X\_2 \\) | \\( R\_3 \gets Y\_1 + X\_1 \\) | \\( R\_4 \gets Y\_2 + X\_2 \\) |
| \\(1\mathbf M\\) | \\( R\_5 \gets R\_1 R\_2 \\) | \\( R\_6 \gets R\_3 R\_4 \\) | \\( R\_7 \gets T\_1 T\_2 \\) | \\( R\_8 \gets Z\_1 Z\_2 \\) |
| \\(1\mathbf D\\) | idle | idle | \\( R\_7 \gets k R\_7 \\) | \\( R\_8 \gets 2 R\_8 \\) |
| | \\( R\_1 \gets R\_6 - R\_5 \\) | \\( R\_2 \gets R\_8 - R\_7 \\) | \\( R\_3 \gets R\_8 + R\_7 \\) | \\( R\_4 \gets R\_6 + R\_5 \\) |
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_3 R\_4 \\) | \\( T\_3 \gets R\_1 R\_4 \\) | \\( Z\_3 \gets R\_2 R\_3 \\) |
Here \\(\mathbf M\\) and \\(\mathbf S\\) represent the cost of
multiplication and squaring of generic field elements, \\(\mathbf D\\)
represents the cost of multiplication by a curve constant (in this
case \\( k = 2d \\)).
Notice that the \\(1\mathbf M\\) and \\(1\mathbf S\\) steps are
uniform. The non-uniform steps are all inexpensive additions or
subtractions, with the exception of the multiplication by the curve
constant \\(k = 2d\\):
$$
R\_7 \gets 2 d R\_7.
$$
HWCD suggest parallelising this step by breaking \\(k = 2d\\) into four
parts as \\(k = k_0 + 2\^n k_1 + 2\^{2n} k_2 + 2\^{3n} k_3 \\) and
computing \\(k_i R_7 \\) in parallel. This is quite awkward, but if
the curve constant is a ratio \\( d = d\_1/d\_2 \\), then projective
coordinates allow us to instead compute
$$
(R\_5, R\_6, R\_7, R\_8) \gets (d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, d\_2 R\_8).
$$
This can be performed as a uniform multiplication by a vector of
constants, and if \\(d\_1, d\_2\\) are small, it is relatively
inexpensive. (This trick was suggested by Mike Hamburg).
In the Curve25519 case, we have
$$
d = \frac{d\_1}{d\_2} = \frac{-121665}{121666};
$$
Since \\(2 \cdot 121666 < 2\^{18}\\), all the constants above fit (up
to sign) in 32 bits, so this can be done in parallel as four
multiplications by small constants \\( (121666, 121666, 2\cdot 121665,
2\cdot 121666) \\), followed by a negation to compute \\( - 2\cdot 121665\\).
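A throwaway check of the size claim (standalone, not crate code):
```rust
fn main() {
    // All four small constants fit (up to sign) in under 18 bits.
    for c in [121666i64, 121666, 2 * 121665, 2 * 121666] {
        assert!(c.unsigned_abs() < 1 << 18); // 2 * 121666 = 243332
    }
}
```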
# Modified parallel formulas
Using the modifications sketched above, we can write SIMD-friendly
versions of the parallel formulas as follows. To avoid confusion with
the original formulas, temporary variables are named \\(S\\) instead
of \\(R\\) and are in static single-assignment form.
## Addition
To add points
\\(P_1 = (X_1 : Y_1 : Z_1 : T_1) \\)
and
\\(P_2 = (X_2 : Y_2 : Z_2 : T_2 ) \\),
we compute
$$
\begin{aligned}
(S\_0 &&,&& S\_1 &&,&& S\_2 &&,&& S\_3 )
&\gets
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Y\_2 - X\_2&&,&& Y\_2 + X\_2)
\\\\
(S\_4 &&,&& S\_5 &&,&& S\_6 &&,&& S\_7 )
&\gets
(S\_0 \cdot S\_2&&,&& S\_1 \cdot S\_3&&,&& Z\_1 \cdot Z\_2&&,&& T\_1 \cdot T\_2)
\\\\
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
&\gets
(d\_2 \cdot S\_4 &&,&& d\_2 \cdot S\_5 &&,&& 2 d\_2 \cdot S\_6 &&,&& 2 d\_1 \cdot S\_7 )
\\\\
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
&\gets
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
\\\\
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
&\gets
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
\end{aligned}
$$
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = P\_1 + P\_2 \\).
This costs \\( 2\mathbf M + 1 \mathbf D\\).
## Readdition
If the point \\( P_2 = (X\_2 : Y\_2 : Z\_2 : T\_2) \\) is fixed, we
can cache the multiplication of the curve constants by computing
$$
\begin{aligned}
(S\_2' &&,&& S\_3' &&,&& Z\_2' &&,&& T\_2' )
&\gets
(d\_2 \cdot (Y\_2 - X\_2)&&,&& d\_2 \cdot (Y\_2 + X\_2)&&,&& 2d\_2 \cdot Z\_2 &&,&& 2d\_1 \cdot T\_2).
\end{aligned}
$$
This costs \\( 1\mathbf D\\); with \\( (S\_2', S\_3', Z\_2', T\_2')\\)
in hand, the addition formulas above become
$$
\begin{aligned}
(S\_0 &&,&& S\_1 &&,&& Z\_1 &&,&& T\_1 )
&\gets
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Z\_1 &&,&& T\_1)
\\\\
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
&\gets
(S\_0 \cdot S\_2' &&,&& S\_1 \cdot S\_3'&&,&& Z\_1 \cdot Z\_2' &&,&& T\_1 \cdot T\_2')
\\\\
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
&\gets
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
\\\\
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
&\gets
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
\end{aligned}
$$
which costs only \\( 2\mathbf M \\). This precomputation is
essentially similar to the precomputation that HWCD suggest for their
serial formulas. Because the cost of precomputation and then
readdition is the same as addition, it's sufficient to only
implement caching and readdition.
## Doubling
The non-uniform portions of the (re)addition formulas have a fairly
regular structure. Unfortunately, this is not the case for the
doubling formulas, which are much less nice.
To double a point \\( P = (X\_1 : Y\_1 : Z\_1 : T\_1) \\), we compute
$$
\begin{aligned}
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& S\_0)
&\gets
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& X\_1 + Y\_1)
\\\\
(S\_1 &&,&& S\_2 &&,&& S\_3 &&,&& S\_4 )
&\gets
(X\_1\^2 &&,&& Y\_1\^2&&,&& Z\_1\^2 &&,&& S\_0\^2)
\\\\
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
&\gets
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
\\\\
(X\_3 &&,&& Y\_3 &&,&& Z\_3 &&,&& T\_3 )
&\gets
(S\_8 \cdot S\_9 &&,&& S\_5 \cdot S\_6 &&,&& S\_8 \cdot S\_6 &&,&& S\_5 \cdot S\_9)
\end{aligned}
$$
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = [2]P\_1 \\).
The intermediate step between the squaring and multiplication requires
a long chain of additions. For the IFMA-based implementation, this is
not a problem; for the AVX2-based implementation, it is, but with some
care and finesse, it's possible to arrange the computation without
requiring an intermediate reduction.
# Implementation
These formulas aren't specific to a particular representation of field
element vectors, whose optimum choice is determined by the details of
the instruction set. However, it's not possible to perfectly separate
the implementation of the field element vectors from the
implementation of the point operations. Instead, the [`avx2`] and
[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and
the [`scalar_mul`] code selects one of the backend types via a type alias.
# Comparison to non-vectorized formulas
In theory, the parallel Edwards formulas seem to allow a \\(4\\)-way
speedup from parallelism. However, an actual vectorized
implementation has several slowdowns that cut into this speedup.
First, the parallel formulas can only use the available vector
multiplier. For AVX2, this is a \\( 32 \times 32 \rightarrow 64
\\)-bit integer multiplier, so the speedup from vectorization must
overcome the disadvantage of losing the \\( 64 \times 64 \rightarrow
128\\)-bit (serial) integer multiplier. The effect of this slowdown
is microarchitecture-dependent, since it requires accounting for the
total number of multiplications and additions and their relative
costs. IFMA allows using a \\( 52 \times 52 \rightarrow 104 \\)-bit
multiplier, but the high and low halves need to be computed
separately, and the reduction requires extra work because it's not
possible to pre-multiply by \\(19\\).
Second, the parallel doubling formulas incur both a theoretical and
practical slowdown. The parallel formulas described above work on the
\\( \mathbb P\^3 \\) “extended” coordinates. The \\( \mathbb P\^2 \\)
model introduced earlier by [Bernstein, Birkner, Joye, Lange, and
Peters][bbjlp08] allows slightly faster doublings, so HWCD suggest
mixing coordinate systems while performing scalar multiplication
(attributing the idea to [a 1998 paper][cmo98] by Cohen, Miyaji, and
Ono). The \\( T \\) coordinate is not required for doublings, so when
doublings are followed by doublings, its computation can be skipped.
More details on this approach and the different coordinate systems can
be found in the [`curve_models` module documentation][curve_models].
Unfortunately, this optimization is not compatible with the parallel
formulas, which cannot save time by skipping a single variable, so the
parallel doubling formulas do slightly more work when counting the
total number of field multiplications and squarings.
In addition, the parallel doubling formulas have a less regular
pattern of additions and subtractions than the parallel addition
formulas, so the vectorization overhead is proportionately greater.
Both the parallel addition and parallel doubling formulas also require
some shuffling to rearrange data within the vectors, which places more
pressure on the shuffle unit than is desirable.
This means that the speedup from using a vectorized implementation of
parallel Edwards formulas is likely to be greatest in applications
that do fewer doublings and more additions (like a large multiscalar
multiplication) rather than applications that do fewer additions and
more doublings (like a double-base scalar multiplication).
Third, Amdahl's law says that the speedup is limited to the portion
which can be parallelized. Normally, the field multiplications
dominate the cost of point operations, but with the IFMA backend, the
multiplications are so fast that the non-parallel additions end up as
a significant portion of the total time.
Fourth, current Intel CPUs perform thermal throttling when using wide
vector instructions. A detailed description can be found in §15.26 of
[the Intel Optimization Manual][intel], but using wide vector
instructions prevents the core from operating at higher frequencies.
The core can return to the higher-frequency state after 2
milliseconds, but this timer is reset every time high-power
instructions are used.
Any speedup from vectorization therefore has to be weighed against a
slowdown for the next few million instructions. For a mixed workload,
where point operations are interspersed with other tasks, this can
reduce overall performance. This implementation is therefore probably
not suitable for basic applications, like signatures, but is
worthwhile for complex applications, like zero-knowledge proofs, which
do sustained work.
# Future work
There are several directions for future improvement:
* Using the vectorized field arithmetic code to parallelize across
point operations rather than within a single point operation. This
is less flexible, but would give a speedup both from allowing use of
the faster mixed-model arithmetic and from reducing shuffle
pressure. One approach in this direction would be to implement
batched scalar-point operations using vectors of points (AoSoA
layout). This is less generally useful but would give a speedup for
Bulletproofs.
* Extending the IFMA implementation to use the full width of AVX512,
either handling the extra parallelism internally to a single point
operation (by using a 2-way parallel implementation of field
arithmetic instead of a wordsliced one), or externally,
parallelizing across point operations. Internal parallelism would
be preferable but might require too much shuffle pressure. For now,
the only available CPU which runs IFMA operations executes them at
256-bits wide anyways, so this isn't yet important.
* Generalizing the implementation to NEON instructions. The current
point arithmetic code is written in terms of field element vectors,
which are in turn implemented using platform SIMD vectors. It
should be possible to write an alternate implementation of the
`FieldElement2625x4` using NEON without changing the point
arithmetic. NEON has 128-bit vectors rather than 256-bit vectors,
but this may still be worthwhile compared to a serial
implementation.
[sandy2x]: https://eprint.iacr.org/2015/943.pdf
[avx2trac]: https://trac.torproject.org/projects/tor/ticket/8897#comment:28
[hwcd08]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
[curve_models]: https://doc-internal.dalek.rs/curve25519_dalek/curve_models/index.html
[bbjlp08]: https://eprint.iacr.org/2008/013
[cmo98]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
[intel]: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf


@ -1,62 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Pluggable implementations for different architectures.
//!
//! The backend code is split into two parts: a serial backend,
//! and a vector backend.
//!
//! The [`serial`] backend contains 32- and 64-bit implementations of
//! field arithmetic and scalar arithmetic, as well as implementations
//! of point operations using the mixed-model strategy (passing
//! between different curve models depending on the operation).
//!
//! The [`vector`] backend contains implementations of vectorized
//! field arithmetic, used to implement point operations using a novel
//! implementation strategy derived from parallel formulas of Hisil,
//! Wong, Carter, and Dawson.
//!
//! Because the two strategies give rise to different curve models,
//! it's not possible to reuse exactly the same scalar multiplication
//! code (or to write it generically), so both serial and vector
//! backends contain matching implementations of scalar multiplication
//! algorithms. These are intended to be selected by a `#[cfg]`-based
//! type alias.
//!
//! The [`vector`] backend is selected by the `simd_backend` cargo
//! feature; it uses the [`serial`] backend for non-vectorized operations.
#[cfg(not(any(
feature = "u32_backend",
feature = "u64_backend",
feature = "simd_backend",
)))]
compile_error!(
"no curve25519-dalek backend cargo feature enabled! \
please enable one of: u32_backend, u64_backend, simd_backend"
);
pub mod serial;
#[cfg(any(
all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
),
all(feature = "nightly", rustdoc)
))]
#[cfg_attr(
feature = "nightly",
doc(cfg(any(all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
))))
)]
pub mod vector;


@ -1,550 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Internal curve representations which are not part of the public API.
//!
//! # Curve representations
//!
//! Internally, we use several different models for the curve. Here
//! is a sketch of the relationship between the models, following [a
//! post][smith-moderncrypto]
//! by Ben Smith on the `moderncrypto` mailing list. This is also briefly
//! discussed in section 2.5 of [_Montgomery curves and their
//! arithmetic_][costello-smith-2017] by Costello and Smith.
//!
//! Begin with the affine equation for the curve,
//! $$
//! -x\^2 + y\^2 = 1 + dx\^2y\^2.
//! $$
//! Next, pass to the projective closure \\(\mathbb P\^1 \times \mathbb
//! P\^1 \\) by setting \\(x=X/Z\\), \\(y=Y/T.\\) Clearing denominators
//! gives the model
//! $$
//! -X\^2T\^2 + Y\^2Z\^2 = Z\^2T\^2 + dX\^2Y\^2.
//! $$
//! In `curve25519-dalek`, this is represented as the `CompletedPoint`
//! struct.
//! To map from \\(\mathbb P\^1 \times \mathbb P\^1 \\), a product of
//! two lines, to \\(\mathbb P\^3\\), we use the [Segre
//! embedding](https://en.wikipedia.org/wiki/Segre_embedding)
//! $$
//! \sigma : ((X:Z),(Y:T)) \mapsto (XY:XT:ZY:ZT).
//! $$
//! Using coordinates \\( (W_0:W_1:W_2:W_3) \\) for \\(\mathbb P\^3\\),
//! the image \\(\sigma (\mathbb P\^1 \times \mathbb P\^1) \\) is the
//! surface defined by \\( W_0 W_3 = W_1 W_2 \\), and under \\(
//! \sigma\\), the equation above becomes
//! $$
//! -W\_1\^2 + W\_2\^2 = W\_3\^2 + dW\_0\^2,
//! $$
//! so that the curve is given by the pair of equations
//! $$
//! \begin{aligned}
//! -W\_1\^2 + W\_2\^2 &= W\_3\^2 + dW\_0\^2, \\\\ W_0 W_3 &= W_1 W_2.
//! \end{aligned}
//! $$
//! Up to variable naming, this is exactly the "extended" curve model
//! introduced in [_Twisted Edwards Curves
//! Revisited_][hisil-wong-carter-dawson-2008] by Hisil, Wong, Carter,
//! and Dawson. In `curve25519-dalek`, it is represented as the
//! `EdwardsPoint` struct. We can map from \\(\mathbb P\^3 \\) to
//! \\(\mathbb P\^2 \\) by sending \\( (W\_0:W\_1:W\_2:W\_3) \\) to \\(
//! (W\_1:W\_2:W\_3) \\). Notice that
//! $$
//! \frac {W\_1} {W\_3} = \frac {XT} {ZT} = \frac X Z = x,
//! $$
//! and
//! $$
//! \frac {W\_2} {W\_3} = \frac {YZ} {ZT} = \frac Y T = y,
//! $$
//! so this is the same as if we had started with the affine model
//! and passed to \\( \mathbb P\^2 \\) by setting \\( x = W\_1 / W\_3
//! \\), \\(y = W\_2 / W\_3 \\).
//! Up to variable naming, this is the projective representation
//! introduced in [_Twisted Edwards
//! Curves_][bernstein-birkner-joye-lange-peters-2008] by Bernstein,
//! Birkner, Joye, Lange, and Peters. In `curve25519-dalek`, it is
//! represented by the `ProjectivePoint` struct.
//!
//! # Passing between curve models
//!
//! Although the \\( \mathbb P\^3 \\) model provides faster addition
//! formulas, the \\( \mathbb P\^2 \\) model provides faster doubling
//! formulas. Hisil, Wong, Carter, and Dawson therefore suggest mixing
//! coordinate systems for scalar multiplication, attributing the idea
//! to [a 1998 paper][cohen-miyaji-ono-1998] of Cohen, Miyaji, and Ono.
//!
//! Their suggestion is to vary the formulas used by context, using a
//! \\( \mathbb P\^2 \rightarrow \mathbb P\^2 \\) doubling formula when
//! a doubling is followed
//! by another doubling, a \\( \mathbb P\^2 \rightarrow \mathbb P\^3 \\)
//! doubling formula when a doubling is followed by an addition, and
//! computing point additions using a \\( \mathbb P\^3 \times \mathbb P\^3
//! \rightarrow \mathbb P\^2 \\) formula.
//!
//! The `ref10` reference implementation of [Ed25519][ed25519], by
//! Bernstein, Duif, Lange, Schwabe, and Yang, tweaks
//! this strategy, factoring the addition formulas through the
//! completion \\( \mathbb P\^1 \times \mathbb P\^1 \\), so that the
//! output of an addition or doubling always lies in \\( \mathbb P\^1 \times
//! \mathbb P\^1\\), and the choice of which formula to use is replaced
//! by a choice of whether to convert the result to \\( \mathbb P\^2 \\)
//! or \\(\mathbb P\^3 \\). However, this tweak is not described in
//! their paper, only in their software.
//!
//! Our naming for the `CompletedPoint` (\\(\mathbb P\^1 \times \mathbb
//! P\^1 \\)), `ProjectivePoint` (\\(\mathbb P\^2 \\)), and
//! `EdwardsPoint` (\\(\mathbb P\^3 \\)) structs follows the naming in
//! Adam Langley's [Golang ed25519][agl-ed25519] implementation, which
//! `curve25519-dalek` was originally derived from.
//!
//! Finally, to accelerate readditions, we use two cached point formats
//! in "Niels coordinates", named for Niels Duif,
//! one for the affine model and one for the \\( \mathbb P\^3 \\) model:
//!
//! * `AffineNielsPoint`: \\( (y+x, y-x, 2dxy) \\)
//! * `ProjectiveNielsPoint`: \\( (Y+X, Y-X, Z, 2dXY) \\)
//!
//! [smith-moderncrypto]: https://moderncrypto.org/mail-archive/curves/2016/000807.html
//! [costello-smith-2017]: https://eprint.iacr.org/2017/212
//! [hisil-wong-carter-dawson-2008]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
//! [bernstein-birkner-joye-lange-peters-2008]: https://eprint.iacr.org/2008/013
//! [cohen-miyaji-ono-1998]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
//! [ed25519]: https://eprint.iacr.org/2011/368
//! [agl-ed25519]: https://github.com/agl/ed25519
#![allow(non_snake_case)]
use core::fmt::Debug;
use core::ops::{Add, Neg, Sub};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
use constants;
use edwards::EdwardsPoint;
use field::FieldElement;
use traits::ValidityCheck;
// ------------------------------------------------------------------------
// Internal point representations
// ------------------------------------------------------------------------
/// A `ProjectivePoint` is a point \\((X:Y:Z)\\) on the \\(\mathbb
/// P\^2\\) model of the curve.
/// A point \\((x,y)\\) in the affine model corresponds to
/// \\((x:y:1)\\).
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
pub struct ProjectivePoint {
pub X: FieldElement,
pub Y: FieldElement,
pub Z: FieldElement,
}
/// A `CompletedPoint` is a point \\(((X:Z), (Y:T))\\) on the \\(\mathbb
/// P\^1 \times \mathbb P\^1 \\) model of the curve.
/// A point (x,y) in the affine model corresponds to \\( ((x:1),(y:1))
/// \\).
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
#[allow(missing_docs)]
pub struct CompletedPoint {
pub X: FieldElement,
pub Y: FieldElement,
pub Z: FieldElement,
pub T: FieldElement,
}
/// A pre-computed point in the affine model for the curve, represented as
/// \\((y+x, y-x, 2dxy)\\) in "Niels coordinates".
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
// Safe to derive Eq because affine coordinates.
#[derive(Copy, Clone, Eq, PartialEq)]
#[allow(missing_docs)]
pub struct AffineNielsPoint {
pub y_plus_x: FieldElement,
pub y_minus_x: FieldElement,
pub xy2d: FieldElement,
}
impl Zeroize for AffineNielsPoint {
fn zeroize(&mut self) {
self.y_plus_x.zeroize();
self.y_minus_x.zeroize();
self.xy2d.zeroize();
}
}
/// A pre-computed point on the \\( \mathbb P\^3 \\) model for the
/// curve, represented as \\((Y+X, Y-X, Z, 2dXY)\\) in "Niels coordinates".
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
pub struct ProjectiveNielsPoint {
pub Y_plus_X: FieldElement,
pub Y_minus_X: FieldElement,
pub Z: FieldElement,
pub T2d: FieldElement,
}
impl Zeroize for ProjectiveNielsPoint {
fn zeroize(&mut self) {
self.Y_plus_X.zeroize();
self.Y_minus_X.zeroize();
self.Z.zeroize();
self.T2d.zeroize();
}
}
// ------------------------------------------------------------------------
// Constructors
// ------------------------------------------------------------------------
use traits::Identity;
impl Identity for ProjectivePoint {
fn identity() -> ProjectivePoint {
ProjectivePoint {
X: FieldElement::zero(),
Y: FieldElement::one(),
Z: FieldElement::one(),
}
}
}
impl Identity for ProjectiveNielsPoint {
fn identity() -> ProjectiveNielsPoint {
ProjectiveNielsPoint{
Y_plus_X: FieldElement::one(),
Y_minus_X: FieldElement::one(),
Z: FieldElement::one(),
T2d: FieldElement::zero(),
}
}
}
impl Default for ProjectiveNielsPoint {
fn default() -> ProjectiveNielsPoint {
ProjectiveNielsPoint::identity()
}
}
impl Identity for AffineNielsPoint {
fn identity() -> AffineNielsPoint {
AffineNielsPoint{
y_plus_x: FieldElement::one(),
y_minus_x: FieldElement::one(),
xy2d: FieldElement::zero(),
}
}
}
impl Default for AffineNielsPoint {
fn default() -> AffineNielsPoint {
AffineNielsPoint::identity()
}
}
// ------------------------------------------------------------------------
// Validity checks (for debugging, not CT)
// ------------------------------------------------------------------------
impl ValidityCheck for ProjectivePoint {
fn is_valid(&self) -> bool {
// Curve equation is -x^2 + y^2 = 1 + d*x^2*y^2,
// homogenized as (-X^2 + Y^2)*Z^2 = Z^4 + d*X^2*Y^2
let XX = self.X.square();
let YY = self.Y.square();
let ZZ = self.Z.square();
let ZZZZ = ZZ.square();
let lhs = &(&YY - &XX) * &ZZ;
let rhs = &ZZZZ + &(&constants::EDWARDS_D * &(&XX * &YY));
lhs == rhs
}
}
// ------------------------------------------------------------------------
// Constant-time assignment
// ------------------------------------------------------------------------
impl ConditionallySelectable for ProjectiveNielsPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
ProjectiveNielsPoint {
Y_plus_X: FieldElement::conditional_select(&a.Y_plus_X, &b.Y_plus_X, choice),
Y_minus_X: FieldElement::conditional_select(&a.Y_minus_X, &b.Y_minus_X, choice),
Z: FieldElement::conditional_select(&a.Z, &b.Z, choice),
T2d: FieldElement::conditional_select(&a.T2d, &b.T2d, choice),
}
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.Y_plus_X.conditional_assign(&other.Y_plus_X, choice);
self.Y_minus_X.conditional_assign(&other.Y_minus_X, choice);
self.Z.conditional_assign(&other.Z, choice);
self.T2d.conditional_assign(&other.T2d, choice);
}
}
impl ConditionallySelectable for AffineNielsPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
AffineNielsPoint {
y_plus_x: FieldElement::conditional_select(&a.y_plus_x, &b.y_plus_x, choice),
y_minus_x: FieldElement::conditional_select(&a.y_minus_x, &b.y_minus_x, choice),
xy2d: FieldElement::conditional_select(&a.xy2d, &b.xy2d, choice),
}
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.y_plus_x.conditional_assign(&other.y_plus_x, choice);
self.y_minus_x.conditional_assign(&other.y_minus_x, choice);
self.xy2d.conditional_assign(&other.xy2d, choice);
}
}
// ------------------------------------------------------------------------
// Point conversions
// ------------------------------------------------------------------------
impl ProjectivePoint {
/// Convert this point from the \\( \mathbb P\^2 \\) model to the
/// \\( \mathbb P\^3 \\) model.
///
/// This costs \\(3 \mathrm M + 1 \mathrm S\\).
pub fn to_extended(&self) -> EdwardsPoint {
EdwardsPoint {
X: &self.X * &self.Z,
Y: &self.Y * &self.Z,
Z: self.Z.square(),
T: &self.X * &self.Y,
}
}
}
impl CompletedPoint {
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
/// \\) model to the \\( \mathbb P\^2 \\) model.
///
/// This costs \\(3 \mathrm M \\).
pub fn to_projective(&self) -> ProjectivePoint {
ProjectivePoint {
X: &self.X * &self.T,
Y: &self.Y * &self.Z,
Z: &self.Z * &self.T,
}
}
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
/// \\) model to the \\( \mathbb P\^3 \\) model.
///
/// This costs \\(4 \mathrm M \\).
pub fn to_extended(&self) -> EdwardsPoint {
EdwardsPoint {
X: &self.X * &self.T,
Y: &self.Y * &self.Z,
Z: &self.Z * &self.T,
T: &self.X * &self.Y,
}
}
}
// ------------------------------------------------------------------------
// Doubling
// ------------------------------------------------------------------------
impl ProjectivePoint {
/// Double this point: return self + self
pub fn double(&self) -> CompletedPoint { // Double()
let XX = self.X.square();
let YY = self.Y.square();
let ZZ2 = self.Z.square2();
let X_plus_Y = &self.X + &self.Y;
let X_plus_Y_sq = X_plus_Y.square();
let YY_plus_XX = &YY + &XX;
let YY_minus_XX = &YY - &XX;
CompletedPoint{
X: &X_plus_Y_sq - &YY_plus_XX,
Y: YY_plus_XX,
Z: YY_minus_XX,
T: &ZZ2 - &YY_minus_XX
}
}
}
// ------------------------------------------------------------------------
// Addition and Subtraction
// ------------------------------------------------------------------------
// XXX(hdevalence) These were doc(hidden) so they don't appear in the
// public API docs.
// However, that prevents them from being used with --document-private-items,
// so the doc(hidden) is commented out for now until this is resolved
//
// upstream rust issue: https://github.com/rust-lang/rust/issues/46380
//#[doc(hidden)]
impl<'a, 'b> Add<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn add(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PP = &Y_plus_X * &other.Y_plus_X;
let MM = &Y_minus_X * &other.Y_minus_X;
let TT2d = &self.T * &other.T2d;
let ZZ = &self.Z * &other.Z;
let ZZ2 = &ZZ + &ZZ;
CompletedPoint{
X: &PP - &MM,
Y: &PP + &MM,
Z: &ZZ2 + &TT2d,
T: &ZZ2 - &TT2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Sub<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn sub(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PM = &Y_plus_X * &other.Y_minus_X;
let MP = &Y_minus_X * &other.Y_plus_X;
let TT2d = &self.T * &other.T2d;
let ZZ = &self.Z * &other.Z;
let ZZ2 = &ZZ + &ZZ;
CompletedPoint{
X: &PM - &MP,
Y: &PM + &MP,
Z: &ZZ2 - &TT2d,
T: &ZZ2 + &TT2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Add<&'b AffineNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn add(self, other: &'b AffineNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PP = &Y_plus_X * &other.y_plus_x;
let MM = &Y_minus_X * &other.y_minus_x;
let Txy2d = &self.T * &other.xy2d;
let Z2 = &self.Z + &self.Z;
CompletedPoint{
X: &PP - &MM,
Y: &PP + &MM,
Z: &Z2 + &Txy2d,
T: &Z2 - &Txy2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Sub<&'b AffineNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn sub(self, other: &'b AffineNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PM = &Y_plus_X * &other.y_minus_x;
let MP = &Y_minus_X * &other.y_plus_x;
let Txy2d = &self.T * &other.xy2d;
let Z2 = &self.Z + &self.Z;
CompletedPoint{
X: &PM - &MP,
Y: &PM + &MP,
Z: &Z2 - &Txy2d,
T: &Z2 + &Txy2d
}
}
}
// ------------------------------------------------------------------------
// Negation
// ------------------------------------------------------------------------
impl<'a> Neg for &'a ProjectiveNielsPoint {
type Output = ProjectiveNielsPoint;
fn neg(self) -> ProjectiveNielsPoint {
ProjectiveNielsPoint{
Y_plus_X: self.Y_minus_X,
Y_minus_X: self.Y_plus_X,
Z: self.Z,
T2d: -(&self.T2d),
}
}
}
impl<'a> Neg for &'a AffineNielsPoint {
type Output = AffineNielsPoint;
fn neg(self) -> AffineNielsPoint {
AffineNielsPoint{
y_plus_x: self.y_minus_x,
y_minus_x: self.y_plus_x,
xy2d: -(&self.xy2d)
}
}
}
// ------------------------------------------------------------------------
// Debug traits
// ------------------------------------------------------------------------
impl Debug for ProjectivePoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "ProjectivePoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?}\n}}",
&self.X, &self.Y, &self.Z)
}
}
impl Debug for CompletedPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "CompletedPoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?},\n\tT: {:?}\n}}",
&self.X, &self.Y, &self.Z, &self.T)
}
}
impl Debug for AffineNielsPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "AffineNielsPoint{{\n\ty_plus_x: {:?},\n\ty_minus_x: {:?},\n\txy2d: {:?}\n}}",
&self.y_plus_x, &self.y_minus_x, &self.xy2d)
}
}
impl Debug for ProjectiveNielsPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "ProjectiveNielsPoint{{\n\tY_plus_X: {:?},\n\tY_minus_X: {:?},\n\tZ: {:?},\n\tT2d: {:?}\n}}",
&self.Y_plus_X, &self.Y_minus_X, &self.Z, &self.T2d)
}
}


@ -1,43 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Serial implementations of field, scalar, point arithmetic.
//!
//! When the vector backend is disabled, the crate uses the
//! mixed-model strategy for implementing point operations and scalar
//! multiplication; see the [`curve_models`](self::curve_models) and
//! [`scalar_mul`](self::scalar_mul) documentation for more
//! information.
//!
//! When the vector backend is enabled, the field and scalar
//! implementations are still used for non-vectorized operations.
//!
//! Note: at this time the `u32` and `u64` backends cannot be built
//! together.
#[cfg(not(any(feature = "u32_backend", feature = "u64_backend")))]
compile_error!(
"no curve25519-dalek backend cargo feature enabled! \
please enable one of: u32_backend, u64_backend"
);
#[cfg(feature = "u32_backend")]
pub mod u32;
#[cfg(feature = "u64_backend")]
pub mod u64;
pub mod curve_models;
#[cfg(not(all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
)))]
pub mod scalar_mul;


@ -1,30 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Implementations of various scalar multiplication algorithms.
//!
//! Note that all of these implementations use serial code for field
//! arithmetic with the multi-model strategy described in the
//! `curve_models` module. The vectorized AVX2 backend has its own
//! scalar multiplication implementations, since it only uses one
//! curve model.
pub mod variable_base;
pub mod vartime_double_base;
#[cfg(feature = "alloc")]
pub mod straus;
#[cfg(feature = "alloc")]
pub mod precomputed_straus;
#[cfg(feature = "alloc")]
pub mod pippenger;


@ -1,202 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Oleg Andreev
// See LICENSE for licensing information.
//
// Authors:
// - Oleg Andreev <oleganza@gmail.com>
//! Implementation of a variant of Pippenger's algorithm.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::VartimeMultiscalarMul;
#[allow(unused_imports)]
use prelude::*;
/// Implements a version of Pippenger's algorithm.
///
/// The algorithm works as follows:
///
/// Let `n` be a number of point-scalar pairs.
/// Let `w` be the window width in bits (6..8, chosen based on `n`; see the cost estimate below).
///
/// 1. Prepare `2^(w-1) - 1` buckets with indices `[1..2^(w-1))` initialized with identity points.
/// Bucket 0 is not needed as it would contain points multiplied by 0.
/// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`.
/// Note: only the last digit may equal `2^w/2`.
/// 3. Starting with the last window, for each point `i=[0..n)` add it to the bucket indexed by
/// the digit of that point's scalar in the current window.
/// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each
/// by their index. An efficient way to do this is to start with the last bucket and compute two sums:
/// intermediate sum from the last to the first, and the full sum made of all intermediate sums.
/// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings.
/// 6. Add to the return value.
/// 7. Repeat the loop.
///
/// Approximate cost w/o wNAF optimizations (A = addition, D = doubling):
///
/// ```ascii
/// cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w
/// | | | | |
/// | | | | looping over 256/w windows
/// | | | adding to the result
/// sorting points | shifting the sum by w bits (to the next window, starting from last window)
/// one by one |
/// into buckets adding/subtracting all buckets
/// multiplied by their indexes
/// using a sum of intermediate sums
/// ```
///
/// For large `n`, the dominant cost is the (n*256/w) additions.
/// However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate.
/// Therefore, the optimal choice of `w` grows slowly as `n` grows.
///
/// This algorithm is adapted from section 4 of https://eprint.iacr.org/2012/549.pdf.
pub struct Pippenger;
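// A minimal illustrative sketch (not part of the upstream API): the weighted
// bucket sum 1*b[0] + 2*b[1] + ... + k*b[k-1] from step 4 above can be
// computed with two running sums and no multiplications. Plain integers
// stand in for curve points here; `weighted_bucket_sum_sketch` is a
// hypothetical helper, for illustration only.
#[cfg(test)]
fn weighted_bucket_sum_sketch(buckets: &[i64]) -> i64 {
assert!(!buckets.is_empty());
let k = buckets.len();
let mut intermediate = buckets[k - 1]; // b[i] + ... + b[k-1]
let mut total = buckets[k - 1]; // accumulates the intermediate sums
for i in (0..k - 1).rev() {
intermediate += buckets[i];
total += intermediate; // adds one more copy of every higher-indexed bucket
}
total // equals 1*b[0] + 2*b[1] + ... + k*b[k-1]
}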
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
type Point = EdwardsPoint;
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
use traits::Identity;
let mut scalars = scalars.into_iter();
let size = scalars.by_ref().size_hint().0;
// Digit width in bits. As digit width grows,
// the number of point additions goes down, but the number of
// buckets and bucket additions grows exponentially.
let w = if size < 500 {
6
} else if size < 800 {
7
} else {
8
};
let max_digit: usize = 1 << w;
let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket
// Collect optimized scalars and points in buffers for repeated access
// (scanning the whole set per digit position).
let scalars = scalars
.map(|s| s.borrow().to_radix_2w(w));
let points = points
.into_iter()
.map(|p| p.map(|P| P.to_projective_niels()));
let scalars_points = scalars
.zip(points)
.map(|(s, maybe_p)| maybe_p.map(|p| (s, p)))
.collect::<Option<Vec<_>>>()?;
// Prepare 2^w/2 buckets.
// buckets[i] corresponds to a multiplication factor (i+1).
let mut buckets: Vec<_> = (0..buckets_count)
.map(|_| EdwardsPoint::identity())
.collect();
let mut columns = (0..digits_count).rev().map(|digit_index| {
// Clear the buckets when processing another digit.
for i in 0..buckets_count {
buckets[i] = EdwardsPoint::identity();
}
// Iterate over pairs of (point, scalar)
// and add/sub the point to the corresponding bucket.
// Note: if we add support for precomputed lookup tables,
// we'll be adding/subtracting the point premultiplied by `digits[i]` to buckets[0].
for (digits, pt) in scalars_points.iter() {
// Widen digit so that we don't run into edge cases when w=8.
let digit = digits[digit_index] as i16;
if digit > 0 {
let b = (digit - 1) as usize;
buckets[b] = (&buckets[b] + pt).to_extended();
} else if digit < 0 {
let b = (-digit - 1) as usize;
buckets[b] = (&buckets[b] - pt).to_extended();
}
}
// Add the buckets applying the multiplication factor to each bucket.
// The most efficient way to do that is to have a single sum with two running sums:
// an intermediate sum from last bucket to the first, and a sum of intermediate sums.
//
// For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
// C
// C B
// C B A Sum = C + (C+B) + (C+B+A)
let mut buckets_intermediate_sum = buckets[buckets_count - 1];
let mut buckets_sum = buckets[buckets_count - 1];
for i in (0..(buckets_count - 1)).rev() {
buckets_intermediate_sum += buckets[i];
buckets_sum += buckets_intermediate_sum;
}
buckets_sum
});
// Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
// `unwrap()` always succeeds because we know we have more than zero digits.
let hi_column = columns.next().unwrap();
Some(
columns
.fold(hi_column, |total, p| total.mul_by_pow_2(w as u32) + p),
)
}
}
#[cfg(test)]
mod test {
use super::*;
use constants;
use scalar::Scalar;
#[test]
fn test_vartime_pippenger() {
// Reuse points across different tests
let mut n = 512;
let x = Scalar::from(2128506u64).invert();
let y = Scalar::from(4443282u64).invert();
let points: Vec<_> = (0..n)
.map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
.collect();
let scalars: Vec<_> = (0..n)
.map(|i| x + (Scalar::from(i as u64) * y)) // fast way to make ~random but deterministic scalars
.collect();
let premultiplied: Vec<EdwardsPoint> = scalars
.iter()
.zip(points.iter())
.map(|(sc, pt)| sc * pt)
.collect();
while n > 0 {
let scalars = &scalars[0..n].to_vec();
let points = &points[0..n].to_vec();
let control: EdwardsPoint = premultiplied[0..n].iter().sum();
let subject = Pippenger::vartime_multiscalar_mul(scalars.clone(), points.clone());
assert_eq!(subject.compress(), control.compress());
n = n / 2;
}
}
}


@ -1,110 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Henry de Valence.
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Precomputation for Straus's method.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use backend::serial::curve_models::{
AffineNielsPoint, CompletedPoint, ProjectiveNielsPoint, ProjectivePoint,
};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::Identity;
use traits::VartimePrecomputedMultiscalarMul;
use window::{NafLookupTable5, NafLookupTable8};
#[allow(unused_imports)]
use prelude::*;
pub struct VartimePrecomputedStraus {
static_lookup_tables: Vec<NafLookupTable8<AffineNielsPoint>>,
}
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
type Point = EdwardsPoint;
fn new<I>(static_points: I) -> Self
where
I: IntoIterator,
I::Item: Borrow<Self::Point>,
{
Self {
static_lookup_tables: static_points
.into_iter()
.map(|P| NafLookupTable8::<AffineNielsPoint>::from(P.borrow()))
.collect(),
}
}
fn optional_mixed_multiscalar_mul<I, J, K>(
&self,
static_scalars: I,
dynamic_scalars: J,
dynamic_points: K,
) -> Option<Self::Point>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Scalar>,
K: IntoIterator<Item = Option<Self::Point>>,
{
let static_nafs = static_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_nafs: Vec<_> = dynamic_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_lookup_tables = dynamic_points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let sp = self.static_lookup_tables.len();
let dp = dynamic_lookup_tables.len();
assert_eq!(sp, static_nafs.len());
assert_eq!(dp, dynamic_nafs.len());
// We could save some doublings by looking for the highest
// nonzero NAF coefficient, but since we might have a lot of
// them to search, it's not clear it's worthwhile to check.
let mut S = ProjectivePoint::identity();
for j in (0..256).rev() {
let mut R: CompletedPoint = S.double();
for i in 0..dp {
let t_ij = dynamic_nafs[i][j];
if t_ij > 0 {
R = &R.to_extended() + &dynamic_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R.to_extended() - &dynamic_lookup_tables[i].select(-t_ij as usize);
}
}
for i in 0..sp {
let t_ij = static_nafs[i][j];
if t_ij > 0 {
R = &R.to_extended() + &self.static_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R.to_extended() - &self.static_lookup_tables[i].select(-t_ij as usize);
}
}
S = R.to_projective();
}
Some(S.to_extended())
}
}


@ -1,195 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Implementation of the interleaved window method, also known as Straus' method.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::MultiscalarMul;
use traits::VartimeMultiscalarMul;
#[allow(unused_imports)]
use prelude::*;
/// Perform multiscalar multiplication by the interleaved window
/// method, also known as Straus' method (since it was apparently
/// [first published][solution] by Straus in 1964, as a solution to [a
/// problem][problem] posted in the American Mathematical Monthly in
/// 1963).
///
/// It is easy enough to reinvent, and has been repeatedly. The basic
/// idea is that when computing
/// \\[
/// Q = s_1 P_1 + \cdots + s_n P_n
/// \\]
/// by means of additions and doublings, the doublings can be shared
/// across the \\( P_i \\).
///
/// We implement two versions, a constant-time algorithm using fixed
/// windows and a variable-time algorithm using sliding windows. They
/// are slight variations on the same idea, and are described in more
/// detail in the respective implementations.
///
/// [solution]: https://www.jstor.org/stable/2310929
/// [problem]: https://www.jstor.org/stable/2312273
pub struct Straus {}
impl MultiscalarMul for Straus {
type Point = EdwardsPoint;
/// Constant-time Straus using a fixed window of size \\(4\\).
///
/// Our goal is to compute
/// \\[
/// Q = s_1 P_1 + \cdots + s_n P_n.
/// \\]
///
/// For each point \\( P_i \\), precompute a lookup table of
/// \\[
/// P_i, 2P_i, 3P_i, 4P_i, 5P_i, 6P_i, 7P_i, 8P_i.
/// \\]
///
/// For each scalar \\( s_i \\), compute its radix-\\(2^4\\)
/// signed digits \\( s_{i,j} \\), i.e.,
/// \\[
/// s_i = s_{i,0} + s_{i,1} 16^1 + ... + s_{i,63} 16^{63},
/// \\]
/// with \\( -8 \leq s_{i,j} < 8 \\). Since \\( 0 \leq |s_{i,j}|
/// \leq 8 \\), we can retrieve \\( s_{i,j} P_i \\) from the
/// lookup table with a conditional negation: using signed
/// digits halves the required table size.
///
/// Then as in the single-base fixed window case, we have
/// \\[
/// \begin{aligned}
/// s_i P_i &= P_i (s_{i,0} + s_{i,1} 16^1 + \cdots + s_{i,63} 16^{63}) \\\\
/// s_i P_i &= P_i s_{i,0} + P_i s_{i,1} 16^1 + \cdots + P_i s_{i,63} 16^{63} \\\\
/// s_i P_i &= P_i s_{i,0} + 16(P_i s_{i,1} + 16( \cdots +16P_i s_{i,63})\cdots )
/// \end{aligned}
/// \\]
/// so each \\( s_i P_i \\) can be computed by alternately adding
/// a precomputed multiple \\( P_i s_{i,j} \\) of \\( P_i \\) and
/// repeatedly doubling.
///
/// Now consider the two-dimensional sum
/// \\[
/// \begin{aligned}
/// s\_1 P\_1 &=& P\_1 s\_{1,0} &+& 16 (P\_1 s\_{1,1} &+& 16 ( \cdots &+& 16 P\_1 s\_{1,63}&) \cdots ) \\\\
/// + & & + & & + & & & & + & \\\\
/// s\_2 P\_2 &=& P\_2 s\_{2,0} &+& 16 (P\_2 s\_{2,1} &+& 16 ( \cdots &+& 16 P\_2 s\_{2,63}&) \cdots ) \\\\
/// + & & + & & + & & & & + & \\\\
/// \vdots & & \vdots & & \vdots & & & & \vdots & \\\\
/// + & & + & & + & & & & + & \\\\
/// s\_n P\_n &=& P\_n s\_{n,0} &+& 16 (P\_n s\_{n,1} &+& 16 ( \cdots &+& 16 P\_n s\_{n,63}&) \cdots )
/// \end{aligned}
/// \\]
/// The sum of the left-hand column is the result \\( Q \\); by
/// computing the two-dimensional sum on the right column-wise,
/// top-to-bottom, then right-to-left, we need to multiply by \\(
/// 16\\) only once per column, sharing the doublings across all
/// of the input points.
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<EdwardsPoint>,
{
use zeroize::Zeroizing;
use backend::serial::curve_models::ProjectiveNielsPoint;
use window::LookupTable;
use traits::Identity;
let lookup_tables: Vec<_> = points
.into_iter()
.map(|point| LookupTable::<ProjectiveNielsPoint>::from(point.borrow()))
.collect();
// This puts the scalar digits into a heap-allocated Vec.
// To ensure that these are erased, pass ownership of the Vec into a
// Zeroizing wrapper.
let scalar_digits_vec: Vec<_> = scalars
.into_iter()
.map(|s| s.borrow().to_radix_16())
.collect();
let scalar_digits = Zeroizing::new(scalar_digits_vec);
let mut Q = EdwardsPoint::identity();
for j in (0..64).rev() {
Q = Q.mul_by_pow_2(4);
let it = scalar_digits.iter().zip(lookup_tables.iter());
for (s_i, lookup_table_i) in it {
// R_i = s_{i,j} * P_i
let R_i = lookup_table_i.select(s_i[j]);
// Q = Q + R_i
Q = (&Q + &R_i).to_extended();
}
}
Q
}
}
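// A sketch of the same right-to-left evaluation with integers in place of
// points (`horner_radix16_sketch` is a hypothetical helper, for illustration
// only): folding the signed radix-16 digits by Horner's rule mirrors the
// column scan above, with one shared "multiply by 16" per column.
#[cfg(test)]
fn horner_radix16_sketch(digits: &[i8; 64]) -> i128 {
let mut acc: i128 = 0;
for &d in digits.iter().rev() {
acc = acc * 16 + d as i128; // the shared doubling step, Q = 16*Q + R_i
}
acc // equals s_0 + s_1*16 + ... + s_63*16^63
}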
impl VartimeMultiscalarMul for Straus {
type Point = EdwardsPoint;
/// Variable-time Straus using a non-adjacent form of width \\(5\\).
///
/// This closely parallels the constant-time code, but we
/// use a non-adjacent form for the scalar, and do not do table
/// lookups in constant time.
///
/// The non-adjacent form has signed, odd digits. Using only odd
/// digits halves the table size (since we only need odd
/// multiples), or gives fewer additions for the same table size.
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
use backend::serial::curve_models::{CompletedPoint, ProjectiveNielsPoint, ProjectivePoint};
use window::NafLookupTable5;
use traits::Identity;
let nafs: Vec<_> = scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect();
let lookup_tables = points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let mut r = ProjectivePoint::identity();
for i in (0..256).rev() {
let mut t: CompletedPoint = r.double();
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
if naf[i] > 0 {
t = &t.to_extended() + &lookup_table.select(naf[i] as usize);
} else if naf[i] < 0 {
t = &t.to_extended() - &lookup_table.select(-naf[i] as usize);
}
}
r = t.to_projective();
}
Some(r.to_extended())
}
}
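// A sketch of a width-w non-adjacent form on a small non-negative integer
// (illustration only; the real `non_adjacent_form` works on 256-bit
// scalars): every nonzero digit is odd and lies in (-2^(w-1), 2^(w-1)),
// and any w consecutive digits contain at most one nonzero digit.
#[cfg(test)]
fn wnaf_sketch(mut k: i128, w: u32) -> Vec<i64> {
let window = 1i128 << w;
let mut digits = Vec::new();
while k != 0 {
if k & 1 == 1 {
let mut d = k % window; // k mod 2^w, in [0, 2^w)
if d >= window / 2 {
d -= window; // center the digit; k - d is then divisible by 2^w
}
k -= d;
digits.push(d as i64);
} else {
digits.push(0);
}
k >>= 1;
}
digits // the sum of digits[i] * 2^i recovers the original k
}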


@ -1,46 +0,0 @@
#![allow(non_snake_case)]
use traits::Identity;
use scalar::Scalar;
use edwards::EdwardsPoint;
use backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint};
use window::LookupTable;
/// Perform constant-time, variable-base scalar multiplication.
pub(crate) fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
let lookup_table = LookupTable::<ProjectiveNielsPoint>::from(point);
// Setting s = scalar, compute
//
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
//
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
let scalar_digits = scalar.to_radix_16();
// Compute s*P as
//
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
//
// We sum right-to-left.
// Unwrap first loop iteration to save computing 16*identity
let mut tmp2;
let mut tmp3 = EdwardsPoint::identity();
let mut tmp1 = &tmp3 + &lookup_table.select(scalar_digits[63]);
// Now tmp1 = s_63*P in P1xP1 coords
for i in (0..63).rev() {
tmp2 = tmp1.to_projective(); // tmp2 = (prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 2*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 2*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 4*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 4*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 8*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 8*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 16*(prev) in P1xP1 coords
tmp3 = tmp1.to_extended(); // tmp3 = 16*(prev) in P3 coords
tmp1 = &tmp3 + &lookup_table.select(scalar_digits[i]);
// Now tmp1 = s_i*P + 16*(prev) in P1xP1 coords
}
tmp1.to_extended()
}


@ -1,61 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use constants;
use traits::Identity;
use scalar::Scalar;
use edwards::EdwardsPoint;
use backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint};
use window::NafLookupTable5;
/// Compute \\(aA + bB\\) in variable time, where \\(B\\) is the Ed25519 basepoint.
pub fn mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint {
let a_naf = a.non_adjacent_form(5);
let b_naf = b.non_adjacent_form(8);
// Find starting index
let mut i: usize = 255;
for j in (0..256).rev() {
i = j;
if a_naf[i] != 0 || b_naf[i] != 0 {
break;
}
}
let table_A = NafLookupTable5::<ProjectiveNielsPoint>::from(A);
let table_B = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT;
let mut r = ProjectivePoint::identity();
loop {
let mut t = r.double();
if a_naf[i] > 0 {
t = &t.to_extended() + &table_A.select(a_naf[i] as usize);
} else if a_naf[i] < 0 {
t = &t.to_extended() - &table_A.select(-a_naf[i] as usize);
}
if b_naf[i] > 0 {
t = &t.to_extended() + &table_B.select(b_naf[i] as usize);
} else if b_naf[i] < 0 {
t = &t.to_extended() - &table_B.select(-b_naf[i] as usize);
}
r = t.to_projective();
if i == 0 {
break;
}
i -= 1;
}
r.to_extended()
}

View File

@ -1,577 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(32\\)-bit
//! limbs with \\(64\\)-bit products.
//!
//! This code was originally derived from Adam Langley's Golang ed25519
//! implementation, and was then rewritten to use unsigned limbs instead
//! of signed limbs.
use core::fmt::Debug;
use core::ops::Neg;
use core::ops::{Add, AddAssign};
use core::ops::{Mul, MulAssign};
use core::ops::{Sub, SubAssign};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
/// A `FieldElement2625` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// In the 32-bit implementation, a `FieldElement` is represented in
/// radix \\(2\^{25.5}\\) as ten `u32`s. This means that a field
/// element \\(x\\) is represented as
/// $$
/// x = \sum\_{i=0}\^9 x\_i 2\^{\lceil i \frac {51} 2 \rceil}
/// = x\_0 + x\_1 2\^{26} + x\_2 2\^{51} + x\_3 2\^{77} + \cdots + x\_9 2\^{230};
/// $$
/// the coefficients are alternately bounded by \\(2\^{25}\\) and
/// \\(2\^{26}\\). The limbs are allowed to grow between reductions up
/// to \\(2\^{25+b}\\) or \\(2\^{26+b}\\), where \\(b = 1.75\\).
///
/// # Note
///
/// The `curve25519_dalek::field` module provides a type alias
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
/// or `FieldElement2625`.
///
/// The backend-specific type `FieldElement2625` should not be used
/// outside of the `curve25519_dalek::field` module.
#[derive(Copy, Clone)]
pub struct FieldElement2625(pub (crate) [u32; 10]);
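// The radix position of limb i is ceil(i*51/2), i.e. 0, 26, 51, 77, 102,
// 128, 153, 179, 204, 230, alternating 26- and 25-bit limbs as described
// above. A one-line sketch of that offset computation (hypothetical helper,
// for illustration only):
#[cfg(test)]
fn limb_bit_offset(i: u32) -> u32 {
(i * 51 + 1) / 2 // ceil(i * 51 / 2)
}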
impl Debug for FieldElement2625 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "FieldElement2625({:?})", &self.0[..])
}
}
impl Zeroize for FieldElement2625 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl<'b> AddAssign<&'b FieldElement2625> for FieldElement2625 {
fn add_assign(&mut self, _rhs: &'b FieldElement2625) {
for i in 0..10 {
self.0[i] += _rhs.0[i];
}
}
}
impl<'a, 'b> Add<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn add(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
let mut output = *self;
output += _rhs;
output
}
}
impl<'b> SubAssign<&'b FieldElement2625> for FieldElement2625 {
fn sub_assign(&mut self, _rhs: &'b FieldElement2625) {
// See comment in FieldElement51::Sub
//
// Compute a - b as ((a + 2^4 * p) - b) to avoid underflow.
let b = &_rhs.0;
self.0 = FieldElement2625::reduce([
((self.0[0] + (0x3ffffed << 4)) - b[0]) as u64,
((self.0[1] + (0x1ffffff << 4)) - b[1]) as u64,
((self.0[2] + (0x3ffffff << 4)) - b[2]) as u64,
((self.0[3] + (0x1ffffff << 4)) - b[3]) as u64,
((self.0[4] + (0x3ffffff << 4)) - b[4]) as u64,
((self.0[5] + (0x1ffffff << 4)) - b[5]) as u64,
((self.0[6] + (0x3ffffff << 4)) - b[6]) as u64,
((self.0[7] + (0x1ffffff << 4)) - b[7]) as u64,
((self.0[8] + (0x3ffffff << 4)) - b[8]) as u64,
((self.0[9] + (0x1ffffff << 4)) - b[9]) as u64,
]).0;
}
}
impl<'a, 'b> Sub<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn sub(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
let mut output = *self;
output -= _rhs;
output
}
}
impl<'b> MulAssign<&'b FieldElement2625> for FieldElement2625 {
fn mul_assign(&mut self, _rhs: &'b FieldElement2625) {
let result = (self as &FieldElement2625) * _rhs;
self.0 = result.0;
}
}
impl<'a, 'b> Mul<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn mul(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
/// Helper function to multiply two 32-bit integers with 64 bits
/// of output.
#[inline(always)]
fn m(x: u32, y: u32) -> u64 { (x as u64) * (y as u64) }
// Alias self, _rhs for more readable formulas
let x: &[u32;10] = &self.0; let y: &[u32;10] = &_rhs.0;
// We assume that the input limbs x[i], y[i] are bounded by:
//
// x[i], y[i] < 2^(26 + b) if i even
// x[i], y[i] < 2^(25 + b) if i odd
//
// where b is a (real) parameter representing the excess bits of
// the limbs. We track the bitsizes of all variables through
// the computation and solve at the end for the allowable
// headroom bitsize b (which determines how many additions we
// can perform between reductions or multiplications).
let y1_19 = 19 * y[1]; // This fits in a u32
let y2_19 = 19 * y[2]; // iff 26 + b + lg(19) < 32
let y3_19 = 19 * y[3]; // if b < 32 - 26 - 4.248 = 1.752
let y4_19 = 19 * y[4];
let y5_19 = 19 * y[5]; // below, b<2.5: this is a bottleneck,
let y6_19 = 19 * y[6]; // could be avoided by promoting to
let y7_19 = 19 * y[7]; // u64 here instead of in m()
let y8_19 = 19 * y[8];
let y9_19 = 19 * y[9];
// What happens when we multiply x[i] with y[j] and place the
// result into the (i+j)-th limb?
//
// x[i] represents the value x[i]*2^ceil(i*51/2)
// y[j] represents the value y[j]*2^ceil(j*51/2)
// z[i+j] represents the value z[i+j]*2^ceil((i+j)*51/2)
// x[i]*y[j] represents the value x[i]*y[j]*2^(ceil(i*51/2)+ceil(j*51/2))
//
// Since the radix is already accounted for, the result placed
// into the (i+j)-th limb should be
//
// x[i]*y[j]*2^(ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2)).
//
// The value of ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2) is
// 1 when both i and j are odd, and 0 otherwise. So we add
//
// x[i]*y[j] if either i or j is even
// 2*x[i]*y[j] if i and j are both odd
//
// by using precomputed multiples of x[i] for odd i:
let x1_2 = 2 * x[1]; // This fits in a u32 iff 25 + b + 1 < 32
let x3_2 = 2 * x[3]; // iff b < 6
let x5_2 = 2 * x[5];
let x7_2 = 2 * x[7];
let x9_2 = 2 * x[9];
let z0 = m(x[0],y[0]) + m(x1_2,y9_19) + m(x[2],y8_19) + m(x3_2,y7_19) + m(x[4],y6_19) + m(x5_2,y5_19) + m(x[6],y4_19) + m(x7_2,y3_19) + m(x[8],y2_19) + m(x9_2,y1_19);
let z1 = m(x[0],y[1]) + m(x[1],y[0]) + m(x[2],y9_19) + m(x[3],y8_19) + m(x[4],y7_19) + m(x[5],y6_19) + m(x[6],y5_19) + m(x[7],y4_19) + m(x[8],y3_19) + m(x[9],y2_19);
let z2 = m(x[0],y[2]) + m(x1_2,y[1]) + m(x[2],y[0]) + m(x3_2,y9_19) + m(x[4],y8_19) + m(x5_2,y7_19) + m(x[6],y6_19) + m(x7_2,y5_19) + m(x[8],y4_19) + m(x9_2,y3_19);
let z3 = m(x[0],y[3]) + m(x[1],y[2]) + m(x[2],y[1]) + m(x[3],y[0]) + m(x[4],y9_19) + m(x[5],y8_19) + m(x[6],y7_19) + m(x[7],y6_19) + m(x[8],y5_19) + m(x[9],y4_19);
let z4 = m(x[0],y[4]) + m(x1_2,y[3]) + m(x[2],y[2]) + m(x3_2,y[1]) + m(x[4],y[0]) + m(x5_2,y9_19) + m(x[6],y8_19) + m(x7_2,y7_19) + m(x[8],y6_19) + m(x9_2,y5_19);
let z5 = m(x[0],y[5]) + m(x[1],y[4]) + m(x[2],y[3]) + m(x[3],y[2]) + m(x[4],y[1]) + m(x[5],y[0]) + m(x[6],y9_19) + m(x[7],y8_19) + m(x[8],y7_19) + m(x[9],y6_19);
let z6 = m(x[0],y[6]) + m(x1_2,y[5]) + m(x[2],y[4]) + m(x3_2,y[3]) + m(x[4],y[2]) + m(x5_2,y[1]) + m(x[6],y[0]) + m(x7_2,y9_19) + m(x[8],y8_19) + m(x9_2,y7_19);
let z7 = m(x[0],y[7]) + m(x[1],y[6]) + m(x[2],y[5]) + m(x[3],y[4]) + m(x[4],y[3]) + m(x[5],y[2]) + m(x[6],y[1]) + m(x[7],y[0]) + m(x[8],y9_19) + m(x[9],y8_19);
let z8 = m(x[0],y[8]) + m(x1_2,y[7]) + m(x[2],y[6]) + m(x3_2,y[5]) + m(x[4],y[4]) + m(x5_2,y[3]) + m(x[6],y[2]) + m(x7_2,y[1]) + m(x[8],y[0]) + m(x9_2,y9_19);
let z9 = m(x[0],y[9]) + m(x[1],y[8]) + m(x[2],y[7]) + m(x[3],y[6]) + m(x[4],y[5]) + m(x[5],y[4]) + m(x[6],y[3]) + m(x[7],y[2]) + m(x[8],y[1]) + m(x[9],y[0]);
// How big is the contribution to z[i+j] from x[i], y[j]?
//
// Using the bounds above, we get:
//
// i even, j even: x[i]*y[j] < 2^(26+b)*2^(26+b) = 2*2^(51+2*b)
// i odd, j even: x[i]*y[j] < 2^(25+b)*2^(26+b) = 1*2^(51+2*b)
// i even, j odd: x[i]*y[j] < 2^(26+b)*2^(25+b) = 1*2^(51+2*b)
// i odd, j odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
//
// We perform inline reduction mod p by replacing 2^255 by 19
// (since 2^255 - 19 = 0 mod p). This adds a factor of 19, so
// we get the bounds (z0 is the biggest one, but calculated for
// posterity here in case finer estimation is needed later):
//
// z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
// z1 < ( 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
// z2 < ( 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
// z3 < ( 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
// z4 < ( 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
// z5 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 82*2^(51 + 2*b)
// z6 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 87*2^(51 + 2*b)
// z7 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 )*2^(51 + 2b) = 46*2^(51 + 2*b)
// z8 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 )*2^(51 + 2b) = 33*2^(51 + 2*b)
// z9 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 )*2^(51 + 2b) = 10*2^(51 + 2*b)
//
// So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64,
// i.e., if b < 2.5.
FieldElement2625::reduce([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
}
}
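// The inline reduction above relies on 2^255 = 19 (mod p). A toy analogue
// using the 8-bit pseudo-Mersenne modulus 2^7 - 3 (an assumed parameter for
// illustration, not a constant from this crate):
#[cfg(test)]
fn toy_pseudo_mersenne_fold(x: u32) -> u32 {
// a bit at position 7+i is worth 3 * 2^i modulo 2^7 - 3
(x & 0x7f) + 3 * (x >> 7) // congruent to x mod 2^7 - 3; may need re-folding
}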
impl<'a> Neg for &'a FieldElement2625 {
type Output = FieldElement2625;
fn neg(self) -> FieldElement2625 {
let mut output = *self;
output.negate();
output
}
}
impl ConditionallySelectable for FieldElement2625 {
fn conditional_select(
a: &FieldElement2625,
b: &FieldElement2625,
choice: Choice,
) -> FieldElement2625 {
FieldElement2625([
u32::conditional_select(&a.0[0], &b.0[0], choice),
u32::conditional_select(&a.0[1], &b.0[1], choice),
u32::conditional_select(&a.0[2], &b.0[2], choice),
u32::conditional_select(&a.0[3], &b.0[3], choice),
u32::conditional_select(&a.0[4], &b.0[4], choice),
u32::conditional_select(&a.0[5], &b.0[5], choice),
u32::conditional_select(&a.0[6], &b.0[6], choice),
u32::conditional_select(&a.0[7], &b.0[7], choice),
u32::conditional_select(&a.0[8], &b.0[8], choice),
u32::conditional_select(&a.0[9], &b.0[9], choice),
])
}
fn conditional_assign(&mut self, other: &FieldElement2625, choice: Choice) {
self.0[0].conditional_assign(&other.0[0], choice);
self.0[1].conditional_assign(&other.0[1], choice);
self.0[2].conditional_assign(&other.0[2], choice);
self.0[3].conditional_assign(&other.0[3], choice);
self.0[4].conditional_assign(&other.0[4], choice);
self.0[5].conditional_assign(&other.0[5], choice);
self.0[6].conditional_assign(&other.0[6], choice);
self.0[7].conditional_assign(&other.0[7], choice);
self.0[8].conditional_assign(&other.0[8], choice);
self.0[9].conditional_assign(&other.0[9], choice);
}
fn conditional_swap(a: &mut FieldElement2625, b: &mut FieldElement2625, choice: Choice) {
u32::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
u32::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
u32::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
u32::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
u32::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
u32::conditional_swap(&mut a.0[5], &mut b.0[5], choice);
u32::conditional_swap(&mut a.0[6], &mut b.0[6], choice);
u32::conditional_swap(&mut a.0[7], &mut b.0[7], choice);
u32::conditional_swap(&mut a.0[8], &mut b.0[8], choice);
u32::conditional_swap(&mut a.0[9], &mut b.0[9], choice);
}
}
impl FieldElement2625 {
/// Invert the sign of this field element
pub fn negate(&mut self) {
// Compute -b as ((2^4 * p) - b) to avoid underflow.
let neg = FieldElement2625::reduce([
((0x3ffffed << 4) - self.0[0]) as u64,
((0x1ffffff << 4) - self.0[1]) as u64,
((0x3ffffff << 4) - self.0[2]) as u64,
((0x1ffffff << 4) - self.0[3]) as u64,
((0x3ffffff << 4) - self.0[4]) as u64,
((0x1ffffff << 4) - self.0[5]) as u64,
((0x3ffffff << 4) - self.0[6]) as u64,
((0x1ffffff << 4) - self.0[7]) as u64,
((0x3ffffff << 4) - self.0[8]) as u64,
((0x1ffffff << 4) - self.0[9]) as u64,
]);
self.0 = neg.0;
}
/// Construct zero.
pub fn zero() -> FieldElement2625 {
FieldElement2625([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ])
}
/// Construct one.
pub fn one() -> FieldElement2625 {
FieldElement2625([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 ])
}
/// Construct -1.
pub fn minus_one() -> FieldElement2625 {
FieldElement2625([
0x3ffffec, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff,
0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff,
])
}
/// Given `k > 0`, return `self^(2^k)`.
pub fn pow2k(&self, k: u32) -> FieldElement2625 {
debug_assert!( k > 0 );
let mut z = self.square();
for _ in 1..k {
z = z.square();
}
z
}
/// Given unreduced coefficients `z[0], ..., z[9]` of any size,
/// carry and reduce them mod p to obtain a `FieldElement2625`
/// whose coefficients have excess `b < 0.007`.
///
/// In other words, each coefficient of the result is bounded by
/// either `2^(25 + 0.007)` or `2^(26 + 0.007)`, as appropriate.
fn reduce(mut z: [u64; 10]) -> FieldElement2625 {
const LOW_25_BITS: u64 = (1 << 25) - 1;
const LOW_26_BITS: u64 = (1 << 26) - 1;
/// Carry the value from limb i = 0..8 to limb i+1
#[inline(always)]
fn carry(z: &mut [u64; 10], i: usize) {
debug_assert!(i < 9);
if i % 2 == 0 {
// Even limbs have 26 bits
z[i+1] += z[i] >> 26;
z[i] &= LOW_26_BITS;
} else {
// Odd limbs have 25 bits
z[i+1] += z[i] >> 25;
z[i] &= LOW_25_BITS;
}
}
// Perform two halves of the carry chain in parallel.
carry(&mut z, 0); carry(&mut z, 4);
carry(&mut z, 1); carry(&mut z, 5);
carry(&mut z, 2); carry(&mut z, 6);
carry(&mut z, 3); carry(&mut z, 7);
// Since z[3] < 2^64, c < 2^(64-25) = 2^39,
// so z[4] < 2^26 + 2^39 < 2^39.0002
carry(&mut z, 4); carry(&mut z, 8);
// Now z[4] < 2^26
// and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)
// Last carry has a multiplication by 19:
z[0] += 19*(z[9] >> 25);
z[9] &= LOW_25_BITS;
// Since z[9] < 2^64, c < 2^(64-25) = 2^39,
// so z[0] + 19*c < 2^26 + 2^43.248 < 2^43.249.
carry(&mut z, 0);
// Now z[1] < 2^25 + 2^(43.249 - 26)
// < 2^25.007 (good enough)
// and we're done.
FieldElement2625([
z[0] as u32, z[1] as u32, z[2] as u32, z[3] as u32, z[4] as u32,
z[5] as u32, z[6] as u32, z[7] as u32, z[8] as u32, z[9] as u32,
])
}
/// Load a `FieldElement2625` from the low 255 bits of a 256-bit
/// input.
///
/// # Warning
///
/// This function does not check that the input used the canonical
/// representative. It masks the high bit, but it will happily
/// decode 2^255 - 18 to 1. Applications that require a canonical
/// encoding of every field element should decode, re-encode to
/// the canonical encoding, and check that the input was
/// canonical.
pub fn from_bytes(data: &[u8; 32]) -> FieldElement2625 { //FeFromBytes
#[inline]
fn load3(b: &[u8]) -> u64 {
(b[0] as u64) | ((b[1] as u64) << 8) | ((b[2] as u64) << 16)
}
#[inline]
fn load4(b: &[u8]) -> u64 {
(b[0] as u64) | ((b[1] as u64) << 8) | ((b[2] as u64) << 16) | ((b[3] as u64) << 24)
}
let mut h = [0u64;10];
const LOW_23_BITS: u64 = (1 << 23) - 1;
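// Limb i has radix 2^ceil(i*51/2), while a load at byte offset j
// contributes value * 2^(8*j), so each load is shifted left by
// 8*j - ceil(i*51/2) (e.g. 32 - 26 = 6 for h[1]). Bits that overlap the
// previous limb remain in that limb's excess and are folded in by reduce().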
h[0] = load4(&data[ 0..]);
h[1] = load3(&data[ 4..]) << 6;
h[2] = load3(&data[ 7..]) << 5;
h[3] = load3(&data[10..]) << 3;
h[4] = load3(&data[13..]) << 2;
h[5] = load4(&data[16..]);
h[6] = load3(&data[20..]) << 7;
h[7] = load3(&data[23..]) << 5;
h[8] = load3(&data[26..]) << 4;
h[9] = (load3(&data[29..]) & LOW_23_BITS) << 2;
FieldElement2625::reduce(h)
}
/// Serialize this `FieldElement2625` to a 32-byte array. The
/// encoding is canonical.
pub fn to_bytes(&self) -> [u8; 32] {
let inp = &self.0;
// Reduce the value represented by `inp` to the range [0,2*p)
let mut h: [u32; 10] = FieldElement2625::reduce([
// XXX this cast is annoying
inp[0] as u64, inp[1] as u64, inp[2] as u64, inp[3] as u64, inp[4] as u64,
inp[5] as u64, inp[6] as u64, inp[7] as u64, inp[8] as u64, inp[9] as u64,
]).0;
// Let h be the value to encode.
//
// Write h = pq + r with 0 <= r < p. We want to compute r = h mod p.
//
// Since h < 2*p, q = 0 or 1, with q = 0 when h < p and q = 1 when h >= p.
//
// Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255.
// Therefore q can be computed as the carry bit of h + 19.
let mut q: u32 = (h[0] + 19) >> 26;
q = (h[1] + q) >> 25;
q = (h[2] + q) >> 26;
q = (h[3] + q) >> 25;
q = (h[4] + q) >> 26;
q = (h[5] + q) >> 25;
q = (h[6] + q) >> 26;
q = (h[7] + q) >> 25;
q = (h[8] + q) >> 26;
q = (h[9] + q) >> 25;
debug_assert!( q == 0 || q == 1 );
// Now we can compute r as r = h - pq = h - (2^255-19)q = h + 19q - 2^255q
const LOW_25_BITS: u32 = (1 << 25) - 1;
const LOW_26_BITS: u32 = (1 << 26) - 1;
h[0] += 19*q;
// Now carry the result to compute r + 19q...
h[1] += h[0] >> 26;
h[0] = h[0] & LOW_26_BITS;
h[2] += h[1] >> 25;
h[1] = h[1] & LOW_25_BITS;
h[3] += h[2] >> 26;
h[2] = h[2] & LOW_26_BITS;
h[4] += h[3] >> 25;
h[3] = h[3] & LOW_25_BITS;
h[5] += h[4] >> 26;
h[4] = h[4] & LOW_26_BITS;
h[6] += h[5] >> 25;
h[5] = h[5] & LOW_25_BITS;
h[7] += h[6] >> 26;
h[6] = h[6] & LOW_26_BITS;
h[8] += h[7] >> 25;
h[7] = h[7] & LOW_25_BITS;
h[9] += h[8] >> 26;
h[8] = h[8] & LOW_26_BITS;
// ... but instead of carrying the value
// (h[9] >> 25) = q*2^255 into another limb,
// discard it, subtracting the value from h.
debug_assert!( (h[9] >> 25) == 0 || (h[9] >> 25) == 1);
h[9] = h[9] & LOW_25_BITS;
let mut s = [0u8; 32];
s[0] = (h[0] >> 0) as u8;
s[1] = (h[0] >> 8) as u8;
s[2] = (h[0] >> 16) as u8;
s[3] = ((h[0] >> 24) | (h[1] << 2)) as u8;
s[4] = (h[1] >> 6) as u8;
s[5] = (h[1] >> 14) as u8;
s[6] = ((h[1] >> 22) | (h[2] << 3)) as u8;
s[7] = (h[2] >> 5) as u8;
s[8] = (h[2] >> 13) as u8;
s[9] = ((h[2] >> 21) | (h[3] << 5)) as u8;
s[10] = (h[3] >> 3) as u8;
s[11] = (h[3] >> 11) as u8;
s[12] = ((h[3] >> 19) | (h[4] << 6)) as u8;
s[13] = (h[4] >> 2) as u8;
s[14] = (h[4] >> 10) as u8;
s[15] = (h[4] >> 18) as u8;
s[16] = (h[5] >> 0) as u8;
s[17] = (h[5] >> 8) as u8;
s[18] = (h[5] >> 16) as u8;
s[19] = ((h[5] >> 24) | (h[6] << 1)) as u8;
s[20] = (h[6] >> 7) as u8;
s[21] = (h[6] >> 15) as u8;
s[22] = ((h[6] >> 23) | (h[7] << 3)) as u8;
s[23] = (h[7] >> 5) as u8;
s[24] = (h[7] >> 13) as u8;
s[25] = ((h[7] >> 21) | (h[8] << 4)) as u8;
s[26] = (h[8] >> 4) as u8;
s[27] = (h[8] >> 12) as u8;
s[28] = ((h[8] >> 20) | (h[9] << 6)) as u8;
s[29] = (h[9] >> 2) as u8;
s[30] = (h[9] >> 10) as u8;
s[31] = (h[9] >> 18) as u8;
// Check that high bit is cleared
debug_assert!((s[31] & 0b1000_0000u8) == 0u8);
s
}
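// A toy version of the carry-bit trick above, with the 8-bit modulus
// p' = 2^7 - 3 = 125 standing in for 2^255 - 19 (assumed parameters, for
// illustration only): for h < 2*p', h >= p' exactly when h + 3 >= 2^7,
// so the quotient q is bit 7 of h + 3.
#[cfg(test)]
fn toy_canonical_reduce(h: u16) -> u16 {
debug_assert!(h < 2 * 125);
let q = (h + 3) >> 7; // 0 or 1: the "carry bit"
debug_assert!(q == 0 || q == 1);
h + 3 * q - 128 * q // r = h - q*p' = h + 3*q - 2^7*q
}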
fn square_inner(&self) -> [u64; 10] {
// Optimized version of multiplication for the case of squaring.
// Pre- and post- conditions identical to multiplication function.
let x = &self.0;
let x0_2 = 2 * x[0];
let x1_2 = 2 * x[1];
let x2_2 = 2 * x[2];
let x3_2 = 2 * x[3];
let x4_2 = 2 * x[4];
let x5_2 = 2 * x[5];
let x6_2 = 2 * x[6];
let x7_2 = 2 * x[7];
let x5_19 = 19 * x[5];
let x6_19 = 19 * x[6];
let x7_19 = 19 * x[7];
let x8_19 = 19 * x[8];
let x9_19 = 19 * x[9];
/// Helper function to multiply two 32-bit integers with 64 bits
/// of output.
#[inline(always)]
fn m(x: u32, y: u32) -> u64 { (x as u64) * (y as u64) }
// This block is rearranged so that instead of doing a 32-bit multiplication by 38, we do a
// 64-bit multiplication by 2 on the results. This is because lg(38) is too big: we would
// have less than 1 bit of headroom left, which is too little.
let mut z = [0u64;10];
z[0] = m(x[0],x[0]) + m(x2_2,x8_19) + m(x4_2,x6_19) + (m(x1_2,x9_19) + m(x3_2,x7_19) + m(x[5],x5_19))*2;
z[1] = m(x0_2,x[1]) + m(x3_2,x8_19) + m(x5_2,x6_19) + (m(x[2],x9_19) + m(x[4],x7_19))*2;
z[2] = m(x0_2,x[2]) + m(x1_2,x[1]) + m(x4_2,x8_19) + m(x[6],x6_19) + (m(x3_2,x9_19) + m(x5_2,x7_19))*2;
z[3] = m(x0_2,x[3]) + m(x1_2,x[2]) + m(x5_2,x8_19) + (m(x[4],x9_19) + m(x[6],x7_19))*2;
z[4] = m(x0_2,x[4]) + m(x1_2,x3_2) + m(x[2],x[2]) + m(x6_2,x8_19) + (m(x5_2,x9_19) + m(x[7],x7_19))*2;
z[5] = m(x0_2,x[5]) + m(x1_2,x[4]) + m(x2_2,x[3]) + m(x7_2,x8_19) + m(x[6],x9_19)*2;
z[6] = m(x0_2,x[6]) + m(x1_2,x5_2) + m(x2_2,x[4]) + m(x3_2,x[3]) + m(x[8],x8_19) + m(x7_2,x9_19)*2;
z[7] = m(x0_2,x[7]) + m(x1_2,x[6]) + m(x2_2,x[5]) + m(x3_2,x[4]) + m(x[8],x9_19)*2;
z[8] = m(x0_2,x[8]) + m(x1_2,x7_2) + m(x2_2,x[6]) + m(x3_2,x5_2) + m(x[4],x[4]) + m(x[9],x9_19)*2;
z[9] = m(x0_2,x[9]) + m(x1_2,x[8]) + m(x2_2,x[7]) + m(x3_2,x[6]) + m(x4_2,x[5]) ;
z
}
/// Compute `self^2`.
pub fn square(&self) -> FieldElement2625 {
FieldElement2625::reduce(self.square_inner())
}
/// Compute `2*self^2`.
pub fn square2(&self) -> FieldElement2625 {
let mut coeffs = self.square_inner();
for i in 0..self.0.len() {
coeffs[i] += coeffs[i];
}
FieldElement2625::reduce(coeffs)
}
}


@ -1,21 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! The `u32` backend uses `u32`s and a `(u32, u32) -> u64` multiplier.
//!
//! This code is intended to be portable, but it requires that
//! multiplication of two \\(32\\)-bit values to a \\(64\\)-bit result
//! is constant-time on the target platform.
pub mod field;
pub mod scalar;
pub mod constants;


@ -1,529 +0,0 @@
//! Arithmetic mod 2^252 + 27742317777372353535851937790883648493
//! with 9 29-bit unsigned limbs
//!
//! To see that this is safe for intermediate results, note that
//! the largest limb in a 9 by 9 product of 29-bit limbs will be
//! (0x1fffffff^2) * 9 = 0x23fffffdc0000009 (62 bits).
//!
//! For a one level Karatsuba decomposition, the specific ranges
//! depend on how the limbs are combined, but will stay within
//! -0x1ffffffe00000008 (62 bits with sign bit) to
//! 0x43fffffbc0000011 (63 bits), which is still safe.
use core::fmt::Debug;
use core::ops::{Index, IndexMut};
use zeroize::Zeroize;
use constants;
/// The `Scalar29` struct represents an element in \\( \mathbb Z / \ell \mathbb Z \\) as 9 29-bit limbs
#[derive(Copy,Clone)]
pub struct Scalar29(pub [u32; 9]);
impl Debug for Scalar29 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "Scalar29: {:?}", &self.0[..])
}
}
impl Zeroize for Scalar29 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl Index<usize> for Scalar29 {
type Output = u32;
fn index(&self, _index: usize) -> &u32 {
&(self.0[_index])
}
}
impl IndexMut<usize> for Scalar29 {
fn index_mut(&mut self, _index: usize) -> &mut u32 {
&mut (self.0[_index])
}
}
/// u32 * u32 = u64 multiply helper
#[inline(always)]
fn m(x: u32, y: u32) -> u64 {
(x as u64) * (y as u64)
}
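// Sanity sketch for the bound quoted in the module docs: nine products of
// maximal 29-bit limbs still fit in a u64 with two bits to spare.
#[cfg(test)]
fn max_nine_limb_product() -> u64 {
let max_limb = (1u64 << 29) - 1; // 0x1fffffff
9 * max_limb * max_limb // = 0x23fffffdc0000009 (62 bits)
}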
impl Scalar29 {
/// Return the zero scalar.
pub fn zero() -> Scalar29 {
Scalar29([0,0,0,0,0,0,0,0,0])
}
/// Unpack a 32 byte / 256 bit scalar into 9 29-bit limbs.
pub fn from_bytes(bytes: &[u8; 32]) -> Scalar29 {
let mut words = [0u32; 8];
for i in 0..8 {
for j in 0..4 {
words[i] |= (bytes[(i * 4) + j] as u32) << (j * 8);
}
}
let mask = (1u32 << 29) - 1;
let top_mask = (1u32 << 24) - 1;
let mut s = Scalar29::zero();
s[ 0] = words[0] & mask;
s[ 1] = ((words[0] >> 29) | (words[1] << 3)) & mask;
s[ 2] = ((words[1] >> 26) | (words[2] << 6)) & mask;
s[ 3] = ((words[2] >> 23) | (words[3] << 9)) & mask;
s[ 4] = ((words[3] >> 20) | (words[4] << 12)) & mask;
s[ 5] = ((words[4] >> 17) | (words[5] << 15)) & mask;
s[ 6] = ((words[5] >> 14) | (words[6] << 18)) & mask;
s[ 7] = ((words[6] >> 11) | (words[7] << 21)) & mask;
s[ 8] = (words[7] >> 8) & top_mask;
s
}
/// Reduce a 64 byte / 512 bit scalar mod l.
pub fn from_bytes_wide(bytes: &[u8; 64]) -> Scalar29 {
let mut words = [0u32; 16];
for i in 0..16 {
for j in 0..4 {
words[i] |= (bytes[(i * 4) + j] as u32) << (j * 8);
}
}
let mask = (1u32 << 29) - 1;
let mut lo = Scalar29::zero();
let mut hi = Scalar29::zero();
lo[0] = words[ 0] & mask;
lo[1] = ((words[ 0] >> 29) | (words[ 1] << 3)) & mask;
lo[2] = ((words[ 1] >> 26) | (words[ 2] << 6)) & mask;
lo[3] = ((words[ 2] >> 23) | (words[ 3] << 9)) & mask;
lo[4] = ((words[ 3] >> 20) | (words[ 4] << 12)) & mask;
lo[5] = ((words[ 4] >> 17) | (words[ 5] << 15)) & mask;
lo[6] = ((words[ 5] >> 14) | (words[ 6] << 18)) & mask;
lo[7] = ((words[ 6] >> 11) | (words[ 7] << 21)) & mask;
lo[8] = ((words[ 7] >> 8) | (words[ 8] << 24)) & mask;
hi[0] = ((words[ 8] >> 5) | (words[ 9] << 27)) & mask;
hi[1] = (words[ 9] >> 2) & mask;
hi[2] = ((words[ 9] >> 31) | (words[10] << 1)) & mask;
hi[3] = ((words[10] >> 28) | (words[11] << 4)) & mask;
hi[4] = ((words[11] >> 25) | (words[12] << 7)) & mask;
hi[5] = ((words[12] >> 22) | (words[13] << 10)) & mask;
hi[6] = ((words[13] >> 19) | (words[14] << 13)) & mask;
hi[7] = ((words[14] >> 16) | (words[15] << 16)) & mask;
hi[8] = words[15] >> 13 ;
lo = Scalar29::montgomery_mul(&lo, &constants::R); // (lo * R) / R = lo
hi = Scalar29::montgomery_mul(&hi, &constants::RR); // (hi * R^2) / R = hi * R
Scalar29::add(&hi, &lo) // (hi * R) + lo
}
/// Pack the limbs of this `Scalar29` into 32 bytes.
pub fn to_bytes(&self) -> [u8; 32] {
let mut s = [0u8; 32];
s[0] = (self.0[ 0] >> 0) as u8;
s[1] = (self.0[ 0] >> 8) as u8;
s[2] = (self.0[ 0] >> 16) as u8;
s[3] = ((self.0[ 0] >> 24) | (self.0[ 1] << 5)) as u8;
s[4] = (self.0[ 1] >> 3) as u8;
s[5] = (self.0[ 1] >> 11) as u8;
s[6] = (self.0[ 1] >> 19) as u8;
s[7] = ((self.0[ 1] >> 27) | (self.0[ 2] << 2)) as u8;
s[8] = (self.0[ 2] >> 6) as u8;
s[9] = (self.0[ 2] >> 14) as u8;
s[10] = ((self.0[ 2] >> 22) | (self.0[ 3] << 7)) as u8;
s[11] = (self.0[ 3] >> 1) as u8;
s[12] = (self.0[ 3] >> 9) as u8;
s[13] = (self.0[ 3] >> 17) as u8;
s[14] = ((self.0[ 3] >> 25) | (self.0[ 4] << 4)) as u8;
s[15] = (self.0[ 4] >> 4) as u8;
s[16] = (self.0[ 4] >> 12) as u8;
s[17] = (self.0[ 4] >> 20) as u8;
s[18] = ((self.0[ 4] >> 28) | (self.0[ 5] << 1)) as u8;
s[19] = (self.0[ 5] >> 7) as u8;
s[20] = (self.0[ 5] >> 15) as u8;
s[21] = ((self.0[ 5] >> 23) | (self.0[ 6] << 6)) as u8;
s[22] = (self.0[ 6] >> 2) as u8;
s[23] = (self.0[ 6] >> 10) as u8;
s[24] = (self.0[ 6] >> 18) as u8;
s[25] = ((self.0[ 6] >> 26) | (self.0[ 7] << 3)) as u8;
s[26] = (self.0[ 7] >> 5) as u8;
s[27] = (self.0[ 7] >> 13) as u8;
s[28] = (self.0[ 7] >> 21) as u8;
s[29] = (self.0[ 8] >> 0) as u8;
s[30] = (self.0[ 8] >> 8) as u8;
s[31] = (self.0[ 8] >> 16) as u8;
s
}
/// Compute `a + b` (mod l).
pub fn add(a: &Scalar29, b: &Scalar29) -> Scalar29 {
let mut sum = Scalar29::zero();
let mask = (1u32 << 29) - 1;
// a + b
let mut carry: u32 = 0;
for i in 0..9 {
carry = a[i] + b[i] + (carry >> 29);
sum[i] = carry & mask;
}
// subtract l if the sum is >= l
Scalar29::sub(&sum, &constants::L)
}
/// Compute `a - b` (mod l).
pub fn sub(a: &Scalar29, b: &Scalar29) -> Scalar29 {
let mut difference = Scalar29::zero();
let mask = (1u32 << 29) - 1;
// a - b
let mut borrow: u32 = 0;
for i in 0..9 {
borrow = a[i].wrapping_sub(b[i] + (borrow >> 31));
difference[i] = borrow & mask;
}
// conditionally add l if the difference is negative
let underflow_mask = ((borrow >> 31) ^ 1).wrapping_sub(1);
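// (the top bit of `borrow` is 1 exactly when the subtraction underflowed,
// so underflow_mask is all ones on underflow and all zeros otherwise)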
let mut carry: u32 = 0;
for i in 0..9 {
carry = (carry >> 29) + difference[i] + (constants::L[i] & underflow_mask);
difference[i] = carry & mask;
}
difference
}
/// Compute `a * b`.
///
/// This is implemented with a one-level refined Karatsuba decomposition
#[inline(always)]
pub (crate) fn mul_internal(a: &Scalar29, b: &Scalar29) -> [u64; 17] {
let mut z = [0u64; 17];
z[0] = m(a[0],b[0]); // c00
z[1] = m(a[0],b[1]) + m(a[1],b[0]); // c01
z[2] = m(a[0],b[2]) + m(a[1],b[1]) + m(a[2],b[0]); // c02
z[3] = m(a[0],b[3]) + m(a[1],b[2]) + m(a[2],b[1]) + m(a[3],b[0]); // c03
z[4] = m(a[0],b[4]) + m(a[1],b[3]) + m(a[2],b[2]) + m(a[3],b[1]) + m(a[4],b[0]); // c04
z[5] = m(a[1],b[4]) + m(a[2],b[3]) + m(a[3],b[2]) + m(a[4],b[1]); // c05
z[6] = m(a[2],b[4]) + m(a[3],b[3]) + m(a[4],b[2]); // c06
z[7] = m(a[3],b[4]) + m(a[4],b[3]); // c07
z[8] = (m(a[4],b[4])).wrapping_sub(z[3]); // c08 - c03
z[10] = z[5].wrapping_sub(m(a[5],b[5])); // c05mc10
z[11] = z[6].wrapping_sub(m(a[5],b[6]) + m(a[6],b[5])); // c06mc11
z[12] = z[7].wrapping_sub(m(a[5],b[7]) + m(a[6],b[6]) + m(a[7],b[5])); // c07mc12
z[13] = m(a[5],b[8]) + m(a[6],b[7]) + m(a[7],b[6]) + m(a[8],b[5]); // c13
z[14] = m(a[6],b[8]) + m(a[7],b[7]) + m(a[8],b[6]); // c14
z[15] = m(a[7],b[8]) + m(a[8],b[7]); // c15
z[16] = m(a[8],b[8]); // c16
z[ 5] = z[10].wrapping_sub(z[ 0]); // c05mc10 - c00
z[ 6] = z[11].wrapping_sub(z[ 1]); // c06mc11 - c01
z[ 7] = z[12].wrapping_sub(z[ 2]); // c07mc12 - c02
z[ 8] = z[ 8].wrapping_sub(z[13]); // c08mc13 - c03
z[ 9] = z[14].wrapping_add(z[ 4]); // c14 + c04
z[10] = z[15].wrapping_add(z[10]); // c15 + c05mc10
z[11] = z[16].wrapping_add(z[11]); // c16 + c06mc11
let aa = [
a[0]+a[5],
a[1]+a[6],
a[2]+a[7],
a[3]+a[8]
];
let bb = [
b[0]+b[5],
b[1]+b[6],
b[2]+b[7],
b[3]+b[8]
];
z[ 5] = (m(aa[0],bb[0])) .wrapping_add(z[ 5]); // c20 + c05mc10 - c00
z[ 6] = (m(aa[0],bb[1]) + m(aa[1],bb[0])) .wrapping_add(z[ 6]); // c21 + c06mc11 - c01
z[ 7] = (m(aa[0],bb[2]) + m(aa[1],bb[1]) + m(aa[2],bb[0])) .wrapping_add(z[ 7]); // c22 + c07mc12 - c02
z[ 8] = (m(aa[0],bb[3]) + m(aa[1],bb[2]) + m(aa[2],bb[1]) + m(aa[3],bb[0])) .wrapping_add(z[ 8]); // c23 + c08mc13 - c03
z[ 9] = (m(aa[0], b[4]) + m(aa[1],bb[3]) + m(aa[2],bb[2]) + m(aa[3],bb[1]) + m(a[4],bb[0])).wrapping_sub(z[ 9]); // c24 - c14 - c04
z[10] = ( m(aa[1], b[4]) + m(aa[2],bb[3]) + m(aa[3],bb[2]) + m(a[4],bb[1])).wrapping_sub(z[10]); // c25 - c15 - c05mc10
z[11] = ( m(aa[2], b[4]) + m(aa[3],bb[3]) + m(a[4],bb[2])).wrapping_sub(z[11]); // c26 - c16 - c06mc11
z[12] = ( m(aa[3], b[4]) + m(a[4],bb[3])).wrapping_sub(z[12]); // c27 - c07mc12
z
}
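// The underlying identity, sketched at one level on a two-limb split
// a = a0 + a1*K, b = b0 + b1*K (hypothetical helper, illustration only):
// a*b = a0*b0 + ((a0+a1)*(b0+b1) - a0*b0 - a1*b1)*K + a1*b1*K^2,
// trading one multiplication for a handful of additions. The code above
// applies the same idea to the 5-limb/4-limb halves of a Scalar29.
#[cfg(test)]
fn karatsuba_two_limbs(a: [u64; 2], b: [u64; 2]) -> [u64; 3] {
let z0 = a[0] * b[0];
let z2 = a[1] * b[1];
let z1 = (a[0] + a[1]) * (b[0] + b[1]) - z0 - z2; // = a0*b1 + a1*b0
[z0, z1, z2] // coefficients of 1, K, K^2
}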
/// Compute `a^2`.
#[inline(always)]
fn square_internal(a: &Scalar29) -> [u64; 17] {
let aa = [
a[0]*2,
a[1]*2,
a[2]*2,
a[3]*2,
a[4]*2,
a[5]*2,
a[6]*2,
a[7]*2
];
[
m( a[0],a[0]),
m(aa[0],a[1]),
m(aa[0],a[2]) + m( a[1],a[1]),
m(aa[0],a[3]) + m(aa[1],a[2]),
m(aa[0],a[4]) + m(aa[1],a[3]) + m( a[2],a[2]),
m(aa[0],a[5]) + m(aa[1],a[4]) + m(aa[2],a[3]),
m(aa[0],a[6]) + m(aa[1],a[5]) + m(aa[2],a[4]) + m( a[3],a[3]),
m(aa[0],a[7]) + m(aa[1],a[6]) + m(aa[2],a[5]) + m(aa[3],a[4]),
m(aa[0],a[8]) + m(aa[1],a[7]) + m(aa[2],a[6]) + m(aa[3],a[5]) + m( a[4],a[4]),
m(aa[1],a[8]) + m(aa[2],a[7]) + m(aa[3],a[6]) + m(aa[4],a[5]),
m(aa[2],a[8]) + m(aa[3],a[7]) + m(aa[4],a[6]) + m( a[5],a[5]),
m(aa[3],a[8]) + m(aa[4],a[7]) + m(aa[5],a[6]),
m(aa[4],a[8]) + m(aa[5],a[7]) + m( a[6],a[6]),
m(aa[5],a[8]) + m(aa[6],a[7]),
m(aa[6],a[8]) + m( a[7],a[7]),
m(aa[7],a[8]),
m( a[8],a[8]),
]
}
/// Compute `limbs/R` (mod l), where R is the Montgomery modulus 2^261
#[inline(always)]
pub (crate) fn montgomery_reduce(limbs: &[u64; 17]) -> Scalar29 {
#[inline(always)]
fn part1(sum: u64) -> (u64, u32) {
let p = (sum as u32).wrapping_mul(constants::LFACTOR) & ((1u32 << 29) - 1);
((sum + m(p,constants::L[0])) >> 29, p)
}
#[inline(always)]
fn part2(sum: u64) -> (u64, u32) {
let w = (sum as u32) & ((1u32 << 29) - 1);
(sum >> 29, w)
}
// note: l5,l6,l7 are zero, so their multiplies can be skipped
let l = &constants::L;
// the first half computes the Montgomery adjustment factor n, and begins adding n*l to make limbs divisible by R
let (carry, n0) = part1( limbs[ 0]);
let (carry, n1) = part1(carry + limbs[ 1] + m(n0,l[1]));
let (carry, n2) = part1(carry + limbs[ 2] + m(n0,l[2]) + m(n1,l[1]));
let (carry, n3) = part1(carry + limbs[ 3] + m(n0,l[3]) + m(n1,l[2]) + m(n2,l[1]));
let (carry, n4) = part1(carry + limbs[ 4] + m(n0,l[4]) + m(n1,l[3]) + m(n2,l[2]) + m(n3,l[1]));
let (carry, n5) = part1(carry + limbs[ 5] + m(n1,l[4]) + m(n2,l[3]) + m(n3,l[2]) + m(n4,l[1]));
let (carry, n6) = part1(carry + limbs[ 6] + m(n2,l[4]) + m(n3,l[3]) + m(n4,l[2]) + m(n5,l[1]));
let (carry, n7) = part1(carry + limbs[ 7] + m(n3,l[4]) + m(n4,l[3]) + m(n5,l[2]) + m(n6,l[1]));
let (carry, n8) = part1(carry + limbs[ 8] + m(n0,l[8]) + m(n4,l[4]) + m(n5,l[3]) + m(n6,l[2]) + m(n7,l[1]));
// limbs is divisible by R now, so we can divide by R by simply storing the upper half as the result
let (carry, r0) = part2(carry + limbs[ 9] + m(n1,l[8]) + m(n5,l[4]) + m(n6,l[3]) + m(n7,l[2]) + m(n8,l[1]));
let (carry, r1) = part2(carry + limbs[10] + m(n2,l[8]) + m(n6,l[4]) + m(n7,l[3]) + m(n8,l[2]));
let (carry, r2) = part2(carry + limbs[11] + m(n3,l[8]) + m(n7,l[4]) + m(n8,l[3]));
let (carry, r3) = part2(carry + limbs[12] + m(n4,l[8]) + m(n8,l[4]));
let (carry, r4) = part2(carry + limbs[13] + m(n5,l[8]) );
let (carry, r5) = part2(carry + limbs[14] + m(n6,l[8]) );
let (carry, r6) = part2(carry + limbs[15] + m(n7,l[8]) );
let (carry, r7) = part2(carry + limbs[16] + m(n8,l[8]));
let r8 = carry as u32;
// result may be >= l, so attempt to subtract l
Scalar29::sub(&Scalar29([r0,r1,r2,r3,r4,r5,r6,r7,r8]), l)
}
/// Compute `a * b` (mod l).
#[inline(never)]
pub fn mul(a: &Scalar29, b: &Scalar29) -> Scalar29 {
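// Why two reductions: montgomery_reduce maps t to t/R (mod l), so the first
// call yields a*b/R; multiplying by RR = R^2 (mod l) and reducing once more
// gives (a*b/R)*R^2/R = a*b (mod l), cancelling the stray factor of 1/R.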
let ab = Scalar29::montgomery_reduce(&Scalar29::mul_internal(a, b));
Scalar29::montgomery_reduce(&Scalar29::mul_internal(&ab, &constants::RR))
}
/// Compute `a^2` (mod l).
#[inline(never)]
#[allow(dead_code)] // XXX we don't expose square() via the Scalar API
pub fn square(&self) -> Scalar29 {
let aa = Scalar29::montgomery_reduce(&Scalar29::square_internal(self));
Scalar29::montgomery_reduce(&Scalar29::mul_internal(&aa, &constants::RR))
}
/// Compute `(a * b) / R` (mod l), where R is the Montgomery modulus 2^261
#[inline(never)]
pub fn montgomery_mul(a: &Scalar29, b: &Scalar29) -> Scalar29 {
Scalar29::montgomery_reduce(&Scalar29::mul_internal(a, b))
}
/// Compute `(a^2) / R` (mod l) in Montgomery form, where R is the Montgomery modulus 2^261
#[inline(never)]
pub fn montgomery_square(&self) -> Scalar29 {
Scalar29::montgomery_reduce(&Scalar29::square_internal(self))
}
/// Puts a Scalar29 in to Montgomery form, i.e. computes `a*R (mod l)`
#[inline(never)]
pub fn to_montgomery(&self) -> Scalar29 {
Scalar29::montgomery_mul(self, &constants::RR)
}
/// Takes a Scalar29 out of Montgomery form, i.e. computes `a/R (mod l)`
pub fn from_montgomery(&self) -> Scalar29 {
let mut limbs = [0u64; 17];
for i in 0..9 {
limbs[i] = self[i] as u64;
}
Scalar29::montgomery_reduce(&limbs)
}
}
#[cfg(test)]
mod test {
use super::*;
/// Note: x is 2^253-1 which is slightly larger than the largest scalar produced by
/// this implementation (l-1), and should verify there are no overflows for valid scalars
///
/// x = 2^253-1 = 14474011154664524427946373126085988481658748083205070504932198000989141204991
/// x = 7237005577332262213973186563042994240801631723825162898930247062703686954002 mod l
/// x = 5147078182513738803124273553712992179887200054963030844803268920753008712037*R mod l in Montgomery form
pub static X: Scalar29 = Scalar29(
[0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff,
0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff,
0x001fffff]);
/// x^2 = 3078544782642840487852506753550082162405942681916160040940637093560259278169 mod l
pub static XX: Scalar29 = Scalar29(
[0x00217559, 0x000b3401, 0x103ff43b, 0x1462a62c,
0x1d6f9f38, 0x18e7a42f, 0x09a3dcee, 0x008dbe18,
0x0006ce65]);
/// x^2 = 2912514428060642753613814151688322857484807845836623976981729207238463947987*R mod l in Montgomery form
pub static XX_MONT: Scalar29 = Scalar29(
[0x152b4d2e, 0x0571d53b, 0x1da6d964, 0x188663b6,
0x1d1b5f92, 0x19d50e3f, 0x12306c29, 0x0c6f26fe,
0x00030edb]);
/// y = 6145104759870991071742105800796537629880401874866217824609283457819451087098
pub static Y: Scalar29 = Scalar29(
[0x1e1458fa, 0x165ba838, 0x1d787b36, 0x0e577f3a,
0x1d2baf06, 0x1d689a19, 0x1fff3047, 0x117704ab,
0x000d9601]);
/// x*y = 36752150652102274958925982391442301741
pub static XY: Scalar29 = Scalar29(
[0x0ba7632d, 0x017736bb, 0x15c76138, 0x0c69daa1,
0x000001ba, 0x00000000, 0x00000000, 0x00000000,
0x00000000]);
/// x*y = 3783114862749659543382438697751927473898937741870308063443170013240655651591*R mod l in Montgomery form
pub static XY_MONT: Scalar29 = Scalar29(
[0x077b51e1, 0x1c64e119, 0x02a19ef5, 0x18d2129e,
0x00de0430, 0x045a7bc8, 0x04cfc7c9, 0x1c002681,
0x000bdc1c]);
/// a = 2351415481556538453565687241199399922945659411799870114962672658845158063753
pub static A: Scalar29 = Scalar29(
[0x07b3be89, 0x02291b60, 0x14a99f03, 0x07dc3787,
0x0a782aae, 0x16262525, 0x0cfdb93f, 0x13f5718d,
0x000532da]);
/// b = 4885590095775723760407499321843594317911456947580037491039278279440296187236
pub static B: Scalar29 = Scalar29(
[0x15421564, 0x1e69fd72, 0x093d9692, 0x161785be,
0x1587d69f, 0x09d9dada, 0x130246c0, 0x0c0a8e72,
0x000acd25]);
/// a+b = 0
/// a-b = 4702830963113076907131374482398799845891318823599740229925345317690316127506
pub static AB: Scalar29 = Scalar29(
[0x0f677d12, 0x045236c0, 0x09533e06, 0x0fb86f0f,
0x14f0555c, 0x0c4c4a4a, 0x19fb727f, 0x07eae31a,
0x000a65b5]);
/// c = (2^512 - 1) % l = 1627715501170711445284395025044413883736156588369414752970002579683115011840
pub static C: Scalar29 = Scalar29(
[0x049c0f00, 0x00308f1a, 0x0164d1e9, 0x1c374ed1,
0x1be65d00, 0x19e90bfa, 0x08f73bb1, 0x036f8613,
0x00039941]);
#[test]
fn mul_max() {
let res = Scalar29::mul(&X, &X);
for i in 0..9 {
assert!(res[i] == XX[i]);
}
}
#[test]
fn square_max() {
let res = X.square();
for i in 0..9 {
assert!(res[i] == XX[i]);
}
}
#[test]
fn montgomery_mul_max() {
let res = Scalar29::montgomery_mul(&X, &X);
for i in 0..9 {
assert!(res[i] == XX_MONT[i]);
}
}
#[test]
fn montgomery_square_max() {
let res = X.montgomery_square();
for i in 0..9 {
assert!(res[i] == XX_MONT[i]);
}
}
#[test]
fn mul() {
let res = Scalar29::mul(&X, &Y);
for i in 0..9 {
assert!(res[i] == XY[i]);
}
}
#[test]
fn montgomery_mul() {
let res = Scalar29::montgomery_mul(&X, &Y);
for i in 0..9 {
assert!(res[i] == XY_MONT[i]);
}
}
#[test]
fn add() {
let res = Scalar29::add(&A, &B);
let zero = Scalar29::zero();
for i in 0..9 {
assert!(res[i] == zero[i]);
}
}
#[test]
fn sub() {
let res = Scalar29::sub(&A, &B);
for i in 0..9 {
assert!(res[i] == AB[i]);
}
}
#[test]
fn from_bytes_wide() {
let bignum = [255u8; 64]; // 2^512 - 1
let reduced = Scalar29::from_bytes_wide(&bignum);
for i in 0..9 {
assert!(reduced[i] == C[i]);
}
}
}

@@ -1,563 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(64\\)-bit
//! limbs with \\(128\\)-bit products.
use core::fmt::Debug;
use core::ops::Neg;
use core::ops::{Add, AddAssign};
use core::ops::{Mul, MulAssign};
use core::ops::{Sub, SubAssign};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
/// A `FieldElement51` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// In the 64-bit implementation, a `FieldElement` is represented in
/// radix \\(2\^{51}\\) as five `u64`s; the coefficients are allowed to
/// grow up to \\(2\^{54}\\) between reductions modulo \\(p\\).
///
/// # Note
///
/// The `curve25519_dalek::field` module provides a type alias
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
/// or `FieldElement2625`.
///
/// The backend-specific type `FieldElement51` should not be used
/// outside of the `curve25519_dalek::field` module.
#[derive(Copy, Clone)]
pub struct FieldElement51(pub (crate) [u64; 5]);
impl Debug for FieldElement51 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "FieldElement51({:?})", &self.0[..])
}
}
impl Zeroize for FieldElement51 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl<'b> AddAssign<&'b FieldElement51> for FieldElement51 {
fn add_assign(&mut self, _rhs: &'b FieldElement51) {
for i in 0..5 {
self.0[i] += _rhs.0[i];
}
}
}
impl<'a, 'b> Add<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn add(self, _rhs: &'b FieldElement51) -> FieldElement51 {
let mut output = *self;
output += _rhs;
output
}
}
impl<'b> SubAssign<&'b FieldElement51> for FieldElement51 {
fn sub_assign(&mut self, _rhs: &'b FieldElement51) {
let result = (self as &FieldElement51) - _rhs;
self.0 = result.0;
}
}
impl<'a, 'b> Sub<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn sub(self, _rhs: &'b FieldElement51) -> FieldElement51 {
// To avoid underflow, first add a multiple of p.
// Choose 16*p = p << 4 to be larger than 54-bit _rhs.
//
// If we could statically track the bitlengths of the limbs
// of every FieldElement51, we could choose a multiple of p
// just bigger than _rhs and avoid having to do a reduction.
//
// Since we don't yet have type-level integers to do this, we
// have to add an explicit reduction call here.
FieldElement51::reduce([
(self.0[0] + 36028797018963664u64) - _rhs.0[0],
(self.0[1] + 36028797018963952u64) - _rhs.0[1],
(self.0[2] + 36028797018963952u64) - _rhs.0[2],
(self.0[3] + 36028797018963952u64) - _rhs.0[3],
(self.0[4] + 36028797018963952u64) - _rhs.0[4],
])
}
}
impl<'b> MulAssign<&'b FieldElement51> for FieldElement51 {
fn mul_assign(&mut self, _rhs: &'b FieldElement51) {
let result = (self as &FieldElement51) * _rhs;
self.0 = result.0;
}
}
impl<'a, 'b> Mul<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn mul(self, _rhs: &'b FieldElement51) -> FieldElement51 {
/// Helper function to multiply two 64-bit integers with 128
/// bits of output.
#[inline(always)]
fn m(x: u64, y: u64) -> u128 { (x as u128) * (y as u128) }
// Alias self, _rhs for more readable formulas
let a: &[u64; 5] = &self.0;
let b: &[u64; 5] = &_rhs.0;
// Precondition: assume input limbs a[i], b[i] are bounded as
//
// a[i], b[i] < 2^(51 + b)
//
// where b is a real parameter measuring the "bit excess" of the limbs.
// 64-bit precomputations to avoid 128-bit multiplications.
//
// This fits into a u64 whenever 51 + b + lg(19) < 64.
//
// Since 51 + b + lg(19) < 51 + 4.25 + b
// = 55.25 + b,
// this fits if b < 8.75.
let b1_19 = b[1] * 19;
let b2_19 = b[2] * 19;
let b3_19 = b[3] * 19;
let b4_19 = b[4] * 19;
// Multiply to get 128-bit coefficients of output
let c0: u128 = m(a[0],b[0]) + m(a[4],b1_19) + m(a[3],b2_19) + m(a[2],b3_19) + m(a[1],b4_19);
let mut c1: u128 = m(a[1],b[0]) + m(a[0],b[1]) + m(a[4],b2_19) + m(a[3],b3_19) + m(a[2],b4_19);
let mut c2: u128 = m(a[2],b[0]) + m(a[1],b[1]) + m(a[0],b[2]) + m(a[4],b3_19) + m(a[3],b4_19);
let mut c3: u128 = m(a[3],b[0]) + m(a[2],b[1]) + m(a[1],b[2]) + m(a[0],b[3]) + m(a[4],b4_19);
let mut c4: u128 = m(a[4],b[0]) + m(a[3],b[1]) + m(a[2],b[2]) + m(a[1],b[3]) + m(a[0],b[4]);
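// (The b[i]*19 precomputations fold weights >= 2^255 back down: the partial
// product a[i]*b[j] carries weight 2^(51*(i+j)), and 2^255 = 19 (mod p), so
// e.g. a[4]*b[1] at weight 2^255 contributes 19*a[4]*b[1] to c0 -- hence
// m(a[4], b1_19) above.)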
// How big are the c[i]? We have
//
// c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19)
// < 2^(102 + lg(1 + 4*19) + 2*b)
// < 2^(108.27 + 2*b)
//
// The carry (c[i] >> 51) fits into a u64 when
// 108.27 + 2*b - 51 < 64
// 2*b < 6.73
// b < 3.365.
//
// So we require b < 3 to ensure this fits.
debug_assert!(a[0] < (1 << 54)); debug_assert!(b[0] < (1 << 54));
debug_assert!(a[1] < (1 << 54)); debug_assert!(b[1] < (1 << 54));
debug_assert!(a[2] < (1 << 54)); debug_assert!(b[2] < (1 << 54));
debug_assert!(a[3] < (1 << 54)); debug_assert!(b[3] < (1 << 54));
debug_assert!(a[4] < (1 << 54)); debug_assert!(b[4] < (1 << 54));
// Casting to u64 and back tells the compiler that the carry is
// bounded by 2^64, so that the addition is a u128 + u64 rather
// than u128 + u128.
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
let mut out = [0u64; 5];
c1 += ((c0 >> 51) as u64) as u128;
out[0] = (c0 as u64) & LOW_51_BIT_MASK;
c2 += ((c1 >> 51) as u64) as u128;
out[1] = (c1 as u64) & LOW_51_BIT_MASK;
c3 += ((c2 >> 51) as u64) as u128;
out[2] = (c2 as u64) & LOW_51_BIT_MASK;
c4 += ((c3 >> 51) as u64) as u128;
out[3] = (c3 as u64) & LOW_51_BIT_MASK;
let carry: u64 = (c4 >> 51) as u64;
out[4] = (c4 as u64) & LOW_51_BIT_MASK;
// To see that this does not overflow, we need out[0] + carry * 19 < 2^64.
//
// c4 < a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 + (carry from c3)
// < 5*(2^(51 + b) * 2^(51 + b)) + (carry from c3)
// < 2^(102 + 2*b + lg(5)) + 2^64.
//
// When b < 3 we get
//
// c4 < 2^110.33 so that carry < 2^59.33
//
// so that
//
// out[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58
//
// and there is no overflow.
out[0] = out[0] + carry * 19;
// Now out[1] < 2^51 + 2^(64 - 51) = 2^51 + 2^13 < 2^(51 + epsilon).
out[1] += out[0] >> 51;
out[0] &= LOW_51_BIT_MASK;
// Now out[i] < 2^(51 + epsilon) for all i.
FieldElement51(out)
}
}
impl<'a> Neg for &'a FieldElement51 {
type Output = FieldElement51;
fn neg(self) -> FieldElement51 {
let mut output = *self;
output.negate();
output
}
}
impl ConditionallySelectable for FieldElement51 {
fn conditional_select(
a: &FieldElement51,
b: &FieldElement51,
choice: Choice,
) -> FieldElement51 {
FieldElement51([
u64::conditional_select(&a.0[0], &b.0[0], choice),
u64::conditional_select(&a.0[1], &b.0[1], choice),
u64::conditional_select(&a.0[2], &b.0[2], choice),
u64::conditional_select(&a.0[3], &b.0[3], choice),
u64::conditional_select(&a.0[4], &b.0[4], choice),
])
}
fn conditional_swap(a: &mut FieldElement51, b: &mut FieldElement51, choice: Choice) {
u64::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
u64::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
u64::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
u64::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
u64::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
}
fn conditional_assign(&mut self, other: &FieldElement51, choice: Choice) {
self.0[0].conditional_assign(&other.0[0], choice);
self.0[1].conditional_assign(&other.0[1], choice);
self.0[2].conditional_assign(&other.0[2], choice);
self.0[3].conditional_assign(&other.0[3], choice);
self.0[4].conditional_assign(&other.0[4], choice);
}
}
impl FieldElement51 {
/// Invert the sign of this field element
pub fn negate(&mut self) {
// See commentary in the Sub impl
let neg = FieldElement51::reduce([
36028797018963664u64 - self.0[0],
36028797018963952u64 - self.0[1],
36028797018963952u64 - self.0[2],
36028797018963952u64 - self.0[3],
36028797018963952u64 - self.0[4],
]);
self.0 = neg.0;
}
/// Construct zero.
pub fn zero() -> FieldElement51 {
FieldElement51([ 0, 0, 0, 0, 0 ])
}
/// Construct one.
pub fn one() -> FieldElement51 {
FieldElement51([ 1, 0, 0, 0, 0 ])
}
/// Construct -1.
pub fn minus_one() -> FieldElement51 {
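// p = 2^255 - 19 has radix-2^51 limbs (2^51 - 19, 2^51 - 1, 2^51 - 1,
// 2^51 - 1, 2^51 - 1); -1 = p - 1 lowers the first limb to 2^51 - 20,
// i.e. (2251799813685228, 2251799813685247, ...) below.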
FieldElement51([2251799813685228, 2251799813685247, 2251799813685247, 2251799813685247, 2251799813685247])
}
/// Given 64-bit input limbs, reduce to enforce the bound 2^(51 + epsilon).
#[inline(always)]
fn reduce(mut limbs: [u64; 5]) -> FieldElement51 {
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
// Since the input limbs are bounded by 2^64, the biggest
// carry-out is bounded by 2^13.
//
// The biggest carry-in is c4 * 19, resulting in
//
// 2^51 + 19*2^13 < 2^51.0000000001
//
// Because we don't need to canonicalize, only to reduce the
// limb sizes, it's OK to do a "weak reduction", where we
// compute the carry-outs in parallel.
let c0 = limbs[0] >> 51;
let c1 = limbs[1] >> 51;
let c2 = limbs[2] >> 51;
let c3 = limbs[3] >> 51;
let c4 = limbs[4] >> 51;
limbs[0] &= LOW_51_BIT_MASK;
limbs[1] &= LOW_51_BIT_MASK;
limbs[2] &= LOW_51_BIT_MASK;
limbs[3] &= LOW_51_BIT_MASK;
limbs[4] &= LOW_51_BIT_MASK;
limbs[0] += c4 * 19;
limbs[1] += c0;
limbs[2] += c1;
limbs[3] += c2;
limbs[4] += c3;
FieldElement51(limbs)
}
/// Load a `FieldElement51` from the low 255 bits of a 256-bit
/// input.
///
/// # Warning
///
/// This function does not check that the input used the canonical
/// representative. It masks the high bit, but it will happily
/// decode 2^255 - 18 to 1. Applications that require a canonical
/// encoding of every field element should decode, re-encode to
/// the canonical encoding, and check that the input was
/// canonical.
///
pub fn from_bytes(bytes: &[u8; 32]) -> FieldElement51 {
let load8 = |input: &[u8]| -> u64 {
(input[0] as u64)
| ((input[1] as u64) << 8)
| ((input[2] as u64) << 16)
| ((input[3] as u64) << 24)
| ((input[4] as u64) << 32)
| ((input[5] as u64) << 40)
| ((input[6] as u64) << 48)
| ((input[7] as u64) << 56)
};
let low_51_bit_mask = (1u64 << 51) - 1;
FieldElement51(
// load bits [ 0, 64), no shift
[ load8(&bytes[ 0..]) & low_51_bit_mask
// load bits [ 48,112), shift to [ 51,112)
, (load8(&bytes[ 6..]) >> 3) & low_51_bit_mask
// load bits [ 96,160), shift to [102,160)
, (load8(&bytes[12..]) >> 6) & low_51_bit_mask
// load bits [152,216), shift to [153,216)
, (load8(&bytes[19..]) >> 1) & low_51_bit_mask
// load bits [192,256), shift to [204,256)
, (load8(&bytes[24..]) >> 12) & low_51_bit_mask
])
}
/// Serialize this `FieldElement51` to a 32-byte array. The
/// encoding is canonical.
pub fn to_bytes(&self) -> [u8; 32] {
// Let h = limbs[0] + limbs[1]*2^51 + ... + limbs[4]*2^204.
//
// Write h = pq + r with 0 <= r < p.
//
// We want to compute r = h mod p.
//
// If h < 2*p = 2^256 - 38,
// then q = 0 or 1,
//
// with q = 0 when h < p
// and q = 1 when h >= p.
//
// Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255.
// Therefore q can be computed as the carry bit of h + 19.
// First, reduce the limbs to ensure h < 2*p.
let mut limbs = FieldElement51::reduce(self.0).0;
let mut q = (limbs[0] + 19) >> 51;
q = (limbs[1] + q) >> 51;
q = (limbs[2] + q) >> 51;
q = (limbs[3] + q) >> 51;
q = (limbs[4] + q) >> 51;
// Now we can compute r as r = h - pq = r - (2^255-19)q = r + 19q - 2^255q
limbs[0] += 19*q;
// Now carry the result to compute r + 19q ...
let low_51_bit_mask = (1u64 << 51) - 1;
limbs[1] += limbs[0] >> 51;
limbs[0] = limbs[0] & low_51_bit_mask;
limbs[2] += limbs[1] >> 51;
limbs[1] = limbs[1] & low_51_bit_mask;
limbs[3] += limbs[2] >> 51;
limbs[2] = limbs[2] & low_51_bit_mask;
limbs[4] += limbs[3] >> 51;
limbs[3] = limbs[3] & low_51_bit_mask;
// ... but instead of carrying (limbs[4] >> 51) = 2^255q
// into another limb, discard it, subtracting the value
limbs[4] = limbs[4] & low_51_bit_mask;
// Now arrange the bits of the limbs.
let mut s = [0u8;32];
s[ 0] = limbs[0] as u8;
s[ 1] = (limbs[0] >> 8) as u8;
s[ 2] = (limbs[0] >> 16) as u8;
s[ 3] = (limbs[0] >> 24) as u8;
s[ 4] = (limbs[0] >> 32) as u8;
s[ 5] = (limbs[0] >> 40) as u8;
s[ 6] = ((limbs[0] >> 48) | (limbs[1] << 3)) as u8;
s[ 7] = (limbs[1] >> 5) as u8;
s[ 8] = (limbs[1] >> 13) as u8;
s[ 9] = (limbs[1] >> 21) as u8;
s[10] = (limbs[1] >> 29) as u8;
s[11] = (limbs[1] >> 37) as u8;
s[12] = ((limbs[1] >> 45) | (limbs[2] << 6)) as u8;
s[13] = (limbs[2] >> 2) as u8;
s[14] = (limbs[2] >> 10) as u8;
s[15] = (limbs[2] >> 18) as u8;
s[16] = (limbs[2] >> 26) as u8;
s[17] = (limbs[2] >> 34) as u8;
s[18] = (limbs[2] >> 42) as u8;
s[19] = ((limbs[2] >> 50) | (limbs[3] << 1)) as u8;
s[20] = (limbs[3] >> 7) as u8;
s[21] = (limbs[3] >> 15) as u8;
s[22] = (limbs[3] >> 23) as u8;
s[23] = (limbs[3] >> 31) as u8;
s[24] = (limbs[3] >> 39) as u8;
s[25] = ((limbs[3] >> 47) | (limbs[4] << 4)) as u8;
s[26] = (limbs[4] >> 4) as u8;
s[27] = (limbs[4] >> 12) as u8;
s[28] = (limbs[4] >> 20) as u8;
s[29] = (limbs[4] >> 28) as u8;
s[30] = (limbs[4] >> 36) as u8;
s[31] = (limbs[4] >> 44) as u8;
// High bit should be zero.
debug_assert!((s[31] & 0b1000_0000u8) == 0u8);
s
}
/// Given `k > 0`, return `self^(2^k)`.
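///
/// For example, `x.pow2k(3)` squares three times, computing `x^8`;
/// `square()` below is just `pow2k(1)`.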
pub fn pow2k(&self, mut k: u32) -> FieldElement51 {
debug_assert!( k > 0 );
/// Multiply two 64-bit integers with 128 bits of output.
#[inline(always)]
fn m(x: u64, y: u64) -> u128 { (x as u128) * (y as u128) }
let mut a: [u64; 5] = self.0;
loop {
// Precondition: assume input limbs a[i] are bounded as
//
// a[i] < 2^(51 + b)
//
// where b is a real parameter measuring the "bit excess" of the limbs.
// Precomputation: 64-bit multiply by 19.
//
// This fits into a u64 whenever 51 + b + lg(19) < 64.
//
// Since 51 + b + lg(19) < 51 + 4.25 + b
// = 55.25 + b,
// this fits if b < 8.75.
let a3_19 = 19 * a[3];
let a4_19 = 19 * a[4];
// Multiply to get 128-bit coefficients of output.
//
// The 128-bit multiplications by 2 turn into 1 slr + 1 slrd each,
// which doesn't seem any better or worse than doing them as precomputations
// on the 64-bit inputs.
let c0: u128 = m(a[0], a[0]) + 2*( m(a[1], a4_19) + m(a[2], a3_19) );
let mut c1: u128 = m(a[3], a3_19) + 2*( m(a[0], a[1]) + m(a[2], a4_19) );
let mut c2: u128 = m(a[1], a[1]) + 2*( m(a[0], a[2]) + m(a[4], a3_19) );
let mut c3: u128 = m(a[4], a4_19) + 2*( m(a[0], a[3]) + m(a[1], a[2]) );
let mut c4: u128 = m(a[2], a[2]) + 2*( m(a[0], a[4]) + m(a[1], a[3]) );
// Same bound as in multiply:
// c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19)
// < 2^(102 + lg(1 + 4*19) + 2*b)
// < 2^(108.27 + 2*b)
//
// The carry (c[i] >> 51) fits into a u64 when
// 108.27 + 2*b - 51 < 64
// 2*b < 6.73
// b < 3.365.
//
// So we require b < 3 to ensure this fits.
debug_assert!(a[0] < (1 << 54));
debug_assert!(a[1] < (1 << 54));
debug_assert!(a[2] < (1 << 54));
debug_assert!(a[3] < (1 << 54));
debug_assert!(a[4] < (1 << 54));
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
// Casting to u64 and back tells the compiler that the carry is bounded by 2^64, so
// that the addition is a u128 + u64 rather than u128 + u128.
c1 += ((c0 >> 51) as u64) as u128;
a[0] = (c0 as u64) & LOW_51_BIT_MASK;
c2 += ((c1 >> 51) as u64) as u128;
a[1] = (c1 as u64) & LOW_51_BIT_MASK;
c3 += ((c2 >> 51) as u64) as u128;
a[2] = (c2 as u64) & LOW_51_BIT_MASK;
c4 += ((c3 >> 51) as u64) as u128;
a[3] = (c3 as u64) & LOW_51_BIT_MASK;
let carry: u64 = (c4 >> 51) as u64;
a[4] = (c4 as u64) & LOW_51_BIT_MASK;
// To see that this does not overflow, we need a[0] + carry * 19 < 2^64.
//
// c4 < a2^2 + 2*a0*a4 + 2*a1*a3 + (carry from c3)
// < 2^(102 + 2*b + lg(5)) + 2^64.
//
// When b < 3 we get
//
// c4 < 2^110.33 so that carry < 2^59.33
//
// so that
//
// a[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58
//
// and there is no overflow.
a[0] = a[0] + carry * 19;
// Now a[1] < 2^51 + 2^(64 - 51) = 2^51 + 2^13 < 2^(51 + epsilon).
a[1] += a[0] >> 51;
a[0] &= LOW_51_BIT_MASK;
// Now all a[i] < 2^(51 + epsilon) and a = self^(2^k).
k = k - 1;
if k == 0 {
break;
}
}
FieldElement51(a)
}
/// Returns the square of this field element.
pub fn square(&self) -> FieldElement51 {
self.pow2k(1)
}
/// Returns 2 times the square of this field element.
pub fn square2(&self) -> FieldElement51 {
let mut square = self.pow2k(1);
for i in 0..5 {
square.0[i] *= 2;
}
square
}
}
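// A minimal illustrative round-trip sketch against the API above
// (`to_bytes` is documented as canonical); it assumes only the methods
// defined in this file and is not a substitute for the crate's full tests.
#[cfg(test)]
mod test {
use super::*;
#[test]
fn bytes_roundtrip_is_canonical() {
let x = FieldElement51::minus_one();
let bytes = x.to_bytes();
// -1 = p - 1 = 2^255 - 20, so the low byte is 256 - 20 = 0xec.
assert_eq!(bytes[0], 0xec);
// Decoding the canonical encoding and re-encoding must be stable.
assert_eq!(FieldElement51::from_bytes(&bytes).to_bytes(), bytes);
}
}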

@@ -1,26 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! The `u64` backend uses `u64`s and a `(u64, u64) -> u128` multiplier.
//!
//! On x86_64, the idiom `(x as u128) * (y as u128)` lowers to `MUL`
//! instructions taking 64-bit inputs and producing 128-bit outputs. On
//! other platforms, this implementation is not recommended.
//!
//! On Haswell and newer, the BMI2 extension provides `MULX`, and on
//! Broadwell and newer, the ADX extension provides `ADCX` and `ADOX`
//! (allowing the CPU to compute two carry chains in parallel). These
//! will be used if available.
pub mod field;
pub mod scalar;
pub mod constants;

@@ -1,451 +0,0 @@
//! Arithmetic mod \\(2\^{252} + 27742317777372353535851937790883648493\\)
//! with five \\(52\\)-bit unsigned limbs.
//!
//! \\(51\\)-bit limbs would cover the desired bit range (\\(253\\)
//! bits), but aren't large enough to reduce a \\(512\\)-bit number with
//! Montgomery multiplication, so \\(52\\) bits are used instead. To see
//! that this is safe for intermediate results, note that the largest
//! limb in a \\(5\times 5\\) product of \\(52\\)-bit limbs will be
//!
//! ```text
//! (0xfffffffffffff^2) * 5 = 0x4ffffffffffff60000000000005 (107 bits).
//! ```
use core::fmt::Debug;
use core::ops::{Index, IndexMut};
use zeroize::Zeroize;
use constants;
/// The `Scalar52` struct represents an element in
/// \\(\mathbb Z / \ell \mathbb Z\\) as 5 \\(52\\)-bit limbs.
#[derive(Copy,Clone)]
pub struct Scalar52(pub [u64; 5]);
impl Debug for Scalar52 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "Scalar52: {:?}", &self.0[..])
}
}
impl Zeroize for Scalar52 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl Index<usize> for Scalar52 {
type Output = u64;
fn index(&self, _index: usize) -> &u64 {
&(self.0[_index])
}
}
impl IndexMut<usize> for Scalar52 {
fn index_mut(&mut self, _index: usize) -> &mut u64 {
&mut (self.0[_index])
}
}
/// u64 * u64 = u128 multiply helper
#[inline(always)]
fn m(x: u64, y: u64) -> u128 {
(x as u128) * (y as u128)
}
impl Scalar52 {
/// Return the zero scalar
pub fn zero() -> Scalar52 {
Scalar52([0,0,0,0,0])
}
/// Unpack a 32 byte / 256 bit scalar into 5 52-bit limbs.
pub fn from_bytes(bytes: &[u8; 32]) -> Scalar52 {
let mut words = [0u64; 4];
for i in 0..4 {
for j in 0..8 {
words[i] |= (bytes[(i * 8) + j] as u64) << (j * 8);
}
}
let mask = (1u64 << 52) - 1;
let top_mask = (1u64 << 48) - 1;
let mut s = Scalar52::zero();
s[ 0] = words[0] & mask;
s[ 1] = ((words[0] >> 52) | (words[1] << 12)) & mask;
s[ 2] = ((words[1] >> 40) | (words[2] << 24)) & mask;
s[ 3] = ((words[2] >> 28) | (words[3] << 36)) & mask;
s[ 4] = (words[3] >> 16) & top_mask;
s
}
/// Reduce a 64 byte / 512 bit scalar mod l
pub fn from_bytes_wide(bytes: &[u8; 64]) -> Scalar52 {
let mut words = [0u64; 8];
for i in 0..8 {
for j in 0..8 {
words[i] |= (bytes[(i * 8) + j] as u64) << (j * 8);
}
}
let mask = (1u64 << 52) - 1;
let mut lo = Scalar52::zero();
let mut hi = Scalar52::zero();
lo[0] = words[ 0] & mask;
lo[1] = ((words[ 0] >> 52) | (words[ 1] << 12)) & mask;
lo[2] = ((words[ 1] >> 40) | (words[ 2] << 24)) & mask;
lo[3] = ((words[ 2] >> 28) | (words[ 3] << 36)) & mask;
lo[4] = ((words[ 3] >> 16) | (words[ 4] << 48)) & mask;
hi[0] = (words[ 4] >> 4) & mask;
hi[1] = ((words[ 4] >> 56) | (words[ 5] << 8)) & mask;
hi[2] = ((words[ 5] >> 44) | (words[ 6] << 20)) & mask;
hi[3] = ((words[ 6] >> 32) | (words[ 7] << 32)) & mask;
hi[4] = words[ 7] >> 20 ;
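// At this point n = lo + 2^260*hi with R = 2^260: lo only needs a full
// reduction mod l, while hi needs its implicit weight 2^260 = R restored;
// one montgomery_mul each achieves both: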
lo = Scalar52::montgomery_mul(&lo, &constants::R); // (lo * R) / R = lo
hi = Scalar52::montgomery_mul(&hi, &constants::RR); // (hi * R^2) / R = hi * R
Scalar52::add(&hi, &lo)
}
/// Pack the limbs of this `Scalar52` into 32 bytes
pub fn to_bytes(&self) -> [u8; 32] {
let mut s = [0u8; 32];
s[0] = (self.0[ 0] >> 0) as u8;
s[1] = (self.0[ 0] >> 8) as u8;
s[2] = (self.0[ 0] >> 16) as u8;
s[3] = (self.0[ 0] >> 24) as u8;
s[4] = (self.0[ 0] >> 32) as u8;
s[5] = (self.0[ 0] >> 40) as u8;
s[6] = ((self.0[ 0] >> 48) | (self.0[ 1] << 4)) as u8;
s[7] = (self.0[ 1] >> 4) as u8;
s[8] = (self.0[ 1] >> 12) as u8;
s[9] = (self.0[ 1] >> 20) as u8;
s[10] = (self.0[ 1] >> 28) as u8;
s[11] = (self.0[ 1] >> 36) as u8;
s[12] = (self.0[ 1] >> 44) as u8;
s[13] = (self.0[ 2] >> 0) as u8;
s[14] = (self.0[ 2] >> 8) as u8;
s[15] = (self.0[ 2] >> 16) as u8;
s[16] = (self.0[ 2] >> 24) as u8;
s[17] = (self.0[ 2] >> 32) as u8;
s[18] = (self.0[ 2] >> 40) as u8;
s[19] = ((self.0[ 2] >> 48) | (self.0[ 3] << 4)) as u8;
s[20] = (self.0[ 3] >> 4) as u8;
s[21] = (self.0[ 3] >> 12) as u8;
s[22] = (self.0[ 3] >> 20) as u8;
s[23] = (self.0[ 3] >> 28) as u8;
s[24] = (self.0[ 3] >> 36) as u8;
s[25] = (self.0[ 3] >> 44) as u8;
s[26] = (self.0[ 4] >> 0) as u8;
s[27] = (self.0[ 4] >> 8) as u8;
s[28] = (self.0[ 4] >> 16) as u8;
s[29] = (self.0[ 4] >> 24) as u8;
s[30] = (self.0[ 4] >> 32) as u8;
s[31] = (self.0[ 4] >> 40) as u8;
s
}
/// Compute `a + b` (mod l)
pub fn add(a: &Scalar52, b: &Scalar52) -> Scalar52 {
let mut sum = Scalar52::zero();
let mask = (1u64 << 52) - 1;
// a + b
let mut carry: u64 = 0;
for i in 0..5 {
carry = a[i] + b[i] + (carry >> 52);
sum[i] = carry & mask;
}
// subtract l if the sum is >= l
Scalar52::sub(&sum, &constants::L)
}
/// Compute `a - b` (mod l)
pub fn sub(a: &Scalar52, b: &Scalar52) -> Scalar52 {
let mut difference = Scalar52::zero();
let mask = (1u64 << 52) - 1;
// a - b
let mut borrow: u64 = 0;
for i in 0..5 {
borrow = a[i].wrapping_sub(b[i] + (borrow >> 63));
difference[i] = borrow & mask;
}
// conditionally add l if the difference is negative
let underflow_mask = ((borrow >> 63) ^ 1).wrapping_sub(1);
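// (borrow >> 63 is 1 exactly on underflow, so the mask becomes
// (1 ^ 1).wrapping_sub(1) = !0 then, and (0 ^ 1).wrapping_sub(1) = 0 otherwise.)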
let mut carry: u64 = 0;
for i in 0..5 {
carry = (carry >> 52) + difference[i] + (constants::L[i] & underflow_mask);
difference[i] = carry & mask;
}
difference
}
/// Compute `a * b`
#[inline(always)]
pub (crate) fn mul_internal(a: &Scalar52, b: &Scalar52) -> [u128; 9] {
let mut z = [0u128; 9];
z[0] = m(a[0],b[0]);
z[1] = m(a[0],b[1]) + m(a[1],b[0]);
z[2] = m(a[0],b[2]) + m(a[1],b[1]) + m(a[2],b[0]);
z[3] = m(a[0],b[3]) + m(a[1],b[2]) + m(a[2],b[1]) + m(a[3],b[0]);
z[4] = m(a[0],b[4]) + m(a[1],b[3]) + m(a[2],b[2]) + m(a[3],b[1]) + m(a[4],b[0]);
z[5] = m(a[1],b[4]) + m(a[2],b[3]) + m(a[3],b[2]) + m(a[4],b[1]);
z[6] = m(a[2],b[4]) + m(a[3],b[3]) + m(a[4],b[2]);
z[7] = m(a[3],b[4]) + m(a[4],b[3]);
z[8] = m(a[4],b[4]);
z
}
/// Compute `a^2`
#[inline(always)]
fn square_internal(a: &Scalar52) -> [u128; 9] {
let aa = [
a[0]*2,
a[1]*2,
a[2]*2,
a[3]*2,
];
[
m( a[0],a[0]),
m(aa[0],a[1]),
m(aa[0],a[2]) + m( a[1],a[1]),
m(aa[0],a[3]) + m(aa[1],a[2]),
m(aa[0],a[4]) + m(aa[1],a[3]) + m( a[2],a[2]),
m(aa[1],a[4]) + m(aa[2],a[3]),
m(aa[2],a[4]) + m( a[3],a[3]),
m(aa[3],a[4]),
m(a[4],a[4])
]
}
/// Compute `limbs/R` (mod l), where R is the Montgomery modulus 2^260
#[inline(always)]
pub (crate) fn montgomery_reduce(limbs: &[u128; 9]) -> Scalar52 {
#[inline(always)]
fn part1(sum: u128) -> (u128, u64) {
let p = (sum as u64).wrapping_mul(constants::LFACTOR) & ((1u64 << 52) - 1);
((sum + m(p,constants::L[0])) >> 52, p)
}
#[inline(always)]
fn part2(sum: u128) -> (u128, u64) {
let w = (sum as u64) & ((1u64 << 52) - 1);
(sum >> 52, w)
}
// note: l[3] is zero, so its multiples can be skipped
let l = &constants::L;
// the first half computes the Montgomery adjustment factor n, and begins adding n*l to make limbs divisible by R
let (carry, n0) = part1( limbs[0]);
let (carry, n1) = part1(carry + limbs[1] + m(n0,l[1]));
let (carry, n2) = part1(carry + limbs[2] + m(n0,l[2]) + m(n1,l[1]));
let (carry, n3) = part1(carry + limbs[3] + m(n1,l[2]) + m(n2,l[1]));
let (carry, n4) = part1(carry + limbs[4] + m(n0,l[4]) + m(n2,l[2]) + m(n3,l[1]));
// limbs is divisible by R now, so we can divide by R by simply storing the upper half as the result
let (carry, r0) = part2(carry + limbs[5] + m(n1,l[4]) + m(n3,l[2]) + m(n4,l[1]));
let (carry, r1) = part2(carry + limbs[6] + m(n2,l[4]) + m(n4,l[2]));
let (carry, r2) = part2(carry + limbs[7] + m(n3,l[4]) );
let (carry, r3) = part2(carry + limbs[8] + m(n4,l[4]));
let r4 = carry as u64;
// result may be >= l, so attempt to subtract l
Scalar52::sub(&Scalar52([r0,r1,r2,r3,r4]), l)
}
/// Compute `a * b` (mod l)
#[inline(never)]
pub fn mul(a: &Scalar52, b: &Scalar52) -> Scalar52 {
let ab = Scalar52::montgomery_reduce(&Scalar52::mul_internal(a, b));
Scalar52::montgomery_reduce(&Scalar52::mul_internal(&ab, &constants::RR))
}
/// Compute `a^2` (mod l)
#[inline(never)]
#[allow(dead_code)] // XXX we don't expose square() via the Scalar API
pub fn square(&self) -> Scalar52 {
let aa = Scalar52::montgomery_reduce(&Scalar52::square_internal(self));
Scalar52::montgomery_reduce(&Scalar52::mul_internal(&aa, &constants::RR))
}
/// Compute `(a * b) / R` (mod l), where R is the Montgomery modulus 2^260
#[inline(never)]
pub fn montgomery_mul(a: &Scalar52, b: &Scalar52) -> Scalar52 {
Scalar52::montgomery_reduce(&Scalar52::mul_internal(a, b))
}
/// Compute `(a^2) / R` (mod l) in Montgomery form, where R is the Montgomery modulus 2^260
#[inline(never)]
pub fn montgomery_square(&self) -> Scalar52 {
Scalar52::montgomery_reduce(&Scalar52::square_internal(self))
}
/// Puts a Scalar52 in to Montgomery form, i.e. computes `a*R (mod l)`
#[inline(never)]
pub fn to_montgomery(&self) -> Scalar52 {
Scalar52::montgomery_mul(self, &constants::RR)
}
/// Takes a Scalar52 out of Montgomery form, i.e. computes `a/R (mod l)`
#[inline(never)]
pub fn from_montgomery(&self) -> Scalar52 {
let mut limbs = [0u128; 9];
for i in 0..5 {
limbs[i] = self[i] as u128;
}
Scalar52::montgomery_reduce(&limbs)
}
}
#[cfg(test)]
mod test {
use super::*;
/// Note: x is 2^253-1 which is slightly larger than the largest scalar produced by
/// this implementation (l-1), and should show there are no overflows for valid scalars
///
/// x = 14474011154664524427946373126085988481658748083205070504932198000989141204991
/// x = 7237005577332262213973186563042994240801631723825162898930247062703686954002 mod l
/// x = 3057150787695215392275360544382990118917283750546154083604586903220563173085*R mod l in Montgomery form
pub static X: Scalar52 = Scalar52(
[0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff,
0x00001fffffffffff]);
/// x^2 = 3078544782642840487852506753550082162405942681916160040940637093560259278169 mod l
pub static XX: Scalar52 = Scalar52(
[0x0001668020217559, 0x000531640ffd0ec0, 0x00085fd6f9f38a31, 0x000c268f73bb1cf4,
0x000006ce65046df0]);
/// x^2 = 4413052134910308800482070043710297189082115023966588301924965890668401540959*R mod l in Montgomery form
pub static XX_MONT: Scalar52 = Scalar52(
[0x000c754eea569a5c, 0x00063b6ed36cb215, 0x0008ffa36bf25886, 0x000e9183614e7543,
0x0000061db6c6f26f]);
/// y = 6145104759870991071742105800796537629880401874866217824609283457819451087098
pub static Y: Scalar52 = Scalar52(
[0x000b75071e1458fa, 0x000bf9d75e1ecdac, 0x000433d2baf0672b, 0x0005fffcc11fad13,
0x00000d96018bb825]);
/// x*y = 36752150652102274958925982391442301741 mod l
pub static XY: Scalar52 = Scalar52(
[0x000ee6d76ba7632d, 0x000ed50d71d84e02, 0x00000000001ba634, 0x0000000000000000,
0x0000000000000000]);
/// x*y = 658448296334113745583381664921721413881518248721417041768778176391714104386*R mod l in Montgomery form
pub static XY_MONT: Scalar52 = Scalar52(
[0x0006d52bf200cfd5, 0x00033fb1d7021570, 0x000f201bc07139d8, 0x0001267e3e49169e,
0x000007b839c00268]);
/// a = 2351415481556538453565687241199399922945659411799870114962672658845158063753
pub static A: Scalar52 = Scalar52(
[0x0005236c07b3be89, 0x0001bc3d2a67c0c4, 0x000a4aa782aae3ee, 0x0006b3f6e4fec4c4,
0x00000532da9fab8c]);
/// b = 4885590095775723760407499321843594317911456947580037491039278279440296187236
pub static B: Scalar52 = Scalar52(
[0x000d3fae55421564, 0x000c2df24f65a4bc, 0x0005b5587d69fb0b, 0x00094c091b013b3b,
0x00000acd25605473]);
/// a+b = 0
/// a-b = 4702830963113076907131374482398799845891318823599740229925345317690316127506
pub static AB: Scalar52 = Scalar52(
[0x000a46d80f677d12, 0x0003787a54cf8188, 0x0004954f0555c7dc, 0x000d67edc9fd8989,
0x00000a65b53f5718]);
/// c = (2^512 - 1) % l = 1627715501170711445284395025044413883736156588369414752970002579683115011840
pub static C: Scalar52 = Scalar52(
[0x000611e3449c0f00, 0x000a768859347a40, 0x0007f5be65d00e1b, 0x0009a3dceec73d21,
0x00000399411b7c30]);
#[test]
fn mul_max() {
let res = Scalar52::mul(&X, &X);
for i in 0..5 {
assert!(res[i] == XX[i]);
}
}
#[test]
fn square_max() {
let res = X.square();
for i in 0..5 {
assert!(res[i] == XX[i]);
}
}
#[test]
fn montgomery_mul_max() {
let res = Scalar52::montgomery_mul(&X, &X);
for i in 0..5 {
assert!(res[i] == XX_MONT[i]);
}
}
#[test]
fn montgomery_square_max() {
let res = X.montgomery_square();
for i in 0..5 {
assert!(res[i] == XX_MONT[i]);
}
}
#[test]
fn mul() {
let res = Scalar52::mul(&X, &Y);
for i in 0..5 {
assert!(res[i] == XY[i]);
}
}
#[test]
fn montgomery_mul() {
let res = Scalar52::montgomery_mul(&X, &Y);
for i in 0..5 {
assert!(res[i] == XY_MONT[i]);
}
}
#[test]
fn add() {
let res = Scalar52::add(&A, &B);
let zero = Scalar52::zero();
for i in 0..5 {
assert!(res[i] == zero[i]);
}
}
#[test]
fn sub() {
let res = Scalar52::sub(&A, &B);
for i in 0..5 {
assert!(res[i] == AB[i]);
}
}
#[test]
fn from_bytes_wide() {
let bignum = [255u8; 64]; // 2^512 - 1
let reduced = Scalar52::from_bytes_wide(&bignum);
println!("{:?}", reduced);
for i in 0..5 {
assert!(reduced[i] == C[i]);
}
}
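/// Illustrative round-trip sketch using only items defined above: for a
/// scalar already reduced mod l (as `A` is, per its documented value),
/// to_montgomery followed by from_montgomery is the identity, since
/// (a*R)/R = a (mod l).
#[test]
fn montgomery_roundtrip() {
let a_mont = A.to_montgomery(); // a*R (mod l)
let a_back = a_mont.from_montgomery(); // (a*R)/R = a (mod l)
for i in 0..5 {
assert!(a_back[i] == A[i]);
}
}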
}

@@ -1,524 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Parallel Edwards Arithmetic for Curve25519.
//!
//! This module currently has two point types:
//!
//! * `ExtendedPoint`: a point stored in vector-friendly format, with
//! vectorized doubling and addition;
//!
//! * `CachedPoint`: used for readdition.
//!
//! Details on the formulas can be found in the documentation for the
//! parent `avx2` module.
//!
//! This API is designed to be safe: vectorized points can only be
//! created from serial points (which do validation on decompression),
//! and operations on valid points return valid points, so invalid
//! point states should be unrepresentable.
//!
//! This design goal is met, with one exception: the `Neg`
//! implementation for the `CachedPoint` performs a lazy negation, so
//! that subtraction can be efficiently implemented as a negation and
//! an addition. Repeatedly negating a `CachedPoint` will cause its
//! coefficients to grow and eventually overflow. Repeatedly negating
//! a point should not be necessary anyway.
#![allow(non_snake_case)]
use core::convert::From;
use core::ops::{Add, Neg, Sub};
use subtle::Choice;
use subtle::ConditionallySelectable;
use edwards;
use window::{LookupTable, NafLookupTable5, NafLookupTable8};
use traits::Identity;
use super::constants;
use super::field::{FieldElement2625x4, Lanes, Shuffle};
/// A point on Curve25519, using parallel Edwards formulas for curve
/// operations.
///
/// # Invariant
///
/// The coefficients of an `ExtendedPoint` are bounded with
/// \\( b < 0.007 \\).
#[derive(Copy, Clone, Debug)]
pub struct ExtendedPoint(pub(super) FieldElement2625x4);
impl From<edwards::EdwardsPoint> for ExtendedPoint {
fn from(P: edwards::EdwardsPoint) -> ExtendedPoint {
ExtendedPoint(FieldElement2625x4::new(&P.X, &P.Y, &P.Z, &P.T))
}
}
impl From<ExtendedPoint> for edwards::EdwardsPoint {
fn from(P: ExtendedPoint) -> edwards::EdwardsPoint {
let tmp = P.0.split();
edwards::EdwardsPoint {
X: tmp[0],
Y: tmp[1],
Z: tmp[2],
T: tmp[3],
}
}
}
impl ConditionallySelectable for ExtendedPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
ExtendedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice))
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.0.conditional_assign(&other.0, choice);
}
}
impl Default for ExtendedPoint {
fn default() -> ExtendedPoint {
ExtendedPoint::identity()
}
}
impl Identity for ExtendedPoint {
fn identity() -> ExtendedPoint {
constants::EXTENDEDPOINT_IDENTITY
}
}
impl ExtendedPoint {
/// Compute the double of this point.
pub fn double(&self) -> ExtendedPoint {
// Want to compute (X1 Y1 Z1 X1+Y1).
// Not sure how to do this less expensively than computing
// (X1 Y1 Z1 T1) --(256bit shuffle)--> (X1 Y1 X1 Y1)
// (X1 Y1 X1 Y1) --(2x128b shuffle)--> (Y1 X1 Y1 X1)
// and then adding.
// Set tmp0 = (X1 Y1 X1 Y1)
let mut tmp0 = self.0.shuffle(Shuffle::ABAB);
// Set tmp1 = (Y1 X1 Y1 X1)
let mut tmp1 = tmp0.shuffle(Shuffle::BADC);
// Set tmp0 = (X1 Y1 Z1 X1+Y1)
tmp0 = self.0.blend(tmp0 + tmp1, Lanes::D);
// Set tmp1 = tmp0^2, negating the D values
tmp1 = tmp0.square_and_negate_D();
// Now tmp1 = (S1 S2 S3 -S4) with b < 0.007
// See discussion of bounds in the module-level documentation.
// We want to compute
//
// + | S1 | S1 | S1 | S1 |
// + | S2 | | | S2 |
// + | | | S3 | |
// + | | | S3 | |
// + | | | |-S4 |
// + | | 2p | 2p | |
// - | | S2 | S2 | |
// =======================
// S5 S6 S8 S9
let zero = FieldElement2625x4::zero();
let S_1 = tmp1.shuffle(Shuffle::AAAA);
let S_2 = tmp1.shuffle(Shuffle::BBBB);
tmp0 = zero.blend(tmp1 + tmp1, Lanes::C);
// tmp0 = (0, 0, 2S_3, 0)
tmp0 = tmp0.blend(tmp1, Lanes::D);
// tmp0 = (0, 0, 2S_3, -S_4)
tmp0 = tmp0 + S_1;
// tmp0 = ( S_1, S_1, S_1 + 2S_3, S_1 - S_4)
tmp0 = tmp0 + zero.blend(S_2, Lanes::AD);
// tmp0 = (S_1 + S_2, S_1, S_1 + 2S_3, S_1 + S_2 - S_4)
tmp0 = tmp0 + zero.blend(S_2.negate_lazy(), Lanes::BC);
// tmp0 = (S_1 + S_2, S_1 - S_2, S_1 - S_2 + 2S_3, S_1 + S_2 - S_4)
// b < ( 1.01, 1.6, 2.33, 1.6)
// Now tmp0 = (S_5, S_6, S_8, S_9)
// Set tmp1 = ( S_9, S_6, S_6, S_9)
// b < ( 1.6, 1.6, 1.6, 1.6)
tmp1 = tmp0.shuffle(Shuffle::DBBD);
// Set tmp1 = ( S_8, S_5, S_8, S_5)
// b < (2.33, 1.01, 2.33, 1.01)
tmp0 = tmp0.shuffle(Shuffle::CACA);
// Bounds on (tmp0, tmp1) are (2.33, 1.6) < (2.5, 1.75).
ExtendedPoint(&tmp0 * &tmp1)
}
pub fn mul_by_pow_2(&self, k: u32) -> ExtendedPoint {
let mut tmp: ExtendedPoint = *self;
for _ in 0..k {
tmp = tmp.double();
}
tmp
}
}
/// A cached point with some precomputed variables used for readdition.
///
/// # Warning
///
/// It is not safe to negate this point more than once.
///
/// # Invariant
///
/// As long as the `CachedPoint` is not repeatedly negated, its
/// coefficients will be bounded with \\( b < 1.0 \\).
#[derive(Copy, Clone, Debug)]
pub struct CachedPoint(pub(super) FieldElement2625x4);
impl From<ExtendedPoint> for CachedPoint {
fn from(P: ExtendedPoint) -> CachedPoint {
let mut x = P.0;
x = x.blend(x.diff_sum(), Lanes::AB);
// x = (Y2 - X2, Y2 + X2, Z2, T2) = (S2 S3 Z2 T2)
x = x * (121666, 121666, 2 * 121666, 2 * 121665);
// x = (121666*S2 121666*S3 2*121666*Z2 2*121665*T2)
x = x.blend(-x, Lanes::D);
// x = (121666*S2 121666*S3 2*121666*Z2 -2*121665*T2)
// The coefficients of the output are bounded with b < 0.007.
CachedPoint(x)
}
}
impl Default for CachedPoint {
fn default() -> CachedPoint {
CachedPoint::identity()
}
}
impl Identity for CachedPoint {
fn identity() -> CachedPoint {
constants::CACHEDPOINT_IDENTITY
}
}
impl ConditionallySelectable for CachedPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
CachedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice))
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.0.conditional_assign(&other.0, choice);
}
}
impl<'a> Neg for &'a CachedPoint {
type Output = CachedPoint;
/// Lazily negate the point.
///
/// # Warning
///
/// Because this method does not perform a reduction, it is not
/// safe to repeatedly negate a point.
fn neg(self) -> CachedPoint {
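// A cached point stores (121666(Y-X), 121666(Y+X), 2*121666*Z, -2*121665*T);
// negating the underlying point maps (X, Y) to (-X, Y), which swaps the
// first two lanes (Shuffle::BACD) and flips the sign of the T lane.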
let swapped = self.0.shuffle(Shuffle::BACD);
CachedPoint(swapped.blend(swapped.negate_lazy(), Lanes::D))
}
}
impl<'a, 'b> Add<&'b CachedPoint> for &'a ExtendedPoint {
type Output = ExtendedPoint;
/// Add an `ExtendedPoint` and a `CachedPoint`.
fn add(self, other: &'b CachedPoint) -> ExtendedPoint {
// The coefficients of an `ExtendedPoint` are reduced after
// every operation. If the `CachedPoint` was negated, its
// coefficients grow by one bit. So on input, `self` is
// bounded with `b < 0.007` and `other` is bounded with
// `b < 1.0`.
let mut tmp = self.0;
tmp = tmp.blend(tmp.diff_sum(), Lanes::AB);
// tmp = (Y1-X1 Y1+X1 Z1 T1) = (S0 S1 Z1 T1) with b < 1.6
// (tmp, other) bounded with b < (1.6, 1.0) < (2.5, 1.75).
tmp = &tmp * &other.0;
// tmp = (S0*S2' S1*S3' Z1*Z2' T1*T2') = (S8 S9 S10 S11)
tmp = tmp.shuffle(Shuffle::ABDC);
// tmp = (S8 S9 S11 S10)
tmp = tmp.diff_sum();
// tmp = (S9-S8 S9+S8 S10-S11 S10+S11) = (S12 S13 S14 S15)
let t0 = tmp.shuffle(Shuffle::ADDA);
// t0 = (S12 S15 S15 S12)
let t1 = tmp.shuffle(Shuffle::CBCB);
// t1 = (S14 S13 S14 S13)
// All coefficients of t0, t1 are bounded with b < 1.6.
// Return (S12*S14 S15*S13 S15*S14 S12*S13) = (X3 Y3 Z3 T3)
ExtendedPoint(&t0 * &t1)
}
}
impl<'a, 'b> Sub<&'b CachedPoint> for &'a ExtendedPoint {
type Output = ExtendedPoint;
/// Implement subtraction by negating the point and adding.
///
/// Empirically, this seems about the same cost as a custom
/// subtraction impl (maybe because the benefit is cancelled by
/// increased code size?)
fn sub(self, other: &'b CachedPoint) -> ExtendedPoint {
self + &(-other)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for LookupTable<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let P = ExtendedPoint::from(*point);
let mut points = [CachedPoint::from(P); 8];
for i in 0..7 {
points[i + 1] = (&P + &points[i]).into();
}
LookupTable(points)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable5<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let A = ExtendedPoint::from(*point);
let mut Ai = [CachedPoint::from(A); 8];
let A2 = A.double();
for i in 0..7 {
Ai[i + 1] = (&A2 + &Ai[i]).into();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
NafLookupTable5(Ai)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable8<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let A = ExtendedPoint::from(*point);
let mut Ai = [CachedPoint::from(A); 64];
let A2 = A.double();
for i in 0..63 {
Ai[i + 1] = (&A2 + &Ai[i]).into();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
NafLookupTable8(Ai)
}
}
#[cfg(test)]
mod test {
use super::*;
fn serial_add(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) -> edwards::EdwardsPoint {
use backend::serial::u64::field::FieldElement51;
let (X1, Y1, Z1, T1) = (P.X, P.Y, P.Z, P.T);
let (X2, Y2, Z2, T2) = (Q.X, Q.Y, Q.Z, Q.T);
macro_rules! print_var {
($x:ident) => {
println!("{} = {:?}", stringify!($x), $x.to_bytes());
};
}
let S0 = &Y1 - &X1; // R1
let S1 = &Y1 + &X1; // R3
let S2 = &Y2 - &X2; // R2
let S3 = &Y2 + &X2; // R4
print_var!(S0);
print_var!(S1);
print_var!(S2);
print_var!(S3);
println!("");
let S4 = &S0 * &S2; // R5 = R1 * R2
let S5 = &S1 * &S3; // R6 = R3 * R4
let S6 = &Z1 * &Z2; // R8
let S7 = &T1 * &T2; // R7
print_var!(S4);
print_var!(S5);
print_var!(S6);
print_var!(S7);
println!("");
let S8 = &S4 * &FieldElement51([ 121666,0,0,0,0]); // R5
let S9 = &S5 * &FieldElement51([ 121666,0,0,0,0]); // R6
let S10 = &S6 * &FieldElement51([2*121666,0,0,0,0]); // R8
let S11 = &S7 * &(-&FieldElement51([2*121665,0,0,0,0])); // R7
print_var!(S8);
print_var!(S9);
print_var!(S10);
print_var!(S11);
println!("");
let S12 = &S9 - &S8; // R1
let S13 = &S9 + &S8; // R4
let S14 = &S10 - &S11; // R2
let S15 = &S10 + &S11; // R3
print_var!(S12);
print_var!(S13);
print_var!(S14);
print_var!(S15);
println!("");
let X3 = &S12 * &S14; // R1 * R2
let Y3 = &S15 * &S13; // R3 * R4
let Z3 = &S15 * &S14; // R2 * R3
let T3 = &S12 * &S13; // R1 * R4
edwards::EdwardsPoint {
X: X3,
Y: Y3,
Z: Z3,
T: T3,
}
}
fn addition_test_helper(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) {
// Test the serial implementation of the parallel addition formulas
let R_serial: edwards::EdwardsPoint = serial_add(P.into(), Q.into()).into();
// Test the vector implementation of the parallel readdition formulas
let cached_Q = CachedPoint::from(ExtendedPoint::from(Q));
let R_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) + &cached_Q).into();
let S_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) - &cached_Q).into();
println!("Testing point addition:");
println!("P = {:?}", P);
println!("Q = {:?}", Q);
println!("cached Q = {:?}", cached_Q);
println!("R = P + Q = {:?}", &P + &Q);
println!("R_serial = {:?}", R_serial);
println!("R_vector = {:?}", R_vector);
println!("S = P - Q = {:?}", &P - &Q);
println!("S_vector = {:?}", S_vector);
assert_eq!(R_serial.compress(), (&P + &Q).compress());
assert_eq!(R_vector.compress(), (&P + &Q).compress());
assert_eq!(S_vector.compress(), (&P - &Q).compress());
println!("OK!\n");
}
#[test]
fn vector_addition_vs_serial_addition_vs_edwards_extendedpoint() {
use constants;
use scalar::Scalar;
println!("Testing id +- id");
let P = edwards::EdwardsPoint::identity();
let Q = edwards::EdwardsPoint::identity();
addition_test_helper(P, Q);
println!("Testing id +- B");
let P = edwards::EdwardsPoint::identity();
let Q = constants::ED25519_BASEPOINT_POINT;
addition_test_helper(P, Q);
println!("Testing B +- B");
let P = constants::ED25519_BASEPOINT_POINT;
let Q = constants::ED25519_BASEPOINT_POINT;
addition_test_helper(P, Q);
println!("Testing B +- kB");
let P = constants::ED25519_BASEPOINT_POINT;
let Q = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
addition_test_helper(P, Q);
}
fn serial_double(P: edwards::EdwardsPoint) -> edwards::EdwardsPoint {
let (X1, Y1, Z1, _T1) = (P.X, P.Y, P.Z, P.T);
macro_rules! print_var {
($x:ident) => {
println!("{} = {:?}", stringify!($x), $x.to_bytes());
};
}
let S0 = &X1 + &Y1; // R1
print_var!(S0);
println!("");
let S1 = X1.square();
let S2 = Y1.square();
let S3 = Z1.square();
let S4 = S0.square();
print_var!(S1);
print_var!(S2);
print_var!(S3);
print_var!(S4);
println!("");
let S5 = &S1 + &S2;
let S6 = &S1 - &S2;
let S7 = &S3 + &S3;
let S8 = &S7 + &S6;
let S9 = &S5 - &S4;
print_var!(S5);
print_var!(S6);
print_var!(S7);
print_var!(S8);
print_var!(S9);
println!("");
let X3 = &S8 * &S9;
let Y3 = &S5 * &S6;
let Z3 = &S8 * &S6;
let T3 = &S5 * &S9;
edwards::EdwardsPoint {
X: X3,
Y: Y3,
Z: Z3,
T: T3,
}
}
fn doubling_test_helper(P: edwards::EdwardsPoint) {
let R1: edwards::EdwardsPoint = serial_double(P.into()).into();
let R2: edwards::EdwardsPoint = ExtendedPoint::from(P).double().into();
println!("Testing point doubling:");
println!("P = {:?}", P);
println!("(serial) R1 = {:?}", R1);
println!("(vector) R2 = {:?}", R2);
println!("P + P = {:?}", &P + &P);
assert_eq!(R1.compress(), (&P + &P).compress());
assert_eq!(R2.compress(), (&P + &P).compress());
println!("OK!\n");
}
#[test]
fn vector_doubling_vs_serial_doubling_vs_edwards_extendedpoint() {
use constants;
use scalar::Scalar;
println!("Testing [2]id");
let P = edwards::EdwardsPoint::identity();
doubling_test_helper(P);
println!("Testing [2]B");
let P = constants::ED25519_BASEPOINT_POINT;
doubling_test_helper(P);
println!("Testing [2]([k]B)");
let P = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
doubling_test_helper(P);
}
}

@@ -1,985 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! An implementation of 4-way vectorized 32-bit field arithmetic using
//! AVX2.
//!
//! The `FieldElement2625x4` struct provides a vector of four field
//! elements, implemented using AVX2 operations. Its API is designed
//! to abstract away the platform-dependent details, so that point
//! arithmetic can be implemented only in terms of a vector of field
//! elements.
//!
//! At this level, the API is optimized for speed and not safety. The
//! `FieldElement2625x4` does not always perform reductions. The pre-
//! and post-conditions on the bounds of the coefficients are
//! documented for each method, but it is the caller's responsibility
//! to ensure that there are no overflows.
#![allow(non_snake_case)]
const A_LANES: u8 = 0b0000_0101;
const B_LANES: u8 = 0b0000_1010;
const C_LANES: u8 = 0b0101_0000;
const D_LANES: u8 = 0b1010_0000;
#[allow(unused)]
const A_LANES64: u8 = 0b00_00_00_11;
#[allow(unused)]
const B_LANES64: u8 = 0b00_00_11_00;
#[allow(unused)]
const C_LANES64: u8 = 0b00_11_00_00;
#[allow(unused)]
const D_LANES64: u8 = 0b11_00_00_00;
use core::ops::{Add, Mul, Neg};
use packed_simd::{i32x8, u32x8, u64x4, IntoBits};
use backend::vector::avx2::constants::{P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO};
use backend::serial::u64::field::FieldElement51;
/// Unpack 32-bit lanes into 64-bit lanes:
/// ```ascii,no_run
/// (a0, b0, a1, b1, c0, d0, c1, d1)
/// ```
/// into
/// ```ascii,no_run
/// (a0, 0, b0, 0, c0, 0, d0, 0)
/// (a1, 0, b1, 0, c1, 0, d1, 0)
/// ```
#[inline(always)]
fn unpack_pair(src: u32x8) -> (u32x8, u32x8) {
let a: u32x8;
let b: u32x8;
let zero = i32x8::new(0, 0, 0, 0, 0, 0, 0, 0);
unsafe {
use core::arch::x86_64::_mm256_unpackhi_epi32;
use core::arch::x86_64::_mm256_unpacklo_epi32;
a = _mm256_unpacklo_epi32(src.into_bits(), zero.into_bits()).into_bits();
b = _mm256_unpackhi_epi32(src.into_bits(), zero.into_bits()).into_bits();
}
(a, b)
}
/// Repack 64-bit lanes into 32-bit lanes:
/// ```ascii,no_run
/// (a0, 0, b0, 0, c0, 0, d0, 0)
/// (a1, 0, b1, 0, c1, 0, d1, 0)
/// ```
/// into
/// ```ascii,no_run
/// (a0, b0, a1, b1, c0, d0, c1, d1)
/// ```
#[inline(always)]
fn repack_pair(x: u32x8, y: u32x8) -> u32x8 {
unsafe {
use core::arch::x86_64::_mm256_blend_epi32;
use core::arch::x86_64::_mm256_shuffle_epi32;
// Input: x = (a0, 0, b0, 0, c0, 0, d0, 0)
// Input: y = (a1, 0, b1, 0, c1, 0, d1, 0)
let x_shuffled = _mm256_shuffle_epi32(x.into_bits(), 0b11_01_10_00);
let y_shuffled = _mm256_shuffle_epi32(y.into_bits(), 0b10_00_11_01);
// x' = (a0, b0, 0, 0, c0, d0, 0, 0)
// y' = ( 0, 0, a1, b1, 0, 0, c1, d1)
return _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into_bits();
}
}
/// The `Lanes` enum represents a subset of the lanes `A,B,C,D` of a
/// `FieldElement2625x4`.
///
/// It's used to specify blend operations without
/// having to know details about the data layout of the
/// `FieldElement2625x4`.
#[derive(Copy, Clone, Debug)]
pub enum Lanes {
C,
D,
AB,
AC,
CD,
AD,
BC,
ABCD,
}
/// The `Shuffle` enum represents a shuffle of a `FieldElement2625x4`.
///
/// The enum variants are named by what they do to a vector \\(
/// (A,B,C,D) \\); for instance, `Shuffle::BADC` turns \\( (A, B, C,
/// D) \\) into \\( (B, A, D, C) \\).
#[derive(Copy, Clone, Debug)]
pub enum Shuffle {
AAAA,
BBBB,
CACA,
DBBD,
ADDA,
CBCB,
ABAB,
BADC,
BACD,
ABDC,
}
/// A vector of four field elements.
///
/// Each operation on a `FieldElement2625x4` has documented effects on
/// the bounds of the coefficients. This API is designed for speed
/// and not safety; it is the caller's responsibility to ensure that
/// the post-conditions of one operation are compatible with the
/// pre-conditions of the next.
#[derive(Clone, Copy, Debug)]
pub struct FieldElement2625x4(pub(crate) [u32x8; 5]);
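// Data layout (cf. new() and split() below): limb i of the four elements
// (A, B, C, D) occupies one u32x8 as
// (a_2i, b_2i, a_{2i+1}, b_{2i+1}, c_2i, d_2i, c_{2i+1}, d_{2i+1}):
// each 51-bit serial limb is split into a low 26-bit half and a high half,
// with the A/B and C/D pairs interleaved so shuffles can stay in-lane.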
use subtle::Choice;
use subtle::ConditionallySelectable;
impl ConditionallySelectable for FieldElement2625x4 {
fn conditional_select(
a: &FieldElement2625x4,
b: &FieldElement2625x4,
choice: Choice,
) -> FieldElement2625x4 {
let mask = (-(choice.unwrap_u8() as i32)) as u32;
let mask_vec = u32x8::splat(mask);
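// mask is all ones when choice = 1 and all zeros when choice = 0, so
// a ^ (mask & (a ^ b)) evaluates to b or a respectively, selecting
// without a data-dependent branch.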
FieldElement2625x4([
a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
])
}
fn conditional_assign(
&mut self,
other: &FieldElement2625x4,
choice: Choice,
) {
let mask = (-(choice.unwrap_u8() as i32)) as u32;
let mask_vec = u32x8::splat(mask);
self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
}
}
impl FieldElement2625x4 {
/// Split this vector into an array of four (serial) field
/// elements.
pub fn split(&self) -> [FieldElement51; 4] {
let mut out = [FieldElement51::zero(); 4];
for i in 0..5 {
let a_2i = self.0[i].extract(0) as u64; //
let b_2i = self.0[i].extract(1) as u64; //
let a_2i_1 = self.0[i].extract(2) as u64; // `.
let b_2i_1 = self.0[i].extract(3) as u64; // | pre-swapped to avoid
let c_2i = self.0[i].extract(4) as u64; // | a cross lane shuffle
let d_2i = self.0[i].extract(5) as u64; // .'
let c_2i_1 = self.0[i].extract(6) as u64; //
let d_2i_1 = self.0[i].extract(7) as u64; //
out[0].0[i] = a_2i + (a_2i_1 << 26);
out[1].0[i] = b_2i + (b_2i_1 << 26);
out[2].0[i] = c_2i + (c_2i_1 << 26);
out[3].0[i] = d_2i + (d_2i_1 << 26);
}
out
}
/// Rearrange the elements of this vector according to `control`.
///
/// The `control` parameter should be a compile-time constant, so
/// that when this function is inlined, LLVM is able to lower the
/// shuffle using an immediate.
#[inline]
pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 {
#[inline(always)]
fn shuffle_lanes(x: u32x8, control: Shuffle) -> u32x8 {
unsafe {
use core::arch::x86_64::_mm256_permutevar8x32_epi32;
let c: u32x8 = match control {
Shuffle::AAAA => u32x8::new(0, 0, 2, 2, 0, 0, 2, 2),
Shuffle::BBBB => u32x8::new(1, 1, 3, 3, 1, 1, 3, 3),
Shuffle::CACA => u32x8::new(4, 0, 6, 2, 4, 0, 6, 2),
Shuffle::DBBD => u32x8::new(5, 1, 7, 3, 1, 5, 3, 7),
Shuffle::ADDA => u32x8::new(0, 5, 2, 7, 5, 0, 7, 2),
Shuffle::CBCB => u32x8::new(4, 1, 6, 3, 4, 1, 6, 3),
Shuffle::ABAB => u32x8::new(0, 1, 2, 3, 0, 1, 2, 3),
Shuffle::BADC => u32x8::new(1, 0, 3, 2, 5, 4, 7, 6),
Shuffle::BACD => u32x8::new(1, 0, 3, 2, 4, 5, 6, 7),
Shuffle::ABDC => u32x8::new(0, 1, 2, 3, 5, 4, 7, 6),
};
// Note that this gets turned into a generic LLVM
// shuffle-by-constants, which can be lowered to a simpler
// instruction than a generic permute.
_mm256_permutevar8x32_epi32(x.into_bits(), c.into_bits()).into_bits()
}
}
FieldElement2625x4([
shuffle_lanes(self.0[0], control),
shuffle_lanes(self.0[1], control),
shuffle_lanes(self.0[2], control),
shuffle_lanes(self.0[3], control),
shuffle_lanes(self.0[4], control),
])
}
/// Blend `self` with `other`, taking lanes specified in `control` from `other`.
///
/// The `control` parameter should be a compile-time constant, so
/// that this function can be inlined and LLVM can lower it to a
/// blend instruction using an immediate.
#[inline]
pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 {
#[inline(always)]
fn blend_lanes(x: u32x8, y: u32x8, control: Lanes) -> u32x8 {
unsafe {
use core::arch::x86_64::_mm256_blend_epi32;
// This would be much cleaner if we could factor out the match
// statement on the control. Unfortunately, rustc forgets
// constant-info very quickly, so we can't even write
// ```
// match control {
// Lanes::C => {
// let imm = C_LANES as i32;
// _mm256_blend_epi32(..., imm)
// ```
// let alone
// ```
// let imm = match control {
// Lanes::C => C_LANES as i32,
// }
// _mm256_blend_epi32(..., imm)
// ```
// even though both of these would be constant-folded by LLVM
// at a lower level (as happens in the shuffle implementation,
// which does not require a shuffle immediate but *is* lowered
// to immediate shuffles anyways).
match control {
Lanes::C => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), C_LANES as i32).into_bits()
}
Lanes::D => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), D_LANES as i32).into_bits()
}
Lanes::AD => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | D_LANES) as i32)
.into_bits()
}
Lanes::AB => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | B_LANES) as i32)
.into_bits()
}
Lanes::AC => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | C_LANES) as i32)
.into_bits()
}
Lanes::CD => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (C_LANES | D_LANES) as i32)
.into_bits()
}
Lanes::BC => {
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (B_LANES | C_LANES) as i32)
.into_bits()
}
Lanes::ABCD => _mm256_blend_epi32(
x.into_bits(),
y.into_bits(),
(A_LANES | B_LANES | C_LANES | D_LANES) as i32,
).into_bits(),
}
}
}
FieldElement2625x4([
blend_lanes(self.0[0], other.0[0], control),
blend_lanes(self.0[1], other.0[1], control),
blend_lanes(self.0[2], other.0[2], control),
blend_lanes(self.0[3], other.0[3], control),
blend_lanes(self.0[4], other.0[4], control),
])
}
/// Construct a vector of zeros.
pub fn zero() -> FieldElement2625x4 {
FieldElement2625x4([u32x8::splat(0); 5])
}
/// Convenience wrapper around `new(x,x,x,x)`.
pub fn splat(x: &FieldElement51) -> FieldElement2625x4 {
FieldElement2625x4::new(x, x, x, x)
}
/// Create a `FieldElement2625x4` from four `FieldElement51`s.
///
/// # Postconditions
///
/// The resulting `FieldElement2625x4` is bounded with \\( b < 0.0002 \\).
pub fn new(
x0: &FieldElement51,
x1: &FieldElement51,
x2: &FieldElement51,
x3: &FieldElement51,
) -> FieldElement2625x4 {
let mut buf = [u32x8::splat(0); 5];
let low_26_bits = (1 << 26) - 1;
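// Each 51-bit limb is split as
//     x.0[i] = (x.0[i] & low_26_bits) + 2^26 * (x.0[i] >> 26),
// giving a 26-bit low half and a (nominally 25-bit) high half; the
// low halves land in even vector lanes and the high halves in odd
// lanes, matching the layout that `split` reads back above.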
for i in 0..5 {
let a_2i = (x0.0[i] & low_26_bits) as u32;
let a_2i_1 = (x0.0[i] >> 26) as u32;
let b_2i = (x1.0[i] & low_26_bits) as u32;
let b_2i_1 = (x1.0[i] >> 26) as u32;
let c_2i = (x2.0[i] & low_26_bits) as u32;
let c_2i_1 = (x2.0[i] >> 26) as u32;
let d_2i = (x3.0[i] & low_26_bits) as u32;
let d_2i_1 = (x3.0[i] >> 26) as u32;
buf[i] = u32x8::new(a_2i, b_2i, a_2i_1, b_2i_1, c_2i, d_2i, c_2i_1, d_2i_1);
}
// We don't know that the original `FieldElement51`s were
// fully reduced, so the odd limbs may exceed 2^25.
// Reduce them to be sure.
FieldElement2625x4(buf).reduce()
}
/// Given \\((A,B,C,D)\\), compute \\((-A,-B,-C,-D)\\), without
/// performing a reduction.
///
/// # Preconditions
///
/// The coefficients of `self` must be bounded with \\( b < 0.999 \\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 1 \\).
#[inline]
pub fn negate_lazy(&self) -> FieldElement2625x4 {
// The limbs of self are bounded with b < 0.999; the even limbs of
// 2*p are at least 2*(2^26 - 19) = 134217690 > 2^{26.9999} and the
// odd limbs are 2*(2^25 - 1) = 67108862 > 2^{25.9999}, so
// underflows are not possible.
FieldElement2625x4([
P_TIMES_2_LO - self.0[0],
P_TIMES_2_HI - self.0[1],
P_TIMES_2_HI - self.0[2],
P_TIMES_2_HI - self.0[3],
P_TIMES_2_HI - self.0[4],
])
}
/// Given `self = (A,B,C,D)`, compute `(B - A, B + A, D - C, D + C)`.
///
/// # Preconditions
///
/// The coefficients of `self` must be bounded with \\( b < 0.01 \\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 1.6 \\).
#[inline]
pub fn diff_sum(&self) -> FieldElement2625x4 {
// tmp1 = (B, A, D, C)
let tmp1 = self.shuffle(Shuffle::BADC);
// tmp2 = (-A, B, -C, D)
let tmp2 = self.blend(self.negate_lazy(), Lanes::AC);
// (B - A, B + A, D - C, D + C) bounded with b < 1.6
tmp1 + tmp2
}
/// Reduce this vector of field elements \\(\mathrm{mod} p\\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.0002 \\).
#[inline]
pub fn reduce(&self) -> FieldElement2625x4 {
let shifts = i32x8::new(26, 26, 25, 25, 26, 26, 25, 25);
let masks = u32x8::new(
(1 << 26) - 1,
(1 << 26) - 1,
(1 << 25) - 1,
(1 << 25) - 1,
(1 << 26) - 1,
(1 << 26) - 1,
(1 << 25) - 1,
(1 << 25) - 1,
);
// Let c(x) denote the carryout of the coefficient x.
//
// Given ( x0, y0, x1, y1, z0, w0, z1, w1),
// compute (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0)).
//
// The carryouts are bounded by 2^(32 - 25) = 2^7.
let rotated_carryout = |v: u32x8| -> u32x8 {
unsafe {
use core::arch::x86_64::_mm256_srlv_epi32;
use core::arch::x86_64::_mm256_shuffle_epi32;
let c = _mm256_srlv_epi32(v.into_bits(), shifts.into_bits());
_mm256_shuffle_epi32(c, 0b01_00_11_10).into_bits()
}
};
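// (The shuffle immediate 0b01_00_11_10 maps 32-bit lanes
// (0, 1, 2, 3) -> (2, 3, 0, 1) within each 128-bit half, which is
// what swaps the even- and odd-limb carry positions in the closure
// above.)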
// Combine (lo, lo, lo, lo, lo, lo, lo, lo)
// with (hi, hi, hi, hi, hi, hi, hi, hi)
// to (lo, lo, hi, hi, lo, lo, hi, hi)
//
// This allows combining carryouts, e.g.,
//
// lo (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0))
// hi (c(x3), c(y3), c(x2), c(y2), c(z3), c(w3), c(z2), c(w2))
// -> (c(x1), c(y1), c(x2), c(y2), c(z1), c(w1), c(z2), c(w2))
//
// which is exactly the vector of carryins for
//
// ( x2, y2, x3, y3, z2, w2, z3, w3).
//
let combine = |v_lo: u32x8, v_hi: u32x8| -> u32x8 {
unsafe {
use core::arch::x86_64::_mm256_blend_epi32;
_mm256_blend_epi32(v_lo.into_bits(), v_hi.into_bits(), 0b11_00_11_00).into_bits()
}
};
let mut v = self.0;
let c10 = rotated_carryout(v[0]);
v[0] = (v[0] & masks) + combine(u32x8::splat(0), c10);
let c32 = rotated_carryout(v[1]);
v[1] = (v[1] & masks) + combine(c10, c32);
let c54 = rotated_carryout(v[2]);
v[2] = (v[2] & masks) + combine(c32, c54);
let c76 = rotated_carryout(v[3]);
v[3] = (v[3] & masks) + combine(c54, c76);
let c98 = rotated_carryout(v[4]);
v[4] = (v[4] & masks) + combine(c76, c98);
let c9_19: u32x8 = unsafe {
use core::arch::x86_64::_mm256_mul_epu32;
use core::arch::x86_64::_mm256_shuffle_epi32;
// Need to rearrange c98, since vpmuludq uses the low
// 32-bits of each 64-bit lane to compute the product:
//
// c98 = (c(x9), c(y9), c(x8), c(y8), c(z9), c(w9), c(z8), c(w8));
// c9_spread = (c(x9), c(x8), c(y9), c(y8), c(z9), c(z8), c(w9), c(w8)).
let c9_spread = _mm256_shuffle_epi32(c98.into_bits(), 0b11_01_10_00);
// Since the carryouts are bounded by 2^7, their products with 19
// are bounded by 2^11.25. This means that
//
// c9_19_spread = (19*c(x9), 0, 19*c(y9), 0, 19*c(z9), 0, 19*c(w9), 0).
let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into_bits());
// Unshuffle:
// c9_19 = (19*c(x9), 19*c(y9), 0, 0, 19*c(z9), 19*c(w9), 0, 0).
_mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into_bits()
};
// Add the final carryin.
v[0] = v[0] + c9_19;
// Each output coefficient has exactly one carryin, which is
// bounded by 2^11.25, so they are bounded as
//
// c_even < 2^26 + 2^11.25 < 2^26.00006 < 2^{26+b}
// c_odd < 2^25 + 2^11.25 < 2^25.0002 < 2^{25+b}
//
// where b = 0.0002.
FieldElement2625x4(v)
}
/// Given an array of wide coefficients, reduce them to a `FieldElement2625x4`.
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
#[inline]
fn reduce64(mut z: [u64x4; 10]) -> FieldElement2625x4 {
// These aren't const because splat isn't a const fn
let LOW_25_BITS: u64x4 = u64x4::splat((1 << 25) - 1);
let LOW_26_BITS: u64x4 = u64x4::splat((1 << 26) - 1);
// Carry the value from limb i = 0..8 to limb i+1
let carry = |z: &mut [u64x4; 10], i: usize| {
debug_assert!(i < 9);
if i % 2 == 0 {
// Even limbs have 26 bits
z[i + 1] = z[i + 1] + (z[i] >> 26);
z[i] = z[i] & LOW_26_BITS;
} else {
// Odd limbs have 25 bits
z[i + 1] = z[i + 1] + (z[i] >> 25);
z[i] = z[i] & LOW_25_BITS;
}
};
// Perform two halves of the carry chain in parallel.
carry(&mut z, 0); carry(&mut z, 4);
carry(&mut z, 1); carry(&mut z, 5);
carry(&mut z, 2); carry(&mut z, 6);
carry(&mut z, 3); carry(&mut z, 7);
// Since z[3] < 2^64, c < 2^(64-25) = 2^39,
// so z[4] < 2^26 + 2^39 < 2^39.0002
carry(&mut z, 4); carry(&mut z, 8);
// Now z[4] < 2^26
// and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)
// Last carry has a multiplication by 19. In the serial case we
// do a 64-bit multiplication by 19, but here we want to do a
// 32-bit multiplication. However, if we only know z[9] < 2^64,
// the carry is bounded as c < 2^(64-25) = 2^39, which is too
// big. To ensure c < 2^32, we would need z[9] < 2^57.
// Instead, we split the carry in two, with c = c_0 + c_1*2^26.
let c = z[9] >> 25;
z[9] = z[9] & LOW_25_BITS;
let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26;
let mut c1: u64x4 = c >> 26; // c1 < 2^(39-26) = 2^13;
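// Writing the carry as c = c0 + c1*2^26 gives
//     19*c = 19*c0 + (19*c1)*2^26;
// since limb 1 sits 26 bits above limb 0, 19*c0 is added into z[0]
// and 19*c1 into z[1] below, keeping both products within 32 bits.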
unsafe {
use core::arch::x86_64::_mm256_mul_epu32;
let x19 = u64x4::splat(19);
c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25
c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25
}
z[0] = z[0] + c0; // z0 < 2^26 + 2^30.25 < 2^30.33
z[1] = z[1] + c1; // z1 < 2^25 + 2^17.25 < 2^25.0067
carry(&mut z, 0); // z0 < 2^26, z1 < 2^25.0067 + 2^4.33 = 2^25.007
// The output coefficients are bounded with
//
// b = 0.007 for z[1]
// b = 0.0004 for z[5]
// b = 0 for other z[i].
//
// So the packed result is bounded with b = 0.007.
FieldElement2625x4([
repack_pair(z[0].into_bits(), z[1].into_bits()),
repack_pair(z[2].into_bits(), z[3].into_bits()),
repack_pair(z[4].into_bits(), z[5].into_bits()),
repack_pair(z[6].into_bits(), z[7].into_bits()),
repack_pair(z[8].into_bits(), z[9].into_bits()),
])
}
/// Square this field element, and negate the result's \\(D\\) value.
///
/// # Preconditions
///
/// The coefficients of `self` must be bounded with \\( b < 1.5 \\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
pub fn square_and_negate_D(&self) -> FieldElement2625x4 {
#[inline(always)]
fn m(x: u32x8, y: u32x8) -> u64x4 {
use core::arch::x86_64::_mm256_mul_epu32;
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
}
#[inline(always)]
fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
use core::arch::x86_64::_mm256_mul_epu32;
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
}
let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);
let (x0, x1) = unpack_pair(self.0[0]);
let (x2, x3) = unpack_pair(self.0[1]);
let (x4, x5) = unpack_pair(self.0[2]);
let (x6, x7) = unpack_pair(self.0[3]);
let (x8, x9) = unpack_pair(self.0[4]);
let x0_2 = x0 << 1;
let x1_2 = x1 << 1;
let x2_2 = x2 << 1;
let x3_2 = x3 << 1;
let x4_2 = x4 << 1;
let x5_2 = x5 << 1;
let x6_2 = x6 << 1;
let x7_2 = x7 << 1;
let x5_19 = m_lo(v19, x5);
let x6_19 = m_lo(v19, x6);
let x7_19 = m_lo(v19, x7);
let x8_19 = m_lo(v19, x8);
let x9_19 = m_lo(v19, x9);
let mut z0 = m(x0, x0) + m(x2_2,x8_19) + m(x4_2,x6_19) + ((m(x1_2,x9_19) + m(x3_2,x7_19) + m(x5,x5_19)) << 1);
let mut z1 = m(x0_2,x1) + m(x3_2,x8_19) + m(x5_2,x6_19) + ((m(x2,x9_19) + m(x4,x7_19)) << 1);
let mut z2 = m(x0_2,x2) + m(x1_2,x1) + m(x4_2,x8_19) + m(x6,x6_19) + ((m(x3_2,x9_19) + m(x5_2,x7_19)) << 1);
let mut z3 = m(x0_2,x3) + m(x1_2,x2) + m(x5_2,x8_19) + ((m(x4,x9_19) + m(x6,x7_19)) << 1);
let mut z4 = m(x0_2,x4) + m(x1_2,x3_2) + m(x2, x2) + m(x6_2,x8_19) + ((m(x5_2,x9_19) + m(x7,x7_19)) << 1);
let mut z5 = m(x0_2,x5) + m(x1_2,x4) + m(x2_2,x3) + m(x7_2,x8_19) + ((m(x6,x9_19)) << 1);
let mut z6 = m(x0_2,x6) + m(x1_2,x5_2) + m(x2_2,x4) + m(x3_2,x3) + m(x8,x8_19) + ((m(x7_2,x9_19)) << 1);
let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1);
let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1);
let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5);
// The biggest z_i is bounded as z_i < 249*2^(51 + 2*b);
// if b < 1.5 we get z_i < 4485585228861014016.
//
// The limbs of the multiples of p are bounded above by
//
// 0x3ffffff << 37 = 9223371899415822336 < 2^63
//
// and below by
//
// 0x1ffffff << 37 = 4611685880988434432
// > 4485585228861014016
//
// So these multiples of p are big enough to avoid underflow
// in subtraction, and small enough to fit within u64
// with room for a carry.
let low__p37 = u64x4::splat(0x3ffffed << 37);
let even_p37 = u64x4::splat(0x3ffffff << 37);
let odd__p37 = u64x4::splat(0x1ffffff << 37);
let negate_D = |x: u64x4, p: u64x4| -> u64x4 {
unsafe {
use core::arch::x86_64::_mm256_blend_epi32;
_mm256_blend_epi32(x.into_bits(), (p - x).into_bits(), D_LANES64 as i32).into_bits()
}
};
z0 = negate_D(z0, low__p37);
z1 = negate_D(z1, odd__p37);
z2 = negate_D(z2, even_p37);
z3 = negate_D(z3, odd__p37);
z4 = negate_D(z4, even_p37);
z5 = negate_D(z5, odd__p37);
z6 = negate_D(z6, even_p37);
z7 = negate_D(z7, odd__p37);
z8 = negate_D(z8, even_p37);
z9 = negate_D(z9, odd__p37);
FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
}
}
impl Neg for FieldElement2625x4 {
type Output = FieldElement2625x4;
/// Negate this field element, performing a reduction.
///
/// If the coefficients are known to be small, use `negate_lazy`
/// to avoid performing a reduction.
///
/// # Preconditions
///
/// The coefficients of `self` must be bounded with \\( b < 4.0 \\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.0002 \\).
#[inline]
fn neg(self) -> FieldElement2625x4 {
FieldElement2625x4([
P_TIMES_16_LO - self.0[0],
P_TIMES_16_HI - self.0[1],
P_TIMES_16_HI - self.0[2],
P_TIMES_16_HI - self.0[3],
P_TIMES_16_HI - self.0[4],
]).reduce()
}
}
impl Add<FieldElement2625x4> for FieldElement2625x4 {
type Output = FieldElement2625x4;
/// Add two `FieldElement2625x4`s, without performing a reduction.
#[inline]
fn add(self, rhs: FieldElement2625x4) -> FieldElement2625x4 {
FieldElement2625x4([
self.0[0] + rhs.0[0],
self.0[1] + rhs.0[1],
self.0[2] + rhs.0[2],
self.0[3] + rhs.0[3],
self.0[4] + rhs.0[4],
])
}
}
impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 {
type Output = FieldElement2625x4;
/// Perform a multiplication by a vector of small constants.
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
#[inline]
fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 {
unsafe {
use core::arch::x86_64::_mm256_mul_epu32;
let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0);
let (b0, b1) = unpack_pair(self.0[0]);
let (b2, b3) = unpack_pair(self.0[1]);
let (b4, b5) = unpack_pair(self.0[2]);
let (b6, b7) = unpack_pair(self.0[3]);
let (b8, b9) = unpack_pair(self.0[4]);
FieldElement2625x4::reduce64([
_mm256_mul_epu32(b0.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b1.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b2.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b3.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b4.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b5.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b6.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b7.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b8.into_bits(), consts.into_bits()).into_bits(),
_mm256_mul_epu32(b9.into_bits(), consts.into_bits()).into_bits(),
])
}
}
}
impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
type Output = FieldElement2625x4;
/// Multiply `self` by `rhs`.
///
/// # Preconditions
///
/// The coefficients of `self` must be bounded with \\( b < 2.5 \\).
///
/// The coefficients of `rhs` must be bounded with \\( b < 1.75 \\).
///
/// # Postconditions
///
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
///
fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 {
#[inline(always)]
fn m(x: u32x8, y: u32x8) -> u64x4 {
use core::arch::x86_64::_mm256_mul_epu32;
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
}
#[inline(always)]
fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
use core::arch::x86_64::_mm256_mul_epu32;
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
}
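// `_mm256_mul_epu32` multiplies the low 32 bits of each 64-bit lane,
// i.e. the values sitting in the even u32 lanes after `unpack_pair`.
// In `m_lo` the product is assumed to fit in 32 bits (see the bound
// comments on the y*_19 terms below), so reinterpreting the u64x4
// result as u32x8 leaves the product in the even lanes and zeros in
// the odd lanes -- the same layout as an unpacked operand.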
let (x0, x1) = unpack_pair(self.0[0]);
let (x2, x3) = unpack_pair(self.0[1]);
let (x4, x5) = unpack_pair(self.0[2]);
let (x6, x7) = unpack_pair(self.0[3]);
let (x8, x9) = unpack_pair(self.0[4]);
let (y0, y1) = unpack_pair(rhs.0[0]);
let (y2, y3) = unpack_pair(rhs.0[1]);
let (y4, y5) = unpack_pair(rhs.0[2]);
let (y6, y7) = unpack_pair(rhs.0[3]);
let (y8, y9) = unpack_pair(rhs.0[4]);
let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);
let y1_19 = m_lo(v19, y1); // This fits in a u32
let y2_19 = m_lo(v19, y2); // iff 26 + b + lg(19) < 32
let y3_19 = m_lo(v19, y3); // if b < 32 - 26 - 4.248 = 1.752
let y4_19 = m_lo(v19, y4);
let y5_19 = m_lo(v19, y5);
let y6_19 = m_lo(v19, y6);
let y7_19 = m_lo(v19, y7);
let y8_19 = m_lo(v19, y8);
let y9_19 = m_lo(v19, y9);
let x1_2 = x1 + x1; // This fits in a u32 iff 25 + b + 1 < 32
let x3_2 = x3 + x3; // iff b < 6
let x5_2 = x5 + x5;
let x7_2 = x7 + x7;
let x9_2 = x9 + x9;
let z0 = m(x0,y0) + m(x1_2,y9_19) + m(x2,y8_19) + m(x3_2,y7_19) + m(x4,y6_19) + m(x5_2,y5_19) + m(x6,y4_19) + m(x7_2,y3_19) + m(x8,y2_19) + m(x9_2,y1_19);
let z1 = m(x0,y1) + m(x1,y0) + m(x2,y9_19) + m(x3,y8_19) + m(x4,y7_19) + m(x5,y6_19) + m(x6,y5_19) + m(x7,y4_19) + m(x8,y3_19) + m(x9,y2_19);
let z2 = m(x0,y2) + m(x1_2,y1) + m(x2,y0) + m(x3_2,y9_19) + m(x4,y8_19) + m(x5_2,y7_19) + m(x6,y6_19) + m(x7_2,y5_19) + m(x8,y4_19) + m(x9_2,y3_19);
let z3 = m(x0,y3) + m(x1,y2) + m(x2,y1) + m(x3,y0) + m(x4,y9_19) + m(x5,y8_19) + m(x6,y7_19) + m(x7,y6_19) + m(x8,y5_19) + m(x9,y4_19);
let z4 = m(x0,y4) + m(x1_2,y3) + m(x2,y2) + m(x3_2,y1) + m(x4,y0) + m(x5_2,y9_19) + m(x6,y8_19) + m(x7_2,y7_19) + m(x8,y6_19) + m(x9_2,y5_19);
let z5 = m(x0,y5) + m(x1,y4) + m(x2,y3) + m(x3,y2) + m(x4,y1) + m(x5,y0) + m(x6,y9_19) + m(x7,y8_19) + m(x8,y7_19) + m(x9,y6_19);
let z6 = m(x0,y6) + m(x1_2,y5) + m(x2,y4) + m(x3_2,y3) + m(x4,y2) + m(x5_2,y1) + m(x6,y0) + m(x7_2,y9_19) + m(x8,y8_19) + m(x9_2,y7_19);
let z7 = m(x0,y7) + m(x1,y6) + m(x2,y5) + m(x3,y4) + m(x4,y3) + m(x5,y2) + m(x6,y1) + m(x7,y0) + m(x8,y9_19) + m(x9,y8_19);
let z8 = m(x0,y8) + m(x1_2,y7) + m(x2,y6) + m(x3_2,y5) + m(x4,y4) + m(x5_2,y3) + m(x6,y2) + m(x7_2,y1) + m(x8,y0) + m(x9_2,y9_19);
let z9 = m(x0,y9) + m(x1,y8) + m(x2,y7) + m(x3,y6) + m(x4,y5) + m(x5,y4) + m(x6,y3) + m(x7,y2) + m(x8,y1) + m(x9,y0);
// The bounds on z[i] are the same as in the serial 32-bit code
// and the comment below is copied from there:
// How big is the contribution to z[i+j] from x[i], y[j]?
//
// Using the bounds above, we get:
//
// i even, j even: x[i]*y[j] < 2^(26+b)*2^(26+b) = 2*2^(51+2*b)
// i odd, j even: x[i]*y[j] < 2^(25+b)*2^(26+b) = 1*2^(51+2*b)
// i even, j odd: x[i]*y[j] < 2^(26+b)*2^(25+b) = 1*2^(51+2*b)
// i odd, j odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
//
// We perform inline reduction mod p by replacing 2^255 by 19
// (since 2^255 - 19 = 0 mod p). This adds a factor of 19, so
// we get the bounds (z0 is the biggest one, but calculated for
// posterity here in case finer estimation is needed later):
//
// z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
// z1 < ( 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
// z2 < ( 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
// z3 < ( 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
// z4 < ( 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
// z5 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 82*2^(51 + 2*b)
// z6 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 87*2^(51 + 2*b)
// z7 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 )*2^(51 + 2b) = 46*2^(51 + 2*b)
// z8 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 )*2^(51 + 2b) = 33*2^(51 + 2*b)
// z9 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 )*2^(51 + 2b) = 10*2^(51 + 2*b)
//
// So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64
// if b < 2.5.
// In fact this bound is slightly sloppy, since it treats both
// inputs x and y as being bounded by the same parameter b,
// while they are in fact bounded by b_x and b_y, and we
// already require that b_y < 1.75 in order to fit the
// multiplications by 19 into a u32. The tighter bound on b_y
// means we could get a tighter bound on the outputs, or a
// looser bound on b_x.
FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn scale_by_curve_constants() {
let mut x = FieldElement2625x4::splat(&FieldElement51::one());
x = x * (121666, 121666, 2*121666, 2*121665);
let xs = x.split();
assert_eq!(xs[0], FieldElement51([121666, 0, 0, 0, 0]));
assert_eq!(xs[1], FieldElement51([121666, 0, 0, 0, 0]));
assert_eq!(xs[2], FieldElement51([2 * 121666, 0, 0, 0, 0]));
assert_eq!(xs[3], FieldElement51([2 * 121665, 0, 0, 0, 0]));
}
#[test]
fn diff_sum_vs_serial() {
let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]);
let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]);
let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]);
let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]);
let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3).diff_sum();
let result = vec.split();
assert_eq!(result[0], &x1 - &x0);
assert_eq!(result[1], &x1 + &x0);
assert_eq!(result[2], &x3 - &x2);
assert_eq!(result[3], &x3 + &x2);
}
#[test]
fn square_vs_serial() {
let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]);
let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]);
let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]);
let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]);
let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
let result = vec.square_and_negate_D().split();
assert_eq!(result[0], &x0 * &x0);
assert_eq!(result[1], &x1 * &x1);
assert_eq!(result[2], &x2 * &x2);
assert_eq!(result[3], -&(&x3 * &x3));
}
#[test]
fn multiply_vs_serial() {
let x0 = FieldElement51([10000, 10001, 10002, 10003, 10004]);
let x1 = FieldElement51([10100, 10101, 10102, 10103, 10104]);
let x2 = FieldElement51([10200, 10201, 10202, 10203, 10204]);
let x3 = FieldElement51([10300, 10301, 10302, 10303, 10304]);
let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
let vecprime = vec.clone();
let result = (&vec * &vecprime).split();
assert_eq!(result[0], &x0 * &x0);
assert_eq!(result[1], &x1 * &x1);
assert_eq!(result[2], &x2 * &x2);
assert_eq!(result[3], &x3 * &x3);
}
#[test]
fn test_unpack_repack_pair() {
let x0 = FieldElement51([10000 + (10001 << 26), 0, 0, 0, 0]);
let x1 = FieldElement51([10100 + (10101 << 26), 0, 0, 0, 0]);
let x2 = FieldElement51([10200 + (10201 << 26), 0, 0, 0, 0]);
let x3 = FieldElement51([10300 + (10301 << 26), 0, 0, 0, 0]);
let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
let src = vec.0[0];
let (a, b) = unpack_pair(src);
let expected_a = u32x8::new(10000, 0, 10100, 0, 10200, 0, 10300, 0);
let expected_b = u32x8::new(10001, 0, 10101, 0, 10201, 0, 10301, 0);
assert_eq!(a, expected_a);
assert_eq!(b, expected_b);
let expected_src = repack_pair(a, b);
assert_eq!(src, expected_src);
}
#[test]
fn new_split_roundtrips() {
let x0 = FieldElement51::from_bytes(&[0x10; 32]);
let x1 = FieldElement51::from_bytes(&[0x11; 32]);
let x2 = FieldElement51::from_bytes(&[0x12; 32]);
let x3 = FieldElement51::from_bytes(&[0x13; 32]);
let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
let splits = vec.split();
assert_eq!(x0, splits[0]);
assert_eq!(x1, splits[1]);
assert_eq!(x2, splits[2]);
assert_eq!(x3, splits[3]);
}
}

View File

@ -1,20 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![cfg_attr(
feature = "nightly",
doc(include = "../../../../docs/avx2-notes.md")
)]
pub(crate) mod field;
pub(crate) mod edwards;
pub(crate) mod constants;

View File

@ -1,315 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2018-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use traits::Identity;
use std::ops::{Add, Neg, Sub};
use subtle::Choice;
use subtle::ConditionallySelectable;
use edwards;
use window::{LookupTable, NafLookupTable5, NafLookupTable8};
use super::constants;
use super::field::{F51x4Reduced, F51x4Unreduced, Lanes, Shuffle};
#[derive(Copy, Clone, Debug)]
pub struct ExtendedPoint(pub(super) F51x4Unreduced);
#[derive(Copy, Clone, Debug)]
pub struct CachedPoint(pub(super) F51x4Reduced);
impl From<edwards::EdwardsPoint> for ExtendedPoint {
fn from(P: edwards::EdwardsPoint) -> ExtendedPoint {
ExtendedPoint(F51x4Unreduced::new(&P.X, &P.Y, &P.Z, &P.T))
}
}
impl From<ExtendedPoint> for edwards::EdwardsPoint {
fn from(P: ExtendedPoint) -> edwards::EdwardsPoint {
let reduced = F51x4Reduced::from(P.0);
let tmp = F51x4Unreduced::from(reduced).split();
edwards::EdwardsPoint {
X: tmp[0],
Y: tmp[1],
Z: tmp[2],
T: tmp[3],
}
}
}
impl From<ExtendedPoint> for CachedPoint {
fn from(P: ExtendedPoint) -> CachedPoint {
let mut x = P.0;
x = x.blend(&x.diff_sum(), Lanes::AB);
x = &F51x4Reduced::from(x) * (121666, 121666, 2 * 121666, 2 * 121665);
x = x.blend(&x.negate_lazy(), Lanes::D);
CachedPoint(F51x4Reduced::from(x))
}
}
impl Default for ExtendedPoint {
fn default() -> ExtendedPoint {
ExtendedPoint::identity()
}
}
impl Identity for ExtendedPoint {
fn identity() -> ExtendedPoint {
constants::EXTENDEDPOINT_IDENTITY
}
}
impl ExtendedPoint {
pub fn double(&self) -> ExtendedPoint {
// (Y1 X1 T1 Z1) -- uses vpshufd (1c latency @ 1/c)
let mut tmp0 = self.0.shuffle(Shuffle::BADC);
// (X1+Y1 X1+Y1 X1+Y1 X1+Y1) -- can use vpinserti128
let mut tmp1 = (self.0 + tmp0).shuffle(Shuffle::ABAB);
// (X1 Y1 Z1 X1+Y1)
tmp0 = self.0.blend(&tmp1, Lanes::D);
tmp1 = F51x4Reduced::from(tmp0).square();
// Now tmp1 = (S1 S2 S3 S4)
// We want to compute
//
// +   | S1 | S1 | S1 | S1 |
// +   | S2 |    |    | S2 |
// +   |    |    | S3 |    |
// +   |    |    | S3 |    |
// +   |    |16p |16p |16p |
// -   |    | S2 | S2 |    |
// -   |    |    |    | S4 |
// =========================
//       S5   S6   S8   S9
let zero = F51x4Unreduced::zero();
let S1_S1_S1_S1 = tmp1.shuffle(Shuffle::AAAA);
let S2_S2_S2_S2 = tmp1.shuffle(Shuffle::BBBB);
let S2_S2_S2_S4 = S2_S2_S2_S2.blend(&tmp1, Lanes::D).negate_lazy();
tmp0 = S1_S1_S1_S1 + zero.blend(&(tmp1 + tmp1), Lanes::C);
tmp0 = tmp0 + zero.blend(&S2_S2_S2_S2, Lanes::AD);
tmp0 = tmp0 + zero.blend(&S2_S2_S2_S4, Lanes::BCD);
let tmp2 = F51x4Reduced::from(tmp0);
ExtendedPoint(&tmp2.shuffle(Shuffle::DBBD) * &tmp2.shuffle(Shuffle::CACA))
}
pub fn mul_by_pow_2(&self, k: u32) -> ExtendedPoint {
let mut tmp: ExtendedPoint = *self;
for _ in 0..k {
tmp = tmp.double();
}
tmp
}
}
impl<'a, 'b> Add<&'b CachedPoint> for &'a ExtendedPoint {
type Output = ExtendedPoint;
/// Add an `ExtendedPoint` and a `CachedPoint`.
fn add(self, other: &'b CachedPoint) -> ExtendedPoint {
let mut tmp = self.0;
tmp = tmp.blend(&tmp.diff_sum(), Lanes::AB);
// tmp = (Y1-X1 Y1+X1 Z1 T1) = (S0 S1 Z1 T1)
tmp = &F51x4Reduced::from(tmp) * &other.0;
// tmp = (S0*S2' S1*S3' Z1*Z2' T1*T2') = (S8 S9 S10 S11)
tmp = tmp.shuffle(Shuffle::ABDC);
// tmp = (S8 S9 S11 S10)
let tmp = F51x4Reduced::from(tmp.diff_sum());
// tmp = (S9-S8 S9+S8 S10-S11 S10+S11) = (S12 S13 S14 S15)
let t0 = tmp.shuffle(Shuffle::ADDA);
// t0 = (S12 S15 S15 S12)
let t1 = tmp.shuffle(Shuffle::CBCB);
// t1 = (S14 S13 S14 S13)
// Return (S12*S14 S15*S13 S15*S14 S12*S13) = (X3 Y3 Z3 T3)
ExtendedPoint(&t0 * &t1)
}
}
impl Default for CachedPoint {
fn default() -> CachedPoint {
CachedPoint::identity()
}
}
impl Identity for CachedPoint {
fn identity() -> CachedPoint {
constants::CACHEDPOINT_IDENTITY
}
}
impl ConditionallySelectable for CachedPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
CachedPoint(F51x4Reduced::conditional_select(&a.0, &b.0, choice))
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.0.conditional_assign(&other.0, choice);
}
}
impl<'a> Neg for &'a CachedPoint {
type Output = CachedPoint;
fn neg(self) -> CachedPoint {
let swapped = self.0.shuffle(Shuffle::BACD);
CachedPoint(swapped.blend(&(-self.0), Lanes::D))
}
}
impl<'a, 'b> Sub<&'b CachedPoint> for &'a ExtendedPoint {
type Output = ExtendedPoint;
/// Implement subtraction by negating the point and adding.
fn sub(self, other: &'b CachedPoint) -> ExtendedPoint {
self + &(-other)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for LookupTable<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let P = ExtendedPoint::from(*point);
let mut points = [CachedPoint::from(P); 8];
for i in 0..7 {
points[i + 1] = (&P + &points[i]).into();
}
LookupTable(points)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable5<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let A = ExtendedPoint::from(*point);
let mut Ai = [CachedPoint::from(A); 8];
let A2 = A.double();
for i in 0..7 {
Ai[i + 1] = (&A2 + &Ai[i]).into();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
NafLookupTable5(Ai)
}
}
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable8<CachedPoint> {
fn from(point: &'a edwards::EdwardsPoint) -> Self {
let A = ExtendedPoint::from(*point);
let mut Ai = [CachedPoint::from(A); 64];
let A2 = A.double();
for i in 0..63 {
Ai[i + 1] = (&A2 + &Ai[i]).into();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
NafLookupTable8(Ai)
}
}
#[cfg(test)]
mod test {
use super::*;
fn addition_test_helper(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) {
// Test the serial implementation of the parallel addition formulas
//let R_serial: edwards::EdwardsPoint = serial_add(P.into(), Q.into()).into();
// Test the vector implementation of the parallel readdition formulas
let cached_Q = CachedPoint::from(ExtendedPoint::from(Q));
let R_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) + &cached_Q).into();
let S_vector: edwards::EdwardsPoint = (&ExtendedPoint::from(P) - &cached_Q).into();
println!("Testing point addition:");
println!("P = {:?}", P);
println!("Q = {:?}", Q);
println!("cached Q = {:?}", cached_Q);
println!("R = P + Q = {:?}", &P + &Q);
//println!("R_serial = {:?}", R_serial);
println!("R_vector = {:?}", R_vector);
println!("S = P - Q = {:?}", &P - &Q);
println!("S_vector = {:?}", S_vector);
//assert_eq!(R_serial.compress(), (&P + &Q).compress());
assert_eq!(R_vector.compress(), (&P + &Q).compress());
assert_eq!(S_vector.compress(), (&P - &Q).compress());
println!("OK!\n");
}
#[test]
fn vector_addition_vs_serial_addition_vs_edwards_extendedpoint() {
use constants;
use scalar::Scalar;
println!("Testing id +- id");
let P = edwards::EdwardsPoint::identity();
let Q = edwards::EdwardsPoint::identity();
addition_test_helper(P, Q);
println!("Testing id +- B");
let P = edwards::EdwardsPoint::identity();
let Q = constants::ED25519_BASEPOINT_POINT;
addition_test_helper(P, Q);
println!("Testing B +- B");
let P = constants::ED25519_BASEPOINT_POINT;
let Q = constants::ED25519_BASEPOINT_POINT;
addition_test_helper(P, Q);
println!("Testing B +- kB");
let P = constants::ED25519_BASEPOINT_POINT;
let Q = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
addition_test_helper(P, Q);
}
fn doubling_test_helper(P: edwards::EdwardsPoint) {
//let R1: edwards::EdwardsPoint = serial_double(P.into()).into();
let R2: edwards::EdwardsPoint = ExtendedPoint::from(P).double().into();
println!("Testing point doubling:");
println!("P = {:?}", P);
//println!("(serial) R1 = {:?}", R1);
println!("(vector) R2 = {:?}", R2);
println!("P + P = {:?}", &P + &P);
//assert_eq!(R1.compress(), (&P + &P).compress());
assert_eq!(R2.compress(), (&P + &P).compress());
println!("OK!\n");
}
#[test]
fn vector_doubling_vs_serial_doubling_vs_edwards_extendedpoint() {
use constants;
use scalar::Scalar;
println!("Testing [2]id");
let P = edwards::EdwardsPoint::identity();
doubling_test_helper(P);
println!("Testing [2]B");
let P = constants::ED25519_BASEPOINT_POINT;
doubling_test_helper(P);
println!("Testing [2]([k]B)");
let P = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
doubling_test_helper(P);
}
}

View File

@ -1,824 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2018-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use core::ops::{Add, Mul, Neg};
use packed_simd::{u64x4, IntoBits};
use backend::serial::u64::field::FieldElement51;
/// A wrapper around `vpmadd52luq` that works on `u64x4`: per 64-bit lane,
/// adds the low 52 bits of the 104-bit product of the 52-bit inputs to `z`.
#[inline(always)]
unsafe fn madd52lo(z: u64x4, x: u64x4, y: u64x4) -> u64x4 {
use core::arch::x86_64::_mm256_madd52lo_epu64;
_mm256_madd52lo_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits()
}
/// A wrapper around `vpmadd52huq` that works on `u64x4`: per 64-bit lane,
/// adds the high 52 bits of the 104-bit product of the 52-bit inputs to `z`.
#[inline(always)]
unsafe fn madd52hi(z: u64x4, x: u64x4, y: u64x4) -> u64x4 {
use core::arch::x86_64::_mm256_madd52hi_epu64;
_mm256_madd52hi_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits()
}
/// A vector of four field elements in radix 2^51, with unreduced coefficients.
#[derive(Copy, Clone, Debug)]
pub struct F51x4Unreduced(pub(crate) [u64x4; 5]);
/// A vector of four field elements in radix 2^51, with reduced coefficients.
#[derive(Copy, Clone, Debug)]
pub struct F51x4Reduced(pub(crate) [u64x4; 5]);
#[derive(Copy, Clone)]
pub enum Shuffle {
AAAA,
BBBB,
BADC,
BACD,
ADDA,
CBCB,
ABDC,
ABAB,
DBBD,
CACA,
}
#[inline(always)]
fn shuffle_lanes(x: u64x4, control: Shuffle) -> u64x4 {
unsafe {
use core::arch::x86_64::_mm256_permute4x64_epi64 as perm;
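// vpermq reads its immediate as four 2-bit fields, one per 64-bit
// output lane (lowest field = lane 0). E.g. 0b10_11_00_01 produces
// (x1, x0, x3, x2), i.e. (B, A, D, C) from x = (A, B, C, D).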
match control {
Shuffle::AAAA => perm(x.into_bits(), 0b00_00_00_00).into_bits(),
Shuffle::BBBB => perm(x.into_bits(), 0b01_01_01_01).into_bits(),
Shuffle::BADC => perm(x.into_bits(), 0b10_11_00_01).into_bits(),
Shuffle::BACD => perm(x.into_bits(), 0b11_10_00_01).into_bits(),
Shuffle::ADDA => perm(x.into_bits(), 0b00_11_11_00).into_bits(),
Shuffle::CBCB => perm(x.into_bits(), 0b01_10_01_10).into_bits(),
Shuffle::ABDC => perm(x.into_bits(), 0b10_11_01_00).into_bits(),
Shuffle::ABAB => perm(x.into_bits(), 0b01_00_01_00).into_bits(),
Shuffle::DBBD => perm(x.into_bits(), 0b11_01_01_11).into_bits(),
Shuffle::CACA => perm(x.into_bits(), 0b00_10_00_10).into_bits(),
}
}
}
#[derive(Copy, Clone)]
pub enum Lanes {
D,
C,
AB,
AC,
AD,
BCD,
}
#[inline]
fn blend_lanes(x: u64x4, y: u64x4, control: Lanes) -> u64x4 {
unsafe {
use core::arch::x86_64::_mm256_blend_epi32 as blend;
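// `_mm256_blend_epi32` selects 32-bit lanes, so each 64-bit field
// element lane takes a pair of mask bits. E.g. Lanes::D uses
// 0b11_00_00_00: the top 64-bit lane (D) comes from y, the rest from x.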
match control {
Lanes::D => blend(x.into_bits(), y.into_bits(), 0b11_00_00_00).into_bits(),
Lanes::C => blend(x.into_bits(), y.into_bits(), 0b00_11_00_00).into_bits(),
Lanes::AB => blend(x.into_bits(), y.into_bits(), 0b00_00_11_11).into_bits(),
Lanes::AC => blend(x.into_bits(), y.into_bits(), 0b00_11_00_11).into_bits(),
Lanes::AD => blend(x.into_bits(), y.into_bits(), 0b11_00_00_11).into_bits(),
Lanes::BCD => blend(x.into_bits(), y.into_bits(), 0b11_11_11_00).into_bits(),
}
}
}
impl F51x4Unreduced {
pub fn zero() -> F51x4Unreduced {
F51x4Unreduced([u64x4::splat(0); 5])
}
pub fn new(
x0: &FieldElement51,
x1: &FieldElement51,
x2: &FieldElement51,
x3: &FieldElement51,
) -> F51x4Unreduced {
F51x4Unreduced([
u64x4::new(x0.0[0], x1.0[0], x2.0[0], x3.0[0]),
u64x4::new(x0.0[1], x1.0[1], x2.0[1], x3.0[1]),
u64x4::new(x0.0[2], x1.0[2], x2.0[2], x3.0[2]),
u64x4::new(x0.0[3], x1.0[3], x2.0[3], x3.0[3]),
u64x4::new(x0.0[4], x1.0[4], x2.0[4], x3.0[4]),
])
}
pub fn split(&self) -> [FieldElement51; 4] {
let x = &self.0;
[
FieldElement51([
x[0].extract(0),
x[1].extract(0),
x[2].extract(0),
x[3].extract(0),
x[4].extract(0),
]),
FieldElement51([
x[0].extract(1),
x[1].extract(1),
x[2].extract(1),
x[3].extract(1),
x[4].extract(1),
]),
FieldElement51([
x[0].extract(2),
x[1].extract(2),
x[2].extract(2),
x[3].extract(2),
x[4].extract(2),
]),
FieldElement51([
x[0].extract(3),
x[1].extract(3),
x[2].extract(3),
x[3].extract(3),
x[4].extract(3),
]),
]
}
#[inline]
pub fn diff_sum(&self) -> F51x4Unreduced {
// tmp1 = (B, A, D, C)
let tmp1 = self.shuffle(Shuffle::BADC);
// tmp2 = (-A, B, -C, D)
let tmp2 = self.blend(&self.negate_lazy(), Lanes::AC);
// (B - A, B + A, D - C, D + C)
tmp1 + tmp2
}
#[inline]
pub fn negate_lazy(&self) -> F51x4Unreduced {
let lo = u64x4::splat(36028797018963664u64);
let hi = u64x4::splat(36028797018963952u64);
F51x4Unreduced([
lo - self.0[0],
hi - self.0[1],
hi - self.0[2],
hi - self.0[3],
hi - self.0[4],
])
}
#[inline]
pub fn shuffle(&self, control: Shuffle) -> F51x4Unreduced {
F51x4Unreduced([
shuffle_lanes(self.0[0], control),
shuffle_lanes(self.0[1], control),
shuffle_lanes(self.0[2], control),
shuffle_lanes(self.0[3], control),
shuffle_lanes(self.0[4], control),
])
}
#[inline]
pub fn blend(&self, other: &F51x4Unreduced, control: Lanes) -> F51x4Unreduced {
F51x4Unreduced([
blend_lanes(self.0[0], other.0[0], control),
blend_lanes(self.0[1], other.0[1], control),
blend_lanes(self.0[2], other.0[2], control),
blend_lanes(self.0[3], other.0[3], control),
blend_lanes(self.0[4], other.0[4], control),
])
}
}
impl Neg for F51x4Reduced {
type Output = F51x4Reduced;
fn neg(self) -> F51x4Reduced {
F51x4Unreduced::from(self).negate_lazy().into()
}
}
use subtle::Choice;
use subtle::ConditionallySelectable;
impl ConditionallySelectable for F51x4Reduced {
#[inline]
fn conditional_select(a: &F51x4Reduced, b: &F51x4Reduced, choice: Choice) -> F51x4Reduced {
let mask = (-(choice.unwrap_u8() as i64)) as u64;
let mask_vec = u64x4::splat(mask);
F51x4Reduced([
a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
])
}
#[inline]
fn conditional_assign(&mut self, other: &F51x4Reduced, choice: Choice) {
let mask = (-(choice.unwrap_u8() as i64)) as u64;
let mask_vec = u64x4::splat(mask);
self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
}
}
impl F51x4Reduced {
#[inline]
pub fn shuffle(&self, control: Shuffle) -> F51x4Reduced {
F51x4Reduced([
shuffle_lanes(self.0[0], control),
shuffle_lanes(self.0[1], control),
shuffle_lanes(self.0[2], control),
shuffle_lanes(self.0[3], control),
shuffle_lanes(self.0[4], control),
])
}
#[inline]
pub fn blend(&self, other: &F51x4Reduced, control: Lanes) -> F51x4Reduced {
F51x4Reduced([
blend_lanes(self.0[0], other.0[0], control),
blend_lanes(self.0[1], other.0[1], control),
blend_lanes(self.0[2], other.0[2], control),
blend_lanes(self.0[3], other.0[3], control),
blend_lanes(self.0[4], other.0[4], control),
])
}
#[inline]
pub fn square(&self) -> F51x4Unreduced {
unsafe {
let x = &self.0;
// Represent values with coeff. 2
let mut z0_2 = u64x4::splat(0);
let mut z1_2 = u64x4::splat(0);
let mut z2_2 = u64x4::splat(0);
let mut z3_2 = u64x4::splat(0);
let mut z4_2 = u64x4::splat(0);
let mut z5_2 = u64x4::splat(0);
let mut z6_2 = u64x4::splat(0);
let mut z7_2 = u64x4::splat(0);
let mut z9_2 = u64x4::splat(0);
// Represent values with coeff. 4
let mut z2_4 = u64x4::splat(0);
let mut z3_4 = u64x4::splat(0);
let mut z4_4 = u64x4::splat(0);
let mut z5_4 = u64x4::splat(0);
let mut z6_4 = u64x4::splat(0);
let mut z7_4 = u64x4::splat(0);
let mut z8_4 = u64x4::splat(0);
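// (Terms are accumulated with coefficient 1, 2, or 4 in separate
// accumulators; the coeff-4 sums are folded in early via `<< 2`, and
// the coeff-2 sums are doubled later (`<< 1`, or added twice in the
// final sums), so each product ends up with its correct coefficient.)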
let mut z0_1 = u64x4::splat(0);
z0_1 = madd52lo(z0_1, x[0], x[0]);
let mut z1_1 = u64x4::splat(0);
z1_2 = madd52lo(z1_2, x[0], x[1]);
z1_2 = madd52hi(z1_2, x[0], x[0]);
z2_4 = madd52hi(z2_4, x[0], x[1]);
let mut z2_1 = z2_4 << 2;
z2_2 = madd52lo(z2_2, x[0], x[2]);
z2_1 = madd52lo(z2_1, x[1], x[1]);
z3_4 = madd52hi(z3_4, x[0], x[2]);
let mut z3_1 = z3_4 << 2;
z3_2 = madd52lo(z3_2, x[1], x[2]);
z3_2 = madd52lo(z3_2, x[0], x[3]);
z3_2 = madd52hi(z3_2, x[1], x[1]);
z4_4 = madd52hi(z4_4, x[1], x[2]);
z4_4 = madd52hi(z4_4, x[0], x[3]);
let mut z4_1 = z4_4 << 2;
z4_2 = madd52lo(z4_2, x[1], x[3]);
z4_2 = madd52lo(z4_2, x[0], x[4]);
z4_1 = madd52lo(z4_1, x[2], x[2]);
z5_4 = madd52hi(z5_4, x[1], x[3]);
z5_4 = madd52hi(z5_4, x[0], x[4]);
let mut z5_1 = z5_4 << 2;
z5_2 = madd52lo(z5_2, x[2], x[3]);
z5_2 = madd52lo(z5_2, x[1], x[4]);
z5_2 = madd52hi(z5_2, x[2], x[2]);
z6_4 = madd52hi(z6_4, x[2], x[3]);
z6_4 = madd52hi(z6_4, x[1], x[4]);
let mut z6_1 = z6_4 << 2;
z6_2 = madd52lo(z6_2, x[2], x[4]);
z6_1 = madd52lo(z6_1, x[3], x[3]);
z7_4 = madd52hi(z7_4, x[2], x[4]);
let mut z7_1 = z7_4 << 2;
z7_2 = madd52lo(z7_2, x[3], x[4]);
z7_2 = madd52hi(z7_2, x[3], x[3]);
z8_4 = madd52hi(z8_4, x[3], x[4]);
let mut z8_1 = z8_4 << 2;
z8_1 = madd52lo(z8_1, x[4], x[4]);
let mut z9_1 = u64x4::splat(0);
z9_2 = madd52hi(z9_2, x[4], x[4]);
z5_1 += z5_2 << 1;
z6_1 += z6_2 << 1;
z7_1 += z7_2 << 1;
z9_1 += z9_2 << 1;
let mut t0 = u64x4::splat(0);
let mut t1 = u64x4::splat(0);
let r19 = u64x4::splat(19);
t0 = madd52hi(t0, r19, z9_1);
t1 = madd52lo(t1, r19, z9_1 >> 52);
z4_2 = madd52lo(z4_2, r19, z8_1 >> 52);
z3_2 = madd52lo(z3_2, r19, z7_1 >> 52);
z2_2 = madd52lo(z2_2, r19, z6_1 >> 52);
z1_2 = madd52lo(z1_2, r19, z5_1 >> 52);
z0_2 = madd52lo(z0_2, r19, t0 + t1);
z1_2 = madd52hi(z1_2, r19, z5_1);
z2_2 = madd52hi(z2_2, r19, z6_1);
z3_2 = madd52hi(z3_2, r19, z7_1);
z4_2 = madd52hi(z4_2, r19, z8_1);
z0_1 = madd52lo(z0_1, r19, z5_1);
z1_1 = madd52lo(z1_1, r19, z6_1);
z2_1 = madd52lo(z2_1, r19, z7_1);
z3_1 = madd52lo(z3_1, r19, z8_1);
z4_1 = madd52lo(z4_1, r19, z9_1);
F51x4Unreduced([
z0_1 + z0_2 + z0_2,
z1_1 + z1_2 + z1_2,
z2_1 + z2_2 + z2_2,
z3_1 + z3_2 + z3_2,
z4_1 + z4_2 + z4_2,
])
}
}
}
impl From<F51x4Reduced> for F51x4Unreduced {
#[inline]
fn from(x: F51x4Reduced) -> F51x4Unreduced {
F51x4Unreduced(x.0)
}
}
impl From<F51x4Unreduced> for F51x4Reduced {
#[inline]
fn from(x: F51x4Unreduced) -> F51x4Reduced {
let mask = u64x4::splat((1 << 51) - 1);
let r19 = u64x4::splat(19);
// Compute carryouts in parallel
let c0 = x.0[0] >> 51;
let c1 = x.0[1] >> 51;
let c2 = x.0[2] >> 51;
let c3 = x.0[3] >> 51;
let c4 = x.0[4] >> 51;
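// The carry out of limb 4 has weight 2^(5*51) = 2^255 = 19 (mod p),
// so c4 is folded back into limb 0 with a multiplication by 19
// (the madd52lo below); the other carries just shift up one limb.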
unsafe {
F51x4Reduced([
madd52lo(x.0[0] & mask, c4, r19),
(x.0[1] & mask) + c0,
(x.0[2] & mask) + c1,
(x.0[3] & mask) + c2,
(x.0[4] & mask) + c3,
])
}
}
}
impl Add<F51x4Unreduced> for F51x4Unreduced {
type Output = F51x4Unreduced;
#[inline]
fn add(self, rhs: F51x4Unreduced) -> F51x4Unreduced {
F51x4Unreduced([
self.0[0] + rhs.0[0],
self.0[1] + rhs.0[1],
self.0[2] + rhs.0[2],
self.0[3] + rhs.0[3],
self.0[4] + rhs.0[4],
])
}
}
impl<'a> Mul<(u32, u32, u32, u32)> for &'a F51x4Reduced {
type Output = F51x4Unreduced;
#[inline]
fn mul(self, scalars: (u32, u32, u32, u32)) -> F51x4Unreduced {
unsafe {
let x = &self.0;
let y = u64x4::new(
scalars.0 as u64,
scalars.1 as u64,
scalars.2 as u64,
scalars.3 as u64,
);
let r19 = u64x4::splat(19);
let mut z0_1 = u64x4::splat(0);
let mut z1_1 = u64x4::splat(0);
let mut z2_1 = u64x4::splat(0);
let mut z3_1 = u64x4::splat(0);
let mut z4_1 = u64x4::splat(0);
let mut z1_2 = u64x4::splat(0);
let mut z2_2 = u64x4::splat(0);
let mut z3_2 = u64x4::splat(0);
let mut z4_2 = u64x4::splat(0);
let mut z5_2 = u64x4::splat(0);
// Wave 0
z4_2 = madd52hi(z4_2, y, x[3]);
z5_2 = madd52hi(z5_2, y, x[4]);
z4_1 = madd52lo(z4_1, y, x[4]);
z0_1 = madd52lo(z0_1, y, x[0]);
z3_1 = madd52lo(z3_1, y, x[3]);
z2_1 = madd52lo(z2_1, y, x[2]);
z1_1 = madd52lo(z1_1, y, x[1]);
z3_2 = madd52hi(z3_2, y, x[2]);
// Wave 2
z2_2 = madd52hi(z2_2, y, x[1]);
z1_2 = madd52hi(z1_2, y, x[0]);
z0_1 = madd52lo(z0_1, z5_2 + z5_2, r19);
F51x4Unreduced([
z0_1,
z1_1 + z1_2 + z1_2,
z2_1 + z2_2 + z2_2,
z3_1 + z3_2 + z3_2,
z4_1 + z4_2 + z4_2,
])
}
}
}
impl<'a, 'b> Mul<&'b F51x4Reduced> for &'a F51x4Reduced {
type Output = F51x4Unreduced;
#[inline]
fn mul(self, rhs: &'b F51x4Reduced) -> F51x4Unreduced {
unsafe {
// Inputs
let x = &self.0;
let y = &rhs.0;
// Accumulators for terms with coeff 1
let mut z0_1 = u64x4::splat(0);
let mut z1_1 = u64x4::splat(0);
let mut z2_1 = u64x4::splat(0);
let mut z3_1 = u64x4::splat(0);
let mut z4_1 = u64x4::splat(0);
let mut z5_1 = u64x4::splat(0);
let mut z6_1 = u64x4::splat(0);
let mut z7_1 = u64x4::splat(0);
let mut z8_1 = u64x4::splat(0);
// Accumulators for terms with coeff 2
let mut z0_2 = u64x4::splat(0);
let mut z1_2 = u64x4::splat(0);
let mut z2_2 = u64x4::splat(0);
let mut z3_2 = u64x4::splat(0);
let mut z4_2 = u64x4::splat(0);
let mut z5_2 = u64x4::splat(0);
let mut z6_2 = u64x4::splat(0);
let mut z7_2 = u64x4::splat(0);
let mut z8_2 = u64x4::splat(0);
let mut z9_2 = u64x4::splat(0);
// LLVM doesn't seem to do much work reordering IFMA
// instructions, so try to organize them into "waves" of 8
// independent operations (4c latency, 0.5 c throughput
// means 8 in flight)
// Wave 0
z4_1 = madd52lo(z4_1, x[2], y[2]);
z5_2 = madd52hi(z5_2, x[2], y[2]);
z5_1 = madd52lo(z5_1, x[4], y[1]);
z6_2 = madd52hi(z6_2, x[4], y[1]);
z6_1 = madd52lo(z6_1, x[4], y[2]);
z7_2 = madd52hi(z7_2, x[4], y[2]);
z7_1 = madd52lo(z7_1, x[4], y[3]);
z8_2 = madd52hi(z8_2, x[4], y[3]);
// Wave 1
z4_1 = madd52lo(z4_1, x[3], y[1]);
z5_2 = madd52hi(z5_2, x[3], y[1]);
z5_1 = madd52lo(z5_1, x[3], y[2]);
z6_2 = madd52hi(z6_2, x[3], y[2]);
z6_1 = madd52lo(z6_1, x[3], y[3]);
z7_2 = madd52hi(z7_2, x[3], y[3]);
z7_1 = madd52lo(z7_1, x[3], y[4]);
z8_2 = madd52hi(z8_2, x[3], y[4]);
// Wave 2
z8_1 = madd52lo(z8_1, x[4], y[4]);
z9_2 = madd52hi(z9_2, x[4], y[4]);
z4_1 = madd52lo(z4_1, x[4], y[0]);
z5_2 = madd52hi(z5_2, x[4], y[0]);
z5_1 = madd52lo(z5_1, x[2], y[3]);
z6_2 = madd52hi(z6_2, x[2], y[3]);
z6_1 = madd52lo(z6_1, x[2], y[4]);
z7_2 = madd52hi(z7_2, x[2], y[4]);
let z8 = z8_1 + z8_2 + z8_2;
let z9 = z9_2 + z9_2;
// Wave 3
z3_1 = madd52lo(z3_1, x[3], y[0]);
z4_2 = madd52hi(z4_2, x[3], y[0]);
z4_1 = madd52lo(z4_1, x[1], y[3]);
z5_2 = madd52hi(z5_2, x[1], y[3]);
z5_1 = madd52lo(z5_1, x[1], y[4]);
z6_2 = madd52hi(z6_2, x[1], y[4]);
z2_1 = madd52lo(z2_1, x[2], y[0]);
z3_2 = madd52hi(z3_2, x[2], y[0]);
let z6 = z6_1 + z6_2 + z6_2;
let z7 = z7_1 + z7_2 + z7_2;
// Wave 4
z3_1 = madd52lo(z3_1, x[2], y[1]);
z4_2 = madd52hi(z4_2, x[2], y[1]);
z4_1 = madd52lo(z4_1, x[0], y[4]);
z5_2 = madd52hi(z5_2, x[0], y[4]);
z1_1 = madd52lo(z1_1, x[1], y[0]);
z2_2 = madd52hi(z2_2, x[1], y[0]);
z2_1 = madd52lo(z2_1, x[1], y[1]);
z3_2 = madd52hi(z3_2, x[1], y[1]);
let z5 = z5_1 + z5_2 + z5_2;
// Wave 5
z3_1 = madd52lo(z3_1, x[1], y[2]);
z4_2 = madd52hi(z4_2, x[1], y[2]);
z0_1 = madd52lo(z0_1, x[0], y[0]);
z1_2 = madd52hi(z1_2, x[0], y[0]);
z1_1 = madd52lo(z1_1, x[0], y[1]);
z2_1 = madd52lo(z2_1, x[0], y[2]);
z2_2 = madd52hi(z2_2, x[0], y[1]);
z3_2 = madd52hi(z3_2, x[0], y[2]);
let mut t0 = u64x4::splat(0);
let mut t1 = u64x4::splat(0);
let r19 = u64x4::splat(19);
// Wave 6
t0 = madd52hi(t0, r19, z9);
t1 = madd52lo(t1, r19, z9 >> 52);
z3_1 = madd52lo(z3_1, x[0], y[3]);
z4_2 = madd52hi(z4_2, x[0], y[3]);
z1_2 = madd52lo(z1_2, r19, z5 >> 52);
z2_2 = madd52lo(z2_2, r19, z6 >> 52);
z3_2 = madd52lo(z3_2, r19, z7 >> 52);
z0_1 = madd52lo(z0_1, r19, z5);
// Wave 7
z4_1 = madd52lo(z4_1, r19, z9);
z1_1 = madd52lo(z1_1, r19, z6);
z0_2 = madd52lo(z0_2, r19, t0 + t1);
z4_2 = madd52hi(z4_2, r19, z8);
z2_1 = madd52lo(z2_1, r19, z7);
z1_2 = madd52hi(z1_2, r19, z5);
z2_2 = madd52hi(z2_2, r19, z6);
z3_2 = madd52hi(z3_2, r19, z7);
// Wave 8
z3_1 = madd52lo(z3_1, r19, z8);
z4_2 = madd52lo(z4_2, r19, z8 >> 52);
F51x4Unreduced([
z0_1 + z0_2 + z0_2,
z1_1 + z1_2 + z1_2,
z2_1 + z2_2 + z2_2,
z3_1 + z3_2 + z3_2,
z4_1 + z4_2 + z4_2,
])
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn vpmadd52luq() {
let x = u64x4::splat(2);
let y = u64x4::splat(3);
let mut z = u64x4::splat(5);
z = unsafe { madd52lo(z, x, y) };
assert_eq!(z, u64x4::splat(5 + 2 * 3));
}
#[test]
fn new_split_round_trip_on_reduced_input() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let ax4 = F51x4Unreduced::new(&a, &a, &a, &a);
let splits = ax4.split();
for i in 0..4 {
assert_eq!(a, splits[i]);
}
}
#[test]
fn new_split_round_trip_on_unreduced_input() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
// ... but now multiply it by 16 without reducing coeffs
let a16 = FieldElement51([
a.0[0] << 4,
a.0[1] << 4,
a.0[2] << 4,
a.0[3] << 4,
a.0[4] << 4,
]);
let a16x4 = F51x4Unreduced::new(&a16, &a16, &a16, &a16);
let splits = a16x4.split();
for i in 0..4 {
assert_eq!(a16, splits[i]);
}
}
#[test]
fn test_reduction() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
// ... but now multiply it by 16 without reducing coeffs
let abig = FieldElement51([
a.0[0] << 4,
a.0[1] << 4,
a.0[2] << 4,
a.0[3] << 4,
a.0[4] << 4,
]);
let abigx4: F51x4Reduced = F51x4Unreduced::new(&abig, &abig, &abig, &abig).into();
let splits = F51x4Unreduced::from(abigx4).split();
let c = &a * &FieldElement51([(1 << 4), 0, 0, 0, 0]);
for i in 0..4 {
assert_eq!(c, splits[i]);
}
}
#[test]
fn mul_matches_serial() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let b = FieldElement51([98098, 87987897, 0, 1, 0]).invert();
let c = &a * &b;
let ax4: F51x4Reduced = F51x4Unreduced::new(&a, &a, &a, &a).into();
let bx4: F51x4Reduced = F51x4Unreduced::new(&b, &b, &b, &b).into();
let cx4 = &ax4 * &bx4;
let splits = cx4.split();
for i in 0..4 {
assert_eq!(c, splits[i]);
}
}
#[test]
fn iterated_mul_matches_serial() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let b = FieldElement51([98098, 87987897, 0, 1, 0]).invert();
let mut c = &a * &b;
for _i in 0..1024 {
c = &a * &c;
c = &b * &c;
}
let ax4: F51x4Reduced = F51x4Unreduced::new(&a, &a, &a, &a).into();
let bx4: F51x4Reduced = F51x4Unreduced::new(&b, &b, &b, &b).into();
let mut cx4 = &ax4 * &bx4;
for _i in 0..1024 {
cx4 = &ax4 * &F51x4Reduced::from(cx4);
cx4 = &bx4 * &F51x4Reduced::from(cx4);
}
let splits = cx4.split();
for i in 0..4 {
assert_eq!(c, splits[i]);
}
}
#[test]
fn square_matches_mul() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let ax4: F51x4Reduced = F51x4Unreduced::new(&a, &a, &a, &a).into();
let cx4 = &ax4 * &ax4;
let cx4_sq = ax4.square();
let splits = cx4.split();
let splits_sq = cx4_sq.split();
for i in 0..4 {
assert_eq!(splits_sq[i], splits[i]);
}
}
#[test]
fn iterated_square_matches_serial() {
// Invert a small field element to get a big one
let mut a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let mut ax4 = F51x4Unreduced::new(&a, &a, &a, &a);
for _j in 0..1024 {
a = a.square();
ax4 = F51x4Reduced::from(ax4).square();
let splits = ax4.split();
for i in 0..4 {
assert_eq!(a, splits[i]);
}
}
}
#[test]
fn iterated_u32_mul_matches_serial() {
// Invert a small field element to get a big one
let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
let b = FieldElement51([121665, 0, 0, 0, 0]);
let mut c = &a * &b;
for _i in 0..1024 {
c = &b * &c;
}
let ax4 = F51x4Unreduced::new(&a, &a, &a, &a);
let bx4 = (121665u32, 121665u32, 121665u32, 121665u32);
let mut cx4 = &F51x4Reduced::from(ax4) * bx4;
for _i in 0..1024 {
cx4 = &F51x4Reduced::from(cx4) * bx4;
}
let splits = cx4.split();
for i in 0..4 {
assert_eq!(c, splits[i]);
}
}
#[test]
fn shuffle_AAAA() {
let x0 = FieldElement51::from_bytes(&[0x10; 32]);
let x1 = FieldElement51::from_bytes(&[0x11; 32]);
let x2 = FieldElement51::from_bytes(&[0x12; 32]);
let x3 = FieldElement51::from_bytes(&[0x13; 32]);
let x = F51x4Unreduced::new(&x0, &x1, &x2, &x3);
let y = x.shuffle(Shuffle::AAAA);
let splits = y.split();
assert_eq!(splits[0], x0);
assert_eq!(splits[1], x0);
assert_eq!(splits[2], x0);
assert_eq!(splits[3], x0);
}
#[test]
fn blend_AB() {
let x0 = FieldElement51::from_bytes(&[0x10; 32]);
let x1 = FieldElement51::from_bytes(&[0x11; 32]);
let x2 = FieldElement51::from_bytes(&[0x12; 32]);
let x3 = FieldElement51::from_bytes(&[0x13; 32]);
let x = F51x4Unreduced::new(&x0, &x1, &x2, &x3);
let z = F51x4Unreduced::new(&x3, &x2, &x1, &x0);
let y = x.blend(&z, Lanes::AB);
let splits = y.split();
assert_eq!(splits[0], x3);
assert_eq!(splits[1], x2);
assert_eq!(splits[2], x2);
assert_eq!(splits[3], x3);
}
}

View File

@ -1,19 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2018-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
#![cfg_attr(
feature = "nightly",
doc(include = "../../../../docs/ifma-notes.md")
)]
pub mod field;
pub mod edwards;
pub mod constants;

View File

@ -1,42 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
// Conditionally include the notes if we're on nightly (so we can include docs at all).
#![cfg_attr(
feature = "nightly",
doc(include = "../../../docs/parallel-formulas.md")
)]
#[cfg(not(any(target_feature = "avx2", target_feature = "avx512ifma", rustdoc)))]
compile_error!("simd_backend selected without target_feature=+avx2 or +avx512ifma");
#[cfg(any(
all(target_feature = "avx2", not(target_feature = "avx512ifma")),
rustdoc
))]
#[doc(cfg(all(target_feature = "avx2", not(target_feature = "avx512ifma"))))]
pub mod avx2;
#[cfg(any(
all(target_feature = "avx2", not(target_feature = "avx512ifma")),
rustdoc
))]
pub(crate) use self::avx2::{
constants::BASEPOINT_ODD_LOOKUP_TABLE, edwards::CachedPoint, edwards::ExtendedPoint,
};
#[cfg(any(target_feature = "avx512ifma", rustdoc))]
#[doc(cfg(target_feature = "avx512ifma"))]
pub mod ifma;
#[cfg(target_feature = "avx512ifma")]
pub(crate) use self::ifma::{
constants::BASEPOINT_ODD_LOOKUP_TABLE, edwards::CachedPoint, edwards::ExtendedPoint,
};
pub mod scalar_mul;

View File

@ -1,22 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
pub mod variable_base;
pub mod vartime_double_base;
#[cfg(feature = "alloc")]
pub mod straus;
#[cfg(feature = "alloc")]
pub mod precomputed_straus;
#[cfg(feature = "alloc")]
pub mod pippenger;

View File

@ -1,164 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Oleg Andreev
// See LICENSE for licensing information.
//
// Authors:
// - Oleg Andreev <oleganza@gmail.com>
#![allow(non_snake_case)]
use core::borrow::Borrow;
use backend::vector::{CachedPoint, ExtendedPoint};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::{Identity, VartimeMultiscalarMul};
#[allow(unused_imports)]
use prelude::*;
/// Implements a version of Pippenger's algorithm.
///
/// See the documentation in the serial `scalar_mul::pippenger` module for details.
pub struct Pippenger;
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
type Point = EdwardsPoint;
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
let mut scalars = scalars.into_iter();
let size = scalars.by_ref().size_hint().0;
let w = if size < 500 {
6
} else if size < 800 {
7
} else {
8
};
let max_digit: usize = 1 << w;
let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket
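// E.g. with w = 6: max_digit = 64 and buckets_count = 32, enough for
// the signed digit magnitudes 1..=32; digit 0 needs no bucket.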
// Collect optimized scalars and points in a buffer for repeated access
// (scanning the whole collection once for each digit position).
let scalars = scalars
.into_iter()
.map(|s| s.borrow().to_radix_2w(w));
let points = points
.into_iter()
.map(|p| p.map(|P| CachedPoint::from(ExtendedPoint::from(P))));
let scalars_points = scalars
.zip(points)
.map(|(s, maybe_p)| maybe_p.map(|p| (s, p)))
.collect::<Option<Vec<_>>>()?;
// Prepare 2^w/2 buckets.
// buckets[i] corresponds to a multiplication factor (i+1).
let mut buckets: Vec<ExtendedPoint> = (0..buckets_count)
.map(|_| ExtendedPoint::identity())
.collect();
let mut columns = (0..digits_count).rev().map(|digit_index| {
// Clear the buckets when processing another digit.
for i in 0..buckets_count {
buckets[i] = ExtendedPoint::identity();
}
// Iterate over pairs of (point, scalar)
// and add/sub the point to the corresponding bucket.
// Note: if we add support for precomputed lookup tables,
// we'll be adding/subtracting the point premultiplied by `digits[i]` to buckets[0].
for (digits, pt) in scalars_points.iter() {
// Widen digit so that we don't run into edge cases when w=8.
let digit = digits[digit_index] as i16;
if digit > 0 {
let b = (digit - 1) as usize;
buckets[b] = &buckets[b] + pt;
} else if digit < 0 {
let b = (-digit - 1) as usize;
buckets[b] = &buckets[b] - pt;
}
}
// Add the buckets applying the multiplication factor to each bucket.
// The most efficient way to do that is to have a single sum with two running sums:
// an intermediate sum from last bucket to the first, and a sum of intermediate sums.
//
// For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
// C
// C B
// C B A Sum = C + (C+B) + (C+B+A)
let mut buckets_intermediate_sum = buckets[buckets_count - 1];
let mut buckets_sum = buckets[buckets_count - 1];
for i in (0..(buckets_count - 1)).rev() {
buckets_intermediate_sum =
&buckets_intermediate_sum + &CachedPoint::from(buckets[i]);
buckets_sum = &buckets_sum + &CachedPoint::from(buckets_intermediate_sum);
}
buckets_sum
});
// Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
// `unwrap()` always succeeds because we know we have more than zero digits.
let hi_column = columns.next().unwrap();
Some(
columns
.fold(hi_column, |total, p| {
&total.mul_by_pow_2(w as u32) + &CachedPoint::from(p)
})
.into(),
)
}
}
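// Editor's sketch (added for this writeup, not part of the vendored file):
// the two-running-sums trick from the comment above, replayed with plain
// integers standing in for curve points. Bucket i must contribute
// (i + 1) * buckets[i] to the column total.
#[cfg(test)]
mod bucket_sum_sketch {
    #[test]
    fn weighted_sum_via_running_sums() {
        let buckets = [5i64, 7, 11]; // want 1*5 + 2*7 + 3*11
        let expected: i64 = buckets
            .iter()
            .enumerate()
            .map(|(i, b)| (i as i64 + 1) * b)
            .sum();
        // `intermediate` walks C, C+B, C+B+A; `total` adds each intermediate,
        // which is exactly the "Sum = C + (C+B) + (C+B+A)" picture above.
        let mut intermediate = buckets[buckets.len() - 1];
        let mut total = intermediate;
        for i in (0..buckets.len() - 1).rev() {
            intermediate += buckets[i];
            total += intermediate;
        }
        assert_eq!(total, expected);
    }
}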
#[cfg(test)]
mod test {
use super::*;
use constants;
use scalar::Scalar;
#[test]
fn test_vartime_pippenger() {
// Reuse points across different tests
let mut n = 512;
let x = Scalar::from(2128506u64).invert();
let y = Scalar::from(4443282u64).invert();
let points: Vec<_> = (0..n)
.map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
.collect();
let scalars: Vec<_> = (0..n)
.map(|i| x + (Scalar::from(i as u64) * y)) // fast way to make ~random but deterministic scalars
.collect();
let premultiplied: Vec<EdwardsPoint> = scalars
.iter()
.zip(points.iter())
.map(|(sc, pt)| sc * pt)
.collect();
while n > 0 {
let scalars = &scalars[0..n].to_vec();
let points = &points[0..n].to_vec();
let control: EdwardsPoint = premultiplied[0..n].iter().sum();
let subject = Pippenger::vartime_multiscalar_mul(scalars.clone(), points.clone());
assert_eq!(subject.compress(), control.compress());
n = n / 2;
}
}
}

View File

@ -1,107 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Henry de Valence.
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Precomputation for Straus's method.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use backend::vector::{CachedPoint, ExtendedPoint};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::Identity;
use traits::VartimePrecomputedMultiscalarMul;
use window::{NafLookupTable5, NafLookupTable8};
#[allow(unused_imports)]
use prelude::*;
pub struct VartimePrecomputedStraus {
static_lookup_tables: Vec<NafLookupTable8<CachedPoint>>,
}
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
type Point = EdwardsPoint;
fn new<I>(static_points: I) -> Self
where
I: IntoIterator,
I::Item: Borrow<Self::Point>,
{
Self {
static_lookup_tables: static_points
.into_iter()
.map(|P| NafLookupTable8::<CachedPoint>::from(P.borrow()))
.collect(),
}
}
fn optional_mixed_multiscalar_mul<I, J, K>(
&self,
static_scalars: I,
dynamic_scalars: J,
dynamic_points: K,
) -> Option<Self::Point>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Scalar>,
K: IntoIterator<Item = Option<Self::Point>>,
{
let static_nafs = static_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_nafs: Vec<_> = dynamic_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_lookup_tables = dynamic_points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<CachedPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let sp = self.static_lookup_tables.len();
let dp = dynamic_lookup_tables.len();
assert_eq!(sp, static_nafs.len());
assert_eq!(dp, dynamic_nafs.len());
// We could save some doublings by looking for the highest
// nonzero NAF coefficient, but since we might have a lot of
// them to search, it's not clear it's worthwhile to check.
let mut R = ExtendedPoint::identity();
for j in (0..256).rev() {
R = R.double();
for i in 0..dp {
let t_ij = dynamic_nafs[i][j];
if t_ij > 0 {
R = &R + &dynamic_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R - &dynamic_lookup_tables[i].select(-t_ij as usize);
}
}
for i in 0..sp {
let t_ij = static_nafs[i][j];
if t_ij > 0 {
R = &R + &self.static_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R - &self.static_lookup_tables[i].select(-t_ij as usize);
}
}
}
Some(R.into())
}
}

View File

@ -1,107 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use core::borrow::Borrow;
use zeroize::Zeroizing;
use backend::vector::{CachedPoint, ExtendedPoint};
use edwards::EdwardsPoint;
use scalar::Scalar;
use window::{LookupTable, NafLookupTable5};
use traits::{Identity, MultiscalarMul, VartimeMultiscalarMul};
#[allow(unused_imports)]
use prelude::*;
/// Multiscalar multiplication using interleaved window / Straus'
/// method. See the `Straus` struct in the serial backend for more
/// details.
///
/// This exists as a separate implementation from that one because the
/// AVX2 code uses different curve models (it does not pass between
/// multiple models during scalar mul), and it has to convert the
/// point representation on the fly.
pub struct Straus {}
impl MultiscalarMul for Straus {
type Point = EdwardsPoint;
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<EdwardsPoint>,
{
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
// for each input point P
let lookup_tables: Vec<_> = points
.into_iter()
.map(|point| LookupTable::<CachedPoint>::from(point.borrow()))
.collect();
let scalar_digits_vec: Vec<_> = scalars
.into_iter()
.map(|s| s.borrow().to_radix_16())
.collect();
// Pass ownership to a `Zeroizing` wrapper
let scalar_digits = Zeroizing::new(scalar_digits_vec);
let mut Q = ExtendedPoint::identity();
for j in (0..64).rev() {
Q = Q.mul_by_pow_2(4);
let it = scalar_digits.iter().zip(lookup_tables.iter());
for (s_i, lookup_table_i) in it {
// Q = Q + s_{i,j} * P_i
Q = &Q + &lookup_table_i.select(s_i[j]);
}
}
Q.into()
}
}
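// Editor's sketch (added for this writeup, not part of the vendored file):
// the interleaving idea over plain integers. Rather than one Horner pass per
// point, a single pass shares each mul_by_pow_2(4) doubling across all
// inputs, which is what makes Straus cheaper than n separate ladders.
#[cfg(test)]
mod interleaved_sketch {
    #[test]
    fn shared_doublings_match_direct_sum() {
        let scalars = [0xbeefu32, 0x1234];
        let points = [7i64, -5];
        let expected: i64 = scalars
            .iter()
            .zip(points.iter())
            .map(|(&s, &p)| s as i64 * p)
            .sum();
        let mut q = 0i64;
        for j in (0..8).rev() {
            q *= 16; // one shared "mul_by_pow_2(4)" per digit position
            for (&s, &p) in scalars.iter().zip(points.iter()) {
                let digit = ((s >> (4 * j)) & 0xf) as i64;
                q += digit * p; // Q = Q + s_{i,j} * P_i
            }
        }
        assert_eq!(q, expected);
    }
}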
impl VartimeMultiscalarMul for Straus {
type Point = EdwardsPoint;
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
let nafs: Vec<_> = scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect();
let lookup_tables: Vec<_> = points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<CachedPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let mut Q = ExtendedPoint::identity();
for i in (0..256).rev() {
Q = Q.double();
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
if naf[i] > 0 {
Q = &Q + &lookup_table.select(naf[i] as usize);
} else if naf[i] < 0 {
Q = &Q - &lookup_table.select(-naf[i] as usize);
}
}
}
Some(Q.into())
}
}

View File

@ -1,32 +0,0 @@
#![allow(non_snake_case)]
use backend::vector::{CachedPoint, ExtendedPoint};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::Identity;
use window::LookupTable;
/// Perform constant-time, variable-base scalar multiplication.
pub fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
let lookup_table = LookupTable::<CachedPoint>::from(point);
// Setting s = scalar, compute
//
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
//
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
let scalar_digits = scalar.to_radix_16();
// Compute s*P as
//
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
//
// We sum right-to-left.
let mut Q = ExtendedPoint::identity();
for i in (0..64).rev() {
Q = Q.mul_by_pow_2(4);
Q = &Q + &lookup_table.select(scalar_digits[i]);
}
Q.into()
}
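// Editor's sketch (added for this writeup, not part of the vendored file):
// the same right-to-left Horner evaluation over plain integers, with
// unsigned radix-16 digits for simplicity (the code above uses signed
// digits so the lookup table stays small).
#[cfg(test)]
mod radix16_sketch {
    fn horner_radix16(scalar: u32, point: i64) -> i64 {
        let mut q = 0i64;
        for i in (0..8).rev() {
            let digit = ((scalar >> (4 * i)) & 0xf) as i64;
            q *= 16;            // Q = Q.mul_by_pow_2(4)
            q += digit * point; // Q = Q + s_i * P
        }
        q
    }

    #[test]
    fn horner_matches_direct_multiplication() {
        for &(s, p) in &[(1u32, 7i64), (0xdead, -3), (123_456_789, 42)] {
            assert_eq!(horner_radix16(s, p), s as i64 * p);
        }
    }
}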

View File

@ -1,60 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use backend::vector::BASEPOINT_ODD_LOOKUP_TABLE;
use backend::vector::{CachedPoint, ExtendedPoint};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::Identity;
use window::NafLookupTable5;
/// Compute \\(aA + bB\\) in variable time, where \\(B\\) is the Ed25519 basepoint.
pub fn mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint {
let a_naf = a.non_adjacent_form(5);
let b_naf = b.non_adjacent_form(8);
// Find starting index
let mut i: usize = 255;
for j in (0..256).rev() {
i = j;
if a_naf[i] != 0 || b_naf[i] != 0 {
break;
}
}
let table_A = NafLookupTable5::<CachedPoint>::from(A);
let table_B = &BASEPOINT_ODD_LOOKUP_TABLE;
let mut Q = ExtendedPoint::identity();
loop {
Q = Q.double();
if a_naf[i] > 0 {
Q = &Q + &table_A.select(a_naf[i] as usize);
} else if a_naf[i] < 0 {
Q = &Q - &table_A.select(-a_naf[i] as usize);
}
if b_naf[i] > 0 {
Q = &Q + &table_B.select(b_naf[i] as usize);
} else if b_naf[i] < 0 {
Q = &Q - &table_B.select(-b_naf[i] as usize);
}
if i == 0 {
break;
}
i -= 1;
}
Q.into()
}
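// Editor's sketch (added for this writeup, not part of the vendored file):
// a width-w NAF in miniature over u64, showing the two properties the loop
// above relies on: nonzero digits are odd and bounded by 2^(w-1) in
// magnitude, and the digits recombine as sum(digit[i] * 2^i) == n, so
// odd-multiple lookup tables like NafLookupTable5 suffice.
#[cfg(test)]
mod naf_sketch {
    fn naf(mut n: u64, w: u32) -> [i64; 64] {
        let window = 1i64 << w;
        let mut digits = [0i64; 64];
        let mut i = 0;
        while n > 0 {
            if n & 1 == 1 {
                // Take the centered remainder in (-2^(w-1), 2^(w-1))...
                let mut d = (n as i64) % window;
                if d >= window / 2 {
                    d -= window;
                }
                // ...and subtract it out, leaving an even value.
                n = (n as i64 - d) as u64;
                digits[i] = d;
            }
            n >>= 1;
            i += 1;
        }
        digits
    }

    #[test]
    fn naf_digits_recombine() {
        for &n in &[1u64, 250, 2_128_506, 0xdead_beef] {
            let digits = naf(n, 5);
            let recombined: i64 =
                digits.iter().enumerate().map(|(i, &d)| d << i).sum();
            assert_eq!(recombined as u64, n);
            for &d in digits.iter() {
                assert!(d == 0 || (d % 2 != 0 && d.abs() < 16));
            }
        }
    }
}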

View File

@ -1,176 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Various constants, such as the Ristretto and Ed25519 basepoints.
//!
//! Most of the constants are given with
//! `LONG_DESCRIPTIVE_UPPER_CASE_NAMES`, but they can be brought into
//! scope using a `let` binding:
//!
//! ```
//! use curve25519_dalek::constants;
//! use curve25519_dalek::traits::IsIdentity;
//!
//! let B = &constants::RISTRETTO_BASEPOINT_TABLE;
//! let l = &constants::BASEPOINT_ORDER;
//!
//! let A = l * B;
//! assert!(A.is_identity());
//! ```
#![allow(non_snake_case)]
use edwards::CompressedEdwardsY;
use ristretto::RistrettoPoint;
use ristretto::CompressedRistretto;
use montgomery::MontgomeryPoint;
use scalar::Scalar;
#[cfg(feature = "u64_backend")]
pub use backend::serial::u64::constants::*;
#[cfg(feature = "u32_backend")]
pub use backend::serial::u32::constants::*;
/// The Ed25519 basepoint, in `CompressedEdwardsY` format.
///
/// This is the little-endian byte encoding of \\( 4/5 \pmod p \\),
/// which is the \\(y\\)-coordinate of the Ed25519 basepoint.
///
/// The sign bit is 0 since the basepoint has \\(x\\) chosen to be positive.
pub const ED25519_BASEPOINT_COMPRESSED: CompressedEdwardsY =
CompressedEdwardsY([0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66]);
/// The X25519 basepoint, in `MontgomeryPoint` format.
pub const X25519_BASEPOINT: MontgomeryPoint =
MontgomeryPoint([0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
/// The Ristretto basepoint, in `CompressedRistretto` format.
pub const RISTRETTO_BASEPOINT_COMPRESSED: CompressedRistretto =
CompressedRistretto([0xe2, 0xf2, 0xae, 0x0a, 0x6a, 0xbc, 0x4e, 0x71,
0xa8, 0x84, 0xa9, 0x61, 0xc5, 0x00, 0x51, 0x5f,
0x58, 0xe3, 0x0b, 0x6a, 0xa5, 0x82, 0xdd, 0x8d,
0xb6, 0xa6, 0x59, 0x45, 0xe0, 0x8d, 0x2d, 0x76]);
/// The Ristretto basepoint, as a `RistrettoPoint`.
///
/// This is called `_POINT` to distinguish it from `_TABLE`, which
/// provides fast scalar multiplication.
pub const RISTRETTO_BASEPOINT_POINT: RistrettoPoint = RistrettoPoint(ED25519_BASEPOINT_POINT);
/// `BASEPOINT_ORDER` is the order of the Ristretto group and of the Ed25519 basepoint, i.e.,
/// $$
/// \ell = 2^\{252\} + 27742317777372353535851937790883648493.
/// $$
pub const BASEPOINT_ORDER: Scalar = Scalar{
bytes: [
0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
],
};
use ristretto::RistrettoBasepointTable;
/// The Ristretto basepoint, as a `RistrettoBasepointTable` for scalar multiplication.
pub const RISTRETTO_BASEPOINT_TABLE: RistrettoBasepointTable
= RistrettoBasepointTable(ED25519_BASEPOINT_TABLE);
#[cfg(test)]
mod test {
use field::FieldElement;
use traits::{IsIdentity, ValidityCheck};
use constants;
#[test]
fn test_eight_torsion() {
for i in 0..8 {
let Q = constants::EIGHT_TORSION[i].mul_by_pow_2(3);
assert!(Q.is_valid());
assert!(Q.is_identity());
}
}
#[test]
fn test_four_torsion() {
for i in (0..8).filter(|i| i % 2 == 0) {
let Q = constants::EIGHT_TORSION[i].mul_by_pow_2(2);
assert!(Q.is_valid());
assert!(Q.is_identity());
}
}
#[test]
fn test_two_torsion() {
for i in (0..8).filter(|i| i % 4 == 0) {
let Q = constants::EIGHT_TORSION[i].mul_by_pow_2(1);
assert!(Q.is_valid());
assert!(Q.is_identity());
}
}
/// Test that SQRT_M1 is the positive square root of -1
#[test]
fn test_sqrt_minus_one() {
let minus_one = FieldElement::minus_one();
let sqrt_m1_sq = &constants::SQRT_M1 * &constants::SQRT_M1;
assert_eq!(minus_one, sqrt_m1_sq);
assert_eq!(constants::SQRT_M1.is_negative().unwrap_u8(), 0);
}
#[test]
fn test_sqrt_constants_sign() {
let minus_one = FieldElement::minus_one();
let (was_nonzero_square, invsqrt_m1) = minus_one.invsqrt();
assert_eq!(was_nonzero_square.unwrap_u8(), 1u8);
let sign_test_sqrt = &invsqrt_m1 * &constants::SQRT_M1;
assert_eq!(sign_test_sqrt, minus_one);
}
/// Test that d = -121665/121666
#[test]
#[cfg(feature = "u32_backend")]
fn test_d_vs_ratio() {
use backend::serial::u32::field::FieldElement2625;
let a = -&FieldElement2625([121665,0,0,0,0,0,0,0,0,0]);
let b = FieldElement2625([121666,0,0,0,0,0,0,0,0,0]);
let d = &a * &b.invert();
let d2 = &d + &d;
assert_eq!(d, constants::EDWARDS_D);
assert_eq!(d2, constants::EDWARDS_D2);
}
/// Test that d = -121665/121666
#[test]
#[cfg(feature = "u64_backend")]
fn test_d_vs_ratio() {
use backend::serial::u64::field::FieldElement51;
let a = -&FieldElement51([121665,0,0,0,0]);
let b = FieldElement51([121666,0,0,0,0]);
let d = &a * &b.invert();
let d2 = &d + &d;
assert_eq!(d, constants::EDWARDS_D);
assert_eq!(d2, constants::EDWARDS_D2);
}
#[test]
fn test_sqrt_ad_minus_one() {
let a = FieldElement::minus_one();
let ad_minus_one = &(&a * &constants::EDWARDS_D) + &a;
let should_be_ad_minus_one = constants::SQRT_AD_MINUS_ONE.square();
assert_eq!(should_be_ad_minus_one, ad_minus_one);
}
}

File diff suppressed because it is too large

View File

@ -1,460 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\).
//!
//! The `curve25519_dalek::field` module provides a type alias
//! `curve25519_dalek::field::FieldElement` to a field element type
//! defined in the `backend` module; either `FieldElement51` or
//! `FieldElement2625`.
//!
//! Field operations defined in terms of machine
//! operations, such as field multiplication or squaring, are defined in
//! the backend implementation.
//!
//! Field operations defined in terms of other field operations, such as
//! field inversion or square roots, are defined here.
use core::cmp::{Eq, PartialEq};
use subtle::ConditionallySelectable;
use subtle::ConditionallyNegatable;
use subtle::Choice;
use subtle::ConstantTimeEq;
use constants;
use backend;
#[cfg(feature = "u64_backend")]
pub use backend::serial::u64::field::*;
/// A `FieldElement` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// The `FieldElement` type is an alias for one of the platform-specific
/// implementations.
#[cfg(feature = "u64_backend")]
pub type FieldElement = backend::serial::u64::field::FieldElement51;
#[cfg(feature = "u32_backend")]
pub use backend::serial::u32::field::*;
/// A `FieldElement` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// The `FieldElement` type is an alias for one of the platform-specific
/// implementations.
#[cfg(feature = "u32_backend")]
pub type FieldElement = backend::serial::u32::field::FieldElement2625;
impl Eq for FieldElement {}
impl PartialEq for FieldElement {
fn eq(&self, other: &FieldElement) -> bool {
self.ct_eq(other).unwrap_u8() == 1u8
}
}
impl ConstantTimeEq for FieldElement {
/// Test equality between two `FieldElement`s. Since the
/// internal representation is not canonical, the field elements
/// are normalized to wire format before comparison.
fn ct_eq(&self, other: &FieldElement) -> Choice {
self.to_bytes().ct_eq(&other.to_bytes())
}
}
impl FieldElement {
/// Determine if this `FieldElement` is negative, in the sense
/// used in the ed25519 paper: `x` is negative if the low bit is
/// set.
///
/// # Return
///
/// If negative, return `Choice(1)`. Otherwise, return `Choice(0)`.
pub fn is_negative(&self) -> Choice {
let bytes = self.to_bytes();
(bytes[0] & 1).into()
}
/// Determine if this `FieldElement` is zero.
///
/// # Return
///
/// If zero, return `Choice(1)`. Otherwise, return `Choice(0)`.
pub fn is_zero(&self) -> Choice {
let zero = [0u8; 32];
let bytes = self.to_bytes();
bytes.ct_eq(&zero)
}
/// Compute (self^(2^250-1), self^11), used as a helper function
/// within invert() and pow22523().
fn pow22501(&self) -> (FieldElement, FieldElement) {
// Instead of managing which temporary variables are used
// for what, we define as many as we need and leave stack
// allocation to the compiler
//
// Each temporary variable t_i is of the form (self)^e_i.
// Squaring t_i corresponds to multiplying e_i by 2,
// so the pow2k function shifts e_i left by k places.
// Multiplying t_i and t_j corresponds to adding e_i + e_j.
//
// Temporary t_i Nonzero bits of e_i
//
let t0 = self.square(); // 1 e_0 = 2^1
let t1 = t0.square().square(); // 3 e_1 = 2^3
let t2 = self * &t1; // 3,0 e_2 = 2^3 + 2^0
let t3 = &t0 * &t2; // 3,1,0
let t4 = t3.square(); // 4,2,1
let t5 = &t2 * &t4; // 4,3,2,1,0
let t6 = t5.pow2k(5); // 9,8,7,6,5
let t7 = &t6 * &t5; // 9,8,7,6,5,4,3,2,1,0
let t8 = t7.pow2k(10); // 19..10
let t9 = &t8 * &t7; // 19..0
let t10 = t9.pow2k(20); // 39..20
let t11 = &t10 * &t9; // 39..0
let t12 = t11.pow2k(10); // 49..10
let t13 = &t12 * &t7; // 49..0
let t14 = t13.pow2k(50); // 99..50
let t15 = &t14 * &t13; // 99..0
let t16 = t15.pow2k(100); // 199..100
let t17 = &t16 * &t15; // 199..0
let t18 = t17.pow2k(50); // 249..50
let t19 = &t18 * &t13; // 249..0
(t19, t3)
}
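    // Editor's note (added for this writeup, not part of the vendored file):
    // instantiating the table, t0 = x^2, t1 = (x^2)^4 = x^8 (bit 3),
    // t2 = x * x^8 = x^9 (bits 3,0), t3 = x^2 * x^9 = x^11 (bits 3,1,0),
    // and so on; each multiplication merges two disjoint exponent bit
    // patterns, ending at t19 = x^(2^250 - 1) with bits 249..0 all set.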
/// Given a slice of public `FieldElements`, replace each with its inverse.
///
/// All input `FieldElements` **MUST** be nonzero.
#[cfg(feature = "alloc")]
pub fn batch_invert(inputs: &mut [FieldElement]) {
// Montgomery's Trick and Fast Implementation of Masked AES
// Genelle, Prouff and Quisquater
// Section 3.2
let n = inputs.len();
let mut scratch = vec![FieldElement::one(); n];
// Keep an accumulator of all of the previous products
let mut acc = FieldElement::one();
// Pass through the input vector, recording the previous
// products in the scratch space
for (input, scratch) in inputs.iter().zip(scratch.iter_mut()) {
*scratch = acc;
acc = &acc * input;
}
// acc is nonzero iff all inputs are nonzero
assert_eq!(acc.is_zero().unwrap_u8(), 0);
// Compute the inverse of all products
acc = acc.invert();
// Pass through the vector backwards to compute the inverses
// in place
for (input, scratch) in inputs.iter_mut().rev().zip(scratch.into_iter().rev()) {
let tmp = &acc * input;
*input = &acc * &scratch;
acc = tmp;
}
}
/// Given a nonzero field element, compute its inverse.
///
/// The inverse is computed as self^(p-2), since
/// x^(p-2)x = x^(p-1) = 1 (mod p).
///
/// This function returns zero on input zero.
pub fn invert(&self) -> FieldElement {
// The bits of p-2 = 2^255 -19 -2 are 11010111111...11.
//
// nonzero bits of exponent
let (t19, t3) = self.pow22501(); // t19: 249..0 ; t3: 3,1,0
let t20 = t19.pow2k(5); // 254..5
let t21 = &t20 * &t3; // 254..5,3,1,0
t21
}
/// Raise this field element to the power (p-5)/8 = 2^252 -3.
fn pow_p58(&self) -> FieldElement {
// The bits of (p-5)/8 are 101111.....11.
//
// nonzero bits of exponent
let (t19, _) = self.pow22501(); // 249..0
let t20 = t19.pow2k(2); // 251..2
let t21 = self * &t20; // 251..2,0
t21
}
/// Given `FieldElements` `u` and `v`, compute either `sqrt(u/v)`
/// or `sqrt(i*u/v)` in constant time.
///
/// This function always returns the nonnegative square root.
///
/// # Return
///
/// - `(Choice(1), +sqrt(u/v)) ` if `v` is nonzero and `u/v` is square;
/// - `(Choice(1), zero) ` if `u` is zero;
/// - `(Choice(0), zero) ` if `v` is zero and `u` is nonzero;
/// - `(Choice(0), +sqrt(i*u/v))` if `u/v` is nonsquare (so `i*u/v` is square).
///
pub fn sqrt_ratio_i(u: &FieldElement, v: &FieldElement) -> (Choice, FieldElement) {
// Using the same trick as in ed25519 decoding, we merge the
// inversion, the square root, and the square test as follows.
//
// To compute sqrt(α), we can compute β = α^((p+3)/8).
// Then β^2 = ±α, so multiplying β by sqrt(-1) if necessary
// gives sqrt(α).
//
// To compute 1/sqrt(α), we observe that
// 1/β = α^(p-1 - (p+3)/8) = α^((7p-11)/8)
// = α^3 * (α^7)^((p-5)/8).
//
// We can therefore compute sqrt(u/v) = sqrt(u)/sqrt(v)
// by first computing
// r = u^((p+3)/8) v^(p-1-(p+3)/8)
// = u u^((p-5)/8) v^3 (v^7)^((p-5)/8)
// = (uv^3) (uv^7)^((p-5)/8).
//
// If v is nonzero and u/v is square, then r^2 = ±u/v,
// so vr^2 = ±u.
// If vr^2 = u, then sqrt(u/v) = r.
// If vr^2 = -u, then sqrt(u/v) = r*sqrt(-1).
//
// If v is zero, r is also zero.
let v3 = &v.square() * v;
let v7 = &v3.square() * v;
let mut r = &(u * &v3) * &(u * &v7).pow_p58();
let check = v * &r.square();
let i = &constants::SQRT_M1;
let correct_sign_sqrt = check.ct_eq(u);
let flipped_sign_sqrt = check.ct_eq(&(-u));
let flipped_sign_sqrt_i = check.ct_eq(&(&(-u) * i));
let r_prime = &constants::SQRT_M1 * &r;
r.conditional_assign(&r_prime, flipped_sign_sqrt | flipped_sign_sqrt_i);
// Choose the nonnegative square root.
let r_is_negative = r.is_negative();
r.conditional_negate(r_is_negative);
let was_nonzero_square = correct_sign_sqrt | flipped_sign_sqrt;
(was_nonzero_square, r)
}
/// Attempt to compute `sqrt(1/self)` in constant time.
///
/// Convenience wrapper around `sqrt_ratio_i`.
///
/// This function always returns the nonnegative square root.
///
/// # Return
///
/// - `(Choice(1), +sqrt(1/self)) ` if `self` is a nonzero square;
/// - `(Choice(0), zero) ` if `self` is zero;
/// - `(Choice(0), +sqrt(i/self)) ` if `self` is a nonzero nonsquare;
///
pub fn invsqrt(&self) -> (Choice, FieldElement) {
FieldElement::sqrt_ratio_i(&FieldElement::one(), self)
}
}
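// Editor's sketch (added for this writeup, not part of the vendored file):
// the Montgomery-trick structure of batch_invert, replayed over u64
// arithmetic modulo a small prime. `pow_mod` is a stand-in for
// FieldElement::invert() via Fermat's little theorem (x^(p-2) mod p).
#[cfg(test)]
mod batch_invert_sketch {
    fn pow_mod(mut base: u64, mut exp: u64, p: u64) -> u64 {
        let mut acc = 1u64;
        base %= p;
        while exp > 0 {
            if exp & 1 == 1 {
                acc = acc * base % p;
            }
            base = base * base % p;
            exp >>= 1;
        }
        acc
    }

    #[test]
    fn montgomery_trick_matches_individual_inverses() {
        let p = 65_537u64; // small prime, so u64 products cannot overflow
        let mut xs = [3u64, 10, 12_345, 65_000];
        let mut expected = [0u64; 4];
        for (e, &x) in expected.iter_mut().zip(xs.iter()) {
            *e = pow_mod(x, p - 2, p);
        }
        // Forward pass: scratch[i] holds the product of all earlier inputs.
        let mut scratch = [1u64; 4];
        let mut acc = 1u64;
        for (x, s) in xs.iter().zip(scratch.iter_mut()) {
            *s = acc;
            acc = acc * x % p;
        }
        // One inversion for the product of everything...
        acc = pow_mod(acc, p - 2, p);
        // ...then a backward pass peels off one factor per step.
        for (x, s) in xs.iter_mut().rev().zip(scratch.iter().rev()) {
            let tmp = acc * *x % p;
            *x = acc * s % p;
            acc = tmp;
        }
        assert_eq!(xs, expected);
    }
}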
#[cfg(test)]
mod test {
use field::*;
use subtle::ConditionallyNegatable;
/// Random element a of GF(2^255-19), from Sage
/// a = 1070314506888354081329385823235218444233221\
/// 2228051251926706380353716438957572
static A_BYTES: [u8; 32] =
[ 0x04, 0xfe, 0xdf, 0x98, 0xa7, 0xfa, 0x0a, 0x68,
0x84, 0x92, 0xbd, 0x59, 0x08, 0x07, 0xa7, 0x03,
0x9e, 0xd1, 0xf6, 0xf2, 0xe1, 0xd9, 0xe2, 0xa4,
0xa4, 0x51, 0x47, 0x36, 0xf3, 0xc3, 0xa9, 0x17];
/// Byte representation of a**2
static ASQ_BYTES: [u8; 32] =
[ 0x75, 0x97, 0x24, 0x9e, 0xe6, 0x06, 0xfe, 0xab,
0x24, 0x04, 0x56, 0x68, 0x07, 0x91, 0x2d, 0x5d,
0x0b, 0x0f, 0x3f, 0x1c, 0xb2, 0x6e, 0xf2, 0xe2,
0x63, 0x9c, 0x12, 0xba, 0x73, 0x0b, 0xe3, 0x62];
/// Byte representation of 1/a
static AINV_BYTES: [u8; 32] =
[0x96, 0x1b, 0xcd, 0x8d, 0x4d, 0x5e, 0xa2, 0x3a,
0xe9, 0x36, 0x37, 0x93, 0xdb, 0x7b, 0x4d, 0x70,
0xb8, 0x0d, 0xc0, 0x55, 0xd0, 0x4c, 0x1d, 0x7b,
0x90, 0x71, 0xd8, 0xe9, 0xb6, 0x18, 0xe6, 0x30];
/// Byte representation of a^((p-5)/8)
static AP58_BYTES: [u8; 32] =
[0x6a, 0x4f, 0x24, 0x89, 0x1f, 0x57, 0x60, 0x36,
0xd0, 0xbe, 0x12, 0x3c, 0x8f, 0xf5, 0xb1, 0x59,
0xe0, 0xf0, 0xb8, 0x1b, 0x20, 0xd2, 0xb5, 0x1f,
0x15, 0x21, 0xf9, 0xe3, 0xe1, 0x61, 0x21, 0x55];
#[test]
fn a_mul_a_vs_a_squared_constant() {
let a = FieldElement::from_bytes(&A_BYTES);
let asq = FieldElement::from_bytes(&ASQ_BYTES);
assert_eq!(asq, &a * &a);
}
#[test]
fn a_square_vs_a_squared_constant() {
let a = FieldElement::from_bytes(&A_BYTES);
let asq = FieldElement::from_bytes(&ASQ_BYTES);
assert_eq!(asq, a.square());
}
#[test]
fn a_square2_vs_a_squared_constant() {
let a = FieldElement::from_bytes(&A_BYTES);
let asq = FieldElement::from_bytes(&ASQ_BYTES);
assert_eq!(a.square2(), &asq+&asq);
}
#[test]
fn a_invert_vs_inverse_of_a_constant() {
let a = FieldElement::from_bytes(&A_BYTES);
let ainv = FieldElement::from_bytes(&AINV_BYTES);
let should_be_inverse = a.invert();
assert_eq!(ainv, should_be_inverse);
assert_eq!(FieldElement::one(), &a * &should_be_inverse);
}
#[test]
fn batch_invert_a_matches_nonbatched() {
let a = FieldElement::from_bytes(&A_BYTES);
let ap58 = FieldElement::from_bytes(&AP58_BYTES);
let asq = FieldElement::from_bytes(&ASQ_BYTES);
let ainv = FieldElement::from_bytes(&AINV_BYTES);
let a2 = &a + &a;
let a_list = vec![a, ap58, asq, ainv, a2];
let mut ainv_list = a_list.clone();
FieldElement::batch_invert(&mut ainv_list[..]);
for i in 0..5 {
assert_eq!(a_list[i].invert(), ainv_list[i]);
}
}
#[test]
fn sqrt_ratio_behavior() {
let zero = FieldElement::zero();
let one = FieldElement::one();
let i = constants::SQRT_M1;
let two = &one + &one; // 2 is nonsquare mod p.
let four = &two + &two; // 4 is square mod p.
// 0/0 should return (1, 0) since u is 0
let (choice, sqrt) = FieldElement::sqrt_ratio_i(&zero, &zero);
assert_eq!(choice.unwrap_u8(), 1);
assert_eq!(sqrt, zero);
assert_eq!(sqrt.is_negative().unwrap_u8(), 0);
// 1/0 should return (0, 0) since v is 0, u is nonzero
let (choice, sqrt) = FieldElement::sqrt_ratio_i(&one, &zero);
assert_eq!(choice.unwrap_u8(), 0);
assert_eq!(sqrt, zero);
assert_eq!(sqrt.is_negative().unwrap_u8(), 0);
// 2/1 is nonsquare, so we expect (0, sqrt(i*2))
let (choice, sqrt) = FieldElement::sqrt_ratio_i(&two, &one);
assert_eq!(choice.unwrap_u8(), 0);
assert_eq!(sqrt.square(), &two * &i);
assert_eq!(sqrt.is_negative().unwrap_u8(), 0);
// 4/1 is square, so we expect (1, sqrt(4))
let (choice, sqrt) = FieldElement::sqrt_ratio_i(&four, &one);
assert_eq!(choice.unwrap_u8(), 1);
assert_eq!(sqrt.square(), four);
assert_eq!(sqrt.is_negative().unwrap_u8(), 0);
// 1/4 is square, so we expect (1, 1/sqrt(4))
let (choice, sqrt) = FieldElement::sqrt_ratio_i(&one, &four);
assert_eq!(choice.unwrap_u8(), 1);
assert_eq!(&sqrt.square() * &four, one);
assert_eq!(sqrt.is_negative().unwrap_u8(), 0);
}
#[test]
fn a_p58_vs_ap58_constant() {
let a = FieldElement::from_bytes(&A_BYTES);
let ap58 = FieldElement::from_bytes(&AP58_BYTES);
assert_eq!(ap58, a.pow_p58());
}
#[test]
fn equality() {
let a = FieldElement::from_bytes(&A_BYTES);
let ainv = FieldElement::from_bytes(&AINV_BYTES);
assert!(a == a);
assert!(a != ainv);
}
/// Notice that the last element has the high bit set, which
/// should be ignored
static B_BYTES: [u8;32] =
[113, 191, 169, 143, 91, 234, 121, 15,
241, 131, 217, 36, 230, 101, 92, 234,
8, 208, 170, 251, 97, 127, 70, 210,
58, 23, 166, 87, 240, 169, 184, 178];
#[test]
fn from_bytes_highbit_is_ignored() {
let mut cleared_bytes = B_BYTES;
cleared_bytes[31] &= 127u8;
let with_highbit_set = FieldElement::from_bytes(&B_BYTES);
let without_highbit_set = FieldElement::from_bytes(&cleared_bytes);
assert_eq!(without_highbit_set, with_highbit_set);
}
#[test]
fn conditional_negate() {
let one = FieldElement::one();
let minus_one = FieldElement::minus_one();
let mut x = one;
x.conditional_negate(Choice::from(1));
assert_eq!(x, minus_one);
x.conditional_negate(Choice::from(0));
assert_eq!(x, minus_one);
x.conditional_negate(Choice::from(1));
assert_eq!(x, one);
}
#[test]
fn encoding_is_canonical() {
// Encode 1 wrongly as 1 + (2^255 - 19) = 2^255 - 18
let one_encoded_wrongly_bytes: [u8;32] = [0xee, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f];
// Decode to a field element
let one = FieldElement::from_bytes(&one_encoded_wrongly_bytes);
// .. then check that the encoding is correct
let one_bytes = one.to_bytes();
assert_eq!(one_bytes[0], 1);
for i in 1..32 {
assert_eq!(one_bytes[i], 0);
}
}
#[test]
fn batch_invert_empty() {
FieldElement::batch_invert(&mut []);
}
}

View File

@ -1,100 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![no_std]
#![cfg_attr(feature = "nightly", feature(test))]
#![cfg_attr(all(feature = "alloc", not(feature = "std")), feature(alloc))]
#![cfg_attr(feature = "nightly", feature(external_doc))]
#![cfg_attr(feature = "nightly", feature(doc_cfg))]
#![cfg_attr(feature = "simd_backend", feature(stdsimd))]
// Refuse to compile if documentation is missing, but only on nightly.
//
// This means that missing docs will still fail CI, but means we can use
// README.md as the crate documentation.
//#![cfg_attr(feature = "nightly", deny(missing_docs))]
#![cfg_attr(feature = "nightly", doc(include = "../README.md"))]
#![doc(html_logo_url = "https://doc.dalek.rs/assets/dalek-logo-clear.png")]
//! Note that docs will only build on nightly Rust until
//! [RFC 1990 stabilizes](https://github.com/rust-lang/rust/issues/44732).
//------------------------------------------------------------------------
// External dependencies:
//------------------------------------------------------------------------
#[cfg(all(feature = "alloc", not(feature = "std")))]
#[macro_use]
extern crate alloc;
#[cfg(feature = "std")]
#[macro_use]
extern crate std;
#[cfg(all(feature = "nightly", feature = "packed_simd"))]
extern crate packed_simd;
extern crate byteorder;
pub extern crate digest;
extern crate rand_core;
extern crate zeroize;
// Used for traits related to constant-time code.
pub extern crate subtle;
#[cfg(all(test, feature = "serde"))]
extern crate bincode;
#[cfg(feature = "serde")]
extern crate serde;
// Internal macros. Must come first!
#[macro_use]
pub(crate) mod macros;
//------------------------------------------------------------------------
// curve25519-dalek public modules
//------------------------------------------------------------------------
// Scalar arithmetic mod l = 2^252 + ..., the order of the Ristretto group
pub mod scalar;
// Point operations on the Montgomery form of Curve25519
pub mod montgomery;
// Point operations on the Edwards form of Curve25519
pub mod edwards;
// Group operations on the Ristretto group
pub mod ristretto;
// Useful constants, like the Ed25519 basepoint
pub mod constants;
// External (and internal) traits.
pub mod traits;
// All the lizard code is here, for now
pub mod lizard;
//------------------------------------------------------------------------
// curve25519-dalek internal modules
//------------------------------------------------------------------------
// Finite field arithmetic mod p = 2^255 - 19
pub mod field;
// Arithmetic backends (using u32, u64, etc) live here
pub(crate) mod backend;
// Crate-local prelude (for alloc-dependent features like `Vec`)
pub(crate) mod prelude;
// Generic code for window lookups
pub(crate) mod window;

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) 2019 Bas Westerbaan
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,74 +0,0 @@
//! Helper functions for use with Lizard
#![allow(non_snake_case)]
use subtle::Choice;
use subtle::ConstantTimeEq;
use subtle::ConditionallyNegatable;
use subtle::ConditionallySelectable;
use constants;
use lizard::lizard_constants;
use field::FieldElement;
/// Represents a point (s,t) on the Jacobi quartic associated
/// to the Edwards curve.
#[derive(Copy, Clone)]
#[allow(missing_docs)]
pub struct JacobiPoint {
pub S: FieldElement,
pub T: FieldElement,
}
impl JacobiPoint {
/// Elligator2 is defined in two steps: first a field element is converted
/// to a point (s,t) on the Jacobi quartic associated to the Edwards curve.
/// Then this point is mapped to a point on the Edwards curve.
/// This function computes a field element that is mapped to a given (s,t)
/// with Elligator2 if it exists.
pub(crate) fn elligator_inv(&self) -> (Choice, FieldElement) {
let mut out = FieldElement::zero();
// Special case: s = 0. If s is zero, either t = 1 or t = -1.
// If t=1, then sqrt(i*d) is the preimage. Otherwise it's 0.
let s_is_zero = self.S.is_zero();
let t_equals_one = self.T.ct_eq(&FieldElement::one());
out.conditional_assign(&lizard_constants::SQRT_ID, t_equals_one);
let mut ret = s_is_zero;
let mut done = s_is_zero;
// a := (t+1) (d+1)/(d-1)
let a = &(&self.T + &FieldElement::one()) * &lizard_constants::DP1_OVER_DM1;
let a2 = a.square();
// y := 1/sqrt(i (s^4 - a^2)).
let s2 = self.S.square();
let s4 = s2.square();
let invSqY = &(&s4 - &a2) * &constants::SQRT_M1;
// There is no preimage if the square root of i*(s^4-a^2) does not exist.
let (sq, y) = invSqY.invsqrt();
ret |= sq;
done |= !sq;
// x := (a + sign(s)*s^2) y
let mut pms2 = s2;
pms2.conditional_negate(self.S.is_negative());
let mut x = &(&a + &pms2) * &y;
let x_is_negative = x.is_negative();
x.conditional_negate(x_is_negative);
out.conditional_assign(&x, !done);
(ret, out)
}
pub(crate) fn dual(&self) -> JacobiPoint {
JacobiPoint {
S: -(&self.S),
T: -(&self.T),
}
}
}

View File

@ -1,54 +0,0 @@
//! Constants for use in Lizard
//!
//! Could be moved into backend/serial/u??/constants.rs
#[cfg(feature = "u64_backend")]
pub(crate) use lizard::u64_constants::*;
#[cfg(feature = "u32_backend")]
pub(crate) use lizard::u32_constants::*;
// ------------------------------------------------------------------------
// Tests
// ------------------------------------------------------------------------
#[cfg(all(test, feature = "stage2_build"))]
mod test {
use super::*;
use constants;
use field::FieldElement;
#[test]
fn test_lizard_constants() {
let (_, sqrt_id) = FieldElement::sqrt_ratio_i(
&(&constants::SQRT_M1 * &constants::EDWARDS_D),
&FieldElement::one()
);
assert_eq!(sqrt_id, SQRT_ID);
assert_eq!(
&(&constants::EDWARDS_D + &FieldElement::one())
* &(&constants::EDWARDS_D - &FieldElement::one()).invert(),
DP1_OVER_DM1
);
assert_eq!(
MDOUBLE_INVSQRT_A_MINUS_D,
-&(&constants::INVSQRT_A_MINUS_D + &constants::INVSQRT_A_MINUS_D)
);
assert_eq!(
MIDOUBLE_INVSQRT_A_MINUS_D,
&MDOUBLE_INVSQRT_A_MINUS_D * &constants::SQRT_M1
);
let (_, invsqrt_one_plus_d) = (
&constants::EDWARDS_D + &FieldElement::one()).invsqrt();
assert_eq!(
-&invsqrt_one_plus_d,
MINVSQRT_ONE_PLUS_D
);
}
}

View File

@ -1,305 +0,0 @@
//! Defines additional methods on RistrettoPoint for Lizard
#![allow(non_snake_case)]
use digest::Digest;
use digest::generic_array::typenum::U32;
use constants;
use field::FieldElement;
use subtle::ConditionallySelectable;
use subtle::ConstantTimeEq;
use subtle::Choice;
use edwards::EdwardsPoint;
use lizard::jacobi_quartic::JacobiPoint;
use lizard::lizard_constants;
#[allow(unused_imports)]
use prelude::*;
use ristretto::RistrettoPoint;
impl RistrettoPoint {
pub fn from_uniform_bytes_single_elligator(bytes: &[u8; 32]) -> RistrettoPoint {
RistrettoPoint::elligator_ristretto_flavor(&FieldElement::from_bytes(&bytes))
}
/// Encode 16 bytes of data to a RistrettoPoint, using the Lizard method
pub fn lizard_encode<D: Digest>(data: &[u8; 16]) -> RistrettoPoint
where D: Digest<OutputSize = U32>
{
let mut fe_bytes: [u8;32] = Default::default();
let digest = D::digest(data);
fe_bytes[0..32].copy_from_slice(digest.as_slice());
fe_bytes[8..24].copy_from_slice(data);
fe_bytes[0] &= 254; // make positive since Elligator on r and -r is the same
fe_bytes[31] &= 63;
let fe = FieldElement::from_bytes(&fe_bytes);
RistrettoPoint::elligator_ristretto_flavor(&fe)
}
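    // Editor's note (added for this writeup, not part of the vendored file):
    // the resulting 32-byte field element is laid out as
    //
    //     fe_bytes[0..8]   = H(data)[0..8]    (hash prefix)
    //     fe_bytes[8..24]  = data             (the 16-byte payload)
    //     fe_bytes[24..32] = H(data)[24..32]  (hash suffix)
    //
    // with the two mask operations making the element positive and < 2^254,
    // so lizard_decode can recompute H over fe_bytes[8..24] and check that
    // the surrounding hash bytes match.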
/// Decode 16 bytes of data from a RistrettoPoint, using the Lizard method
pub fn lizard_decode<D: Digest>(&self) -> Option<[u8; 16]>
where D: Digest<OutputSize = U32>
{
let mut result: [u8; 16] = Default::default();
let mut h: [u8;32] = Default::default();
let (mask, fes) = self.elligator_ristretto_flavor_inverse();
let mut n_found = 0;
for j in 0..8 {
let mut ok = Choice::from((mask >> j) & 1);
let buf2 = fes[j].to_bytes(); // array
h.copy_from_slice(&D::digest(&buf2[8..24])); // array
h[8..24].copy_from_slice(&buf2[8..24]);
h[0] &= 254;
h[31] &= 63;
ok &= h.ct_eq(&buf2);
for i in 0..16 {
result[i] = u8::conditional_select(&result[i], &buf2[8+i], ok);
}
n_found += ok.unwrap_u8();
}
if n_found == 1 {
return Some(result);
}
else {
return None;
}
}
pub fn encode_253_bits(data: &[u8; 32]) -> Option<RistrettoPoint>
{
if data.len() != 32 {
return None;
}
let fe = FieldElement::from_bytes(data);
let p = RistrettoPoint::elligator_ristretto_flavor(&fe);
Some(p)
}
pub fn decode_253_bits(&self) -> (u8, [[u8; 32]; 8])
{
let mut ret = [ [0u8; 32]; 8];
let (mask, fes) = self.elligator_ristretto_flavor_inverse();
for j in 0..8 {
ret[j] = fes[j].to_bytes();
}
(mask, ret)
}
/// Return the coset self + E[4], for debugging.
pub fn xcoset4(&self) -> [EdwardsPoint; 4] {
[ self.0
, &self.0 + &constants::EIGHT_TORSION[2]
, &self.0 + &constants::EIGHT_TORSION[4]
, &self.0 + &constants::EIGHT_TORSION[6]
]
}
/// Computes the at most 8 positive FieldElements f such that
/// self == elligator_ristretto_flavor(f).
/// Assumes self is even.
///
/// Returns a bitmask of which elements in fes are set.
pub fn elligator_ristretto_flavor_inverse(&self) -> (u8, [FieldElement; 8]) {
// Elligator2 computes a Point from a FieldElement in two steps: first
// it computes a (s,t) on the Jacobi quartic and then computes the
// corresponding even point on the Edwards curve.
//
// We invert in three steps. Any Ristretto point has four representatives
// as even Edwards points. For each of those even Edwards points,
// there are two points on the Jacobi quartic that map to it.
// Each of those eight points on the Jacobi quartic might have an
// Elligator2 preimage.
//
// Essentially we first loop over the four representatives of our point,
// then for each of them consider both points on the Jacobi quartic and
// check whether they have an inverse under Elligator2. We take the
// following shortcut though.
//
// We can compute two Jacobi quartic points for (x,y) and (-x,-y)
// at the same time. The four Jacobi quartic points are two of
// such pairs.
let mut mask : u8 = 0;
let jcs = self.to_jacobi_quartic_ristretto();
let mut ret = [FieldElement::one(); 8];
for i in 0..4 {
let (ok, fe) = jcs[i].elligator_inv();
let mut tmp : u8 = 0;
ret[2*i] = fe;
tmp.conditional_assign(&1, ok);
mask |= tmp << (2 * i);
let jc = jcs[i].dual();
let (ok, fe) = jc.elligator_inv();
let mut tmp : u8 = 0;
ret[2*i+1] = fe;
tmp.conditional_assign(&1, ok);
mask |= tmp << (2 * i + 1);
}
return (mask, ret)
}
/// Find a point on the Jacobi quartic associated to each of the four
/// points Ristretto equivalent to p.
///
/// There is one exception: for (0,-1) there is no point on the quartic and
/// so we repeat one on the quartic equivalent to (0,1).
fn to_jacobi_quartic_ristretto(&self) -> [JacobiPoint; 4] {
let x2 = self.0.X.square(); // X^2
let y2 = self.0.Y.square(); // Y^2
let y4 = y2.square(); // Y^4
let z2 = self.0.Z.square(); // Z^2
let z_min_y = &self.0.Z - &self.0.Y; // Z - Y
let z_pl_y = &self.0.Z + &self.0.Y; // Z + Y
let z2_min_y2 = &z2 - &y2; // Z^2 - Y^2
// gamma := 1/sqrt( Y^4 X^2 (Z^2 - Y^2) )
let (_, gamma) = (&(&y4 * &x2) * &z2_min_y2).invsqrt();
let den = &gamma * &y2;
let s_over_x = &den * &z_min_y;
let sp_over_xp = &den * &z_pl_y;
let s0 = &s_over_x * &self.0.X;
let s1 = &(-(&sp_over_xp)) * &self.0.X;
// t_0 := -2/sqrt(-d-1) * Z * sOverX
// t_1 := -2/sqrt(-d-1) * Z * spOverXp
let tmp = &lizard_constants::MDOUBLE_INVSQRT_A_MINUS_D * &self.0.Z;
let mut t0 = &tmp * &s_over_x;
let mut t1 = &tmp * &sp_over_xp;
// den := -1/sqrt(1+d) (Y^2 - Z^2) gamma
let den = &(&(-(&z2_min_y2)) * &lizard_constants::MINVSQRT_ONE_PLUS_D) * &gamma;
// Same as before but with the substitution (X, Y, Z) = (Y, X, i*Z)
let iz = &constants::SQRT_M1 * &self.0.Z; // iZ
let iz_min_x = &iz - &self.0.X; // iZ - X
let iz_pl_x = &iz + &self.0.X; // iZ + X
let s_over_y = &den * &iz_min_x;
let sp_over_yp = &den * &iz_pl_x;
let mut s2 = &s_over_y * &self.0.Y;
let mut s3 = &(-(&sp_over_yp)) * &self.0.Y;
// t_2 := -2/sqrt(-d-1) * i*Z * sOverY
// t_3 := -2/sqrt(-d-1) * i*Z * spOverYp
let tmp = &lizard_constants::MDOUBLE_INVSQRT_A_MINUS_D * &iz;
let mut t2 = &tmp * &s_over_y;
let mut t3 = &tmp * &sp_over_yp;
// Special case: X=0 or Y=0. Then return
//
// (0,1) (1,-2i/sqrt(-d-1) (-1,-2i/sqrt(-d-1))
//
// Note that if X=0 or Y=0, then s_i = t_i = 0.
let x_or_y_is_zero = self.0.X.is_zero() | self.0.Y.is_zero();
t0.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
t1.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
t2.conditional_assign(&lizard_constants::MIDOUBLE_INVSQRT_A_MINUS_D, x_or_y_is_zero);
t3.conditional_assign(&lizard_constants::MIDOUBLE_INVSQRT_A_MINUS_D, x_or_y_is_zero);
s2.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
s3.conditional_assign(&(-(&FieldElement::one())), x_or_y_is_zero);
return [
JacobiPoint{S: s0, T: t0},
JacobiPoint{S: s1, T: t1},
JacobiPoint{S: s2, T: t2},
JacobiPoint{S: s3, T: t3},
]
}
}
// ------------------------------------------------------------------------
// Tests
// ------------------------------------------------------------------------
#[cfg(all(test, feature = "stage2_build"))]
mod test {
extern crate sha2;
#[cfg(feature = "rand")]
use rand_os::OsRng;
use rand_core::{RngCore};
use self::sha2::{Sha256};
use ristretto::CompressedRistretto;
use super::*;
fn test_lizard_encode_helper(data: &[u8; 16], result: &[u8; 32]) {
let p = RistrettoPoint::lizard_encode::<Sha256>(data);
let p_bytes = p.compress().to_bytes();
assert!(&p_bytes == result);
let p = CompressedRistretto::from_slice(&p_bytes).decompress().unwrap();
let data_out = p.lizard_decode::<Sha256>().unwrap();
assert!(&data_out == data);
}
#[test]
fn test_lizard_encode() {
test_lizard_encode_helper(&[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
&[0xf0, 0xb7, 0xe3, 0x44, 0x84, 0xf7, 0x4c, 0xf0, 0xf, 0x15, 0x2, 0x4b, 0x73, 0x85, 0x39, 0x73, 0x86, 0x46, 0xbb, 0xbe, 0x1e, 0x9b, 0xc7, 0x50, 0x9a, 0x67, 0x68, 0x15, 0x22, 0x7e, 0x77, 0x4f]);
test_lizard_encode_helper(&[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
&[0xcc, 0x92, 0xe8, 0x1f, 0x58, 0x5a, 0xfc, 0x5c, 0xaa, 0xc8, 0x86, 0x60, 0xd8, 0xd1, 0x7e, 0x90, 0x25, 0xa4, 0x44, 0x89, 0xa3, 0x63, 0x4, 0x21, 0x23, 0xf6, 0xaf, 0x7, 0x2, 0x15, 0x6e, 0x65]);
test_lizard_encode_helper(&[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
&[0xc8, 0x30, 0x57, 0x3f, 0x8a, 0x8e, 0x77, 0x78, 0x67, 0x1f, 0x76, 0xcd, 0xc7, 0x96, 0xdc, 0xa, 0x23, 0x5c, 0xf1, 0x77, 0xf1, 0x97, 0xd9, 0xfc, 0xba, 0x6, 0xe8, 0x4e, 0x96, 0x24, 0x74, 0x44]);
}
#[test]
fn test_elligator_inv() {
let mut rng = rand::thread_rng();
for i in 0..100 {
let mut fe_bytes = [0u8; 32];
if i == 0 {
// Test for first corner-case: fe = 0
fe_bytes = [0u8; 32];
} else if i == 1 {
// Test for second corner-case: fe = +sqrt(i*d)
fe_bytes = [168, 27, 92, 74, 203, 42, 48, 117, 170, 109, 234,
14, 45, 169, 188, 205, 21, 110, 235, 115, 153, 84,
52, 117, 151, 235, 123, 244, 88, 85, 179, 5];
} else {
// For the rest, just generate a random field element to test.
rng.fill_bytes(&mut fe_bytes);
}
fe_bytes[0] &= 254; // positive
fe_bytes[31] &= 127; // < 2^255-19
let fe = FieldElement::from_bytes(&fe_bytes);
let pt = RistrettoPoint::elligator_ristretto_flavor(&fe);
for pt2 in &pt.xcoset4() {
let (mask, fes) = RistrettoPoint(*pt2).elligator_ristretto_flavor_inverse();
let mut found = false;
for j in 0..8 {
if mask & (1 << j) != 0 {
assert_eq!(RistrettoPoint::elligator_ristretto_flavor(&fes[j]), pt);
if fes[j] == fe {
found = true;
}
}
}
assert!(found);
}
}
}
}

View File

@ -1,13 +0,0 @@
//! The Lizard method for encoding/decoding 16 bytes into Ristretto points.
#![allow(non_snake_case)]
#[cfg(feature = "u32_backend")]
mod u32_constants;
#[cfg(feature = "u64_backend")]
mod u64_constants;
pub mod lizard_constants;
pub mod jacobi_quartic;
pub mod lizard_ristretto;

View File

@ -1,33 +0,0 @@
use backend::serial::u32::field::FieldElement2625;
use edwards::EdwardsPoint;
/// `= sqrt(i*d)`, where `i = +sqrt(-1)` and `d` is the Edwards curve parameter.
pub const SQRT_ID: FieldElement2625 = FieldElement2625([
39590824, 701138, 28659366, 23623507, 53932708,
32206357, 36326585, 24309414, 26167230, 1494357,
]);
/// `= (d+1)/(d-1)`, where `d` is the Edwards curve parameter.
pub const DP1_OVER_DM1: FieldElement2625 = FieldElement2625([
58833708, 32184294, 62457071, 26110240, 19032991,
27203620, 7122892, 18068959, 51019405, 3776288,
]);
/// `= -2/sqrt(a-d)`, where `a = -1 (mod p)` and `d` are the Edwards curve parameters.
pub const MDOUBLE_INVSQRT_A_MINUS_D: FieldElement2625 = FieldElement2625([
54885894, 25242303, 55597453, 9067496, 51808079,
33312638, 25456129, 14121551, 54921728, 3972023,
]);
/// `= -2i/sqrt(a-d)`, where `a = -1 (mod p)` and `d` are the Edwards curve parameters,
/// and `i = +sqrt(-1)`.
pub const MIDOUBLE_INVSQRT_A_MINUS_D: FieldElement2625 = FieldElement2625([
58178520, 23970840, 26444491, 29801899, 41064376,
743696, 2900628, 27920316, 41968995, 5270573,
]);
/// `= -1/sqrt(1+d)`, where `d` is the Edwards curve parameter.
pub const MINVSQRT_ONE_PLUS_D: FieldElement2625 = FieldElement2625([
38019585, 4791795, 20332186, 18653482, 46576675,
33182583, 65658549, 2817057, 12569934, 30919145,
]);

View File

@ -1,18 +0,0 @@
use backend::serial::u64::field::FieldElement51;
/// `= sqrt(i*d)`, where `i = +sqrt(-1)` and `d` is the Edwards curve parameter.
pub const SQRT_ID: FieldElement51 = FieldElement51([2298852427963285, 3837146560810661, 4413131899466403, 3883177008057528, 2352084440532925]);
/// `= (d+1)/(d-1)`, where `d` is the Edwards curve parameter.
pub const DP1_OVER_DM1: FieldElement51 = FieldElement51([2159851467815724, 1752228607624431, 1825604053920671, 1212587319275468, 253422448836237]);
/// `= -2/sqrt(a-d)`, where `a = -1 (mod p)` and `d` are the Edwards curve parameters.
pub const MDOUBLE_INVSQRT_A_MINUS_D: FieldElement51 = FieldElement51([1693982333959686, 608509411481997, 2235573344831311, 947681270984193, 266558006233600]);
/// `= -2i/sqrt(a-d)`, where `a = -1 (mod p)` and `d` are the Edwards curve parameters,
/// and `i = +sqrt(-1)`.
pub const MIDOUBLE_INVSQRT_A_MINUS_D: FieldElement51 = FieldElement51([1608655899704280, 1999971613377227, 49908634785720, 1873700692181652, 353702208628067]);
/// `= -1/sqrt(1+d)`, where `d` is the Edwards curve parameter.
pub const MINVSQRT_ONE_PLUS_D: FieldElement51 = FieldElement51([321571956990465, 1251814006996634, 2226845496292387, 189049560751797, 2074948709371214]);

View File

@ -1,123 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Internal macros.
/// Define borrow and non-borrow variants of `Add`.
macro_rules! define_add_variants {
(LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
impl<'b> Add<&'b $rhs> for $lhs {
type Output = $out;
fn add(self, rhs: &'b $rhs) -> $out {
&self + rhs
}
}
impl<'a> Add<$rhs> for &'a $lhs {
type Output = $out;
fn add(self, rhs: $rhs) -> $out {
self + &rhs
}
}
impl Add<$rhs> for $lhs {
type Output = $out;
fn add(self, rhs: $rhs) -> $out {
&self + &rhs
}
}
}
}
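// Editor's illustration (added for this writeup, not part of the vendored
// file): how a macro like the one above is meant to be used. Given a single
// borrow-based core impl on a toy wrapper type, one invocation derives the
// three owned/mixed-ownership forms from it.
#[cfg(test)]
mod define_add_variants_sketch {
    use core::ops::Add;

    #[derive(Clone, Copy, PartialEq, Debug)]
    struct Toy(u8);

    // The one "real" implementation, written on borrows:
    impl<'a, 'b> Add<&'b Toy> for &'a Toy {
        type Output = Toy;
        fn add(self, rhs: &'b Toy) -> Toy {
            Toy(self.0.wrapping_add(rhs.0))
        }
    }

    // One macro call fans it out to Toy + &Toy, &Toy + Toy, and Toy + Toy:
    define_add_variants!(LHS = Toy, RHS = Toy, Output = Toy);

    #[test]
    fn all_four_forms_compile_and_agree() {
        let (a, b) = (Toy(200), Toy(100));
        let expected = &a + &b;
        assert_eq!(a + &b, expected);
        assert_eq!(&a + b, expected);
        assert_eq!(a + b, expected);
    }
}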
/// Define non-borrow variants of `AddAssign`.
macro_rules! define_add_assign_variants {
(LHS = $lhs:ty, RHS = $rhs:ty) => {
impl AddAssign<$rhs> for $lhs {
fn add_assign(&mut self, rhs: $rhs) {
*self += &rhs;
}
}
}
}
/// Define borrow and non-borrow variants of `Sub`.
macro_rules! define_sub_variants {
(LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
impl<'b> Sub<&'b $rhs> for $lhs {
type Output = $out;
fn sub(self, rhs: &'b $rhs) -> $out {
&self - rhs
}
}
impl<'a> Sub<$rhs> for &'a $lhs {
type Output = $out;
fn sub(self, rhs: $rhs) -> $out {
self - &rhs
}
}
impl Sub<$rhs> for $lhs {
type Output = $out;
fn sub(self, rhs: $rhs) -> $out {
&self - &rhs
}
}
}
}
/// Define non-borrow variants of `SubAssign`.
macro_rules! define_sub_assign_variants {
(LHS = $lhs:ty, RHS = $rhs:ty) => {
impl SubAssign<$rhs> for $lhs {
fn sub_assign(&mut self, rhs: $rhs) {
*self -= &rhs;
}
}
}
}
/// Define borrow and non-borrow variants of `Mul`.
macro_rules! define_mul_variants {
(LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
impl<'b> Mul<&'b $rhs> for $lhs {
type Output = $out;
fn mul(self, rhs: &'b $rhs) -> $out {
&self * rhs
}
}
impl<'a> Mul<$rhs> for &'a $lhs {
type Output = $out;
fn mul(self, rhs: $rhs) -> $out {
self * &rhs
}
}
impl Mul<$rhs> for $lhs {
type Output = $out;
fn mul(self, rhs: $rhs) -> $out {
&self * &rhs
}
}
}
}
/// Define non-borrow variants of `MulAssign`.
macro_rules! define_mul_assign_variants {
(LHS = $lhs:ty, RHS = $rhs:ty) => {
impl MulAssign<$rhs> for $lhs {
fn mul_assign(&mut self, rhs: $rhs) {
*self *= &rhs;
}
}
}
}

View File

@ -1,403 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Scalar multiplication on the Montgomery form of Curve25519.
//!
//! To avoid notational confusion with the Edwards code, we use
//! variables \\( u, v \\) for the Montgomery curve, so that "Montgomery
//! \\(u\\)" here corresponds to "Montgomery \\(x\\)" elsewhere.
//!
//! Montgomery arithmetic works not on the curve itself, but on the
//! \\(u\\)-line, which discards sign information and unifies the curve
//! and its quadratic twist. See [_Montgomery curves and their
//! arithmetic_][costello-smith] by Costello and Smith for more details.
//!
//! The `MontgomeryPoint` struct contains the affine \\(u\\)-coordinate
//! \\(u\_0(P)\\) of a point \\(P\\) on either the curve or the twist.
//! Here the map \\(u\_0 : \mathcal M \rightarrow \mathbb F\_p \\) is
//! defined by \\(u\_0((u,v)) = u\\); \\(u\_0(\mathcal O) = 0\\). See
//! section 5.4 of Costello-Smith for more details.
//!
//! # Scalar Multiplication
//!
//! Scalar multiplication on `MontgomeryPoint`s is provided by the `*`
//! operator, which implements the Montgomery ladder.
//!
//! # Edwards Conversion
//!
//! The \\(2\\)-to-\\(1\\) map from the Edwards model to the Montgomery
//! \\(u\\)-line is provided by `EdwardsPoint::to_montgomery()`.
//!
//! To lift a `MontgomeryPoint` to an `EdwardsPoint`, use
//! `MontgomeryPoint::to_edwards()`, which takes a sign parameter.
//! This function rejects `MontgomeryPoints` which correspond to points
//! on the twist.
//!
//! [costello-smith]: https://eprint.iacr.org/2017/212.pdf
// We allow non snake_case names because coordinates in projective space are
// traditionally denoted by the capitalisation of their respective
// counterparts in affine space. Yeah, you heard me, rustc, I'm gonna have my
// affine and projective cakes and eat both of them too.
#![allow(non_snake_case)]
use core::ops::{Mul, MulAssign};
use constants::APLUS2_OVER_FOUR;
use edwards::{CompressedEdwardsY, EdwardsPoint};
use field::FieldElement;
use scalar::Scalar;
use traits::Identity;
use subtle::Choice;
use subtle::ConditionallySelectable;
use subtle::ConstantTimeEq;
use zeroize::Zeroize;
/// Holds the \\(u\\)-coordinate of a point on the Montgomery form of
/// Curve25519 or its twist.
#[derive(Copy, Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct MontgomeryPoint(pub [u8; 32]);
/// Equality of `MontgomeryPoint`s is defined mod p.
impl ConstantTimeEq for MontgomeryPoint {
fn ct_eq(&self, other: &MontgomeryPoint) -> Choice {
let self_fe = FieldElement::from_bytes(&self.0);
let other_fe = FieldElement::from_bytes(&other.0);
self_fe.ct_eq(&other_fe)
}
}
impl Default for MontgomeryPoint {
fn default() -> MontgomeryPoint {
MontgomeryPoint([0u8; 32])
}
}
impl PartialEq for MontgomeryPoint {
fn eq(&self, other: &MontgomeryPoint) -> bool {
self.ct_eq(other).unwrap_u8() == 1u8
}
}
impl Eq for MontgomeryPoint {}
impl Zeroize for MontgomeryPoint {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl MontgomeryPoint {
/// View this `MontgomeryPoint` as an array of bytes.
pub fn as_bytes<'a>(&'a self) -> &'a [u8; 32] {
&self.0
}
/// Convert this `MontgomeryPoint` to an array of bytes.
pub fn to_bytes(&self) -> [u8; 32] {
self.0
}
/// Attempt to convert to an `EdwardsPoint`, using the supplied
/// choice of sign for the `EdwardsPoint`.
///
/// # Inputs
///
/// * `sign`: a `u8` denoting the desired sign of the resulting
/// `EdwardsPoint`. `0` denotes positive and `1` negative.
///
/// # Return
///
/// * `Some(EdwardsPoint)` if `self` is the \\(u\\)-coordinate of a
/// point on (the Montgomery form of) Curve25519;
///
/// * `None` if `self` is the \\(u\\)-coordinate of a point on the
/// twist of (the Montgomery form of) Curve25519;
///
pub fn to_edwards(&self, sign: u8) -> Option<EdwardsPoint> {
// To decompress the Montgomery u coordinate to an
// `EdwardsPoint`, we apply the birational map to obtain the
// Edwards y coordinate, then do Edwards decompression.
//
// The birational map is y = (u-1)/(u+1).
//
// The exceptional points are the zeros of the denominator,
// i.e., u = -1.
//
// But when u = -1, v^2 = u*(u^2+486662*u+1) = 486660.
//
// Since this is nonsquare mod p, u = -1 corresponds to a point
// on the twist, not the curve, so we can reject it early.
let u = FieldElement::from_bytes(&self.0);
if u == FieldElement::minus_one() { return None; }
let one = FieldElement::one();
let y = &(&u - &one) * &(&u + &one).invert();
let mut y_bytes = y.to_bytes();
y_bytes[31] ^= sign << 7;
CompressedEdwardsY(y_bytes).decompress()
}
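    // Editor's note (added for this writeup, not part of the vendored file):
    // for the basepoint, this map sends Montgomery u = 9 (X25519_BASEPOINT)
    // to Edwards y = (9 - 1)/(9 + 1) = 4/5 mod p, which is exactly the value
    // encoded by ED25519_BASEPOINT_COMPRESSED in constants.rs.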
}
/// A `ProjectivePoint` holds a point on the projective line
/// \\( \mathbb P(\mathbb F\_p) \\), which we identify with the Kummer
/// line of the Montgomery curve.
#[derive(Copy, Clone, Debug)]
struct ProjectivePoint {
pub U: FieldElement,
pub W: FieldElement,
}
impl Identity for ProjectivePoint {
fn identity() -> ProjectivePoint {
ProjectivePoint {
U: FieldElement::one(),
W: FieldElement::zero(),
}
}
}
impl Default for ProjectivePoint {
fn default() -> ProjectivePoint {
ProjectivePoint::identity()
}
}
impl ConditionallySelectable for ProjectivePoint {
fn conditional_select(
a: &ProjectivePoint,
b: &ProjectivePoint,
choice: Choice,
) -> ProjectivePoint {
ProjectivePoint {
U: FieldElement::conditional_select(&a.U, &b.U, choice),
W: FieldElement::conditional_select(&a.W, &b.W, choice),
}
}
}
impl ProjectivePoint {
/// Dehomogenize this point to affine coordinates.
///
/// # Return
///
/// * \\( u = U / W \\) if \\( W \neq 0 \\);
/// * \\( 0 \\) if \\( W = 0 \\);
pub fn to_affine(&self) -> MontgomeryPoint {
let u = &self.U * &self.W.invert();
MontgomeryPoint(u.to_bytes())
}
}
/// Perform the double-and-add step of the Montgomery ladder.
///
/// Given projective points
/// \\( (U\_P : W\_P) = u(P) \\),
/// \\( (U\_Q : W\_Q) = u(Q) \\),
/// and the affine difference
/// \\( u\_{P-Q} = u(P-Q) \\), set
/// $$
/// (U\_P : W\_P) \gets u([2]P)
/// $$
/// and
/// $$
/// (U\_Q : W\_Q) \gets u(P + Q).
/// $$
fn differential_add_and_double(
P: &mut ProjectivePoint,
Q: &mut ProjectivePoint,
affine_PmQ: &FieldElement,
) {
let t0 = &P.U + &P.W;
let t1 = &P.U - &P.W;
let t2 = &Q.U + &Q.W;
let t3 = &Q.U - &Q.W;
let t4 = t0.square(); // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2
let t5 = t1.square(); // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2
let t6 = &t4 - &t5; // 4 U_P W_P
let t7 = &t0 * &t3; // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q
let t8 = &t1 * &t2; // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q
let t9 = &t7 + &t8; // 2 (U_P U_Q - W_P W_Q)
let t10 = &t7 - &t8; // 2 (W_P U_Q - U_P W_Q)
let t11 = t9.square(); // 4 (U_P U_Q - W_P W_Q)^2
let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2
let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P W_P
let t14 = &t4 * &t5; // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2
let t15 = &t13 + &t5; // (U_P - W_P)^2 + (A + 2) U_P W_P
let t16 = &t6 * &t15; // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P)
let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2
let t18 = t11; // W_D * 4 (U_P U_Q - W_P W_Q)^2
P.U = t14; // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2
P.W = t16; // W_{P'} = (4 U_P W_P) ((U_P - W_P)^2 + ((A + 2)/4) 4 U_P W_P)
Q.U = t18; // U_{Q'} = W_D * 4 (U_P U_Q - W_P W_Q)^2
Q.W = t17; // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2
}
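// Operation count for the step above (derived by inspection of this file):
// 4 squarings (t4, t5, t11, t12), 5 field multiplications (t7, t8, t14, t16,
// t17), and one multiplication by the small constant (A + 2)/4 (t13), i.e.
// the usual 5M + 4S + 1C cost per combined double-and-add ladder step.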
define_mul_assign_variants!(LHS = MontgomeryPoint, RHS = Scalar);
define_mul_variants!(LHS = MontgomeryPoint, RHS = Scalar, Output = MontgomeryPoint);
define_mul_variants!(LHS = Scalar, RHS = MontgomeryPoint, Output = MontgomeryPoint);
/// Multiply this `MontgomeryPoint` by a `Scalar`.
impl<'a, 'b> Mul<&'b Scalar> for &'a MontgomeryPoint {
type Output = MontgomeryPoint;
/// Given `self` \\( = u\_0(P) \\), and a `Scalar` \\(n\\), return \\( u\_0([n]P) \\).
fn mul(self, scalar: &'b Scalar) -> MontgomeryPoint {
// Algorithm 8 of Costello-Smith 2017
let affine_u = FieldElement::from_bytes(&self.0);
let mut x0 = ProjectivePoint::identity();
let mut x1 = ProjectivePoint {
U: affine_u,
W: FieldElement::one(),
};
let bits: [i8; 256] = scalar.bits();
for i in (0..255).rev() {
let choice: u8 = (bits[i + 1] ^ bits[i]) as u8;
debug_assert!(choice == 0 || choice == 1);
ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into());
differential_add_and_double(&mut x0, &mut x1, &affine_u);
}
ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8));
x0.to_affine()
}
}
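// A small consistency sketch (illustrative, not part of the original file):
// since u([b]([a]P)) = u([ab]P), ladder multiplication through u-coordinates
// composes multiplicatively.
//
//     use curve25519_dalek::constants;
//     use curve25519_dalek::scalar::Scalar;
//
//     let u_p = constants::X25519_BASEPOINT;
//     let (a, b) = (Scalar::from(3u64), Scalar::from(7u64));
//     assert_eq!(&(&u_p * &a) * &b, &u_p * &(a * b));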
impl<'b> MulAssign<&'b Scalar> for MontgomeryPoint {
fn mul_assign(&mut self, scalar: &'b Scalar) {
*self = (self as &MontgomeryPoint) * scalar;
}
}
impl<'a, 'b> Mul<&'b MontgomeryPoint> for &'a Scalar {
type Output = MontgomeryPoint;
fn mul(self, point: &'b MontgomeryPoint) -> MontgomeryPoint {
point * self
}
}
// ------------------------------------------------------------------------
// Tests
// ------------------------------------------------------------------------
#[cfg(test)]
mod test {
use constants;
use super::*;
use rand_core::OsRng;
#[test]
#[cfg(feature = "serde")]
fn serde_bincode_basepoint_roundtrip() {
use bincode;
let encoded = bincode::serialize(&constants::X25519_BASEPOINT).unwrap();
let decoded: MontgomeryPoint = bincode::deserialize(&encoded).unwrap();
assert_eq!(encoded.len(), 32);
assert_eq!(decoded, constants::X25519_BASEPOINT);
let raw_bytes = constants::X25519_BASEPOINT.as_bytes();
let bp: MontgomeryPoint = bincode::deserialize(raw_bytes).unwrap();
assert_eq!(bp, constants::X25519_BASEPOINT);
}
/// Test Montgomery -> Edwards on the X/Ed25519 basepoint
#[test]
fn basepoint_montgomery_to_edwards() {
// sign bit = 0 => basepoint
assert_eq!(
constants::ED25519_BASEPOINT_POINT,
constants::X25519_BASEPOINT.to_edwards(0).unwrap()
);
// sign bit = 1 => minus basepoint
assert_eq!(
- constants::ED25519_BASEPOINT_POINT,
constants::X25519_BASEPOINT.to_edwards(1).unwrap()
);
}
/// Test Edwards -> Montgomery on the X/Ed25519 basepoint
#[test]
fn basepoint_edwards_to_montgomery() {
assert_eq!(
constants::ED25519_BASEPOINT_POINT.to_montgomery(),
constants::X25519_BASEPOINT
);
}
/// Check that Montgomery -> Edwards fails for points on the twist.
#[test]
fn montgomery_to_edwards_rejects_twist() {
let one = FieldElement::one();
// u = 2 corresponds to a point on the twist.
let two = MontgomeryPoint((&one+&one).to_bytes());
assert!(two.to_edwards(0).is_none());
// u = -1 corresponds to a point on the twist, but should be
// checked explicitly because it's an exceptional point for the
// birational map. For instance, libsignal will accept it.
let minus_one = MontgomeryPoint((-&one).to_bytes());
assert!(minus_one.to_edwards(0).is_none());
}
#[test]
fn eq_defined_mod_p() {
let mut u18_bytes = [0u8; 32]; u18_bytes[0] = 18;
let u18 = MontgomeryPoint(u18_bytes);
let u18_unred = MontgomeryPoint([255; 32]);
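// Illustrative note (not in the original file): assuming `from_bytes`
// ignores the unused top bit of the 255-bit little-endian field encoding,
// [255; 32] decodes to 2^255 - 1 = (2^255 - 19) + 18, so both points
// encode u = 18 mod p.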
assert_eq!(u18, u18_unred);
}
#[test]
fn montgomery_ladder_matches_edwards_scalarmult() {
let mut csprng: OsRng = OsRng;
let s: Scalar = Scalar::random(&mut csprng);
let p_edwards: EdwardsPoint = &constants::ED25519_BASEPOINT_TABLE * &s;
let p_montgomery: MontgomeryPoint = p_edwards.to_montgomery();
let expected = s * p_edwards;
let result = s * p_montgomery;
assert_eq!(result, expected.to_montgomery())
}
}

View File

@@ -1,8 +0,0 @@
//! Crate-local prelude (for alloc-dependent features like `Vec`)
// TODO: switch to alloc::prelude
#[cfg(all(feature = "alloc", not(feature = "std")))]
pub use alloc::vec::Vec;
#[cfg(feature = "std")]
pub use std::vec::Vec;

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,378 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Module for common traits.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use subtle;
use scalar::Scalar;
// ------------------------------------------------------------------------
// Public Traits
// ------------------------------------------------------------------------
/// Trait for getting the identity element of a point type.
pub trait Identity {
/// Returns the identity element of the curve.
/// Can be used as a constructor.
fn identity() -> Self;
}
/// Trait for testing if a curve point is equivalent to the identity point.
pub trait IsIdentity {
/// Return true if this element is the identity element of the curve.
fn is_identity(&self) -> bool;
}
/// Implement generic identity equality testing for point representations
/// which have constant-time equality testing and a defined identity
/// constructor.
impl<T> IsIdentity for T
where
T: subtle::ConstantTimeEq + Identity,
{
fn is_identity(&self) -> bool {
self.ct_eq(&T::identity()).unwrap_u8() == 1u8
}
}
/// A trait for constant-time multiscalar multiplication without precomputation.
pub trait MultiscalarMul {
/// The type of point being multiplied, e.g., `RistrettoPoint`.
type Point;
/// Given an iterator of (possibly secret) scalars and an iterator of
/// public points, compute
/// $$
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n.
/// $$
///
/// It is an error to call this function with two iterators of different lengths.
///
/// # Examples
///
/// The trait bound aims for maximum flexibility: the inputs must be
/// convertable to iterators (`I: IntoIter`), and the iterator's items
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
/// iterators returning either `Scalar`s or `&Scalar`s.
///
/// ```
/// use curve25519_dalek::constants;
/// use curve25519_dalek::traits::MultiscalarMul;
/// use curve25519_dalek::ristretto::RistrettoPoint;
/// use curve25519_dalek::scalar::Scalar;
///
/// // Some scalars
/// let a = Scalar::from(87329482u64);
/// let b = Scalar::from(37264829u64);
/// let c = Scalar::from(98098098u64);
///
/// // Some points
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
/// let Q = P + P;
/// let R = P + Q;
///
/// // A1 = a*P + b*Q + c*R
/// let abc = [a,b,c];
/// let A1 = RistrettoPoint::multiscalar_mul(&abc, &[P,Q,R]);
/// // Note: (&abc).into_iter(): Iterator<Item=&Scalar>
///
/// // A2 = (-a)*P + (-b)*Q + (-c)*R
/// let minus_abc = abc.iter().map(|x| -x);
/// let A2 = RistrettoPoint::multiscalar_mul(minus_abc, &[P,Q,R]);
/// // Note: minus_abc.into_iter(): Iterator<Item=Scalar>
///
/// assert_eq!(A1.compress(), (-A2).compress());
/// ```
fn multiscalar_mul<I, J>(scalars: I, points: J) -> Self::Point
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Self::Point>;
}
/// A trait for variable-time multiscalar multiplication without precomputation.
pub trait VartimeMultiscalarMul {
/// The type of point being multiplied, e.g., `RistrettoPoint`.
type Point;
/// Given an iterator of public scalars and an iterator of
/// `Option`s of points, compute either `Some(Q)`, where
/// $$
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n,
/// $$
/// if all points were `Some(P_i)`, or else return `None`.
///
/// This function is particularly useful when verifying statements
/// involving compressed points. Accepting `Option<Point>` allows
/// inlining point decompression into the multiscalar call,
/// avoiding the need for temporary buffers.
/// ```
/// use curve25519_dalek::constants;
/// use curve25519_dalek::traits::VartimeMultiscalarMul;
/// use curve25519_dalek::ristretto::RistrettoPoint;
/// use curve25519_dalek::scalar::Scalar;
///
/// // Some scalars
/// let a = Scalar::from(87329482u64);
/// let b = Scalar::from(37264829u64);
/// let c = Scalar::from(98098098u64);
/// let abc = [a,b,c];
///
/// // Some points
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
/// let Q = P + P;
/// let R = P + Q;
/// let PQR = [P, Q, R];
///
/// let compressed = [P.compress(), Q.compress(), R.compress()];
///
/// // Now we can compute A1 = a*P + b*Q + c*R using P, Q, R:
/// let A1 = RistrettoPoint::vartime_multiscalar_mul(&abc, &PQR);
///
/// // Or using the compressed points:
/// let A2 = RistrettoPoint::optional_multiscalar_mul(
/// &abc,
/// compressed.iter().map(|pt| pt.decompress()),
/// );
///
/// assert_eq!(A2, Some(A1));
///
/// // It's also possible to mix compressed and uncompressed points:
/// let A3 = RistrettoPoint::optional_multiscalar_mul(
/// abc.iter()
/// .chain(abc.iter()),
/// compressed.iter().map(|pt| pt.decompress())
/// .chain(PQR.iter().map(|&pt| Some(pt))),
/// );
///
/// assert_eq!(A3, Some(A1+A1));
/// ```
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<Self::Point>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<Self::Point>>;
/// Given an iterator of public scalars and an iterator of
/// public points, compute
/// $$
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n,
/// $$
/// using variable-time operations.
///
/// It is an error to call this function with two iterators of different lengths.
///
/// # Examples
///
/// The trait bound aims for maximum flexibility: the inputs must be
/// convertable to iterators (`I: IntoIter`), and the iterator's items
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
/// iterators returning either `Scalar`s or `&Scalar`s.
///
/// ```
/// use curve25519_dalek::constants;
/// use curve25519_dalek::traits::VartimeMultiscalarMul;
/// use curve25519_dalek::ristretto::RistrettoPoint;
/// use curve25519_dalek::scalar::Scalar;
///
/// // Some scalars
/// let a = Scalar::from(87329482u64);
/// let b = Scalar::from(37264829u64);
/// let c = Scalar::from(98098098u64);
///
/// // Some points
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
/// let Q = P + P;
/// let R = P + Q;
///
/// // A1 = a*P + b*Q + c*R
/// let abc = [a,b,c];
/// let A1 = RistrettoPoint::vartime_multiscalar_mul(&abc, &[P,Q,R]);
/// // Note: (&abc).into_iter(): Iterator<Item=&Scalar>
///
/// // A2 = (-a)*P + (-b)*Q + (-c)*R
/// let minus_abc = abc.iter().map(|x| -x);
/// let A2 = RistrettoPoint::vartime_multiscalar_mul(minus_abc, &[P,Q,R]);
/// // Note: minus_abc.into_iter(): Iterator<Item=Scalar>
///
/// assert_eq!(A1.compress(), (-A2).compress());
/// ```
fn vartime_multiscalar_mul<I, J>(scalars: I, points: J) -> Self::Point
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Self::Point>,
Self::Point: Clone,
{
Self::optional_multiscalar_mul(
scalars,
points.into_iter().map(|P| Some(P.borrow().clone())),
)
.unwrap()
}
}
/// A trait for variable-time multiscalar multiplication with precomputation.
///
/// A general multiscalar multiplication with precomputation can be written as
/// $$
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
/// $$
/// where the \\(B_i\\) are *static* points, for which precomputation
/// is possible, and the \\(A_j\\) are *dynamic* points, for which
/// precomputation is not possible.
///
/// This trait has three methods for performing this computation:
///
/// * [`vartime_multiscalar_mul`], which handles the special case
/// where \\(n = 0\\) and there are no dynamic points;
///
/// * [`vartime_mixed_multiscalar_mul`], which takes the dynamic
/// points as already-validated `Point`s and is infallible;
///
/// * [`optional_mixed_multiscalar_mul`], which takes the dynamic
/// points as `Option<Point>`s and returns an `Option<Point>`,
/// allowing decompression to be composed into the input iterators.
///
/// All methods require that the lengths of the input iterators be
/// known and matching, as if they were `ExactSizeIterator`s. (They
/// do not require `ExactSizeIterator` only because that trait is
/// broken.)
pub trait VartimePrecomputedMultiscalarMul: Sized {
/// The type of point to be multiplied, e.g., `RistrettoPoint`.
type Point: Clone;
/// Given the static points \\( B_i \\), perform precomputation
/// and return the precomputation data.
fn new<I>(static_points: I) -> Self
where
I: IntoIterator,
I::Item: Borrow<Self::Point>;
/// Given `static_scalars`, an iterator of public scalars
/// \\(b_i\\), compute
/// $$
/// Q = b_1 B_1 + \cdots + b_m B_m,
/// $$
/// where the \\(B_j\\) are the points that were supplied to `new`.
///
/// It is an error to call this function with iterators of
/// inconsistent lengths.
///
/// The trait bound aims for maximum flexibility: the input must
/// be convertable to iterators (`I: IntoIter`), and the
/// iterator's items must be `Borrow<Scalar>`, to allow iterators
/// returning either `Scalar`s or `&Scalar`s.
fn vartime_multiscalar_mul<I>(&self, static_scalars: I) -> Self::Point
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
{
use core::iter;
Self::vartime_mixed_multiscalar_mul(
self,
static_scalars,
iter::empty::<Scalar>(),
iter::empty::<Self::Point>(),
)
}
/// Given `static_scalars`, an iterator of public scalars
/// \\(b_i\\), `dynamic_scalars`, an iterator of public scalars
/// \\(a_i\\), and `dynamic_points`, an iterator of points
/// \\(A_i\\), compute
/// $$
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
/// $$
/// where the \\(B_j\\) are the points that were supplied to `new`.
///
/// It is an error to call this function with iterators of
/// inconsistent lengths.
///
/// The trait bound aims for maximum flexibility: the inputs must be
/// convertable to iterators (`I: IntoIter`), and the iterator's items
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
/// iterators returning either `Scalar`s or `&Scalar`s.
fn vartime_mixed_multiscalar_mul<I, J, K>(
&self,
static_scalars: I,
dynamic_scalars: J,
dynamic_points: K,
) -> Self::Point
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Scalar>,
K: IntoIterator,
K::Item: Borrow<Self::Point>,
{
Self::optional_mixed_multiscalar_mul(
self,
static_scalars,
dynamic_scalars,
dynamic_points.into_iter().map(|P| Some(P.borrow().clone())),
)
.unwrap()
}
/// Given `static_scalars`, an iterator of public scalars
/// \\(b_i\\), `dynamic_scalars`, an iterator of public scalars
/// \\(a_i\\), and `dynamic_points`, an iterator of points
/// \\(A_i\\), compute
/// $$
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
/// $$
/// where the \\(B_j\\) are the points that were supplied to `new`.
///
/// If any of the dynamic points were `None`, return `None`.
///
/// It is an error to call this function with iterators of
/// inconsistent lengths.
///
/// This function is particularly useful when verifying statements
/// involving compressed points. Accepting `Option<Point>` allows
/// inlining point decompression into the multiscalar call,
/// avoiding the need for temporary buffers.
fn optional_mixed_multiscalar_mul<I, J, K>(
&self,
static_scalars: I,
dynamic_scalars: J,
dynamic_points: K,
) -> Option<Self::Point>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Scalar>,
K: IntoIterator<Item = Option<Self::Point>>;
}
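// A minimal usage sketch (illustrative, not part of the original file; it
// assumes the `VartimeRistrettoPrecomputation` implementor of this trait
// from the crate's `ristretto` module):
//
//     use curve25519_dalek::constants::RISTRETTO_BASEPOINT_POINT as B;
//     use curve25519_dalek::ristretto::VartimeRistrettoPrecomputation;
//     use curve25519_dalek::scalar::Scalar;
//     use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul;
//
//     // Precompute once for the static points B and 2B...
//     let precomp = VartimeRistrettoPrecomputation::new([B, B + B].iter());
//     // ...then reuse it across many mixed multiscalar computations.
//     let q = precomp.vartime_mixed_multiscalar_mul(
//         &[Scalar::from(2u64), Scalar::from(3u64)], // static scalars
//         &[Scalar::from(5u64)],                     // dynamic scalars
//         &[B + B + B],                              // dynamic points
//     );
//     // 2*B + 3*(2B) + 5*(3B) = 23B
//     assert_eq!(q, Scalar::from(23u64) * B);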
// ------------------------------------------------------------------------
// Private Traits
// ------------------------------------------------------------------------
/// Trait for checking whether a point is on the curve.
///
/// This trait is only for debugging/testing, since it should be
/// impossible for a `curve25519-dalek` user to construct an invalid
/// point.
pub(crate) trait ValidityCheck {
/// Checks whether the point is on the curve. Not CT.
fn is_valid(&self) -> bool;
}

View File

@@ -1,206 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Code for fixed- and sliding-window functionality
#![allow(non_snake_case)]
use core::fmt::Debug;
use subtle::ConditionallyNegatable;
use subtle::ConditionallySelectable;
use subtle::ConstantTimeEq;
use subtle::Choice;
use traits::Identity;
use edwards::EdwardsPoint;
use backend::serial::curve_models::ProjectiveNielsPoint;
use backend::serial::curve_models::AffineNielsPoint;
use zeroize::Zeroize;
/// A lookup table of precomputed multiples of a point \\(P\\), used to
/// compute \\( xP \\) for \\( -8 \leq x \leq 8 \\).
///
/// The computation of \\( xP \\) is done in constant time by the `select` function.
///
/// Since `LookupTable` does not implement `Index`, it's more difficult
/// to accidentally use the table directly. Unfortunately the table is
/// only `pub(crate)` so that we can write hardcoded constants, so it's
/// still technically possible. It would be nice to prevent direct
/// access to the table.
///
/// XXX make this generic with respect to table size
#[derive(Copy, Clone)]
pub struct LookupTable<T>(pub(crate) [T; 8]);
impl<T> LookupTable<T>
where
T: Identity + ConditionallySelectable + ConditionallyNegatable,
{
/// Given \\(-8 \leq x \leq 8\\), return \\(xP\\) in constant time.
pub fn select(&self, x: i8) -> T {
debug_assert!(x >= -8);
debug_assert!(x <= 8);
// Compute xabs = |x|
let xmask = x >> 7;
let xabs = (x + xmask) ^ xmask;
// Set t = 0 * P = identity
let mut t = T::identity();
for j in 1..9 {
// Copy `points[j-1] == j*P` onto `t` in constant time if `|x| == j`.
let c = (xabs as u8).ct_eq(&(j as u8));
t.conditional_assign(&self.0[j - 1], c);
}
// Now t == |x| * P.
let neg_mask = Choice::from((xmask & 1) as u8);
t.conditional_negate(neg_mask);
// Now t == x * P.
t
}
}
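// Worked example (illustrative, not part of the original file): for x = -3,
// xmask = -3 >> 7 = -1, so xabs = (-3 + -1) ^ -1 = 3; the loop copies
// points[2] == 3*P into t, and neg_mask = Choice::from(1) negates it,
// yielding t == -3*P.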
impl<T: Copy + Default> Default for LookupTable<T> {
fn default() -> LookupTable<T> {
LookupTable([T::default(); 8])
}
}
impl<T: Debug> Debug for LookupTable<T> {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "LookupTable({:?})", self.0)
}
}
impl<'a> From<&'a EdwardsPoint> for LookupTable<ProjectiveNielsPoint> {
fn from(P: &'a EdwardsPoint) -> Self {
let mut points = [P.to_projective_niels(); 8];
for j in 0..7 {
points[j + 1] = (P + &points[j]).to_extended().to_projective_niels();
}
LookupTable(points)
}
}
impl<'a> From<&'a EdwardsPoint> for LookupTable<AffineNielsPoint> {
fn from(P: &'a EdwardsPoint) -> Self {
let mut points = [P.to_affine_niels(); 8];
// XXX batch inversion would be good if perf mattered here
for j in 0..7 {
points[j + 1] = (P + &points[j]).to_extended().to_affine_niels()
}
LookupTable(points)
}
}
impl<T> Zeroize for LookupTable<T>
where
T: Copy + Default + Zeroize
{
fn zeroize(&mut self) {
self.0.zeroize();
}
}
/// Holds odd multiples 1A, 3A, ..., 15A of a point A.
#[derive(Copy, Clone)]
pub(crate) struct NafLookupTable5<T>(pub(crate) [T; 8]);
impl<T: Copy> NafLookupTable5<T> {
/// Given public, odd \\( x \\) with \\( 0 < x < 2^4 \\), return \\(xA\\).
pub fn select(&self, x: usize) -> T {
debug_assert_eq!(x & 1, 1);
debug_assert!(x < 16);
self.0[x / 2]
}
}
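// Worked index check (illustrative, not part of the original file): the
// table stores [1A, 3A, 5A, ..., 15A], so for odd x the entry is x / 2;
// e.g. select(7) returns self.0[3] == 7A.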
impl<T: Debug> Debug for NafLookupTable5<T> {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "NafLookupTable5({:?})", self.0)
}
}
impl<'a> From<&'a EdwardsPoint> for NafLookupTable5<ProjectiveNielsPoint> {
fn from(A: &'a EdwardsPoint) -> Self {
let mut Ai = [A.to_projective_niels(); 8];
let A2 = A.double();
for i in 0..7 {
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_projective_niels();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
NafLookupTable5(Ai)
}
}
impl<'a> From<&'a EdwardsPoint> for NafLookupTable5<AffineNielsPoint> {
fn from(A: &'a EdwardsPoint) -> Self {
let mut Ai = [A.to_affine_niels(); 8];
let A2 = A.double();
for i in 0..7 {
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_affine_niels();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
NafLookupTable5(Ai)
}
}
/// Holds odd multiples 1A, 3A, ..., 127A of a point A.
#[derive(Copy, Clone)]
pub(crate) struct NafLookupTable8<T>(pub(crate) [T; 64]);
impl<T: Copy> NafLookupTable8<T> {
pub fn select(&self, x: usize) -> T {
debug_assert_eq!(x & 1, 1);
debug_assert!(x < 128);
self.0[x / 2]
}
}
impl<T: Debug> Debug for NafLookupTable8<T> {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "NafLookupTable8([\n")?;
for i in 0..64 {
write!(f, "\t{:?},\n", &self.0[i])?;
}
write!(f, "])")
}
}
impl<'a> From<&'a EdwardsPoint> for NafLookupTable8<ProjectiveNielsPoint> {
fn from(A: &'a EdwardsPoint) -> Self {
let mut Ai = [A.to_projective_niels(); 64];
let A2 = A.double();
for i in 0..63 {
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_projective_niels();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
NafLookupTable8(Ai)
}
}
impl<'a> From<&'a EdwardsPoint> for NafLookupTable8<AffineNielsPoint> {
fn from(A: &'a EdwardsPoint) -> Self {
let mut Ai = [A.to_affine_niels(); 64];
let A2 = A.double();
for i in 0..63 {
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_affine_niels();
}
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
NafLookupTable8(Ai)
}
}

View File

@@ -1,857 +0,0 @@
import binascii
class InvalidEncodingException(Exception): pass
class NotOnCurveException(Exception): pass
class SpecException(Exception): pass
def lobit(x): return int(x) & 1
def hibit(x): return lobit(2*x)
def negative(x): return lobit(x)
def enc_le(x,n): return bytearray([int(x)>>(8*i) & 0xFF for i in xrange(n)])
def dec_le(x): return sum(b<<(8*i) for i,b in enumerate(x))
def randombytes(n): return bytearray([randint(0,255) for _ in range(n)])
def optimized_version_of(spec):
"""Decorator: This function is an optimized version of some specification"""
def decorator(f):
def wrapper(self,*args,**kwargs):
def pr(x):
if isinstance(x,bytearray): return binascii.hexlify(x)
else: return str(x)
try: spec_ans = getattr(self,spec,spec)(*args,**kwargs),None
except Exception as e: spec_ans = None,e
try: opt_ans = f(self,*args,**kwargs),None
except Exception as e: opt_ans = None,e
if spec_ans[1] is None and opt_ans[1] is not None:
raise
#raise SpecException("Mismatch in %s: spec returned %s but opt threw %s"
# % (f.__name__,str(spec_ans[0]),str(opt_ans[1])))
if spec_ans[1] is not None and opt_ans[1] is None:
raise
#raise SpecException("Mismatch in %s: spec threw %s but opt returned %s"
# % (f.__name__,str(spec_ans[1]),str(opt_ans[0])))
if spec_ans[0] != opt_ans[0]:
raise SpecException("Mismatch in %s: %s != %s"
% (f.__name__,pr(spec_ans[0]),pr(opt_ans[0])))
if opt_ans[1] is not None: raise
else: return opt_ans[0]
wrapper.__name__ = f.__name__
return wrapper
return decorator
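# Illustrative sketch (not part of the original file): the decorator calls
# the named specification method alongside the optimized one and raises
# SpecException on any mismatch, e.g.
#
#   class Doubler(object):
#       def slow(self, x): return x * 2
#       @optimized_version_of("slow")
#       def fast(self, x): return x << 1
#
#   Doubler().fast(3)  # returns 6; raises if fast() ever disagreed with slow()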
def xsqrt(x,exn=InvalidEncodingException("Not on curve")):
"""Return sqrt(x)"""
if not is_square(x): raise exn
s = sqrt(x)
if negative(s): s=-s
return s
def isqrt(x,exn=InvalidEncodingException("Not on curve")):
"""Return 1/sqrt(x)"""
if x==0: return 0
if not is_square(x): raise exn
s = sqrt(x)
#if negative(s): s=-s
return 1/s
def inv0(x): return 1/x if x != 0 else 0
def isqrt_i(x):
"""Return 1/sqrt(x) or 1/sqrt(zeta * x)"""
if x==0: return True,0
gen = x.parent(-1)
while is_square(gen): gen = sqrt(gen)
if is_square(x): return True,1/sqrt(x)
else: return False,1/sqrt(x*gen)
class QuotientEdwardsPoint(object):
"""Abstract class for point an a quotiented Edwards curve; needs F,a,d,cofactor to work"""
def __init__(self,x=0,y=1):
x = self.x = self.F(x)
y = self.y = self.F(y)
if y^2 + self.a*x^2 != 1 + self.d*x^2*y^2:
raise NotOnCurveException(str(self))
def __repr__(self):
return "%s(0x%x,0x%x)" % (self.__class__.__name__, self.x, self.y)
def __iter__(self):
yield self.x
yield self.y
def __add__(self,other):
x,y = self
X,Y = other
a,d = self.a,self.d
return self.__class__(
(x*Y+y*X)/(1+d*x*y*X*Y),
(y*Y-a*x*X)/(1-d*x*y*X*Y)
)
def __neg__(self): return self.__class__(-self.x,self.y)
def __sub__(self,other): return self + (-other)
def __rmul__(self,other): return self*other
def __eq__(self,other):
"""NB: this is the only method that is different from the usual one"""
x,y = self
X,Y = other
return x*Y == X*y or (self.cofactor==8 and -self.a*x*X == y*Y)
def __ne__(self,other): return not (self==other)
def __mul__(self,exp):
exp = int(exp)
if exp < 0: exp,self = -exp,-self
total = self.__class__()
work = self
while exp != 0:
if exp & 1: total += work
work += work
exp >>= 1
return total
def xyzt(self):
x,y = self
z = self.F.random_element()
return x*z,y*z,z,x*y*z
def torque(self):
"""Apply cofactor group, except keeping the point even"""
if self.cofactor == 8:
if self.a == -1: return self.__class__(self.y*self.i, self.x*self.i)
if self.a == 1: return self.__class__(-self.y, self.x)
else:
return self.__class__(-self.x, -self.y)
def doubleAndEncodeSpec(self):
return (self+self).encode()
# Utility functions
@classmethod
def bytesToGf(cls,bytes,mustBeProper=True,mustBePositive=False,maskHiBits=False):
"""Convert little-endian bytes to field element, sanity check length"""
if len(bytes) != cls.encLen:
raise InvalidEncodingException("wrong length %d" % len(bytes))
s = dec_le(bytes)
if mustBeProper and s >= cls.F.order():
raise InvalidEncodingException("%d out of range!" % s)
bitlen = int(ceil(log(cls.F.order())/log(2)))
if maskHiBits: s &= 2^bitlen-1
s = cls.F(s)
if mustBePositive and negative(s):
raise InvalidEncodingException("%d is negative!" % s)
return s
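#   e.g. for Ed25519Point (encLen == 32), bytesToGf(enc_le(18, 32)) == F(18),
#   while bytesToGf(enc_le(F.order(), 32)) raises InvalidEncodingException
#   because the value is not reduced mod the field order. (Illustrative,
#   not part of the original file.)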
@classmethod
def gfToBytes(cls,x,mustBePositive=False):
"""Convert little-endian bytes to field element, sanity check length"""
if negative(x) and mustBePositive: x = -x
return enc_le(x,cls.encLen)
class RistrettoPoint(QuotientEdwardsPoint):
"""The new Ristretto group"""
def encodeSpec(self):
"""Unoptimized specification for encoding"""
x,y = self
if self.cofactor==8 and (negative(x*y) or y==0): (x,y) = self.torque()
if y == -1: y = 1 # Avoid divide by 0; doesn't affect impl
if negative(x): x,y = -x,-y
s = xsqrt(self.mneg*(1-y)/(1+y),exn=Exception("Unimplemented: point is odd: " + str(self)))
return self.gfToBytes(s)
@classmethod
def decodeSpec(cls,s):
"""Unoptimized specification for decoding"""
s = cls.bytesToGf(s,mustBePositive=True)
a,d = cls.a,cls.d
x = xsqrt(4*s^2 / (a*d*(1+a*s^2)^2 - (1-a*s^2)^2))
y = (1+a*s^2) / (1-a*s^2)
if cls.cofactor==8 and (negative(x*y) or y==0):
raise InvalidEncodingException("x*y has high bit")
return cls(x,y)
@optimized_version_of("encodeSpec")
def encode(self):
"""Encode, optimized version"""
a,d,mneg = self.a,self.d,self.mneg
x,y,z,t = self.xyzt()
if self.cofactor==8:
u1 = mneg*(z+y)*(z-y)
u2 = x*y # = t*z
isr = isqrt(u1*u2^2)
i1 = isr*u1 # sqrt(mneg*(z+y)*(z-y))/(x*y)
i2 = isr*u2 # 1/sqrt(a*(y+z)*(y-z))
z_inv = i1*i2*t # 1/z
if negative(t*z_inv):
if a==-1:
x,y = y*self.i,x*self.i
den_inv = self.magic * i1
else:
x,y = -y,x
den_inv = self.i * self.magic * i1
else:
den_inv = i2
if negative(x*z_inv): y = -y
s = (z-y) * den_inv
else:
num = mneg*(z+y)*(z-y)
isr = isqrt(num*y^2)
if negative(isr^2*num*y*t): y = -y
s = isr*y*(z-y)
return self.gfToBytes(s,mustBePositive=True)
@optimized_version_of("doubleAndEncodeSpec")
def doubleAndEncode(self):
X,Y,Z,T = self.xyzt()
a,d,mneg = self.a,self.d,self.mneg
if self.cofactor==8:
e = 2*X*Y
f = Z^2+d*T^2
g = Y^2-a*X^2
h = Z^2-d*T^2
inv1 = 1/(e*f*g*h)
z_inv = inv1*e*g # 1 / (f*h)
t_inv = inv1*f*h
if negative(e*g*z_inv):
if a==-1: sqrta = self.i
else: sqrta = -1
e,f,g,h = g,h,-e,f*sqrta
factor = self.i
else:
factor = self.magic
if negative(h*e*z_inv): g=-g
s = (h-g)*factor*g*t_inv
else:
foo = Y^2+a*X^2
bar = X*Y
den = 1/(foo*bar)
if negative(2*bar^2*den): tmp = a*X^2
else: tmp = Y^2
s = self.magic*(Z^2-tmp)*foo*den
return self.gfToBytes(s,mustBePositive=True)
@classmethod
@optimized_version_of("decodeSpec")
def decode(cls,s):
"""Decode, optimized version"""
s = cls.bytesToGf(s,mustBePositive=True)
a,d = cls.a,cls.d
yden = 1-a*s^2
ynum = 1+a*s^2
yden_sqr = yden^2
xden_sqr = a*d*ynum^2 - yden_sqr
isr = isqrt(xden_sqr * yden_sqr)
xden_inv = isr * yden
yden_inv = xden_inv * isr * xden_sqr
x = 2*s*xden_inv
if negative(x): x = -x
y = ynum * yden_inv
if cls.cofactor==8 and (negative(x*y) or y==0):
raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))
return cls(x,y)
@classmethod
def fromJacobiQuartic(cls,s,t,sgn=1):
"""Convert point from its Jacobi Quartic representation"""
a,d = cls.a,cls.d
assert s^4 - 2*cls.a*(1-2*d/(d-a))*s^2 + 1 == t^2
x = 2*s*cls.magic / t
y = (1+a*s^2) / (1-a*s^2)
return cls(sgn*x,y)
@classmethod
def elligatorSpec(cls,r0):
a,d = cls.a,cls.d
r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2
den = (d*r-a)*(a*r-d)
if den == 0: return cls()
n1 = cls.a*(r+1)*(a+d)*(d-a)/den
n2 = r*n1
if is_square(n1):
sgn,s,t = 1, xsqrt(n1), -(r-1)*(a+d)^2 / den - 1
else:
sgn,s,t = -1,-xsqrt(n2), r*(r-1)*(a+d)^2 / den - 1
return cls.fromJacobiQuartic(s,t)
@classmethod
@optimized_version_of("elligatorSpec")
def elligator(cls,r0):
a,d = cls.a,cls.d
r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)
r = cls.qnr * r0^2
den = (d*r-a)*(a*r-d)
num = cls.a*(r+1)*(a+d)*(d-a)
iss,isri = isqrt_i(num*den)
if iss: sgn,twiddle = 1,1
else: sgn,twiddle = -1,r0*cls.qnr
isri *= twiddle
s = isri*num
t = -sgn*isri*s*(r-1)*(d+a)^2 - 1
if negative(s) == iss: s = -s
return cls.fromJacobiQuartic(s,t)
class Decaf_1_1_Point(QuotientEdwardsPoint):
"""Like current decaf but tweaked for simplicity"""
def encodeSpec(self):
"""Unoptimized specification for encoding"""
a,d = self.a,self.d
x,y = self
if x==0 or y==0: return(self.gfToBytes(0))
if self.cofactor==8 and negative(x*y*self.isoMagic):
x,y = self.torque()
sr = xsqrt(1-a*x^2)
altx = x*y*self.isoMagic / sr
if negative(altx): s = (1+sr)/x
else: s = (1-sr)/x
return self.gfToBytes(s,mustBePositive=True)
@classmethod
def decodeSpec(cls,s):
"""Unoptimized specification for decoding"""
a,d = cls.a,cls.d
s = cls.bytesToGf(s,mustBePositive=True)
if s==0: return cls()
t = xsqrt(s^4 + 2*(a-2*d)*s^2 + 1)
altx = 2*s*cls.isoMagic/t
if negative(altx): t = -t
x = 2*s / (1+a*s^2)
y = (1-a*s^2) / t
if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0):
raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))
return cls(x,y)
def toJacobiQuartic(self,toggle_rotation=False,toggle_altx=False,toggle_s=False):
"Return s,t on jacobi curve"
a,d = self.a,self.d
x,y,z,t = self.xyzt()
if self.cofactor == 8:
# Cofactor 8 version
# Simulate IMAGINE_TWIST because that's how libdecaf does it
x = self.i*x
t = self.i*t
a = -a
d = -d
# OK, the actual libdecaf code should be here
num = (z+y)*(z-y)
den = x*y
isr = isqrt(num*(a-d)*den^2)
iden = isr * den * self.isoMagic # 1/sqrt((z+y)(z-y)) = 1/sqrt(1-Y^2) / z
inum = isr * num # sqrt(1-Y^2) * z / xysqrt(a-d) ~ 1/sqrt(1-ax^2)/z
if negative(iden*inum*self.i*t^2*(d-a)) != toggle_rotation:
iden,inum = inum,iden
fac = x*sqrt(a)
toggle=(a==-1)
else:
fac = y
toggle=False
imi = self.isoMagic * self.i
altx = inum*t*imi
neg_altx = negative(altx) != toggle_altx
if neg_altx != toggle: inum = -inum
tmp = fac*(inum*z + 1)
s = iden*tmp*imi
negm1 = (negative(s) != toggle_s) != neg_altx
if negm1: m1 = a*fac + z
else: m1 = a*fac - z
swap = toggle_s
else:
# Much simpler cofactor 4 version
num = (x+t)*(x-t)
isr = isqrt(num*(a-d)*x^2)
ratio = isr*num
altx = ratio*self.isoMagic
neg_altx = negative(altx) != toggle_altx
if neg_altx: ratio = -ratio
tmp = ratio*z - t
s = (a-d)*isr*x*tmp
negx = (negative(s) != toggle_s) != neg_altx
if negx: m1 = -a*t + x
else: m1 = -a*t - x
swap = toggle_s
if negative(s): s = -s
return s,m1,a*tmp,swap
def invertElligator(self,toggle_r=False,*args,**kwargs):
"Produce preimage of self under elligator, or None"
a,d = self.a,self.d
rets = []
tr = [False,True] if self.cofactor == 8 else [False]
for toggle_rotation in tr:
for toggle_altx in [False,True]:
for toggle_s in [False,True]:
for toggle_r in [False,True]:
s,m1,m12,swap = self.toJacobiQuartic(toggle_rotation,toggle_altx,toggle_s)
#print
#print toggle_rotation,toggle_altx,toggle_s
#print m1
#print m12
if self == self.__class__():
if self.cofactor == 4:
# Hacks for identity!
if toggle_altx: m12 = 1
elif toggle_s: m1 = 1
elif toggle_r: continue
## BOTH???
else:
m12 = 1
imi = self.isoMagic * self.i
if toggle_rotation:
if toggle_altx: m1 = -imi
else: m1 = +imi
else:
if toggle_altx: m1 = 0
else: m1 = a-d
rnum = (d*a*m12-m1)
rden = ((d*a-1)*m12+m1)
if swap: rnum,rden = rden,rnum
ok,sr = isqrt_i(rnum*rden*self.qnr)
if not ok: continue
sr *= rnum
#print "Works! %d %x" % (swap,sr)
if negative(sr) != toggle_r: sr = -sr
ret = self.gfToBytes(sr)
if self.elligator(ret) != self and self.elligator(ret) != -self:
print "WRONG!",[toggle_rotation,toggle_altx,toggle_s]
if self.elligator(ret) == -self and self != -self: print "Negated!",[toggle_rotation,toggle_altx,toggle_s]
rets.append(bytes(ret))
return rets
@optimized_version_of("encodeSpec")
def encode(self):
"""Encode, optimized version"""
return self.gfToBytes(self.toJacobiQuartic()[0])
@classmethod
@optimized_version_of("decodeSpec")
def decode(cls,s):
"""Decode, optimized version"""
a,d = cls.a,cls.d
s = cls.bytesToGf(s,mustBePositive=True)
#if s==0: return cls()
s2 = s^2
den = 1+a*s2
num = den^2 - 4*d*s2
isr = isqrt(num*den^2)
altx = 2*s*isr*den*cls.isoMagic
if negative(altx): isr = -isr
x = 2*s *isr^2*den*num
y = (1-a*s^2) * isr*den
if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0):
raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))
return cls(x,y)
@classmethod
def fromJacobiQuartic(cls,s,t,sgn=1):
"""Convert point from its Jacobi Quartic representation"""
a,d = cls.a,cls.d
if s==0: return cls()
x = 2*s / (1+a*s^2)
y = (1-a*s^2) / t
return cls(x,sgn*y)
@optimized_version_of("doubleAndEncodeSpec")
def doubleAndEncode(self):
X,Y,Z,T = self.xyzt()
a,d = self.a,self.d
if self.cofactor == 8:
# Cofactor 8 version
# Simulate IMAGINE_TWIST because that's how libdecaf does it
X = self.i*X
T = self.i*T
a = -a
d = -d
# TODO: This is only being called for a=-1, so could
# be wrong for a=1
e = 2*X*Y
f = Y^2+a*X^2
g = Y^2-a*X^2
h = Z^2-d*T^2
eim = e*self.isoMagic
inv = 1/(eim*g*f*h)
fh_inv = eim*g*inv*self.i
if negative(eim*g*fh_inv):
idf = g*self.isoMagic*self.i
bar = f
foo = g
test = eim*f
else:
idf = eim
bar = h
foo = -eim
test = g*h
if negative(test*fh_inv): bar = -bar
s = idf*(foo+bar)*inv*f*h
else:
xy = X*Y
h = Z^2-d*T^2
inv = 1/(xy*h)
if negative(inv*2*xy^2*self.isoMagic): tmp = Y
else: tmp = X
s = tmp^2*h*inv # = X/Y or Y/X, interestingly
return self.gfToBytes(s,mustBePositive=True)
@classmethod
def elligatorSpec(cls,r0,fromR=False):
a,d = cls.a,cls.d
if fromR: r = r0
else: r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2
den = (d*r-(d-a))*((d-a)*r-d)
if den == 0: return cls()
n1 = (r+1)*(a-2*d)/den
n2 = r*n1
if is_square(n1):
sgn,s,t = 1, xsqrt(n1), -(r-1)*(a-2*d)^2 / den - 1
else:
sgn,s,t = -1, -xsqrt(n2), r*(r-1)*(a-2*d)^2 / den - 1
return cls.fromJacobiQuartic(s,t)
@classmethod
@optimized_version_of("elligatorSpec")
def elligator(cls,r0):
a,d = cls.a,cls.d
r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)
r = cls.qnr * r0^2
den = (d*r-(d-a))*((d-a)*r-d)
num = (r+1)*(a-2*d)
iss,isri = isqrt_i(num*den)
if iss: sgn,twiddle = 1,1
else: sgn,twiddle = -1,r0*cls.qnr
isri *= twiddle
s = isri*num
t = -sgn*isri*s*(r-1)*(a-2*d)^2 - 1
if negative(s) == iss: s = -s
return cls.fromJacobiQuartic(s,t)
def elligatorInverseBruteForce(self):
"""Invert Elligator using SAGE's polynomial solver"""
a,d = self.a,self.d
R.<r0> = self.F[]
r = self.qnr * r0^2
den = (d*r-(d-a))*((d-a)*r-d)
n1 = (r+1)*(a-2*d)/den
n2 = r*n1
ret = set()
for s2,t in [(n1, -(r-1)*(a-2*d)^2 / den - 1),
(n2,r*(r-1)*(a-2*d)^2 / den - 1)]:
x2 = 4*s2/(1+a*s2)^2
y = (1-a*s2) / t
selfT = self
for i in xrange(self.cofactor/2):
xT,yT = selfT
polyX = xT^2-x2
polyY = yT-y
sx = set(r for r,_ in polyX.numerator().roots())
sy = set(r for r,_ in polyY.numerator().roots())
ret = ret.union(sx.intersection(sy))
selfT = selfT.torque()
ret = [self.gfToBytes(r) for r in ret]
for r in ret:
assert self.elligator(r) in [self,-self]
ret = [r for r in ret if self.elligator(r) == self]
return ret
class Ed25519Point(RistrettoPoint):
F = GF(2^255-19)
d = F(-121665/121666)
a = F(-1)
i = sqrt(F(-1))
mneg = F(1)
qnr = i
magic = isqrt(a*d-1)
cofactor = 8
encLen = 32
@classmethod
def base(cls):
return cls(
15112221349535400772501151409588531511454012693041857206046113283949847762202,
46316835694926478169428394003475163141307993866256225615783033603165251855960
)
class NegEd25519Point(RistrettoPoint):
F = GF(2^255-19)
d = F(121665/121666)
a = F(1)
i = sqrt(F(-1))
mneg = F(-1) # TODO checkme vs 1-ad or whatever
qnr = i
magic = isqrt(a*d-1)
cofactor = 8
encLen = 32
@classmethod
def base(cls):
y = cls.F(4/5)
x = sqrt((y^2-1)/(cls.d*y^2-cls.a))
if negative(x): x = -x
return cls(x,y)
class IsoEd448Point(RistrettoPoint):
F = GF(2^448-2^224-1)
d = F(39082/39081)
a = F(1)
mneg = F(-1)
qnr = -1
magic = isqrt(a*d-1)
cofactor = 4
encLen = 56
@classmethod
def base(cls):
return cls( # RFC has it wrong
345397493039729516374008604150537410266655260075183290216406970281645695073672344430481787759340633221708391583424041788924124567700732,
-363419362147803445274661903944002267176820680343659030140745099590306164083365386343198191849338272965044442230921818680526749009182718
)
class TwistedEd448GoldilocksPoint(Decaf_1_1_Point):
F = GF(2^448-2^224-1)
d = F(-39082)
a = F(-1)
qnr = -1
cofactor = 4
encLen = 56
isoMagic = IsoEd448Point.magic
@classmethod
def base(cls):
return cls.decodeSpec(Ed448GoldilocksPoint.base().encodeSpec())
class Ed448GoldilocksPoint(Decaf_1_1_Point):
F = GF(2^448-2^224-1)
d = F(-39081)
a = F(1)
qnr = -1
cofactor = 4
encLen = 56
isoMagic = IsoEd448Point.magic
@classmethod
def base(cls):
return 2*cls(
224580040295924300187604334099896036246789641632564134246125461686950415467406032909029192869357953282578032075146446173674602635247710,
298819210078481492676017930443930673437544040154080242095928241372331506189835876003536878655418784733982303233503462500531545062832660
)
class IsoEd25519Point(Decaf_1_1_Point):
# TODO: twisted iso too!
# TODO: twisted iso might have to IMAGINE_TWIST or whatever
F = GF(2^255-19)
d = F(-121665)
a = F(1)
i = sqrt(F(-1))
qnr = i
magic = isqrt(a*d-1)
cofactor = 8
encLen = 32
isoMagic = Ed25519Point.magic
isoA = Ed25519Point.a
@classmethod
def base(cls):
return cls.decodeSpec(Ed25519Point.base().encode())
class TestFailedException(Exception): pass
def test(cls,n):
print "Testing curve %s" % cls.__name__
specials = [1]
ii = cls.F(-1)
while is_square(ii):
specials.append(ii)
ii = sqrt(ii)
specials.append(ii)
for i in specials:
if negative(cls.F(i)): i = -i
i = enc_le(i,cls.encLen)
try:
Q = cls.decode(i)
QE = Q.encode()
if QE != i:
raise TestFailedException("Round trip special %s != %s" %
(binascii.hexlify(QE),binascii.hexlify(i)))
except NotOnCurveException: pass
except InvalidEncodingException: pass
P = cls.base()
Q = cls()
for i in xrange(n):
#print binascii.hexlify(Q.encode())
QE = Q.encode()
QQ = cls.decode(QE)
if QQ != Q: raise TestFailedException("Round trip %s != %s" % (str(QQ),str(Q)))
# Testing s -> 1/s: encodes -point on cofactor
s = cls.bytesToGf(QE)
if s != 0:
ss = cls.gfToBytes(1/s,mustBePositive=True)
try:
QN = cls.decode(ss)
if cls.cofactor == 8:
raise TestFailedException("1/s shouldnt work for cofactor 8")
if QN != -Q:
raise TestFailedException("s -> 1/s should negate point for cofactor 4")
except InvalidEncodingException as e:
# Should be raised iff cofactor==8
if cls.cofactor == 4:
raise TestFailedException("s -> 1/s should work for cofactor 4")
QT = Q
for h in xrange(cls.cofactor):
QT = QT.torque()
if QT.encode() != QE:
raise TestFailedException("Can't torque %s,%d" % (str(Q),h+1))
Q0 = Q + P
if Q0 == Q: raise TestFailedException("Addition doesn't work")
if Q0-P != Q: raise TestFailedException("Subtraction doesn't work")
r = randint(1,1000)
Q1 = Q0*r
Q2 = Q0*(r+1)
if Q1 + Q0 != Q2: raise TestFailedException("Scalarmul doesn't work")
Q = Q1
def testElligator(cls,n):
print "Testing elligator on %s" % cls.__name__
for i in xrange(n):
r = randombytes(cls.encLen)
P = cls.elligator(r)
if hasattr(P,"invertElligator"):
iv = P.invertElligator()
modr = bytes(cls.gfToBytes(cls.bytesToGf(r,mustBeProper=False,maskHiBits=True)))
iv2 = P.torque().invertElligator()
if modr not in iv: print "Failed to invert Elligator!"
if len(iv) != len(set(iv)):
print "Elligator inverses not unique!", len(set(iv)), len(iv)
if iv != iv2:
print "Elligator is untorqueable!"
#print [binascii.hexlify(j) for j in iv]
#print [binascii.hexlify(j) for j in iv2]
#break
else:
pass # TODO
def gangtest(classes,n):
print "Gang test",[cls.__name__ for cls in classes]
specials = [1]
ii = classes[0].F(-1)
while is_square(ii):
specials.append(ii)
ii = sqrt(ii)
specials.append(ii)
for i in xrange(n):
rets = [bytes((cls.base()*i).encode()) for cls in classes]
if len(set(rets)) != 1:
print "Divergence in encode at %d" % i
for c,ret in zip(classes,rets):
print c,binascii.hexlify(ret)
print
if i < len(specials): r0 = enc_le(specials[i],classes[0].encLen)
else: r0 = randombytes(classes[0].encLen)
rets = [bytes((cls.elligator(r0)*i).encode()) for cls in classes]
if len(set(rets)) != 1:
print "Divergence in elligator at %d" % i
for c,ret in zip(classes,rets):
print c,binascii.hexlify(ret)
print
def testDoubleAndEncode(cls,n):
print "Testing doubleAndEncode on %s" % cls.__name__
for i in xrange(n):
r1 = randombytes(cls.encLen)
r2 = randombytes(cls.encLen)
u = cls.elligator(r1) + cls.elligator(r2)
u.doubleAndEncode()
testDoubleAndEncode(Ed25519Point,100)
testDoubleAndEncode(NegEd25519Point,100)
testDoubleAndEncode(IsoEd25519Point,100)
testDoubleAndEncode(IsoEd448Point,100)
testDoubleAndEncode(TwistedEd448GoldilocksPoint,100)
#test(Ed25519Point,100)
#test(NegEd25519Point,100)
#test(IsoEd25519Point,100)
#test(IsoEd448Point,100)
#test(TwistedEd448GoldilocksPoint,100)
#test(Ed448GoldilocksPoint,100)
#testElligator(Ed25519Point,100)
#testElligator(NegEd25519Point,100)
#testElligator(IsoEd25519Point,100)
#testElligator(IsoEd448Point,100)
#testElligator(Ed448GoldilocksPoint,100)
#testElligator(TwistedEd448GoldilocksPoint,100)
#gangtest([IsoEd448Point,TwistedEd448GoldilocksPoint,Ed448GoldilocksPoint],100)
#gangtest([Ed25519Point,IsoEd25519Point],100)

File diff suppressed because one or more lines are too long

View File

@@ -1,169 +0,0 @@
# Changelog
Entries are listed in reverse chronological order per undeprecated
major series.
## 3.x series
### 3.1.0
* Add support for the Elligator2 encoding for Edwards points.
* Add two optional formally-verified field arithmetic backends which
use the Fiat Crypto project's Rust code, which is generated from
proofs of functional correctness checked by the Coq theorem proving
system.
* Add support for additional sizes of precomputed tables for basepoint
scalar multiplication.
* Fix an unused import.
* Add support for using the `zeroize` traits with all point types.
Note that points are not automatically zeroized on Drop, but that
consumers of `curve25519-dalek` should call these methods manually
when needed.
### 3.0.2
* Fixes to make using alloc+no_std possible for stable Rust.
### 3.0.1
* Update the optional `packed-simd` dependency to rely on a newer,
maintained version of the `packed-simd-2` crate.
### 3.0.0
* Update the `digest` dependency to `0.9`. This requires a major version
because the `digest` traits are part of the public API, but there are
otherwise no changes to the API.
## 2.x series
### 2.1.2
* Fixes to make using alloc+no_std possible for stable Rust.
### 2.1.1
* Update the optional `packed-simd` dependency to rely on a newer,
maintained version of the `packed-simd-2` crate.
### 2.1.0
* Make `Scalar::from_bits` a `const fn`, allowing its use in `const` contexts.
### 2.0.0
* Fix a data modeling error in the `serde` feature pointed out by Trevor Perrin
which caused points and scalars to be serialized with length fields rather
than as fixed-size 32-byte arrays. This is a breaking change, but it fixes
compatibility with `serde-json` and ensures that the `serde-bincode` encoding
matches the conventional encoding for X/Ed25519.
* Update `rand_core` to `0.5`, allowing use with new `rand` versions.
* Switch from `clear_on_drop` to `zeroize` (by Tony Arcieri).
* Require `subtle = ^2.2.1` and remove the note advising nightly Rust, which is
no longer required as of that version of `subtle`. See the `subtle`
changelog for more details.
* Update `README.md` for `2.x` series.
* Remove the `build.rs` hack which loaded the entire crate into its own
`build.rs` to generate constants, and keep the constants in the source code.
The only significant change is the data model change to the `serde` feature;
besides the `rand_core` version bump, there are no other user-visible changes.
## 1.x series
### 1.2.6
* Fixes to make using alloc+no_std possible for stable Rust.
### 1.2.5
* Update the optional `packed-simd` dependency to rely on a newer,
maintained version of the `packed-simd-2` crate.
### 1.2.4
* Specify a semver bound for `clear_on_drop` rather than an exact version,
addressing an issue where changes to inline assembly in rustc prevented
`clear_on_drop` from working without an update.
### 1.2.3
* Fix an issue identified by a Quarkslab audit (and Jack Grigg), where manually
constructing unreduced `Scalar` values, as needed for X/Ed25519, and then
performing scalar/scalar arithmetic could compute incorrect results.
* Switch to upstream Rust intrinsics for the IFMA backend now that they exist in
Rust and don't need to be defined locally.
* Ensure that the NAF computation works correctly, even for parameters never
used elsewhere in the codebase.
* Minor refactoring to EdwardsPoint decompression.
* Fix broken links in documentation.
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
root (not quite correctly done in 1.2.2).
### 1.2.2
* Fix a typo in an internal doc-comment.
* Add the "crypto" tag to crate metadata.
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
root.
### 1.2.1
* Fix a bug in bucket index calculations in the Pippenger multiscalar algorithm
for very large input sizes.
* Add a more extensive randomized multiscalar multiplication consistency check
to the test suite to prevent regressions.
* Ensure that multiscalar and NAF computations work correctly on extremal
`Scalar` values constructed via `from_bits`.
### 1.2.0
* New multiscalar multiplication algorithm with better performance for
large problem sizes. The backend algorithm is selected
transparently using the size hints of the input iterators, so no
changes are required for client crates to start using it.
* Equality of Edwards points is now checked in projective coordinates.
* Serde can now be used with `no_std`.
### 1.1.4
* Fix typos in documentation comments.
* Remove unnecessary `Default` bound on `Scalar::from_hash`.
### 1.1.3
* Reverts the change in 1.1.0 to allow owned and borrowed RNGs, which caused a breakage due to a subtle interaction with ownership rules. (The `RngCore` change is retained).
### 1.1.2
* Disabled KaTeX on `docs.rs` pending proper [support upstream](https://github.com/rust-lang/docs.rs/issues/302).
### 1.1.1
* Fixed an issue related to `#[cfg(rustdoc)]` which prevented documenting multiple backends.
### 1.1.0
* Adds support for precomputation for multiscalar multiplication.
* Restructures the internal source tree into `serial` and `vector` backends (no change to external API).
* Adds a new IFMA backend which sets speed records.
* The `avx2_backend` feature is now an alias for the `simd_backend` feature, which autoselects an appropriate vector backend (currently AVX2 or IFMA).
* Replaces the `rand` dependency with `rand_core`.
* Generalizes trait bounds on `RistrettoPoint::random()` and `Scalar::random()` to allow owned and borrowed RNGs and to allow `RngCore` instead of `Rng`.
### 1.0.3
* Adds `ConstantTimeEq` implementation for compressed points.
### 1.0.2
* Fixes a typo in the naming of variables in Ristretto formulas (no change to functionality).
### 1.0.1
* Depends on the stable `2.0` version of `subtle` instead of `2.0.0-pre.0`.
### 1.0.0
Initial stable release. Yanked due to a dependency mistake (see above).

View File

@@ -1,8 +0,0 @@
# Code of Conduct
We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html),
with the following additional clauses:
* We respect the rights to privacy and anonymity for contributors and people in
the community. If someone wishes to contribute under a pseudonym different to
their primary identity, that wish is to be respected by all contributors.

View File

@@ -1,19 +0,0 @@
# Contributing to curve25519-dalek
If you have questions or comments, please feel free to email the
authors.
For feature requests, suggestions, and bug reports, please open an issue on
[our Github](https://github.com/dalek-cryptography/curve25519-dalek). (Or, send us
an email if you're opposed to using Github for whatever reason.)
Patches are welcomed as pull requests on
[our Github](https://github.com/dalek-cryptography/curve25519-dalek), as well as by
email (preferably sent to all of the authors listed in `Cargo.toml`).
All issues on curve25519-dalek are mentored; if you want help with a bug, just
ask @isislovecruft or @hdevalence.
Some issues are easier than others. The `easy` label can be used to find the
easy issues. If you want to work on an issue, please leave a comment so that we
can assign it to you!

View File

@@ -1,97 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
name = "curve25519-dalek"
version = "3.1.0"
authors = ["Isis Lovecruft <isis@patternsinthevoid.net>", "Henry de Valence <hdevalence@hdevalence.ca>"]
exclude = ["**/.gitignore", ".gitignore", ".travis.yml"]
description = "A pure-Rust implementation of group operations on ristretto255 and Curve25519"
homepage = "https://dalek.rs/curve25519-dalek"
documentation = "https://docs.rs/curve25519-dalek"
readme = "README.md"
keywords = ["cryptography", "crypto", "ristretto", "curve25519", "ristretto255"]
categories = ["cryptography", "no-std"]
license = "BSD-3-Clause"
repository = "https://github.com/dalek-cryptography/curve25519-dalek"
[package.metadata.docs.rs]
features = ["nightly", "simd_backend"]
[[bench]]
name = "dalek_benchmarks"
harness = false
[dependencies.byteorder]
version = "^1.2.3"
features = ["i128"]
default-features = false
[dependencies.digest]
version = "0.9"
default-features = false
[dependencies.fiat-crypto]
version = "0.1.6"
optional = true
[dependencies.packed_simd]
version = "0.3.4"
features = ["into_bits"]
optional = true
package = "packed_simd_2"
[dependencies.rand_core]
version = "0.5"
default-features = false
[dependencies.serde]
version = "1.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.subtle]
version = "^2.2.1"
default-features = false
[dependencies.zeroize]
version = "1"
default-features = false
[dev-dependencies.bincode]
version = "1"
[dev-dependencies.criterion]
version = "0.3.0"
[dev-dependencies.hex]
version = "0.4.2"
[dev-dependencies.rand]
version = "0.7"
[dev-dependencies.sha2]
version = "0.9"
default-features = false
[features]
alloc = ["zeroize/alloc"]
avx2_backend = ["simd_backend"]
default = ["std", "u64_backend"]
fiat_u32_backend = ["fiat-crypto"]
fiat_u64_backend = ["fiat-crypto"]
nightly = ["subtle/nightly"]
simd_backend = ["nightly", "u64_backend", "packed_simd"]
std = ["alloc", "subtle/std", "rand_core/std"]
u32_backend = []
u64_backend = []
[badges.travis-ci]
branch = "master"
repository = "dalek-cryptography/curve25519-dalek"

View File

@@ -1,65 +0,0 @@
Copyright (c) 2016-2021 isis agora lovecruft. All rights reserved.
Copyright (c) 2016-2021 Henry de Valence. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
========================================================================
Portions of curve25519-dalek were originally derived from Adam Langley's
Go ed25519 implementation, found at <https://github.com/agl/ed25519/>,
under the following licence:
========================================================================
Copyright (c) 2012 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@ -1,8 +0,0 @@
FEATURES := nightly yolocrypto avx2_backend
doc:
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html
doc-internal:
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html --document-private-items

@ -1,220 +0,0 @@
# curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek)
<img
width="33%"
align="right"
src="https://doc.dalek.rs/assets/dalek-logo-clear.png"/>
**A pure-Rust implementation of group operations on Ristretto and Curve25519.**
`curve25519-dalek` is a library providing group operations on the Edwards and
Montgomery forms of Curve25519, and on the prime-order Ristretto group.
`curve25519-dalek` is not intended to provide implementations of any particular
crypto protocol. Rather, implementations of those protocols (such as
[`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use
`curve25519-dalek` as a library.
`curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use in
implementing a wide range of ECC-based crypto protocols, such as key agreement,
signatures, anonymous credentials, rangeproofs, and zero-knowledge proof
systems.
In particular, `curve25519-dalek` implements Ristretto, which constructs a
prime-order group from a non-prime-order Edwards curve. This provides the
speed and safety benefits of Edwards curve arithmetic, without the pitfalls of
cofactor-related abstraction mismatches.
# Documentation
The semver-stable, public-facing `curve25519-dalek` API is documented
[here][docs-external]. In addition, the unstable internal implementation
details are documented [here][docs-internal].
The `curve25519-dalek` documentation requires a custom HTML header to include
KaTeX for math support. Unfortunately `cargo doc` does not currently support
this, but docs can be built using
```sh
make doc
make doc-internal
```
# Use
To import `curve25519-dalek`, add the following to the dependencies section of
your project's `Cargo.toml`:
```toml
curve25519-dalek = "3"
```
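A minimal sketch of typical usage (an illustration, not from the upstream README): a DH-style agreement over the Ristretto group.
```rust
use curve25519_dalek::constants::RISTRETTO_BASEPOINT_POINT;
use curve25519_dalek::scalar::Scalar;

fn main() {
    // Two secret scalars; real code would sample these randomly.
    let a = Scalar::from(1234567890u64);
    let b = Scalar::from(987654321u64);

    // Public keys: scalar multiplication in the prime-order Ristretto group.
    let pub_a = RISTRETTO_BASEPOINT_POINT * a;
    let pub_b = RISTRETTO_BASEPOINT_POINT * b;

    // Both parties derive the same shared point.
    assert_eq!(pub_b * a, pub_a * b);
}
```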
The sole breaking change in the `3.x` series was an update to the `digest`
version, and in terms of non-breaking changes it includes:
* support for using `alloc` instead of `std` on stable Rust,
* the Elligator2 encoding for Edwards points,
* a fix to use `packed_simd_2`,
* various documentation fixes and improvements,
* support for configurably-sized, precomputed lookup tables for basepoint scalar
multiplication,
* two new formally-verified field arithmetic backends which use the Fiat Crypto
Rust code, which is generated from proofs of functional correctness checked by
the Coq theorem proving system, and
* support for explicitly calling the `zeroize` traits for all point types.
The `2.x` series has an API almost entirely unchanged from the `1.x` series,
except that:
* an error in the data modeling for the (optional) `serde` feature was
corrected, so that when the `2.x`-series `serde` implementation is used
with `serde-bincode`, the derived serialization matches the usual X/Ed25519
formats;
* the `rand` version was updated.
See `CHANGELOG.md` for more details.
# Backends and Features
The `nightly` feature enables features available only when using a Rust nightly
compiler. In particular, it is required for rendering documentation and for
the SIMD backends.
Curve arithmetic is implemented using one of the following backends:
* a `u32` backend using serial formulas and `u64` products;
* a `u64` backend using serial formulas and `u128` products;
* an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records);
* an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records);
By default the `u64` backend is selected. To select a specific backend, use:
```sh
cargo build --no-default-features --features "std u32_backend"
cargo build --no-default-features --features "std u64_backend"
# Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2
cargo build --no-default-features --features "std simd_backend"
# Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma
cargo build --no-default-features --features "std simd_backend"
```
Crates using `curve25519-dalek` can either select a backend on behalf of their
users, or expose feature flags that control the `curve25519-dalek` backend.
The `std` feature is enabled by default, but it can be disabled for no-`std`
builds using `--no-default-features`. Note that this requires explicitly
selecting an arithmetic backend using one of the `_backend` features.
If no backend is selected, compilation will fail.
# Safety
The `curve25519-dalek` types are designed to make illegal states
unrepresentable. For example, any instance of an `EdwardsPoint` is
guaranteed to hold a point on the Edwards curve, and any instance of a
`RistrettoPoint` is guaranteed to hold a valid point in the Ristretto
group.
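As an illustration (a sketch, not from the upstream README), decompression is the validating boundary for untrusted bytes:
```rust
use curve25519_dalek::edwards::CompressedEdwardsY;

fn main() {
    // 32 bytes that may or may not encode a curve point.
    let bytes = [0xffu8; 32];

    // decompress() returns Option<EdwardsPoint>: the only way to turn
    // untrusted bytes into an EdwardsPoint is through this validation.
    match CompressedEdwardsY(bytes).decompress() {
        Some(p) => println!("valid point: {:?}", p.compress()),
        None => println!("rejected: not a point on the curve"),
    }
}
```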
All operations are implemented using constant-time logic (no
secret-dependent branches, no secret-dependent memory accesses),
unless specifically marked as being variable-time code.
We believe that our constant-time logic is lowered to constant-time
assembly, at least on `x86_64` targets.
As an additional guard against possible future compiler optimizations,
the `subtle` crate places an optimization barrier before every
conditional move or assignment. More details can be found in [the
documentation for the `subtle` crate][subtle_doc].
Some functionality (e.g., multiscalar multiplication or batch
inversion) requires heap allocation for temporary buffers. All
heap-allocated buffers of potentially secret data are explicitly
zeroed before release.
However, we do not attempt to zero stack data, for two reasons.
First, it's not possible to do so correctly: we don't have control
over stack allocations, so there's no way to know how much data to
wipe. Second, because `curve25519-dalek` provides a mid-level API,
the correct place to start zeroing stack data is likely not at the
entrypoints of `curve25519-dalek` functions, but at the entrypoints of
functions in other crates.
The implementation is memory-safe, and contains no significant
`unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD
intrinsics. These are marked `unsafe` only because invoking them on an
inappropriate CPU would cause `SIGILL`, but the entire backend is only
compiled with appropriate `target_feature`s, so this cannot occur.
# Performance
Benchmarks are run using [`criterion.rs`][criterion]:
```sh
cargo bench --no-default-features --features "std u32_backend"
cargo bench --no-default-features --features "std u64_backend"
# Uses avx2 or ifma only if compiled for an appropriate target.
export RUSTFLAGS="-C target_cpu=native"
cargo bench --no-default-features --features "std simd_backend"
```
Performance is a secondary goal behind correctness, safety, and
clarity, but we aim to be competitive with other implementations.
# FFI
Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The
reason is that we use Rust features to provide an API that maintains safety
invariants, which are not possible to maintain across an FFI boundary. For
instance, as described in the _Safety_ section above, invalid points are
impossible to construct, and this would not be the case if we exposed point
operations over FFI.
However, `curve25519-dalek` is designed as a *mid-level* API, aimed at
implementing other, higher-level primitives. Instead of providing FFI at the
mid-level, our suggestion is to implement the higher-level primitive (a
signature, PAKE, ZKP, etc) in Rust, using `curve25519-dalek` as a dependency,
and have that crate provide a minimal, byte-buffer-oriented FFI specific to
that primitive.
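For example, such a wrapper might look like the following sketch (the function name and error convention here are invented for illustration):
```rust
use curve25519_dalek::ristretto::CompressedRistretto;

/// Hypothetical FFI entry point for a higher-level crate: add two
/// compressed Ristretto points, writing the compressed sum to `out`.
/// Returns 0 on success, -1 if either input is not a valid encoding.
#[no_mangle]
pub extern "C" fn ristretto_point_add(
    a: &[u8; 32],
    b: &[u8; 32],
    out: &mut [u8; 32],
) -> i32 {
    let (pa, pb) = match (
        CompressedRistretto(*a).decompress(),
        CompressedRistretto(*b).decompress(),
    ) {
        (Some(pa), Some(pb)) => (pa, pb),
        _ => return -1, // invalid point encoding
    };
    *out = (pa + pb).compress().to_bytes();
    0
}
```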
# Contributing
Please see [CONTRIBUTING.md][contributing].
Patches and pull requests should be made against the `develop`
branch, **not** `master`.
# About
**SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in
his second full episode, "Into the Dalek". A beleaguered ship of the "Combined
Galactic Resistance" has discovered a broken Dalek that has turned "good",
desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers
are miniaturized and enter the Dalek, which the Doctor names Rusty. They
repair the damage, but accidentally restore it to its original nature, causing
it to go on the rampage and alert the Dalek fleet to the whereabouts of the
rebel ship. However, the Doctor manages to return Rusty to its previous state
by linking his mind with the Dalek's: Rusty shares the Doctor's view of the
universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the
other Daleks and departs the ship, determined to track down and bring an end
to the Dalek race.*
`curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence.
Portions of this library were originally a port of [Adam Langley's
Golang ed25519 library](https://github.com/agl/ed25519), which was in
turn a port of the reference `ref10` implementation. Most of this code,
including the 32-bit field arithmetic, has since been rewritten.
The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and
the addition chain for scalar inversion was provided by Brian Smith. The
optimised batch inversion was contributed by Sean Bowe and Daira Hopwood.
The `no_std` and `zeroize` support was contributed by Tony Arcieri.
Thanks also to Ashley Hauck, Lucas Salibian, and Manish Goregaokar for their
contributions.
[ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek
[x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek
[contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md
[docs-external]: https://doc.dalek.rs/curve25519_dalek/
[docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/
[criterion]: https://github.com/japaric/criterion.rs
[parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html
[subtle_doc]: https://doc.dalek.rs/subtle/

@ -1,363 +0,0 @@
#![allow(non_snake_case)]
extern crate rand;
use rand::rngs::OsRng;
use rand::thread_rng;
#[macro_use]
extern crate criterion;
use criterion::measurement::Measurement;
use criterion::BatchSize;
use criterion::Criterion;
use criterion::{BenchmarkGroup, BenchmarkId};
extern crate curve25519_dalek;
use curve25519_dalek::constants;
use curve25519_dalek::scalar::Scalar;
static BATCH_SIZES: [usize; 5] = [1, 2, 4, 8, 16];
static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768, 1024];
mod edwards_benches {
use super::*;
use curve25519_dalek::edwards::EdwardsPoint;
fn compress(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_POINT;
c.bench_function("EdwardsPoint compression", move |b| b.iter(|| B.compress()));
}
fn decompress(c: &mut Criterion) {
let B_comp = &constants::ED25519_BASEPOINT_COMPRESSED;
c.bench_function("EdwardsPoint decompression", move |b| {
b.iter(|| B_comp.decompress().unwrap())
});
}
fn consttime_fixed_base_scalar_mul(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_TABLE;
let s = Scalar::from(897987897u64).invert();
c.bench_function("Constant-time fixed-base scalar mul", move |b| {
b.iter(|| B * &s)
});
}
fn consttime_variable_base_scalar_mul(c: &mut Criterion) {
let B = &constants::ED25519_BASEPOINT_POINT;
let s = Scalar::from(897987897u64).invert();
c.bench_function("Constant-time variable-base scalar mul", move |b| {
b.iter(|| B * s)
});
}
fn vartime_double_base_scalar_mul(c: &mut Criterion) {
c.bench_function("Variable-time aA+bB, A variable, B fixed", |bench| {
let mut rng = thread_rng();
let A = &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE;
bench.iter_batched(
|| (Scalar::random(&mut rng), Scalar::random(&mut rng)),
|(a, b)| EdwardsPoint::vartime_double_scalar_mul_basepoint(&a, &A, &b),
BatchSize::SmallInput,
);
});
}
criterion_group! {
name = edwards_benches;
config = Criterion::default();
targets =
compress,
decompress,
consttime_fixed_base_scalar_mul,
consttime_variable_base_scalar_mul,
vartime_double_base_scalar_mul,
}
}
mod multiscalar_benches {
use super::*;
use curve25519_dalek::edwards::EdwardsPoint;
use curve25519_dalek::edwards::VartimeEdwardsPrecomputation;
use curve25519_dalek::traits::MultiscalarMul;
use curve25519_dalek::traits::VartimeMultiscalarMul;
use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul;
fn construct_scalars(n: usize) -> Vec<Scalar> {
let mut rng = thread_rng();
(0..n).map(|_| Scalar::random(&mut rng)).collect()
}
fn construct_points(n: usize) -> Vec<EdwardsPoint> {
let mut rng = thread_rng();
(0..n)
.map(|_| &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE)
.collect()
}
fn construct(n: usize) -> (Vec<Scalar>, Vec<EdwardsPoint>) {
(construct_scalars(n), construct_points(n))
}
fn consttime_multiscalar_mul<M: Measurement>(c: &mut BenchmarkGroup<M>) {
for multiscalar_size in &MULTISCALAR_SIZES {
c.bench_with_input(
BenchmarkId::new(
"Constant-time variable-base multiscalar multiplication",
*multiscalar_size,
),
&multiscalar_size,
|b, &&size| {
let points = construct_points(size);
// This is supposed to be constant-time, but we might as well
// rerandomize the scalars for every call just in case.
b.iter_batched(
|| construct_scalars(size),
|scalars| EdwardsPoint::multiscalar_mul(&scalars, &points),
BatchSize::SmallInput,
);
},
);
}
}
fn vartime_multiscalar_mul<M: Measurement>(c: &mut BenchmarkGroup<M>) {
for multiscalar_size in &MULTISCALAR_SIZES {
c.bench_with_input(
BenchmarkId::new(
"Variable-time variable-base multiscalar multiplication",
*multiscalar_size,
),
&multiscalar_size,
|b, &&size| {
let points = construct_points(size);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels).
b.iter_batched(
|| construct_scalars(size),
|scalars| EdwardsPoint::vartime_multiscalar_mul(&scalars, &points),
BatchSize::SmallInput,
);
},
);
}
}
fn vartime_precomputed_pure_static<M: Measurement>(c: &mut BenchmarkGroup<M>) {
for multiscalar_size in &MULTISCALAR_SIZES {
c.bench_with_input(
BenchmarkId::new(
"Variable-time fixed-base multiscalar multiplication",
*multiscalar_size,
),
&multiscalar_size,
move |b, &&total_size| {
let static_size = total_size;
let static_points = construct_points(static_size);
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels).
b.iter_batched(
|| construct_scalars(static_size),
|scalars| precomp.vartime_multiscalar_mul(&scalars),
BatchSize::SmallInput,
);
},
);
}
}
fn vartime_precomputed_helper<M: Measurement>(
c: &mut BenchmarkGroup<M>,
dynamic_fraction: f64,
) {
for multiscalar_size in &MULTISCALAR_SIZES {
c.bench_with_input(
BenchmarkId::new(
"Variable-time mixed-base multiscalar multiplication ({:.0}pct dyn)",
format!("({:.0}pct dyn)", 100.0 * dynamic_fraction),
),
&multiscalar_size,
move |b, &&total_size| {
let dynamic_size = ((total_size as f64) * dynamic_fraction) as usize;
let static_size = total_size - dynamic_size;
let static_points = construct_points(static_size);
let dynamic_points = construct_points(dynamic_size);
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
// Rerandomize the scalars for every call to prevent
// false timings from better caching (e.g., the CPU
// cache lifts exactly the right table entries for the
// benchmark into the highest cache levels). Timings
// should be independent of points so we don't
// randomize them.
b.iter_batched(
|| {
(
construct_scalars(static_size),
construct_scalars(dynamic_size),
)
},
|(static_scalars, dynamic_scalars)| {
precomp.vartime_mixed_multiscalar_mul(
&static_scalars,
&dynamic_scalars,
&dynamic_points,
)
},
BatchSize::SmallInput,
);
},
);
}
}
fn multiscalar_multiplications(c: &mut Criterion) {
let mut group: BenchmarkGroup<_> = c.benchmark_group("Multiscalar muls");
consttime_multiscalar_mul(&mut group);
vartime_multiscalar_mul(&mut group);
vartime_precomputed_pure_static(&mut group);
let dynamic_fracs = [0.0, 0.2, 0.5];
for frac in dynamic_fracs.iter() {
vartime_precomputed_helper(&mut group, *frac);
}
group.finish();
}
criterion_group! {
name = multiscalar_benches;
// Lower the sample size to run the benchmarks faster
config = Criterion::default().sample_size(15);
targets =
multiscalar_multiplications,
}
}
mod ristretto_benches {
use super::*;
use curve25519_dalek::ristretto::RistrettoPoint;
fn compress(c: &mut Criterion) {
c.bench_function("RistrettoPoint compression", |b| {
let B = &constants::RISTRETTO_BASEPOINT_POINT;
b.iter(|| B.compress())
});
}
fn decompress(c: &mut Criterion) {
c.bench_function("RistrettoPoint decompression", |b| {
let B_comp = &constants::RISTRETTO_BASEPOINT_COMPRESSED;
b.iter(|| B_comp.decompress().unwrap())
});
}
fn double_and_compress_batch<M: Measurement>(c: &mut BenchmarkGroup<M>) {
for batch_size in &BATCH_SIZES {
c.bench_with_input(
BenchmarkId::new("Batch Ristretto double-and-encode", *batch_size),
&batch_size,
|b, &&size| {
let mut rng = OsRng;
let points: Vec<RistrettoPoint> = (0..size)
.map(|_| RistrettoPoint::random(&mut rng))
.collect();
b.iter(|| RistrettoPoint::double_and_compress_batch(&points));
},
);
}
}
fn double_and_compress_group(c: &mut Criterion) {
let mut group: BenchmarkGroup<_> = c.benchmark_group("double & compress batched");
double_and_compress_batch(&mut group);
group.finish();
}
criterion_group! {
name = ristretto_benches;
config = Criterion::default();
targets =
compress,
decompress,
double_and_compress_group,
}
}
mod montgomery_benches {
use super::*;
fn montgomery_ladder(c: &mut Criterion) {
c.bench_function("Montgomery pseudomultiplication", |b| {
let B = constants::X25519_BASEPOINT;
let s = Scalar::from(897987897u64).invert();
b.iter(|| B * s);
});
}
criterion_group! {
name = montgomery_benches;
config = Criterion::default();
targets = montgomery_ladder,
}
}
mod scalar_benches {
use super::*;
fn scalar_inversion(c: &mut Criterion) {
c.bench_function("Scalar inversion", |b| {
let s = Scalar::from(897987897u64).invert();
b.iter(|| s.invert());
});
}
fn batch_scalar_inversion<M: Measurement>(c: &mut BenchmarkGroup<M>) {
for batch_size in &BATCH_SIZES {
c.bench_with_input(
BenchmarkId::new("Batch scalar inversion", *batch_size),
&batch_size,
|b, &&size| {
let mut rng = OsRng;
let scalars: Vec<Scalar> =
(0..size).map(|_| Scalar::random(&mut rng)).collect();
b.iter(|| {
let mut s = scalars.clone();
Scalar::batch_invert(&mut s);
});
},
);
}
}
fn batch_scalar_inversion_group(c: &mut Criterion) {
let mut group: BenchmarkGroup<_> = c.benchmark_group("batch scalar inversion");
batch_scalar_inversion(&mut group);
group.finish();
}
criterion_group! {
name = scalar_benches;
config = Criterion::default();
targets =
scalar_inversion,
batch_scalar_inversion_group,
}
}
criterion_main!(
scalar_benches::scalar_benches,
montgomery_benches::montgomery_benches,
ristretto_benches::ristretto_benches,
edwards_benches::edwards_benches,
multiscalar_benches::multiscalar_benches,
);

Binary file not shown (removed image, 110 KiB).

Binary file not shown (removed image, 107 KiB).

File diff suppressed because one or more lines are too long (removed image, 59 KiB).

@ -1,10 +0,0 @@
<link rel="stylesheet" href="https://doc.dalek.rs/assets/katex/katex.min.css">
<script src="https://doc.dalek.rs/assets/katex/katex.min.js"></script>
<script src="https://doc.dalek.rs/assets/katex/contrib/auto-render.min.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function() { renderMathInElement(document.body); });
</script>
<style>
.katex { font-size: 1em !important; }
pre.rust, .docblock code, .docblock-short code { font-size: 0.85em !important; }
</style>

@ -1,140 +0,0 @@
An AVX2 implementation of the vectorized point operation strategy.
# Field element representation
Our strategy is to implement 4-wide multiplication and squaring by
wordslicing, using one 64-bit AVX2 lane for each field element. Field
elements are represented in the usual way as 10 `u32` limbs in radix
\\(25.5\\) (i.e., alternating between \\(2\^{26}\\) for even limbs and
\\(2\^{25}\\) for odd limbs). This has the effect that passing between
the parallel 32-bit AVX2 representation and the serial 64-bit
representation (which uses radix \\(2^{51}\\)) amounts to regrouping
digits.
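Concretely (a restatement not in the original text), with radix-\\(25.5\\)
digits \\(a\_0, \ldots, a\_9\\) and radix-\\(2\^{51}\\) limbs
\\(b\_0, \ldots, b\_4\\),
$$
a = \sum\_{i=0}\^{9} a\_i 2\^{\lceil 25.5 i \rceil}, \qquad
b\_j = a\_{2j} + a\_{2j+1} 2\^{26}, \qquad
a = \sum\_{j=0}\^{4} b\_j 2\^{51 j},
$$
since \\( \lceil 25.5 (2j+1) \rceil = 51j + 26 \\): each 64-bit limb is a
pair of adjacent 32-bit digits, so no carries are needed in either direction.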
The field element representation is oriented around the AVX2
`vpmuludq` instruction, which multiplies the low 32 bits of each
64-bit lane of each operand to produce a 64-bit result.
```text,no_run
(a1 ?? b1 ?? c1 ?? d1 ??)
(a2 ?? b2 ?? c2 ?? d2 ??)
(a1*a2 b1*b2 c1*c2 d1*d2)
```
To unpack 32-bit values into 64-bit lanes for use in multiplication
it would be convenient to use the `vpunpck[lh]dq` instructions,
which unpack and interleave the low and high 32-bit lanes of two
source vectors.
However, the AVX2 versions of these instructions are designed to
operate only within 128-bit lanes of the 256-bit vectors, so that
interleaving the low lanes of `(a0 b0 c0 d0 a1 b1 c1 d1)` with zero
gives `(a0 00 b0 00 a1 00 b1 00)`. Instead, we pre-shuffle the data
layout as `(a0 b0 a1 b1 c0 d0 c1 d1)` so that we can unpack the
"low" and "high" parts as
```text,no_run
(a0 00 b0 00 c0 00 d0 00)
(a1 00 b1 00 c1 00 d1 00)
```
The data layout for a vector of four field elements \\( (a,b,c,d)
\\) with limbs \\( a_0, a_1, \ldots, a_9 \\) is as `[u32x8; 5]` in
the form
```text,no_run
(a0 b0 a1 b1 c0 d0 c1 d1)
(a2 b2 a3 b3 c2 d2 c3 d3)
(a4 b4 a5 b5 c4 d4 c5 d5)
(a6 b6 a7 b7 c6 d6 c7 d7)
(a8 b8 a9 b9 c8 d8 c9 d9)
```
Since this breaks cleanly into two 128-bit lanes, it may be possible
to adapt it to 128-bit vector instructions such as NEON without too
much difficulty.
# Avoiding Overflow in Doubling
To analyze the size of the field element coefficients during the
computations, we can parameterize the bounds on the limbs of each
field element by \\( b \in \mathbb R \\) representing the excess bits
above that limb's radix, so that each limb is bounded by either
\\(2\^{25+b} \\) or \\( 2\^{26+b} \\), as appropriate.
The multiplication routine requires that its inputs are bounded with
\\( b < 1.75 \\), in order to fit a multiplication by \\( 19 \\)
into 32 bits. Since \\( \lg 19 < 4.25 \\), \\( 19x < 2\^{32} \\)
when \\( x < 2\^{27.75} = 2\^{26 + 1.75} \\). However, this is only
required for one of the inputs; the other can grow up to \\( b < 2.5
\\).
In addition, the multiplication and squaring routines do not
canonically reduce their outputs, but can leave some small uncarried
excesses, so that their reduced outputs are bounded with
\\( b < 0.007 \\).
The non-parallel portion of the doubling formulas is
$$
\begin{aligned}
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
&\gets
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
\end{aligned}
$$
Computing \\( (S\_5, S\_6, S\_8, S\_9 ) \\) as
$$
\begin{matrix}
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
+& S\_2 & & & S\_2 \\\\
+& & & S\_3 & \\\\
+& & & S\_3 & \\\\
+& & 2p & 2p & 2p \\\\
-& & S\_2 & S\_2 & \\\\
-& & & & S\_4 \\\\
=& S\_5 & S\_6 & S\_8 & S\_9
\end{matrix}
$$
results in bit-excesses \\( < (1.01, 1.60, 2.33, 2.01)\\) for
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
are then
$$
\begin{aligned}
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 2.01) \\\\
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 2.01)
\end{aligned}
$$
which are too large: it's not possible to arrange the multiplicands so
that one vector has \\(b < 2.5\\) and the other has \\( b < 1.75 \\).
However, if we flip the sign of \\( S\_4 = S\_0\^2 \\) during
squaring, so that we output \\(S\_4' = -S\_4 \pmod p\\), then we can
compute
$$
\begin{matrix}
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
+& S\_2 & & & S\_2 \\\\
+& & & S\_3 & \\\\
+& & & S\_3 & \\\\
+& & & & S\_4' \\\\
+& & 2p & 2p & \\\\
-& & S\_2 & S\_2 & \\\\
=& S\_5 & S\_6 & S\_8 & S\_9
\end{matrix}
$$
resulting in bit-excesses \\( < (1.01, 1.60, 2.33, 1.60)\\) for
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
are then
$$
\begin{aligned}
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 1.60) \\\\
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 1.60)
\end{aligned}
$$
whose right-hand sides are all bounded with \\( b < 1.75 \\) and
whose left-hand sides are all bounded with \\( b < 2.5 \\),
so that we can avoid any intermediate reductions.

@ -1,580 +0,0 @@
An AVX512-IFMA implementation of the vectorized point operation
strategy.
# IFMA instructions
AVX512-IFMA is an extension to AVX-512 consisting of two instructions:
* `vpmadd52luq`: packed multiply of unsigned 52-bit integers and add
the low 52 product bits to 64-bit accumulators;
* `vpmadd52huq`: packed multiply of unsigned 52-bit integers and add
the high 52 product bits to 64-bit accumulators;
These operate on 64-bit lanes of their source vectors, taking the low
52 bits of each lane of each source vector, computing the 104-bit
products of each pair, and then adding either the high or low 52 bits
of the 104-bit products to the 64-bit lanes of the destination vector.
The multiplication is performed internally by reusing circuitry for
floating-point arithmetic. Although these instructions are part of
AVX512, the AVX512VL (vector length) extension (present whenever IFMA
is) allows using them with 512, 256, or 128-bit operands.
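As a rough scalar model (an illustration, not the actual intrinsics API), each instruction behaves per 64-bit lane like:
```rust
/// Per-lane model of `vpmadd52luq`: accumulate the low 52 bits of the
/// 104-bit product of the low 52 bits of each operand.
fn madd52lo(acc: u64, a: u64, b: u64) -> u64 {
    let mask = (1u128 << 52) - 1;
    let p = (a as u128 & mask) * (b as u128 & mask);
    acc.wrapping_add((p & mask) as u64)
}

/// Per-lane model of `vpmadd52huq`: accumulate the high 52 bits instead.
fn madd52hi(acc: u64, a: u64, b: u64) -> u64 {
    let mask = (1u128 << 52) - 1;
    let p = (a as u128 & mask) * (b as u128 & mask);
    acc.wrapping_add((p >> 52) as u64)
}
```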
This provides a major advantage to vectorized integer operations:
previously, vector operations could only use a \\(32 \times 32
\rightarrow 64\\)-bit multiplier, while serial code could use a
\\(64\times 64 \rightarrow 128\\)-bit multiplier.
## IFMA for big-integer multiplications
A detailed example of the intended use of the IFMA instructions can be
found in a 2016 paper by Gueron and Krasnov, [_Accelerating Big
Integer Arithmetic Using Intel IFMA Extensions_][2016_gueron_krasnov].
The basic idea is that multiplication of large integers (such as 1024,
2048, or more bits) can be performed as follows.
First, convert a “packed” 64-bit representation
\\[
\begin{aligned}
x &= x'_0 + x'_1 2^{64} + x'_2 2^{128} + \cdots \\\\
y &= y'_0 + y'_1 2^{64} + y'_2 2^{128} + \cdots
\end{aligned}
\\]
into a “redundant” 52-bit representation
\\[
\begin{aligned}
x &= x_0 + x_1 2^{52} + x_2 2^{104} + \cdots \\\\
y &= y_0 + y_1 2^{52} + y_2 2^{104} + \cdots
\end{aligned}
\\]
with each \\(x_i, y_j\\) in a 64-bit lane.
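A concrete sketch of this repacking step (an illustration; the limb boundaries follow from the 52-bit digit size):
```rust
/// Convert a little-endian 256-bit integer from "packed" 64-bit limbs
/// into five "redundant" 52-bit limbs, one per 64-bit word.
fn to_radix_2_52(x: &[u64; 4]) -> [u64; 5] {
    let mask = (1u64 << 52) - 1;
    [
        x[0] & mask,                          // bits   0..52
        ((x[0] >> 52) | (x[1] << 12)) & mask, // bits  52..104
        ((x[1] >> 40) | (x[2] << 24)) & mask, // bits 104..156
        ((x[2] >> 28) | (x[3] << 36)) & mask, // bits 156..208
        x[3] >> 16,                           // bits 208..256
    ]
}
```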
Writing the product as \\(z = z_0 + z_1 2^{52} + z_2 2^{104} + \cdots\\),
the “schoolbook” multiplication strategy gives
\\[
\begin{aligned}
&z_0 &&=& x_0 & y_0 & & & & & & & & \\\\
&z_1 &&=& x_1 & y_0 &+ x_0 & y_1 & & & & & & \\\\
&z_2 &&=& x_2 & y_0 &+ x_1 & y_1 &+ x_0 & y_2 & & & & \\\\
&z_3 &&=& x_3 & y_0 &+ x_2 & y_1 &+ x_1 & y_2 &+ x_0 & y_3 & & \\\\
&z_4 &&=& \vdots\\;&\\;\vdots &+ x_3 & y_1 &+ x_2 & y_2 &+ x_1 & y_3 &+ \cdots& \\\\
&z_5 &&=& & & \vdots\\;&\\;\vdots &+ x_3 & y_2 &+ x_2 & y_3 &+ \cdots& \\\\
&z_6 &&=& & & & & \vdots\\;&\\;\vdots &+ x_3 & y_3 &+ \cdots& \\\\
&z_7 &&=& & & & & & & \vdots\\;&\\;\vdots &+ \cdots& \\\\
&\vdots&&=& & & & & & & & & \ddots& \\\\
\end{aligned}
\\]
Notice that the product coefficient \\(z_k\\), representing the value
\\(z_k 2^{52k}\\), is the sum of all product terms
\\(
(x_i 2^{52 i}) (y_j 2^{52 j})
\\)
with \\(k = i + j\\).
Write the IFMA operators \\(\mathrm{lo}(a,b)\\), denoting the low
\\(52\\) bits of \\(ab\\), and
\\(\mathrm{hi}(a,b)\\), denoting the high \\(52\\) bits of
\\(ab\\).
Now we can rewrite the product terms as
\\[
\begin{aligned}
(x_i 2^{52 i}) (y_j 2^{52 j})
&=
2^{52 (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}.
\end{aligned}
\\]
This means that the low half of \\(x_i y_j\\) can be accumulated onto
the product limb \\(z_{i+j}\\) and the high half can be directly
accumulated onto the next-higher product limb \\(z_{i+j+1}\\) with no
additional operations. This allows rewriting the schoolbook
multiplication into the form
\\[
\begin{aligned}
&z_0 &&=& \mathrm{lo}(x_0,&y_0) & & & & & & & & & & \\\\
&z_1 &&=& \mathrm{lo}(x_1,&y_0) &+\mathrm{hi}(x_0,&y_0) &+\mathrm{lo}(x_0,&y_1) & & & & & & \\\\
&z_2 &&=& \mathrm{lo}(x_2,&y_0) &+\mathrm{hi}(x_1,&y_0) &+\mathrm{lo}(x_1,&y_1) &+\mathrm{hi}(x_0,&y_1) &+\mathrm{lo}(x_0,&y_2) & & \\\\
&z_3 &&=& \mathrm{lo}(x_3,&y_0) &+\mathrm{hi}(x_2,&y_0) &+\mathrm{lo}(x_2,&y_1) &+\mathrm{hi}(x_1,&y_1) &+\mathrm{lo}(x_1,&y_2) &+ \cdots& \\\\
&z_4 &&=& \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_0) &+\mathrm{lo}(x_3,&y_1) &+\mathrm{hi}(x_2,&y_1) &+\mathrm{lo}(x_2,&y_2) &+ \cdots& \\\\
&z_5 &&=& & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_1) &+\mathrm{lo}(x_3,&y_2) &+ \cdots& \\\\
&z_6 &&=& & & & & & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+ \cdots& \\\\
&\vdots&&=& & & & & & & & & & & \ddots& \\\\
\end{aligned}
\\]
Gueron and Krasnov implement multiplication by constructing vectors
out of the columns of this diagram, so that the source operands for
the IFMA instructions are of the form \\((x_0, x_1, x_2, \ldots)\\)
and \\((y_i, y_i, y_i, \ldots)\\).
After performing the multiplication,
the product terms \\(z_i\\) are then repacked into a 64-bit representation.
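A scalar sketch of the resulting accumulation pattern (one lane only; the vectorized code runs several such multiplications at once):
```rust
/// Schoolbook multiplication of radix-2^52 integers in product-scanning
/// form: the low half of each partial product lands on limb i+j and the
/// high half directly on limb i+j+1, as in the rewritten diagram above.
fn mul_radix_2_52(x: &[u64], y: &[u64]) -> Vec<u64> {
    let mask: u128 = (1 << 52) - 1;
    let mut z = vec![0u64; x.len() + y.len()];
    for (i, &xi) in x.iter().enumerate() {
        for (j, &yj) in y.iter().enumerate() {
            let p = (xi as u128) * (yj as u128);
            z[i + j] += (p & mask) as u64;    // vpmadd52luq
            z[i + j + 1] += (p >> 52) as u64; // vpmadd52huq
        }
    }
    z
}
```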
## An alternative strategy
The strategy described above is aimed at big-integer multiplications,
such as 1024, 2048, or 4096 bits, which would be used for applications
like RSA. However, elliptic curve cryptography uses much smaller field
sizes, such as 256 or 384 bits, so a different strategy is needed.
The parallel Edwards formulas provide parallelism at the level of the
formulas for curve operations. This means that instead of scanning
through the terms of the source operands and parallelizing *within* a
field element (as described above), we can arrange the computation in
product-scanning form and parallelize *across* field elements (as
described below).
The parallel Edwards
formulas provide 4-way parallelism, so they can be implemented using
256-bit vectors using a single 64-bit lane for each element, or using
512-bit vectors using two 64-bit lanes.
The only available CPU supporting IFMA (the
i3-8121U) executes 512-bit IFMA instructions at half rate compared to
256-bit instructions, so for now there's no throughput advantage to
using 512-bit IFMA instructions, and this implementation uses 256-bit
vectors.
To extend this to 512-bit vectors, it's only necessary to achieve
2-way parallelism, and it's possible (with a small amount of overhead)
to create a hybrid strategy that operates entirely within 128-bit
lanes. This means that cross-lane operations can use the faster
`vpshufd` (1c latency) instead of a general shuffle instruction (3c
latency).
# Choice of radix
The inputs to IFMA instructions are 52 bits wide, so the radix \\(r\\)
used to represent a multiprecision integer must be \\( r \leq 52 \\).
The obvious choice is the "native" radix \\(r = 52\\).
As described above, this choice
has the advantage that for \\(x_i, y_j \in [0,2^{52})\\), the product term
\\[
\begin{aligned}
(x_i 2^{52 i}) (y_j 2^{52 j})
&=
2^{52 (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)},
\end{aligned}
\\]
so that the low and high halves of the product can be directly accumulated
onto the product limbs.
In contrast, when using a smaller radix \\(r = 52 - k\\),
the product term has the form
\\[
\begin{aligned}
(x_i 2^{r i}) (y_j 2^{r j})
&=
2^{r (i+j)}(
\mathrm{lo}(x_i, y_j) +
\mathrm{hi}(x_i, y_j) 2^{52}
)
\\\\
&=
\mathrm{lo}(x_i, y_j) 2^{r (i+j)} +
(
\mathrm{hi}(x_i, y_j) 2^k
)
2^{r (i+j+1)}.
\end{aligned}
\\]
What's happening is that the product \\(x_i y_j\\) of size \\(2r\\)
bits is split not at \\(r\\) but at \\(52\\), so \\(k\\) product bits
are placed into the low half instead of the high half. This means
that the high half of the product cannot be directly accumulated onto
\\(z_{i+j+1}\\), but must first be multiplied by \\(2^k\\) (i.e., left
shifted by \\(k\\)). In addition, the low half of the product is
\\(52\\) bits large instead of \\(r\\) bits.
## Handling offset product terms
[Drucker and Gueron][2018_drucker_gueron] analyze the choice of radix
in the context of big-integer squaring, outlining three ways to handle
the offset product terms, before concluding that all of them are
suboptimal:
1. Shift the results after accumulation;
2. Shift the input operands before multiplication;
3. Split the MAC operation, accumulating into a zeroed register,
shifting the result, and then adding.
The first option is rejected because it could double-shift some
previously accumulated terms, the second doesn't work because the
inputs could become larger than \\(52\\) bits, and the third requires
additional instructions to handle the shifting and adding.
Based on an analysis of the total number of instructions, they suggest an
addition to the instruction set, which they call `FMSA` (fused
multiply-shift-add). This would shift the result according to an 8-bit
immediate value before accumulating it into the destination register.
However, this change to the instruction set doesn't seem to be
necessary. Instead, the product terms can be grouped according to
their coefficients, accumulated together, then shifted once before
adding them to the final sum. This uses an extra register, shift, and
add, but only once per product term (accumulation target), not once
per source term (as in the Drucker-Gueron paper).
Moreover, because IFMA instructions execute only on two ports
(presumably 0 and 1), while adds and shifts can execute on three ports
(0, 1, and 5), the adds and shifts can execute independently of the
IFMA operations, as long as there is not too much pressure on port 5.
This means that, although the total number of instructions increases,
the shifts and adds do not necessarily increase the execution time, as
long as throughput is limited by IFMA operations.
Finally, because IFMA instructions have 4 cycle latency and 0.5/1
cycle throughput (for 256/512 bit vectors), maximizing IFMA throughput
requires either 8 (for 256) or 4 (for 512) independent operations. So
accumulating groups of terms independently before adding them at the
end may be necessary anyways, in order to prevent long chains of
dependent instructions.
## Advantages of a smaller radix
Using a smaller radix has other advantages. Although radix \\(52\\)
is an unsaturated representation from the point of view of the
\\(64\\)-bit accumulators (because up to 4096 product terms can be
accumulated without carries), it's a saturated representation from the
point of view of the multiplier (since \\(52\\)-bit values are the
maximum input size).
Because the inputs to a multiplication must have all of their limbs
bounded by \\(2^{52}\\), limbs in excess of \\(2^{52}\\) must be
reduced before they can be used as an input. The
[Gueron-Krasnov][2016_gueron_krasnov] paper suggests normalizing
values using a standard, sequential carry chain: for each limb, add
the carryin from reducing the previous limb, compute the carryout and
reduce the current limb, then move to the next limb.
However, when using a smaller radix, such as \\(51\\), each limb can
store a carry bit and still be used as the input to a multiplication.
This means that the inputs do not need to be normalized, and instead
of using a sequential carry chain, we can compute all carryouts in
parallel, reduce all limbs in parallel, and then add the carryins in
parallel (possibly growing the limb values by one bit).
Because the output of this partial reduction is an acceptable
multiplication input, we can "close the loop" using partial reductions
and never have to normalize to a canonical representation through the
entire computation, in contrast to the Gueron-Krasnov approach, which
converts back to a packed representation after every operation. (This
idea seems to trace back to at least as early as [this 1999
paper][1999_walter]).
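A scalar sketch of this carry handling (an illustration of the idea, not the crate's code):
```rust
/// Partial reduction mod p = 2^255 - 19 in radix 2^51: compute all
/// carry-outs, mask all limbs, then add the carry-ins, folding the top
/// carry around multiplied by 19 (since 2^255 = 19 mod p). The output
/// limbs fit in 52 bits, so they are again valid multiplication inputs.
fn partial_reduce(z: [u64; 5]) -> [u64; 5] {
    let mask = (1u64 << 51) - 1;
    // All carry-outs at once (independent shifts)...
    let c = [z[0] >> 51, z[1] >> 51, z[2] >> 51, z[3] >> 51, z[4] >> 51];
    // ...all limbs reduced at once (independent masks)...
    let mut r = [z[0] & mask, z[1] & mask, z[2] & mask, z[3] & mask, z[4] & mask];
    // ...then the carry-ins are added back in.
    r[0] += 19 * c[4];
    r[1] += c[0];
    r[2] += c[1];
    r[3] += c[2];
    r[4] += c[3];
    r
}
```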
Using \\(r = 51\\) is enough to keep a carry bit in each limb and
avoid normalizations. What about an even smaller radix? One reason
to choose a smaller radix would be to align the limb boundaries with
an inline reduction (for instance, choosing \\(r = 43\\) for the
Mersenne field \\(p = 2^{127} - 1\\)), but for \\(p = 2^{255} - 19\\),
\\(r = 51 = 255/5\\) is the natural choice.
# Multiplication
The inputs to a multiplication are two field elements
\\[
\begin{aligned}
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} \\\\
y &= y_0 + y_1 2^{51} + y_2 2^{102} + y_3 2^{153} + y_4 2^{204},
\end{aligned}
\\]
with limbs in range \\([0,2^{52})\\).
Writing the product terms as
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
\end{aligned}
\\]
a schoolbook multiplication in product scanning form takes the form
\\[
\begin{aligned}
z_0 &= x_0 y_0 \\\\
z_1 &= x_1 y_0 + x_0 y_1 \\\\
z_2 &= x_2 y_0 + x_1 y_1 + x_0 y_2 \\\\
z_3 &= x_3 y_0 + x_2 y_1 + x_1 y_2 + x_0 y_3 \\\\
z_4 &= x_4 y_0 + x_3 y_1 + x_2 y_2 + x_1 y_3 + x_0 y_4 \\\\
z_5 &= x_4 y_1 + x_3 y_2 + x_2 y_3 + x_1 y_4 \\\\
z_6 &= x_4 y_2 + x_3 y_3 + x_2 y_4 \\\\
z_7 &= x_4 y_3 + x_3 y_4 \\\\
z_8 &= x_4 y_4 \\\\
z_9 &= 0 \\\\
\end{aligned}
\\]
Each term \\(x_i y_j\\) can be written in terms of IFMA operations as
\\[
x_i y_j = \mathrm{lo}(x_i,y_j) + 2\mathrm{hi}(x_i,y_j)2^{51}.
\\]
Substituting this equation into the schoolbook multiplication, then
moving terms to eliminate the \\(2^{51}\\) factors gives
\\[
\begin{aligned}
z_0 &= \mathrm{lo}(x_0, y_0) \\\\
&+ \qquad 0 \\\\
z_1 &= \mathrm{lo}(x_1, y_0) + \mathrm{lo}(x_0, y_1) \\\\
&+ \qquad 2( \mathrm{hi}(x_0, y_0) )\\\\
z_2 &= \mathrm{lo}(x_2, y_0) + \mathrm{lo}(x_1, y_1) + \mathrm{lo}(x_0, y_2) \\\\
&+ \qquad 2( \mathrm{hi}(x_1, y_0) + \mathrm{hi}(x_0, y_1) )\\\\
z_3 &= \mathrm{lo}(x_3, y_0) + \mathrm{lo}(x_2, y_1) + \mathrm{lo}(x_1, y_2) + \mathrm{lo}(x_0, y_3) \\\\
&+ \qquad 2( \mathrm{hi}(x_2, y_0) + \mathrm{hi}(x_1, y_1) + \mathrm{hi}(x_0, y_2) )\\\\
z_4 &= \mathrm{lo}(x_4, y_0) + \mathrm{lo}(x_3, y_1) + \mathrm{lo}(x_2, y_2) + \mathrm{lo}(x_1, y_3) + \mathrm{lo}(x_0, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_3, y_0) + \mathrm{hi}(x_2, y_1) + \mathrm{hi}(x_1, y_2) + \mathrm{hi}(x_0, y_3) )\\\\
z_5 &= \mathrm{lo}(x_4, y_1) + \mathrm{lo}(x_3, y_2) + \mathrm{lo}(x_2, y_3) + \mathrm{lo}(x_1, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_0) + \mathrm{hi}(x_3, y_1) + \mathrm{hi}(x_2, y_2) + \mathrm{hi}(x_1, y_3) + \mathrm{hi}(x_0, y_4) )\\\\
z_6 &= \mathrm{lo}(x_4, y_2) + \mathrm{lo}(x_3, y_3) + \mathrm{lo}(x_2, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_1) + \mathrm{hi}(x_3, y_2) + \mathrm{hi}(x_2, y_3) + \mathrm{hi}(x_1, y_4) )\\\\
z_7 &= \mathrm{lo}(x_4, y_3) + \mathrm{lo}(x_3, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_2) + \mathrm{hi}(x_3, y_3) + \mathrm{hi}(x_2, y_4) )\\\\
z_8 &= \mathrm{lo}(x_4, y_4) \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_3) + \mathrm{hi}(x_3, y_4) )\\\\
z_9 &= 0 \\\\
&+ \qquad 2( \mathrm{hi}(x_4, y_4) )\\\\
\end{aligned}
\\]
As noted above, our strategy will be to multiply and accumulate the
terms with coefficient \\(2\\) separately from those with coefficient
\\(1\\), before combining them at the end. This can alternately be
thought of as accumulating product terms into a *doubly-redundant*
representation, with two limbs for each digit, before collapsing
the doubly-redundant representation by shifts and adds.
This computation requires 25 `vpmadd52luq` and 25 `vpmadd52huq`
operations. For 256-bit vectors, IFMA operations execute on an
i3-8121U with latency 4 cycles, throughput 0.5 cycles, so executing 50
instructions requires 25 cycles' worth of throughput. Accumulating
terms with coefficient \\(1\\) and \\(2\\) separately means that the
longest dependency chain has length 5, so the critical path has length
20 cycles and the bottleneck is throughput.
# Reduction modulo \\(p\\)
The next question is how to handle the reduction modulo \\(p\\).
Because \\(p = 2^{255} - 19\\), \\(2^{255} = 19 \pmod p\\), so we can
alternately write
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}
\end{aligned}
\\]
as
\\[
\begin{aligned}
z &= (z_0 + 19z_5) + (z_1 + 19z_6) 2^{51} + (z_2 + 19z_7) 2^{102} + (z_3 + 19z_8) 2^{153} + (z_4 + 19z_9) 2^{204}.
\end{aligned}
\\]
When using a \\(64 \times 64 \rightarrow 128\\)-bit multiplier, this
can be handled (as in [Ed25519][ed25519_paper]) by premultiplying
source terms by \\(19\\). Since \\(\lg(19) < 4.25\\), this increases
their size by less than \\(4.25\\) bits, and the rest of the
multiplication can be shown to work out.
Here, we have at most \\(1\\) bit of headroom. In order to allow
premultiplication, we would need to use radix \\(2^{47}\\), which
would require six limbs instead of five. Instead, we compute the high
terms \\(z_5, \ldots, z_9\\), each using two chains of IFMA
operations, then multiply by \\(19\\) and combine with the lower terms
\\(z_0, \ldots, z_4\\). There are two ways to perform the
multiplication by \\(19\\): using more IFMA operations, or using the
`vpmullq` instruction, which computes the low \\(64\\) bits of a \\(64
\times 64\\)-bit product. However, `vpmullq` has 15c/1.5c
latency/throughput, in contrast to the 4c/0.5c latency/throughput of
IFMA operations, so it seems like a worse choice.
The high terms \\(z_5, \ldots, z_9\\) are sums of \\(52\\)-bit terms,
so they are larger than \\(52\\) bits. Write these terms in radix \\(52\\) as
\\[
z_{5+i} = z_{5+i}' + z_{5+i}'' 2^{52}, \qquad z_{5+i}' < 2^{52}.
\\]
Then the contribution of \\(z_{5+i}\\), taken modulo \\(p\\), is
\\[
\begin{aligned}
z_{5+i} 2^{255} 2^{51 i}
&=
19 (z_{5+i}' + z_{5+i}'' 2^{52}) 2^{51 i}
\\\\
&=
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
\\\\
\end{aligned}
\\]
The products \\(19 z_{5+i}', 19 z_{5+i}''\\) can be written in terms of IFMA operations as
\\[
\begin{aligned}
19 z_{5+i}' &= \mathrm{lo}(19, z_{5+i}') + 2 \mathrm{hi}(19, z_{5+i}') 2^{51}, \\\\
19 z_{5+i}'' &= \mathrm{lo}(19, z_{5+i}'') + 2 \mathrm{hi}(19, z_{5+i}'') 2^{51}. \\\\
\end{aligned}
\\]
Because \\(z_{5+i} < 2^{64}\\), \\(z_{5+i}'' < 2^{12} \\), so \\(19
z_{5+i}'' < 2^{17} < 2^{52} \\) and \\(\mathrm{hi}(19, z_{5+i}'') = 0\\).
Because IFMA operations ignore the high bits of their source
operands, we do not need to compute \\(z\_{5+i}'\\) explicitly:
the high bits will be ignored.
Combining these observations, we can write
\\[
\begin{aligned}
z_{5+i} 2^{255} 2^{51 i}
&=
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
\\\\
&=
\mathrm{lo}(19, z_{5+i}) 2^{51 i}
\+ 2 \mathrm{hi}(19, z_{5+i}) 2^{51 (i+1)}
\+ 2 \mathrm{lo}(19, z_{5+i}/2^{52}) 2^{51 (i+1)}.
\end{aligned}
\\]
For \\(i = 0,1,2,3\\), this allows reducing \\(z_{5+i}\\) onto
\\(z_{i}, z_{i+1}\\), and if the low terms are computed using a
doubly-redundant representation, no additional shifts are needed to
handle the \\(2\\) coefficients. For \\(i = 4\\), there's a
complication: the contribution becomes
\\[
\begin{aligned}
z_{9} 2^{255} 2^{204}
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2 \mathrm{hi}(19, z_{9}) 2^{255}
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 2^{255}
\\\\
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2 \mathrm{hi}(19, z_{9}) 19
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 19
\\\\
&=
\mathrm{lo}(19, z_{9}) 2^{204}
\+ 2
\mathrm{lo}(19, \mathrm{hi}(19, z_{9}) + \mathrm{lo}(19, z_{9}/2^{52})).
\\\\
\end{aligned}
\\]
It would be possible to cut the number of multiplications from 3 to 2
by carrying the high part of each \\(z_i\\) onto \\(z_{i+1}\\). This
would eliminate 5 multiplications, clearing 2.5 cycles of port
pressure, at the cost of 5 additions, adding 1.66 cycles of port
pressure. But doing this would create a dependency between terms
(e.g., \\(z_{5}\\) must be computed before the reduction of
\\(z_{6}\\) can begin), whereas with the approach above, all
contributions to all terms are computed independently, to maximize ILP
and flexibility for the processor to schedule instructions.
This strategy performs 16 IFMA operations, adding two IFMA operations
to each of the \\(2\\)-coefficient terms and one to each of the
\\(1\\)-coefficient terms. Considering the multiplication and
reduction together, we use 66 IFMA operations, requiring 33 cycles'
throughput, while the longest chain of IFMA operations is in the
reduction of \\(z_5\\) onto \\(z_1\\), of length 7 (so 28 cycles, plus
2 cycles to combine the two parts of \\(z_5\\)), and the bottleneck is
again throughput.
Once this is done, we have computed the product terms
\\[
z = z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204},
\\]
without reducing the \\(z_i\\) to fit in \\(52\\) bits. Because the
overall flow of operations alternates multiplications and additions or
subtractions, we would have to perform a reduction after an addition
but before the next multiplication anyways, so there's no benefit to
fully reducing the limbs at the end of a multiplication. Instead, we
leave them unreduced, and track the reduction state using the type
system to ensure that unreduced limbs are not accidentally used as an
input to a multiplication.
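A minimal typestate sketch of that idea (invented type names, not the crate's actual API):
```rust
/// Limbs known to fit in 52 bits: a valid multiplication input.
struct Reduced([u64; 5]);
/// Limbs that may have grown past 52 bits, e.g. after additions.
struct Unreduced([u64; 5]);

impl Reduced {
    /// Addition can exceed the 52-bit bound, so its output is Unreduced;
    /// the compiler then forces a reduce() before the next multiplication.
    fn add(&self, other: &Reduced) -> Unreduced {
        let mut z = [0u64; 5];
        for i in 0..5 {
            z[i] = self.0[i] + other.0[i];
        }
        Unreduced(z)
    }
}

impl Unreduced {
    /// The partial reduction sketched earlier: the only path back to a
    /// valid multiplication input.
    fn reduce(self) -> Reduced {
        let mask = (1u64 << 51) - 1;
        let mut r = [0u64; 5];
        for i in 0..5 {
            r[i] = self.0[i] & mask;
        }
        r[0] += 19 * (self.0[4] >> 51);
        for i in 1..5 {
            r[i] += self.0[i - 1] >> 51;
        }
        Reduced(r)
    }
}
```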
# Squaring
Squaring operates similarly to multiplication, but with the
possibility to combine identical terms.
As before, we write the input as
\\[
\begin{aligned}
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204}
\end{aligned}
\\]
with limbs in range \\([0,2^{52})\\).
Writing the product terms as
\\[
\begin{aligned}
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
\end{aligned}
\\]
a schoolbook squaring in product scanning form takes the form
\\[
\begin{aligned}
z_0 &= x_0 x_0 \\\\
z_1 &= 2 x_1 x_0 \\\\
z_2 &= 2 x_2 x_0 + x_1 x_1 \\\\
z_3 &= 2 x_3 x_0 + 2 x_2 x_1 \\\\
z_4 &= 2 x_4 x_0 + 2 x_3 x_1 + x_2 x_2 \\\\
z_5 &= 2 x_4 x_1 + 2 x_3 x_2 \\\\
z_6 &= 2 x_4 x_2 + x_3 x_3 \\\\
z_7 &= 2 x_4 x_3 \\\\
z_8 &= x_4 x_4 \\\\
z_9 &= 0 \\\\
\end{aligned}
\\]
As before, we write \\(x_i x_j\\) as
\\[
x_i x_j = \mathrm{lo}(x_i,x_j) + 2\mathrm{hi}(x_i,x_j)2^{51},
\\]
and substitute to obtain
\\[
\begin{aligned}
z_0 &= \mathrm{lo}(x_0, x_0) + 0 \\\\
z_1 &= 2 \mathrm{lo}(x_1, x_0) + 2 \mathrm{hi}(x_0, x_0) \\\\
z_2 &= 2 \mathrm{lo}(x_2, x_0) + \mathrm{lo}(x_1, x_1) + 4 \mathrm{hi}(x_1, x_0) \\\\
z_3 &= 2 \mathrm{lo}(x_3, x_0) + 2 \mathrm{lo}(x_2, x_1) + 4 \mathrm{hi}(x_2, x_0) + 2 \mathrm{hi}(x_1, x_1) \\\\
z_4 &= 2 \mathrm{lo}(x_4, x_0) + 2 \mathrm{lo}(x_3, x_1) + \mathrm{lo}(x_2, x_2) + 4 \mathrm{hi}(x_3, x_0) + 4 \mathrm{hi}(x_2, x_1) \\\\
z_5 &= 2 \mathrm{lo}(x_4, x_1) + 2 \mathrm{lo}(x_3, x_2) + 4 \mathrm{hi}(x_4, x_0) + 4 \mathrm{hi}(x_3, x_1) + 2 \mathrm{hi}(x_2, x_2) \\\\
z_6 &= 2 \mathrm{lo}(x_4, x_2) + \mathrm{lo}(x_3, x_3) + 4 \mathrm{hi}(x_4, x_1) + 4 \mathrm{hi}(x_3, x_2) \\\\
z_7 &= 2 \mathrm{lo}(x_4, x_3) + 4 \mathrm{hi}(x_4, x_2) + 2 \mathrm{hi}(x_3, x_3) \\\\
z_8 &= \mathrm{lo}(x_4, x_4) + 4 \mathrm{hi}(x_4, x_3) \\\\
z_9 &= 0 + 2 \mathrm{hi}(x_4, x_4) \\\\
\end{aligned}
\\]
To implement these, we group terms by their coefficient, computing
those with coefficient \\(2\\) on one set of IFMA chains, and on another
set of chains, we begin with coefficient-\\(4\\) terms, then shift
left before continuing with the coefficient-\\(1\\) terms.
The reduction strategy is the same as for multiplication.
# Future improvements
LLVM won't yet use blend operations on [256-bit vectors][llvm_blend],
so there are a number of instructions that could be replaced by cheaper blends.
Although the multiplications and squarings are much faster, there's no
speedup to the additions and subtractions, so there are diminishing
returns. In fact, the complications in the doubling formulas mean
that doubling is actually slower than readdition. This also suggests
that moving to 512-bit vectors won't be much help for a strategy aimed
at parallelism within a group operation, so to extract performance
gains from 512-bit vectors it will probably be necessary to create a
parallel-friendly multiscalar multiplication algorithm. This could
also help with reducing shuffle pressure.
The squaring implementation could probably be optimized, but without
`perf` support on Cannonlake it's difficult to make actual
measurements.
Another improvement would be to implement vectorized square root
computations, which would allow creating an iterator adaptor for point
decompression that bunched decompression operations and executed them
in parallel. This would accelerate batch verification.
[2016_gueron_krasnov]: https://ieeexplore.ieee.org/document/7563269
[2018_drucker_gueron]: https://eprint.iacr.org/2018/335
[1999_walter]: https://pdfs.semanticscholar.org/0e6a/3e8f30b63b556679f5dff2cbfdfe9523f4fa.pdf
[ed25519_paper]: https://ed25519.cr.yp.to/ed25519-20110926.pdf
[llvm_blend]: https://bugs.llvm.org/show_bug.cgi?id=38343

@ -1,333 +0,0 @@
Vectorized implementations of field and point operations, using a
modification of the 4-way parallel formulas of Hisil, Wong, Carter,
and Dawson.
These notes explain the parallel formulas and our strategy for using
them with SIMD operations. There are two backend implementations: one
using AVX2, and the other using AVX512-IFMA.
# Overview
The 2008 paper [_Twisted Edwards Curves Revisited_][hwcd08] by Hisil,
Wong, Carter, and Dawson (HWCD) introduced the “extended coordinates”
and mixed-model representations which are used by most Edwards curve
implementations.
However, they also describe 4-way parallel formulas for point addition
and doubling: a unified addition algorithm taking an effective
\\(2\mathbf M + 1\mathbf D\\), a doubling algorithm taking an
effective \\(1\mathbf M + 1\mathbf S\\), and a dedicated (i.e., for
distinct points) addition algorithm taking an effective \\(2 \mathbf M
\\). They compare these formulas with a 2-way parallel variant of the
Montgomery ladder.
Unlike their serial formulas, which are used widely, their parallel
formulas do not seem to have been implemented in software before. The
2-way parallel Montgomery ladder was used in 2015 by Tung Chou's
`sandy2x` implementation. Curiously, however, although the [`sandy2x`
paper][sandy2x] also implements Edwards arithmetic, and cites HWCD08,
it doesn't mention their parallel Edwards formulas.
A 2015 paper by Hernández and López describes an AVX2 implementation
of X25519. Neither the paper nor the code are publicly available, but
it apparently gives only a [slight speedup][avx2trac], suggesting that
it uses a 4-way parallel Montgomery ladder rather than parallel
Edwards formulas.
The reason may be that HWCD08 describe their formulas as operating on
four independent processors, which would make a software
implementation impractical: all of the operations are too low-latency
to effectively synchronize. But a closer inspection reveals that the
(more expensive) multiplication and squaring steps are uniform, while
the instruction divergence occurs in the (much cheaper) addition and
subtraction steps. This means that a SIMD implementation can perform
the expensive steps uniformly, and handle divergence in the
inexpensive steps using masking.
These notes describe modifications to the original parallel formulas
to allow a SIMD implementation, and this module contains
implementations of the modified formulas targeting either AVX2 or
AVX512-IFMA.
# Parallel formulas in HWCD'08
The doubling formula is presented in the HWCD paper as follows:
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
| | idle | idle | idle | \\( R\_1 \gets X\_1 + Y\_1 \\) |
| \\(1\mathbf S\\) | \\( R\_2 \gets X\_1\^2 \\) | \\( R\_3 \gets Y\_1\^2 \\) | \\( R\_4 \gets Z\_1\^2 \\) | \\( R\_5 \gets R\_1\^2 \\) |
| | \\( R\_6 \gets R\_2 + R\_3 \\) | \\( R\_7 \gets R\_2 - R\_3 \\) | \\( R\_4 \gets 2 R\_4 \\) | idle |
| | idle | \\( R\_1 \gets R\_4 + R\_7 \\) | idle | \\( R\_2 \gets R\_6 - R\_5 \\) |
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_6 R\_7 \\) | \\( T\_3 \gets R\_2 R\_6 \\) | \\( Z\_3 \gets R\_1 R\_7 \\) |
and the unified addition algorithm is presented as follows:
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
| | \\( R\_1 \gets Y\_1 - X\_1 \\) | \\( R\_2 \gets Y\_2 - X\_2 \\) | \\( R\_3 \gets Y\_1 + X\_1 \\) | \\( R\_4 \gets Y\_2 + X\_2 \\) |
| \\(1\mathbf M\\) | \\( R\_5 \gets R\_1 R\_2 \\) | \\( R\_6 \gets R\_3 R\_4 \\) | \\( R\_7 \gets T\_1 T\_2 \\) | \\( R\_8 \gets Z\_1 Z\_2 \\) |
| \\(1\mathbf D\\) | idle | idle | \\( R\_7 \gets k R\_7 \\) | \\( R\_8 \gets 2 R\_8 \\) |
| | \\( R\_1 \gets R\_6 - R\_5 \\) | \\( R\_2 \gets R\_8 - R\_7 \\) | \\( R\_3 \gets R\_8 + R\_7 \\) | \\( R\_4 \gets R\_6 + R\_5 \\) |
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_3 R\_4 \\) | \\( T\_3 \gets R\_1 R\_4 \\) | \\( Z\_3 \gets R\_2 R\_3 \\) |
Here \\(\mathbf M\\) and \\(\mathbf S\\) represent the cost of
multiplication and squaring of generic field elements, \\(\mathbf D\\)
represents the cost of multiplication by a curve constant (in this
case \\( k = 2d \\)).
Notice that the \\(1\mathbf M\\) and \\(1\mathbf S\\) steps are
uniform. The non-uniform steps are all inexpensive additions or
subtractions, with the exception of the multiplication by the curve
constant \\(k = 2d\\):
$$
R\_7 \gets 2 d R\_7.
$$
HWCD suggest parallelising this step by breaking \\(k = 2d\\) into four
parts as \\(k = k_0 + 2\^n k_1 + 2\^{2n} k_2 + 2\^{3n} k_3 \\) and
computing \\(k_i R_7 \\) in parallel. This is quite awkward, but if
the curve constant is a ratio \\( d = d\_1/d\_2 \\), then projective
coordinates allow us to instead compute
$$
(R\_5, R\_6, R\_7, R\_8) \gets (d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, 2d\_2 R\_8).
$$
This can be performed as a uniform multiplication by a vector of
constants, and if \\(d\_1, d\_2\\) are small, it is relatively
inexpensive. (This trick was suggested by Mike Hamburg).
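To see why this scaling is harmless (a short check, not in the original text): since \\( k = 2d = 2d\_1/d\_2 \\),
$$
(d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, 2d\_2 R\_8)
= d\_2 \cdot (R\_5, R\_6, k R\_7, 2 R\_8),
$$
so every temporary is the originally intended value scaled by the common
factor \\(d\_2\\); the final multiplications then scale
\\((X\_3 : Y\_3 : Z\_3 : T\_3)\\) uniformly by \\(d\_2\^2\\), which leaves
the projective point unchanged.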
In the Curve25519 case, we have
$$
d = \frac{d\_1}{d\_2} = \frac{-121665}{121666};
$$
Since \\(2 \cdot 121666 < 2\^{18}\\), all the constants above fit (up
to sign) in 32 bits, so this can be done in parallel as four
multiplications by small constants \\( (121666, 121666, 2\cdot 121665,
2\cdot 121666) \\), followed by a negation to compute \\( - 2\cdot 121665\\).
# Modified parallel formulas
Using the modifications sketched above, we can write SIMD-friendly
versions of the parallel formulas as follows. To avoid confusion with
the original formulas, temporary variables are named \\(S\\) instead
of \\(R\\) and are in static single-assignment form.
## Addition
To add points
\\(P_1 = (X_1 : Y_1 : Z_1 : T_1) \\)
and
\\(P_2 = (X_2 : Y_2 : Z_2 : T_2 ) \\),
we compute
$$
\begin{aligned}
(S\_0 &&,&& S\_1 &&,&& S\_2 &&,&& S\_3 )
&\gets
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Y\_2 - X\_2&&,&& Y\_2 + X\_2)
\\\\
(S\_4 &&,&& S\_5 &&,&& S\_6 &&,&& S\_7 )
&\gets
(S\_0 \cdot S\_2&&,&& S\_1 \cdot S\_3&&,&& Z\_1 \cdot Z\_2&&,&& T\_1 \cdot T\_2)
\\\\
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
&\gets
(d\_2 \cdot S\_4 &&,&& d\_2 \cdot S\_5 &&,&& 2 d\_2 \cdot S\_6 &&,&& 2 d\_1 \cdot S\_7 )
\\\\
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
&\gets
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
\\\\
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
&\gets
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
\end{aligned}
$$
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = P\_1 + P\_2 \\).
This costs \\( 2\mathbf M + 1 \mathbf D\\).
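To make the data flow concrete, here is a minimal non-vectorized sketch
over a hypothetical field type `Fe` (anything with `Add`, `Sub`, and
`Mul`); the real backends execute each four-tuple step above as a single
vector operation:
```rust
use core::ops::{Add, Mul, Sub};

#[derive(Clone, Copy)]
pub struct Point<Fe> {
    x: Fe,
    y: Fe,
    z: Fe,
    t: Fe,
}

/// One parallel addition, written lane-by-lane; `d1`/`d2` are the
/// numerator and denominator of the curve constant `d = d1/d2`.
pub fn add<Fe>(p1: &Point<Fe>, p2: &Point<Fe>, d1: Fe, d2: Fe) -> Point<Fe>
where
    Fe: Copy + Add<Output = Fe> + Sub<Output = Fe> + Mul<Output = Fe>,
{
    let (s0, s1, s2, s3) = (p1.y - p1.x, p1.y + p1.x, p2.y - p2.x, p2.y + p2.x);
    // Uniform 1M step.
    let (s4, s5, s6, s7) = (s0 * s2, s1 * s3, p1.z * p2.z, p1.t * p2.t);
    // Uniform 1D step: multiply by the constant vector (d2, d2, 2*d2, 2*d1).
    let (s8, s9, s10, s11) = (d2 * s4, d2 * s5, d2 * s6 + d2 * s6, d1 * s7 + d1 * s7);
    let (s12, s13, s14, s15) = (s9 - s8, s9 + s8, s10 - s11, s10 + s11);
    // Final uniform 1M step.
    Point { x: s12 * s14, y: s15 * s13, z: s15 * s14, t: s12 * s13 }
}
```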
## Readdition
If the point \\( P_2 = (X\_2 : Y\_2 : Z\_2 : T\_2) \\) is fixed, we
can cache the multiplications by the curve constants by computing
$$
\begin{aligned}
(S\_2' &&,&& S\_3' &&,&& Z\_2' &&,&& T\_2' )
&\gets
(d\_2 \cdot (Y\_2 - X\_2)&&,&& d\_2 \cdot (Y\_2 + X\_2)&&,&& 2d\_2 \cdot Z\_2 &&,&& 2d\_1 \cdot T\_2).
\end{aligned}
$$
This costs \\( 1\mathbf D\\); with \\( (S\_2', S\_3', Z\_2', T\_2')\\)
in hand, the addition formulas above become
$$
\begin{aligned}
(S\_0 &&,&& S\_1 &&,&& Z\_1 &&,&& T\_1 )
&\gets
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Z\_1 &&,&& T\_1)
\\\\
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
&\gets
(S\_0 \cdot S\_2' &&,&& S\_1 \cdot S\_3'&&,&& Z\_1 \cdot Z\_2' &&,&& T\_1 \cdot T\_2')
\\\\
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
&\gets
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
\\\\
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
&\gets
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
\end{aligned}
$$
which costs only \\( 2\mathbf M \\). This precomputation is
essentially the same as the precomputation HWCD suggest for their
serial formulas. Because precomputation followed by readdition costs
\\( 2\mathbf M + 1\mathbf D \\), exactly the cost of a single addition,
it's sufficient to implement only caching and readdition.
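Continuing the sketch from the addition section (same hypothetical `Fe`
and `Point` types, and the same imports), caching and readdition might
look like:
```rust
pub struct CachedPoint<Fe> {
    s2: Fe,
    s3: Fe,
    z2: Fe,
    t2: Fe,
}

/// Fold the 1D step into precomputation for a fixed point (costs 1D).
pub fn cache<Fe>(p2: &Point<Fe>, d1: Fe, d2: Fe) -> CachedPoint<Fe>
where
    Fe: Copy + Add<Output = Fe> + Sub<Output = Fe> + Mul<Output = Fe>,
{
    CachedPoint {
        s2: d2 * (p2.y - p2.x),
        s3: d2 * (p2.y + p2.x),
        z2: d2 * p2.z + d2 * p2.z, // 2*d2*Z2
        t2: d1 * p2.t + d1 * p2.t, // 2*d1*T2
    }
}

/// Readdition against a cached point (costs only 2M).
pub fn readd<Fe>(p1: &Point<Fe>, c: &CachedPoint<Fe>) -> Point<Fe>
where
    Fe: Copy + Add<Output = Fe> + Sub<Output = Fe> + Mul<Output = Fe>,
{
    let (s0, s1) = (p1.y - p1.x, p1.y + p1.x);
    let (s8, s9, s10, s11) = (s0 * c.s2, s1 * c.s3, p1.z * c.z2, p1.t * c.t2);
    let (s12, s13, s14, s15) = (s9 - s8, s9 + s8, s10 - s11, s10 + s11);
    Point { x: s12 * s14, y: s15 * s13, z: s15 * s14, t: s12 * s13 }
}
```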
## Doubling
The non-uniform portions of the (re)addition formulas have a fairly
regular structure. Unfortunately, this is not the case for the
doubling formulas, which are much less nice.
To double a point \\( P = (X\_1 : Y\_1 : Z\_1 : T\_1) \\), we compute
$$
\begin{aligned}
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& S\_0)
&\gets
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& X\_1 + Y\_1)
\\\\
(S\_1 &&,&& S\_2 &&,&& S\_3 &&,&& S\_4 )
&\gets
(X\_1\^2 &&,&& Y\_1\^2&&,&& Z\_1\^2 &&,&& S\_0\^2)
\\\\
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
&\gets
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
\\\\
(X\_3 &&,&& Y\_3 &&,&& Z\_3 &&,&& T\_3 )
&\gets
(S\_8 \cdot S\_9 &&,&& S\_5 \cdot S\_6 &&,&& S\_8 \cdot S\_6 &&,&& S\_5 \cdot S\_9)
\end{aligned}
$$
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = [2]P\_1 \\).
The intermediate step between the squaring and multiplication requires
a long chain of additions. For the IFMA-based implementation, this is not a problem; for the AVX2-based implementation, it is, but with some care and finesse, it's possible to arrange the computation without requiring an intermediate reduction.
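In the same sketch notation (hypothetical `Fe` and `Point` from the
addition example), the doubling reads:
```rust
/// One parallel doubling; note the non-uniform middle step.
pub fn double<Fe>(p: &Point<Fe>) -> Point<Fe>
where
    Fe: Copy + Add<Output = Fe> + Sub<Output = Fe> + Mul<Output = Fe>,
{
    let s0 = p.x + p.y;
    // Uniform 1S step: four squarings.
    let (s1, s2, s3, s4) = (p.x * p.x, p.y * p.y, p.z * p.z, s0 * s0);
    // The non-uniform chain of additions and subtractions.
    let (s5, s6) = (s1 + s2, s1 - s2);
    let (s8, s9) = (s6 + s3 + s3, s5 - s4); // s8 = S1 + 2*S3 - S2, s9 = S1 + S2 - S4
    // Uniform 1M step.
    Point { x: s8 * s9, y: s5 * s6, z: s8 * s6, t: s5 * s9 }
}
```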
# Implementation
These formulas aren't specific to a particular representation of field
element vectors; the optimal representation depends on the details of
the instruction set. However, it's not possible to perfectly separate
the implementation of the field element vectors from the
implementation of the point operations. Instead, the [`avx2`] and
[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and
the [`scalar_mul`] code selects one of the backend types via a type alias.
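As an illustration of that selection (the module paths here are
hypothetical, not the crate's actual layout), the alias might be wired
up with `cfg` attributes like so:
```rust
// Hypothetical cfg-gated backend selection: the scalar_mul code names
// only `ExtendedPoint` and `CachedPoint`, and the build picks a backend.
#[cfg(all(feature = "simd_backend", target_feature = "avx512ifma"))]
pub(crate) use crate::backend::vector::ifma::{CachedPoint, ExtendedPoint};

#[cfg(all(
    feature = "simd_backend",
    target_feature = "avx2",
    not(target_feature = "avx512ifma")
))]
pub(crate) use crate::backend::vector::avx2::{CachedPoint, ExtendedPoint};
```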
# Comparison to non-vectorized formulas
In theory, the parallel Edwards formulas seem to allow a \\(4\\)-way
speedup from parallelism. However, an actual vectorized
implementation has several slowdowns that cut into this speedup.
First, the parallel formulas can only use the available vector
multiplier. For AVX2, this is a \\( 32 \times 32 \rightarrow 64
\\)-bit integer multiplier, so the speedup from vectorization must
overcome the disadvantage of losing the \\( 64 \times 64 \rightarrow
128\\)-bit (serial) integer multiplier. The effect of this slowdown
is microarchitecture-dependent, since it requires accounting for the
total number of multiplications and additions and their relative
costs. IFMA allows using a \\( 52 \times 52 \rightarrow 104 \\)-bit
multiplier, but the high and low halves need to be computed
separately, and the reduction requires extra work because it's not
possible to pre-multiply by \\(19\\).
Second, the parallel doubling formulas incur both a theoretical and
practical slowdown. The parallel formulas described above work on the
\\( \mathbb P\^3 \\) “extended” coordinates. The \\( \mathbb P\^2 \\)
model introduced earlier by [Bernstein, Birkner, Joye, Lange, and
Peters][bbjlp08] allows slightly faster doublings, so HWCD suggest
mixing coordinate systems while performing scalar multiplication
(attributing the idea to [a 1998 paper][cmo98] by Cohen, Miyaji, and
Ono). The \\( T \\) coordinate is not required for doublings, so when
doublings are followed by doublings, its computation can be skipped.
More details on this approach and the different coordinate systems can
be found in the [`curve_models` module documentation][curve_models].
Unfortunately, this optimization is not compatible with the parallel
formulas, which cannot save time by skipping a single variable, so the
parallel doubling formulas do slightly more work when counting the
total number of field multiplications and squarings.
In addition, the parallel doubling formulas have a less regular
pattern of additions and subtractions than the parallel addition
formulas, so the vectorization overhead is proportionately greater.
Both the parallel addition and parallel doubling formulas also require
some shuffling to rearrange data within the vectors, which places more
pressure on the shuffle unit than is desirable.
This means that the speedup from using a vectorized implementation of
parallel Edwards formulas is likely to be greatest in applications
that do fewer doublings and more additions (like a large multiscalar
multiplication) rather than applications that do fewer additions and
more doublings (like a double-base scalar multiplication).
Third, Amdahl's law says that the speedup is limited to the portion
which can be parallelized. Normally, the field multiplications
dominate the cost of point operations, but with the IFMA backend, the
multiplications are so fast that the non-parallel additions end up as
a significant portion of the total time.
Fourth, current Intel CPUs perform thermal throttling when using wide
vector instructions. A detailed description can be found in §15.26 of
[the Intel Optimization Manual][intel], but using wide vector
instructions prevents the core from operating at higher frequencies.
The core can return to the higher-frequency state after 2
milliseconds, but this timer is reset every time high-power
instructions are used.
Any speedup from vectorization therefore has to be weighed against a
slowdown for the next few million instructions. For a mixed workload,
where point operations are interspersed with other tasks, this can
reduce overall performance. This implementation is therefore probably
not suitable for basic applications, like signatures, but is
worthwhile for complex applications, like zero-knowledge proofs, which
do sustained work.
# Future work
There are several directions for future improvement:
* Using the vectorized field arithmetic code to parallelize across
point operations rather than within a single point operation. This
is less flexible, but would give a speedup both from allowing use of
the faster mixed-model arithmetic and from reducing shuffle
pressure. One approach in this direction would be to implement
batched scalar-point operations using vectors of points (AoSoA
layout). This is less generally useful, but would give a speedup for
Bulletproofs.
* Extending the IFMA implementation to use the full width of AVX512,
either handling the extra parallelism internally to a single point
operation (by using a 2-way parallel implementation of field
arithmetic instead of a wordsliced one), or externally,
parallelizing across point operations. Internal parallelism would
be preferable but might require too much shuffle pressure. For now,
the only available CPU which runs IFMA operations executes them at
256 bits wide anyway, so this isn't yet important.
* Generalizing the implementation to NEON instructions. The current
point arithmetic code is written in terms of field element vectors,
which are in turn implemented using platform SIMD vectors. It
should be possible to write an alternate implementation of the
`FieldElement2625x4` using NEON without changing the point
arithmetic. NEON has 128-bit vectors rather than 256-bit vectors,
but this may still be worthwhile compared to a serial
implementation.
[sandy2x]: https://eprint.iacr.org/2015/943.pdf
[avx2trac]: https://trac.torproject.org/projects/tor/ticket/8897#comment:28
[hwcd08]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
[curve_models]: https://doc-internal.dalek.rs/curve25519_dalek/backend/serial/curve_models/index.html
[bbjlp08]: https://eprint.iacr.org/2008/013
[cmo98]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
[intel]: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf

View File

@ -1,65 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Pluggable implementations for different architectures.
//!
//! The backend code is split into two parts: a serial backend,
//! and a vector backend.
//!
//! The [`serial`] backend contains 32- and 64-bit implementations of
//! field arithmetic and scalar arithmetic, as well as implementations
//! of point operations using the mixed-model strategy (passing
//! between different curve models depending on the operation).
//!
//! The [`vector`] backend contains implementations of vectorized
//! field arithmetic, used to implement point operations using a novel
//! implementation strategy derived from parallel formulas of Hisil,
//! Wong, Carter, and Dawson.
//!
//! Because the two strategies give rise to different curve models,
//! it's not possible to reuse exactly the same scalar multiplication
//! code (or to write it generically), so both serial and vector
//! backends contain matching implementations of scalar multiplication
//! algorithms. These are intended to be selected by a `#[cfg]`-based
//! type alias.
//!
//! The [`vector`] backend is selected by the `simd_backend` cargo
//! feature; it uses the [`serial`] backend for non-vectorized operations.
#[cfg(not(any(
feature = "u32_backend",
feature = "u64_backend",
feature = "fiat_u32_backend",
feature = "fiat_u64_backend",
feature = "simd_backend",
)))]
compile_error!(
"no curve25519-dalek backend cargo feature enabled! \
please enable one of: u32_backend, u64_backend, fiat_u32_backend, fiat_u64_backend, simd_backend"
);
pub mod serial;
#[cfg(any(
all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
),
all(feature = "nightly", rustdoc)
))]
#[cfg_attr(
feature = "nightly",
doc(cfg(any(all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
))))
)]
pub mod vector;

View File

@ -1,551 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Internal curve representations which are not part of the public API.
//!
//! # Curve representations
//!
//! Internally, we use several different models for the curve. Here
//! is a sketch of the relationship between the models, following [a
//! post][smith-moderncrypto]
//! by Ben Smith on the `moderncrypto` mailing list. This is also briefly
//! discussed in section 2.5 of [_Montgomery curves and their
//! arithmetic_][costello-smith-2017] by Costello and Smith.
//!
//! Begin with the affine equation for the curve,
//! $$
//! -x\^2 + y\^2 = 1 + dx\^2y\^2.
//! $$
//! Next, pass to the projective closure \\(\mathbb P\^1 \times \mathbb
//! P\^1 \\) by setting \\(x=X/Z\\), \\(y=Y/T.\\) Clearing denominators
//! gives the model
//! $$
//! -X\^2T\^2 + Y\^2Z\^2 = Z\^2T\^2 + dX\^2Y\^2.
//! $$
//! In `curve25519-dalek`, this is represented as the `CompletedPoint`
//! struct.
//! To map from \\(\mathbb P\^1 \times \mathbb P\^1 \\), a product of
//! two lines, to \\(\mathbb P\^3\\), we use the [Segre
//! embedding](https://en.wikipedia.org/wiki/Segre_embedding)
//! $$
//! \sigma : ((X:Z),(Y:T)) \mapsto (XY:XT:ZY:ZT).
//! $$
//! Using coordinates \\( (W_0:W_1:W_2:W_3) \\) for \\(\mathbb P\^3\\),
//! the image \\(\sigma (\mathbb P\^1 \times \mathbb P\^1) \\) is the
//! surface defined by \\( W_0 W_3 = W_1 W_2 \\), and under \\(
//! \sigma\\), the equation above becomes
//! $$
//! -W\_1\^2 + W\_2\^2 = W\_3\^2 + dW\_0\^2,
//! $$
//! so that the curve is given by the pair of equations
//! $$
//! \begin{aligned}
//! -W\_1\^2 + W\_2\^2 &= W\_3\^2 + dW\_0\^2, \\\\ W_0 W_3 &= W_1 W_2.
//! \end{aligned}
//! $$
//! Up to variable naming, this is exactly the "extended" curve model
//! introduced in [_Twisted Edwards Curves
//! Revisited_][hisil-wong-carter-dawson-2008] by Hisil, Wong, Carter,
//! and Dawson. In `curve25519-dalek`, it is represented as the
//! `EdwardsPoint` struct. We can map from \\(\mathbb P\^3 \\) to
//! \\(\mathbb P\^2 \\) by sending \\( (W\_0:W\_1:W\_2:W\_3) \\) to \\(
//! (W\_1:W\_2:W\_3) \\). Notice that
//! $$
//! \frac {W\_1} {W\_3} = \frac {XT} {ZT} = \frac X Z = x,
//! $$
//! and
//! $$
//! \frac {W\_2} {W\_3} = \frac {YZ} {ZT} = \frac Y T = y,
//! $$
//! so this is the same as if we had started with the affine model
//! and passed to \\( \mathbb P\^2 \\) by setting \\( x = W\_1 / W\_3
//! \\), \\(y = W\_2 / W\_3 \\).
//! Up to variable naming, this is the projective representation
//! introduced in [_Twisted Edwards
//! Curves_][bernstein-birkner-joye-lange-peters-2008] by Bernstein,
//! Birkner, Joye, Lange, and Peters. In `curve25519-dalek`, it is
//! represented by the `ProjectivePoint` struct.
//!
//! # Passing between curve models
//!
//! Although the \\( \mathbb P\^3 \\) model provides faster addition
//! formulas, the \\( \mathbb P\^2 \\) model provides faster doubling
//! formulas. Hisil, Wong, Carter, and Dawson therefore suggest mixing
//! coordinate systems for scalar multiplication, attributing the idea
//! to [a 1998 paper][cohen-miyaji-ono-1998] of Cohen, Miyaji, and Ono.
//!
//! Their suggestion is to vary the formulas used by context, using a
//! \\( \mathbb P\^2 \rightarrow \mathbb P\^2 \\) doubling formula when
//! a doubling is followed
//! by another doubling, a \\( \mathbb P\^2 \rightarrow \mathbb P\^3 \\)
//! doubling formula when a doubling is followed by an addition, and
//! computing point additions using a \\( \mathbb P\^3 \times \mathbb P\^3
//! \rightarrow \mathbb P\^2 \\) formula.
//!
//! The `ref10` reference implementation of [Ed25519][ed25519], by
//! Bernstein, Duif, Lange, Schwabe, and Yang, tweaks
//! this strategy, factoring the addition formulas through the
//! completion \\( \mathbb P\^1 \times \mathbb P\^1 \\), so that the
//! output of an addition or doubling always lies in \\( \mathbb P\^1 \times
//! \mathbb P\^1\\), and the choice of which formula to use is replaced
//! by a choice of whether to convert the result to \\( \mathbb P\^2 \\)
//! or \\(\mathbb P\^3 \\). However, this tweak is not described in
//! their paper, only in their software.
//!
//! Our naming for the `CompletedPoint` (\\(\mathbb P\^1 \times \mathbb
//! P\^1 \\)), `ProjectivePoint` (\\(\mathbb P\^2 \\)), and
//! `EdwardsPoint` (\\(\mathbb P\^3 \\)) structs follows the naming in
//! Adam Langley's [Golang ed25519][agl-ed25519] implementation, which
//! `curve25519-dalek` was originally derived from.
//!
//! Finally, to accelerate readditions, we use two cached point formats
//! in "Niels coordinates", named for Niels Duif,
//! one for the affine model and one for the \\( \mathbb P\^3 \\) model:
//!
//! * `AffineNielsPoint`: \\( (y+x, y-x, 2dxy) \\)
//! * `ProjectiveNielsPoint`: \\( (Y+X, Y-X, Z, 2dXY) \\)
//!
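//! For example, the identity point \\( (x, y) = (0, 1) \\) is represented
//! as \\( (1, 1, 0) \\) in `AffineNielsPoint` form and as
//! \\( (1, 1, 1, 0) \\) in `ProjectiveNielsPoint` form; compare the
//! `Identity` implementations below.
//!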
//! [smith-moderncrypto]: https://moderncrypto.org/mail-archive/curves/2016/000807.html
//! [costello-smith-2017]: https://eprint.iacr.org/2017/212
//! [hisil-wong-carter-dawson-2008]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
//! [bernstein-birkner-joye-lange-peters-2008]: https://eprint.iacr.org/2008/013
//! [cohen-miyaji-ono-1998]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
//! [ed25519]: https://eprint.iacr.org/2011/368
//! [agl-ed25519]: https://github.com/agl/ed25519
#![allow(non_snake_case)]
use core::fmt::Debug;
use core::ops::{Add, Neg, Sub};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
use constants;
use edwards::EdwardsPoint;
use field::FieldElement;
use traits::ValidityCheck;
// ------------------------------------------------------------------------
// Internal point representations
// ------------------------------------------------------------------------
/// A `ProjectivePoint` is a point \\((X:Y:Z)\\) on the \\(\mathbb
/// P\^2\\) model of the curve.
/// A point \\((x,y)\\) in the affine model corresponds to
/// \\((x:y:1)\\).
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
pub struct ProjectivePoint {
pub X: FieldElement,
pub Y: FieldElement,
pub Z: FieldElement,
}
/// A `CompletedPoint` is a point \\(((X:Z), (Y:T))\\) on the \\(\mathbb
/// P\^1 \times \mathbb P\^1 \\) model of the curve.
/// A point (x,y) in the affine model corresponds to \\( ((x:1),(y:1))
/// \\).
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
#[allow(missing_docs)]
pub struct CompletedPoint {
pub X: FieldElement,
pub Y: FieldElement,
pub Z: FieldElement,
pub T: FieldElement,
}
/// A pre-computed point in the affine model for the curve, represented as
/// \\((y+x, y-x, 2dxy)\\) in "Niels coordinates".
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
// Safe to derive Eq because affine coordinates.
#[derive(Copy, Clone, Eq, PartialEq)]
#[allow(missing_docs)]
pub struct AffineNielsPoint {
pub y_plus_x: FieldElement,
pub y_minus_x: FieldElement,
pub xy2d: FieldElement,
}
impl Zeroize for AffineNielsPoint {
fn zeroize(&mut self) {
self.y_plus_x.zeroize();
self.y_minus_x.zeroize();
self.xy2d.zeroize();
}
}
/// A pre-computed point on the \\( \mathbb P\^3 \\) model for the
/// curve, represented as \\((Y+X, Y-X, Z, 2dXY)\\) in "Niels coordinates".
///
/// More details on the relationships between the different curve models
/// can be found in the module-level documentation.
#[derive(Copy, Clone)]
pub struct ProjectiveNielsPoint {
pub Y_plus_X: FieldElement,
pub Y_minus_X: FieldElement,
pub Z: FieldElement,
pub T2d: FieldElement,
}
impl Zeroize for ProjectiveNielsPoint {
fn zeroize(&mut self) {
self.Y_plus_X.zeroize();
self.Y_minus_X.zeroize();
self.Z.zeroize();
self.T2d.zeroize();
}
}
// ------------------------------------------------------------------------
// Constructors
// ------------------------------------------------------------------------
use traits::Identity;
impl Identity for ProjectivePoint {
fn identity() -> ProjectivePoint {
ProjectivePoint {
X: FieldElement::zero(),
Y: FieldElement::one(),
Z: FieldElement::one(),
}
}
}
impl Identity for ProjectiveNielsPoint {
fn identity() -> ProjectiveNielsPoint {
ProjectiveNielsPoint{
Y_plus_X: FieldElement::one(),
Y_minus_X: FieldElement::one(),
Z: FieldElement::one(),
T2d: FieldElement::zero(),
}
}
}
impl Default for ProjectiveNielsPoint {
fn default() -> ProjectiveNielsPoint {
ProjectiveNielsPoint::identity()
}
}
impl Identity for AffineNielsPoint {
fn identity() -> AffineNielsPoint {
AffineNielsPoint{
y_plus_x: FieldElement::one(),
y_minus_x: FieldElement::one(),
xy2d: FieldElement::zero(),
}
}
}
impl Default for AffineNielsPoint {
fn default() -> AffineNielsPoint {
AffineNielsPoint::identity()
}
}
// ------------------------------------------------------------------------
// Validity checks (for debugging, not CT)
// ------------------------------------------------------------------------
impl ValidityCheck for ProjectivePoint {
fn is_valid(&self) -> bool {
// Curve equation is -x^2 + y^2 = 1 + d*x^2*y^2,
// homogenized as (-X^2 + Y^2)*Z^2 = Z^4 + d*X^2*Y^2
let XX = self.X.square();
let YY = self.Y.square();
let ZZ = self.Z.square();
let ZZZZ = ZZ.square();
let lhs = &(&YY - &XX) * &ZZ;
let rhs = &ZZZZ + &(&constants::EDWARDS_D * &(&XX * &YY));
lhs == rhs
}
}
// ------------------------------------------------------------------------
// Constant-time assignment
// ------------------------------------------------------------------------
impl ConditionallySelectable for ProjectiveNielsPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
ProjectiveNielsPoint {
Y_plus_X: FieldElement::conditional_select(&a.Y_plus_X, &b.Y_plus_X, choice),
Y_minus_X: FieldElement::conditional_select(&a.Y_minus_X, &b.Y_minus_X, choice),
Z: FieldElement::conditional_select(&a.Z, &b.Z, choice),
T2d: FieldElement::conditional_select(&a.T2d, &b.T2d, choice),
}
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.Y_plus_X.conditional_assign(&other.Y_plus_X, choice);
self.Y_minus_X.conditional_assign(&other.Y_minus_X, choice);
self.Z.conditional_assign(&other.Z, choice);
self.T2d.conditional_assign(&other.T2d, choice);
}
}
impl ConditionallySelectable for AffineNielsPoint {
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
AffineNielsPoint {
y_plus_x: FieldElement::conditional_select(&a.y_plus_x, &b.y_plus_x, choice),
y_minus_x: FieldElement::conditional_select(&a.y_minus_x, &b.y_minus_x, choice),
xy2d: FieldElement::conditional_select(&a.xy2d, &b.xy2d, choice),
}
}
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
self.y_plus_x.conditional_assign(&other.y_plus_x, choice);
self.y_minus_x.conditional_assign(&other.y_minus_x, choice);
self.xy2d.conditional_assign(&other.xy2d, choice);
}
}
// ------------------------------------------------------------------------
// Point conversions
// ------------------------------------------------------------------------
impl ProjectivePoint {
/// Convert this point from the \\( \mathbb P\^2 \\) model to the
/// \\( \mathbb P\^3 \\) model.
///
/// This costs \\(3 \mathrm M + 1 \mathrm S\\).
pub fn to_extended(&self) -> EdwardsPoint {
EdwardsPoint {
X: &self.X * &self.Z,
Y: &self.Y * &self.Z,
Z: self.Z.square(),
T: &self.X * &self.Y,
}
}
}
impl CompletedPoint {
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
/// \\) model to the \\( \mathbb P\^2 \\) model.
///
/// This costs \\(3 \mathrm M \\).
pub fn to_projective(&self) -> ProjectivePoint {
ProjectivePoint {
X: &self.X * &self.T,
Y: &self.Y * &self.Z,
Z: &self.Z * &self.T,
}
}
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
/// \\) model to the \\( \mathbb P\^3 \\) model.
///
/// This costs \\(4 \mathrm M \\).
pub fn to_extended(&self) -> EdwardsPoint {
EdwardsPoint {
X: &self.X * &self.T,
Y: &self.Y * &self.Z,
Z: &self.Z * &self.T,
T: &self.X * &self.Y,
}
}
}
// ------------------------------------------------------------------------
// Doubling
// ------------------------------------------------------------------------
impl ProjectivePoint {
/// Double this point: return self + self
pub fn double(&self) -> CompletedPoint { // Double()
let XX = self.X.square();
let YY = self.Y.square();
let ZZ2 = self.Z.square2();
let X_plus_Y = &self.X + &self.Y;
let X_plus_Y_sq = X_plus_Y.square();
let YY_plus_XX = &YY + &XX;
let YY_minus_XX = &YY - &XX;
CompletedPoint{
X: &X_plus_Y_sq - &YY_plus_XX,
Y: YY_plus_XX,
Z: YY_minus_XX,
T: &ZZ2 - &YY_minus_XX
}
}
}
// ------------------------------------------------------------------------
// Addition and Subtraction
// ------------------------------------------------------------------------
// XXX(hdevalence) These were doc(hidden) so they don't appear in the
// public API docs.
// However, that prevents them being used with --document-private-items,
// so comment out the doc(hidden) for now until this is resolved
//
// upstream rust issue: https://github.com/rust-lang/rust/issues/46380
//#[doc(hidden)]
impl<'a, 'b> Add<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn add(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PP = &Y_plus_X * &other.Y_plus_X;
let MM = &Y_minus_X * &other.Y_minus_X;
let TT2d = &self.T * &other.T2d;
let ZZ = &self.Z * &other.Z;
let ZZ2 = &ZZ + &ZZ;
CompletedPoint{
X: &PP - &MM,
Y: &PP + &MM,
Z: &ZZ2 + &TT2d,
T: &ZZ2 - &TT2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Sub<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn sub(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PM = &Y_plus_X * &other.Y_minus_X;
let MP = &Y_minus_X * &other.Y_plus_X;
let TT2d = &self.T * &other.T2d;
let ZZ = &self.Z * &other.Z;
let ZZ2 = &ZZ + &ZZ;
CompletedPoint{
X: &PM - &MP,
Y: &PM + &MP,
Z: &ZZ2 - &TT2d,
T: &ZZ2 + &TT2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Add<&'b AffineNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn add(self, other: &'b AffineNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PP = &Y_plus_X * &other.y_plus_x;
let MM = &Y_minus_X * &other.y_minus_x;
let Txy2d = &self.T * &other.xy2d;
let Z2 = &self.Z + &self.Z;
CompletedPoint{
X: &PP - &MM,
Y: &PP + &MM,
Z: &Z2 + &Txy2d,
T: &Z2 - &Txy2d
}
}
}
//#[doc(hidden)]
impl<'a, 'b> Sub<&'b AffineNielsPoint> for &'a EdwardsPoint {
type Output = CompletedPoint;
fn sub(self, other: &'b AffineNielsPoint) -> CompletedPoint {
let Y_plus_X = &self.Y + &self.X;
let Y_minus_X = &self.Y - &self.X;
let PM = &Y_plus_X * &other.y_minus_x;
let MP = &Y_minus_X * &other.y_plus_x;
let Txy2d = &self.T * &other.xy2d;
let Z2 = &self.Z + &self.Z;
CompletedPoint{
X: &PM - &MP,
Y: &PM + &MP,
Z: &Z2 - &Txy2d,
T: &Z2 + &Txy2d
}
}
}
// ------------------------------------------------------------------------
// Negation
// ------------------------------------------------------------------------
impl<'a> Neg for &'a ProjectiveNielsPoint {
type Output = ProjectiveNielsPoint;
fn neg(self) -> ProjectiveNielsPoint {
ProjectiveNielsPoint{
Y_plus_X: self.Y_minus_X,
Y_minus_X: self.Y_plus_X,
Z: self.Z,
T2d: -(&self.T2d),
}
}
}
impl<'a> Neg for &'a AffineNielsPoint {
type Output = AffineNielsPoint;
fn neg(self) -> AffineNielsPoint {
AffineNielsPoint{
y_plus_x: self.y_minus_x,
y_minus_x: self.y_plus_x,
xy2d: -(&self.xy2d)
}
}
}
// ------------------------------------------------------------------------
// Debug traits
// ------------------------------------------------------------------------
impl Debug for ProjectivePoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "ProjectivePoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?}\n}}",
&self.X, &self.Y, &self.Z)
}
}
impl Debug for CompletedPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "CompletedPoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?},\n\tT: {:?}\n}}",
&self.X, &self.Y, &self.Z, &self.T)
}
}
impl Debug for AffineNielsPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "AffineNielsPoint{{\n\ty_plus_x: {:?},\n\ty_minus_x: {:?},\n\txy2d: {:?}\n}}",
&self.y_plus_x, &self.y_minus_x, &self.xy2d)
}
}
impl Debug for ProjectiveNielsPoint {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "ProjectiveNielsPoint{{\n\tY_plus_X: {:?},\n\tY_minus_X: {:?},\n\tZ: {:?},\n\tT2d: {:?}\n}}",
&self.Y_plus_X, &self.Y_minus_X, &self.Z, &self.T2d)
}
}

View File

@ -1,260 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(32\\)-bit
//! limbs with \\(64\\)-bit products.
//!
//! This code was originally derived from Adam Langley's Golang ed25519
//! implementation, and was then rewritten to use unsigned limbs instead
//! of signed limbs.
//!
//! This uses the formally-verified field arithmetic generated by the
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
use core::fmt::Debug;
use core::ops::Neg;
use core::ops::{Add, AddAssign};
use core::ops::{Mul, MulAssign};
use core::ops::{Sub, SubAssign};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
use fiat_crypto::curve25519_32::*;
/// A `FieldElement2625` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// In the 32-bit implementation, a `FieldElement` is represented in
/// radix \\(2\^{25.5}\\) as ten `u32`s. This means that a field
/// element \\(x\\) is represented as
/// $$
/// x = \sum\_{i=0}\^9 x\_i 2\^{\lceil i \frac {51} 2 \rceil}
/// = x\_0 + x\_1 2\^{26} + x\_2 2\^{51} + x\_3 2\^{77} + \cdots + x\_9 2\^{230};
/// $$
/// the coefficients are alternately bounded by \\(2\^{25}\\) and
/// \\(2\^{26}\\). The limbs are allowed to grow between reductions up
/// to \\(2\^{25+b}\\) or \\(2\^{26+b}\\), where \\(b = 1.75\\).
///
/// # Note
///
/// The `curve25519_dalek::field` module provides a type alias
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
/// or `FieldElement2625`.
///
/// The backend-specific type `FieldElement2625` should not be used
/// outside of the `curve25519_dalek::field` module.
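///
/// # Example
///
/// A quick illustration (not part of the API) of the limb bit offsets
/// \\( \lceil 51 i / 2 \rceil \\) used above:
///
/// ```
/// let offsets: Vec<u32> = (0..10).map(|i| (51 * i + 1) / 2).collect();
/// assert_eq!(offsets, vec![0, 26, 51, 77, 102, 128, 153, 179, 204, 230]);
/// ```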
#[derive(Copy, Clone)]
pub struct FieldElement2625(pub(crate) [u32; 10]);
impl Debug for FieldElement2625 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "FieldElement2625({:?})", &self.0[..])
}
}
impl Zeroize for FieldElement2625 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl<'b> AddAssign<&'b FieldElement2625> for FieldElement2625 {
fn add_assign(&mut self, _rhs: &'b FieldElement2625) {
let input = self.0;
fiat_25519_add(&mut self.0, &input, &_rhs.0);
let input = self.0;
fiat_25519_carry(&mut self.0, &input);
}
}
impl<'a, 'b> Add<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn add(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
let mut output = *self;
fiat_25519_add(&mut output.0, &self.0, &_rhs.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl<'b> SubAssign<&'b FieldElement2625> for FieldElement2625 {
fn sub_assign(&mut self, _rhs: &'b FieldElement2625) {
let input = self.0;
fiat_25519_sub(&mut self.0, &input, &_rhs.0);
let input = self.0;
fiat_25519_carry(&mut self.0, &input);
}
}
impl<'a, 'b> Sub<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn sub(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
let mut output = *self;
fiat_25519_sub(&mut output.0, &self.0, &_rhs.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl<'b> MulAssign<&'b FieldElement2625> for FieldElement2625 {
fn mul_assign(&mut self, _rhs: &'b FieldElement2625) {
let input = self.0;
fiat_25519_carry_mul(&mut self.0, &input, &_rhs.0);
}
}
impl<'a, 'b> Mul<&'b FieldElement2625> for &'a FieldElement2625 {
type Output = FieldElement2625;
fn mul(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
let mut output = *self;
fiat_25519_carry_mul(&mut output.0, &self.0, &_rhs.0);
output
}
}
impl<'a> Neg for &'a FieldElement2625 {
type Output = FieldElement2625;
fn neg(self) -> FieldElement2625 {
let mut output = *self;
fiat_25519_opp(&mut output.0, &self.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl ConditionallySelectable for FieldElement2625 {
fn conditional_select(
a: &FieldElement2625,
b: &FieldElement2625,
choice: Choice,
) -> FieldElement2625 {
let mut output = [0u32; 10];
fiat_25519_selectznz(&mut output, choice.unwrap_u8() as fiat_25519_u1, &a.0, &b.0);
FieldElement2625(output)
}
fn conditional_assign(&mut self, other: &FieldElement2625, choice: Choice) {
let mut output = [0u32; 10];
let choicebit = choice.unwrap_u8() as fiat_25519_u1;
fiat_25519_cmovznz_u32(&mut output[0], choicebit, self.0[0], other.0[0]);
fiat_25519_cmovznz_u32(&mut output[1], choicebit, self.0[1], other.0[1]);
fiat_25519_cmovznz_u32(&mut output[2], choicebit, self.0[2], other.0[2]);
fiat_25519_cmovznz_u32(&mut output[3], choicebit, self.0[3], other.0[3]);
fiat_25519_cmovznz_u32(&mut output[4], choicebit, self.0[4], other.0[4]);
fiat_25519_cmovznz_u32(&mut output[5], choicebit, self.0[5], other.0[5]);
fiat_25519_cmovznz_u32(&mut output[6], choicebit, self.0[6], other.0[6]);
fiat_25519_cmovznz_u32(&mut output[7], choicebit, self.0[7], other.0[7]);
fiat_25519_cmovznz_u32(&mut output[8], choicebit, self.0[8], other.0[8]);
fiat_25519_cmovznz_u32(&mut output[9], choicebit, self.0[9], other.0[9]);
*self = FieldElement2625(output);
}
fn conditional_swap(a: &mut FieldElement2625, b: &mut FieldElement2625, choice: Choice) {
u32::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
u32::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
u32::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
u32::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
u32::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
u32::conditional_swap(&mut a.0[5], &mut b.0[5], choice);
u32::conditional_swap(&mut a.0[6], &mut b.0[6], choice);
u32::conditional_swap(&mut a.0[7], &mut b.0[7], choice);
u32::conditional_swap(&mut a.0[8], &mut b.0[8], choice);
u32::conditional_swap(&mut a.0[9], &mut b.0[9], choice);
}
}
impl FieldElement2625 {
/// Invert the sign of this field element
pub fn negate(&mut self) {
let neg = self.neg();
self.0 = neg.0;
}
/// Construct zero.
pub fn zero() -> FieldElement2625 {
FieldElement2625([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Construct one.
pub fn one() -> FieldElement2625 {
FieldElement2625([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}
/// Construct -1.
pub fn minus_one() -> FieldElement2625 {
FieldElement2625([
0x3ffffec, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff,
0x3ffffff, 0x1ffffff,
])
}
/// Given `k > 0`, return `self^(2^k)`.
pub fn pow2k(&self, k: u32) -> FieldElement2625 {
debug_assert!(k > 0);
let mut z = self.square();
for _ in 1..k {
z = z.square();
}
z
}
/// Load a `FieldElement2625` from the low 255 bits of a 256-bit
/// input.
///
/// # Warning
///
/// This function does not check that the input used the canonical
/// representative. It masks the high bit, but it will happily
/// decode 2^255 - 18 to 1. Applications that require a canonical
/// encoding of every field element should decode, re-encode to
/// the canonical encoding, and check that the input was
/// canonical.
pub fn from_bytes(data: &[u8; 32]) -> FieldElement2625 {
let mut temp = [0u8; 32];
temp.copy_from_slice(data);
temp[31] &= 127u8;
let mut output = [0u32; 10];
fiat_25519_from_bytes(&mut output, &temp);
FieldElement2625(output)
}
/// Serialize this `FieldElement2625` to a 32-byte array. The
/// encoding is canonical.
pub fn to_bytes(&self) -> [u8; 32] {
let mut bytes = [0u8; 32];
fiat_25519_to_bytes(&mut bytes, &self.0);
bytes
}
/// Compute `self^2`.
pub fn square(&self) -> FieldElement2625 {
let mut output = *self;
fiat_25519_carry_square(&mut output.0, &self.0);
output
}
/// Compute `2*self^2`.
pub fn square2(&self) -> FieldElement2625 {
let mut output = *self;
let mut temp = *self;
// Void vs return type, measure cost of copying self
fiat_25519_carry_square(&mut temp.0, &self.0);
fiat_25519_add(&mut output.0, &temp.0, &temp.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}

View File

@ -1,26 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! The `u32` backend uses `u32`s and a `(u32, u32) -> u64` multiplier.
//!
//! This code is intended to be portable, but it requires that
//! multiplication of two \\(32\\)-bit values to a \\(64\\)-bit result
//! is constant-time on the target platform.
//!
//! This uses the formally-verified field arithmetic generated by the
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
#[path = "../u32/scalar.rs"]
pub mod scalar;
pub mod field;
#[path = "../u32/constants.rs"]
pub mod constants;

View File

@ -1,249 +0,0 @@
// -*- mode: rust; coding: utf-8; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(64\\)-bit
//! limbs with \\(128\\)-bit products.
//!
//! This uses the formally-verified field arithmetic generated by the
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
use core::fmt::Debug;
use core::ops::Neg;
use core::ops::{Add, AddAssign};
use core::ops::{Mul, MulAssign};
use core::ops::{Sub, SubAssign};
use subtle::Choice;
use subtle::ConditionallySelectable;
use zeroize::Zeroize;
use fiat_crypto::curve25519_64::*;
/// A `FieldElement51` represents an element of the field
/// \\( \mathbb Z / (2\^{255} - 19)\\).
///
/// In the 64-bit implementation, a `FieldElement` is represented in
/// radix \\(2\^{51}\\) as five `u64`s; the coefficients are allowed to
/// grow up to \\(2\^{54}\\) between reductions modulo \\(p\\).
///
/// # Note
///
/// The `curve25519_dalek::field` module provides a type alias
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
/// or `FieldElement2625`.
///
/// The backend-specific type `FieldElement51` should not be used
/// outside of the `curve25519_dalek::field` module.
#[derive(Copy, Clone)]
pub struct FieldElement51(pub(crate) [u64; 5]);
impl Debug for FieldElement51 {
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
write!(f, "FieldElement51({:?})", &self.0[..])
}
}
impl Zeroize for FieldElement51 {
fn zeroize(&mut self) {
self.0.zeroize();
}
}
impl<'b> AddAssign<&'b FieldElement51> for FieldElement51 {
fn add_assign(&mut self, _rhs: &'b FieldElement51) {
let input = self.0;
fiat_25519_add(&mut self.0, &input, &_rhs.0);
let input = self.0;
fiat_25519_carry(&mut self.0, &input);
}
}
impl<'a, 'b> Add<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn add(self, _rhs: &'b FieldElement51) -> FieldElement51 {
let mut output = *self;
fiat_25519_add(&mut output.0, &self.0, &_rhs.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl<'b> SubAssign<&'b FieldElement51> for FieldElement51 {
fn sub_assign(&mut self, _rhs: &'b FieldElement51) {
let input = self.0;
fiat_25519_sub(&mut self.0, &input, &_rhs.0);
let input = self.0;
fiat_25519_carry(&mut self.0, &input);
}
}
impl<'a, 'b> Sub<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn sub(self, _rhs: &'b FieldElement51) -> FieldElement51 {
let mut output = *self;
fiat_25519_sub(&mut output.0, &self.0, &_rhs.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl<'b> MulAssign<&'b FieldElement51> for FieldElement51 {
fn mul_assign(&mut self, _rhs: &'b FieldElement51) {
let input = self.0;
fiat_25519_carry_mul(&mut self.0, &input, &_rhs.0);
}
}
impl<'a, 'b> Mul<&'b FieldElement51> for &'a FieldElement51 {
type Output = FieldElement51;
fn mul(self, _rhs: &'b FieldElement51) -> FieldElement51 {
let mut output = *self;
fiat_25519_carry_mul(&mut output.0, &self.0, &_rhs.0);
output
}
}
impl<'a> Neg for &'a FieldElement51 {
type Output = FieldElement51;
fn neg(self) -> FieldElement51 {
let mut output = *self;
fiat_25519_opp(&mut output.0, &self.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}
impl ConditionallySelectable for FieldElement51 {
fn conditional_select(
a: &FieldElement51,
b: &FieldElement51,
choice: Choice,
) -> FieldElement51 {
let mut output = [0u64; 5];
fiat_25519_selectznz(&mut output, choice.unwrap_u8() as fiat_25519_u1, &a.0, &b.0);
FieldElement51(output)
}
fn conditional_swap(a: &mut FieldElement51, b: &mut FieldElement51, choice: Choice) {
u64::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
u64::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
u64::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
u64::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
u64::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
}
fn conditional_assign(&mut self, _rhs: &FieldElement51, choice: Choice) {
let mut output = [0u64; 5];
let choicebit = choice.unwrap_u8() as fiat_25519_u1;
fiat_25519_cmovznz_u64(&mut output[0], choicebit, self.0[0], _rhs.0[0]);
fiat_25519_cmovznz_u64(&mut output[1], choicebit, self.0[1], _rhs.0[1]);
fiat_25519_cmovznz_u64(&mut output[2], choicebit, self.0[2], _rhs.0[2]);
fiat_25519_cmovznz_u64(&mut output[3], choicebit, self.0[3], _rhs.0[3]);
fiat_25519_cmovznz_u64(&mut output[4], choicebit, self.0[4], _rhs.0[4]);
*self = FieldElement51(output);
}
}
impl FieldElement51 {
/// Construct zero.
pub fn zero() -> FieldElement51 {
FieldElement51([0, 0, 0, 0, 0])
}
/// Construct one.
pub fn one() -> FieldElement51 {
FieldElement51([1, 0, 0, 0, 0])
}
/// Construct -1.
pub fn minus_one() -> FieldElement51 {
FieldElement51([
2251799813685228,
2251799813685247,
2251799813685247,
2251799813685247,
2251799813685247,
])
}
/// Given 64-bit input limbs, reduce to enforce the bound 2^(51 + epsilon).
#[inline(always)]
#[allow(dead_code)] // Need this to not complain about reduce not being used
fn reduce(mut limbs: [u64; 5]) -> FieldElement51 {
let input = limbs;
fiat_25519_carry(&mut limbs, &input);
FieldElement51(limbs)
}
/// Load a `FieldElement51` from the low 255 bits of a 256-bit
/// input.
///
/// # Warning
///
/// This function does not check that the input used the canonical
/// representative. It masks the high bit, but it will happily
/// decode 2^255 - 18 to 1. Applications that require a canonical
/// encoding of every field element should decode, re-encode to
/// the canonical encoding, and check that the input was
/// canonical.
///
pub fn from_bytes(bytes: &[u8; 32]) -> FieldElement51 {
let mut temp = [0u8; 32];
temp.copy_from_slice(bytes);
temp[31] &= 127u8;
let mut output = [0u64; 5];
fiat_25519_from_bytes(&mut output, &temp);
FieldElement51(output)
}
/// Serialize this `FieldElement51` to a 32-byte array. The
/// encoding is canonical.
pub fn to_bytes(&self) -> [u8; 32] {
let mut bytes = [0u8; 32];
fiat_25519_to_bytes(&mut bytes, &self.0);
bytes
}
/// Given `k > 0`, return `self^(2^k)`.
pub fn pow2k(&self, mut k: u32) -> FieldElement51 {
let mut output = *self;
loop {
let input = output.0;
fiat_25519_carry_square(&mut output.0, &input);
k -= 1;
if k == 0 {
return output;
}
}
}
/// Returns the square of this field element.
pub fn square(&self) -> FieldElement51 {
let mut output = *self;
fiat_25519_carry_square(&mut output.0, &self.0);
output
}
/// Returns 2 times the square of this field element.
pub fn square2(&self) -> FieldElement51 {
let mut output = *self;
let mut temp = *self;
// Void vs return type, measure cost of copying self
fiat_25519_carry_square(&mut temp.0, &self.0);
fiat_25519_add(&mut output.0, &temp.0, &temp.0);
let input = output.0;
fiat_25519_carry(&mut output.0, &input);
output
}
}

View File

@ -1,28 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! The `u64` backend uses `u64`s and a `(u64, u64) -> u128` multiplier.
//!
//! On x86_64, the idiom `(x as u128) * (y as u128)` lowers to `MUL`
//! instructions taking 64-bit inputs and producing 128-bit outputs. On
//! other platforms, this implementation is not recommended.
//!
//! On Haswell and newer, the BMI2 extension provides `MULX`, and on
//! Broadwell and newer, the ADX extension provides `ADCX` and `ADOX`
//! (allowing the CPU to compute two carry chains in parallel). These
//! will be used if available.
#[path = "../u64/scalar.rs"]
pub mod scalar;
pub mod field;
#[path = "../u64/constants.rs"]
pub mod constants;

View File

@ -1,55 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Serial implementations of field, scalar, point arithmetic.
//!
//! When the vector backend is disabled, the crate uses the
//! mixed-model strategy for implementing point operations and scalar
//! multiplication; see the [`curve_models`](self::curve_models) and
//! [`scalar_mul`](self::scalar_mul) documentation for more
//! information.
//!
//! When the vector backend is enabled, the field and scalar
//! implementations are still used for non-vectorized operations.
//!
//! Note: at this time the `u32` and `u64` backends cannot be built
//! together.
#[cfg(not(any(
feature = "u32_backend",
feature = "u64_backend",
feature = "fiat_u32_backend",
feature = "fiat_u64_backend"
)))]
compile_error!(
"no curve25519-dalek backend cargo feature enabled! \
please enable one of: u32_backend, u64_backend, fiat_u32_backend, fiat_u64_backend"
);
#[cfg(feature = "u32_backend")]
pub mod u32;
#[cfg(feature = "u64_backend")]
pub mod u64;
#[cfg(feature = "fiat_u32_backend")]
pub mod fiat_u32;
#[cfg(feature = "fiat_u64_backend")]
pub mod fiat_u64;
pub mod curve_models;
#[cfg(not(all(
feature = "simd_backend",
any(target_feature = "avx2", target_feature = "avx512ifma")
)))]
pub mod scalar_mul;

View File

@ -1,31 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Implementations of various scalar multiplication algorithms.
//!
//! Note that all of these implementations use serial code for field
//! arithmetic with the multi-model strategy described in the
//! `curve_models` module. The vectorized AVX2 backend has its own
//! scalar multiplication implementations, since it only uses one
//! curve model.
pub mod variable_base;
pub mod vartime_double_base;
#[cfg(feature = "alloc")]
pub mod straus;
#[cfg(feature = "alloc")]
pub mod precomputed_straus;
#[cfg(feature = "alloc")]
pub mod pippenger;

View File

@ -1,202 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Oleg Andreev
// See LICENSE for licensing information.
//
// Authors:
// - Oleg Andreev <oleganza@gmail.com>
//! Implementation of a variant of Pippenger's algorithm.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::VartimeMultiscalarMul;
#[allow(unused_imports)]
use prelude::*;
/// Implements a version of Pippenger's algorithm.
///
/// The algorithm works as follows:
///
/// Let `n` be a number of point-scalar pairs.
/// Let `w` be the window width in bits (6..8, chosen based on `n`; see the cost estimate below).
///
/// 1. Prepare `2^(w-1)` buckets with indices `[1..2^(w-1)]` initialized with identity points.
/// Bucket 0 is not needed as it would contain points multiplied by 0.
/// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`.
/// Note: only the last digit may equal `2^w/2`.
/// 3. Starting with the last window, for each point `i=[0..n)` add it to a bucket indexed by
/// the point's scalar's value in the window.
/// 4. Once all points in a window are sorted into buckets, add the buckets, each multiplied
/// by its index. An efficient way to do this is to start with the last bucket and compute two sums:
/// an intermediate sum from the last to the first, and the full sum made of all intermediate sums.
/// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings.
/// 6. Add to the return value.
/// 7. Repeat the loop.
///
/// Approximate cost w/o wNAF optimizations (A = addition, D = doubling):
///
/// ```ascii
/// cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w
/// | | | | |
/// | | | | looping over 256/w windows
/// | | | adding to the result
/// sorting points | shifting the sum by w bits (to the next window, starting from last window)
/// one by one |
/// into buckets adding/subtracting all buckets
/// multiplied by their indexes
/// using a sum of intermediate sums
/// ```
///
/// For large `n`, the dominant factor is the `(n*256/w)` additions.
/// However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate.
/// Therefore, the optimal choice of `w` grows slowly as `n` grows.
///
/// This algorithm is adapted from section 4 of https://eprint.iacr.org/2012/549.pdf.
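///
/// For a sense of scale (illustrative arithmetic, not a benchmark): with
/// `n = 1024` and `w = 8`, the estimate above gives about
/// `(1024 + 256 + 1)*32 = 40,992` additions plus `8*32 = 256` doublings,
/// versus roughly `(256 + 128)*1024 = 393,216` point operations for `n`
/// independent double-and-add multiplications.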
pub struct Pippenger;
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
type Point = EdwardsPoint;
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
use traits::Identity;
let mut scalars = scalars.into_iter();
let size = scalars.by_ref().size_hint().0;
// Digit width in bits. As digit width grows,
// the number of point additions goes down, but the number of
// buckets and bucket additions grows exponentially.
let w = if size < 500 {
6
} else if size < 800 {
7
} else {
8
};
let max_digit: usize = 1 << w;
let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket
// Collect optimized scalars and points in buffers for repeated access
// (scanning the whole set per digit position).
let scalars = scalars
.map(|s| s.borrow().to_radix_2w(w));
let points = points
.into_iter()
.map(|p| p.map(|P| P.to_projective_niels()));
let scalars_points = scalars
.zip(points)
.map(|(s, maybe_p)| maybe_p.map(|p| (s, p)))
.collect::<Option<Vec<_>>>()?;
// Prepare 2^w/2 buckets.
// buckets[i] corresponds to a multiplication factor (i+1).
let mut buckets: Vec<_> = (0..buckets_count)
.map(|_| EdwardsPoint::identity())
.collect();
let mut columns = (0..digits_count).rev().map(|digit_index| {
// Clear the buckets when processing another digit.
for i in 0..buckets_count {
buckets[i] = EdwardsPoint::identity();
}
// Iterate over pairs of (point, scalar)
// and add/sub the point to the corresponding bucket.
// Note: if we add support for precomputed lookup tables,
// we'll be adding/subtracting point premultiplied by `digits[i]` to buckets[0].
for (digits, pt) in scalars_points.iter() {
// Widen digit so that we don't run into edge cases when w=8.
let digit = digits[digit_index] as i16;
if digit > 0 {
let b = (digit - 1) as usize;
buckets[b] = (&buckets[b] + pt).to_extended();
} else if digit < 0 {
let b = (-digit - 1) as usize;
buckets[b] = (&buckets[b] - pt).to_extended();
}
}
// Add the buckets applying the multiplication factor to each bucket.
// The most efficient way to do that is to have a single sum with two running sums:
// an intermediate sum from last bucket to the first, and a sum of intermediate sums.
//
// For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
// C
// C B
// C B A Sum = C + (C+B) + (C+B+A)
let mut buckets_intermediate_sum = buckets[buckets_count - 1];
let mut buckets_sum = buckets[buckets_count - 1];
for i in (0..(buckets_count - 1)).rev() {
buckets_intermediate_sum += buckets[i];
buckets_sum += buckets_intermediate_sum;
}
buckets_sum
});
// Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
// `unwrap()` always succeeds because we know we have more than zero digits.
let hi_column = columns.next().unwrap();
Some(
columns
.fold(hi_column, |total, p| total.mul_by_pow_2(w as u32) + p),
)
}
}
#[cfg(test)]
mod test {
use super::*;
use constants;
use scalar::Scalar;
#[test]
fn test_vartime_pippenger() {
// Reuse points across different tests
let mut n = 512;
let x = Scalar::from(2128506u64).invert();
let y = Scalar::from(4443282u64).invert();
let points: Vec<_> = (0..n)
.map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
.collect();
let scalars: Vec<_> = (0..n)
.map(|i| x + (Scalar::from(i as u64) * y)) // fast way to make ~random but deterministic scalars
.collect();
let premultiplied: Vec<EdwardsPoint> = scalars
.iter()
.zip(points.iter())
.map(|(sc, pt)| sc * pt)
.collect();
while n > 0 {
let scalars = &scalars[0..n].to_vec();
let points = &points[0..n].to_vec();
let control: EdwardsPoint = premultiplied[0..n].iter().sum();
let subject = Pippenger::vartime_multiscalar_mul(scalars.clone(), points.clone());
assert_eq!(subject.compress(), control.compress());
n = n / 2;
}
}
}

View File

@ -1,110 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2019 Henry de Valence.
// See LICENSE for licensing information.
//
// Authors:
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Precomputation for Straus's method.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use backend::serial::curve_models::{
AffineNielsPoint, CompletedPoint, ProjectiveNielsPoint, ProjectivePoint,
};
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::Identity;
use traits::VartimePrecomputedMultiscalarMul;
use window::{NafLookupTable5, NafLookupTable8};
#[allow(unused_imports)]
use prelude::*;
pub struct VartimePrecomputedStraus {
static_lookup_tables: Vec<NafLookupTable8<AffineNielsPoint>>,
}
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
type Point = EdwardsPoint;
fn new<I>(static_points: I) -> Self
where
I: IntoIterator,
I::Item: Borrow<Self::Point>,
{
Self {
static_lookup_tables: static_points
.into_iter()
.map(|P| NafLookupTable8::<AffineNielsPoint>::from(P.borrow()))
.collect(),
}
}
fn optional_mixed_multiscalar_mul<I, J, K>(
&self,
static_scalars: I,
dynamic_scalars: J,
dynamic_points: K,
) -> Option<Self::Point>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<Scalar>,
K: IntoIterator<Item = Option<Self::Point>>,
{
let static_nafs = static_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_nafs: Vec<_> = dynamic_scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect::<Vec<_>>();
let dynamic_lookup_tables = dynamic_points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let sp = self.static_lookup_tables.len();
let dp = dynamic_lookup_tables.len();
assert_eq!(sp, static_nafs.len());
assert_eq!(dp, dynamic_nafs.len());
// We could save some doublings by looking for the highest
// nonzero NAF coefficient, but since we might have a lot of
// them to search, it's not clear it's worthwhile to check.
let mut S = ProjectivePoint::identity();
for j in (0..256).rev() {
let mut R: CompletedPoint = S.double();
for i in 0..dp {
let t_ij = dynamic_nafs[i][j];
if t_ij > 0 {
R = &R.to_extended() + &dynamic_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R.to_extended() - &dynamic_lookup_tables[i].select(-t_ij as usize);
}
}
for i in 0..sp {
let t_ij = static_nafs[i][j];
if t_ij > 0 {
R = &R.to_extended() + &self.static_lookup_tables[i].select(t_ij as usize);
} else if t_ij < 0 {
R = &R.to_extended() - &self.static_lookup_tables[i].select(-t_ij as usize);
}
}
S = R.to_projective();
}
Some(S.to_extended())
}
}

View File

@ -1,196 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
//! Implementation of the interleaved window method, also known as Straus's method.
#![allow(non_snake_case)]
use core::borrow::Borrow;
use edwards::EdwardsPoint;
use scalar::Scalar;
use traits::MultiscalarMul;
use traits::VartimeMultiscalarMul;
#[allow(unused_imports)]
use prelude::*;
/// Perform multiscalar multiplication by the interleaved window
/// method, also known as Straus's method (since it was apparently
/// [first published][solution] by Straus in 1964, as a solution to [a
/// problem][problem] posted in the American Mathematical Monthly in
/// 1963).
///
/// It is easy enough to reinvent, and has been repeatedly. The basic
/// idea is that when computing
/// \\[
/// Q = s_1 P_1 + \cdots + s_n P_n
/// \\]
/// by means of additions and doublings, the doublings can be shared
/// across the \\( P_i \\).
///
/// We implement two versions, a constant-time algorithm using fixed
/// windows and a variable-time algorithm using sliding windows. They
/// are slight variations on the same idea, and are described in more
/// detail in the respective implementations.
///
/// [solution]: https://www.jstor.org/stable/2310929
/// [problem]: https://www.jstor.org/stable/2312273
pub struct Straus {}
impl MultiscalarMul for Straus {
type Point = EdwardsPoint;
/// Constant-time Straus using a fixed window of size \\(4\\).
///
/// Our goal is to compute
/// \\[
/// Q = s_1 P_1 + \cdots + s_n P_n.
/// \\]
///
/// For each point \\( P_i \\), precompute a lookup table of
/// \\[
/// P_i, 2P_i, 3P_i, 4P_i, 5P_i, 6P_i, 7P_i, 8P_i.
/// \\]
///
/// For each scalar \\( s_i \\), compute its radix-\\(2^4\\)
/// signed digits \\( s_{i,j} \\), i.e.,
/// \\[
/// s_i = s_{i,0} + s_{i,1} 16^1 + ... + s_{i,63} 16^{63},
/// \\]
/// with \\( -8 \leq s_{i,j} < 8 \\). Since \\( 0 \leq |s_{i,j}|
/// \leq 8 \\), we can retrieve \\( s_{i,j} P_i \\) from the
/// lookup table with a conditional negation: using signed
/// digits halves the required table size.
///
/// Then as in the single-base fixed window case, we have
/// \\[
/// \begin{aligned}
/// s_i P_i &= P_i (s_{i,0} + s_{i,1} 16^1 + \cdots + s_{i,63} 16^{63}) \\\\
/// s_i P_i &= P_i s_{i,0} + P_i s_{i,1} 16^1 + \cdots + P_i s_{i,63} 16^{63} \\\\
/// s_i P_i &= P_i s_{i,0} + 16(P_i s_{i,1} + 16( \cdots +16P_i s_{i,63})\cdots )
/// \end{aligned}
/// \\]
/// so each \\( s_i P_i \\) can be computed by alternately adding
/// a precomputed multiple \\( P_i s_{i,j} \\) of \\( P_i \\) and
/// repeatedly doubling.
///
/// Now consider the two-dimensional sum
/// \\[
/// \begin{aligned}
/// s\_1 P\_1 &=& P\_1 s\_{1,0} &+& 16 (P\_1 s\_{1,1} &+& 16 ( \cdots &+& 16 P\_1 s\_{1,63}&) \cdots ) \\\\
/// + & & + & & + & & & & + & \\\\
/// s\_2 P\_2 &=& P\_2 s\_{2,0} &+& 16 (P\_2 s\_{2,1} &+& 16 ( \cdots &+& 16 P\_2 s\_{2,63}&) \cdots ) \\\\
/// + & & + & & + & & & & + & \\\\
/// \vdots & & \vdots & & \vdots & & & & \vdots & \\\\
/// + & & + & & + & & & & + & \\\\
/// s\_n P\_n &=& P\_n s\_{n,0} &+& 16 (P\_n s\_{n,1} &+& 16 ( \cdots &+& 16 P\_n s\_{n,63}&) \cdots )
/// \end{aligned}
/// \\]
/// The sum of the left-hand column is the result \\( Q \\); by
/// computing the two-dimensional sum on the right column-wise,
/// top-to-bottom, then right-to-left, we need to multiply by \\(
/// 16\\) only once per column, sharing the doublings across all
/// of the input points.
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator,
J::Item: Borrow<EdwardsPoint>,
{
use zeroize::Zeroizing;
use backend::serial::curve_models::ProjectiveNielsPoint;
use window::LookupTable;
use traits::Identity;
let lookup_tables: Vec<_> = points
.into_iter()
.map(|point| LookupTable::<ProjectiveNielsPoint>::from(point.borrow()))
.collect();
// This puts the scalar digits into a heap-allocated Vec.
// To ensure that these are erased, pass ownership of the Vec into a
// Zeroizing wrapper.
let scalar_digits_vec: Vec<_> = scalars
.into_iter()
.map(|s| s.borrow().to_radix_16())
.collect();
let scalar_digits = Zeroizing::new(scalar_digits_vec);
let mut Q = EdwardsPoint::identity();
for j in (0..64).rev() {
Q = Q.mul_by_pow_2(4);
let it = scalar_digits.iter().zip(lookup_tables.iter());
for (s_i, lookup_table_i) in it {
// R_i = s_{i,j} * P_i
let R_i = lookup_table_i.select(s_i[j]);
// Q = Q + R_i
Q = (&Q + &R_i).to_extended();
}
}
Q
}
}
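
// A hedged, self-contained toy model of the column-wise evaluation
// documented above, on plain integers (illustration only: not dalek code,
// and not constant time). Decompose each scalar into signed radix-16
// digits, then sum digit columns right-to-left so the multiply-by-16 is
// shared across all terms.
fn signed_radix_16(mut s: i64) -> Vec<i64> {
    // Digits d_j with -8 <= d_j <= 7 such that s = sum of d_j * 16^j.
    let mut digits = Vec::new();
    while s != 0 {
        let mut d = s % 16;
        if d > 7 {
            d -= 16; // re-center into -8..=7; the carry moves into s
        } else if d < -8 {
            d += 16;
        }
        digits.push(d);
        s = (s - d) / 16;
    }
    digits
}

fn straus_toy(scalars: &[i64], points: &[i64]) -> i64 {
    let digit_vecs: Vec<Vec<i64>> = scalars.iter().map(|&s| signed_radix_16(s)).collect();
    let columns = digit_vecs.iter().map(|d| d.len()).max().unwrap_or(0);
    let mut q = 0i64;
    for j in (0..columns).rev() {
        q *= 16; // one shared "doubling" step per column, not per term
        for (digits, &p) in digit_vecs.iter().zip(points.iter()) {
            q += digits.get(j).copied().unwrap_or(0) * p;
        }
    }
    q
}

// straus_toy(&[100, 23], &[3, 7]) == 100 * 3 + 23 * 7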
impl VartimeMultiscalarMul for Straus {
type Point = EdwardsPoint;
/// Variable-time Straus using a non-adjacent form of width \\(5\\).
///
/// This closely mirrors the constant-time code, but it uses a
/// non-adjacent form for the scalar and does not do the table
/// lookups in constant time.
///
/// The non-adjacent form has signed, odd digits. Using only odd
/// digits halves the table size (since we only need odd
/// multiples), or gives fewer additions for the same table size.
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
where
I: IntoIterator,
I::Item: Borrow<Scalar>,
J: IntoIterator<Item = Option<EdwardsPoint>>,
{
use backend::serial::curve_models::{CompletedPoint, ProjectiveNielsPoint, ProjectivePoint};
use window::NafLookupTable5;
use traits::Identity;
let nafs: Vec<_> = scalars
.into_iter()
.map(|c| c.borrow().non_adjacent_form(5))
.collect();
let lookup_tables = points
.into_iter()
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
.collect::<Option<Vec<_>>>()?;
let mut r = ProjectivePoint::identity();
for i in (0..256).rev() {
let mut t: CompletedPoint = r.double();
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
if naf[i] > 0 {
t = &t.to_extended() + &lookup_table.select(naf[i] as usize);
} else if naf[i] < 0 {
t = &t.to_extended() - &lookup_table.select(-naf[i] as usize);
}
}
r = t.to_projective();
}
Some(r.to_extended())
}
}
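
The width-5 non-adjacent form used here has signed odd digits with at most one nonzero digit in any five consecutive positions. A hedged toy decomposition for non-negative integers (an illustration of the encoding, not dalek's internal routine):

fn naf(mut k: i64, w: u32) -> Vec<i64> {
    // Width-w NAF of a non-negative k: odd digits d with |d| < 2^(w-1).
    let window = 1i64 << w; // 32 for w = 5
    let mut digits = Vec::new();
    while k != 0 {
        if k % 2 != 0 {
            let mut d = k % window; // take the low w bits
            if d > window / 2 {
                d -= window; // re-center: odd, with |d| < 2^(w-1)
            }
            digits.push(d);
            k -= d;
        } else {
            digits.push(0);
        }
        k /= 2;
    }
    digits
}

fn main() {
    let digits = naf(2023, 5);
    // Reconstruction: the signed digits sum back to the input.
    let back: i64 = digits.iter().enumerate().map(|(i, &d)| d * (1i64 << i)).sum();
    assert_eq!(back, 2023);
    // Only odd nonzero digits, so a table of odd multiples suffices.
    assert!(digits.iter().all(|&d| d == 0 || d % 2 != 0));
}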

View File

@ -1,46 +0,0 @@
#![allow(non_snake_case)]
use traits::Identity;
use scalar::Scalar;
use edwards::EdwardsPoint;
use backend::serial::curve_models::ProjectiveNielsPoint;
use window::LookupTable;
/// Perform constant-time, variable-base scalar multiplication.
pub(crate) fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
let lookup_table = LookupTable::<ProjectiveNielsPoint>::from(point);
// Setting s = scalar, compute
//
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
//
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
let scalar_digits = scalar.to_radix_16();
// Compute s*P as
//
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
//
// We sum right-to-left.
// Unroll the first loop iteration to avoid computing 16*identity
let mut tmp2;
let mut tmp3 = EdwardsPoint::identity();
let mut tmp1 = &tmp3 + &lookup_table.select(scalar_digits[63]);
// Now tmp1 = s_63*P in P1xP1 coords
for i in (0..63).rev() {
tmp2 = tmp1.to_projective(); // tmp2 = (prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 2*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 2*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 4*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 4*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 8*(prev) in P1xP1 coords
tmp2 = tmp1.to_projective(); // tmp2 = 8*(prev) in P2 coords
tmp1 = tmp2.double(); // tmp1 = 16*(prev) in P1xP1 coords
tmp3 = tmp1.to_extended(); // tmp3 = 16*(prev) in P3 coords
tmp1 = &tmp3 + &lookup_table.select(scalar_digits[i]);
// Now tmp1 = s_i*P + 16*(prev) in P1xP1 coords
}
tmp1.to_extended()
}
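
This routine is what the crate's public scalar-multiplication operator bottoms out in on the serial backend. A hedged sanity check through the public API (the repeated addition is only for illustration; the operator dispatch is an assumption about the crate's internals):

use curve25519_dalek::constants::ED25519_BASEPOINT_POINT;
use curve25519_dalek::scalar::Scalar;

fn main() {
    let p = ED25519_BASEPOINT_POINT;
    let s = Scalar::from(1000u64);
    // 1000*P by naive repeated addition, starting from the identity (P - P).
    let naive = (0..1000).fold(p - p, |acc, _| acc + p);
    // The operator should agree with the fixed-window code above.
    assert_eq!(&p * &s, naive);
}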

View File

@ -1,62 +0,0 @@
// -*- mode: rust; -*-
//
// This file is part of curve25519-dalek.
// Copyright (c) 2016-2021 isis lovecruft
// Copyright (c) 2016-2019 Henry de Valence
// See LICENSE for licensing information.
//
// Authors:
// - isis agora lovecruft <isis@patternsinthevoid.net>
// - Henry de Valence <hdevalence@hdevalence.ca>
#![allow(non_snake_case)]
use constants;
use traits::Identity;
use scalar::Scalar;
use edwards::EdwardsPoint;
use backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint};
use window::NafLookupTable5;
/// Compute \\(aA + bB\\) in variable time, where \\(B\\) is the Ed25519 basepoint.
pub fn mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint {
let a_naf = a.non_adjacent_form(5);
let b_naf = b.non_adjacent_form(8);
// Find the starting index: the highest position with a nonzero NAF digit
let mut i: usize = 255;
for j in (0..256).rev() {
i = j;
if a_naf[i] != 0 || b_naf[i] != 0 {
break;
}
}
let table_A = NafLookupTable5::<ProjectiveNielsPoint>::from(A);
let table_B = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT;
let mut r = ProjectivePoint::identity();
loop {
let mut t = r.double();
if a_naf[i] > 0 {
t = &t.to_extended() + &table_A.select(a_naf[i] as usize);
} else if a_naf[i] < 0 {
t = &t.to_extended() - &table_A.select(-a_naf[i] as usize);
}
if b_naf[i] > 0 {
t = &t.to_extended() + &table_B.select(b_naf[i] as usize);
} else if b_naf[i] < 0 {
t = &t.to_extended() - &table_B.select(-b_naf[i] as usize);
}
r = t.to_projective();
if i == 0 {
break;
}
i -= 1;
}
r.to_extended()
}
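
The \\(aA + bB\\) shape is exactly what Ed25519 signature verification needs, and the crate exposes it publicly as `EdwardsPoint::vartime_double_scalar_mul_basepoint`. A hedged sketch, with constants chosen only to make the check obvious:

use curve25519_dalek::constants::ED25519_BASEPOINT_POINT;
use curve25519_dalek::edwards::EdwardsPoint;
use curve25519_dalek::scalar::Scalar;

fn main() {
    let b = ED25519_BASEPOINT_POINT;
    let big_a = Scalar::from(7u64) * b; // stand-in for a public point A
    let (a, s) = (Scalar::from(3u64), Scalar::from(4u64));
    // Computes a*A + s*B in variable time, with B the Ed25519 basepoint.
    let q = EdwardsPoint::vartime_double_scalar_mul_basepoint(&a, &big_a, &s);
    assert_eq!(q, Scalar::from(25u64) * b); // 3*(7B) + 4*B = 25B
}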

Some files were not shown because too many files have changed in this diff.