Compare commits
5 Commits
940e26830b
...
f3db1ea486
Author | SHA1 | Date |
---|---|---|
c0dev0id | f3db1ea486 | |
c0dev0id | 30a0150d81 | |
c0dev0id | ab1dac7ff6 | |
c0dev0id | 57fc60ebff | |
c0dev0id | 4219dd4815 |
|
@ -1,41 +1,34 @@
|
|||
COMMENT = Signal Messenger client for terminal
|
||||
|
||||
V = 0.2.5
|
||||
GH_ACCOUNT = boxdot
|
||||
GH_PROJECT = gurk-rs
|
||||
GH_TAGNAME = v${V}
|
||||
GH_TAGNAME = v0.3.0
|
||||
|
||||
CATEGORIES = net
|
||||
|
||||
MAINTAINER = Stefan Hagen <sh+ports@codevoid.de>
|
||||
|
||||
# LICENSE
|
||||
# AGPL 3.0
|
||||
PERMIT_PACKAGE = Yes
|
||||
|
||||
# as devel/cargo MODULES adds DISTFILES, GH_* didn't
|
||||
DISTFILES += ${DISTNAME}${EXTRACT_SUFX}
|
||||
DISTFILES = ${DISTNAME}{master}${EXTRACT_SUFX}
|
||||
|
||||
# vendor files (see $FILESDIR/config)
|
||||
MASTER_SITES0 = https://codevoid.de/h/
|
||||
DISTFILES += ${DISTNAME}-vendorfiles${EXTRACT_SUFX}:0
|
||||
|
||||
MODULES = devel/cargo
|
||||
|
||||
.include "crates.inc"
|
||||
WANTLIB = ${MODCARGO_WANTLIB}
|
||||
|
||||
WANTLIB = c c++abi pthread
|
||||
|
||||
#LIB_DEPENDS =
|
||||
#RUN_DEPENDS =
|
||||
#BUILD_DEPENDS =
|
||||
|
||||
#TEST_DEPENDS =
|
||||
BUILD_DEPENDS = devel/protobuf
|
||||
|
||||
CONFIGURE_STYLE = cargo
|
||||
|
||||
#MAKE_FLAGS =
|
||||
post-configure:
|
||||
cat ${FILESDIR}/config >> ${WRKDIR}/.cargo/config
|
||||
sed -i.bak 's/opt-level = 2/opt-level = 0/g' ${WRKDIR}/.cargo/config
|
||||
|
||||
#NO_TEST = Yes
|
||||
#TEST_TARGET =
|
||||
|
||||
pre-configure:
|
||||
cat ${FILESDIR}/config >> ${WRKSRC}/.cargo/config.toml; \
|
||||
cp -rf ${FILESDIR}/vendor ${WRKSRC}/
|
||||
.include "crates.inc"
|
||||
|
||||
.include <bsd.port.mk>
|
||||
|
|
|
@ -1,360 +1,377 @@
|
|||
# run: make modcargo-gen-crates-licenses
|
||||
MODCARGO_CRATES += adler 1.0.2
|
||||
MODCARGO_CRATES += aead 0.4.2
|
||||
MODCARGO_CRATES += aes 0.7.4
|
||||
MODCARGO_CRATES += aes-gcm 0.9.2
|
||||
MODCARGO_CRATES += aes-gcm-siv 0.10.1
|
||||
MODCARGO_CRATES += aho-corasick 0.7.18
|
||||
MODCARGO_CRATES += android_system_properties 0.1.5
|
||||
MODCARGO_CRATES += ansi_term 0.11.0
|
||||
MODCARGO_CRATES += ansi_term 0.12.1
|
||||
MODCARGO_CRATES += anyhow 1.0.42
|
||||
MODCARGO_CRATES += arrayref 0.3.6
|
||||
MODCARGO_CRATES += arrayvec 0.5.2
|
||||
MODCARGO_CRATES += ascii 0.9.3
|
||||
MODCARGO_CRATES += async-broadcast 0.3.4
|
||||
MODCARGO_CRATES += async-channel 1.6.1
|
||||
MODCARGO_CRATES += async-executor 1.4.1
|
||||
MODCARGO_CRATES += async-io 1.6.0
|
||||
MODCARGO_CRATES += async-lock 2.5.0
|
||||
MODCARGO_CRATES += async-recursion 0.3.2
|
||||
MODCARGO_CRATES += async-task 4.2.0
|
||||
MODCARGO_CRATES += async-trait 0.1.51
|
||||
MODCARGO_CRATES += async-tungstenite 0.15.0
|
||||
MODCARGO_CRATES += atty 0.2.14
|
||||
MODCARGO_CRATES += autocfg 1.1.0
|
||||
MODCARGO_CRATES += base64 0.12.3
|
||||
MODCARGO_CRATES += base64 0.13.0
|
||||
MODCARGO_CRATES += bincode 1.3.3
|
||||
MODCARGO_CRATES += bitflags 1.2.1
|
||||
MODCARGO_CRATES += block 0.1.6
|
||||
MODCARGO_CRATES += block-buffer 0.7.3
|
||||
MODCARGO_CRATES += block-buffer 0.9.0
|
||||
MODCARGO_CRATES += block-modes 0.8.1
|
||||
MODCARGO_CRATES += block-padding 0.1.5
|
||||
MODCARGO_CRATES += block-padding 0.2.1
|
||||
MODCARGO_CRATES += bstr 0.2.16
|
||||
MODCARGO_CRATES += bumpalo 3.7.0
|
||||
MODCARGO_CRATES += byte-tools 0.3.1
|
||||
MODCARGO_CRATES += byteorder 1.4.3
|
||||
MODCARGO_CRATES += bytes 1.0.1
|
||||
MODCARGO_CRATES += cache-padded 1.1.1
|
||||
MODCARGO_CRATES += cassowary 0.3.0
|
||||
MODCARGO_CRATES += cc 1.0.69
|
||||
MODCARGO_CRATES += cesu8 1.1.0
|
||||
MODCARGO_CRATES += cfg-if 1.0.0
|
||||
MODCARGO_CRATES += checked_int_cast 1.0.0
|
||||
MODCARGO_CRATES += chrono 0.4.22
|
||||
MODCARGO_CRATES += cipher 0.3.0
|
||||
MODCARGO_CRATES += clap 2.33.3
|
||||
MODCARGO_CRATES += combine 3.8.1
|
||||
MODCARGO_CRATES += concurrent-queue 1.2.2
|
||||
MODCARGO_CRATES += core-foundation 0.9.1
|
||||
MODCARGO_CRATES += core-foundation-sys 0.8.3
|
||||
MODCARGO_CRATES += cpufeatures 0.1.5
|
||||
MODCARGO_CRATES += crc32fast 1.2.1
|
||||
MODCARGO_CRATES += crossbeam-channel 0.5.5
|
||||
MODCARGO_CRATES += crossbeam-epoch 0.9.5
|
||||
MODCARGO_CRATES += crossbeam-utils 0.8.9
|
||||
MODCARGO_CRATES += crossterm 0.19.0
|
||||
MODCARGO_CRATES += crossterm_winapi 0.7.0
|
||||
MODCARGO_CRATES += crypto-mac 0.7.0
|
||||
MODCARGO_CRATES += crypto-mac 0.11.1
|
||||
MODCARGO_CRATES += ct-logs 0.8.0
|
||||
MODCARGO_CRATES += ctr 0.7.0
|
||||
MODCARGO_CRATES += curve25519-dalek 3.1.0
|
||||
MODCARGO_CRATES += derivative 2.2.0
|
||||
MODCARGO_CRATES += digest 0.8.1
|
||||
MODCARGO_CRATES += digest 0.9.0
|
||||
MODCARGO_CRATES += dirs 3.0.2
|
||||
MODCARGO_CRATES += dirs-next 2.0.0
|
||||
MODCARGO_CRATES += dirs-sys 0.3.6
|
||||
MODCARGO_CRATES += dirs-sys-next 0.1.2
|
||||
MODCARGO_CRATES += displaydoc 0.2.3
|
||||
MODCARGO_CRATES += easy-parallel 3.2.0
|
||||
MODCARGO_CRATES += either 1.6.1
|
||||
MODCARGO_CRATES += emoji 0.2.1
|
||||
MODCARGO_CRATES += enumflags2 0.7.5
|
||||
MODCARGO_CRATES += enumflags2_derive 0.7.4
|
||||
MODCARGO_CRATES += env_logger 0.8.4
|
||||
MODCARGO_CRATES += error-chain 0.12.4
|
||||
MODCARGO_CRATES += event-listener 2.5.2
|
||||
MODCARGO_CRATES += fake-simd 0.1.2
|
||||
MODCARGO_CRATES += fastrand 1.4.1
|
||||
MODCARGO_CRATES += filetime 0.2.14
|
||||
MODCARGO_CRATES += fixedbitset 0.2.0
|
||||
MODCARGO_CRATES += fixedbitset 0.4.1
|
||||
MODCARGO_CRATES += flate2 1.0.20
|
||||
MODCARGO_CRATES += fnv 1.0.7
|
||||
MODCARGO_CRATES += form_urlencoded 1.0.1
|
||||
MODCARGO_CRATES += fs2 0.4.3
|
||||
MODCARGO_CRATES += futures 0.3.15
|
||||
MODCARGO_CRATES += futures-channel 0.3.15
|
||||
MODCARGO_CRATES += futures-core 0.3.15
|
||||
MODCARGO_CRATES += futures-executor 0.3.15
|
||||
MODCARGO_CRATES += futures-io 0.3.15
|
||||
MODCARGO_CRATES += futures-lite 1.12.0
|
||||
MODCARGO_CRATES += futures-macro 0.3.15
|
||||
MODCARGO_CRATES += futures-sink 0.3.15
|
||||
MODCARGO_CRATES += futures-task 0.3.15
|
||||
MODCARGO_CRATES += futures-util 0.3.15
|
||||
MODCARGO_CRATES += fuzzy-matcher 0.3.7
|
||||
MODCARGO_CRATES += fxhash 0.2.1
|
||||
MODCARGO_CRATES += generic-array 0.12.4
|
||||
MODCARGO_CRATES += generic-array 0.14.4
|
||||
MODCARGO_CRATES += getopts 0.2.21
|
||||
MODCARGO_CRATES += getrandom 0.1.16
|
||||
MODCARGO_CRATES += getrandom 0.2.3
|
||||
MODCARGO_CRATES += gh-emoji 1.0.3
|
||||
MODCARGO_CRATES += ghash 0.4.2
|
||||
MODCARGO_CRATES += hashbrown 0.11.2
|
||||
MODCARGO_CRATES += headers 0.3.4
|
||||
MODCARGO_CRATES += headers-core 0.2.0
|
||||
MODCARGO_CRATES += heck 0.3.3
|
||||
MODCARGO_CRATES += hermit-abi 0.1.19
|
||||
MODCARGO_CRATES += hex 0.4.3
|
||||
MODCARGO_CRATES += hkdf 0.11.0
|
||||
MODCARGO_CRATES += hmac 0.7.1
|
||||
MODCARGO_CRATES += hmac 0.11.0
|
||||
MODCARGO_CRATES += hostname 0.3.1
|
||||
MODCARGO_CRATES += http 0.2.4
|
||||
MODCARGO_CRATES += http-body 0.4.2
|
||||
MODCARGO_CRATES += httparse 1.4.1
|
||||
MODCARGO_CRATES += httpdate 1.0.1
|
||||
MODCARGO_CRATES += hyper 0.14.11
|
||||
MODCARGO_CRATES += hyper-rustls 0.22.1
|
||||
MODCARGO_CRATES += hyper-timeout 0.4.1
|
||||
MODCARGO_CRATES += iana-time-zone 0.1.47
|
||||
MODCARGO_CRATES += idna 0.2.3
|
||||
MODCARGO_CRATES += indexmap 1.7.0
|
||||
MODCARGO_CRATES += instant 0.1.10
|
||||
MODCARGO_CRATES += itertools 0.9.0
|
||||
MODCARGO_CRATES += itertools 0.10.1
|
||||
MODCARGO_CRATES += itoa 0.4.7
|
||||
MODCARGO_CRATES += itoa 1.0.2
|
||||
MODCARGO_CRATES += jni 0.16.0
|
||||
MODCARGO_CRATES += jni-sys 0.3.0
|
||||
MODCARGO_CRATES += js-sys 0.3.59
|
||||
MODCARGO_CRATES += lazy_static 1.4.0
|
||||
MODCARGO_CRATES += lexical-core 0.7.6
|
||||
MODCARGO_CRATES += libc 0.2.126
|
||||
MODCARGO_CRATES += linked-hash-map 0.5.4
|
||||
MODCARGO_CRATES += lock_api 0.4.7
|
||||
MODCARGO_CRATES += log 0.4.14
|
||||
MODCARGO_CRATES += log-panics 2.0.0
|
||||
MODCARGO_CRATES += lru-cache 0.1.2
|
||||
MODCARGO_CRATES += mac-notification-sys 0.5.2
|
||||
MODCARGO_CRATES += malloc_buf 0.0.6
|
||||
MODCARGO_CRATES += match_cfg 0.1.0
|
||||
MODCARGO_CRATES += matches 0.1.8
|
||||
MODCARGO_CRATES += memchr 2.4.0
|
||||
MODCARGO_CRATES += memoffset 0.6.4
|
||||
MODCARGO_CRATES += mime 0.3.16
|
||||
MODCARGO_CRATES += mime_guess 2.0.3
|
||||
MODCARGO_CRATES += miniz_oxide 0.4.4
|
||||
MODCARGO_CRATES += mio 0.7.13
|
||||
MODCARGO_CRATES += miow 0.3.7
|
||||
MODCARGO_CRATES += mpart-async 0.5.0
|
||||
MODCARGO_CRATES += multimap 0.8.3
|
||||
MODCARGO_CRATES += nix 0.23.1
|
||||
MODCARGO_CRATES += nom 5.1.2
|
||||
MODCARGO_CRATES += notify-rust 4.5.8
|
||||
MODCARGO_CRATES += ntapi 0.3.6
|
||||
MODCARGO_CRATES += num-integer 0.1.44
|
||||
MODCARGO_CRATES += num-traits 0.2.14
|
||||
MODCARGO_CRATES += num_cpus 1.13.0
|
||||
MODCARGO_CRATES += num_enum 0.5.2
|
||||
MODCARGO_CRATES += num_enum_derive 0.5.2
|
||||
MODCARGO_CRATES += num_threads 0.1.6
|
||||
MODCARGO_CRATES += objc 0.2.7
|
||||
MODCARGO_CRATES += objc-foundation 0.1.1
|
||||
MODCARGO_CRATES += objc_id 0.1.1
|
||||
MODCARGO_CRATES += once_cell 1.13.1
|
||||
MODCARGO_CRATES += oncemutex 0.1.1
|
||||
MODCARGO_CRATES += opaque-debug 0.2.3
|
||||
MODCARGO_CRATES += opaque-debug 0.3.0
|
||||
MODCARGO_CRATES += opener 0.5.0
|
||||
MODCARGO_CRATES += openssl-probe 0.1.4
|
||||
MODCARGO_CRATES += ordered-stream 0.0.1
|
||||
MODCARGO_CRATES += parking 2.0.0
|
||||
MODCARGO_CRATES += parking_lot 0.11.1
|
||||
MODCARGO_CRATES += parking_lot_core 0.8.3
|
||||
MODCARGO_CRATES += percent-encoding 2.1.0
|
||||
MODCARGO_CRATES += petgraph 0.5.1
|
||||
MODCARGO_CRATES += petgraph 0.6.0
|
||||
MODCARGO_CRATES += phf 0.8.0
|
||||
MODCARGO_CRATES += phf_generator 0.8.0
|
||||
MODCARGO_CRATES += phf_macros 0.8.0
|
||||
MODCARGO_CRATES += phf_shared 0.8.0
|
||||
MODCARGO_CRATES += phonenumber 0.3.1+8.12.9
|
||||
MODCARGO_CRATES += pin-project 1.0.7
|
||||
MODCARGO_CRATES += pin-project-internal 1.0.7
|
||||
MODCARGO_CRATES += pin-project-lite 0.2.7
|
||||
MODCARGO_CRATES += pin-utils 0.1.0
|
||||
MODCARGO_CRATES += polling 2.1.0
|
||||
MODCARGO_CRATES += polyval 0.5.1
|
||||
MODCARGO_CRATES += ppv-lite86 0.2.10
|
||||
MODCARGO_CRATES += proc-macro-crate 1.0.0
|
||||
MODCARGO_CRATES += proc-macro-error 1.0.4
|
||||
MODCARGO_CRATES += proc-macro-error-attr 1.0.4
|
||||
MODCARGO_CRATES += proc-macro-hack 0.5.19
|
||||
MODCARGO_CRATES += proc-macro-nested 0.1.7
|
||||
MODCARGO_CRATES += proc-macro2 1.0.27
|
||||
MODCARGO_CRATES += prost 0.8.0
|
||||
MODCARGO_CRATES += prost 0.9.0
|
||||
MODCARGO_CRATES += prost-build 0.8.0
|
||||
MODCARGO_CRATES += prost-build 0.9.0
|
||||
MODCARGO_CRATES += prost-derive 0.8.0
|
||||
MODCARGO_CRATES += prost-derive 0.9.0
|
||||
MODCARGO_CRATES += prost-types 0.8.0
|
||||
MODCARGO_CRATES += prost-types 0.9.0
|
||||
MODCARGO_CRATES += pulldown-cmark 0.8.0
|
||||
MODCARGO_CRATES += qr2term 0.2.2
|
||||
MODCARGO_CRATES += qrcode 0.12.0
|
||||
MODCARGO_CRATES += quick-xml 0.18.1
|
||||
MODCARGO_CRATES += quickcheck 1.0.3
|
||||
MODCARGO_CRATES += quickcheck_macros 1.0.0
|
||||
MODCARGO_CRATES += quote 1.0.9
|
||||
MODCARGO_CRATES += rand 0.7.3
|
||||
MODCARGO_CRATES += rand 0.8.4
|
||||
MODCARGO_CRATES += rand_chacha 0.2.2
|
||||
MODCARGO_CRATES += rand_chacha 0.3.1
|
||||
MODCARGO_CRATES += rand_core 0.5.1
|
||||
MODCARGO_CRATES += rand_core 0.6.3
|
||||
MODCARGO_CRATES += rand_hc 0.2.0
|
||||
MODCARGO_CRATES += rand_hc 0.3.1
|
||||
MODCARGO_CRATES += rand_pcg 0.2.1
|
||||
MODCARGO_CRATES += redox_syscall 0.2.9
|
||||
MODCARGO_CRATES += redox_users 0.4.0
|
||||
MODCARGO_CRATES += regex 1.5.6
|
||||
MODCARGO_CRATES += regex-automata 0.1.10
|
||||
MODCARGO_CRATES += regex-cache 0.2.1
|
||||
MODCARGO_CRATES += regex-syntax 0.6.26
|
||||
MODCARGO_CRATES += remove_dir_all 0.5.3
|
||||
MODCARGO_CRATES += ring 0.16.20
|
||||
MODCARGO_CRATES += rustls 0.19.1
|
||||
MODCARGO_CRATES += rustls-native-certs 0.5.0
|
||||
MODCARGO_CRATES += ryu 1.0.5
|
||||
MODCARGO_CRATES += same-file 1.0.6
|
||||
MODCARGO_CRATES += schannel 0.1.19
|
||||
MODCARGO_CRATES += scopeguard 1.1.0
|
||||
MODCARGO_CRATES += sct 0.6.1
|
||||
MODCARGO_CRATES += security-framework 2.3.1
|
||||
MODCARGO_CRATES += security-framework-sys 2.3.0
|
||||
MODCARGO_CRATES += semver 1.0.3
|
||||
MODCARGO_CRATES += serde 1.0.126
|
||||
MODCARGO_CRATES += serde_derive 1.0.126
|
||||
MODCARGO_CRATES += serde_json 1.0.64
|
||||
MODCARGO_CRATES += serde_repr 0.1.7
|
||||
MODCARGO_CRATES += sha-1 0.9.6
|
||||
MODCARGO_CRATES += sha1 0.6.1
|
||||
MODCARGO_CRATES += sha1_smol 1.0.0
|
||||
MODCARGO_CRATES += sha2 0.8.2
|
||||
MODCARGO_CRATES += sha2 0.9.5
|
||||
MODCARGO_CRATES += sharded-slab 0.1.4
|
||||
MODCARGO_CRATES += signal-hook 0.1.17
|
||||
MODCARGO_CRATES += signal-hook-registry 1.4.0
|
||||
MODCARGO_CRATES += siphasher 0.3.5
|
||||
MODCARGO_CRATES += slab 0.4.3
|
||||
MODCARGO_CRATES += sled 0.34.6
|
||||
MODCARGO_CRATES += smallvec 1.6.1
|
||||
MODCARGO_CRATES += smawk 0.3.1
|
||||
MODCARGO_CRATES += socket2 0.4.0
|
||||
MODCARGO_CRATES += spin 0.5.2
|
||||
MODCARGO_CRATES += static_assertions 1.1.0
|
||||
MODCARGO_CRATES += strsim 0.8.0
|
||||
MODCARGO_CRATES += structopt 0.3.22
|
||||
MODCARGO_CRATES += structopt-derive 0.4.15
|
||||
MODCARGO_CRATES += strum 0.22.0
|
||||
MODCARGO_CRATES += strum_macros 0.22.0
|
||||
MODCARGO_CRATES += subtle 1.0.0
|
||||
MODCARGO_CRATES += subtle 2.4.1
|
||||
MODCARGO_CRATES += syn 1.0.73
|
||||
MODCARGO_CRATES += synstructure 0.12.5
|
||||
MODCARGO_CRATES += tar 0.4.38
|
||||
MODCARGO_CRATES += tempfile 3.2.0
|
||||
MODCARGO_CRATES += textwrap 0.11.0
|
||||
MODCARGO_CRATES += textwrap 0.14.2
|
||||
MODCARGO_CRATES += thiserror 1.0.30
|
||||
MODCARGO_CRATES += thiserror-impl 1.0.30
|
||||
MODCARGO_CRATES += thread_local 1.1.4
|
||||
MODCARGO_CRATES += time 0.1.44
|
||||
MODCARGO_CRATES += time 0.3.9
|
||||
MODCARGO_CRATES += tinyvec 1.2.0
|
||||
MODCARGO_CRATES += tinyvec_macros 0.1.0
|
||||
MODCARGO_CRATES += tokio 1.16.1
|
||||
MODCARGO_CRATES += tokio-io-timeout 1.1.1
|
||||
MODCARGO_CRATES += tokio-macros 1.8.0
|
||||
MODCARGO_CRATES += tokio-rustls 0.22.0
|
||||
MODCARGO_CRATES += tokio-stream 0.1.7
|
||||
MODCARGO_CRATES += tokio-util 0.6.7
|
||||
MODCARGO_CRATES += toml 0.5.8
|
||||
MODCARGO_CRATES += tower-service 0.3.1
|
||||
MODCARGO_CRATES += tracing 0.1.35
|
||||
MODCARGO_CRATES += tracing-appender 0.2.2
|
||||
MODCARGO_CRATES += tracing-attributes 0.1.21
|
||||
MODCARGO_CRATES += tracing-core 0.1.27
|
||||
MODCARGO_CRATES += tracing-log 0.1.3
|
||||
MODCARGO_CRATES += tracing-subscriber 0.3.11
|
||||
MODCARGO_CRATES += try-lock 0.2.3
|
||||
MODCARGO_CRATES += tui 0.15.0
|
||||
MODCARGO_CRATES += tungstenite 0.15.0
|
||||
MODCARGO_CRATES += twoway 0.2.2
|
||||
MODCARGO_CRATES += typenum 1.13.0
|
||||
MODCARGO_CRATES += unchecked-index 0.2.2
|
||||
MODCARGO_CRATES += unicase 2.6.0
|
||||
MODCARGO_CRATES += unicode-bidi 0.3.5
|
||||
MODCARGO_CRATES += unicode-linebreak 0.1.1
|
||||
MODCARGO_CRATES += unicode-normalization 0.1.19
|
||||
MODCARGO_CRATES += unicode-segmentation 1.8.0
|
||||
MODCARGO_CRATES += unicode-width 0.1.8
|
||||
MODCARGO_CRATES += unicode-xid 0.2.2
|
||||
MODCARGO_CRATES += universal-hash 0.4.0
|
||||
MODCARGO_CRATES += unreachable 1.0.0
|
||||
MODCARGO_CRATES += untrusted 0.7.1
|
||||
MODCARGO_CRATES += url 2.2.2
|
||||
MODCARGO_CRATES += utf-8 0.7.6
|
||||
MODCARGO_CRATES += uuid 0.8.2
|
||||
MODCARGO_CRATES += valuable 0.1.0
|
||||
MODCARGO_CRATES += vec_map 0.8.2
|
||||
MODCARGO_CRATES += version_check 0.9.3
|
||||
MODCARGO_CRATES += void 1.0.2
|
||||
MODCARGO_CRATES += waker-fn 1.1.0
|
||||
MODCARGO_CRATES += walkdir 2.3.2
|
||||
MODCARGO_CRATES += want 0.3.0
|
||||
MODCARGO_CRATES += wasi 0.9.0+wasi-snapshot-preview1
|
||||
MODCARGO_CRATES += wasi 0.10.0+wasi-snapshot-preview1
|
||||
MODCARGO_CRATES += wasm-bindgen 0.2.82
|
||||
MODCARGO_CRATES += wasm-bindgen-backend 0.2.82
|
||||
MODCARGO_CRATES += wasm-bindgen-macro 0.2.82
|
||||
MODCARGO_CRATES += wasm-bindgen-macro-support 0.2.82
|
||||
MODCARGO_CRATES += wasm-bindgen-shared 0.2.82
|
||||
MODCARGO_CRATES += web-sys 0.3.51
|
||||
MODCARGO_CRATES += webpki 0.21.4
|
||||
MODCARGO_CRATES += wepoll-ffi 0.1.2
|
||||
MODCARGO_CRATES += which 4.1.0
|
||||
MODCARGO_CRATES += whoami 1.1.2
|
||||
MODCARGO_CRATES += winapi 0.3.9
|
||||
MODCARGO_CRATES += winapi-i686-pc-windows-gnu 0.4.0
|
||||
MODCARGO_CRATES += winapi-util 0.1.5
|
||||
MODCARGO_CRATES += winapi-x86_64-pc-windows-gnu 0.4.0
|
||||
MODCARGO_CRATES += windows 0.24.0
|
||||
MODCARGO_CRATES += windows_i686_gnu 0.24.0
|
||||
MODCARGO_CRATES += windows_i686_msvc 0.24.0
|
||||
MODCARGO_CRATES += windows_x86_64_gnu 0.24.0
|
||||
MODCARGO_CRATES += windows_x86_64_msvc 0.24.0
|
||||
MODCARGO_CRATES += winrt-notification 0.5.1
|
||||
MODCARGO_CRATES += x25519-dalek 1.1.1
|
||||
MODCARGO_CRATES += xattr 0.2.2
|
||||
MODCARGO_CRATES += xflags 0.2.2
|
||||
MODCARGO_CRATES += xflags-macros 0.2.2
|
||||
MODCARGO_CRATES += xml-rs 0.8.4
|
||||
MODCARGO_CRATES += xshell 0.1.14
|
||||
MODCARGO_CRATES += xshell-macros 0.1.14
|
||||
MODCARGO_CRATES += zbus 2.1.1
|
||||
MODCARGO_CRATES += zbus_macros 2.1.1
|
||||
MODCARGO_CRATES += zbus_names 2.1.0
|
||||
MODCARGO_CRATES += zeroize 1.3.0
|
||||
MODCARGO_CRATES += zeroize_derive 1.3.2
|
||||
MODCARGO_CRATES += zvariant 3.3.0
|
||||
MODCARGO_CRATES += zvariant_derive 3.3.0
|
||||
MODCARGO_CRATES += adler 1.0.2 # 0BSD OR MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += aead 0.4.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += aes 0.7.5 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += aes-gcm 0.9.4 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += aes-gcm-siv 0.10.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += aho-corasick 0.7.19 # Unlicense/MIT
|
||||
MODCARGO_CRATES += android_system_properties 0.1.5 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += anes 0.1.6 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += ansi_term 0.12.1 # MIT
|
||||
MODCARGO_CRATES += anyhow 1.0.64 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += arrayref 0.3.6 # BSD-2-Clause
|
||||
MODCARGO_CRATES += arrayvec 0.5.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += async-broadcast 0.4.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += async-channel 1.7.1 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += async-executor 1.4.1 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += async-io 1.9.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += async-lock 2.5.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += async-recursion 0.3.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += async-task 4.3.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += async-trait 0.1.57 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += async-tungstenite 0.17.2 # MIT
|
||||
MODCARGO_CRATES += atty 0.2.14 # MIT
|
||||
MODCARGO_CRATES += autocfg 1.1.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += base64 0.12.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += base64 0.13.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += bincode 1.3.3 # MIT
|
||||
MODCARGO_CRATES += bitflags 1.3.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += block 0.1.6 # MIT
|
||||
MODCARGO_CRATES += block-buffer 0.9.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += block-buffer 0.10.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += block-modes 0.8.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += block-padding 0.2.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += bstr 0.2.17 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += bumpalo 3.11.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += byteorder 1.4.3 # Unlicense OR MIT
|
||||
MODCARGO_CRATES += bytes 1.2.1 # MIT
|
||||
MODCARGO_CRATES += cache-padded 1.2.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += cassowary 0.3.0 # MIT / Apache-2.0
|
||||
MODCARGO_CRATES += cast 0.3.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += cc 1.0.73 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += cfg-if 1.0.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += checked_int_cast 1.0.0 # MIT
|
||||
MODCARGO_CRATES += chrono 0.4.22 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += ciborium 0.2.0 # Apache-2.0
|
||||
MODCARGO_CRATES += ciborium-io 0.2.0 # Apache-2.0
|
||||
MODCARGO_CRATES += ciborium-ll 0.2.0 # Apache-2.0
|
||||
MODCARGO_CRATES += cipher 0.3.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += clap 3.2.23 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += clap 4.0.18 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += clap_derive 4.0.18 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += clap_lex 0.2.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += clap_lex 0.3.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += cmake 0.1.48 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += concurrent-queue 1.2.4 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += core-foundation 0.9.3 # MIT / Apache-2.0
|
||||
MODCARGO_CRATES += core-foundation-sys 0.8.3 # MIT / Apache-2.0
|
||||
MODCARGO_CRATES += cpufeatures 0.2.5 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crc32fast 1.3.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += criterion 0.4.0 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += criterion-plot 0.5.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += crossbeam-channel 0.5.6 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crossbeam-deque 0.8.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crossbeam-epoch 0.9.10 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crossbeam-utils 0.8.11 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crossterm 0.19.0 # MIT
|
||||
MODCARGO_CRATES += crossterm 0.23.2 # MIT
|
||||
MODCARGO_CRATES += crossterm_winapi 0.7.0 # MIT
|
||||
MODCARGO_CRATES += crossterm_winapi 0.9.0 # MIT
|
||||
MODCARGO_CRATES += crypto-common 0.1.6 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += crypto-mac 0.11.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += ctr 0.8.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += derivative 2.2.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += digest 0.9.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += digest 0.10.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += dirs 3.0.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += dirs 4.0.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += dirs-next 2.0.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += dirs-sys 0.3.7 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += dirs-sys-next 0.1.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += displaydoc 0.2.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += either 1.8.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += emoji 0.2.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += enumflags2 0.7.5 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += enumflags2_derive 0.7.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += env_logger 0.8.4 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += event-listener 2.5.3 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += fastrand 1.8.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += filetime 0.2.17 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += fixedbitset 0.4.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += flate2 1.0.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += fnv 1.0.7 # Apache-2.0 / MIT
|
||||
MODCARGO_CRATES += form_urlencoded 1.0.1 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += fs2 0.4.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += futures 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-channel 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-core 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-executor 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-io 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-lite 1.12.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += futures-macro 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-sink 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-task 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += futures-util 0.3.24 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += fuzzy-matcher 0.3.7 # MIT
|
||||
MODCARGO_CRATES += fxhash 0.2.1 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += generic-array 0.14.6 # MIT
|
||||
MODCARGO_CRATES += getopts 0.2.21 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += getrandom 0.1.16 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += getrandom 0.2.7 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += gh-emoji 1.0.7 # MIT
|
||||
MODCARGO_CRATES += ghash 0.4.4 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += half 1.8.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hashbrown 0.12.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += headers 0.3.8 # MIT
|
||||
MODCARGO_CRATES += headers-core 0.2.0 # MIT
|
||||
MODCARGO_CRATES += heck 0.3.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += heck 0.4.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hermit-abi 0.1.19 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += hex 0.4.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hkdf 0.11.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += hkdf 0.12.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hmac 0.11.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hmac 0.12.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += hostname 0.3.1 # MIT
|
||||
MODCARGO_CRATES += http 0.2.8 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += http-body 0.4.5 # MIT
|
||||
MODCARGO_CRATES += httparse 1.8.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += httpdate 1.0.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += hyper 0.14.20 # MIT
|
||||
MODCARGO_CRATES += hyper-rustls 0.23.0 # Apache-2.0/ISC/MIT
|
||||
MODCARGO_CRATES += hyper-timeout 0.4.1 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += iana-time-zone 0.1.47 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += idna 0.2.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += indexmap 1.9.1 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += instant 0.1.12 # BSD-3-Clause
|
||||
MODCARGO_CRATES += itertools 0.9.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += itertools 0.10.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += itoa 1.0.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += js-sys 0.3.59 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += lazy_static 1.4.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += lexical-core 0.7.6 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += libc 0.2.132 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += linked-hash-map 0.5.6 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += lock_api 0.4.8 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += log 0.4.17 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += log-panics 2.1.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += lru-cache 0.1.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += mac-notification-sys 0.5.6 # MIT
|
||||
MODCARGO_CRATES += malloc_buf 0.0.6 # MIT
|
||||
MODCARGO_CRATES += match_cfg 0.1.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += matches 0.1.9 # MIT
|
||||
MODCARGO_CRATES += memchr 2.5.0 # Unlicense/MIT
|
||||
MODCARGO_CRATES += memoffset 0.6.5 # MIT
|
||||
MODCARGO_CRATES += mime 0.3.16 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += mime_guess 2.0.4 # MIT
|
||||
MODCARGO_CRATES += miniz_oxide 0.5.4 # MIT OR Zlib OR Apache-2.0
|
||||
MODCARGO_CRATES += mio 0.7.14 # MIT
|
||||
MODCARGO_CRATES += mio 0.8.4 # MIT
|
||||
MODCARGO_CRATES += miow 0.3.7 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += mpart-async 0.6.1 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += multimap 0.8.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += nix 0.23.1 # MIT
|
||||
MODCARGO_CRATES += nom 5.1.2 # MIT
|
||||
MODCARGO_CRATES += notify-rust 4.5.10 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += ntapi 0.3.7 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += num-integer 0.1.45 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += num-traits 0.2.15 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += num_cpus 1.13.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += num_enum 0.5.7 # BSD-3-Clause OR MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += num_enum_derive 0.5.7 # BSD-3-Clause OR MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += num_threads 0.1.6 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += objc 0.2.7 # MIT
|
||||
MODCARGO_CRATES += objc-foundation 0.1.1 # MIT
|
||||
MODCARGO_CRATES += objc_id 0.1.1 # MIT
|
||||
MODCARGO_CRATES += once_cell 1.14.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += oncemutex 0.1.1 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += oorandom 11.1.3 # MIT
|
||||
MODCARGO_CRATES += opaque-debug 0.3.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += opener 0.5.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += openssl-probe 0.1.5 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += ordered-stream 0.0.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += os_str_bytes 6.3.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += parking 2.0.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += parking_lot 0.11.2 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += parking_lot 0.12.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += parking_lot_core 0.8.5 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += parking_lot_core 0.9.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += percent-encoding 2.1.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += petgraph 0.6.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += phf 0.8.0 # MIT
|
||||
MODCARGO_CRATES += phf 0.11.1 # MIT
|
||||
MODCARGO_CRATES += phf_generator 0.8.0 # MIT
|
||||
MODCARGO_CRATES += phf_macros 0.8.0 # MIT
|
||||
MODCARGO_CRATES += phf_shared 0.8.0 # MIT
|
||||
MODCARGO_CRATES += phf_shared 0.11.1 # MIT
|
||||
MODCARGO_CRATES += phonenumber 0.3.1+8.12.9 # Apache-2.0
|
||||
MODCARGO_CRATES += pin-project-lite 0.2.9 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += pin-utils 0.1.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += plotters 0.3.4 # MIT
|
||||
MODCARGO_CRATES += plotters-backend 0.3.4 # MIT
|
||||
MODCARGO_CRATES += plotters-svg 0.3.3 # MIT
|
||||
MODCARGO_CRATES += polling 2.3.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += polyval 0.5.3 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += ppv-lite86 0.2.16 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += proc-macro-crate 1.2.1 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += proc-macro-error 1.0.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += proc-macro-error-attr 1.0.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += proc-macro-hack 0.5.19 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += proc-macro2 1.0.43 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += prost 0.9.0 # Apache-2.0
|
||||
MODCARGO_CRATES += prost 0.10.4 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-build 0.9.0 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-build 0.10.4 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-derive 0.9.0 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-derive 0.10.1 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-types 0.9.0 # Apache-2.0
|
||||
MODCARGO_CRATES += prost-types 0.10.1 # Apache-2.0
|
||||
MODCARGO_CRATES += pulldown-cmark 0.8.0 # MIT
|
||||
MODCARGO_CRATES += qr2term 0.3.0 # MPL-2.0
|
||||
MODCARGO_CRATES += qrcode 0.12.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += quick-xml 0.18.1 # MIT
|
||||
MODCARGO_CRATES += quick-xml 0.23.1 # MIT
|
||||
MODCARGO_CRATES += quickcheck 1.0.3 # Unlicense/MIT
|
||||
MODCARGO_CRATES += quickcheck_macros 1.0.0 # Unlicense/MIT
|
||||
MODCARGO_CRATES += quote 1.0.21 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand 0.7.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand 0.8.5 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand_chacha 0.2.2 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand_chacha 0.3.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand_core 0.5.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand_core 0.6.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rand_hc 0.2.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += rand_pcg 0.2.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rayon 1.5.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += rayon-core 1.9.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += redox_syscall 0.2.16 # MIT
|
||||
MODCARGO_CRATES += redox_users 0.4.3 # MIT
|
||||
MODCARGO_CRATES += regex 1.6.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += regex-automata 0.1.10 # Unlicense/MIT
|
||||
MODCARGO_CRATES += regex-cache 0.2.1 # MIT
|
||||
MODCARGO_CRATES += regex-syntax 0.6.27 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += remove_dir_all 0.5.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += ring 0.16.20 # LICENSE
|
||||
MODCARGO_CRATES += rustls 0.20.6 # Apache-2.0/ISC/MIT
|
||||
MODCARGO_CRATES += rustls-native-certs 0.6.2 # Apache-2.0/ISC/MIT
|
||||
MODCARGO_CRATES += rustls-pemfile 0.3.0 # Apache-2.0/ISC/MIT
|
||||
MODCARGO_CRATES += rustls-pemfile 1.0.1 # Apache-2.0 OR ISC OR MIT
|
||||
MODCARGO_CRATES += ryu 1.0.11 # Apache-2.0 OR BSL-1.0
|
||||
MODCARGO_CRATES += same-file 1.0.6 # Unlicense/MIT
|
||||
MODCARGO_CRATES += schannel 0.1.20 # MIT
|
||||
MODCARGO_CRATES += scopeguard 1.1.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += sct 0.7.0 # Apache-2.0/ISC/MIT
|
||||
MODCARGO_CRATES += security-framework 2.7.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += security-framework-sys 2.6.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += semver 1.0.13 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += serde 1.0.144 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += serde_derive 1.0.144 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += serde_json 1.0.85 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += serde_repr 0.1.9 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += sha-1 0.10.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += sha1 0.6.1 # BSD-3-Clause
|
||||
MODCARGO_CRATES += sha1 0.10.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += sha1_smol 1.0.0 # BSD-3-Clause
|
||||
MODCARGO_CRATES += sha2 0.9.9 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += sha2 0.10.5 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += sharded-slab 0.1.4 # MIT
|
||||
MODCARGO_CRATES += signal-hook 0.1.17 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += signal-hook 0.3.14 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += signal-hook-mio 0.2.3 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += signal-hook-registry 1.4.0 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += siphasher 0.3.10 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += slab 0.4.7 # MIT
|
||||
MODCARGO_CRATES += sled 0.34.7 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += smallvec 1.9.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += smawk 0.3.1 # MIT
|
||||
MODCARGO_CRATES += socket2 0.4.7 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += spin 0.5.2 # MIT
|
||||
MODCARGO_CRATES += static_assertions 1.1.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += strsim 0.10.0 # MIT
|
||||
MODCARGO_CRATES += strum 0.22.0 # MIT
|
||||
MODCARGO_CRATES += strum_macros 0.22.0 # MIT
|
||||
MODCARGO_CRATES += subtle 2.4.1 # BSD-3-Clause
|
||||
MODCARGO_CRATES += syn 1.0.99 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += synstructure 0.12.6 # MIT
|
||||
MODCARGO_CRATES += tar 0.4.38 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += tauri-winrt-notification 0.1.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += tempfile 3.3.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += termcolor 1.1.3 # Unlicense OR MIT
|
||||
MODCARGO_CRATES += textwrap 0.14.2 # MIT
|
||||
MODCARGO_CRATES += textwrap 0.16.0 # MIT
|
||||
MODCARGO_CRATES += thiserror 1.0.34 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += thiserror-impl 1.0.34 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += thread_local 1.1.4 # Apache-2.0/MIT
|
||||
MODCARGO_CRATES += time 0.1.44 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += time 0.3.14 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += tinytemplate 1.2.1 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += tinyvec 1.6.0 # Zlib OR Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += tinyvec_macros 0.1.0 # MIT OR Apache-2.0 OR Zlib
|
||||
MODCARGO_CRATES += tokio 1.21.0 # MIT
|
||||
MODCARGO_CRATES += tokio-io-timeout 1.2.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += tokio-macros 1.8.0 # MIT
|
||||
MODCARGO_CRATES += tokio-rustls 0.23.4 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += tokio-stream 0.1.9 # MIT
|
||||
MODCARGO_CRATES += tokio-util 0.6.10 # MIT
|
||||
MODCARGO_CRATES += toml 0.5.9 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += tower-service 0.3.2 # MIT
|
||||
MODCARGO_CRATES += tracing 0.1.36 # MIT
|
||||
MODCARGO_CRATES += tracing-appender 0.2.2 # MIT
|
||||
MODCARGO_CRATES += tracing-attributes 0.1.22 # MIT
|
||||
MODCARGO_CRATES += tracing-core 0.1.29 # MIT
|
||||
MODCARGO_CRATES += tracing-log 0.1.3 # MIT
|
||||
MODCARGO_CRATES += tracing-subscriber 0.3.15 # MIT
|
||||
MODCARGO_CRATES += try-lock 0.2.3 # MIT
|
||||
MODCARGO_CRATES += tui 0.15.0 # MIT
|
||||
MODCARGO_CRATES += tungstenite 0.17.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += typenum 1.15.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += uds_windows 1.0.2 # MIT
|
||||
MODCARGO_CRATES += unicase 2.6.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += unicode-bidi 0.3.8 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += unicode-ident 1.0.3 # (MIT OR Apache-2.0) AND Unicode-DFS-2016
|
||||
MODCARGO_CRATES += unicode-linebreak 0.1.2 # Apache-2.0
|
||||
MODCARGO_CRATES += unicode-normalization 0.1.21 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += unicode-segmentation 1.9.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += unicode-width 0.1.9 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += unicode-xid 0.2.3 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += universal-hash 0.4.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += untrusted 0.7.1 # ISC
|
||||
MODCARGO_CRATES += url 2.2.2 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += utf-8 0.7.6 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += uuid 1.2.1 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += valuable 0.1.0 # MIT
|
||||
MODCARGO_CRATES += version_check 0.9.4 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += waker-fn 1.1.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += walkdir 2.3.2 # Unlicense/MIT
|
||||
MODCARGO_CRATES += want 0.3.0 # MIT
|
||||
MODCARGO_CRATES += wasi 0.9.0+wasi-snapshot-preview1 # Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += wasi 0.10.0+wasi-snapshot-preview1 # Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += wasi 0.11.0+wasi-snapshot-preview1 # Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += wasm-bindgen 0.2.82 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += wasm-bindgen-backend 0.2.82 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += wasm-bindgen-macro 0.2.82 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += wasm-bindgen-macro-support 0.2.82 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += wasm-bindgen-shared 0.2.82 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += web-sys 0.3.59 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += webpki 0.22.0 # LICENSE
|
||||
MODCARGO_CRATES += wepoll-ffi 0.1.2 # MIT OR Apache-2.0 OR BSD-2-Clause
|
||||
MODCARGO_CRATES += which 4.3.0 # MIT
|
||||
MODCARGO_CRATES += whoami 1.2.1 # Apache-2.0 OR BSL-1.0 OR MIT
|
||||
MODCARGO_CRATES += winapi 0.3.9 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += winapi-i686-pc-windows-gnu 0.4.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += winapi-util 0.1.5 # Unlicense/MIT
|
||||
MODCARGO_CRATES += winapi-x86_64-pc-windows-gnu 0.4.0 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += windows 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows-sys 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_aarch64_msvc 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_aarch64_msvc 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_i686_gnu 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_i686_gnu 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_i686_msvc 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_i686_msvc 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_x86_64_gnu 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_x86_64_gnu 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_x86_64_msvc 0.36.1 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += windows_x86_64_msvc 0.39.0 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += x25519-dalek 1.2.0 # BSD-3-Clause
|
||||
MODCARGO_CRATES += xattr 0.2.3 # MIT/Apache-2.0
|
||||
MODCARGO_CRATES += xflags 0.2.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += xflags-macros 0.2.4 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += xshell 0.1.17 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += xshell-macros 0.1.17 # MIT OR Apache-2.0
|
||||
MODCARGO_CRATES += zbus 2.3.2 # MIT
|
||||
MODCARGO_CRATES += zbus_macros 2.3.2 # MIT
|
||||
MODCARGO_CRATES += zbus_names 2.2.0 # MIT
|
||||
MODCARGO_CRATES += zeroize 1.3.0 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += zeroize_derive 1.3.2 # Apache-2.0 OR MIT
|
||||
MODCARGO_CRATES += zvariant 3.6.0 # MIT
|
||||
MODCARGO_CRATES += zvariant_derive 3.6.0 # MIT
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,92 @@
|
|||
[package]
|
||||
name = "gurk"
|
||||
description = "Signal messenger client for terminal"
|
||||
version = "0.3.0"
|
||||
authors = ["boxdot <d@zerovolt.org>"]
|
||||
edition = "2021"
|
||||
keywords = ["signal", "tui"]
|
||||
repository = "https://github.com/boxdot/gurk-rs"
|
||||
license = "AGPL-3.0-only"
|
||||
categories = ["command-line-utilities"]
|
||||
resolver = "2"
|
||||
|
||||
[workspace]
|
||||
members = ["xtask"]
|
||||
|
||||
[profile.dev.package.miniz_oxide]
|
||||
# This speeds up `cargo xtask dist`.
|
||||
opt-level = 0
|
||||
|
||||
[profile.release]
|
||||
opt-level = 0
|
||||
debug = 0
|
||||
lto = false
|
||||
|
||||
[features]
|
||||
dev = ["prost", "base64"]
|
||||
|
||||
[dependencies]
|
||||
presage = { git = "https://github.com/whisperfish/presage", rev = "f84d958", default-features = false, features = ["sled-config-store"] }
|
||||
|
||||
anyhow = "1.0.40"
|
||||
async-trait = "0.1.51"
|
||||
chrono = { version = "0.4.22", features = ["serde"] }
|
||||
crossterm = { version = "0.19.0", features = ["event-stream"] }
|
||||
derivative = "2.2.0"
|
||||
dirs = "3.0.2"
|
||||
emoji = "0.2.1"
|
||||
gh-emoji = "1.0.3"
|
||||
hostname = "0.3.1"
|
||||
itertools = "0.10.0"
|
||||
log-panics = "2.0.0"
|
||||
mime_guess = "2.0.3"
|
||||
notify-rust = "4.5.8"
|
||||
opener = "0.5.0"
|
||||
phonenumber = "0.3.1"
|
||||
regex-automata = "0.1.10"
|
||||
scopeguard = "1.1.0"
|
||||
serde = { version = "1.0.125", features = ["derive"] }
|
||||
serde_json = "1.0.64"
|
||||
textwrap = "0.14.2"
|
||||
tokio = { version = "1.5.0", default-features = false, features = ["rt-multi-thread", "macros", "net", "time"] }
|
||||
tokio-stream = "0.1.5"
|
||||
toml = "0.5.8"
|
||||
tui = { version = "0.15.0", default-features = false, features = ["crossterm"] }
|
||||
unicode-width = "0.1.8"
|
||||
uuid = { version = "1.2", features = ["v4"] }
|
||||
whoami = "1.1.2"
|
||||
tracing = "0.1.35"
|
||||
tracing-appender = "0.2.2"
|
||||
tracing-subscriber = "0.3.11"
|
||||
futures-channel = "0.3.24"
|
||||
qr2term = "0.3.0"
|
||||
clap = { version = "4.0.18", features = ["derive"] }
|
||||
|
||||
# dev feature dependencies
|
||||
prost = { version = "0.10.0", optional = true }
|
||||
base64 = { version = "0.13.0", optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
quickcheck = "1.0.3"
|
||||
quickcheck_macros = "1.0.0"
|
||||
tempfile = "3.2.0"
|
||||
criterion = { version = "0.4", features = ["async_tokio", "html_reports"] }
|
||||
|
||||
[[bench]]
|
||||
name = "app"
|
||||
harness = false
|
||||
|
||||
# [patch."https://github.com/whisperfish/presage.git"]
|
||||
# presage = { path = "../presage" }
|
||||
|
||||
# [patch."https://github.com/whisperfish/libsignal-service-rs"]
|
||||
# libsignal-service = { path = "../libsignal-service-rs/libsignal-service" }
|
||||
# libsignal-service-hyper = { path = "../libsignal-service-rs/libsignal-service-hyper" }
|
||||
|
||||
# [patch."https://github.com/signalapp/libsignal-client"]
|
||||
# libsignal-protocol = { path = "../libsignal-client/rust/protocol" }
|
||||
|
||||
[patch.crates-io]
|
||||
# signal-protocol uses a fork of this library via the patch mechanism of cargo.
|
||||
# Since it is not transitive, we have to add the patch here explicitly.
|
||||
"curve25519-dalek" = { git = 'https://github.com/signalapp/curve25519-dalek', branch = 'lizard2' }
|
|
@ -1,31 +1,21 @@
|
|||
[source."https://github.com/boxdot/libsignal-service-rs"]
|
||||
git = "https://github.com/boxdot/libsignal-service-rs"
|
||||
rev = "8be91da2"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/boxdot/presage.git"]
|
||||
git = "https://github.com/boxdot/presage.git"
|
||||
rev = "f908e8f"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/signalapp/curve25519-dalek.git"]
|
||||
git = "https://github.com/signalapp/curve25519-dalek.git"
|
||||
[source."https://github.com/signalapp/curve25519-dalek"]
|
||||
git = "https://github.com/signalapp/curve25519-dalek"
|
||||
branch = "lizard2"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/signalapp/libsignal-client"]
|
||||
git = "https://github.com/signalapp/libsignal-client"
|
||||
tag = "v0.11.0"
|
||||
[source."https://github.com/signalapp/libsignal"]
|
||||
git = "https://github.com/signalapp/libsignal"
|
||||
tag = "v0.20.0"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/signalapp/poksho.git"]
|
||||
git = "https://github.com/signalapp/poksho.git"
|
||||
tag = "v0.7.0"
|
||||
[source."https://github.com/whisperfish/libsignal-service-rs"]
|
||||
git = "https://github.com/whisperfish/libsignal-service-rs"
|
||||
rev = "8666ba56f47e405aaf8ed243be6e2ad1b5ad68c1"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/signalapp/zkgroup"]
|
||||
git = "https://github.com/signalapp/zkgroup"
|
||||
tag = "v0.7.3"
|
||||
[source."https://github.com/whisperfish/presage"]
|
||||
git = "https://github.com/whisperfish/presage"
|
||||
rev = "f84d958"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source.vendored-sources]
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
[net]
|
||||
offline = true
|
||||
[source.modcargo]
|
||||
directory = '/data/pobj/gurk-rs-0.3.0/gurk-rs-0.3.0/modcargo-crates'
|
||||
[source.crates-io]
|
||||
replace-with = 'modcargo'
|
||||
|
||||
[profile.release]
|
||||
opt-level = 2
|
||||
debug = 0
|
||||
debug-assertions = false
|
||||
overflow-checks = false
|
||||
lto = false
|
||||
panic = 'unwind'
|
||||
incremental = false
|
||||
codegen-units = 4
|
||||
rpath = false
|
||||
|
||||
[profile.bench]
|
||||
opt-level = 2
|
||||
debug = 0
|
||||
debug-assertions = false
|
||||
overflow-checks = false
|
||||
lto = false
|
||||
panic = 'unwind'
|
||||
incremental = false
|
||||
codegen-units = 4
|
||||
rpath = false
|
||||
[source."https://github.com/signalapp/curve25519-dalek"]
|
||||
git = "https://github.com/signalapp/curve25519-dalek"
|
||||
branch = "lizard2"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/signalapp/libsignal"]
|
||||
git = "https://github.com/signalapp/libsignal"
|
||||
tag = "v0.20.0"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/whisperfish/libsignal-service-rs"]
|
||||
git = "https://github.com/whisperfish/libsignal-service-rs"
|
||||
rev = "8666ba56f47e405aaf8ed243be6e2ad1b5ad68c1"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source."https://github.com/whisperfish/presage"]
|
||||
git = "https://github.com/whisperfish/presage"
|
||||
rev = "f84d958"
|
||||
replace-with = "vendored-sources"
|
||||
|
||||
[source.vendored-sources]
|
||||
directory = "vendor"
|
File diff suppressed because one or more lines are too long
|
@ -1,104 +0,0 @@
|
|||
# Changelog
|
||||
|
||||
Entries are listed in reverse chronological order.
|
||||
|
||||
## 2.0.0
|
||||
|
||||
* Fix a data modeling error in the `serde` feature pointed out by Trevor Perrin
|
||||
which caused points and scalars to be serialized with length fields rather
|
||||
than as fixed-size 32-byte arrays. This is a breaking change, but it fixes
|
||||
compatibility with `serde-json` and ensures that the `serde-bincode` encoding
|
||||
matches the conventional encoding for X/Ed25519.
|
||||
* Update `rand_core` to `0.5`, allowing use with new `rand` versions.
|
||||
* Switch from `clear_on_drop` to `zeroize` (by Tony Arcieri).
|
||||
* Require `subtle = ^2.2.1` and remove the note advising nightly Rust, which is
|
||||
no longer required as of that version of `subtle`. See the `subtle`
|
||||
changelog for more details.
|
||||
* Update `README.md` for `2.x` series.
|
||||
* Remove the `build.rs` hack which loaded the entire crate into its own
|
||||
`build.rs` to generate constants, and keep the constants in the source code.
|
||||
|
||||
The only significant change is the data model change to the `serde` feature;
|
||||
besides the `rand_core` version bump, there are no other user-visible changes.
|
||||
|
||||
## 1.2.3
|
||||
|
||||
* Fix an issue identified by a Quarkslab audit (and Jack Grigg), where manually
|
||||
constructing unreduced `Scalar` values, as needed for X/Ed25519, and then
|
||||
performing scalar/scalar arithmetic could compute incorrect results.
|
||||
* Switch to upstream Rust intrinsics for the IFMA backend now that they exist in
|
||||
Rust and don't need to be defined locally.
|
||||
* Ensure that the NAF computation works correctly, even for parameters never
|
||||
used elsewhere in the codebase.
|
||||
* Minor refactoring to EdwardsPoint decompression.
|
||||
* Fix broken links in documentation.
|
||||
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
|
||||
root (not quite correctly done in 1.2.2).
|
||||
|
||||
## 1.2.2
|
||||
|
||||
* Fix a typo in an internal doc-comment.
|
||||
* Add the "crypto" tag to crate metadata.
|
||||
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
|
||||
root.
|
||||
|
||||
## 1.2.1
|
||||
|
||||
* Fix a bug in bucket index calculations in the Pippenger multiscalar algorithm
|
||||
for very large input sizes.
|
||||
* Add a more extensive randomized multiscalar multiplication consistency check
|
||||
to the test suite to prevent regressions.
|
||||
* Ensure that multiscalar and NAF computations work correctly on extremal
|
||||
`Scalar` values constructed via `from_bits`.
|
||||
|
||||
## 1.2.0
|
||||
|
||||
* New multiscalar multiplication algorithm with better performance for
|
||||
large problem sizes. The backend algorithm is selected
|
||||
transparently using the size hints of the input iterators, so no
|
||||
changes are required for client crates to start using it.
|
||||
* Equality of Edwards points is now checked in projective coordinates.
|
||||
* Serde can now be used with `no_std`.
|
||||
|
||||
## 1.1.4
|
||||
|
||||
* Fix typos in documentation comments.
|
||||
* Remove unnecessary `Default` bound on `Scalar::from_hash`.
|
||||
|
||||
## 1.1.3
|
||||
|
||||
* Reverts the change in 1.1.0 to allow owned and borrowed RNGs, which caused a breakage due to a subtle interaction with ownership rules. (The `RngCore` change is retained).
|
||||
|
||||
## 1.1.2
|
||||
|
||||
* Disabled KaTeX on `docs.rs` pending proper [support upstream](https://github.com/rust-lang/docs.rs/issues/302).
|
||||
|
||||
## 1.1.1
|
||||
|
||||
* Fixed an issue related to `#[cfg(rustdoc)]` which prevented documenting multiple backends.
|
||||
|
||||
## 1.1.0
|
||||
|
||||
* Adds support for precomputation for multiscalar multiplication.
|
||||
* Restructures the internal source tree into `serial` and `vector` backends (no change to external API).
|
||||
* Adds a new IFMA backend which sets speed records.
|
||||
* The `avx2_backend` feature is now an alias for the `simd_backend` feature, which autoselects an appropriate vector backend (currently AVX2 or IFMA).
|
||||
* Replaces the `rand` dependency with `rand_core`.
|
||||
* Generalizes trait bounds on `RistrettoPoint::random()` and `Scalar::random()` to allow owned and borrowed RNGs and to allow `RngCore` instead of `Rng`.
|
||||
|
||||
## 1.0.3
|
||||
|
||||
* Adds `ConstantTimeEq` implementation for compressed points.
|
||||
|
||||
## 1.0.2
|
||||
|
||||
* Fixes a typo in the naming of variables in Ristretto formulas (no change to functionality).
|
||||
|
||||
## 1.0.1
|
||||
|
||||
* Depends on the stable `2.0` version of `subtle` instead of `2.0.0-pre.0`.
|
||||
|
||||
## 1.0.0
|
||||
|
||||
Initial stable release. Yanked due to a dependency mistake (see above).
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
# Contributing to curve25519-dalek
|
||||
|
||||
If you have questions or comments, please feel free to email the
|
||||
authors.
|
||||
|
||||
For feature requests, suggestions, and bug reports, please open an issue on
|
||||
[our GitHub](https://github.com/dalek-cryptography/curve25519-dalek). (Or, send us
|
||||
an email if you're opposed to using GitHub for whatever reason.)
|
||||
|
||||
Patches are welcomed as pull requests on
|
||||
[our GitHub](https://github.com/dalek-cryptography/curve25519-dalek), as well as by
|
||||
email (preferably sent to all of the authors listed in `Cargo.toml`).
|
||||
|
||||
All issues on curve25519-dalek are mentored, if you want help with a bug just
|
||||
ask @isislovecruft or @hdevalence.
|
||||
|
||||
Some issues are easier than others. The `easy` label can be used to find the
|
||||
easy issues. If you want to work on an issue, please leave a comment so that we
|
||||
can assign it to you!
|
||||
|
||||
# Code of Conduct
|
||||
|
||||
We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html),
|
||||
with the following additional clauses:
|
||||
|
||||
* We respect the rights to privacy and anonymity for contributors and people in
|
||||
the community. If someone wishes to contribute under a pseudonym different to
|
||||
their primary identity, that wish is to be respected by all contributors.
|
|
@ -1,61 +0,0 @@
|
|||
[package]
|
||||
name = "curve25519-dalek"
|
||||
version = "2.0.0"
|
||||
authors = ["Isis Lovecruft <isis@patternsinthevoid.net>",
|
||||
"Henry de Valence <hdevalence@hdevalence.ca>"]
|
||||
readme = "README.md"
|
||||
license = "BSD-3-Clause"
|
||||
repository = "https://github.com/dalek-cryptography/curve25519-dalek"
|
||||
homepage = "https://dalek.rs/curve25519-dalek"
|
||||
documentation = "https://docs.rs/curve25519-dalek"
|
||||
categories = ["cryptography", "no-std"]
|
||||
keywords = ["cryptography", "crypto", "ristretto", "curve25519", "ristretto255"]
|
||||
description = "A pure-Rust implementation of group operations on ristretto255 and Curve25519"
|
||||
exclude = [
|
||||
"**/.gitignore",
|
||||
".gitignore",
|
||||
".travis.yml",
|
||||
]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
# Disabled for now since this is borked; tracking https://github.com/rust-lang/docs.rs/issues/302
|
||||
# rustdoc-args = ["--html-in-header", ".cargo/registry/src/github.com-1ecc6299db9ec823/curve25519-dalek-0.13.2/rustdoc-include-katex-header.html"]
|
||||
features = ["nightly", "simd_backend"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "dalek-cryptography/curve25519-dalek", branch = "master"}
|
||||
|
||||
[dev-dependencies]
|
||||
sha2 = { version = "0.8", default-features = false }
|
||||
bincode = "1"
|
||||
criterion = "0.3"
|
||||
rand = "0.7"
|
||||
|
||||
[[bench]]
|
||||
name = "dalek_benchmarks"
|
||||
harness = false
|
||||
|
||||
[dependencies]
|
||||
rand_core = { version = "0.5", default-features = false }
|
||||
byteorder = { version = "^1.2.3", default-features = false, features = ["i128"] }
|
||||
digest = { version = "0.8", default-features = false }
|
||||
subtle = { version = "^2.2.1", default-features = false }
|
||||
serde = { version = "1.0", default-features = false, optional = true, features = ["derive"] }
|
||||
packed_simd = { version = "0.3", features = ["into_bits"], optional = true }
|
||||
zeroize = { version = "1", default-features = false }
|
||||
|
||||
[features]
|
||||
nightly = ["subtle/nightly"]
|
||||
default = ["std", "u64_backend"]
|
||||
std = ["alloc", "subtle/std", "rand_core/std"]
|
||||
alloc = ["zeroize/alloc"]
|
||||
|
||||
# The u32 backend uses u32s with u64 products.
|
||||
u32_backend = []
|
||||
# The u64 backend uses u64s with u128 products.
|
||||
u64_backend = []
|
||||
# The SIMD backend uses parallel formulas, using either AVX2 or AVX512-IFMA.
|
||||
simd_backend = ["nightly", "u64_backend", "packed_simd"]
|
||||
# DEPRECATED: this is now an alias for `simd_backend` and may be removed
|
||||
# in some future release.
|
||||
avx2_backend = ["simd_backend"]
|
|
@ -1,64 +0,0 @@
|
|||
Copyright (c) 2016-2019 Isis Agora Lovecruft, Henry de Valence. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
========================================================================
|
||||
|
||||
Portions of curve25519-dalek were originally derived from Adam Langley's
|
||||
Go ed25519 implementation, found at <https://github.com/agl/ed25519/>,
|
||||
under the following licence:
|
||||
|
||||
========================================================================
|
||||
|
||||
Copyright (c) 2012 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,8 +0,0 @@
|
|||
FEATURES := nightly yolocrypto avx2_backend
|
||||
|
||||
doc:
|
||||
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html
|
||||
|
||||
doc-internal:
|
||||
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html --document-private-items
|
||||
|
|
@ -1,207 +0,0 @@
|
|||
|
||||
# curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek)
|
||||
|
||||
<img
|
||||
width="33%"
|
||||
align="right"
|
||||
src="https://doc.dalek.rs/assets/dalek-logo-clear.png"/>
|
||||
|
||||
**A pure-Rust implementation of group operations on Ristretto and Curve25519.**
|
||||
|
||||
`curve25519-dalek` is a library providing group operations on the Edwards and
|
||||
Montgomery forms of Curve25519, and on the prime-order Ristretto group.
|
||||
|
||||
`curve25519-dalek` is not intended to provide implementations of any particular
|
||||
crypto protocol. Rather, implementations of those protocols (such as
|
||||
[`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use
|
||||
`curve25519-dalek` as a library.
|
||||
|
||||
`curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use
|
||||
in implementing a wide range of ECC-based crypto protocols, such as key agreement,
|
||||
signatures, anonymous credentials, rangeproofs, and zero-knowledge proof
|
||||
systems.
|
||||
|
||||
In particular, `curve25519-dalek` implements Ristretto, which constructs a
|
||||
prime-order group from a non-prime-order Edwards curve. This provides the
|
||||
speed and safety benefits of Edwards curve arithmetic, without the pitfalls of
|
||||
cofactor-related abstraction mismatches.
|
||||
|
||||
# Documentation
|
||||
|
||||
The semver-stable, public-facing `curve25519-dalek` API is documented
|
||||
[here][docs-external]. In addition, the unstable internal implementation
|
||||
details are documented [here][docs-internal].
|
||||
|
||||
The `curve25519-dalek` documentation requires a custom HTML header to include
|
||||
KaTeX for math support. Unfortunately `cargo doc` does not currently support
|
||||
this, but docs can be built using
|
||||
```sh
|
||||
make doc
|
||||
make doc-internal
|
||||
```
|
||||
|
||||
# Use
|
||||
|
||||
To import `curve25519-dalek`, add the following to the dependencies section of
|
||||
your project's `Cargo.toml`:
|
||||
```toml
|
||||
curve25519-dalek = "2"
|
||||
```
|
||||
|
||||
The `2.x` series has an API almost entirely unchanged from the `1.x` series,
|
||||
except that:
|
||||
|
||||
* an error in the data modeling for the (optional) `serde` feature was
|
||||
corrected, so that when the `2.x`-series `serde` implementation is used
|
||||
with `serde-bincode`, the derived serialization matches the usual X/Ed25519
|
||||
formats;
|
||||
|
||||
* the `rand` version was updated.
|
||||
|
||||
See `CHANGELOG.md` for more details.
|
||||
|
||||
# Backends and Features
|
||||
|
||||
The `nightly` feature enables features available only when using a Rust nightly
|
||||
compiler. In particular, it is required for rendering documentation and for
|
||||
the SIMD backends.
|
||||
|
||||
Curve arithmetic is implemented using one of the following backends:
|
||||
|
||||
* a `u32` backend using serial formulas and `u64` products;
|
||||
* a `u64` backend using serial formulas and `u128` products;
|
||||
* an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records);
|
||||
* an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records);
|
||||
|
||||
By default the `u64` backend is selected. To select a specific backend, use:
|
||||
```sh
|
||||
cargo build --no-default-features --features "std u32_backend"
|
||||
cargo build --no-default-features --features "std u64_backend"
|
||||
# Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2
|
||||
cargo build --no-default-features --features "std simd_backend"
|
||||
# Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma
|
||||
cargo build --no-default-features --features "std simd_backend"
|
||||
```
|
||||
Crates using `curve25519-dalek` can either select a backend on behalf of their
|
||||
users, or expose feature flags that control the `curve25519-dalek` backend.
|
||||
|
||||
The `std` feature is enabled by default, but it can be disabled for no-`std`
|
||||
builds using `--no-default-features`. Note that this requires explicitly
|
||||
selecting an arithmetic backend using one of the `_backend` features.
|
||||
If no backend is selected, compilation will fail.
|
||||
|
||||
# Safety
|
||||
|
||||
The `curve25519-dalek` types are designed to make illegal states
|
||||
unrepresentable. For example, any instance of an `EdwardsPoint` is
|
||||
guaranteed to hold a point on the Edwards curve, and any instance of a
|
||||
`RistrettoPoint` is guaranteed to hold a valid point in the Ristretto
|
||||
group.
|
||||
|
||||
All operations are implemented using constant-time logic (no
|
||||
secret-dependent branches, no secret-dependent memory accesses),
|
||||
unless specifically marked as being variable-time code.
|
||||
We believe that our constant-time logic is lowered to constant-time
|
||||
assembly, at least on `x86_64` targets.
|
||||
|
||||
As an additional guard against possible future compiler optimizations,
|
||||
the `subtle` crate places an optimization barrier before every
|
||||
conditional move or assignment. More details can be found in [the
|
||||
documentation for the `subtle` crate][subtle_doc].
|
||||
|
||||
Some functionality (e.g., multiscalar multiplication or batch
|
||||
inversion) requires heap allocation for temporary buffers. All
|
||||
heap-allocated buffers of potentially secret data are explicitly
|
||||
zeroed before release.
|
||||
|
||||
However, we do not attempt to zero stack data, for two reasons.
|
||||
First, it's not possible to do so correctly: we don't have control
|
||||
over stack allocations, so there's no way to know how much data to
|
||||
wipe. Second, because `curve25519-dalek` provides a mid-level API,
|
||||
the correct place to start zeroing stack data is likely not at the
|
||||
entrypoints of `curve25519-dalek` functions, but at the entrypoints of
|
||||
functions in other crates.
|
||||
|
||||
The implementation is memory-safe, and contains no significant
|
||||
`unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD
|
||||
intrinsics. These are marked `unsafe` only because invoking them on an
|
||||
inappropriate CPU would cause `SIGILL`, but the entire backend is only
|
||||
compiled with appropriate `target_feature`s, so this cannot occur.
|
||||
|
||||
# Performance
|
||||
|
||||
Benchmarks are run using [`criterion.rs`][criterion]:
|
||||
|
||||
```sh
|
||||
cargo bench --no-default-features --features "std u32_backend"
|
||||
cargo bench --no-default-features --features "std u64_backend"
|
||||
# Uses avx2 or ifma only if compiled for an appropriate target.
|
||||
export RUSTFLAGS="-C target_cpu=native"
|
||||
cargo bench --no-default-features --features "std simd_backend"
|
||||
```
|
||||
|
||||
Performance is a secondary goal behind correctness, safety, and
|
||||
clarity, but we aim to be competitive with other implementations.
|
||||
|
||||
# FFI
|
||||
|
||||
Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The
|
||||
reason is that we use Rust features to provide an API that maintains safety
|
||||
invariants, which are not possible to maintain across an FFI boundary. For
|
||||
instance, as described in the _Safety_ section above, invalid points are
|
||||
impossible to construct, and this would not be the case if we exposed point
|
||||
operations over FFI.
|
||||
|
||||
However, `curve25519-dalek` is designed as a *mid-level* API, aimed at
|
||||
implementing other, higher-level primitives. Instead of providing FFI at the
|
||||
mid-level, our suggestion is to implement the higher-level primitive (a
|
||||
signature, PAKE, ZKP, etc.) in Rust, using `curve25519-dalek` as a dependency,
|
||||
and have that crate provide a minimal, byte-buffer-oriented FFI specific to
|
||||
that primitive.
|
||||
|
||||
# Contributing
|
||||
|
||||
Please see [CONTRIBUTING.md][contributing].
|
||||
|
||||
Patches and pull requests should be made against the `develop`
|
||||
branch, **not** `master`.
|
||||
|
||||
# About
|
||||
|
||||
**SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in
|
||||
his second full episode, "Into the Dalek". A beleaguered ship of the "Combined
|
||||
Galactic Resistance" has discovered a broken Dalek that has turned "good",
|
||||
desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers
|
||||
are miniaturized and enter the Dalek, which the Doctor names Rusty. They
|
||||
repair the damage, but accidentally restore it to its original nature, causing
|
||||
it to go on the rampage and alert the Dalek fleet to the whereabouts of the
|
||||
rebel ship. However, the Doctor manages to return Rusty to its previous state
|
||||
by linking his mind with the Dalek's: Rusty shares the Doctor's view of the
|
||||
universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the
|
||||
other Daleks and departs the ship, determined to track down and bring an end
|
||||
to the Dalek race.*
|
||||
|
||||
`curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence.
|
||||
|
||||
Portions of this library were originally a port of [Adam Langley's
|
||||
Golang ed25519 library](https://github.com/agl/ed25519), which was in
|
||||
turn a port of the reference `ref10` implementation. Most of this code,
|
||||
including the 32-bit field arithmetic, has since been rewritten.
|
||||
|
||||
The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and
|
||||
the addition chain for scalar inversion was provided by Brian Smith. The
|
||||
optimised batch inversion was contributed by Sean Bowe and Daira Hopwood.
|
||||
|
||||
The `no_std` and `zeroize` support was contributed by Tony Arcieri.
|
||||
|
||||
Thanks also to Ashley Hauck, Lucas Salibian, and Manish Goregaokar for their
|
||||
contributions.
|
||||
|
||||
[ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek
|
||||
[x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek
|
||||
[contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md
|
||||
[docs-external]: https://doc.dalek.rs/curve25519_dalek/
|
||||
[docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/
|
||||
[criterion]: https://github.com/japaric/criterion.rs
|
||||
[parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html
|
||||
[subtle_doc]: https://doc.dalek.rs/subtle/
|
|
@ -1,339 +0,0 @@
|
|||
#![allow(non_snake_case)]
|
||||
|
||||
extern crate rand;
|
||||
use rand::rngs::OsRng;
|
||||
use rand::thread_rng;
|
||||
|
||||
#[macro_use]
|
||||
extern crate criterion;
|
||||
|
||||
use criterion::BatchSize;
|
||||
use criterion::Criterion;
|
||||
|
||||
extern crate curve25519_dalek;
|
||||
|
||||
use curve25519_dalek::constants;
|
||||
use curve25519_dalek::scalar::Scalar;
|
||||
use curve25519_dalek::field::FieldElement;
|
||||
|
||||
static BATCH_SIZES: [usize; 5] = [1, 2, 4, 8, 16];
|
||||
static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768, 1024];
|
||||
|
||||
mod edwards_benches {
|
||||
use super::*;
|
||||
|
||||
use curve25519_dalek::edwards::EdwardsPoint;
|
||||
|
||||
fn compress(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_POINT;
|
||||
c.bench_function("EdwardsPoint compression", move |b| b.iter(|| B.compress()));
|
||||
}
|
||||
|
||||
fn decompress(c: &mut Criterion) {
|
||||
let B_comp = &constants::ED25519_BASEPOINT_COMPRESSED;
|
||||
c.bench_function("EdwardsPoint decompression", move |b| {
|
||||
b.iter(|| B_comp.decompress().unwrap())
|
||||
});
|
||||
}
|
||||
|
||||
fn consttime_fixed_base_scalar_mul(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_TABLE;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
c.bench_function("Constant-time fixed-base scalar mul", move |b| {
|
||||
b.iter(|| B * &s)
|
||||
});
|
||||
}
|
||||
|
||||
fn consttime_variable_base_scalar_mul(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_POINT;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
c.bench_function("Constant-time variable-base scalar mul", move |b| {
|
||||
b.iter(|| B * s)
|
||||
});
|
||||
}
|
||||
|
||||
fn vartime_double_base_scalar_mul(c: &mut Criterion) {
|
||||
c.bench_function("Variable-time aA+bB, A variable, B fixed", |bench| {
|
||||
let mut rng = thread_rng();
|
||||
let A = &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE;
|
||||
bench.iter_batched(
|
||||
|| (Scalar::random(&mut rng), Scalar::random(&mut rng)),
|
||||
|(a, b)| EdwardsPoint::vartime_double_scalar_mul_basepoint(&a, &A, &b),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = edwards_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
compress,
|
||||
decompress,
|
||||
consttime_fixed_base_scalar_mul,
|
||||
consttime_variable_base_scalar_mul,
|
||||
vartime_double_base_scalar_mul,
|
||||
}
|
||||
}
|
||||
|
||||
mod multiscalar_benches {
|
||||
use super::*;
|
||||
|
||||
use curve25519_dalek::edwards::EdwardsPoint;
|
||||
use curve25519_dalek::edwards::VartimeEdwardsPrecomputation;
|
||||
use curve25519_dalek::traits::MultiscalarMul;
|
||||
use curve25519_dalek::traits::VartimeMultiscalarMul;
|
||||
use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul;
|
||||
|
||||
fn construct_scalars(n: usize) -> Vec<Scalar> {
|
||||
let mut rng = thread_rng();
|
||||
(0..n).map(|_| Scalar::random(&mut rng)).collect()
|
||||
}
|
||||
|
||||
fn construct_points(n: usize) -> Vec<EdwardsPoint> {
|
||||
let mut rng = thread_rng();
|
||||
(0..n)
|
||||
.map(|_| &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn construct(n: usize) -> (Vec<Scalar>, Vec<EdwardsPoint>) {
|
||||
(construct_scalars(n), construct_points(n))
|
||||
}
|
||||
|
||||
fn consttime_multiscalar_mul(c: &mut Criterion) {
|
||||
c.bench_function_over_inputs(
|
||||
"Constant-time variable-base multiscalar multiplication",
|
||||
|b, &&size| {
|
||||
let points = construct_points(size);
|
||||
// This is supposed to be constant-time, but we might as well
|
||||
// rerandomize the scalars for every call just in case.
|
||||
b.iter_batched(
|
||||
|| construct_scalars(size),
|
||||
|scalars| EdwardsPoint::multiscalar_mul(&scalars, &points),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
&MULTISCALAR_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
fn vartime_multiscalar_mul(c: &mut Criterion) {
|
||||
c.bench_function_over_inputs(
|
||||
"Variable-time variable-base multiscalar multiplication",
|
||||
|b, &&size| {
|
||||
let points = construct_points(size);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels).
|
||||
b.iter_batched(
|
||||
|| construct_scalars(size),
|
||||
|scalars| EdwardsPoint::vartime_multiscalar_mul(&scalars, &points),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
&MULTISCALAR_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
fn vartime_precomputed_pure_static(c: &mut Criterion) {
|
||||
c.bench_function_over_inputs(
|
||||
"Variable-time fixed-base multiscalar multiplication",
|
||||
move |b, &&total_size| {
|
||||
let static_size = total_size;
|
||||
|
||||
let static_points = construct_points(static_size);
|
||||
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels).
|
||||
b.iter_batched(
|
||||
|| construct_scalars(static_size),
|
||||
|scalars| precomp.vartime_multiscalar_mul(&scalars),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
&MULTISCALAR_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
fn vartime_precomputed_helper(c: &mut Criterion, dynamic_fraction: f64) {
|
||||
let label = format!(
|
||||
"Variable-time mixed-base multiscalar multiplication ({:.0}pct dyn)",
|
||||
100.0 * dynamic_fraction,
|
||||
);
|
||||
c.bench_function_over_inputs(
|
||||
&label,
|
||||
move |b, &&total_size| {
|
||||
let dynamic_size = ((total_size as f64) * dynamic_fraction) as usize;
|
||||
let static_size = total_size - dynamic_size;
|
||||
|
||||
let static_points = construct_points(static_size);
|
||||
let dynamic_points = construct_points(dynamic_size);
|
||||
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels). Timings
|
||||
// should be independent of points so we don't
|
||||
// randomize them.
|
||||
b.iter_batched(
|
||||
|| {
|
||||
(
|
||||
construct_scalars(static_size),
|
||||
construct_scalars(dynamic_size),
|
||||
)
|
||||
},
|
||||
|(static_scalars, dynamic_scalars)| {
|
||||
precomp.vartime_mixed_multiscalar_mul(
|
||||
&static_scalars,
|
||||
&dynamic_scalars,
|
||||
&dynamic_points,
|
||||
)
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
&MULTISCALAR_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
fn vartime_precomputed_00_pct_dynamic(c: &mut Criterion) {
|
||||
vartime_precomputed_helper(c, 0.0);
|
||||
}
|
||||
|
||||
fn vartime_precomputed_20_pct_dynamic(c: &mut Criterion) {
|
||||
vartime_precomputed_helper(c, 0.2);
|
||||
}
|
||||
|
||||
fn vartime_precomputed_50_pct_dynamic(c: &mut Criterion) {
|
||||
vartime_precomputed_helper(c, 0.5);
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = multiscalar_benches;
|
||||
// Lower the sample size to run the benchmarks faster
|
||||
config = Criterion::default().sample_size(15);
|
||||
targets =
|
||||
consttime_multiscalar_mul,
|
||||
vartime_multiscalar_mul,
|
||||
vartime_precomputed_pure_static,
|
||||
vartime_precomputed_00_pct_dynamic,
|
||||
vartime_precomputed_20_pct_dynamic,
|
||||
vartime_precomputed_50_pct_dynamic,
|
||||
}
|
||||
}
|
||||
|
||||
mod ristretto_benches {
|
||||
use super::*;
|
||||
use curve25519_dalek::ristretto::RistrettoPoint;
|
||||
|
||||
fn compress(c: &mut Criterion) {
|
||||
c.bench_function("RistrettoPoint compression", |b| {
|
||||
let B = &constants::RISTRETTO_BASEPOINT_POINT;
|
||||
b.iter(|| B.compress())
|
||||
});
|
||||
}
|
||||
|
||||
fn decompress(c: &mut Criterion) {
|
||||
c.bench_function("RistrettoPoint decompression", |b| {
|
||||
let B_comp = &constants::RISTRETTO_BASEPOINT_COMPRESSED;
|
||||
b.iter(|| B_comp.decompress().unwrap())
|
||||
});
|
||||
}
|
||||
|
||||
fn elligator(c: &mut Criterion) {
|
||||
let fe_bytes = [0u8; 32];
|
||||
let fe = FieldElement::from_bytes(&fe_bytes);
|
||||
|
||||
c.bench_function("RistrettoPoint Elligator", |b| {
|
||||
b.iter(|| RistrettoPoint::elligator_ristretto_flavor(&fe));
|
||||
});
|
||||
}
|
||||
|
||||
fn double_and_compress_batch(c: &mut Criterion) {
|
||||
c.bench_function_over_inputs(
|
||||
"Batch Ristretto double-and-encode",
|
||||
|b, &&size| {
|
||||
let mut rng = OsRng;
|
||||
let points: Vec<RistrettoPoint> = (0..size)
|
||||
.map(|_| RistrettoPoint::random(&mut rng))
|
||||
.collect();
|
||||
b.iter(|| RistrettoPoint::double_and_compress_batch(&points));
|
||||
},
|
||||
&BATCH_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = ristretto_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
compress,
|
||||
decompress,
|
||||
elligator,
|
||||
double_and_compress_batch,
|
||||
}
|
||||
}
|
||||
|
||||
mod montgomery_benches {
|
||||
use super::*;
|
||||
|
||||
fn montgomery_ladder(c: &mut Criterion) {
|
||||
c.bench_function("Montgomery pseudomultiplication", |b| {
|
||||
let B = constants::X25519_BASEPOINT;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
b.iter(|| B * s);
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = montgomery_benches;
|
||||
config = Criterion::default();
|
||||
targets = montgomery_ladder,
|
||||
}
|
||||
}
|
||||
|
||||
mod scalar_benches {
|
||||
use super::*;
|
||||
|
||||
fn scalar_inversion(c: &mut Criterion) {
|
||||
c.bench_function("Scalar inversion", |b| {
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
b.iter(|| s.invert());
|
||||
});
|
||||
}
|
||||
|
||||
fn batch_scalar_inversion(c: &mut Criterion) {
|
||||
c.bench_function_over_inputs(
|
||||
"Batch scalar inversion",
|
||||
|b, &&size| {
|
||||
let mut rng = OsRng;
|
||||
let scalars: Vec<Scalar> = (0..size).map(|_| Scalar::random(&mut rng)).collect();
|
||||
b.iter(|| {
|
||||
let mut s = scalars.clone();
|
||||
Scalar::batch_invert(&mut s);
|
||||
});
|
||||
},
|
||||
&BATCH_SIZES,
|
||||
);
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = scalar_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
scalar_inversion,
|
||||
batch_scalar_inversion,
|
||||
}
|
||||
}
|
||||
|
||||
criterion_main!(
|
||||
scalar_benches::scalar_benches,
|
||||
montgomery_benches::montgomery_benches,
|
||||
ristretto_benches::ristretto_benches,
|
||||
edwards_benches::edwards_benches,
|
||||
multiscalar_benches::multiscalar_benches,
|
||||
);
|
Binary file not shown.
Before Width: | Height: | Size: 110 KiB |
Binary file not shown.
Before Width: | Height: | Size: 107 KiB |
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 59 KiB |
|
@ -1,10 +0,0 @@
|
|||
<link rel="stylesheet" href="https://doc.dalek.rs/assets/katex/katex.min.css">
|
||||
<script src="https://doc.dalek.rs/assets/katex/katex.min.js"></script>
|
||||
<script src="https://doc.dalek.rs/assets/katex/contrib/auto-render.min.js"></script>
|
||||
<script>
|
||||
document.addEventListener("DOMContentLoaded", function() { renderMathInElement(document.body); });
|
||||
</script>
|
||||
<style>
|
||||
.katex { font-size: 1em !important; }
|
||||
pre.rust, .docblock code, .docblock-short code { font-size: 0.85em !important; }
|
||||
</style>
|
|
@ -1,140 +0,0 @@
|
|||
An AVX2 implementation of the vectorized point operation strategy.
|
||||
|
||||
# Field element representation
|
||||
|
||||
Our strategy is to implement 4-wide multiplication and squaring by
|
||||
wordslicing, using one 64-bit AVX2 lane for each field element. Field
|
||||
elements are represented in the usual way as 10 `u32` limbs in radix
|
||||
\\(25.5\\) (i.e., alternating between \\(2\^{26}\\) for even limbs and
|
||||
\\(2\^{25}\\) for odd limbs). This has the effect that passing between
|
||||
the parallel 32-bit AVX2 representation and the serial 64-bit
|
||||
representation (which uses radix \\(2^{51}\\)) amounts to regrouping
|
||||
digits.
|
||||
|
||||
The field element representation is oriented around the AVX2
|
||||
`vpmuludq` instruction, which multiplies the low 32 bits of each
|
||||
64-bit lane of each operand to produce a 64-bit result.
|
||||
|
||||
```text,no_run
|
||||
(a1 ?? b1 ?? c1 ?? d1 ??)
|
||||
(a2 ?? b2 ?? c2 ?? d2 ??)
|
||||
|
||||
(a1*a2 b1*b2 c1*c2 d1*d2)
|
||||
```
|
||||
|
||||
To unpack 32-bit values into 64-bit lanes for use in multiplication
|
||||
it would be convenient to use the `vpunpck[lh]dq` instructions,
|
||||
which unpack and interleave the low and high 32-bit lanes of two
|
||||
source vectors.
|
||||
However, the AVX2 versions of these instructions are designed to
|
||||
operate only within 128-bit lanes of the 256-bit vectors, so that
|
||||
interleaving the low lanes of `(a0 b0 c0 d0 a1 b1 c1 d1)` with zero
|
||||
gives `(a0 00 b0 00 a1 00 b1 00)`. Instead, we pre-shuffle the data
|
||||
layout as `(a0 b0 a1 b1 c0 d0 c1 d1)` so that we can unpack the
|
||||
"low" and "high" parts as
|
||||
|
||||
```text,no_run
|
||||
(a0 00 b0 00 c0 00 d0 00)
|
||||
(a1 00 b1 00 c1 00 d1 00)
|
||||
```
|
||||
|
||||
The data layout for a vector of four field elements \\( (a,b,c,d)
|
||||
\\) with limbs \\( a_0, a_1, \ldots, a_9 \\) is stored as `[u32x8; 5]` in
|
||||
the form
|
||||
|
||||
```text,no_run
|
||||
(a0 b0 a1 b1 c0 d0 c1 d1)
|
||||
(a2 b2 a3 b3 c2 d2 c3 d3)
|
||||
(a4 b4 a5 b5 c4 d4 c5 d5)
|
||||
(a6 b6 a7 b7 c6 d6 c7 d7)
|
||||
(a8 b8 a9 b9 c8 d8 c9 d9)
|
||||
```
|
||||
|
||||
Since this breaks cleanly into two 128-bit lanes, it may be possible
|
||||
to adapt it to 128-bit vector instructions such as NEON without too
|
||||
much difficulty.
|
||||
|
||||
# Avoiding Overflow in Doubling
|
||||
|
||||
To analyze the size of the field element coefficients during the
|
||||
computations, we can parameterize the bounds on the limbs of each
|
||||
field element by \\( b \in \mathbb R \\) representing the excess bits
|
||||
above that limb's radix, so that each limb is bounded by either
|
||||
\\(2\^{25+b} \\) or \\( 2\^{26+b} \\), as appropriate.
|
||||
|
||||
The multiplication routine requires that its inputs are bounded with
|
||||
\\( b < 1.75 \\), in order to fit a multiplication by \\( 19 \\)
|
||||
into 32 bits. Since \\( \lg 19 < 4.25 \\), \\( 19x < 2\^{32} \\)
|
||||
when \\( x < 2\^{27.75} = 2\^{26 + 1.75} \\). However, this is only
|
||||
required for one of the inputs; the other can grow up to \\( b < 2.5
|
||||
\\).
|
||||
|
||||
In addition, the multiplication and squaring routines do not
|
||||
canonically reduce their outputs, but can leave some small uncarried
|
||||
excesses, so that their reduced outputs are bounded with
|
||||
\\( b < 0.007 \\).
|
||||
|
||||
The non-parallel portion of the doubling formulas is
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
|
||||
&\gets
|
||||
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
|
||||
\end{aligned}
|
||||
$$
|
||||
|
||||
Computing \\( (S\_5, S\_6, S\_8, S\_9 ) \\) as
|
||||
$$
|
||||
\begin{matrix}
|
||||
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
|
||||
+& S\_2 & & & S\_2 \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & 2p & 2p & 2p \\\\
|
||||
-& & S\_2 & S\_2 & \\\\
|
||||
-& & & & S\_4 \\\\
|
||||
=& S\_5 & S\_6 & S\_8 & S\_9
|
||||
\end{matrix}
|
||||
$$
|
||||
results in bit-excesses \\( < (1.01, 1.60, 2.33, 2.01)\\) for
|
||||
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
|
||||
are then
|
||||
$$
|
||||
\begin{aligned}
|
||||
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 2.01) \\\\
|
||||
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
|
||||
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
|
||||
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 2.01)
|
||||
\end{aligned}
|
||||
$$
|
||||
which are too large: it's not possible to arrange the multiplicands so
|
||||
that one vector has \\(b < 2.5\\) and the other has \\( b < 1.75 \\).
|
||||
However, if we flip the sign of \\( S\_4 = S\_0\^2 \\) during
|
||||
squaring, so that we output \\(S\_4' = -S\_4 \pmod p\\), then we can
|
||||
compute
|
||||
$$
|
||||
\begin{matrix}
|
||||
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
|
||||
+& S\_2 & & & S\_2 \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & & S\_4' \\\\
|
||||
+& & 2p & 2p & \\\\
|
||||
-& & S\_2 & S\_2 & \\\\
|
||||
=& S\_5 & S\_6 & S\_8 & S\_9
|
||||
\end{matrix}
|
||||
$$
|
||||
resulting in bit-excesses \\( < (1.01, 1.60, 2.33, 1.60)\\) for
|
||||
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
|
||||
are then
|
||||
$$
|
||||
\begin{aligned}
|
||||
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 1.60) \\\\
|
||||
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
|
||||
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
|
||||
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 1.60)
|
||||
\end{aligned}
|
||||
$$
|
||||
whose right-hand sides are all bounded with \\( b < 1.75 \\) and
|
||||
whose left-hand sides are all bounded with \\( b < 2.5 \\),
|
||||
so that we can avoid any intermediate reductions.
|
|
@ -1,580 +0,0 @@
|
|||
An AVX512-IFMA implementation of the vectorized point operation
|
||||
strategy.
|
||||
|
||||
# IFMA instructions
|
||||
|
||||
AVX512-IFMA is an extension to AVX-512 consisting of two instructions:
|
||||
|
||||
* `vpmadd52luq`: packed multiply of unsigned 52-bit integers and add
|
||||
the low 52 product bits to 64-bit accumulators;
|
||||
* `vpmadd52huq`: packed multiply of unsigned 52-bit integers and add
|
||||
the high 52 product bits to 64-bit accumulators;
|
||||
|
||||
These operate on 64-bit lanes of their source vectors, taking the low
|
||||
52 bits of each lane of each source vector, computing the 104-bit
|
||||
products of each pair, and then adding either the high or low 52 bits
|
||||
of the 104-bit products to the 64-bit lanes of the destination vector.
|
||||
The multiplication is performed internally by reusing circuitry for
|
||||
floating-point arithmetic. Although these instructions are part of
|
||||
AVX512, the AVX512VL (vector length) extension (present whenever IFMA
|
||||
is) allows using them with 512, 256, or 128-bit operands.
|
||||
|
||||
This provides a major advantage to vectorized integer operations:
|
||||
previously, vector operations could only use a \\(32 \times 32
|
||||
\rightarrow 64\\)-bit multiplier, while serial code could use a
|
||||
\\(64\times 64 \rightarrow 128\\)-bit multiplier.
|
||||
|
||||
## IFMA for big-integer multiplications
|
||||
|
||||
A detailed example of the intended use of the IFMA instructions can be
|
||||
found in a 2016 paper by Gueron and Krasnov, [_Accelerating Big
|
||||
Integer Arithmetic Using Intel IFMA Extensions_][2016_gueron_krasnov].
|
||||
The basic idea is that multiplication of large integers (such as 1024,
|
||||
2048, or more bits) can be performed as follows.
|
||||
|
||||
First, convert a “packed” 64-bit representation
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x'_0 + x'_1 2^{64} + x'_2 2^{128} + \cdots \\\\
|
||||
y &= y'_0 + y'_1 2^{64} + y'_2 2^{128} + \cdots
|
||||
\end{aligned}
|
||||
\\]
|
||||
into a “redundant” 52-bit representation
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{52} + x_2 2^{104} + \cdots \\\\
|
||||
y &= y_0 + y_1 2^{52} + y_2 2^{104} + \cdots
|
||||
\end{aligned}
|
||||
\\]
|
||||
with each \\(x_i, y_j\\) in a 64-bit lane.
|
||||
|
||||
Writing the product as \\(z = z_0 + z_1 2^{52} + z_2 2^{104} + \cdots\\),
|
||||
the “schoolbook” multiplication strategy gives
|
||||
\\[
|
||||
\begin{aligned}
|
||||
&z_0 &&=& x_0 & y_0 & & & & & & & & \\\\
|
||||
&z_1 &&=& x_1 & y_0 &+ x_0 & y_1 & & & & & & \\\\
|
||||
&z_2 &&=& x_2 & y_0 &+ x_1 & y_1 &+ x_0 & y_2 & & & & \\\\
|
||||
&z_3 &&=& x_3 & y_0 &+ x_2 & y_1 &+ x_1 & y_2 &+ x_0 & y_3 & & \\\\
|
||||
&z_4 &&=& \vdots\\;&\\;\vdots &+ x_3 & y_1 &+ x_2 & y_2 &+ x_1 & y_3 &+ \cdots& \\\\
|
||||
&z_5 &&=& & & \vdots\\;&\\;\vdots &+ x_3 & y_2 &+ x_2 & y_3 &+ \cdots& \\\\
|
||||
&z_6 &&=& & & & & \vdots\\;&\\;\vdots &+ x_3 & y_3 &+ \cdots& \\\\
|
||||
&z_7 &&=& & & & & & & \vdots\\;&\\;\vdots &+ \cdots& \\\\
|
||||
&\vdots&&=& & & & & & & & & \ddots& \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Notice that the product coefficient \\(z_k\\), representing the value
|
||||
\\(z_k 2^{52k}\\), is the sum of all product terms
|
||||
\\(
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
\\)
|
||||
with \\(k = i + j\\).
|
||||
Write the IFMA operators \\(\mathrm{lo}(a,b)\\), denoting the low
|
||||
\\(52\\) bits of \\(ab\\), and
|
||||
\\(\mathrm{hi}(a,b)\\), denoting the high \\(52\\) bits of
|
||||
\\(ab\\).
|
||||
Now we can rewrite the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
&=
|
||||
2^{52 (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
This means that the low half of \\(x_i y_j\\) can be accumulated onto
|
||||
the product limb \\(z_{i+j}\\) and the high half can be directly
|
||||
accumulated onto the next-higher product limb \\(z_{i+j+1}\\) with no
|
||||
additional operations. This allows rewriting the schoolbook
|
||||
multiplication into the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
&z_0 &&=& \mathrm{lo}(x_0,&y_0) & & & & & & & & & & \\\\
|
||||
&z_1 &&=& \mathrm{lo}(x_1,&y_0) &+\mathrm{hi}(x_0,&y_0) &+\mathrm{lo}(x_0,&y_1) & & & & & & \\\\
|
||||
&z_2 &&=& \mathrm{lo}(x_2,&y_0) &+\mathrm{hi}(x_1,&y_0) &+\mathrm{lo}(x_1,&y_1) &+\mathrm{hi}(x_0,&y_1) &+\mathrm{lo}(x_0,&y_2) & & \\\\
|
||||
&z_3 &&=& \mathrm{lo}(x_3,&y_0) &+\mathrm{hi}(x_2,&y_0) &+\mathrm{lo}(x_2,&y_1) &+\mathrm{hi}(x_1,&y_1) &+\mathrm{lo}(x_1,&y_2) &+ \cdots& \\\\
|
||||
&z_4 &&=& \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_0) &+\mathrm{lo}(x_3,&y_1) &+\mathrm{hi}(x_2,&y_1) &+\mathrm{lo}(x_2,&y_2) &+ \cdots& \\\\
|
||||
&z_5 &&=& & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_1) &+\mathrm{lo}(x_3,&y_2) &+ \cdots& \\\\
|
||||
&z_6 &&=& & & & & & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+ \cdots& \\\\
|
||||
&\vdots&&=& & & & & & & & & & & \ddots& \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Gueron and Krasnov implement multiplication by constructing vectors
|
||||
out of the columns of this diagram, so that the source operands for
|
||||
the IFMA instructions are of the form \\((x_0, x_1, x_2, \ldots)\\)
|
||||
and \\((y_i, y_i, y_i, \ldots)\\).
|
||||
After performing the multiplication,
|
||||
the product terms \\(z_i\\) are then repacked into a 64-bit representation.
|
||||
|
||||
## An alternative strategy
|
||||
|
||||
The strategy described above is aimed at big-integer multiplications,
|
||||
such as 1024, 2048, or 4096 bits, which would be used for applications
|
||||
like RSA. However, elliptic curve cryptography uses much smaller field
|
||||
sizes, such as 256 or 384 bits, so a different strategy is needed.
|
||||
|
||||
The parallel Edwards formulas provide parallelism at the level of the
|
||||
formulas for curve operations. This means that instead of scanning
|
||||
through the terms of the source operands and parallelizing *within* a
|
||||
field element (as described above), we can arrange the computation in
|
||||
product-scanning form and parallelize *across* field elements (as
|
||||
described below).
|
||||
|
||||
The parallel Edwards
|
||||
formulas provide 4-way parallelism, so they can be implemented using
|
||||
256-bit vectors using a single 64-bit lane for each element, or using
|
||||
512-bit vectors using two 64-bit lanes.
|
||||
The only available CPU supporting IFMA (the
|
||||
i3-8121U) executes 512-bit IFMA instructions at half rate compared to
|
||||
256-bit instructions, so for now there's no throughput advantage to
|
||||
using 512-bit IFMA instructions, and this implementation uses 256-bit
|
||||
vectors.
|
||||
|
||||
To extend this to 512-bit vectors, it's only necessary to achieve
|
||||
2-way parallelism, and it's possible (with a small amount of overhead)
|
||||
to create a hybrid strategy that operates entirely within 128-bit
|
||||
lanes. This means that cross-lane operations can use the faster
|
||||
`vpshufd` (1c latency) instead of a general shuffle instruction (3c
|
||||
latency).
|
||||
|
||||
# Choice of radix
|
||||
|
||||
The inputs to IFMA instructions are 52 bits wide, so the radix \\(r\\)
|
||||
used to represent a multiprecision integer must be \\( r \leq 52 \\).
|
||||
The obvious choice is the "native" radix \\(r = 52\\).
|
||||
|
||||
As described above, this choice
|
||||
has the advantage that for \\(x_i, y_j \in [0,2^{52})\\), the product term
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
&=
|
||||
2^{52 (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)},
|
||||
\end{aligned}
|
||||
\\]
|
||||
so that the low and high halves of the product can be directly accumulated
|
||||
onto the product limbs.
|
||||
In contrast, when using a smaller radix \\(r = 52 - k\\),
|
||||
the product term has the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{r i}) (y_j 2^{r j})
|
||||
&=
|
||||
2^{r (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{r (i+j)} +
|
||||
(
|
||||
\mathrm{hi}(x_i, y_j) 2^k
|
||||
)
|
||||
2^{r (i+j+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
What's happening is that the product \\(x_i y_j\\) of size \\(2r\\)
|
||||
bits is split not at \\(r\\) but at \\(52\\), so \\(k\\) product bits
|
||||
are placed into the low half instead of the high half. This means
|
||||
that the high half of the product cannot be directly accumulated onto
|
||||
\\(z_{i+j+1}\\), but must first be multiplied by \\(2^k\\) (i.e., left
|
||||
shifted by \\(k\\)). In addition, the low half of the product is
|
||||
\\(52\\) bits large instead of \\(r\\) bits.
|
||||
|
||||
## Handling offset product terms
|
||||
|
||||
[Drucker and Gueron][2018_drucker_gueron] analyze the choice of radix
|
||||
in the context of big-integer squaring, outlining three ways to handle
|
||||
the offset product terms, before concluding that all of them are
|
||||
suboptimal:
|
||||
|
||||
1. Shift the results after accumulation;
|
||||
2. Shift the input operands before multiplication;
|
||||
3. Split the MAC operation, accumulating into a zeroed register,
|
||||
shifting the result, and then adding.
|
||||
|
||||
The first option is rejected because it could double-shift some
|
||||
previously accumulated terms, the second doesn't work because the
|
||||
inputs could become larger than \\(52\\) bits, and the third requires
|
||||
additional instructions to handle the shifting and adding.
|
||||
|
||||
Based on an analysis of the total number of instructions, they suggest an
|
||||
addition to the instruction set, which they call `FMSA` (fused
|
||||
multiply-shift-add). This would shift the result according to an 8-bit
|
||||
immediate value before accumulating it into the destination register.
|
||||
|
||||
However, this change to the instruction set doesn't seem to be
|
||||
necessary. Instead, the product terms can be grouped according to
|
||||
their coefficients, accumulated together, then shifted once before
|
||||
adding them to the final sum. This uses an extra register, shift, and
|
||||
add, but only once per product term (accumulation target), not once
|
||||
per source term (as in the Drucker-Gueron paper).
|
||||
|
||||
Moreover, because IFMA instructions execute only on two ports
|
||||
(presumably 0 and 1), while adds and shifts can execute on three ports
|
||||
(0, 1, and 5), the adds and shifts can execute independently of the
|
||||
IFMA operations, as long as there is not too much pressure on port 5.
|
||||
This means that, although the total number of instructions increases,
|
||||
the shifts and adds do not necessarily increase the execution time, as
|
||||
long as throughput is limited by IFMA operations.
|
||||
|
||||
Finally, because IFMA instructions have 4 cycle latency and 0.5/1
|
||||
cycle throughput (for 256/512 bit vectors), maximizing IFMA throughput
|
||||
requires either 8 (for 256) or 4 (for 512) independent operations. So
|
||||
accumulating groups of terms independently before adding them at the
|
||||
end may be necessary anyways, in order to prevent long chains of
|
||||
dependent instructions.
|
||||
|
||||
## Advantages of a smaller radix
|
||||
|
||||
Using a smaller radix has other advantages. Although radix \\(52\\)
|
||||
is an unsaturated representation from the point of view of the
|
||||
\\(64\\)-bit accumulators (because up to 4096 product terms can be
|
||||
accumulated without carries), it's a saturated representation from the
|
||||
point of view of the multiplier (since \\(52\\)-bit values are the
|
||||
maximum input size).
|
||||
|
||||
Because the inputs to a multiplication must have all of their limbs
|
||||
bounded by \\(2^{52}\\), limbs in excess of \\(2^{52}\\) must be
|
||||
reduced before they can be used as an input. The
|
||||
[Gueron-Krasnov][2016_gueron_krasnov] paper suggests normalizing
|
||||
values using a standard, sequential carry chain: for each limb, add
|
||||
the carryin from reducing the previous limb, compute the carryout and
|
||||
reduce the current limb, then move to the next limb.
|
||||
|
||||
However, when using a smaller radix, such as \\(51\\), each limb can
|
||||
store a carry bit and still be used as the input to a multiplication.
|
||||
This means that the inputs do not need to be normalized, and instead
|
||||
of using a sequential carry chain, we can compute all carryouts in
|
||||
parallel, reduce all limbs in parallel, and then add the carryins in
|
||||
parallel (possibly growing the limb values by one bit).
|
||||
|
||||
Because the output of this partial reduction is an acceptable
|
||||
multiplication input, we can "close the loop" using partial reductions
|
||||
and never have to normalize to a canonical representation through the
|
||||
entire computation, in contrast to the Gueron-Krasnov approach, which
|
||||
converts back to a packed representation after every operation. (This
|
||||
idea seems to trace back to at least as early as [this 1999
|
||||
paper][1999_walter].)
|
||||
|
||||
Using \\(r = 51\\) is enough to keep a carry bit in each limb and
|
||||
avoid normalizations. What about an even smaller radix? One reason
|
||||
to choose a smaller radix would be to align the limb boundaries with
|
||||
an inline reduction (for instance, choosing \\(r = 43\\) for the
|
||||
Mersenne field \\(p = 2^{127} - 1\\)), but for \\(p = 2^{255} - 19\\),
|
||||
\\(r = 51 = 255/5\\) is the natural choice.
|
||||
|
||||
# Multiplication
|
||||
|
||||
The inputs to a multiplication are two field elements
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} \\\\
|
||||
y &= y_0 + y_1 2^{51} + y_2 2^{102} + y_3 2^{153} + y_4 2^{204},
|
||||
\end{aligned}
|
||||
\\]
|
||||
with limbs in range \\([0,2^{52})\\).
|
||||
|
||||
Writing the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
|
||||
\end{aligned}
|
||||
\\]
|
||||
a schoolbook multiplication in product scanning form takes the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= x_0 y_0 \\\\
|
||||
z_1 &= x_1 y_0 + x_0 y_1 \\\\
|
||||
z_2 &= x_2 y_0 + x_1 y_1 + x_0 y_2 \\\\
|
||||
z_3 &= x_3 y_0 + x_2 y_1 + x_1 y_2 + x_0 y_3 \\\\
|
||||
z_4 &= x_4 y_0 + x_3 y_1 + x_2 y_2 + x_1 y_3 + x_0 y_4 \\\\
|
||||
z_5 &= x_4 y_1 + x_3 y_2 + x_2 y_3 + x_1 y_4 \\\\
|
||||
z_6 &= x_4 y_2 + x_3 y_3 + x_2 y_4 \\\\
|
||||
z_7 &= x_4 y_3 + x_3 y_4 \\\\
|
||||
z_8 &= x_4 y_4 \\\\
|
||||
z_9 &= 0 \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Each term \\(x_i y_j\\) can be written in terms of IFMA operations as
|
||||
\\[
|
||||
x_i y_j = \mathrm{lo}(x_i,y_j) + 2\mathrm{hi}(x_i,y_j)2^{51}.
|
||||
\\]
|
||||
Substituting this equation into the schoolbook multiplication, then
|
||||
moving terms to eliminate the \\(2^{51}\\) factors gives
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= \mathrm{lo}(x_0, y_0) \\\\
|
||||
&+ \qquad 0 \\\\
|
||||
z_1 &= \mathrm{lo}(x_1, y_0) + \mathrm{lo}(x_0, y_1) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_0, y_0) )\\\\
|
||||
z_2 &= \mathrm{lo}(x_2, y_0) + \mathrm{lo}(x_1, y_1) + \mathrm{lo}(x_0, y_2) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_1, y_0) + \mathrm{hi}(x_0, y_1) )\\\\
|
||||
z_3 &= \mathrm{lo}(x_3, y_0) + \mathrm{lo}(x_2, y_1) + \mathrm{lo}(x_1, y_2) + \mathrm{lo}(x_0, y_3) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_2, y_0) + \mathrm{hi}(x_1, y_1) + \mathrm{hi}(x_0, y_2) )\\\\
|
||||
z_4 &= \mathrm{lo}(x_4, y_0) + \mathrm{lo}(x_3, y_1) + \mathrm{lo}(x_2, y_2) + \mathrm{lo}(x_1, y_3) + \mathrm{lo}(x_0, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_3, y_0) + \mathrm{hi}(x_2, y_1) + \mathrm{hi}(x_1, y_2) + \mathrm{hi}(x_0, y_3) )\\\\
|
||||
z_5 &= \mathrm{lo}(x_4, y_1) + \mathrm{lo}(x_3, y_2) + \mathrm{lo}(x_2, y_3) + \mathrm{lo}(x_1, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_0) + \mathrm{hi}(x_3, y_1) + \mathrm{hi}(x_2, y_2) + \mathrm{hi}(x_1, y_3) + \mathrm{hi}(x_0, y_4) )\\\\
|
||||
z_6 &= \mathrm{lo}(x_4, y_2) + \mathrm{lo}(x_3, y_3) + \mathrm{lo}(x_2, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_1) + \mathrm{hi}(x_3, y_2) + \mathrm{hi}(x_2, y_3) + \mathrm{hi}(x_1, y_4) )\\\\
|
||||
z_7 &= \mathrm{lo}(x_4, y_3) + \mathrm{lo}(x_3, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_2) + \mathrm{hi}(x_3, y_3) + \mathrm{hi}(x_2, y_4) )\\\\
|
||||
z_8 &= \mathrm{lo}(x_4, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_3) + \mathrm{hi}(x_3, y_4) )\\\\
|
||||
z_9 &= 0 \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_4) )\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
As noted above, our strategy will be to multiply and accumulate the
|
||||
terms with coefficient \\(2\\) separately from those with coefficient
|
||||
\\(1\\), before combining them at the end. This can alternately be
|
||||
thought of as accumulating product terms into a *doubly-redundant*
|
||||
representation, with two limbs for each digit, before collapsing
|
||||
the doubly-redundant representation by shifts and adds.
|
||||
|
||||
This computation requires 25 `vpmadd52luq` and 25 `vpmadd52huq`
|
||||
operations. For 256-bit vectors, IFMA operations execute on an
|
||||
i3-8121U with latency 4 cycles, throughput 0.5 cycles, so executing 50
|
||||
instructions requires 25 cycles' worth of throughput. Accumulating
|
||||
terms with coefficient \\(1\\) and \\(2\\) separately means that the
|
||||
longest dependency chain has length 5, so the critical path has length
|
||||
20 cycles and the bottleneck is throughput.
|
||||
|
||||
# Reduction modulo \\(p\\)
|
||||
|
||||
The next question is how to handle the reduction modulo \\(p\\).
|
||||
Because \\(p = 2^{255} - 19\\), \\(2^{255} = 19 \pmod p\\), so we can
|
||||
alternately write
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}
|
||||
\end{aligned}
|
||||
\\]
|
||||
as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= (z_0 + 19z_5) + (z_1 + 19z_6) 2^{51} + (z_2 + 19z_7) 2^{102} + (z_3 + 19z_8) 2^{153} + (z_4 + 19z_9) 2^{204}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
When using a \\(64 \times 64 \rightarrow 128\\)-bit multiplier, this
|
||||
can be handled (as in [Ed25519][ed25519_paper]) by premultiplying
|
||||
source terms by \\(19\\). Since \\(\lg(19) < 4.25\\), this increases
|
||||
their size by less than \\(4.25\\) bits, and the rest of the
|
||||
multiplication can be shown to work out.
|
||||
|
||||
Here, we have at most \\(1\\) bit of headroom. In order to allow
|
||||
premultiplication, we would need to use radix \\(2^{47}\\), which
|
||||
would require six limbs instead of five. Instead, we compute the high
|
||||
terms \\(z_5, \ldots, z_9\\), each using two chains of IFMA
|
||||
operations, then multiply by \\(19\\) and combine with the lower terms
|
||||
\\(z_0, \ldots, z_4\\). There are two ways to perform the
|
||||
multiplication by \\(19\\): using more IFMA operations, or using the
|
||||
`vpmullq` instruction, which computes the low \\(64\\) bits of a \\(64
|
||||
\times 64\\)-bit product. However, `vpmullq` has 15c/1.5c
|
||||
latency/throughput, in contrast to the 4c/0.5c latency/throughput of
|
||||
IFMA operations, so it seems like a worse choice.
|
||||
|
||||
The high terms \\(z_5, \ldots, z_9\\) are sums of \\(52\\)-bit terms,
|
||||
so they are larger than \\(52\\) bits. Write these terms in radix \\(52\\) as
|
||||
\\[
|
||||
z_{5+i} = z_{5+i}' + z_{5+i}'' 2^{52}, \qquad z_{5+i}' < 2^{52}.
|
||||
\\]
|
||||
Then the contribution of \\(z_{5+i}\\), taken modulo \\(p\\), is
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{5+i} 2^{255} 2^{51 i}
|
||||
&=
|
||||
19 (z_{5+i}' + z_{5+i}'' 2^{52}) 2^{51 i}
|
||||
\\\\
|
||||
&=
|
||||
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
|
||||
\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
The products \\(19 z_{5+i}', 19 z_{5+i}''\\) can be written in terms of IFMA operations as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
19 z_{5+i}' &= \mathrm{lo}(19, z_{5+i}') + 2 \mathrm{hi}(19, z_{5+i}') 2^{51}, \\\\
|
||||
19 z_{5+i}'' &= \mathrm{lo}(19, z_{5+i}'') + 2 \mathrm{hi}(19, z_{5+i}'') 2^{51}. \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Because \\(z_{5+i} < 2^{64}\\), \\(z_{5+i}'' < 2^{12} \\), so \\(19
|
||||
z_{5+i}'' < 2^{17} < 2^{52} \\) and \\(\mathrm{hi}(19, z_{5+i}'') = 0\\).
|
||||
Because IFMA operations ignore the high bits of their source
|
||||
operands, we do not need to compute \\(z\_{5+i}'\\) explicitly:
|
||||
the high bits will be ignored.
|
||||
Combining these observations, we can write
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{5+i} 2^{255} 2^{51 i}
|
||||
&=
|
||||
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{5+i}) 2^{51 i}
|
||||
\+ 2 \mathrm{hi}(19, z_{5+i}) 2^{51 (i+1)}
|
||||
\+ 2 \mathrm{lo}(19, z_{5+i}/2^{52}) 2^{51 (i+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
|
||||
For \\(i = 0,1,2,3\\), this allows reducing \\(z_{5+i}\\) onto
|
||||
\\(z_{i}, z_{i+1}\\), and if the low terms are computed using a
|
||||
doubly-redundant representation, no additional shifts are needed to
|
||||
handle the \\(2\\) coefficients. For \\(i = 4\\), there's a
|
||||
complication: the contribution becomes
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{9} 2^{255} 2^{204}
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2 \mathrm{hi}(19, z_{9}) 2^{255}
|
||||
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 2^{255}
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2 \mathrm{hi}(19, z_{9}) 19
|
||||
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 19
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2
|
||||
\mathrm{lo}(19, \mathrm{hi}(19, z_{9}) + \mathrm{lo}(19, z_{9}/2^{52})).
|
||||
\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
|
||||
It would be possible to cut the number of multiplications from 3 to 2
|
||||
by carrying the high part of each \\(z_i\\) onto \\(z_{i+1}\\). This
|
||||
would eliminate 5 multiplications, clearing 2.5 cycles of port
|
||||
pressure, at the cost of 5 additions, adding 1.66 cycles of port
|
||||
pressure. But doing this would create a dependency between terms
|
||||
(e.g., \\(z_{5}\\) must be computed before the reduction of
|
||||
\\(z_{6}\\) can begin), whereas with the approach above, all
|
||||
contributions to all terms are computed independently, to maximize ILP
|
||||
and flexibility for the processor to schedule instructions.
|
||||
|
||||
This strategy performs 16 IFMA operations, adding two IFMA operations
|
||||
to each of the \\(2\\)-coefficient terms and one to each of the
|
||||
\\(1\\)-coefficient terms. Considering the multiplication and
|
||||
reduction together, we use 66 IFMA operations, requiring 33 cycles'
|
||||
throughput, while the longest chain of IFMA operations is in the
|
||||
reduction of \\(z_5\\) onto \\(z_1\\), of length 7 (so 28 cycles, plus
|
||||
2 cycles to combine the two parts of \\(z_5\\)), and the bottleneck is
|
||||
again throughput.
|
||||
|
||||
Once this is done, we have computed the product terms
|
||||
\\[
|
||||
z = z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204},
|
||||
\\]
|
||||
without reducing the \\(z_i\\) to fit in \\(52\\) bits. Because the
|
||||
overall flow of operations alternates multiplications and additions or
|
||||
subtractions, we would have to perform a reduction after an addition
|
||||
but before the next multiplication anyways, so there's no benefit to
|
||||
fully reducing the limbs at the end of a multiplication. Instead, we
|
||||
leave them unreduced, and track the reduction state using the type
|
||||
system to ensure that unreduced limbs are not accidentally used as an
|
||||
input to a multiplication.
|
||||
|
||||
# Squaring
|
||||
|
||||
Squaring operates similarly to multiplication, but with the
|
||||
possibility to combine identical terms.
|
||||
As before, we write the input as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204}
|
||||
\end{aligned}
|
||||
\\]
|
||||
with limbs in range \\([0,2^{52})\\).
|
||||
Writing the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
|
||||
\end{aligned}
|
||||
\\]
|
||||
a schoolbook squaring in product scanning form takes the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= x_0 x_0 \\\\
|
||||
z_1 &= 2 x_1 x_0 \\\\
|
||||
z_2 &= 2 x_2 x_0 + x_1 x_1 \\\\
|
||||
z_3 &= 2 x_3 x_0 + 2 x_2 x_1 \\\\
|
||||
z_4 &= 2 x_4 x_0 + 2 x_3 x_1 + x_2 x_2 \\\\
|
||||
z_5 &= 2 x_4 x_1 + 2 x_3 x_2 \\\\
|
||||
z_6 &= 2 x_4 x_2 + x_3 x_3 \\\\
|
||||
z_7 &= 2 x_4 x_3 \\\\
|
||||
z_8 &= x_4 x_4 \\\\
|
||||
z_9 &= 0 \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
As before, we write \\(x_i x_j\\) as
|
||||
\\[
|
||||
x_i x_j = \mathrm{lo}(x_i,x_j) + 2\mathrm{hi}(x_i,x_j)2^{51},
|
||||
\\]
|
||||
and substitute to obtain
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= \mathrm{lo}(x_0, x_0) + 0 \\\\
|
||||
z_1 &= 2 \mathrm{lo}(x_1, x_0) + 2 \mathrm{hi}(x_0, x_0) \\\\
|
||||
z_2 &= 2 \mathrm{lo}(x_2, x_0) + \mathrm{lo}(x_1, x_1) + 4 \mathrm{hi}(x_1, x_0) \\\\
|
||||
z_3 &= 2 \mathrm{lo}(x_3, x_0) + 2 \mathrm{lo}(x_2, x_1) + 4 \mathrm{hi}(x_2, x_0) + 2 \mathrm{hi}(x_1, x_1) \\\\
|
||||
z_4 &= 2 \mathrm{lo}(x_4, x_0) + 2 \mathrm{lo}(x_3, x_1) + \mathrm{lo}(x_2, x_2) + 4 \mathrm{hi}(x_3, x_0) + 4 \mathrm{hi}(x_2, x_1) \\\\
|
||||
z_5 &= 2 \mathrm{lo}(x_4, x_1) + 2 \mathrm{lo}(x_3, x_2) + 4 \mathrm{hi}(x_4, x_0) + 4 \mathrm{hi}(x_3, x_1) + 2 \mathrm{hi}(x_2, x_2) \\\\
|
||||
z_6 &= 2 \mathrm{lo}(x_4, x_2) + \mathrm{lo}(x_3, x_3) + 4 \mathrm{hi}(x_4, x_1) + 4 \mathrm{hi}(x_3, x_2) \\\\
|
||||
z_7 &= 2 \mathrm{lo}(x_4, x_3) + 4 \mathrm{hi}(x_4, x_2) + 2 \mathrm{hi}(x_3, x_3) \\\\
|
||||
z_8 &= \mathrm{lo}(x_4, x_4) + 4 \mathrm{hi}(x_4, x_3) \\\\
|
||||
z_9 &= 0 + 2 \mathrm{hi}(x_4, x_4) \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
To implement these, we group terms by their coefficient, computing
|
||||
those with coefficient \\(2\\) on one set of IFMA chains, and on another
|
||||
set of chains, we begin with coefficient-\\(4\\) terms, then shift
|
||||
left before continuing with the coefficient-\\(1\\) terms.
|
||||
The reduction strategy is the same as for multiplication.
|
||||
|
||||
# Future improvements
|
||||
|
||||
LLVM won't use blend operations on [256-bit vectors yet][llvm_blend],
|
||||
so there's a bunch of blend instructions that could be omitted.
|
||||
|
||||
Although the multiplications and squarings are much faster, there's no
|
||||
speedup to the additions and subtractions, so there are diminishing
|
||||
returns. In fact, the complications in the doubling formulas mean
|
||||
that doubling is actually slower than readdition. This also suggests
|
||||
that moving to 512-bit vectors won't be much help for a strategy aimed
|
||||
at parallelism within a group operation, so to extract performance
|
||||
gains from 512-bit vectors it will probably be necessary to create a
|
||||
parallel-friendly multiscalar multiplication algorithm. This could
|
||||
also help with reducing shuffle pressure.
|
||||
|
||||
The squaring implementation could probably be optimized, but without
|
||||
`perf` support on Cannonlake it's difficult to make actual
|
||||
measurements.
|
||||
|
||||
Another improvement would be to implement vectorized square root
|
||||
computations, which would allow creating an iterator adaptor for point
|
||||
decompression that bunched decompression operations and executed them
|
||||
in parallel. This would accelerate batch verification.
|
||||
|
||||
[2016_gueron_krasnov]: https://ieeexplore.ieee.org/document/7563269
|
||||
[2018_drucker_gueron]: https://eprint.iacr.org/2018/335
|
||||
[1999_walter]: https://pdfs.semanticscholar.org/0e6a/3e8f30b63b556679f5dff2cbfdfe9523f4fa.pdf
|
||||
[ed25519_paper]: https://ed25519.cr.yp.to/ed25519-20110926.pdf
|
||||
[llvm_blend]: https://bugs.llvm.org/show_bug.cgi?id=38343
|
|
@ -1,333 +0,0 @@
|
|||
Vectorized implementations of field and point operations, using a
|
||||
modification of the 4-way parallel formulas of Hisil, Wong, Carter,
|
||||
and Dawson.
|
||||
|
||||
These notes explain the parallel formulas and our strategy for using
|
||||
them with SIMD operations. There are two backend implementations: one
|
||||
using AVX2, and the other using AVX512-IFMA.
|
||||
|
||||
# Overview
|
||||
|
||||
The 2008 paper [_Twisted Edwards Curves Revisited_][hwcd08] by Hisil,
|
||||
Wong, Carter, and Dawson (HWCD) introduced the “extended coordinates”
|
||||
and mixed-model representations which are used by most Edwards curve
|
||||
implementations.
|
||||
|
||||
However, they also describe 4-way parallel formulas for point addition
|
||||
and doubling: a unified addition algorithm taking an effective
|
||||
\\(2\mathbf M + 1\mathbf D\\), a doubling algorithm taking an
|
||||
effective \\(1\mathbf M + 1\mathbf S\\), and a dedicated (i.e., for
|
||||
distinct points) addition algorithm taking an effective \\(2 \mathbf M
|
||||
\\). They compare these formulas with a 2-way parallel variant of the
|
||||
Montgomery ladder.
|
||||
|
||||
Unlike their serial formulas, which are used widely, their parallel
|
||||
formulas do not seem to have been implemented in software before. The
|
||||
2-way parallel Montgomery ladder was used in 2015 by Tung Chou's
|
||||
`sandy2x` implementation. Curiously, however, although the [`sandy2x`
|
||||
paper][sandy2x] also implements Edwards arithmetic, and cites HWCD08,
|
||||
it doesn't mention their parallel Edwards formulas.
|
||||
A 2015 paper by Hernández and López describes an AVX2 implementation
|
||||
of X25519. Neither the paper nor the code is publicly available, but
|
||||
it apparently gives only a [slight speedup][avx2trac], suggesting that
|
||||
it uses a 4-way parallel Montgomery ladder rather than parallel
|
||||
Edwards formulas.
|
||||
|
||||
The reason may be that HWCD08 describe their formulas as operating on
|
||||
four independent processors, which would make a software
|
||||
implementation impractical: all of the operations are too low-latency
|
||||
to effectively synchronize. But a closer inspection reveals that the
|
||||
(more expensive) multiplication and squaring steps are uniform, while
|
||||
the instruction divergence occurs in the (much cheaper) addition and
|
||||
subtraction steps. This means that a SIMD implementation can perform
|
||||
the expensive steps uniformly, and handle divergence in the
|
||||
inexpensive steps using masking.
|
||||
|
||||
These notes describe modifications to the original parallel formulas
|
||||
to allow a SIMD implementation, and this module contains
|
||||
implementations of the modified formulas targeting either AVX2 or
|
||||
AVX512-IFMA.
|
||||
|
||||
# Parallel formulas in HWCD'08
|
||||
|
||||
The doubling formula is presented in the HWCD paper as follows:
|
||||
|
||||
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|
||||
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
|
||||
| | idle | idle | idle | \\( R\_1 \gets X\_1 + Y\_1 \\) |
|
||||
| \\(1\mathbf S\\) | \\( R\_2 \gets X\_1\^2 \\) | \\( R\_3 \gets Y\_1\^2 \\) | \\( R\_4 \gets Z\_1\^2 \\) | \\( R\_5 \gets R\_1\^2 \\) |
|
||||
| | \\( R\_6 \gets R\_2 + R\_3 \\) | \\( R\_7 \gets R\_2 - R\_3 \\) | \\( R\_4 \gets 2 R\_4 \\) | idle |
|
||||
| | idle | \\( R\_1 \gets R\_4 + R\_7 \\) | idle | \\( R\_2 \gets R\_6 - R\_5 \\) |
|
||||
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_6 R\_7 \\) | \\( T\_3 \gets R\_2 R\_6 \\) | \\( Z\_3 \gets R\_1 R\_7 \\) |
|
||||
|
||||
and the unified addition algorithm is presented as follows:
|
||||
|
||||
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|
||||
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
|
||||
| | \\( R\_1 \gets Y\_1 - X\_1 \\) | \\( R\_2 \gets Y\_2 - X\_2 \\) | \\( R\_3 \gets Y\_1 + X\_1 \\) | \\( R\_4 \gets Y\_2 + X\_2 \\) |
|
||||
| \\(1\mathbf M\\) | \\( R\_5 \gets R\_1 R\_2 \\) | \\( R\_6 \gets R\_3 R\_4 \\) | \\( R\_7 \gets T\_1 T\_2 \\) | \\( R\_8 \gets Z\_1 Z\_2 \\) |
|
||||
| \\(1\mathbf D\\) | idle | idle | \\( R\_7 \gets k R\_7 \\) | \\( R\_8 \gets 2 R\_8 \\) |
|
||||
| | \\( R\_1 \gets R\_6 - R\_5 \\) | \\( R\_2 \gets R\_8 - R\_7 \\) | \\( R\_3 \gets R\_8 + R\_7 \\) | \\( R\_4 \gets R\_6 + R\_5 \\) |
|
||||
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_3 R\_4 \\) | \\( T\_3 \gets R\_1 R\_4 \\) | \\( Z\_3 \gets R\_2 R\_3 \\) |
|
||||
|
||||
Here \\(\mathbf M\\) and \\(\mathbf S\\) represent the cost of
|
||||
multiplication and squaring of generic field elements, \\(\mathbf D\\)
|
||||
represents the cost of multiplication by a curve constant (in this
|
||||
case \\( k = 2d \\)).
|
||||
|
||||
Notice that the \\(1\mathbf M\\) and \\(1\mathbf S\\) steps are
|
||||
uniform. The non-uniform steps are all inexpensive additions or
|
||||
subtractions, with the exception of the multiplication by the curve
|
||||
constant \\(k = 2d\\):
|
||||
$$
|
||||
R\_7 \gets 2 d R\_7.
|
||||
$$
|
||||
|
||||
HWCD suggest parallelising this step by breaking \\(k = 2d\\) into four
|
||||
parts as \\(k = k_0 + 2\^n k_1 + 2\^{2n} k_2 + 2\^{3n} k_3 \\) and
|
||||
computing \\(k_i R_7 \\) in parallel. This is quite awkward, but if
|
||||
the curve constant is a ratio \\( d = d\_1/d\_2 \\), then projective
|
||||
coordinates allow us to instead compute
|
||||
$$
|
||||
(R\_5, R\_6, R\_7, R\_8) \gets (d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, 2d\_2 R\_8).
|
||||
$$
|
||||
This can be performed as a uniform multiplication by a vector of
|
||||
constants, and if \\(d\_1, d\_2\\) are small, it is relatively
|
||||
inexpensive. (This trick was suggested by Mike Hamburg).
|
||||
In the Curve25519 case, we have
|
||||
$$
|
||||
d = \frac{d\_1}{d\_2} = \frac{-121665}{121666}.
|
||||
$$
|
||||
Since \\(2 \cdot 121666 < 2\^{18}\\), all the constants above fit (up
|
||||
to sign) in 32 bits, so this can be done in parallel as four
|
||||
multiplications by small constants \\( (121666, 121666, 2\cdot 121665,
|
||||
2\cdot 121666) \\), followed by a negation to compute \\( - 2\cdot 121665\\).
|
||||
|
||||
# Modified parallel formulas
|
||||
|
||||
Using the modifications sketched above, we can write SIMD-friendly
|
||||
versions of the parallel formulas as follows. To avoid confusion with
|
||||
the original formulas, temporary variables are named \\(S\\) instead
|
||||
of \\(R\\) and are in static single-assignment form.
|
||||
|
||||
## Addition
|
||||
|
||||
To add points
|
||||
\\(P_1 = (X_1 : Y_1 : Z_1 : T_1) \\)
|
||||
and
|
||||
\\(P_2 = (X_2 : Y_2 : Z_2 : T_2 ) \\),
|
||||
we compute
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_0 &&,&& S\_1 &&,&& S\_2 &&,&& S\_3 )
|
||||
&\gets
|
||||
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Y\_2 - X\_2&&,&& Y\_2 + X\_2)
|
||||
\\\\
|
||||
(S\_4 &&,&& S\_5 &&,&& S\_6 &&,&& S\_7 )
|
||||
&\gets
|
||||
(S\_0 \cdot S\_2&&,&& S\_1 \cdot S\_3&&,&& Z\_1 \cdot Z\_2&&,&& T\_1 \cdot T\_2)
|
||||
\\\\
|
||||
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
|
||||
&\gets
|
||||
(d\_2 \cdot S\_4 &&,&& d\_2 \cdot S\_5 &&,&& 2 d\_2 \cdot S\_6 &&,&& 2 d\_1 \cdot S\_7 )
|
||||
\\\\
|
||||
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
|
||||
&\gets
|
||||
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
|
||||
\\\\
|
||||
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
|
||||
&\gets
|
||||
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
|
||||
\end{aligned}
|
||||
$$
|
||||
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = P\_1 + P\_2 \\).
|
||||
This costs \\( 2\mathbf M + 1 \mathbf D\\).
|
||||
|
||||
## Readdition
|
||||
|
||||
If the point \\( P_2 = (X\_2 : Y\_2 : Z\_2 : T\_2) \\) is fixed, we
|
||||
can cache the multiplication of the curve constants by computing
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_2' &&,&& S\_3' &&,&& Z\_2' &&,&& T\_2' )
|
||||
&\gets
|
||||
(d\_2 \cdot (Y\_2 - X\_2)&&,&& d\_2 \cdot (Y\_2 + X\_2)&&,&& 2d\_2 \cdot Z\_2 &&,&& 2d\_1 \cdot T\_2).
|
||||
\end{aligned}
|
||||
$$
|
||||
This costs \\( 1\mathbf D\\); with \\( (S\_2', S\_3', Z\_2', T\_2')\\)
|
||||
in hand, the addition formulas above become
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_0 &&,&& S\_1 &&,&& Z\_1 &&,&& T\_1 )
|
||||
&\gets
|
||||
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Z\_1 &&,&& T\_1)
|
||||
\\\\
|
||||
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
|
||||
&\gets
|
||||
(S\_0 \cdot S\_2' &&,&& S\_1 \cdot S\_3'&&,&& Z\_1 \cdot Z\_2' &&,&& T\_1 \cdot T\_2')
|
||||
\\\\
|
||||
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
|
||||
&\gets
|
||||
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
|
||||
\\\\
|
||||
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
|
||||
&\gets
|
||||
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
|
||||
\end{aligned}
|
||||
$$
|
||||
which costs only \\( 2\mathbf M \\). This precomputation is
|
||||
essentially similar to the precomputation that HWCD suggest for their
|
||||
serial formulas. Because the cost of precomputation and then
|
||||
readdition is the same as addition, it's sufficient to only
|
||||
implement caching and readdition.
|
||||
|
||||
## Doubling
|
||||
|
||||
The non-uniform portions of the (re)addition formulas have a fairly
|
||||
regular structure. Unfortunately, this is not the case for the
|
||||
doubling formulas, which are much less nice.
|
||||
|
||||
To double a point \\( P = (X\_1 : Y\_1 : Z\_1 : T\_1) \\), we compute
|
||||
$$
|
||||
\begin{aligned}
|
||||
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& S\_0)
|
||||
&\gets
|
||||
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& X\_1 + Y\_1)
|
||||
\\\\
|
||||
(S\_1 &&,&& S\_2 &&,&& S\_3 &&,&& S\_4 )
|
||||
&\gets
|
||||
(X\_1\^2 &&,&& Y\_1\^2&&,&& Z\_1\^2 &&,&& S\_0\^2)
|
||||
\\\\
|
||||
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
|
||||
&\gets
|
||||
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
|
||||
\\\\
|
||||
(X\_3 &&,&& Y\_3 &&,&& Z\_3 &&,&& T\_3 )
|
||||
&\gets
|
||||
(S\_8 \cdot S\_9 &&,&& S\_5 \cdot S\_6 &&,&& S\_8 \cdot S\_6 &&,&& S\_5 \cdot S\_9)
|
||||
\end{aligned}
|
||||
$$
|
||||
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = [2]P\_1 \\).
|
||||
|
||||
The intermediate step between the squaring and multiplication requires
|
||||
a long chain of additions. For the IFMA-based implementation, this is not a problem; for the AVX2-based implementation, it is, but with some care and finesse, it's possible to arrange the computation without requiring an intermediate reduction.
|
||||
|
||||
# Implementation
|
||||
|
||||
These formulas aren't specific to a particular representation of field
|
||||
element vectors, whose optimum choice is determined by the details of
|
||||
the instruction set. However, it's not possible to perfectly separate
|
||||
the implementation of the field element vectors from the
|
||||
implementation of the point operations. Instead, the [`avx2`] and
|
||||
[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and
|
||||
the [`scalar_mul`] code uses one of the backend types by a type alias.
|
||||
|
||||
# Comparison to non-vectorized formulas
|
||||
|
||||
In theory, the parallel Edwards formulas seem to allow a \\(4\\)-way
|
||||
speedup from parallelism. However, an actual vectorized
|
||||
implementation has several slowdowns that cut into this speedup.
|
||||
|
||||
First, the parallel formulas can only use the available vector
|
||||
multiplier. For AVX2, this is a \\( 32 \times 32 \rightarrow 64
|
||||
\\)-bit integer multiplier, so the speedup from vectorization must
|
||||
overcome the disadvantage of losing the \\( 64 \times 64 \rightarrow
|
||||
128\\)-bit (serial) integer multiplier. The effect of this slowdown
|
||||
is microarchitecture-dependent, since it requires accounting for the
|
||||
total number of multiplications and additions and their relative
|
||||
costs. IFMA allows using a \\( 52 \times 52 \rightarrow 104 \\)-bit
|
||||
multiplier, but the high and low halves need to be computed
|
||||
separately, and the reduction requires extra work because it's not
|
||||
possible to pre-multiply by \\(19\\).
|
||||
|
||||
Second, the parallel doubling formulas incur both a theoretical and
|
||||
practical slowdown. The parallel formulas described above work on the
|
||||
\\( \mathbb P\^3 \\) “extended” coordinates. The \\( \mathbb P\^2 \\)
|
||||
model introduced earlier by [Bernstein, Birkner, Joye, Lange, and
|
||||
Peters][bbjlp08] allows slightly faster doublings, so HWCD suggest
|
||||
mixing coordinate systems while performing scalar multiplication
|
||||
(attributing the idea to [a 1998 paper][cmo98] by Cohen, Miyaji, and
|
||||
Ono). The \\( T \\) coordinate is not required for doublings, so when
|
||||
doublings are followed by doublings, its computation can be skipped.
|
||||
More details on this approach and the different coordinate systems can
|
||||
be found in the [`curve_models` module documentation][curve_models].
|
||||
|
||||
Unfortunately, this optimization is not compatible with the parallel
|
||||
formulas, which cannot save time by skipping a single variable, so the
|
||||
parallel doubling formulas do slightly more work when counting the
|
||||
total number of field multiplications and squarings.
|
||||
|
||||
In addition, the parallel doubling formulas have a less regular
|
||||
pattern of additions and subtractions than the parallel addition
|
||||
formulas, so the vectorization overhead is proportionately greater.
|
||||
Both the parallel addition and parallel doubling formulas also require
|
||||
some shuffling to rearrange data within the vectors, which places more
|
||||
pressure on the shuffle unit than is desirable.
|
||||
|
||||
This means that the speedup from using a vectorized implementation of
|
||||
parallel Edwards formulas is likely to be greatest in applications
|
||||
that do fewer doublings and more additions (like a large multiscalar
|
||||
multiplication) rather than applications that do fewer additions and
|
||||
more doublings (like a double-base scalar multiplication).
|
||||
|
||||
Third, Amdahl's law says that the speedup is limited to the portion
|
||||
which can be parallelized. Normally, the field multiplications
|
||||
dominate the cost of point operations, but with the IFMA backend, the
|
||||
multiplications are so fast that the non-parallel additions end up as
|
||||
a significant portion of the total time.
|
||||
|
||||
Fourth, current Intel CPUs perform thermal throttling when using wide
|
||||
vector instructions. A detailed description can be found in §15.26 of
|
||||
[the Intel Optimization Manual][intel], but using wide vector
|
||||
instructions prevents the core from operating at higher frequencies.
|
||||
The core can return to the higher-frequency state after 2
|
||||
milliseconds, but this timer is reset every time high-power
|
||||
instructions are used.
|
||||
|
||||
Any speedup from vectorization therefore has to be weighed against a
|
||||
slowdown for the next few million instructions. For a mixed workload,
|
||||
where point operations are interspersed with other tasks, this can
|
||||
reduce overall performance. This implementation is therefore probably
|
||||
not suitable for basic applications, like signatures, but is
|
||||
worthwhile for complex applications, like zero-knowledge proofs, which
|
||||
do sustained work.
|
||||
|
||||
# Future work
|
||||
|
||||
There are several directions for future improvement:
|
||||
|
||||
* Using the vectorized field arithmetic code to parallelize across
|
||||
point operations rather than within a single point operation. This
|
||||
is less flexible, but would give a speedup both from allowing use of
|
||||
the faster mixed-model arithmetic and from reducing shuffle
|
||||
pressure. One approach in this direction would be to implement
|
||||
batched scalar-point operations using vectors of points (AoSoA
|
||||
layout). This is less generally useful but would give a speedup for
|
||||
Bulletproofs.
|
||||
|
||||
* Extending the IFMA implementation to use the full width of AVX512,
|
||||
either handling the extra parallelism internally to a single point
|
||||
operation (by using a 2-way parallel implementation of field
|
||||
arithmetic instead of a wordsliced one), or externally,
|
||||
parallelizing across point operations. Internal parallelism would
|
||||
be preferable but might require too much shuffle pressure. For now,
|
||||
the only available CPU which runs IFMA operations executes them at
|
||||
256 bits wide anyway, so this isn't yet important.
|
||||
|
||||
* Generalizing the implementation to NEON instructions. The current
|
||||
point arithmetic code is written in terms of field element vectors,
|
||||
which are in turn implemented using platform SIMD vectors. It
|
||||
should be possible to write an alternate implementation of the
|
||||
`FieldElement2625x4` using NEON without changing the point
|
||||
arithmetic. NEON has 128-bit vectors rather than 256-bit vectors,
|
||||
but this may still be worthwhile compared to a serial
|
||||
implementation.
|
||||
|
||||
|
||||
[sandy2x]: https://eprint.iacr.org/2015/943.pdf
|
||||
[avx2trac]: https://trac.torproject.org/projects/tor/ticket/8897#comment:28
|
||||
[hwcd08]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
|
||||
[curve_models]: https://doc-internal.dalek.rs/curve25519_dalek/curve_models/index.html
|
||||
[bbjlp08]: https://eprint.iacr.org/2008/013
|
||||
[cmo98]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
|
||||
[intel]: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf
|
|
@ -1,62 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Pluggable implementations for different architectures.
|
||||
//!
|
||||
//! The backend code is split into two parts: a serial backend,
|
||||
//! and a vector backend.
|
||||
//!
|
||||
//! The [`serial`] backend contains 32- and 64-bit implementations of
|
||||
//! field arithmetic and scalar arithmetic, as well as implementations
|
||||
//! of point operations using the mixed-model strategy (passing
|
||||
//! between different curve models depending on the operation).
|
||||
//!
|
||||
//! The [`vector`] backend contains implementations of vectorized
|
||||
//! field arithmetic, used to implement point operations using a novel
|
||||
//! implementation strategy derived from parallel formulas of Hisil,
|
||||
//! Wong, Carter, and Dawson.
|
||||
//!
|
||||
//! Because the two strategies give rise to different curve models,
|
||||
//! it's not possible to reuse exactly the same scalar multiplication
|
||||
//! code (or to write it generically), so both serial and vector
|
||||
//! backends contain matching implementations of scalar multiplication
|
||||
//! algorithms. These are intended to be selected by a `#[cfg]`-based
|
||||
//! type alias.
|
||||
//!
|
||||
//! The [`vector`] backend is selected by the `simd_backend` cargo
|
||||
//! feature; it uses the [`serial`] backend for non-vectorized operations.
|
||||
|
||||
#[cfg(not(any(
|
||||
feature = "u32_backend",
|
||||
feature = "u64_backend",
|
||||
feature = "simd_backend",
|
||||
)))]
|
||||
compile_error!(
|
||||
"no curve25519-dalek backend cargo feature enabled! \
|
||||
please enable one of: u32_backend, u64_backend, simd_backend"
|
||||
);
|
||||
|
||||
pub mod serial;
|
||||
|
||||
#[cfg(any(
|
||||
all(
|
||||
feature = "simd_backend",
|
||||
any(target_feature = "avx2", target_feature = "avx512ifma")
|
||||
),
|
||||
all(feature = "nightly", rustdoc)
|
||||
))]
|
||||
#[cfg_attr(
|
||||
feature = "nightly",
|
||||
doc(cfg(any(all(
|
||||
feature = "simd_backend",
|
||||
any(target_feature = "avx2", target_feature = "avx512ifma")
|
||||
))))
|
||||
)]
|
||||
pub mod vector;
|
|
@ -1,550 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Internal curve representations which are not part of the public API.
|
||||
//!
|
||||
//! # Curve representations
|
||||
//!
|
||||
//! Internally, we use several different models for the curve. Here
|
||||
//! is a sketch of the relationship between the models, following [a
|
||||
//! post][smith-moderncrypto]
|
||||
//! by Ben Smith on the `moderncrypto` mailing list. This is also briefly
|
||||
//! discussed in section 2.5 of [_Montgomery curves and their
|
||||
//! arithmetic_][costello-smith-2017] by Costello and Smith.
|
||||
//!
|
||||
//! Begin with the affine equation for the curve,
|
||||
//! $$
|
||||
//! -x\^2 + y\^2 = 1 + dx\^2y\^2.
|
||||
//! $$
|
||||
//! Next, pass to the projective closure \\(\mathbb P\^1 \times \mathbb
|
||||
//! P\^1 \\) by setting \\(x=X/Z\\), \\(y=Y/T.\\) Clearing denominators
|
||||
//! gives the model
|
||||
//! $$
|
||||
//! -X\^2T\^2 + Y\^2Z\^2 = Z\^2T\^2 + dX\^2Y\^2.
|
||||
//! $$
|
||||
//! In `curve25519-dalek`, this is represented as the `CompletedPoint`
|
||||
//! struct.
|
||||
//! To map from \\(\mathbb P\^1 \times \mathbb P\^1 \\), a product of
|
||||
//! two lines, to \\(\mathbb P\^3\\), we use the [Segre
|
||||
//! embedding](https://en.wikipedia.org/wiki/Segre_embedding)
|
||||
//! $$
|
||||
//! \sigma : ((X:Z),(Y:T)) \mapsto (XY:XT:ZY:ZT).
|
||||
//! $$
|
||||
//! Using coordinates \\( (W_0:W_1:W_2:W_3) \\) for \\(\mathbb P\^3\\),
|
||||
//! the image \\(\sigma (\mathbb P\^1 \times \mathbb P\^1) \\) is the
|
||||
//! surface defined by \\( W_0 W_3 = W_1 W_2 \\), and under \\(
|
||||
//! \sigma\\), the equation above becomes
|
||||
//! $$
|
||||
//! -W\_1\^2 + W\_2\^2 = W\_3\^2 + dW\_0\^2,
|
||||
//! $$
|
||||
//! so that the curve is given by the pair of equations
|
||||
//! $$
|
||||
//! \begin{aligned}
|
||||
//! -W\_1\^2 + W\_2\^2 &= W\_3\^2 + dW\_0\^2, \\\\ W_0 W_3 &= W_1 W_2.
|
||||
//! \end{aligned}
|
||||
//! $$
|
||||
//! Up to variable naming, this is exactly the "extended" curve model
|
||||
//! introduced in [_Twisted Edwards Curves
|
||||
//! Revisited_][hisil-wong-carter-dawson-2008] by Hisil, Wong, Carter,
|
||||
//! and Dawson. In `curve25519-dalek`, it is represented as the
|
||||
//! `EdwardsPoint` struct. We can map from \\(\mathbb P\^3 \\) to
|
||||
//! \\(\mathbb P\^2 \\) by sending \\( (W\_0:W\_1:W\_2:W\_3) \\) to \\(
|
||||
//! (W\_1:W\_2:W\_3) \\). Notice that
|
||||
//! $$
|
||||
//! \frac {W\_1} {W\_3} = \frac {XT} {ZT} = \frac X Z = x,
|
||||
//! $$
|
||||
//! and
|
||||
//! $$
|
||||
//! \frac {W\_2} {W\_3} = \frac {YZ} {ZT} = \frac Y T = y,
|
||||
//! $$
|
||||
//! so this is the same as if we had started with the affine model
|
||||
//! and passed to \\( \mathbb P\^2 \\) by setting \\( x = W\_1 / W\_3
|
||||
//! \\), \\(y = W\_2 / W\_3 \\).
|
||||
//! Up to variable naming, this is the projective representation
|
||||
//! introduced in [_Twisted Edwards
|
||||
//! Curves_][bernstein-birkner-joye-lange-peters-2008] by Bernstein,
|
||||
//! Birkner, Joye, Lange, and Peters. In `curve25519-dalek`, it is
|
||||
//! represented by the `ProjectivePoint` struct.
|
||||
//!
|
||||
//! # Passing between curve models
|
||||
//!
|
||||
//! Although the \\( \mathbb P\^3 \\) model provides faster addition
|
||||
//! formulas, the \\( \mathbb P\^2 \\) model provides faster doubling
|
||||
//! formulas. Hisil, Wong, Carter, and Dawson therefore suggest mixing
|
||||
//! coordinate systems for scalar multiplication, attributing the idea
|
||||
//! to [a 1998 paper][cohen-miyaji-ono-1998] of Cohen, Miyaji, and Ono.
|
||||
//!
|
||||
//! Their suggestion is to vary the formulas used by context, using a
|
||||
//! \\( \mathbb P\^2 \rightarrow \mathbb P\^2 \\) doubling formula when
|
||||
//! a doubling is followed
|
||||
//! by another doubling, a \\( \mathbb P\^2 \rightarrow \mathbb P\^3 \\)
|
||||
//! doubling formula when a doubling is followed by an addition, and
|
||||
//! computing point additions using a \\( \mathbb P\^3 \times \mathbb P\^3
|
||||
//! \rightarrow \mathbb P\^2 \\) formula.
|
||||
//!
|
||||
//! The `ref10` reference implementation of [Ed25519][ed25519], by
|
||||
//! Bernstein, Duif, Lange, Schwabe, and Yang, tweaks
|
||||
//! this strategy, factoring the addition formulas through the
|
||||
//! completion \\( \mathbb P\^1 \times \mathbb P\^1 \\), so that the
|
||||
//! output of an addition or doubling always lies in \\( \mathbb P\^1 \times
|
||||
//! \mathbb P\^1\\), and the choice of which formula to use is replaced
|
||||
//! by a choice of whether to convert the result to \\( \mathbb P\^2 \\)
|
||||
//! or \\(\mathbb P\^3 \\). However, this tweak is not described in
|
||||
//! their paper, only in their software.
|
||||
//!
|
||||
//! Our naming for the `CompletedPoint` (\\(\mathbb P\^1 \times \mathbb
|
||||
//! P\^1 \\)), `ProjectivePoint` (\\(\mathbb P\^2 \\)), and
|
||||
//! `EdwardsPoint` (\\(\mathbb P\^3 \\)) structs follows the naming in
|
||||
//! Adam Langley's [Golang ed25519][agl-ed25519] implementation, which
|
||||
//! `curve25519-dalek` was originally derived from.
|
||||
//!
|
||||
//! Finally, to accelerate readditions, we use two cached point formats
|
||||
//! in "Niels coordinates", named for Niels Duif,
|
||||
//! one for the affine model and one for the \\( \mathbb P\^3 \\) model:
|
||||
//!
|
||||
//! * `AffineNielsPoint`: \\( (y+x, y-x, 2dxy) \\)
|
||||
//! * `ProjectiveNielsPoint`: \\( (Y+X, Y-X, Z, 2dXY) \\)
|
||||
//!
|
||||
//! [smith-moderncrypto]: https://moderncrypto.org/mail-archive/curves/2016/000807.html
|
||||
//! [costello-smith-2017]: https://eprint.iacr.org/2017/212
|
||||
//! [hisil-wong-carter-dawson-2008]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
|
||||
//! [bernstein-birkner-joye-lange-peters-2008]: https://eprint.iacr.org/2008/013
|
||||
//! [cohen-miyaji-ono-1998]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
|
||||
//! [ed25519]: https://eprint.iacr.org/2011/368
|
||||
//! [agl-ed25519]: https://github.com/agl/ed25519
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::{Add, Neg, Sub};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use constants;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use field::FieldElement;
|
||||
use traits::ValidityCheck;
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Internal point representations
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
/// A `ProjectivePoint` is a point \\((X:Y:Z)\\) on the \\(\mathbb
|
||||
/// P\^2\\) model of the curve.
|
||||
/// A point \\((x,y)\\) in the affine model corresponds to
|
||||
/// \\((x:y:1)\\).
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ProjectivePoint {
|
||||
pub X: FieldElement,
|
||||
pub Y: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
}
|
||||
|
||||
/// A `CompletedPoint` is a point \\(((X:Z), (Y:T))\\) on the \\(\mathbb
|
||||
/// P\^1 \times \mathbb P\^1 \\) model of the curve.
|
||||
/// A point (x,y) in the affine model corresponds to \\( ((x:1),(y:1))
|
||||
/// \\).
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
#[allow(missing_docs)]
|
||||
pub struct CompletedPoint {
|
||||
pub X: FieldElement,
|
||||
pub Y: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
pub T: FieldElement,
|
||||
}
|
||||
|
||||
/// A pre-computed point in the affine model for the curve, represented as
|
||||
/// \\((y+x, y-x, 2dxy)\\) in "Niels coordinates".
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
// Safe to derive Eq because affine coordinates.
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
#[allow(missing_docs)]
|
||||
pub struct AffineNielsPoint {
|
||||
pub y_plus_x: FieldElement,
|
||||
pub y_minus_x: FieldElement,
|
||||
pub xy2d: FieldElement,
|
||||
}
|
||||
|
||||
impl Zeroize for AffineNielsPoint {
|
||||
fn zeroize(&mut self) {
|
||||
self.y_plus_x.zeroize();
|
||||
self.y_minus_x.zeroize();
|
||||
self.xy2d.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
/// A pre-computed point on the \\( \mathbb P\^3 \\) model for the
|
||||
/// curve, represented as \\((Y+X, Y-X, Z, 2dXY)\\) in "Niels coordinates".
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ProjectiveNielsPoint {
|
||||
pub Y_plus_X: FieldElement,
|
||||
pub Y_minus_X: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
pub T2d: FieldElement,
|
||||
}
|
||||
|
||||
impl Zeroize for ProjectiveNielsPoint {
|
||||
fn zeroize(&mut self) {
|
||||
self.Y_plus_X.zeroize();
|
||||
self.Y_minus_X.zeroize();
|
||||
self.Z.zeroize();
|
||||
self.T2d.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Constructors
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
impl Identity for ProjectivePoint {
|
||||
fn identity() -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
X: FieldElement::zero(),
|
||||
Y: FieldElement::one(),
|
||||
Z: FieldElement::one(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for ProjectiveNielsPoint {
|
||||
fn identity() -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint{
|
||||
Y_plus_X: FieldElement::one(),
|
||||
Y_minus_X: FieldElement::one(),
|
||||
Z: FieldElement::one(),
|
||||
T2d: FieldElement::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProjectiveNielsPoint {
|
||||
fn default() -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for AffineNielsPoint {
|
||||
fn identity() -> AffineNielsPoint {
|
||||
AffineNielsPoint{
|
||||
y_plus_x: FieldElement::one(),
|
||||
y_minus_x: FieldElement::one(),
|
||||
xy2d: FieldElement::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AffineNielsPoint {
|
||||
fn default() -> AffineNielsPoint {
|
||||
AffineNielsPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Validity checks (for debugging, not CT)
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ValidityCheck for ProjectivePoint {
|
||||
fn is_valid(&self) -> bool {
|
||||
// Curve equation is -x^2 + y^2 = 1 + d*x^2*y^2,
|
||||
// homogenized as (-X^2 + Y^2)*Z^2 = Z^4 + d*X^2*Y^2
|
||||
let XX = self.X.square();
|
||||
let YY = self.Y.square();
|
||||
let ZZ = self.Z.square();
|
||||
let ZZZZ = ZZ.square();
|
||||
let lhs = &(&YY - &XX) * &ZZ;
|
||||
let rhs = &ZZZZ + &(&constants::EDWARDS_D * &(&XX * &YY));
|
||||
|
||||
lhs == rhs
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Constant-time assignment
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ConditionallySelectable for ProjectiveNielsPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
ProjectiveNielsPoint {
|
||||
Y_plus_X: FieldElement::conditional_select(&a.Y_plus_X, &b.Y_plus_X, choice),
|
||||
Y_minus_X: FieldElement::conditional_select(&a.Y_minus_X, &b.Y_minus_X, choice),
|
||||
Z: FieldElement::conditional_select(&a.Z, &b.Z, choice),
|
||||
T2d: FieldElement::conditional_select(&a.T2d, &b.T2d, choice),
|
||||
}
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.Y_plus_X.conditional_assign(&other.Y_plus_X, choice);
|
||||
self.Y_minus_X.conditional_assign(&other.Y_minus_X, choice);
|
||||
self.Z.conditional_assign(&other.Z, choice);
|
||||
self.T2d.conditional_assign(&other.T2d, choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for AffineNielsPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
AffineNielsPoint {
|
||||
y_plus_x: FieldElement::conditional_select(&a.y_plus_x, &b.y_plus_x, choice),
|
||||
y_minus_x: FieldElement::conditional_select(&a.y_minus_x, &b.y_minus_x, choice),
|
||||
xy2d: FieldElement::conditional_select(&a.xy2d, &b.xy2d, choice),
|
||||
}
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.y_plus_x.conditional_assign(&other.y_plus_x, choice);
|
||||
self.y_minus_x.conditional_assign(&other.y_minus_x, choice);
|
||||
self.xy2d.conditional_assign(&other.xy2d, choice);
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Point conversions
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ProjectivePoint {
|
||||
/// Convert this point from the \\( \mathbb P\^2 \\) model to the
|
||||
/// \\( \mathbb P\^3 \\) model.
|
||||
///
|
||||
/// This costs \\(3 \mathrm M + 1 \mathrm S\\).
|
||||
pub fn to_extended(&self) -> EdwardsPoint {
|
||||
EdwardsPoint {
|
||||
X: &self.X * &self.Z,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: self.Z.square(),
|
||||
T: &self.X * &self.Y,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CompletedPoint {
|
||||
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
|
||||
/// \\) model to the \\( \mathbb P\^2 \\) model.
|
||||
///
|
||||
/// This costs \\(3 \mathrm M \\).
|
||||
pub fn to_projective(&self) -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
X: &self.X * &self.T,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: &self.Z * &self.T,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
|
||||
/// \\) model to the \\( \mathbb P\^3 \\) model.
|
||||
///
|
||||
/// This costs \\(4 \mathrm M \\).
|
||||
pub fn to_extended(&self) -> EdwardsPoint {
|
||||
EdwardsPoint {
|
||||
X: &self.X * &self.T,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: &self.Z * &self.T,
|
||||
T: &self.X * &self.Y,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Doubling
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ProjectivePoint {
|
||||
/// Double this point: return self + self
|
||||
pub fn double(&self) -> CompletedPoint { // Double()
|
||||
let XX = self.X.square();
|
||||
let YY = self.Y.square();
|
||||
let ZZ2 = self.Z.square2();
|
||||
let X_plus_Y = &self.X + &self.Y;
|
||||
let X_plus_Y_sq = X_plus_Y.square();
|
||||
let YY_plus_XX = &YY + &XX;
|
||||
let YY_minus_XX = &YY - &XX;
|
||||
|
||||
CompletedPoint{
|
||||
X: &X_plus_Y_sq - &YY_plus_XX,
|
||||
Y: YY_plus_XX,
|
||||
Z: YY_minus_XX,
|
||||
T: &ZZ2 - &YY_minus_XX
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Addition and Subtraction
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
// XXX(hdevalence) These were doc(hidden) so they don't appear in the
|
||||
// public API docs.
|
||||
// However, that prevents them being used with --document-private-items,
|
||||
// so comment out the doc(hidden) for now until this is resolved
|
||||
//
|
||||
// upstream rust issue: https://github.com/rust-lang/rust/issues/46380
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Add<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn add(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PP = &Y_plus_X * &other.Y_plus_X;
|
||||
let MM = &Y_minus_X * &other.Y_minus_X;
|
||||
let TT2d = &self.T * &other.T2d;
|
||||
let ZZ = &self.Z * &other.Z;
|
||||
let ZZ2 = &ZZ + &ZZ;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PP - &MM,
|
||||
Y: &PP + &MM,
|
||||
Z: &ZZ2 + &TT2d,
|
||||
T: &ZZ2 - &TT2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Sub<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn sub(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PM = &Y_plus_X * &other.Y_minus_X;
|
||||
let MP = &Y_minus_X * &other.Y_plus_X;
|
||||
let TT2d = &self.T * &other.T2d;
|
||||
let ZZ = &self.Z * &other.Z;
|
||||
let ZZ2 = &ZZ + &ZZ;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PM - &MP,
|
||||
Y: &PM + &MP,
|
||||
Z: &ZZ2 - &TT2d,
|
||||
T: &ZZ2 + &TT2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Add<&'b AffineNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn add(self, other: &'b AffineNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PP = &Y_plus_X * &other.y_plus_x;
|
||||
let MM = &Y_minus_X * &other.y_minus_x;
|
||||
let Txy2d = &self.T * &other.xy2d;
|
||||
let Z2 = &self.Z + &self.Z;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PP - &MM,
|
||||
Y: &PP + &MM,
|
||||
Z: &Z2 + &Txy2d,
|
||||
T: &Z2 - &Txy2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Sub<&'b AffineNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn sub(self, other: &'b AffineNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PM = &Y_plus_X * &other.y_minus_x;
|
||||
let MP = &Y_minus_X * &other.y_plus_x;
|
||||
let Txy2d = &self.T * &other.xy2d;
|
||||
let Z2 = &self.Z + &self.Z;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PM - &MP,
|
||||
Y: &PM + &MP,
|
||||
Z: &Z2 - &Txy2d,
|
||||
T: &Z2 + &Txy2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Negation
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl<'a> Neg for &'a ProjectiveNielsPoint {
|
||||
type Output = ProjectiveNielsPoint;
|
||||
|
||||
fn neg(self) -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint{
|
||||
Y_plus_X: self.Y_minus_X,
|
||||
Y_minus_X: self.Y_plus_X,
|
||||
Z: self.Z,
|
||||
T2d: -(&self.T2d),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a AffineNielsPoint {
|
||||
type Output = AffineNielsPoint;
|
||||
|
||||
fn neg(self) -> AffineNielsPoint {
|
||||
AffineNielsPoint{
|
||||
y_plus_x: self.y_minus_x,
|
||||
y_minus_x: self.y_plus_x,
|
||||
xy2d: -(&self.xy2d)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Debug traits
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl Debug for ProjectivePoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "ProjectivePoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?}\n}}",
|
||||
&self.X, &self.Y, &self.Z)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for CompletedPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "CompletedPoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?},\n\tT: {:?}\n}}",
|
||||
&self.X, &self.Y, &self.Z, &self.T)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for AffineNielsPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "AffineNielsPoint{{\n\ty_plus_x: {:?},\n\ty_minus_x: {:?},\n\txy2d: {:?}\n}}",
|
||||
&self.y_plus_x, &self.y_minus_x, &self.xy2d)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for ProjectiveNielsPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "ProjectiveNielsPoint{{\n\tY_plus_X: {:?},\n\tY_minus_X: {:?},\n\tZ: {:?},\n\tT2d: {:?}\n}}",
|
||||
&self.Y_plus_X, &self.Y_minus_X, &self.Z, &self.T2d)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Serial implementations of field, scalar, point arithmetic.
|
||||
//!
|
||||
//! When the vector backend is disabled, the crate uses the
|
||||
//! mixed-model strategy for implementing point operations and scalar
|
||||
//! multiplication; see the [`curve_models`](self::curve_models) and
|
||||
//! [`scalar_mul`](self::scalar_mul) documentation for more
|
||||
//! information.
|
||||
//!
|
||||
//! When the vector backend is enabled, the field and scalar
|
||||
//! implementations are still used for non-vectorized operations.
|
||||
//!
|
||||
//! Note: at this time the `u32` and `u64` backends cannot be built
|
||||
//! together.
|
||||
|
||||
#[cfg(not(any(feature = "u32_backend", feature = "u64_backend")))]
|
||||
compile_error!(
|
||||
"no curve25519-dalek backend cargo feature enabled! \
|
||||
please enable one of: u32_backend, u64_backend"
|
||||
);
|
||||
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub mod u32;
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub mod u64;
|
||||
|
||||
pub mod curve_models;
|
||||
|
||||
#[cfg(not(all(
|
||||
feature = "simd_backend",
|
||||
any(target_feature = "avx2", target_feature = "avx512ifma")
|
||||
)))]
|
||||
pub mod scalar_mul;
|
|
@ -1,30 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Implementations of various scalar multiplication algorithms.
|
||||
//!
|
||||
//! Note that all of these implementations use serial code for field
|
||||
//! arithmetic with the multi-model strategy described in the
|
||||
//! `curve_models` module. The vectorized AVX2 backend has its own
|
||||
//! scalar multiplication implementations, since it only uses one
|
||||
//! curve model.
|
||||
|
||||
pub mod variable_base;
|
||||
|
||||
pub mod vartime_double_base;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod precomputed_straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod pippenger;
|
|
@ -1,202 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Oleg Andreev
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Oleg Andreev <oleganza@gmail.com>
|
||||
|
||||
//! Implementation of a variant of Pippenger's algorithm.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::VartimeMultiscalarMul;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Implements a version of Pippenger's algorithm.
|
||||
///
|
||||
/// The algorithm works as follows:
|
||||
///
|
||||
/// Let `n` be a number of point-scalar pairs.
|
||||
/// Let `w` be a window of bits (6..8, chosen based on `n`, see cost factor).
|
||||
///
|
||||
/// 1. Prepare `2^(w-1)` buckets with indices `[1..2^(w-1)]` initialized with identity points.
|
||||
///    Bucket 0 is not needed as it would contain points multiplied by 0.
|
||||
/// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`.
|
||||
///    Note: only the last digit may equal `2^w/2`.
|
||||
/// 3. Starting with the last window, for each point `i=[0..n)` add it to a bucket indexed by
|
||||
///    the point's scalar's value in the window.
|
||||
/// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each
|
||||
///    by their index. Efficient way of doing it is to start with the last bucket and compute two sums:
|
||||
///    intermediate sum from the last to the first, and the full sum made of all intermediate sums.
|
||||
/// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings.
|
||||
/// 6. Add to the return value.
|
||||
/// 7. Repeat the loop.
|
||||
///
|
||||
/// Approximate cost w/o wNAF optimizations (A = addition, D = doubling):
|
||||
///
|
||||
/// ```ascii
|
||||
/// cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w
|
||||
///          |          |       |     |   |
|
||||
///          |          |       |     |   looping over 256/w windows
|
||||
///          |          |       |     adding to the result
|
||||
///    sorting points   |       shifting the sum by w bits (to the next window, starting from last window)
|
||||
///    one by one       |
|
||||
///    into buckets     adding/subtracting all buckets
|
||||
///                     multiplied by their indexes
|
||||
///                     using a sum of intermediate sums
|
||||
/// ```
|
||||
///
|
||||
/// For large `n`, dominant factor is (n*256/w) additions.
|
||||
/// However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate.
|
||||
/// Therefore, the optimal choice of `w` grows slowly as `n` grows.
|
||||
///
|
||||
/// This algorithm is adapted from section 4 of <https://eprint.iacr.org/2012/549.pdf>.
|
||||
pub struct Pippenger;
|
||||
|
||||
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
    type Point = EdwardsPoint;

    /// Variable-time multiscalar multiplication using bucketed signed
    /// radix-`2^w` digits.
    ///
    /// The window width `w` (6, 7 or 8) is picked from the lower bound of the
    /// scalar iterator's `size_hint`. Scalars and points are paired with
    /// `zip`, so the longer input is truncated to the shorter one.
    ///
    /// Returns `None` as soon as a `None` point is encountered while pairing.
    fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
    where
        I: IntoIterator,
        I::Item: Borrow<Scalar>,
        J: IntoIterator<Item = Option<EdwardsPoint>>,
    {
        use traits::Identity;

        let scalars = scalars.into_iter();
        let size = scalars.size_hint().0;

        // Digit width in bits. Wider digits mean fewer point additions, but
        // the number of buckets (and bucket additions) grows exponentially.
        let w = match size {
            0..=499 => 6,
            500..=799 => 7,
            _ => 8,
        };

        let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
        // Digits are signed and centered, hence 2^w/2 buckets (no 0-th bucket).
        let buckets_count: usize = (1usize << w) / 2;

        // Buffer the recoded scalars and converted points: the whole set is
        // scanned once per digit position.
        let scalars_points = scalars
            .map(|s| s.borrow().to_radix_2w(w))
            .zip(
                points
                    .into_iter()
                    .map(|p| p.map(|pt| pt.to_projective_niels())),
            )
            .map(|(digits, maybe_pt)| maybe_pt.map(|pt| (digits, pt)))
            .collect::<Option<Vec<_>>>()?;

        // buckets[i] accumulates the points whose digit has magnitude (i+1).
        let mut buckets: Vec<EdwardsPoint> = (0..buckets_count)
            .map(|_| EdwardsPoint::identity())
            .collect();

        let mut columns = (0..digits_count).rev().map(|digit_index| {
            // Start every digit position from empty buckets.
            for slot in buckets.iter_mut() {
                *slot = EdwardsPoint::identity();
            }

            // Sort each point into the bucket named by its digit: positive
            // digits add the point, negative digits subtract it.
            // Note: with precomputed lookup tables we would instead add or
            // subtract the point premultiplied by the digit to buckets[0].
            for (digits, pt) in scalars_points.iter() {
                // Widen the digit so that w=8 cannot hit an i8 edge case.
                let digit = digits[digit_index] as i16;
                if digit == 0 {
                    continue;
                }
                let slot = (digit.abs() - 1) as usize;
                let updated = if digit > 0 {
                    &buckets[slot] + pt
                } else {
                    &buckets[slot] - pt
                };
                buckets[slot] = updated.to_extended();
            }

            // Weight each bucket by its index using two running sums: a
            // suffix sum walking from the last bucket to the first, and the
            // sum of all those suffix sums.
            //
            // For buckets 1*A, 2*B, 3*C this adds up
            //   C
            //   C B
            //   C B A   Sum = C + (C+B) + (C+B+A)
            let mut suffix_sum = buckets[buckets_count - 1];
            let mut weighted_sum = suffix_sum;
            for bucket in buckets[..buckets_count - 1].iter().rev() {
                suffix_sum += *bucket;
                weighted_sum += suffix_sum;
            }

            weighted_sum
        });

        // Seed the fold with the highest column so no time is spent doubling
        // the identity. `unwrap()` always succeeds because there is at least
        // one digit.
        let top_column = columns.next().unwrap();
        let shift = w as u32;

        Some(columns.fold(top_column, |acc, column| acc.mul_by_pow_2(shift) + column))
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;
    use constants;
    use scalar::Scalar;

    /// Checks Pippenger against the plain sum of individually multiplied
    /// points for input sizes 512, 256, ..., 1.
    #[test]
    fn test_vartime_pippenger() {
        // The same points and scalars are reused for every input size.
        let mut n = 512;
        let x = Scalar::from(2128506u64).invert();
        let y = Scalar::from(4443282u64).invert();

        let points: Vec<_> = (0..n)
            .map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
            .collect();
        // Fast way to make ~random but deterministic scalars.
        let scalars: Vec<_> = (0..n)
            .map(|i| x + (Scalar::from(i as u64) * y))
            .collect();

        let premultiplied: Vec<EdwardsPoint> = scalars
            .iter()
            .zip(points.iter())
            .map(|(sc, pt)| sc * pt)
            .collect();

        while n > 0 {
            let expected: EdwardsPoint = premultiplied[..n].iter().sum();
            let actual =
                Pippenger::vartime_multiscalar_mul(scalars[..n].to_vec(), points[..n].to_vec());

            assert_eq!(actual.compress(), expected.compress());

            n /= 2;
        }
    }
}
|
|
@ -1,110 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Henry de Valence.
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Precomputation for Straus's method.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use backend::serial::curve_models::{
|
||||
AffineNielsPoint, CompletedPoint, ProjectiveNielsPoint, ProjectivePoint,
|
||||
};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::Identity;
|
||||
use traits::VartimePrecomputedMultiscalarMul;
|
||||
use window::{NafLookupTable5, NafLookupTable8};
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Precomputed state for variable-time Straus multiscalar multiplication.
///
/// Built once from a set of static points; the stored tables are then reused
/// by every call that mixes those static points with dynamic ones.
pub struct VartimePrecomputedStraus {
|
||||
/// One `NafLookupTable8<AffineNielsPoint>` per static point, in the order
/// the points were supplied at construction.
static_lookup_tables: Vec<NafLookupTable8<AffineNielsPoint>>,
|
||||
}
|
||||
|
||||
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
fn new<I>(static_points: I) -> Self
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Self::Point>,
|
||||
{
|
||||
Self {
|
||||
static_lookup_tables: static_points
|
||||
.into_iter()
|
||||
.map(|P| NafLookupTable8::<AffineNielsPoint>::from(P.borrow()))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
fn optional_mixed_multiscalar_mul<I, J, K>(
|
||||
&self,
|
||||
static_scalars: I,
|
||||
dynamic_scalars: J,
|
||||
dynamic_points: K,
|
||||
) -> Option<Self::Point>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Scalar>,
|
||||
K: IntoIterator<Item = Option<Self::Point>>,
|
||||
{
|
||||
let static_nafs = static_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
let dynamic_nafs: Vec<_> = dynamic_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dynamic_lookup_tables = dynamic_points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let sp = self.static_lookup_tables.len();
|
||||
let dp = dynamic_lookup_tables.len();
|
||||
assert_eq!(sp, static_nafs.len());
|
||||
assert_eq!(dp, dynamic_nafs.len());
|
||||
|
||||
// We could save some doublings by looking for the highest
|
||||
// nonzero NAF coefficient, but since we might have a lot of
|
||||
// them to search, it's not clear it's worthwhile to check.
|
||||
let mut S = ProjectivePoint::identity();
|
||||
for j in (0..256).rev() {
|
||||
let mut R: CompletedPoint = S.double();
|
||||
|
||||
for i in 0..dp {
|
||||
let t_ij = dynamic_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R.to_extended() + &dynamic_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R.to_extended() - &dynamic_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..sp {
|
||||
let t_ij = static_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R.to_extended() + &self.static_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R.to_extended() - &self.static_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
|
||||
S = R.to_projective();
|
||||
}
|
||||
|
||||
Some(S.to_extended())
|
||||
}
|
||||
}
|
|
@ -1,195 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Implementation of the interleaved window method, also known as Straus' method.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::MultiscalarMul;
|
||||
use traits::VartimeMultiscalarMul;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Perform multiscalar multiplication by the interleaved window
|
||||
/// method, also known as Straus' method (since it was apparently
|
||||
/// [first published][solution] by Straus in 1964, as a solution to [a
|
||||
/// problem][problem] posted in the American Mathematical Monthly in
|
||||
/// 1963).
|
||||
///
|
||||
/// It is easy enough to reinvent, and has been repeatedly. The basic
|
||||
/// idea is that when computing
|
||||
/// \\[
|
||||
/// Q = s_1 P_1 + \cdots + s_n P_n
|
||||
/// \\]
|
||||
/// by means of additions and doublings, the doublings can be shared
|
||||
/// across the \\( P_i \\).
|
||||
///
|
||||
/// We implement two versions, a constant-time algorithm using fixed
|
||||
/// windows and a variable-time algorithm using sliding windows. They
|
||||
/// are slight variations on the same idea, and are described in more
|
||||
/// detail in the respective implementations: the constant-time version
/// is the `MultiscalarMul` impl and the variable-time version is the
/// `VartimeMultiscalarMul` impl.
|
||||
///
|
||||
/// [solution]: https://www.jstor.org/stable/2310929
|
||||
/// [problem]: https://www.jstor.org/stable/2312273
|
||||
pub struct Straus {}
|
||||
|
||||
impl MultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
/// Constant-time Straus using a fixed window of size \\(4\\).
|
||||
///
|
||||
/// Our goal is to compute
|
||||
/// \\[
|
||||
/// Q = s_1 P_1 + \cdots + s_n P_n.
|
||||
/// \\]
|
||||
///
|
||||
/// For each point \\( P_i \\), precompute a lookup table of
|
||||
/// \\[
|
||||
/// P_i, 2P_i, 3P_i, 4P_i, 5P_i, 6P_i, 7P_i, 8P_i.
|
||||
/// \\]
|
||||
///
|
||||
/// For each scalar \\( s_i \\), compute its radix-\\(2^4\\)
|
||||
/// signed digits \\( s_{i,j} \\), i.e.,
|
||||
/// \\[
|
||||
/// s_i = s_{i,0} + s_{i,1} 16^1 + ... + s_{i,63} 16^{63},
|
||||
/// \\]
|
||||
/// with \\( -8 \leq s_{i,j} < 8 \\). Since \\( 0 \leq |s_{i,j}|
|
||||
/// \leq 8 \\), we can retrieve \\( s_{i,j} P_i \\) from the
|
||||
/// lookup table with a conditional negation: using signed
|
||||
/// digits halves the required table size.
|
||||
///
|
||||
/// Then as in the single-base fixed window case, we have
|
||||
/// \\[
|
||||
/// \begin{aligned}
|
||||
/// s_i P_i &= P_i (s_{i,0} + s_{i,1} 16^1 + \cdots + s_{i,63} 16^{63}) \\\\
|
||||
/// s_i P_i &= P_i s_{i,0} + P_i s_{i,1} 16^1 + \cdots + P_i s_{i,63} 16^{63} \\\\
|
||||
/// s_i P_i &= P_i s_{i,0} + 16(P_i s_{i,1} + 16( \cdots +16P_i s_{i,63})\cdots )
|
||||
/// \end{aligned}
|
||||
/// \\]
|
||||
/// so each \\( s_i P_i \\) can be computed by alternately adding
|
||||
/// a precomputed multiple \\( P_i s_{i,j} \\) of \\( P_i \\) and
|
||||
/// repeatedly doubling.
|
||||
///
|
||||
/// Now consider the two-dimensional sum
|
||||
/// \\[
|
||||
/// \begin{aligned}
|
||||
/// s\_1 P\_1 &=& P\_1 s\_{1,0} &+& 16 (P\_1 s\_{1,1} &+& 16 ( \cdots &+& 16 P\_1 s\_{1,63}&) \cdots ) \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// s\_2 P\_2 &=& P\_2 s\_{2,0} &+& 16 (P\_2 s\_{2,1} &+& 16 ( \cdots &+& 16 P\_2 s\_{2,63}&) \cdots ) \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// \vdots & & \vdots & & \vdots & & & & \vdots & \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// s\_n P\_n &=& P\_n s\_{n,0} &+& 16 (P\_n s\_{n,1} &+& 16 ( \cdots &+& 16 P\_n s\_{n,63}&) \cdots )
|
||||
/// \end{aligned}
|
||||
/// \\]
|
||||
/// The sum of the left-hand column is the result \\( Q \\); by
|
||||
/// computing the two-dimensional sum on the right column-wise,
|
||||
/// top-to-bottom, then right-to-left, we need to multiply by \\(
|
||||
/// 16\\) only once per column, sharing the doublings across all
|
||||
/// of the input points.
|
||||
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<EdwardsPoint>,
|
||||
{
|
||||
use zeroize::Zeroizing;
|
||||
|
||||
use backend::serial::curve_models::ProjectiveNielsPoint;
|
||||
use window::LookupTable;
|
||||
use traits::Identity;
|
||||
|
||||
let lookup_tables: Vec<_> = points
|
||||
.into_iter()
|
||||
.map(|point| LookupTable::<ProjectiveNielsPoint>::from(point.borrow()))
|
||||
.collect();
|
||||
|
||||
// This puts the scalar digits into a heap-allocated Vec.
|
||||
// To ensure that these are erased, pass ownership of the Vec into a
|
||||
// Zeroizing wrapper.
|
||||
let scalar_digits_vec: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|s| s.borrow().to_radix_16())
|
||||
.collect();
|
||||
let scalar_digits = Zeroizing::new(scalar_digits_vec);
|
||||
|
||||
let mut Q = EdwardsPoint::identity();
|
||||
for j in (0..64).rev() {
|
||||
Q = Q.mul_by_pow_2(4);
|
||||
let it = scalar_digits.iter().zip(lookup_tables.iter());
|
||||
for (s_i, lookup_table_i) in it {
|
||||
// R_i = s_{i,j} * P_i
|
||||
let R_i = lookup_table_i.select(s_i[j]);
|
||||
// Q = Q + R_i
|
||||
Q = (&Q + &R_i).to_extended();
|
||||
}
|
||||
}
|
||||
|
||||
Q
|
||||
}
|
||||
}
|
||||
|
||||
impl VartimeMultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
/// Variable-time Straus using a non-adjacent form of width \\(5\\).
|
||||
///
|
||||
/// This is completely similar to the constant-time code, but we
|
||||
/// use a non-adjacent form for the scalar, and do not do table
|
||||
/// lookups in constant time.
|
||||
///
|
||||
/// The non-adjacent form has signed, odd digits. Using only odd
|
||||
/// digits halves the table size (since we only need odd
|
||||
/// multiples), or gives fewer additions for the same table size.
|
||||
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator<Item = Option<EdwardsPoint>>,
|
||||
{
|
||||
use backend::serial::curve_models::{CompletedPoint, ProjectiveNielsPoint, ProjectivePoint};
|
||||
use window::NafLookupTable5;
|
||||
use traits::Identity;
|
||||
|
||||
let nafs: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect();
|
||||
|
||||
let lookup_tables = points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let mut r = ProjectivePoint::identity();
|
||||
|
||||
for i in (0..256).rev() {
|
||||
let mut t: CompletedPoint = r.double();
|
||||
|
||||
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
|
||||
if naf[i] > 0 {
|
||||
t = &t.to_extended() + &lookup_table.select(naf[i] as usize);
|
||||
} else if naf[i] < 0 {
|
||||
t = &t.to_extended() - &lookup_table.select(-naf[i] as usize);
|
||||
}
|
||||
}
|
||||
|
||||
r = t.to_projective();
|
||||
}
|
||||
|
||||
Some(r.to_extended())
|
||||
}
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
#![allow(non_snake_case)]
|
||||
|
||||
use traits::Identity;
|
||||
use scalar::Scalar;
|
||||
use edwards::EdwardsPoint;
|
||||
use backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint};
|
||||
use window::LookupTable;
|
||||
|
||||
/// Perform constant-time, variable-base scalar multiplication.
|
||||
pub(crate) fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
|
||||
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
|
||||
let lookup_table = LookupTable::<ProjectiveNielsPoint>::from(point);
|
||||
// Setting s = scalar, compute
|
||||
//
|
||||
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
|
||||
//
|
||||
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
|
||||
let scalar_digits = scalar.to_radix_16();
|
||||
// Compute s*P as
|
||||
//
|
||||
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
|
||||
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
|
||||
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
|
||||
//
|
||||
// We sum right-to-left.
|
||||
|
||||
// Unwrap first loop iteration to save computing 16*identity
|
||||
let mut tmp2;
|
||||
let mut tmp3 = EdwardsPoint::identity();
|
||||
let mut tmp1 = &tmp3 + &lookup_table.select(scalar_digits[63]);
|
||||
// Now tmp1 = s_63*P in P1xP1 coords
|
||||
for i in (0..63).rev() {
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = (prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 2*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 2*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 4*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 4*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 8*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 8*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 16*(prev) in P1xP1 coords
|
||||
tmp3 = tmp1.to_extended(); // tmp3 = 16*(prev) in P3 coords
|
||||
tmp1 = &tmp3 + &lookup_table.select(scalar_digits[i]);
|
||||
// Now tmp1 = s_i*P + 16*(prev) in P1xP1 coords
|
||||
}
|
||||
tmp1.to_extended()
|
||||
}
|
|
@ -1,61 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use constants;
|
||||
use traits::Identity;
|
||||
use scalar::Scalar;
|
||||
use edwards::EdwardsPoint;
|
||||
use backend::serial::curve_models::{ProjectiveNielsPoint, ProjectivePoint};
|
||||
use window::NafLookupTable5;
|
||||
|
||||
/// Compute \\(aA + bB\\) in variable time, where \\(B\\) is the Ed25519 basepoint.
|
||||
pub fn mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint {
|
||||
let a_naf = a.non_adjacent_form(5);
|
||||
let b_naf = b.non_adjacent_form(8);
|
||||
|
||||
// Find starting index
|
||||
let mut i: usize = 255;
|
||||
for j in (0..256).rev() {
|
||||
i = j;
|
||||
if a_naf[i] != 0 || b_naf[i] != 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let table_A = NafLookupTable5::<ProjectiveNielsPoint>::from(A);
|
||||
let table_B = &constants::AFFINE_ODD_MULTIPLES_OF_BASEPOINT;
|
||||
|
||||
let mut r = ProjectivePoint::identity();
|
||||
loop {
|
||||
let mut t = r.double();
|
||||
|
||||
if a_naf[i] > 0 {
|
||||
t = &t.to_extended() + &table_A.select(a_naf[i] as usize);
|
||||
} else if a_naf[i] < 0 {
|
||||
t = &t.to_extended() - &table_A.select(-a_naf[i] as usize);
|
||||
}
|
||||
|
||||
if b_naf[i] > 0 {
|
||||
t = &t.to_extended() + &table_B.select(b_naf[i] as usize);
|
||||
} else if b_naf[i] < 0 {
|
||||
t = &t.to_extended() - &table_B.select(-b_naf[i] as usize);
|
||||
}
|
||||
|
||||
r = t.to_projective();
|
||||
|
||||
if i == 0 {
|
||||
break;
|
||||
}
|
||||
i -= 1;
|
||||
}
|
||||
|
||||
r.to_extended()
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,577 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(32\\)-bit
|
||||
//! limbs with \\(64\\)-bit products.
|
||||
//!
|
||||
//! This code was originally derived from Adam Langley's Golang ed25519
|
||||
//! implementation, and was then rewritten to use unsigned limbs instead
|
||||
//! of signed limbs.
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::Neg;
|
||||
use core::ops::{Add, AddAssign};
|
||||
use core::ops::{Mul, MulAssign};
|
||||
use core::ops::{Sub, SubAssign};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
/// A `FieldElement2625` represents an element of the field
|
||||
/// \\( \mathbb Z / (2\^{255} - 19)\\).
|
||||
///
|
||||
/// In the 32-bit implementation, a `FieldElement` is represented in
|
||||
/// radix \\(2\^{25.5}\\) as ten `u32`s. This means that a field
|
||||
/// element \\(x\\) is represented as
|
||||
/// $$
|
||||
/// x = \sum\_{i=0}\^9 x\_i 2\^{\lceil i \frac {51} 2 \rceil}
|
||||
/// = x\_0 + x\_1 2\^{26} + x\_2 2\^{51} + x\_3 2\^{77} + \cdots + x\_9 2\^{230};
|
||||
/// $$
|
||||
/// the coefficients are alternately bounded by \\(2\^{26}\\) (even
|
||||
/// \\(i\\)) and \\(2\^{25}\\) (odd \\(i\\)). The limbs are allowed to
/// grow between reductions up
|
||||
/// to \\(2\^{26+b}\\) or \\(2\^{25+b}\\) respectively, where \\(b = 1.75\\).
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// The `curve25519_dalek::field` module provides a type alias
|
||||
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
|
||||
/// or `FieldElement2625`.
|
||||
///
|
||||
/// The backend-specific type `FieldElement2625` should not be used
|
||||
/// outside of the `curve25519_dalek::field` module.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct FieldElement2625(pub (crate) [u32; 10]);
|
||||
|
||||
impl Debug for FieldElement2625 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "FieldElement2625({:?})", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
impl Zeroize for FieldElement2625 {
|
||||
fn zeroize(&mut self) {
|
||||
self.0.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> AddAssign<&'b FieldElement2625> for FieldElement2625 {
    /// Adds `_rhs` to `self` limb by limb; no carry propagation or reduction
    /// is performed here.
    fn add_assign(&mut self, _rhs: &'b FieldElement2625) {
        for (limb, addend) in self.0.iter_mut().zip(_rhs.0.iter()) {
            *limb += *addend;
        }
    }
}
|
||||
|
||||
impl<'a, 'b> Add<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn add(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
output += _rhs;
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> SubAssign<&'b FieldElement2625> for FieldElement2625 {
|
||||
fn sub_assign(&mut self, _rhs: &'b FieldElement2625) {
|
||||
// See comment in FieldElement51::Sub
|
||||
//
|
||||
// Compute a - b as ((a + 2^4 * p) - b) to avoid underflow.
|
||||
let b = &_rhs.0;
|
||||
self.0 = FieldElement2625::reduce([
|
||||
((self.0[0] + (0x3ffffed << 4)) - b[0]) as u64,
|
||||
((self.0[1] + (0x1ffffff << 4)) - b[1]) as u64,
|
||||
((self.0[2] + (0x3ffffff << 4)) - b[2]) as u64,
|
||||
((self.0[3] + (0x1ffffff << 4)) - b[3]) as u64,
|
||||
((self.0[4] + (0x3ffffff << 4)) - b[4]) as u64,
|
||||
((self.0[5] + (0x1ffffff << 4)) - b[5]) as u64,
|
||||
((self.0[6] + (0x3ffffff << 4)) - b[6]) as u64,
|
||||
((self.0[7] + (0x1ffffff << 4)) - b[7]) as u64,
|
||||
((self.0[8] + (0x3ffffff << 4)) - b[8]) as u64,
|
||||
((self.0[9] + (0x1ffffff << 4)) - b[9]) as u64,
|
||||
]).0;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn sub(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
output -= _rhs;
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> MulAssign<&'b FieldElement2625> for FieldElement2625 {
    /// Replaces `self` with `self * _rhs`, using the by-reference `Mul` impl.
    fn mul_assign(&mut self, _rhs: &'b FieldElement2625) {
        let product = &*self * _rhs;
        self.0 = product.0;
    }
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn mul(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
/// Helper function to multiply two 32-bit integers with 64 bits
|
||||
/// of output.
|
||||
#[inline(always)]
|
||||
fn m(x: u32, y: u32) -> u64 { (x as u64) * (y as u64) }
|
||||
|
||||
// Alias self, _rhs for more readable formulas
|
||||
let x: &[u32;10] = &self.0; let y: &[u32;10] = &_rhs.0;
|
||||
|
||||
// We assume that the input limbs x[i], y[i] are bounded by:
|
||||
//
|
||||
// x[i], y[i] < 2^(26 + b) if i even
|
||||
// x[i], y[i] < 2^(25 + b) if i odd
|
||||
//
|
||||
// where b is a (real) parameter representing the excess bits of
|
||||
// the limbs. We track the bitsizes of all variables through
|
||||
// the computation and solve at the end for the allowable
|
||||
// headroom bitsize b (which determines how many additions we
|
||||
// can perform between reductions or multiplications).
|
||||
|
||||
let y1_19 = 19 * y[1]; // This fits in a u32
|
||||
let y2_19 = 19 * y[2]; // iff 26 + b + lg(19) < 32
|
||||
let y3_19 = 19 * y[3]; // if b < 32 - 26 - 4.248 = 1.752
|
||||
let y4_19 = 19 * y[4];
|
||||
let y5_19 = 19 * y[5]; // below, b<2.5: this is a bottleneck,
|
||||
let y6_19 = 19 * y[6]; // could be avoided by promoting to
|
||||
let y7_19 = 19 * y[7]; // u64 here instead of in m()
|
||||
let y8_19 = 19 * y[8];
|
||||
let y9_19 = 19 * y[9];
|
||||
|
||||
// What happens when we multiply x[i] with y[j] and place the
|
||||
// result into the (i+j)-th limb?
|
||||
//
|
||||
// x[i] represents the value x[i]*2^ceil(i*51/2)
|
||||
// y[j] represents the value y[j]*2^ceil(j*51/2)
|
||||
// z[i+j] represents the value z[i+j]*2^ceil((i+j)*51/2)
|
||||
// x[i]*y[j] represents the value x[i]*y[j]*2^(ceil(i*51/2)+ceil(j*51/2))
|
||||
//
|
||||
// Since the radix is already accounted for, the result placed
|
||||
// into the (i+j)-th limb should be
|
||||
//
|
||||
// x[i]*y[j]*2^(ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2)).
|
||||
//
|
||||
// The value of ceil(i*51/2)+ceil(j*51/2) - ceil((i+j)*51/2) is
|
||||
// 1 when both i and j are odd, and 0 otherwise. So we add
|
||||
//
|
||||
// x[i]*y[j] if either i or j is even
|
||||
// 2*x[i]*y[j] if i and j are both odd
|
||||
//
|
||||
// by using precomputed multiples of x[i] for odd i:
|
||||
|
||||
let x1_2 = 2 * x[1]; // This fits in a u32 iff 25 + b + 1 < 32
|
||||
let x3_2 = 2 * x[3]; // iff b < 6
|
||||
let x5_2 = 2 * x[5];
|
||||
let x7_2 = 2 * x[7];
|
||||
let x9_2 = 2 * x[9];
|
||||
|
||||
let z0 = m(x[0],y[0]) + m(x1_2,y9_19) + m(x[2],y8_19) + m(x3_2,y7_19) + m(x[4],y6_19) + m(x5_2,y5_19) + m(x[6],y4_19) + m(x7_2,y3_19) + m(x[8],y2_19) + m(x9_2,y1_19);
|
||||
let z1 = m(x[0],y[1]) + m(x[1],y[0]) + m(x[2],y9_19) + m(x[3],y8_19) + m(x[4],y7_19) + m(x[5],y6_19) + m(x[6],y5_19) + m(x[7],y4_19) + m(x[8],y3_19) + m(x[9],y2_19);
|
||||
let z2 = m(x[0],y[2]) + m(x1_2,y[1]) + m(x[2],y[0]) + m(x3_2,y9_19) + m(x[4],y8_19) + m(x5_2,y7_19) + m(x[6],y6_19) + m(x7_2,y5_19) + m(x[8],y4_19) + m(x9_2,y3_19);
|
||||
let z3 = m(x[0],y[3]) + m(x[1],y[2]) + m(x[2],y[1]) + m(x[3],y[0]) + m(x[4],y9_19) + m(x[5],y8_19) + m(x[6],y7_19) + m(x[7],y6_19) + m(x[8],y5_19) + m(x[9],y4_19);
|
||||
let z4 = m(x[0],y[4]) + m(x1_2,y[3]) + m(x[2],y[2]) + m(x3_2,y[1]) + m(x[4],y[0]) + m(x5_2,y9_19) + m(x[6],y8_19) + m(x7_2,y7_19) + m(x[8],y6_19) + m(x9_2,y5_19);
|
||||
let z5 = m(x[0],y[5]) + m(x[1],y[4]) + m(x[2],y[3]) + m(x[3],y[2]) + m(x[4],y[1]) + m(x[5],y[0]) + m(x[6],y9_19) + m(x[7],y8_19) + m(x[8],y7_19) + m(x[9],y6_19);
|
||||
let z6 = m(x[0],y[6]) + m(x1_2,y[5]) + m(x[2],y[4]) + m(x3_2,y[3]) + m(x[4],y[2]) + m(x5_2,y[1]) + m(x[6],y[0]) + m(x7_2,y9_19) + m(x[8],y8_19) + m(x9_2,y7_19);
|
||||
let z7 = m(x[0],y[7]) + m(x[1],y[6]) + m(x[2],y[5]) + m(x[3],y[4]) + m(x[4],y[3]) + m(x[5],y[2]) + m(x[6],y[1]) + m(x[7],y[0]) + m(x[8],y9_19) + m(x[9],y8_19);
|
||||
let z8 = m(x[0],y[8]) + m(x1_2,y[7]) + m(x[2],y[6]) + m(x3_2,y[5]) + m(x[4],y[4]) + m(x5_2,y[3]) + m(x[6],y[2]) + m(x7_2,y[1]) + m(x[8],y[0]) + m(x9_2,y9_19);
|
||||
let z9 = m(x[0],y[9]) + m(x[1],y[8]) + m(x[2],y[7]) + m(x[3],y[6]) + m(x[4],y[5]) + m(x[5],y[4]) + m(x[6],y[3]) + m(x[7],y[2]) + m(x[8],y[1]) + m(x[9],y[0]);
|
||||
|
||||
// How big is the contribution to z[i+j] from x[i], y[j]?
|
||||
//
|
||||
// Using the bounds above, we get:
|
||||
//
|
||||
// i even, j even: x[i]*y[j] < 2^(26+b)*2^(26+b) = 2*2^(51+2*b)
|
||||
// i odd, j even: x[i]*y[j] < 2^(25+b)*2^(26+b) = 1*2^(51+2*b)
|
||||
// i even, j odd: x[i]*y[j] < 2^(26+b)*2^(25+b) = 1*2^(51+2*b)
|
||||
// i odd, j odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
|
||||
//
|
||||
// We perform inline reduction mod p by replacing 2^255 by 19
|
||||
// (since 2^255 - 19 = 0 mod p). This adds a factor of 19, so
|
||||
// we get the bounds (z0 is the biggest one, but calculated for
|
||||
// posterity here in case finer estimation is needed later):
|
||||
//
|
||||
// z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
|
||||
// z1 < ( 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
|
||||
// z2 < ( 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
|
||||
// z3 < ( 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
|
||||
// z4 < ( 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
|
||||
// z5 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 82*2^(51 + 2*b)
|
||||
// z6 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 87*2^(51 + 2*b)
|
||||
// z7 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 )*2^(51 + 2b) = 46*2^(51 + 2*b)
|
||||
// z8 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 )*2^(51 + 2b) = 33*2^(51 + 2*b)
|
||||
// z9 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 )*2^(51 + 2b) = 10*2^(51 + 2*b)
|
||||
//
|
||||
// So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64
|
||||
// if b < 2.5.
|
||||
FieldElement2625::reduce([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
|
||||
}
|
||||
}
|
||||
|
||||
/// Unary minus on a borrowed field element.
///
/// The operand is left untouched: it is copied, the copy is negated in
/// place with `negate()`, and the copy is returned.
impl<'a> Neg for &'a FieldElement2625 {
    type Output = FieldElement2625;

    fn neg(self) -> FieldElement2625 {
        let mut negated = *self;
        negated.negate();
        negated
    }
}
|
||||
|
||||
impl ConditionallySelectable for FieldElement2625 {
|
||||
fn conditional_select(
|
||||
a: &FieldElement2625,
|
||||
b: &FieldElement2625,
|
||||
choice: Choice,
|
||||
) -> FieldElement2625 {
|
||||
FieldElement2625([
|
||||
u32::conditional_select(&a.0[0], &b.0[0], choice),
|
||||
u32::conditional_select(&a.0[1], &b.0[1], choice),
|
||||
u32::conditional_select(&a.0[2], &b.0[2], choice),
|
||||
u32::conditional_select(&a.0[3], &b.0[3], choice),
|
||||
u32::conditional_select(&a.0[4], &b.0[4], choice),
|
||||
u32::conditional_select(&a.0[5], &b.0[5], choice),
|
||||
u32::conditional_select(&a.0[6], &b.0[6], choice),
|
||||
u32::conditional_select(&a.0[7], &b.0[7], choice),
|
||||
u32::conditional_select(&a.0[8], &b.0[8], choice),
|
||||
u32::conditional_select(&a.0[9], &b.0[9], choice),
|
||||
])
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &FieldElement2625, choice: Choice) {
|
||||
self.0[0].conditional_assign(&other.0[0], choice);
|
||||
self.0[1].conditional_assign(&other.0[1], choice);
|
||||
self.0[2].conditional_assign(&other.0[2], choice);
|
||||
self.0[3].conditional_assign(&other.0[3], choice);
|
||||
self.0[4].conditional_assign(&other.0[4], choice);
|
||||
self.0[5].conditional_assign(&other.0[5], choice);
|
||||
self.0[6].conditional_assign(&other.0[6], choice);
|
||||
self.0[7].conditional_assign(&other.0[7], choice);
|
||||
self.0[8].conditional_assign(&other.0[8], choice);
|
||||
self.0[9].conditional_assign(&other.0[9], choice);
|
||||
}
|
||||
|
||||
fn conditional_swap(a: &mut FieldElement2625, b: &mut FieldElement2625, choice: Choice) {
|
||||
u32::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
|
||||
u32::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
|
||||
u32::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
|
||||
u32::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
|
||||
u32::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
|
||||
u32::conditional_swap(&mut a.0[5], &mut b.0[5], choice);
|
||||
u32::conditional_swap(&mut a.0[6], &mut b.0[6], choice);
|
||||
u32::conditional_swap(&mut a.0[7], &mut b.0[7], choice);
|
||||
u32::conditional_swap(&mut a.0[8], &mut b.0[8], choice);
|
||||
u32::conditional_swap(&mut a.0[9], &mut b.0[9], choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement2625 {
|
||||
/// Invert the sign of this field element
|
||||
pub fn negate(&mut self) {
|
||||
// Compute -b as ((2^4 * p) - b) to avoid underflow.
|
||||
let neg = FieldElement2625::reduce([
|
||||
((0x3ffffed << 4) - self.0[0]) as u64,
|
||||
((0x1ffffff << 4) - self.0[1]) as u64,
|
||||
((0x3ffffff << 4) - self.0[2]) as u64,
|
||||
((0x1ffffff << 4) - self.0[3]) as u64,
|
||||
((0x3ffffff << 4) - self.0[4]) as u64,
|
||||
((0x1ffffff << 4) - self.0[5]) as u64,
|
||||
((0x3ffffff << 4) - self.0[6]) as u64,
|
||||
((0x1ffffff << 4) - self.0[7]) as u64,
|
||||
((0x3ffffff << 4) - self.0[8]) as u64,
|
||||
((0x1ffffff << 4) - self.0[9]) as u64,
|
||||
]);
|
||||
self.0 = neg.0;
|
||||
}
|
||||
|
||||
/// Construct zero.
|
||||
pub fn zero() -> FieldElement2625 {
|
||||
FieldElement2625([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ])
|
||||
}
|
||||
|
||||
/// Construct one.
|
||||
pub fn one() -> FieldElement2625 {
|
||||
FieldElement2625([ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 ])
|
||||
}
|
||||
|
||||
/// Construct -1.
|
||||
pub fn minus_one() -> FieldElement2625 {
|
||||
FieldElement2625([
|
||||
0x3ffffec, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff,
|
||||
0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff,
|
||||
])
|
||||
}
|
||||
|
||||
/// Given `k > 0`, return `self^(2^k)`.
|
||||
pub fn pow2k(&self, k: u32) -> FieldElement2625 {
|
||||
debug_assert!( k > 0 );
|
||||
let mut z = self.square();
|
||||
for _ in 1..k {
|
||||
z = z.square();
|
||||
}
|
||||
z
|
||||
}
|
||||
|
||||
/// Given unreduced coefficients `z[0], ..., z[9]` of any size,
|
||||
/// carry and reduce them mod p to obtain a `FieldElement2625`
|
||||
/// whose coefficients have excess `b < 0.007`.
|
||||
///
|
||||
/// In other words, each coefficient of the result is bounded by
|
||||
/// either `2^(25 + 0.007)` or `2^(26 + 0.007)`, as appropriate.
|
||||
fn reduce(mut z: [u64; 10]) -> FieldElement2625 {
|
||||
|
||||
const LOW_25_BITS: u64 = (1 << 25) - 1;
|
||||
const LOW_26_BITS: u64 = (1 << 26) - 1;
|
||||
|
||||
/// Carry the value from limb i = 0..8 to limb i+1
|
||||
#[inline(always)]
|
||||
fn carry(z: &mut [u64; 10], i: usize) {
|
||||
debug_assert!(i < 9);
|
||||
if i % 2 == 0 {
|
||||
// Even limbs have 26 bits
|
||||
z[i+1] += z[i] >> 26;
|
||||
z[i] &= LOW_26_BITS;
|
||||
} else {
|
||||
// Odd limbs have 25 bits
|
||||
z[i+1] += z[i] >> 25;
|
||||
z[i] &= LOW_25_BITS;
|
||||
}
|
||||
}
|
||||
|
||||
// Perform two halves of the carry chain in parallel.
|
||||
carry(&mut z, 0); carry(&mut z, 4);
|
||||
carry(&mut z, 1); carry(&mut z, 5);
|
||||
carry(&mut z, 2); carry(&mut z, 6);
|
||||
carry(&mut z, 3); carry(&mut z, 7);
|
||||
// Since z[3] < 2^64, c < 2^(64-25) = 2^39,
|
||||
// so z[4] < 2^26 + 2^39 < 2^39.0002
|
||||
carry(&mut z, 4); carry(&mut z, 8);
|
||||
// Now z[4] < 2^26
|
||||
// and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)
|
||||
|
||||
// Last carry has a multiplication by 19:
|
||||
z[0] += 19*(z[9] >> 25);
|
||||
z[9] &= LOW_25_BITS;
|
||||
|
||||
// Since z[9] < 2^64, c < 2^(64-25) = 2^39,
|
||||
// so z[0] + 19*c < 2^26 + 2^43.248 < 2^43.249.
|
||||
carry(&mut z, 0);
|
||||
// Now z[1] < 2^25 - 2^(43.249 - 26)
|
||||
// < 2^25.007 (good enough)
|
||||
// and we're done.
|
||||
|
||||
FieldElement2625([
|
||||
z[0] as u32, z[1] as u32, z[2] as u32, z[3] as u32, z[4] as u32,
|
||||
z[5] as u32, z[6] as u32, z[7] as u32, z[8] as u32, z[9] as u32,
|
||||
])
|
||||
}
|
||||
|
||||
/// Load a `FieldElement51` from the low 255 bits of a 256-bit
|
||||
/// input.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function does not check that the input used the canonical
|
||||
/// representative. It masks the high bit, but it will happily
|
||||
/// decode 2^255 - 18 to 1. Applications that require a canonical
|
||||
/// encoding of every field element should decode, re-encode to
|
||||
/// the canonical encoding, and check that the input was
|
||||
/// canonical.
|
||||
pub fn from_bytes(data: &[u8; 32]) -> FieldElement2625 { //FeFromBytes
|
||||
#[inline]
|
||||
fn load3(b: &[u8]) -> u64 {
|
||||
(b[0] as u64) | ((b[1] as u64) << 8) | ((b[2] as u64) << 16)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn load4(b: &[u8]) -> u64 {
|
||||
(b[0] as u64) | ((b[1] as u64) << 8) | ((b[2] as u64) << 16) | ((b[3] as u64) << 24)
|
||||
}
|
||||
|
||||
let mut h = [0u64;10];
|
||||
const LOW_23_BITS: u64 = (1 << 23) - 1;
|
||||
h[0] = load4(&data[ 0..]);
|
||||
h[1] = load3(&data[ 4..]) << 6;
|
||||
h[2] = load3(&data[ 7..]) << 5;
|
||||
h[3] = load3(&data[10..]) << 3;
|
||||
h[4] = load3(&data[13..]) << 2;
|
||||
h[5] = load4(&data[16..]);
|
||||
h[6] = load3(&data[20..]) << 7;
|
||||
h[7] = load3(&data[23..]) << 5;
|
||||
h[8] = load3(&data[26..]) << 4;
|
||||
h[9] = (load3(&data[29..]) & LOW_23_BITS) << 2;
|
||||
|
||||
FieldElement2625::reduce(h)
|
||||
}
|
||||
|
||||
/// Serialize this `FieldElement51` to a 32-byte array. The
|
||||
/// encoding is canonical.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
|
||||
let inp = &self.0;
|
||||
// Reduce the value represented by `in` to the range [0,2*p)
|
||||
let mut h: [u32; 10] = FieldElement2625::reduce([
|
||||
// XXX this cast is annoying
|
||||
inp[0] as u64, inp[1] as u64, inp[2] as u64, inp[3] as u64, inp[4] as u64,
|
||||
inp[5] as u64, inp[6] as u64, inp[7] as u64, inp[8] as u64, inp[9] as u64,
|
||||
]).0;
|
||||
|
||||
// Let h be the value to encode.
|
||||
//
|
||||
// Write h = pq + r with 0 <= r < p. We want to compute r = h mod p.
|
||||
//
|
||||
// Since h < 2*p, q = 0 or 1, with q = 0 when h < p and q = 1 when h >= p.
|
||||
//
|
||||
// Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255.
|
||||
// Therefore q can be computed as the carry bit of h + 19.
|
||||
|
||||
let mut q: u32 = (h[0] + 19) >> 26;
|
||||
q = (h[1] + q) >> 25;
|
||||
q = (h[2] + q) >> 26;
|
||||
q = (h[3] + q) >> 25;
|
||||
q = (h[4] + q) >> 26;
|
||||
q = (h[5] + q) >> 25;
|
||||
q = (h[6] + q) >> 26;
|
||||
q = (h[7] + q) >> 25;
|
||||
q = (h[8] + q) >> 26;
|
||||
q = (h[9] + q) >> 25;
|
||||
|
||||
debug_assert!( q == 0 || q == 1 );
|
||||
|
||||
// Now we can compute r as r = h - pq = r - (2^255-19)q = r + 19q - 2^255q
|
||||
|
||||
const LOW_25_BITS: u32 = (1 << 25) - 1;
|
||||
const LOW_26_BITS: u32 = (1 << 26) - 1;
|
||||
|
||||
h[0] += 19*q;
|
||||
|
||||
// Now carry the result to compute r + 19q...
|
||||
h[1] += h[0] >> 26;
|
||||
h[0] = h[0] & LOW_26_BITS;
|
||||
h[2] += h[1] >> 25;
|
||||
h[1] = h[1] & LOW_25_BITS;
|
||||
h[3] += h[2] >> 26;
|
||||
h[2] = h[2] & LOW_26_BITS;
|
||||
h[4] += h[3] >> 25;
|
||||
h[3] = h[3] & LOW_25_BITS;
|
||||
h[5] += h[4] >> 26;
|
||||
h[4] = h[4] & LOW_26_BITS;
|
||||
h[6] += h[5] >> 25;
|
||||
h[5] = h[5] & LOW_25_BITS;
|
||||
h[7] += h[6] >> 26;
|
||||
h[6] = h[6] & LOW_26_BITS;
|
||||
h[8] += h[7] >> 25;
|
||||
h[7] = h[7] & LOW_25_BITS;
|
||||
h[9] += h[8] >> 26;
|
||||
h[8] = h[8] & LOW_26_BITS;
|
||||
|
||||
// ... but instead of carrying the value
|
||||
// (h[9] >> 25) = q*2^255 into another limb,
|
||||
// discard it, subtracting the value from h.
|
||||
debug_assert!( (h[9] >> 25) == 0 || (h[9] >> 25) == 1);
|
||||
h[9] = h[9] & LOW_25_BITS;
|
||||
|
||||
let mut s = [0u8; 32];
|
||||
s[0] = (h[0] >> 0) as u8;
|
||||
s[1] = (h[0] >> 8) as u8;
|
||||
s[2] = (h[0] >> 16) as u8;
|
||||
s[3] = ((h[0] >> 24) | (h[1] << 2)) as u8;
|
||||
s[4] = (h[1] >> 6) as u8;
|
||||
s[5] = (h[1] >> 14) as u8;
|
||||
s[6] = ((h[1] >> 22) | (h[2] << 3)) as u8;
|
||||
s[7] = (h[2] >> 5) as u8;
|
||||
s[8] = (h[2] >> 13) as u8;
|
||||
s[9] = ((h[2] >> 21) | (h[3] << 5)) as u8;
|
||||
s[10] = (h[3] >> 3) as u8;
|
||||
s[11] = (h[3] >> 11) as u8;
|
||||
s[12] = ((h[3] >> 19) | (h[4] << 6)) as u8;
|
||||
s[13] = (h[4] >> 2) as u8;
|
||||
s[14] = (h[4] >> 10) as u8;
|
||||
s[15] = (h[4] >> 18) as u8;
|
||||
s[16] = (h[5] >> 0) as u8;
|
||||
s[17] = (h[5] >> 8) as u8;
|
||||
s[18] = (h[5] >> 16) as u8;
|
||||
s[19] = ((h[5] >> 24) | (h[6] << 1)) as u8;
|
||||
s[20] = (h[6] >> 7) as u8;
|
||||
s[21] = (h[6] >> 15) as u8;
|
||||
s[22] = ((h[6] >> 23) | (h[7] << 3)) as u8;
|
||||
s[23] = (h[7] >> 5) as u8;
|
||||
s[24] = (h[7] >> 13) as u8;
|
||||
s[25] = ((h[7] >> 21) | (h[8] << 4)) as u8;
|
||||
s[26] = (h[8] >> 4) as u8;
|
||||
s[27] = (h[8] >> 12) as u8;
|
||||
s[28] = ((h[8] >> 20) | (h[9] << 6)) as u8;
|
||||
s[29] = (h[9] >> 2) as u8;
|
||||
s[30] = (h[9] >> 10) as u8;
|
||||
s[31] = (h[9] >> 18) as u8;
|
||||
|
||||
// Check that high bit is cleared
|
||||
debug_assert!((s[31] & 0b1000_0000u8) == 0u8);
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
fn square_inner(&self) -> [u64; 10] {
|
||||
// Optimized version of multiplication for the case of squaring.
|
||||
// Pre- and post- conditions identical to multiplication function.
|
||||
let x = &self.0;
|
||||
let x0_2 = 2 * x[0];
|
||||
let x1_2 = 2 * x[1];
|
||||
let x2_2 = 2 * x[2];
|
||||
let x3_2 = 2 * x[3];
|
||||
let x4_2 = 2 * x[4];
|
||||
let x5_2 = 2 * x[5];
|
||||
let x6_2 = 2 * x[6];
|
||||
let x7_2 = 2 * x[7];
|
||||
let x5_19 = 19 * x[5];
|
||||
let x6_19 = 19 * x[6];
|
||||
let x7_19 = 19 * x[7];
|
||||
let x8_19 = 19 * x[8];
|
||||
let x9_19 = 19 * x[9];
|
||||
|
||||
/// Helper function to multiply two 32-bit integers with 64 bits
|
||||
/// of output.
|
||||
#[inline(always)]
|
||||
fn m(x: u32, y: u32) -> u64 { (x as u64) * (y as u64) }
|
||||
|
||||
// This block is rearranged so that instead of doing a 32-bit multiplication by 38, we do a
|
||||
// 64-bit multiplication by 2 on the results. This is because lg(38) is too big: we would
|
||||
// have less than 1 bit of headroom left, which is too little.
|
||||
let mut z = [0u64;10];
|
||||
z[0] = m(x[0],x[0]) + m(x2_2,x8_19) + m(x4_2,x6_19) + (m(x1_2,x9_19) + m(x3_2,x7_19) + m(x[5],x5_19))*2;
|
||||
z[1] = m(x0_2,x[1]) + m(x3_2,x8_19) + m(x5_2,x6_19) + (m(x[2],x9_19) + m(x[4],x7_19))*2;
|
||||
z[2] = m(x0_2,x[2]) + m(x1_2,x[1]) + m(x4_2,x8_19) + m(x[6],x6_19) + (m(x3_2,x9_19) + m(x5_2,x7_19))*2;
|
||||
z[3] = m(x0_2,x[3]) + m(x1_2,x[2]) + m(x5_2,x8_19) + (m(x[4],x9_19) + m(x[6],x7_19))*2;
|
||||
z[4] = m(x0_2,x[4]) + m(x1_2,x3_2) + m(x[2],x[2]) + m(x6_2,x8_19) + (m(x5_2,x9_19) + m(x[7],x7_19))*2;
|
||||
z[5] = m(x0_2,x[5]) + m(x1_2,x[4]) + m(x2_2,x[3]) + m(x7_2,x8_19) + m(x[6],x9_19)*2;
|
||||
z[6] = m(x0_2,x[6]) + m(x1_2,x5_2) + m(x2_2,x[4]) + m(x3_2,x[3]) + m(x[8],x8_19) + m(x7_2,x9_19)*2;
|
||||
z[7] = m(x0_2,x[7]) + m(x1_2,x[6]) + m(x2_2,x[5]) + m(x3_2,x[4]) + m(x[8],x9_19)*2;
|
||||
z[8] = m(x0_2,x[8]) + m(x1_2,x7_2) + m(x2_2,x[6]) + m(x3_2,x5_2) + m(x[4],x[4]) + m(x[9],x9_19)*2;
|
||||
z[9] = m(x0_2,x[9]) + m(x1_2,x[8]) + m(x2_2,x[7]) + m(x3_2,x[6]) + m(x4_2,x[5]) ;
|
||||
|
||||
z
|
||||
}
|
||||
|
||||
/// Compute `self^2`.
|
||||
pub fn square(&self) -> FieldElement2625 {
|
||||
FieldElement2625::reduce(self.square_inner())
|
||||
}
|
||||
|
||||
/// Compute `2*self^2`.
|
||||
pub fn square2(&self) -> FieldElement2625 {
|
||||
let mut coeffs = self.square_inner();
|
||||
for i in 0..self.0.len() {
|
||||
coeffs[i] += coeffs[i];
|
||||
}
|
||||
FieldElement2625::reduce(coeffs)
|
||||
}
|
||||
}
|
|
@ -1,21 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! The `u32` backend uses `u32`s and a `(u32, u32) -> u64` multiplier.
|
||||
//!
|
||||
//! This code is intended to be portable, but it requires that
|
||||
//! multiplication of two \\(32\\)-bit values to a \\(64\\)-bit result
|
||||
//! is constant-time on the target platform.
|
||||
|
||||
pub mod field;
|
||||
|
||||
pub mod scalar;
|
||||
|
||||
pub mod constants;
|
|
@ -1,529 +0,0 @@
|
|||
//! Arithmetic mod 2^252 + 27742317777372353535851937790883648493
|
||||
//! with 9 29-bit unsigned limbs
|
||||
//!
|
||||
//! To see that this is safe for intermediate results, note that
|
||||
//! the largest limb in a 9 by 9 product of 29-bit limbs will be
|
||||
//! (0x1fffffff^2) * 9 = 0x23fffffdc0000009 (62 bits).
|
||||
//!
|
||||
//! For a one level Karatsuba decomposition, the specific ranges
|
||||
//! depend on how the limbs are combined, but will stay within
|
||||
//! -0x1ffffffe00000008 (62 bits with sign bit) to
|
||||
//! 0x43fffffbc0000011 (63 bits), which is still safe.
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::{Index, IndexMut};
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use constants;
|
||||
|
||||
/// An element of ℤ/lℤ stored as nine unsigned limbs of 29 bits each.
///
/// The limbs live in a `[u32; 9]`, least-significant limb first
/// (limb `i` carries weight `2^(29*i)`).
#[derive(Copy, Clone)]
pub struct Scalar29(pub [u32; 9]);
|
||||
|
||||
impl Debug for Scalar29 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "Scalar29: {:?}", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
/// Zeroizing a `Scalar29` zeroizes its limb array.
impl Zeroize for Scalar29 {
    fn zeroize(&mut self) {
        // Delegate to the `Zeroize` implementation of the inner limb array.
        self.0.zeroize();
    }
}
|
||||
|
||||
impl Index<usize> for Scalar29 {
|
||||
type Output = u32;
|
||||
fn index(&self, _index: usize) -> &u32 {
|
||||
&(self.0[_index])
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexMut<usize> for Scalar29 {
|
||||
fn index_mut(&mut self, _index: usize) -> &mut u32 {
|
||||
&mut (self.0[_index])
|
||||
}
|
||||
}
|
||||
|
||||
/// Widening multiply: the full 64-bit product of two `u32` values.
///
/// Both operands are widened before multiplying, so the product cannot
/// overflow.
#[inline(always)]
fn m(x: u32, y: u32) -> u64 {
    u64::from(x) * u64::from(y)
}
|
||||
|
||||
impl Scalar29 {
|
||||
/// Return the zero scalar.
|
||||
pub fn zero() -> Scalar29 {
|
||||
Scalar29([0,0,0,0,0,0,0,0,0])
|
||||
}
|
||||
|
||||
/// Unpack a 32 byte / 256 bit scalar into 9 29-bit limbs.
|
||||
pub fn from_bytes(bytes: &[u8; 32]) -> Scalar29 {
|
||||
let mut words = [0u32; 8];
|
||||
for i in 0..8 {
|
||||
for j in 0..4 {
|
||||
words[i] |= (bytes[(i * 4) + j] as u32) << (j * 8);
|
||||
}
|
||||
}
|
||||
|
||||
let mask = (1u32 << 29) - 1;
|
||||
let top_mask = (1u32 << 24) - 1;
|
||||
let mut s = Scalar29::zero();
|
||||
|
||||
s[ 0] = words[0] & mask;
|
||||
s[ 1] = ((words[0] >> 29) | (words[1] << 3)) & mask;
|
||||
s[ 2] = ((words[1] >> 26) | (words[2] << 6)) & mask;
|
||||
s[ 3] = ((words[2] >> 23) | (words[3] << 9)) & mask;
|
||||
s[ 4] = ((words[3] >> 20) | (words[4] << 12)) & mask;
|
||||
s[ 5] = ((words[4] >> 17) | (words[5] << 15)) & mask;
|
||||
s[ 6] = ((words[5] >> 14) | (words[6] << 18)) & mask;
|
||||
s[ 7] = ((words[6] >> 11) | (words[7] << 21)) & mask;
|
||||
s[ 8] = (words[7] >> 8) & top_mask;
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
/// Reduce a 64 byte / 512 bit scalar mod l.
|
||||
pub fn from_bytes_wide(bytes: &[u8; 64]) -> Scalar29 {
|
||||
let mut words = [0u32; 16];
|
||||
for i in 0..16 {
|
||||
for j in 0..4 {
|
||||
words[i] |= (bytes[(i * 4) + j] as u32) << (j * 8);
|
||||
}
|
||||
}
|
||||
|
||||
let mask = (1u32 << 29) - 1;
|
||||
let mut lo = Scalar29::zero();
|
||||
let mut hi = Scalar29::zero();
|
||||
|
||||
lo[0] = words[ 0] & mask;
|
||||
lo[1] = ((words[ 0] >> 29) | (words[ 1] << 3)) & mask;
|
||||
lo[2] = ((words[ 1] >> 26) | (words[ 2] << 6)) & mask;
|
||||
lo[3] = ((words[ 2] >> 23) | (words[ 3] << 9)) & mask;
|
||||
lo[4] = ((words[ 3] >> 20) | (words[ 4] << 12)) & mask;
|
||||
lo[5] = ((words[ 4] >> 17) | (words[ 5] << 15)) & mask;
|
||||
lo[6] = ((words[ 5] >> 14) | (words[ 6] << 18)) & mask;
|
||||
lo[7] = ((words[ 6] >> 11) | (words[ 7] << 21)) & mask;
|
||||
lo[8] = ((words[ 7] >> 8) | (words[ 8] << 24)) & mask;
|
||||
hi[0] = ((words[ 8] >> 5) | (words[ 9] << 27)) & mask;
|
||||
hi[1] = (words[ 9] >> 2) & mask;
|
||||
hi[2] = ((words[ 9] >> 31) | (words[10] << 1)) & mask;
|
||||
hi[3] = ((words[10] >> 28) | (words[11] << 4)) & mask;
|
||||
hi[4] = ((words[11] >> 25) | (words[12] << 7)) & mask;
|
||||
hi[5] = ((words[12] >> 22) | (words[13] << 10)) & mask;
|
||||
hi[6] = ((words[13] >> 19) | (words[14] << 13)) & mask;
|
||||
hi[7] = ((words[14] >> 16) | (words[15] << 16)) & mask;
|
||||
hi[8] = words[15] >> 13 ;
|
||||
|
||||
lo = Scalar29::montgomery_mul(&lo, &constants::R); // (lo * R) / R = lo
|
||||
hi = Scalar29::montgomery_mul(&hi, &constants::RR); // (hi * R^2) / R = hi * R
|
||||
|
||||
Scalar29::add(&hi, &lo) // (hi * R) + lo
|
||||
}
|
||||
|
||||
/// Pack the limbs of this `Scalar29` into 32 bytes.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
let mut s = [0u8; 32];
|
||||
|
||||
s[0] = (self.0[ 0] >> 0) as u8;
|
||||
s[1] = (self.0[ 0] >> 8) as u8;
|
||||
s[2] = (self.0[ 0] >> 16) as u8;
|
||||
s[3] = ((self.0[ 0] >> 24) | (self.0[ 1] << 5)) as u8;
|
||||
s[4] = (self.0[ 1] >> 3) as u8;
|
||||
s[5] = (self.0[ 1] >> 11) as u8;
|
||||
s[6] = (self.0[ 1] >> 19) as u8;
|
||||
s[7] = ((self.0[ 1] >> 27) | (self.0[ 2] << 2)) as u8;
|
||||
s[8] = (self.0[ 2] >> 6) as u8;
|
||||
s[9] = (self.0[ 2] >> 14) as u8;
|
||||
s[10] = ((self.0[ 2] >> 22) | (self.0[ 3] << 7)) as u8;
|
||||
s[11] = (self.0[ 3] >> 1) as u8;
|
||||
s[12] = (self.0[ 3] >> 9) as u8;
|
||||
s[13] = (self.0[ 3] >> 17) as u8;
|
||||
s[14] = ((self.0[ 3] >> 25) | (self.0[ 4] << 4)) as u8;
|
||||
s[15] = (self.0[ 4] >> 4) as u8;
|
||||
s[16] = (self.0[ 4] >> 12) as u8;
|
||||
s[17] = (self.0[ 4] >> 20) as u8;
|
||||
s[18] = ((self.0[ 4] >> 28) | (self.0[ 5] << 1)) as u8;
|
||||
s[19] = (self.0[ 5] >> 7) as u8;
|
||||
s[20] = (self.0[ 5] >> 15) as u8;
|
||||
s[21] = ((self.0[ 5] >> 23) | (self.0[ 6] << 6)) as u8;
|
||||
s[22] = (self.0[ 6] >> 2) as u8;
|
||||
s[23] = (self.0[ 6] >> 10) as u8;
|
||||
s[24] = (self.0[ 6] >> 18) as u8;
|
||||
s[25] = ((self.0[ 6] >> 26) | (self.0[ 7] << 3)) as u8;
|
||||
s[26] = (self.0[ 7] >> 5) as u8;
|
||||
s[27] = (self.0[ 7] >> 13) as u8;
|
||||
s[28] = (self.0[ 7] >> 21) as u8;
|
||||
s[29] = (self.0[ 8] >> 0) as u8;
|
||||
s[30] = (self.0[ 8] >> 8) as u8;
|
||||
s[31] = (self.0[ 8] >> 16) as u8;
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
/// Compute `a + b` (mod l).
|
||||
pub fn add(a: &Scalar29, b: &Scalar29) -> Scalar29 {
|
||||
let mut sum = Scalar29::zero();
|
||||
let mask = (1u32 << 29) - 1;
|
||||
|
||||
// a + b
|
||||
let mut carry: u32 = 0;
|
||||
for i in 0..9 {
|
||||
carry = a[i] + b[i] + (carry >> 29);
|
||||
sum[i] = carry & mask;
|
||||
}
|
||||
|
||||
// subtract l if the sum is >= l
|
||||
Scalar29::sub(&sum, &constants::L)
|
||||
}
|
||||
|
||||
/// Compute `a - b` (mod l).
|
||||
pub fn sub(a: &Scalar29, b: &Scalar29) -> Scalar29 {
|
||||
let mut difference = Scalar29::zero();
|
||||
let mask = (1u32 << 29) - 1;
|
||||
|
||||
// a - b
|
||||
let mut borrow: u32 = 0;
|
||||
for i in 0..9 {
|
||||
borrow = a[i].wrapping_sub(b[i] + (borrow >> 31));
|
||||
difference[i] = borrow & mask;
|
||||
}
|
||||
|
||||
// conditionally add l if the difference is negative
|
||||
let underflow_mask = ((borrow >> 31) ^ 1).wrapping_sub(1);
|
||||
let mut carry: u32 = 0;
|
||||
for i in 0..9 {
|
||||
carry = (carry >> 29) + difference[i] + (constants::L[i] & underflow_mask);
|
||||
difference[i] = carry & mask;
|
||||
}
|
||||
|
||||
difference
|
||||
}
|
||||
|
||||
/// Compute `a * b`.
|
||||
///
|
||||
/// This is implemented with a one-level refined Karatsuba decomposition
|
||||
#[inline(always)]
|
||||
pub (crate) fn mul_internal(a: &Scalar29, b: &Scalar29) -> [u64; 17] {
|
||||
let mut z = [0u64; 17];
|
||||
|
||||
z[0] = m(a[0],b[0]); // c00
|
||||
z[1] = m(a[0],b[1]) + m(a[1],b[0]); // c01
|
||||
z[2] = m(a[0],b[2]) + m(a[1],b[1]) + m(a[2],b[0]); // c02
|
||||
z[3] = m(a[0],b[3]) + m(a[1],b[2]) + m(a[2],b[1]) + m(a[3],b[0]); // c03
|
||||
z[4] = m(a[0],b[4]) + m(a[1],b[3]) + m(a[2],b[2]) + m(a[3],b[1]) + m(a[4],b[0]); // c04
|
||||
z[5] = m(a[1],b[4]) + m(a[2],b[3]) + m(a[3],b[2]) + m(a[4],b[1]); // c05
|
||||
z[6] = m(a[2],b[4]) + m(a[3],b[3]) + m(a[4],b[2]); // c06
|
||||
z[7] = m(a[3],b[4]) + m(a[4],b[3]); // c07
|
||||
z[8] = (m(a[4],b[4])).wrapping_sub(z[3]); // c08 - c03
|
||||
|
||||
z[10] = z[5].wrapping_sub(m(a[5],b[5])); // c05mc10
|
||||
z[11] = z[6].wrapping_sub(m(a[5],b[6]) + m(a[6],b[5])); // c06mc11
|
||||
z[12] = z[7].wrapping_sub(m(a[5],b[7]) + m(a[6],b[6]) + m(a[7],b[5])); // c07mc12
|
||||
z[13] = m(a[5],b[8]) + m(a[6],b[7]) + m(a[7],b[6]) + m(a[8],b[5]); // c13
|
||||
z[14] = m(a[6],b[8]) + m(a[7],b[7]) + m(a[8],b[6]); // c14
|
||||
z[15] = m(a[7],b[8]) + m(a[8],b[7]); // c15
|
||||
z[16] = m(a[8],b[8]); // c16
|
||||
|
||||
z[ 5] = z[10].wrapping_sub(z[ 0]); // c05mc10 - c00
|
||||
z[ 6] = z[11].wrapping_sub(z[ 1]); // c06mc11 - c01
|
||||
z[ 7] = z[12].wrapping_sub(z[ 2]); // c07mc12 - c02
|
||||
z[ 8] = z[ 8].wrapping_sub(z[13]); // c08mc13 - c03
|
||||
z[ 9] = z[14].wrapping_add(z[ 4]); // c14 + c04
|
||||
z[10] = z[15].wrapping_add(z[10]); // c15 + c05mc10
|
||||
z[11] = z[16].wrapping_add(z[11]); // c16 + c06mc11
|
||||
|
||||
let aa = [
|
||||
a[0]+a[5],
|
||||
a[1]+a[6],
|
||||
a[2]+a[7],
|
||||
a[3]+a[8]
|
||||
];
|
||||
|
||||
let bb = [
|
||||
b[0]+b[5],
|
||||
b[1]+b[6],
|
||||
b[2]+b[7],
|
||||
b[3]+b[8]
|
||||
];
|
||||
|
||||
z[ 5] = (m(aa[0],bb[0])) .wrapping_add(z[ 5]); // c20 + c05mc10 - c00
|
||||
z[ 6] = (m(aa[0],bb[1]) + m(aa[1],bb[0])) .wrapping_add(z[ 6]); // c21 + c06mc11 - c01
|
||||
z[ 7] = (m(aa[0],bb[2]) + m(aa[1],bb[1]) + m(aa[2],bb[0])) .wrapping_add(z[ 7]); // c22 + c07mc12 - c02
|
||||
z[ 8] = (m(aa[0],bb[3]) + m(aa[1],bb[2]) + m(aa[2],bb[1]) + m(aa[3],bb[0])) .wrapping_add(z[ 8]); // c23 + c08mc13 - c03
|
||||
z[ 9] = (m(aa[0], b[4]) + m(aa[1],bb[3]) + m(aa[2],bb[2]) + m(aa[3],bb[1]) + m(a[4],bb[0])).wrapping_sub(z[ 9]); // c24 - c14 - c04
|
||||
z[10] = ( m(aa[1], b[4]) + m(aa[2],bb[3]) + m(aa[3],bb[2]) + m(a[4],bb[1])).wrapping_sub(z[10]); // c25 - c15 - c05mc10
|
||||
z[11] = ( m(aa[2], b[4]) + m(aa[3],bb[3]) + m(a[4],bb[2])).wrapping_sub(z[11]); // c26 - c16 - c06mc11
|
||||
z[12] = ( m(aa[3], b[4]) + m(a[4],bb[3])).wrapping_sub(z[12]); // c27 - c07mc12
|
||||
|
||||
z
|
||||
}
|
||||
|
||||
/// Compute `a^2`.
|
||||
#[inline(always)]
|
||||
fn square_internal(a: &Scalar29) -> [u64; 17] {
|
||||
let aa = [
|
||||
a[0]*2,
|
||||
a[1]*2,
|
||||
a[2]*2,
|
||||
a[3]*2,
|
||||
a[4]*2,
|
||||
a[5]*2,
|
||||
a[6]*2,
|
||||
a[7]*2
|
||||
];
|
||||
|
||||
[
|
||||
m( a[0],a[0]),
|
||||
m(aa[0],a[1]),
|
||||
m(aa[0],a[2]) + m( a[1],a[1]),
|
||||
m(aa[0],a[3]) + m(aa[1],a[2]),
|
||||
m(aa[0],a[4]) + m(aa[1],a[3]) + m( a[2],a[2]),
|
||||
m(aa[0],a[5]) + m(aa[1],a[4]) + m(aa[2],a[3]),
|
||||
m(aa[0],a[6]) + m(aa[1],a[5]) + m(aa[2],a[4]) + m( a[3],a[3]),
|
||||
m(aa[0],a[7]) + m(aa[1],a[6]) + m(aa[2],a[5]) + m(aa[3],a[4]),
|
||||
m(aa[0],a[8]) + m(aa[1],a[7]) + m(aa[2],a[6]) + m(aa[3],a[5]) + m( a[4],a[4]),
|
||||
m(aa[1],a[8]) + m(aa[2],a[7]) + m(aa[3],a[6]) + m(aa[4],a[5]),
|
||||
m(aa[2],a[8]) + m(aa[3],a[7]) + m(aa[4],a[6]) + m( a[5],a[5]),
|
||||
m(aa[3],a[8]) + m(aa[4],a[7]) + m(aa[5],a[6]),
|
||||
m(aa[4],a[8]) + m(aa[5],a[7]) + m( a[6],a[6]),
|
||||
m(aa[5],a[8]) + m(aa[6],a[7]),
|
||||
m(aa[6],a[8]) + m( a[7],a[7]),
|
||||
m(aa[7],a[8]),
|
||||
m( a[8],a[8]),
|
||||
]
|
||||
}
|
||||
|
||||
/// Compute `limbs/R` (mod l), where R is the Montgomery modulus 2^261
|
||||
#[inline(always)]
|
||||
pub (crate) fn montgomery_reduce(limbs: &[u64; 17]) -> Scalar29 {
|
||||
|
||||
#[inline(always)]
|
||||
fn part1(sum: u64) -> (u64, u32) {
|
||||
let p = (sum as u32).wrapping_mul(constants::LFACTOR) & ((1u32 << 29) - 1);
|
||||
((sum + m(p,constants::L[0])) >> 29, p)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn part2(sum: u64) -> (u64, u32) {
|
||||
let w = (sum as u32) & ((1u32 << 29) - 1);
|
||||
(sum >> 29, w)
|
||||
}
|
||||
|
||||
// note: l5,l6,l7 are zero, so their multiplies can be skipped
|
||||
let l = &constants::L;
|
||||
|
||||
// the first half computes the Montgomery adjustment factor n, and begins adding n*l to make limbs divisible by R
|
||||
let (carry, n0) = part1( limbs[ 0]);
|
||||
let (carry, n1) = part1(carry + limbs[ 1] + m(n0,l[1]));
|
||||
let (carry, n2) = part1(carry + limbs[ 2] + m(n0,l[2]) + m(n1,l[1]));
|
||||
let (carry, n3) = part1(carry + limbs[ 3] + m(n0,l[3]) + m(n1,l[2]) + m(n2,l[1]));
|
||||
let (carry, n4) = part1(carry + limbs[ 4] + m(n0,l[4]) + m(n1,l[3]) + m(n2,l[2]) + m(n3,l[1]));
|
||||
let (carry, n5) = part1(carry + limbs[ 5] + m(n1,l[4]) + m(n2,l[3]) + m(n3,l[2]) + m(n4,l[1]));
|
||||
let (carry, n6) = part1(carry + limbs[ 6] + m(n2,l[4]) + m(n3,l[3]) + m(n4,l[2]) + m(n5,l[1]));
|
||||
let (carry, n7) = part1(carry + limbs[ 7] + m(n3,l[4]) + m(n4,l[3]) + m(n5,l[2]) + m(n6,l[1]));
|
||||
let (carry, n8) = part1(carry + limbs[ 8] + m(n0,l[8]) + m(n4,l[4]) + m(n5,l[3]) + m(n6,l[2]) + m(n7,l[1]));
|
||||
|
||||
// limbs is divisible by R now, so we can divide by R by simply storing the upper half as the result
|
||||
let (carry, r0) = part2(carry + limbs[ 9] + m(n1,l[8]) + m(n5,l[4]) + m(n6,l[3]) + m(n7,l[2]) + m(n8,l[1]));
|
||||
let (carry, r1) = part2(carry + limbs[10] + m(n2,l[8]) + m(n6,l[4]) + m(n7,l[3]) + m(n8,l[2]));
|
||||
let (carry, r2) = part2(carry + limbs[11] + m(n3,l[8]) + m(n7,l[4]) + m(n8,l[3]));
|
||||
let (carry, r3) = part2(carry + limbs[12] + m(n4,l[8]) + m(n8,l[4]));
|
||||
let (carry, r4) = part2(carry + limbs[13] + m(n5,l[8]) );
|
||||
let (carry, r5) = part2(carry + limbs[14] + m(n6,l[8]) );
|
||||
let (carry, r6) = part2(carry + limbs[15] + m(n7,l[8]) );
|
||||
let (carry, r7) = part2(carry + limbs[16] + m(n8,l[8]));
|
||||
let r8 = carry as u32;
|
||||
|
||||
// result may be >= l, so attempt to subtract l
|
||||
Scalar29::sub(&Scalar29([r0,r1,r2,r3,r4,r5,r6,r7,r8]), l)
|
||||
}
|
||||
|
||||
/// Compute `a * b` (mod l).
|
||||
#[inline(never)]
|
||||
pub fn mul(a: &Scalar29, b: &Scalar29) -> Scalar29 {
|
||||
let ab = Scalar29::montgomery_reduce(&Scalar29::mul_internal(a, b));
|
||||
Scalar29::montgomery_reduce(&Scalar29::mul_internal(&ab, &constants::RR))
|
||||
}
|
||||
|
||||
/// Compute `a^2` (mod l).
|
||||
#[inline(never)]
|
||||
#[allow(dead_code)] // XXX we don't expose square() via the Scalar API
|
||||
pub fn square(&self) -> Scalar29 {
|
||||
let aa = Scalar29::montgomery_reduce(&Scalar29::square_internal(self));
|
||||
Scalar29::montgomery_reduce(&Scalar29::mul_internal(&aa, &constants::RR))
|
||||
}
|
||||
|
||||
/// Compute `(a * b) / R` (mod l), where R is the Montgomery modulus 2^261
|
||||
#[inline(never)]
|
||||
pub fn montgomery_mul(a: &Scalar29, b: &Scalar29) -> Scalar29 {
|
||||
Scalar29::montgomery_reduce(&Scalar29::mul_internal(a, b))
|
||||
}
|
||||
|
||||
/// Compute `(a^2) / R` (mod l) in Montgomery form, where R is the Montgomery modulus 2^261
|
||||
#[inline(never)]
|
||||
pub fn montgomery_square(&self) -> Scalar29 {
|
||||
Scalar29::montgomery_reduce(&Scalar29::square_internal(self))
|
||||
}
|
||||
|
||||
/// Puts a Scalar29 in to Montgomery form, i.e. computes `a*R (mod l)`
|
||||
#[inline(never)]
|
||||
pub fn to_montgomery(&self) -> Scalar29 {
|
||||
Scalar29::montgomery_mul(self, &constants::RR)
|
||||
}
|
||||
|
||||
/// Takes a Scalar29 out of Montgomery form, i.e. computes `a/R (mod l)`
|
||||
pub fn from_montgomery(&self) -> Scalar29 {
|
||||
let mut limbs = [0u64; 17];
|
||||
for i in 0..9 {
|
||||
limbs[i] = self[i] as u64;
|
||||
}
|
||||
Scalar29::montgomery_reduce(&limbs)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    /// Note: x is 2^253-1 which is slightly larger than the largest scalar produced by
    /// this implementation (l-1), and should verify there are no overflows for valid scalars
    ///
    /// x = 2^253-1 = 14474011154664524427946373126085988481658748083205070504932198000989141204991
    /// x = 7237005577332262213973186563042994240801631723825162898930247062703686954002 mod l
    /// x = 5147078182513738803124273553712992179887200054963030844803268920753008712037*R mod l in Montgomery form
    pub static X: Scalar29 = Scalar29([
        0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff,
        0x1fffffff, 0x001fffff,
    ]);

    /// x^2 = 3078544782642840487852506753550082162405942681916160040940637093560259278169 mod l
    pub static XX: Scalar29 = Scalar29([
        0x00217559, 0x000b3401, 0x103ff43b, 0x1462a62c, 0x1d6f9f38, 0x18e7a42f, 0x09a3dcee,
        0x008dbe18, 0x0006ce65,
    ]);

    /// x^2 = 2912514428060642753613814151688322857484807845836623976981729207238463947987*R mod l in Montgomery form
    pub static XX_MONT: Scalar29 = Scalar29([
        0x152b4d2e, 0x0571d53b, 0x1da6d964, 0x188663b6, 0x1d1b5f92, 0x19d50e3f, 0x12306c29,
        0x0c6f26fe, 0x00030edb,
    ]);

    /// y = 6145104759870991071742105800796537629880401874866217824609283457819451087098
    pub static Y: Scalar29 = Scalar29([
        0x1e1458fa, 0x165ba838, 0x1d787b36, 0x0e577f3a, 0x1d2baf06, 0x1d689a19, 0x1fff3047,
        0x117704ab, 0x000d9601,
    ]);

    /// x*y = 36752150652102274958925982391442301741
    pub static XY: Scalar29 = Scalar29([
        0x0ba7632d, 0x017736bb, 0x15c76138, 0x0c69daa1, 0x000001ba, 0x00000000, 0x00000000,
        0x00000000, 0x00000000,
    ]);

    /// x*y = 3783114862749659543382438697751927473898937741870308063443170013240655651591*R mod l in Montgomery form
    pub static XY_MONT: Scalar29 = Scalar29([
        0x077b51e1, 0x1c64e119, 0x02a19ef5, 0x18d2129e, 0x00de0430, 0x045a7bc8, 0x04cfc7c9,
        0x1c002681, 0x000bdc1c,
    ]);

    /// a = 2351415481556538453565687241199399922945659411799870114962672658845158063753
    pub static A: Scalar29 = Scalar29([
        0x07b3be89, 0x02291b60, 0x14a99f03, 0x07dc3787, 0x0a782aae, 0x16262525, 0x0cfdb93f,
        0x13f5718d, 0x000532da,
    ]);

    /// b = 4885590095775723760407499321843594317911456947580037491039278279440296187236
    pub static B: Scalar29 = Scalar29([
        0x15421564, 0x1e69fd72, 0x093d9692, 0x161785be, 0x1587d69f, 0x09d9dada, 0x130246c0,
        0x0c0a8e72, 0x000acd25,
    ]);

    /// a+b = 0
    /// a-b = 4702830963113076907131374482398799845891318823599740229925345317690316127506
    pub static AB: Scalar29 = Scalar29([
        0x0f677d12, 0x045236c0, 0x09533e06, 0x0fb86f0f, 0x14f0555c, 0x0c4c4a4a, 0x19fb727f,
        0x07eae31a, 0x000a65b5,
    ]);

    // c = (2^512 - 1) % l = 1627715501170711445284395025044413883736156588369414752970002579683115011840
    pub static C: Scalar29 = Scalar29([
        0x049c0f00, 0x00308f1a, 0x0164d1e9, 0x1c374ed1, 0x1be65d00, 0x19e90bfa, 0x08f73bb1,
        0x036f8613, 0x00039941,
    ]);

    // Each test compares the nine limbs of the computed value against the
    // matching reference constant, one limb at a time.

    #[test]
    fn mul_max() {
        let res = Scalar29::mul(&X, &X);
        (0..9).for_each(|i| assert!(res[i] == XX[i]));
    }

    #[test]
    fn square_max() {
        let res = X.square();
        (0..9).for_each(|i| assert!(res[i] == XX[i]));
    }

    #[test]
    fn montgomery_mul_max() {
        let res = Scalar29::montgomery_mul(&X, &X);
        (0..9).for_each(|i| assert!(res[i] == XX_MONT[i]));
    }

    #[test]
    fn montgomery_square_max() {
        let res = X.montgomery_square();
        (0..9).for_each(|i| assert!(res[i] == XX_MONT[i]));
    }

    #[test]
    fn mul() {
        let res = Scalar29::mul(&X, &Y);
        (0..9).for_each(|i| assert!(res[i] == XY[i]));
    }

    #[test]
    fn montgomery_mul() {
        let res = Scalar29::montgomery_mul(&X, &Y);
        (0..9).for_each(|i| assert!(res[i] == XY_MONT[i]));
    }

    #[test]
    fn add() {
        let res = Scalar29::add(&A, &B);
        let zero = Scalar29::zero();
        (0..9).for_each(|i| assert!(res[i] == zero[i]));
    }

    #[test]
    fn sub() {
        let res = Scalar29::sub(&A, &B);
        (0..9).for_each(|i| assert!(res[i] == AB[i]));
    }

    #[test]
    fn from_bytes_wide() {
        // 64 bytes of 0xff encode 2^512 - 1.
        let bignum = [0xffu8; 64];
        let reduced = Scalar29::from_bytes_wide(&bignum);
        (0..9).for_each(|i| assert!(reduced[i] == C[i]));
    }
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,563 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(64\\)-bit
|
||||
//! limbs with \\(128\\)-bit products.
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::Neg;
|
||||
use core::ops::{Add, AddAssign};
|
||||
use core::ops::{Mul, MulAssign};
|
||||
use core::ops::{Sub, SubAssign};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
/// An element of the field \\( \mathbb Z / (2\^{255} - 19)\\), as stored
/// by the 64-bit backend.
///
/// The value is held in radix \\(2\^{51}\\) as five `u64` limbs. Between
/// reductions modulo \\(p\\) the limbs are allowed to grow up to
/// \\(2\^{54}\\).
///
/// # Note
///
/// `curve25519_dalek::field` exposes a type alias
/// `curve25519_dalek::field::FieldElement` that resolves to either
/// `FieldElement51` or `FieldElement2625`.
///
/// This backend-specific type should not be used outside of the
/// `curve25519_dalek::field` module.
#[derive(Copy, Clone)]
pub struct FieldElement51(pub(crate) [u64; 5]);
|
||||
|
||||
impl Debug for FieldElement51 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "FieldElement51({:?})", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
impl Zeroize for FieldElement51 {
|
||||
fn zeroize(&mut self) {
|
||||
self.0.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> AddAssign<&'b FieldElement51> for FieldElement51 {
    /// Adds `_rhs` into `self` limb by limb; no reduction is performed.
    fn add_assign(&mut self, _rhs: &'b FieldElement51) {
        for (limb, addend) in self.0.iter_mut().zip(_rhs.0.iter()) {
            *limb += *addend;
        }
    }
}
|
||||
|
||||
impl<'a, 'b> Add<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn add(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
output += _rhs;
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> SubAssign<&'b FieldElement51> for FieldElement51 {
    /// Replaces the limbs of `self` with those of `self - _rhs`, computed
    /// through the by-reference `Sub` impl.
    fn sub_assign(&mut self, _rhs: &'b FieldElement51) {
        let difference = &*self - _rhs;
        self.0 = difference.0;
    }
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn sub(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
// To avoid underflow, first add a multiple of p.
|
||||
// Choose 16*p = p << 4 to be larger than 54-bit _rhs.
|
||||
//
|
||||
// If we could statically track the bitlengths of the limbs
|
||||
// of every FieldElement51, we could choose a multiple of p
|
||||
// just bigger than _rhs and avoid having to do a reduction.
|
||||
//
|
||||
// Since we don't yet have type-level integers to do this, we
|
||||
// have to add an explicit reduction call here.
|
||||
FieldElement51::reduce([
|
||||
(self.0[0] + 36028797018963664u64) - _rhs.0[0],
|
||||
(self.0[1] + 36028797018963952u64) - _rhs.0[1],
|
||||
(self.0[2] + 36028797018963952u64) - _rhs.0[2],
|
||||
(self.0[3] + 36028797018963952u64) - _rhs.0[3],
|
||||
(self.0[4] + 36028797018963952u64) - _rhs.0[4],
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> MulAssign<&'b FieldElement51> for FieldElement51 {
    /// Replaces the limbs of `self` with those of `self * _rhs`, computed
    /// through the by-reference `Mul` impl.
    fn mul_assign(&mut self, _rhs: &'b FieldElement51) {
        let product = &*self * _rhs;
        self.0 = product.0;
    }
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn mul(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
/// Helper function to multiply two 64-bit integers with 128
|
||||
/// bits of output.
|
||||
#[inline(always)]
|
||||
fn m(x: u64, y: u64) -> u128 { (x as u128) * (y as u128) }
|
||||
|
||||
// Alias self, _rhs for more readable formulas
|
||||
let a: &[u64; 5] = &self.0;
|
||||
let b: &[u64; 5] = &_rhs.0;
|
||||
|
||||
// Precondition: assume input limbs a[i], b[i] are bounded as
|
||||
//
|
||||
// a[i], b[i] < 2^(51 + b)
|
||||
//
|
||||
// where b is a real parameter measuring the "bit excess" of the limbs.
|
||||
|
||||
// 64-bit precomputations to avoid 128-bit multiplications.
|
||||
//
|
||||
// This fits into a u64 whenever 51 + b + lg(19) < 64.
|
||||
//
|
||||
// Since 51 + b + lg(19) < 51 + 4.25 + b
|
||||
// = 55.25 + b,
|
||||
// this fits if b < 8.75.
|
||||
let b1_19 = b[1] * 19;
|
||||
let b2_19 = b[2] * 19;
|
||||
let b3_19 = b[3] * 19;
|
||||
let b4_19 = b[4] * 19;
|
||||
|
||||
// Multiply to get 128-bit coefficients of output
|
||||
let c0: u128 = m(a[0],b[0]) + m(a[4],b1_19) + m(a[3],b2_19) + m(a[2],b3_19) + m(a[1],b4_19);
|
||||
let mut c1: u128 = m(a[1],b[0]) + m(a[0],b[1]) + m(a[4],b2_19) + m(a[3],b3_19) + m(a[2],b4_19);
|
||||
let mut c2: u128 = m(a[2],b[0]) + m(a[1],b[1]) + m(a[0],b[2]) + m(a[4],b3_19) + m(a[3],b4_19);
|
||||
let mut c3: u128 = m(a[3],b[0]) + m(a[2],b[1]) + m(a[1],b[2]) + m(a[0],b[3]) + m(a[4],b4_19);
|
||||
let mut c4: u128 = m(a[4],b[0]) + m(a[3],b[1]) + m(a[2],b[2]) + m(a[1],b[3]) + m(a[0],b[4]);
|
||||
|
||||
// How big are the c[i]? We have
|
||||
//
|
||||
// c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19)
|
||||
// < 2^(102 + lg(1 + 4*19) + 2*b)
|
||||
// < 2^(108.27 + 2*b)
|
||||
//
|
||||
// The carry (c[i] >> 51) fits into a u64 when
|
||||
// 108.27 + 2*b - 51 < 64
|
||||
// 2*b < 6.73
|
||||
// b < 3.365.
|
||||
//
|
||||
// So we require b < 3 to ensure this fits.
|
||||
debug_assert!(a[0] < (1 << 54)); debug_assert!(b[0] < (1 << 54));
|
||||
debug_assert!(a[1] < (1 << 54)); debug_assert!(b[1] < (1 << 54));
|
||||
debug_assert!(a[2] < (1 << 54)); debug_assert!(b[2] < (1 << 54));
|
||||
debug_assert!(a[3] < (1 << 54)); debug_assert!(b[3] < (1 << 54));
|
||||
debug_assert!(a[4] < (1 << 54)); debug_assert!(b[4] < (1 << 54));
|
||||
|
||||
// Casting to u64 and back tells the compiler that the carry is
|
||||
// bounded by 2^64, so that the addition is a u128 + u64 rather
|
||||
// than u128 + u128.
|
||||
|
||||
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
|
||||
let mut out = [0u64; 5];
|
||||
|
||||
c1 += ((c0 >> 51) as u64) as u128;
|
||||
out[0] = (c0 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c2 += ((c1 >> 51) as u64) as u128;
|
||||
out[1] = (c1 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c3 += ((c2 >> 51) as u64) as u128;
|
||||
out[2] = (c2 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c4 += ((c3 >> 51) as u64) as u128;
|
||||
out[3] = (c3 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
let carry: u64 = (c4 >> 51) as u64;
|
||||
out[4] = (c4 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
// To see that this does not overflow, we need out[0] + carry * 19 < 2^64.
|
||||
//
|
||||
// c4 < a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 + (carry from c3)
|
||||
// < 5*(2^(51 + b) * 2^(51 + b)) + (carry from c3)
|
||||
// < 2^(102 + 2*b + lg(5)) + 2^64.
|
||||
//
|
||||
// When b < 3 we get
|
||||
//
|
||||
// c4 < 2^110.33 so that carry < 2^59.33
|
||||
//
|
||||
// so that
|
||||
//
|
||||
// out[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58
|
||||
//
|
||||
// and there is no overflow.
|
||||
out[0] = out[0] + carry * 19;
|
||||
|
||||
// Now out[1] < 2^51 + 2^(64 -51) = 2^51 + 2^13 < 2^(51 + epsilon).
|
||||
out[1] += out[0] >> 51;
|
||||
out[0] &= LOW_51_BIT_MASK;
|
||||
|
||||
// Now out[i] < 2^(51 + epsilon) for all i.
|
||||
FieldElement51(out)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a FieldElement51 {
    type Output = FieldElement51;

    /// Returns `-self` by negating a copy in place.
    fn neg(self) -> FieldElement51 {
        let mut negated = *self;
        negated.negate();
        negated
    }
}
|
||||
|
||||
impl ConditionallySelectable for FieldElement51 {
|
||||
fn conditional_select(
|
||||
a: &FieldElement51,
|
||||
b: &FieldElement51,
|
||||
choice: Choice,
|
||||
) -> FieldElement51 {
|
||||
FieldElement51([
|
||||
u64::conditional_select(&a.0[0], &b.0[0], choice),
|
||||
u64::conditional_select(&a.0[1], &b.0[1], choice),
|
||||
u64::conditional_select(&a.0[2], &b.0[2], choice),
|
||||
u64::conditional_select(&a.0[3], &b.0[3], choice),
|
||||
u64::conditional_select(&a.0[4], &b.0[4], choice),
|
||||
])
|
||||
}
|
||||
|
||||
fn conditional_swap(a: &mut FieldElement51, b: &mut FieldElement51, choice: Choice) {
|
||||
u64::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
|
||||
u64::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
|
||||
u64::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
|
||||
u64::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
|
||||
u64::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &FieldElement51, choice: Choice) {
|
||||
self.0[0].conditional_assign(&other.0[0], choice);
|
||||
self.0[1].conditional_assign(&other.0[1], choice);
|
||||
self.0[2].conditional_assign(&other.0[2], choice);
|
||||
self.0[3].conditional_assign(&other.0[3], choice);
|
||||
self.0[4].conditional_assign(&other.0[4], choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement51 {
|
||||
/// Invert the sign of this field element
|
||||
pub fn negate(&mut self) {
|
||||
// See commentary in the Sub impl
|
||||
let neg = FieldElement51::reduce([
|
||||
36028797018963664u64 - self.0[0],
|
||||
36028797018963952u64 - self.0[1],
|
||||
36028797018963952u64 - self.0[2],
|
||||
36028797018963952u64 - self.0[3],
|
||||
36028797018963952u64 - self.0[4],
|
||||
]);
|
||||
self.0 = neg.0;
|
||||
}
|
||||
|
||||
/// Construct zero.
|
||||
pub fn zero() -> FieldElement51 {
|
||||
FieldElement51([ 0, 0, 0, 0, 0 ])
|
||||
}
|
||||
|
||||
/// Construct one.
|
||||
pub fn one() -> FieldElement51 {
|
||||
FieldElement51([ 1, 0, 0, 0, 0 ])
|
||||
}
|
||||
|
||||
/// Construct -1.
|
||||
pub fn minus_one() -> FieldElement51 {
|
||||
FieldElement51([2251799813685228, 2251799813685247, 2251799813685247, 2251799813685247, 2251799813685247])
|
||||
}
|
||||
|
||||
/// Given 64-bit input limbs, reduce to enforce the bound 2^(51 + epsilon).
|
||||
#[inline(always)]
|
||||
fn reduce(mut limbs: [u64; 5]) -> FieldElement51 {
|
||||
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
|
||||
|
||||
// Since the input limbs are bounded by 2^64, the biggest
|
||||
// carry-out is bounded by 2^13.
|
||||
//
|
||||
// The biggest carry-in is c4 * 19, resulting in
|
||||
//
|
||||
// 2^51 + 19*2^13 < 2^51.0000000001
|
||||
//
|
||||
// Because we don't need to canonicalize, only to reduce the
|
||||
// limb sizes, it's OK to do a "weak reduction", where we
|
||||
// compute the carry-outs in parallel.
|
||||
|
||||
let c0 = limbs[0] >> 51;
|
||||
let c1 = limbs[1] >> 51;
|
||||
let c2 = limbs[2] >> 51;
|
||||
let c3 = limbs[3] >> 51;
|
||||
let c4 = limbs[4] >> 51;
|
||||
|
||||
limbs[0] &= LOW_51_BIT_MASK;
|
||||
limbs[1] &= LOW_51_BIT_MASK;
|
||||
limbs[2] &= LOW_51_BIT_MASK;
|
||||
limbs[3] &= LOW_51_BIT_MASK;
|
||||
limbs[4] &= LOW_51_BIT_MASK;
|
||||
|
||||
limbs[0] += c4 * 19;
|
||||
limbs[1] += c0;
|
||||
limbs[2] += c1;
|
||||
limbs[3] += c2;
|
||||
limbs[4] += c3;
|
||||
|
||||
FieldElement51(limbs)
|
||||
}
|
||||
|
||||
/// Load a `FieldElement51` from the low 255 bits of a 256-bit
|
||||
/// input.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function does not check that the input used the canonical
|
||||
/// representative. It masks the high bit, but it will happily
|
||||
/// decode 2^255 - 18 to 1. Applications that require a canonical
|
||||
/// encoding of every field element should decode, re-encode to
|
||||
/// the canonical encoding, and check that the input was
|
||||
/// canonical.
|
||||
///
|
||||
pub fn from_bytes(bytes: &[u8; 32]) -> FieldElement51 {
|
||||
let load8 = |input: &[u8]| -> u64 {
|
||||
(input[0] as u64)
|
||||
| ((input[1] as u64) << 8)
|
||||
| ((input[2] as u64) << 16)
|
||||
| ((input[3] as u64) << 24)
|
||||
| ((input[4] as u64) << 32)
|
||||
| ((input[5] as u64) << 40)
|
||||
| ((input[6] as u64) << 48)
|
||||
| ((input[7] as u64) << 56)
|
||||
};
|
||||
|
||||
let low_51_bit_mask = (1u64 << 51) - 1;
|
||||
FieldElement51(
|
||||
// load bits [ 0, 64), no shift
|
||||
[ load8(&bytes[ 0..]) & low_51_bit_mask
|
||||
// load bits [ 48,112), shift to [ 51,112)
|
||||
, (load8(&bytes[ 6..]) >> 3) & low_51_bit_mask
|
||||
// load bits [ 96,160), shift to [102,160)
|
||||
, (load8(&bytes[12..]) >> 6) & low_51_bit_mask
|
||||
// load bits [152,216), shift to [153,216)
|
||||
, (load8(&bytes[19..]) >> 1) & low_51_bit_mask
|
||||
// load bits [192,256), shift to [204,112)
|
||||
, (load8(&bytes[24..]) >> 12) & low_51_bit_mask
|
||||
])
|
||||
}
|
||||
|
||||
/// Serialize this `FieldElement51` to a 32-byte array. The
|
||||
/// encoding is canonical.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
// Let h = limbs[0] + limbs[1]*2^51 + ... + limbs[4]*2^204.
|
||||
//
|
||||
// Write h = pq + r with 0 <= r < p.
|
||||
//
|
||||
// We want to compute r = h mod p.
|
||||
//
|
||||
// If h < 2*p = 2^256 - 38,
|
||||
// then q = 0 or 1,
|
||||
//
|
||||
// with q = 0 when h < p
|
||||
// and q = 1 when h >= p.
|
||||
//
|
||||
// Notice that h >= p <==> h + 19 >= p + 19 <==> h + 19 >= 2^255.
|
||||
// Therefore q can be computed as the carry bit of h + 19.
|
||||
|
||||
// First, reduce the limbs to ensure h < 2*p.
|
||||
let mut limbs = FieldElement51::reduce(self.0).0;
|
||||
|
||||
let mut q = (limbs[0] + 19) >> 51;
|
||||
q = (limbs[1] + q) >> 51;
|
||||
q = (limbs[2] + q) >> 51;
|
||||
q = (limbs[3] + q) >> 51;
|
||||
q = (limbs[4] + q) >> 51;
|
||||
|
||||
// Now we can compute r as r = h - pq = r - (2^255-19)q = r + 19q - 2^255q
|
||||
|
||||
limbs[0] += 19*q;
|
||||
|
||||
// Now carry the result to compute r + 19q ...
|
||||
let low_51_bit_mask = (1u64 << 51) - 1;
|
||||
limbs[1] += limbs[0] >> 51;
|
||||
limbs[0] = limbs[0] & low_51_bit_mask;
|
||||
limbs[2] += limbs[1] >> 51;
|
||||
limbs[1] = limbs[1] & low_51_bit_mask;
|
||||
limbs[3] += limbs[2] >> 51;
|
||||
limbs[2] = limbs[2] & low_51_bit_mask;
|
||||
limbs[4] += limbs[3] >> 51;
|
||||
limbs[3] = limbs[3] & low_51_bit_mask;
|
||||
// ... but instead of carrying (limbs[4] >> 51) = 2^255q
|
||||
// into another limb, discard it, subtracting the value
|
||||
limbs[4] = limbs[4] & low_51_bit_mask;
|
||||
|
||||
// Now arrange the bits of the limbs.
|
||||
let mut s = [0u8;32];
|
||||
s[ 0] = limbs[0] as u8;
|
||||
s[ 1] = (limbs[0] >> 8) as u8;
|
||||
s[ 2] = (limbs[0] >> 16) as u8;
|
||||
s[ 3] = (limbs[0] >> 24) as u8;
|
||||
s[ 4] = (limbs[0] >> 32) as u8;
|
||||
s[ 5] = (limbs[0] >> 40) as u8;
|
||||
s[ 6] = ((limbs[0] >> 48) | (limbs[1] << 3)) as u8;
|
||||
s[ 7] = (limbs[1] >> 5) as u8;
|
||||
s[ 8] = (limbs[1] >> 13) as u8;
|
||||
s[ 9] = (limbs[1] >> 21) as u8;
|
||||
s[10] = (limbs[1] >> 29) as u8;
|
||||
s[11] = (limbs[1] >> 37) as u8;
|
||||
s[12] = ((limbs[1] >> 45) | (limbs[2] << 6)) as u8;
|
||||
s[13] = (limbs[2] >> 2) as u8;
|
||||
s[14] = (limbs[2] >> 10) as u8;
|
||||
s[15] = (limbs[2] >> 18) as u8;
|
||||
s[16] = (limbs[2] >> 26) as u8;
|
||||
s[17] = (limbs[2] >> 34) as u8;
|
||||
s[18] = (limbs[2] >> 42) as u8;
|
||||
s[19] = ((limbs[2] >> 50) | (limbs[3] << 1)) as u8;
|
||||
s[20] = (limbs[3] >> 7) as u8;
|
||||
s[21] = (limbs[3] >> 15) as u8;
|
||||
s[22] = (limbs[3] >> 23) as u8;
|
||||
s[23] = (limbs[3] >> 31) as u8;
|
||||
s[24] = (limbs[3] >> 39) as u8;
|
||||
s[25] = ((limbs[3] >> 47) | (limbs[4] << 4)) as u8;
|
||||
s[26] = (limbs[4] >> 4) as u8;
|
||||
s[27] = (limbs[4] >> 12) as u8;
|
||||
s[28] = (limbs[4] >> 20) as u8;
|
||||
s[29] = (limbs[4] >> 28) as u8;
|
||||
s[30] = (limbs[4] >> 36) as u8;
|
||||
s[31] = (limbs[4] >> 44) as u8;
|
||||
|
||||
// High bit should be zero.
|
||||
debug_assert!((s[31] & 0b1000_0000u8) == 0u8);
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
/// Given `k > 0`, return `self^(2^k)`.
|
||||
pub fn pow2k(&self, mut k: u32) -> FieldElement51 {
|
||||
|
||||
debug_assert!( k > 0 );
|
||||
|
||||
/// Multiply two 64-bit integers with 128 bits of output.
|
||||
#[inline(always)]
|
||||
fn m(x: u64, y: u64) -> u128 { (x as u128) * (y as u128) }
|
||||
|
||||
let mut a: [u64; 5] = self.0;
|
||||
|
||||
loop {
|
||||
// Precondition: assume input limbs a[i] are bounded as
|
||||
//
|
||||
// a[i] < 2^(51 + b)
|
||||
//
|
||||
// where b is a real parameter measuring the "bit excess" of the limbs.
|
||||
|
||||
// Precomputation: 64-bit multiply by 19.
|
||||
//
|
||||
// This fits into a u64 whenever 51 + b + lg(19) < 64.
|
||||
//
|
||||
// Since 51 + b + lg(19) < 51 + 4.25 + b
|
||||
// = 55.25 + b,
|
||||
// this fits if b < 8.75.
|
||||
let a3_19 = 19 * a[3];
|
||||
let a4_19 = 19 * a[4];
|
||||
|
||||
// Multiply to get 128-bit coefficients of output.
|
||||
//
|
||||
// The 128-bit multiplications by 2 turn into 1 slr + 1 slrd each,
|
||||
// which doesn't seem any better or worse than doing them as precomputations
|
||||
// on the 64-bit inputs.
|
||||
let c0: u128 = m(a[0], a[0]) + 2*( m(a[1], a4_19) + m(a[2], a3_19) );
|
||||
let mut c1: u128 = m(a[3], a3_19) + 2*( m(a[0], a[1]) + m(a[2], a4_19) );
|
||||
let mut c2: u128 = m(a[1], a[1]) + 2*( m(a[0], a[2]) + m(a[4], a3_19) );
|
||||
let mut c3: u128 = m(a[4], a4_19) + 2*( m(a[0], a[3]) + m(a[1], a[2]) );
|
||||
let mut c4: u128 = m(a[2], a[2]) + 2*( m(a[0], a[4]) + m(a[1], a[3]) );
|
||||
|
||||
// Same bound as in multiply:
|
||||
// c[i] < 2^(102 + 2*b) * (1+i + (4-i)*19)
|
||||
// < 2^(102 + lg(1 + 4*19) + 2*b)
|
||||
// < 2^(108.27 + 2*b)
|
||||
//
|
||||
// The carry (c[i] >> 51) fits into a u64 when
|
||||
// 108.27 + 2*b - 51 < 64
|
||||
// 2*b < 6.73
|
||||
// b < 3.365.
|
||||
//
|
||||
// So we require b < 3 to ensure this fits.
|
||||
debug_assert!(a[0] < (1 << 54));
|
||||
debug_assert!(a[1] < (1 << 54));
|
||||
debug_assert!(a[2] < (1 << 54));
|
||||
debug_assert!(a[3] < (1 << 54));
|
||||
debug_assert!(a[4] < (1 << 54));
|
||||
|
||||
const LOW_51_BIT_MASK: u64 = (1u64 << 51) - 1;
|
||||
|
||||
// Casting to u64 and back tells the compiler that the carry is bounded by 2^64, so
|
||||
// that the addition is a u128 + u64 rather than u128 + u128.
|
||||
c1 += ((c0 >> 51) as u64) as u128;
|
||||
a[0] = (c0 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c2 += ((c1 >> 51) as u64) as u128;
|
||||
a[1] = (c1 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c3 += ((c2 >> 51) as u64) as u128;
|
||||
a[2] = (c2 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
c4 += ((c3 >> 51) as u64) as u128;
|
||||
a[3] = (c3 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
let carry: u64 = (c4 >> 51) as u64;
|
||||
a[4] = (c4 as u64) & LOW_51_BIT_MASK;
|
||||
|
||||
// To see that this does not overflow, we need a[0] + carry * 19 < 2^64.
|
||||
//
|
||||
// c4 < a2^2 + 2*a0*a4 + 2*a1*a3 + (carry from c3)
|
||||
// < 2^(102 + 2*b + lg(5)) + 2^64.
|
||||
//
|
||||
// When b < 3 we get
|
||||
//
|
||||
// c4 < 2^110.33 so that carry < 2^59.33
|
||||
//
|
||||
// so that
|
||||
//
|
||||
// a[0] + carry * 19 < 2^51 + 19 * 2^59.33 < 2^63.58
|
||||
//
|
||||
// and there is no overflow.
|
||||
a[0] = a[0] + carry * 19;
|
||||
|
||||
// Now a[1] < 2^51 + 2^(64 -51) = 2^51 + 2^13 < 2^(51 + epsilon).
|
||||
a[1] += a[0] >> 51;
|
||||
a[0] &= LOW_51_BIT_MASK;
|
||||
|
||||
// Now all a[i] < 2^(51 + epsilon) and a = self^(2^k).
|
||||
|
||||
k = k - 1;
|
||||
if k == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
FieldElement51(a)
|
||||
}
|
||||
|
||||
/// Returns the square of this field element.
|
||||
pub fn square(&self) -> FieldElement51 {
|
||||
self.pow2k(1)
|
||||
}
|
||||
|
||||
/// Returns 2 times the square of this field element.
|
||||
pub fn square2(&self) -> FieldElement51 {
|
||||
let mut square = self.pow2k(1);
|
||||
for i in 0..5 {
|
||||
square.0[i] *= 2;
|
||||
}
|
||||
|
||||
square
|
||||
}
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! The `u64` backend uses `u64`s and a `(u64, u64) -> u128` multiplier.
|
||||
//!
|
||||
//! On x86_64, the idiom `(x as u128) * (y as u128)` lowers to `MUL`
|
||||
//! instructions taking 64-bit inputs and producing 128-bit outputs. On
|
||||
//! other platforms, this implementation is not recommended.
|
||||
//!
|
||||
//! On Haswell and newer, the BMI2 extension provides `MULX`, and on
|
||||
//! Broadwell and newer, the ADX extension provides `ADCX` and `ADOX`
|
||||
//! (allowing the CPU to compute two carry chains in parallel). These
|
||||
//! will be used if available.
|
||||
|
||||
pub mod field;
|
||||
|
||||
pub mod scalar;
|
||||
|
||||
pub mod constants;
|
|
@ -1,451 +0,0 @@
|
|||
//! Arithmetic mod \\(2\^{252} + 27742317777372353535851937790883648493\\)
|
||||
//! with five \\(52\\)-bit unsigned limbs.
|
||||
//!
|
||||
//! \\(51\\)-bit limbs would cover the desired bit range (\\(253\\)
|
||||
//! bits), but aren't large enough to reduce a \\(512\\)-bit number with
|
||||
//! Montgomery multiplication, so \\(52\\) bits is used instead. To see
|
||||
//! that this is safe for intermediate results, note that the largest
|
||||
//! limb in a \\(5\times 5\\) product of \\(52\\)-bit limbs will be
|
||||
//!
|
||||
//! ```text
|
||||
//! (0xfffffffffffff^2) * 5 = 0x4ffffffffffff60000000000005 (107 bits).
|
||||
//! ```
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::{Index, IndexMut};
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use constants;
|
||||
|
||||
/// An element of \\(\mathbb Z / \ell \mathbb Z\\), stored as five
/// \\(52\\)-bit limbs in `u64`s.
#[derive(Copy, Clone)]
pub struct Scalar52(pub [u64; 5]);
|
||||
|
||||
impl Debug for Scalar52 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "Scalar52: {:?}", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
impl Zeroize for Scalar52 {
|
||||
fn zeroize(&mut self) {
|
||||
self.0.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
impl Index<usize> for Scalar52 {
|
||||
type Output = u64;
|
||||
fn index(&self, _index: usize) -> &u64 {
|
||||
&(self.0[_index])
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexMut<usize> for Scalar52 {
|
||||
fn index_mut(&mut self, _index: usize) -> &mut u64 {
|
||||
&mut (self.0[_index])
|
||||
}
|
||||
}
|
||||
|
||||
/// Widening multiply: takes two `u64` factors and returns their full
/// 128-bit product.
#[inline(always)]
fn m(x: u64, y: u64) -> u128 {
    u128::from(x) * u128::from(y)
}
|
||||
|
||||
impl Scalar52 {
|
||||
/// Return the zero scalar
|
||||
pub fn zero() -> Scalar52 {
|
||||
Scalar52([0,0,0,0,0])
|
||||
}
|
||||
|
||||
/// Unpack a 32 byte / 256 bit scalar into 5 52-bit limbs.
|
||||
pub fn from_bytes(bytes: &[u8; 32]) -> Scalar52 {
|
||||
let mut words = [0u64; 4];
|
||||
for i in 0..4 {
|
||||
for j in 0..8 {
|
||||
words[i] |= (bytes[(i * 8) + j] as u64) << (j * 8);
|
||||
}
|
||||
}
|
||||
|
||||
let mask = (1u64 << 52) - 1;
|
||||
let top_mask = (1u64 << 48) - 1;
|
||||
let mut s = Scalar52::zero();
|
||||
|
||||
s[ 0] = words[0] & mask;
|
||||
s[ 1] = ((words[0] >> 52) | (words[1] << 12)) & mask;
|
||||
s[ 2] = ((words[1] >> 40) | (words[2] << 24)) & mask;
|
||||
s[ 3] = ((words[2] >> 28) | (words[3] << 36)) & mask;
|
||||
s[ 4] = (words[3] >> 16) & top_mask;
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
/// Reduce a 64 byte / 512 bit scalar mod l
|
||||
pub fn from_bytes_wide(bytes: &[u8; 64]) -> Scalar52 {
|
||||
let mut words = [0u64; 8];
|
||||
for i in 0..8 {
|
||||
for j in 0..8 {
|
||||
words[i] |= (bytes[(i * 8) + j] as u64) << (j * 8);
|
||||
}
|
||||
}
|
||||
|
||||
let mask = (1u64 << 52) - 1;
|
||||
let mut lo = Scalar52::zero();
|
||||
let mut hi = Scalar52::zero();
|
||||
|
||||
lo[0] = words[ 0] & mask;
|
||||
lo[1] = ((words[ 0] >> 52) | (words[ 1] << 12)) & mask;
|
||||
lo[2] = ((words[ 1] >> 40) | (words[ 2] << 24)) & mask;
|
||||
lo[3] = ((words[ 2] >> 28) | (words[ 3] << 36)) & mask;
|
||||
lo[4] = ((words[ 3] >> 16) | (words[ 4] << 48)) & mask;
|
||||
hi[0] = (words[ 4] >> 4) & mask;
|
||||
hi[1] = ((words[ 4] >> 56) | (words[ 5] << 8)) & mask;
|
||||
hi[2] = ((words[ 5] >> 44) | (words[ 6] << 20)) & mask;
|
||||
hi[3] = ((words[ 6] >> 32) | (words[ 7] << 32)) & mask;
|
||||
hi[4] = words[ 7] >> 20 ;
|
||||
|
||||
lo = Scalar52::montgomery_mul(&lo, &constants::R); // (lo * R) / R = lo
|
||||
hi = Scalar52::montgomery_mul(&hi, &constants::RR); // (hi * R^2) / R = hi * R
|
||||
|
||||
Scalar52::add(&hi, &lo)
|
||||
}
|
||||
|
||||
/// Pack the limbs of this `Scalar52` into 32 bytes
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
let mut s = [0u8; 32];
|
||||
|
||||
s[0] = (self.0[ 0] >> 0) as u8;
|
||||
s[1] = (self.0[ 0] >> 8) as u8;
|
||||
s[2] = (self.0[ 0] >> 16) as u8;
|
||||
s[3] = (self.0[ 0] >> 24) as u8;
|
||||
s[4] = (self.0[ 0] >> 32) as u8;
|
||||
s[5] = (self.0[ 0] >> 40) as u8;
|
||||
s[6] = ((self.0[ 0] >> 48) | (self.0[ 1] << 4)) as u8;
|
||||
s[7] = (self.0[ 1] >> 4) as u8;
|
||||
s[8] = (self.0[ 1] >> 12) as u8;
|
||||
s[9] = (self.0[ 1] >> 20) as u8;
|
||||
s[10] = (self.0[ 1] >> 28) as u8;
|
||||
s[11] = (self.0[ 1] >> 36) as u8;
|
||||
s[12] = (self.0[ 1] >> 44) as u8;
|
||||
s[13] = (self.0[ 2] >> 0) as u8;
|
||||
s[14] = (self.0[ 2] >> 8) as u8;
|
||||
s[15] = (self.0[ 2] >> 16) as u8;
|
||||
s[16] = (self.0[ 2] >> 24) as u8;
|
||||
s[17] = (self.0[ 2] >> 32) as u8;
|
||||
s[18] = (self.0[ 2] >> 40) as u8;
|
||||
s[19] = ((self.0[ 2] >> 48) | (self.0[ 3] << 4)) as u8;
|
||||
s[20] = (self.0[ 3] >> 4) as u8;
|
||||
s[21] = (self.0[ 3] >> 12) as u8;
|
||||
s[22] = (self.0[ 3] >> 20) as u8;
|
||||
s[23] = (self.0[ 3] >> 28) as u8;
|
||||
s[24] = (self.0[ 3] >> 36) as u8;
|
||||
s[25] = (self.0[ 3] >> 44) as u8;
|
||||
s[26] = (self.0[ 4] >> 0) as u8;
|
||||
s[27] = (self.0[ 4] >> 8) as u8;
|
||||
s[28] = (self.0[ 4] >> 16) as u8;
|
||||
s[29] = (self.0[ 4] >> 24) as u8;
|
||||
s[30] = (self.0[ 4] >> 32) as u8;
|
||||
s[31] = (self.0[ 4] >> 40) as u8;
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
/// Compute `a + b` (mod l)
|
||||
pub fn add(a: &Scalar52, b: &Scalar52) -> Scalar52 {
|
||||
let mut sum = Scalar52::zero();
|
||||
let mask = (1u64 << 52) - 1;
|
||||
|
||||
// a + b
|
||||
let mut carry: u64 = 0;
|
||||
for i in 0..5 {
|
||||
carry = a[i] + b[i] + (carry >> 52);
|
||||
sum[i] = carry & mask;
|
||||
}
|
||||
|
||||
// subtract l if the sum is >= l
|
||||
Scalar52::sub(&sum, &constants::L)
|
||||
}
|
||||
|
||||
/// Compute `a - b` (mod l)
|
||||
pub fn sub(a: &Scalar52, b: &Scalar52) -> Scalar52 {
|
||||
let mut difference = Scalar52::zero();
|
||||
let mask = (1u64 << 52) - 1;
|
||||
|
||||
// a - b
|
||||
let mut borrow: u64 = 0;
|
||||
for i in 0..5 {
|
||||
borrow = a[i].wrapping_sub(b[i] + (borrow >> 63));
|
||||
difference[i] = borrow & mask;
|
||||
}
|
||||
|
||||
// conditionally add l if the difference is negative
|
||||
let underflow_mask = ((borrow >> 63) ^ 1).wrapping_sub(1);
|
||||
let mut carry: u64 = 0;
|
||||
for i in 0..5 {
|
||||
carry = (carry >> 52) + difference[i] + (constants::L[i] & underflow_mask);
|
||||
difference[i] = carry & mask;
|
||||
}
|
||||
|
||||
difference
|
||||
}
|
||||
|
||||
/// Compute `a * b`
|
||||
#[inline(always)]
|
||||
pub (crate) fn mul_internal(a: &Scalar52, b: &Scalar52) -> [u128; 9] {
|
||||
let mut z = [0u128; 9];
|
||||
|
||||
z[0] = m(a[0],b[0]);
|
||||
z[1] = m(a[0],b[1]) + m(a[1],b[0]);
|
||||
z[2] = m(a[0],b[2]) + m(a[1],b[1]) + m(a[2],b[0]);
|
||||
z[3] = m(a[0],b[3]) + m(a[1],b[2]) + m(a[2],b[1]) + m(a[3],b[0]);
|
||||
z[4] = m(a[0],b[4]) + m(a[1],b[3]) + m(a[2],b[2]) + m(a[3],b[1]) + m(a[4],b[0]);
|
||||
z[5] = m(a[1],b[4]) + m(a[2],b[3]) + m(a[3],b[2]) + m(a[4],b[1]);
|
||||
z[6] = m(a[2],b[4]) + m(a[3],b[3]) + m(a[4],b[2]);
|
||||
z[7] = m(a[3],b[4]) + m(a[4],b[3]);
|
||||
z[8] = m(a[4],b[4]);
|
||||
|
||||
z
|
||||
}
|
||||
|
||||
/// Compute `a^2`
|
||||
#[inline(always)]
|
||||
fn square_internal(a: &Scalar52) -> [u128; 9] {
|
||||
let aa = [
|
||||
a[0]*2,
|
||||
a[1]*2,
|
||||
a[2]*2,
|
||||
a[3]*2,
|
||||
];
|
||||
|
||||
[
|
||||
m( a[0],a[0]),
|
||||
m(aa[0],a[1]),
|
||||
m(aa[0],a[2]) + m( a[1],a[1]),
|
||||
m(aa[0],a[3]) + m(aa[1],a[2]),
|
||||
m(aa[0],a[4]) + m(aa[1],a[3]) + m( a[2],a[2]),
|
||||
m(aa[1],a[4]) + m(aa[2],a[3]),
|
||||
m(aa[2],a[4]) + m( a[3],a[3]),
|
||||
m(aa[3],a[4]),
|
||||
m(a[4],a[4])
|
||||
]
|
||||
}
|
||||
|
||||
/// Compute `limbs/R` (mod l), where R is the Montgomery modulus 2^260
|
||||
#[inline(always)]
|
||||
pub (crate) fn montgomery_reduce(limbs: &[u128; 9]) -> Scalar52 {
|
||||
|
||||
#[inline(always)]
|
||||
fn part1(sum: u128) -> (u128, u64) {
|
||||
let p = (sum as u64).wrapping_mul(constants::LFACTOR) & ((1u64 << 52) - 1);
|
||||
((sum + m(p,constants::L[0])) >> 52, p)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn part2(sum: u128) -> (u128, u64) {
|
||||
let w = (sum as u64) & ((1u64 << 52) - 1);
|
||||
(sum >> 52, w)
|
||||
}
|
||||
|
||||
// note: l[3] is zero, so its multiples can be skipped
|
||||
let l = &constants::L;
|
||||
|
||||
// the first half computes the Montgomery adjustment factor n, and begins adding n*l to make limbs divisible by R
|
||||
let (carry, n0) = part1( limbs[0]);
|
||||
let (carry, n1) = part1(carry + limbs[1] + m(n0,l[1]));
|
||||
let (carry, n2) = part1(carry + limbs[2] + m(n0,l[2]) + m(n1,l[1]));
|
||||
let (carry, n3) = part1(carry + limbs[3] + m(n1,l[2]) + m(n2,l[1]));
|
||||
let (carry, n4) = part1(carry + limbs[4] + m(n0,l[4]) + m(n2,l[2]) + m(n3,l[1]));
|
||||
|
||||
// limbs is divisible by R now, so we can divide by R by simply storing the upper half as the result
|
||||
let (carry, r0) = part2(carry + limbs[5] + m(n1,l[4]) + m(n3,l[2]) + m(n4,l[1]));
|
||||
let (carry, r1) = part2(carry + limbs[6] + m(n2,l[4]) + m(n4,l[2]));
|
||||
let (carry, r2) = part2(carry + limbs[7] + m(n3,l[4]) );
|
||||
let (carry, r3) = part2(carry + limbs[8] + m(n4,l[4]));
|
||||
let r4 = carry as u64;
|
||||
|
||||
// result may be >= l, so attempt to subtract l
|
||||
Scalar52::sub(&Scalar52([r0,r1,r2,r3,r4]), l)
|
||||
}
|
||||
|
||||
/// Compute `a * b` (mod l)
|
||||
#[inline(never)]
|
||||
pub fn mul(a: &Scalar52, b: &Scalar52) -> Scalar52 {
|
||||
let ab = Scalar52::montgomery_reduce(&Scalar52::mul_internal(a, b));
|
||||
Scalar52::montgomery_reduce(&Scalar52::mul_internal(&ab, &constants::RR))
|
||||
}
|
||||
|
||||
/// Compute `a^2` (mod l)
|
||||
#[inline(never)]
|
||||
#[allow(dead_code)] // XXX we don't expose square() via the Scalar API
|
||||
pub fn square(&self) -> Scalar52 {
|
||||
let aa = Scalar52::montgomery_reduce(&Scalar52::square_internal(self));
|
||||
Scalar52::montgomery_reduce(&Scalar52::mul_internal(&aa, &constants::RR))
|
||||
}
|
||||
|
||||
/// Compute `(a * b) / R` (mod l), where R is the Montgomery modulus 2^260
|
||||
#[inline(never)]
|
||||
pub fn montgomery_mul(a: &Scalar52, b: &Scalar52) -> Scalar52 {
|
||||
Scalar52::montgomery_reduce(&Scalar52::mul_internal(a, b))
|
||||
}
|
||||
|
||||
/// Compute `(a^2) / R` (mod l) in Montgomery form, where R is the Montgomery modulus 2^260
|
||||
#[inline(never)]
|
||||
pub fn montgomery_square(&self) -> Scalar52 {
|
||||
Scalar52::montgomery_reduce(&Scalar52::square_internal(self))
|
||||
}
|
||||
|
||||
/// Puts a Scalar52 in to Montgomery form, i.e. computes `a*R (mod l)`
|
||||
#[inline(never)]
|
||||
pub fn to_montgomery(&self) -> Scalar52 {
|
||||
Scalar52::montgomery_mul(self, &constants::RR)
|
||||
}
|
||||
|
||||
/// Takes a Scalar52 out of Montgomery form, i.e. computes `a/R (mod l)`
|
||||
#[inline(never)]
|
||||
pub fn from_montgomery(&self) -> Scalar52 {
|
||||
let mut limbs = [0u128; 9];
|
||||
for i in 0..5 {
|
||||
limbs[i] = self[i] as u128;
|
||||
}
|
||||
Scalar52::montgomery_reduce(&limbs)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    /// Note: x is 2^253-1 which is slightly larger than the largest scalar produced by
    /// this implementation (l-1), and should show there are no overflows for valid scalars
    ///
    /// x = 14474011154664524427946373126085988481658748083205070504932198000989141204991
    /// x = 7237005577332262213973186563042994240801631723825162898930247062703686954002 mod l
    /// x = 3057150787695215392275360544382990118917283750546154083604586903220563173085*R mod l in Montgomery form
    pub static X: Scalar52 = Scalar52(
        [0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff,
         0x00001fffffffffff]);

    /// x^2 = 3078544782642840487852506753550082162405942681916160040940637093560259278169 mod l
    pub static XX: Scalar52 = Scalar52(
        [0x0001668020217559, 0x000531640ffd0ec0, 0x00085fd6f9f38a31, 0x000c268f73bb1cf4,
         0x000006ce65046df0]);

    /// x^2 = 4413052134910308800482070043710297189082115023966588301924965890668401540959*R mod l in Montgomery form
    pub static XX_MONT: Scalar52 = Scalar52(
        [0x000c754eea569a5c, 0x00063b6ed36cb215, 0x0008ffa36bf25886, 0x000e9183614e7543,
         0x0000061db6c6f26f]);

    /// y = 6145104759870991071742105800796537629880401874866217824609283457819451087098
    pub static Y: Scalar52 = Scalar52(
        [0x000b75071e1458fa, 0x000bf9d75e1ecdac, 0x000433d2baf0672b, 0x0005fffcc11fad13,
         0x00000d96018bb825]);

    /// x*y = 36752150652102274958925982391442301741 mod l
    pub static XY: Scalar52 = Scalar52(
        [0x000ee6d76ba7632d, 0x000ed50d71d84e02, 0x00000000001ba634, 0x0000000000000000,
         0x0000000000000000]);

    /// x*y = 658448296334113745583381664921721413881518248721417041768778176391714104386*R mod l in Montgomery form
    pub static XY_MONT: Scalar52 = Scalar52(
        [0x0006d52bf200cfd5, 0x00033fb1d7021570, 0x000f201bc07139d8, 0x0001267e3e49169e,
         0x000007b839c00268]);

    /// a = 2351415481556538453565687241199399922945659411799870114962672658845158063753
    pub static A: Scalar52 = Scalar52(
        [0x0005236c07b3be89, 0x0001bc3d2a67c0c4, 0x000a4aa782aae3ee, 0x0006b3f6e4fec4c4,
         0x00000532da9fab8c]);

    /// b = 4885590095775723760407499321843594317911456947580037491039278279440296187236
    pub static B: Scalar52 = Scalar52(
        [0x000d3fae55421564, 0x000c2df24f65a4bc, 0x0005b5587d69fb0b, 0x00094c091b013b3b,
         0x00000acd25605473]);

    /// a+b = 0
    /// a-b = 4702830963113076907131374482398799845891318823599740229925345317690316127506
    pub static AB: Scalar52 = Scalar52(
        [0x000a46d80f677d12, 0x0003787a54cf8188, 0x0004954f0555c7dc, 0x000d67edc9fd8989,
         0x00000a65b53f5718]);

    /// c = (2^512 - 1) % l = 1627715501170711445284395025044413883736156588369414752970002579683115011840
    pub static C: Scalar52 = Scalar52(
        [0x000611e3449c0f00, 0x000a768859347a40, 0x0007f5be65d00e1b, 0x0009a3dceec73d21,
         0x00000399411b7c30]);

    /// Compare two scalars limb by limb.
    ///
    /// Uses `assert_eq!` so that a failure reports the offending limb index
    /// together with both values, instead of a bare "assertion failed".
    fn assert_limbs_eq(actual: &Scalar52, expected: &Scalar52) {
        for i in 0..5 {
            assert_eq!(actual[i], expected[i], "limb {} differs", i);
        }
    }

    #[test]
    fn mul_max() {
        assert_limbs_eq(&Scalar52::mul(&X, &X), &XX);
    }

    #[test]
    fn square_max() {
        assert_limbs_eq(&X.square(), &XX);
    }

    #[test]
    fn montgomery_mul_max() {
        assert_limbs_eq(&Scalar52::montgomery_mul(&X, &X), &XX_MONT);
    }

    #[test]
    fn montgomery_square_max() {
        assert_limbs_eq(&X.montgomery_square(), &XX_MONT);
    }

    #[test]
    fn mul() {
        assert_limbs_eq(&Scalar52::mul(&X, &Y), &XY);
    }

    #[test]
    fn montgomery_mul() {
        assert_limbs_eq(&Scalar52::montgomery_mul(&X, &Y), &XY_MONT);
    }

    #[test]
    fn add() {
        // a and b were chosen so that a + b = l, i.e. 0 mod l.
        assert_limbs_eq(&Scalar52::add(&A, &B), &Scalar52::zero());
    }

    #[test]
    fn sub() {
        assert_limbs_eq(&Scalar52::sub(&A, &B), &AB);
    }

    #[test]
    fn from_bytes_wide() {
        let bignum = [255u8; 64]; // 2^512 - 1
        let reduced = Scalar52::from_bytes_wide(&bignum);
        assert_limbs_eq(&reduced, &C);
    }
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,524 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Parallel Edwards Arithmetic for Curve25519.
|
||||
//!
|
||||
//! This module currently has two point types:
|
||||
//!
|
||||
//! * `ExtendedPoint`: a point stored in vector-friendly format, with
|
||||
//! vectorized doubling and addition;
|
||||
//!
|
||||
//! * `CachedPoint`: used for readdition.
|
||||
//!
|
||||
//! Details on the formulas can be found in the documentation for the
|
||||
//! parent `avx2` module.
|
||||
//!
|
||||
//! This API is designed to be safe: vectorized points can only be
|
||||
//! created from serial points (which do validation on decompression),
|
||||
//! and operations on valid points return valid points, so invalid
|
||||
//! point states should be unrepresentable.
|
||||
//!
|
||||
//! This design goal is met, with one exception: the `Neg`
|
||||
//! implementation for the `CachedPoint` performs a lazy negation, so
|
||||
//! that subtraction can be efficiently implemented as a negation and
|
||||
//! an addition. Repeatedly negating a `CachedPoint` will cause its
|
||||
//! coefficients to grow and eventually overflow. Repeatedly negating
|
||||
//! a point should not be necessary anyways.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::convert::From;
|
||||
use core::ops::{Add, Neg, Sub};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use edwards;
|
||||
use window::{LookupTable, NafLookupTable5, NafLookupTable8};
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
use super::constants;
|
||||
use super::field::{FieldElement2625x4, Lanes, Shuffle};
|
||||
|
||||
/// A point on Curve25519, using parallel Edwards formulas for curve
/// operations.
///
/// The four coordinates \\( (X, Y, Z, T) \\) are held in the four lanes
/// \\( (A, B, C, D) \\) of a single `FieldElement2625x4`.
///
/// # Invariant
///
/// The coefficients of an `ExtendedPoint` are bounded with
/// \\( b < 0.007 \\).
#[derive(Copy, Clone, Debug)]
pub struct ExtendedPoint(pub(super) FieldElement2625x4);
|
||||
|
||||
impl From<edwards::EdwardsPoint> for ExtendedPoint {
|
||||
fn from(P: edwards::EdwardsPoint) -> ExtendedPoint {
|
||||
ExtendedPoint(FieldElement2625x4::new(&P.X, &P.Y, &P.Z, &P.T))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ExtendedPoint> for edwards::EdwardsPoint {
    /// Split the vector back into its four lanes and rebuild the serial
    /// point from them, lane order `(X, Y, Z, T)`.
    fn from(P: ExtendedPoint) -> edwards::EdwardsPoint {
        let lanes = P.0.split();
        let (X, Y, Z, T) = (lanes[0], lanes[1], lanes[2], lanes[3]);
        edwards::EdwardsPoint { X, Y, Z, T }
    }
}
|
||||
|
||||
impl ConditionallySelectable for ExtendedPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
ExtendedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice))
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.0.conditional_assign(&other.0, choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExtendedPoint {
|
||||
fn default() -> ExtendedPoint {
|
||||
ExtendedPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for ExtendedPoint {
|
||||
fn identity() -> ExtendedPoint {
|
||||
constants::EXTENDEDPOINT_IDENTITY
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtendedPoint {
|
||||
/// Compute the double of this point.
|
||||
pub fn double(&self) -> ExtendedPoint {
|
||||
// Want to compute (X1 Y1 Z1 X1+Y1).
|
||||
// Not sure how to do this less expensively than computing
|
||||
// (X1 Y1 Z1 T1) --(256bit shuffle)--> (X1 Y1 X1 Y1)
|
||||
// (X1 Y1 X1 Y1) --(2x128b shuffle)--> (Y1 X1 Y1 X1)
|
||||
// and then adding.
|
||||
|
||||
// Set tmp0 = (X1 Y1 X1 Y1)
|
||||
let mut tmp0 = self.0.shuffle(Shuffle::ABAB);
|
||||
|
||||
// Set tmp1 = (Y1 X1 Y1 X1)
|
||||
let mut tmp1 = tmp0.shuffle(Shuffle::BADC);
|
||||
|
||||
// Set tmp0 = (X1 Y1 Z1 X1+Y1)
|
||||
tmp0 = self.0.blend(tmp0 + tmp1, Lanes::D);
|
||||
|
||||
// Set tmp1 = tmp0^2, negating the D values
|
||||
tmp1 = tmp0.square_and_negate_D();
|
||||
// Now tmp1 = (S1 S2 S3 -S4) with b < 0.007
|
||||
|
||||
// See discussion of bounds in the module-level documentation.
|
||||
// We want to compute
|
||||
//
|
||||
// + | S1 | S1 | S1 | S1 |
|
||||
// + | S2 | | | S2 |
|
||||
// + | | | S3 | |
|
||||
// + | | | S3 | |
|
||||
// + | | | |-S4 |
|
||||
// + | | 2p | 2p | |
|
||||
// - | | S2 | S2 | |
|
||||
// =======================
|
||||
// S5 S6 S8 S9
|
||||
|
||||
let zero = FieldElement2625x4::zero();
|
||||
let S_1 = tmp1.shuffle(Shuffle::AAAA);
|
||||
let S_2 = tmp1.shuffle(Shuffle::BBBB);
|
||||
|
||||
tmp0 = zero.blend(tmp1 + tmp1, Lanes::C);
|
||||
// tmp0 = (0, 0, 2S_3, 0)
|
||||
tmp0 = tmp0.blend(tmp1, Lanes::D);
|
||||
// tmp0 = (0, 0, 2S_3, -S_4)
|
||||
tmp0 = tmp0 + S_1;
|
||||
// tmp0 = ( S_1, S_1, S_1 + 2S_3, S_1 - S_4)
|
||||
tmp0 = tmp0 + zero.blend(S_2, Lanes::AD);
|
||||
// tmp0 = (S_1 + S_2, S_1, S_1 + 2S_3, S_1 + S_2 - S_4)
|
||||
tmp0 = tmp0 + zero.blend(S_2.negate_lazy(), Lanes::BC);
|
||||
// tmp0 = (S_1 + S_2, S_1 - S_2, S_1 - S_2 + 2S_3, S_1 + S_2 - S_4)
|
||||
// b < ( 1.01, 1.6, 2.33, 1.6)
|
||||
// Now tmp0 = (S_5, S_6, S_8, S_9)
|
||||
|
||||
// Set tmp1 = ( S_9, S_6, S_6, S_9)
|
||||
// b < ( 1.6, 1.6, 1.6, 1.6)
|
||||
tmp1 = tmp0.shuffle(Shuffle::DBBD);
|
||||
// Set tmp1 = ( S_8, S_5, S_8, S_5)
|
||||
// b < (2.33, 1.01, 2.33, 1.01)
|
||||
tmp0 = tmp0.shuffle(Shuffle::CACA);
|
||||
|
||||
// Bounds on (tmp0, tmp1) are (2.33, 1.6) < (2.5, 1.75).
|
||||
ExtendedPoint(&tmp0 * &tmp1)
|
||||
}
|
||||
|
||||
pub fn mul_by_pow_2(&self, k: u32) -> ExtendedPoint {
|
||||
let mut tmp: ExtendedPoint = *self;
|
||||
for _ in 0..k {
|
||||
tmp = tmp.double();
|
||||
}
|
||||
tmp
|
||||
}
|
||||
}
|
||||
|
||||
/// A cached point with some precomputed variables used for readdition.
///
/// Built from an `ExtendedPoint` via `From`; it is the right-hand operand
/// of the `Add`/`Sub` implementations on `&ExtendedPoint`.
///
/// # Warning
///
/// It is not safe to negate this point more than once.
///
/// # Invariant
///
/// As long as the `CachedPoint` is not repeatedly negated, its
/// coefficients will be bounded with \\( b < 1.0 \\).
#[derive(Copy, Clone, Debug)]
pub struct CachedPoint(pub(super) FieldElement2625x4);
|
||||
|
||||
impl From<ExtendedPoint> for CachedPoint {
|
||||
fn from(P: ExtendedPoint) -> CachedPoint {
|
||||
let mut x = P.0;
|
||||
|
||||
x = x.blend(x.diff_sum(), Lanes::AB);
|
||||
// x = (X1 - Y1, X2 + Y2, Z2, T2) = (S2 S3 Z2 T2)
|
||||
|
||||
x = x * (121666, 121666, 2 * 121666, 2 * 121665);
|
||||
// x = (121666*S2 121666*S3 2*121666*Z2 2*121665*T2)
|
||||
|
||||
x = x.blend(-x, Lanes::D);
|
||||
// x = (121666*S2 121666*S3 2*121666*Z2 -2*121665*T2)
|
||||
|
||||
// The coefficients of the output are bounded with b < 0.007.
|
||||
CachedPoint(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CachedPoint {
|
||||
fn default() -> CachedPoint {
|
||||
CachedPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for CachedPoint {
|
||||
fn identity() -> CachedPoint {
|
||||
constants::CACHEDPOINT_IDENTITY
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for CachedPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
CachedPoint(FieldElement2625x4::conditional_select(&a.0, &b.0, choice))
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.0.conditional_assign(&other.0, choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a CachedPoint {
|
||||
type Output = CachedPoint;
|
||||
/// Lazily negate the point.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Because this method does not perform a reduction, it is not
|
||||
/// safe to repeatedly negate a point.
|
||||
fn neg(self) -> CachedPoint {
|
||||
let swapped = self.0.shuffle(Shuffle::BACD);
|
||||
CachedPoint(swapped.blend(swapped.negate_lazy(), Lanes::D))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Add<&'b CachedPoint> for &'a ExtendedPoint {
|
||||
type Output = ExtendedPoint;
|
||||
|
||||
/// Add an `ExtendedPoint` and a `CachedPoint`.
|
||||
fn add(self, other: &'b CachedPoint) -> ExtendedPoint {
|
||||
// The coefficients of an `ExtendedPoint` are reduced after
|
||||
// every operation. If the `CachedPoint` was negated, its
|
||||
// coefficients grow by one bit. So on input, `self` is
|
||||
// bounded with `b < 0.007` and `other` is bounded with
|
||||
// `b < 1.0`.
|
||||
|
||||
let mut tmp = self.0;
|
||||
|
||||
tmp = tmp.blend(tmp.diff_sum(), Lanes::AB);
|
||||
// tmp = (Y1-X1 Y1+X1 Z1 T1) = (S0 S1 Z1 T1) with b < 1.6
|
||||
|
||||
// (tmp, other) bounded with b < (1.6, 1.0) < (2.5, 1.75).
|
||||
tmp = &tmp * &other.0;
|
||||
// tmp = (S0*S2' S1*S3' Z1*Z2' T1*T2') = (S8 S9 S10 S11)
|
||||
|
||||
tmp = tmp.shuffle(Shuffle::ABDC);
|
||||
// tmp = (S8 S9 S11 S10)
|
||||
|
||||
tmp = tmp.diff_sum();
|
||||
// tmp = (S9-S8 S9+S8 S10-S11 S10+S11) = (S12 S13 S14 S15)
|
||||
|
||||
let t0 = tmp.shuffle(Shuffle::ADDA);
|
||||
// t0 = (S12 S15 S15 S12)
|
||||
let t1 = tmp.shuffle(Shuffle::CBCB);
|
||||
// t1 = (S14 S13 S14 S13)
|
||||
|
||||
// All coefficients of t0, t1 are bounded with b < 1.6.
|
||||
// Return (S12*S14 S15*S13 S15*S14 S12*S13) = (X3 Y3 Z3 T3)
|
||||
ExtendedPoint(&t0 * &t1)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b CachedPoint> for &'a ExtendedPoint {
    type Output = ExtendedPoint;

    /// Implement subtraction by negating the point and adding.
    ///
    /// Empirically, this seems about the same cost as a custom
    /// subtraction impl (maybe because the benefit is cancelled by
    /// increased code size?)
    fn sub(self, other: &'b CachedPoint) -> ExtendedPoint {
        let negated: CachedPoint = -other;
        self + &negated
    }
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for LookupTable<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let P = ExtendedPoint::from(*point);
|
||||
let mut points = [CachedPoint::from(P); 8];
|
||||
for i in 0..7 {
|
||||
points[i + 1] = (&P + &points[i]).into();
|
||||
}
|
||||
LookupTable(points)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable5<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let A = ExtendedPoint::from(*point);
|
||||
let mut Ai = [CachedPoint::from(A); 8];
|
||||
let A2 = A.double();
|
||||
for i in 0..7 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).into();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
|
||||
NafLookupTable5(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable8<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let A = ExtendedPoint::from(*point);
|
||||
let mut Ai = [CachedPoint::from(A); 64];
|
||||
let A2 = A.double();
|
||||
for i in 0..63 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).into();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
|
||||
NafLookupTable8(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    /// Print a field element variable as `name = <bytes>`.
    macro_rules! print_var {
        ($x:ident) => {
            println!("{} = {:?}", stringify!($x), $x.to_bytes());
        };
    }

    /// Serial (one field element at a time) transcription of the parallel
    /// addition formulas, printing every intermediate value.
    fn serial_add(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) -> edwards::EdwardsPoint {
        use backend::serial::u64::field::FieldElement51;

        let (X1, Y1, Z1, T1) = (P.X, P.Y, P.Z, P.T);
        let (X2, Y2, Z2, T2) = (Q.X, Q.Y, Q.Z, Q.T);

        let S0 = &Y1 - &X1; // R1
        let S1 = &Y1 + &X1; // R3
        let S2 = &Y2 - &X2; // R2
        let S3 = &Y2 + &X2; // R4
        print_var!(S0);
        print_var!(S1);
        print_var!(S2);
        print_var!(S3);
        println!();

        let S4 = &S0 * &S2; // R5 = R1 * R2
        let S5 = &S1 * &S3; // R6 = R3 * R4
        let S6 = &Z1 * &Z2; // R8
        let S7 = &T1 * &T2; // R7
        print_var!(S4);
        print_var!(S5);
        print_var!(S6);
        print_var!(S7);
        println!();

        let S8 = &S4 * &FieldElement51([121666, 0, 0, 0, 0]); // R5
        let S9 = &S5 * &FieldElement51([121666, 0, 0, 0, 0]); // R6
        let S10 = &S6 * &FieldElement51([2 * 121666, 0, 0, 0, 0]); // R8
        let S11 = &S7 * &(-&FieldElement51([2 * 121665, 0, 0, 0, 0])); // R7
        print_var!(S8);
        print_var!(S9);
        print_var!(S10);
        print_var!(S11);
        println!();

        let S12 = &S9 - &S8; // R1
        let S13 = &S9 + &S8; // R4
        let S14 = &S10 - &S11; // R2
        let S15 = &S10 + &S11; // R3
        print_var!(S12);
        print_var!(S13);
        print_var!(S14);
        print_var!(S15);
        println!();

        edwards::EdwardsPoint {
            X: &S12 * &S14, // R1 * R2
            Y: &S15 * &S13, // R3 * R4
            Z: &S15 * &S14, // R2 * R3
            T: &S12 * &S13, // R1 * R4
        }
    }

    /// Check `P + Q` and `P - Q` computed by the vector backend (and `P + Q`
    /// by the serial transcription) against the reference Edwards arithmetic.
    fn addition_test_helper(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) {
        // Test the serial implementation of the parallel addition formulas
        let R_serial: edwards::EdwardsPoint = serial_add(P, Q);

        // Test the vector implementation of the parallel readdition formulas
        let cached_Q = CachedPoint::from(ExtendedPoint::from(Q));
        let vector_P = ExtendedPoint::from(P);
        let R_vector: edwards::EdwardsPoint = (&vector_P + &cached_Q).into();
        let S_vector: edwards::EdwardsPoint = (&vector_P - &cached_Q).into();

        // Reference results from the ordinary Edwards point arithmetic.
        let expected_sum = &P + &Q;
        let expected_diff = &P - &Q;

        println!("Testing point addition:");
        println!("P = {:?}", P);
        println!("Q = {:?}", Q);
        println!("cached Q = {:?}", cached_Q);
        println!("R = P + Q = {:?}", expected_sum);
        println!("R_serial = {:?}", R_serial);
        println!("R_vector = {:?}", R_vector);
        println!("S = P - Q = {:?}", expected_diff);
        println!("S_vector = {:?}", S_vector);
        assert_eq!(R_serial.compress(), expected_sum.compress());
        assert_eq!(R_vector.compress(), expected_sum.compress());
        assert_eq!(S_vector.compress(), expected_diff.compress());
        println!("OK!\n");
    }

    #[test]
    fn vector_addition_vs_serial_addition_vs_edwards_extendedpoint() {
        use constants;
        use scalar::Scalar;

        println!("Testing id +- id");
        addition_test_helper(
            edwards::EdwardsPoint::identity(),
            edwards::EdwardsPoint::identity(),
        );

        println!("Testing id +- B");
        addition_test_helper(
            edwards::EdwardsPoint::identity(),
            constants::ED25519_BASEPOINT_POINT,
        );

        println!("Testing B +- B");
        addition_test_helper(
            constants::ED25519_BASEPOINT_POINT,
            constants::ED25519_BASEPOINT_POINT,
        );

        println!("Testing B +- kB");
        let kB = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
        addition_test_helper(constants::ED25519_BASEPOINT_POINT, kB);
    }

    /// Serial transcription of the parallel doubling formulas, printing
    /// every intermediate value.
    fn serial_double(P: edwards::EdwardsPoint) -> edwards::EdwardsPoint {
        let (X1, Y1, Z1, _T1) = (P.X, P.Y, P.Z, P.T);

        let S0 = &X1 + &Y1; // R1
        print_var!(S0);
        println!();

        let S1 = X1.square();
        let S2 = Y1.square();
        let S3 = Z1.square();
        let S4 = S0.square();
        print_var!(S1);
        print_var!(S2);
        print_var!(S3);
        print_var!(S4);
        println!();

        let S5 = &S1 + &S2;
        let S6 = &S1 - &S2;
        let S7 = &S3 + &S3;
        let S8 = &S7 + &S6;
        let S9 = &S5 - &S4;
        print_var!(S5);
        print_var!(S6);
        print_var!(S7);
        print_var!(S8);
        print_var!(S9);
        println!();

        edwards::EdwardsPoint {
            X: &S8 * &S9,
            Y: &S5 * &S6,
            Z: &S8 * &S6,
            T: &S5 * &S9,
        }
    }

    /// Check `[2]P` from the serial transcription and from the vector
    /// backend against `P + P` in the reference Edwards arithmetic.
    fn doubling_test_helper(P: edwards::EdwardsPoint) {
        let R1: edwards::EdwardsPoint = serial_double(P);
        let R2: edwards::EdwardsPoint = ExtendedPoint::from(P).double().into();
        let expected = &P + &P;

        println!("Testing point doubling:");
        println!("P = {:?}", P);
        println!("(serial) R1 = {:?}", R1);
        println!("(vector) R2 = {:?}", R2);
        println!("P + P = {:?}", expected);
        assert_eq!(R1.compress(), expected.compress());
        assert_eq!(R2.compress(), expected.compress());
        println!("OK!\n");
    }

    #[test]
    fn vector_doubling_vs_serial_doubling_vs_edwards_extendedpoint() {
        use constants;
        use scalar::Scalar;

        println!("Testing [2]id");
        doubling_test_helper(edwards::EdwardsPoint::identity());

        println!("Testing [2]B");
        doubling_test_helper(constants::ED25519_BASEPOINT_POINT);

        println!("Testing [2]([k]B)");
        let kB = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
        doubling_test_helper(kB);
    }
}
|
|
@ -1,985 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! An implementation of 4-way vectorized 32bit field arithmetic using
|
||||
//! AVX2.
|
||||
//!
|
||||
//! The `FieldElement2625x4` struct provides a vector of four field
|
||||
//! elements, implemented using AVX2 operations. Its API is designed
|
||||
//! to abstract away the platform-dependent details, so that point
|
||||
//! arithmetic can be implemented only in terms of a vector of field
|
||||
//! elements.
|
||||
//!
|
||||
//! At this level, the API is optimized for speed and not safety. The
|
||||
//! `FieldElement2625x4` does not always perform reductions. The pre-
|
||||
//! and post-conditions on the bounds of the coefficients are
|
||||
//! documented for each method, but it is the caller's responsibility
|
||||
//! to ensure that there are no overflows.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
// Per-lane bit masks over the eight 32-bit positions of a `u32x8`, whose
// layout is (a0, b0, a1, b1, c0, d0, c1, d1): bit `i` set selects position
// `i`.
// NOTE(review): presumably used as 32-bit blend immediates — confirm
// against the call sites.
const A_LANES: u8 = 0b0101;
const B_LANES: u8 = A_LANES << 1;
const C_LANES: u8 = A_LANES << 4;
const D_LANES: u8 = B_LANES << 4;

// Per-lane masks for the unpacked layout, where each lane occupies one
// 64-bit slot (two adjacent 32-bit positions, hence two adjacent bits).
#[allow(unused)]
const A_LANES64: u8 = 0b11;
#[allow(unused)]
const B_LANES64: u8 = A_LANES64 << 2;
#[allow(unused)]
const C_LANES64: u8 = A_LANES64 << 4;
#[allow(unused)]
const D_LANES64: u8 = A_LANES64 << 6;
|
||||
|
||||
use core::ops::{Add, Mul, Neg};
|
||||
use packed_simd::{i32x8, u32x8, u64x4, IntoBits};
|
||||
|
||||
use backend::vector::avx2::constants::{P_TIMES_16_HI, P_TIMES_16_LO, P_TIMES_2_HI, P_TIMES_2_LO};
|
||||
use backend::serial::u64::field::FieldElement51;
|
||||
|
||||
/// Unpack 32-bit lanes into 64-bit lanes:
|
||||
/// ```ascii,no_run
|
||||
/// (a0, b0, a1, b1, c0, d0, c1, d1)
|
||||
/// ```
|
||||
/// into
|
||||
/// ```ascii,no_run
|
||||
/// (a0, 0, b0, 0, c0, 0, d0, 0)
|
||||
/// (a1, 0, b1, 0, c1, 0, d1, 0)
|
||||
/// ```
|
||||
#[inline(always)]
|
||||
fn unpack_pair(src: u32x8) -> (u32x8, u32x8) {
|
||||
let a: u32x8;
|
||||
let b: u32x8;
|
||||
let zero = i32x8::new(0, 0, 0, 0, 0, 0, 0, 0);
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_unpackhi_epi32;
|
||||
use core::arch::x86_64::_mm256_unpacklo_epi32;
|
||||
a = _mm256_unpacklo_epi32(src.into_bits(), zero.into_bits()).into_bits();
|
||||
b = _mm256_unpackhi_epi32(src.into_bits(), zero.into_bits()).into_bits();
|
||||
}
|
||||
(a, b)
|
||||
}
|
||||
|
||||
/// Repack 64-bit lanes into 32-bit lanes:
|
||||
/// ```ascii,no_run
|
||||
/// (a0, 0, b0, 0, c0, 0, d0, 0)
|
||||
/// (a1, 0, b1, 0, c1, 0, d1, 0)
|
||||
/// ```
|
||||
/// into
|
||||
/// ```ascii,no_run
|
||||
/// (a0, b0, a1, b1, c0, d0, c1, d1)
|
||||
/// ```
|
||||
#[inline(always)]
|
||||
fn repack_pair(x: u32x8, y: u32x8) -> u32x8 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_blend_epi32;
|
||||
use core::arch::x86_64::_mm256_shuffle_epi32;
|
||||
|
||||
// Input: x = (a0, 0, b0, 0, c0, 0, d0, 0)
|
||||
// Input: y = (a1, 0, b1, 0, c1, 0, d1, 0)
|
||||
|
||||
let x_shuffled = _mm256_shuffle_epi32(x.into_bits(), 0b11_01_10_00);
|
||||
let y_shuffled = _mm256_shuffle_epi32(y.into_bits(), 0b10_00_11_01);
|
||||
|
||||
// x' = (a0, b0, 0, 0, c0, d0, 0, 0)
|
||||
// y' = ( 0, 0, a1, b1, 0, 0, c1, d1)
|
||||
|
||||
return _mm256_blend_epi32(x_shuffled, y_shuffled, 0b11001100).into_bits();
|
||||
}
|
||||
}
|
||||
|
||||
/// The `Lanes` enum represents a subset of the lanes `A,B,C,D` of a
/// `FieldElement2625x4`.
///
/// It's used to specify blend operations without
/// having to know details about the data layout of the
/// `FieldElement2625x4`.
///
/// Each variant is named by the lanes it contains, e.g. `AB` is lanes
/// `A` and `B`. Only the subsets listed here are available.
#[derive(Copy, Clone, Debug)]
pub enum Lanes {
    C,
    D,
    AB,
    AC,
    CD,
    AD,
    BC,
    ABCD,
}
|
||||
|
||||
/// The `Shuffle` enum represents a shuffle of a `FieldElement2625x4`.
|
||||
///
|
||||
/// The enum variants are named by what they do to a vector \\(
|
||||
/// (A,B,C,D) \\); for instance, `Shuffle::BADC` turns \\( (A, B, C,
|
||||
/// D) \\) into \\( (B, A, D, C) \\).
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum Shuffle {
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (A, A, A, A) \\).
    AAAA,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (B, B, B, B) \\).
    BBBB,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (C, A, C, A) \\).
    CACA,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (D, B, B, D) \\).
    DBBD,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (A, D, D, A) \\).
    ADDA,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (C, B, C, B) \\).
    CBCB,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (A, B, A, B) \\).
    ABAB,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (B, A, D, C) \\).
    BADC,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (B, A, C, D) \\).
    BACD,
|
||||
    /// \\( (A, B, C, D) \\) becomes \\( (A, B, D, C) \\).
    ABDC,
|
||||
}
|
||||
|
||||
/// A vector of four field elements.
|
||||
///
|
||||
/// Each operation on a `FieldElement2625x4` has documented effects on
|
||||
/// the bounds of the coefficients. This API is designed for speed
|
||||
/// and not safety; it is the caller's responsibility to ensure that
|
||||
/// the post-conditions of one operation are compatible with the
|
||||
/// pre-conditions of the next.
|
||||
///
/// # Layout
///
/// As built by `new`, entry `i` of the inner array packs the limb pair
/// `(2i, 2i+1)` of the four elements `A, B, C, D` in the lane order
/// `(a_2i, b_2i, a_2i+1, b_2i+1, c_2i, d_2i, c_2i+1, d_2i+1)`.
#[derive(Clone, Copy, Debug)]
|
||||
pub struct FieldElement2625x4(pub(crate) [u32x8; 5]);
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
impl ConditionallySelectable for FieldElement2625x4 {
|
||||
fn conditional_select(
|
||||
a: &FieldElement2625x4,
|
||||
b: &FieldElement2625x4,
|
||||
choice: Choice,
|
||||
) -> FieldElement2625x4 {
|
||||
let mask = (-(choice.unwrap_u8() as i32)) as u32;
|
||||
let mask_vec = u32x8::splat(mask);
|
||||
FieldElement2625x4([
|
||||
a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
|
||||
a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
|
||||
a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
|
||||
a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
|
||||
a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
|
||||
])
|
||||
}
|
||||
|
||||
fn conditional_assign(
|
||||
&mut self,
|
||||
other: &FieldElement2625x4,
|
||||
choice: Choice,
|
||||
) {
|
||||
let mask = (-(choice.unwrap_u8() as i32)) as u32;
|
||||
let mask_vec = u32x8::splat(mask);
|
||||
self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
|
||||
self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
|
||||
self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
|
||||
self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
|
||||
self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement2625x4 {
|
||||
/// Split this vector into an array of four (serial) field
|
||||
/// elements.
|
||||
pub fn split(&self) -> [FieldElement51; 4] {
|
||||
let mut out = [FieldElement51::zero(); 4];
|
||||
for i in 0..5 {
|
||||
let a_2i = self.0[i].extract(0) as u64; //
|
||||
let b_2i = self.0[i].extract(1) as u64; //
|
||||
let a_2i_1 = self.0[i].extract(2) as u64; // `.
|
||||
let b_2i_1 = self.0[i].extract(3) as u64; // | pre-swapped to avoid
|
||||
let c_2i = self.0[i].extract(4) as u64; // | a cross lane shuffle
|
||||
let d_2i = self.0[i].extract(5) as u64; // .'
|
||||
let c_2i_1 = self.0[i].extract(6) as u64; //
|
||||
let d_2i_1 = self.0[i].extract(7) as u64; //
|
||||
|
||||
out[0].0[i] = a_2i + (a_2i_1 << 26);
|
||||
out[1].0[i] = b_2i + (b_2i_1 << 26);
|
||||
out[2].0[i] = c_2i + (c_2i_1 << 26);
|
||||
out[3].0[i] = d_2i + (d_2i_1 << 26);
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
/// Rearrange the elements of this vector according to `control`.
|
||||
///
|
||||
/// The `control` parameter should be a compile-time constant, so
|
||||
/// that when this function is inlined, LLVM is able to lower the
|
||||
/// shuffle using an immediate.
|
||||
#[inline]
|
||||
pub fn shuffle(&self, control: Shuffle) -> FieldElement2625x4 {
|
||||
#[inline(always)]
|
||||
fn shuffle_lanes(x: u32x8, control: Shuffle) -> u32x8 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_permutevar8x32_epi32;
|
||||
|
||||
let c: u32x8 = match control {
|
||||
Shuffle::AAAA => u32x8::new(0, 0, 2, 2, 0, 0, 2, 2),
|
||||
Shuffle::BBBB => u32x8::new(1, 1, 3, 3, 1, 1, 3, 3),
|
||||
Shuffle::CACA => u32x8::new(4, 0, 6, 2, 4, 0, 6, 2),
|
||||
Shuffle::DBBD => u32x8::new(5, 1, 7, 3, 1, 5, 3, 7),
|
||||
Shuffle::ADDA => u32x8::new(0, 5, 2, 7, 5, 0, 7, 2),
|
||||
Shuffle::CBCB => u32x8::new(4, 1, 6, 3, 4, 1, 6, 3),
|
||||
Shuffle::ABAB => u32x8::new(0, 1, 2, 3, 0, 1, 2, 3),
|
||||
Shuffle::BADC => u32x8::new(1, 0, 3, 2, 5, 4, 7, 6),
|
||||
Shuffle::BACD => u32x8::new(1, 0, 3, 2, 4, 5, 6, 7),
|
||||
Shuffle::ABDC => u32x8::new(0, 1, 2, 3, 5, 4, 7, 6),
|
||||
};
|
||||
// Note that this gets turned into a generic LLVM
|
||||
// shuffle-by-constants, which can be lowered to a simpler
|
||||
// instruction than a generic permute.
|
||||
_mm256_permutevar8x32_epi32(x.into_bits(), c.into_bits()).into_bits()
|
||||
}
|
||||
}
|
||||
|
||||
FieldElement2625x4([
|
||||
shuffle_lanes(self.0[0], control),
|
||||
shuffle_lanes(self.0[1], control),
|
||||
shuffle_lanes(self.0[2], control),
|
||||
shuffle_lanes(self.0[3], control),
|
||||
shuffle_lanes(self.0[4], control),
|
||||
])
|
||||
}
|
||||
|
||||
/// Blend `self` with `other`, taking lanes specified in `control` from `other`.
|
||||
///
|
||||
/// The `control` parameter should be a compile-time constant, so
|
||||
/// that this function can be inlined and LLVM can lower it to a
|
||||
/// blend instruction using an immediate.
|
||||
#[inline]
|
||||
pub fn blend(&self, other: FieldElement2625x4, control: Lanes) -> FieldElement2625x4 {
|
||||
#[inline(always)]
|
||||
fn blend_lanes(x: u32x8, y: u32x8, control: Lanes) -> u32x8 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_blend_epi32;
|
||||
|
||||
// This would be much cleaner if we could factor out the match
|
||||
// statement on the control. Unfortunately, rustc forgets
|
||||
// constant-info very quickly, so we can't even write
|
||||
// ```
|
||||
// match control {
|
||||
// Lanes::C => {
|
||||
// let imm = C_LANES as i32;
|
||||
// _mm256_blend_epi32(..., imm)
|
||||
// ```
|
||||
// let alone
|
||||
// ```
|
||||
// let imm = match control {
|
||||
// Lanes::C => C_LANES as i32,
|
||||
// }
|
||||
// _mm256_blend_epi32(..., imm)
|
||||
// ```
|
||||
// even though both of these would be constant-folded by LLVM
|
||||
// at a lower level (as happens in the shuffle implementation,
|
||||
// which does not require a shuffle immediate but *is* lowered
|
||||
// to immediate shuffles anyways).
|
||||
match control {
|
||||
Lanes::C => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), C_LANES as i32).into_bits()
|
||||
}
|
||||
Lanes::D => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), D_LANES as i32).into_bits()
|
||||
}
|
||||
Lanes::AD => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | D_LANES) as i32)
|
||||
.into_bits()
|
||||
}
|
||||
Lanes::AB => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | B_LANES) as i32)
|
||||
.into_bits()
|
||||
}
|
||||
Lanes::AC => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (A_LANES | C_LANES) as i32)
|
||||
.into_bits()
|
||||
}
|
||||
Lanes::CD => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (C_LANES | D_LANES) as i32)
|
||||
.into_bits()
|
||||
}
|
||||
Lanes::BC => {
|
||||
_mm256_blend_epi32(x.into_bits(), y.into_bits(), (B_LANES | C_LANES) as i32)
|
||||
.into_bits()
|
||||
}
|
||||
Lanes::ABCD => _mm256_blend_epi32(
|
||||
x.into_bits(),
|
||||
y.into_bits(),
|
||||
(A_LANES | B_LANES | C_LANES | D_LANES) as i32,
|
||||
).into_bits(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FieldElement2625x4([
|
||||
blend_lanes(self.0[0], other.0[0], control),
|
||||
blend_lanes(self.0[1], other.0[1], control),
|
||||
blend_lanes(self.0[2], other.0[2], control),
|
||||
blend_lanes(self.0[3], other.0[3], control),
|
||||
blend_lanes(self.0[4], other.0[4], control),
|
||||
])
|
||||
}
|
||||
|
||||
/// Construct a vector of zeros.
|
||||
pub fn zero() -> FieldElement2625x4 {
|
||||
FieldElement2625x4([u32x8::splat(0); 5])
|
||||
}
|
||||
|
||||
/// Convenience wrapper around `new(x,x,x,x)`.
|
||||
pub fn splat(x: &FieldElement51) -> FieldElement2625x4 {
|
||||
FieldElement2625x4::new(x, x, x, x)
|
||||
}
|
||||
|
||||
/// Create a `FieldElement2625x4` from four `FieldElement51`s.
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The resulting `FieldElement2625x4` is bounded with \\( b < 0.0002 \\).
|
||||
pub fn new(
|
||||
x0: &FieldElement51,
|
||||
x1: &FieldElement51,
|
||||
x2: &FieldElement51,
|
||||
x3: &FieldElement51,
|
||||
) -> FieldElement2625x4 {
|
||||
let mut buf = [u32x8::splat(0); 5];
|
||||
let low_26_bits = (1 << 26) - 1;
|
||||
for i in 0..5 {
|
||||
let a_2i = (x0.0[i] & low_26_bits) as u32;
|
||||
let a_2i_1 = (x0.0[i] >> 26) as u32;
|
||||
let b_2i = (x1.0[i] & low_26_bits) as u32;
|
||||
let b_2i_1 = (x1.0[i] >> 26) as u32;
|
||||
let c_2i = (x2.0[i] & low_26_bits) as u32;
|
||||
let c_2i_1 = (x2.0[i] >> 26) as u32;
|
||||
let d_2i = (x3.0[i] & low_26_bits) as u32;
|
||||
let d_2i_1 = (x3.0[i] >> 26) as u32;
|
||||
|
||||
buf[i] = u32x8::new(a_2i, b_2i, a_2i_1, b_2i_1, c_2i, d_2i, c_2i_1, d_2i_1);
|
||||
}
|
||||
|
||||
// We don't know that the original `FieldElement51`s were
|
||||
// fully reduced, so the odd limbs may exceed 2^25.
|
||||
// Reduce them to be sure.
|
||||
FieldElement2625x4(buf).reduce()
|
||||
}
|
||||
|
||||
/// Given \\((A,B,C,D)\\), compute \\((-A,-B,-C,-D)\\), without
|
||||
/// performing a reduction.
|
||||
///
|
||||
/// # Preconditions
|
||||
///
|
||||
/// The coefficients of `self` must be bounded with \\( b < 0.999 \\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 1 \\).
|
||||
#[inline]
|
||||
pub fn negate_lazy(&self) -> FieldElement2625x4 {
|
||||
// The limbs of self are bounded with b < 0.999, while the
|
||||
// smallest limb of 2*p is 67108845 > 2^{26+0.9999}, so
|
||||
// underflows are not possible.
|
||||
FieldElement2625x4([
|
||||
P_TIMES_2_LO - self.0[0],
|
||||
P_TIMES_2_HI - self.0[1],
|
||||
P_TIMES_2_HI - self.0[2],
|
||||
P_TIMES_2_HI - self.0[3],
|
||||
P_TIMES_2_HI - self.0[4],
|
||||
])
|
||||
}
|
||||
|
||||
/// Given `self = (A,B,C,D)`, compute `(B - A, B + A, D - C, D + C)`.
|
||||
///
|
||||
/// # Preconditions
|
||||
///
|
||||
/// The coefficients of `self` must be bounded with \\( b < 0.01 \\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 1.6 \\).
|
||||
#[inline]
|
||||
pub fn diff_sum(&self) -> FieldElement2625x4 {
|
||||
// tmp1 = (B, A, D, C)
|
||||
let tmp1 = self.shuffle(Shuffle::BADC);
|
||||
// tmp2 = (-A, B, -C, D)
|
||||
let tmp2 = self.blend(self.negate_lazy(), Lanes::AC);
|
||||
// (B - A, B + A, D - C, D + C) bounded with b < 1.6
|
||||
tmp1 + tmp2
|
||||
}
|
||||
|
||||
/// Reduce this vector of field elements \\(\mathrm{mod} p\\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.0002 \\).
|
||||
#[inline]
|
||||
pub fn reduce(&self) -> FieldElement2625x4 {
|
||||
let shifts = i32x8::new(26, 26, 25, 25, 26, 26, 25, 25);
|
||||
let masks = u32x8::new(
|
||||
(1 << 26) - 1,
|
||||
(1 << 26) - 1,
|
||||
(1 << 25) - 1,
|
||||
(1 << 25) - 1,
|
||||
(1 << 26) - 1,
|
||||
(1 << 26) - 1,
|
||||
(1 << 25) - 1,
|
||||
(1 << 25) - 1,
|
||||
);
|
||||
|
||||
// Let c(x) denote the carryout of the coefficient x.
|
||||
//
|
||||
// Given ( x0, y0, x1, y1, z0, w0, z1, w1),
|
||||
// compute (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0)).
|
||||
//
|
||||
// The carryouts are bounded by 2^(32 - 25) = 2^7.
|
||||
let rotated_carryout = |v: u32x8| -> u32x8 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_srlv_epi32;
|
||||
use core::arch::x86_64::_mm256_shuffle_epi32;
|
||||
|
||||
let c = _mm256_srlv_epi32(v.into_bits(), shifts.into_bits());
|
||||
_mm256_shuffle_epi32(c, 0b01_00_11_10).into_bits()
|
||||
}
|
||||
};
|
||||
|
||||
// Combine (lo, lo, lo, lo, lo, lo, lo, lo)
|
||||
// with (hi, hi, hi, hi, hi, hi, hi, hi)
|
||||
// to (lo, lo, hi, hi, lo, lo, hi, hi)
|
||||
//
|
||||
// This allows combining carryouts, e.g.,
|
||||
//
|
||||
// lo (c(x1), c(y1), c(x0), c(y0), c(z1), c(w1), c(z0), c(w0))
|
||||
// hi (c(x3), c(y3), c(x2), c(y2), c(z3), c(w3), c(z2), c(w2))
|
||||
// -> (c(x1), c(y1), c(x2), c(y2), c(z1), c(w1), c(z2), c(w2))
|
||||
//
|
||||
// which is exactly the vector of carryins for
|
||||
//
|
||||
// ( x2, y2, x3, y3, z2, w2, z3, w3).
|
||||
//
|
||||
let combine = |v_lo: u32x8, v_hi: u32x8| -> u32x8 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_blend_epi32;
|
||||
_mm256_blend_epi32(v_lo.into_bits(), v_hi.into_bits(), 0b11_00_11_00).into_bits()
|
||||
}
|
||||
};
|
||||
|
||||
let mut v = self.0;
|
||||
|
||||
let c10 = rotated_carryout(v[0]);
|
||||
v[0] = (v[0] & masks) + combine(u32x8::splat(0), c10);
|
||||
|
||||
let c32 = rotated_carryout(v[1]);
|
||||
v[1] = (v[1] & masks) + combine(c10, c32);
|
||||
|
||||
let c54 = rotated_carryout(v[2]);
|
||||
v[2] = (v[2] & masks) + combine(c32, c54);
|
||||
|
||||
let c76 = rotated_carryout(v[3]);
|
||||
v[3] = (v[3] & masks) + combine(c54, c76);
|
||||
|
||||
let c98 = rotated_carryout(v[4]);
|
||||
v[4] = (v[4] & masks) + combine(c76, c98);
|
||||
|
||||
let c9_19: u32x8 = unsafe {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
use core::arch::x86_64::_mm256_shuffle_epi32;
|
||||
|
||||
// Need to rearrange c98, since vpmuludq uses the low
|
||||
// 32-bits of each 64-bit lane to compute the product:
|
||||
//
|
||||
// c98 = (c(x9), c(y9), c(x8), c(y8), c(z9), c(w9), c(z8), c(w8));
|
||||
// c9_spread = (c(x9), c(x8), c(y9), c(y8), c(z9), c(z8), c(w9), c(w8)).
|
||||
let c9_spread = _mm256_shuffle_epi32(c98.into_bits(), 0b11_01_10_00);
|
||||
|
||||
// Since the carryouts are bounded by 2^7, their products with 19
|
||||
// are bounded by 2^11.25. This means that
|
||||
//
|
||||
// c9_19_spread = (19*c(x9), 0, 19*c(y9), 0, 19*c(z9), 0, 19*c(w9), 0).
|
||||
let c9_19_spread = _mm256_mul_epu32(c9_spread, u64x4::splat(19).into_bits());
|
||||
|
||||
// Unshuffle:
|
||||
// c9_19 = (19*c(x9), 19*c(y9), 0, 0, 19*c(z9), 19*c(w9), 0, 0).
|
||||
_mm256_shuffle_epi32(c9_19_spread, 0b11_01_10_00).into_bits()
|
||||
};
|
||||
|
||||
// Add the final carryin.
|
||||
v[0] = v[0] + c9_19;
|
||||
|
||||
// Each output coefficient has exactly one carryin, which is
|
||||
// bounded by 2^11.25, so they are bounded as
|
||||
//
|
||||
// c_even < 2^26 + 2^11.25 < 26.00006 < 2^{26+b}
|
||||
// c_odd < 2^25 + 2^11.25 < 25.0001 < 2^{25+b}
|
||||
//
|
||||
// where b = 0.0002.
|
||||
FieldElement2625x4(v)
|
||||
}
|
||||
|
||||
/// Given an array of wide coefficients, reduce them to a `FieldElement2625x4`.
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
|
||||
#[inline]
|
||||
fn reduce64(mut z: [u64x4; 10]) -> FieldElement2625x4 {
|
||||
// These aren't const because splat isn't a const fn
|
||||
let LOW_25_BITS: u64x4 = u64x4::splat((1 << 25) - 1);
|
||||
let LOW_26_BITS: u64x4 = u64x4::splat((1 << 26) - 1);
|
||||
|
||||
// Carry the value from limb i = 0..8 to limb i+1
|
||||
let carry = |z: &mut [u64x4; 10], i: usize| {
|
||||
debug_assert!(i < 9);
|
||||
if i % 2 == 0 {
|
||||
// Even limbs have 26 bits
|
||||
z[i + 1] = z[i + 1] + (z[i] >> 26);
|
||||
z[i] = z[i] & LOW_26_BITS;
|
||||
} else {
|
||||
// Odd limbs have 25 bits
|
||||
z[i + 1] = z[i + 1] + (z[i] >> 25);
|
||||
z[i] = z[i] & LOW_25_BITS;
|
||||
}
|
||||
};
|
||||
|
||||
// Perform two halves of the carry chain in parallel.
|
||||
carry(&mut z, 0); carry(&mut z, 4);
|
||||
carry(&mut z, 1); carry(&mut z, 5);
|
||||
carry(&mut z, 2); carry(&mut z, 6);
|
||||
carry(&mut z, 3); carry(&mut z, 7);
|
||||
// Since z[3] < 2^64, c < 2^(64-25) = 2^39,
|
||||
// so z[4] < 2^26 + 2^39 < 2^39.0002
|
||||
carry(&mut z, 4); carry(&mut z, 8);
|
||||
// Now z[4] < 2^26
|
||||
// and z[5] < 2^25 + 2^13.0002 < 2^25.0004 (good enough)
|
||||
|
||||
// Last carry has a multiplication by 19. In the serial case we
|
||||
// do a 64-bit multiplication by 19, but here we want to do a
|
||||
// 32-bit multiplication. However, if we only know z[9] < 2^64,
|
||||
// the carry is bounded as c < 2^(64-25) = 2^39, which is too
|
||||
// big. To ensure c < 2^32, we would need z[9] < 2^57.
|
||||
// Instead, we split the carry in two, with c = c_0 + c_1*2^26.
|
||||
|
||||
let c = z[9] >> 25;
|
||||
z[9] = z[9] & LOW_25_BITS;
|
||||
let mut c0: u64x4 = c & LOW_26_BITS; // c0 < 2^26;
|
||||
let mut c1: u64x4 = c >> 26; // c1 < 2^(39-26) = 2^13;
|
||||
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
let x19 = u64x4::splat(19);
|
||||
c0 = _mm256_mul_epu32(c0.into_bits(), x19.into_bits()).into_bits(); // c0 < 2^30.25
|
||||
c1 = _mm256_mul_epu32(c1.into_bits(), x19.into_bits()).into_bits(); // c1 < 2^17.25
|
||||
}
|
||||
|
||||
z[0] = z[0] + c0; // z0 < 2^26 + 2^30.25 < 2^30.33
|
||||
z[1] = z[1] + c1; // z1 < 2^25 + 2^17.25 < 2^25.0067
|
||||
carry(&mut z, 0); // z0 < 2^26, z1 < 2^25.0067 + 2^4.33 = 2^25.007
|
||||
|
||||
// The output coefficients are bounded with
|
||||
//
|
||||
// b = 0.007 for z[1]
|
||||
// b = 0.0004 for z[5]
|
||||
// b = 0 for other z[i].
|
||||
//
|
||||
// So the packed result is bounded with b = 0.007.
|
||||
FieldElement2625x4([
|
||||
repack_pair(z[0].into_bits(), z[1].into_bits()),
|
||||
repack_pair(z[2].into_bits(), z[3].into_bits()),
|
||||
repack_pair(z[4].into_bits(), z[5].into_bits()),
|
||||
repack_pair(z[6].into_bits(), z[7].into_bits()),
|
||||
repack_pair(z[8].into_bits(), z[9].into_bits()),
|
||||
])
|
||||
}
|
||||
|
||||
/// Square this field element, and negate the result's \\(D\\) value.
|
||||
///
|
||||
/// # Preconditions
|
||||
///
|
||||
/// The coefficients of `self` must be bounded with \\( b < 1.5 \\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
|
||||
pub fn square_and_negate_D(&self) -> FieldElement2625x4 {
|
||||
#[inline(always)]
|
||||
fn m(x: u32x8, y: u32x8) -> u64x4 {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
|
||||
}
|
||||
|
||||
let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);
|
||||
|
||||
let (x0, x1) = unpack_pair(self.0[0]);
|
||||
let (x2, x3) = unpack_pair(self.0[1]);
|
||||
let (x4, x5) = unpack_pair(self.0[2]);
|
||||
let (x6, x7) = unpack_pair(self.0[3]);
|
||||
let (x8, x9) = unpack_pair(self.0[4]);
|
||||
|
||||
let x0_2 = x0 << 1;
|
||||
let x1_2 = x1 << 1;
|
||||
let x2_2 = x2 << 1;
|
||||
let x3_2 = x3 << 1;
|
||||
let x4_2 = x4 << 1;
|
||||
let x5_2 = x5 << 1;
|
||||
let x6_2 = x6 << 1;
|
||||
let x7_2 = x7 << 1;
|
||||
|
||||
let x5_19 = m_lo(v19, x5);
|
||||
let x6_19 = m_lo(v19, x6);
|
||||
let x7_19 = m_lo(v19, x7);
|
||||
let x8_19 = m_lo(v19, x8);
|
||||
let x9_19 = m_lo(v19, x9);
|
||||
|
||||
let mut z0 = m(x0, x0) + m(x2_2,x8_19) + m(x4_2,x6_19) + ((m(x1_2,x9_19) + m(x3_2,x7_19) + m(x5,x5_19)) << 1);
|
||||
let mut z1 = m(x0_2,x1) + m(x3_2,x8_19) + m(x5_2,x6_19) + ((m(x2,x9_19) + m(x4,x7_19)) << 1);
|
||||
let mut z2 = m(x0_2,x2) + m(x1_2,x1) + m(x4_2,x8_19) + m(x6,x6_19) + ((m(x3_2,x9_19) + m(x5_2,x7_19)) << 1);
|
||||
let mut z3 = m(x0_2,x3) + m(x1_2,x2) + m(x5_2,x8_19) + ((m(x4,x9_19) + m(x6,x7_19)) << 1);
|
||||
let mut z4 = m(x0_2,x4) + m(x1_2,x3_2) + m(x2, x2) + m(x6_2,x8_19) + ((m(x5_2,x9_19) + m(x7,x7_19)) << 1);
|
||||
let mut z5 = m(x0_2,x5) + m(x1_2,x4) + m(x2_2,x3) + m(x7_2,x8_19) + ((m(x6,x9_19)) << 1);
|
||||
let mut z6 = m(x0_2,x6) + m(x1_2,x5_2) + m(x2_2,x4) + m(x3_2,x3) + m(x8,x8_19) + ((m(x7_2,x9_19)) << 1);
|
||||
let mut z7 = m(x0_2,x7) + m(x1_2,x6) + m(x2_2,x5) + m(x3_2,x4) + ((m(x8,x9_19)) << 1);
|
||||
let mut z8 = m(x0_2,x8) + m(x1_2,x7_2) + m(x2_2,x6) + m(x3_2,x5_2) + m(x4,x4) + ((m(x9,x9_19)) << 1);
|
||||
let mut z9 = m(x0_2,x9) + m(x1_2,x8) + m(x2_2,x7) + m(x3_2,x6) + m(x4_2,x5);
|
||||
|
||||
// The biggest z_i is bounded as z_i < 249*2^(51 + 2*b);
|
||||
// if b < 1.5 we get z_i < 4485585228861014016.
|
||||
//
|
||||
// The limbs of the multiples of p are bounded above by
|
||||
//
|
||||
// 0x3fffffff << 37 = 9223371899415822336 < 2^63
|
||||
//
|
||||
// and below by
|
||||
//
|
||||
// 0x1fffffff << 37 = 4611685880988434432
|
||||
// > 4485585228861014016
|
||||
//
|
||||
// So these multiples of p are big enough to avoid underflow
|
||||
// in subtraction, and small enough to fit within u64
|
||||
// with room for a carry.
|
||||
|
||||
let low__p37 = u64x4::splat(0x3ffffed << 37);
|
||||
let even_p37 = u64x4::splat(0x3ffffff << 37);
|
||||
let odd__p37 = u64x4::splat(0x1ffffff << 37);
|
||||
|
||||
let negate_D = |x: u64x4, p: u64x4| -> u64x4 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_blend_epi32;
|
||||
_mm256_blend_epi32(x.into_bits(), (p - x).into_bits(), D_LANES64 as i32).into_bits()
|
||||
}
|
||||
};
|
||||
|
||||
z0 = negate_D(z0, low__p37);
|
||||
z1 = negate_D(z1, odd__p37);
|
||||
z2 = negate_D(z2, even_p37);
|
||||
z3 = negate_D(z3, odd__p37);
|
||||
z4 = negate_D(z4, even_p37);
|
||||
z5 = negate_D(z5, odd__p37);
|
||||
z6 = negate_D(z6, even_p37);
|
||||
z7 = negate_D(z7, odd__p37);
|
||||
z8 = negate_D(z8, even_p37);
|
||||
z9 = negate_D(z9, odd__p37);
|
||||
|
||||
FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
|
||||
}
|
||||
}
|
||||
|
||||
impl Neg for FieldElement2625x4 {
|
||||
type Output = FieldElement2625x4;
|
||||
|
||||
/// Negate this field element, performing a reduction.
|
||||
///
|
||||
/// If the coefficients are known to be small, use `negate_lazy`
|
||||
/// to avoid performing a reduction.
|
||||
///
|
||||
/// # Preconditions
|
||||
///
|
||||
/// The coefficients of `self` must be bounded with \\( b < 4.0 \\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.0002 \\).
|
||||
#[inline]
|
||||
fn neg(self) -> FieldElement2625x4 {
|
||||
FieldElement2625x4([
|
||||
P_TIMES_16_LO - self.0[0],
|
||||
P_TIMES_16_HI - self.0[1],
|
||||
P_TIMES_16_HI - self.0[2],
|
||||
P_TIMES_16_HI - self.0[3],
|
||||
P_TIMES_16_HI - self.0[4],
|
||||
]).reduce()
|
||||
}
|
||||
}
|
||||
|
||||
impl Add<FieldElement2625x4> for FieldElement2625x4 {
|
||||
type Output = FieldElement2625x4;
|
||||
/// Add two `FieldElement2625x4`s, without performing a reduction.
|
||||
#[inline]
|
||||
fn add(self, rhs: FieldElement2625x4) -> FieldElement2625x4 {
|
||||
FieldElement2625x4([
|
||||
self.0[0] + rhs.0[0],
|
||||
self.0[1] + rhs.0[1],
|
||||
self.0[2] + rhs.0[2],
|
||||
self.0[3] + rhs.0[3],
|
||||
self.0[4] + rhs.0[4],
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
impl Mul<(u32, u32, u32, u32)> for FieldElement2625x4 {
|
||||
type Output = FieldElement2625x4;
|
||||
/// Perform a multiplication by a vector of small constants.
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
|
||||
#[inline]
|
||||
fn mul(self, scalars: (u32, u32, u32, u32)) -> FieldElement2625x4 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
|
||||
let consts = u32x8::new(scalars.0, 0, scalars.1, 0, scalars.2, 0, scalars.3, 0);
|
||||
|
||||
let (b0, b1) = unpack_pair(self.0[0]);
|
||||
let (b2, b3) = unpack_pair(self.0[1]);
|
||||
let (b4, b5) = unpack_pair(self.0[2]);
|
||||
let (b6, b7) = unpack_pair(self.0[3]);
|
||||
let (b8, b9) = unpack_pair(self.0[4]);
|
||||
|
||||
FieldElement2625x4::reduce64([
|
||||
_mm256_mul_epu32(b0.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b1.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b2.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b3.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b4.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b5.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b6.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b7.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b8.into_bits(), consts.into_bits()).into_bits(),
|
||||
_mm256_mul_epu32(b9.into_bits(), consts.into_bits()).into_bits(),
|
||||
])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b FieldElement2625x4> for &'a FieldElement2625x4 {
|
||||
type Output = FieldElement2625x4;
|
||||
/// Multiply `self` by `rhs`.
|
||||
///
|
||||
/// # Preconditions
|
||||
///
|
||||
/// The coefficients of `self` must be bounded with \\( b < 2.5 \\).
|
||||
///
|
||||
/// The coefficients of `rhs` must be bounded with \\( b < 1.75 \\).
|
||||
///
|
||||
/// # Postconditions
|
||||
///
|
||||
/// The coefficients of the result are bounded with \\( b < 0.007 \\).
|
||||
///
|
||||
fn mul(self, rhs: &'b FieldElement2625x4) -> FieldElement2625x4 {
|
||||
#[inline(always)]
|
||||
fn m(x: u32x8, y: u32x8) -> u64x4 {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn m_lo(x: u32x8, y: u32x8) -> u32x8 {
|
||||
use core::arch::x86_64::_mm256_mul_epu32;
|
||||
unsafe { _mm256_mul_epu32(x.into_bits(), y.into_bits()).into_bits() }
|
||||
}
|
||||
|
||||
let (x0, x1) = unpack_pair(self.0[0]);
|
||||
let (x2, x3) = unpack_pair(self.0[1]);
|
||||
let (x4, x5) = unpack_pair(self.0[2]);
|
||||
let (x6, x7) = unpack_pair(self.0[3]);
|
||||
let (x8, x9) = unpack_pair(self.0[4]);
|
||||
|
||||
let (y0, y1) = unpack_pair(rhs.0[0]);
|
||||
let (y2, y3) = unpack_pair(rhs.0[1]);
|
||||
let (y4, y5) = unpack_pair(rhs.0[2]);
|
||||
let (y6, y7) = unpack_pair(rhs.0[3]);
|
||||
let (y8, y9) = unpack_pair(rhs.0[4]);
|
||||
|
||||
let v19 = u32x8::new(19, 0, 19, 0, 19, 0, 19, 0);
|
||||
|
||||
let y1_19 = m_lo(v19, y1); // This fits in a u32
|
||||
let y2_19 = m_lo(v19, y2); // iff 26 + b + lg(19) < 32
|
||||
let y3_19 = m_lo(v19, y3); // if b < 32 - 26 - 4.248 = 1.752
|
||||
let y4_19 = m_lo(v19, y4);
|
||||
let y5_19 = m_lo(v19, y5);
|
||||
let y6_19 = m_lo(v19, y6);
|
||||
let y7_19 = m_lo(v19, y7);
|
||||
let y8_19 = m_lo(v19, y8);
|
||||
let y9_19 = m_lo(v19, y9);
|
||||
|
||||
let x1_2 = x1 + x1; // This fits in a u32 iff 25 + b + 1 < 32
|
||||
let x3_2 = x3 + x3; // iff b < 6
|
||||
let x5_2 = x5 + x5;
|
||||
let x7_2 = x7 + x7;
|
||||
let x9_2 = x9 + x9;
|
||||
|
||||
let z0 = m(x0,y0) + m(x1_2,y9_19) + m(x2,y8_19) + m(x3_2,y7_19) + m(x4,y6_19) + m(x5_2,y5_19) + m(x6,y4_19) + m(x7_2,y3_19) + m(x8,y2_19) + m(x9_2,y1_19);
|
||||
let z1 = m(x0,y1) + m(x1,y0) + m(x2,y9_19) + m(x3,y8_19) + m(x4,y7_19) + m(x5,y6_19) + m(x6,y5_19) + m(x7,y4_19) + m(x8,y3_19) + m(x9,y2_19);
|
||||
let z2 = m(x0,y2) + m(x1_2,y1) + m(x2,y0) + m(x3_2,y9_19) + m(x4,y8_19) + m(x5_2,y7_19) + m(x6,y6_19) + m(x7_2,y5_19) + m(x8,y4_19) + m(x9_2,y3_19);
|
||||
let z3 = m(x0,y3) + m(x1,y2) + m(x2,y1) + m(x3,y0) + m(x4,y9_19) + m(x5,y8_19) + m(x6,y7_19) + m(x7,y6_19) + m(x8,y5_19) + m(x9,y4_19);
|
||||
let z4 = m(x0,y4) + m(x1_2,y3) + m(x2,y2) + m(x3_2,y1) + m(x4,y0) + m(x5_2,y9_19) + m(x6,y8_19) + m(x7_2,y7_19) + m(x8,y6_19) + m(x9_2,y5_19);
|
||||
let z5 = m(x0,y5) + m(x1,y4) + m(x2,y3) + m(x3,y2) + m(x4,y1) + m(x5,y0) + m(x6,y9_19) + m(x7,y8_19) + m(x8,y7_19) + m(x9,y6_19);
|
||||
let z6 = m(x0,y6) + m(x1_2,y5) + m(x2,y4) + m(x3_2,y3) + m(x4,y2) + m(x5_2,y1) + m(x6,y0) + m(x7_2,y9_19) + m(x8,y8_19) + m(x9_2,y7_19);
|
||||
let z7 = m(x0,y7) + m(x1,y6) + m(x2,y5) + m(x3,y4) + m(x4,y3) + m(x5,y2) + m(x6,y1) + m(x7,y0) + m(x8,y9_19) + m(x9,y8_19);
|
||||
let z8 = m(x0,y8) + m(x1_2,y7) + m(x2,y6) + m(x3_2,y5) + m(x4,y4) + m(x5_2,y3) + m(x6,y2) + m(x7_2,y1) + m(x8,y0) + m(x9_2,y9_19);
|
||||
let z9 = m(x0,y9) + m(x1,y8) + m(x2,y7) + m(x3,y6) + m(x4,y5) + m(x5,y4) + m(x6,y3) + m(x7,y2) + m(x8,y1) + m(x9,y0);
|
||||
|
||||
// The bounds on z[i] are the same as in the serial 32-bit code
|
||||
// and the comment below is copied from there:
|
||||
|
||||
// How big is the contribution to z[i+j] from x[i], y[j]?
|
||||
//
|
||||
// Using the bounds above, we get:
|
||||
//
|
||||
// i even, j even: x[i]*y[j] < 2^(26+b)*2^(26+b) = 2*2^(51+2*b)
|
||||
// i odd, j even: x[i]*y[j] < 2^(25+b)*2^(26+b) = 1*2^(51+2*b)
|
||||
// i even, j odd: x[i]*y[j] < 2^(26+b)*2^(25+b) = 1*2^(51+2*b)
|
||||
// i odd, j odd: 2*x[i]*y[j] < 2*2^(25+b)*2^(25+b) = 1*2^(51+2*b)
|
||||
//
|
||||
// We perform inline reduction mod p by replacing 2^255 by 19
|
||||
// (since 2^255 - 19 = 0 mod p). This adds a factor of 19, so
|
||||
// we get the bounds (z0 is the biggest one, but calculated for
|
||||
// posterity here in case finer estimation is needed later):
|
||||
//
|
||||
// z0 < ( 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 249*2^(51 + 2*b)
|
||||
// z1 < ( 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 154*2^(51 + 2*b)
|
||||
// z2 < ( 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 195*2^(51 + 2*b)
|
||||
// z3 < ( 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 118*2^(51 + 2*b)
|
||||
// z4 < ( 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 141*2^(51 + 2*b)
|
||||
// z5 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 + 1*19 + 1*19 )*2^(51 + 2b) = 82*2^(51 + 2*b)
|
||||
// z6 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 + 2*19 + 1*19 )*2^(51 + 2b) = 87*2^(51 + 2*b)
|
||||
// z7 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1*19 + 1*19 )*2^(51 + 2b) = 46*2^(51 + 2*b)
|
||||
// z8 < ( 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1 + 2 + 1*19 )*2^(51 + 2b) = 33*2^(51 + 2*b)
|
||||
// z9 < ( 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 )*2^(51 + 2b) = 10*2^(51 + 2*b)
|
||||
//
|
||||
// So z[0] fits into a u64 if 51 + 2*b + lg(249) < 64
|
||||
// if b < 2.5.
|
||||
|
||||
// In fact this bound is slightly sloppy, since it treats both
|
||||
// inputs x and y as being bounded by the same parameter b,
|
||||
// while they are in fact bounded by b_x and b_y, and we
|
||||
// already require that b_y < 1.75 in order to fit the
|
||||
// multiplications by 19 into a u32. The tighter bound on b_y
|
||||
// means we could get a tighter bound on the outputs, or a
|
||||
// looser bound on b_x.
|
||||
FieldElement2625x4::reduce64([z0, z1, z2, z3, z4, z5, z6, z7, z8, z9])
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    /// Four distinct field elements with small limbs, shared by the tests
    /// that compare the vectorized arithmetic against the serial arithmetic.
    fn sample_inputs() -> [FieldElement51; 4] {
        [
            FieldElement51([10000, 10001, 10002, 10003, 10004]),
            FieldElement51([10100, 10101, 10102, 10103, 10104]),
            FieldElement51([10200, 10201, 10202, 10203, 10204]),
            FieldElement51([10300, 10301, 10302, 10303, 10304]),
        ]
    }

    /// Multiplying a vector of ones by per-lane scalars yields those scalars.
    #[test]
    fn scale_by_curve_constants() {
        let ones = FieldElement2625x4::splat(&FieldElement51::one());
        let scaled = ones * (121666, 121666, 2 * 121666, 2 * 121665);

        let lanes = scaled.split();
        let expected: [u64; 4] = [121666, 121666, 2 * 121666, 2 * 121665];
        for i in 0..4 {
            assert_eq!(lanes[i], FieldElement51([expected[i], 0, 0, 0, 0]));
        }
    }

    /// `diff_sum` must agree with serial subtraction and addition per pair.
    #[test]
    fn diff_sum_vs_serial() {
        let [x0, x1, x2, x3] = sample_inputs();

        let result = FieldElement2625x4::new(&x0, &x1, &x2, &x3)
            .diff_sum()
            .split();

        let expected = [&x1 - &x0, &x1 + &x0, &x3 - &x2, &x3 + &x2];
        for i in 0..4 {
            assert_eq!(result[i], expected[i]);
        }
    }

    /// `square_and_negate_D` squares every lane and negates the last one.
    #[test]
    fn square_vs_serial() {
        let [x0, x1, x2, x3] = sample_inputs();

        let result = FieldElement2625x4::new(&x0, &x1, &x2, &x3)
            .square_and_negate_D()
            .split();

        let expected = [&x0 * &x0, &x1 * &x1, &x2 * &x2, -&(&x3 * &x3)];
        for i in 0..4 {
            assert_eq!(result[i], expected[i]);
        }
    }

    /// Vector multiplication must agree with serial multiplication per lane.
    #[test]
    fn multiply_vs_serial() {
        let [x0, x1, x2, x3] = sample_inputs();

        let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
        let vecprime = vec.clone();

        let result = (&vec * &vecprime).split();

        let expected = [&x0 * &x0, &x1 * &x1, &x2 * &x2, &x3 * &x3];
        for i in 0..4 {
            assert_eq!(result[i], expected[i]);
        }
    }

    /// `unpack_pair` followed by `repack_pair` must reproduce the input.
    #[test]
    fn test_unpack_repack_pair() {
        // Only the lowest 51-bit limb is populated: `lo` in its low 26 bits
        // and `hi` directly above it.
        let packed = |lo: u64, hi: u64| FieldElement51([lo + (hi << 26), 0, 0, 0, 0]);
        let x0 = packed(10000, 10001);
        let x1 = packed(10100, 10101);
        let x2 = packed(10200, 10201);
        let x3 = packed(10300, 10301);

        let vec = FieldElement2625x4::new(&x0, &x1, &x2, &x3);
        let src = vec.0[0];

        let (a, b) = unpack_pair(src);

        assert_eq!(a, u32x8::new(10000, 0, 10100, 0, 10200, 0, 10300, 0));
        assert_eq!(b, u32x8::new(10001, 0, 10101, 0, 10201, 0, 10301, 0));

        let expected_src = repack_pair(a, b);
        assert_eq!(src, expected_src);
    }

    /// `new` followed by `split` must return the original field elements.
    #[test]
    fn new_split_roundtrips() {
        let x0 = FieldElement51::from_bytes(&[0x10; 32]);
        let x1 = FieldElement51::from_bytes(&[0x11; 32]);
        let x2 = FieldElement51::from_bytes(&[0x12; 32]);
        let x3 = FieldElement51::from_bytes(&[0x13; 32]);

        let splits = FieldElement2625x4::new(&x0, &x1, &x2, &x3).split();

        assert_eq!(x0, splits[0]);
        assert_eq!(x1, splits[1]);
        assert_eq!(x2, splits[2]);
        assert_eq!(x3, splits[3]);
    }
}
|
|
@ -1,20 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![cfg_attr(
|
||||
feature = "nightly",
|
||||
doc(include = "../../../../docs/avx2-notes.md")
|
||||
)]
|
||||
|
||||
pub(crate) mod field;
|
||||
|
||||
pub(crate) mod edwards;
|
||||
|
||||
pub(crate) mod constants;
|
File diff suppressed because it is too large
Load Diff
|
@ -1,315 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2018-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
use std::ops::{Add, Neg, Sub};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use edwards;
|
||||
use window::{LookupTable, NafLookupTable5, NafLookupTable8};
|
||||
|
||||
use super::constants;
|
||||
use super::field::{F51x4Reduced, F51x4Unreduced, Lanes, Shuffle};
|
||||
|
||||
/// A curve point held as a single vector of four field elements.
///
/// The conversion from `edwards::EdwardsPoint` in this file packs the
/// coordinates into the lanes in the order `(X, Y, Z, T)`.
#[derive(Copy, Clone, Debug)]
|
||||
pub struct ExtendedPoint(pub(super) F51x4Unreduced);
|
||||
|
||||
/// A precomputed form of an `ExtendedPoint`, stored with reduced limbs.
///
/// This is the right-hand operand type of the point addition and
/// subtraction implemented in this file, and the entry type of the lookup
/// tables built here.
#[derive(Copy, Clone, Debug)]
|
||||
pub struct CachedPoint(pub(super) F51x4Reduced);
|
||||
|
||||
impl From<edwards::EdwardsPoint> for ExtendedPoint {
|
||||
fn from(P: edwards::EdwardsPoint) -> ExtendedPoint {
|
||||
ExtendedPoint(F51x4Unreduced::new(&P.X, &P.Y, &P.Z, &P.T))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ExtendedPoint> for edwards::EdwardsPoint {
|
||||
fn from(P: ExtendedPoint) -> edwards::EdwardsPoint {
|
||||
let reduced = F51x4Reduced::from(P.0);
|
||||
let tmp = F51x4Unreduced::from(reduced).split();
|
||||
edwards::EdwardsPoint {
|
||||
X: tmp[0],
|
||||
Y: tmp[1],
|
||||
Z: tmp[2],
|
||||
T: tmp[3],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ExtendedPoint> for CachedPoint {
|
||||
fn from(P: ExtendedPoint) -> CachedPoint {
|
||||
let mut x = P.0;
|
||||
|
||||
x = x.blend(&x.diff_sum(), Lanes::AB);
|
||||
x = &F51x4Reduced::from(x) * (121666, 121666, 2 * 121666, 2 * 121665);
|
||||
x = x.blend(&x.negate_lazy(), Lanes::D);
|
||||
|
||||
CachedPoint(F51x4Reduced::from(x))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExtendedPoint {
|
||||
fn default() -> ExtendedPoint {
|
||||
ExtendedPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for ExtendedPoint {
|
||||
fn identity() -> ExtendedPoint {
|
||||
constants::EXTENDEDPOINT_IDENTITY
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtendedPoint {
|
||||
pub fn double(&self) -> ExtendedPoint {
|
||||
// (Y1 X1 T1 Z1) -- uses vpshufd (1c latency @ 1/c)
|
||||
let mut tmp0 = self.0.shuffle(Shuffle::BADC);
|
||||
|
||||
// (X1+Y1 X1+Y1 X1+Y1 X1+Y1) -- can use vpinserti128
|
||||
let mut tmp1 = (self.0 + tmp0).shuffle(Shuffle::ABAB);
|
||||
|
||||
// (X1 Y1 Z1 X1+Y1)
|
||||
tmp0 = self.0.blend(&tmp1, Lanes::D);
|
||||
|
||||
tmp1 = F51x4Reduced::from(tmp0).square();
|
||||
// Now tmp1 = (S1 S2 S3 S4)
|
||||
|
||||
// We want to compute
|
||||
//
|
||||
// + | S1 | S1 | S1 | S1 |
|
||||
// + | S2 | | | S2 |
|
||||
// + | | | S3 | |
|
||||
// + | | | S3 | |
|
||||
// + | |16p |16p |16p |
|
||||
// - | | S2 | S2 | |
|
||||
// - | | | | S4 |
|
||||
// =======================
|
||||
// S5 S6 S8 S9
|
||||
|
||||
let zero = F51x4Unreduced::zero();
|
||||
|
||||
let S1_S1_S1_S1 = tmp1.shuffle(Shuffle::AAAA);
|
||||
let S2_S2_S2_S2 = tmp1.shuffle(Shuffle::BBBB);
|
||||
|
||||
let S2_S2_S2_S4 = S2_S2_S2_S2.blend(&tmp1, Lanes::D).negate_lazy();
|
||||
|
||||
tmp0 = S1_S1_S1_S1 + zero.blend(&(tmp1 + tmp1), Lanes::C);
|
||||
tmp0 = tmp0 + zero.blend(&S2_S2_S2_S2, Lanes::AD);
|
||||
tmp0 = tmp0 + zero.blend(&S2_S2_S2_S4, Lanes::BCD);
|
||||
|
||||
let tmp2 = F51x4Reduced::from(tmp0);
|
||||
|
||||
ExtendedPoint(&tmp2.shuffle(Shuffle::DBBD) * &tmp2.shuffle(Shuffle::CACA))
|
||||
}
|
||||
|
||||
pub fn mul_by_pow_2(&self, k: u32) -> ExtendedPoint {
|
||||
let mut tmp: ExtendedPoint = *self;
|
||||
for _ in 0..k {
|
||||
tmp = tmp.double();
|
||||
}
|
||||
tmp
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Add<&'b CachedPoint> for &'a ExtendedPoint {
|
||||
type Output = ExtendedPoint;
|
||||
|
||||
/// Add an `ExtendedPoint` and a `CachedPoint`.
|
||||
fn add(self, other: &'b CachedPoint) -> ExtendedPoint {
|
||||
let mut tmp = self.0;
|
||||
|
||||
tmp = tmp.blend(&tmp.diff_sum(), Lanes::AB);
|
||||
// tmp = (Y1-X1 Y1+X1 Z1 T1) = (S0 S1 Z1 T1)
|
||||
|
||||
tmp = &F51x4Reduced::from(tmp) * &other.0;
|
||||
// tmp = (S0*S2' S1*S3' Z1*Z2' T1*T2') = (S8 S9 S10 S11)
|
||||
|
||||
tmp = tmp.shuffle(Shuffle::ABDC);
|
||||
// tmp = (S8 S9 S11 S10)
|
||||
|
||||
let tmp = F51x4Reduced::from(tmp.diff_sum());
|
||||
// tmp = (S9-S8 S9+S8 S10-S11 S10+S11) = (S12 S13 S14 S15)
|
||||
|
||||
let t0 = tmp.shuffle(Shuffle::ADDA);
|
||||
// t0 = (S12 S15 S15 S12)
|
||||
let t1 = tmp.shuffle(Shuffle::CBCB);
|
||||
// t1 = (S14 S13 S14 S13)
|
||||
|
||||
// Return (S12*S14 S15*S13 S15*S14 S12*S13) = (X3 Y3 Z3 T3)
|
||||
ExtendedPoint(&t0 * &t1)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CachedPoint {
|
||||
fn default() -> CachedPoint {
|
||||
CachedPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for CachedPoint {
|
||||
fn identity() -> CachedPoint {
|
||||
constants::CACHEDPOINT_IDENTITY
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for CachedPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
CachedPoint(F51x4Reduced::conditional_select(&a.0, &b.0, choice))
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.0.conditional_assign(&other.0, choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a CachedPoint {
|
||||
type Output = CachedPoint;
|
||||
|
||||
fn neg(self) -> CachedPoint {
|
||||
let swapped = self.0.shuffle(Shuffle::BACD);
|
||||
CachedPoint(swapped.blend(&(-self.0), Lanes::D))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b CachedPoint> for &'a ExtendedPoint {
    type Output = ExtendedPoint;

    /// Implement subtraction by negating the point and adding.
    fn sub(self, other: &'b CachedPoint) -> ExtendedPoint {
        let negated = -other;
        self + &negated
    }
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for LookupTable<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let P = ExtendedPoint::from(*point);
|
||||
let mut points = [CachedPoint::from(P); 8];
|
||||
for i in 0..7 {
|
||||
points[i + 1] = (&P + &points[i]).into();
|
||||
}
|
||||
LookupTable(points)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable5<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let A = ExtendedPoint::from(*point);
|
||||
let mut Ai = [CachedPoint::from(A); 8];
|
||||
let A2 = A.double();
|
||||
for i in 0..7 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).into();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
|
||||
NafLookupTable5(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a edwards::EdwardsPoint> for NafLookupTable8<CachedPoint> {
|
||||
fn from(point: &'a edwards::EdwardsPoint) -> Self {
|
||||
let A = ExtendedPoint::from(*point);
|
||||
let mut Ai = [CachedPoint::from(A); 64];
|
||||
let A2 = A.double();
|
||||
for i in 0..63 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).into();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
|
||||
NafLookupTable8(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    /// Checks the vectorized `P + Q` and `P - Q` against the results of the
    /// `EdwardsPoint` operators, comparing compressed encodings.
    fn addition_test_helper(P: edwards::EdwardsPoint, Q: edwards::EdwardsPoint) {
        // NOTE: a comparison against a serial implementation of the same
        // formulas used to live here as commented-out code.

        // Test the vector implementation of the parallel readdition formulas
        let cached_Q = CachedPoint::from(ExtendedPoint::from(Q));
        let P_vector = ExtendedPoint::from(P);
        let R_vector: edwards::EdwardsPoint = (&P_vector + &cached_Q).into();
        let S_vector: edwards::EdwardsPoint = (&P_vector - &cached_Q).into();

        println!("Testing point addition:");
        println!("P = {:?}", P);
        println!("Q = {:?}", Q);
        println!("cached Q = {:?}", cached_Q);
        println!("R = P + Q = {:?}", &P + &Q);
        println!("R_vector = {:?}", R_vector);
        println!("S = P - Q = {:?}", &P - &Q);
        println!("S_vector = {:?}", S_vector);

        assert_eq!(R_vector.compress(), (&P + &Q).compress());
        assert_eq!(S_vector.compress(), (&P - &Q).compress());
        println!("OK!\n");
    }

    #[test]
    fn vector_addition_vs_serial_addition_vs_edwards_extendedpoint() {
        use constants;
        use scalar::Scalar;

        println!("Testing id +- id");
        addition_test_helper(
            edwards::EdwardsPoint::identity(),
            edwards::EdwardsPoint::identity(),
        );

        println!("Testing id +- B");
        addition_test_helper(
            edwards::EdwardsPoint::identity(),
            constants::ED25519_BASEPOINT_POINT,
        );

        println!("Testing B +- B");
        addition_test_helper(
            constants::ED25519_BASEPOINT_POINT,
            constants::ED25519_BASEPOINT_POINT,
        );

        println!("Testing B +- kB");
        let kB = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
        addition_test_helper(constants::ED25519_BASEPOINT_POINT, kB);
    }

    /// Checks the vectorized doubling of `P` against `P + P` computed with
    /// the `EdwardsPoint` operator, comparing compressed encodings.
    fn doubling_test_helper(P: edwards::EdwardsPoint) {
        // NOTE: a comparison against a serial doubling implementation used
        // to live here as commented-out code.
        let R2: edwards::EdwardsPoint = ExtendedPoint::from(P).double().into();

        println!("Testing point doubling:");
        println!("P = {:?}", P);
        println!("(vector) R2 = {:?}", R2);
        println!("P + P = {:?}", &P + &P);

        assert_eq!(R2.compress(), (&P + &P).compress());
        println!("OK!\n");
    }

    #[test]
    fn vector_doubling_vs_serial_doubling_vs_edwards_extendedpoint() {
        use constants;
        use scalar::Scalar;

        println!("Testing [2]id");
        doubling_test_helper(edwards::EdwardsPoint::identity());

        println!("Testing [2]B");
        doubling_test_helper(constants::ED25519_BASEPOINT_POINT);

        println!("Testing [2]([k]B)");
        let kB = &constants::ED25519_BASEPOINT_TABLE * &Scalar::from(8475983829u64);
        doubling_test_helper(kB);
    }
}
|
|
@ -1,824 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2018-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::ops::{Add, Mul, Neg};
|
||||
use packed_simd::{u64x4, IntoBits};
|
||||
|
||||
use backend::serial::u64::field::FieldElement51;
|
||||
|
||||
/// A wrapper around `vpmadd52luq` that works on `u64x4`.
|
||||
#[inline(always)]
|
||||
unsafe fn madd52lo(z: u64x4, x: u64x4, y: u64x4) -> u64x4 {
|
||||
use core::arch::x86_64::_mm256_madd52lo_epu64;
|
||||
_mm256_madd52lo_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits()
|
||||
}
|
||||
|
||||
/// A wrapper around `vpmadd52huq` that works on `u64x4`.
|
||||
#[inline(always)]
|
||||
unsafe fn madd52hi(z: u64x4, x: u64x4, y: u64x4) -> u64x4 {
|
||||
use core::arch::x86_64::_mm256_madd52hi_epu64;
|
||||
_mm256_madd52hi_epu64(z.into_bits(), x.into_bits(), y.into_bits()).into_bits()
|
||||
}
|
||||
|
||||
/// A vector of four field elements in radix 2^51, with unreduced coefficients.
///
/// Entry `i` of the array holds limb `i` of all four elements, one element
/// per 64-bit lane.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct F51x4Unreduced(pub(crate) [u64x4; 5]);
|
||||
|
||||
/// A vector of four field elements in radix 2^51, with reduced coefficients.
///
/// Same layout as `F51x4Unreduced`; the multiplication and squaring
/// routines in this file take their inputs in this form.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct F51x4Reduced(pub(crate) [u64x4; 5]);
|
||||
|
||||
/// Lane permutations supported by `shuffle_lanes`.
///
/// With the four input lanes named `A`, `B`, `C`, `D` (lane 0 to lane 3),
/// each variant name spells the output lanes from lane 0 to lane 3; for
/// example `BADC` exchanges the lanes of each pair.
#[derive(Copy, Clone)]
|
||||
pub enum Shuffle {
|
||||
AAAA,
|
||||
BBBB,
|
||||
BADC,
|
||||
BACD,
|
||||
ADDA,
|
||||
CBCB,
|
||||
ABDC,
|
||||
ABAB,
|
||||
DBBD,
|
||||
CACA,
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn shuffle_lanes(x: u64x4, control: Shuffle) -> u64x4 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_permute4x64_epi64 as perm;
|
||||
|
||||
match control {
|
||||
Shuffle::AAAA => perm(x.into_bits(), 0b00_00_00_00).into_bits(),
|
||||
Shuffle::BBBB => perm(x.into_bits(), 0b01_01_01_01).into_bits(),
|
||||
Shuffle::BADC => perm(x.into_bits(), 0b10_11_00_01).into_bits(),
|
||||
Shuffle::BACD => perm(x.into_bits(), 0b11_10_00_01).into_bits(),
|
||||
Shuffle::ADDA => perm(x.into_bits(), 0b00_11_11_00).into_bits(),
|
||||
Shuffle::CBCB => perm(x.into_bits(), 0b01_10_01_10).into_bits(),
|
||||
Shuffle::ABDC => perm(x.into_bits(), 0b10_11_01_00).into_bits(),
|
||||
Shuffle::ABAB => perm(x.into_bits(), 0b01_00_01_00).into_bits(),
|
||||
Shuffle::DBBD => perm(x.into_bits(), 0b11_01_01_11).into_bits(),
|
||||
Shuffle::CACA => perm(x.into_bits(), 0b00_10_00_10).into_bits(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Lane selections supported by `blend_lanes`.
///
/// The letters of each variant name the 64-bit lanes (`A` = lane 0 through
/// `D` = lane 3) that a blend takes from its second operand; all other
/// lanes come from the first operand.
#[derive(Copy, Clone)]
|
||||
pub enum Lanes {
|
||||
D,
|
||||
|
||||
C,
|
||||
AB,
|
||||
AC,
|
||||
AD,
|
||||
BCD,
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn blend_lanes(x: u64x4, y: u64x4, control: Lanes) -> u64x4 {
|
||||
unsafe {
|
||||
use core::arch::x86_64::_mm256_blend_epi32 as blend;
|
||||
|
||||
match control {
|
||||
Lanes::D => blend(x.into_bits(), y.into_bits(), 0b11_00_00_00).into_bits(),
|
||||
Lanes::C => blend(x.into_bits(), y.into_bits(), 0b00_11_00_00).into_bits(),
|
||||
Lanes::AB => blend(x.into_bits(), y.into_bits(), 0b00_00_11_11).into_bits(),
|
||||
Lanes::AC => blend(x.into_bits(), y.into_bits(), 0b00_11_00_11).into_bits(),
|
||||
Lanes::AD => blend(x.into_bits(), y.into_bits(), 0b11_00_00_11).into_bits(),
|
||||
Lanes::BCD => blend(x.into_bits(), y.into_bits(), 0b11_11_11_00).into_bits(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl F51x4Unreduced {
|
||||
pub fn zero() -> F51x4Unreduced {
|
||||
F51x4Unreduced([u64x4::splat(0); 5])
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
x0: &FieldElement51,
|
||||
x1: &FieldElement51,
|
||||
x2: &FieldElement51,
|
||||
x3: &FieldElement51,
|
||||
) -> F51x4Unreduced {
|
||||
F51x4Unreduced([
|
||||
u64x4::new(x0.0[0], x1.0[0], x2.0[0], x3.0[0]),
|
||||
u64x4::new(x0.0[1], x1.0[1], x2.0[1], x3.0[1]),
|
||||
u64x4::new(x0.0[2], x1.0[2], x2.0[2], x3.0[2]),
|
||||
u64x4::new(x0.0[3], x1.0[3], x2.0[3], x3.0[3]),
|
||||
u64x4::new(x0.0[4], x1.0[4], x2.0[4], x3.0[4]),
|
||||
])
|
||||
}
|
||||
|
||||
pub fn split(&self) -> [FieldElement51; 4] {
|
||||
let x = &self.0;
|
||||
[
|
||||
FieldElement51([
|
||||
x[0].extract(0),
|
||||
x[1].extract(0),
|
||||
x[2].extract(0),
|
||||
x[3].extract(0),
|
||||
x[4].extract(0),
|
||||
]),
|
||||
FieldElement51([
|
||||
x[0].extract(1),
|
||||
x[1].extract(1),
|
||||
x[2].extract(1),
|
||||
x[3].extract(1),
|
||||
x[4].extract(1),
|
||||
]),
|
||||
FieldElement51([
|
||||
x[0].extract(2),
|
||||
x[1].extract(2),
|
||||
x[2].extract(2),
|
||||
x[3].extract(2),
|
||||
x[4].extract(2),
|
||||
]),
|
||||
FieldElement51([
|
||||
x[0].extract(3),
|
||||
x[1].extract(3),
|
||||
x[2].extract(3),
|
||||
x[3].extract(3),
|
||||
x[4].extract(3),
|
||||
]),
|
||||
]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn diff_sum(&self) -> F51x4Unreduced {
|
||||
// tmp1 = (B, A, D, C)
|
||||
let tmp1 = self.shuffle(Shuffle::BADC);
|
||||
// tmp2 = (-A, B, -C, D)
|
||||
let tmp2 = self.blend(&self.negate_lazy(), Lanes::AC);
|
||||
// (B - A, B + A, D - C, D + C)
|
||||
tmp1 + tmp2
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn negate_lazy(&self) -> F51x4Unreduced {
|
||||
let lo = u64x4::splat(36028797018963664u64);
|
||||
let hi = u64x4::splat(36028797018963952u64);
|
||||
F51x4Unreduced([
|
||||
lo - self.0[0],
|
||||
hi - self.0[1],
|
||||
hi - self.0[2],
|
||||
hi - self.0[3],
|
||||
hi - self.0[4],
|
||||
])
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn shuffle(&self, control: Shuffle) -> F51x4Unreduced {
|
||||
F51x4Unreduced([
|
||||
shuffle_lanes(self.0[0], control),
|
||||
shuffle_lanes(self.0[1], control),
|
||||
shuffle_lanes(self.0[2], control),
|
||||
shuffle_lanes(self.0[3], control),
|
||||
shuffle_lanes(self.0[4], control),
|
||||
])
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn blend(&self, other: &F51x4Unreduced, control: Lanes) -> F51x4Unreduced {
|
||||
F51x4Unreduced([
|
||||
blend_lanes(self.0[0], other.0[0], control),
|
||||
blend_lanes(self.0[1], other.0[1], control),
|
||||
blend_lanes(self.0[2], other.0[2], control),
|
||||
blend_lanes(self.0[3], other.0[3], control),
|
||||
blend_lanes(self.0[4], other.0[4], control),
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
impl Neg for F51x4Reduced {
|
||||
type Output = F51x4Reduced;
|
||||
|
||||
fn neg(self) -> F51x4Reduced {
|
||||
F51x4Unreduced::from(self).negate_lazy().into()
|
||||
}
|
||||
}
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
impl ConditionallySelectable for F51x4Reduced {
|
||||
#[inline]
|
||||
fn conditional_select(a: &F51x4Reduced, b: &F51x4Reduced, choice: Choice) -> F51x4Reduced {
|
||||
let mask = (-(choice.unwrap_u8() as i64)) as u64;
|
||||
let mask_vec = u64x4::splat(mask);
|
||||
F51x4Reduced([
|
||||
a.0[0] ^ (mask_vec & (a.0[0] ^ b.0[0])),
|
||||
a.0[1] ^ (mask_vec & (a.0[1] ^ b.0[1])),
|
||||
a.0[2] ^ (mask_vec & (a.0[2] ^ b.0[2])),
|
||||
a.0[3] ^ (mask_vec & (a.0[3] ^ b.0[3])),
|
||||
a.0[4] ^ (mask_vec & (a.0[4] ^ b.0[4])),
|
||||
])
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn conditional_assign(&mut self, other: &F51x4Reduced, choice: Choice) {
|
||||
let mask = (-(choice.unwrap_u8() as i64)) as u64;
|
||||
let mask_vec = u64x4::splat(mask);
|
||||
self.0[0] ^= mask_vec & (self.0[0] ^ other.0[0]);
|
||||
self.0[1] ^= mask_vec & (self.0[1] ^ other.0[1]);
|
||||
self.0[2] ^= mask_vec & (self.0[2] ^ other.0[2]);
|
||||
self.0[3] ^= mask_vec & (self.0[3] ^ other.0[3]);
|
||||
self.0[4] ^= mask_vec & (self.0[4] ^ other.0[4]);
|
||||
}
|
||||
}
|
||||
|
||||
impl F51x4Reduced {
|
||||
#[inline]
|
||||
pub fn shuffle(&self, control: Shuffle) -> F51x4Reduced {
|
||||
F51x4Reduced([
|
||||
shuffle_lanes(self.0[0], control),
|
||||
shuffle_lanes(self.0[1], control),
|
||||
shuffle_lanes(self.0[2], control),
|
||||
shuffle_lanes(self.0[3], control),
|
||||
shuffle_lanes(self.0[4], control),
|
||||
])
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn blend(&self, other: &F51x4Reduced, control: Lanes) -> F51x4Reduced {
|
||||
F51x4Reduced([
|
||||
blend_lanes(self.0[0], other.0[0], control),
|
||||
blend_lanes(self.0[1], other.0[1], control),
|
||||
blend_lanes(self.0[2], other.0[2], control),
|
||||
blend_lanes(self.0[3], other.0[3], control),
|
||||
blend_lanes(self.0[4], other.0[4], control),
|
||||
])
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn square(&self) -> F51x4Unreduced {
|
||||
unsafe {
|
||||
let x = &self.0;
|
||||
|
||||
// Represent values with coeff. 2
|
||||
let mut z0_2 = u64x4::splat(0);
|
||||
let mut z1_2 = u64x4::splat(0);
|
||||
let mut z2_2 = u64x4::splat(0);
|
||||
let mut z3_2 = u64x4::splat(0);
|
||||
let mut z4_2 = u64x4::splat(0);
|
||||
let mut z5_2 = u64x4::splat(0);
|
||||
let mut z6_2 = u64x4::splat(0);
|
||||
let mut z7_2 = u64x4::splat(0);
|
||||
let mut z9_2 = u64x4::splat(0);
|
||||
|
||||
// Represent values with coeff. 4
|
||||
let mut z2_4 = u64x4::splat(0);
|
||||
let mut z3_4 = u64x4::splat(0);
|
||||
let mut z4_4 = u64x4::splat(0);
|
||||
let mut z5_4 = u64x4::splat(0);
|
||||
let mut z6_4 = u64x4::splat(0);
|
||||
let mut z7_4 = u64x4::splat(0);
|
||||
let mut z8_4 = u64x4::splat(0);
|
||||
|
||||
let mut z0_1 = u64x4::splat(0);
|
||||
z0_1 = madd52lo(z0_1, x[0], x[0]);
|
||||
|
||||
let mut z1_1 = u64x4::splat(0);
|
||||
z1_2 = madd52lo(z1_2, x[0], x[1]);
|
||||
z1_2 = madd52hi(z1_2, x[0], x[0]);
|
||||
|
||||
z2_4 = madd52hi(z2_4, x[0], x[1]);
|
||||
let mut z2_1 = z2_4 << 2;
|
||||
z2_2 = madd52lo(z2_2, x[0], x[2]);
|
||||
z2_1 = madd52lo(z2_1, x[1], x[1]);
|
||||
|
||||
z3_4 = madd52hi(z3_4, x[0], x[2]);
|
||||
let mut z3_1 = z3_4 << 2;
|
||||
z3_2 = madd52lo(z3_2, x[1], x[2]);
|
||||
z3_2 = madd52lo(z3_2, x[0], x[3]);
|
||||
z3_2 = madd52hi(z3_2, x[1], x[1]);
|
||||
|
||||
z4_4 = madd52hi(z4_4, x[1], x[2]);
|
||||
z4_4 = madd52hi(z4_4, x[0], x[3]);
|
||||
let mut z4_1 = z4_4 << 2;
|
||||
z4_2 = madd52lo(z4_2, x[1], x[3]);
|
||||
z4_2 = madd52lo(z4_2, x[0], x[4]);
|
||||
z4_1 = madd52lo(z4_1, x[2], x[2]);
|
||||
|
||||
z5_4 = madd52hi(z5_4, x[1], x[3]);
|
||||
z5_4 = madd52hi(z5_4, x[0], x[4]);
|
||||
let mut z5_1 = z5_4 << 2;
|
||||
z5_2 = madd52lo(z5_2, x[2], x[3]);
|
||||
z5_2 = madd52lo(z5_2, x[1], x[4]);
|
||||
z5_2 = madd52hi(z5_2, x[2], x[2]);
|
||||
|
||||
z6_4 = madd52hi(z6_4, x[2], x[3]);
|
||||
z6_4 = madd52hi(z6_4, x[1], x[4]);
|
||||
let mut z6_1 = z6_4 << 2;
|
||||
z6_2 = madd52lo(z6_2, x[2], x[4]);
|
||||
z6_1 = madd52lo(z6_1, x[3], x[3]);
|
||||
|
||||
z7_4 = madd52hi(z7_4, x[2], x[4]);
|
||||
let mut z7_1 = z7_4 << 2;
|
||||
z7_2 = madd52lo(z7_2, x[3], x[4]);
|
||||
z7_2 = madd52hi(z7_2, x[3], x[3]);
|
||||
|
||||
z8_4 = madd52hi(z8_4, x[3], x[4]);
|
||||
let mut z8_1 = z8_4 << 2;
|
||||
z8_1 = madd52lo(z8_1, x[4], x[4]);
|
||||
|
||||
let mut z9_1 = u64x4::splat(0);
|
||||
z9_2 = madd52hi(z9_2, x[4], x[4]);
|
||||
|
||||
z5_1 += z5_2 << 1;
|
||||
z6_1 += z6_2 << 1;
|
||||
z7_1 += z7_2 << 1;
|
||||
z9_1 += z9_2 << 1;
|
||||
|
||||
let mut t0 = u64x4::splat(0);
|
||||
let mut t1 = u64x4::splat(0);
|
||||
let r19 = u64x4::splat(19);
|
||||
|
||||
t0 = madd52hi(t0, r19, z9_1);
|
||||
t1 = madd52lo(t1, r19, z9_1 >> 52);
|
||||
|
||||
z4_2 = madd52lo(z4_2, r19, z8_1 >> 52);
|
||||
z3_2 = madd52lo(z3_2, r19, z7_1 >> 52);
|
||||
z2_2 = madd52lo(z2_2, r19, z6_1 >> 52);
|
||||
z1_2 = madd52lo(z1_2, r19, z5_1 >> 52);
|
||||
|
||||
z0_2 = madd52lo(z0_2, r19, t0 + t1);
|
||||
z1_2 = madd52hi(z1_2, r19, z5_1);
|
||||
z2_2 = madd52hi(z2_2, r19, z6_1);
|
||||
z3_2 = madd52hi(z3_2, r19, z7_1);
|
||||
z4_2 = madd52hi(z4_2, r19, z8_1);
|
||||
|
||||
z0_1 = madd52lo(z0_1, r19, z5_1);
|
||||
z1_1 = madd52lo(z1_1, r19, z6_1);
|
||||
z2_1 = madd52lo(z2_1, r19, z7_1);
|
||||
z3_1 = madd52lo(z3_1, r19, z8_1);
|
||||
z4_1 = madd52lo(z4_1, r19, z9_1);
|
||||
|
||||
F51x4Unreduced([
|
||||
z0_1 + z0_2 + z0_2,
|
||||
z1_1 + z1_2 + z1_2,
|
||||
z2_1 + z2_2 + z2_2,
|
||||
z3_1 + z3_2 + z3_2,
|
||||
z4_1 + z4_2 + z4_2,
|
||||
])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<F51x4Reduced> for F51x4Unreduced {
|
||||
#[inline]
|
||||
fn from(x: F51x4Reduced) -> F51x4Unreduced {
|
||||
F51x4Unreduced(x.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<F51x4Unreduced> for F51x4Reduced {
|
||||
#[inline]
|
||||
fn from(x: F51x4Unreduced) -> F51x4Reduced {
|
||||
let mask = u64x4::splat((1 << 51) - 1);
|
||||
let r19 = u64x4::splat(19);
|
||||
|
||||
// Compute carryouts in parallel
|
||||
let c0 = x.0[0] >> 51;
|
||||
let c1 = x.0[1] >> 51;
|
||||
let c2 = x.0[2] >> 51;
|
||||
let c3 = x.0[3] >> 51;
|
||||
let c4 = x.0[4] >> 51;
|
||||
|
||||
unsafe {
|
||||
F51x4Reduced([
|
||||
madd52lo(x.0[0] & mask, c4, r19),
|
||||
(x.0[1] & mask) + c0,
|
||||
(x.0[2] & mask) + c1,
|
||||
(x.0[3] & mask) + c2,
|
||||
(x.0[4] & mask) + c3,
|
||||
])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Add<F51x4Unreduced> for F51x4Unreduced {
|
||||
type Output = F51x4Unreduced;
|
||||
#[inline]
|
||||
fn add(self, rhs: F51x4Unreduced) -> F51x4Unreduced {
|
||||
F51x4Unreduced([
|
||||
self.0[0] + rhs.0[0],
|
||||
self.0[1] + rhs.0[1],
|
||||
self.0[2] + rhs.0[2],
|
||||
self.0[3] + rhs.0[3],
|
||||
self.0[4] + rhs.0[4],
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Mul<(u32, u32, u32, u32)> for &'a F51x4Reduced {
|
||||
type Output = F51x4Unreduced;
|
||||
#[inline]
|
||||
fn mul(self, scalars: (u32, u32, u32, u32)) -> F51x4Unreduced {
|
||||
unsafe {
|
||||
let x = &self.0;
|
||||
let y = u64x4::new(
|
||||
scalars.0 as u64,
|
||||
scalars.1 as u64,
|
||||
scalars.2 as u64,
|
||||
scalars.3 as u64,
|
||||
);
|
||||
let r19 = u64x4::splat(19);
|
||||
|
||||
let mut z0_1 = u64x4::splat(0);
|
||||
let mut z1_1 = u64x4::splat(0);
|
||||
let mut z2_1 = u64x4::splat(0);
|
||||
let mut z3_1 = u64x4::splat(0);
|
||||
let mut z4_1 = u64x4::splat(0);
|
||||
let mut z1_2 = u64x4::splat(0);
|
||||
let mut z2_2 = u64x4::splat(0);
|
||||
let mut z3_2 = u64x4::splat(0);
|
||||
let mut z4_2 = u64x4::splat(0);
|
||||
let mut z5_2 = u64x4::splat(0);
|
||||
|
||||
// Wave 0
|
||||
z4_2 = madd52hi(z4_2, y, x[3]);
|
||||
z5_2 = madd52hi(z5_2, y, x[4]);
|
||||
z4_1 = madd52lo(z4_1, y, x[4]);
|
||||
z0_1 = madd52lo(z0_1, y, x[0]);
|
||||
z3_1 = madd52lo(z3_1, y, x[3]);
|
||||
z2_1 = madd52lo(z2_1, y, x[2]);
|
||||
z1_1 = madd52lo(z1_1, y, x[1]);
|
||||
z3_2 = madd52hi(z3_2, y, x[2]);
|
||||
|
||||
// Wave 2
|
||||
z2_2 = madd52hi(z2_2, y, x[1]);
|
||||
z1_2 = madd52hi(z1_2, y, x[0]);
|
||||
z0_1 = madd52lo(z0_1, z5_2 + z5_2, r19);
|
||||
|
||||
F51x4Unreduced([
|
||||
z0_1,
|
||||
z1_1 + z1_2 + z1_2,
|
||||
z2_1 + z2_2 + z2_2,
|
||||
z3_1 + z3_2 + z3_2,
|
||||
z4_1 + z4_2 + z4_2,
|
||||
])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b F51x4Reduced> for &'a F51x4Reduced {
|
||||
type Output = F51x4Unreduced;
|
||||
#[inline]
|
||||
fn mul(self, rhs: &'b F51x4Reduced) -> F51x4Unreduced {
|
||||
unsafe {
|
||||
// Inputs
|
||||
let x = &self.0;
|
||||
let y = &rhs.0;
|
||||
|
||||
// Accumulators for terms with coeff 1
|
||||
let mut z0_1 = u64x4::splat(0);
|
||||
let mut z1_1 = u64x4::splat(0);
|
||||
let mut z2_1 = u64x4::splat(0);
|
||||
let mut z3_1 = u64x4::splat(0);
|
||||
let mut z4_1 = u64x4::splat(0);
|
||||
let mut z5_1 = u64x4::splat(0);
|
||||
let mut z6_1 = u64x4::splat(0);
|
||||
let mut z7_1 = u64x4::splat(0);
|
||||
let mut z8_1 = u64x4::splat(0);
|
||||
|
||||
// Accumulators for terms with coeff 2
|
||||
let mut z0_2 = u64x4::splat(0);
|
||||
let mut z1_2 = u64x4::splat(0);
|
||||
let mut z2_2 = u64x4::splat(0);
|
||||
let mut z3_2 = u64x4::splat(0);
|
||||
let mut z4_2 = u64x4::splat(0);
|
||||
let mut z5_2 = u64x4::splat(0);
|
||||
let mut z6_2 = u64x4::splat(0);
|
||||
let mut z7_2 = u64x4::splat(0);
|
||||
let mut z8_2 = u64x4::splat(0);
|
||||
let mut z9_2 = u64x4::splat(0);
|
||||
|
||||
// LLVM doesn't seem to do much work reordering IFMA
|
||||
// instructions, so try to organize them into "waves" of 8
|
||||
// independent operations (4c latency, 0.5 c throughput
|
||||
// means 8 in flight)
|
||||
|
||||
// Wave 0
|
||||
z4_1 = madd52lo(z4_1, x[2], y[2]);
|
||||
z5_2 = madd52hi(z5_2, x[2], y[2]);
|
||||
z5_1 = madd52lo(z5_1, x[4], y[1]);
|
||||
z6_2 = madd52hi(z6_2, x[4], y[1]);
|
||||
z6_1 = madd52lo(z6_1, x[4], y[2]);
|
||||
z7_2 = madd52hi(z7_2, x[4], y[2]);
|
||||
z7_1 = madd52lo(z7_1, x[4], y[3]);
|
||||
z8_2 = madd52hi(z8_2, x[4], y[3]);
|
||||
|
||||
// Wave 1
|
||||
z4_1 = madd52lo(z4_1, x[3], y[1]);
|
||||
z5_2 = madd52hi(z5_2, x[3], y[1]);
|
||||
z5_1 = madd52lo(z5_1, x[3], y[2]);
|
||||
z6_2 = madd52hi(z6_2, x[3], y[2]);
|
||||
z6_1 = madd52lo(z6_1, x[3], y[3]);
|
||||
z7_2 = madd52hi(z7_2, x[3], y[3]);
|
||||
z7_1 = madd52lo(z7_1, x[3], y[4]);
|
||||
z8_2 = madd52hi(z8_2, x[3], y[4]);
|
||||
|
||||
// Wave 2
|
||||
z8_1 = madd52lo(z8_1, x[4], y[4]);
|
||||
z9_2 = madd52hi(z9_2, x[4], y[4]);
|
||||
z4_1 = madd52lo(z4_1, x[4], y[0]);
|
||||
z5_2 = madd52hi(z5_2, x[4], y[0]);
|
||||
z5_1 = madd52lo(z5_1, x[2], y[3]);
|
||||
z6_2 = madd52hi(z6_2, x[2], y[3]);
|
||||
z6_1 = madd52lo(z6_1, x[2], y[4]);
|
||||
z7_2 = madd52hi(z7_2, x[2], y[4]);
|
||||
|
||||
let z8 = z8_1 + z8_2 + z8_2;
|
||||
let z9 = z9_2 + z9_2;
|
||||
|
||||
// Wave 3
|
||||
z3_1 = madd52lo(z3_1, x[3], y[0]);
|
||||
z4_2 = madd52hi(z4_2, x[3], y[0]);
|
||||
z4_1 = madd52lo(z4_1, x[1], y[3]);
|
||||
z5_2 = madd52hi(z5_2, x[1], y[3]);
|
||||
z5_1 = madd52lo(z5_1, x[1], y[4]);
|
||||
z6_2 = madd52hi(z6_2, x[1], y[4]);
|
||||
z2_1 = madd52lo(z2_1, x[2], y[0]);
|
||||
z3_2 = madd52hi(z3_2, x[2], y[0]);
|
||||
|
||||
let z6 = z6_1 + z6_2 + z6_2;
|
||||
let z7 = z7_1 + z7_2 + z7_2;
|
||||
|
||||
// Wave 4
|
||||
z3_1 = madd52lo(z3_1, x[2], y[1]);
|
||||
z4_2 = madd52hi(z4_2, x[2], y[1]);
|
||||
z4_1 = madd52lo(z4_1, x[0], y[4]);
|
||||
z5_2 = madd52hi(z5_2, x[0], y[4]);
|
||||
z1_1 = madd52lo(z1_1, x[1], y[0]);
|
||||
z2_2 = madd52hi(z2_2, x[1], y[0]);
|
||||
z2_1 = madd52lo(z2_1, x[1], y[1]);
|
||||
z3_2 = madd52hi(z3_2, x[1], y[1]);
|
||||
|
||||
let z5 = z5_1 + z5_2 + z5_2;
|
||||
|
||||
// Wave 5
|
||||
z3_1 = madd52lo(z3_1, x[1], y[2]);
|
||||
z4_2 = madd52hi(z4_2, x[1], y[2]);
|
||||
z0_1 = madd52lo(z0_1, x[0], y[0]);
|
||||
z1_2 = madd52hi(z1_2, x[0], y[0]);
|
||||
z1_1 = madd52lo(z1_1, x[0], y[1]);
|
||||
z2_1 = madd52lo(z2_1, x[0], y[2]);
|
||||
z2_2 = madd52hi(z2_2, x[0], y[1]);
|
||||
z3_2 = madd52hi(z3_2, x[0], y[2]);
|
||||
|
||||
let mut t0 = u64x4::splat(0);
|
||||
let mut t1 = u64x4::splat(0);
|
||||
let r19 = u64x4::splat(19);
|
||||
|
||||
// Wave 6
|
||||
t0 = madd52hi(t0, r19, z9);
|
||||
t1 = madd52lo(t1, r19, z9 >> 52);
|
||||
z3_1 = madd52lo(z3_1, x[0], y[3]);
|
||||
z4_2 = madd52hi(z4_2, x[0], y[3]);
|
||||
z1_2 = madd52lo(z1_2, r19, z5 >> 52);
|
||||
z2_2 = madd52lo(z2_2, r19, z6 >> 52);
|
||||
z3_2 = madd52lo(z3_2, r19, z7 >> 52);
|
||||
z0_1 = madd52lo(z0_1, r19, z5);
|
||||
|
||||
// Wave 7
|
||||
z4_1 = madd52lo(z4_1, r19, z9);
|
||||
z1_1 = madd52lo(z1_1, r19, z6);
|
||||
z0_2 = madd52lo(z0_2, r19, t0 + t1);
|
||||
z4_2 = madd52hi(z4_2, r19, z8);
|
||||
z2_1 = madd52lo(z2_1, r19, z7);
|
||||
z1_2 = madd52hi(z1_2, r19, z5);
|
||||
z2_2 = madd52hi(z2_2, r19, z6);
|
||||
z3_2 = madd52hi(z3_2, r19, z7);
|
||||
|
||||
// Wave 8
|
||||
z3_1 = madd52lo(z3_1, r19, z8);
|
||||
z4_2 = madd52lo(z4_2, r19, z8 >> 52);
|
||||
|
||||
F51x4Unreduced([
|
||||
z0_1 + z0_2 + z0_2,
|
||||
z1_1 + z1_2 + z1_2,
|
||||
z2_1 + z2_2 + z2_2,
|
||||
z3_1 + z3_2 + z3_2,
|
||||
z4_1 + z4_2 + z4_2,
|
||||
])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
/// Checks `madd52lo` on small operands: lanes holding 5, 2 and 3 must come
/// back as `5 + 2 * 3`.
#[test]
fn vpmadd52luq() {
    let addend = u64x4::splat(5);
    let lhs = u64x4::splat(2);
    let rhs = u64x4::splat(3);

    // NOTE(review): `madd52lo` has to be called from an `unsafe` block;
    // presumably it relies on AVX512-IFMA being available -- confirm at its
    // definition.
    let sum = unsafe { madd52lo(addend, lhs, rhs) };

    assert_eq!(sum, u64x4::splat(5 + 2 * 3));
}
|
||||
|
||||
/// Packing one field element into all four lanes with `new` and unpacking
/// it with `split` must give that element back in every lane.
#[test]
fn new_split_round_trip_on_reduced_input() {
    // Invert a small field element to get a big one
    let elem = FieldElement51([2438, 24, 243, 0, 0]).invert();

    let lanes = F51x4Unreduced::new(&elem, &elem, &elem, &elem).split();

    (0..4usize).for_each(|lane| assert_eq!(elem, lanes[lane]));
}
|
||||
|
||||
/// Same round trip as the reduced case, but every limb is first scaled by
/// 16 and deliberately left unreduced.
#[test]
fn new_split_round_trip_on_unreduced_input() {
    // Invert a small field element to get a big one ...
    let base = FieldElement51([2438, 24, 243, 0, 0]).invert();

    // ... then multiply each limb by 16 (shift left by 4) without reducing
    // the coefficients.
    let mut limbs = base.0;
    for limb in limbs.iter_mut() {
        *limb <<= 4;
    }
    let scaled = FieldElement51(limbs);

    let lanes = F51x4Unreduced::new(&scaled, &scaled, &scaled, &scaled).split();

    for lane in 0..4 {
        assert_eq!(scaled, lanes[lane]);
    }
}
|
||||
|
||||
/// Converting an unreduced vector to `F51x4Reduced` and back must yield, in
/// every lane, the same element as serial multiplication by 16.
#[test]
fn test_reduction() {
    // Invert a small field element to get a big one ...
    let base = FieldElement51([2438, 24, 243, 0, 0]).invert();

    // ... then multiply each limb by 16 (shift left by 4) without reducing
    // the coefficients.
    let mut limbs = base.0;
    for limb in limbs.iter_mut() {
        *limb <<= 4;
    }
    let scaled = FieldElement51(limbs);

    let reduced = F51x4Reduced::from(F51x4Unreduced::new(&scaled, &scaled, &scaled, &scaled));
    let lanes = F51x4Unreduced::from(reduced).split();

    // Reference value: `base * 16` computed with the serial field multiplication.
    let expected = &base * &FieldElement51([(1 << 4), 0, 0, 0, 0]);

    for lane in 0..4 {
        assert_eq!(expected, lanes[lane]);
    }
}
|
||||
|
||||
/// The four-lane product must equal the serial product in every lane.
#[test]
fn mul_matches_serial() {
    // Invert small field elements to get big ones
    let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
    let b = FieldElement51([98098, 87987897, 0, 1, 0]).invert();
    let expected = &a * &b;

    // Copy one element into all four lanes, then reduce the vector.
    let broadcast =
        |fe: &FieldElement51| F51x4Reduced::from(F51x4Unreduced::new(fe, fe, fe, fe));

    let ax4 = broadcast(&a);
    let bx4 = broadcast(&b);
    let lanes = (&ax4 * &bx4).split();

    for lane in 0..4 {
        assert_eq!(expected, lanes[lane]);
    }
}
|
||||
|
||||
/// Runs 1024 rounds of "multiply by a, then by b" serially and on vectors
/// (reducing before each vector multiplication) and compares the results.
#[test]
fn iterated_mul_matches_serial() {
    // Invert small field elements to get big ones
    let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
    let b = FieldElement51([98098, 87987897, 0, 1, 0]).invert();

    // Serial reference: start from a*b, then apply a and b in turn.
    let expected = (0..1024).fold(&a * &b, |acc, _| &b * &(&a * &acc));

    let ax4 = F51x4Reduced::from(F51x4Unreduced::new(&a, &a, &a, &a));
    let bx4 = F51x4Reduced::from(F51x4Unreduced::new(&b, &b, &b, &b));

    // Vector version of the same chain; each product is reduced before it
    // is fed into the next multiplication.
    let product = (0..1024).fold(&ax4 * &bx4, |acc, _| {
        let times_a = &ax4 * &F51x4Reduced::from(acc);
        &bx4 * &F51x4Reduced::from(times_a)
    });

    let lanes = product.split();

    for lane in 0..4 {
        assert_eq!(expected, lanes[lane]);
    }
}
|
||||
|
||||
/// `square()` must agree lane-by-lane with multiplying a vector by itself.
#[test]
fn square_matches_mul() {
    // Invert a small field element to get a big one
    let a = FieldElement51([2438, 24, 243, 0, 0]).invert();

    let ax4 = F51x4Reduced::from(F51x4Unreduced::new(&a, &a, &a, &a));

    let via_mul = (&ax4 * &ax4).split();
    let via_square = ax4.square().split();

    for lane in 0..4 {
        assert_eq!(via_square[lane], via_mul[lane]);
    }
}
|
||||
|
||||
/// Squares a serial element and a four-lane vector 1024 times in lockstep,
/// checking after every round that all lanes match the serial value.
#[test]
fn iterated_square_matches_serial() {
    // Invert a small field element to get a big one
    let mut serial = FieldElement51([2438, 24, 243, 0, 0]).invert();
    let mut vector = F51x4Unreduced::new(&serial, &serial, &serial, &serial);

    for _round in 0..1024 {
        serial = serial.square();
        // The vector is reduced before each squaring.
        vector = F51x4Reduced::from(vector).square();

        let lanes = vector.split();
        (0..4usize).for_each(|lane| assert_eq!(serial, lanes[lane]));
    }
}
|
||||
|
||||
/// Repeated multiplication of a vector by a tuple of `u32` constants must
/// match repeated serial multiplication by the same constant.
#[test]
fn iterated_u32_mul_matches_serial() {
    // Invert a small field element to get a big one
    let a = FieldElement51([2438, 24, 243, 0, 0]).invert();
    let small = FieldElement51([121665, 0, 0, 0, 0]);

    // Serial reference: a * small, then 1024 further multiplications by small.
    let expected = (0..1024).fold(&a * &small, |acc, _| &small * &acc);

    let ax4 = F51x4Unreduced::new(&a, &a, &a, &a);
    let smallx4 = (121665u32, 121665u32, 121665u32, 121665u32);

    // Vector version; the running product is reduced before every multiplication.
    let product = (0..1024).fold(&F51x4Reduced::from(ax4) * smallx4, |acc, _| {
        &F51x4Reduced::from(acc) * smallx4
    });

    let lanes = product.split();

    for lane in 0..4 {
        assert_eq!(expected, lanes[lane]);
    }
}
|
||||
|
||||
/// `Shuffle::AAAA` must copy the first lane into all four lanes.
#[test]
fn shuffle_AAAA() {
    let lane_a = FieldElement51::from_bytes(&[0x10; 32]);
    let lane_b = FieldElement51::from_bytes(&[0x11; 32]);
    let lane_c = FieldElement51::from_bytes(&[0x12; 32]);
    let lane_d = FieldElement51::from_bytes(&[0x13; 32]);

    let shuffled =
        F51x4Unreduced::new(&lane_a, &lane_b, &lane_c, &lane_d).shuffle(Shuffle::AAAA);
    let lanes = shuffled.split();

    // Every output lane must hold the element that was in lane 0.
    for lane in 0..4 {
        assert_eq!(lanes[lane], lane_a);
    }
}
|
||||
|
||||
/// Blending with `Lanes::AB` must take the first two lanes from the
/// argument and keep the last two lanes of the receiver.
#[test]
fn blend_AB() {
    let x0 = FieldElement51::from_bytes(&[0x10; 32]);
    let x1 = FieldElement51::from_bytes(&[0x11; 32]);
    let x2 = FieldElement51::from_bytes(&[0x12; 32]);
    let x3 = FieldElement51::from_bytes(&[0x13; 32]);

    let receiver = F51x4Unreduced::new(&x0, &x1, &x2, &x3);
    let reversed = F51x4Unreduced::new(&x3, &x2, &x1, &x0);

    let lanes = receiver.blend(&reversed, Lanes::AB).split();

    // Lanes 0 and 1 come from `reversed`, lanes 2 and 3 from `receiver`.
    let expected = [&x3, &x2, &x2, &x3];
    for (lane, want) in expected.iter().enumerate() {
        assert_eq!(lanes[lane], **want);
    }
}
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2018-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![cfg_attr(
|
||||
feature = "nightly",
|
||||
doc(include = "../../../../docs/ifma-notes.md")
|
||||
)]
|
||||
|
||||
pub mod field;
|
||||
|
||||
pub mod edwards;
|
||||
|
||||
pub mod constants;
|
|
@ -1,42 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
// Conditionally include the notes if we're on nightly (so we can include docs at all).
|
||||
#![cfg_attr(
|
||||
feature = "nightly",
|
||||
doc(include = "../../../docs/parallel-formulas.md")
|
||||
)]
|
||||
|
||||
#[cfg(not(any(target_feature = "avx2", target_feature = "avx512ifma", rustdoc)))]
|
||||
compile_error!("simd_backend selected without target_feature=+avx2 or +avx512ifma");
|
||||
|
||||
#[cfg(any(
|
||||
all(target_feature = "avx2", not(target_feature = "avx512ifma")),
|
||||
rustdoc
|
||||
))]
|
||||
#[doc(cfg(all(target_feature = "avx2", not(target_feature = "avx512ifma"))))]
|
||||
pub mod avx2;
|
||||
#[cfg(any(
|
||||
all(target_feature = "avx2", not(target_feature = "avx512ifma")),
|
||||
rustdoc
|
||||
))]
|
||||
pub(crate) use self::avx2::{
|
||||
constants::BASEPOINT_ODD_LOOKUP_TABLE, edwards::CachedPoint, edwards::ExtendedPoint,
|
||||
};
|
||||
|
||||
#[cfg(any(target_feature = "avx512ifma", rustdoc))]
|
||||
#[doc(cfg(target_feature = "avx512ifma"))]
|
||||
pub mod ifma;
|
||||
#[cfg(target_feature = "avx512ifma")]
|
||||
pub(crate) use self::ifma::{
|
||||
constants::BASEPOINT_ODD_LOOKUP_TABLE, edwards::CachedPoint, edwards::ExtendedPoint,
|
||||
};
|
||||
|
||||
pub mod scalar_mul;
|
|
@ -1,22 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
pub mod variable_base;
|
||||
|
||||
pub mod vartime_double_base;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod precomputed_straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod pippenger;
|
|
@ -1,164 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Oleg Andreev
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Oleg Andreev <oleganza@gmail.com>
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use backend::vector::{CachedPoint, ExtendedPoint};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::{Identity, VartimeMultiscalarMul};
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Implements a version of Pippenger's algorithm.
///
/// See the documentation in the serial `scalar_mul::pippenger` module for details.
///
/// This is a field-less marker type; the algorithm itself lives in its
/// `VartimeMultiscalarMul` implementation.
pub struct Pippenger;
|
||||
|
||||
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
    type Point = EdwardsPoint;

    /// Variable-time multiscalar multiplication over paired `scalars` and
    /// `points`, using bucket accumulation per digit position.
    ///
    /// Returns `None` if any entry of `points` is `None`. Scalars and points
    /// are paired with `zip`, so surplus items of the longer input are ignored.
    fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
    where
        I: IntoIterator,
        I::Item: Borrow<Scalar>,
        J: IntoIterator<Item = Option<EdwardsPoint>>,
    {
        let mut scalars = scalars.into_iter();
        // The window width is chosen from the lower bound of the iterator's
        // size hint; `by_ref()` only borrows the iterator, nothing is consumed.
        let size = scalars.by_ref().size_hint().0;
        let w = if size < 500 {
            6
        } else if size < 800 {
            7
        } else {
            8
        };

        let max_digit: usize = 1 << w;
        let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
        let buckets_count: usize = max_digit / 2; // digits are signed+centered hence 2^w/2, excluding 0-th bucket

        // Collect optimized scalars and points in a buffer for repeated access
        // (scanning the whole collection per each digit position).
        let scalars = scalars
            .into_iter()
            .map(|s| s.borrow().to_radix_2w(w));

        let points = points
            .into_iter()
            .map(|p| p.map(|P| CachedPoint::from(ExtendedPoint::from(P))));

        // Collecting into `Option<Vec<_>>` yields `None` (returned via `?`)
        // as soon as one of the points is `None`.
        let scalars_points = scalars
            .zip(points)
            .map(|(s, maybe_p)| maybe_p.map(|p| (s, p)))
            .collect::<Option<Vec<_>>>()?;

        // Prepare 2^w/2 buckets.
        // buckets[i] corresponds to a multiplication factor (i+1).
        let mut buckets: Vec<ExtendedPoint> = (0..buckets_count)
            .map(|_| ExtendedPoint::identity())
            .collect();

        // Lazily yields one partial result per digit position, most
        // significant position first.
        let mut columns = (0..digits_count).rev().map(|digit_index| {
            // Clear the buckets when processing another digit.
            for i in 0..buckets_count {
                buckets[i] = ExtendedPoint::identity();
            }

            // Iterate over pairs of (point, scalar)
            // and add/sub the point to the corresponding bucket.
            // Note: if we add support for precomputed lookup tables,
            // we'll be adding/subtracting point premultiplied by `digits[i]` to buckets[0].
            for (digits, pt) in scalars_points.iter() {
                // Widen digit so that we don't run into edge cases when w=8.
                let digit = digits[digit_index] as i16;
                if digit > 0 {
                    let b = (digit - 1) as usize;
                    buckets[b] = &buckets[b] + pt;
                } else if digit < 0 {
                    let b = (-digit - 1) as usize;
                    buckets[b] = &buckets[b] - pt;
                }
            }

            // Add the buckets applying the multiplication factor to each bucket.
            // The most efficient way to do that is to have a single sum with two running sums:
            // an intermediate sum from last bucket to the first, and a sum of intermediate sums.
            //
            // For example, to add buckets 1*A, 2*B, 3*C we need to add these points:
            //   C
            //   C B
            //   C B A   Sum = C + (C+B) + (C+B+A)
            let mut buckets_intermediate_sum = buckets[buckets_count - 1];
            let mut buckets_sum = buckets[buckets_count - 1];
            for i in (0..(buckets_count - 1)).rev() {
                buckets_intermediate_sum =
                    &buckets_intermediate_sum + &CachedPoint::from(buckets[i]);
                buckets_sum = &buckets_sum + &CachedPoint::from(buckets_intermediate_sum);
            }

            buckets_sum
        });

        // Take the high column as an initial value to avoid wasting time doubling the identity element in `fold()`.
        // `unwrap()` always succeeds because we know we have more than zero digits.
        let hi_column = columns.next().unwrap();

        // Horner-style combination: shift the running total up by one digit
        // (multiply by 2^w), then add the next lower column.
        Some(
            columns
                .fold(hi_column, |total, p| {
                    &total.mul_by_pow_2(w as u32) + &CachedPoint::from(p)
                })
                .into(),
        )
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use constants;
|
||||
use scalar::Scalar;
|
||||
|
||||
/// Compares `Pippenger::vartime_multiscalar_mul` with a plain sum of
/// individually multiplied points for input sizes 512, 256, ..., 1.
#[test]
fn test_vartime_pippenger() {
    // Build the largest input once; the loop below reuses prefixes of it.
    let max_len = 512;
    let x = Scalar::from(2128506u64).invert();
    let y = Scalar::from(4443282u64).invert();
    let points: Vec<_> = (0..max_len)
        .map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
        .collect();
    // fast way to make ~random but deterministic scalars
    let scalars: Vec<_> = (0..max_len)
        .map(|i| x + (Scalar::from(i as u64) * y))
        .collect();

    let premultiplied: Vec<EdwardsPoint> = scalars
        .iter()
        .zip(points.iter())
        .map(|(sc, pt)| sc * pt)
        .collect();

    let mut n = max_len;
    while n > 0 {
        let control: EdwardsPoint = premultiplied[..n].iter().sum();

        let subject =
            Pippenger::vartime_multiscalar_mul(scalars[..n].to_vec(), points[..n].to_vec());

        assert_eq!(subject.compress(), control.compress());

        n /= 2;
    }
}
|
||||
}
|
|
@ -1,107 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Henry de Valence.
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Precomputation for Straus's method.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use backend::vector::{CachedPoint, ExtendedPoint};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::Identity;
|
||||
use traits::VartimePrecomputedMultiscalarMul;
|
||||
use window::{NafLookupTable5, NafLookupTable8};
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
|
||||
/// Precomputed state for variable-time multiscalar multiplication with
/// Straus's method: lookup tables for a fixed set of "static" points.
pub struct VartimePrecomputedStraus {
    // One `NafLookupTable8` per static point, in the order the points were
    // passed to `new`.
    static_lookup_tables: Vec<NafLookupTable8<CachedPoint>>,
}
|
||||
|
||||
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
fn new<I>(static_points: I) -> Self
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Self::Point>,
|
||||
{
|
||||
Self {
|
||||
static_lookup_tables: static_points
|
||||
.into_iter()
|
||||
.map(|P| NafLookupTable8::<CachedPoint>::from(P.borrow()))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
fn optional_mixed_multiscalar_mul<I, J, K>(
|
||||
&self,
|
||||
static_scalars: I,
|
||||
dynamic_scalars: J,
|
||||
dynamic_points: K,
|
||||
) -> Option<Self::Point>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Scalar>,
|
||||
K: IntoIterator<Item = Option<Self::Point>>,
|
||||
{
|
||||
let static_nafs = static_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
let dynamic_nafs: Vec<_> = dynamic_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dynamic_lookup_tables = dynamic_points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<CachedPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let sp = self.static_lookup_tables.len();
|
||||
let dp = dynamic_lookup_tables.len();
|
||||
assert_eq!(sp, static_nafs.len());
|
||||
assert_eq!(dp, dynamic_nafs.len());
|
||||
|
||||
// We could save some doublings by looking for the highest
|
||||
// nonzero NAF coefficient, but since we might have a lot of
|
||||
// them to search, it's not clear it's worthwhile to check.
|
||||
let mut R = ExtendedPoint::identity();
|
||||
for j in (0..256).rev() {
|
||||
R = R.double();
|
||||
|
||||
for i in 0..dp {
|
||||
let t_ij = dynamic_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R + &dynamic_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R - &dynamic_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..sp {
|
||||
let t_ij = static_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R + &self.static_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R - &self.static_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some(R.into())
|
||||
}
|
||||
}
|
|
@ -1,107 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use zeroize::Zeroizing;
|
||||
|
||||
use backend::vector::{CachedPoint, ExtendedPoint};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use window::{LookupTable, NafLookupTable5};
|
||||
use traits::{Identity, MultiscalarMul, VartimeMultiscalarMul};
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Multiscalar multiplication using interleaved window / Straus'
/// method. See the `Straus` struct in the serial backend for more
/// details.
///
/// This exists as a separate implementation from that one because the
/// AVX2 code uses different curve models (it does not pass between
/// multiple models during scalar mul), and it has to convert the
/// point representation on the fly.
pub struct Straus {}
|
||||
|
||||
impl MultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<EdwardsPoint>,
|
||||
{
|
||||
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
|
||||
// for each input point P
|
||||
let lookup_tables: Vec<_> = points
|
||||
.into_iter()
|
||||
.map(|point| LookupTable::<CachedPoint>::from(point.borrow()))
|
||||
.collect();
|
||||
|
||||
let scalar_digits_vec: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|s| s.borrow().to_radix_16())
|
||||
.collect();
|
||||
// Pass ownership to a `Zeroizing` wrapper
|
||||
let scalar_digits = Zeroizing::new(scalar_digits_vec);
|
||||
|
||||
let mut Q = ExtendedPoint::identity();
|
||||
for j in (0..64).rev() {
|
||||
Q = Q.mul_by_pow_2(4);
|
||||
let it = scalar_digits.iter().zip(lookup_tables.iter());
|
||||
for (s_i, lookup_table_i) in it {
|
||||
// Q = Q + s_{i,j} * P_i
|
||||
Q = &Q + &lookup_table_i.select(s_i[j]);
|
||||
}
|
||||
}
|
||||
Q.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl VartimeMultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator<Item = Option<EdwardsPoint>>,
|
||||
{
|
||||
let nafs: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect();
|
||||
let lookup_tables: Vec<_> = points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<CachedPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let mut Q = ExtendedPoint::identity();
|
||||
|
||||
for i in (0..256).rev() {
|
||||
Q = Q.double();
|
||||
|
||||
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
|
||||
if naf[i] > 0 {
|
||||
Q = &Q + &lookup_table.select(naf[i] as usize);
|
||||
} else if naf[i] < 0 {
|
||||
Q = &Q - &lookup_table.select(-naf[i] as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some(Q.into())
|
||||
}
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
#![allow(non_snake_case)]
|
||||
|
||||
use backend::vector::{CachedPoint, ExtendedPoint};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::Identity;
|
||||
use window::LookupTable;
|
||||
|
||||
/// Perform constant-time, variable-base scalar multiplication.
|
||||
pub fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
|
||||
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
|
||||
let lookup_table = LookupTable::<CachedPoint>::from(point);
|
||||
// Setting s = scalar, compute
|
||||
//
|
||||
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
|
||||
//
|
||||
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
|
||||
let scalar_digits = scalar.to_radix_16();
|
||||
// Compute s*P as
|
||||
//
|
||||
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
|
||||
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
|
||||
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
|
||||
//
|
||||
// We sum right-to-left.
|
||||
let mut Q = ExtendedPoint::identity();
|
||||
for i in (0..64).rev() {
|
||||
Q = Q.mul_by_pow_2(4);
|
||||
Q = &Q + &lookup_table.select(scalar_digits[i]);
|
||||
}
|
||||
Q.into()
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use backend::vector::BASEPOINT_ODD_LOOKUP_TABLE;
|
||||
use backend::vector::{CachedPoint, ExtendedPoint};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::Identity;
|
||||
use window::NafLookupTable5;
|
||||
|
||||
/// Compute \\(aA + bB\\) in variable time, where \\(B\\) is the Ed25519 basepoint.
|
||||
pub fn mul(a: &Scalar, A: &EdwardsPoint, b: &Scalar) -> EdwardsPoint {
|
||||
let a_naf = a.non_adjacent_form(5);
|
||||
let b_naf = b.non_adjacent_form(8);
|
||||
|
||||
// Find starting index
|
||||
let mut i: usize = 255;
|
||||
for j in (0..256).rev() {
|
||||
i = j;
|
||||
if a_naf[i] != 0 || b_naf[i] != 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let table_A = NafLookupTable5::<CachedPoint>::from(A);
|
||||
let table_B = &BASEPOINT_ODD_LOOKUP_TABLE;
|
||||
|
||||
let mut Q = ExtendedPoint::identity();
|
||||
|
||||
loop {
|
||||
Q = Q.double();
|
||||
|
||||
if a_naf[i] > 0 {
|
||||
Q = &Q + &table_A.select(a_naf[i] as usize);
|
||||
} else if a_naf[i] < 0 {
|
||||
Q = &Q - &table_A.select(-a_naf[i] as usize);
|
||||
}
|
||||
|
||||
if b_naf[i] > 0 {
|
||||
Q = &Q + &table_B.select(b_naf[i] as usize);
|
||||
} else if b_naf[i] < 0 {
|
||||
Q = &Q - &table_B.select(-b_naf[i] as usize);
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
break;
|
||||
}
|
||||
i -= 1;
|
||||
}
|
||||
|
||||
Q.into()
|
||||
}
|
|
@ -1,176 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Various constants, such as the Ristretto and Ed25519 basepoints.
|
||||
//!
|
||||
//! Most of the constants are given with
|
||||
//! `LONG_DESCRIPTIVE_UPPER_CASE_NAMES`, but they can be brought into
|
||||
//! scope using a `let` binding:
|
||||
//!
|
||||
//! ```
|
||||
//! use curve25519_dalek::constants;
|
||||
//! use curve25519_dalek::traits::IsIdentity;
|
||||
//!
|
||||
//! let B = &constants::RISTRETTO_BASEPOINT_TABLE;
|
||||
//! let l = &constants::BASEPOINT_ORDER;
|
||||
//!
|
||||
//! let A = l * B;
|
||||
//! assert!(A.is_identity());
|
||||
//! ```
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use edwards::CompressedEdwardsY;
|
||||
use ristretto::RistrettoPoint;
|
||||
use ristretto::CompressedRistretto;
|
||||
use montgomery::MontgomeryPoint;
|
||||
use scalar::Scalar;
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub use backend::serial::u64::constants::*;
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub use backend::serial::u32::constants::*;
|
||||
|
||||
/// The Ed25519 basepoint, in `CompressedEdwardsY` format.
|
||||
///
|
||||
/// This is the little-endian byte encoding of \\( 4/5 \pmod p \\),
|
||||
/// which is the \\(y\\)-coordinate of the Ed25519 basepoint.
|
||||
///
|
||||
/// The sign bit is 0 since the basepoint has \\(x\\) chosen to be positive.
|
||||
pub const ED25519_BASEPOINT_COMPRESSED: CompressedEdwardsY =
|
||||
CompressedEdwardsY([0x58, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
|
||||
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
|
||||
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66,
|
||||
0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66]);
|
||||
|
||||
/// The X25519 basepoint, in `MontgomeryPoint` format.
///
/// The first byte is 9 and the remaining 31 bytes are zero, i.e. the
/// integer 9 when the array is read as little-endian.
// NOTE(review): presumably these bytes are the Montgomery u-coordinate --
// confirm against the `MontgomeryPoint` definition.
pub const X25519_BASEPOINT: MontgomeryPoint =
    MontgomeryPoint([0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]);
|
||||
|
||||
/// The Ristretto basepoint, in `CompressedRistretto` format.
|
||||
pub const RISTRETTO_BASEPOINT_COMPRESSED: CompressedRistretto =
|
||||
CompressedRistretto([0xe2, 0xf2, 0xae, 0x0a, 0x6a, 0xbc, 0x4e, 0x71,
|
||||
0xa8, 0x84, 0xa9, 0x61, 0xc5, 0x00, 0x51, 0x5f,
|
||||
0x58, 0xe3, 0x0b, 0x6a, 0xa5, 0x82, 0xdd, 0x8d,
|
||||
0xb6, 0xa6, 0x59, 0x45, 0xe0, 0x8d, 0x2d, 0x76]);
|
||||
|
||||
/// The Ristretto basepoint, as a `RistrettoPoint`.
|
||||
///
|
||||
/// This is called `_POINT` to distinguish it from `_TABLE`, which
|
||||
/// provides fast scalar multiplication.
|
||||
pub const RISTRETTO_BASEPOINT_POINT: RistrettoPoint = RistrettoPoint(ED25519_BASEPOINT_POINT);
|
||||
|
||||
/// `BASEPOINT_ORDER` is the order of the Ristretto group and of the Ed25519 basepoint, i.e.,
/// $$
/// \ell = 2^\{252\} + 27742317777372353535851937790883648493.
/// $$
///
/// The bytes below hold this value in little-endian order: the final byte
/// `0x10` supplies the \\(2^{252}\\) term and the first sixteen bytes hold
/// the remaining summand.
pub const BASEPOINT_ORDER: Scalar = Scalar{
    bytes: [
        0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
        0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
    ],
};
|
||||
|
||||
use ristretto::RistrettoBasepointTable;
|
||||
/// The Ristretto basepoint, as a `RistrettoBasepointTable` for scalar multiplication.
|
||||
pub const RISTRETTO_BASEPOINT_TABLE: RistrettoBasepointTable
|
||||
= RistrettoBasepointTable(ED25519_BASEPOINT_TABLE);
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use field::FieldElement;
|
||||
use traits::{IsIdentity, ValidityCheck};
|
||||
use constants;
|
||||
|
||||
/// Each of the first eight `EIGHT_TORSION` entries, multiplied by 2^3, must
/// be a valid point equal to the identity.
#[test]
fn test_eight_torsion() {
    (0..8usize)
        .map(|i| constants::EIGHT_TORSION[i].mul_by_pow_2(3))
        .for_each(|Q| {
            assert!(Q.is_valid());
            assert!(Q.is_identity());
        });
}
|
||||
|
||||
/// The `EIGHT_TORSION` entries at even indices (0, 2, 4, 6), multiplied by
/// 2^2, must be valid points equal to the identity.
#[test]
fn test_four_torsion() {
    (0..8usize)
        .step_by(2)
        .map(|i| constants::EIGHT_TORSION[i].mul_by_pow_2(2))
        .for_each(|Q| {
            assert!(Q.is_valid());
            assert!(Q.is_identity());
        });
}
|
||||
|
||||
/// The `EIGHT_TORSION` entries at indices 0 and 4, multiplied by 2, must be
/// valid points equal to the identity.
#[test]
fn test_two_torsion() {
    (0..8usize)
        .step_by(4)
        .map(|i| constants::EIGHT_TORSION[i].mul_by_pow_2(1))
        .for_each(|Q| {
            assert!(Q.is_valid());
            assert!(Q.is_identity());
        });
}
|
||||
|
||||
/// Test that SQRT_M1 is the positive square root of -1
#[test]
fn test_sqrt_minus_one() {
    let root = &constants::SQRT_M1;

    // Squaring the constant must give -1 ...
    assert_eq!(FieldElement::minus_one(), root * root);
    // ... and the constant itself must not be negative.
    assert_eq!(root.is_negative().unwrap_u8(), 0);
}
|
||||
|
||||
/// `invsqrt(-1)` must report a nonzero square, and its product with
/// `SQRT_M1` must equal -1.
#[test]
fn test_sqrt_constants_sign() {
    let neg_one = FieldElement::minus_one();

    let (is_nonzero_square, inv_root) = neg_one.invsqrt();
    assert_eq!(is_nonzero_square.unwrap_u8(), 1u8);

    assert_eq!(&inv_root * &constants::SQRT_M1, neg_one);
}
|
||||
|
||||
/// Test that d = -121665/121666
#[test]
#[cfg(feature = "u32_backend")]
fn test_d_vs_ratio() {
    use backend::serial::u32::field::FieldElement2625;

    let numerator = -&FieldElement2625([121665,0,0,0,0,0,0,0,0,0]);
    let denominator = FieldElement2625([121666,0,0,0,0,0,0,0,0,0]);

    // Recompute d, and 2d as d + d, and compare both with the stored constants.
    let ratio = &numerator * &denominator.invert();
    let twice_ratio = &ratio + &ratio;

    assert_eq!(ratio, constants::EDWARDS_D);
    assert_eq!(twice_ratio, constants::EDWARDS_D2);
}
|
||||
|
||||
/// Test that d = -121665/121666
#[test]
#[cfg(feature = "u64_backend")]
fn test_d_vs_ratio() {
    use backend::serial::u64::field::FieldElement51;

    let numerator = -&FieldElement51([121665,0,0,0,0]);
    let denominator = FieldElement51([121666,0,0,0,0]);

    // Recompute d, and 2d as d + d, and compare both with the stored constants.
    let ratio = &numerator * &denominator.invert();
    let twice_ratio = &ratio + &ratio;

    assert_eq!(ratio, constants::EDWARDS_D);
    assert_eq!(twice_ratio, constants::EDWARDS_D2);
}
|
||||
|
||||
/// Squaring `SQRT_AD_MINUS_ONE` must give a*d - 1 with a = -1.
#[test]
fn test_sqrt_ad_minus_one() {
    let neg_one = FieldElement::minus_one();

    // With a = -1, a*d - 1 is the same as a*d + a.
    let expected = &(&neg_one * &constants::EDWARDS_D) + &neg_one;

    assert_eq!(constants::SQRT_AD_MINUS_ONE.square(), expected);
}
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,460 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\).
|
||||
//!
|
||||
//! The `curve25519_dalek::field` module provides a type alias
|
||||
//! `curve25519_dalek::field::FieldElement` to a field element type
|
||||
//! defined in the `backend` module; either `FieldElement51` or
|
||||
//! `FieldElement2625`.
|
||||
//!
|
||||
//! Field operations defined in terms of machine
|
||||
//! operations, such as field multiplication or squaring, are defined in
|
||||
//! the backend implementation.
|
||||
//!
|
||||
//! Field operations defined in terms of other field operations, such as
|
||||
//! field inversion or square roots, are defined here.
|
||||
|
||||
use core::cmp::{Eq, PartialEq};
|
||||
|
||||
use subtle::ConditionallySelectable;
|
||||
use subtle::ConditionallyNegatable;
|
||||
use subtle::Choice;
|
||||
use subtle::ConstantTimeEq;
|
||||
|
||||
use constants;
|
||||
use backend;
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub use backend::serial::u64::field::*;
|
||||
/// A `FieldElement` represents an element of the field
|
||||
/// \\( \mathbb Z / (2\^{255} - 19)\\).
|
||||
///
|
||||
/// The `FieldElement` type is an alias for one of the platform-specific
|
||||
/// implementations.
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub type FieldElement = backend::serial::u64::field::FieldElement51;
|
||||
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub use backend::serial::u32::field::*;
|
||||
/// A `FieldElement` represents an element of the field
|
||||
/// \\( \mathbb Z / (2\^{255} - 19)\\).
|
||||
///
|
||||
/// The `FieldElement` type is an alias for one of the platform-specific
|
||||
/// implementations.
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub type FieldElement = backend::serial::u32::field::FieldElement2625;
|
||||
|
||||
// Marker impl only: equality itself is provided by the `PartialEq` impl for
// `FieldElement`, which delegates to the constant-time `ct_eq`.
impl Eq for FieldElement {}
|
||||
|
||||
impl PartialEq for FieldElement {
|
||||
fn eq(&self, other: &FieldElement) -> bool {
|
||||
self.ct_eq(other).unwrap_u8() == 1u8
|
||||
}
|
||||
}
|
||||
|
||||
impl ConstantTimeEq for FieldElement {
|
||||
/// Test equality between two `FieldElement`s. Since the
|
||||
/// internal representation is not canonical, the field elements
|
||||
/// are normalized to wire format before comparison.
|
||||
fn ct_eq(&self, other: &FieldElement) -> Choice {
|
||||
self.to_bytes().ct_eq(&other.to_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement {
|
||||
/// Determine if this `FieldElement` is negative, in the sense
|
||||
/// used in the ed25519 paper: `x` is negative if the low bit is
|
||||
/// set.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// If negative, return `Choice(1)`. Otherwise, return `Choice(0)`.
|
||||
pub fn is_negative(&self) -> Choice {
|
||||
let bytes = self.to_bytes();
|
||||
(bytes[0] & 1).into()
|
||||
}
|
||||
|
||||
/// Determine if this `FieldElement` is zero.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// If zero, return `Choice(1)`. Otherwise, return `Choice(0)`.
|
||||
pub fn is_zero(&self) -> Choice {
|
||||
let zero = [0u8; 32];
|
||||
let bytes = self.to_bytes();
|
||||
|
||||
bytes.ct_eq(&zero)
|
||||
}
|
||||
|
||||
/// Compute (self^(2^250-1), self^11), used as a helper function
|
||||
/// within invert() and pow22523().
|
||||
fn pow22501(&self) -> (FieldElement, FieldElement) {
|
||||
// Instead of managing which temporary variables are used
|
||||
// for what, we define as many as we need and leave stack
|
||||
// allocation to the compiler
|
||||
//
|
||||
// Each temporary variable t_i is of the form (self)^e_i.
|
||||
// Squaring t_i corresponds to multiplying e_i by 2,
|
||||
// so the pow2k function shifts e_i left by k places.
|
||||
// Multiplying t_i and t_j corresponds to adding e_i + e_j.
|
||||
//
|
||||
// Temporary t_i Nonzero bits of e_i
|
||||
//
|
||||
let t0 = self.square(); // 1 e_0 = 2^1
|
||||
let t1 = t0.square().square(); // 3 e_1 = 2^3
|
||||
let t2 = self * &t1; // 3,0 e_2 = 2^3 + 2^0
|
||||
let t3 = &t0 * &t2; // 3,1,0
|
||||
let t4 = t3.square(); // 4,2,1
|
||||
let t5 = &t2 * &t4; // 4,3,2,1,0
|
||||
let t6 = t5.pow2k(5); // 9,8,7,6,5
|
||||
let t7 = &t6 * &t5; // 9,8,7,6,5,4,3,2,1,0
|
||||
let t8 = t7.pow2k(10); // 19..10
|
||||
let t9 = &t8 * &t7; // 19..0
|
||||
let t10 = t9.pow2k(20); // 39..20
|
||||
let t11 = &t10 * &t9; // 39..0
|
||||
let t12 = t11.pow2k(10); // 49..10
|
||||
let t13 = &t12 * &t7; // 49..0
|
||||
let t14 = t13.pow2k(50); // 99..50
|
||||
let t15 = &t14 * &t13; // 99..0
|
||||
let t16 = t15.pow2k(100); // 199..100
|
||||
let t17 = &t16 * &t15; // 199..0
|
||||
let t18 = t17.pow2k(50); // 249..50
|
||||
let t19 = &t18 * &t13; // 249..0
|
||||
|
||||
(t19, t3)
|
||||
}
|
||||
|
||||
/// Given a slice of public `FieldElements`, replace each with its inverse.
|
||||
///
|
||||
/// All input `FieldElements` **MUST** be nonzero.
|
||||
#[cfg(feature = "alloc")]
|
||||
pub fn batch_invert(inputs: &mut [FieldElement]) {
|
||||
// Montgomery’s Trick and Fast Implementation of Masked AES
|
||||
// Genelle, Prouff and Quisquater
|
||||
// Section 3.2
|
||||
|
||||
let n = inputs.len();
|
||||
let mut scratch = vec![FieldElement::one(); n];
|
||||
|
||||
// Keep an accumulator of all of the previous products
|
||||
let mut acc = FieldElement::one();
|
||||
|
||||
// Pass through the input vector, recording the previous
|
||||
// products in the scratch space
|
||||
for (input, scratch) in inputs.iter().zip(scratch.iter_mut()) {
|
||||
*scratch = acc;
|
||||
acc = &acc * input;
|
||||
}
|
||||
|
||||
// acc is nonzero iff all inputs are nonzero
|
||||
assert_eq!(acc.is_zero().unwrap_u8(), 0);
|
||||
|
||||
// Compute the inverse of all products
|
||||
acc = acc.invert();
|
||||
|
||||
// Pass through the vector backwards to compute the inverses
|
||||
// in place
|
||||
for (input, scratch) in inputs.iter_mut().rev().zip(scratch.into_iter().rev()) {
|
||||
let tmp = &acc * input;
|
||||
*input = &acc * &scratch;
|
||||
acc = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a nonzero field element, compute its inverse.
|
||||
///
|
||||
/// The inverse is computed as self^(p-2), since
|
||||
/// x^(p-2)x = x^(p-1) = 1 (mod p).
|
||||
///
|
||||
/// This function returns zero on input zero.
|
||||
pub fn invert(&self) -> FieldElement {
|
||||
// The bits of p-2 = 2^255 -19 -2 are 11010111111...11.
|
||||
//
|
||||
// nonzero bits of exponent
|
||||
let (t19, t3) = self.pow22501(); // t19: 249..0 ; t3: 3,1,0
|
||||
let t20 = t19.pow2k(5); // 254..5
|
||||
let t21 = &t20 * &t3; // 254..5,3,1,0
|
||||
|
||||
t21
|
||||
}
|
||||
|
||||
/// Raise this field element to the power (p-5)/8 = 2^252 -3.
|
||||
fn pow_p58(&self) -> FieldElement {
|
||||
// The bits of (p-5)/8 are 101111.....11.
|
||||
//
|
||||
// nonzero bits of exponent
|
||||
let (t19, _) = self.pow22501(); // 249..0
|
||||
let t20 = t19.pow2k(2); // 251..2
|
||||
let t21 = self * &t20; // 251..2,0
|
||||
|
||||
t21
|
||||
}
|
||||
|
||||
/// Given `FieldElements` `u` and `v`, compute either `sqrt(u/v)`
|
||||
/// or `sqrt(i*u/v)` in constant time.
|
||||
///
|
||||
/// This function always returns the nonnegative square root.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// - `(Choice(1), +sqrt(u/v)) ` if `v` is nonzero and `u/v` is square;
|
||||
/// - `(Choice(1), zero) ` if `u` is zero;
|
||||
/// - `(Choice(0), zero) ` if `v` is zero and `u` is nonzero;
|
||||
/// - `(Choice(0), +sqrt(i*u/v))` if `u/v` is nonsquare (so `i*u/v` is square).
|
||||
///
|
||||
pub fn sqrt_ratio_i(u: &FieldElement, v: &FieldElement) -> (Choice, FieldElement) {
|
||||
// Using the same trick as in ed25519 decoding, we merge the
|
||||
// inversion, the square root, and the square test as follows.
|
||||
//
|
||||
// To compute sqrt(α), we can compute β = α^((p+3)/8).
|
||||
// Then β^2 = ±α, so multiplying β by sqrt(-1) if necessary
|
||||
// gives sqrt(α).
|
||||
//
|
||||
// To compute 1/sqrt(α), we observe that
|
||||
// 1/β = α^(p-1 - (p+3)/8) = α^((7p-11)/8)
|
||||
// = α^3 * (α^7)^((p-5)/8).
|
||||
//
|
||||
// We can therefore compute sqrt(u/v) = sqrt(u)/sqrt(v)
|
||||
// by first computing
|
||||
// r = u^((p+3)/8) v^(p-1-(p+3)/8)
|
||||
// = u u^((p-5)/8) v^3 (v^7)^((p-5)/8)
|
||||
// = (uv^3) (uv^7)^((p-5)/8).
|
||||
//
|
||||
// If v is nonzero and u/v is square, then r^2 = ±u/v,
|
||||
// so vr^2 = ±u.
|
||||
// If vr^2 = u, then sqrt(u/v) = r.
|
||||
// If vr^2 = -u, then sqrt(u/v) = r*sqrt(-1).
|
||||
//
|
||||
// If v is zero, r is also zero.
|
||||
|
||||
let v3 = &v.square() * v;
|
||||
let v7 = &v3.square() * v;
|
||||
let mut r = &(u * &v3) * &(u * &v7).pow_p58();
|
||||
let check = v * &r.square();
|
||||
|
||||
let i = &constants::SQRT_M1;
|
||||
|
||||
let correct_sign_sqrt = check.ct_eq( u);
|
||||
let flipped_sign_sqrt = check.ct_eq( &(-u));
|
||||
let flipped_sign_sqrt_i = check.ct_eq(&(&(-u)*i));
|
||||
|
||||
let r_prime = &constants::SQRT_M1 * &r;
|
||||
r.conditional_assign(&r_prime, flipped_sign_sqrt | flipped_sign_sqrt_i);
|
||||
|
||||
// Choose the nonnegative square root.
|
||||
let r_is_negative = r.is_negative();
|
||||
r.conditional_negate(r_is_negative);
|
||||
|
||||
let was_nonzero_square = correct_sign_sqrt | flipped_sign_sqrt;
|
||||
|
||||
(was_nonzero_square, r)
|
||||
}
|
||||
|
||||
/// Attempt to compute `sqrt(1/self)` in constant time.
|
||||
///
|
||||
/// Convenience wrapper around `sqrt_ratio_i`.
|
||||
///
|
||||
/// This function always returns the nonnegative square root.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// - `(Choice(1), +sqrt(1/self)) ` if `self` is a nonzero square;
|
||||
/// - `(Choice(0), zero) ` if `self` is zero;
|
||||
/// - `(Choice(0), +sqrt(i/self)) ` if `self` is a nonzero nonsquare;
|
||||
///
|
||||
pub fn invsqrt(&self) -> (Choice, FieldElement) {
|
||||
FieldElement::sqrt_ratio_i(&FieldElement::one(), self)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use field::*;
    use subtle::ConditionallyNegatable;

    /// Random element a of GF(2^255-19), from Sage
    /// a = 1070314506888354081329385823235218444233221\
    ///     2228051251926706380353716438957572
    static A_BYTES: [u8; 32] = [
        0x04, 0xfe, 0xdf, 0x98, 0xa7, 0xfa, 0x0a, 0x68,
        0x84, 0x92, 0xbd, 0x59, 0x08, 0x07, 0xa7, 0x03,
        0x9e, 0xd1, 0xf6, 0xf2, 0xe1, 0xd9, 0xe2, 0xa4,
        0xa4, 0x51, 0x47, 0x36, 0xf3, 0xc3, 0xa9, 0x17,
    ];

    /// Byte representation of a**2
    static ASQ_BYTES: [u8; 32] = [
        0x75, 0x97, 0x24, 0x9e, 0xe6, 0x06, 0xfe, 0xab,
        0x24, 0x04, 0x56, 0x68, 0x07, 0x91, 0x2d, 0x5d,
        0x0b, 0x0f, 0x3f, 0x1c, 0xb2, 0x6e, 0xf2, 0xe2,
        0x63, 0x9c, 0x12, 0xba, 0x73, 0x0b, 0xe3, 0x62,
    ];

    /// Byte representation of 1/a
    static AINV_BYTES: [u8; 32] = [
        0x96, 0x1b, 0xcd, 0x8d, 0x4d, 0x5e, 0xa2, 0x3a,
        0xe9, 0x36, 0x37, 0x93, 0xdb, 0x7b, 0x4d, 0x70,
        0xb8, 0x0d, 0xc0, 0x55, 0xd0, 0x4c, 0x1d, 0x7b,
        0x90, 0x71, 0xd8, 0xe9, 0xb6, 0x18, 0xe6, 0x30,
    ];

    /// Byte representation of a^((p-5)/8)
    static AP58_BYTES: [u8; 32] = [
        0x6a, 0x4f, 0x24, 0x89, 0x1f, 0x57, 0x60, 0x36,
        0xd0, 0xbe, 0x12, 0x3c, 0x8f, 0xf5, 0xb1, 0x59,
        0xe0, 0xf0, 0xb8, 0x1b, 0x20, 0xd2, 0xb5, 0x1f,
        0x15, 0x21, 0xf9, 0xe3, 0xe1, 0x61, 0x21, 0x55,
    ];

    /// `a * a` must equal the precomputed square.
    #[test]
    fn a_mul_a_vs_a_squared_constant() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let expected = FieldElement::from_bytes(&ASQ_BYTES);
        let product = &a * &a;
        assert_eq!(expected, product);
    }

    /// `a.square()` must equal the precomputed square.
    #[test]
    fn a_square_vs_a_squared_constant() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let expected = FieldElement::from_bytes(&ASQ_BYTES);
        let squared = a.square();
        assert_eq!(expected, squared);
    }

    /// `a.square2()` must equal twice the precomputed square.
    #[test]
    fn a_square2_vs_a_squared_constant() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let asq = FieldElement::from_bytes(&ASQ_BYTES);
        let twice_asq = &asq + &asq;
        assert_eq!(a.square2(), twice_asq);
    }

    /// `a.invert()` must match the precomputed inverse and multiply back
    /// to one.
    #[test]
    fn a_invert_vs_inverse_of_a_constant() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let expected = FieldElement::from_bytes(&AINV_BYTES);
        let computed = a.invert();
        assert_eq!(expected, computed);
        assert_eq!(FieldElement::one(), &a * &computed);
    }

    /// Batch inversion must agree with element-by-element inversion.
    #[test]
    fn batch_invert_a_matches_nonbatched() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let ap58 = FieldElement::from_bytes(&AP58_BYTES);
        let asq = FieldElement::from_bytes(&ASQ_BYTES);
        let ainv = FieldElement::from_bytes(&AINV_BYTES);
        let a2 = &a + &a;

        let originals = vec![a, ap58, asq, ainv, a2];
        let mut inverted = originals.clone();
        FieldElement::batch_invert(&mut inverted[..]);

        for (original, batched) in originals.iter().zip(inverted.iter()) {
            assert_eq!(original.invert(), *batched);
        }
    }

    /// Exercise every documented return case of `sqrt_ratio_i`.
    #[test]
    fn sqrt_ratio_behavior() {
        let zero = FieldElement::zero();
        let one = FieldElement::one();
        let i = constants::SQRT_M1;
        let two = &one + &one; // 2 is nonsquare mod p.
        let four = &two + &two; // 4 is square mod p.

        // 0/0 should return (1, 0) since u is 0
        let (ok, root) = FieldElement::sqrt_ratio_i(&zero, &zero);
        assert_eq!(ok.unwrap_u8(), 1);
        assert_eq!(root, zero);
        assert_eq!(root.is_negative().unwrap_u8(), 0);

        // 1/0 should return (0, 0) since v is 0, u is nonzero
        let (ok, root) = FieldElement::sqrt_ratio_i(&one, &zero);
        assert_eq!(ok.unwrap_u8(), 0);
        assert_eq!(root, zero);
        assert_eq!(root.is_negative().unwrap_u8(), 0);

        // 2/1 is nonsquare, so we expect (0, sqrt(i*2))
        let (ok, root) = FieldElement::sqrt_ratio_i(&two, &one);
        assert_eq!(ok.unwrap_u8(), 0);
        assert_eq!(root.square(), &two * &i);
        assert_eq!(root.is_negative().unwrap_u8(), 0);

        // 4/1 is square, so we expect (1, sqrt(4))
        let (ok, root) = FieldElement::sqrt_ratio_i(&four, &one);
        assert_eq!(ok.unwrap_u8(), 1);
        assert_eq!(root.square(), four);
        assert_eq!(root.is_negative().unwrap_u8(), 0);

        // 1/4 is square, so we expect (1, 1/sqrt(4))
        let (ok, root) = FieldElement::sqrt_ratio_i(&one, &four);
        assert_eq!(ok.unwrap_u8(), 1);
        assert_eq!(&root.square() * &four, one);
        assert_eq!(root.is_negative().unwrap_u8(), 0);
    }

    /// `a.pow_p58()` must equal the precomputed a^((p-5)/8).
    #[test]
    fn a_p58_vs_ap58_constant() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let expected = FieldElement::from_bytes(&AP58_BYTES);
        let computed = a.pow_p58();
        assert_eq!(expected, computed);
    }

    /// `==` and `!=` behave as expected on equal and distinct elements.
    #[test]
    fn equality() {
        let a = FieldElement::from_bytes(&A_BYTES);
        let a_inverse = FieldElement::from_bytes(&AINV_BYTES);
        assert!(a == a);
        assert!(a != a_inverse);
    }

    /// Notice that the last element has the high bit set, which
    /// should be ignored
    static B_BYTES: [u8; 32] = [
        113, 191, 169, 143,  91, 234, 121,  15,
        241, 131, 217,  36, 230, 101,  92, 234,
          8, 208, 170, 251,  97, 127,  70, 210,
         58,  23, 166,  87, 240, 169, 184, 178,
    ];

    /// Decoding must give the same element with or without the top bit.
    #[test]
    fn from_bytes_highbit_is_ignored() {
        let mut masked = B_BYTES;
        masked[31] &= 0x7f;
        let from_raw = FieldElement::from_bytes(&B_BYTES);
        let from_masked = FieldElement::from_bytes(&masked);
        assert_eq!(from_masked, from_raw);
    }

    /// `conditional_negate` flips the sign only when the choice is 1.
    #[test]
    fn conditional_negate() {
        let one = FieldElement::one();
        let minus_one = FieldElement::minus_one();
        let mut value = one;

        value.conditional_negate(Choice::from(1));
        assert_eq!(value, minus_one);

        value.conditional_negate(Choice::from(0));
        assert_eq!(value, minus_one);

        value.conditional_negate(Choice::from(1));
        assert_eq!(value, one);
    }

    /// A non-canonical input encoding must serialize back canonically.
    #[test]
    fn encoding_is_canonical() {
        // Encode 1 wrongly as 1 + (2^255 - 19) = 2^255 - 18:
        // 0xee, then thirty 0xff bytes, then 0x7f.
        let mut one_encoded_wrongly_bytes = [0xffu8; 32];
        one_encoded_wrongly_bytes[0] = 0xee;
        one_encoded_wrongly_bytes[31] = 0x7f;

        // Decode to a field element
        let one = FieldElement::from_bytes(&one_encoded_wrongly_bytes);

        // .. then check that the encoding is correct
        let one_bytes = one.to_bytes();
        assert_eq!(one_bytes[0], 1);
        for byte in one_bytes[1..].iter() {
            assert_eq!(*byte, 0);
        }
    }

    /// Batch inversion of an empty slice must not panic.
    #[test]
    fn batch_invert_empty() {
        let mut nothing: [FieldElement; 0] = [];
        FieldElement::batch_invert(&mut nothing);
    }
}
|
|
@ -1,100 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
#![no_std]
|
||||
#![cfg_attr(feature = "nightly", feature(test))]
|
||||
#![cfg_attr(all(feature = "alloc", not(feature = "std")), feature(alloc))]
|
||||
#![cfg_attr(feature = "nightly", feature(external_doc))]
|
||||
#![cfg_attr(feature = "nightly", feature(doc_cfg))]
|
||||
#![cfg_attr(feature = "simd_backend", feature(stdsimd))]
|
||||
// Refuse to compile if documentation is missing, but only on nightly.
|
||||
//
|
||||
// This means that missing docs will still fail CI, but means we can use
|
||||
// README.md as the crate documentation.
|
||||
//#![cfg_attr(feature = "nightly", deny(missing_docs))]
|
||||
|
||||
#![cfg_attr(feature = "nightly", doc(include = "../README.md"))]
|
||||
#![doc(html_logo_url = "https://doc.dalek.rs/assets/dalek-logo-clear.png")]
|
||||
|
||||
//! Note that docs will only build on nightly Rust until
|
||||
//! [RFC 1990 stabilizes](https://github.com/rust-lang/rust/issues/44732).
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// External dependencies:
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
#[cfg(all(feature = "alloc", not(feature = "std")))]
|
||||
#[macro_use]
|
||||
extern crate alloc;
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
#[macro_use]
|
||||
extern crate std;
|
||||
|
||||
#[cfg(all(feature = "nightly", feature = "packed_simd"))]
|
||||
extern crate packed_simd;
|
||||
|
||||
extern crate byteorder;
|
||||
pub extern crate digest;
|
||||
extern crate rand_core;
|
||||
extern crate zeroize;
|
||||
|
||||
// Used for traits related to constant-time code.
|
||||
pub extern crate subtle;
|
||||
|
||||
#[cfg(all(test, feature = "serde"))]
|
||||
extern crate bincode;
|
||||
#[cfg(feature = "serde")]
|
||||
extern crate serde;
|
||||
|
||||
// Internal macros. Must come first!
|
||||
#[macro_use]
|
||||
pub(crate) mod macros;
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// curve25519-dalek public modules
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
// Scalar arithmetic mod l = 2^252 + ..., the order of the Ristretto group
|
||||
pub mod scalar;
|
||||
|
||||
// Point operations on the Montgomery form of Curve25519
|
||||
pub mod montgomery;
|
||||
|
||||
// Point operations on the Edwards form of Curve25519
|
||||
pub mod edwards;
|
||||
|
||||
// Group operations on the Ristretto group
|
||||
pub mod ristretto;
|
||||
|
||||
// Useful constants, like the Ed25519 basepoint
|
||||
pub mod constants;
|
||||
|
||||
// External (and internal) traits.
|
||||
pub mod traits;
|
||||
|
||||
// All the lizard code is here, for now
|
||||
pub mod lizard;
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// curve25519-dalek internal modules
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
// Finite field arithmetic mod p = 2^255 - 19
|
||||
pub mod field;
|
||||
|
||||
// Arithmetic backends (using u32, u64, etc) live here
|
||||
pub(crate) mod backend;
|
||||
|
||||
// Crate-local prelude (for alloc-dependent features like `Vec`)
|
||||
pub(crate) mod prelude;
|
||||
|
||||
// Generic code for window lookups
|
||||
pub(crate) mod window;
|
|
@ -1,21 +0,0 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2019 Bas Westerbaan
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -1,74 +0,0 @@
|
|||
//! Helper functions for use with Lizard
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConstantTimeEq;
|
||||
use subtle::ConditionallyNegatable;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use constants;
|
||||
use lizard::lizard_constants;
|
||||
|
||||
use field::FieldElement;
|
||||
|
||||
|
||||
/// Represents a point (s,t) on the the Jacobi quartic associated
|
||||
/// to the Edwards curve.
|
||||
#[derive(Copy, Clone)]
|
||||
#[allow(missing_docs)]
|
||||
pub struct JacobiPoint {
|
||||
pub S: FieldElement,
|
||||
pub T: FieldElement,
|
||||
}
|
||||
|
||||
impl JacobiPoint {
|
||||
/// Elligator2 is defined in two steps: first a field element is converted
|
||||
/// to a point (s,t) on the Jacobi quartic associated to the Edwards curve.
|
||||
/// Then this point is mapped to a point on the Edwards curve.
|
||||
/// This function computes a field element that is mapped to a given (s,t)
|
||||
/// with Elligator2 if it exists.
|
||||
pub(crate) fn elligator_inv(&self) -> (Choice, FieldElement) {
|
||||
let mut out = FieldElement::zero();
|
||||
|
||||
// Special case: s = 0. If s is zero, either t = 1 or t = -1.
|
||||
// If t=1, then sqrt(i*d) is the preimage. Otherwise it's 0.
|
||||
let s_is_zero = self.S.is_zero();
|
||||
let t_equals_one = self.T.ct_eq(&FieldElement::one());
|
||||
out.conditional_assign(&lizard_constants::SQRT_ID, t_equals_one);
|
||||
let mut ret = s_is_zero;
|
||||
let mut done = s_is_zero;
|
||||
|
||||
// a := (t+1) (d+1)/(d-1)
|
||||
let a = &(&self.T + &FieldElement::one()) * &lizard_constants::DP1_OVER_DM1;
|
||||
let a2 = a.square();
|
||||
|
||||
// y := 1/sqrt(i (s^4 - a^2)).
|
||||
let s2 = self.S.square();
|
||||
let s4 = s2.square();
|
||||
let invSqY = &(&s4 - &a2) * &constants::SQRT_M1;
|
||||
|
||||
// There is no preimage if the square root of i*(s^4-a^2) does not exist.
|
||||
let (sq, y) = invSqY.invsqrt();
|
||||
ret |= sq;
|
||||
done |= !sq;
|
||||
|
||||
// x := (a + sign(s)*s^2) y
|
||||
let mut pms2 = s2;
|
||||
pms2.conditional_negate(self.S.is_negative());
|
||||
let mut x = &(&a + &pms2) * &y;
|
||||
let x_is_negative = x.is_negative();
|
||||
x.conditional_negate(x_is_negative);
|
||||
out.conditional_assign(&x, !done);
|
||||
|
||||
(ret, out)
|
||||
}
|
||||
|
||||
pub(crate) fn dual(&self) -> JacobiPoint {
|
||||
JacobiPoint {
|
||||
S: -(&self.S),
|
||||
T: -(&self.T),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
//! Constants for use in Lizard
|
||||
//!
|
||||
//! Could be moved into backend/serial/u??/constants.rs
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub(crate) use lizard::u64_constants::*;
|
||||
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub(crate) use lizard::u32_constants::*;
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
#[cfg(all(test, feature = "stage2_build"))]
mod test {

    use super::*;
    use constants;
    use field::FieldElement;

    /// Recompute each Lizard constant from the base curve constants and
    /// compare it with the stored value.
    #[test]
    fn test_lizard_constants() {
        let one = FieldElement::one();

        // SQRT_ID = sqrt(i*d)
        let i_times_d = &constants::SQRT_M1 * &constants::EDWARDS_D;
        let (_, sqrt_id) = FieldElement::sqrt_ratio_i(&i_times_d, &one);
        assert_eq!(sqrt_id, SQRT_ID);

        // DP1_OVER_DM1 = (d+1)/(d-1)
        let d_plus_one = &constants::EDWARDS_D + &one;
        let d_minus_one = &constants::EDWARDS_D - &one;
        assert_eq!(&d_plus_one * &d_minus_one.invert(), DP1_OVER_DM1);

        // MDOUBLE_INVSQRT_A_MINUS_D = -2 * INVSQRT_A_MINUS_D
        let doubled = &constants::INVSQRT_A_MINUS_D + &constants::INVSQRT_A_MINUS_D;
        assert_eq!(MDOUBLE_INVSQRT_A_MINUS_D, -&doubled);

        // MIDOUBLE_INVSQRT_A_MINUS_D = i * MDOUBLE_INVSQRT_A_MINUS_D
        let rotated = &MDOUBLE_INVSQRT_A_MINUS_D * &constants::SQRT_M1;
        assert_eq!(MIDOUBLE_INVSQRT_A_MINUS_D, rotated);

        // MINVSQRT_ONE_PLUS_D = -1/sqrt(1+d)
        let one_plus_d = &constants::EDWARDS_D + &one;
        let (_, invsqrt_one_plus_d) = one_plus_d.invsqrt();
        assert_eq!(-&invsqrt_one_plus_d, MINVSQRT_ONE_PLUS_D);
    }
}
|
|
@ -1,305 +0,0 @@
|
|||
//! Defines additional methods on RistrettoPoint for Lizard
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use digest::Digest;
|
||||
use digest::generic_array::typenum::U32;
|
||||
|
||||
use constants;
|
||||
use field::FieldElement;
|
||||
|
||||
use subtle::ConditionallySelectable;
|
||||
use subtle::ConstantTimeEq;
|
||||
use subtle::Choice;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
|
||||
use lizard::jacobi_quartic::JacobiPoint;
|
||||
use lizard::lizard_constants;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
use ristretto::RistrettoPoint;
|
||||
|
||||
|
||||
impl RistrettoPoint {
|
||||
|
||||
pub fn from_uniform_bytes_single_elligator(bytes: &[u8; 32]) -> RistrettoPoint {
|
||||
RistrettoPoint::elligator_ristretto_flavor(&FieldElement::from_bytes(&bytes))
|
||||
}
|
||||
|
||||
/// Encode 16 bytes of data to a RistrettoPoint, using the Lizard method
|
||||
pub fn lizard_encode<D: Digest>(data: &[u8; 16]) -> RistrettoPoint
|
||||
where D: Digest<OutputSize = U32>
|
||||
{
|
||||
let mut fe_bytes: [u8;32] = Default::default();
|
||||
|
||||
let digest = D::digest(data);
|
||||
fe_bytes[0..32].copy_from_slice(digest.as_slice());
|
||||
fe_bytes[8..24].copy_from_slice(data);
|
||||
fe_bytes[0] &= 254; // make positive since Elligator on r and -r is the same
|
||||
fe_bytes[31] &= 63;
|
||||
let fe = FieldElement::from_bytes(&fe_bytes);
|
||||
RistrettoPoint::elligator_ristretto_flavor(&fe)
|
||||
}
|
||||
|
||||
/// Decode 16 bytes of data from a RistrettoPoint, using the Lizard method
|
||||
pub fn lizard_decode<D: Digest>(&self) -> Option<[u8; 16]>
|
||||
where D: Digest<OutputSize = U32>
|
||||
{
|
||||
let mut result: [u8; 16] = Default::default();
|
||||
let mut h: [u8;32] = Default::default();
|
||||
let (mask, fes) = self.elligator_ristretto_flavor_inverse();
|
||||
let mut n_found = 0;
|
||||
for j in 0..8 {
|
||||
let mut ok = Choice::from((mask >> j) & 1);
|
||||
let buf2 = fes[j].to_bytes(); // array
|
||||
h.copy_from_slice(&D::digest(&buf2[8..24])); // array
|
||||
h[8..24].copy_from_slice(&buf2[8..24]);
|
||||
h[0] &= 254;
|
||||
h[31] &= 63;
|
||||
ok &= h.ct_eq(&buf2);
|
||||
for i in 0..16 {
|
||||
result[i] = u8::conditional_select(&result[i], &buf2[8+i], ok);
|
||||
}
|
||||
n_found += ok.unwrap_u8();
|
||||
}
|
||||
if n_found == 1 {
|
||||
return Some(result);
|
||||
}
|
||||
else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_253_bits(data: &[u8; 32]) -> Option<RistrettoPoint>
|
||||
{
|
||||
if data.len() != 32 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let fe = FieldElement::from_bytes(data);
|
||||
let p = RistrettoPoint::elligator_ristretto_flavor(&fe);
|
||||
Some(p)
|
||||
}
|
||||
|
||||
|
||||
pub fn decode_253_bits(&self) -> (u8, [[u8; 32]; 8])
|
||||
{
|
||||
let mut ret = [ [0u8; 32]; 8];
|
||||
let (mask, fes) = self.elligator_ristretto_flavor_inverse();
|
||||
|
||||
for j in 0..8 {
|
||||
ret[j] = fes[j].to_bytes();
|
||||
}
|
||||
(mask, ret)
|
||||
}
|
||||
|
||||
/// Return the coset self + E[4], for debugging.
|
||||
pub fn xcoset4(&self) -> [EdwardsPoint; 4] {
|
||||
[ self.0
|
||||
, &self.0 + &constants::EIGHT_TORSION[2]
|
||||
, &self.0 + &constants::EIGHT_TORSION[4]
|
||||
, &self.0 + &constants::EIGHT_TORSION[6]
|
||||
]
|
||||
}
|
||||
|
||||
/// Computes the at most 8 positive FieldElements f such that
|
||||
/// self == elligator_ristretto_flavor(f).
|
||||
/// Assumes self is even.
|
||||
///
|
||||
/// Returns a bitmask of which elements in fes are set.
|
||||
pub fn elligator_ristretto_flavor_inverse(&self) -> (u8, [FieldElement; 8]) {
|
||||
// Elligator2 computes a Point from a FieldElement in two steps: first
|
||||
// it computes a (s,t) on the Jacobi quartic and then computes the
|
||||
// corresponding even point on the Edwards curve.
|
||||
//
|
||||
// We invert in three steps. Any Ristretto point has four representatives
|
||||
// as even Edwards points. For each of those even Edwards points,
|
||||
// there are two points on the Jacobi quartic that map to it.
|
||||
// Each of those eight points on the Jacobi quartic might have an
|
||||
// Elligator2 preimage.
|
||||
//
|
||||
// Essentially we first loop over the four representatives of our point,
|
||||
// then for each of them consider both points on the Jacobi quartic and
|
||||
// check whether they have an inverse under Elligator2. We take the
|
||||
// following shortcut though.
|
||||
//
|
||||
// We can compute two Jacobi quartic points for (x,y) and (-x,-y)
|
||||
// at the same time. The four Jacobi quartic points are two of
|
||||
// such pairs.
|
||||
|
||||
let mut mask : u8 = 0;
|
||||
let jcs = self.to_jacobi_quartic_ristretto();
|
||||
let mut ret = [FieldElement::one(); 8];
|
||||
|
||||
for i in 0..4 {
|
||||
let (ok, fe) = jcs[i].elligator_inv();
|
||||
let mut tmp : u8 = 0;
|
||||
ret[2*i] = fe;
|
||||
tmp.conditional_assign(&1, ok);
|
||||
mask |= tmp << (2 * i);
|
||||
|
||||
let jc = jcs[i].dual();
|
||||
let (ok, fe) = jc.elligator_inv();
|
||||
let mut tmp : u8 = 0;
|
||||
ret[2*i+1] = fe;
|
||||
tmp.conditional_assign(&1, ok);
|
||||
mask |= tmp << (2 * i + 1);
|
||||
}
|
||||
|
||||
return (mask, ret)
|
||||
}
|
||||
|
||||
/// Find a point on the Jacobi quartic associated to each of the four
|
||||
/// points Ristretto equivalent to p.
|
||||
///
|
||||
/// There is one exception: for (0,-1) there is no point on the quartic and
|
||||
/// so we repeat one on the quartic equivalent to (0,1).
|
||||
fn to_jacobi_quartic_ristretto(&self) -> [JacobiPoint; 4] {
|
||||
let x2 = self.0.X.square(); // X^2
|
||||
let y2 = self.0.Y.square(); // Y^2
|
||||
let y4 = y2.square(); // Y^4
|
||||
let z2 = self.0.Z.square(); // Z^2
|
||||
let z_min_y = &self.0.Z - &self.0.Y; // Z - Y
|
||||
let z_pl_y = &self.0.Z + &self.0.Y; // Z + Y
|
||||
let z2_min_y2 = &z2 - &y2; // Z^2 - Y^2
|
||||
|
||||
// gamma := 1/sqrt( Y^4 X^2 (Z^2 - Y^2) )
|
||||
let (_, gamma) = (&(&y4 * &x2) * &z2_min_y2).invsqrt();
|
||||
|
||||
let den = &gamma * &y2;
|
||||
|
||||
let s_over_x = &den * &z_min_y;
|
||||
let sp_over_xp = &den * &z_pl_y;
|
||||
|
||||
let s0 = &s_over_x * &self.0.X;
|
||||
let s1 = &(-(&sp_over_xp)) * &self.0.X;
|
||||
|
||||
// t_0 := -2/sqrt(-d-1) * Z * sOverX
|
||||
// t_1 := -2/sqrt(-d-1) * Z * spOverXp
|
||||
let tmp = &lizard_constants::MDOUBLE_INVSQRT_A_MINUS_D * &self.0.Z;
|
||||
let mut t0 = &tmp * &s_over_x;
|
||||
let mut t1 = &tmp * &sp_over_xp;
|
||||
|
||||
// den := -1/sqrt(1+d) (Y^2 - Z^2) gamma
|
||||
let den = &(&(-(&z2_min_y2)) * &lizard_constants::MINVSQRT_ONE_PLUS_D) * γ
|
||||
|
||||
// Same as before but with the substitution (X, Y, Z) = (Y, X, i*Z)
|
||||
let iz = &constants::SQRT_M1 * &self.0.Z; // iZ
|
||||
let iz_min_x = &iz - &self.0.X; // iZ - X
|
||||
let iz_pl_x = &iz + &self.0.X; // iZ + X
|
||||
|
||||
let s_over_y = &den * &iz_min_x;
|
||||
let sp_over_yp = &den * &iz_pl_x;
|
||||
|
||||
let mut s2 = &s_over_y * &self.0.Y;
|
||||
let mut s3 = &(-(&sp_over_yp)) * &self.0.Y;
|
||||
|
||||
// t_2 := -2/sqrt(-d-1) * i*Z * sOverY
|
||||
// t_3 := -2/sqrt(-d-1) * i*Z * spOverYp
|
||||
let tmp = &lizard_constants::MDOUBLE_INVSQRT_A_MINUS_D * &iz;
|
||||
let mut t2 = &tmp * &s_over_y;
|
||||
let mut t3 = &tmp * &sp_over_yp;
|
||||
|
||||
// Special case: X=0 or Y=0. Then return
|
||||
//
|
||||
// (0,1) (1,-2i/sqrt(-d-1) (-1,-2i/sqrt(-d-1))
|
||||
//
|
||||
// Note that if X=0 or Y=0, then s_i = t_i = 0.
|
||||
let x_or_y_is_zero = self.0.X.is_zero() | self.0.Y.is_zero();
|
||||
t0.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
|
||||
t1.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
|
||||
t2.conditional_assign(&lizard_constants::MIDOUBLE_INVSQRT_A_MINUS_D, x_or_y_is_zero);
|
||||
t3.conditional_assign(&lizard_constants::MIDOUBLE_INVSQRT_A_MINUS_D, x_or_y_is_zero);
|
||||
s2.conditional_assign(&FieldElement::one(), x_or_y_is_zero);
|
||||
s3.conditional_assign(&(-(&FieldElement::one())), x_or_y_is_zero);
|
||||
|
||||
return [
|
||||
JacobiPoint{S: s0, T: t0},
|
||||
JacobiPoint{S: s1, T: t1},
|
||||
JacobiPoint{S: s2, T: t2},
|
||||
JacobiPoint{S: s3, T: t3},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
#[cfg(all(test, feature = "stage2_build"))]
mod test {

    extern crate sha2;

    #[cfg(feature = "rand")]
    use rand_os::OsRng;
    use rand_core::{RngCore};
    use self::sha2::{Sha256};
    use ristretto::CompressedRistretto;
    use super::*;

    /// Encode `data` with Lizard, check the compressed point against the
    /// expected `result` bytes, then decompress and decode it again and
    /// check that the original 16 bytes come back.
    fn test_lizard_encode_helper(data: &[u8; 16], result: &[u8; 32]) {
        let p = RistrettoPoint::lizard_encode::<Sha256>(data).unwrap();
        let p_bytes = p.compress().to_bytes();
        assert!(&p_bytes == result);
        let p = CompressedRistretto::from_slice(&p_bytes).decompress().unwrap();
        let data_out = p.lizard_decode::<Sha256>().unwrap();
        assert!(&data_out == data);
    }

    /// Known-answer vectors for the Lizard encoding.
    #[test]
    fn test_lizard_encode() {
        test_lizard_encode_helper(
            &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            &[
                0xf0, 0xb7, 0xe3, 0x44, 0x84, 0xf7, 0x4c, 0xf0,
                0xf, 0x15, 0x2, 0x4b, 0x73, 0x85, 0x39, 0x73,
                0x86, 0x46, 0xbb, 0xbe, 0x1e, 0x9b, 0xc7, 0x50,
                0x9a, 0x67, 0x68, 0x15, 0x22, 0x7e, 0x77, 0x4f,
            ],
        );

        test_lizard_encode_helper(
            &[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            &[
                0xcc, 0x92, 0xe8, 0x1f, 0x58, 0x5a, 0xfc, 0x5c,
                0xaa, 0xc8, 0x86, 0x60, 0xd8, 0xd1, 0x7e, 0x90,
                0x25, 0xa4, 0x44, 0x89, 0xa3, 0x63, 0x4, 0x21,
                0x23, 0xf6, 0xaf, 0x7, 0x2, 0x15, 0x6e, 0x65,
            ],
        );

        test_lizard_encode_helper(
            &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            &[
                0xc8, 0x30, 0x57, 0x3f, 0x8a, 0x8e, 0x77, 0x78,
                0x67, 0x1f, 0x76, 0xcd, 0xc7, 0x96, 0xdc, 0xa,
                0x23, 0x5c, 0xf1, 0x77, 0xf1, 0x97, 0xd9, 0xfc,
                0xba, 0x6, 0xe8, 0x4e, 0x96, 0x24, 0x74, 0x44,
            ],
        );
    }

    /// For two fixed corner cases and 98 random field elements, map the
    /// element to a point and check that inverting the map on every element
    /// of `xcoset4()` yields only valid preimages, one of which is the
    /// element we started from.
    #[test]
    fn test_elligator_inv() {
        let mut rng = rand::thread_rng();

        for i in 0..100 {
            let mut fe_bytes = [0u8; 32];

            if i == 0 {
                // Test for first corner-case: fe = 0
                fe_bytes = [0u8; 32];
            } else if i == 1 {
                // Test for second corner-case: fe = +sqrt(i*d)
                fe_bytes = [168, 27, 92, 74, 203, 42, 48, 117, 170, 109, 234,
                            14, 45, 169, 188, 205, 21, 110, 235, 115, 153, 84,
                            52, 117, 151, 235, 123, 244, 88, 85, 179, 5];
            } else {
                // For the rest, just generate a random field element to test.
                rng.fill_bytes(&mut fe_bytes);
            }
            fe_bytes[0] &= 254; // positive
            fe_bytes[31] &= 127; // < 2^255-19
            let fe = FieldElement::from_bytes(&fe_bytes);

            let pt = RistrettoPoint::elligator_ristretto_flavor(&fe);
            for pt2 in &pt.xcoset4() {
                let (mask, fes) = RistrettoPoint(*pt2).elligator_ristretto_flavor_inverse();

                let mut found = false;
                for j in 0..8 {
                    // Only slots flagged in `mask` hold valid preimages.
                    if mask & (1 << j) != 0 {
                        assert_eq!(RistrettoPoint::elligator_ristretto_flavor(&fes[j]), pt);
                        if fes[j] == fe {
                            found = true;
                        }
                    }
                }
                assert!(found);
            }
        }
    }
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
//! The Lizard method for encoding/decoding 16 bytes into Ristretto points.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
#[cfg(feature = "u32_backend")]
|
||||
mod u32_constants;
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
mod u64_constants;
|
||||
|
||||
pub mod lizard_constants;
|
||||
pub mod jacobi_quartic;
|
||||
pub mod lizard_ristretto;
|
|
@ -1,33 +0,0 @@
|
|||
use backend::serial::u32::field::FieldElement2625;
|
||||
use edwards::EdwardsPoint;
|
||||
|
||||
/// `= sqrt(i*d)`, where `i = +sqrt(-1)` and `d` is the Edwards curve parameter.
|
||||
pub const SQRT_ID: FieldElement2625 = FieldElement2625([
|
||||
39590824, 701138, 28659366, 23623507, 53932708,
|
||||
32206357, 36326585, 24309414, 26167230, 1494357,
|
||||
]);
|
||||
|
||||
/// `= (d+1)/(d-1)`, where `d` is the Edwards curve parameter.
|
||||
pub const DP1_OVER_DM1: FieldElement2625 = FieldElement2625([
|
||||
58833708, 32184294, 62457071, 26110240, 19032991,
|
||||
27203620, 7122892, 18068959, 51019405, 3776288,
|
||||
]);
|
||||
|
||||
/// `= -2/sqrt(a-d)`, where `a = -1 (mod p)`, `d` are the Edwards curve parameters.
|
||||
pub const MDOUBLE_INVSQRT_A_MINUS_D: FieldElement2625 = FieldElement2625([
|
||||
54885894, 25242303, 55597453, 9067496, 51808079,
|
||||
33312638, 25456129, 14121551, 54921728, 3972023,
|
||||
]);
|
||||
|
||||
/// `= -2i/sqrt(a-d)`, where `a = -1 (mod p)`, `d` are the Edwards curve parameters
|
||||
/// and `i = +sqrt(-1)`.
|
||||
pub const MIDOUBLE_INVSQRT_A_MINUS_D: FieldElement2625 = FieldElement2625([
|
||||
58178520, 23970840, 26444491, 29801899, 41064376,
|
||||
743696, 2900628, 27920316, 41968995, 5270573,
|
||||
]);
|
||||
|
||||
/// `= -1/sqrt(1+d)`, where `d` is the Edwards curve parameters.
|
||||
pub const MINVSQRT_ONE_PLUS_D: FieldElement2625 = FieldElement2625([
|
||||
38019585, 4791795, 20332186, 18653482, 46576675,
|
||||
33182583, 65658549, 2817057, 12569934, 30919145,
|
||||
]);
|
|
@ -1,18 +0,0 @@
|
|||
use backend::serial::u64::field::FieldElement51;
|
||||
|
||||
/// `= sqrt(i*d)`, where `i = +sqrt(-1)` and `d` is the Edwards curve parameter.
|
||||
pub const SQRT_ID: FieldElement51 = FieldElement51([2298852427963285, 3837146560810661, 4413131899466403, 3883177008057528, 2352084440532925]);
|
||||
|
||||
/// `= (d+1)/(d-1)`, where `d` is the Edwards curve parameter.
|
||||
pub const DP1_OVER_DM1: FieldElement51 = FieldElement51([2159851467815724, 1752228607624431, 1825604053920671, 1212587319275468, 253422448836237]);
|
||||
|
||||
/// `= -2/sqrt(a-d)`, where `a = -1 (mod p)`, `d` are the Edwards curve parameters.
|
||||
pub const MDOUBLE_INVSQRT_A_MINUS_D: FieldElement51 = FieldElement51([1693982333959686, 608509411481997, 2235573344831311, 947681270984193, 266558006233600]);
|
||||
|
||||
/// `= -2i/sqrt(a-d)`, where `a = -1 (mod p)`, `d` are the Edwards curve parameters
|
||||
/// and `i = +sqrt(-1)`.
|
||||
pub const MIDOUBLE_INVSQRT_A_MINUS_D: FieldElement51 = FieldElement51([1608655899704280, 1999971613377227, 49908634785720, 1873700692181652, 353702208628067]);
|
||||
|
||||
/// `= -1/sqrt(1+d)`, where `d` is the Edwards curve parameters.
|
||||
pub const MINVSQRT_ONE_PLUS_D: FieldElement51 = FieldElement51([321571956990465, 1251814006996634, 2226845496292387, 189049560751797, 2074948709371214]);
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Internal macros.
|
||||
|
||||
/// Define borrow and non-borrow variants of `Add`.
///
/// Given an existing `impl Add<&RHS> for &LHS`, this generates the three
/// remaining operand combinations (`LHS + &RHS`, `&LHS + RHS`, `LHS + RHS`),
/// each of which simply forwards to the reference/reference implementation.
/// `Add` must be in scope at the invocation site.
macro_rules! define_add_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
        impl<'b> Add<&'b $rhs> for $lhs {
            type Output = $out;
            fn add(self, rhs: &'b $rhs) -> $out {
                &self + rhs
            }
        }

        impl<'a> Add<$rhs> for &'a $lhs {
            type Output = $out;
            fn add(self, rhs: $rhs) -> $out {
                self + &rhs
            }
        }

        impl Add<$rhs> for $lhs {
            type Output = $out;
            fn add(self, rhs: $rhs) -> $out {
                &self + &rhs
            }
        }
    }
}
|
||||
|
||||
/// Define non-borrow variants of `AddAssign`.
///
/// Given an existing `impl AddAssign<&RHS> for LHS`, this generates
/// `LHS += RHS` by forwarding to the by-reference implementation.
/// `AddAssign` must be in scope at the invocation site.
macro_rules! define_add_assign_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty) => {
        impl AddAssign<$rhs> for $lhs {
            fn add_assign(&mut self, rhs: $rhs) {
                *self += &rhs;
            }
        }
    }
}
|
||||
|
||||
/// Define borrow and non-borrow variants of `Sub`.
///
/// Given an existing `impl Sub<&RHS> for &LHS`, this generates the three
/// remaining operand combinations (`LHS - &RHS`, `&LHS - RHS`, `LHS - RHS`),
/// each of which simply forwards to the reference/reference implementation.
/// `Sub` must be in scope at the invocation site.
macro_rules! define_sub_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
        impl<'b> Sub<&'b $rhs> for $lhs {
            type Output = $out;
            fn sub(self, rhs: &'b $rhs) -> $out {
                &self - rhs
            }
        }

        impl<'a> Sub<$rhs> for &'a $lhs {
            type Output = $out;
            fn sub(self, rhs: $rhs) -> $out {
                self - &rhs
            }
        }

        impl Sub<$rhs> for $lhs {
            type Output = $out;
            fn sub(self, rhs: $rhs) -> $out {
                &self - &rhs
            }
        }
    }
}
|
||||
|
||||
/// Define non-borrow variants of `SubAssign`.
///
/// Given an existing `impl SubAssign<&RHS> for LHS`, this generates
/// `LHS -= RHS` by forwarding to the by-reference implementation.
/// `SubAssign` must be in scope at the invocation site.
macro_rules! define_sub_assign_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty) => {
        impl SubAssign<$rhs> for $lhs {
            fn sub_assign(&mut self, rhs: $rhs) {
                *self -= &rhs;
            }
        }
    }
}
|
||||
|
||||
/// Define borrow and non-borrow variants of `Mul`.
///
/// Given an existing `impl Mul<&RHS> for &LHS`, this generates the three
/// remaining operand combinations (`LHS * &RHS`, `&LHS * RHS`, `LHS * RHS`),
/// each of which simply forwards to the reference/reference implementation.
/// `Mul` must be in scope at the invocation site.
macro_rules! define_mul_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty, Output = $out:ty) => {
        impl<'b> Mul<&'b $rhs> for $lhs {
            type Output = $out;
            fn mul(self, rhs: &'b $rhs) -> $out {
                &self * rhs
            }
        }

        impl<'a> Mul<$rhs> for &'a $lhs {
            type Output = $out;
            fn mul(self, rhs: $rhs) -> $out {
                self * &rhs
            }
        }

        impl Mul<$rhs> for $lhs {
            type Output = $out;
            fn mul(self, rhs: $rhs) -> $out {
                &self * &rhs
            }
        }
    }
}
|
||||
|
||||
/// Define non-borrow variants of `MulAssign`.
///
/// Given an existing `impl MulAssign<&RHS> for LHS`, this generates
/// `LHS *= RHS` by forwarding to the by-reference implementation.
/// `MulAssign` must be in scope at the invocation site.
macro_rules! define_mul_assign_variants {
    (LHS = $lhs:ty, RHS = $rhs:ty) => {
        impl MulAssign<$rhs> for $lhs {
            fn mul_assign(&mut self, rhs: $rhs) {
                *self *= &rhs;
            }
        }
    }
}
|
||||
|
|
@ -1,403 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Scalar multiplication on the Montgomery form of Curve25519.
|
||||
//!
|
||||
//! To avoid notational confusion with the Edwards code, we use
|
||||
//! variables \\( u, v \\) for the Montgomery curve, so that “Montgomery
|
||||
//! \\(u\\)” here corresponds to “Montgomery \\(x\\)” elsewhere.
|
||||
//!
|
||||
//! Montgomery arithmetic works not on the curve itself, but on the
|
||||
//! \\(u\\)-line, which discards sign information and unifies the curve
|
||||
//! and its quadratic twist. See [_Montgomery curves and their
|
||||
//! arithmetic_][costello-smith] by Costello and Smith for more details.
|
||||
//!
|
||||
//! The `MontgomeryPoint` struct contains the affine \\(u\\)-coordinate
|
||||
//! \\(u\_0(P)\\) of a point \\(P\\) on either the curve or the twist.
|
||||
//! Here the map \\(u\_0 : \mathcal M \rightarrow \mathbb F\_p \\) is
|
||||
//! defined by \\(u\_0((u,v)) = u\\); \\(u\_0(\mathcal O) = 0\\). See
|
||||
//! section 5.4 of Costello-Smith for more details.
|
||||
//!
|
||||
//! # Scalar Multiplication
|
||||
//!
|
||||
//! Scalar multiplication on `MontgomeryPoint`s is provided by the `*`
|
||||
//! operator, which implements the Montgomery ladder.
|
||||
//!
|
||||
//! # Edwards Conversion
|
||||
//!
|
||||
//! The \\(2\\)-to-\\(1\\) map from the Edwards model to the Montgomery
|
||||
//! \\(u\\)-line is provided by `EdwardsPoint::to_montgomery()`.
|
||||
//!
|
||||
//! To lift a `MontgomeryPoint` to an `EdwardsPoint`, use
|
||||
//! `MontgomeryPoint::to_edwards()`, which takes a sign parameter.
|
||||
//! This function rejects `MontgomeryPoints` which correspond to points
|
||||
//! on the twist.
|
||||
//!
|
||||
//! [costello-smith]: https://eprint.iacr.org/2017/212.pdf
|
||||
|
||||
// We allow non snake_case names because coordinates in projective space are
|
||||
// traditionally denoted by the capitalisation of their respective
|
||||
// counterparts in affine space. Yeah, you heard me, rustc, I'm gonna have my
|
||||
// affine and projective cakes and eat both of them too.
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::ops::{Mul, MulAssign};
|
||||
|
||||
use constants::APLUS2_OVER_FOUR;
|
||||
use edwards::{CompressedEdwardsY, EdwardsPoint};
|
||||
use field::FieldElement;
|
||||
use scalar::Scalar;
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
use subtle::ConstantTimeEq;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
/// Holds the \\(u\\)-coordinate of a point on the Montgomery form of
/// Curve25519 or its twist.
///
/// The coordinate is stored as 32 raw bytes in the public tuple field.
#[derive(Copy, Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct MontgomeryPoint(pub [u8; 32]);
|
||||
|
||||
/// Equality of `MontgomeryPoint`s is defined mod p.
|
||||
impl ConstantTimeEq for MontgomeryPoint {
|
||||
fn ct_eq(&self, other: &MontgomeryPoint) -> Choice {
|
||||
let self_fe = FieldElement::from_bytes(&self.0);
|
||||
let other_fe = FieldElement::from_bytes(&other.0);
|
||||
|
||||
self_fe.ct_eq(&other_fe)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for MontgomeryPoint {
|
||||
fn default() -> MontgomeryPoint {
|
||||
MontgomeryPoint([0u8; 32])
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for MontgomeryPoint {
|
||||
fn eq(&self, other: &MontgomeryPoint) -> bool {
|
||||
self.ct_eq(other).unwrap_u8() == 1u8
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for MontgomeryPoint {}
|
||||
|
||||
/// Zeroizing a `MontgomeryPoint` zeroizes its inner 32-byte array.
impl Zeroize for MontgomeryPoint {
    fn zeroize(&mut self) {
        self.0.zeroize();
    }
}
|
||||
|
||||
impl MontgomeryPoint {
|
||||
/// View this `MontgomeryPoint` as an array of bytes.
|
||||
pub fn as_bytes<'a>(&'a self) -> &'a [u8; 32] {
|
||||
&self.0
|
||||
}
|
||||
|
||||
/// Convert this `MontgomeryPoint` to an array of bytes.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Attempt to convert to an `EdwardsPoint`, using the supplied
|
||||
/// choice of sign for the `EdwardsPoint`.
|
||||
///
|
||||
/// # Inputs
|
||||
///
|
||||
/// * `sign`: a `u8` donating the desired sign of the resulting
|
||||
/// `EdwardsPoint`. `0` denotes positive and `1` negative.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// * `Some(EdwardsPoint)` if `self` is the \\(u\\)-coordinate of a
|
||||
/// point on (the Montgomery form of) Curve25519;
|
||||
///
|
||||
/// * `None` if `self` is the \\(u\\)-coordinate of a point on the
|
||||
/// twist of (the Montgomery form of) Curve25519;
|
||||
///
|
||||
pub fn to_edwards(&self, sign: u8) -> Option<EdwardsPoint> {
|
||||
// To decompress the Montgomery u coordinate to an
|
||||
// `EdwardsPoint`, we apply the birational map to obtain the
|
||||
// Edwards y coordinate, then do Edwards decompression.
|
||||
//
|
||||
// The birational map is y = (u-1)/(u+1).
|
||||
//
|
||||
// The exceptional points are the zeros of the denominator,
|
||||
// i.e., u = -1.
|
||||
//
|
||||
// But when u = -1, v^2 = u*(u^2+486662*u+1) = 486660.
|
||||
//
|
||||
// Since this is nonsquare mod p, u = -1 corresponds to a point
|
||||
// on the twist, not the curve, so we can reject it early.
|
||||
|
||||
let u = FieldElement::from_bytes(&self.0);
|
||||
|
||||
if u == FieldElement::minus_one() { return None; }
|
||||
|
||||
let one = FieldElement::one();
|
||||
|
||||
let y = &(&u - &one) * &(&u + &one).invert();
|
||||
|
||||
let mut y_bytes = y.to_bytes();
|
||||
y_bytes[31] ^= sign << 7;
|
||||
|
||||
CompressedEdwardsY(y_bytes).decompress()
|
||||
}
|
||||
}
|
||||
|
||||
/// A `ProjectivePoint` holds a point on the projective line
|
||||
/// \\( \mathbb P(\mathbb F\_p) \\), which we identify with the Kummer
|
||||
/// line of the Montgomery curve.
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
struct ProjectivePoint {
|
||||
pub U: FieldElement,
|
||||
pub W: FieldElement,
|
||||
}
|
||||
|
||||
impl Identity for ProjectivePoint {
|
||||
fn identity() -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
U: FieldElement::one(),
|
||||
W: FieldElement::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProjectivePoint {
|
||||
fn default() -> ProjectivePoint {
|
||||
ProjectivePoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for ProjectivePoint {
|
||||
fn conditional_select(
|
||||
a: &ProjectivePoint,
|
||||
b: &ProjectivePoint,
|
||||
choice: Choice,
|
||||
) -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
U: FieldElement::conditional_select(&a.U, &b.U, choice),
|
||||
W: FieldElement::conditional_select(&a.W, &b.W, choice),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ProjectivePoint {
|
||||
/// Dehomogenize this point to affine coordinates.
|
||||
///
|
||||
/// # Return
|
||||
///
|
||||
/// * \\( u = U / W \\) if \\( W \neq 0 \\);
|
||||
/// * \\( 0 \\) if \\( W \eq 0 \\);
|
||||
pub fn to_affine(&self) -> MontgomeryPoint {
|
||||
let u = &self.U * &self.W.invert();
|
||||
MontgomeryPoint(u.to_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform the double-and-add step of the Montgomery ladder.
|
||||
///
|
||||
/// Given projective points
|
||||
/// \\( (U\_P : W\_P) = u(P) \\),
|
||||
/// \\( (U\_Q : W\_Q) = u(Q) \\),
|
||||
/// and the affine difference
|
||||
/// \\( u\_{P-Q} = u(P-Q) \\), set
|
||||
/// $$
|
||||
/// (U\_P : W\_P) \gets u([2]P)
|
||||
/// $$
|
||||
/// and
|
||||
/// $$
|
||||
/// (U\_Q : W\_Q) \gets u(P + Q).
|
||||
/// $$
|
||||
fn differential_add_and_double(
|
||||
P: &mut ProjectivePoint,
|
||||
Q: &mut ProjectivePoint,
|
||||
affine_PmQ: &FieldElement,
|
||||
) {
|
||||
let t0 = &P.U + &P.W;
|
||||
let t1 = &P.U - &P.W;
|
||||
let t2 = &Q.U + &Q.W;
|
||||
let t3 = &Q.U - &Q.W;
|
||||
|
||||
let t4 = t0.square(); // (U_P + W_P)^2 = U_P^2 + 2 U_P W_P + W_P^2
|
||||
let t5 = t1.square(); // (U_P - W_P)^2 = U_P^2 - 2 U_P W_P + W_P^2
|
||||
|
||||
let t6 = &t4 - &t5; // 4 U_P W_P
|
||||
|
||||
let t7 = &t0 * &t3; // (U_P + W_P) (U_Q - W_Q) = U_P U_Q + W_P U_Q - U_P W_Q - W_P W_Q
|
||||
let t8 = &t1 * &t2; // (U_P - W_P) (U_Q + W_Q) = U_P U_Q - W_P U_Q + U_P W_Q - W_P W_Q
|
||||
|
||||
let t9 = &t7 + &t8; // 2 (U_P U_Q - W_P W_Q)
|
||||
let t10 = &t7 - &t8; // 2 (W_P U_Q - U_P W_Q)
|
||||
|
||||
let t11 = t9.square(); // 4 (U_P U_Q - W_P W_Q)^2
|
||||
let t12 = t10.square(); // 4 (W_P U_Q - U_P W_Q)^2
|
||||
|
||||
let t13 = &APLUS2_OVER_FOUR * &t6; // (A + 2) U_P U_Q
|
||||
|
||||
let t14 = &t4 * &t5; // ((U_P + W_P)(U_P - W_P))^2 = (U_P^2 - W_P^2)^2
|
||||
let t15 = &t13 + &t5; // (U_P - W_P)^2 + (A + 2) U_P W_P
|
||||
|
||||
let t16 = &t6 * &t15; // 4 (U_P W_P) ((U_P - W_P)^2 + (A + 2) U_P W_P)
|
||||
|
||||
let t17 = affine_PmQ * &t12; // U_D * 4 (W_P U_Q - U_P W_Q)^2
|
||||
let t18 = t11; // W_D * 4 (U_P U_Q - W_P W_Q)^2
|
||||
|
||||
P.U = t14; // U_{P'} = (U_P + W_P)^2 (U_P - W_P)^2
|
||||
P.W = t16; // W_{P'} = (4 U_P W_P) ((U_P - W_P)^2 + ((A + 2)/4) 4 U_P W_P)
|
||||
Q.U = t18; // U_{Q'} = W_D * 4 (U_P U_Q - W_P W_Q)^2
|
||||
Q.W = t17; // W_{Q'} = U_D * 4 (W_P U_Q - U_P W_Q)^2
|
||||
}
|
||||
|
||||
define_mul_assign_variants!(LHS = MontgomeryPoint, RHS = Scalar);
|
||||
|
||||
define_mul_variants!(LHS = MontgomeryPoint, RHS = Scalar, Output = MontgomeryPoint);
|
||||
define_mul_variants!(LHS = Scalar, RHS = MontgomeryPoint, Output = MontgomeryPoint);
|
||||
|
||||
/// Multiply this `MontgomeryPoint` by a `Scalar`.
|
||||
impl<'a, 'b> Mul<&'b Scalar> for &'a MontgomeryPoint {
|
||||
type Output = MontgomeryPoint;
|
||||
|
||||
/// Given `self` \\( = u\_0(P) \\), and a `Scalar` \\(n\\), return \\( u\_0([n]P) \\).
|
||||
fn mul(self, scalar: &'b Scalar) -> MontgomeryPoint {
|
||||
// Algorithm 8 of Costello-Smith 2017
|
||||
let affine_u = FieldElement::from_bytes(&self.0);
|
||||
let mut x0 = ProjectivePoint::identity();
|
||||
let mut x1 = ProjectivePoint {
|
||||
U: affine_u,
|
||||
W: FieldElement::one(),
|
||||
};
|
||||
|
||||
let bits: [i8; 256] = scalar.bits();
|
||||
|
||||
for i in (0..255).rev() {
|
||||
let choice: u8 = (bits[i + 1] ^ bits[i]) as u8;
|
||||
|
||||
debug_assert!(choice == 0 || choice == 1);
|
||||
|
||||
ProjectivePoint::conditional_swap(&mut x0, &mut x1, choice.into());
|
||||
differential_add_and_double(&mut x0, &mut x1, &affine_u);
|
||||
}
|
||||
ProjectivePoint::conditional_swap(&mut x0, &mut x1, Choice::from(bits[0] as u8));
|
||||
|
||||
x0.to_affine()
|
||||
}
|
||||
}
|
||||
|
||||
/// In-place scalar multiplication, implemented via `&MontgomeryPoint * &Scalar`.
impl<'b> MulAssign<&'b Scalar> for MontgomeryPoint {
    fn mul_assign(&mut self, scalar: &'b Scalar) {
        *self = (self as &MontgomeryPoint) * scalar;
    }
}
|
||||
|
||||
/// `scalar * point`, defined as `point * scalar`.
impl<'a, 'b> Mul<&'b MontgomeryPoint> for &'a Scalar {
    type Output = MontgomeryPoint;

    fn mul(self, point: &'b MontgomeryPoint) -> MontgomeryPoint {
        point * self
    }
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod test {
    use constants;
    use super::*;

    use rand_core::OsRng;

    /// The X25519 basepoint must serialize to exactly 32 bytes and survive
    /// a bincode round trip, including deserialization from its raw bytes.
    #[test]
    #[cfg(feature = "serde")]
    fn serde_bincode_basepoint_roundtrip() {
        use bincode;

        let encoded = bincode::serialize(&constants::X25519_BASEPOINT).unwrap();
        let decoded: MontgomeryPoint = bincode::deserialize(&encoded).unwrap();

        assert_eq!(encoded.len(), 32);
        assert_eq!(decoded, constants::X25519_BASEPOINT);

        let raw_bytes = constants::X25519_BASEPOINT.as_bytes();
        let bp: MontgomeryPoint = bincode::deserialize(raw_bytes).unwrap();
        assert_eq!(bp, constants::X25519_BASEPOINT);
    }

    /// Test Montgomery -> Edwards on the X/Ed25519 basepoint
    #[test]
    fn basepoint_montgomery_to_edwards() {
        // sign bit = 0 => basepoint
        assert_eq!(
            constants::ED25519_BASEPOINT_POINT,
            constants::X25519_BASEPOINT.to_edwards(0).unwrap()
        );
        // sign bit = 1 => minus basepoint
        assert_eq!(
            - constants::ED25519_BASEPOINT_POINT,
            constants::X25519_BASEPOINT.to_edwards(1).unwrap()
        );
    }

    /// Test Edwards -> Montgomery on the X/Ed25519 basepoint
    #[test]
    fn basepoint_edwards_to_montgomery() {
        assert_eq!(
            constants::ED25519_BASEPOINT_POINT.to_montgomery(),
            constants::X25519_BASEPOINT
        );
    }

    /// Check that Montgomery -> Edwards fails for points on the twist.
    #[test]
    fn montgomery_to_edwards_rejects_twist() {
        let one = FieldElement::one();

        // u = 2 corresponds to a point on the twist.
        let two = MontgomeryPoint((&one+&one).to_bytes());

        assert!(two.to_edwards(0).is_none());

        // u = -1 corresponds to a point on the twist, but should be
        // checked explicitly because it's an exceptional point for the
        // birational map.  For instance, libsignal will accept it.
        let minus_one = MontgomeryPoint((-&one).to_bytes());

        assert!(minus_one.to_edwards(0).is_none());
    }

    /// An unreduced encoding (all bytes `0xff`) must compare equal to the
    /// reduced encoding of 18, i.e. equality is taken mod p.
    #[test]
    fn eq_defined_mod_p() {
        let mut u18_bytes = [0u8; 32];
        u18_bytes[0] = 18;
        let u18 = MontgomeryPoint(u18_bytes);
        let u18_unred = MontgomeryPoint([255; 32]);

        assert_eq!(u18, u18_unred);
    }

    /// The Montgomery ladder must agree with Edwards scalar multiplication
    /// followed by conversion to Montgomery form, for a random scalar.
    #[test]
    fn montgomery_ladder_matches_edwards_scalarmult() {
        let mut csprng: OsRng = OsRng;

        let s: Scalar = Scalar::random(&mut csprng);
        let p_edwards: EdwardsPoint = &constants::ED25519_BASEPOINT_TABLE * &s;
        let p_montgomery: MontgomeryPoint = p_edwards.to_montgomery();

        let expected = s * p_edwards;
        let result = s * p_montgomery;

        assert_eq!(result, expected.to_montgomery())
    }
}
|
|
@ -1,8 +0,0 @@
|
|||
//! Crate-local prelude (for alloc-dependent features like `Vec`)
|
||||
|
||||
// TODO: switch to alloc::prelude
|
||||
#[cfg(all(feature = "alloc", not(feature = "std")))]
|
||||
pub use alloc::vec::Vec;
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
pub use std::vec::Vec;
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,378 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Module for common traits.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use subtle;
|
||||
|
||||
use scalar::Scalar;
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Public Traits
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
/// Trait for getting the identity element of a point type.
pub trait Identity {
    /// Returns the identity element of the curve.
    /// Can be used as a constructor.
    fn identity() -> Self;
}
|
||||
|
||||
/// Trait for testing if a curve point is equivalent to the identity point.
pub trait IsIdentity {
    /// Return true if this element is the identity element of the curve.
    fn is_identity(&self) -> bool;
}
|
||||
|
||||
/// Implement generic identity equality testing for a point representations
|
||||
/// which have constant-time equality testing and a defined identity
|
||||
/// constructor.
|
||||
impl<T> IsIdentity for T
|
||||
where
|
||||
T: subtle::ConstantTimeEq + Identity,
|
||||
{
|
||||
fn is_identity(&self) -> bool {
|
||||
self.ct_eq(&T::identity()).unwrap_u8() == 1u8
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for constant-time multiscalar multiplication without precomputation.
|
||||
pub trait MultiscalarMul {
|
||||
/// The type of point being multiplied, e.g., `RistrettoPoint`.
|
||||
type Point;
|
||||
|
||||
/// Given an iterator of (possibly secret) scalars and an iterator of
|
||||
/// public points, compute
|
||||
/// $$
|
||||
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n.
|
||||
/// $$
|
||||
///
|
||||
/// It is an error to call this function with two iterators of different lengths.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// The trait bound aims for maximum flexibility: the inputs must be
|
||||
/// convertable to iterators (`I: IntoIter`), and the iterator's items
|
||||
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
|
||||
/// iterators returning either `Scalar`s or `&Scalar`s.
|
||||
///
|
||||
/// ```
|
||||
/// use curve25519_dalek::constants;
|
||||
/// use curve25519_dalek::traits::MultiscalarMul;
|
||||
/// use curve25519_dalek::ristretto::RistrettoPoint;
|
||||
/// use curve25519_dalek::scalar::Scalar;
|
||||
///
|
||||
/// // Some scalars
|
||||
/// let a = Scalar::from(87329482u64);
|
||||
/// let b = Scalar::from(37264829u64);
|
||||
/// let c = Scalar::from(98098098u64);
|
||||
///
|
||||
/// // Some points
|
||||
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
|
||||
/// let Q = P + P;
|
||||
/// let R = P + Q;
|
||||
///
|
||||
/// // A1 = a*P + b*Q + c*R
|
||||
/// let abc = [a,b,c];
|
||||
/// let A1 = RistrettoPoint::multiscalar_mul(&abc, &[P,Q,R]);
|
||||
/// // Note: (&abc).into_iter(): Iterator<Item=&Scalar>
|
||||
///
|
||||
/// // A2 = (-a)*P + (-b)*Q + (-c)*R
|
||||
/// let minus_abc = abc.iter().map(|x| -x);
|
||||
/// let A2 = RistrettoPoint::multiscalar_mul(minus_abc, &[P,Q,R]);
|
||||
/// // Note: minus_abc.into_iter(): Iterator<Item=Scalar>
|
||||
///
|
||||
/// assert_eq!(A1.compress(), (-A2).compress());
|
||||
/// ```
|
||||
fn multiscalar_mul<I, J>(scalars: I, points: J) -> Self::Point
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Self::Point>;
|
||||
}
|
||||
|
||||
/// A trait for variable-time multiscalar multiplication without precomputation.
|
||||
pub trait VartimeMultiscalarMul {
|
||||
/// The type of point being multiplied, e.g., `RistrettoPoint`.
|
||||
type Point;
|
||||
|
||||
/// Given an iterator of public scalars and an iterator of
|
||||
/// `Option`s of points, compute either `Some(Q)`, where
|
||||
/// $$
|
||||
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n,
|
||||
/// $$
|
||||
/// if all points were `Some(P_i)`, or else return `None`.
|
||||
///
|
||||
/// This function is particularly useful when verifying statements
|
||||
/// involving compressed points. Accepting `Option<Point>` allows
|
||||
/// inlining point decompression into the multiscalar call,
|
||||
/// avoiding the need for temporary buffers.
|
||||
/// ```
|
||||
/// use curve25519_dalek::constants;
|
||||
/// use curve25519_dalek::traits::VartimeMultiscalarMul;
|
||||
/// use curve25519_dalek::ristretto::RistrettoPoint;
|
||||
/// use curve25519_dalek::scalar::Scalar;
|
||||
///
|
||||
/// // Some scalars
|
||||
/// let a = Scalar::from(87329482u64);
|
||||
/// let b = Scalar::from(37264829u64);
|
||||
/// let c = Scalar::from(98098098u64);
|
||||
/// let abc = [a,b,c];
|
||||
///
|
||||
/// // Some points
|
||||
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
|
||||
/// let Q = P + P;
|
||||
/// let R = P + Q;
|
||||
/// let PQR = [P, Q, R];
|
||||
///
|
||||
/// let compressed = [P.compress(), Q.compress(), R.compress()];
|
||||
///
|
||||
/// // Now we can compute A1 = a*P + b*Q + c*R using P, Q, R:
|
||||
/// let A1 = RistrettoPoint::vartime_multiscalar_mul(&abc, &PQR);
|
||||
///
|
||||
/// // Or using the compressed points:
|
||||
/// let A2 = RistrettoPoint::optional_multiscalar_mul(
|
||||
/// &abc,
|
||||
/// compressed.iter().map(|pt| pt.decompress()),
|
||||
/// );
|
||||
///
|
||||
/// assert_eq!(A2, Some(A1));
|
||||
///
|
||||
/// // It's also possible to mix compressed and uncompressed points:
|
||||
/// let A3 = RistrettoPoint::optional_multiscalar_mul(
|
||||
/// abc.iter()
|
||||
/// .chain(abc.iter()),
|
||||
/// compressed.iter().map(|pt| pt.decompress())
|
||||
/// .chain(PQR.iter().map(|&pt| Some(pt))),
|
||||
/// );
|
||||
///
|
||||
/// assert_eq!(A3, Some(A1+A1));
|
||||
/// ```
|
||||
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<Self::Point>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator<Item = Option<Self::Point>>;
|
||||
|
||||
/// Given an iterator of public scalars and an iterator of
|
||||
/// public points, compute
|
||||
/// $$
|
||||
/// Q = c\_1 P\_1 + \cdots + c\_n P\_n,
|
||||
/// $$
|
||||
/// using variable-time operations.
|
||||
///
|
||||
/// It is an error to call this function with two iterators of different lengths.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// The trait bound aims for maximum flexibility: the inputs must be
|
||||
/// convertable to iterators (`I: IntoIter`), and the iterator's items
|
||||
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
|
||||
/// iterators returning either `Scalar`s or `&Scalar`s.
|
||||
///
|
||||
/// ```
|
||||
/// use curve25519_dalek::constants;
|
||||
/// use curve25519_dalek::traits::VartimeMultiscalarMul;
|
||||
/// use curve25519_dalek::ristretto::RistrettoPoint;
|
||||
/// use curve25519_dalek::scalar::Scalar;
|
||||
///
|
||||
/// // Some scalars
|
||||
/// let a = Scalar::from(87329482u64);
|
||||
/// let b = Scalar::from(37264829u64);
|
||||
/// let c = Scalar::from(98098098u64);
|
||||
///
|
||||
/// // Some points
|
||||
/// let P = constants::RISTRETTO_BASEPOINT_POINT;
|
||||
/// let Q = P + P;
|
||||
/// let R = P + Q;
|
||||
///
|
||||
/// // A1 = a*P + b*Q + c*R
|
||||
/// let abc = [a,b,c];
|
||||
/// let A1 = RistrettoPoint::vartime_multiscalar_mul(&abc, &[P,Q,R]);
|
||||
/// // Note: (&abc).into_iter(): Iterator<Item=&Scalar>
|
||||
///
|
||||
/// // A2 = (-a)*P + (-b)*Q + (-c)*R
|
||||
/// let minus_abc = abc.iter().map(|x| -x);
|
||||
/// let A2 = RistrettoPoint::vartime_multiscalar_mul(minus_abc, &[P,Q,R]);
|
||||
/// // Note: minus_abc.into_iter(): Iterator<Item=Scalar>
|
||||
///
|
||||
/// assert_eq!(A1.compress(), (-A2).compress());
|
||||
/// ```
|
||||
fn vartime_multiscalar_mul<I, J>(scalars: I, points: J) -> Self::Point
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Self::Point>,
|
||||
Self::Point: Clone,
|
||||
{
|
||||
Self::optional_multiscalar_mul(
|
||||
scalars,
|
||||
points.into_iter().map(|P| Some(P.borrow().clone())),
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for variable-time multiscalar multiplication with precomputation.
|
||||
///
|
||||
/// A general multiscalar multiplication with precomputation can be written as
|
||||
/// $$
|
||||
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
|
||||
/// $$
|
||||
/// where the \\(B_i\\) are *static* points, for which precomputation
|
||||
/// is possible, and the \\(A_j\\) are *dynamic* points, for which
|
||||
/// precomputation is not possible.
|
||||
///
|
||||
/// This trait has three methods for performing this computation:
|
||||
///
|
||||
/// * [`vartime_multiscalar_mul`], which handles the special case
|
||||
/// where \\(n = 0\\) and there are no dynamic points;
|
||||
///
|
||||
/// * [`vartime_mixed_multiscalar_mul`], which takes the dynamic
|
||||
/// points as already-validated `Point`s and is infallible;
|
||||
///
|
||||
/// * [`optional_mixed_multiscalar_mul`], which takes the dynamic
|
||||
/// points as `Option<Point>`s and returns an `Option<Point>`,
|
||||
/// allowing decompression to be composed into the input iterators.
|
||||
///
|
||||
/// All methods require that the lengths of the input iterators be
|
||||
/// known and matching, as if they were `ExactSizeIterator`s. (It
|
||||
/// does not require `ExactSizeIterator` only because that trait is
|
||||
/// broken).
|
||||
pub trait VartimePrecomputedMultiscalarMul: Sized {
|
||||
/// The type of point to be multiplied, e.g., `RistrettoPoint`.
|
||||
type Point: Clone;
|
||||
|
||||
/// Given the static points \\( B_i \\), perform precomputation
|
||||
/// and return the precomputation data.
|
||||
fn new<I>(static_points: I) -> Self
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Self::Point>;
|
||||
|
||||
/// Given `static_scalars`, an iterator of public scalars
|
||||
/// \\(b_i\\), compute
|
||||
/// $$
|
||||
/// Q = b_1 B_1 + \cdots + b_m B_m,
|
||||
/// $$
|
||||
/// where the \\(B_j\\) are the points that were supplied to `new`.
|
||||
///
|
||||
/// It is an error to call this function with iterators of
|
||||
/// inconsistent lengths.
|
||||
///
|
||||
/// The trait bound aims for maximum flexibility: the input must
|
||||
/// be convertable to iterators (`I: IntoIter`), and the
|
||||
/// iterator's items must be `Borrow<Scalar>`, to allow iterators
|
||||
/// returning either `Scalar`s or `&Scalar`s.
|
||||
fn vartime_multiscalar_mul<I>(&self, static_scalars: I) -> Self::Point
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
{
|
||||
use core::iter;
|
||||
|
||||
Self::vartime_mixed_multiscalar_mul(
|
||||
self,
|
||||
static_scalars,
|
||||
iter::empty::<Scalar>(),
|
||||
iter::empty::<Self::Point>(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Given `static_scalars`, an iterator of public scalars
|
||||
/// \\(b_i\\), `dynamic_scalars`, an iterator of public scalars
|
||||
/// \\(a_i\\), and `dynamic_points`, an iterator of points
|
||||
/// \\(A_i\\), compute
|
||||
/// $$
|
||||
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
|
||||
/// $$
|
||||
/// where the \\(B_j\\) are the points that were supplied to `new`.
|
||||
///
|
||||
/// It is an error to call this function with iterators of
|
||||
/// inconsistent lengths.
|
||||
///
|
||||
/// The trait bound aims for maximum flexibility: the inputs must be
|
||||
/// convertable to iterators (`I: IntoIter`), and the iterator's items
|
||||
/// must be `Borrow<Scalar>` (or `Borrow<Point>`), to allow
|
||||
/// iterators returning either `Scalar`s or `&Scalar`s.
|
||||
fn vartime_mixed_multiscalar_mul<I, J, K>(
|
||||
&self,
|
||||
static_scalars: I,
|
||||
dynamic_scalars: J,
|
||||
dynamic_points: K,
|
||||
) -> Self::Point
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Scalar>,
|
||||
K: IntoIterator,
|
||||
K::Item: Borrow<Self::Point>,
|
||||
{
|
||||
Self::optional_mixed_multiscalar_mul(
|
||||
self,
|
||||
static_scalars,
|
||||
dynamic_scalars,
|
||||
dynamic_points.into_iter().map(|P| Some(P.borrow().clone())),
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Given `static_scalars`, an iterator of public scalars
|
||||
/// \\(b_i\\), `dynamic_scalars`, an iterator of public scalars
|
||||
/// \\(a_i\\), and `dynamic_points`, an iterator of points
|
||||
/// \\(A_i\\), compute
|
||||
/// $$
|
||||
/// Q = a_1 A_1 + \cdots + a_n A_n + b_1 B_1 + \cdots + b_m B_m,
|
||||
/// $$
|
||||
/// where the \\(B_j\\) are the points that were supplied to `new`.
|
||||
///
|
||||
/// If any of the dynamic points were `None`, return `None`.
|
||||
///
|
||||
/// It is an error to call this function with iterators of
|
||||
/// inconsistent lengths.
|
||||
///
|
||||
/// This function is particularly useful when verifying statements
|
||||
/// involving compressed points. Accepting `Option<Point>` allows
|
||||
/// inlining point decompression into the multiscalar call,
|
||||
/// avoiding the need for temporary buffers.
|
||||
fn optional_mixed_multiscalar_mul<I, J, K>(
|
||||
&self,
|
||||
static_scalars: I,
|
||||
dynamic_scalars: J,
|
||||
dynamic_points: K,
|
||||
) -> Option<Self::Point>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Scalar>,
|
||||
K: IntoIterator<Item = Option<Self::Point>>;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Private Traits
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
/// Crate-private check that a point lies on the curve.
///
/// Intended for debugging and tests only: a user of `curve25519-dalek`
/// should never be able to build an invalid point in the first place.
pub(crate) trait ValidityCheck {
    /// Returns `true` when the point is on the curve.
    ///
    /// This check is not constant-time.
    fn is_valid(&self) -> bool;
}
|
|
@ -1,206 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2019 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Code for fixed- and sliding-window functionality
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::fmt::Debug;
|
||||
|
||||
use subtle::ConditionallyNegatable;
|
||||
use subtle::ConditionallySelectable;
|
||||
use subtle::ConstantTimeEq;
|
||||
use subtle::Choice;
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use backend::serial::curve_models::ProjectiveNielsPoint;
|
||||
use backend::serial::curve_models::AffineNielsPoint;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
/// A lookup table of precomputed multiples of a point \\(P\\), used to
|
||||
/// compute \\( xP \\) for \\( -8 \leq x \leq 8 \\).
|
||||
///
|
||||
/// The computation of \\( xP \\) is done in constant time by the `select` function.
|
||||
///
|
||||
/// Since `LookupTable` does not implement `Index`, it's more difficult
|
||||
/// to accidentally use the table directly. Unfortunately the table is
|
||||
/// only `pub(crate)` so that we can write hardcoded constants, so it's
|
||||
/// still technically possible. It would be nice to prevent direct
|
||||
/// access to the table.
|
||||
///
|
||||
/// XXX make this generic with respect to table size
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct LookupTable<T>(pub(crate) [T; 8]);
|
||||
|
||||
impl<T> LookupTable<T>
|
||||
where
|
||||
T: Identity + ConditionallySelectable + ConditionallyNegatable,
|
||||
{
|
||||
/// Given \\(-8 \leq x \leq 8\\), return \\(xP\\) in constant time.
|
||||
pub fn select(&self, x: i8) -> T {
|
||||
debug_assert!(x >= -8);
|
||||
debug_assert!(x <= 8);
|
||||
|
||||
// Compute xabs = |x|
|
||||
let xmask = x >> 7;
|
||||
let xabs = (x + xmask) ^ xmask;
|
||||
|
||||
// Set t = 0 * P = identity
|
||||
let mut t = T::identity();
|
||||
for j in 1..9 {
|
||||
// Copy `points[j-1] == j*P` onto `t` in constant time if `|x| == j`.
|
||||
let c = (xabs as u8).ct_eq(&(j as u8));
|
||||
t.conditional_assign(&self.0[j - 1], c);
|
||||
}
|
||||
// Now t == |x| * P.
|
||||
|
||||
let neg_mask = Choice::from((xmask & 1) as u8);
|
||||
t.conditional_negate(neg_mask);
|
||||
// Now t == x * P.
|
||||
|
||||
t
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Copy + Default> Default for LookupTable<T> {
|
||||
fn default() -> LookupTable<T> {
|
||||
LookupTable([T::default(); 8])
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Debug> Debug for LookupTable<T> {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "LookupTable({:?})", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for LookupTable<ProjectiveNielsPoint> {
|
||||
fn from(P: &'a EdwardsPoint) -> Self {
|
||||
let mut points = [P.to_projective_niels(); 8];
|
||||
for j in 0..7 {
|
||||
points[j + 1] = (P + &points[j]).to_extended().to_projective_niels();
|
||||
}
|
||||
LookupTable(points)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for LookupTable<AffineNielsPoint> {
|
||||
fn from(P: &'a EdwardsPoint) -> Self {
|
||||
let mut points = [P.to_affine_niels(); 8];
|
||||
// XXX batch inversion would be good if perf mattered here
|
||||
for j in 0..7 {
|
||||
points[j + 1] = (P + &points[j]).to_extended().to_affine_niels()
|
||||
}
|
||||
LookupTable(points)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Zeroize for LookupTable<T>
|
||||
where
|
||||
T: Copy + Default + Zeroize
|
||||
{
|
||||
fn zeroize(&mut self) {
|
||||
self.0.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
/// Table of the eight odd multiples 1A, 3A, ..., 15A of a point A.
#[derive(Copy, Clone)]
pub(crate) struct NafLookupTable5<T>(pub(crate) [T; 8]);

impl<T> NafLookupTable5<T>
where
    T: Copy,
{
    /// Given public, odd \\( x \\) with \\( 0 < x < 2^4 \\), return \\(xA\\).
    ///
    /// The index is derived directly from `x`, so this is a plain
    /// (variable-time) array lookup.
    pub fn select(&self, x: usize) -> T {
        debug_assert_eq!(x & 1, 1);
        debug_assert!(x < 16);

        // Odd x = 2k + 1 lives in slot k.
        let slot = x / 2;
        self.0[slot]
    }
}
|
||||
|
||||
impl<T: Debug> Debug for NafLookupTable5<T> {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "NafLookupTable5({:?})", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for NafLookupTable5<ProjectiveNielsPoint> {
|
||||
fn from(A: &'a EdwardsPoint) -> Self {
|
||||
let mut Ai = [A.to_projective_niels(); 8];
|
||||
let A2 = A.double();
|
||||
for i in 0..7 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_projective_niels();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
|
||||
NafLookupTable5(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for NafLookupTable5<AffineNielsPoint> {
|
||||
fn from(A: &'a EdwardsPoint) -> Self {
|
||||
let mut Ai = [A.to_affine_niels(); 8];
|
||||
let A2 = A.double();
|
||||
for i in 0..7 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_affine_niels();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A]
|
||||
NafLookupTable5(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds the 64 odd multiples 1A, 3A, ..., 127A of a point A.
#[derive(Copy, Clone)]
pub(crate) struct NafLookupTable8<T>(pub(crate) [T; 64]);

impl<T: Copy> NafLookupTable8<T> {
    /// Given public, odd \\( x \\) with \\( 0 < x < 2^7 \\), return \\(xA\\).
    ///
    /// The index is derived directly from `x`, so this is a plain
    /// (variable-time) array lookup.
    pub fn select(&self, x: usize) -> T {
        debug_assert_eq!(x & 1, 1);
        debug_assert!(x < 128);

        // Odd x = 2k + 1 lives in slot k.
        self.0[x / 2]
    }
}
|
||||
|
||||
impl<T: Debug> Debug for NafLookupTable8<T> {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "NafLookupTable8([\n")?;
|
||||
for i in 0..64 {
|
||||
write!(f, "\t{:?},\n", &self.0[i])?;
|
||||
}
|
||||
write!(f, "])")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for NafLookupTable8<ProjectiveNielsPoint> {
|
||||
fn from(A: &'a EdwardsPoint) -> Self {
|
||||
let mut Ai = [A.to_projective_niels(); 64];
|
||||
let A2 = A.double();
|
||||
for i in 0..63 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_projective_niels();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
|
||||
NafLookupTable8(Ai)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a EdwardsPoint> for NafLookupTable8<AffineNielsPoint> {
|
||||
fn from(A: &'a EdwardsPoint) -> Self {
|
||||
let mut Ai = [A.to_affine_niels(); 64];
|
||||
let A2 = A.double();
|
||||
for i in 0..63 {
|
||||
Ai[i + 1] = (&A2 + &Ai[i]).to_extended().to_affine_niels();
|
||||
}
|
||||
// Now Ai = [A, 3A, 5A, 7A, 9A, 11A, 13A, 15A, ..., 127A]
|
||||
NafLookupTable8(Ai)
|
||||
}
|
||||
}
|
|
@ -1,857 +0,0 @@
|
|||
import binascii
|
||||
class InvalidEncodingException(Exception):
    """Raised when a byte string cannot be decoded (bad length, range or sign, or not on the curve)."""
    pass

class NotOnCurveException(Exception):
    """Raised when coordinates fail the curve equation."""
    pass

class SpecException(Exception):
    """Raised when an optimized routine disagrees with its specification."""
    pass
|
||||
|
||||
def lobit(x):
    """Return the least-significant bit of x, taken as an integer."""
    return int(x) & 1

def hibit(x):
    """Return the low bit of 2*x."""
    return lobit(2*x)

def negative(x):
    """Return 1 if x counts as negative (its low bit is set), else 0."""
    return lobit(x)

def enc_le(x,n):
    """Encode int(x) as n little-endian bytes; bits beyond 8*n are dropped."""
    # range() instead of xrange(): identical result, and also valid on Python 3.
    return bytearray([int(x)>>(8*i) & 0xFF for i in range(n)])

def dec_le(x):
    """Decode a little-endian sequence of byte values into an integer."""
    return sum(b<<(8*i) for i,b in enumerate(x))

def randombytes(n):
    """Return n random bytes as a bytearray (uses the ambient `randint`)."""
    return bytearray([randint(0,255) for _ in range(n)])
|
||||
|
||||
def optimized_version_of(spec):
    """Decorator: This function is an optimized version of some specification

    `spec` is the name of the reference method, looked up on the receiver at
    call time.  Every call runs both the reference and the decorated function
    with the same arguments and compares them:

    * if exactly one of them raised, that exception is re-raised;
    * if both returned but the results differ, SpecException is raised;
    * if both raised, the optimized function's exception is re-raised;
    * otherwise the optimized result is returned.
    """
    def decorator(f):
        def wrapper(self,*args,**kwargs):
            def pr(x):
                # bytearrays are shown as hex in the mismatch message
                if isinstance(x,bytearray): return binascii.hexlify(x)
                else: return str(x)

            # Each answer is a (result, exception) pair; exactly one is None
            # unless the call legitimately returned None.
            try: spec_ans = getattr(self,spec,spec)(*args,**kwargs),None
            except Exception as e: spec_ans = None,e
            try: opt_ans = f(self,*args,**kwargs),None
            except Exception as e: opt_ans = None,e

            # The captured exception is raised explicitly: a bare `raise`
            # outside an `except` block only works on Python 2.
            if spec_ans[1] is None and opt_ans[1] is not None:
                raise opt_ans[1]
                #raise SpecException("Mismatch in %s: spec returned %s but opt threw %s"
                #    % (f.__name__,str(spec_ans[0]),str(opt_ans[1])))
            if spec_ans[1] is not None and opt_ans[1] is None:
                raise spec_ans[1]
                #raise SpecException("Mismatch in %s: spec threw %s but opt returned %s"
                #    % (f.__name__,str(spec_ans[1]),str(opt_ans[0])))
            if spec_ans[0] != opt_ans[0]:
                raise SpecException("Mismatch in %s: %s != %s"
                    % (f.__name__,pr(spec_ans[0]),pr(opt_ans[0])))
            if opt_ans[1] is not None: raise opt_ans[1]
            else: return opt_ans[0]
        wrapper.__name__ = f.__name__
        return wrapper
    return decorator
|
||||
|
||||
def xsqrt(x,exn=InvalidEncodingException("Not on curve")):
    """Return the non-negative square root of x; raise `exn` if x is not a square."""
    if is_square(x):
        root = sqrt(x)
        # Of the two roots, return the one that is not "negative".
        return -root if negative(root) else root
    raise exn
|
||||
|
||||
def isqrt(x,exn=InvalidEncodingException("Not on curve")):
    """Return 1/sqrt(x), with 0 mapped to 0; raise `exn` if x is not a square.

    Unlike xsqrt, the sign of the root is not normalized.
    """
    if x == 0:
        return 0
    if is_square(x):
        root = sqrt(x)
        return 1/root
    raise exn
|
||||
|
||||
def inv0(x):
    """Return 1/x, except that 0 is mapped to 0."""
    if x != 0:
        return 1/x
    return 0
|
||||
|
||||
def isqrt_i(x):
    """Return 1/sqrt(x) or 1/sqrt(zeta * x)

    The result is a pair (was_square, value): (True, 1/sqrt(x)) when x is a
    square (with 0 mapped to 0), otherwise (False, 1/sqrt(x*zeta)), where zeta
    is obtained by taking square roots of -1 until a non-square is reached.
    """
    if x == 0:
        return True,0

    # Start from -1 in x's parent and keep taking roots while it is a square.
    zeta = x.parent(-1)
    while is_square(zeta):
        zeta = sqrt(zeta)

    if is_square(x):
        return True,1/sqrt(x)
    return False,1/sqrt(x*zeta)
|
||||
|
||||
class QuotientEdwardsPoint(object):
    """Abstract class for a point on a quotiented Edwards curve; needs F,a,d,cofactor to work

    Subclasses are expected to supply the class attributes read here
    (F, a, d, cofactor, encLen, and i where the cofactor is 8).
    """
    def __init__(self,x=0,y=1):
        """Coerce (x,y) into F and check the curve equation; defaults to (0,1)."""
        x = self.x = self.F(x)
        y = self.y = self.F(y)
        if y^2 + self.a*x^2 != 1 + self.d*x^2*y^2:
            raise NotOnCurveException(str(self))

    def __repr__(self):
        return "%s(0x%x,0x%x)" % (self.__class__.__name__, self.x, self.y)

    def __iter__(self):
        """Yield x then y, so that `x,y = point` works."""
        yield self.x
        yield self.y

    def __add__(self,other):
        """Edwards addition law on affine coordinates."""
        x,y = self
        X,Y = other
        a,d = self.a,self.d
        return self.__class__(
            (x*Y+y*X)/(1+d*x*y*X*Y),
            (y*Y-a*x*X)/(1-d*x*y*X*Y)
        )

    def __neg__(self): return self.__class__(-self.x,self.y)
    def __sub__(self,other): return self + (-other)
    def __rmul__(self,other): return self*other
    def __eq__(self,other):
        """NB: this is the only method that is different from the usual one"""
        x,y = self
        X,Y = other
        return x*Y == X*y or (self.cofactor==8 and -self.a*x*X == y*Y)
    def __ne__(self,other): return not (self==other)

    def __mul__(self,exp):
        """Scalar multiplication by int(exp) using double-and-add."""
        exp = int(exp)
        if exp < 0: exp,self = -exp,-self
        total = self.__class__()
        work = self
        while exp != 0:
            if exp & 1: total += work
            work += work
            exp >>= 1
        return total

    def xyzt(self):
        """Return (x*z, y*z, z, x*y*z) for a random field element z."""
        x,y = self
        z = self.F.random_element()
        return x*z,y*z,z,x*y*z

    def torque(self):
        """Apply cofactor group, except keeping the point even"""
        if self.cofactor == 8:
            if self.a == -1: return self.__class__(self.y*self.i, self.x*self.i)
            if self.a == 1: return self.__class__(-self.y, self.x)
        else:
            return self.__class__(-self.x, -self.y)

    def doubleAndEncodeSpec(self):
        """Specification for doubleAndEncode: double the point, then encode it."""
        return (self+self).encode()

    # Utility functions
    @classmethod
    def bytesToGf(cls,bytes,mustBeProper=True,mustBePositive=False,maskHiBits=False):
        """Convert little-endian bytes to field element, sanity check length"""
        if len(bytes) != cls.encLen:
            raise InvalidEncodingException("wrong length %d" % len(bytes))
        s = dec_le(bytes)
        if mustBeProper and s >= cls.F.order():
            raise InvalidEncodingException("%d out of range!" % s)
        bitlen = int(ceil(log(cls.F.order())/log(2)))
        if maskHiBits: s &= 2^bitlen-1
        s = cls.F(s)
        if mustBePositive and negative(s):
            raise InvalidEncodingException("%d is negative!" % s)
        return s

    @classmethod
    def gfToBytes(cls,x,mustBePositive=False):
        """Convert field element to little-endian bytes of length encLen

        With mustBePositive set, a negative x is negated before encoding.
        """
        if negative(x) and mustBePositive: x = -x
        return enc_le(x,cls.encLen)
|
||||
|
||||
class RistrettoPoint(QuotientEdwardsPoint):
    """The new Ristretto group

    Each optimized routine is paired with an unoptimized *Spec routine via
    optimized_version_of.  Reads the class attributes mneg, magic, qnr and i
    in addition to those of the base class; they are defined outside this class.
    """
    def encodeSpec(self):
        """Unoptimized specification for encoding"""
        x,y = self
        if self.cofactor==8 and (negative(x*y) or y==0):
            (x,y) = self.torque()
        if y == -1:
            y = 1 # Avoid divide by 0; doesn't affect impl

        if negative(x):
            x,y = -x,-y
        s = xsqrt(self.mneg*(1-y)/(1+y),exn=Exception("Unimplemented: point is odd: " + str(self)))
        return self.gfToBytes(s)

    @classmethod
    def decodeSpec(cls,s):
        """Unoptimized specification for decoding"""
        s = cls.bytesToGf(s,mustBePositive=True)

        a,d = cls.a,cls.d
        x = xsqrt(4*s^2 / (a*d*(1+a*s^2)^2 - (1-a*s^2)^2))
        y = (1+a*s^2) / (1-a*s^2)

        if cls.cofactor==8 and (negative(x*y) or y==0):
            raise InvalidEncodingException("x*y has high bit")

        return cls(x,y)

    @optimized_version_of("encodeSpec")
    def encode(self):
        """Encode, optimized version"""
        a,d,mneg = self.a,self.d,self.mneg
        x,y,z,t = self.xyzt()

        if self.cofactor==8:
            u1    = mneg*(z+y)*(z-y)
            u2    = x*y # = t*z
            isr   = isqrt(u1*u2^2)
            i1    = isr*u1 # sqrt(mneg*(z+y)*(z-y))/(x*y)
            i2    = isr*u2 # 1/sqrt(a*(y+z)*(y-z))
            z_inv = i1*i2*t # 1/z

            if negative(t*z_inv):
                # Rotate the representative; the rotation depends on a.
                if a==-1:
                    x,y = y*self.i,x*self.i
                    den_inv = self.magic * i1
                else:
                    x,y = -y,x
                    den_inv = self.i * self.magic * i1
            else:
                den_inv = i2

            if negative(x*z_inv):
                y = -y
            s = (z-y) * den_inv
        else:
            num   = mneg*(z+y)*(z-y)
            isr   = isqrt(num*y^2)
            if negative(isr^2*num*y*t):
                y = -y
            s = isr*y*(z-y)

        return self.gfToBytes(s,mustBePositive=True)

    @optimized_version_of("doubleAndEncodeSpec")
    def doubleAndEncode(self):
        """Double the point and encode the result, optimized version"""
        X,Y,Z,T = self.xyzt()
        a,d,mneg = self.a,self.d,self.mneg

        if self.cofactor==8:
            e = 2*X*Y
            f = Z^2+d*T^2
            g = Y^2-a*X^2
            h = Z^2-d*T^2

            inv1 = 1/(e*f*g*h)
            z_inv = inv1*e*g # 1 / (f*h)
            t_inv = inv1*f*h

            if negative(e*g*z_inv):
                if a==-1:
                    sqrta = self.i
                else:
                    sqrta = -1
                e,f,g,h = g,h,-e,f*sqrta
                factor = self.i
            else:
                factor = self.magic

            if negative(h*e*z_inv):
                g=-g
            s = (h-g)*factor*g*t_inv

        else:
            foo = Y^2+a*X^2
            bar = X*Y
            den = 1/(foo*bar)
            if negative(2*bar^2*den):
                tmp = a*X^2
            else:
                tmp = Y^2
            s = self.magic*(Z^2-tmp)*foo*den

        return self.gfToBytes(s,mustBePositive=True)

    @classmethod
    @optimized_version_of("decodeSpec")
    def decode(cls,s):
        """Decode, optimized version"""
        s = cls.bytesToGf(s,mustBePositive=True)

        a,d = cls.a,cls.d
        yden     = 1-a*s^2
        ynum     = 1+a*s^2
        yden_sqr = yden^2
        xden_sqr = a*d*ynum^2 - yden_sqr

        # One inverse square root yields both denominators' inverses.
        isr = isqrt(xden_sqr * yden_sqr)

        xden_inv = isr * yden
        yden_inv = xden_inv * isr * xden_sqr

        x = 2*s*xden_inv
        if negative(x):
            x = -x
        y = ynum * yden_inv

        if cls.cofactor==8 and (negative(x*y) or y==0):
            raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))

        return cls(x,y)

    @classmethod
    def fromJacobiQuartic(cls,s,t,sgn=1):
        """Convert point from its Jacobi Quartic representation"""
        a,d = cls.a,cls.d
        assert s^4 - 2*cls.a*(1-2*d/(d-a))*s^2 + 1 == t^2
        x = 2*s*cls.magic / t
        y = (1+a*s^2) / (1-a*s^2)
        return cls(sgn*x,y)

    @classmethod
    def elligatorSpec(cls,r0):
        """Unoptimized specification for the Elligator map from bytes r0 to a point"""
        a,d = cls.a,cls.d
        r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2
        den = (d*r-a)*(a*r-d)
        if den == 0:
            return cls()
        n1 = cls.a*(r+1)*(a+d)*(d-a)/den
        n2 = r*n1
        # NOTE(review): sgn is assigned here but never passed on below.
        if is_square(n1):
            sgn,s,t = 1, xsqrt(n1), -(r-1)*(a+d)^2 / den - 1
        else:
            sgn,s,t = -1,-xsqrt(n2), r*(r-1)*(a+d)^2 / den - 1

        return cls.fromJacobiQuartic(s,t)

    @classmethod
    @optimized_version_of("elligatorSpec")
    def elligator(cls,r0):
        """Elligator map from bytes r0 to a point, optimized version"""
        a,d = cls.a,cls.d
        r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)
        r = cls.qnr * r0^2
        den = (d*r-a)*(a*r-d)
        num = cls.a*(r+1)*(a+d)*(d-a)

        iss,isri = isqrt_i(num*den)
        if iss:
            sgn,twiddle = 1,1
        else:
            sgn,twiddle = -1,r0*cls.qnr
        isri *= twiddle
        s = isri*num
        t = -sgn*isri*s*(r-1)*(d+a)^2 - 1
        if negative(s) == iss:
            s = -s
        return cls.fromJacobiQuartic(s,t)
|
||||
|
||||
|
||||
class Decaf_1_1_Point(QuotientEdwardsPoint):
|
||||
"""Like current decaf but tweaked for simplicity"""
|
||||
def encodeSpec(self):
|
||||
"""Unoptimized specification for encoding"""
|
||||
a,d = self.a,self.d
|
||||
x,y = self
|
||||
if x==0 or y==0: return(self.gfToBytes(0))
|
||||
|
||||
if self.cofactor==8 and negative(x*y*self.isoMagic):
|
||||
x,y = self.torque()
|
||||
|
||||
sr = xsqrt(1-a*x^2)
|
||||
altx = x*y*self.isoMagic / sr
|
||||
if negative(altx): s = (1+sr)/x
|
||||
else: s = (1-sr)/x
|
||||
|
||||
return self.gfToBytes(s,mustBePositive=True)
|
||||
|
||||
@classmethod
|
||||
def decodeSpec(cls,s):
|
||||
"""Unoptimized specification for decoding"""
|
||||
a,d = cls.a,cls.d
|
||||
s = cls.bytesToGf(s,mustBePositive=True)
|
||||
|
||||
if s==0: return cls()
|
||||
t = xsqrt(s^4 + 2*(a-2*d)*s^2 + 1)
|
||||
altx = 2*s*cls.isoMagic/t
|
||||
if negative(altx): t = -t
|
||||
x = 2*s / (1+a*s^2)
|
||||
y = (1-a*s^2) / t
|
||||
|
||||
if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0):
|
||||
raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))
|
||||
|
||||
return cls(x,y)
|
||||
|
||||
def toJacobiQuartic(self,toggle_rotation=False,toggle_altx=False,toggle_s=False):
|
||||
"Return s,t on jacobi curve"
|
||||
a,d = self.a,self.d
|
||||
x,y,z,t = self.xyzt()
|
||||
|
||||
if self.cofactor == 8:
|
||||
# Cofactor 8 version
|
||||
# Simulate IMAGINE_TWIST because that's how libdecaf does it
|
||||
x = self.i*x
|
||||
t = self.i*t
|
||||
a = -a
|
||||
d = -d
|
||||
|
||||
# OK, the actual libdecaf code should be here
|
||||
num = (z+y)*(z-y)
|
||||
den = x*y
|
||||
isr = isqrt(num*(a-d)*den^2)
|
||||
|
||||
iden = isr * den * self.isoMagic # 1/sqrt((z+y)(z-y)) = 1/sqrt(1-Y^2) / z
|
||||
inum = isr * num # sqrt(1-Y^2) * z / xysqrt(a-d) ~ 1/sqrt(1-ax^2)/z
|
||||
|
||||
if negative(iden*inum*self.i*t^2*(d-a)) != toggle_rotation:
|
||||
iden,inum = inum,iden
|
||||
fac = x*sqrt(a)
|
||||
toggle=(a==-1)
|
||||
else:
|
||||
fac = y
|
||||
toggle=False
|
||||
|
||||
imi = self.isoMagic * self.i
|
||||
altx = inum*t*imi
|
||||
neg_altx = negative(altx) != toggle_altx
|
||||
if neg_altx != toggle: inum =- inum
|
||||
|
||||
tmp = fac*(inum*z + 1)
|
||||
s = iden*tmp*imi
|
||||
|
||||
negm1 = (negative(s) != toggle_s) != neg_altx
|
||||
if negm1: m1 = a*fac + z
|
||||
else: m1 = a*fac - z
|
||||
|
||||
swap = toggle_s
|
||||
|
||||
else:
|
||||
# Much simpler cofactor 4 version
|
||||
num = (x+t)*(x-t)
|
||||
isr = isqrt(num*(a-d)*x^2)
|
||||
ratio = isr*num
|
||||
altx = ratio*self.isoMagic
|
||||
|
||||
neg_altx = negative(altx) != toggle_altx
|
||||
if neg_altx: ratio =- ratio
|
||||
|
||||
tmp = ratio*z - t
|
||||
s = (a-d)*isr*x*tmp
|
||||
|
||||
negx = (negative(s) != toggle_s) != neg_altx
|
||||
if negx: m1 = -a*t + x
|
||||
else: m1 = -a*t - x
|
||||
|
||||
swap = toggle_s
|
||||
|
||||
if negative(s): s = -s
|
||||
|
||||
return s,m1,a*tmp,swap
|
||||
|
||||
def invertElligator(self,toggle_r=False,*args,**kwargs):
|
||||
"Produce preimage of self under elligator, or None"
|
||||
a,d = self.a,self.d
|
||||
|
||||
rets = []
|
||||
|
||||
tr = [False,True] if self.cofactor == 8 else [False]
|
||||
for toggle_rotation in tr:
|
||||
for toggle_altx in [False,True]:
|
||||
for toggle_s in [False,True]:
|
||||
for toggle_r in [False,True]:
|
||||
s,m1,m12,swap = self.toJacobiQuartic(toggle_rotation,toggle_altx,toggle_s)
|
||||
|
||||
#print
|
||||
#print toggle_rotation,toggle_altx,toggle_s
|
||||
#print m1
|
||||
#print m12
|
||||
|
||||
|
||||
if self == self.__class__():
|
||||
if self.cofactor == 4:
|
||||
# Hacks for identity!
|
||||
if toggle_altx: m12 = 1
|
||||
elif toggle_s: m1 = 1
|
||||
elif toggle_r: continue
|
||||
## BOTH???
|
||||
|
||||
else:
|
||||
m12 = 1
|
||||
imi = self.isoMagic * self.i
|
||||
if toggle_rotation:
|
||||
if toggle_altx: m1 = -imi
|
||||
else: m1 = +imi
|
||||
else:
|
||||
if toggle_altx: m1 = 0
|
||||
else: m1 = a-d
|
||||
|
||||
rnum = (d*a*m12-m1)
|
||||
rden = ((d*a-1)*m12+m1)
|
||||
if swap: rnum,rden = rden,rnum
|
||||
|
||||
ok,sr = isqrt_i(rnum*rden*self.qnr)
|
||||
if not ok: continue
|
||||
sr *= rnum
|
||||
#print "Works! %d %x" % (swap,sr)
|
||||
|
||||
if negative(sr) != toggle_r: sr = -sr
|
||||
ret = self.gfToBytes(sr)
|
||||
if self.elligator(ret) != self and self.elligator(ret) != -self:
|
||||
print "WRONG!",[toggle_rotation,toggle_altx,toggle_s]
|
||||
if self.elligator(ret) == -self and self != -self: print "Negated!",[toggle_rotation,toggle_altx,toggle_s]
|
||||
rets.append(bytes(ret))
|
||||
return rets
|
||||
|
||||
@optimized_version_of("encodeSpec")
|
||||
def encode(self):
|
||||
"""Encode, optimized version"""
|
||||
return self.gfToBytes(self.toJacobiQuartic()[0])
|
||||
|
||||
@classmethod
|
||||
@optimized_version_of("decodeSpec")
|
||||
def decode(cls,s):
|
||||
"""Decode, optimized version"""
|
||||
a,d = cls.a,cls.d
|
||||
s = cls.bytesToGf(s,mustBePositive=True)
|
||||
|
||||
#if s==0: return cls()
|
||||
s2 = s^2
|
||||
den = 1+a*s2
|
||||
num = den^2 - 4*d*s2
|
||||
isr = isqrt(num*den^2)
|
||||
altx = 2*s*isr*den*cls.isoMagic
|
||||
if negative(altx): isr = -isr
|
||||
x = 2*s *isr^2*den*num
|
||||
y = (1-a*s^2) * isr*den
|
||||
|
||||
if cls.cofactor==8 and (negative(x*y*cls.isoMagic) or y==0):
|
||||
raise InvalidEncodingException("x*y is invalid: %d, %d" % (x,y))
|
||||
|
||||
return cls(x,y)
|
||||
|
||||
@classmethod
|
||||
def fromJacobiQuartic(cls,s,t,sgn=1):
|
||||
"""Convert point from its Jacobi Quartic representation"""
|
||||
a,d = cls.a,cls.d
|
||||
if s==0: return cls()
|
||||
x = 2*s / (1+a*s^2)
|
||||
y = (1-a*s^2) / t
|
||||
return cls(x,sgn*y)
|
||||
|
||||
@optimized_version_of("doubleAndEncodeSpec")
def doubleAndEncode(self):
    """Optimized counterpart of ``doubleAndEncodeSpec``.

    Starts from the coordinates returned by ``self.xyzt()`` and returns
    ``self.gfToBytes(s, mustBePositive=True)`` for the computed ``s``.
    Curves with cofactor 8 and all other curves use different formulas.
    """
    X,Y,Z,T = self.xyzt()
    a = self.a
    d = self.d

    if self.cofactor != 8:
        xy = X*Y
        h = Z^2 - d*T^2
        inv = 1/(xy*h)
        if negative(inv*2*xy^2*self.isoMagic):
            tmp = Y
        else:
            tmp = X
        s = tmp^2*h*inv  # = X/Y or Y/X, interestingly
        return self.gfToBytes(s,mustBePositive=True)

    # Cofactor 8 version
    # Simulate IMAGINE_TWIST because that's how libdecaf does it
    X = self.i*X
    T = self.i*T
    a = -a
    d = -d
    # TODO: This is only being called for a=-1, so could
    # be wrong for a=1

    Y2 = Y^2
    aX2 = a*X^2
    e = 2*X*Y
    f = Y2 + aX2
    g = Y2 - aX2
    h = Z^2 - d*T^2

    eim = e*self.isoMagic
    inv = 1/(eim*g*f*h)
    fh_inv = eim*g*inv*self.i

    # Pick one of two sets of intermediates based on a sign test.
    if negative(eim*g*fh_inv):
        idf, bar, foo, test = g*self.isoMagic*self.i, f, g, eim*f
    else:
        idf, bar, foo, test = eim, h, -eim, g*h

    if negative(test*fh_inv):
        bar = -bar
    s = idf*(foo+bar)*inv*f*h

    return self.gfToBytes(s,mustBePositive=True)
|
||||
|
||||
@classmethod
def elligatorSpec(cls,r0,fromR=False):
    """Reference (spec) version of the Elligator map.

    When ``fromR`` is true, ``r0`` is used directly as r.  Otherwise r is
    ``cls.qnr`` times the square of ``r0`` parsed by ``cls.bytesToGf``
    (mustBeProper=False, maskHiBits=True).  Returns ``cls()`` when the
    denominator is zero, else ``cls.fromJacobiQuartic(s, t)``.
    """
    a,d = cls.a,cls.d
    if fromR:
        r = r0
    else:
        r = cls.qnr * cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)^2

    den = (d*r-(d-a))*((d-a)*r-d)
    if den == 0:
        return cls()

    n1 = (r+1)*(a-2*d)/den
    if is_square(n1):
        s = xsqrt(n1)
        t = -(r-1)*(a-2*d)^2 / den - 1
    else:
        # n1 is not a square: take the root of r*n1 instead.
        s = -xsqrt(r*n1)
        t = r*(r-1)*(a-2*d)^2 / den - 1

    return cls.fromJacobiQuartic(s,t)
|
||||
|
||||
@classmethod
@optimized_version_of("elligatorSpec")
def elligator(cls,r0):
    """Optimized counterpart of ``elligatorSpec``.

    Parses ``r0`` with ``cls.bytesToGf`` (mustBeProper=False,
    maskHiBits=True), derives (s, t) using a single ``isqrt_i`` call,
    and returns ``cls.fromJacobiQuartic(s, t)``.
    """
    a,d = cls.a,cls.d
    r0 = cls.bytesToGf(r0,mustBeProper=False,maskHiBits=True)
    r = cls.qnr * r0^2
    k = a - 2*d
    den = (d*r-(d-a))*((d-a)*r-d)
    num = (r+1)*k

    iss,isri = isqrt_i(num*den)
    if iss:
        sgn = 1
        twiddle = 1
    else:
        sgn = -1
        twiddle = r0*cls.qnr
    isri *= twiddle

    s = isri*num
    t = -sgn*isri*s*(r-1)*k^2 - 1
    if negative(s) == iss:
        s = -s
    return cls.fromJacobiQuartic(s,t)
|
||||
|
||||
def elligatorInverseBruteForce(self):
    """Invert Elligator using SAGE's polynomial solver.

    Returns a list of encodings ``e`` (from ``self.gfToBytes``) with
    ``self.elligator(e) == self``.  Asserts that every candidate found
    maps to ``self`` or ``-self``.
    """
    a,d = self.a,self.d
    R.<r0> = self.F[]
    r = self.qnr * r0^2
    den = (d*r-(d-a))*((d-a)*r-d)
    n1 = (r+1)*(a-2*d)/den
    n2 = r*n1

    # The two (s^2, t) candidates, as expressions in the unknown r0.
    branches = [(n1, -(r-1)*(a-2*d)^2 / den - 1),
                (n2,r*(r-1)*(a-2*d)^2 / den - 1)]

    found = set()
    for s2,t in branches:
        x2 = 4*s2/(1+a*s2)^2
        y = (1-a*s2) / t

        selfT = self
        for _ in xrange(self.cofactor/2):
            xT,yT = selfT
            polyX = xT^2-x2
            polyY = yT-y
            # Keep only the r0 values that solve both equations.
            rootsX = set(root for root,_mult in polyX.numerator().roots())
            rootsY = set(root for root,_mult in polyY.numerator().roots())
            found = found | (rootsX & rootsY)

            selfT = selfT.torque()

    encodings = [self.gfToBytes(root) for root in found]

    for enc in encodings:
        assert self.elligator(enc) in [self,-self]

    return [enc for enc in encodings if self.elligator(enc) == self]
|
||||
|
||||
class Ed25519Point(RistrettoPoint):
    """RistrettoPoint parameters over GF(2^255-19) with a = -1."""

    # Base field and curve constants.
    F = GF(2^255-19)
    d = F(-121665/121666)
    a = F(-1)

    # Square root of -1 in F; also used as qnr.
    i = sqrt(F(-1))
    mneg = F(1)
    qnr = i
    magic = isqrt(a*d-1)

    cofactor = 8
    encLen = 32

    @classmethod
    def base(cls):
        """Return the base point, given by explicit (x, y) coordinates."""
        x = 15112221349535400772501151409588531511454012693041857206046113283949847762202
        y = 46316835694926478169428394003475163141307993866256225615783033603165251855960
        return cls(x, y)
|
||||
|
||||
class NegEd25519Point(RistrettoPoint):
    """RistrettoPoint parameters over GF(2^255-19) with a = 1."""

    # Base field and curve constants.
    F = GF(2^255-19)
    d = F(121665/121666)
    a = F(1)

    # Square root of -1 in F; also used as qnr.
    i = sqrt(F(-1))
    mneg = F(-1) # TODO checkme vs 1-ad or whatever
    qnr = i
    magic = isqrt(a*d-1)

    cofactor = 8
    encLen = 32

    @classmethod
    def base(cls):
        """Return the base point with y = 4/5 and a non-negative x."""
        y = cls.F(4/5)
        x = sqrt((y^2-1)/(cls.d*y^2-cls.a))
        if negative(x):
            x = -x
        return cls(x,y)
|
||||
|
||||
class IsoEd448Point(RistrettoPoint):
    """RistrettoPoint parameters over GF(2^448-2^224-1) with a = 1."""

    # Base field and curve constants.
    F = GF(2^448-2^224-1)
    d = F(39082/39081)
    a = F(1)

    mneg = F(-1)
    qnr = -1
    magic = isqrt(a*d-1)

    cofactor = 4
    encLen = 56

    @classmethod
    def base(cls):
        """Return the base point, given by explicit (x, y) coordinates."""
        # RFC has it wrong
        x = 345397493039729516374008604150537410266655260075183290216406970281645695073672344430481787759340633221708391583424041788924124567700732
        y = -363419362147803445274661903944002267176820680343659030140745099590306164083365386343198191849338272965044442230921818680526749009182718
        return cls(x, y)
|
||||
|
||||
class TwistedEd448GoldilocksPoint(Decaf_1_1_Point):
    """Decaf_1_1_Point parameters over GF(2^448-2^224-1) with a = -1."""

    # Base field and curve constants.
    F = GF(2^448-2^224-1)
    d = F(-39082)
    a = F(-1)
    qnr = -1

    cofactor = 4
    encLen = 56

    # Reuses the constant computed on IsoEd448Point.
    isoMagic = IsoEd448Point.magic

    @classmethod
    def base(cls):
        """Return the base point, decoded from Ed448GoldilocksPoint's encoded base."""
        encoded = Ed448GoldilocksPoint.base().encodeSpec()
        return cls.decodeSpec(encoded)
|
||||
|
||||
class Ed448GoldilocksPoint(Decaf_1_1_Point):
    """Decaf_1_1_Point parameters over GF(2^448-2^224-1) with a = 1."""

    # Base field and curve constants.
    F = GF(2^448-2^224-1)
    d = F(-39081)
    a = F(1)
    qnr = -1

    cofactor = 4
    encLen = 56

    # Reuses the constant computed on IsoEd448Point.
    isoMagic = IsoEd448Point.magic

    @classmethod
    def base(cls):
        """Return the base point: twice the point with the explicit (x, y) below."""
        x = 224580040295924300187604334099896036246789641632564134246125461686950415467406032909029192869357953282578032075146446173674602635247710
        y = 298819210078481492676017930443930673437544040154080242095928241372331506189835876003536878655418784733982303233503462500531545062832660
        return 2*cls(x, y)
|
||||
|
||||
class IsoEd25519Point(Decaf_1_1_Point):
    """Decaf_1_1_Point parameters over GF(2^255-19) with a = 1."""

    # TODO: twisted iso too!
    # TODO: twisted iso might have to IMAGINE_TWIST or whatever

    # Base field and curve constants.
    F = GF(2^255-19)
    d = F(-121665)
    a = F(1)

    # Square root of -1 in F; also used as qnr.
    i = sqrt(F(-1))
    qnr = i
    magic = isqrt(a*d-1)

    cofactor = 8
    encLen = 32

    # Constants taken over from Ed25519Point.
    isoMagic = Ed25519Point.magic
    isoA = Ed25519Point.a

    @classmethod
    def base(cls):
        """Return the base point, decoded from Ed25519Point's encoded base."""
        encoded = Ed25519Point.base().encode()
        return cls.decodeSpec(encoded)
|
||||
|
||||
class TestFailedException(Exception):
    """Raised by the self-test routines when a consistency check fails."""
    pass
|
||||
|
||||
def test(cls,n):
    """Run round-trip and arithmetic self-checks on curve class ``cls``.

    First decodes and re-encodes a few special field values, ignoring
    those that are rejected as not on the curve or as invalid encodings.
    Then performs ``n`` iterations checking the encode/decode round trip,
    the behaviour of s -> 1/s, invariance of the encoding under
    ``torque()``, and addition, subtraction and scalar multiplication.
    Raises TestFailedException on the first failed check.
    """
    print "Testing curve %s" % cls.__name__

    # Special values: 1, then -1 followed by its successive square roots
    # for as long as the current value is still a square.
    specials = [1]
    ii = cls.F(-1)
    while is_square(ii):
        specials.append(ii)
        ii = sqrt(ii)
    specials.append(ii)

    for special in specials:
        if negative(cls.F(special)):
            special = -special
        encoded = enc_le(special,cls.encLen)
        try:
            Q = cls.decode(encoded)
            QE = Q.encode()
            if QE != encoded:
                raise TestFailedException("Round trip special %s != %s" %
                    (binascii.hexlify(QE),binascii.hexlify(encoded)))
        except (NotOnCurveException, InvalidEncodingException):
            pass

    P = cls.base()
    Q = cls()
    for _ in xrange(n):
        #print binascii.hexlify(Q.encode())
        QE = Q.encode()
        QQ = cls.decode(QE)
        if QQ != Q:
            raise TestFailedException("Round trip %s != %s" % (str(QQ),str(Q)))

        # Testing s -> 1/s: encodes -point on cofactor
        s = cls.bytesToGf(QE)
        if s != 0:
            ss = cls.gfToBytes(1/s,mustBePositive=True)
            try:
                QN = cls.decode(ss)
                if cls.cofactor == 8:
                    raise TestFailedException("1/s shouldnt work for cofactor 8")
                if QN != -Q:
                    raise TestFailedException("s -> 1/s should negate point for cofactor 4")
            except InvalidEncodingException:
                # Should be raised iff cofactor==8
                if cls.cofactor == 4:
                    raise TestFailedException("s -> 1/s should work for cofactor 4")

        # The encoding must not change under torque().
        QT = Q
        for h in xrange(cls.cofactor):
            QT = QT.torque()
            if QT.encode() != QE:
                raise TestFailedException("Can't torque %s,%d" % (str(Q),h+1))

        Q0 = Q + P
        if Q0 == Q:
            raise TestFailedException("Addition doesn't work")
        if Q0-P != Q:
            raise TestFailedException("Subtraction doesn't work")

        r = randint(1,1000)
        Q1 = Q0*r
        Q2 = Q0*(r+1)
        if Q1 + Q0 != Q2:
            raise TestFailedException("Scalarmul doesn't work")
        Q = Q1
|
||||
|
||||
def testElligator(cls,n):
    """Check Elligator inversion on ``n`` random inputs for ``cls``.

    Points that have no ``invertElligator`` attribute are skipped.
    Problems are printed rather than raised.
    """
    print "Testing elligator on %s" % cls.__name__
    for _ in xrange(n):
        r = randombytes(cls.encLen)
        P = cls.elligator(r)
        if not hasattr(P,"invertElligator"):
            continue # TODO

        iv = P.invertElligator()
        # r round-tripped through bytesToGf/gfToBytes, for comparison
        # against the returned inverses.
        modr = bytes(cls.gfToBytes(cls.bytesToGf(r,mustBeProper=False,maskHiBits=True)))
        iv2 = P.torque().invertElligator()

        if modr not in iv:
            print "Failed to invert Elligator!"
        distinct = len(set(iv))
        if len(iv) != distinct:
            print "Elligator inverses not unique!", distinct, len(iv)
        if iv != iv2:
            print "Elligator is untorqueable!"
            #print [binascii.hexlify(j) for j in iv]
            #print [binascii.hexlify(j) for j in iv2]
            #break
|
||||
|
||||
def gangtest(classes,n):
    """Check that all ``classes`` produce identical encodings.

    For i in 0..n-1 this compares the encoding of ``base()*i`` and of
    ``elligator(r0)*i`` across the classes, printing each class's output
    whenever they differ.  ``r0`` is taken from a list of special values
    first, then from ``randombytes``.
    """
    print "Gang test",[cls.__name__ for cls in classes]

    first = classes[0]

    # Special values: 1, then -1 followed by its successive square roots
    # for as long as the current value is still a square.
    specials = [1]
    ii = first.F(-1)
    while is_square(ii):
        specials.append(ii)
        ii = sqrt(ii)
    specials.append(ii)

    def report(template,idx,outputs):
        # Print every class's output when they do not all agree.
        if len(set(outputs)) != 1:
            print template % idx
            for c,ret in zip(classes,outputs):
                print c,binascii.hexlify(ret)
            print

    for i in xrange(n):
        rets = [bytes((cls.base()*i).encode()) for cls in classes]
        report("Divergence in encode at %d",i,rets)

        if i < len(specials):
            r0 = enc_le(specials[i],first.encLen)
        else:
            r0 = randombytes(first.encLen)

        rets = [bytes((cls.elligator(r0)*i).encode()) for cls in classes]
        report("Divergence in elligator at %d",i,rets)
|
||||
|
||||
def testDoubleAndEncode(cls,n):
    """Call ``doubleAndEncode`` on ``n`` sums of two Elligator points.

    The result is discarded; the check fails only if the call raises.
    """
    print "Testing doubleAndEncode on %s" % cls.__name__
    for _ in xrange(n):
        r1 = randombytes(cls.encLen)
        r2 = randombytes(cls.encLen)
        first = cls.elligator(r1)
        second = cls.elligator(r2)
        (first + second).doubleAndEncode()
|
||||
|
||||
# Run the doubleAndEncode check on each curve, 100 iterations apiece.
for _curve in [Ed25519Point,
               NegEd25519Point,
               IsoEd25519Point,
               IsoEd448Point,
               TwistedEd448GoldilocksPoint]:
    testDoubleAndEncode(_curve,100)

# The remaining suites are currently disabled.
#test(Ed25519Point,100)
#test(NegEd25519Point,100)
#test(IsoEd25519Point,100)
#test(IsoEd448Point,100)
#test(TwistedEd448GoldilocksPoint,100)
#test(Ed448GoldilocksPoint,100)
#testElligator(Ed25519Point,100)
#testElligator(NegEd25519Point,100)
#testElligator(IsoEd25519Point,100)
#testElligator(IsoEd448Point,100)
#testElligator(Ed448GoldilocksPoint,100)
#testElligator(TwistedEd448GoldilocksPoint,100)
#gangtest([IsoEd448Point,TwistedEd448GoldilocksPoint,Ed448GoldilocksPoint],100)
#gangtest([Ed25519Point,IsoEd25519Point],100)
|
File diff suppressed because one or more lines are too long
|
@ -1,169 +0,0 @@
|
|||
# Changelog
|
||||
|
||||
Entries are listed in reverse chronological order per undeprecated
|
||||
major series.
|
||||
|
||||
## 3.x series
|
||||
|
||||
### 3.1.0
|
||||
|
||||
* Add support for the Elligator2 encoding for Edwards points.
|
||||
* Add two optional formally-verified field arithmetic backends which
|
||||
use the Fiat Crypto project's Rust code, which is generated from
|
||||
proofs of functional correctness checked by the Coq theorem proving
|
||||
system.
|
||||
* Add support for additional sizes of precomputed tables for basepoint
|
||||
scalar multiplication.
|
||||
* Fix an unused import.
|
||||
* Add support for using the `zeroize` traits with all point types.
|
||||
Note that points are not automatically zeroized on Drop, but that
|
||||
consumers of `curve25519-dalek` should call these methods manually
|
||||
when needed.
|
||||
|
||||
### 3.0.2
|
||||
|
||||
* Fixes to make using alloc+no_std possible for stable Rust.
|
||||
|
||||
### 3.0.1
|
||||
|
||||
* Update the optional `packed-simd` dependency to rely on a newer,
|
||||
maintained version of the `packed-simd-2` crate.
|
||||
|
||||
### 3.0.0
|
||||
|
||||
* Update the `digest` dependency to `0.9`. This requires a major version
|
||||
because the `digest` traits are part of the public API, but there are
|
||||
otherwise no changes to the API.
|
||||
|
||||
## 2.x series
|
||||
|
||||
### 2.1.2
|
||||
|
||||
* Fixes to make using alloc+no_std possible for stable Rust.
|
||||
|
||||
### 2.1.1
|
||||
|
||||
* Update the optional `packed-simd` dependency to rely on a newer,
|
||||
maintained version of the `packed-simd-2` crate.
|
||||
|
||||
### 2.1.0
|
||||
|
||||
* Make `Scalar::from_bits` a `const fn`, allowing its use in `const` contexts.
|
||||
|
||||
### 2.0.0
|
||||
|
||||
* Fix a data modeling error in the `serde` feature pointed out by Trevor Perrin
|
||||
which caused points and scalars to be serialized with length fields rather
|
||||
than as fixed-size 32-byte arrays. This is a breaking change, but it fixes
|
||||
compatibility with `serde-json` and ensures that the `serde-bincode` encoding
|
||||
matches the conventional encoding for X/Ed25519.
|
||||
* Update `rand_core` to `0.5`, allowing use with new `rand` versions.
|
||||
* Switch from `clear_on_drop` to `zeroize` (by Tony Arcieri).
|
||||
* Require `subtle = ^2.2.1` and remove the note advising nightly Rust, which is
|
||||
no longer required as of that version of `subtle`. See the `subtle`
|
||||
changelog for more details.
|
||||
* Update `README.md` for `2.x` series.
|
||||
* Remove the `build.rs` hack which loaded the entire crate into its own
|
||||
`build.rs` to generate constants, and keep the constants in the source code.
|
||||
|
||||
The only significant change is the data model change to the `serde` feature;
|
||||
besides the `rand_core` version bump, there are no other user-visible changes.
|
||||
|
||||
## 1.x series
|
||||
|
||||
### 1.2.6
|
||||
|
||||
* Fixes to make using alloc+no_std possible for stable Rust.
|
||||
|
||||
### 1.2.5
|
||||
|
||||
* Update the optional `packed-simd` dependency to rely on a newer,
|
||||
maintained version of the `packed-simd-2` crate.
|
||||
|
||||
### 1.2.4
|
||||
|
||||
* Specify a semver bound for `clear_on_drop` rather than an exact version,
|
||||
addressing an issue where changes to inline assembly in rustc prevented
|
||||
`clear_on_drop` from working without an update.
|
||||
|
||||
### 1.2.3
|
||||
|
||||
* Fix an issue identified by a Quarkslab audit (and Jack Grigg), where manually
|
||||
constructing unreduced `Scalar` values, as needed for X/Ed25519, and then
|
||||
performing scalar/scalar arithmetic could compute incorrect results.
|
||||
* Switch to upstream Rust intrinsics for the IFMA backend now that they exist in
|
||||
Rust and don't need to be defined locally.
|
||||
* Ensure that the NAF computation works correctly, even for parameters never
|
||||
used elsewhere in the codebase.
|
||||
* Minor refactoring to EdwardsPoint decompression.
|
||||
* Fix broken links in documentation.
|
||||
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
|
||||
root (not quite correctly done in 1.2.2).
|
||||
|
||||
### 1.2.2
|
||||
|
||||
* Fix a typo in an internal doc-comment.
|
||||
* Add the "crypto" tag to crate metadata.
|
||||
* Fix compilation on nightly broken due to changes to the `#[doc(include)]` path
|
||||
root.
|
||||
|
||||
### 1.2.1
|
||||
|
||||
* Fix a bug in bucket index calculations in the Pippenger multiscalar algorithm
|
||||
for very large input sizes.
|
||||
* Add a more extensive randomized multiscalar multiplication consistency check
|
||||
to the test suite to prevent regressions.
|
||||
* Ensure that multiscalar and NAF computations work correctly on extremal
|
||||
`Scalar` values constructed via `from_bits`.
|
||||
|
||||
### 1.2.0
|
||||
|
||||
* New multiscalar multiplication algorithm with better performance for
|
||||
large problem sizes. The backend algorithm is selected
|
||||
transparently using the size hints of the input iterators, so no
|
||||
changes are required for client crates to start using it.
|
||||
* Equality of Edwards points is now checked in projective coordinates.
|
||||
* Serde can now be used with `no_std`.
|
||||
|
||||
### 1.1.4
|
||||
|
||||
* Fix typos in documentation comments.
|
||||
* Remove unnecessary `Default` bound on `Scalar::from_hash`.
|
||||
|
||||
### 1.1.3
|
||||
|
||||
* Reverts the change in 1.1.0 to allow owned and borrowed RNGs, which caused a breakage due to a subtle interaction with ownership rules. (The `RngCore` change is retained).
|
||||
|
||||
### 1.1.2
|
||||
|
||||
* Disabled KaTeX on `docs.rs` pending proper [support upstream](https://github.com/rust-lang/docs.rs/issues/302).
|
||||
|
||||
### 1.1.1
|
||||
|
||||
* Fixed an issue related to `#[cfg(rustdoc)]` which prevented documenting multiple backends.
|
||||
|
||||
### 1.1.0
|
||||
|
||||
* Adds support for precomputation for multiscalar multiplication.
|
||||
* Restructures the internal source tree into `serial` and `vector` backends (no change to external API).
|
||||
* Adds a new IFMA backend which sets speed records.
|
||||
* The `avx2_backend` feature is now an alias for the `simd_backend` feature, which autoselects an appropriate vector backend (currently AVX2 or IFMA).
|
||||
* Replaces the `rand` dependency with `rand_core`.
|
||||
* Generalizes trait bounds on `RistrettoPoint::random()` and `Scalar::random()` to allow owned and borrowed RNGs and to allow `RngCore` instead of `Rng`.
|
||||
|
||||
### 1.0.3
|
||||
|
||||
* Adds `ConstantTimeEq` implementation for compressed points.
|
||||
|
||||
### 1.0.2
|
||||
|
||||
* Fixes a typo in the naming of variables in Ristretto formulas (no change to functionality).
|
||||
|
||||
### 1.0.1
|
||||
|
||||
* Depends on the stable `2.0` version of `subtle` instead of `2.0.0-pre.0`.
|
||||
|
||||
### 1.0.0
|
||||
|
||||
Initial stable release. Yanked due to a dependency mistake (see above).
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# Code of Conduct
|
||||
|
||||
We follow the [Rust Code of Conduct](http://www.rust-lang.org/conduct.html),
|
||||
with the following additional clauses:
|
||||
|
||||
* We respect the rights to privacy and anonymity for contributors and people in
|
||||
the community. If someone wishes to contribute under a pseudonym different to
|
||||
their primary identity, that wish is to be respected by all contributors.
|
|
@ -1,19 +0,0 @@
|
|||
# Contributing to curve25519-dalek
|
||||
|
||||
If you have questions or comments, please feel free to email the
|
||||
authors.
|
||||
|
||||
For feature requests, suggestions, and bug reports, please open an issue on
|
||||
[our Github](https://github.com/dalek-cryptography/curve25519-dalek). (Or, send us
|
||||
an email if you're opposed to using Github for whatever reason.)
|
||||
|
||||
Patches are welcomed as pull requests on
|
||||
[our Github](https://github.com/dalek-cryptography/curve25519-dalek), as well as by
|
||||
email (preferably sent to all of the authors listed in `Cargo.toml`).
|
||||
|
||||
All issues on curve25519-dalek are mentored, if you want help with a bug just
|
||||
ask @isislovecruft or @hdevalence.
|
||||
|
||||
Some issues are easier than others. The `easy` label can be used to find the
|
||||
easy issues. If you want to work on an issue, please leave a comment so that we
|
||||
can assign it to you!
|
|
@ -1,97 +0,0 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies
|
||||
#
|
||||
# If you believe there's an error in this file please file an
|
||||
# issue against the rust-lang/cargo repository. If you're
|
||||
# editing this file be aware that the upstream Cargo.toml
|
||||
# will likely look very different (and much more reasonable)
|
||||
|
||||
[package]
|
||||
name = "curve25519-dalek"
|
||||
version = "3.1.0"
|
||||
authors = ["Isis Lovecruft <isis@patternsinthevoid.net>", "Henry de Valence <hdevalence@hdevalence.ca>"]
|
||||
exclude = ["**/.gitignore", ".gitignore", ".travis.yml"]
|
||||
description = "A pure-Rust implementation of group operations on ristretto255 and Curve25519"
|
||||
homepage = "https://dalek.rs/curve25519-dalek"
|
||||
documentation = "https://docs.rs/curve25519-dalek"
|
||||
readme = "README.md"
|
||||
keywords = ["cryptography", "crypto", "ristretto", "curve25519", "ristretto255"]
|
||||
categories = ["cryptography", "no-std"]
|
||||
license = "BSD-3-Clause"
|
||||
repository = "https://github.com/dalek-cryptography/curve25519-dalek"
|
||||
[package.metadata.docs.rs]
|
||||
features = ["nightly", "simd_backend"]
|
||||
|
||||
[[bench]]
|
||||
name = "dalek_benchmarks"
|
||||
harness = false
|
||||
[dependencies.byteorder]
|
||||
version = "^1.2.3"
|
||||
features = ["i128"]
|
||||
default-features = false
|
||||
|
||||
[dependencies.digest]
|
||||
version = "0.9"
|
||||
default-features = false
|
||||
|
||||
[dependencies.fiat-crypto]
|
||||
version = "0.1.6"
|
||||
optional = true
|
||||
|
||||
[dependencies.packed_simd]
|
||||
version = "0.3.4"
|
||||
features = ["into_bits"]
|
||||
optional = true
|
||||
package = "packed_simd_2"
|
||||
|
||||
[dependencies.rand_core]
|
||||
version = "0.5"
|
||||
default-features = false
|
||||
|
||||
[dependencies.serde]
|
||||
version = "1.0"
|
||||
features = ["derive"]
|
||||
optional = true
|
||||
default-features = false
|
||||
|
||||
[dependencies.subtle]
|
||||
version = "^2.2.1"
|
||||
default-features = false
|
||||
|
||||
[dependencies.zeroize]
|
||||
version = "1"
|
||||
default-features = false
|
||||
[dev-dependencies.bincode]
|
||||
version = "1"
|
||||
|
||||
[dev-dependencies.criterion]
|
||||
version = "0.3.0"
|
||||
|
||||
[dev-dependencies.hex]
|
||||
version = "0.4.2"
|
||||
|
||||
[dev-dependencies.rand]
|
||||
version = "0.7"
|
||||
|
||||
[dev-dependencies.sha2]
|
||||
version = "0.9"
|
||||
default-features = false
|
||||
|
||||
[features]
|
||||
alloc = ["zeroize/alloc"]
|
||||
avx2_backend = ["simd_backend"]
|
||||
default = ["std", "u64_backend"]
|
||||
fiat_u32_backend = ["fiat-crypto"]
|
||||
fiat_u64_backend = ["fiat-crypto"]
|
||||
nightly = ["subtle/nightly"]
|
||||
simd_backend = ["nightly", "u64_backend", "packed_simd"]
|
||||
std = ["alloc", "subtle/std", "rand_core/std"]
|
||||
u32_backend = []
|
||||
u64_backend = []
|
||||
[badges.travis-ci]
|
||||
branch = "master"
|
||||
repository = "dalek-cryptography/curve25519-dalek"
|
|
@ -1,65 +0,0 @@
|
|||
Copyright (c) 2016-2021 isis agora lovecruft. All rights reserved.
|
||||
Copyright (c) 2016-2021 Henry de Valence. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
========================================================================
|
||||
|
||||
Portions of curve25519-dalek were originally derived from Adam Langley's
|
||||
Go ed25519 implementation, found at <https://github.com/agl/ed25519/>,
|
||||
under the following licence:
|
||||
|
||||
========================================================================
|
||||
|
||||
Copyright (c) 2012 The Go Authors. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,8 +0,0 @@
|
|||
FEATURES := nightly yolocrypto avx2_backend
|
||||
|
||||
doc:
|
||||
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html
|
||||
|
||||
doc-internal:
|
||||
cargo rustdoc --features "$(FEATURES)" -- --html-in-header docs/assets/rustdoc-include-katex-header.html --document-private-items
|
||||
|
|
@ -1,220 +0,0 @@
|
|||
|
||||
# curve25519-dalek [![](https://img.shields.io/crates/v/curve25519-dalek.svg)](https://crates.io/crates/curve25519-dalek) [![](https://img.shields.io/badge/dynamic/json.svg?label=docs&uri=https%3A%2F%2Fcrates.io%2Fapi%2Fv1%2Fcrates%2Fcurve25519-dalek%2Fversions&query=%24.versions%5B0%5D.num&colorB=4F74A6)](https://doc.dalek.rs) [![](https://travis-ci.org/dalek-cryptography/curve25519-dalek.svg?branch=master)](https://travis-ci.org/dalek-cryptography/curve25519-dalek)
|
||||
|
||||
<img
|
||||
width="33%"
|
||||
align="right"
|
||||
src="https://doc.dalek.rs/assets/dalek-logo-clear.png"/>
|
||||
|
||||
**A pure-Rust implementation of group operations on Ristretto and Curve25519.**
|
||||
|
||||
`curve25519-dalek` is a library providing group operations on the Edwards and
|
||||
Montgomery forms of Curve25519, and on the prime-order Ristretto group.
|
||||
|
||||
`curve25519-dalek` is not intended to provide implementations of any particular
|
||||
crypto protocol. Rather, implementations of those protocols (such as
|
||||
[`x25519-dalek`][x25519-dalek] and [`ed25519-dalek`][ed25519-dalek]) should use
|
||||
`curve25519-dalek` as a library.
|
||||
|
||||
`curve25519-dalek` is intended to provide a clean and safe _mid-level_ API for use
|
||||
implementing a wide range of ECC-based crypto protocols, such as key agreement,
|
||||
signatures, anonymous credentials, rangeproofs, and zero-knowledge proof
|
||||
systems.
|
||||
|
||||
In particular, `curve25519-dalek` implements Ristretto, which constructs a
|
||||
prime-order group from a non-prime-order Edwards curve. This provides the
|
||||
speed and safety benefits of Edwards curve arithmetic, without the pitfalls of
|
||||
cofactor-related abstraction mismatches.
|
||||
|
||||
# Documentation
|
||||
|
||||
The semver-stable, public-facing `curve25519-dalek` API is documented
|
||||
[here][docs-external]. In addition, the unstable internal implementation
|
||||
details are documented [here][docs-internal].
|
||||
|
||||
The `curve25519-dalek` documentation requires a custom HTML header to include
|
||||
KaTeX for math support. Unfortunately `cargo doc` does not currently support
|
||||
this, but docs can be built using
|
||||
```sh
|
||||
make doc
|
||||
make doc-internal
|
||||
```
|
||||
|
||||
# Use
|
||||
|
||||
To import `curve25519-dalek`, add the following to the dependencies section of
|
||||
your project's `Cargo.toml`:
|
||||
```toml
|
||||
curve25519-dalek = "3"
|
||||
```
|
||||
|
||||
The sole breaking change in the `3.x` series was an update to the `digest`
|
||||
version, and in terms of non-breaking changes it includes:
|
||||
|
||||
* support for using `alloc` instead of `std` on stable Rust,
|
||||
* the Elligator2 encoding for Edwards points,
|
||||
* a fix to use `packed_simd_2`,
|
||||
* various documentation fixes and improvements,
|
||||
* support for configurably-sized, precomputed lookup tables for basepoint scalar
|
||||
multiplication,
|
||||
* two new formally-verified field arithmetic backends which use the Fiat Crypto
|
||||
Rust code, which is generated from proofs of functional correctness checked by
|
||||
the Coq theorem proving system, and
|
||||
* support for explicitly calling the `zeroize` traits for all point types.
|
||||
|
||||
The `2.x` series has API almost entirely unchanged from the `1.x` series,
|
||||
except that:
|
||||
|
||||
* an error in the data modeling for the (optional) `serde` feature was
|
||||
corrected, so that when the `2.x`-series `serde` implementation is used
|
||||
with `serde-bincode`, the derived serialization matches the usual X/Ed25519
|
||||
formats;
|
||||
* the `rand` version was updated.
|
||||
|
||||
See `CHANGELOG.md` for more details.
|
||||
|
||||
# Backends and Features
|
||||
|
||||
The `nightly` feature enables features available only when using a Rust nightly
|
||||
compiler. In particular, it is required for rendering documentation and for
|
||||
the SIMD backends.
|
||||
|
||||
Curve arithmetic is implemented using one of the following backends:
|
||||
|
||||
* a `u32` backend using serial formulas and `u64` products;
|
||||
* a `u64` backend using serial formulas and `u128` products;
|
||||
* an `avx2` backend using [parallel formulas][parallel_doc] and `avx2` instructions (sets speed records);
|
||||
* an `ifma` backend using [parallel formulas][parallel_doc] and `ifma` instructions (sets speed records);
|
||||
|
||||
By default the `u64` backend is selected. To select a specific backend, use:
|
||||
```sh
|
||||
cargo build --no-default-features --features "std u32_backend"
|
||||
cargo build --no-default-features --features "std u64_backend"
|
||||
# Requires nightly, RUSTFLAGS="-C target_feature=+avx2" to use avx2
|
||||
cargo build --no-default-features --features "std simd_backend"
|
||||
# Requires nightly, RUSTFLAGS="-C target_feature=+avx512ifma" to use ifma
|
||||
cargo build --no-default-features --features "std simd_backend"
|
||||
```
|
||||
Crates using `curve25519-dalek` can either select a backend on behalf of their
|
||||
users, or expose feature flags that control the `curve25519-dalek` backend.
|
||||
|
||||
The `std` feature is enabled by default, but it can be disabled for no-`std`
|
||||
builds using `--no-default-features`. Note that this requires explicitly
|
||||
selecting an arithmetic backend using one of the `_backend` features.
|
||||
If no backend is selected, compilation will fail.
|
||||
|
||||
# Safety
|
||||
|
||||
The `curve25519-dalek` types are designed to make illegal states
|
||||
unrepresentable. For example, any instance of an `EdwardsPoint` is
|
||||
guaranteed to hold a point on the Edwards curve, and any instance of a
|
||||
`RistrettoPoint` is guaranteed to hold a valid point in the Ristretto
|
||||
group.
|
||||
|
||||
All operations are implemented using constant-time logic (no
|
||||
secret-dependent branches, no secret-dependent memory accesses),
|
||||
unless specifically marked as being variable-time code.
|
||||
We believe that our constant-time logic is lowered to constant-time
|
||||
assembly, at least on `x86_64` targets.
|
||||
|
||||
As an additional guard against possible future compiler optimizations,
|
||||
the `subtle` crate places an optimization barrier before every
|
||||
conditional move or assignment. More details can be found in [the
|
||||
documentation for the `subtle` crate][subtle_doc].
|
||||
|
||||
Some functionality (e.g., multiscalar multiplication or batch
|
||||
inversion) requires heap allocation for temporary buffers. All
|
||||
heap-allocated buffers of potentially secret data are explicitly
|
||||
zeroed before release.
|
||||
|
||||
However, we do not attempt to zero stack data, for two reasons.
|
||||
First, it's not possible to do so correctly: we don't have control
|
||||
over stack allocations, so there's no way to know how much data to
|
||||
wipe. Second, because `curve25519-dalek` provides a mid-level API,
|
||||
the correct place to start zeroing stack data is likely not at the
|
||||
entrypoints of `curve25519-dalek` functions, but at the entrypoints of
|
||||
functions in other crates.
|
||||
|
||||
The implementation is memory-safe, and contains no significant
|
||||
`unsafe` code. The SIMD backend uses `unsafe` internally to call SIMD
|
||||
intrinsics. These are marked `unsafe` only because invoking them on an
|
||||
inappropriate CPU would cause `SIGILL`, but the entire backend is only
|
||||
compiled with appropriate `target_feature`s, so this cannot occur.
|
||||
|
||||
# Performance
|
||||
|
||||
Benchmarks are run using [`criterion.rs`][criterion]:
|
||||
|
||||
```sh
|
||||
cargo bench --no-default-features --features "std u32_backend"
|
||||
cargo bench --no-default-features --features "std u64_backend"
|
||||
# Uses avx2 or ifma only if compiled for an appropriate target.
|
||||
export RUSTFLAGS="-C target_cpu=native"
|
||||
cargo bench --no-default-features --features "std simd_backend"
|
||||
```
|
||||
|
||||
Performance is a secondary goal behind correctness, safety, and
|
||||
clarity, but we aim to be competitive with other implementations.
|
||||
|
||||
# FFI
|
||||
|
||||
Unfortunately, we have no plans to add FFI to `curve25519-dalek` directly. The
|
||||
reason is that we use Rust features to provide an API that maintains safety
|
||||
invariants, which are not possible to maintain across an FFI boundary. For
|
||||
instance, as described in the _Safety_ section above, invalid points are
|
||||
impossible to construct, and this would not be the case if we exposed point
|
||||
operations over FFI.
|
||||
|
||||
However, `curve25519-dalek` is designed as a *mid-level* API, aimed at
|
||||
implementing other, higher-level primitives. Instead of providing FFI at the
|
||||
mid-level, our suggestion is to implement the higher-level primitive (a
|
||||
signature, PAKE, ZKP, etc.) in Rust, using `curve25519-dalek` as a dependency,
|
||||
and have that crate provide a minimal, byte-buffer-oriented FFI specific to
|
||||
that primitive.
|
||||
|
||||
# Contributing
|
||||
|
||||
Please see [CONTRIBUTING.md][contributing].
|
||||
|
||||
Patches and pull requests should be made against the `develop`
|
||||
branch, **not** `master`.
|
||||
|
||||
# About
|
||||
|
||||
**SPOILER ALERT:** *The Twelfth Doctor's first encounter with the Daleks is in
|
||||
his second full episode, "Into the Dalek". A beleaguered ship of the "Combined
|
||||
Galactic Resistance" has discovered a broken Dalek that has turned "good",
|
||||
desiring to kill all other Daleks. The Doctor, Clara and a team of soldiers
|
||||
are miniaturized and enter the Dalek, which the Doctor names Rusty. They
|
||||
repair the damage, but accidentally restore it to its original nature, causing
|
||||
it to go on the rampage and alert the Dalek fleet to the whereabouts of the
|
||||
rebel ship. However, the Doctor manages to return Rusty to its previous state
|
||||
by linking his mind with the Dalek's: Rusty shares the Doctor's view of the
|
||||
universe's beauty, but also his deep hatred of the Daleks. Rusty destroys the
|
||||
other Daleks and departs the ship, determined to track down and bring an end
|
||||
to the Dalek race.*
|
||||
|
||||
`curve25519-dalek` is authored by Isis Agora Lovecruft and Henry de Valence.
|
||||
|
||||
Portions of this library were originally a port of [Adam Langley's
|
||||
Golang ed25519 library](https://github.com/agl/ed25519), which was in
|
||||
turn a port of the reference `ref10` implementation. Most of this code,
|
||||
including the 32-bit field arithmetic, has since been rewritten.
|
||||
|
||||
The fast `u32` and `u64` scalar arithmetic was implemented by Andrew Moon, and
|
||||
the addition chain for scalar inversion was provided by Brian Smith. The
|
||||
optimised batch inversion was contributed by Sean Bowe and Daira Hopwood.
|
||||
|
||||
The `no_std` and `zeroize` support was contributed by Tony Arcieri.
|
||||
|
||||
Thanks also to Ashley Hauck, Lucas Salibian, and Manish Goregaokar for their
|
||||
contributions.
|
||||
|
||||
[ed25519-dalek]: https://github.com/dalek-cryptography/ed25519-dalek
|
||||
[x25519-dalek]: https://github.com/dalek-cryptography/x25519-dalek
|
||||
[contributing]: https://github.com/dalek-cryptography/curve25519-dalek/blob/master/CONTRIBUTING.md
|
||||
[docs-external]: https://doc.dalek.rs/curve25519_dalek/
|
||||
[docs-internal]: https://doc-internal.dalek.rs/curve25519_dalek/
|
||||
[criterion]: https://github.com/japaric/criterion.rs
|
||||
[parallel_doc]: https://doc-internal.dalek.rs/curve25519_dalek/backend/vector/avx2/index.html
|
||||
[subtle_doc]: https://doc.dalek.rs/subtle/
|
|
@ -1,363 +0,0 @@
|
|||
#![allow(non_snake_case)]
|
||||
|
||||
extern crate rand;
|
||||
use rand::rngs::OsRng;
|
||||
use rand::thread_rng;
|
||||
|
||||
#[macro_use]
|
||||
extern crate criterion;
|
||||
|
||||
use criterion::measurement::Measurement;
|
||||
use criterion::BatchSize;
|
||||
use criterion::Criterion;
|
||||
use criterion::{BenchmarkGroup, BenchmarkId};
|
||||
|
||||
extern crate curve25519_dalek;
|
||||
|
||||
use curve25519_dalek::constants;
|
||||
use curve25519_dalek::scalar::Scalar;
|
||||
|
||||
/// Batch sizes swept by the batched benchmarks in this file (Ristretto
/// double-and-compress and batch scalar inversion).
static BATCH_SIZES: [usize; 5] = [1, 2, 4, 8, 16];
|
||||
/// Numbers of scalar/point pairs swept by the multiscalar multiplication
/// benchmarks in this file.
static MULTISCALAR_SIZES: [usize; 13] = [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768, 1024];
|
||||
|
||||
mod edwards_benches {
|
||||
use super::*;
|
||||
|
||||
use curve25519_dalek::edwards::EdwardsPoint;
|
||||
|
||||
fn compress(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_POINT;
|
||||
c.bench_function("EdwardsPoint compression", move |b| b.iter(|| B.compress()));
|
||||
}
|
||||
|
||||
fn decompress(c: &mut Criterion) {
|
||||
let B_comp = &constants::ED25519_BASEPOINT_COMPRESSED;
|
||||
c.bench_function("EdwardsPoint decompression", move |b| {
|
||||
b.iter(|| B_comp.decompress().unwrap())
|
||||
});
|
||||
}
|
||||
|
||||
fn consttime_fixed_base_scalar_mul(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_TABLE;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
c.bench_function("Constant-time fixed-base scalar mul", move |b| {
|
||||
b.iter(|| B * &s)
|
||||
});
|
||||
}
|
||||
|
||||
fn consttime_variable_base_scalar_mul(c: &mut Criterion) {
|
||||
let B = &constants::ED25519_BASEPOINT_POINT;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
c.bench_function("Constant-time variable-base scalar mul", move |b| {
|
||||
b.iter(|| B * s)
|
||||
});
|
||||
}
|
||||
|
||||
fn vartime_double_base_scalar_mul(c: &mut Criterion) {
|
||||
c.bench_function("Variable-time aA+bB, A variable, B fixed", |bench| {
|
||||
let mut rng = thread_rng();
|
||||
let A = &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE;
|
||||
bench.iter_batched(
|
||||
|| (Scalar::random(&mut rng), Scalar::random(&mut rng)),
|
||||
|(a, b)| EdwardsPoint::vartime_double_scalar_mul_basepoint(&a, &A, &b),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = edwards_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
compress,
|
||||
decompress,
|
||||
consttime_fixed_base_scalar_mul,
|
||||
consttime_variable_base_scalar_mul,
|
||||
vartime_double_base_scalar_mul,
|
||||
}
|
||||
}
|
||||
|
||||
mod multiscalar_benches {
|
||||
use super::*;
|
||||
|
||||
use curve25519_dalek::edwards::EdwardsPoint;
|
||||
use curve25519_dalek::edwards::VartimeEdwardsPrecomputation;
|
||||
use curve25519_dalek::traits::MultiscalarMul;
|
||||
use curve25519_dalek::traits::VartimeMultiscalarMul;
|
||||
use curve25519_dalek::traits::VartimePrecomputedMultiscalarMul;
|
||||
|
||||
fn construct_scalars(n: usize) -> Vec<Scalar> {
|
||||
let mut rng = thread_rng();
|
||||
(0..n).map(|_| Scalar::random(&mut rng)).collect()
|
||||
}
|
||||
|
||||
fn construct_points(n: usize) -> Vec<EdwardsPoint> {
|
||||
let mut rng = thread_rng();
|
||||
(0..n)
|
||||
.map(|_| &Scalar::random(&mut rng) * &constants::ED25519_BASEPOINT_TABLE)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn construct(n: usize) -> (Vec<Scalar>, Vec<EdwardsPoint>) {
|
||||
(construct_scalars(n), construct_points(n))
|
||||
}
|
||||
|
||||
fn consttime_multiscalar_mul<M: Measurement>(c: &mut BenchmarkGroup<M>) {
|
||||
for multiscalar_size in &MULTISCALAR_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new(
|
||||
"Constant-time variable-base multiscalar multiplication",
|
||||
*multiscalar_size,
|
||||
),
|
||||
&multiscalar_size,
|
||||
|b, &&size| {
|
||||
let points = construct_points(size);
|
||||
// This is supposed to be constant-time, but we might as well
|
||||
// rerandomize the scalars for every call just in case.
|
||||
b.iter_batched(
|
||||
|| construct_scalars(size),
|
||||
|scalars| EdwardsPoint::multiscalar_mul(&scalars, &points),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn vartime_multiscalar_mul<M: Measurement>(c: &mut BenchmarkGroup<M>) {
|
||||
for multiscalar_size in &MULTISCALAR_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new(
|
||||
"Variable-time variable-base multiscalar multiplication",
|
||||
*multiscalar_size,
|
||||
),
|
||||
&multiscalar_size,
|
||||
|b, &&size| {
|
||||
let points = construct_points(size);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels).
|
||||
b.iter_batched(
|
||||
|| construct_scalars(size),
|
||||
|scalars| EdwardsPoint::vartime_multiscalar_mul(&scalars, &points),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn vartime_precomputed_pure_static<M: Measurement>(c: &mut BenchmarkGroup<M>) {
|
||||
for multiscalar_size in &MULTISCALAR_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new(
|
||||
"Variable-time fixed-base multiscalar multiplication",
|
||||
&multiscalar_size,
|
||||
),
|
||||
&multiscalar_size,
|
||||
move |b, &&total_size| {
|
||||
let static_size = total_size;
|
||||
|
||||
let static_points = construct_points(static_size);
|
||||
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels).
|
||||
b.iter_batched(
|
||||
|| construct_scalars(static_size),
|
||||
|scalars| precomp.vartime_multiscalar_mul(&scalars),
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn vartime_precomputed_helper<M: Measurement>(
|
||||
c: &mut BenchmarkGroup<M>,
|
||||
dynamic_fraction: f64,
|
||||
) {
|
||||
for multiscalar_size in &MULTISCALAR_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new(
|
||||
"Variable-time mixed-base multiscalar multiplication ({:.0}pct dyn)",
|
||||
format!("({:.0}pct dyn)", 100.0 * dynamic_fraction),
|
||||
),
|
||||
&multiscalar_size,
|
||||
move |b, &&total_size| {
|
||||
let dynamic_size = ((total_size as f64) * dynamic_fraction) as usize;
|
||||
let static_size = total_size - dynamic_size;
|
||||
|
||||
let static_points = construct_points(static_size);
|
||||
let dynamic_points = construct_points(dynamic_size);
|
||||
let precomp = VartimeEdwardsPrecomputation::new(&static_points);
|
||||
// Rerandomize the scalars for every call to prevent
|
||||
// false timings from better caching (e.g., the CPU
|
||||
// cache lifts exactly the right table entries for the
|
||||
// benchmark into the highest cache levels). Timings
|
||||
// should be independent of points so we don't
|
||||
// randomize them.
|
||||
b.iter_batched(
|
||||
|| {
|
||||
(
|
||||
construct_scalars(static_size),
|
||||
construct_scalars(dynamic_size),
|
||||
)
|
||||
},
|
||||
|(static_scalars, dynamic_scalars)| {
|
||||
precomp.vartime_mixed_multiscalar_mul(
|
||||
&static_scalars,
|
||||
&dynamic_scalars,
|
||||
&dynamic_points,
|
||||
)
|
||||
},
|
||||
BatchSize::SmallInput,
|
||||
);
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn multiscalar_multiplications(c: &mut Criterion) {
|
||||
let mut group: BenchmarkGroup<_> = c.benchmark_group("Multiscalar muls");
|
||||
|
||||
consttime_multiscalar_mul(&mut group);
|
||||
vartime_multiscalar_mul(&mut group);
|
||||
vartime_precomputed_pure_static(&mut group);
|
||||
|
||||
let dynamic_fracs = [0.0, 0.2, 0.5];
|
||||
for frac in dynamic_fracs.iter() {
|
||||
vartime_precomputed_helper(&mut group, *frac);
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = multiscalar_benches;
|
||||
// Lower the sample size to run the benchmarks faster
|
||||
config = Criterion::default().sample_size(15);
|
||||
targets =
|
||||
multiscalar_multiplications,
|
||||
}
|
||||
}
|
||||
|
||||
mod ristretto_benches {
|
||||
use super::*;
|
||||
use curve25519_dalek::ristretto::RistrettoPoint;
|
||||
|
||||
fn compress(c: &mut Criterion) {
|
||||
c.bench_function("RistrettoPoint compression", |b| {
|
||||
let B = &constants::RISTRETTO_BASEPOINT_POINT;
|
||||
b.iter(|| B.compress())
|
||||
});
|
||||
}
|
||||
|
||||
fn decompress(c: &mut Criterion) {
|
||||
c.bench_function("RistrettoPoint decompression", |b| {
|
||||
let B_comp = &constants::RISTRETTO_BASEPOINT_COMPRESSED;
|
||||
b.iter(|| B_comp.decompress().unwrap())
|
||||
});
|
||||
}
|
||||
|
||||
fn double_and_compress_batch<M: Measurement>(c: &mut BenchmarkGroup<M>) {
|
||||
for batch_size in &BATCH_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new("Batch Ristretto double-and-encode", *batch_size),
|
||||
&batch_size,
|
||||
|b, &&size| {
|
||||
let mut rng = OsRng;
|
||||
let points: Vec<RistrettoPoint> = (0..size)
|
||||
.map(|_| RistrettoPoint::random(&mut rng))
|
||||
.collect();
|
||||
b.iter(|| RistrettoPoint::double_and_compress_batch(&points));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn double_and_compress_group(c: &mut Criterion) {
|
||||
let mut group: BenchmarkGroup<_> = c.benchmark_group("double & compress batched");
|
||||
double_and_compress_batch(&mut group);
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = ristretto_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
compress,
|
||||
decompress,
|
||||
double_and_compress_group,
|
||||
}
|
||||
}
|
||||
|
||||
mod montgomery_benches {
|
||||
use super::*;
|
||||
|
||||
fn montgomery_ladder(c: &mut Criterion) {
|
||||
c.bench_function("Montgomery pseudomultiplication", |b| {
|
||||
let B = constants::X25519_BASEPOINT;
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
b.iter(|| B * s);
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = montgomery_benches;
|
||||
config = Criterion::default();
|
||||
targets = montgomery_ladder,
|
||||
}
|
||||
}
|
||||
|
||||
mod scalar_benches {
|
||||
use super::*;
|
||||
|
||||
fn scalar_inversion(c: &mut Criterion) {
|
||||
c.bench_function("Scalar inversion", |b| {
|
||||
let s = Scalar::from(897987897u64).invert();
|
||||
b.iter(|| s.invert());
|
||||
});
|
||||
}
|
||||
|
||||
fn batch_scalar_inversion<M: Measurement>(c: &mut BenchmarkGroup<M>) {
|
||||
for batch_size in &BATCH_SIZES {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new("Batch scalar inversion", *batch_size),
|
||||
&batch_size,
|
||||
|b, &&size| {
|
||||
let mut rng = OsRng;
|
||||
let scalars: Vec<Scalar> =
|
||||
(0..size).map(|_| Scalar::random(&mut rng)).collect();
|
||||
b.iter(|| {
|
||||
let mut s = scalars.clone();
|
||||
Scalar::batch_invert(&mut s);
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn batch_scalar_inversion_group(c: &mut Criterion) {
|
||||
let mut group: BenchmarkGroup<_> = c.benchmark_group("batch scalar inversion");
|
||||
batch_scalar_inversion(&mut group);
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = scalar_benches;
|
||||
config = Criterion::default();
|
||||
targets =
|
||||
scalar_inversion,
|
||||
batch_scalar_inversion_group,
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark entry point: hands the `criterion_group!` groups from each
// module in this file to criterion's `criterion_main!` macro.
criterion_main!(
|
||||
scalar_benches::scalar_benches,
|
||||
montgomery_benches::montgomery_benches,
|
||||
ristretto_benches::ristretto_benches,
|
||||
edwards_benches::edwards_benches,
|
||||
multiscalar_benches::multiscalar_benches,
|
||||
);
|
Binary file not shown.
Before Width: | Height: | Size: 110 KiB |
Binary file not shown.
Before Width: | Height: | Size: 107 KiB |
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 59 KiB |
|
@ -1,10 +0,0 @@
|
|||
<link rel="stylesheet" href="https://doc.dalek.rs/assets/katex/katex.min.css">
|
||||
<script src="https://doc.dalek.rs/assets/katex/katex.min.js"></script>
|
||||
<script src="https://doc.dalek.rs/assets/katex/contrib/auto-render.min.js"></script>
|
||||
<script>
|
||||
document.addEventListener("DOMContentLoaded", function() { renderMathInElement(document.body); });
|
||||
</script>
|
||||
<style>
|
||||
.katex { font-size: 1em !important; }
|
||||
pre.rust, .docblock code, .docblock-short code { font-size: 0.85em !important; }
|
||||
</style>
|
|
@ -1,140 +0,0 @@
|
|||
An AVX2 implementation of the vectorized point operation strategy.
|
||||
|
||||
# Field element representation
|
||||
|
||||
Our strategy is to implement 4-wide multiplication and squaring by
|
||||
wordslicing, using one 64-bit AVX2 lane for each field element. Field
|
||||
elements are represented in the usual way as 10 `u32` limbs in radix
|
||||
\\(25.5\\) (i.e., alternating between \\(2\^{26}\\) for even limbs and
|
||||
\\(2\^{25}\\) for odd limbs). This has the effect that passing between
|
||||
the parallel 32-bit AVX2 representation and the serial 64-bit
|
||||
representation (which uses radix \\(2^{51}\\)) amounts to regrouping
|
||||
digits.
|
||||
|
||||
The field element representation is oriented around the AVX2
|
||||
`vpmuludq` instruction, which multiplies the low 32 bits of each
|
||||
64-bit lane of each operand to produce a 64-bit result.
|
||||
|
||||
```text,no_run
|
||||
(a1 ?? b1 ?? c1 ?? d1 ??)
|
||||
(a2 ?? b2 ?? c2 ?? d2 ??)
|
||||
|
||||
(a1*a2 b1*b2 c1*c2 d1*d2)
|
||||
```
|
||||
|
||||
To unpack 32-bit values into 64-bit lanes for use in multiplication
|
||||
it would be convenient to use the `vpunpck[lh]dq` instructions,
|
||||
which unpack and interleave the low and high 32-bit lanes of two
|
||||
source vectors.
|
||||
However, the AVX2 versions of these instructions are designed to
|
||||
operate only within 128-bit lanes of the 256-bit vectors, so that
|
||||
interleaving the low lanes of `(a0 b0 c0 d0 a1 b1 c1 d1)` with zero
|
||||
gives `(a0 00 b0 00 a1 00 b1 00)`. Instead, we pre-shuffle the data
|
||||
layout as `(a0 b0 a1 b1 c0 d0 c1 d1)` so that we can unpack the
|
||||
"low" and "high" parts as
|
||||
|
||||
```text,no_run
|
||||
(a0 00 b0 00 c0 00 d0 00)
|
||||
(a1 00 b1 00 c1 00 d1 00)
|
||||
```
|
||||
|
||||
The data layout for a vector of four field elements \\( (a,b,c,d)
|
||||
\\) with limbs \\( a_0, a_1, \ldots, a_9 \\) is as `[u32x8; 5]` in
|
||||
the form
|
||||
|
||||
```text,no_run
|
||||
(a0 b0 a1 b1 c0 d0 c1 d1)
|
||||
(a2 b2 a3 b3 c2 d2 c3 d3)
|
||||
(a4 b4 a5 b5 c4 d4 c5 d5)
|
||||
(a6 b6 a7 b7 c6 d6 c7 d7)
|
||||
(a8 b8 a9 b9 c8 d8 c9 d9)
|
||||
```
|
||||
|
||||
Since this breaks cleanly into two 128-bit lanes, it may be possible
|
||||
to adapt it to 128-bit vector instructions such as NEON without too
|
||||
much difficulty.
|
||||
|
||||
# Avoiding Overflow in Doubling
|
||||
|
||||
To analyze the size of the field element coefficients during the
|
||||
computations, we can parameterize the bounds on the limbs of each
|
||||
field element by \\( b \in \mathbb R \\) representing the excess bits
|
||||
above that limb's radix, so that each limb is bounded by either
|
||||
\\(2\^{25+b} \\) or \\( 2\^{26+b} \\), as appropriate.
|
||||
|
||||
The multiplication routine requires that its inputs are bounded with
|
||||
\\( b < 1.75 \\), in order to fit a multiplication by \\( 19 \\)
|
||||
into 32 bits. Since \\( \lg 19 < 4.25 \\), \\( 19x < 2\^{32} \\)
|
||||
when \\( x < 2\^{27.75} = 2\^{26 + 1.75} \\). However, this is only
|
||||
required for one of the inputs; the other can grow up to \\( b < 2.5
|
||||
\\).
|
||||
|
||||
In addition, the multiplication and squaring routines do not
|
||||
canonically reduce their outputs, but can leave some small uncarried
|
||||
excesses, so that their reduced outputs are bounded with
|
||||
\\( b < 0.007 \\).
|
||||
|
||||
The non-parallel portion of the doubling formulas is
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
|
||||
&\gets
|
||||
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
|
||||
\end{aligned}
|
||||
$$
|
||||
|
||||
Computing \\( (S\_5, S\_6, S\_8, S\_9 ) \\) as
|
||||
$$
|
||||
\begin{matrix}
|
||||
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
|
||||
+& S\_2 & & & S\_2 \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & 2p & 2p & 2p \\\\
|
||||
-& & S\_2 & S\_2 & \\\\
|
||||
-& & & & S\_4 \\\\
|
||||
=& S\_5 & S\_6 & S\_8 & S\_9
|
||||
\end{matrix}
|
||||
$$
|
||||
results in bit-excesses \\( < (1.01, 1.60, 2.33, 2.01)\\) for
|
||||
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
|
||||
are then
|
||||
$$
|
||||
\begin{aligned}
|
||||
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 2.01) \\\\
|
||||
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
|
||||
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
|
||||
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 2.01)
|
||||
\end{aligned}
|
||||
$$
|
||||
which are too large: it's not possible to arrange the multiplicands so
|
||||
that one vector has \\(b < 2.5\\) and the other has \\( b < 1.75 \\).
|
||||
However, if we flip the sign of \\( S\_4 = S\_0\^2 \\) during
|
||||
squaring, so that we output \\(S\_4' = -S\_4 \pmod p\\), then we can
|
||||
compute
|
||||
$$
|
||||
\begin{matrix}
|
||||
& S\_1 & S\_1 & S\_1 & S\_1 \\\\
|
||||
+& S\_2 & & & S\_2 \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & S\_3 & \\\\
|
||||
+& & & & S\_4' \\\\
|
||||
+& & 2p & 2p & \\\\
|
||||
-& & S\_2 & S\_2 & \\\\
|
||||
=& S\_5 & S\_6 & S\_8 & S\_9
|
||||
\end{matrix}
|
||||
$$
|
||||
resulting in bit-excesses \\( < (1.01, 1.60, 2.33, 1.60)\\) for
|
||||
\\( (S\_5, S\_6, S\_8, S\_9 ) \\). The products we want to compute
|
||||
are then
|
||||
$$
|
||||
\begin{aligned}
|
||||
X\_3 &\gets S\_8 S\_9 \leftrightarrow (2.33, 1.60) \\\\
|
||||
Y\_3 &\gets S\_5 S\_6 \leftrightarrow (1.01, 1.60) \\\\
|
||||
Z\_3 &\gets S\_8 S\_6 \leftrightarrow (2.33, 1.60) \\\\
|
||||
T\_3 &\gets S\_5 S\_9 \leftrightarrow (1.01, 1.60)
|
||||
\end{aligned}
|
||||
$$
|
||||
whose right-hand sides are all bounded with \\( b < 1.75 \\) and
|
||||
whose left-hand sides are all bounded with \\( b < 2.5 \\),
|
||||
so that we can avoid any intermediate reductions.
|
|
@ -1,580 +0,0 @@
|
|||
An AVX512-IFMA implementation of the vectorized point operation
|
||||
strategy.
|
||||
|
||||
# IFMA instructions
|
||||
|
||||
AVX512-IFMA is an extension to AVX-512 consisting of two instructions:
|
||||
|
||||
* `vpmadd52luq`: packed multiply of unsigned 52-bit integers and add
|
||||
the low 52 product bits to 64-bit accumulators;
|
||||
* `vpmadd52huq`: packed multiply of unsigned 52-bit integers and add
|
||||
the high 52 product bits to 64-bit accumulators;
|
||||
|
||||
These operate on 64-bit lanes of their source vectors, taking the low
|
||||
52 bits of each lane of each source vector, computing the 104-bit
|
||||
products of each pair, and then adding either the high or low 52 bits
|
||||
of the 104-bit products to the 64-bit lanes of the destination vector.
|
||||
The multiplication is performed internally by reusing circuitry for
|
||||
floating-point arithmetic. Although these instructions are part of
|
||||
AVX512, the AVX512VL (vector length) extension (present whenever IFMA
|
||||
is) allows using them with 512, 256, or 128-bit operands.
|
||||
|
||||
This provides a major advantage to vectorized integer operations:
|
||||
previously, vector operations could only use a \\(32 \times 32
|
||||
\rightarrow 64\\)-bit multiplier, while serial code could use a
|
||||
\\(64\times 64 \rightarrow 128\\)-bit multiplier.
|
||||
|
||||
## IFMA for big-integer multiplications
|
||||
|
||||
A detailed example of the intended use of the IFMA instructions can be
|
||||
found in a 2016 paper by Gueron and Krasnov, [_Accelerating Big
|
||||
Integer Arithmetic Using Intel IFMA Extensions_][2016_gueron_krasnov].
|
||||
The basic idea is that multiplication of large integers (such as 1024,
|
||||
2048, or more bits) can be performed as follows.
|
||||
|
||||
First, convert a “packed” 64-bit representation
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x'_0 + x'_1 2^{64} + x'_2 2^{128} + \cdots \\\\
|
||||
y &= y'_0 + y'_1 2^{64} + y'_2 2^{128} + \cdots
|
||||
\end{aligned}
|
||||
\\]
|
||||
into a “redundant” 52-bit representation
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{52} + x_2 2^{104} + \cdots \\\\
|
||||
y &= y_0 + y_1 2^{52} + y_2 2^{104} + \cdots
|
||||
\end{aligned}
|
||||
\\]
|
||||
with each \\(x_i, y_j\\) in a 64-bit lane.
|
||||
|
||||
Writing the product as \\(z = z_0 + z_1 2^{52} + z_2 2^{104} + \cdots\\),
|
||||
the “schoolbook” multiplication strategy gives
|
||||
\\[
|
||||
\begin{aligned}
|
||||
&z_0 &&=& x_0 & y_0 & & & & & & & & \\\\
|
||||
&z_1 &&=& x_1 & y_0 &+ x_0 & y_1 & & & & & & \\\\
|
||||
&z_2 &&=& x_2 & y_0 &+ x_1 & y_1 &+ x_0 & y_2 & & & & \\\\
|
||||
&z_3 &&=& x_3 & y_0 &+ x_2 & y_1 &+ x_1 & y_2 &+ x_0 & y_3 & & \\\\
|
||||
&z_4 &&=& \vdots\\;&\\;\vdots &+ x_3 & y_1 &+ x_2 & y_2 &+ x_1 & y_3 &+ \cdots& \\\\
|
||||
&z_5 &&=& & & \vdots\\;&\\;\vdots &+ x_3 & y_2 &+ x_2 & y_3 &+ \cdots& \\\\
|
||||
&z_6 &&=& & & & & \vdots\\;&\\;\vdots &+ x_3 & y_3 &+ \cdots& \\\\
|
||||
&z_7 &&=& & & & & & & \vdots\\;&\\;\vdots &+ \cdots& \\\\
|
||||
&\vdots&&=& & & & & & & & & \ddots& \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Notice that the product coefficient \\(z_k\\), representing the value
|
||||
\\(z_k 2^{52k}\\), is the sum of all product terms
|
||||
\\(
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
\\)
|
||||
with \\(k = i + j\\).
|
||||
Write the IFMA operators \\(\mathrm{lo}(a,b)\\), denoting the low
|
||||
\\(52\\) bits of \\(ab\\), and
|
||||
\\(\mathrm{hi}(a,b)\\), denoting the high \\(52\\) bits of
|
||||
\\(ab\\).
|
||||
Now we can rewrite the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
&=
|
||||
2^{52 (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
This means that the low half of \\(x_i y_j\\) can be accumulated onto
|
||||
the product limb \\(z_{i+j}\\) and the high half can be directly
|
||||
accumulated onto the next-higher product limb \\(z_{i+j+1}\\) with no
|
||||
additional operations. This allows rewriting the schoolbook
|
||||
multiplication into the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
&z_0 &&=& \mathrm{lo}(x_0,&y_0) & & & & & & & & & & \\\\
|
||||
&z_1 &&=& \mathrm{lo}(x_1,&y_0) &+\mathrm{hi}(x_0,&y_0) &+\mathrm{lo}(x_0,&y_1) & & & & & & \\\\
|
||||
&z_2 &&=& \mathrm{lo}(x_2,&y_0) &+\mathrm{hi}(x_1,&y_0) &+\mathrm{lo}(x_1,&y_1) &+\mathrm{hi}(x_0,&y_1) &+\mathrm{lo}(x_0,&y_2) & & \\\\
|
||||
&z_3 &&=& \mathrm{lo}(x_3,&y_0) &+\mathrm{hi}(x_2,&y_0) &+\mathrm{lo}(x_2,&y_1) &+\mathrm{hi}(x_1,&y_1) &+\mathrm{lo}(x_1,&y_2) &+ \cdots& \\\\
|
||||
&z_4 &&=& \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_0) &+\mathrm{lo}(x_3,&y_1) &+\mathrm{hi}(x_2,&y_1) &+\mathrm{lo}(x_2,&y_2) &+ \cdots& \\\\
|
||||
&z_5 &&=& & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+\mathrm{hi}(x_3,&y_1) &+\mathrm{lo}(x_3,&y_2) &+ \cdots& \\\\
|
||||
&z_6 &&=& & & & & & & \vdots\\;&\\;\vdots & \vdots\\;&\\;\vdots &+ \cdots& \\\\
|
||||
&\vdots&&=& & & & & & & & & & & \ddots& \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Gueron and Krasnov implement multiplication by constructing vectors
|
||||
out of the columns of this diagram, so that the source operands for
|
||||
the IFMA instructions are of the form \\((x_0, x_1, x_2, \ldots)\\)
|
||||
and \\((y_i, y_i, y_i, \ldots)\\).
|
||||
After performing the multiplication,
|
||||
the product terms \\(z_i\\) are then repacked into a 64-bit representation.
|
||||
|
||||
## An alternative strategy
|
||||
|
||||
The strategy described above is aimed at big-integer multiplications,
|
||||
such as 1024, 2048, or 4096 bits, which would be used for applications
|
||||
like RSA. However, elliptic curve cryptography uses much smaller field
|
||||
sizes, such as 256 or 384 bits, so a different strategy is needed.
|
||||
|
||||
The parallel Edwards formulas provide parallelism at the level of the
|
||||
formulas for curve operations. This means that instead of scanning
|
||||
through the terms of the source operands and parallelizing *within* a
|
||||
field element (as described above), we can arrange the computation in
|
||||
product-scanning form and parallelize *across* field elements (as
|
||||
described below).
|
||||
|
||||
The parallel Edwards
|
||||
formulas provide 4-way parallelism, so they can be implemented using
|
||||
256-bit vectors using a single 64-bit lane for each element, or using
|
||||
512-bit vectors using two 64-bit lanes.
|
||||
The only available CPU supporting IFMA (the
|
||||
i3-8121U) executes 512-bit IFMA instructions at half rate compared to
|
||||
256-bit instructions, so for now there's no throughput advantage to
|
||||
using 512-bit IFMA instructions, and this implementation uses 256-bit
|
||||
vectors.
|
||||
|
||||
To extend this to 512-bit vectors, it's only necessary to achieve
|
||||
2-way parallelism, and it's possible (with a small amount of overhead)
|
||||
to create a hybrid strategy that operates entirely within 128-bit
|
||||
lanes. This means that cross-lane operations can use the faster
|
||||
`vpshufd` (1c latency) instead of a general shuffle instruction (3c
|
||||
latency).
|
||||
|
||||
# Choice of radix
|
||||
|
||||
The inputs to IFMA instructions are 52 bits wide, so the radix \\(r\\)
|
||||
used to represent a multiprecision integer must be \\( r \leq 52 \\).
|
||||
The obvious choice is the "native" radix \\(r = 52\\).
|
||||
|
||||
As described above, this choice
|
||||
has the advantage that for \\(x_i, y_j \in [0,2^{52})\\), the product term
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{52 i}) (y_j 2^{52 j})
|
||||
&=
|
||||
2^{52 (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{52 (i+j)} +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52 (i+j+1)},
|
||||
\end{aligned}
|
||||
\\]
|
||||
so that the low and high halves of the product can be directly accumulated
|
||||
onto the product limbs.
|
||||
In contrast, when using a smaller radix \\(r = 52 - k\\),
|
||||
the product term has the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
(x_i 2^{r i}) (y_j 2^{r j})
|
||||
&=
|
||||
2^{r (i+j)}(
|
||||
\mathrm{lo}(x_i, y_j) +
|
||||
\mathrm{hi}(x_i, y_j) 2^{52}
|
||||
)
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(x_i, y_j) 2^{r (i+j)} +
|
||||
(
|
||||
\mathrm{hi}(x_i, y_j) 2^k
|
||||
)
|
||||
2^{r (i+j+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
What's happening is that the product \\(x_i y_j\\) of size \\(2r\\)
|
||||
bits is split not at \\(r\\) but at \\(52\\), so \\(k\\) product bits
|
||||
are placed into the low half instead of the high half. This means
|
||||
that the high half of the product cannot be directly accumulated onto
|
||||
\\(z_{i+j+1}\\), but must first be multiplied by \\(2^k\\) (i.e., left
|
||||
shifted by \\(k\\)). In addition, the low half of the product is
|
||||
\\(52\\) bits large instead of \\(r\\) bits.
|
||||
|
||||
## Handling offset product terms
|
||||
|
||||
[Drucker and Gueron][2018_drucker_gueron] analyze the choice of radix
|
||||
in the context of big-integer squaring, outlining three ways to handle
|
||||
the offset product terms, before concluding that all of them are
|
||||
suboptimal:
|
||||
|
||||
1. Shift the results after accumulation;
|
||||
2. Shift the input operands before multiplication;
|
||||
3. Split the MAC operation, accumulating into a zeroed register,
|
||||
shifting the result, and then adding.
|
||||
|
||||
The first option is rejected because it could double-shift some
|
||||
previously accumulated terms, the second doesn't work because the
|
||||
inputs could become larger than \\(52\\) bits, and the third requires
|
||||
additional instructions to handle the shifting and adding.
|
||||
|
||||
Based on an analysis of total number of instructions, they suggest an
|
||||
addition to the instruction set, which they call `FMSA` (fused
|
||||
multiply-shift-add). This would shift the result according to an 8-bit
|
||||
immediate value before accumulating it into the destination register.
|
||||
|
||||
However, this change to the instruction set doesn't seem to be
|
||||
necessary. Instead, the product terms can be grouped according to
|
||||
their coefficients, accumulated together, then shifted once before
|
||||
adding them to the final sum. This uses an extra register, shift, and
|
||||
add, but only once per product term (accumulation target), not once
|
||||
per source term (as in the Drucker-Gueron paper).
|
||||
|
||||
Moreover, because IFMA instructions execute only on two ports
|
||||
(presumably 0 and 1), while adds and shifts can execute on three ports
|
||||
(0, 1, and 5), the adds and shifts can execute independently of the
|
||||
IFMA operations, as long as there is not too much pressure on port 5.
|
||||
This means that, although the total number of instructions increases,
|
||||
the shifts and adds do not necessarily increase the execution time, as
|
||||
long as throughput is limited by IFMA operations.
|
||||
|
||||
Finally, because IFMA instructions have 4 cycle latency and 0.5/1
|
||||
cycle throughput (for 256/512 bit vectors), maximizing IFMA throughput
|
||||
requires either 8 (for 256) or 4 (for 512) independent operations. So
|
||||
accumulating groups of terms independently before adding them at the
|
||||
end may be necessary anyways, in order to prevent long chains of
|
||||
dependent instructions.
|
||||
|
||||
## Advantages of a smaller radix
|
||||
|
||||
Using a smaller radix has other advantages. Although radix \\(52\\)
|
||||
is an unsaturated representation from the point of view of the
|
||||
\\(64\\)-bit accumulators (because up to 4096 product terms can be
|
||||
accumulated without carries), it's a saturated representation from the
|
||||
point of view of the multiplier (since \\(52\\)-bit values are the
|
||||
maximum input size).
|
||||
|
||||
Because the inputs to a multiplication must have all of their limbs
|
||||
bounded by \\(2^{52}\\), limbs in excess of \\(2^{52}\\) must be
|
||||
reduced before they can be used as an input. The
|
||||
[Gueron-Krasnov][2016_gueron_krasnov] paper suggests normalizing
|
||||
values using a standard, sequential carry chain: for each limb, add
|
||||
the carryin from reducing the previous limb, compute the carryout and
|
||||
reduce the current limb, then move to the next limb.
|
||||
|
||||
However, when using a smaller radix, such as \\(51\\), each limb can
|
||||
store a carry bit and still be used as the input to a multiplication.
|
||||
This means that the inputs do not need to be normalized, and instead
|
||||
of using a sequential carry chain, we can compute all carryouts in
|
||||
parallel, reduce all limbs in parallel, and then add the carryins in
|
||||
parallel (possibly growing the limb values by one bit).
|
||||
|
||||
Because the output of this partial reduction is an acceptable
|
||||
multiplication input, we can "close the loop" using partial reductions
|
||||
and never have to normalize to a canonical representation through the
|
||||
entire computation, in contrast to the Gueron-Krasnov approach, which
|
||||
converts back to a packed representation after every operation. (This
|
||||
idea seems to trace back to at least as early as [this 1999
|
||||
paper][1999_walter]).
|
||||
|
||||
Using \\(r = 51\\) is enough to keep a carry bit in each limb and
|
||||
avoid normalizations. What about an even smaller radix? One reason
|
||||
to choose a smaller radix would be to align the limb boundaries with
|
||||
an inline reduction (for instance, choosing \\(r = 43\\) for the
|
||||
Mersenne field \\(p = 2^{127} - 1\\)), but for \\(p = 2^{255} - 19\\),
|
||||
\\(r = 51 = 255/5\\) is the natural choice.
|
||||
|
||||
# Multiplication
|
||||
|
||||
The inputs to a multiplication are two field elements
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204} \\\\
|
||||
y &= y_0 + y_1 2^{51} + y_2 2^{102} + y_3 2^{153} + y_4 2^{204},
|
||||
\end{aligned}
|
||||
\\]
|
||||
with limbs in range \\([0,2^{52})\\).
|
||||
|
||||
Writing the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
|
||||
\end{aligned}
|
||||
\\]
|
||||
a schoolbook multiplication in product scanning form takes the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= x_0 y_0 \\\\
|
||||
z_1 &= x_1 y_0 + x_0 y_1 \\\\
|
||||
z_2 &= x_2 y_0 + x_1 y_1 + x_0 y_2 \\\\
|
||||
z_3 &= x_3 y_0 + x_2 y_1 + x_1 y_2 + x_0 y_3 \\\\
|
||||
z_4 &= x_4 y_0 + x_3 y_1 + x_2 y_2 + x_1 y_3 + x_0 y_4 \\\\
|
||||
z_5 &= x_4 y_1 + x_3 y_2 + x_2 y_3 + x_1 y_4 \\\\
|
||||
z_6 &= x_4 y_2 + x_3 y_3 + x_2 y_4 \\\\
|
||||
z_7 &= x_4 y_3 + x_3 y_4 \\\\
|
||||
z_8 &= x_4 y_4 \\\\
|
||||
z_9 &= 0 \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Each term \\(x_i y_j\\) can be written in terms of IFMA operations as
|
||||
\\[
|
||||
x_i y_j = \mathrm{lo}(x_i,y_j) + 2\mathrm{hi}(x_i,y_j)2^{51}.
|
||||
\\]
|
||||
Substituting this equation into the schoolbook multiplication, then
|
||||
moving terms to eliminate the \\(2^{51}\\) factors gives
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= \mathrm{lo}(x_0, y_0) \\\\
|
||||
&+ \qquad 0 \\\\
|
||||
z_1 &= \mathrm{lo}(x_1, y_0) + \mathrm{lo}(x_0, y_1) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_0, y_0) )\\\\
|
||||
z_2 &= \mathrm{lo}(x_2, y_0) + \mathrm{lo}(x_1, y_1) + \mathrm{lo}(x_0, y_2) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_1, y_0) + \mathrm{hi}(x_0, y_1) )\\\\
|
||||
z_3 &= \mathrm{lo}(x_3, y_0) + \mathrm{lo}(x_2, y_1) + \mathrm{lo}(x_1, y_2) + \mathrm{lo}(x_0, y_3) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_2, y_0) + \mathrm{hi}(x_1, y_1) + \mathrm{hi}(x_0, y_2) )\\\\
|
||||
z_4 &= \mathrm{lo}(x_4, y_0) + \mathrm{lo}(x_3, y_1) + \mathrm{lo}(x_2, y_2) + \mathrm{lo}(x_1, y_3) + \mathrm{lo}(x_0, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_3, y_0) + \mathrm{hi}(x_2, y_1) + \mathrm{hi}(x_1, y_2) + \mathrm{hi}(x_0, y_3) )\\\\
|
||||
z_5 &= \mathrm{lo}(x_4, y_1) + \mathrm{lo}(x_3, y_2) + \mathrm{lo}(x_2, y_3) + \mathrm{lo}(x_1, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_0) + \mathrm{hi}(x_3, y_1) + \mathrm{hi}(x_2, y_2) + \mathrm{hi}(x_1, y_3) + \mathrm{hi}(x_0, y_4) )\\\\
|
||||
z_6 &= \mathrm{lo}(x_4, y_2) + \mathrm{lo}(x_3, y_3) + \mathrm{lo}(x_2, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_1) + \mathrm{hi}(x_3, y_2) + \mathrm{hi}(x_2, y_3) + \mathrm{hi}(x_1, y_4) )\\\\
|
||||
z_7 &= \mathrm{lo}(x_4, y_3) + \mathrm{lo}(x_3, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_2) + \mathrm{hi}(x_3, y_3) + \mathrm{hi}(x_2, y_4) )\\\\
|
||||
z_8 &= \mathrm{lo}(x_4, y_4) \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_3) + \mathrm{hi}(x_3, y_4) )\\\\
|
||||
z_9 &= 0 \\\\
|
||||
&+ \qquad 2( \mathrm{hi}(x_4, y_4) )\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
As noted above, our strategy will be to multiply and accumulate the
|
||||
terms with coefficient \\(2\\) separately from those with coefficient
|
||||
\\(1\\), before combining them at the end. This can alternately be
|
||||
thought of as accumulating product terms into a *doubly-redundant*
|
||||
representation, with two limbs for each digit, before collapsing
|
||||
the doubly-redundant representation by shifts and adds.
|
||||
|
||||
This computation requires 25 `vpmadd52luq` and 25 `vpmadd52huq`
|
||||
operations. For 256-bit vectors, IFMA operations execute on an
|
||||
i3-8121U with latency 4 cycles, throughput 0.5 cycles, so executing 50
|
||||
instructions requires 25 cycles' worth of throughput. Accumulating
|
||||
terms with coefficient \\(1\\) and \\(2\\) separately means that the
|
||||
longest dependency chain has length 5, so the critical path has length
|
||||
20 cycles and the bottleneck is throughput.
|
||||
|
||||
# Reduction modulo \\(p\\)
|
||||
|
||||
The next question is how to handle the reduction modulo \\(p\\).
|
||||
Because \\(p = 2^{255} - 19\\), \\(2^{255} = 19 \pmod p\\), so we can
|
||||
alternately write
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459}
|
||||
\end{aligned}
|
||||
\\]
|
||||
as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= (z_0 + 19z_5) + (z_1 + 19z_6) 2^{51} + (z_2 + 19z_7) 2^{102} + (z_3 + 19z_8) 2^{153} + (z_4 + 19z_9) 2^{204}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
When using a \\(64 \times 64 \rightarrow 128\\)-bit multiplier, this
|
||||
can be handled (as in [Ed25519][ed25519_paper]) by premultiplying
|
||||
source terms by \\(19\\). Since \\(\lg(19) < 4.25\\), this increases
|
||||
their size by less than \\(4.25\\) bits, and the rest of the
|
||||
multiplication can be shown to work out.
|
||||
|
||||
Here, we have at most \\(1\\) bit of headroom. In order to allow
|
||||
premultiplication, we would need to use radix \\(2^{47}\\), which
|
||||
would require six limbs instead of five. Instead, we compute the high
|
||||
terms \\(z_5, \ldots, z_9\\), each using two chains of IFMA
|
||||
operations, then multiply by \\(19\\) and combine with the lower terms
|
||||
\\(z_0, \ldots, z_4\\). There are two ways to perform the
|
||||
multiplication by \\(19\\): using more IFMA operations, or using the
|
||||
`vpmullq` instruction, which computes the low \\(64\\) bits of a \\(64
|
||||
\times 64\\)-bit product. However, `vpmullq` has 15c/1.5c
|
||||
latency/throughput, in contrast to the 4c/0.5c latency/throughput of
|
||||
IFMA operations, so it seems like a worse choice.
|
||||
|
||||
The high terms \\(z_5, \ldots, z_9\\) are sums of \\(52\\)-bit terms,
|
||||
so they are larger than \\(52\\) bits. Write these terms in radix \\(52\\) as
|
||||
\\[
|
||||
z_{5+i} = z_{5+i}' + z_{5+i}'' 2^{52}, \qquad z_{5+i}' < 2^{52}.
|
||||
\\]
|
||||
Then the contribution of \\(z_{5+i}\\), taken modulo \\(p\\), is
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{5+i} 2^{255} 2^{51 i}
|
||||
&=
|
||||
19 (z_{5+i}' + z_{5+i}'' 2^{52}) 2^{51 i}
|
||||
\\\\
|
||||
&=
|
||||
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
|
||||
\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
The products \\(19 z_{5+i}', 19 z_{5+i}''\\) can be written in terms of IFMA operations as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
19 z_{5+i}' &= \mathrm{lo}(19, z_{5+i}') + 2 \mathrm{hi}(19, z_{5+i}') 2^{51}, \\\\
|
||||
19 z_{5+i}'' &= \mathrm{lo}(19, z_{5+i}'') + 2 \mathrm{hi}(19, z_{5+i}'') 2^{51}. \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
Because \\(z_{5+i} < 2^{64}\\), \\(z_{5+i}'' < 2^{12} \\), so \\(19
|
||||
z_{5+i}'' < 2^{17} < 2^{52} \\) and \\(\mathrm{hi}(19, z_{5+i}'') = 0\\).
|
||||
Because IFMA operations ignore the high bits of their source
|
||||
operands, we do not need to compute \\(z\_{5+i}'\\) explicitly:
|
||||
the high bits will be ignored.
|
||||
Combining these observations, we can write
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{5+i} 2^{255} 2^{51 i}
|
||||
&=
|
||||
19 z_{5+i}' 2^{51 i} + 2 \cdot 19 z_{5+i}'' 2^{51 (i+1)}
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{5+i}) 2^{51 i}
|
||||
\+ 2 \mathrm{hi}(19, z_{5+i}) 2^{51 (i+1)}
|
||||
\+ 2 \mathrm{lo}(19, z_{5+i}/2^{52}) 2^{51 (i+1)}.
|
||||
\end{aligned}
|
||||
\\]
|
||||
|
||||
For \\(i = 0,1,2,3\\), this allows reducing \\(z_{5+i}\\) onto
|
||||
\\(z_{i}, z_{i+1}\\), and if the low terms are computed using a
|
||||
doubly-redundant representation, no additional shifts are needed to
|
||||
handle the \\(2\\) coefficients. For \\(i = 4\\), there's a
|
||||
complication: the contribution becomes
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_{9} 2^{255} 2^{204}
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2 \mathrm{hi}(19, z_{9}) 2^{255}
|
||||
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 2^{255}
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2 \mathrm{hi}(19, z_{9}) 19
|
||||
\+ 2 \mathrm{lo}(19, z_{9}/2^{52}) 19
|
||||
\\\\
|
||||
&=
|
||||
\mathrm{lo}(19, z_{9}) 2^{204}
|
||||
\+ 2
|
||||
\mathrm{lo}(19, \mathrm{hi}(19, z_{9}) + \mathrm{lo}(19, z_{9}/2^{52})).
|
||||
\\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
|
||||
It would be possible to cut the number of multiplications from 3 to 2
|
||||
by carrying the high part of each \\(z_i\\) onto \\(z_{i+1}\\). This
|
||||
would eliminate 5 multiplications, clearing 2.5 cycles of port
|
||||
pressure, at the cost of 5 additions, adding 1.66 cycles of port
|
||||
pressure. But doing this would create a dependency between terms
|
||||
(e.g., \\(z_{5}\\) must be computed before the reduction of
|
||||
\\(z_{6}\\) can begin), whereas with the approach above, all
|
||||
contributions to all terms are computed independently, to maximize ILP
|
||||
and flexibility for the processor to schedule instructions.
|
||||
|
||||
This strategy performs 16 IFMA operations, adding two IFMA operations
|
||||
to each of the \\(2\\)-coefficient terms and one to each of the
|
||||
\\(1\\)-coefficient terms. Considering the multiplication and
|
||||
reduction together, we use 66 IFMA operations, requiring 33 cycles'
|
||||
throughput, while the longest chain of IFMA operations is in the
|
||||
reduction of \\(z_5\\) onto \\(z_1\\), of length 7 (so 28 cycles, plus
|
||||
2 cycles to combine the two parts of \\(z_5\\)), and the bottleneck is
|
||||
again throughput.
|
||||
|
||||
Once this is done, we have computed the product terms
|
||||
\\[
|
||||
z = z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204},
|
||||
\\]
|
||||
without reducing the \\(z_i\\) to fit in \\(52\\) bits. Because the
|
||||
overall flow of operations alternates multiplications and additions or
|
||||
subtractions, we would have to perform a reduction after an addition
|
||||
but before the next multiplication anyways, so there's no benefit to
|
||||
fully reducing the limbs at the end of a multiplication. Instead, we
|
||||
leave them unreduced, and track the reduction state using the type
|
||||
system to ensure that unreduced limbs are not accidentally used as an
|
||||
input to a multiplication.
|
||||
|
||||
# Squaring
|
||||
|
||||
Squaring operates similarly to multiplication, but with the
|
||||
possibility to combine identical terms.
|
||||
As before, we write the input as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
x &= x_0 + x_1 2^{51} + x_2 2^{102} + x_3 2^{153} + x_4 2^{204}
|
||||
\end{aligned}
|
||||
\\]
|
||||
with limbs in range \\([0,2^{52})\\).
|
||||
Writing the product terms as
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z &= z_0 + z_1 2^{51} + z_2 2^{102} + z_3 2^{153} + z_4 2^{204} \\\\
|
||||
&+ z_5 2^{255} + z_6 2^{306} + z_7 2^{357} + z_8 2^{408} + z_9 2^{459},
|
||||
\end{aligned}
|
||||
\\]
|
||||
a schoolbook squaring in product scanning form takes the form
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= x_0 x_0 \\\\
|
||||
z_1 &= 2 x_1 x_0 \\\\
|
||||
z_2 &= 2 x_2 x_0 + x_1 x_1 \\\\
|
||||
z_3 &= 2 x_3 x_0 + 2 x_2 x_1 \\\\
|
||||
z_4 &= 2 x_4 x_0 + 2 x_3 x_1 + x_2 x_2 \\\\
|
||||
z_5 &= 2 x_4 x_1 + 2 x_3 x_2 \\\\
|
||||
z_6 &= 2 x_4 x_2 + x_3 x_3 \\\\
|
||||
z_7 &= 2 x_4 x_3 \\\\
|
||||
z_8 &= x_4 x_4 \\\\
|
||||
z_9 &= 0 \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
As before, we write \\(x_i x_j\\) as
|
||||
\\[
|
||||
x_i x_j = \mathrm{lo}(x_i,x_j) + 2\mathrm{hi}(x_i,x_j)2^{51},
|
||||
\\]
|
||||
and substitute to obtain
|
||||
\\[
|
||||
\begin{aligned}
|
||||
z_0 &= \mathrm{lo}(x_0, x_0) + 0 \\\\
|
||||
z_1 &= 2 \mathrm{lo}(x_1, x_0) + 2 \mathrm{hi}(x_0, x_0) \\\\
|
||||
z_2 &= 2 \mathrm{lo}(x_2, x_0) + \mathrm{lo}(x_1, x_1) + 4 \mathrm{hi}(x_1, x_0) \\\\
|
||||
z_3 &= 2 \mathrm{lo}(x_3, x_0) + 2 \mathrm{lo}(x_2, x_1) + 4 \mathrm{hi}(x_2, x_0) + 2 \mathrm{hi}(x_1, x_1) \\\\
|
||||
z_4 &= 2 \mathrm{lo}(x_4, x_0) + 2 \mathrm{lo}(x_3, x_1) + \mathrm{lo}(x_2, x_2) + 4 \mathrm{hi}(x_3, x_0) + 4 \mathrm{hi}(x_2, x_1) \\\\
|
||||
z_5 &= 2 \mathrm{lo}(x_4, x_1) + 2 \mathrm{lo}(x_3, x_2) + 4 \mathrm{hi}(x_4, x_0) + 4 \mathrm{hi}(x_3, x_1) + 2 \mathrm{hi}(x_2, x_2) \\\\
|
||||
z_6 &= 2 \mathrm{lo}(x_4, x_2) + \mathrm{lo}(x_3, x_3) + 4 \mathrm{hi}(x_4, x_1) + 4 \mathrm{hi}(x_3, x_2) \\\\
|
||||
z_7 &= 2 \mathrm{lo}(x_4, x_3) + 4 \mathrm{hi}(x_4, x_2) + 2 \mathrm{hi}(x_3, x_3) \\\\
|
||||
z_8 &= \mathrm{lo}(x_4, x_4) + 4 \mathrm{hi}(x_4, x_3) \\\\
|
||||
z_9 &= 0 + 2 \mathrm{hi}(x_4, x_4) \\\\
|
||||
\end{aligned}
|
||||
\\]
|
||||
To implement these, we group terms by their coefficient, computing
|
||||
those with coefficient \\(2\\) on one set of IFMA chains, and on another
|
||||
set of chains, we begin with coefficient-\\(4\\) terms, then shift
|
||||
left before continuing with the coefficient-\\(1\\) terms.
|
||||
The reduction strategy is the same as for multiplication.
|
||||
|
||||
# Future improvements
|
||||
|
||||
LLVM won't use blend operations on [256-bit vectors yet][llvm_blend],
|
||||
so there's a bunch of blend instructions that could be omitted.
|
||||
|
||||
Although the multiplications and squarings are much faster, there's no
|
||||
speedup to the additions and subtractions, so there are diminishing
|
||||
returns. In fact, the complications in the doubling formulas mean
|
||||
that doubling is actually slower than readdition. This also suggests
|
||||
that moving to 512-bit vectors won't be much help for a strategy aimed
|
||||
at parallelism within a group operation, so to extract performance
|
||||
gains from 512-bit vectors it will probably be necessary to create a
|
||||
parallel-friendly multiscalar multiplication algorithm. This could
|
||||
also help with reducing shuffle pressure.
|
||||
|
||||
The squaring implementation could probably be optimized, but without
|
||||
`perf` support on Cannonlake it's difficult to make actual
|
||||
measurements.
|
||||
|
||||
Another improvement would be to implement vectorized square root
|
||||
computations, which would allow creating an iterator adaptor for point
|
||||
decompression that bunched decompression operations and executed them
|
||||
in parallel. This would accelerate batch verification.
|
||||
|
||||
[2016_gueron_krasnov]: https://ieeexplore.ieee.org/document/7563269
|
||||
[2018_drucker_gueron]: https://eprint.iacr.org/2018/335
|
||||
[1999_walter]: https://pdfs.semanticscholar.org/0e6a/3e8f30b63b556679f5dff2cbfdfe9523f4fa.pdf
|
||||
[ed25519_paper]: https://ed25519.cr.yp.to/ed25519-20110926.pdf
|
||||
[llvm_blend]: https://bugs.llvm.org/show_bug.cgi?id=38343
|
|
@ -1,333 +0,0 @@
|
|||
Vectorized implementations of field and point operations, using a
|
||||
modification of the 4-way parallel formulas of Hisil, Wong, Carter,
|
||||
and Dawson.
|
||||
|
||||
These notes explain the parallel formulas and our strategy for using
|
||||
them with SIMD operations. There are two backend implementations: one
|
||||
using AVX2, and the other using AVX512-IFMA.
|
||||
|
||||
# Overview
|
||||
|
||||
The 2008 paper [_Twisted Edwards Curves Revisited_][hwcd08] by Hisil,
|
||||
Wong, Carter, and Dawson (HWCD) introduced the “extended coordinates”
|
||||
and mixed-model representations which are used by most Edwards curve
|
||||
implementations.
|
||||
|
||||
However, they also describe 4-way parallel formulas for point addition
|
||||
and doubling: a unified addition algorithm taking an effective
|
||||
\\(2\mathbf M + 1\mathbf D\\), a doubling algorithm taking an
|
||||
effective \\(1\mathbf M + 1\mathbf S\\), and a dedicated (i.e., for
|
||||
distinct points) addition algorithm taking an effective \\(2 \mathbf M
|
||||
\\). They compare these formulas with a 2-way parallel variant of the
|
||||
Montgomery ladder.
|
||||
|
||||
Unlike their serial formulas, which are used widely, their parallel
|
||||
formulas do not seem to have been implemented in software before. The
|
||||
2-way parallel Montgomery ladder was used in 2015 by Tung Chou's
|
||||
`sandy2x` implementation. Curiously, however, although the [`sandy2x`
|
||||
paper][sandy2x] also implements Edwards arithmetic, and cites HWCD08,
|
||||
it doesn't mention their parallel Edwards formulas.
|
||||
A 2015 paper by Hernández and López describes an AVX2 implementation
|
||||
of X25519. Neither the paper nor the code is publicly available, but
|
||||
it apparently gives only a [slight speedup][avx2trac], suggesting that
|
||||
it uses a 4-way parallel Montgomery ladder rather than parallel
|
||||
Edwards formulas.
|
||||
|
||||
The reason may be that HWCD08 describe their formulas as operating on
|
||||
four independent processors, which would make a software
|
||||
implementation impractical: all of the operations are too low-latency
|
||||
to effectively synchronize. But a closer inspection reveals that the
|
||||
(more expensive) multiplication and squaring steps are uniform, while
|
||||
the instruction divergence occurs in the (much cheaper) addition and
|
||||
subtraction steps. This means that a SIMD implementation can perform
|
||||
the expensive steps uniformly, and handle divergence in the
|
||||
inexpensive steps using masking.
|
||||
|
||||
These notes describe modifications to the original parallel formulas
|
||||
to allow a SIMD implementation, and this module contains
|
||||
implementations of the modified formulas targeting either AVX2 or
|
||||
AVX512-IFMA.
|
||||
|
||||
# Parallel formulas in HWCD'08
|
||||
|
||||
The doubling formula is presented in the HWCD paper as follows:
|
||||
|
||||
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|
||||
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
|
||||
| | idle | idle | idle | \\( R\_1 \gets X\_1 + Y\_1 \\) |
|
||||
| \\(1\mathbf S\\) | \\( R\_2 \gets X\_1\^2 \\) | \\( R\_3 \gets Y\_1\^2 \\) | \\( R\_4 \gets Z\_1\^2 \\) | \\( R\_5 \gets R\_1\^2 \\) |
|
||||
| | \\( R\_6 \gets R\_2 + R\_3 \\) | \\( R\_7 \gets R\_2 - R\_3 \\) | \\( R\_4 \gets 2 R\_4 \\) | idle |
|
||||
| | idle | \\( R\_1 \gets R\_4 + R\_7 \\) | idle | \\( R\_2 \gets R\_6 - R\_5 \\) |
|
||||
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_6 R\_7 \\) | \\( T\_3 \gets R\_2 R\_6 \\) | \\( Z\_3 \gets R\_1 R\_7 \\) |
|
||||
|
||||
and the unified addition algorithm is presented as follows:
|
||||
|
||||
| Cost | Processor 1 | Processor 2 | Processor 3 | Processor 4 |
|
||||
|------------------|--------------------------------|--------------------------------|--------------------------------|--------------------------------|
|
||||
| | \\( R\_1 \gets Y\_1 - X\_1 \\) | \\( R\_2 \gets Y\_2 - X\_2 \\) | \\( R\_3 \gets Y\_1 + X\_1 \\) | \\( R\_4 \gets Y\_2 + X\_2 \\) |
|
||||
| \\(1\mathbf M\\) | \\( R\_5 \gets R\_1 R\_2 \\) | \\( R\_6 \gets R\_3 R\_4 \\) | \\( R\_7 \gets T\_1 T\_2 \\) | \\( R\_8 \gets Z\_1 Z\_2 \\) |
|
||||
| \\(1\mathbf D\\) | idle | idle | \\( R\_7 \gets k R\_7 \\) | \\( R\_8 \gets 2 R\_8 \\) |
|
||||
| | \\( R\_1 \gets R\_6 - R\_5 \\) | \\( R\_2 \gets R\_8 - R\_7 \\) | \\( R\_3 \gets R\_8 + R\_7 \\) | \\( R\_4 \gets R\_6 + R\_5 \\) |
|
||||
| \\(1\mathbf M\\) | \\( X\_3 \gets R\_1 R\_2 \\) | \\( Y\_3 \gets R\_3 R\_4 \\) | \\( T\_3 \gets R\_1 R\_4 \\) | \\( Z\_3 \gets R\_2 R\_3 \\) |
|
||||
|
||||
Here \\(\mathbf M\\) and \\(\mathbf S\\) represent the cost of
|
||||
multiplication and squaring of generic field elements, \\(\mathbf D\\)
|
||||
represents the cost of multiplication by a curve constant (in this
|
||||
case \\( k = 2d \\)).
|
||||
|
||||
Notice that the \\(1\mathbf M\\) and \\(1\mathbf S\\) steps are
|
||||
uniform. The non-uniform steps are all inexpensive additions or
|
||||
subtractions, with the exception of the multiplication by the curve
|
||||
constant \\(k = 2d\\):
|
||||
$$
|
||||
R\_7 \gets 2 d R\_7.
|
||||
$$
|
||||
|
||||
HWCD suggest parallelising this step by breaking \\(k = 2d\\) into four
|
||||
parts as \\(k = k_0 + 2\^n k_1 + 2\^{2n} k_2 + 2\^{3n} k_3 \\) and
|
||||
computing \\(k_i R_7 \\) in parallel. This is quite awkward, but if
|
||||
the curve constant is a ratio \\( d = d\_1/d\_2 \\), then projective
|
||||
coordinates allow us to instead compute
|
||||
$$
|
||||
(R\_5, R\_6, R\_7, R\_8) \gets (d\_2 R\_5, d\_2 R\_6, 2d\_1 R\_7, 2d\_2 R\_8).
|
||||
$$
|
||||
This can be performed as a uniform multiplication by a vector of
|
||||
constants, and if \\(d\_1, d\_2\\) are small, it is relatively
|
||||
inexpensive. (This trick was suggested by Mike Hamburg).
|
||||
In the Curve25519 case, we have
|
||||
$$
|
||||
d = \frac{d\_1}{d\_2} = \frac{-121665}{121666};
|
||||
$$
|
||||
Since \\(2 \cdot 121666 < 2\^{18}\\), all the constants above fit (up
|
||||
to sign) in 32 bits, so this can be done in parallel as four
|
||||
multiplications by small constants \\( (121666, 121666, 2\cdot 121665,
|
||||
2\cdot 121666) \\), followed by a negation to compute \\( - 2\cdot 121665\\).
|
||||
|
||||
# Modified parallel formulas
|
||||
|
||||
Using the modifications sketched above, we can write SIMD-friendly
|
||||
versions of the parallel formulas as follows. To avoid confusion with
|
||||
the original formulas, temporary variables are named \\(S\\) instead
|
||||
of \\(R\\) and are in static single-assignment form.
|
||||
|
||||
## Addition
|
||||
|
||||
To add points
|
||||
\\(P_1 = (X_1 : Y_1 : Z_1 : T_1) \\)
|
||||
and
|
||||
\\(P_2 = (X_2 : Y_2 : Z_2 : T_2 ) \\),
|
||||
we compute
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_0 &&,&& S\_1 &&,&& S\_2 &&,&& S\_3 )
|
||||
&\gets
|
||||
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Y\_2 - X\_2&&,&& Y\_2 + X\_2)
|
||||
\\\\
|
||||
(S\_4 &&,&& S\_5 &&,&& S\_6 &&,&& S\_7 )
|
||||
&\gets
|
||||
(S\_0 \cdot S\_2&&,&& S\_1 \cdot S\_3&&,&& Z\_1 \cdot Z\_2&&,&& T\_1 \cdot T\_2)
|
||||
\\\\
|
||||
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
|
||||
&\gets
|
||||
(d\_2 \cdot S\_4 &&,&& d\_2 \cdot S\_5 &&,&& 2 d\_2 \cdot S\_6 &&,&& 2 d\_1 \cdot S\_7 )
|
||||
\\\\
|
||||
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
|
||||
&\gets
|
||||
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
|
||||
\\\\
|
||||
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
|
||||
&\gets
|
||||
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
|
||||
\end{aligned}
|
||||
$$
|
||||
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = P\_1 + P\_2 \\).
|
||||
This costs \\( 2\mathbf M + 1 \mathbf D\\).
|
||||
|
||||
## Readdition
|
||||
|
||||
If the point \\( P_2 = (X\_2 : Y\_2 : Z\_2 : T\_2) \\) is fixed, we
|
||||
can cache the multiplication of the curve constants by computing
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_2' &&,&& S\_3' &&,&& Z\_2' &&,&& T\_2' )
|
||||
&\gets
|
||||
(d\_2 \cdot (Y\_2 - X\_2)&&,&& d\_2 \cdot (Y\_2 + X\_2)&&,&& 2d\_2 \cdot Z\_2 &&,&& 2d\_1 \cdot T\_2).
|
||||
\end{aligned}
|
||||
$$
|
||||
This costs \\( 1\mathbf D\\); with \\( (S\_2', S\_3', Z\_2', T\_2')\\)
|
||||
in hand, the addition formulas above become
|
||||
$$
|
||||
\begin{aligned}
|
||||
(S\_0 &&,&& S\_1 &&,&& Z\_1 &&,&& T\_1 )
|
||||
&\gets
|
||||
(Y\_1 - X\_1&&,&& Y\_1 + X\_1&&,&& Z\_1 &&,&& T\_1)
|
||||
\\\\
|
||||
(S\_8 &&,&& S\_9 &&,&& S\_{10} &&,&& S\_{11} )
|
||||
&\gets
|
||||
(S\_0 \cdot S\_2' &&,&& S\_1 \cdot S\_3'&&,&& Z\_1 \cdot Z\_2' &&,&& T\_1 \cdot T\_2')
|
||||
\\\\
|
||||
(S\_{12} &&,&& S\_{13} &&,&& S\_{14} &&,&& S\_{15})
|
||||
&\gets
|
||||
(S\_9 - S\_8&&,&& S\_9 + S\_8&&,&& S\_{10} - S\_{11}&&,&& S\_{10} + S\_{11})
|
||||
\\\\
|
||||
(X\_3&&,&& Y\_3&&,&& Z\_3&&,&& T\_3)
|
||||
&\gets
|
||||
(S\_{12} \cdot S\_{14}&&,&& S\_{15} \cdot S\_{13}&&,&& S\_{15} \cdot S\_{14}&&,&& S\_{12} \cdot S\_{13})
|
||||
\end{aligned}
|
||||
$$
|
||||
which costs only \\( 2\mathbf M \\). This precomputation is
|
||||
essentially similar to the precomputation that HWCD suggest for their
|
||||
serial formulas. Because the cost of precomputation and then
|
||||
readdition is the same as addition, it's sufficient to only
|
||||
implement caching and readdition.
|
||||
|
||||
## Doubling
|
||||
|
||||
The non-uniform portions of the (re)addition formulas have a fairly
|
||||
regular structure. Unfortunately, this is not the case for the
|
||||
doubling formulas, which are much less nice.
|
||||
|
||||
To double a point \\( P = (X\_1 : Y\_1 : Z\_1 : T\_1) \\), we compute
|
||||
$$
|
||||
\begin{aligned}
|
||||
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& S\_0)
|
||||
&\gets
|
||||
(X\_1 &&,&& Y\_1 &&,&& Z\_1 &&,&& X\_1 + Y\_1)
|
||||
\\\\
|
||||
(S\_1 &&,&& S\_2 &&,&& S\_3 &&,&& S\_4 )
|
||||
&\gets
|
||||
(X\_1\^2 &&,&& Y\_1\^2&&,&& Z\_1\^2 &&,&& S\_0\^2)
|
||||
\\\\
|
||||
(S\_5 &&,&& S\_6 &&,&& S\_8 &&,&& S\_9 )
|
||||
&\gets
|
||||
(S\_1 + S\_2 &&,&& S\_1 - S\_2 &&,&& S\_1 + 2S\_3 - S\_2 &&,&& S\_1 + S\_2 - S\_4)
|
||||
\\\\
|
||||
(X\_3 &&,&& Y\_3 &&,&& Z\_3 &&,&& T\_3 )
|
||||
&\gets
|
||||
(S\_8 \cdot S\_9 &&,&& S\_5 \cdot S\_6 &&,&& S\_8 \cdot S\_6 &&,&& S\_5 \cdot S\_9)
|
||||
\end{aligned}
|
||||
$$
|
||||
to obtain \\( P\_3 = (X\_3 : Y\_3 : Z\_3 : T\_3) = [2]P \\).
|
||||
|
||||
The intermediate step between the squaring and multiplication requires
|
||||
a long chain of additions. For the IFMA-based implementation, this is not a problem; for the AVX2-based implementation, it is, but with some care and finesse, it's possible to arrange the computation without requiring an intermediate reduction.
|
||||
|
||||
# Implementation
|
||||
|
||||
These formulas aren't specific to a particular representation of field
|
||||
element vectors, whose optimum choice is determined by the details of
|
||||
the instruction set. However, it's not possible to perfectly separate
|
||||
the implementation of the field element vectors from the
|
||||
implementation of the point operations. Instead, the [`avx2`] and
|
||||
[`ifma`] backends provide `ExtendedPoint` and `CachedPoint` types, and
|
||||
the [`scalar_mul`] code uses one of the backend types by a type alias.
|
||||
|
||||
# Comparison to non-vectorized formulas
|
||||
|
||||
In theory, the parallel Edwards formulas seem to allow a \\(4\\)-way
|
||||
speedup from parallelism. However, an actual vectorized
|
||||
implementation has several slowdowns that cut into this speedup.
|
||||
|
||||
First, the parallel formulas can only use the available vector
|
||||
multiplier. For AVX2, this is a \\( 32 \times 32 \rightarrow 64
|
||||
\\)-bit integer multiplier, so the speedup from vectorization must
|
||||
overcome the disadvantage of losing the \\( 64 \times 64 \rightarrow
|
||||
128\\)-bit (serial) integer multiplier. The effect of this slowdown
|
||||
is microarchitecture-dependent, since it requires accounting for the
|
||||
total number of multiplications and additions and their relative
|
||||
costs. IFMA allows using a \\( 52 \times 52 \rightarrow 104 \\)-bit
|
||||
multiplier, but the high and low halves need to be computed
|
||||
separately, and the reduction requires extra work because it's not
|
||||
possible to pre-multiply by \\(19\\).
|
||||
|
||||
Second, the parallel doubling formulas incur both a theoretical and
|
||||
practical slowdown. The parallel formulas described above work on the
|
||||
\\( \mathbb P\^3 \\) “extended” coordinates. The \\( \mathbb P\^2 \\)
|
||||
model introduced earlier by [Bernstein, Birkner, Joye, Lange, and
|
||||
Peters][bbjlp08] allows slightly faster doublings, so HWCD suggest
|
||||
mixing coordinate systems while performing scalar multiplication
|
||||
(attributing the idea to [a 1998 paper][cmo98] by Cohen, Miyaji, and
|
||||
Ono). The \\( T \\) coordinate is not required for doublings, so when
|
||||
doublings are followed by doublings, its computation can be skipped.
|
||||
More details on this approach and the different coordinate systems can
|
||||
be found in the [`curve_models` module documentation][curve_models].
|
||||
|
||||
Unfortunately, this optimization is not compatible with the parallel
|
||||
formulas, which cannot save time by skipping a single variable, so the
|
||||
parallel doubling formulas do slightly more work when counting the
|
||||
total number of field multiplications and squarings.
|
||||
|
||||
In addition, the parallel doubling formulas have a less regular
|
||||
pattern of additions and subtractions than the parallel addition
|
||||
formulas, so the vectorization overhead is proportionately greater.
|
||||
Both the parallel addition and parallel doubling formulas also require
|
||||
some shuffling to rearrange data within the vectors, which places more
|
||||
pressure on the shuffle unit than is desirable.
|
||||
|
||||
This means that the speedup from using a vectorized implementation of
|
||||
parallel Edwards formulas is likely to be greatest in applications
|
||||
that do fewer doublings and more additions (like a large multiscalar
|
||||
multiplication) rather than applications that do fewer additions and
|
||||
more doublings (like a double-base scalar multiplication).
|
||||
|
||||
Third, Amdahl's law says that the speedup is limited to the portion
|
||||
which can be parallelized. Normally, the field multiplications
|
||||
dominate the cost of point operations, but with the IFMA backend, the
|
||||
multiplications are so fast that the non-parallel additions end up as
|
||||
a significant portion of the total time.
|
||||
|
||||
Fourth, current Intel CPUs perform thermal throttling when using wide
|
||||
vector instructions. A detailed description can be found in §15.26 of
|
||||
[the Intel Optimization Manual][intel], but using wide vector
|
||||
instructions prevents the core from operating at higher frequencies.
|
||||
The core can return to the higher-frequency state after 2
|
||||
milliseconds, but this timer is reset every time high-power
|
||||
instructions are used.
|
||||
|
||||
Any speedup from vectorization therefore has to be weighed against a
|
||||
slowdown for the next few million instructions. For a mixed workload,
|
||||
where point operations are interspersed with other tasks, this can
|
||||
reduce overall performance. This implementation is therefore probably
|
||||
not suitable for basic applications, like signatures, but is
|
||||
worthwhile for complex applications, like zero-knowledge proofs, which
|
||||
do sustained work.
|
||||
|
||||
# Future work
|
||||
|
||||
There are several directions for future improvement:
|
||||
|
||||
* Using the vectorized field arithmetic code to parallelize across
|
||||
point operations rather than within a single point operation. This
|
||||
is less flexible, but would give a speedup both from allowing use of
|
||||
the faster mixed-model arithmetic and from reducing shuffle
|
||||
pressure. One approach in this direction would be to implement
|
||||
batched scalar-point operations using vectors of points (AoSoA
|
||||
layout). This is less generally useful but would give a speedup for
|
||||
Bulletproofs.
|
||||
|
||||
* Extending the IFMA implementation to use the full width of AVX512,
|
||||
either handling the extra parallelism internally to a single point
|
||||
operation (by using a 2-way parallel implementation of field
|
||||
arithmetic instead of a wordsliced one), or externally,
|
||||
parallelizing across point operations. Internal parallelism would
|
||||
be preferable but might require too much shuffle pressure. For now,
|
||||
the only available CPU which runs IFMA operations executes them at
|
||||
256 bits wide anyway, so this isn't yet important.
|
||||
|
||||
* Generalizing the implementation to NEON instructions. The current
|
||||
point arithmetic code is written in terms of field element vectors,
|
||||
which are in turn implemented using platform SIMD vectors. It
|
||||
should be possible to write an alternate implementation of the
|
||||
`FieldElement2625x4` using NEON without changing the point
|
||||
arithmetic. NEON has 128-bit vectors rather than 256-bit vectors,
|
||||
but this may still be worthwhile compared to a serial
|
||||
implementation.
|
||||
|
||||
|
||||
[sandy2x]: https://eprint.iacr.org/2015/943.pdf
|
||||
[avx2trac]: https://trac.torproject.org/projects/tor/ticket/8897#comment:28
|
||||
[hwcd08]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
|
||||
[curve_models]: https://doc-internal.dalek.rs/curve25519_dalek/backend/serial/curve_models/index.html
|
||||
[bbjlp08]: https://eprint.iacr.org/2008/013
|
||||
[cmo98]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
|
||||
[intel]: https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf
|
|
@ -1,65 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2021 isis lovecruft
|
||||
// Copyright (c) 2016-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - isis agora lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Pluggable implementations for different architectures.
|
||||
//!
|
||||
//! The backend code is split into two parts: a serial backend,
|
||||
//! and a vector backend.
|
||||
//!
|
||||
//! The [`serial`] backend contains 32- and 64-bit implementations of
|
||||
//! field arithmetic and scalar arithmetic, as well as implementations
|
||||
//! of point operations using the mixed-model strategy (passing
|
||||
//! between different curve models depending on the operation).
|
||||
//!
|
||||
//! The [`vector`] backend contains implementations of vectorized
|
||||
//! field arithmetic, used to implement point operations using a novel
|
||||
//! implementation strategy derived from parallel formulas of Hisil,
|
||||
//! Wong, Carter, and Dawson.
|
||||
//!
|
||||
//! Because the two strategies give rise to different curve models,
|
||||
//! it's not possible to reuse exactly the same scalar multiplication
|
||||
//! code (or to write it generically), so both serial and vector
|
||||
//! backends contain matching implementations of scalar multiplication
|
||||
//! algorithms. These are intended to be selected by a `#[cfg]`-based
|
||||
//! type alias.
|
||||
//!
|
||||
//! The [`vector`] backend is selected by the `simd_backend` cargo
|
||||
//! feature; it uses the [`serial`] backend for non-vectorized operations.
|
||||
|
||||
// Abort compilation with an actionable message when none of the backend
// cargo features listed below is enabled.
#[cfg(not(any(
    feature = "u32_backend",
    feature = "u64_backend",
    feature = "fiat_u32_backend",
    feature = "fiat_u64_backend",
    feature = "simd_backend",
)))]
compile_error!(
    "no curve25519-dalek backend cargo feature enabled! \
     please enable one of: u32_backend, u64_backend, fiat_u32_backend, fiat_u64_backend, simd_backend"
);

// The serial backend is declared unconditionally.
pub mod serial;

// The vector backend is declared only when `simd_backend` is enabled and the
// target has the `avx2` or `avx512ifma` feature, or when the `rustdoc` cfg is
// set together with the `nightly` feature.
#[cfg(any(
    all(
        feature = "simd_backend",
        any(target_feature = "avx2", target_feature = "avx512ifma")
    ),
    all(feature = "nightly", rustdoc)
))]
// With `nightly`, attach a `doc(cfg(...))` annotation describing the
// configuration under which the module is available.
#[cfg_attr(
    feature = "nightly",
    doc(cfg(any(all(
        feature = "simd_backend",
        any(target_feature = "avx2", target_feature = "avx512ifma")
    ))))
)]
pub mod vector;
|
|
@ -1,551 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2021 isis lovecruft
|
||||
// Copyright (c) 2016-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - isis agora lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Internal curve representations which are not part of the public API.
|
||||
//!
|
||||
//! # Curve representations
|
||||
//!
|
||||
//! Internally, we use several different models for the curve. Here
|
||||
//! is a sketch of the relationship between the models, following [a
|
||||
//! post][smith-moderncrypto]
|
||||
//! by Ben Smith on the `moderncrypto` mailing list. This is also briefly
|
||||
//! discussed in section 2.5 of [_Montgomery curves and their
|
||||
//! arithmetic_][costello-smith-2017] by Costello and Smith.
|
||||
//!
|
||||
//! Begin with the affine equation for the curve,
|
||||
//! $$
|
||||
//! -x\^2 + y\^2 = 1 + dx\^2y\^2.
|
||||
//! $$
|
||||
//! Next, pass to the projective closure \\(\mathbb P\^1 \times \mathbb
|
||||
//! P\^1 \\) by setting \\(x=X/Z\\), \\(y=Y/T.\\) Clearing denominators
|
||||
//! gives the model
|
||||
//! $$
|
||||
//! -X\^2T\^2 + Y\^2Z\^2 = Z\^2T\^2 + dX\^2Y\^2.
|
||||
//! $$
|
||||
//! In `curve25519-dalek`, this is represented as the `CompletedPoint`
|
||||
//! struct.
|
||||
//! To map from \\(\mathbb P\^1 \times \mathbb P\^1 \\), a product of
|
||||
//! two lines, to \\(\mathbb P\^3\\), we use the [Segre
|
||||
//! embedding](https://en.wikipedia.org/wiki/Segre_embedding)
|
||||
//! $$
|
||||
//! \sigma : ((X:Z),(Y:T)) \mapsto (XY:XT:ZY:ZT).
|
||||
//! $$
|
||||
//! Using coordinates \\( (W_0:W_1:W_2:W_3) \\) for \\(\mathbb P\^3\\),
|
||||
//! the image \\(\sigma (\mathbb P\^1 \times \mathbb P\^1) \\) is the
|
||||
//! surface defined by \\( W_0 W_3 = W_1 W_2 \\), and under \\(
|
||||
//! \sigma\\), the equation above becomes
|
||||
//! $$
|
||||
//! -W\_1\^2 + W\_2\^2 = W\_3\^2 + dW\_0\^2,
|
||||
//! $$
|
||||
//! so that the curve is given by the pair of equations
|
||||
//! $$
|
||||
//! \begin{aligned}
|
||||
//! -W\_1\^2 + W\_2\^2 &= W\_3\^2 + dW\_0\^2, \\\\ W_0 W_3 &= W_1 W_2.
|
||||
//! \end{aligned}
|
||||
//! $$
|
||||
//! Up to variable naming, this is exactly the "extended" curve model
|
||||
//! introduced in [_Twisted Edwards Curves
|
||||
//! Revisited_][hisil-wong-carter-dawson-2008] by Hisil, Wong, Carter,
|
||||
//! and Dawson. In `curve25519-dalek`, it is represented as the
|
||||
//! `EdwardsPoint` struct. We can map from \\(\mathbb P\^3 \\) to
|
||||
//! \\(\mathbb P\^2 \\) by sending \\( (W\_0:W\_1:W\_2:W\_3) \\) to \\(
|
||||
//! (W\_1:W\_2:W\_3) \\). Notice that
|
||||
//! $$
|
||||
//! \frac {W\_1} {W\_3} = \frac {XT} {ZT} = \frac X Z = x,
|
||||
//! $$
|
||||
//! and
|
||||
//! $$
|
||||
//! \frac {W\_2} {W\_3} = \frac {YZ} {ZT} = \frac Y T = y,
|
||||
//! $$
|
||||
//! so this is the same as if we had started with the affine model
|
||||
//! and passed to \\( \mathbb P\^2 \\) by setting \\( x = W\_1 / W\_3
|
||||
//! \\), \\(y = W\_2 / W\_3 \\).
|
||||
//! Up to variable naming, this is the projective representation
|
||||
//! introduced in [_Twisted Edwards
|
||||
//! Curves_][bernstein-birkner-joye-lange-peters-2008] by Bernstein,
|
||||
//! Birkner, Joye, Lange, and Peters. In `curve25519-dalek`, it is
|
||||
//! represented by the `ProjectivePoint` struct.
|
||||
//!
|
||||
//! # Passing between curve models
|
||||
//!
|
||||
//! Although the \\( \mathbb P\^3 \\) model provides faster addition
|
||||
//! formulas, the \\( \mathbb P\^2 \\) model provides faster doubling
|
||||
//! formulas. Hisil, Wong, Carter, and Dawson therefore suggest mixing
|
||||
//! coordinate systems for scalar multiplication, attributing the idea
|
||||
//! to [a 1998 paper][cohen-miyaji-ono-1998] of Cohen, Miyaji, and Ono.
|
||||
//!
|
||||
//! Their suggestion is to vary the formulas used by context, using a
|
||||
//! \\( \mathbb P\^2 \rightarrow \mathbb P\^2 \\) doubling formula when
|
||||
//! a doubling is followed
|
||||
//! by another doubling, a \\( \mathbb P\^2 \rightarrow \mathbb P\^3 \\)
|
||||
//! doubling formula when a doubling is followed by an addition, and
|
||||
//! computing point additions using a \\( \mathbb P\^3 \times \mathbb P\^3
|
||||
//! \rightarrow \mathbb P\^2 \\) formula.
|
||||
//!
|
||||
//! The `ref10` reference implementation of [Ed25519][ed25519], by
|
||||
//! Bernstein, Duif, Lange, Schwabe, and Yang, tweaks
|
||||
//! this strategy, factoring the addition formulas through the
|
||||
//! completion \\( \mathbb P\^1 \times \mathbb P\^1 \\), so that the
|
||||
//! output of an addition or doubling always lies in \\( \mathbb P\^1 \times
|
||||
//! \mathbb P\^1\\), and the choice of which formula to use is replaced
|
||||
//! by a choice of whether to convert the result to \\( \mathbb P\^2 \\)
|
||||
//! or \\(\mathbb P\^3 \\). However, this tweak is not described in
|
||||
//! their paper, only in their software.
|
||||
//!
|
||||
//! Our naming for the `CompletedPoint` (\\(\mathbb P\^1 \times \mathbb
|
||||
//! P\^1 \\)), `ProjectivePoint` (\\(\mathbb P\^2 \\)), and
|
||||
//! `EdwardsPoint` (\\(\mathbb P\^3 \\)) structs follows the naming in
|
||||
//! Adam Langley's [Golang ed25519][agl-ed25519] implementation, which
|
||||
//! `curve25519-dalek` was originally derived from.
|
||||
//!
|
||||
//! Finally, to accelerate readditions, we use two cached point formats
|
||||
//! in "Niels coordinates", named for Niels Duif,
|
||||
//! one for the affine model and one for the \\( \mathbb P\^3 \\) model:
|
||||
//!
|
||||
//! * `AffineNielsPoint`: \\( (y+x, y-x, 2dxy) \\)
|
||||
//! * `ProjectiveNielsPoint`: \\( (Y+X, Y-X, Z, 2dXY) \\)
|
||||
//!
|
||||
//! [smith-moderncrypto]: https://moderncrypto.org/mail-archive/curves/2016/000807.html
|
||||
//! [costello-smith-2017]: https://eprint.iacr.org/2017/212
|
||||
//! [hisil-wong-carter-dawson-2008]: https://www.iacr.org/archive/asiacrypt2008/53500329/53500329.pdf
|
||||
//! [bernstein-birkner-joye-lange-peters-2008]: https://eprint.iacr.org/2008/013
|
||||
//! [cohen-miyaji-ono-1998]: https://link.springer.com/content/pdf/10.1007%2F3-540-49649-1_6.pdf
|
||||
//! [ed25519]: https://eprint.iacr.org/2011/368
|
||||
//! [agl-ed25519]: https://github.com/agl/ed25519
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::{Add, Neg, Sub};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use constants;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use field::FieldElement;
|
||||
use traits::ValidityCheck;
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Internal point representations
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
/// A `ProjectivePoint` is a point \\((X:Y:Z)\\) on the \\(\mathbb
|
||||
/// P\^2\\) model of the curve.
|
||||
/// A point \\((x,y)\\) in the affine model corresponds to
|
||||
/// \\((x:y:1)\\).
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ProjectivePoint {
|
||||
pub X: FieldElement,
|
||||
pub Y: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
}
|
||||
|
||||
/// A `CompletedPoint` is a point \\(((X:Z), (Y:T))\\) on the \\(\mathbb
|
||||
/// P\^1 \times \mathbb P\^1 \\) model of the curve.
|
||||
/// A point (x,y) in the affine model corresponds to \\( ((x:1),(y:1))
|
||||
/// \\).
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
#[allow(missing_docs)]
|
||||
pub struct CompletedPoint {
|
||||
pub X: FieldElement,
|
||||
pub Y: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
pub T: FieldElement,
|
||||
}
|
||||
|
||||
/// A pre-computed point in the affine model for the curve, represented as
|
||||
/// \\((y+x, y-x, 2dxy)\\) in "Niels coordinates".
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
// Safe to derive Eq because affine coordinates.
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
#[allow(missing_docs)]
|
||||
pub struct AffineNielsPoint {
|
||||
pub y_plus_x: FieldElement,
|
||||
pub y_minus_x: FieldElement,
|
||||
pub xy2d: FieldElement,
|
||||
}
|
||||
|
||||
impl Zeroize for AffineNielsPoint {
|
||||
fn zeroize(&mut self) {
|
||||
self.y_plus_x.zeroize();
|
||||
self.y_minus_x.zeroize();
|
||||
self.xy2d.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
/// A pre-computed point on the \\( \mathbb P\^3 \\) model for the
|
||||
/// curve, represented as \\((Y+X, Y-X, Z, 2dXY)\\) in "Niels coordinates".
|
||||
///
|
||||
/// More details on the relationships between the different curve models
|
||||
/// can be found in the module-level documentation.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ProjectiveNielsPoint {
|
||||
pub Y_plus_X: FieldElement,
|
||||
pub Y_minus_X: FieldElement,
|
||||
pub Z: FieldElement,
|
||||
pub T2d: FieldElement,
|
||||
}
|
||||
|
||||
impl Zeroize for ProjectiveNielsPoint {
|
||||
fn zeroize(&mut self) {
|
||||
self.Y_plus_X.zeroize();
|
||||
self.Y_minus_X.zeroize();
|
||||
self.Z.zeroize();
|
||||
self.T2d.zeroize();
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Constructors
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
use traits::Identity;
|
||||
|
||||
impl Identity for ProjectivePoint {
|
||||
fn identity() -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
X: FieldElement::zero(),
|
||||
Y: FieldElement::one(),
|
||||
Z: FieldElement::one(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for ProjectiveNielsPoint {
|
||||
fn identity() -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint{
|
||||
Y_plus_X: FieldElement::one(),
|
||||
Y_minus_X: FieldElement::one(),
|
||||
Z: FieldElement::one(),
|
||||
T2d: FieldElement::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ProjectiveNielsPoint {
|
||||
fn default() -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
impl Identity for AffineNielsPoint {
|
||||
fn identity() -> AffineNielsPoint {
|
||||
AffineNielsPoint{
|
||||
y_plus_x: FieldElement::one(),
|
||||
y_minus_x: FieldElement::one(),
|
||||
xy2d: FieldElement::zero(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AffineNielsPoint {
|
||||
fn default() -> AffineNielsPoint {
|
||||
AffineNielsPoint::identity()
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Validity checks (for debugging, not CT)
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ValidityCheck for ProjectivePoint {
|
||||
fn is_valid(&self) -> bool {
|
||||
// Curve equation is -x^2 + y^2 = 1 + d*x^2*y^2,
|
||||
// homogenized as (-X^2 + Y^2)*Z^2 = Z^4 + d*X^2*Y^2
|
||||
let XX = self.X.square();
|
||||
let YY = self.Y.square();
|
||||
let ZZ = self.Z.square();
|
||||
let ZZZZ = ZZ.square();
|
||||
let lhs = &(&YY - &XX) * &ZZ;
|
||||
let rhs = &ZZZZ + &(&constants::EDWARDS_D * &(&XX * &YY));
|
||||
|
||||
lhs == rhs
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Constant-time assignment
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ConditionallySelectable for ProjectiveNielsPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
ProjectiveNielsPoint {
|
||||
Y_plus_X: FieldElement::conditional_select(&a.Y_plus_X, &b.Y_plus_X, choice),
|
||||
Y_minus_X: FieldElement::conditional_select(&a.Y_minus_X, &b.Y_minus_X, choice),
|
||||
Z: FieldElement::conditional_select(&a.Z, &b.Z, choice),
|
||||
T2d: FieldElement::conditional_select(&a.T2d, &b.T2d, choice),
|
||||
}
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.Y_plus_X.conditional_assign(&other.Y_plus_X, choice);
|
||||
self.Y_minus_X.conditional_assign(&other.Y_minus_X, choice);
|
||||
self.Z.conditional_assign(&other.Z, choice);
|
||||
self.T2d.conditional_assign(&other.T2d, choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for AffineNielsPoint {
|
||||
fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
|
||||
AffineNielsPoint {
|
||||
y_plus_x: FieldElement::conditional_select(&a.y_plus_x, &b.y_plus_x, choice),
|
||||
y_minus_x: FieldElement::conditional_select(&a.y_minus_x, &b.y_minus_x, choice),
|
||||
xy2d: FieldElement::conditional_select(&a.xy2d, &b.xy2d, choice),
|
||||
}
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &Self, choice: Choice) {
|
||||
self.y_plus_x.conditional_assign(&other.y_plus_x, choice);
|
||||
self.y_minus_x.conditional_assign(&other.y_minus_x, choice);
|
||||
self.xy2d.conditional_assign(&other.xy2d, choice);
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Point conversions
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ProjectivePoint {
|
||||
/// Convert this point from the \\( \mathbb P\^2 \\) model to the
|
||||
/// \\( \mathbb P\^3 \\) model.
|
||||
///
|
||||
/// This costs \\(3 \mathrm M + 1 \mathrm S\\).
|
||||
pub fn to_extended(&self) -> EdwardsPoint {
|
||||
EdwardsPoint {
|
||||
X: &self.X * &self.Z,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: self.Z.square(),
|
||||
T: &self.X * &self.Y,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CompletedPoint {
|
||||
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
|
||||
/// \\) model to the \\( \mathbb P\^2 \\) model.
|
||||
///
|
||||
/// This costs \\(3 \mathrm M \\).
|
||||
pub fn to_projective(&self) -> ProjectivePoint {
|
||||
ProjectivePoint {
|
||||
X: &self.X * &self.T,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: &self.Z * &self.T,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert this point from the \\( \mathbb P\^1 \times \mathbb P\^1
|
||||
/// \\) model to the \\( \mathbb P\^3 \\) model.
|
||||
///
|
||||
/// This costs \\(4 \mathrm M \\).
|
||||
pub fn to_extended(&self) -> EdwardsPoint {
|
||||
EdwardsPoint {
|
||||
X: &self.X * &self.T,
|
||||
Y: &self.Y * &self.Z,
|
||||
Z: &self.Z * &self.T,
|
||||
T: &self.X * &self.Y,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Doubling
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl ProjectivePoint {
|
||||
/// Double this point: return self + self
|
||||
pub fn double(&self) -> CompletedPoint { // Double()
|
||||
let XX = self.X.square();
|
||||
let YY = self.Y.square();
|
||||
let ZZ2 = self.Z.square2();
|
||||
let X_plus_Y = &self.X + &self.Y;
|
||||
let X_plus_Y_sq = X_plus_Y.square();
|
||||
let YY_plus_XX = &YY + &XX;
|
||||
let YY_minus_XX = &YY - &XX;
|
||||
|
||||
CompletedPoint{
|
||||
X: &X_plus_Y_sq - &YY_plus_XX,
|
||||
Y: YY_plus_XX,
|
||||
Z: YY_minus_XX,
|
||||
T: &ZZ2 - &YY_minus_XX
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Addition and Subtraction
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
// XXX(hdevalence) These were doc(hidden) so they don't appear in the
|
||||
// public API docs.
|
||||
// However, that prevents them being used with --document-private-items,
|
||||
// so comment out the doc(hidden) for now until this is resolved
|
||||
//
|
||||
// upstream rust issue: https://github.com/rust-lang/rust/issues/46380
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Add<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn add(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PP = &Y_plus_X * &other.Y_plus_X;
|
||||
let MM = &Y_minus_X * &other.Y_minus_X;
|
||||
let TT2d = &self.T * &other.T2d;
|
||||
let ZZ = &self.Z * &other.Z;
|
||||
let ZZ2 = &ZZ + &ZZ;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PP - &MM,
|
||||
Y: &PP + &MM,
|
||||
Z: &ZZ2 + &TT2d,
|
||||
T: &ZZ2 - &TT2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Sub<&'b ProjectiveNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn sub(self, other: &'b ProjectiveNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PM = &Y_plus_X * &other.Y_minus_X;
|
||||
let MP = &Y_minus_X * &other.Y_plus_X;
|
||||
let TT2d = &self.T * &other.T2d;
|
||||
let ZZ = &self.Z * &other.Z;
|
||||
let ZZ2 = &ZZ + &ZZ;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PM - &MP,
|
||||
Y: &PM + &MP,
|
||||
Z: &ZZ2 - &TT2d,
|
||||
T: &ZZ2 + &TT2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Add<&'b AffineNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn add(self, other: &'b AffineNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PP = &Y_plus_X * &other.y_plus_x;
|
||||
let MM = &Y_minus_X * &other.y_minus_x;
|
||||
let Txy2d = &self.T * &other.xy2d;
|
||||
let Z2 = &self.Z + &self.Z;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PP - &MM,
|
||||
Y: &PP + &MM,
|
||||
Z: &Z2 + &Txy2d,
|
||||
T: &Z2 - &Txy2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//#[doc(hidden)]
|
||||
impl<'a, 'b> Sub<&'b AffineNielsPoint> for &'a EdwardsPoint {
|
||||
type Output = CompletedPoint;
|
||||
|
||||
fn sub(self, other: &'b AffineNielsPoint) -> CompletedPoint {
|
||||
let Y_plus_X = &self.Y + &self.X;
|
||||
let Y_minus_X = &self.Y - &self.X;
|
||||
let PM = &Y_plus_X * &other.y_minus_x;
|
||||
let MP = &Y_minus_X * &other.y_plus_x;
|
||||
let Txy2d = &self.T * &other.xy2d;
|
||||
let Z2 = &self.Z + &self.Z;
|
||||
|
||||
CompletedPoint{
|
||||
X: &PM - &MP,
|
||||
Y: &PM + &MP,
|
||||
Z: &Z2 - &Txy2d,
|
||||
T: &Z2 + &Txy2d
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Negation
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl<'a> Neg for &'a ProjectiveNielsPoint {
|
||||
type Output = ProjectiveNielsPoint;
|
||||
|
||||
fn neg(self) -> ProjectiveNielsPoint {
|
||||
ProjectiveNielsPoint{
|
||||
Y_plus_X: self.Y_minus_X,
|
||||
Y_minus_X: self.Y_plus_X,
|
||||
Z: self.Z,
|
||||
T2d: -(&self.T2d),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a AffineNielsPoint {
|
||||
type Output = AffineNielsPoint;
|
||||
|
||||
fn neg(self) -> AffineNielsPoint {
|
||||
AffineNielsPoint{
|
||||
y_plus_x: self.y_minus_x,
|
||||
y_minus_x: self.y_plus_x,
|
||||
xy2d: -(&self.xy2d)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Debug traits
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
impl Debug for ProjectivePoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "ProjectivePoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?}\n}}",
|
||||
&self.X, &self.Y, &self.Z)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for CompletedPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "CompletedPoint{{\n\tX: {:?},\n\tY: {:?},\n\tZ: {:?},\n\tT: {:?}\n}}",
|
||||
&self.X, &self.Y, &self.Z, &self.T)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for AffineNielsPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "AffineNielsPoint{{\n\ty_plus_x: {:?},\n\ty_minus_x: {:?},\n\txy2d: {:?}\n}}",
|
||||
&self.y_plus_x, &self.y_minus_x, &self.xy2d)
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for ProjectiveNielsPoint {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "ProjectiveNielsPoint{{\n\tY_plus_X: {:?},\n\tY_minus_X: {:?},\n\tZ: {:?},\n\tT2d: {:?}\n}}",
|
||||
&self.Y_plus_X, &self.Y_minus_X, &self.Z, &self.T2d)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,260 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(32\\)-bit
|
||||
//! limbs with \\(64\\)-bit products.
|
||||
//!
|
||||
//! This code was originally derived from Adam Langley's Golang ed25519
|
||||
//! implementation, and was then rewritten to use unsigned limbs instead
|
||||
//! of signed limbs.
|
||||
//!
|
||||
//! This uses the formally-verified field arithmetic generated by the
|
||||
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::Neg;
|
||||
use core::ops::{Add, AddAssign};
|
||||
use core::ops::{Mul, MulAssign};
|
||||
use core::ops::{Sub, SubAssign};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use fiat_crypto::curve25519_32::*;
|
||||
|
||||
/// A `FieldElement2625` represents an element of the field
|
||||
/// \\( \mathbb Z / (2\^{255} - 19)\\).
|
||||
///
|
||||
/// In the 32-bit implementation, a `FieldElement` is represented in
|
||||
/// radix \\(2\^{25.5}\\) as ten `u32`s. This means that a field
|
||||
/// element \\(x\\) is represented as
|
||||
/// $$
|
||||
/// x = \sum\_{i=0}\^9 x\_i 2\^{\lceil i \frac {51} 2 \rceil}
|
||||
/// = x\_0 + x\_1 2\^{26} + x\_2 2\^{51} + x\_3 2\^{77} + \cdots + x\_9 2\^{230};
|
||||
/// $$
|
||||
/// the coefficients are alternately bounded by \\(2\^{25}\\) and
|
||||
/// \\(2\^{26}\\). The limbs are allowed to grow between reductions up
|
||||
/// to \\(2\^{25+b}\\) or \\(2\^{26+b}\\), where \\(b = 1.75\\).
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// The `curve25519_dalek::field` module provides a type alias
|
||||
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
|
||||
/// or `FieldElement2625`.
|
||||
///
|
||||
/// The backend-specific type `FieldElement2625` should not be used
|
||||
/// outside of the `curve25519_dalek::field` module.
|
||||
#[derive(Copy, Clone)]
|
||||
// Tuple struct wrapping the ten `u32` limbs; the limb array itself is only
// reachable from inside this crate (`pub(crate)`).
pub struct FieldElement2625(pub(crate) [u32; 10]);
|
||||
|
||||
impl Debug for FieldElement2625 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "FieldElement2625({:?})", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
impl Zeroize for FieldElement2625 {
    /// Zeroizes the element by zeroizing its underlying limb array.
    fn zeroize(&mut self) {
        let limbs = &mut self.0;
        limbs.zeroize();
    }
}
|
||||
|
||||
impl<'b> AddAssign<&'b FieldElement2625> for FieldElement2625 {
|
||||
fn add_assign(&mut self, _rhs: &'b FieldElement2625) {
|
||||
let input = self.0;
|
||||
fiat_25519_add(&mut self.0, &input, &_rhs.0);
|
||||
let input = self.0;
|
||||
fiat_25519_carry(&mut self.0, &input);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Add<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn add(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
fiat_25519_add(&mut output.0, &self.0, &_rhs.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> SubAssign<&'b FieldElement2625> for FieldElement2625 {
|
||||
fn sub_assign(&mut self, _rhs: &'b FieldElement2625) {
|
||||
let input = self.0;
|
||||
fiat_25519_sub(&mut self.0, &input, &_rhs.0);
|
||||
let input = self.0;
|
||||
fiat_25519_carry(&mut self.0, &input);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn sub(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
fiat_25519_sub(&mut output.0, &self.0, &_rhs.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> MulAssign<&'b FieldElement2625> for FieldElement2625 {
|
||||
fn mul_assign(&mut self, _rhs: &'b FieldElement2625) {
|
||||
let input = self.0;
|
||||
fiat_25519_carry_mul(&mut self.0, &input, &_rhs.0);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b FieldElement2625> for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn mul(self, _rhs: &'b FieldElement2625) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
fiat_25519_carry_mul(&mut output.0, &self.0, &_rhs.0);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a FieldElement2625 {
|
||||
type Output = FieldElement2625;
|
||||
fn neg(self) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
fiat_25519_opp(&mut output.0, &self.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for FieldElement2625 {
|
||||
fn conditional_select(
|
||||
a: &FieldElement2625,
|
||||
b: &FieldElement2625,
|
||||
choice: Choice,
|
||||
) -> FieldElement2625 {
|
||||
let mut output = [0u32; 10];
|
||||
fiat_25519_selectznz(&mut output, choice.unwrap_u8() as fiat_25519_u1, &a.0, &b.0);
|
||||
FieldElement2625(output)
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, other: &FieldElement2625, choice: Choice) {
|
||||
let mut output = [0u32; 10];
|
||||
let choicebit = choice.unwrap_u8() as fiat_25519_u1;
|
||||
fiat_25519_cmovznz_u32(&mut output[0], choicebit, self.0[0], other.0[0]);
|
||||
fiat_25519_cmovznz_u32(&mut output[1], choicebit, self.0[1], other.0[1]);
|
||||
fiat_25519_cmovznz_u32(&mut output[2], choicebit, self.0[2], other.0[2]);
|
||||
fiat_25519_cmovznz_u32(&mut output[3], choicebit, self.0[3], other.0[3]);
|
||||
fiat_25519_cmovznz_u32(&mut output[4], choicebit, self.0[4], other.0[4]);
|
||||
fiat_25519_cmovznz_u32(&mut output[5], choicebit, self.0[5], other.0[5]);
|
||||
fiat_25519_cmovznz_u32(&mut output[6], choicebit, self.0[6], other.0[6]);
|
||||
fiat_25519_cmovznz_u32(&mut output[7], choicebit, self.0[7], other.0[7]);
|
||||
fiat_25519_cmovznz_u32(&mut output[8], choicebit, self.0[8], other.0[8]);
|
||||
fiat_25519_cmovznz_u32(&mut output[9], choicebit, self.0[9], other.0[9]);
|
||||
*self = FieldElement2625(output);
|
||||
}
|
||||
|
||||
fn conditional_swap(a: &mut FieldElement2625, b: &mut FieldElement2625, choice: Choice) {
|
||||
u32::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
|
||||
u32::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
|
||||
u32::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
|
||||
u32::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
|
||||
u32::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
|
||||
u32::conditional_swap(&mut a.0[5], &mut b.0[5], choice);
|
||||
u32::conditional_swap(&mut a.0[6], &mut b.0[6], choice);
|
||||
u32::conditional_swap(&mut a.0[7], &mut b.0[7], choice);
|
||||
u32::conditional_swap(&mut a.0[8], &mut b.0[8], choice);
|
||||
u32::conditional_swap(&mut a.0[9], &mut b.0[9], choice);
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement2625 {
|
||||
/// Invert the sign of this field element
|
||||
pub fn negate(&mut self) {
|
||||
let neg = self.neg();
|
||||
self.0 = neg.0;
|
||||
}
|
||||
|
||||
/// Construct zero.
|
||||
pub fn zero() -> FieldElement2625 {
|
||||
FieldElement2625([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
|
||||
}
|
||||
|
||||
/// Construct one.
|
||||
pub fn one() -> FieldElement2625 {
|
||||
FieldElement2625([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
|
||||
}
|
||||
|
||||
/// Construct -1.
|
||||
pub fn minus_one() -> FieldElement2625 {
|
||||
FieldElement2625([
|
||||
0x3ffffec, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff, 0x3ffffff, 0x1ffffff,
|
||||
0x3ffffff, 0x1ffffff,
|
||||
])
|
||||
}
|
||||
|
||||
/// Given `k > 0`, return `self^(2^k)`.
|
||||
pub fn pow2k(&self, k: u32) -> FieldElement2625 {
|
||||
debug_assert!(k > 0);
|
||||
let mut z = self.square();
|
||||
for _ in 1..k {
|
||||
z = z.square();
|
||||
}
|
||||
z
|
||||
}
|
||||
|
||||
/// Load a `FieldElement2625` from the low 255 bits of a 256-bit
|
||||
/// input.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function does not check that the input used the canonical
|
||||
/// representative. It masks the high bit, but it will happily
|
||||
/// decode 2^255 - 18 to 1. Applications that require a canonical
|
||||
/// encoding of every field element should decode, re-encode to
|
||||
/// the canonical encoding, and check that the input was
|
||||
/// canonical.
|
||||
pub fn from_bytes(data: &[u8; 32]) -> FieldElement2625 {
|
||||
let mut temp = [0u8; 32];
|
||||
temp.copy_from_slice(data);
|
||||
temp[31] &= 127u8;
|
||||
let mut output = [0u32; 10];
|
||||
fiat_25519_from_bytes(&mut output, &temp);
|
||||
FieldElement2625(output)
|
||||
}
|
||||
|
||||
/// Serialize this `FieldElement51` to a 32-byte array. The
|
||||
/// encoding is canonical.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
let mut bytes = [0u8; 32];
|
||||
fiat_25519_to_bytes(&mut bytes, &self.0);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/// Compute `self^2`.
|
||||
pub fn square(&self) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
fiat_25519_carry_square(&mut output.0, &self.0);
|
||||
output
|
||||
}
|
||||
|
||||
/// Compute `2*self^2`.
|
||||
pub fn square2(&self) -> FieldElement2625 {
|
||||
let mut output = *self;
|
||||
let mut temp = *self;
|
||||
// Void vs return type, measure cost of copying self
|
||||
fiat_25519_carry_square(&mut temp.0, &self.0);
|
||||
fiat_25519_add(&mut output.0, &temp.0, &temp.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! The `u32` backend uses `u32`s and a `(u32, u32) -> u64` multiplier.
|
||||
//!
|
||||
//! This code is intended to be portable, but it requires that
|
||||
//! multiplication of two \\(32\\)-bit values to a \\(64\\)-bit result
|
||||
//! is constant-time on the target platform.
|
||||
//!
|
||||
//! This uses the formally-verified field arithmetic generated by the
|
||||
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
|
||||
|
||||
#[path = "../u32/scalar.rs"]
|
||||
pub mod scalar;
|
||||
|
||||
pub mod field;
|
||||
|
||||
#[path = "../u32/constants.rs"]
|
||||
pub mod constants;
|
|
@ -1,249 +0,0 @@
|
|||
// -*- mode: rust; coding: utf-8; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Field arithmetic modulo \\(p = 2\^{255} - 19\\), using \\(64\\)-bit
|
||||
//! limbs with \\(128\\)-bit products.
|
||||
//!
|
||||
//! This uses the formally-verified field arithmetic generated by the
|
||||
//! [fiat-crypto project](https://github.com/mit-plv/fiat-crypto)
|
||||
|
||||
use core::fmt::Debug;
|
||||
use core::ops::Neg;
|
||||
use core::ops::{Add, AddAssign};
|
||||
use core::ops::{Mul, MulAssign};
|
||||
use core::ops::{Sub, SubAssign};
|
||||
|
||||
use subtle::Choice;
|
||||
use subtle::ConditionallySelectable;
|
||||
|
||||
use zeroize::Zeroize;
|
||||
|
||||
use fiat_crypto::curve25519_64::*;
|
||||
|
||||
/// A `FieldElement51` represents an element of the field
|
||||
/// \\( \mathbb Z / (2\^{255} - 19)\\).
|
||||
///
|
||||
/// In the 64-bit implementation, a `FieldElement` is represented in
|
||||
/// radix \\(2\^{51}\\) as five `u64`s; the coefficients are allowed to
|
||||
/// grow up to \\(2\^{54}\\) between reductions modulo \\(p\\).
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// The `curve25519_dalek::field` module provides a type alias
|
||||
/// `curve25519_dalek::field::FieldElement` to either `FieldElement51`
|
||||
/// or `FieldElement2625`.
|
||||
///
|
||||
/// The backend-specific type `FieldElement51` should not be used
|
||||
/// outside of the `curve25519_dalek::field` module.
|
||||
#[derive(Copy, Clone)]
|
||||
// Tuple struct wrapping the five `u64` limbs; the limb array itself is only
// reachable from inside this crate (`pub(crate)`).
pub struct FieldElement51(pub(crate) [u64; 5]);
|
||||
|
||||
impl Debug for FieldElement51 {
|
||||
fn fmt(&self, f: &mut ::core::fmt::Formatter) -> ::core::fmt::Result {
|
||||
write!(f, "FieldElement51({:?})", &self.0[..])
|
||||
}
|
||||
}
|
||||
|
||||
impl Zeroize for FieldElement51 {
    /// Zeroizes the element by zeroizing its underlying limb array.
    fn zeroize(&mut self) {
        let limbs = &mut self.0;
        limbs.zeroize();
    }
}
|
||||
|
||||
impl<'b> AddAssign<&'b FieldElement51> for FieldElement51 {
|
||||
fn add_assign(&mut self, _rhs: &'b FieldElement51) {
|
||||
let input = self.0;
|
||||
fiat_25519_add(&mut self.0, &input, &_rhs.0);
|
||||
let input = self.0;
|
||||
fiat_25519_carry(&mut self.0, &input);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Add<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn add(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
fiat_25519_add(&mut output.0, &self.0, &_rhs.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> SubAssign<&'b FieldElement51> for FieldElement51 {
|
||||
fn sub_assign(&mut self, _rhs: &'b FieldElement51) {
|
||||
let input = self.0;
|
||||
fiat_25519_sub(&mut self.0, &input, &_rhs.0);
|
||||
let input = self.0;
|
||||
fiat_25519_carry(&mut self.0, &input);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Sub<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn sub(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
fiat_25519_sub(&mut output.0, &self.0, &_rhs.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> MulAssign<&'b FieldElement51> for FieldElement51 {
|
||||
fn mul_assign(&mut self, _rhs: &'b FieldElement51) {
|
||||
let input = self.0;
|
||||
fiat_25519_carry_mul(&mut self.0, &input, &_rhs.0);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'b> Mul<&'b FieldElement51> for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn mul(self, _rhs: &'b FieldElement51) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
fiat_25519_carry_mul(&mut output.0, &self.0, &_rhs.0);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Neg for &'a FieldElement51 {
|
||||
type Output = FieldElement51;
|
||||
fn neg(self) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
fiat_25519_opp(&mut output.0, &self.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
||||
|
||||
impl ConditionallySelectable for FieldElement51 {
|
||||
fn conditional_select(
|
||||
a: &FieldElement51,
|
||||
b: &FieldElement51,
|
||||
choice: Choice,
|
||||
) -> FieldElement51 {
|
||||
let mut output = [0u64; 5];
|
||||
fiat_25519_selectznz(&mut output, choice.unwrap_u8() as fiat_25519_u1, &a.0, &b.0);
|
||||
FieldElement51(output)
|
||||
}
|
||||
|
||||
fn conditional_swap(a: &mut FieldElement51, b: &mut FieldElement51, choice: Choice) {
|
||||
u64::conditional_swap(&mut a.0[0], &mut b.0[0], choice);
|
||||
u64::conditional_swap(&mut a.0[1], &mut b.0[1], choice);
|
||||
u64::conditional_swap(&mut a.0[2], &mut b.0[2], choice);
|
||||
u64::conditional_swap(&mut a.0[3], &mut b.0[3], choice);
|
||||
u64::conditional_swap(&mut a.0[4], &mut b.0[4], choice);
|
||||
}
|
||||
|
||||
fn conditional_assign(&mut self, _rhs: &FieldElement51, choice: Choice) {
|
||||
let mut output = [0u64; 5];
|
||||
let choicebit = choice.unwrap_u8() as fiat_25519_u1;
|
||||
fiat_25519_cmovznz_u64(&mut output[0], choicebit, self.0[0], _rhs.0[0]);
|
||||
fiat_25519_cmovznz_u64(&mut output[1], choicebit, self.0[1], _rhs.0[1]);
|
||||
fiat_25519_cmovznz_u64(&mut output[2], choicebit, self.0[2], _rhs.0[2]);
|
||||
fiat_25519_cmovznz_u64(&mut output[3], choicebit, self.0[3], _rhs.0[3]);
|
||||
fiat_25519_cmovznz_u64(&mut output[4], choicebit, self.0[4], _rhs.0[4]);
|
||||
*self = FieldElement51(output);
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldElement51 {
|
||||
/// Construct zero.
|
||||
pub fn zero() -> FieldElement51 {
|
||||
FieldElement51([0, 0, 0, 0, 0])
|
||||
}
|
||||
|
||||
/// Construct one.
|
||||
pub fn one() -> FieldElement51 {
|
||||
FieldElement51([1, 0, 0, 0, 0])
|
||||
}
|
||||
|
||||
/// Construct -1.
|
||||
pub fn minus_one() -> FieldElement51 {
|
||||
FieldElement51([
|
||||
2251799813685228,
|
||||
2251799813685247,
|
||||
2251799813685247,
|
||||
2251799813685247,
|
||||
2251799813685247,
|
||||
])
|
||||
}
|
||||
|
||||
/// Given 64-bit input limbs, reduce to enforce the bound 2^(51 + epsilon).
|
||||
#[inline(always)]
|
||||
#[allow(dead_code)] // Need this to not complain about reduce not being used
|
||||
fn reduce(mut limbs: [u64; 5]) -> FieldElement51 {
|
||||
let input = limbs;
|
||||
fiat_25519_carry(&mut limbs, &input);
|
||||
FieldElement51(limbs)
|
||||
}
|
||||
|
||||
/// Load a `FieldElement51` from the low 255 bits of a 256-bit
|
||||
/// input.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This function does not check that the input used the canonical
|
||||
/// representative. It masks the high bit, but it will happily
|
||||
/// decode 2^255 - 18 to 1. Applications that require a canonical
|
||||
/// encoding of every field element should decode, re-encode to
|
||||
/// the canonical encoding, and check that the input was
|
||||
/// canonical.
|
||||
///
|
||||
pub fn from_bytes(bytes: &[u8; 32]) -> FieldElement51 {
|
||||
let mut temp = [0u8; 32];
|
||||
temp.copy_from_slice(bytes);
|
||||
temp[31] &= 127u8;
|
||||
let mut output = [0u64; 5];
|
||||
fiat_25519_from_bytes(&mut output, &temp);
|
||||
FieldElement51(output)
|
||||
}
|
||||
|
||||
/// Serialize this `FieldElement51` to a 32-byte array. The
|
||||
/// encoding is canonical.
|
||||
pub fn to_bytes(&self) -> [u8; 32] {
|
||||
let mut bytes = [0u8; 32];
|
||||
fiat_25519_to_bytes(&mut bytes, &self.0);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/// Given `k > 0`, return `self^(2^k)`.
|
||||
pub fn pow2k(&self, mut k: u32) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
loop {
|
||||
let input = output.0;
|
||||
fiat_25519_carry_square(&mut output.0, &input);
|
||||
k -= 1;
|
||||
if k == 0 {
|
||||
return output;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the square of this field element.
|
||||
pub fn square(&self) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
fiat_25519_carry_square(&mut output.0, &self.0);
|
||||
output
|
||||
}
|
||||
|
||||
/// Returns 2 times the square of this field element.
|
||||
pub fn square2(&self) -> FieldElement51 {
|
||||
let mut output = *self;
|
||||
let mut temp = *self;
|
||||
// Void vs return type, measure cost of copying self
|
||||
fiat_25519_carry_square(&mut temp.0, &self.0);
|
||||
fiat_25519_add(&mut output.0, &temp.0, &temp.0);
|
||||
let input = output.0;
|
||||
fiat_25519_carry(&mut output.0, &input);
|
||||
output
|
||||
}
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2018 Isis Lovecruft, Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Isis Agora Lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! The `u64` backend uses `u64`s and a `(u64, u64) -> u128` multiplier.
|
||||
//!
|
||||
//! On x86_64, the idiom `(x as u128) * (y as u128)` lowers to `MUL`
|
||||
//! instructions taking 64-bit inputs and producing 128-bit outputs. On
|
||||
//! other platforms, this implementation is not recommended.
|
||||
//!
|
||||
//! On Haswell and newer, the BMI2 extension provides `MULX`, and on
|
||||
//! Broadwell and newer, the ADX extension provides `ADCX` and `ADOX`
|
||||
//! (allowing the CPU to compute two carry chains in parallel). These
|
||||
//! will be used if available.
|
||||
|
||||
#[path = "../u64/scalar.rs"]
|
||||
pub mod scalar;
|
||||
|
||||
pub mod field;
|
||||
|
||||
#[path = "../u64/constants.rs"]
|
||||
pub mod constants;
|
|
@ -1,55 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2021 isis lovecruft
|
||||
// Copyright (c) 2016-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - isis agora lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Serial implementations of field, scalar, point arithmetic.
|
||||
//!
|
||||
//! When the vector backend is disabled, the crate uses the
|
||||
//! mixed-model strategy for implementing point operations and scalar
|
||||
//! multiplication; see the [`curve_models`](self::curve_models) and
|
||||
//! [`scalar_mul`](self::scalar_mul) documentation for more
|
||||
//! information.
|
||||
//!
|
||||
//! When the vector backend is enabled, the field and scalar
|
||||
//! implementations are still used for non-vectorized operations.
|
||||
//!
|
||||
//! Note: at this time the `u32` and `u64` backends cannot be built
|
||||
//! together.
|
||||
|
||||
#[cfg(not(any(
|
||||
feature = "u32_backend",
|
||||
feature = "u64_backend",
|
||||
feature = "fiat_u32_backend",
|
||||
feature = "fiat_u64_backend"
|
||||
)))]
|
||||
compile_error!(
|
||||
"no curve25519-dalek backend cargo feature enabled! \
|
||||
please enable one of: u32_backend, u64_backend, fiat_u32_backend, fiat_u64_backend"
|
||||
);
|
||||
|
||||
#[cfg(feature = "u32_backend")]
|
||||
pub mod u32;
|
||||
|
||||
#[cfg(feature = "u64_backend")]
|
||||
pub mod u64;
|
||||
|
||||
#[cfg(feature = "fiat_u32_backend")]
|
||||
pub mod fiat_u32;
|
||||
|
||||
#[cfg(feature = "fiat_u64_backend")]
|
||||
pub mod fiat_u64;
|
||||
|
||||
pub mod curve_models;
|
||||
|
||||
#[cfg(not(all(
|
||||
feature = "simd_backend",
|
||||
any(target_feature = "avx2", target_feature = "avx512ifma")
|
||||
)))]
|
||||
pub mod scalar_mul;
|
|
@ -1,31 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2021 isis lovecruft
|
||||
// Copyright (c) 2016-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - isis agora lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Implementations of various scalar multiplication algorithms.
|
||||
//!
|
||||
//! Note that all of these implementations use serial code for field
|
||||
//! arithmetic with the multi-model strategy described in the
|
||||
//! `curve_models` module. The vectorized AVX2 backend has its own
|
||||
//! scalar multiplication implementations, since it only uses one
|
||||
//! curve model.
|
||||
|
||||
pub mod variable_base;
|
||||
|
||||
pub mod vartime_double_base;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod precomputed_straus;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod pippenger;
|
|
@ -1,202 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Oleg Andreev
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Oleg Andreev <oleganza@gmail.com>
|
||||
|
||||
//! Implementation of a variant of Pippenger's algorithm.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::VartimeMultiscalarMul;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Implements a version of Pippenger's algorithm.
|
||||
///
|
||||
/// The algorithm works as follows:
|
||||
///
|
||||
/// Let `n` be a number of point-scalar pairs.
|
||||
/// Let `w` be a window of bits (6..8, chosen based on `n`, see cost factor).
|
||||
///
|
||||
/// 1. Prepare `2^(w-1) - 1` buckets with indices `[1..2^(w-1))` initialized with identity points.
|
||||
/// Bucket 0 is not needed as it would contain points multiplied by 0.
|
||||
/// 2. Convert scalars to a radix-`2^w` representation with signed digits in `[-2^w/2, 2^w/2]`.
|
||||
/// Note: only the last digit may equal `2^w/2`.
|
||||
/// 3. Starting with the last window, for each point `i=[0..n)` add it to a bucket indexed by
|
||||
/// the point's scalar's value in the window.
|
||||
/// 4. Once all points in a window are sorted into buckets, add buckets by multiplying each
|
||||
/// by their index. Efficient way of doing it is to start with the last bucket and compute two sums:
|
||||
/// intermediate sum from the last to the first, and the full sum made of all intermediate sums.
|
||||
/// 5. Shift the resulting sum of buckets by `w` bits by using `w` doublings.
|
||||
/// 6. Add to the return value.
|
||||
/// 7. Repeat the loop.
|
||||
///
|
||||
/// Approximate cost w/o wNAF optimizations (A = addition, D = doubling):
|
||||
///
|
||||
/// ```ascii
|
||||
/// cost = (n*A + 2*(2^w/2)*A + w*D + A)*256/w
|
||||
/// | | | | |
|
||||
/// | | | | looping over 256/w windows
|
||||
/// | | | adding to the result
|
||||
/// sorting points | shifting the sum by w bits (to the next window, starting from last window)
|
||||
/// one by one |
|
||||
/// into buckets adding/subtracting all buckets
|
||||
/// multiplied by their indexes
|
||||
/// using a sum of intermediate sums
|
||||
/// ```
|
||||
///
|
||||
/// For large `n`, dominant factor is (n*256/w) additions.
|
||||
/// However, if `w` is too big and `n` is not too big, then `(2^w/2)*A` could dominate.
|
||||
/// Therefore, the optimal choice of `w` grows slowly as `n` grows.
|
||||
///
|
||||
/// This algorithm is adapted from section 4 of https://eprint.iacr.org/2012/549.pdf.
|
||||
// Unit struct: it holds no data and only serves as the type on which the
// multiscalar-multiplication trait is implemented.
pub struct Pippenger;
|
||||
|
||||
#[cfg(any(feature = "alloc", feature = "std"))]
impl VartimeMultiscalarMul for Pippenger {
    type Point = EdwardsPoint;

    /// Computes the sum of `scalars[i] * points[i]` with the bucket method
    /// described on [`Pippenger`].
    ///
    /// Returns `None` as soon as any entry of `points` is `None`.
    fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
    where
        I: IntoIterator,
        I::Item: Borrow<Scalar>,
        J: IntoIterator<Item = Option<EdwardsPoint>>,
    {
        use traits::Identity;

        let scalar_iter = scalars.into_iter();
        // Lower bound of the iterator's size hint; used only to pick `w`.
        let size = scalar_iter.size_hint().0;

        // Window width in bits. Wider windows mean fewer point additions
        // per scalar but exponentially more buckets to maintain and sum.
        let w = match size {
            0..=499 => 6,
            500..=799 => 7,
            _ => 8,
        };

        let max_digit: usize = 1 << w;
        let digits_count: usize = Scalar::to_radix_2w_size_hint(w);
        // Digits are signed and centered, so only 2^w/2 buckets are needed
        // (there is no bucket for digit 0).
        let buckets_count: usize = max_digit / 2;

        // Recode every scalar and convert every point once up front; the
        // pairs are rescanned in full for each digit position.
        let recoded = scalar_iter.map(|s| s.borrow().to_radix_2w(w));
        let niels = points
            .into_iter()
            .map(|maybe_pt| maybe_pt.map(|pt| pt.to_projective_niels()));
        let scalars_points = recoded
            .zip(niels)
            .map(|(digits, maybe_pt)| maybe_pt.map(|pt| (digits, pt)))
            .collect::<Option<Vec<_>>>()?;

        // buckets[i] accumulates the points whose digit has magnitude i+1.
        let mut buckets: Vec<_> = (0..buckets_count)
            .map(|_| EdwardsPoint::identity())
            .collect();

        // Lazily yields one partial sum per digit position, most
        // significant position first.
        let mut columns = (0..digits_count).rev().map(|digit_index| {
            // Start each column from empty buckets.
            for bucket in buckets.iter_mut() {
                *bucket = EdwardsPoint::identity();
            }

            // Drop each point into (or subtract it from) the bucket that
            // matches the magnitude of its digit at this position.
            for (digits, pt) in scalars_points.iter() {
                // Widened to i16 so that negating the digit cannot overflow
                // when w = 8.
                let digit = digits[digit_index] as i16;
                if digit > 0 {
                    let slot = (digit - 1) as usize;
                    buckets[slot] = (&buckets[slot] + pt).to_extended();
                } else if digit < 0 {
                    let slot = (-digit - 1) as usize;
                    buckets[slot] = (&buckets[slot] - pt).to_extended();
                }
            }

            // Weighted sum of the buckets using two running sums. For
            // buckets 1*A, 2*B, 3*C:
            //   running: C, C+B, C+B+A
            //   total  : C + (C+B) + (C+B+A) = A + 2B + 3C
            let top = buckets[buckets_count - 1];
            let mut running = top;
            let mut total = top;
            for bucket in buckets[..buckets_count - 1].iter().rev() {
                running += *bucket;
                total += running;
            }

            total
        });

        // Seed the fold with the highest column so no doublings are spent on
        // the identity. The original code relies on there always being at
        // least one digit position, so `unwrap()` is expected to succeed.
        let hi_column = columns.next().unwrap();

        Some(columns.fold(hi_column, |acc, column| {
            acc.mul_by_pow_2(w as u32) + column
        }))
    }
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;
    use constants;
    use scalar::Scalar;

    /// Compares `Pippenger::vartime_multiscalar_mul` against the plain sum
    /// of individually multiplied points, for prefixes of length
    /// 512, 256, ..., 1 of one shared input set.
    #[test]
    fn test_vartime_pippenger() {
        // The full-size inputs are built once and sliced for each run.
        let mut n = 512;
        let x = Scalar::from(2128506u64).invert();
        let y = Scalar::from(4443282u64).invert();

        let points: Vec<_> = (0..n)
            .map(|i| constants::ED25519_BASEPOINT_POINT * Scalar::from(1 + i as u64))
            .collect();
        // Cheap way to get deterministic scalars that still look random.
        let scalars: Vec<_> = (0..n)
            .map(|i| x + (Scalar::from(i as u64) * y))
            .collect();

        let premultiplied: Vec<EdwardsPoint> = scalars
            .iter()
            .zip(points.iter())
            .map(|(sc, pt)| sc * pt)
            .collect();

        while n > 0 {
            let scalars_prefix = scalars[0..n].to_vec();
            let points_prefix = points[0..n].to_vec();
            let control: EdwardsPoint = premultiplied[0..n].iter().sum();

            let subject = Pippenger::vartime_multiscalar_mul(scalars_prefix, points_prefix);

            assert_eq!(subject.compress(), control.compress());

            n /= 2;
        }
    }
}
|
|
@ -1,110 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2019 Henry de Valence.
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Precomputation for Straus's method.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use backend::serial::curve_models::{
|
||||
AffineNielsPoint, CompletedPoint, ProjectiveNielsPoint, ProjectivePoint,
|
||||
};
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::Identity;
|
||||
use traits::VartimePrecomputedMultiscalarMul;
|
||||
use window::{NafLookupTable5, NafLookupTable8};
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
pub struct VartimePrecomputedStraus {
|
||||
static_lookup_tables: Vec<NafLookupTable8<AffineNielsPoint>>,
|
||||
}
|
||||
|
||||
impl VartimePrecomputedMultiscalarMul for VartimePrecomputedStraus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
fn new<I>(static_points: I) -> Self
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Self::Point>,
|
||||
{
|
||||
Self {
|
||||
static_lookup_tables: static_points
|
||||
.into_iter()
|
||||
.map(|P| NafLookupTable8::<AffineNielsPoint>::from(P.borrow()))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
fn optional_mixed_multiscalar_mul<I, J, K>(
|
||||
&self,
|
||||
static_scalars: I,
|
||||
dynamic_scalars: J,
|
||||
dynamic_points: K,
|
||||
) -> Option<Self::Point>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<Scalar>,
|
||||
K: IntoIterator<Item = Option<Self::Point>>,
|
||||
{
|
||||
let static_nafs = static_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
let dynamic_nafs: Vec<_> = dynamic_scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dynamic_lookup_tables = dynamic_points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let sp = self.static_lookup_tables.len();
|
||||
let dp = dynamic_lookup_tables.len();
|
||||
assert_eq!(sp, static_nafs.len());
|
||||
assert_eq!(dp, dynamic_nafs.len());
|
||||
|
||||
// We could save some doublings by looking for the highest
|
||||
// nonzero NAF coefficient, but since we might have a lot of
|
||||
// them to search, it's not clear it's worthwhile to check.
|
||||
let mut S = ProjectivePoint::identity();
|
||||
for j in (0..256).rev() {
|
||||
let mut R: CompletedPoint = S.double();
|
||||
|
||||
for i in 0..dp {
|
||||
let t_ij = dynamic_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R.to_extended() + &dynamic_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R.to_extended() - &dynamic_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..sp {
|
||||
let t_ij = static_nafs[i][j];
|
||||
if t_ij > 0 {
|
||||
R = &R.to_extended() + &self.static_lookup_tables[i].select(t_ij as usize);
|
||||
} else if t_ij < 0 {
|
||||
R = &R.to_extended() - &self.static_lookup_tables[i].select(-t_ij as usize);
|
||||
}
|
||||
}
|
||||
|
||||
S = R.to_projective();
|
||||
}
|
||||
|
||||
Some(S.to_extended())
|
||||
}
|
||||
}
|
|
@ -1,196 +0,0 @@
|
|||
// -*- mode: rust; -*-
|
||||
//
|
||||
// This file is part of curve25519-dalek.
|
||||
// Copyright (c) 2016-2021 isis lovecruft
|
||||
// Copyright (c) 2016-2019 Henry de Valence
|
||||
// See LICENSE for licensing information.
|
||||
//
|
||||
// Authors:
|
||||
// - isis agora lovecruft <isis@patternsinthevoid.net>
|
||||
// - Henry de Valence <hdevalence@hdevalence.ca>
|
||||
|
||||
//! Implementation of the interleaved window method, also known as Straus' method.
|
||||
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use core::borrow::Borrow;
|
||||
|
||||
use edwards::EdwardsPoint;
|
||||
use scalar::Scalar;
|
||||
use traits::MultiscalarMul;
|
||||
use traits::VartimeMultiscalarMul;
|
||||
|
||||
#[allow(unused_imports)]
|
||||
use prelude::*;
|
||||
|
||||
/// Multiscalar multiplication using the interleaved window method, commonly
/// called Straus' method: it was apparently [first published][solution] by
/// Straus in 1964 as a solution to [a problem][problem] posed in the
/// American Mathematical Monthly in 1963.
///
/// The technique is simple enough that it has been rediscovered many times.
/// When evaluating
/// \\[
/// Q = s_1 P_1 + \cdots + s_n P_n
/// \\]
/// with additions and doublings, the doublings can be shared between all of
/// the \\( P_i \\).
///
/// Two variants are provided: a constant-time algorithm based on fixed
/// windows, and a variable-time algorithm based on sliding windows. Both are
/// small variations on the same idea; each implementation documents its own
/// details.
///
/// [solution]: https://www.jstor.org/stable/2310929
/// [problem]: https://www.jstor.org/stable/2312273
pub struct Straus {}
|
||||
|
||||
impl MultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
/// Constant-time Straus using a fixed window of size \\(4\\).
|
||||
///
|
||||
/// Our goal is to compute
|
||||
/// \\[
|
||||
/// Q = s_1 P_1 + \cdots + s_n P_n.
|
||||
/// \\]
|
||||
///
|
||||
/// For each point \\( P_i \\), precompute a lookup table of
|
||||
/// \\[
|
||||
/// P_i, 2P_i, 3P_i, 4P_i, 5P_i, 6P_i, 7P_i, 8P_i.
|
||||
/// \\]
|
||||
///
|
||||
/// For each scalar \\( s_i \\), compute its radix-\\(2^4\\)
|
||||
/// signed digits \\( s_{i,j} \\), i.e.,
|
||||
/// \\[
|
||||
/// s_i = s_{i,0} + s_{i,1} 16^1 + ... + s_{i,63} 16^{63},
|
||||
/// \\]
|
||||
/// with \\( -8 \leq s_{i,j} < 8 \\). Since \\( 0 \leq |s_{i,j}|
|
||||
/// \leq 8 \\), we can retrieve \\( s_{i,j} P_i \\) from the
|
||||
/// lookup table with a conditional negation: using signed
|
||||
/// digits halves the required table size.
|
||||
///
|
||||
/// Then as in the single-base fixed window case, we have
|
||||
/// \\[
|
||||
/// \begin{aligned}
|
||||
/// s_i P_i &= P_i (s_{i,0} + s_{i,1} 16^1 + \cdots + s_{i,63} 16^{63}) \\\\
|
||||
/// s_i P_i &= P_i s_{i,0} + P_i s_{i,1} 16^1 + \cdots + P_i s_{i,63} 16^{63} \\\\
|
||||
/// s_i P_i &= P_i s_{i,0} + 16(P_i s_{i,1} + 16( \cdots +16P_i s_{i,63})\cdots )
|
||||
/// \end{aligned}
|
||||
/// \\]
|
||||
/// so each \\( s_i P_i \\) can be computed by alternately adding
|
||||
/// a precomputed multiple \\( P_i s_{i,j} \\) of \\( P_i \\) and
|
||||
/// repeatedly doubling.
|
||||
///
|
||||
/// Now consider the two-dimensional sum
|
||||
/// \\[
|
||||
/// \begin{aligned}
|
||||
/// s\_1 P\_1 &=& P\_1 s\_{1,0} &+& 16 (P\_1 s\_{1,1} &+& 16 ( \cdots &+& 16 P\_1 s\_{1,63}&) \cdots ) \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// s\_2 P\_2 &=& P\_2 s\_{2,0} &+& 16 (P\_2 s\_{2,1} &+& 16 ( \cdots &+& 16 P\_2 s\_{2,63}&) \cdots ) \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// \vdots & & \vdots & & \vdots & & & & \vdots & \\\\
|
||||
/// + & & + & & + & & & & + & \\\\
|
||||
/// s\_n P\_n &=& P\_n s\_{n,0} &+& 16 (P\_n s\_{n,1} &+& 16 ( \cdots &+& 16 P\_n s\_{n,63}&) \cdots )
|
||||
/// \end{aligned}
|
||||
/// \\]
|
||||
/// The sum of the left-hand column is the result \\( Q \\); by
|
||||
/// computing the two-dimensional sum on the right column-wise,
|
||||
/// top-to-bottom, then right-to-left, we need to multiply by \\(
|
||||
/// 16\\) only once per column, sharing the doublings across all
|
||||
/// of the input points.
|
||||
fn multiscalar_mul<I, J>(scalars: I, points: J) -> EdwardsPoint
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator,
|
||||
J::Item: Borrow<EdwardsPoint>,
|
||||
{
|
||||
use zeroize::Zeroizing;
|
||||
|
||||
use backend::serial::curve_models::ProjectiveNielsPoint;
|
||||
use window::LookupTable;
|
||||
use traits::Identity;
|
||||
|
||||
let lookup_tables: Vec<_> = points
|
||||
.into_iter()
|
||||
.map(|point| LookupTable::<ProjectiveNielsPoint>::from(point.borrow()))
|
||||
.collect();
|
||||
|
||||
// This puts the scalar digits into a heap-allocated Vec.
|
||||
// To ensure that these are erased, pass ownership of the Vec into a
|
||||
// Zeroizing wrapper.
|
||||
let scalar_digits_vec: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|s| s.borrow().to_radix_16())
|
||||
.collect();
|
||||
let scalar_digits = Zeroizing::new(scalar_digits_vec);
|
||||
|
||||
let mut Q = EdwardsPoint::identity();
|
||||
for j in (0..64).rev() {
|
||||
Q = Q.mul_by_pow_2(4);
|
||||
let it = scalar_digits.iter().zip(lookup_tables.iter());
|
||||
for (s_i, lookup_table_i) in it {
|
||||
// R_i = s_{i,j} * P_i
|
||||
let R_i = lookup_table_i.select(s_i[j]);
|
||||
// Q = Q + R_i
|
||||
Q = (&Q + &R_i).to_extended();
|
||||
}
|
||||
}
|
||||
|
||||
Q
|
||||
}
|
||||
}
|
||||
|
||||
impl VartimeMultiscalarMul for Straus {
|
||||
type Point = EdwardsPoint;
|
||||
|
||||
/// Variable-time Straus using a non-adjacent form of width \\(5\\).
|
||||
///
|
||||
/// This is completely similar to the constant-time code, but we
|
||||
/// use a non-adjacent form for the scalar, and do not do table
|
||||
/// lookups in constant time.
|
||||
///
|
||||
/// The non-adjacent form has signed, odd digits. Using only odd
|
||||
/// digits halves the table size (since we only need odd
|
||||
/// multiples), or gives fewer additions for the same table size.
|
||||
fn optional_multiscalar_mul<I, J>(scalars: I, points: J) -> Option<EdwardsPoint>
|
||||
where
|
||||
I: IntoIterator,
|
||||
I::Item: Borrow<Scalar>,
|
||||
J: IntoIterator<Item = Option<EdwardsPoint>>,
|
||||
{
|
||||
use backend::serial::curve_models::{CompletedPoint, ProjectiveNielsPoint, ProjectivePoint};
|
||||
use window::NafLookupTable5;
|
||||
use traits::Identity;
|
||||
|
||||
let nafs: Vec<_> = scalars
|
||||
.into_iter()
|
||||
.map(|c| c.borrow().non_adjacent_form(5))
|
||||
.collect();
|
||||
|
||||
let lookup_tables = points
|
||||
.into_iter()
|
||||
.map(|P_opt| P_opt.map(|P| NafLookupTable5::<ProjectiveNielsPoint>::from(&P)))
|
||||
.collect::<Option<Vec<_>>>()?;
|
||||
|
||||
let mut r = ProjectivePoint::identity();
|
||||
|
||||
for i in (0..256).rev() {
|
||||
let mut t: CompletedPoint = r.double();
|
||||
|
||||
for (naf, lookup_table) in nafs.iter().zip(lookup_tables.iter()) {
|
||||
if naf[i] > 0 {
|
||||
t = &t.to_extended() + &lookup_table.select(naf[i] as usize);
|
||||
} else if naf[i] < 0 {
|
||||
t = &t.to_extended() - &lookup_table.select(-naf[i] as usize);
|
||||
}
|
||||
}
|
||||
|
||||
r = t.to_projective();
|
||||
}
|
||||
|
||||
Some(r.to_extended())
|
||||
}
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
#![allow(non_snake_case)]
|
||||
|
||||
use traits::Identity;
|
||||
use scalar::Scalar;
|
||||
use edwards::EdwardsPoint;
|
||||
use backend::serial::curve_models::ProjectiveNielsPoint;
|
||||
use window::LookupTable;
|
||||
|
||||
/// Perform constant-time, variable-base scalar multiplication.
|
||||
pub(crate) fn mul(point: &EdwardsPoint, scalar: &Scalar) -> EdwardsPoint {
|
||||
// Construct a lookup table of [P,2P,3P,4P,5P,6P,7P,8P]
|
||||
let lookup_table = LookupTable::<ProjectiveNielsPoint>::from(point);
|
||||
// Setting s = scalar, compute
|
||||
//
|
||||
// s = s_0 + s_1*16^1 + ... + s_63*16^63,
|
||||
//
|
||||
// with `-8 ≤ s_i < 8` for `0 ≤ i < 63` and `-8 ≤ s_63 ≤ 8`.
|
||||
let scalar_digits = scalar.to_radix_16();
|
||||
// Compute s*P as
|
||||
//
|
||||
// s*P = P*(s_0 + s_1*16^1 + s_2*16^2 + ... + s_63*16^63)
|
||||
// s*P = P*s_0 + P*s_1*16^1 + P*s_2*16^2 + ... + P*s_63*16^63
|
||||
// s*P = P*s_0 + 16*(P*s_1 + 16*(P*s_2 + 16*( ... + P*s_63)...))
|
||||
//
|
||||
// We sum right-to-left.
|
||||
|
||||
// Unwrap first loop iteration to save computing 16*identity
|
||||
let mut tmp2;
|
||||
let mut tmp3 = EdwardsPoint::identity();
|
||||
let mut tmp1 = &tmp3 + &lookup_table.select(scalar_digits[63]);
|
||||
// Now tmp1 = s_63*P in P1xP1 coords
|
||||
for i in (0..63).rev() {
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = (prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 2*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 2*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 4*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 4*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 8*(prev) in P1xP1 coords
|
||||
tmp2 = tmp1.to_projective(); // tmp2 = 8*(prev) in P2 coords
|
||||
tmp1 = tmp2.double(); // tmp1 = 16*(prev) in P1xP1 coords
|
||||
tmp3 = tmp1.to_extended(); // tmp3 = 16*(prev) in P3 coords
|
||||
tmp1 = &tmp3 + &lookup_table.select(scalar_digits[i]);
|
||||
// Now tmp1 = s_i*P + 16*(prev) in P1xP1 coords
|
||||
}
|
||||
tmp1.to_extended()
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue