diff options
Diffstat (limited to 'crypto/gf128mul.c')
-rw-r--r-- | crypto/gf128mul.c | 171 |
1 files changed, 136 insertions, 35 deletions
diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c index 5276607c72d0..f3d9f6da0767 100644 --- a/crypto/gf128mul.c +++ b/crypto/gf128mul.c | |||
@@ -44,7 +44,7 @@ | |||
44 | --------------------------------------------------------------------------- | 44 | --------------------------------------------------------------------------- |
45 | Issue 31/01/2006 | 45 | Issue 31/01/2006 |
46 | 46 | ||
47 | This file provides fast multiplication in GF(128) as required by several | 47 | This file provides fast multiplication in GF(2^128) as required by several |
48 | cryptographic authentication modes | 48 | cryptographic authentication modes |
49 | */ | 49 | */ |
50 | 50 | ||
@@ -88,37 +88,52 @@ | |||
88 | q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ | 88 | q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ |
89 | } | 89 | } |
90 | 90 | ||
91 | /* Given the value i in 0..255 as the byte overflow when a field element | 91 | /* |
92 | in GHASH is multiplied by x^8, this function will return the values that | 92 | * Given a value i in 0..255 as the byte overflow when a field element |
93 | are generated in the lo 16-bit word of the field value by applying the | 93 | * in GF(2^128) is multiplied by x^8, the following macro returns the |
94 | modular polynomial. The values lo_byte and hi_byte are returned via the | 94 | * 16-bit value that must be XOR-ed into the low-degree end of the |
95 | macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into | 95 | * product to reduce it modulo the irreducible polynomial x^128 + x^7 + |
96 | memory as required by a suitable definition of this macro operating on | 96 | * x^2 + x + 1. |
97 | the table above | 97 | * |
98 | */ | 98 | * There are two versions of the macro, and hence two tables: one for |
99 | 99 | * the "be" convention where the highest-order bit is the coefficient of | |
100 | #define xx(p, q) 0x##p##q | 100 | * the highest-degree polynomial term, and one for the "le" convention |
101 | * where the highest-order bit is the coefficient of the lowest-degree | ||
102 | * polynomial term. In both cases the values are stored in CPU byte | ||
103 | * endianness such that the coefficients are ordered consistently across | ||
104 | * bytes, i.e. in the "be" table bits 15..0 of the stored value | ||
105 | * correspond to the coefficients of x^15..x^0, and in the "le" table | ||
106 | * bits 15..0 correspond to the coefficients of x^0..x^15. | ||
107 | * | ||
108 | * Therefore, provided that the appropriate byte endianness conversions | ||
109 | * are done by the multiplication functions (and these must be in place | ||
110 | * anyway to support both little endian and big endian CPUs), the "be" | ||
111 | * table can be used for multiplications of both "bbe" and "ble" | ||
112 | * elements, and the "le" table can be used for multiplications of both | ||
113 | * "lle" and "lbe" elements. | ||
114 | */ | ||
101 | 115 | ||
102 | #define xda_bbe(i) ( \ | 116 | #define xda_be(i) ( \ |
103 | (i & 0x80 ? xx(43, 80) : 0) ^ (i & 0x40 ? xx(21, c0) : 0) ^ \ | 117 | (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ |
104 | (i & 0x20 ? xx(10, e0) : 0) ^ (i & 0x10 ? xx(08, 70) : 0) ^ \ | 118 | (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ |
105 | (i & 0x08 ? xx(04, 38) : 0) ^ (i & 0x04 ? xx(02, 1c) : 0) ^ \ | 119 | (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ |
106 | (i & 0x02 ? xx(01, 0e) : 0) ^ (i & 0x01 ? xx(00, 87) : 0) \ | 120 | (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ |
107 | ) | 121 | ) |
108 | 122 | ||
109 | #define xda_lle(i) ( \ | 123 | #define xda_le(i) ( \ |
110 | (i & 0x80 ? xx(e1, 00) : 0) ^ (i & 0x40 ? xx(70, 80) : 0) ^ \ | 124 | (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ |
111 | (i & 0x20 ? xx(38, 40) : 0) ^ (i & 0x10 ? xx(1c, 20) : 0) ^ \ | 125 | (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ |
112 | (i & 0x08 ? xx(0e, 10) : 0) ^ (i & 0x04 ? xx(07, 08) : 0) ^ \ | 126 | (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ |
113 | (i & 0x02 ? xx(03, 84) : 0) ^ (i & 0x01 ? xx(01, c2) : 0) \ | 127 | (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \ |
114 | ) | 128 | ) |
115 | 129 | ||
116 | static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle); | 130 | static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); |
117 | static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe); | 131 | static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); |
118 | 132 | ||
119 | /* These functions multiply a field element by x, by x^4 and by x^8 | 133 | /* |
120 | * in the polynomial field representation. It uses 32-bit word operations | 134 | * The following functions multiply a field element by x or by x^8 in |
121 | * to gain speed but compensates for machine endianess and hence works | 135 | * the polynomial field representation. They use 64-bit word operations |
136 | * to gain speed but compensate for machine endianness and hence work | ||
122 | * correctly on both styles of machine. | 137 | * correctly on both styles of machine. |
123 | */ | 138 | */ |
124 | 139 | ||
@@ -126,7 +141,7 @@ static void gf128mul_x_lle(be128 *r, const be128 *x) | |||
126 | { | 141 | { |
127 | u64 a = be64_to_cpu(x->a); | 142 | u64 a = be64_to_cpu(x->a); |
128 | u64 b = be64_to_cpu(x->b); | 143 | u64 b = be64_to_cpu(x->b); |
129 | u64 _tt = gf128mul_table_lle[(b << 7) & 0xff]; | 144 | u64 _tt = gf128mul_table_le[(b << 7) & 0xff]; |
130 | 145 | ||
131 | r->b = cpu_to_be64((b >> 1) | (a << 63)); | 146 | r->b = cpu_to_be64((b >> 1) | (a << 63)); |
132 | r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); | 147 | r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); |
@@ -136,7 +151,7 @@ static void gf128mul_x_bbe(be128 *r, const be128 *x) | |||
136 | { | 151 | { |
137 | u64 a = be64_to_cpu(x->a); | 152 | u64 a = be64_to_cpu(x->a); |
138 | u64 b = be64_to_cpu(x->b); | 153 | u64 b = be64_to_cpu(x->b); |
139 | u64 _tt = gf128mul_table_bbe[a >> 63]; | 154 | u64 _tt = gf128mul_table_be[a >> 63]; |
140 | 155 | ||
141 | r->a = cpu_to_be64((a << 1) | (b >> 63)); | 156 | r->a = cpu_to_be64((a << 1) | (b >> 63)); |
142 | r->b = cpu_to_be64((b << 1) ^ _tt); | 157 | r->b = cpu_to_be64((b << 1) ^ _tt); |
@@ -146,7 +161,7 @@ void gf128mul_x_ble(be128 *r, const be128 *x) | |||
146 | { | 161 | { |
147 | u64 a = le64_to_cpu(x->a); | 162 | u64 a = le64_to_cpu(x->a); |
148 | u64 b = le64_to_cpu(x->b); | 163 | u64 b = le64_to_cpu(x->b); |
149 | u64 _tt = gf128mul_table_bbe[b >> 63]; | 164 | u64 _tt = gf128mul_table_be[b >> 63]; |
150 | 165 | ||
151 | r->a = cpu_to_le64((a << 1) ^ _tt); | 166 | r->a = cpu_to_le64((a << 1) ^ _tt); |
152 | r->b = cpu_to_le64((b << 1) | (a >> 63)); | 167 | r->b = cpu_to_le64((b << 1) | (a >> 63)); |
@@ -157,7 +172,7 @@ static void gf128mul_x8_lle(be128 *x) | |||
157 | { | 172 | { |
158 | u64 a = be64_to_cpu(x->a); | 173 | u64 a = be64_to_cpu(x->a); |
159 | u64 b = be64_to_cpu(x->b); | 174 | u64 b = be64_to_cpu(x->b); |
160 | u64 _tt = gf128mul_table_lle[b & 0xff]; | 175 | u64 _tt = gf128mul_table_le[b & 0xff]; |
161 | 176 | ||
162 | x->b = cpu_to_be64((b >> 8) | (a << 56)); | 177 | x->b = cpu_to_be64((b >> 8) | (a << 56)); |
163 | x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); | 178 | x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); |
@@ -167,12 +182,22 @@ static void gf128mul_x8_bbe(be128 *x) | |||
167 | { | 182 | { |
168 | u64 a = be64_to_cpu(x->a); | 183 | u64 a = be64_to_cpu(x->a); |
169 | u64 b = be64_to_cpu(x->b); | 184 | u64 b = be64_to_cpu(x->b); |
170 | u64 _tt = gf128mul_table_bbe[a >> 56]; | 185 | u64 _tt = gf128mul_table_be[a >> 56]; |
171 | 186 | ||
172 | x->a = cpu_to_be64((a << 8) | (b >> 56)); | 187 | x->a = cpu_to_be64((a << 8) | (b >> 56)); |
173 | x->b = cpu_to_be64((b << 8) ^ _tt); | 188 | x->b = cpu_to_be64((b << 8) ^ _tt); |
174 | } | 189 | } |
175 | 190 | ||
191 | static void gf128mul_x8_ble(be128 *x) | ||
192 | { | ||
193 | u64 a = le64_to_cpu(x->b); | ||
194 | u64 b = le64_to_cpu(x->a); | ||
195 | u64 _tt = gf128mul_table_be[a >> 56]; | ||
196 | |||
197 | x->b = cpu_to_le64((a << 8) | (b >> 56)); | ||
198 | x->a = cpu_to_le64((b << 8) ^ _tt); | ||
199 | } | ||
200 | |||
176 | void gf128mul_lle(be128 *r, const be128 *b) | 201 | void gf128mul_lle(be128 *r, const be128 *b) |
177 | { | 202 | { |
178 | be128 p[8]; | 203 | be128 p[8]; |
@@ -249,9 +274,48 @@ void gf128mul_bbe(be128 *r, const be128 *b) | |||
249 | } | 274 | } |
250 | EXPORT_SYMBOL(gf128mul_bbe); | 275 | EXPORT_SYMBOL(gf128mul_bbe); |
251 | 276 | ||
277 | void gf128mul_ble(be128 *r, const be128 *b) | ||
278 | { | ||
279 | be128 p[8]; | ||
280 | int i; | ||
281 | |||
282 | p[0] = *r; | ||
283 | for (i = 0; i < 7; ++i) | ||
284 | gf128mul_x_ble((be128 *)&p[i + 1], (be128 *)&p[i]); | ||
285 | |||
286 | memset(r, 0, sizeof(*r)); | ||
287 | for (i = 0;;) { | ||
288 | u8 ch = ((u8 *)b)[15 - i]; | ||
289 | |||
290 | if (ch & 0x80) | ||
291 | be128_xor(r, r, &p[7]); | ||
292 | if (ch & 0x40) | ||
293 | be128_xor(r, r, &p[6]); | ||
294 | if (ch & 0x20) | ||
295 | be128_xor(r, r, &p[5]); | ||
296 | if (ch & 0x10) | ||
297 | be128_xor(r, r, &p[4]); | ||
298 | if (ch & 0x08) | ||
299 | be128_xor(r, r, &p[3]); | ||
300 | if (ch & 0x04) | ||
301 | be128_xor(r, r, &p[2]); | ||
302 | if (ch & 0x02) | ||
303 | be128_xor(r, r, &p[1]); | ||
304 | if (ch & 0x01) | ||
305 | be128_xor(r, r, &p[0]); | ||
306 | |||
307 | if (++i >= 16) | ||
308 | break; | ||
309 | |||
310 | gf128mul_x8_ble(r); | ||
311 | } | ||
312 | } | ||
313 | EXPORT_SYMBOL(gf128mul_ble); | ||
314 | |||
315 | |||
252 | /* This version uses 64k bytes of table space. | 316 | /* This version uses 64k bytes of table space. |
253 | A 16 byte buffer has to be multiplied by a 16 byte key | 317 | A 16 byte buffer has to be multiplied by a 16 byte key |
254 | value in GF(128). If we consider a GF(128) value in | 318 | value in GF(2^128). If we consider a GF(2^128) value in |
255 | the buffer's lowest byte, we can construct a table of | 319 | the buffer's lowest byte, we can construct a table of |
256 | the 256 16 byte values that result from the 256 values | 320 | the 256 16 byte values that result from the 256 values |
257 | of this byte. This requires 4096 bytes. But we also | 321 | of this byte. This requires 4096 bytes. But we also |
@@ -352,8 +416,8 @@ void gf128mul_free_64k(struct gf128mul_64k *t) | |||
352 | int i; | 416 | int i; |
353 | 417 | ||
354 | for (i = 0; i < 16; i++) | 418 | for (i = 0; i < 16; i++) |
355 | kfree(t->t[i]); | 419 | kzfree(t->t[i]); |
356 | kfree(t); | 420 | kzfree(t); |
357 | } | 421 | } |
358 | EXPORT_SYMBOL(gf128mul_free_64k); | 422 | EXPORT_SYMBOL(gf128mul_free_64k); |
359 | 423 | ||
@@ -385,7 +449,7 @@ EXPORT_SYMBOL(gf128mul_64k_bbe); | |||
385 | 449 | ||
386 | /* This version uses 4k bytes of table space. | 450 | /* This version uses 4k bytes of table space. |
387 | A 16 byte buffer has to be multiplied by a 16 byte key | 451 | A 16 byte buffer has to be multiplied by a 16 byte key |
388 | value in GF(128). If we consider a GF(128) value in a | 452 | value in GF(2^128). If we consider a GF(2^128) value in a |
389 | single byte, we can construct a table of the 256 16 byte | 453 | single byte, we can construct a table of the 256 16 byte |
390 | values that result from the 256 values of this byte. | 454 | values that result from the 256 values of this byte. |
391 | This requires 4096 bytes. If we take the highest byte in | 455 | This requires 4096 bytes. If we take the highest byte in |
@@ -443,6 +507,28 @@ out: | |||
443 | } | 507 | } |
444 | EXPORT_SYMBOL(gf128mul_init_4k_bbe); | 508 | EXPORT_SYMBOL(gf128mul_init_4k_bbe); |
445 | 509 | ||
510 | struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g) | ||
511 | { | ||
512 | struct gf128mul_4k *t; | ||
513 | int j, k; | ||
514 | |||
515 | t = kzalloc(sizeof(*t), GFP_KERNEL); | ||
516 | if (!t) | ||
517 | goto out; | ||
518 | |||
519 | t->t[1] = *g; | ||
520 | for (j = 1; j <= 64; j <<= 1) | ||
521 | gf128mul_x_ble(&t->t[j + j], &t->t[j]); | ||
522 | |||
523 | for (j = 2; j < 256; j += j) | ||
524 | for (k = 1; k < j; ++k) | ||
525 | be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); | ||
526 | |||
527 | out: | ||
528 | return t; | ||
529 | } | ||
530 | EXPORT_SYMBOL(gf128mul_init_4k_ble); | ||
531 | |||
446 | void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) | 532 | void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t) |
447 | { | 533 | { |
448 | u8 *ap = (u8 *)a; | 534 | u8 *ap = (u8 *)a; |
@@ -473,5 +559,20 @@ void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t) | |||
473 | } | 559 | } |
474 | EXPORT_SYMBOL(gf128mul_4k_bbe); | 560 | EXPORT_SYMBOL(gf128mul_4k_bbe); |
475 | 561 | ||
562 | void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t) | ||
563 | { | ||
564 | u8 *ap = (u8 *)a; | ||
565 | be128 r[1]; | ||
566 | int i = 15; | ||
567 | |||
568 | *r = t->t[ap[15]]; | ||
569 | while (i--) { | ||
570 | gf128mul_x8_ble(r); | ||
571 | be128_xor(r, r, &t->t[ap[i]]); | ||
572 | } | ||
573 | *a = *r; | ||
574 | } | ||
575 | EXPORT_SYMBOL(gf128mul_4k_ble); | ||
576 | |||
476 | MODULE_LICENSE("GPL"); | 577 | MODULE_LICENSE("GPL"); |
477 | MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)"); | 578 | MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)"); |