背景

一道 CTF 题目,输入一个数字,数字正确返回 flag,数字范围是 0x5000000000 到 0x9000000000。
逆向之后发现后需要暴力破解 AES 密码,关键步骤如下
密文是 dbef464aa741c1715c47db77f8404231b6afe3246680f81ad39f575ce5602171
密码是 5805c9a510be47acc3bee75cc6c8ad8dfa958f11a291018bFFFFFFFFFF6e9e76
输入的数字以大端序写入 FFFFFFFFFF 的位置,使用 AES-256-ECB 解密,解密后的格式应该是 flag{012345678901234}

Python 实现

先用 Python 还原一下解密算法,因为暴力破解并且只判断前 5 个字节,AES 的块大小是 128 位(16 字节),所以只取密文前16字节来暴力破解即可。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from Crypto.Cipher import AES
import time

key_pattern = \
'5805c9a510be47ac' \
'c3bee75cc6c8ad8d' \
'fa958f11a291018b' \
'FFFFFFFFFF6e9e76'

key = bytearray.fromhex(key_pattern)

# 密文
ciphertext_hex = 'dbef464aa741c1715c47db77f8404231b6afe3246680f81ad39f575ce5602171'

# 暴力破解只取16字节
ciphertext = bytes.fromhex(ciphertext_hex)[:16]

start = time.time()

found = False
for i in range(0x5000000000, 0x90FFFFFFFF):
key[24] = (i >> 32) & 0xFF
key[25] = (i >> 24) & 0xFF
key[26] = (i >> 16) & 0xFF
key[27] = (i >> 8) & 0xFF
key[28] = i & 0xFF

cipher = AES.new(key, AES.MODE_ECB)
try:
plaintext = cipher.decrypt(ciphertext)

if plaintext.startswith(b'flag{'):
print(f"Key (hex): {key.hex()}")
print(f"Decrypted flag: {cipher.decrypt(bytes.fromhex(ciphertext_hex)).decode()}")
found = True
break
except:
continue

end = time.time()

if not found:
print("Not found")

print(f"耗时: {end - start:.4f} 秒")

尝试跑了 3 个字节要 106 秒,估算了下时间要大概要 20 天,感觉就算优化成多线程也要至少一天,显然用 Python 是不行的。

C++/OpenSSL

尝试调用 OpenSSL 来写,OpenSSL 内部已经是 AES-NI 的实现了,性能提升应该很大,OpenSSL 3.0 需要使用 EVP (高级加密接口 Envelope API),暴力破解在循环的每步都初始化上下文显然也是不行的,所以需要使用废弃的老版本接口,加上 #define OPENSSL_SUPPRESS_DEPRECATED

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <iostream>
#define OPENSSL_SUPPRESS_DEPRECATED
#include <openssl/aes.h>
#include <iomanip>
#include <sstream>
#include <thread>
#include <mutex>
#include <vector>
#include <atomic>
#include <cstring>
#include <chrono>

std::mutex cout_mutex;
std::atomic<bool> found(false);

void decrypt_aes256_ecb_optimized(uint64_t start, uint64_t end) {

unsigned char base_key[32] = {
0x58,0x05,0xc9,0xa5,0x10,0xbe,0x47,0xac,
0xc3,0xbe,0xe7,0x5c,0xc6,0xc8,0xad,0x8d,
0xfa,0x95,0x8f,0x11,0xa2,0x91,0x01,0x8b,
0xFF,0xFF,0xFF,0xFF,0xFF,0x6e,0x9e,0x76
};

unsigned char ciphertext[AES_BLOCK_SIZE] = {
0xdb,0xef,0x46,0x4a,0xa7,0x41,0xc1,0x71,
0x5c,0x47,0xdb,0x77,0xf8,0x40,0x42,0x31
};

unsigned char plaintext[AES_BLOCK_SIZE + 1];
plaintext[AES_BLOCK_SIZE] = '\0';

AES_KEY aes_key;
unsigned char current_key[32];
uint32_t last_block_val = 0;

std::memcpy(current_key, base_key, sizeof(base_key));

for (uint64_t i = start; i < end && !found; ++i) {

current_key[24] = (i >> 32) & 0xFF;
current_key[25] = (i >> 24) & 0xFF;
current_key[26] = (i >> 16) & 0xFF;
current_key[27] = (i >> 8) & 0xFF;
current_key[28] = i & 0xFF;

if (AES_set_decrypt_key(current_key, 256, &aes_key) < 0) {
std::cerr << "AES_set_decrypt_key failed!" << std::endl;
return;
}

AES_decrypt(ciphertext, plaintext, &aes_key);

if (plaintext[0] == 'f' && plaintext[1] == 'l' && plaintext[2] == 'a' && plaintext[3] == 'g' && plaintext[4] == '{') {
found = true;
std::lock_guard<std::mutex> lock(cout_mutex);
std::cout << "Found at index: 0x" << std::hex << i << std::dec << std::endl;
std::cout << "Plaintext: " << plaintext << std::endl;
break;
}
}
}

int main() {

const uint64_t start = 0x5000000000u;
const uint64_t end = 0x90FFFFFFFFu;
const uint32_t num_threads = std::thread::hardware_concurrency();

size_t total_range = (size_t)end - start;
size_t range_per_thread = total_range / num_threads;

std::vector<std::thread> threads;

std::cout << "Using " << num_threads << " threads" << std::endl;
std::cout << "Range per thread: " << std::hex << range_per_thread << std::dec << std::endl;

for (uint32_t i = 0; i < num_threads; ++i) {
uint32_t thread_start = start + i * range_per_thread;
uint32_t thread_end = (i == num_threads - 1) ? end : thread_start + range_per_thread;

threads.emplace_back(decrypt_aes256_ecb_optimized, thread_start, thread_end);
}

auto start_time = std::chrono::high_resolution_clock::now();

for (auto& thread : threads) {
thread.join();
}

auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);

std::cout << "\n--- Done ---" << std::endl;
if (!found) {
std::cout << "Not found" << std::endl;
}
std::cout << "Total time: " << duration.count() << " ms" << std::endl;

return 0;
}

使用 9950X 尝试跑完 32 位的空间密码需要 62 秒,预计一个多小时应该能跑出来。

hashcat

那么有没有更快的办法呢,想到了用 hashcat,hashcat 里面已有 AES-256-ECB 的暴力破解内核 26403。但是看了代码,内核的实现方式是加密 16 字节明文,与 16 字节密文对比。现在的场景需要解密密文后与部分明文比对。新建了一个内核 26406,稍微修改了下 26403 的代码,变为调用 aes256_decrypt 解密密文,对比部分明文。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

#ifdef KERNEL_STATIC
#include M2S(INCLUDE_PATH/inc_vendor.h)
#include M2S(INCLUDE_PATH/inc_types.h)
#include M2S(INCLUDE_PATH/inc_platform.cl)
#include M2S(INCLUDE_PATH/inc_common.cl)
#include M2S(INCLUDE_PATH/inc_simd.cl)
#include M2S(INCLUDE_PATH/inc_cipher_aes.cl)
#endif

DECLSPEC void m26406s (SHM_TYPE u32a *s_te0, SHM_TYPE u32a *s_te1, SHM_TYPE u32a *s_te2, SHM_TYPE u32a *s_te3, SHM_TYPE u32a *s_te4, SHM_TYPE u32a *s_td0, SHM_TYPE u32a *s_td1, SHM_TYPE u32a *s_td2, SHM_TYPE u32a *s_td3, SHM_TYPE u32a *s_td4, PRIVATE_AS u32 *w, const u32 pw_len, KERN_ATTR_FUNC_VECTOR ())
{

const u32 ct[4] =
{
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R0],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R1],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R2],
digests_buf[DIGESTS_OFFSET_HOST].digest_buf[DGST_R3]

};


/**
* loop
*/

u32 w0l = w[0];

for (u32 il_pos = 0; il_pos < IL_CNT; il_pos += VECT_SIZE)
{
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];

const u32x w0 = w0l | w0r;

u32 ukey[8];

ukey[0] = w0;
ukey[1] = w[1];
ukey[2] = w[2];
ukey[3] = w[3];
ukey[4] = w[4];
ukey[5] = w[5];
ukey[6] = w[6];
ukey[7] = w[7];

#define KEYLEN 60

u32 ks[KEYLEN];

aes256_set_decrypt_key (ks, ukey, s_te0, s_te1, s_te2, s_te3, s_td0, s_td1, s_td2, s_td3);

u32 pt[4];

// 对 digest_buf 中的密文进行解密
aes256_decrypt (ks, ct, pt, s_td0, s_td1, s_td2, s_td3, s_td4);

if (pt[0] != 0x67616c66) continue; // flag
if ((pt[1] & 0xFF) != 0x7b) continue; // {
mark_hash(plains_buf, d_return_buf,
SALT_POS_HOST, DIGESTS_CNT,
0, DIGESTS_OFFSET_HOST,
gid, il_pos, 0, 0);
}
}

KERNEL_FQ KERNEL_FA void m26406_s04 (KERN_ATTR_VECTOR ()) { return; }

KERNEL_FQ KERNEL_FA void m26406_s08 (KERN_ATTR_VECTOR ()) { return; }

KERNEL_FQ KERNEL_FA void m26406_s16 (KERN_ATTR_VECTOR ())
{
const u64 lid = get_local_id (0);
const u64 gid = get_global_id (0);
const u64 lsz = get_local_size (0);

/**
* aes shared
*/

#ifdef REAL_SHM

LOCAL_VK u32 s_td0[256];
LOCAL_VK u32 s_td1[256];
LOCAL_VK u32 s_td2[256];
LOCAL_VK u32 s_td3[256];
LOCAL_VK u32 s_td4[256];

LOCAL_VK u32 s_te0[256];
LOCAL_VK u32 s_te1[256];
LOCAL_VK u32 s_te2[256];
LOCAL_VK u32 s_te3[256];
LOCAL_VK u32 s_te4[256];

for (u32 i = lid; i < 256; i += lsz)
{
s_td0[i] = td0[i];
s_td1[i] = td1[i];
s_td2[i] = td2[i];
s_td3[i] = td3[i];
s_td4[i] = td4[i];

s_te0[i] = te0[i];
s_te1[i] = te1[i];
s_te2[i] = te2[i];
s_te3[i] = te3[i];
s_te4[i] = te4[i];
}

SYNC_THREADS ();

#else

CONSTANT_AS u32a *s_td0 = td0;
CONSTANT_AS u32a *s_td1 = td1;
CONSTANT_AS u32a *s_td2 = td2;
CONSTANT_AS u32a *s_td3 = td3;
CONSTANT_AS u32a *s_td4 = td4;

CONSTANT_AS u32a *s_te0 = te0;
CONSTANT_AS u32a *s_te1 = te1;
CONSTANT_AS u32a *s_te2 = te2;
CONSTANT_AS u32a *s_te3 = te3;
CONSTANT_AS u32a *s_te4 = te4;

#endif

if (gid >= GID_CNT) return;

/**
* base
*/

u32 w[16];

w[ 0] = pws[gid].i[ 0];
w[ 1] = pws[gid].i[ 1];
w[ 2] = pws[gid].i[ 2];
w[ 3] = pws[gid].i[ 3];
w[ 4] = pws[gid].i[ 4];
w[ 5] = pws[gid].i[ 5];
w[ 6] = pws[gid].i[ 6];
w[ 7] = pws[gid].i[ 7];
w[ 8] = pws[gid].i[ 8];
w[ 9] = pws[gid].i[ 9];
w[10] = pws[gid].i[10];
w[11] = pws[gid].i[11];
w[12] = pws[gid].i[12];
w[13] = pws[gid].i[13];
w[14] = pws[gid].i[14];
w[15] = pws[gid].i[15];

const u32 pw_len = pws[gid].pw_len & 63;

/**
* main
*/

m26406s (s_te0, s_te1, s_te2, s_te3, s_te4, s_td0, s_td1, s_td2, s_td3, s_td4, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_extra0_buf, d_extra1_buf, d_extra2_buf, d_extra3_buf, kernel_param, gid, lid, lsz);
}

4070S 上跑到了 160MH/s,27分钟跑出来了,flag 是 flag{17754327890}
不过还是有一点小问题,AES-256-ECB 在 hashcat 的 benchmark 是能跑到 1322 MH/s 的,有可能 benchmark 的密钥是字母后面 ‘\x00’ 补齐的,跑完整的32字节密钥就到不了 benchmark 的速度,具体原因还没有去定位。