tools/virtiofsd: xattr name mappings: Map client xattr names
[qemu.git] / tools / virtiofsd / passthrough_ll.c
1 /*
2 * FUSE: Filesystem in Userspace
3 * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
4 *
5 * This program can be distributed under the terms of the GNU GPLv2.
6 * See the file COPYING.
7 */
8
9 /*
10 *
11 * This file system mirrors the existing file system hierarchy of the
12 * system, starting at the root file system. This is implemented by
13 * just "passing through" all requests to the corresponding user-space
14 * libc functions. In contrast to passthrough.c and passthrough_fh.c,
15 * this implementation uses the low-level API. Its performance should
16 * be the least bad among the three, but many operations are not
17 * implemented. In particular, it is not possible to remove files (or
18 * directories) because the code necessary to defer actual removal
19 * until the file is not opened anymore would make the example much
20 * more complicated.
21 *
22 * When writeback caching is enabled (-o writeback mount option), it
23 * is only possible to write to files for which the mounting user has
24 * read permissions. This is because the writeback cache requires the
25 * kernel to be able to issue read requests for all files (which the
26 * passthrough filesystem cannot satisfy if it can't read the file in
27 * the underlying filesystem).
28 *
29 * Compile with:
30 *
31 * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
32 * passthrough_ll
33 *
34 * ## Source code ##
35 * \include passthrough_ll.c
36 */
37
38 #include "qemu/osdep.h"
39 #include "qemu/timer.h"
40 #include "fuse_virtio.h"
41 #include "fuse_log.h"
42 #include "fuse_lowlevel.h"
43 #include <assert.h>
44 #include <cap-ng.h>
45 #include <dirent.h>
46 #include <errno.h>
47 #include <glib.h>
48 #include <inttypes.h>
49 #include <limits.h>
50 #include <pthread.h>
51 #include <stdbool.h>
52 #include <stddef.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <sys/file.h>
57 #include <sys/mount.h>
58 #include <sys/prctl.h>
59 #include <sys/resource.h>
60 #include <sys/syscall.h>
61 #include <sys/types.h>
62 #include <sys/wait.h>
63 #include <sys/xattr.h>
64 #include <syslog.h>
65 #include <unistd.h>
66
67 #include "qemu/cutils.h"
68 #include "passthrough_helpers.h"
69 #include "passthrough_seccomp.h"
70
/*
 * POSIX lock bookkeeping for one inode, kept per lock owner: each entry
 * pairs the owner with the file descriptor used for its OFD locks.
 */
struct lo_inode_plock {
    uint64_t lock_owner; /* lock owner this entry belongs to */
    int fd;              /* fd for OFD locks */
};
76
/*
 * A single slot of a struct lo_map.
 *
 * While in_use is true the union carries the mapped object (an inode, a
 * directory stream or a file descriptor, depending on the map).  While the
 * slot is free, the union's freelist member holds the index of the next
 * free slot, or -1 at the end of the chain.
 */
struct lo_map_elem {
    union {
        struct lo_inode *inode;
        struct lo_dirp *dirp;
        int fd;
        ssize_t freelist;
    };
    bool in_use;
};
86
/*
 * Maps FUSE fh or ino values to internal objects.  The key is the index of
 * a slot in elems.
 */
struct lo_map {
    struct lo_map_elem *elems; /* slot array, grown with realloc() */
    size_t nelems;             /* number of slots in elems */
    ssize_t freelist;          /* index of the first free slot, -1 if none */
};
93
/*
 * Identity of a host file, taken from its stat st_ino/st_dev pair.  Used
 * as the lookup key of the lo_data inodes hash table.
 */
struct lo_key {
    ino_t ino;
    dev_t dev;
};
98
99 struct lo_inode {
100 int fd;
101
102 /*
103 * Atomic reference count for this object. The nlookup field holds a
104 * reference and release it when nlookup reaches 0.
105 */
106 gint refcount;
107
108 struct lo_key key;
109
110 /*
111 * This counter keeps the inode alive during the FUSE session.
112 * Incremented when the FUSE inode number is sent in a reply
113 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
114 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
115 *
116 * Note that this value is untrusted because the client can manipulate
117 * it arbitrarily using FUSE_FORGET requests.
118 *
119 * Protected by lo->mutex.
120 */
121 uint64_t nlookup;
122
123 fuse_ino_t fuse_ino;
124 pthread_mutex_t plock_mutex;
125 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
126
127 mode_t filetype;
128 };
129
/*
 * Effective credentials saved by lo_change_cred() so that
 * lo_restore_cred() can switch back to them.
 */
struct lo_cred {
    uid_t euid;
    gid_t egid;
};
134
/* Values of lo_data.cache, selected by the "cache=..." options. */
enum {
    CACHE_NONE,   /* cache=none */
    CACHE_AUTO,   /* cache=auto */
    CACHE_ALWAYS, /* cache=always */
};
140
/* Values of lo_data.sandbox, selected by the "sandbox=..." options. */
enum {
    SANDBOX_NAMESPACE, /* sandbox=namespace */
    SANDBOX_CHROOT,    /* sandbox=chroot */
};
145
/*
 * One entry of lo_data.xattr_map_list.
 * NOTE(review): the code that fills and consumes these entries is not part
 * of this section; presumably key is the name (prefix) to match, prepend
 * the string to add, and flags selects how the rule applies - confirm
 * against the xattrmap parser.
 */
typedef struct xattr_map_entry {
    char *key;
    char *prepend;
    unsigned int flags;
} XattrMapEntry;
151
152 struct lo_data {
153 pthread_mutex_t mutex;
154 int sandbox;
155 int debug;
156 int writeback;
157 int flock;
158 int posix_lock;
159 int xattr;
160 char *xattrmap;
161 char *source;
162 char *modcaps;
163 double timeout;
164 int cache;
165 int timeout_set;
166 int readdirplus_set;
167 int readdirplus_clear;
168 int allow_direct_io;
169 struct lo_inode root;
170 GHashTable *inodes; /* protected by lo->mutex */
171 struct lo_map ino_map; /* protected by lo->mutex */
172 struct lo_map dirp_map; /* protected by lo->mutex */
173 struct lo_map fd_map; /* protected by lo->mutex */
174 XattrMapEntry *xattr_map_list;
175 size_t xattr_map_nentries;
176
177 /* An O_PATH file descriptor to /proc/self/fd/ */
178 int proc_self_fd;
179 };
180
181 static const struct fuse_opt lo_opts[] = {
182 { "sandbox=namespace",
183 offsetof(struct lo_data, sandbox),
184 SANDBOX_NAMESPACE },
185 { "sandbox=chroot",
186 offsetof(struct lo_data, sandbox),
187 SANDBOX_CHROOT },
188 { "writeback", offsetof(struct lo_data, writeback), 1 },
189 { "no_writeback", offsetof(struct lo_data, writeback), 0 },
190 { "source=%s", offsetof(struct lo_data, source), 0 },
191 { "flock", offsetof(struct lo_data, flock), 1 },
192 { "no_flock", offsetof(struct lo_data, flock), 0 },
193 { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
194 { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
195 { "xattr", offsetof(struct lo_data, xattr), 1 },
196 { "no_xattr", offsetof(struct lo_data, xattr), 0 },
197 { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
198 { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
199 { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
200 { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
201 { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
202 { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
203 { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
204 { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
205 { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
206 { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
207 { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
208 FUSE_OPT_END
209 };
static bool use_syslog = false;
static int current_log_level;

/* Forward declarations of helpers defined further down */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n);
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);

/*
 * Saved libcap-ng state shared by all threads; load_capng() accesses
 * "saved" with "mutex" held.
 */
static struct {
    pthread_mutex_t mutex;
    void *saved;
} cap;

/* True once the current thread has loaded cap-ng state from cap.saved */
static __thread bool cap_loaded = false;
223
/* Return 1 if `name` is exactly "." or "..", 0 otherwise. */
static int is_dot_or_dotdot(const char *name)
{
    if (name[0] != '.') {
        return 0;
    }
    if (name[1] == '\0') {
        return 1; /* "." */
    }
    return name[1] == '.' && name[2] == '\0'; /* ".." */
}
229
/*
 * Is `path` a single path component that is not "." or ".."?
 * Returns 1 if so, 0 if it contains a '/' or is one of the two dot names.
 */
static int is_safe_path_component(const char *path)
{
    if (strchr(path, '/') != NULL) {
        return 0;
    }

    /* Reject "." and ".." */
    if (path[0] == '.' &&
        (path[1] == '\0' || (path[1] == '.' && path[2] == '\0'))) {
        return 0;
    }

    return 1;
}
239
240 static struct lo_data *lo_data(fuse_req_t req)
241 {
242 return (struct lo_data *)fuse_req_userdata(req);
243 }
244
245 /*
246 * Load capng's state from our saved state if the current thread
247 * hadn't previously been loaded.
248 * returns 0 on success
249 */
250 static int load_capng(void)
251 {
252 if (!cap_loaded) {
253 pthread_mutex_lock(&cap.mutex);
254 capng_restore_state(&cap.saved);
255 /*
256 * restore_state free's the saved copy
257 * so make another.
258 */
259 cap.saved = capng_save_state();
260 if (!cap.saved) {
261 pthread_mutex_unlock(&cap.mutex);
262 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
263 return -EINVAL;
264 }
265 pthread_mutex_unlock(&cap.mutex);
266
267 /*
268 * We want to use the loaded state for our pid,
269 * not the original
270 */
271 capng_setpid(syscall(SYS_gettid));
272 cap_loaded = true;
273 }
274 return 0;
275 }
276
277 /*
278 * Helpers for dropping and regaining effective capabilities. Returns 0
279 * on success, error otherwise
280 */
281 static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
282 {
283 int cap, ret;
284
285 cap = capng_name_to_capability(cap_name);
286 if (cap < 0) {
287 ret = errno;
288 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
289 cap_name, strerror(errno));
290 goto out;
291 }
292
293 if (load_capng()) {
294 ret = errno;
295 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
296 goto out;
297 }
298
299 /* We dont have this capability in effective set already. */
300 if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) {
301 ret = 0;
302 goto out;
303 }
304
305 if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) {
306 ret = errno;
307 fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
308 goto out;
309 }
310
311 if (capng_apply(CAPNG_SELECT_CAPS)) {
312 ret = errno;
313 fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
314 goto out;
315 }
316
317 ret = 0;
318 if (cap_dropped) {
319 *cap_dropped = true;
320 }
321
322 out:
323 return ret;
324 }
325
326 static int gain_effective_cap(const char *cap_name)
327 {
328 int cap;
329 int ret = 0;
330
331 cap = capng_name_to_capability(cap_name);
332 if (cap < 0) {
333 ret = errno;
334 fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
335 cap_name, strerror(errno));
336 goto out;
337 }
338
339 if (load_capng()) {
340 ret = errno;
341 fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
342 goto out;
343 }
344
345 if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) {
346 ret = errno;
347 fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
348 goto out;
349 }
350
351 if (capng_apply(CAPNG_SELECT_CAPS)) {
352 ret = errno;
353 fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
354 goto out;
355 }
356 ret = 0;
357
358 out:
359 return ret;
360 }
361
362 static void lo_map_init(struct lo_map *map)
363 {
364 map->elems = NULL;
365 map->nelems = 0;
366 map->freelist = -1;
367 }
368
369 static void lo_map_destroy(struct lo_map *map)
370 {
371 free(map->elems);
372 }
373
374 static int lo_map_grow(struct lo_map *map, size_t new_nelems)
375 {
376 struct lo_map_elem *new_elems;
377 size_t i;
378
379 if (new_nelems <= map->nelems) {
380 return 1;
381 }
382
383 new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
384 if (!new_elems) {
385 return 0;
386 }
387
388 for (i = map->nelems; i < new_nelems; i++) {
389 new_elems[i].freelist = i + 1;
390 new_elems[i].in_use = false;
391 }
392 new_elems[new_nelems - 1].freelist = -1;
393
394 map->elems = new_elems;
395 map->freelist = map->nelems;
396 map->nelems = new_nelems;
397 return 1;
398 }
399
400 static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
401 {
402 struct lo_map_elem *elem;
403
404 if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) {
405 return NULL;
406 }
407
408 elem = &map->elems[map->freelist];
409 map->freelist = elem->freelist;
410
411 elem->in_use = true;
412
413 return elem;
414 }
415
416 static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
417 {
418 ssize_t *prev;
419
420 if (!lo_map_grow(map, key + 1)) {
421 return NULL;
422 }
423
424 for (prev = &map->freelist; *prev != -1;
425 prev = &map->elems[*prev].freelist) {
426 if (*prev == key) {
427 struct lo_map_elem *elem = &map->elems[key];
428
429 *prev = elem->freelist;
430 elem->in_use = true;
431 return elem;
432 }
433 }
434 return NULL;
435 }
436
437 static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
438 {
439 if (key >= map->nelems) {
440 return NULL;
441 }
442 if (!map->elems[key].in_use) {
443 return NULL;
444 }
445 return &map->elems[key];
446 }
447
448 static void lo_map_remove(struct lo_map *map, size_t key)
449 {
450 struct lo_map_elem *elem;
451
452 if (key >= map->nelems) {
453 return;
454 }
455
456 elem = &map->elems[key];
457 if (!elem->in_use) {
458 return;
459 }
460
461 elem->in_use = false;
462
463 elem->freelist = map->freelist;
464 map->freelist = key;
465 }
466
467 /* Assumes lo->mutex is held */
468 static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
469 {
470 struct lo_map_elem *elem;
471
472 elem = lo_map_alloc_elem(&lo_data(req)->fd_map);
473 if (!elem) {
474 return -1;
475 }
476
477 elem->fd = fd;
478 return elem - lo_data(req)->fd_map.elems;
479 }
480
481 /* Assumes lo->mutex is held */
482 static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
483 {
484 struct lo_map_elem *elem;
485
486 elem = lo_map_alloc_elem(&lo_data(req)->dirp_map);
487 if (!elem) {
488 return -1;
489 }
490
491 elem->dirp = dirp;
492 return elem - lo_data(req)->dirp_map.elems;
493 }
494
495 /* Assumes lo->mutex is held */
496 static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
497 {
498 struct lo_map_elem *elem;
499
500 elem = lo_map_alloc_elem(&lo_data(req)->ino_map);
501 if (!elem) {
502 return -1;
503 }
504
505 elem->inode = inode;
506 return elem - lo_data(req)->ino_map.elems;
507 }
508
509 static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
510 {
511 struct lo_inode *inode = *inodep;
512
513 if (!inode) {
514 return;
515 }
516
517 *inodep = NULL;
518
519 if (g_atomic_int_dec_and_test(&inode->refcount)) {
520 close(inode->fd);
521 free(inode);
522 }
523 }
524
525 /* Caller must release refcount using lo_inode_put() */
526 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
527 {
528 struct lo_data *lo = lo_data(req);
529 struct lo_map_elem *elem;
530
531 pthread_mutex_lock(&lo->mutex);
532 elem = lo_map_get(&lo->ino_map, ino);
533 if (elem) {
534 g_atomic_int_inc(&elem->inode->refcount);
535 }
536 pthread_mutex_unlock(&lo->mutex);
537
538 if (!elem) {
539 return NULL;
540 }
541
542 return elem->inode;
543 }
544
545 /*
546 * TODO Remove this helper and force callers to hold an inode refcount until
547 * they are done with the fd. This will be done in a later patch to make
548 * review easier.
549 */
550 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
551 {
552 struct lo_inode *inode = lo_inode(req, ino);
553 int fd;
554
555 if (!inode) {
556 return -1;
557 }
558
559 fd = inode->fd;
560 lo_inode_put(lo_data(req), &inode);
561 return fd;
562 }
563
564 static void lo_init(void *userdata, struct fuse_conn_info *conn)
565 {
566 struct lo_data *lo = (struct lo_data *)userdata;
567
568 if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
569 conn->want |= FUSE_CAP_EXPORT_SUPPORT;
570 }
571
572 if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
573 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
574 conn->want |= FUSE_CAP_WRITEBACK_CACHE;
575 }
576 if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
577 if (lo->flock) {
578 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
579 conn->want |= FUSE_CAP_FLOCK_LOCKS;
580 } else {
581 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
582 conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
583 }
584 }
585
586 if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
587 if (lo->posix_lock) {
588 fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
589 conn->want |= FUSE_CAP_POSIX_LOCKS;
590 } else {
591 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
592 conn->want &= ~FUSE_CAP_POSIX_LOCKS;
593 }
594 }
595
596 if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
597 lo->readdirplus_clear) {
598 fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
599 conn->want &= ~FUSE_CAP_READDIRPLUS;
600 }
601 }
602
603 static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
604 struct fuse_file_info *fi)
605 {
606 int res;
607 struct stat buf;
608 struct lo_data *lo = lo_data(req);
609
610 (void)fi;
611
612 res =
613 fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
614 if (res == -1) {
615 return (void)fuse_reply_err(req, errno);
616 }
617
618 fuse_reply_attr(req, &buf, lo->timeout);
619 }
620
621 static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
622 {
623 struct lo_data *lo = lo_data(req);
624 struct lo_map_elem *elem;
625
626 pthread_mutex_lock(&lo->mutex);
627 elem = lo_map_get(&lo->fd_map, fi->fh);
628 pthread_mutex_unlock(&lo->mutex);
629
630 if (!elem) {
631 return -1;
632 }
633
634 return elem->fd;
635 }
636
637 static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
638 int valid, struct fuse_file_info *fi)
639 {
640 int saverr;
641 char procname[64];
642 struct lo_data *lo = lo_data(req);
643 struct lo_inode *inode;
644 int ifd;
645 int res;
646 int fd = -1;
647
648 inode = lo_inode(req, ino);
649 if (!inode) {
650 fuse_reply_err(req, EBADF);
651 return;
652 }
653
654 ifd = inode->fd;
655
656 /* If fi->fh is invalid we'll report EBADF later */
657 if (fi) {
658 fd = lo_fi_fd(req, fi);
659 }
660
661 if (valid & FUSE_SET_ATTR_MODE) {
662 if (fi) {
663 res = fchmod(fd, attr->st_mode);
664 } else {
665 sprintf(procname, "%i", ifd);
666 res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
667 }
668 if (res == -1) {
669 goto out_err;
670 }
671 }
672 if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
673 uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
674 gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
675
676 res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
677 if (res == -1) {
678 goto out_err;
679 }
680 }
681 if (valid & FUSE_SET_ATTR_SIZE) {
682 int truncfd;
683
684 if (fi) {
685 truncfd = fd;
686 } else {
687 sprintf(procname, "%i", ifd);
688 truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
689 if (truncfd < 0) {
690 goto out_err;
691 }
692 }
693
694 res = ftruncate(truncfd, attr->st_size);
695 if (!fi) {
696 saverr = errno;
697 close(truncfd);
698 errno = saverr;
699 }
700 if (res == -1) {
701 goto out_err;
702 }
703 }
704 if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
705 struct timespec tv[2];
706
707 tv[0].tv_sec = 0;
708 tv[1].tv_sec = 0;
709 tv[0].tv_nsec = UTIME_OMIT;
710 tv[1].tv_nsec = UTIME_OMIT;
711
712 if (valid & FUSE_SET_ATTR_ATIME_NOW) {
713 tv[0].tv_nsec = UTIME_NOW;
714 } else if (valid & FUSE_SET_ATTR_ATIME) {
715 tv[0] = attr->st_atim;
716 }
717
718 if (valid & FUSE_SET_ATTR_MTIME_NOW) {
719 tv[1].tv_nsec = UTIME_NOW;
720 } else if (valid & FUSE_SET_ATTR_MTIME) {
721 tv[1] = attr->st_mtim;
722 }
723
724 if (fi) {
725 res = futimens(fd, tv);
726 } else {
727 sprintf(procname, "%i", inode->fd);
728 res = utimensat(lo->proc_self_fd, procname, tv, 0);
729 }
730 if (res == -1) {
731 goto out_err;
732 }
733 }
734 lo_inode_put(lo, &inode);
735
736 return lo_getattr(req, ino, fi);
737
738 out_err:
739 saverr = errno;
740 lo_inode_put(lo, &inode);
741 fuse_reply_err(req, saverr);
742 }
743
744 static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
745 {
746 struct lo_inode *p;
747 struct lo_key key = {
748 .ino = st->st_ino,
749 .dev = st->st_dev,
750 };
751
752 pthread_mutex_lock(&lo->mutex);
753 p = g_hash_table_lookup(lo->inodes, &key);
754 if (p) {
755 assert(p->nlookup > 0);
756 p->nlookup++;
757 g_atomic_int_inc(&p->refcount);
758 }
759 pthread_mutex_unlock(&lo->mutex);
760
761 return p;
762 }
763
764 /* value_destroy_func for posix_locks GHashTable */
765 static void posix_locks_value_destroy(gpointer data)
766 {
767 struct lo_inode_plock *plock = data;
768
769 /*
770 * We had used open() for locks and had only one fd. So
771 * closing this fd should release all OFD locks.
772 */
773 close(plock->fd);
774 free(plock);
775 }
776
777 /*
778 * Increments nlookup and caller must release refcount using
779 * lo_inode_put(&parent).
780 */
781 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
782 struct fuse_entry_param *e)
783 {
784 int newfd;
785 int res;
786 int saverr;
787 struct lo_data *lo = lo_data(req);
788 struct lo_inode *inode = NULL;
789 struct lo_inode *dir = lo_inode(req, parent);
790
791 /*
792 * name_to_handle_at() and open_by_handle_at() can reach here with fuse
793 * mount point in guest, but we don't have its inode info in the
794 * ino_map.
795 */
796 if (!dir) {
797 return ENOENT;
798 }
799
800 memset(e, 0, sizeof(*e));
801 e->attr_timeout = lo->timeout;
802 e->entry_timeout = lo->timeout;
803
804 /* Do not allow escaping root directory */
805 if (dir == &lo->root && strcmp(name, "..") == 0) {
806 name = ".";
807 }
808
809 newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
810 if (newfd == -1) {
811 goto out_err;
812 }
813
814 res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
815 if (res == -1) {
816 goto out_err;
817 }
818
819 inode = lo_find(lo, &e->attr);
820 if (inode) {
821 close(newfd);
822 } else {
823 inode = calloc(1, sizeof(struct lo_inode));
824 if (!inode) {
825 goto out_err;
826 }
827
828 /* cache only filetype */
829 inode->filetype = (e->attr.st_mode & S_IFMT);
830
831 /*
832 * One for the caller and one for nlookup (released in
833 * unref_inode_lolocked())
834 */
835 g_atomic_int_set(&inode->refcount, 2);
836
837 inode->nlookup = 1;
838 inode->fd = newfd;
839 inode->key.ino = e->attr.st_ino;
840 inode->key.dev = e->attr.st_dev;
841 pthread_mutex_init(&inode->plock_mutex, NULL);
842 inode->posix_locks = g_hash_table_new_full(
843 g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
844
845 pthread_mutex_lock(&lo->mutex);
846 inode->fuse_ino = lo_add_inode_mapping(req, inode);
847 g_hash_table_insert(lo->inodes, &inode->key, inode);
848 pthread_mutex_unlock(&lo->mutex);
849 }
850 e->ino = inode->fuse_ino;
851 lo_inode_put(lo, &inode);
852 lo_inode_put(lo, &dir);
853
854 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
855 name, (unsigned long long)e->ino);
856
857 return 0;
858
859 out_err:
860 saverr = errno;
861 if (newfd != -1) {
862 close(newfd);
863 }
864 lo_inode_put(lo, &inode);
865 lo_inode_put(lo, &dir);
866 return saverr;
867 }
868
869 static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
870 {
871 struct fuse_entry_param e;
872 int err;
873
874 fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
875 name);
876
877 /*
878 * Don't use is_safe_path_component(), allow "." and ".." for NFS export
879 * support.
880 */
881 if (strchr(name, '/')) {
882 fuse_reply_err(req, EINVAL);
883 return;
884 }
885
886 err = lo_do_lookup(req, parent, name, &e);
887 if (err) {
888 fuse_reply_err(req, err);
889 } else {
890 fuse_reply_entry(req, &e);
891 }
892 }
893
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid/OURSYS_setresuid name the *32 syscall when it exists
 * and the plain one otherwise.
 */
#if defined(SYS_setresgid32)
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif

#if defined(SYS_setresuid32)
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
910
911 /*
912 * Change to uid/gid of caller so that file is created with
913 * ownership of caller.
914 * TODO: What about selinux context?
915 */
916 static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
917 {
918 int res;
919
920 old->euid = geteuid();
921 old->egid = getegid();
922
923 res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1);
924 if (res == -1) {
925 return errno;
926 }
927
928 res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1);
929 if (res == -1) {
930 int errno_save = errno;
931
932 syscall(OURSYS_setresgid, -1, old->egid, -1);
933 return errno_save;
934 }
935
936 return 0;
937 }
938
939 /* Regain Privileges */
940 static void lo_restore_cred(struct lo_cred *old)
941 {
942 int res;
943
944 res = syscall(OURSYS_setresuid, -1, old->euid, -1);
945 if (res == -1) {
946 fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
947 exit(1);
948 }
949
950 res = syscall(OURSYS_setresgid, -1, old->egid, -1);
951 if (res == -1) {
952 fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
953 exit(1);
954 }
955 }
956
957 static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
958 const char *name, mode_t mode, dev_t rdev,
959 const char *link)
960 {
961 int res;
962 int saverr;
963 struct lo_data *lo = lo_data(req);
964 struct lo_inode *dir;
965 struct fuse_entry_param e;
966 struct lo_cred old = {};
967
968 if (!is_safe_path_component(name)) {
969 fuse_reply_err(req, EINVAL);
970 return;
971 }
972
973 dir = lo_inode(req, parent);
974 if (!dir) {
975 fuse_reply_err(req, EBADF);
976 return;
977 }
978
979 saverr = lo_change_cred(req, &old);
980 if (saverr) {
981 goto out;
982 }
983
984 res = mknod_wrapper(dir->fd, name, link, mode, rdev);
985
986 saverr = errno;
987
988 lo_restore_cred(&old);
989
990 if (res == -1) {
991 goto out;
992 }
993
994 saverr = lo_do_lookup(req, parent, name, &e);
995 if (saverr) {
996 goto out;
997 }
998
999 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1000 name, (unsigned long long)e.ino);
1001
1002 fuse_reply_entry(req, &e);
1003 lo_inode_put(lo, &dir);
1004 return;
1005
1006 out:
1007 lo_inode_put(lo, &dir);
1008 fuse_reply_err(req, saverr);
1009 }
1010
1011 static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
1012 mode_t mode, dev_t rdev)
1013 {
1014 lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
1015 }
1016
1017 static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
1018 mode_t mode)
1019 {
1020 lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
1021 }
1022
1023 static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
1024 const char *name)
1025 {
1026 lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
1027 }
1028
1029 static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
1030 const char *name)
1031 {
1032 int res;
1033 struct lo_data *lo = lo_data(req);
1034 struct lo_inode *parent_inode;
1035 struct lo_inode *inode;
1036 struct fuse_entry_param e;
1037 char procname[64];
1038 int saverr;
1039
1040 if (!is_safe_path_component(name)) {
1041 fuse_reply_err(req, EINVAL);
1042 return;
1043 }
1044
1045 parent_inode = lo_inode(req, parent);
1046 inode = lo_inode(req, ino);
1047 if (!parent_inode || !inode) {
1048 errno = EBADF;
1049 goto out_err;
1050 }
1051
1052 memset(&e, 0, sizeof(struct fuse_entry_param));
1053 e.attr_timeout = lo->timeout;
1054 e.entry_timeout = lo->timeout;
1055
1056 sprintf(procname, "%i", inode->fd);
1057 res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
1058 AT_SYMLINK_FOLLOW);
1059 if (res == -1) {
1060 goto out_err;
1061 }
1062
1063 res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1064 if (res == -1) {
1065 goto out_err;
1066 }
1067
1068 pthread_mutex_lock(&lo->mutex);
1069 inode->nlookup++;
1070 pthread_mutex_unlock(&lo->mutex);
1071 e.ino = inode->fuse_ino;
1072
1073 fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
1074 name, (unsigned long long)e.ino);
1075
1076 fuse_reply_entry(req, &e);
1077 lo_inode_put(lo, &parent_inode);
1078 lo_inode_put(lo, &inode);
1079 return;
1080
1081 out_err:
1082 saverr = errno;
1083 lo_inode_put(lo, &parent_inode);
1084 lo_inode_put(lo, &inode);
1085 fuse_reply_err(req, saverr);
1086 }
1087
1088 /* Increments nlookup and caller must release refcount using lo_inode_put() */
1089 static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
1090 const char *name)
1091 {
1092 int res;
1093 struct stat attr;
1094
1095 res = fstatat(lo_fd(req, parent), name, &attr,
1096 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1097 if (res == -1) {
1098 return NULL;
1099 }
1100
1101 return lo_find(lo_data(req), &attr);
1102 }
1103
1104 static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
1105 {
1106 int res;
1107 struct lo_inode *inode;
1108 struct lo_data *lo = lo_data(req);
1109
1110 if (!is_safe_path_component(name)) {
1111 fuse_reply_err(req, EINVAL);
1112 return;
1113 }
1114
1115 inode = lookup_name(req, parent, name);
1116 if (!inode) {
1117 fuse_reply_err(req, EIO);
1118 return;
1119 }
1120
1121 res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
1122
1123 fuse_reply_err(req, res == -1 ? errno : 0);
1124 unref_inode_lolocked(lo, inode, 1);
1125 lo_inode_put(lo, &inode);
1126 }
1127
1128 static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
1129 fuse_ino_t newparent, const char *newname,
1130 unsigned int flags)
1131 {
1132 int res;
1133 struct lo_inode *parent_inode;
1134 struct lo_inode *newparent_inode;
1135 struct lo_inode *oldinode = NULL;
1136 struct lo_inode *newinode = NULL;
1137 struct lo_data *lo = lo_data(req);
1138
1139 if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
1140 fuse_reply_err(req, EINVAL);
1141 return;
1142 }
1143
1144 parent_inode = lo_inode(req, parent);
1145 newparent_inode = lo_inode(req, newparent);
1146 if (!parent_inode || !newparent_inode) {
1147 fuse_reply_err(req, EBADF);
1148 goto out;
1149 }
1150
1151 oldinode = lookup_name(req, parent, name);
1152 newinode = lookup_name(req, newparent, newname);
1153
1154 if (!oldinode) {
1155 fuse_reply_err(req, EIO);
1156 goto out;
1157 }
1158
1159 if (flags) {
1160 #ifndef SYS_renameat2
1161 fuse_reply_err(req, EINVAL);
1162 #else
1163 res = syscall(SYS_renameat2, parent_inode->fd, name,
1164 newparent_inode->fd, newname, flags);
1165 if (res == -1 && errno == ENOSYS) {
1166 fuse_reply_err(req, EINVAL);
1167 } else {
1168 fuse_reply_err(req, res == -1 ? errno : 0);
1169 }
1170 #endif
1171 goto out;
1172 }
1173
1174 res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
1175
1176 fuse_reply_err(req, res == -1 ? errno : 0);
1177 out:
1178 unref_inode_lolocked(lo, oldinode, 1);
1179 unref_inode_lolocked(lo, newinode, 1);
1180 lo_inode_put(lo, &oldinode);
1181 lo_inode_put(lo, &newinode);
1182 lo_inode_put(lo, &parent_inode);
1183 lo_inode_put(lo, &newparent_inode);
1184 }
1185
1186 static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
1187 {
1188 int res;
1189 struct lo_inode *inode;
1190 struct lo_data *lo = lo_data(req);
1191
1192 if (!is_safe_path_component(name)) {
1193 fuse_reply_err(req, EINVAL);
1194 return;
1195 }
1196
1197 inode = lookup_name(req, parent, name);
1198 if (!inode) {
1199 fuse_reply_err(req, EIO);
1200 return;
1201 }
1202
1203 res = unlinkat(lo_fd(req, parent), name, 0);
1204
1205 fuse_reply_err(req, res == -1 ? errno : 0);
1206 unref_inode_lolocked(lo, inode, 1);
1207 lo_inode_put(lo, &inode);
1208 }
1209
1210 /* To be called with lo->mutex held */
1211 static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
1212 {
1213 if (!inode) {
1214 return;
1215 }
1216
1217 assert(inode->nlookup >= n);
1218 inode->nlookup -= n;
1219 if (!inode->nlookup) {
1220 lo_map_remove(&lo->ino_map, inode->fuse_ino);
1221 g_hash_table_remove(lo->inodes, &inode->key);
1222 if (g_hash_table_size(inode->posix_locks)) {
1223 fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
1224 }
1225 g_hash_table_destroy(inode->posix_locks);
1226 pthread_mutex_destroy(&inode->plock_mutex);
1227
1228 /* Drop our refcount from lo_do_lookup() */
1229 lo_inode_put(lo, &inode);
1230 }
1231 }
1232
1233 static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
1234 uint64_t n)
1235 {
1236 if (!inode) {
1237 return;
1238 }
1239
1240 pthread_mutex_lock(&lo->mutex);
1241 unref_inode(lo, inode, n);
1242 pthread_mutex_unlock(&lo->mutex);
1243 }
1244
1245 static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1246 {
1247 struct lo_data *lo = lo_data(req);
1248 struct lo_inode *inode;
1249
1250 inode = lo_inode(req, ino);
1251 if (!inode) {
1252 return;
1253 }
1254
1255 fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
1256 (unsigned long long)ino, (unsigned long long)inode->nlookup,
1257 (unsigned long long)nlookup);
1258
1259 unref_inode_lolocked(lo, inode, nlookup);
1260 lo_inode_put(lo, &inode);
1261 }
1262
1263 static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
1264 {
1265 lo_forget_one(req, ino, nlookup);
1266 fuse_reply_none(req);
1267 }
1268
1269 static void lo_forget_multi(fuse_req_t req, size_t count,
1270 struct fuse_forget_data *forgets)
1271 {
1272 int i;
1273
1274 for (i = 0; i < count; i++) {
1275 lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
1276 }
1277 fuse_reply_none(req);
1278 }
1279
1280 static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
1281 {
1282 char buf[PATH_MAX + 1];
1283 int res;
1284
1285 res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf));
1286 if (res == -1) {
1287 return (void)fuse_reply_err(req, errno);
1288 }
1289
1290 if (res == sizeof(buf)) {
1291 return (void)fuse_reply_err(req, ENAMETOOLONG);
1292 }
1293
1294 buf[res] = '\0';
1295
1296 fuse_reply_readlink(req, buf);
1297 }
1298
1299 struct lo_dirp {
1300 gint refcount;
1301 DIR *dp;
1302 struct dirent *entry;
1303 off_t offset;
1304 };
1305
1306 static void lo_dirp_put(struct lo_dirp **dp)
1307 {
1308 struct lo_dirp *d = *dp;
1309
1310 if (!d) {
1311 return;
1312 }
1313 *dp = NULL;
1314
1315 if (g_atomic_int_dec_and_test(&d->refcount)) {
1316 closedir(d->dp);
1317 free(d);
1318 }
1319 }
1320
1321 /* Call lo_dirp_put() on the return value when no longer needed */
1322 static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
1323 {
1324 struct lo_data *lo = lo_data(req);
1325 struct lo_map_elem *elem;
1326
1327 pthread_mutex_lock(&lo->mutex);
1328 elem = lo_map_get(&lo->dirp_map, fi->fh);
1329 if (elem) {
1330 g_atomic_int_inc(&elem->dirp->refcount);
1331 }
1332 pthread_mutex_unlock(&lo->mutex);
1333 if (!elem) {
1334 return NULL;
1335 }
1336
1337 return elem->dirp;
1338 }
1339
1340 static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
1341 struct fuse_file_info *fi)
1342 {
1343 int error = ENOMEM;
1344 struct lo_data *lo = lo_data(req);
1345 struct lo_dirp *d;
1346 int fd;
1347 ssize_t fh;
1348
1349 d = calloc(1, sizeof(struct lo_dirp));
1350 if (d == NULL) {
1351 goto out_err;
1352 }
1353
1354 fd = openat(lo_fd(req, ino), ".", O_RDONLY);
1355 if (fd == -1) {
1356 goto out_errno;
1357 }
1358
1359 d->dp = fdopendir(fd);
1360 if (d->dp == NULL) {
1361 goto out_errno;
1362 }
1363
1364 d->offset = 0;
1365 d->entry = NULL;
1366
1367 g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */
1368 pthread_mutex_lock(&lo->mutex);
1369 fh = lo_add_dirp_mapping(req, d);
1370 pthread_mutex_unlock(&lo->mutex);
1371 if (fh == -1) {
1372 goto out_err;
1373 }
1374
1375 fi->fh = fh;
1376 if (lo->cache == CACHE_ALWAYS) {
1377 fi->cache_readdir = 1;
1378 }
1379 fuse_reply_open(req, fi);
1380 return;
1381
1382 out_errno:
1383 error = errno;
1384 out_err:
1385 if (d) {
1386 if (d->dp) {
1387 closedir(d->dp);
1388 } else if (fd != -1) {
1389 close(fd);
1390 }
1391 free(d);
1392 }
1393 fuse_reply_err(req, error);
1394 }
1395
1396 static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1397 off_t offset, struct fuse_file_info *fi, int plus)
1398 {
1399 struct lo_data *lo = lo_data(req);
1400 struct lo_dirp *d = NULL;
1401 struct lo_inode *dinode;
1402 char *buf = NULL;
1403 char *p;
1404 size_t rem = size;
1405 int err = EBADF;
1406
1407 dinode = lo_inode(req, ino);
1408 if (!dinode) {
1409 goto error;
1410 }
1411
1412 d = lo_dirp(req, fi);
1413 if (!d) {
1414 goto error;
1415 }
1416
1417 err = ENOMEM;
1418 buf = calloc(1, size);
1419 if (!buf) {
1420 goto error;
1421 }
1422 p = buf;
1423
1424 if (offset != d->offset) {
1425 seekdir(d->dp, offset);
1426 d->entry = NULL;
1427 d->offset = offset;
1428 }
1429 while (1) {
1430 size_t entsize;
1431 off_t nextoff;
1432 const char *name;
1433
1434 if (!d->entry) {
1435 errno = 0;
1436 d->entry = readdir(d->dp);
1437 if (!d->entry) {
1438 if (errno) { /* Error */
1439 err = errno;
1440 goto error;
1441 } else { /* End of stream */
1442 break;
1443 }
1444 }
1445 }
1446 nextoff = d->entry->d_off;
1447 name = d->entry->d_name;
1448
1449 fuse_ino_t entry_ino = 0;
1450 struct fuse_entry_param e = (struct fuse_entry_param){
1451 .attr.st_ino = d->entry->d_ino,
1452 .attr.st_mode = d->entry->d_type << 12,
1453 };
1454
1455 /* Hide root's parent directory */
1456 if (dinode == &lo->root && strcmp(name, "..") == 0) {
1457 e.attr.st_ino = lo->root.key.ino;
1458 e.attr.st_mode = DT_DIR << 12;
1459 }
1460
1461 if (plus) {
1462 if (!is_dot_or_dotdot(name)) {
1463 err = lo_do_lookup(req, ino, name, &e);
1464 if (err) {
1465 goto error;
1466 }
1467 entry_ino = e.ino;
1468 }
1469
1470 entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
1471 } else {
1472 entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
1473 }
1474 if (entsize > rem) {
1475 if (entry_ino != 0) {
1476 lo_forget_one(req, entry_ino, 1);
1477 }
1478 break;
1479 }
1480
1481 p += entsize;
1482 rem -= entsize;
1483
1484 d->entry = NULL;
1485 d->offset = nextoff;
1486 }
1487
1488 err = 0;
1489 error:
1490 lo_dirp_put(&d);
1491 lo_inode_put(lo, &dinode);
1492
1493 /*
1494 * If there's an error, we can only signal it if we haven't stored
1495 * any entries yet - otherwise we'd end up with wrong lookup
1496 * counts for the entries that are already in the buffer. So we
1497 * return what we've collected until that point.
1498 */
1499 if (err && rem == size) {
1500 fuse_reply_err(req, err);
1501 } else {
1502 fuse_reply_buf(req, buf, size - rem);
1503 }
1504 free(buf);
1505 }
1506
1507 static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
1508 off_t offset, struct fuse_file_info *fi)
1509 {
1510 lo_do_readdir(req, ino, size, offset, fi, 0);
1511 }
1512
1513 static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
1514 off_t offset, struct fuse_file_info *fi)
1515 {
1516 lo_do_readdir(req, ino, size, offset, fi, 1);
1517 }
1518
1519 static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
1520 struct fuse_file_info *fi)
1521 {
1522 struct lo_data *lo = lo_data(req);
1523 struct lo_map_elem *elem;
1524 struct lo_dirp *d;
1525
1526 (void)ino;
1527
1528 pthread_mutex_lock(&lo->mutex);
1529 elem = lo_map_get(&lo->dirp_map, fi->fh);
1530 if (!elem) {
1531 pthread_mutex_unlock(&lo->mutex);
1532 fuse_reply_err(req, EBADF);
1533 return;
1534 }
1535
1536 d = elem->dirp;
1537 lo_map_remove(&lo->dirp_map, fi->fh);
1538 pthread_mutex_unlock(&lo->mutex);
1539
1540 lo_dirp_put(&d); /* paired with lo_opendir() */
1541
1542 fuse_reply_err(req, 0);
1543 }
1544
1545 static void update_open_flags(int writeback, int allow_direct_io,
1546 struct fuse_file_info *fi)
1547 {
1548 /*
1549 * With writeback cache, kernel may send read requests even
1550 * when userspace opened write-only
1551 */
1552 if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
1553 fi->flags &= ~O_ACCMODE;
1554 fi->flags |= O_RDWR;
1555 }
1556
1557 /*
1558 * With writeback cache, O_APPEND is handled by the kernel.
1559 * This breaks atomicity (since the file may change in the
1560 * underlying filesystem, so that the kernel's idea of the
1561 * end of the file isn't accurate anymore). In this example,
1562 * we just accept that. A more rigorous filesystem may want
1563 * to return an error here
1564 */
1565 if (writeback && (fi->flags & O_APPEND)) {
1566 fi->flags &= ~O_APPEND;
1567 }
1568
1569 /*
1570 * O_DIRECT in guest should not necessarily mean bypassing page
1571 * cache on host as well. Therefore, we discard it by default
1572 * ('-o no_allow_direct_io'). If somebody needs that behavior,
1573 * the '-o allow_direct_io' option should be set.
1574 */
1575 if (!allow_direct_io) {
1576 fi->flags &= ~O_DIRECT;
1577 }
1578 }
1579
1580 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
1581 mode_t mode, struct fuse_file_info *fi)
1582 {
1583 int fd;
1584 struct lo_data *lo = lo_data(req);
1585 struct lo_inode *parent_inode;
1586 struct fuse_entry_param e;
1587 int err;
1588 struct lo_cred old = {};
1589
1590 fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
1591 name);
1592
1593 if (!is_safe_path_component(name)) {
1594 fuse_reply_err(req, EINVAL);
1595 return;
1596 }
1597
1598 parent_inode = lo_inode(req, parent);
1599 if (!parent_inode) {
1600 fuse_reply_err(req, EBADF);
1601 return;
1602 }
1603
1604 err = lo_change_cred(req, &old);
1605 if (err) {
1606 goto out;
1607 }
1608
1609 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1610
1611 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
1612 mode);
1613 err = fd == -1 ? errno : 0;
1614 lo_restore_cred(&old);
1615
1616 if (!err) {
1617 ssize_t fh;
1618
1619 pthread_mutex_lock(&lo->mutex);
1620 fh = lo_add_fd_mapping(req, fd);
1621 pthread_mutex_unlock(&lo->mutex);
1622 if (fh == -1) {
1623 close(fd);
1624 err = ENOMEM;
1625 goto out;
1626 }
1627
1628 fi->fh = fh;
1629 err = lo_do_lookup(req, parent, name, &e);
1630 }
1631 if (lo->cache == CACHE_NONE) {
1632 fi->direct_io = 1;
1633 } else if (lo->cache == CACHE_ALWAYS) {
1634 fi->keep_cache = 1;
1635 }
1636
1637 out:
1638 lo_inode_put(lo, &parent_inode);
1639
1640 if (err) {
1641 fuse_reply_err(req, err);
1642 } else {
1643 fuse_reply_create(req, &e, fi);
1644 }
1645 }
1646
1647 /* Should be called with inode->plock_mutex held */
1648 static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
1649 struct lo_inode *inode,
1650 uint64_t lock_owner,
1651 pid_t pid, int *err)
1652 {
1653 struct lo_inode_plock *plock;
1654 char procname[64];
1655 int fd;
1656
1657 plock =
1658 g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner));
1659
1660 if (plock) {
1661 return plock;
1662 }
1663
1664 plock = malloc(sizeof(struct lo_inode_plock));
1665 if (!plock) {
1666 *err = ENOMEM;
1667 return NULL;
1668 }
1669
1670 /* Open another instance of file which can be used for ofd locks. */
1671 sprintf(procname, "%i", inode->fd);
1672
1673 /* TODO: What if file is not writable? */
1674 fd = openat(lo->proc_self_fd, procname, O_RDWR);
1675 if (fd == -1) {
1676 *err = errno;
1677 free(plock);
1678 return NULL;
1679 }
1680
1681 plock->lock_owner = lock_owner;
1682 plock->fd = fd;
1683 g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner),
1684 plock);
1685 return plock;
1686 }
1687
1688 static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1689 struct flock *lock)
1690 {
1691 struct lo_data *lo = lo_data(req);
1692 struct lo_inode *inode;
1693 struct lo_inode_plock *plock;
1694 int ret, saverr = 0;
1695
1696 fuse_log(FUSE_LOG_DEBUG,
1697 "lo_getlk(ino=%" PRIu64 ", flags=%d)"
1698 " owner=0x%lx, l_type=%d l_start=0x%lx"
1699 " l_len=0x%lx\n",
1700 ino, fi->flags, fi->lock_owner, lock->l_type, lock->l_start,
1701 lock->l_len);
1702
1703 inode = lo_inode(req, ino);
1704 if (!inode) {
1705 fuse_reply_err(req, EBADF);
1706 return;
1707 }
1708
1709 pthread_mutex_lock(&inode->plock_mutex);
1710 plock =
1711 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1712 if (!plock) {
1713 saverr = ret;
1714 goto out;
1715 }
1716
1717 ret = fcntl(plock->fd, F_OFD_GETLK, lock);
1718 if (ret == -1) {
1719 saverr = errno;
1720 }
1721
1722 out:
1723 pthread_mutex_unlock(&inode->plock_mutex);
1724 lo_inode_put(lo, &inode);
1725
1726 if (saverr) {
1727 fuse_reply_err(req, saverr);
1728 } else {
1729 fuse_reply_lock(req, lock);
1730 }
1731 }
1732
1733 static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
1734 struct flock *lock, int sleep)
1735 {
1736 struct lo_data *lo = lo_data(req);
1737 struct lo_inode *inode;
1738 struct lo_inode_plock *plock;
1739 int ret, saverr = 0;
1740
1741 fuse_log(FUSE_LOG_DEBUG,
1742 "lo_setlk(ino=%" PRIu64 ", flags=%d)"
1743 " cmd=%d pid=%d owner=0x%lx sleep=%d l_whence=%d"
1744 " l_start=0x%lx l_len=0x%lx\n",
1745 ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
1746 lock->l_whence, lock->l_start, lock->l_len);
1747
1748 if (sleep) {
1749 fuse_reply_err(req, EOPNOTSUPP);
1750 return;
1751 }
1752
1753 inode = lo_inode(req, ino);
1754 if (!inode) {
1755 fuse_reply_err(req, EBADF);
1756 return;
1757 }
1758
1759 pthread_mutex_lock(&inode->plock_mutex);
1760 plock =
1761 lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
1762
1763 if (!plock) {
1764 saverr = ret;
1765 goto out;
1766 }
1767
1768 /* TODO: Is it alright to modify flock? */
1769 lock->l_pid = 0;
1770 ret = fcntl(plock->fd, F_OFD_SETLK, lock);
1771 if (ret == -1) {
1772 saverr = errno;
1773 }
1774
1775 out:
1776 pthread_mutex_unlock(&inode->plock_mutex);
1777 lo_inode_put(lo, &inode);
1778
1779 fuse_reply_err(req, saverr);
1780 }
1781
1782 static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
1783 struct fuse_file_info *fi)
1784 {
1785 int res;
1786 struct lo_dirp *d;
1787 int fd;
1788
1789 (void)ino;
1790
1791 d = lo_dirp(req, fi);
1792 if (!d) {
1793 fuse_reply_err(req, EBADF);
1794 return;
1795 }
1796
1797 fd = dirfd(d->dp);
1798 if (datasync) {
1799 res = fdatasync(fd);
1800 } else {
1801 res = fsync(fd);
1802 }
1803
1804 lo_dirp_put(&d);
1805
1806 fuse_reply_err(req, res == -1 ? errno : 0);
1807 }
1808
1809 static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1810 {
1811 int fd;
1812 ssize_t fh;
1813 char buf[64];
1814 struct lo_data *lo = lo_data(req);
1815
1816 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
1817 fi->flags);
1818
1819 update_open_flags(lo->writeback, lo->allow_direct_io, fi);
1820
1821 sprintf(buf, "%i", lo_fd(req, ino));
1822 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
1823 if (fd == -1) {
1824 return (void)fuse_reply_err(req, errno);
1825 }
1826
1827 pthread_mutex_lock(&lo->mutex);
1828 fh = lo_add_fd_mapping(req, fd);
1829 pthread_mutex_unlock(&lo->mutex);
1830 if (fh == -1) {
1831 close(fd);
1832 fuse_reply_err(req, ENOMEM);
1833 return;
1834 }
1835
1836 fi->fh = fh;
1837 if (lo->cache == CACHE_NONE) {
1838 fi->direct_io = 1;
1839 } else if (lo->cache == CACHE_ALWAYS) {
1840 fi->keep_cache = 1;
1841 }
1842 fuse_reply_open(req, fi);
1843 }
1844
1845 static void lo_release(fuse_req_t req, fuse_ino_t ino,
1846 struct fuse_file_info *fi)
1847 {
1848 struct lo_data *lo = lo_data(req);
1849 struct lo_map_elem *elem;
1850 int fd = -1;
1851
1852 (void)ino;
1853
1854 pthread_mutex_lock(&lo->mutex);
1855 elem = lo_map_get(&lo->fd_map, fi->fh);
1856 if (elem) {
1857 fd = elem->fd;
1858 elem = NULL;
1859 lo_map_remove(&lo->fd_map, fi->fh);
1860 }
1861 pthread_mutex_unlock(&lo->mutex);
1862
1863 close(fd);
1864 fuse_reply_err(req, 0);
1865 }
1866
1867 static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
1868 {
1869 int res;
1870 (void)ino;
1871 struct lo_inode *inode;
1872
1873 inode = lo_inode(req, ino);
1874 if (!inode) {
1875 fuse_reply_err(req, EBADF);
1876 return;
1877 }
1878
1879 /* An fd is going away. Cleanup associated posix locks */
1880 pthread_mutex_lock(&inode->plock_mutex);
1881 g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
1882 pthread_mutex_unlock(&inode->plock_mutex);
1883
1884 res = close(dup(lo_fi_fd(req, fi)));
1885 lo_inode_put(lo_data(req), &inode);
1886 fuse_reply_err(req, res == -1 ? errno : 0);
1887 }
1888
1889 static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
1890 struct fuse_file_info *fi)
1891 {
1892 int res;
1893 int fd;
1894 char *buf;
1895
1896 fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
1897 (void *)fi);
1898
1899 if (!fi) {
1900 struct lo_data *lo = lo_data(req);
1901
1902 res = asprintf(&buf, "%i", lo_fd(req, ino));
1903 if (res == -1) {
1904 return (void)fuse_reply_err(req, errno);
1905 }
1906
1907 fd = openat(lo->proc_self_fd, buf, O_RDWR);
1908 free(buf);
1909 if (fd == -1) {
1910 return (void)fuse_reply_err(req, errno);
1911 }
1912 } else {
1913 fd = lo_fi_fd(req, fi);
1914 }
1915
1916 if (datasync) {
1917 res = fdatasync(fd);
1918 } else {
1919 res = fsync(fd);
1920 }
1921 if (!fi) {
1922 close(fd);
1923 }
1924 fuse_reply_err(req, res == -1 ? errno : 0);
1925 }
1926
1927 static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
1928 struct fuse_file_info *fi)
1929 {
1930 struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
1931
1932 fuse_log(FUSE_LOG_DEBUG,
1933 "lo_read(ino=%" PRIu64 ", size=%zd, "
1934 "off=%lu)\n",
1935 ino, size, (unsigned long)offset);
1936
1937 buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
1938 buf.buf[0].fd = lo_fi_fd(req, fi);
1939 buf.buf[0].pos = offset;
1940
1941 fuse_reply_data(req, &buf);
1942 }
1943
1944 static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
1945 struct fuse_bufvec *in_buf, off_t off,
1946 struct fuse_file_info *fi)
1947 {
1948 (void)ino;
1949 ssize_t res;
1950 struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
1951 bool cap_fsetid_dropped = false;
1952
1953 out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
1954 out_buf.buf[0].fd = lo_fi_fd(req, fi);
1955 out_buf.buf[0].pos = off;
1956
1957 fuse_log(FUSE_LOG_DEBUG,
1958 "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
1959 out_buf.buf[0].size, (unsigned long)off);
1960
1961 /*
1962 * If kill_priv is set, drop CAP_FSETID which should lead to kernel
1963 * clearing setuid/setgid on file.
1964 */
1965 if (fi->kill_priv) {
1966 res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
1967 if (res != 0) {
1968 fuse_reply_err(req, res);
1969 return;
1970 }
1971 }
1972
1973 res = fuse_buf_copy(&out_buf, in_buf);
1974 if (res < 0) {
1975 fuse_reply_err(req, -res);
1976 } else {
1977 fuse_reply_write(req, (size_t)res);
1978 }
1979
1980 if (cap_fsetid_dropped) {
1981 res = gain_effective_cap("FSETID");
1982 if (res) {
1983 fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
1984 }
1985 }
1986 }
1987
1988 static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
1989 {
1990 int res;
1991 struct statvfs stbuf;
1992
1993 res = fstatvfs(lo_fd(req, ino), &stbuf);
1994 if (res == -1) {
1995 fuse_reply_err(req, errno);
1996 } else {
1997 fuse_reply_statfs(req, &stbuf);
1998 }
1999 }
2000
2001 static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
2002 off_t length, struct fuse_file_info *fi)
2003 {
2004 int err = EOPNOTSUPP;
2005 (void)ino;
2006
2007 #ifdef CONFIG_FALLOCATE
2008 err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
2009 if (err < 0) {
2010 err = errno;
2011 }
2012
2013 #elif defined(CONFIG_POSIX_FALLOCATE)
2014 if (mode) {
2015 fuse_reply_err(req, EOPNOTSUPP);
2016 return;
2017 }
2018
2019 err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
2020 #endif
2021
2022 fuse_reply_err(req, err);
2023 }
2024
2025 static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
2026 int op)
2027 {
2028 int res;
2029 (void)ino;
2030
2031 res = flock(lo_fi_fd(req, fi), op);
2032
2033 fuse_reply_err(req, res == -1 ? errno : 0);
2034 }
2035
/*
 * xattr map rule flags.
 *
 * Rule types (low bits):
 */
/* Matching attributes are passed through unmodified and rule processing
 * stops.  An empty key matches every attribute. */
#define XATTR_MAP_FLAG_OK (1 << 0)
/* Matching attributes are unwanted: EPERM on write, hidden on read. */
#define XATTR_MAP_FLAG_BAD (1 << 1)
/*
 * Attributes starting with 'key' get 'prepend' put in front of them.
 * 'key' may be empty to prepend for all attrs.
 * key is defined from set/remove point of view.
 * Automatically reversed on read.
 */
#define XATTR_MAP_FLAG_PREFIX (1 << 2)

/*
 * Rule scopes (bits 16 and up):
 */
/* Apply rule to get/set/remove */
#define XATTR_MAP_FLAG_CLIENT (1 << 16)
/* Apply rule to list */
#define XATTR_MAP_FLAG_SERVER (1 << 17)
/* Apply rule to all */
#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_CLIENT | XATTR_MAP_FLAG_SERVER)
2062
2063 static void add_xattrmap_entry(struct lo_data *lo,
2064 const XattrMapEntry *new_entry)
2065 {
2066 XattrMapEntry *res = g_realloc_n(lo->xattr_map_list,
2067 lo->xattr_map_nentries + 1,
2068 sizeof(XattrMapEntry));
2069 res[lo->xattr_map_nentries++] = *new_entry;
2070
2071 lo->xattr_map_list = res;
2072 }
2073
2074 static void free_xattrmap(struct lo_data *lo)
2075 {
2076 XattrMapEntry *map = lo->xattr_map_list;
2077 size_t i;
2078
2079 if (!map) {
2080 return;
2081 }
2082
2083 for (i = 0; i < lo->xattr_map_nentries; i++) {
2084 g_free(map[i].key);
2085 g_free(map[i].prepend);
2086 };
2087
2088 g_free(map);
2089 lo->xattr_map_list = NULL;
2090 lo->xattr_map_nentries = -1;
2091 }
2092
2093 static void parse_xattrmap(struct lo_data *lo)
2094 {
2095 const char *map = lo->xattrmap;
2096 const char *tmp;
2097
2098 lo->xattr_map_nentries = 0;
2099 while (*map) {
2100 XattrMapEntry tmp_entry;
2101 char sep;
2102
2103 if (isspace(*map)) {
2104 map++;
2105 continue;
2106 }
2107 /* The separator is the first non-space of the rule */
2108 sep = *map++;
2109 if (!sep) {
2110 break;
2111 }
2112
2113 tmp_entry.flags = 0;
2114 /* Start of 'type' */
2115 if (strstart(map, "prefix", &map)) {
2116 tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
2117 } else if (strstart(map, "ok", &map)) {
2118 tmp_entry.flags |= XATTR_MAP_FLAG_OK;
2119 } else if (strstart(map, "bad", &map)) {
2120 tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
2121 } else {
2122 fuse_log(FUSE_LOG_ERR,
2123 "%s: Unexpected type;"
2124 "Expecting 'prefix', 'ok', or 'bad' in rule %zu\n",
2125 __func__, lo->xattr_map_nentries);
2126 exit(1);
2127 }
2128
2129 if (*map++ != sep) {
2130 fuse_log(FUSE_LOG_ERR,
2131 "%s: Missing '%c' at end of type field of rule %zu\n",
2132 __func__, sep, lo->xattr_map_nentries);
2133 exit(1);
2134 }
2135
2136 /* Start of 'scope' */
2137 if (strstart(map, "client", &map)) {
2138 tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
2139 } else if (strstart(map, "server", &map)) {
2140 tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
2141 } else if (strstart(map, "all", &map)) {
2142 tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
2143 } else {
2144 fuse_log(FUSE_LOG_ERR,
2145 "%s: Unexpected scope;"
2146 " Expecting 'client', 'server', or 'all', in rule %zu\n",
2147 __func__, lo->xattr_map_nentries);
2148 exit(1);
2149 }
2150
2151 if (*map++ != sep) {
2152 fuse_log(FUSE_LOG_ERR,
2153 "%s: Expecting '%c' found '%c'"
2154 " after scope in rule %zu\n",
2155 __func__, sep, *map, lo->xattr_map_nentries);
2156 exit(1);
2157 }
2158
2159 /* At start of 'key' field */
2160 tmp = strchr(map, sep);
2161 if (!tmp) {
2162 fuse_log(FUSE_LOG_ERR,
2163 "%s: Missing '%c' at end of key field of rule %zu",
2164 __func__, sep, lo->xattr_map_nentries);
2165 exit(1);
2166 }
2167 tmp_entry.key = g_strndup(map, tmp - map);
2168 map = tmp + 1;
2169
2170 /* At start of 'prepend' field */
2171 tmp = strchr(map, sep);
2172 if (!tmp) {
2173 fuse_log(FUSE_LOG_ERR,
2174 "%s: Missing '%c' at end of prepend field of rule %zu",
2175 __func__, sep, lo->xattr_map_nentries);
2176 exit(1);
2177 }
2178 tmp_entry.prepend = g_strndup(map, tmp - map);
2179 map = tmp + 1;
2180
2181 add_xattrmap_entry(lo, &tmp_entry);
2182 /* End of rule - go around again for another rule */
2183 }
2184
2185 if (!lo->xattr_map_nentries) {
2186 fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
2187 exit(1);
2188 }
2189 }
2190
2191 /*
2192 * For use with getxattr/setxattr/removexattr, where the client
2193 * gives us a name and we may need to choose a different one.
2194 * Allocates a buffer for the result placing it in *out_name.
2195 * If there's no change then *out_name is not set.
2196 * Returns 0 on success
2197 * Can return -EPERM to indicate we block a given attribute
2198 * (in which case out_name is not allocated)
2199 * Can return -ENOMEM to indicate out_name couldn't be allocated.
2200 */
2201 static int xattr_map_client(const struct lo_data *lo, const char *client_name,
2202 char **out_name)
2203 {
2204 size_t i;
2205 for (i = 0; i < lo->xattr_map_nentries; i++) {
2206 const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
2207
2208 if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
2209 (strstart(client_name, cur_entry->key, NULL))) {
2210 if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
2211 return -EPERM;
2212 }
2213 if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
2214 /* Unmodified name */
2215 return 0;
2216 }
2217 if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
2218 *out_name = g_try_malloc(strlen(client_name) +
2219 strlen(cur_entry->prepend) + 1);
2220 if (!*out_name) {
2221 return -ENOMEM;
2222 }
2223 sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
2224 return 0;
2225 }
2226 }
2227 }
2228
2229 return -EPERM;
2230 }
2231
2232 static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2233 size_t size)
2234 {
2235 struct lo_data *lo = lo_data(req);
2236 char *value = NULL;
2237 char procname[64];
2238 const char *name;
2239 char *mapped_name;
2240 struct lo_inode *inode;
2241 ssize_t ret;
2242 int saverr;
2243 int fd = -1;
2244
2245 mapped_name = NULL;
2246 name = in_name;
2247 if (lo->xattrmap) {
2248 ret = xattr_map_client(lo, in_name, &mapped_name);
2249 if (ret < 0) {
2250 if (ret == -EPERM) {
2251 ret = -ENODATA;
2252 }
2253 fuse_reply_err(req, -ret);
2254 return;
2255 }
2256 if (mapped_name) {
2257 name = mapped_name;
2258 }
2259 }
2260
2261 inode = lo_inode(req, ino);
2262 if (!inode) {
2263 fuse_reply_err(req, EBADF);
2264 g_free(mapped_name);
2265 return;
2266 }
2267
2268 saverr = ENOSYS;
2269 if (!lo_data(req)->xattr) {
2270 goto out;
2271 }
2272
2273 fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
2274 ino, name, size);
2275
2276 if (size) {
2277 value = malloc(size);
2278 if (!value) {
2279 goto out_err;
2280 }
2281 }
2282
2283 sprintf(procname, "%i", inode->fd);
2284 /*
2285 * It is not safe to open() non-regular/non-dir files in file server
2286 * unless O_PATH is used, so use that method for regular files/dir
2287 * only (as it seems giving less performance overhead).
2288 * Otherwise, call fchdir() to avoid open().
2289 */
2290 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2291 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2292 if (fd < 0) {
2293 goto out_err;
2294 }
2295 ret = fgetxattr(fd, name, value, size);
2296 } else {
2297 /* fchdir should not fail here */
2298 assert(fchdir(lo->proc_self_fd) == 0);
2299 ret = getxattr(procname, name, value, size);
2300 assert(fchdir(lo->root.fd) == 0);
2301 }
2302
2303 if (ret == -1) {
2304 goto out_err;
2305 }
2306 if (size) {
2307 saverr = 0;
2308 if (ret == 0) {
2309 goto out;
2310 }
2311 fuse_reply_buf(req, value, ret);
2312 } else {
2313 fuse_reply_xattr(req, ret);
2314 }
2315 out_free:
2316 free(value);
2317
2318 if (fd >= 0) {
2319 close(fd);
2320 }
2321
2322 lo_inode_put(lo, &inode);
2323 return;
2324
2325 out_err:
2326 saverr = errno;
2327 out:
2328 fuse_reply_err(req, saverr);
2329 g_free(mapped_name);
2330 goto out_free;
2331 }
2332
2333 static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
2334 {
2335 struct lo_data *lo = lo_data(req);
2336 char *value = NULL;
2337 char procname[64];
2338 struct lo_inode *inode;
2339 ssize_t ret;
2340 int saverr;
2341 int fd = -1;
2342
2343 inode = lo_inode(req, ino);
2344 if (!inode) {
2345 fuse_reply_err(req, EBADF);
2346 return;
2347 }
2348
2349 saverr = ENOSYS;
2350 if (!lo_data(req)->xattr) {
2351 goto out;
2352 }
2353
2354 fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
2355 size);
2356
2357 if (size) {
2358 value = malloc(size);
2359 if (!value) {
2360 goto out_err;
2361 }
2362 }
2363
2364 sprintf(procname, "%i", inode->fd);
2365 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2366 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2367 if (fd < 0) {
2368 goto out_err;
2369 }
2370 ret = flistxattr(fd, value, size);
2371 } else {
2372 /* fchdir should not fail here */
2373 assert(fchdir(lo->proc_self_fd) == 0);
2374 ret = listxattr(procname, value, size);
2375 assert(fchdir(lo->root.fd) == 0);
2376 }
2377
2378 if (ret == -1) {
2379 goto out_err;
2380 }
2381 if (size) {
2382 saverr = 0;
2383 if (ret == 0) {
2384 goto out;
2385 }
2386 fuse_reply_buf(req, value, ret);
2387 } else {
2388 fuse_reply_xattr(req, ret);
2389 }
2390 out_free:
2391 free(value);
2392
2393 if (fd >= 0) {
2394 close(fd);
2395 }
2396
2397 lo_inode_put(lo, &inode);
2398 return;
2399
2400 out_err:
2401 saverr = errno;
2402 out:
2403 fuse_reply_err(req, saverr);
2404 goto out_free;
2405 }
2406
2407 static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
2408 const char *value, size_t size, int flags)
2409 {
2410 char procname[64];
2411 const char *name;
2412 char *mapped_name;
2413 struct lo_data *lo = lo_data(req);
2414 struct lo_inode *inode;
2415 ssize_t ret;
2416 int saverr;
2417 int fd = -1;
2418
2419 mapped_name = NULL;
2420 name = in_name;
2421 if (lo->xattrmap) {
2422 ret = xattr_map_client(lo, in_name, &mapped_name);
2423 if (ret < 0) {
2424 fuse_reply_err(req, -ret);
2425 return;
2426 }
2427 if (mapped_name) {
2428 name = mapped_name;
2429 }
2430 }
2431
2432 inode = lo_inode(req, ino);
2433 if (!inode) {
2434 fuse_reply_err(req, EBADF);
2435 g_free(mapped_name);
2436 return;
2437 }
2438
2439 saverr = ENOSYS;
2440 if (!lo_data(req)->xattr) {
2441 goto out;
2442 }
2443
2444 fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
2445 ", name=%s value=%s size=%zd)\n", ino, name, value, size);
2446
2447 sprintf(procname, "%i", inode->fd);
2448 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2449 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2450 if (fd < 0) {
2451 saverr = errno;
2452 goto out;
2453 }
2454 ret = fsetxattr(fd, name, value, size, flags);
2455 } else {
2456 /* fchdir should not fail here */
2457 assert(fchdir(lo->proc_self_fd) == 0);
2458 ret = setxattr(procname, name, value, size, flags);
2459 assert(fchdir(lo->root.fd) == 0);
2460 }
2461
2462 saverr = ret == -1 ? errno : 0;
2463
2464 out:
2465 if (fd >= 0) {
2466 close(fd);
2467 }
2468
2469 lo_inode_put(lo, &inode);
2470 g_free(mapped_name);
2471 fuse_reply_err(req, saverr);
2472 }
2473
2474 static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
2475 {
2476 char procname[64];
2477 const char *name;
2478 char *mapped_name;
2479 struct lo_data *lo = lo_data(req);
2480 struct lo_inode *inode;
2481 ssize_t ret;
2482 int saverr;
2483 int fd = -1;
2484
2485 mapped_name = NULL;
2486 name = in_name;
2487 if (lo->xattrmap) {
2488 ret = xattr_map_client(lo, in_name, &mapped_name);
2489 if (ret < 0) {
2490 fuse_reply_err(req, -ret);
2491 return;
2492 }
2493 if (mapped_name) {
2494 name = mapped_name;
2495 }
2496 }
2497
2498 inode = lo_inode(req, ino);
2499 if (!inode) {
2500 fuse_reply_err(req, EBADF);
2501 g_free(mapped_name);
2502 return;
2503 }
2504
2505 saverr = ENOSYS;
2506 if (!lo_data(req)->xattr) {
2507 goto out;
2508 }
2509
2510 fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
2511 name);
2512
2513 sprintf(procname, "%i", inode->fd);
2514 if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
2515 fd = openat(lo->proc_self_fd, procname, O_RDONLY);
2516 if (fd < 0) {
2517 saverr = errno;
2518 goto out;
2519 }
2520 ret = fremovexattr(fd, name);
2521 } else {
2522 /* fchdir should not fail here */
2523 assert(fchdir(lo->proc_self_fd) == 0);
2524 ret = removexattr(procname, name);
2525 assert(fchdir(lo->root.fd) == 0);
2526 }
2527
2528 saverr = ret == -1 ? errno : 0;
2529
2530 out:
2531 if (fd >= 0) {
2532 close(fd);
2533 }
2534
2535 lo_inode_put(lo, &inode);
2536 g_free(mapped_name);
2537 fuse_reply_err(req, saverr);
2538 }
2539
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: copy @len bytes between the two open files
 * with copy_file_range() and reply with the number of bytes copied, or
 * with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd = lo_fi_fd(req, fi_in);
    int out_fd = lo_fi_fd(req, fi_out);
    ssize_t copied;

    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%lu, ino=%" PRIu64 "/fd=%d, "
             "off=%lu, size=%zd, flags=0x%x)\n",
             ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags);

    copied = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (copied >= 0) {
        fuse_reply_write(req, copied);
        return;
    }

    fuse_reply_err(req, errno);
}
#endif
2566
2567 static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
2568 struct fuse_file_info *fi)
2569 {
2570 off_t res;
2571
2572 (void)ino;
2573 res = lseek(lo_fi_fd(req, fi), off, whence);
2574 if (res != -1) {
2575 fuse_reply_lseek(req, res);
2576 } else {
2577 fuse_reply_err(req, errno);
2578 }
2579 }
2580
2581 static void lo_destroy(void *userdata)
2582 {
2583 struct lo_data *lo = (struct lo_data *)userdata;
2584
2585 pthread_mutex_lock(&lo->mutex);
2586 while (true) {
2587 GHashTableIter iter;
2588 gpointer key, value;
2589
2590 g_hash_table_iter_init(&iter, lo->inodes);
2591 if (!g_hash_table_iter_next(&iter, &key, &value)) {
2592 break;
2593 }
2594
2595 struct lo_inode *inode = value;
2596 unref_inode(lo, inode, inode->nlookup);
2597 }
2598 pthread_mutex_unlock(&lo->mutex);
2599 }
2600
2601 static struct fuse_lowlevel_ops lo_oper = {
2602 .init = lo_init,
2603 .lookup = lo_lookup,
2604 .mkdir = lo_mkdir,
2605 .mknod = lo_mknod,
2606 .symlink = lo_symlink,
2607 .link = lo_link,
2608 .unlink = lo_unlink,
2609 .rmdir = lo_rmdir,
2610 .rename = lo_rename,
2611 .forget = lo_forget,
2612 .forget_multi = lo_forget_multi,
2613 .getattr = lo_getattr,
2614 .setattr = lo_setattr,
2615 .readlink = lo_readlink,
2616 .opendir = lo_opendir,
2617 .readdir = lo_readdir,
2618 .readdirplus = lo_readdirplus,
2619 .releasedir = lo_releasedir,
2620 .fsyncdir = lo_fsyncdir,
2621 .create = lo_create,
2622 .getlk = lo_getlk,
2623 .setlk = lo_setlk,
2624 .open = lo_open,
2625 .release = lo_release,
2626 .flush = lo_flush,
2627 .fsync = lo_fsync,
2628 .read = lo_read,
2629 .write_buf = lo_write_buf,
2630 .statfs = lo_statfs,
2631 .fallocate = lo_fallocate,
2632 .flock = lo_flock,
2633 .getxattr = lo_getxattr,
2634 .listxattr = lo_listxattr,
2635 .setxattr = lo_setxattr,
2636 .removexattr = lo_removexattr,
2637 #ifdef HAVE_COPY_FILE_RANGE
2638 .copy_file_range = lo_copy_file_range,
2639 #endif
2640 .lseek = lo_lseek,
2641 .destroy = lo_destroy,
2642 };
2643
/*
 * Write the vhost-user.json backend program capabilities to stdout as a
 * small JSON object announcing the backend type "fs".
 */
static void print_capabilities(void)
{
    fputs("{\n"
          " \"type\": \"fs\"\n"
          "}\n",
          stdout);
}
2651
2652 /*
2653 * Drop all Linux capabilities because the wait parent process only needs to
2654 * sit in waitpid(2) and terminate.
2655 */
2656 static void setup_wait_parent_capabilities(void)
2657 {
2658 capng_setpid(syscall(SYS_gettid));
2659 capng_clear(CAPNG_SELECT_BOTH);
2660 capng_apply(CAPNG_SELECT_BOTH);
2661 }
2662
2663 /*
2664 * Move to a new mount, net, and pid namespaces to isolate this process.
2665 */
2666 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
2667 {
2668 pid_t child;
2669
2670 /*
2671 * Create a new pid namespace for *child* processes. We'll have to
2672 * fork in order to enter the new pid namespace. A new mount namespace
2673 * is also needed so that we can remount /proc for the new pid
2674 * namespace.
2675 *
2676 * Our UNIX domain sockets have been created. Now we can move to
2677 * an empty network namespace to prevent TCP/IP and other network
2678 * activity in case this process is compromised.
2679 */
2680 if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
2681 fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
2682 exit(1);
2683 }
2684
2685 child = fork();
2686 if (child < 0) {
2687 fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
2688 exit(1);
2689 }
2690 if (child > 0) {
2691 pid_t waited;
2692 int wstatus;
2693
2694 setup_wait_parent_capabilities();
2695
2696 /* The parent waits for the child */
2697 do {
2698 waited = waitpid(child, &wstatus, 0);
2699 } while (waited < 0 && errno == EINTR && !se->exited);
2700
2701 /* We were terminated by a signal, see fuse_signals.c */
2702 if (se->exited) {
2703 exit(0);
2704 }
2705
2706 if (WIFEXITED(wstatus)) {
2707 exit(WEXITSTATUS(wstatus));
2708 }
2709
2710 exit(1);
2711 }
2712
2713 /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
2714 prctl(PR_SET_PDEATHSIG, SIGTERM);
2715
2716 /*
2717 * If the mounts have shared propagation then we want to opt out so our
2718 * mount changes don't affect the parent mount namespace.
2719 */
2720 if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
2721 fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
2722 exit(1);
2723 }
2724
2725 /* The child must remount /proc to use the new pid namespace */
2726 if (mount("proc", "/proc", "proc",
2727 MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
2728 fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
2729 exit(1);
2730 }
2731
2732 /*
2733 * We only need /proc/self/fd. Prevent ".." from accessing parent
2734 * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
2735 * previously remounted with MS_REC | MS_SLAVE this mount change only
2736 * affects our process.
2737 */
2738 if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
2739 fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
2740 exit(1);
2741 }
2742
2743 /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
2744 lo->proc_self_fd = open("/proc", O_PATH);
2745 if (lo->proc_self_fd == -1) {
2746 fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
2747 exit(1);
2748 }
2749 }
2750
2751 /*
2752 * Capture the capability state, we'll need to restore this for individual
2753 * threads later; see load_capng.
2754 */
2755 static void setup_capng(void)
2756 {
2757 /* Note this accesses /proc so has to happen before the sandbox */
2758 if (capng_get_caps_process()) {
2759 fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
2760 exit(1);
2761 }
2762 pthread_mutex_init(&cap.mutex, NULL);
2763 pthread_mutex_lock(&cap.mutex);
2764 cap.saved = capng_save_state();
2765 if (!cap.saved) {
2766 fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
2767 exit(1);
2768 }
2769 pthread_mutex_unlock(&cap.mutex);
2770 }
2771
2772 static void cleanup_capng(void)
2773 {
2774 free(cap.saved);
2775 cap.saved = NULL;
2776 pthread_mutex_destroy(&cap.mutex);
2777 }
2778
2779
2780 /*
2781 * Make the source directory our root so symlinks cannot escape and no other
2782 * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
2783 */
2784 static void setup_mounts(const char *source)
2785 {
2786 int oldroot;
2787 int newroot;
2788
2789 if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
2790 fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
2791 exit(1);
2792 }
2793
2794 /* This magic is based on lxc's lxc_pivot_root() */
2795 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2796 if (oldroot < 0) {
2797 fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
2798 exit(1);
2799 }
2800
2801 newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2802 if (newroot < 0) {
2803 fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
2804 exit(1);
2805 }
2806
2807 if (fchdir(newroot) < 0) {
2808 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2809 exit(1);
2810 }
2811
2812 if (syscall(__NR_pivot_root, ".", ".") < 0) {
2813 fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
2814 exit(1);
2815 }
2816
2817 if (fchdir(oldroot) < 0) {
2818 fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
2819 exit(1);
2820 }
2821
2822 if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
2823 fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
2824 exit(1);
2825 }
2826
2827 if (umount2(".", MNT_DETACH) < 0) {
2828 fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
2829 exit(1);
2830 }
2831
2832 if (fchdir(newroot) < 0) {
2833 fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
2834 exit(1);
2835 }
2836
2837 close(newroot);
2838 close(oldroot);
2839 }
2840
2841 /*
2842 * Only keep whitelisted capabilities that are needed for file system operation
2843 * The (possibly NULL) modcaps_in string passed in is free'd before exit.
2844 */
2845 static void setup_capabilities(char *modcaps_in)
2846 {
2847 char *modcaps = modcaps_in;
2848 pthread_mutex_lock(&cap.mutex);
2849 capng_restore_state(&cap.saved);
2850
2851 /*
2852 * Whitelist file system-related capabilities that are needed for a file
2853 * server to act like root. Drop everything else like networking and
2854 * sysadmin capabilities.
2855 *
2856 * Exclusions:
2857 * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
2858 * and we don't support that.
2859 * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
2860 * used by the Smack LSM. Omit it until there is demand for it.
2861 */
2862 capng_setpid(syscall(SYS_gettid));
2863 capng_clear(CAPNG_SELECT_BOTH);
2864 if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
2865 CAP_CHOWN,
2866 CAP_DAC_OVERRIDE,
2867 CAP_FOWNER,
2868 CAP_FSETID,
2869 CAP_SETGID,
2870 CAP_SETUID,
2871 CAP_MKNOD,
2872 CAP_SETFCAP,
2873 -1)) {
2874 fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
2875 exit(1);
2876 }
2877
2878 /*
2879 * The modcaps option is a colon separated list of caps,
2880 * each preceded by either + or -.
2881 */
2882 while (modcaps) {
2883 capng_act_t action;
2884 int cap;
2885
2886 char *next = strchr(modcaps, ':');
2887 if (next) {
2888 *next = '\0';
2889 next++;
2890 }
2891
2892 switch (modcaps[0]) {
2893 case '+':
2894 action = CAPNG_ADD;
2895 break;
2896
2897 case '-':
2898 action = CAPNG_DROP;
2899 break;
2900
2901 default:
2902 fuse_log(FUSE_LOG_ERR,
2903 "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
2904 __func__, modcaps[0]);
2905 exit(1);
2906 }
2907 cap = capng_name_to_capability(modcaps + 1);
2908 if (cap < 0) {
2909 fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
2910 modcaps);
2911 exit(1);
2912 }
2913 if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
2914 fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
2915 __func__, modcaps);
2916 exit(1);
2917 }
2918
2919 modcaps = next;
2920 }
2921 g_free(modcaps_in);
2922
2923 if (capng_apply(CAPNG_SELECT_BOTH)) {
2924 fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
2925 exit(1);
2926 }
2927
2928 cap.saved = capng_save_state();
2929 if (!cap.saved) {
2930 fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
2931 exit(1);
2932 }
2933 pthread_mutex_unlock(&cap.mutex);
2934 }
2935
2936 /*
2937 * Use chroot as a weaker sandbox for environments where the process is
2938 * launched without CAP_SYS_ADMIN.
2939 */
2940 static void setup_chroot(struct lo_data *lo)
2941 {
2942 lo->proc_self_fd = open("/proc/self/fd", O_PATH);
2943 if (lo->proc_self_fd == -1) {
2944 fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
2945 exit(1);
2946 }
2947
2948 /*
2949 * Make the shared directory the file system root so that FUSE_OPEN
2950 * (lo_open()) cannot escape the shared directory by opening a symlink.
2951 *
2952 * The chroot(2) syscall is later disabled by seccomp and the
2953 * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
2954 * is not possible.
2955 *
2956 * However, it's still possible to escape the chroot via lo->proc_self_fd
2957 * but that requires first gaining control of the process.
2958 */
2959 if (chroot(lo->source) != 0) {
2960 fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
2961 exit(1);
2962 }
2963
2964 /* Move into the chroot */
2965 if (chdir("/") != 0) {
2966 fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
2967 exit(1);
2968 }
2969 }
2970
2971 /*
2972 * Lock down this process to prevent access to other processes or files outside
2973 * source directory. This reduces the impact of arbitrary code execution bugs.
2974 */
2975 static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
2976 bool enable_syslog)
2977 {
2978 if (lo->sandbox == SANDBOX_NAMESPACE) {
2979 setup_namespaces(lo, se);
2980 setup_mounts(lo->source);
2981 } else {
2982 setup_chroot(lo);
2983 }
2984
2985 setup_seccomp(enable_syslog);
2986 setup_capabilities(g_strdup(lo->modcaps));
2987 }
2988
2989 /* Set the maximum number of open file descriptors */
2990 static void setup_nofile_rlimit(unsigned long rlimit_nofile)
2991 {
2992 struct rlimit rlim = {
2993 .rlim_cur = rlimit_nofile,
2994 .rlim_max = rlimit_nofile,
2995 };
2996
2997 if (rlimit_nofile == 0) {
2998 return; /* nothing to do */
2999 }
3000
3001 if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
3002 /* Ignore SELinux denials */
3003 if (errno == EPERM) {
3004 return;
3005 }
3006
3007 fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
3008 exit(1);
3009 }
3010 }
3011
3012 static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
3013 {
3014 g_autofree char *localfmt = NULL;
3015
3016 if (current_log_level < level) {
3017 return;
3018 }
3019
3020 if (current_log_level == FUSE_LOG_DEBUG) {
3021 if (!use_syslog) {
3022 localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
3023 get_clock(), syscall(__NR_gettid), fmt);
3024 } else {
3025 localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
3026 fmt);
3027 }
3028 fmt = localfmt;
3029 }
3030
3031 if (use_syslog) {
3032 int priority = LOG_ERR;
3033 switch (level) {
3034 case FUSE_LOG_EMERG:
3035 priority = LOG_EMERG;
3036 break;
3037 case FUSE_LOG_ALERT:
3038 priority = LOG_ALERT;
3039 break;
3040 case FUSE_LOG_CRIT:
3041 priority = LOG_CRIT;
3042 break;
3043 case FUSE_LOG_ERR:
3044 priority = LOG_ERR;
3045 break;
3046 case FUSE_LOG_WARNING:
3047 priority = LOG_WARNING;
3048 break;
3049 case FUSE_LOG_NOTICE:
3050 priority = LOG_NOTICE;
3051 break;
3052 case FUSE_LOG_INFO:
3053 priority = LOG_INFO;
3054 break;
3055 case FUSE_LOG_DEBUG:
3056 priority = LOG_DEBUG;
3057 break;
3058 }
3059 vsyslog(priority, fmt, ap);
3060 } else {
3061 vfprintf(stderr, fmt, ap);
3062 }
3063 }
3064
3065 static void setup_root(struct lo_data *lo, struct lo_inode *root)
3066 {
3067 int fd, res;
3068 struct stat stat;
3069
3070 fd = open("/", O_PATH);
3071 if (fd == -1) {
3072 fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
3073 exit(1);
3074 }
3075
3076 res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
3077 if (res == -1) {
3078 fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
3079 exit(1);
3080 }
3081
3082 root->filetype = S_IFDIR;
3083 root->fd = fd;
3084 root->key.ino = stat.st_ino;
3085 root->key.dev = stat.st_dev;
3086 root->nlookup = 2;
3087 g_atomic_int_set(&root->refcount, 2);
3088 }
3089
3090 static guint lo_key_hash(gconstpointer key)
3091 {
3092 const struct lo_key *lkey = key;
3093
3094 return (guint)lkey->ino + (guint)lkey->dev;
3095 }
3096
3097 static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
3098 {
3099 const struct lo_key *la = a;
3100 const struct lo_key *lb = b;
3101
3102 return la->ino == lb->ino && la->dev == lb->dev;
3103 }
3104
3105 static void fuse_lo_data_cleanup(struct lo_data *lo)
3106 {
3107 if (lo->inodes) {
3108 g_hash_table_destroy(lo->inodes);
3109 }
3110 lo_map_destroy(&lo->fd_map);
3111 lo_map_destroy(&lo->dirp_map);
3112 lo_map_destroy(&lo->ino_map);
3113
3114 if (lo->proc_self_fd >= 0) {
3115 close(lo->proc_self_fd);
3116 }
3117
3118 if (lo->root.fd >= 0) {
3119 close(lo->root.fd);
3120 }
3121
3122 free(lo->xattrmap);
3123 free_xattrmap(lo);
3124 free(lo->source);
3125 }
3126
3127 int main(int argc, char *argv[])
3128 {
3129 struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
3130 struct fuse_session *se;
3131 struct fuse_cmdline_opts opts;
3132 struct lo_data lo = {
3133 .sandbox = SANDBOX_NAMESPACE,
3134 .debug = 0,
3135 .writeback = 0,
3136 .posix_lock = 0,
3137 .allow_direct_io = 0,
3138 .proc_self_fd = -1,
3139 };
3140 struct lo_map_elem *root_elem;
3141 int ret = -1;
3142
3143 /* Don't mask creation mode, kernel already did that */
3144 umask(0);
3145
3146 qemu_init_exec_dir(argv[0]);
3147
3148 pthread_mutex_init(&lo.mutex, NULL);
3149 lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
3150 lo.root.fd = -1;
3151 lo.root.fuse_ino = FUSE_ROOT_ID;
3152 lo.cache = CACHE_AUTO;
3153
3154 /*
3155 * Set up the ino map like this:
3156 * [0] Reserved (will not be used)
3157 * [1] Root inode
3158 */
3159 lo_map_init(&lo.ino_map);
3160 lo_map_reserve(&lo.ino_map, 0)->in_use = false;
3161 root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
3162 root_elem->inode = &lo.root;
3163
3164 lo_map_init(&lo.dirp_map);
3165 lo_map_init(&lo.fd_map);
3166
3167 if (fuse_parse_cmdline(&args, &opts) != 0) {
3168 goto err_out1;
3169 }
3170 fuse_set_log_func(log_func);
3171 use_syslog = opts.syslog;
3172 if (use_syslog) {
3173 openlog("virtiofsd", LOG_PID, LOG_DAEMON);
3174 }
3175
3176 if (opts.show_help) {
3177 printf("usage: %s [options]\n\n", argv[0]);
3178 fuse_cmdline_help();
3179 printf(" -o source=PATH shared directory tree\n");
3180 fuse_lowlevel_help();
3181 ret = 0;
3182 goto err_out1;
3183 } else if (opts.show_version) {
3184 fuse_lowlevel_version();
3185 ret = 0;
3186 goto err_out1;
3187 } else if (opts.print_capabilities) {
3188 print_capabilities();
3189 ret = 0;
3190 goto err_out1;
3191 }
3192
3193 if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
3194 goto err_out1;
3195 }
3196
3197 if (opts.log_level != 0) {
3198 current_log_level = opts.log_level;
3199 } else {
3200 /* default log level is INFO */
3201 current_log_level = FUSE_LOG_INFO;
3202 }
3203 lo.debug = opts.debug;
3204 if (lo.debug) {
3205 current_log_level = FUSE_LOG_DEBUG;
3206 }
3207 if (lo.source) {
3208 struct stat stat;
3209 int res;
3210
3211 res = lstat(lo.source, &stat);
3212 if (res == -1) {
3213 fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
3214 lo.source);
3215 exit(1);
3216 }
3217 if (!S_ISDIR(stat.st_mode)) {
3218 fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
3219 exit(1);
3220 }
3221 } else {
3222 lo.source = strdup("/");
3223 }
3224
3225 if (lo.xattrmap) {
3226 parse_xattrmap(&lo);
3227 }
3228
3229 if (!lo.timeout_set) {
3230 switch (lo.cache) {
3231 case CACHE_NONE:
3232 lo.timeout = 0.0;
3233 break;
3234
3235 case CACHE_AUTO:
3236 lo.timeout = 1.0;
3237 break;
3238
3239 case CACHE_ALWAYS:
3240 lo.timeout = 86400.0;
3241 break;
3242 }
3243 } else if (lo.timeout < 0) {
3244 fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
3245 exit(1);
3246 }
3247
3248 se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
3249 if (se == NULL) {
3250 goto err_out1;
3251 }
<