Merge tag 'for-upstream' of git://repo.or.cz/qemu/kevin into staging
[qemu.git] / hw / remote / vfio-user-obj.c
/**
 * QEMU vfio-user server object
 *
 * Copyright © 2022 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
 *
 * See the COPYING file in the top-level directory.
 *
 */
11
/**
 * Usage: add options:
 *     -machine x-remote,vfio-user=on,auto-shutdown=on
 *     -device <PCI-device>,id=<pci-dev-id>
 *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
 *             device=<pci-dev-id>
 *
 * Note that the x-vfio-user-server object must be used with the x-remote
 * machine only. This server supports only PCI devices for now.
 *
 * type - SocketAddress type - presently only "unix" is supported. Required
 *        option
 *
 * path - named unix socket; it will be created by the server. It is
 *        a required option
 *
 * device - id of a device on the server; a required option. Only PCI
 *          devices are supported presently.
 *
 * notes - x-vfio-user-server could block IO and the monitor during the
 *         initialization phase.
 */
34
35 #include "qemu/osdep.h"
36
37 #include "qom/object.h"
38 #include "qom/object_interfaces.h"
39 #include "qemu/error-report.h"
40 #include "trace.h"
41 #include "sysemu/runstate.h"
42 #include "hw/boards.h"
43 #include "hw/remote/machine.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-visit-sockets.h"
46 #include "qapi/qapi-events-misc.h"
47 #include "qemu/notify.h"
48 #include "qemu/thread.h"
49 #include "qemu/main-loop.h"
50 #include "sysemu/sysemu.h"
51 #include "libvfio-user.h"
52 #include "hw/qdev-core.h"
53 #include "hw/pci/pci.h"
54 #include "qemu/timer.h"
55 #include "exec/memory.h"
56 #include "hw/pci/msi.h"
57 #include "hw/pci/msix.h"
58 #include "hw/remote/vfio-user-obj.h"
59
/* QOM type name under which the vfio-user server object is registered */
#define TYPE_VFU_OBJECT "x-vfio-user-server"
/* Declares VfuObject/VfuObjectClass and the VFU_OBJECT*() cast helpers */
OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
62
/**
 * VFU_OBJECT_ERROR - reports an error message. If auto_shutdown
 * is set, it aborts the machine on error. Otherwise, it logs an
 * error message without aborting.
 *
 * @o is accepted for call-site symmetry but is not evaluated.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and "VFU_OBJECT_ERROR(...);" can be used safely
 * as the body of an unbraced if/else.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                                     \
    do {                                                                  \
        if (vfu_object_auto_shutdown()) {                                 \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);              \
        } else {                                                          \
            error_report((fmt), ## __VA_ARGS__);                          \
        }                                                                 \
    } while (0)

/* Class-level state shared by every x-vfio-user-server instance */
struct VfuObjectClass {
    ObjectClass parent_class;

    /*
     * Count of live instances: incremented in vfu_object_init() and
     * decremented in vfu_object_finalize(), which requests a shutdown
     * once it reaches zero and auto-shutdown is enabled.
     */
    unsigned int nr_devs;
};
82
/* Per-instance state of an x-vfio-user-server object */
struct VfuObject {
    /* private */
    Object parent;

    /* Value of the "socket" property; only UNIX sockets are accepted */
    SocketAddress *socket;

    /* Value of the "device" property: id of the device to export */
    char *device;

    /*
     * Error recorded by vfu_object_init() when the machine type is not
     * compatible; reported later from vfu_object_init_ctx()
     */
    Error *err;

    /* Notifier used to initialize the context once the machine is ready */
    Notifier machine_done;

    /* libvfio-user context; NULL until vfu_object_init_ctx() succeeds */
    vfu_ctx_t *vfu_ctx;

    /* Device resolved from @device; a reference is held while non-NULL */
    PCIDevice *pci_dev;

    /* Unplug blocker registered on @pci_dev while it is being served */
    Error *unplug_blocker;

    /* File descriptor watched in the main loop; -1 when none is watched */
    int vfu_poll_fd;

    /* @pci_dev's MSI/MSI-X callbacks saved by vfu_object_setup_msi_cbs() */
    MSITriggerFunc *default_msi_trigger;
    MSIPrepareMessageFunc *default_msi_prepare_message;
    MSIxPrepareMessageFunc *default_msix_prepare_message;
};
107
/* Forward declaration: the property setters below call this function */
static void vfu_object_init_ctx(VfuObject *o, Error **errp);
109
110 static bool vfu_object_auto_shutdown(void)
111 {
112 bool auto_shutdown = true;
113 Error *local_err = NULL;
114
115 if (!current_machine) {
116 return auto_shutdown;
117 }
118
119 auto_shutdown = object_property_get_bool(OBJECT(current_machine),
120 "auto-shutdown",
121 &local_err);
122
123 /*
124 * local_err would be set if no such property exists - safe to ignore.
125 * Unlikely scenario as auto-shutdown is always defined for
126 * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with
127 * TYPE_REMOTE_MACHINE
128 */
129 if (local_err) {
130 auto_shutdown = true;
131 error_free(local_err);
132 }
133
134 return auto_shutdown;
135 }
136
137 static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
138 void *opaque, Error **errp)
139 {
140 VfuObject *o = VFU_OBJECT(obj);
141
142 if (o->vfu_ctx) {
143 error_setg(errp, "vfu: Unable to set socket property - server busy");
144 return;
145 }
146
147 qapi_free_SocketAddress(o->socket);
148
149 o->socket = NULL;
150
151 visit_type_SocketAddress(v, name, &o->socket, errp);
152
153 if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
154 error_setg(errp, "vfu: Unsupported socket type - %s",
155 SocketAddressType_str(o->socket->type));
156 qapi_free_SocketAddress(o->socket);
157 o->socket = NULL;
158 return;
159 }
160
161 trace_vfu_prop("socket", o->socket->u.q_unix.path);
162
163 vfu_object_init_ctx(o, errp);
164 }
165
166 static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
167 {
168 VfuObject *o = VFU_OBJECT(obj);
169
170 if (o->vfu_ctx) {
171 error_setg(errp, "vfu: Unable to set device property - server busy");
172 return;
173 }
174
175 g_free(o->device);
176
177 o->device = g_strdup(str);
178
179 trace_vfu_prop("device", str);
180
181 vfu_object_init_ctx(o, errp);
182 }
183
184 static void vfu_object_ctx_run(void *opaque)
185 {
186 VfuObject *o = opaque;
187 const char *vfu_id;
188 char *vfu_path, *pci_dev_path;
189 int ret = -1;
190
191 while (ret != 0) {
192 ret = vfu_run_ctx(o->vfu_ctx);
193 if (ret < 0) {
194 if (errno == EINTR) {
195 continue;
196 } else if (errno == ENOTCONN) {
197 vfu_id = object_get_canonical_path_component(OBJECT(o));
198 vfu_path = object_get_canonical_path(OBJECT(o));
199 g_assert(o->pci_dev);
200 pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
201 /* o->device is a required property and is non-NULL here */
202 g_assert(o->device);
203 qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
204 o->device, pci_dev_path);
205 qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
206 o->vfu_poll_fd = -1;
207 object_unparent(OBJECT(o));
208 g_free(vfu_path);
209 g_free(pci_dev_path);
210 break;
211 } else {
212 VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
213 o->device, strerror(errno));
214 break;
215 }
216 }
217 }
218 }
219
220 static void vfu_object_attach_ctx(void *opaque)
221 {
222 VfuObject *o = opaque;
223 GPollFD pfds[1];
224 int ret;
225
226 qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
227
228 pfds[0].fd = o->vfu_poll_fd;
229 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
230
231 retry_attach:
232 ret = vfu_attach_ctx(o->vfu_ctx);
233 if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
234 /**
235 * vfu_object_attach_ctx can block QEMU's main loop
236 * during attach - the monitor and other IO
237 * could be unresponsive during this time.
238 */
239 (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
240 goto retry_attach;
241 } else if (ret < 0) {
242 VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
243 o->device, strerror(errno));
244 return;
245 }
246
247 o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
248 if (o->vfu_poll_fd < 0) {
249 VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
250 return;
251 }
252
253 qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
254 }
255
256 static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
257 size_t count, loff_t offset,
258 const bool is_write)
259 {
260 VfuObject *o = vfu_get_private(vfu_ctx);
261 uint32_t pci_access_width = sizeof(uint32_t);
262 size_t bytes = count;
263 uint32_t val = 0;
264 char *ptr = buf;
265 int len;
266
267 /*
268 * Writes to the BAR registers would trigger an update to the
269 * global Memory and IO AddressSpaces. But the remote device
270 * never uses the global AddressSpaces, therefore overlapping
271 * memory regions are not a problem
272 */
273 while (bytes > 0) {
274 len = (bytes > pci_access_width) ? pci_access_width : bytes;
275 if (is_write) {
276 memcpy(&val, ptr, len);
277 pci_host_config_write_common(o->pci_dev, offset,
278 pci_config_size(o->pci_dev),
279 val, len);
280 trace_vfu_cfg_write(offset, val);
281 } else {
282 val = pci_host_config_read_common(o->pci_dev, offset,
283 pci_config_size(o->pci_dev), len);
284 memcpy(ptr, &val, len);
285 trace_vfu_cfg_read(offset, val);
286 }
287 offset += len;
288 ptr += len;
289 bytes -= len;
290 }
291
292 return count;
293 }
294
295 static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
296 {
297 VfuObject *o = vfu_get_private(vfu_ctx);
298 AddressSpace *dma_as = NULL;
299 MemoryRegion *subregion = NULL;
300 g_autofree char *name = NULL;
301 struct iovec *iov = &info->iova;
302
303 if (!info->vaddr) {
304 return;
305 }
306
307 name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
308 (uint64_t)info->vaddr);
309
310 subregion = g_new0(MemoryRegion, 1);
311
312 memory_region_init_ram_ptr(subregion, NULL, name,
313 iov->iov_len, info->vaddr);
314
315 dma_as = pci_device_iommu_address_space(o->pci_dev);
316
317 memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);
318
319 trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
320 }
321
322 static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
323 {
324 VfuObject *o = vfu_get_private(vfu_ctx);
325 AddressSpace *dma_as = NULL;
326 MemoryRegion *mr = NULL;
327 ram_addr_t offset;
328
329 mr = memory_region_from_host(info->vaddr, &offset);
330 if (!mr) {
331 return;
332 }
333
334 dma_as = pci_device_iommu_address_space(o->pci_dev);
335
336 memory_region_del_subregion(dma_as->root, mr);
337
338 object_unparent((OBJECT(mr)));
339
340 trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
341 }
342
343 static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
344 hwaddr size, const bool is_write)
345 {
346 uint8_t *ptr = buf;
347 bool release_lock = false;
348 uint8_t *ram_ptr = NULL;
349 MemTxResult result;
350 int access_size;
351 uint64_t val;
352
353 if (memory_access_is_direct(mr, is_write)) {
354 /**
355 * Some devices expose a PCI expansion ROM, which could be buffer
356 * based as compared to other regions which are primarily based on
357 * MemoryRegionOps. memory_region_find() would already check
358 * for buffer overflow, we don't need to repeat it here.
359 */
360 ram_ptr = memory_region_get_ram_ptr(mr);
361
362 if (is_write) {
363 memcpy((ram_ptr + offset), buf, size);
364 } else {
365 memcpy(buf, (ram_ptr + offset), size);
366 }
367
368 return 0;
369 }
370
371 while (size) {
372 /**
373 * The read/write logic used below is similar to the ones in
374 * flatview_read/write_continue()
375 */
376 release_lock = prepare_mmio_access(mr);
377
378 access_size = memory_access_size(mr, size, offset);
379
380 if (is_write) {
381 val = ldn_he_p(ptr, access_size);
382
383 result = memory_region_dispatch_write(mr, offset, val,
384 size_memop(access_size),
385 MEMTXATTRS_UNSPECIFIED);
386 } else {
387 result = memory_region_dispatch_read(mr, offset, &val,
388 size_memop(access_size),
389 MEMTXATTRS_UNSPECIFIED);
390
391 stn_he_p(ptr, access_size, val);
392 }
393
394 if (release_lock) {
395 qemu_mutex_unlock_iothread();
396 release_lock = false;
397 }
398
399 if (result != MEMTX_OK) {
400 return -1;
401 }
402
403 size -= access_size;
404 ptr += access_size;
405 offset += access_size;
406 }
407
408 return 0;
409 }
410
411 static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
412 hwaddr bar_offset, char * const buf,
413 hwaddr len, const bool is_write)
414 {
415 MemoryRegionSection section = { 0 };
416 uint8_t *ptr = (uint8_t *)buf;
417 MemoryRegion *section_mr = NULL;
418 uint64_t section_size;
419 hwaddr section_offset;
420 hwaddr size = 0;
421
422 while (len) {
423 section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
424 bar_offset, len);
425
426 if (!section.mr) {
427 warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
428 return size;
429 }
430
431 section_mr = section.mr;
432 section_offset = section.offset_within_region;
433 section_size = int128_get64(section.size);
434
435 if (is_write && section_mr->readonly) {
436 warn_report("vfu: attempting to write to readonly region in "
437 "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
438 pci_bar, bar_offset,
439 (bar_offset + section_size));
440 memory_region_unref(section_mr);
441 return size;
442 }
443
444 if (vfu_object_mr_rw(section_mr, ptr, section_offset,
445 section_size, is_write)) {
446 warn_report("vfu: failed to %s "
447 "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
448 is_write ? "write to" : "read from", bar_offset,
449 (bar_offset + section_size), pci_bar);
450 memory_region_unref(section_mr);
451 return size;
452 }
453
454 size += section_size;
455 bar_offset += section_size;
456 ptr += section_size;
457 len -= section_size;
458
459 memory_region_unref(section_mr);
460 }
461
462 return size;
463 }
464
/**
 * VFU_OBJECT_BAR_HANDLER - generates the region access callback of one
 * PCI BAR.
 *
 * VFU_OBJECT_BAR_HANDLER(2), for instance, defines
 * vfu_object_bar2_handler(), which forwards the access to
 * vfu_object_bar_rw() with BAR number 2.
 */
#define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                      \
    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,     \
                                        char * const buf, size_t count,     \
                                        loff_t offset, const bool is_write) \
    {                                                                       \
        VfuObject *server = vfu_get_private(vfu_ctx);                       \
                                                                            \
        return vfu_object_bar_rw(server->pci_dev, BAR_NO, offset,           \
                                 buf, count, is_write);                     \
    }

483 VFU_OBJECT_BAR_HANDLER(0)
484 VFU_OBJECT_BAR_HANDLER(1)
485 VFU_OBJECT_BAR_HANDLER(2)
486 VFU_OBJECT_BAR_HANDLER(3)
487 VFU_OBJECT_BAR_HANDLER(4)
488 VFU_OBJECT_BAR_HANDLER(5)
489 VFU_OBJECT_BAR_HANDLER(6)
490
491 static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
492 &vfu_object_bar0_handler,
493 &vfu_object_bar1_handler,
494 &vfu_object_bar2_handler,
495 &vfu_object_bar3_handler,
496 &vfu_object_bar4_handler,
497 &vfu_object_bar5_handler,
498 &vfu_object_bar6_handler,
499 };
500
501 /**
502 * vfu_object_register_bars - Identify active BAR regions of pdev and setup
503 * callbacks to handle read/write accesses
504 */
505 static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
506 {
507 int flags = VFU_REGION_FLAG_RW;
508 int i;
509
510 for (i = 0; i < PCI_NUM_REGIONS; i++) {
511 if (!pdev->io_regions[i].size) {
512 continue;
513 }
514
515 if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
516 pdev->io_regions[i].memory->readonly) {
517 flags &= ~VFU_REGION_FLAG_WRITE;
518 }
519
520 vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
521 (size_t)pdev->io_regions[i].size,
522 vfu_object_bar_handlers[i],
523 flags, NULL, 0, -1, 0);
524
525 trace_vfu_bar_register(i, pdev->io_regions[i].addr,
526 pdev->io_regions[i].size);
527 }
528 }
529
530 static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
531 {
532 int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
533 pci_dev->devfn);
534
535 return pci_bdf;
536 }
537
538 static void vfu_object_set_irq(void *opaque, int pirq, int level)
539 {
540 PCIBus *pci_bus = opaque;
541 PCIDevice *pci_dev = NULL;
542 vfu_ctx_t *vfu_ctx = NULL;
543 int pci_bus_num, devfn;
544
545 if (level) {
546 pci_bus_num = PCI_BUS_NUM(pirq);
547 devfn = PCI_BDF_TO_DEVFN(pirq);
548
549 /*
550 * pci_find_device() performs at O(1) if the device is attached
551 * to the root PCI bus. Whereas, if the device is attached to a
552 * secondary PCI bus (such as when a root port is involved),
553 * finding the parent PCI bus could take O(n)
554 */
555 pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
556
557 vfu_ctx = pci_dev->irq_opaque;
558
559 g_assert(vfu_ctx);
560
561 vfu_irq_trigger(vfu_ctx, 0);
562 }
563 }
564
565 static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
566 unsigned int vector)
567 {
568 MSIMessage msg;
569
570 msg.address = 0;
571 msg.data = vector;
572
573 return msg;
574 }
575
576 static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
577 {
578 vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;
579
580 vfu_irq_trigger(vfu_ctx, msg.data);
581 }
582
583 static void vfu_object_setup_msi_cbs(VfuObject *o)
584 {
585 o->default_msi_trigger = o->pci_dev->msi_trigger;
586 o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
587 o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;
588
589 o->pci_dev->msi_trigger = vfu_object_msi_trigger;
590 o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
591 o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
592 }
593
594 static void vfu_object_restore_msi_cbs(VfuObject *o)
595 {
596 o->pci_dev->msi_trigger = o->default_msi_trigger;
597 o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
598 o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
599 }
600
601 static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
602 uint32_t count, bool mask)
603 {
604 VfuObject *o = vfu_get_private(vfu_ctx);
605 Error *err = NULL;
606 uint32_t vector;
607
608 for (vector = start; vector < count; vector++) {
609 msix_set_mask(o->pci_dev, vector, mask, &err);
610 if (err) {
611 VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
612 error_get_pretty(err));
613 error_free(err);
614 err = NULL;
615 }
616 }
617 }
618
619 static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
620 uint32_t count, bool mask)
621 {
622 VfuObject *o = vfu_get_private(vfu_ctx);
623 Error *err = NULL;
624 uint32_t vector;
625
626 for (vector = start; vector < count; vector++) {
627 msi_set_mask(o->pci_dev, vector, mask, &err);
628 if (err) {
629 VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
630 error_get_pretty(err));
631 error_free(err);
632 err = NULL;
633 }
634 }
635 }
636
637 static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
638 {
639 vfu_ctx_t *vfu_ctx = o->vfu_ctx;
640 int ret;
641
642 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
643 if (ret < 0) {
644 return ret;
645 }
646
647 if (msix_nr_vectors_allocated(pci_dev)) {
648 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
649 msix_nr_vectors_allocated(pci_dev));
650 vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
651 &vfu_msix_irq_state);
652 } else if (msi_nr_vectors_allocated(pci_dev)) {
653 ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
654 msi_nr_vectors_allocated(pci_dev));
655 vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
656 &vfu_msi_irq_state);
657 }
658
659 if (ret < 0) {
660 return ret;
661 }
662
663 vfu_object_setup_msi_cbs(o);
664
665 pci_dev->irq_opaque = vfu_ctx;
666
667 return 0;
668 }
669
670 void vfu_object_set_bus_irq(PCIBus *pci_bus)
671 {
672 int bus_num = pci_bus_num(pci_bus);
673 int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);
674
675 pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
676 max_bdf);
677 }
678
679 static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
680 {
681 VfuObject *o = vfu_get_private(vfu_ctx);
682
683 /* vfu_object_ctx_run() handles lost connection */
684 if (type == VFU_RESET_LOST_CONN) {
685 return 0;
686 }
687
688 qdev_reset_all(DEVICE(o->pci_dev));
689
690 return 0;
691 }
692
693 /*
694 * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
695 * properties. It also depends on devices instantiated in QEMU. These
696 * dependencies are not available during the instance_init phase of this
697 * object's life-cycle. As such, the server is initialized after the
698 * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT
699 * when the machine is setup, and the dependencies are available.
700 */
701 static void vfu_object_machine_done(Notifier *notifier, void *data)
702 {
703 VfuObject *o = container_of(notifier, VfuObject, machine_done);
704 Error *err = NULL;
705
706 vfu_object_init_ctx(o, &err);
707
708 if (err) {
709 error_propagate(&error_abort, err);
710 }
711 }
712
713 /**
714 * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
715 * an unplug blocker for the associated PCI device. Setup a FD handler
716 * to process incoming messages in the context's socket.
717 *
718 * The socket and device properties are mandatory, and this function
719 * will not create the context without them - the setters for these
720 * properties should call this function when the property is set. The
721 * machine should also be ready when this function is invoked - it is
722 * because QEMU objects are initialized before devices, and the
723 * associated PCI device wouldn't be available at the object
724 * initialization time. Until these conditions are satisfied, this
725 * function would return early without performing any task.
726 */
727 static void vfu_object_init_ctx(VfuObject *o, Error **errp)
728 {
729 ERRP_GUARD();
730 DeviceState *dev = NULL;
731 vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
732 int ret;
733
734 if (o->vfu_ctx || !o->socket || !o->device ||
735 !phase_check(PHASE_MACHINE_READY)) {
736 return;
737 }
738
739 if (o->err) {
740 error_propagate(errp, o->err);
741 o->err = NULL;
742 return;
743 }
744
745 o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
746 LIBVFIO_USER_FLAG_ATTACH_NB,
747 o, VFU_DEV_TYPE_PCI);
748 if (o->vfu_ctx == NULL) {
749 error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
750 return;
751 }
752
753 dev = qdev_find_recursive(sysbus_get_default(), o->device);
754 if (dev == NULL) {
755 error_setg(errp, "vfu: Device %s not found", o->device);
756 goto fail;
757 }
758
759 if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
760 error_setg(errp, "vfu: %s not a PCI device", o->device);
761 goto fail;
762 }
763
764 o->pci_dev = PCI_DEVICE(dev);
765
766 object_ref(OBJECT(o->pci_dev));
767
768 if (pci_is_express(o->pci_dev)) {
769 pci_type = VFU_PCI_TYPE_EXPRESS;
770 }
771
772 ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
773 if (ret < 0) {
774 error_setg(errp,
775 "vfu: Failed to attach PCI device %s to context - %s",
776 o->device, strerror(errno));
777 goto fail;
778 }
779
780 error_setg(&o->unplug_blocker,
781 "vfu: %s for %s must be deleted before unplugging",
782 TYPE_VFU_OBJECT, o->device);
783 qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
784
785 ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
786 pci_config_size(o->pci_dev), &vfu_object_cfg_access,
787 VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
788 NULL, 0, -1, 0);
789 if (ret < 0) {
790 error_setg(errp,
791 "vfu: Failed to setup config space handlers for %s- %s",
792 o->device, strerror(errno));
793 goto fail;
794 }
795
796 ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
797 if (ret < 0) {
798 error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
799 o->device);
800 goto fail;
801 }
802
803 vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
804
805 ret = vfu_object_setup_irqs(o, o->pci_dev);
806 if (ret < 0) {
807 error_setg(errp, "vfu: Failed to setup interrupts for %s",
808 o->device);
809 goto fail;
810 }
811
812 ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
813 if (ret < 0) {
814 error_setg(errp, "vfu: Failed to setup reset callback");
815 goto fail;
816 }
817
818 ret = vfu_realize_ctx(o->vfu_ctx);
819 if (ret < 0) {
820 error_setg(errp, "vfu: Failed to realize device %s- %s",
821 o->device, strerror(errno));
822 goto fail;
823 }
824
825 o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
826 if (o->vfu_poll_fd < 0) {
827 error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
828 goto fail;
829 }
830
831 qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
832
833 return;
834
835 fail:
836 vfu_destroy_ctx(o->vfu_ctx);
837 if (o->unplug_blocker && o->pci_dev) {
838 qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
839 error_free(o->unplug_blocker);
840 o->unplug_blocker = NULL;
841 }
842 if (o->pci_dev) {
843 vfu_object_restore_msi_cbs(o);
844 o->pci_dev->irq_opaque = NULL;
845 object_unref(OBJECT(o->pci_dev));
846 o->pci_dev = NULL;
847 }
848 o->vfu_ctx = NULL;
849 }
850
851 static void vfu_object_init(Object *obj)
852 {
853 VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
854 VfuObject *o = VFU_OBJECT(obj);
855
856 k->nr_devs++;
857
858 if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
859 error_setg(&o->err, "vfu: %s only compatible with %s machine",
860 TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
861 return;
862 }
863
864 if (!phase_check(PHASE_MACHINE_READY)) {
865 o->machine_done.notify = vfu_object_machine_done;
866 qemu_add_machine_init_done_notifier(&o->machine_done);
867 }
868
869 o->vfu_poll_fd = -1;
870 }
871
872 static void vfu_object_finalize(Object *obj)
873 {
874 VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
875 VfuObject *o = VFU_OBJECT(obj);
876
877 k->nr_devs--;
878
879 qapi_free_SocketAddress(o->socket);
880
881 o->socket = NULL;
882
883 if (o->vfu_poll_fd != -1) {
884 qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
885 o->vfu_poll_fd = -1;
886 }
887
888 if (o->vfu_ctx) {
889 vfu_destroy_ctx(o->vfu_ctx);
890 o->vfu_ctx = NULL;
891 }
892
893 g_free(o->device);
894
895 o->device = NULL;
896
897 if (o->unplug_blocker && o->pci_dev) {
898 qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
899 error_free(o->unplug_blocker);
900 o->unplug_blocker = NULL;
901 }
902
903 if (o->pci_dev) {
904 vfu_object_restore_msi_cbs(o);
905 o->pci_dev->irq_opaque = NULL;
906 object_unref(OBJECT(o->pci_dev));
907 o->pci_dev = NULL;
908 }
909
910 if (!k->nr_devs && vfu_object_auto_shutdown()) {
911 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
912 }
913
914 if (o->machine_done.notify) {
915 qemu_remove_machine_init_done_notifier(&o->machine_done);
916 o->machine_done.notify = NULL;
917 }
918 }
919
920 static void vfu_object_class_init(ObjectClass *klass, void *data)
921 {
922 VfuObjectClass *k = VFU_OBJECT_CLASS(klass);
923
924 k->nr_devs = 0;
925
926 object_class_property_add(klass, "socket", "SocketAddress", NULL,
927 vfu_object_set_socket, NULL, NULL);
928 object_class_property_set_description(klass, "socket",
929 "SocketAddress "
930 "(ex: type=unix,path=/tmp/sock). "
931 "Only UNIX is presently supported");
932 object_class_property_add_str(klass, "device", NULL,
933 vfu_object_set_device);
934 object_class_property_set_description(klass, "device",
935 "device ID - only PCI devices "
936 "are presently supported");
937 }
938
/*
 * QOM type description of the vfio-user server object: a plain object
 * that implements the user-creatable interface, with the instance and
 * class hooks defined in this file.
 */
static const TypeInfo vfu_object_info = {
    .name = TYPE_VFU_OBJECT,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(VfuObject),
    .instance_init = vfu_object_init,
    .instance_finalize = vfu_object_finalize,
    .class_size = sizeof(VfuObjectClass),
    .class_init = vfu_object_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};
952
/* Registers the vfio-user server object type with QOM */
static void vfu_register_types(void)
{
    type_register_static(&vfu_object_info);
}

/* Hooks vfu_register_types() into QEMU's type initialization */
type_init(vfu_register_types);