Merge tag 'linux-user-for-7.1-pull-request' of https://gitlab.com/laurent_vivier...
[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
60
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64
65 /***********************************************************/
66 /* ram save/restore */
67
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
73
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
84 XBZRLECacheStats xbzrle_counters;
85
/*
 * Container for the XBZRLE cache plus the scratch pages/buffers used
 * while encoding and decoding.  Cache accesses are serialized by @lock
 * (taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock()).
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* protects @cache against concurrent resize (see xbzrle_cache_resize) */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
101
102 static void XBZRLE_cache_lock(void)
103 {
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
106 }
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 }
115
116 /**
117 * xbzrle_cache_resize: resize the xbzrle cache
118 *
119 * This function is called from migrate_params_apply in main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
123 *
124 * Returns 0 for success or -1 for error
125 *
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
128 */
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
130 {
131 PageCache *new_cache;
132 int64_t ret = 0;
133
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
139 }
140
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
144 }
145
146 XBZRLE_cache_lock();
147
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
153 }
154
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
157 }
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
161 }
162
163 bool ramblock_is_ignored(RAMBlock *block)
164 {
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167 }
168
169 #undef RAMBLOCK_FOREACH
170
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
172 {
173 RAMBlock *block;
174 int ret = 0;
175
176 RCU_READ_LOCK_GUARD();
177
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
182 }
183 }
184 return ret;
185 }
186
187 static void ramblock_recv_map_init(void)
188 {
189 RAMBlock *rb;
190
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
194 }
195 }
196
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
198 {
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
201 }
202
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
204 {
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206 }
207
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
209 {
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211 }
212
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
215 {
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
219 }
220
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222
/*
 * Send the receive bitmap of the block named @block_name to the source
 * via @file.
 *
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    /* One bit per target page of the postcopy region */
    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap, so that source
     * and destination VMs with different endianness interoperate.
     * (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    /* Bytes sent: the bitmap payload plus its 8-byte size header */
    return size + sizeof(size);
}
284
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* Block the requested pages belong to */
    RAMBlock *rb;
    /* Start offset and length of the requested range */
    hwaddr offset;
    hwaddr len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
296
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;

    /* compression statistics since the beginning of the period */
    /* number of times no free compression thread was available */
    uint64_t compress_thread_busy_prev;
    /* number of bytes produced by compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
352
/* Global RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
356
/* Initialize the list of precopy notifiers */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
361
/* Register @n to be invoked by precopy_notify() */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
366
/* Unregister a notifier previously added with precopy_add_notifier() */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
371
372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
373 {
374 PrecopyNotifyData pnd;
375 pnd.reason = reason;
376 pnd.errp = errp;
377
378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
379 }
380
381 uint64_t ram_bytes_remaining(void)
382 {
383 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
384 0;
385 }
386
387 MigrationStats ram_counters;
388
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from (page index within @block) */
    unsigned long page;
    /* Set once we wrap around the block list */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
399
400 CompressionStats compression_counters;
401
/* Per-thread state for one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set (under comp_done_lock) when the posted work item finished */
    bool done;
    /* ask the worker thread to exit */
    bool quit;
    /* result of the last item: page compressed to all zeroes */
    bool zero_page;
    /* buffer-only QEMUFile the compressed data is written into */
    QEMUFile *file;
    /* protect block/offset/quit; cond signals new work to the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* work item: page at @offset of @block; NULL block means idle */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
417
/*
 * Per-thread state for one decompression worker.
 * NOTE(review): the worker loop is outside this chunk; field comments
 * below are inferred from names — confirm against do_data_decompress().
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination host address for the page */
    void *des;
    /* compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
429
/* Per-worker compression state, one entry per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the compression globals above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
446
447 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
448 ram_addr_t offset, uint8_t *source_buf);
449
/*
 * Compression worker thread body.
 *
 * Sleeps on param->cond until a work item (param->block/param->offset)
 * is posted or param->quit is set.  Each item is compressed into
 * param->file via do_compress_ram_page(); completion is published under
 * comp_done_lock by setting param->done and signalling comp_done_cond.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the work item, then drop the lock while compressing */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish completion to the migration thread */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
483
/*
 * Stop and join every compression worker and free all resources
 * allocated by compress_threads_save_setup().  Tolerates a partially
 * initialised state (it is also the error path of setup).
 */
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        /* Ask the worker to quit, waking it if it is blocked on cond */
        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}
522
/*
 * Allocate per-thread compression state and start the worker threads.
 *
 * Returns 0 on success (or when compression is disabled), -1 on
 * failure; on failure everything already initialised is torn down via
 * compress_threads_save_cleanup().
 */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
565
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state (tracks the last block whose id was sent)
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    /* Same block as the previous page: the id can be elided */
    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
598
599 /**
600 * mig_throttle_guest_down: throttle down the guest
601 *
602 * Reduce amount of guest cpu execution to hopefully slow down memory
603 * writes. If guest dirty memory rate is reduced below the rate at
604 * which we can transfer pages to the destination then we should be
605 * able to complete migration. Some workloads dirty memory way too
606 * fast and will not effectively converge, even with auto-converge.
607 */
608 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
609 uint64_t bytes_dirty_threshold)
610 {
611 MigrationState *s = migrate_get_current();
612 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
613 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
614 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
615 int pct_max = s->parameters.max_cpu_throttle;
616
617 uint64_t throttle_now = cpu_throttle_get_percentage();
618 uint64_t cpu_now, cpu_ideal, throttle_inc;
619
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 if (!pct_tailslow) {
626 throttle_inc = pct_increment;
627 } else {
628 /* Compute the ideal CPU percentage used by Guest, which may
629 * make the dirty rate match the dirty rate threshold. */
630 cpu_now = 100 - throttle_now;
631 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
632 bytes_dirty_period);
633 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
634 }
635 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
636 }
637 }
638
/*
 * Restart the current measurement window used by the throttling logic:
 * reset the dirty-page counter and re-snapshot time and bytes sent.
 */
void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}
647
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* Nothing to do until XBZRLE kicks in (e.g. after the first round) */
    if (!rs->xbzrle_enabled) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}
671
672 #define ENCODING_FLAG_XBZRLE 0x1
673
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /* Cache miss: insert (except at the last stage) and fall back to -1 */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Page unchanged since last send: nothing to transmit */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoded form would be bigger than the page: send it raw instead */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
774
775 /**
776 * migration_bitmap_find_dirty: find the next dirty page from start
777 *
778 * Returns the page offset within memory region of the start of a dirty page
779 *
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
783 */
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
787 {
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
790
791 if (ramblock_is_ignored(rb)) {
792 return size;
793 }
794
795 return find_next_bit(bitmap, size, start);
796 }
797
/*
 * Lazily clear the memory-region dirty bitmap for the clear-bitmap chunk
 * containing @page.  clear_bmap_test_and_clear() makes this a one-shot
 * per chunk: only the first caller for a chunk performs the clearing.
 */
static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    /* Clear the whole chunk (2^shift target pages) containing @page */
    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}
824
825 static void
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
827 unsigned long start,
828 unsigned long npages)
829 {
830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
833
834 /*
835 * Clear pages from start to start + npages - 1, so the end boundary is
836 * exclusive.
837 */
838 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
839 migration_clear_memory_region_dirty_bitmap(rb, i);
840 }
841 }
842
/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the
 * contiguous dirty page run
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        /* No dirty page at all */
        return first;
    }
    /* Run extends to the first clean page after @first */
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}
877
/*
 * Clear the dirty bit of @page in @rb's migration bitmap, keeping the
 * memory-region dirty bitmap and rs->migration_dirty_pages consistent.
 *
 * Returns true if the page was dirty (i.e. the caller should send it).
 */
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}
901
/*
 * RamDiscardManager replay callback: clear all dirty bits covered by
 * @section and add the number of bits cleared to *opaque (a uint64_t).
 */
static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}
923
/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    /* Only blocks with a discard manager (and a bitmap) need any work */
    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        /* Replay every discarded sub-range through the clearing callback */
        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}
955
956 /*
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
959 *
960 * Note: The result is only stable while migrating (precopy/postcopy).
961 */
962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
963 {
964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
966 MemoryRegionSection section = {
967 .mr = rb->mr,
968 .offset_within_region = start,
969 .size = int128_make64(qemu_ram_pagesize(rb)),
970 };
971
972 return !ram_discard_manager_is_populated(rdm, &section);
973 }
974 return false;
975 }
976
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
979 {
980 uint64_t new_dirty_pages =
981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
982
983 rs->migration_dirty_pages += new_dirty_pages;
984 rs->num_dirty_pages_period += new_dirty_pages;
985 }
986
987 /**
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
989 *
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
991 *
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
995 */
996 uint64_t ram_pagesize_summary(void)
997 {
998 RAMBlock *block;
999 uint64_t summary = 0;
1000
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1002 summary |= block->page_size;
1003 }
1004
1005 return summary;
1006 }
1007
1008 uint64_t ram_get_total_transferred_pages(void)
1009 {
1010 return ram_counters.normal + ram_counters.duplicate +
1011 compression_counters.pages + xbzrle_counters.pages;
1012 }
1013
/*
 * Recompute the per-period rate statistics (dirty-page rate, xbzrle and
 * compression rates) for the bitmap-sync period ending at @end_time,
 * and roll the "*_prev" snapshots in @rs forward for the next period.
 */
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    /* No pages handled this period: the ratios below would divide by 0 */
    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                        rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
1065
1066 static void migration_trigger_throttle(RAMState *rs)
1067 {
1068 MigrationState *s = migrate_get_current();
1069 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1070
1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1074
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1084
1085 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1086 (++rs->dirty_rate_high_cnt >= 2)) {
1087 trace_migration_throttle();
1088 rs->dirty_rate_high_cnt = 0;
1089 mig_throttle_guest_down(bytes_dirty_period,
1090 bytes_dirty_threshold);
1091 }
1092 }
1093 }
1094
/*
 * migration_bitmap_sync: harvest dirty bits from the memory core into the
 * per-block migration bitmaps and refresh period statistics.
 *
 * Takes rs->bitmap_mutex around the per-block sync, with an RCU read lock
 * nested inside it to walk the RAMBlock list. Rate/throttle bookkeeping is
 * only refreshed when at least one second has elapsed since the last sync.
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    /* First sync: start the period clock now. */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* Lock order: bitmap_mutex first, then the RCU read section inside. */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1140
1141 static void migration_bitmap_sync_precopy(RAMState *rs)
1142 {
1143 Error *local_err = NULL;
1144
1145 /*
1146 * The current notifier usage is just an optimization to migration, so we
1147 * don't stop the normal migration process in the error case.
1148 */
1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1150 error_report_err(local_err);
1151 local_err = NULL;
1152 }
1153
1154 migration_bitmap_sync(rs);
1155
1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1157 error_report_err(local_err);
1158 }
1159 }
1160
1161 /**
1162 * save_zero_page_to_file: send the zero page to the file
1163 *
1164 * Returns the size of data written to the file, 0 means the page is not
1165 * a zero page
1166 *
1167 * @rs: current RAM state
1168 * @file: the file where the data is saved
1169 * @block: block that contains the page we want to send
1170 * @offset: offset inside the block for the page
1171 */
1172 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1173 RAMBlock *block, ram_addr_t offset)
1174 {
1175 uint8_t *p = block->host + offset;
1176 int len = 0;
1177
1178 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1179 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1180 qemu_put_byte(file, 0);
1181 len += 1;
1182 }
1183 return len;
1184 }
1185
1186 /**
1187 * save_zero_page: send the zero page to the stream
1188 *
1189 * Returns the number of pages written.
1190 *
1191 * @rs: current RAM state
1192 * @block: block that contains the page we want to send
1193 * @offset: offset inside the block for the page
1194 */
1195 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1196 {
1197 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1198
1199 if (len) {
1200 ram_counters.duplicate++;
1201 ram_counters.transferred += len;
1202 return 1;
1203 }
1204 return -1;
1205 }
1206
1207 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1208 {
1209 if (!migrate_release_ram() || !migration_in_postcopy()) {
1210 return;
1211 }
1212
1213 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1214 }
1215
1216 /*
1217 * @pages: the number of pages written by the control path,
1218 * < 0 - error
1219 * > 0 - number of pages written
1220 *
1221 * Return true if the pages has been saved, otherwise false is returned.
1222 */
1223 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1224 int *pages)
1225 {
1226 uint64_t bytes_xmit = 0;
1227 int ret;
1228
1229 *pages = -1;
1230 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1231 &bytes_xmit);
1232 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1233 return false;
1234 }
1235
1236 if (bytes_xmit) {
1237 ram_counters.transferred += bytes_xmit;
1238 *pages = 1;
1239 }
1240
1241 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1242 return true;
1243 }
1244
1245 if (bytes_xmit > 0) {
1246 ram_counters.normal++;
1247 } else if (bytes_xmit == 0) {
1248 ram_counters.duplicate++;
1249 }
1250
1251 return true;
1252 }
1253
1254 /*
1255 * directly send the page to the stream
1256 *
1257 * Returns the number of pages written.
1258 *
1259 * @rs: current RAM state
1260 * @block: block that contains the page we want to send
1261 * @offset: offset inside the block for the page
1262 * @buf: the page to be sent
1263 * @async: send to page asyncly
1264 */
1265 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1266 uint8_t *buf, bool async)
1267 {
1268 ram_counters.transferred += save_page_header(rs, rs->f, block,
1269 offset | RAM_SAVE_FLAG_PAGE);
1270 if (async) {
1271 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1272 migrate_release_ram() &
1273 migration_in_postcopy());
1274 } else {
1275 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1276 }
1277 ram_counters.transferred += TARGET_PAGE_SIZE;
1278 ram_counters.normal++;
1279 return 1;
1280 }
1281
1282 /**
1283 * ram_save_page: send the given page to the stream
1284 *
1285 * Returns the number of pages written.
1286 * < 0 - error
1287 * >=0 - Number of pages written - this might legally be 0
1288 * if xbzrle noticed the page was the same.
1289 *
1290 * @rs: current RAM state
1291 * @block: block that contains the page we want to send
1292 * @offset: offset inside the block for the page
1293 * @last_stage: if we are at the completion stage
1294 */
1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1296 {
1297 int pages = -1;
1298 uint8_t *p;
1299 bool send_async = true;
1300 RAMBlock *block = pss->block;
1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1302 ram_addr_t current_addr = block->offset + offset;
1303
1304 p = block->host + offset;
1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1306
1307 XBZRLE_cache_lock();
1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1309 pages = save_xbzrle_page(rs, &p, current_addr, block,
1310 offset, last_stage);
1311 if (!last_stage) {
1312 /* Can't send this cached data async, since the cache page
1313 * might get updated before it gets to the wire
1314 */
1315 send_async = false;
1316 }
1317 }
1318
1319 /* XBZRLE overflow or normal page */
1320 if (pages == -1) {
1321 pages = save_normal_page(rs, block, offset, p, send_async);
1322 }
1323
1324 XBZRLE_cache_unlock();
1325
1326 return pages;
1327 }
1328
1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1330 ram_addr_t offset)
1331 {
1332 if (multifd_queue_page(rs->f, block, offset) < 0) {
1333 return -1;
1334 }
1335 ram_counters.normal++;
1336
1337 return 1;
1338 }
1339
/*
 * do_compress_ram_page: compress one page and write it to @f (runs in a
 * compression thread's context, with that thread's private @stream and
 * @source_buf).
 *
 * Returns true if the page turned out to be a zero page (sent as such),
 * false otherwise (including the compression-write error case).
 *
 * Note: on a compression write error the function returns early and does
 * NOT go through ram_release_pages().
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    /* Zero pages are cheaper to detect and send than to compress. */
    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to a internal buffer to avoid it being modified by VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
1372
1373 static void
1374 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1375 {
1376 ram_counters.transferred += bytes_xmit;
1377
1378 if (param->zero_page) {
1379 ram_counters.duplicate++;
1380 return;
1381 }
1382
1383 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1384 compression_counters.compressed_size += bytes_xmit - 8;
1385 compression_counters.pages++;
1386 }
1387
1388 static bool save_page_use_compression(RAMState *rs);
1389
/*
 * flush_compressed_data: wait for all compression threads to finish their
 * in-flight pages and move each thread's buffered output into the main
 * migration stream.
 *
 * Serialized via comp_done_lock for the done-flag wait, then each thread's
 * own mutex while draining its file buffer.
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: wait until every worker has finished its current page. */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: drain each worker's buffered output into rs->f. */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1421
1422 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1423 ram_addr_t offset)
1424 {
1425 param->block = block;
1426 param->offset = offset;
1427 }
1428
/*
 * compress_page_with_multi_thread: hand one page to an idle compression
 * thread, draining that thread's previous output into the stream first.
 *
 * Returns 1 if a thread accepted the page, -1 if all threads were busy
 * (only possible when compress-wait-thread is off).
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    /* comp_done_lock protects the done flags and pairs with comp_done_cond. */
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Flush the thread's previous result before reusing its slot. */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1464
1465 /**
1466 * find_dirty_block: find the next dirty page and update any state
1467 * associated with the search process.
1468 *
1469 * Returns true if a page is found
1470 *
1471 * @rs: current RAM state
1472 * @pss: data about the state of the current dirty page scan
1473 * @again: set to false if the search has scanned the whole of RAM
1474 */
1475 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1476 {
1477 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1478 if (pss->complete_round && pss->block == rs->last_seen_block &&
1479 pss->page >= rs->last_page) {
1480 /*
1481 * We've been once around the RAM and haven't found anything.
1482 * Give up.
1483 */
1484 *again = false;
1485 return false;
1486 }
1487 if (!offset_in_ramblock(pss->block,
1488 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1489 /* Didn't find anything in this RAM Block */
1490 pss->page = 0;
1491 pss->block = QLIST_NEXT_RCU(pss->block, next);
1492 if (!pss->block) {
1493 /*
1494 * If memory migration starts over, we will meet a dirtied page
1495 * which may still exists in compression threads's ring, so we
1496 * should flush the compressed data to make sure the new page
1497 * is not overwritten by the old one in the destination.
1498 *
1499 * Also If xbzrle is on, stop using the data compression at this
1500 * point. In theory, xbzrle can do better than compression.
1501 */
1502 flush_compressed_data(rs);
1503
1504 /* Hit the end of the list */
1505 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1506 /* Flag that we've looped */
1507 pss->complete_round = true;
1508 /* After the first round, enable XBZRLE. */
1509 if (migrate_use_xbzrle()) {
1510 rs->xbzrle_enabled = true;
1511 }
1512 }
1513 /* Didn't find anything this time, but try again on the new block */
1514 *again = true;
1515 return false;
1516 } else {
1517 /* Can go around again, but... */
1518 *again = true;
1519 /* We've found something so probably don't need to */
1520 return true;
1521 }
1522 }
1523
1524 /**
1525 * unqueue_page: gets a page of the queue
1526 *
1527 * Helper for 'get_queued_page' - gets a page off the queue
1528 *
1529 * Returns the block of the page (or NULL if none available)
1530 *
1531 * @rs: current RAM state
1532 * @offset: used to return the offset within the RAMBlock
1533 */
1534 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1535 {
1536 RAMBlock *block = NULL;
1537
1538 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1539 return NULL;
1540 }
1541
1542 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1543 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1544 struct RAMSrcPageRequest *entry =
1545 QSIMPLEQ_FIRST(&rs->src_page_requests);
1546 block = entry->rb;
1547 *offset = entry->offset;
1548
1549 if (entry->len > TARGET_PAGE_SIZE) {
1550 entry->len -= TARGET_PAGE_SIZE;
1551 entry->offset += TARGET_PAGE_SIZE;
1552 } else {
1553 memory_region_unref(block->mr);
1554 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1555 g_free(entry);
1556 migration_consume_urgent_request();
1557 }
1558 }
1559
1560 return block;
1561 }
1562
1563 #if defined(__linux__)
1564 /**
1565 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1566 * is found, return RAM block pointer and page offset
1567 *
1568 * Returns pointer to the RAMBlock containing faulting page,
1569 * NULL if no write faults are pending
1570 *
1571 * @rs: current RAM state
1572 * @offset: page offset from the beginning of the block
1573 */
1574 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1575 {
1576 struct uffd_msg uffd_msg;
1577 void *page_address;
1578 RAMBlock *block;
1579 int res;
1580
1581 if (!migrate_background_snapshot()) {
1582 return NULL;
1583 }
1584
1585 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1586 if (res <= 0) {
1587 return NULL;
1588 }
1589
1590 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1591 block = qemu_ram_block_from_host(page_address, false, offset);
1592 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1593 return block;
1594 }
1595
1596 /**
1597 * ram_save_release_protection: release UFFD write protection after
1598 * a range of pages has been saved
1599 *
1600 * @rs: current RAM state
1601 * @pss: page-search-status structure
1602 * @start_page: index of the first page in the range relative to pss->block
1603 *
1604 * Returns 0 on success, negative value in case of an error
1605 */
1606 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1607 unsigned long start_page)
1608 {
1609 int res = 0;
1610
1611 /* Check if page is from UFFD-managed region. */
1612 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1613 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1614 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1615
1616 /* Flush async buffers before un-protect. */
1617 qemu_fflush(rs->f);
1618 /* Un-protect memory range. */
1619 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1620 false, false);
1621 }
1622
1623 return res;
1624 }
1625
1626 /* ram_write_tracking_available: check if kernel supports required UFFD features
1627 *
1628 * Returns true if supports, false otherwise
1629 */
1630 bool ram_write_tracking_available(void)
1631 {
1632 uint64_t uffd_features;
1633 int res;
1634
1635 res = uffd_query_features(&uffd_features);
1636 return (res == 0 &&
1637 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1638 }
1639
/* ram_write_tracking_compatible: check if guest configuration is
 * compatible with 'write-tracking'
 *
 * Probes by actually registering every eligible RAM block with a throwaway
 * userfaultfd and checking that UFFDIO_WRITEPROTECT is supported on it.
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        /* The kernel must advertise write-protect support for this range. */
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    /* Closing the fd implicitly unregisters all probed ranges. */
    uffd_close_fd(uffd_fd);
    return ret;
}
1682
1683 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1684 ram_addr_t size)
1685 {
1686 /*
1687 * We read one byte of each page; this will preallocate page tables if
1688 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1689 * where no page was populated yet. This might require adaption when
1690 * supporting other mappings, like shmem.
1691 */
1692 for (; offset < size; offset += block->page_size) {
1693 char tmp = *((char *)block->host + offset);
1694
1695 /* Don't optimize the read out */
1696 asm volatile("" : "+r" (tmp));
1697 }
1698 }
1699
1700 static inline int populate_read_section(MemoryRegionSection *section,
1701 void *opaque)
1702 {
1703 const hwaddr size = int128_get64(section->size);
1704 hwaddr offset = section->offset_within_region;
1705 RAMBlock *block = section->mr->ram_block;
1706
1707 populate_read_range(block, offset, size);
1708 return 0;
1709 }
1710
1711 /*
1712 * ram_block_populate_read: preallocate page tables and populate pages in the
1713 * RAM block by reading a byte of each page.
1714 *
1715 * Since it's solely used for userfault_fd WP feature, here we just
1716 * hardcode page size to qemu_real_host_page_size.
1717 *
1718 * @block: RAM block to populate
1719 */
1720 static void ram_block_populate_read(RAMBlock *rb)
1721 {
1722 /*
1723 * Skip populating all pages that fall into a discarded range as managed by
1724 * a RamDiscardManager responsible for the mapped memory region of the
1725 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1726 * must not get populated automatically. We don't have to track
1727 * modifications via userfaultfd WP reliably, because these pages will
1728 * not be part of the migration stream either way -- see
1729 * ramblock_dirty_bitmap_exclude_discarded_pages().
1730 *
1731 * Note: The result is only stable while migrating (precopy/postcopy).
1732 */
1733 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1734 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1735 MemoryRegionSection section = {
1736 .mr = rb->mr,
1737 .offset_within_region = 0,
1738 .size = rb->mr->size,
1739 };
1740
1741 ram_discard_manager_replay_populated(rdm, &section,
1742 populate_read_section, NULL);
1743 } else {
1744 populate_read_range(rb, 0, rb->used_length);
1745 }
1746 }
1747
1748 /*
1749 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1750 */
1751 void ram_write_tracking_prepare(void)
1752 {
1753 RAMBlock *block;
1754
1755 RCU_READ_LOCK_GUARD();
1756
1757 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1758 /* Nothing to do with read-only and MMIO-writable regions */
1759 if (block->mr->readonly || block->mr->rom_device) {
1760 continue;
1761 }
1762
1763 /*
1764 * Populate pages of the RAM block before enabling userfault_fd
1765 * write protection.
1766 *
1767 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1768 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1769 * pages with pte_none() entries in page table.
1770 */
1771 ram_block_populate_read(block);
1772 }
1773 }
1774
1775 /*
1776 * ram_write_tracking_start: start UFFD-WP memory tracking
1777 *
1778 * Returns 0 for success or negative value in case of error
1779 */
1780 int ram_write_tracking_start(void)
1781 {
1782 int uffd_fd;
1783 RAMState *rs = ram_state;
1784 RAMBlock *block;
1785
1786 /* Open UFFD file descriptor */
1787 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1788 if (uffd_fd < 0) {
1789 return uffd_fd;
1790 }
1791 rs->uffdio_fd = uffd_fd;
1792
1793 RCU_READ_LOCK_GUARD();
1794
1795 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1796 /* Nothing to do with read-only and MMIO-writable regions */
1797 if (block->mr->readonly || block->mr->rom_device) {
1798 continue;
1799 }
1800
1801 /* Register block memory with UFFD to track writes */
1802 if (uffd_register_memory(rs->uffdio_fd, block->host,
1803 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1804 goto fail;
1805 }
1806 /* Apply UFFD write protection to the block memory range */
1807 if (uffd_change_protection(rs->uffdio_fd, block->host,
1808 block->max_length, true, false)) {
1809 goto fail;
1810 }
1811 block->flags |= RAM_UF_WRITEPROTECT;
1812 memory_region_ref(block->mr);
1813
1814 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1815 block->host, block->max_length);
1816 }
1817
1818 return 0;
1819
1820 fail:
1821 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1822
1823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1824 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1825 continue;
1826 }
1827 /*
1828 * In case some memory block failed to be write-protected
1829 * remove protection and unregister all succeeded RAM blocks
1830 */
1831 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1832 false, false);
1833 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1834 /* Cleanup flags and remove reference */
1835 block->flags &= ~RAM_UF_WRITEPROTECT;
1836 memory_region_unref(block->mr);
1837 }
1838
1839 uffd_close_fd(uffd_fd);
1840 rs->uffdio_fd = -1;
1841 return -1;
1842 }
1843
1844 /**
1845 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1846 */
1847 void ram_write_tracking_stop(void)
1848 {
1849 RAMState *rs = ram_state;
1850 RAMBlock *block;
1851
1852 RCU_READ_LOCK_GUARD();
1853
1854 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1855 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1856 continue;
1857 }
1858 /* Remove protection and unregister all affected RAM blocks */
1859 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1860 false, false);
1861 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1862
1863 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1864 block->host, block->max_length);
1865
1866 /* Cleanup flags and remove reference */
1867 block->flags &= ~RAM_UF_WRITEPROTECT;
1868 memory_region_unref(block->mr);
1869 }
1870
1871 /* Finally close UFFD file descriptor */
1872 uffd_close_fd(rs->uffdio_fd);
1873 rs->uffdio_fd = -1;
1874 }
1875
1876 #else
1877 /* No target OS support, stubs just fail or ignore */
1878
1879 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1880 {
1881 (void) rs;
1882 (void) offset;
1883
1884 return NULL;
1885 }
1886
1887 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1888 unsigned long start_page)
1889 {
1890 (void) rs;
1891 (void) pss;
1892 (void) start_page;
1893
1894 return 0;
1895 }
1896
/* UFFD-based write tracking is never available on this host OS. */
bool ram_write_tracking_available(void)
{
    return false;
}
1901
/* Must never be called when ram_write_tracking_available() is false. */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1907
/* Must never be called when ram_write_tracking_available() is false. */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1913
/* Must never be called when ram_write_tracking_available() is false. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1918 #endif /* defined(__linux__) */
1919
1920 /**
1921 * get_queued_page: unqueue a page from the postcopy requests
1922 *
1923 * Skips pages that are already sent (!dirty)
1924 *
1925 * Returns true if a queued page is found
1926 *
1927 * @rs: current RAM state
1928 * @pss: data about the state of the current dirty page scan
1929 */
1930 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1931 {
1932 RAMBlock *block;
1933 ram_addr_t offset;
1934 bool dirty;
1935
1936 do {
1937 block = unqueue_page(rs, &offset);
1938 /*
1939 * We're sending this page, and since it's postcopy nothing else
1940 * will dirty it, and we must make sure it doesn't get sent again
1941 * even if this queue request was received after the background
1942 * search already sent it.
1943 */
1944 if (block) {
1945 unsigned long page;
1946
1947 page = offset >> TARGET_PAGE_BITS;
1948 dirty = test_bit(page, block->bmap);
1949 if (!dirty) {
1950 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1951 page);
1952 } else {
1953 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1954 }
1955 }
1956
1957 } while (block && !dirty);
1958
1959 if (!block) {
1960 /*
1961 * Poll write faults too if background snapshot is enabled; that's
1962 * when we have vcpus got blocked by the write protected pages.
1963 */
1964 block = poll_fault_page(rs, &offset);
1965 }
1966
1967 if (block) {
1968 /*
1969 * We want the background search to continue from the queued page
1970 * since the guest is likely to want other pages near to the page
1971 * it just requested.
1972 */
1973 pss->block = block;
1974 pss->page = offset >> TARGET_PAGE_BITS;
1975
1976 /*
1977 * This unqueued page would break the "one round" check, even is
1978 * really rare.
1979 */
1980 pss->complete_round = false;
1981 }
1982
1983 return !!block;
1984 }
1985
1986 /**
1987 * migration_page_queue_free: drop any remaining pages in the ram
1988 * request queue
1989 *
1990 * It should be empty at the end anyway, but in error cases there may
1991 * be some left. in case that there is any page left, we drop it.
1992 *
1993 */
1994 static void migration_page_queue_free(RAMState *rs)
1995 {
1996 struct RAMSrcPageRequest *mspr, *next_mspr;
1997 /* This queue generally should be empty - but in the case of a failed
1998 * migration might have some droppings in.
1999 */
2000 RCU_READ_LOCK_GUARD();
2001 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2002 memory_region_unref(mspr->rb->mr);
2003 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2004 g_free(mspr);
2005 }
2006 }
2007
2008 /**
2009 * ram_save_queue_pages: queue the page for transmission
2010 *
2011 * A request from postcopy destination for example.
2012 *
2013 * Returns zero on success or negative on error
2014 *
2015 * @rbname: Name of the RAMBLock of the request. NULL means the
2016 * same that last one.
2017 * @start: starting address from the start of the RAMBlock
2018 * @len: length (in bytes) to send
2019 */
2020 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2021 {
2022 RAMBlock *ramblock;
2023 RAMState *rs = ram_state;
2024
2025 ram_counters.postcopy_requests++;
2026 RCU_READ_LOCK_GUARD();
2027
2028 if (!rbname) {
2029 /* Reuse last RAMBlock */
2030 ramblock = rs->last_req_rb;
2031
2032 if (!ramblock) {
2033 /*
2034 * Shouldn't happen, we can't reuse the last RAMBlock if
2035 * it's the 1st request.
2036 */
2037 error_report("ram_save_queue_pages no previous block");
2038 return -1;
2039 }
2040 } else {
2041 ramblock = qemu_ram_block_by_name(rbname);
2042
2043 if (!ramblock) {
2044 /* We shouldn't be asked for a non-existent RAMBlock */
2045 error_report("ram_save_queue_pages no block '%s'", rbname);
2046 return -1;
2047 }
2048 rs->last_req_rb = ramblock;
2049 }
2050 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2051 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2052 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2053 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2054 __func__, start, len, ramblock->used_length);
2055 return -1;
2056 }
2057
2058 struct RAMSrcPageRequest *new_entry =
2059 g_malloc0(sizeof(struct RAMSrcPageRequest));
2060 new_entry->rb = ramblock;
2061 new_entry->offset = start;
2062 new_entry->len = len;
2063
2064 memory_region_ref(ramblock->mr);
2065 qemu_mutex_lock(&rs->src_page_req_mutex);
2066 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2067 migration_make_urgent_request();
2068 qemu_mutex_unlock(&rs->src_page_req_mutex);
2069
2070 return 0;
2071 }
2072
2073 static bool save_page_use_compression(RAMState *rs)
2074 {
2075 if (!migrate_use_compression()) {
2076 return false;
2077 }
2078
2079 /*
2080 * If xbzrle is enabled (e.g., after first round of migration), stop
2081 * using the data compression. In theory, xbzrle can do better than
2082 * compression.
2083 */
2084 if (rs->xbzrle_enabled) {
2085 return false;
2086 }
2087
2088 return true;
2089 }
2090
2091 /*
2092 * try to compress the page before posting it out, return true if the page
2093 * has been properly handled by compression, otherwise needs other
2094 * paths to handle it
2095 */
2096 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2097 {
2098 if (!save_page_use_compression(rs)) {
2099 return false;
2100 }
2101
2102 /*
2103 * When starting the process of a new block, the first page of
2104 * the block should be sent out before other pages in the same
2105 * block, and all the pages in last block should have been sent
2106 * out, keeping this order is important, because the 'cont' flag
2107 * is used to avoid resending the block name.
2108 *
2109 * We post the fist page as normal page as compression will take
2110 * much CPU resource.
2111 */
2112 if (block != rs->last_sent_block) {
2113 flush_compressed_data(rs);
2114 return false;
2115 }
2116
2117 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2118 return true;
2119 }
2120
2121 compression_counters.busy++;
2122 return false;
2123 }
2124
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * The send paths below are tried strictly in order: control (RDMA)
 * hook, compression, zero-page detection, multifd, and finally the
 * plain/xbzrle path.  The order matters; see the inline comments.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    /* A registered hook (e.g. RDMA) may take over the page entirely */
    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}
2176
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* Number of target pages that make up one host page for this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    /* First target page of the NEXT host page (exclusive limit) */
    unsigned long hostpage_boundary =
        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            tmppages = ram_save_target_page(rs, pss, last_stage);
            if (tmppages < 0) {
                return tmppages;
            }

            pages += tmppages;
            /*
             * Allow rate limiting to happen in the middle of huge pages if
             * something is sent in the current iteration.
             */
            if (pagesize_bits > 1 && tmppages > 0) {
                migration_rate_limit();
            }
        }
        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    } while ((pss->page < hostpage_boundary) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the min boundary of host page and block */
    pss->page = MIN(pss->page, hostpage_boundary) - 1;

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
2238
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    /* Resume the scan where the previous call left off */
    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        /* Explicit postcopy page requests take priority over the scan */
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    /* Remember where we were, for the next call */
    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
2292
2293 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2294 {
2295 uint64_t pages = size / TARGET_PAGE_SIZE;
2296
2297 if (zero) {
2298 ram_counters.duplicate += pages;
2299 } else {
2300 ram_counters.normal += pages;
2301 ram_counters.transferred += size;
2302 qemu_update_position(f, size);
2303 }
2304 }
2305
/*
 * Sum of used_length over RAM blocks.
 *
 * @count_ignored: when true, iterate all migratable blocks; when false,
 * skip blocks whose contents are not migrated (the "ignored" ones).
 */
static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    RCU_READ_LOCK_GUARD();

    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    return total;
}
2324
/* Total bytes of RAM whose contents will be migrated (ignored blocks excluded). */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
2329
/* Allocate the destination-side scratch buffer used for XBZRLE decoding. */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2334
/* Free the XBZRLE decode buffer; NULL it so a later cleanup is a no-op. */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2340
/* Tear down a RAMState: drain the page-request queue, destroy its locks,
 * free it and clear the caller's pointer (safe to call twice). */
static void ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}
2351
/*
 * Free all source-side XBZRLE state.  Taken under XBZRLE_cache_lock so
 * we don't race with a concurrent cache access/resize; all pointers are
 * NULLed so a second call is harmless.
 */
static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
2367
/*
 * Save-side cleanup hook: stop dirty logging (when it was started),
 * free the per-block migration bitmaps, and release XBZRLE, the
 * compression threads and the RAMState itself.
 */
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* We don't use dirty log with background snapshots */
    if (!migrate_background_snapshot()) {
        /* caller holds the iothread lock or is in a bh, so there is
         * no writing race against the migration bitmap
         */
        if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
            /*
             * do not stop dirty log without starting it, since
             * memory_global_dirty_log_stop will assert that
             * memory_global_dirty_log_start/stop used in pairs
             */
            memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
        }
    }

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
2399
/* Reset the per-round scan state so the next pass restarts from the
 * first block/page; also re-snapshot the ram_list version. */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->xbzrle_enabled = false;
}
2408
2409 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2410
/*
 * Dump @todump (which must be non-NULL) to stderr, 128 bits per line.
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2444
2445 /* **** functions for postcopy ***** */
2446
/*
 * Discard (on the source) every run of pages that is CLEAN in the
 * migration bitmap, i.e. pages already transferred to the destination
 * during postcopy, so the source stops keeping them resident.
 */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        /* Runs of zero bits = pages no longer dirty = already sent */
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
2466
2467 /**
2468 * postcopy_send_discard_bm_ram: discard a RAMBlock
2469 *
2470 * Returns zero on success
2471 *
2472 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2473 *
2474 * @ms: current migration state
2475 * @block: RAMBlock to discard
2476 */
2477 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2478 {
2479 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2480 unsigned long current;
2481 unsigned long *bitmap = block->bmap;
2482
2483 for (current = 0; current < end; ) {
2484 unsigned long one = find_next_bit(bitmap, end, current);
2485 unsigned long zero, discard_length;
2486
2487 if (one >= end) {
2488 break;
2489 }
2490
2491 zero = find_next_zero_bit(bitmap, end, one + 1);
2492
2493 if (zero >= end) {
2494 discard_length = end - one;
2495 } else {
2496 discard_length = zero - one;
2497 }
2498 postcopy_discard_send_range(ms, one, discard_length);
2499 current = one + discard_length;
2500 }
2501
2502 return 0;
2503 }
2504
/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 * Calls postcopy_send_discard_bm_ram for each RAMBlock
 * passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 * which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, block);
        /* Finish (flush) the per-block message even on error */
        postcopy_discard_send_finish(ms);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
2540
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix. This function canonicalizes the bitmap: any host
 * page that is only partially dirty is re-marked fully dirty.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Number of target pages per host page */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2607
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty. In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    postcopy_discard_send_finish(ms);
    return 0;
}
2634
/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    return postcopy_each_ram_send_discard(ms);
}
2682
/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request (must not be NULL)
 * @start: byte offset within the RAMBlock
 * @length: length in bytes to discard
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}
2716
/*
 * Allocate all source-side XBZRLE state (zero page, cache, encode and
 * scratch buffers).  For every allocation, we will try not to crash
 * the VM if the allocation failed: on any failure everything allocated
 * so far is unwound via the goto chain and -ENOMEM is returned.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        /* Feature disabled: nothing to allocate, success */
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
2773
/*
 * Allocate and initialize a fresh RAMState in *rsp.
 * Returns 0 on success, -1 on allocation failure.
 */
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match with the initial values of dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}
2797
/*
 * Allocate the per-RAMBlock dirty bitmaps (bmap, all ones) and the
 * clear_bmap, clamping the user-supplied clear_bitmap_shift into the
 * supported range first.
 */
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            /* Size by max_length so the bitmap survives block resizes */
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when rebegin a
             * new migration after a failed migration, ram_list.
             * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
             * guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}
2836
2837 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2838 {
2839 unsigned long pages;
2840 RAMBlock *rb;
2841
2842 RCU_READ_LOCK_GUARD();
2843
2844 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2845 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2846 rs->migration_dirty_pages -= pages;
2847 }
2848 }
2849
/*
 * Set up all dirty-tracking state for an outgoing migration: allocate
 * the bitmaps, start global dirty logging (unless doing a background
 * snapshot), run the first bitmap sync, then drop discarded pages.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}
2873
/*
 * Initialize everything the RAM save path needs: RAMState, XBZRLE and
 * the dirty bitmaps.  Returns 0 on success, -1 on failure (with any
 * partially-created state already cleaned up).
 */
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        /* Don't leak the RAMState allocated above */
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}
2889
/*
 * Prepare RAMState for a postcopy recovery: recount the dirty pages
 * from the existing bitmaps, reset the scan state and rebind the
 * output file.
 */
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    ram_state_reset(rs);

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}
2916
/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    /* A hint may span several RAMBlocks; handle one block per iteration */
    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        /*
         * The skipped free pages are equivalent to be sent from clear_bmap's
         * perspective, so clear the bits from the memory region bitmap which
         * are initially set. Otherwise those skipped pages will be sent in
         * the next round after syncing from the memory region bitmap.
         */
        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}
2970
2971 /*
2972 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2973 * long-running RCU critical section. When rcu-reclaims in the code
2974 * start to become numerous it will be necessary to reduce the
2975 * granularity of these critical sections.
2976 */
2977
/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * Starts the compression threads, initializes all RAM state (unless
 * resuming in COLO, which already set up the bitmap), and sends the
 * block catalogue (names, lengths, optional page sizes/addresses)
 * followed by an EOS marker.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
3030
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 * (on success, actually returns 1 when no dirty pages were left,
 * 0 when stopped by rate limiting / MAX_WAIT)
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least released it in a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            /* Block list changed (hotplug etc.); restart the scan */
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Keep sending while under the rate limit, or while postcopy
         * page requests are pending (those are always urgent) */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account the 8-byte EOS marker we just wrote */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3146
/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        if (!migration_in_postcopy()) {
            /* Final sync: in postcopy the bitmap was synced at the switch */
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to send */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}
3199
/*
 * ram_save_pending: estimate how much RAM is still left to send
 *
 * The estimate is migration_dirty_pages * TARGET_PAGE_SIZE.  If that is
 * already below @max_size (and we are not in postcopy), re-sync the dirty
 * bitmap first — under the iothread lock and an RCU read section — so the
 * caller decides on fresh numbers whether it can converge.
 *
 * @f: QEMUFile (unused here)
 * @opaque: RAMState pointer (passed as RAMState **)
 * @max_size: threshold below which a bitmap re-sync is worthwhile
 * @res_precopy_only: out, bytes that must be sent before switchover
 * @res_compatible: out, bytes that can be sent either pre- or postcopy
 * @res_postcopy_only: out, unused by RAM (no postcopy-only data)
 */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}
3228
/*
 * load_xbzrle: read one XBZRLE-encoded page from the stream and apply it
 *
 * The wire format is: 1 byte of flags (must be ENCODING_FLAG_XBZRLE),
 * a be16 length, then the encoded delta.  The delta is decoded in place
 * on top of the existing page contents at @host.
 *
 * Returns 0 on success, -1 on any validation or decode failure.
 *
 * @f: QEMUFile to read from
 * @addr: guest address (unused; kept for the caller's signature)
 * @host: host pointer to the page being updated
 */
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    /* An encoded page can never exceed one target page. */
    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
3262
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list, or NULL on a
 * malformed stream / unknown or non-migratable block (an error is
 * reported in that case).
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    /*
     * Remembers the block resolved by the previous call so that
     * RAM_SAVE_FLAG_CONTINUE pages need not repeat the id on the wire.
     * NOTE(review): being 'static' this assumes a single incoming stream
     * uses this function at a time.
     */
    static RAMBlock *block;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        /* CONTINUE before any block id was ever sent: broken stream. */
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    /* Wire format: 1-byte id length followed by the unterminated id. */
    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}
3304
3305 static inline void *host_from_ram_block_offset(RAMBlock *block,
3306 ram_addr_t offset)
3307 {
3308 if (!offset_in_ramblock(block, offset)) {
3309 return NULL;
3310 }
3311
3312 return block->host + offset;
3313 }
3314
3315 static void *host_page_from_ram_block_offset(RAMBlock *block,
3316 ram_addr_t offset)
3317 {
3318 /* Note: Explicitly no check against offset_in_ramblock(). */
3319 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3320 block->page_size);
3321 }
3322
3323 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3324 ram_addr_t offset)
3325 {
3326 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3327 }
3328
/*
 * colo_cache_from_block_offset: translate @offset into a pointer inside
 * @block's COLO cache (the secondary VM's shadow copy of RAM).
 *
 * Returns NULL when @offset is outside the block or the cache was never
 * allocated.  When @record_bitmap is true, additionally marks the page
 * dirty in block->bmap and bumps the global dirty-page counter the first
 * time it is seen.
 */
static inline void *colo_cache_from_block_offset(RAMBlock *block,
                             ram_addr_t offset, bool record_bitmap)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need bitmap of these migrated pages.
     * It help us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     * test_and_set_bit() ensures each page is only counted once in
     * migration_dirty_pages even if received multiple times.
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}
3352
3353 /**
3354 * ram_handle_compressed: handle the zero page case
3355 *
3356 * If a page (or a whole RDMA chunk) has been
3357 * determined to be zero, then zap it.
3358 *
3359 * @host: host address for the zero page
3360 * @ch: what the page is filled from. We only support zero
3361 * @size: size of the zero page
3362 */
3363 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3364 {
3365 if (ch != 0 || !buffer_is_zero(host, size)) {
3366 memset(host, ch, size);
3367 }
3368 }
3369
3370 /* return the size after decompression, or negative value on error */
3371 static int
3372 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3373 const uint8_t *source, size_t source_len)
3374 {
3375 int err;
3376
3377 err = inflateReset(stream);
3378 if (err != Z_OK) {
3379 return -1;
3380 }
3381
3382 stream->avail_in = source_len;
3383 stream->next_in = (uint8_t *)source;
3384 stream->avail_out = dest_len;
3385 stream->next_out = dest;
3386
3387 err = inflate(stream, Z_NO_FLUSH);
3388 if (err != Z_STREAM_END) {
3389 return -1;
3390 }
3391
3392 return stream->total_out;
3393 }
3394
/*
 * do_data_decompress: decompression worker thread body
 *
 * Loops until param->quit is set.  Work arrival is signalled on
 * param->cond; param->des (destination host pointer) doubles as the
 * "work pending" flag and is cleared before releasing param->mutex so
 * the producer can queue the next page.  Completion is published via
 * param->done under decomp_done_lock and decomp_done_cond, which
 * wait_for_decompress_done() and decompress_data_with_multi_threads()
 * listen on.  Decompression errors are propagated into decomp_file.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the job locally, then drop the lock while inflating. */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Mark this worker idle and wake anyone waiting for it. */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* No work queued: sleep until signalled (or told to quit). */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3433
3434 static int wait_for_decompress_done(void)
3435 {
3436 int idx, thread_count;
3437
3438 if (!migrate_use_compression()) {
3439 return 0;
3440 }
3441
3442 thread_count = migrate_decompress_threads();
3443 qemu_mutex_lock(&decomp_done_lock);
3444 for (idx = 0; idx < thread_count; idx++) {
3445 while (!decomp_param[idx].done) {
3446 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3447 }
3448 }
3449 qemu_mutex_unlock(&decomp_done_lock);
3450 return qemu_file_get_error(decomp_file);
3451 }
3452
/*
 * compress_threads_load_cleanup: tear down the decompression workers
 *
 * Two-phase shutdown: first set 'quit' and wake every initialised
 * worker, then join each thread and release its resources.  Doing it in
 * two passes lets all workers shut down in parallel instead of joining
 * them one by one.  Safe to call on a partially set-up state (see
 * compress_threads_load_setup()'s error path): compbuf == NULL marks
 * the first slot that was never initialised.
 */
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as a indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    /* g_free(NULL) is a no-op, so this is safe even if setup never ran. */
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}
3493
/*
 * compress_threads_load_setup: start the decompression worker threads
 *
 * Allocates per-thread state, initialises each zlib stream and spawns
 * one joinable worker per configured decompress thread.  'done' starts
 * true (idle) and 'quit' false.  On any failure the partially built
 * state is torn down via compress_threads_load_cleanup(), which relies
 * on compbuf being NULL for slots never reached.
 *
 * Returns 0 on success (or when compression is disabled), -1 on error.
 *
 * @f: QEMUFile errors from workers are reported against (decomp_file)
 */
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        /* compressBound() gives the worst-case compressed page size. */
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}
3527
/*
 * decompress_data_with_multi_threads: hand one compressed page to a
 * worker thread
 *
 * Scans for an idle worker (done == true), reads the @len compressed
 * bytes from @f into that worker's compbuf, publishes the destination
 * pointer/length and signals the worker.  If every worker is busy,
 * sleeps on decomp_done_cond until one finishes and rescans.  Returns
 * once the page has been queued (not decompressed) — callers use
 * wait_for_decompress_done() for completion.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    /* decomp_done_lock protects every worker's 'done' flag. */
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        /* idx < thread_count means a worker took the job. */
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}
3555
/* Initialise the global ram_state used by the COLO incoming side. */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3560
/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM, it is necessary to hold the global lock
 * to call this helper.
 *
 * Returns 0 on success; on allocation failure, frees every cache
 * already allocated and returns -errno.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                /* Roll back: release every cache allocated so far. */
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                /*
                 * NOTE(review): assumes errno is still set by the failed
                 * allocation at this point — verify against
                 * qemu_anon_ram_alloc().
                 */
                return -errno;
            }
            /* Keep the (potentially huge) cache out of core dumps. */
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
     * with to decide which page in cache should be flushed into SVM's RAM. Here
     * we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}
3610
/*
 * Start dirty-page logging on the COLO incoming (secondary) side.
 *
 * Syncs and then discards any pre-existing dirty records so that
 * logging starts from a clean slate, and resets the dirty-page counter.
 *
 * TODO: duplicated with ram_init_bitmaps
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3632
/*
 * Release the COLO RAM cache and dirty bitmaps and stop dirty logging.
 * It is necessary to hold the global lock to call this helper.
 */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}
3654
3655 /**
3656 * ram_load_setup: Setup RAM for migration incoming side
3657 *
3658 * Returns zero to indicate success and negative for error
3659 *
3660 * @f: QEMUFile where to receive the data
3661 * @opaque: RAMState pointer
3662 */
3663 static int ram_load_setup(QEMUFile *f, void *opaque)
3664 {
3665 if (compress_threads_load_setup(f)) {
3666 return -1;
3667 }
3668
3669 xbzrle_load_setup();
3670 ramblock_recv_map_init();
3671
3672 return 0;
3673 }
3674
3675 static int ram_load_cleanup(void *opaque)
3676 {
3677 RAMBlock *rb;
3678
3679 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3680 qemu_ram_block_writeback(rb);
3681 }
3682
3683 xbzrle_load_cleanup();
3684 compress_threads_load_cleanup();
3685
3686 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3687 g_free(rb->receivedmap);
3688 rb->receivedmap = NULL;
3689 }
3690
3691 return 0;
3692 }
3693
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly names
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    /* Thin wrapper: all the work happens in postcopy-ram.c. */
    return postcopy_ram_incoming_init(mis);
}
3709
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Pages are accumulated in a temporary buffer until a whole *host* page
 * (which may span several target pages, e.g. on hugetlbfs) has been
 * received, and only then placed atomically via userfaultfd.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Start of the host page currently being assembled. */
    void *host_page = NULL;
    /* True while every target page of the current host page is zero. */
    bool all_zero = true;
    /* Target pages received so far for the current host page. */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* Low bits of the address word carry the page flags. */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (target_pages == 1) {
                /* First target page of a new host page. */
                host_page = host_page_from_ram_block_offset(block, addr);
            } else if (host_page != host_page_from_ram_block_offset(block,
                                                                    addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page %p/%p", host_page,
                             host_page_from_ram_block_offset(block, addr));
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            /* Asynchronous: completion is awaited before placement below. */
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (all_zero) {
                ret = postcopy_place_page_zero(mis, host_page, block);
            } else {
                ret = postcopy_place_page(mis, host_page, place_source,
                                          block);
            }
            /* Reset the per-host-page accumulation state. */
            place_needed = false;
            target_pages = 0;
            /* Assume we have a zero page until we detect something different */
            all_zero = true;
        }
    }

    return ret;
}
3891
3892 static bool postcopy_is_advised(void)
3893 {
3894 PostcopyState ps = postcopy_state_get();
3895 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3896 }
3897
3898 static bool postcopy_is_running(void)
3899 {
3900 PostcopyState ps = postcopy_state_get();
3901 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3902 }
3903
/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that be dirtied by PVM or SVM or both.
 *
 * Walks every block's dirty bitmap, clears the bits and copies the
 * corresponding runs of pages from colo_cache into the guest's RAM.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    /* Pull the latest dirty info into the per-block bitmaps first. */
    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            /* Find the next run of dirty pages in this block. */
            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                /* Past the end of this block: move on to the next one. */
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                /* Copy the whole dirty run in one memcpy. */
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}
3952
3953 /**
3954 * ram_load_precopy: load pages in precopy case
3955 *
3956 * Returns 0 for success or -errno in case of error
3957 *
3958 * Called in precopy mode by ram_load().
3959 * rcu_read_lock is taken prior to this being called.
3960 *
3961 * @f: QEMUFile where to send the data
3962 */
3963 static int ram_load_precopy(QEMUFile *f)
3964 {
3965 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3966 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3967 bool postcopy_advised = postcopy_is_advised();
3968 if (!migrate_use_compression()) {
3969 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3970 }
3971
3972 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3973 ram_addr_t addr, total_ram_bytes;
3974 void *host = NULL, *host_bak = NULL;
3975 uint8_t ch;
3976
3977 /*
3978 * Yield periodically to let main loop run, but an iteration of
3979 * the main loop is expensive, so do it each some iterations
3980 */
3981 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3982 aio_co_schedule(qemu_get_current_aio_context(),
3983 qemu_coroutine_self());
3984 qemu_coroutine_yield();
3985 }
3986 i++;
3987
3988 addr = qemu_get_be64(f);
3989 flags = addr & ~TARGET_PAGE_MASK;
3990 addr &= TARGET_PAGE_MASK;
3991
3992 if (flags & invalid_flags) {
3993 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3994 error_report("Received an unexpected compressed page");
3995 }
3996
3997 ret = -EINVAL;
3998 break;
3999 }
4000
4001 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4002 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4003 RAMBlock *block = ram_block_from_stream(f, flags);
4004
4005 host = host_from_ram_block_offset(block, addr);
4006 /*
4007 * After going into COLO stage, we should not load the page
4008 * into SVM's memory directly, we put them into colo_cache firstly.
4009 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4010 * Previously, we copied all these memory in preparing stage of COLO
4011 * while we need to stop VM, which is a time-consuming process.
4012 * Here we optimize it by a trick, back-up every page while in
4013 * migration process while COLO is enabled, though it affects the
4014 * speed of the migration, but it obviously reduce the downtime of
4015 * back-up all SVM'S memory in COLO preparing stage.
4016 */
4017 if (migration_incoming_colo_enabled()) {
4018 if (migration_incoming_in_colo_state()) {
4019 /* In COLO stage, put all pages into cache temporarily */
4020 host = colo_cache_from_block_offset(block, addr, true);
4021 } else {
4022 /*
4023 * In migration stage but before COLO stage,
4024 * Put all pages into both cache and SVM's memory.
4025 */
4026 host_bak = colo_cache_from_block_offset(block, addr, false);
4027 }
4028 }
4029 if (!host) {
4030 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4031 ret = -EINVAL;
4032 break;
4033 }
4034 if (!migration_incoming_in_colo_state()) {
4035 ramblock_recv_bitmap_set(block, host);
4036 }
4037
4038 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4039 }
4040
4041 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4042 case RAM_SAVE_FLAG_MEM_SIZE:
4043 /* Synchronize RAM block list */
4044 total_ram_bytes = addr;
4045 while (!ret && total_ram_bytes) {
4046 RAMBlock *block;
4047 char id[256];
4048 ram_addr_t length;
4049
4050 len = qemu_get_byte(f);
4051 qemu_get_buffer(f, (uint8_t *)id, len);
4052 id[len] = 0;
4053 length = qemu_get_be64(f);
4054
4055 block = qemu_ram_block_by_name(id);
4056 if (block && !qemu_ram_is_migratable(block)) {
4057 error_report("block %s should not be migrated !", id);
4058 ret = -EINVAL;
4059 } else if (block) {
4060 if (length != block->used_length) {
4061 Error *local_err = NULL;
4062
4063 ret = qemu_ram_resize(block, length,