Update version for v6.0.0-rc3 release
[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64
65 /***********************************************************/
66 /* ram save/restore */
67
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value. And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */
73
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02 /* Page consists entirely of zero bytes */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* Page is in the same block as the last one */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* Page data is XBZRLE encoded */
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
83
/* Returns true if the 'size'-byte buffer at 'p' contains only zero bytes */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

/* Accounting for the XBZRLE cache: hits, misses, overflows, bytes, rates */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Serializes access to 'cache' (taken via XBZRLE_cache_lock()) */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
106
/*
 * Take XBZRLE.lock, but only when XBZRLE is in use for this migration;
 * otherwise there is no cache state to protect.
 */
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

/* Counterpart of XBZRLE_cache_lock() */
static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
120
121 /**
122 * xbzrle_cache_resize: resize the xbzrle cache
123 *
124 * This function is called from qmp_migrate_set_cache_size in main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock().
128 *
129 * Returns 0 for success or -1 for error
130 *
131 * @new_size: new cache size
132 * @errp: set *errp if the check failed, with reason
133 */
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
135 {
136 PageCache *new_cache;
137 int64_t ret = 0;
138
139 /* Check for truncation */
140 if (new_size != (size_t)new_size) {
141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142 "exceeding address space");
143 return -1;
144 }
145
146 if (new_size == migrate_xbzrle_cache_size()) {
147 /* nothing to do */
148 return 0;
149 }
150
151 XBZRLE_cache_lock();
152
153 if (XBZRLE.cache != NULL) {
154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
155 if (!new_cache) {
156 ret = -1;
157 goto out;
158 }
159
160 cache_fini(XBZRLE.cache);
161 XBZRLE.cache = new_cache;
162 }
163 out:
164 XBZRLE_cache_unlock();
165 return ret;
166 }
167
168 bool ramblock_is_ignored(RAMBlock *block)
169 {
170 return !qemu_ram_is_migratable(block) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 }
173
174 #undef RAMBLOCK_FOREACH
175
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177 {
178 RAMBlock *block;
179 int ret = 0;
180
181 RCU_READ_LOCK_GUARD();
182
183 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184 ret = func(block, opaque);
185 if (ret) {
186 break;
187 }
188 }
189 return ret;
190 }
191
/*
 * Allocate the receivedmap (one bit per target page) for every
 * migratable RAMBlock; asserts each map is allocated only once.
 */
static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

/* Test whether the page containing 'host_addr' is marked as received */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

/* As above, but takes a byte offset into the block instead of a host address */
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

/* Atomically mark the page containing 'host_addr' as received */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

/* Atomically mark 'nr' consecutive pages starting at 'host_addr' as received */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
225
/* Sentinel appended after the bitmap so the receiver can detect corruption */
#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes), followed by
 * the RAMBLOCK_RECV_BITMAP_ENDING sentinel (8 bytes, not counted in
 * the returned byte count).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    /* One bit per target page of the block's used length */
    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always send the bitmap in little endian, so the stream stays
     * well defined even when source and destination hosts differ in
     * endianness.
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    /* Propagate any stream error (negative value) to the caller */
    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
289
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* Block the requested pages belong to */
    RAMBlock *rb;
    /* Byte offset and length of the requested range within 'rb' */
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
359
/* Singleton migration RAM state; NULL when no migration is active */
static RAMState *ram_state;

/* Notifiers called around precopy events (see PrecopyNotifyReason) */
static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

/* Register 'n' to be notified of precopy events */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

/*
 * Notify all registered precopy notifiers of 'reason'; returns the
 * result of the list notification (non-zero indicates a notifier
 * reported failure, with details in *errp).
 */
int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

/*
 * Enable the free page optimization for the current migration; no-op
 * when no migration state exists yet.
 */
void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

/* Bytes still to be sent, derived from the dirty page count */
uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
415
CompressionStats compression_counters;

/* Per-thread state for a compression worker */
struct CompressParam {
    /* Set by the worker when the current page has been processed */
    bool done;
    /* Ask the worker thread to exit */
    bool quit;
    /* Result: the page turned out to be entirely zero */
    bool zero_page;
    /* Buffer file the compressed data is written into */
    QEMUFile *file;
    /* Protects the block/offset/quit handover with the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* Work item: page at 'offset' inside 'block'; block == NULL means idle */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

/* Per-thread state for a decompression worker */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* Destination address for the decompressed data */
    void *des;
    /* Compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
465
/*
 * Compression worker thread body.
 *
 * Waits on param->cond for a work item (param->block != NULL),
 * compresses the page into param->file, then publishes completion
 * under comp_done_lock so the migration thread can collect the
 * result. Exits when param->quit is set.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the work item and drop the lock while compressing */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result under comp_done_lock and wake the consumer */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
499
/* Stop, join and free all compression worker threads and their state */
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        /* Ask the worker to quit and wake it in case it is waiting */
        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}
538
/*
 * Spawn the compression worker threads.
 *
 * Returns 0 on success (or when compression is disabled), -1 on
 * failure; partially created state is torn down through
 * compress_threads_save_cleanup().
 */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
581
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state (tracks the last block sent)
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    /* Same block as last time: the receiver can reuse the block id */
    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        /* New block: send idstr length + idstr and remember the block */
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
614
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
/*
 * @bytes_dirty_period: bytes dirtied by the guest during the last period
 * @bytes_dirty_threshold: dirty-byte level above which throttling triggered
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            /* With tailslow, never step further than the plain increment */
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        /* Cap at the configured maximum throttle percentage */
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
654
655 /**
656 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
657 *
658 * @rs: current RAM state
659 * @current_addr: address for the zero page
660 *
661 * Update the xbzrle cache to reflect a page that's been sent as all 0.
662 * The important thing is that a stale (not-yet-0'd) page be replaced
663 * by the new data.
664 * As a bonus, if the page wasn't in the cache it gets added so that
665 * when a small write is made into the 0'd page it gets XBZRLE sent.
666 */
667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
668 {
669 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
670 return;
671 }
672
673 /* We don't care if this fails to allocate a new cache page
674 * as long as it updated an old one */
675 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
676 ram_counters.dirty_sync_count);
677 }
678
/* Marker byte preceding the encoded length on the wire */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /*
     * Cache miss: insert the page for next time (unless completing)
     * and tell the caller to send it as a normal page.
     */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
781
782 /**
783 * migration_bitmap_find_dirty: find the next dirty page from start
784 *
785 * Returns the page offset within memory region of the start of a dirty page
786 *
787 * @rs: current RAM state
788 * @rb: RAMBlock where to search for dirty pages
789 * @start: page where we start the search
790 */
791 static inline
792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
793 unsigned long start)
794 {
795 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796 unsigned long *bitmap = rb->bmap;
797 unsigned long next;
798
799 if (ramblock_is_ignored(rb)) {
800 return size;
801 }
802
803 /*
804 * When the free page optimization is enabled, we need to check the bitmap
805 * to send the non-free pages rather than all the pages in the bulk stage.
806 */
807 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
808 next = start + 1;
809 } else {
810 next = find_next_bit(bitmap, size, start);
811 }
812
813 return next;
814 }
815
/*
 * Clear the dirty bit of 'page' in 'rb' under rs->bitmap_mutex,
 * decrementing the migration dirty-page counter when the bit was set.
 *
 * Returns true if the page was dirty (bit previously set).
 */
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long. We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}
859
860 /* Called with RCU critical section */
861 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
862 {
863 uint64_t new_dirty_pages =
864 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
865
866 rs->migration_dirty_pages += new_dirty_pages;
867 rs->num_dirty_pages_period += new_dirty_pages;
868 }
869
870 /**
871 * ram_pagesize_summary: calculate all the pagesizes of a VM
872 *
873 * Returns a summary bitmap of the page sizes of all RAMBlocks
874 *
875 * For VMs with just normal pages this is equivalent to the host page
876 * size. If it's got some huge pages then it's the OR of all the
877 * different page sizes.
878 */
879 uint64_t ram_pagesize_summary(void)
880 {
881 RAMBlock *block;
882 uint64_t summary = 0;
883
884 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
885 summary |= block->page_size;
886 }
887
888 return summary;
889 }
890
891 uint64_t ram_get_total_transferred_pages(void)
892 {
893 return ram_counters.normal + ram_counters.duplicate +
894 compression_counters.pages + xbzrle_counters.pages;
895 }
896
/*
 * Recompute the per-period rate statistics (dirty pages rate, xbzrle
 * cache-miss/encoding rates, compression busy/compression rates) for
 * the sync period ending at 'end_time' (milliseconds).
 */
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    /* No pages handled this period: the remaining rates are undefined */
    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        /* Rate 0 means no xbzrle data was produced this period */
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
948
/*
 * Decide whether auto-converge throttling should start or be
 * increased; called from migration_bitmap_sync() once per sync period.
 */
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}
977
/*
 * Synchronize the dirty bitmaps of all migratable blocks and update
 * the period counters; at most once per second also re-evaluate
 * throttling and refresh the rate statistics.
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1023
1024 static void migration_bitmap_sync_precopy(RAMState *rs)
1025 {
1026 Error *local_err = NULL;
1027
1028 /*
1029 * The current notifier usage is just an optimization to migration, so we
1030 * don't stop the normal migration process in the error case.
1031 */
1032 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1033 error_report_err(local_err);
1034 local_err = NULL;
1035 }
1036
1037 migration_bitmap_sync(rs);
1038
1039 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1040 error_report_err(local_err);
1041 }
1042 }
1043
1044 /**
1045 * save_zero_page_to_file: send the zero page to the file
1046 *
1047 * Returns the size of data written to the file, 0 means the page is not
1048 * a zero page
1049 *
1050 * @rs: current RAM state
1051 * @file: the file where the data is saved
1052 * @block: block that contains the page we want to send
1053 * @offset: offset inside the block for the page
1054 */
1055 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1056 RAMBlock *block, ram_addr_t offset)
1057 {
1058 uint8_t *p = block->host + offset;
1059 int len = 0;
1060
1061 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1062 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1063 qemu_put_byte(file, 0);
1064 len += 1;
1065 }
1066 return len;
1067 }
1068
1069 /**
1070 * save_zero_page: send the zero page to the stream
1071 *
1072 * Returns the number of pages written.
1073 *
1074 * @rs: current RAM state
1075 * @block: block that contains the page we want to send
1076 * @offset: offset inside the block for the page
1077 */
1078 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1079 {
1080 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1081
1082 if (len) {
1083 ram_counters.duplicate++;
1084 ram_counters.transferred += len;
1085 return 1;
1086 }
1087 return -1;
1088 }
1089
1090 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1091 {
1092 if (!migrate_release_ram() || !migration_in_postcopy()) {
1093 return;
1094 }
1095
1096 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1097 }
1098
/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the pages has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    /* Offer the page to a registered control transport first */
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        /* No control transport: caller falls back to the stream path */
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    /* DELAYED: the transport will transmit later; nothing more to count now */
    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    /* bytes_xmit == 0 means the transport recognized a duplicate page */
    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}
1136
1137 /*
1138 * directly send the page to the stream
1139 *
1140 * Returns the number of pages written.
1141 *
1142 * @rs: current RAM state
1143 * @block: block that contains the page we want to send
1144 * @offset: offset inside the block for the page
1145 * @buf: the page to be sent
1146 * @async: send to page asyncly
1147 */
1148 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1149 uint8_t *buf, bool async)
1150 {
1151 ram_counters.transferred += save_page_header(rs, rs->f, block,
1152 offset | RAM_SAVE_FLAG_PAGE);
1153 if (async) {
1154 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1155 migrate_release_ram() &
1156 migration_in_postcopy());
1157 } else {
1158 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1159 }
1160 ram_counters.transferred += TARGET_PAGE_SIZE;
1161 ram_counters.normal++;
1162 return 1;
1163 }
1164
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    /* xbzrle only kicks in after the bulk stage and never during postcopy */
    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}
1212
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214 ram_addr_t offset)
1215 {
1216 if (multifd_queue_page(rs->f, block, offset) < 0) {
1217 return -1;
1218 }
1219 ram_counters.normal++;
1220
1221 return 1;
1222 }
1223
/*
 * Compress one page into stream @f (runs in a compression thread).
 * Returns true if the page turned out to be a zero page (sent as such),
 * false otherwise.
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    /* Zero pages are sent via the cheap zero-page encoding instead */
    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by VM
     * so that we can catch up the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        /* NOTE(review): this path skips ram_release_pages() — confirm intended */
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
1256
1257 static void
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1259 {
1260 ram_counters.transferred += bytes_xmit;
1261
1262 if (param->zero_page) {
1263 ram_counters.duplicate++;
1264 return;
1265 }
1266
1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268 compression_counters.compressed_size += bytes_xmit - 8;
1269 compression_counters.pages++;
1270 }
1271
1272 static bool save_page_use_compression(RAMState *rs);
1273
/*
 * Wait for all compression threads to finish their current page and
 * flush their output into the migration stream.  Must be called before
 * switching RAMBlocks so compressed pages cannot overwrite newer data.
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: wait under comp_done_lock until every thread is idle */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: drain each thread's buffered output under its own mutex */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1305
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307 ram_addr_t offset)
1308 {
1309 param->block = block;
1310 param->offset = offset;
1311 }
1312
/*
 * Dispatch one page to an idle compression thread.
 *
 * Returns 1 when the page was queued, -1 when no thread was free and
 * 'compress-wait-thread' is off (caller sends the page uncompressed).
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            /* Flush the thread's previous output, then queue the new page */
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1348
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exists in compression threads's ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also If xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After one full pass, the "everything is dirty" assumption ends */
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
1404
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    /* Cheap lockless peek first; the locked check below is authoritative */
    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
            QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            /* Multi-page request: consume one page, keep the entry queued */
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            /* Last page of the request: drop the entry and its block ref */
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}
1443
1444 #if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *bs;
    int res;

    /* Write-fault polling only applies to background snapshots */
    if (!migrate_background_snapshot()) {
        return NULL;
    }

    /* Non-blocking read of at most one pending UFFD event */
    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    bs = qemu_ram_block_from_host(page_address, false, offset);
    /* Faults can only come from blocks we registered for write-protection */
    assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
    return bs;
}
1476
/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        /* [start_page, pss->page] inclusive, converted to bytes */
        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}
1506
1507 /* ram_write_tracking_available: check if kernel supports required UFFD features
1508 *
1509 * Returns true if supports, false otherwise
1510 */
1511 bool ram_write_tracking_available(void)
1512 {
1513 uint64_t uffd_features;
1514 int res;
1515
1516 res = uffd_query_features(&uffd_features);
1517 return (res == 0 &&
1518 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1519 }
1520
/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *bs;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        /* The kernel must support write-protect on every block */
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    /* Closing the fd also tears down the probe registrations above */
    uffd_close_fd(uffd_fd);
    return ret;
}
1563
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *bs;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, bs->host,
                bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, bs->host,
                bs->max_length, true, false)) {
            goto fail;
        }
        /* Flag + mr reference mark this block for later cleanup */
        bs->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(bs->mr);

        trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
                bs->host, bs->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    /* Roll back: undo protection on every block registered so far */
    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
1631
/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *bs;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        /* Only blocks that ram_write_tracking_start() protected */
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);

        trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
                bs->host, bs->max_length);

        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}
1662
1663 #else
/* No target OS support, stubs just fail or ignore */

/* Stub: UFFD write-fault polling is Linux-only; report no pending fault. */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}
1673
/* Stub: no UFFD protection was ever applied, so nothing to release. */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}
1683
/* Stub: write tracking requires Linux userfaultfd, so never available. */
bool ram_write_tracking_available(void)
{
    return false;
}
1688
/* Stub: must never be reached since availability check returns false. */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1694
/* Stub: must never be reached since availability check returns false. */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1700
/* Stub: must never be reached since availability check returns false. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1705 #endif /* defined(__linux__) */
1706
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

        /* 'dirty' is only read when block != NULL, so it is always set first */
    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vcpus got blocked by the write protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even is
         * really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}
1780
/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  in case that there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        /* Drop the memory-region reference taken when the entry was queued */
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}
1802
/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same that last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    /* Guard covers the rest of the function, including early returns */
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    /* Reject requests that run past the end of the block */
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    /* The queue entry holds a reference to the block's memory region */
    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}
1867
1868 static bool save_page_use_compression(RAMState *rs)
1869 {
1870 if (!migrate_use_compression()) {
1871 return false;
1872 }
1873
1874 /*
1875 * If xbzrle is on, stop using the data compression after first
1876 * round of migration even if compression is enabled. In theory,
1877 * xbzrle can do better than compression.
1878 */
1879 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1880 return true;
1881 }
1882
1883 return false;
1884 }
1885
/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    /* No compression thread was free; count it and use the normal path */
    compression_counters.busy++;
    return false;
}
1919
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    /* 1) control transport (if registered) takes priority */
    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    /* 2) compression threads, when enabled and applicable */
    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    /* 3) cheap zero-page encoding */
    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    /* 4) fall back to the plain (possibly xbzrle'd) page path */
    return ram_save_page(rs, pss, last_stage);
}
1971
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* Number of target pages per host page for this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the last one we looked at */
    pss->page--;

    /* Drop UFFD write protection (background snapshot) on the saved run */
    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
2029
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */

static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    /* Resume the scan where the previous call left off */
    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        /* Urgent (postcopy/fault) requests take priority over the scan */
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    /* Remember the scan position for the next call */
    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
2083
2084 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2085 {
2086 uint64_t pages = size / TARGET_PAGE_SIZE;
2087
2088 if (zero) {
2089 ram_counters.duplicate += pages;
2090 } else {
2091 ram_counters.normal += pages;
2092 ram_counters.transferred += size;
2093 qemu_update_position(f, size);
2094 }
2095 }
2096
2097 static uint64_t ram_bytes_total_common(bool count_ignored)
2098 {
2099 RAMBlock *block;
2100 uint64_t total = 0;
2101
2102 RCU_READ_LOCK_GUARD();
2103
2104 if (count_ignored) {
2105 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2106 total += block->used_length;
2107 }
2108 } else {
2109 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2110 total += block->used_length;
2111 }
2112 }
2113 return total;
2114 }
2115
/* Total bytes of migratable RAM, excluding ignored blocks. */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
2120
/* Allocate the destination-side buffer used to decode xbzrle pages. */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2125
/* Free the xbzrle decode buffer; NULL it so a double cleanup is safe. */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2131
2132 static void ram_state_cleanup(RAMState **rsp)
2133 {
2134 if (*rsp) {
2135 migration_page_queue_free(*rsp);
2136 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2137 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2138 g_free(*rsp);
2139 *rsp = NULL;
2140 }
2141 }
2142
2143 static void xbzrle_cleanup(void)
2144 {
2145 XBZRLE_cache_lock();
2146 if (XBZRLE.cache) {
2147 cache_fini(XBZRLE.cache);
2148 g_free(XBZRLE.encoded_buf);
2149 g_free(XBZRLE.current_buf);
2150 g_free(XBZRLE.zero_target_page);
2151 XBZRLE.cache = NULL;
2152 XBZRLE.encoded_buf = NULL;
2153 XBZRLE.current_buf = NULL;
2154 XBZRLE.zero_target_page = NULL;
2155 }
2156 XBZRLE_cache_unlock();
2157 }
2158
/*
 * Tear down all RAM-save state: stop dirty logging (unless this is a
 * background snapshot, which never enabled it), free each block's
 * dirty and clear bitmaps, and release xbzrle, compression-thread and
 * RAMState resources.  @opaque is the RAMState** registered at setup.
 */
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* We don't use dirty log with background snapshots */
    if (!migrate_background_snapshot()) {
        /* caller have hold iothread lock or is in a bh, so there is
         * no writing race against the migration bitmap
         */
        memory_global_dirty_log_stop();
    }

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
2183
2184 static void ram_state_reset(RAMState *rs)
2185 {
2186 rs->last_seen_block = NULL;
2187 rs->last_sent_block = NULL;
2188 rs->last_page = 0;
2189 rs->last_version = ram_list.version;
2190 rs->ram_bulk_stage = true;
2191 rs->fpo_enabled = false;
2192 }
2193
2194 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2195
/*
 * Dump @todump to stderr, 128 bits per line, skipping lines that are
 * entirely made of the 'expected' value.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * A NULL 'todump' is ignored.  (An older revision substituted the
 * migration bitmap here; that fallback no longer exists, so guard
 * instead of dereferencing NULL.)
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2229
2230 /* **** functions for postcopy ***** */
2231
/*
 * Walk each block's dirty bitmap and ram_discard_range() every run of
 * clear bits, i.e. pages that are no longer pending transfer.  Used on
 * the source after a switch to postcopy to release memory already sent.
 *
 * @ms: current migration state
 */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            /* run_end is the first dirty bit after this clear run */
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
2251
2252 /**
2253 * postcopy_send_discard_bm_ram: discard a RAMBlock
2254 *
2255 * Returns zero on success
2256 *
2257 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2258 *
2259 * @ms: current migration state
2260 * @block: RAMBlock to discard
2261 */
2262 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2263 {
2264 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2265 unsigned long current;
2266 unsigned long *bitmap = block->bmap;
2267
2268 for (current = 0; current < end; ) {
2269 unsigned long one = find_next_bit(bitmap, end, current);
2270 unsigned long zero, discard_length;
2271
2272 if (one >= end) {
2273 break;
2274 }
2275
2276 zero = find_next_zero_bit(bitmap, end, one + 1);
2277
2278 if (zero >= end) {
2279 discard_length = end - one;
2280 } else {
2281 discard_length = zero - one;
2282 }
2283 postcopy_discard_send_range(ms, one, discard_length);
2284 current = one + discard_length;
2285 }
2286
2287 return 0;
2288 }
2289
2290 /**
2291 * postcopy_each_ram_send_discard: discard all RAMBlocks
2292 *
2293 * Returns 0 for success or negative for error
2294 *
2295 * Utility for the outgoing postcopy code.
2296 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2297 * passing it bitmap indexes and name.
2298 * (qemu_ram_foreach_block ends up passing unscaled lengths
2299 * which would mean postcopy code would have to deal with target page)
2300 *
2301 * @ms: current migration state
2302 */
2303 static int postcopy_each_ram_send_discard(MigrationState *ms)
2304 {
2305 struct RAMBlock *block;
2306 int ret;
2307
2308 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2309 postcopy_discard_send_init(ms, block->idstr);
2310
2311 /*
2312 * Postcopy sends chunks of bitmap over the wire, but it
2313 * just needs indexes at this point, avoids it having
2314 * target page specific code.
2315 */
2316 ret = postcopy_send_discard_bm_ram(ms, block);
2317 postcopy_discard_send_finish(ms);
2318 if (ret) {
2319 return ret;
2320 }
2321 }
2322
2323 return 0;
2324 }
2325
2326 /**
2327 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2328 *
2329 * Helper for postcopy_chunk_hostpages; it's called twice to
2330 * canonicalize the two bitmaps, that are similar, but one is
2331 * inverted.
2332 *
2333 * Postcopy requires that all target pages in a hostpage are dirty or
2334 * clean, not a mix. This function canonicalizes the bitmaps.
2335 *
2336 * @ms: current migration state
2337 * @block: block that contains the page we want to canonicalize
2338 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Target pages per host page for this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If this run of dirty pages starts exactly on a host-page
         * boundary, the interesting boundary is the run's *end*: skip
         * ahead to it.  Otherwise run_start itself is the unaligned
         * boundary that needs fixing up below.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page containing the boundary */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Resume scanning from the start of the next host page */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2392
2393 /**
2394 * postcopy_chunk_hostpages: discard any partially sent host page
2395 *
2396 * Utility for the outgoing postcopy code.
2397 *
2398 * Discard any partially sent host-page size chunks, mark any partially
2399 * dirty host-page size chunks as all dirty. In this case the host-page
2400 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2401 *
2402 * Returns zero on success
2403 *
2404 * @ms: current migration state
2405 * @block: block we want to work with
2406 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    /* Flush any pending discard messages for this block */
    postcopy_discard_send_finish(ms);
    return 0;
}
2419
2420 /**
2421 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2422 *
2423 * Returns zero on success
2424 *
2425 * Transmit the set of pages to be discarded after precopy to the target
2426 * these are pages that:
2427 * a) Have been previously transmitted but are now dirty again
2428 * b) Pages that have never been transmitted, this ensures that
2429 * any pages on the destination that have been mapped by background
2430 * tasks get discarded (transparent huge pages is the specific concern)
2431 * Hopefully this is pretty sparse
2432 *
2433 * @ms: current migration state
2434 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    /* Now send every block's dirty-page indexes to the destination */
    return postcopy_each_ram_send_discard(ms);
}
2467
2468 /**
2469 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2470 *
2471 * Returns zero on success
2472 *
2473 * @rbname: name of the RAMBlock of the request. NULL means the
2474 * same that last one.
2475 * @start: RAMBlock starting page
2476 * @length: RAMBlock size
2477 */
2478 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2479 {
2480 trace_ram_discard_range(rbname, start, length);
2481
2482 RCU_READ_LOCK_GUARD();
2483 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2484
2485 if (!rb) {
2486 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2487 return -1;
2488 }
2489
2490 /*
2491 * On source VM, we don't need to update the received bitmap since
2492 * we don't even have one.
2493 */
2494 if (rb->receivedmap) {
2495 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2496 length >> qemu_target_page_bits());
2497 }
2498
2499 return ram_block_discard_range(rb, start, length);
2500 }
2501
/*
 * Allocate the xbzrle cache and working buffers when xbzrle is enabled.
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 *
 * Returns 0 on success (including when xbzrle is disabled), -ENOMEM on
 * any allocation failure; partially completed allocations are unwound.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    /* A page-sized buffer that stays all zeroes */
    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

/* Error unwind: free in reverse order of allocation */
free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
2558
2559 static int ram_state_init(RAMState **rsp)
2560 {
2561 *rsp = g_try_new0(RAMState, 1);
2562
2563 if (!*rsp) {
2564 error_report("%s: Init ramstate fail", __func__);
2565 return -1;
2566 }
2567
2568 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2569 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2570 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2571
2572 /*
2573 * Count the total number of pages used by ram blocks not including any
2574 * gaps due to alignment or unplugs.
2575 * This must match with the initial values of dirty bitmap.
2576 */
2577 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2578 ram_state_reset(*rsp);
2579
2580 return 0;
2581 }
2582
/*
 * Allocate and seed the migration bitmaps of every non-ignored block:
 * the dirty bitmap starts all-ones, and the clear bitmap is sized by
 * the (clamped) clear_bitmap_shift.  Does nothing when there is no RAM.
 */
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        /* Clamp the configured shift into the supported range */
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when rebegin a
             * new migration after a failed migration, ram_list.
             * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
             * guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}
2621
/*
 * Set up the dirty bitmaps and start dirty-page logging.  Takes the
 * iothread lock, then the ramlist lock, then an RCU read section -
 * keep that ordering if changing this function.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start();
            /* Fold the current dirty state into the fresh bitmaps */
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2639
2640 static int ram_init_all(RAMState **rsp)
2641 {
2642 if (ram_state_init(rsp)) {
2643 return -1;
2644 }
2645
2646 if (xbzrle_init()) {
2647 ram_state_cleanup(rsp);
2648 return -1;
2649 }
2650
2651 ram_init_bitmaps(*rsp);
2652
2653 return 0;
2654 }
2655
/*
 * Prepare RAMState for a postcopy resume: recount dirty pages from the
 * per-block bitmaps, reset the search position, leave the bulk stage,
 * and point rs->f at the new outgoing stream.
 */
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}
2690
2691 /*
2692 * This function clears bits of the free pages reported by the caller from the
2693 * migration dirty bitmap. @addr is the host address corresponding to the
2694 * start of the continuous guest free pages, and @len is the total bytes of
2695 * those pages.
2696 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    /* The hinted range may span several blocks; process block by block */
    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        /* Clip the chunk to the end of this block */
        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        /* Drop these pages from the dirty set; count them out first */
        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}
2737
2738 /*
2739 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2740 * long-running RCU critical section. When rcu-reclaims in the code
2741 * start to become numerous it will be necessary to reduce the
2742 * granularity of these critical sections.
2743 */
2744
2745 /**
2746 * ram_save_setup: Setup RAM for migration
2747 *
2748 * Returns zero to indicate success and negative for error
2749 *
2750 * @f: QEMUFile where to send the data
2751 * @opaque: RAMState pointer
2752 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Total RAM size with the MEM_SIZE flag folded into the value */
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        /* Describe each migratable block: id string, length, extras */
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
2797
2798 /**
2799 * ram_save_iterate: iterative stage for migration
2800 *
2801 * Returns zero to indicate success and negative for error
2802 *
2803 * @f: QEMUFile where to send the data
2804 * @opaque: RAMState pointer
2805 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* Blocks were added/removed since last time; restart the scan */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Keep sending while under the rate limit or requests are queued */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker just written */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
2904
2905 /**
2906 * ram_save_complete: function called to send the remaining amount of ram
2907 *
2908 * Returns zero to indicate success or negative on error
2909 *
2910 * Called with iothread lock
2911 *
2912 * @f: QEMUFile where to send the data
2913 * @opaque: RAMState pointer
2914 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        /* In postcopy the final sync already happened at the switchover */
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to send */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}
2957
/*
 * Estimate how much RAM remains to be sent.  If the stale estimate has
 * dropped below @max_size (and we are not in postcopy), re-sync the
 * dirty bitmap first for an accurate figure, then report the remainder
 * as postcopy-compatible or precopy-only depending on capabilities.
 */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        /* The bitmap sync needs the iothread lock */
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}
2986
/*
 * Decode one xbzrle-compressed page from the stream into @host.
 * Returns 0 on success, -1 on a malformed header or decode failure.
 * Note: @addr is currently unused by the body.
 */
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
3020
3021 /**
3022 * ram_block_from_stream: read a RAMBlock id from the migration stream
3023 *
3024 * Must be called from within a rcu critical section.
3025 *
3026 * Returns a pointer from within the RCU-protected ram_list.
3027 *
3028 * @f: QEMUFile where to read the data from
3029 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3030 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    /* Cached across calls: RAM_SAVE_FLAG_CONTINUE reuses the last block */
    static RAMBlock *block;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            /* CONTINUE before any block id was ever sent */
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    /* Otherwise the stream carries a length-prefixed block id string */
    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}
3062
3063 static inline void *host_from_ram_block_offset(RAMBlock *block,
3064 ram_addr_t offset)
3065 {
3066 if (!offset_in_ramblock(block, offset)) {
3067 return NULL;
3068 }
3069
3070 return block->host + offset;
3071 }
3072
/*
 * Address of @offset inside @block's COLO cache copy; NULL when the
 * offset is out of range or the cache was never allocated.  With
 * @record_bitmap set, the page is also marked in block->bmap so a
 * later checkpoint knows which cached pages to flush into the VM.
 */
static inline void *colo_cache_from_block_offset(RAMBlock *block,
                             ram_addr_t offset, bool record_bitmap)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need bitmap of these migrated pages.
     * It help us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}
3096
3097 /**
3098 * ram_handle_compressed: handle the zero page case
3099 *
3100 * If a page (or a whole RDMA chunk) has been
3101 * determined to be zero, then zap it.
3102 *
3103 * @host: host address for the zero page
3104 * @ch: what the page is filled from. We only support zero
3105 * @size: size of the zero page
3106 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the memset when the destination already reads as all-zero */
    if (ch || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
3113
/*
 * Inflate @source_len bytes from @source into @dest using the
 * caller-owned zlib @stream (reset before use).  The whole payload
 * must decompress in a single inflate() call.
 *
 * Returns the decompressed size, or a negative value on error.
 */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    if (inflateReset(stream) != Z_OK) {
        return -1;
    }

    stream->next_in = (uint8_t *)source;
    stream->avail_in = source_len;
    stream->next_out = dest;
    stream->avail_out = dest_len;

    /* Anything short of Z_STREAM_END means a truncated/corrupt payload */
    if (inflate(stream, Z_NO_FLUSH) != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}
3138
/*
 * Decompression worker thread.  Waits for a job (param->des != NULL),
 * inflates param->compbuf into the destination page, then reports
 * completion under decomp_done_lock.  Exits when param->quit is set.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the job locally so the feeder can queue the next one */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Mark this worker idle and wake anyone waiting for a slot */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* No work queued; sleep until signalled */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3177
/*
 * Block until every decompression worker has finished its current job,
 * then return any error recorded on the decompression QEMUFile.
 * Returns 0 immediately when compression is not in use.
 */
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}
3196
/*
 * Stop, join and free all decompression workers.  Workers whose
 * compbuf is still NULL were never fully initialized (setup failed
 * part-way through), so both passes stop at the first such entry.
 */
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    /* First pass: ask every running worker to quit */
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    /* Second pass: join the workers and release per-thread state */
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}
3237
/*
 * Create the pool of decompression worker threads reading from @f.
 * Returns 0 on success; if any zlib stream fails to initialize, the
 * partially built pool is torn down and -1 is returned.
 */
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        /* compbuf doubles as the "this slot is initialized" marker */
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}
3271
/*
 * Hand one compressed page (len bytes from the stream) to an idle
 * decompression worker; blocks on decomp_done_cond until a worker is
 * free.  The actual inflate happens asynchronously; callers must use
 * wait_for_decompress_done() before relying on the output at @host.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    /* decomp_done_lock protects every worker's 'done' flag */
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            /* Found a free worker and queued the page */
            break;
        } else {
            /* All workers busy: wait for one to signal completion */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
3300
3301 /*
3302 * we must set ram_bulk_stage to false, otherwise in
3303 * migation_bitmap_find_dirty the bitmap will be unused and
3304 * all the pages in ram cache wil be flushed to the ram of
3305 * secondary VM.
3306 */
3307 static void colo_init_ram_state(void)
3308 {
3309 ram_state_init(&ram_state);
3310 ram_state->ram_bulk_stage = false;
3311 }
3312
3313 /*
3314 * colo cache: this is for secondary VM, we cache the whole
3315 * memory of the secondary VM, it is need to hold the global lock
3316 * to call this helper.
3317 */
3318 int colo_init_ram_cache(void)
3319 {
3320 RAMBlock *block;
3321
3322 WITH_RCU_READ_LOCK_GUARD() {
3323 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3324 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3325 NULL,
3326 false);
3327 if (!block->colo_cache) {
3328 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3329 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3330 block->used_length);
3331 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3332 if (block->colo_cache) {
3333 qemu_anon_ram_free(block->colo_cache, block->used_length);
3334 block->colo_cache = NULL;
3335 }
3336 }
3337 return -errno;
3338 }
3339 }
3340 }
3341
3342 /*
3343 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3344 * with to decide which page in cache should be flushed into SVM's RAM. Here
3345 * we use the same name 'ram_bitmap' as for migration.
3346 */
3347 if (ram_bytes_total()) {
3348 RAMBlock *block;
3349
3350 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3351 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3352 block->bmap = bitmap_new(pages);
3353 }
3354 }
3355
3356 colo_init_ram_state();
3357 return 0;
3358 }
3359
/*
 * Start dirty logging on the incoming (secondary VM) side so that later
 * COLO checkpoints only flush pages dirtied since this point.
 *
 * TODO: duplicated with ram_init_bitmaps
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    /* Start from a clean slate: nothing is considered dirty yet */
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3381
/*
 * Release all COLO secondary-VM state: stop dirty logging, free the
 * per-block dirty bitmaps and colo_cache copies, and drop ram_state.
 * The caller needs to hold the global lock to call this helper.
 */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}
3403
3404 /**
3405 * ram_load_setup: Setup RAM for migration incoming side
3406 *
3407 * Returns zero to indicate success and negative for error
3408 *
3409 * @f: QEMUFile where to receive the data
3410 * @opaque: RAMState pointer
3411 */
3412 static int ram_load_setup(QEMUFile *f, void *opaque)
3413 {
3414 if (compress_threads_load_setup(f)) {
3415 return -1;
3416 }
3417
3418 xbzrle_load_setup();
3419 ramblock_recv_map_init();
3420
3421 return 0;
3422 }
3423
3424 static int ram_load_cleanup(void *opaque)
3425 {
3426 RAMBlock *rb;
3427
3428 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3429 qemu_ram_block_writeback(rb);
3430 }
3431
3432 xbzrle_load_cleanup();
3433 compress_threads_load_cleanup();
3434
3435 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3436 g_free(rb->receivedmap);
3437 rb->receivedmap = NULL;
3438 }
3439
3440 return 0;
3441 }
3442
3443 /**
3444 * ram_postcopy_incoming_init: allocate postcopy data structures
3445 *
3446 * Returns 0 for success and negative if there was one error
3447 *
3448 * @mis: current migration incoming state
3449 *
3450 * Allocate data structures etc needed by incoming migration with
3451 * postcopy-ram. postcopy-ram's similarly names
3452 * postcopy_ram_incoming_init does the work.
3453 */
3454 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3455 {
3456 return postcopy_ram_incoming_init(mis);
3457 }
3458
3459 /**
3460 * ram_load_postcopy: load a page in postcopy case
3461 *
3462 * Returns 0 for success or -errno in case of error
3463 *
3464 * Called in postcopy mode by ram_load().
3465 * rcu_read_lock is taken prior to this being called.
3466 *
3467 * @f: QEMUFile where to send the data
3468 */
3469 static int ram_load_postcopy(QEMUFile *f)
3470 {
3471 int flags = 0, ret = 0;
3472 bool place_needed = false;
3473 bool matches_target_page_size = false;
3474 MigrationIncomingState *mis = migration_incoming_get_current();
3475 /* Temporary page that is later 'placed' */
3476 void *postcopy_host_page = mis->postcopy_tmp_page;
3477 void *this_host = NULL;
3478 bool all_zero = true;
3479 int target_pages = 0;
3480
3481 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3482 ram_addr_t addr;
3483 void *host = NULL;
3484 void *page_buffer = NULL;
3485 void *place_source = NULL;
3486 RAMBlock *block = NULL;
3487 uint8_t ch;
3488 int len;
3489
3490 addr = qemu_get_be64(f);
3491
3492 /*
3493 * If qemu file error, we should stop here, and then "addr"
3494 * may be invalid
3495 */
3496 ret = qemu_file_get_error(f);
3497 if (ret) {
3498 break;
3499 }
3500
3501 flags = addr & ~TARGET_PAGE_MASK;
3502 addr &= TARGET_PAGE_MASK;
3503
3504 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3505 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3506 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3507 block = ram_block_from_stream(f, flags);
3508
3509 host = host_from_ram_block_offset(block, addr);
3510 if (!host) {
3511 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3512 ret = -EINVAL;
3513 break;
3514 }
3515 target_pages++;
3516 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3517 /*
3518 * Postcopy requires that we place whole host pages atomically;
3519 * these may be huge pages for RAMBlocks that are backed by
3520 * hugetlbfs.
3521 * To make it atomic, the data is read into a temporary page
3522 * that's moved into place later.
3523 * The migration protocol uses, possibly smaller, target-pages
3524 * however the source ensures it always sends all the components
3525 * of a host page in one chunk.
3526 */
3527 page_buffer = postcopy_host_page +
3528 ((uintptr_t)host & (block->page_size - 1));
3529 if (target_pages == 1) {
3530 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3531 block->page_size);
3532 } else {
3533 /* not the 1st TP within the HP */
3534 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3535 (uintptr_t)this_host) {
3536 error_report("Non-same host page %p/%p",
3537 host, this_host);
3538 ret = -EINVAL;
3539 break;
3540 }
3541 }
3542
3543 /*
3544 * If it's the last part of a host page then we place the host
3545 * page
3546 */
3547 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3548 place_needed = true;
3549 }
3550 place_source = postcopy_host_page;
3551 }
3552
3553 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3554 case RAM_SAVE_FLAG_ZERO:
3555 ch = qemu_get_byte(f);
3556 /*
3557 * Can skip to set page_buffer when
3558 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3559 */
3560 if (ch || !matches_target_page_size) {
3561 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3562 }
3563 if (ch) {
3564 all_zero = false;
3565 }
3566 break;
3567
3568 case RAM_SAVE_FLAG_PAGE:
3569 all_zero = false;
3570 if (!matches_target_page_size) {
3571 /* For huge pages, we always use temporary buffer */
3572 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3573 } else {
3574 /*
3575 * For small pages that matches target page size, we
3576 * avoid the qemu_file copy. Instead we directly use
3577 * the buffer of QEMUFile to place the page. Note: we
3578 * cannot do any QEMUFile operation before using that
3579 * buffer to make sure the buffer is valid when
3580 * placing the page.
3581 */
3582 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3583 TARGET_PAGE_SIZE);
3584 }
3585 break;
3586 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3587 all_zero = false;
3588 len = qemu_get_be32(f);
3589 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3590 error_report("Invalid compressed data length: %d", len);
3591 ret = -EINVAL;
3592 break;
3593 }
3594 decompress_data_with_multi_threads(f, page_buffer, len);
3595 break;
3596
3597 case RAM_SAVE_FLAG_EOS:
3598 /* normal exit */
3599 multifd_recv_sync_main();
3600 break;
3601 default:
3602 error_report("Unknown combination of migration flags: 0x%x"
3603 " (postcopy mode)", flags);
3604 ret = -EINVAL;
3605 break;
3606 }
3607
3608 /* Got the whole host page, wait for decompress before placing. */
3609 if (place_needed) {
3610 ret |= wait_for_decompress_done();
3611 }
3612
3613 /* Detect for any possible file errors */
3614 if (!ret && qemu_file_get_error(f)) {
3615 ret = qemu_file_get_error(f);
3616 }
3617
3618 if (!ret && place_needed) {
3619 /* This gets called at the last target page in the host page */
3620 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3621 block->page_size);
3622
3623 if (all_zero) {
3624 ret = postcopy_place_page_zero(mis, place_dest,
3625 block);
3626 } else {
3627 ret = postcopy_place_page(mis, place_dest,
3628 place_source, block);
3629 }
3630 place_needed = false;
3631 target_pages = 0;
3632 /* Assume we have a zero page until we detect something different */
3633 all_zero = true;
3634 }
3635 }
3636
3637 return ret;
3638 }
3639
3640 static bool postcopy_is_advised(void)
3641 {
3642 PostcopyState ps = postcopy_state_get();
3643 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3644 }
3645
3646 static bool postcopy_is_running(void)
3647 {
3648 PostcopyState ps = postcopy_state_get();
3649 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3650 }
3651
3652 /*
3653 * Flush content of RAM cache into SVM's memory.
3654 * Only flush the pages that be dirtied by PVM or SVM or both.
3655 */
3656 void colo_flush_ram_cache(void)
3657 {
3658 RAMBlock *block = NULL;
3659 void *dst_host;
3660 void *src_host;
3661 unsigned long offset = 0;
3662
3663 memory_global_dirty_log_sync();
3664 WITH_RCU_READ_LOCK_GUARD() {
3665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3666 ramblock_sync_dirty_bitmap(ram_state, block);
3667 }
3668 }
3669
3670 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3671 WITH_RCU_READ_LOCK_GUARD() {
3672 block = QLIST_FIRST_RCU(&ram_list.blocks);
3673
3674 while (block) {
3675 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3676
3677 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3678 >= block->used_length) {
3679 offset = 0;
3680 block = QLIST_NEXT_RCU(block, next);
3681 } else {
3682 migration_bitmap_clear_dirty(ram_state, block, offset);
3683 dst_host = block->host
3684 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3685 src_host = block->colo_cache
3686 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3687 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3688 }
3689 }
3690 }
3691 trace_colo_flush_ram_cache_end();
3692 }
3693
3694 /**
3695 * ram_load_precopy: load pages in precopy case
3696 *
3697 * Returns 0 for success or -errno in case of error
3698 *
3699 * Called in precopy mode by ram_load().
3700 * rcu_read_lock is taken prior to this being called.
3701 *
3702 * @f: QEMUFile where to send the data
3703 */
3704 static int ram_load_precopy(QEMUFile *f)
3705 {
3706 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3707 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3708 bool postcopy_advised = postcopy_is_advised();
3709 if (!migrate_use_compression()) {
3710 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3711 }
3712
3713 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3714 ram_addr_t addr, total_ram_bytes;
3715 void *host = NULL, *host_bak = NULL;
3716 uint8_t ch;
3717
3718 /*
3719 * Yield periodically to let main loop run, but an iteration of
3720 * the main loop is expensive, so do it each some iterations
3721 */
3722 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3723 aio_co_schedule(qemu_get_current_aio_context(),
3724 qemu_coroutine_self());
3725 qemu_coroutine_yield();
3726 }
3727 i++;
3728
3729 addr = qemu_get_be64(f);
3730 flags = addr & ~TARGET_PAGE_MASK;
3731 addr &= TARGET_PAGE_MASK;
3732
3733 if (flags & invalid_flags) {
3734 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3735 error_report("Received an unexpected compressed page");
3736 }
3737
3738 ret = -EINVAL;
3739 break;
3740 }
3741
3742 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3743 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3744 RAMBlock *block = ram_block_from_stream(f, flags);
3745
3746 host = host_from_ram_block_offset(block, addr);
3747 /*
3748 * After going into COLO stage, we should not load the page
3749 * into SVM's memory directly, we put them into colo_cache firstly.
3750 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3751 * Previously, we copied all these memory in preparing stage of COLO
3752 * while we need to stop VM, which is a time-consuming process.
3753 * Here we optimize it by a trick, back-up every page while in
3754 * migration process while COLO is enabled, though it affects the
3755 * speed of the migration, but it obviously reduce the downtime of
3756 * back-up all SVM'S memory in COLO preparing stage.
3757 */
3758 if (migration_incoming_colo_enabled()) {
3759 if (migration_incoming_in_colo_state()) {
3760 /* In COLO stage, put all pages into cache temporarily */
3761 host = colo_cache_from_block_offset(block, addr, true);
3762 } else {
3763 /*
3764 * In migration stage but before COLO stage,
3765 * Put all pages into both cache and SVM's memory.
3766 */
3767 host_bak = colo_cache_from_block_offset(block, addr, false);
3768 }
3769 }
3770 if (!host) {
3771 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3772 ret = -EINVAL;
3773 break;
3774 }
3775 if (!migration_incoming_in_colo_state()) {
3776 ramblock_recv_bitmap_set(block, host);
3777 }
3778
3779 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3780 }
3781
3782 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3783 case RAM_SAVE_FLAG_MEM_SIZE:
3784 /* Synchronize RAM block list */
3785 total_ram_bytes = addr;
3786 while (!ret && total_ram_bytes) {
3787 RAMBlock *block;
3788 char id[256];
3789 ram_addr_t length;
3790
3791 len = qemu_get_byte(f);
3792 qemu_get_buffer(f, (uint8_t *)id, len);
3793 id[len] = 0;
3794 length = qemu_get_be64(f);
3795
3796 block = qemu_ram_block_by_name(id);
3797 if (block && !qemu_ram_is_migratable(block)) {
3798 error_report("block %s should not be migrated !", id);
3799 ret = -EINVAL;
3800 } else if (block) {
3801 if (length != block->used_length) {
3802 Error *local_err = NULL;
3803
3804 ret = qemu_ram_resize(block, length,
3805 &local_err);
3806 if (local_err) {
3807 error_report_err(local_err);
3808 }
3809 }
3810 /* For postcopy we need to check hugepage sizes match */
3811 if (postcopy_advised && migrate_postcopy_ram() &&
3812 block->page_size != qemu_host_page_size) {
3813 uint64_t remote_page_size = qemu_get_be64(f);
3814 if (remote_page_size != block->page_size) {
3815 error_report("Mismatched RAM page size %s "
3816 "(local) %zd != %" PRId64,
3817 id, block->page_size,
3818 remote_page_size);
3819 ret = -EINVAL;
3820 }
3821 }
3822 if (migrate_ignore_shared()) {
3823 hwaddr addr = qemu_get_be64(f);
3824 if (ramblock_is_ignored(block) &&
3825 block->mr->addr != addr) {
3826 error_report("Mismatched GPAs for block %s "
3827 "%" PRId64 "!= %" PRId64,
3828 id, (uint64_t)addr,
3829 (uint64_t)block->mr->addr);
3830 ret = -EINVAL;
3831 }
3832 }
3833 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3834 block->idstr);
3835 } else {
3836 error_report("Unknown ramblock \"%s\", cannot "
3837 "accept migration", id);
3838 ret = -EINVAL;
3839 }
3840
3841 total_ram_bytes -= length;
3842 }
3843 break;
3844
3845 case RAM_SAVE_FLAG_ZERO:
3846 ch = qemu_get_byte(f);
3847 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3848 break;
3849
3850 case RAM_SAVE_FLAG_PAGE:
3851 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3852 break;
3853
3854 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3855 len = qemu_get_be32(f);
3856 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3857 error_report("Invalid compressed data length: %d", len);
3858 ret = -EINVAL;
3859 break;
3860 }
3861 decompress_data_with_multi_threads(f, host, len);
3862 break;
3863
3864 case RAM_SAVE_FLAG_XBZRLE:
3865 if (load_xbzrle(f, addr, host) < 0) {
3866 error_report("Failed to decompress XBZRLE page at "
3867 RAM_ADDR_FMT, addr);
3868 ret = -EINVAL;
3869 break;
3870 }
3871 break;
3872 case RAM_SAVE_FLAG_EOS:
3873 /* normal exit */
3874 multifd_recv_sync_main();
3875 break;
3876 default:
3877 if (flags & RAM_SAVE_FLAG_HOOK) {
3878 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3879 } else {
3880 error_report("Unknown combination of migration flags: 0x%x",
3881 flags);
3882 ret = -EINVAL;
3883 }
3884 }
3885 if (!ret) {
3886 ret = qemu_file_get_error(f);
3887 }
3888 if (!ret && host_bak) {
3889 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3890 }
3891 }
3892
3893 ret |= wait_for_decompress_done();
3894 return ret;
3895 }
3896
3897 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3898 {
3899 int ret = 0;
3900 static uint64_t seq_iter;
3901 /*
3902 * If system is running in postcopy mode, page inserts to host memory must
3903 * be atomic
3904 */
3905 bool postcopy_running = postcopy_is_running();
3906
3907 seq_iter++;
3908
3909 if (version_id != 4) {
3910 return -EINVAL;
3911 }
3912
3913 /*
3914 * This RCU critical section can be very long running.
3915 * When RCU reclaims in the code start to become numerous,
3916 * it will be necessary to reduce the granularity of this
3917 * critical section.
3918 */
3919 WITH_RCU_READ_LOCK_GUARD() {
3920 if (postcopy_running) {
3921 ret = ram_load_postcopy(f);
3922 } else {
3923 ret = ram_load_precopy(f);
3924 }
3925 }
3926 trace_ram_load_complete(ret, seq_iter);
3927
3928 return ret;
3929 }
3930
3931 static bool ram_has_postcopy(void *opaque)
3932 {
3933 RAMBlock *rb;
3934 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3935 if (ramblock_is_pmem(rb)) {
3936 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3937 "is not supported now!", rb->idstr, rb->host);
3938 return false;
3939 }
3940 }
3941
3942 return migrate_postcopy_ram();
3943 }
3944
3945 /* Sync all the dirty bitmap with destination VM. */
3946 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3947 {
3948 RAMBlock *block;
3949 QEMUFile *file = s->to_dst_file;
3950 int ramblock_count = 0;
3951
3952 trace_ram_dirty_bitmap_sync_start();
3953
3954 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3955 qemu_savevm_send_recv_bitmap(file, block->idstr);
3956 trace_ram_dirty_bitmap_request(block->idstr);
3957 ramblock_count++;
3958 }
3959
3960 trace_ram_dirty_bitmap_sync_wait();
3961
3962 /* Wait until all the ramblocks' dirty bitmap synced */
3963 while (ramblock_count--) {
3964 qemu_sem_wait(&s->rp_state.rp_sem);
3965 }
3966
3967 trace_ram_dirty_bitmap_sync_complete();
3968
3969 return 0;
3970 }
3971
/* Wake ram_dirty_bitmap_sync_all(): one more ramblock's bitmap arrived */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3976
3977 /*
3978 * Read the received bitmap, revert it as the initial dirty bitmap.
3979 * This is only used when the postcopy migration is paused but wants
3980 * to resume from a middle point.
3981 */
3982 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3983 {
3984 int ret = -EINVAL;
3985 QEMUFile *file = s->rp_state.from_dst_file;
3986 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3987 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3988 uint64_t size, end_mark;
3989
3990 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3991
3992 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3993 error_report("%s: incorrect state %s", __func__,
3994 MigrationStatus_str(s->state));
3995 return -EINVAL;
3996 }
3997
3998 /*
3999 * Note: see comments in ramblock_recv_bitmap_send() on why we
4000 * need the endianness conversion, and the paddings.
4001 */
4002 local_size = ROUND_UP(local_size, 8);
4003
4004 /* Add paddings */
4005 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4006
4007 size = qemu_get_be64(file);
4008
4009 /* The size of the bitmap should match with our ramblock */
4010 if (size != local_size) {
4011 error_report("%s: ramblock '%s' bitmap size mismatch "
4012 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4013 block->idstr, size, local_size);
4014 ret = -EINVAL;
4015 goto out;
4016 }
4017
4018 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4019 end_mark = qemu_get_be64(file);
4020
4021 ret = qemu_file_get_error(file);
4022 if (ret || size != local_size) {
4023 error_report("%s: read bitmap failed for ramblock '%s': %d"
4024 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4025 __func__, block->idstr, ret, local_size, size);
4026 ret = -EIO;
4027 goto out;
4028 }
4029
4030 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4031 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4032 __func__, block->idstr, end_mark);
4033 ret = -EINVAL;
4034 goto out;
4035 }
4036
4037 /*
4038 * Endianness conversion. We are during postcopy (though paused).
4039 * The dirty bitmap won't change. We can directly modify it.
4040 */
4041 bitmap_from_le(block->bmap, le_bitmap, nbits);
4042
4043 /*
4044 * What we received is "received bitmap". Revert it as the initial
4045 * dirty bitmap for this ramblock.
4046 */
4047 bitmap_complement(block->bmap, block->bmap, nbits);
4048
4049 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4050
4051 /*
4052 * We succeeded to sync bitmap for current ramblock. If this is
4053 * the last one to sync, we need to notify the main send thread.
4054 */
4055 ram_dirty_bitmap_reload_notify(s);
4056
4057 ret = 0;
4058 out:
4059 g_free(le_bitmap);
4060 return ret;
4061 }
4062
4063 static int ram_resume_prepare(MigrationState *s, void *opaque)
4064 {
4065 RAMState *rs = *(RAMState **)opaque;
4066 int ret;
4067
4068 ret = ram_dirty_bitmap_sync_all(s, rs);
4069