s390x/pci: use a PCI Function structure
[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59
60 /***********************************************************/
61 /* ram save/restore */
62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
68
69 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
70 #define RAM_SAVE_FLAG_ZERO 0x02
71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
72 #define RAM_SAVE_FLAG_PAGE 0x08
73 #define RAM_SAVE_FLAG_EOS 0x10
74 #define RAM_SAVE_FLAG_CONTINUE 0x20
75 #define RAM_SAVE_FLAG_XBZRLE 0x40
76 /* 0x80 is reserved in migration.h start with 0x100 next */
77 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
78
79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
80 {
81 return buffer_is_zero(p, size);
82 }
83
84 XBZRLECacheStats xbzrle_counters;
85
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
101
102 static void XBZRLE_cache_lock(void)
103 {
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
106 }
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 }
115
116 /**
117 * xbzrle_cache_resize: resize the xbzrle cache
118 *
119 * This function is called from qmp_migrate_set_cache_size in main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
123 *
124 * Returns 0 for success or -1 for error
125 *
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
128 */
129 int xbzrle_cache_resize(int64_t new_size, Error **errp)
130 {
131 PageCache *new_cache;
132 int64_t ret = 0;
133
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
139 }
140
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
144 }
145
146 XBZRLE_cache_lock();
147
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
153 }
154
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
157 }
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
161 }
162
163 bool ramblock_is_ignored(RAMBlock *block)
164 {
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167 }
168
169 #undef RAMBLOCK_FOREACH
170
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
172 {
173 RAMBlock *block;
174 int ret = 0;
175
176 RCU_READ_LOCK_GUARD();
177
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
182 }
183 }
184 return ret;
185 }
186
187 static void ramblock_recv_map_init(void)
188 {
189 RAMBlock *rb;
190
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
194 }
195 }
196
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
198 {
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
201 }
202
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
204 {
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206 }
207
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
209 {
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211 }
212
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
215 {
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
219 }
220
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222
223 /*
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
225 *
226 * Returns >0 if success with sent bytes, or <0 if error.
227 */
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
230 {
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
234
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
238 }
239
240 nbits = block->used_length >> TARGET_PAGE_BITS;
241
242 /*
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit before hand.
246 */
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248
249 /*
250 * Always use little endian when sending the bitmap. This is
251 * required that when source and destination VMs are not using the
252 * same endianness. (Note: big endian won't work.)
253 */
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
255
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
258
259 /*
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
264 */
265 size = ROUND_UP(size, 8);
266
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
269 /*
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
272 */
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
275
276 g_free(le_bitmap);
277
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
280 }
281
282 return size + sizeof(size);
283 }
284
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* Block the requested range lives in */
    RAMBlock *rb;
    /* Start of the requested range within @rb */
    hwaddr offset;
    /* Length of the requested range */
    hwaddr len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
296
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* amount of count that no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Protects src_page_requests below */
    QemuMutex src_page_req_mutex;
    /* Queue of outstanding page requests from the destination */
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
352
/* Singleton: state of the current outgoing RAM migration (NULL when idle) */
static RAMState *ram_state;

/* Notifiers run around precopy events; see precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
356
/* Initialize the list that precopy notifiers register on. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
361
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
366
/* Unregister a notifier previously added with precopy_add_notifier(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
371
372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
373 {
374 PrecopyNotifyData pnd;
375 pnd.reason = reason;
376 pnd.errp = errp;
377
378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
379 }
380
381 void precopy_enable_free_page_optimization(void)
382 {
383 if (!ram_state) {
384 return;
385 }
386
387 ram_state->fpo_enabled = true;
388 }
389
390 uint64_t ram_bytes_remaining(void)
391 {
392 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
393 0;
394 }
395
396 MigrationStats ram_counters;
397
/* Cursor state used by the search for the next dirty page to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
408
409 CompressionStats compression_counters;
410
/* Per-thread job descriptor for one compression worker thread */
struct CompressParam {
    /* Result published: the queued page has been compressed */
    bool done;
    /* Ask the worker thread to exit */
    bool quit;
    /* Result: the page turned out to be all zeroes */
    bool zero_page;
    /* Buffer-only QEMUFile the compressed data is written into */
    QEMUFile *file;
    /* Protects the work item fields, together with @cond */
    QemuMutex mutex;
    QemuCond cond;
    /* Work item: page at @offset within @block; block==NULL means idle */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
426
/* Per-thread job descriptor for one decompression worker thread */
struct DecompressParam {
    /* Result published: the queued buffer has been decompressed */
    bool done;
    /* Ask the worker thread to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* Destination address (per the name; consumer not in this chunk) */
    void *des;
    /* Compressed input data of @len bytes */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
438
/* Per-thread state for the compression threads (arrays of thread_count) */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Per-thread state for the decompression threads, mirroring the above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Compress one page; the return value is stored as CompressParam.zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
458
/*
 * Thread body for one compression worker.
 *
 * Protocol: a producer queues work by setting param->block/offset under
 * param->mutex and kicking param->cond.  Once the page is compressed,
 * the result (done/zero_page) is published under comp_done_lock and
 * comp_done_cond is signalled.  Setting param->quit ends the loop.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the job and drop the lock while compressing */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result under comp_done_lock */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            /* No work queued: wait until kicked */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
492
/*
 * Tear down the compression threads and all per-thread state.
 *
 * Safe to call on a half-initialized array (from the setup error path):
 * an entry whose ->file is still NULL was never fully set up, and ends
 * the loop since later entries cannot be initialized either.
 */
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        /* Ask the thread to quit, wake it, and reap it */
        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}
531
/*
 * Spawn the compression worker threads (no-op unless the compress
 * capability is enabled).
 *
 * Returns 0 on success, -1 on failure.  On failure, everything set up
 * so far is rolled back via compress_threads_save_cleanup(); note that
 * ->file doubles as the "fully initialized" marker the cleanup relies on.
 */
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            /* Free locally: cleanup skips entries whose ->file is NULL */
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
574
575 /**
576 * save_page_header: write page header to wire
577 *
578 * If this is the 1st block, it also writes the block identification
579 *
580 * Returns the number of bytes written
581 *
582 * @f: QEMUFile where to send the data
583 * @block: block that contains the page we want to send
584 * @offset: offset inside the block for the page
585 * in the lower bits, it contains flags
586 */
587 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
588 ram_addr_t offset)
589 {
590 size_t size, len;
591
592 if (block == rs->last_sent_block) {
593 offset |= RAM_SAVE_FLAG_CONTINUE;
594 }
595 qemu_put_be64(f, offset);
596 size = 8;
597
598 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
599 len = strlen(block->idstr);
600 qemu_put_byte(f, len);
601 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
602 size += 1 + len;
603 rs->last_sent_block = block;
604 }
605 return size;
606 }
607
608 /**
609 * mig_throttle_guest_down: throotle down the guest
610 *
611 * Reduce amount of guest cpu execution to hopefully slow down memory
612 * writes. If guest dirty memory rate is reduced below the rate at
613 * which we can transfer pages to the destination then we should be
614 * able to complete migration. Some workloads dirty memory way too
615 * fast and will not effectively converge, even with auto-converge.
616 */
617 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
618 uint64_t bytes_dirty_threshold)
619 {
620 MigrationState *s = migrate_get_current();
621 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
622 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
623 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
624 int pct_max = s->parameters.max_cpu_throttle;
625
626 uint64_t throttle_now = cpu_throttle_get_percentage();
627 uint64_t cpu_now, cpu_ideal, throttle_inc;
628
629 /* We have not started throttling yet. Let's start it. */
630 if (!cpu_throttle_active()) {
631 cpu_throttle_set(pct_initial);
632 } else {
633 /* Throttling already on, just increase the rate */
634 if (!pct_tailslow) {
635 throttle_inc = pct_increment;
636 } else {
637 /* Compute the ideal CPU percentage used by Guest, which may
638 * make the dirty rate match the dirty rate threshold. */
639 cpu_now = 100 - throttle_now;
640 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
641 bytes_dirty_period);
642 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
643 }
644 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
645 }
646 }
647
648 /**
649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
650 *
651 * @rs: current RAM state
652 * @current_addr: address for the zero page
653 *
654 * Update the xbzrle cache to reflect a page that's been sent as all 0.
655 * The important thing is that a stale (not-yet-0'd) page be replaced
656 * by the new data.
657 * As a bonus, if the page wasn't in the cache it gets added so that
658 * when a small write is made into the 0'd page it gets XBZRLE sent.
659 */
660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
661 {
662 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
663 return;
664 }
665
666 /* We don't care if this fails to allocate a new cache page
667 * as long as it updated an old one */
668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
669 ram_counters.dirty_sync_count);
670 }
671
672 #define ENCODING_FLAG_XBZRLE 0x1
673
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    /*
     * Cache miss: insert the page (except in the last stage, when the
     * cache must stay untouched) and tell the caller to send it whole.
     */
    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Identical to the cached copy: nothing to send */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoding would exceed a page: fall back to a normal send */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
774
775 /**
776 * migration_bitmap_find_dirty: find the next dirty page from start
777 *
778 * Returns the page offset within memory region of the start of a dirty page
779 *
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
783 */
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
787 {
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
790 unsigned long next;
791
792 if (ramblock_is_ignored(rb)) {
793 return size;
794 }
795
796 /*
797 * When the free page optimization is enabled, we need to check the bitmap
798 * to send the non-free pages rather than all the pages in the bulk stage.
799 */
800 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
801 next = start + 1;
802 } else {
803 next = find_next_bit(bitmap, size, start);
804 }
805
806 return next;
807 }
808
/*
 * Test-and-clear the dirty bit for @page in @rb's migration bitmap,
 * decrementing rs->migration_dirty_pages when the bit was set.
 * Returns the previous value of the bit.  Serialized on
 * rs->bitmap_mutex.
 */
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    qemu_mutex_lock(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time. So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long. We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);

    if (ret) {
        rs->migration_dirty_pages--;
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    return ret;
}
852
853 /* Called with RCU critical section */
854 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
855 {
856 uint64_t new_dirty_pages =
857 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
858
859 rs->migration_dirty_pages += new_dirty_pages;
860 rs->num_dirty_pages_period += new_dirty_pages;
861 }
862
863 /**
864 * ram_pagesize_summary: calculate all the pagesizes of a VM
865 *
866 * Returns a summary bitmap of the page sizes of all RAMBlocks
867 *
868 * For VMs with just normal pages this is equivalent to the host page
869 * size. If it's got some huge pages then it's the OR of all the
870 * different page sizes.
871 */
872 uint64_t ram_pagesize_summary(void)
873 {
874 RAMBlock *block;
875 uint64_t summary = 0;
876
877 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
878 summary |= block->page_size;
879 }
880
881 return summary;
882 }
883
884 uint64_t ram_get_total_transferred_pages(void)
885 {
886 return ram_counters.normal + ram_counters.duplicate +
887 compression_counters.pages + xbzrle_counters.pages;
888 }
889
/*
 * Recompute the per-period rate counters at the end of a bitmap-sync
 * period ending at @end_time: dirty page rate, and (when enabled) the
 * xbzrle cache-miss/encoding rates and the compression busy/ratio
 * statistics.  The rs->*_prev fields snapshot the cumulative counters
 * for the next period.
 */
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    /* No pages handled this period: the per-page rates are undefined */
    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            /* Avoid division by zero when nothing was encoded */
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
941
942 static void migration_trigger_throttle(RAMState *rs)
943 {
944 MigrationState *s = migrate_get_current();
945 uint64_t threshold = s->parameters.throttle_trigger_threshold;
946
947 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
948 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
949 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
950
951 /* During block migration the auto-converge logic incorrectly detects
952 * that ram migration makes no progress. Avoid this by disabling the
953 * throttling logic during the bulk phase of block migration. */
954 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
955 /* The following detection logic can be refined later. For now:
956 Check to see if the ratio between dirtied bytes and the approx.
957 amount of bytes that just got transferred since the last time
958 we were in this routine reaches the threshold. If that happens
959 twice, start or increase throttling. */
960
961 if ((bytes_dirty_period > bytes_dirty_threshold) &&
962 (++rs->dirty_rate_high_cnt >= 2)) {
963 trace_migration_throttle();
964 rs->dirty_rate_high_cnt = 0;
965 mig_throttle_guest_down(bytes_dirty_period,
966 bytes_dirty_threshold);
967 }
968 }
969 }
970
/*
 * Sync the dirty log into every migratable block's migration bitmap,
 * then update the period statistics and, when a full second has
 * elapsed, re-evaluate throttling and the transfer rates.
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* Bitmap updates are done under bitmap_mutex + the RCU read lock */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1016
/*
 * Wrapper around migration_bitmap_sync() that runs the precopy
 * notifiers before and after the sync.
 */
static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}
1036
1037 /**
1038 * save_zero_page_to_file: send the zero page to the file
1039 *
1040 * Returns the size of data written to the file, 0 means the page is not
1041 * a zero page
1042 *
1043 * @rs: current RAM state
1044 * @file: the file where the data is saved
1045 * @block: block that contains the page we want to send
1046 * @offset: offset inside the block for the page
1047 */
1048 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1049 RAMBlock *block, ram_addr_t offset)
1050 {
1051 uint8_t *p = block->host + offset;
1052 int len = 0;
1053
1054 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1055 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1056 qemu_put_byte(file, 0);
1057 len += 1;
1058 }
1059 return len;
1060 }
1061
1062 /**
1063 * save_zero_page: send the zero page to the stream
1064 *
1065 * Returns the number of pages written.
1066 *
1067 * @rs: current RAM state
1068 * @block: block that contains the page we want to send
1069 * @offset: offset inside the block for the page
1070 */
1071 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1072 {
1073 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1074
1075 if (len) {
1076 ram_counters.duplicate++;
1077 ram_counters.transferred += len;
1078 return 1;
1079 }
1080 return -1;
1081 }
1082
1083 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1084 {
1085 if (!migrate_release_ram() || !migration_in_postcopy()) {
1086 return;
1087 }
1088
1089 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1090 }
1091
1092 /*
1093 * @pages: the number of pages written by the control path,
1094 * < 0 - error
1095 * > 0 - number of pages written
1096 *
1097 * Return true if the pages has been saved, otherwise false is returned.
1098 */
1099 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1100 int *pages)
1101 {
1102 uint64_t bytes_xmit = 0;
1103 int ret;
1104
1105 *pages = -1;
1106 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1107 &bytes_xmit);
1108 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1109 return false;
1110 }
1111
1112 if (bytes_xmit) {
1113 ram_counters.transferred += bytes_xmit;
1114 *pages = 1;
1115 }
1116
1117 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1118 return true;
1119 }
1120
1121 if (bytes_xmit > 0) {
1122 ram_counters.normal++;
1123 } else if (bytes_xmit == 0) {
1124 ram_counters.duplicate++;
1125 }
1126
1127 return true;
1128 }
1129
1130 /*
1131 * directly send the page to the stream
1132 *
1133 * Returns the number of pages written.
1134 *
1135 * @rs: current RAM state
1136 * @block: block that contains the page we want to send
1137 * @offset: offset inside the block for the page
1138 * @buf: the page to be sent
1139 * @async: send to page asyncly
1140 */
1141 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1142 uint8_t *buf, bool async)
1143 {
1144 ram_counters.transferred += save_page_header(rs, rs->f, block,
1145 offset | RAM_SAVE_FLAG_PAGE);
1146 if (async) {
1147 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1148 migrate_release_ram() &
1149 migration_in_postcopy());
1150 } else {
1151 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1152 }
1153 ram_counters.transferred += TARGET_PAGE_SIZE;
1154 ram_counters.normal++;
1155 return 1;
1156 }
1157
1158 /**
1159 * ram_save_page: send the given page to the stream
1160 *
1161 * Returns the number of pages written.
1162 * < 0 - error
1163 * >=0 - Number of pages written - this might legally be 0
1164 * if xbzrle noticed the page was the same.
1165 *
1166 * @rs: current RAM state
1167 * @block: block that contains the page we want to send
1168 * @offset: offset inside the block for the page
1169 * @last_stage: if we are at the completion stage
1170 */
1171 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1172 {
1173 int pages = -1;
1174 uint8_t *p;
1175 bool send_async = true;
1176 RAMBlock *block = pss->block;
1177 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1178 ram_addr_t current_addr = block->offset + offset;
1179
1180 p = block->host + offset;
1181 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1182
1183 XBZRLE_cache_lock();
1184 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1185 migrate_use_xbzrle()) {
1186 pages = save_xbzrle_page(rs, &p, current_addr, block,
1187 offset, last_stage);
1188 if (!last_stage) {
1189 /* Can't send this cached data async, since the cache page
1190 * might get updated before it gets to the wire
1191 */
1192 send_async = false;
1193 }
1194 }
1195
1196 /* XBZRLE overflow or normal page */
1197 if (pages == -1) {
1198 pages = save_normal_page(rs, block, offset, p, send_async);
1199 }
1200
1201 XBZRLE_cache_unlock();
1202
1203 return pages;
1204 }
1205
1206 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1207 ram_addr_t offset)
1208 {
1209 if (multifd_queue_page(rs->f, block, offset) < 0) {
1210 return -1;
1211 }
1212 ram_counters.normal++;
1213
1214 return 1;
1215 }
1216
/*
 * do_compress_ram_page: compress one page into a per-thread QEMUFile.
 *
 * Runs in a compression thread.  Zero pages are detected first and sent
 * via the zero-page encoding instead of being compressed.
 *
 * Returns true if the page turned out to be a zero page, false otherwise
 * (including on compression error, which also marks the stream broken).
 */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * Copy the page into a private buffer first so that a guest write
     * during compression cannot produce inconsistent data on the wire.
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        /*
         * NOTE(review): this path returns without ram_release_pages();
         * migration is failing at this point, so the page is deliberately
         * not discarded — confirm before "fixing".
         */
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
1249
1250 static void
1251 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1252 {
1253 ram_counters.transferred += bytes_xmit;
1254
1255 if (param->zero_page) {
1256 ram_counters.duplicate++;
1257 return;
1258 }
1259
1260 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1261 compression_counters.compressed_size += bytes_xmit - 8;
1262 compression_counters.pages++;
1263 }
1264
1265 static bool save_page_use_compression(RAMState *rs);
1266
/*
 * flush_compressed_data: wait for all compression threads to finish their
 * current page, then drain each thread's private QEMUFile into the main
 * migration stream.
 *
 * Must be called before sending pages that have ordering requirements
 * with respect to pages still sitting in the compression threads.
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Phase 1: wait under comp_done_lock until every thread is idle */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Phase 2: drain each thread's buffer under its own mutex */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1298
1299 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1300 ram_addr_t offset)
1301 {
1302 param->block = block;
1303 param->offset = offset;
1304 }
1305
/*
 * compress_page_with_multi_thread: dispatch a page to an idle compression
 * thread.
 *
 * Returns 1 if a thread took the page, -1 if no thread was free (and the
 * user did not ask to wait), in which case the caller sends the page on
 * the normal path.
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Drain the thread's previous output before reusing it */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1341
1342 /**
1343 * find_dirty_block: find the next dirty page and update any state
1344 * associated with the search process.
1345 *
1346 * Returns true if a page is found
1347 *
1348 * @rs: current RAM state
1349 * @pss: data about the state of the current dirty page scan
1350 * @again: set to false if the search has scanned the whole of RAM
1351 */
1352 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1353 {
1354 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1355 if (pss->complete_round && pss->block == rs->last_seen_block &&
1356 pss->page >= rs->last_page) {
1357 /*
1358 * We've been once around the RAM and haven't found anything.
1359 * Give up.
1360 */
1361 *again = false;
1362 return false;
1363 }
1364 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1365 >= pss->block->used_length) {
1366 /* Didn't find anything in this RAM Block */
1367 pss->page = 0;
1368 pss->block = QLIST_NEXT_RCU(pss->block, next);
1369 if (!pss->block) {
1370 /*
1371 * If memory migration starts over, we will meet a dirtied page
1372 * which may still exists in compression threads's ring, so we
1373 * should flush the compressed data to make sure the new page
1374 * is not overwritten by the old one in the destination.
1375 *
1376 * Also If xbzrle is on, stop using the data compression at this
1377 * point. In theory, xbzrle can do better than compression.
1378 */
1379 flush_compressed_data(rs);
1380
1381 /* Hit the end of the list */
1382 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383 /* Flag that we've looped */
1384 pss->complete_round = true;
1385 rs->ram_bulk_stage = false;
1386 }
1387 /* Didn't find anything this time, but try again on the new block */
1388 *again = true;
1389 return false;
1390 } else {
1391 /* Can go around again, but... */
1392 *again = true;
1393 /* We've found something so probably don't need to */
1394 return true;
1395 }
1396 }
1397
1398 /**
1399 * unqueue_page: gets a page of the queue
1400 *
1401 * Helper for 'get_queued_page' - gets a page off the queue
1402 *
1403 * Returns the block of the page (or NULL if none available)
1404 *
1405 * @rs: current RAM state
1406 * @offset: used to return the offset within the RAMBlock
1407 */
1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1409 {
1410 RAMBlock *block = NULL;
1411
1412 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1413 return NULL;
1414 }
1415
1416 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1417 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1418 struct RAMSrcPageRequest *entry =
1419 QSIMPLEQ_FIRST(&rs->src_page_requests);
1420 block = entry->rb;
1421 *offset = entry->offset;
1422
1423 if (entry->len > TARGET_PAGE_SIZE) {
1424 entry->len -= TARGET_PAGE_SIZE;
1425 entry->offset += TARGET_PAGE_SIZE;
1426 } else {
1427 memory_region_unref(block->mr);
1428 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1429 g_free(entry);
1430 migration_consume_urgent_request();
1431 }
1432 }
1433
1434 return block;
1435 }
1436
1437 /**
1438 * get_queued_page: unqueue a page from the postcopy requests
1439 *
1440 * Skips pages that are already sent (!dirty)
1441 *
1442 * Returns true if a queued page is found
1443 *
1444 * @rs: current RAM state
1445 * @pss: data about the state of the current dirty page scan
1446 */
1447 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1448 {
1449 RAMBlock *block;
1450 ram_addr_t offset;
1451 bool dirty;
1452
1453 do {
1454 block = unqueue_page(rs, &offset);
1455 /*
1456 * We're sending this page, and since it's postcopy nothing else
1457 * will dirty it, and we must make sure it doesn't get sent again
1458 * even if this queue request was received after the background
1459 * search already sent it.
1460 */
1461 if (block) {
1462 unsigned long page;
1463
1464 page = offset >> TARGET_PAGE_BITS;
1465 dirty = test_bit(page, block->bmap);
1466 if (!dirty) {
1467 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1468 page);
1469 } else {
1470 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1471 }
1472 }
1473
1474 } while (block && !dirty);
1475
1476 if (block) {
1477 /*
1478 * As soon as we start servicing pages out of order, then we have
1479 * to kill the bulk stage, since the bulk stage assumes
1480 * in (migration_bitmap_find_and_reset_dirty) that every page is
1481 * dirty, that's no longer true.
1482 */
1483 rs->ram_bulk_stage = false;
1484
1485 /*
1486 * We want the background search to continue from the queued page
1487 * since the guest is likely to want other pages near to the page
1488 * it just requested.
1489 */
1490 pss->block = block;
1491 pss->page = offset >> TARGET_PAGE_BITS;
1492
1493 /*
1494 * This unqueued page would break the "one round" check, even is
1495 * really rare.
1496 */
1497 pss->complete_round = false;
1498 }
1499
1500 return !!block;
1501 }
1502
1503 /**
1504 * migration_page_queue_free: drop any remaining pages in the ram
1505 * request queue
1506 *
1507 * It should be empty at the end anyway, but in error cases there may
1508 * be some left. in case that there is any page left, we drop it.
1509 *
1510 */
1511 static void migration_page_queue_free(RAMState *rs)
1512 {
1513 struct RAMSrcPageRequest *mspr, *next_mspr;
1514 /* This queue generally should be empty - but in the case of a failed
1515 * migration might have some droppings in.
1516 */
1517 RCU_READ_LOCK_GUARD();
1518 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1519 memory_region_unref(mspr->rb->mr);
1520 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1521 g_free(mspr);
1522 }
1523 }
1524
1525 /**
1526 * ram_save_queue_pages: queue the page for transmission
1527 *
1528 * A request from postcopy destination for example.
1529 *
1530 * Returns zero on success or negative on error
1531 *
1532 * @rbname: Name of the RAMBLock of the request. NULL means the
1533 * same that last one.
1534 * @start: starting address from the start of the RAMBlock
1535 * @len: length (in bytes) to send
1536 */
1537 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1538 {
1539 RAMBlock *ramblock;
1540 RAMState *rs = ram_state;
1541
1542 ram_counters.postcopy_requests++;
1543 RCU_READ_LOCK_GUARD();
1544
1545 if (!rbname) {
1546 /* Reuse last RAMBlock */
1547 ramblock = rs->last_req_rb;
1548
1549 if (!ramblock) {
1550 /*
1551 * Shouldn't happen, we can't reuse the last RAMBlock if
1552 * it's the 1st request.
1553 */
1554 error_report("ram_save_queue_pages no previous block");
1555 return -1;
1556 }
1557 } else {
1558 ramblock = qemu_ram_block_by_name(rbname);
1559
1560 if (!ramblock) {
1561 /* We shouldn't be asked for a non-existent RAMBlock */
1562 error_report("ram_save_queue_pages no block '%s'", rbname);
1563 return -1;
1564 }
1565 rs->last_req_rb = ramblock;
1566 }
1567 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1568 if (start + len > ramblock->used_length) {
1569 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1570 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1571 __func__, start, len, ramblock->used_length);
1572 return -1;
1573 }
1574
1575 struct RAMSrcPageRequest *new_entry =
1576 g_malloc0(sizeof(struct RAMSrcPageRequest));
1577 new_entry->rb = ramblock;
1578 new_entry->offset = start;
1579 new_entry->len = len;
1580
1581 memory_region_ref(ramblock->mr);
1582 qemu_mutex_lock(&rs->src_page_req_mutex);
1583 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1584 migration_make_urgent_request();
1585 qemu_mutex_unlock(&rs->src_page_req_mutex);
1586
1587 return 0;
1588 }
1589
1590 static bool save_page_use_compression(RAMState *rs)
1591 {
1592 if (!migrate_use_compression()) {
1593 return false;
1594 }
1595
1596 /*
1597 * If xbzrle is on, stop using the data compression after first
1598 * round of migration even if compression is enabled. In theory,
1599 * xbzrle can do better than compression.
1600 */
1601 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1602 return true;
1603 }
1604
1605 return false;
1606 }
1607
1608 /*
1609 * try to compress the page before posting it out, return true if the page
1610 * has been properly handled by compression, otherwise needs other
1611 * paths to handle it
1612 */
1613 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1614 {
1615 if (!save_page_use_compression(rs)) {
1616 return false;
1617 }
1618
1619 /*
1620 * When starting the process of a new block, the first page of
1621 * the block should be sent out before other pages in the same
1622 * block, and all the pages in last block should have been sent
1623 * out, keeping this order is important, because the 'cont' flag
1624 * is used to avoid resending the block name.
1625 *
1626 * We post the fist page as normal page as compression will take
1627 * much CPU resource.
1628 */
1629 if (block != rs->last_sent_block) {
1630 flush_compressed_data(rs);
1631 return false;
1632 }
1633
1634 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1635 return true;
1636 }
1637
1638 compression_counters.busy++;
1639 return false;
1640 }
1641
1642 /**
1643 * ram_save_target_page: save one target page
1644 *
1645 * Returns the number of pages written
1646 *
1647 * @rs: current RAM state
1648 * @pss: data about the page we want to send
1649 * @last_stage: if we are at the completion stage
1650 */
1651 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1652 bool last_stage)
1653 {
1654 RAMBlock *block = pss->block;
1655 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1656 int res;
1657
1658 if (control_save_page(rs, block, offset, &res)) {
1659 return res;
1660 }
1661
1662 if (save_compress_page(rs, block, offset)) {
1663 return 1;
1664 }
1665
1666 res = save_zero_page(rs, block, offset);
1667 if (res > 0) {
1668 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1669 * page would be stale
1670 */
1671 if (!save_page_use_compression(rs)) {
1672 XBZRLE_cache_lock();
1673 xbzrle_cache_zero_page(rs, block->offset + offset);
1674 XBZRLE_cache_unlock();
1675 }
1676 ram_release_pages(block->idstr, offset, res);
1677 return res;
1678 }
1679
1680 /*
1681 * Do not use multifd for:
1682 * 1. Compression as the first page in the new block should be posted out
1683 * before sending the compressed page
1684 * 2. In postcopy as one whole host page should be placed
1685 */
1686 if (!save_page_use_compression(rs) && migrate_use_multifd()
1687 && !migration_in_postcopy()) {
1688 return ram_save_multifd_page(rs, block, offset);
1689 }
1690
1691 return ram_save_page(rs, pss, last_stage);
1692 }
1693
1694 /**
1695 * ram_save_host_page: save a whole host page
1696 *
1697 * Starting at *offset send pages up to the end of the current host
1698 * page. It's valid for the initial offset to point into the middle of
1699 * a host page in which case the remainder of the hostpage is sent.
1700 * Only dirty target pages are sent. Note that the host page size may
1701 * be a huge page for this block.
1702 * The saving stops at the boundary of the used_length of the block
1703 * if the RAMBlock isn't a multiple of the host page size.
1704 *
1705 * Returns the number of pages written or negative on error
1706 *
1707 * @rs: current RAM state
1708 * @ms: current migration state
1709 * @pss: data about the page we want to send
1710 * @last_stage: if we are at the completion stage
1711 */
1712 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1713 bool last_stage)
1714 {
1715 int tmppages, pages = 0;
1716 size_t pagesize_bits =
1717 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1718
1719 if (ramblock_is_ignored(pss->block)) {
1720 error_report("block %s should not be migrated !", pss->block->idstr);
1721 return 0;
1722 }
1723
1724 do {
1725 /* Check the pages is dirty and if it is send it */
1726 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1727 pss->page++;
1728 continue;
1729 }
1730
1731 tmppages = ram_save_target_page(rs, pss, last_stage);
1732 if (tmppages < 0) {
1733 return tmppages;
1734 }
1735
1736 pages += tmppages;
1737 pss->page++;
1738 /* Allow rate limiting to happen in the middle of huge pages */
1739 migration_rate_limit();
1740 } while ((pss->page & (pagesize_bits - 1)) &&
1741 offset_in_ramblock(pss->block,
1742 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1743
1744 /* The offset we leave with is the last one we looked at */
1745 pss->page--;
1746 return pages;
1747 }
1748
1749 /**
1750 * ram_find_and_save_block: finds a dirty page and sends it to f
1751 *
1752 * Called within an RCU critical section.
1753 *
1754 * Returns the number of pages written where zero means no dirty pages,
1755 * or negative on error
1756 *
1757 * @rs: current RAM state
1758 * @last_stage: if we are at the completion stage
1759 *
1760 * On systems where host-page-size > target-page-size it will send all the
1761 * pages in a host page that are dirty.
1762 */
1763
1764 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1765 {
1766 PageSearchStatus pss;
1767 int pages = 0;
1768 bool again, found;
1769
1770 /* No dirty page as there is zero RAM */
1771 if (!ram_bytes_total()) {
1772 return pages;
1773 }
1774
1775 pss.block = rs->last_seen_block;
1776 pss.page = rs->last_page;
1777 pss.complete_round = false;
1778
1779 if (!pss.block) {
1780 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1781 }
1782
1783 do {
1784 again = true;
1785 found = get_queued_page(rs, &pss);
1786
1787 if (!found) {
1788 /* priority queue empty, so just search for something dirty */
1789 found = find_dirty_block(rs, &pss, &again);
1790 }
1791
1792 if (found) {
1793 pages = ram_save_host_page(rs, &pss, last_stage);
1794 }
1795 } while (!pages && again);
1796
1797 rs->last_seen_block = pss.block;
1798 rs->last_page = pss.page;
1799
1800 return pages;
1801 }
1802
1803 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1804 {
1805 uint64_t pages = size / TARGET_PAGE_SIZE;
1806
1807 if (zero) {
1808 ram_counters.duplicate += pages;
1809 } else {
1810 ram_counters.normal += pages;
1811 ram_counters.transferred += size;
1812 qemu_update_position(f, size);
1813 }
1814 }
1815
1816 static uint64_t ram_bytes_total_common(bool count_ignored)
1817 {
1818 RAMBlock *block;
1819 uint64_t total = 0;
1820
1821 RCU_READ_LOCK_GUARD();
1822
1823 if (count_ignored) {
1824 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1825 total += block->used_length;
1826 }
1827 } else {
1828 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1829 total += block->used_length;
1830 }
1831 }
1832 return total;
1833 }
1834
/* Total bytes of RAM to migrate, excluding ignored blocks */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
1839
/* Incoming side: allocate the buffer used to hold decoded XBZRLE pages */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
1844
/* Incoming side: release the XBZRLE decode buffer */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
1850
1851 static void ram_state_cleanup(RAMState **rsp)
1852 {
1853 if (*rsp) {
1854 migration_page_queue_free(*rsp);
1855 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1856 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1857 g_free(*rsp);
1858 *rsp = NULL;
1859 }
1860 }
1861
/* Outgoing side: tear down the XBZRLE cache and its working buffers */
static void xbzrle_cleanup(void)
{
    /* Hold the cache lock so no saver thread can race with the teardown */
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
1877
/*
 * ram_save_cleanup: tear down all outgoing RAM migration state
 * (dirty log, per-block bitmaps, xbzrle, compression threads, RAMState).
 */
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /*
     * The caller holds the iothread lock or runs in a BH, so there is
     * no writing race against the migration bitmap.
     */
    memory_global_dirty_log_stop();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
1899
/* Reset the page-scan state so the next pass starts from the beginning */
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /* A fresh pass starts in bulk stage (every page assumed dirty) */
    rs->ram_bulk_stage = true;
    rs->fpo_enabled = false;
}
1909
1910 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1911
1912 /*
1913 * 'expected' is the value you expect the bitmap mostly to be full
1914 * of; it won't bother printing lines that are all this value.
1915 * If 'todump' is null the migration bitmap is dumped.
1916 */
1917 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1918 unsigned long pages)
1919 {
1920 int64_t cur;
1921 int64_t linelen = 128;
1922 char linebuf[129];
1923
1924 for (cur = 0; cur < pages; cur += linelen) {
1925 int64_t curb;
1926 bool found = false;
1927 /*
1928 * Last line; catch the case where the line length
1929 * is longer than remaining ram
1930 */
1931 if (cur + linelen > pages) {
1932 linelen = pages - cur;
1933 }
1934 for (curb = 0; curb < linelen; curb++) {
1935 bool thisbit = test_bit(cur + curb, todump);
1936 linebuf[curb] = thisbit ? '1' : '.';
1937 found = found || (thisbit != expected);
1938 }
1939 if (found) {
1940 linebuf[curb] = '\0';
1941 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1942 }
1943 }
1944 }
1945
1946 /* **** functions for postcopy ***** */
1947
/*
 * ram_postcopy_migrated_memory_release: discard the source's host memory
 * backing runs of clean (zero-bit) pages in every block's dirty bitmap —
 * presumably pages the destination already holds; confirm against callers.
 */
void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        /* Each iteration handles one run of clear bits [run_start, run_end) */
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
1967
1968 /**
1969 * postcopy_send_discard_bm_ram: discard a RAMBlock
1970 *
1971 * Returns zero on success
1972 *
1973 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1974 *
1975 * @ms: current migration state
1976 * @block: RAMBlock to discard
1977 */
1978 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1979 {
1980 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1981 unsigned long current;
1982 unsigned long *bitmap = block->bmap;
1983
1984 for (current = 0; current < end; ) {
1985 unsigned long one = find_next_bit(bitmap, end, current);
1986 unsigned long zero, discard_length;
1987
1988 if (one >= end) {
1989 break;
1990 }
1991
1992 zero = find_next_zero_bit(bitmap, end, one + 1);
1993
1994 if (zero >= end) {
1995 discard_length = end - one;
1996 } else {
1997 discard_length = zero - one;
1998 }
1999 postcopy_discard_send_range(ms, one, discard_length);
2000 current = one + discard_length;
2001 }
2002
2003 return 0;
2004 }
2005
2006 /**
2007 * postcopy_each_ram_send_discard: discard all RAMBlocks
2008 *
2009 * Returns 0 for success or negative for error
2010 *
2011 * Utility for the outgoing postcopy code.
2012 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2013 * passing it bitmap indexes and name.
2014 * (qemu_ram_foreach_block ends up passing unscaled lengths
2015 * which would mean postcopy code would have to deal with target page)
2016 *
2017 * @ms: current migration state
2018 */
2019 static int postcopy_each_ram_send_discard(MigrationState *ms)
2020 {
2021 struct RAMBlock *block;
2022 int ret;
2023
2024 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2025 postcopy_discard_send_init(ms, block->idstr);
2026
2027 /*
2028 * Postcopy sends chunks of bitmap over the wire, but it
2029 * just needs indexes at this point, avoids it having
2030 * target page specific code.
2031 */
2032 ret = postcopy_send_discard_bm_ram(ms, block);
2033 postcopy_discard_send_finish(ms);
2034 if (ret) {
2035 return ret;
2036 }
2037 }
2038
2039 return 0;
2040 }
2041
2042 /**
2043 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2044 *
2045 * Helper for postcopy_chunk_hostpages; it's called twice to
2046 * canonicalize the two bitmaps, that are similar, but one is
2047 * inverted.
2048 *
2049 * Postcopy requires that all target pages in a hostpage are dirty or
2050 * clean, not a mix. This function canonicalizes the bitmaps.
2051 *
2052 * @ms: current migration state
2053 * @block: block that contains the page we want to canonicalize
2054 */
2055 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2056 {
2057 RAMState *rs = ram_state;
2058 unsigned long *bitmap = block->bmap;
2059 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2060 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2061 unsigned long run_start;
2062
2063 if (block->page_size == TARGET_PAGE_SIZE) {
2064 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2065 return;
2066 }
2067
2068 /* Find a dirty page */
2069 run_start = find_next_bit(bitmap, pages, 0);
2070
2071 while (run_start < pages) {
2072
2073 /*
2074 * If the start of this run of pages is in the middle of a host
2075 * page, then we need to fixup this host page.
2076 */
2077 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2078 /* Find the end of this run */
2079 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2080 /*
2081 * If the end isn't at the start of a host page, then the
2082 * run doesn't finish at the end of a host page
2083 * and we need to discard.
2084 */
2085 }
2086
2087 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2088 unsigned long page;
2089 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2090 host_ratio);
2091 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2092
2093 /* Clean up the bitmap */
2094 for (page = fixup_start_addr;
2095 page < fixup_start_addr + host_ratio; page++) {
2096 /*
2097 * Remark them as dirty, updating the count for any pages
2098 * that weren't previously dirty.
2099 */
2100 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2101 }
2102 }
2103
2104 /* Find the next dirty page for the next iteration */
2105 run_start = find_next_bit(bitmap, pages, run_start);
2106 }
2107 }
2108
2109 /**
2110 * postcopy_chunk_hostpages: discard any partially sent host page
2111 *
2112 * Utility for the outgoing postcopy code.
2113 *
2114 * Discard any partially sent host-page size chunks, mark any partially
2115 * dirty host-page size chunks as all dirty. In this case the host-page
2116 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2117 *
2118 * Returns zero on success
2119 *
2120 * @ms: current migration state
2121 * @block: block we want to work with
2122 */
2123 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2124 {
2125 postcopy_discard_send_init(ms, block->idstr);
2126
2127 /*
2128 * Ensure that all partially dirty host pages are made fully dirty.
2129 */
2130 postcopy_chunk_hostpages_pass(ms, block);
2131
2132 postcopy_discard_send_finish(ms);
2133 return 0;
2134 }
2135
/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            /* NOTE: returns while still inside the RCU read guard scope */
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    /* Actually send the discard ranges derived from the bitmaps */
    return postcopy_each_ram_send_discard(ms);
}
2183
/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same that last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    /* Guard must be live across the block lookup and the discard below */
    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}
2217
/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
/*
 * Allocate XBZRLE working buffers (zero page, cache, encode buffers).
 *
 * Returns 0 on success (or when XBZRLE is not in use), -ENOMEM on any
 * allocation failure; partially allocated buffers are released via the
 * reverse-order goto cleanup chain below.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    /* Serializes against concurrent cache resize/use */
    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
2274
2275 static int ram_state_init(RAMState **rsp)
2276 {
2277 *rsp = g_try_new0(RAMState, 1);
2278
2279 if (!*rsp) {
2280 error_report("%s: Init ramstate fail", __func__);
2281 return -1;
2282 }
2283
2284 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2285 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2286 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2287
2288 /*
2289 * Count the total number of pages used by ram blocks not including any
2290 * gaps due to alignment or unplugs.
2291 * This must match with the initial values of dirty bitmap.
2292 */
2293 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2294 ram_state_reset(*rsp);
2295
2296 return 0;
2297 }
2298
/*
 * Allocate the per-RAMBlock dirty bitmap (all ones) and the clear bitmap,
 * clamping the user-configured clear_bitmap_shift into its valid range.
 */
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        /* Clamp shift to [CLEAR_BITMAP_SHIFT_MIN, CLEAR_BITMAP_SHIFT_MAX] */
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            /* Sized by max_length so a later resize stays in bounds */
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when rebegin a
             * new migration after a failed migration, ram_list.
             * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
             * guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}
2337
/*
 * Set up dirty bitmaps and start dirty-log tracking, then do the first
 * precopy bitmap sync.
 *
 * Lock order matters here: iothread lock -> ramlist lock -> RCU read lock.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2352
2353 static int ram_init_all(RAMState **rsp)
2354 {
2355 if (ram_state_init(rsp)) {
2356 return -1;
2357 }
2358
2359 if (xbzrle_init()) {
2360 ram_state_cleanup(rsp);
2361 return -1;
2362 }
2363
2364 ram_init_bitmaps(*rsp);
2365
2366 return 0;
2367 }
2368
/*
 * Prepare RAMState for a postcopy recovery resume: recount dirty pages
 * from the current bitmaps and reset the iteration cursors.
 */
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since source are already halted, we don't need to care
     * about dirty page logging as well.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    /* Restart iteration from the beginning of the block list */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}
2403
/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    /* A hinted range may span several RAMBlocks; clear it block by block */
    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        /* Clip the chunk to what fits inside this block */
        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        /* bitmap_mutex protects bmap and the dirty-page counter together */
        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}
2450
2451 /*
2452 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2453 * long-running RCU critical section. When rcu-reclaims in the code
2454 * start to become numerous it will be necessary to reduce the
2455 * granularity of these critical sections.
2456 */
2457
/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * Writes the stream header: total RAM size plus, per migratable block,
 * the id string, used length, and (conditionally) page size and address.
 * The order of these writes is part of the migration wire format.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Total size with the MEM_SIZE flag folded into the same u64 */
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            /* Page size only sent when it differs from the host's default */
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
2510
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 * (a positive return of 1, via "done", means no dirty pages remained)
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        /* A ram_list change invalidates our block cursors; start over */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Keep sending while under the rate limit, or while postcopy
         * page requests are queued (those bypass the limit). */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to sent */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker just written */
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
2617
/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        /* In postcopy the final sync already happened before the switchover */
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            /* Last-stage flag is set except when completing a COLO round */
            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to sent */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}
2670
/*
 * Estimate how much dirty RAM remains to be sent, re-syncing the dirty
 * bitmap first if the stale estimate already fits under @max_size.
 * The result is classified as precopy-only or postcopy-compatible.
 */
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    /* Only pay for a fresh sync when we look close to convergence */
    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}
2699
2700 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2701 {
2702 unsigned int xh_len;
2703 int xh_flags;
2704 uint8_t *loaded_data;
2705
2706 /* extract RLE header */
2707 xh_flags = qemu_get_byte(f);
2708 xh_len = qemu_get_be16(f);
2709
2710 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2711 error_report("Failed to load XBZRLE page - wrong compression!");
2712 return -1;
2713 }
2714
2715 if (xh_len > TARGET_PAGE_SIZE) {
2716 error_report("Failed to load XBZRLE page - len overflow!");
2717 return -1;
2718 }
2719 loaded_data = XBZRLE.decoded_buf;
2720 /* load data and decode */
2721 /* it can change loaded_data to point to an internal buffer */
2722 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2723
2724 /* decode RLE */
2725 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2726 TARGET_PAGE_SIZE) == -1) {
2727 error_report("Failed to load XBZRLE page - decode error!");
2728 return -1;
2729 }
2730
2731 return 0;
2732 }
2733
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list,
 * or NULL on a malformed stream / unknown / ignored block.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    /* Cached across calls: CONTINUE pages reuse the last named block */
    static RAMBlock *block;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            /* CONTINUE before any block name was ever sent */
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    /* Not a continuation: a length-prefixed id string follows */
    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}
2775
2776 static inline void *host_from_ram_block_offset(RAMBlock *block,
2777 ram_addr_t offset)
2778 {
2779 if (!offset_in_ramblock(block, offset)) {
2780 return NULL;
2781 }
2782
2783 return block->host + offset;
2784 }
2785
2786 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2787 ram_addr_t offset, bool record_bitmap)
2788 {
2789 if (!offset_in_ramblock(block, offset)) {
2790 return NULL;
2791 }
2792 if (!block->colo_cache) {
2793 error_report("%s: colo_cache is NULL in block :%s",
2794 __func__, block->idstr);
2795 return NULL;
2796 }
2797
2798 /*
2799 * During colo checkpoint, we need bitmap of these migrated pages.
2800 * It help us to decide which pages in ram cache should be flushed
2801 * into VM's RAM later.
2802 */
2803 if (record_bitmap &&
2804 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2805 ram_state->migration_dirty_pages++;
2806 }
2807 return block->colo_cache + offset;
2808 }
2809
2810 /**
2811 * ram_handle_compressed: handle the zero page case
2812 *
2813 * If a page (or a whole RDMA chunk) has been
2814 * determined to be zero, then zap it.
2815 *
2816 * @host: host address for the zero page
2817 * @ch: what the page is filled from. We only support zero
2818 * @size: size of the zero page
2819 */
2820 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2821 {
2822 if (ch != 0 || !is_zero_range(host, size)) {
2823 memset(host, ch, size);
2824 }
2825 }
2826
/*
 * Inflate @source_len bytes from @source into @dest using the shared,
 * pre-initialized zlib @stream.
 *
 * Returns the decompressed size on success, or a negative value when
 * the stream cannot be reset or does not end cleanly in Z_STREAM_END.
 */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    /* Reuse the caller's stream state rather than re-running inflateInit. */
    if (inflateReset(stream) != Z_OK) {
        return -1;
    }

    stream->next_in = (uint8_t *)source;
    stream->avail_in = source_len;
    stream->next_out = dest;
    stream->avail_out = dest_len;

    /* A whole page must decompress in one shot, ending the zlib stream. */
    if (inflate(stream, Z_NO_FLUSH) != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}
2851
/*
 * Worker thread body: waits on param->cond for work (param->des set by
 * decompress_data_with_multi_threads), decompresses one page, then
 * signals completion under decomp_done_lock.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Snapshot the job and release the mutex while working */
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Mark this worker idle and wake any waiter */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
2890
/*
 * Block until every decompression worker has finished its current job.
 *
 * Returns 0 when compression is not in use, otherwise any error that a
 * worker recorded on decomp_file.
 */
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        /* done is cleared when a job is dispatched, set when it completes */
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}
2909
/*
 * Tear down the decompression worker threads and their per-thread state.
 *
 * Two passes: first ask every initialized worker to quit, then join and
 * free each one. The split avoids joining thread i while thread i+1 has
 * not yet been told to stop.
 */
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as a indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}
2950
/*
 * Create the decompression worker threads and their per-thread state.
 *
 * Returns 0 on success (or when compression is not in use), -1 on
 * failure, in which case any partially initialized workers are torn
 * down via compress_threads_load_cleanup().
 *
 * @f: QEMUFile that workers report decompression errors against
 */
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        /*
         * compbuf doubles as the "fully initialized" marker checked by
         * compress_threads_load_cleanup(), so it is allocated first.
         */
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}
2984
/*
 * Hand one compressed page to an idle decompression worker, waiting on
 * decomp_done_cond when every worker is busy.
 *
 * @f: stream to read the @len compressed bytes from
 * @host: destination address for the decompressed page
 * @len: compressed payload length
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        /* Scan for an idle worker */
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                /* Copy the payload and publish the job under both locks */
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            /* Dispatched */
            break;
        } else {
            /* All busy: wait for any worker to finish, then rescan */
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
3013
/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM, it is need to hold the global lock
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL,
                                                    false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                /* Roll back every cache allocated so far (reuses @block) */
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
        }
    }

    /*
     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
     * with to decide which page in cache should be flushed into SVM's RAM. Here
     * we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    ram_state_init(&ram_state);
    return 0;
}
3060
/* TODO: duplicated with ram_init_bitmaps */
/*
 * Start dirty-page tracking on the COLO secondary: sync and then drop
 * any pre-existing dirty records so counting starts from zero.
 * Lock order: iothread lock -> ramlist lock -> RCU read lock.
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3082
/* It is need to hold the global lock to call this helper */
/*
 * Stop dirty logging and release all COLO state: per-block bitmaps,
 * the whole-RAM caches, and finally the global RAMState.
 */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}
3104
3105 /**
3106 * ram_load_setup: Setup RAM for migration incoming side
3107 *
3108 * Returns zero to indicate success and negative for error
3109 *
3110 * @f: QEMUFile where to receive the data
3111 * @opaque: RAMState pointer
3112 */
3113 static int ram_load_setup(QEMUFile *f, void *opaque)
3114 {
3115 if (compress_threads_load_setup(f)) {
3116 return -1;
3117 }
3118
3119 xbzrle_load_setup();
3120 ramblock_recv_map_init();
3121
3122 return 0;
3123 }
3124
3125 static int ram_load_cleanup(void *opaque)
3126 {
3127 RAMBlock *rb;
3128
3129 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3130 qemu_ram_block_writeback(rb);
3131 }
3132
3133 xbzrle_load_cleanup();
3134 compress_threads_load_cleanup();
3135
3136 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3137 g_free(rb->receivedmap);
3138 rb->receivedmap = NULL;
3139 }
3140
3141 return 0;
3142 }
3143
3144 /**
3145 * ram_postcopy_incoming_init: allocate postcopy data structures
3146 *
3147 * Returns 0 for success and negative if there was one error
3148 *
3149 * @mis: current migration incoming state
3150 *
3151 * Allocate data structures etc needed by incoming migration with
3152 * postcopy-ram. postcopy-ram's similarly names
3153 * postcopy_ram_incoming_init does the work.
3154 */
3155 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3156 {
3157 return postcopy_ram_incoming_init(mis);
3158 }
3159
3160 /**
3161 * ram_load_postcopy: load a page in postcopy case
3162 *
3163 * Returns 0 for success or -errno in case of error
3164 *
3165 * Called in postcopy mode by ram_load().
3166 * rcu_read_lock is taken prior to this being called.
3167 *
3168 * @f: QEMUFile where to send the data
3169 */
3170 static int ram_load_postcopy(QEMUFile *f)
3171 {
3172 int flags = 0, ret = 0;
3173 bool place_needed = false;
3174 bool matches_target_page_size = false;
3175 MigrationIncomingState *mis = migration_incoming_get_current();
3176 /* Temporary page that is later 'placed' */
3177 void *postcopy_host_page = mis->postcopy_tmp_page;
3178 void *this_host = NULL;
3179 bool all_zero = true;
3180 int target_pages = 0;
3181
3182 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3183 ram_addr_t addr;
3184 void *host = NULL;
3185 void *page_buffer = NULL;
3186 void *place_source = NULL;
3187 RAMBlock *block = NULL;
3188 uint8_t ch;
3189 int len;
3190
3191 addr = qemu_get_be64(f);
3192
3193 /*
3194 * If qemu file error, we should stop here, and then "addr"
3195 * may be invalid
3196 */
3197 ret = qemu_file_get_error(f);
3198 if (ret) {
3199 break;
3200 }
3201
3202 flags = addr & ~TARGET_PAGE_MASK;
3203 addr &= TARGET_PAGE_MASK;
3204
3205 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3206 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3207 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3208 block = ram_block_from_stream(f, flags);
3209
3210 host = host_from_ram_block_offset(block, addr);
3211 if (!host) {
3212 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3213 ret = -EINVAL;
3214 break;
3215 }
3216 target_pages++;
3217 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3218 /*
3219 * Postcopy requires that we place whole host pages atomically;
3220 * these may be huge pages for RAMBlocks that are backed by
3221 * hugetlbfs.
3222 * To make it atomic, the data is read into a temporary page
3223 * that's moved into place later.
3224 * The migration protocol uses, possibly smaller, target-pages
3225 * however the source ensures it always sends all the components
3226 * of a host page in one chunk.
3227 */
3228 page_buffer = postcopy_host_page +
3229 ((uintptr_t)host & (block->page_size - 1));
3230 if (target_pages == 1) {
3231 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3232 block->page_size);
3233 } else {
3234 /* not the 1st TP within the HP */
3235 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3236 (uintptr_t)this_host) {
3237 error_report("Non-same host page %p/%p",
3238 host, this_host);
3239 ret = -EINVAL;
3240 break;
3241 }
3242 }
3243
3244 /*
3245 * If it's the last part of a host page then we place the host
3246 * page
3247 */
3248 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3249 place_needed = true;
3250 }
3251 place_source = postcopy_host_page;
3252 }
3253
3254 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3255 case RAM_SAVE_FLAG_ZERO:
3256 ch = qemu_get_byte(f);
3257 /*
3258 * Can skip to set page_buffer when
3259 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3260 */
3261 if (ch || !matches_target_page_size) {
3262 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3263 }
3264 if (ch) {
3265 all_zero = false;
3266 }
3267 break;
3268
3269 case RAM_SAVE_FLAG_PAGE:
3270 all_zero = false;
3271 if (!matches_target_page_size) {
3272 /* For huge pages, we always use temporary buffer */
3273 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3274 } else {
3275 /*
3276 * For small pages that matches target page size, we
3277 * avoid the qemu_file copy. Instead we directly use
3278 * the buffer of QEMUFile to place the page. Note: we
3279 * cannot do any QEMUFile operation before using that
3280 * buffer to make sure the buffer is valid when
3281 * placing the page.
3282 */
3283 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3284 TARGET_PAGE_SIZE);
3285 }
3286 break;
3287 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3288 all_zero = false;
3289 len = qemu_get_be32(f);
3290 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3291 error_report("Invalid compressed data length: %d", len);
3292 ret = -EINVAL;
3293 break;
3294 }
3295 decompress_data_with_multi_threads(f, page_buffer, len);
3296 break;
3297
3298 case RAM_SAVE_FLAG_EOS:
3299 /* normal exit */
3300 multifd_recv_sync_main();
3301 break;
3302 default:
3303 error_report("Unknown combination of migration flags: 0x%x"
3304 " (postcopy mode)", flags);
3305 ret = -EINVAL;
3306 break;
3307 }
3308
3309 /* Got the whole host page, wait for decompress before placing. */
3310 if (place_needed) {
3311 ret |= wait_for_decompress_done();
3312 }
3313
3314 /* Detect for any possible file errors */
3315 if (!ret && qemu_file_get_error(f)) {
3316 ret = qemu_file_get_error(f);
3317 }
3318
3319 if (!ret && place_needed) {
3320 /* This gets called at the last target page in the host page */
3321 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3322 block->page_size);
3323
3324 if (all_zero) {
3325 ret = postcopy_place_page_zero(mis, place_dest,
3326 block);
3327 } else {
3328 ret = postcopy_place_page(mis, place_dest,
3329 place_source, block);
3330 }
3331 place_needed = false;
3332 target_pages = 0;
3333 /* Assume we have a zero page until we detect something different */
3334 all_zero = true;
3335 }
3336 }
3337
3338 return ret;
3339 }
3340
3341 static bool postcopy_is_advised(void)
3342 {
3343 PostcopyState ps = postcopy_state_get();
3344 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3345 }
3346
3347 static bool postcopy_is_running(void)
3348 {
3349 PostcopyState ps = postcopy_state_get();
3350 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3351 }
3352
3353 /*
3354 * Flush content of RAM cache into SVM's memory.
3355 * Only flush the pages that be dirtied by PVM or SVM or both.
3356 */
3357 void colo_flush_ram_cache(void)
3358 {
3359 RAMBlock *block = NULL;
3360 void *dst_host;
3361 void *src_host;
3362 unsigned long offset = 0;
3363
3364 memory_global_dirty_log_sync();
3365 WITH_RCU_READ_LOCK_GUARD() {
3366 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3367 ramblock_sync_dirty_bitmap(ram_state, block);
3368 }
3369 }
3370
3371 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3372 WITH_RCU_READ_LOCK_GUARD() {
3373 block = QLIST_FIRST_RCU(&ram_list.blocks);
3374
3375 while (block) {
3376 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3377
3378 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3379 >= block->used_length) {
3380 offset = 0;
3381 block = QLIST_NEXT_RCU(block, next);
3382 } else {
3383 migration_bitmap_clear_dirty(ram_state, block, offset);
3384 dst_host = block->host
3385 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3386 src_host = block->colo_cache
3387 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3388 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3389 }
3390 }
3391 }
3392 trace_colo_flush_ram_cache_end();
3393 }
3394
3395 /**
3396 * ram_load_precopy: load pages in precopy case
3397 *
3398 * Returns 0 for success or -errno in case of error
3399 *
3400 * Called in precopy mode by ram_load().
3401 * rcu_read_lock is taken prior to this being called.
3402 *
3403 * @f: QEMUFile where to send the data
3404 */
3405 static int ram_load_precopy(QEMUFile *f)
3406 {
3407 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3408 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3409 bool postcopy_advised = postcopy_is_advised();
3410 if (!migrate_use_compression()) {
3411 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3412 }
3413
3414 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3415 ram_addr_t addr, total_ram_bytes;
3416 void *host = NULL, *host_bak = NULL;
3417 uint8_t ch;
3418
3419 /*
3420 * Yield periodically to let main loop run, but an iteration of
3421 * the main loop is expensive, so do it each some iterations
3422 */
3423 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3424 aio_co_schedule(qemu_get_current_aio_context(),
3425 qemu_coroutine_self());
3426 qemu_coroutine_yield();
3427 }
3428 i++;
3429
3430 addr = qemu_get_be64(f);
3431 flags = addr & ~TARGET_PAGE_MASK;
3432 addr &= TARGET_PAGE_MASK;
3433
3434 if (flags & invalid_flags) {
3435 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3436 error_report("Received an unexpected compressed page");
3437 }
3438
3439 ret = -EINVAL;
3440 break;
3441 }
3442
3443 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3444 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3445 RAMBlock *block = ram_block_from_stream(f, flags);
3446
3447 host = host_from_ram_block_offset(block, addr);
3448 /*
3449 * After going into COLO stage, we should not load the page
3450 * into SVM's memory directly, we put them into colo_cache firstly.
3451 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3452 * Previously, we copied all these memory in preparing stage of COLO
3453 * while we need to stop VM, which is a time-consuming process.
3454 * Here we optimize it by a trick, back-up every page while in
3455 * migration process while COLO is enabled, though it affects the
3456 * speed of the migration, but it obviously reduce the downtime of
3457 * back-up all SVM'S memory in COLO preparing stage.
3458 */
3459 if (migration_incoming_colo_enabled()) {
3460 if (migration_incoming_in_colo_state()) {
3461 /* In COLO stage, put all pages into cache temporarily */
3462 host = colo_cache_from_block_offset(block, addr, true);
3463 } else {
3464 /*
3465 * In migration stage but before COLO stage,
3466 * Put all pages into both cache and SVM's memory.
3467 */
3468 host_bak = colo_cache_from_block_offset(block, addr, false);
3469 }
3470 }
3471 if (!host) {
3472 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3473 ret = -EINVAL;
3474 break;
3475 }
3476 if (!migration_incoming_in_colo_state()) {
3477 ramblock_recv_bitmap_set(block, host);
3478 }
3479
3480 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3481 }
3482
3483 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3484 case RAM_SAVE_FLAG_MEM_SIZE:
3485 /* Synchronize RAM block list */
3486 total_ram_bytes = addr;
3487 while (!ret && total_ram_bytes) {
3488 RAMBlock *block;
3489 char id[256];
3490 ram_addr_t length;
3491
3492 len = qemu_get_byte(f);
3493 qemu_get_buffer(f, (uint8_t *)id, len);
3494 id[len] = 0;
3495 length = qemu_get_be64(f);
3496
3497 block = qemu_ram_block_by_name(id);
3498 if (block && !qemu_ram_is_migratable(block)) {
3499 error_report("block %s should not be migrated !", id);
3500 ret = -EINVAL;
3501 } else if (block) {
3502 if (length != block->used_length) {
3503 Error *local_err = NULL;
3504
3505 ret = qemu_ram_resize(block, length,
3506 &local_err);
3507 if (local_err) {
3508 error_report_err(local_err);
3509 }
3510 }
3511 /* For postcopy we need to check hugepage sizes match */
3512 if (postcopy_advised &&
3513 block->page_size != qemu_host_page_size) {
3514 uint64_t remote_page_size = qemu_get_be64(f);
3515 if (remote_page_size != block->page_size) {
3516 error_report("Mismatched RAM page size %s "
3517 "(local) %zd != %" PRId64,
3518 id, block->page_size,
3519 remote_page_size);
3520 ret = -EINVAL;
3521 }
3522 }
3523 if (migrate_ignore_shared()) {
3524 hwaddr addr = qemu_get_be64(f);
3525 if (ramblock_is_ignored(block) &&
3526 block->mr->addr != addr) {
3527 error_report("Mismatched GPAs for block %s "
3528 "%" PRId64 "!= %" PRId64,
3529 id, (uint64_t)addr,
3530 (uint64_t)block->mr->addr);
3531 ret = -EINVAL;
3532 }
3533 }
3534 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3535 block->idstr);
3536 } else {
3537 error_report("Unknown ramblock \"%s\", cannot "
3538 "accept migration", id);
3539 ret = -EINVAL;
3540 }
3541
3542 total_ram_bytes -= length;
3543 }
3544 break;
3545
3546 case RAM_SAVE_FLAG_ZERO:
3547 ch = qemu_get_byte(f);
3548 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3549 break;
3550
3551 case RAM_SAVE_FLAG_PAGE:
3552 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3553 break;
3554
3555 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3556 len = qemu_get_be32(f);
3557 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3558 error_report("Invalid compressed data length: %d", len);
3559 ret = -EINVAL;
3560 break;
3561 }
3562 decompress_data_with_multi_threads(f, host, len);
3563 break;
3564
3565 case RAM_SAVE_FLAG_XBZRLE:
3566 if (load_xbzrle(f, addr, host) < 0) {
3567 error_report("Failed to decompress XBZRLE page at "
3568 RAM_ADDR_FMT, addr);
3569 ret = -EINVAL;
3570 break;
3571 }
3572 break;
3573 case RAM_SAVE_FLAG_EOS:
3574 /* normal exit */
3575 multifd_recv_sync_main();
3576 break;
3577 default:
3578 if (flags & RAM_SAVE_FLAG_HOOK) {
3579 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3580 } else {
3581 error_report("Unknown combination of migration flags: 0x%x",
3582 flags);
3583 ret = -EINVAL;
3584 }
3585 }
3586 if (!ret) {
3587 ret = qemu_file_get_error(f);
3588 }
3589 if (!ret && host_bak) {
3590 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3591 }
3592 }
3593
3594 ret |= wait_for_decompress_done();
3595 return ret;
3596 }
3597
3598 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3599 {
3600 int ret = 0;
3601 static uint64_t seq_iter;
3602 /*
3603 * If system is running in postcopy mode, page inserts to host memory must
3604 * be atomic
3605 */
3606 bool postcopy_running = postcopy_is_running();
3607
3608 seq_iter++;
3609
3610 if (version_id != 4) {
3611 return -EINVAL;
3612 }
3613
3614 /*
3615 * This RCU critical section can be very long running.
3616 * When RCU reclaims in the code start to become numerous,
3617 * it will be necessary to reduce the granularity of this
3618 * critical section.
3619 */
3620 WITH_RCU_READ_LOCK_GUARD() {
3621 if (postcopy_running) {
3622 ret = ram_load_postcopy(f);
3623 } else {
3624 ret = ram_load_precopy(f);
3625 }
3626 }
3627 trace_ram_load_complete(ret, seq_iter);
3628
3629 return ret;
3630 }
3631
3632 static bool ram_has_postcopy(void *opaque)
3633 {
3634 RAMBlock *rb;
3635 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3636 if (ramblock_is_pmem(rb)) {
3637 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3638 "is not supported now!", rb->idstr, rb->host);
3639 return false;
3640 }
3641 }
3642
3643 return migrate_postcopy_ram();
3644 }
3645
3646 /* Sync all the dirty bitmap with destination VM. */
3647 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3648 {
3649 RAMBlock *block;
3650 QEMUFile *file = s->to_dst_file;
3651 int ramblock_count = 0;
3652
3653 trace_ram_dirty_bitmap_sync_start();
3654
3655 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3656 qemu_savevm_send_recv_bitmap(file, block->idstr);
3657 trace_ram_dirty_bitmap_request(block->idstr);
3658 ramblock_count++;
3659 }
3660
3661 trace_ram_dirty_bitmap_sync_wait();
3662
3663 /* Wait until all the ramblocks' dirty bitmap synced */
3664 while (ramblock_count--) {
3665 qemu_sem_wait(&s->rp_state.rp_sem);
3666 }
3667
3668 trace_ram_dirty_bitmap_sync_complete();
3669
3670 return 0;
3671 }
3672
/*
 * Post rp_sem once: wakes ram_dirty_bitmap_sync_all(), which waits on
 * this semaphore once per ramblock whose bitmap it requested.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3677
3678 /*
3679 * Read the received bitmap, revert it as the initial dirty bitmap.
3680 * This is only used when the postcopy migration is paused but wants
3681 * to resume from a middle point.
3682 */
3683 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3684 {
3685 int ret = -EINVAL;
3686 QEMUFile *file = s->rp_state.from_dst_file;
3687 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3688 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3689 uint64_t size, end_mark;
3690
3691 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3692
3693 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3694 error_report("%s: incorrect state %s", __func__,
3695 MigrationStatus_str(s->state));
3696 return -EINVAL;
3697 }
3698
3699 /*
3700 * Note: see comments in ramblock_recv_bitmap_send() on why we
3701 * need the endianness conversion, and the paddings.
3702 */
3703 local_size = ROUND_UP(local_size, 8);
3704
3705 /* Add paddings */
3706 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3707
3708 size = qemu_get_be64(file);
3709
3710 /* The size of the bitmap should match with our ramblock */
3711 if (size != local_size) {
3712 error_report("%s: ramblock '%s' bitmap size mismatch "
3713 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3714 block->idstr, size, local_size);
3715 ret = -EINVAL;
3716 goto out;
3717 }
3718
3719 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3720 end_mark = qemu_get_be64(file);
3721
3722 ret = qemu_file_get_error(file);
3723 if (ret || size != local_size) {
3724 error_report("%s: read bitmap failed for ramblock '%s': %d"
3725 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3726 __func__, block->idstr, ret, local_size, size);
3727 ret = -EIO;
3728 goto out;
3729 }
3730
3731 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3732 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3733 __func__, block->idstr, end_mark);
3734 ret = -EINVAL;
3735 goto out;
3736 }
3737
3738 /*
3739 * Endianness conversion. We are during postcopy (though paused).
3740 * The dirty bitmap won't change. We can directly modify it.
3741 */
3742 bitmap_from_le(block->bmap, le_bitmap, nbits);
3743
3744 /*
3745 * What we received is "received bitmap". Revert it as the initial
3746 * dirty bitmap for this ramblock.
3747 */
3748 bitmap_complement(block->bmap, block->bmap, nbits);
3749
3750 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3751
3752 /*
3753 * We succeeded to sync bitmap for current ramblock. If this is
3754 * the last one to sync, we need to notify the main send thread.
3755 */
3756 ram_dirty_bitmap_reload_notify(s);
3757
3758 ret = 0;
3759 out:
3760 g_free(le_bitmap);
3761 return ret;
3762 }
3763
3764 static int ram_resume_prepare(MigrationState *s, void *opaque)
3765 {
3766 RAMState *rs = *(RAMState **)opaque;
3767 int ret;
3768
3769 ret = ram_dirty_bitmap_sync_all(s, rs);
3770 if (ret) {
3771 return ret;
3772 }
3773
3774 ram_state_resume_prepare(rs, s->to_dst_file);
3775
3776 return 0;
3777 }
3778
/* Hooks wiring RAM migration into the generic savevm machinery */
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};
3792
/*
 * Register the "ram" live migration section (stream version 4) and
 * initialize the lock protecting the XBZRLE cache.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
}