xref: /openbmc/qemu/nbd/server.c (revision ef929281f1ddb1ce74f5fe39377a88e6cc8237aa)
1  /*
2   *  Copyright Red Hat
3   *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
4   *
5   *  Network Block Device Server Side
6   *
7   *  This program is free software; you can redistribute it and/or modify
8   *  it under the terms of the GNU General Public License as published by
9   *  the Free Software Foundation; under version 2 of the License.
10   *
11   *  This program is distributed in the hope that it will be useful,
12   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   *  GNU General Public License for more details.
15   *
16   *  You should have received a copy of the GNU General Public License
17   *  along with this program; if not, see <http://www.gnu.org/licenses/>.
18   */
19  
20  #include "qemu/osdep.h"
21  
22  #include "block/block_int.h"
23  #include "block/export.h"
24  #include "block/dirty-bitmap.h"
25  #include "qapi/error.h"
26  #include "qemu/queue.h"
27  #include "trace.h"
28  #include "nbd-internal.h"
29  #include "qemu/units.h"
30  #include "qemu/memalign.h"
31  
32  #define NBD_META_ID_BASE_ALLOCATION 0
33  #define NBD_META_ID_ALLOCATION_DEPTH 1
34  /* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
35  #define NBD_META_ID_DIRTY_BITMAP 2
36  
37  /*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
 * constant. If an increase is needed, note that the NBD protocol
 * recommends no larger than 32 MiB, so that the client won't consider
 * the reply as a denial of service attack.
42   */
43  #define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
44  
45  static int system_errno_to_nbd_errno(int err)
46  {
47      switch (err) {
48      case 0:
49          return NBD_SUCCESS;
50      case EPERM:
51      case EROFS:
52          return NBD_EPERM;
53      case EIO:
54          return NBD_EIO;
55      case ENOMEM:
56          return NBD_ENOMEM;
57  #ifdef EDQUOT
58      case EDQUOT:
59  #endif
60      case EFBIG:
61      case ENOSPC:
62          return NBD_ENOSPC;
63      case EOVERFLOW:
64          return NBD_EOVERFLOW;
65      case ENOTSUP:
66  #if ENOTSUP != EOPNOTSUPP
67      case EOPNOTSUPP:
68  #endif
69          return NBD_ENOTSUP;
70      case ESHUTDOWN:
71          return NBD_ESHUTDOWN;
72      case EINVAL:
73      default:
74          return NBD_EINVAL;
75      }
76  }
77  
78  /* Definitions for opaque data types */
79  
/*
 * Per-request bookkeeping.
 * NOTE(review): the code using this structure lies outside the visible
 * part of the file, so the field notes below are assumptions to confirm.
 */
typedef struct NBDRequestData NBDRequestData;

struct NBDRequestData {
    NBDClient *client; /* presumably the client the request belongs to */
    uint8_t *data; /* presumably the request's payload buffer - TODO confirm */
    bool complete; /* presumably set once the request has finished */
};
87  
/*
 * State of a single NBD export.  All exports are linked on the global
 * 'exports' list below.
 */
struct NBDExport {
    BlockExport common; /* generic export state; common.blk is the backend */

    char *name; /* export name advertised by NBD_OPT_LIST; may be NULL */
    char *description; /* optional human-readable description; may be NULL */
    uint64_t size; /* export size reported to the client */
    uint16_t nbdflags; /* base export flags; per-client bits are OR'ed in */
    QTAILQ_HEAD(, NBDClient) clients; /* clients attached to this export */
    QTAILQ_ENTRY(NBDExport) next; /* linkage on the global 'exports' list */

    /* NOTE(review): users of the eject notifier are not visible here */
    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;

    bool allocation_depth; /* whether qemu:allocation-depth can be selected */
    BdrvDirtyBitmap **export_bitmaps; /* bitmaps for qemu:dirty-bitmap: */
    size_t nr_export_bitmaps; /* number of entries in export_bitmaps */
};

/* Every export known to the server, in the order NBD_OPT_LIST reports them. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
107  
/*
 * NBDMetaContexts represents the set of meta contexts in use, as
 * selected by NBD_OPT_SET_META_CONTEXT. It is also used to collect the
 * matches for NBD_OPT_LIST_META_CONTEXT.
 */
struct NBDMetaContexts {
    const NBDExport *exp; /* associated export */
    size_t count; /* number of negotiated contexts */
    bool base_allocation; /* export base:allocation context (block status) */
    bool allocation_depth; /* export qemu:allocation-depth */
    bool *bitmaps; /*
                    * export qemu:dirty-bitmap:<export bitmap name>,
                    * one flag per bitmap, sized by exp->nr_export_bitmaps
                    */
};
123  
/*
 * Per-connection server state, covering both option negotiation and
 * (NOTE(review): presumably) the later transmission phase.
 */
struct NBDClient {
    int refcount; /* atomic */
    /* NOTE(review): callback invoked on close; callers not visible here */
    void (*close_fn)(NBDClient *client, bool negotiated);

    QemuMutex lock;

    NBDExport *exp; /* export selected by NBD_OPT_EXPORT_NAME or NBD_OPT_GO */
    QCryptoTLSCreds *tlscreds; /* credentials used for NBD_OPT_STARTTLS */
    char *tlsauthz; /* authorization id passed to the TLS server channel */
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine; /* protected by lock */

    CoMutex send_lock;
    Coroutine *send_coroutine;

    bool read_yielding; /* protected by lock */
    bool quiescing; /* protected by lock */

    QTAILQ_ENTRY(NBDClient) next; /* linkage on exp->clients */
    int nb_requests; /* protected by lock */
    bool closing; /* protected by lock */

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    NBDMode mode; /* negotiated protocol mode (structured, extended, ...) */
    NBDMetaContexts contexts; /* Negotiated meta contexts */

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};
157  
158  static void nbd_client_receive_next_request(NBDClient *client);
159  
160  /* Basic flow for negotiation
161  
162     Server         Client
163     Negotiate
164  
165     or
166  
167     Server         Client
168     Negotiate #1
169                    Option
170     Negotiate #2
171  
172     ----
173  
174     followed by
175  
176     Server         Client
177                    Request
178     Response
179                    Request
180     Response
181                    ...
182     ...
183                    Request (type == 2)
184  
185  */
186  
187  static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
188                                       uint32_t type, uint32_t length)
189  {
190      stq_be_p(&rep->magic, NBD_REP_MAGIC);
191      stl_be_p(&rep->option, option);
192      stl_be_p(&rep->type, type);
193      stl_be_p(&rep->length, length);
194  }
195  
196  /* Send a reply header, including length, but no payload.
197   * Return -errno on error, 0 on success. */
198  static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
199                                        uint32_t len, Error **errp)
200  {
201      NBDOptionReply rep;
202  
203      trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
204                                       type, nbd_rep_lookup(type), len);
205  
206      assert(len < NBD_MAX_BUFFER_SIZE);
207  
208      set_be_option_rep(&rep, client->opt, type, len);
209      return nbd_write(client->ioc, &rep, sizeof(rep), errp);
210  }
211  
212  /* Send a reply header with default 0 length.
213   * Return -errno on error, 0 on success. */
214  static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
215                                    Error **errp)
216  {
217      return nbd_negotiate_send_rep_len(client, type, 0, errp);
218  }
219  
220  /* Send an error reply.
221   * Return -errno on error, 0 on success. */
222  static int G_GNUC_PRINTF(4, 0)
223  nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
224                              Error **errp, const char *fmt, va_list va)
225  {
226      ERRP_GUARD();
227      g_autofree char *msg = NULL;
228      int ret;
229      size_t len;
230  
231      msg = g_strdup_vprintf(fmt, va);
232      len = strlen(msg);
233      assert(len < NBD_MAX_STRING_SIZE);
234      trace_nbd_negotiate_send_rep_err(msg);
235      ret = nbd_negotiate_send_rep_len(client, type, len, errp);
236      if (ret < 0) {
237          return ret;
238      }
239      if (nbd_write(client->ioc, msg, len, errp) < 0) {
240          error_prepend(errp, "write failed (error message): ");
241          return -EIO;
242      }
243  
244      return 0;
245  }
246  
/*
 * Return a newly allocated copy of @name suitable for use in an error
 * reply: names of 80 bytes or more are cut to their first 80 bytes and
 * get "..." appended, shorter names are copied verbatim.
 */
static char *
nbd_sanitize_name(const char *name)
{
    const size_t limit = 80;

    /* XXX Should we also try to sanitize any control characters? */
    if (strnlen(name, limit) >= limit) {
        return g_strdup_printf("%.80s...", name);
    }

    return g_strdup(name);
}
259  
260  /* Send an error reply.
261   * Return -errno on error, 0 on success. */
262  static int G_GNUC_PRINTF(4, 5)
263  nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
264                             Error **errp, const char *fmt, ...)
265  {
266      va_list va;
267      int ret;
268  
269      va_start(va, fmt);
270      ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
271      va_end(va);
272      return ret;
273  }
274  
275  /* Drop remainder of the current option, and send a reply with the
276   * given error type and message. Return -errno on read or write
277   * failure; or 0 if connection is still live. */
278  static int G_GNUC_PRINTF(4, 0)
279  nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
280                const char *fmt, va_list va)
281  {
282      int ret = nbd_drop(client->ioc, client->optlen, errp);
283  
284      client->optlen = 0;
285      if (!ret) {
286          ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
287      }
288      return ret;
289  }
290  
291  static int G_GNUC_PRINTF(4, 5)
292  nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
293               const char *fmt, ...)
294  {
295      int ret;
296      va_list va;
297  
298      va_start(va, fmt);
299      ret = nbd_opt_vdrop(client, type, errp, fmt, va);
300      va_end(va);
301  
302      return ret;
303  }
304  
305  static int G_GNUC_PRINTF(3, 4)
306  nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
307  {
308      int ret;
309      va_list va;
310  
311      va_start(va, fmt);
312      ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
313      va_end(va);
314  
315      return ret;
316  }
317  
318  /* Read size bytes from the unparsed payload of the current option.
319   * If @check_nul, require that no NUL bytes appear in buffer.
320   * Return -errno on I/O error, 0 if option was completely handled by
321   * sending a reply about inconsistent lengths, or 1 on success. */
322  static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
323                          bool check_nul, Error **errp)
324  {
325      if (size > client->optlen) {
326          return nbd_opt_invalid(client, errp,
327                                 "Inconsistent lengths in option %s",
328                                 nbd_opt_lookup(client->opt));
329      }
330      client->optlen -= size;
331      if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
332          return -EIO;
333      }
334  
335      if (check_nul && strnlen(buffer, size) != size) {
336          return nbd_opt_invalid(client, errp,
337                                 "Unexpected embedded NUL in option %s",
338                                 nbd_opt_lookup(client->opt));
339      }
340      return 1;
341  }
342  
343  /* Drop size bytes from the unparsed payload of the current option.
344   * Return -errno on I/O error, 0 if option was completely handled by
345   * sending a reply about inconsistent lengths, or 1 on success. */
346  static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
347  {
348      if (size > client->optlen) {
349          return nbd_opt_invalid(client, errp,
350                                 "Inconsistent lengths in option %s",
351                                 nbd_opt_lookup(client->opt));
352      }
353      client->optlen -= size;
354      return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
355  }
356  
357  /* nbd_opt_read_name
358   *
359   * Read a string with the format:
360   *   uint32_t len     (<= NBD_MAX_STRING_SIZE)
361   *   len bytes string (not 0-terminated)
362   *
363   * On success, @name will be allocated.
364   * If @length is non-null, it will be set to the actual string length.
365   *
366   * Return -errno on I/O error, 0 if option was completely handled by
367   * sending a reply about inconsistent lengths, or 1 on success.
368   */
369  static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
370                               Error **errp)
371  {
372      int ret;
373      uint32_t len;
374      g_autofree char *local_name = NULL;
375  
376      *name = NULL;
377      ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
378      if (ret <= 0) {
379          return ret;
380      }
381      len = cpu_to_be32(len);
382  
383      if (len > NBD_MAX_STRING_SIZE) {
384          return nbd_opt_invalid(client, errp,
385                                 "Invalid name length: %" PRIu32, len);
386      }
387  
388      local_name = g_malloc(len + 1);
389      ret = nbd_opt_read(client, local_name, len, true, errp);
390      if (ret <= 0) {
391          return ret;
392      }
393      local_name[len] = '\0';
394  
395      if (length) {
396          *length = len;
397      }
398      *name = g_steal_pointer(&local_name);
399  
400      return 1;
401  }
402  
403  /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
404   * Return -errno on error, 0 on success. */
405  static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
406                                         Error **errp)
407  {
408      ERRP_GUARD();
409      size_t name_len, desc_len;
410      uint32_t len;
411      const char *name = exp->name ? exp->name : "";
412      const char *desc = exp->description ? exp->description : "";
413      QIOChannel *ioc = client->ioc;
414      int ret;
415  
416      trace_nbd_negotiate_send_rep_list(name, desc);
417      name_len = strlen(name);
418      desc_len = strlen(desc);
419      assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
420      len = name_len + desc_len + sizeof(len);
421      ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
422      if (ret < 0) {
423          return ret;
424      }
425  
426      len = cpu_to_be32(name_len);
427      if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
428          error_prepend(errp, "write failed (name length): ");
429          return -EINVAL;
430      }
431  
432      if (nbd_write(ioc, name, name_len, errp) < 0) {
433          error_prepend(errp, "write failed (name buffer): ");
434          return -EINVAL;
435      }
436  
437      if (nbd_write(ioc, desc, desc_len, errp) < 0) {
438          error_prepend(errp, "write failed (description buffer): ");
439          return -EINVAL;
440      }
441  
442      return 0;
443  }
444  
445  /* Process the NBD_OPT_LIST command, with a potential series of replies.
446   * Return -errno on error, 0 on success. */
447  static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
448  {
449      NBDExport *exp;
450      assert(client->opt == NBD_OPT_LIST);
451  
452      /* For each export, send a NBD_REP_SERVER reply. */
453      QTAILQ_FOREACH(exp, &exports, next) {
454          if (nbd_negotiate_send_rep_list(client, exp, errp)) {
455              return -EINVAL;
456          }
457      }
458      /* Finish with a NBD_REP_ACK. */
459      return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
460  }
461  
462  static void nbd_check_meta_export(NBDClient *client, NBDExport *exp)
463  {
464      if (exp != client->contexts.exp) {
465          client->contexts.count = 0;
466      }
467  }
468  
469  /* Send a reply to NBD_OPT_EXPORT_NAME.
470   * Return -errno on error, 0 on success. */
471  static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
472                                              Error **errp)
473  {
474      ERRP_GUARD();
475      g_autofree char *name = NULL;
476      char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
477      size_t len;
478      int ret;
479      uint16_t myflags;
480  
481      /* Client sends:
482          [20 ..  xx]   export name (length bytes)
483         Server replies:
484          [ 0 ..   7]   size
485          [ 8 ..   9]   export flags
486          [10 .. 133]   reserved     (0) [unless no_zeroes]
487       */
488      trace_nbd_negotiate_handle_export_name();
489      if (client->mode >= NBD_MODE_EXTENDED) {
490          error_setg(errp, "Extended headers already negotiated");
491          return -EINVAL;
492      }
493      if (client->optlen > NBD_MAX_STRING_SIZE) {
494          error_setg(errp, "Bad length received");
495          return -EINVAL;
496      }
497      name = g_malloc(client->optlen + 1);
498      if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
499          return -EIO;
500      }
501      name[client->optlen] = '\0';
502      client->optlen = 0;
503  
504      trace_nbd_negotiate_handle_export_name_request(name);
505  
506      client->exp = nbd_export_find(name);
507      if (!client->exp) {
508          error_setg(errp, "export not found");
509          return -EINVAL;
510      }
511      nbd_check_meta_export(client, client->exp);
512  
513      myflags = client->exp->nbdflags;
514      if (client->mode >= NBD_MODE_STRUCTURED) {
515          myflags |= NBD_FLAG_SEND_DF;
516      }
517      if (client->mode >= NBD_MODE_EXTENDED && client->contexts.count) {
518          myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
519      }
520      trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
521      stq_be_p(buf, client->exp->size);
522      stw_be_p(buf + 8, myflags);
523      len = no_zeroes ? 10 : sizeof(buf);
524      ret = nbd_write(client->ioc, buf, len, errp);
525      if (ret < 0) {
526          error_prepend(errp, "write failed: ");
527          return ret;
528      }
529  
530      QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
531      blk_exp_ref(&client->exp->common);
532  
533      return 0;
534  }
535  
536  /* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
537   * The buffer does NOT include the info type prefix.
538   * Return -errno on error, 0 if ready to send more. */
539  static int nbd_negotiate_send_info(NBDClient *client,
540                                     uint16_t info, uint32_t length, void *buf,
541                                     Error **errp)
542  {
543      int rc;
544  
545      trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
546      rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
547                                      sizeof(info) + length, errp);
548      if (rc < 0) {
549          return rc;
550      }
551      info = cpu_to_be16(info);
552      if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
553          return -EIO;
554      }
555      if (nbd_write(client->ioc, buf, length, errp) < 0) {
556          return -EIO;
557      }
558      return 0;
559  }
560  
561  /* nbd_reject_length: Handle any unexpected payload.
562   * @fatal requests that we quit talking to the client, even if we are able
563   * to successfully send an error reply.
564   * Return:
565   * -errno  transmission error occurred or @fatal was requested, errp is set
566   * 0       error message successfully sent to client, errp is not set
567   */
568  static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
569  {
570      int ret;
571  
572      assert(client->optlen);
573      ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
574                            nbd_opt_lookup(client->opt));
575      if (fatal && !ret) {
576          error_setg(errp, "option '%s' has unexpected length",
577                     nbd_opt_lookup(client->opt));
578          return -EINVAL;
579      }
580      return ret;
581  }
582  
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 *
 * Parses the export name and the list of information requests from the
 * option payload, then replies with a series of NBD_REP_INFO chunks
 * (the name if requested, the description if the export has one, the
 * block sizes, and the export size/flags) followed by NBD_REP_ACK.
 * For NBD_OPT_GO the client is additionally attached to the export.
 *
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase.  */
static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen = 0;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    /* Bytes left over after the advertised requests are unexpected. */
    if (client->optlen) {
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        /* Shorten over-long names before echoing them back. */
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }
    if (client->opt == NBD_OPT_GO) {
        nbd_check_meta_export(client, exp);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    /* Convert in place to big-endian only after tracing host values. */
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED &&
        (client->contexts.count || client->opt == NBD_OPT_INFO)) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    /* buf layout: 8 bytes of export size, then 2 bytes of flags. */
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->common.blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    /* Only NBD_OPT_GO attaches the client to the export and takes a ref. */
    if (client->opt == NBD_OPT_GO) {
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        blk_exp_ref(&client->exp->common);
        rc = 1;
    }
    return rc;
}
750  
751  
752  /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
753   * new channel for all further (now-encrypted) communication. */
754  static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
755                                                   Error **errp)
756  {
757      QIOChannel *ioc;
758      QIOChannelTLS *tioc;
759      struct NBDTLSHandshakeData data = { 0 };
760  
761      assert(client->opt == NBD_OPT_STARTTLS);
762  
763      trace_nbd_negotiate_handle_starttls();
764      ioc = client->ioc;
765  
766      if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
767          return NULL;
768      }
769  
770      tioc = qio_channel_tls_new_server(ioc,
771                                        client->tlscreds,
772                                        client->tlsauthz,
773                                        errp);
774      if (!tioc) {
775          return NULL;
776      }
777  
778      qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
779      trace_nbd_negotiate_handle_starttls_handshake();
780      data.loop = g_main_loop_new(g_main_context_default(), FALSE);
781      qio_channel_tls_handshake(tioc,
782                                nbd_tls_handshake,
783                                &data,
784                                NULL,
785                                NULL);
786  
787      if (!data.complete) {
788          g_main_loop_run(data.loop);
789      }
790      g_main_loop_unref(data.loop);
791      if (data.error) {
792          object_unref(OBJECT(tioc));
793          error_propagate(errp, data.error);
794          return NULL;
795      }
796  
797      return QIO_CHANNEL(tioc);
798  }
799  
800  /* nbd_negotiate_send_meta_context
801   *
802   * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
803   *
804   * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
805   */
806  static int nbd_negotiate_send_meta_context(NBDClient *client,
807                                             const char *context,
808                                             uint32_t context_id,
809                                             Error **errp)
810  {
811      NBDOptionReplyMetaContext opt;
812      struct iovec iov[] = {
813          {.iov_base = &opt, .iov_len = sizeof(opt)},
814          {.iov_base = (void *)context, .iov_len = strlen(context)}
815      };
816  
817      assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
818      if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
819          context_id = 0;
820      }
821  
822      trace_nbd_negotiate_meta_query_reply(context, context_id);
823      set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
824                        sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
825      stl_be_p(&opt.context_id, context_id);
826  
827      return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
828  }
829  
830  /*
831   * Return true if @query matches @pattern, or if @query is empty when
832   * the @client is performing _LIST_.
833   */
834  static bool nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
835                                        const char *query)
836  {
837      if (!*query) {
838          trace_nbd_negotiate_meta_query_parse("empty");
839          return client->opt == NBD_OPT_LIST_META_CONTEXT;
840      }
841      if (strcmp(query, pattern) == 0) {
842          trace_nbd_negotiate_meta_query_parse(pattern);
843          return true;
844      }
845      trace_nbd_negotiate_meta_query_skip("pattern not matched");
846      return false;
847  }
848  
/*
 * If *@str starts with @prefix, advance *@str past the prefix and
 * return true; otherwise leave *@str alone and return false.
 */
static bool nbd_strshift(const char **str, const char *prefix)
{
    const size_t prefix_len = strlen(prefix);

    if (strncmp(*str, prefix, prefix_len) != 0) {
        return false;
    }

    *str += prefix_len;
    return true;
}
862  
863  /* nbd_meta_base_query
864   *
865   * Handle queries to 'base' namespace. For now, only the base:allocation
866   * context is available.  Return true if @query has been handled.
867   */
868  static bool nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
869                                  const char *query)
870  {
871      if (!nbd_strshift(&query, "base:")) {
872          return false;
873      }
874      trace_nbd_negotiate_meta_query_parse("base:");
875  
876      if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
877          meta->base_allocation = true;
878      }
879      return true;
880  }
881  
882  /* nbd_meta_qemu_query
883   *
884   * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
885   * and qemu:allocation-depth contexts are available.  Return true if @query
886   * has been handled.
887   */
888  static bool nbd_meta_qemu_query(NBDClient *client, NBDMetaContexts *meta,
889                                  const char *query)
890  {
891      size_t i;
892  
893      if (!nbd_strshift(&query, "qemu:")) {
894          return false;
895      }
896      trace_nbd_negotiate_meta_query_parse("qemu:");
897  
898      if (!*query) {
899          if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
900              meta->allocation_depth = meta->exp->allocation_depth;
901              if (meta->exp->nr_export_bitmaps) {
902                  memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
903              }
904          }
905          trace_nbd_negotiate_meta_query_parse("empty");
906          return true;
907      }
908  
909      if (strcmp(query, "allocation-depth") == 0) {
910          trace_nbd_negotiate_meta_query_parse("allocation-depth");
911          meta->allocation_depth = meta->exp->allocation_depth;
912          return true;
913      }
914  
915      if (nbd_strshift(&query, "dirty-bitmap:")) {
916          trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
917          if (!*query) {
918              if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
919                  meta->exp->nr_export_bitmaps) {
920                  memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
921              }
922              trace_nbd_negotiate_meta_query_parse("empty");
923              return true;
924          }
925  
926          for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
927              const char *bm_name;
928  
929              bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
930              if (strcmp(bm_name, query) == 0) {
931                  meta->bitmaps[i] = true;
932                  trace_nbd_negotiate_meta_query_parse(query);
933                  return true;
934              }
935          }
936          trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
937          return true;
938      }
939  
940      trace_nbd_negotiate_meta_query_skip("unknown qemu context");
941      return true;
942  }
943  
944  /* nbd_negotiate_meta_query
945   *
946   * Parse namespace name and call corresponding function to parse body of the
947   * query.
948   *
949   * The only supported namespaces are 'base' and 'qemu'.
950   *
951   * Return -errno on I/O error, 0 if option was completely handled by
952   * sending a reply about inconsistent lengths, or 1 on success. */
953  static int nbd_negotiate_meta_query(NBDClient *client,
954                                      NBDMetaContexts *meta, Error **errp)
955  {
956      int ret;
957      g_autofree char *query = NULL;
958      uint32_t len;
959  
960      ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
961      if (ret <= 0) {
962          return ret;
963      }
964      len = cpu_to_be32(len);
965  
966      if (len > NBD_MAX_STRING_SIZE) {
967          trace_nbd_negotiate_meta_query_skip("length too long");
968          return nbd_opt_skip(client, len, errp);
969      }
970  
971      query = g_malloc(len + 1);
972      ret = nbd_opt_read(client, query, len, true, errp);
973      if (ret <= 0) {
974          return ret;
975      }
976      query[len] = '\0';
977  
978      if (nbd_meta_base_query(client, meta, query)) {
979          return 1;
980      }
981      if (nbd_meta_qemu_query(client, meta, query)) {
982          return 1;
983      }
984  
985      trace_nbd_negotiate_meta_query_skip("unknown namespace");
986      return 1;
987  }
988  
989  /* nbd_negotiate_meta_queries
990   * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
991   *
992   * Return -errno on I/O error, or 0 if option was completely handled. */
993  static int nbd_negotiate_meta_queries(NBDClient *client, Error **errp)
994  {
995      int ret;
996      g_autofree char *export_name = NULL;
997      /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
998      g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
999      NBDMetaContexts local_meta = {0};
1000      NBDMetaContexts *meta;
1001      uint32_t nb_queries;
1002      size_t i;
1003      size_t count = 0;
1004  
1005      if (client->opt == NBD_OPT_SET_META_CONTEXT &&
1006          client->mode < NBD_MODE_STRUCTURED) {
1007          return nbd_opt_invalid(client, errp,
1008                                 "request option '%s' when structured reply "
1009                                 "is not negotiated",
1010                                 nbd_opt_lookup(client->opt));
1011      }
1012  
1013      if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
1014          /* Only change the caller's meta on SET. */
1015          meta = &local_meta;
1016      } else {
1017          meta = &client->contexts;
1018      }
1019  
1020      g_free(meta->bitmaps);
1021      memset(meta, 0, sizeof(*meta));
1022  
1023      ret = nbd_opt_read_name(client, &export_name, NULL, errp);
1024      if (ret <= 0) {
1025          return ret;
1026      }
1027  
1028      meta->exp = nbd_export_find(export_name);
1029      if (meta->exp == NULL) {
1030          g_autofree char *sane_name = nbd_sanitize_name(export_name);
1031  
1032          return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
1033                              "export '%s' not present", sane_name);
1034      }
1035      meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
1036      if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
1037          bitmaps = meta->bitmaps;
1038      }
1039  
1040      ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
1041      if (ret <= 0) {
1042          return ret;
1043      }
1044      nb_queries = cpu_to_be32(nb_queries);
1045      trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
1046                                       export_name, nb_queries);
1047  
1048      if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
1049          /* enable all known contexts */
1050          meta->base_allocation = true;
1051          meta->allocation_depth = meta->exp->allocation_depth;
1052          if (meta->exp->nr_export_bitmaps) {
1053              memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
1054          }
1055      } else {
1056          for (i = 0; i < nb_queries; ++i) {
1057              ret = nbd_negotiate_meta_query(client, meta, errp);
1058              if (ret <= 0) {
1059                  return ret;
1060              }
1061          }
1062      }
1063  
1064      if (meta->base_allocation) {
1065          ret = nbd_negotiate_send_meta_context(client, "base:allocation",
1066                                                NBD_META_ID_BASE_ALLOCATION,
1067                                                errp);
1068          if (ret < 0) {
1069              return ret;
1070          }
1071          count++;
1072      }
1073  
1074      if (meta->allocation_depth) {
1075          ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
1076                                                NBD_META_ID_ALLOCATION_DEPTH,
1077                                                errp);
1078          if (ret < 0) {
1079              return ret;
1080          }
1081          count++;
1082      }
1083  
1084      for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
1085          const char *bm_name;
1086          g_autofree char *context = NULL;
1087  
1088          if (!meta->bitmaps[i]) {
1089              continue;
1090          }
1091  
1092          bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
1093          context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);
1094  
1095          ret = nbd_negotiate_send_meta_context(client, context,
1096                                                NBD_META_ID_DIRTY_BITMAP + i,
1097                                                errp);
1098          if (ret < 0) {
1099              return ret;
1100          }
1101          count++;
1102      }
1103  
1104      ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1105      if (ret == 0) {
1106          meta->count = count;
1107      }
1108  
1109      return ret;
1110  }
1111  
1112  /* nbd_negotiate_options
1113   * Process all NBD_OPT_* client option commands, during fixed newstyle
1114   * negotiation.
1115   * Return:
1116   * -errno  on error, errp is set
1117   * 0       on successful negotiation, errp is not set
1118   * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1119   *         errp is not set
1120   */
1121  static int nbd_negotiate_options(NBDClient *client, Error **errp)
1122  {
1123      uint32_t flags;
1124      bool fixedNewstyle = false;
1125      bool no_zeroes = false;
1126  
1127      /* Client sends:
1128          [ 0 ..   3]   client flags
1129  
1130         Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
1131          [ 0 ..   7]   NBD_OPTS_MAGIC
1132          [ 8 ..  11]   NBD option
1133          [12 ..  15]   Data length
1134          ...           Rest of request
1135  
1136          [ 0 ..   7]   NBD_OPTS_MAGIC
1137          [ 8 ..  11]   Second NBD option
1138          [12 ..  15]   Data length
1139          ...           Rest of request
1140      */
1141  
1142      if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
1143          return -EIO;
1144      }
1145      client->mode = NBD_MODE_EXPORT_NAME;
1146      trace_nbd_negotiate_options_flags(flags);
1147      if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
1148          fixedNewstyle = true;
1149          flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
1150          client->mode = NBD_MODE_SIMPLE;
1151      }
1152      if (flags & NBD_FLAG_C_NO_ZEROES) {
1153          no_zeroes = true;
1154          flags &= ~NBD_FLAG_C_NO_ZEROES;
1155      }
1156      if (flags != 0) {
1157          error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
1158          return -EINVAL;
1159      }
1160  
1161      while (1) {
1162          int ret;
1163          uint32_t option, length;
1164          uint64_t magic;
1165  
1166          if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
1167              return -EINVAL;
1168          }
1169          trace_nbd_negotiate_options_check_magic(magic);
1170          if (magic != NBD_OPTS_MAGIC) {
1171              error_setg(errp, "Bad magic received");
1172              return -EINVAL;
1173          }
1174  
1175          if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
1176              return -EINVAL;
1177          }
1178          client->opt = option;
1179  
1180          if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
1181              return -EINVAL;
1182          }
1183          assert(!client->optlen);
1184          client->optlen = length;
1185  
1186          if (length > NBD_MAX_BUFFER_SIZE) {
1187              error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
1188                         length, NBD_MAX_BUFFER_SIZE);
1189              return -EINVAL;
1190          }
1191  
1192          trace_nbd_negotiate_options_check_option(option,
1193                                                   nbd_opt_lookup(option));
1194          if (client->tlscreds &&
1195              client->ioc == (QIOChannel *)client->sioc) {
1196              QIOChannel *tioc;
1197              if (!fixedNewstyle) {
1198                  error_setg(errp, "Unsupported option 0x%" PRIx32, option);
1199                  return -EINVAL;
1200              }
1201              switch (option) {
1202              case NBD_OPT_STARTTLS:
1203                  if (length) {
1204                      /* Unconditionally drop the connection if the client
1205                       * can't start a TLS negotiation correctly */
1206                      return nbd_reject_length(client, true, errp);
1207                  }
1208                  tioc = nbd_negotiate_handle_starttls(client, errp);
1209                  if (!tioc) {
1210                      return -EIO;
1211                  }
1212                  ret = 0;
1213                  object_unref(OBJECT(client->ioc));
1214                  client->ioc = tioc;
1215                  break;
1216  
1217              case NBD_OPT_EXPORT_NAME:
1218                  /* No way to return an error to client, so drop connection */
1219                  error_setg(errp, "Option 0x%x not permitted before TLS",
1220                             option);
1221                  return -EINVAL;
1222  
1223              default:
1224                  /* Let the client keep trying, unless they asked to
1225                   * quit. Always try to give an error back to the
1226                   * client; but when replying to OPT_ABORT, be aware
1227                   * that the client may hang up before receiving the
1228                   * error, in which case we are fine ignoring the
1229                   * resulting EPIPE. */
1230                  ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
1231                                     option == NBD_OPT_ABORT ? NULL : errp,
1232                                     "Option 0x%" PRIx32
1233                                     " not permitted before TLS", option);
1234                  if (option == NBD_OPT_ABORT) {
1235                      return 1;
1236                  }
1237                  break;
1238              }
1239          } else if (fixedNewstyle) {
1240              switch (option) {
1241              case NBD_OPT_LIST:
1242                  if (length) {
1243                      ret = nbd_reject_length(client, false, errp);
1244                  } else {
1245                      ret = nbd_negotiate_handle_list(client, errp);
1246                  }
1247                  break;
1248  
1249              case NBD_OPT_ABORT:
1250                  /* NBD spec says we must try to reply before
1251                   * disconnecting, but that we must also tolerate
1252                   * guests that don't wait for our reply. */
1253                  nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
1254                  return 1;
1255  
1256              case NBD_OPT_EXPORT_NAME:
1257                  return nbd_negotiate_handle_export_name(client, no_zeroes,
1258                                                          errp);
1259  
1260              case NBD_OPT_INFO:
1261              case NBD_OPT_GO:
1262                  ret = nbd_negotiate_handle_info(client, errp);
1263                  if (ret == 1) {
1264                      assert(option == NBD_OPT_GO);
1265                      return 0;
1266                  }
1267                  break;
1268  
1269              case NBD_OPT_STARTTLS:
1270                  if (length) {
1271                      ret = nbd_reject_length(client, false, errp);
1272                  } else if (client->tlscreds) {
1273                      ret = nbd_negotiate_send_rep_err(client,
1274                                                       NBD_REP_ERR_INVALID, errp,
1275                                                       "TLS already enabled");
1276                  } else {
1277                      ret = nbd_negotiate_send_rep_err(client,
1278                                                       NBD_REP_ERR_POLICY, errp,
1279                                                       "TLS not configured");
1280                  }
1281                  break;
1282  
1283              case NBD_OPT_STRUCTURED_REPLY:
1284                  if (length) {
1285                      ret = nbd_reject_length(client, false, errp);
1286                  } else if (client->mode >= NBD_MODE_EXTENDED) {
1287                      ret = nbd_negotiate_send_rep_err(
1288                          client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
1289                          "extended headers already negotiated");
1290                  } else if (client->mode >= NBD_MODE_STRUCTURED) {
1291                      ret = nbd_negotiate_send_rep_err(
1292                          client, NBD_REP_ERR_INVALID, errp,
1293                          "structured reply already negotiated");
1294                  } else {
1295                      ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1296                      client->mode = NBD_MODE_STRUCTURED;
1297                  }
1298                  break;
1299  
1300              case NBD_OPT_LIST_META_CONTEXT:
1301              case NBD_OPT_SET_META_CONTEXT:
1302                  ret = nbd_negotiate_meta_queries(client, errp);
1303                  break;
1304  
1305              case NBD_OPT_EXTENDED_HEADERS:
1306                  if (length) {
1307                      ret = nbd_reject_length(client, false, errp);
1308                  } else if (client->mode >= NBD_MODE_EXTENDED) {
1309                      ret = nbd_negotiate_send_rep_err(
1310                          client, NBD_REP_ERR_INVALID, errp,
1311                          "extended headers already negotiated");
1312                  } else {
1313                      ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
1314                      client->mode = NBD_MODE_EXTENDED;
1315                  }
1316                  break;
1317  
1318              default:
1319                  ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
1320                                     "Unsupported option %" PRIu32 " (%s)",
1321                                     option, nbd_opt_lookup(option));
1322                  break;
1323              }
1324          } else {
1325              /*
1326               * If broken new-style we should drop the connection
1327               * for anything except NBD_OPT_EXPORT_NAME
1328               */
1329              switch (option) {
1330              case NBD_OPT_EXPORT_NAME:
1331                  return nbd_negotiate_handle_export_name(client, no_zeroes,
1332                                                          errp);
1333  
1334              default:
1335                  error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
1336                             option, nbd_opt_lookup(option));
1337                  return -EINVAL;
1338              }
1339          }
1340          if (ret < 0) {
1341              return ret;
1342          }
1343      }
1344  }
1345  
1346  /* nbd_negotiate
1347   * Return:
1348   * -errno  on error, errp is set
1349   * 0       on successful negotiation, errp is not set
1350   * 1       if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
1351   *         errp is not set
1352   */
1353  static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
1354  {
1355      ERRP_GUARD();
1356      char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
1357      int ret;
1358  
1359      /* Old style negotiation header, no room for options
1360          [ 0 ..   7]   passwd       ("NBDMAGIC")
1361          [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
1362          [16 ..  23]   size
1363          [24 ..  27]   export flags (zero-extended)
1364          [28 .. 151]   reserved     (0)
1365  
1366         New style negotiation header, client can send options
1367          [ 0 ..   7]   passwd       ("NBDMAGIC")
1368          [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
1369          [16 ..  17]   server flags (0)
1370          ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
1371       */
1372  
1373      qio_channel_set_blocking(client->ioc, false, NULL);
1374      qio_channel_set_follow_coroutine_ctx(client->ioc, true);
1375  
1376      trace_nbd_negotiate_begin();
1377      memcpy(buf, "NBDMAGIC", 8);
1378  
1379      stq_be_p(buf + 8, NBD_OPTS_MAGIC);
1380      stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);
1381  
1382      if (nbd_write(client->ioc, buf, 18, errp) < 0) {
1383          error_prepend(errp, "write failed: ");
1384          return -EINVAL;
1385      }
1386      ret = nbd_negotiate_options(client, errp);
1387      if (ret != 0) {
1388          if (ret < 0) {
1389              error_prepend(errp, "option negotiation failed: ");
1390          }
1391          return ret;
1392      }
1393  
1394      assert(!client->optlen);
1395      trace_nbd_negotiate_success();
1396  
1397      return 0;
1398  }
1399  
1400  /* nbd_read_eof
1401   * Tries to read @size bytes from @ioc. This is a local implementation of
1402   * qio_channel_readv_all_eof. We have it here because we need it to be
1403   * interruptible and to know when the coroutine is yielding.
1404   * Returns 1 on success
1405   *         0 on eof, when no data was read (errp is not set)
1406   *         negative errno on failure (errp is set)
1407   */
1408  static inline int coroutine_fn
1409  nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
1410  {
1411      bool partial = false;
1412  
1413      assert(size);
1414      while (size > 0) {
1415          struct iovec iov = { .iov_base = buffer, .iov_len = size };
1416          ssize_t len;
1417  
1418          len = qio_channel_readv(client->ioc, &iov, 1, errp);
1419          if (len == QIO_CHANNEL_ERR_BLOCK) {
1420              WITH_QEMU_LOCK_GUARD(&client->lock) {
1421                  client->read_yielding = true;
1422  
1423                  /* Prompt main loop thread to re-run nbd_drained_poll() */
1424                  aio_wait_kick();
1425              }
1426              qio_channel_yield(client->ioc, G_IO_IN);
1427              WITH_QEMU_LOCK_GUARD(&client->lock) {
1428                  client->read_yielding = false;
1429                  if (client->quiescing) {
1430                      return -EAGAIN;
1431                  }
1432              }
1433              continue;
1434          } else if (len < 0) {
1435              return -EIO;
1436          } else if (len == 0) {
1437              if (partial) {
1438                  error_setg(errp,
1439                             "Unexpected end-of-file before all bytes were read");
1440                  return -EIO;
1441              } else {
1442                  return 0;
1443              }
1444          }
1445  
1446          partial = true;
1447          size -= len;
1448          buffer = (uint8_t *) buffer + len;
1449      }
1450      return 1;
1451  }
1452  
1453  static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request,
1454                                              Error **errp)
1455  {
1456      uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
1457      uint32_t magic, expect;
1458      int ret;
1459      size_t size = client->mode >= NBD_MODE_EXTENDED ?
1460          NBD_EXTENDED_REQUEST_SIZE : NBD_REQUEST_SIZE;
1461  
1462      ret = nbd_read_eof(client, buf, size, errp);
1463      if (ret < 0) {
1464          return ret;
1465      }
1466      if (ret == 0) {
1467          return -EIO;
1468      }
1469  
1470      /*
1471       * Compact request
1472       *  [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
1473       *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
1474       *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
1475       *  [ 8 .. 15]   cookie
1476       *  [16 .. 23]   from
1477       *  [24 .. 27]   len
1478       * Extended request
1479       *  [ 0 ..  3]   magic   (NBD_EXTENDED_REQUEST_MAGIC)
1480       *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, NBD_CMD_FLAG_PAYLOAD_LEN, ...)
1481       *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
1482       *  [ 8 .. 15]   cookie
1483       *  [16 .. 23]   from
1484       *  [24 .. 31]   len
1485       */
1486  
1487      magic = ldl_be_p(buf);
1488      request->flags  = lduw_be_p(buf + 4);
1489      request->type   = lduw_be_p(buf + 6);
1490      request->cookie = ldq_be_p(buf + 8);
1491      request->from   = ldq_be_p(buf + 16);
1492      if (client->mode >= NBD_MODE_EXTENDED) {
1493          request->len = ldq_be_p(buf + 24);
1494          expect = NBD_EXTENDED_REQUEST_MAGIC;
1495      } else {
1496          request->len = (uint32_t)ldl_be_p(buf + 24); /* widen 32 to 64 bits */
1497          expect = NBD_REQUEST_MAGIC;
1498      }
1499  
1500      trace_nbd_receive_request(magic, request->flags, request->type,
1501                                request->from, request->len);
1502  
1503      if (magic != expect) {
1504          error_setg(errp, "invalid magic (got 0x%" PRIx32 ", expected 0x%"
1505                     PRIx32 ")", magic, expect);
1506          return -EINVAL;
1507      }
1508      return 0;
1509  }
1510  
1511  #define MAX_NBD_REQUESTS 16
1512  
1513  /* Runs in export AioContext and main loop thread */
1514  void nbd_client_get(NBDClient *client)
1515  {
1516      qatomic_inc(&client->refcount);
1517  }
1518  
1519  void nbd_client_put(NBDClient *client)
1520  {
1521      assert(qemu_in_main_thread());
1522  
1523      if (qatomic_fetch_dec(&client->refcount) == 1) {
1524          /* The last reference should be dropped by client->close,
1525           * which is called by client_close.
1526           */
1527          assert(client->closing);
1528  
1529          object_unref(OBJECT(client->sioc));
1530          object_unref(OBJECT(client->ioc));
1531          if (client->tlscreds) {
1532              object_unref(OBJECT(client->tlscreds));
1533          }
1534          g_free(client->tlsauthz);
1535          if (client->exp) {
1536              QTAILQ_REMOVE(&client->exp->clients, client, next);
1537              blk_exp_unref(&client->exp->common);
1538          }
1539          g_free(client->contexts.bitmaps);
1540          qemu_mutex_destroy(&client->lock);
1541          g_free(client);
1542      }
1543  }
1544  
1545  /*
1546   * Tries to release the reference to @client, but only if other references
1547   * remain. This is an optimization for the common case where we want to avoid
1548   * the expense of scheduling nbd_client_put() in the main loop thread.
1549   *
1550   * Returns true upon success or false if the reference was not released because
1551   * it is the last reference.
1552   */
1553  static bool nbd_client_put_nonzero(NBDClient *client)
1554  {
1555      int old = qatomic_read(&client->refcount);
1556      int expected;
1557  
1558      do {
1559          if (old == 1) {
1560              return false;
1561          }
1562  
1563          expected = old;
1564          old = qatomic_cmpxchg(&client->refcount, expected, expected - 1);
1565      } while (old != expected);
1566  
1567      return true;
1568  }
1569  
1570  static void client_close(NBDClient *client, bool negotiated)
1571  {
1572      assert(qemu_in_main_thread());
1573  
1574      WITH_QEMU_LOCK_GUARD(&client->lock) {
1575          if (client->closing) {
1576              return;
1577          }
1578  
1579          client->closing = true;
1580      }
1581  
1582      /* Force requests to finish.  They will drop their own references,
1583       * then we'll close the socket and free the NBDClient.
1584       */
1585      qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
1586                           NULL);
1587  
1588      /* Also tell the client, so that they release their reference.  */
1589      if (client->close_fn) {
1590          client->close_fn(client, negotiated);
1591      }
1592  }
1593  
1594  /* Runs in export AioContext with client->lock held */
1595  static NBDRequestData *nbd_request_get(NBDClient *client)
1596  {
1597      NBDRequestData *req;
1598  
1599      assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1600      client->nb_requests++;
1601  
1602      req = g_new0(NBDRequestData, 1);
1603      req->client = client;
1604      return req;
1605  }
1606  
1607  /* Runs in export AioContext with client->lock held */
1608  static void nbd_request_put(NBDRequestData *req)
1609  {
1610      NBDClient *client = req->client;
1611  
1612      if (req->data) {
1613          qemu_vfree(req->data);
1614      }
1615      g_free(req);
1616  
1617      client->nb_requests--;
1618  
1619      if (client->quiescing && client->nb_requests == 0) {
1620          aio_wait_kick();
1621      }
1622  
1623      nbd_client_receive_next_request(client);
1624  }
1625  
1626  static void blk_aio_attached(AioContext *ctx, void *opaque)
1627  {
1628      NBDExport *exp = opaque;
1629      NBDClient *client;
1630  
1631      assert(qemu_in_main_thread());
1632  
1633      trace_nbd_blk_aio_attached(exp->name, ctx);
1634  
1635      exp->common.ctx = ctx;
1636  
1637      QTAILQ_FOREACH(client, &exp->clients, next) {
1638          WITH_QEMU_LOCK_GUARD(&client->lock) {
1639              assert(client->nb_requests == 0);
1640              assert(client->recv_coroutine == NULL);
1641              assert(client->send_coroutine == NULL);
1642          }
1643      }
1644  }
1645  
1646  static void blk_aio_detach(void *opaque)
1647  {
1648      NBDExport *exp = opaque;
1649  
1650      assert(qemu_in_main_thread());
1651  
1652      trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);
1653  
1654      exp->common.ctx = NULL;
1655  }
1656  
1657  static void nbd_drained_begin(void *opaque)
1658  {
1659      NBDExport *exp = opaque;
1660      NBDClient *client;
1661  
1662      assert(qemu_in_main_thread());
1663  
1664      QTAILQ_FOREACH(client, &exp->clients, next) {
1665          WITH_QEMU_LOCK_GUARD(&client->lock) {
1666              client->quiescing = true;
1667          }
1668      }
1669  }
1670  
1671  static void nbd_drained_end(void *opaque)
1672  {
1673      NBDExport *exp = opaque;
1674      NBDClient *client;
1675  
1676      assert(qemu_in_main_thread());
1677  
1678      QTAILQ_FOREACH(client, &exp->clients, next) {
1679          WITH_QEMU_LOCK_GUARD(&client->lock) {
1680              client->quiescing = false;
1681              nbd_client_receive_next_request(client);
1682          }
1683      }
1684  }
1685  
1686  /* Runs in export AioContext */
1687  static void nbd_wake_read_bh(void *opaque)
1688  {
1689      NBDClient *client = opaque;
1690      qio_channel_wake_read(client->ioc);
1691  }
1692  
1693  static bool nbd_drained_poll(void *opaque)
1694  {
1695      NBDExport *exp = opaque;
1696      NBDClient *client;
1697  
1698      assert(qemu_in_main_thread());
1699  
1700      QTAILQ_FOREACH(client, &exp->clients, next) {
1701          WITH_QEMU_LOCK_GUARD(&client->lock) {
1702              if (client->nb_requests != 0) {
1703                  /*
1704                   * If there's a coroutine waiting for a request on nbd_read_eof()
1705                   * enter it here so we don't depend on the client to wake it up.
1706                   *
1707                   * Schedule a BH in the export AioContext to avoid missing the
1708                   * wake up due to the race between qio_channel_wake_read() and
1709                   * qio_channel_yield().
1710                   */
1711                  if (client->recv_coroutine != NULL && client->read_yielding) {
1712                      aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp),
1713                                              nbd_wake_read_bh, client);
1714                  }
1715  
1716                  return true;
1717              }
1718          }
1719      }
1720  
1721      return false;
1722  }
1723  
1724  static void nbd_eject_notifier(Notifier *n, void *data)
1725  {
1726      NBDExport *exp = container_of(n, NBDExport, eject_notifier);
1727  
1728      assert(qemu_in_main_thread());
1729  
1730      blk_exp_request_shutdown(&exp->common);
1731  }
1732  
1733  void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
1734  {
1735      NBDExport *nbd_exp = container_of(exp, NBDExport, common);
1736      assert(exp->drv == &blk_exp_nbd);
1737      assert(nbd_exp->eject_notifier_blk == NULL);
1738  
1739      blk_ref(blk);
1740      nbd_exp->eject_notifier_blk = blk;
1741      nbd_exp->eject_notifier.notify = nbd_eject_notifier;
1742      blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
1743  }
1744  
1745  static const BlockDevOps nbd_block_ops = {
1746      .drained_begin = nbd_drained_begin,
1747      .drained_end = nbd_drained_end,
1748      .drained_poll = nbd_drained_poll,
1749  };
1750  
/*
 * BlockExportDriver.create callback for NBD exports.
 *
 * Validates the options in @exp_args, then initializes the NBDExport that
 * embeds @blk_exp: name, description, advertised NBD flags, size, the list
 * of exported dirty bitmaps and the allocation-depth setting.  On success
 * the export is appended to the global 'exports' list.
 *
 * Returns 0 on success.  On failure returns a negative errno with @errp
 * set; the strings and the bitmap array allocated here are released again,
 * and no bitmap has been marked busy.
 */
static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
                             Error **errp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    BlockExportOptionsNbd *arg = &exp_args->u.nbd;
    /* The export name defaults to the node name when none is given. */
    const char *name = arg->name ?: exp_args->node_name;
    BlockBackend *blk = blk_exp->blk;
    int64_t size;
    uint64_t perm, shared_perm;
    bool readonly = !exp_args->writable;
    BlockDirtyBitmapOrStrList *bitmaps;
    size_t i;
    int ret;

    GLOBAL_STATE_CODE();
    assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);

    if (!nbd_server_is_running()) {
        error_setg(errp, "NBD server not running");
        return -EINVAL;
    }

    if (strlen(name) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "export name '%s' too long", name);
        return -EINVAL;
    }

    if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "description '%s' too long", arg->description);
        return -EINVAL;
    }

    if (nbd_export_find(name)) {
        error_setg(errp, "NBD server already has export named '%s'", name);
        return -EEXIST;
    }

    size = blk_getlength(blk);
    if (size < 0) {
        error_setg_errno(errp, -size,
                         "Failed to determine the NBD export's length");
        return size;
    }

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    blk_get_perm(blk, &perm, &shared_perm);
    ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
    if (ret < 0) {
        return ret;
    }

    /*
     * From here on the export owns heap allocations; failures must go
     * through the 'fail' label.
     */
    QTAILQ_INIT(&exp->clients);
    exp->name = g_strdup(name);
    exp->description = g_strdup(arg->description);
    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);

    if (nbd_server_max_connections() != 1) {
        exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
    }
    if (readonly) {
        exp->nbdflags |= NBD_FLAG_READ_ONLY;
    } else {
        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
                          NBD_FLAG_SEND_FAST_ZERO);
    }
    /* The exported size is rounded down to a whole number of sectors. */
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    bdrv_graph_rdlock_main_loop();

    /* First pass: count the requested bitmaps to size the array. */
    for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
        exp->nr_export_bitmaps++;
    }
    exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
    /* Second pass: resolve each entry to a BdrvDirtyBitmap. */
    for (i = 0, bitmaps = arg->bitmaps; bitmaps;
         i++, bitmaps = bitmaps->next)
    {
        const char *bitmap;
        BlockDriverState *bs = blk_bs(blk);
        BdrvDirtyBitmap *bm = NULL;

        switch (bitmaps->value->type) {
        case QTYPE_QSTRING:
            /*
             * Bare name: search the exported node first, then follow
             * bdrv_filter_or_cow_bs() down the chain until a match is found.
             */
            bitmap = bitmaps->value->u.local;
            while (bs) {
                bm = bdrv_find_dirty_bitmap(bs, bitmap);
                if (bm != NULL) {
                    break;
                }

                bs = bdrv_filter_or_cow_bs(bs);
            }

            if (bm == NULL) {
                ret = -ENOENT;
                error_setg(errp, "Bitmap '%s' is not found",
                           bitmaps->value->u.local);
                goto fail;
            }

            if (readonly && bdrv_is_writable(bs) &&
                bdrv_dirty_bitmap_enabled(bm)) {
                ret = -EINVAL;
                error_setg(errp, "Enabled bitmap '%s' incompatible with "
                           "readonly export", bitmap);
                goto fail;
            }
            break;
        case QTYPE_QDICT:
            /* Explicit node/name pair: look it up directly. */
            bitmap = bitmaps->value->u.external.name;
            bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
                                           bitmap, NULL, errp);
            if (!bm) {
                ret = -ENOENT;
                goto fail;
            }
            break;
        default:
            abort();
        }

        assert(bm);

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            ret = -EINVAL;
            goto fail;
        }

        exp->export_bitmaps[i] = bm;
        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
    }

    /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
    }

    exp->allocation_depth = arg->allocation_depth;

    /*
     * We need to inhibit request queuing in the block layer to ensure we can
     * be properly quiesced when entering a drained section, as our coroutines
     * servicing pending requests might enter blk_pread().
     */
    blk_set_disable_request_queuing(blk, true);

    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    blk_set_dev_ops(blk, &nbd_block_ops, exp);

    /* Only now does the export become visible to nbd_export_find(). */
    QTAILQ_INSERT_TAIL(&exports, exp, next);

    bdrv_graph_rdunlock_main_loop();

    return 0;

fail:
    /* Reached only with the graph read lock held and no bitmap marked busy. */
    bdrv_graph_rdunlock_main_loop();
    g_free(exp->export_bitmaps);
    g_free(exp->name);
    g_free(exp->description);
    return ret;
}
1915  
1916  NBDExport *nbd_export_find(const char *name)
1917  {
1918      NBDExport *exp;
1919      QTAILQ_FOREACH(exp, &exports, next) {
1920          if (strcmp(name, exp->name) == 0) {
1921              return exp;
1922          }
1923      }
1924  
1925      return NULL;
1926  }
1927  
1928  AioContext *
1929  nbd_export_aio_context(NBDExport *exp)
1930  {
1931      return exp->common.ctx;
1932  }
1933  
1934  static void nbd_export_request_shutdown(BlockExport *blk_exp)
1935  {
1936      NBDExport *exp = container_of(blk_exp, NBDExport, common);
1937      NBDClient *client, *next;
1938  
1939      blk_exp_ref(&exp->common);
1940      /*
1941       * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
1942       * close mode that stops advertising the export to new clients but
1943       * still permits existing clients to run to completion? Because of
1944       * that possibility, nbd_export_close() can be called more than
1945       * once on an export.
1946       */
1947      QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1948          client_close(client, true);
1949      }
1950      if (exp->name) {
1951          g_free(exp->name);
1952          exp->name = NULL;
1953          QTAILQ_REMOVE(&exports, exp, next);
1954      }
1955      blk_exp_unref(&exp->common);
1956  }
1957  
1958  static void nbd_export_delete(BlockExport *blk_exp)
1959  {
1960      size_t i;
1961      NBDExport *exp = container_of(blk_exp, NBDExport, common);
1962  
1963      assert(exp->name == NULL);
1964      assert(QTAILQ_EMPTY(&exp->clients));
1965  
1966      g_free(exp->description);
1967      exp->description = NULL;
1968  
1969      if (exp->eject_notifier_blk) {
1970          notifier_remove(&exp->eject_notifier);
1971          blk_unref(exp->eject_notifier_blk);
1972      }
1973      blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
1974                                      blk_aio_detach, exp);
1975      blk_set_disable_request_queuing(exp->common.blk, false);
1976  
1977      for (i = 0; i < exp->nr_export_bitmaps; i++) {
1978          bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
1979      }
1980  }
1981  
1982  const BlockExportDriver blk_exp_nbd = {
1983      .type               = BLOCK_EXPORT_TYPE_NBD,
1984      .instance_size      = sizeof(NBDExport),
1985      .create             = nbd_export_create,
1986      .delete             = nbd_export_delete,
1987      .request_shutdown   = nbd_export_request_shutdown,
1988  };
1989  
1990  static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
1991                                          unsigned niov, Error **errp)
1992  {
1993      int ret;
1994  
1995      g_assert(qemu_in_coroutine());
1996      qemu_co_mutex_lock(&client->send_lock);
1997      client->send_coroutine = qemu_coroutine_self();
1998  
1999      ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;
2000  
2001      client->send_coroutine = NULL;
2002      qemu_co_mutex_unlock(&client->send_lock);
2003  
2004      return ret;
2005  }
2006  
/*
 * Fill in a simple reply header in big-endian byte order: the simple
 * reply magic, the 32-bit @error value and the 64-bit @cookie.
 */
static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
                                       uint64_t cookie)
{
    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
    /* stl_be_p() stores 32 bits, even though @error is declared uint64_t. */
    stl_be_p(&reply->error, error);
    stq_be_p(&reply->cookie, cookie);
}
2014  
2015  static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
2016                                                   NBDRequest *request,
2017                                                   uint32_t error,
2018                                                   void *data,
2019                                                   uint64_t len,
2020                                                   Error **errp)
2021  {
2022      NBDSimpleReply reply;
2023      int nbd_err = system_errno_to_nbd_errno(error);
2024      struct iovec iov[] = {
2025          {.iov_base = &reply, .iov_len = sizeof(reply)},
2026          {.iov_base = data, .iov_len = len}
2027      };
2028  
2029      assert(!len || !nbd_err);
2030      assert(len <= NBD_MAX_BUFFER_SIZE);
2031      assert(client->mode < NBD_MODE_STRUCTURED ||
2032             (client->mode == NBD_MODE_STRUCTURED &&
2033              request->type != NBD_CMD_READ));
2034      trace_nbd_co_send_simple_reply(request->cookie, nbd_err,
2035                                     nbd_err_lookup(nbd_err), len);
2036      set_be_simple_reply(&reply, nbd_err, request->cookie);
2037  
2038      return nbd_co_send_iov(client, iov, 2, errp);
2039  }
2040  
2041  /*
2042   * Prepare the header of a reply chunk for network transmission.
2043   *
2044   * On input, @iov is partially initialized: iov[0].iov_base must point
2045   * to an uninitialized NBDReply, while the remaining @niov elements
2046   * (if any) must be ready for transmission.  This function then
2047   * populates iov[0] for transmission.
2048   */
2049  static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
2050                                  size_t niov, uint16_t flags, uint16_t type,
2051                                  NBDRequest *request)
2052  {
2053      size_t i, length = 0;
2054  
2055      for (i = 1; i < niov; i++) {
2056          length += iov[i].iov_len;
2057      }
2058      assert(length <= NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData));
2059  
2060      if (client->mode >= NBD_MODE_EXTENDED) {
2061          NBDExtendedReplyChunk *chunk = iov->iov_base;
2062  
2063          iov[0].iov_len = sizeof(*chunk);
2064          stl_be_p(&chunk->magic, NBD_EXTENDED_REPLY_MAGIC);
2065          stw_be_p(&chunk->flags, flags);
2066          stw_be_p(&chunk->type, type);
2067          stq_be_p(&chunk->cookie, request->cookie);
2068          stq_be_p(&chunk->offset, request->from);
2069          stq_be_p(&chunk->length, length);
2070      } else {
2071          NBDStructuredReplyChunk *chunk = iov->iov_base;
2072  
2073          iov[0].iov_len = sizeof(*chunk);
2074          stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
2075          stw_be_p(&chunk->flags, flags);
2076          stw_be_p(&chunk->type, type);
2077          stq_be_p(&chunk->cookie, request->cookie);
2078          stl_be_p(&chunk->length, length);
2079      }
2080  }
2081  
2082  static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
2083                                                 NBDRequest *request,
2084                                                 Error **errp)
2085  {
2086      NBDReply hdr;
2087      struct iovec iov[] = {
2088          {.iov_base = &hdr},
2089      };
2090  
2091      trace_nbd_co_send_chunk_done(request->cookie);
2092      set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
2093                   NBD_REPLY_TYPE_NONE, request);
2094      return nbd_co_send_iov(client, iov, 1, errp);
2095  }
2096  
2097  static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
2098                                                 NBDRequest *request,
2099                                                 uint64_t offset,
2100                                                 void *data,
2101                                                 uint64_t size,
2102                                                 bool final,
2103                                                 Error **errp)
2104  {
2105      NBDReply hdr;
2106      NBDStructuredReadData chunk;
2107      struct iovec iov[] = {
2108          {.iov_base = &hdr},
2109          {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2110          {.iov_base = data, .iov_len = size}
2111      };
2112  
2113      assert(size && size <= NBD_MAX_BUFFER_SIZE);
2114      trace_nbd_co_send_chunk_read(request->cookie, offset, data, size);
2115      set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
2116                   NBD_REPLY_TYPE_OFFSET_DATA, request);
2117      stq_be_p(&chunk.offset, offset);
2118  
2119      return nbd_co_send_iov(client, iov, 3, errp);
2120  }
2121  
2122  static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
2123                                                  NBDRequest *request,
2124                                                  uint32_t error,
2125                                                  const char *msg,
2126                                                  Error **errp)
2127  {
2128      NBDReply hdr;
2129      NBDStructuredError chunk;
2130      int nbd_err = system_errno_to_nbd_errno(error);
2131      struct iovec iov[] = {
2132          {.iov_base = &hdr},
2133          {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2134          {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
2135      };
2136  
2137      assert(nbd_err);
2138      trace_nbd_co_send_chunk_error(request->cookie, nbd_err,
2139                                    nbd_err_lookup(nbd_err), msg ? msg : "");
2140      set_be_chunk(client, iov, 3, NBD_REPLY_FLAG_DONE,
2141                   NBD_REPLY_TYPE_ERROR, request);
2142      stl_be_p(&chunk.error, nbd_err);
2143      stw_be_p(&chunk.message_length, iov[2].iov_len);
2144  
2145      return nbd_co_send_iov(client, iov, 3, errp);
2146  }
2147  
2148  /* Do a sparse read and send the structured reply to the client.
2149   * Returns -errno if sending fails. blk_co_block_status_above() failure is
2150   * reported to the client, at which point this function succeeds.
2151   */
2152  static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
2153                                                  NBDRequest *request,
2154                                                  uint64_t offset,
2155                                                  uint8_t *data,
2156                                                  uint64_t size,
2157                                                  Error **errp)
2158  {
2159      int ret = 0;
2160      NBDExport *exp = client->exp;
2161      size_t progress = 0;
2162  
2163      assert(size <= NBD_MAX_BUFFER_SIZE);
2164      while (progress < size) {
2165          int64_t pnum;
2166          int status = blk_co_block_status_above(exp->common.blk, NULL,
2167                                                 offset + progress,
2168                                                 size - progress, &pnum, NULL,
2169                                                 NULL);
2170          bool final;
2171  
2172          if (status < 0) {
2173              char *msg = g_strdup_printf("unable to check for holes: %s",
2174                                          strerror(-status));
2175  
2176              ret = nbd_co_send_chunk_error(client, request, -status, msg, errp);
2177              g_free(msg);
2178              return ret;
2179          }
2180          assert(pnum && pnum <= size - progress);
2181          final = progress + pnum == size;
2182          if (status & BDRV_BLOCK_ZERO) {
2183              NBDReply hdr;
2184              NBDStructuredReadHole chunk;
2185              struct iovec iov[] = {
2186                  {.iov_base = &hdr},
2187                  {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2188              };
2189  
2190              trace_nbd_co_send_chunk_read_hole(request->cookie,
2191                                                offset + progress, pnum);
2192              set_be_chunk(client, iov, 2,
2193                           final ? NBD_REPLY_FLAG_DONE : 0,
2194                           NBD_REPLY_TYPE_OFFSET_HOLE, request);
2195              stq_be_p(&chunk.offset, offset + progress);
2196              stl_be_p(&chunk.length, pnum);
2197              ret = nbd_co_send_iov(client, iov, 2, errp);
2198          } else {
2199              ret = blk_co_pread(exp->common.blk, offset + progress, pnum,
2200                                 data + progress, 0);
2201              if (ret < 0) {
2202                  error_setg_errno(errp, -ret, "reading from file failed");
2203                  break;
2204              }
2205              ret = nbd_co_send_chunk_read(client, request, offset + progress,
2206                                           data + progress, pnum, final, errp);
2207          }
2208  
2209          if (ret < 0) {
2210              break;
2211          }
2212          progress += pnum;
2213      }
2214      return ret;
2215  }
2216  
/*
 * Growable-up-to-a-limit list of block status extents that is filled with
 * nbd_extent_array_add() and then converted once for transmission.
 */
typedef struct NBDExtentArray {
    NBDExtent64 *extents;       /* storage for nb_alloc entries */
    unsigned int nb_alloc;      /* capacity of extents[] */
    unsigned int count;         /* number of entries currently in use */
    uint64_t total_length;      /* sum of all lengths added so far */
    bool extended;              /* client mode is NBD_MODE_EXTENDED or above */
    bool can_add;               /* false once full or converted */
    bool converted_to_be;       /* set by the convert_to_* helpers */
} NBDExtentArray;
2226  
2227  static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
2228                                              NBDMode mode)
2229  {
2230      NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
2231  
2232      assert(mode >= NBD_MODE_STRUCTURED);
2233      ea->nb_alloc = nb_alloc;
2234      ea->extents = g_new(NBDExtent64, nb_alloc);
2235      ea->extended = mode >= NBD_MODE_EXTENDED;
2236      ea->can_add = true;
2237  
2238      return ea;
2239  }
2240  
/* Free the extent storage of @ea and then @ea itself. */
static void nbd_extent_array_free(NBDExtentArray *ea)
{
    g_free(ea->extents);
    g_free(ea);
}
/* Lets g_autoptr(NBDExtentArray) variables be freed automatically. */
G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)
2247  
2248  /* Further modifications of the array after conversion are abandoned */
2249  static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
2250  {
2251      int i;
2252  
2253      assert(!ea->converted_to_be);
2254      assert(ea->extended);
2255      ea->can_add = false;
2256      ea->converted_to_be = true;
2257  
2258      for (i = 0; i < ea->count; i++) {
2259          ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
2260          ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
2261      }
2262  }
2263  
2264  /* Further modifications of the array after conversion are abandoned */
2265  static NBDExtent32 *nbd_extent_array_convert_to_narrow(NBDExtentArray *ea)
2266  {
2267      int i;
2268      NBDExtent32 *extents = g_new(NBDExtent32, ea->count);
2269  
2270      assert(!ea->converted_to_be);
2271      assert(!ea->extended);
2272      ea->can_add = false;
2273      ea->converted_to_be = true;
2274  
2275      for (i = 0; i < ea->count; i++) {
2276          assert((ea->extents[i].length | ea->extents[i].flags) <= UINT32_MAX);
2277          extents[i].length = cpu_to_be32(ea->extents[i].length);
2278          extents[i].flags = cpu_to_be32(ea->extents[i].flags);
2279      }
2280  
2281      return extents;
2282  }
2283  
2284  /*
2285   * Add extent to NBDExtentArray. If extent can't be added (no available space),
2286   * return -1.
2287   * For safety, when returning -1 for the first time, .can_add is set to false,
2288   * and further calls to nbd_extent_array_add() will crash.
2289   * (this avoids the situation where a caller ignores failure to add one extent,
2290   * where adding another extent that would squash into the last array entry
2291   * would result in an incorrect range reported to the client)
2292   */
2293  static int nbd_extent_array_add(NBDExtentArray *ea,
2294                                  uint64_t length, uint32_t flags)
2295  {
2296      assert(ea->can_add);
2297  
2298      if (!length) {
2299          return 0;
2300      }
2301      if (!ea->extended) {
2302          assert(length <= UINT32_MAX);
2303      }
2304  
2305      /* Extend previous extent if flags are the same */
2306      if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
2307          uint64_t sum = length + ea->extents[ea->count - 1].length;
2308  
2309          /*
2310           * sum cannot overflow: the block layer bounds image size at
2311           * 2^63, and ea->extents[].length comes from the block layer.
2312           */
2313          assert(sum >= length);
2314          if (sum <= UINT32_MAX || ea->extended) {
2315              ea->extents[ea->count - 1].length = sum;
2316              ea->total_length += length;
2317              return 0;
2318          }
2319      }
2320  
2321      if (ea->count >= ea->nb_alloc) {
2322          ea->can_add = false;
2323          return -1;
2324      }
2325  
2326      ea->total_length += length;
2327      ea->extents[ea->count] = (NBDExtent64) {.length = length, .flags = flags};
2328      ea->count++;
2329  
2330      return 0;
2331  }
2332  
2333  static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
2334                                                 uint64_t offset, uint64_t bytes,
2335                                                 NBDExtentArray *ea)
2336  {
2337      while (bytes) {
2338          uint32_t flags;
2339          int64_t num;
2340          int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
2341                                              NULL, NULL);
2342  
2343          if (ret < 0) {
2344              return ret;
2345          }
2346  
2347          flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
2348                  (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);
2349  
2350          if (nbd_extent_array_add(ea, num, flags) < 0) {
2351              return 0;
2352          }
2353  
2354          offset += num;
2355          bytes -= num;
2356      }
2357  
2358      return 0;
2359  }
2360  
2361  static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
2362                                                uint64_t offset, uint64_t bytes,
2363                                                NBDExtentArray *ea)
2364  {
2365      while (bytes) {
2366          int64_t num;
2367          int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
2368                                              &num);
2369  
2370          if (ret < 0) {
2371              return ret;
2372          }
2373  
2374          if (nbd_extent_array_add(ea, num, ret) < 0) {
2375              return 0;
2376          }
2377  
2378          offset += num;
2379          bytes -= num;
2380      }
2381  
2382      return 0;
2383  }
2384  
2385  /*
2386   * nbd_co_send_extents
2387   *
2388   * @ea is converted to BE by the function
2389   * @last controls whether NBD_REPLY_FLAG_DONE is sent.
2390   */
2391  static int coroutine_fn
2392  nbd_co_send_extents(NBDClient *client, NBDRequest *request, NBDExtentArray *ea,
2393                      bool last, uint32_t context_id, Error **errp)
2394  {
2395      NBDReply hdr;
2396      NBDStructuredMeta meta;
2397      NBDExtendedMeta meta_ext;
2398      g_autofree NBDExtent32 *extents = NULL;
2399      uint16_t type;
2400      struct iovec iov[] = { {.iov_base = &hdr}, {0}, {0} };
2401  
2402      if (client->mode >= NBD_MODE_EXTENDED) {
2403          type = NBD_REPLY_TYPE_BLOCK_STATUS_EXT;
2404  
2405          iov[1].iov_base = &meta_ext;
2406          iov[1].iov_len = sizeof(meta_ext);
2407          stl_be_p(&meta_ext.context_id, context_id);
2408          stl_be_p(&meta_ext.count, ea->count);
2409  
2410          nbd_extent_array_convert_to_be(ea);
2411          iov[2].iov_base = ea->extents;
2412          iov[2].iov_len = ea->count * sizeof(ea->extents[0]);
2413      } else {
2414          type = NBD_REPLY_TYPE_BLOCK_STATUS;
2415  
2416          iov[1].iov_base = &meta;
2417          iov[1].iov_len = sizeof(meta);
2418          stl_be_p(&meta.context_id, context_id);
2419  
2420          extents = nbd_extent_array_convert_to_narrow(ea);
2421          iov[2].iov_base = extents;
2422          iov[2].iov_len = ea->count * sizeof(extents[0]);
2423      }
2424  
2425      trace_nbd_co_send_extents(request->cookie, ea->count, context_id,
2426                                ea->total_length, last);
2427      set_be_chunk(client, iov, 3, last ? NBD_REPLY_FLAG_DONE : 0, type,
2428                   request);
2429  
2430      return nbd_co_send_iov(client, iov, 3, errp);
2431  }
2432  
2433  /* Get block status from the exported device and send it to the client */
2434  static int
2435  coroutine_fn nbd_co_send_block_status(NBDClient *client, NBDRequest *request,
2436                                        BlockBackend *blk, uint64_t offset,
2437                                        uint64_t length, bool dont_fragment,
2438                                        bool last, uint32_t context_id,
2439                                        Error **errp)
2440  {
2441      int ret;
2442      unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2443      g_autoptr(NBDExtentArray) ea =
2444          nbd_extent_array_new(nb_extents, client->mode);
2445  
2446      if (context_id == NBD_META_ID_BASE_ALLOCATION) {
2447          ret = blockstatus_to_extents(blk, offset, length, ea);
2448      } else {
2449          ret = blockalloc_to_extents(blk, offset, length, ea);
2450      }
2451      if (ret < 0) {
2452          return nbd_co_send_chunk_error(client, request, -ret,
2453                                         "can't get block status", errp);
2454      }
2455  
2456      return nbd_co_send_extents(client, request, ea, last, context_id, errp);
2457  }
2458  
2459  /* Populate @ea from a dirty bitmap. */
2460  static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
2461                                uint64_t offset, uint64_t length,
2462                                NBDExtentArray *es)
2463  {
2464      int64_t start, dirty_start, dirty_count;
2465      int64_t end = offset + length;
2466      bool full = false;
2467      int64_t bound = es->extended ? INT64_MAX : INT32_MAX;
2468  
2469      bdrv_dirty_bitmap_lock(bitmap);
2470  
2471      for (start = offset;
2472           bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, bound,
2473                                             &dirty_start, &dirty_count);
2474           start = dirty_start + dirty_count)
2475      {
2476          if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
2477              (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
2478          {
2479              full = true;
2480              break;
2481          }
2482      }
2483  
2484      if (!full) {
2485          /* last non dirty extent, nothing to do if array is now full */
2486          (void) nbd_extent_array_add(es, end - start, 0);
2487      }
2488  
2489      bdrv_dirty_bitmap_unlock(bitmap);
2490  }
2491  
2492  static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
2493                                             NBDRequest *request,
2494                                             BdrvDirtyBitmap *bitmap,
2495                                             uint64_t offset,
2496                                             uint64_t length, bool dont_fragment,
2497                                             bool last, uint32_t context_id,
2498                                             Error **errp)
2499  {
2500      unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2501      g_autoptr(NBDExtentArray) ea =
2502          nbd_extent_array_new(nb_extents, client->mode);
2503  
2504      bitmap_to_extents(bitmap, offset, length, ea);
2505  
2506      return nbd_co_send_extents(client, request, ea, last, context_id, errp);
2507  }
2508  
2509  /*
2510   * nbd_co_block_status_payload_read
2511   * Called when a client wants a subset of negotiated contexts via a
2512   * BLOCK_STATUS payload.  Check the payload for valid length and
2513   * contents.  On success, return 0 with request updated to effective
2514   * length.  If request was invalid but all payload consumed, return 0
2515   * with request->len and request->contexts->count set to 0 (which will
2516   * trigger an appropriate NBD_EINVAL response later on).  Return
2517   * negative errno if the payload was not fully consumed.
2518   */
2519  static int
2520  nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
2521                                   Error **errp)
2522  {
2523      uint64_t payload_len = request->len;
2524      g_autofree char *buf = NULL;
2525      size_t count, i, nr_bitmaps;
2526      uint32_t id;
2527  
2528      if (payload_len > NBD_MAX_BUFFER_SIZE) {
2529          error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
2530                     request->len, NBD_MAX_BUFFER_SIZE);
2531          return -EINVAL;
2532      }
2533  
2534      assert(client->contexts.exp == client->exp);
2535      nr_bitmaps = client->exp->nr_export_bitmaps;
2536      request->contexts = g_new0(NBDMetaContexts, 1);
2537      request->contexts->exp = client->exp;
2538  
2539      if (payload_len % sizeof(uint32_t) ||
2540          payload_len < sizeof(NBDBlockStatusPayload) ||
2541          payload_len > (sizeof(NBDBlockStatusPayload) +
2542                         sizeof(id) * client->contexts.count)) {
2543          goto skip;
2544      }
2545  
2546      buf = g_malloc(payload_len);
2547      if (nbd_read(client->ioc, buf, payload_len,
2548                   "CMD_BLOCK_STATUS data", errp) < 0) {
2549          return -EIO;
2550      }
2551      trace_nbd_co_receive_request_payload_received(request->cookie,
2552                                                    payload_len);
2553      request->contexts->bitmaps = g_new0(bool, nr_bitmaps);
2554      count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
2555      payload_len = 0;
2556  
2557      for (i = 0; i < count; i++) {
2558          id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
2559          if (id == NBD_META_ID_BASE_ALLOCATION) {
2560              if (!client->contexts.base_allocation ||
2561                  request->contexts->base_allocation) {
2562                  goto skip;
2563              }
2564              request->contexts->base_allocation = true;
2565          } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
2566              if (!client->contexts.allocation_depth ||
2567                  request->contexts->allocation_depth) {
2568                  goto skip;
2569              }
2570              request->contexts->allocation_depth = true;
2571          } else {
2572              unsigned idx = id - NBD_META_ID_DIRTY_BITMAP;
2573  
2574              if (idx >= nr_bitmaps || !client->contexts.bitmaps[idx] ||
2575                  request->contexts->bitmaps[idx]) {
2576                  goto skip;
2577              }
2578              request->contexts->bitmaps[idx] = true;
2579          }
2580      }
2581  
2582      request->len = ldq_be_p(buf);
2583      request->contexts->count = count;
2584      return 0;
2585  
2586   skip:
2587      trace_nbd_co_receive_block_status_payload_compliance(request->from,
2588                                                           request->len);
2589      request->len = request->contexts->count = 0;
2590      return nbd_drop(client->ioc, payload_len, errp);
2591  }
2592  
/* nbd_co_receive_request
 * Collect a client request, together with any payload that follows it on
 * the channel. Return 0 if request looks valid, -EIO to drop
 * connection right away, -EAGAIN to indicate we were interrupted and the
 * channel should be quiesced, and any other negative value to report an error
 * to the client (although the caller may still need to disconnect after
 * reporting the error).
 *
 * The function works in three phases:
 *  1. a per-command switch that only records what has to be done
 *     (which flags are valid, whether a length check, a buffer, or a
 *     read-only check is needed, how many payload bytes follow);
 *  2. payload and buffer handling, driven by those recorded decisions;
 *  3. sanity checks that apply to every command.
 *
 * req->complete is set to true as soon as no payload bytes of this request
 * remain unread (there were none, or they were read or dropped). It is left
 * untouched when returning before a pending payload was consumed.
 */
static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDClient *client = req->client;
    bool extended_with_payload;
    bool check_length = false;
    bool check_rofs = false;
    bool allocate_buffer = false;
    bool payload_okay = false;
    uint64_t payload_len = 0;
    int valid_flags = NBD_CMD_FLAG_FUA;
    int ret;

    g_assert(qemu_in_coroutine());
    ret = nbd_receive_request(client, request, errp);
    if (ret < 0) {
        return ret;
    }

    trace_nbd_co_receive_request_decode_type(request->cookie, request->type,
                                             nbd_cmd_lookup(request->type));
    /*
     * In extended mode, the PAYLOAD_LEN flag means request->len counts
     * payload bytes that follow the header, whatever the command is.
     */
    extended_with_payload = client->mode >= NBD_MODE_EXTENDED &&
        request->flags & NBD_CMD_FLAG_PAYLOAD_LEN;
    if (extended_with_payload) {
        payload_len = request->len;
        check_length = true;
    }

    /* Phase 1: record per-command requirements; nothing is enforced yet. */
    switch (request->type) {
    case NBD_CMD_DISC:
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        req->complete = true;
        return -EIO;

    case NBD_CMD_READ:
        if (client->mode >= NBD_MODE_STRUCTURED) {
            valid_flags |= NBD_CMD_FLAG_DF;
        }
        check_length = true;
        allocate_buffer = true;
        break;

    case NBD_CMD_WRITE:
        if (client->mode >= NBD_MODE_EXTENDED) {
            if (!extended_with_payload) {
                /* The client is noncompliant. Trace it, but proceed. */
                trace_nbd_co_receive_ext_payload_compliance(request->from,
                                                            request->len);
            }
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        }
        /* WRITE always carries request->len bytes of data, flag or not. */
        payload_okay = true;
        payload_len = request->len;
        check_length = true;
        allocate_buffer = true;
        check_rofs = true;
        break;

    case NBD_CMD_FLUSH:
        break;

    case NBD_CMD_TRIM:
        check_rofs = true;
        break;

    case NBD_CMD_CACHE:
        check_length = true;
        break;

    case NBD_CMD_WRITE_ZEROES:
        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
        check_rofs = true;
        break;

    case NBD_CMD_BLOCK_STATUS:
        if (extended_with_payload) {
            ret = nbd_co_block_status_payload_read(client, request, errp);
            if (ret < 0) {
                return ret;
            }
            /* payload now consumed */
            check_length = false;
            payload_len = 0;
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        } else {
            /* No per-request selection: use the client's negotiated set. */
            request->contexts = &client->contexts;
        }
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
        break;

    default:
        /* Unrecognized, will fail later */
        ;
    }

    /* Payload and buffer handling. */
    if (!payload_len) {
        req->complete = true;
    }
    /*
     * This check runs before any payload is read, so an oversized request
     * with a payload returns here with req->complete still unset.
     */
    if (check_length && request->len > NBD_MAX_BUFFER_SIZE) {
        /* READ, WRITE, CACHE */
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }
    if (payload_len && !payload_okay) {
        /*
         * For now, we don't support payloads on other commands; but
         * we can keep the connection alive by ignoring the payload.
         * We will fail the command later with NBD_EINVAL for the use
         * of an unsupported flag (and not for access beyond bounds).
         */
        assert(request->type != NBD_CMD_WRITE);
        request->len = 0;
    }
    if (allocate_buffer) {
        /* READ, WRITE */
        req->data = blk_try_blockalign(client->exp->common.blk,
                                       request->len);
        if (req->data == NULL) {
            error_setg(errp, "No memory");
            return -ENOMEM;
        }
    }
    if (payload_len) {
        if (payload_okay) {
            /* WRITE */
            assert(req->data);
            ret = nbd_read(client->ioc, req->data, payload_len,
                           "CMD_WRITE data", errp);
        } else {
            /* Unsupported payload: discard it to stay in sync. */
            ret = nbd_drop(client->ioc, payload_len, errp);
        }
        if (ret < 0) {
            return -EIO;
        }
        req->complete = true;
        trace_nbd_co_receive_request_payload_received(request->cookie,
                                                      payload_len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY && check_rofs) {
        /* WRITE, TRIM, WRITE_ZEROES */
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    /*
     * Written as "len > size - from" (after checking from <= size) so that
     * the comparison cannot wrap around, unlike "from + len > size".
     */
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu64
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    /* Any flag the command did not whitelist above is rejected. */
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}
2776  
2777  /* Send simple reply without a payload, or a structured error
2778   * @error_msg is ignored if @ret >= 0
2779   * Returns 0 if connection is still live, -errno on failure to talk to client
2780   */
2781  static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
2782                                                 NBDRequest *request,
2783                                                 int ret,
2784                                                 const char *error_msg,
2785                                                 Error **errp)
2786  {
2787      if (client->mode >= NBD_MODE_STRUCTURED && ret < 0) {
2788          return nbd_co_send_chunk_error(client, request, -ret, error_msg, errp);
2789      } else if (client->mode >= NBD_MODE_EXTENDED) {
2790          return nbd_co_send_chunk_done(client, request, errp);
2791      } else {
2792          return nbd_co_send_simple_reply(client, request, ret < 0 ? -ret : 0,
2793                                          NULL, 0, errp);
2794      }
2795  }
2796  
2797  /* Handle NBD_CMD_READ request.
2798   * Return -errno if sending fails. Other errors are reported directly to the
2799   * client as an error reply. */
2800  static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
2801                                          uint8_t *data, Error **errp)
2802  {
2803      int ret;
2804      NBDExport *exp = client->exp;
2805  
2806      assert(request->type == NBD_CMD_READ);
2807      assert(request->len <= NBD_MAX_BUFFER_SIZE);
2808  
2809      /* XXX: NBD Protocol only documents use of FUA with WRITE */
2810      if (request->flags & NBD_CMD_FLAG_FUA) {
2811          ret = blk_co_flush(exp->common.blk);
2812          if (ret < 0) {
2813              return nbd_send_generic_reply(client, request, ret,
2814                                            "flush failed", errp);
2815          }
2816      }
2817  
2818      if (client->mode >= NBD_MODE_STRUCTURED &&
2819          !(request->flags & NBD_CMD_FLAG_DF) && request->len)
2820      {
2821          return nbd_co_send_sparse_read(client, request, request->from,
2822                                         data, request->len, errp);
2823      }
2824  
2825      ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0);
2826      if (ret < 0) {
2827          return nbd_send_generic_reply(client, request, ret,
2828                                        "reading from file failed", errp);
2829      }
2830  
2831      if (client->mode >= NBD_MODE_STRUCTURED) {
2832          if (request->len) {
2833              return nbd_co_send_chunk_read(client, request, request->from, data,
2834                                            request->len, true, errp);
2835          } else {
2836              return nbd_co_send_chunk_done(client, request, errp);
2837          }
2838      } else {
2839          return nbd_co_send_simple_reply(client, request, 0,
2840                                          data, request->len, errp);
2841      }
2842  }
2843  
2844  /*
2845   * nbd_do_cmd_cache
2846   *
2847   * Handle NBD_CMD_CACHE request.
2848   * Return -errno if sending fails. Other errors are reported directly to the
2849   * client as an error reply.
2850   */
2851  static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2852                                           Error **errp)
2853  {
2854      int ret;
2855      NBDExport *exp = client->exp;
2856  
2857      assert(request->type == NBD_CMD_CACHE);
2858      assert(request->len <= NBD_MAX_BUFFER_SIZE);
2859  
2860      ret = blk_co_preadv(exp->common.blk, request->from, request->len,
2861                          NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2862  
2863      return nbd_send_generic_reply(client, request, ret,
2864                                    "caching data failed", errp);
2865  }
2866  
2867  /* Handle NBD request.
2868   * Return -errno if sending fails. Other errors are reported directly to the
2869   * client as an error reply. */
2870  static coroutine_fn int nbd_handle_request(NBDClient *client,
2871                                             NBDRequest *request,
2872                                             uint8_t *data, Error **errp)
2873  {
2874      int ret;
2875      int flags;
2876      NBDExport *exp = client->exp;
2877      char *msg;
2878      size_t i;
2879  
2880      switch (request->type) {
2881      case NBD_CMD_CACHE:
2882          return nbd_do_cmd_cache(client, request, errp);
2883  
2884      case NBD_CMD_READ:
2885          return nbd_do_cmd_read(client, request, data, errp);
2886  
2887      case NBD_CMD_WRITE:
2888          flags = 0;
2889          if (request->flags & NBD_CMD_FLAG_FUA) {
2890              flags |= BDRV_REQ_FUA;
2891          }
2892          assert(request->len <= NBD_MAX_BUFFER_SIZE);
2893          ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data,
2894                              flags);
2895          return nbd_send_generic_reply(client, request, ret,
2896                                        "writing to file failed", errp);
2897  
2898      case NBD_CMD_WRITE_ZEROES:
2899          flags = 0;
2900          if (request->flags & NBD_CMD_FLAG_FUA) {
2901              flags |= BDRV_REQ_FUA;
2902          }
2903          if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
2904              flags |= BDRV_REQ_MAY_UNMAP;
2905          }
2906          if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
2907              flags |= BDRV_REQ_NO_FALLBACK;
2908          }
2909          ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len,
2910                                     flags);
2911          return nbd_send_generic_reply(client, request, ret,
2912                                        "writing to file failed", errp);
2913  
2914      case NBD_CMD_DISC:
2915          /* unreachable, thanks to special case in nbd_co_receive_request() */
2916          abort();
2917  
2918      case NBD_CMD_FLUSH:
2919          ret = blk_co_flush(exp->common.blk);
2920          return nbd_send_generic_reply(client, request, ret,
2921                                        "flush failed", errp);
2922  
2923      case NBD_CMD_TRIM:
2924          ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
2925          if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
2926              ret = blk_co_flush(exp->common.blk);
2927          }
2928          return nbd_send_generic_reply(client, request, ret,
2929                                        "discard failed", errp);
2930  
2931      case NBD_CMD_BLOCK_STATUS:
2932          assert(request->contexts);
2933          assert(client->mode >= NBD_MODE_EXTENDED ||
2934                 request->len <= UINT32_MAX);
2935          if (request->contexts->count) {
2936              bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
2937              int contexts_remaining = request->contexts->count;
2938  
2939              if (!request->len) {
2940                  return nbd_send_generic_reply(client, request, -EINVAL,
2941                                                "need non-zero length", errp);
2942              }
2943              if (request->contexts->base_allocation) {
2944                  ret = nbd_co_send_block_status(client, request,
2945                                                 exp->common.blk,
2946                                                 request->from,
2947                                                 request->len, dont_fragment,
2948                                                 !--contexts_remaining,
2949                                                 NBD_META_ID_BASE_ALLOCATION,
2950                                                 errp);
2951                  if (ret < 0) {
2952                      return ret;
2953                  }
2954              }
2955  
2956              if (request->contexts->allocation_depth) {
2957                  ret = nbd_co_send_block_status(client, request,
2958                                                 exp->common.blk,
2959                                                 request->from, request->len,
2960                                                 dont_fragment,
2961                                                 !--contexts_remaining,
2962                                                 NBD_META_ID_ALLOCATION_DEPTH,
2963                                                 errp);
2964                  if (ret < 0) {
2965                      return ret;
2966                  }
2967              }
2968  
2969              assert(request->contexts->exp == client->exp);
2970              for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
2971                  if (!request->contexts->bitmaps[i]) {
2972                      continue;
2973                  }
2974                  ret = nbd_co_send_bitmap(client, request,
2975                                           client->exp->export_bitmaps[i],
2976                                           request->from, request->len,
2977                                           dont_fragment, !--contexts_remaining,
2978                                           NBD_META_ID_DIRTY_BITMAP + i, errp);
2979                  if (ret < 0) {
2980                      return ret;
2981                  }
2982              }
2983  
2984              assert(!contexts_remaining);
2985  
2986              return 0;
2987          } else if (client->contexts.count) {
2988              return nbd_send_generic_reply(client, request, -EINVAL,
2989                                            "CMD_BLOCK_STATUS payload not valid",
2990                                            errp);
2991          } else {
2992              return nbd_send_generic_reply(client, request, -EINVAL,
2993                                            "CMD_BLOCK_STATUS not negotiated",
2994                                            errp);
2995          }
2996  
2997      default:
2998          msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
2999                                request->type);
3000          ret = nbd_send_generic_reply(client, request, -EINVAL, msg,
3001                                       errp);
3002          g_free(msg);
3003          return ret;
3004      }
3005  }
3006  
3007  /* Owns a reference to the NBDClient passed as opaque.  */
3008  static coroutine_fn void nbd_trip(void *opaque)
3009  {
3010      NBDRequestData *req = opaque;
3011      NBDClient *client = req->client;
3012      NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
3013      int ret;
3014      Error *local_err = NULL;
3015  
3016      /*
3017       * Note that nbd_client_put() and client_close() must be called from the
3018       * main loop thread. Use aio_co_reschedule_self() to switch AioContext
3019       * before calling these functions.
3020       */
3021  
3022      trace_nbd_trip();
3023  
3024      qemu_mutex_lock(&client->lock);
3025  
3026      if (client->closing) {
3027          goto done;
3028      }
3029  
3030      if (client->quiescing) {
3031          /*
3032           * We're switching between AIO contexts. Don't attempt to receive a new
3033           * request and kick the main context which may be waiting for us.
3034           */
3035          client->recv_coroutine = NULL;
3036          aio_wait_kick();
3037          goto done;
3038      }
3039  
3040      /*
3041       * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has
3042       * set client->quiescing but by the time we get back nbd_drained_end() may
3043       * have already cleared client->quiescing. In that case we try again
3044       * because nothing else will spawn an nbd_trip() coroutine until we set
3045       * client->recv_coroutine = NULL further down.
3046       */
3047      do {
3048          assert(client->recv_coroutine == qemu_coroutine_self());
3049          qemu_mutex_unlock(&client->lock);
3050          ret = nbd_co_receive_request(req, &request, &local_err);
3051          qemu_mutex_lock(&client->lock);
3052      } while (ret == -EAGAIN && !client->quiescing);
3053  
3054      client->recv_coroutine = NULL;
3055  
3056      if (client->closing) {
3057          /*
3058           * The client may be closed when we are blocked in
3059           * nbd_co_receive_request()
3060           */
3061          goto done;
3062      }
3063  
3064      if (ret == -EAGAIN) {
3065          goto done;
3066      }
3067  
3068      nbd_client_receive_next_request(client);
3069  
3070      if (ret == -EIO) {
3071          goto disconnect;
3072      }
3073  
3074      qemu_mutex_unlock(&client->lock);
3075      qio_channel_set_cork(client->ioc, true);
3076  
3077      if (ret < 0) {
3078          /* It wasn't -EIO, so, according to nbd_co_receive_request()
3079           * semantics, we should return the error to the client. */
3080          Error *export_err = local_err;
3081  
3082          local_err = NULL;
3083          ret = nbd_send_generic_reply(client, &request, -EINVAL,
3084                                       error_get_pretty(export_err), &local_err);
3085          error_free(export_err);
3086      } else {
3087          ret = nbd_handle_request(client, &request, req->data, &local_err);
3088      }
3089      if (request.contexts && request.contexts != &client->contexts) {
3090          assert(request.type == NBD_CMD_BLOCK_STATUS);
3091          g_free(request.contexts->bitmaps);
3092          g_free(request.contexts);
3093      }
3094  
3095      qio_channel_set_cork(client->ioc, false);
3096      qemu_mutex_lock(&client->lock);
3097  
3098      if (ret < 0) {
3099          error_prepend(&local_err, "Failed to send reply: ");
3100          goto disconnect;
3101      }
3102  
3103      /*
3104       * We must disconnect after NBD_CMD_WRITE or BLOCK_STATUS with
3105       * payload if we did not read the payload.
3106       */
3107      if (!req->complete) {
3108          error_setg(&local_err, "Request handling failed in intermediate state");
3109          goto disconnect;
3110      }
3111  
3112  done:
3113      nbd_request_put(req);
3114  
3115      qemu_mutex_unlock(&client->lock);
3116  
3117      if (!nbd_client_put_nonzero(client)) {
3118          aio_co_reschedule_self(qemu_get_aio_context());
3119          nbd_client_put(client);
3120      }
3121      return;
3122  
3123  disconnect:
3124      if (local_err) {
3125          error_reportf_err(local_err, "Disconnect client, due to: ");
3126      }
3127  
3128      nbd_request_put(req);
3129      qemu_mutex_unlock(&client->lock);
3130  
3131      aio_co_reschedule_self(qemu_get_aio_context());
3132      client_close(client, true);
3133      nbd_client_put(client);
3134  }
3135  
3136  /*
3137   * Runs in export AioContext and main loop thread. Caller must hold
3138   * client->lock.
3139   */
3140  static void nbd_client_receive_next_request(NBDClient *client)
3141  {
3142      NBDRequestData *req;
3143  
3144      if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
3145          !client->quiescing) {
3146          nbd_client_get(client);
3147          req = nbd_request_get(client);
3148          client->recv_coroutine = qemu_coroutine_create(nbd_trip, req);
3149          aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
3150      }
3151  }
3152  
3153  static coroutine_fn void nbd_co_client_start(void *opaque)
3154  {
3155      NBDClient *client = opaque;
3156      Error *local_err = NULL;
3157  
3158      qemu_co_mutex_init(&client->send_lock);
3159  
3160      if (nbd_negotiate(client, &local_err)) {
3161          if (local_err) {
3162              error_report_err(local_err);
3163          }
3164          client_close(client, false);
3165          return;
3166      }
3167  
3168      WITH_QEMU_LOCK_GUARD(&client->lock) {
3169          nbd_client_receive_next_request(client);
3170      }
3171  }
3172  
3173  /*
3174   * Create a new client listener using the given channel @sioc.
3175   * Begin servicing it in a coroutine.  When the connection closes, call
3176   * @close_fn with an indication of whether the client completed negotiation.
3177   */
3178  void nbd_client_new(QIOChannelSocket *sioc,
3179                      QCryptoTLSCreds *tlscreds,
3180                      const char *tlsauthz,
3181                      void (*close_fn)(NBDClient *, bool))
3182  {
3183      NBDClient *client;
3184      Coroutine *co;
3185  
3186      client = g_new0(NBDClient, 1);
3187      qemu_mutex_init(&client->lock);
3188      client->refcount = 1;
3189      client->tlscreds = tlscreds;
3190      if (tlscreds) {
3191          object_ref(OBJECT(client->tlscreds));
3192      }
3193      client->tlsauthz = g_strdup(tlsauthz);
3194      client->sioc = sioc;
3195      qio_channel_set_delay(QIO_CHANNEL(sioc), false);
3196      object_ref(OBJECT(client->sioc));
3197      client->ioc = QIO_CHANNEL(sioc);
3198      object_ref(OBJECT(client->ioc));
3199      client->close_fn = close_fn;
3200  
3201      co = qemu_coroutine_create(nbd_co_client_start, client);
3202      qemu_coroutine_enter(co);
3203  }
3204