[ipoib] Fix a race when chain-loading undionly.kpxe in IPoIB
[ipxe.git] / src / drivers / net / ipoib.c
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * You can also choose to distribute this program under the terms of
20 * the Unmodified Binary Distribution Licence (as given in the file
21 * COPYING.UBDL), provided that you have satisfied its requirements.
22 */
23
24 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <string.h>
31 #include <byteswap.h>
32 #include <errno.h>
33 #include <ipxe/errortab.h>
34 #include <ipxe/malloc.h>
35 #include <ipxe/if_arp.h>
36 #include <ipxe/arp.h>
37 #include <ipxe/if_ether.h>
38 #include <ipxe/ethernet.h>
39 #include <ipxe/ip.h>
40 #include <ipxe/iobuf.h>
41 #include <ipxe/netdevice.h>
42 #include <ipxe/infiniband.h>
43 #include <ipxe/ib_pathrec.h>
44 #include <ipxe/ib_mcast.h>
45 #include <ipxe/retry.h>
46 #include <ipxe/ipoib.h>
47
48 /** @file
49 *
50 * IP over Infiniband
51 */
52
/* Distinct ENXIO variants, one for each "missing REMAC" situation */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )

#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )

#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
66
/** Size of the IPoIB send work queue, in entries */
#define IPOIB_NUM_SEND_WQES 2

/** Size of the IPoIB receive work queue, in entries */
#define IPOIB_NUM_RECV_WQES 4

/** Size of the IPoIB completion queue, in entries */
#define IPOIB_NUM_CQES 8
75
76 /** An IPoIB device */
77 struct ipoib_device {
78 /** Network device */
79 struct net_device *netdev;
80 /** Underlying Infiniband device */
81 struct ib_device *ibdev;
82 /** Completion queue */
83 struct ib_completion_queue *cq;
84 /** Queue pair */
85 struct ib_queue_pair *qp;
86 /** Local MAC */
87 struct ipoib_mac mac;
88 /** Broadcast MAC */
89 struct ipoib_mac broadcast;
90 /** Joined to IPv4 broadcast multicast group
91 *
92 * This flag indicates whether or not we have initiated the
93 * join to the IPv4 broadcast multicast group.
94 */
95 int broadcast_joined;
96 /** IPv4 broadcast multicast group membership */
97 struct ib_mc_membership broadcast_membership;
98 /** REMAC cache */
99 struct list_head peers;
100 };
101
102 /** Broadcast IPoIB address */
103 static struct ipoib_mac ipoib_broadcast = {
104 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
105 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
106 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
107 };
108
109 /** Link status for "broadcast join in progress" */
110 #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
111 #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
112 ( EINFO_EINPROGRESS, 0x01, "Joining" )
113
114 /** Human-readable message for the link status */
115 struct errortab ipoib_errors[] __errortab = {
116 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
117 };
118
119 static struct net_device_operations ipoib_operations;
120
121 /****************************************************************************
122 *
123 * IPoIB REMAC cache
124 *
125 ****************************************************************************
126 */
127
128 /** An IPoIB REMAC cache entry */
129 struct ipoib_peer {
130 /** List of REMAC cache entries */
131 struct list_head list;
132 /** Remote Ethermet MAC */
133 struct ipoib_remac remac;
134 /** MAC address */
135 struct ipoib_mac mac;
136 };
137
138 /**
139 * Find IPoIB MAC from REMAC
140 *
141 * @v ipoib IPoIB device
142 * @v remac Remote Ethernet MAC
143 * @ret mac IPoIB MAC (or NULL if not found)
144 */
145 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
146 const struct ipoib_remac *remac ) {
147 struct ipoib_peer *peer;
148
149 /* Check for broadcast or multicast REMAC. We transmit
150 * multicasts as broadcasts for simplicity.
151 */
152 if ( is_multicast_ether_addr ( remac ) )
153 return &ipoib->broadcast;
154
155 /* Try to find via REMAC cache */
156 list_for_each_entry ( peer, &ipoib->peers, list ) {
157 if ( memcmp ( remac, &peer->remac,
158 sizeof ( peer->remac ) ) == 0 ) {
159 /* Move peer to start of list */
160 list_del ( &peer->list );
161 list_add ( &peer->list, &ipoib->peers );
162 return &peer->mac;
163 }
164 }
165
166 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
167 ipoib, eth_ntoa ( remac ) );
168 return NULL;
169 }
170
171 /**
172 * Add IPoIB MAC to REMAC cache
173 *
174 * @v ipoib IPoIB device
175 * @v remac Remote Ethernet MAC
176 * @v mac IPoIB MAC
177 * @ret rc Return status code
178 */
179 static int ipoib_map_remac ( struct ipoib_device *ipoib,
180 const struct ipoib_remac *remac,
181 const struct ipoib_mac *mac ) {
182 struct ipoib_peer *peer;
183
184 /* Check for existing entry in REMAC cache */
185 list_for_each_entry ( peer, &ipoib->peers, list ) {
186 if ( memcmp ( remac, &peer->remac,
187 sizeof ( peer->remac ) ) == 0 ) {
188 /* Move peer to start of list */
189 list_del ( &peer->list );
190 list_add ( &peer->list, &ipoib->peers );
191 /* Update MAC */
192 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
193 return 0;
194 }
195 }
196
197 /* Create new entry */
198 peer = malloc ( sizeof ( *peer ) );
199 if ( ! peer )
200 return -ENOMEM;
201 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
202 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
203 list_add ( &peer->list, &ipoib->peers );
204
205 return 0;
206 }
207
208 /**
209 * Flush REMAC cache
210 *
211 * @v ipoib IPoIB device
212 */
213 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
214 struct ipoib_peer *peer;
215 struct ipoib_peer *tmp;
216
217 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
218 list_del ( &peer->list );
219 free ( peer );
220 }
221 }
222
223 /**
224 * Discard some entries from the REMAC cache
225 *
226 * @ret discarded Number of cached items discarded
227 */
228 static unsigned int ipoib_discard_remac ( void ) {
229 struct net_device *netdev;
230 struct ipoib_device *ipoib;
231 struct ipoib_peer *peer;
232 unsigned int discarded = 0;
233
234 /* Try to discard one cache entry for each IPoIB device */
235 for_each_netdev ( netdev ) {
236
237 /* Skip non-IPoIB devices */
238 if ( netdev->op != &ipoib_operations )
239 continue;
240 ipoib = netdev->priv;
241
242 /* Discard least recently used cache entry (if any) */
243 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
244 list_del ( &peer->list );
245 free ( peer );
246 discarded++;
247 break;
248 }
249 }
250
251 return discarded;
252 }
253
254 /** IPoIB cache discarder */
255 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
256 .discard = ipoib_discard_remac,
257 };
258
259 /****************************************************************************
260 *
261 * IPoIB link layer
262 *
263 ****************************************************************************
264 */
265
266 /**
267 * Initialise IPoIB link-layer address
268 *
269 * @v hw_addr Hardware address
270 * @v ll_addr Link-layer address
271 */
272 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
273 const uint8_t *guid = hw_addr;
274 uint8_t *eth_addr = ll_addr;
275 uint8_t guid_mask = IPOIB_GUID_MASK;
276 unsigned int i;
277
278 /* Extract bytes from GUID according to mask */
279 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
280 if ( guid_mask & 0x80 )
281 *(eth_addr++) = *guid;
282 }
283 }
284
285 /** IPoIB protocol */
286 struct ll_protocol ipoib_protocol __ll_protocol = {
287 .name = "IPoIB",
288 .ll_proto = htons ( ARPHRD_ETHER ),
289 .hw_addr_len = sizeof ( union ib_guid ),
290 .ll_addr_len = ETH_ALEN,
291 .ll_header_len = ETH_HLEN,
292 .push = eth_push,
293 .pull = eth_pull,
294 .init_addr = ipoib_init_addr,
295 .ntoa = eth_ntoa,
296 .mc_hash = eth_mc_hash,
297 .eth_addr = eth_eth_addr,
298 .eui64 = eth_eui64,
299 .flags = LL_NAME_ONLY,
300 };
301
302 /**
303 * Allocate IPoIB device
304 *
305 * @v priv_size Size of driver private data
306 * @ret netdev Network device, or NULL
307 */
308 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
309 struct net_device *netdev;
310
311 netdev = alloc_netdev ( priv_size );
312 if ( netdev ) {
313 netdev->ll_protocol = &ipoib_protocol;
314 netdev->ll_broadcast = eth_broadcast;
315 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
316 }
317 return netdev;
318 }
319
320 /****************************************************************************
321 *
322 * IPoIB translation layer
323 *
324 ****************************************************************************
325 */
326
327 /**
328 * Translate transmitted ARP packet
329 *
330 * @v netdev Network device
331 * @v iobuf Packet to be transmitted (with no link-layer headers)
332 * @ret rc Return status code
333 */
334 static int ipoib_translate_tx_arp ( struct net_device *netdev,
335 struct io_buffer *iobuf ) {
336 struct ipoib_device *ipoib = netdev->priv;
337 struct arphdr *arphdr = iobuf->data;
338 struct ipoib_mac *target_ha = NULL;
339 void *sender_pa;
340 void *target_pa;
341
342 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
343 if ( arphdr->ar_hln != ETH_ALEN )
344 return 0;
345
346 /* Fail unless we have room to expand packet */
347 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
348 ETH_ALEN ) ) ) {
349 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
350 ipoib );
351 return -ENOBUFS;
352 }
353
354 /* Look up REMAC, if applicable */
355 if ( arphdr->ar_op == ARPOP_REPLY ) {
356 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
357 if ( ! target_ha ) {
358 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
359 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
360 return -ENXIO_ARP_REPLY;
361 }
362 }
363
364 /* Construct new packet */
365 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
366 sender_pa = arp_sender_pa ( arphdr );
367 target_pa = arp_target_pa ( arphdr );
368 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
369 arphdr->ar_hln = sizeof ( ipoib->mac );
370 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
371 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
372 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
373 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
374 if ( target_ha ) {
375 memcpy ( arp_target_ha ( arphdr ), target_ha,
376 sizeof ( *target_ha ) );
377 }
378
379 return 0;
380 }
381
382 /**
383 * Translate transmitted packet
384 *
385 * @v netdev Network device
386 * @v iobuf Packet to be transmitted (with no link-layer headers)
387 * @v net_proto Network-layer protocol (in network byte order)
388 * @ret rc Return status code
389 */
390 static int ipoib_translate_tx ( struct net_device *netdev,
391 struct io_buffer *iobuf, uint16_t net_proto ) {
392
393 switch ( net_proto ) {
394 case htons ( ETH_P_ARP ) :
395 return ipoib_translate_tx_arp ( netdev, iobuf );
396 case htons ( ETH_P_IP ) :
397 /* No translation needed */
398 return 0;
399 default:
400 /* Cannot handle other traffic via eIPoIB */
401 return -ENOTSUP;
402 }
403 }
404
405 /**
406 * Translate received ARP packet
407 *
408 * @v netdev Network device
409 * @v iobuf Received packet (with no link-layer headers)
410 * @v remac Constructed Remote Ethernet MAC
411 * @ret rc Return status code
412 */
413 static int ipoib_translate_rx_arp ( struct net_device *netdev,
414 struct io_buffer *iobuf,
415 struct ipoib_remac *remac ) {
416 struct ipoib_device *ipoib = netdev->priv;
417 struct arphdr *arphdr = iobuf->data;
418 void *sender_pa;
419 void *target_pa;
420 int rc;
421
422 /* Do nothing unless ARP contains IPoIB link-layer addresses */
423 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
424 return 0;
425
426 /* Create REMAC cache entry */
427 if ( ( rc = ipoib_map_remac ( ipoib, remac,
428 arp_sender_ha ( arphdr ) ) ) != 0 ) {
429 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
430 ipoib, strerror ( rc ) );
431 return rc;
432 }
433
434 /* Construct new packet */
435 sender_pa = arp_sender_pa ( arphdr );
436 target_pa = arp_target_pa ( arphdr );
437 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
438 arphdr->ar_hln = ETH_ALEN;
439 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
440 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
441 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
442 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
443 if ( arphdr->ar_op == ARPOP_REPLY ) {
444 /* Assume received replies were directed to us */
445 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
446 }
447 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
448
449 return 0;
450 }
451
452 /**
453 * Translate received packet
454 *
455 * @v netdev Network device
456 * @v iobuf Received packet (with no link-layer headers)
457 * @v remac Constructed Remote Ethernet MAC
458 * @v net_proto Network-layer protocol (in network byte order)
459 * @ret rc Return status code
460 */
461 static int ipoib_translate_rx ( struct net_device *netdev,
462 struct io_buffer *iobuf,
463 struct ipoib_remac *remac,
464 uint16_t net_proto ) {
465
466 switch ( net_proto ) {
467 case htons ( ETH_P_ARP ) :
468 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
469 case htons ( ETH_P_IP ) :
470 /* No translation needed */
471 return 0;
472 default:
473 /* Cannot handle other traffic via eIPoIB */
474 return -ENOTSUP;
475 }
476 }
477
478 /****************************************************************************
479 *
480 * IPoIB network device
481 *
482 ****************************************************************************
483 */
484
485 /**
486 * Transmit packet via IPoIB network device
487 *
488 * @v netdev Network device
489 * @v iobuf I/O buffer
490 * @ret rc Return status code
491 */
492 static int ipoib_transmit ( struct net_device *netdev,
493 struct io_buffer *iobuf ) {
494 struct ipoib_device *ipoib = netdev->priv;
495 struct ib_device *ibdev = ipoib->ibdev;
496 struct ethhdr *ethhdr;
497 struct iphdr *iphdr;
498 struct ipoib_hdr *ipoib_hdr;
499 struct ipoib_mac *mac;
500 struct ib_address_vector dest;
501 uint16_t net_proto;
502 int rc;
503
504 /* Sanity check */
505 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
506 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
507 return -EINVAL;
508 }
509
510 /* Attempting transmission while link is down will put the
511 * queue pair into an error state, so don't try it.
512 */
513 if ( ! ib_link_ok ( ibdev ) )
514 return -ENETUNREACH;
515
516 /* Strip eIPoIB header */
517 ethhdr = iobuf->data;
518 net_proto = ethhdr->h_protocol;
519 iob_pull ( iobuf, sizeof ( *ethhdr ) );
520
521 /* Identify destination address */
522 mac = ipoib_find_remac ( ipoib, ( ( void * ) ethhdr->h_dest ) );
523 if ( ! mac ) {
524 /* Generate a new ARP request (if possible) to trigger
525 * population of the REMAC cache entry.
526 */
527 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
528 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
529 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
530 "packet type %04x\n", ipoib,
531 eth_ntoa ( ethhdr->h_dest ),
532 ntohs ( net_proto ) );
533 return -ENXIO_NON_IPV4;
534 }
535 iphdr = iobuf->data;
536 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
537 &iphdr->dest, &iphdr->src ) ) !=0){
538 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
539 ipoib, eth_ntoa ( ethhdr->h_dest ),
540 inet_ntoa ( iphdr->dest ) );
541 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
542 strerror ( rc ) );
543 return rc;
544 }
545 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
546 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
547 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
548 return -ENXIO_ARP_SENT;
549 }
550
551 /* Translate packet if applicable */
552 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
553 return rc;
554
555 /* Prepend real IPoIB header */
556 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
557 ipoib_hdr->proto = net_proto;
558 ipoib_hdr->reserved = 0;
559
560 /* Construct address vector */
561 memset ( &dest, 0, sizeof ( dest ) );
562 dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
563 dest.gid_present = 1;
564 memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
565 if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
566 /* Path not resolved yet */
567 return rc;
568 }
569
570 return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
571 }
572
573 /**
574 * Handle IPoIB send completion
575 *
576 * @v ibdev Infiniband device
577 * @v qp Queue pair
578 * @v iobuf I/O buffer
579 * @v rc Completion status code
580 */
581 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
582 struct ib_queue_pair *qp,
583 struct io_buffer *iobuf, int rc ) {
584 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
585
586 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
587 }
588
589 /**
590 * Handle IPoIB receive completion
591 *
592 * @v ibdev Infiniband device
593 * @v qp Queue pair
594 * @v dest Destination address vector, or NULL
595 * @v source Source address vector, or NULL
596 * @v iobuf I/O buffer
597 * @v rc Completion status code
598 */
599 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
600 struct ib_queue_pair *qp,
601 struct ib_address_vector *dest,
602 struct ib_address_vector *source,
603 struct io_buffer *iobuf, int rc ) {
604 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
605 struct net_device *netdev = ipoib->netdev;
606 struct ipoib_hdr *ipoib_hdr;
607 struct ethhdr *ethhdr;
608 struct ipoib_remac remac;
609 uint16_t net_proto;
610
611 /* Record errors */
612 if ( rc != 0 ) {
613 netdev_rx_err ( netdev, iobuf, rc );
614 return;
615 }
616
617 /* Sanity check */
618 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
619 DBGC ( ipoib, "IPoIB %p received packet too short to "
620 "contain IPoIB header\n", ipoib );
621 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
622 netdev_rx_err ( netdev, iobuf, -EIO );
623 return;
624 }
625 if ( ! source ) {
626 DBGC ( ipoib, "IPoIB %p received packet without address "
627 "vector\n", ipoib );
628 netdev_rx_err ( netdev, iobuf, -ENOTTY );
629 return;
630 }
631
632 /* Strip real IPoIB header */
633 ipoib_hdr = iobuf->data;
634 net_proto = ipoib_hdr->proto;
635 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
636
637 /* Construct source address from remote QPN and LID */
638 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
639 remac.lid = htons ( source->lid );
640
641 /* Translate packet if applicable */
642 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
643 net_proto ) ) != 0 ) {
644 netdev_rx_err ( netdev, iobuf, rc );
645 return;
646 }
647
648 /* Prepend eIPoIB header */
649 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
650 memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
651 ethhdr->h_protocol = net_proto;
652
653 /* Construct destination address */
654 if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
655 sizeof ( dest->gid ) ) == 0 ) ) {
656 /* Broadcast GID; use the Ethernet broadcast address */
657 memcpy ( &ethhdr->h_dest, eth_broadcast,
658 sizeof ( ethhdr->h_dest ) );
659 } else {
660 /* Assume destination address is local Ethernet MAC */
661 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
662 sizeof ( ethhdr->h_dest ) );
663 }
664
665 /* Hand off to network layer */
666 netdev_rx ( netdev, iobuf );
667 }
668
669 /** IPoIB completion operations */
670 static struct ib_completion_queue_operations ipoib_cq_op = {
671 .complete_send = ipoib_complete_send,
672 .complete_recv = ipoib_complete_recv,
673 };
674
675 /**
676 * Allocate IPoIB receive I/O buffer
677 *
678 * @v len Length of buffer
679 * @ret iobuf I/O buffer, or NULL
680 *
681 * Some Infiniband hardware requires 2kB alignment of receive buffers
682 * and provides no way to disable header separation. The result is
683 * that there are only four bytes of link-layer header (the real IPoIB
684 * header) before the payload. This is not sufficient space to insert
685 * an eIPoIB link-layer pseudo-header.
686 *
687 * We therefore allocate I/O buffers offset to start slightly before
688 * the natural alignment boundary, in order to allow sufficient space.
689 */
690 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
691 struct io_buffer *iobuf;
692 size_t reserve_len;
693
694 /* Calculate additional length required at start of buffer */
695 reserve_len = ( sizeof ( struct ethhdr ) -
696 sizeof ( struct ipoib_hdr ) );
697
698 /* Allocate buffer */
699 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
700 if ( iobuf ) {
701 iob_reserve ( iobuf, reserve_len );
702 }
703 return iobuf;
704 }
705
706 /** IPoIB queue pair operations */
707 static struct ib_queue_pair_operations ipoib_qp_op = {
708 .alloc_iob = ipoib_alloc_iob,
709 };
710
711 /**
712 * Poll IPoIB network device
713 *
714 * @v netdev Network device
715 */
716 static void ipoib_poll ( struct net_device *netdev ) {
717 struct ipoib_device *ipoib = netdev->priv;
718 struct ib_device *ibdev = ipoib->ibdev;
719
720 /* Poll Infiniband device */
721 ib_poll_eq ( ibdev );
722
723 /* Poll the retry timers (required for IPoIB multicast join) */
724 retry_poll();
725 }
726
727 /**
728 * Handle IPv4 broadcast multicast group join completion
729 *
730 * @v ibdev Infiniband device
731 * @v qp Queue pair
732 * @v membership Multicast group membership
733 * @v rc Status code
734 * @v mad Response MAD (or NULL on error)
735 */
736 void ipoib_join_complete ( struct ib_device *ibdev __unused,
737 struct ib_queue_pair *qp __unused,
738 struct ib_mc_membership *membership, int rc,
739 union ib_mad *mad __unused ) {
740 struct ipoib_device *ipoib = container_of ( membership,
741 struct ipoib_device, broadcast_membership );
742
743 /* Record join status as link status */
744 netdev_link_err ( ipoib->netdev, rc );
745 }
746
747 /**
748 * Join IPv4 broadcast multicast group
749 *
750 * @v ipoib IPoIB device
751 * @ret rc Return status code
752 */
753 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
754 int rc;
755
756 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
757 &ipoib->broadcast_membership,
758 &ipoib->broadcast.gid,
759 ipoib_join_complete ) ) != 0 ) {
760 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
761 ipoib, strerror ( rc ) );
762 return rc;
763 }
764 ipoib->broadcast_joined = 1;
765
766 return 0;
767 }
768
769 /**
770 * Leave IPv4 broadcast multicast group
771 *
772 * @v ipoib IPoIB device
773 */
774 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
775
776 if ( ipoib->broadcast_joined ) {
777 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
778 &ipoib->broadcast_membership );
779 ipoib->broadcast_joined = 0;
780 }
781 }
782
783 /**
784 * Handle link status change
785 *
786 * @v ibdev Infiniband device
787 */
788 static void ipoib_link_state_changed ( struct ib_device *ibdev ) {
789 struct net_device *netdev = ib_get_ownerdata ( ibdev );
790 struct ipoib_device *ipoib = netdev->priv;
791 int rc;
792
793 /* Leave existing broadcast group */
794 if ( ipoib->qp )
795 ipoib_leave_broadcast_group ( ipoib );
796
797 /* Update MAC address based on potentially-new GID prefix */
798 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
799 sizeof ( ipoib->mac.gid.s.prefix ) );
800
801 /* Update broadcast GID based on potentially-new partition key */
802 ipoib->broadcast.gid.words[2] =
803 htons ( ibdev->pkey | IB_PKEY_FULL );
804
805 /* Set net device link state to reflect Infiniband link state */
806 rc = ib_link_rc ( ibdev );
807 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
808
809 /* Join new broadcast group */
810 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
811 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
812 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
813 "%s\n", ipoib, strerror ( rc ) );
814 netdev_link_err ( netdev, rc );
815 return;
816 }
817 }
818
819 /**
820 * Open IPoIB network device
821 *
822 * @v netdev Network device
823 * @ret rc Return status code
824 */
825 static int ipoib_open ( struct net_device *netdev ) {
826 struct ipoib_device *ipoib = netdev->priv;
827 struct ib_device *ibdev = ipoib->ibdev;
828 int rc;
829
830 /* Open IB device */
831 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
832 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
833 ipoib, strerror ( rc ) );
834 goto err_ib_open;
835 }
836
837 /* Allocate completion queue */
838 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
839 if ( ! ipoib->cq ) {
840 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
841 ipoib );
842 rc = -ENOMEM;
843 goto err_create_cq;
844 }
845
846 /* Allocate queue pair */
847 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
848 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
849 &ipoib_qp_op );
850 if ( ! ipoib->qp ) {
851 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
852 ipoib );
853 rc = -ENOMEM;
854 goto err_create_qp;
855 }
856 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
857
858 /* Update MAC address with QPN */
859 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
860
861 /* Fill receive rings */
862 ib_refill_recv ( ibdev, ipoib->qp );
863
864 /* Fake a link status change to join the broadcast group */
865 ipoib_link_state_changed ( ibdev );
866
867 return 0;
868
869 ib_destroy_qp ( ibdev, ipoib->qp );
870 err_create_qp:
871 ib_destroy_cq ( ibdev, ipoib->cq );
872 err_create_cq:
873 ib_close ( ibdev );
874 err_ib_open:
875 return rc;
876 }
877
878 /**
879 * Close IPoIB network device
880 *
881 * @v netdev Network device
882 */
883 static void ipoib_close ( struct net_device *netdev ) {
884 struct ipoib_device *ipoib = netdev->priv;
885 struct ib_device *ibdev = ipoib->ibdev;
886
887 /* Flush REMAC cache */
888 ipoib_flush_remac ( ipoib );
889
890 /* Leave broadcast group */
891 ipoib_leave_broadcast_group ( ipoib );
892
893 /* Remove QPN from MAC address */
894 ipoib->mac.flags__qpn = 0;
895
896 /* Tear down the queues */
897 ib_destroy_qp ( ibdev, ipoib->qp );
898 ipoib->qp = NULL;
899 ib_destroy_cq ( ibdev, ipoib->cq );
900 ipoib->cq = NULL;
901
902 /* Close IB device */
903 ib_close ( ibdev );
904 }
905
906 /** IPoIB network device operations */
907 static struct net_device_operations ipoib_operations = {
908 .open = ipoib_open,
909 .close = ipoib_close,
910 .transmit = ipoib_transmit,
911 .poll = ipoib_poll,
912 };
913
914 /**
915 * Probe IPoIB device
916 *
917 * @v ibdev Infiniband device
918 * @ret rc Return status code
919 */
920 static int ipoib_probe ( struct ib_device *ibdev ) {
921 struct net_device *netdev;
922 struct ipoib_device *ipoib;
923 int rc;
924
925 /* Allocate network device */
926 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
927 if ( ! netdev )
928 return -ENOMEM;
929 netdev_init ( netdev, &ipoib_operations );
930 ipoib = netdev->priv;
931 ib_set_ownerdata ( ibdev, netdev );
932 netdev->dev = ibdev->dev;
933 memset ( ipoib, 0, sizeof ( *ipoib ) );
934 ipoib->netdev = netdev;
935 ipoib->ibdev = ibdev;
936 INIT_LIST_HEAD ( &ipoib->peers );
937
938 /* Extract hardware address */
939 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
940 sizeof ( ibdev->gid.s.guid ) );
941
942 /* Set local MAC address */
943 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
944 sizeof ( ipoib->mac.gid.s.guid ) );
945
946 /* Set default broadcast MAC address */
947 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
948 sizeof ( ipoib->broadcast ) );
949
950 /* Register network device */
951 if ( ( rc = register_netdev ( netdev ) ) != 0 )
952 goto err_register_netdev;
953
954 return 0;
955
956 err_register_netdev:
957 netdev_nullify ( netdev );
958 netdev_put ( netdev );
959 return rc;
960 }
961
/**
 * Destroy the IPoIB network device for an Infiniband device
 *
 * @v ibdev		Infiniband device
 *
 * The network device is unregistered, nullified and its reference
 * dropped.
 */
static void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev;

	/* The network device was stored as the owner data at probe time */
	netdev = ib_get_ownerdata ( ibdev );

	unregister_netdev ( netdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}
974
975 /** IPoIB driver */
976 struct ib_driver ipoib_driver __ib_driver = {
977 .name = "IPoIB",
978 .probe = ipoib_probe,
979 .notify = ipoib_link_state_changed,
980 .remove = ipoib_remove,
981 };