[infiniband] Record multicast GID attachment as part of group membership
[ipxe.git] / src / drivers / net / ipoib.c
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * You can also choose to distribute this program under the terms of
20 * the Unmodified Binary Distribution Licence (as given in the file
21 * COPYING.UBDL), provided that you have satisfied its requirements.
22 */
23
24 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <string.h>
31 #include <byteswap.h>
32 #include <errno.h>
33 #include <ipxe/errortab.h>
34 #include <ipxe/malloc.h>
35 #include <ipxe/if_arp.h>
36 #include <ipxe/arp.h>
37 #include <ipxe/if_ether.h>
38 #include <ipxe/ethernet.h>
39 #include <ipxe/ip.h>
40 #include <ipxe/iobuf.h>
41 #include <ipxe/netdevice.h>
42 #include <ipxe/infiniband.h>
43 #include <ipxe/ib_pathrec.h>
44 #include <ipxe/ib_mcast.h>
45 #include <ipxe/retry.h>
46 #include <ipxe/ipoib.h>
47
48 /** @file
49 *
50 * IP over Infiniband
51 */
52
/*
 * Distinct ENXIO variants, so that each "missing REMAC" failure mode
 * can be told apart from its error message.
 */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )

#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )

#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
66
/** Send work queue entries used by the IPoIB queue pair */
#define IPOIB_NUM_SEND_WQES	2

/** Receive work queue entries used by the IPoIB queue pair */
#define IPOIB_NUM_RECV_WQES	4

/** Entries in the IPoIB completion queue */
#define IPOIB_NUM_CQES		8
75
76 /** An IPoIB device */
77 struct ipoib_device {
78 /** Network device */
79 struct net_device *netdev;
80 /** Underlying Infiniband device */
81 struct ib_device *ibdev;
82 /** List of IPoIB devices */
83 struct list_head list;
84 /** Completion queue */
85 struct ib_completion_queue *cq;
86 /** Queue pair */
87 struct ib_queue_pair *qp;
88 /** Local MAC */
89 struct ipoib_mac mac;
90 /** Broadcast MAC */
91 struct ipoib_mac broadcast;
92 /** IPv4 broadcast multicast group membership */
93 struct ib_mc_membership membership;
94 /** REMAC cache */
95 struct list_head peers;
96 };
97
98 /** Broadcast IPoIB address */
99 static struct ipoib_mac ipoib_broadcast = {
100 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
101 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
102 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
103 };
104
105 /** Link status for "broadcast join in progress" */
106 #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
107 #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
108 ( EINFO_EINPROGRESS, 0x01, "Joining" )
109
110 /** Human-readable message for the link status */
111 struct errortab ipoib_errors[] __errortab = {
112 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
113 };
114
115 /** List of all IPoIB devices */
116 static LIST_HEAD ( ipoib_devices );
117
118 static struct net_device_operations ipoib_operations;
119
120 /****************************************************************************
121 *
122 * IPoIB REMAC cache
123 *
124 ****************************************************************************
125 */
126
127 /** An IPoIB REMAC cache entry */
128 struct ipoib_peer {
129 /** List of REMAC cache entries */
130 struct list_head list;
131 /** Remote Ethermet MAC */
132 struct ipoib_remac remac;
133 /** MAC address */
134 struct ipoib_mac mac;
135 };
136
137 /**
138 * Find IPoIB MAC from REMAC
139 *
140 * @v ipoib IPoIB device
141 * @v remac Remote Ethernet MAC
142 * @ret mac IPoIB MAC (or NULL if not found)
143 */
144 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
145 const struct ipoib_remac *remac ) {
146 struct ipoib_peer *peer;
147
148 /* Check for broadcast or multicast REMAC. We transmit
149 * multicasts as broadcasts for simplicity.
150 */
151 if ( is_multicast_ether_addr ( remac ) )
152 return &ipoib->broadcast;
153
154 /* Try to find via REMAC cache */
155 list_for_each_entry ( peer, &ipoib->peers, list ) {
156 if ( memcmp ( remac, &peer->remac,
157 sizeof ( peer->remac ) ) == 0 ) {
158 /* Move peer to start of list */
159 list_del ( &peer->list );
160 list_add ( &peer->list, &ipoib->peers );
161 return &peer->mac;
162 }
163 }
164
165 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
166 ipoib, eth_ntoa ( remac ) );
167 return NULL;
168 }
169
170 /**
171 * Add IPoIB MAC to REMAC cache
172 *
173 * @v ipoib IPoIB device
174 * @v remac Remote Ethernet MAC
175 * @v mac IPoIB MAC
176 * @ret rc Return status code
177 */
178 static int ipoib_map_remac ( struct ipoib_device *ipoib,
179 const struct ipoib_remac *remac,
180 const struct ipoib_mac *mac ) {
181 struct ipoib_peer *peer;
182
183 /* Check for existing entry in REMAC cache */
184 list_for_each_entry ( peer, &ipoib->peers, list ) {
185 if ( memcmp ( remac, &peer->remac,
186 sizeof ( peer->remac ) ) == 0 ) {
187 /* Move peer to start of list */
188 list_del ( &peer->list );
189 list_add ( &peer->list, &ipoib->peers );
190 /* Update MAC */
191 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
192 return 0;
193 }
194 }
195
196 /* Create new entry */
197 peer = malloc ( sizeof ( *peer ) );
198 if ( ! peer )
199 return -ENOMEM;
200 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
201 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
202 list_add ( &peer->list, &ipoib->peers );
203
204 return 0;
205 }
206
207 /**
208 * Flush REMAC cache
209 *
210 * @v ipoib IPoIB device
211 */
212 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
213 struct ipoib_peer *peer;
214 struct ipoib_peer *tmp;
215
216 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
217 list_del ( &peer->list );
218 free ( peer );
219 }
220 }
221
222 /**
223 * Discard some entries from the REMAC cache
224 *
225 * @ret discarded Number of cached items discarded
226 */
227 static unsigned int ipoib_discard_remac ( void ) {
228 struct net_device *netdev;
229 struct ipoib_device *ipoib;
230 struct ipoib_peer *peer;
231 unsigned int discarded = 0;
232
233 /* Try to discard one cache entry for each IPoIB device */
234 for_each_netdev ( netdev ) {
235
236 /* Skip non-IPoIB devices */
237 if ( netdev->op != &ipoib_operations )
238 continue;
239 ipoib = netdev->priv;
240
241 /* Discard least recently used cache entry (if any) */
242 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
243 list_del ( &peer->list );
244 free ( peer );
245 discarded++;
246 break;
247 }
248 }
249
250 return discarded;
251 }
252
253 /** IPoIB cache discarder */
254 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
255 .discard = ipoib_discard_remac,
256 };
257
258 /****************************************************************************
259 *
260 * IPoIB link layer
261 *
262 ****************************************************************************
263 */
264
265 /**
266 * Initialise IPoIB link-layer address
267 *
268 * @v hw_addr Hardware address
269 * @v ll_addr Link-layer address
270 */
271 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
272 const uint8_t *guid = hw_addr;
273 uint8_t *eth_addr = ll_addr;
274 uint8_t guid_mask = IPOIB_GUID_MASK;
275 unsigned int i;
276
277 /* Extract bytes from GUID according to mask */
278 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
279 if ( guid_mask & 0x80 )
280 *(eth_addr++) = *guid;
281 }
282 }
283
284 /** IPoIB protocol */
285 struct ll_protocol ipoib_protocol __ll_protocol = {
286 .name = "IPoIB",
287 .ll_proto = htons ( ARPHRD_ETHER ),
288 .hw_addr_len = sizeof ( union ib_guid ),
289 .ll_addr_len = ETH_ALEN,
290 .ll_header_len = ETH_HLEN,
291 .push = eth_push,
292 .pull = eth_pull,
293 .init_addr = ipoib_init_addr,
294 .ntoa = eth_ntoa,
295 .mc_hash = eth_mc_hash,
296 .eth_addr = eth_eth_addr,
297 .eui64 = eth_eui64,
298 .flags = LL_NAME_ONLY,
299 };
300
301 /**
302 * Allocate IPoIB device
303 *
304 * @v priv_size Size of driver private data
305 * @ret netdev Network device, or NULL
306 */
307 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
308 struct net_device *netdev;
309
310 netdev = alloc_netdev ( priv_size );
311 if ( netdev ) {
312 netdev->ll_protocol = &ipoib_protocol;
313 netdev->ll_broadcast = eth_broadcast;
314 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
315 }
316 return netdev;
317 }
318
319 /****************************************************************************
320 *
321 * IPoIB translation layer
322 *
323 ****************************************************************************
324 */
325
326 /**
327 * Translate transmitted ARP packet
328 *
329 * @v netdev Network device
330 * @v iobuf Packet to be transmitted (with no link-layer headers)
331 * @ret rc Return status code
332 */
333 static int ipoib_translate_tx_arp ( struct net_device *netdev,
334 struct io_buffer *iobuf ) {
335 struct ipoib_device *ipoib = netdev->priv;
336 struct arphdr *arphdr = iobuf->data;
337 struct ipoib_mac *target_ha = NULL;
338 void *sender_pa;
339 void *target_pa;
340
341 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
342 if ( arphdr->ar_hln != ETH_ALEN )
343 return 0;
344
345 /* Fail unless we have room to expand packet */
346 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
347 ETH_ALEN ) ) ) {
348 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
349 ipoib );
350 return -ENOBUFS;
351 }
352
353 /* Look up REMAC, if applicable */
354 if ( arphdr->ar_op == ARPOP_REPLY ) {
355 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
356 if ( ! target_ha ) {
357 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
358 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
359 return -ENXIO_ARP_REPLY;
360 }
361 }
362
363 /* Construct new packet */
364 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
365 sender_pa = arp_sender_pa ( arphdr );
366 target_pa = arp_target_pa ( arphdr );
367 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
368 arphdr->ar_hln = sizeof ( ipoib->mac );
369 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
370 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
371 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
372 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
373 if ( target_ha ) {
374 memcpy ( arp_target_ha ( arphdr ), target_ha,
375 sizeof ( *target_ha ) );
376 }
377
378 return 0;
379 }
380
381 /**
382 * Translate transmitted packet
383 *
384 * @v netdev Network device
385 * @v iobuf Packet to be transmitted (with no link-layer headers)
386 * @v net_proto Network-layer protocol (in network byte order)
387 * @ret rc Return status code
388 */
389 static int ipoib_translate_tx ( struct net_device *netdev,
390 struct io_buffer *iobuf, uint16_t net_proto ) {
391
392 switch ( net_proto ) {
393 case htons ( ETH_P_ARP ) :
394 return ipoib_translate_tx_arp ( netdev, iobuf );
395 case htons ( ETH_P_IP ) :
396 /* No translation needed */
397 return 0;
398 default:
399 /* Cannot handle other traffic via eIPoIB */
400 return -ENOTSUP;
401 }
402 }
403
404 /**
405 * Translate received ARP packet
406 *
407 * @v netdev Network device
408 * @v iobuf Received packet (with no link-layer headers)
409 * @v remac Constructed Remote Ethernet MAC
410 * @ret rc Return status code
411 */
412 static int ipoib_translate_rx_arp ( struct net_device *netdev,
413 struct io_buffer *iobuf,
414 struct ipoib_remac *remac ) {
415 struct ipoib_device *ipoib = netdev->priv;
416 struct arphdr *arphdr = iobuf->data;
417 void *sender_pa;
418 void *target_pa;
419 int rc;
420
421 /* Do nothing unless ARP contains IPoIB link-layer addresses */
422 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
423 return 0;
424
425 /* Create REMAC cache entry */
426 if ( ( rc = ipoib_map_remac ( ipoib, remac,
427 arp_sender_ha ( arphdr ) ) ) != 0 ) {
428 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
429 ipoib, strerror ( rc ) );
430 return rc;
431 }
432
433 /* Construct new packet */
434 sender_pa = arp_sender_pa ( arphdr );
435 target_pa = arp_target_pa ( arphdr );
436 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
437 arphdr->ar_hln = ETH_ALEN;
438 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
439 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
440 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
441 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
442 if ( arphdr->ar_op == ARPOP_REPLY ) {
443 /* Assume received replies were directed to us */
444 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
445 }
446 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
447
448 return 0;
449 }
450
451 /**
452 * Translate received packet
453 *
454 * @v netdev Network device
455 * @v iobuf Received packet (with no link-layer headers)
456 * @v remac Constructed Remote Ethernet MAC
457 * @v net_proto Network-layer protocol (in network byte order)
458 * @ret rc Return status code
459 */
460 static int ipoib_translate_rx ( struct net_device *netdev,
461 struct io_buffer *iobuf,
462 struct ipoib_remac *remac,
463 uint16_t net_proto ) {
464
465 switch ( net_proto ) {
466 case htons ( ETH_P_ARP ) :
467 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
468 case htons ( ETH_P_IP ) :
469 /* No translation needed */
470 return 0;
471 default:
472 /* Cannot handle other traffic via eIPoIB */
473 return -ENOTSUP;
474 }
475 }
476
477 /****************************************************************************
478 *
479 * IPoIB network device
480 *
481 ****************************************************************************
482 */
483
484 /**
485 * Transmit packet via IPoIB network device
486 *
487 * @v netdev Network device
488 * @v iobuf I/O buffer
489 * @ret rc Return status code
490 */
491 static int ipoib_transmit ( struct net_device *netdev,
492 struct io_buffer *iobuf ) {
493 struct ipoib_device *ipoib = netdev->priv;
494 struct ib_device *ibdev = ipoib->ibdev;
495 struct ethhdr *ethhdr;
496 struct iphdr *iphdr;
497 struct ipoib_hdr *ipoib_hdr;
498 struct ipoib_mac *mac;
499 struct ib_address_vector dest;
500 uint16_t net_proto;
501 int rc;
502
503 /* Sanity check */
504 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
505 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
506 return -EINVAL;
507 }
508
509 /* Attempting transmission while link is down will put the
510 * queue pair into an error state, so don't try it.
511 */
512 if ( ! ib_link_ok ( ibdev ) )
513 return -ENETUNREACH;
514
515 /* Strip eIPoIB header */
516 ethhdr = iobuf->data;
517 net_proto = ethhdr->h_protocol;
518 iob_pull ( iobuf, sizeof ( *ethhdr ) );
519
520 /* Identify destination address */
521 mac = ipoib_find_remac ( ipoib, ( ( void * ) ethhdr->h_dest ) );
522 if ( ! mac ) {
523 /* Generate a new ARP request (if possible) to trigger
524 * population of the REMAC cache entry.
525 */
526 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
527 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
528 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
529 "packet type %04x\n", ipoib,
530 eth_ntoa ( ethhdr->h_dest ),
531 ntohs ( net_proto ) );
532 return -ENXIO_NON_IPV4;
533 }
534 iphdr = iobuf->data;
535 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
536 &iphdr->dest, &iphdr->src ) ) !=0){
537 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
538 ipoib, eth_ntoa ( ethhdr->h_dest ),
539 inet_ntoa ( iphdr->dest ) );
540 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
541 strerror ( rc ) );
542 return rc;
543 }
544 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
545 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
546 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
547 return -ENXIO_ARP_SENT;
548 }
549
550 /* Translate packet if applicable */
551 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
552 return rc;
553
554 /* Prepend real IPoIB header */
555 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
556 ipoib_hdr->proto = net_proto;
557 ipoib_hdr->reserved = 0;
558
559 /* Construct address vector */
560 memset ( &dest, 0, sizeof ( dest ) );
561 dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
562 dest.gid_present = 1;
563 memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
564 if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
565 /* Path not resolved yet */
566 return rc;
567 }
568
569 return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
570 }
571
572 /**
573 * Handle IPoIB send completion
574 *
575 * @v ibdev Infiniband device
576 * @v qp Queue pair
577 * @v iobuf I/O buffer
578 * @v rc Completion status code
579 */
580 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
581 struct ib_queue_pair *qp,
582 struct io_buffer *iobuf, int rc ) {
583 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
584
585 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
586 }
587
588 /**
589 * Handle IPoIB receive completion
590 *
591 * @v ibdev Infiniband device
592 * @v qp Queue pair
593 * @v dest Destination address vector, or NULL
594 * @v source Source address vector, or NULL
595 * @v iobuf I/O buffer
596 * @v rc Completion status code
597 */
598 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
599 struct ib_queue_pair *qp,
600 struct ib_address_vector *dest,
601 struct ib_address_vector *source,
602 struct io_buffer *iobuf, int rc ) {
603 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
604 struct net_device *netdev = ipoib->netdev;
605 struct ipoib_hdr *ipoib_hdr;
606 struct ethhdr *ethhdr;
607 struct ipoib_remac remac;
608 uint16_t net_proto;
609
610 /* Record errors */
611 if ( rc != 0 ) {
612 netdev_rx_err ( netdev, iobuf, rc );
613 return;
614 }
615
616 /* Sanity check */
617 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
618 DBGC ( ipoib, "IPoIB %p received packet too short to "
619 "contain IPoIB header\n", ipoib );
620 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
621 netdev_rx_err ( netdev, iobuf, -EIO );
622 return;
623 }
624 if ( ! source ) {
625 DBGC ( ipoib, "IPoIB %p received packet without address "
626 "vector\n", ipoib );
627 netdev_rx_err ( netdev, iobuf, -ENOTTY );
628 return;
629 }
630
631 /* Strip real IPoIB header */
632 ipoib_hdr = iobuf->data;
633 net_proto = ipoib_hdr->proto;
634 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
635
636 /* Construct source address from remote QPN and LID */
637 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
638 remac.lid = htons ( source->lid );
639
640 /* Translate packet if applicable */
641 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
642 net_proto ) ) != 0 ) {
643 netdev_rx_err ( netdev, iobuf, rc );
644 return;
645 }
646
647 /* Prepend eIPoIB header */
648 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
649 memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
650 ethhdr->h_protocol = net_proto;
651
652 /* Construct destination address */
653 if ( dest->gid_present && ( memcmp ( &dest->gid, &ipoib->broadcast.gid,
654 sizeof ( dest->gid ) ) == 0 ) ) {
655 /* Broadcast GID; use the Ethernet broadcast address */
656 memcpy ( &ethhdr->h_dest, eth_broadcast,
657 sizeof ( ethhdr->h_dest ) );
658 } else {
659 /* Assume destination address is local Ethernet MAC */
660 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
661 sizeof ( ethhdr->h_dest ) );
662 }
663
664 /* Hand off to network layer */
665 netdev_rx ( netdev, iobuf );
666 }
667
668 /** IPoIB completion operations */
669 static struct ib_completion_queue_operations ipoib_cq_op = {
670 .complete_send = ipoib_complete_send,
671 .complete_recv = ipoib_complete_recv,
672 };
673
674 /**
675 * Allocate IPoIB receive I/O buffer
676 *
677 * @v len Length of buffer
678 * @ret iobuf I/O buffer, or NULL
679 *
680 * Some Infiniband hardware requires 2kB alignment of receive buffers
681 * and provides no way to disable header separation. The result is
682 * that there are only four bytes of link-layer header (the real IPoIB
683 * header) before the payload. This is not sufficient space to insert
684 * an eIPoIB link-layer pseudo-header.
685 *
686 * We therefore allocate I/O buffers offset to start slightly before
687 * the natural alignment boundary, in order to allow sufficient space.
688 */
689 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
690 struct io_buffer *iobuf;
691 size_t reserve_len;
692
693 /* Calculate additional length required at start of buffer */
694 reserve_len = ( sizeof ( struct ethhdr ) -
695 sizeof ( struct ipoib_hdr ) );
696
697 /* Allocate buffer */
698 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
699 if ( iobuf ) {
700 iob_reserve ( iobuf, reserve_len );
701 }
702 return iobuf;
703 }
704
705 /** IPoIB queue pair operations */
706 static struct ib_queue_pair_operations ipoib_qp_op = {
707 .alloc_iob = ipoib_alloc_iob,
708 };
709
710 /**
711 * Poll IPoIB network device
712 *
713 * @v netdev Network device
714 */
715 static void ipoib_poll ( struct net_device *netdev ) {
716 struct ipoib_device *ipoib = netdev->priv;
717 struct ib_device *ibdev = ipoib->ibdev;
718
719 /* Poll Infiniband device */
720 ib_poll_eq ( ibdev );
721
722 /* Poll the retry timers (required for IPoIB multicast join) */
723 retry_poll();
724 }
725
726 /**
727 * Handle IPv4 broadcast multicast group join completion
728 *
729 * @v ibdev Infiniband device
730 * @v qp Queue pair
731 * @v membership Multicast group membership
732 * @v rc Status code
733 * @v mad Response MAD (or NULL on error)
734 */
735 void ipoib_join_complete ( struct ib_device *ibdev __unused,
736 struct ib_queue_pair *qp __unused,
737 struct ib_mc_membership *membership, int rc,
738 union ib_mad *mad __unused ) {
739 struct ipoib_device *ipoib =
740 container_of ( membership, struct ipoib_device, membership );
741
742 /* Record join status as link status */
743 netdev_link_err ( ipoib->netdev, rc );
744 }
745
746 /**
747 * Join IPv4 broadcast multicast group
748 *
749 * @v ipoib IPoIB device
750 * @ret rc Return status code
751 */
752 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
753 int rc;
754
755 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
756 &ipoib->membership, &ipoib->broadcast.gid,
757 ipoib_join_complete ) ) != 0 ) {
758 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
759 ipoib, strerror ( rc ) );
760 return rc;
761 }
762
763 return 0;
764 }
765
766 /**
767 * Leave IPv4 broadcast multicast group
768 *
769 * @v ipoib IPoIB device
770 */
771 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
772
773 ib_mcast_leave ( ipoib->ibdev, ipoib->qp, &ipoib->membership );
774 }
775
776 /**
777 * Handle link status change
778 *
779 * @v ipoib IPoIB device
780 */
781 static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
782 struct ib_device *ibdev = ipoib->ibdev;
783 struct net_device *netdev = ipoib->netdev;
784 int rc;
785
786 /* Leave existing broadcast group */
787 if ( ipoib->qp )
788 ipoib_leave_broadcast_group ( ipoib );
789
790 /* Update MAC address based on potentially-new GID prefix */
791 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
792 sizeof ( ipoib->mac.gid.s.prefix ) );
793
794 /* Update broadcast GID based on potentially-new partition key */
795 ipoib->broadcast.gid.words[2] =
796 htons ( ibdev->pkey | IB_PKEY_FULL );
797
798 /* Set net device link state to reflect Infiniband link state */
799 rc = ib_link_rc ( ibdev );
800 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
801
802 /* Join new broadcast group */
803 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
804 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
805 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
806 "%s\n", ipoib, strerror ( rc ) );
807 netdev_link_err ( netdev, rc );
808 return;
809 }
810 }
811
812 /**
813 * Open IPoIB network device
814 *
815 * @v netdev Network device
816 * @ret rc Return status code
817 */
818 static int ipoib_open ( struct net_device *netdev ) {
819 struct ipoib_device *ipoib = netdev->priv;
820 struct ib_device *ibdev = ipoib->ibdev;
821 int rc;
822
823 /* Open IB device */
824 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
825 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
826 ipoib, strerror ( rc ) );
827 goto err_ib_open;
828 }
829
830 /* Allocate completion queue */
831 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
832 if ( ! ipoib->cq ) {
833 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
834 ipoib );
835 rc = -ENOMEM;
836 goto err_create_cq;
837 }
838
839 /* Allocate queue pair */
840 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
841 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
842 &ipoib_qp_op );
843 if ( ! ipoib->qp ) {
844 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
845 ipoib );
846 rc = -ENOMEM;
847 goto err_create_qp;
848 }
849 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
850
851 /* Update MAC address with QPN */
852 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
853
854 /* Fill receive rings */
855 ib_refill_recv ( ibdev, ipoib->qp );
856
857 /* Fake a link status change to join the broadcast group */
858 ipoib_link_state_changed ( ipoib );
859
860 return 0;
861
862 ib_destroy_qp ( ibdev, ipoib->qp );
863 err_create_qp:
864 ib_destroy_cq ( ibdev, ipoib->cq );
865 err_create_cq:
866 ib_close ( ibdev );
867 err_ib_open:
868 return rc;
869 }
870
871 /**
872 * Close IPoIB network device
873 *
874 * @v netdev Network device
875 */
876 static void ipoib_close ( struct net_device *netdev ) {
877 struct ipoib_device *ipoib = netdev->priv;
878 struct ib_device *ibdev = ipoib->ibdev;
879
880 /* Flush REMAC cache */
881 ipoib_flush_remac ( ipoib );
882
883 /* Leave broadcast group */
884 ipoib_leave_broadcast_group ( ipoib );
885
886 /* Remove QPN from MAC address */
887 ipoib->mac.flags__qpn = 0;
888
889 /* Tear down the queues */
890 ib_destroy_qp ( ibdev, ipoib->qp );
891 ipoib->qp = NULL;
892 ib_destroy_cq ( ibdev, ipoib->cq );
893 ipoib->cq = NULL;
894
895 /* Close IB device */
896 ib_close ( ibdev );
897 }
898
899 /** IPoIB network device operations */
900 static struct net_device_operations ipoib_operations = {
901 .open = ipoib_open,
902 .close = ipoib_close,
903 .transmit = ipoib_transmit,
904 .poll = ipoib_poll,
905 };
906
907 /**
908 * Probe IPoIB device
909 *
910 * @v ibdev Infiniband device
911 * @ret rc Return status code
912 */
913 static int ipoib_probe ( struct ib_device *ibdev ) {
914 struct net_device *netdev;
915 struct ipoib_device *ipoib;
916 int rc;
917
918 /* Allocate network device */
919 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
920 if ( ! netdev )
921 return -ENOMEM;
922 netdev_init ( netdev, &ipoib_operations );
923 ipoib = netdev->priv;
924 netdev->dev = ibdev->dev;
925 memset ( ipoib, 0, sizeof ( *ipoib ) );
926 ipoib->netdev = netdev;
927 ipoib->ibdev = ibdev;
928 INIT_LIST_HEAD ( &ipoib->peers );
929
930 /* Extract hardware address */
931 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
932 sizeof ( ibdev->gid.s.guid ) );
933
934 /* Set local MAC address */
935 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
936 sizeof ( ipoib->mac.gid.s.guid ) );
937
938 /* Set default broadcast MAC address */
939 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
940 sizeof ( ipoib->broadcast ) );
941
942 /* Add to list of IPoIB devices */
943 list_add_tail ( &ipoib->list, &ipoib_devices );
944
945 /* Register network device */
946 if ( ( rc = register_netdev ( netdev ) ) != 0 )
947 goto err_register_netdev;
948
949 return 0;
950
951 unregister_netdev ( netdev );
952 err_register_netdev:
953 list_del ( &ipoib->list );
954 netdev_nullify ( netdev );
955 netdev_put ( netdev );
956 return rc;
957 }
958
959 /**
960 * Handle device or link status change
961 *
962 * @v ibdev Infiniband device
963 */
964 static void ipoib_notify ( struct ib_device *ibdev ) {
965 struct ipoib_device *ipoib;
966
967 /* Handle link status change for any attached IPoIB devices */
968 list_for_each_entry ( ipoib, &ipoib_devices, list ) {
969 if ( ipoib->ibdev != ibdev )
970 continue;
971 ipoib_link_state_changed ( ipoib );
972 }
973 }
974
975 /**
976 * Remove IPoIB device
977 *
978 * @v ibdev Infiniband device
979 */
980 static void ipoib_remove ( struct ib_device *ibdev ) {
981 struct ipoib_device *ipoib;
982 struct ipoib_device *tmp;
983 struct net_device *netdev;
984
985 /* Remove any attached IPoIB devices */
986 list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
987 if ( ipoib->ibdev != ibdev )
988 continue;
989 netdev = ipoib->netdev;
990 unregister_netdev ( netdev );
991 list_del ( &ipoib->list );
992 netdev_nullify ( netdev );
993 netdev_put ( netdev );
994 }
995 }
996
997 /** IPoIB driver */
998 struct ib_driver ipoib_driver __ib_driver = {
999 .name = "IPoIB",
1000 .probe = ipoib_probe,
1001 .notify = ipoib_notify,
1002 .remove = ipoib_remove,
1003 };