[infiniband] Parse MLID, rate, and SL from multicast membership record
[ipxe.git] / src / drivers / net / ipoib.c
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * You can also choose to distribute this program under the terms of
20 * the Unmodified Binary Distribution Licence (as given in the file
21 * COPYING.UBDL), provided that you have satisfied its requirements.
22 */
23
24 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <string.h>
31 #include <byteswap.h>
32 #include <errno.h>
33 #include <ipxe/errortab.h>
34 #include <ipxe/malloc.h>
35 #include <ipxe/if_arp.h>
36 #include <ipxe/arp.h>
37 #include <ipxe/if_ether.h>
38 #include <ipxe/ethernet.h>
39 #include <ipxe/ip.h>
40 #include <ipxe/iobuf.h>
41 #include <ipxe/netdevice.h>
42 #include <ipxe/infiniband.h>
43 #include <ipxe/ib_pathrec.h>
44 #include <ipxe/ib_mcast.h>
45 #include <ipxe/retry.h>
46 #include <ipxe/ipoib.h>
47
48 /** @file
49 *
50 * IP over Infiniband
51 */
52
/*
 * Distinct variants of ENXIO, one per cause of a missing REMAC, so
 * that the cause can be identified from the error code alone.
 */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY							\
	__einfo_error ( EINFO_ENXIO_ARP_REPLY )

#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4							\
	__einfo_error ( EINFO_ENXIO_NON_IPV4 )

#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT							\
	__einfo_error ( EINFO_ENXIO_ARP_SENT )
66
/** Send work queue entries requested for the IPoIB queue pair */
#define IPOIB_NUM_SEND_WQES	2

/** Receive work queue entries requested for the IPoIB queue pair */
#define IPOIB_NUM_RECV_WQES	4

/** Entries requested for the IPoIB completion queue */
#define IPOIB_NUM_CQES		8
75
76 /** An IPoIB broadcast address */
77 struct ipoib_broadcast {
78 /** MAC address */
79 struct ipoib_mac mac;
80 /** Address vector */
81 struct ib_address_vector av;
82 /** Multicast group membership */
83 struct ib_mc_membership membership;
84 };
85
86 /** An IPoIB device */
87 struct ipoib_device {
88 /** Network device */
89 struct net_device *netdev;
90 /** Underlying Infiniband device */
91 struct ib_device *ibdev;
92 /** List of IPoIB devices */
93 struct list_head list;
94 /** Completion queue */
95 struct ib_completion_queue *cq;
96 /** Queue pair */
97 struct ib_queue_pair *qp;
98 /** Local MAC */
99 struct ipoib_mac mac;
100 /** Broadcast address */
101 struct ipoib_broadcast broadcast;
102 /** REMAC cache */
103 struct list_head peers;
104 };
105
106 /** Broadcast IPoIB address */
107 static struct ipoib_mac ipoib_broadcast = {
108 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
109 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
110 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
111 };
112
113 /** Link status for "broadcast join in progress" */
114 #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
115 #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
116 ( EINFO_EINPROGRESS, 0x01, "Joining" )
117
118 /** Human-readable message for the link status */
119 struct errortab ipoib_errors[] __errortab = {
120 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
121 };
122
123 /** List of all IPoIB devices */
124 static LIST_HEAD ( ipoib_devices );
125
126 static struct net_device_operations ipoib_operations;
127
128 /****************************************************************************
129 *
130 * IPoIB REMAC cache
131 *
132 ****************************************************************************
133 */
134
135 /** An IPoIB REMAC cache entry */
136 struct ipoib_peer {
137 /** List of REMAC cache entries */
138 struct list_head list;
139 /** Remote Ethermet MAC */
140 struct ipoib_remac remac;
141 /** MAC address */
142 struct ipoib_mac mac;
143 };
144
145 /**
146 * Find IPoIB MAC from REMAC
147 *
148 * @v ipoib IPoIB device
149 * @v remac Remote Ethernet MAC
150 * @ret mac IPoIB MAC (or NULL if not found)
151 */
152 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
153 const struct ipoib_remac *remac ) {
154 struct ipoib_peer *peer;
155
156 /* Check for broadcast or multicast REMAC. We transmit
157 * multicasts as broadcasts for simplicity.
158 */
159 if ( is_multicast_ether_addr ( remac ) )
160 return &ipoib->broadcast.mac;
161
162 /* Try to find via REMAC cache */
163 list_for_each_entry ( peer, &ipoib->peers, list ) {
164 if ( memcmp ( remac, &peer->remac,
165 sizeof ( peer->remac ) ) == 0 ) {
166 /* Move peer to start of list */
167 list_del ( &peer->list );
168 list_add ( &peer->list, &ipoib->peers );
169 return &peer->mac;
170 }
171 }
172
173 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
174 ipoib, eth_ntoa ( remac ) );
175 return NULL;
176 }
177
178 /**
179 * Add IPoIB MAC to REMAC cache
180 *
181 * @v ipoib IPoIB device
182 * @v remac Remote Ethernet MAC
183 * @v mac IPoIB MAC
184 * @ret rc Return status code
185 */
186 static int ipoib_map_remac ( struct ipoib_device *ipoib,
187 const struct ipoib_remac *remac,
188 const struct ipoib_mac *mac ) {
189 struct ipoib_peer *peer;
190
191 /* Check for existing entry in REMAC cache */
192 list_for_each_entry ( peer, &ipoib->peers, list ) {
193 if ( memcmp ( remac, &peer->remac,
194 sizeof ( peer->remac ) ) == 0 ) {
195 /* Move peer to start of list */
196 list_del ( &peer->list );
197 list_add ( &peer->list, &ipoib->peers );
198 /* Update MAC */
199 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
200 return 0;
201 }
202 }
203
204 /* Create new entry */
205 peer = malloc ( sizeof ( *peer ) );
206 if ( ! peer )
207 return -ENOMEM;
208 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
209 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
210 list_add ( &peer->list, &ipoib->peers );
211
212 return 0;
213 }
214
215 /**
216 * Flush REMAC cache
217 *
218 * @v ipoib IPoIB device
219 */
220 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
221 struct ipoib_peer *peer;
222 struct ipoib_peer *tmp;
223
224 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
225 list_del ( &peer->list );
226 free ( peer );
227 }
228 }
229
230 /**
231 * Discard some entries from the REMAC cache
232 *
233 * @ret discarded Number of cached items discarded
234 */
235 static unsigned int ipoib_discard_remac ( void ) {
236 struct net_device *netdev;
237 struct ipoib_device *ipoib;
238 struct ipoib_peer *peer;
239 unsigned int discarded = 0;
240
241 /* Try to discard one cache entry for each IPoIB device */
242 for_each_netdev ( netdev ) {
243
244 /* Skip non-IPoIB devices */
245 if ( netdev->op != &ipoib_operations )
246 continue;
247 ipoib = netdev->priv;
248
249 /* Discard least recently used cache entry (if any) */
250 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
251 list_del ( &peer->list );
252 free ( peer );
253 discarded++;
254 break;
255 }
256 }
257
258 return discarded;
259 }
260
261 /** IPoIB cache discarder */
262 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
263 .discard = ipoib_discard_remac,
264 };
265
266 /****************************************************************************
267 *
268 * IPoIB link layer
269 *
270 ****************************************************************************
271 */
272
273 /**
274 * Initialise IPoIB link-layer address
275 *
276 * @v hw_addr Hardware address
277 * @v ll_addr Link-layer address
278 */
279 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
280 const uint8_t *guid = hw_addr;
281 uint8_t *eth_addr = ll_addr;
282 uint8_t guid_mask = IPOIB_GUID_MASK;
283 unsigned int i;
284
285 /* Extract bytes from GUID according to mask */
286 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
287 if ( guid_mask & 0x80 )
288 *(eth_addr++) = *guid;
289 }
290 }
291
292 /** IPoIB protocol */
293 struct ll_protocol ipoib_protocol __ll_protocol = {
294 .name = "IPoIB",
295 .ll_proto = htons ( ARPHRD_ETHER ),
296 .hw_addr_len = sizeof ( union ib_guid ),
297 .ll_addr_len = ETH_ALEN,
298 .ll_header_len = ETH_HLEN,
299 .push = eth_push,
300 .pull = eth_pull,
301 .init_addr = ipoib_init_addr,
302 .ntoa = eth_ntoa,
303 .mc_hash = eth_mc_hash,
304 .eth_addr = eth_eth_addr,
305 .eui64 = eth_eui64,
306 .flags = LL_NAME_ONLY,
307 };
308
309 /**
310 * Allocate IPoIB device
311 *
312 * @v priv_size Size of driver private data
313 * @ret netdev Network device, or NULL
314 */
315 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
316 struct net_device *netdev;
317
318 netdev = alloc_netdev ( priv_size );
319 if ( netdev ) {
320 netdev->ll_protocol = &ipoib_protocol;
321 netdev->ll_broadcast = eth_broadcast;
322 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
323 }
324 return netdev;
325 }
326
327 /****************************************************************************
328 *
329 * IPoIB translation layer
330 *
331 ****************************************************************************
332 */
333
334 /**
335 * Translate transmitted ARP packet
336 *
337 * @v netdev Network device
338 * @v iobuf Packet to be transmitted (with no link-layer headers)
339 * @ret rc Return status code
340 */
341 static int ipoib_translate_tx_arp ( struct net_device *netdev,
342 struct io_buffer *iobuf ) {
343 struct ipoib_device *ipoib = netdev->priv;
344 struct arphdr *arphdr = iobuf->data;
345 struct ipoib_mac *target_ha = NULL;
346 void *sender_pa;
347 void *target_pa;
348
349 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
350 if ( arphdr->ar_hln != ETH_ALEN )
351 return 0;
352
353 /* Fail unless we have room to expand packet */
354 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
355 ETH_ALEN ) ) ) {
356 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
357 ipoib );
358 return -ENOBUFS;
359 }
360
361 /* Look up REMAC, if applicable */
362 if ( arphdr->ar_op == ARPOP_REPLY ) {
363 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
364 if ( ! target_ha ) {
365 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
366 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
367 return -ENXIO_ARP_REPLY;
368 }
369 }
370
371 /* Construct new packet */
372 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
373 sender_pa = arp_sender_pa ( arphdr );
374 target_pa = arp_target_pa ( arphdr );
375 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
376 arphdr->ar_hln = sizeof ( ipoib->mac );
377 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
378 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
379 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
380 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
381 if ( target_ha ) {
382 memcpy ( arp_target_ha ( arphdr ), target_ha,
383 sizeof ( *target_ha ) );
384 }
385
386 return 0;
387 }
388
389 /**
390 * Translate transmitted packet
391 *
392 * @v netdev Network device
393 * @v iobuf Packet to be transmitted (with no link-layer headers)
394 * @v net_proto Network-layer protocol (in network byte order)
395 * @ret rc Return status code
396 */
397 static int ipoib_translate_tx ( struct net_device *netdev,
398 struct io_buffer *iobuf, uint16_t net_proto ) {
399
400 switch ( net_proto ) {
401 case htons ( ETH_P_ARP ) :
402 return ipoib_translate_tx_arp ( netdev, iobuf );
403 case htons ( ETH_P_IP ) :
404 /* No translation needed */
405 return 0;
406 default:
407 /* Cannot handle other traffic via eIPoIB */
408 return -ENOTSUP;
409 }
410 }
411
412 /**
413 * Translate received ARP packet
414 *
415 * @v netdev Network device
416 * @v iobuf Received packet (with no link-layer headers)
417 * @v remac Constructed Remote Ethernet MAC
418 * @ret rc Return status code
419 */
420 static int ipoib_translate_rx_arp ( struct net_device *netdev,
421 struct io_buffer *iobuf,
422 struct ipoib_remac *remac ) {
423 struct ipoib_device *ipoib = netdev->priv;
424 struct arphdr *arphdr = iobuf->data;
425 void *sender_pa;
426 void *target_pa;
427 int rc;
428
429 /* Do nothing unless ARP contains IPoIB link-layer addresses */
430 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
431 return 0;
432
433 /* Create REMAC cache entry */
434 if ( ( rc = ipoib_map_remac ( ipoib, remac,
435 arp_sender_ha ( arphdr ) ) ) != 0 ) {
436 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
437 ipoib, strerror ( rc ) );
438 return rc;
439 }
440
441 /* Construct new packet */
442 sender_pa = arp_sender_pa ( arphdr );
443 target_pa = arp_target_pa ( arphdr );
444 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
445 arphdr->ar_hln = ETH_ALEN;
446 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
447 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
448 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
449 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
450 if ( arphdr->ar_op == ARPOP_REPLY ) {
451 /* Assume received replies were directed to us */
452 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
453 }
454 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
455
456 return 0;
457 }
458
459 /**
460 * Translate received packet
461 *
462 * @v netdev Network device
463 * @v iobuf Received packet (with no link-layer headers)
464 * @v remac Constructed Remote Ethernet MAC
465 * @v net_proto Network-layer protocol (in network byte order)
466 * @ret rc Return status code
467 */
468 static int ipoib_translate_rx ( struct net_device *netdev,
469 struct io_buffer *iobuf,
470 struct ipoib_remac *remac,
471 uint16_t net_proto ) {
472
473 switch ( net_proto ) {
474 case htons ( ETH_P_ARP ) :
475 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
476 case htons ( ETH_P_IP ) :
477 /* No translation needed */
478 return 0;
479 default:
480 /* Cannot handle other traffic via eIPoIB */
481 return -ENOTSUP;
482 }
483 }
484
485 /****************************************************************************
486 *
487 * IPoIB network device
488 *
489 ****************************************************************************
490 */
491
492 /**
493 * Transmit packet via IPoIB network device
494 *
495 * @v netdev Network device
496 * @v iobuf I/O buffer
497 * @ret rc Return status code
498 */
499 static int ipoib_transmit ( struct net_device *netdev,
500 struct io_buffer *iobuf ) {
501 struct ipoib_device *ipoib = netdev->priv;
502 struct ib_device *ibdev = ipoib->ibdev;
503 struct ethhdr *ethhdr;
504 struct iphdr *iphdr;
505 struct ipoib_hdr *ipoib_hdr;
506 struct ipoib_mac *mac;
507 struct ib_address_vector dest;
508 uint16_t net_proto;
509 int rc;
510
511 /* Sanity check */
512 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
513 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
514 return -EINVAL;
515 }
516
517 /* Attempting transmission while link is down will put the
518 * queue pair into an error state, so don't try it.
519 */
520 if ( ! ib_link_ok ( ibdev ) )
521 return -ENETUNREACH;
522
523 /* Strip eIPoIB header */
524 ethhdr = iobuf->data;
525 net_proto = ethhdr->h_protocol;
526 iob_pull ( iobuf, sizeof ( *ethhdr ) );
527
528 /* Identify destination address */
529 mac = ipoib_find_remac ( ipoib, ( ( void * ) ethhdr->h_dest ) );
530 if ( ! mac ) {
531 /* Generate a new ARP request (if possible) to trigger
532 * population of the REMAC cache entry.
533 */
534 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
535 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
536 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
537 "packet type %04x\n", ipoib,
538 eth_ntoa ( ethhdr->h_dest ),
539 ntohs ( net_proto ) );
540 return -ENXIO_NON_IPV4;
541 }
542 iphdr = iobuf->data;
543 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
544 &iphdr->dest, &iphdr->src ) ) !=0){
545 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
546 ipoib, eth_ntoa ( ethhdr->h_dest ),
547 inet_ntoa ( iphdr->dest ) );
548 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
549 strerror ( rc ) );
550 return rc;
551 }
552 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
553 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
554 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
555 return -ENXIO_ARP_SENT;
556 }
557
558 /* Translate packet if applicable */
559 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
560 return rc;
561
562 /* Prepend real IPoIB header */
563 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
564 ipoib_hdr->proto = net_proto;
565 ipoib_hdr->reserved = 0;
566
567 /* Construct address vector */
568 memset ( &dest, 0, sizeof ( dest ) );
569 dest.qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
570 dest.qkey = ipoib->broadcast.av.qkey;
571 dest.gid_present = 1;
572 memcpy ( &dest.gid, &mac->gid, sizeof ( dest.gid ) );
573 if ( ( rc = ib_resolve_path ( ibdev, &dest ) ) != 0 ) {
574 /* Path not resolved yet */
575 return rc;
576 }
577
578 return ib_post_send ( ibdev, ipoib->qp, &dest, iobuf );
579 }
580
581 /**
582 * Handle IPoIB send completion
583 *
584 * @v ibdev Infiniband device
585 * @v qp Queue pair
586 * @v iobuf I/O buffer
587 * @v rc Completion status code
588 */
589 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
590 struct ib_queue_pair *qp,
591 struct io_buffer *iobuf, int rc ) {
592 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
593
594 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
595 }
596
597 /**
598 * Handle IPoIB receive completion
599 *
600 * @v ibdev Infiniband device
601 * @v qp Queue pair
602 * @v dest Destination address vector, or NULL
603 * @v source Source address vector, or NULL
604 * @v iobuf I/O buffer
605 * @v rc Completion status code
606 */
607 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
608 struct ib_queue_pair *qp,
609 struct ib_address_vector *dest,
610 struct ib_address_vector *source,
611 struct io_buffer *iobuf, int rc ) {
612 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
613 struct net_device *netdev = ipoib->netdev;
614 struct ipoib_hdr *ipoib_hdr;
615 struct ethhdr *ethhdr;
616 struct ipoib_remac remac;
617 uint16_t net_proto;
618
619 /* Record errors */
620 if ( rc != 0 ) {
621 netdev_rx_err ( netdev, iobuf, rc );
622 return;
623 }
624
625 /* Sanity check */
626 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
627 DBGC ( ipoib, "IPoIB %p received packet too short to "
628 "contain IPoIB header\n", ipoib );
629 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
630 netdev_rx_err ( netdev, iobuf, -EIO );
631 return;
632 }
633 if ( ! source ) {
634 DBGC ( ipoib, "IPoIB %p received packet without address "
635 "vector\n", ipoib );
636 netdev_rx_err ( netdev, iobuf, -ENOTTY );
637 return;
638 }
639
640 /* Strip real IPoIB header */
641 ipoib_hdr = iobuf->data;
642 net_proto = ipoib_hdr->proto;
643 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
644
645 /* Construct source address from remote QPN and LID */
646 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
647 remac.lid = htons ( source->lid );
648
649 /* Translate packet if applicable */
650 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
651 net_proto ) ) != 0 ) {
652 netdev_rx_err ( netdev, iobuf, rc );
653 return;
654 }
655
656 /* Prepend eIPoIB header */
657 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
658 memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
659 ethhdr->h_protocol = net_proto;
660
661 /* Construct destination address */
662 if ( dest->gid_present &&
663 ( memcmp ( &dest->gid, &ipoib->broadcast.mac.gid,
664 sizeof ( dest->gid ) ) == 0 ) ) {
665 /* Broadcast GID; use the Ethernet broadcast address */
666 memcpy ( &ethhdr->h_dest, eth_broadcast,
667 sizeof ( ethhdr->h_dest ) );
668 } else {
669 /* Assume destination address is local Ethernet MAC */
670 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
671 sizeof ( ethhdr->h_dest ) );
672 }
673
674 /* Hand off to network layer */
675 netdev_rx ( netdev, iobuf );
676 }
677
678 /** IPoIB completion operations */
679 static struct ib_completion_queue_operations ipoib_cq_op = {
680 .complete_send = ipoib_complete_send,
681 .complete_recv = ipoib_complete_recv,
682 };
683
684 /**
685 * Allocate IPoIB receive I/O buffer
686 *
687 * @v len Length of buffer
688 * @ret iobuf I/O buffer, or NULL
689 *
690 * Some Infiniband hardware requires 2kB alignment of receive buffers
691 * and provides no way to disable header separation. The result is
692 * that there are only four bytes of link-layer header (the real IPoIB
693 * header) before the payload. This is not sufficient space to insert
694 * an eIPoIB link-layer pseudo-header.
695 *
696 * We therefore allocate I/O buffers offset to start slightly before
697 * the natural alignment boundary, in order to allow sufficient space.
698 */
699 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
700 struct io_buffer *iobuf;
701 size_t reserve_len;
702
703 /* Calculate additional length required at start of buffer */
704 reserve_len = ( sizeof ( struct ethhdr ) -
705 sizeof ( struct ipoib_hdr ) );
706
707 /* Allocate buffer */
708 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
709 if ( iobuf ) {
710 iob_reserve ( iobuf, reserve_len );
711 }
712 return iobuf;
713 }
714
715 /** IPoIB queue pair operations */
716 static struct ib_queue_pair_operations ipoib_qp_op = {
717 .alloc_iob = ipoib_alloc_iob,
718 };
719
720 /**
721 * Poll IPoIB network device
722 *
723 * @v netdev Network device
724 */
725 static void ipoib_poll ( struct net_device *netdev ) {
726 struct ipoib_device *ipoib = netdev->priv;
727 struct ib_device *ibdev = ipoib->ibdev;
728
729 /* Poll Infiniband device */
730 ib_poll_eq ( ibdev );
731
732 /* Poll the retry timers (required for IPoIB multicast join) */
733 retry_poll();
734 }
735
736 /**
737 * Handle IPv4 broadcast multicast group join completion
738 *
739 * @v membership Multicast group membership
740 * @v rc Status code
741 */
742 void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
743 struct ipoib_device *ipoib = container_of ( membership,
744 struct ipoib_device,
745 broadcast.membership );
746
747 /* Record join status as link status */
748 netdev_link_err ( ipoib->netdev, rc );
749 }
750
751 /**
752 * Join IPv4 broadcast multicast group
753 *
754 * @v ipoib IPoIB device
755 * @ret rc Return status code
756 */
757 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
758 int rc;
759
760 /* Join multicast group */
761 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
762 &ipoib->broadcast.membership,
763 &ipoib->broadcast.av,
764 ipoib_join_complete ) ) != 0 ) {
765 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
766 ipoib, strerror ( rc ) );
767 return rc;
768 }
769
770 return 0;
771 }
772
773 /**
774 * Leave IPv4 broadcast multicast group
775 *
776 * @v ipoib IPoIB device
777 */
778 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
779
780 /* Leave multicast group */
781 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
782 &ipoib->broadcast.membership );
783 }
784
785 /**
786 * Handle link status change
787 *
788 * @v ipoib IPoIB device
789 */
790 static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
791 struct ib_device *ibdev = ipoib->ibdev;
792 struct net_device *netdev = ipoib->netdev;
793 int rc;
794
795 /* Leave existing broadcast group */
796 if ( ipoib->qp )
797 ipoib_leave_broadcast_group ( ipoib );
798
799 /* Update MAC address based on potentially-new GID prefix */
800 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
801 sizeof ( ipoib->mac.gid.s.prefix ) );
802
803 /* Update broadcast MAC GID based on potentially-new partition key */
804 ipoib->broadcast.mac.gid.words[2] =
805 htons ( ibdev->pkey | IB_PKEY_FULL );
806
807 /* Construct broadcast address vector from broadcast MAC address */
808 memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
809 ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
810 ipoib->broadcast.av.gid_present = 1;
811 memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
812 sizeof ( ipoib->broadcast.av.gid ) );
813
814 /* Set net device link state to reflect Infiniband link state */
815 rc = ib_link_rc ( ibdev );
816 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
817
818 /* Join new broadcast group */
819 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
820 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
821 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
822 "%s\n", ipoib, strerror ( rc ) );
823 netdev_link_err ( netdev, rc );
824 return;
825 }
826 }
827
828 /**
829 * Open IPoIB network device
830 *
831 * @v netdev Network device
832 * @ret rc Return status code
833 */
834 static int ipoib_open ( struct net_device *netdev ) {
835 struct ipoib_device *ipoib = netdev->priv;
836 struct ib_device *ibdev = ipoib->ibdev;
837 int rc;
838
839 /* Open IB device */
840 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
841 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
842 ipoib, strerror ( rc ) );
843 goto err_ib_open;
844 }
845
846 /* Allocate completion queue */
847 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
848 if ( ! ipoib->cq ) {
849 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
850 ipoib );
851 rc = -ENOMEM;
852 goto err_create_cq;
853 }
854
855 /* Allocate queue pair */
856 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
857 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
858 &ipoib_qp_op );
859 if ( ! ipoib->qp ) {
860 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
861 ipoib );
862 rc = -ENOMEM;
863 goto err_create_qp;
864 }
865 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
866
867 /* Update MAC address with QPN */
868 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
869
870 /* Fill receive rings */
871 ib_refill_recv ( ibdev, ipoib->qp );
872
873 /* Fake a link status change to join the broadcast group */
874 ipoib_link_state_changed ( ipoib );
875
876 return 0;
877
878 ib_destroy_qp ( ibdev, ipoib->qp );
879 err_create_qp:
880 ib_destroy_cq ( ibdev, ipoib->cq );
881 err_create_cq:
882 ib_close ( ibdev );
883 err_ib_open:
884 return rc;
885 }
886
887 /**
888 * Close IPoIB network device
889 *
890 * @v netdev Network device
891 */
892 static void ipoib_close ( struct net_device *netdev ) {
893 struct ipoib_device *ipoib = netdev->priv;
894 struct ib_device *ibdev = ipoib->ibdev;
895
896 /* Flush REMAC cache */
897 ipoib_flush_remac ( ipoib );
898
899 /* Leave broadcast group */
900 ipoib_leave_broadcast_group ( ipoib );
901
902 /* Remove QPN from MAC address */
903 ipoib->mac.flags__qpn = 0;
904
905 /* Tear down the queues */
906 ib_destroy_qp ( ibdev, ipoib->qp );
907 ipoib->qp = NULL;
908 ib_destroy_cq ( ibdev, ipoib->cq );
909 ipoib->cq = NULL;
910
911 /* Close IB device */
912 ib_close ( ibdev );
913 }
914
915 /** IPoIB network device operations */
916 static struct net_device_operations ipoib_operations = {
917 .open = ipoib_open,
918 .close = ipoib_close,
919 .transmit = ipoib_transmit,
920 .poll = ipoib_poll,
921 };
922
923 /**
924 * Probe IPoIB device
925 *
926 * @v ibdev Infiniband device
927 * @ret rc Return status code
928 */
929 static int ipoib_probe ( struct ib_device *ibdev ) {
930 struct net_device *netdev;
931 struct ipoib_device *ipoib;
932 int rc;
933
934 /* Allocate network device */
935 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
936 if ( ! netdev )
937 return -ENOMEM;
938 netdev_init ( netdev, &ipoib_operations );
939 ipoib = netdev->priv;
940 netdev->dev = ibdev->dev;
941 memset ( ipoib, 0, sizeof ( *ipoib ) );
942 ipoib->netdev = netdev;
943 ipoib->ibdev = ibdev;
944 INIT_LIST_HEAD ( &ipoib->peers );
945
946 /* Extract hardware address */
947 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
948 sizeof ( ibdev->gid.s.guid ) );
949
950 /* Set local MAC address */
951 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
952 sizeof ( ipoib->mac.gid.s.guid ) );
953
954 /* Set default broadcast MAC address */
955 memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
956 sizeof ( ipoib->broadcast.mac ) );
957
958 /* Add to list of IPoIB devices */
959 list_add_tail ( &ipoib->list, &ipoib_devices );
960
961 /* Register network device */
962 if ( ( rc = register_netdev ( netdev ) ) != 0 )
963 goto err_register_netdev;
964
965 return 0;
966
967 unregister_netdev ( netdev );
968 err_register_netdev:
969 list_del ( &ipoib->list );
970 netdev_nullify ( netdev );
971 netdev_put ( netdev );
972 return rc;
973 }
974
975 /**
976 * Handle device or link status change
977 *
978 * @v ibdev Infiniband device
979 */
980 static void ipoib_notify ( struct ib_device *ibdev ) {
981 struct ipoib_device *ipoib;
982
983 /* Handle link status change for any attached IPoIB devices */
984 list_for_each_entry ( ipoib, &ipoib_devices, list ) {
985 if ( ipoib->ibdev != ibdev )
986 continue;
987 ipoib_link_state_changed ( ipoib );
988 }
989 }
990
991 /**
992 * Remove IPoIB device
993 *
994 * @v ibdev Infiniband device
995 */
996 static void ipoib_remove ( struct ib_device *ibdev ) {
997 struct ipoib_device *ipoib;
998 struct ipoib_device *tmp;
999 struct net_device *netdev;
1000
1001 /* Remove any attached IPoIB devices */
1002 list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
1003 if ( ipoib->ibdev != ibdev )
1004 continue;
1005 netdev = ipoib->netdev;
1006 unregister_netdev ( netdev );
1007 list_del ( &ipoib->list );
1008 netdev_nullify ( netdev );
1009 netdev_put ( netdev );
1010 }
1011 }
1012
1013 /** IPoIB driver */
1014 struct ib_driver ipoib_driver __ib_driver = {
1015 .name = "IPoIB",
1016 .probe = ipoib_probe,
1017 .notify = ipoib_notify,
1018 .remove = ipoib_remove,
1019 };