[ipoib] Avoid unnecessary path record lookup for broadcast address
[ipxe.git] / src / drivers / net / ipoib.c
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * You can also choose to distribute this program under the terms of
20 * the Unmodified Binary Distribution Licence (as given in the file
21 * COPYING.UBDL), provided that you have satisfied its requirements.
22 */
23
24 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <string.h>
31 #include <byteswap.h>
32 #include <errno.h>
33 #include <ipxe/errortab.h>
34 #include <ipxe/malloc.h>
35 #include <ipxe/if_arp.h>
36 #include <ipxe/arp.h>
37 #include <ipxe/if_ether.h>
38 #include <ipxe/ethernet.h>
39 #include <ipxe/ip.h>
40 #include <ipxe/iobuf.h>
41 #include <ipxe/netdevice.h>
42 #include <ipxe/infiniband.h>
43 #include <ipxe/ib_pathrec.h>
44 #include <ipxe/ib_mcast.h>
45 #include <ipxe/retry.h>
46 #include <ipxe/ipoib.h>
47
48 /** @file
49 *
50 * IP over Infiniband
51 */
52
/*
 * Disambiguated ENXIO variants
 *
 * Each EINFO_ENXIO_* descriptor gives ENXIO a distinct sub-code and
 * message; the matching ENXIO_* macro is the error value built from
 * that descriptor.
 */
#define EINFO_ENXIO_ARP_REPLY						\
	__einfo_uniqify ( EINFO_ENXIO, 0x01,				\
			  "Missing REMAC for ARP reply target address" )
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )

#define EINFO_ENXIO_NON_IPV4						\
	__einfo_uniqify ( EINFO_ENXIO, 0x02,				\
			  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )

#define EINFO_ENXIO_ARP_SENT						\
	__einfo_uniqify ( EINFO_ENXIO, 0x03,				\
			  "Missing REMAC for IPv4 packet (ARP sent)" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
66
/** Depth of the IPoIB send work queue */
#define IPOIB_NUM_SEND_WQES 2

/** Depth of the IPoIB receive work queue */
#define IPOIB_NUM_RECV_WQES 4

/** Number of entries in the IPoIB completion queue */
#define IPOIB_NUM_CQES 8
75
76 /** An IPoIB broadcast address */
77 struct ipoib_broadcast {
78 /** MAC address */
79 struct ipoib_mac mac;
80 /** Address vector */
81 struct ib_address_vector av;
82 /** Multicast group membership */
83 struct ib_mc_membership membership;
84 };
85
86 /** An IPoIB device */
87 struct ipoib_device {
88 /** Network device */
89 struct net_device *netdev;
90 /** Underlying Infiniband device */
91 struct ib_device *ibdev;
92 /** List of IPoIB devices */
93 struct list_head list;
94 /** Completion queue */
95 struct ib_completion_queue *cq;
96 /** Queue pair */
97 struct ib_queue_pair *qp;
98 /** Local MAC */
99 struct ipoib_mac mac;
100 /** Broadcast address */
101 struct ipoib_broadcast broadcast;
102 /** REMAC cache */
103 struct list_head peers;
104 };
105
106 /** Broadcast IPoIB address */
107 static struct ipoib_mac ipoib_broadcast = {
108 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
109 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
110 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
111 };
112
/** Link status reported while the broadcast group join is in progress */
#define EINFO_EINPROGRESS_JOINING					\
	__einfo_uniqify ( EINFO_EINPROGRESS, 0x01, "Joining" )
#define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
117
118 /** Human-readable message for the link status */
119 struct errortab ipoib_errors[] __errortab = {
120 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
121 };
122
123 /** List of all IPoIB devices */
124 static LIST_HEAD ( ipoib_devices );
125
126 static struct net_device_operations ipoib_operations;
127
128 /****************************************************************************
129 *
130 * IPoIB REMAC cache
131 *
132 ****************************************************************************
133 */
134
135 /** An IPoIB REMAC cache entry */
136 struct ipoib_peer {
137 /** List of REMAC cache entries */
138 struct list_head list;
139 /** Remote Ethermet MAC */
140 struct ipoib_remac remac;
141 /** MAC address */
142 struct ipoib_mac mac;
143 };
144
145 /**
146 * Find IPoIB MAC from REMAC
147 *
148 * @v ipoib IPoIB device
149 * @v remac Remote Ethernet MAC
150 * @ret mac IPoIB MAC (or NULL if not found)
151 */
152 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
153 const struct ipoib_remac *remac ) {
154 struct ipoib_peer *peer;
155
156 /* Check for broadcast or multicast REMAC. We transmit
157 * multicasts as broadcasts for simplicity.
158 */
159 if ( is_multicast_ether_addr ( remac ) )
160 return &ipoib->broadcast.mac;
161
162 /* Try to find via REMAC cache */
163 list_for_each_entry ( peer, &ipoib->peers, list ) {
164 if ( memcmp ( remac, &peer->remac,
165 sizeof ( peer->remac ) ) == 0 ) {
166 /* Move peer to start of list */
167 list_del ( &peer->list );
168 list_add ( &peer->list, &ipoib->peers );
169 return &peer->mac;
170 }
171 }
172
173 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
174 ipoib, eth_ntoa ( remac ) );
175 return NULL;
176 }
177
178 /**
179 * Add IPoIB MAC to REMAC cache
180 *
181 * @v ipoib IPoIB device
182 * @v remac Remote Ethernet MAC
183 * @v mac IPoIB MAC
184 * @ret rc Return status code
185 */
186 static int ipoib_map_remac ( struct ipoib_device *ipoib,
187 const struct ipoib_remac *remac,
188 const struct ipoib_mac *mac ) {
189 struct ipoib_peer *peer;
190
191 /* Check for existing entry in REMAC cache */
192 list_for_each_entry ( peer, &ipoib->peers, list ) {
193 if ( memcmp ( remac, &peer->remac,
194 sizeof ( peer->remac ) ) == 0 ) {
195 /* Move peer to start of list */
196 list_del ( &peer->list );
197 list_add ( &peer->list, &ipoib->peers );
198 /* Update MAC */
199 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
200 return 0;
201 }
202 }
203
204 /* Create new entry */
205 peer = malloc ( sizeof ( *peer ) );
206 if ( ! peer )
207 return -ENOMEM;
208 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
209 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
210 list_add ( &peer->list, &ipoib->peers );
211
212 return 0;
213 }
214
215 /**
216 * Flush REMAC cache
217 *
218 * @v ipoib IPoIB device
219 */
220 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
221 struct ipoib_peer *peer;
222 struct ipoib_peer *tmp;
223
224 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
225 list_del ( &peer->list );
226 free ( peer );
227 }
228 }
229
230 /**
231 * Discard some entries from the REMAC cache
232 *
233 * @ret discarded Number of cached items discarded
234 */
235 static unsigned int ipoib_discard_remac ( void ) {
236 struct net_device *netdev;
237 struct ipoib_device *ipoib;
238 struct ipoib_peer *peer;
239 unsigned int discarded = 0;
240
241 /* Try to discard one cache entry for each IPoIB device */
242 for_each_netdev ( netdev ) {
243
244 /* Skip non-IPoIB devices */
245 if ( netdev->op != &ipoib_operations )
246 continue;
247 ipoib = netdev->priv;
248
249 /* Discard least recently used cache entry (if any) */
250 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
251 list_del ( &peer->list );
252 free ( peer );
253 discarded++;
254 break;
255 }
256 }
257
258 return discarded;
259 }
260
261 /** IPoIB cache discarder */
262 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
263 .discard = ipoib_discard_remac,
264 };
265
266 /****************************************************************************
267 *
268 * IPoIB link layer
269 *
270 ****************************************************************************
271 */
272
273 /**
274 * Initialise IPoIB link-layer address
275 *
276 * @v hw_addr Hardware address
277 * @v ll_addr Link-layer address
278 */
279 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
280 const uint8_t *guid = hw_addr;
281 uint8_t *eth_addr = ll_addr;
282 uint8_t guid_mask = IPOIB_GUID_MASK;
283 unsigned int i;
284
285 /* Extract bytes from GUID according to mask */
286 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
287 if ( guid_mask & 0x80 )
288 *(eth_addr++) = *guid;
289 }
290 }
291
292 /** IPoIB protocol */
293 struct ll_protocol ipoib_protocol __ll_protocol = {
294 .name = "IPoIB",
295 .ll_proto = htons ( ARPHRD_ETHER ),
296 .hw_addr_len = sizeof ( union ib_guid ),
297 .ll_addr_len = ETH_ALEN,
298 .ll_header_len = ETH_HLEN,
299 .push = eth_push,
300 .pull = eth_pull,
301 .init_addr = ipoib_init_addr,
302 .ntoa = eth_ntoa,
303 .mc_hash = eth_mc_hash,
304 .eth_addr = eth_eth_addr,
305 .eui64 = eth_eui64,
306 .flags = LL_NAME_ONLY,
307 };
308
309 /**
310 * Allocate IPoIB device
311 *
312 * @v priv_size Size of driver private data
313 * @ret netdev Network device, or NULL
314 */
315 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
316 struct net_device *netdev;
317
318 netdev = alloc_netdev ( priv_size );
319 if ( netdev ) {
320 netdev->ll_protocol = &ipoib_protocol;
321 netdev->ll_broadcast = eth_broadcast;
322 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
323 }
324 return netdev;
325 }
326
327 /****************************************************************************
328 *
329 * IPoIB translation layer
330 *
331 ****************************************************************************
332 */
333
334 /**
335 * Translate transmitted ARP packet
336 *
337 * @v netdev Network device
338 * @v iobuf Packet to be transmitted (with no link-layer headers)
339 * @ret rc Return status code
340 */
341 static int ipoib_translate_tx_arp ( struct net_device *netdev,
342 struct io_buffer *iobuf ) {
343 struct ipoib_device *ipoib = netdev->priv;
344 struct arphdr *arphdr = iobuf->data;
345 struct ipoib_mac *target_ha = NULL;
346 void *sender_pa;
347 void *target_pa;
348
349 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
350 if ( arphdr->ar_hln != ETH_ALEN )
351 return 0;
352
353 /* Fail unless we have room to expand packet */
354 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
355 ETH_ALEN ) ) ) {
356 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
357 ipoib );
358 return -ENOBUFS;
359 }
360
361 /* Look up REMAC, if applicable */
362 if ( arphdr->ar_op == ARPOP_REPLY ) {
363 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
364 if ( ! target_ha ) {
365 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
366 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
367 return -ENXIO_ARP_REPLY;
368 }
369 }
370
371 /* Construct new packet */
372 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
373 sender_pa = arp_sender_pa ( arphdr );
374 target_pa = arp_target_pa ( arphdr );
375 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
376 arphdr->ar_hln = sizeof ( ipoib->mac );
377 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
378 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
379 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
380 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
381 if ( target_ha ) {
382 memcpy ( arp_target_ha ( arphdr ), target_ha,
383 sizeof ( *target_ha ) );
384 }
385
386 return 0;
387 }
388
389 /**
390 * Translate transmitted packet
391 *
392 * @v netdev Network device
393 * @v iobuf Packet to be transmitted (with no link-layer headers)
394 * @v net_proto Network-layer protocol (in network byte order)
395 * @ret rc Return status code
396 */
397 static int ipoib_translate_tx ( struct net_device *netdev,
398 struct io_buffer *iobuf, uint16_t net_proto ) {
399
400 switch ( net_proto ) {
401 case htons ( ETH_P_ARP ) :
402 return ipoib_translate_tx_arp ( netdev, iobuf );
403 case htons ( ETH_P_IP ) :
404 /* No translation needed */
405 return 0;
406 default:
407 /* Cannot handle other traffic via eIPoIB */
408 return -ENOTSUP;
409 }
410 }
411
412 /**
413 * Translate received ARP packet
414 *
415 * @v netdev Network device
416 * @v iobuf Received packet (with no link-layer headers)
417 * @v remac Constructed Remote Ethernet MAC
418 * @ret rc Return status code
419 */
420 static int ipoib_translate_rx_arp ( struct net_device *netdev,
421 struct io_buffer *iobuf,
422 struct ipoib_remac *remac ) {
423 struct ipoib_device *ipoib = netdev->priv;
424 struct arphdr *arphdr = iobuf->data;
425 void *sender_pa;
426 void *target_pa;
427 int rc;
428
429 /* Do nothing unless ARP contains IPoIB link-layer addresses */
430 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
431 return 0;
432
433 /* Create REMAC cache entry */
434 if ( ( rc = ipoib_map_remac ( ipoib, remac,
435 arp_sender_ha ( arphdr ) ) ) != 0 ) {
436 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
437 ipoib, strerror ( rc ) );
438 return rc;
439 }
440
441 /* Construct new packet */
442 sender_pa = arp_sender_pa ( arphdr );
443 target_pa = arp_target_pa ( arphdr );
444 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
445 arphdr->ar_hln = ETH_ALEN;
446 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
447 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
448 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
449 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
450 if ( arphdr->ar_op == ARPOP_REPLY ) {
451 /* Assume received replies were directed to us */
452 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
453 }
454 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
455
456 return 0;
457 }
458
459 /**
460 * Translate received packet
461 *
462 * @v netdev Network device
463 * @v iobuf Received packet (with no link-layer headers)
464 * @v remac Constructed Remote Ethernet MAC
465 * @v net_proto Network-layer protocol (in network byte order)
466 * @ret rc Return status code
467 */
468 static int ipoib_translate_rx ( struct net_device *netdev,
469 struct io_buffer *iobuf,
470 struct ipoib_remac *remac,
471 uint16_t net_proto ) {
472
473 switch ( net_proto ) {
474 case htons ( ETH_P_ARP ) :
475 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
476 case htons ( ETH_P_IP ) :
477 /* No translation needed */
478 return 0;
479 default:
480 /* Cannot handle other traffic via eIPoIB */
481 return -ENOTSUP;
482 }
483 }
484
485 /****************************************************************************
486 *
487 * IPoIB network device
488 *
489 ****************************************************************************
490 */
491
492 /**
493 * Transmit packet via IPoIB network device
494 *
495 * @v netdev Network device
496 * @v iobuf I/O buffer
497 * @ret rc Return status code
498 */
499 static int ipoib_transmit ( struct net_device *netdev,
500 struct io_buffer *iobuf ) {
501 struct ipoib_device *ipoib = netdev->priv;
502 struct ib_device *ibdev = ipoib->ibdev;
503 struct ethhdr *ethhdr;
504 struct iphdr *iphdr;
505 struct ipoib_hdr *ipoib_hdr;
506 struct ipoib_remac *remac;
507 struct ipoib_mac *mac;
508 struct ib_address_vector *dest;
509 struct ib_address_vector av;
510 uint16_t net_proto;
511 int rc;
512
513 /* Sanity check */
514 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
515 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
516 return -EINVAL;
517 }
518
519 /* Attempting transmission while link is down will put the
520 * queue pair into an error state, so don't try it.
521 */
522 if ( ! ib_link_ok ( ibdev ) )
523 return -ENETUNREACH;
524
525 /* Strip eIPoIB header */
526 ethhdr = iobuf->data;
527 remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
528 net_proto = ethhdr->h_protocol;
529 iob_pull ( iobuf, sizeof ( *ethhdr ) );
530
531 /* Identify destination address */
532 if ( is_multicast_ether_addr ( remac ) ) {
533
534 /* Transmit multicasts as broadcasts, for simplicity */
535 dest = &ipoib->broadcast.av;
536
537 } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
538
539 /* Construct address vector from IPoIB MAC */
540 dest = &av;
541 memset ( dest, 0, sizeof ( *dest ) );
542 dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
543 dest->qkey = ipoib->broadcast.av.qkey;
544 dest->gid_present = 1;
545 memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
546 if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
547 /* Path not resolved yet */
548 return rc;
549 }
550
551 } else {
552
553 /* Generate a new ARP request (if possible) to trigger
554 * population of the REMAC cache entry.
555 */
556 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
557 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
558 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
559 "packet type %04x\n", ipoib,
560 eth_ntoa ( ethhdr->h_dest ),
561 ntohs ( net_proto ) );
562 return -ENXIO_NON_IPV4;
563 }
564 iphdr = iobuf->data;
565 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
566 &iphdr->dest, &iphdr->src ) ) !=0){
567 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
568 ipoib, eth_ntoa ( ethhdr->h_dest ),
569 inet_ntoa ( iphdr->dest ) );
570 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
571 strerror ( rc ) );
572 return rc;
573 }
574 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
575 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
576 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
577 return -ENXIO_ARP_SENT;
578 }
579
580 /* Translate packet if applicable */
581 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
582 return rc;
583
584 /* Prepend real IPoIB header */
585 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
586 ipoib_hdr->proto = net_proto;
587 ipoib_hdr->reserved = 0;
588
589 /* Transmit packet */
590 return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
591 }
592
593 /**
594 * Handle IPoIB send completion
595 *
596 * @v ibdev Infiniband device
597 * @v qp Queue pair
598 * @v iobuf I/O buffer
599 * @v rc Completion status code
600 */
601 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
602 struct ib_queue_pair *qp,
603 struct io_buffer *iobuf, int rc ) {
604 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
605
606 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
607 }
608
609 /**
610 * Handle IPoIB receive completion
611 *
612 * @v ibdev Infiniband device
613 * @v qp Queue pair
614 * @v dest Destination address vector, or NULL
615 * @v source Source address vector, or NULL
616 * @v iobuf I/O buffer
617 * @v rc Completion status code
618 */
619 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
620 struct ib_queue_pair *qp,
621 struct ib_address_vector *dest,
622 struct ib_address_vector *source,
623 struct io_buffer *iobuf, int rc ) {
624 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
625 struct net_device *netdev = ipoib->netdev;
626 struct ipoib_hdr *ipoib_hdr;
627 struct ethhdr *ethhdr;
628 struct ipoib_remac remac;
629 uint16_t net_proto;
630
631 /* Record errors */
632 if ( rc != 0 ) {
633 netdev_rx_err ( netdev, iobuf, rc );
634 return;
635 }
636
637 /* Sanity check */
638 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
639 DBGC ( ipoib, "IPoIB %p received packet too short to "
640 "contain IPoIB header\n", ipoib );
641 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
642 netdev_rx_err ( netdev, iobuf, -EIO );
643 return;
644 }
645 if ( ! source ) {
646 DBGC ( ipoib, "IPoIB %p received packet without address "
647 "vector\n", ipoib );
648 netdev_rx_err ( netdev, iobuf, -ENOTTY );
649 return;
650 }
651
652 /* Strip real IPoIB header */
653 ipoib_hdr = iobuf->data;
654 net_proto = ipoib_hdr->proto;
655 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
656
657 /* Construct source address from remote QPN and LID */
658 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
659 remac.lid = htons ( source->lid );
660
661 /* Translate packet if applicable */
662 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
663 net_proto ) ) != 0 ) {
664 netdev_rx_err ( netdev, iobuf, rc );
665 return;
666 }
667
668 /* Prepend eIPoIB header */
669 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
670 memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
671 ethhdr->h_protocol = net_proto;
672
673 /* Construct destination address */
674 if ( dest->gid_present &&
675 ( memcmp ( &dest->gid, &ipoib->broadcast.mac.gid,
676 sizeof ( dest->gid ) ) == 0 ) ) {
677 /* Broadcast GID; use the Ethernet broadcast address */
678 memcpy ( &ethhdr->h_dest, eth_broadcast,
679 sizeof ( ethhdr->h_dest ) );
680 } else {
681 /* Assume destination address is local Ethernet MAC */
682 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
683 sizeof ( ethhdr->h_dest ) );
684 }
685
686 /* Hand off to network layer */
687 netdev_rx ( netdev, iobuf );
688 }
689
690 /** IPoIB completion operations */
691 static struct ib_completion_queue_operations ipoib_cq_op = {
692 .complete_send = ipoib_complete_send,
693 .complete_recv = ipoib_complete_recv,
694 };
695
696 /**
697 * Allocate IPoIB receive I/O buffer
698 *
699 * @v len Length of buffer
700 * @ret iobuf I/O buffer, or NULL
701 *
702 * Some Infiniband hardware requires 2kB alignment of receive buffers
703 * and provides no way to disable header separation. The result is
704 * that there are only four bytes of link-layer header (the real IPoIB
705 * header) before the payload. This is not sufficient space to insert
706 * an eIPoIB link-layer pseudo-header.
707 *
708 * We therefore allocate I/O buffers offset to start slightly before
709 * the natural alignment boundary, in order to allow sufficient space.
710 */
711 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
712 struct io_buffer *iobuf;
713 size_t reserve_len;
714
715 /* Calculate additional length required at start of buffer */
716 reserve_len = ( sizeof ( struct ethhdr ) -
717 sizeof ( struct ipoib_hdr ) );
718
719 /* Allocate buffer */
720 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
721 if ( iobuf ) {
722 iob_reserve ( iobuf, reserve_len );
723 }
724 return iobuf;
725 }
726
727 /** IPoIB queue pair operations */
728 static struct ib_queue_pair_operations ipoib_qp_op = {
729 .alloc_iob = ipoib_alloc_iob,
730 };
731
732 /**
733 * Poll IPoIB network device
734 *
735 * @v netdev Network device
736 */
737 static void ipoib_poll ( struct net_device *netdev ) {
738 struct ipoib_device *ipoib = netdev->priv;
739 struct ib_device *ibdev = ipoib->ibdev;
740
741 /* Poll Infiniband device */
742 ib_poll_eq ( ibdev );
743
744 /* Poll the retry timers (required for IPoIB multicast join) */
745 retry_poll();
746 }
747
748 /**
749 * Handle IPv4 broadcast multicast group join completion
750 *
751 * @v membership Multicast group membership
752 * @v rc Status code
753 */
754 void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
755 struct ipoib_device *ipoib = container_of ( membership,
756 struct ipoib_device,
757 broadcast.membership );
758
759 /* Record join status as link status */
760 netdev_link_err ( ipoib->netdev, rc );
761 }
762
763 /**
764 * Join IPv4 broadcast multicast group
765 *
766 * @v ipoib IPoIB device
767 * @ret rc Return status code
768 */
769 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
770 int rc;
771
772 /* Join multicast group */
773 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
774 &ipoib->broadcast.membership,
775 &ipoib->broadcast.av,
776 ipoib_join_complete ) ) != 0 ) {
777 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
778 ipoib, strerror ( rc ) );
779 return rc;
780 }
781
782 return 0;
783 }
784
785 /**
786 * Leave IPv4 broadcast multicast group
787 *
788 * @v ipoib IPoIB device
789 */
790 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
791
792 /* Leave multicast group */
793 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
794 &ipoib->broadcast.membership );
795 }
796
797 /**
798 * Handle link status change
799 *
800 * @v ipoib IPoIB device
801 */
802 static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
803 struct ib_device *ibdev = ipoib->ibdev;
804 struct net_device *netdev = ipoib->netdev;
805 int rc;
806
807 /* Leave existing broadcast group */
808 if ( ipoib->qp )
809 ipoib_leave_broadcast_group ( ipoib );
810
811 /* Update MAC address based on potentially-new GID prefix */
812 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
813 sizeof ( ipoib->mac.gid.s.prefix ) );
814
815 /* Update broadcast MAC GID based on potentially-new partition key */
816 ipoib->broadcast.mac.gid.words[2] =
817 htons ( ibdev->pkey | IB_PKEY_FULL );
818
819 /* Construct broadcast address vector from broadcast MAC address */
820 memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
821 ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
822 ipoib->broadcast.av.gid_present = 1;
823 memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
824 sizeof ( ipoib->broadcast.av.gid ) );
825
826 /* Set net device link state to reflect Infiniband link state */
827 rc = ib_link_rc ( ibdev );
828 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
829
830 /* Join new broadcast group */
831 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
832 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
833 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
834 "%s\n", ipoib, strerror ( rc ) );
835 netdev_link_err ( netdev, rc );
836 return;
837 }
838 }
839
840 /**
841 * Open IPoIB network device
842 *
843 * @v netdev Network device
844 * @ret rc Return status code
845 */
846 static int ipoib_open ( struct net_device *netdev ) {
847 struct ipoib_device *ipoib = netdev->priv;
848 struct ib_device *ibdev = ipoib->ibdev;
849 int rc;
850
851 /* Open IB device */
852 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
853 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
854 ipoib, strerror ( rc ) );
855 goto err_ib_open;
856 }
857
858 /* Allocate completion queue */
859 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
860 if ( ! ipoib->cq ) {
861 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
862 ipoib );
863 rc = -ENOMEM;
864 goto err_create_cq;
865 }
866
867 /* Allocate queue pair */
868 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
869 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
870 &ipoib_qp_op );
871 if ( ! ipoib->qp ) {
872 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
873 ipoib );
874 rc = -ENOMEM;
875 goto err_create_qp;
876 }
877 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
878
879 /* Update MAC address with QPN */
880 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
881
882 /* Fill receive rings */
883 ib_refill_recv ( ibdev, ipoib->qp );
884
885 /* Fake a link status change to join the broadcast group */
886 ipoib_link_state_changed ( ipoib );
887
888 return 0;
889
890 ib_destroy_qp ( ibdev, ipoib->qp );
891 err_create_qp:
892 ib_destroy_cq ( ibdev, ipoib->cq );
893 err_create_cq:
894 ib_close ( ibdev );
895 err_ib_open:
896 return rc;
897 }
898
899 /**
900 * Close IPoIB network device
901 *
902 * @v netdev Network device
903 */
904 static void ipoib_close ( struct net_device *netdev ) {
905 struct ipoib_device *ipoib = netdev->priv;
906 struct ib_device *ibdev = ipoib->ibdev;
907
908 /* Flush REMAC cache */
909 ipoib_flush_remac ( ipoib );
910
911 /* Leave broadcast group */
912 ipoib_leave_broadcast_group ( ipoib );
913
914 /* Remove QPN from MAC address */
915 ipoib->mac.flags__qpn = 0;
916
917 /* Tear down the queues */
918 ib_destroy_qp ( ibdev, ipoib->qp );
919 ipoib->qp = NULL;
920 ib_destroy_cq ( ibdev, ipoib->cq );
921 ipoib->cq = NULL;
922
923 /* Close IB device */
924 ib_close ( ibdev );
925 }
926
927 /** IPoIB network device operations */
928 static struct net_device_operations ipoib_operations = {
929 .open = ipoib_open,
930 .close = ipoib_close,
931 .transmit = ipoib_transmit,
932 .poll = ipoib_poll,
933 };
934
935 /**
936 * Probe IPoIB device
937 *
938 * @v ibdev Infiniband device
939 * @ret rc Return status code
940 */
941 static int ipoib_probe ( struct ib_device *ibdev ) {
942 struct net_device *netdev;
943 struct ipoib_device *ipoib;
944 int rc;
945
946 /* Allocate network device */
947 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
948 if ( ! netdev )
949 return -ENOMEM;
950 netdev_init ( netdev, &ipoib_operations );
951 ipoib = netdev->priv;
952 netdev->dev = ibdev->dev;
953 memset ( ipoib, 0, sizeof ( *ipoib ) );
954 ipoib->netdev = netdev;
955 ipoib->ibdev = ibdev;
956 INIT_LIST_HEAD ( &ipoib->peers );
957
958 /* Extract hardware address */
959 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
960 sizeof ( ibdev->gid.s.guid ) );
961
962 /* Set local MAC address */
963 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
964 sizeof ( ipoib->mac.gid.s.guid ) );
965
966 /* Set default broadcast MAC address */
967 memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
968 sizeof ( ipoib->broadcast.mac ) );
969
970 /* Add to list of IPoIB devices */
971 list_add_tail ( &ipoib->list, &ipoib_devices );
972
973 /* Register network device */
974 if ( ( rc = register_netdev ( netdev ) ) != 0 )
975 goto err_register_netdev;
976
977 return 0;
978
979 unregister_netdev ( netdev );
980 err_register_netdev:
981 list_del ( &ipoib->list );
982 netdev_nullify ( netdev );
983 netdev_put ( netdev );
984 return rc;
985 }
986
987 /**
988 * Handle device or link status change
989 *
990 * @v ibdev Infiniband device
991 */
992 static void ipoib_notify ( struct ib_device *ibdev ) {
993 struct ipoib_device *ipoib;
994
995 /* Handle link status change for any attached IPoIB devices */
996 list_for_each_entry ( ipoib, &ipoib_devices, list ) {
997 if ( ipoib->ibdev != ibdev )
998 continue;
999 ipoib_link_state_changed ( ipoib );
1000 }
1001 }
1002
1003 /**
1004 * Remove IPoIB device
1005 *
1006 * @v ibdev Infiniband device
1007 */
1008 static void ipoib_remove ( struct ib_device *ibdev ) {
1009 struct ipoib_device *ipoib;
1010 struct ipoib_device *tmp;
1011 struct net_device *netdev;
1012
1013 /* Remove any attached IPoIB devices */
1014 list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
1015 if ( ipoib->ibdev != ibdev )
1016 continue;
1017 netdev = ipoib->netdev;
1018 unregister_netdev ( netdev );
1019 list_del ( &ipoib->list );
1020 netdev_nullify ( netdev );
1021 netdev_put ( netdev );
1022 }
1023 }
1024
1025 /** IPoIB driver */
1026 struct ib_driver ipoib_driver __ib_driver = {
1027 .name = "IPoIB",
1028 .probe = ipoib_probe,
1029 .notify = ipoib_notify,
1030 .remove = ipoib_remove,
1031 };