[ipoib] Simplify test for received broadcast packets
[ipxe.git] / src / drivers / net / ipoib.c
1 /*
2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301, USA.
18 *
19 * You can also choose to distribute this program under the terms of
20 * the Unmodified Binary Distribution Licence (as given in the file
21 * COPYING.UBDL), provided that you have satisfied its requirements.
22 */
23
24 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <string.h>
31 #include <byteswap.h>
32 #include <errno.h>
33 #include <ipxe/errortab.h>
34 #include <ipxe/malloc.h>
35 #include <ipxe/if_arp.h>
36 #include <ipxe/arp.h>
37 #include <ipxe/if_ether.h>
38 #include <ipxe/ethernet.h>
39 #include <ipxe/ip.h>
40 #include <ipxe/iobuf.h>
41 #include <ipxe/netdevice.h>
42 #include <ipxe/infiniband.h>
43 #include <ipxe/ib_pathrec.h>
44 #include <ipxe/ib_mcast.h>
45 #include <ipxe/retry.h>
46 #include <ipxe/ipoib.h>
47
48 /** @file
49 *
50 * IP over Infiniband
51 */
52
/* Distinct ENXIO variants, one per "missing REMAC" failure cause */
#define ENXIO_ARP_REPLY __einfo_error ( EINFO_ENXIO_ARP_REPLY )
#define EINFO_ENXIO_ARP_REPLY __einfo_uniqify				\
	( EINFO_ENXIO, 0x01,						\
	  "Missing REMAC for ARP reply target address" )
#define ENXIO_NON_IPV4 __einfo_error ( EINFO_ENXIO_NON_IPV4 )
#define EINFO_ENXIO_NON_IPV4 __einfo_uniqify				\
	( EINFO_ENXIO, 0x02,						\
	  "Missing REMAC for non-IPv4 packet" )
#define ENXIO_ARP_SENT __einfo_error ( EINFO_ENXIO_ARP_SENT )
#define EINFO_ENXIO_ARP_SENT __einfo_uniqify				\
	( EINFO_ENXIO, 0x03,						\
	  "Missing REMAC for IPv4 packet (ARP sent)" )
66
/** Size of the IPoIB send work queue (entries) */
#define IPOIB_NUM_SEND_WQES 2

/** Size of the IPoIB receive work queue (entries) */
#define IPOIB_NUM_RECV_WQES 4

/** Size of the IPoIB completion queue (entries) */
#define IPOIB_NUM_CQES 8
75
76 /** An IPoIB broadcast address */
77 struct ipoib_broadcast {
78 /** MAC address */
79 struct ipoib_mac mac;
80 /** Address vector */
81 struct ib_address_vector av;
82 /** Multicast group membership */
83 struct ib_mc_membership membership;
84 };
85
86 /** An IPoIB device */
87 struct ipoib_device {
88 /** Network device */
89 struct net_device *netdev;
90 /** Underlying Infiniband device */
91 struct ib_device *ibdev;
92 /** List of IPoIB devices */
93 struct list_head list;
94 /** Completion queue */
95 struct ib_completion_queue *cq;
96 /** Queue pair */
97 struct ib_queue_pair *qp;
98 /** Local MAC */
99 struct ipoib_mac mac;
100 /** Broadcast address */
101 struct ipoib_broadcast broadcast;
102 /** REMAC cache */
103 struct list_head peers;
104 };
105
106 /** Broadcast IPoIB address */
107 static struct ipoib_mac ipoib_broadcast = {
108 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
109 .gid.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
110 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
111 };
112
113 /** Link status for "broadcast join in progress" */
114 #define EINPROGRESS_JOINING __einfo_error ( EINFO_EINPROGRESS_JOINING )
115 #define EINFO_EINPROGRESS_JOINING __einfo_uniqify \
116 ( EINFO_EINPROGRESS, 0x01, "Joining" )
117
118 /** Human-readable message for the link status */
119 struct errortab ipoib_errors[] __errortab = {
120 __einfo_errortab ( EINFO_EINPROGRESS_JOINING ),
121 };
122
123 /** List of all IPoIB devices */
124 static LIST_HEAD ( ipoib_devices );
125
126 static struct net_device_operations ipoib_operations;
127
128 /****************************************************************************
129 *
130 * IPoIB REMAC cache
131 *
132 ****************************************************************************
133 */
134
135 /** An IPoIB REMAC cache entry */
136 struct ipoib_peer {
137 /** List of REMAC cache entries */
138 struct list_head list;
139 /** Remote Ethermet MAC */
140 struct ipoib_remac remac;
141 /** MAC address */
142 struct ipoib_mac mac;
143 };
144
145 /**
146 * Find IPoIB MAC from REMAC
147 *
148 * @v ipoib IPoIB device
149 * @v remac Remote Ethernet MAC
150 * @ret mac IPoIB MAC (or NULL if not found)
151 */
152 static struct ipoib_mac * ipoib_find_remac ( struct ipoib_device *ipoib,
153 const struct ipoib_remac *remac ) {
154 struct ipoib_peer *peer;
155
156 /* Check for broadcast or multicast REMAC. We transmit
157 * multicasts as broadcasts for simplicity.
158 */
159 if ( is_multicast_ether_addr ( remac ) )
160 return &ipoib->broadcast.mac;
161
162 /* Try to find via REMAC cache */
163 list_for_each_entry ( peer, &ipoib->peers, list ) {
164 if ( memcmp ( remac, &peer->remac,
165 sizeof ( peer->remac ) ) == 0 ) {
166 /* Move peer to start of list */
167 list_del ( &peer->list );
168 list_add ( &peer->list, &ipoib->peers );
169 return &peer->mac;
170 }
171 }
172
173 DBGC ( ipoib, "IPoIB %p unknown REMAC %s\n",
174 ipoib, eth_ntoa ( remac ) );
175 return NULL;
176 }
177
178 /**
179 * Add IPoIB MAC to REMAC cache
180 *
181 * @v ipoib IPoIB device
182 * @v remac Remote Ethernet MAC
183 * @v mac IPoIB MAC
184 * @ret rc Return status code
185 */
186 static int ipoib_map_remac ( struct ipoib_device *ipoib,
187 const struct ipoib_remac *remac,
188 const struct ipoib_mac *mac ) {
189 struct ipoib_peer *peer;
190
191 /* Check for existing entry in REMAC cache */
192 list_for_each_entry ( peer, &ipoib->peers, list ) {
193 if ( memcmp ( remac, &peer->remac,
194 sizeof ( peer->remac ) ) == 0 ) {
195 /* Move peer to start of list */
196 list_del ( &peer->list );
197 list_add ( &peer->list, &ipoib->peers );
198 /* Update MAC */
199 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
200 return 0;
201 }
202 }
203
204 /* Create new entry */
205 peer = malloc ( sizeof ( *peer ) );
206 if ( ! peer )
207 return -ENOMEM;
208 memcpy ( &peer->remac, remac, sizeof ( peer->remac ) );
209 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
210 list_add ( &peer->list, &ipoib->peers );
211
212 return 0;
213 }
214
215 /**
216 * Flush REMAC cache
217 *
218 * @v ipoib IPoIB device
219 */
220 static void ipoib_flush_remac ( struct ipoib_device *ipoib ) {
221 struct ipoib_peer *peer;
222 struct ipoib_peer *tmp;
223
224 list_for_each_entry_safe ( peer, tmp, &ipoib->peers, list ) {
225 list_del ( &peer->list );
226 free ( peer );
227 }
228 }
229
230 /**
231 * Discard some entries from the REMAC cache
232 *
233 * @ret discarded Number of cached items discarded
234 */
235 static unsigned int ipoib_discard_remac ( void ) {
236 struct net_device *netdev;
237 struct ipoib_device *ipoib;
238 struct ipoib_peer *peer;
239 unsigned int discarded = 0;
240
241 /* Try to discard one cache entry for each IPoIB device */
242 for_each_netdev ( netdev ) {
243
244 /* Skip non-IPoIB devices */
245 if ( netdev->op != &ipoib_operations )
246 continue;
247 ipoib = netdev->priv;
248
249 /* Discard least recently used cache entry (if any) */
250 list_for_each_entry_reverse ( peer, &ipoib->peers, list ) {
251 list_del ( &peer->list );
252 free ( peer );
253 discarded++;
254 break;
255 }
256 }
257
258 return discarded;
259 }
260
261 /** IPoIB cache discarder */
262 struct cache_discarder ipoib_discarder __cache_discarder ( CACHE_EXPENSIVE ) = {
263 .discard = ipoib_discard_remac,
264 };
265
266 /****************************************************************************
267 *
268 * IPoIB link layer
269 *
270 ****************************************************************************
271 */
272
273 /**
274 * Initialise IPoIB link-layer address
275 *
276 * @v hw_addr Hardware address
277 * @v ll_addr Link-layer address
278 */
279 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
280 const uint8_t *guid = hw_addr;
281 uint8_t *eth_addr = ll_addr;
282 uint8_t guid_mask = IPOIB_GUID_MASK;
283 unsigned int i;
284
285 /* Extract bytes from GUID according to mask */
286 for ( i = 0 ; i < 8 ; i++, guid++, guid_mask <<= 1 ) {
287 if ( guid_mask & 0x80 )
288 *(eth_addr++) = *guid;
289 }
290 }
291
292 /** IPoIB protocol */
293 struct ll_protocol ipoib_protocol __ll_protocol = {
294 .name = "IPoIB",
295 .ll_proto = htons ( ARPHRD_ETHER ),
296 .hw_addr_len = sizeof ( union ib_guid ),
297 .ll_addr_len = ETH_ALEN,
298 .ll_header_len = ETH_HLEN,
299 .push = eth_push,
300 .pull = eth_pull,
301 .init_addr = ipoib_init_addr,
302 .ntoa = eth_ntoa,
303 .mc_hash = eth_mc_hash,
304 .eth_addr = eth_eth_addr,
305 .eui64 = eth_eui64,
306 .flags = LL_NAME_ONLY,
307 };
308
309 /**
310 * Allocate IPoIB device
311 *
312 * @v priv_size Size of driver private data
313 * @ret netdev Network device, or NULL
314 */
315 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
316 struct net_device *netdev;
317
318 netdev = alloc_netdev ( priv_size );
319 if ( netdev ) {
320 netdev->ll_protocol = &ipoib_protocol;
321 netdev->ll_broadcast = eth_broadcast;
322 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
323 }
324 return netdev;
325 }
326
327 /****************************************************************************
328 *
329 * IPoIB translation layer
330 *
331 ****************************************************************************
332 */
333
334 /**
335 * Translate transmitted ARP packet
336 *
337 * @v netdev Network device
338 * @v iobuf Packet to be transmitted (with no link-layer headers)
339 * @ret rc Return status code
340 */
341 static int ipoib_translate_tx_arp ( struct net_device *netdev,
342 struct io_buffer *iobuf ) {
343 struct ipoib_device *ipoib = netdev->priv;
344 struct arphdr *arphdr = iobuf->data;
345 struct ipoib_mac *target_ha = NULL;
346 void *sender_pa;
347 void *target_pa;
348
349 /* Do nothing unless ARP contains eIPoIB link-layer addresses */
350 if ( arphdr->ar_hln != ETH_ALEN )
351 return 0;
352
353 /* Fail unless we have room to expand packet */
354 if ( iob_tailroom ( iobuf ) < ( 2 * ( sizeof ( ipoib->mac ) -
355 ETH_ALEN ) ) ) {
356 DBGC ( ipoib, "IPoIB %p insufficient space in TX ARP\n",
357 ipoib );
358 return -ENOBUFS;
359 }
360
361 /* Look up REMAC, if applicable */
362 if ( arphdr->ar_op == ARPOP_REPLY ) {
363 target_ha = ipoib_find_remac ( ipoib, arp_target_pa ( arphdr ));
364 if ( ! target_ha ) {
365 DBGC ( ipoib, "IPoIB %p no REMAC for %s ARP reply\n",
366 ipoib, eth_ntoa ( arp_target_pa ( arphdr ) ) );
367 return -ENXIO_ARP_REPLY;
368 }
369 }
370
371 /* Construct new packet */
372 iob_put ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
373 sender_pa = arp_sender_pa ( arphdr );
374 target_pa = arp_target_pa ( arphdr );
375 arphdr->ar_hrd = htons ( ARPHRD_INFINIBAND );
376 arphdr->ar_hln = sizeof ( ipoib->mac );
377 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
378 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
379 memcpy ( arp_sender_ha ( arphdr ), &ipoib->mac, sizeof ( ipoib->mac ) );
380 memset ( arp_target_ha ( arphdr ), 0, sizeof ( ipoib->mac ) );
381 if ( target_ha ) {
382 memcpy ( arp_target_ha ( arphdr ), target_ha,
383 sizeof ( *target_ha ) );
384 }
385
386 return 0;
387 }
388
389 /**
390 * Translate transmitted packet
391 *
392 * @v netdev Network device
393 * @v iobuf Packet to be transmitted (with no link-layer headers)
394 * @v net_proto Network-layer protocol (in network byte order)
395 * @ret rc Return status code
396 */
397 static int ipoib_translate_tx ( struct net_device *netdev,
398 struct io_buffer *iobuf, uint16_t net_proto ) {
399
400 switch ( net_proto ) {
401 case htons ( ETH_P_ARP ) :
402 return ipoib_translate_tx_arp ( netdev, iobuf );
403 case htons ( ETH_P_IP ) :
404 /* No translation needed */
405 return 0;
406 default:
407 /* Cannot handle other traffic via eIPoIB */
408 return -ENOTSUP;
409 }
410 }
411
412 /**
413 * Translate received ARP packet
414 *
415 * @v netdev Network device
416 * @v iobuf Received packet (with no link-layer headers)
417 * @v remac Constructed Remote Ethernet MAC
418 * @ret rc Return status code
419 */
420 static int ipoib_translate_rx_arp ( struct net_device *netdev,
421 struct io_buffer *iobuf,
422 struct ipoib_remac *remac ) {
423 struct ipoib_device *ipoib = netdev->priv;
424 struct arphdr *arphdr = iobuf->data;
425 void *sender_pa;
426 void *target_pa;
427 int rc;
428
429 /* Do nothing unless ARP contains IPoIB link-layer addresses */
430 if ( arphdr->ar_hln != sizeof ( ipoib->mac ) )
431 return 0;
432
433 /* Create REMAC cache entry */
434 if ( ( rc = ipoib_map_remac ( ipoib, remac,
435 arp_sender_ha ( arphdr ) ) ) != 0 ) {
436 DBGC ( ipoib, "IPoIB %p could not map REMAC: %s\n",
437 ipoib, strerror ( rc ) );
438 return rc;
439 }
440
441 /* Construct new packet */
442 sender_pa = arp_sender_pa ( arphdr );
443 target_pa = arp_target_pa ( arphdr );
444 arphdr->ar_hrd = htons ( ARPHRD_ETHER );
445 arphdr->ar_hln = ETH_ALEN;
446 memcpy ( arp_sender_pa ( arphdr ), sender_pa, arphdr->ar_pln );
447 memcpy ( arp_target_pa ( arphdr ), target_pa, arphdr->ar_pln );
448 memcpy ( arp_sender_ha ( arphdr ), remac, ETH_ALEN );
449 memset ( arp_target_ha ( arphdr ), 0, ETH_ALEN );
450 if ( arphdr->ar_op == ARPOP_REPLY ) {
451 /* Assume received replies were directed to us */
452 memcpy ( arp_target_ha ( arphdr ), netdev->ll_addr, ETH_ALEN );
453 }
454 iob_unput ( iobuf, ( 2 * ( sizeof ( ipoib->mac ) - ETH_ALEN ) ) );
455
456 return 0;
457 }
458
459 /**
460 * Translate received packet
461 *
462 * @v netdev Network device
463 * @v iobuf Received packet (with no link-layer headers)
464 * @v remac Constructed Remote Ethernet MAC
465 * @v net_proto Network-layer protocol (in network byte order)
466 * @ret rc Return status code
467 */
468 static int ipoib_translate_rx ( struct net_device *netdev,
469 struct io_buffer *iobuf,
470 struct ipoib_remac *remac,
471 uint16_t net_proto ) {
472
473 switch ( net_proto ) {
474 case htons ( ETH_P_ARP ) :
475 return ipoib_translate_rx_arp ( netdev, iobuf, remac );
476 case htons ( ETH_P_IP ) :
477 /* No translation needed */
478 return 0;
479 default:
480 /* Cannot handle other traffic via eIPoIB */
481 return -ENOTSUP;
482 }
483 }
484
485 /****************************************************************************
486 *
487 * IPoIB network device
488 *
489 ****************************************************************************
490 */
491
492 /**
493 * Transmit packet via IPoIB network device
494 *
495 * @v netdev Network device
496 * @v iobuf I/O buffer
497 * @ret rc Return status code
498 */
499 static int ipoib_transmit ( struct net_device *netdev,
500 struct io_buffer *iobuf ) {
501 struct ipoib_device *ipoib = netdev->priv;
502 struct ib_device *ibdev = ipoib->ibdev;
503 struct ethhdr *ethhdr;
504 struct iphdr *iphdr;
505 struct ipoib_hdr *ipoib_hdr;
506 struct ipoib_remac *remac;
507 struct ipoib_mac *mac;
508 struct ib_address_vector *dest;
509 struct ib_address_vector av;
510 uint16_t net_proto;
511 int rc;
512
513 /* Sanity check */
514 if ( iob_len ( iobuf ) < sizeof ( *ethhdr ) ) {
515 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
516 return -EINVAL;
517 }
518
519 /* Attempting transmission while link is down will put the
520 * queue pair into an error state, so don't try it.
521 */
522 if ( ! ib_link_ok ( ibdev ) )
523 return -ENETUNREACH;
524
525 /* Strip eIPoIB header */
526 ethhdr = iobuf->data;
527 remac = ( ( struct ipoib_remac * ) ethhdr->h_dest );
528 net_proto = ethhdr->h_protocol;
529 iob_pull ( iobuf, sizeof ( *ethhdr ) );
530
531 /* Identify destination address */
532 if ( is_multicast_ether_addr ( remac ) ) {
533
534 /* Transmit multicasts as broadcasts, for simplicity */
535 dest = &ipoib->broadcast.av;
536
537 } else if ( ( mac = ipoib_find_remac ( ipoib, remac ) ) ) {
538
539 /* Construct address vector from IPoIB MAC */
540 dest = &av;
541 memset ( dest, 0, sizeof ( *dest ) );
542 dest->qpn = ( ntohl ( mac->flags__qpn ) & IB_QPN_MASK );
543 dest->qkey = ipoib->broadcast.av.qkey;
544 dest->gid_present = 1;
545 memcpy ( &dest->gid, &mac->gid, sizeof ( dest->gid ) );
546 if ( ( rc = ib_resolve_path ( ibdev, dest ) ) != 0 ) {
547 /* Path not resolved yet */
548 return rc;
549 }
550
551 } else {
552
553 /* Generate a new ARP request (if possible) to trigger
554 * population of the REMAC cache entry.
555 */
556 if ( ( net_proto != htons ( ETH_P_IP ) ) ||
557 ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) ) {
558 DBGC ( ipoib, "IPoIB %p no REMAC for %s non-IPv4 "
559 "packet type %04x\n", ipoib,
560 eth_ntoa ( ethhdr->h_dest ),
561 ntohs ( net_proto ) );
562 return -ENXIO_NON_IPV4;
563 }
564 iphdr = iobuf->data;
565 if ( ( rc = arp_tx_request ( netdev, &ipv4_protocol,
566 &iphdr->dest, &iphdr->src ) ) !=0){
567 DBGC ( ipoib, "IPoIB %p could not ARP for %s/%s/",
568 ipoib, eth_ntoa ( ethhdr->h_dest ),
569 inet_ntoa ( iphdr->dest ) );
570 DBGC ( ipoib, "%s: %s\n", inet_ntoa ( iphdr->src ),
571 strerror ( rc ) );
572 return rc;
573 }
574 DBGC ( ipoib, "IPoIB %p no REMAC for %s/%s/", ipoib,
575 eth_ntoa ( ethhdr->h_dest ), inet_ntoa ( iphdr->dest ) );
576 DBGC ( ipoib, "%s\n", inet_ntoa ( iphdr->src ) );
577 return -ENXIO_ARP_SENT;
578 }
579
580 /* Translate packet if applicable */
581 if ( ( rc = ipoib_translate_tx ( netdev, iobuf, net_proto ) ) != 0 )
582 return rc;
583
584 /* Prepend real IPoIB header */
585 ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
586 ipoib_hdr->proto = net_proto;
587 ipoib_hdr->reserved = 0;
588
589 /* Transmit packet */
590 return ib_post_send ( ibdev, ipoib->qp, dest, iobuf );
591 }
592
593 /**
594 * Handle IPoIB send completion
595 *
596 * @v ibdev Infiniband device
597 * @v qp Queue pair
598 * @v iobuf I/O buffer
599 * @v rc Completion status code
600 */
601 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
602 struct ib_queue_pair *qp,
603 struct io_buffer *iobuf, int rc ) {
604 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
605
606 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
607 }
608
609 /**
610 * Handle IPoIB receive completion
611 *
612 * @v ibdev Infiniband device
613 * @v qp Queue pair
614 * @v dest Destination address vector, or NULL
615 * @v source Source address vector, or NULL
616 * @v iobuf I/O buffer
617 * @v rc Completion status code
618 */
619 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
620 struct ib_queue_pair *qp,
621 struct ib_address_vector *dest,
622 struct ib_address_vector *source,
623 struct io_buffer *iobuf, int rc ) {
624 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
625 struct net_device *netdev = ipoib->netdev;
626 struct ipoib_hdr *ipoib_hdr;
627 struct ethhdr *ethhdr;
628 struct ipoib_remac remac;
629 uint16_t net_proto;
630
631 /* Record errors */
632 if ( rc != 0 ) {
633 netdev_rx_err ( netdev, iobuf, rc );
634 return;
635 }
636
637 /* Sanity check */
638 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
639 DBGC ( ipoib, "IPoIB %p received packet too short to "
640 "contain IPoIB header\n", ipoib );
641 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
642 netdev_rx_err ( netdev, iobuf, -EIO );
643 return;
644 }
645 if ( ! source ) {
646 DBGC ( ipoib, "IPoIB %p received packet without address "
647 "vector\n", ipoib );
648 netdev_rx_err ( netdev, iobuf, -ENOTTY );
649 return;
650 }
651
652 /* Strip real IPoIB header */
653 ipoib_hdr = iobuf->data;
654 net_proto = ipoib_hdr->proto;
655 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
656
657 /* Construct source address from remote QPN and LID */
658 remac.qpn = htonl ( source->qpn | EIPOIB_QPN_LA );
659 remac.lid = htons ( source->lid );
660
661 /* Translate packet if applicable */
662 if ( ( rc = ipoib_translate_rx ( netdev, iobuf, &remac,
663 net_proto ) ) != 0 ) {
664 netdev_rx_err ( netdev, iobuf, rc );
665 return;
666 }
667
668 /* Prepend eIPoIB header */
669 ethhdr = iob_push ( iobuf, sizeof ( *ethhdr ) );
670 memcpy ( &ethhdr->h_source, &remac, sizeof ( ethhdr->h_source ) );
671 ethhdr->h_protocol = net_proto;
672
673 /* Construct destination address */
674 if ( IB_LID_MULTICAST ( dest->lid ) ) {
675 /* Multicast LID; use the Ethernet broadcast address */
676 memcpy ( &ethhdr->h_dest, eth_broadcast,
677 sizeof ( ethhdr->h_dest ) );
678 } else {
679 /* Assume destination address is local Ethernet MAC */
680 memcpy ( &ethhdr->h_dest, netdev->ll_addr,
681 sizeof ( ethhdr->h_dest ) );
682 }
683
684 /* Hand off to network layer */
685 netdev_rx ( netdev, iobuf );
686 }
687
688 /** IPoIB completion operations */
689 static struct ib_completion_queue_operations ipoib_cq_op = {
690 .complete_send = ipoib_complete_send,
691 .complete_recv = ipoib_complete_recv,
692 };
693
694 /**
695 * Allocate IPoIB receive I/O buffer
696 *
697 * @v len Length of buffer
698 * @ret iobuf I/O buffer, or NULL
699 *
700 * Some Infiniband hardware requires 2kB alignment of receive buffers
701 * and provides no way to disable header separation. The result is
702 * that there are only four bytes of link-layer header (the real IPoIB
703 * header) before the payload. This is not sufficient space to insert
704 * an eIPoIB link-layer pseudo-header.
705 *
706 * We therefore allocate I/O buffers offset to start slightly before
707 * the natural alignment boundary, in order to allow sufficient space.
708 */
709 static struct io_buffer * ipoib_alloc_iob ( size_t len ) {
710 struct io_buffer *iobuf;
711 size_t reserve_len;
712
713 /* Calculate additional length required at start of buffer */
714 reserve_len = ( sizeof ( struct ethhdr ) -
715 sizeof ( struct ipoib_hdr ) );
716
717 /* Allocate buffer */
718 iobuf = alloc_iob_raw ( ( len + reserve_len ), len, -reserve_len );
719 if ( iobuf ) {
720 iob_reserve ( iobuf, reserve_len );
721 }
722 return iobuf;
723 }
724
725 /** IPoIB queue pair operations */
726 static struct ib_queue_pair_operations ipoib_qp_op = {
727 .alloc_iob = ipoib_alloc_iob,
728 };
729
730 /**
731 * Poll IPoIB network device
732 *
733 * @v netdev Network device
734 */
735 static void ipoib_poll ( struct net_device *netdev ) {
736 struct ipoib_device *ipoib = netdev->priv;
737 struct ib_device *ibdev = ipoib->ibdev;
738
739 /* Poll Infiniband device */
740 ib_poll_eq ( ibdev );
741
742 /* Poll the retry timers (required for IPoIB multicast join) */
743 retry_poll();
744 }
745
746 /**
747 * Handle IPv4 broadcast multicast group join completion
748 *
749 * @v membership Multicast group membership
750 * @v rc Status code
751 */
752 void ipoib_join_complete ( struct ib_mc_membership *membership, int rc ) {
753 struct ipoib_device *ipoib = container_of ( membership,
754 struct ipoib_device,
755 broadcast.membership );
756
757 /* Record join status as link status */
758 netdev_link_err ( ipoib->netdev, rc );
759 }
760
761 /**
762 * Join IPv4 broadcast multicast group
763 *
764 * @v ipoib IPoIB device
765 * @ret rc Return status code
766 */
767 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
768 int rc;
769
770 /* Join multicast group */
771 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
772 &ipoib->broadcast.membership,
773 &ipoib->broadcast.av,
774 ipoib_join_complete ) ) != 0 ) {
775 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
776 ipoib, strerror ( rc ) );
777 return rc;
778 }
779
780 return 0;
781 }
782
783 /**
784 * Leave IPv4 broadcast multicast group
785 *
786 * @v ipoib IPoIB device
787 */
788 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
789
790 /* Leave multicast group */
791 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
792 &ipoib->broadcast.membership );
793 }
794
795 /**
796 * Handle link status change
797 *
798 * @v ipoib IPoIB device
799 */
800 static void ipoib_link_state_changed ( struct ipoib_device *ipoib ) {
801 struct ib_device *ibdev = ipoib->ibdev;
802 struct net_device *netdev = ipoib->netdev;
803 int rc;
804
805 /* Leave existing broadcast group */
806 if ( ipoib->qp )
807 ipoib_leave_broadcast_group ( ipoib );
808
809 /* Update MAC address based on potentially-new GID prefix */
810 memcpy ( &ipoib->mac.gid.s.prefix, &ibdev->gid.s.prefix,
811 sizeof ( ipoib->mac.gid.s.prefix ) );
812
813 /* Update broadcast MAC GID based on potentially-new partition key */
814 ipoib->broadcast.mac.gid.words[2] =
815 htons ( ibdev->pkey | IB_PKEY_FULL );
816
817 /* Construct broadcast address vector from broadcast MAC address */
818 memset ( &ipoib->broadcast.av, 0, sizeof ( ipoib->broadcast.av ) );
819 ipoib->broadcast.av.qpn = IB_QPN_BROADCAST;
820 ipoib->broadcast.av.gid_present = 1;
821 memcpy ( &ipoib->broadcast.av.gid, &ipoib->broadcast.mac.gid,
822 sizeof ( ipoib->broadcast.av.gid ) );
823
824 /* Set net device link state to reflect Infiniband link state */
825 rc = ib_link_rc ( ibdev );
826 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
827
828 /* Join new broadcast group */
829 if ( ib_is_open ( ibdev ) && ib_link_ok ( ibdev ) && ipoib->qp &&
830 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
831 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
832 "%s\n", ipoib, strerror ( rc ) );
833 netdev_link_err ( netdev, rc );
834 return;
835 }
836 }
837
838 /**
839 * Open IPoIB network device
840 *
841 * @v netdev Network device
842 * @ret rc Return status code
843 */
844 static int ipoib_open ( struct net_device *netdev ) {
845 struct ipoib_device *ipoib = netdev->priv;
846 struct ib_device *ibdev = ipoib->ibdev;
847 int rc;
848
849 /* Open IB device */
850 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
851 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
852 ipoib, strerror ( rc ) );
853 goto err_ib_open;
854 }
855
856 /* Allocate completion queue */
857 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
858 if ( ! ipoib->cq ) {
859 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
860 ipoib );
861 rc = -ENOMEM;
862 goto err_create_cq;
863 }
864
865 /* Allocate queue pair */
866 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES,
867 ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq,
868 &ipoib_qp_op );
869 if ( ! ipoib->qp ) {
870 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
871 ipoib );
872 rc = -ENOMEM;
873 goto err_create_qp;
874 }
875 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
876
877 /* Update MAC address with QPN */
878 ipoib->mac.flags__qpn = htonl ( ipoib->qp->qpn );
879
880 /* Fill receive rings */
881 ib_refill_recv ( ibdev, ipoib->qp );
882
883 /* Fake a link status change to join the broadcast group */
884 ipoib_link_state_changed ( ipoib );
885
886 return 0;
887
888 ib_destroy_qp ( ibdev, ipoib->qp );
889 err_create_qp:
890 ib_destroy_cq ( ibdev, ipoib->cq );
891 err_create_cq:
892 ib_close ( ibdev );
893 err_ib_open:
894 return rc;
895 }
896
897 /**
898 * Close IPoIB network device
899 *
900 * @v netdev Network device
901 */
902 static void ipoib_close ( struct net_device *netdev ) {
903 struct ipoib_device *ipoib = netdev->priv;
904 struct ib_device *ibdev = ipoib->ibdev;
905
906 /* Flush REMAC cache */
907 ipoib_flush_remac ( ipoib );
908
909 /* Leave broadcast group */
910 ipoib_leave_broadcast_group ( ipoib );
911
912 /* Remove QPN from MAC address */
913 ipoib->mac.flags__qpn = 0;
914
915 /* Tear down the queues */
916 ib_destroy_qp ( ibdev, ipoib->qp );
917 ipoib->qp = NULL;
918 ib_destroy_cq ( ibdev, ipoib->cq );
919 ipoib->cq = NULL;
920
921 /* Close IB device */
922 ib_close ( ibdev );
923 }
924
925 /** IPoIB network device operations */
926 static struct net_device_operations ipoib_operations = {
927 .open = ipoib_open,
928 .close = ipoib_close,
929 .transmit = ipoib_transmit,
930 .poll = ipoib_poll,
931 };
932
933 /**
934 * Probe IPoIB device
935 *
936 * @v ibdev Infiniband device
937 * @ret rc Return status code
938 */
939 static int ipoib_probe ( struct ib_device *ibdev ) {
940 struct net_device *netdev;
941 struct ipoib_device *ipoib;
942 int rc;
943
944 /* Allocate network device */
945 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
946 if ( ! netdev )
947 return -ENOMEM;
948 netdev_init ( netdev, &ipoib_operations );
949 ipoib = netdev->priv;
950 netdev->dev = ibdev->dev;
951 memset ( ipoib, 0, sizeof ( *ipoib ) );
952 ipoib->netdev = netdev;
953 ipoib->ibdev = ibdev;
954 INIT_LIST_HEAD ( &ipoib->peers );
955
956 /* Extract hardware address */
957 memcpy ( netdev->hw_addr, &ibdev->gid.s.guid,
958 sizeof ( ibdev->gid.s.guid ) );
959
960 /* Set local MAC address */
961 memcpy ( &ipoib->mac.gid.s.guid, &ibdev->gid.s.guid,
962 sizeof ( ipoib->mac.gid.s.guid ) );
963
964 /* Set default broadcast MAC address */
965 memcpy ( &ipoib->broadcast.mac, &ipoib_broadcast,
966 sizeof ( ipoib->broadcast.mac ) );
967
968 /* Add to list of IPoIB devices */
969 list_add_tail ( &ipoib->list, &ipoib_devices );
970
971 /* Register network device */
972 if ( ( rc = register_netdev ( netdev ) ) != 0 )
973 goto err_register_netdev;
974
975 return 0;
976
977 unregister_netdev ( netdev );
978 err_register_netdev:
979 list_del ( &ipoib->list );
980 netdev_nullify ( netdev );
981 netdev_put ( netdev );
982 return rc;
983 }
984
985 /**
986 * Handle device or link status change
987 *
988 * @v ibdev Infiniband device
989 */
990 static void ipoib_notify ( struct ib_device *ibdev ) {
991 struct ipoib_device *ipoib;
992
993 /* Handle link status change for any attached IPoIB devices */
994 list_for_each_entry ( ipoib, &ipoib_devices, list ) {
995 if ( ipoib->ibdev != ibdev )
996 continue;
997 ipoib_link_state_changed ( ipoib );
998 }
999 }
1000
1001 /**
1002 * Remove IPoIB device
1003 *
1004 * @v ibdev Infiniband device
1005 */
1006 static void ipoib_remove ( struct ib_device *ibdev ) {
1007 struct ipoib_device *ipoib;
1008 struct ipoib_device *tmp;
1009 struct net_device *netdev;
1010
1011 /* Remove any attached IPoIB devices */
1012 list_for_each_entry_safe ( ipoib, tmp, &ipoib_devices, list ) {
1013 if ( ipoib->ibdev != ibdev )
1014 continue;
1015 netdev = ipoib->netdev;
1016 unregister_netdev ( netdev );
1017 list_del ( &ipoib->list );
1018 netdev_nullify ( netdev );
1019 netdev_put ( netdev );
1020 }
1021 }
1022
1023 /** IPoIB driver */
1024 struct ib_driver ipoib_driver __ib_driver = {
1025 .name = "IPoIB",
1026 .probe = ipoib_probe,
1027 .notify = ipoib_notify,
1028 .remove = ipoib_remove,
1029 };