| /* |
| * MPTCP implementation - Sending side |
| * |
| * Initial Design & Implementation: |
| * Sébastien Barré <sebastien.barre@uclouvain.be> |
| * |
| * Current Maintainer & Author: |
| * Christoph Paasch <christoph.paasch@uclouvain.be> |
| * |
| * Additional authors: |
| * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> |
| * Gregory Detal <gregory.detal@uclouvain.be> |
| * Fabien Duchêne <fabien.duchene@uclouvain.be> |
| * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> |
| * Lavkesh Lahngir <lavkesh51@gmail.com> |
| * Andreas Ripke <ripke@neclab.eu> |
| * Vlad Dogaru <vlad.dogaru@intel.com> |
| * Octavian Purdila <octavian.purdila@intel.com> |
| * John Ronan <jronan@tssg.org> |
| * Catalin Nicutar <catalin.nicutar@gmail.com> |
| * Brandon Heller <brandonh@stanford.edu> |
| * |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License |
| * as published by the Free Software Foundation; either version |
| * 2 of the License, or (at your option) any later version. |
| */ |
| |
| #include <asm/unaligned.h> |
| |
| #include <net/mptcp.h> |
| #include <net/mptcp_v4.h> |
| #include <net/mptcp_v6.h> |
| |
| #include <linux/kconfig.h> |
| |
| /* is seq1 < seq2 ? */ |
| static inline bool before64(const u64 seq1, const u64 seq2) |
| { |
| return (s64)(seq1 - seq2) < 0; |
| } |
| |
| /* is seq1 > seq2 ? */ |
| #define after64(seq1, seq2) before64(seq2, seq1) |
| |
| static inline void mptcp_become_fully_estab(struct sock *sk) |
| { |
| tcp_sk(sk)->mptcp->fully_established = 1; |
| |
| if (is_master_tp(tcp_sk(sk)) && |
| tcp_sk(sk)->mpcb->pm_ops->fully_established) |
| tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk)); |
| } |
| |
| /* Similar to tcp_tso_acked without any memory accounting */ |
| static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk, |
| struct sk_buff *skb) |
| { |
| const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| u32 packets_acked, len, delta_truesize; |
| |
| BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)); |
| |
| packets_acked = tcp_skb_pcount(skb); |
| |
| if (skb_unclone(skb, GFP_ATOMIC)) |
| return 0; |
| |
| len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq; |
| delta_truesize = __pskb_trim_head(skb, len); |
| |
| TCP_SKB_CB(skb)->seq += len; |
| skb->ip_summed = CHECKSUM_PARTIAL; |
| |
| if (delta_truesize) |
| skb->truesize -= delta_truesize; |
| |
| /* Any change of skb->len requires recalculation of tso factor. */ |
| if (tcp_skb_pcount(skb) > 1) |
| tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); |
| packets_acked -= tcp_skb_pcount(skb); |
| |
| if (packets_acked) { |
| BUG_ON(tcp_skb_pcount(skb) == 0); |
| BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); |
| } |
| |
| return packets_acked; |
| } |
| |
| /** |
| * Cleans the meta-socket retransmission queue and the reinject-queue. |
| * @meta_sk must be the meta-socket. |
| */ |
| static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una) |
| { |
| struct sk_buff *skb, *tmp; |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| struct mptcp_cb *mpcb = meta_tp->mpcb; |
| bool acked = false; |
| u32 acked_pcount; |
| |
| while ((skb = tcp_write_queue_head(meta_sk)) && |
| skb != tcp_send_head(meta_sk)) { |
| bool fully_acked = true; |
| |
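| /* If snd_una only partially covers this skb, trim the acked part |
| * via tcp_tso_acked() and leave the rest at the head of the queue. |
| */ |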
| if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { |
| if (tcp_skb_pcount(skb) == 1 || |
| !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) |
| break; |
| |
| acked_pcount = tcp_tso_acked(meta_sk, skb); |
| if (!acked_pcount) |
| break; |
| |
| fully_acked = false; |
| } else { |
| acked_pcount = tcp_skb_pcount(skb); |
| } |
| |
| acked = true; |
| meta_tp->packets_out -= acked_pcount; |
| meta_tp->retrans_stamp = 0; |
| |
| if (!fully_acked) |
| break; |
| |
| tcp_unlink_write_queue(skb, meta_sk); |
| |
| if (mptcp_is_data_fin(skb)) { |
| struct sock *sk_it; |
| |
| /* DATA_FIN has been acknowledged - now we can close |
| * the subflows |
| */ |
| mptcp_for_each_sk(mpcb, sk_it) { |
| unsigned long delay = 0; |
| |
| /* If we are the passive closer, don't trigger |
| * subflow-fin until the subflow has been finned |
| * by the peer - thus we add a delay. |
| */ |
| if (mpcb->passive_close && |
| sk_it->sk_state == TCP_ESTABLISHED) |
| delay = inet_csk(sk_it)->icsk_rto << 3; |
| |
| mptcp_sub_close(sk_it, delay); |
| } |
| } |
| sk_wmem_free_skb(meta_sk, skb); |
| } |
| /* Remove acknowledged data from the reinject queue */ |
| skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) { |
| if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) { |
| if (tcp_skb_pcount(skb) == 1 || |
| !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq)) |
| break; |
| |
| mptcp_tso_acked_reinject(meta_sk, skb); |
| break; |
| } |
| |
| __skb_unlink(skb, &mpcb->reinject_queue); |
| __kfree_skb(skb); |
| } |
| |
| if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una))) |
| meta_tp->snd_up = meta_tp->snd_una; |
| |
| if (acked) { |
| tcp_rearm_rto(meta_sk); |
| /* Normally this is done in tcp_try_undo_loss - but MPTCP |
| * does not call this function. |
| */ |
| inet_csk(meta_sk)->icsk_retransmits = 0; |
| } |
| } |
| |
| /* Inspired by tcp_rcv_state_process */ |
| static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk, |
| const struct sk_buff *skb, u32 data_seq, |
| u16 data_len) |
| { |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); |
| const struct tcphdr *th = tcp_hdr(skb); |
| |
| /* State-machine handling if a FIN has been enqueued and it has |
| * been acked (snd_una == write_seq) - it is important that this |
| * runs after sk_wmem_free_skb, because otherwise |
| * sk_forward_alloc is wrong upon inet_csk_destroy_sock() |
| */ |
| switch (meta_sk->sk_state) { |
| case TCP_FIN_WAIT1: { |
| struct dst_entry *dst; |
| int tmo; |
| |
| if (meta_tp->snd_una != meta_tp->write_seq) |
| break; |
| |
| tcp_set_state(meta_sk, TCP_FIN_WAIT2); |
| meta_sk->sk_shutdown |= SEND_SHUTDOWN; |
| |
| dst = __sk_dst_get(sk); |
| if (dst) |
| dst_confirm(dst); |
| |
| if (!sock_flag(meta_sk, SOCK_DEAD)) { |
| /* Wake up lingering close() */ |
| meta_sk->sk_state_change(meta_sk); |
| break; |
| } |
| |
| if (meta_tp->linger2 < 0 || |
| (data_len && |
| after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0), |
| meta_tp->rcv_nxt))) { |
| mptcp_send_active_reset(meta_sk, GFP_ATOMIC); |
| tcp_done(meta_sk); |
| NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); |
| return 1; |
| } |
| |
| tmo = tcp_fin_time(meta_sk); |
| if (tmo > TCP_TIMEWAIT_LEN) { |
| inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN); |
| } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) { |
| /* Bad case. We could lose such a FIN otherwise. |
| * It is not a big problem, but it looks confusing |
| * and is not such a rare event. We can still lose it now, |
| * if it spins in bh_lock_sock(), but that is a really |
| * marginal case. |
| */ |
| inet_csk_reset_keepalive_timer(meta_sk, tmo); |
| } else { |
| meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo); |
| } |
| break; |
| } |
| case TCP_CLOSING: |
| case TCP_LAST_ACK: |
| if (meta_tp->snd_una == meta_tp->write_seq) { |
| tcp_done(meta_sk); |
| return 1; |
| } |
| break; |
| } |
| |
| /* step 7: process the segment text */ |
| switch (meta_sk->sk_state) { |
| case TCP_FIN_WAIT1: |
| case TCP_FIN_WAIT2: |
| /* RFC 793 says to queue data in these states, |
| * RFC 1122 says we MUST send a reset. |
| * BSD 4.4 also does reset. |
| */ |
| if (meta_sk->sk_shutdown & RCV_SHUTDOWN) { |
| if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && |
| after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) && |
| !mptcp_is_data_fin2(skb, tp)) { |
| NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA); |
| mptcp_send_active_reset(meta_sk, GFP_ATOMIC); |
| tcp_reset(meta_sk); |
| return 1; |
| } |
| } |
| break; |
| } |
| |
| return 0; |
| } |
| |
| /** |
| * @return: |
| * i) 1: Everything's fine. |
| * ii) -1: A reset has been sent on the subflow - csum-failure |
| * iii) 0: csum-failure but no reset sent, because it is the last subflow. |
| * The caller must not destroy the last packet, because that has |
| * already been done here. |
| */ |
| static int mptcp_verif_dss_csum(struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct sk_buff *tmp, *tmp1, *last = NULL; |
| __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */ |
| int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0; |
| int iter = 0; |
| |
| skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) { |
| unsigned int csum_len; |
| |
| if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq)) |
| /* Mapping ends in the middle of the packet - |
| * csum only these bytes |
| */ |
| csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq; |
| else |
| csum_len = tmp->len; |
| |
| offset = 0; |
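| /* The previous skb ended on an odd byte, so the first byte of this |
| * skb belongs at the odd position of the 16-bit one's-complement |
| * sum: add it as part of a zero-padded word and skip it below. |
| */ |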
| if (overflowed) { |
| char first_word[4]; |
| first_word[0] = 0; |
| first_word[1] = 0; |
| first_word[2] = 0; |
| first_word[3] = *(tmp->data); |
| csum_tcp = csum_partial(first_word, 4, csum_tcp); |
| offset = 1; |
| csum_len--; |
| overflowed = 0; |
| } |
| |
| csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp); |
| |
| /* Was the checksummed length odd? Then we have to merge the next |
| * byte correctly (see above) |
| */ |
| if (csum_len != (csum_len & (~1))) |
| overflowed = 1; |
| |
| if (mptcp_is_data_seq(tmp) && !dss_csum_added) { |
| __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32)); |
| |
| /* If a 64-bit dss is present, we increase the offset |
| * by 4 bytes, as the high-order 32 bits are added |
| * in the csum_partial-call below. |
| */ |
| u32 offset = skb_transport_offset(tmp) + |
| TCP_SKB_CB(tmp)->dss_off; |
| if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET) |
| offset += 4; |
| |
| csum_tcp = skb_checksum(tmp, offset, |
| MPTCP_SUB_LEN_SEQ_CSUM, |
| csum_tcp); |
| |
| csum_tcp = csum_partial(&data_seq, |
| sizeof(data_seq), csum_tcp); |
| |
| dss_csum_added = 1; /* Just do it once */ |
| } |
| last = tmp; |
| iter++; |
| |
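| /* Stop once the following skb starts at or beyond the end of the |
| * mapping that is currently being verified. |
| */ |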
| if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) && |
| !before(TCP_SKB_CB(tmp1)->seq, |
| tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
| break; |
| } |
| |
| /* Now, checksum must be 0 */ |
| if (unlikely(csum_fold(csum_tcp))) { |
| pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n", |
| __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq, |
| dss_csum_added, overflowed, iter); |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_CSUMFAIL); |
| tp->mptcp->send_mp_fail = 1; |
| |
| /* map_data_seq is the data-seq number of the |
| * mapping we are currently checking |
| */ |
| tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq; |
| |
| if (tp->mpcb->cnt_subflows > 1) { |
| mptcp_send_reset(sk); |
| ans = -1; |
| } else { |
| tp->mpcb->send_infinite_mapping = 1; |
| |
| /* Need to purge the rcv-queue as it's no more valid */ |
| while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { |
| tp->copied_seq = TCP_SKB_CB(tmp)->end_seq; |
| kfree_skb(tmp); |
| } |
| |
| ans = 0; |
| } |
| } |
| |
| return ans; |
| } |
| |
| static inline void mptcp_prepare_skb(struct sk_buff *skb, |
| const struct sock *sk) |
| { |
| const struct tcp_sock *tp = tcp_sk(sk); |
| struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
| u32 inc = 0, end_seq = tcb->end_seq; |
| |
| if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
| end_seq--; |
| /* If the skb is the end of this mapping (the end is always at a |
| * mapping boundary thanks to the splitting/trimming), then we need |
| * to increase the data-end-seq by 1 if this is a data-fin. |
| * |
| * We did the -1 above because end_seq includes the subflow-FIN. |
| */ |
| if (tp->mptcp->map_data_fin && |
| end_seq == tp->mptcp->map_subseq + tp->mptcp->map_data_len) { |
| inc = 1; |
| |
| /* We manually set the FIN flag if it is a data-fin, for easy |
| * processing in tcp_recvmsg. |
| */ |
| TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; |
| } else { |
| /* We may have a subflow-fin with data but without data-fin */ |
| TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_FIN; |
| } |
| |
| /* Adapt the data-seq's to the packet itself. We effectively transform |
| * the dss-mapping to a per-packet granularity. This is necessary to |
| * correctly handle overlapping mappings coming from different |
| * subflows. Otherwise it would be a complete mess. |
| */ |
| tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq; |
| tcb->end_seq = tcb->seq + skb->len + inc; |
| } |
| |
| /** |
| * @return: 1 if the segment has been eaten and can be suppressed, |
| * otherwise 0. |
| */ |
| static inline int mptcp_direct_copy(const struct sk_buff *skb, |
| struct sock *meta_sk) |
| { |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len); |
| int eaten = 0; |
| |
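| /* Copy directly into the buffer of the user-task blocked in |
| * tcp_recvmsg(). Bottom halves must be re-enabled around the copy, |
| * as it may fault on user memory. |
| */ |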
| __set_current_state(TASK_RUNNING); |
| |
| local_bh_enable(); |
| if (!skb_copy_datagram_msg(skb, 0, meta_tp->ucopy.msg, chunk)) { |
| meta_tp->ucopy.len -= chunk; |
| meta_tp->copied_seq += chunk; |
| eaten = (chunk == skb->len); |
| tcp_rcv_space_adjust(meta_sk); |
| } |
| local_bh_disable(); |
| return eaten; |
| } |
| |
| static inline void mptcp_reset_mapping(struct tcp_sock *tp, u32 old_copied_seq) |
| { |
| tp->mptcp->map_data_len = 0; |
| tp->mptcp->map_data_seq = 0; |
| tp->mptcp->map_subseq = 0; |
| tp->mptcp->map_data_fin = 0; |
| tp->mptcp->mapping_present = 0; |
| |
| /* In infinite mapping receiver mode, we have to advance the implied |
| * data-sequence number when we progress the subflow's data. |
| */ |
| if (tp->mpcb->infinite_mapping_rcv) |
| tp->mpcb->infinite_rcv_seq += (tp->copied_seq - old_copied_seq); |
| } |
| |
| /* The DSS-mapping received on the sk only covers the second half of the skb |
| * (cut at seq). We trim the head from the skb. |
| * Data will be freed upon kfree(). |
| * |
| * Inspired by tcp_trim_head(). |
| */ |
| static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq) |
| { |
| int len = seq - TCP_SKB_CB(skb)->seq; |
| u32 new_seq = TCP_SKB_CB(skb)->seq + len; |
| u32 delta_truesize; |
| |
| delta_truesize = __pskb_trim_head(skb, len); |
| |
| TCP_SKB_CB(skb)->seq = new_seq; |
| |
| if (delta_truesize) { |
| skb->truesize -= delta_truesize; |
| atomic_sub(delta_truesize, &sk->sk_rmem_alloc); |
| sk_mem_uncharge(sk, delta_truesize); |
| } |
| } |
| |
| /* The DSS-mapping received on the sk only covers the first half of the skb |
| * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue |
| * as further packets may resolve the mapping of the second half of data. |
| * |
| * Inspired by tcp_fragment(). |
| */ |
| static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq) |
| { |
| struct sk_buff *buff; |
| int nsize; |
| int nlen, len; |
| u8 flags; |
| |
| len = seq - TCP_SKB_CB(skb)->seq; |
| nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len; |
| if (nsize < 0) |
| nsize = 0; |
| |
| /* Get a new skb... force flag on. */ |
| buff = alloc_skb(nsize, GFP_ATOMIC); |
| if (buff == NULL) |
| return -ENOMEM; |
| |
| skb_reserve(buff, tcp_sk(sk)->tcp_header_len); |
| skb_reset_transport_header(buff); |
| |
| flags = TCP_SKB_CB(skb)->tcp_flags; |
| TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN); |
| TCP_SKB_CB(buff)->tcp_flags = flags; |
| |
| /* We absolutely need to call skb_set_owner_r before refreshing the |
| * truesize of buff, otherwise the moved data will be accounted twice. |
| */ |
| skb_set_owner_r(buff, sk); |
| nlen = skb->len - len - nsize; |
| buff->truesize += nlen; |
| skb->truesize -= nlen; |
| |
| /* Correct the sequence numbers. */ |
| TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; |
| TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; |
| TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; |
| |
| skb_split(skb, buff, len); |
| |
| __skb_queue_after(&sk->sk_receive_queue, skb, buff); |
| |
| return 0; |
| } |
| |
| /* @return: 0 - everything is fine, just continue processing |
| * 1 - the subflow is broken, stop everything |
| * -1 - this packet was broken, continue with the next one |
| */ |
| static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct mptcp_cb *mpcb = tp->mpcb; |
| |
| /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */ |
| if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && |
| !mptcp_is_data_fin(skb) && !mpcb->infinite_mapping_rcv) { |
| /* Remove a pure subflow-fin from the queue and increase |
| * copied_seq. |
| */ |
| tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
| __skb_unlink(skb, &sk->sk_receive_queue); |
| __kfree_skb(skb); |
| return -1; |
| } |
| |
| /* If we are not yet fully established and do not know the mapping for |
| * this segment, this path has to fallback to infinite or be torn down. |
| */ |
| if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) && |
| !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) { |
| pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n", |
| __func__, mpcb->mptcp_loc_token, |
| tp->mptcp->path_index, __builtin_return_address(0), |
| TCP_SKB_CB(skb)->seq); |
| |
| if (!is_master_tp(tp)) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_FBDATASUB); |
| mptcp_send_reset(sk); |
| return 1; |
| } |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_FBDATAINIT); |
| |
| mpcb->infinite_mapping_snd = 1; |
| mpcb->infinite_mapping_rcv = 1; |
| mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp)); |
| |
| mptcp_sub_force_close_all(mpcb, sk); |
| |
| /* We do a seamless fallback and should not send an infinite mapping. */ |
| mpcb->send_infinite_mapping = 0; |
| tp->mptcp->fully_established = 1; |
| } |
| |
| /* Receiver-side becomes fully established when a whole rcv-window has |
| * been received without the need to fallback due to the previous |
| * condition. |
| */ |
| if (!tp->mptcp->fully_established) { |
| tp->mptcp->init_rcv_wnd -= skb->len; |
| if (tp->mptcp->init_rcv_wnd < 0) |
| mptcp_become_fully_estab(sk); |
| } |
| |
| return 0; |
| } |
| |
| /* @return: 0 - everything is fine, just continue processing |
| * 1 - the subflow is broken, stop everything |
| * -1 - this packet was broken, continue with the next one |
| */ |
| static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb) |
| { |
| struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
| struct mptcp_cb *mpcb = tp->mpcb; |
| struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
| u32 *ptr; |
| u32 data_seq, sub_seq, data_len, tcp_end_seq; |
| bool set_infinite_rcv = false; |
| |
| /* If we are in infinite-mapping-mode, the subflow is guaranteed to be |
| * in-order at the data-level. Thus data-seq-numbers can be inferred |
| * from what is expected at the data-level. |
| */ |
| if (mpcb->infinite_mapping_rcv) { |
| /* copied_seq may be bigger than tcb->seq (e.g., when the peer |
| * retransmits data that actually has already been acknowledged with |
| * newer data, if it did not receive our acks). Thus, we need |
| * to account for this overlap as well. |
| */ |
| tp->mptcp->map_data_seq = mpcb->infinite_rcv_seq - (tp->copied_seq - tcb->seq); |
| tp->mptcp->map_subseq = tcb->seq; |
| tp->mptcp->map_data_len = skb->len; |
| tp->mptcp->map_data_fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); |
| tp->mptcp->mapping_present = 1; |
| return 0; |
| } |
| |
| /* No mapping here? Exit - it is either already set or still on its way */ |
| if (!mptcp_is_data_seq(skb)) { |
| /* Too many packets without a mapping - this subflow is broken */ |
| if (!tp->mptcp->mapping_present && |
| tp->rcv_nxt - tp->copied_seq > 65536) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_NODSSWINDOW); |
| mptcp_send_reset(sk); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
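| /* Parse the DSS mapping: data-level sequence number, subflow |
| * sequence number (made absolute by adding the subflow's rcv_isn) |
| * and the data-level length. |
| */ |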
| ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb); |
| ptr++; |
| sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn; |
| ptr++; |
| data_len = get_unaligned_be16(ptr); |
| |
| /* If it is an empty skb with DATA_FIN, sub_seq must be fixed up. |
| * The draft sets it to 0, but we really would like to have the |
| * real value, to simplify the handling later in this |
| * function. |
| */ |
| if (mptcp_is_data_fin(skb) && skb->len == 0) |
| sub_seq = TCP_SKB_CB(skb)->seq; |
| |
| /* If there is already a mapping - we check if it maps with the current |
| * one. If not - we reset. |
| */ |
| if (tp->mptcp->mapping_present && |
| (data_seq != (u32)tp->mptcp->map_data_seq || |
| sub_seq != tp->mptcp->map_subseq || |
| data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin || |
| mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) { |
| /* Mapping in packet is different from what we want */ |
| pr_err("%s Mappings do not match!\n", __func__); |
| pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n", |
| __func__, data_seq, (u32)tp->mptcp->map_data_seq, |
| sub_seq, tp->mptcp->map_subseq, data_len, |
| tp->mptcp->map_data_len, mptcp_is_data_fin(skb), |
| tp->mptcp->map_data_fin); |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_DSSNOMATCH); |
| mptcp_send_reset(sk); |
| return 1; |
| } |
| |
| /* If the previous check was good, the current mapping is valid and we exit. */ |
| if (tp->mptcp->mapping_present) |
| return 0; |
| |
| /* Mapping not yet set on this subflow - we set it here! */ |
| |
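| /* A data-level length of zero announces the infinite mapping: the |
| * peer is falling back to regular TCP on this subflow. |
| */ |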
| if (!data_len) { |
| mpcb->infinite_mapping_rcv = 1; |
| mpcb->send_infinite_mapping = 1; |
| tp->mptcp->fully_established = 1; |
| /* We need to repeat mp_fail's until the sender has fallen |
| * back to the infinite mapping - here we stop repeating it. |
| */ |
| tp->mptcp->send_mp_fail = 0; |
| |
| /* We have to fixup data_len - it must be the same as skb->len */ |
| data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0); |
| sub_seq = tcb->seq; |
| |
| mptcp_sub_force_close_all(mpcb, sk); |
| |
| /* data_seq and so on are set correctly */ |
| |
| /* At this point, the meta-ofo-queue has to be emptied, |
| * as the following data is guaranteed to be in-order at |
| * the data and subflow-level |
| */ |
| skb_rbtree_purge(&meta_tp->out_of_order_queue); |
| |
| set_infinite_rcv = true; |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_INFINITEMAPRX); |
| } |
| |
| /* We are sending mp-fail's and thus are in fallback mode. |
| * Ignore packets which do not announce the fallback and still |
| * want to provide a mapping. |
| */ |
| if (tp->mptcp->send_mp_fail) { |
| tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
| __skb_unlink(skb, &sk->sk_receive_queue); |
| __kfree_skb(skb); |
| return -1; |
| } |
| |
| /* FIN increased the mapping-length by 1 */ |
| if (mptcp_is_data_fin(skb)) |
| data_len--; |
| |
| /* The packet's subflow-sequence numbers must |
| * (at least partially) be part of the DSS-mapping's |
| * subflow-sequence space. |
| * |
| * Basically the mapping is not valid, if either of the |
| * following conditions is true: |
| * |
| * 1. It's not a data_fin and |
| * MPTCP-sub_seq >= TCP-end_seq |
| * |
| * 2. It's a data_fin and TCP-end_seq > TCP-seq and |
| * MPTCP-sub_seq >= TCP-end_seq |
| * |
| * The previous two can be merged into: |
| * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq |
| * Because if it's not a data-fin, TCP-end_seq > TCP-seq |
| * |
| * 3. It's a data_fin and skb->len == 0 and |
| * MPTCP-sub_seq > TCP-end_seq |
| * |
| * 4. It's not a data_fin and TCP-end_seq > TCP-seq and |
| * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq |
| */ |
| |
| /* subflow-fin is not part of the mapping - ignore it here ! */ |
| tcp_end_seq = tcb->end_seq; |
| if (tcb->tcp_flags & TCPHDR_FIN) |
| tcp_end_seq--; |
| if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) || |
| (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) || |
| (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq))) { |
| /* The packet's subflow-sequence numbers do not match the packet's |
| * dss-mapping. The peer is misbehaving - reset |
| */ |
| pr_err("%s Packet's mapping does not map to the DSS sub_seq %u " |
| "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u" |
| "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb), |
| skb->len, data_len, tp->copied_seq); |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_DSSTCPMISMATCH); |
| mptcp_send_reset(sk); |
| return 1; |
| } |
| |
| /* Does the DSS have 64-bit seqnums? */ |
| if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) { |
| /* Wrapped around? */ |
| if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) { |
| tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq); |
| } else { |
| /* Else, access the default high-order bits */ |
| tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq); |
| } |
| } else { |
| tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq); |
| |
| if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) { |
| /* We make sure that the data_seq is invalid. |
| * It will be dropped later. |
| */ |
| tp->mptcp->map_data_seq += 0xFFFFFFFF; |
| tp->mptcp->map_data_seq += 0xFFFFFFFF; |
| } |
| } |
| |
| if (set_infinite_rcv) |
| mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq; |
| |
| tp->mptcp->map_data_len = data_len; |
| tp->mptcp->map_subseq = sub_seq; |
| tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0; |
| tp->mptcp->mapping_present = 1; |
| |
| return 0; |
| } |
| |
| /* Similar to tcp_sequence(...) */ |
| static inline bool mptcp_sequence(const struct tcp_sock *meta_tp, |
| u64 data_seq, u64 end_data_seq) |
| { |
| const struct mptcp_cb *mpcb = meta_tp->mpcb; |
| u64 rcv_wup64; |
| |
| /* Wrap-around? */ |
| if (meta_tp->rcv_wup > meta_tp->rcv_nxt) { |
| rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) | |
| meta_tp->rcv_wup; |
| } else { |
| rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, |
| meta_tp->rcv_wup); |
| } |
| |
| return !before64(end_data_seq, rcv_wup64) && |
| !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp)); |
| } |
| |
| /* @return: 0 - everything is fine, just continue processing |
| * -1 - this packet was broken, continue with the next one |
| */ |
| static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct sk_buff *tmp, *tmp1; |
| u32 tcp_end_seq; |
| |
| if (!tp->mptcp->mapping_present) |
| return 0; |
| |
| /* Either the new skb gave us the mapping and the first segment |
| * in the sub-rcv-queue has to be trimmed ... |
| */ |
| tmp = skb_peek(&sk->sk_receive_queue); |
| if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) && |
| after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_DSSTRIMHEAD); |
| mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq); |
| } |
| |
| /* ... or the new skb (tail) has to be split at the end. */ |
| tcp_end_seq = TCP_SKB_CB(skb)->end_seq; |
| if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
| tcp_end_seq--; |
| if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) { |
| u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len; |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_DSSSPLITTAIL); |
| if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */ |
| /* TODO : maybe handle this here better. |
| * We now just force meta-retransmission. |
| */ |
| tp->copied_seq = TCP_SKB_CB(skb)->end_seq; |
| __skb_unlink(skb, &sk->sk_receive_queue); |
| __kfree_skb(skb); |
| return -1; |
| } |
| } |
| |
| /* Now, remove old sk_buff's from the receive-queue. |
| * This may happen if the mapping has been lost for these segments and |
| * the next mapping has already been received. |
| */ |
| if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) { |
| skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
| if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq)) |
| break; |
| |
| tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
| __skb_unlink(tmp1, &sk->sk_receive_queue); |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_PURGEOLD); |
| /* Impossible that we could free skb here, because its |
| * mapping is known to be valid from previous checks |
| */ |
| __kfree_skb(tmp1); |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* @return: 0 - everything is fine, just continue processing |
| * 1 - the subflow is broken, stop everything |
| * -1 - this mapping has been put in the meta-receive-queue |
| * -2 - this mapping has been eaten by the application |
| */ |
| static int mptcp_queue_skb(struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp); |
| struct sock *meta_sk = mptcp_meta_sk(sk); |
| struct mptcp_cb *mpcb = tp->mpcb; |
| struct sk_buff *tmp, *tmp1; |
| u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp); |
| u32 old_copied_seq = tp->copied_seq; |
| bool data_queued = false; |
| |
| /* Have we not yet received the full mapping? */ |
| if (!tp->mptcp->mapping_present || |
| before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
| return 0; |
| |
| /* Is this an overlapping mapping? rcv_nxt >= end_data_seq |
| * OR |
| * This mapping is out of window |
| */ |
| if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) || |
| !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq, |
| tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) { |
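| /* The whole mapping is either a data-level duplicate or out of the |
| * meta receive-window - drop its skbs from the subflow |
| * receive-queue and forget the mapping. |
| */ |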
| skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
| __skb_unlink(tmp1, &sk->sk_receive_queue); |
| tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
| __kfree_skb(tmp1); |
| |
| if (!skb_queue_empty(&sk->sk_receive_queue) && |
| !before(TCP_SKB_CB(tmp)->seq, |
| tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
| break; |
| } |
| |
| mptcp_reset_mapping(tp, old_copied_seq); |
| |
| return -1; |
| } |
| |
| /* Record it, because we want to send our data_fin on the same path */ |
| if (tp->mptcp->map_data_fin) { |
| mpcb->dfin_path_index = tp->mptcp->path_index; |
| mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN); |
| } |
| |
| /* Verify the checksum */ |
| if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) { |
| int ret = mptcp_verif_dss_csum(sk); |
| |
| if (ret <= 0) { |
| mptcp_reset_mapping(tp, old_copied_seq); |
| return 1; |
| } |
| } |
| |
| if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) { |
| /* Seg's have to go to the meta-ofo-queue */ |
| skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
| tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
| mptcp_prepare_skb(tmp1, sk); |
| __skb_unlink(tmp1, &sk->sk_receive_queue); |
| /* MUST be done here, because fragstolen may be true later. |
| * Then, kfree_skb_partial will not account the memory. |
| */ |
| skb_orphan(tmp1); |
| |
| if (!mpcb->in_time_wait) /* In time-wait, do not receive data */ |
| tcp_data_queue_ofo(meta_sk, tmp1); |
| else |
| __kfree_skb(tmp1); |
| |
| if (!skb_queue_empty(&sk->sk_receive_queue) && |
| !before(TCP_SKB_CB(tmp)->seq, |
| tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
| break; |
| } |
| tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); |
| } else { |
| /* Ready for the meta-rcv-queue */ |
| skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) { |
| int eaten = 0; |
| bool fragstolen = false; |
| u32 old_rcv_nxt = meta_tp->rcv_nxt; |
| |
| tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq; |
| mptcp_prepare_skb(tmp1, sk); |
| __skb_unlink(tmp1, &sk->sk_receive_queue); |
| /* MUST be done here, because fragstolen may be true. |
| * Then, kfree_skb_partial will not account the memory. |
| */ |
| skb_orphan(tmp1); |
| |
| /* This segment has already been received */ |
| if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) { |
| __kfree_skb(tmp1); |
| goto next; |
| } |
| |
| /* Is direct copy possible ? */ |
| if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt && |
| meta_tp->ucopy.task == current && |
| meta_tp->copied_seq == meta_tp->rcv_nxt && |
| meta_tp->ucopy.len && sock_owned_by_user(meta_sk)) |
| eaten = mptcp_direct_copy(tmp1, meta_sk); |
| |
| if (mpcb->in_time_wait) /* In time-wait, do not receive data */ |
| eaten = 1; |
| |
| if (!eaten) |
| eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen); |
| |
| meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq; |
| mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt); |
| |
| if (TCP_SKB_CB(tmp1)->tcp_flags & TCPHDR_FIN) |
| mptcp_fin(meta_sk); |
| |
| /* Check if this fills a gap in the ofo queue */ |
| if (!RB_EMPTY_ROOT(&meta_tp->out_of_order_queue)) |
| tcp_ofo_queue(meta_sk); |
| |
| if (eaten) |
| kfree_skb_partial(tmp1, fragstolen); |
| |
| data_queued = true; |
| next: |
| if (!skb_queue_empty(&sk->sk_receive_queue) && |
| !before(TCP_SKB_CB(tmp)->seq, |
| tp->mptcp->map_subseq + tp->mptcp->map_data_len)) |
| break; |
| } |
| } |
| |
| inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| mptcp_reset_mapping(tp, old_copied_seq); |
| |
| return data_queued ? -1 : -2; |
| } |
| |
| void mptcp_data_ready(struct sock *sk) |
| { |
| struct sock *meta_sk = mptcp_meta_sk(sk); |
| struct sk_buff *skb, *tmp; |
| int queued = 0; |
| |
| /* restart before the check, because mptcp_fin might have changed the |
| * state. |
| */ |
| restart: |
| /* If the meta cannot receive data, there is no point in pushing data. |
| * If we are in time-wait, we may still be waiting for the final FIN. |
| * So, we should proceed with the processing. |
| */ |
| if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) { |
| skb_queue_purge(&sk->sk_receive_queue); |
| tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt; |
| goto exit; |
| } |
| |
| /* Iterate over all segments, detect their mapping (if we don't have |
| * one yet), validate them and push everything one level higher. |
| */ |
| skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { |
| int ret; |
| /* Pre-validation - e.g., early fallback */ |
| ret = mptcp_prevalidate_skb(sk, skb); |
| if (ret < 0) |
| goto restart; |
| else if (ret > 0) |
| break; |
| |
| /* Set the current mapping */ |
| ret = mptcp_detect_mapping(sk, skb); |
| if (ret < 0) |
| goto restart; |
| else if (ret > 0) |
| break; |
| |
| /* Validation */ |
| if (mptcp_validate_mapping(sk, skb) < 0) |
| goto restart; |
| |
| /* Push a level higher */ |
| ret = mptcp_queue_skb(sk); |
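| /* ret < 0: the mapping was pushed to the meta-level (or eaten) and |
| * skbs have been unlinked - restart the walk. ret == 1: the subflow |
| * is broken - stop. ret == 0: mapping not yet complete - keep going. |
| */ |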
| if (ret < 0) { |
| if (ret == -1) |
| queued = ret; |
| goto restart; |
| } else if (ret == 0) { |
| continue; |
| } else { /* ret == 1 */ |
| break; |
| } |
| } |
| |
| exit: |
| if (tcp_sk(sk)->close_it) { |
| tcp_send_ack(sk); |
| tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0); |
| } |
| |
| if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD)) |
| meta_sk->sk_data_ready(meta_sk); |
| } |
| |
| struct mp_join *mptcp_find_join(const struct sk_buff *skb) |
| { |
| const struct tcphdr *th = tcp_hdr(skb); |
| unsigned char *ptr; |
| int length = (th->doff * 4) - sizeof(struct tcphdr); |
| |
| /* Jump through the options to check whether JOIN is there */ |
| ptr = (unsigned char *)(th + 1); |
| while (length > 0) { |
| int opcode = *ptr++; |
| int opsize; |
| |
| switch (opcode) { |
| case TCPOPT_EOL: |
| return NULL; |
| case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
| length--; |
| continue; |
| default: |
| opsize = *ptr++; |
| if (opsize < 2) /* "silly options" */ |
| return NULL; |
| if (opsize > length) |
| return NULL; /* don't parse partial options */ |
| if (opcode == TCPOPT_MPTCP && |
| ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) { |
| return (struct mp_join *)(ptr - 2); |
| } |
| ptr += opsize - 2; |
| length -= opsize; |
| } |
| } |
| return NULL; |
| } |
| |
| int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw) |
| { |
| const struct mptcp_cb *mpcb; |
| struct sock *meta_sk; |
| u32 token; |
| bool meta_v4; |
| struct mp_join *join_opt = mptcp_find_join(skb); |
| if (!join_opt) |
| return 0; |
| |
| /* MPTCP structures were not initialized, so return error */ |
| if (mptcp_init_failed) |
| return -1; |
| |
| token = join_opt->u.syn.token; |
| meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token); |
| if (!meta_sk) { |
| MPTCP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); |
| mptcp_debug("%s:mpcb not found:%x\n", __func__, token); |
| return -1; |
| } |
| |
| meta_v4 = meta_sk->sk_family == AF_INET; |
| if (meta_v4) { |
| if (skb->protocol == htons(ETH_P_IPV6)) { |
| mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return -1; |
| } |
| } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { |
| mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return -1; |
| } |
| |
| mpcb = tcp_sk(meta_sk)->mpcb; |
| if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) { |
| /* We are in fallback-mode on the reception-side - |
| * no new subflows! |
| */ |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| MPTCP_INC_STATS_BH(sock_net(meta_sk), MPTCP_MIB_JOINFALLBACK); |
| return -1; |
| } |
| |
| /* Coming from time-wait-sock processing in tcp_v4_rcv. |
| * We have to deschedule it before continuing, because otherwise |
| * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req. |
| */ |
| if (tw) |
| inet_twsk_deschedule_put(tw); |
| |
| /* OK, this is a new syn/join, let's create a new open request and |
| * send syn+ack |
| */ |
| bh_lock_sock_nested(meta_sk); |
| if (sock_owned_by_user(meta_sk)) { |
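| /* The meta-socket is owned by the user - queue the join on the |
| * backlog so that it is processed once the socket-lock is released. |
| */ |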
| skb->sk = meta_sk; |
| if (unlikely(sk_add_backlog(meta_sk, skb, |
| meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) { |
| bh_unlock_sock(meta_sk); |
| NET_INC_STATS_BH(sock_net(meta_sk), |
| LINUX_MIB_TCPBACKLOGDROP); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| kfree_skb(skb); |
| return 1; |
| } |
| } else if (skb->protocol == htons(ETH_P_IP)) { |
| tcp_v4_do_rcv(meta_sk, skb); |
| #if IS_ENABLED(CONFIG_IPV6) |
| } else { |
| tcp_v6_do_rcv(meta_sk, skb); |
| #endif /* CONFIG_IPV6 */ |
| } |
| bh_unlock_sock(meta_sk); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return 1; |
| } |
| |
| int mptcp_do_join_short(struct sk_buff *skb, |
| const struct mptcp_options_received *mopt, |
| struct net *net) |
| { |
| struct sock *meta_sk; |
| u32 token; |
| bool meta_v4; |
| |
| token = mopt->mptcp_rem_token; |
| meta_sk = mptcp_hash_find(net, token); |
| if (!meta_sk) { |
| MPTCP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN); |
| mptcp_debug("%s:mpcb not found:%x\n", __func__, token); |
| return -1; |
| } |
| |
| meta_v4 = meta_sk->sk_family == AF_INET; |
| if (meta_v4) { |
| if (skb->protocol == htons(ETH_P_IPV6)) { |
| mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n"); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return -1; |
| } |
| } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) { |
| mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n"); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return -1; |
| } |
| |
| /* OK, this is a new syn/join, let's create a new open request and |
| * send syn+ack |
| */ |
| bh_lock_sock(meta_sk); |
| |
| /* This check is also done in mptcp_vX_do_rcv. But, there we cannot |
| * call tcp_vX_send_reset, because we already hold two socket-locks. |
| * (the listener and the meta from above) |
| * |
| * And the send-reset will try to take yet another one (ip_send_reply). |
| * Thus, we propagate the reset up to tcp_rcv_state_process. |
| */ |
| if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv || |
| tcp_sk(meta_sk)->mpcb->send_infinite_mapping || |
| meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) { |
| MPTCP_INC_STATS_BH(sock_net(meta_sk), MPTCP_MIB_JOINFALLBACK); |
| bh_unlock_sock(meta_sk); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return -1; |
| } |
| |
| if (sock_owned_by_user(meta_sk)) { |
| skb->sk = meta_sk; |
| if (unlikely(sk_add_backlog(meta_sk, skb, |
| meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) |
| NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); |
| else |
| /* Must make sure that upper layers won't free the |
| * skb if it is added to the backlog-queue. |
| */ |
| skb_get(skb); |
| } else { |
| /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as |
| * the skb will finally be freed by tcp_v4_do_rcv (where we are |
| * coming from) |
| */ |
| skb_get(skb); |
| if (skb->protocol == htons(ETH_P_IP)) { |
| tcp_v4_do_rcv(meta_sk, skb); |
| #if IS_ENABLED(CONFIG_IPV6) |
| } else { /* IPv6 */ |
| tcp_v6_do_rcv(meta_sk, skb); |
| #endif /* CONFIG_IPV6 */ |
| } |
| } |
| |
| bh_unlock_sock(meta_sk); |
| sock_put(meta_sk); /* Taken by mptcp_hash_find */ |
| return 0; |
| } |
| |
| /** |
| * Equivalent of tcp_fin() for MPTCP |
| * Can be called only when the FIN is validly part |
| * of the data seqnum space - not before, while there are still holes. |
| */ |
| void mptcp_fin(struct sock *meta_sk) |
| { |
| struct sock *sk = NULL, *sk_it; |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| struct mptcp_cb *mpcb = meta_tp->mpcb; |
| unsigned char state; |
| |
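| /* Prefer to ack the DATA_FIN on the subflow that carried it, |
| * otherwise pick any subflow that is able to send an ack. |
| */ |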
| mptcp_for_each_sk(mpcb, sk_it) { |
| if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) { |
| sk = sk_it; |
| break; |
| } |
| } |
| |
| if (!sk || sk->sk_state == TCP_CLOSE) |
| sk = mptcp_select_ack_sock(meta_sk); |
| |
| inet_csk_schedule_ack(sk); |
| |
| if (!mpcb->in_time_wait) { |
| meta_sk->sk_shutdown |= RCV_SHUTDOWN; |
| sock_set_flag(meta_sk, SOCK_DONE); |
| state = meta_sk->sk_state; |
| } else { |
| state = mpcb->mptw_state; |
| } |
| |
| switch (state) { |
| case TCP_SYN_RECV: |
| case TCP_ESTABLISHED: |
| /* Move to CLOSE_WAIT */ |
| tcp_set_state(meta_sk, TCP_CLOSE_WAIT); |
| inet_csk(sk)->icsk_ack.pingpong = 1; |
| break; |
| |
| case TCP_CLOSE_WAIT: |
| case TCP_CLOSING: |
| /* Received a retransmission of the FIN, do |
| * nothing. |
| */ |
| break; |
| case TCP_LAST_ACK: |
| /* RFC793: Remain in the LAST-ACK state. */ |
| break; |
| |
| case TCP_FIN_WAIT1: |
| /* This case occurs when a simultaneous close |
| * happens, we must ack the received FIN and |
| * enter the CLOSING state. |
| */ |
| tcp_send_ack(sk); |
| tcp_set_state(meta_sk, TCP_CLOSING); |
| break; |
| case TCP_FIN_WAIT2: |
| /* Received a FIN -- send ACK and enter TIME_WAIT. */ |
| tcp_send_ack(sk); |
| meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0); |
| break; |
| default: |
| /* Only TCP_LISTEN and TCP_CLOSE are left, in these |
| * cases we should never reach this piece of code. |
| */ |
| pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__, |
| meta_sk->sk_state); |
| break; |
| } |
| |
| /* It _is_ possible, that we have something out-of-order _after_ FIN. |
| * Probably, we should reset in this case. For now drop them. |
| */ |
| skb_rbtree_purge(&meta_tp->out_of_order_queue); |
| sk_mem_reclaim(meta_sk); |
| |
| if (!sock_flag(meta_sk, SOCK_DEAD)) { |
| meta_sk->sk_state_change(meta_sk); |
| |
| /* Do not send POLL_HUP for half duplex close. */ |
| if (meta_sk->sk_shutdown == SHUTDOWN_MASK || |
| meta_sk->sk_state == TCP_CLOSE) |
| sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP); |
| else |
| sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN); |
| } |
| |
| return; |
| } |
| |
| static void mptcp_xmit_retransmit_queue(struct sock *meta_sk) |
| { |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| struct sk_buff *skb; |
| |
| if (!meta_tp->packets_out) |
| return; |
| |
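| /* Retransmit every outstanding segment of the meta write-queue and |
| * re-arm the retransmit timer once the head has been retransmitted. |
| */ |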
| tcp_for_write_queue(skb, meta_sk) { |
| if (skb == tcp_send_head(meta_sk)) |
| break; |
| |
| if (mptcp_retransmit_skb(meta_sk, skb)) |
| return; |
| |
| if (skb == tcp_write_queue_head(meta_sk)) |
| inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, |
| inet_csk(meta_sk)->icsk_rto, |
| TCP_RTO_MAX); |
| } |
| } |
| |
| /* Handle the DATA_ACK */ |
| static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb) |
| { |
| struct sock *meta_sk = mptcp_meta_sk(sk); |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk); |
| struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
| u32 prior_snd_una = meta_tp->snd_una; |
| int prior_packets; |
| u32 nwin, data_ack, data_seq; |
| u16 data_len = 0; |
| |
| /* A valid packet came in - subflow is operational again */ |
| tp->pf = 0; |
| |
| /* Even if there is no data-ack, we stop retransmitting. |
| * Except if this is a SYN/ACK - then it is just a retransmission. |
| */ |
| if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) { |
| tp->mptcp->pre_established = 0; |
| sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer); |
| } |
| |
| /* If we are in infinite mapping mode, rx_opt.data_ack has been |
| * set by mptcp_clean_rtx_infinite. |
| */ |
| if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd) |
| goto exit; |
| |
| data_ack = tp->mptcp->rx_opt.data_ack; |
| |
| if (unlikely(!tp->mptcp->fully_established) && |
| tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq) |
| /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1) |
| * includes a data-ack, we are fully established |
| */ |
| mptcp_become_fully_estab(sk); |
| |
| /* Get the data_seq */ |
| if (mptcp_is_data_seq(skb)) { |
| data_seq = tp->mptcp->rx_opt.data_seq; |
| data_len = tp->mptcp->rx_opt.data_len; |
| } else { |
| data_seq = meta_tp->snd_wl1; |
| } |
| |
| /* If the ack is older than previous acks |
| * then we can probably ignore it. |
| */ |
| if (before(data_ack, prior_snd_una)) |
| goto exit; |
| |
| /* If the ack includes data we haven't sent yet, discard |
| * this segment (RFC793 Section 3.9). |
| */ |
| if (after(data_ack, meta_tp->snd_nxt)) |
| goto exit; |
| |
| /*** Now, update the window - inspired by tcp_ack_update_window ***/ |
| nwin = ntohs(tcp_hdr(skb)->window); |
| |
| if (likely(!tcp_hdr(skb)->syn)) |
| nwin <<= tp->rx_opt.snd_wscale; |
| |
| if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) { |
| tcp_update_wl(meta_tp, data_seq); |
| |
| /* Draft v09, Section 3.3.5: |
| * [...] It should only update its local receive window values |
| * when the largest sequence number allowed (i.e. DATA_ACK + |
| * receive window) increases. [...] |
| */ |
| if (meta_tp->snd_wnd != nwin && |
| !before(data_ack + nwin, tcp_wnd_end(meta_tp))) { |
| meta_tp->snd_wnd = nwin; |
| |
| if (nwin > meta_tp->max_window) |
| meta_tp->max_window = nwin; |
| } |
| } |
| /*** Done, update the window ***/ |
| |
| /* We passed data and got it acked, remove any soft error |
| * log. Something worked... |
| */ |
| sk->sk_err_soft = 0; |
| inet_csk(meta_sk)->icsk_probes_out = 0; |
| meta_tp->rcv_tstamp = tcp_time_stamp; |
| prior_packets = meta_tp->packets_out; |
| if (!prior_packets) |
| goto no_queue; |
| |
| meta_tp->snd_una = data_ack; |
| |
| mptcp_clean_rtx_queue(meta_sk, prior_snd_una); |
| |
| /* We are in loss-state, and something got acked, retransmit the whole |
| * queue now! |
| */ |
| if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss && |
| after(data_ack, prior_snd_una)) { |
| mptcp_xmit_retransmit_queue(meta_sk); |
| inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open; |
| } |
| |
| /* Simplified version of tcp_new_space, because the snd-buffer |
| * is handled by all the subflows. |
| */ |
| if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) { |
| sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK); |
| if (meta_sk->sk_socket && |
| test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) |
| meta_sk->sk_write_space(meta_sk); |
| } |
| |
| if (meta_sk->sk_state != TCP_ESTABLISHED && |
| mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len)) |
| return; |
| |
| exit: |
| mptcp_push_pending_frames(meta_sk); |
| |
| return; |
| |
| no_queue: |
| if (tcp_send_head(meta_sk)) |
| tcp_ack_probe(meta_sk); |
| |
| mptcp_push_pending_frames(meta_sk); |
| |
| return; |
| } |
| |
| void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk)); |
| |
| if (!tp->mpcb->infinite_mapping_snd) |
| return; |
| |
| /* The difference between the meta and subflow snd_nxt represents the |
| * offset between the data-sequence and subflow-sequence space. As the |
| * mapping is infinite, this offset is constant. |
| * |
| * Thus, from this difference we can infer the meta snd_una. |
| */ |
| tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt + |
| tp->snd_una; |
| |
| mptcp_data_ack(sk, skb); |
| } |
| |
| /**** static functions used by mptcp_parse_options */ |
| |
| static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id) |
| { |
| struct sock *sk_it, *tmpsk; |
| |
| mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) { |
| if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) { |
| mptcp_reinject_data(sk_it, 0); |
| mptcp_send_reset(sk_it); |
| } |
| } |
| } |
| |
| static inline bool is_valid_addropt_opsize(u8 mptcp_ver, |
| struct mp_add_addr *mpadd, |
| int opsize) |
| { |
| #if IS_ENABLED(CONFIG_IPV6) |
| if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 6) { |
| return opsize == MPTCP_SUB_LEN_ADD_ADDR6 || |
| opsize == MPTCP_SUB_LEN_ADD_ADDR6 + 2; |
| } |
| if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 6) |
| return opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 || |
| opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2; |
| #endif |
| if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 4) { |
| return opsize == MPTCP_SUB_LEN_ADD_ADDR4 || |
| opsize == MPTCP_SUB_LEN_ADD_ADDR4 + 2; |
| } |
| if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 4) { |
| return opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 || |
| opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2; |
| } |
| return false; |
| } |
| |
| void mptcp_parse_options(const uint8_t *ptr, int opsize, |
| struct mptcp_options_received *mopt, |
| const struct sk_buff *skb, |
| struct tcp_sock *tp) |
| { |
| const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr; |
| |
| /* If the socket is mp-capable we would have a mopt. */ |
| if (!mopt) |
| return; |
| |
| switch (mp_opt->sub) { |
| case MPTCP_SUB_CAPABLE: |
| { |
| const struct mp_capable *mpcapable = (struct mp_capable *)ptr; |
| |
| if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN && |
| opsize != MPTCP_SUB_LEN_CAPABLE_ACK) { |
| mptcp_debug("%s: mp_capable: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| /* MPTCP-RFC 6824: |
| * "If receiving a message with the 'B' flag set to 1, and this |
| * is not understood, then this SYN MUST be silently ignored;" |
| */ |
| if (mpcapable->b) { |
| mopt->drop_me = 1; |
| break; |
| } |
| |
| /* MPTCP-RFC 6824: |
| * "An implementation that only supports this method MUST set |
| * bit "H" to 1, and bits "C" through "G" to 0." |
| */ |
| if (!mpcapable->h) |
| break; |
| |
| mopt->saw_mpc = 1; |
| mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a; |
| |
| if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN) |
| mopt->mptcp_sender_key = mpcapable->sender_key; |
| if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK) |
| mopt->mptcp_receiver_key = mpcapable->receiver_key; |
| |
| mopt->mptcp_ver = mpcapable->ver; |
| break; |
| } |
| case MPTCP_SUB_JOIN: |
| { |
| const struct mp_join *mpjoin = (struct mp_join *)ptr; |
| |
| if (opsize != MPTCP_SUB_LEN_JOIN_SYN && |
| opsize != MPTCP_SUB_LEN_JOIN_SYNACK && |
| opsize != MPTCP_SUB_LEN_JOIN_ACK) { |
| mptcp_debug("%s: mp_join: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| /* saw_mpc must be set, because in tcp_check_req we assume that |
| * it is set to support falling back to reg. TCP if a rexmitted |
| * SYN has no MP_CAPABLE or MP_JOIN |
| */ |
| switch (opsize) { |
| case MPTCP_SUB_LEN_JOIN_SYN: |
| mopt->is_mp_join = 1; |
| mopt->saw_mpc = 1; |
| mopt->low_prio = mpjoin->b; |
| mopt->rem_id = mpjoin->addr_id; |
| mopt->mptcp_rem_token = mpjoin->u.syn.token; |
| mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce; |
| break; |
| case MPTCP_SUB_LEN_JOIN_SYNACK: |
| mopt->saw_mpc = 1; |
| mopt->low_prio = mpjoin->b; |
| mopt->rem_id = mpjoin->addr_id; |
| mopt->mptcp_recv_tmac = mpjoin->u.synack.mac; |
| mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce; |
| break; |
| case MPTCP_SUB_LEN_JOIN_ACK: |
| mopt->saw_mpc = 1; |
| mopt->join_ack = 1; |
| memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20); |
| break; |
| } |
| break; |
| } |
| case MPTCP_SUB_DSS: |
| { |
| const struct mp_dss *mdss = (struct mp_dss *)ptr; |
| struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); |
| |
| /* We check opsize for the csum and non-csum case. We do this, |
| * because the draft says that the csum SHOULD be ignored if |
| * it has not been negotiated in the MP_CAPABLE but still is |
| * present in the data. |
| * |
| * It will get ignored later in mptcp_queue_skb. |
| */ |
| if (opsize != mptcp_sub_len_dss(mdss, 0) && |
| opsize != mptcp_sub_len_dss(mdss, 1)) { |
| mptcp_debug("%s: mp_dss: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
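| /* Skip the 4-byte DSS header (kind, length, subtype and flags) - |
| * the optional data-ack and mapping fields follow. |
| */ |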
| ptr += 4; |
| |
| if (mdss->A) { |
| tcb->mptcp_flags |= MPTCPHDR_ACK; |
| |
| if (mdss->a) { |
| mopt->data_ack = (u32) get_unaligned_be64(ptr); |
| ptr += MPTCP_SUB_LEN_ACK_64; |
| } else { |
| mopt->data_ack = get_unaligned_be32(ptr); |
| ptr += MPTCP_SUB_LEN_ACK; |
| } |
| } |
| |
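| /* Remember the offset of the DSS mapping inside the TCP header - |
| * the DSS-checksum verification needs it later. |
| */ |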
| tcb->dss_off = (ptr - skb_transport_header(skb)); |
| |
| if (mdss->M) { |
| if (mdss->m) { |
| u64 data_seq64 = get_unaligned_be64(ptr); |
| |
| tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET; |
| mopt->data_seq = (u32) data_seq64; |
| |
| ptr += 12; /* 64-bit dseq + subseq */ |
| } else { |
| mopt->data_seq = get_unaligned_be32(ptr); |
| ptr += 8; /* 32-bit dseq + subseq */ |
| } |
| mopt->data_len = get_unaligned_be16(ptr); |
| |
| tcb->mptcp_flags |= MPTCPHDR_SEQ; |
| |
| /* Is a check-sum present? */ |
| if (opsize == mptcp_sub_len_dss(mdss, 1)) |
| tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM; |
| |
| /* DATA_FIN only possible with DSS-mapping */ |
| if (mdss->F) |
| tcb->mptcp_flags |= MPTCPHDR_FIN; |
| } |
| |
| break; |
| } |
| case MPTCP_SUB_ADD_ADDR: |
| { |
| struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; |
| |
| /* If the tcp_sock is not available, the MPTCP version cannot be |
| * retrieved and ADD_ADDR opsize validation is not possible. |
| * mpcb and mp_add_addr must not be NULL when verifying the option. |
| */ |
| if (!tp || !tp->mpcb || !mpadd) |
| break; |
| |
| if (!is_valid_addropt_opsize(tp->mpcb->mptcp_ver, |
| mpadd, opsize)) { |
| mptcp_debug("%s: mp_add_addr: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| /* We have to manually parse the options if we got two of them. */ |
| if (mopt->saw_add_addr) { |
| mopt->more_add_addr = 1; |
| break; |
| } |
| mopt->saw_add_addr = 1; |
| mopt->add_addr_ptr = ptr; |
| break; |
| } |
| case MPTCP_SUB_REMOVE_ADDR: |
| if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) { |
| mptcp_debug("%s: mp_remove_addr: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| if (mopt->saw_rem_addr) { |
| mopt->more_rem_addr = 1; |
| break; |
| } |
| mopt->saw_rem_addr = 1; |
| mopt->rem_addr_ptr = ptr; |
| break; |
| case MPTCP_SUB_PRIO: |
| { |
| const struct mp_prio *mpprio = (struct mp_prio *)ptr; |
| |
| if (opsize != MPTCP_SUB_LEN_PRIO && |
| opsize != MPTCP_SUB_LEN_PRIO_ADDR) { |
| mptcp_debug("%s: mp_prio: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| mopt->saw_low_prio = 1; |
| mopt->low_prio = mpprio->b; |
| |
| if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) { |
| mopt->saw_low_prio = 2; |
| mopt->prio_addr_id = mpprio->addr_id; |
| } |
| break; |
| } |
| case MPTCP_SUB_FAIL: |
| if (opsize != MPTCP_SUB_LEN_FAIL) { |
| mptcp_debug("%s: mp_fail: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| mopt->mp_fail = 1; |
| break; |
| case MPTCP_SUB_FCLOSE: |
| if (opsize != MPTCP_SUB_LEN_FCLOSE) { |
| mptcp_debug("%s: mp_fclose: bad option size %d\n", |
| __func__, opsize); |
| break; |
| } |
| |
| mopt->mp_fclose = 1; |
| mopt->mptcp_sender_key = ((struct mp_fclose *)ptr)->key; |
| |
| break; |
| default: |
| mptcp_debug("%s: Received unkown subtype: %d\n", |
| __func__, mp_opt->sub); |
| break; |
| } |
| } |
| |
| /** Parse only MPTCP options */ |
| void tcp_parse_mptcp_options(const struct sk_buff *skb, |
| struct mptcp_options_received *mopt) |
| { |
| const struct tcphdr *th = tcp_hdr(skb); |
| int length = (th->doff * 4) - sizeof(struct tcphdr); |
| const unsigned char *ptr = (const unsigned char *)(th + 1); |
| |
| while (length > 0) { |
| int opcode = *ptr++; |
| int opsize; |
| |
| switch (opcode) { |
| case TCPOPT_EOL: |
| return; |
| case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ |
| length--; |
| continue; |
| default: |
| opsize = *ptr++; |
| if (opsize < 2) /* "silly options" */ |
| return; |
| if (opsize > length) |
| return; /* don't parse partial options */ |
| if (opcode == TCPOPT_MPTCP) |
| mptcp_parse_options(ptr - 2, opsize, mopt, skb, NULL); |
| } |
| ptr += opsize - 2; |
| length -= opsize; |
| } |
| } |
| |
| int mptcp_check_rtt(const struct tcp_sock *tp, int time) |
| { |
| struct mptcp_cb *mpcb = tp->mpcb; |
| struct sock *sk; |
| u32 rtt_max = 0; |
| |
| /* In MPTCP, we take the maximum RTT across all subflows, |
| * in order to account for the meta-level reordering buffer. |
| */ |
| mptcp_for_each_sk(mpcb, sk) { |
| if (!mptcp_sk_can_recv(sk)) |
| continue; |
| |
| if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt) |
| rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt; |
| } |
| if (time < (rtt_max >> 3) || !rtt_max) |
| return 1; |
| |
| return 0; |
| } |
| |
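| /* Handle a received ADD_ADDR option. For MPTCP v1, the option carries |
| * an 8-byte truncated HMAC, keyed with the peer's key (the second key |
| * is zeroed here), computed over the address ID, the address and, if |
| * present, the port; it must match before the address is accepted. The |
| * address is then handed to the path-manager via pm_ops->add_raddr(). |
| */ |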
| static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk) |
| { |
| struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; |
| struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
| __be16 port = 0; |
| union inet_addr addr; |
| sa_family_t family; |
| |
| if (mpadd->ipver == 4) { |
| char *recv_hmac; |
| u8 hash_mac_check[20]; |
| u8 no_key[8]; |
| int msg_parts = 0; |
| |
| if (mpcb->mptcp_ver < MPTCP_VERSION_1) |
| goto skip_hmac_v4; |
| |
| *(u64 *)no_key = 0; |
| recv_hmac = (char *)mpadd->u.v4.mac; |
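| /* If no port is echoed on the wire, the truncated HMAC starts where |
| * the port field sits in struct mp_add_addr, and only two message |
| * parts (address ID and address) are covered by it. The IPv6 branch |
| * below follows the same logic. |
| */ |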
| if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1) { |
| recv_hmac -= sizeof(mpadd->u.v4.port); |
| msg_parts = 2; |
| } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) { |
| msg_parts = 3; |
| } |
| mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, |
| (u8 *)no_key, |
| (u32 *)hash_mac_check, msg_parts, |
| 1, (u8 *)&mpadd->addr_id, |
| 4, (u8 *)&mpadd->u.v4.addr.s_addr, |
| 2, (u8 *)&mpadd->u.v4.port); |
| if (memcmp(hash_mac_check, recv_hmac, 8) != 0) |
| /* ADD_ADDR2 discarded */ |
| return; |
| skip_hmac_v4: |
| if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && |
| mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) || |
| (mpcb->mptcp_ver == MPTCP_VERSION_1 && |
| mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2)) |
| port = mpadd->u.v4.port; |
| family = AF_INET; |
| addr.in = mpadd->u.v4.addr; |
| #if IS_ENABLED(CONFIG_IPV6) |
| } else if (mpadd->ipver == 6) { |
| char *recv_hmac; |
| u8 hash_mac_check[20]; |
| u8 no_key[8]; |
| int msg_parts = 0; |
| |
| if (mpcb->mptcp_ver < MPTCP_VERSION_1) |
| goto skip_hmac_v6; |
| |
| *(u64 *)no_key = 0; |
| recv_hmac = (char *)mpadd->u.v6.mac; |
| if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1) { |
| recv_hmac -= sizeof(mpadd->u.v6.port); |
| msg_parts = 2; |
| } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) { |
| msg_parts = 3; |
| } |
| mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, |
| (u8 *)no_key, |
| (u32 *)hash_mac_check, msg_parts, |
| 1, (u8 *)&mpadd->addr_id, |
| 16, (u8 *)&mpadd->u.v6.addr.s6_addr, |
| 2, (u8 *)&mpadd->u.v6.port); |
| if (memcmp(hash_mac_check, recv_hmac, 8) != 0) |
| /* ADD_ADDR2 discarded */ |
| return; |
| skip_hmac_v6: |
| if ((mpcb->mptcp_ver == MPTCP_VERSION_0 && |
| mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) || |
| (mpcb->mptcp_ver == MPTCP_VERSION_1 && |
| mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2)) |
| port = mpadd->u.v6.port; |
| family = AF_INET6; |
| addr.in6 = mpadd->u.v6.addr; |
| #endif /* CONFIG_IPV6 */ |
| } else { |
| return; |
| } |
| |
| if (mpcb->pm_ops->add_raddr) |
| mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id); |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_ADDADDRRX); |
| } |
| |
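| /* Handle a received REMOVE_ADDR option. The option may carry a list of |
| * address IDs; for each of them the path-manager is notified and every |
| * subflow using that remote ID is reset. |
| */ |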
| static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk) |
| { |
| struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr; |
| int i; |
| u8 rem_id; |
| struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
| |
| for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) { |
| rem_id = (&mprem->addrs_id)[i]; |
| |
| if (mpcb->pm_ops->rem_raddr) |
| mpcb->pm_ops->rem_raddr(mpcb, rem_id); |
| mptcp_send_reset_rem_id(mpcb, rem_id); |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_REMADDRSUB); |
| } |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_REMADDRRX); |
| } |
| |
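| /* Re-walk the TCP option space of the skb to handle ADD_ADDR and |
| * REMOVE_ADDR. This slow path is only taken when more than one such |
| * option was present in a single segment, as mptcp_options_received |
| * can only store one pointer per option type (see more_add_addr and |
| * more_rem_addr). |
| */ |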
| static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk) |
| { |
| struct tcphdr *th = tcp_hdr(skb); |
| unsigned char *ptr; |
| int length = (th->doff * 4) - sizeof(struct tcphdr); |
| |
| /* Jump through the options to check for ADD_ADDR and REMOVE_ADDR */ |
| ptr = (unsigned char *)(th + 1); |
| while (length > 0) { |
| int opcode = *ptr++; |
| int opsize; |
| |
| switch (opcode) { |
| case TCPOPT_EOL: |
| return; |
| case TCPOPT_NOP: |
| length--; |
| continue; |
| default: |
| opsize = *ptr++; |
| if (opsize < 2) |
| return; |
| if (opsize > length) |
| return; /* don't parse partial options */ |
| if (opcode == TCPOPT_MPTCP && |
| ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) { |
| u8 mptcp_ver = tcp_sk(sk)->mpcb->mptcp_ver; |
| struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr; |
| |
| if (!is_valid_addropt_opsize(mptcp_ver, mpadd, |
| opsize)) |
| goto cont; |
| |
| mptcp_handle_add_addr(ptr, sk); |
| } |
| if (opcode == TCPOPT_MPTCP && |
| ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) { |
| if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) |
| goto cont; |
| |
| mptcp_handle_rem_addr(ptr, sk); |
| } |
| cont: |
| ptr += opsize - 2; |
| length -= opsize; |
| } |
| } |
| return; |
| } |
| |
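| /* React to a received MP_FAIL or MP_FASTCLOSE. MP_FAIL (on a non-RST |
| * segment) makes us fall back to an infinite mapping: the meta send |
| * queue is rewound to snd_una, as if nothing were in flight, and all |
| * other subflows are closed. MP_FASTCLOSE closes all subflows and |
| * resets the meta socket, provided the key in the option matches our |
| * local key. Returns 1 if the meta socket has been reset. |
| */ |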
| static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th) |
| { |
| struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp; |
| struct sock *meta_sk = mptcp_meta_sk(sk); |
| struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb; |
| |
| if (unlikely(mptcp->rx_opt.mp_fail)) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_MPFAILRX); |
| mptcp->rx_opt.mp_fail = 0; |
| |
| if (!th->rst && !mpcb->infinite_mapping_snd) { |
| mpcb->send_infinite_mapping = 1; |
| /* We resend everything that has not been acknowledged */ |
| meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); |
| |
| /* We artificially restart the whole send-queue. Thus, |
| * it is as if no packets are in flight. |
| */ |
| tcp_sk(meta_sk)->packets_out = 0; |
| |
| /* If the snd_nxt already wrapped around, we have to |
| * undo the wrapping, as we are restarting from snd_una |
| * onwards. |
| */ |
| if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) { |
| mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2; |
| mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1; |
| } |
| tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una; |
| |
| /* Trigger a sending on the meta. */ |
| mptcp_push_pending_frames(meta_sk); |
| |
| mptcp_sub_force_close_all(mpcb, sk); |
| } |
| |
| return 0; |
| } |
| |
| if (unlikely(mptcp->rx_opt.mp_fclose)) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_FASTCLOSERX); |
| mptcp->rx_opt.mp_fclose = 0; |
| if (mptcp->rx_opt.mptcp_sender_key != mpcb->mptcp_loc_key) |
| return 0; |
| |
| mptcp_sub_force_close_all(mpcb, NULL); |
| |
| tcp_reset(meta_sk); |
| |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
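| /* If the set of remote addresses changed (mpcb->list_rcvd, typically |
| * set by the path-manager when an ADD_ADDR was accepted), give the |
| * path-manager a chance to react via new_remote_address(), e.g. by |
| * opening additional subflows. |
| */ |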
| static inline void mptcp_path_array_check(struct sock *meta_sk) |
| { |
| struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb; |
| |
| if (unlikely(mpcb->list_rcvd)) { |
| mpcb->list_rcvd = 0; |
| if (mpcb->pm_ops->new_remote_address) |
| mpcb->pm_ops->new_remote_address(meta_sk); |
| } |
| } |
| |
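| /* Act upon the MPTCP options received on an established subflow: |
| * handle MP_FAIL/MP_FASTCLOSE, enforce DSS checksums when negotiated, |
| * acknowledge retransmitted MP_JOIN third ACKs, process |
| * ADD_ADDR/REMOVE_ADDR and MP_PRIO, and finally process the DATA_ACK. |
| * Returns 1 if the segment must not be processed any further. |
| */ |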
| int mptcp_handle_options(struct sock *sk, const struct tcphdr *th, |
| const struct sk_buff *skb) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct mptcp_options_received *mopt = &tp->mptcp->rx_opt; |
| |
| if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd) |
| return 0; |
| |
| if (mptcp_mp_fail_rcvd(sk, th)) |
| return 1; |
| |
| /* RFC 6824, Section 3.3: |
| * If a checksum is not present when its use has been negotiated, the |
| * receiver MUST close the subflow with a RST as it is considered broken. |
| */ |
| if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum && |
| !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) { |
| mptcp_send_reset(sk); |
| return 1; |
| } |
| |
| /* We have to acknowledge retransmissions of the third |
| * ACK of the MP_JOIN handshake. |
| */ |
| if (mopt->join_ack) { |
| tcp_send_delayed_ack(sk); |
| mopt->join_ack = 0; |
| } |
| |
| if (mopt->saw_add_addr || mopt->saw_rem_addr) { |
| if (mopt->more_add_addr || mopt->more_rem_addr) { |
| mptcp_parse_addropt(skb, sk); |
| } else { |
| if (mopt->saw_add_addr) |
| mptcp_handle_add_addr(mopt->add_addr_ptr, sk); |
| if (mopt->saw_rem_addr) |
| mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk); |
| } |
| |
| mopt->more_add_addr = 0; |
| mopt->saw_add_addr = 0; |
| mopt->more_rem_addr = 0; |
| mopt->saw_rem_addr = 0; |
| } |
| if (mopt->saw_low_prio) { |
| if (mopt->saw_low_prio == 1) { |
| tp->mptcp->rcv_low_prio = mopt->low_prio; |
| } else { |
| struct sock *sk_it; |
| mptcp_for_each_sk(tp->mpcb, sk_it) { |
| struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp; |
| if (mptcp->rem_id == mopt->prio_addr_id) |
| mptcp->rcv_low_prio = mopt->low_prio; |
| } |
| } |
| mopt->saw_low_prio = 0; |
| } |
| |
| mptcp_data_ack(sk, skb); |
| |
| mptcp_path_array_check(mptcp_meta_sk(sk)); |
| /* Socket may have been mp_killed by a REMOVE_ADDR */ |
| if (tp->mp_killed) |
| return 1; |
| |
| return 0; |
| } |
| |
| /* In case of fastopen, some data can already be in the write queue. |
| * We need to update the sequence numbers of these segments, as they |
| * initially carried subflow (TCP) sequence numbers and must now |
| * carry data-level sequence numbers. |
| */ |
| static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk) |
| { |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk); |
| struct sk_buff *skb; |
| u32 new_mapping = meta_tp->write_seq - master_tp->snd_una; |
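| /* new_mapping is the offset that turns the subflow sequence numbers |
| * carried by the queued skbs into meta-level data sequence numbers: |
| * unacknowledged subflow bytes start at master_tp->snd_una and must |
| * map to meta_tp->write_seq. |
| */ |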
| |
| /* There should be only one skb in the write queue: the data not |
| * acknowledged in the SYN+ACK. We need to map this data to |
| * data-level sequence numbers. |
| */ |
| skb_queue_walk(&meta_sk->sk_write_queue, skb) { |
| /* If the server only acknowledges partially the data sent in |
| * the SYN, we need to trim the acknowledged part because |
| * we don't want to retransmit this already received data. |
| * When we reach this point, tcp_ack() has already cleaned up |
| * fully acked segments. However, tcp trims partially acked |
| * segments only when retransmitting. Since MPTCP comes into |
| * play only now, we will fake an initial transmit, and |
| * retransmit_skb() will not be called. The following fragment |
| * comes from __tcp_retransmit_skb(). |
| */ |
| if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) { |
| BUG_ON(before(TCP_SKB_CB(skb)->end_seq, |
| master_tp->snd_una)); |
| /* tcp_trim_head() can only return ENOMEM if the skb is |
| * cloned, which is not the case here (see |
| * tcp_send_syn_data()). |
| */ |
| BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una - |
| TCP_SKB_CB(skb)->seq)); |
| } |
| |
| TCP_SKB_CB(skb)->seq += new_mapping; |
| TCP_SKB_CB(skb)->end_seq += new_mapping; |
| } |
| |
| /* We can advance write_seq by the number of unacknowledged bytes |
| * that were mapped in the previous loop. |
| */ |
| meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una; |
| |
| /* The packets from the master_sk will be queued to it later. |
| * Until then, its write queue is empty, and |
| * write_seq must align with snd_una. |
| */ |
| master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una; |
| master_tp->packets_out = 0; |
| |
| /* Although this data has already been sent over the subflow, it |
| * has never been sent over the meta_sk, so we rewind the |
| * send_head so that TCP considers it an initial send (instead |
| * of a retransmit). |
| */ |
| meta_sk->sk_send_head = tcp_write_queue_head(meta_sk); |
| } |
| |
| /* The skptr is needed, because if we become MPTCP-capable, we have to switch |
| * from meta-socket to master-socket. |
| * |
| * @return: 1 - we want to reset this connection |
| * 2 - we want to discard the received syn/ack |
| * 0 - everything is fine - continue |
| */ |
| int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr, |
| const struct sk_buff *skb, |
| const struct mptcp_options_received *mopt) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| if (mptcp(tp)) { |
| u8 hash_mac_check[20]; |
| struct mptcp_cb *mpcb = tp->mpcb; |
| |
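| /* This is the SYN/ACK of an MP_JOIN handshake: verify the peer's |
| * truncated HMAC (keyed with the remote and local keys, computed |
| * over both nonces) before the subflow may be used, then compute |
| * our own HMAC for the third ACK. |
| */ |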
| mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key, |
| (u8 *)&mpcb->mptcp_loc_key, |
| (u32 *)hash_mac_check, 2, |
| 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce, |
| 4, (u8 *)&tp->mptcp->mptcp_loc_nonce); |
| if (memcmp(hash_mac_check, |
| (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); |
| mptcp_sub_force_close(sk); |
| return 1; |
| } |
| |
| /* Set this flag in order to postpone data sending |
| * until the 4th ack arrives. |
| */ |
| tp->mptcp->pre_established = 1; |
| tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio; |
| |
| mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key, |
| (u8 *)&mpcb->mptcp_rem_key, |
| (u32 *)&tp->mptcp->sender_mac[0], 2, |
| 4, (u8 *)&tp->mptcp->mptcp_loc_nonce, |
| 4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce); |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); |
| } else if (mopt->saw_mpc) { |
| struct sock *meta_sk = sk; |
| |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); |
| if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver) |
| /* TODO Consider adding new MPTCP_INC_STATS entry */ |
| goto fallback; |
| |
| if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key, |
| mopt->mptcp_ver, |
| ntohs(tcp_hdr(skb)->window))) |
| return 2; |
| |
| sk = tcp_sk(sk)->mpcb->master_sk; |
| *skptr = sk; |
| tp = tcp_sk(sk); |
| |
| sk->sk_bound_dev_if = skb->skb_iif; |
| |
| /* If fastopen was used, data might already be in the send queue. |
| * We need to update its sequence numbers to MPTCP-level seqnos. |
| * Note that it can happen in rare cases that fastopen_req is |
| * NULL and syn_data is 0 but fastopen indeed occurred and |
| * data has been queued in the write queue (but not sent). |
| * Example of such rare cases: connect is non-blocking and |
| * TFO is configured to work without cookies. |
| */ |
| if (!skb_queue_empty(&meta_sk->sk_write_queue)) |
| mptcp_rcv_synsent_fastopen(meta_sk); |
| |
| /* -1, because the SYN consumed 1 byte. In case of TFO, we |
| * start the subflow-sequence number as if the data of the SYN |
| * is not part of any mapping. |
| */ |
| tp->mptcp->snt_isn = tp->snd_una - 1; |
| tp->mpcb->dss_csum = mopt->dss_csum; |
| if (tp->mpcb->dss_csum) |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_CSUMENABLED); |
| |
| tp->mptcp->include_mpc = 1; |
| |
| /* Ensure that fastopen is handled at the meta-level. */ |
| tp->fastopen_req = NULL; |
| |
| sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket); |
| sk->sk_wq = mptcp_meta_sk(sk)->sk_wq; |
| |
| /* hold in sk_clone_lock due to initialization to 2 */ |
| sock_put(sk); |
| } else { |
| MPTCP_INC_STATS_BH(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); |
| fallback: |
| tp->request_mptcp = 0; |
| |
| if (tp->inside_tk_table) |
| mptcp_hash_remove(tp); |
| } |
| |
| if (mptcp(tp)) |
| tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq; |
| |
| return 0; |
| } |
| |
| /* Similar to tcp_should_expand_sndbuf */ |
| bool mptcp_should_expand_sndbuf(const struct sock *sk) |
| { |
| const struct sock *sk_it; |
| const struct sock *meta_sk = mptcp_meta_sk(sk); |
| const struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| int cnt_backups = 0; |
| int backup_available = 0; |
| |
| /* We circumvent this check in tcp_check_space, because we want to |
| * always call sk_write_space. So, we reproduce the check here. |
| */ |
| if (!meta_sk->sk_socket || |
| !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags)) |
| return false; |
| |
| /* If the user specified a specific send buffer setting, do |
| * not modify it. |
| */ |
| if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
| return false; |
| |
| /* If we are under global TCP memory pressure, do not expand. */ |
| if (tcp_under_memory_pressure(meta_sk)) |
| return false; |
| |
| /* If we are under soft global TCP memory pressure, do not expand. */ |
| if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0)) |
| return false; |
| |
| /* For MPTCP we look for a subsocket that could send data. |
| * If we found one, then we update the send-buffer. |
| */ |
| mptcp_for_each_sk(meta_tp->mpcb, sk_it) { |
| struct tcp_sock *tp_it = tcp_sk(sk_it); |
| |
| if (!mptcp_sk_can_send(sk_it)) |
| continue; |
| |
| /* Backup-flows have to be counted - if there is no other |
| * subflow we take the backup-flow into account. |
| */ |
| if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) |
| cnt_backups++; |
| |
| if (tcp_packets_in_flight(tp_it) < tp_it->snd_cwnd) { |
| if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) { |
| backup_available = 1; |
| continue; |
| } |
| return true; |
| } |
| } |
| |
| /* Backup-flow is available for sending - update send-buffer */ |
| if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available) |
| return true; |
| return false; |
| } |
| |
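| /* Initialize the buffer space for a new subflow. The first (master) |
| * subflow lets the meta socket inherit its buffer sizes and starts |
| * regular receive autotuning. Additional subflows simply add their |
| * receive/send buffer on top of the meta buffers (unless locked by |
| * the user), capped by sysctl_tcp_rmem[2]/sysctl_tcp_wmem[2]; |
| * autotuning will grow them further later on. |
| */ |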
| void mptcp_init_buffer_space(struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct sock *meta_sk = mptcp_meta_sk(sk); |
| struct tcp_sock *meta_tp = tcp_sk(meta_sk); |
| int space; |
| |
| tcp_init_buffer_space(sk); |
| |
| if (is_master_tp(tp)) { |
| meta_tp->rcvq_space.space = meta_tp->rcv_wnd; |
| meta_tp->rcvq_space.time = tcp_time_stamp; |
| meta_tp->rcvq_space.seq = meta_tp->copied_seq; |
| |
| /* If there is only one subflow, we just use regular TCP |
| * autotuning. User-locks are already handled by |
| * tcp_init_buffer_space(). |
| */ |
| meta_tp->window_clamp = tp->window_clamp; |
| meta_tp->rcv_ssthresh = tp->rcv_ssthresh; |
| meta_sk->sk_rcvbuf = sk->sk_rcvbuf; |
| meta_sk->sk_sndbuf = sk->sk_sndbuf; |
| |
| return; |
| } |
| |
| if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK) |
| goto snd_buf; |
| |
| /* Adding a new subflow to the rcv-buffer space. We make a simple |
| * addition, to give some space to allow traffic on the new subflow. |
| * Autotuning will increase it further later on. |
| */ |
| space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]); |
| if (space > meta_sk->sk_rcvbuf) { |
| meta_tp->window_clamp += tp->window_clamp; |
| meta_tp->rcv_ssthresh += tp->rcv_ssthresh; |
| meta_sk->sk_rcvbuf = space; |
| } |
| |
| snd_buf: |
| if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK) |
| return; |
| |
| /* Adding a new subflow to the send-buffer space. We make a simple |
| * addition, to give some space to allow traffic on the new subflow. |
| * Autotuning will increase it further later on. |
| */ |
| space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]); |
| if (space > meta_sk->sk_sndbuf) { |
| meta_sk->sk_sndbuf = space; |
| meta_sk->sk_write_space(meta_sk); |
| } |
| } |
| |
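| /* Set the subflow's RTO and let MPTCP update its connection-level RTO |
| * as well (see mptcp_set_rto()). |
| */ |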
| void mptcp_tcp_set_rto(struct sock *sk) |
| { |
| tcp_set_rto(sk); |
| mptcp_set_rto(sk); |
| } |