From b2ea53195d020f0ddd712c176eb2efcf8981a0bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:25 -0700 Subject: [PATCH 1/6] tcp: reduce tcp_fastretrans_alert() verbosity commit 8ba6ddaaf86c4c6814774e4e4ef158b732bd9f9f upstream. With upcoming rb-tree implementation, the checks will trigger more often, and this is expected. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Cc: Amit Shah Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 657d33e2ff6a..03bfec5a388a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2810,9 +2810,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. From d632920554c5aec81d8a79c23dac07efcbabbd54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2019 17:31:03 -0700 Subject: [PATCH 2/6] tcp: limit payload size of sacked skbs commit 3b4929f65b0d8249f19a50245cd88ed1a2f78cff upstream. Jonathan Looney reported that TCP can trigger the following crash in tcp_shifted_skb() : BUG_ON(tcp_skb_pcount(skb) < pcount); This can happen if the remote peer has advertized the smallest MSS that linux TCP accepts : 48 An skb can hold 17 fragments, and each fragment can hold 32KB on x86, or 64KB on PowerPC. This means that the 16bit witdh of TCP_SKB_CB(skb)->tcp_gso_segs can overflow. Note that tcp_sendmsg() builds skbs with less than 64KB of payload, so this problem needs SACK to be enabled. SACK blocks allow TCP to coalesce multiple skbs in the retransmit queue, thus filling the 17 fragments to maximal capacity. CVE-2019-11477 -- u16 overflow of TCP_SKB_CB(skb)->tcp_gso_segs Backport notes, provided by Joao Martins v4.15 or since commit 737ff314563 ("tcp: use sequence distance to detect reordering") had switched from the packet-based FACK tracking and switched to sequence-based. v4.14 and older still have the old logic and hence on tcp_skb_shift_data() needs to retain its original logic and have @fack_count in sync. In other words, we keep the increment of pcount with tcp_skb_pcount(skb) to later used that to update fack_count. To make it more explicit we track the new skb that gets incremented to pcount in @next_pcount, and we get to avoid the constant invocation of tcp_skb_pcount(skb) all together. Fixes: 832d11c5cd07 ("tcp: Try to restore large SKBs while SACK processing") Signed-off-by: Eric Dumazet Reported-by: Jonathan Looney Acked-by: Neal Cardwell Reviewed-by: Tyler Hicks Cc: Yuchung Cheng Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/tcp.h | 4 ++++ include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 28 ++++++++++++++++++++++------ net/ipv4/tcp_output.c | 6 +++--- 5 files changed, 32 insertions(+), 9 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fe322fa611e6..60aea230dc6a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -450,4 +450,8 @@ static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) return (user_mss && user_mss < mss) ? user_mss : mss; } + +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, + int shiftlen); + #endif /* _LINUX_TCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0c2dbe94e21..1179ef4f0768 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -57,6 +57,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER (128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 +#define TCP_MIN_SND_MSS 48 +#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 00ae9a1d44ed..d51571be9eac 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3480,6 +3480,7 @@ void __init tcp_init(void) unsigned long limit; unsigned int i; + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 03bfec5a388a..8e080f3b75bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1329,7 +1329,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); - BUG_ON(tcp_skb_pcount(skb) < pcount); + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, @@ -1396,6 +1396,21 @@ static int skb_can_shift(const struct sk_buff *skb) return !skb_headlen(skb) && skb_is_nonlinear(skb); } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, + int pcount, int shiftlen) +{ + /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) + * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need + * to make sure not storing more than 65535 * 8 bytes per skb, + * even if current MSS is bigger. + */ + if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) + return 0; + if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) + return 0; + return skb_shift(to, from, shiftlen); +} + /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ @@ -1407,6 +1422,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; + int next_pcount; int pcount = 0; int len; int in_sack; @@ -1504,7 +1520,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; - if (!skb_shift(prev, skb, len)) + if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) goto out; @@ -1523,11 +1539,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; len = skb->len; - if (skb_shift(prev, skb, len)) { - pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + next_pcount = tcp_skb_pcount(skb); + if (tcp_skb_shift(prev, skb, next_pcount, len)) { + pcount += next_pcount; + tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0); } - out: state->fack_count += pcount; return prev; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 24bad638c2ec..def09d1fd0e1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1442,8 +1442,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; + if (mss_now < TCP_MIN_SND_MSS) + mss_now = TCP_MIN_SND_MSS; return mss_now; } @@ -2724,7 +2724,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) if (next_skb_size <= skb_availroom(skb)) skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), next_skb_size); - else if (!skb_shift(skb, next_skb, next_skb_size)) + else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; } tcp_highest_sack_replace(sk, next_skb, skb); From 9daf226ff92679d09aeca1b5c1240e3607153336 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2019 17:40:56 -0700 Subject: [PATCH 3/6] tcp: tcp_fragment() should apply sane memory limits commit f070ef2ac66716357066b683fb0baf55f8191a2e upstream. Jonathan Looney reported that a malicious peer can force a sender to fragment its retransmit queue into tiny skbs, inflating memory usage and/or overflow 32bit counters. TCP allows an application to queue up to sk_sndbuf bytes, so we need to give some allowance for non malicious splitting of retransmit queue. A new SNMP counter is added to monitor how many times TCP did not allow to split an skb if the allowance was exceeded. Note that this counter might increase in the case applications use SO_SNDBUF socket option to lower sk_sndbuf. CVE-2019-11478 : tcp_fragment, prevent fragmenting a packet when the socket is already using more than half the allowed space Signed-off-by: Eric Dumazet Reported-by: Jonathan Looney Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Reviewed-by: Tyler Hicks Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_output.c | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f5d753e60836..bf31965355c6 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,7 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3fbf688a1943..88aaf14983e8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -299,6 +299,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index def09d1fd0e1..36d1945b48f1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1274,6 +1274,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; + if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); + return -ENOMEM; + } + if (skb_unclone(skb, gfp)) return -ENOMEM; From cd6f35b8421ff20365ff711c0ac7647fd70e9af7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2019 17:44:24 -0700 Subject: [PATCH 4/6] tcp: add tcp_min_snd_mss sysctl commit 5f3e2bf008c2221478101ee72f5cb4654b9fc363 upstream. Some TCP peers announce a very small MSS option in their SYN and/or SYN/ACK messages. This forces the stack to send packets with a very high network/cpu overhead. Linux has enforced a minimal value of 48. Since this value includes the size of TCP options, and that the options can consume up to 40 bytes, this means that each segment can include only 8 bytes of payload. In some cases, it can be useful to increase the minimal value to a saner value. We still let the default to 48 (TCP_MIN_SND_MSS), for compatibility reasons. Note that TCP_MAXSEG socket option enforces a minimal value of (TCP_MIN_MSS). David Miller increased this minimal value in commit c39508d6f118 ("tcp: Make TCP_MAXSEG minimum more correct.") from 64 to 88. We might in the future merge TCP_MIN_SND_MSS and TCP_MIN_MSS. CVE-2019-11479 -- tcp mss hardcoded to 48 Signed-off-by: Eric Dumazet Suggested-by: Jonathan Looney Acked-by: Neal Cardwell Cc: Yuchung Cheng Cc: Tyler Hicks Cc: Bruce Curtis Cc: Jonathan Lemon Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 3 +-- 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 828fcd6711b3..3bbe6fb2b086 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -241,6 +241,14 @@ tcp_base_mss - INTEGER Path MTU discovery (MTU probing). If MTU probing is enabled, this is the initial MSS used by the connection. +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + tcp_congestion_control - STRING Set the congestion control algorithm to be used for new connections. The algorithm "reno" is always available, but diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a2e4adafd34d..e268c970ec54 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -107,6 +107,7 @@ struct netns_ipv4 { #endif int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; + int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e8caab8e2f5c..78771272f613 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -37,6 +37,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; +static int tcp_min_snd_mss_max = 65535; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -943,6 +945,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_min_snd_mss", + .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97a414dbdaf4..0569718e3656 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2477,6 +2477,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 36d1945b48f1..a8772e11dc1c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1447,8 +1447,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < TCP_MIN_SND_MSS) - mss_now = TCP_MIN_SND_MSS; + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); return mss_now; } From f2aa4f1a05e0987e812809dbc489bd294fdae5ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2019 17:47:27 -0700 Subject: [PATCH 5/6] tcp: enforce tcp_min_snd_mss in tcp_mtu_probing() commit 967c05aee439e6e5d7d805e195b3a20ef5c433d6 upstream. If mtu probing is enabled tcp_mtu_probing() could very well end up with a too small MSS. Use the new sysctl tcp_min_snd_mss to make sure MSS search is performed in an acceptable range. CVE-2019-11479 -- tcp mss hardcoded to 48 Signed-off-by: Eric Dumazet Reported-by: Jonathan Lemon Cc: Jonathan Looney Acked-by: Neal Cardwell Cc: Yuchung Cheng Cc: Tyler Hicks Cc: Bruce Curtis Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a845b7692c1b..592d6e9967a9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -141,6 +141,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tp->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } From e861d0673eb8dc9b616269f70bf8a07d7524877e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 17 Jun 2019 19:52:45 +0200 Subject: [PATCH 6/6] Linux 4.14.127 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 631f8a8e28e0..330f23238005 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 126 +SUBLEVEL = 127 EXTRAVERSION = NAME = Petit Gorille