[PATCH 0/2] Shared control buffer proposal
by Mat Martineau
Here's an implementation of the "shared control buffer" we recently
discussed on this list. This uses an SKB_ALLOC_SHINFO_EXT flag for __alloc_skb
to optionally allocate extra space following skb_shared_info. Regular
(non-extended) skbs are not changed at all. The new skbs with extended
shared info can be safely handled by existing code, although low-level
operations that copy or reallocate skb data and shared info may strip
the extended information if they are not aware of it.
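As a sketch, allocation with the new flag would look roughly like this
(SKB_ALLOC_SHINFO_EXT is the flag patch 1 adds; the rest is the existing
__alloc_skb interface):

	struct sk_buff *skb;

	/* Request the extra area after skb_shared_info at allocation
	 * time. __alloc_skb() already takes allocation flags such as
	 * SKB_ALLOC_FCLONE and SKB_ALLOC_RX, which patch 1 extends.
	 */
	skb = __alloc_skb(size, GFP_KERNEL, SKB_ALLOC_SHINFO_EXT,
			  NUMA_NO_NODE);
	if (!skb)
		return -ENOMEM;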
The second patch allows users of do_tcp_sendpages to provide an skb with
an extended control buffer to use for the first packet. In MPTCP, this
would allow a DSS mapping for a large chunk of data to be sent in the
first packet, with the rest of the mapped data sent in subsequent
packets. It also conveniently ensures that the beginning of the provided
data is not coalesced into an existing skb so the DSS mapping and the
beginning of the mapped data stay together.
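In rough, illustrative pseudocode (mptcp_write_dss_mapping() is a made-up
placeholder, and the trailing skb argument is only the assumed shape of the
new do_tcp_sendpages() signature from patch 2):

	/* Pre-build the first skb with an extended control buffer,
	 * stash the DSS mapping covering the whole chunk in it, then
	 * hand it in for the first packet so the mapping and the start
	 * of the mapped data travel together.
	 */
	struct sk_buff *skb;
	ssize_t ret;

	skb = __alloc_skb(0, sk->sk_allocation, SKB_ALLOC_SHINFO_EXT,
			  NUMA_NO_NODE);
	if (!skb)
		return -ENOMEM;

	mptcp_write_dss_mapping(skb, map_seq, map_len);
	ret = do_tcp_sendpages(sk, page, offset, size, flags, skb);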
I still need to verify how GSO and TSO handle these skbs. When I tested
this code by using SKB_ALLOC_SHINFO_EXT with every TCP allocation, the
extended information was still available when the TCP headers were written.
Mat Martineau (2):
skbuff: Add shared control buffer
tcp: Let do_tcp_sendpages accept a pre-allocated initial skb
include/linux/skbuff.h | 24 +++++++++++++++++++++-
include/net/tcp.h | 2 +-
net/core/skbuff.c | 56 ++++++++++++++++++++++++++++++++++++++------------
net/ipv4/tcp.c | 12 ++++++++---
net/tls/tls_main.c | 3 ++-
5 files changed, 78 insertions(+), 19 deletions(-)
--
2.14.2
[PATCH v2] tcp: Register handlers for extra TCP options
by Mat Martineau
Allow additional TCP options to be handled by registered hook
functions.
Registered options have a priority that determines the order in which
options are prepared and written. Lower priority numbers are handled
first.
Option parsing will call the provided 'parse' function when a TCP option
number is not recognized by the normal option parsing code.
The 'prepare' function determines the required space for registered
options and stores associated data. The 'write' function adds the option to
the TCP header.
A static key and RCU synchronization are used to minimize the
performance impact of these extensible TCP features.
Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
---
Changes from v1: One 'prepare' callback (no more special callback for
request_sock), and added a few missing callback sites (like ipv6).
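For illustration, here is roughly how a module would hook in. The ops
struct, callback signatures, and registration call below are the ones this
patch adds; the stub bodies, the names, and option kind 253 (an
experimental kind from RFC 6994) are only an example:

	static void example_parse(int opsize, const unsigned char *opptr,
				  const struct sk_buff *skb,
				  struct tcp_options_received *opt_rx,
				  struct sock *sk)
	{
		/* record the received option for this connection */
	}

	static unsigned int example_prepare(struct sk_buff *skb, u8 flags,
					    unsigned int remaining,
					    struct tcp_out_options *opts,
					    const struct sock *sk)
	{
		return 0;	/* bytes of option space consumed */
	}

	static struct tcp_extra_option_ops example_ops = {
		.option_kind	= 253,
		.priority	= 10,	/* lower values run first */
		.parse		= example_parse,
		.prepare	= example_prepare,
		.owner		= THIS_MODULE,
	};

	static int __init example_init(void)
	{
		return tcp_register_extra_option(&example_ops);
	}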
drivers/infiniband/hw/cxgb4/cm.c | 2 +-
include/linux/tcp.h | 22 +++++++
include/net/tcp.h | 40 +++++++++++-
net/ipv4/syncookies.c | 2 +-
net/ipv4/tcp.c | 133 +++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_input.c | 16 +++--
net/ipv4/tcp_ipv4.c | 80 ++++++++++++++++++-----
net/ipv4/tcp_minisocks.c | 4 +-
net/ipv4/tcp_output.c | 43 +++++++------
net/ipv6/syncookies.c | 2 +-
net/ipv6/tcp_ipv6.c | 28 ++++++++-
11 files changed, 323 insertions(+), 49 deletions(-)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index daf7a56e5d7e..c3eb31611011 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
*/
memset(&tmp_opt, 0, sizeof(tmp_opt));
tcp_clear_options(&tmp_opt);
- tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
req = __skb_push(skb, sizeof(*req));
memset(req, 0, sizeof(*req));
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..0347e6ce99be 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
}
+#define OPTION_SACK_ADVERTISE (1 << 0)
+#define OPTION_TS (1 << 1)
+#define OPTION_MD5 (1 << 2)
+#define OPTION_WSCALE (1 << 3)
+#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+
+struct tcp_out_options {
+ u16 options; /* bit field of OPTION_* */
+ u16 mss; /* 0 to disable */
+ u8 ws; /* window scale, 0 to disable */
+ u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
+ __u8 *hash_location; /* temporary pointer, overloaded */
+ __u32 tsval, tsecr; /* need to include OPTION_TS */
+ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+};
+
/* This is the max number of SACKS that we'll generate and process. It's safe
* to increase this, although since:
* size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
@@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
return (struct tcp_sock *)sk;
}
+static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
+{
+ return (struct sock *)tp;
+}
+
struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
#define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3bc910a9bfc6..04f3dcecf592 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
- int estab, struct tcp_fastopen_cookie *foc);
+ int estab, struct tcp_fastopen_cookie *foc,
+ struct tcp_sock *tp);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
/*
@@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
}
+
+extern struct static_key_false tcp_extra_options_enabled;
+
+struct tcp_extra_option_ops {
+ struct list_head list;
+ unsigned char option_kind;
+ unsigned char priority;
+ void (*parse)(int opsize, const unsigned char *opptr,
+ const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx,
+ struct sock *sk);
+ /* Return the number of bytes consumed */
+ unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
+ unsigned int remaining,
+ struct tcp_out_options *opts,
+ const struct sock *sk);
+ void (*write)(__be32 *ptr, struct tcp_out_options *opts,
+ const struct sock *sk);
+ struct module *owner;
+};
+
+void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
+ const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx,
+ struct sock *sk);
+
+unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
+ unsigned int remaining,
+ struct tcp_out_options *opts,
+ const struct sock *sk);
+
+void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
+ const struct sock *sk);
+
+int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
+void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
+
#endif /* _TCP_H */
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b1bb1b3a1082..6c8d750a2243 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
tsoff = secure_tcp_ts_off(sock_net(sk),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402720ab..8136857b992b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
+#include <linux/static_key.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
struct percpu_counter tcp_sockets_allocated;
EXPORT_SYMBOL(tcp_sockets_allocated);
+/*
+ * Optional TCP option handlers
+ */
+static DEFINE_SPINLOCK(tcp_option_list_lock);
+static LIST_HEAD(tcp_option_list);
+DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
+
/*
* TCP splice context
*/
@@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
+/* Linear search, few entries are expected. The RCU read lock must
+ * be held before calling.
+ */
+static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
+{
+ struct tcp_extra_option_ops *entry;
+
+ list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+ if (entry->option_kind == kind)
+ return entry;
+ }
+
+ return NULL;
+}
+
+void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
+ const struct sk_buff *skb,
+ struct tcp_options_received *opt_rx,
+ struct sock *sk)
+{
+ struct tcp_extra_option_ops *entry;
+
+ rcu_read_lock();
+ entry = tcp_extra_options_find_kind(opcode);
+ if (entry && entry->parse)
+ entry->parse(opsize, opptr, skb, opt_rx, sk);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
+
+/* The RCU read lock must be held before calling, and should span both
+ * the call to this function and tcp_extra_options_write to ensure that
+ * tcp_option_list does not change between the two calls. To preserve
+ * expected option alignment, always returns a multiple of 4 bytes.
+ */
+unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
+ unsigned int remaining,
+ struct tcp_out_options *opts,
+ const struct sock *sk)
+{
+ struct tcp_extra_option_ops *entry;
+ unsigned int used = 0;
+
+ list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+ if (unlikely(!entry->prepare))
+ continue;
+
+ used += entry->prepare(skb, flags, remaining - used, opts, sk);
+ }
+
+ return roundup(used, 4);
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
+
+/* The RCU read lock must be held before calling, and should span both
+ * the call to tcp_extra_options_prepare and this function to ensure that
+ * tcp_option_list does not change between the two calls.
+ */
+void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
+ const struct sock *sk)
+{
+ struct tcp_extra_option_ops *entry;
+
+ list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+ if (unlikely(!entry->write))
+ continue;
+
+ entry->write(ptr, opts, sk);
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_write);
+
+int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
+{
+ struct tcp_extra_option_ops *entry;
+ struct list_head *add_after = &tcp_option_list;
+ int ret = 0;
+
+ if (!ops->option_kind)
+ return -EINVAL;
+
+ if (!try_module_get(ops->owner))
+ return -ENOENT;
+
+ spin_lock(&tcp_option_list_lock);
+
+ list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+ if (entry->option_kind == ops->option_kind) {
+ pr_notice("Option kind %u already registered\n",
+ ops->option_kind);
+ spin_unlock(&tcp_option_list_lock);
+ module_put(ops->owner);
+ return -EEXIST;
+ }
+
+ if (entry->priority <= ops->priority)
+ add_after = &entry->list;
+ }
+
+ list_add_rcu(&ops->list, add_after);
+ pr_debug("Option kind %u registered\n", ops->option_kind);
+
+ spin_unlock(&tcp_option_list_lock);
+
+ static_branch_inc(&tcp_extra_options_enabled);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_extra_option);
+
+void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
+{
+ spin_lock(&tcp_option_list_lock);
+ list_del_rcu(&ops->list);
+ spin_unlock(&tcp_option_list_lock);
+
+ synchronize_net();
+
+ static_branch_dec(&tcp_extra_options_enabled);
+
+ module_put(ops->owner);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
+
void tcp_done(struct sock *sk)
{
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
@@ -3521,6 +3653,7 @@ void __init tcp_init(void)
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
+ INIT_LIST_HEAD(&tcp_option_list);
cnt = tcp_hashinfo.ehash_mask + 1;
sysctl_tcp_max_orphans = cnt / 2;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656beeee..faf3c8d34cec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
void tcp_parse_options(const struct net *net,
const struct sk_buff *skb,
struct tcp_options_received *opt_rx, int estab,
- struct tcp_fastopen_cookie *foc)
+ struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
@@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
ptr + 2, th->syn, foc, true);
break;
+ default:
+ tcp_extra_options_parse(opcode, opsize, ptr,
+ skb, opt_rx,
+ tcp_to_sk(tp));
+ break;
+
}
ptr += opsize-2;
length -= opsize;
@@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
return true;
}
- tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
+ tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
- tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
mss = opt.mss_clamp;
}
@@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;
- tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
+ tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss = tp->rx_opt.user_mss;
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
- want_cookie ? NULL : &foc);
+ want_cookie ? NULL : &foc, tp);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..537734e70317 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
- __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+ __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
} rep;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
@@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL;
#endif
struct net *net;
+ int offset = 0;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
goto out;
}
+#endif
+
+ if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+ unsigned int remaining;
+ unsigned int used;
+ struct tcp_out_options opts;
+
+ remaining = sizeof(rep.opt);
+#ifdef CONFIG_TCP_MD5SIG
+ if (key)
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+ memset(&opts, 0, sizeof(opts));
+
+ rcu_read_lock();
+ used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
+ &opts, sk);
+
+ tcp_extra_options_write(&rep.opt[0], &opts, sk);
+ rcu_read_unlock();
+
+ arg.iov[0].iov_len += used;
+ offset += used / 4;
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ }
+#ifdef CONFIG_TCP_MD5SIG
if (key) {
- rep.opt[0] = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
+ rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
/* Update length and the length the header thinks exists */
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
rep.th.doff = arg.iov[0].iov_len / 4;
- tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+ tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
key, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &rep.th);
}
@@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
- __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
- + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
- ];
+ __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
+ int offset = 0;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
rep.opt[1] = htonl(tsval);
rep.opt[2] = htonl(tsecr);
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+ offset += 3;
}
/* Swap the send and the receive. */
rep.th.dest = th->source;
rep.th.source = th->dest;
- rep.th.doff = arg.iov[0].iov_len / 4;
rep.th.seq = htonl(seq);
rep.th.ack_seq = htonl(ack);
rep.th.ack = 1;
rep.th.window = htons(win);
+ if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+ unsigned int remaining;
+ unsigned int used;
+ struct tcp_out_options opts;
+
+ remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
#ifdef CONFIG_TCP_MD5SIG
- if (key) {
- int offset = (tsecr) ? 3 : 0;
+ if (key)
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+#endif
+ memset(&opts, 0, sizeof(opts));
+ rcu_read_lock();
+ used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
+ &opts, sk);
+
+ tcp_extra_options_write(&rep.opt[offset], &opts, sk);
+ rcu_read_unlock();
+
+ arg.iov[0].iov_len += used;
+ offset += used / 4;
+ }
+
+ rep.th.doff = arg.iov[0].iov_len / 4;
+
+#ifdef CONFIG_TCP_MD5SIG
+ if (key) {
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) |
TCPOLEN_MD5SIG);
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
- rep.th.doff = arg.iov[0].iov_len/4;
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
key, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+
arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f31356d..1c3e91899dac 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
if (tmp_opt.saw_tstamp) {
if (tmp_opt.rcv_tsecr)
@@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..61eba3d0ae17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,6 +41,7 @@
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
+#include <linux/static_key.h>
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;
@@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
return tp->snd_una != tp->snd_up;
}
-#define OPTION_SACK_ADVERTISE (1 << 0)
-#define OPTION_TS (1 << 1)
-#define OPTION_MD5 (1 << 2)
-#define OPTION_WSCALE (1 << 3)
-#define OPTION_FAST_OPEN_COOKIE (1 << 8)
-
-struct tcp_out_options {
- u16 options; /* bit field of OPTION_* */
- u16 mss; /* 0 to disable */
- u8 ws; /* window scale, 0 to disable */
- u8 num_sack_blocks; /* number of SACK blocks to include */
- u8 hash_size; /* bytes in hash_location */
- __u8 *hash_location; /* temporary pointer, overloaded */
- __u32 tsval, tsecr; /* need to include OPTION_TS */
- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
-};
-
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
ptr += (len + 3) >> 2;
}
+
+ if (static_branch_unlikely(&tcp_extra_options_enabled))
+ tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
}
/* Compute TCP options for SYN packets. This is not the final
@@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ if (static_branch_unlikely(&tcp_extra_options_enabled))
+ remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
+ remaining, opts,
+ tcp_to_sk(tp));
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
}
+ if (static_branch_unlikely(&tcp_extra_options_enabled))
+ remaining -= tcp_extra_options_prepare(skb,
+ TCPHDR_SYN | TCPHDR_ACK,
+ remaining, opts,
+ req_to_sk(req));
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
size += TCPOLEN_TSTAMP_ALIGNED;
}
+ if (static_branch_unlikely(&tcp_extra_options_enabled))
+ size += tcp_extra_options_prepare(skb, 0,
+ MAX_TCP_OPTION_SPACE - size,
+ opts, tcp_to_sk(tp));
+
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
@@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
+ rcu_read_lock();
if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
else
@@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
md5, sk, skb);
}
#endif
+ rcu_read_unlock();
icsk->icsk_af_ops->send_check(sk, skb);
@@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
#endif
skb->skb_mstamp = tcp_clock_us();
-#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
+#ifdef CONFIG_TCP_MD5SIG
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
@@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
if (md5)
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
md5, req_to_sk(req), skb);
- rcu_read_unlock();
#endif
+ rcu_read_unlock();
/* Do not fool tcpdump (if any), clean our debris */
skb->tstamp = 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 4e7817abc0b9..407480366c73 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
tsoff = secure_tcpv6_ts_off(sock_net(sk),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 64d94afa427f..4a3fba1ef3a2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
struct flowi6 fl6;
struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
- unsigned int tot_len = sizeof(struct tcphdr);
+ unsigned int tot_len = 0;
struct dst_entry *dst;
__be32 *topt;
+ struct tcp_out_options extraopts;
if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
tot_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
+ rcu_read_lock();
+ if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
+ u8 extraflags = rst ? TCPHDR_RST : 0;
+
+ if (!rst || !th->ack)
+ extraflags |= TCPHDR_ACK;
+
+ memset(&extraopts, 0, sizeof(extraopts));
+
+ tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
+ &extraopts, sk);
+ }
+
+ tot_len += sizeof(struct tcphdr);
+
buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
GFP_ATOMIC);
- if (!buff)
+ if (!buff) {
+ rcu_read_unlock();
return;
+ }
skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
@@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
}
#endif
+ if (static_branch_unlikely(&tcp_extra_options_enabled))
+ tcp_extra_options_write(topt, &extraopts, sk);
+
+ rcu_read_unlock();
+
memset(&fl6, 0, sizeof(fl6));
fl6.daddr = ipv6_hdr(skb)->saddr;
fl6.saddr = ipv6_hdr(skb)->daddr;
--
2.14.2
[PATCH 00/18] Make TCP_MD5 adopt the extra-option framework
by Christoph Paasch
I've tried to split it into several patches.
Here is the overview:
Patch 1 to 5: Solely TCP-level changes to prepare the stack. Mostly cosmetic,
besides #1 and #5. #1 should be ok; #5 is a somewhat more fundamental
change, but I think it's still ok.
Patch 6 to 11: Mat's patch, together with some more changes to it. My changes
should be merged into Mat's patch, but I left them separate for now
so that it's clear what I had to change.
Patch 12 to 14: Changes to TCP_MD5 to enable it to opt in to the framework.
These should all be fairly straightforward.
Patch 15: Here, I move all the TCP_MD5 code to its own file.
Patch 16: I should probably fold this into patches 12 to 14.
Patch 17: Here, I start using the framework.
Patch 18: Cleanup thanks to the use of the framework.
In particular, patch 18 shows how the framework lets us clean the
TCP_MD5 changes out of TCP.
Some comments about the extra-option framework:
1) As already discussed, it would be good to have the callbacks on a per-socket
basis. Basically, I think the list should be in tcp_sock, where it would
simply be a list of structs:
struct tcp_extra_option {
struct list_head teo_list;
struct tcp_extra_option_ops *teo_ops;
};
Users of this framework would allocate these elements like this (e.g., TCP_MD5):
struct tcp_md5_extra_option {
struct tcp_extra_option tcp_md5_eo;
struct tcp_md5sig_info __rcu *md5sig_info;
};
This allows MD5 to store its state info in there. We could fit MPTCP nicely
into this.
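To make that concrete, the owner would recover its state from the list
element with the usual container_of() idiom (sketch based on the structs
above):

	static inline struct tcp_md5_extra_option *
	tcp_md5_from_option(struct tcp_extra_option *eo)
	{
		return container_of(eo, struct tcp_md5_extra_option,
				    tcp_md5_eo);
	}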
2)
I chose to leave the extra_options at the end of tcp_syn_options.
This, however, means that we have to undo what TCP timestamps did in tcp_syn_options.
Unsetting the flags is easy, but the nasty part is with:
+ /* Don't use TCP timestamps with TCP_MD5 */
+ if (opts->options & OPTION_TS)
+ ret -= TCPOLEN_TSTAMP_ALIGNED;
Yes, this is ugly. But from what I see, it's the only way if we want to fully
move TCP_MD5 out of TCP.
3) With request_socks now being full socks, the prepare and prepare_req callbacks
could be merged.
4) We also need this framework for time_wait_socks. So, in general, we could
have the callback list in struct sock_common as well; that way tcp_sock,
tcp_time_wait_sock, and request_socks all get it.
This implies that we need to initialize this in tcp_time_wait() and destroy it
in tcp_twsk_destructor(). That would allow us to remove even more MD5 code
out of TCP.
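A sketch of the destructor side, assuming the list lives in struct
sock_common under a made-up name skc_teo_list (nothing below is
implemented; freeing the per-option state would be up to the owning
module):

	void tcp_twsk_destructor(struct sock *sk)
	{
		struct tcp_extra_option *eo, *tmp;

		/* Drop each option's per-socket state when the
		 * timewait sock dies.
		 */
		list_for_each_entry_safe(eo, tmp,
					 &sk->__sk_common.skc_teo_list,
					 teo_list)
			list_del_rcu(&eo->teo_list);
	}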
I can work on these changes to the extra-option framework.
Any thoughts? :)
Christoph Paasch (17):
tcp: Write options after the header has been fully done
tcp: Pass sock to tcp_options_write instead of tcp_sock
tcp: Pass skb in tcp_options_write
tcp: Pass sock to tcp_synack_options
tcp6: Prepare for maximum TCP option-space
tcp_extra_option: add_header callback for TCP extra options
tcp_extra_option: Add missing static branch for
tcp_extra_options_prepare
tcp_extra_option: Use sock instead of tcp_sock in
tcp_extra_options_parse/prepare/write
tcp_extra_option: Pass skb to tcp_extra_options_write
tcp_extra_option: Pass sock to prepare_req
tcp_md5: Detect key inside tcp_v4_send_ack instead of passing it as an
argument
tcp_md5: Detect key inside tcp_v6_send_response instead of passing it
as an argument
tcp_md5: Check for TCP_MD5 after TCP Timestamps in
tcp_established_options
tcp: Move TCP-MD5 code out of TCP itself
tcp_md5: Don't pass along md5-key
tcp_md5: Use tcp_extra_options in output path
tcp_md5: Cleanup TCP-code
Mat Martineau (1):
tcp: Register handlers for extra TCP options
drivers/infiniband/hw/cxgb4/cm.c | 2 +-
include/linux/inet_diag.h | 1 +
include/linux/tcp.h | 19 +-
include/linux/tcp_md5.h | 109 ++++
include/net/tcp.h | 157 ++---
net/ipv4/Makefile | 1 +
net/ipv4/syncookies.c | 2 +-
net/ipv4/tcp.c | 219 +++----
net/ipv4/tcp_diag.c | 81 +--
net/ipv4/tcp_input.c | 54 +-
net/ipv4/tcp_ipv4.c | 496 +---------------
net/ipv4/tcp_md5.c | 1169 ++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_minisocks.c | 35 +-
net/ipv4/tcp_output.c | 139 ++---
net/ipv6/syncookies.c | 2 +-
net/ipv6/tcp_ipv6.c | 352 +-----------
16 files changed, 1560 insertions(+), 1278 deletions(-)
create mode 100644 include/linux/tcp_md5.h
create mode 100644 net/ipv4/tcp_md5.c
--
2.14.1
Status
by Christoph Paasch
Hello,
just wanted to chime in here.
I started working on porting TCP_MD5 to use the TCP extra-option framework
from Mat's branch.
It allows us to cleanly move the TCP_MD5 code out of the TCP data path.
There are some changes/extensions I needed to make to Mat's framework, but
nothing major. I will post the patches here on the list in the coming days.
I keep on moving mptcp_trunk upwards to track upstream Linux. Currently I'm
stuck at v4.9 (a nasty bug popped up with the merge and I haven't been
able to fix it yet).
The merge with v4.9 also forced me to bump skb->cb to 80 bytes... :/
I have been thinking back and forth on how we could handle this. The best
way I see at the moment is to create a scratch-area at the end of the skb's
data (like skb_shared_info). I think it would also fit quite nicely with a
KCM/ULP-style architecture, where we could have a BPF program that does the
scheduling.
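The rough layout I have in mind (purely a sketch, nothing implemented yet):

	skb->head ... skb->data ... skb->tail .......... skb->end
	                                | skb_shared_info | scratch area |

The scratch area would travel with the skb, like skb_shared_info does,
instead of growing the fixed-size skb->cb[].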
I haven't dived very deep into the skb->cb problem yet.
Anyways, at the moment I am focusing on fixing mptcp_trunk's merge with v4.9
and the TCP_MD5 cleanup (which I think would be of interest for netdev).
Christoph