LPC networking CfP: abstract: draft
by Matthieu Baerts
Hello,
Yesterday, we discussed options for a Linux Plumber's Conference
networking track talk.
Here is a first draft of an abstract. Feel free to comment and react:
===
Multipath TCP (MPTCP) is more and more popular these days but it is not
in the upstream Linux kernel yet. A fork is still being maintained on
the side and has been since March 2009. But it cannot be upstreamed as
it is because this implementation is designed for MPTCP and the TCP
stack is too heavily impacted in terms of maintainability and, to a
lesser extent, performance.
In this presentation, we would like to present the challenges we are
facing. Some are introduced by the MPTCP protocol itself, others by
objectives we defined: keep the impact on the existing TCP stack to a
minimum.
We would like to have no performance regression, a maintainable and
configurable solution and an MPTCP implementation that can be used in a
variety of deployments.
The MPTCP upstreaming community is working on an RFC patch set for
net-next. We should be able to send it before the next LPC in September.
In the current situation, a socket can be created with IPPROTO_MPTCP to
initiate and accept an MPTCP connection. This socket remains compatible
with regular TCP and IPPROTO_TCP socket behavior is unchanged. This
implementation makes use of ULP between the userspace-facing MPTCP
socket and the set of in-kernel TCP sockets it controls, to minimize the
impact on the current TCP stack. ULP has been extended for use
with listening sockets. skb_ext is used to carry MPTCP metadata.
Both the communication and the code are public and open. You can find
us at mptcp(a)lists.01.org and https://is.gd/mptcp_upstream
===
Do not hesitate to improve it, fix typos or restart from scratch if
needed, I don't mind!
Cheers,
Matt
--
Matthieu Baerts | R&D Engineer
matthieu.baerts(a)tessares.net
Tessares SA | Hybrid Access Solutions
www.tessares.net
1 Avenue Jean Monnet, 1348 Louvain-la-Neuve, Belgium
3 years
[PATCH] mptcp: Make MPTCP socket block/wakeup ignore sk_receive_queue
by Mat Martineau
The MPTCP-level socket doesn't use sk_receive_queue, so it was possible
for mptcp_recvmsg() to remain blocked when there was data ready for it
to read. When the MPTCP socket is waiting for additional data and it
releases the subflow socket lock, the subflow may have incoming packets
ready to process and it sometimes called subflow_data_ready() before the
MPTCP socket called sk_wait_data().
This change adds new functions for the MPTCP socket to use to wait and
to signal that data is ready. Atomic bitops are used to set, test, and
clear an MPTCP socket flag that indicates waiting subflow data. This flag
replaces the sk_receive_queue checks used by other socket types.
Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
---
Squashing this into "mptcp: Implement MPTCP receive path" has a few
conflicts with later commits that also fix up the receive code. It's
fine to add this to the end of the commit chain.
net/mptcp/protocol.c | 40 +++++++++++++++++++++++++++++++++++++++-
net/mptcp/protocol.h | 4 ++++
2 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 774ed25d3b6d..5555ee1529bb 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -357,6 +357,26 @@ static enum mapping_status mptcp_get_mapping(struct sock *ssk)
return ret;
}
+static void mptcp_wait_data(struct sock *sk, long *timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+
+ release_sock(sk);
+
+ if (!test_and_clear_bit(MPTCP_DATA_READY, &msk->flags))
+ *timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, *timeo);
+
+ sched_annotate_sleep();
+ lock_sock(sk);
+
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
@@ -403,6 +423,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
u64 old_ack;
u32 ssn;
+ clear_bit(MPTCP_DATA_READY, &msk->flags);
status = mptcp_get_mapping(ssk);
if (status == MAPPING_ADDED) {
@@ -536,7 +557,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
pr_debug("block");
release_sock(ssk);
- sk_wait_data(sk, &timeo, NULL);
+ mptcp_wait_data(sk, &timeo);
lock_sock(ssk);
}
@@ -548,6 +569,22 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
return copied;
}
+static void mptcp_data_ready(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct socket_wq *wq;
+
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+ EPOLLRDNORM | EPOLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
static int mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -555,6 +592,7 @@ static int mptcp_init_sock(struct sock *sk)
pr_debug("msk=%p", msk);
INIT_LIST_HEAD(&msk->conn_list);
+ sk->sk_data_ready = mptcp_data_ready;
return 0;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 7f15f6aab93d..ca5e6d839575 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -71,6 +71,9 @@
#define MPTCP_ADDR_IPVERSION_4 4
#define MPTCP_ADDR_IPVERSION_6 6
+/* MPTCP socket flags */
+#define MPTCP_DATA_READY BIT(0)
+
static inline u32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
@@ -97,6 +100,7 @@ struct mptcp_sock {
u64 write_seq;
u64 ack_seq;
u32 token;
+ unsigned long flags;
struct list_head conn_list;
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct pm_data pm;
--
2.22.0
3 years
KSelftests: errors last night (20190604)
by Matthieu Baerts
Hello,
Last night, I got an error with one test:
(...)
# ns2 TCP -> ns3 (10.0.2.2:10029) MPTCP [ OK ]
# ns2 MPTCP -> ns3 (10.0.3.2:10030) MPTCP [ OK ]
# ns2 MPTCP -> ns3 (10.0.3.2:10031) TCP [ OK ]
# ns2 TCP -> ns3 (10.0.3.2:10032) MPTCP [ OK ]
# ns2 MPTCP -> ns4 (10.0.3.1:10033) MPTCP main_loop_s: timed out
# copyfd_io: poll timed out (events: POLLIN 1, POLLOUT 4)
# [ FAIL ] client exit code 2, server 2
# \nnetns ns4 socket stat for 10033:
# State Recv-Q Send-Q Local Address:Port Peer
Address:Port
# SYN-RECV 0 0 10.0.3.1:10033
10.0.2.1:37248 timer:(on,5.020ms,3) cwnd:2 retrans:3/0 reordering:0
# \nnetns ns2 socket stat for 10033:
# State Recv-Q Send-Q Local Address:Port Peer
Address:Port
# FIN-WAIT-1 0 49153 10.0.2.1:37248
10.0.3.1:10033 timer:(on,3.280ms,5) rto:6.75 cwnd:1 ssthresh:7
retrans:5/0 reordering:0
# ns3 MPTCP -> ns1 (10.0.1.1:10034) MPTCP [ OK ]
# ns3 MPTCP -> ns1 (10.0.1.1:10035) TCP [ OK ]
# ns3 TCP -> ns1 (10.0.1.1:10036) MPTCP [ OK ]
# ns3 MPTCP -> ns2 (10.0.1.2:10037) MPTCP [ OK ]
(...)
The same tests were running fine just before without KASAN and
PROVE_LOCKING.
I just re-launched the tests and I no longer have this error.
Did anyone else see this error?
Cheers,
Matt
--
Matthieu Baerts | R&D Engineer
matthieu.baerts(a)tessares.net
Tessares SA | Hybrid Access Solutions
www.tessares.net
1 Avenue Jean Monnet, 1348 Louvain-la-Neuve, Belgium
3 years, 1 month
LPC: Registration will re-open June 30th
by Matthieu Baerts
Hello,
As we previously discussed, it could be nice to go to LPC to present
the current status and have face-to-face meetings. Don't forget that
registration will re-open in a few days:
http://www.cvent.com/d/s6q3j2/4W
Cheers,
Matt
--
Matthieu Baerts | R&D Engineer
matthieu.baerts(a)tessares.net
Tessares SA | Hybrid Access Solutions
www.tessares.net
1 Avenue Jean Monnet, 1348 Louvain-la-Neuve, Belgium
3 years, 1 month
[RFC PATCH 0/3] Change sock->sk_protocol to a 16-bit value
by Mat Martineau
While debugging socket state transitions in MPTCP sockets, I found that
MPTCP sockets were listing IPPROTO_TCP as their protocol in
/sys/kernel/debug/tracing/trace. sock->sk_protocol was only 8 bits wide,
truncating the new value of IPPROTO_MPTCP (0x0106) to IPPROTO_TCP
(0x06).
The networking code has varying integer widths for 'protocol' at
different layers:
* POSIX socket API: 32 bits
* sk_buff->protocol: 16 bits
* IP header (on the wire): 8 bits
MPTCP shows a use for protocol values outside those that fit in an IP
header. The change to struct sock fills an 8-bit hole, so there is no
change in the size of the structure.
Given that we are currently discussing the appropriate value for
IPPROTO_MPTCP, I'm sending this as an RFC to inform those discussions. I
had previously thought that the 16-bit value for IPPROTO_MPTCP was
compatible with the existing code base.
Mat Martineau (3):
net: Make sock protocol value checks more specific
sock: Make sk_protocol a 16-bit value
net: Add IPPROTO_MPTCP to inet_sock_set_state tracepoint output
include/net/sock.h | 5 ++---
include/trace/events/sock.h | 5 +++--
net/ax25/af_ax25.c | 2 +-
net/decnet/af_decnet.c | 2 +-
4 files changed, 7 insertions(+), 7 deletions(-)
--
2.22.0
3 years, 1 month
[Weekly meetings] MoM - 27th of June 2019
by Matthieu Baerts
Hello,
We just had our 56th meeting with Mat, Peter and Ossama (Intel OTC),
Christoph (Apple), Davide and Florian (Red Hat) and myself (Tessares).
Thanks again for this new good meeting!
Here are the minutes of the meeting:
Accepted patches:
- mptcp: fix remaining checkpatch issue:
- by Matth
- reviewed by Mat
- "squashed" in "mptcp: Write MPTCP DSS headers to outgoing data
packets"
- no signed-off added for this fix
- mptcp: move MPTCP option bits to internal header:
- by Matth
- reviewed by Mat
- "squashed" in 3 different commits, no signed-off
- mptcp: Re-factor mptcp_create_subflow():
- by Peter
- reviewed by Matth
- "squashed" in "mptcp: Associate MPTCP context with TCP socket"
Pending patches:
- mptcp: simplify crypto.c:
- by Davide
- reviewed by Mat, Florian and Christoph
- we can have something just random
- maybe later we can switch to a hash as an optimisation
- Change sock->sk_protocol to a 16-bit value:
- by Mat
- for the discussions with IPPROTO_MPTCP, see below
- decision: we apply this
- mptcp: Make MPTCP socket block/wakeup ignore sk_receive_queue:
- by Mat
- linked to Mat's work on the Data FIN.
- it was blocked while it should not be.
- we cannot simply check the end of the received queue with
MPTCP. That's why the behaviour needs to be different with MPTCP
- feel free to review
- can be squashed or added at the end:
- if it is a fix for a bug introduced in a previous commit,
better to squash (except if it is to explicitly show something
particular to MPTCP of course)
- we can squash
IPPROTO_MPTCP:
- Mat: sock->sk_protocol to a 16-bit value (increases a handful of
array sizes)
- Hoang: #define SOL_X25 262
- Mat: we might want to set SOL_MPTCP at some point:
- could be good to avoid collisions.
- Maybe good to merge the patch and wait for feedback later.
Feedback from netconf:
- slides at http://vger.kernel.org/netconf2019.html
- no show stopper foreseen by anyone
- got one question wrt. using kTLS with MPTCP (both use ULP
infrastructure), we should have a look at this but not a major issue for
now (stacked ulp...?)
- Eric asked about diag support, Davide already working on this
- diag ulp infra should be upstreamed independently (for ktls)
- one question was about path management, no objections to us adding
something very simple plus the genetlink based one to place decision
making in userspace
- one concern is wrt. local security holes, we can ask syzkaller
people to start also running on the mptcp tree once we get ready to
upstream, or initially restrict IPPROTO_MPTCP to init_user root to limit
impact (or both).
- no need to implement mptcp-level (coupled) congestion control on
top of subflows (i.e., it's fine to use more bandwidth than one standard
tcp flow)
Send LPC proposal:
- maybe go more into the initial features and roadmap, comparison with
mptcp.org, client/server view, etc.
- could be good to send it early next week.
- Mat will look at the draft
mptcp.org:
- support MPTCP v1 seems problematic (when using v0, default
behaviour) but almost there
Next meeting:
- We propose to *skip* the next one (4th of July). Next one would
then be the 11th of July.
- Usual time: 16:00 UTC (9am PDT, 6pm CEST)
- Still open to everyone!
- https://annuel2.framapad.org/p/mptcp_upstreaming_20190711
Feel free to comment on these points and propose new ones for the next
meeting!
Talk to you next week,
Matthieu
--
Matthieu Baerts | R&D Engineer
matthieu.baerts(a)tessares.net
Tessares SA | Hybrid Access Solutions
www.tessares.net
1 Avenue Jean Monnet, 1348 Louvain-la-Neuve, Belgium
3 years, 1 month
[PATCH 0/2] Change IPPROTO_MPTCP
by Mat Martineau
Instead of using 99 for IPPROTO_MPTCP (which IANA defines as "any
private encryption scheme"), use 262 (0x100 | IPPROTO_TCP). The MPTCP
self tests continue to run successfully with these changes.
Earlier in development we used 262 for IPPROTO_SUBFLOW as it would get
truncated to 0x06 (IPPROTO_TCP) in the IP header. Now that
IPPROTO_SUBFLOW has been removed in favor of using ULP, 262 is freed up
for MPTCP.
Note that this does change the value of IPPROTO_MAX, and in reviewing
the occurrences of that around the kernel most uses look fine.
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c has one instance of IPPROTO_MAX
where its meaning is not immediately obvious.
Mat Martineau (2):
squash-to: Define IPPROTO_MPTCP
squash-to: add basic kselftest program
include/uapi/linux/in.h | 4 ++--
tools/include/uapi/linux/in.h | 2 ++
tools/testing/selftests/net/mptcp/mptcp_connect.c | 2 +-
3 files changed, 5 insertions(+), 3 deletions(-)
--
2.22.0
3 years, 1 month
[PATCH] mptcp: Re-factor mptcp_create_subflow()
by Peter Krystad
Move to subflow.c (because it belongs there) and rename.
Outgoing MP_JOIN requests will use this routine to create
subflow sockets.
squashto: Associate MPTCP context with TCP socket
Signed-off-by: Peter Krystad <peter.krystad(a)linux.intel.com>
---
net/mptcp/protocol.c | 33 +++------------------------------
net/mptcp/protocol.h | 1 +
net/mptcp/subflow.c | 25 +++++++++++++++++++++++++
3 files changed, 29 insertions(+), 30 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 775510bdbd4c..f2909457ef31 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -817,33 +817,6 @@ static struct proto mptcp_prot = {
.no_autobind = 1,
};
-static int mptcp_subflow_create(struct sock *sk)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct net *net = sock_net(sk);
- struct socket *sf;
- int err;
-
- pr_debug("msk=%p", msk);
- err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf);
- if (!err) {
- lock_sock(sf->sk);
- err = tcp_set_ulp(sf->sk, "mptcp");
- release_sock(sf->sk);
- if (!err) {
- struct subflow_context *subflow = subflow_ctx(sf->sk);
-
- pr_debug("subflow=%p", subflow);
- msk->subflow = sf;
- subflow->conn = sk;
- subflow->request_mptcp = 1; // @@ if MPTCP enabled
- subflow->request_cksum = 1; // @@ if checksum enabled
- subflow->version = 0;
- }
- }
- return err;
-}
-
static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
@@ -855,7 +828,7 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return err;
if (!msk->subflow) {
- err = mptcp_subflow_create(sock->sk);
+ err = subflow_create_socket(sock->sk, &msk->subflow);
if (err)
return err;
}
@@ -874,7 +847,7 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
return err;
if (!msk->subflow) {
- err = mptcp_subflow_create(sock->sk);
+ err = subflow_create_socket(sock->sk, &msk->subflow);
if (err)
return err;
}
@@ -933,7 +906,7 @@ static int mptcp_listen(struct socket *sock, int backlog)
pr_debug("msk=%p", msk);
if (!msk->subflow) {
- err = mptcp_subflow_create(sock->sk);
+ err = subflow_create_socket(sock->sk, &msk->subflow);
if (err)
return err;
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index ffc2452b4e77..e081b48d3f0d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -201,6 +201,7 @@ mptcp_subflow_tcp_socket(const struct subflow_context *subflow)
}
void subflow_init(void);
+int subflow_create_socket(struct sock *sk, struct socket **new_sock);
extern const struct inet_connection_sock_af_ops ipv4_specific;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index a858cc966724..1236739425bc 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -201,6 +201,31 @@ static void subflow_data_ready(struct sock *sk)
}
}
+int subflow_create_socket(struct sock *sk, struct socket **new_sock)
+{
+ struct net *net = sock_net(sk);
+ struct socket *sf;
+ int err;
+
+ err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sf);
+ if (!err) {
+ lock_sock(sf->sk);
+ err = tcp_set_ulp(sf->sk, "mptcp");
+ release_sock(sf->sk);
+ if (!err) {
+ struct subflow_context *subflow = subflow_ctx(sf->sk);
+
+ pr_debug("subflow=%p", subflow);
+ *new_sock = sf;
+ subflow->conn = sk;
+ subflow->request_mptcp = 1; // @@ if MPTCP enabled
+ subflow->request_cksum = 1; // @@ if checksum enabled
+ subflow->version = 0;
+ }
+ }
+ return err;
+}
+
static struct subflow_context *subflow_create_ctx(struct sock *sk,
struct socket *sock,
gfp_t priority)
--
2.17.2
3 years, 1 month
[PATCH 0/3] mptcp: move MPTCP option bits to internal header
by Matthieu Baerts
All these OPTION_MPTCP_* are only used in files from net/mptcp.
There are 3 commits to ease the squash.
Matthieu Baerts (3):
mptcp: move MPTCP option bits to internal header
mptcp: move MPTCP option bits to internal header
mptcp: move MPTCP option bits to internal header
include/net/mptcp.h | 11 -----------
net/mptcp/protocol.h | 12 ++++++++++++
2 files changed, 12 insertions(+), 11 deletions(-)
--
2.20.1
3 years, 1 month