--- protocol.c (9bc62afe03afdf33904f5e784e1ad68c50ff00bb)
+++ protocol.c (0d199e4363b482badcedba764e2aceab53a4a10a)
 // SPDX-License-Identifier: GPL-2.0
 /* Multipath TCP
  *
  * Copyright (c) 2017 - 2019, Intel Corporation.
  */

 #define pr_fmt(fmt) "MPTCP: " fmt

--- 1210 unchanged lines hidden ---

 {
 	struct sk_buff *skb;

 	skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
 	if (likely(skb)) {
 		if (likely(__mptcp_add_ext(skb, gfp))) {
 			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->reserved_tailroom = skb->end - skb->tail;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
 	} else {
 		mptcp_enter_memory_pressure(sk);
 	}
 	return NULL;
 }

-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
 	struct sk_buff *skb;

-	if (ssk->sk_tx_skb_cache) {
-		skb = ssk->sk_tx_skb_cache;
-		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
-			     !__mptcp_add_ext(skb, gfp)))
-			return false;
-		return true;
-	}
-
 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
 	if (!skb)
-		return false;
+		return NULL;

 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-		ssk->sk_tx_skb_cache = skb;
-		return true;
+		tcp_skb_entail(ssk, skb);
+		return skb;
 	}
 	kfree_skb(skb);
-	return false;
+	return NULL;
 }

-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
 {
 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;

 	if (unlikely(tcp_under_memory_pressure(sk))) {
 		if (data_lock_held)
 			__mptcp_mem_reclaim_partial(sk);
 		else
 			mptcp_mem_reclaim_partial(sk);

--- 13 unchanged lines hidden ---

 	mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
 }

 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct mptcp_data_frag *dfrag,
 			      struct mptcp_sendmsg_info *info)
 {
 	u64 data_seq = dfrag->data_seq + info->sent;
+	int offset = dfrag->offset + info->sent;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool zero_window_probe = false;
 	struct mptcp_ext *mpext = NULL;
-	struct sk_buff *skb, *tail;
-	bool must_collapse = false;
-	int size_bias = 0;
-	int avail_size;
-	size_t ret = 0;
+	bool can_coalesce = false;
+	bool reuse_skb = true;
+	struct sk_buff *skb;
+	size_t copy;
+	int i;

 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

+	if (WARN_ON_ONCE(info->sent > info->limit ||
+			 info->limit > dfrag->data_len))
+		return 0;
+
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
-	avail_size = info->size_goal;
+	copy = info->size_goal;
+
 	skb = tcp_write_queue_tail(ssk);
-	if (skb) {
+	if (skb && copy > skb->len) {
 		/* Limit the write to the size available in the
 		 * current skb, if any, so that we create at most a new skb.
 		 * Explicitly tells TCP internals to avoid collapsing on later
 		 * queue management operation, to avoid breaking the ext <->
 		 * SSN association set here
 		 */
 		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
 		if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
 			TCP_SKB_CB(skb)->eor = 1;
 			goto alloc_skb;
 		}

-		must_collapse = (info->size_goal > skb->len) &&
-				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
-		if (must_collapse) {
-			size_bias = skb->len;
-			avail_size = info->size_goal - skb->len;
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
+			tcp_mark_push(tcp_sk(ssk), skb);
+			goto alloc_skb;
 		}
-	}

+		copy -= skb->len;
+	} else {
 alloc_skb:
-	if (!must_collapse &&
-	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
-		return 0;
+		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+		if (!skb)
+			return -ENOMEM;
+
+		i = skb_shinfo(skb)->nr_frags;
+		reuse_skb = false;
+		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+	}

 	/* Zero window and all data acked? Probe. */
-	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
-	if (avail_size == 0) {
+	copy = mptcp_check_allowed_size(msk, data_seq, copy);
+	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);

-		if (skb || snd_una != msk->snd_nxt)
+		if (snd_una != msk->snd_nxt) {
+			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 			return 0;
+		}
+
 		zero_window_probe = true;
 		data_seq = snd_una - 1;
-		avail_size = 1;
-	}
+		copy = 1;

-	if (WARN_ON_ONCE(info->sent > info->limit ||
-			 info->limit > dfrag->data_len))
-		return 0;
+		/* all mptcp-level data is acked, no skbs should be present into the
+		 * ssk write queue
+		 */
+		WARN_ON_ONCE(reuse_skb);
+	}

-	ret = info->limit - info->sent;
-	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
-			      dfrag->page, dfrag->offset + info->sent, &ret);
-	if (!tail) {
-		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+	copy = min_t(size_t, copy, info->limit - info->sent);
+	if (!sk_wmem_schedule(ssk, copy)) {
+		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 		return -ENOMEM;
 	}

-	/* if the tail skb is still the cached one, collapsing really happened.
-	 */
-	if (skb == tail) {
-		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
-		mpext->data_len += ret;
+	if (can_coalesce) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+	} else {
+		get_page(dfrag->page);
+		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+	}
+
+	skb->len += copy;
+	skb->data_len += copy;
+	skb->truesize += copy;
+	sk_wmem_queued_add(ssk, copy);
+	sk_mem_charge(ssk, copy);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+	TCP_SKB_CB(skb)->end_seq += copy;
+	tcp_skb_pcount_set(skb, 0);
+
+	/* on skb reuse we just need to update the DSS len */
+	if (reuse_skb) {
+		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+		mpext->data_len += copy;
 		WARN_ON_ONCE(zero_window_probe);
 		goto out;
 	}

-	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
-	if (WARN_ON_ONCE(!mpext)) {
-		/* should never reach here, stream corrupted */
-		return -EINVAL;
-	}
-
 	memset(mpext, 0, sizeof(*mpext));
 	mpext->data_seq = data_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
-	mpext->data_len = ret;
+	mpext->data_len = copy;
 	mpext->use_map = 1;
 	mpext->dsn64 = 1;

 	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
 		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
 		 mpext->dsn64);

 	if (zero_window_probe) {
-		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
 		mpext->frozen = 1;
 		if (READ_ONCE(msk->csum_enabled))
-			mptcp_update_data_checksum(tail, ret);
+			mptcp_update_data_checksum(skb, copy);
 		tcp_push_pending_frames(ssk);
 		return 0;
 	}
 out:
 	if (READ_ONCE(msk->csum_enabled))
-		mptcp_update_data_checksum(tail, ret);
-	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-	return ret;
+		mptcp_update_data_checksum(skb, copy);
+	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+	return copy;
 }

 #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
 					 sizeof(struct tcphdr) - \
 					 MAX_TCP_OPTION_SPACE - \
 					 sizeof(struct ipv6hdr) - \
 					 sizeof(struct frag_hdr))

--- 100 unchanged lines hidden ---

 static void mptcp_push_release(struct sock *sk, struct sock *ssk,
 			       struct mptcp_sendmsg_info *info)
 {
 	tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
 	release_sock(ssk);
 }

+static void mptcp_update_post_push(struct mptcp_sock *msk,
+				   struct mptcp_data_frag *dfrag,
+				   u32 sent)
+{
+	u64 snd_nxt_new = dfrag->data_seq;
+
+	dfrag->already_sent += sent;
+
+	msk->snd_burst -= sent;
+	msk->tx_pending_data -= sent;
+
+	snd_nxt_new += dfrag->already_sent;
+
+	/* snd_nxt_new can be smaller than snd_nxt in case mptcp
+	 * is recovering after a failover. In that event, this re-sends
+	 * old segments.
+	 *
+	 * Thus compute snd_nxt_new candidate based on
+	 * the dfrag->data_seq that was sent and the data
+	 * that has been handed to the subflow for transmission
+	 * and skip update in case it was old dfrag.
+	 */
+	if (likely(after64(snd_nxt_new, msk->snd_nxt)))
+		msk->snd_nxt = snd_nxt_new;
+}
+
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
 	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_sendmsg_info info = {
 		.flags = flags,
 	};
 	struct mptcp_data_frag *dfrag;

--- 27 unchanged lines hidden ---

 			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
 			if (ret <= 0) {
 				mptcp_push_release(sk, ssk, &info);
 				goto out;
 			}

 			info.sent += ret;
-			dfrag->already_sent += ret;
-			msk->snd_nxt += ret;
-			msk->snd_burst -= ret;
-			msk->tx_pending_data -= ret;
 			copied += ret;
 			len -= ret;
+
+			mptcp_update_post_push(msk, dfrag, ret);
 		}
 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 	}

 	/* at this point we held the socket lock for the last subflow we used */
 	if (ssk)
 		mptcp_push_release(sk, ssk, &info);

--- 36 unchanged lines hidden ---

 			goto out;
 		}

 		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
 		if (ret <= 0)
 			goto out;

 		info.sent += ret;
-		dfrag->already_sent += ret;
-		msk->snd_nxt += ret;
-		msk->snd_burst -= ret;
-		msk->tx_pending_data -= ret;
 		copied += ret;
 		len -= ret;
 		first = false;
+
+		mptcp_update_post_push(msk, dfrag, ret);
 	}
 	WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 }

 out:
 	/* __mptcp_alloc_tx_skb could have released some wmem and we are
 	 * not going to flush it via release_sock()
 	 */

--- 581 unchanged lines hidden ---

 	mptcp_data_lock(sk);
 	__mptcp_clean_una_wakeup(sk);
 	rtx_head = mptcp_rtx_head(sk);
 	if (!rtx_head) {
 		mptcp_data_unlock(sk);
 		return false;
 	}

-	/* will accept ack for reinjected data before re-sending them */
-	if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
-		msk->recovery_snd_nxt = msk->snd_nxt;
+	msk->recovery_snd_nxt = msk->snd_nxt;
 	msk->recovery = true;
 	mptcp_data_unlock(sk);

 	msk->first_pending = rtx_head;
 	msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
-	msk->snd_nxt = rtx_head->data_seq;
 	msk->snd_burst = 0;

 	/* be sure to clear the "sent status" on all re-injected fragments */
 	list_for_each_entry(cur, &msk->rtx_queue, list) {
 		if (!cur->already_sent)
 			break;
 		cur->already_sent = 0;
 	}

--- 1367 unchanged lines hidden ---