--- protocol.c (9bc62afe03afdf33904f5e784e1ad68c50ff00bb)
+++ protocol.c (0d199e4363b482badcedba764e2aceab53a4a10a)
 // SPDX-License-Identifier: GPL-2.0
 /* Multipath TCP
  *
  * Copyright (c) 2017 - 2019, Intel Corporation.
  */

 #define pr_fmt(fmt) "MPTCP: " fmt

--- 1210 unchanged lines hidden ---
 {
         struct sk_buff *skb;

         skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
         if (likely(skb)) {
                 if (likely(__mptcp_add_ext(skb, gfp))) {
                         skb_reserve(skb, MAX_TCP_HEADER);
                         skb->reserved_tailroom = skb->end - skb->tail;
+                        INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
                         return skb;
                 }
                 __kfree_skb(skb);
         } else {
                 mptcp_enter_memory_pressure(sk);
         }
         return NULL;
 }

-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
         struct sk_buff *skb;

-        if (ssk->sk_tx_skb_cache) {
-                skb = ssk->sk_tx_skb_cache;
-                if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
-                             !__mptcp_add_ext(skb, gfp)))
-                        return false;
-                return true;
-        }
-
         skb = __mptcp_do_alloc_tx_skb(sk, gfp);
         if (!skb)
-                return false;
+                return NULL;

         if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-                ssk->sk_tx_skb_cache = skb;
-                return true;
+                tcp_skb_entail(ssk, skb);
+                return skb;
         }
         kfree_skb(skb);
-        return false;
+        return NULL;
 }

-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
 {
         gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;

         if (unlikely(tcp_under_memory_pressure(sk))) {
                 if (data_lock_held)
                         __mptcp_mem_reclaim_partial(sk);
                 else
                         mptcp_mem_reclaim_partial(sk);
--- 13 unchanged lines hidden ---
         mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset));
 }

 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
                               struct mptcp_data_frag *dfrag,
                               struct mptcp_sendmsg_info *info)
 {
         u64 data_seq = dfrag->data_seq + info->sent;
+        int offset = dfrag->offset + info->sent;
         struct mptcp_sock *msk = mptcp_sk(sk);
         bool zero_window_probe = false;
         struct mptcp_ext *mpext = NULL;
-        struct sk_buff *skb, *tail;
-        bool must_collapse = false;
-        int size_bias = 0;
-        int avail_size;
-        size_t ret = 0;
+        bool can_coalesce = false;
+        bool reuse_skb = true;
+        struct sk_buff *skb;
+        size_t copy;
+        int i;

         pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
                  msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

+        if (WARN_ON_ONCE(info->sent > info->limit ||
+                         info->limit > dfrag->data_len))
+                return 0;
+
         /* compute send limit */
         info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
-        avail_size = info->size_goal;
+        copy = info->size_goal;
+
         skb = tcp_write_queue_tail(ssk);
-        if (skb) {
+        if (skb && copy > skb->len) {
                 /* Limit the write to the size available in the
                  * current skb, if any, so that we create at most a new skb.
                  * Explicitly tells TCP internals to avoid collapsing on later
                  * queue management operation, to avoid breaking the ext <->
                  * SSN association set here
                  */
                 mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
                 if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) {
                         TCP_SKB_CB(skb)->eor = 1;
                         goto alloc_skb;
                 }

-                must_collapse = (info->size_goal > skb->len) &&
-                                (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
-                if (must_collapse) {
-                        size_bias = skb->len;
-                        avail_size = info->size_goal - skb->len;
+                i = skb_shinfo(skb)->nr_frags;
+                can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+                if (!can_coalesce && i >= sysctl_max_skb_frags) {
+                        tcp_mark_push(tcp_sk(ssk), skb);
+                        goto alloc_skb;
                 }
-        }

+                copy -= skb->len;
+        } else {
 alloc_skb:
-        if (!must_collapse &&
-            !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
-                return 0;
+                skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+                if (!skb)
+                        return -ENOMEM;

+                i = skb_shinfo(skb)->nr_frags;
+                reuse_skb = false;
+                mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+        }
+
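Note on the hunk above: the rewrite drops the tcp_build_frag()-based
"must_collapse" path. mptcp_sendmsg_frag() now decides by itself whether the
dfrag page can extend the last page fragment of the tail skb via
skb_can_coalesce(), marks the tail for push and falls through to a fresh
allocation when no fragment slot is left, and shares that allocation path
with the empty-queue case through the "} else {" branch. skb_can_coalesce()
is at heart an abutment test; the following stand-alone sketch shows that
test with simplified, illustrative types (struct frag and can_coalesce() are
stand-ins, not the kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative reduction: a page fragment is (page, offset, size);
     * new data can be merged into it only when it lives on the same page
     * and starts exactly where the fragment ends.
     */
    struct frag {
        const char *page;
        unsigned int off;
        unsigned int size;
    };

    static bool can_coalesce(const struct frag *last, const char *page,
                             unsigned int off)
    {
        return page == last->page && off == last->off + last->size;
    }

    int main(void)
    {
        static char page[4096];
        struct frag last = { page, 0, 512 };

        /* contiguous: grow the fragment (skb_frag_size_add() later on) */
        printf("abuts: %d\n", can_coalesce(&last, page, 512));
        /* gap: a new slot is needed (skb_fill_page_desc() later on) */
        printf("gap:   %d\n", can_coalesce(&last, page, 1024));
        return 0;
    }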
         /* Zero window and all data acked? Probe. */
-        avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
-        if (avail_size == 0) {
+        copy = mptcp_check_allowed_size(msk, data_seq, copy);
+        if (copy == 0) {
                 u64 snd_una = READ_ONCE(msk->snd_una);

-                if (skb || snd_una != msk->snd_nxt)
+                if (snd_una != msk->snd_nxt) {
+                        tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
                         return 0;
+                }
+
                 zero_window_probe = true;
                 data_seq = snd_una - 1;
-                avail_size = 1;
+                copy = 1;
+
+                /* all mptcp-level data is acked, no skbs should be present into the
+                 * ssk write queue
+                 */
+                WARN_ON_ONCE(reuse_skb);
         }

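Note on the zero-window probe above: mptcp_check_allowed_size() clamps the
write to the MPTCP-level send window. When the window is closed (copy == 0)
and every byte handed to the subflows has been acked (snd_una == snd_nxt),
the code sends a one-byte probe at snd_una - 1, re-sending an already-acked
byte so that the peer's ACK reports a reopened window. A user-space sketch
of that decision follows; allowed_size() is an illustrative stand-in for
mptcp_check_allowed_size(), not its actual logic:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in window clamp: how much of "want" fits before wnd_end. */
    static uint64_t allowed_size(uint64_t wnd_end, uint64_t data_seq,
                                 uint64_t want)
    {
        if (data_seq >= wnd_end)
            return 0;
        if (wnd_end - data_seq < want)
            return wnd_end - data_seq;
        return want;
    }

    int main(void)
    {
        uint64_t snd_una = 1000, snd_nxt = 1000; /* all data acked */
        uint64_t wnd_end = 1000;                 /* window is closed */
        uint64_t copy = allowed_size(wnd_end, snd_nxt, 1460);

        if (copy == 0 && snd_una == snd_nxt)
            /* one already-acked byte, no new data consumed */
            printf("probe data_seq=%llu copy=1\n",
                   (unsigned long long)(snd_una - 1));
        return 0;
    }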
-        if (WARN_ON_ONCE(info->sent > info->limit ||
-                         info->limit > dfrag->data_len))
-                return 0;
-
-        ret = info->limit - info->sent;
-        tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
-                              dfrag->page, dfrag->offset + info->sent, &ret);
-        if (!tail) {
-                tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+        copy = min_t(size_t, copy, info->limit - info->sent);
+        if (!sk_wmem_schedule(ssk, copy)) {
+                tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
                 return -ENOMEM;
         }

-        /* if the tail skb is still the cached one, collapsing really happened.
-         */
-        if (skb == tail) {
-                TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
-                mpext->data_len += ret;
+        if (can_coalesce) {
+                skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+        } else {
+                get_page(dfrag->page);
+                skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+        }
+
+        skb->len += copy;
+        skb->data_len += copy;
+        skb->truesize += copy;
+        sk_wmem_queued_add(ssk, copy);
+        sk_mem_charge(ssk, copy);
+        skb->ip_summed = CHECKSUM_PARTIAL;
+        WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+        TCP_SKB_CB(skb)->end_seq += copy;
+        tcp_skb_pcount_set(skb, 0);
+
+        /* on skb reuse we just need to update the DSS len */
+        if (reuse_skb) {
+                TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+                mpext->data_len += copy;
                 WARN_ON_ONCE(zero_window_probe);
                 goto out;
         }

-        mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
-        if (WARN_ON_ONCE(!mpext)) {
-                /* should never reach here, stream corrupted */
-                return -EINVAL;
-        }
-
         memset(mpext, 0, sizeof(*mpext));
         mpext->data_seq = data_seq;
         mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
-        mpext->data_len = ret;
+        mpext->data_len = copy;
         mpext->use_map = 1;
         mpext->dsn64 = 1;

         pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
                  mpext->data_seq, mpext->subflow_seq, mpext->data_len,
                  mpext->dsn64);

         if (zero_window_probe) {
-                mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+                mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
                 mpext->frozen = 1;
                 if (READ_ONCE(msk->csum_enabled))
-                        mptcp_update_data_checksum(tail, ret);
+                        mptcp_update_data_checksum(skb, copy);
                 tcp_push_pending_frames(ssk);
                 return 0;
         }
 out:
         if (READ_ONCE(msk->csum_enabled))
-                mptcp_update_data_checksum(tail, ret);
-        mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-        return ret;
+                mptcp_update_data_checksum(skb, copy);
+        mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+        return copy;
 }

 #define MPTCP_SEND_BURST_SIZE           ((1 << 16) - \
                                          sizeof(struct tcphdr) - \
                                          MAX_TCP_OPTION_SPACE - \
                                          sizeof(struct ipv6hdr) - \
                                          sizeof(struct frag_hdr))

--- 100 unchanged lines hidden ---

 static void mptcp_push_release(struct sock *sk, struct sock *ssk,
                                struct mptcp_sendmsg_info *info)
 {
         tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal);
         release_sock(ssk);
 }

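Note on the mapping set up in mptcp_sendmsg_frag() above: each chunk spooled
to a subflow carries a DSS mapping in its MPTCP skb extension. data_seq
places the chunk in the MPTCP-level stream, subflow_seq in the
subflow-relative sequence space, and data_len gives its length; when an
existing skb is reused, only data_len grows, so a single mapping covers the
whole coalesced run. A toy model of that bookkeeping (struct dss_map is an
illustrative reduction of struct mptcp_ext, not the kernel layout):

    #include <stdint.h>
    #include <stdio.h>

    struct dss_map {
        uint64_t data_seq;    /* MPTCP-level seq of the first byte */
        uint32_t subflow_seq; /* subflow-relative seq of that byte */
        uint32_t data_len;    /* bytes covered by this mapping */
    };

    int main(void)
    {
        uint64_t rel_write_seq = 1; /* subflow-relative write cursor */
        struct dss_map m = { 10000, (uint32_t)rel_write_seq, 1460 };

        rel_write_seq += m.data_len; /* rel_write_seq += copy above */

        /* a second chunk lands in the same skb: extend the mapping only */
        m.data_len += 1000;
        rel_write_seq += 1000;

        printf("data_seq=%llu subflow_seq=%u data_len=%u next=%llu\n",
               (unsigned long long)m.data_seq, m.subflow_seq,
               m.data_len, (unsigned long long)rel_write_seq);
        return 0;
    }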
+static void mptcp_update_post_push(struct mptcp_sock *msk,
+                                   struct mptcp_data_frag *dfrag,
+                                   u32 sent)
+{
+        u64 snd_nxt_new = dfrag->data_seq;
+
+        dfrag->already_sent += sent;
+
+        msk->snd_burst -= sent;
+        msk->tx_pending_data -= sent;
+
+        snd_nxt_new += dfrag->already_sent;
+
+        /* snd_nxt_new can be smaller than snd_nxt in case mptcp
+         * is recovering after a failover. In that event, this re-sends
+         * old segments.
+         *
+         * Thus compute snd_nxt_new candidate based on
+         * the dfrag->data_seq that was sent and the data
+         * that has been handed to the subflow for transmission
+         * and skip update in case it was old dfrag.
+         */
+        if (likely(after64(snd_nxt_new, msk->snd_nxt)))
+                msk->snd_nxt = snd_nxt_new;
+}
+
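Note on mptcp_update_post_push() above: during recovery after a failover the
push path re-sends old dfrags, so the candidate computed from
dfrag->data_seq + dfrag->already_sent can lag the current msk->snd_nxt; the
after64() guard (a wrap-safe 64-bit "later than" comparison) skips the
update in that case. A runnable demonstration follows; after64() is
re-implemented here for user space with the kernel's usual
signed-difference trick:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* wrap-safe "a is later than b" on 64-bit sequence numbers */
    static bool after64(uint64_t a, uint64_t b)
    {
        return (int64_t)(a - b) > 0;
    }

    int main(void)
    {
        uint64_t snd_nxt = 5000;
        uint64_t cand;

        /* normal send: dfrag at 4000, 1500 bytes handed out so far */
        cand = 4000 + 1500;
        if (after64(cand, snd_nxt))
            snd_nxt = cand;   /* advances to 5500 */

        /* failover re-send of an old dfrag: candidate lags behind */
        cand = 3000 + 500;
        if (after64(cand, snd_nxt))
            snd_nxt = cand;   /* skipped, snd_nxt stays 5500 */

        printf("snd_nxt=%llu\n", (unsigned long long)snd_nxt);
        return 0;
    }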
 void __mptcp_push_pending(struct sock *sk, unsigned int flags)
 {
         struct sock *prev_ssk = NULL, *ssk = NULL;
         struct mptcp_sock *msk = mptcp_sk(sk);
         struct mptcp_sendmsg_info info = {
                 .flags = flags,
         };
         struct mptcp_data_frag *dfrag;
--- 27 unchanged lines hidden ---

                         ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
                         if (ret <= 0) {
                                 mptcp_push_release(sk, ssk, &info);
                                 goto out;
                         }

                         info.sent += ret;
-                        dfrag->already_sent += ret;
-                        msk->snd_nxt += ret;
-                        msk->snd_burst -= ret;
-                        msk->tx_pending_data -= ret;
                         copied += ret;
                         len -= ret;
+
+                        mptcp_update_post_push(msk, dfrag, ret);
                 }
                 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
         }

         /* at this point we held the socket lock for the last subflow we used */
         if (ssk)
                 mptcp_push_release(sk, ssk, &info);

--- 36 unchanged lines hidden ---
                                 goto out;
                         }

                         ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
                         if (ret <= 0)
                                 goto out;

                         info.sent += ret;
-                        dfrag->already_sent += ret;
-                        msk->snd_nxt += ret;
-                        msk->snd_burst -= ret;
-                        msk->tx_pending_data -= ret;
                         copied += ret;
                         len -= ret;
                         first = false;
+
+                        mptcp_update_post_push(msk, dfrag, ret);
                 }
                 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
         }

 out:
         /* __mptcp_alloc_tx_skb could have released some wmem and we are
          * not going to flush it via release_sock()
          */
--- 581 unchanged lines hidden ---
         mptcp_data_lock(sk);
         __mptcp_clean_una_wakeup(sk);
         rtx_head = mptcp_rtx_head(sk);
         if (!rtx_head) {
                 mptcp_data_unlock(sk);
                 return false;
         }

-        /* will accept ack for reijected data before re-sending them */
-        if (!msk->recovery || after64(msk->snd_nxt, msk->recovery_snd_nxt))
-                msk->recovery_snd_nxt = msk->snd_nxt;
+        msk->recovery_snd_nxt = msk->snd_nxt;
         msk->recovery = true;
         mptcp_data_unlock(sk);

         msk->first_pending = rtx_head;
         msk->tx_pending_data += msk->snd_nxt - rtx_head->data_seq;
-        msk->snd_nxt = rtx_head->data_seq;
         msk->snd_burst = 0;

         /* be sure to clear the "sent status" on all re-injected fragments */
         list_for_each_entry(cur, &msk->rtx_queue, list) {
                 if (!cur->already_sent)
                         break;
                 cur->already_sent = 0;
         }
--- 1367 unchanged lines hidden ---
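Note on the recovery hunk above: when a subflow fails, everything that was
sent but is not yet acked at the MPTCP level is rewound to the retransmit
head, and the "sent status" of those fragments is cleared so the push path
re-injects them on a working subflow; recovery_snd_nxt remembers how far the
original transmission got, so acks for the re-injected data are still
accepted. A toy model of the rewind (struct dfrag here is an illustrative
reduction, not the kernel type):

    #include <stdio.h>

    struct dfrag {
        unsigned long long data_seq;
        unsigned int already_sent;
    };

    int main(void)
    {
        struct dfrag rtx_queue[] = {
            { 1000, 1460 }, { 2460, 1460 }, { 3920, 0 },
        };
        unsigned int n = sizeof(rtx_queue) / sizeof(rtx_queue[0]);
        unsigned int i;

        for (i = 0; i < n; i++) {
            if (!rtx_queue[i].already_sent)
                break; /* never sent: nothing to clear */
            rtx_queue[i].already_sent = 0;
        }

        for (i = 0; i < n; i++)
            printf("seq=%llu already_sent=%u\n",
                   rtx_queue[i].data_seq, rtx_queue[i].already_sent);
        return 0;
    }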