View | Details | Raw Unified | Return to bug 1228255
Collapse All | Expand All

(-)a/lib/isc/netmgr/netmgr-int.h (-6 / +21 lines)
Lines 62-70 Link Here
62
#endif
62
#endif
63
63
64
/*
64
/*
65
 * The TCP receive buffer can fit one maximum sized DNS message plus its size,
65
 * The TCP send and receive buffers can fit one maximum sized DNS message plus
66
 * the receive buffer here affects TCP, DoT and DoH.
66
 * its size, the receive buffer here affects TCP, DoT and DoH.
67
 */
67
 */
68
#define ISC_NETMGR_TCP_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
68
#define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
69
#define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
69
70
70
/* Pick the larger buffer */
71
/* Pick the larger buffer */
Lines 377-385 struct isc__nm_uvreq { Link Here
377
	int magic;
378
	int magic;
378
	isc_nmsocket_t *sock;
379
	isc_nmsocket_t *sock;
379
	isc_nmhandle_t *handle;
380
	isc_nmhandle_t *handle;
380
	char tcplen[2];	       /* The TCP DNS message length */
381
	char tcplen[2]; /* The TCP DNS message length */
381
	uv_buf_t uvbuf;	       /* translated isc_region_t, to be
382
	uv_buf_t uvbuf; /* translated isc_region_t, to be
382
				* sent or received */
383
			 * sent or received */
384
	isc_region_t userbuf;
383
	isc_sockaddr_t local;  /* local address */
385
	isc_sockaddr_t local;  /* local address */
384
	isc_sockaddr_t peer;   /* peer address */
386
	isc_sockaddr_t peer;   /* peer address */
385
	isc__nm_cb_t cb;       /* callback */
387
	isc__nm_cb_t cb;       /* callback */
Lines 999-1005 struct isc_nmsocket { Link Here
999
			TLS_STATE_ERROR,
1001
			TLS_STATE_ERROR,
1000
			TLS_STATE_CLOSING
1002
			TLS_STATE_CLOSING
1001
		} state;
1003
		} state;
1002
		isc_region_t senddata;
1003
		ISC_LIST(isc__nm_uvreq_t) sendreqs;
1004
		ISC_LIST(isc__nm_uvreq_t) sendreqs;
1004
		bool cycle;
1005
		bool cycle;
1005
		isc_result_t pending_error;
1006
		isc_result_t pending_error;
Lines 1064-1069 struct isc_nmsocket { Link Here
1064
	 */
1065
	 */
1065
	uint64_t write_timeout;
1066
	uint64_t write_timeout;
1066
1067
1068
	/*
1069
	 * Reading was throttled over TCP as the peer does not read the
1070
	 * data we are sending back.
1071
	 */
1072
	bool reading_throttled;
1073
1067
	/*% outer socket is for 'wrapped' sockets - e.g. tcpdns in tcp */
1074
	/*% outer socket is for 'wrapped' sockets - e.g. tcpdns in tcp */
1068
	isc_nmsocket_t *outer;
1075
	isc_nmsocket_t *outer;
1069
1076
Lines 2266-2271 isc__nmsocket_readtimeout_cb(uv_timer_t *timer); Link Here
2266
void
2273
void
2267
isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult);
2274
isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult);
2268
2275
2276
/*%<
2277
 *
2278
 * Maximum number of simultaneous handles in flight supported for a single
2279
 * connected TCPDNS socket. This value was chosen arbitrarily, and may be
2280
 * changed in the future.
2281
 */
2282
#define STREAM_CLIENTS_PER_CONN 23
2283
2269
#define UV_RUNTIME_CHECK(func, ret)                                      \
2284
#define UV_RUNTIME_CHECK(func, ret)                                      \
2270
	if (ret != 0) {                                                  \
2285
	if (ret != 0) {                                                  \
2271
		FATAL_ERROR("%s failed: %s\n", #func, uv_strerror(ret)); \
2286
		FATAL_ERROR("%s failed: %s\n", #func, uv_strerror(ret)); \
(-)a/lib/isc/netmgr/netmgr.c (-16 / +63 lines)
Lines 49-56 Link Here
49
 * How many isc_nmhandles and isc_nm_uvreqs will we be
49
 * How many isc_nmhandles and isc_nm_uvreqs will we be
50
 * caching for reuse in a socket.
50
 * caching for reuse in a socket.
51
 */
51
 */
52
#define ISC_NM_HANDLES_STACK_SIZE 600
52
#define ISC_NM_HANDLES_STACK_SIZE 16
53
#define ISC_NM_REQS_STACK_SIZE	  600
53
#define ISC_NM_REQS_STACK_SIZE	  16
54
55
/*%
56
 * Same, but for UDP sockets which tend to need larger values as they
57
 * process many requests per socket.
58
 */
59
#define ISC_NM_HANDLES_STACK_SIZE_UDP 64
60
#define ISC_NM_REQS_STACK_SIZE_UDP    64
54
61
55
/*%
62
/*%
56
 * Shortcut index arrays to get access to statistics counters.
63
 * Shortcut index arrays to get access to statistics counters.
Lines 1508-1523 void Link Here
1508
isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
1515
isc___nmsocket_init(isc_nmsocket_t *sock, isc_nm_t *mgr, isc_nmsocket_type type,
1509
		    isc_sockaddr_t *iface FLARG) {
1516
		    isc_sockaddr_t *iface FLARG) {
1510
	uint16_t family;
1517
	uint16_t family;
1518
	size_t inactive_handles_stack_size = ISC_NM_HANDLES_STACK_SIZE;
1519
	size_t inactive_reqs_stack_size = ISC_NM_REQS_STACK_SIZE;
1511
1520
1512
	REQUIRE(sock != NULL);
1521
	REQUIRE(sock != NULL);
1513
	REQUIRE(mgr != NULL);
1522
	REQUIRE(mgr != NULL);
1514
1523
1515
	*sock = (isc_nmsocket_t){ .type = type,
1524
	if (type == isc_nm_udpsocket) {
1516
				  .fd = -1,
1525
		inactive_handles_stack_size = ISC_NM_HANDLES_STACK_SIZE_UDP;
1517
				  .inactivehandles = isc_astack_new(
1526
		inactive_reqs_stack_size = ISC_NM_REQS_STACK_SIZE_UDP;
1518
					  mgr->mctx, ISC_NM_HANDLES_STACK_SIZE),
1527
	}
1519
				  .inactivereqs = isc_astack_new(
1528
1520
					  mgr->mctx, ISC_NM_REQS_STACK_SIZE) };
1529
	*sock = (isc_nmsocket_t){
1530
		.type = type,
1531
		.fd = -1,
1532
		.inactivehandles = isc_astack_new(mgr->mctx,
1533
						  inactive_handles_stack_size),
1534
		.inactivereqs = isc_astack_new(mgr->mctx,
1535
					       inactive_reqs_stack_size)
1536
	};
1521
1537
1522
	ISC_LIST_INIT(sock->tls.sendreqs);
1538
	ISC_LIST_INIT(sock->tls.sendreqs);
1523
1539
Lines 2086-2091 isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) { Link Here
2086
2102
2087
	sock = req->sock;
2103
	sock = req->sock;
2088
2104
2105
	isc__nm_start_reading(sock);
2089
	isc__nmsocket_reset(sock);
2106
	isc__nmsocket_reset(sock);
2090
}
2107
}
2091
2108
Lines 2095-2101 isc__nmsocket_readtimeout_cb(uv_timer_t *timer) { Link Here
2095
2112
2096
	REQUIRE(VALID_NMSOCK(sock));
2113
	REQUIRE(VALID_NMSOCK(sock));
2097
	REQUIRE(sock->tid == isc_nm_tid());
2114
	REQUIRE(sock->tid == isc_nm_tid());
2098
	REQUIRE(atomic_load(&sock->reading));
2099
2115
2100
	if (atomic_load(&sock->client)) {
2116
	if (atomic_load(&sock->client)) {
2101
		uv_timer_stop(timer);
2117
		uv_timer_stop(timer);
Lines 2342-2349 processbuffer(isc_nmsocket_t *sock) { Link Here
2342
 * timers. If we do have a full message, reset the timer.
2358
 * timers. If we do have a full message, reset the timer.
2343
 *
2359
 *
2344
 * Stop reading if this is a client socket, or if the server socket
2360
 * Stop reading if this is a client socket, or if the server socket
2345
 * has been set to sequential mode. In this case we'll be called again
2361
 * has been set to sequential mode, or the number of queries we are
2346
 * later by isc__nm_resume_processing().
2362
 * processing simultaneously has reached the clients-per-connection
2363
 * limit. In this case we'll be called again later by
2364
 * isc__nm_resume_processing().
2347
 */
2365
 */
2348
isc_result_t
2366
isc_result_t
2349
isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
2367
isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
Lines 2351-2364 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) { Link Here
2351
		int_fast32_t ah = atomic_load(&sock->ah);
2369
		int_fast32_t ah = atomic_load(&sock->ah);
2352
		isc_result_t result = processbuffer(sock);
2370
		isc_result_t result = processbuffer(sock);
2353
		switch (result) {
2371
		switch (result) {
2354
		case ISC_R_NOMORE:
2372
		case ISC_R_NOMORE: {
2355
			/*
2373
			/*
2356
			 * Don't reset the timer until we have a
2374
			 * Don't reset the timer until we have a
2357
			 * full DNS message.
2375
			 * full DNS message.
2358
			 */
2376
			 */
2359
			result = isc__nm_start_reading(sock);
2377
2360
			if (result != ISC_R_SUCCESS) {
2378
			/*
2361
				return (result);
2379
			 * Restart reading if we have less data in the send
2380
			 * queue than the send buffer size, this means that the
2381
			 * TCP client has started reading some data again.
2382
			 * Starting reading when we go under the limit instead
2383
			 * of waiting until all data has been
2384
			 * faster recovery (in case there was a congestion and
2385
			 * now there isn't).
2386
			 */
2387
			size_t write_queue_size =
2388
				uv_stream_get_write_queue_size(
2389
					&sock->uv_handle.stream);
2390
			if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
2391
				if (sock->reading_throttled) {
2392
					isc_log_write(isc_lctx,
2393
						      ISC_LOGCATEGORY_GENERAL,
2394
						      ISC_LOGMODULE_NETMGR,
2395
						      ISC_LOG_DEBUG(3),
2396
						      "resuming TCP "
2397
						      "connection, the other "
2398
						      "side is reading the "
2399
						      "data again (%zu)",
2400
						      write_queue_size);
2401
					sock->reading_throttled = false;
2402
				}
2403
				result = isc__nm_start_reading(sock);
2404
				if (result != ISC_R_SUCCESS) {
2405
					return (result);
2406
				}
2362
			}
2407
			}
2363
			/*
2408
			/*
2364
			 * Start the timer only if there are no externally used
2409
			 * Start the timer only if there are no externally used
Lines 2370-2375 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) { Link Here
2370
				isc__nmsocket_timer_start(sock);
2415
				isc__nmsocket_timer_start(sock);
2371
			}
2416
			}
2372
			goto done;
2417
			goto done;
2418
		}
2373
		case ISC_R_CANCELED:
2419
		case ISC_R_CANCELED:
2374
			isc__nmsocket_timer_stop(sock);
2420
			isc__nmsocket_timer_stop(sock);
2375
			isc__nm_stop_reading(sock);
2421
			isc__nm_stop_reading(sock);
Lines 2383-2389 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) { Link Here
2383
			isc__nmsocket_timer_stop(sock);
2429
			isc__nmsocket_timer_stop(sock);
2384
2430
2385
			if (atomic_load(&sock->client) ||
2431
			if (atomic_load(&sock->client) ||
2386
			    atomic_load(&sock->sequential))
2432
			    atomic_load(&sock->sequential) ||
2433
			    atomic_load(&sock->ah) >= STREAM_CLIENTS_PER_CONN)
2387
			{
2434
			{
2388
				isc__nm_stop_reading(sock);
2435
				isc__nm_stop_reading(sock);
2389
				goto done;
2436
				goto done;
(-)a/lib/isc/netmgr/tcp.c (-2 / +69 lines)
Lines 766-772 isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0) { Link Here
766
	isc__netievent_tcpstartread_t *ievent =
766
	isc__netievent_tcpstartread_t *ievent =
767
		(isc__netievent_tcpstartread_t *)ev0;
767
		(isc__netievent_tcpstartread_t *)ev0;
768
	isc_nmsocket_t *sock = ievent->sock;
768
	isc_nmsocket_t *sock = ievent->sock;
769
	isc_result_t result;
769
	isc_result_t result = ISC_R_SUCCESS;
770
770
771
	REQUIRE(VALID_NMSOCK(sock));
771
	REQUIRE(VALID_NMSOCK(sock));
772
	REQUIRE(sock->tid == isc_nm_tid());
772
	REQUIRE(sock->tid == isc_nm_tid());
Lines 774-780 isc__nm_async_tcpstartread(isc__networker_t *worker, isc__netievent_t *ev0) { Link Here
774
774
775
	if (isc__nmsocket_closing(sock)) {
775
	if (isc__nmsocket_closing(sock)) {
776
		result = ISC_R_CANCELED;
776
		result = ISC_R_CANCELED;
777
	} else {
777
	} else if (!sock->reading_throttled) {
778
		result = isc__nm_start_reading(sock);
778
		result = isc__nm_start_reading(sock);
779
	}
779
	}
780
780
Lines 905-910 isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { Link Here
905
905
906
	/* The readcb could have paused the reading */
906
	/* The readcb could have paused the reading */
907
	if (atomic_load(&sock->reading)) {
907
	if (atomic_load(&sock->reading)) {
908
		if (!sock->client) {
909
			/*
910
			 * Stop reading if we have accumulated enough bytes in
911
			 * the send queue; this means that the TCP client is not
912
			 * reading back the data we are sending to it, and there's
913
			 * no reason to continue processing more incoming DNS
914
			 * messages, if the client is not reading back the
915
			 * responses.
916
			 */
917
			size_t write_queue_size =
918
				uv_stream_get_write_queue_size(
919
					&sock->uv_handle.stream);
920
921
			if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
922
				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
923
					      ISC_LOGMODULE_NETMGR,
924
					      ISC_LOG_DEBUG(3),
925
					      "throttling TCP connection, "
926
					      "the other side is "
927
					      "not reading the data (%zu)",
928
					      write_queue_size);
929
				sock->reading_throttled = true;
930
				isc__nm_stop_reading(sock);
931
			}
932
		}
933
908
		/* The timer will be updated */
934
		/* The timer will be updated */
909
		isc__nmsocket_timer_restart(sock);
935
		isc__nmsocket_timer_restart(sock);
910
	}
936
	}
Lines 1095-1100 isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region, Link Here
1095
	return;
1121
	return;
1096
}
1122
}
1097
1123
1124
static void
1125
tcp_maybe_restart_reading(isc_nmsocket_t *sock) {
1126
	if (!sock->client && sock->reading_throttled &&
1127
	    !uv_is_active(&sock->uv_handle.handle))
1128
	{
1129
		/*
1130
		 * Restart reading if we have less data in the send queue than
1131
		 * the send buffer size, this means that the TCP client has
1132
		 * started reading some data again.  Starting reading when we go
1133
		 * under the limit instead of waiting until all data has been
1134
		 * flushed allows faster recovery (in case there was a
1135
		 * congestion and now there isn't).
1136
		 */
1137
		size_t write_queue_size =
1138
			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
1139
		if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
1140
			isc_log_write(
1141
				isc_lctx, ISC_LOGCATEGORY_GENERAL,
1142
				ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
1143
				"resuming TCP connection, the other side  "
1144
				"is reading the data again (%zu)",
1145
				write_queue_size);
1146
			sock->reading_throttled = false;
1147
			isc__nm_start_reading(sock);
1148
		}
1149
	}
1150
}
1151
1098
static void
1152
static void
1099
tcp_send_cb(uv_write_t *req, int status) {
1153
tcp_send_cb(uv_write_t *req, int status) {
1100
	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
1154
	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
Lines 1112-1121 tcp_send_cb(uv_write_t *req, int status) { Link Here
1112
		isc__nm_incstats(sock, STATID_SENDFAIL);
1166
		isc__nm_incstats(sock, STATID_SENDFAIL);
1113
		isc__nm_failed_send_cb(sock, uvreq,
1167
		isc__nm_failed_send_cb(sock, uvreq,
1114
				       isc__nm_uverr2result(status));
1168
				       isc__nm_uverr2result(status));
1169
1170
		if (!sock->client &&
1171
		    (atomic_load(&sock->reading) || sock->reading_throttled))
1172
		{
1173
			/*
1174
			 * As we are resuming reading, it is not throttled
1175
			 * anymore (technically).
1176
			 */
1177
			sock->reading_throttled = false;
1178
			isc__nm_start_reading(sock);
1179
			isc__nmsocket_reset(sock);
1180
		}
1115
		return;
1181
		return;
1116
	}
1182
	}
1117
1183
1118
	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
1184
	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
1185
	tcp_maybe_restart_reading(sock);
1119
}
1186
}
1120
1187
1121
/*
1188
/*
(-)a/lib/isc/netmgr/tcpdns.c (-2 / +57 lines)
Lines 734-740 isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0) { Link Here
734
	isc__netievent_tcpdnsread_t *ievent =
734
	isc__netievent_tcpdnsread_t *ievent =
735
		(isc__netievent_tcpdnsread_t *)ev0;
735
		(isc__netievent_tcpdnsread_t *)ev0;
736
	isc_nmsocket_t *sock = ievent->sock;
736
	isc_nmsocket_t *sock = ievent->sock;
737
	isc_result_t result;
737
	isc_result_t result = ISC_R_SUCCESS;
738
738
739
	UNUSED(worker);
739
	UNUSED(worker);
740
740
Lines 743-749 isc__nm_async_tcpdnsread(isc__networker_t *worker, isc__netievent_t *ev0) { Link Here
743
743
744
	if (isc__nmsocket_closing(sock)) {
744
	if (isc__nmsocket_closing(sock)) {
745
		result = ISC_R_CANCELED;
745
		result = ISC_R_CANCELED;
746
	} else {
746
	} else if (!sock->reading_throttled) {
747
		result = isc__nm_process_sock_buffer(sock);
747
		result = isc__nm_process_sock_buffer(sock);
748
	}
748
	}
749
749
Lines 913-918 isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread, Link Here
913
	result = isc__nm_process_sock_buffer(sock);
913
	result = isc__nm_process_sock_buffer(sock);
914
	if (result != ISC_R_SUCCESS) {
914
	if (result != ISC_R_SUCCESS) {
915
		isc__nm_failed_read_cb(sock, result, true);
915
		isc__nm_failed_read_cb(sock, result, true);
916
	} else if (!sock->client) {
917
		/*
918
		 * Stop reading if we have accumulated enough bytes in
919
		 * the send queue; this means that the TCP client is not
920
		 * reading back the data we are sending to it, and there's
921
		 * no reason to continue processing more incoming DNS
922
		 * messages, if the client is not reading back the
923
		 * responses.
924
		 */
925
		size_t write_queue_size =
926
			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
927
928
		if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
929
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
930
				      ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
931
				      "throttling TCP connection, "
932
				      "the other side is "
933
				      "not reading the data (%zu)",
934
				      write_queue_size);
935
			sock->reading_throttled = true;
936
			isc__nm_stop_reading(sock);
937
		}
916
	}
938
	}
917
free:
939
free:
918
	if (nread < 0) {
940
	if (nread < 0) {
Lines 1135-1140 isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region, Link Here
1135
	return;
1157
	return;
1136
}
1158
}
1137
1159
1160
static void
1161
tcpdns_maybe_restart_reading(isc_nmsocket_t *sock) {
1162
	if (!sock->client && sock->reading_throttled &&
1163
	    !uv_is_active(&sock->uv_handle.handle))
1164
	{
1165
		isc_result_t result = isc__nm_process_sock_buffer(sock);
1166
		if (result != ISC_R_SUCCESS) {
1167
			atomic_store(&sock->reading, true);
1168
			isc__nm_failed_read_cb(sock, result, false);
1169
		}
1170
	}
1171
}
1172
1138
static void
1173
static void
1139
tcpdns_send_cb(uv_write_t *req, int status) {
1174
tcpdns_send_cb(uv_write_t *req, int status) {
1140
	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
1175
	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
Lines 1152-1161 tcpdns_send_cb(uv_write_t *req, int status) { Link Here
1152
		isc__nm_incstats(sock, STATID_SENDFAIL);
1187
		isc__nm_incstats(sock, STATID_SENDFAIL);
1153
		isc__nm_failed_send_cb(sock, uvreq,
1188
		isc__nm_failed_send_cb(sock, uvreq,
1154
				       isc__nm_uverr2result(status));
1189
				       isc__nm_uverr2result(status));
1190
1191
		if (!sock->client &&
1192
		    (atomic_load(&sock->reading) || sock->reading_throttled))
1193
		{
1194
			/*
1195
			 * As we are resuming reading, it is not throttled
1196
			 * anymore (technically).
1197
			 */
1198
			sock->reading_throttled = false;
1199
			isc__nm_start_reading(sock);
1200
			isc__nmsocket_reset(sock);
1201
		}
1155
		return;
1202
		return;
1156
	}
1203
	}
1157
1204
1158
	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
1205
	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
1206
	tcpdns_maybe_restart_reading(sock);
1159
}
1207
}
1160
1208
1161
/*
1209
/*
Lines 1221-1226 isc__nm_async_tcpdnssend(isc__networker_t *worker, isc__netievent_t *ev0) { Link Here
1221
		goto fail;
1269
		goto fail;
1222
	}
1270
	}
1223
1271
1272
	isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
1273
		      ISC_LOG_DEBUG(3),
1274
		      "throttling TCP connection, the other side is not "
1275
		      "reading the data, switching to uv_write()");
1276
	sock->reading_throttled = true;
1277
	isc__nm_stop_reading(sock);
1278
1224
	r = uv_write(&uvreq->uv_req.write, &sock->uv_handle.stream, bufs, nbufs,
1279
	r = uv_write(&uvreq->uv_req.write, &sock->uv_handle.stream, bufs, nbufs,
1225
		     tcpdns_send_cb);
1280
		     tcpdns_send_cb);
1226
	if (r < 0) {
1281
	if (r < 0) {
(-)a/lib/isc/netmgr/tlsdns.c (-30 / +90 lines)
Lines 88-93 tlsdns_set_tls_shutdown(isc_tls_t *tls) { Link Here
88
	(void)SSL_set_shutdown(tls, SSL_SENT_SHUTDOWN);
88
	(void)SSL_set_shutdown(tls, SSL_SENT_SHUTDOWN);
89
}
89
}
90
90
91
static void
92
tlsdns_maybe_restart_reading(isc_nmsocket_t *sock);
93
91
static bool
94
static bool
92
peer_verification_has_failed(isc_nmsocket_t *sock) {
95
peer_verification_has_failed(isc_nmsocket_t *sock) {
93
	if (sock->tls.tls != NULL && sock->tls.state == TLS_STATE_HANDSHAKE &&
96
	if (sock->tls.tls != NULL && sock->tls.state == TLS_STATE_HANDSHAKE &&
Lines 1084-1089 tls_cycle_input(isc_nmsocket_t *sock) { Link Here
1084
		size_t len;
1087
		size_t len;
1085
1088
1086
		for (;;) {
1089
		for (;;) {
1090
			/*
1091
			 * There is a similar branch in
1092
			 * isc__nm_process_sock_buffer() which is sufficient to
1093
			 * stop excessive processing in TCP. However, as we wrap
1094
			 * this call in a loop, we need to have it here in order
1095
			 * to limit the number of loop iterations (and,
1096
			 * consequently, the number of messages processed).
1097
			 */
1098
			if (atomic_load(&sock->ah) >= STREAM_CLIENTS_PER_CONN) {
1099
				isc__nm_stop_reading(sock);
1100
				break;
1101
			}
1102
1087
			(void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0);
1103
			(void)SSL_peek(sock->tls.tls, &(char){ '\0' }, 0);
1088
1104
1089
			int pending = SSL_pending(sock->tls.tls);
1105
			int pending = SSL_pending(sock->tls.tls);
Lines 1261-1277 call_pending_send_callbacks(isc_nmsocket_t *sock, const isc_result_t result) { Link Here
1261
}
1277
}
1262
1278
1263
static void
1279
static void
1264
free_senddata(isc_nmsocket_t *sock, const isc_result_t result) {
1280
free_senddata(isc_nmsocket_t *sock, isc__nm_uvreq_t *req,
1281
	      const isc_result_t result) {
1265
	REQUIRE(VALID_NMSOCK(sock));
1282
	REQUIRE(VALID_NMSOCK(sock));
1266
	REQUIRE(sock->tls.senddata.base != NULL);
1283
	REQUIRE(req != NULL && req->userbuf.base != NULL &&
1267
	REQUIRE(sock->tls.senddata.length > 0);
1284
		req->userbuf.length > 0);
1268
1285
1269
	isc_mem_put(sock->mgr->mctx, sock->tls.senddata.base,
1286
	isc_mem_put(sock->mgr->mctx, req->userbuf.base, req->userbuf.length);
1270
		    sock->tls.senddata.length);
1271
	sock->tls.senddata.base = NULL;
1272
	sock->tls.senddata.length = 0;
1273
1287
1274
	call_pending_send_callbacks(sock, result);
1288
	call_pending_send_callbacks(sock, result);
1289
1290
	isc__nm_uvreq_put(&req, sock);
1275
}
1291
}
1276
1292
1277
static void
1293
static void
Lines 1284-1294 tls_write_cb(uv_write_t *req, int status) { Link Here
1284
	isc_nm_timer_stop(uvreq->timer);
1300
	isc_nm_timer_stop(uvreq->timer);
1285
	isc_nm_timer_detach(&uvreq->timer);
1301
	isc_nm_timer_detach(&uvreq->timer);
1286
1302
1287
	free_senddata(sock, result);
1303
	free_senddata(sock, uvreq, result);
1288
1289
	isc__nm_uvreq_put(&uvreq, sock);
1290
1304
1291
	if (status != 0) {
1305
	if (status != 0) {
1306
		if (!sock->client &&
1307
		    (atomic_load(&sock->reading) || sock->reading_throttled))
1308
		{
1309
			/*
1310
			 * As we are resuming reading, it is not throttled
1311
			 * anymore (technically).
1312
			 */
1313
			sock->reading_throttled = false;
1314
			isc__nm_start_reading(sock);
1315
		}
1292
		tls_error(sock, result);
1316
		tls_error(sock, result);
1293
		return;
1317
		return;
1294
	}
1318
	}
Lines 1298-1303 tls_write_cb(uv_write_t *req, int status) { Link Here
1298
		tls_error(sock, result);
1322
		tls_error(sock, result);
1299
		return;
1323
		return;
1300
	}
1324
	}
1325
1326
	tlsdns_maybe_restart_reading(sock);
1301
}
1327
}
1302
1328
1303
static isc_result_t
1329
static isc_result_t
Lines 1311-1333 tls_cycle_output(isc_nmsocket_t *sock) { Link Here
1311
		int rv;
1337
		int rv;
1312
		int r;
1338
		int r;
1313
1339
1314
		if (sock->tls.senddata.base != NULL ||
1315
		    sock->tls.senddata.length > 0)
1316
		{
1317
			break;
1318
		}
1319
1320
		if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) {
1340
		if (pending > (int)ISC_NETMGR_TCP_RECVBUF_SIZE) {
1321
			pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE;
1341
			pending = (int)ISC_NETMGR_TCP_RECVBUF_SIZE;
1322
		}
1342
		}
1323
1343
1324
		sock->tls.senddata.base = isc_mem_get(sock->mgr->mctx, pending);
1325
		sock->tls.senddata.length = pending;
1326
1327
		/* It's a bit misnomer here, but it does the right thing */
1344
		/* It's a bit misnomer here, but it does the right thing */
1328
		req = isc__nm_get_read_req(sock, NULL);
1345
		req = isc__nm_get_read_req(sock, NULL);
1329
		req->uvbuf.base = (char *)sock->tls.senddata.base;
1346
1330
		req->uvbuf.len = sock->tls.senddata.length;
1347
		req->userbuf.base = isc_mem_get(sock->mgr->mctx, pending);
1348
		req->userbuf.length = (size_t)pending;
1349
1350
		req->uvbuf.base = (char *)req->userbuf.base;
1351
		req->uvbuf.len = (size_t)req->userbuf.length;
1331
1352
1332
		rv = BIO_read_ex(sock->tls.app_rbio, req->uvbuf.base,
1353
		rv = BIO_read_ex(sock->tls.app_rbio, req->uvbuf.base,
1333
				 req->uvbuf.len, &bytes);
1354
				 req->uvbuf.len, &bytes);
Lines 1339-1370 tls_cycle_output(isc_nmsocket_t *sock) { Link Here
1339
1360
1340
		if (r == pending) {
1361
		if (r == pending) {
1341
			/* Wrote everything, restart */
1362
			/* Wrote everything, restart */
1342
			isc__nm_uvreq_put(&req, sock);
1363
			free_senddata(sock, req, ISC_R_SUCCESS);
1343
			free_senddata(sock, ISC_R_SUCCESS);
1344
			continue;
1364
			continue;
1345
		}
1365
		}
1346
1366
1347
		if (r > 0) {
1367
		if (r > 0) {
1348
			/* Partial write, send rest asynchronously */
1368
			/* Partial write, send rest asynchronously */
1349
			memmove(req->uvbuf.base, req->uvbuf.base + r,
1369
			req->uvbuf.base += r;
1350
				req->uvbuf.len - r);
1370
			req->uvbuf.len -= r;
1351
			req->uvbuf.len = req->uvbuf.len - r;
1352
		} else if (r == UV_ENOSYS || r == UV_EAGAIN) {
1371
		} else if (r == UV_ENOSYS || r == UV_EAGAIN) {
1353
			/* uv_try_write is not supported, send
1372
			/* uv_try_write is not supported, send
1354
			 * asynchronously */
1373
			 * asynchronously */
1355
		} else {
1374
		} else {
1356
			result = isc__nm_uverr2result(r);
1375
			result = isc__nm_uverr2result(r);
1357
			isc__nm_uvreq_put(&req, sock);
1376
			free_senddata(sock, req, result);
1358
			free_senddata(sock, result);
1359
			break;
1377
			break;
1360
		}
1378
		}
1361
1379
1380
		isc_log_write(
1381
			isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_NETMGR,
1382
			ISC_LOG_DEBUG(3),
1383
			"throttling TCP connection, the other side is not "
1384
			"reading the data, switching to uv_write()");
1385
		sock->reading_throttled = true;
1386
		isc__nm_stop_reading(sock);
1387
1362
		r = uv_write(&req->uv_req.write, &sock->uv_handle.stream,
1388
		r = uv_write(&req->uv_req.write, &sock->uv_handle.stream,
1363
			     &req->uvbuf, 1, tls_write_cb);
1389
			     &req->uvbuf, 1, tls_write_cb);
1364
		if (r < 0) {
1390
		if (r < 0) {
1365
			result = isc__nm_uverr2result(r);
1391
			result = isc__nm_uverr2result(r);
1366
			isc__nm_uvreq_put(&req, sock);
1392
			free_senddata(sock, req, result);
1367
			free_senddata(sock, result);
1368
			break;
1393
			break;
1369
		}
1394
		}
1370
1395
Lines 1533-1538 isc__nm_tlsdns_read_cb(uv_stream_t *stream, ssize_t nread, Link Here
1533
	result = tls_cycle(sock);
1558
	result = tls_cycle(sock);
1534
	if (result != ISC_R_SUCCESS) {
1559
	if (result != ISC_R_SUCCESS) {
1535
		isc__nm_failed_read_cb(sock, result, true);
1560
		isc__nm_failed_read_cb(sock, result, true);
1561
	} else if (!sock->client) {
1562
		/*
1563
		 * Stop reading if we have accumulated enough bytes in
1564
		 * the send queue; this means that the TCP client is not
1565
		 * reading back the data we are sending to it, and there's
1566
		 * no reason to continue processing more incoming DNS
1567
		 * messages, if the client is not reading back the
1568
		 * responses.
1569
		 */
1570
		size_t write_queue_size =
1571
			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
1572
1573
		if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
1574
			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
1575
				      ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
1576
				      "throttling TCP connection, "
1577
				      "the other side is "
1578
				      "not reading the data (%zu)",
1579
				      write_queue_size);
1580
			sock->reading_throttled = true;
1581
			isc__nm_stop_reading(sock);
1582
		}
1536
	}
1583
	}
1537
free:
1584
free:
1538
	async_tlsdns_cycle(sock);
1585
	async_tlsdns_cycle(sock);
Lines 1776-1781 isc__nm_tlsdns_send(isc_nmhandle_t *handle, isc_region_t *region, Link Here
1776
	return;
1823
	return;
1777
}
1824
}
1778
1825
1826
static void
1827
tlsdns_maybe_restart_reading(isc_nmsocket_t *sock) {
1828
	if (!sock->client && sock->reading_throttled &&
1829
	    !uv_is_active(&sock->uv_handle.handle))
1830
	{
1831
		isc_result_t result = isc__nm_process_sock_buffer(sock);
1832
		if (result != ISC_R_SUCCESS) {
1833
			atomic_store(&sock->reading, true);
1834
			isc__nm_failed_read_cb(sock, result, false);
1835
		}
1836
	}
1837
}
1838
1779
/*
1839
/*
1780
 * Handle 'tcpsend' async event - send a packet on the socket
1840
 * Handle 'tcpsend' async event - send a packet on the socket
1781
 */
1841
 */
(-)a/lib/ns/client.c (-81 / +75 lines)
Lines 101-106 Link Here
101
#define COOKIE_SIZE 24U /* 8 + 4 + 4 + 8 */
101
#define COOKIE_SIZE 24U /* 8 + 4 + 4 + 8 */
102
#define ECS_SIZE    20U /* 2 + 1 + 1 + [0..16] */
102
#define ECS_SIZE    20U /* 2 + 1 + 1 + [0..16] */
103
103
104
#define TCPBUFFERS_FILLCOUNT 1U
105
#define TCPBUFFERS_FREEMAX   8U
106
104
#define WANTNSID(x)	(((x)->attributes & NS_CLIENTATTR_WANTNSID) != 0)
107
#define WANTNSID(x)	(((x)->attributes & NS_CLIENTATTR_WANTNSID) != 0)
105
#define WANTEXPIRE(x)	(((x)->attributes & NS_CLIENTATTR_WANTEXPIRE) != 0)
108
#define WANTEXPIRE(x)	(((x)->attributes & NS_CLIENTATTR_WANTEXPIRE) != 0)
106
#define WANTPAD(x)	(((x)->attributes & NS_CLIENTATTR_WANTPAD) != 0)
109
#define WANTPAD(x)	(((x)->attributes & NS_CLIENTATTR_WANTPAD) != 0)
Lines 330-341 client_senddone(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { Link Here
330
				      NS_LOGMODULE_CLIENT, ISC_LOG_DEBUG(3),
333
				      NS_LOGMODULE_CLIENT, ISC_LOG_DEBUG(3),
331
				      "send failed: %s",
334
				      "send failed: %s",
332
				      isc_result_totext(result));
335
				      isc_result_totext(result));
336
			isc_nm_bad_request(handle);
333
		}
337
		}
334
	}
338
	}
335
339
336
	isc_nmhandle_detach(&handle);
340
	isc_nmhandle_detach(&handle);
337
}
341
}
338
342
343
static void
344
client_setup_tcp_buffer(ns_client_t *client) {
345
	REQUIRE(client->tcpbuf == NULL);
346
347
	client->tcpbuf = client->manager->tcp_buffer;
348
	client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
349
}
350
351
static void
352
client_put_tcp_buffer(ns_client_t *client) {
353
	if (client->tcpbuf == NULL) {
354
		return;
355
	}
356
357
	if (client->tcpbuf != client->manager->tcp_buffer) {
358
		isc_mem_put(client->manager->mctx, client->tcpbuf,
359
			    client->tcpbuf_size);
360
	}
361
362
	client->tcpbuf = NULL;
363
	client->tcpbuf_size = 0;
364
}
365
339
static void
366
static void
340
client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
367
client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
341
		    unsigned char **datap) {
368
		    unsigned char **datap) {
Lines 345-356 client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer, Link Here
345
	REQUIRE(datap != NULL);
372
	REQUIRE(datap != NULL);
346
373
347
	if (TCP_CLIENT(client)) {
374
	if (TCP_CLIENT(client)) {
348
		INSIST(client->tcpbuf == NULL);
375
		client_setup_tcp_buffer(client);
349
		client->tcpbuf = isc_mem_get(client->manager->send_mctx,
350
					     NS_CLIENT_TCP_BUFFER_SIZE);
351
		client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
352
		data = client->tcpbuf;
376
		data = client->tcpbuf;
353
		isc_buffer_init(buffer, data, NS_CLIENT_TCP_BUFFER_SIZE);
377
		isc_buffer_init(buffer, data, client->tcpbuf_size);
354
	} else {
378
	} else {
355
		data = client->sendbuf;
379
		data = client->sendbuf;
356
		if ((client->attributes & NS_CLIENTATTR_HAVECOOKIE) == 0) {
380
		if ((client->attributes & NS_CLIENTATTR_HAVECOOKIE) == 0) {
Lines 383-393 client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) { Link Here
383
407
384
	if (isc_buffer_base(buffer) == client->tcpbuf) {
408
	if (isc_buffer_base(buffer) == client->tcpbuf) {
385
		size_t used = isc_buffer_usedlength(buffer);
409
		size_t used = isc_buffer_usedlength(buffer);
386
		client->tcpbuf = isc_mem_reget(client->manager->send_mctx,
410
		INSIST(client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE);
387
					       client->tcpbuf,
411
388
					       client->tcpbuf_size, used);
412
		/*
389
		client->tcpbuf_size = used;
413
		 * Copy the data into a smaller buffer before sending,
390
		r.base = client->tcpbuf;
414
		 * and keep the original big TCP send buffer for reuse
415
		 * by other clients.
416
		 */
417
		if (used > NS_CLIENT_SEND_BUFFER_SIZE) {
418
			/*
419
			 * We can save space by allocating a new buffer with a
420
			 * correct size and freeing the big buffer.
421
			 */
422
			unsigned char *new_tcpbuf =
423
				isc_mem_get(client->manager->mctx, used);
424
			memmove(new_tcpbuf, buffer->base, used);
425
426
			/*
427
			 * Put the big buffer so we can replace the pointer
428
			 * and the size with the new ones.
429
			 */
430
			client_put_tcp_buffer(client);
431
432
			/*
433
			 * Keep the new buffer's information so it can be freed.
434
			 */
435
			client->tcpbuf = new_tcpbuf;
436
			client->tcpbuf_size = used;
437
438
			r.base = new_tcpbuf;
439
		} else {
440
			/*
441
			 * The data fits in the available space in
442
			 * 'sendbuf', there is no need for a new buffer.
443
			 */
444
			memmove(client->sendbuf, buffer->base, used);
445
446
			/*
447
			 * Put the big buffer, we don't need a dynamic buffer.
448
			 */
449
			client_put_tcp_buffer(client);
450
451
			r.base = client->sendbuf;
452
		}
391
		r.length = used;
453
		r.length = used;
392
	} else {
454
	} else {
393
		isc_buffer_usedregion(buffer, &r);
455
		isc_buffer_usedregion(buffer, &r);
Lines 461-468 ns_client_sendraw(ns_client_t *client, dns_message_t *message) { Link Here
461
	return;
523
	return;
462
done:
524
done:
463
	if (client->tcpbuf != NULL) {
525
	if (client->tcpbuf != NULL) {
464
		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
526
		client_put_tcp_buffer(client);
465
			    client->tcpbuf_size);
466
	}
527
	}
467
528
468
	ns_client_drop(client, result);
529
	ns_client_drop(client, result);
Lines 746-753 renderend: Link Here
746
807
747
cleanup:
808
cleanup:
748
	if (client->tcpbuf != NULL) {
809
	if (client->tcpbuf != NULL) {
749
		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
810
		client_put_tcp_buffer(client);
750
			    client->tcpbuf_size);
751
	}
811
	}
752
812
753
	if (cleanup_cctx) {
813
	if (cleanup_cctx) {
Lines 1629-1636 ns__client_reset_cb(void *client0) { Link Here
1629
1689
1630
	ns_client_endrequest(client);
1690
	ns_client_endrequest(client);
1631
	if (client->tcpbuf != NULL) {
1691
	if (client->tcpbuf != NULL) {
1632
		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
1692
		client_put_tcp_buffer(client);
1633
			    client->tcpbuf_size);
1634
	}
1693
	}
1635
1694
1636
	if (client->keytag != NULL) {
1695
	if (client->keytag != NULL) {
Lines 1661-1668 ns__client_put_cb(void *client0) { Link Here
1661
	client->magic = 0;
1720
	client->magic = 0;
1662
	client->shuttingdown = true;
1721
	client->shuttingdown = true;
1663
1722
1664
	isc_mem_put(client->manager->send_mctx, client->sendbuf,
1665
		    NS_CLIENT_SEND_BUFFER_SIZE);
1666
	if (client->opt != NULL) {
1723
	if (client->opt != NULL) {
1667
		INSIST(dns_rdataset_isassociated(client->opt));
1724
		INSIST(dns_rdataset_isassociated(client->opt));
1668
		dns_rdataset_disassociate(client->opt);
1725
		dns_rdataset_disassociate(client->opt);
Lines 2339-2346 ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { Link Here
2339
		dns_message_create(client->mctx, DNS_MESSAGE_INTENTPARSE,
2396
		dns_message_create(client->mctx, DNS_MESSAGE_INTENTPARSE,
2340
				   &client->message);
2397
				   &client->message);
2341
2398
2342
		client->sendbuf = isc_mem_get(client->manager->send_mctx,
2343
					      NS_CLIENT_SEND_BUFFER_SIZE);
2344
		/*
2399
		/*
2345
		 * Set magic earlier than usual because ns_query_init()
2400
		 * Set magic earlier than usual because ns_query_init()
2346
		 * and the functions it calls will require it.
2401
		 * and the functions it calls will require it.
Lines 2357-2363 ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { Link Here
2357
		ns_clientmgr_t *oldmgr = client->manager;
2412
		ns_clientmgr_t *oldmgr = client->manager;
2358
		ns_server_t *sctx = client->sctx;
2413
		ns_server_t *sctx = client->sctx;
2359
		isc_task_t *task = client->task;
2414
		isc_task_t *task = client->task;
2360
		unsigned char *sendbuf = client->sendbuf;
2361
		dns_message_t *message = client->message;
2415
		dns_message_t *message = client->message;
2362
		isc_mem_t *oldmctx = client->mctx;
2416
		isc_mem_t *oldmctx = client->mctx;
2363
		ns_query_t query = client->query;
2417
		ns_query_t query = client->query;
Lines 2372-2378 ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { Link Here
2372
					 .manager = oldmgr,
2426
					 .manager = oldmgr,
2373
					 .sctx = sctx,
2427
					 .sctx = sctx,
2374
					 .task = task,
2428
					 .task = task,
2375
					 .sendbuf = sendbuf,
2376
					 .message = message,
2429
					 .message = message,
2377
					 .query = query,
2430
					 .query = query,
2378
					 .tid = tid };
2431
					 .tid = tid };
Lines 2397-2404 ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { Link Here
2397
	return (ISC_R_SUCCESS);
2450
	return (ISC_R_SUCCESS);
2398
2451
2399
cleanup:
2452
cleanup:
2400
	isc_mem_put(client->manager->send_mctx, client->sendbuf,
2401
		    NS_CLIENT_SEND_BUFFER_SIZE);
2402
	dns_message_detach(&client->message);
2453
	dns_message_detach(&client->message);
2403
	isc_task_detach(&client->task);
2454
	isc_task_detach(&client->task);
2404
	ns_clientmgr_detach(&client->manager);
2455
	ns_clientmgr_detach(&client->manager);
Lines 2461-2468 clientmgr_destroy(ns_clientmgr_t *manager) { Link Here
2461
	isc_task_detach(&manager->task);
2512
	isc_task_detach(&manager->task);
2462
	ns_server_detach(&manager->sctx);
2513
	ns_server_detach(&manager->sctx);
2463
2514
2464
	isc_mem_detach(&manager->send_mctx);
2465
2466
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
2515
	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
2467
}
2516
}
2468
2517
Lines 2499-2559 ns_clientmgr_create(ns_server_t *sctx, isc_taskmgr_t *taskmgr, Link Here
2499
2548
2500
	ISC_LIST_INIT(manager->recursing);
2549
	ISC_LIST_INIT(manager->recursing);
2501
2550
2502
	/*
2503
	 * We create specialised per-worker memory context specifically
2504
	 * dedicated and tuned for allocating send buffers as it is a very
2505
	 * common operation. Not doing so may result in excessive memory
2506
	 * use in certain workloads.
2507
	 *
2508
	 * Please see this thread for more details:
2509
	 *
2510
	 * https://github.com/jemalloc/jemalloc/issues/2483
2511
	 *
2512
	 * In particular, this information from the jemalloc developers is
2513
	 * of the most interest:
2514
	 *
2515
	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1639019699
2516
	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1698173849
2517
	 *
2518
	 * In essence, we use the following memory management strategy:
2519
	 *
2520
	 * 1. We use a per-worker memory arena for send buffers memory
2521
	 * allocation to reduce lock contention (In reality, we create a
2522
	 * per-client manager arena, but we have one client manager per
2523
	 * worker).
2524
	 *
2525
	 * 2. The automatically created arenas settings remain unchanged
2526
	 * and may be controlled by users (e.g. by setting the
2527
	 * "MALLOC_CONF" variable).
2528
	 *
2529
	 * 3. We attune the arenas to not use dirty pages cache as the
2530
	 * cache would have a poor reuse rate, and that is known to
2531
	 * significantly contribute to excessive memory use.
2532
	 *
2533
	 * 4. There is no strict need for the dirty cache, as there is a
2534
	 * per arena bin for each allocation size, so because we initially
2535
	 * allocate strictly 64K per send buffer (enough for a DNS
2536
	 * message), allocations would get directed to one bin (an "object
2537
	 * pool" or a "slab") maintained within an arena. That is, there
2538
	 * is an object pool already, specifically to optimise for the
2539
	 * case of frequent allocations of objects of the given size. The
2540
	 * object pool should suffice our needs, as we will end up
2541
	 * recycling the objects from there without the need to back it by
2542
	 * an additional layer of dirty pages cache. The dirty pages cache
2543
	 * would have worked better in the case when there are more
2544
	 * allocation bins involved due to a higher reuse rate (the case
2545
	 * of a more "generic" memory management).
2546
	 */
2547
	isc_mem_create_arena(&manager->send_mctx);
2548
	isc_mem_setname(manager->send_mctx, "sendbufs");
2549
	(void)isc_mem_arena_set_dirty_decay_ms(manager->send_mctx, 0);
2550
	/*
2551
	 * Disable muzzy pages cache too, as versions < 5.2.0 have it
2552
	 * enabled by default. The muzzy pages cache goes right below the
2553
	 * dirty pages cache and backs it.
2554
	 */
2555
	(void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0);
2556
2557
	manager->magic = MANAGER_MAGIC;
2551
	manager->magic = MANAGER_MAGIC;
2558
2552
2559
	MTRACE("create");
2553
	MTRACE("create");
(-)a/lib/ns/include/ns/client.h (-2 / +4 lines)
Lines 144-150 struct ns_clientmgr { Link Here
144
	unsigned int magic;
144
	unsigned int magic;
145
145
146
	isc_mem_t      *mctx;
146
	isc_mem_t      *mctx;
147
	isc_mem_t      *send_mctx;
148
	ns_server_t    *sctx;
147
	ns_server_t    *sctx;
149
	isc_taskmgr_t  *taskmgr;
148
	isc_taskmgr_t  *taskmgr;
150
	isc_timermgr_t *timermgr;
149
	isc_timermgr_t *timermgr;
Lines 159-164 struct ns_clientmgr { Link Here
159
	/* Lock covers the recursing list */
158
	/* Lock covers the recursing list */
160
	isc_mutex_t   reclock;
159
	isc_mutex_t   reclock;
161
	client_list_t recursing; /*%< Recursing clients */
160
	client_list_t recursing; /*%< Recursing clients */
161
162
	uint8_t tcp_buffer[NS_CLIENT_TCP_BUFFER_SIZE];
162
};
163
};
163
164
164
/*% nameserver client structure */
165
/*% nameserver client structure */
Lines 187-193 struct ns_client { Link Here
187
	unsigned char  *tcpbuf;
188
	unsigned char  *tcpbuf;
188
	size_t		tcpbuf_size;
189
	size_t		tcpbuf_size;
189
	dns_message_t  *message;
190
	dns_message_t  *message;
190
	unsigned char  *sendbuf;
191
	dns_rdataset_t *opt;
191
	dns_rdataset_t *opt;
192
	dns_ednsopt_t  *ede;
192
	dns_ednsopt_t  *ede;
193
	uint16_t	udpsize;
193
	uint16_t	udpsize;
Lines 240-245 struct ns_client { Link Here
240
	 * bits will be used as the rcode in the response message.
240
	 * bits will be used as the rcode in the response message.
241
	 */
241
	 */
242
	int32_t rcode_override;
242
	int32_t rcode_override;
243
244
	uint8_t sendbuf[NS_CLIENT_SEND_BUFFER_SIZE];
243
};
245
};
244
246
245
#define NS_CLIENT_MAGIC	   ISC_MAGIC('N', 'S', 'C', 'c')
247
#define NS_CLIENT_MAGIC	   ISC_MAGIC('N', 'S', 'C', 'c')

Return to bug 1228255