xref: /openbmc/phosphor-misc/http-redirect/http-redirect.awk (revision 4ee8f2c55b20dbf1987d056fe436168ee574b654)
1#!/usr/bin/awk -f
2#
3# A Minimal HTTP/1.1 server to redirect http URIs to https
4
BEGIN {
	# Literal pieces reused when composing responses and trace output.
	dquote = "\""
	CRLF = "\r\n"

	# Whitelist: request path (compared after unescaping) mapped to the
	# path it is redirected to over https.  Nothing else is forwarded.
	https_resources["/"] = "/"

	# Request methods this server recognizes.
	methods["HEAD"] = 1
	methods["GET"] = 1

	# Status text for each numeric error code respond_error() can send.
	errors[505] = "505 HTTP Version Not Supported"
	errors[501] = "501 Not Implemented"
	errors[500] = "500 Internal Server Error"
	errors[404] = "404 Not Found"
	errors[400] = "400 Bad Request"

	# Optional body text used in place of the status text for a code.
	msgtxt[505] = "HTTP/1.1 only"
}
22
# RFC2616 19.3: drop the CR that precedes the LF record terminator, if any,
# so the rules below see the same text for CRLF and bare-LF line endings.
{ sub(/\r$/, "") }
25
# Until a method has been recorded we are still looking for the request
# line.  Empty lines ahead of it are skipped.
method == "" {
	if ($0 != "") {
		# Request-Line fields: Method SP Request-URI SP HTTP-Version
		method = $1
		request_uri = $2
		version = $3

		validate_request()
	}

	# Either way this line is consumed; headers begin on the next one.
	next
}
40
# RFC2616 4.2: a line starting with SP or HT continues the previous header.
/^[ \t]/ {
	# Trim both ends; one space joins the text to the earlier content.
	sub(/^[ \t]*/, "")
	sub(/[ \t]*$/, "")
	trace("extend header >"header"< with content >"$0"<")

	headers[header] = headers[header] " " $0
	next
}
51
# A header line is a field-name token, a ':' separator and the content.
# Implied LWS is allowed around the ':' separator, and LWS at the end of
# the content is not significant, so both are removed here.
match($0, /[ \t]*:[ \t]*/) {
	# Take both halves now, while RSTART/RLENGTH describe the separator.
	content = substr($0, RSTART + RLENGTH)
	header = substr($0, 1, RSTART - 1)
	sub(/[ \t]*$/, "", content)

	# The field name must be a single token.
	if (!is_token(header))
		respond_error(400)

	# Field names are case insensitive; store them in upper case.
	header = toupper(header)

	# RFC2616 4.2: a repeated header is only valid for comma separated
	# lists, so a repeat is appended to the earlier value with ", ".
	if (header in headers)
		headers[header] = headers[header] ", " content
	else
		headers[header] = content

	trace("found header >"header"< with content >"headers[header]"<")

	next
}
78
# An empty line ends the header section: validate what was collected and
# answer.  Any request body is ignored.
length($0) == 0 {
	trace("end of request headers")
	validate_request()

	# Split the request URI, pick the host to redirect to, then respond.
	validate_uri(request_uri, split_uri)
	host = find_host()
	path = split_uri["path"]
	validate_path_and_respond(host, path)

	next
}
92
# Fallback for a header-section line that is neither indented nor contains
# a ':' -- such a line is invalid.
{
	trace("Unparsed header line : >" $0 "<")

	# Record the raw line as a header name with empty content.
	headers[$0] = ""
	header = $0

	# Report an unsupported HTTP version in preference to the generic
	# bad request response.
	validate_request()
	respond_error(400)
	next
}
105
106############################################################
107
# Check the request line fields held in the globals version, request_uri
# and method.  Responds (and exits) with 505, 400 or 501 on the first
# failed check; returns normally when all three pass.
function validate_request()
{
	trace("version >"version"<")
	trace("uri >"request_uri"<")
	trace("method >"method"<")

	# HTTP-Version is "HTTP/" major "." minor.  Accept major version 1
	# with optional leading zeros and any minor version.  The pattern
	# is anchored at both ends so text before "HTTP/" is rejected too.
	if (version !~ /^HTTP\/0*1[.][0-9]+$/)
		respond_error(505)		# Version Not Supported

	if (bad_uric(request_uri))
		respond_error(400)		# Bad Request (bogus encoding)

	# The parentheses are required: unary ! binds tighter than "in",
	# so "!method in methods" would test the key (!method) instead.
	if (!(method in methods))
		respond_error(501)		# Not Implemented
}
120
# Split request_uri into the split_uri array and respond with 400 unless
# the result is acceptable as an http request URI.
function validate_uri(request_uri, split_uri)
{
	split_url_components(request_uri, split_uri)
	trace(dump_split_url(split_uri))

	if (is_http_request_uri(split_uri))
		return

	respond_error(400)			# Bad Request (didn't parse)
}
129
# Determine the host name to redirect to (RFC2616 5.2) from the global
# headers and split_uri arrays.  Responds with 400 when the Host header is
# missing or the chosen name fails a relaxed syntax check; otherwise the
# name is stored in the global host and returned.
function find_host()
{
	if (!("HOST" in headers))
		respond_error(400)

	host = headers["HOST"]
	trace("initial host is >" host "<")

	if (!("host" in split_uri)) {
		# RFC 2616 14.23  Host header is host:port of URI
		# RFC 2616 3.2.2 port may be not given or empty
		if (match(host, /:[0-9]*$/))
			host = substr(host, 1, RSTART - 1)
	} else {
		# A host in the request URI takes priority over the header.
		host = split_uri["host"]
	}
	trace("prioritized host is >" host "<")

	# A very relaxed check for domainlabel or IPv4.
	if (host ~ /^[0-9a-zA-Z.-]+$/) {
		trace("host passed regex")
		return host
	}

	respond_error(400)
}
153
# Send the final response for a parsed request: a 308 redirect to https on
# the given host when the unescaped path is whitelisted in https_resources,
# otherwise 404.  Not expected to return.
function validate_path_and_respond(host, path)
{
	# Paths are compared unescaped; the redirect target itself comes
	# from the whitelist, not from the request.
	lookup = unescape(path)
	trace("lookup path is >" lookup "<")

	if (!(lookup in https_resources)) {
		# Rather than be an open redirector, return Not Found
		respond_error(404)
	} else {
		newpath = "https://" host https_resources[lookup]
		trace("Redirecting to >" newpath "<\n")
		reason = "Access with a https:// URL"
		response = "308 Permanent Redirect"
		content = response CRLF newpath CRLF CRLF reason CRLF
		respond_and_exit(response, content, newpath)
	}

	# Both branches above exit; be noisy if one of them did not.
	trace("Failed to exit after response!")
	exit 3
}
178
# Return 1 when token is an RFC2616 2.2 token: one or more US-ASCII
# characters excluding CTLs (000-037, 177), SP and the separators
#	( ) < > @ , ; : \ " / [ ] ? = { } SP HT
# Return 0 otherwise, including for the empty string.
#
# The separators are checked with index() on a plain string rather than a
# bracket expression, which avoids the quoting pitfalls of placing ']',
# '\' and '/' inside a regular expression literal.  RSTART and RLENGTH
# are left untouched.
function is_token(token,  i, separators)
{
	# A token needs at least one character.
	if (token == "")
		return 0

	# Anything outside the printable range '!'..'~' is a CTL, SP or
	# non-ASCII.
	if (token ~ /[^\041-\176]/)
		return 0

	separators = "()<>@,;:\\\"/[]?={} \t"
	for (i = length(token); i > 0; i--)
		if (index(separators, substr(token, i, 1)))
			return 0

	return 1
}
188
# Return 1 when URI contains a character that is not allowed in a URI
# (RFC2396 uric: unreserved, reserved, or a %XX escape), else 0.
# URI is a scalar passed by value, so the caller's copy is not modified.
function bad_uric(URI)
{
	# Remove well-formed escapes; a '%' that survives is malformed.
	gsub(/%[0-9a-fA-F][0-9a-fA-F]/, "", URI)

	# Whatever remains must be mark, alpha, numeric or reserved.  The
	# double quote is an excluded delimiter in RFC2396, so it is not
	# part of the allowed set.
	if (URI ~ /[^-_.!~*'()a-zA-Z0-9;\/?:@&=+$,]/)
		return 1
	return 0
}
200
# Convert one hexadecimal digit character (either case) to its value 0-15.
# Only a few characters are expected, so index() is used instead of
# building a lookup table.  Any other input responds with 500, because
# bad_uric() should already have rejected it.
function hex2dec(chr,  v)
{
	# Require exactly one character: index() would otherwise also
	# match multi-character substrings such as "ab", and some awks
	# report a match for the empty string.
	if (length(chr) == 1) {
		v = index("0123456789abcdef", tolower(chr))
		if (v)
			return v - 1
	}

	trace("bad hex2dec character >" chr "<")
	# bad_uric should have caught input
	respond_error(500)			# Internal Server Error
}
212
# Return input with every %XX escape replaced by the character whose code
# is the hexadecimal value XX.  Text without a '%' comes back unchanged.
function unescape(input,  out)
{
	out = ""

	# Consume input up to and including each escape; the decoded prefix
	# accumulates in out while input shrinks to the unprocessed tail.
	while ((i = index(input, "%")) != 0) {
		code = hex2dec(substr(input, i + 1, 1)) * 16
		code = code + hex2dec(substr(input, i + 2, 1))
		out = out substr(input, 1, i - 1) sprintf("%c", code)
		input = substr(input, i + 3)
	}

	return out input
}
231
# Split url into its parts, with cues from RFC2396 appendix B et al.
# Each part that is present is stored in the components array under one of
# the keys: frag, query, scheme, path, userinfo, user, password, port, host.
# Keys for absent parts are not created; existing entries in components
# are not cleared.
function split_url_components(url, components,  userinfo)
{
	# Fragment: everything after the first '#'.
	if (match(url, /#/)) {
		components["frag"] = substr(url, RSTART + 1)
		url = substr(url, 1, RSTART - 1)
	}

	# Query: everything after the first remaining '?'.
	if (match(url, /\?/)) {
		components["query"] = substr(url, RSTART + 1)
		url = substr(url, 1, RSTART - 1)
	}

	# Scheme: leading characters up to the first ':'.
	if (match(url, /^[^:\/?#]+:/)) {
		components["scheme"] = substr(url, 1, RLENGTH - 1)
		url = substr(url, RLENGTH + 1)
	}

	# Without a leading "//" there is no authority: the rest is the path.
	if (substr(url, 1, 2) != "//") {
		components["path"] = url
		return
	}

	# Separate the authority from the path, which includes its first '/'.
	url = substr(url, 3)
	if (match(url, "/")) {
		components["path"] = substr(url, RSTART)
		url = substr(url, 1, RSTART - 1)
	}

	# Parse userinfo@host:port
	if (match(url, /@/)) {
		userinfo = substr(url, 1, RSTART - 1)
		url = substr(url, RSTART + 1)

		components["userinfo"] = userinfo
		if (match(userinfo, ":")) {
			# NOT RECOMMENDED
			components["password"] = substr(userinfo, RSTART + 1)
			# The user name is the part before the ':'.
			userinfo = substr(userinfo, 1, RSTART - 1)
		}
		components["user"] = userinfo
	}
	if (match(url, ":")) {
		# port is numeric or empty; it is not validated here
		components["port"] = substr(url, RSTART + 1)
		url = substr(url, 1, RSTART - 1)
	}
	if (url != "")
		components["host"] = url
}
282
# Return the line  "key": "value"  followed by a newline when key exists
# in array, otherwise the empty string.
#
# r is a local: the caller dump_split_url() accumulates into a variable of
# the same name, and a shared global would be overwritten here while the
# caller's  r = r dump_field_if_present(...)  is still being evaluated.
# The key is passed as a sprintf argument rather than spliced into the
# format, so a '%' in it is printed literally.
function dump_field_if_present(key, array,  r)
{
	r = ""
	if (key in array)
		r = sprintf("%s%s%s: %s%s%s\n", dquote, key, dquote,
			dquote, array[key], dquote)
	return r
}
290
# Return a multi-line, human readable dump of the URL parts present in
# components, for tracing.  Only the keys scheme, userinfo, host, port,
# path, query and frag are shown, in that order.
#
# The accumulator r is a local so that the helper, which assigns a
# variable of the same name, cannot overwrite it in the middle of a
# concatenation.
function dump_split_url(components,  r, names, count, k)
{
	count = split("scheme userinfo host port path query frag", names, " ")

	r = "split_url = {\n"
	for (k = 1; k <= count; k++)
		r = r dump_field_if_present(names[k], components)
	r = r "}\n"

	return r
}
305
# RFC2616 3.2.2 / 5.1.2: return 1 when the parts in split_url form an
# acceptable Request-URI -- either an absolute http URL or an absolute
# path -- and 0 otherwise.  For an absolute URL with an empty path,
# split_url["path"] is set to "/".
function is_http_request_uri(split_url,  scheme)
{
	# Fragments are handled by the client, user info is not on the wire.
	if (("frag" in split_url) || ("userinfo" in split_url))
		return 0
	trace("not frag, no user")

	# If absoluteURI, it will have both, if abs_path neither
	if (("scheme" in split_url) != ("host" in split_url))
		return 0
	trace("scheme host ok")

	# RFC2616 3.2.2: port = *DIGIT, so it may be empty but not
	# contain anything else.
	if (("port" in split_url) && (split_url["port"] !~ /^[0-9]*$/))
		return 0

	if ("scheme" in split_url) {
		trace("original scheme is:  >" split_url["scheme"] "<")
		scheme = unescape(split_url["scheme"])
		trace("unescaped scheme is: >" scheme "<")
		# HTTP 2616 3.2.3 scheme MUST be case insensitive
		if (tolower(scheme) != "http")
			return 0
		trace("scheme is http")

		# 3.2.2 http always has a net_url host authority, host not empty
		if (!("host" in split_url))
			return 0
		trace("host present >" split_url["host"] "<")

		# Authority name not empty
		if (split_url["host"] == "")
			return 0

		# 2616 3.2.3 empty path is /    sole fixup: scheme://hostport
		if (split_url["path"] == "")
			split_url["path"] = "/"
	}

	trace("path is now >" split_url["path"] "<")
	trace("first path char is >" substr(split_url["path"], 1, 1) "<")

	# The path must be absolute.
	return substr(split_url["path"], 1, 1) == "/"
}
348
# Return 1 when URI may be sent in a Location header, else 0.
# Policy: every response URL is https, and it must already be encoded,
# i.e. contain only characters that bad_uric() accepts.
function location_header_ok(URI)
{
	return (index(URI, "https://") == 1) && !bad_uric(URI)
}
361
# Return 1 when the response status text starts with "3" (any 3xx code)
# or with "201", else 0.
function response_needs_location(response)
{
	if (substr(response, 1, 1) == "3")
		return 1
	return substr(response, 1, 3) == "201"
}
366
# Write a complete HTTP/1.1 response to standard output and exit 0.
#
#   response  status text such as "404 Not Found"; anything not starting
#             with a three digit code and a space is replaced by a 500
#   content   the message body (omitted for HEAD requests)
#   URI       optional; sent as a Location header when it passes
#             location_header_ok() and the status calls for one
#
# The fixed headers always include "Connection: close".
function respond_and_exit(response, content, URI,  location, content_length)
{
	# Sanitize the status first so the Location decision below is made
	# on the status that is actually sent.
	if (response !~ /^[1-5][0-9][0-9] /) {
		trace( "DEBUG: response '" response "'\n" )
		trace( "DEBUG: content: '" content"'\n" )
		response = "500 Internal Server Error"
		content = response CRLF
	}

	# If the URI is given validate it should be sent and prepare header
	location = ""
	if (location_header_ok(URI) && response_needs_location(response))
		location = CRLF "Location: " URI

	# Computed before the body may be dropped below, so HEAD reports
	# the length of the body it would have sent.
	content_length = sprintf("Content-Length: %d", length(content))

	# RFC 2616 9.4 HEAD MUST NOT return message body.
	if (method == "HEAD") {
		content = ""
	}

	# Final trace before changing line endings, for visual separation
	trace("")

	# Separate header lines with CRLF but add nothing after the body
	OFS = CRLF
	ORS = ""

	# Status line (with the prepared Location header attached), then the
	# fixed response headers, an empty line and the body.  Header field
	# names use hyphens (X-Frame-Options, Cache-Control).
	print( "HTTP/1.1 " response location,
		content_length,
		"Content-Type: text/plain; charset=UTF-8",
		"X-Frame-Options: DENY",
		"Pragma: no-cache",
		"Cache-Control: no-Store,no-Cache",
		"X-XSS-Protection: 1; mode=block",
		"X-Content-Type-Options: nosniff",
		"Connection: close",
		"",
		content)

	# We told client to close the connection; also close this end.
	exit 0
}
414
# Send the error response for status code num and exit, closing the
# connection to avoid synchronization.  The body is msgtxt[num] when one
# is defined, otherwise the status text itself.  A code missing from
# errors is reported as a 500 naming the unknown number.
function respond_error(num)
{
	if (!(num in errors)) {
		respond_and_exit(errors[500], "unknown error number " num CRLF)
	} else if (num in msgtxt) {
		respond_and_exit(errors[num], msgtxt[num] CRLF)
	} else {
		respond_and_exit(errors[num], errors[num] CRLF)
	}
}
426
# Emit one line of trace output.  Tracing is off unless the tracefile
# and/or tracecmd variable is set (e.g. with awk -v): the line is then
# written to that file and/or piped to that command.
function trace(string)
{
	if (tracefile) {
		print(string) > tracefile
	}
	if (tracecmd) {
		print(string) | tracecmd
	}
}
435
436
437
438###########################################################
439
440# BEGIN {
441# # The character classes as defined in rfc 2396
442# reserved = ";/?:@&=+$,"
443# mark = "-_.!~*'()"
444# digit = "0123456789"
445# lower = "abcdefghijklmnopqrstuvwxyz"
446# upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
447# unreserved = lower upper digit mark
448#
449# control = 00-1F, 7F
450# space = " "
451# delims = "<>#%" dquote
452# unwise = "{}|\^[]`"
453# }
454
455################################################################
456
# Fill the hex2int array so that each hexadecimal digit character, in
# either case, maps to its integer value 0-15.
function make_hex2int(hex2int) {
	for (i = 0; i < 16; i++) {
		if (i < 10) {
			# decimal digits map to themselves
			hex2int[i] = i
			continue
		}
		# letters: position i - 9 within "abcdef"
		hex2int[substr("abcdef", i - 9, 1)] = i
		hex2int[substr("ABCDEF", i - 9, 1)] = i
	}
}
466