#!/usr/bin/awk -f
#
# A Minimal HTTP/1.1 server to redirect http URIs to https
#
# The request is read line by line from the input.  The first non-blank
# line is the request line, the following lines are headers, and the blank
# line that ends the headers triggers the response.  Every response path
# ends in respond_and_exit(), which prints the reply and exits.

BEGIN {
    CRLF = "\r\n"
    dquote = "\""

    # Methods this server implements; anything else gets 501.
    methods["GET"] = 1
    methods["HEAD"] = 1

    # Status lines, indexed by numeric code.
    errors[400] = "400 Bad Request"
    errors[404] = "404 Not Found"
    errors[500] = "500 Internal Server Error"
    errors[501] = "501 Not Implemented"
    errors[505] = "505 HTTP Version Not Supported"

    # Optional body text that replaces the status line in the body.
    msgtxt[505] = "HTTP/1.1 only"

    # Only forward these resources to the designated paths over https
    https_resources["/"] = "/"
}

# Strip trailing CR(\r) before LF(\n) RFC2616 19.3
/\r$/ { sub(/\r$/, "") }

# The first line is the HTTP request.
method == "" {
    # Blank lines ahead of the request line are skipped.
    if ($0 == "")
        next

    method = $1
    request_uri = $2
    version = $3

    validate_request()

    # headers start on the next line
    next
}

# a header continuation line RFC2616 4.2
/^[ \t]+/ {
    # A continuation line needs a preceding header to extend; without one
    # the request is malformed.
    if (header == "")
        respond_error(400)

    # Replace leading, trailing whitespace with space below
    sub(/[ \t]*$/, "")
    sub(/^[ \t]*/, "")
    trace("extend header >"header"< with content >"$0"<")

    headers[header] = headers[header] " " $0
    next
}

# Header lines start with a token and have a : separator.  Implied LWS is
# allowed around the : separator.  LWS at the beginning and end can be removed.
match($0, /[ \t]*:[ \t]*/) {
    header = substr($0, 1, RSTART - 1)
    content = substr($0, RSTART + RLENGTH)
    sub(/[ \t]*$/, "", content)

    # Field names are a single token.  LWS is implied allowed at the
    # : separator.  Any beginning or trailing LWS is not significant.
    if (!is_token(header))
        respond_error(400)

    # Headers are case insensitive, so normalize token to upper case.
    header = toupper(header)

    # RFC2616 4.2 multiple instances of a header is only valid for
    # comma separated lists.  Remove any trailing LWS, add ", " separator.
    prior = ""
    if (header in headers)
        prior = headers[header] ", "
    headers[header] = prior content

    trace("found header >"header"< with content >"headers[header]"<")

    next
}

# A blank line marks the end of the headers.
/^$/ {
    # Could read request body here but we don't care.
    trace("end of request headers")
    validate_request()

    validate_uri(request_uri, split_uri)
    host = find_host()
    path = split_uri["path"]
    validate_path_and_respond(host, path)

    next
}

# Should never get here: in headers a line without an indent nor a : is invalid.
{
    trace("Unparsed header line : >" $0 "<")

    header = $0
    headers[header] = ""

    # check HTTP version before bad request error response
    validate_request()
    respond_error(400)
    next
}

############################################################

# Check the request line globals (version, request_uri, method) and send
# an error response for the first problem found:
#   505 when the version is not HTTP/1.x,
#   400 when the URI contains characters bad_uric() rejects,
#   501 when the method is not listed in methods[].
# Returns normally only when all three checks pass.
function validate_request()
{
    trace("version >"version"<")
    trace("uri >"request_uri"<")
    trace("method >"method"<")

    # Support leading 0s, two halves.  Anchored at both ends so that
    # nothing may precede "HTTP/".
    if (version !~ /^HTTP\/0*1[.][0-9]+$/)
        respond_error(505)      # Version Not Supported
    if (bad_uric(request_uri))
        respond_error(400)      # Bad Request (bogus encoding)

    # Parentheses are required: unary ! binds tighter than "in", so
    # "!method in methods" would test whether the key (!method) exists.
    if (!(method in methods))
        respond_error(501)      # Not Implemented
}

# Split request_uri into split_uri[] and answer 400 unless the result is
# acceptable to is_http_request_uri().
function validate_uri(request_uri, split_uri)
{
    split_url_components(request_uri, split_uri)
    trace(dump_split_url(split_uri))

    if (!is_http_request_uri(split_uri))
        respond_error(400)      # Bad Request (didn't parse)
}

# Return the host name to redirect to.  A Host header is mandatory (400 if
# absent).  A host found in the request URI (global split_uri[]) takes
# priority over the header; otherwise a trailing :port is removed from the
# header value.  Answers 400 when the result fails the character check.
# The extra parameter is a local variable.
function find_host(    host)
{
    # RFC2616 5.2
    if (!("HOST" in headers))
        respond_error(400)

    host = headers["HOST"]
    trace("initial host is >" host "<")
    if ("host" in split_uri)
        host = split_uri["host"]
    else if (match(host, /:[0-9]*$/))
        # RFC 2616 14.23 Host header is host:port of URI
        # RFC 2616 3.2.2 port may be not given or empty
        host = substr(host, 1, RSTART - 1)
    trace("prioritized host is >" host "<")

    # A very relaxed check for domainlabel or IPv4.
    if (host !~ /^[0-9a-zA-Z.-]+$/)
        respond_error(400)
    trace("host passed regex")

    return host
}

# Send the redirect for a whitelisted path, or 404 for anything else.
# host and path come from the validated request.  Parameters after path
# are local variables.
function validate_path_and_respond(host, path,    lookup, newpath, response, reason, content)
{
    lookup = unescape(path)

    # URIs must be unescaped before compare, but forwarded unmodified
    trace("lookup path is >" lookup "<")

    # Translate our whitelisted URI
    if (lookup in https_resources) {
        newpath = "https://" host https_resources[lookup]
        trace("Redirecting to >" newpath "<\n")
        response = "308 Permanent Redirect"
        reason = "Access with a https:// URL"
        content = response CRLF newpath CRLF CRLF reason CRLF
        respond_and_exit(response, content, newpath)
    }

    # Rather than be an open redirector, return Not Found
    respond_error(404)          # Not Found

    # get noisy response if we didn't exit above
    trace("Failed to exit after response!")
    exit 3
}

# Return 1 when token is a valid RFC2616 2.2 token, 0 otherwise.
function is_token(token)
{
    # A token is one or more characters.
    if (token == "")
        return 0

    # US ASCII (0-127) excluding CTL (000-037, 177), SP (040), separators.
    # In the separator class "]" is listed first so it is taken literally;
    # the class covers ( ) < > @ , ; : \ " / [ ] ? = { } SP HT.
    if (match(token, /[^\041-\176]/) ||
        match(token, /[][()<>@,;:\\"\/?={} \t]/))
        return 0

    return 1
}

# unreserved, reserved, or encoded.
# Return 1 when URI holds a character that is not accepted once every
# %XX escape has been removed, 0 otherwise.  URI is a scalar passed by
# value, so the caller's copy is not modified.
function bad_uric(URI)
{
    # hide encoded
    gsub(/%[0-9a-fA-F][0-9a-fA-F]/, "", URI)

    # fail if remaining characters are not in (mark alpha numeric reserved)
    # NOTE(review): the class also accepts the double quote, which RFC 2396
    # lists among "delims" -- confirm whether that is intended.
    if (URI ~ /[^-_.!~*'()a-zA-Z0-9";\/?:@&=+$,]/)
        return 1
    return 0
}

# Convert one hexadecimal digit (either case) to its value 0-15.
# We only expect a few chars so call index vs building table hex2int[chr]
# Any other input produces a 500 response.  The extra parameter is local.
function hex2dec(chr,    v)
{
    v = index("0123456789abcdef", tolower(chr))
    if (v)
        return v - 1

    trace("bad hex2dec character >" chr "<")
    # bad_uric should have caught input
    respond_error(500)          # Internal Server Error
}

# Do % hex hex -> code replacement
# Returns input with each %XX sequence replaced by the character with that
# code.  Parameters after input are local variables.
function unescape(input, out,    i, code)
{
    i = index(input, "%")

    if (i == 0)
        return input

    out = ""
    while (i) {
        code = (hex2dec(substr(input, i + 1, 1)) * 16 + \
                hex2dec(substr(input, i + 2, 1)))
        # Copy the text before the escape, then the decoded character,
        # and continue with whatever follows the three escape characters.
        out = out substr(input, 1, i - 1) sprintf("%c", code)
        input = substr(input, i + 3)
        i = index(input, "%")
    }
    return out input
}

# With cues from RFC2396 appendix B etal
# Split url into components[].  Keys that may be set: "frag", "query",
# "scheme", "path", "userinfo", "password", "user", "port", "host".
# A key is only set when the matching part is present in url.
# The extra parameter is a local variable.
function split_url_components(url, components,    userinfo)
{
    if (match(url, /#/)) {
        components["frag"] = substr(url, RSTART + 1)
        url = substr(url, 1, RSTART - 1)
    }

    if (match(url, /\?/)) {
        components["query"] = substr(url, RSTART + 1)
        url = substr(url, 1, RSTART - 1)
    }

    if (match(url, /^[^:\/?#]+:/)) {
        components["scheme"] = substr(url, 1, RLENGTH - 1)
        url = substr(url, RLENGTH + 1)
    }

    # Maybe return early: Separate the path from the authority.
    if (substr(url, 1, 2) != "//") {
        components["path"] = url
        return
    } else if (match(substr(url, 3), "/")) {
        # RSTART is relative to the text after the leading "//".
        components["path"] = substr(url, 3 + RSTART - 1) # include the /
        url = substr(url, 3, RSTART - 1)
    } else {
        url = substr(url, 3)
    }

    # Parse userinfo@host:port
    if (match(url, /@/)) {
        userinfo = substr(url, 1, RSTART - 1)
        url = substr(url, RSTART + 1)

        components["userinfo"] = userinfo
        if (match(userinfo, ":")) {
            # NOT RECOMMENDED
            components["password"] = substr(userinfo, RSTART + 1)
            # The user name is everything before the colon.
            userinfo = substr(userinfo, 1, RSTART - 1)
        }
        components["user"] = userinfo
    }
    if (match(url, ":")) {
        # port is numeric or empty
        components["port"] = substr(url, RSTART + 1)
        url = substr(url, 1, RSTART - 1)
    }
    if (url)
        components["host"] = url
}

# Return one line of the form "key": "value" followed by a newline when key
# is present in array, or the empty string when it is not.
# The extra parameter is a local variable, so the caller's own accumulator
# is not disturbed.
function dump_field_if_present(key, array,    r)
{
    r = ""
    if (key in array)
        r = sprintf(dquote key dquote": "dquote"%s"dquote"\n", array[key])
    return r
}

# Return a multi-line description of the fields present in components[],
# for tracing.  The extra parameter is a local variable.
function dump_split_url(components,    r)
{
    r = "split_url = {\n"
    r = r dump_field_if_present("scheme", components)
    r = r dump_field_if_present("userinfo", components)
    r = r dump_field_if_present("host", components)
    r = r dump_field_if_present("port", components)
    r = r dump_field_if_present("path", components)
    r = r dump_field_if_present("query", components)
    r = r dump_field_if_present("frag", components)
    r = r "}\n"

    return r
}

# RFC2616 3.2.2
# Return 1 when split_url[] describes an acceptable request URI, else 0.
# May set split_url["path"] to "/" when an absolute URI has an empty path.
# The extra parameter is a local variable.
function is_http_request_uri(split_url,    scheme)
{
    # Fragments are handled by the client, user info is not on the wire.
    if (("frag" in split_url) || ("userinfo" in split_url))
        return 0
    trace("not frag, no user")

    # If absoluteURI, it will have both, if abs_path neither
    if (("scheme" in split_url) != ("host" in split_url))
        return 0
    trace("scheme host ok")

    if ("scheme" in split_url) {
        trace("original scheme is: >" split_url["scheme"] "<")
        scheme = unescape(split_url["scheme"])
        trace("unescaped scheme is: >" scheme "<")
        # HTTP 2616 3.2.3 scheme MUST be case insensitive
        if (tolower(scheme) != "http")
            return 0
        trace("scheme is http")

        # 3.2.2 http always has a net_url host authority, host not empty
        if (!("host" in split_url))
            return 0
        trace("host present >" split_url["host"] "<")

        # Authority name not empty
        if (split_url["host"] == "")
            return 0

        # 2616 3.2.3 empty path is / sole fixup: scheme://hostport
        if (split_url["path"] == "")
            split_url["path"] = "/"
    }

    trace("path is now >" split_url["path"] "<")
    trace("first path char is >" substr(split_url["path"], 1, 1) "<")

    # The path must be absolute.
    return substr(split_url["path"], 1, 1) == "/"
}

# Return 1 when URI may be sent in a Location header, else 0.
function location_header_ok(URI)
{
    # policy: all response URLs shall be https
    if (substr(URI, 1, 8) != "https://")
        return 0

    # The URL shall have been encoded
    if (bad_uric(URI))
        return 0

    return 1
}

# Return true when the response status starts with 3 or with 201.
function response_needs_location(response)
{
    return (response ~ /^3/) || (response ~ /^201/)
}

# Print the complete HTTP response and exit with status 0.
#   response  status code and reason phrase, e.g. "404 Not Found"
#   content   message body; dropped for HEAD requests
#   URI       optional target for the Location header
# A response that does not start with a three digit status and a space is
# replaced by a 500.  Parameters after URI are local variables.
function respond_and_exit(response, content, URI,    location, content_length)
{
    # If the URI is given validate it should be sent and prepare header
    if (location_header_ok(URI) && response_needs_location(response))
        location = CRLF "Location: " URI
    else
        location = ""

    if (response !~ /^[1-5][0-9][0-9] /) {
        trace( "DEBUG: response '" response "'\n" )
        trace( "DEBUG: content: '" content"'\n" )
        response = "500 Internal Server Error"
        content = response CRLF
    }

    # Computed before the body is dropped for HEAD, so HEAD reports the
    # length of the body that was prepared.
    content_length = sprintf("Content-Length: %d", length(content))

    # RFC 2616 9.4 HEAD MUST NOT return message body.
    if (method == "HEAD") {
        content = ""
    }

    # Final trace before changing line endings visual separation
    trace("")

    # Respond with protocol and response, prepared location from above,
    # and then the fixed response headers.

    # Separate header lines with CRLF but add nothing after the body
    OFS = CRLF
    ORS = ""

    # Header field names use hyphens; the underscore spellings are not the
    # standard X-Frame-Options and Cache-Control fields.
    print( "HTTP/1.1 " response location,
           content_length,
           "Content-Type: text/plain; charset=UTF-8",
           "X-Frame-Options: DENY",
           "Pragma: no-cache",
           "Cache-Control: no-Store,no-Cache",
           "X-XSS-Protection: 1; mode=block",
           "X-Content-Type-Options: nosniff",
           "Connection: close",
           "",
           content)

    # We told client to close the connection; also close this end.
    exit 0
}

# Respond with an error and close the connection to avoid synchronization.
# Send the error response for status code num through respond_and_exit().
# The body is msgtxt[num] when one is defined, otherwise the status line
# itself.  A code missing from errors[] is reported as a 500.
function respond_error(num)
{
    if (!(num in errors))
        respond_and_exit(errors[500], "unknown error number " num CRLF)
    else if (num in msgtxt)
        respond_and_exit(errors[num], msgtxt[num] CRLF)
    else
        respond_and_exit(errors[num], errors[num] CRLF)
}

# To generate a trace, set the tracefile or tracecmd variable with awk -v
# With neither variable set this function does nothing.
function trace(string)
{
    if (tracefile)
        print(string) > tracefile
    if (tracecmd)
        print(string) | tracecmd
}



###########################################################

# BEGIN {
#   # The character classes as defined in rfc 2396
#   reserved = ";/?:@&=+$,"
#   mark = "-_.!~*'()"
#   digit = "0123456789"
#   lower = "abcdefghijklmnopqrstuvwxyz"
#   upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#   unreserved = lower upper digit mark
#
#   control = 00-1F, 7F
#   space = " "
#   delims = "<>#%" dquote
#   unwise = "{}|\^[]`"
# }

################################################################

# Build a table to convert a hex character to an integer
# Fills hex2int[] with keys 0-9, A-F and a-f mapped to 0-15.
# The extra parameter keeps the loop index local to this function.
function make_hex2int(hex2int,    i) {
    for (i = 0; i < 10; i++)
        hex2int[i] = i
    for (i = 10; i < 16; i++) {
        hex2int[substr("ABCDEF", i - 10 + 1, 1)] = i
        hex2int[substr("abcdef", i - 10 + 1, 1)] = i
    }
}