/*- * Copyright 1997 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for MAXHOSTNAMELEN */ #include #include #include #include #include #include #include #include "fetch.h" struct http_state { char *http_hostname; char *http_remote_request; char *http_decoded_file; char *http_host_header; char *http_authentication; char *http_proxy_authentication; unsigned http_port; int http_redirected; }; struct http_auth { TAILQ_ENTRY(http_auth) ha_link; char *ha_scheme; char *ha_realm; char *ha_params; const struct http_auth_method *ha_ham; }; TAILQ_HEAD(http_auth_head, http_auth); static int http_parse(struct fetch_state *fs, const char *uri); static int http_proxy_parse(struct fetch_state *fs, const char *uri); static int http_close(struct fetch_state *fs); static int http_retrieve(struct fetch_state *fs); static int basic_doauth(struct fetch_state *fs, struct http_auth *ha, int prx); struct uri_scheme http_scheme = { "http", http_parse, http_proxy_parse, "HTTP_PROXY", "http" }; struct http_auth_head http_auth, http_proxy_auth; struct http_auth_method { const char *ham_scheme; int (*ham_doauth)(struct fetch_state *, struct http_auth *, int); } http_auth_methods[] = { { "basic", basic_doauth }, { 0, 0 } }; /* We are only concerned with headers we might receive. 
*/ enum http_header { ht_accept_ranges, ht_age, ht_allow, ht_cache_control, ht_connection, ht_content_base, ht_content_encoding, ht_content_language, ht_content_length, ht_content_location, ht_content_md5, ht_content_range, ht_content_type, ht_date, ht_etag, ht_expires, ht_last_modified, ht_location, ht_pragma, ht_proxy_authenticate, ht_public, ht_retry_after, ht_server, ht_transfer_encoding, ht_upgrade, ht_vary, ht_via, ht_www_authenticate, ht_warning, /* unusual cases */ ht_syntax_error, ht_unknown, ht_end_of_header }; static char *format_http_date(time_t when); static char *format_http_user_agent(void); static enum http_header http_parse_header(char *line, char **valuep); static int check_md5(FILE *fp, char *base64ofmd5); static int http_first_line(const char *line); static int http_suck(struct fetch_state *fs, FILE *remote, FILE *local, off_t total_length, int timo); static int http_suck_chunked(struct fetch_state *fs, FILE *remote, FILE *local, off_t total_length, int timo); static int parse_http_content_range(char *orig, off_t *first, off_t *total); static int process_http_auth(struct fetch_state *fs, char *hdr, int autherr); static struct http_auth *find_http_auth(struct http_auth_head *list, const char *scheme, const char *realm); static time_t parse_http_date(char *datestring); static void setup_http_auth(void); static int http_parse(struct fetch_state *fs, const char *u) { const char *p, *colon, *slash, *q; char *hostname, *hosthdr, *trimmed_name, *uri, *ques, saveq = 0; unsigned port; struct http_state *https; uri = alloca(strlen(u) + 1); strcpy(uri, u); p = uri + 5; port = 0; if (p[0] != '/' || p[1] != '/') { warnx("`%s': malformed `http' URL", uri); return EX_USAGE; } p += 2; if ((ques = strpbrk(p, "?#")) != NULL) { saveq = *ques; *ques = '\0'; } colon = strchr(p, ':'); slash = strchr(p, '/'); if (colon && slash && colon < slash) q = colon; else q = slash; if (q == 0) { warnx("`%s': malformed `http' URL", uri); return EX_USAGE; } hostname = alloca(q - 
p + 1); hostname[0] = '\0'; strncat(hostname, p, q - p); p = slash; if (q == colon && colon + 1 != slash) { unsigned long ul; char *ep; errno = 0; ul = strtoul(colon + 1, &ep, 10); if (ep != slash || ep == colon + 1 || errno != 0 || ul < 1 || ul > 65534) { warn("`%s': invalid port in URL", uri); return EX_USAGE; } port = ul; } else { port = 80; } p = slash; /* parsing finished, restore parm part */ if (ques != NULL) *ques = saveq; https = safe_malloc(sizeof *https); /* * Now, we have a copy of the hostname in hostname, the specified port * (or the default value) in port, and p points to the filename part * of the URI. */ https->http_hostname = safe_strdup(hostname); https->http_port = port; hosthdr = alloca(sizeof("Host: :\r\n") + 5 + strlen(hostname)); sprintf(hosthdr, "Host: %s:%d\r\n", hostname, port); https->http_host_header = safe_strdup(hosthdr); /* * NB: HTTP/1.1 servers MUST also accept a full URI. * However, HTTP/1.0 servers will ONLY accept a trimmed URI. */ https->http_remote_request = safe_strdup(p); p++; if (ques) { trimmed_name = safe_strndup(p, ques - p); } else { trimmed_name = safe_strdup(p); } https->http_decoded_file = percent_decode(trimmed_name); free(trimmed_name); p = https->http_decoded_file; /* now p is the decoded version, so we can extract the basename */ if (fs->fs_outputfile == 0) { slash = strrchr(p, '/'); if (slash) fs->fs_outputfile = slash + 1; else fs->fs_outputfile = p; } https->http_redirected = 0; https->http_authentication = https->http_proxy_authentication = 0; fs->fs_proto = https; fs->fs_close = http_close; fs->fs_retrieve = http_retrieve; return 0; } /* * An HTTP proxy works by accepting a complete URI in a GET request, * retrieving that object, and then forwarding it back to us. Because * it can conceivably handle any URI, we have to do a bit more work * in the parsing of it. 
*/ static int http_proxy_parse(struct fetch_state *fs, const char *uri) { struct http_state *https; const char *env, *slash, *ques; char *file; int rv; https = safe_malloc(sizeof *https); https->http_remote_request = safe_strdup(uri); env = getenv("HTTP_PROXY"); rv = parse_host_port(env, &https->http_hostname, &https->http_port); if (rv) { out: free(https->http_remote_request); free(https); return rv; } if (strncmp(uri, "http://", 7) == 0 || strncmp(uri, "ftp://", 6) == 0) { char *hosthdr; slash = strchr(uri + 7, '/'); if (slash == 0) { warnx("`%s': malformed `http' URL", uri); rv = EX_USAGE; free(https->http_hostname); goto out; } ques = strpbrk(slash, "?#"); if (ques == 0) file = safe_strdup(slash); else file = safe_strndup(slash, ques - slash); hosthdr = alloca(sizeof("Host: \r\n") + slash - uri - 7); strcpy(hosthdr, "Host: "); strncat(hosthdr, uri + 7, slash - uri - 7); strcat(hosthdr, "\r\n"); https->http_host_header = safe_strdup(hosthdr); } else { slash = uri; while (*slash && *slash != ':') slash++; if (*slash) slash++; if (slash[0] == '/' && slash[1] == '/') { slash += 2; while (*slash && *slash != '/') slash++; } file = safe_strdup(slash); https->http_host_header = safe_strdup(""); } https->http_decoded_file = percent_decode(file); https->http_redirected = 0; https->http_authentication = https->http_proxy_authentication = 0; free(file); if (fs->fs_outputfile == 0) { slash = strrchr(https->http_decoded_file, '/'); /* NB: we are not guaranteed to find one... */ fs->fs_outputfile = slash ? 
slash + 1 : https->http_decoded_file; } fs->fs_proto = https; fs->fs_close = http_close; fs->fs_retrieve = http_retrieve; return 0; } static int http_close(struct fetch_state *fs) { struct http_state *https = fs->fs_proto; free(https->http_hostname); free(https->http_remote_request); free(https->http_decoded_file); free(https->http_host_header); if (https->http_authentication) free(https->http_authentication); if (https->http_proxy_authentication) free(https->http_proxy_authentication); free(https); fs->fs_outputfile = 0; return 0; } static int nullclose(struct fetch_state *fs) { return 0; } /* * Process a redirection. This has a small memory leak. */ static int http_redirect(struct fetch_state *fs, char *new, int permanent) { struct http_state *https = fs->fs_proto; int num_redirects = https->http_redirected + 1; char *out = safe_strdup(fs->fs_outputfile); int rv; if (num_redirects > 5) { warnx("%s: HTTP redirection limit exceeded", out); return EX_PROTOCOL; } free(https->http_hostname); free(https->http_remote_request); free(https->http_decoded_file); free(https); warnx("%s: resource has moved %s to `%s'", out, permanent ? "permanently" : "temporarily", new); rv = http_parse(fs, new); if (rv != 0) { fs->fs_close = nullclose; /* XXX rethink interface? */ return rv; } https = fs->fs_proto; https->http_redirected = num_redirects; /* * This ensures that the output file name doesn't suddenly change * under the user's feet. Unfortunately, this results in a small * memory leak. I wish C had garbage collection... */ fs->fs_outputfile = out; rv = http_retrieve(fs); return rv; } /* * Read HTML-formatted data from remote and display it on stderr. * This is extremely incomplete, as all it does is delete anything * between angle brackets. However, this is usually good enough for * error messages. 
*/ static void html_display(FILE *remote) { char *line; size_t linelen; int inbracket = 0; while ((line = fgetln(remote, &linelen)) != 0) { char *end = line + linelen; char *p; int content = 0; for (p = line; p < end; p++) { if (*p == '<' && !inbracket) { fwrite(line, 1, (p - line), stderr); inbracket = 1; } if (!inbracket && !content && *p != '\n' && *p != '\r') content = 1; if (*p == '>' && inbracket) { line = p + 1; inbracket = 0; } } if (content && line < end) fwrite(line, 1, (end - line), stderr); } } /* * Get a file using HTTP. We will try to implement HTTP/1.1 eventually. * This subroutine makes heavy use of the 4.4-Lite standard I/O library, * in particular the `fgetln' which allows us to slurp an entire `line' * (an arbitrary string of non-NUL characters ending in a newline) directly * out of the stdio buffer. This makes interpreting the HTTP headers much * easier, since they are all guaranteed to end in `\r\n' and we can just * ignore the `\r'. */ static int http_retrieve(struct fetch_state *fs) { struct http_state *https; FILE *remote, *local; int s; struct sockaddr_in sin; struct msghdr msg; #define NIOV 16 /* max is currently 14 */ struct iovec iov[NIOV]; int n, status; const char *env; int timo; char *line, *new_location; char *errstr = 0; size_t linelen, writeresult; off_t total_length, restart_from; time_t last_modified, when_to_retry; char *base64ofmd5; int to_stdout, restarting, redirection, retrying, autherror, chunked; char rangebuf[sizeof("Range: bytes=18446744073709551616-\r\n")]; int tried_head; setup_http_auth(); https = fs->fs_proto; to_stdout = (strcmp(fs->fs_outputfile, "-") == 0); restarting = fs->fs_restart; redirection = 0; retrying = 0; tried_head = 0; /* * Figure out the timeout. Prefer the -T command-line value, * otherwise the HTTP_TIMEOUT envar, or else don't time out at all. 
*/ if (fs->fs_timeout) { timo = fs->fs_timeout; } else if ((env = getenv("HTTP_TIMEOUT")) != 0) { char *ep; unsigned long ul; errno = 0; ul = strtoul(env, &ep, 0); if (*ep != '\0' || *env == '\0' || errno != 0 || ul > INT_MAX) { warnx("`%s': invalid timeout", env); return EX_USAGE; } timo = ul; } else { timo = 0; } memset(&sin, 0, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof sin; sin.sin_port = htons(https->http_port); fs->fs_status = "looking up hostname"; if (inet_aton(https->http_hostname, &sin.sin_addr) == 0) { struct hostent *hp; /* XXX - do timeouts for name resolution? */ hp = gethostbyname2(https->http_hostname, AF_INET); if (hp == 0) { warnx("`%s': cannot resolve: %s", https->http_hostname, hstrerror(h_errno)); return EX_NOHOST; } memcpy(&sin.sin_addr, hp->h_addr_list[0], sizeof sin.sin_addr); } fs->fs_status = "creating request message"; msg.msg_name = (caddr_t)&sin; msg.msg_namelen = sizeof sin; msg.msg_iov = iov; n = 0; msg.msg_control = 0; msg.msg_controllen = 0; msg.msg_flags = fs->fs_linux_bug ? 0 : MSG_EOF; #define addstr(Iov, N, Str) \ do { \ Iov[N].iov_base = (void *)Str; \ Iov[N].iov_len = strlen(Iov[n].iov_base); \ N++; \ } while(0) retry: if (fs->fs_reportsize && !tried_head) { addstr(iov, n, "HEAD "); tried_head = 1; } else { addstr(iov, n, "GET "); tried_head = 0; } addstr(iov, n, https->http_remote_request); addstr(iov, n, " HTTP/1.1\r\n"); /* * The choice of HTTP/1.1 may be a bit controversial. The * specification says that implementations which are not at * least conditionally compliant MUST NOT call themselves * HTTP/1.1. We choose not to comply with that requirement. * (Eventually we will support the full HTTP/1.1, at which * time this comment will not apply. But it's amusing how * specifications attempt to define behavior for implementations * which aren't obeying the spec in the first place...) 
*/ addstr(iov, n, format_http_user_agent()); /* do content negotiation here */ addstr(iov, n, "Accept: */*\r\n"); addstr(iov, n, https->http_host_header); addstr(iov, n, "Connection: close\r\n"); if (https->http_proxy_authentication) addstr(iov, n, https->http_proxy_authentication); if (https->http_authentication) addstr(iov, n, https->http_authentication); if (fs->fs_mirror) { struct stat stab; errno = 0; if (((!to_stdout && stat(fs->fs_outputfile, &stab) == 0) || (to_stdout && fstat(STDOUT_FILENO, &stab) == 0)) && S_ISREG(stab.st_mode)) { addstr(iov, n, "If-Modified-Since: "); addstr(iov, n, format_http_date(stab.st_mtime)); addstr(iov, n, "\r\n"); } else if (errno != 0 || !S_ISREG(stab.st_mode)) { if (errno != 0) warn("%s", fs->fs_outputfile); else warnx("%s: not a regular file", fs->fs_outputfile); warnx("cannot mirror; will retrieve anew"); } } if (restarting) { struct stat stab; errno = 0; if (((!to_stdout && stat(fs->fs_outputfile, &stab) == 0) || (to_stdout && fstat(STDOUT_FILENO, &stab) == 0)) && S_ISREG(stab.st_mode)) { if (!fs->fs_forcerestart) { addstr(iov, n, "If-Range: "); addstr(iov, n, format_http_date(stab.st_mtime)); addstr(iov, n, "\r\n"); } sprintf(rangebuf, "Range: bytes=%qd-\r\n", (long long)stab.st_size); addstr(iov, n, rangebuf); } else if (errno != 0 || !S_ISREG(stab.st_mode)) { if (errno != 0) warn("%s", fs->fs_outputfile); else warnx("%s: not a regular file", fs->fs_outputfile); restarting = 0; warnx("cannot restart; will retrieve anew"); } } addstr(iov, n, "\r\n"); msg.msg_iovlen = n; if (n >= NIOV) err(EX_SOFTWARE, "request vector length exceeded: %d", n); s = socket(PF_INET, SOCK_STREAM, 0); if (s < 0) { warn("socket"); return EX_OSERR; } remote = fdopen(s, "r"); if (remote == 0) { warn("fdopen"); close(s); return EX_OSERR; } fs->fs_status = "sending request message"; setup_sigalrm(); alarm(timo); /* * Some hosts do not correctly handle data in SYN segments. 
* If no connect(2) is done, the TCP stack will send our * initial request as such a segment. fs_use_connect works * around these broken server TCPs by avoiding this case. * It is not the default because we want to exercise this * code path, and in any case the majority of hosts handle * our default correctly. */ if (fs->fs_use_connect && (connect(s, (struct sockaddr *)&sin, sizeof(struct sockaddr_in)) < 0)) { warn("connect: %s", https->http_hostname); fclose(remote); return EX_OSERR; } if (sendmsg(s, &msg, fs->fs_linux_bug ? 0 : MSG_EOF) < 0) { warn("sendmsg: %s", https->http_hostname); fclose(remote); return EX_OSERR; } got100reply: fs->fs_status = "reading reply status"; alarm(timo); line = fgetln(remote, &linelen); alarm(0); if (line == 0) { if (ferror(remote)) { warn("reading reply from %s", https->http_hostname); fclose(remote); unsetup_sigalrm(); return EX_OSERR; } else { warnx("empty reply from %s", https->http_hostname); fclose(remote); unsetup_sigalrm(); return EX_PROTOCOL; } } /* * If the other end is HTTP 0.9, then we just suck their * response over; can't do anything fancy. We assume that * the file is a text file, so it is safe to use fgetln() * to suck the entire file. (It had better be, since * we used it to grab the first line.) 
*/ if (linelen < 5 || strncasecmp(line, "http", 4) != 0) { if (to_stdout) local = fopen("/dev/stdout", "w"); else local = fopen(fs->fs_outputfile, "w"); if (local == 0) { warn("%s: fopen", fs->fs_outputfile); fclose(remote); unsetup_sigalrm(); return EX_OSERR; } fs->fs_status = "retrieving file from HTTP/0.9 server"; display(fs, -1, 0); do { writeresult = fwrite(line, 1, linelen, local); display(fs, -1, writeresult); if (writeresult != linelen) break; alarm(timo); line = fgetln(remote, &linelen); alarm(0); } while(line != 0); unsetup_sigalrm(); if (ferror(local)) { warn("%s", fs->fs_outputfile); fclose(local); fclose(remote); rm(fs); return EX_OSERR; } else if(ferror(remote)) { warn("%s", https->http_hostname); if (errno == ECONNRESET) warnx("(maybe try -b or -t)"); fclose(local); fclose(remote); rm(fs); return EX_OSERR; } fclose(local); fclose(remote); display(fs, -1, -1); return 0; } /* * OK. The other end is doing HTTP 1.0 at the very least. * This means that some of the fancy stuff is at least possible. */ autherror = 0; line[linelen - 1] = '\0'; /* turn line into a string */ status = http_first_line(line); switch(status) { case 100: /* Continue */ goto got100reply; case 200: /* Here come results */ case 203: /* Non-Authoritative Information */ restarting = 0; break; case 206: /* Here come partial results */ /* can only happen when restarting */ break; case 301: /* Resource has moved permanently */ if (fs->fs_auto_retry < 1) errstr = safe_strdup(line); else redirection = 301; break; case 302: /* Resource has moved temporarily */ /* * We formerly didn't test fs->fs_auto_retry here, * so that this sort of redirection would be transparent * to the user. 
Unfortunately, there are a lot of idiots * out there running Web sites, and some of them have * decided to implement the following stupidity: rather * than returning the correct `404 Not Found' error * when something is not found, they instead return * a 302 redirect, giving the erroneous impression that * the requested resource actually exists. This * breaks any client which expects a non-existent resource * to elicit a 40x response. Grrr. */ if (fs->fs_auto_retry < 0) /* -A flag */ errstr = safe_strdup(line); else redirection = 302; break; case 304: /* Object is unmodified */ if (fs->fs_mirror) { fclose(remote); unsetup_sigalrm(); return 0; } errstr = safe_strdup(line); break; case 401: /* Unauthorized */ if (https->http_authentication) errstr = safe_strdup(line); else autherror = 401; break; case 407: /* Proxy Authentication Required */ if (https->http_proxy_authentication) errstr = safe_strdup(line); else autherror = 407; break; case 501: /* Not Implemented */ /* If we tried HEAD, retry with GET */ if (tried_head) { n = 0; goto retry; } else { errstr = safe_strdup(line); break; } case 503: /* Service Unavailable */ if (!fs->fs_auto_retry) errstr = safe_strdup(line); else retrying = 503; break; default: errstr = safe_strdup(line); break; } total_length = -1; /* -1 means ``don't know'' */ last_modified = when_to_retry = -1; base64ofmd5 = 0; new_location = 0; restart_from = 0; chunked = 0; fs->fs_status = "parsing reply headers"; while((line = fgetln(remote, &linelen)) != 0) { char *value, *ep; enum http_header header; unsigned long ul; line[linelen - 1] = '\0'; header = http_parse_header(line, &value); if (header == ht_end_of_header) break; switch(header) { case ht_content_length: errno = 0; ul = strtoul(value, &ep, 10); if (errno != 0 || *ep) warnx("invalid Content-Length: `%s'", value); if (!restarting) total_length = ul; break; case ht_last_modified: last_modified = parse_http_date(value); if (last_modified == -1 && fs->fs_verbose > 0) warnx("invalid 
Last-Modified: `%s'", value); break; case ht_content_md5: base64ofmd5 = safe_strdup(value); break; case ht_content_range: if (!restarting) /* XXX protocol error */ break; /* NB: we might have to restart from farther back than we asked. */ status = parse_http_content_range(value, &restart_from, &total_length); /* If we couldn't understand the reply, get the whole thing. */ if (status) { restarting = 0; doretry: fclose(remote); if (base64ofmd5) free(base64ofmd5); if (new_location) free(new_location); restart_from = 0; n = 0; goto retry; } break; case ht_location: if (redirection) { char *s = value; while (*s && !isspace(*s)) s++; new_location = safe_strndup(value, s - value); } break; case ht_transfer_encoding: if (strncasecmp(value, "chunked", 7) == 0) { chunked = 1; break; } warnx("%s: %s specified Transfer-Encoding `%s'", fs->fs_outputfile, https->http_hostname, value); warnx("%s: output file may be uninterpretable", fs->fs_outputfile); break; case ht_retry_after: if (!retrying) break; errno = 0; ul = strtoul(value, &ep, 10); if (errno != 0 || (*ep && !isspace(*ep))) { time_t when; when = parse_http_date(value); if (when == -1) break; when_to_retry = when; } else { when_to_retry = time(0) + ul; } break; case ht_www_authenticate: if (autherror != 401) break; status = process_http_auth(fs, value, autherror); if (status != 0) goto cantauth; break; case ht_proxy_authenticate: if (autherror != 407) break; status = process_http_auth(fs, value, autherror); if (status != 0) goto cantauth; break; default: break; } } if (autherror == 401 && https->http_authentication) goto doretry; if (autherror == 407 && https->http_proxy_authentication) goto doretry; if (autherror) { goto spewerror; } if (retrying) { int howlong; if (when_to_retry == -1) { errstr = safe_strdup("HTTP/1.1 503 Service Unavailable"); goto spewerror; } howlong = when_to_retry - time(0); if (howlong < 30) howlong = 30; warnx("%s: service unavailable; retrying in %d seconds", https->http_hostname, howlong); 
fs->fs_status = "waiting to retry"; sleep(howlong); goto doretry; } if (errstr != 0) { spewerror: warnx("%s: %s: HTTP server returned error code %d", fs->fs_outputfile, https->http_hostname, status); if (fs->fs_verbose > 1) { fputs(errstr, stderr); fputc('\n', stderr); html_display(remote); } free(errstr); fclose(remote); unsetup_sigalrm(); return EX_UNAVAILABLE; } if (redirection && new_location) { fclose(remote); if (base64ofmd5) free(base64ofmd5); fs->fs_status = "processing redirection"; status = http_redirect(fs, new_location, redirection == 301); free(new_location); return status; } else if (redirection) { warnx("%s: redirection but no new location", fs->fs_outputfile); fclose(remote); if (base64ofmd5) free(base64ofmd5); return EX_PROTOCOL; } if (total_length > 0 && fs->fs_expectedsize != -1 && total_length != fs->fs_expectedsize) { warnx("%s: size mismatch, expected=%lu / actual=%lu", fs->fs_outputfile, (unsigned long)fs->fs_expectedsize, (unsigned long)total_length); fclose(remote); if (base64ofmd5) free(base64ofmd5); unsetup_sigalrm(); return EX_DATAERR; } fs->fs_status = "retrieving file from HTTP/1.x server"; if (fs->fs_reportsize) { if (total_length == -1) { warnx("%s: size not known\n", fs->fs_outputfile); printf("Unknown\n"); status = 1; } else { printf("%qd\n", (quad_t)total_length); status = 0; } fclose(remote); unsetup_sigalrm(); return status; } /* * OK, if we got here, then we have finished parsing the header * and have read the `\r\n' line which denotes the end of same. * We may or may not have a good idea of the length of the file * or its modtime. At this point we will have to deal with * any special byte-range, content-negotiation, redirection, * or authentication, and probably jump back up to the top, * once we implement those features. So, all we have left to * do is open up the output file and copy data from input to * output until EOF. */ if (to_stdout) local = fopen("/dev/stdout", restarting ? 
"a" : "w"); else local = fopen(fs->fs_outputfile, restarting ? "a+" : "w+"); if (local == 0) { warn("%s: fopen", fs->fs_outputfile); fclose(remote); unsetup_sigalrm(); return EX_OSERR; } fs->fs_modtime = last_modified; fseek(local, restart_from, SEEK_SET); /* XXX truncation off_t->long */ display(fs, total_length, restart_from); /* XXX truncation */ if (chunked) status = http_suck_chunked(fs, remote, local, total_length, timo); else status = http_suck(fs, remote, local, total_length, timo); if (status) goto out; status = errno; /* save errno for warn(), below, if needed */ if (display(fs, total_length, -1) != 0) { /* Check for truncated file */ errno = status; status = EX_PROTOCOL; goto out; } errno = status; if (ferror(remote)) { warn("reading remote file from %s", https->http_hostname); if (errno == ECONNRESET) warnx("(maybe try -b or -t)"); status = EX_OSERR; } else if(ferror(local)) { warn("`%s': fwrite", fs->fs_outputfile); status = EX_OSERR; } else { status = 0; } if (base64ofmd5) { /* * Ack. When restarting, the MD5 only covers the parts * we are getting, not the whole thing. */ fseek(local, restart_from, SEEK_SET); fs->fs_status = "computing MD5 message digest"; if (!to_stdout) status = check_md5(local, base64ofmd5); else warnx("can't check md5 digest on stdout: %s", base64ofmd5); free(base64ofmd5); } fclose(local); out: unsetup_sigalrm(); fclose(remote); if (status != 0) rm(fs); else adjmodtime(fs); return status; #undef addstr cantauth: warnx("%s: cannot authenticate with %s %s", fs->fs_outputfile, (autherror == 401) ? "server" : "proxy", https->http_hostname); status = EX_NOPERM; goto out; } /* * Suck over an HTTP body in standard form. 
*/ static int http_suck(struct fetch_state *fs, FILE *remote, FILE *local, off_t total_length, int timo) { static char buf[BUFFER_SIZE]; ssize_t readresult, writeresult; off_t remain = total_length; if (total_length == -1) remain = 1; /*XXX*/ do { alarm(timo); readresult = fread(buf, 1, sizeof buf, remote); alarm(0); /* * If know the content-length, ignore anything more the * the server chooses to send us. */ if (total_length != -1 && ((remain -= readresult) < 0)) readresult += remain; if (readresult == 0) return 0; display(fs, total_length, readresult); writeresult = fwrite(buf, 1, readresult, local); } while (writeresult == readresult && remain > 0); return 0; } /* * Suck over an HTTP body in chunked form. Ick. * Note that the return value convention here is a bit strange. * A zero return does not necessarily mean success; rather, it means * that this routine has already taken care of error reporting and * just wants to exit. */ static int http_suck_chunked(struct fetch_state *fs, FILE *remote, FILE *local, off_t total_length, int timo) { static char buf[BUFFER_SIZE]; ssize_t readresult, writeresult; size_t linelen; u_long chunklen; char *line, *ep; for (;;) { alarm(timo); line = fgetln(remote, &linelen); alarm(0); if (line == 0) { warnx("%s: error processing chunked encoding: " "missing length", fs->fs_outputfile); return EX_PROTOCOL; } line[--linelen] = '\0'; for (; linelen > 0; linelen--) { if (isspace(line[linelen - 1])) line[linelen - 1] = '\0'; } errno = 0; chunklen = strtoul(line, &ep, 16); if (errno || *line == 0 || (*ep && !isspace(*ep) && *ep != ';')) { warnx("%s: error processing chunked encoding: " "uninterpretable length: %s", fs->fs_outputfile, line); return EX_PROTOCOL; } if (chunklen == 0) break; #ifndef MIN #define MIN(a,b) ((a)>(b)?(b):(a)) #endif while (chunklen > 0) { alarm(timo); readresult = fread(buf, 1, MIN(sizeof buf, chunklen), remote); alarm(0); if (readresult == 0) { warnx("%s: EOF with %lu left in chunk", fs->fs_outputfile, chunklen); 
return EX_PROTOCOL; } display(fs, total_length, readresult); chunklen -= readresult; writeresult = fwrite(buf, 1, readresult, local); if (writeresult != readresult) return 0; /* main code will diagnose */ } /* * Read the bogus CRLF after the chunk's body. */ alarm(timo); fread(buf, 1, 2, remote); alarm(0); } /* * If we got here, then we successfully read every chunk and got * the end-of-chunks indicator. Now we have to ignore any trailer * lines which come across---or we would if we cared about keeping * the connection open. Since we are just going to close it anyway, * we won't bother with that here. If ever something important is * defined for the trailer, we will have to revisit that decision. */ return 0; } /* * The format of the response line for an HTTP request is: * HTTP/V.vv{WS}999{WS}Explanatory text for humans to read\r\n * Old pre-HTTP/1.0 servers can return * HTTP{WS}999{WS}Explanatory text for humans to read\r\n * Where {WS} represents whitespace (spaces and/or tabs) and 999 * is a machine-interprable result code. We return the integer value * of that result code, or the impossible value `0' if we are unable to * parse the result. */ static int http_first_line(const char *line) { char *ep; unsigned long ul; if (strncasecmp(line, "http", 4) != 0) return 0; line += 4; while (*line && !isspace(*line)) /* skip non-whitespace */ line++; while (*line && isspace(*line)) /* skip first whitespace */ line++; errno = 0; ul = strtoul(line, &ep, 10); if (errno != 0 || ul > 999 || ul < 100 || !isspace(*ep)) return 0; return ul; } /* * The format of a header line for an HTTP request is: * Header-Name: header-value (with comments in parens)\r\n * This would be a nice application for gperf(1), except that the * names are case-insensitive and gperf can't handle that. */ static enum http_header http_parse_header(char *line, char **valuep) { char *colon, *value; if (*line == '\0' /* protocol error! 
*/ || (line[0] == '\r' && line[1] == '\0')) return ht_end_of_header; colon = strchr(line, ':'); if (colon == 0) return ht_syntax_error; *colon = '\0'; for (value = colon + 1; *value && isspace(*value); value++) ; /* do nothing */ /* Trim trailing whitespace (including \r). */ *valuep = value; value += strlen(value) - 1; while (value > *valuep && isspace(*value)) value--; *++value = '\0'; #define cmp(name, num) do { if (!strcasecmp(line, name)) return num; } while(0) cmp("Accept-Ranges", ht_accept_ranges); cmp("Age", ht_age); cmp("Allow", ht_allow); cmp("Cache-Control", ht_cache_control); cmp("Connection", ht_connection); cmp("Content-Base", ht_content_base); cmp("Content-Encoding", ht_content_encoding); cmp("Content-Language", ht_content_language); cmp("Content-Length", ht_content_length); cmp("Content-Location", ht_content_location); cmp("Content-MD5", ht_content_md5); cmp("Content-Range", ht_content_range); cmp("Content-Type", ht_content_type); cmp("Date", ht_date); cmp("ETag", ht_etag); cmp("Expires", ht_expires); cmp("Last-Modified", ht_last_modified); cmp("Location", ht_location); cmp("Pragma", ht_pragma); cmp("Proxy-Authenticate", ht_proxy_authenticate); cmp("Public", ht_public); cmp("Retry-After", ht_retry_after); cmp("Server", ht_server); cmp("Transfer-Encoding", ht_transfer_encoding); cmp("Upgrade", ht_upgrade); cmp("Vary", ht_vary); cmp("Via", ht_via); cmp("WWW-Authenticate", ht_www_authenticate); cmp("Warning", ht_warning); #undef cmp return ht_unknown; } /* * Compute the RSA Data Security, Inc., MD5 Message Digest of the file * given in `fp', see if it matches the one given in base64 encoding by * `base64ofmd5'. Warn and return an error if it doesn't. 
*/
static int
check_md5(FILE *fp, char *base64ofmd5)
{
	MD5_CTX ctx;
	unsigned char digest[16];
	char buf[512];
	size_t len;
	char *ourval;

	/* Digest everything from the current file position to EOF. */
	MD5Init(&ctx);
	while ((len = fread(buf, 1, sizeof buf, fp)) != 0) {
		MD5Update(&ctx, buf, len);
	}
	MD5Final(digest, &ctx);
	ourval = to_base64(digest, 16);
	if (strcmp(ourval, base64ofmd5) != 0) {
		warnx("MD5 digest mismatch: %s, should be %s", ourval,
		      base64ofmd5);
		free(ourval);
		return EX_DATAERR;
	}
	free(ourval);
	return 0;
}

static const char *wkdays[] = {
	"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
};
static const char *months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
	"Nov", "Dec"
};

/*
 * Interpret one of the three possible formats for an HTTP date.
 * All of them are really bogus; HTTP should use either ISO 8601
 * or NTP timestamps.  We make some attempt to accept a subset of 8601
 * format.  The three standard formats are all fixed-length subsets of their
 * respective standards (except 8601, which puts all of the stuff we
 * care about up front).
 *
 * Returns the time as a time_t, interpreted as UTC, or -1 if the string
 * cannot be parsed or has out-of-range fields.
 */
static time_t
parse_http_date(char *string)
{
	static struct tm tm;	/* get good initialization */
	time_t rv;
	const char *tz;
	char *saved_tz;
	int i;

#define digit(x) (string[x] - '0')

	/* 8601 has the shortest minimum length */
	if (strlen(string) < 15)
		return -1;

	if (isdigit((unsigned char)*string)) {
		/* ISO 8601: 19970127T134551stuffwedon'tcareabout */
		for (i = 0; i < 15; i++) {
			if (i != 8 && !isdigit((unsigned char)string[i]))
				break;
		}
		if (i < 15)
			return -1;
		tm.tm_year = (digit(0) * 1000
			      + digit(1) * 100
			      + digit(2) * 10
			      + digit(3)) - 1900;
		tm.tm_mon = digit(4) * 10 + digit(5) - 1;
		tm.tm_mday = digit(6) * 10 + digit(7);
		if (string[8] != 'T' && string[8] != 't' && string[8] != ' ')
			return -1;
		tm.tm_hour = digit(9) * 10 + digit(10);
		tm.tm_min = digit(11) * 10 + digit(12);
		tm.tm_sec = digit(13) * 10 + digit(14);
		/* We don't care about the rest of the stuff after the secs. */
	} else if (string[3] == ',') {
		/* Mon, 27 Jan 1997 14:24:35 stuffwedon'tcareabout */
		if (strlen(string) < 25)
			return -1;
		string += 5;	/* skip over day-of-week */
		if (!(isdigit((unsigned char)string[0])
		      && isdigit((unsigned char)string[1])))
			return -1;
		tm.tm_mday = digit(0) * 10 + digit(1);
		for (i = 0; i < 12; i++) {
			if (strncasecmp(months[i], &string[3], 3) == 0)
				break;
		}
		if (i >= 12)
			return -1;
		tm.tm_mon = i;

		if (sscanf(&string[7], "%d %d:%d:%d", &i, &tm.tm_hour,
			   &tm.tm_min, &tm.tm_sec) != 4)
			return -1;
		tm.tm_year = i - 1900;

	} else if (string[3] == ' ') {
		/* Mon Jan 27 14:25:20 1997 */
		if (strlen(string) < 24)
			return -1;
		string += 4;
		for (i = 0; i < 12; i++) {
			if (strncasecmp(string, months[i], 3) == 0)
				break;
		}
		if (i >= 12)
			return -1;
		tm.tm_mon = i;
		/* `i' is an int, so scan the year with %d, not %u */
		if (sscanf(&string[4], "%d %d:%d:%d %d", &tm.tm_mday,
			   &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &i)
		    != 5)
			return -1;
		tm.tm_year = i - 1900;
	} else {
		/* Monday, 27-Jan-97 14:31:09 stuffwedon'tcareabout */
		/* Quoth RFC 2068:
		  o  HTTP/1.1 clients and caches should assume that an RFC-850
		     date which appears to be more than 50 years in the future
		     is in fact in the past (this helps solve the "year 2000"
		     problem).
		*/
		time_t now;
		struct tm *tmnow;
		int this2dyear;
		char *comma = strchr(string, ',');
		char mname[4];

		if (comma == 0)
			return -1;
		string = comma + 1;
		if (strlen(string) < 19)
			return -1;
		string++;
		mname[3] = '\0';	/* three letters + terminator */
		if (sscanf(string, "%d-%c%c%c-%d %d:%d:%d", &tm.tm_mday,
			   mname, mname + 1, mname + 2, &tm.tm_year,
			   &tm.tm_hour, &tm.tm_min, &tm.tm_sec) != 8)
			return -1;
		for (i = 0; i < 12; i++) {
			/* stop at the month whose name matches */
			if (strcasecmp(months[i], mname) == 0)
				break;
		}
		if (i >= 12)
			return -1;
		tm.tm_mon = i;
		/*
		 * RFC 2068 year interpretation.
		 */
		time(&now);
		tmnow = gmtime(&now);
		this2dyear = tmnow->tm_year % 100;
		tm.tm_year += tmnow->tm_year - this2dyear;
		if (tm.tm_year - tmnow->tm_year >= 50)
			tm.tm_year -= 100;
	}
#undef digit

	if (tm.tm_sec > 60 || tm.tm_min > 59 || tm.tm_hour > 23
	    || tm.tm_mday > 31 || tm.tm_mon > 11)
		return -1;
	if (tm.tm_sec < 0 || tm.tm_min < 0 || tm.tm_hour < 0
	    || tm.tm_mday < 0 || tm.tm_mon < 0 || tm.tm_year < 0)
		return -1;

	/*
	 * Run mktime() with TZ forced to UTC.  The old value must be
	 * copied first: the string getenv() returned may be overwritten
	 * or released by the setenv() call that follows.
	 */
	tz = getenv("TZ");
	saved_tz = tz ? safe_strdup(tz) : 0;
	setenv("TZ", "UTC0", 1);
	tzset();
	rv = mktime(&tm);
	if (saved_tz) {
		setenv("TZ", saved_tz, 1);
		free(saved_tz);
	} else {
		unsetenv("TZ");
	}
	tzset();	/* make the restored zone take effect again */

	return rv;
}

/*
 * Format `when' as an HTTP date in GMT (or as ISO 8601 when
 * HTTP_DATE_ISO_8601 is defined).  Returns a pointer to a static
 * buffer that is overwritten by the next call, or 0 if gmtime() fails.
 */
static char *
format_http_date(time_t when)
{
	struct tm *tm;
	static char buf[30];

	tm = gmtime(&when);
	if (tm == 0)
		return 0;
#ifndef HTTP_DATE_ISO_8601
	snprintf(buf, sizeof buf, "%s, %02d %s %04d %02d:%02d:%02d GMT",
		 wkdays[tm->tm_wday], tm->tm_mday, months[tm->tm_mon],
		 tm->tm_year + 1900, tm->tm_hour, tm->tm_min, tm->tm_sec);
#else /* ISO 8601 */
	snprintf(buf, sizeof buf, "%04d%02d%02dT%02d%02d%02d+0000",
		 tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
		 tm->tm_hour, tm->tm_min, tm->tm_sec);
#endif
	return buf;
}

/*
 * Return the complete `User-Agent: ...\r\n' header line.  It is built
 * from the kernel's ostype/osrelease/machine sysctls on the first call
 * and cached in a static buffer for later calls.
 */
static char *
format_http_user_agent(void)
{
	static char buf[128];
	static int inited;

	if (!inited) {
		int mib[2];
		char ostype[128], osrelease[128], machine[128];
		size_t len;

		mib[0] = CTL_KERN;
		mib[1] = KERN_OSTYPE;
		len = sizeof ostype;
		if (sysctl(mib, 2, ostype, &len, 0, 0) < 0) {
			warn("sysctl");
			ostype[0] = '\0';
		}
		mib[1] = KERN_OSRELEASE;
		len = sizeof osrelease;
		if (sysctl(mib, 2, osrelease, &len, 0, 0) < 0) {
			warn("sysctl");
			osrelease[0] = '\0';
		}
		mib[0] = CTL_HW;
		mib[1] = HW_MACHINE;
		len = sizeof machine;
		if (sysctl(mib, 2, machine, &len, 0, 0) < 0) {
			warn("sysctl");
			machine[0] = '\0';
		}

		snprintf(buf, sizeof buf,
			 "User-Agent: " FETCH_VERSION " %s/%s (%s)\r\n",
			 ostype, osrelease, machine);
		inited = 1;	/* the header never changes; build it once */
	}
	return buf;
}

/*
 * Parse a Content-Range return header from the server.
RFC 2066 defines * this header to have the format: * Content-Range: bytes 12345-67890/123456 * Since we always ask for the whole rest of the file, we consider it an * error if the reply doesn't claim to give it to us. */ static int parse_http_content_range(char *orig, off_t *restart_from, off_t *total_length) { u_quad_t first, last, total; char *ep; if (strncasecmp(orig, "bytes", 5) != 0) { warnx("unknown Content-Range unit: `%s'", orig); return EX_PROTOCOL; } orig += 5; while (*orig && isspace(*orig)) orig++; errno = 0; first = strtouq(orig, &ep, 10); if (errno != 0 || *ep != '-') { warnx("invalid Content-Range: `%s'", orig); return EX_PROTOCOL; } last = strtouq(ep + 1, &ep, 10); if (errno != 0 || *ep != '/' || last < first) { warnx("invalid Content-Range: `%s'", orig); return EX_PROTOCOL; } total = strtouq(ep + 1, &ep, 10); if (errno != 0 || !(*ep == '\0' || isspace(*ep))) { warnx("invalid Content-Range: `%s'", orig); return EX_PROTOCOL; } if (last + 1 != total) { warnx("HTTP server did not return requested Content-Range"); return EX_PROTOCOL; } *restart_from = first; *total_length = last; return 0; } /* * Do HTTP authentication. We only do ``basic'' right now, but * MD5 ought to be fairly easy. The hard part is actually teasing * apart the header, which is fairly badly designed (so what else is * new?). */ static char * getauthparam(char *params, const char *name) { char *rv; enum state { normal, quoted } state; while (*params) { if (strncasecmp(params, name, strlen(name)) == 0 && params[strlen(name)] == '=') break; state = normal; while (*params) { if (state == normal && *params == ',') break; if (*params == '\"') state = (state == quoted) ? normal : quoted; if (*params == '\\' && params[1] != '\0') params++; params++; } } if (*params == '\0') return 0; params += strlen(name) + 1; rv = params; state = normal; while (*params) { if (state == normal && *params == ',') break; if (*params == '\"') state = (state == quoted) ? 
normal : quoted; if (*params == '\\' && params[1] != '\0') params++; params++; } if (params[-1] == '\"') params[-1] = '\0'; else params[0] = '\0'; if (*rv == '\"') rv++; return rv; } static int process_http_auth(struct fetch_state *fs, char *hdr, int autherr) { enum state { normal, quoted } state; char *scheme, *params, *nscheme, *realm; struct http_auth *ha; do { scheme = params = hdr; /* Look for end of scheme name. */ while (*params && !isspace(*params)) params++; if (*params == '\0') return EX_PROTOCOL; /* Null-terminate scheme and skip whitespace. */ while (*params && isspace(*params)) *params++ = '\0'; /* Semi-parse parameters to find their end. */ nscheme = params; state = normal; while (*nscheme) { if (state == normal && isspace(*nscheme)) break; if (*nscheme == '\"') state = (state == quoted) ? normal : quoted; if (*nscheme == '\\' && nscheme[1] != '\0') nscheme++; nscheme++; } /* Null-terminate parameters and skip whitespace. */ while (*nscheme && isspace(*nscheme)) *nscheme++ = '\0'; realm = getauthparam(params, "realm"); if (realm == 0) { scheme = nscheme; continue; } if (autherr == 401) ha = find_http_auth(&http_auth, scheme, realm); else ha = find_http_auth(&http_proxy_auth, scheme, realm); if (ha) return ha->ha_ham->ham_doauth(fs, ha, autherr == 407); } while (*scheme); return EX_NOPERM; } static void parse_http_auth_env(const char *env, struct http_auth_head *ha_tqh) { char *nenv, *p, *scheme, *realm, *params; struct http_auth *ha; struct http_auth_method *ham; nenv = alloca(strlen(env) + 1); strcpy(nenv, env); while ((p = strsep(&nenv, " \t")) != 0) { scheme = strsep(&p, ":"); if (scheme == 0 || *scheme == '\0') continue; realm = strsep(&p, ":"); if (realm == 0 || *realm == '\0') continue; params = (p && *p) ? 
p : 0; for (ham = http_auth_methods; ham->ham_scheme; ham++) { if (strcasecmp(scheme, ham->ham_scheme) == 0) break; } if (ham == 0) continue; ha = safe_malloc(sizeof *ha); ha->ha_scheme = safe_strdup(scheme); ha->ha_realm = safe_strdup(realm); ha->ha_params = params ? safe_strdup(params) : 0; ha->ha_ham = ham; TAILQ_INSERT_TAIL(ha_tqh, ha, ha_link); } } /* * Look up an authentication method. Automatically clone wildcards * into fully-specified entries. */ static struct http_auth * find_http_auth(struct http_auth_head *tqh, const char *scm, const char *realm) { struct http_auth *ha; for (ha = tqh->tqh_first; ha; ha = ha->ha_link.tqe_next) { if (strcasecmp(ha->ha_scheme, scm) == 0 && strcasecmp(ha->ha_realm, realm) == 0) return ha; } for (ha = tqh->tqh_first; ha; ha = ha->ha_link.tqe_next) { if (strcasecmp(ha->ha_scheme, scm) == 0 && strcmp(ha->ha_realm, "*") == 0) break; } if (ha != 0) { struct http_auth *ha2; ha2 = safe_malloc(sizeof *ha2); ha2->ha_scheme = safe_strdup(scm); ha2->ha_realm = safe_strdup(realm); ha2->ha_params = ha->ha_params ? 
safe_strdup(ha->ha_params) :0; ha2->ha_ham = ha->ha_ham; TAILQ_INSERT_TAIL(tqh, ha2, ha_link); ha = ha2; } return ha; } static void setup_http_auth(void) { const char *envar; static int once; if (once) return; once = 1; TAILQ_INIT(&http_auth); TAILQ_INIT(&http_proxy_auth); envar = getenv("HTTP_AUTH"); if (envar) parse_http_auth_env(envar, &http_auth); envar = getenv("HTTP_PROXY_AUTH"); if (envar) parse_http_auth_env(envar, &http_proxy_auth); } static int basic_doauth(struct fetch_state *fs, struct http_auth *ha, int isproxy) { struct http_state *https = fs->fs_proto; char *user; char *pass; char *enc; char **hdr; size_t userlen; FILE *fp; if (!isatty(0) && (ha->ha_params == 0 || strchr(ha->ha_params, ':') == 0)) return EX_NOPERM; if (ha->ha_params == 0) { fp = fopen("/dev/tty", "r+"); if (fp == 0) { warn("opening /dev/tty"); return EX_OSERR; } fprintf(fp, "Enter `basic' user name for realm `%s': ", ha->ha_realm); fflush(fp); user = fgetln(fp, &userlen); if (user == 0 || userlen < 1) { /* longer name? */ return EX_NOPERM; } if (user[userlen - 1] == '\n') user[userlen - 1] = '\0'; else user[userlen] = '\0'; user = safe_strdup(user); fclose(fp); pass = 0; } else if ((pass = strchr(ha->ha_params, ':')) == 0) { user = safe_strdup(ha->ha_params); free(ha->ha_params); } if (pass == 0) { pass = getpass("Password: "); ha->ha_params = safe_malloc(strlen(user) + 2 + strlen(pass)); strcpy(ha->ha_params, user); strcat(ha->ha_params, ":"); strcat(ha->ha_params, pass); } enc = to_base64(ha->ha_params, strlen(ha->ha_params)); hdr = isproxy ? &https->http_proxy_authentication : &https->http_authentication; if (*hdr) free(*hdr); *hdr = safe_malloc(sizeof("Proxy-Authorization: basic \r\n") + strlen(enc)); if (isproxy) strcpy(*hdr, "Proxy-Authorization"); else strcpy(*hdr, "Authorization"); strcat(*hdr, ": Basic "); strcat(*hdr, enc); strcat(*hdr, "\r\n"); free(enc); return 0; }