diff options
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c | 1079 |
1 files changed, 1010 insertions, 69 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c index 0bdabf9..475d8b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c +++ b/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c @@ -109,6 +109,7 @@ #include <sys/zone.h> #include <sys/socket.h> #include <netinet/in.h> +#include "strtolctype.h" /* FreeBSD includes: */ #if !defined(sun) @@ -214,6 +215,7 @@ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ +static int dtrace_getf; /* number of unpriv getf()s */ #if defined(sun) static void *dtrace_softstate; /* softstate pointer */ #endif @@ -464,8 +466,8 @@ static kmutex_t dtrace_errlock; * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ - ((testaddr) - (baseaddr) < (basesz) && \ - (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ + ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ + (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) /* @@ -574,6 +576,8 @@ void dtrace_dynvar_clean(dtrace_dstate_t *); dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *, size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *); uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *); +static int dtrace_priv_proc(dtrace_state_t *); +static void dtrace_getf_barrier(void); /* * DTrace Probe Context Functions @@ -718,7 +722,7 @@ dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, * up both thread-local variables and any global dynamically-allocated * variables. */ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base, + if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, vstate->dtvs_dynvars.dtds_size)) { dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; uintptr_t base = (uintptr_t)dstate->dtds_base + @@ -785,6 +789,7 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval; + file_t *fp; /* * If we hold the privilege to read from kernel memory, then @@ -802,10 +807,104 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, /* * We're allowed to read from our own string table. */ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, + if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, mstate->dtms_difo->dtdo_strlen)) return (1); + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) + return (1); + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + return (1); + } + + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + return (1); + } + +#if defined(sun) + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + return (1); + } +#endif + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vode_t + * cannot themselves be stale (that is, it impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory). + */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) + return (1); + + if ((vp = fp->f_vnode) != NULL) { +#if defined(sun) + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) + return (1); + if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz, + vp->v_path, strlen(vp->v_path) + 1)) { + return (1); + } +#endif + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) + return (1); + +#if defined(sun) + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + strlen(op->vnop_name) + 1)) { + return (1); + } +#endif + } + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; return (0); @@ -865,6 +964,58 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, } /* + * Convert a string to a signed integer using safe loads. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. + */ +static int64_t +dtrace_strtoll(char *input, int base, size_t limit) +{ + uintptr_t pos = (uintptr_t)input; + int64_t val = 0; + int x; + boolean_t neg = B_FALSE; + char c, cc, ccc; + uintptr_t end = pos + limit; + + /* + * Consume any whitespace preceding digits. + */ + while ((c = dtrace_load8(pos)) == ' ' || c == '\t') + pos++; + + /* + * Handle an explicit sign if one is present. + */ + if (c == '-' || c == '+') { + if (c == '-') + neg = B_TRUE; + c = dtrace_load8(++pos); + } + + /* + * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it + * if present. + */ + if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' || + cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) { + pos += 2; + c = ccc; + } + + /* + * Read in contiguous digits until the first non-digit character. + */ + for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base; + c = dtrace_load8(++pos)) + val = val * base + x; + + return (neg ? -val : val); +} + +/* * Compare two strings using safe loads. */ static int @@ -1185,8 +1336,7 @@ dtrace_priv_proc_common_zone(dtrace_state_t *state) */ ASSERT(s_cr != NULL); - if ((cr = CRED()) != NULL && - s_cr->cr_zone == cr->cr_zone) + if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) return (1); return (0); @@ -1286,6 +1436,115 @@ dtrace_priv_kernel_destructive(dtrace_state_t *state) } /* + * Determine if the dte_cond of the specified ECB allows for processing of + * the current probe to continue. Note that this routine may allow continued + * processing, but with access(es) stripped from the mstate's dtms_access + * field. + */ +static int +dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, + dtrace_ecb_t *ecb) +{ + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + dtrace_pops_t *pops = &prov->dtpv_pops; + int mode = DTRACE_MODE_NOPRIV_DROP; + + ASSERT(ecb->dte_cond); + +#if defined(sun) + if (pops->dtps_mode != NULL) { + mode = pops->dtps_mode(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); + + ASSERT((mode & DTRACE_MODE_USER) || + (mode & DTRACE_MODE_KERNEL)); + ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || + (mode & DTRACE_MODE_NOPRIV_DROP)); + } + + /* + * If the dte_cond bits indicate that this consumer is only allowed to + * see user-mode firings of this probe, call the provider's dtps_mode() + * entry point to check that the probe was fired while in a user + * context. If that's not the case, use the policy specified by the + * provider to determine if we drop the probe or merely restrict + * operation. + */ + if (ecb->dte_cond & DTRACE_COND_USERMODE) { + ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); + + if (!(mode & DTRACE_MODE_USER)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + } + } +#endif + + /* + * This is more subtle than it looks. We have to be absolutely certain + * that CRED() isn't going to change out from under us so it's only + * legit to examine that structure if we're in constrained situations. + * Currently, the only times we'll this check is if a non-super-user + * has enabled the profile or syscall providers -- providers that + * allow visibility of all processes. For the profile case, the check + * above will ensure that we're examining a user context. + */ + if (ecb->dte_cond & DTRACE_COND_OWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + proc_t *proc; + + ASSERT(s_cr != NULL); + + if ((cr = CRED()) == NULL || + s_cr->cr_uid != cr->cr_uid || + s_cr->cr_uid != cr->cr_ruid || + s_cr->cr_uid != cr->cr_suid || + s_cr->cr_gid != cr->cr_gid || + s_cr->cr_gid != cr->cr_rgid || + s_cr->cr_gid != cr->cr_sgid || + (proc = ttoproc(curthread)) == NULL || + (proc->p_flag & SNOCD)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + +#if defined(sun) + mstate->dtms_access &= ~DTRACE_ACCESS_PROC; +#endif + } + } + +#if defined(sun) + /* + * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not + * in our zone, check to see if our mode policy is to restrict rather + * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC + * and DTRACE_ACCESS_ARGS + */ + if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + + ASSERT(s_cr != NULL); + + if ((cr = CRED()) == NULL || + s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= + ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS); + } + } +#endif + + return (1); +} + +/* * Note: not called from probe context. This function is called * asynchronously (and at a regular interval) from outside of probe context to * clean the dirty dynamic variable lists on all CPUs. Dynamic variable @@ -2903,7 +3162,7 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, #endif case DIF_VAR_CURTHREAD: - if (!dtrace_priv_kernel(state)) + if (!dtrace_priv_proc(state)) return (0); return ((uint64_t)(uintptr_t)curthread); @@ -3259,6 +3518,463 @@ dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v, } } + +typedef enum dtrace_json_state { + DTRACE_JSON_REST = 1, + DTRACE_JSON_OBJECT, + DTRACE_JSON_STRING, + DTRACE_JSON_STRING_ESCAPE, + DTRACE_JSON_STRING_ESCAPE_UNICODE, + DTRACE_JSON_COLON, + DTRACE_JSON_COMMA, + DTRACE_JSON_VALUE, + DTRACE_JSON_IDENTIFIER, + DTRACE_JSON_NUMBER, + DTRACE_JSON_NUMBER_FRAC, + DTRACE_JSON_NUMBER_EXP, + DTRACE_JSON_COLLECT_OBJECT +} dtrace_json_state_t; + +/* + * This function possesses just enough knowledge about JSON to extract a single + * value from a JSON string and store it in the scratch buffer. It is able + * to extract nested object values, and members of arrays by index. + * + * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to + * be looked up as we descend into the object tree. e.g. + * + * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL + * with nelems = 5. + * + * The run time of this function must be bounded above by strsize to limit the + * amount of work done in probe context. As such, it is implemented as a + * simple state machine, reading one character at a time using safe loads + * until we find the requested element, hit a parsing error or run off the + * end of the object or string. + * + * As there is no way for a subroutine to return an error without interrupting + * clause execution, we simply return NULL in the event of a missing key or any + * other error condition. Each NULL return in this function is commented with + * the error condition it represents -- parsing or otherwise. + * + * The set of states for the state machine closely matches the JSON + * specification (http://json.org/). Briefly: + * + * DTRACE_JSON_REST: + * Skip whitespace until we find either a top-level Object, moving + * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_OBJECT: + * Locate the next key String in an Object. Sets a flag to denote + * the next String as a key string and moves to DTRACE_JSON_STRING. + * + * DTRACE_JSON_COLON: + * Skip whitespace until we find the colon that separates key Strings + * from their values. Once found, move to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_VALUE: + * Detects the type of the next value (String, Number, Identifier, Object + * or Array) and routes to the states that process that type. Here we also + * deal with the element selector list if we are requested to traverse down + * into the object tree. + * + * DTRACE_JSON_COMMA: + * Skip whitespace until we find the comma that separates key-value pairs + * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays + * (similarly DTRACE_JSON_VALUE). All following literal value processing + * states return to this state at the end of their value, unless otherwise + * noted. + * + * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP: + * Processes a Number literal from the JSON, including any exponent + * component that may be present. Numbers are returned as strings, which + * may be passed to strtoll() if an integer is required. + * + * DTRACE_JSON_IDENTIFIER: + * Processes a "true", "false" or "null" literal in the JSON. + * + * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE, + * DTRACE_JSON_STRING_ESCAPE_UNICODE: + * Processes a String literal from the JSON, whether the String denotes + * a key, a value or part of a larger Object. Handles all escape sequences + * present in the specification, including four-digit unicode characters, + * but merely includes the escape sequence without converting it to the + * actual escaped character. If the String is flagged as a key, we + * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA. + * + * DTRACE_JSON_COLLECT_OBJECT: + * This state collects an entire Object (or Array), correctly handling + * embedded strings. If the full element selector list matches this nested + * object, we return the Object in full as a string. If not, we use this + * state to skip to the next value at this level and continue processing. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. + */ +static char * +dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems, + char *dest) +{ + dtrace_json_state_t state = DTRACE_JSON_REST; + int64_t array_elem = INT64_MIN; + int64_t array_pos = 0; + uint8_t escape_unicount = 0; + boolean_t string_is_key = B_FALSE; + boolean_t collect_object = B_FALSE; + boolean_t found_key = B_FALSE; + boolean_t in_array = B_FALSE; + uint32_t braces = 0, brackets = 0; + char *elem = elemlist; + char *dd = dest; + uintptr_t cur; + + for (cur = json; cur < json + size; cur++) { + char cc = dtrace_load8(cur); + if (cc == '\0') + return (NULL); + + switch (state) { + case DTRACE_JSON_REST: + if (isspace(cc)) + break; + + if (cc == '{') { + state = DTRACE_JSON_OBJECT; + break; + } + + if (cc == '[') { + in_array = B_TRUE; + array_pos = 0; + array_elem = dtrace_strtoll(elem, 10, size); + found_key = array_elem == 0 ? B_TRUE : B_FALSE; + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected to find a top-level object or array. + */ + return (NULL); + case DTRACE_JSON_OBJECT: + if (isspace(cc)) + break; + + if (cc == '"') { + state = DTRACE_JSON_STRING; + string_is_key = B_TRUE; + break; + } + + /* + * ERROR: either the object did not start with a key + * string, or we've run off the end of the object + * without finding the requested key. + */ + return (NULL); + case DTRACE_JSON_STRING: + if (cc == '\\') { + *dd++ = '\\'; + state = DTRACE_JSON_STRING_ESCAPE; + break; + } + + if (cc == '"') { + if (collect_object) { + /* + * We don't reset the dest here, as + * the string is part of a larger + * object being collected. + */ + *dd++ = cc; + collect_object = B_FALSE; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (string_is_key) { + if (dtrace_strncmp(dest, elem, + size) == 0) + found_key = B_TRUE; + } else if (found_key) { + if (nelems > 1) { + /* + * We expected an object, not + * this string. + */ + return (NULL); + } + return (dest); + } + state = string_is_key ? DTRACE_JSON_COLON : + DTRACE_JSON_COMMA; + string_is_key = B_FALSE; + break; + } + + *dd++ = cc; + break; + case DTRACE_JSON_STRING_ESCAPE: + *dd++ = cc; + if (cc == 'u') { + escape_unicount = 0; + state = DTRACE_JSON_STRING_ESCAPE_UNICODE; + } else { + state = DTRACE_JSON_STRING; + } + break; + case DTRACE_JSON_STRING_ESCAPE_UNICODE: + if (!isxdigit(cc)) { + /* + * ERROR: invalid unicode escape, expected + * four valid hexidecimal digits. + */ + return (NULL); + } + + *dd++ = cc; + if (++escape_unicount == 4) + state = DTRACE_JSON_STRING; + break; + case DTRACE_JSON_COLON: + if (isspace(cc)) + break; + + if (cc == ':') { + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected a colon. + */ + return (NULL); + case DTRACE_JSON_COMMA: + if (isspace(cc)) + break; + + if (cc == ',') { + if (in_array) { + state = DTRACE_JSON_VALUE; + if (++array_pos == array_elem) + found_key = B_TRUE; + } else { + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * ERROR: either we hit an unexpected character, or + * we reached the end of the object or array without + * finding the requested key. + */ + return (NULL); + case DTRACE_JSON_IDENTIFIER: + if (islower(cc)) { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + + if (dtrace_strncmp(dest, "true", 5) == 0 || + dtrace_strncmp(dest, "false", 6) == 0 || + dtrace_strncmp(dest, "null", 5) == 0) { + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, + * not this identifier. + */ + return (NULL); + } + return (dest); + } else { + cur--; + state = DTRACE_JSON_COMMA; + break; + } + } + + /* + * ERROR: we did not recognise the identifier as one + * of those in the JSON specification. + */ + return (NULL); + case DTRACE_JSON_NUMBER: + if (cc == '.') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_FRAC; + break; + } + + if (cc == 'x' || cc == 'X') { + /* + * ERROR: specification explicitly excludes + * hexidecimal or octal numbers. + */ + return (NULL); + } + + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_FRAC: + if (cc == 'e' || cc == 'E') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_EXP; + break; + } + + if (cc == '+' || cc == '-') { + /* + * ERROR: expect sign as part of exponent only. + */ + return (NULL); + } + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_EXP: + if (isdigit(cc) || cc == '+' || cc == '-') { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, not + * this number. + */ + return (NULL); + } + return (dest); + } + + cur--; + state = DTRACE_JSON_COMMA; + break; + case DTRACE_JSON_VALUE: + if (isspace(cc)) + break; + + if (cc == '{' || cc == '[') { + if (nelems > 1 && found_key) { + in_array = cc == '[' ? B_TRUE : B_FALSE; + /* + * If our element selector directs us + * to descend into this nested object, + * then move to the next selector + * element in the list and restart the + * state machine. + */ + while (*elem != '\0') + elem++; + elem++; /* skip the inter-element NUL */ + nelems--; + dd = dest; + if (in_array) { + state = DTRACE_JSON_VALUE; + array_pos = 0; + array_elem = dtrace_strtoll( + elem, 10, size); + found_key = array_elem == 0 ? + B_TRUE : B_FALSE; + } else { + found_key = B_FALSE; + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * Otherwise, we wish to either skip this + * nested object or return it in full. + */ + if (cc == '[') + brackets = 1; + else + braces = 1; + *dd++ = cc; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + + if (cc == '"') { + state = DTRACE_JSON_STRING; + break; + } + + if (islower(cc)) { + /* + * Here we deal with true, false and null. + */ + *dd++ = cc; + state = DTRACE_JSON_IDENTIFIER; + break; + } + + if (cc == '-' || isdigit(cc)) { + *dd++ = cc; + state = DTRACE_JSON_NUMBER; + break; + } + + /* + * ERROR: unexpected character at start of value. + */ + return (NULL); + case DTRACE_JSON_COLLECT_OBJECT: + if (cc == '\0') + /* + * ERROR: unexpected end of input. + */ + return (NULL); + + *dd++ = cc; + if (cc == '"') { + collect_object = B_TRUE; + state = DTRACE_JSON_STRING; + break; + } + + if (cc == ']') { + if (brackets-- == 0) { + /* + * ERROR: unbalanced brackets. + */ + return (NULL); + } + } else if (cc == '}') { + if (braces-- == 0) { + /* + * ERROR: unbalanced braces. + */ + return (NULL); + } + } else if (cc == '{') { + braces++; + } else if (cc == '[') { + brackets++; + } + + if (brackets == 0 && braces == 0) { + if (found_key) { + *dd = '\0'; + return (dest); + } + dd = dest; /* reset string buffer */ + state = DTRACE_JSON_COMMA; + } + break; + } + } + return (NULL); +} + /* * Emulate the execution of DTrace ID subroutines invoked by the call opcode. * Notice that we don't bother validating the proper number of arguments or @@ -4055,6 +4771,65 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } + case DIF_SUBR_JSON: { + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t json = tupregs[0].dttk_value; + size_t jsonlen = dtrace_strlen((char *)json, size); + uintptr_t elem = tupregs[1].dttk_value; + size_t elemlen = dtrace_strlen((char *)elem, size); + + char *dest = (char *)mstate->dtms_scratch_ptr; + char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1; + char *ee = elemlist; + int nelems = 1; + uintptr_t cur; + + if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) || + !dtrace_canload(elem, elemlen + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * Read the element selector and split it up into a packed list + * of strings. + */ + for (cur = elem; cur < elem + elemlen; cur++) { + char cc = dtrace_load8(cur); + + if (cur == elem && cc == '[') { + /* + * If the first element selector key is + * actually an array index then ignore the + * bracket. + */ + continue; + } + + if (cc == ']') + continue; + + if (cc == '.' || cc == '[') { + nelems++; + cc = '\0'; + } + + *ee++ = cc; + } + *ee++ = '\0'; + + if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist, + nelems, dest)) != 0) + mstate->dtms_scratch_ptr += jsonlen + 1; + break; + } + case DIF_SUBR_TOUPPER: case DIF_SUBR_TOLOWER: { uintptr_t s = tupregs[0].dttk_value; @@ -4364,6 +5139,28 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } + case DIF_SUBR_STRTOLL: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + if (!dtrace_strcanload(s, size, mstate, vstate)) { + regs[rd] = INT64_MIN; + break; + } + + regs[rd] = dtrace_strtoll((char *)s, base, size); + break; + } + case DIF_SUBR_LLTOSTR: { int64_t i = (int64_t)tupregs[0].dttk_value; uint64_t val, digit; @@ -4573,11 +5370,32 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, break; } + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + struct filedesc *fdp; + file_t *fp; + + if (!dtrace_priv_proc(state)) { + regs[rd] = 0; + break; + } + fdp = curproc->p_fd; + FILEDESC_SLOCK(fdp); + fp = fget_locked(fdp, fd); + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + FILEDESC_SUNLOCK(fdp); + break; + } + case DIF_SUBR_CLEANPATH: { char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; int i = 0, j = 0; +#if defined(sun) + zone_t *z; +#endif if (!dtrace_strcanload(src, size, mstate, vstate)) { regs[rd] = 0; @@ -4676,6 +5494,25 @@ next: } while (c != '\0'); dest[j] = '\0'; + +#if defined(sun) + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. + */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } +#endif + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; @@ -5110,102 +5947,95 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: - if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 8, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); break; case DIF_OP_ULDSB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int8_t) dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDSH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int16_t) dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDSW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int32_t) dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDX: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword64((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_RET: rval = regs[rd]; @@ -5887,7 +6717,7 @@ dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val) if (dtrace_destructive_disallow) return; - flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags; + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; now = dtrace_gethrtime(); @@ -6035,6 +6865,63 @@ out: mstate->dtms_scratch_ptr = old; } +static void +dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size, + size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind) +{ + volatile uint16_t *flags; + uint64_t val = *valp; + size_t valoffs = *valoffsp; + + flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags; + ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF); + + /* + * If this is a string, we're going to only load until we find the zero + * byte -- after which we'll store zero bytes. + */ + if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) { + char c = '\0' + 1; + size_t s; + + for (s = 0; s < size; s++) { + if (c != '\0' && dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (c != '\0' && dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, valoffs++, c); + + if (c == '\0' && intuple) + break; + } + } else { + uint8_t c; + while (valoffs < end) { + if (dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, + valoffs++, c); + } + } + + *valp = val; + *valoffsp = valoffs; +} + /* * If you're looking for the epicenter of DTrace, you just found it. This * is the function called by the provider to fire a probe -- from which all @@ -6137,6 +7024,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, uint64_t val = 0; mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; + mstate.dtms_getf = NULL; + *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { @@ -6670,7 +7559,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, ASSERT(0); } - if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) { + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF || + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) { uintptr_t end = valoffs + size; if (tracememsize != 0 && @@ -6679,40 +7569,15 @@ dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, tracememsize = 0; } - if (!dtrace_vcanload((void *)(uintptr_t)val, + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF && + !dtrace_vcanload((void *)(uintptr_t)val, &dp->dtdo_rtype, &mstate, vstate)) continue; - /* - * If this is a string, we're going to only - * load until we find the zero byte -- after - * which we'll store zero bytes. - */ - if (dp->dtdo_rtype.dtdt_kind == - DIF_TYPE_STRING) { - char c = '\0' + 1; - int intuple = act->dta_intuple; - size_t s; - - for (s = 0; s < size; s++) { - if (c != '\0') - c = dtrace_load8(val++); - - DTRACE_STORE(uint8_t, tomax, - valoffs++, c); - - if (c == '\0' && intuple) - break; - } - - continue; - } - - while (valoffs < end) { - DTRACE_STORE(uint8_t, tomax, valoffs++, - dtrace_load8(val++)); - } - + dtrace_store_by_ref(dp, tomax, size, &valoffs, + &val, end, act->dta_intuple, + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ? + DIF_TF_BYREF: DIF_TF_BYUREF); continue; } @@ -8810,6 +9675,20 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, subr == DIF_SUBR_COPYOUTSTR) { dp->dtdo_destructive = 1; } + + if (subr == DIF_SUBR_GETF) { + /* + * If we have a getf() we need to record that + * in our state. Note that our state can be + * NULL if this is a helper -- but in that + * case, the call to getf() is itself illegal, + * and will be caught (slightly later) when + * the helper is validated. + */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + break; case DIF_OP_PUSHTR: if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) @@ -8839,7 +9718,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, "expected 'ret' as last DIF instruction\n"); } - if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) { + if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) { /* * If we're not returning by reference, the size must be either * 0 or the size of one of the base types. @@ -9089,7 +9968,9 @@ dtrace_difo_validate_helper(dtrace_difo_t *dp) subr == DIF_SUBR_INET_NTOA || subr == DIF_SUBR_INET_NTOA6 || subr == DIF_SUBR_INET_NTOP || + subr == DIF_SUBR_JSON || subr == DIF_SUBR_LLTOSTR || + subr == DIF_SUBR_STRTOLL || subr == DIF_SUBR_RINDEX || subr == DIF_SUBR_STRCHR || subr == DIF_SUBR_STRJOIN || @@ -13734,6 +14615,24 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_WARMUP; +#if defined(sun) + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } +#endif + /* * Now it's time to actually fire the BEGIN probe. We need to disable * interrupts here both to record the CPU on which we fired the BEGIN @@ -13850,6 +14749,26 @@ dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu) state->dts_activity = DTRACE_ACTIVITY_STOPPED; dtrace_sync(); +#if defined(sun) + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. + */ + ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0); + ASSERT(dtrace_closef == dtrace_getf_barrier); + ASSERT(dtrace_getf > 0); + + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--; + + if (--dtrace_getf == 0) + dtrace_closef = NULL; + } +#endif + return (0); } @@ -15519,6 +16438,25 @@ dtrace_toxrange_add(uintptr_t base, uintptr_t limit) dtrace_toxranges++; } +static void +dtrace_getf_barrier() +{ +#if defined(sun) + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. + */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); +#endif +} + /* * DTrace Driver Cookbook Functions */ @@ -16782,6 +17720,9 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) dtrace_modload = NULL; dtrace_modunload = NULL; + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); + mutex_exit(&cpu_lock); if (dtrace_helptrace_enabled) { |