summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/blktrace.c83
-rw-r--r--kernel/trace/bpf_trace.c166
2 files changed, 170 insertions, 79 deletions
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9aef865..fb345cd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -127,12 +127,13 @@ static void trace_note_tsk(struct task_struct *tsk)
static void trace_note_time(struct blk_trace *bt)
{
- struct timespec now;
+ struct timespec64 now;
unsigned long flags;
u32 words[2];
- getnstimeofday(&now);
- words[0] = now.tv_sec;
+ /* need to check user space to see if this breaks in y2038 or y2106 */
+ ktime_get_real_ts64(&now);
+ words[0] = (u32)now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
@@ -189,6 +190,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
#define BLK_TC_RAHEAD BLK_TC_AHEAD
+#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
@@ -199,7 +201,8 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- int rw, u32 what, int error, int pdu_len, void *pdu_data)
+ int op, int op_flags, u32 what, int error, int pdu_len,
+ void *pdu_data)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
@@ -214,13 +217,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
- what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, SYNC);
- what |= MASK_TC_BIT(rw, RAHEAD);
- what |= MASK_TC_BIT(rw, META);
- what |= MASK_TC_BIT(rw, DISCARD);
- what |= MASK_TC_BIT(rw, FLUSH);
- what |= MASK_TC_BIT(rw, FUA);
+ what |= ddir_act[op_is_write(op) ? WRITE : READ];
+ what |= MASK_TC_BIT(op_flags, SYNC);
+ what |= MASK_TC_BIT(op_flags, RAHEAD);
+ what |= MASK_TC_BIT(op_flags, META);
+ what |= MASK_TC_BIT(op_flags, PREFLUSH);
+ what |= MASK_TC_BIT(op_flags, FUA);
+ if (op == REQ_OP_DISCARD)
+ what |= BLK_TC_ACT(BLK_TC_DISCARD);
+ if (op == REQ_OP_FLUSH)
+ what |= BLK_TC_ACT(BLK_TC_FLUSH);
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
@@ -708,11 +714,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
+ __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
+ __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -770,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, what, error, 0, NULL);
+ bio_op(bio), bio->bi_rw, what, error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -818,7 +824,8 @@ static void blk_add_trace_getrq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
+ NULL);
}
}
@@ -833,7 +840,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+ __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL);
}
}
@@ -843,7 +850,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
struct blk_trace *bt = q->blk_trace;
if (bt)
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
@@ -860,7 +867,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
else
what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -874,8 +881,9 @@ static void blk_add_trace_split(void *ignore,
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- bio->bi_error, sizeof(rpdu), &rpdu);
+ bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw,
+ BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+ &rpdu);
}
}
@@ -907,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
}
@@ -940,7 +948,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r);
}
@@ -965,10 +973,10 @@ void blk_add_driver_data(struct request_queue *q,
return;
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1769,21 +1777,34 @@ void blk_dump_cmd(char *buf, struct request *rq)
}
}
-void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes)
{
int i = 0;
- if (rw & REQ_FLUSH)
+ if (rw & REQ_PREFLUSH)
rwbs[i++] = 'F';
- if (rw & WRITE)
+ switch (op) {
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
- else if (rw & REQ_DISCARD)
+ break;
+ case REQ_OP_DISCARD:
+ rwbs[i++] = 'D';
+ break;
+ case REQ_OP_SECURE_ERASE:
rwbs[i++] = 'D';
- else if (bytes)
+ rwbs[i++] = 'E';
+ break;
+ case REQ_OP_FLUSH:
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_READ:
rwbs[i++] = 'R';
- else
+ break;
+ default:
rwbs[i++] = 'N';
+ }
if (rw & REQ_FUA)
rwbs[i++] = 'F';
@@ -1793,8 +1814,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i++] = 'S';
if (rw & REQ_META)
rwbs[i++] = 'M';
- if (rw & REQ_SECURE)
- rwbs[i++] = 'E';
rwbs[i] = '\0';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 720b7bb..b20438f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type = ARG_ANYTHING,
};
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ void *unsafe_ptr = (void *) (long) r1;
+ void *src = (void *) (long) r2;
+ int size = (int) r3;
+
+ /*
+ * Ensure we're in user context which is safe for the helper to
+ * run. This helper has no business in a kthread.
+ *
+ * access_ok() should prevent writing to non-user memory, but in
+ * some situations (nommu, temporary switch, etc) access_ok() does
+ * not provide enough validation, hence the check on KERNEL_DS.
+ */
+
+ if (unlikely(in_interrupt() ||
+ current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+ return -EPERM;
+ if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+ return -EPERM;
+
+ return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+ .func = bpf_probe_write_user,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+ pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+ current->comm, task_pid_nr(current));
+
+ return &bpf_probe_write_user_proto;
+}
+
/*
* limited trace_printk()
* only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -188,25 +231,33 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
-static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
+ u64 index = flags & BPF_F_INDEX_MASK;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
+ event = ee->event;
+ if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+ event->attr.type != PERF_TYPE_RAW))
+ return -EINVAL;
/* make sure event is local and doesn't have pmu::count */
- if (event->oncpu != smp_processor_id() ||
- event->pmu->count)
+ if (unlikely(event->oncpu != cpu || event->pmu->count))
return -EINVAL;
/*
@@ -225,47 +276,58 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+static __always_inline u64
+__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+ u64 flags, struct perf_raw_record *raw)
{
- struct pt_regs *regs = (struct pt_regs *) (long) r1;
- struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
+ struct bpf_event_entry *ee;
struct perf_event *event;
- struct file *file;
- struct perf_raw_record raw = {
- .size = size,
- .data = data,
- };
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
- index = raw_smp_processor_id();
+ index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- file = READ_ONCE(array->ptrs[index]);
- if (unlikely(!file))
+ ee = READ_ONCE(array->ptrs[index]);
+ if (!ee)
return -ENOENT;
- event = file->private_data;
-
+ event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
- if (unlikely(event->oncpu != smp_processor_id()))
+ if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = &raw;
+ sample_data.raw = raw;
perf_event_output(event, &sample_data, regs);
return 0;
}
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *)(long) r1;
+ struct bpf_map *map = (struct bpf_map *)(long) r2;
+ void *data = (void *)(long) r4;
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = size,
+ .data = data,
+ },
+ };
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+
+ return __bpf_perf_event_output(regs, map, flags, &raw);
+}
+
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
@@ -279,31 +341,41 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
-static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+ struct perf_raw_frag frag = {
+ .copy = ctx_copy,
+ .size = ctx_size,
+ .data = ctx,
+ };
+ struct perf_raw_record raw = {
+ .frag = {
+ {
+ .next = ctx_size ? &frag : NULL,
+ },
+ .size = meta_size,
+ .data = meta,
+ },
+ };
perf_fetch_caller_regs(regs);
- return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+ return __bpf_perf_event_output(regs, map, flags, &raw);
}
-static const struct bpf_func_proto bpf_event_output_proto = {
- .func = bpf_event_output,
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return (long) current;
+}
+
+static const struct bpf_func_proto bpf_get_current_task_proto = {
+ .func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
};
-const struct bpf_func_proto *bpf_get_event_output_proto(void)
-{
- return &bpf_event_output_proto;
-}
-
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -321,6 +393,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_task:
+ return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
@@ -331,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_probe_write_user:
+ return bpf_get_probe_write_proto();
default:
return NULL;
}
@@ -349,20 +425,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
-static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
- /* check bounds */
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
-
- /* only read is allowed */
if (type != BPF_READ)
return false;
-
- /* disallow misaligned access */
if (off % size != 0)
return false;
-
return true;
}
@@ -427,7 +498,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
}
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+ enum bpf_reg_type *reg_type)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
OpenPOWER on IntegriCloud