5 files changed, 153 insertions, 48 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650..570eeca 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+	       cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b..daea765 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f..5650f53 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
@@ -3513,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -6584,15 +6621,6 @@ got_name:
 }
 
 /*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
-/*
  * Check whether inode and address range match filter criteria.
  */
 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6653,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7805,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
@@ -7936,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 					goto fail;
 			}
 
-			if (token == IF_SRC_FILE) {
-				filename = match_strdup(&args[2]);
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				int fpos = filter->range ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
 				if (!filename) {
 					ret = -ENOMEM;
 					goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525a..8c50276 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c..b022284 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit  = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;