summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlya Dryomov <idryomov@gmail.com>2016-04-28 16:07:22 +0200
committerIlya Dryomov <idryomov@gmail.com>2016-05-26 00:36:25 +0200
commit6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 (patch)
treecda9593b00d971b10ebeb9279ad1893978236df8
parentd9591f5e28686277d9312d3c7422faf1368b305e (diff)
downloadop-kernel-dev-6f3bfd45cd233eea0b07e3cabc0386b5de9321d2.zip
op-kernel-dev-6f3bfd45cd233eea0b07e3cabc0386b5de9321d2.tar.gz
libceph: ceph_osds, ceph_pg_to_up_acting_osds()
Knowing just the acting set isn't enough; we need to be able to record the up set as well to detect interval changes. This means returning (up[], up_len, up_primary, acting[], acting_len, acting_primary) and passing it around. Introduce and switch to ceph_osds to help with that. Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return both up and acting sets from it. Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r--include/linux/ceph/osdmap.h21
-rw-r--r--net/ceph/osd_client.c36
-rw-r--r--net/ceph/osdmap.c304
3 files changed, 215 insertions, 146 deletions
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index b70440c..942189d 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -208,6 +208,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+struct ceph_osds {
+ int osds[CEPH_PG_MAX_SIZE];
+ int size;
+ int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+ set->size = 0;
+ set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
/* calculate mapping of a file extent to an object */
extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len,
@@ -218,9 +232,10 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid,
- int *osds, int *primary);
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting);
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
struct ceph_pg pgid);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cb9f195..0ff400a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req, int force_resend)
{
struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int num, o;
+ struct ceph_osds up, acting;
int err;
bool was_paused;
@@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc,
}
req->r_pgid = pgid;
- num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
- if (num < 0)
- num = 0;
+ ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
was_paused = req->r_paused;
req->r_paused = __req_should_be_paused(osdc, req);
@@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc,
force_resend = 1;
if ((!force_resend &&
- req->r_osd && req->r_osd->o_osd == o &&
+ req->r_osd && req->r_osd->o_osd == acting.primary &&
req->r_sent >= req->r_osd->o_incarnation &&
- req->r_num_pg_osds == num &&
- memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1) ||
+ req->r_num_pg_osds == acting.size &&
+ memcmp(req->r_pg_osds, acting.osds,
+ acting.size * sizeof(acting.osds[0])) == 0) ||
+ (req->r_osd == NULL && acting.primary == -1) ||
req->r_paused)
return 0; /* no change */
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
- req->r_tid, pgid.pool, pgid.seed, o,
+ req->r_tid, pgid.pool, pgid.seed, acting.primary,
req->r_osd ? req->r_osd->o_osd : -1);
/* record full pg acting set */
- memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
- req->r_num_pg_osds = num;
+ memcpy(req->r_pg_osds, acting.osds,
+ acting.size * sizeof(acting.osds[0]));
+ req->r_num_pg_osds = acting.size;
if (req->r_osd) {
__cancel_request(req);
@@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc,
req->r_osd = NULL;
}
- req->r_osd = lookup_osd(&osdc->osds, o);
- if (!req->r_osd && o >= 0) {
+ req->r_osd = lookup_osd(&osdc->osds, acting.primary);
+ if (!req->r_osd && acting.primary >= 0) {
err = -ENOMEM;
- req->r_osd = create_osd(osdc, o);
+ req->r_osd = create_osd(osdc, acting.primary);
if (!req->r_osd) {
list_move(&req->r_req_lru_item, &osdc->req_notarget);
goto out;
}
- dout("map_request osd %p is osd%d\n", req->r_osd, o);
+ dout("map_request osd %p is osd%d\n", req->r_osd,
+ acting.primary);
insert_osd(&osdc->osds, req->r_osd);
ceph_con_open(&req->r_osd->o_con,
- CEPH_ENTITY_TYPE_OSD, o,
- &osdc->osdmap->osd_addr[o]);
+ CEPH_ENTITY_TYPE_OSD, acting.primary,
+ &osdc->osdmap->osd_addr[acting.primary]);
}
__enqueue_request(req);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 6267839..f5fc8fc 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid)
}
EXPORT_SYMBOL(ceph_oid_destroy);
+static bool osds_valid(const struct ceph_osds *set)
+{
+ /* non-empty set */
+ if (set->size > 0 && set->primary >= 0)
+ return true;
+
+ /* empty can_shift_osds set */
+ if (!set->size && set->primary == -1)
+ return true;
+
+ /* empty !can_shift_osds set - all NONE */
+ if (set->size > 0 && set->primary == -1) {
+ int i;
+
+ for (i = 0; i < set->size; i++) {
+ if (set->osds[i] != CRUSH_ITEM_NONE)
+ break;
+ }
+ if (i == set->size)
+ return true;
+ }
+
+ return false;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
+{
+ memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
+ dest->size = src->size;
+ dest->primary = src->primary;
+}
+
/*
* calculate file layout from given offset, length.
* fill in correct oid, logical length, and object extent
@@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
+/*
+ * Map a raw PG (full precision ps) into an actual PG.
+ */
+static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_pg *pgid)
+{
+ pgid->pool = raw_pgid->pool;
+ pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
+ pi->pg_num_mask);
+}
+
+/*
+ * Map a raw PG (full precision ps) into a placement ps (placement
+ * seed). Include pool id in that value so that different pools don't
+ * use the same seeds.
+ */
+static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid)
+{
+ if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
+ /* hash pool id and seed so that pool PGs do not overlap */
+ return crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(raw_pgid->seed,
+ pi->pgp_num,
+ pi->pgp_num_mask),
+ raw_pgid->pool);
+ } else {
+ /*
+ * legacy behavior: add ps and pool together. this is
+ * not a great approach because the PGs from each pool
+ * will overlap on top of each other: 0.5 == 1.4 ==
+ * 2.3 == ...
+ */
+ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
+ pi->pgp_num_mask) +
+ (unsigned)raw_pgid->pool;
+ }
+}
+
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
const __u32 *weight, int weight_max)
@@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
}
/*
- * Calculate raw (crush) set for given pgid.
+ * Calculate raw set (CRUSH output) for given PG. The result may
+ * contain nonexistent OSDs. ->primary is undefined for a raw set.
*
- * Return raw set length, or error.
+ * Placement seed (CRUSH input) is returned through @ppps.
*/
-static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- struct ceph_pg pgid, u32 pps, int *osds)
+static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *raw,
+ u32 *ppps)
{
+ u32 pps = raw_pg_to_pps(pi, raw_pgid);
int ruleno;
int len;
- /* crush */
- ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
- pool->type, pool->size);
+ ceph_osds_init(raw);
+ if (ppps)
+ *ppps = pps;
+
+ ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
+ pi->size);
if (ruleno < 0) {
pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
- pgid.pool, pool->crush_ruleset, pool->type,
- pool->size);
- return -ENOENT;
+ pi->id, pi->crush_ruleset, pi->type, pi->size);
+ return;
}
- len = do_crush(osdmap, ruleno, pps, osds,
- min_t(int, pool->size, CEPH_PG_MAX_SIZE),
+ len = do_crush(osdmap, ruleno, pps, raw->osds,
+ min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
osdmap->osd_weight, osdmap->max_osd);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
- len, ruleno, pgid.pool, pool->crush_ruleset,
- pool->type, pool->size);
- return len;
+ len, ruleno, pi->id, pi->crush_ruleset, pi->type,
+ pi->size);
+ return;
}
- return len;
+ raw->size = len;
}
/*
- * Given raw set, calculate up set and up primary.
+ * Given raw set, calculate up set and up primary. By definition of an
+ * up set, the result won't contain nonexistent or down OSDs.
*
- * Return up set length. *primary is set to up primary osd id, or -1
- * if up set is empty.
+ * This is done in-place - on return @set is the up set. If it's
+ * empty, ->primary will remain undefined.
*/
-static int raw_to_up_osds(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void raw_to_up_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
{
- int up_primary = -1;
int i;
- if (ceph_can_shift_osds(pool)) {
+ /* ->primary is undefined for a raw set */
+ BUG_ON(set->primary != -1);
+
+ if (ceph_can_shift_osds(pi)) {
int removed = 0;
- for (i = 0; i < len; i++) {
- if (ceph_osd_is_down(osdmap, osds[i])) {
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (ceph_osd_is_down(osdmap, set->osds[i])) {
removed++;
continue;
}
if (removed)
- osds[i - removed] = osds[i];
+ set->osds[i - removed] = set->osds[i];
}
-
- len -= removed;
- if (len > 0)
- up_primary = osds[0];
+ set->size -= removed;
+ if (set->size > 0)
+ set->primary = set->osds[0];
} else {
- for (i = len - 1; i >= 0; i--) {
- if (ceph_osd_is_down(osdmap, osds[i]))
- osds[i] = CRUSH_ITEM_NONE;
+ /* set down/dne devices to NONE */
+ for (i = set->size - 1; i >= 0; i--) {
+ if (ceph_osd_is_down(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
else
- up_primary = osds[i];
+ set->primary = set->osds[i];
}
}
-
- *primary = up_primary;
- return len;
}
-static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
- struct ceph_pg_pool_info *pool,
- int *osds, int len, int *primary)
+static void apply_primary_affinity(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ u32 pps,
+ struct ceph_osds *up)
{
int i;
int pos = -1;
@@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (!osdmap->osd_primary_affinity)
return;
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
@@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
break;
}
}
- if (i == len)
+ if (i == up->size)
return;
/*
@@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
- for (i = 0; i < len; i++) {
- int osd = osds[i];
+ for (i = 0; i < up->size; i++) {
+ int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
@@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
if (pos < 0)
return;
- *primary = osds[pos];
+ up->primary = up->osds[pos];
- if (ceph_can_shift_osds(pool) && pos > 0) {
+ if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
- osds[i] = osds[i - 1];
- osds[0] = *primary;
+ up->osds[i] = up->osds[i - 1];
+ up->osds[0] = up->primary;
}
}
/*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
*
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
*/
-static int apply_temps(struct ceph_osdmap *osdmap,
- struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
- int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *temp)
{
+ struct ceph_pg pgid;
struct ceph_pg_mapping *pg;
- int temp_len;
- int temp_primary;
int i;
- /* raw_pg -> pg */
- pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
- pool->pg_num_mask);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+ ceph_osds_init(temp);
/* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
- temp_len = 0;
- temp_primary = -1;
-
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
- if (ceph_can_shift_osds(pool))
+ if (ceph_can_shift_osds(pi))
continue;
- else
- osds[temp_len++] = CRUSH_ITEM_NONE;
+
+ temp->osds[temp->size++] = CRUSH_ITEM_NONE;
} else {
- osds[temp_len++] = pg->pg_temp.osds[i];
+ temp->osds[temp->size++] = pg->pg_temp.osds[i];
}
}
/* apply pg_temp's primary */
- for (i = 0; i < temp_len; i++) {
- if (osds[i] != CRUSH_ITEM_NONE) {
- temp_primary = osds[i];
+ for (i = 0; i < temp->size; i++) {
+ if (temp->osds[i] != CRUSH_ITEM_NONE) {
+ temp->primary = temp->osds[i];
break;
}
}
- } else {
- temp_len = len;
- temp_primary = *primary;
}
/* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
- temp_primary = pg->primary_temp.osd;
-
- *primary = temp_primary;
- return temp_len;
+ temp->primary = pg->primary_temp.osd;
}
/*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
*
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
*/
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting)
{
- struct ceph_pg_pool_info *pool;
+ struct ceph_pg_pool_info *pi;
u32 pps;
- int len;
- pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
- if (!pool) {
- *primary = -1;
- return -ENOENT;
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi) {
+ ceph_osds_init(up);
+ ceph_osds_init(acting);
+ goto out;
}
- if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
- /* hash pool id and seed so that pool PGs do not overlap */
- pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
- ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask),
- pgid.pool);
- } else {
- /*
- * legacy behavior: add ps and pool together. this is
- * not a great approach because the PGs from each pool
- * will overlap on top of each other: 0.5 == 1.4 ==
- * 2.3 == ...
- */
- pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
- pool->pgp_num_mask) +
- (unsigned)pgid.pool;
- }
-
- len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
- if (len < 0) {
- *primary = -1;
- return len;
+ pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ raw_to_up_osds(osdmap, pi, up);
+ apply_primary_affinity(osdmap, pi, pps, up);
+ get_temp_osds(osdmap, pi, raw_pgid, acting);
+ if (!acting->size) {
+ memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+ acting->size = up->size;
+ if (acting->primary == -1)
+ acting->primary = up->primary;
}
-
- len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
- apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
- len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
- return len;
+out:
+ WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
/*
@@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
*/
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
- int osds[CEPH_PG_MAX_SIZE];
- int primary;
-
- ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+ struct ceph_osds up, acting;
- return primary;
+ ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting);
+ return acting.primary;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);
OpenPOWER on IntegriCloud