diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1a47b5c25b5f..e159f1415110 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -78,6 +78,7 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; + struct rb_node *n; if (client->osdc.osdmap == NULL) return 0; @@ -87,11 +88,11 @@ static int osdmap_show(struct seq_file *s, void *p) " NEARFULL" : "", (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); - for (i = 0; i < client->osdc.osdmap->num_pools; i++) { + for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pool = - &client->osdc.osdmap->pg_pool[i]; + rb_entry(n, struct ceph_pg_pool_info, node); seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - i, pool->v.pg_num, pool->pg_num_mask, + pool->id, pool->v.pg_num, pool->pg_num_mask, pool->v.lpg_num, pool->lpg_num_mask); } for (i = 0; i < client->osdc.osdmap->max_osd; i++) { diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 443fdcdb19c4..34b5696c84fd 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) rb_erase(&pg->node, &map->pg_temp); kfree(pg); } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } kfree(map->osd_state); kfree(map->osd_weight); - kfree(map->pg_pool); kfree(map->osd_addr); kfree(map); } @@ -432,6 +438,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, return NULL; } +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + /* * decode a full map. */ @@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) u8 ev; int err = -EINVAL; void *start = *p; + struct ceph_pg_pool_info *pi; dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - map->num_pools = ceph_decode_32(p); - map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool), - GFP_NOFS); - if (!map->pg_pool) { - err = -ENOMEM; - goto bad; - } ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad); - i = ceph_decode_32(p); - if (i >= map->num_pools) + ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) goto bad; + pi->id = ceph_decode_32(p); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[i].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[i]); - p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64); - p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals) + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + __insert_pg_pool(&map->pg_pools, pi); + calc_pg_masks(pi); + p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); + p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } + ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->flags, bad); @@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, u32 epoch = 0; struct ceph_timespec modified; u32 len, pool; - __s32 new_flags, max; + __s32 new_pool_max, new_flags, max; void *start = *p; int err = -EINVAL; u16 version; @@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_32(p); new_flags = ceph_decode_32(p); /* full map? */ @@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, /* new flags? */ if (new_flags >= 0) map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; ceph_decode_need(p, end, 5*sizeof(u32), bad); @@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_32_safe(p, end, len, bad); while (len--) { __u8 ev; + struct ceph_pg_pool_info *pi; ceph_decode_32_safe(p, end, pool, bad); - if (pool >= map->num_pools) { - void *pg_pool = kcalloc(pool + 1, - sizeof(*map->pg_pool), - GFP_NOFS); - if (!pg_pool) { - err = -ENOMEM; - goto bad; - } - memcpy(pg_pool, map->pg_pool, - map->num_pools * sizeof(*map->pg_pool)); - kfree(map->pg_pool); - map->pg_pool = pg_pool; - map->num_pools = pool+1; - } - ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad); + ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); ev = ceph_decode_8(p); /* encoding version */ if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); goto bad; } - ceph_decode_copy(p, &map->pg_pool[pool].v, - sizeof(map->pg_pool->v)); - calc_pg_masks(&map->pg_pool[pool]); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!pi) { + pi = kmalloc(sizeof(*pi), GFP_NOFS); + if (!pi) { + err = -ENOMEM; + goto bad; + } + pi->id = pool; + __insert_pg_pool(&map->pg_pools, pi); + } + ceph_decode_copy(p, &pi->v, sizeof(pi->v)); + calc_pg_masks(pi); } - /* old_pool (ignore) */ + /* old_pool */ ceph_decode_32_safe(p, end, len, bad); - *p += len * sizeof(u32); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_32_safe(p, end, pool, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + rb_erase(&pi->node, &map->pg_pools); + kfree(pi); + } + } /* new_up */ err = -EINVAL; @@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, unsigned ps; BUG_ON(!osdmap); - if (poolid >= osdmap->num_pools) - return -EIO; - pool = &osdmap->pg_pool[poolid]; + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) + return -EIO; ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); if (preferred >= 0) { ps += preferred; @@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, preferred >= osdmap->crush->max_devices) preferred = -1; - if (poolid >= osdmap->num_pools) + pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); + if (!pool) return NULL; - pool = &osdmap->pg_pool[poolid]; ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, pool->v.type, pool->v.size); if (ruleno < 0) { diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index c4af8418aa00..1fb55afb2642 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h @@ -19,6 +19,8 @@ * the change between two successive epochs, or as a fully encoded map. */ struct ceph_pg_pool_info { + struct rb_node node; + int id; struct ceph_pg_pool v; int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; }; @@ -44,9 +46,8 @@ struct ceph_osdmap { struct ceph_entity_addr *osd_addr; struct rb_root pg_temp; - - u32 num_pools; - struct ceph_pg_pool_info *pg_pool; + struct rb_root pg_pools; + u32 pool_max; /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 1f4c78640541..26ac8b89a676 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -11,8 +11,8 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 3 -#define CEPH_OSDMAP_VERSION 3 +#define CEPH_OSDMAP_INC_VERSION 4 +#define CEPH_OSDMAP_VERSION 4 /* * fs id