From: Sage Weil Date: Wed, 29 Apr 2009 18:13:01 +0000 (-0700) Subject: osd: push pg_num etc into pg_pool X-Git-Tag: v0.7.3~17 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0d33f93f2c39aee535c576190793af7f27073447;p=ceph.git osd: push pg_num etc into pg_pool --- diff --git a/src/TODO b/src/TODO index 5a7c6a8f074e..358afc76267a 100644 --- a/src/TODO +++ b/src/TODO @@ -33,10 +33,10 @@ v0.8 /- async xattr /- dcache readdir /- confutils memory leaks +/- osd re-up attempt when marked down v0.9 - make mds exhert memory pressure on client caps, leases -- osd re-up attempt when marked down - optionally separate osd interfaces (ips) for clients and osds (replication, peering, etc.) later @@ -70,6 +70,8 @@ repair kernel client +- fix symbol names for osdmap.h (possibly others?) +- osd client needs to recalculate layout if osdmap changes (pg_num etc may change) - fix up mds selection, and ESTALE handling - make cap import/export efficient - simplify mds auth tracking? diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index b82aa6b91b39..e79f583611fb 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -43,7 +43,7 @@ #define CEPH_OSD_PROTOCOL 5 /* cluster internal */ #define CEPH_MDS_PROTOCOL 9 /* cluster internal */ #define CEPH_MON_PROTOCOL 4 /* cluster internal */ -#define CEPH_OSDC_PROTOCOL 6 /* public/client */ +#define CEPH_OSDC_PROTOCOL 7 /* public/client */ #define CEPH_MDSC_PROTOCOL 18 /* public/client */ #define CEPH_MONC_PROTOCOL 11 /* public/client */ @@ -306,10 +306,30 @@ union ceph_pg { #define ceph_pg_is_rep(pg) ((pg).pg.type == CEPH_PG_TYPE_REP) #define ceph_pg_is_raid4(pg) ((pg).pg.type == CEPH_PG_TYPE_RAID4) +/* + * pg_pool is a set of pgs storing a pool of objects + * + * pg_num -- base number of pseudorandomly placed pgs + * + * pgp_num -- effective number when calculating pg placement. this + * is used for pg_num increases. new pgs result in data being "split" + * into new pgs. for this to proceed smoothly, new pgs are intiially + * colocated with their parents; that is, pgp_num doesn't increase + * until the new pgs have successfully split. only _then_ are the new + * pgs placed independently. + * + * lpg_num -- localized pg count (per device). replicas are randomly + * selected. + * + * lpgp_num -- as above. + */ struct ceph_pg_pool { - __u8 crush_ruleset; - __u8 size; __u8 type; + __u8 size; + __u8 crush_ruleset; + __u32 pg_num, pgp_num; + __u32 lpg_num, lpgp_num; + __u32 last_change; /* most recent epoch changed */ } __attribute__ ((packed)); /* diff --git a/src/kernel/debugfs.c b/src/kernel/debugfs.c index 6c92f5fafbdc..a4045d3f78e0 100644 --- a/src/kernel/debugfs.c +++ b/src/kernel/debugfs.c @@ -200,17 +200,18 @@ static int osdmap_show(struct seq_file *s, void *p) if (client->osdc.osdmap == NULL) return 0; seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); - seq_printf(s, "pg_num %d / %d\n", - client->osdc.osdmap->pg_num, - client->osdc.osdmap->pg_num_mask); - seq_printf(s, "lpg_num %d / %d\n", - client->osdc.osdmap->lpg_num, - client->osdc.osdmap->lpg_num_mask); seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL":"", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL":""); + (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? + " NEARFULL":"", + (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? + " FULL":""); + for (i = 0; i < client->osdc.osdmap->num_pools; i++) { + struct ceph_pg_pool_info *pool = + &client->osdc.osdmap->pg_pool[i]; + seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d", + pool->v.pg_num, pool->pg_num_mask, + pool->v.lpg_num, pool->lpg_num_mask); + } for (i = 0; i < client->osdc.osdmap->max_osd; i++) { struct ceph_entity_addr *addr = &client->osdc.osdmap->osd_addr[i]; diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index 90ccda5fde50..d49d57a3a92c 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -23,15 +23,16 @@ int ceph_debug_osdc __read_mostly = -1; * request accordingly. shorten extent as necessary if it crosses an * object boundary. */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req) +static int calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req) { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_osd_op *op = (void *)(reqhead + 1); u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ + int err; /* object extent? */ reqhead->oid.ino = cpu_to_le64(vino.ino); @@ -47,13 +48,14 @@ static void calc_layout(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(off, *plen); /* pgid? */ - calc_object_layout(&reqhead->layout, &reqhead->oid, layout, - osdc->osdmap); + err = calc_object_layout(&reqhead->layout, &reqhead->oid, layout, + osdc->osdmap); dout(10, "calc_layout %llx.%08x %llu~%llu pgid %llx (%d pages)\n", le64_to_cpu(reqhead->oid.ino), le32_to_cpu(reqhead->oid.bno), objoff, objlen, le64_to_cpu(reqhead->layout.ol_pgid), req->r_num_pages); + return err; } @@ -101,7 +103,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, int do_trunc = truncate_seq && (off + *plen > truncate_size); int num_op = 1 + do_sync + do_trunc; size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int i; + int i, err; u64 prevofs; /* we may overallocate here, if our write extent is shortened below */ @@ -140,7 +142,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_snapc = ceph_get_snap_context(snapc); /* calculate max write size, pgid */ - calc_layout(osdc, vino, layout, off, plen, req); + err = calc_layout(osdc, vino, layout, off, plen, req); + if (err < 0) { + ceph_msg_put(msg); + kfree(req); + return ERR_PTR(err); + } req->r_pgid.pg64 = le64_to_cpu(head->layout.ol_pgid); if (flags & CEPH_OSD_FLAG_MODIFY) { @@ -320,29 +327,27 @@ static int map_osds(struct ceph_osd_client *osdc, unsigned pps; /* placement ps */ int osds[10], osd = -1; int i, num; - struct ceph_pg_pool *pool; + struct ceph_pg_pool_info *pool; if (req->r_pgid.pg.pool >= osdc->osdmap->num_pools) return -1; pool = &osdc->osdmap->pg_pool[req->r_pgid.pg.pool]; - ruleno = crush_find_rule(osdc->osdmap->crush, pool->crush_ruleset, - req->r_pgid.pg.type, pool->size); + ruleno = crush_find_rule(osdc->osdmap->crush, pool->v.crush_ruleset, + req->r_pgid.pg.type, pool->v.size); if (ruleno < 0) { derr(0, "map_osds no crush rule for pool %d type %d size %d\n", - req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->size); + req->r_pgid.pg.pool, req->r_pgid.pg.type, pool->v.size); return -1; } if (req->r_pgid.pg.preferred >= 0) pps = ceph_stable_mod(req->r_pgid.pg.ps, - osdc->osdmap->lpgp_num, - osdc->osdmap->lpgp_num_mask); + pool->v.lpgp_num, pool->lpgp_num_mask); else pps = ceph_stable_mod(req->r_pgid.pg.ps, - osdc->osdmap->pgp_num, - osdc->osdmap->pgp_num_mask); + pool->v.pgp_num, pool->pgp_num_mask); num = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds, - min_t(int, pool->size, ARRAY_SIZE(osds)), + min_t(int, pool->v.size, ARRAY_SIZE(osds)), req->r_pgid.pg.preferred, osdc->osdmap->osd_weight); /* primary is first up osd */ diff --git a/src/kernel/osdmap.c b/src/kernel/osdmap.c index e1a21fe591f0..75c9c5e6e48b 100644 --- a/src/kernel/osdmap.c +++ b/src/kernel/osdmap.c @@ -27,12 +27,15 @@ static int calc_bits_of(unsigned t) /* * the foo_mask is the smallest value 2^n-1 that is >= foo. */ -static void calc_pg_masks(struct ceph_osdmap *map) +static void calc_pg_masks(struct ceph_pg_pool_info *pi) { - map->pg_num_mask = (1 << calc_bits_of(map->pg_num-1)) - 1; - map->pgp_num_mask = (1 << calc_bits_of(map->pgp_num-1)) - 1; - map->lpg_num_mask = (1 << calc_bits_of(map->lpg_num-1)) - 1; - map->lpgp_num_mask = (1 << calc_bits_of(map->lpgp_num-1)) - 1; + pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; + pi->pgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; + pi->lpg_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; + pi->lpgp_num_mask = + (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; } /* @@ -352,7 +355,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (map == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_need(p, end, 2*sizeof(u64)+11*sizeof(u32), bad); + ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); ceph_decode_64_le(p, major); __ceph_fsid_set_major(&map->fsid, major); ceph_decode_64_le(p, minor); @@ -362,11 +365,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_le(p, map->created.tv_nsec); ceph_decode_32_le(p, map->modified.tv_sec); ceph_decode_32_le(p, map->modified.tv_nsec); - ceph_decode_32(p, map->pg_num); - ceph_decode_32(p, map->pgp_num); - ceph_decode_32(p, map->lpg_num); - ceph_decode_32(p, map->lpgp_num); - ceph_decode_32(p, map->last_pg_change); ceph_decode_32(p, map->num_pools); map->pg_pool = kmalloc(map->num_pools * sizeof(*map->pg_pool), @@ -377,17 +375,17 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) } ceph_decode_32_safe(p, end, max, bad); while (max--) { - ceph_decode_need(p, end, 4+sizeof(*map->pg_pool), bad); + ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad); ceph_decode_32(p, i); if (i >= map->num_pools) goto bad; - ceph_decode_copy(p, &map->pg_pool[i], sizeof(*map->pg_pool)); + ceph_decode_copy(p, &map->pg_pool[i].v, + sizeof(map->pg_pool->v)); + calc_pg_masks(&map->pg_pool[i]); } ceph_decode_32_safe(p, end, map->flags, bad); - calc_pg_masks(map); - ceph_decode_32(p, max); /* (re)alloc osd arrays */ @@ -448,7 +446,7 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, ceph_fsid_t fsid; u32 epoch = 0; struct ceph_timespec modified; - u32 len, x, pool; + u32 len, pool; __s32 new_flags, max; void *start = *p; int err = -EINVAL; @@ -498,18 +496,6 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, if (err < 0) goto bad; } - ceph_decode_32(p, x); - if (x) - map->pg_num = x; - ceph_decode_32(p, x); - if (x) - map->pgp_num = x; - ceph_decode_32(p, x); - if (x) - map->lpg_num = x; - ceph_decode_32(p, x); - if (x) - map->lpgp_num = x; map->epoch++; map->modified = map->modified; @@ -537,12 +523,14 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, map->pg_pool = p; map->num_pools = pool+1; } - ceph_decode_copy(p, &map->pg_pool[pool], sizeof(*map->pg_pool)); + ceph_decode_copy(p, &map->pg_pool[pool].v, + sizeof(map->pg_pool->v)); + calc_pg_masks(&map->pg_pool[pool]); } /* old_pool (ignore) */ ceph_decode_32_safe(p, end, len, bad); - *p += len * (sizeof(u32) + sizeof(*map->pg_pool)); + *p += len * sizeof(u32); /* new_up */ err = -EINVAL; @@ -652,23 +640,29 @@ void calc_file_object_mapping(struct ceph_file_layout *layout, * calculate an object layout (i.e. pgid) from an oid, * file_layout, and osdmap */ -void calc_object_layout(struct ceph_object_layout *ol, - struct ceph_object *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) +int calc_object_layout(struct ceph_object_layout *ol, + struct ceph_object *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap) { unsigned num, num_mask; union ceph_pg pgid; u64 ino = le64_to_cpu(oid->ino); unsigned bno = le32_to_cpu(oid->bno); s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); + int poolid = le16_to_cpu(fl->fl_pg_pool); + struct ceph_pg_pool_info *pool; + + if (poolid >= osdmap->num_pools) + return -EIO; + pool = &osdmap->pg_pool[poolid]; if (preferred >= 0) { - num = osdmap->lpg_num; - num_mask = osdmap->lpg_num_mask; + num = le32_to_cpu(pool->v.lpg_num); + num_mask = pool->lpg_num_mask; } else { - num = osdmap->pg_num; - num_mask = osdmap->pg_num_mask; + num = le32_to_cpu(pool->v.pg_num); + num_mask = pool->pg_num_mask; } pgid.pg64 = 0; /* start with it zeroed out */ @@ -679,4 +673,6 @@ void calc_object_layout(struct ceph_object_layout *ol, ol->ol_pgid = cpu_to_le64(pgid.pg64); ol->ol_stripe_unit = fl->fl_object_stripe_unit; + + return 0; } diff --git a/src/kernel/osdmap.h b/src/kernel/osdmap.h index 21f80aa550cf..3d7ec4bcc6d6 100644 --- a/src/kernel/osdmap.h +++ b/src/kernel/osdmap.h @@ -17,20 +17,17 @@ * The map can be updated either via an incremental map (diff) describing * the change between two successive epochs, or as a fully encoded map. */ +struct ceph_pg_pool_info { + struct ceph_pg_pool v; + int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; +}; + struct ceph_osdmap { ceph_fsid_t fsid; u32 epoch; u32 mkfs_epoch; struct ceph_timespec created, modified; - /* these parameters describe the number of placement groups - * in the system. foo_mask is the smallest value (2**n-1) >= foo. */ - u32 pg_num, pg_num_mask; - u32 pgp_num, pgp_num_mask; - u32 lpg_num, lpg_num_mask; - u32 lpgp_num, lpgp_num_mask; - u32 last_pg_change; /* epoch of last pg count change */ - u32 flags; /* CEPH_OSDMAP_* */ u32 max_osd; /* size of osd_state, _offload, _addr arrays */ @@ -39,7 +36,7 @@ struct ceph_osdmap { struct ceph_entity_addr *osd_addr; u32 num_pools; - struct ceph_pg_pool *pg_pool; + struct ceph_pg_pool_info *pg_pool; /* the CRUSH map specifies the mapping of placement groups to * the list of osds that store+replicate them. */ @@ -101,9 +98,9 @@ extern void calc_file_object_mapping(struct ceph_file_layout *layout, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern void calc_object_layout(struct ceph_object_layout *ol, - struct ceph_object *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap); +extern int calc_object_layout(struct ceph_object_layout *ol, + struct ceph_object *oid, + struct ceph_file_layout *fl, + struct ceph_osdmap *osdmap); #endif diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index a3661b2021d5..2d544633b382 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -113,7 +113,7 @@ void OSDMonitor::create_initial(bufferlist& bl) OSDMap newmap; newmap.decode(bl); - newmap.epoch = 1; + newmap.set_epoch(1); newmap.set_fsid(mon->monmap->fsid); newmap.created = newmap.modified = g_clock.now(); @@ -842,6 +842,7 @@ void OSDMonitor::tick() // --------------- #define SWAP_PRIMARIES_AT_START 0 #define SWAP_TIME 1 +#if 0 if (SWAP_PRIMARIES_AT_START) { // For all PGs that have OSD 0 as the primary, // switch them to use the first replca @@ -859,6 +860,7 @@ void OSDMonitor::tick() } } } +#endif // --------------- if (do_propose) @@ -1007,7 +1009,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m) epoch_t e = atoi(m->cmd[2].c_str()); if (ceph_fsid_compare(&map.fsid, &mon->monmap->fsid) == 0) { if (pending_inc.epoch == e) { - map.epoch = pending_inc.epoch; // make sure epoch is correct + map.set_epoch(pending_inc.epoch); // make sure epoch is correct map.encode(pending_inc.fullmap); string rs = "set osd map"; paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); @@ -1025,38 +1027,6 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) { - int n = atoi(m->cmd[2].c_str()); - if (n <= osdmap.get_pg_num()) { - ss << "specified pg_num " << n << " <= current " << osdmap.get_pg_num(); - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "currently creating pgs, wait"; - err = -EAGAIN; - } else { - ss << "set new pg_num = " << n; - pending_inc.new_pg_num = n; - getline(ss, rs); - paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); - return true; - } - } - else if (m->cmd[1] == "setpgpnum" && m->cmd.size() > 2) { - int n = atoi(m->cmd[2].c_str()); - if (n <= osdmap.get_pgp_num()) { - ss << "specified pgp_num " << n << " <= current " << osdmap.get_pgp_num(); - } else if (n > osdmap.get_pg_num()) { - ss << "specified pgp_num " << n << " > pg_num " << osdmap.get_pg_num(); - } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { - ss << "still creating pgs, wait"; - err = -EAGAIN; - } else { - ss << "set new pgp_num = " << n; - pending_inc.new_pgp_num = n; - getline(ss, rs); - paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); - return true; - } - } else if (m->cmd[1] == "down" && m->cmd.size() == 3) { long osd = strtol(m->cmd[2].c_str(), 0, 10); if (osdmap.is_down(osd)) { @@ -1134,22 +1104,57 @@ bool OSDMonitor::prepare_command(MMonCommand *m) } else if (m->cmd[1] == "pool" && m->cmd.size() >= 5) { int pool = -1; - for (map::iterator p = osdmap.pool_name.begin(); - p != osdmap.pool_name.end(); - p++) { - if (p->second == m->cmd[2]) - pool = p->first; + pg_pool_t *p = 0; + for (map::iterator i = osdmap.pool_name.begin(); + i != osdmap.pool_name.end(); + i++) { + if (i->second == m->cmd[2]) { + pool = i->first; + p = &osdmap.pools[pool]; + } } if (pool >= 0) { - if (m->cmd[3] == "size") { - int s = atoi(m->cmd[4].c_str()); - if (s) { - pending_inc.new_pools[pool] = osdmap.pools[pool]; - pending_inc.new_pools[pool].size = s; - ss << "set pool " << pool << " size to " << s; + int n = atoi(m->cmd[4].c_str()); + if (n) { + if (m->cmd[3] == "size") { + pending_inc.new_pools[pool] = *p; + pending_inc.new_pools[pool].v.size = n; + ss << "set pool " << pool << " size to " << n; getline(ss, rs); paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; + } else if (m->cmd[3] == "pg_num") { + if (n <= p->get_pg_num()) { + ss << "specified pg_num " << n << " <= current " << p->get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "currently creating pgs, wait"; + err = -EAGAIN; + } else { + pending_inc.new_pools[pool] = osdmap.pools[pool]; + pending_inc.new_pools[pool].v.pg_num = n; + ss << "set pool " << pool << " pg_num to " << n; + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; + } + } else if (m->cmd[3] == "pgp_num") { + if (n <= p->get_pgp_num()) { + ss << "specified pgp_num " << n << " <= current " << p->get_pgp_num(); + } else if (n > p->get_pg_num()) { + ss << "specified pgp_num " << n << " > pg_num " << p->get_pg_num(); + } else if (!mon->pgmon()->pg_map.creating_pgs.empty()) { + ss << "still creating pgs, wait"; + err = -EAGAIN; + } else { + pending_inc.new_pools[pool] = osdmap.pools[pool]; + pending_inc.new_pools[pool].v.pgp_num = n; + ss << "set pool " << pool << " pgp_num to " << n; + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; + } + } else { + ss << "unrecognized pool field " << m->cmd[3]; } } } else { diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 57fab6426ea0..e606932b927e 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -423,35 +423,35 @@ void PGMonitor::check_osd_map(epoch_t epoch) bool PGMonitor::register_new_pgs() { - dout(10) << "osdmap last_pg_change " << mon->osdmon()->osdmap.get_last_pg_change() - << ", pgmap last_pg_scan " << pg_map.last_pg_scan << dendl; - if (mon->osdmon()->osdmap.get_last_pg_change() <= pg_map.last_pg_scan || - mon->osdmon()->osdmap.get_last_pg_change() <= pending_inc.pg_scan) { - dout(10) << "register_new_pgs -- i've already scanned pg space since last significant osdmap update" << dendl; - return false; - } - // iterate over crush mapspace - dout(10) << "register_new_pgs scanning pgid space defined by crush rule masks" << dendl; - - CrushWrapper *crush = &mon->osdmon()->osdmap.crush; - int pg_num = mon->osdmon()->osdmap.get_pg_num(); epoch_t epoch = mon->osdmon()->osdmap.get_epoch(); + dout(10) << "register_new_pgs checking pg pools for osdmap epoch " << epoch + << ", last_pg_scan " << pg_map.last_pg_scan << dendl; bool first = pg_map.pg_stat.empty(); // first pg creation int created = 0; - for (map::iterator p = mon->osdmon()->osdmap.pools.begin(); + for (map::iterator p = mon->osdmon()->osdmap.pools.begin(); p != mon->osdmon()->osdmap.pools.end(); p++) { - int pool = p->first; - int type = p->second.type; - int ruleno = p->second.crush_ruleset; - if (!crush->rule_exists(ruleno)) + int poolid = p->first; + pg_pool_t &pool = p->second; + int type = pool.get_type(); + int ruleno = pool.get_crush_ruleset(); + if (!mon->osdmon()->osdmap.crush.rule_exists(ruleno)) continue; - for (ps_t ps = 0; ps < pg_num; ps++) { - pg_t pgid(type, ps, pool, -1); + + if (pool.get_last_change() <= pg_map.last_pg_scan || + pool.get_last_change() <= pending_inc.pg_scan) { + dout(10) << " no change in " << pool << dendl; + continue; + } + + dout(10) << "register_new_pgs scanning " << pool << dendl; + + for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) { + pg_t pgid(type, ps, poolid, -1); if (pg_map.pg_stat.count(pgid)) { - dout(20) << "register_new_pgs have " << pgid << dendl; + dout(20) << "register_new_pgs have " << pgid << dendl; continue; } @@ -461,7 +461,7 @@ bool PGMonitor::register_new_pgs() parent = pgid; while (1) { // remove most significant bit - int msb = calc_bits_of(parent.u.pg.ps); + int msb = pool.calc_bits_of(parent.u.pg.ps); if (!msb) break; parent.u.pg.ps &= ~(1<<(msb-1)); split_bits++; @@ -482,9 +482,9 @@ bool PGMonitor::register_new_pgs() created++; if (split_bits == 0) { - dout(10) << "register_new_pgs will create " << pgid << dendl; + dout(10) << "register_new_pgs will create " << pgid << dendl; } else { - dout(10) << "register_new_pgs will create " << pgid + dout(10) << "register_new_pgs will create " << pgid << " parent " << parent << " by " << split_bits << " bits" << dendl; diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 8aef86fcd0f6..89e91a87752b 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -25,18 +25,10 @@ void OSDMap::print(ostream& out) << "created " << get_created() << "\n" << "modifed " << get_modified() << "\n" << std::endl; - out << "pg_num " << get_pg_num() << "\n" - << "pgp_num " << get_pgp_num() << "\n" - << "lpg_num " << get_lpg_num() << "\n" - << "lpgp_num " << get_lpgp_num() << "\n" - << "last_pg_change " << get_last_pg_change() << "\n" - << std::endl; - for (map::iterator p = pools.begin(); p != pools.end(); p++) + for (map::iterator p = pools.begin(); p != pools.end(); p++) out << "pg_pool " << p->first << " '" << pool_name[p->first] - << "' size " << (int)p->second.size - << " crush_ruleset " << (int)p->second.crush_ruleset - << "\n"; + << "' " << p->second << "\n"; out << std::endl; out << "max_osd " << get_max_osd() << "\n"; @@ -94,8 +86,6 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid, created = modified = g_clock.now(); set_max_osd(num_osd); - pg_num = pgp_num = num_osd << pg_bits; - lpg_num = lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0; // crush map map rulesets; @@ -105,9 +95,14 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid_t &fsid, int pool = 0; for (map::iterator p = rulesets.begin(); p != rulesets.end(); p++) { - pools[pool].size = 2; - pools[pool].crush_ruleset = p->first; - pools[pool].type = CEPH_PG_TYPE_REP; + pools[pool].v.type = CEPH_PG_TYPE_REP; + pools[pool].v.size = 2; + pools[pool].v.crush_ruleset = p->first; + pools[pool].v.pg_num = num_osd << pg_bits; + pools[pool].v.pgp_num = num_osd << pg_bits; + pools[pool].v.lpg_num = lpg_bits ? (1 << (lpg_bits-1)) : 0; + pools[pool].v.lpgp_num = lpg_bits ? (1 << (lpg_bits-1)) : 0; + pools[pool].v.last_change = epoch; pool_name[pool] = p->second; pool++; } diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index abfbe5d4e1ff..34c4e8375b9c 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -55,16 +55,6 @@ using __gnu_cxx::hash_set; //#define PG_ROLE_TAIL 2 -inline int calc_bits_of(int t) { - int b = 0; - while (t > 0) { - t = t >> 1; - b++; - } - return b; -} - - /* * we track up to two intervals during which the osd was alive and @@ -137,12 +127,14 @@ public: utime_t modified; int32_t new_flags; + /* bool is_pg_change() { return (fullmap.length() || crush.length() || new_pg_num || new_lpg_num); } + */ // full (rare) bufferlist fullmap; // in leiu of below. @@ -150,8 +142,7 @@ public: // incremental int32_t new_max_osd; - int32_t new_pg_num, new_pgp_num, new_lpg_num, new_lpgp_num; - map new_pools; + map new_pools; map new_pool_names; set old_pools; map new_up; @@ -179,10 +170,6 @@ public: ::encode(crush, bl); ::encode(new_max_osd, bl); - ::encode(new_pg_num, bl); - ::encode(new_pgp_num, bl); - ::encode(new_lpg_num, bl); - ::encode(new_lpgp_num, bl); ::encode(new_pools, bl); ::encode(old_pools, bl); ::encode(new_up, bl); @@ -211,10 +198,6 @@ public: ::decode(crush, p); ::decode(new_max_osd, p); - ::decode(new_pg_num, p); - ::decode(new_pgp_num, p); - ::decode(new_lpg_num, p); - ::decode(new_lpgp_num, p); ::decode(new_pools, p); ::decode(old_pools, p); ::decode(new_up, p); @@ -234,8 +217,7 @@ public: ::decode(old_blacklist, p); } - Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1), - new_pg_num(0), new_pgp_num(0), new_lpg_num(0), new_lpgp_num(0) { + Incremental(epoch_t e=0) : epoch(e), new_flags(-1), new_max_osd(-1) { memset(&fsid, 0, sizeof(fsid)); } Incremental(bufferlist &bl) { @@ -252,31 +234,6 @@ private: epoch_t epoch; // what epoch of the osd cluster descriptor is this utime_t created, modified; // epoch start time - /* - * placement groups - * - * pg_num -- base number of pseudorandomly placed pgs - * - * pgp_num -- effective number when calculating pg placement. this - * is used for pg_num increases. new pgs result in data being - * "split" into new pgs. for this to proceed smoothly, new pgs are - * intiially colocated with their parents; that is, pgp_num doesn't - * increase until the new pgs have successfully split. only _then_ - * are the new pgs placed independently. - * - * lpg_num -- localized pg count (per device). replicas are - * randomly selected. - * - * lpgp_num -- as above. - */ - int32_t pg_num, pg_num_mask; // placement group count and bitmask - int32_t pgp_num, pgp_num_mask; // pg placement num (for placing pg's.. <= pg_num) - int32_t lpg_num, lpg_num_mask; // localized placement group count - int32_t lpgp_num, lpgp_num_mask; // as above - - // new pgs - epoch_t last_pg_change; // most recent epoch initiating possible pg creation - uint32_t flags; int32_t max_osd; @@ -285,7 +242,7 @@ private: vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" vector osd_info; - map pools; + map pools; map pool_name; map pg_swap_primary; // force new osd to be pg primary (if already a member) snapid_t max_snap; @@ -302,12 +259,9 @@ private: public: OSDMap() : epoch(0), - pg_num(0), pgp_num(0), lpg_num(0), lpgp_num(0), - last_pg_change(0), flags(0), max_osd(0), max_snap(0) { memset(&fsid, 0, sizeof(fsid)); - calc_pg_masks(); } // map info @@ -316,34 +270,19 @@ private: epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - void set_epoch(epoch_t e) { epoch = e; } - /* pg num / masks */ - void calc_pg_masks() { - pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; - pgp_num_mask = (1 << calc_bits_of(pgp_num-1)) - 1; - lpg_num_mask = (1 << calc_bits_of(lpg_num-1)) - 1; - lpgp_num_mask = (1 << calc_bits_of(lpgp_num-1)) - 1; + void set_epoch(epoch_t e) { + epoch = e; + for (map::iterator p = pools.begin(); + p != pools.end(); + p++) + p->second.v.last_change = e; } - int get_pg_num() const { return pg_num; } - int get_pgp_num() const { return pgp_num; } - int get_lpg_num() const { return lpg_num; } - int get_lpgp_num() const { return lpgp_num; } - - int get_pg_num_mask() const { return pg_num_mask; } - int get_pgp_num_mask() const { return pgp_num_mask; } - int get_lpg_num_mask() const { return lpg_num_mask; } - int get_lpgp_num_mask() const { return lpgp_num_mask; } - /* stamps etc */ const utime_t& get_created() const { return created; } const utime_t& get_modified() const { return modified; } - epoch_t get_last_pg_change() const { - return last_pg_change; - } - snapid_t get_max_snap() { return max_snap; } bool is_removed_snap(snapid_t sn) { if (sn > max_snap) @@ -507,36 +446,15 @@ private: modified = inc.modified; // full map? - if (inc.fullmap.length()) + if (inc.fullmap.length()) { decode(inc.fullmap); - - if (inc.is_pg_change()) - last_pg_change = epoch; - - if (inc.fullmap.length()) return; + } // nope, incremental. if (inc.new_flags >= 0) flags = inc.new_flags; - if (inc.new_pg_num) { - assert(inc.new_pg_num >= pg_num); - pg_num = inc.new_pg_num; - } - if (inc.new_lpg_num) { - assert(inc.new_lpg_num >= lpg_num); - lpg_num = inc.new_lpg_num; - } - if (inc.new_pgp_num) { - assert(inc.new_pgp_num >= pgp_num); - pgp_num = inc.new_pgp_num; - } - if (inc.new_lpgp_num) { - assert(inc.new_lpgp_num >= lpgp_num); - lpgp_num = inc.new_lpgp_num; - } - if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); @@ -546,7 +464,7 @@ private: pools.erase(*p); pool_name.erase(*p); } - for (map::iterator p = inc.new_pools.begin(); + for (map::iterator p = inc.new_pools.begin(); p != inc.new_pools.end(); p++) pools[p->first] = p->second; @@ -631,11 +549,6 @@ private: ::encode(epoch, blist); ::encode(created, blist); ::encode(modified, blist); - ::encode(pg_num, blist); - ::encode(pgp_num, blist); - ::encode(lpg_num, blist); - ::encode(lpgp_num, blist); - ::encode(last_pg_change, blist); int32_t max_pools = 0; if (pools.size()) @@ -672,12 +585,6 @@ private: ::decode(epoch, p); ::decode(created, p); ::decode(modified, p); - ::decode(pg_num, p); - ::decode(pgp_num, p); - ::decode(lpg_num, p); - ::decode(lpgp_num, p); - calc_pg_masks(); - ::decode(last_pg_change, p); int32_t max_pools; ::decode(max_pools, p); @@ -753,28 +660,6 @@ private: return layout; } - - /* - * map a raw pg (with full precision ps) into an actual pg, for storage - */ - pg_t raw_pg_to_pg(pg_t pg) { - if (pg.preferred() >= 0) - pg.u.pg.ps = ceph_stable_mod(pg.ps(), lpg_num, lpg_num_mask); - else - pg.u.pg.ps = ceph_stable_mod(pg.ps(), pg_num, pg_num_mask); - return pg; - } - - /* - * map raw pg (full precision ps) into a placement ps - */ - ps_t raw_pg_to_pps(pg_t pg) { - if (pg.preferred() >= 0) - return ceph_stable_mod(pg.ps(), lpgp_num, lpgp_num_mask); - else - return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask); - } - // pg -> (osd list) int pg_to_osds(pg_t pg, vector& osds) { // map to osds[] @@ -782,35 +667,36 @@ private: if (!pools.count(p)) { return osds.size(); } - ceph_pg_pool &pool = pools[p]; - ps_t pps = raw_pg_to_pps(pg); // placement ps + pg_pool_t &pool = pools[p]; + ps_t pps = pool.raw_pg_to_pps(pg); // placement ps + unsigned size = pool.get_size(); switch (g_conf.osd_pg_layout) { case CEPH_PG_LAYOUT_CRUSH: { // what crush rule? - int ruleno = crush.find_rule(pool.crush_ruleset, pg.type(), pool.size); + int ruleno = crush.find_rule(pool.get_crush_ruleset(), pg.type(), size); if (ruleno >= 0) - crush.do_rule(ruleno, pps, osds, pool.size, pg.preferred(), osd_weight); + crush.do_rule(ruleno, pps, osds, size, pg.preferred(), osd_weight); } break; case CEPH_PG_LAYOUT_LINEAR: - for (unsigned i=0; i 0); - for (unsigned i=1; i 0); + for (unsigned i=1; i primary osd diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 90854d1b6e31..9d074b817037 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -25,7 +25,7 @@ -#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v011" +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v012" @@ -397,6 +397,90 @@ static inline std::string pg_state_string(int state) { return st; } + + + +/* + * pg_pool + */ +struct pg_pool_t { + ceph_pg_pool v; + int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; + + int get_pg_num() const { return v.pg_num; } + int get_pgp_num() const { return v.pgp_num; } + int get_lpg_num() const { return v.lpg_num; } + int get_lpgp_num() const { return v.lpgp_num; } + + int get_pg_num_mask() const { return pg_num_mask; } + int get_pgp_num_mask() const { return pgp_num_mask; } + int get_lpg_num_mask() const { return lpg_num_mask; } + int get_lpgp_num_mask() const { return lpgp_num_mask; } + + int calc_bits_of(int t) { + int b = 0; + while (t > 0) { + t = t >> 1; + b++; + } + return b; + } + + unsigned get_type() const { return v.type; } + unsigned get_size() const { return v.size; } + int get_crush_ruleset() const { return v.crush_ruleset; } + epoch_t get_last_change() const { return v.last_change; } + + void calc_pg_masks() { + pg_num_mask = (1 << calc_bits_of(v.pg_num-1)) - 1; + pgp_num_mask = (1 << calc_bits_of(v.pgp_num-1)) - 1; + lpg_num_mask = (1 << calc_bits_of(v.lpg_num-1)) - 1; + lpgp_num_mask = (1 << calc_bits_of(v.lpgp_num-1)) - 1; + } + + /* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ + pg_t raw_pg_to_pg(pg_t pg) const { + if (pg.preferred() >= 0 && v.lpg_num) + pg.u.pg.ps = ceph_stable_mod(pg.ps(), v.lpg_num, lpg_num_mask); + else + pg.u.pg.ps = ceph_stable_mod(pg.ps(), v.pg_num, pg_num_mask); + return pg; + } + + /* + * map raw pg (full precision ps) into a placement ps + */ + ps_t raw_pg_to_pps(pg_t pg) const { + if (pg.preferred() >= 0 && v.lpgp_num) + return ceph_stable_mod(pg.ps(), v.lpgp_num, lpgp_num_mask); + else + return ceph_stable_mod(pg.ps(), v.pgp_num, pgp_num_mask); + } + + void encode(bufferlist& bl) const { + ::encode(v, bl); + } + void decode(bufferlist::iterator& bl) { + ::decode(v, bl); + calc_pg_masks(); + } +}; +WRITE_CLASS_ENCODER(pg_pool_t) + +inline ostream& operator<<(ostream& out, const pg_pool_t& p) { + return out << "pg_pool(type " << p.get_type() + << " size " << p.get_size() + << " ruleset " << p.get_crush_ruleset() + << " pg_num " << p.get_pg_num() + << " pgp_num " << p.get_pgp_num() + << " lpg_num " << p.get_lpg_num() + << " lpgp_num " << p.get_lpgp_num() + << " last_change " << p.get_last_change() + << ")"; +} + /** pg_stat * aggregate stats for a single PG. */