Skip to content

Commit 85b0e62

Browse files
author
Paul Dagnelie
committed
Implement new label format for large disks
This patch contains the logic for a new, larger label format. This format is intended to support disks with large sector sizes. By using a larger label we can store more uberblocks and other critical pool metadata. We can also use the extra space to enable new features in ZFS going forward. This initial commit does not add new capabilities, but provides the framework for them. Signed-off-by: Paul Dagnelie <[email protected]> Sponsored-by: Wasabi, Inc. Sponsored-by: Klara, Inc.
1 parent f330b46 commit 85b0e62

37 files changed

+1514
-210
lines changed

cmd/zdb/zdb.c

Lines changed: 240 additions & 48 deletions
Large diffs are not rendered by default.

cmd/zhack.c

Lines changed: 276 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -685,11 +685,11 @@ zhack_do_metaslab(int argc, char **argv)
685685
return (0);
686686
}
687687

688-
#define ASHIFT_UBERBLOCK_SHIFT(ashift) \
688+
#define ASHIFT_UBERBLOCK_SHIFT(ashift, new) \
689689
MIN(MAX(ashift, UBERBLOCK_SHIFT), \
690-
MAX_UBERBLOCK_SHIFT)
691-
#define ASHIFT_UBERBLOCK_SIZE(ashift) \
692-
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift))
690+
MAX_UBERBLOCK_SHIFT(new))
691+
#define ASHIFT_UBERBLOCK_SIZE(ashift, new) \
692+
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift, new))
693693

694694
#define REPAIR_LABEL_STATUS_CKSUM (1 << 0)
695695
#define REPAIR_LABEL_STATUS_UB (1 << 1)
@@ -714,6 +714,26 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,
714714
return (0);
715715
}
716716

717+
/*
 * Read exactly buflen bytes from fd, starting at byte offset `offset`, into
 * buf.  The label index l is used only to identify the label in error
 * messages.
 *
 * Returns 0 when the whole buffer was filled.  Returns -1 after printing a
 * message to stderr when the read fails or comes up short.  A short read must
 * never be reported as 0: hitting end-of-file yields a byte count of 0, and
 * callers treat a 0 return from this function as success.
 */
static int
zhack_repair_read(const int fd, uint8_t *buf, size_t buflen,
    const uint64_t offset, const int l)
{
	/* Keep the full ssize_t so large transfer counts are not truncated. */
	const ssize_t nread = pread64(fd, buf, buflen, offset);

	if (nread == -1) {
		(void) fprintf(stderr,
		    "error: cannot read buffer at %" PRIu64
		    " for label %d: %s\n",
		    offset, l, strerror(errno));
		return (-1);
	}

	if ((size_t)nread != buflen) {
		(void) fprintf(stderr,
		    "error: bad read size at %" PRIu64 " for label %d \n",
		    offset, l);
		return (-1);
	}

	return (0);
}
736+
717737
static void
718738
zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
719739
const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
@@ -876,7 +896,7 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
876896
(char *)vl + offsetof(vdev_label_t, vl_uberblock);
877897
zio_eck_t *ub_eck =
878898
(zio_eck_t *)
879-
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1;
899+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
880900

881901
if (ub_eck->zec_magic != 0) {
882902
(void) fprintf(stderr,
@@ -895,10 +915,39 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
895915
if (zhack_repair_write_label(l, fd, byteswap,
896916
ub_data, ub_eck,
897917
label_offset + offsetof(vdev_label_t, vl_uberblock),
898-
ASHIFT_UBERBLOCK_SIZE(ashift)))
918+
ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE)))
899919
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
900920
}
901921

922+
static void
923+
zhack_repair_write_uberblock_new(void *ub_data, const int l,
924+
const uint64_t ashift, const int fd, const int byteswap,
925+
const uint64_t label_offset, uint32_t *labels_repaired)
926+
{
927+
zio_eck_t *ub_eck =
928+
(zio_eck_t *)
929+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
930+
931+
if (ub_eck->zec_magic != 0) {
932+
(void) fprintf(stderr,
933+
"error: label %d: "
934+
"Expected Uberblock checksum magic number to "
935+
"be 0, but got %" PRIu64 "\n",
936+
l, ub_eck->zec_magic);
937+
(void) fprintf(stderr, "It would appear there's already "
938+
"a checksum for the uberblock.\n");
939+
return;
940+
}
941+
942+
943+
ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
944+
945+
if (zhack_repair_write_label(l, fd, byteswap,
946+
ub_data, ub_eck, label_offset + VDEV_LARGE_UBERBLOCK_RING,
947+
ASHIFT_UBERBLOCK_SIZE(ashift, B_TRUE)))
948+
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
949+
}
950+
902951
static void
903952
zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
904953
{
@@ -912,12 +961,13 @@ zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
912961

913962
static int
914963
zhack_repair_test_cksum(const int byteswap, void *vdev_data,
915-
zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l)
964+
const uint64_t size, zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset,
965+
const int l)
916966
{
917967
const zio_cksum_t expected_cksum = vdev_eck->zec_cksum;
918968
zio_cksum_t actual_cksum;
919969
zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset,
920-
VDEV_PHYS_SIZE, vdev_eck, &actual_cksum);
970+
size, vdev_eck, &actual_cksum);
921971
const uint64_t expected_magic = byteswap ?
922972
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
923973
const uint64_t actual_magic = vdev_eck->zec_magic;
@@ -945,15 +995,17 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
945995

946996
static void
947997
zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
948-
vdev_label_t *vl, const uint64_t label_offset, const int l,
949-
uint32_t *labels_repaired)
998+
vdev_label_t *vl, const uint64_t filesize, const int l,
999+
uint32_t *labels_repaired, boolean_t *large_label)
9501000
{
9511001
ssize_t err;
9521002
uberblock_t *ub = (uberblock_t *)vl->vl_uberblock;
9531003
void *vdev_data =
9541004
(char *)vl + offsetof(vdev_label_t, vl_vdev_phys);
9551005
zio_eck_t *vdev_eck =
9561006
(zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
1007+
const uint64_t label_offset = vdev_label_offset(filesize, l, 0,
1008+
B_FALSE);
9571009
const uint64_t vdev_phys_offset =
9581010
label_offset + offsetof(vdev_label_t, vl_vdev_phys);
9591011
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
@@ -987,8 +1039,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
9871039
}
9881040

9891041
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
990-
zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck,
991-
vdev_phys_offset, l) != 0) {
1042+
zhack_repair_test_cksum(byteswap, vdev_data, VDEV_PHYS_SIZE,
1043+
vdev_eck, vdev_phys_offset, l) != 0) {
9921044
(void) fprintf(stderr, "It would appear checksums are "
9931045
"corrupted. Try zhack repair label -c <device>\n");
9941046
return;
@@ -1001,6 +1053,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
10011053
"error: cannot unpack nvlist label %d\n", l);
10021054
return;
10031055
}
1056+
(void) nvlist_lookup_boolean_value(cfg, ZPOOL_CONFIG_LARGE_LABEL,
1057+
large_label);
10041058

10051059
err = zhack_repair_check_label(ub,
10061060
l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
@@ -1025,13 +1079,212 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
10251079

10261080
zhack_repair_write_uberblock(vl,
10271081
l, ashift, fd, byteswap, label_offset, labels_repaired);
1082+
if (large_label) {
1083+
zhack_repair_write_uberblock_new(ub, l, ashift,
1084+
fd, byteswap, vdev_label_offset(filesize, l, 0,
1085+
B_TRUE), labels_repaired);
1086+
}
10281087
}
10291088

10301089
if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck,
10311090
vdev_phys_offset, VDEV_PHYS_SIZE))
1032-
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1091+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1092+
1093+
fsync(fd);
1094+
}
1095+
1096+
static void
1097+
zhack_repair_one_label_large(const zhack_repair_op_t op, const int fd,
1098+
const uint64_t label_offset, const int l, uint32_t *labels_repaired)
1099+
{
1100+
ssize_t err;
1101+
void *toc_data = NULL, *bootenv = NULL, *vdev_config = NULL;
1102+
void *spa_config = NULL, *ub = NULL;
1103+
/*
1104+
* Note that currently, this can't handle disks with larger than 8k
1105+
* sector sizes. That needs to be fixed eventually.
1106+
*/
1107+
toc_data = malloc(VDEV_TOC_SIZE);
1108+
err = zhack_repair_read(fd, toc_data, VDEV_TOC_SIZE, label_offset, l);
1109+
if (err)
1110+
goto out;
1111+
1112+
zio_eck_t *toc_eck = (zio_eck_t *)(toc_data + VDEV_TOC_SIZE) - 1;
1113+
if (toc_eck->zec_magic == 0) {
1114+
(void) fprintf(stderr, "error: label %d: "
1115+
"Expected the nvlist checksum magic number to not be zero"
1116+
"\n",
1117+
l);
1118+
(void) fprintf(stderr, "There should already be a checksum "
1119+
"for the label.\n");
1120+
goto out;
1121+
}
1122+
1123+
int byteswap =
1124+
(toc_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
1125+
1126+
if (byteswap) {
1127+
byteswap_uint64_array(&toc_eck->zec_cksum,
1128+
sizeof (zio_cksum_t));
1129+
toc_eck->zec_magic = BSWAP_64(toc_eck->zec_magic);
1130+
}
1131+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1132+
zhack_repair_test_cksum(byteswap, toc_data, VDEV_TOC_SIZE,
1133+
toc_eck, label_offset, l) != 0) {
1134+
(void) fprintf(stderr, "It would appear checksums are "
1135+
"corrupted. Try zhack repair label -c <device>\n");
1136+
goto out;
1137+
}
1138+
1139+
nvlist_t *toc;
1140+
err = nvlist_unpack(toc_data, VDEV_TOC_SIZE, &toc, 0);
1141+
if (err) {
1142+
(void) fprintf(stderr,
1143+
"error: cannot unpack nvlist TOC %d\n", l);
1144+
goto out;
1145+
}
1146+
1147+
uint32_t bootenv_size, vc_size, sc_size;
1148+
if ((err = nvlist_lookup_uint32(toc, VDEV_TOC_BOOT_REGION,
1149+
&bootenv_size)) || (err = nvlist_lookup_uint32(toc,
1150+
VDEV_TOC_VDEV_CONFIG, &vc_size)) || (err = nvlist_lookup_uint32(toc,
1151+
VDEV_TOC_POOL_CONFIG, &sc_size))) {
1152+
(void) fprintf(stderr,
1153+
"error: TOC missing core fields %d\n", l);
1154+
goto out;
1155+
}
1156+
bootenv = malloc(bootenv_size);
1157+
zio_eck_t *bootenv_eck = (zio_eck_t *)(bootenv + bootenv_size) - 1;
1158+
vdev_config = malloc(vc_size);
1159+
zio_eck_t *vc_eck = (zio_eck_t *)(vdev_config + vc_size) - 1;
1160+
spa_config = malloc(sc_size);
1161+
zio_eck_t *sc_eck = (zio_eck_t *)(spa_config + sc_size) - 1;
1162+
1163+
uint64_t offset = label_offset + VDEV_TOC_SIZE;
1164+
if (bootenv_size != 0) {
1165+
if ((err = zhack_repair_read(fd, bootenv,
1166+
bootenv_size, offset, l)))
1167+
goto out;
1168+
if (byteswap) {
1169+
byteswap_uint64_array(&bootenv_eck->zec_cksum,
1170+
sizeof (zio_cksum_t));
1171+
bootenv_eck->zec_magic =
1172+
BSWAP_64(bootenv_eck->zec_magic);
1173+
}
1174+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1175+
zhack_repair_test_cksum(byteswap, bootenv, bootenv_size,
1176+
bootenv_eck, offset, l) != 0) {
1177+
(void) fprintf(stderr, "It would appear checksums are "
1178+
"corrupted. Try zhack repair label -c <device>\n");
1179+
goto out;
1180+
}
1181+
}
1182+
1183+
offset += bootenv_size;
1184+
if ((err = zhack_repair_read(fd, vdev_config, vc_size, offset, l)))
1185+
goto out;
1186+
1187+
if (byteswap) {
1188+
byteswap_uint64_array(&sc_eck->zec_cksum,
1189+
sizeof (zio_cksum_t));
1190+
vc_eck->zec_magic = BSWAP_64(vc_eck->zec_magic);
1191+
}
1192+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1193+
zhack_repair_test_cksum(byteswap, vdev_config, vc_size,
1194+
vc_eck, offset, l) != 0) {
1195+
(void) fprintf(stderr, "It would appear checksums are "
1196+
"corrupted. Try zhack repair label -c <device>\n");
1197+
goto out;
1198+
}
1199+
offset += vc_size;
1200+
if ((err = zhack_repair_read(fd, spa_config, sc_size, offset, l)))
1201+
goto out;
1202+
1203+
if (byteswap) {
1204+
byteswap_uint64_array(&sc_eck->zec_cksum,
1205+
sizeof (zio_cksum_t));
1206+
vc_eck->zec_magic = BSWAP_64(sc_eck->zec_magic);
1207+
}
1208+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1209+
zhack_repair_test_cksum(byteswap, spa_config, sc_size,
1210+
sc_eck, offset, l) != 0) {
1211+
(void) fprintf(stderr, "It would appear checksums are "
1212+
"corrupted. Try zhack repair label -c <device>\n");
1213+
goto out;
1214+
}
1215+
1216+
nvlist_t *cfg;
1217+
err = nvlist_unpack(vdev_config, vc_size - sizeof (zio_eck_t), &cfg, 0);
1218+
if (err) {
1219+
(void) fprintf(stderr,
1220+
"error: cannot unpack nvlist label %d\n", l);
1221+
return;
1222+
}
1223+
1224+
ub = malloc(UBERBLOCK_SHIFT);
1225+
err = zhack_repair_read(fd, ub, UBERBLOCK_SHIFT,
1226+
label_offset + VDEV_LARGE_UBERBLOCK_RING, l);
1227+
if (err)
1228+
goto out;
1229+
1230+
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
1231+
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
1232+
nvlist_t *vdev_tree_cfg = NULL;
1233+
uint64_t ashift;
1234+
err = zhack_repair_check_label(ub, l, cfg_keys, ARRAY_SIZE(cfg_keys),
1235+
cfg, vdev_tree_cfg, &ashift);
1236+
if (err)
1237+
return;
1238+
1239+
if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) {
1240+
char *buf;
1241+
size_t buflen;
1242+
1243+
err = zhack_repair_undetach(ub, cfg, l);
1244+
if (err)
1245+
return;
1246+
1247+
buf = vdev_config;
1248+
buflen = vc_size - sizeof (zio_eck_t);
1249+
if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
1250+
(void) fprintf(stderr,
1251+
"error: label %d: Failed to pack nvlist\n", l);
1252+
return;
1253+
}
1254+
1255+
zhack_repair_write_uberblock_new(ub, l, ashift, fd, byteswap,
1256+
label_offset, labels_repaired);
1257+
}
1258+
1259+
offset = label_offset;
1260+
if (zhack_repair_write_label(l, fd, byteswap, toc_data, toc_eck,
1261+
offset, VDEV_TOC_SIZE))
1262+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1263+
offset += VDEV_TOC_SIZE;
1264+
if (zhack_repair_write_label(l, fd, byteswap, bootenv, bootenv_eck,
1265+
offset, bootenv_size))
1266+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1267+
offset += bootenv_size;
1268+
if (zhack_repair_write_label(l, fd, byteswap, vdev_config, vc_eck,
1269+
offset, vc_size))
1270+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1271+
offset += vc_size;
1272+
if (zhack_repair_write_label(l, fd, byteswap, spa_config, sc_eck,
1273+
offset, sc_size))
1274+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
10331275

10341276
fsync(fd);
1277+
out:
1278+
if (toc_data)
1279+
free(toc_data);
1280+
if (bootenv)
1281+
free(bootenv);
1282+
if (vdev_config)
1283+
free(vdev_config);
1284+
if (spa_config)
1285+
free(spa_config);
1286+
if (ub)
1287+
free(ub);
10351288
}
10361289

10371290
static const char *
@@ -1074,9 +1327,18 @@ zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv)
10741327
filesize =
10751328
(filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t);
10761329

1330+
boolean_t large_label = B_FALSE;
10771331
for (int l = 0; l < VDEV_LABELS; l++) {
10781332
zhack_repair_one_label(op, fd, &labels[l],
1079-
vdev_label_offset(filesize, l, 0), l, labels_repaired);
1333+
filesize, l, labels_repaired, &large_label);
1334+
if (large_label)
1335+
break;
1336+
}
1337+
if (large_label) {
1338+
for (int l = 0; l < VDEV_LABELS; l++) {
1339+
zhack_repair_one_label_large(op, fd,
1340+
filesize, l, labels_repaired);
1341+
}
10801342
}
10811343

10821344
close(fd);

0 commit comments

Comments
 (0)