Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7087166

Browse files
committed
pg_upgrade: Convert old visibility map format to new format.
Commit a892234 added a second bit per page to the visibility map, but pg_upgrade has been unaware of it up until now. Therefore, a pg_upgrade from an earlier major release of PostgreSQL to any commit preceding this one and following the one mentioned above would result in invalid visibility map contents on the new cluster, very possibly leading to data corruption. This plugs that hole. Masahiko Sawada, reviewed by Jeff Janes, Bruce Momjian, Simon Riggs, Michael Paquier, Andres Freund, me, and others.
1 parent 9118d03 commit 7087166

File tree

3 files changed

+197
-11
lines changed

3 files changed

+197
-11
lines changed

src/bin/pg_upgrade/file.c

+154
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,16 @@
99

1010
#include "postgres_fe.h"
1111

12+
#include "access/visibilitymap.h"
1213
#include "pg_upgrade.h"
14+
#include "storage/bufpage.h"
15+
#include "storage/checksum.h"
16+
#include "storage/checksum_impl.h"
1317

18+
#include <sys/stat.h>
1419
#include <fcntl.h>
1520

21+
#define BITS_PER_HEAPBLOCK_OLD 1
1622

1723

1824
#ifndef WIN32
@@ -138,6 +144,154 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
138144
#endif
139145

140146

147+
/*
148+
* rewriteVisibilityMap()
149+
*
150+
* In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
151+
* visibility map included one bit per heap page; it now includes two.
152+
* When upgrading a cluster from before that time to a current PostgreSQL
153+
* version, we could refuse to copy visibility maps from the old cluster
154+
* to the new cluster; the next VACUUM would recreate them, but at the
155+
* price of scanning the entire table. So, instead, we rewrite the old
156+
* visibility maps in the new format. That way, the all-visible bit
157+
* remains set for the pages for which it was set previously. The
158+
* all-frozen bit is never set by this conversion; we leave that to
159+
* VACUUM.
160+
*/
161+
const char *
162+
rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
163+
{
164+
int src_fd = 0;
165+
int dst_fd = 0;
166+
char buffer[BLCKSZ];
167+
ssize_t bytesRead;
168+
ssize_t src_filesize;
169+
int rewriteVmBytesPerPage;
170+
BlockNumber new_blkno = 0;
171+
struct stat statbuf;
172+
173+
/* Compute we need how many old page bytes to rewrite a new page */
174+
rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
175+
176+
if ((fromfile == NULL) || (tofile == NULL))
177+
return "Invalid old file or new file";
178+
179+
if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
180+
return getErrorText();
181+
182+
if (fstat(src_fd, &statbuf) != 0)
183+
{
184+
close(src_fd);
185+
return getErrorText();
186+
}
187+
188+
if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0)
189+
{
190+
close(src_fd);
191+
return getErrorText();
192+
}
193+
194+
/* Save old file size */
195+
src_filesize = statbuf.st_size;
196+
197+
/*
198+
* Turn each visibility map page into 2 pages one by one. Each new page
199+
* has the same page header as the old one. If the last section of last
200+
* page is empty, we skip it, mostly to avoid turning one-page visibility
201+
* maps for small relations into two pages needlessly.
202+
*/
203+
while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ)
204+
{
205+
char *old_cur;
206+
char *old_break;
207+
char *old_blkend;
208+
PageHeaderData pageheader;
209+
bool old_lastblk = ((BLCKSZ * (new_blkno + 1)) == src_filesize);
210+
211+
/* Save the page header data */
212+
memcpy(&pageheader, buffer, SizeOfPageHeaderData);
213+
214+
/*
215+
* These old_* variables point to old visibility map page. old_cur
216+
* points to current position on old page. old_blkend points to end of
217+
* old block. old_break points to old page break position for
218+
* rewriting a new page. After wrote a new page, old_break proceeds
219+
* rewriteVmBytesPerPage bytes.
220+
*/
221+
old_cur = buffer + SizeOfPageHeaderData;
222+
old_blkend = buffer + bytesRead;
223+
old_break = old_cur + rewriteVmBytesPerPage;
224+
225+
while (old_blkend >= old_break)
226+
{
227+
char new_vmbuf[BLCKSZ];
228+
char *new_cur = new_vmbuf;
229+
bool empty = true;
230+
bool old_lastpart;
231+
232+
/* Copy page header in advance */
233+
memcpy(new_vmbuf, &pageheader, SizeOfPageHeaderData);
234+
235+
/* Rewrite the last part of the old page? */
236+
old_lastpart = old_lastblk && (old_blkend == old_break);
237+
238+
new_cur += SizeOfPageHeaderData;
239+
240+
/* Process old page bytes one by one, and turn it into new page. */
241+
while (old_break > old_cur)
242+
{
243+
uint16 new_vmbits = 0;
244+
int i;
245+
246+
/* Generate new format bits while keeping old information */
247+
for (i = 0; i < BITS_PER_BYTE; i++)
248+
{
249+
uint8 byte = *(uint8 *) old_cur;
250+
251+
if (byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))
252+
{
253+
empty = false;
254+
new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
255+
}
256+
}
257+
258+
/* Copy new visibility map bit to new format page */
259+
memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK);
260+
261+
old_cur += BITS_PER_HEAPBLOCK_OLD;
262+
new_cur += BITS_PER_HEAPBLOCK;
263+
}
264+
265+
/* If the last part of the old page is empty, skip to write it */
266+
if (old_lastpart && empty)
267+
break;
268+
269+
/* Set new checksum for a visibility map page (if enabled) */
270+
if (old_cluster.controldata.data_checksum_version != 0 &&
271+
new_cluster.controldata.data_checksum_version != 0)
272+
((PageHeader) new_vmbuf)->pd_checksum =
273+
pg_checksum_page(new_vmbuf, new_blkno);
274+
275+
if (write(dst_fd, new_vmbuf, BLCKSZ) != BLCKSZ)
276+
{
277+
close(dst_fd);
278+
close(src_fd);
279+
return getErrorText();
280+
}
281+
282+
old_break += rewriteVmBytesPerPage;
283+
new_blkno++;
284+
}
285+
}
286+
287+
/* Close files */
288+
close(dst_fd);
289+
close(src_fd);
290+
291+
return NULL;
292+
293+
}
294+
141295
void
142296
check_hard_link(void)
143297
{

src/bin/pg_upgrade/pg_upgrade.h

+6
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ extern char *output_files[];
109109
*/
110110
#define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031
111111

112+
/*
113+
* The format of the visibility map was changed by 9.6 commit a892234.
114+
*/
115+
#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011
112116
/*
113117
* pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85,
114118
* ("Improve concurrency of foreign key locking") which also updated catalog
@@ -365,6 +369,8 @@ bool pid_lock_file_exists(const char *datadir);
365369

366370
const char *copyFile(const char *src, const char *dst, bool force);
367371
const char *linkFile(const char *src, const char *dst);
372+
const char *rewriteVisibilityMap(const char *fromfile, const char *tofile,
373+
bool force);
368374

369375
void check_hard_link(void);
370376
FILE *fopen_priv(const char *path, const char *mode);

src/bin/pg_upgrade/relfilenode.c

+37-11
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@
1111

1212
#include "pg_upgrade.h"
1313

14+
#include <sys/stat.h>
1415
#include "catalog/pg_class.h"
1516
#include "access/transam.h"
1617

1718

1819
static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
19-
static void transfer_relfile(FileNameMap *map, const char *suffix);
20+
static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
2021

2122

2223
/*
@@ -132,6 +133,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
132133
{
133134
int mapnum;
134135
bool vm_crashsafe_match = true;
136+
bool vm_must_add_frozenbit = false;
135137

136138
/*
137139
* Do the old and new cluster disagree on the crash-safetiness of the vm
@@ -141,23 +143,30 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
141143
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER)
142144
vm_crashsafe_match = false;
143145

146+
/*
147+
* Do we need to rewrite the visibility map?
148+
*/
149+
if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER &&
150+
new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
151+
vm_must_add_frozenbit = true;
152+
144153
for (mapnum = 0; mapnum < size; mapnum++)
145154
{
146155
if (old_tablespace == NULL ||
147156
strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
148157
{
149158
/* transfer primary file */
150-
transfer_relfile(&maps[mapnum], "");
159+
transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
151160

152161
/* fsm/vm files added in PG 8.4 */
153162
if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
154163
{
155164
/*
156165
* Copy/link any fsm and vm files, if they exist
157166
*/
158-
transfer_relfile(&maps[mapnum], "_fsm");
167+
transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
159168
if (vm_crashsafe_match)
160-
transfer_relfile(&maps[mapnum], "_vm");
169+
transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
161170
}
162171
}
163172
}
@@ -167,17 +176,19 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
167176
/*
168177
* transfer_relfile()
169178
*
170-
* Copy or link file from old cluster to new one.
179+
* Copy or link file from old cluster to new one. If vm_must_add_frozenbit
180+
* is true, visibility map forks are converted and rewritten, even in link
181+
* mode.
171182
*/
172183
static void
173-
transfer_relfile(FileNameMap *map, const char *type_suffix)
184+
transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
174185
{
175186
const char *msg;
176187
char old_file[MAXPGPATH];
177188
char new_file[MAXPGPATH];
178-
int fd;
179189
int segno;
180190
char extent_suffix[65];
191+
struct stat statbuf;
181192

182193
/*
183194
* Now copy/link any related segments as well. Remember, PG breaks large
@@ -210,7 +221,7 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
210221
if (type_suffix[0] != '\0' || segno != 0)
211222
{
212223
/* Did file open fail? */
213-
if ((fd = open(old_file, O_RDONLY, 0)) == -1)
224+
if (stat(old_file, &statbuf) != 0)
214225
{
215226
/* File does not exist? That's OK, just return */
216227
if (errno == ENOENT)
@@ -220,7 +231,10 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
220231
map->nspname, map->relname, old_file, new_file,
221232
getErrorText());
222233
}
223-
close(fd);
234+
235+
/* If file is empty, just return */
236+
if (statbuf.st_size == 0)
237+
return;
224238
}
225239

226240
unlink(new_file);
@@ -232,15 +246,27 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
232246
{
233247
pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
234248

235-
if ((msg = copyFile(old_file, new_file, true)) != NULL)
249+
/* Rewrite visibility map if needed */
250+
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
251+
msg = rewriteVisibilityMap(old_file, new_file, true);
252+
else
253+
msg = copyFile(old_file, new_file, true);
254+
255+
if (msg)
236256
pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
237257
map->nspname, map->relname, old_file, new_file, msg);
238258
}
239259
else
240260
{
241261
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
242262

243-
if ((msg = linkFile(old_file, new_file)) != NULL)
263+
/* Rewrite visibility map if needed */
264+
if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
265+
msg = rewriteVisibilityMap(old_file, new_file, true);
266+
else
267+
msg = linkFile(old_file, new_file);
268+
269+
if (msg)
244270
pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
245271
map->nspname, map->relname, old_file, new_file, msg);
246272
}

0 commit comments

Comments
 (0)