Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 733cf37

Browse files
author
Artur Zakirov
committed
Use pg_probackup.pid file instead of locking pg_probackup.conf
1 parent b67dd1e commit 733cf37

File tree

7 files changed

+178
-87
lines changed

7 files changed

+178
-87
lines changed

backup.c

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,6 @@ do_backup_database(parray *backup_list, bool smooth_checkpoint)
404404
int
405405
do_backup(bool smooth_checkpoint)
406406
{
407-
int ret;
408407
parray *backup_list;
409408
parray *files_database;
410409

@@ -433,12 +432,7 @@ do_backup(bool smooth_checkpoint)
433432
elog(LOG, "----------------------------------------");
434433

435434
/* get exclusive lock of backup catalog */
436-
ret = catalog_lock(true);
437-
if (ret == -1)
438-
elog(ERROR, "cannot lock backup catalog");
439-
else if (ret == 1)
440-
elog(ERROR,
441-
"another pg_probackup is running, skipping this backup");
435+
catalog_lock(true);
442436

443437
/* initialize backup result */
444438
current.status = BACKUP_STATUS_RUNNING;
@@ -509,9 +503,6 @@ do_backup(bool smooth_checkpoint)
509503

510504
pgBackupValidate(&current, false, false);
511505

512-
/* release catalog lock */
513-
catalog_unlock();
514-
515506
return 0;
516507
}
517508

catalog.c

Lines changed: 167 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <dirent.h>
1414
#include <fcntl.h>
1515
#include <libgen.h>
16+
#include <signal.h>
1617
#include <sys/file.h>
1718
#include <sys/stat.h>
1819
#include <sys/types.h>
@@ -26,42 +27,185 @@ static pgBackup *read_backup_from_file(const char *path);
2627

2728
#define BOOL_TO_STR(val) ((val) ? "true" : "false")
2829

29-
static int lock_fd = -1;
30+
static bool exit_hook_registered = false;
31+
static char lock_file[MAXPGPATH];
32+
33+
static void
34+
unlink_lock_atexit(void)
35+
{
36+
int res;
37+
res = unlink(lock_file);
38+
if (res != 0 && res != ENOENT)
39+
elog(WARNING, "%s: %s", lock_file, strerror(errno));
40+
}
3041

3142
/*
32-
* Lock of the catalog with pg_probackup.conf file and return 0.
33-
* If the lock is held by another one, return 1 immediately.
43+
* Create a lockfile.
3444
*/
3545
int
3646
catalog_lock(bool check_catalog)
3747
{
38-
int ret;
39-
char id_path[MAXPGPATH];
40-
41-
join_path_components(id_path, backup_path, BACKUP_CATALOG_CONF_FILE);
42-
lock_fd = open(id_path, O_RDWR);
43-
if (lock_fd == -1)
44-
elog(errno == ENOENT ? ERROR : ERROR,
45-
"cannot open file \"%s\": %s", id_path, strerror(errno));
46-
#ifdef __IBMC__
47-
ret = lockf(lock_fd, LOCK_EX | LOCK_NB, 0); /* non-blocking */
48+
int fd;
49+
char buffer[MAXPGPATH * 2 + 256];
50+
int ntries;
51+
int len;
52+
int encoded_pid;
53+
pid_t my_pid,
54+
my_p_pid;
55+
56+
join_path_components(lock_file, backup_path, BACKUP_CATALOG_PID);
57+
58+
/*
59+
* If the PID in the lockfile is our own PID or our parent's or
60+
* grandparent's PID, then the file must be stale (probably left over from
61+
* a previous system boot cycle). We need to check this because of the
62+
* likelihood that a reboot will assign exactly the same PID as we had in
63+
* the previous reboot, or one that's only one or two counts larger and
64+
* hence the lockfile's PID now refers to an ancestor shell process. We
65+
* allow pg_ctl to pass down its parent shell PID (our grandparent PID)
66+
* via the environment variable PG_GRANDPARENT_PID; this is so that
67+
* launching the postmaster via pg_ctl can be just as reliable as
68+
* launching it directly. There is no provision for detecting
69+
* further-removed ancestor processes, but if the init script is written
70+
* carefully then all but the immediate parent shell will be root-owned
71+
* processes and so the kill test will fail with EPERM. Note that we
72+
* cannot get a false negative this way, because an existing postmaster
73+
* would surely never launch a competing postmaster or pg_ctl process
74+
* directly.
75+
*/
76+
my_pid = getpid();
77+
#ifndef WIN32
78+
my_p_pid = getppid();
4879
#else
49-
ret = flock(lock_fd, LOCK_EX | LOCK_NB); /* non-blocking */
80+
81+
/*
82+
* Windows hasn't got getppid(), but doesn't need it since it's not using
83+
* real kill() either...
84+
*/
85+
my_p_pid = 0;
5086
#endif
51-
if (ret == -1)
87+
88+
/*
89+
* We need a loop here because of race conditions. But don't loop forever
90+
* (for example, a non-writable $backup_path directory might cause a failure
91+
* that won't go away). 100 tries seems like plenty.
92+
*/
93+
for (ntries = 0;; ntries++)
5294
{
53-
if (errno == EWOULDBLOCK)
95+
/*
96+
* Try to create the lock file --- O_EXCL makes this atomic.
97+
*
98+
* Think not to make the file protection weaker than 0600. See
99+
* comments below.
100+
*/
101+
fd = open(lock_file, O_RDWR | O_CREAT | O_EXCL, 0600);
102+
if (fd >= 0)
103+
break; /* Success; exit the retry loop */
104+
105+
/*
106+
* Couldn't create the pid file. Probably it already exists.
107+
*/
108+
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
109+
elog(ERROR, "could not create lock file \"%s\": %s",
110+
lock_file, strerror(errno));
111+
112+
/*
113+
* Read the file to get the old owner's PID. Note race condition
114+
* here: file might have been deleted since we tried to create it.
115+
*/
116+
fd = open(lock_file, O_RDONLY, 0600);
117+
if (fd < 0)
54118
{
55-
close(lock_fd);
56-
return 1;
119+
if (errno == ENOENT)
120+
continue; /* race condition; try again */
121+
elog(ERROR, "could not open lock file \"%s\": %s",
122+
lock_file, strerror(errno));
57123
}
58-
else
124+
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
125+
elog(ERROR, "could not read lock file \"%s\": %s",
126+
lock_file, strerror(errno));
127+
close(fd);
128+
129+
if (len == 0)
130+
elog(ERROR, "lock file \"%s\" is empty", lock_file);
131+
132+
buffer[len] = '\0';
133+
encoded_pid = atoi(buffer);
134+
135+
if (encoded_pid <= 0)
136+
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
137+
lock_file, buffer);
138+
139+
/*
140+
* Check to see if the other process still exists
141+
*
142+
* Per discussion above, my_pid, my_p_pid can be
143+
* ignored as false matches.
144+
*
145+
* Normally kill() will fail with ESRCH if the given PID doesn't
146+
* exist.
147+
*/
148+
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
59149
{
60-
int errno_tmp = errno;
61-
close(lock_fd);
62-
elog(ERROR, "cannot lock file \"%s\": %s", id_path,
63-
strerror(errno_tmp));
150+
if (kill(encoded_pid, 0) == 0 ||
151+
(errno != ESRCH && errno != EPERM))
152+
elog(ERROR, "lock file \"%s\" already exists", lock_file);
64153
}
154+
155+
/*
156+
* Looks like nobody's home. Unlink the file and try again to create
157+
* it. Need a loop because of possible race condition against other
158+
* would-be creators.
159+
*/
160+
if (unlink(lock_file) < 0)
161+
elog(ERROR, "could not remove old lock file \"%s\": %s",
162+
lock_file, strerror(errno));
163+
}
164+
165+
/*
166+
* Successfully created the file, now fill it.
167+
*/
168+
snprintf(buffer, sizeof(buffer), "%d\n", my_pid);
169+
170+
errno = 0;
171+
if (write(fd, buffer, strlen(buffer)) != strlen(buffer))
172+
{
173+
int save_errno = errno;
174+
175+
close(fd);
176+
unlink(lock_file);
177+
/* if write didn't set errno, assume problem is no disk space */
178+
errno = save_errno ? save_errno : ENOSPC;
179+
elog(ERROR, "could not write lock file \"%s\": %s",
180+
lock_file, strerror(errno));
181+
}
182+
if (fsync(fd) != 0)
183+
{
184+
int save_errno = errno;
185+
186+
close(fd);
187+
unlink(lock_file);
188+
errno = save_errno;
189+
elog(ERROR, "could not write lock file \"%s\": %s",
190+
lock_file, strerror(errno));
191+
}
192+
if (close(fd) != 0)
193+
{
194+
int save_errno = errno;
195+
196+
unlink(lock_file);
197+
errno = save_errno;
198+
elog(ERROR, "could not write lock file \"%s\": %s",
199+
lock_file, strerror(errno));
200+
}
201+
202+
/*
203+
* Arrange to unlink the lock file(s) at proc_exit.
204+
*/
205+
if (!exit_hook_registered)
206+
{
207+
atexit(unlink_lock_atexit);
208+
exit_hook_registered = true;
65209
}
66210

67211
if (check_catalog)
@@ -80,16 +224,6 @@ catalog_lock(bool check_catalog)
80224
return 0;
81225
}
82226

83-
/*
84-
* Release catalog lock.
85-
*/
86-
void
87-
catalog_unlock(void)
88-
{
89-
close(lock_fd);
90-
lock_fd = -1;
91-
}
92-
93227
/*
94228
* Create a pgBackup which taken at timestamp.
95229
* If no backup matches, return NULL.

delete.c

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ do_delete(time_t backup_id)
2323
{
2424
int i;
2525
int b_index;
26-
int ret;
2726
parray *backup_list;
2827
pgBackup *last_backup = NULL;
2928

@@ -32,12 +31,7 @@ do_delete(time_t backup_id)
3231
elog(ERROR, "required backup ID not specified");
3332

3433
/* Lock backup catalog */
35-
ret = catalog_lock(false);
36-
if (ret == -1)
37-
elog(ERROR, "can't lock backup catalog.");
38-
else if (ret == 1)
39-
elog(ERROR,
40-
"another pg_probackup is running, stop delete.");
34+
catalog_lock(false);
4135

4236
/* Get complete list of backups */
4337
backup_list = catalog_get_backup_list(0);
@@ -78,15 +72,12 @@ do_delete(time_t backup_id)
7872
pgBackupDeleteFiles(backup);
7973
}
8074

81-
/* release catalog lock */
82-
catalog_unlock();
83-
8475
/* cleanup */
8576
parray_walk(backup_list, pgBackupFree);
8677
parray_free(backup_list);
8778

8879
if (delete_wal)
89-
do_deletewal(backup_id, false);
80+
do_deletewal(backup_id, false, false);
9081

9182
return 0;
9283
}
@@ -97,22 +88,17 @@ do_delete(time_t backup_id)
9788
* found around needs to keep.
9889
*/
9990
int
100-
do_deletewal(time_t backup_id, bool strict)
91+
do_deletewal(time_t backup_id, bool strict, bool need_catalog_lock)
10192
{
10293
size_t i;
103-
int ret;
10494
parray *backup_list;
10595
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
10696
TimeLineID oldest_tli;
10797
bool backup_found = false;
10898

10999
/* Lock backup catalog */
110-
ret = catalog_lock(false);
111-
if (ret == -1)
112-
elog(ERROR, "can't lock backup catalog.");
113-
else if (ret == 1)
114-
elog(ERROR,
115-
"another pg_probackup is running, stop delete.");
100+
if (need_catalog_lock)
101+
catalog_lock(false);
116102

117103
/* Find oldest LSN, used by backups */
118104
backup_list = catalog_get_backup_list(0);
@@ -136,7 +122,6 @@ do_deletewal(time_t backup_id, bool strict)
136122
if (strict && backup_id != 0 && backup_found == false)
137123
elog(ERROR, "not found backup for deletwal command");
138124

139-
catalog_unlock();
140125
parray_walk(backup_list, pgBackupFree);
141126
parray_free(backup_list);
142127

@@ -158,7 +143,6 @@ do_retention_purge(void)
158143
time_t days_threshold = time(NULL) - (retention_window * 60 * 60 * 24);
159144
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
160145
TimeLineID oldest_tli;
161-
int ret;
162146
bool keep_next_backup = true; /* Do not delete first full backup */
163147

164148
if (retention_redundancy > 0)
@@ -170,18 +154,14 @@ do_retention_purge(void)
170154
elog(ERROR, "retention policy is not set");
171155

172156
/* Lock backup catalog */
173-
ret = catalog_lock(false);
174-
if (ret == 1)
175-
elog(ERROR,
176-
"cannot lock backup catalog, another pg_probackup is running");
157+
catalog_lock(false);
177158

178159
/* Get a complete list of backups. */
179160
backup_list = catalog_get_backup_list(0);
180161
if (parray_num(backup_list) == 0)
181162
{
182163
elog(INFO, "backup list is empty");
183164
elog(INFO, "exit");
184-
catalog_unlock();
185165
return 0;
186166
}
187167

@@ -236,8 +216,6 @@ do_retention_purge(void)
236216
parray_walk(backup_list, pgBackupFree);
237217
parray_free(backup_list);
238218

239-
catalog_unlock();
240-
241219
elog(INFO, "purging is finished");
242220

243221
return 0;

pg_probackup.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ main(int argc, char *argv[])
209209
else if (pg_strcasecmp(cmd, "delete") == 0)
210210
return do_delete(backup_id);
211211
else if (pg_strcasecmp(cmd, "delwal") == 0)
212-
return do_deletewal(backup_id, true);
212+
return do_deletewal(backup_id, true, true);
213213
else if (pg_strcasecmp(cmd, "retention") == 0)
214214
{
215215
if (subcmd == NULL)

0 commit comments

Comments
 (0)