Improve performance of timezone loading, especially pg_timezone_names view.

tglsfdc · tglsfdc · commit af2c5aa88d38 · 2017-05-02T21:50:35.000-04:00
tzparse() would attempt to load the "posixrules" timezone database file on each call. That might seem like it would only be an issue when selecting a POSIX-style zone name rather than a zone defined in the timezone database, but it turns out that each zone definition file contains a POSIX-style zone string and tzload() will call tzparse() to parse that. Thus, when scanning the whole timezone file tree as we do in the pg_timezone_names view, "posixrules" was read repetitively for each zone definition file. Fix that by caching the file on first use within any given process. (We cache other zone definitions for the life of the process, so there seems little reason not to cache this one as well.) This probably won't help much in processes that never run pg_timezone_names, but even one additional SET of the timezone GUC would come out ahead. An even worse problem for pg_timezone_names is that pg_open_tzfile() has an inefficient way of identifying the canonical case of a zone name: it basically re-descends the directory tree to the zone file. That's not awful for an individual "SET timezone" operation, but it's pretty horrid when we're inspecting every zone in the database. And it's pointless too because we already know the canonical spelling, having just read it from the filesystem. Fix by teaching pg_open_tzfile() to avoid the directory search if it's not asked for the canonical name, and backfilling the proper result in pg_tzenumerate_next(). In combination these changes seem to make the pg_timezone_names view about 3x faster to read, for me. Since a scan of pg_timezone_names has up to now been one of the slowest queries in the regression tests, this should help some little bit for buildfarm cycle times. Back-patch to all supported branches, not so much because it's likely that users will care much about the view's performance as because tracking changes in the upstream IANA timezone code is really painful if we don't keep all the branches in sync. Discussion: https://postgr.es/m/27962.1493671706@sss.pgh.pa.us
diff --git a/src/timezone/README b/src/timezone/README
@@ -79,6 +79,9 @@ other exposed names.
 slightly modified the API of the former, in part because it now relies
 on our own pg_open_tzfile() rather than opening files for itself.
 
+* tzparse() is adjusted to cache the result of loading the TZDEFRULES
+zone, so that that's not repeated more than once per process.
+
 * There's a fair amount of code we don't need and have removed,
 including all the nonstandard optional APIs.  We have also added
 a few functions of our own at the bottom of localtime.c.
diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c
@@ -54,6 +54,13 @@ static const char gmt[] = "GMT";
 static const pg_time_t time_t_min = MINVAL(pg_time_t, TYPE_BIT(pg_time_t));
 static const pg_time_t time_t_max = MAXVAL(pg_time_t, TYPE_BIT(pg_time_t));
 
+/*
+ * We cache the result of trying to load the TZDEFRULES zone here.
+ * tzdefrules_loaded is 0 if not tried yet, +1 if good, -1 if failed.
+ */
+static struct state tzdefrules_s;
+static int	tzdefrules_loaded = 0;
+
 /*
  * The DST rules to use if TZ has no rules and we can't load TZDEFRULES.
  * We default to US rules as of 1999-08-17.
@@ -942,7 +949,21 @@ tzparse(const char *name, struct state * sp, bool lastditch)
 		charcnt = stdlen + 1;
 		if (sizeof sp->chars < charcnt)
 			return false;
-		load_ok = tzload(TZDEFRULES, NULL, sp, false) == 0;
+
+		/*
+		 * This bit also differs from the IANA code, which doesn't make any
+		 * attempt to avoid repetitive loadings of the TZDEFRULES zone.
+		 */
+		if (tzdefrules_loaded == 0)
+		{
+			if (tzload(TZDEFRULES, NULL, &tzdefrules_s, false) == 0)
+				tzdefrules_loaded = 1;
+			else
+				tzdefrules_loaded = -1;
+		}
+		load_ok = (tzdefrules_loaded > 0);
+		if (load_ok)
+			memcpy(sp, &tzdefrules_s, sizeof(struct state));
 	}
 	if (!load_ok)
 		sp->leapcnt = 0;		/* so, we're off a little */
diff --git a/src/timezone/pgtz.c b/src/timezone/pgtz.c
@@ -80,12 +80,37 @@ pg_open_tzfile(const char *name, char *canonname)
 	int			fullnamelen;
 	int			orignamelen;
 
+	/* Initialize fullname with base name of tzdata directory */
+	strlcpy(fullname, pg_TZDIR(), sizeof(fullname));
+	orignamelen = fullnamelen = strlen(fullname);
+
+	if (fullnamelen + 1 + strlen(name) >= MAXPGPATH)
+		return -1;				/* not gonna fit */
+
+	/*
+	 * If the caller doesn't need the canonical spelling, first just try to
+	 * open the name as-is.  This can be expected to succeed if the given name
+	 * is already case-correct, or if the filesystem is case-insensitive; and
+	 * we don't need to distinguish those situations if we aren't tasked with
+	 * reporting the canonical spelling.
+	 */
+	if (canonname == NULL)
+	{
+		int			result;
+
+		fullname[fullnamelen] = '/';
+		/* test above ensured this will fit: */
+		strcpy(fullname + fullnamelen + 1, name);
+		result = open(fullname, O_RDONLY | PG_BINARY, 0);
+		if (result >= 0)
+			return result;
+		/* If that didn't work, fall through to do it the hard way */
+	}
+
 	/*
 	 * Loop to split the given name into directory levels; for each level,
 	 * search using scan_directory_ci().
 	 */
-	strlcpy(fullname, pg_TZDIR(), sizeof(fullname));
-	orignamelen = fullnamelen = strlen(fullname);
 	fname = name;
 	for (;;)
 	{
@@ -97,8 +122,6 @@ pg_open_tzfile(const char *name, char *canonname)
 			fnamelen = slashptr - fname;
 		else
 			fnamelen = strlen(fname);
-		if (fullnamelen + 1 + fnamelen >= MAXPGPATH)
-			return -1;			/* not gonna fit */
 		if (!scan_directory_ci(fullname, fname, fnamelen,
 							   fullname + fullnamelen + 1,
 							   MAXPGPATH - fullnamelen - 1))
@@ -458,10 +481,11 @@ pg_tzenumerate_next(pg_tzenum *dir)
 
 		/*
 		 * Load this timezone using tzload() not pg_tzset(), so we don't fill
-		 * the cache
+		 * the cache.  Also, don't ask for the canonical spelling: we already
+		 * know it, and pg_open_tzfile's way of finding it out is pretty
+		 * inefficient.
 		 */
-		if (tzload(fullname + dir->baselen, dir->tz.TZname, &dir->tz.state,
-				   true) != 0)
+		if (tzload(fullname + dir->baselen, NULL, &dir->tz.state, true) != 0)
 		{
 			/* Zone could not be loaded, ignore it */
 			continue;
@@ -473,6 +497,10 @@ pg_tzenumerate_next(pg_tzenum *dir)
 			continue;
 		}
 
+		/* OK, return the canonical zone name spelling. */
+		strlcpy(dir->tz.TZname, fullname + dir->baselen,
+				sizeof(dir->tz.TZname));
+
 		/* Timezone loaded OK. */
 		return &dir->tz;
 	}