Commits

Heikki Linnakangas  committed 1f67078

Add OpenTransientFile, with automatic cleanup at end-of-xact.

Files opened with BasicOpenFile or PathNameOpenFile are not automatically
cleaned up on error. That puts unnecessary burden on callers that only want
to keep the file open for a short time. There is AllocateFile, but that
returns a buffered FILE * stream, which in many cases is not the nicest API
to work with. So add function called OpenTransientFile, which returns a
unbuffered fd that's cleaned up like the FILE* returned by AllocateFile().

This plugs a few rare fd leaks in error cases:

1. copy_file() - fixed by by using OpenTransientFile instead of BasicOpenFile
2. XLogFileInit() - fixed by adding close() calls to the error cases. Can't
use OpenTransientFile here because the fd is supposed to persist over
transaction boundaries.
3. lo_import/lo_export - fixed by using OpenTransientFile instead of
PathNameOpenFile.

In addition to plugging those leaks, this replaces many BasicOpenFile() calls
with OpenTransientFile() that were not leaking, because the code meticulously
closed the file on error. That wasn't strictly necessary, but IMHO it's good
for robustness.

The same leaks exist in older versions, but given the rarity of the issues,
I'm not backpatching this. Not yet, anyway - it might be good to backpatch
later, after this mechanism has had some more testing in master branch.

  • Participants
  • Parent commits 5329942

Comments (0)

Files changed (10)

File src/backend/access/transam/slru.c

 		int			i;
 
 		for (i = 0; i < fdata->num_files; i++)
-			close(fdata->fd[i]);
+			CloseTransientFile(fdata->fd[i]);
 	}
 
 	/* Re-acquire control lock and update page state */
 	 * SlruPhysicalWritePage).	Hence, if we are InRecovery, allow the case
 	 * where the file doesn't exist, and return zeroes instead.
 	 */
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
 	if (fd < 0)
 	{
 		if (errno != ENOENT || !InRecovery)
 	{
 		slru_errcause = SLRU_SEEK_FAILED;
 		slru_errno = errno;
-		close(fd);
+		CloseTransientFile(fd);
 		return false;
 	}
 
 	{
 		slru_errcause = SLRU_READ_FAILED;
 		slru_errno = errno;
-		close(fd);
+		CloseTransientFile(fd);
 		return false;
 	}
 
-	if (close(fd))
+	if (CloseTransientFile(fd))
 	{
 		slru_errcause = SLRU_CLOSE_FAILED;
 		slru_errno = errno;
 		 * don't use O_EXCL or O_TRUNC or anything like that.
 		 */
 		SlruFileName(ctl, path, segno);
-		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
-						   S_IRUSR | S_IWUSR);
+		fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
 		if (fd < 0)
 		{
 			slru_errcause = SLRU_OPEN_FAILED;
 		slru_errcause = SLRU_SEEK_FAILED;
 		slru_errno = errno;
 		if (!fdata)
-			close(fd);
+			CloseTransientFile(fd);
 		return false;
 	}
 
 		slru_errcause = SLRU_WRITE_FAILED;
 		slru_errno = errno;
 		if (!fdata)
-			close(fd);
+			CloseTransientFile(fd);
 		return false;
 	}
 
 		{
 			slru_errcause = SLRU_FSYNC_FAILED;
 			slru_errno = errno;
-			close(fd);
+			CloseTransientFile(fd);
 			return false;
 		}
 
-		if (close(fd))
+		if (CloseTransientFile(fd))
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
 			slru_errno = errno;
 			ok = false;
 		}
 
-		if (close(fdata.fd[i]))
+		if (CloseTransientFile(fdata.fd[i]))
 		{
 			slru_errcause = SLRU_CLOSE_FAILED;
 			slru_errno = errno;

File src/backend/access/transam/timeline.c

 	unlink(tmppath);
 
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
-					   S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
+						   S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 	else
 		TLHistoryFilePath(path, parentTLI);
 
-	srcfd = BasicOpenFile(path, O_RDONLY, 0);
+	srcfd = OpenTransientFile(path, O_RDONLY, 0);
 	if (srcfd < 0)
 	{
 		if (errno != ENOENT)
 					 errmsg("could not write to file \"%s\": %m", tmppath)));
 			}
 		}
-		close(srcfd);
+		CloseTransientFile(srcfd);
 	}
 
 	/*
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 
-	if (close(fd))
+	if (CloseTransientFile(fd))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", tmppath)));

File src/backend/access/transam/twophase.c

 
 	/*
 	 * Create the 2PC state file.
-	 *
-	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
-	 * the FD gets closed in any error exit path.  Once we get into the
-	 * critical section, though, it doesn't matter since any failure causes
-	 * PANIC anyway.
 	 */
 	TwoPhaseFilePath(path, xid);
 
-	fd = BasicOpenFile(path,
-					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
-					   S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(path,
+						   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 		COMP_CRC32(statefile_crc, record->data, record->len);
 		if ((write(fd, record->data, record->len)) != record->len)
 		{
-			close(fd);
+			CloseTransientFile(fd);
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not write two-phase state file: %m")));
 
 	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not write two-phase state file: %m")));
 	/* Back up to prepare for rewriting the CRC */
 	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not seek in two-phase state file: %m")));
 	/* write correct CRC and close file */
 	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not write two-phase state file: %m")));
 	}
 
-	if (close(fd) != 0)
+	if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close two-phase state file: %m")));
 
 	TwoPhaseFilePath(path, xid);
 
-	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
 	if (fd < 0)
 	{
 		if (give_warnings)
 	 */
 	if (fstat(fd, &stat))
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		if (give_warnings)
 			ereport(WARNING,
 					(errcode_for_file_access(),
 						sizeof(pg_crc32)) ||
 		stat.st_size > MaxAllocSize)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		return NULL;
 	}
 
 	crc_offset = stat.st_size - sizeof(pg_crc32);
 	if (crc_offset != MAXALIGN(crc_offset))
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		return NULL;
 	}
 
 
 	if (read(fd, buf, stat.st_size) != stat.st_size)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		if (give_warnings)
 			ereport(WARNING,
 					(errcode_for_file_access(),
 		return NULL;
 	}
 
-	close(fd);
+	CloseTransientFile(fd);
 
 	hdr = (TwoPhaseFileHeader *) buf;
 	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
 
 	TwoPhaseFilePath(path, xid);
 
-	fd = BasicOpenFile(path,
-					   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
-					   S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(path,
+						   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 	/* Write content and CRC */
 	if (write(fd, content, len) != len)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not write two-phase state file: %m")));
 	}
 	if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32))
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not write two-phase state file: %m")));
 	 */
 	if (pg_fsync(fd) != 0)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not fsync two-phase state file: %m")));
 	}
 
-	if (close(fd) != 0)
+	if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close two-phase state file: %m")));
 
 		TwoPhaseFilePath(path, xid);
 
-		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
+		fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
 		if (fd < 0)
 		{
 			if (errno == ENOENT)
 
 		if (pg_fsync(fd) != 0)
 		{
-			close(fd);
+			CloseTransientFile(fd);
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync two-phase state file \"%s\": %m",
 							path)));
 		}
 
-		if (close(fd) != 0)
+		if (CloseTransientFile(fd) != 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not close two-phase state file \"%s\": %m",

File src/backend/access/transam/xlog.c

 
 	unlink(tmppath);
 
+	/*
+	 * Allocate a buffer full of zeros. This is done before opening the file
+	 * so that we don't leak the file descriptor if palloc fails.
+	 *
+	 * Note: palloc zbuffer, instead of just using a local char array, to
+	 * ensure it is reasonably well-aligned; this may save a few cycles
+	 * transferring data to the kernel.
+	 */
+	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
+
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
 	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
 					   S_IRUSR | S_IWUSR);
 	 * fsync below) that all the indirect blocks are down on disk.	Therefore,
 	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
 	 * log file.
-	 *
-	 * Note: palloc zbuffer, instead of just using a local char array, to
-	 * ensure it is reasonably well-aligned; this may save a few cycles
-	 * transferring data to the kernel.
 	 */
-	zbuffer = (char *) palloc0(XLOG_BLCKSZ);
 	for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
 	{
 		errno = 0;
 			 * If we fail to make the file, delete it to release disk space
 			 */
 			unlink(tmppath);
+
+			close(fd);
+
 			/* if write didn't set errno, assume problem is no disk space */
 			errno = save_errno ? save_errno : ENOSPC;
 
 	pfree(zbuffer);
 
 	if (pg_fsync(fd) != 0)
+	{
+		close(fd);
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
+	}
 
 	if (close(fd))
 		ereport(ERROR,
 	 * Open the source file
 	 */
 	XLogFilePath(path, srcTLI, srcsegno);
-	srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
+	srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
 	if (srcfd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 	unlink(tmppath);
 
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
-					   S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", tmppath)));
 
-	if (close(fd))
+	if (CloseTransientFile(fd))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", tmppath)));
 
-	close(srcfd);
+	CloseTransientFile(srcfd);
 
 	/*
 	 * Now move the segment into place with its final name.

File src/backend/libpq/be-fsstubs.c

 static Oid
 lo_import_internal(text *filename, Oid lobjOid)
 {
-	File		fd;
+	int			fd;
 	int			nbytes,
 				tmp PG_USED_FOR_ASSERTS_ONLY;
 	char		buf[BUFSIZE];
 	 * open the file to be read in
 	 */
 	text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf));
-	fd = PathNameOpenFile(fnamebuf, O_RDONLY | PG_BINARY, S_IRWXU);
+	fd = OpenTransientFile(fnamebuf, O_RDONLY | PG_BINARY, S_IRWXU);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 	 */
 	lobj = inv_open(oid, INV_WRITE, fscxt);
 
-	while ((nbytes = FileRead(fd, buf, BUFSIZE)) > 0)
+	while ((nbytes = read(fd, buf, BUFSIZE)) > 0)
 	{
 		tmp = inv_write(lobj, buf, nbytes);
 		Assert(tmp == nbytes);
 						fnamebuf)));
 
 	inv_close(lobj);
-	FileClose(fd);
+	CloseTransientFile(fd);
 
 	return oid;
 }
 {
 	Oid			lobjId = PG_GETARG_OID(0);
 	text	   *filename = PG_GETARG_TEXT_PP(1);
-	File		fd;
+	int			fd;
 	int			nbytes,
 				tmp;
 	char		buf[BUFSIZE];
 	 */
 	text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf));
 	oumask = umask(S_IWGRP | S_IWOTH);
-	fd = PathNameOpenFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY,
-						  S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+	fd = OpenTransientFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY,
+						   S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
 	umask(oumask);
 	if (fd < 0)
 		ereport(ERROR,
 	 */
 	while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0)
 	{
-		tmp = FileWrite(fd, buf, nbytes);
+		tmp = write(fd, buf, nbytes);
 		if (tmp != nbytes)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 							fnamebuf)));
 	}
 
-	FileClose(fd);
+	CloseTransientFile(fd);
 	inv_close(lobj);
 
 	PG_RETURN_INT32(1);

File src/backend/storage/file/copydir.c

 	/*
 	 * Open the files
 	 */
-	srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0);
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY, 0);
 	if (srcfd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not open file \"%s\": %m", fromfile)));
 
-	dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
-						  S_IRUSR | S_IWUSR);
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							  S_IRUSR | S_IWUSR);
 	if (dstfd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 		(void) pg_flush_data(dstfd, offset, nbytes);
 	}
 
-	if (close(dstfd))
+	if (CloseTransientFile(dstfd))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", tofile)));
 
-	close(srcfd);
+	CloseTransientFile(srcfd);
 
 	pfree(buffer);
 }
 	 * cases here
 	 */
 	if (!isdir)
-		fd = BasicOpenFile(fname,
-						   O_RDWR | PG_BINARY,
-						   S_IRUSR | S_IWUSR);
+		fd = OpenTransientFile(fname,
+							   O_RDWR | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
 	else
-		fd = BasicOpenFile(fname,
-						   O_RDONLY | PG_BINARY,
-						   S_IRUSR | S_IWUSR);
+		fd = OpenTransientFile(fname,
+							   O_RDONLY | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
 
 	/*
 	 * Some OSs don't allow us to open directories at all (Windows returns
 	/* Some OSs don't allow us to fsync directories at all */
 	if (returncode != 0 && isdir && errno == EBADF)
 	{
-		close(fd);
+		CloseTransientFile(fd);
 		return;
 	}
 
 				(errcode_for_file_access(),
 				 errmsg("could not fsync file \"%s\": %m", fname)));
 
-	close(fd);
+	CloseTransientFile(fd);
 }

File src/backend/storage/file/fd.c

  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
  * may find ourselves short of real file descriptors anyway.
  *
- * This file used to contain a bunch of stuff to support RAID levels 0
- * (jbod), 1 (duplex) and 5 (xor parity).  That stuff is all gone
- * because the parallel query processing code that called it is all
- * gone.  If you really need it you could get it from the original
- * POSTGRES source.
+ * INTERFACE ROUTINES
+ *
+ * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
+ * A File opened with OpenTemporaryFile is automatically deleted when the
+ * File is closed, either explicitly or implicitly at end of transaction or
+ * process exit. PathNameOpenFile is intended for files that are held open
+ * for a long time, like relation files. It is the caller's responsibility
+ * to close them, there is no automatic mechanism in fd.c for that.
+ *
+ * AllocateFile, AllocateDir and OpenTransientFile are wrappers around
+ * fopen(3), opendir(3), and open(2), respectively. They behave like the
+ * corresponding native functions, except that the handle is registered with
+ * the current subtransaction, and will be automatically closed at abort.
+ * These are intended for short operations like reading a configuration file.
+ * and there is a fixed limit on the number files that can be open using these
+ * functions at any one time.
+ *
+ * Finally, BasicOpenFile is a just thin wrapper around open() that can
+ * release file descriptors in use by the virtual file descriptors if
+ * necessary. There is no automatic cleanup of file descriptors returned by
+ * BasicOpenFile, it is solely the caller's responsibility to close the file
+ * descriptor by calling close(2).
+ *
  *-------------------------------------------------------------------------
  */
 
 
 /*
  * Maximum number of file descriptors to open for either VFD entries or
- * AllocateFile/AllocateDir operations.  This is initialized to a conservative
- * value, and remains that way indefinitely in bootstrap or standalone-backend
- * cases.  In normal postmaster operation, the postmaster calls
- * set_max_safe_fds() late in initialization to update the value, and that
- * value is then inherited by forked subprocesses.
+ * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
+ * to a conservative value, and remains that way indefinitely in bootstrap or
+ * standalone-backend cases.  In normal postmaster operation, the postmaster
+ * calls set_max_safe_fds() late in initialization to update the value, and
+ * that value is then inherited by forked subprocesses.
  *
  * Note: the value of max_files_per_process is taken into account while
  * setting this variable, and so need not be tested separately.
 static uint64 temporary_files_size = 0;
 
 /*
- * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
- * and AllocateDir.
+ * List of OS handles opened with AllocateFile, AllocateDir and
+ * OpenTransientFile.
  *
- * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
+ * Since we don't want to encourage heavy use of those functions,
  * it seems OK to put a pretty small maximum limit on the number of
  * simultaneously allocated descs.
  */
 typedef enum
 {
 	AllocateDescFile,
-	AllocateDescDir
+	AllocateDescDir,
+	AllocateDescRawFD
 } AllocateDescKind;
 
 typedef struct
 	{
 		FILE	   *file;
 		DIR		   *dir;
+		int			fd;
 	}			desc;
 	SubTransactionId create_subid;
 } AllocateDesc;
 	return NULL;
 }
 
+
 /*
- * Free an AllocateDesc of either type.
+ * Like AllocateFile, but returns an unbuffered fd like open(2)
+ */
+int
+OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
+{
+	int			fd;
+
+
+	DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
+			   numAllocatedDescs, fileName));
+
+	/*
+	 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+	 * allocatedFiles[]; the test against max_safe_fds prevents BasicOpenFile
+	 * from hogging every one of the available FDs, which'd lead to infinite
+	 * looping.
+	 */
+	if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+		numAllocatedDescs >= max_safe_fds - 1)
+		elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to open file \"%s\"",
+			 fileName);
+
+	fd = BasicOpenFile(fileName, fileFlags, fileMode);
+
+	if (fd >= 0)
+	{
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescRawFD;
+		desc->desc.fd = fd;
+		desc->create_subid = GetCurrentSubTransactionId();
+		numAllocatedDescs++;
+
+		return fd;
+	}
+
+	return -1;					/* failure */
+}
+
+/*
+ * Free an AllocateDesc of any type.
  *
  * The argument *must* point into the allocatedDescs[] array.
  */
 		case AllocateDescDir:
 			result = closedir(desc->desc.dir);
 			break;
+		case AllocateDescRawFD:
+			result = close(desc->desc.fd);
+			break;
 		default:
 			elog(ERROR, "AllocateDesc kind not recognized");
 			result = 0;			/* keep compiler quiet */
 	return fclose(file);
 }
 
+/*
+ * Close a file returned by OpenTransientFile.
+ *
+ * Note we do not check close's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+CloseTransientFile(int fd)
+{
+	int			i;
+
+	DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
+
+	/* Remove fd from list of allocated files, if it's present */
+	for (i = numAllocatedDescs; --i >= 0;)
+	{
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
+			return FreeDesc(desc);
+	}
+
+	/* Only get here if someone passes us a file not in allocatedDescs */
+	elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
+
+	return close(fd);
+}
 
 /*
  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
  * exiting. If that's the case, we should remove all temporary files; if
  * that's not the case, we are being called for transaction commit/abort
  * and should only remove transaction-local temp files.  In either case,
- * also clean up "allocated" stdio files and dirs.
+ * also clean up "allocated" stdio files, dirs and fds.
  */
 static void
 CleanupTempFiles(bool isProcExit)
 		have_xact_temporary_files = false;
 	}
 
-	/* Clean up "allocated" stdio files and dirs. */
+	/* Clean up "allocated" stdio files, dirs and fds. */
 	while (numAllocatedDescs > 0)
 		FreeDesc(&allocatedDescs[0]);
 }

File src/backend/storage/smgr/md.c

 		/* truncate(2) would be easier here, but Windows hasn't got it */
 		int			fd;
 
-		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
+		fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
 		if (fd >= 0)
 		{
 			int			save_errno;
 
 			ret = ftruncate(fd, 0);
 			save_errno = errno;
-			close(fd);
+			CloseTransientFile(fd);
 			errno = save_errno;
 		}
 		else

File src/backend/utils/cache/relmapper.c

 	}
 
 	/* Read data ... */
-	fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(mapfilename,
+						   O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(FATAL,
 				(errcode_for_file_access(),
 				 errmsg("could not read relation mapping file \"%s\": %m",
 						mapfilename)));
 
-	close(fd);
+	CloseTransientFile(fd);
 
 	/* check for correct magic number, etc */
 	if (map->magic != RELMAPPER_FILEMAGIC ||
 	/*
 	 * Open the target file.  We prefer to do this before entering the
 	 * critical section, so that an open() failure need not force PANIC.
-	 *
-	 * Note: since we use BasicOpenFile, we are nominally responsible for
-	 * ensuring the fd is closed on error.	In practice, this isn't important
-	 * because either an error happens inside the critical section, or we are
-	 * in bootstrap or WAL replay; so an error past this point is always fatal
-	 * anyway.
 	 */
 	if (shared)
 	{
 		realmap = &local_map;
 	}
 
-	fd = BasicOpenFile(mapfilename,
-					   O_WRONLY | O_CREAT | PG_BINARY,
-					   S_IRUSR | S_IWUSR);
+	fd = OpenTransientFile(mapfilename,
+						   O_WRONLY | O_CREAT | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not fsync relation mapping file \"%s\": %m",
 						mapfilename)));
 
-	if (close(fd))
+	if (CloseTransientFile(fd))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not close relation mapping file \"%s\": %m",

File src/include/storage/fd.h

  * calls:
  *
  *	File {Close, Read, Write, Seek, Tell, Sync}
- *	{File Name Open, Allocate, Free} File
+ *	{Path Name Open, Allocate, Free} File
  *
  * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
  * Use them for all file activity...
  *
  *	File fd;
- *	fd = FilePathOpenFile("foo", O_RDONLY, 0600);
+ *	fd = PathNameOpenFile("foo", O_RDONLY, 0600);
  *
  *	AllocateFile();
  *	FreeFile();
  * no way for them to share kernel file descriptors with other files.
  *
  * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
- * open directories (DIR*).
+ * open directories (DIR*), and OpenTransientFile/CloseTransient File for an
+ * unbuffered file descriptor.
  */
 #ifndef FD_H
 #define FD_H
 extern struct dirent *ReadDir(DIR *dir, const char *dirname);
 extern int	FreeDir(DIR *dir);
 
+/* Operations to allow use of a plain kernel FD, with automatic cleanup */
+extern int	OpenTransientFile(FileName fileName, int fileFlags, int fileMode);
+extern int	CloseTransientFile(int fd);
+
 /* If you've really really gotta have a plain kernel FD, use this */
 extern int	BasicOpenFile(FileName fileName, int fileFlags, int fileMode);