From 376b5d985873da24c9067833b9e7f9a13c7235af Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 18:39:56 +0800 Subject: [PATCH 01/16] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=96=B0=E5=BB=BAcolum?= =?UTF-8?q?n=20family=E6=93=8D=E4=BD=9C-demo01?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lsm_db.h | 12 +++++++++++- lsm_fdw.c | 4 ++-- lsm_server.cpp | 38 ++++++++++++++++++++++++++++++++++++++ lsm_storage.cpp | 31 +++++++++++++++++++++++++++---- 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/lsm_db.h b/lsm_db.h index 898bda7..73c91c6 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -1,10 +1,15 @@ #ifndef __LSM_DB_H__ #define __LSM_DB_H__ +#include +#include +#include "string.h" + #include "lsm_api.h" #include "lsm_posix.h" #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "rocksdb/slice.h" using namespace rocksdb; @@ -15,7 +20,12 @@ struct LsmConnection { DB* db; - void open(char const* path); + // @todo hr,wu + std::string db_path; //数据库路径 + std::vector column_families; //列族 + std::vector handles; //列族的处理器 + + void open(char const* path); // RocksDB的打开路径 uint64_t count(); void close(); Iterator* getIterator(); diff --git a/lsm_fdw.c b/lsm_fdw.c index fccd04d..c2fec65 100644 --- a/lsm_fdw.c +++ b/lsm_fdw.c @@ -994,7 +994,7 @@ Datum lsm_fdw_handler(PG_FUNCTION_ARGS) routine->GetForeignRelSize = GetForeignRelSize; routine->GetForeignPaths = GetForeignPaths; routine->GetForeignPlan = GetForeignPlan; - routine->BeginForeignScan = BeginForeignScan; + routine->BeginForeignScan = BeginForeignScan; //执行外部表扫描 routine->IterateForeignScan = IterateForeignScan; routine->ReScanForeignScan = ReScanForeignScan; routine->EndForeignScan = EndForeignScan; @@ -1004,7 +1004,7 @@ Datum lsm_fdw_handler(PG_FUNCTION_ARGS) routine->AddForeignUpdateTargets = AddForeignUpdateTargets; routine->PlanForeignModify = PlanForeignModify; routine->BeginForeignModify = BeginForeignModify; - routine->ExecForeignInsert = ExecForeignInsert; + routine->ExecForeignInsert = ExecForeignInsert; //执行插入操作 routine->ExecForeignUpdate = ExecForeignUpdate; routine->ExecForeignDelete = ExecForeignDelete; routine->EndForeignModify = EndForeignModify; diff --git a/lsm_server.cpp b/lsm_server.cpp index 808c615..415e63b 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -404,6 +404,44 @@ LsmServer::open(LsmMessage const& msg) { char path[64]; sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); + + // @todo 根据msg中的key来打开对应的DB + char* key = msg.key; + char* col_family_name = nullptr; //列族名称 + int i = 0; + while(key[i] != '_'){ // 而每个列族的名称为key的第一个下划线之前的字符串 + i++; + } + col_family_name = (char *) malloc((i+10) * sizeof (char )); + strncpy(col_family_name, key, i); + col_family_name[i] = '\0'; + + // 判断这个列族是否存在,不存在就创建列族 + std::vector* column_families = new std::vector; //表示rksdb中所有的列族 + DBOptions* db_options; //数据库库的配置选项 + DB::ListColumnFamilies(db_options, db_path, column_families); + bool isExist = false; + for (int j = 0; j < (*column_families).size(); ++j) { + std::string name = (*column_families)[j]; + if(name == std::string(col_family_name)){ + isExist = true; + break; + } + } + ColumnFamilyOptions* cf_options; //列族的配置选项 + ColumnFamilyHandle *cf; //列族的处理器 + if(!isExist){ + // 不存在此列族,就创建一个对应的列族 + DB::CreateColumnFamily(cf_options, std::string(col_family_name), cf); + } + + // 将默认列族和新创建的列族加入其中 + con.column_families.push_back(ColumnFamilyDescriptor( //打开默认列族 + kDefaultColumnFamilyName, ColumnFamilyOptions())); + con.column_families.push_back(ColumnFamilyDescriptor( //打开新的列族 + std::string(col_family_name), ColumnFamilyOptions())); + + // 打开数据库 con.open(path); } return con; diff --git a/lsm_storage.cpp b/lsm_storage.cpp index 07300ca..fb2be8b 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -3,13 +3,27 @@ // #include "lsm_db.h" + +/** + * @param path rocksdb数据库的路径 + * 如果在使用rocksdb时没有显式使用过列族,就会发现,所有的操作都发生在一个列族中, + * 这个列族名称为default. + */ void LsmConnection::open(char const* path) { + // https://wanghenshui.github.io/rocksdb-doc-cn/doc/Column-Families.html + // ColumnFamilyOptions 用于配置列族,DBOptions用于数据库粒度的配置 + // Options 继承了了ColumnFamilyOptions和DBOptions,因此Options可以执行上述两种配置 Options options; options.create_if_missing = true; - Status s = DB::Open(options, std::string(path), &db); + // @todo hr,wu 数据库路径 + std::string p(path); + db_path = p; // 给LSMConnection中的属性赋值 + + // @todo hr,wu 使用LSMConnection中的关于列族的参数来打开数据库 + Status s = DB::Open(options, std::string(path), column_families, &handles, &db); if (!s.ok()) LsmError(s.getState()); } @@ -17,6 +31,11 @@ LsmConnection::open(char const* path) void LsmConnection::close() { + // @todo 关闭列族 + for (auto handle : handles) { + s = db->DestroyColumnFamilyHandle(handle); + assert(s.ok()); + } delete db; db = NULL; } @@ -75,7 +94,7 @@ LsmConnection::lookup(char const* key, size_t keyLen, char* buf) { std::string sval; ReadOptions ro; - Status s = db->Get(ro, Slice(key, keyLen), &sval); + Status s = db->Get(ro, handles[1], Slice(key, keyLen), &sval); if (!s.ok()) return 0; size_t valLen = sval.length(); @@ -97,7 +116,11 @@ LsmConnection::insert(char* key, size_t keyLen, char* val, size_t valLen) return false; } opts.sync = LsmSync; - s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); + + // @todo hr,wu---真正向rksdb中插入数据--- + // https://wanghenshui.github.io/rocksdb-doc-cn/doc/Column-Families.html + // 插入数据的具体操作 + s = db->Put(opts, handles[1], Slice(key, keyLen), Slice(val, valLen)); return s.ok(); } @@ -106,7 +129,7 @@ LsmConnection::remove(char* key, size_t keyLen) { WriteOptions opts; opts.sync = LsmSync; - Status s = db->Delete(opts, Slice(key, keyLen)); + Status s = db->Delete(opts, handles[1], Slice(key, keyLen)); return s.ok(); } From 8511dd2c799959be4c36e1e0d08d42d367437d3a Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 19:05:40 +0800 Subject: [PATCH 02/16] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86pg=5Frocksdb?= =?UTF-8?q?=E7=9B=B8=E5=85=B3=E7=9A=84=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 8 ++++---- lsm--0.1.sql | 8 ++++---- lsm.conf | 2 +- lsm.control | 4 ++-- lsm_fdw.c | 5 +++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 5e07173..b011fd9 100644 --- a/Makefile +++ b/Makefile @@ -1,22 +1,22 @@ -MODULE_big = lsm +MODULE_big = pg_rocksdb OBJS = lsm_fdw.o lsm_client.o lsm_server.o lsm_posix.o lsm_storage.o lsm_util.o PGFILEDESC = "LSM: log-structured merge-tree" PG_CPPFLAGS += -Wno-declaration-after-statement SHLIB_LINK = -lrocksdb -EXTENSION = lsm +EXTENSION = pg_rocksdb DATA = lsm--0.1.sql REGRESS = create basic test testddl testcopy testcolumn -REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/lsm/lsm.conf +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_rocksdb/pg_rocksdb.conf ifdef USE_PGXS PG_CONFIG ?= pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else -subdir = contrib/lsm +subdir = contrib/pg_rocksdb top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk diff --git a/lsm--0.1.sql b/lsm--0.1.sql index 34bedd0..37e2699 100644 --- a/lsm--0.1.sql +++ b/lsm--0.1.sql @@ -1,8 +1,8 @@ -CREATE FUNCTION lsm_fdw_handler() -RETURNS fdw_handler +CREATE FUNCTION pg_rocksdb_handler() +RETURNS pg_rocksdb_handler AS 'MODULE_PATHNAME' LANGUAGE C STRICT; -CREATE FOREIGN DATA WRAPPER lsm_fdw - HANDLER lsm_fdw_handler; +CREATE FOREIGN DATA WRAPPER pg_rocksdb_fdw + HANDLER pg_rocksdb_handler; diff --git a/lsm.conf b/lsm.conf index c575d2f..57b79dc 100644 --- a/lsm.conf +++ b/lsm.conf @@ -1 +1 @@ -shared_preload_libraries = 'lsm' +shared_preload_libraries = 'pg_rocksdb' diff --git a/lsm.control b/lsm.control index aacc15e..3f36eb3 100644 --- a/lsm.control +++ b/lsm.control @@ -1,5 +1,5 @@ -# LSM FDW +# pg_rocksdb FDW comment = 'RocksDB Foreign Data Wrapper' default_version = '0.1' -module_pathname = '$libdir/lsm' +module_pathname = '$libdir/pg_rocksdb' relocatable = true diff --git a/lsm_fdw.c b/lsm_fdw.c index c2fec65..9a332ee 100644 --- a/lsm_fdw.c +++ b/lsm_fdw.c @@ -32,7 +32,8 @@ PG_MODULE_MAGIC; #endif -PG_FUNCTION_INFO_V1(lsm_fdw_handler); +// 魔法块 +PG_FUNCTION_INFO_V1(pg_rocksdb_handler); static void GetForeignRelSize(PlannerInfo *root, @@ -975,7 +976,7 @@ AnalyzeForeignTable(Relation relation, return false; } -Datum lsm_fdw_handler(PG_FUNCTION_ARGS) +Datum pg_rocksdb_handler(PG_FUNCTION_ARGS) { FdwRoutine *routine = makeNode(FdwRoutine); From ad16816addd8dfc2685b6695008b87bdb9ca6023 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 21:16:19 +0800 Subject: [PATCH 03/16] =?UTF-8?q?=E6=B7=BB=E5=8A=A0DB=E5=A4=B4=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lsm_db.h | 13 +++++++++++++ lsm_server.cpp | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lsm_db.h b/lsm_db.h index 73c91c6..b5e0ad4 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -13,6 +13,19 @@ using namespace rocksdb; +// 添加头文件 +using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; +using ROCKSDB_NAMESPACE::ColumnFamilyHandle; +using ROCKSDB_NAMESPACE::ColumnFamilyOptions; +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DBOptions; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteBatch; +using ROCKSDB_NAMESPACE::WriteOptions; + /* * Wrapper for RocksSB */ diff --git a/lsm_server.cpp b/lsm_server.cpp index 415e63b..d2e8c3e 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -419,7 +419,7 @@ LsmServer::open(LsmMessage const& msg) // 判断这个列族是否存在,不存在就创建列族 std::vector* column_families = new std::vector; //表示rksdb中所有的列族 DBOptions* db_options; //数据库库的配置选项 - DB::ListColumnFamilies(db_options, db_path, column_families); + DB::ListColumnFamilies(db_options, con.db_path, column_families); bool isExist = false; for (int j = 0; j < (*column_families).size(); ++j) { std::string name = (*column_families)[j]; From accc00f282a51266d31132ddb4b21de31a9bc69e Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 21:32:37 +0800 Subject: [PATCH 04/16] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=A4=B4=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lsm_db.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/lsm_db.h b/lsm_db.h index b5e0ad4..683e7b6 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -14,17 +14,7 @@ using namespace rocksdb; // 添加头文件 -using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; -using ROCKSDB_NAMESPACE::ColumnFamilyHandle; -using ROCKSDB_NAMESPACE::ColumnFamilyOptions; -using ROCKSDB_NAMESPACE::DB; -using ROCKSDB_NAMESPACE::DBOptions; -using ROCKSDB_NAMESPACE::Options; -using ROCKSDB_NAMESPACE::ReadOptions; -using ROCKSDB_NAMESPACE::Slice; -using ROCKSDB_NAMESPACE::Status; -using ROCKSDB_NAMESPACE::WriteBatch; -using ROCKSDB_NAMESPACE::WriteOptions; +using ROCKSDB_NAMESPACE; /* * Wrapper for RocksSB From 2f2328b833ade60317f19e5810175ed0011a1d78 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 21:47:03 +0800 Subject: [PATCH 05/16] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=83=A8=E5=88=86bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lsm_db.h | 1 - lsm_server.cpp | 12 ++++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lsm_db.h b/lsm_db.h index 683e7b6..dbb4dce 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -14,7 +14,6 @@ using namespace rocksdb; // 添加头文件 -using ROCKSDB_NAMESPACE; /* * Wrapper for RocksSB diff --git a/lsm_server.cpp b/lsm_server.cpp index d2e8c3e..bb63c36 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -418,7 +418,7 @@ LsmServer::open(LsmMessage const& msg) // 判断这个列族是否存在,不存在就创建列族 std::vector* column_families = new std::vector; //表示rksdb中所有的列族 - DBOptions* db_options; //数据库库的配置选项 + DBOptions db_options; //数据库库的配置选项 DB::ListColumnFamilies(db_options, con.db_path, column_families); bool isExist = false; for (int j = 0; j < (*column_families).size(); ++j) { @@ -432,7 +432,15 @@ LsmServer::open(LsmMessage const& msg) ColumnFamilyHandle *cf; //列族的处理器 if(!isExist){ // 不存在此列族,就创建一个对应的列族 - DB::CreateColumnFamily(cf_options, std::string(col_family_name), cf); + Options options; + options.create_if_missing = true; + // open db + Status s = DB::Open(options, con.db_path, &con.db); + // create column family + s = con.db->CreateColumnFamily(cf_options, std::string(col_family_name), cf); + // close db + s = con.db->DestroyColumnFamilyHandle(cf); + delete con.db; } // 将默认列族和新创建的列族加入其中 From 956f6d8bf95c8c0b28466ef72930851892c775ca Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 22:15:11 +0800 Subject: [PATCH 06/16] =?UTF-8?q?=E4=BF=AE=E6=94=B9.control,.conf,.sql?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E4=B8=BApg=5Frocksdb=E7=9B=B8=E5=85=B3?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 2 +- lsm_server.cpp | 2 +- lsm_storage.cpp | 2 +- lsm--0.1.sql => pg_rocksdb--0.1.sql | 0 lsm.conf => pg_rocksdb.conf | 0 lsm.control => pg_rocksdb.control | 0 6 files changed, 3 insertions(+), 3 deletions(-) rename lsm--0.1.sql => pg_rocksdb--0.1.sql (100%) rename lsm.conf => pg_rocksdb.conf (100%) rename lsm.control => pg_rocksdb.control (100%) diff --git a/Makefile b/Makefile index b011fd9..d2807bc 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ PG_CPPFLAGS += -Wno-declaration-after-statement SHLIB_LINK = -lrocksdb EXTENSION = pg_rocksdb -DATA = lsm--0.1.sql +DATA = pg_rocksdb--0.1.sql REGRESS = create basic test testddl testcopy testcolumn REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_rocksdb/pg_rocksdb.conf diff --git a/lsm_server.cpp b/lsm_server.cpp index bb63c36..13421de 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -437,7 +437,7 @@ LsmServer::open(LsmMessage const& msg) // open db Status s = DB::Open(options, con.db_path, &con.db); // create column family - s = con.db->CreateColumnFamily(cf_options, std::string(col_family_name), cf); + s = con.db->CreateColumnFamily(cf_options, std::string(col_family_name), &cf); // close db s = con.db->DestroyColumnFamilyHandle(cf); delete con.db; diff --git a/lsm_storage.cpp b/lsm_storage.cpp index fb2be8b..76a94ba 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -33,7 +33,7 @@ LsmConnection::close() { // @todo 关闭列族 for (auto handle : handles) { - s = db->DestroyColumnFamilyHandle(handle); + Status s = db->DestroyColumnFamilyHandle(handle); assert(s.ok()); } delete db; diff --git a/lsm--0.1.sql b/pg_rocksdb--0.1.sql similarity index 100% rename from lsm--0.1.sql rename to pg_rocksdb--0.1.sql diff --git a/lsm.conf b/pg_rocksdb.conf similarity index 100% rename from lsm.conf rename to pg_rocksdb.conf diff --git a/lsm.control b/pg_rocksdb.control similarity index 100% rename from lsm.control rename to pg_rocksdb.control From 188e250fed18ccbf520d87088fb66eb58307032e Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 22:34:53 +0800 Subject: [PATCH 07/16] lsm_server.cpp_431 --- lsm_server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lsm_server.cpp b/lsm_server.cpp index 13421de..0053a95 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -428,7 +428,7 @@ LsmServer::open(LsmMessage const& msg) break; } } - ColumnFamilyOptions* cf_options; //列族的配置选项 + ColumnFamilyOptions cf_options; //列族的配置选项 ColumnFamilyHandle *cf; //列族的处理器 if(!isExist){ // 不存在此列族,就创建一个对应的列族 From 825e404930d64a9d42ba63c3485987f6d3dd3608 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <37182837+whrgogogo666@users.noreply.github.com> Date: Mon, 9 May 2022 22:39:28 +0800 Subject: [PATCH 08/16] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 53ce454..2b1607c 100644 --- a/README.md +++ b/README.md @@ -53,19 +53,19 @@ We test this foreign data wrapper on Ubuntu Server 18.04 using PostgreSQL-11 tog - Build this foreign data wrapper: ```sh - git clone git@github.com:postgrespro/lsm.git + git clone https://github.com/whrgogogo666/pg_rocksdb.git - cd lsm + cd pg_rocksdb - make + make USE_PGXS=1 - sudo make install + sudo make USE_PGXS=1 install ``` - Before using this foreign data wrapper, we need to add it to `shared_preload_libraries` in the `postgresql.conf`: ```sh - echo "shared_preload_libraries = 'lsm'" >> postgresql.conf + echo "shared_preload_libraries = 'pg_rocksdb'" >> postgresql.conf ``` and restart PostgreSQL: @@ -103,10 +103,10 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres CREATE DATABASE example; \c example - CREATE EXTENSION lsm; - CREATE SERVER lsm_server FOREIGN DATA WRAPPER lsm_fdw; + CREATE EXTENSION pg_rocksdb; + CREATE SERVER lsm_server FOREIGN DATA WRAPPER pg_rocksdb_fdw; - CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER lsm_server; + CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER pg_rocksdb_server; INSERT INTO student VALUES(20757123, 'Rafferty'); SELECT * FROM student; @@ -122,8 +122,8 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres DROP FOREIGN TABLE student; - DROP SERVER lsm_server; - DROP EXTENSION lsm_fdw; + DROP SERVER pg_rocksdb_server; + DROP EXTENSION pg_rocksdb_fdw; \c postgres DROP DATABASE example; From 8fecb3cfa06153fbf22de73392b3064f0e93b8d6 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 22:41:43 +0800 Subject: [PATCH 09/16] lsm_server.cpp_431 --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 53ce454..06f20ee 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build Status](https://travis-ci.com/vidardb/PostgresForeignDataWrapper.svg?branch=master)](https://travis-ci.com/github/vidardb/PostgresForeignDataWrapper) -This PostgreSQL extension implements a Foreign Data Wrapper (FDW) for [RocksDB](https://rocksdb.org/). This repo has been listed in PostgreSQL [wiki](https://wiki.postgresql.org/wiki/Foreign_data_wrappers). +This PostgreSQL extension implements a Foreign Data Wrapper (FDW) for [RocksDB](https://rocksdb.org/). This repo has been listed in PostgreSQL [wiki](https://wiki.postgresql.org/wiki/Foreign_data_wrappers). RocksDB is a high performance key-value store based on a log-structured merge-tree (LSM tree). RocksDB can efficiently use many CPU cores and fast storage. This is the first foreign data wrapper that connects a LSM-tree-based storage engine to PostgreSQL. Because RocksDB is an embeddable key-value store, you do not need to run another server to use this extension. @@ -53,19 +53,19 @@ We test this foreign data wrapper on Ubuntu Server 18.04 using PostgreSQL-11 tog - Build this foreign data wrapper: ```sh - git clone git@github.com:postgrespro/lsm.git + git clone https://github.com/whrgogogo666/pg_rocksdb.git - cd lsm + cd pg_rocksdb - make + make USE_PGXS=1 - sudo make install + sudo make USE_PGXS=1 install ``` - Before using this foreign data wrapper, we need to add it to `shared_preload_libraries` in the `postgresql.conf`: ```sh - echo "shared_preload_libraries = 'lsm'" >> postgresql.conf + echo "shared_preload_libraries = 'pg_rocksdb'" >> postgresql.conf ``` and restart PostgreSQL: @@ -90,7 +90,7 @@ We test this foreign data wrapper on Ubuntu Server 18.04 using PostgreSQL-11 tog - ACID relies on the storage engine. -- Data types of Postgres are not natively supported. +- Data types of Postgres are not natively supported. # Usage @@ -103,10 +103,10 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres CREATE DATABASE example; \c example - CREATE EXTENSION lsm; - CREATE SERVER lsm_server FOREIGN DATA WRAPPER lsm_fdw; + CREATE EXTENSION pg_rocksdb; + CREATE SERVER lsm_server FOREIGN DATA WRAPPER pg_rocksdb_fdw; - CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER lsm_server; + CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER pg_rocksdb_server; INSERT INTO student VALUES(20757123, 'Rafferty'); SELECT * FROM student; @@ -122,8 +122,8 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres DROP FOREIGN TABLE student; - DROP SERVER lsm_server; - DROP EXTENSION lsm_fdw; + DROP SERVER pg_rocksdb_server; + DROP EXTENSION pg_rocksdb_fdw; \c postgres DROP DATABASE example; @@ -147,7 +147,7 @@ We have tested certain typical SQL statements and will add more test cases later sudo -u postgres psql -U postgres -d lsmtest -a -f test/sql/clear.sql ``` -# Debug +# Debug If you want to debug the source code, you may need to start PostgreSQL in the debug mode: From 44c5ca744767dc2a89bad149081ce3f72f684792 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 22:55:33 +0800 Subject: [PATCH 10/16] LSM_FDW_NAME"pg_rocksdb" --- lsm_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lsm_api.h b/lsm_api.h index 3c77e1a..64524f3 100644 --- a/lsm_api.h +++ b/lsm_api.h @@ -21,7 +21,7 @@ extern "C" { /* * Name of the directory in $PGDATA */ -#define LSM_FDW_NAME "lsm" +#define LSM_FDW_NAME "pg_rocksdb" extern int LsmQueueSize; extern bool LsmSync; From a6379c84a1a6d1232e55e8afad171f0369247968 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Mon, 9 May 2022 23:14:14 +0800 Subject: [PATCH 11/16] RETURNS fdw_handler --- pg_rocksdb--0.1.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pg_rocksdb--0.1.sql b/pg_rocksdb--0.1.sql index 37e2699..17400a6 100644 --- a/pg_rocksdb--0.1.sql +++ b/pg_rocksdb--0.1.sql @@ -1,5 +1,6 @@ CREATE FUNCTION pg_rocksdb_handler() -RETURNS pg_rocksdb_handler +-- 底下必须返回的是fdw_handler,而不是其他的 +RETURNS fdw_handler AS 'MODULE_PATHNAME' LANGUAGE C STRICT; From b88ffc551dd7c82d49961391ec7acb75bf75be68 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Tue, 17 May 2022 13:50:20 +0800 Subject: [PATCH 12/16] RETURNS fdw_handler --- lsm_server.cpp | 45 --------------------------------------------- lsm_storage.cpp | 4 ++-- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/lsm_server.cpp b/lsm_server.cpp index 0053a95..8499035 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -404,51 +404,6 @@ LsmServer::open(LsmMessage const& msg) { char path[64]; sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); - - // @todo 根据msg中的key来打开对应的DB - char* key = msg.key; - char* col_family_name = nullptr; //列族名称 - int i = 0; - while(key[i] != '_'){ // 而每个列族的名称为key的第一个下划线之前的字符串 - i++; - } - col_family_name = (char *) malloc((i+10) * sizeof (char )); - strncpy(col_family_name, key, i); - col_family_name[i] = '\0'; - - // 判断这个列族是否存在,不存在就创建列族 - std::vector* column_families = new std::vector; //表示rksdb中所有的列族 - DBOptions db_options; //数据库库的配置选项 - DB::ListColumnFamilies(db_options, con.db_path, column_families); - bool isExist = false; - for (int j = 0; j < (*column_families).size(); ++j) { - std::string name = (*column_families)[j]; - if(name == std::string(col_family_name)){ - isExist = true; - break; - } - } - ColumnFamilyOptions cf_options; //列族的配置选项 - ColumnFamilyHandle *cf; //列族的处理器 - if(!isExist){ - // 不存在此列族,就创建一个对应的列族 - Options options; - options.create_if_missing = true; - // open db - Status s = DB::Open(options, con.db_path, &con.db); - // create column family - s = con.db->CreateColumnFamily(cf_options, std::string(col_family_name), &cf); - // close db - s = con.db->DestroyColumnFamilyHandle(cf); - delete con.db; - } - - // 将默认列族和新创建的列族加入其中 - con.column_families.push_back(ColumnFamilyDescriptor( //打开默认列族 - kDefaultColumnFamilyName, ColumnFamilyOptions())); - con.column_families.push_back(ColumnFamilyDescriptor( //打开新的列族 - std::string(col_family_name), ColumnFamilyOptions())); - // 打开数据库 con.open(path); } diff --git a/lsm_storage.cpp b/lsm_storage.cpp index 76a94ba..edc2bca 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -120,7 +120,7 @@ LsmConnection::insert(char* key, size_t keyLen, char* val, size_t valLen) // @todo hr,wu---真正向rksdb中插入数据--- // https://wanghenshui.github.io/rocksdb-doc-cn/doc/Column-Families.html // 插入数据的具体操作 - s = db->Put(opts, handles[1], Slice(key, keyLen), Slice(val, valLen)); + s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); return s.ok(); } @@ -129,7 +129,7 @@ LsmConnection::remove(char* key, size_t keyLen) { WriteOptions opts; opts.sync = LsmSync; - Status s = db->Delete(opts, handles[1], Slice(key, keyLen)); + Status s = db->Delete(opts, Slice(key, keyLen)); return s.ok(); } From de2b07601a0701de1b8c45121d955080fd89fe47 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Tue, 17 May 2022 14:10:37 +0800 Subject: [PATCH 13/16] RETURNS fdw_handler --- lsm_storage.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lsm_storage.cpp b/lsm_storage.cpp index edc2bca..1ccadd9 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -23,7 +23,7 @@ LsmConnection::open(char const* path) db_path = p; // 给LSMConnection中的属性赋值 // @todo hr,wu 使用LSMConnection中的关于列族的参数来打开数据库 - Status s = DB::Open(options, std::string(path), column_families, &handles, &db); + Status s = DB::Open(options, std::string(path), &db); if (!s.ok()) LsmError(s.getState()); } @@ -31,11 +31,6 @@ LsmConnection::open(char const* path) void LsmConnection::close() { - // @todo 关闭列族 - for (auto handle : handles) { - Status s = db->DestroyColumnFamilyHandle(handle); - assert(s.ok()); - } delete db; db = NULL; } From 1f6159b5f3a0c7c0ff7da0420fece0ccdfddf025 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <37182837+whrgogogo666@users.noreply.github.com> Date: Tue, 17 May 2022 14:21:13 +0800 Subject: [PATCH 14/16] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 06f20ee..d7586a4 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres \c example CREATE EXTENSION pg_rocksdb; - CREATE SERVER lsm_server FOREIGN DATA WRAPPER pg_rocksdb_fdw; + CREATE SERVER pg_rocksdb_server FOREIGN DATA WRAPPER pg_rocksdb_fdw; CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER pg_rocksdb_server; From f04ae4843d0e994bb05ebebdfc9e86335fc3a7b6 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Tue, 17 May 2022 15:07:19 +0800 Subject: [PATCH 15/16] RETURNS fdw_handler --- lsm_client.cpp | 176 +++++++-------- lsm_db.h | 228 ++++++++++---------- lsm_fdw.c | 328 ++++++++++++++++------------ lsm_fdw.h | 2 +- lsm_server.cpp | 506 +++++++++++++++++++++++--------------------- lsm_storage.cpp | 98 ++++----- lsm_util.c | 227 ++++++++++---------- pg_rocksdb--0.1.sql | 4 +- 8 files changed, 823 insertions(+), 746 deletions(-) diff --git a/lsm_client.cpp b/lsm_client.cpp index 05ffea2..243ab7a 100644 --- a/lsm_client.cpp +++ b/lsm_client.cpp @@ -9,132 +9,136 @@ LsmQueue** queues; size_t LsmShmemSize(int maxClients) { - return (sizeof(LsmQueue) + LsmQueueSize + sizeof(LsmQueue*)) * maxClients; + return (sizeof(LsmQueue) + LsmQueueSize + sizeof(LsmQueue*)) * maxClients; } void LsmInitialize(void* ctl, int maxClients) { - queues = (LsmQueue**)ctl; - char* ptr = (char*)(queues + maxClients); - for (int i = 0; i < maxClients; i++) - { - LsmQueue* queue = (LsmQueue*)ptr; - ptr += sizeof(LsmQueue) + LsmQueueSize; - queue->getPos = 0; - queue->putPos = 0; - queue->writerBlocked = false; - SemInit(&queue->empty, 1, 0); - SemInit(&queue->full, 1, 0); - SemInit(&queue->ready, 1, 0); - queues[i] = queue; - } + queues = (LsmQueue**)ctl; + char* ptr = (char*)(queues + maxClients); + for (int i = 0; i < maxClients; i++) + { + LsmQueue* queue = (LsmQueue*)ptr; + ptr += sizeof(LsmQueue) + LsmQueueSize; + queue->getPos = 0; + queue->putPos = 0; + queue->writerBlocked = false; + SemInit(&queue->empty, 1, 0); + SemInit(&queue->full, 1, 0); + SemInit(&queue->ready, 1, 0); + queues[i] = queue; + } } void LsmAttach(void* ctl) { - queues = (LsmQueue**)ctl; + queues = (LsmQueue**)ctl; } bool LsmDelete(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpDelete; - msg.hdr.rid = rid; - msg.hdr.keySize = keyLen; - msg.hdr.valueSize = 0; - msg.key = key; - queue->put(msg); - if (LsmSync) - { - SemWait(&queue->ready); - return (bool)queue->resp[0]; - } - return true; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpDelete; + msg.hdr.rid = rid; + msg.hdr.keySize = keyLen; + msg.hdr.valueSize = 0; + msg.key = key; + queue->put(msg); + if (LsmSync) + { + SemWait(&queue->ready); + return (bool)queue->resp[0]; + } + return true; } + +// 将传入的数据以LsmMessage进行传输 bool LsmInsert(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen, char *val, size_t valLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpInsert; - msg.hdr.rid = rid; - msg.hdr.keySize = keyLen; - msg.hdr.valueSize = valLen; - msg.key = key; - msg.value = val; - queue->put(msg); - if (LsmSync) - { - SemWait(&queue->ready); - return (bool)queue->resp[0]; - } - return true; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpInsert; + msg.hdr.rid = rid; //外部表oid + msg.hdr.keySize = keyLen; //key的长度 + msg.hdr.valueSize = valLen; //value的长度 + msg.key = key; + msg.value = val; + queue->put(msg); + if (LsmSync) + { + SemWait(&queue->ready); + return (bool)queue->resp[0]; + } + return true; } + + uint64_t LsmCount(LsmQueueId qid, LsmRelationId rid) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpCount; - msg.hdr.rid = rid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); - SemWait(&queue->ready); - return *(uint64_t*)queue->resp; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpCount; + msg.hdr.rid = rid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); + SemWait(&queue->ready); + return *(uint64_t*)queue->resp; } void LsmCloseCursor(LsmQueueId qid, LsmRelationId rid, LsmCursorId cid) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpCloseCursor; - msg.hdr.rid = rid; - msg.hdr.cid = cid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpCloseCursor; + msg.hdr.rid = rid; + msg.hdr.cid = cid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); } bool LsmReadNext(LsmQueueId qid, LsmRelationId rid, LsmCursorId cid, char *buf, size_t *size) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpFetch; - msg.hdr.rid = rid; - msg.hdr.cid = cid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); - SemWait(&queue->ready); - memcpy(buf, queue->resp, queue->respSize); - *size = queue->respSize; - return *size != 0; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpFetch; + msg.hdr.rid = rid; + msg.hdr.cid = cid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); + SemWait(&queue->ready); + memcpy(buf, queue->resp, queue->respSize); + *size = queue->respSize; + return *size != 0; } bool LsmLookup(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen, char *val, size_t *valLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpLookup; - msg.hdr.rid = rid; - msg.hdr.keySize = keyLen; - msg.hdr.valueSize = 0; - msg.key = key; - queue->put(msg); - SemWait(&queue->ready); - memcpy(val, queue->resp, queue->respSize); - *valLen = queue->respSize; - return *valLen != 0; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpLookup; + msg.hdr.rid = rid; + msg.hdr.keySize = keyLen; + msg.hdr.valueSize = 0; + msg.key = key; + queue->put(msg); + SemWait(&queue->ready); + memcpy(val, queue->resp, queue->respSize); + *valLen = queue->respSize; + return *valLen != 0; } diff --git a/lsm_db.h b/lsm_db.h index dbb4dce..c6d7823 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -1,44 +1,37 @@ #ifndef __LSM_DB_H__ #define __LSM_DB_H__ -#include -#include -#include "string.h" - #include "lsm_api.h" #include "lsm_posix.h" #include "rocksdb/db.h" #include "rocksdb/options.h" -#include "rocksdb/slice.h" using namespace rocksdb; -// 添加头文件 +/** + * 对Rocksdb的封装 + * + * / /* - * Wrapper for RocksSB + * Wrapper for RocksDB */ struct LsmConnection { - DB* db; - - // @todo hr,wu - std::string db_path; //数据库路径 - std::vector column_families; //列族 - std::vector handles; //列族的处理器 - - void open(char const* path); // RocksDB的打开路径 - uint64_t count(); - void close(); - Iterator* getIterator(); - void releaseIterator(Iterator* iter); - size_t lookup(char const* key, size_t keySize, char* buf); - size_t next(Iterator* iter, char* buf); - bool insert(char* key, size_t keyLen, char* val, size_t valLen); - bool remove(char* key, size_t keyLen); - - LsmConnection() : db(NULL) {} - ~LsmConnection() { close(); } + DB* db; + + void open(char const* path); + uint64_t count(); + void close(); + Iterator* getIterator(); + void releaseIterator(Iterator* iter); + size_t lookup(char const* key, size_t keySize, char* buf); + size_t next(Iterator* iter, char* buf); + bool insert(char* key, size_t keyLen, char* val, size_t valLen); + bool remove(char* key, size_t keyLen); + + LsmConnection() : db(NULL) {} + ~LsmConnection() { close(); } }; /* @@ -46,136 +39,143 @@ struct LsmConnection */ struct LsmMessageHeader { - LsmOperation op; - uint32_t keySize; - uint32_t valueSize; - LsmRelationId rid; - LsmCursorId cid; + LsmOperation op; + uint32_t keySize; + uint32_t valueSize; + LsmRelationId rid; + LsmCursorId cid; }; /* * Protocol message */ +// 用于包装传输的数据 struct LsmMessage { - LsmMessageHeader hdr; - char* key; - char* value; + LsmMessageHeader hdr; + char* key; + char* value; }; /* * Queue for tranferring data between backend and LSM worker thread. */ +// 用于将数据传送到rocksdb中 struct LsmQueue { - volatile int getPos; // get position in ring buffer (updated only by consumer) - volatile int putPos; // put position in ring buffer (updated only by producer) - volatile int respSize; // response size - volatile int writerBlocked; // producer is blocked because queue is full - volatile int terminate;// worker receives termination request - sem_t empty; // semaphore to wait until queue is not empty - sem_t full; // semaphore to wait until queue is not full - sem_t ready; // semaphore to wait response from server - char resp[LSM_MAX_RECORD_SIZE]; // response data - char req[1]; // ring buffer (LsmQueueSize long) - - void put(LsmMessage const& msg); - void get(char* buf, LsmMessage& msg); - void next(LsmMessage const& msg); - - LsmQueue() : getPos(0), putPos(0), respSize(0), writerBlocked(false) {} + // ring buffer 环状缓冲区 + volatile int getPos; // get position in ring buffer (updated only by consumer) + volatile int putPos; // put position in ring buffer (updated only by producer) + volatile int respSize; // response size + volatile int writerBlocked; // producer is blocked because queue is full + volatile int terminate;// worker receives termination request + sem_t empty; // semaphore to wait until queue is not empty + sem_t full; // semaphore to wait until queue is not full + sem_t ready; // semaphore to wait response from server + char resp[LSM_MAX_RECORD_SIZE]; // response data + char req[1]; // ring buffer (LsmQueueSize long) + + void put(LsmMessage const& msg); + void get(char* buf, LsmMessage& msg); + void next(LsmMessage const& msg); + + LsmQueue() : getPos(0), putPos(0), respSize(0), writerBlocked(false) {} }; struct LsmCursor { - LsmConnection* con; - Iterator* iter; + LsmConnection* con; + Iterator* iter; - LsmCursor() : con(NULL), iter(NULL) {} + LsmCursor() : con(NULL), iter(NULL) {} }; struct LsmServer; + +// 主要封装了对lsm的操作,也就是lsmServer中的一个工作进程 struct LsmWorker { - std::map cursors; - LsmServer* server; - LsmQueue* queue; - pthread_t thread; + std::map cursors; + LsmServer* server; // 一个server有很多的worker + LsmQueue* queue; //一个worker对应一个queue + pthread_t thread; - LsmWorker(LsmServer* s, LsmQueue* q) : server(s), queue(q) {} + LsmWorker(LsmServer* s, LsmQueue* q) : server(s), queue(q) {} - void start(); - void stop(); - void run(); - void wait(); + void start(); + void stop(); + void run(); + void wait(); - private: - LsmConnection& open(LsmMessage const& msg); +private: + LsmConnection& open(LsmMessage const& msg); - void insert(LsmMessage const& msg); - void remove(LsmMessage const& msg); - void closeCursor(LsmMessage const& msg); - void fetch(LsmMessage const& msg); - void count(LsmMessage const& msg); - void lookup(LsmMessage const& msg); + void insert(LsmMessage const& msg); + void remove(LsmMessage const& msg); + void closeCursor(LsmMessage const& msg); + void fetch(LsmMessage const& msg); + void count(LsmMessage const& msg); + void lookup(LsmMessage const& msg); - static void* main(void* arg); + static void* main(void* arg); }; class Mutex { - pthread_mutex_t mutex; - public: - Mutex() - { - PthreadMutexInit(&mutex); - } - - ~Mutex() - { - PthreadMutexDestroy(&mutex); - } - - void lock() - { - PthreadMutexLock(&mutex); - } - - void unlock() - { - PthreadMutexUnlock(&mutex); - } + pthread_mutex_t mutex; +public: + Mutex() + { + PthreadMutexInit(&mutex); + } + + ~Mutex() + { + PthreadMutexDestroy(&mutex); + } + + void lock() + { + PthreadMutexLock(&mutex); + } + + void unlock() + { + PthreadMutexUnlock(&mutex); + } }; class CriticalSection { - Mutex& mutex; - public: - CriticalSection(Mutex& m) : mutex(m) - { - mutex.lock(); - } - ~CriticalSection() - { - mutex.unlock(); - } + Mutex& mutex; +public: + CriticalSection(Mutex& m) : mutex(m) + { + mutex.lock(); + } + ~CriticalSection() + { + mutex.unlock(); + } }; + +// 此结构体包含了很多对rocksdb的操作,非常重要,是一个对于rocksdb封装的最大对象,包含了很多的LsmWorker struct LsmServer { - LsmWorker** workers; - size_t nWorkers; - Mutex mutex; - std::map connections; - - void start(); - void wait(); - void stop(); - - LsmConnection& open(LsmMessage const& msg); - LsmServer(size_t maxClients); - ~LsmServer(); + LsmWorker** workers; + size_t nWorkers; + Mutex mutex; + std::map connections; + + void start(); + void wait(); + void stop(); + + LsmConnection& open(LsmMessage const& msg); + LsmServer(size_t maxClients); + ~LsmServer(); }; extern LsmQueue** queues; diff --git a/lsm_fdw.c b/lsm_fdw.c index 9a332ee..c9d4c03 100644 --- a/lsm_fdw.c +++ b/lsm_fdw.c @@ -32,13 +32,22 @@ PG_MODULE_MAGIC; #endif -// 魔法块 -PG_FUNCTION_INFO_V1(pg_rocksdb_handler); +PG_FUNCTION_INFO_V1(pg_rocksdb_fdw_handler); + +/* +root是规划器的关于该查询的全局信息 +baserel是规划器的关于该表的信息 +foreigntableid是外部表在pg_class中的 OID (foreigntableid可以从规划器的数据结构中获得,但是为了减少工作量,这里直接显式地将它传递给函数)。 +*/ + +// 这些hook函数的参数都是系统定义好的 +// 获取外部表格的size +// baserel 是planner中关于外部表格的信息 static void GetForeignRelSize(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId) + RelOptInfo *baserel, + Oid foreignTableId) { /* * Obtain relation size estimates for a foreign table. This is called at @@ -64,13 +73,16 @@ GetForeignRelSize(PlannerInfo *root, * we should open & close db multiple times. */ /* TODO: better estimation */ + // baserel is the planer's informatino about this table baserel->rows = LsmCount(MyBackendId, foreignTableId); } + +// 创建一个扫描外部表的访问路径 static void GetForeignPaths(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId) + RelOptInfo *baserel, + Oid foreignTableId) { /* * Create possible access paths for a scan on a foreign table. This is @@ -87,12 +99,13 @@ GetForeignPaths(PlannerInfo *root, * that is needed to identify the specific scan method intended. */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); Cost startupCost = 0; Cost totalCost = startupCost + baserel->rows; /* Create a ForeignPath node and add it as only possible path */ + // https://doxygen.postgresql.org/pathnode_8c.html#a20b2c8a564bb57ed4187825dec56f707 add_path(baserel, (Path *) create_foreignscan_path(root, baserel, @@ -106,14 +119,16 @@ GetForeignPaths(PlannerInfo *root, NIL)); /* no fdw_private data */ } + +// 创建一个ForeignScan 计划的节点,从选择的外部acess path中创建 static ForeignScan* GetForeignPlan(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId, - ForeignPath *bestPath, - List *targetList, - List *scanClauses, - Plan *outerPlan) + RelOptInfo *baserel, + Oid foreignTableId, + ForeignPath *bestPath, + List *targetList, + List *scanClauses, // + Plan *outerPlan) { /* * Create a ForeignScan plan node from the selected foreign access path. @@ -137,6 +152,7 @@ GetForeignPlan(PlannerInfo *root, * handled elsewhere). */ + // clause 从句 scanClauses = extract_actual_clauses(scanClauses, false); /* Create the ForeignScan node */ @@ -152,9 +168,9 @@ GetForeignPlan(PlannerInfo *root, static void GetKeyBasedQual(ForeignScanState *scanState, - Node *node, - Relation relation, - TableReadState *readState) + Node *node, + Relation relation, + TableReadState *readState) { if (!node || !IsA(node, OpExpr)) { return; @@ -171,9 +187,9 @@ GetKeyBasedQual(ForeignScanState *scanState, } Node *right = list_nth(op->args, 1); - if (IsA(right, RelabelType)) { - right = (Node*) ((RelabelType*)right)->arg; - } + if (IsA(right, RelabelType)) { + right = (Node*) ((RelabelType*)right)->arg; + } if (!IsA(right, Const) && !IsA(right, Param)) { return; } @@ -199,20 +215,20 @@ GetKeyBasedQual(ForeignScanState *scanState, ReleaseSysCache(opertup); Datum keyDatum; - Oid keyType; + Oid keyType; - if (IsA(right, Const)) - { - Const *constNode = (Const *) right; - keyDatum = constNode->constvalue; - keyType = constNode->consttype; - } - else - { - Param *paramNode = (Param *) right; - keyType = paramNode->paramtype; - keyDatum = scanState->ss.ps.state->es_param_list_info->params[paramNode->paramid-1].value; - } + if (IsA(right, Const)) + { + Const *constNode = (Const *) right; + keyDatum = constNode->constvalue; + keyType = constNode->consttype; + } + else + { + Param *paramNode = (Param *) right; + keyType = paramNode->paramtype; + keyDatum = scanState->ss.ps.state->es_param_list_info->params[paramNode->paramid-1].value; + } TypeCacheEntry *typeEntry = lookup_type_cache(keyType, 0); /* constant gets varlena with 4B header, same with copy uility */ @@ -232,10 +248,13 @@ GetKeyBasedQual(ForeignScanState *scanState, return; } + +// 开始执行外部表格的扫描 static void -BeginForeignScan(ForeignScanState *scanState, int executorFlags) +BeginForeignScan(ForeignScanState *scanState, + int executorFlags) { - static LsmCursorId operationId = 0; /* a SQL might cause multiple scans */ + static LsmCursorId operationId = 0; /* a SQL might cause multiple scans */ /* * Begin executing a foreign scan. This is called during executor startup. @@ -256,7 +275,7 @@ BeginForeignScan(ForeignScanState *scanState, int executorFlags) * */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); TableReadState *readState = palloc0(sizeof(TableReadState)); readState->isKeyBased = false; @@ -276,7 +295,7 @@ BeginForeignScan(ForeignScanState *scanState, int executorFlags) foreach (lc, scanState->ss.ps.plan->qual) { Expr *state = lfirst(lc); GetKeyBasedQual(scanState, - (Node *) state, + (Node *) state, scanState->ss.ss_currentRelation, readState); if (readState->isKeyBased) { @@ -285,14 +304,14 @@ BeginForeignScan(ForeignScanState *scanState, int executorFlags) } if (!readState->isKeyBased) - { + { Oid relationId = RelationGetRelid(scanState->ss.ss_currentRelation); readState->hasNext = LsmReadNext(MyBackendId, - relationId, - ++operationId, - readState->buf, - &readState->bufLen); + relationId, + ++operationId, + readState->buf, + &readState->bufLen); readState->next = readState->buf; readState->operationId = operationId; @@ -301,8 +320,8 @@ BeginForeignScan(ForeignScanState *scanState, int executorFlags) static void DeserializeTuple(StringInfo key, - StringInfo val, - TupleTableSlot *tupleSlot) + StringInfo val, + TupleTableSlot *tupleSlot) { Datum *values = tupleSlot->tts_values; @@ -318,9 +337,9 @@ DeserializeTuple(StringInfo key, int offset = 0; char *current = key->data; for (int index = 0; index < count; index++) - { + { if (index > 0) - { + { uint64 dataLen = 0; uint8 headerLen = DecodeVarintLength(current, val->data + val->len, @@ -328,7 +347,7 @@ DeserializeTuple(StringInfo key, offset += headerLen; current = val->data + offset; if (dataLen == 0) - { + { nulls[index] = true; continue; } @@ -350,24 +369,24 @@ DeserializeTuple(StringInfo key, static bool GetNextFromBatch(Oid relationId, - TableReadState *readState, - char **key, - size_t *keyLen, - char **val, - size_t *valLen) + TableReadState *readState, + char **key, + size_t *keyLen, + char **val, + size_t *valLen) { bool found = false; if (readState->next < readState->buf + readState->bufLen) - { + { found = true; } - else if (readState->hasNext) - { + else if (readState->hasNext) + { readState->hasNext = LsmReadNext(MyBackendId, - relationId, - readState->operationId, - readState->buf, - &readState->bufLen); + relationId, + readState->operationId, + readState->buf, + &readState->bufLen); readState->next = readState->buf; if (readState->bufLen > 0) { @@ -376,15 +395,15 @@ GetNextFromBatch(Oid relationId, } if (found) { - int len; + int len; memcpy(&len, readState->next, sizeof(len)); - *keyLen = len; + *keyLen = len; readState->next += sizeof(len); *key = readState->next; readState->next += len; memcpy(&len, readState->next, sizeof(len)); - *valLen = len; + *valLen = len; readState->next += sizeof(len); *val = readState->next; readState->next += len; @@ -392,6 +411,7 @@ GetNextFromBatch(Oid relationId, return found; } +// 从外部表格数据中,返回一行数据从外部表的slot中 static TupleTableSlot* IterateForeignScan(ForeignScanState *scanState) { @@ -419,7 +439,7 @@ IterateForeignScan(ForeignScanState *scanState) * (just as you would need to do in the case of a data type mismatch). */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); TupleTableSlot *tupleSlot = scanState->ss.ss_ScanTupleSlot; ExecClearTuple(tupleSlot); @@ -434,12 +454,12 @@ IterateForeignScan(ForeignScanState *scanState) k = readState->key->data; kLen = readState->key->len; found = LsmLookup(MyBackendId, relationId, k, kLen, readState->buf, &vLen); - v = readState->buf; + v = readState->buf; readState->done = true; } } - else - { + else + { found = GetNextFromBatch(relationId, readState, &k, @@ -449,11 +469,11 @@ IterateForeignScan(ForeignScanState *scanState) } if (found) - { + { StringInfoData key; StringInfoData val; - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); appendBinaryStringInfo(&key, k, kLen); appendBinaryStringInfo(&val, v, vLen); @@ -473,7 +493,7 @@ ReScanForeignScan(ForeignScanState *scanState) * return exactly the same rows. */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); } static void @@ -485,14 +505,14 @@ EndForeignScan(ForeignScanState *scanState) * remote servers should be cleaned up. */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); TableReadState *readState = (TableReadState *) scanState->fdw_state; Assert(readState); Oid relationId = RelationGetRelid(scanState->ss.ss_currentRelation); if (!readState->isKeyBased) - { + { LsmCloseCursor(MyBackendId, relationId, readState->operationId); } @@ -501,8 +521,8 @@ EndForeignScan(ForeignScanState *scanState) static void AddForeignUpdateTargets(Query *parsetree, - RangeTblEntry *tableEntry, - Relation targetRelation) + RangeTblEntry *tableEntry, + Relation targetRelation) { /* * UPDATE and DELETE operations are performed against rows previously @@ -531,7 +551,7 @@ AddForeignUpdateTargets(Query *parsetree, * relies on an unchanging primary key to identify rows.) */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); /* * We are using first column as row identification column, so we are adding @@ -557,11 +577,13 @@ AddForeignUpdateTargets(Query *parsetree, parsetree->targetList = lappend(parsetree->targetList, entry); } + + static List* PlanForeignModify(PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplanIndex) + ModifyTable *plan, + Index resultRelation, + int subplanIndex) { /* * Perform any additional planning actions needed for an insert, update, @@ -583,17 +605,18 @@ PlanForeignModify(PlannerInfo *root, * BeginForeignModify will be NIL. */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); return NULL; } +// 开始执行一个外部表的修改,比如curd static void BeginForeignModify(ModifyTableState *modifyTableState, - ResultRelInfo *resultRelInfo, - List *fdwPrivate, - int subplanIndex, - int executorFlags) + ResultRelInfo *resultRelInfo, + List *fdwPrivate, + int subplanIndex, + int executorFlags) { /* * Begin executing a foreign table modification operation. This routine is @@ -621,7 +644,7 @@ BeginForeignModify(ModifyTableState *modifyTableState, * during executor startup. */ - ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); if (executorFlags & EXEC_FLAG_EXPLAIN_ONLY) { return; @@ -640,10 +663,12 @@ BeginForeignModify(ModifyTableState *modifyTableState, resultRelInfo->ri_FdwState = (void *) writeState; } + +// 将slot序列化为key和val进行插入操作 static void SerializeTuple(StringInfo key, - StringInfo val, - TupleTableSlot *tupleSlot) + StringInfo val, + TupleTableSlot *tupleSlot) { TupleDesc tupleDescriptor = tupleSlot->tts_tupleDescriptor; int count = tupleDescriptor->natts; @@ -653,25 +678,50 @@ SerializeTuple(StringInfo key, Datum datum = tupleSlot->tts_values[index]; if (tupleSlot->tts_isnull[index]) { if (index == 0) { - ereport(ERROR, (errmsg("LSM: first column cannot be null!"))); + // 元组的第一个属性为空 + ereport(ERROR, (errmsg("pg_rocksdb: first column cannot be null!"))); } SerializeNullAttribute(tupleDescriptor, index, val); } else { - SerializeAttribute(tupleDescriptor, - index, - datum, - index == 0 ? key : val); + // 序列化非空的字段 + SerializeAttribute(tupleDescriptor, //元组 + index, // 当前属性的下标 + datum, // 当前属性所对应的值 + index == 0 ? key : val); //key,val初始化为空 } } } + +// 插入一个tuple到外部表中 +// slot : 槽 +// resultRelInfo 用于描述外部表 +// slot +// planSlot static TupleTableSlot* ExecForeignInsert(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { + + /** + * + * 执行器机制被用于四种基本SQL查询类型:SELECT、INSERT、 UPDATE以及DELETE。对于SELECT, + * 顶层执行器代码只需要发送查询计划树返回的每个行给客户端。 + * 对于INSERT,每一个被返回的行被插入到INSERT中指定的目标表中。 + * 这通过一个被称为ModifyTable的特殊顶层计划节点完成 + * (一个简单的INSERT ... VALUES命令会创建一个由一个Result节点组成的简单计划树, + * 该节点只计算一个结果行,在它之上的ModifyTable节点会执行插入。但是INSERT ... SELECT可以用到执行器机制的全部功能)。 + * 对于UPDATE,规划器会安排每一个计算行包含所有被更新的列值加上原始目标行的TID(元组ID或行ID), + * 这些数据也会被输入到一个ModifyTable节点, + * 该节点将利用这些信息创建一个新的被更新行并标记旧行为被删除。 + * 对于DELETE,唯一被计划返回的列是TID,ModifyTable节点简单地使用TID访问每一个目标行并将其标记为被删除。 + * ModifyTable以及CRUD操作的底层原理:https://www.cnblogs.com/flying-tiger/p/8418293.html + * / + + /* * Insert one tuple into the foreign table. executorState is global * execution state for the query. resultRelInfo is the ResultRelInfo struct @@ -702,7 +752,7 @@ ExecForeignInsert(EState *executorState, TupleDesc tupleDescriptor = slot->tts_tupleDescriptor; #if PG_VERSION_NUM>=130000 - bool shouldFree; + bool shouldFree; HeapTuple heapTuple = ExecFetchSlotHeapTuple(slot, false, &shouldFree); if (HeapTupleHasExternal(heapTuple)) { @@ -725,28 +775,31 @@ ExecForeignInsert(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); SerializeTuple(&key, &val, slot); + // 调用lsm_client中的接口进行插入 if (!LsmInsert(MyBackendId, foreignTableId, key.data, key.len, val.data, val.len)) - elog(ERROR, "LSM: Failed to insert tuple"); + elog(ERROR, "LSM: Failed to insert tuple"); #if PG_VERSION_NUM>=130000 - if (shouldFree) + if (shouldFree) pfree(heapTuple); #endif - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); return slot; } + +// 执行外部数据更新 static TupleTableSlot* ExecForeignUpdate(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { /* * Update one tuple in the foreign table. executorState is global execution @@ -754,7 +807,7 @@ ExecForeignUpdate(EState *executorState, * the target foreign table. slot contains the new data for the tuple; it * will match the rowtype definition of the foreign table. planSlot contains * the tuple that was generated by the ModifyTable plan node's subplan; it - * differs from slot in possibly containing additional "junk" columns. In + * differs from slot in possibly containing additional "junk" columns. In // 重要:in possibly containing additional "junk" columns * particular, any junk columns that were requested by * AddForeignUpdateTargets will be available from this slot. * @@ -779,7 +832,7 @@ ExecForeignUpdate(EState *executorState, TupleDesc tupleDescriptor = slot->tts_tupleDescriptor; #if PG_VERSION_NUM>=130000 - bool shouldFree; + bool shouldFree; HeapTuple heapTuple = ExecFetchSlotHeapTuple(slot, false, &shouldFree); if (HeapTupleHasExternal(heapTuple)) { @@ -790,7 +843,7 @@ ExecForeignUpdate(EState *executorState, } #else if (HeapTupleHasExternal(slot->tts_tuple)) - { + { /* detoast any toasted attributes */ slot->tts_tuple = toast_flatten_tuple(slot->tts_tuple, tupleDescriptor); } @@ -803,27 +856,28 @@ ExecForeignUpdate(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); + // 将slot序列化为key 和 val SerializeTuple(&key, &val, slot); - + // 将获取到的key和val进行insert操作 LsmInsert(MyBackendId, foreignTableId, key.data, key.len, val.data, val.len); #if PG_VERSION_NUM>=130000 - if (shouldFree) + if (shouldFree) pfree(heapTuple); #endif - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); return slot; } static TupleTableSlot* ExecForeignDelete(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { /* * Delete one tuple from the foreign table. executorState is global @@ -859,22 +913,22 @@ ExecForeignDelete(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); SerializeTuple(&key, &val, planSlot); if (!LsmDelete(MyBackendId, foreignTableId, key.data, key.len)) - elog(ERROR, "LSM: Failed to delete tuple"); + elog(ERROR, "LSM: Failed to delete tuple"); - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); - return slot; + return slot; } static void EndForeignModify(EState *executorState, - ResultRelInfo *resultRelInfo) + ResultRelInfo *resultRelInfo) { /* * End the table update and release resources. It is normally not important @@ -901,7 +955,7 @@ EndForeignModify(EState *executorState, static void ExplainForeignScan(ForeignScanState *scanState, - struct ExplainState * explainState) + struct ExplainState * explainState) { /* * Print additional EXPLAIN output for a foreign table scan. This function @@ -919,10 +973,10 @@ ExplainForeignScan(ForeignScanState *scanState, static void ExplainForeignModify(ModifyTableState *modifyTableState, - ResultRelInfo *relationInfo, - List *fdwPrivate, - int subplanIndex, - struct ExplainState *explainState) + ResultRelInfo *relationInfo, + List *fdwPrivate, + int subplanIndex, + struct ExplainState *explainState) { /* * Print additional EXPLAIN output for a foreign table update. This @@ -941,8 +995,8 @@ ExplainForeignModify(ModifyTableState *modifyTableState, static bool AnalyzeForeignTable(Relation relation, - AcquireSampleRowsFunc *acquireSampleRowsFunc, - BlockNumber *totalPageCount) + AcquireSampleRowsFunc *acquireSampleRowsFunc, + BlockNumber *totalPageCount) { /* ---- * This function is called when ANALYZE is executed on a foreign table. If @@ -976,8 +1030,13 @@ AnalyzeForeignTable(Relation relation, return false; } -Datum pg_rocksdb_handler(PG_FUNCTION_ARGS) +// 文档:https://www.postgresql.org/docs/9.6/fdwhandler.html +// 这是整个fdw程序的入口 +Datum pg_rocksdb_fdw_handler(PG_FUNCTION_ARGS) { + //FDW 处理函数返回一个 palloc 的FdwRoutine结构,其中包含指向下面描述的回调函数的指针。 + //FdwRoutine结构类型在src/include/foreign/fdwapi.h中声明 + //https://doxygen.postgresql.org/fdwapi_8h_source.html#l00204 FdwRoutine *routine = makeNode(FdwRoutine); ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); @@ -992,20 +1051,25 @@ Datum pg_rocksdb_handler(PG_FUNCTION_ARGS) */ /* these are required */ + // http://www.postgres.cn/docs/12/fdw-callbacks.html + /* + 在对一个扫描外部表的查询进行规划的开头将调用该函数 + */ routine->GetForeignRelSize = GetForeignRelSize; routine->GetForeignPaths = GetForeignPaths; routine->GetForeignPlan = GetForeignPlan; - routine->BeginForeignScan = BeginForeignScan; //执行外部表扫描 + routine->BeginForeignScan = BeginForeignScan; routine->IterateForeignScan = IterateForeignScan; routine->ReScanForeignScan = ReScanForeignScan; routine->EndForeignScan = EndForeignScan; + // remainder-余下的,余下的函数是可选的,如果不需要的话,可以为NULL /* remainder are optional - use NULL if not required */ /* support for insert / update / delete */ routine->AddForeignUpdateTargets = AddForeignUpdateTargets; routine->PlanForeignModify = PlanForeignModify; routine->BeginForeignModify = BeginForeignModify; - routine->ExecForeignInsert = ExecForeignInsert; //执行插入操作 + routine->ExecForeignInsert = ExecForeignInsert; routine->ExecForeignUpdate = ExecForeignUpdate; routine->ExecForeignDelete = ExecForeignDelete; routine->EndForeignModify = EndForeignModify; diff --git a/lsm_fdw.h b/lsm_fdw.h index 347c1c7..1233a4f 100644 --- a/lsm_fdw.h +++ b/lsm_fdw.h @@ -26,7 +26,7 @@ typedef struct TableReadState { size_t bufLen; /* shared mem length, no next batch if it is 0 */ char *next; /* pointer to the next data entry for IterateForeignScan */ bool hasNext; /* whether a next batch from RangeQuery or ReadBatch*/ - char buf[LSM_MAX_RECORD_SIZE]; + char buf[LSM_MAX_RECORD_SIZE]; } TableReadState; /* diff --git a/lsm_server.cpp b/lsm_server.cpp index 8499035..69da341 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -2,7 +2,7 @@ // Worker's part of LSM queue // #include "lsm_api.h" -#include "lsm_db.h" +#include "lsm_db.h" //包含lsm_storage.cpp中的一些实现的函数,也就是对rocksdb进行操作的函数,例如crud操作 static LsmServer* server;; @@ -14,79 +14,81 @@ bool LsmUpsert; /* * Enqueue message */ +// 传入的参数为LsmMessage,将LsmMessage中的数据存入到数据库中 void LsmQueue::put(LsmMessage const& msg) { - int size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; - - if (size > LsmQueueSize) - LsmError("Message is too long"); - - while (true) - { - int getPos = this->getPos; - int putPos = this->putPos; - int available = putPos >= getPos ? LsmQueueSize - putPos + getPos : getPos - putPos; - - if (size >= available) /* queue overflow? */ - { - if (!writerBlocked) - { - writerBlocked = true; - LsmMemoryBarrier(); - // Enforce "writeBlocked" flag to be visible by consumer and retry availability check - } - else - { - SemWait(&full); - } - continue; - } - size_t tail = LsmQueueSize - putPos; - - // Copy header - if (tail <= sizeof(LsmMessageHeader)) - { - memcpy(&req[putPos], &msg, tail); - memcpy(&req[0], (char*)&msg + tail, sizeof(LsmMessageHeader) - tail); - putPos = sizeof(LsmMessageHeader) - tail; - } - else - { - memcpy(&req[putPos], &msg, sizeof(LsmMessageHeader)); - putPos += sizeof(LsmMessageHeader); - } - tail = LsmQueueSize - putPos; - - // Copy key - if (tail <= msg.hdr.keySize) - { - memcpy(&req[putPos], msg.key, tail); - memcpy(&req[0], msg.key + tail, msg.hdr.keySize - tail); - putPos = msg.hdr.keySize - tail; - } - else - { - memcpy(&req[putPos], msg.key, msg.hdr.keySize); - putPos += msg.hdr.keySize; - } - tail = LsmQueueSize - putPos; - - // Copy value - if (tail <= msg.hdr.valueSize) - { - memcpy(&req[putPos], msg.value, tail); - memcpy(&req[0], msg.value + tail, msg.hdr.valueSize - tail); - putPos = msg.hdr.valueSize - tail; - } - else - { - memcpy(&req[putPos], msg.value, msg.hdr.valueSize); - putPos += msg.hdr.valueSize; - } - this->putPos = putPos; - SemPost(&empty); // Enforce write barrier and notify consumer - return; - } + int size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; + + if (size > LsmQueueSize) + LsmError("Message is too long"); + + while (true) + { + int getPos = this->getPos; + int putPos = this->putPos; + int available = putPos >= getPos ? LsmQueueSize - putPos + getPos : getPos - putPos; + + if (size >= available) /* queue overflow? */ + { + if (!writerBlocked) + { + writerBlocked = true; + LsmMemoryBarrier(); + // Enforce "writeBlocked" flag to be visible by consumer and retry availability check + } + else + { + SemWait(&full); + } + continue; + } + size_t tail = LsmQueueSize - putPos; + + // Copy header + if (tail <= sizeof(LsmMessageHeader)) + { + memcpy(&req[putPos], &msg, tail); + memcpy(&req[0], (char*)&msg + tail, sizeof(LsmMessageHeader) - tail); + putPos = sizeof(LsmMessageHeader) - tail; + } + else + { + memcpy(&req[putPos], &msg, sizeof(LsmMessageHeader)); + putPos += sizeof(LsmMessageHeader); + } + tail = LsmQueueSize - putPos; + + // Copy key + if (tail <= msg.hdr.keySize) + { + // C 库函数 void *memcpy(void *str1, const void *str2, size_t n) 从存储区 str2 复制 n 个字节到存储区 str1。 + memcpy(&req[putPos], msg.key, tail); + memcpy(&req[0], msg.key + tail, msg.hdr.keySize - tail); + putPos = msg.hdr.keySize - tail; + } + else + { + memcpy(&req[putPos], msg.key, msg.hdr.keySize); + putPos += msg.hdr.keySize; + } + tail = LsmQueueSize - putPos; + + // Copy value + if (tail <= msg.hdr.valueSize) + { + memcpy(&req[putPos], msg.value, tail); + memcpy(&req[0], msg.value + tail, msg.hdr.valueSize - tail); + putPos = msg.hdr.valueSize - tail; + } + else + { + memcpy(&req[putPos], msg.value, msg.hdr.valueSize); + putPos += msg.hdr.valueSize; + } + this->putPos = putPos; + SemPost(&empty); // Enforce write barrier and notify consumer + return; + } } /* @@ -96,70 +98,70 @@ void LsmQueue::put(LsmMessage const& msg) */ void LsmQueue::get(char* buf, LsmMessage& msg) { - // Wait until queue is not empty. - // We are not comparing getPos with putPos before waiting semaphore to make sure that writer barrier enforced by SemPost - // makes all data written by producer visible for consumer. - SemWait(&empty); - - if (terminate) - { - msg.hdr.op = LsmOpTerminate; - return; - } - - int getPos = this->getPos; - int putPos = this->putPos; - - if (putPos == getPos) - LsmError("Queue race condition!"); - - size_t tail = LsmQueueSize - getPos; - - // Copy header - if (tail <= sizeof(LsmMessageHeader)) - { - memcpy(&msg, &req[getPos], tail); - memcpy((char*)&msg + tail, &req[0], sizeof(LsmMessageHeader) - tail); - getPos = sizeof(LsmMessageHeader) - tail; - } - else - { - memcpy(&msg, &req[getPos], sizeof(LsmMessageHeader)); - getPos += sizeof(LsmMessageHeader); - } - tail = LsmQueueSize - getPos; - - // Copy key - if (tail < msg.hdr.keySize) - { - memcpy(buf, &req[getPos], tail); - memcpy(buf + tail, &req[0], msg.hdr.keySize - tail); - getPos = msg.hdr.keySize - tail; - msg.key = buf; - buf += msg.hdr.keySize; - } - else - { - msg.key = &req[getPos]; - getPos += msg.hdr.keySize; - if (getPos == LsmQueueSize) - { - getPos = 0; - } - } - tail = LsmQueueSize - getPos; - - // Copy value - if (tail < msg.hdr.valueSize) - { - memcpy(buf, &req[getPos], tail); - memcpy(buf + tail, &req[0], msg.hdr.valueSize - tail); - msg.value = buf; - } - else - { - msg.value = &req[getPos]; - } + // Wait until queue is not empty. + // We are not comparing getPos with putPos before waiting semaphore to make sure that writer barrier enforced by SemPost + // makes all data written by producer visible for consumer. + SemWait(&empty); + + if (terminate) + { + msg.hdr.op = LsmOpTerminate; + return; + } + + int getPos = this->getPos; + int putPos = this->putPos; + + if (putPos == getPos) + LsmError("Queue race condition!"); + + size_t tail = LsmQueueSize - getPos; + + // Copy header + if (tail <= sizeof(LsmMessageHeader)) + { + memcpy(&msg, &req[getPos], tail); + memcpy((char*)&msg + tail, &req[0], sizeof(LsmMessageHeader) - tail); + getPos = sizeof(LsmMessageHeader) - tail; + } + else + { + memcpy(&msg, &req[getPos], sizeof(LsmMessageHeader)); + getPos += sizeof(LsmMessageHeader); + } + tail = LsmQueueSize - getPos; + + // Copy key + if (tail < msg.hdr.keySize) + { + memcpy(buf, &req[getPos], tail); + memcpy(buf + tail, &req[0], msg.hdr.keySize - tail); + getPos = msg.hdr.keySize - tail; + msg.key = buf; + buf += msg.hdr.keySize; + } + else + { + msg.key = &req[getPos]; + getPos += msg.hdr.keySize; + if (getPos == LsmQueueSize) + { + getPos = 0; + } + } + tail = LsmQueueSize - getPos; + + // Copy value + if (tail < msg.hdr.valueSize) + { + memcpy(buf, &req[getPos], tail); + memcpy(buf + tail, &req[0], msg.hdr.valueSize - tail); + msg.value = buf; + } + else + { + msg.value = &req[getPos]; + } } /* @@ -167,23 +169,23 @@ void LsmQueue::get(char* buf, LsmMessage& msg) */ void LsmQueue::next(LsmMessage const& msg) { - int getPos = this->getPos; - bool writerBlocked = this->writerBlocked; - size_t size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; - size_t tail = LsmQueueSize - getPos; - this->getPos = (tail <= size) ? size - tail : getPos + size; - if (writerBlocked) - { - // Notify consumer that some more free space is avaialble in ring buffer - this->writerBlocked = false; - SemPost(&full); - } + int getPos = this->getPos; + bool writerBlocked = this->writerBlocked; + size_t size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; + size_t tail = LsmQueueSize - getPos; + this->getPos = (tail <= size) ? size - tail : getPos + size; + if (writerBlocked) + { + // Notify consumer that some more free space is avaialble in ring buffer + this->writerBlocked = false; + SemPost(&full); + } } inline LsmConnection& LsmWorker::open(LsmMessage const& msg) { - return server->open(msg); + return server->open(msg); } /* @@ -192,10 +194,11 @@ LsmWorker::open(LsmMessage const& msg) void LsmWorker::insert(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - queue->resp[0] = (char)con.insert(msg.key, msg.hdr.keySize, msg.value, msg.hdr.valueSize); - if (LsmSync) - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + // 通过调用con插入key,value数据,resp为插入之后的返回值 + queue->resp[0] = (char)con.insert(msg.key, msg.hdr.keySize, msg.value, msg.hdr.valueSize); + if (LsmSync) + SemPost(&queue->ready); } /* @@ -204,10 +207,10 @@ LsmWorker::insert(LsmMessage const& msg) void LsmWorker::remove(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - queue->resp[0] = (char)con.remove(msg.key, msg.hdr.keySize); - if (LsmSync) - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + queue->resp[0] = (char)con.remove(msg.key, msg.hdr.keySize); + if (LsmSync) + SemPost(&queue->ready); } /* @@ -216,10 +219,10 @@ LsmWorker::remove(LsmMessage const& msg) void LsmWorker::count(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - uint64_t count = con.count(); - memcpy(queue->resp, &count, sizeof(count)); - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + uint64_t count = con.count(); + memcpy(queue->resp, &count, sizeof(count)); + SemPost(&queue->ready); } /* @@ -228,9 +231,9 @@ LsmWorker::count(LsmMessage const& msg) void LsmWorker::closeCursor(LsmMessage const& msg) { - LsmCursor& csr(cursors[msg.hdr.cid]); - csr.con->releaseIterator(csr.iter); - cursors.erase(msg.hdr.cid); + LsmCursor& csr(cursors[msg.hdr.cid]); + csr.con->releaseIterator(csr.iter); + cursors.erase(msg.hdr.cid); } /* @@ -239,9 +242,9 @@ LsmWorker::closeCursor(LsmMessage const& msg) void LsmWorker::lookup(LsmMessage const& msg) { - LsmConnection& con(open(msg)); + LsmConnection& con(open(msg)); queue->respSize = con.lookup(msg.key, msg.hdr.keySize, queue->resp); - SemPost(&queue->ready); + SemPost(&queue->ready); } /* @@ -250,81 +253,90 @@ LsmWorker::lookup(LsmMessage const& msg) void LsmWorker::fetch(LsmMessage const& msg) { - LsmCursor& csr(cursors[msg.hdr.cid]); - if (!csr.con) - { - csr.con = &open(msg); - csr.iter = csr.con->getIterator(); - } + LsmCursor& csr(cursors[msg.hdr.cid]); + if (!csr.con) + { + csr.con = &open(msg); + csr.iter = csr.con->getIterator(); + } queue->respSize = csr.con->next(csr.iter, queue->resp); - SemPost(&queue->ready); + SemPost(&queue->ready); } + + +// 主循环,进行rocksdb的主操作 /* * Worker main loop */ void LsmWorker::run() { - while (true) - { - LsmMessage msg; - char buf[LSM_MAX_RECORD_SIZE]; - queue->get(buf, msg); + // 一直处于监听情况下 + while (true) + { + LsmMessage msg; + char buf[LSM_MAX_RECORD_SIZE]; + queue->get(buf, msg); switch (msg.hdr.op) { - case LsmOpTerminate: - return; - case LsmOpCount: - count(msg); - break; - case LsmOpCloseCursor: - closeCursor(msg); - break; - case LsmOpFetch: - fetch(msg); - break; - case LsmOpLookup: - lookup(msg); - break; - case LsmOpInsert: - insert(msg); - break; - case LsmOpDelete: - remove(msg); - break; - default: - assert(false); + case LsmOpTerminate: + return; + case LsmOpCount: + count(msg); + break; + case LsmOpCloseCursor: + closeCursor(msg); + break; + case LsmOpFetch: + fetch(msg); + break; + case LsmOpLookup: + lookup(msg); + break; + case LsmOpInsert: + insert(msg); + break; + case LsmOpDelete: + remove(msg); + break; + default: + assert(false); } - queue->next(msg); - } + queue->next(msg); + } } + + +// 开启一个LsmWorker,调用其主函数 void LsmWorker::start() { - PthreadCreate(&thread, NULL, LsmWorker::main, this); + PthreadCreate(&thread, NULL, LsmWorker::main, this); } void LsmWorker::stop() { - queue->terminate = true; - SemPost(&queue->empty); + queue->terminate = true; + SemPost(&queue->empty); } void LsmWorker::wait() { - void* status; - PthreadJoin(thread, &status); + void* status; + PthreadJoin(thread, &status); } + +// 主函数 void* LsmWorker::main(void* arg) { - ((LsmWorker*)arg)->run(); - return NULL; + ((LsmWorker*)arg)->run(); + return NULL; } /* @@ -334,10 +346,10 @@ LsmWorker::main(void* arg) void LsmRunWorkers(int maxClients) { - server = new LsmServer(maxClients); - server->start(); - server->wait(); - delete server; + server = new LsmServer(maxClients); + server->start(); + server->wait(); + delete server; } /* @@ -346,66 +358,70 @@ LsmRunWorkers(int maxClients) void LsmStopWorkers(void) { - server->stop(); + server->stop(); } + +// 封装了对LsmWorker的操作,也就是当外部数据来的时候,将插入数据等操作用一个LsmWorker来操作 LsmServer::LsmServer(size_t maxClients) : nWorkers(maxClients) { - workers = new LsmWorker*[nWorkers]; - for (size_t i = 0; i < nWorkers; i++) - { - workers[i] = new LsmWorker(this, queues[i]); - } + workers = new LsmWorker*[nWorkers]; + for (size_t i = 0; i < nWorkers; i++) + { + workers[i] = new LsmWorker(this, queues[i]); + } } void LsmServer::start() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->start(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->start(); + } } void LsmServer::wait() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->wait(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->wait(); + } } LsmServer::~LsmServer() { - for (size_t i = 0; i < nWorkers; i++) - { - delete workers[i]; - } - delete[] workers; + for (size_t i = 0; i < nWorkers; i++) + { + delete workers[i]; + } + delete[] workers; } void LsmServer::stop() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->stop(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->stop(); + } } + + +// 返回rocksdb的con LsmConnection& LsmServer::open(LsmMessage const& msg) { - CriticalSection cs(mutex); - LsmConnection& con = connections[msg.hdr.rid]; - if (con.db == NULL) - { - char path[64]; - sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); - // 打开数据库 - con.open(path); - } - return con; + CriticalSection cs(mutex); + LsmConnection& con = connections[msg.hdr.rid]; + if (con.db == NULL) + { + char path[64]; + sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); + con.open(path); + } + return con; } diff --git a/lsm_storage.cpp b/lsm_storage.cpp index 1ccadd9..ec13b63 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -3,42 +3,30 @@ // #include "lsm_db.h" - -/** - * @param path rocksdb数据库的路径 - * 如果在使用rocksdb时没有显式使用过列族,就会发现,所有的操作都发生在一个列族中, - * 这个列族名称为default. - */ +// 此函数是LsmConnection 中的成员函数, +// 而LsmConnection中有一个非常重要的成员变量:DB* db; void LsmConnection::open(char const* path) { - // https://wanghenshui.github.io/rocksdb-doc-cn/doc/Column-Families.html - // ColumnFamilyOptions 用于配置列族,DBOptions用于数据库粒度的配置 - // Options 继承了了ColumnFamilyOptions和DBOptions,因此Options可以执行上述两种配置 Options options; options.create_if_missing = true; - // @todo hr,wu 数据库路径 - std::string p(path); - db_path = p; // 给LSMConnection中的属性赋值 - - // @todo hr,wu 使用LSMConnection中的关于列族的参数来打开数据库 Status s = DB::Open(options, std::string(path), &db); if (!s.ok()) - LsmError(s.getState()); + LsmError(s.getState()); } void LsmConnection::close() { delete db; - db = NULL; + db = NULL; } uint64_t LsmConnection::count() { - std::string count; + std::string count; db->GetProperty("rocksdb.estimate-num-keys", &count); return stoull(count); } @@ -60,70 +48,66 @@ LsmConnection::releaseIterator(Iterator* it) size_t LsmConnection::next(Iterator* it, char* buf) { - size_t size; - // Fetch as much records asfits in response buffer - for (size = 0; it->Valid(); it->Next()) - { + size_t size; + // Fetch as much records asfits in response buffer + for (size = 0; it->Valid(); it->Next()) + { int keyLen = it->key().size(); - int valLen = it->value().size(); - int pairSize = sizeof(int)*2 + keyLen + valLen; + int valLen = it->value().size(); + int pairSize = sizeof(int)*2 + keyLen + valLen; - if (size + pairSize > LSM_MAX_RECORD_SIZE) - break; + if (size + pairSize > LSM_MAX_RECORD_SIZE) + break; - memcpy(&buf[size], &keyLen, sizeof keyLen); - size += sizeof keyLen; - memcpy(&buf[size], it->key().data(), keyLen); - size += keyLen; + memcpy(&buf[size], &keyLen, sizeof keyLen); + size += sizeof keyLen; + memcpy(&buf[size], it->key().data(), keyLen); + size += keyLen; - memcpy(&buf[size], &valLen, sizeof valLen); - size += sizeof valLen; - memcpy(&buf[size], it->value().data(), valLen); - size += valLen; + memcpy(&buf[size], &valLen, sizeof valLen); + size += sizeof valLen; + memcpy(&buf[size], it->value().data(), valLen); + size += valLen; } - return size; + return size; } size_t LsmConnection::lookup(char const* key, size_t keyLen, char* buf) { - std::string sval; + std::string sval; ReadOptions ro; - Status s = db->Get(ro, handles[1], Slice(key, keyLen), &sval); + Status s = db->Get(ro, Slice(key, keyLen), &sval); if (!s.ok()) - return 0; - size_t valLen = sval.length(); + return 0; + size_t valLen = sval.length(); memcpy(buf, sval.c_str(), valLen); - return valLen; + return valLen; } bool LsmConnection::insert(char* key, size_t keyLen, char* val, size_t valLen) { - Status s; - WriteOptions opts; - if (!LsmUpsert) - { - std::string sval; - ReadOptions ro; - s = db->Get(ro, Slice(key, keyLen), &sval); - if (s.ok()) // key already exists - return false; - } - opts.sync = LsmSync; - - // @todo hr,wu---真正向rksdb中插入数据--- - // https://wanghenshui.github.io/rocksdb-doc-cn/doc/Column-Families.html - // 插入数据的具体操作 - s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); + Status s; + WriteOptions opts; + if (!LsmUpsert) + { + std::string sval; + ReadOptions ro; + s = db->Get(ro, Slice(key, keyLen), &sval); + if (s.ok()) // key already exists + return false; + } + opts.sync = LsmSync; + s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); return s.ok(); } bool LsmConnection::remove(char* key, size_t keyLen) { - WriteOptions opts; - opts.sync = LsmSync; + WriteOptions opts; + opts.sync = LsmSync; Status s = db->Delete(opts, Slice(key, keyLen)); return s.ok(); } diff --git a/lsm_util.c b/lsm_util.c index 3d02c32..f0bdd5b 100644 --- a/lsm_util.c +++ b/lsm_util.c @@ -41,13 +41,13 @@ static shmem_startup_hook_type PreviousShmemStartupHook = NULL; /* local functions forward declarations */ static void LsmProcessUtility(PlannedStmt *plannedStmt, - const char *queryString, - ProcessUtilityContext context, - ParamListInfo paramListInfo, - QueryEnvironment *queryEnvironment, - DestReceiver *destReceiver, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo paramListInfo, + QueryEnvironment *queryEnvironment, + DestReceiver *destReceiver, #if PG_VERSION_NUM>=130000 - QueryCompletion *completionTag); + QueryCompletion *completionTag); #else char *completionTag); #endif @@ -62,12 +62,12 @@ EncodeVarintLength(uint64 v, char* buf) v >>= 7; } *dst++ = (char)v; - return (uint8)(dst - buf); + return (uint8)(dst - buf); } static const char* GetVarint64Ptr(const char* p, const char* limit, - uint64_t* value) + uint64_t* value) { uint64_t result = 0; for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { @@ -95,11 +95,17 @@ DecodeVarintLength(char* start, char* limit, uint64* len) * Checks if the given foreign server belongs to kv_fdw. If it * does, the function returns true. Otherwise, it returns false. */ +//http://www.postgres.cn/docs/9.4/fdw-helpers.html static bool LsmServer(ForeignServer *server) { char *fdwName = GetForeignDataWrapper(server->fdwid)->fdwname; return strncmp(fdwName, LSM_FDW_NAME "_fdw", NAMEDATALEN) == 0; } + + + + +// 判断给定了表是否是属于外部表格 /* * Checks if the given table name belongs to a foreign Lsm table. * If it does, the function returns true. Otherwise, it returns false. @@ -126,7 +132,7 @@ static bool LsmTable(Oid relationId) { static char* LsmFilePath(Oid relid) { - return psprintf("%s/%d", LSM_FDW_NAME, relid); + return psprintf("%s/%d", LSM_FDW_NAME, relid); } /* @@ -183,14 +189,14 @@ static void LsmCheckSuperuserPrivilegesForCopy(const CopyStmt* copyStmt) { if (copyStmt->filename != NULL && !superuser()) { if (copyStmt->is_program) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to COPY to or from a program"), - errhint("Anyone can COPY to stdout or from stdin. " - "psql's \\copy command also works for anyone."))); + errmsg("must be superuser to COPY to or from a program"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); } else { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to COPY to or from a file"), - errhint("Anyone can COPY to stdout or from stdin. " - "psql's \\copy command also works for anyone."))); + errmsg("must be superuser to COPY to or from a file"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); } } } @@ -224,10 +230,12 @@ void SerializeNullAttribute(TupleDesc tupleDescriptor, buffer->len += headerLen; } -void SerializeAttribute(TupleDesc tupleDescriptor, - Index index, - Datum datum, - StringInfo buffer) { + +// 序列化属性 +void SerializeAttribute(TupleDesc tupleDescriptor, //元组 + Index index, // 当前属性的下标 + Datum datum, // 当前属性的值 + StringInfo buffer) { //以当前这一行的第一个属性的值作为key,其他的属性为val Form_pg_attribute attributeForm = TupleDescAttr(tupleDescriptor, index); bool byValue = attributeForm->attbyval; int typeLength = attributeForm->attlen; @@ -239,7 +247,7 @@ void SerializeAttribute(TupleDesc tupleDescriptor, int offset = buffer->len; int datumLength = att_addlength_datum(offset, typeLength, datum); - /* the key does not have a size header */ + /* the key does not have a size header */ enlargeStringInfo(buffer, datumLength + (index == 0 ? 0 : HEADERBUFFSIZE)); char *current = buffer->data + buffer->len; @@ -255,6 +263,7 @@ void SerializeAttribute(TupleDesc tupleDescriptor, if (typeLength > 0) { if (byValue) { + // 存储指定的值到指定地址中 store_att_byval(current, datum, typeLength); } else { memcpy(current, DatumGetPointer(datum), typeLength); @@ -274,7 +283,7 @@ void SerializeAttribute(TupleDesc tupleDescriptor, * number of copied rows. */ static uint64 LsmCopyIntoTable(const CopyStmt *copyStmt, - const char *queryString) + const char *queryString) { /* Only superuser can copy from or to local file */ LsmCheckSuperuserPrivilegesForCopy(copyStmt); @@ -314,7 +323,7 @@ static uint64 LsmCopyIntoTable(const CopyStmt *copyStmt, while (found) { /* read the next row in tupleContext */ MemoryContext oldContext = - MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); /* * 'econtext' is used to evaluate default expression for each columns @@ -375,9 +384,9 @@ static uint64 LsmCopyOutTable(CopyStmt *copyStmt, const char *queryString) { if (copyStmt->attlist != NIL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("copy column list is not supported"), - errhint("use 'copy (select from ) to " - "...' instead"))); + errmsg("copy column list is not supported"), + errhint("use 'copy (select from
) to " + "...' instead"))); } RangeVar *relation = copyStmt->relation; @@ -469,7 +478,7 @@ LsmCheckAlterTable(AlterTableStmt *alterStmt) if (alterCmd->subtype == AT_AddColumn) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("No support for adding column currently"))); + errmsg("No support for adding column currently"))); } } } @@ -481,15 +490,15 @@ LsmCheckAlterTable(AlterTableStmt *alterStmt) * utility command via macro CALL_PREVIOUS_UTILITY. */ static void LsmProcessUtility(PlannedStmt *plannedStmt, - const char *queryString, - ProcessUtilityContext context, - ParamListInfo paramListInfo, - QueryEnvironment *queryEnvironment, - DestReceiver *destReceiver, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo paramListInfo, + QueryEnvironment *queryEnvironment, + DestReceiver *destReceiver, #if PG_VERSION_NUM>=130000 - QueryCompletion *completionTag) + QueryCompletion *completionTag) #else - char *completionTag) + char *completionTag) #endif { Node *parseTree = plannedStmt->utilityStmt; @@ -507,12 +516,12 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, if (completionTag != NULL) { #if PG_VERSION_NUM>=130000 - SetQueryCompletion(completionTag, CMDTAG_COPY, rowCount); + SetQueryCompletion(completionTag, CMDTAG_COPY, rowCount); #else snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "COPY " UINT64_FORMAT, - rowCount); + rowCount); #endif } } else { @@ -545,7 +554,7 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, ListCell *fileCell = NULL; foreach(fileCell, droppedTables) { char *path = lfirst(fileCell); - rmtree(path, true); + rmtree(path, true); } } } else if (nodeTag(parseTree) == T_AlterTableStmt) { @@ -571,87 +580,87 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, static void LsmShmemStartup(void) { - bool found; - void* ctl; + bool found; + void* ctl; if (PreviousShmemStartupHook) - { + { PreviousShmemStartupHook(); } - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - - ctl = ShmemInitStruct("lsm_control", - LsmShmemSize(MaxConnections), - &found); - if (!found) - { - LsmInitialize(ctl, MaxConnections); - if (mkdir(LSM_FDW_NAME, S_IRWXU) != 0 && errno != EEXIST) - elog(ERROR, "Failed to create lsm directory: %m"); - } - else - LsmAttach(ctl); - - LWLockRelease(AddinShmemInitLock); + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + ctl = ShmemInitStruct("lsm_control", + LsmShmemSize(MaxConnections), + &found); + if (!found) + { + LsmInitialize(ctl, MaxConnections); + if (mkdir(LSM_FDW_NAME, S_IRWXU) != 0 && errno != EEXIST) + elog(ERROR, "Failed to create lsm directory: %m"); + } + else + LsmAttach(ctl); + + LWLockRelease(AddinShmemInitLock); } void _PG_init(void) { - BackgroundWorker worker; - - if (!process_shared_preload_libraries_in_progress) - elog(ERROR, "LSM: this extension should be loaded via shared_preload_libraries"); - - DefineCustomIntVariable("lsm.queue_size", - "Size of LSM queue", - NULL, - &LsmQueueSize, - LSM_MAX_RECORD_SIZE, LSM_MAX_RECORD_SIZE, INT_MAX, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); + BackgroundWorker worker; + + if (!process_shared_preload_libraries_in_progress) + elog(ERROR, "LSM: this extension should be loaded via shared_preload_libraries"); + + DefineCustomIntVariable("lsm.queue_size", + "Size of LSM queue", + NULL, + &LsmQueueSize, + LSM_MAX_RECORD_SIZE, LSM_MAX_RECORD_SIZE, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); DefineCustomBoolVariable("lsm.sync", - "Use synchronouse write", - NULL, - &LsmSync, - false, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); + "Use synchronouse write", + NULL, + &LsmSync, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); DefineCustomBoolVariable("lsm.upsert", - "Use implicit upsert semantic", - "If key of inserted record already exists, then replace old record with new one", - &LsmUpsert, - true, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); - - RequestAddinShmemSpace(LsmShmemSize(MaxConnections)); - elog(DEBUG1, "Request %ld bytes of shared memory", LsmShmemSize(MaxConnections)); - - MemSet(&worker, 0, sizeof(BackgroundWorker)); - worker.bgw_flags = BGWORKER_SHMEM_ACCESS; - worker.bgw_start_time = BgWorkerStart_ConsistentState; - strcpy(worker.bgw_library_name, "lsm"); - strcpy(worker.bgw_function_name, "LsmWorkerMain"); - strcpy(worker.bgw_name, "LSM worker"); - strcpy(worker.bgw_type, "LSM worker"); - - RegisterBackgroundWorker(&worker); - - PreviousShmemStartupHook = shmem_startup_hook; - shmem_startup_hook = LsmShmemStartup; + "Use implicit upsert semantic", + "If key of inserted record already exists, then replace old record with new one", + &LsmUpsert, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + RequestAddinShmemSpace(LsmShmemSize(MaxConnections)); + elog(DEBUG1, "Request %ld bytes of shared memory", LsmShmemSize(MaxConnections)); + + MemSet(&worker, 0, sizeof(BackgroundWorker)); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + strcpy(worker.bgw_library_name, "lsm"); + strcpy(worker.bgw_function_name, "LsmWorkerMain"); + strcpy(worker.bgw_name, "LSM worker"); + strcpy(worker.bgw_type, "LSM worker"); + + RegisterBackgroundWorker(&worker); + + PreviousShmemStartupHook = shmem_startup_hook; + shmem_startup_hook = LsmShmemStartup; PreviousProcessUtilityHook = ProcessUtility_hook; ProcessUtility_hook = LsmProcessUtility; } @@ -669,25 +678,25 @@ void _PG_fini(void) static void LsmWorkerSigtermHandler(SIGNAL_ARGS) { - LsmStopWorkers(); + LsmStopWorkers(); } void LsmWorkerMain(Datum main_arg) { - pqsignal(SIGTERM, LsmWorkerSigtermHandler); - BackgroundWorkerUnblockSignals(); - LsmRunWorkers(MaxConnections); + pqsignal(SIGTERM, LsmWorkerSigtermHandler); + BackgroundWorkerUnblockSignals(); + LsmRunWorkers(MaxConnections); } void LsmError(char const* message) { - ereport(ERROR, (errmsg("LSM: %s", message))); + ereport(ERROR, (errmsg("LSM: %s", message))); } void LsmMemoryBarrier(void) { - pg_memory_barrier(); + pg_memory_barrier(); } diff --git a/pg_rocksdb--0.1.sql b/pg_rocksdb--0.1.sql index 17400a6..6a1ddda 100644 --- a/pg_rocksdb--0.1.sql +++ b/pg_rocksdb--0.1.sql @@ -1,9 +1,9 @@ -CREATE FUNCTION pg_rocksdb_handler() +CREATE FUNCTION pg_rocksdb_fdw_handler() -- 底下必须返回的是fdw_handler,而不是其他的 RETURNS fdw_handler AS 'MODULE_PATHNAME' LANGUAGE C STRICT; CREATE FOREIGN DATA WRAPPER pg_rocksdb_fdw - HANDLER pg_rocksdb_handler; + HANDLER pg_rocksdb_fdw_handler; From 7a71577220323e18a8752a21411edc2ea5d5b9e8 Mon Sep 17 00:00:00 2001 From: whrgogogo666 <421803476@qq.com> Date: Tue, 17 May 2022 15:42:08 +0800 Subject: [PATCH 16/16] RETURNS fdw_handler --- Makefile | 10 +- README.md | 66 +++--- lsm_api.h | 12 +- lsm_client.cpp | 172 ++++++++-------- lsm_db.h | 208 +++++++++---------- lsm_fdw.c | 254 +++++++++++------------ lsm_fdw.h | 2 +- lsm_posix.h | 3 + lsm_server.cpp | 492 ++++++++++++++++++++++---------------------- lsm_storage.cpp | 76 +++---- lsm_util.c | 210 +++++++++---------- pg_rocksdb--0.1.sql | 9 - pg_rocksdb.conf | 1 - pg_rocksdb.control | 5 - 14 files changed, 753 insertions(+), 767 deletions(-) delete mode 100644 pg_rocksdb--0.1.sql delete mode 100644 pg_rocksdb.conf delete mode 100644 pg_rocksdb.control diff --git a/Makefile b/Makefile index d2807bc..5e07173 100644 --- a/Makefile +++ b/Makefile @@ -1,22 +1,22 @@ -MODULE_big = pg_rocksdb +MODULE_big = lsm OBJS = lsm_fdw.o lsm_client.o lsm_server.o lsm_posix.o lsm_storage.o lsm_util.o PGFILEDESC = "LSM: log-structured merge-tree" PG_CPPFLAGS += -Wno-declaration-after-statement SHLIB_LINK = -lrocksdb -EXTENSION = pg_rocksdb -DATA = pg_rocksdb--0.1.sql +EXTENSION = lsm +DATA = lsm--0.1.sql REGRESS = create basic test testddl testcopy testcolumn -REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_rocksdb/pg_rocksdb.conf +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/lsm/lsm.conf ifdef USE_PGXS PG_CONFIG ?= pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else -subdir = contrib/pg_rocksdb +subdir = contrib/lsm top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk diff --git a/README.md b/README.md index d7586a4..f8d5529 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Build Status](https://travis-ci.com/vidardb/PostgresForeignDataWrapper.svg?branch=master)](https://travis-ci.com/github/vidardb/PostgresForeignDataWrapper) -This PostgreSQL extension implements a Foreign Data Wrapper (FDW) for [RocksDB](https://rocksdb.org/). This repo has been listed in PostgreSQL [wiki](https://wiki.postgresql.org/wiki/Foreign_data_wrappers). +This PostgreSQL extension implements a Foreign Data Wrapper (FDW) for [RocksDB](https://rocksdb.org/). This repo has been listed in PostgreSQL [wiki](https://wiki.postgresql.org/wiki/Foreign_data_wrappers). RocksDB is a high performance key-value store based on a log-structured merge-tree (LSM tree). RocksDB can efficiently use many CPU cores and fast storage. This is the first foreign data wrapper that connects a LSM-tree-based storage engine to PostgreSQL. Because RocksDB is an embeddable key-value store, you do not need to run another server to use this extension. @@ -17,65 +17,65 @@ This extension is developed and maintained by the VidarDB team. Feel free to rep We test this foreign data wrapper on Ubuntu Server 18.04 using PostgreSQL-11 together with RocksDB-6.2.4 (built with GCC-7.4.0). - Install PostgreSQL and the dev library which is required by extensions: - + ```sh # add the repository sudo tee /etc/apt/sources.list.d/pgdg.list << END deb http://apt.postgresql.org/pub/repos/apt/ bionic-pgdg main END - + # get the signing key and import it wget https://www.postgresql.org/media/keys/ACCC4CF8.asc sudo apt-key add ACCC4CF8.asc - + # fetch the metadata from the new repo sudo apt-get update - + # install postgresql and the dev library sudo apt-get install postgresql-11 sudo apt-get install postgresql-server-dev-11 ``` - Install [RocksDB](https://github.com/facebook/rocksdb) from source code: - + ```sh git clone -b v6.2.4 https://github.com/facebook/rocksdb.git - + cd rocksdb - + sudo DEBUG_LEVEL=0 make shared_lib install-shared sudo sh -c "echo /usr/local/lib >> /etc/ld.so.conf" - + sudo ldconfig ``` - Build this foreign data wrapper: - + ```sh - git clone https://github.com/whrgogogo666/pg_rocksdb.git - - cd pg_rocksdb - - make USE_PGXS=1 - - sudo make USE_PGXS=1 install + git clone git@github.com:postgrespro/lsm.git + + cd lsm + + make + + sudo make install ``` - Before using this foreign data wrapper, we need to add it to `shared_preload_libraries` in the `postgresql.conf`: - + ```sh - echo "shared_preload_libraries = 'pg_rocksdb'" >> postgresql.conf + echo "shared_preload_libraries = 'lsm'" >> postgresql.conf ``` - + and restart PostgreSQL: - + ```sh sudo service postgresql restart ``` - When uninstall this extension, first issue the following commands, and then delete the data by locating PostgreSQL data folder via `show data_directory;` in PostgreSQL terminal. - + ```sh cd PostgresForeignDataWrapper @@ -90,7 +90,7 @@ We test this foreign data wrapper on Ubuntu Server 18.04 using PostgreSQL-11 tog - ACID relies on the storage engine. -- Data types of Postgres are not natively supported. +- Data types of Postgres are not natively supported. # Usage @@ -98,15 +98,14 @@ This extension does not have any parameter. After creating the extension and cor A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres`' to connect the local postgresql server*): - ``` CREATE DATABASE example; \c example - CREATE EXTENSION pg_rocksdb; - CREATE SERVER pg_rocksdb_server FOREIGN DATA WRAPPER pg_rocksdb_fdw; + CREATE EXTENSION lsm; + CREATE SERVER lsm_server FOREIGN DATA WRAPPER lsm_fdw; - CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER pg_rocksdb_server; + CREATE FOREIGN TABLE student(id INTEGER, name TEXT) SERVER lsm_server; INSERT INTO student VALUES(20757123, 'Rafferty'); SELECT * FROM student; @@ -122,19 +121,17 @@ A simple example is as follows (*you can run '`sudo -u postgres psql -U postgres DROP FOREIGN TABLE student; - DROP SERVER pg_rocksdb_server; - DROP EXTENSION pg_rocksdb_fdw; - + DROP SERVER lsm_server; + DROP EXTENSION lsm_fdw; + \c postgres DROP DATABASE example; - -``` +``` # Testing We have tested certain typical SQL statements and will add more test cases later. The test scripts are in the test/sql folder which are recommended to be placed in a non-root directory. The corresponding results can be found in the test/expected folder. You can run the tests in the following way: - ```sh sudo service postgresql restart @@ -151,12 +148,11 @@ We have tested certain typical SQL statements and will add more test cases later If you want to debug the source code, you may need to start PostgreSQL in the debug mode: - ```sh sudo service postgresql stop sudo -u postgres /usr/lib/postgresql/11/bin/postgres -d 0 -D /var/lib/postgresql/11/main -c config_file=/etc/postgresql/11/main/postgresql.conf -``` +``` # Docker diff --git a/lsm_api.h b/lsm_api.h index 64524f3..e5ab3ad 100644 --- a/lsm_api.h +++ b/lsm_api.h @@ -2,26 +2,27 @@ * This file represents API between client working with Postgres API and C++ server working with RocksDB API. * To avoid collision between C/C++ headers they are sharing just this one header file. */ -#ifndef __LSM_API_H__ -#define __LSM_API_H__ +#ifndef __LSM_API_H__ // 如果没有定义这个宏 +#define __LSM_API_H__ // 就定义下面的宏 #include #include #include #ifdef __cplusplus -extern "C" { +extern "C" { //加上extern "C"后,会指示编译器这部分代码按C语言的进行编译,而不是C++的。 #endif /* * Maximal size of record which can be transfered through client-server protocol and read batch size as well */ -#define LSM_MAX_RECORD_SIZE (64*1024) +// 定义client 和 server之间传送的数据的最大数据量 +#define LSM_MAX_RECORD_SIZE (64*1024) //64KB /* * Name of the directory in $PGDATA */ -#define LSM_FDW_NAME "pg_rocksdb" +#define LSM_FDW_NAME "lsm" //定义fdw的名称为lsm extern int LsmQueueSize; extern bool LsmSync; @@ -42,6 +43,7 @@ typedef enum LsmOpLookup } LsmOperation; +// 定义client和server之间的各种操作 extern void LsmError(char const* message); extern size_t LsmShmemSize(int maxClients); extern void LsmInitialize(void* ctl, int maxClients); diff --git a/lsm_client.cpp b/lsm_client.cpp index 243ab7a..dedb14c 100644 --- a/lsm_client.cpp +++ b/lsm_client.cpp @@ -9,52 +9,52 @@ LsmQueue** queues; size_t LsmShmemSize(int maxClients) { - return (sizeof(LsmQueue) + LsmQueueSize + sizeof(LsmQueue*)) * maxClients; + return (sizeof(LsmQueue) + LsmQueueSize + sizeof(LsmQueue*)) * maxClients; } void LsmInitialize(void* ctl, int maxClients) { - queues = (LsmQueue**)ctl; - char* ptr = (char*)(queues + maxClients); - for (int i = 0; i < maxClients; i++) - { - LsmQueue* queue = (LsmQueue*)ptr; - ptr += sizeof(LsmQueue) + LsmQueueSize; - queue->getPos = 0; - queue->putPos = 0; - queue->writerBlocked = false; - SemInit(&queue->empty, 1, 0); - SemInit(&queue->full, 1, 0); - SemInit(&queue->ready, 1, 0); - queues[i] = queue; - } + queues = (LsmQueue**)ctl; + char* ptr = (char*)(queues + maxClients); + for (int i = 0; i < maxClients; i++) + { + LsmQueue* queue = (LsmQueue*)ptr; + ptr += sizeof(LsmQueue) + LsmQueueSize; + queue->getPos = 0; + queue->putPos = 0; + queue->writerBlocked = false; + SemInit(&queue->empty, 1, 0); + SemInit(&queue->full, 1, 0); + SemInit(&queue->ready, 1, 0); + queues[i] = queue; + } } void LsmAttach(void* ctl) { - queues = (LsmQueue**)ctl; + queues = (LsmQueue**)ctl; } bool LsmDelete(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpDelete; - msg.hdr.rid = rid; - msg.hdr.keySize = keyLen; - msg.hdr.valueSize = 0; - msg.key = key; - queue->put(msg); - if (LsmSync) - { - SemWait(&queue->ready); - return (bool)queue->resp[0]; - } - return true; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpDelete; + msg.hdr.rid = rid; + msg.hdr.keySize = keyLen; + msg.hdr.valueSize = 0; + msg.key = key; + queue->put(msg); + if (LsmSync) + { + SemWait(&queue->ready); + return (bool)queue->resp[0]; + } + return true; } @@ -62,21 +62,21 @@ LsmDelete(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen) bool LsmInsert(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen, char *val, size_t valLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpInsert; - msg.hdr.rid = rid; //外部表oid - msg.hdr.keySize = keyLen; //key的长度 - msg.hdr.valueSize = valLen; //value的长度 - msg.key = key; - msg.value = val; - queue->put(msg); - if (LsmSync) - { - SemWait(&queue->ready); - return (bool)queue->resp[0]; - } - return true; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpInsert; + msg.hdr.rid = rid; //外部表oid + msg.hdr.keySize = keyLen; //key的长度 + msg.hdr.valueSize = valLen; //value的长度 + msg.key = key; + msg.value = val; + queue->put(msg); + if (LsmSync) + { + SemWait(&queue->ready); + return (bool)queue->resp[0]; + } + return true; } @@ -84,61 +84,61 @@ LsmInsert(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen, char *val uint64_t LsmCount(LsmQueueId qid, LsmRelationId rid) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpCount; - msg.hdr.rid = rid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); - SemWait(&queue->ready); - return *(uint64_t*)queue->resp; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpCount; + msg.hdr.rid = rid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); + SemWait(&queue->ready); + return *(uint64_t*)queue->resp; } void LsmCloseCursor(LsmQueueId qid, LsmRelationId rid, LsmCursorId cid) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpCloseCursor; - msg.hdr.rid = rid; - msg.hdr.cid = cid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpCloseCursor; + msg.hdr.rid = rid; + msg.hdr.cid = cid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); } bool LsmReadNext(LsmQueueId qid, LsmRelationId rid, LsmCursorId cid, char *buf, size_t *size) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpFetch; - msg.hdr.rid = rid; - msg.hdr.cid = cid; - msg.hdr.keySize = 0; - msg.hdr.valueSize = 0; - queue->put(msg); - SemWait(&queue->ready); - memcpy(buf, queue->resp, queue->respSize); - *size = queue->respSize; - return *size != 0; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpFetch; + msg.hdr.rid = rid; + msg.hdr.cid = cid; + msg.hdr.keySize = 0; + msg.hdr.valueSize = 0; + queue->put(msg); + SemWait(&queue->ready); + memcpy(buf, queue->resp, queue->respSize); + *size = queue->respSize; + return *size != 0; } bool LsmLookup(LsmQueueId qid, LsmRelationId rid, char *key, size_t keyLen, char *val, size_t *valLen) { - LsmMessage msg; - LsmQueue* queue = queues[qid]; - msg.hdr.op = LsmOpLookup; - msg.hdr.rid = rid; - msg.hdr.keySize = keyLen; - msg.hdr.valueSize = 0; - msg.key = key; - queue->put(msg); - SemWait(&queue->ready); - memcpy(val, queue->resp, queue->respSize); - *valLen = queue->respSize; - return *valLen != 0; + LsmMessage msg; + LsmQueue* queue = queues[qid]; + msg.hdr.op = LsmOpLookup; + msg.hdr.rid = rid; + msg.hdr.keySize = keyLen; + msg.hdr.valueSize = 0; + msg.key = key; + queue->put(msg); + SemWait(&queue->ready); + memcpy(val, queue->resp, queue->respSize); + *valLen = queue->respSize; + return *valLen != 0; } diff --git a/lsm_db.h b/lsm_db.h index c6d7823..48aaefc 100644 --- a/lsm_db.h +++ b/lsm_db.h @@ -10,7 +10,7 @@ using namespace rocksdb; /** * 对Rocksdb的封装 - * + * * / /* @@ -18,20 +18,20 @@ using namespace rocksdb; */ struct LsmConnection { - DB* db; - - void open(char const* path); - uint64_t count(); - void close(); - Iterator* getIterator(); - void releaseIterator(Iterator* iter); - size_t lookup(char const* key, size_t keySize, char* buf); - size_t next(Iterator* iter, char* buf); - bool insert(char* key, size_t keyLen, char* val, size_t valLen); - bool remove(char* key, size_t keyLen); - - LsmConnection() : db(NULL) {} - ~LsmConnection() { close(); } + DB* db; + + void open(char const* path); + uint64_t count(); + void close(); + Iterator* getIterator(); + void releaseIterator(Iterator* iter); + size_t lookup(char const* key, size_t keySize, char* buf); + size_t next(Iterator* iter, char* buf); + bool insert(char* key, size_t keyLen, char* val, size_t valLen); + bool remove(char* key, size_t keyLen); + + LsmConnection() : db(NULL) {} + ~LsmConnection() { close(); } }; /* @@ -39,11 +39,11 @@ struct LsmConnection */ struct LsmMessageHeader { - LsmOperation op; - uint32_t keySize; - uint32_t valueSize; - LsmRelationId rid; - LsmCursorId cid; + LsmOperation op; + uint32_t keySize; + uint32_t valueSize; + LsmRelationId rid; + LsmCursorId cid; }; /* @@ -52,9 +52,9 @@ struct LsmMessageHeader // 用于包装传输的数据 struct LsmMessage { - LsmMessageHeader hdr; - char* key; - char* value; + LsmMessageHeader hdr; + char* key; + char* value; }; /* @@ -63,31 +63,31 @@ struct LsmMessage // 用于将数据传送到rocksdb中 struct LsmQueue { - // ring buffer 环状缓冲区 - volatile int getPos; // get position in ring buffer (updated only by consumer) - volatile int putPos; // put position in ring buffer (updated only by producer) - volatile int respSize; // response size - volatile int writerBlocked; // producer is blocked because queue is full - volatile int terminate;// worker receives termination request - sem_t empty; // semaphore to wait until queue is not empty - sem_t full; // semaphore to wait until queue is not full - sem_t ready; // semaphore to wait response from server - char resp[LSM_MAX_RECORD_SIZE]; // response data - char req[1]; // ring buffer (LsmQueueSize long) - - void put(LsmMessage const& msg); - void get(char* buf, LsmMessage& msg); - void next(LsmMessage const& msg); - - LsmQueue() : getPos(0), putPos(0), respSize(0), writerBlocked(false) {} + // ring buffer 环状缓冲区 + volatile int getPos; // get position in ring buffer (updated only by consumer) + volatile int putPos; // put position in ring buffer (updated only by producer) + volatile int respSize; // response size + volatile int writerBlocked; // producer is blocked because queue is full + volatile int terminate;// worker receives termination request + sem_t empty; // semaphore to wait until queue is not empty + sem_t full; // semaphore to wait until queue is not full + sem_t ready; // semaphore to wait response from server + char resp[LSM_MAX_RECORD_SIZE]; // response data + char req[1]; // ring buffer (LsmQueueSize long) + + void put(LsmMessage const& msg); + void get(char* buf, LsmMessage& msg); + void next(LsmMessage const& msg); + + LsmQueue() : getPos(0), putPos(0), respSize(0), writerBlocked(false) {} }; struct LsmCursor { - LsmConnection* con; - Iterator* iter; + LsmConnection* con; + Iterator* iter; - LsmCursor() : con(NULL), iter(NULL) {} + LsmCursor() : con(NULL), iter(NULL) {} }; struct LsmServer; @@ -96,86 +96,86 @@ struct LsmServer; // 主要封装了对lsm的操作,也就是lsmServer中的一个工作进程 struct LsmWorker { - std::map cursors; - LsmServer* server; // 一个server有很多的worker - LsmQueue* queue; //一个worker对应一个queue - pthread_t thread; + std::map cursors; + LsmServer* server; // 一个server有很多的worker + LsmQueue* queue; //一个worker对应一个queue + pthread_t thread; - LsmWorker(LsmServer* s, LsmQueue* q) : server(s), queue(q) {} + LsmWorker(LsmServer* s, LsmQueue* q) : server(s), queue(q) {} - void start(); - void stop(); - void run(); - void wait(); + void start(); + void stop(); + void run(); + void wait(); -private: - LsmConnection& open(LsmMessage const& msg); + private: + LsmConnection& open(LsmMessage const& msg); - void insert(LsmMessage const& msg); - void remove(LsmMessage const& msg); - void closeCursor(LsmMessage const& msg); - void fetch(LsmMessage const& msg); - void count(LsmMessage const& msg); - void lookup(LsmMessage const& msg); + void insert(LsmMessage const& msg); + void remove(LsmMessage const& msg); + void closeCursor(LsmMessage const& msg); + void fetch(LsmMessage const& msg); + void count(LsmMessage const& msg); + void lookup(LsmMessage const& msg); - static void* main(void* arg); + static void* main(void* arg); }; class Mutex { - pthread_mutex_t mutex; -public: - Mutex() - { - PthreadMutexInit(&mutex); - } - - ~Mutex() - { - PthreadMutexDestroy(&mutex); - } - - void lock() - { - PthreadMutexLock(&mutex); - } - - void unlock() - { - PthreadMutexUnlock(&mutex); - } + pthread_mutex_t mutex; + public: + Mutex() + { + PthreadMutexInit(&mutex); + } + + ~Mutex() + { + PthreadMutexDestroy(&mutex); + } + + void lock() + { + PthreadMutexLock(&mutex); + } + + void unlock() + { + PthreadMutexUnlock(&mutex); + } }; class CriticalSection { - Mutex& mutex; -public: - CriticalSection(Mutex& m) : mutex(m) - { - mutex.lock(); - } - ~CriticalSection() - { - mutex.unlock(); - } + Mutex& mutex; + public: + CriticalSection(Mutex& m) : mutex(m) + { + mutex.lock(); + } + ~CriticalSection() + { + mutex.unlock(); + } }; // 此结构体包含了很多对rocksdb的操作,非常重要,是一个对于rocksdb封装的最大对象,包含了很多的LsmWorker struct LsmServer { - LsmWorker** workers; - size_t nWorkers; - Mutex mutex; - std::map connections; - - void start(); - void wait(); - void stop(); - - LsmConnection& open(LsmMessage const& msg); - LsmServer(size_t maxClients); - ~LsmServer(); + LsmWorker** workers; + size_t nWorkers; + Mutex mutex; + std::map connections; + + void start(); + void wait(); + void stop(); + + LsmConnection& open(LsmMessage const& msg); + LsmServer(size_t maxClients); + ~LsmServer(); }; extern LsmQueue** queues; diff --git a/lsm_fdw.c b/lsm_fdw.c index c9d4c03..82ad48f 100644 --- a/lsm_fdw.c +++ b/lsm_fdw.c @@ -32,7 +32,7 @@ PG_MODULE_MAGIC; #endif -PG_FUNCTION_INFO_V1(pg_rocksdb_fdw_handler); +PG_FUNCTION_INFO_V1(lsm_fdw_handler); /* @@ -46,8 +46,8 @@ foreigntableid是外部表在pg_class中的 OID (foreigntableid可以从规划 // baserel 是planner中关于外部表格的信息 static void GetForeignRelSize(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId) + RelOptInfo *baserel, + Oid foreignTableId) { /* * Obtain relation size estimates for a foreign table. This is called at @@ -81,8 +81,8 @@ GetForeignRelSize(PlannerInfo *root, // 创建一个扫描外部表的访问路径 static void GetForeignPaths(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId) + RelOptInfo *baserel, + Oid foreignTableId) { /* * Create possible access paths for a scan on a foreign table. This is @@ -99,7 +99,7 @@ GetForeignPaths(PlannerInfo *root, * that is needed to identify the specific scan method intended. */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); Cost startupCost = 0; Cost totalCost = startupCost + baserel->rows; @@ -123,12 +123,12 @@ GetForeignPaths(PlannerInfo *root, // 创建一个ForeignScan 计划的节点,从选择的外部acess path中创建 static ForeignScan* GetForeignPlan(PlannerInfo *root, - RelOptInfo *baserel, - Oid foreignTableId, - ForeignPath *bestPath, - List *targetList, - List *scanClauses, // - Plan *outerPlan) + RelOptInfo *baserel, + Oid foreignTableId, + ForeignPath *bestPath, + List *targetList, + List *scanClauses, // + Plan *outerPlan) { /* * Create a ForeignScan plan node from the selected foreign access path. @@ -168,9 +168,9 @@ GetForeignPlan(PlannerInfo *root, static void GetKeyBasedQual(ForeignScanState *scanState, - Node *node, - Relation relation, - TableReadState *readState) + Node *node, + Relation relation, + TableReadState *readState) { if (!node || !IsA(node, OpExpr)) { return; @@ -187,9 +187,9 @@ GetKeyBasedQual(ForeignScanState *scanState, } Node *right = list_nth(op->args, 1); - if (IsA(right, RelabelType)) { - right = (Node*) ((RelabelType*)right)->arg; - } + if (IsA(right, RelabelType)) { + right = (Node*) ((RelabelType*)right)->arg; + } if (!IsA(right, Const) && !IsA(right, Param)) { return; } @@ -215,20 +215,20 @@ GetKeyBasedQual(ForeignScanState *scanState, ReleaseSysCache(opertup); Datum keyDatum; - Oid keyType; + Oid keyType; - if (IsA(right, Const)) - { - Const *constNode = (Const *) right; - keyDatum = constNode->constvalue; - keyType = constNode->consttype; - } - else - { - Param *paramNode = (Param *) right; - keyType = paramNode->paramtype; - keyDatum = scanState->ss.ps.state->es_param_list_info->params[paramNode->paramid-1].value; - } + if (IsA(right, Const)) + { + Const *constNode = (Const *) right; + keyDatum = constNode->constvalue; + keyType = constNode->consttype; + } + else + { + Param *paramNode = (Param *) right; + keyType = paramNode->paramtype; + keyDatum = scanState->ss.ps.state->es_param_list_info->params[paramNode->paramid-1].value; + } TypeCacheEntry *typeEntry = lookup_type_cache(keyType, 0); /* constant gets varlena with 4B header, same with copy uility */ @@ -251,10 +251,10 @@ GetKeyBasedQual(ForeignScanState *scanState, // 开始执行外部表格的扫描 static void -BeginForeignScan(ForeignScanState *scanState, - int executorFlags) +BeginForeignScan(ForeignScanState *scanState, + int executorFlags) { - static LsmCursorId operationId = 0; /* a SQL might cause multiple scans */ + static LsmCursorId operationId = 0; /* a SQL might cause multiple scans */ /* * Begin executing a foreign scan. This is called during executor startup. @@ -275,7 +275,7 @@ BeginForeignScan(ForeignScanState *scanState, * */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); TableReadState *readState = palloc0(sizeof(TableReadState)); readState->isKeyBased = false; @@ -295,7 +295,7 @@ BeginForeignScan(ForeignScanState *scanState, foreach (lc, scanState->ss.ps.plan->qual) { Expr *state = lfirst(lc); GetKeyBasedQual(scanState, - (Node *) state, + (Node *) state, scanState->ss.ss_currentRelation, readState); if (readState->isKeyBased) { @@ -304,14 +304,14 @@ BeginForeignScan(ForeignScanState *scanState, } if (!readState->isKeyBased) - { + { Oid relationId = RelationGetRelid(scanState->ss.ss_currentRelation); readState->hasNext = LsmReadNext(MyBackendId, - relationId, - ++operationId, - readState->buf, - &readState->bufLen); + relationId, + ++operationId, + readState->buf, + &readState->bufLen); readState->next = readState->buf; readState->operationId = operationId; @@ -320,8 +320,8 @@ BeginForeignScan(ForeignScanState *scanState, static void DeserializeTuple(StringInfo key, - StringInfo val, - TupleTableSlot *tupleSlot) + StringInfo val, + TupleTableSlot *tupleSlot) { Datum *values = tupleSlot->tts_values; @@ -337,9 +337,9 @@ DeserializeTuple(StringInfo key, int offset = 0; char *current = key->data; for (int index = 0; index < count; index++) - { + { if (index > 0) - { + { uint64 dataLen = 0; uint8 headerLen = DecodeVarintLength(current, val->data + val->len, @@ -347,7 +347,7 @@ DeserializeTuple(StringInfo key, offset += headerLen; current = val->data + offset; if (dataLen == 0) - { + { nulls[index] = true; continue; } @@ -369,24 +369,24 @@ DeserializeTuple(StringInfo key, static bool GetNextFromBatch(Oid relationId, - TableReadState *readState, - char **key, - size_t *keyLen, - char **val, - size_t *valLen) + TableReadState *readState, + char **key, + size_t *keyLen, + char **val, + size_t *valLen) { bool found = false; if (readState->next < readState->buf + readState->bufLen) - { + { found = true; } - else if (readState->hasNext) - { + else if (readState->hasNext) + { readState->hasNext = LsmReadNext(MyBackendId, - relationId, - readState->operationId, - readState->buf, - &readState->bufLen); + relationId, + readState->operationId, + readState->buf, + &readState->bufLen); readState->next = readState->buf; if (readState->bufLen > 0) { @@ -395,15 +395,15 @@ GetNextFromBatch(Oid relationId, } if (found) { - int len; + int len; memcpy(&len, readState->next, sizeof(len)); - *keyLen = len; + *keyLen = len; readState->next += sizeof(len); *key = readState->next; readState->next += len; memcpy(&len, readState->next, sizeof(len)); - *valLen = len; + *valLen = len; readState->next += sizeof(len); *val = readState->next; readState->next += len; @@ -439,7 +439,7 @@ IterateForeignScan(ForeignScanState *scanState) * (just as you would need to do in the case of a data type mismatch). */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); TupleTableSlot *tupleSlot = scanState->ss.ss_ScanTupleSlot; ExecClearTuple(tupleSlot); @@ -454,12 +454,12 @@ IterateForeignScan(ForeignScanState *scanState) k = readState->key->data; kLen = readState->key->len; found = LsmLookup(MyBackendId, relationId, k, kLen, readState->buf, &vLen); - v = readState->buf; + v = readState->buf; readState->done = true; } } - else - { + else + { found = GetNextFromBatch(relationId, readState, &k, @@ -469,11 +469,11 @@ IterateForeignScan(ForeignScanState *scanState) } if (found) - { + { StringInfoData key; StringInfoData val; - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); appendBinaryStringInfo(&key, k, kLen); appendBinaryStringInfo(&val, v, vLen); @@ -493,7 +493,7 @@ ReScanForeignScan(ForeignScanState *scanState) * return exactly the same rows. */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); } static void @@ -505,14 +505,14 @@ EndForeignScan(ForeignScanState *scanState) * remote servers should be cleaned up. */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); TableReadState *readState = (TableReadState *) scanState->fdw_state; Assert(readState); Oid relationId = RelationGetRelid(scanState->ss.ss_currentRelation); if (!readState->isKeyBased) - { + { LsmCloseCursor(MyBackendId, relationId, readState->operationId); } @@ -521,8 +521,8 @@ EndForeignScan(ForeignScanState *scanState) static void AddForeignUpdateTargets(Query *parsetree, - RangeTblEntry *tableEntry, - Relation targetRelation) + RangeTblEntry *tableEntry, + Relation targetRelation) { /* * UPDATE and DELETE operations are performed against rows previously @@ -551,7 +551,7 @@ AddForeignUpdateTargets(Query *parsetree, * relies on an unchanging primary key to identify rows.) */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); /* * We are using first column as row identification column, so we are adding @@ -581,9 +581,9 @@ AddForeignUpdateTargets(Query *parsetree, static List* PlanForeignModify(PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplanIndex) + ModifyTable *plan, + Index resultRelation, + int subplanIndex) { /* * Perform any additional planning actions needed for an insert, update, @@ -605,7 +605,7 @@ PlanForeignModify(PlannerInfo *root, * BeginForeignModify will be NIL. */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); return NULL; } @@ -613,10 +613,10 @@ PlanForeignModify(PlannerInfo *root, // 开始执行一个外部表的修改,比如curd static void BeginForeignModify(ModifyTableState *modifyTableState, - ResultRelInfo *resultRelInfo, - List *fdwPrivate, - int subplanIndex, - int executorFlags) + ResultRelInfo *resultRelInfo, + List *fdwPrivate, + int subplanIndex, + int executorFlags) { /* * Begin executing a foreign table modification operation. This routine is @@ -644,7 +644,7 @@ BeginForeignModify(ModifyTableState *modifyTableState, * during executor startup. */ - ereport(DEBUG1, (errmsg("pg_rocksdb: entering function %s", __func__))); + ereport(DEBUG1, (errmsg("LSM: entering function %s", __func__))); if (executorFlags & EXEC_FLAG_EXPLAIN_ONLY) { return; @@ -667,8 +667,8 @@ BeginForeignModify(ModifyTableState *modifyTableState, // 将slot序列化为key和val进行插入操作 static void SerializeTuple(StringInfo key, - StringInfo val, - TupleTableSlot *tupleSlot) + StringInfo val, + TupleTableSlot *tupleSlot) { TupleDesc tupleDescriptor = tupleSlot->tts_tupleDescriptor; int count = tupleDescriptor->natts; @@ -679,7 +679,7 @@ SerializeTuple(StringInfo key, if (tupleSlot->tts_isnull[index]) { if (index == 0) { // 元组的第一个属性为空 - ereport(ERROR, (errmsg("pg_rocksdb: first column cannot be null!"))); + ereport(ERROR, (errmsg("LSM: first column cannot be null!"))); } SerializeNullAttribute(tupleDescriptor, index, val); @@ -697,17 +697,17 @@ SerializeTuple(StringInfo key, // 插入一个tuple到外部表中 // slot : 槽 // resultRelInfo 用于描述外部表 -// slot -// planSlot +// slot +// planSlot static TupleTableSlot* ExecForeignInsert(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { /** - * + * * 执行器机制被用于四种基本SQL查询类型:SELECT、INSERT、 UPDATE以及DELETE。对于SELECT, * 顶层执行器代码只需要发送查询计划树返回的每个行给客户端。 * 对于INSERT,每一个被返回的行被插入到INSERT中指定的目标表中。 @@ -752,7 +752,7 @@ ExecForeignInsert(EState *executorState, TupleDesc tupleDescriptor = slot->tts_tupleDescriptor; #if PG_VERSION_NUM>=130000 - bool shouldFree; + bool shouldFree; HeapTuple heapTuple = ExecFetchSlotHeapTuple(slot, false, &shouldFree); if (HeapTupleHasExternal(heapTuple)) { @@ -775,20 +775,20 @@ ExecForeignInsert(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); SerializeTuple(&key, &val, slot); // 调用lsm_client中的接口进行插入 if (!LsmInsert(MyBackendId, foreignTableId, key.data, key.len, val.data, val.len)) - elog(ERROR, "LSM: Failed to insert tuple"); + elog(ERROR, "LSM: Failed to insert tuple"); #if PG_VERSION_NUM>=130000 - if (shouldFree) + if (shouldFree) pfree(heapTuple); #endif - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); return slot; } @@ -797,9 +797,9 @@ ExecForeignInsert(EState *executorState, // 执行外部数据更新 static TupleTableSlot* ExecForeignUpdate(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { /* * Update one tuple in the foreign table. executorState is global execution @@ -832,7 +832,7 @@ ExecForeignUpdate(EState *executorState, TupleDesc tupleDescriptor = slot->tts_tupleDescriptor; #if PG_VERSION_NUM>=130000 - bool shouldFree; + bool shouldFree; HeapTuple heapTuple = ExecFetchSlotHeapTuple(slot, false, &shouldFree); if (HeapTupleHasExternal(heapTuple)) { @@ -843,7 +843,7 @@ ExecForeignUpdate(EState *executorState, } #else if (HeapTupleHasExternal(slot->tts_tuple)) - { + { /* detoast any toasted attributes */ slot->tts_tuple = toast_flatten_tuple(slot->tts_tuple, tupleDescriptor); } @@ -856,28 +856,28 @@ ExecForeignUpdate(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); // 将slot序列化为key 和 val SerializeTuple(&key, &val, slot); // 将获取到的key和val进行insert操作 LsmInsert(MyBackendId, foreignTableId, key.data, key.len, val.data, val.len); #if PG_VERSION_NUM>=130000 - if (shouldFree) + if (shouldFree) pfree(heapTuple); #endif - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); return slot; } static TupleTableSlot* ExecForeignDelete(EState *executorState, - ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot) + ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot) { /* * Delete one tuple from the foreign table. executorState is global @@ -913,22 +913,22 @@ ExecForeignDelete(EState *executorState, Relation relation = resultRelInfo->ri_RelationDesc; Oid foreignTableId = RelationGetRelid(relation); - initStringInfo(&key); - initStringInfo(&val); + initStringInfo(&key); + initStringInfo(&val); SerializeTuple(&key, &val, planSlot); if (!LsmDelete(MyBackendId, foreignTableId, key.data, key.len)) - elog(ERROR, "LSM: Failed to delete tuple"); + elog(ERROR, "LSM: Failed to delete tuple"); - pfree(key.data); - pfree(val.data); + pfree(key.data); + pfree(val.data); - return slot; + return slot; } static void EndForeignModify(EState *executorState, - ResultRelInfo *resultRelInfo) + ResultRelInfo *resultRelInfo) { /* * End the table update and release resources. It is normally not important @@ -955,7 +955,7 @@ EndForeignModify(EState *executorState, static void ExplainForeignScan(ForeignScanState *scanState, - struct ExplainState * explainState) + struct ExplainState * explainState) { /* * Print additional EXPLAIN output for a foreign table scan. This function @@ -973,10 +973,10 @@ ExplainForeignScan(ForeignScanState *scanState, static void ExplainForeignModify(ModifyTableState *modifyTableState, - ResultRelInfo *relationInfo, - List *fdwPrivate, - int subplanIndex, - struct ExplainState *explainState) + ResultRelInfo *relationInfo, + List *fdwPrivate, + int subplanIndex, + struct ExplainState *explainState) { /* * Print additional EXPLAIN output for a foreign table update. This @@ -995,8 +995,8 @@ ExplainForeignModify(ModifyTableState *modifyTableState, static bool AnalyzeForeignTable(Relation relation, - AcquireSampleRowsFunc *acquireSampleRowsFunc, - BlockNumber *totalPageCount) + AcquireSampleRowsFunc *acquireSampleRowsFunc, + BlockNumber *totalPageCount) { /* ---- * This function is called when ANALYZE is executed on a foreign table. If @@ -1032,7 +1032,7 @@ AnalyzeForeignTable(Relation relation, // 文档:https://www.postgresql.org/docs/9.6/fdwhandler.html // 这是整个fdw程序的入口 -Datum pg_rocksdb_fdw_handler(PG_FUNCTION_ARGS) +Datum lsm_fdw_handler(PG_FUNCTION_ARGS) { //FDW 处理函数返回一个 palloc 的FdwRoutine结构,其中包含指向下面描述的回调函数的指针。 //FdwRoutine结构类型在src/include/foreign/fdwapi.h中声明 diff --git a/lsm_fdw.h b/lsm_fdw.h index 1233a4f..347c1c7 100644 --- a/lsm_fdw.h +++ b/lsm_fdw.h @@ -26,7 +26,7 @@ typedef struct TableReadState { size_t bufLen; /* shared mem length, no next batch if it is 0 */ char *next; /* pointer to the next data entry for IterateForeignScan */ bool hasNext; /* whether a next batch from RangeQuery or ReadBatch*/ - char buf[LSM_MAX_RECORD_SIZE]; + char buf[LSM_MAX_RECORD_SIZE]; } TableReadState; /* diff --git a/lsm_posix.h b/lsm_posix.h index 3999e09..5d219a1 100644 --- a/lsm_posix.h +++ b/lsm_posix.h @@ -10,6 +10,9 @@ extern "C" { #endif + +// 此文件定义了大量关于线程的操作 + #define PthreadCreate(t,attr,start,arg) PthreadCreate_(t,attr,start,arg,__func__) #define PthreadJoin(t,exitcode) PthreadJoin_(t,exitcode,__func__) #define SemInit(sem,shared,value) SemInit_(sem,shared,value,__func__) diff --git a/lsm_server.cpp b/lsm_server.cpp index 69da341..b81e4cf 100644 --- a/lsm_server.cpp +++ b/lsm_server.cpp @@ -17,78 +17,78 @@ bool LsmUpsert; // 传入的参数为LsmMessage,将LsmMessage中的数据存入到数据库中 void LsmQueue::put(LsmMessage const& msg) { - int size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; - - if (size > LsmQueueSize) - LsmError("Message is too long"); - - while (true) - { - int getPos = this->getPos; - int putPos = this->putPos; - int available = putPos >= getPos ? LsmQueueSize - putPos + getPos : getPos - putPos; - - if (size >= available) /* queue overflow? */ - { - if (!writerBlocked) - { - writerBlocked = true; - LsmMemoryBarrier(); - // Enforce "writeBlocked" flag to be visible by consumer and retry availability check - } - else - { - SemWait(&full); - } - continue; - } - size_t tail = LsmQueueSize - putPos; - - // Copy header - if (tail <= sizeof(LsmMessageHeader)) - { - memcpy(&req[putPos], &msg, tail); - memcpy(&req[0], (char*)&msg + tail, sizeof(LsmMessageHeader) - tail); - putPos = sizeof(LsmMessageHeader) - tail; - } - else - { - memcpy(&req[putPos], &msg, sizeof(LsmMessageHeader)); - putPos += sizeof(LsmMessageHeader); - } - tail = LsmQueueSize - putPos; - - // Copy key - if (tail <= msg.hdr.keySize) - { - // C 库函数 void *memcpy(void *str1, const void *str2, size_t n) 从存储区 str2 复制 n 个字节到存储区 str1。 - memcpy(&req[putPos], msg.key, tail); - memcpy(&req[0], msg.key + tail, msg.hdr.keySize - tail); - putPos = msg.hdr.keySize - tail; - } - else - { - memcpy(&req[putPos], msg.key, msg.hdr.keySize); - putPos += msg.hdr.keySize; - } - tail = LsmQueueSize - putPos; - - // Copy value - if (tail <= msg.hdr.valueSize) - { - memcpy(&req[putPos], msg.value, tail); - memcpy(&req[0], msg.value + tail, msg.hdr.valueSize - tail); - putPos = msg.hdr.valueSize - tail; - } - else - { - memcpy(&req[putPos], msg.value, msg.hdr.valueSize); - putPos += msg.hdr.valueSize; - } - this->putPos = putPos; - SemPost(&empty); // Enforce write barrier and notify consumer - return; - } + int size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; + + if (size > LsmQueueSize) + LsmError("Message is too long"); + + while (true) + { + int getPos = this->getPos; + int putPos = this->putPos; + int available = putPos >= getPos ? LsmQueueSize - putPos + getPos : getPos - putPos; + + if (size >= available) /* queue overflow? */ + { + if (!writerBlocked) + { + writerBlocked = true; + LsmMemoryBarrier(); + // Enforce "writeBlocked" flag to be visible by consumer and retry availability check + } + else + { + SemWait(&full); + } + continue; + } + size_t tail = LsmQueueSize - putPos; + + // Copy header + if (tail <= sizeof(LsmMessageHeader)) + { + memcpy(&req[putPos], &msg, tail); + memcpy(&req[0], (char*)&msg + tail, sizeof(LsmMessageHeader) - tail); + putPos = sizeof(LsmMessageHeader) - tail; + } + else + { + memcpy(&req[putPos], &msg, sizeof(LsmMessageHeader)); + putPos += sizeof(LsmMessageHeader); + } + tail = LsmQueueSize - putPos; + + // Copy key + if (tail <= msg.hdr.keySize) + { + // C 库函数 void *memcpy(void *str1, const void *str2, size_t n) 从存储区 str2 复制 n 个字节到存储区 str1。 + memcpy(&req[putPos], msg.key, tail); + memcpy(&req[0], msg.key + tail, msg.hdr.keySize - tail); + putPos = msg.hdr.keySize - tail; + } + else + { + memcpy(&req[putPos], msg.key, msg.hdr.keySize); + putPos += msg.hdr.keySize; + } + tail = LsmQueueSize - putPos; + + // Copy value + if (tail <= msg.hdr.valueSize) + { + memcpy(&req[putPos], msg.value, tail); + memcpy(&req[0], msg.value + tail, msg.hdr.valueSize - tail); + putPos = msg.hdr.valueSize - tail; + } + else + { + memcpy(&req[putPos], msg.value, msg.hdr.valueSize); + putPos += msg.hdr.valueSize; + } + this->putPos = putPos; + SemPost(&empty); // Enforce write barrier and notify consumer + return; + } } /* @@ -98,70 +98,70 @@ void LsmQueue::put(LsmMessage const& msg) */ void LsmQueue::get(char* buf, LsmMessage& msg) { - // Wait until queue is not empty. - // We are not comparing getPos with putPos before waiting semaphore to make sure that writer barrier enforced by SemPost - // makes all data written by producer visible for consumer. - SemWait(&empty); - - if (terminate) - { - msg.hdr.op = LsmOpTerminate; - return; - } - - int getPos = this->getPos; - int putPos = this->putPos; - - if (putPos == getPos) - LsmError("Queue race condition!"); - - size_t tail = LsmQueueSize - getPos; - - // Copy header - if (tail <= sizeof(LsmMessageHeader)) - { - memcpy(&msg, &req[getPos], tail); - memcpy((char*)&msg + tail, &req[0], sizeof(LsmMessageHeader) - tail); - getPos = sizeof(LsmMessageHeader) - tail; - } - else - { - memcpy(&msg, &req[getPos], sizeof(LsmMessageHeader)); - getPos += sizeof(LsmMessageHeader); - } - tail = LsmQueueSize - getPos; - - // Copy key - if (tail < msg.hdr.keySize) - { - memcpy(buf, &req[getPos], tail); - memcpy(buf + tail, &req[0], msg.hdr.keySize - tail); - getPos = msg.hdr.keySize - tail; - msg.key = buf; - buf += msg.hdr.keySize; - } - else - { - msg.key = &req[getPos]; - getPos += msg.hdr.keySize; - if (getPos == LsmQueueSize) - { - getPos = 0; - } - } - tail = LsmQueueSize - getPos; - - // Copy value - if (tail < msg.hdr.valueSize) - { - memcpy(buf, &req[getPos], tail); - memcpy(buf + tail, &req[0], msg.hdr.valueSize - tail); - msg.value = buf; - } - else - { - msg.value = &req[getPos]; - } + // Wait until queue is not empty. + // We are not comparing getPos with putPos before waiting semaphore to make sure that writer barrier enforced by SemPost + // makes all data written by producer visible for consumer. + SemWait(&empty); + + if (terminate) + { + msg.hdr.op = LsmOpTerminate; + return; + } + + int getPos = this->getPos; + int putPos = this->putPos; + + if (putPos == getPos) + LsmError("Queue race condition!"); + + size_t tail = LsmQueueSize - getPos; + + // Copy header + if (tail <= sizeof(LsmMessageHeader)) + { + memcpy(&msg, &req[getPos], tail); + memcpy((char*)&msg + tail, &req[0], sizeof(LsmMessageHeader) - tail); + getPos = sizeof(LsmMessageHeader) - tail; + } + else + { + memcpy(&msg, &req[getPos], sizeof(LsmMessageHeader)); + getPos += sizeof(LsmMessageHeader); + } + tail = LsmQueueSize - getPos; + + // Copy key + if (tail < msg.hdr.keySize) + { + memcpy(buf, &req[getPos], tail); + memcpy(buf + tail, &req[0], msg.hdr.keySize - tail); + getPos = msg.hdr.keySize - tail; + msg.key = buf; + buf += msg.hdr.keySize; + } + else + { + msg.key = &req[getPos]; + getPos += msg.hdr.keySize; + if (getPos == LsmQueueSize) + { + getPos = 0; + } + } + tail = LsmQueueSize - getPos; + + // Copy value + if (tail < msg.hdr.valueSize) + { + memcpy(buf, &req[getPos], tail); + memcpy(buf + tail, &req[0], msg.hdr.valueSize - tail); + msg.value = buf; + } + else + { + msg.value = &req[getPos]; + } } /* @@ -169,23 +169,23 @@ void LsmQueue::get(char* buf, LsmMessage& msg) */ void LsmQueue::next(LsmMessage const& msg) { - int getPos = this->getPos; - bool writerBlocked = this->writerBlocked; - size_t size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; - size_t tail = LsmQueueSize - getPos; - this->getPos = (tail <= size) ? size - tail : getPos + size; - if (writerBlocked) - { - // Notify consumer that some more free space is avaialble in ring buffer - this->writerBlocked = false; - SemPost(&full); - } + int getPos = this->getPos; + bool writerBlocked = this->writerBlocked; + size_t size = sizeof(LsmMessageHeader) + msg.hdr.keySize + msg.hdr.valueSize; + size_t tail = LsmQueueSize - getPos; + this->getPos = (tail <= size) ? size - tail : getPos + size; + if (writerBlocked) + { + // Notify consumer that some more free space is avaialble in ring buffer + this->writerBlocked = false; + SemPost(&full); + } } inline LsmConnection& LsmWorker::open(LsmMessage const& msg) { - return server->open(msg); + return server->open(msg); } /* @@ -194,11 +194,11 @@ LsmWorker::open(LsmMessage const& msg) void LsmWorker::insert(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - // 通过调用con插入key,value数据,resp为插入之后的返回值 - queue->resp[0] = (char)con.insert(msg.key, msg.hdr.keySize, msg.value, msg.hdr.valueSize); - if (LsmSync) - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + // 通过调用con插入key,value数据,resp为插入之后的返回值 + queue->resp[0] = (char)con.insert(msg.key, msg.hdr.keySize, msg.value, msg.hdr.valueSize); + if (LsmSync) + SemPost(&queue->ready); } /* @@ -207,10 +207,10 @@ LsmWorker::insert(LsmMessage const& msg) void LsmWorker::remove(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - queue->resp[0] = (char)con.remove(msg.key, msg.hdr.keySize); - if (LsmSync) - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + queue->resp[0] = (char)con.remove(msg.key, msg.hdr.keySize); + if (LsmSync) + SemPost(&queue->ready); } /* @@ -219,10 +219,10 @@ LsmWorker::remove(LsmMessage const& msg) void LsmWorker::count(LsmMessage const& msg) { - LsmConnection& con(open(msg)); - uint64_t count = con.count(); - memcpy(queue->resp, &count, sizeof(count)); - SemPost(&queue->ready); + LsmConnection& con(open(msg)); + uint64_t count = con.count(); + memcpy(queue->resp, &count, sizeof(count)); + SemPost(&queue->ready); } /* @@ -231,9 +231,9 @@ LsmWorker::count(LsmMessage const& msg) void LsmWorker::closeCursor(LsmMessage const& msg) { - LsmCursor& csr(cursors[msg.hdr.cid]); - csr.con->releaseIterator(csr.iter); - cursors.erase(msg.hdr.cid); + LsmCursor& csr(cursors[msg.hdr.cid]); + csr.con->releaseIterator(csr.iter); + cursors.erase(msg.hdr.cid); } /* @@ -242,9 +242,9 @@ LsmWorker::closeCursor(LsmMessage const& msg) void LsmWorker::lookup(LsmMessage const& msg) { - LsmConnection& con(open(msg)); + LsmConnection& con(open(msg)); queue->respSize = con.lookup(msg.key, msg.hdr.keySize, queue->resp); - SemPost(&queue->ready); + SemPost(&queue->ready); } /* @@ -253,14 +253,14 @@ LsmWorker::lookup(LsmMessage const& msg) void LsmWorker::fetch(LsmMessage const& msg) { - LsmCursor& csr(cursors[msg.hdr.cid]); - if (!csr.con) - { - csr.con = &open(msg); - csr.iter = csr.con->getIterator(); - } + LsmCursor& csr(cursors[msg.hdr.cid]); + if (!csr.con) + { + csr.con = &open(msg); + csr.iter = csr.con->getIterator(); + } queue->respSize = csr.con->next(csr.iter, queue->resp); - SemPost(&queue->ready); + SemPost(&queue->ready); } @@ -272,39 +272,39 @@ LsmWorker::fetch(LsmMessage const& msg) void LsmWorker::run() { - // 一直处于监听情况下 - while (true) - { - LsmMessage msg; - char buf[LSM_MAX_RECORD_SIZE]; - queue->get(buf, msg); + // 一直处于监听情况下 + while (true) + { + LsmMessage msg; + char buf[LSM_MAX_RECORD_SIZE]; + queue->get(buf, msg); switch (msg.hdr.op) { - case LsmOpTerminate: - return; - case LsmOpCount: - count(msg); - break; - case LsmOpCloseCursor: - closeCursor(msg); - break; - case LsmOpFetch: - fetch(msg); - break; - case LsmOpLookup: - lookup(msg); - break; - case LsmOpInsert: - insert(msg); - break; - case LsmOpDelete: - remove(msg); - break; - default: - assert(false); + case LsmOpTerminate: + return; + case LsmOpCount: + count(msg); + break; + case LsmOpCloseCursor: + closeCursor(msg); + break; + case LsmOpFetch: + fetch(msg); + break; + case LsmOpLookup: + lookup(msg); + break; + case LsmOpInsert: + insert(msg); + break; + case LsmOpDelete: + remove(msg); + break; + default: + assert(false); } - queue->next(msg); - } + queue->next(msg); + } } @@ -313,21 +313,21 @@ LsmWorker::run() void LsmWorker::start() { - PthreadCreate(&thread, NULL, LsmWorker::main, this); + PthreadCreate(&thread, NULL, LsmWorker::main, this); } void LsmWorker::stop() { - queue->terminate = true; - SemPost(&queue->empty); + queue->terminate = true; + SemPost(&queue->empty); } void LsmWorker::wait() { - void* status; - PthreadJoin(thread, &status); + void* status; + PthreadJoin(thread, &status); } @@ -335,8 +335,8 @@ LsmWorker::wait() void* LsmWorker::main(void* arg) { - ((LsmWorker*)arg)->run(); - return NULL; + ((LsmWorker*)arg)->run(); + return NULL; } /* @@ -346,10 +346,10 @@ LsmWorker::main(void* arg) void LsmRunWorkers(int maxClients) { - server = new LsmServer(maxClients); - server->start(); - server->wait(); - delete server; + server = new LsmServer(maxClients); + server->start(); + server->wait(); + delete server; } /* @@ -358,55 +358,55 @@ LsmRunWorkers(int maxClients) void LsmStopWorkers(void) { - server->stop(); + server->stop(); } // 封装了对LsmWorker的操作,也就是当外部数据来的时候,将插入数据等操作用一个LsmWorker来操作 LsmServer::LsmServer(size_t maxClients) : nWorkers(maxClients) { - workers = new LsmWorker*[nWorkers]; - for (size_t i = 0; i < nWorkers; i++) - { - workers[i] = new LsmWorker(this, queues[i]); - } + workers = new LsmWorker*[nWorkers]; + for (size_t i = 0; i < nWorkers; i++) + { + workers[i] = new LsmWorker(this, queues[i]); + } } void LsmServer::start() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->start(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->start(); + } } void LsmServer::wait() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->wait(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->wait(); + } } LsmServer::~LsmServer() { - for (size_t i = 0; i < nWorkers; i++) - { - delete workers[i]; - } - delete[] workers; + for (size_t i = 0; i < nWorkers; i++) + { + delete workers[i]; + } + delete[] workers; } void LsmServer::stop() { - for (size_t i = 0; i < nWorkers; i++) - { - workers[i]->stop(); - } + for (size_t i = 0; i < nWorkers; i++) + { + workers[i]->stop(); + } } @@ -415,13 +415,13 @@ LsmServer::stop() LsmConnection& LsmServer::open(LsmMessage const& msg) { - CriticalSection cs(mutex); - LsmConnection& con = connections[msg.hdr.rid]; - if (con.db == NULL) - { - char path[64]; - sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); - con.open(path); - } - return con; + CriticalSection cs(mutex); + LsmConnection& con = connections[msg.hdr.rid]; + if (con.db == NULL) + { + char path[64]; + sprintf(path, "%s/%d", LSM_FDW_NAME, msg.hdr.rid); + con.open(path); + } + return con; } diff --git a/lsm_storage.cpp b/lsm_storage.cpp index ec13b63..694bd37 100644 --- a/lsm_storage.cpp +++ b/lsm_storage.cpp @@ -13,20 +13,20 @@ LsmConnection::open(char const* path) Status s = DB::Open(options, std::string(path), &db); if (!s.ok()) - LsmError(s.getState()); + LsmError(s.getState()); } void LsmConnection::close() { delete db; - db = NULL; + db = NULL; } uint64_t LsmConnection::count() { - std::string count; + std::string count; db->GetProperty("rocksdb.estimate-num-keys", &count); return stoull(count); } @@ -48,66 +48,66 @@ LsmConnection::releaseIterator(Iterator* it) size_t LsmConnection::next(Iterator* it, char* buf) { - size_t size; - // Fetch as much records asfits in response buffer - for (size = 0; it->Valid(); it->Next()) - { + size_t size; + // Fetch as much records asfits in response buffer + for (size = 0; it->Valid(); it->Next()) + { int keyLen = it->key().size(); - int valLen = it->value().size(); - int pairSize = sizeof(int)*2 + keyLen + valLen; + int valLen = it->value().size(); + int pairSize = sizeof(int)*2 + keyLen + valLen; - if (size + pairSize > LSM_MAX_RECORD_SIZE) - break; + if (size + pairSize > LSM_MAX_RECORD_SIZE) + break; - memcpy(&buf[size], &keyLen, sizeof keyLen); - size += sizeof keyLen; - memcpy(&buf[size], it->key().data(), keyLen); - size += keyLen; + memcpy(&buf[size], &keyLen, sizeof keyLen); + size += sizeof keyLen; + memcpy(&buf[size], it->key().data(), keyLen); + size += keyLen; - memcpy(&buf[size], &valLen, sizeof valLen); - size += sizeof valLen; - memcpy(&buf[size], it->value().data(), valLen); - size += valLen; + memcpy(&buf[size], &valLen, sizeof valLen); + size += sizeof valLen; + memcpy(&buf[size], it->value().data(), valLen); + size += valLen; } - return size; + return size; } size_t LsmConnection::lookup(char const* key, size_t keyLen, char* buf) { - std::string sval; + std::string sval; ReadOptions ro; Status s = db->Get(ro, Slice(key, keyLen), &sval); if (!s.ok()) - return 0; - size_t valLen = sval.length(); + return 0; + size_t valLen = sval.length(); memcpy(buf, sval.c_str(), valLen); - return valLen; + return valLen; } bool LsmConnection::insert(char* key, size_t keyLen, char* val, size_t valLen) { - Status s; - WriteOptions opts; - if (!LsmUpsert) - { - std::string sval; - ReadOptions ro; - s = db->Get(ro, Slice(key, keyLen), &sval); - if (s.ok()) // key already exists - return false; - } - opts.sync = LsmSync; - s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); + Status s; + WriteOptions opts; + if (!LsmUpsert) + { + std::string sval; + ReadOptions ro; + s = db->Get(ro, Slice(key, keyLen), &sval); + if (s.ok()) // key already exists + return false; + } + opts.sync = LsmSync; + s = db->Put(opts, Slice(key, keyLen), Slice(val, valLen)); return s.ok(); } bool LsmConnection::remove(char* key, size_t keyLen) { - WriteOptions opts; - opts.sync = LsmSync; + WriteOptions opts; + opts.sync = LsmSync; Status s = db->Delete(opts, Slice(key, keyLen)); return s.ok(); } diff --git a/lsm_util.c b/lsm_util.c index f0bdd5b..f4fe2ba 100644 --- a/lsm_util.c +++ b/lsm_util.c @@ -41,13 +41,13 @@ static shmem_startup_hook_type PreviousShmemStartupHook = NULL; /* local functions forward declarations */ static void LsmProcessUtility(PlannedStmt *plannedStmt, - const char *queryString, - ProcessUtilityContext context, - ParamListInfo paramListInfo, - QueryEnvironment *queryEnvironment, - DestReceiver *destReceiver, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo paramListInfo, + QueryEnvironment *queryEnvironment, + DestReceiver *destReceiver, #if PG_VERSION_NUM>=130000 - QueryCompletion *completionTag); + QueryCompletion *completionTag); #else char *completionTag); #endif @@ -62,12 +62,12 @@ EncodeVarintLength(uint64 v, char* buf) v >>= 7; } *dst++ = (char)v; - return (uint8)(dst - buf); + return (uint8)(dst - buf); } static const char* GetVarint64Ptr(const char* p, const char* limit, - uint64_t* value) + uint64_t* value) { uint64_t result = 0; for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { @@ -132,7 +132,7 @@ static bool LsmTable(Oid relationId) { static char* LsmFilePath(Oid relid) { - return psprintf("%s/%d", LSM_FDW_NAME, relid); + return psprintf("%s/%d", LSM_FDW_NAME, relid); } /* @@ -189,14 +189,14 @@ static void LsmCheckSuperuserPrivilegesForCopy(const CopyStmt* copyStmt) { if (copyStmt->filename != NULL && !superuser()) { if (copyStmt->is_program) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to COPY to or from a program"), - errhint("Anyone can COPY to stdout or from stdin. " - "psql's \\copy command also works for anyone."))); + errmsg("must be superuser to COPY to or from a program"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); } else { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to COPY to or from a file"), - errhint("Anyone can COPY to stdout or from stdin. " - "psql's \\copy command also works for anyone."))); + errmsg("must be superuser to COPY to or from a file"), + errhint("Anyone can COPY to stdout or from stdin. " + "psql's \\copy command also works for anyone."))); } } } @@ -247,7 +247,7 @@ void SerializeAttribute(TupleDesc tupleDescriptor, //元组 int offset = buffer->len; int datumLength = att_addlength_datum(offset, typeLength, datum); - /* the key does not have a size header */ + /* the key does not have a size header */ enlargeStringInfo(buffer, datumLength + (index == 0 ? 0 : HEADERBUFFSIZE)); char *current = buffer->data + buffer->len; @@ -283,7 +283,7 @@ void SerializeAttribute(TupleDesc tupleDescriptor, //元组 * number of copied rows. */ static uint64 LsmCopyIntoTable(const CopyStmt *copyStmt, - const char *queryString) + const char *queryString) { /* Only superuser can copy from or to local file */ LsmCheckSuperuserPrivilegesForCopy(copyStmt); @@ -323,7 +323,7 @@ static uint64 LsmCopyIntoTable(const CopyStmt *copyStmt, while (found) { /* read the next row in tupleContext */ MemoryContext oldContext = - MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); /* * 'econtext' is used to evaluate default expression for each columns @@ -384,9 +384,9 @@ static uint64 LsmCopyOutTable(CopyStmt *copyStmt, const char *queryString) { if (copyStmt->attlist != NIL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("copy column list is not supported"), - errhint("use 'copy (select from
) to " - "...' instead"))); + errmsg("copy column list is not supported"), + errhint("use 'copy (select from
) to " + "...' instead"))); } RangeVar *relation = copyStmt->relation; @@ -478,7 +478,7 @@ LsmCheckAlterTable(AlterTableStmt *alterStmt) if (alterCmd->subtype == AT_AddColumn) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("No support for adding column currently"))); + errmsg("No support for adding column currently"))); } } } @@ -490,15 +490,15 @@ LsmCheckAlterTable(AlterTableStmt *alterStmt) * utility command via macro CALL_PREVIOUS_UTILITY. */ static void LsmProcessUtility(PlannedStmt *plannedStmt, - const char *queryString, - ProcessUtilityContext context, - ParamListInfo paramListInfo, - QueryEnvironment *queryEnvironment, - DestReceiver *destReceiver, + const char *queryString, + ProcessUtilityContext context, + ParamListInfo paramListInfo, + QueryEnvironment *queryEnvironment, + DestReceiver *destReceiver, #if PG_VERSION_NUM>=130000 - QueryCompletion *completionTag) + QueryCompletion *completionTag) #else - char *completionTag) + char *completionTag) #endif { Node *parseTree = plannedStmt->utilityStmt; @@ -516,12 +516,12 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, if (completionTag != NULL) { #if PG_VERSION_NUM>=130000 - SetQueryCompletion(completionTag, CMDTAG_COPY, rowCount); + SetQueryCompletion(completionTag, CMDTAG_COPY, rowCount); #else snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "COPY " UINT64_FORMAT, - rowCount); + rowCount); #endif } } else { @@ -554,7 +554,7 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, ListCell *fileCell = NULL; foreach(fileCell, droppedTables) { char *path = lfirst(fileCell); - rmtree(path, true); + rmtree(path, true); } } } else if (nodeTag(parseTree) == T_AlterTableStmt) { @@ -580,87 +580,87 @@ static void LsmProcessUtility(PlannedStmt *plannedStmt, static void LsmShmemStartup(void) { - bool found; - void* ctl; + bool found; + void* ctl; if (PreviousShmemStartupHook) - { + { PreviousShmemStartupHook(); } - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - - ctl = ShmemInitStruct("lsm_control", - LsmShmemSize(MaxConnections), - &found); - if (!found) - { - LsmInitialize(ctl, MaxConnections); - if (mkdir(LSM_FDW_NAME, S_IRWXU) != 0 && errno != EEXIST) - elog(ERROR, "Failed to create lsm directory: %m"); - } - else - LsmAttach(ctl); - - LWLockRelease(AddinShmemInitLock); + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + ctl = ShmemInitStruct("lsm_control", + LsmShmemSize(MaxConnections), + &found); + if (!found) + { + LsmInitialize(ctl, MaxConnections); + if (mkdir(LSM_FDW_NAME, S_IRWXU) != 0 && errno != EEXIST) + elog(ERROR, "Failed to create lsm directory: %m"); + } + else + LsmAttach(ctl); + + LWLockRelease(AddinShmemInitLock); } void _PG_init(void) { - BackgroundWorker worker; - - if (!process_shared_preload_libraries_in_progress) - elog(ERROR, "LSM: this extension should be loaded via shared_preload_libraries"); - - DefineCustomIntVariable("lsm.queue_size", - "Size of LSM queue", - NULL, - &LsmQueueSize, - LSM_MAX_RECORD_SIZE, LSM_MAX_RECORD_SIZE, INT_MAX, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); + BackgroundWorker worker; + + if (!process_shared_preload_libraries_in_progress) + elog(ERROR, "LSM: this extension should be loaded via shared_preload_libraries"); + + DefineCustomIntVariable("lsm.queue_size", + "Size of LSM queue", + NULL, + &LsmQueueSize, + LSM_MAX_RECORD_SIZE, LSM_MAX_RECORD_SIZE, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); DefineCustomBoolVariable("lsm.sync", - "Use synchronouse write", - NULL, - &LsmSync, - false, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); + "Use synchronouse write", + NULL, + &LsmSync, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); DefineCustomBoolVariable("lsm.upsert", - "Use implicit upsert semantic", - "If key of inserted record already exists, then replace old record with new one", - &LsmUpsert, - true, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); - - RequestAddinShmemSpace(LsmShmemSize(MaxConnections)); - elog(DEBUG1, "Request %ld bytes of shared memory", LsmShmemSize(MaxConnections)); - - MemSet(&worker, 0, sizeof(BackgroundWorker)); - worker.bgw_flags = BGWORKER_SHMEM_ACCESS; - worker.bgw_start_time = BgWorkerStart_ConsistentState; - strcpy(worker.bgw_library_name, "lsm"); - strcpy(worker.bgw_function_name, "LsmWorkerMain"); - strcpy(worker.bgw_name, "LSM worker"); - strcpy(worker.bgw_type, "LSM worker"); - - RegisterBackgroundWorker(&worker); - - PreviousShmemStartupHook = shmem_startup_hook; - shmem_startup_hook = LsmShmemStartup; + "Use implicit upsert semantic", + "If key of inserted record already exists, then replace old record with new one", + &LsmUpsert, + true, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + RequestAddinShmemSpace(LsmShmemSize(MaxConnections)); + elog(DEBUG1, "Request %ld bytes of shared memory", LsmShmemSize(MaxConnections)); + + MemSet(&worker, 0, sizeof(BackgroundWorker)); + worker.bgw_flags = BGWORKER_SHMEM_ACCESS; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + strcpy(worker.bgw_library_name, "lsm"); + strcpy(worker.bgw_function_name, "LsmWorkerMain"); + strcpy(worker.bgw_name, "LSM worker"); + strcpy(worker.bgw_type, "LSM worker"); + + RegisterBackgroundWorker(&worker); + + PreviousShmemStartupHook = shmem_startup_hook; + shmem_startup_hook = LsmShmemStartup; PreviousProcessUtilityHook = ProcessUtility_hook; ProcessUtility_hook = LsmProcessUtility; } @@ -678,25 +678,25 @@ void _PG_fini(void) static void LsmWorkerSigtermHandler(SIGNAL_ARGS) { - LsmStopWorkers(); + LsmStopWorkers(); } void LsmWorkerMain(Datum main_arg) { - pqsignal(SIGTERM, LsmWorkerSigtermHandler); - BackgroundWorkerUnblockSignals(); - LsmRunWorkers(MaxConnections); + pqsignal(SIGTERM, LsmWorkerSigtermHandler); + BackgroundWorkerUnblockSignals(); + LsmRunWorkers(MaxConnections); } void LsmError(char const* message) { - ereport(ERROR, (errmsg("LSM: %s", message))); + ereport(ERROR, (errmsg("LSM: %s", message))); } void LsmMemoryBarrier(void) { - pg_memory_barrier(); + pg_memory_barrier(); } diff --git a/pg_rocksdb--0.1.sql b/pg_rocksdb--0.1.sql deleted file mode 100644 index 6a1ddda..0000000 --- a/pg_rocksdb--0.1.sql +++ /dev/null @@ -1,9 +0,0 @@ -CREATE FUNCTION pg_rocksdb_fdw_handler() --- 底下必须返回的是fdw_handler,而不是其他的 -RETURNS fdw_handler -AS 'MODULE_PATHNAME' -LANGUAGE C STRICT; - -CREATE FOREIGN DATA WRAPPER pg_rocksdb_fdw - HANDLER pg_rocksdb_fdw_handler; - diff --git a/pg_rocksdb.conf b/pg_rocksdb.conf deleted file mode 100644 index 57b79dc..0000000 --- a/pg_rocksdb.conf +++ /dev/null @@ -1 +0,0 @@ -shared_preload_libraries = 'pg_rocksdb' diff --git a/pg_rocksdb.control b/pg_rocksdb.control deleted file mode 100644 index 3f36eb3..0000000 --- a/pg_rocksdb.control +++ /dev/null @@ -1,5 +0,0 @@ -# pg_rocksdb FDW -comment = 'RocksDB Foreign Data Wrapper' -default_version = '0.1' -module_pathname = '$libdir/pg_rocksdb' -relocatable = true