在template1資料庫中進行vacuum freeze出現如下報錯:
template1=> vacuum freeze template1.pg_catalog.pg_authid;
ERROR: found xmin 1988747257 from before relfrozenxid 2810153180
問題原因
出現這種報錯一般第一反應都是因為長事務引起的事務号無法回收,通過查詢pg_stat_activity沒有發現有長事務,而且是單表無法進行回收。原因是pg會把一些關鍵的系統表的中繼資料資訊存儲在relcache中,rd_isnailed=true就是表明這個表的relcache的relcache是不會去進行更新的,關鍵的系統表一般也不會去進行表中繼資料的變更,是以系統表不去更新relcache也是正常的
typedef struct RelationData
{
RelFileNode rd_node; /* relation physical identifier */
/* use "struct" here to avoid needing to include smgr.h: */
struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */
int rd_refcnt; /* reference count */
BackendId rd_backend; /* owning backend id, if temporary relation */
bool rd_islocaltemp; /* rel is a temp rel of this session */
bool rd_isnailed; /* rel is nailed in cache */
bool rd_isvalid; /* relcache entry is valid */
char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 =
* valid, 2 = temporarily forced */
bool rd_statvalid; /* is rd_statlist valid? */
/*
* rd_createSubid is the ID of the highest subtransaction the rel has
* survived into; or zero if the rel was not created in the current top
* transaction. This can be now be relied on, whereas previously it could
* be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is
* the ID of the highest subtransaction the relfilenode change has
* survived into, or zero if not changed in the current transaction (or we
* have forgotten changing it). rd_newRelfilenodeSubid can be forgotten
* when a relation has multiple new relfilenodes within a single
* transaction, with one of them occurring in a subsequently aborted
* subtransaction, e.g. BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;
* ROLLBACK TO save; -- rd_newRelfilenode is now forgotten
*/
SubTransactionId rd_createSubid; /* rel was created in current xact */
SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in
* current xact */
Form_pg_class rd_rel; /* RELATION tuple */
TupleDesc rd_att; /* tuple descriptor */
Oid rd_id; /* relation's object id */
LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */
RuleLock *rd_rules; /* rewrite rules */
MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */
TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */
/* use "struct" here to avoid needing to include rowsecurity.h: */
struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */
/* data managed by RelationGetFKeyList: */
List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */
bool rd_fkeyvalid; /* true if list has been computed */
MemoryContext rd_partkeycxt; /* private memory cxt for the below */
struct PartitionKeyData *rd_partkey; /* partition key, or NULL */
MemoryContext rd_pdcxt; /* private context for partdesc */
struct PartitionDescData *rd_partdesc; /* partitions, or NULL */
List *rd_partcheck; /* partition CHECK quals */
/* data managed by RelationGetIndexList: */
List *rd_indexlist; /* list of OIDs of indexes on relation */
Oid rd_pkindex; /* OID of primary key, if any */
Oid rd_replidindex; /* OID of replica identity index, if any */
/* data managed by RelationGetStatExtList: */
List *rd_statlist; /* list of OIDs of extended stats */
/* data managed by RelationGetIndexAttrBitmap: */
Bitmapset *rd_indexattr; /* identifies columns used in indexes */
Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */
Bitmapset *rd_pkattr; /* cols included in primary key */
Bitmapset *rd_idattr; /* included in replica identity index */
PublicationActions *rd_pubactions; /* publication actions */
/*
* rd_options is set whenever rd_rel is loaded into the relcache entry.
* Note that you can NOT look into rd_rel for this data. NULL means "use
* defaults".
*/
bytea *rd_options; /* parsed pg_class.reloptions */
/* These are non-NULL only for an index relation: */
Form_pg_index rd_index; /* pg_index tuple describing this index */
/* use "struct" here to avoid needing to include htup.h: */
struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */
/*
* index access support info (used only for an index relation)
*
* Note: only default support procs for each opclass are cached, namely
* those with lefttype and righttype equal to the opclass's opcintype. The
* arrays are indexed by support function number, which is a sufficient
* identifier given that restriction.
*
* Note: rd_amcache is available for index AMs to cache private data about
* an index. This must be just a cache since it may get reset at any time
* (in particular, it will get reset by a relcache inval message for the
* index). If used, it must point to a single memory chunk palloc'd in
* rd_indexcxt. A relcache reset will include freeing that chunk and
* setting rd_amcache = NULL.
*/
Oid rd_amhandler; /* OID of index AM's handler function */
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
/* use "struct" here to avoid needing to include amapi.h: */
struct IndexAmRoutine *rd_indam; /* index AM's API struct */
Oid *rd_opfamily; /* OIDs of op families for each index col */
Oid *rd_opcintype; /* OIDs of opclass declared input data types */
RegProcedure *rd_support; /* OIDs of support procedures */
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
int16 *rd_indoption; /* per-column AM-specific flags */
List *rd_indexprs; /* index expression trees, if any */
List *rd_indpred; /* index predicate tree, if any */
Oid *rd_exclops; /* OIDs of exclusion operators, if any */
Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */
uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */
void *rd_amcache; /* available for use by index AM */
Oid *rd_indcollation; /* OIDs of index collations */
/*
* foreign-table support
*
* rd_fdwroutine must point to a single memory chunk palloc'd in
* CacheMemoryContext. It will be freed and reset to NULL on a relcache
* reset.
*/
/* use "struct" here to avoid needing to include fdwapi.h: */
struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */
/*
* Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new
* version of a table, we need to make any toast pointers inserted into it
* have the existing toast table's OID, not the OID of the transient toast
* table. If rd_toastoid isn't InvalidOid, it is the OID to place in
* toast pointers inserted into this rel. (Note it's set on the new
* version of the main heap, not the toast table itself.) This also
* causes toast_save_datum() to try to preserve toast value OIDs.
*/
Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */
/* use "struct" here to avoid needing to include pgstat.h: */
struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
} RelationData;
relfrozenxid存儲在rd_rel中
CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,RelationRelation_Rowtype_Id) BKI_SCHEMA_MACRO
{
Oid oid; /* oid */
NameData relname; /* class name */
Oid relnamespace; /* OID of namespace containing this class */
Oid reltype; /* OID of entry in pg_type for table's
* implicit row type */
Oid reloftype; /* OID of entry in pg_type for underlying
* composite type */
Oid relowner; /* class owner */
Oid relam; /* index access method; 0 if not an index */
Oid relfilenode; /* identifier of physical storage file */
/* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
Oid reltablespace; /* identifier of table space for relation */
int32 relpages; /* # of blocks (not always up-to-date) */
float4 reltuples; /* # of tuples (not always up-to-date) */
int32 relallvisible; /* # of all-visible blocks (not always
* up-to-date) */
Oid reltoastrelid; /* OID of toast table; 0 if none */
bool relhasindex; /* T if has (or has had) any indexes */
bool relisshared; /* T if shared across databases */
char relpersistence; /* see RELPERSISTENCE_xxx constants below */
char relkind; /* see RELKIND_xxx constants below */
int16 relnatts; /* number of user attributes */
/*
* Class pg_attribute must contain exactly "relnatts" user attributes
* (with attnums ranging from 1 to relnatts) for this class. It may also
* contain entries with negative attnums for system attributes.
*/
int16 relchecks; /* # of CHECK constraints for class */
bool relhasrules; /* has (or has had) any rules */
bool relhastriggers; /* has (or has had) any TRIGGERs */
bool relhassubclass; /* has (or has had) child tables or indexes */
bool relrowsecurity; /* row security is enabled or not */
bool relforcerowsecurity; /* row security forced for owners or
* not */
bool relispopulated; /* matview currently holds query results */
char relreplident; /* see REPLICA_IDENTITY_xxx constants */
bool relispartition; /* is relation a partition? */
Oid relrewrite; /* heap for rewrite during DDL, link to
* original rel */
TransactionId relfrozenxid; /* all Xids < this are frozen in this rel */
TransactionId relminmxid; /* all multixacts in this rel are >= this.
* this is really a MultiXactId */
#ifdef CATALOG_VARLEN /* variable-length fields start here */
/* NOTE: These fields are not present in a relcache entry's rd_rel field. */
aclitem relacl[1]; /* access permissions */
text reloptions[1]; /* access-method-specific options */
pg_node_tree relpartbound; /* partition bound node tree */
#endif
} FormData_pg_class;
vacuum過程中這個函數會對這行的xid和relfrozenxid進行比較,如果xid早于relfrozenxid就會出現這個報錯,
heap_prepare_freeze_tuple(HeapTupleHeader tuple,
TransactionId relfrozenxid, TransactionId relminmxid,
TransactionId cutoff_xid, TransactionId cutoff_multi,
xl_heap_freeze_tuple *frz, bool *totally_frozen_p)
{
bool changed = false;
bool xmax_already_frozen = false;
bool xmin_frozen;
bool freeze_xmax;
TransactionId xid;
frz->frzflags = 0;
frz->t_infomask2 = tuple->t_infomask2;
frz->t_infomask = tuple->t_infomask;
frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
/* Process xmin */
xid = HeapTupleHeaderGetXmin(tuple); #取目前行的xmin,會先判斷是否是frozenxid,如果是就會等于FrozenTransactionId,否則就是目前行的xmin。
xmin_frozen = ((xid == FrozenTransactionId) ||
HeapTupleHeaderXminFrozen(tuple));
if (TransactionIdIsNormal(xid))
{
if (TransactionIdPrecedes(xid, relfrozenxid)) #如果xid早于relfrozenxid就會出現上述的報錯
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found xmin %u from before relfrozenxid %u",
xid, relfrozenxid)));
if (TransactionIdPrecedes(xid, cutoff_xid))
{
if (!TransactionIdDidCommit(xid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("uncommitted xmin %u from before xid cutoff %u needs to be frozen",
xid, cutoff_xid)));
frz->t_infomask |= HEAP_XMIN_FROZEN;
changed = true;
xmin_frozen = true;
}
}
.....
}
xid是如何進行比較的呢,如果不超過21億是正常比較,超過21億的需要取模進行比較,如果行上的t_infomask中HEAP_XMIN_COMMITTED,HEAP_XMIN_INVALID同時存在就會被認為是FrozenTransactionId。
#define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */
#define HEAP_XMIN_INVALID 0x0200 /* t_xmin invalid/aborted */
#define HEAP_XMIN_FROZEN (HEAP_XMIN_COMMITTED|HEAP_XMIN_INVALID)
#define InvalidTransactionId ((TransactionId) 0)
#define BootstrapTransactionId ((TransactionId) 1)
#define FrozenTransactionId ((TransactionId) 2)
#define FirstNormalTransactionId ((TransactionId) 3)
#define MaxTransactionId ((TransactionId) 0xFFFFFFFF)
是以問題的根因就是vacuum是讀到的relfrozexid來自relcache的,更新catalog中的relfrozexid不會去重新整理relcache,導緻一直讀到的是錯誤的relfrozexid,是以出現vacuum freeze報錯的問題。10.2,9.6.7,9.5.11,9.4.16到修複版本之間的版本的pg執行個體都會存在相關問題。
10.5, 9.6.10, 9.5.14, 9.4.19對這個問題進行了修複。
https://git.postgresql.org/gitweb/?p=postgresql.git;a=commitdiff;h=817f9f9a8a1932a0cd8c6bc5c9d3e77f6a80e659問題解法
目前可以通過兩種方式進行修複
1.重新開機資料庫,重新開機後會重新讀入新資料内容到relcache中,相當于重新整理relcache。
2.删除$PGDATA/global/pg_internal.init,這個檔案就是存儲的relcache的内容,有新的連接配接連入會建立新的pg_internal.init檔案。