e7ee2c089e
The crash happens rather often when we reset some cluster nodes while nodes contend fiercely to do truncate and append. The crash backtrace is below: dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover_grant 1 locks on 971 resources dlm: C21CBDA5E0774F4BA5A9D4F317717495: dlm_recover 9 generation 5 done: 4 ms ocfs2: Begin replay journal (node 318952601, slot 2) on device (253,18) ocfs2: End replay journal (node 318952601, slot 2) on device (253,18) ocfs2: Beginning quota recovery on device (253,18) for slot 2 ocfs2: Finishing quota recovery on device (253,18) for slot 2 (truncate,30154,1):ocfs2_truncate_file:470 ERROR: bug expression: le64_to_cpu(fe->i_size) != i_size_read(inode) (truncate,30154,1):ocfs2_truncate_file:470 ERROR: Inode 290321, inode i_size = 732 != di i_size = 937, i_flags = 0x1 ------------[ cut here ]------------ kernel BUG at /usr/src/linux/fs/ocfs2/file.c:470! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2_stack_user(OEN) ocfs2(OEN) ocfs2_nodemanager ocfs2_stackglue(OEN) quota_tree dlm(OEN) configfs fuse sd_mod iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs softdog xfs libcrc32c ppdev parport_pc pcspkr parport joydev virtio_balloon virtio_net i2c_piix4 acpi_cpufreq button processor ext4 crc16 jbd2 mbcache ata_generic cirrus virtio_blk ata_piix drm_kms_helper ahci syscopyarea libahci sysfillrect sysimgblt fb_sys_fops ttm floppy libata drm virtio_pci virtio_ring uhci_hcd virtio ehci_hcd usbcore serio_raw usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: No, Unsupported modules are loaded CPU: 1 PID: 30154 Comm: truncate Tainted: G OE N 4.4.21-69-default #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20151112_172657-sheep25 04/01/2014 task: ffff88004ff6d240 ti: ffff880074e68000 task.ti: ffff880074e68000 RIP: 0010:[<ffffffffa05c8c30>] [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] RSP: 
0018:ffff880074e6bd50 EFLAGS: 00010282 RAX: 0000000000000074 RBX: 000000000000029e RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246 RBP: ffff880074e6bda8 R08: 000000003675dc7a R09: ffffffff82013414 R10: 0000000000034c50 R11: 0000000000000000 R12: ffff88003aab3448 R13: 00000000000002dc R14: 0000000000046e11 R15: 0000000000000020 FS: 00007f839f965700(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f839f97e000 CR3: 0000000036723000 CR4: 00000000000006e0 Call Trace: ocfs2_setattr+0x698/0xa90 [ocfs2] notify_change+0x1ae/0x380 do_truncate+0x5e/0x90 do_sys_ftruncate.constprop.11+0x108/0x160 entry_SYSCALL_64_fastpath+0x12/0x6d Code: 24 28 ba d6 01 00 00 48 c7 c6 30 43 62 a0 8b 41 2c 89 44 24 08 48 8b 41 20 48 c7 c1 78 a3 62 a0 48 89 04 24 31 c0 e8 a0 97 f9 ff <0f> 0b 3d 00 fe ff ff 0f 84 ab fd ff ff 83 f8 fc 0f 84 a2 fd ff RIP [<ffffffffa05c8c30>] ocfs2_truncate_file+0x640/0x6c0 [ocfs2] It's because ocfs2_inode_lock() gets us a stale LVB in which the i_size is not equal to the disk i_size. We mistakenly trust the LVB because the underlying fsdlm dlm_lock() doesn't set lkb_sbflags with DLM_SBF_VALNOTVALID properly for us. But why? The current code tries to downconvert the lock without the DLM_LKF_VALBLK flag to tell o2cb not to update the RSB's LVB if it's a PR->NULL conversion, even if the lock resource type needs an LVB. This is not the right way for fsdlm. The fsdlm plugin behaves differently on DLM_LKF_VALBLK: it depends on DLM_LKF_VALBLK to decide if we care about the LVB in the LKB. If DLM_LKF_VALBLK is not set, fsdlm will skip both recovering the RSB's LVB from this lkb and setting DLM_SBF_VALNOTVALID appropriately when a node failure happens. 
The following diagram briefly illustrates how this crash happens: RSB1 is an inode metadata lock resource with LOCK_TYPE_USES_LVB; The 1st round: Node1 Node2 RSB1: PR RSB1(master): NULL->EX ocfs2_downconvert_lock(PR->NULL, set_lvb==0) ocfs2_dlm_lock(no DLM_LKF_VALBLK) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - dlm_lock(no DLM_LKF_VALBLK) convert_lock(overwrite lkb->lkb_exflags with no DLM_LKF_VALBLK) RSB1: NULL RSB1: EX reset Node2 dlm_recover_rsbs() recover_lvb() /* The LVB is not trustable if the node with EX fails and * no lock >= PR is left. We should set RSB_VALNOTVALID for RSB1. */ if(!(lkb_exflags & DLM_LKF_VALBLK)) return; /* This means we miss the chance * to invalidate the LVB here. */ The 2nd round: Node 1 Node2 RSB1(become master from recovery) ocfs2_setattr() ocfs2_inode_lock(NULL->EX) /* dlm_lock() returns the stale lvb without setting DLM_SBF_VALNOTVALID */ ocfs2_meta_lvb_is_trustable() return 1 /* so we don't refresh inode from disk */ ocfs2_truncate_file() mlog_bug_on_msg(disk isize != i_size_read(inode)) /* crash! */ The fix is quite straightforward. We keep setting the DLM_LKF_VALBLK flag for dlm_lock() if the lock resource type needs an LVB and the fsdlm plugin is used. Link: http://lkml.kernel.org/r/1481275846-6604-1-git-send-email-zren@suse.com Signed-off-by: Eric Ren <zren@suse.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Cc: Mark Fasheh <mfasheh@versity.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
307 lines
9.3 KiB
C
307 lines
9.3 KiB
C
/* -*- mode: c; c-basic-offset: 8; -*-
|
|
* vim: noexpandtab sw=8 ts=8 sts=0:
|
|
*
|
|
* stackglue.h
|
|
*
|
|
* Glue to the underlying cluster stack.
|
|
*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public
|
|
* License as published by the Free Software Foundation, version 2.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*/
|
|
|
|
|
|
#ifndef STACKGLUE_H
|
|
#define STACKGLUE_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/list.h>
|
|
#include <linux/dlmconstants.h>
|
|
|
|
#include "dlm/dlmapi.h"
|
|
#include <linux/dlm.h>
|
|
|
|
/*
 * Forward declarations needed for the plock-related prototypes in this
 * header.  Only pointers to these types are used here, so the full
 * definitions are not required.
 */
struct file;
struct file_lock;
|
|
|
|
/*
 * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
 * some day, but right now we need it.  Let's fake it.  This value is larger
 * than any flag in dlmconstants.h.
 */
#define DLM_LKF_LOCAL		0x00100000
|
|
|
|
/*
 * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h.  That probably
 * wants to be in a public header.
 *
 * Used in this header to size ocfs2_cluster_connection.cc_name.
 */
#define GROUP_NAME_MAX		64
|
|
|
|
/*
 * This shadows OCFS2_CLUSTER_NAME_LEN.
 *
 * Used in this header to size ocfs2_cluster_connection.cc_cluster_name.
 */
#define CLUSTER_NAME_MAX	16
|
|
|
|
|
|
/*
|
|
* ocfs2_protocol_version changes when ocfs2 does something different in
|
|
* its inter-node behavior. See dlmglue.c for more information.
|
|
*/
|
|
struct ocfs2_protocol_version {
|
|
u8 pv_major;
|
|
u8 pv_minor;
|
|
};
|
|
|
|
/*
|
|
* The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
|
|
* has a pointer to separately allocated lvb space. This struct exists only to
|
|
* include in the lksb union to make space for a combined dlm_lksb and lvb.
|
|
*/
|
|
struct fsdlm_lksb_plus_lvb {
|
|
struct dlm_lksb lksb;
|
|
char lvb[DLM_LVB_LEN];
|
|
};
|
|
|
|
/*
|
|
* A union of all lock status structures. We define it here so that the
|
|
* size of the union is known. Lock status structures are embedded in
|
|
* ocfs2 inodes.
|
|
*/
|
|
struct ocfs2_cluster_connection;
|
|
struct ocfs2_dlm_lksb {
|
|
union {
|
|
struct dlm_lockstatus lksb_o2dlm;
|
|
struct dlm_lksb lksb_fsdlm;
|
|
struct fsdlm_lksb_plus_lvb padding;
|
|
};
|
|
struct ocfs2_cluster_connection *lksb_conn;
|
|
};
|
|
|
|
/*
|
|
* The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
|
|
*/
|
|
struct ocfs2_locking_protocol {
|
|
struct ocfs2_protocol_version lp_max_version;
|
|
void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
|
|
void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
|
|
void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
|
|
};
|
|
|
|
|
|
/*
|
|
* A cluster connection. Mostly opaque to ocfs2, the connection holds
|
|
* state for the underlying stack. ocfs2 does use cc_version to determine
|
|
* locking compatibility.
|
|
*/
|
|
struct ocfs2_cluster_connection {
|
|
char cc_name[GROUP_NAME_MAX + 1];
|
|
int cc_namelen;
|
|
char cc_cluster_name[CLUSTER_NAME_MAX + 1];
|
|
int cc_cluster_name_len;
|
|
struct ocfs2_protocol_version cc_version;
|
|
struct ocfs2_locking_protocol *cc_proto;
|
|
void (*cc_recovery_handler)(int node_num, void *recovery_data);
|
|
void *cc_recovery_data;
|
|
void *cc_lockspace;
|
|
void *cc_private;
|
|
};
|
|
|
|
/*
|
|
* Each cluster stack implements the stack operations structure. Not used
|
|
* in the ocfs2 code, the stackglue code translates generic cluster calls
|
|
* into stack operations.
|
|
*/
|
|
struct ocfs2_stack_operations {
|
|
/*
|
|
* The fs code calls ocfs2_cluster_connect() to attach a new
|
|
* filesystem to the cluster stack. The ->connect() op is passed
|
|
* an ocfs2_cluster_connection with the name and recovery field
|
|
* filled in.
|
|
*
|
|
* The stack must set up any notification mechanisms and create
|
|
* the filesystem lockspace in the DLM. The lockspace should be
|
|
* stored on cc_lockspace. Any other information can be stored on
|
|
* cc_private.
|
|
*
|
|
* ->connect() must not return until it is guaranteed that
|
|
*
|
|
* - Node down notifications for the filesystem will be received
|
|
* and passed to conn->cc_recovery_handler().
|
|
* - Locking requests for the filesystem will be processed.
|
|
*/
|
|
int (*connect)(struct ocfs2_cluster_connection *conn);
|
|
|
|
/*
|
|
* The fs code calls ocfs2_cluster_disconnect() when a filesystem
|
|
* no longer needs cluster services. All DLM locks have been
|
|
* dropped, and recovery notification is being ignored by the
|
|
* fs code. The stack must disengage from the DLM and discontinue
|
|
* recovery notification.
|
|
*
|
|
* Once ->disconnect() has returned, the connection structure will
|
|
* be freed. Thus, a stack must not return from ->disconnect()
|
|
* until it will no longer reference the conn pointer.
|
|
*
|
|
* Once this call returns, the stack glue will be dropping this
|
|
* connection's reference on the module.
|
|
*/
|
|
int (*disconnect)(struct ocfs2_cluster_connection *conn);
|
|
|
|
/*
|
|
* ->this_node() returns the cluster's unique identifier for the
|
|
* local node.
|
|
*/
|
|
int (*this_node)(struct ocfs2_cluster_connection *conn,
|
|
unsigned int *node);
|
|
|
|
/*
|
|
* Call the underlying dlm lock function. The ->dlm_lock()
|
|
* callback should convert the flags and mode as appropriate.
|
|
*
|
|
* ast and bast functions are not part of the call because the
|
|
* stack will likely want to wrap ast and bast calls before passing
|
|
* them to stack->sp_proto. There is no astarg. The lksb will
|
|
* be passed back to the ast and bast functions. The caller can
|
|
* use this to find their object.
|
|
*/
|
|
int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
|
|
int mode,
|
|
struct ocfs2_dlm_lksb *lksb,
|
|
u32 flags,
|
|
void *name,
|
|
unsigned int namelen);
|
|
|
|
/*
|
|
* Call the underlying dlm unlock function. The ->dlm_unlock()
|
|
* function should convert the flags as appropriate.
|
|
*
|
|
* The unlock ast is not passed, as the stack will want to wrap
|
|
* it before calling stack->sp_proto->lp_unlock_ast(). There is
|
|
* no astarg. The lksb will be passed back to the unlock ast
|
|
* function. The caller can use this to find their object.
|
|
*/
|
|
int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
|
|
struct ocfs2_dlm_lksb *lksb,
|
|
u32 flags);
|
|
|
|
/*
|
|
* Return the status of the current lock status block. The fs
|
|
* code should never dereference the union. The ->lock_status()
|
|
* callback pulls out the stack-specific lksb, converts the status
|
|
* to a proper errno, and returns it.
|
|
*/
|
|
int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
|
|
|
|
/*
|
|
* Return non-zero if the LVB is valid.
|
|
*/
|
|
int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
|
|
|
|
/*
|
|
* Pull the lvb pointer off of the stack-specific lksb.
|
|
*/
|
|
void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
|
|
|
|
/*
|
|
* Cluster-aware posix locks
|
|
*
|
|
* This is NULL for stacks which do not support posix locks.
|
|
*/
|
|
int (*plock)(struct ocfs2_cluster_connection *conn,
|
|
u64 ino,
|
|
struct file *file,
|
|
int cmd,
|
|
struct file_lock *fl);
|
|
|
|
/*
|
|
* This is an optoinal debugging hook. If provided, the
|
|
* stack can dump debugging information about this lock.
|
|
*/
|
|
void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
|
|
};
|
|
|
|
/*
|
|
* Each stack plugin must describe itself by registering a
|
|
* ocfs2_stack_plugin structure. This is only seen by stackglue and the
|
|
* stack driver.
|
|
*/
|
|
struct ocfs2_stack_plugin {
|
|
char *sp_name;
|
|
struct ocfs2_stack_operations *sp_ops;
|
|
struct module *sp_owner;
|
|
|
|
/* These are managed by the stackglue code. */
|
|
struct list_head sp_list;
|
|
unsigned int sp_count;
|
|
struct ocfs2_protocol_version sp_max_proto;
|
|
};
|
|
|
|
|
|
/*
 * Used by the filesystem.
 *
 * ocfs2_cluster_connect() attaches a new filesystem to the cluster stack
 * and ocfs2_cluster_disconnect() is called when the filesystem no longer
 * needs cluster services; see the ->connect() and ->disconnect() comments
 * in struct ocfs2_stack_operations for what the stack must guarantee.
 * The new connection is handed back through @conn.
 */
int ocfs2_cluster_connect(const char *stack_name,
			  const char *cluster_name,
			  int cluster_name_len,
			  const char *group,
			  int grouplen,
			  struct ocfs2_locking_protocol *lproto,
			  void (*recovery_handler)(int node_num,
						   void *recovery_data),
			  void *recovery_data,
			  struct ocfs2_cluster_connection **conn);
/*
 * Used by callers that don't store their stack name.  They must ensure
 * all nodes have the same stack.
 */
int ocfs2_cluster_connect_agnostic(const char *group,
				   int grouplen,
				   struct ocfs2_locking_protocol *lproto,
				   void (*recovery_handler)(int node_num,
							    void *recovery_data),
				   void *recovery_data,
				   struct ocfs2_cluster_connection **conn);
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
			     int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
/* Generic counterpart of the ->this_node() stack operation. */
int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
			    unsigned int *node);
|
|
|
|
struct ocfs2_lock_res;
|
|
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
|
|
int mode,
|
|
struct ocfs2_dlm_lksb *lksb,
|
|
u32 flags,
|
|
void *name,
|
|
unsigned int namelen);
|
|
int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
|
|
struct ocfs2_dlm_lksb *lksb,
|
|
u32 flags);
|
|
|
|
/*
 * Accessors for a lock status block.  The fs code should never dereference
 * the lksb union itself; these mirror the ->lock_status(), ->lvb_valid(),
 * ->lock_lvb() and ->dump_lksb() stack operations.
 */
int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
|
|
|
|
int ocfs2_stack_supports_plocks(void);
|
|
int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
|
|
struct file *file, int cmd, struct file_lock *fl);
|
|
|
|
void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
|
|
|
|
|
|
/*
 * Used by stack plugins.  Each plugin describes itself by registering an
 * ocfs2_stack_plugin structure.
 */
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
|
|
|
|
/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
int ocfs2_is_o2cb_active(void);
|
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably the kset that ocfs2's sysfs objects are
 * registered under -- confirm against the defining file.
 */
extern struct kset *ocfs2_kset;
|
|
|
|
#endif /* STACKGLUE_H */
|