diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 8558226281c4..22af489d3f34 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1862,7 +1862,7 @@ xfs_alloc_fix_freelist( (pag->pagf_longest - delta) : (pag->pagf_flcount > 0 || pag->pagf_longest > 0); if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (args->minleft && + (!(flags & XFS_ALLOC_FLAG_FREEING) && (int)(pag->pagf_freeblks + pag->pagf_flcount - need - args->total) < (int)args->minleft)) { @@ -1898,7 +1898,7 @@ xfs_alloc_fix_freelist( longest = (longest > delta) ? (longest - delta) : (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (args->minleft && + (!(flags & XFS_ALLOC_FLAG_FREEING) && (int)(be32_to_cpu(agf->agf_freeblks) + be32_to_cpu(agf->agf_flcount) - need - args->total) < (int)args->minleft)) { @@ -1951,8 +1951,14 @@ xfs_alloc_fix_freelist( * the restrictions correctly. Can happen for free calls * on a completely full ag. */ - if (targs.agbno == NULLAGBLOCK) + if (targs.agbno == NULLAGBLOCK) { + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; + } break; + } /* * Put each allocated block on the list. */ @@ -2360,8 +2366,19 @@ xfs_alloc_vextent( if (args->agno == sagno && type == XFS_ALLOCTYPE_START_BNO) args->type = XFS_ALLOCTYPE_THIS_AG; - if (++(args->agno) == mp->m_sb.sb_agcount) - args->agno = 0; + /* + * For the first allocation, we can try any AG to get + * space. However, if we already have allocated a + * block, we don't want to try AGs whose number is below + * sagno. Otherwise, we may end up with out-of-order + * locking of AGF, which might cause deadlock. + */ + if (++(args->agno) == mp->m_sb.sb_agcount) { + if (args->firstblock != NULLFSBLOCK) + args->agno = sagno; + else + args->agno = 0; + } /* * Reached the starting a.g., must either be done * or switch to non-trylock mode. 
@@ -2443,7 +2460,7 @@ xfs_free_extent( args.minlen = args.minleft = args.minalignslop = 0; down_read(&args.mp->m_peraglock); args.pag = &args.mp->m_perag[args.agno]; - if ((error = xfs_alloc_fix_freelist(&args, 0))) + if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG ASSERT(args.agbp != NULL); diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 2d1f8928b267..650591f999ae 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -41,6 +41,7 @@ typedef enum xfs_alloctype * Flags for xfs_alloc_fix_freelist. */ #define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ /* * Argument structure for xfs_alloc routines. @@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg { char wasfromfl; /* set if allocation is from freelist */ char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ + xfs_fsblock_t firstblock; /* io first block allocated */ } xfs_alloc_arg_t; /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 890ad3528174..ad595dbefe16 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2762,6 +2762,7 @@ xfs_bmap_btalloc( args.mp = mp; args.fsbno = ap->rval; args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); + args.firstblock = ap->firstblock; blen = 0; if (nullfb) { args.type = XFS_ALLOCTYPE_START_BNO; @@ -2821,7 +2822,7 @@ xfs_bmap_btalloc( else args.minlen = ap->alen; } else if (ap->low) { - args.type = XFS_ALLOCTYPE_FIRST_AG; + args.type = XFS_ALLOCTYPE_START_BNO; args.total = args.minlen = ap->minlen; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -3452,6 +3453,7 @@ xfs_bmap_extents_to_btree( XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); args.tp = tp; args.mp = mp; + args.firstblock = *firstblock; if (*firstblock == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); @@ -3587,6 +3589,7 @@ 
xfs_bmap_local_to_extents( args.tp = tp; args.mp = ip->i_mount; + args.firstblock = *firstblock; ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); /* diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index bea44709afbe..3b6dfc9b53af 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -1569,12 +1569,11 @@ xfs_bmbt_split( lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp)); left = XFS_BUF_TO_BMBT_BLOCK(lbp); args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; if (args.fsbno == NULLFSBLOCK) { args.fsbno = lbno; args.type = XFS_ALLOCTYPE_START_BNO; - } else if (cur->bc_private.b.flist->xbf_low) - args.type = XFS_ALLOCTYPE_FIRST_AG; - else + } else args.type = XFS_ALLOCTYPE_NEAR_BNO; args.mod = args.minleft = args.alignment = args.total = args.isfl = args.userdata = args.minalignslop = 0; @@ -2356,6 +2355,7 @@ xfs_bmbt_newroot( args.userdata = args.minalignslop = 0; args.minlen = args.maxlen = args.prod = 1; args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.firstblock = args.fsbno; if (args.fsbno == NULLFSBLOCK) { #ifdef DEBUG if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { @@ -2365,9 +2365,7 @@ xfs_bmbt_newroot( #endif args.fsbno = INT_GET(*pp, ARCH_CONVERT); args.type = XFS_ALLOCTYPE_START_BNO; - } else if (args.wasdel) - args.type = XFS_ALLOCTYPE_FIRST_AG; - else + } else args.type = XFS_ALLOCTYPE_NEAR_BNO; if ((error = xfs_alloc_vextent(&args))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c0b1c2906880..4b7be49cc4de 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1254,6 +1254,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) xfs_trans_log_buf(tp, bp, first, last); } + +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data 
blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent; for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. Considering that the minimum number of + * needed freelist blocks is 4 fsbs and that a potential split of the file's bmap + * btree requires 1 fsb, we set the number of set-aside blocks to 8. +*/ +#define SET_ASIDE_BLOCKS 8 + /* * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply * a delta to a specified field in the in-core superblock. Simply @@ -1298,7 +1318,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, return 0; case XFS_SBS_FDBLOCKS: - lcounter = (long long)mp->m_sb.sb_fdblocks; + lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS; res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); if (delta > 0) { /* Putting blocks back */ @@ -1332,7 +1352,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, } } - mp->m_sb.sb_fdblocks = lcounter; + mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS; return 0; case XFS_SBS_FREXTENTS: lcounter = (long long)mp->m_sb.sb_frextents;