// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_health.h"

/*
 * Lookup a record by ino in the btree given by cur.
 */
int					/* error */
xfs_inobt_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agino_t		ino,	/* starting inode of chunk */
	xfs_lookup_t		dir,	/* <=, >=, == */
	int			*stat)	/* success/failure */
{
	cur->bc_rec.i.ir_startino = ino;
	cur->bc_rec.i.ir_holemask = 0;
	cur->bc_rec.i.ir_count = 0;
	cur->bc_rec.i.ir_freecount = 0;
	cur->bc_rec.i.ir_free = 0;
	return xfs_btree_lookup(cur, dir, stat);
}
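
/*
 * Usage example (illustrative): to walk every inobt record in an AG, position
 * the cursor at the lowest record with
 *
 *	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
 *
 * and then iterate with xfs_inobt_get_rec()/xfs_btree_increment(), as
 * xfs_check_agi_freecount() below does.
 */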

/*
 * Update the record referred to by cur to the value given.
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_inobt_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_inobt_rec_incore_t	*irec)	/* btree record */
{
	union xfs_btree_rec	rec;

	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
	if (xfs_has_sparseinodes(cur->bc_mp)) {
		rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
		rec.inobt.ir_u.sp.ir_count = irec->ir_count;
		rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
	} else {
		/* ir_holemask/ir_count not supported on-disk */
		rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
	}
	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
	return xfs_btree_update(cur, &rec);
}

/* Convert on-disk btree record to incore inobt record. */
void
xfs_inobt_btrec_to_irec(
	struct xfs_mount		*mp,
	const union xfs_btree_rec	*rec,
	struct xfs_inobt_rec_incore	*irec)
{
	irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
	if (xfs_has_sparseinodes(mp)) {
		irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
		irec->ir_count = rec->inobt.ir_u.sp.ir_count;
		irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
	} else {
		/*
		 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
		 * values for full inode chunks.
		 */
		irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
		irec->ir_count = XFS_INODES_PER_CHUNK;
		irec->ir_freecount =
				be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
	}
	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}

/* Compute the freecount of an incore inode record. */
uint8_t
xfs_inobt_rec_freecount(
	const struct xfs_inobt_rec_incore	*irec)
{
	uint64_t				realfree = irec->ir_free;

	if (xfs_inobt_issparse(irec->ir_holemask))
		realfree &= xfs_inobt_irec_to_allocmask(irec);
	return hweight64(realfree);
}
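
/*
 * Worked example (illustrative values): with XFS_INODES_PER_CHUNK == 64 and
 * XFS_INODES_PER_HOLEMASK_BIT == 4, a sparse record with ir_holemask 0xff00
 * marks inodes 32-63 as holes, so xfs_inobt_irec_to_allocmask() yields
 * 0x00000000ffffffff.  If ir_free is all ones, realfree masks down to the low
 * 32 bits and hweight64() returns a freecount of 32.
 */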

/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_inobt_rec_incore	*irec)
{
	/* Record has to be properly aligned within the AG. */
	if (!xfs_verify_agino(pag, irec->ir_startino))
		return __this_address;
	if (!xfs_verify_agino(pag,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
		return __this_address;
	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
	    irec->ir_count > XFS_INODES_PER_CHUNK)
		return __this_address;
	if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
		return __this_address;

	if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
		return __this_address;

	return NULL;
}

static inline int
xfs_inobt_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_inobt_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
	xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
		irec->ir_startino, irec->ir_count, irec->ir_freecount,
		irec->ir_free, irec->ir_holemask);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}

/*
 * Get the data from the pointed-to record.
 */
int
xfs_inobt_get_rec(
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*irec,
	int				*stat)
{
	struct xfs_mount		*mp = cur->bc_mp;
	union xfs_btree_rec		*rec;
	xfs_failaddr_t			fa;
	int				error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || *stat == 0)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, irec);

	return 0;
}

/*
 * Insert a single inobt record. Cursor must already point to desired location.
 */
int
xfs_inobt_insert_rec(
	struct xfs_btree_cur	*cur,
	uint16_t		holemask,
	uint8_t			count,
	int32_t			freecount,
	xfs_inofree_t		free,
	int			*stat)
{
	cur->bc_rec.i.ir_holemask = holemask;
	cur->bc_rec.i.ir_count = count;
	cur->bc_rec.i.ir_freecount = freecount;
	cur->bc_rec.i.ir_free = free;
	return xfs_btree_insert(cur, stat);
}

/*
 * Insert records describing a newly allocated inode chunk into the inobt.
 */
STATIC int
xfs_inobt_insert(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agino_t		newino,
	xfs_agino_t		newlen,
	bool			is_finobt)
{
	struct xfs_btree_cur	*cur;
	xfs_agino_t		thisino;
	int			i;
	int			error;

	if (is_finobt)
		cur = xfs_finobt_init_cursor(pag, tp, agbp);
	else
		cur = xfs_inobt_init_cursor(pag, tp, agbp);

	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 0);

		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
					     XFS_INODES_PER_CHUNK,
					     XFS_INODES_PER_CHUNK,
					     XFS_INOBT_ALL_FREE, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);

	return 0;
}

/*
 * Verify that the number of free inodes in the AGI is correct.
 */
#ifdef DEBUG
static int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_nlevels == 1) {
		xfs_inobt_rec_incore_t rec;
		int		freecount = 0;
		int		error;
		int		i;

		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
		if (error)
			return error;

		do {
			error = xfs_inobt_get_rec(cur, &rec, &i);
			if (error)
				return error;

			if (i) {
				freecount += rec.ir_freecount;
				error = xfs_btree_increment(cur, 0, &i);
				if (error)
					return error;
			}
		} while (i == 1);

		if (!xfs_is_shutdown(cur->bc_mp)) {
			ASSERT(freecount ==
				to_perag(cur->bc_group)->pagi_freecount);
		}
	}
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif

/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 */
int
xfs_ialloc_inode_init(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct list_head	*buffer_list,
	int			icount,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_agblock_t		length,
	unsigned int		gen)
{
	struct xfs_buf		*fbuf;
	struct xfs_dinode	*free;
	int			nbufs;
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;
	int			error;

	/*
	 * Loop over the new block(s), filling in the inodes.  For small block
	 * sizes, manipulate the inodes in buffers which are multiples of the
	 * block size.
	 */
	nbufs = length / M_IGEO(mp)->blocks_per_cluster;
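	/*
	 * Worked example (illustrative values): if the new chunk spans
	 * length == 8 filesystem blocks and blocks_per_cluster == 4, it is
	 * initialised through nbufs == 2 cluster-sized buffers.
	 */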

	/*
	 * Figure out what version number to use in the inodes we create.  If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version.  Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have to log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inodes we log the entire buffer rather than just
	 * the inode cores.
	 */
	if (xfs_has_v3inodes(mp)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));

		/*
		 * Log the initialisation that is about to take place as a
		 * logical operation. This means the transaction does not
		 * need to log the physical changes to the inode buffers as log
		 * recovery will know what initialisation is actually needed.
		 * Hence we only need to log the buffers as "ordered" buffers so
		 * they track in the AIL as if they were physically logged.
		 */
		if (tp)
			xfs_icreate_log(tp, agno, agbno, icount,
					mp->m_sb.sb_inodesize, length, gen);
	} else
		version = 2;

	for (j = 0; j < nbufs; j++) {
		/*
		 * Get the block.
		 */
		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
				(j * M_IGEO(mp)->blocks_per_cluster));
		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
				XBF_UNMAPPED, &fbuf);
		if (error)
			return error;

		/* Initialize the inode buffers and log them appropriately. */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);

			if (version == 3) {
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid,
					  &mp->m_sb.sb_meta_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else if (tp) {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
					  ioffset + XFS_DINODE_SIZE(mp) - 1);
			}
		}

		if (tp) {
			/*
			 * Mark the buffer as an inode allocation buffer so it
			 * sticks in the AIL at the point of this allocation
			 * transaction. This ensures the buffers are on disk
			 * before the tail of the log can be moved past this
			 * transaction (i.e. by preventing relogging from
			 * moving it forward in the log).
			 */
			xfs_trans_inode_alloc_buf(tp, fbuf);
			if (version == 3) {
				/*
				 * Mark the buffer as ordered so that it is not
				 * physically logged in the transaction but
				 * still tracked in the AIL as part of the
				 * transaction and pins the log appropriately.
				 */
				xfs_trans_ordered_buf(tp, fbuf);
			}
		} else {
			fbuf->b_flags |= XBF_DONE;
			xfs_buf_delwri_queue(fbuf, buffer_list);
			xfs_buf_relse(fbuf);
		}
	}
	return 0;
}

/*
 * Align startino and allocmask for a recently allocated sparse chunk such that
 * they are fit for insertion (or merge) into the on-disk inode btrees.
 *
 * Background:
 *
 * When enabled, sparse inode support increases the inode alignment from cluster
 * size to inode chunk size. This means that the minimum range between two
 * non-adjacent inode records in the inobt is large enough for a full inode
 * record. This allows for cluster sized, cluster aligned block allocation
 * without need to worry about whether the resulting inode record overlaps with
 * another record in the tree. Without this basic rule, we would have to deal
 * with the consequences of overlap by potentially undoing recent allocations in
 * the inode allocation codepath.
 *
 * Because of this alignment rule (which is enforced on mount), there are two
 * inobt possibilities for newly allocated sparse chunks. One is that the
 * aligned inode record for the chunk covers a range of inodes not already
 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 * other is that a record already exists at the aligned startino that considers
 * the newly allocated range as sparse. In the latter case, record content is
 * merged in hope that sparse inode chunks fill to full chunks over time.
 */
STATIC void
xfs_align_sparse_ino(
	struct xfs_mount		*mp,
	xfs_agino_t			*startino,
	uint16_t			*allocmask)
{
	xfs_agblock_t			agbno;
	xfs_agblock_t			mod;
	int				offset;

	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
	mod = agbno % mp->m_sb.sb_inoalignmt;
	if (!mod)
		return;

	/* calculate the inode offset and align startino */
	offset = XFS_AGB_TO_AGINO(mp, mod);
	*startino -= offset;

	/*
	 * Since startino has been aligned down, left shift allocmask such that
	 * it continues to represent the same physical inodes relative to the
	 * new startino.
	 */
	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
}
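
/*
 * Worked example (illustrative values): with 16 inodes per block,
 * sb_inoalignmt == 4 and a sparse allocation starting at agbno 130, mod == 2,
 * so offset == XFS_AGB_TO_AGINO(mp, 2) == 32 inodes.  startino is aligned
 * down by 32 inodes and, with XFS_INODES_PER_HOLEMASK_BIT == 4, allocmask is
 * shifted left by 8 bits so its set bits still name the same physical inodes.
 */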

/*
 * Determine whether the source inode record can merge into the target. Both
 * records must be sparse, the inode ranges must match and there must be no
 * allocation overlap between the records.
 */
STATIC bool
__xfs_inobt_can_merge(
	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
	struct xfs_inobt_rec_incore	*srec)	/* src record */
{
	uint64_t			talloc;
	uint64_t			salloc;

	/* records must cover the same inode range */
	if (trec->ir_startino != srec->ir_startino)
		return false;

	/* both records must be sparse */
	if (!xfs_inobt_issparse(trec->ir_holemask) ||
	    !xfs_inobt_issparse(srec->ir_holemask))
		return false;

	/* both records must track some inodes */
	if (!trec->ir_count || !srec->ir_count)
		return false;

	/* can't exceed capacity of a full record */
	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
		return false;

	/* verify there is no allocation overlap */
	talloc = xfs_inobt_irec_to_allocmask(trec);
	salloc = xfs_inobt_irec_to_allocmask(srec);
	if (talloc & salloc)
		return false;

	return true;
}

/*
 * Merge the source inode record into the target. The caller must call
 * __xfs_inobt_can_merge() to ensure the merge is valid.
 */
STATIC void
__xfs_inobt_rec_merge(
	struct xfs_inobt_rec_incore	*trec,	/* target */
	struct xfs_inobt_rec_incore	*srec)	/* src */
{
	ASSERT(trec->ir_startino == srec->ir_startino);

	/* combine the counts */
	trec->ir_count += srec->ir_count;
	trec->ir_freecount += srec->ir_freecount;

	/*
	 * Merge the holemask and free mask. For both fields, 0 bits refer to
	 * allocated inodes. We combine the allocated ranges with bitwise AND.
	 */
	trec->ir_holemask &= srec->ir_holemask;
	trec->ir_free &= srec->ir_free;
}
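
/*
 * Worked example (illustrative values): merging a source record with
 * ir_holemask 0x00ff (only inodes 32-63 allocated, all free) into a target
 * with ir_holemask 0xff00 (only inodes 0-31 allocated, all free) yields
 * ir_holemask 0x0000 (a full chunk) with ir_count and ir_freecount of 64.
 * Because hole regions keep their ir_free bits set, the bitwise AND of the
 * free masks still has a 1 bit for every free inode in the combined range.
 */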

/*
 * Insert a new sparse inode chunk into the associated inode allocation btree.
 * The inode record for the sparse chunk is pre-aligned to a startino that
 * should match any pre-existing sparse inode record in the tree. This allows
 * sparse chunks to fill over time.
 *
 * If no preexisting record exists, the provided record is inserted.
 * If there is a preexisting record, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 */
STATIC int
xfs_inobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;
	struct xfs_inobt_rec_incore	rec;

	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	/*
	 * A record exists at this startino.  Merge the records.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}
	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * This should never fail. If we have coexisting records that
	 * cannot merge, something is seriously wrong.
	 */
	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	trace_xfs_irec_merge_pre(pag, &rec, nrec);

	/* merge into nrec to output the updated record */
	__xfs_inobt_rec_merge(nrec, &rec);

	trace_xfs_irec_merge_post(pag, nrec);

	error = xfs_inobt_rec_check_count(mp, nrec);
	if (error)
		goto error;

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Insert a new sparse inode chunk into the free inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * The new record is always inserted, overwriting a pre-existing record if
 * there is one.
 */
STATIC int
xfs_finobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}
	} else {
		error = xfs_inobt_update(cur, nrec);
		if (error)
			goto error;
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}


/*
 * Allocate new inodes in the allocation group specified by agbp.  Returns 0
 * if inodes were allocated in this AG; -EAGAIN if there was no space in this
 * AG, so the caller knows it can try another AG; a hard -ENOSPC when over the
 * maximum inode count threshold; or the usual negative error code for other
 * errors.
 */
STATIC int
xfs_ialloc_ag_alloc(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agi		*agi;
	struct xfs_alloc_arg	args;
	int			error;
	xfs_agino_t		newino;		/* new first inode's number */
	xfs_agino_t		newlen;		/* new number of inodes */
	int			isaligned = 0;	/* inode allocation at stripe */
						/* unit boundary */
	/* init. to full chunk */
	struct xfs_inobt_rec_incore rec;
	struct xfs_ino_geometry	*igeo = M_IGEO(tp->t_mountp);
	uint16_t		allocmask = (uint16_t) -1;
	int			do_sparse = 0;

	memset(&args, 0, sizeof(args));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.fsbno = NULLFSBLOCK;
	args.oinfo = XFS_RMAP_OINFO_INODES;
	args.pag = pag;

#ifdef DEBUG
	/* randomly do sparse inode allocations */
	if (xfs_has_sparseinodes(tp->t_mountp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks)
		do_sparse = get_random_u32_below(2);
#endif

	/*
	 * Locking will ensure that we don't have two callers in here
	 * at one time.
	 */
	newlen = igeo->ialloc_inos;
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&args.mp->m_icount) + newlen >
							igeo->maxicount)
		return -ENOSPC;
	args.minlen = args.maxlen = igeo->ialloc_blks;
	/*
	 * First try to allocate inodes contiguous with the last-allocated
	 * chunk of inodes.  If the filesystem is striped, this will fill
	 * an entire stripe unit with inodes.
	 */
	agi = agbp->b_addr;
	newino = be32_to_cpu(agi->agi_newino);
	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
		     igeo->ialloc_blks;
	if (do_sparse)
		goto sparse_alloc;
	if (likely(newino != NULLAGINO &&
		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
		args.prod = 1;

		/*
		 * We need to take into account alignment here to ensure that
		 * we don't modify the free list if we fail to have an exact
		 * block. If we don't have an exact match, and every other
		 * allocation attempt fails, we'll end up cancelling a dirty
		 * transaction and shutting down.
		 *
		 * For an exact allocation, alignment must be 1,
		 * however we need to take cluster alignment into account when
		 * fixing up the freelist. Use the minalignslop field to
		 * indicate that extra blocks might be required for alignment,
		 * but not to use them in the actual exact allocation.
		 */
		args.alignment = 1;
		args.minalignslop = igeo->cluster_align - 1;

		/* Allow space for the inode btree to split. */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_exact_bno(&args,
				xfs_agbno_to_fsb(pag, args.agbno));
		if (error)
			return error;

		/*
		 * This request might have dirtied the transaction if the AG can
		 * satisfy the request, but the exact block was not available.
		 * If the allocation did fail, subsequent requests will relax
		 * the exact agbno requirement and increase the alignment
		 * instead. It is critical that the total size of the request
		 * (len + alignment + slop) does not increase from this point
		 * on, so reset minalignslop to ensure it is not included in
		 * subsequent requests.
		 */
		args.minalignslop = 0;
	}

	if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (igeo->ialloc_align) {
			ASSERT(!xfs_has_noalign(args.mp));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else
			args.alignment = igeo->cluster_align;
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * If stripe alignment is turned on, then try again with cluster
	 * alignment.
	 */
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		args.alignment = igeo->cluster_align;
		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * Finally, try a sparse allocation if the filesystem supports it and
	 * the sparse allocation length is smaller than a full chunk.
	 */
	if (xfs_has_sparseinodes(args.mp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks &&
	    args.fsbno == NULLFSBLOCK) {
sparse_alloc:
		args.alignment = args.mp->m_sb.sb_spino_align;
		args.prod = 1;

		args.minlen = igeo->ialloc_min_blks;
		args.maxlen = args.minlen;

		/*
		 * The inode record will be aligned to full chunk size. We must
		 * prevent sparse allocation from AG boundaries that result in
		 * invalid inode records, such as records that start at agbno 0
		 * or extend beyond the AG.
		 *
		 * Set min agbno to the first aligned, non-zero agbno and max to
		 * the last aligned agbno that is at least one full chunk from
		 * the end of the AG.
		 */
		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
							pag_agno(pag)),
					    args.mp->m_sb.sb_inoalignmt) -
				 igeo->ialloc_blks;

		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;

		newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
		ASSERT(newlen <= XFS_INODES_PER_CHUNK);
		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
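		/*
		 * Worked example (illustrative values): a sparse allocation of
		 * newlen == 32 inodes with XFS_INODES_PER_HOLEMASK_BIT == 4
		 * sets allocmask to (1 << 8) - 1 == 0xff, i.e. the low 8
		 * holemask bits cover the 32 allocated inodes.
		 */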
	}

	if (args.fsbno == NULLFSBLOCK)
		return -EAGAIN;

	ASSERT(args.len == args.minlen);

	/*
	 * Stamp and write the inode buffers.
	 *
	 * Seed the new inode cluster with a random generation number. This
	 * prevents short-term reuse of generation numbers if a chunk is
	 * freed and then immediately reallocated. We use random numbers
	 * rather than a linear progression to prevent the next generation
	 * number from being easily guessable.
	 */
	error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag),
			args.agbno, args.len, get_random_u32());

	if (error)
		return error;
	/*
	 * Convert the results.
	 */
	newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);

	if (xfs_inobt_issparse(~allocmask)) {
		/*
		 * We've allocated a sparse chunk. Align the startino and mask.
		 */
		xfs_align_sparse_ino(args.mp, &newino, &allocmask);

		rec.ir_startino = newino;
		rec.ir_holemask = ~allocmask;
		rec.ir_count = newlen;
		rec.ir_freecount = newlen;
		rec.ir_free = XFS_INOBT_ALL_FREE;

		/*
		 * Insert the sparse record into the inobt and allow for a merge
		 * if necessary. If a merge does occur, rec is updated to the
		 * merged record.
		 */
		error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
		if (error == -EFSCORRUPTED) {
			xfs_alert(args.mp,
	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
				  xfs_agino_to_ino(pag, rec.ir_startino),
				  rec.ir_holemask, rec.ir_count);
			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
		}
		if (error)
			return error;

		/*
		 * We can't merge the newly allocated part into the finobt the
		 * way we just did for the inobt, due to finobt semantics. The
		 * original record may or may not exist independent of whether
		 * physical inodes exist in this sparse chunk.
		 *
		 * We must update the finobt record based on the inobt record.
		 * rec contains the fully merged and up to date inobt record
		 * from the previous call, and xfs_finobt_insert_sprec()
		 * replaces any existing finobt record with it.
		 */
		if (xfs_has_finobt(args.mp)) {
			error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
			if (error)
				return error;
		}
	} else {
		/* full chunk - insert new records to both btrees */
		error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
		if (error)
			return error;

		if (xfs_has_finobt(args.mp)) {
			error = xfs_inobt_insert(pag, tp, agbp, newino,
						 newlen, true);
			if (error)
				return error;
		}
	}

	/*
	 * Update AGI counts and newino.
	 */
	be32_add_cpu(&agi->agi_count, newlen);
	be32_add_cpu(&agi->agi_freecount, newlen);
	pag->pagi_freecount += newlen;
	pag->pagi_count += newlen;
	agi->agi_newino = cpu_to_be32(newino);

	/*
	 * Log allocation group header fields
	 */
	xfs_ialloc_log_agi(tp, agbp,
		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
	/*
	 * Modify/log superblock values for inode count and inode free count.
	 */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
	return 0;
}

/*
 * Try to retrieve the next record to the left/right from the current one.
 */
STATIC int
xfs_ialloc_next_rec(
	struct xfs_btree_cur	*cur,
	xfs_inobt_rec_incore_t	*rec,
	int			*done,
	int			left)
{
	int			error;
	int			i;

	if (left)
		error = xfs_btree_decrement(cur, 0, &i);
	else
		error = xfs_btree_increment(cur, 0, &i);

	if (error)
		return error;
	*done = !i;
	if (i) {
		error = xfs_inobt_get_rec(cur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}
	}

	return 0;
}

STATIC int
xfs_ialloc_get_rec(
	struct xfs_btree_cur	*cur,
	xfs_agino_t		agino,
	xfs_inobt_rec_incore_t	*rec,
	int			*done)
{
	int			error;
	int			i;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	*done = !i;
	if (i) {
		error = xfs_inobt_get_rec(cur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}
	}

	return 0;
}

/*
 * Return the offset of the first free inode in the record. If the inode chunk
 * is sparsely allocated, we convert the record holemask to inode granularity
 * and mask off the unallocated regions from the inode free mask.
 */
STATIC int
xfs_inobt_first_free_inode(
	struct xfs_inobt_rec_incore	*rec)
{
	xfs_inofree_t			realfree;

	/* if there are no holes, return the first available offset */
	if (!xfs_inobt_issparse(rec->ir_holemask))
		return xfs_lowbit64(rec->ir_free);

	realfree = xfs_inobt_irec_to_allocmask(rec);
	realfree &= rec->ir_free;

	return xfs_lowbit64(realfree);
}
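
/*
 * Worked example (illustrative values): for a sparse record with ir_holemask
 * 0x00ff (inodes 0-31 are holes) and ir_free all ones, the allocmask is
 * 0xffffffff00000000, realfree masks down to the allocated half, and
 * xfs_lowbit64() returns offset 32, the first real free inode.
 */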

/*
 * If this AG has corrupt inodes, check if allocating this inode would fail
 * with corruption errors.  Returns 0 if we're clear, or -EAGAIN to try again
 * somewhere else.
 */
static int
xfs_dialloc_check_ino(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino)
{
	struct xfs_imap		imap;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_imap(pag, tp, ino, &imap, 0);
	if (error)
		return -EAGAIN;

	error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp);
	if (error)
		return -EAGAIN;

	xfs_trans_brelse(tp, bp);
	return 0;
}

/*
 * Allocate an inode using the inobt-only algorithm.
 */
STATIC int
xfs_dialloc_ag_inobt(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agi		*agi = agbp->b_addr;
	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur	*cur, *tcur;
	struct xfs_inobt_rec_incore rec, trec;
	xfs_ino_t		ino;
	int			error;
	int			offset;
	int			i, j;
	int			searchdistance = 10;

	ASSERT(xfs_perag_initialised_agi(pag));
	ASSERT(xfs_perag_allows_inodes(pag));
	ASSERT(pag->pagi_freecount > 0);

 restart_pagno:
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * If in the same AG as the parent, try to get near the parent.
	 */
	if (pagno == pag_agno(pag)) {
		int		doneleft;	/* done, to the left */
		int		doneright;	/* done, to the right */

		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_inobt_get_rec(cur, &rec, &j);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, j != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		if (rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as the parent, done.
			 */
			goto alloc_inode;
		}


		/*
		 * In the same AG as parent, but parent's chunk is full.
		 */

		/* duplicate the cursor, search left & right simultaneously */
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;

		/*
		 * Skip to last blocks looked up if same parent inode.
		 */
		if (pagino != NULLAGINO &&
		    pag->pagl_pagino == pagino &&
		    pag->pagl_leftrec != NULLAGINO &&
		    pag->pagl_rightrec != NULLAGINO) {
			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
						   &trec, &doneleft);
			if (error)
				goto error1;

			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
						   &rec, &doneright);
			if (error)
				goto error1;
		} else {
			/* search left with tcur, back up 1 record */
			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
			if (error)
				goto error1;

			/* search right with cur, go forward 1 record. */
			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
			if (error)
				goto error1;
		}

		/*
		 * Loop until we find an inode chunk with a free inode.
		 */
		while (--searchdistance > 0 && (!doneleft || !doneright)) {
			int	useleft;  /* using left inode chunk this time */

			/* figure out the closer block if both are valid. */
			if (!doneleft && !doneright) {
				useleft = pagino -
				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
				  rec.ir_startino - pagino;
			} else {
				useleft = !doneleft;
			}
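			/*
			 * The comparison above measures the distance from
			 * pagino to the last inode of the left chunk against
			 * the distance to the first inode of the right chunk,
			 * and picks whichever chunk is closer.
			 */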

			/* free inodes to the left? */
			if (useleft && trec.ir_freecount) {
				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
				cur = tcur;

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				rec = trec;
				goto alloc_inode;
			}

			/* free inodes to the right? */
			if (!useleft && rec.ir_freecount) {
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				goto alloc_inode;
			}

			/* get next record to check */
			if (useleft) {
				error = xfs_ialloc_next_rec(tcur, &trec,
								 &doneleft, 1);
			} else {
				error = xfs_ialloc_next_rec(cur, &rec,
								 &doneright, 0);
			}
			if (error)
				goto error1;
		}

		if (searchdistance <= 0) {
			/*
			 * Not in range - save last search
			 * location and allocate a new inode
			 */
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			pag->pagl_leftrec = trec.ir_startino;
			pag->pagl_rightrec = rec.ir_startino;
			pag->pagl_pagino = pagino;

		} else {
			/*
			 * We've reached the end of the btree. Because we only
			 * search a small chunk of the btree on each pass,
			 * there must be free inodes closer to the parent
			 * inode than where we are now. Restart the search.
			 */
			pag->pagl_pagino = NULLAGINO;
			pag->pagl_leftrec = NULLAGINO;
			pag->pagl_rightrec = NULLAGINO;
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
			goto restart_pagno;
		}
	}

	/*
	 * In a different AG from the parent.
	 * See if the most recently allocated block has any free.
	 */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			goto error0;

		if (i == 1) {
			error = xfs_inobt_get_rec(cur, &rec, &j);
			if (error)
				goto error0;

			if (j == 1 && rec.ir_freecount > 0) {
				/*
				 * The last chunk allocated in the group
				 * still has a free inode.
				 */
				goto alloc_inode;
			}
		}
	}

	/*
	 * None left in the last group, search the whole AG
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}

	for (;;) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if (rec.ir_freecount > 0)
			break;
		error = xfs_btree_increment(cur, 0, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
	}

alloc_inode:
	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error0;
	}

	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	error = xfs_inobt_update(cur, &rec);
	if (error)
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Use the free inode btree to allocate an inode based on distance from the
 * parent. Note that the provided cursor may be deleted and replaced.
 */
STATIC int
xfs_dialloc_ag_finobt_near(
	xfs_agino_t			pagino,
	struct xfs_btree_cur		**ocur,
	struct xfs_inobt_rec_incore	*rec)
{
	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
	struct xfs_btree_cur		*rcur;	/* right search cursor */
	struct xfs_inobt_rec_incore	rrec;
	int				error;
	int				i, j;

	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
	if (error)
		return error;

	if (i == 1) {
		error = xfs_inobt_get_rec(lcur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(lcur);
			return -EFSCORRUPTED;
		}

		/*
		 * See if we've landed in the parent inode record. The finobt
		 * only tracks chunks with at least one free inode, so record
		 * existence is enough.
		 */
		if (pagino >= rec->ir_startino &&
		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
			return 0;
	}

	error = xfs_btree_dup_cursor(lcur, &rcur);
	if (error)
		return error;

	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
	if (error)
		goto error_rcur;
	if (j == 1) {
		error = xfs_inobt_get_rec(rcur, &rrec, &j);
		if (error)
			goto error_rcur;
		if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
			xfs_btree_mark_sick(lcur);
			error = -EFSCORRUPTED;
			goto error_rcur;
		}
	}

	if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
		xfs_btree_mark_sick(lcur);
		error = -EFSCORRUPTED;
		goto error_rcur;
	}
	if (i == 1 && j == 1) {
		/*
		 * Both the left and right records are valid. Choose the closer
		 * inode chunk to the target.
		 */
		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
		    (rrec.ir_startino - pagino)) {
			*rec = rrec;
			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
			*ocur = rcur;
		} else {
			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
		}
	} else if (j == 1) {
		/* only the right record is valid */
		*rec = rrec;
		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
		*ocur = rcur;
	} else if (i == 1) {
		/* only the left record is valid */
		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
	}

	return 0;

error_rcur:
	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Use the free inode btree to find a free inode based on a newino hint. If
 * the hint is NULL, find the first free inode in the AG.
 */
STATIC int
xfs_dialloc_ag_finobt_newino(
	struct xfs_agi			*agi,
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*rec)
{
	int error;
	int i;

	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			return error;
		if (i == 1) {
			error = xfs_inobt_get_rec(cur, rec, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				return -EFSCORRUPTED;
			}
			return 0;
		}
	}

	/*
	 * Find the first inode available in the AG.
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Update the inobt based on a modification made to the finobt. Also ensure
 * that the records from both trees are equivalent post-modification.
 */
STATIC int
xfs_dialloc_ag_update_inobt(
	struct xfs_btree_cur		*cur,	/* inobt cursor */
	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
	int				offset) /* inode offset */
{
	struct xfs_inobt_rec_incore	rec;
	int				error;
	int				i;

	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);

	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;

	if (XFS_IS_CORRUPT(cur->bc_mp,
			   rec.ir_free != frec->ir_free ||
			   rec.ir_freecount != frec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_inobt_update(cur, &rec);
}

/*
 * Allocate an inode using the free inode btree, if available. Otherwise, fall
 * back to the inobt search algorithm.
 *
 * The caller selected an AG for us, and made sure that free inodes are
 * available.
 */
static int
xfs_dialloc_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_agi			*agi = agbp->b_addr;
	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur		*cur;	/* finobt cursor */
	struct xfs_btree_cur		*icur;	/* inobt cursor */
	struct xfs_inobt_rec_incore	rec;
	xfs_ino_t			ino;
	int				error;
	int				offset;
	int				i;

	if (!xfs_has_finobt(mp))
		return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);

	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_cur;

	/*
	 * The search algorithm depends on whether we're in the same AG as the
	 * parent. If so, find the closest available inode to the parent. If
	 * not, consider the agi hint or find the first free inode in the AG.
	 */
	if (pag_agno(pag) == pagno)
		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
	else
		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
	if (error)
		goto error_cur;

	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error_cur;
	}

	/*
	 * Modify or remove the finobt record.
	 */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	if (rec.ir_freecount)
		error = xfs_inobt_update(cur, &rec);
	else
		error = xfs_btree_delete(cur, &i);
	if (error)
		goto error_cur;

	/*
	 * The finobt has now been updated appropriately. We haven't updated the
	 * agi and superblock yet, so we can create an inobt cursor and validate
	 * the original freecount. If all is well, make the equivalent update to
	 * the inobt using the finobt record and offset information.
	 */
	icur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;

	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
	if (error)
		goto error_icur;

	/*
	 * Both trees have now been updated. We must update the perag and
	 * superblock before we can check the freecount for each btree.
	 */
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_icur;

	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	*inop = ino;
	return 0;

error_icur:
	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
1681  
1682  static int
xfs_dialloc_roll(struct xfs_trans ** tpp,struct xfs_buf * agibp)1683  xfs_dialloc_roll(
1684  	struct xfs_trans	**tpp,
1685  	struct xfs_buf		*agibp)
1686  {
1687  	struct xfs_trans	*tp = *tpp;
1688  	struct xfs_dquot_acct	*dqinfo;
1689  	int			error;
1690  
1691  	/*
1692  	 * Hold to on to the agibp across the commit so no other allocation can
1693  	 * come in and take the free inodes we just allocated for our caller.
1694  	 */
1695  	xfs_trans_bhold(tp, agibp);
1696  
1697  	/*
1698  	 * We want the quota changes to be associated with the next transaction,
1699  	 * NOT this one. So, detach the dqinfo from this and attach it to the
1700  	 * next transaction.
1701  	 */
1702  	dqinfo = tp->t_dqinfo;
1703  	tp->t_dqinfo = NULL;
1704  
1705  	error = xfs_trans_roll(&tp);
1706  
1707  	/* Re-attach the quota info that we detached from the previous trans. */
1708  	tp->t_dqinfo = dqinfo;
1709  
1710  	/*
1711  	 * Join the buffer even on commit error so that the buffer is released
1712  	 * when the caller cancels the transaction and doesn't have to handle
1713  	 * this error case specially.
1714  	 */
1715  	xfs_trans_bjoin(tp, agibp);
1716  	*tpp = tp;
1717  	return error;
1718  }
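
/*
 * Illustrative sketch (hypothetical caller): the bhold/roll/bjoin sequence
 * above is the general pattern for keeping a buffer locked across a
 * transaction commit.  A caller needing the same guarantee for some buffer
 * bp would do:
 *
 *	xfs_trans_bhold(tp, bp);	// keep bp locked over the commit
 *	error = xfs_trans_roll(&tp);	// commit, start a new transaction
 *	xfs_trans_bjoin(tp, bp);	// rejoin bp to the new transaction
 *
 * Joining even when the roll fails means that cancelling the new
 * transaction releases bp without a special error path.
 */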
1719  
1720  static bool
1721  xfs_dialloc_good_ag(
1722  	struct xfs_perag	*pag,
1723  	struct xfs_trans	*tp,
1724  	umode_t			mode,
1725  	int			flags,
1726  	bool			ok_alloc)
1727  {
1728  	struct xfs_mount	*mp = tp->t_mountp;
1729  	xfs_extlen_t		ineed;
1730  	xfs_extlen_t		longest = 0;
1731  	int			needspace;
1732  	int			error;
1733  
1734  	if (!pag)
1735  		return false;
1736  	if (!xfs_perag_allows_inodes(pag))
1737  		return false;
1738  
1739  	if (!xfs_perag_initialised_agi(pag)) {
1740  		error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
1741  		if (error)
1742  			return false;
1743  	}
1744  
1745  	if (pag->pagi_freecount)
1746  		return true;
1747  	if (!ok_alloc)
1748  		return false;
1749  
1750  	if (!xfs_perag_initialised_agf(pag)) {
1751  		error = xfs_alloc_read_agf(pag, tp, flags, NULL);
1752  		if (error)
1753  			return false;
1754  	}
1755  
1756  	/*
1757  	 * Check that there is enough free space for the file plus a chunk of
1758  	 * inodes if we need to allocate some. If this is the first pass across
1759  	 * the AGs, take into account the potential space needed for alignment
1760  	 * of inode chunks when checking the longest contiguous free space in
1761  	 * the AG - this prevents us from getting ENOSPC because we have free
1762  	 * space larger than ialloc_blks but alignment constraints prevent us
1763  	 * from using it.
1764  	 *
1765  	 * If we can't find an AG with space for full alignment slack to be
1766  	 * taken into account, we must be near ENOSPC in all AGs.  Hence we
1767  	 * don't include alignment for the second pass and so if we fail
1768  	 * allocation due to alignment issues then it is most likely a real
1769  	 * ENOSPC condition.
1770  	 *
1771  	 * XXX(dgc): this calculation is now bogus thanks to the per-ag
1772  	 * reservations that xfs_alloc_fix_freelist() now does via
1773  	 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1774  	 * be more than large enough for the check below to succeed, but
1775  	 * xfs_alloc_space_available() will fail because of the non-zero
1776  	 * metadata reservation and hence we won't actually be able to allocate
1777  	 * more inodes in this AG. We do a great deal of unnecessary work near
1778  	 * ENOSPC because of this.
1779  	 */
1780  	ineed = M_IGEO(mp)->ialloc_min_blks;
1781  	if (flags && ineed > 1)
1782  		ineed += M_IGEO(mp)->cluster_align;
1783  	longest = pag->pagf_longest;
1784  	if (!longest)
1785  		longest = pag->pagf_flcount > 0;
1786  	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
1787  
1788  	if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
1789  		return false;
1790  	return true;
1791  }
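
/*
 * Worked example (hypothetical geometry): with ialloc_min_blks = 4 and
 * cluster_align = 4, the first pass (flags != 0) needs ineed = 8 blocks.
 * An AG with pagf_freeblks = 10 and pagf_longest = 6 is rejected because
 * longest < ineed, even though the total free space would suffice; on the
 * second pass (flags == 0) ineed drops to 4 and the same AG qualifies.
 */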
1792  
1793  static int
1794  xfs_dialloc_try_ag(
1795  	struct xfs_perag	*pag,
1796  	struct xfs_trans	**tpp,
1797  	xfs_ino_t		parent,
1798  	xfs_ino_t		*new_ino,
1799  	bool			ok_alloc)
1800  {
1801  	struct xfs_buf		*agbp;
1802  	xfs_ino_t		ino;
1803  	int			error;
1804  
1805  	/*
1806  	 * Read in the AGI buffer and recheck the free inode count with the
1807  	 * AGI buffer lock held.
1808  	 */
1809  	error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
1810  	if (error)
1811  		return error;
1812  
1813  	if (!pag->pagi_freecount) {
1814  		if (!ok_alloc) {
1815  			error = -EAGAIN;
1816  			goto out_release;
1817  		}
1818  
1819  		error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
1820  		if (error < 0)
1821  			goto out_release;
1822  
1823  		/*
1824  		 * We successfully allocated space for an inode cluster in this
1825  		 * AG.  Roll the transaction so that we can allocate one of the
1826  		 * new inodes.
1827  		 */
1828  		ASSERT(pag->pagi_freecount > 0);
1829  		error = xfs_dialloc_roll(tpp, agbp);
1830  		if (error)
1831  			goto out_release;
1832  	}
1833  
1834  	/* Allocate an inode in the found AG */
1835  	error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
1836  	if (!error)
1837  		*new_ino = ino;
1838  	return error;
1839  
1840  out_release:
1841  	xfs_trans_brelse(*tpp, agbp);
1842  	return error;
1843  }
1844  
1845  /*
1846   * Pick an AG for the new inode.
1847   *
1848   * Directories, symlinks, and regular files frequently allocate at least one
1849   * block, so factor that potential expansion when we examine whether an AG has
1850   * enough space for file creation.  Try to keep metadata files all in the same
1851   * AG.
1852   */
1853  static inline xfs_agnumber_t
1854  xfs_dialloc_pick_ag(
1855  	struct xfs_mount	*mp,
1856  	struct xfs_inode	*dp,
1857  	umode_t			mode)
1858  {
1859  	xfs_agnumber_t		start_agno;
1860  
1861  	if (!dp)
1862  		return 0;
1863  	if (xfs_is_metadir_inode(dp)) {
1864  		if (mp->m_sb.sb_logstart)
1865  			return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
1866  		return 0;
1867  	}
1868  
1869  	if (S_ISDIR(mode))
1870  		return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
1871  
1872  	start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
1873  	if (start_agno >= mp->m_maxagi)
1874  		start_agno = 0;
1875  
1876  	return start_agno;
1877  }
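
/*
 * Example: with m_maxagi = 4, successive directory creations read the
 * rotor as 1, 2, 3, 4, 5, ... and thus start in AGs 0, 1, 2, 3, 0, ...,
 * spreading directories round-robin across the filesystem while
 * non-directories start in their parent's AG.
 */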
1878  
1879  /*
1880   * Allocate an on-disk inode.
1881   *
1882   * Mode is used to tell whether the new inode is a directory and hence where to
1883   * locate it. The on-disk inode that is allocated will be returned in @new_ino
1884   * on success, otherwise an error will be set to indicate the failure (e.g.
1885   * -ENOSPC).
1886   */
1887  int
1888  xfs_dialloc(
1889  	struct xfs_trans	**tpp,
1890  	const struct xfs_icreate_args *args,
1891  	xfs_ino_t		*new_ino)
1892  {
1893  	struct xfs_mount	*mp = (*tpp)->t_mountp;
1894  	struct xfs_perag	*pag;
1895  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
1896  	xfs_ino_t		ino = NULLFSINO;
1897  	xfs_ino_t		parent = args->pip ? args->pip->i_ino : 0;
1898  	xfs_agnumber_t		agno;
1899  	xfs_agnumber_t		start_agno;
1900  	umode_t			mode = args->mode & S_IFMT;
1901  	bool			ok_alloc = true;
1902  	bool			low_space = false;
1903  	int			flags;
1904  	int			error = 0;
1905  
1906  	start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode);
1907  
1908  	/*
1909  	 * If we have already hit the ceiling of inode blocks then clear
1910  	 * ok_alloc so we scan all available agi structures for a free
1911  	 * inode.
1912  	 *
1913  	 * Read a rough value of mp->m_icount via percpu_counter_read_positive,
1914  	 * which sacrifices precision for performance.
1915  	 */
1916  	if (igeo->maxicount &&
1917  	    percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
1918  							> igeo->maxicount) {
1919  		ok_alloc = false;
1920  	}
1921  
1922  	/*
1923  	 * If we are near to ENOSPC, we want to prefer allocation from AGs that
1924  	 * have free inodes in them rather than use up free space allocating new
1925  	 * inode chunks. Hence we turn off allocation for the first non-blocking
1926  	 * pass through the AGs if we are near ENOSPC to consume free inodes
1927  	 * that we can immediately allocate, but then we allow allocation on the
1928  	 * second pass if we fail to find an AG with free inodes in it.
1929  	 */
1930  	if (percpu_counter_read_positive(&mp->m_fdblocks) <
1931  			mp->m_low_space[XFS_LOWSP_1_PCNT]) {
1932  		ok_alloc = false;
1933  		low_space = true;
1934  	}
1935  
1936  	/*
1937  	 * Loop until we find an allocation group that either has free inodes
1938  	 * or in which we can allocate some inodes.  Iterate through the
1939  	 * allocation groups upward, wrapping at the end.
1940  	 */
1941  	flags = XFS_ALLOC_FLAG_TRYLOCK;
1942  retry:
1943  	for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
1944  		if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
1945  			error = xfs_dialloc_try_ag(pag, tpp, parent,
1946  					&ino, ok_alloc);
1947  			if (error != -EAGAIN)
1948  				break;
1949  			error = 0;
1950  		}
1951  
1952  		if (xfs_is_shutdown(mp)) {
1953  			error = -EFSCORRUPTED;
1954  			break;
1955  		}
1956  	}
1957  	if (pag)
1958  		xfs_perag_rele(pag);
1959  	if (error)
1960  		return error;
1961  	if (ino == NULLFSINO) {
1962  		if (flags) {
1963  			flags = 0;
1964  			if (low_space)
1965  				ok_alloc = true;
1966  			goto retry;
1967  		}
1968  		return -ENOSPC;
1969  	}
1970  
1971  	/*
1972  	 * Protect against obviously corrupt allocation btree records. Later
1973  	 * xfs_iget checks will catch re-allocation of other active in-memory
1974  	 * and on-disk inodes. If we don't catch reallocating the parent inode
1975  	 * here we will deadlock in xfs_iget() so we have to do these checks
1976  	 * first.
1977  	 */
1978  	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
1979  		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
1980  		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
1981  				XFS_SICK_AG_INOBT);
1982  		return -EFSCORRUPTED;
1983  	}
1984  
1985  	*new_ino = ino;
1986  	return 0;
1987  }
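
/*
 * Illustrative sketch (hypothetical caller): allocating an on-disk inode
 * for a new regular file under a parent directory dp might look like:
 *
 *	struct xfs_icreate_args	args = {
 *		.pip	= dp,
 *		.mode	= S_IFREG | 0644,
 *	};
 *	xfs_ino_t		ino;
 *
 *	error = xfs_dialloc(&tp, &args, &ino);
 *	if (error)
 *		return error;
 *	// ino now names a free on-disk inode; initialising the incore
 *	// and on-disk inode from it happens later in the create path.
 */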
1988  
1989  /*
1990   * Free the blocks of an inode chunk. We must consider that the inode chunk
1991   * might be sparse and only free the regions that are allocated as part of the
1992   * chunk.
1993   */
1994  static int
1995  xfs_difree_inode_chunk(
1996  	struct xfs_trans		*tp,
1997  	struct xfs_perag		*pag,
1998  	struct xfs_inobt_rec_incore	*rec)
1999  {
2000  	struct xfs_mount		*mp = tp->t_mountp;
2001  	xfs_agblock_t			sagbno = XFS_AGINO_TO_AGBNO(mp,
2002  							rec->ir_startino);
2003  	int				startidx, endidx;
2004  	int				nextbit;
2005  	xfs_agblock_t			agbno;
2006  	int				contigblk;
2007  	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
2008  
2009  	if (!xfs_inobt_issparse(rec->ir_holemask)) {
2010  		/* not sparse, calculate extent info directly */
2011  		return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno),
2012  				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
2013  				XFS_AG_RESV_NONE, 0);
2014  	}
2015  
2016  	/* holemask is only 16 bits (fits in an unsigned long) */
2017  	ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
2018  	holemask[0] = rec->ir_holemask;
2019  
2020  	/*
2021  	 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2022  	 * holemask and convert the start/end index of each range to an extent.
2023  	 * We start with the start and end index both pointing at the first 0 in
2024  	 * the mask.
2025  	 */
2026  	startidx = endidx = find_first_zero_bit(holemask,
2027  						XFS_INOBT_HOLEMASK_BITS);
2028  	nextbit = startidx + 1;
2029  	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
2030  		int error;
2031  
2032  		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
2033  					     nextbit);
2034  		/*
2035  		 * If the next zero bit is contiguous, update the end index of
2036  		 * the current range and continue.
2037  		 */
2038  		if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
2039  		    nextbit == endidx + 1) {
2040  			endidx = nextbit;
2041  			goto next;
2042  		}
2043  
2044  		/*
2045  		 * nextbit is not contiguous with the current end index. Convert
2046  		 * the current start/end to an extent and add it to the free
2047  		 * list.
2048  		 */
2049  		agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
2050  				  mp->m_sb.sb_inopblock;
2051  		contigblk = ((endidx - startidx + 1) *
2052  			     XFS_INODES_PER_HOLEMASK_BIT) /
2053  			    mp->m_sb.sb_inopblock;
2054  
2055  		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
2056  		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
2057  		error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno),
2058  				contigblk, &XFS_RMAP_OINFO_INODES,
2059  				XFS_AG_RESV_NONE, 0);
2060  		if (error)
2061  			return error;
2062  
2063  		/* reset range to current bit and carry on... */
2064  		startidx = endidx = nextbit;
2065  
2066  next:
2067  		nextbit++;
2068  	}
2069  	return 0;
2070  }
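
/*
 * Worked example: each of the 16 holemask bits covers
 * XFS_INODES_PER_HOLEMASK_BIT = 4 inodes of the 64-inode chunk.  With
 * sb_inopblock = 4, a holemask of 0x00f0 (inodes 16-31 are a hole) yields
 * two zero ranges, bits [0,3] and [8,15], which convert to the extents
 * (sagbno + 0, 4 blocks) and (sagbno + 8, 8 blocks); the unallocated hole
 * between them is never freed.
 */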
2071  
2072  STATIC int
2073  xfs_difree_inobt(
2074  	struct xfs_perag		*pag,
2075  	struct xfs_trans		*tp,
2076  	struct xfs_buf			*agbp,
2077  	xfs_agino_t			agino,
2078  	struct xfs_icluster		*xic,
2079  	struct xfs_inobt_rec_incore	*orec)
2080  {
2081  	struct xfs_mount		*mp = pag_mount(pag);
2082  	struct xfs_agi			*agi = agbp->b_addr;
2083  	struct xfs_btree_cur		*cur;
2084  	struct xfs_inobt_rec_incore	rec;
2085  	int				ilen;
2086  	int				error;
2087  	int				i;
2088  	int				off;
2089  
2090  	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2091  	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
2092  
2093  	/*
2094  	 * Initialize the cursor.
2095  	 */
2096  	cur = xfs_inobt_init_cursor(pag, tp, agbp);
2097  
2098  	error = xfs_check_agi_freecount(cur);
2099  	if (error)
2100  		goto error0;
2101  
2102  	/*
2103  	 * Look for the entry describing this inode.
2104  	 */
2105  	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
2106  		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
2107  			__func__, error);
2108  		goto error0;
2109  	}
2110  	if (XFS_IS_CORRUPT(mp, i != 1)) {
2111  		xfs_btree_mark_sick(cur);
2112  		error = -EFSCORRUPTED;
2113  		goto error0;
2114  	}
2115  	error = xfs_inobt_get_rec(cur, &rec, &i);
2116  	if (error) {
2117  		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
2118  			__func__, error);
2119  		goto error0;
2120  	}
2121  	if (XFS_IS_CORRUPT(mp, i != 1)) {
2122  		xfs_btree_mark_sick(cur);
2123  		error = -EFSCORRUPTED;
2124  		goto error0;
2125  	}
2126  	/*
2127  	 * Get the offset in the inode chunk.
2128  	 */
2129  	off = agino - rec.ir_startino;
2130  	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
2131  	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
2132  	/*
2133  	 * Mark the inode free & increment the count.
2134  	 */
2135  	rec.ir_free |= XFS_INOBT_MASK(off);
2136  	rec.ir_freecount++;
2137  
2138  	/*
2139  	 * When an inode chunk is free, it becomes eligible for removal. Don't
2140  	 * remove the chunk if the block size is large enough for multiple inode
2141  	 * chunks (that might not be free).
2142  	 */
2143  	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2144  	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
2145  		xic->deleted = true;
2146  		xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
2147  		xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
2148  
2149  		/*
2150  		 * Remove the inode cluster from the AGI B+Tree, adjust the
2151  		 * AGI and Superblock inode counts, and mark the disk space
2152  		 * to be freed when the transaction is committed.
2153  		 */
2154  		ilen = rec.ir_freecount;
2155  		be32_add_cpu(&agi->agi_count, -ilen);
2156  		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
2157  		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
2158  		pag->pagi_freecount -= ilen - 1;
2159  		pag->pagi_count -= ilen;
2160  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
2161  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
2162  
2163  		if ((error = xfs_btree_delete(cur, &i))) {
2164  			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
2165  				__func__, error);
2166  			goto error0;
2167  		}
2168  
2169  		error = xfs_difree_inode_chunk(tp, pag, &rec);
2170  		if (error)
2171  			goto error0;
2172  	} else {
2173  		xic->deleted = false;
2174  
2175  		error = xfs_inobt_update(cur, &rec);
2176  		if (error) {
2177  			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
2178  				__func__, error);
2179  			goto error0;
2180  		}
2181  
2182  		/*
2183  		 * Change the inode free counts and log the ag/sb changes.
2184  		 */
2185  		be32_add_cpu(&agi->agi_freecount, 1);
2186  		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
2187  		pag->pagi_freecount++;
2188  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2189  	}
2190  
2191  	error = xfs_check_agi_freecount(cur);
2192  	if (error)
2193  		goto error0;
2194  
2195  	*orec = rec;
2196  	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2197  	return 0;
2198  
2199  error0:
2200  	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2201  	return error;
2202  }
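
/*
 * Accounting example: when the free above leaves a 64-inode chunk fully
 * free, ilen = 64.  agi_count drops by 64 but agi_freecount by only 63,
 * because the inode freed in this transaction was never counted as free to
 * begin with; the same asymmetry applies to the incore pagi counters and
 * the superblock deltas.
 */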
2203  
2204  /*
2205   * Free an inode in the free inode btree.
2206   */
2207  STATIC int
2208  xfs_difree_finobt(
2209  	struct xfs_perag		*pag,
2210  	struct xfs_trans		*tp,
2211  	struct xfs_buf			*agbp,
2212  	xfs_agino_t			agino,
2213  	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
2214  {
2215  	struct xfs_mount		*mp = pag_mount(pag);
2216  	struct xfs_btree_cur		*cur;
2217  	struct xfs_inobt_rec_incore	rec;
2218  	int				offset = agino - ibtrec->ir_startino;
2219  	int				error;
2220  	int				i;
2221  
2222  	cur = xfs_finobt_init_cursor(pag, tp, agbp);
2223  
2224  	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2225  	if (error)
2226  		goto error;
2227  	if (i == 0) {
2228  		/*
2229  		 * If the record does not exist in the finobt, we must have just
2230  		 * freed an inode in a previously fully allocated chunk. If not,
2231  		 * something is out of sync.
2232  		 */
2233  		if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
2234  			xfs_btree_mark_sick(cur);
2235  			error = -EFSCORRUPTED;
2236  			goto error;
2237  		}
2238  
2239  		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2240  					     ibtrec->ir_count,
2241  					     ibtrec->ir_freecount,
2242  					     ibtrec->ir_free, &i);
2243  		if (error)
2244  			goto error;
2245  		ASSERT(i == 1);
2246  
2247  		goto out;
2248  	}
2249  
2250  	/*
2251  	 * Read and update the existing record. We could just copy the ibtrec
2252  	 * across here, but that would defeat the purpose of having redundant
2253  	 * metadata. By making the modifications independently, we can catch
2254  	 * corruptions that we wouldn't see if we just copied from one record
2255  	 * to another.
2256  	 */
2257  	error = xfs_inobt_get_rec(cur, &rec, &i);
2258  	if (error)
2259  		goto error;
2260  	if (XFS_IS_CORRUPT(mp, i != 1)) {
2261  		xfs_btree_mark_sick(cur);
2262  		error = -EFSCORRUPTED;
2263  		goto error;
2264  	}
2265  
2266  	rec.ir_free |= XFS_INOBT_MASK(offset);
2267  	rec.ir_freecount++;
2268  
2269  	if (XFS_IS_CORRUPT(mp,
2270  			   rec.ir_free != ibtrec->ir_free ||
2271  			   rec.ir_freecount != ibtrec->ir_freecount)) {
2272  		xfs_btree_mark_sick(cur);
2273  		error = -EFSCORRUPTED;
2274  		goto error;
2275  	}
2276  
2277  	/*
2278  	 * The content of inobt records should always match between the inobt
2279  	 * and finobt. The lifecycle of records in the finobt is different from
2280  	 * the inobt in that the finobt only tracks records with at least one
2281  	 * free inode. Hence, if all of the inodes are free and we aren't
2282  	 * keeping inode chunks permanently on disk, remove the record.
2283  	 * Otherwise, update the record with the new information.
2284  	 *
2285  	 * Note that we currently can't free chunks when the block size is large
2286  	 * enough to hold multiple chunks. In that case, leave the finobt record
2287  	 * in place so that it remains in sync with the inobt.
2288  	 */
2289  	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2290  	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
2291  		error = xfs_btree_delete(cur, &i);
2292  		if (error)
2293  			goto error;
2294  		ASSERT(i == 1);
2295  	} else {
2296  		error = xfs_inobt_update(cur, &rec);
2297  		if (error)
2298  			goto error;
2299  	}
2300  
2301  out:
2302  	error = xfs_check_agi_freecount(cur);
2303  	if (error)
2304  		goto error;
2305  
2306  	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2307  	return 0;
2308  
2309  error:
2310  	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2311  	return error;
2312  }
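
/*
 * Lifecycle example: a fully allocated chunk has no finobt record at all.
 * The first free inserts a record with ir_freecount = 1 (the i == 0 case
 * above); later frees update it in lockstep with the inobt; once every
 * inode is free, the record is deleted along with the chunk unless ikeep
 * or a multi-chunk block size forces it to stay.
 */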
2313  
2314  /*
2315   * Free disk inode.  Carefully avoids touching the incore inode; all
2316   * incore manipulations are the caller's responsibility.
2317   * The on-disk inode is not changed by this operation; only the
2318   * btree (free inode mask) is changed.
2319   */
2320  int
2321  xfs_difree(
2322  	struct xfs_trans	*tp,
2323  	struct xfs_perag	*pag,
2324  	xfs_ino_t		inode,
2325  	struct xfs_icluster	*xic)
2326  {
2327  	/* REFERENCED */
2328  	xfs_agblock_t		agbno;	/* block number containing inode */
2329  	struct xfs_buf		*agbp;	/* buffer for allocation group header */
2330  	xfs_agino_t		agino;	/* allocation group inode number */
2331  	int			error;	/* error return value */
2332  	struct xfs_mount	*mp = tp->t_mountp;
2333  	struct xfs_inobt_rec_incore rec;/* btree record */
2334  
2335  	/*
2336  	 * Break up inode number into its components.
2337  	 */
2338  	if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) {
2339  		xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).",
2340  			__func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag));
2341  		ASSERT(0);
2342  		return -EINVAL;
2343  	}
2344  	agino = XFS_INO_TO_AGINO(mp, inode);
2345  	if (inode != xfs_agino_to_ino(pag, agino))  {
2346  		xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
2347  			__func__, (unsigned long long)inode,
2348  			(unsigned long long)xfs_agino_to_ino(pag, agino));
2349  		ASSERT(0);
2350  		return -EINVAL;
2351  	}
2352  	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2353  	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
2354  		xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
2355  			__func__, agbno, xfs_ag_block_count(mp, pag_agno(pag)));
2356  		ASSERT(0);
2357  		return -EINVAL;
2358  	}
2359  	/*
2360  	 * Get the allocation group header.
2361  	 */
2362  	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
2363  	if (error) {
2364  		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2365  			__func__, error);
2366  		return error;
2367  	}
2368  
2369  	/*
2370  	 * Fix up the inode allocation btree.
2371  	 */
2372  	error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
2373  	if (error)
2374  		goto error0;
2375  
2376  	/*
2377  	 * Fix up the free inode btree.
2378  	 */
2379  	if (xfs_has_finobt(mp)) {
2380  		error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
2381  		if (error)
2382  			goto error0;
2383  	}
2384  
2385  	return 0;
2386  
2387  error0:
2388  	return error;
2389  }
2390  
2391  STATIC int
2392  xfs_imap_lookup(
2393  	struct xfs_perag	*pag,
2394  	struct xfs_trans	*tp,
2395  	xfs_agino_t		agino,
2396  	xfs_agblock_t		agbno,
2397  	xfs_agblock_t		*chunk_agbno,
2398  	xfs_agblock_t		*offset_agbno,
2399  	int			flags)
2400  {
2401  	struct xfs_mount	*mp = pag_mount(pag);
2402  	struct xfs_inobt_rec_incore rec;
2403  	struct xfs_btree_cur	*cur;
2404  	struct xfs_buf		*agbp;
2405  	int			error;
2406  	int			i;
2407  
2408  	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
2409  	if (error) {
2410  		xfs_alert(mp,
2411  			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
2412  			__func__, error, pag_agno(pag));
2413  		return error;
2414  	}
2415  
2416  	/*
2417  	 * Lookup the inode record for the given agino. If the record cannot be
2418  	 * found, then it's an invalid inode number and we should abort. Once
2419  	 * we have a record, we need to ensure it contains the inode number
2420  	 * we are looking up.
2421  	 */
2422  	cur = xfs_inobt_init_cursor(pag, tp, agbp);
2423  	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
2424  	if (!error) {
2425  		if (i)
2426  			error = xfs_inobt_get_rec(cur, &rec, &i);
2427  		if (!error && i == 0)
2428  			error = -EINVAL;
2429  	}
2430  
2431  	xfs_trans_brelse(tp, agbp);
2432  	xfs_btree_del_cursor(cur, error);
2433  	if (error)
2434  		return error;
2435  
2436  	/* check that the returned record contains the required inode */
2437  	if (rec.ir_startino > agino ||
2438  	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
2439  		return -EINVAL;
2440  
2441  	/* for untrusted inodes check it is allocated first */
2442  	if ((flags & XFS_IGET_UNTRUSTED) &&
2443  	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2444  		return -EINVAL;
2445  
2446  	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2447  	*offset_agbno = agbno - *chunk_agbno;
2448  	return 0;
2449  }
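
/*
 * Example: with ialloc_inos = 64, an untrusted lookup of agino 130 that
 * lands on a record with ir_startino = 64 fails the containment check
 * (64 + 64 <= 130) and returns -EINVAL instead of mapping a stale buffer;
 * an inode that is free in the record likewise fails the
 * XFS_IGET_UNTRUSTED check.
 */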
2450  
2451  /*
2452   * Return the location of the inode in imap, for mapping it into a buffer.
2453   */
2454  int
2455  xfs_imap(
2456  	struct xfs_perag	*pag,
2457  	struct xfs_trans	*tp,
2458  	xfs_ino_t		ino,	/* inode to locate */
2459  	struct xfs_imap		*imap,	/* location map structure */
2460  	uint			flags)	/* flags for inode btree lookup */
2461  {
2462  	struct xfs_mount	*mp = pag_mount(pag);
2463  	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
2464  	xfs_agino_t		agino;	/* inode number within alloc group */
2465  	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
2466  	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
2467  	int			error;	/* error code */
2468  	int			offset;	/* index of inode in its buffer */
2469  	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */
2470  
2471  	ASSERT(ino != NULLFSINO);
2472  
2473  	/*
2474  	 * Split up the inode number into its parts.
2475  	 */
2476  	agino = XFS_INO_TO_AGINO(mp, ino);
2477  	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2478  	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
2479  	    ino != xfs_agino_to_ino(pag, agino)) {
2480  		error = -EINVAL;
2481  #ifdef DEBUG
2482  		/*
2483  		 * Don't output diagnostic information for untrusted inodes
2484  		 * as they can be invalid without implying corruption.
2485  		 */
2486  		if (flags & XFS_IGET_UNTRUSTED)
2487  			return error;
2488  		if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
2489  			xfs_alert(mp,
2490  		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2491  				__func__, (unsigned long long)agbno,
2492  				(unsigned long)xfs_ag_block_count(mp,
2493  							pag_agno(pag)));
2494  		}
2495  		if (ino != xfs_agino_to_ino(pag, agino)) {
2496  			xfs_alert(mp,
2497  		"%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
2498  				__func__, ino,
2499  				xfs_agino_to_ino(pag, agino));
2500  		}
2501  		xfs_stack_trace();
2502  #endif /* DEBUG */
2503  		return error;
2504  	}
2505  
2506  	/*
2507  	 * For bulkstat and handle lookups, we have an untrusted inode number
2508  	 * that we have to verify is valid. We cannot do this just by reading
2509  	 * the inode buffer as it may have been unlinked and removed leaving
2510  	 * inodes in stale state on disk. Hence we have to do a btree lookup
2511  	 * in all cases where an untrusted inode number is passed.
2512  	 */
2513  	if (flags & XFS_IGET_UNTRUSTED) {
2514  		error = xfs_imap_lookup(pag, tp, agino, agbno,
2515  					&chunk_agbno, &offset_agbno, flags);
2516  		if (error)
2517  			return error;
2518  		goto out_map;
2519  	}
2520  
2521  	/*
2522  	 * If the inode cluster size is the same as the blocksize or
2523  	 * smaller, we get to the buffer by simple arithmetic.
2524  	 */
2525  	if (M_IGEO(mp)->blocks_per_cluster == 1) {
2526  		offset = XFS_INO_TO_OFFSET(mp, ino);
2527  		ASSERT(offset < mp->m_sb.sb_inopblock);
2528  
2529  		imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
2530  		imap->im_len = XFS_FSB_TO_BB(mp, 1);
2531  		imap->im_boffset = (unsigned short)(offset <<
2532  							mp->m_sb.sb_inodelog);
2533  		return 0;
2534  	}
2535  
2536  	/*
2537  	 * If the inode chunks are aligned then use simple maths to
2538  	 * find the location. Otherwise we have to do a btree
2539  	 * lookup to find the location.
2540  	 */
2541  	if (M_IGEO(mp)->inoalign_mask) {
2542  		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
2543  		chunk_agbno = agbno - offset_agbno;
2544  	} else {
2545  		error = xfs_imap_lookup(pag, tp, agino, agbno,
2546  					&chunk_agbno, &offset_agbno, flags);
2547  		if (error)
2548  			return error;
2549  	}
2550  
2551  out_map:
2552  	ASSERT(agbno >= chunk_agbno);
2553  	cluster_agbno = chunk_agbno +
2554  		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
2555  		 M_IGEO(mp)->blocks_per_cluster);
2556  	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2557  		XFS_INO_TO_OFFSET(mp, ino);
2558  
2559  	imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
2560  	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
2561  	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
2562  
2563  	/*
2564  	 * If the inode number maps to a block outside the bounds
2565  	 * of the file system then return an error rather than calling
2566  	 * read_buf and panicking when we get an error from the
2567  	 * driver.
2568  	 */
2569  	if ((imap->im_blkno + imap->im_len) >
2570  	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2571  		xfs_alert(mp,
2572  	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2573  			__func__, (unsigned long long) imap->im_blkno,
2574  			(unsigned long long) imap->im_len,
2575  			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2576  		return -EINVAL;
2577  	}
2578  	return 0;
2579  }
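
/*
 * Worked example (hypothetical geometry): with blocks_per_cluster = 2 and
 * sb_inopblock = 16, an inode at agbno 21 in a chunk starting at
 * chunk_agbno = 16 has offset_agbno = 5, so cluster_agbno =
 * 16 + (5 / 2) * 2 = 20 and offset = (21 - 20) * 16 +
 * XFS_INO_TO_OFFSET(mp, ino), the inode's index within the two-block
 * cluster buffer; im_boffset is then offset << sb_inodelog bytes.
 */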
2580  
2581  /*
2582   * Log specified fields for the ag hdr (inode section). The growth of the agi
2583   * structure over time requires that we interpret the buffer as two logical
2584   * regions delineated by the end of the unlinked list. This is due to the size
2585   * of the hash table and its location in the middle of the agi.
2586   *
2587   * For example, a request to log a field before agi_unlinked and a field after
2588   * agi_unlinked could cause us to log the entire hash table and use an excessive
2589   * amount of log space. To avoid this behavior, log the region up through
2590   * agi_unlinked in one call and the region after agi_unlinked through the end of
2591   * the structure in another.
2592   */
2593  void
2594  xfs_ialloc_log_agi(
2595  	struct xfs_trans	*tp,
2596  	struct xfs_buf		*bp,
2597  	uint32_t		fields)
2598  {
2599  	int			first;		/* first byte number */
2600  	int			last;		/* last byte number */
2601  	static const short	offsets[] = {	/* field starting offsets */
2602  					/* keep in sync with bit definitions */
2603  		offsetof(xfs_agi_t, agi_magicnum),
2604  		offsetof(xfs_agi_t, agi_versionnum),
2605  		offsetof(xfs_agi_t, agi_seqno),
2606  		offsetof(xfs_agi_t, agi_length),
2607  		offsetof(xfs_agi_t, agi_count),
2608  		offsetof(xfs_agi_t, agi_root),
2609  		offsetof(xfs_agi_t, agi_level),
2610  		offsetof(xfs_agi_t, agi_freecount),
2611  		offsetof(xfs_agi_t, agi_newino),
2612  		offsetof(xfs_agi_t, agi_dirino),
2613  		offsetof(xfs_agi_t, agi_unlinked),
2614  		offsetof(xfs_agi_t, agi_free_root),
2615  		offsetof(xfs_agi_t, agi_free_level),
2616  		offsetof(xfs_agi_t, agi_iblocks),
2617  		sizeof(xfs_agi_t)
2618  	};
2619  #ifdef DEBUG
2620  	struct xfs_agi		*agi = bp->b_addr;
2621  
2622  	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2623  #endif
2624  
2625  	/*
2626  	 * Compute byte offsets for the first and last fields in the first
2627  	 * region and log the agi buffer. This only logs up through
2628  	 * agi_unlinked.
2629  	 */
2630  	if (fields & XFS_AGI_ALL_BITS_R1) {
2631  		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2632  				  &first, &last);
2633  		xfs_trans_log_buf(tp, bp, first, last);
2634  	}
2635  
2636  	/*
2637  	 * Mask off the bits in the first region and calculate the first and
2638  	 * last field offsets for any bits in the second region.
2639  	 */
2640  	fields &= ~XFS_AGI_ALL_BITS_R1;
2641  	if (fields) {
2642  		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2643  				  &first, &last);
2644  		xfs_trans_log_buf(tp, bp, first, last);
2645  	}
2646  }
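
/*
 * Example: logging XFS_AGI_COUNT | XFS_AGI_FREE_ROOT issues two
 * xfs_trans_log_buf() calls, one covering the agi_count bytes in region 1
 * and one covering the agi_free_root bytes in region 2, instead of a
 * single byte range that would drag the entire 64-bucket agi_unlinked
 * hash table between them into the log.
 */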
2647  
2648  static xfs_failaddr_t
2649  xfs_agi_verify(
2650  	struct xfs_buf		*bp)
2651  {
2652  	struct xfs_mount	*mp = bp->b_mount;
2653  	struct xfs_agi		*agi = bp->b_addr;
2654  	xfs_failaddr_t		fa;
2655  	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
2656  	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
2657  	int			i;
2658  
2659  	if (xfs_has_crc(mp)) {
2660  		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
2661  			return __this_address;
2662  		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
2663  			return __this_address;
2664  	}
2665  
2666  	/*
2667  	 * Validate the magic number of the agi block.
2668  	 */
2669  	if (!xfs_verify_magic(bp, agi->agi_magicnum))
2670  		return __this_address;
2671  	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2672  		return __this_address;
2673  
2674  	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
2675  	if (fa)
2676  		return fa;
2677  
2678  	if (be32_to_cpu(agi->agi_level) < 1 ||
2679  	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
2680  		return __this_address;
2681  
2682  	if (xfs_has_finobt(mp) &&
2683  	    (be32_to_cpu(agi->agi_free_level) < 1 ||
2684  	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
2685  		return __this_address;
2686  
2687  	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
2688  		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
2689  			continue;
2690  		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
2691  			return __this_address;
2692  	}
2693  
2694  	return NULL;
2695  }
2696  
2697  static void
2698  xfs_agi_read_verify(
2699  	struct xfs_buf	*bp)
2700  {
2701  	struct xfs_mount *mp = bp->b_mount;
2702  	xfs_failaddr_t	fa;
2703  
2704  	if (xfs_has_crc(mp) &&
2705  	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2706  		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2707  	else {
2708  		fa = xfs_agi_verify(bp);
2709  		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
2710  			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2711  	}
2712  }
2713  
2714  static void
2715  xfs_agi_write_verify(
2716  	struct xfs_buf	*bp)
2717  {
2718  	struct xfs_mount	*mp = bp->b_mount;
2719  	struct xfs_buf_log_item	*bip = bp->b_log_item;
2720  	struct xfs_agi		*agi = bp->b_addr;
2721  	xfs_failaddr_t		fa;
2722  
2723  	fa = xfs_agi_verify(bp);
2724  	if (fa) {
2725  		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2726  		return;
2727  	}
2728  
2729  	if (!xfs_has_crc(mp))
2730  		return;
2731  
2732  	if (bip)
2733  		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2734  	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2735  }
2736  
2737  const struct xfs_buf_ops xfs_agi_buf_ops = {
2738  	.name = "xfs_agi",
2739  	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
2740  	.verify_read = xfs_agi_read_verify,
2741  	.verify_write = xfs_agi_write_verify,
2742  	.verify_struct = xfs_agi_verify,
2743  };
2744  
2745  /*
2746   * Read in the allocation group header (inode allocation section)
2747   */
2748  int
2749  xfs_read_agi(
2750  	struct xfs_perag	*pag,
2751  	struct xfs_trans	*tp,
2752  	xfs_buf_flags_t		flags,
2753  	struct xfs_buf		**agibpp)
2754  {
2755  	struct xfs_mount	*mp = pag_mount(pag);
2756  	int			error;
2757  
2758  	trace_xfs_read_agi(pag);
2759  
2760  	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2761  			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
2762  			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
2763  	if (xfs_metadata_is_sick(error))
2764  		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
2765  	if (error)
2766  		return error;
2767  	if (tp)
2768  		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
2769  
2770  	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
2771  	return 0;
2772  }
2773  
2774  /*
2775   * Read in the agi and initialise the per-ag data. If the caller supplies a
2776   * @agibpp, return the locked AGI buffer to them, otherwise release it.
2777   */
2778  int
2779  xfs_ialloc_read_agi(
2780  	struct xfs_perag	*pag,
2781  	struct xfs_trans	*tp,
2782  	int			flags,
2783  	struct xfs_buf		**agibpp)
2784  {
2785  	struct xfs_buf		*agibp;
2786  	struct xfs_agi		*agi;
2787  	int			error;
2788  
2789  	trace_xfs_ialloc_read_agi(pag);
2790  
2791  	error = xfs_read_agi(pag, tp,
2792  			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2793  			&agibp);
2794  	if (error)
2795  		return error;
2796  
2797  	agi = agibp->b_addr;
2798  	if (!xfs_perag_initialised_agi(pag)) {
2799  		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2800  		pag->pagi_count = be32_to_cpu(agi->agi_count);
2801  		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
2802  	}
2803  
2804  	/*
2805  	 * It's possible for these to be out of sync if
2806  	 * we are in the middle of a forced shutdown.
2807  	 */
2808  	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2809  		xfs_is_shutdown(pag_mount(pag)));
2810  	if (agibpp)
2811  		*agibpp = agibp;
2812  	else
2813  		xfs_trans_brelse(tp, agibp);
2814  	return 0;
2815  }
2816  
2817  /* How many inodes are backed by inode clusters ondisk? */
2818  STATIC int
2819  xfs_ialloc_count_ondisk(
2820  	struct xfs_btree_cur		*cur,
2821  	xfs_agino_t			low,
2822  	xfs_agino_t			high,
2823  	unsigned int			*allocated)
2824  {
2825  	struct xfs_inobt_rec_incore	irec;
2826  	unsigned int			ret = 0;
2827  	int				has_record;
2828  	int				error;
2829  
2830  	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
2831  	if (error)
2832  		return error;
2833  
2834  	while (has_record) {
2835  		unsigned int		i, hole_idx;
2836  
2837  		error = xfs_inobt_get_rec(cur, &irec, &has_record);
2838  		if (error)
2839  			return error;
2840  		if (irec.ir_startino > high)
2841  			break;
2842  
2843  		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
2844  			if (irec.ir_startino + i < low)
2845  				continue;
2846  			if (irec.ir_startino + i > high)
2847  				break;
2848  
2849  			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
2850  			if (!(irec.ir_holemask & (1U << hole_idx)))
2851  				ret++;
2852  		}
2853  
2854  		error = xfs_btree_increment(cur, 0, &has_record);
2855  		if (error)
2856  			return error;
2857  	}
2858  
2859  	*allocated = ret;
2860  	return 0;
2861  }
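
/*
 * Example: for chunk-relative inode i = 37, hole_idx = 37 / 4 = 9; the
 * inode is counted as backed by disk blocks only if holemask bit 9 is
 * clear, i.e. the four-inode region covering inodes 36-39 of the chunk
 * was actually allocated.
 */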
2862  
2863  /* Is there an inode record covering a given extent? */
2864  int
2865  xfs_ialloc_has_inodes_at_extent(
2866  	struct xfs_btree_cur	*cur,
2867  	xfs_agblock_t		bno,
2868  	xfs_extlen_t		len,
2869  	enum xbtree_recpacking	*outcome)
2870  {
2871  	xfs_agino_t		agino;
2872  	xfs_agino_t		last_agino;
2873  	unsigned int		allocated;
2874  	int			error;
2875  
2876  	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
2877  	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
2878  
2879  	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
2880  	if (error)
2881  		return error;
2882  
2883  	if (allocated == 0)
2884  		*outcome = XBTREE_RECPACKING_EMPTY;
2885  	else if (allocated == last_agino - agino + 1)
2886  		*outcome = XBTREE_RECPACKING_FULL;
2887  	else
2888  		*outcome = XBTREE_RECPACKING_SPARSE;
2889  	return 0;
2890  }
2891  
2892  struct xfs_ialloc_count_inodes {
2893  	xfs_agino_t			count;
2894  	xfs_agino_t			freecount;
2895  };
2896  
2897  /* Record inode counts across all inobt records. */
2898  STATIC int
2899  xfs_ialloc_count_inodes_rec(
2900  	struct xfs_btree_cur		*cur,
2901  	const union xfs_btree_rec	*rec,
2902  	void				*priv)
2903  {
2904  	struct xfs_inobt_rec_incore	irec;
2905  	struct xfs_ialloc_count_inodes	*ci = priv;
2906  	xfs_failaddr_t			fa;
2907  
2908  	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
2909  	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
2910  	if (fa)
2911  		return xfs_inobt_complain_bad_rec(cur, fa, &irec);
2912  
2913  	ci->count += irec.ir_count;
2914  	ci->freecount += irec.ir_freecount;
2915  
2916  	return 0;
2917  }
2918  
2919  /* Count allocated and free inodes under an inobt. */
2920  int
2921  xfs_ialloc_count_inodes(
2922  	struct xfs_btree_cur		*cur,
2923  	xfs_agino_t			*count,
2924  	xfs_agino_t			*freecount)
2925  {
2926  	struct xfs_ialloc_count_inodes	ci = {0};
2927  	int				error;
2928  
2929  	ASSERT(xfs_btree_is_ino(cur->bc_ops));
2930  	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2931  	if (error)
2932  		return error;
2933  
2934  	*count = ci.count;
2935  	*freecount = ci.freecount;
2936  	return 0;
2937  }
2938  
2939  /*
2940   * Initialize inode-related geometry information.
2941   *
2942   * Compute the inode btree min and max levels and set maxicount.
2943   *
2944   * Set the inode cluster size.  This may still be overridden by the file
2945   * system block size if it is larger than the chosen cluster size.
2946   *
2947   * For v5 filesystems, scale the cluster size with the inode size to keep a
2948   * constant ratio of inode per cluster buffer, but only if mkfs has set the
2949   * inode alignment value appropriately for larger cluster sizes.
2950   *
2951   * Then compute the inode cluster alignment information.
2952   */
2953  void
2954  xfs_ialloc_setup_geometry(
2955  	struct xfs_mount	*mp)
2956  {
2957  	struct xfs_sb		*sbp = &mp->m_sb;
2958  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
2959  	uint64_t		icount;
2960  	uint			inodes;
2961  
2962  	igeo->new_diflags2 = 0;
2963  	if (xfs_has_bigtime(mp))
2964  		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
2965  	if (xfs_has_large_extent_counts(mp))
2966  		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
2967  
2968  	/* Compute inode btree geometry. */
2969  	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
2970  	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
2971  	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
2972  	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
2973  	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
2974  
2975  	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
2976  			sbp->sb_inopblock);
2977  	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
2978  
2979  	if (sbp->sb_spino_align)
2980  		igeo->ialloc_min_blks = sbp->sb_spino_align;
2981  	else
2982  		igeo->ialloc_min_blks = igeo->ialloc_blks;
2983  
2984  	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
2985  	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2986  	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2987  			inodes);
2988  	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
2989  
2990  	/*
2991  	 * Set the maximum inode count for this filesystem, being careful not
2992  	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
2993  	 * users should never get here due to failing sb verification, but
2994  	 * certain users (xfs_db) need to be usable even with corrupt metadata.
2995  	 */
2996  	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2997  		/*
2998  		 * Make sure the maximum inode count is a multiple
2999  		 * of the units we allocate inodes in.
3000  		 */
3001  		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
3002  		do_div(icount, 100);
3003  		do_div(icount, igeo->ialloc_blks);
3004  		igeo->maxicount = XFS_FSB_TO_INO(mp,
3005  				icount * igeo->ialloc_blks);
3006  	} else {
3007  		igeo->maxicount = 0;
3008  	}
3009  
3010  	/*
3011  	 * Compute the desired size of an inode cluster buffer, which
3012  	 * starts at 8K and (on v5 filesystems) scales up with larger inode
3013  	 * sizes.
3014  	 *
3015  	 * Preserve the desired inode cluster size because the sparse inodes
3016  	 * feature uses that desired size (not the actual size) to compute the
3017  	 * sparse inode alignment.  The mount code validates this value, so we
3018  	 * cannot change the behavior.
3019  	 */
3020  	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
3021  	if (xfs_has_v3inodes(mp)) {
3022  		int	new_size = igeo->inode_cluster_size_raw;
3023  
3024  		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
3025  		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
3026  			igeo->inode_cluster_size_raw = new_size;
3027  	}
3028  
3029  	/* Calculate inode cluster ratios. */
3030  	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
3031  		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
3032  				igeo->inode_cluster_size_raw);
3033  	else
3034  		igeo->blocks_per_cluster = 1;
3035  	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
3036  	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
3037  
3038  	/* Calculate inode cluster alignment. */
3039  	if (xfs_has_align(mp) &&
3040  	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
3041  		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
3042  	else
3043  		igeo->cluster_align = 1;
3044  	igeo->inoalign_mask = igeo->cluster_align - 1;
3045  	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
3046  
3047  	/*
3048  	 * If we are using stripe alignment, check whether
3049  	 * the stripe unit is a multiple of the inode alignment
3050  	 */
3051  	if (mp->m_dalign && igeo->inoalign_mask &&
3052  	    !(mp->m_dalign & igeo->inoalign_mask))
3053  		igeo->ialloc_align = mp->m_dalign;
3054  	else
3055  		igeo->ialloc_align = 0;
3056  
3057  	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
3058  		igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
3059  	else
3060  		igeo->min_folio_order = 0;
3061  }
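
/*
 * Worked example: on a v5 filesystem with 1k blocks and 512-byte inodes,
 * new_size = 8192 * (512 / 256) = 16384 bytes.  If sb_inoalignmt is at
 * least XFS_B_TO_FSBT(mp, 16384) = 16 blocks, the raw cluster size
 * becomes 16k, giving blocks_per_cluster = 16 and inodes_per_cluster = 32.
 */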
3062  
3063  /* Compute the location of the root directory inode that is laid out by mkfs. */
3064  xfs_ino_t
3065  xfs_ialloc_calc_rootino(
3066  	struct xfs_mount	*mp,
3067  	int			sunit)
3068  {
3069  	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
3070  	xfs_agblock_t		first_bno;
3071  
3072  	/*
3073  	 * Pre-calculate the geometry of AG 0.  We know what it looks like
3074  	 * because libxfs knows how to create allocation groups now.
3075  	 *
3076  	 * first_bno is the first block in which mkfs could possibly have
3077  	 * allocated the root directory inode, once we factor in the metadata
3078  	 * that mkfs formats before it.  Namely, the four AG headers...
3079  	 */
3080  	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
3081  
3082  	/* ...the two free space btree roots... */
3083  	first_bno += 2;
3084  
3085  	/* ...the inode btree root... */
3086  	first_bno += 1;
3087  
3088  	/* ...the initial AGFL... */
3089  	first_bno += xfs_alloc_min_freelist(mp, NULL);
3090  
3091  	/* ...the free inode btree root... */
3092  	if (xfs_has_finobt(mp))
3093  		first_bno++;
3094  
3095  	/* ...the reverse mapping btree root... */
3096  	if (xfs_has_rmapbt(mp))
3097  		first_bno++;
3098  
3099  	/* ...the reference count btree... */
3100  	if (xfs_has_reflink(mp))
3101  		first_bno++;
3102  
3103  	/*
3104  	 * ...and the log, if it is allocated in the first allocation group.
3105  	 *
3106  	 * This can happen with filesystems that only have a single
3107  	 * allocation group, or very odd geometries created by old mkfs
3108  	 * versions on very small filesystems.
3109  	 */
3110  	if (xfs_ag_contains_log(mp, 0))
3111  		 first_bno += mp->m_sb.sb_logblocks;
3112  
3113  	/*
3114  	 * Now round first_bno up to whatever allocation alignment is given
3115  	 * by the filesystem or was passed in.
3116  	 */
3117  	if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
3118  		first_bno = roundup(first_bno, sunit);
3119  	else if (xfs_has_align(mp) &&
3120  			mp->m_sb.sb_inoalignmt > 1)
3121  		first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
3122  
3123  	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
3124  }
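
/*
 * Worked example (hypothetical geometry): 512-byte sectors and 4k blocks
 * give howmany(2048, 4096) = 1 block for the four AG headers; adding the
 * two free space btree roots, the inobt root, a minimal AGFL of (say) 4
 * blocks, and one root each for the finobt, rmapbt and refcount btrees
 * yields first_bno = 11 before any log-in-AG0 adjustment or alignment
 * roundup.
 */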
3125  
3126  /*
3127   * Ensure there are no sparse inode clusters that cross the new EOAG.
3128   *
3129   * This is a no-op for non-spinode filesystems since clusters are always fully
3130   * allocated and checking the bnobt suffices.  However, a spinode filesystem
3131   * could have a record where the upper inodes are free blocks.  If those blocks
3132   * were removed from the filesystem, the inode record would extend beyond EOAG,
3133   * which will be flagged as corruption.
3134   */
3135  int
3136  xfs_ialloc_check_shrink(
3137  	struct xfs_perag	*pag,
3138  	struct xfs_trans	*tp,
3139  	struct xfs_buf		*agibp,
3140  	xfs_agblock_t		new_length)
3141  {
3142  	struct xfs_inobt_rec_incore rec;
3143  	struct xfs_btree_cur	*cur;
3144  	xfs_agino_t		agino;
3145  	int			has;
3146  	int			error;
3147  
3148  	if (!xfs_has_sparseinodes(pag_mount(pag)))
3149  		return 0;
3150  
3151  	cur = xfs_inobt_init_cursor(pag, tp, agibp);
3152  
3153  	/* Look up the inobt record that would correspond to the new EOFS. */
3154  	agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
3155  	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
3156  	if (error || !has)
3157  		goto out;
3158  
3159  	error = xfs_inobt_get_rec(cur, &rec, &has);
3160  	if (error)
3161  		goto out;
3162  
3163  	if (!has) {
3164  		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
3165  		error = -EFSCORRUPTED;
3166  		goto out;
3167  	}
3168  
3169  	/* If the record covers inodes that would be beyond EOFS, bail out. */
3170  	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
3171  		error = -ENOSPC;
3172  		goto out;
3173  	}
3174  out:
3175  	xfs_btree_del_cursor(cur, error);
3176  	return error;
3177  }
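
/*
 * Example: with sb_inopblock = 16, shrinking to new_length = 998 gives
 * agino = 15968.  The LE lookup finds the chunk record at ir_startino =
 * 15936, which spans inodes 15936-15999; since 15936 + 64 > 15968 that
 * chunk would cross the new EOAG and the shrink is refused with -ENOSPC.
 */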
3178