// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem write subrequest result collection, assessment
 * and retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include "internal.h"

/* Notes made in the collector */
#define HIT_PENDING		0x01	/* A front op was still pending */
#define NEED_REASSESS		0x02	/* Need to loop round and reassess */
#define MADE_PROGRESS		0x04	/* Made progress cleaning up a stream or the folio set */
#define NEED_UNLOCK		0x08	/* The pagecache needs unlocking */
#define NEED_RETRY		0x10	/* A front op requests retrying */
#define SAW_FAILURE		0x20	/* At least one stream hit a permanent failure */

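/*
 * Dump the state of a request and its subrequests for debugging.
 */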
static void netfs_dump_request(const struct netfs_io_request *rreq)
{
	pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
	       rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
	       rreq->origin, rreq->error);
	pr_err("  st=%llx tsl=%zx/%llx/%llx\n",
	       rreq->start, rreq->transferred, rreq->submitted, rreq->len);
	pr_err("  cci=%llx/%llx/%llx\n",
	       rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
	pr_err("  iw=%pSR\n", rreq->netfs_ops->issue_write);
	for (int i = 0; i < NR_IO_STREAMS; i++) {
		const struct netfs_io_subrequest *sreq;
		const struct netfs_io_stream *s = &rreq->io_streams[i];

		pr_err("  str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
		       s->stream_nr, s->source, s->error,
		       s->avail, s->active, s->need_retry, s->failed);
		pr_err("  str[%x] ct=%llx t=%zx\n",
		       s->stream_nr, s->collected_to, s->transferred);
		list_for_each_entry(sreq, &s->subrequests, rreq_link) {
			pr_err("  sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
			       sreq->stream_nr, sreq->debug_index, sreq->source,
			       sreq->start, sreq->transferred, sreq->len,
			       refcount_read(&sreq->ref), sreq->flags);
		}
	}
}

/*
 * Successful completion of write of a folio to the server and/or cache.  Note
 * that we are not allowed to lock the folio here on pain of deadlocking with
 * truncate.
 *
 * Returns the number of netfs_group references that the caller should drop.
 */
int netfs_folio_written_back(struct folio *folio)
{
	enum netfs_folio_trace why = netfs_folio_trace_clear;
	struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	int gcount = 0;

	if ((finfo = netfs_folio_info(folio))) {
		/* Streaming writes cannot be redirtied whilst under writeback,
		 * so discard the streaming record.
		 */
		unsigned long long fend;

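		/* The write extends the region known to hold data, so make
		 * sure zero_point doesn't fall inside it.
		 */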
		fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
		if (fend > ictx->zero_point)
			ictx->zero_point = fend;

		folio_detach_private(folio);
		group = finfo->netfs_group;
		gcount++;
		kfree(finfo);
		why = netfs_folio_trace_clear_s;
		goto end_wb;
	}

	if ((group = netfs_folio_group(folio))) {
		if (group == NETFS_FOLIO_COPY_TO_CACHE) {
			why = netfs_folio_trace_clear_cc;
			folio_detach_private(folio);
			goto end_wb;
		}

		/* Need to detach the group pointer if the page didn't get
		 * redirtied.  If it has been redirtied, then it must be within
		 * the same group.
		 */
		why = netfs_folio_trace_redirtied;
		if (!folio_test_dirty(folio)) {
			folio_detach_private(folio);
			gcount++;
			why = netfs_folio_trace_clear_g;
		}
	}

end_wb:
	trace_netfs_folio(folio, why);
	folio_end_writeback(folio);
	return gcount;
}

/*
 * Unlock any folios we've finished with.
 */
static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
					  unsigned int *notes)
{
	struct folio_queue *folioq = wreq->buffer.tail;
	unsigned long long collected_to = wreq->collected_to;
	unsigned int slot = wreq->buffer.first_tail_slot;

	if (WARN_ON_ONCE(!folioq)) {
		pr_err("[!] Writeback unlock found empty rolling buffer!\n");
		netfs_dump_request(wreq);
		return;
	}

	if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
		if (netfs_pgpriv2_unlock_copied_folios(wreq))
			*notes |= MADE_PROGRESS;
		return;
	}

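	/* If the slot index has run off the end of this folioq, the folioq is
	 * spent; delete it and move on to the next one.
	 */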
	if (slot >= folioq_nr_slots(folioq)) {
		folioq = rolling_buffer_delete_spent(&wreq->buffer);
		if (!folioq)
			return;
		slot = 0;
	}

	for (;;) {
		struct folio *folio;
		struct netfs_folio *finfo;
		unsigned long long fpos, fend;
		size_t fsize, flen;

		folio = folioq_folio(folioq, slot);
		if (WARN_ONCE(!folio_test_writeback(folio),
			      "R=%08x: folio %lx is not under writeback\n",
			      wreq->debug_id, folio->index))
			trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);

		fpos = folio_pos(folio);
		fsize = folio_size(folio);
		finfo = netfs_folio_info(folio);
		flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;

		fend = min_t(unsigned long long, fpos + flen, wreq->i_size);

		trace_netfs_collect_folio(wreq, folio, fend, collected_to);

		/* Unlock any folio we've transferred all of. */
		if (collected_to < fend)
			break;

		wreq->nr_group_rel += netfs_folio_written_back(folio);
		wreq->cleaned_to = fpos + fsize;
		*notes |= MADE_PROGRESS;

		/* Clean up the head folioq.  If we clear an entire folioq, then
		 * we can get rid of it provided it's not also the tail folioq
		 * being filled by the issuer.
		 */
		folioq_clear(folioq, slot);
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			folioq = rolling_buffer_delete_spent(&wreq->buffer);
			if (!folioq)
				goto done;
			slot = 0;
		}

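		/* Stop once we've reached or passed the collection point. */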
		if (fpos + fsize >= collected_to)
			break;
	}

	wreq->buffer.tail = folioq;
done:
	wreq->buffer.first_tail_slot = slot;
}

/*
 * Collect and assess the results of various write subrequests.  We may need to
 * retry some of the subrequests, or even do an RMW cycle for content crypto.
 *
 * Note that we have a number of parallel, overlapping streams of subrequests
 * (one to the server and one to the local cache, for example) whose
 * subrequests need not match in size, starting position or boundary alignment.
 */
static void netfs_collect_write_results(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *front, *remove;
	struct netfs_io_stream *stream;
	unsigned long long collected_to, issued_to;
	unsigned int notes;
	int s;

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
	trace_netfs_collect(wreq);
	trace_netfs_rreq(wreq, netfs_rreq_trace_collect);

reassess_streams:
	issued_to = atomic64_read(&wreq->issued_to);
	smp_rmb();
	collected_to = ULLONG_MAX;
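	/* Only pagecache-backed writes have folios that need unlocking. */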
	if (wreq->origin == NETFS_WRITEBACK ||
	    wreq->origin == NETFS_WRITETHROUGH ||
	    wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
		notes = NEED_UNLOCK;
	else
		notes = 0;

	/* Remove completed subrequests from the front of the streams and
	 * advance the completion point on each stream.  We stop when we hit
	 * something that's in progress.  The issuer thread may be adding stuff
	 * to the tail whilst we're doing this.
	 */
	for (s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		/* Read active flag before list pointers */
		if (!smp_load_acquire(&stream->active))
			continue;

		front = stream->front;
		while (front) {
			trace_netfs_collect_sreq(wreq, front);
			//_debug("sreq [%x] %llx %zx/%zx",
			//       front->debug_index, front->start, front->transferred, front->len);

			if (stream->collected_to < front->start) {
				trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
				stream->collected_to = front->start;
			}

			/* Stall if the front is still undergoing I/O. */
			if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
				notes |= HIT_PENDING;
				break;
			}
			smp_rmb(); /* Read counters after I-P flag. */

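			/* If the stream has already failed, just absorb the
			 * subrequest's range and cancel it.
			 */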
			if (stream->failed) {
				stream->collected_to = front->start + front->len;
				notes |= MADE_PROGRESS | SAW_FAILURE;
				goto cancel;
			}
			if (front->start + front->transferred > stream->collected_to) {
				stream->collected_to = front->start + front->transferred;
				stream->transferred = stream->collected_to - wreq->start;
				notes |= MADE_PROGRESS;
			}
			if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
				stream->failed = true;
				stream->error = front->error;
				if (stream->source == NETFS_UPLOAD_TO_SERVER)
					mapping_set_error(wreq->mapping, front->error);
				notes |= NEED_REASSESS | SAW_FAILURE;
				break;
			}
			if (front->transferred < front->len) {
				stream->need_retry = true;
				notes |= NEED_RETRY | MADE_PROGRESS;
				break;
			}

		cancel:
			/* Remove if completely consumed. */
			spin_lock(&wreq->lock);

			remove = front;
			list_del_init(&front->rreq_link);
			front = list_first_entry_or_null(&stream->subrequests,
							 struct netfs_io_subrequest, rreq_link);
			stream->front = front;
			spin_unlock(&wreq->lock);
			netfs_put_subrequest(remove, false,
					     notes & SAW_FAILURE ?
					     netfs_sreq_trace_put_cancel :
					     netfs_sreq_trace_put_done);
		}

		/* If we have an empty stream, we need to jump it forward
		 * otherwise the collection point will never advance.
		 */
		if (!front && issued_to > stream->collected_to) {
			trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
			stream->collected_to = issued_to;
		}

		if (stream->collected_to < collected_to)
			collected_to = stream->collected_to;
	}

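	/* The request's collection point is the lowest point reached across
	 * all of the active streams.
	 */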
	if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
		wreq->collected_to = collected_to;

	for (s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		if (stream->active)
			trace_netfs_collect_stream(wreq, stream);
	}

	trace_netfs_collect_state(wreq, wreq->collected_to, notes);

	/* Unlock any folios that we have now finished with. */
	if (notes & NEED_UNLOCK) {
		if (wreq->cleaned_to < wreq->collected_to)
			netfs_writeback_unlock_folios(wreq, &notes);
	} else {
		wreq->cleaned_to = wreq->collected_to;
	}

	// TODO: Discard encryption buffers

	if (notes & NEED_RETRY)
		goto need_retry;
	if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
		trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
		clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
		smp_mb__after_atomic(); /* Clear PAUSE before task state */
		wake_up(&wreq->waitq);
	}

	if (notes & NEED_REASSESS) {
		//cond_resched();
		goto reassess_streams;
	}
	if (notes & MADE_PROGRESS) {
		//cond_resched();
		goto reassess_streams;
	}

out:
	netfs_put_group_many(wreq->group, wreq->nr_group_rel);
	wreq->nr_group_rel = 0;
	_leave(" = %x", notes);
	return;

need_retry:
	/* Okay...  We're going to have to retry one or both streams.  Note
	 * that any partially completed op will have had any wholly transferred
	 * folios removed from it.
	 */
	_debug("retry");
	netfs_retry_writes(wreq);
	goto out;
}

/*
 * Perform the collection of subrequests, folios and encryption buffers.
 */
void netfs_write_collection_worker(struct work_struct *work)
{
	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	size_t transferred;
	int s;

	_enter("R=%x", wreq->debug_id);

	netfs_see_request(wreq, netfs_rreq_trace_see_work);
	if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
		netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
		return;
	}

	netfs_collect_write_results(wreq);

	/* We're done when the app thread has finished posting subreqs and all
	 * the queues in all the streams are empty.
	 */
	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
		netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
		return;
	}
	smp_rmb(); /* Read ALL_QUEUED before lists. */

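	/* The request's transferred count is the minimum of what each active
	 * stream managed to transfer.
	 */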
	transferred = LONG_MAX;
	for (s = 0; s < NR_IO_STREAMS; s++) {
		struct netfs_io_stream *stream = &wreq->io_streams[s];
		if (!stream->active)
			continue;
		if (!list_empty(&stream->subrequests)) {
			netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
			return;
		}
		if (stream->transferred < transferred)
			transferred = stream->transferred;
	}

	/* Okay, declare that all I/O is complete. */
	wreq->transferred = transferred;
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

	if (wreq->io_streams[1].active &&
	    wreq->io_streams[1].failed &&
	    ictx->ops->invalidate_cache) {
		/* Cache write failure doesn't prevent writeback completion
		 * unless we're in disconnected mode.
		 */
		ictx->ops->invalidate_cache(wreq);
	}

	if (wreq->cleanup)
		wreq->cleanup(wreq);

	if (wreq->origin == NETFS_DIO_WRITE &&
	    wreq->mapping->nrpages) {
		/* mmap may have got underfoot and we may now have folios
		 * locally covering the region we just wrote.  Attempt to
		 * discard the folios, but leave in place any that have been
		 * modified locally.  ->write_iter() is prevented from
		 * interfering by the DIO counter.
		 */
		pgoff_t first = wreq->start >> PAGE_SHIFT;
		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
		invalidate_inode_pages2_range(wreq->mapping, first, last);
	}

	if (wreq->origin == NETFS_DIO_WRITE)
		inode_dio_end(wreq->inode);

	_debug("finished");
	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
	clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);

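	/* If there's a user iocb attached (e.g. for a DIO write), advance the
	 * file position and complete it now.
	 */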
	if (wreq->iocb) {
		size_t written = min(wreq->transferred, wreq->len);
		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete)
			wreq->iocb->ki_complete(
				wreq->iocb, wreq->error ? wreq->error : written);
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq, false);
	netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Wake the collection work item.
 */
void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
{
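	/* Take a ref for the work item; drop it again if the work was already
	 * queued.
	 */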
	if (!work_pending(&wreq->work)) {
		netfs_get_request(wreq, netfs_rreq_trace_get_work);
		if (!queue_work(system_unbound_wq, &wreq->work))
			netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
	}
}

/**
 * netfs_write_subrequest_terminated - Note the termination of a write operation.
 * @_op: The I/O subrequest that has terminated.
 * @transferred_or_error: The amount of data transferred or an error code.
 * @was_async: The termination was asynchronous
 *
 * This tells the library that a contributory write I/O operation has
 * terminated, one way or another, and that it should collect the results.
 *
 * The caller indicates in @transferred_or_error the outcome of the operation,
 * supplying a positive value to indicate the number of bytes transferred or a
 * negative error code.  The library will look after reissuing I/O operations
 * as appropriate.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 *
 * When this is called, ownership of the subrequest is transferred back to the
 * library, along with a ref.
 *
 * Note that %_op is a void* so that the function can be passed to
 * kiocb::term_func without the need for a casting wrapper.
 */
void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
				       bool was_async)
{
	struct netfs_io_subrequest *subreq = _op;
	struct netfs_io_request *wreq = subreq->rreq;
	struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];

	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);

	switch (subreq->source) {
	case NETFS_UPLOAD_TO_SERVER:
		netfs_stat(&netfs_n_wh_upload_done);
		break;
	case NETFS_WRITE_TO_CACHE:
		netfs_stat(&netfs_n_wh_write_done);
		break;
	case NETFS_INVALID_WRITE:
		break;
	default:
		BUG();
	}

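	/* A negative value indicates an error.  -EAGAIN asks for a retry;
	 * anything else marks the subrequest as having failed outright.
	 */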
	if (IS_ERR_VALUE(transferred_or_error)) {
		subreq->error = transferred_or_error;
		if (subreq->error == -EAGAIN)
			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		else
			set_bit(NETFS_SREQ_FAILED, &subreq->flags);
		trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);

		switch (subreq->source) {
		case NETFS_WRITE_TO_CACHE:
			netfs_stat(&netfs_n_wh_write_failed);
			break;
		case NETFS_UPLOAD_TO_SERVER:
			netfs_stat(&netfs_n_wh_upload_failed);
			break;
		default:
			break;
		}
		trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
		set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
	} else {
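		/* Clamp an over-reported transfer count to the amount that was
		 * actually outstanding for this subrequest.
		 */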
		if (WARN(transferred_or_error > subreq->len - subreq->transferred,
			 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
			 wreq->debug_id, subreq->debug_index,
			 transferred_or_error, subreq->len, subreq->transferred))
			transferred_or_error = subreq->len - subreq->transferred;

		subreq->error = 0;
		subreq->transferred += transferred_or_error;

		if (subreq->transferred < subreq->len)
			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
	}

	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

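	/* Mark the subrequest as no longer in progress and wake up anyone
	 * waiting on it.
	 */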
	clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);

	/* If we are at the head of the queue, wake up the collector,
	 * transferring a ref to it if we were the ones to do so.
	 */
	if (list_is_first(&subreq->rreq_link, &stream->subrequests))
		netfs_wake_write_collector(wreq, was_async);

	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_write_subrequest_terminated);