Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async
hcalls at the drc level
by Greg Kurz
Hi Shiva,
On Mon, 30 Nov 2020 09:16:39 -0600
Shivaprasad G Bhat <sbhat@linux.ibm.com> wrote:
> The patch adds support for async hcalls at the DRC level for the
> spapr devices. To be used by spapr-scm devices in the patch/es to follow.
>
> Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
> ---
The overall idea looks good but I think you should consider using
a thread pool to implement it. See below.
> hw/ppc/spapr_drc.c | 149 ++++++++++++++++++++++++++++++++++++++++++++
> include/hw/ppc/spapr_drc.h | 25 +++++++
> 2 files changed, 174 insertions(+)
>
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 77718cde1f..4ecd04f686 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -15,6 +15,7 @@
> #include "qapi/qmp/qnull.h"
> #include "cpu.h"
> #include "qemu/cutils.h"
> +#include "qemu/guest-random.h"
> #include "hw/ppc/spapr_drc.h"
> #include "qom/object.h"
> #include "migration/vmstate.h"
> @@ -421,6 +422,148 @@ void spapr_drc_detach(SpaprDrc *drc)
> spapr_drc_release(drc);
> }
>
> +
> +/*
> + * @drc : device DRC targeted by the async hcalls to be made.
> + *
> + * All subsequent requests to run/query the status should use the
> + * unique token returned here.
> + */
> +uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc)
> +{
> + Error *err = NULL;
> + uint64_t token;
> + SpaprDrcDeviceAsyncHCallState *tmp, *next, *state;
> +
> + state = g_malloc0(sizeof(*state));
> + state->pending = true;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> +retry:
> + if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {
> + error_report_err(err);
> + g_free(state);
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> + return 0;
> + }
> +
> + if (!token) /* Token should be non-zero */
> + goto retry;
> +
> + if (!QLIST_EMPTY(&drc->async_hcall_states)) {
> + QLIST_FOREACH_SAFE(tmp, &drc->async_hcall_states, node, next) {
> + if (tmp->continue_token == token) {
> + /* If the token is already in use, get a new one */
> + goto retry;
> + }
> + }
> + }
> +
> + state->continue_token = token;
> + QLIST_INSERT_HEAD(&drc->async_hcall_states, state, node);
> +
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +
> + return state->continue_token;
> +}
> +
> +static void *spapr_drc_async_hcall_runner(void *opaque)
> +{
> + int response = -1;
> + SpaprDrcDeviceAsyncHCallState *state = opaque;
> +
> + /*
> + * state is freed only after this thread finishes (after pthread_join()),
> + * don't worry about it becoming NULL.
> + */
> +
> + response = state->func(state->data);
> +
> + state->hcall_ret = response;
> + state->pending = 0;
> +
> + return NULL;
> +}
> +
> +/*
> + * @drc : device DRC targeted by the async hcalls to be made.
> + * @token : The continue token to be used for tracking, as received from
> + * spapr_drc_get_new_async_hcall_token
> + * @func() : the worker function which needs to be executed asynchronously
> + * @data : data to be passed to the asynchronous function. Worker is supposed
> + * to free/cleanup the data that is passed here
It'd be cleaner to pass a completion callback and have free/cleanup handled there.
> + */
> +void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
> + SpaprDrcAsyncHcallWorkerFunc *func, void *data)
> +{
> + SpaprDrcDeviceAsyncHCallState *state;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH(state, &drc->async_hcall_states, node) {
> + if (state->continue_token == token) {
> + state->func = func;
> + state->data = data;
> + qemu_thread_create(&state->thread, "sPAPR Async HCALL",
> + spapr_drc_async_hcall_runner, state,
> + QEMU_THREAD_JOINABLE);
qemu_thread_create() exits on failure, so it shouldn't be called on a
guest-triggerable path: a buggy guest could keep calling it until
pthread_create() returns EAGAIN.
Please use a thread pool (see thread_pool_submit_aio()). This takes care
of all the thread housekeeping for you in a safe way, and it provides a
completion callback API. The implementation could then be just about
having two lists: one for pending requests (fed here) and one for
completed requests (fed by the completion callback).
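For instance, something along these lines (completely untested sketch;
the completed list and the drc back-pointer in the state are hypothetical
additions, not existing code):

    static void spapr_drc_async_hcall_cb(void *opaque, int ret)
    {
        SpaprDrcDeviceAsyncHCallState *state = opaque;
        SpaprDrc *drc = state->drc; /* hypothetical back-pointer */

        qemu_mutex_lock(&drc->async_hcall_states_lock);
        state->hcall_ret = ret;
        state->pending = false;
        /* move the request from the pending list to the completed one */
        QLIST_REMOVE(state, node);
        QLIST_INSERT_HEAD(&drc->async_hcall_states_complete, state, node);
        qemu_mutex_unlock(&drc->async_hcall_states_lock);
    }

and in spapr_drc_run_async_hcall(), instead of qemu_thread_create():

        ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());

        thread_pool_submit_aio(pool, func, data,
                               spapr_drc_async_hcall_cb, state);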
> + break;
> + }
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +}
> +
> +/*
> + * spapr_drc_finish_async_hcalls
> + * Waits for all pending async requests to complete
> + * their execution and frees the states
> + */
> +static void spapr_drc_finish_async_hcalls(SpaprDrc *drc)
> +{
> + SpaprDrcDeviceAsyncHCallState *state, *next;
> +
> + if (QLIST_EMPTY(&drc->async_hcall_states)) {
> + return;
> + }
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, next) {
> + qemu_thread_join(&state->thread);
With a thread-pool, you'd just need to aio_poll() until the pending list
is empty and then clear the completed list.
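i.e. something like this sketch (same hypothetical completed list as
above; note the lock is dropped around aio_poll() so the completion
callback can take it):

    qemu_mutex_lock(&drc->async_hcall_states_lock);
    while (!QLIST_EMPTY(&drc->async_hcall_states)) {
        qemu_mutex_unlock(&drc->async_hcall_states_lock);
        aio_poll(qemu_get_aio_context(), true);
        qemu_mutex_lock(&drc->async_hcall_states_lock);
    }
    /* now free everything left on the completed list */
    qemu_mutex_unlock(&drc->async_hcall_states_lock);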
> + QLIST_REMOVE(state, node);
> + g_free(state);
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +}
> +
> +/*
> + * spapr_drc_get_async_hcall_status
> + * Fetches the status of the hcall worker and returns H_BUSY
> + * if the worker is still running.
> + */
> +int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token)
> +{
> + int ret = H_PARAMETER;
> + SpaprDrcDeviceAsyncHCallState *state, *node;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, node) {
> + if (state->continue_token == token) {
> + if (state->pending) {
> + ret = H_BUSY;
> + break;
> + } else {
> + ret = state->hcall_ret;
> + qemu_thread_join(&state->thread);
Like for qemu_thread_create(), the guest shouldn't be responsible for
thread housekeeping. Getting the hcall status should just be about
finding the token in the pending or completed lists.
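i.e. roughly (again assuming the hypothetical two-list scheme; untested):

    QLIST_FOREACH(state, &drc->async_hcall_states, node) {
        if (state->continue_token == token) {
            ret = H_BUSY;
            break;
        }
    }
    if (ret != H_BUSY) {
        QLIST_FOREACH_SAFE(state, &drc->async_hcall_states_complete,
                           node, next) {
            if (state->continue_token == token) {
                ret = state->hcall_ret;
                QLIST_REMOVE(state, node);
                g_free(state);
                break;
            }
        }
    }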
> + QLIST_REMOVE(state, node);
> + g_free(state);
> + break;
> + }
> + }
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +
> + return ret;
> +}
> +
> void spapr_drc_reset(SpaprDrc *drc)
> {
> SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> @@ -448,6 +591,7 @@ void spapr_drc_reset(SpaprDrc *drc)
> drc->ccs_offset = -1;
> drc->ccs_depth = -1;
> }
> + spapr_drc_finish_async_hcalls(drc);
> }
>
> static bool spapr_drc_unplug_requested_needed(void *opaque)
> @@ -558,6 +702,7 @@ SpaprDrc *spapr_dr_connector_new(Object *owner, const char *type,
> drc->owner = owner;
> prop_name = g_strdup_printf("dr-connector[%"PRIu32"]",
> spapr_drc_index(drc));
> +
Unrelated change.
> object_property_add_child(owner, prop_name, OBJECT(drc));
> object_unref(OBJECT(drc));
> qdev_realize(DEVICE(drc), NULL, NULL);
> @@ -577,6 +722,10 @@ static void spapr_dr_connector_instance_init(Object *obj)
> object_property_add(obj, "fdt", "struct", prop_get_fdt,
> NULL, NULL, NULL);
> drc->state = drck->empty_state;
> +
> + qemu_mutex_init(&drc->async_hcall_states_lock);
> + QLIST_INIT(&drc->async_hcall_states);
> +
Empty line not needed.
> }
>
> static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> index 165b281496..77f6e4386c 100644
> --- a/include/hw/ppc/spapr_drc.h
> +++ b/include/hw/ppc/spapr_drc.h
> @@ -18,6 +18,7 @@
> #include "sysemu/runstate.h"
> #include "hw/qdev-core.h"
> #include "qapi/error.h"
> +#include "block/thread-pool.h"
>
> #define TYPE_SPAPR_DR_CONNECTOR "spapr-dr-connector"
> #define SPAPR_DR_CONNECTOR_GET_CLASS(obj) \
> @@ -168,6 +169,21 @@ typedef enum {
> SPAPR_DRC_STATE_PHYSICAL_CONFIGURED = 8,
> } SpaprDrcState;
>
> +typedef struct SpaprDrc SpaprDrc;
> +
> +typedef int SpaprDrcAsyncHcallWorkerFunc(void *opaque);
> +typedef struct SpaprDrcDeviceAsyncHCallState {
> + uint64_t continue_token;
> + bool pending;
> +
> + int hcall_ret;
> + SpaprDrcAsyncHcallWorkerFunc *func;
> + void *data;
> +
> + QemuThread thread;
> +
> + QLIST_ENTRY(SpaprDrcDeviceAsyncHCallState) node;
> +} SpaprDrcDeviceAsyncHCallState;
> typedef struct SpaprDrc {
> /*< private >*/
> DeviceState parent;
> @@ -182,6 +198,10 @@ typedef struct SpaprDrc {
> int ccs_offset;
> int ccs_depth;
>
> + /* async hcall states */
> + QemuMutex async_hcall_states_lock;
> + QLIST_HEAD(, SpaprDrcDeviceAsyncHCallState) async_hcall_states;
> +
> /* device pointer, via link property */
> DeviceState *dev;
> bool unplug_requested;
> @@ -241,6 +261,11 @@ void spapr_drc_detach(SpaprDrc *drc);
> /* Returns true if a hot plug/unplug request is pending */
> bool spapr_drc_transient(SpaprDrc *drc);
>
> +uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc);
> +void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
> + SpaprDrcAsyncHcallWorkerFunc, void *data);
> +int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token);
> +
> static inline bool spapr_drc_unplug_requested(SpaprDrc *drc)
> {
> return drc->unplug_requested;
>
>
>
[PATCH v3 00/11] fsdax: introduce fs query to support reflink
by Shiyang Ruan
This patchset aims to support shared page tracking for fsdax.
Changes from V2:
- Split the 8th patch into related smaller patches to make review easier
- Other small fixes
Changes from V1:
- Add the old memory-failure handler back for rolling back
- Add callback in MD's ->rmap() to support multiple mapping of dm device
- Add judgement for CONFIG_SYSFS
- Add pfn_valid() judgement in hwpoison_filter()
- Rebased to v5.11-rc5
This patchset moves owner tracking from dax_associate_entry() to the pmem
device driver, by introducing an interface ->memory_failure() in struct
dev_pagemap_ops. This interface is called by memory_failure() in mm, and
implemented by the pmem device. The pmem device then calls its
->corrupted_range() to find the filesystem in which the corrupted data is
located, and calls the filesystem handler to track the files or metadata
associated with this page. Finally, we are able to try to fix the
corrupted data in the filesystem and do other necessary processing, such
as killing the processes that are using the affected files.
The call trace is like this:
memory_failure()
 pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
  gendisk->fops->corrupted_range() => - pmem_corrupted_range()
                                      - md_blk_corrupted_range()
   sb->s_ops->corrupted_range()    => xfs_fs_corrupted_range()
    xfs_rmap_query_range()
     xfs_corrupt_helper()
      * corrupted on metadata:
          try to recover data, call xfs_force_shutdown()
      * corrupted on file data:
          try to recover data, call mf_dax_mapping_kill_procs()
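To illustrate the first step, the mm-side dispatch looks roughly like
this (sketch only; the exact prototype of the new hook is defined by the
first patch):

    /* In memory_failure(), for a pfn backed by ZONE_DEVICE: let the
     * driver handle it if it implements the new hook (the argument
     * list here is an assumption). */
    if (pgmap->ops && pgmap->ops->memory_failure) {
        rc = pgmap->ops->memory_failure(pgmap, pfn, flags);
        if (rc != -EOPNOTSUPP)
            return rc;
    }
    /* otherwise fall through to the generic dax handling */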
The fsdax & reflink support for XFS is not contained in this patchset.
(Rebased on v5.11-rc5)
==
Shiyang Ruan (11):
pagemap: Introduce ->memory_failure()
blk: Introduce ->corrupted_range() for block device
fs: Introduce ->corrupted_range() for superblock
block_dev: Introduce bd_corrupted_range()
mm, fsdax: Refactor memory-failure handler for dax mapping
mm, pmem: Implement ->memory_failure() in pmem driver
pmem: Implement ->corrupted_range() for pmem driver
dm: Introduce ->rmap() to find bdev offset
md: Implement ->corrupted_range()
xfs: Implement ->corrupted_range() for XFS
fs/dax: Remove useless functions
block/genhd.c | 6 ++
drivers/md/dm-linear.c | 20 ++++
drivers/md/dm.c | 61 +++++++++++
drivers/nvdimm/pmem.c | 45 ++++++++
fs/block_dev.c | 47 ++++++++-
fs/dax.c | 63 ++++-------
fs/xfs/xfs_fsops.c | 5 +
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 112 ++++++++++++++++++++
include/linux/blkdev.h | 2 +
include/linux/dax.h | 1 +
include/linux/device-mapper.h | 5 +
include/linux/fs.h | 2 +
include/linux/genhd.h | 3 +
include/linux/memremap.h | 8 ++
include/linux/mm.h | 9 ++
mm/memory-failure.c | 190 +++++++++++++++++++++++-----------
17 files changed, 475 insertions(+), 105 deletions(-)
--
2.30.0
[PATCH v2 00/10] fsdax,xfs: Add reflink&dedupe support for fsdax
by Shiyang Ruan
This patchset is an attempt to add CoW support for fsdax, taking XFS,
which has both reflink and fsdax features, as an example.
Changes from V1:
- Factor some helper functions to simplify dax fault code
- Introduce iomap_apply2() for dax_dedupe_file_range_compare()
- Fix mistakes and other problems
- Rebased on v5.11
One of the key mechanisms that needs to be implemented in fsdax is CoW:
copy the data from the srcmap before we actually write data to the
destination iomap, copying only the range in which data won't be changed.
Another mechanism is range comparison. In the page cache case, readpage()
is used to load on-disk data into the page cache so that data can be
compared. In the fsdax case, readpage() does not work, so we need another
way to compare data, with direct access support.
With the two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.
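To illustrate the CoW copy step (the series introduces
dax_iomap_cow_copy() for this; the variable names below are illustrative
only, with saddr/daddr being the direct-access addresses of the source
and destination extents):

    /* Sketch only: before a write to [pos, pos + length) lands in the
     * new (destination) extent, copy the untouched head and tail from
     * the source, so the destination ends up fully populated. */
    if (pos > extent_start)                        /* head */
        memcpy(daddr, saddr, pos - extent_start);
    if (pos + length < extent_end)                 /* tail */
        memcpy(daddr + (pos + length - extent_start),
               saddr + (pos + length - extent_start),
               extent_end - (pos + length));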
Some of the patches are picked up from Goldwyn's patchset. I made some
changes to adapt to this patchset.
(Rebased on v5.11)
==
Shiyang Ruan (10):
fsdax: Factor helpers to simplify dax fault code
fsdax: Factor helper: dax_fault_actor()
fsdax: Output address in dax_iomap_pfn() and rename it
fsdax: Introduce dax_iomap_cow_copy()
fsdax: Replace mmap entry in case of CoW
fsdax: Add dax_iomap_cow_copy() for dax_iomap_zero
iomap: Introduce iomap_apply2() for operations on two files
fsdax: Dedup file range to use a compare function
fs/xfs: Handle CoW for fsdax write() path
fs/xfs: Add dedupe support for fsdax
fs/dax.c | 532 +++++++++++++++++++++++++++--------------
fs/iomap/apply.c | 51 ++++
fs/iomap/buffered-io.c | 2 +-
fs/remap_range.c | 45 +++-
fs/xfs/xfs_bmap_util.c | 3 +-
fs/xfs/xfs_file.c | 29 ++-
fs/xfs/xfs_inode.c | 8 +-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_iomap.c | 30 ++-
fs/xfs/xfs_iomap.h | 1 +
fs/xfs/xfs_iops.c | 11 +-
fs/xfs/xfs_reflink.c | 16 +-
include/linux/dax.h | 7 +-
include/linux/fs.h | 15 +-
include/linux/iomap.h | 7 +-
15 files changed, 550 insertions(+), 208 deletions(-)
--
2.30.1
[PATCH v2 0/4] Remove nrexceptional tracking
by Matthew Wilcox (Oracle)
We actually use nrexceptional for very little these days. It's a minor
pain to keep in sync with nrpages, but the pain becomes much bigger
with the THP patches because we don't know how many indices a shadow
entry occupies. It's easier to just remove it than keep it accurate.
Also, we save 8 bytes per inode which is nothing to sneeze at; on my
laptop, it would improve shmem_inode_cache from 22 to 23 objects per
16kB, and inode_cache from 26 to 27 objects. Combined, that saves
a megabyte of memory from a combined usage of 25MB for both caches.
Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save
any memory for ext4.
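The helper introduced by the first patch boils down to something like
this (sketch):

    /* true iff the mapping holds neither pages nor shadow/DAX entries */
    static inline bool mapping_empty(struct address_space *mapping)
    {
        return xa_empty(&mapping->i_pages);
    }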
Matthew Wilcox (Oracle) (4):
mm: Introduce and use mapping_empty
mm: Stop accounting shadow entries
dax: Account DAX entries as nrpages
mm: Remove nrexceptional from inode
fs/block_dev.c | 2 +-
fs/dax.c | 8 ++++----
fs/gfs2/glock.c | 3 +--
fs/inode.c | 2 +-
include/linux/fs.h | 2 --
include/linux/pagemap.h | 5 +++++
mm/filemap.c | 16 ----------------
mm/swap_state.c | 4 ----
mm/truncate.c | 19 +++----------------
mm/workingset.c | 1 -
10 files changed, 15 insertions(+), 47 deletions(-)
--
2.28.0
[RFC 0/2] virtio-pmem: Asynchronous flush
by Pankaj Gupta
Jeff reported a preflush ordering issue with the existing implementation
of virtio pmem preflush. Dan suggested[1] implementing an asynchronous
flush for virtio pmem using a work queue, as done in md/RAID. This patch
series intends to solve the preflush ordering issue and also makes the
flush asynchronous from the submitting thread's POV.
Submitting this patch series for feedback; it is still WIP. I have done
basic testing and am currently doing more testing.
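The rough shape of the work-queue deferral looks like this (sketch only;
the struct and function names here are illustrative, not this series'
code):

    struct pmem_flush_req {            /* hypothetical */
        struct work_struct work;
        struct nd_region *nd_region;
        struct bio *bio;
    };

    static void pmem_flush_worker(struct work_struct *w)
    {
        struct pmem_flush_req *req =
            container_of(w, struct pmem_flush_req, work);

        /* the actual flush now runs outside the submitting thread */
        if (nvdimm_flush(req->nd_region, req->bio) < 0)
            req->bio->bi_status = BLK_STS_IOERR;
        bio_endio(req->bio);
        kfree(req);
    }

    /* submission path: INIT_WORK(&req->work, pmem_flush_worker);
     * queue_work(system_wq, &req->work); and return without waiting */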
Pankaj Gupta (2):
pmem: make nvdimm_flush asynchronous
virtio_pmem: Async virtio-pmem flush
drivers/nvdimm/nd_virtio.c | 66 ++++++++++++++++++++++++++----------
drivers/nvdimm/pmem.c | 15 ++++----
drivers/nvdimm/region_devs.c | 3 +-
drivers/nvdimm/virtio_pmem.c | 9 +++++
drivers/nvdimm/virtio_pmem.h | 12 +++++++
5 files changed, 78 insertions(+), 27 deletions(-)
[1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2
--
2.20.1
[PATCH RFC 0/9] mm, sparse-vmemmap: Introduce compound pagemaps
by Joao Martins
Hey,
This small series attempts to minimize 'struct page' overhead by
pursuing a similar approach to Muchun Song's series "Free some vmemmap
pages of hugetlb page"[0], but applied to devmap/ZONE_DEVICE.
[0] https://lore.kernel.org/linux-mm/20201130151838.11208-1-songmuchun@byteda...
The link above describes it quite nicely, but the idea is to reuse tail
page vmemmap areas, in particular the area which only describes tail pages.
So a vmemmap page describes 64 struct pages, and the first page for a given
ZONE_DEVICE vmemmap would contain the head page and 63 tail pages. The second
vmemmap page would contain only tail pages, and that's what gets reused across
the rest of the subsection/section. The bigger the page size, the bigger the
savings (2M hpage -> save 6 vmemmap pages; 1G hpage -> save 4094 vmemmap pages).
In terms of savings, per 1Tb of memory, the struct page cost would go down
with compound pagemap:
* with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory)
* with 1G pages we lose 8MB instead of 16G (0.0007% instead of 1.5% of total memory)
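(To sanity-check the arithmetic, assuming 64-byte struct pages and 4K
vmemmap pages: 1Tb needs 256M struct pages, i.e. 16G of vmemmap. A 2M
page spans 512 struct pages = 8 vmemmap pages, of which 2 are kept, hence
16G * 2/8 = 4G; a 1G page spans 262144 struct pages = 4096 vmemmap pages,
of which 2 are kept, hence 16G * 2/4096 = 8M.)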
Along the way I've extended it past 'struct page' overhead, *trying* to
address a few performance issues we knew about for pmem, specifically in
the {pin,get}_user_pages* function family with device-dax vmas, which are
really slow even in the fast variants. THP is great in the -fast variants,
but all except hugetlbfs perform rather poorly on non-fast gup.
So to summarize what the series does:
Patches 1-5: Much like Muchun's series, we reuse tail page areas across a
given page size (namely @align, as it is referred to by the remaining
memremap/dax code) and enable memremap to initialize the ZONE_DEVICE pages
as compound pages of a given @align order. The main difference, though, is
that contrary to the hugetlbfs series, there's no pre-existing vmemmap for
the area, because we are onlining it. IOW, there is no freeing of pages of
already-initialized vmemmap as in the hugetlbfs case, which simplifies the
logic (besides not being arch-specific). After these, there's a quite
visible improvement in pmem memmap region bootstrap, given that we
initialize fewer struct pages depending on the page size.
NVDIMM namespace bootstrap improves from ~750ms to ~190ms/<=1ms on emulated
NVDIMMs with 2M and 1G respectively. A proportionally similar net
improvement is observed when running on actual NVDIMMs.
Patches 6 - 8: Optimize grabbing/releasing a page refcount given that we
are working with compound pages, i.e. we do one increment/decrement on the
head page for a given set of N subpages, as opposed to N individual writes.
{get,pin}_user_pages_fast() for zone_device with compound pagemap
consequently improves considerably, and unpin_user_pages() improves as
well when passed a set of consecutive pages:
before after
(get_user_pages_fast 1G;2M page size) ~75k us -> ~3.2k ; ~5.2k us
(pin_user_pages_fast 1G;2M page size) ~125k us -> ~3.4k ; ~5.5k us
The RDMA patch (patch 8/9) demonstrates the improvement for an existing
user. For unpin_user_pages() we have an additional test to demonstrate the
improvement. The test performs MR reg/unreg continuously and measures its
rate for a given period. So essentially ib_mem_get and ib_mem_release are
being stress tested, which at the end of the day means:
pin_user_pages_longterm() and unpin_user_pages() for a scatterlist:
Before:
159 rounds in 5.027 sec: 31617.923 usec / round (device-dax)
466 rounds in 5.009 sec: 10748.456 usec / round (hugetlbfs)
After:
305 rounds in 5.010 sec: 16426.047 usec / round (device-dax)
1073 rounds in 5.004 sec: 4663.622 usec / round (hugetlbfs)
Patch 9: Improves {pin,get}_user_pages() and its longterm counterpart. It
is very experimental, and I imported most of follow_hugetlb_page(), except
that we do the same trick as gup-fast. In doing the patch I feel this
batching should live in follow_page_mask(), with that being changed to
return a set of pages/something-else when walking over PMD/PUDs for
THP/devmap pages. This patch then brings the previous test of mr reg/unreg
(above) to parity between device-dax and hugetlbfs.
Some of the patches are a little fresh/WIP (especially patches 3 and 9)
and we are still running tests. Hence the RFC, asking for comments and
general direction of the work before continuing.
Patches apply on top of linux-next tag next-20201208 (commit a9e26cb5f261).
Comments and suggestions very much appreciated!
Thanks,
Joao
Joao Martins (9):
memremap: add ZONE_DEVICE support for compound pages
sparse-vmemmap: Consolidate arguments in vmemmap section populate
sparse-vmemmap: Reuse vmemmap areas for a given page size
mm/page_alloc: Reuse tail struct pages for compound pagemaps
device-dax: Compound pagemap support
mm/gup: Grab head page refcount once for group of subpages
mm/gup: Decrement head page once for group of subpages
RDMA/umem: batch page unpin in __ib_mem_release()
mm: Add follow_devmap_page() for devdax vmas
drivers/dax/device.c | 54 ++++++---
drivers/infiniband/core/umem.c | 25 +++-
include/linux/huge_mm.h | 4 +
include/linux/memory_hotplug.h | 16 ++-
include/linux/memremap.h | 2 +
include/linux/mm.h | 6 +-
mm/gup.c | 130 ++++++++++++++++-----
mm/huge_memory.c | 202 +++++++++++++++++++++++++++++++++
mm/memory_hotplug.c | 13 ++-
mm/memremap.c | 13 ++-
mm/page_alloc.c | 28 ++++-
mm/sparse-vmemmap.c | 97 +++++++++++++---
mm/sparse.c | 16 +--
13 files changed, 531 insertions(+), 75 deletions(-)
--
2.17.1
Re: Question about the "EXPERIMENTAL" tag for dax in XFS
by Darrick J. Wong
On Fri, Feb 26, 2021 at 09:45:45AM +0000, ruansy.fnst@fujitsu.com wrote:
> Hi, guys
>
> Beside this patchset, I'd like to confirm something about the
> "EXPERIMENTAL" tag for dax in XFS.
>
> In XFS, the "EXPERIMENTAL" tag, which is reported in a warning message
> when we mount a pmem device with the dax option, has existed for a
> while. It's a bit annoying when using the fsdax feature. So, my initial
> intention was to remove this tag. And I started to find out and solve
> the problems which prevent it from being removed.
>
> As discussed before, there are 3 main problems. The first one is "dax
> semantics", which has been resolved. The remaining two are "RMAP for
> fsdax" and "support dax reflink for filesystem", which I have been
> working on.
<nod>
> So, what I want to confirm is: does it mean that we can remove the
> "EXPERIMENTAL" tag when the remaining two problems are solved?
Yes. I'd keep the experimental tag for a cycle or two to make sure that
nothing new pops up, but otherwise the two patchsets you've sent close
those two big remaining gaps. Thank you for working on this!
> Or maybe there are other important problems need to be fixed before
> removing it? If there are, could you please show me that?
That remains to be seen through QA/validation, but I think that's it.
Granted, I still have to read through the two patchsets...
--D
>
> Thank you.
>
>
> --
> Ruan Shiyang.
[ndctl PATCH] Expose ndctl_bus_nfit_translate_spa as a public function.
by Tsaur, Erwin
The motivation is to allow access to the ACPI-defined NVDIMM Root Device
_DSM Function Index 5 (Translate SPA). The rest of the _DSM functions,
which are mostly ARS related, are already public.
Basically move ndctl_bus_nfit_translate_spa declaration from private.h to libndctl.h.
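With that, a caller can do something like the following (illustrative
only, assuming a bus handle and an spa of interest are already at hand):

    unsigned int handle;
    unsigned long long dpa;

    /* translate a system physical address into a DIMM handle + DPA */
    if (ndctl_bus_nfit_translate_spa(bus, spa, &handle, &dpa) == 0)
        printf("spa %#llx -> handle %#x dpa %#llx\n", spa, handle, dpa);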
---
ndctl/lib/libndctl.sym | 4 ++++
ndctl/lib/nfit.c | 2 +-
ndctl/lib/private.h | 2 --
ndctl/libndctl.h | 2 ++
4 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/ndctl/lib/libndctl.sym b/ndctl/lib/libndctl.sym
index 0a82616..58afb74 100644
--- a/ndctl/lib/libndctl.sym
+++ b/ndctl/lib/libndctl.sym
@@ -451,3 +451,7 @@ LIBNDCTL_25 {
ndctl_bus_clear_fw_activate_nosuspend;
ndctl_bus_activate_firmware;
} LIBNDCTL_24;
+
+LIBNDCTL_26 {
+ ndctl_bus_nfit_translate_spa;
+} LIBNDCTL_25;
diff --git a/ndctl/lib/nfit.c b/ndctl/lib/nfit.c
index 6f68fcf..d85682f 100644
--- a/ndctl/lib/nfit.c
+++ b/ndctl/lib/nfit.c
@@ -114,7 +114,7 @@ static int is_valid_spa(struct ndctl_bus *bus, unsigned long long spa)
*
* If success, returns zero, store dimm's @handle, and @dpa.
*/
-int ndctl_bus_nfit_translate_spa(struct ndctl_bus *bus,
+NDCTL_EXPORT int ndctl_bus_nfit_translate_spa(struct ndctl_bus *bus,
unsigned long long address, unsigned int *handle, unsigned long long *dpa)
{
diff --git a/ndctl/lib/private.h b/ndctl/lib/private.h
index ede1300..8f4510e 100644
--- a/ndctl/lib/private.h
+++ b/ndctl/lib/private.h
@@ -370,8 +370,6 @@ static inline int check_kmod(struct kmod_ctx *kmod_ctx)
return kmod_ctx ? 0 : -ENXIO;
}
-int ndctl_bus_nfit_translate_spa(struct ndctl_bus *bus, unsigned long long addr,
- unsigned int *handle, unsigned long long *dpa);
struct ndctl_cmd *ndctl_bus_cmd_new_err_inj(struct ndctl_bus *bus);
struct ndctl_cmd *ndctl_bus_cmd_new_err_inj_clr(struct ndctl_bus *bus);
struct ndctl_cmd *ndctl_bus_cmd_new_err_inj_stat(struct ndctl_bus *bus,
diff --git a/ndctl/libndctl.h b/ndctl/libndctl.h
index 60e1288..ee517a7 100644
--- a/ndctl/libndctl.h
+++ b/ndctl/libndctl.h
@@ -237,6 +237,8 @@ unsigned long long ndctl_cmd_clear_error_get_cleared(
struct ndctl_cmd *clear_err);
unsigned int ndctl_cmd_ars_cap_get_clear_unit(struct ndctl_cmd *ars_cap);
int ndctl_cmd_ars_stat_get_flag_overflow(struct ndctl_cmd *ars_stat);
+int ndctl_bus_nfit_translate_spa(struct ndctl_bus *bus, unsigned long long addr,
+ unsigned int *handle, unsigned long long *dpa);
/*
* Note: ndctl_cmd_smart_get_temperature is an alias for
--
2.30.0
[ndctl PATCH] ndctl/test: add checking the presence of jq command ahead
by QI Fuli
Due to the lack of the jq command, the result of the test will be 'fail'.
This patch adds a check for the presence of the jq command ahead of time.
If there is no jq command on the system, the test will be marked as 'skip'.
Signed-off-by: QI Fuli <qi.fuli@fujitsu.com>
Link: https://github.com/pmem/ndctl/issues/141
---
test/daxdev-errors.sh | 1 +
test/inject-error.sh | 2 ++
test/inject-smart.sh | 1 +
test/label-compat.sh | 1 +
test/max_available_extent_ns.sh | 1 +
test/monitor.sh | 2 ++
test/multi-dax.sh | 1 +
test/sector-mode.sh | 2 ++
8 files changed, 11 insertions(+)
diff --git a/test/daxdev-errors.sh b/test/daxdev-errors.sh
index 6281f32..9547d78 100755
--- a/test/daxdev-errors.sh
+++ b/test/daxdev-errors.sh
@@ -9,6 +9,7 @@ rc=77
. $(dirname $0)/common
check_min_kver "4.12" || do_skip "lacks dax dev error handling"
+check_prereq "jq"
trap 'err $LINENO' ERR
diff --git a/test/inject-error.sh b/test/inject-error.sh
index c636033..7d0b826 100755
--- a/test/inject-error.sh
+++ b/test/inject-error.sh
@@ -11,6 +11,8 @@ err_count=8
. $(dirname $0)/common
+check_prereq "jq"
+
trap 'err $LINENO' ERR
# sample json:
diff --git a/test/inject-smart.sh b/test/inject-smart.sh
index 94705df..4ca83b8 100755
--- a/test/inject-smart.sh
+++ b/test/inject-smart.sh
@@ -166,6 +166,7 @@ do_tests()
}
check_min_kver "4.19" || do_skip "kernel $KVER may not support smart (un)injection"
+check_prereq "jq"
modprobe nfit_test
rc=1
diff --git a/test/label-compat.sh b/test/label-compat.sh
index 340b93d..8ab2858 100755
--- a/test/label-compat.sh
+++ b/test/label-compat.sh
@@ -10,6 +10,7 @@ BASE=$(dirname $0)
. $BASE/common
check_min_kver "4.11" || do_skip "may not provide reliable isetcookie values"
+check_prereq "jq"
trap 'err $LINENO' ERR
diff --git a/test/max_available_extent_ns.sh b/test/max_available_extent_ns.sh
index 14d741d..343f3c9 100755
--- a/test/max_available_extent_ns.sh
+++ b/test/max_available_extent_ns.sh
@@ -9,6 +9,7 @@ rc=77
trap 'err $LINENO' ERR
check_min_kver "4.19" || do_skip "kernel $KVER may not support max_available_size"
+check_prereq "jq"
init()
{
diff --git a/test/monitor.sh b/test/monitor.sh
index cdab5e1..28c5541 100755
--- a/test/monitor.sh
+++ b/test/monitor.sh
@@ -13,6 +13,8 @@ smart_supported_bus=""
. $(dirname $0)/common
+check_prereq "jq"
+
trap 'err $LINENO' ERR
check_min_kver "4.15" || do_skip "kernel $KVER may not support monitor service"
diff --git a/test/multi-dax.sh b/test/multi-dax.sh
index e932569..8496619 100755
--- a/test/multi-dax.sh
+++ b/test/multi-dax.sh
@@ -9,6 +9,7 @@ rc=77
. $(dirname $0)/common
check_min_kver "4.13" || do_skip "may lack multi-dax support"
+check_prereq "jq"
trap 'err $LINENO' ERR
diff --git a/test/sector-mode.sh b/test/sector-mode.sh
index dd7013e..54fa806 100755
--- a/test/sector-mode.sh
+++ b/test/sector-mode.sh
@@ -6,6 +6,8 @@ rc=77
. $(dirname $0)/common
+check_prereq "jq"
+
set -e
trap 'err $LINENO' ERR
--
2.29.2
[PATCH 0/7] fsdax,xfs: Add reflink&dedupe support for fsdax
by Shiyang Ruan
This patchset is an attempt to add CoW support for fsdax, taking XFS,
which has both reflink and fsdax features, as an example.
One of the key mechanisms that needs to be implemented in fsdax is CoW:
copy the data from the srcmap before we actually write data to the
destination iomap, copying only the range in which data won't be changed.
Another mechanism is range comparison. In the page cache case, readpage()
is used to load on-disk data into the page cache so that data can be
compared. In the fsdax case, readpage() does not work, so we need another
way to compare data, with direct access support.
With the two mechanisms implemented in fsdax, we are able to make reflink
and fsdax work together in XFS.
Some of the patches are picked up from Goldwyn's patchset. I made some
changes to adapt to this patchset.
(Rebased on v5.10)
==
Shiyang Ruan (7):
fsdax: Output address in dax_iomap_pfn() and rename it
fsdax: Introduce dax_copy_edges() for CoW
fsdax: Copy data before write
fsdax: Replace mmap entry in case of CoW
fsdax: Dedup file range to use a compare function
fs/xfs: Handle CoW for fsdax write() path
fs/xfs: Add dedupe support for fsdax
fs/btrfs/reflink.c | 3 +-
fs/dax.c | 188 ++++++++++++++++++++++++++++++++++++++---
fs/ocfs2/file.c | 2 +-
fs/remap_range.c | 14 +--
fs/xfs/xfs_bmap_util.c | 6 +-
fs/xfs/xfs_file.c | 30 ++++++-
fs/xfs/xfs_inode.c | 8 +-
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_iomap.c | 3 +-
fs/xfs/xfs_iops.c | 11 ++-
fs/xfs/xfs_reflink.c | 23 ++++-
include/linux/dax.h | 5 ++
include/linux/fs.h | 9 +-
13 files changed, 270 insertions(+), 33 deletions(-)
--
2.30.0