From ffc79664d15841025d90afdd902c4112ffe168d6 Mon Sep 17 00:00:00 2001
From: Milosz Tanski
Date: Wed, 25 Sep 2013 11:18:14 -0400
Subject: ceph: hung on ceph fscache invalidate in some cases

In some cases on my ceph client cluster I'm seeing hung kernel tasks in
the invalidate page code path. This is due to the fact that we don't
check if the page is marked as cached before calling
fscache_wait_on_page_write().

This is the log from the hang:

INFO: task XXXXXX:12034 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
...
Call Trace:
 [] schedule+0x29/0x70
 [] __fscache_wait_on_page_write+0x6d/0xb0 [fscache]
 [] ? add_wait_queue+0x60/0x60
 [] ceph_invalidate_fscache_page+0x29/0x50 [ceph]
 [] ceph_invalidatepage+0x70/0x190 [ceph]
 [] ? delete_from_page_cache+0x5f/0x70
 [] truncate_inode_page+0x8b/0x90
 [] truncate_inode_pages_range.part.12+0x13d/0x620
 [] truncate_inode_pages_range+0x4d/0x60
 [] truncate_inode_pages+0x15/0x20
 [] evict+0x1a6/0x1b0
 [] iput+0x103/0x190
...

Signed-off-by: Milosz Tanski
Reviewed-by: Sage Weil
---
 fs/ceph/cache.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e..360b622 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -324,6 +324,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 {
     struct ceph_inode_info *ci = ceph_inode(inode);
 
+    if (!PageFsCache(page))
+        return;
+
     fscache_wait_on_page_write(ci->fscache, page);
     fscache_uncache_page(ci->fscache, page);
 }
-- 
cgit v1.1


From 53e879a485f9def0e55c404dbc7187470a01602d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Wed, 18 Sep 2013 09:29:07 +0800
Subject: ceph: remove outdated frag information

If directory fragments change, fill_inode() inserts new frags into the
fragtree, but it does not remove outdated frags from the fragtree. This
patch fixes it.
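The fix below merges the MDS-supplied split list into the existing fragtree
while dropping any rb-tree node the new list no longer mentions. A rough
sketch of the idea (illustrative only: merge_frags() and new_ids are made-up
names, and the real code orders entries with ceph_frag_compare() rather than
a plain integer compare):

/* Illustrative sketch: prune tree entries that are absent from a sorted
 * list of still-valid ids, walking both in ascending order. */
static void merge_frags(struct rb_root *tree, const u32 *new_ids, int n)
{
    struct rb_node *node = rb_first(tree);
    int i = 0;

    while (node) {
        struct ceph_inode_frag *frag =
            rb_entry(node, struct ceph_inode_frag, node);
        node = rb_next(node);

        /* skip over new ids that sort before this entry */
        while (i < n && new_ids[i] < frag->frag)
            i++;

        if (i < n && new_ids[i] == frag->frag)
            continue;                   /* still reported, keep it */

        rb_erase(&frag->node, tree);    /* outdated, drop it */
        kfree(frag);
    }
}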
Signed-off-by: Yan, Zheng
Reviewed-by: Sage Weil
---
 fs/ceph/inode.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8549a48..1c9fea0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ static int fill_inode(struct inode *inode,
     int issued = 0, implemented;
     struct timespec mtime, atime, ctime;
     u32 nsplits;
+    struct ceph_inode_frag *frag;
+    struct rb_node *rb_node;
     struct ceph_buffer *xattr_blob = NULL;
     int err = 0;
     int queue_trunc = 0;
@@ -751,15 +753,38 @@ no_change:
     /* FIXME: move me up, if/when version reflects fragtree changes */
     nsplits = le32_to_cpu(info->fragtree.nsplits);
     mutex_lock(&ci->i_fragtree_mutex);
+    rb_node = rb_first(&ci->i_fragtree);
     for (i = 0; i < nsplits; i++) {
         u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
-        struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
-        if (IS_ERR(frag))
-            continue;
+        frag = NULL;
+        while (rb_node) {
+            frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+            if (ceph_frag_compare(frag->frag, id) >= 0) {
+                if (frag->frag != id)
+                    frag = NULL;
+                else
+                    rb_node = rb_next(rb_node);
+                break;
+            }
+            rb_node = rb_next(rb_node);
+            rb_erase(&frag->node, &ci->i_fragtree);
+            kfree(frag);
+            frag = NULL;
+        }
+        if (!frag) {
+            frag = __get_or_create_frag(ci, id);
+            if (IS_ERR(frag))
+                continue;
+        }
         frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
         dout(" frag %x split by %d\n", frag->frag, frag->split_by);
     }
+    while (rb_node) {
+        frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+        rb_node = rb_next(rb_node);
+        rb_erase(&frag->node, &ci->i_fragtree);
+        kfree(frag);
+    }
     mutex_unlock(&ci->i_fragtree_mutex);
 
     /* were we issued a capability? */
-- 
cgit v1.1


From 81c6aea5275eae453719d7f3924da07e668265c5 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Wed, 18 Sep 2013 09:44:13 +0800
Subject: ceph: handle frag mismatch between readdir request and reply

If the client has outdated directory fragment information, it may request
readdir of a non-existent directory fragment. In this case, the MDS finds
an approximate directory fragment and sends its contents back to the
client. When receiving a reply with a fragment that is different from the
requested one, the client needs to reset the 'readdir offset'.
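For context, the readdir position that ends up in di->offset packs the
fragment and the in-fragment offset into a single 64-bit value, which is why
a fragment change forces the offset back to its starting value (2 for the
leftmost fragment, which leaves room for the "." and ".." entries, otherwise
0). A simplified sketch of ceph_make_fpos(); see fs/ceph/dir.c for the real
definition:

/* simplified: fragment id in the high 32 bits, offset in the low 32 bits */
static inline loff_t make_fpos(unsigned frag, unsigned off)
{
    return ((loff_t)frag << 32) | (loff_t)off;
}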
Signed-off-by: Yan, Zheng
Reviewed-by: Sage Weil
---
 fs/ceph/dir.c        | 11 ++++++++++-
 fs/ceph/inode.c      | 16 ++++++++++++++--
 fs/ceph/mds_client.c |  3 +--
 3 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 868b61d..2a0bcae 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -352,8 +352,18 @@ more:
     }
 
     /* note next offset and last dentry name */
+    rinfo = &req->r_reply_info;
+    if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+        frag = le32_to_cpu(rinfo->dir_dir->frag);
+        if (ceph_frag_is_leftmost(frag))
+            fi->next_offset = 2;
+        else
+            fi->next_offset = 0;
+        off = fi->next_offset;
+    }
     fi->offset = fi->next_offset;
     fi->last_readdir = req;
+    fi->frag = frag;
 
     if (req->r_reply_info.dir_end) {
         kfree(fi->last_name);
@@ -363,7 +373,6 @@ more:
         else
             fi->next_offset = 0;
     } else {
-        rinfo = &req->r_reply_info;
         err = note_last_dentry(fi,
                    rinfo->dir_dname[rinfo->dir_nr-1],
                    rinfo->dir_dname_len[rinfo->dir_nr-1]);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1c9fea0..9a8e396 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1275,8 +1275,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
     int err = 0, i;
     struct inode *snapdir = NULL;
     struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-    u64 frag = le32_to_cpu(rhead->args.readdir.frag);
     struct ceph_dentry_info *di;
+    u64 r_readdir_offset = req->r_readdir_offset;
+    u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+    if (rinfo->dir_dir &&
+        le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+        dout("readdir_prepopulate got new frag %x -> %x\n",
+             frag, le32_to_cpu(rinfo->dir_dir->frag));
+        frag = le32_to_cpu(rinfo->dir_dir->frag);
+        if (ceph_frag_is_leftmost(frag))
+            r_readdir_offset = 2;
+        else
+            r_readdir_offset = 0;
+    }
 
     if (req->r_aborted)
         return readdir_prepopulate_inodes_only(req, session);
@@ -1340,7 +1352,7 @@ retry_lookup:
         }
 
         di = dn->d_fsdata;
-        di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+        di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
 
         /* inode */
         if (dn->d_inode) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b7bda5d..f51ab26 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2238,8 +2238,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
     err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
     if (err == 0) {
         if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
-                            req->r_op == CEPH_MDS_OP_LSSNAP) &&
-            rinfo->dir_nr)
+                            req->r_op == CEPH_MDS_OP_LSSNAP))
             ceph_readdir_prepopulate(req, req->r_session);
         ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
     }
-- 
cgit v1.1


From 9bf0c9cd431440a831e60c0a0fd0bc4f0e083e7f Mon Sep 17 00:00:00 2001
From: Steve French
Date: Sat, 16 Nov 2013 18:05:28 -0600
Subject: CIFS: Fix SMB2/SMB3 Copy offload support (refcopy) for large files

This third version of the patch, incorporating feedback from David
Disseldorp, extends the ability of copychunk (refcopy) over smb2/smb3
mounts to handle servers with smaller than usual maximum chunk sizes, and
also fixes it to handle files bigger than the maximum chunk size.

In the future this can be extended further to handle sending multiple
chunk requests in one SMB2 ioctl request, which will further improve
performance, but even with one 1MB chunk per request the speedup on cp is
quite large.
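In outline, the reworked smb2_clone_range() in the diff below loops over the
range in server-sized chunks instead of issuing a single CopyChunk for the
whole file. A simplified sketch of that loop (send_copychunk() is a
hypothetical stand-in for the SMB2_ioctl(FSCTL_SRV_COPYCHUNK_WRITE) call, and
all response validation is omitted):

/* illustrative only: copy 'len' bytes in chunks the server will accept */
while (len > 0) {
    u32 chunk = min_t(u32, len, tcon->max_bytes_chunk);
    u32 written;

    rc = send_copychunk(tcon, src_off, dest_off, chunk, &written);
    if (rc)
        break;          /* the real code retries once on -EINVAL */

    src_off += written;
    dest_off += written;
    len -= written;
}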
Reviewed-by: David Disseldorp
Signed-off-by: Steve French
---
 fs/cifs/smb2ops.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/cifs/smb2pdu.c |  9 ++++-
 2 files changed, 93 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 11dde4b..a3968ee 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -532,7 +532,10 @@ smb2_clone_range(const unsigned int xid,
     int rc;
     unsigned int ret_data_len;
     struct copychunk_ioctl *pcchunk;
-    char *retbuf = NULL;
+    struct copychunk_ioctl_rsp *retbuf = NULL;
+    struct cifs_tcon *tcon;
+    int chunks_copied = 0;
+    bool chunk_sizes_updated = false;
 
     pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
 
@@ -547,27 +550,96 @@ smb2_clone_range(const unsigned int xid,
     /* Note: request_res_key sets res_key null only if rc !=0 */
     if (rc)
-        return rc;
+        goto cchunk_out;
 
     /* For now array only one chunk long, will make more flexible later */
     pcchunk->ChunkCount = __constant_cpu_to_le32(1);
     pcchunk->Reserved = 0;
-    pcchunk->SourceOffset = cpu_to_le64(src_off);
-    pcchunk->TargetOffset = cpu_to_le64(dest_off);
-    pcchunk->Length = cpu_to_le32(len);
     pcchunk->Reserved2 = 0;
 
-    /* Request that server copy to target from src file identified by key */
-    rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink),
-            trgtfile->fid.persistent_fid,
-            trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
-            true /* is_fsctl */, (char *)pcchunk,
-            sizeof(struct copychunk_ioctl), &retbuf, &ret_data_len);
+    tcon = tlink_tcon(trgtfile->tlink);
 
-    /* BB need to special case rc = EINVAL to alter chunk size */
+    while (len > 0) {
+        pcchunk->SourceOffset = cpu_to_le64(src_off);
+        pcchunk->TargetOffset = cpu_to_le64(dest_off);
+        pcchunk->Length =
+            cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
 
-    cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len);
+        /* Request server copy to target from src identified by key */
+        rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+            trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+            true /* is_fsctl */, (char *)pcchunk,
+            sizeof(struct copychunk_ioctl), (char **)&retbuf,
+            &ret_data_len);
+        if (rc == 0) {
+            if (ret_data_len !=
+                    sizeof(struct copychunk_ioctl_rsp)) {
+                cifs_dbg(VFS, "invalid cchunk response size\n");
+                rc = -EIO;
+                goto cchunk_out;
+            }
+            if (retbuf->TotalBytesWritten == 0) {
+                cifs_dbg(FYI, "no bytes copied\n");
+                rc = -EIO;
+                goto cchunk_out;
+            }
+            /*
+             * Check if server claimed to write more than we asked
+             */
+            if (le32_to_cpu(retbuf->TotalBytesWritten) >
+                le32_to_cpu(pcchunk->Length)) {
+                cifs_dbg(VFS, "invalid copy chunk response\n");
+                rc = -EIO;
+                goto cchunk_out;
+            }
+            if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
+                cifs_dbg(VFS, "invalid num chunks written\n");
+                rc = -EIO;
+                goto cchunk_out;
+            }
+            chunks_copied++;
+
+            src_off += le32_to_cpu(retbuf->TotalBytesWritten);
+            dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
+            len -= le32_to_cpu(retbuf->TotalBytesWritten);
+
+            cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+                le32_to_cpu(retbuf->ChunksWritten),
+                le32_to_cpu(retbuf->ChunkBytesWritten),
+                le32_to_cpu(retbuf->TotalBytesWritten));
+        } else if (rc == -EINVAL) {
+            if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
+                goto cchunk_out;
+
+            cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
+                le32_to_cpu(retbuf->ChunksWritten),
+                le32_to_cpu(retbuf->ChunkBytesWritten),
+                le32_to_cpu(retbuf->TotalBytesWritten));
+
+            /*
+             * Check if this is the first request using these sizes,
+             * (ie check if copy succeed once with original sizes
+             * and check if the server gave us different sizes after
+             * we already updated max sizes on previous request).
+             * if not then why is the server returning an error now
+             */
+            if ((chunks_copied != 0) || chunk_sizes_updated)
+                goto cchunk_out;
+
+            /* Check that server is not asking us to grow size */
+            if (le32_to_cpu(retbuf->ChunkBytesWritten) <
+                tcon->max_bytes_chunk)
+                tcon->max_bytes_chunk =
+                    le32_to_cpu(retbuf->ChunkBytesWritten);
+            else
+                goto cchunk_out; /* server gave us bogus size */
+
+            /* No need to change MaxChunks since already set to 1 */
+            chunk_sizes_updated = true;
+        }
+    }
+cchunk_out:
     kfree(pcchunk);
     return rc;
 }
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index d65270c..87f430e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1214,10 +1214,17 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
     rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
     rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
 
-    if (rc != 0) {
+    if ((rc != 0) && (rc != -EINVAL)) {
         if (tcon)
             cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
         goto ioctl_exit;
+    } else if (rc == -EINVAL) {
+        if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
+            (opcode != FSCTL_SRV_COPYCHUNK)) {
+            if (tcon)
+                cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+            goto ioctl_exit;
+        }
     }
 
     /* check if caller wants to look at return data or just return rc */
-- 
cgit v1.1


From 7d3fb24bce87a240ee5a5f99cdd72b1f336d5c3b Mon Sep 17 00:00:00 2001
From: Steve French
Date: Mon, 18 Nov 2013 09:56:28 -0600
Subject: Removed duplicated (and unneeded) goto

Remove an unneeded goto (and a duplicated goto target name).

Signed-off-by: Steve French
---
 fs/cifs/smb2pdu.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 87f430e..1e136ee 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2161,11 +2161,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
     rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
     rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
 
-    if (rc != 0) {
+    if (rc != 0)
         cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
-        goto out;
-    }
-out:
+
     free_rsp_buf(resp_buftype, rsp);
     kfree(iov);
     return rc;
-- 
cgit v1.1


From ff1c038addc4f205d5f1ede449426c7d316c0eed Mon Sep 17 00:00:00 2001
From: Steve French
Date: Tue, 19 Nov 2013 23:44:46 -0600
Subject: Check SMB3 dialects against downgrade attacks

When we are running SMB3 or SMB3.02 connections which are signed, we need
to validate the protocol negotiation information to ensure that the
negotiate protocol response was not tampered with. Add the missing FSCTL
which is sent at mount time (immediately after the SMB3 Tree Connect) to
validate that the capabilities match what we think the server sent.

"Secure dialect negotiation is introduced in SMB3 to protect against
man-in-the-middle attempts to downgrade dialect negotiation. The idea is
to prevent an eavesdropper from downgrading the initially negotiated
dialect and capabilities between the client and the server."
For more explanation see 2.2.31.4 of MS-SMB2 or
http://blogs.msdn.com/b/openspecification/archive/2012/06/28/smb3-secure-dialect-negotiation.aspx

Reviewed-by: Pavel Shilovsky
Signed-off-by: Steve French
---
 fs/cifs/cifsglob.h  |  1 +
 fs/cifs/smb2ops.c   |  1 +
 fs/cifs/smb2pdu.c   | 77 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/smb2pdu.h   | 12 ++++++---
 fs/cifs/smb2proto.h |  1 +
 fs/cifs/smbfsctl.h  |  2 +-
 6 files changed, 90 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d9ea7ad..f918a99 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -384,6 +384,7 @@ struct smb_version_operations {
     int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
             struct cifsFileInfo *target_file, u64 src_off, u64 len,
             u64 dest_off);
+    int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
 };
 
 struct smb_version_values {
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index a3968ee..757da3e 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1319,6 +1319,7 @@ struct smb_version_operations smb30_operations = {
     .create_lease_buf = smb3_create_lease_buf,
     .parse_lease_buf = smb3_parse_lease_buf,
     .clone_range = smb2_clone_range,
+    .validate_negotiate = smb3_validate_negotiate,
 };
 
 struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1e136ee..2013234 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -454,6 +454,81 @@ neg_exit:
     return rc;
 }
 
+int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
+{
+    int rc = 0;
+    struct validate_negotiate_info_req vneg_inbuf;
+    struct validate_negotiate_info_rsp *pneg_rsp;
+    u32 rsplen;
+
+    cifs_dbg(FYI, "validate negotiate\n");
+
+    /*
+     * validation ioctl must be signed, so no point sending this if we
+     * can not sign it.  We could eventually change this to selectively
+     * sign just this, the first and only signed request on a connection.
+     * This is good enough for now since a user who wants better security
+     * would also enable signing on the mount. Having validation of
+     * negotiate info for signed connections helps reduce attack vectors
+     */
+    if (tcon->ses->server->sign == false)
+        return 0; /* validation requires signing */
+
+    vneg_inbuf.Capabilities =
+            cpu_to_le32(tcon->ses->server->vals->req_capabilities);
+    memcpy(vneg_inbuf.Guid, cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
+
+    if (tcon->ses->sign)
+        vneg_inbuf.SecurityMode =
+            cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+    else if (global_secflags & CIFSSEC_MAY_SIGN)
+        vneg_inbuf.SecurityMode =
+            cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+    else
+        vneg_inbuf.SecurityMode = 0;
+
+    vneg_inbuf.DialectCount = cpu_to_le16(1);
+    vneg_inbuf.Dialects[0] =
+        cpu_to_le16(tcon->ses->server->vals->protocol_id);
+
+    rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+        FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+        (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
+        (char **)&pneg_rsp, &rsplen);
+
+    if (rc != 0) {
+        cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+        return -EIO;
+    }
+
+    if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+        cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
+        return -EIO;
+    }
+
+    /* check validate negotiate info response matches what we got earlier */
+    if (pneg_rsp->Dialect !=
+            cpu_to_le16(tcon->ses->server->vals->protocol_id))
+        goto vneg_out;
+
+    if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+        goto vneg_out;
+
+    /* do not validate server guid because not saved at negprot time yet */
+
+    if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
+          SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+        goto vneg_out;
+
+    /* validate negotiate successful */
+    cifs_dbg(FYI, "validate negotiate info successful\n");
+    return 0;
+
+vneg_out:
+    cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+    return -EIO;
+}
+
 int
 SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
         const struct nls_table *nls_cp)
@@ -829,6 +904,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
         ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
         cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
     init_copy_chunk_defaults(tcon);
+    if (tcon->ses->server->ops->validate_negotiate)
+        rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
 tcon_exit:
     free_rsp_buf(resp_buftype, rsp);
     kfree(unc_path);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f88320b..2022c54 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -577,13 +577,19 @@ struct copychunk_ioctl_rsp {
     __le32 TotalBytesWritten;
 } __packed;
 
-/* Response and Request are the same format */
-struct validate_negotiate_info {
+struct validate_negotiate_info_req {
     __le32 Capabilities;
     __u8   Guid[SMB2_CLIENT_GUID_SIZE];
     __le16 SecurityMode;
     __le16 DialectCount;
-    __le16 Dialect[1];
+    __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+    __le32 Capabilities;
+    __u8   Guid[SMB2_CLIENT_GUID_SIZE];
+    __le16 SecurityMode;
+    __le16 Dialect; /* Dialect in use for the connection */
 } __packed;
 
 #define RSS_CAPABLE 0x00000001
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b4eea10..93adc64 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -162,5 +162,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
               struct smb2_lock_element *buf);
 extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
                 __u8 *lease_key, const __le32 lease_state);
+extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
 
 #endif          /* _SMB2PROTO_H */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a4b2391f..0e538b5 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
 #define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
 #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
 #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
-#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
 /* Perform server-side data movement */
 #define FSCTL_SRV_COPYCHUNK 0x001440F2
 #define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
-- 
cgit v1.1


From a096b09aeec6ff99edfdfd8cee24d6f25377d585 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Sun, 22 Sep 2013 10:15:58 +0800
Subject: ceph: queue cap release in __ceph_remove_cap()

Call __queue_cap_release() in __ceph_remove_cap(); this avoids acquiring
s_cap_lock twice.

Signed-off-by: Yan, Zheng
Reviewed-by: Sage Weil
---
 fs/ceph/caps.c       | 21 +++++++++++----------
 fs/ceph/mds_client.c |  6 ++----
 fs/ceph/super.h      |  8 +-------
 3 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 13976c3..d2d6e40 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -897,7 +897,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
  * caller should hold i_ceph_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 {
     struct ceph_mds_session *session = cap->session;
     struct ceph_inode_info *ci = cap->ci;
@@ -909,6 +909,10 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 
     /* remove from session list */
     spin_lock(&session->s_cap_lock);
+    if (queue_release)
+        __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+                            cap->mseq, cap->issue_seq);
+
     if (session->s_cap_iterator == cap) {
         /* not yet, we are iterating over this very cap */
         dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -1023,7 +1027,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
     struct ceph_mds_cap_release *head;
     struct ceph_mds_cap_item *item;
 
-    spin_lock(&session->s_cap_lock);
     BUG_ON(!session->s_num_cap_releases);
     msg = list_first_entry(&session->s_cap_releases,
                            struct ceph_msg, list_head);
@@ -1052,7 +1055,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
              (int)CEPH_CAPS_PER_RELEASE,
              (int)msg->front.iov_len);
     }
-    spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -1067,12 +1069,8 @@ void ceph_queue_caps_release(struct inode *inode)
 
     p = rb_first(&ci->i_caps);
     while (p) {
         struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
-        struct ceph_mds_session *session = cap->session;
-
-        __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
-                            cap->mseq, cap->issue_seq);
         p = rb_next(p);
-        __ceph_remove_cap(cap);
+        __ceph_remove_cap(cap, true);
     }
 }
@@ -2791,7 +2789,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
             }
             spin_unlock(&mdsc->cap_dirty_lock);
         }
-        __ceph_remove_cap(cap);
+        __ceph_remove_cap(cap, false);
     }
 
     /* else, we already released it */
@@ -2931,9 +2929,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
     if (!inode) {
         dout(" i don't have ino %llx\n", vino.ino);
 
-        if (op == CEPH_CAP_OP_IMPORT)
+        if (op == CEPH_CAP_OP_IMPORT) {
+            spin_lock(&session->s_cap_lock);
             __queue_cap_release(session, vino.ino, cap_id,
                                 mseq, seq);
+            spin_unlock(&session->s_cap_lock);
+        }
         goto flush_cap_releases;
     }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f51ab26..8f8f5c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -986,7 +986,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
     dout("removing cap %p, ci is %p, inode is %p\n",
          cap, ci, &ci->vfs_inode);
     spin_lock(&ci->i_ceph_lock);
-    __ceph_remove_cap(cap);
+    __ceph_remove_cap(cap, false);
     if (!__ceph_is_any_real_caps(ci)) {
         struct ceph_mds_client *mdsc =
             ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1231,9 +1231,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
     session->s_trim_caps--;
     if (oissued) {
         /* we aren't the only cap.. just remove us */
-        __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
-                            cap->mseq, cap->issue_seq);
-        __ceph_remove_cap(cap);
+        __ceph_remove_cap(cap, true);
     } else {
         /* try to drop referring dentries */
         spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6014b0a..ef4ac38 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -741,13 +741,7 @@ extern int ceph_add_cap(struct inode *inode,
              int fmode, unsigned issued, unsigned wanted,
              unsigned cap, unsigned seq, u64 realmino, int flags,
              struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
-    spin_lock(&cap->ci->i_ceph_lock);
-    __ceph_remove_cap(cap);
-    spin_unlock(&cap->ci->i_ceph_lock);
-}
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
              struct ceph_cap *cap);
-- 
cgit v1.1


From 44c99757fae80e9db058e1f1d7419cf6472e9af1 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Sun, 22 Sep 2013 10:28:10 +0800
Subject: ceph: set caps count after composing cap reconnect message

It's possible that some caps get released while composing the cap
reconnect message.
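The fix that follows counts caps as they are actually encoded and, if a
release raced with the iteration, patches the 32-bit count that was written
at the head of the pagelist. The underlying pattern is simply "reserve the
count field up front, overwrite it once the real number is known"; a minimal
sketch (first_page and actual_count are placeholders for illustration):

/* illustrative only: fix up a previously written count in place */
__le32 *addr = kmap_atomic(first_page);
*addr = cpu_to_le32(actual_count);
kunmap_atomic(addr);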
Signed-off-by: Yan, Zheng
Reviewed-by: Sage Weil
---
 fs/ceph/mds_client.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8f8f5c0..4a93d69 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -43,6 +43,7 @@
  */
 
 struct ceph_reconnect_state {
+    int nr_caps;
     struct ceph_pagelist *pagelist;
     bool flock;
 };
@@ -2549,6 +2550,8 @@ encode_again:
     } else {
         err = ceph_pagelist_append(pagelist, &rec, reclen);
     }
+
+    recon_state->nr_caps++;
 out_free:
     kfree(path);
 out_dput:
@@ -2576,6 +2579,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
     struct rb_node *p;
     int mds = session->s_mds;
     int err = -ENOMEM;
+    int s_nr_caps;
     struct ceph_pagelist *pagelist;
     struct ceph_reconnect_state recon_state;
 
@@ -2611,10 +2615,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
     discard_cap_releases(mdsc, session);
 
     /* traverse this session's caps */
-    err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+    s_nr_caps = session->s_nr_caps;
+    err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
     if (err)
         goto fail;
 
+    recon_state.nr_caps = 0;
     recon_state.pagelist = pagelist;
     recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
     err = iterate_session_caps(session, encode_caps_cb, &recon_state);
@@ -2643,11 +2649,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
     if (recon_state.flock)
         reply->hdr.version = cpu_to_le16(2);
-    if (pagelist->length) {
-        /* set up outbound data if we have any */
-        reply->hdr.data_len = cpu_to_le32(pagelist->length);
-        ceph_msg_data_add_pagelist(reply, pagelist);
+
+    /* raced with cap release? */
+    if (s_nr_caps != recon_state.nr_caps) {
+        struct page *page = list_first_entry(&pagelist->head,
+                                             struct page, lru);
+        __le32 *addr = kmap_atomic(page);
+        *addr = cpu_to_le32(recon_state.nr_caps);
+        kunmap_atomic(addr);
     }
+
+    reply->hdr.data_len = cpu_to_le32(pagelist->length);
+    ceph_msg_data_add_pagelist(reply, pagelist);
 
     ceph_con_send(&session->s_con, reply);
 
     mutex_unlock(&session->s_mutex);
-- 
cgit v1.1


From 99a9c273b94a087f8feaec6c5ffbe3205a2dbe51 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Sun, 22 Sep 2013 11:08:14 +0800
Subject: ceph: handle race between cap reconnect and cap release

When a cap gets released while composing the cap reconnect message, we
should skip queuing the release message if the cap hasn't been added to
the cap reconnect message.

Signed-off-by: Yan, Zheng
Reviewed-by: Sage Weil
---
 fs/ceph/caps.c       |  8 +++++++-
 fs/ceph/mds_client.c | 21 ++++++++++++++++++---
 fs/ceph/mds_client.h |  1 +
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d2d6e40..3c0a4bd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -909,7 +909,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 
     /* remove from session list */
     spin_lock(&session->s_cap_lock);
-    if (queue_release)
+    /*
+     * s_cap_reconnect is protected by s_cap_lock. no one changes
+     * s_cap_gen while session is in the reconnect state.
+     */
+    if (queue_release &&
+        (!session->s_cap_reconnect ||
+         cap->cap_gen == session->s_cap_gen))
         __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
                             cap->mseq, cap->issue_seq);
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4a93d69..6d953ab 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -444,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
     INIT_LIST_HEAD(&s->s_waiting);
     INIT_LIST_HEAD(&s->s_unsafe);
     s->s_num_cap_releases = 0;
+    s->s_cap_reconnect = 0;
     s->s_cap_iterator = NULL;
     INIT_LIST_HEAD(&s->s_cap_releases);
     INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -1415,7 +1416,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
     unsigned num;
 
     dout("discard_cap_releases mds%d\n", session->s_mds);
-    spin_lock(&session->s_cap_lock);
 
     /* zero out the in-progress message */
     msg = list_first_entry(&session->s_cap_releases,
@@ -1442,8 +1442,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
         msg->front.iov_len = sizeof(*head);
         list_add(&msg->list_head, &session->s_cap_releases);
     }
-
-    spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -2488,6 +2486,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
     cap->seq = 0;        /* reset cap seq */
     cap->issue_seq = 0;  /* and issue_seq */
     cap->mseq = 0;       /* and migrate_seq */
+    cap->cap_gen = cap->session->s_cap_gen;
 
     if (recon_state->flock) {
         rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2611,8 +2610,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
     dout("session %p state %s\n", session,
          session_state_name(session->s_state));
 
+    spin_lock(&session->s_gen_ttl_lock);
+    session->s_cap_gen++;
+    spin_unlock(&session->s_gen_ttl_lock);
+
+    spin_lock(&session->s_cap_lock);
+    /*
+     * notify __ceph_remove_cap() that we are composing cap reconnect.
+     * If a cap get released before being added to the cap reconnect,
+     * __ceph_remove_cap() should skip queuing cap release.
+     */
+    session->s_cap_reconnect = 1;
     /* drop old cap expires; we're about to reestablish that state */
     discard_cap_releases(mdsc, session);
+    spin_unlock(&session->s_cap_lock);
 
     /* traverse this session's caps */
     s_nr_caps = session->s_nr_caps;
@@ -2627,6 +2638,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
     if (err < 0)
         goto fail;
 
+    spin_lock(&session->s_cap_lock);
+    session->s_cap_reconnect = 0;
+    spin_unlock(&session->s_cap_lock);
+
     /*
      * snaprealms.  we provide mds with the ino, seq (version), and
      * parent for all of our realms.  If the mds has any newer info,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fb..4c053d0 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -132,6 +132,7 @@ struct ceph_mds_session {
     struct list_head  s_caps;     /* all caps issued by this session */
     int               s_nr_caps, s_trim_caps;
     int               s_num_cap_releases;
+    int               s_cap_reconnect;
     struct list_head  s_cap_releases; /* waiting cap_release messages */
     struct list_head  s_cap_releases_done; /* ready to send */
     struct ceph_cap  *s_cap_iterator;
-- 
cgit v1.1


From eb1b8af33c2e42a9a57fc0a7588f4a7b255d2e79 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 26 Sep 2013 14:25:36 +0800
Subject: ceph: cleanup aborted requests when re-sending requests

Aborted requests usually get cleared when the reply is received. If the
MDS crashes, no reply will be received, so we need to clean up aborted
requests when re-sending requests.
Signed-off-by: Yan, Zheng
Reviewed-by: Greg Farnum
Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6d953ab..8ef7926 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1872,8 +1872,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
     int mds = -1;
     int err = -EAGAIN;
 
-    if (req->r_err || req->r_got_result)
+    if (req->r_err || req->r_got_result) {
+        if (req->r_aborted)
+            __unregister_request(mdsc, req);
         goto out;
+    }
 
     if (req->r_timeout &&
         time_after_eq(jiffies, req->r_started + req->r_timeout)) {
-- 
cgit v1.1


From fc55d2c9448b34218ca58733a6f51fbede09575b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 31 Oct 2013 09:10:47 +0800
Subject: ceph: wake up 'safe' waiters when unregistering request

We also need to wake up 'safe' waiters if an error occurs or the request
is aborted. Otherwise sync(2)/fsync(2) may hang forever.

Signed-off-by: Yan, Zheng
Signed-off-by: Sage Weil
---
 fs/ceph/mds_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8ef7926..d90861f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -644,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
         req->r_unsafe_dir = NULL;
     }
 
+    complete_all(&req->r_safe_completion);
+
     ceph_mdsc_put_request(req);
 }
 
@@ -2186,7 +2188,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
     if (head->safe) {
         req->r_got_safe = true;
         __unregister_request(mdsc, req);
-        complete_all(&req->r_safe_completion);
 
         if (req->r_got_unsafe) {
             /*
-- 
cgit v1.1


From ff638b7df5a9264024a6448bdfde2b2bf5d1994a Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Sat, 9 Nov 2013 10:26:06 +0800
Subject: ceph: allocate non-zero page to fscache in readpage()

ceph_osdc_readpages() returns the number of bytes read; currently the
code only allocates full-zero pages into fscache. This patch fixes this.

Signed-off-by: Li Wang
Reviewed-by: Milosz Tanski
Reviewed-by: Sage Weil
---
 fs/ceph/addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd4..1e561c0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -216,7 +216,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
     }
     SetPageUptodate(page);
 
-    if (err == 0)
+    if (err >= 0)
         ceph_readpage_to_fscache(inode, page);
 
 out:
-- 
cgit v1.1


From f19e84df37bda502a2248d507a9cf2b9e693279e Mon Sep 17 00:00:00 2001
From: Steve French
Date: Sun, 24 Nov 2013 21:53:17 -0600
Subject: [CIFS] Do not use btrfs refcopy ioctl for SMB2 copy offload

Change cifs.ko to use CIFS_IOC_COPYCHUNK_FILE instead of BTRFS_IOC_CLONE
to avoid confusion about whether copy-on-write is required or optional
for this operation.

SMB2/SMB3 copyoffload had used the BTRFS_IOC_CLONE ioctl since they both
speed up copy by offloading the copy rather than passing many read and
write requests back and forth, and both have identical syntax (passing
file handles), but for SMB2/SMB3 CopyChunk the server is not required to
use copy-on-write to make a copy of the file (although some do), and
Christoph has commented that since CopyChunk does not require
copy-on-write we should not reuse BTRFS_IOC_CLONE. This patch renames the
ioctl to use a cifs-specific IOCTL, CIFS_IOC_COPYCHUNK_FILE.
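From user space the renamed request is issued against the destination file,
passing the source descriptor as the argument, the same calling convention
BTRFS_IOC_CLONE used. A sketch of a caller (error handling trimmed; the macro
values mirror the definitions added in the diff below):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define CIFS_IOCTL_MAGIC        0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)

/* server-side copy of src into dst; both paths on the same cifs mount */
int cifs_server_copy(const char *src, const char *dst)
{
    int sfd = open(src, O_RDONLY);
    int dfd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);

    /* ask the server to copy the data without routing it through the client */
    int rc = ioctl(dfd, CIFS_IOC_COPYCHUNK_FILE, sfd);

    close(sfd);
    close(dfd);
    return rc;
}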
This ioctl is particularly important for SMB2/SMB3 since large file copy
over the network otherwise can be very slow, and with this it is often
more than 100 times faster, putting less load on server and client.

Note that if a copy syscall is ever introduced, depending on its
requirements/format it could end up using one of the other three methods
that CIFS/SMB2/SMB3 can do for copy offload, but this method is
particularly useful for file copy and broadly supported (not just by the
Samba server).

Signed-off-by: Steve French
Reviewed-by: Jeff Layton
Reviewed-by: David Disseldorp
---
 fs/cifs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 409b45e..7749230 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -26,13 +26,15 @@
 #include
 #include
 #include
-#include <linux/btrfs.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
 
+#define CIFS_IOCTL_MAGIC    0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+
 static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
             unsigned long srcfd, u64 off, u64 len, u64 destoff)
 {
@@ -213,7 +215,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                 cifs_dbg(FYI, "set compress flag rc %d\n", rc);
             }
             break;
-        case BTRFS_IOC_CLONE:
+        case CIFS_IOC_COPYCHUNK_FILE:
             rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
             break;
         default:
-- 
cgit v1.1


From dad337501d490b26fbf8d83baee99c788461c61c Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Wed, 27 Nov 2013 15:15:57 -0500
Subject: remove obsolete references to powertweak

This tool hasn't been maintained in over a decade, and is pretty much
useless these days. Let's pretend it never happened.

Also remove a long-dead email address.

Signed-off-by: Dave Jones
Signed-off-by: Linus Torvalds
---
 fs/affs/Changes | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/affs/Changes b/fs/affs/Changes
index a29409c..b41c2c9 100644
--- a/fs/affs/Changes
+++ b/fs/affs/Changes
@@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel]
 Version 3.11
 ------------
 
-- Converted to use 2.3.x page cache [Dave Jones ]
+- Converted to use 2.3.x page cache [Dave Jones]
 - Corruption in truncate() bugfix [Ken Tyler ]
 
 Version 3.10
-- 
cgit v1.1


From d870b4a191a389c661cd40aacb06981c26b5e504 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Fri, 29 Nov 2013 01:48:32 -0500
Subject: fix bogus path_put() of nd->root after some unlazy_walk() failures

Failure to grab a reference to the parent dentry should go through the
same cleanup as an nd->seq mismatch. As it is, we might end up with the
caller thinking it needs to path_put() nd->root, with obvious nasty
results once we'd hit that bug enough times to drive the refcount of the
root dentry all the way to zero...

Signed-off-by: Al Viro
---
 fs/namei.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8f77a8c..c53d3a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -513,8 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 
     if (!lockref_get_not_dead(&parent->d_lockref)) {
         nd->path.dentry = NULL;
-        rcu_read_unlock();
-        return -ECHILD;
+        goto out;
     }
 
     /*
-- 
cgit v1.1